From 5a002dba1628a6546255a6a1c0b8ab4b2e5dbdad Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Mon, 5 Feb 2018 23:13:50 -0500
Subject: [PATCH] Fixes for pan-allele models: use AlleleEncoding for allele input

---
 mhcflurry/allele_encoding.py           |  9 ++--
 mhcflurry/class1_affinity_predictor.py | 69 ++++++++++++++++----------
 mhcflurry/class1_neural_network.py     | 14 ++----
 3 files changed, 52 insertions(+), 40 deletions(-)

diff --git a/mhcflurry/allele_encoding.py b/mhcflurry/allele_encoding.py
index a2bc8715..d95d2ce2 100644
--- a/mhcflurry/allele_encoding.py
+++ b/mhcflurry/allele_encoding.py
@@ -37,7 +37,7 @@ class AlleleEncoding(object):
 
         self.encoding_cache = {}
 
-    def fixed_length_sequences(self, vector_encoding_name):
+    def fixed_length_vector_encoded_sequences(self, vector_encoding_name):
         """
         Encode alleles.
 
@@ -57,14 +57,13 @@ class AlleleEncoding(object):
             "fixed_length_vector_encoding",
             vector_encoding_name)
         if cache_key not in self.encoding_cache:
-            index_encoded_matrix = amino_acid.fixed_vectors_encoding(
+            index_encoded_matrix = amino_acid.index_encoding(
                 self.fixed_length_sequences.values,
-                amino_acid.COMMON_AMINO_ACIDS_WITH_UNKNOWN)
+                amino_acid.AMINO_ACID_INDEX)
             vector_encoded = amino_acid.fixed_vectors_encoding(
                 index_encoded_matrix,
                 amino_acid.ENCODING_DATA_FRAMES[vector_encoding_name])
-            vector_encoded_df = pandas.DataFrame(vector_encoded)
-            result = vector_encoded_df.iloc[self.indices]
+            result = vector_encoded[self.indices]
             self.encoding_cache[cache_key] = result
         return self.encoding_cache[cache_key]
 
diff --git a/mhcflurry/class1_affinity_predictor.py b/mhcflurry/class1_affinity_predictor.py
index fad3aa11..5f82e982 100644
--- a/mhcflurry/class1_affinity_predictor.py
+++ b/mhcflurry/class1_affinity_predictor.py
@@ -24,6 +24,7 @@ from .percent_rank_transform import PercentRankTransform
 from .regression_target import to_ic50
 from .version import __version__
 from .ensemble_centrality import CENTRALITY_MEASURES
+from .allele_encoding import AlleleEncoding
 
 
 class Class1AffinityPredictor(object):
@@ -39,7 +40,7 @@ class Class1AffinityPredictor(object):
             self,
             allele_to_allele_specific_models=None,
             class1_pan_allele_models=None,
-            allele_to_pseudosequence=None,
+            allele_to_fixed_length_sequence=None,
             manifest_df=None,
             allele_to_percent_rank_transform=None):
         """
@@ -51,7 +52,7 @@ class Class1AffinityPredictor(object):
         class1_pan_allele_models : list of `Class1NeuralNetwork`
             Ensemble of pan-allele models.
         
-        allele_to_pseudosequence : dict of string -> string
+        allele_to_fixed_length_sequence : dict of string -> string
             Required only if class1_pan_allele_models is specified.
         
         manifest_df : `pandas.DataFrame`, optional
@@ -70,11 +71,11 @@ class Class1AffinityPredictor(object):
             class1_pan_allele_models = []
 
         if class1_pan_allele_models:
-            assert allele_to_pseudosequence, "Pseudosequences required"
+            assert allele_to_fixed_length_sequence, "Allele sequences required"
 
         self.allele_to_allele_specific_models = allele_to_allele_specific_models
         self.class1_pan_allele_models = class1_pan_allele_models
-        self.allele_to_pseudosequence = allele_to_pseudosequence
+        self.allele_to_fixed_length_sequence = allele_to_fixed_length_sequence
 
         if manifest_df is None:
             rows = []
@@ -140,7 +141,7 @@ class Class1AffinityPredictor(object):
 
         allele_to_allele_specific_models = collections.defaultdict(list)
         class1_pan_allele_models = []
-        allele_to_pseudosequence = predictors[0].allele_to_pseudosequence
+        allele_to_fixed_length_sequence = predictors[0].allele_to_fixed_length_sequence
 
         for predictor in predictors:
             for (allele, networks) in (
@@ -152,7 +153,7 @@ class Class1AffinityPredictor(object):
         return Class1AffinityPredictor(
             allele_to_allele_specific_models=allele_to_allele_specific_models,
             class1_pan_allele_models=class1_pan_allele_models,
-            allele_to_pseudosequence=allele_to_pseudosequence
+            allele_to_fixed_length_sequence=allele_to_fixed_length_sequence
         )
 
     @property
@@ -165,8 +166,8 @@ class Class1AffinityPredictor(object):
         list of string
         """
         result = set(self.allele_to_allele_specific_models)
-        if self.allele_to_pseudosequence:
-            result = result.union(self.allele_to_pseudosequence)
+        if self.allele_to_fixed_length_sequence:
+            result = result.union(self.allele_to_fixed_length_sequence)
         return sorted(result)
 
     @property
@@ -196,7 +197,7 @@ class Class1AffinityPredictor(object):
         The serialization format consists of a file called "manifest.csv" with
         the configurations of each Class1NeuralNetwork, along with per-network
         files giving the model weights. If there are pan-allele predictors in
-        the ensemble, the allele pseudosequences are also stored in the
+        the ensemble, the allele sequences are also stored in the
         directory. There is also a small file "index.txt" with basic metadata:
         when the models were trained, by whom, on what host.
         
@@ -250,6 +251,15 @@ class Class1AffinityPredictor(object):
         pandas.DataFrame(rows).to_csv(
             info_path, sep="\t", header=False, index=False)
 
+        if self.allele_to_fixed_length_sequence is not None:
+            allele_to_sequence_df = pandas.DataFrame(
+                list(self.allele_to_fixed_length_sequence.items()),
+                columns=['allele', 'sequence']
+            )
+            allele_to_sequence_df.to_csv(
+                join(models_dir, "allele_sequences.csv"), index=False)
+            logging.info("Wrote: %s" % join(models_dir, "allele_sequences.csv"))
+
         if self.allele_to_percent_rank_transform:
             percent_ranks_df = None
             for (allele, transform) in self.allele_to_percent_rank_transform.items():
@@ -310,10 +320,10 @@ class Class1AffinityPredictor(object):
 
         manifest_df["model"] = all_models
 
-        pseudosequences = None
-        if exists(join(models_dir, "pseudosequences.csv")):
-            pseudosequences = pandas.read_csv(
-                join(models_dir, "pseudosequences.csv"),
+        allele_to_fixed_length_sequence = None
+        if exists(join(models_dir, "allele_sequences.csv")):
+            allele_to_fixed_length_sequence = pandas.read_csv(
+                join(models_dir, "allele_sequences.csv"),
                 index_col="allele").to_dict()
 
         allele_to_percent_rank_transform = {}
@@ -325,10 +335,10 @@ class Class1AffinityPredictor(object):
                     PercentRankTransform.from_series(percent_ranks_df[allele]))
 
         logging.info(
-            "Loaded %d class1 pan allele predictors, %d pseudosequences, "
+            "Loaded %d class1 pan allele predictors, %d allele sequences, "
             "%d percent rank distributions, and %d allele specific models: %s" % (
                 len(class1_pan_allele_models),
-                len(pseudosequences) if pseudosequences else 0,
+                len(allele_to_fixed_length_sequence) if allele_to_fixed_length_sequence else 0,
                 len(allele_to_percent_rank_transform),
                 sum(len(v) for v in allele_to_allele_specific_models.values()),
                 ", ".join(
@@ -339,7 +349,7 @@ class Class1AffinityPredictor(object):
         result = Class1AffinityPredictor(
             allele_to_allele_specific_models=allele_to_allele_specific_models,
             class1_pan_allele_models=class1_pan_allele_models,
-            allele_to_pseudosequence=pseudosequences,
+            allele_to_fixed_length_sequence=allele_to_fixed_length_sequence,
             manifest_df=manifest_df,
             allele_to_percent_rank_transform=allele_to_percent_rank_transform,
         )
@@ -516,6 +526,7 @@ class Class1AffinityPredictor(object):
             alleles,
             peptides,
             affinities,
+            inequalities,
             models_dir_for_save=None,
             verbose=1,
             progress_preamble=""):
@@ -534,12 +545,15 @@ class Class1AffinityPredictor(object):
         architecture_hyperparameters : dict
         
         alleles : list of string
-            Allele names (not pseudosequences) corresponding to each peptide 
+            Allele names (not sequences) corresponding to each peptide
         
         peptides : `EncodableSequences` or list of string
         
         affinities : list of float
             nM affinities
+
+        inequalities : list of string, each element one of ">", "<", or "="
+            See Class1NeuralNetwork.fit for details.
         
         models_dir_for_save : string, optional
             If specified, the Class1AffinityPredictor is (incrementally) written
@@ -557,7 +571,9 @@ class Class1AffinityPredictor(object):
         """
 
         alleles = pandas.Series(alleles).map(mhcnames.normalize_allele_name)
-        allele_pseudosequences = alleles.map(self.allele_to_pseudosequence)
+        allele_encoding = AlleleEncoding(
+            alleles,
+            allele_to_fixed_length_sequence=self.allele_to_fixed_length_sequence)
 
         encodable_peptides = EncodableSequences.create(peptides)
         models = []
@@ -567,7 +583,8 @@ class Class1AffinityPredictor(object):
             model.fit(
                 encodable_peptides,
                 affinities,
-                allele_pseudosequences=allele_pseudosequences,
+                inequalities=inequalities,
+                allele_encoding=allele_encoding,
                 verbose=verbose,
                 progress_preamble=progress_preamble)
 
@@ -777,27 +794,27 @@ class Class1AffinityPredictor(object):
             unsupported_alleles = [
                 allele for allele in
                 df.normalized_allele.unique()
-                if allele not in self.allele_to_pseudosequence
+                if allele not in self.allele_to_fixed_length_sequence
             ]
             if unsupported_alleles:
                 msg = (
-                    "No pseudosequences for allele(s): %s.\n"
+                    "No sequences for allele(s): %s.\n"
                     "Supported alleles: %s" % (
                         " ".join(unsupported_alleles),
-                        " ".join(sorted(self.allele_to_pseudosequence))))
+                        " ".join(sorted(self.allele_to_fixed_length_sequence))))
                 logging.warning(msg)
                 if throw:
                     raise ValueError(msg)
             mask = df.supported_peptide_length
             if mask.sum() > 0:
-                masked_allele_pseudosequences = (
-                    df.ix[mask].normalized_allele.map(
-                        self.allele_to_pseudosequence))
+                masked_allele_encoding = AlleleEncoding(
+                    df.loc[mask].normalized_allele,
+                    allele_to_fixed_length_sequence=self.allele_to_fixed_length_sequence)
                 masked_peptides = peptides.sequences[mask]
                 for (i, model) in enumerate(self.class1_pan_allele_models):
                     df.loc[mask, "model_pan_%d" % i] = model.predict(
                         masked_peptides,
-                        allele_pseudosequences=masked_allele_pseudosequences)
+                        allele_encoding=masked_allele_encoding)
 
         if self.allele_to_allele_specific_models:
             query_alleles = df.normalized_allele.unique()
diff --git a/mhcflurry/class1_neural_network.py b/mhcflurry/class1_neural_network.py
index 5cf87569..c48b051f 100644
--- a/mhcflurry/class1_neural_network.py
+++ b/mhcflurry/class1_neural_network.py
@@ -370,7 +370,7 @@ class Class1NeuralNetwork(object):
         -------
         numpy.array
         """
-        return allele_encoding.fixed_length_sequences("BLOSUM62")
+        return allele_encoding.fixed_length_vector_encoded_sequences("BLOSUM62")
 
     def fit(
             self,
@@ -693,9 +693,8 @@ class Class1NeuralNetwork(object):
             'peptide': self.peptides_to_network_input(peptides)
         }
         if allele_encoding is not None:
-            pseudosequences_input = self.pseudosequence_to_network_input(
-                allele_pseudosequences)
-            x_dict['pseudosequence'] = pseudosequences_input
+            allele_input = self.allele_encoding_to_network_input(allele_encoding)
+            x_dict['allele'] = allele_input
 
         network = self.network(borrow=True)
         raw_predictions = network.predict(x_dict, batch_size=batch_size)
@@ -787,8 +786,8 @@ class Class1NeuralNetwork(object):
         if allele_encoding_dims:
             allele_input = Input(
                 shape=allele_encoding_dims,
-                dtype='int32',
-                name='peptide')
+                dtype='float32',
+                name='allele')
             inputs.append(allele_input)
             allele_embedding_layer = Flatten(name="allele_flat")(allele_input)
 
@@ -807,9 +806,6 @@ class Class1NeuralNetwork(object):
                 current_layer = keras.layers.multiply([
                     current_layer, allele_embedding_layer
                 ], name="allele_peptide_merged")
-
-                current_layer = keras.layers.concatenate(
-                    [current_layer, allele_embedding_layer], name="concatenated_0")
             else:
                 raise ValueError(
                     "Unsupported peptide_allele_encoding_merge_method: %s"
-- 
GitLab