From 8b01a3aaaaa40751b0f8d7e010bbfc1033e68fa2 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Mon, 5 Feb 2018 19:13:02 -0500
Subject: [PATCH] update experimental pan-allele infra

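Replace the pseudosequence-specific code paths with a new AlleleEncoding
class that caches per-allele vector encodings. Add hyperparameters for
allele and peptide dense layers and for how the peptide and allele
representations are merged, and support random negative peptides when
training with an allele input.

Intended usage is roughly as follows (a sketch only; the peptide,
affinity, and pseudosequence values are hypothetical placeholders):

    from mhcflurry.allele_encoding import AlleleEncoding
    from mhcflurry.class1_neural_network import Class1NeuralNetwork

    peptides = ["SIINFEKL", "SYFPEITHI"]      # hypothetical example data
    affinities = [120.0, 8500.0]              # nM, parallel to peptides
    alleles = ["HLA-A*02:01", "HLA-B*07:02"]  # one allele per peptide

    # Allele name -> fixed-length "pseudosequence" (hypothetical values).
    pseudosequences = {"HLA-A*02:01": "...", "HLA-B*07:02": "..."}

    encoding = AlleleEncoding(
        alleles, allele_to_fixed_length_sequence=pseudosequences)
    model = Class1NeuralNetwork(peptide_allele_merge_method="concatenate")
    model.fit(peptides, affinities, allele_encoding=encoding)
    predictions = model.predict(peptides, allele_encoding=encoding)
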
---
 mhcflurry/allele_encoding.py       |  71 +++++++++
 mhcflurry/class1_neural_network.py | 222 +++++++++++++++++++----------
 mhcflurry/encodable_sequences.py   |   1 -
 3 files changed, 218 insertions(+), 76 deletions(-)
 create mode 100644 mhcflurry/allele_encoding.py

diff --git a/mhcflurry/allele_encoding.py b/mhcflurry/allele_encoding.py
new file mode 100644
index 00000000..a2bc8715
--- /dev/null
+++ b/mhcflurry/allele_encoding.py
@@ -0,0 +1,71 @@
+import numpy
+import pandas
+
+from .encodable_sequences import EncodableSequences
+from . import amino_acid
+
+class AlleleEncoding(object):
+    def __init__(
+            self,
+            alleles,
+            allele_to_fixed_length_sequence=None):
+        """
+        A place to cache encodings for a (potentially large) list of alleles.
+
+        Parameters
+        ----------
+        alleles : list of string
+            Allele names
+
+        allele_to_fixed_length_sequence : dict of str -> str
+            Maps allele name to a fixed-length sequence ("pseudosequence")
+        """
+
+        alleles = pandas.Series(alleles)
+
+        all_alleles = list(sorted(alleles.unique()))
+
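+        # Encodings are computed once per unique allele; self.indices maps
+        # each requested allele to its row in the unique-allele encoding.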
+        self.allele_to_index = dict(
+            (allele, i)
+            for (i, allele) in enumerate(all_alleles))
+
+        self.indices = alleles.map(self.allele_to_index)
+
+        self.fixed_length_sequences = pandas.Series(
+            [allele_to_fixed_length_sequence[a] for a in all_alleles],
+            index=all_alleles)
+
+        self.encoding_cache = {}
+
+    def fixed_length_vector_encoded_sequences(self, vector_encoding_name):
+        """
+        Encode alleles.
+
+        Parameters
+        ----------
+        vector_encoding_name : string
+            How to represent amino acids.
+            One of "BLOSUM62", "one-hot", etc. Full list of supported vector
+            encodings is given by available_vector_encodings() in amino_acid.
+
+        Returns
+        -------
+        numpy.array with shape (num alleles, sequence length, m) where m is
+        vector_encoding_length(vector_encoding_name)
+        """
+        cache_key = (
+            "fixed_length_vector_encoding",
+            vector_encoding_name)
+        if cache_key not in self.encoding_cache:
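+            # Two-step encoding: letters -> integer indices, then indices ->
+            # per-residue vectors (e.g. BLOSUM62 rows). The result is cached
+            # per vector_encoding_name.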
+            index_encoded_matrix = amino_acid.index_encoding(
+                self.fixed_length_sequences.values,
+                amino_acid.AMINO_ACID_INDEX)
+            vector_encoded = amino_acid.fixed_vectors_encoding(
+                index_encoded_matrix,
+                amino_acid.ENCODING_DATA_FRAMES[vector_encoding_name])
+            result = vector_encoded[self.indices]
+            self.encoding_cache[cache_key] = result
+        return self.encoding_cache[cache_key]
+
+
diff --git a/mhcflurry/class1_neural_network.py b/mhcflurry/class1_neural_network.py
index ccd08367..9cd528ae 100644
--- a/mhcflurry/class1_neural_network.py
+++ b/mhcflurry/class1_neural_network.py
@@ -27,11 +27,13 @@ class Class1NeuralNetwork(object):
 
     network_hyperparameter_defaults = HyperparameterDefaults(
         kmer_size=15,
-        use_embedding=False,
         peptide_amino_acid_encoding="one-hot",
         embedding_input_dim=21,
         embedding_output_dim=8,
-        pseudosequence_use_embedding=False,
+        allele_dense_layer_sizes=[],
+        peptide_dense_layer_sizes=[],
+        peptide_allele_merge_method="multiply",
+        peptide_allele_merge_activation="",
         layer_sizes=[32],
         dense_layer_l1_regularization=0.001,
         dense_layer_l2_regularization=0.0,
@@ -107,9 +109,41 @@ class Class1NeuralNetwork(object):
     Combined set of all supported hyperparameters and their default values.
     """
 
+    # Hyperparameter renames.
+    # This mapping is updated from time to time as new versions are
+    # developed. It provides a primitive way for new code to load models
+    # trained using older code.
+    # None indicates the hyperparameter has been dropped.
+    hyperparameter_renames = {
+        "use_embedding": None,
+        "pseudosequence_use_embedding": None,
+    }
+
+    @classmethod
+    def apply_hyperparameter_renames(cls, hyperparameters):
+        """
+        Handle hyperparameter renames.
+
+        Parameters
+        ----------
+        hyperparameters : dict
+
+        Returns
+        -------
+        dict : updated hyperparameters
+
+        """
+        for (from_name, to_name) in cls.hyperparameter_renames.items():
+            if from_name in hyperparameters:
+                value = hyperparameters.pop(from_name)
+                if to_name:
+                    hyperparameters[to_name] = value
+        return hyperparameters
+
     def __init__(self, **hyperparameters):
         self.hyperparameters = self.hyperparameter_defaults.with_defaults(
-            hyperparameters)
+            self.apply_hyperparameter_renames(hyperparameters))
 
         self._network = None
         self.network_json = None
@@ -278,8 +312,7 @@ class Class1NeuralNetwork(object):
         numpy.array
         """
         encoder = EncodableSequences.create(peptides)
-        if (self.hyperparameters['use_embedding'] or
-                self.hyperparameters['peptide_amino_acid_encoding'] == "embedding"):
+        if self.hyperparameters['peptide_amino_acid_encoding'] == "embedding":
             encoded = encoder.variable_length_to_fixed_length_categorical(
                 max_length=self.hyperparameters['kmer_size'],
                 **self.input_encoding_hyperparameter_defaults.subselect(
@@ -313,33 +346,26 @@ class Class1NeuralNetwork(object):
             self.hyperparameters['right_edge'],
         self.hyperparameters['kmer_size'])
 
-    def pseudosequence_to_network_input(self, pseudosequences):
+    def allele_encoding_to_network_input(self, allele_encoding):
         """
-        Encode pseudosequences to the fixed-length encoding expected by the neural
+        Encode alleles to the fixed-length encoding expected by the neural
         network (which depends on the architecture).
 
         Parameters
         ----------
-        pseudosequences : EncodableSequences or list of string
+        allele_encoding : AlleleEncoding
 
         Returns
         -------
         numpy.array
         """
-        encoder = EncodableSequences.create(pseudosequences)
-        if self.hyperparameters['pseudosequence_use_embedding']:
-            encoded = encoder.fixed_length_categorical()
-        else:
-            raise NotImplementedError
-            # encoded = encoder.fixed_length_one_hot()
-        assert len(encoded) == len(pseudosequences)
-        return encoded
+        return allele_encoding.fixed_length_vector_encoded_sequences(
+            "BLOSUM62")
 
     def fit(
             self,
             peptides,
             affinities,
-            allele_pseudosequences=None,
+            allele_encoding=None,
             inequalities=None,
             sample_weights=None,
             shuffle_permutation=None,
@@ -355,7 +381,7 @@ class Class1NeuralNetwork(object):
         affinities : list of float
             nM affinities. Must be the same length as peptides.
         
-        allele_pseudosequences : EncodableSequences or list of string, optional
+        allele_encoding : AlleleEncoding, optional
             If not specified, the model will be a single-allele predictor.
 
         inequalities : list of string, each element one of ">", "<", or "=".
@@ -429,13 +455,12 @@ class Class1NeuralNetwork(object):
         x_dict_without_random_negatives = {
             'peptide': peptide_encoding,
         }
-        pseudosequence_length = None
-        if allele_pseudosequences is not None:
-            pseudosequences_input = self.pseudosequence_to_network_input(
-                allele_pseudosequences)
-            pseudosequence_length = len(pseudosequences_input[0])
-            x_dict_without_random_negatives['pseudosequence'] = (
-                pseudosequences_input)
+        allele_encoding_dims = None
+        if allele_encoding is not None:
+            allele_encoding_input = self.allele_encoding_to_network_input(
+                allele_encoding)
+            allele_encoding_dims = allele_encoding_input.shape[1:]
+            x_dict_without_random_negatives['allele'] = allele_encoding_input
 
         # Shuffle y_values and the contents of x_dict_without_random_negatives
         # This ensures different data is used for the test set for early stopping
@@ -480,7 +505,7 @@ class Class1NeuralNetwork(object):
 
         if self.network() is None:
             self._network = self.make_network(
-                pseudosequence_length=pseudosequence_length,
+                allele_encoding_dims=allele_encoding_dims,
                 **self.network_hyperparameter_defaults.subselect(
                     self.hyperparameters))
             self.network().compile(
@@ -537,6 +562,7 @@ class Class1NeuralNetwork(object):
         self.loss_history = collections.defaultdict(list)
         start = time.time()
         last_progress_print = None
+        x_dict_with_random_negatives = {}
         for i in range(self.hyperparameters['max_epochs']):
             random_negative_peptides_list = []
             for (length, count) in num_random_negative.iteritems():
@@ -545,21 +571,45 @@ class Class1NeuralNetwork(object):
                         count,
                         length=length,
                         distribution=aa_distribution))
+            random_negative_peptides = EncodableSequences.create(
+                random_negative_peptides_list)
             random_negative_peptides_encoding = (
-                self.peptides_to_network_input(
-                    random_negative_peptides_list))
-
-            x_dict_with_random_negatives = {
-                "peptide": numpy.concatenate([
-                    random_negative_peptides_encoding,
-                    peptide_encoding,
-                ]) if len(random_negative_peptides_encoding) > 0
-                else peptide_encoding
-            }
-            if pseudosequence_length:
-                # TODO: add random pseudosequences for random negative peptides
-                raise NotImplementedError(
-                    "Allele pseudosequences unsupported with random negatives")
+                self.peptides_to_network_input(random_negative_peptides))
+
+            if not x_dict_with_random_negatives:
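+                # First epoch: allocate the arrays, placing the random
+                # negatives first so later epochs can overwrite them in place.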
+                if len(random_negative_peptides) > 0:
+                    x_dict_with_random_negatives["peptide"] = numpy.concatenate([
+                        random_negative_peptides_encoding,
+                        peptide_encoding,
+                    ])
+                    if 'allele' in x_dict_without_random_negatives:
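+                        # Pair each random negative peptide with an allele
+                        # sampled (with replacement) from the training rows.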
+                        x_dict_with_random_negatives['allele'] = numpy.concatenate([
+                            x_dict_without_random_negatives['allele'][
+                                numpy.random.choice(
+                                    x_dict_without_random_negatives[
+                                        'allele'].shape[0],
+                                    size=len(random_negative_peptides_list))],
+                            x_dict_without_random_negatives['allele']
+                        ])
+                else:
+                    x_dict_with_random_negatives = (
+                        x_dict_without_random_negatives)
+            else:
+                # Update x_dict_with_random_negatives in place.
+                # This is more memory efficient than recreating it as above.
+                if len(random_negative_peptides) > 0:
+                    x_dict_with_random_negatives["peptide"][:len(random_negative_peptides)] = (
+                        random_negative_peptides_encoding
+                    )
+                    if 'allele' in x_dict_with_random_negatives:
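+                        # Overwrite the negative rows with alleles resampled
+                        # from the real training rows, which start at index
+                        # len(random_negative_peptides).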
+                        x_dict_with_random_negatives['allele'][:len(random_negative_peptides)] = (
+                            x_dict_with_random_negatives['allele'][
+                                len(random_negative_peptides) + numpy.random.choice(
+                                    x_dict_with_random_negatives['allele'].shape[0] -
+                                    len(random_negative_peptides),
+                                    size=len(random_negative_peptides))
+                            ]
+                        )
 
             fit_history = self.network().fit(
                 x_dict_with_random_negatives,
@@ -610,7 +660,7 @@ class Class1NeuralNetwork(object):
                         break
         self.fit_seconds = time.time() - start
 
-    def predict(self, peptides, allele_pseudosequences=None, batch_size=4096):
+    def predict(self, peptides, allele_encoding=None, batch_size=4096):
         """
         Predict affinities
         
@@ -618,7 +668,7 @@ class Class1NeuralNetwork(object):
         ----------
         peptides : EncodableSequences or list of string
         
-        allele_pseudosequences : EncodableSequences or list of string, optional
+        allele_encoding : AlleleEncoding, optional
             Only required when this model is a pan-allele model
 
         batch_size : int
@@ -631,7 +681,7 @@ class Class1NeuralNetwork(object):
         x_dict = {
             'peptide': self.peptides_to_network_input(peptides)
         }
-        if allele_pseudosequences is not None:
+        if allele_encoding is not None:
-            pseudosequences_input = self.pseudosequence_to_network_input(
-                allele_pseudosequences)
-            x_dict['pseudosequence'] = pseudosequences_input
+            allele_input = self.allele_encoding_to_network_input(
+                allele_encoding)
+            x_dict['allele'] = allele_input
@@ -643,13 +693,15 @@ class Class1NeuralNetwork(object):
 
     @staticmethod
     def make_network(
-            pseudosequence_length,
+            allele_encoding_dims,
             kmer_size,
             peptide_amino_acid_encoding,
-            use_embedding,
             embedding_input_dim,
             embedding_output_dim,
-            pseudosequence_use_embedding,
+            allele_dense_layer_sizes,
+            peptide_dense_layer_sizes,
+            peptide_allele_merge_method,
+            peptide_allele_merge_activation,
             layer_sizes,
             dense_layer_l1_regularization,
             dense_layer_l2_regularization,
@@ -673,7 +725,7 @@ class Class1NeuralNetwork(object):
         from keras.layers.embeddings import Embedding
         from keras.layers.normalization import BatchNormalization
 
-        if use_embedding or peptide_amino_acid_encoding == "embedding":
+        if peptide_amino_acid_encoding == "embedding":
             peptide_input = Input(
                 shape=(kmer_size,), dtype='int32', name='peptide')
             current_layer = Embedding(
@@ -693,6 +745,12 @@ class Class1NeuralNetwork(object):
 
         inputs = [peptide_input]
 
+        kernel_regularizer = None
+        l1 = dense_layer_l1_regularization
+        l2 = dense_layer_l2_regularization
+        if l1 > 0 or l2 > 0:
+            kernel_regularizer = keras.regularizers.l1_l2(l1, l2)
+
         for (i, locally_connected_params) in enumerate(locally_connected_layers):
             current_layer = keras.layers.LocallyConnected1D(
                 name="lc_%d" % i,
@@ -700,6 +758,13 @@ class Class1NeuralNetwork(object):
 
         current_layer = Flatten(name="flattened_0")(current_layer)
 
+        for (i, layer_size) in enumerate(peptide_dense_layer_sizes):
+            current_layer = Dense(
+                layer_size,
+                name="peptide_dense_%d" % i,
+                kernel_regularizer=kernel_regularizer,
+                activation=activation)(current_layer)
+
         if batch_normalization:
             current_layer = BatchNormalization(name="batch_norm_early")(
                 current_layer)
@@ -708,37 +773,44 @@ class Class1NeuralNetwork(object):
             current_layer = Dropout(dropout_probability, name="dropout_early")(
                 current_layer)
 
-        if pseudosequence_length:
-            if pseudosequence_use_embedding:
-                pseudosequence_input = Input(
-                    shape=(pseudosequence_length,),
-                    dtype='int32',
-                    name='pseudosequence')
-                pseudo_embedding_layer = Embedding(
-                    input_dim=embedding_input_dim,
-                    output_dim=embedding_output_dim,
-                    input_length=pseudosequence_length,
-                    embeddings_initializer=embedding_init_method)(
-                    pseudosequence_input)
+        if allele_encoding_dims:
+            allele_input = Input(
+                shape=allele_encoding_dims,
+                dtype='float32',
+                name='allele')
+            inputs.append(allele_input)
+            allele_embedding_layer = Flatten(name="allele_flat")(allele_input)
+
+            for (i, layer_size) in enumerate(allele_dense_layer_sizes):
+                allele_embedding_layer = Dense(
+                    layer_size,
+                    name="allele_dense_%d" % i,
+                    kernel_regularizer=kernel_regularizer,
+                    activation=activation)(allele_embedding_layer)
+
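+            # Note: "multiply" requires the peptide and allele
+            # representations to have the same width; size
+            # peptide_dense_layer_sizes and allele_dense_layer_sizes
+            # accordingly.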
+            if peptide_allele_merge_method == 'concatenate':
+                current_layer = keras.layers.concatenate([
+                    current_layer, allele_embedding_layer
+                ], name="allele_peptide_merged")
+            elif peptide_allele_merge_method == 'multiply':
+                current_layer = keras.layers.multiply([
+                    current_layer, allele_embedding_layer
+                ], name="allele_peptide_merged")
             else:
-                pseudosequence_input = Input(
-                    shape=(pseudosequence_length, 21),
-                    dtype='float32', name='peptide')
-                pseudo_embedding_layer = pseudosequence_input
-            inputs.append(pseudosequence_input)
-            pseudo_embedding_layer = Flatten(name="flattened_1")(
-                pseudo_embedding_layer)
-
-            current_layer = keras.layers.concatenate([
-                current_layer, pseudo_embedding_layer], name="concatenated_0")
+                raise ValueError(
+                    "Unsupported peptide_allele_encoding_merge_method: %s"
+                    % peptide_allele_merge_method)
+
+            if peptide_allele_merge_activation:
+                current_layer = keras.layers.Activation(
+                    peptide_allele_merge_activation,
+                    name="alelle_peptide_merged_%s" %
+                         peptide_allele_merge_activation)(current_layer)
             
         for (i, layer_size) in enumerate(layer_sizes):
-            kernel_regularizer = None
-            l1 = dense_layer_l1_regularization
-            l2 = dense_layer_l2_regularization
-            if l1 > 0 or l2 > 0:
-                kernel_regularizer = keras.regularizers.l1_l2(l1, l2)
-
             current_layer = Dense(
                 layer_size,
                 activation=activation,
diff --git a/mhcflurry/encodable_sequences.py b/mhcflurry/encodable_sequences.py
index aab2b0fe..6dcd5e63 100644
--- a/mhcflurry/encodable_sequences.py
+++ b/mhcflurry/encodable_sequences.py
@@ -104,7 +104,6 @@ class EncodableSequences(object):
         -------
         numpy.array with shape (num sequences, max_length, m) where m is
         vector_encoding_length(vector_encoding_name)
-
         """
         cache_key = (
             "fixed_length_vector_encoding",
-- 
GitLab