From 984120cf6be654298d1b13f8ea6e610e687a77eb Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Tue, 14 Nov 2017 14:52:02 -0500
Subject: [PATCH] Support BLOSUM62 encoding
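
Generalize the peptide input encoding: peptides can now be represented
as vectors of BLOSUM62 matrix rows, in addition to the existing one-hot
and embedding representations. The encoding is selected by the new
peptide_amino_acid_encoding hyperparameter ("embedding", "one-hot", or
"BLOSUM62"); the old use_embedding flag is still honored for backwards
compatibility. The one-hot specific helpers in encodable_sequences.py
are replaced by a generic fixed_vectors_encoding() driven by per-letter
vector tables (ENCODING_DFS).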

---
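
Notes for reviewers: git am ignores everything between the "---" marker
and the diff, so the sketches below are commentary and are not applied
with the patch.

The encoding is selected through the new peptide_amino_acid_encoding
hyperparameter. A minimal sketch, assuming Class1NeuralNetwork's
constructor accepts hyperparameter overrides as keyword arguments, as
the HyperparameterDefaults machinery suggests (training calls elided):

    from mhcflurry.class1_affinity_prediction.class1_neural_network import (
        Class1NeuralNetwork)

    # Peptides are encoded as rows of the BLOSUM62 matrix rather than
    # one-hot vectors; the network input shape becomes (kmer_size, 21).
    model = Class1NeuralNetwork(peptide_amino_acid_encoding="BLOSUM62")
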
 .../class1_neural_network.py                  |  32 +++-
 mhcflurry/encodable_sequences.py              | 175 ++++++++++--------
 2 files changed, 124 insertions(+), 83 deletions(-)
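
The encoding layer can also be exercised directly; the sketch below uses
only names introduced by this patch. Both current encodings use
21-dimensional vectors (20 common amino acids plus X for unknown), so
vector_encoding_length() returns 21 here:

    from mhcflurry.encodable_sequences import (
        EncodableSequences,
        available_vector_encodings,
        vector_encoding_length)

    print(available_vector_encodings())  # e.g. ['BLOSUM62', 'one-hot']

    encoder = EncodableSequences.create(["SIINFEKL", "SIINFEKLL"])
    encoded = encoder.variable_length_to_fixed_length_vector_encoding(
        "BLOSUM62", max_length=15)
    # One BLOSUM62 row per position of the anchor-preserving fixed-length
    # representation produced by sequence_to_fixed_length_string().
    assert encoded.shape == (2, 15, vector_encoding_length("BLOSUM62"))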

diff --git a/mhcflurry/class1_affinity_prediction/class1_neural_network.py b/mhcflurry/class1_affinity_prediction/class1_neural_network.py
index 8e2a638e..e22cf8dc 100644
--- a/mhcflurry/class1_affinity_prediction/class1_neural_network.py
+++ b/mhcflurry/class1_affinity_prediction/class1_neural_network.py
@@ -16,7 +16,10 @@ from keras.layers.normalization import BatchNormalization
 
 from mhcflurry.hyperparameters import HyperparameterDefaults
 
-from ..encodable_sequences import EncodableSequences
+from ..encodable_sequences import (
+    EncodableSequences,
+    available_vector_encodings,
+    vector_encoding_length)
 from ..regression_target import to_ic50, from_ic50
 from ..common import random_peptides, amino_acid_distribution
 
@@ -35,6 +38,7 @@ class Class1NeuralNetwork(object):
     network_hyperparameter_defaults = HyperparameterDefaults(
         kmer_size=15,
         use_embedding=False,
+        peptide_amino_acid_encoding="one-hot",
         embedding_input_dim=21,
         embedding_output_dim=8,
         pseudosequence_use_embedding=False,
@@ -256,20 +260,26 @@ class Class1NeuralNetwork(object):
         numpy.array
         """
         encoder = EncodableSequences.create(peptides)
-        if self.hyperparameters['use_embedding']:
+        if (self.hyperparameters['use_embedding'] or
+                self.hyperparameters['peptide_amino_acid_encoding'] == "embedding"):
             encoded = encoder.variable_length_to_fixed_length_categorical(
                 max_length=self.hyperparameters['kmer_size'],
                 **self.input_encoding_hyperparameter_defaults.subselect(
                     self.hyperparameters))
-        else:
-            encoded = encoder.variable_length_to_fixed_length_one_hot(
+        elif (
+                self.hyperparameters['peptide_amino_acid_encoding'] in
+                    available_vector_encodings()):
+            encoded = encoder.variable_length_to_fixed_length_vector_encoding(
+                self.hyperparameters['peptide_amino_acid_encoding'],
                 max_length=self.hyperparameters['kmer_size'],
                 **self.input_encoding_hyperparameter_defaults.subselect(
                     self.hyperparameters))
+        else:
+            raise ValueError("Unsupported peptide_amino_acid_encoding: %s" %
+                             self.hyperparameters['peptide_amino_acid_encoding'])
         assert len(encoded) == len(peptides)
         return encoded
 
-
     @property
     def supported_peptide_lengths(self):
         """
@@ -492,7 +502,8 @@ class Class1NeuralNetwork(object):
             pseudosequences_input = self.pseudosequence_to_network_input(
                 allele_pseudosequences)
             x_dict['pseudosequence'] = pseudosequences_input
-        (predictions,) = numpy.array(self.network(borrow=True).predict(x_dict)).T
+        (predictions,) = numpy.array(
+            self.network(borrow=True).predict(x_dict), dtype="float64").T
         return to_ic50(predictions)
 
     def compile(self):
@@ -507,6 +518,7 @@ class Class1NeuralNetwork(object):
     def make_network(
             pseudosequence_length,
             kmer_size,
+            peptide_amino_acid_encoding,
             use_embedding,
             embedding_input_dim,
             embedding_output_dim,
@@ -524,7 +536,7 @@ class Class1NeuralNetwork(object):
         """
         Helper function to make a keras network for class1 affinity prediction.
         """
-        if use_embedding:
+        if use_embedding or peptide_amino_acid_encoding == "embedding":
             peptide_input = Input(
                 shape=(kmer_size,), dtype='int32', name='peptide')
             current_layer = Embedding(
@@ -535,7 +547,11 @@ class Class1NeuralNetwork(object):
                 name="peptide_embedding")(peptide_input)
         else:
             peptide_input = Input(
-                shape=(kmer_size, 21), dtype='float32', name='peptide')
+                shape=(
+                    kmer_size,
+                    vector_encoding_length(peptide_amino_acid_encoding)),
+                dtype='float32',
+                name='peptide')
             current_layer = peptide_input
 
         inputs = [peptide_input]
@@ -609,4 +625,4 @@ class Class1NeuralNetwork(object):
             inputs=inputs,
             outputs=[output],
             name="predictor")
         return model
diff --git a/mhcflurry/encodable_sequences.py b/mhcflurry/encodable_sequences.py
index d5e9c7a0..4abc94a1 100644
--- a/mhcflurry/encodable_sequences.py
+++ b/mhcflurry/encodable_sequences.py
@@ -22,11 +22,75 @@ import math
 
 import pandas
 import numpy
+from six import StringIO
 
 import typechecks
 
 from . import amino_acid
 
+AMINO_ACIDS = list(amino_acid.COMMON_AMINO_ACIDS_WITH_UNKNOWN.keys())
+
+BLOSUM62_MATRIX = pandas.read_table(StringIO("""
+   A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  X
+A  4 -1 -2 -2  0 -1 -1  0 -2 -1 -1 -1 -1 -2 -1  1  0 -3 -2  0  0
+R -1  5  0 -2 -3  1  0 -2  0 -3 -2  2 -1 -3 -2 -1 -1 -3 -2 -3  0
+N -2  0  6  1 -3  0  0  0  1 -3 -3  0 -2 -3 -2  1  0 -4 -2 -3  0
+D -2 -2  1  6 -3  0  2 -1 -1 -3 -4 -1 -3 -3 -1  0 -1 -4 -3 -3  0
+C  0 -3 -3 -3  9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1  0
+Q -1  1  0  0 -3  5  2 -2  0 -3 -2  1  0 -3 -1  0 -1 -2 -1 -2  0
+E -1  0  0  2 -4  2  5 -2  0 -3 -3  1 -2 -3 -1  0 -1 -3 -2 -2  0
+G  0 -2  0 -1 -3 -2 -2  6 -2 -4 -4 -2 -3 -3 -2  0 -2 -2 -3 -3  0
+H -2  0  1 -1 -3  0  0 -2  8 -3 -3 -1 -2 -1 -2 -1 -2 -2  2 -3  0
+I -1 -3 -3 -3 -1 -3 -3 -4 -3  4  2 -3  1  0 -3 -2 -1 -3 -1  3  0
+L -1 -2 -3 -4 -1 -2 -3 -4 -3  2  4 -2  2  0 -3 -2 -1 -2 -1  1  0
+K -1  2  0 -1 -3  1  1 -2 -1 -3 -2  5 -1 -3 -1  0 -1 -3 -2 -2  0
+M -1 -1 -2 -3 -1  0 -2 -3 -2  1  2 -1  5  0 -2 -1 -1 -1 -1  1  0
+F -2 -3 -3 -3 -2 -3 -3 -3 -1  0  0 -3  0  6 -4 -2 -2  1  3 -1  0
+P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4  7 -1 -1 -4 -3 -2  0
+S  1 -1  1  0 -1  0  0  0 -1 -2 -2  0 -1 -2 -1  4  1 -3 -2 -2  0
+T  0 -1  0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1  1  5 -2 -2  0  0
+W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1  1 -4 -3 -2 11  2 -3  0
+Y -2 -2 -2 -3 -2 -1 -2 -3  2 -1 -1 -2 -1  3 -3 -2 -2  2  7 -1  0
+V  0 -3 -3 -3 -1 -2 -2 -3 -3  3  1 -2  1 -1 -2 -2  0 -3 -1  4  0
+X  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1
+"""), sep='\s+').loc[AMINO_ACIDS, AMINO_ACIDS]
+assert (BLOSUM62_MATRIX == BLOSUM62_MATRIX.T).all().all()
+
+ENCODING_DFS = {
+    "BLOSUM62": BLOSUM62_MATRIX,
+    "one-hot": pandas.DataFrame([
+        [1 if i == j else 0 for i in range(len(AMINO_ACIDS))]
+        for j in range(len(AMINO_ACIDS))
+    ], index=AMINO_ACIDS, columns=AMINO_ACIDS)
+}
+
+
+def available_vector_encodings():
+    """
+    Return list of supported amino acid vector encodings.
+
+    Returns
+    -------
+    list of string
+
+    """
+    return list(ENCODING_DFS)
+
+
+def vector_encoding_length(name):
+    """
+    Return the length of the given vector encoding.
+
+    Parameters
+    ----------
+    name : string
+
+    Returns
+    -------
+    int
+    """
+    return ENCODING_DFS[name].shape[1]
+
 
 def index_encoding(sequences, letter_to_index_dict):
     """
@@ -47,38 +111,26 @@ def index_encoding(sequences, letter_to_index_dict):
     return result.values
 
 
-def one_hot_encoding(index_encoded, alphabet_size):
+def fixed_vectors_encoding(sequences, letter_to_vector_function):
     """
-    Given an n * k array of integers in the range [0, alphabet_size), return
-    an n * k * alphabet_size array where element (i, k, j) is 1 if element
-    (i, k) == j in the input array and zero otherwise.
-    
+    Given a list of n strings, all of length k, return an n * k * m array
+    where entry (i, j) is letter_to_vector_function(sequences[i][j]).
+
     Parameters
     ----------
-    index_encoded : numpy.array of integers with shape (n, k)
-    alphabet_size : int 
+    sequences : list of length n of strings of length k
+    letter_to_vector_function : function : string -> vector of length m
 
     Returns
     -------
-    numpy.array of integers of shape (n, k, alphabet_size)
-
+    numpy.array with shape (n, k, m)
     """
-    alphabet_size = int(alphabet_size)
-    (num_sequences, sequence_length) = index_encoded.shape
-    result = numpy.zeros(
-        (num_sequences, sequence_length, alphabet_size),
-        dtype='int32')
-
-    # Transform the index encoded array into an array of indices into the
-    # flattened result, which we will set to 1.
-    flattened_indices = (
-        index_encoded +
-        (
-            sequence_length * alphabet_size * numpy.arange(num_sequences)
-        ).reshape((-1, 1)) +
-        numpy.tile(numpy.arange(sequence_length),
-                   (num_sequences, 1)) * alphabet_size)
-    result.put(flattened_indices, 1)
+    # Convert sequences to an (n, k) array of single-character strings,
+    # then map each character to its length-m vector. The gufunc
+    # signature marks letter_to_vector_function as scalar -> 1-d vector.
+    arr = numpy.array([list(s) for s in sequences])
+    result = numpy.vectorize(
+        letter_to_vector_function, signature='()->(n)')(arr)
     return result
 
 
@@ -114,40 +166,6 @@ class EncodableSequences(object):
     def __len__(self):
         return len(self.sequences)
 
-    def fixed_length_categorical(self):
-        """
-        Returns a categorical encoding (i.e. integers 0 <= x < 21) of the
-        sequences, which must already be all the same length.
-        
-        Returns
-        -------
-        numpy.array of integers
-        """
-        cache_key = ("categorical",)
-        if cache_key not in self.encoding_cache:
-            assert self.fixed_sequence_length
-            self.encoding_cache[cache_key] = index_encoding(
-                self.sequences, amino_acid.AMINO_ACID_INDEX)
-        return self.encoding_cache[cache_key]
-
-    def fixed_length_one_hot(self):
-        """
-        Returns a binary one-hot encoding of the  sequences, which must already
-        be all the same length.
-        
-        Returns
-        -------
-        numpy.array of integers
-        """
-        cache_key = ("one_hot",)
-        if cache_key not in self.encoding_cache:
-            assert self.fixed_sequence_length
-            encoded = self.categorical_encoding()
-            result = one_hot_encoding(
-                encoded, alphabet_size=len(amino_acid.AMINO_ACID_INDEX))
-            self.encoding_cache[cache_key] = result
-        return self.encoding_cache[cache_key]
-
     def variable_length_to_fixed_length_categorical(
             self, left_edge=4, right_edge=4, max_length=15):
         """
@@ -187,8 +205,8 @@ class EncodableSequences(object):
                 fixed_length_sequences, amino_acid.AMINO_ACID_INDEX)
         return self.encoding_cache[cache_key]
 
-    def variable_length_to_fixed_length_one_hot(
-            self, left_edge=4, right_edge=4, max_length=15):
+    def variable_length_to_fixed_length_vector_encoding(
+            self, vector_encoding_name, left_edge=4, right_edge=4, max_length=15):
         """
         Encode variable-length sequences using a fixed-length encoding designed
         for preserving the anchor positions of class I peptides.
@@ -198,35 +216,42 @@ class EncodableSequences(object):
 
         Parameters
         ----------
+        vector_encoding_name : string
+            How to represent amino acids.
+            One of "BLOSUM62", "one-hot", etc. The full list of supported
+            vector encodings is given by available_vector_encodings().
         left_edge : int, size of fixed-position left side
         right_edge : int, size of the fixed-position right side
         max_length : sequence length of the resulting encoding
 
         Returns
         -------
-        binary numpy.array with shape (num sequences, max_length, 21)
-        """
+        numpy.array with shape (num sequences, max_length, m) where m is
+        vector_encoding_length(vector_encoding_name)
 
+        """
         cache_key = (
-            "fixed_length_one_hot",
+            "fixed_length_vector_encoding",
+            vector_encoding_name,
             left_edge,
             right_edge,
             max_length)
-
         if cache_key not in self.encoding_cache:
-            encoded = self.variable_length_to_fixed_length_categorical(
-                left_edge=left_edge,
-                right_edge=right_edge,
-                max_length=max_length)
-            result = one_hot_encoding(
-                encoded, alphabet_size=len(amino_acid.AMINO_ACID_INDEX))
-            assert result.shape == (
-                len(self.sequences),
-                encoded.shape[1],
-                len(amino_acid.AMINO_ACID_INDEX))
+            fixed_length_sequences = [
+                self.sequence_to_fixed_length_string(
+                    sequence,
+                    left_edge=left_edge,
+                    right_edge=right_edge,
+                    max_length=max_length)
+                for sequence in self.sequences
+            ]
+            result = fixed_vectors_encoding(
+                fixed_length_sequences,
+                ENCODING_DFS[vector_encoding_name].loc.__getitem__)
+            assert result.shape[0] == len(self.sequences)
             self.encoding_cache[cache_key] = result
         return self.encoding_cache[cache_key]
 
     @classmethod
     def sequence_to_fixed_length_string(
             klass, sequence, left_edge=4, right_edge=4, max_length=15):
-- 
GitLab