Unverified commit bbcfc2ba authored by Tim O'Donnell, committed by GitHub

Merge pull request #114 from hammerlab/2017-11-13

Implement BLOSUM62 encoding
parents 1c3da71b ed9372c2
@@ -22,7 +22,7 @@ If you find MHCflurry useful in your research please cite:
> O'Donnell, T. et al., 2017. MHCflurry: open-source class I MHC binding affinity prediction. bioRxiv. Available at: http://www.biorxiv.org/content/early/2017/08/09/174243.
-## Setup
+## Setup (pip)
Install the package:
@@ -56,6 +56,32 @@ export KERAS_BACKEND=theano
You may also need to `pip install theano`.
## Setup (conda)
You can alternatively get up and running with a [conda](https://conda.io/docs/)
environment as follows:
```
conda create -q -n mhcflurry-env python=3.6 'tensorflow>=1.1.0'
source activate mhcflurry-env
```
Then continue as above:
```
pip install mhcflurry
mhcflurry-downloads fetch
```
If you wish to test your installation, you can install `nose` and run the tests
from a checkout:
```
pip install nose
nosetests .
```
## Making predictions from the command-line
@@ -103,7 +129,7 @@ notebook for an overview of the Python API, including fitting your own predictors
An ensemble of eight single-allele models was trained for each allele with at least
100 measurements in the training set (118 alleles). The models were trained on a
random 80% sample of the data for the allele and the remaining 20% was used for
-early stopping. All models use the same [architecture](downloads-generation/models_class1/hyperparameters.json). The
+early stopping. All models use the same [architecture](downloads-generation/models_class1/hyperparameters.yaml). The
predictions are taken to be the geometric mean of the nM binding affinity
predictions of the individual models. The training script is [here](downloads-generation/models_class1/GENERATE.sh).
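For intuition, here is a sketch of that geometric-mean combination (illustrative, not code from the repository; `per_model_nm` is a hypothetical list of the individual models' nM predictions):
```
import numpy

def ensemble_affinity(per_model_nm):
    # per_model_nm: list of arrays, one per model, each of shape (num_peptides,)
    # Geometric mean = exponential of the mean of the logs.
    stacked = numpy.stack(per_model_nm)
    return numpy.exp(numpy.log(stacked).mean(axis=0))
```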
...
@@ -25,11 +25,11 @@ cd $SCRATCH_DIR/$DOWNLOAD_NAME
mkdir models
-cp $SCRIPT_DIR/hyperparameters.json .
+cp $SCRIPT_DIR/hyperparameters.yaml .
time mhcflurry-class1-train-allele-specific-models \
    --data "$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" \
-    --hyperparameters hyperparameters.json \
+    --hyperparameters hyperparameters.yaml \
    --out-models-dir models \
    --min-measurements-per-allele 200
...
[
    {
        "n_models": 12,
        "max_epochs": 500,
        "patience": 10,
        "early_stopping": true,
        "validation_split": 0.2,
        "random_negative_rate": 0.0,
        "random_negative_constant": 25,
        "use_embedding": false,
        "kmer_size": 15,
        "batch_normalization": false,
        "locally_connected_layers": [
            {
                "filters": 8,
                "activation": "tanh",
                "kernel_size": 3
            },
            {
                "filters": 8,
                "activation": "tanh",
                "kernel_size": 3
            }
        ],
        "activation": "relu",
        "output_activation": "sigmoid",
        "layer_sizes": [
            32
        ],
        "random_negative_affinity_min": 20000.0,
        "random_negative_affinity_max": 50000.0,
        "dense_layer_l1_regularization": 0.001,
        "dropout_probability": 0.0
    }
]
...
[{
    ##########################################
    # ENSEMBLE SIZE
    ##########################################
    "n_models": 12,
    ##########################################
    # OPTIMIZATION
    ##########################################
    "max_epochs": 500,
    "patience": 10,
    "early_stopping": true,
    "validation_split": 0.2,
    ##########################################
    # RANDOM NEGATIVE PEPTIDES
    ##########################################
    "random_negative_rate": 0.0,
    "random_negative_constant": 25,
    "random_negative_affinity_min": 20000.0,
    "random_negative_affinity_max": 50000.0,
    ##########################################
    # PEPTIDE REPRESENTATION
    ##########################################
    # One of "one-hot", "embedding", or "BLOSUM62".
    "peptide_amino_acid_encoding": "BLOSUM62",
    "use_embedding": false,  # maintained for backward compatibility
    "kmer_size": 15,
    ##########################################
    # NEURAL NETWORK ARCHITECTURE
    ##########################################
    "locally_connected_layers": [
        {
            "filters": 8,
            "activation": "tanh",
            "kernel_size": 3
        },
        {
            "filters": 8,
            "activation": "tanh",
            "kernel_size": 3
        }
    ],
    "activation": "relu",
    "output_activation": "sigmoid",
    "layer_sizes": [
        32
    ],
    "dense_layer_l1_regularization": 0.001,
    "batch_normalization": false,
    "dropout_probability": 0.0,
}]
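Moving from JSON to YAML is what makes the inline `#` comments above legal, since JSON has no comment syntax. A minimal sketch of loading the file (filename assumed), mirroring the `yaml.load` call the training command now uses:
```
import yaml

with open("hyperparameters.yaml") as fd:
    hyperparameters_lst = yaml.load(fd)

assert isinstance(hyperparameters_lst, list)
print(hyperparameters_lst[0]["peptide_amino_acid_encoding"])  # BLOSUM62
```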
...
@@ -17,7 +17,7 @@ from .class1_affinity_prediction.class1_neural_network import (
from .class1_affinity_prediction.class1_affinity_predictor import (
    Class1AffinityPredictor)
-__version__ = "0.9.2"
+__version__ = "0.9.3"
__all__ = [
    "Class1NeuralNetwork",
...
@@ -9,14 +9,17 @@ import keras.models
import keras.layers.pooling
import keras.regularizers
from keras.layers import Input
-import keras.layers.merge
+import keras.layers
from keras.layers.core import Dense, Flatten, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from mhcflurry.hyperparameters import HyperparameterDefaults
-from ..encodable_sequences import EncodableSequences
+from ..encodable_sequences import (
+    EncodableSequences,
+    available_vector_encodings,
+    vector_encoding_length)
from ..regression_target import to_ic50, from_ic50
from ..common import random_peptides, amino_acid_distribution
@@ -35,6 +38,7 @@ class Class1NeuralNetwork(object):
    network_hyperparameter_defaults = HyperparameterDefaults(
        kmer_size=15,
        use_embedding=False,
        peptide_amino_acid_encoding="one-hot",
        embedding_input_dim=21,
        embedding_output_dim=8,
        pseudosequence_use_embedding=False,
@@ -256,20 +260,26 @@ class Class1NeuralNetwork(object):
        numpy.array
        """
        encoder = EncodableSequences.create(peptides)
-        if self.hyperparameters['use_embedding']:
+        if (self.hyperparameters['use_embedding'] or
+                self.hyperparameters['peptide_amino_acid_encoding'] == "embedding"):
            encoded = encoder.variable_length_to_fixed_length_categorical(
                max_length=self.hyperparameters['kmer_size'],
                **self.input_encoding_hyperparameter_defaults.subselect(
                    self.hyperparameters))
-        else:
-            encoded = encoder.variable_length_to_fixed_length_one_hot(
+        elif (
+                self.hyperparameters['peptide_amino_acid_encoding'] in
+                available_vector_encodings()):
+            encoded = encoder.variable_length_to_fixed_length_vector_encoding(
+                self.hyperparameters['peptide_amino_acid_encoding'],
                max_length=self.hyperparameters['kmer_size'],
                **self.input_encoding_hyperparameter_defaults.subselect(
                    self.hyperparameters))
        else:
            raise ValueError("Unsupported peptide_amino_acid_encoding: %s" %
                self.hyperparameters['peptide_amino_acid_encoding'])
        assert len(encoded) == len(peptides)
        return encoded
    @property
    def supported_peptide_lengths(self):
        """
@@ -302,7 +312,8 @@ class Class1NeuralNetwork(object):
        if self.hyperparameters['pseudosequence_use_embedding']:
            encoded = encoder.fixed_length_categorical()
        else:
-            encoded = encoder.fixed_length_one_hot()
+            raise NotImplementedError
+            # encoded = encoder.fixed_length_one_hot()
        assert len(encoded) == len(pseudosequences)
        return encoded
@@ -492,7 +503,8 @@ class Class1NeuralNetwork(object):
            pseudosequences_input = self.pseudosequence_to_network_input(
                allele_pseudosequences)
            x_dict['pseudosequence'] = pseudosequences_input
-        (predictions,) = numpy.array(self.network(borrow=True).predict(x_dict)).T
+        (predictions,) = numpy.array(
+            self.network(borrow=True).predict(x_dict), dtype="float64").T
        return to_ic50(predictions)
    def compile(self):
@@ -507,6 +519,7 @@ class Class1NeuralNetwork(object):
    def make_network(
            pseudosequence_length,
            kmer_size,
            peptide_amino_acid_encoding,
            use_embedding,
            embedding_input_dim,
            embedding_output_dim,
@@ -524,7 +537,7 @@ class Class1NeuralNetwork(object):
        """
        Helper function to make a keras network for class1 affinity prediction.
        """
-        if use_embedding:
+        if use_embedding or peptide_amino_acid_encoding == "embedding":
            peptide_input = Input(
                shape=(kmer_size,), dtype='int32', name='peptide')
            current_layer = Embedding(
@@ -535,7 +548,11 @@ class Class1NeuralNetwork(object):
                name="peptide_embedding")(peptide_input)
        else:
            peptide_input = Input(
-                shape=(kmer_size, 21), dtype='float32', name='peptide')
+                shape=(
+                    kmer_size,
+                    vector_encoding_length(peptide_amino_acid_encoding)),
+                dtype='float32',
+                name='peptide')
            current_layer = peptide_input
        inputs = [peptide_input]
@@ -609,4 +626,7 @@ class Class1NeuralNetwork(object):
            inputs=inputs,
            outputs=[output],
            name="predictor")
        print("*** ARCHITECTURE ***")
        model.summary()
        return model
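The peptide input's width is no longer hard-coded to 21 but looked up from the chosen encoding. A quick sketch of what the shape works out to (both built-in encodings happen to have width 21, the 20 amino acids plus X):
```
from mhcflurry.encodable_sequences import vector_encoding_length

kmer_size = 15
for name in ("one-hot", "BLOSUM62"):
    print(name, (kmer_size, vector_encoding_length(name)))  # both give (15, 21)
```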
...
@@ -5,7 +5,7 @@ Train Class1 single allele models.
import os
import sys
import argparse
-import json
+import yaml
import pandas
@@ -31,7 +31,7 @@ parser.add_argument(
    "--hyperparameters",
    metavar="FILE.json",
    required=True,
-    help="JSON of hyperparameters")
+    help="JSON or YAML of hyperparameters")
parser.add_argument(
    "--allele",
    default=None,
@@ -61,7 +61,7 @@ def run(argv=sys.argv[1:]):
    configure_logging(verbose=args.verbosity > 1)
-    hyperparameters_lst = json.load(open(args.hyperparameters))
+    hyperparameters_lst = yaml.load(open(args.hyperparameters))
    assert isinstance(hyperparameters_lst, list)
    print("Loaded hyperparameters list: %s" % str(hyperparameters_lst))
...
@@ -22,11 +22,75 @@ import math
import pandas
import numpy
from six import StringIO
import typechecks
from . import amino_acid
AMINO_ACIDS = list(amino_acid.COMMON_AMINO_ACIDS_WITH_UNKNOWN.keys())
BLOSUM62_MATRIX = pandas.read_table(StringIO("""
A R N D C Q E G H I L K M F P S T W Y V X
A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 0
R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 0
N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 0
D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 0
C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 0
Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0
E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 0
G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 0
H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0
I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 0
L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 0
K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0
M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 0
F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 0
P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 0
S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0
T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 0
W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 0
Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 0
V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 0
X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
"""), sep='\s+').loc[AMINO_ACIDS, AMINO_ACIDS]
assert (BLOSUM62_MATRIX == BLOSUM62_MATRIX.T).all().all()
ENCODING_DFS = {
    "BLOSUM62": BLOSUM62_MATRIX,
    "one-hot": pandas.DataFrame([
        [1 if i == j else 0 for i in range(len(AMINO_ACIDS))]
        for j in range(len(AMINO_ACIDS))
    ], index=AMINO_ACIDS, columns=AMINO_ACIDS)
}


def available_vector_encodings():
    """
    Return list of supported amino acid vector encodings.

    Returns
    -------
    list of string
    """
    return list(ENCODING_DFS)


def vector_encoding_length(name):
    """
    Return the length of the given vector encoding.

    Parameters
    ----------
    name : string

    Returns
    -------
    int
    """
    return ENCODING_DFS[name].shape[1]
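A minimal usage sketch of these helpers (the values follow from the definitions above):
```
from mhcflurry.encodable_sequences import (
    BLOSUM62_MATRIX, available_vector_encodings, vector_encoding_length)

print(sorted(available_vector_encodings()))  # ['BLOSUM62', 'one-hot']
print(vector_encoding_length("BLOSUM62"))    # 21: 20 amino acids plus X
print(BLOSUM62_MATRIX.loc["W", "W"])         # 11; the matrix is symmetric
```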
def index_encoding(sequences, letter_to_index_dict):
    """
@@ -47,38 +111,23 @@ def index_encoding(sequences, letter_to_index_dict):
    return result.values
-def one_hot_encoding(index_encoded, alphabet_size):
-    """
-    Given an n * k array of integers in the range [0, alphabet_size), return
-    an n * k * alphabet_size array where element (i, k, j) is 1 if element
-    (i, k) == j in the input array and zero otherwise.
-
-    Parameters
-    ----------
-    index_encoded : numpy.array of integers with shape (n, k)
-    alphabet_size : int
-
-    Returns
-    -------
-    numpy.array of integers of shape (n, k, alphabet_size)
-    """
-    alphabet_size = int(alphabet_size)
-    (num_sequences, sequence_length) = index_encoded.shape
-    result = numpy.zeros(
-        (num_sequences, sequence_length, alphabet_size),
-        dtype='int32')
-    # Transform the index encoded array into an array of indices into the
-    # flattened result, which we will set to 1.
-    flattened_indices = (
-        index_encoded +
-        (
-            sequence_length * alphabet_size * numpy.arange(num_sequences)
-        ).reshape((-1, 1)) +
-        numpy.tile(numpy.arange(sequence_length),
-            (num_sequences, 1)) * alphabet_size)
-    result.put(flattened_indices, 1)
-    return result
+def fixed_vectors_encoding(sequences, letter_to_vector_function):
+    """
+    Given a sequence of n strings all of length k, return an n * k * m array
+    where the (i, j)th element is letter_to_vector_function(sequences[i][j]).
+
+    Parameters
+    ----------
+    sequences : list of length n of strings of length k
+    letter_to_vector_function : function : string -> vector of length m
+
+    Returns
+    -------
+    numpy.array of integers with shape (n, k, m)
+    """
+    arr = numpy.array([list(s) for s in sequences])
+    result = numpy.vectorize(
+        letter_to_vector_function, signature='()->(n)')(arr)
+    return result
@@ -114,40 +163,6 @@ class EncodableSequences(object):
    def __len__(self):
        return len(self.sequences)
-    def fixed_length_categorical(self):
-        """
-        Returns a categorical encoding (i.e. integers 0 <= x < 21) of the
-        sequences, which must already be all the same length.
-        Returns
-        -------
-        numpy.array of integers
-        """
-        cache_key = ("categorical",)
-        if cache_key not in self.encoding_cache:
-            assert self.fixed_sequence_length
-            self.encoding_cache[cache_key] = index_encoding(
-                self.sequences, amino_acid.AMINO_ACID_INDEX)
-        return self.encoding_cache[cache_key]
-
-    def fixed_length_one_hot(self):
-        """
-        Returns a binary one-hot encoding of the sequences, which must already
-        be all the same length.
-        Returns
-        -------
-        numpy.array of integers
-        """
-        cache_key = ("one_hot",)
-        if cache_key not in self.encoding_cache:
-            assert self.fixed_sequence_length
-            encoded = self.categorical_encoding()
-            result = one_hot_encoding(
-                encoded, alphabet_size=len(amino_acid.AMINO_ACID_INDEX))
-            self.encoding_cache[cache_key] = result
-        return self.encoding_cache[cache_key]
    def variable_length_to_fixed_length_categorical(
            self, left_edge=4, right_edge=4, max_length=15):
        """
@@ -187,8 +202,8 @@ class EncodableSequences(object):
                fixed_length_sequences, amino_acid.AMINO_ACID_INDEX)
        return self.encoding_cache[cache_key]
-    def variable_length_to_fixed_length_one_hot(
-            self, left_edge=4, right_edge=4, max_length=15):
+    def variable_length_to_fixed_length_vector_encoding(
+            self, vector_encoding_name, left_edge=4, right_edge=4, max_length=15):
        """
        Encode variable-length sequences using a fixed-length encoding designed
        for preserving the anchor positions of class I peptides.
@@ -198,35 +213,43 @@ class EncodableSequences(object):
        Parameters
        ----------
        vector_encoding_name : string
            How to represent amino acids.
            One of "BLOSUM62", "one-hot", etc. Full list of supported vector
            encodings is given by available_vector_encodings().
        left_edge : int, size of fixed-position left side
        right_edge : int, size of the fixed-position right side
        max_length : sequence length of the resulting encoding

        Returns
        -------
-        binary numpy.array with shape (num sequences, max_length, 21)
+        numpy.array with shape (num sequences, max_length, m) where m is
+        vector_encoding_length(vector_encoding_name)
        """
        cache_key = (
-            "fixed_length_one_hot",
+            "fixed_length_vector_encoding",
+            vector_encoding_name,
            left_edge,
            right_edge,
            max_length)
        if cache_key not in self.encoding_cache:
-            encoded = self.variable_length_to_fixed_length_categorical(
-                left_edge=left_edge,
-                right_edge=right_edge,
-                max_length=max_length)
-            result = one_hot_encoding(
-                encoded, alphabet_size=len(amino_acid.AMINO_ACID_INDEX))
-            assert result.shape == (
-                len(self.sequences),
-                encoded.shape[1],
-                len(amino_acid.AMINO_ACID_INDEX))
+            fixed_length_sequences = [
+                self.sequence_to_fixed_length_string(
+                    sequence,
+                    left_edge=left_edge,
+                    right_edge=right_edge,
+                    max_length=max_length)
+                for sequence in self.sequences
+            ]
+            result = fixed_vectors_encoding(
+                fixed_length_sequences,
+                ENCODING_DFS[vector_encoding_name].loc.__getitem__)
+            assert result.shape[0] == len(self.sequences)
            self.encoding_cache[cache_key] = result
        return self.encoding_cache[cache_key]
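For example (a sketch: an 8-mer and an 11-mer are both mapped to 15-character fixed-length strings before vector encoding):
```
from mhcflurry.encodable_sequences import EncodableSequences

encoder = EncodableSequences.create(["SIINFEKL", "SIINFEKLQLE"])
encoded = encoder.variable_length_to_fixed_length_vector_encoding("BLOSUM62")
print(encoded.shape)  # (2, 15, 21)
```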
    @classmethod
    def sequence_to_fixed_length_string(
            klass, sequence, left_edge=4, right_edge=4, max_length=15):
...
@@ -30,14 +30,14 @@ try:
    with open(readme_filename, 'r') as f:
        readme = f.read()
except:
-    logging.warn("Failed to load %s" % readme_filename)
+    logging.warning("Failed to load %s" % readme_filename)
    readme = ""
try:
    import pypandoc
    readme = pypandoc.convert(readme, to='rst', format='md')
except:
-    logging.warn("Conversion of long_description from MD to RST failed")
+    logging.warning("Conversion of long_description from MD to RST failed")
    pass
with open('mhcflurry/__init__.py', 'r') as f:
@@ -51,12 +51,13 @@ if __name__ == '__main__':
        'six',
        'numpy>=1.11',
        'pandas>=0.13.1',
-        'Keras==2.0.8',
+        'Keras==2.0.9',
        'appdirs',
-        'tensorflow',
+        'tensorflow>=1.1.0',
        'scikit-learn',
        'typechecks',
        'mhcnames',
        'pyyaml',
    ]
    if PY2:
        # concurrent.futures is a standard library in Py3 but Py2
# concurrent.futures is a standard library in Py3 but Py2 # concurrent.futures is a standard library in Py3 but Py2
from mhcflurry import encodable_sequences
from nose.tools import eq_
from numpy.testing import assert_equal
import numpy

letter_to_index_dict = {
    'A': 0,
@@ -18,7 +19,13 @@ def test_index_and_one_hot_encoding():
        [0, 0, 0, 0],
        [0, 1, 2, 0],
    ])
-    one_hot = encodable_sequences.one_hot_encoding(index_encoding, 3)
+    one_hot = encodable_sequences.fixed_vectors_encoding(
+        index_encoding,
+        {
+            0: numpy.array([1, 0, 0]),
+            1: numpy.array([0, 1, 0]),
+            2: numpy.array([0, 0, 1]),
+        }.get)
    eq_(one_hot.shape, (2, 4, 3))
    assert_equal(
        one_hot[0],
...
@@ -53,7 +53,7 @@ def test_run():
    try:
        models_dir = tempfile.mkdtemp(prefix="mhcflurry-test-models")
        hyperparameters_filename = os.path.join(
-            models_dir, "hyperparameters.json")
+            models_dir, "hyperparameters.yaml")
        with open(hyperparameters_filename, "w") as fd:
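            # HYPERPARAMETERS is still written as JSON; JSON is in practice a
            # subset of YAML, so the training command's yaml.load parses it fine.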
            json.dump(HYPERPARAMETERS, fd)
...