diff --git a/README.md b/README.md
index 99945648ed109e901bdd426a32a45bbefd061409..bf02a88b45a5e4d4a3de740685a082dfc4c8105c 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,7 @@ If you find MHCflurry useful in your research please cite:
 
 > O'Donnell, T. et al., 2017. MHCflurry: open-source class I MHC binding affinity prediction. bioRxiv. Available at: http://www.biorxiv.org/content/early/2017/08/09/174243.
 
-## Setup
+## Setup (pip)
 
 Install the package:
 
@@ -56,6 +56,32 @@ export KERAS_BACKEND=theano
 
 You may also need to `pip install theano`.
 
+## Setup (conda)
+
+You can alternatively get up and running with a [conda](https://conda.io/docs/)
+environment as follows:
+
+```
+conda create -q -n mhcflurry-env python=3.6 'tensorflow>=1.1.0'
+source activate mhcflurry-env
+```
+
+Then continue as above:
+
+```
+pip install mhcflurry
+mhcflurry-downloads fetch
+```
+
+If you wish to test your installation, you can install `nose` and run the tests
+from a checkout:
+
+```
+pip install nose
+nosetests .
+```
+
+
 ## Making predictions from the command-line
 
 
@@ -103,7 +129,7 @@ notebook for an overview of the Python API, including fitting your own predictor
 An ensemble of eight single-allele models was trained for each allele with at least
 100 measurements in the training set (118 alleles). The models were trained on a
 random 80% sample of the data for the allele and the remaining 20% was used for
-early stopping. All models use the same [architecture](downloads-generation/models_class1/hyperparameters.json). The
+early stopping. All models use the same [architecture](downloads-generation/models_class1/hyperparameters.yaml). The
 predictions are taken to be the geometric mean of the nM binding affinity
 predictions of the individual models. The training script is [here](downloads-generation/models_class1/GENERATE.sh).
 
diff --git a/downloads-generation/models_class1/GENERATE.sh b/downloads-generation/models_class1/GENERATE.sh
index ff06a043083a585eacad7c0ef8425f9cee9f9686..ae98f51ebe9a09381c3dd497d49dfcdd2643dfdb 100755
--- a/downloads-generation/models_class1/GENERATE.sh
+++ b/downloads-generation/models_class1/GENERATE.sh
@@ -25,11 +25,11 @@ cd $SCRATCH_DIR/$DOWNLOAD_NAME
 
 mkdir models
 
-cp $SCRIPT_DIR/hyperparameters.json .
+cp $SCRIPT_DIR/hyperparameters.yaml .
 
 time mhcflurry-class1-train-allele-specific-models \
     --data "$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" \
-    --hyperparameters hyperparameters.json \
+    --hyperparameters hyperparameters.yaml \
     --out-models-dir models \
     --min-measurements-per-allele 200
diff --git a/downloads-generation/models_class1/hyperparameters.json b/downloads-generation/models_class1/hyperparameters.json
deleted file mode 100644
index 75fd5d646d05fd3a655aea0b42527f18892c44cc..0000000000000000000000000000000000000000
--- a/downloads-generation/models_class1/hyperparameters.json
+++ /dev/null
@@ -1,37 +0,0 @@
-[
-    {
-        "n_models": 12,
-        "max_epochs": 500,
-        "patience": 10,
-        "early_stopping": true,
-        "validation_split": 0.2,
-
-        "random_negative_rate": 0.0,
-        "random_negative_constant": 25,
-
-        "use_embedding": false,
-        "kmer_size": 15,
-        "batch_normalization": false,
-        "locally_connected_layers": [
-            {
-                "filters": 8,
-                "activation": "tanh",
-                "kernel_size": 3
-            },
-            {
-                "filters": 8,
-                "activation": "tanh",
-                "kernel_size": 3
-            }
-        ],
-        "activation": "relu",
-        "output_activation": "sigmoid",
-        "layer_sizes": [
-            32
-        ],
-        "random_negative_affinity_min": 20000.0,
-        "random_negative_affinity_max": 50000.0,
-        "dense_layer_l1_regularization": 0.001,
-        "dropout_probability": 0.0
-    }
-]
diff --git a/downloads-generation/models_class1/hyperparameters.yaml b/downloads-generation/models_class1/hyperparameters.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0e114b320231a585d8ef78df925caff74dbf43c6
--- /dev/null
+++ b/downloads-generation/models_class1/hyperparameters.yaml
@@ -0,0 +1,54 @@
+[{
+##########################################
+# ENSEMBLE SIZE
+##########################################
+"n_models": 12,
+
+##########################################
+# OPTIMIZATION
+##########################################
+"max_epochs": 500,
+"patience": 10,
+"early_stopping": true,
+"validation_split": 0.2,
+
+##########################################
+# RANDOM NEGATIVE PEPTIDES
+##########################################
+"random_negative_rate": 0.0,
+"random_negative_constant": 25,
+"random_negative_affinity_min": 20000.0,
+"random_negative_affinity_max": 50000.0,
+
+##########################################
+# PEPTIDE REPRESENTATION
+##########################################
+# One of "one-hot", "embedding", or "BLOSUM62".
+"peptide_amino_acid_encoding": "BLOSUM62",
+"use_embedding": false,  # maintained for backward compatibility
+"kmer_size": 15,
+
+##########################################
+# NEURAL NETWORK ARCHITECTURE
+##########################################
+"locally_connected_layers": [
+    {
+        "filters": 8,
+        "activation": "tanh",
+        "kernel_size": 3
+    },
+    {
+        "filters": 8,
+        "activation": "tanh",
+        "kernel_size": 3
+    }
+],
+"activation": "relu",
+"output_activation": "sigmoid",
+"layer_sizes": [
+    32
+],
+"dense_layer_l1_regularization": 0.001,
+"batch_normalization": false,
+"dropout_probability": 0.0,
+}]
diff --git a/mhcflurry/__init__.py b/mhcflurry/__init__.py
index ab21634ef51b18b023376f6919e7a15382aa44f2..cf13e1e6339778da3c658e5d9b64afddfac31ecb 100644
--- a/mhcflurry/__init__.py
+++ b/mhcflurry/__init__.py
@@ -17,7 +17,7 @@ from .class1_affinity_prediction.class1_neural_network import (
 from .class1_affinity_prediction.class1_affinity_predictor import (
     Class1AffinityPredictor)
 
-__version__ = "0.9.2"
+__version__ = "0.9.3"
 
 __all__ = [
     "Class1NeuralNetwork",
diff --git a/mhcflurry/class1_affinity_prediction/class1_neural_network.py b/mhcflurry/class1_affinity_prediction/class1_neural_network.py
index fe8de016b2de9009bdcb3a0e087471760e969ea5..13a7f55c8250255a3fdc43a4c5017505801bfe6a 100644
--- a/mhcflurry/class1_affinity_prediction/class1_neural_network.py
+++ b/mhcflurry/class1_affinity_prediction/class1_neural_network.py
@@ -9,14 +9,17 @@ import keras.models
 import keras.layers.pooling
 import keras.regularizers
 from keras.layers import Input
-import keras.layers.merge
+import keras.layers
 from keras.layers.core import Dense, Flatten, Dropout
 from keras.layers.embeddings import Embedding
 from keras.layers.normalization import BatchNormalization
 
 from mhcflurry.hyperparameters import HyperparameterDefaults
 
-from ..encodable_sequences import EncodableSequences
+from ..encodable_sequences import (
+    EncodableSequences,
+    available_vector_encodings,
+    vector_encoding_length)
 from ..regression_target import to_ic50, from_ic50
 from ..common import random_peptides, amino_acid_distribution
 
@@ -35,6 +38,7 @@ class Class1NeuralNetwork(object):
     network_hyperparameter_defaults = HyperparameterDefaults(
         kmer_size=15,
         use_embedding=False,
+        peptide_amino_acid_encoding="one-hot",
         embedding_input_dim=21,
         embedding_output_dim=8,
         pseudosequence_use_embedding=False,
@@ -256,20 +260,26 @@ class Class1NeuralNetwork(object):
             numpy.array
         """
         encoder = EncodableSequences.create(peptides)
-        if self.hyperparameters['use_embedding']:
+        if (self.hyperparameters['use_embedding'] or
+                self.hyperparameters['peptide_amino_acid_encoding'] == "embedding"):
             encoded = encoder.variable_length_to_fixed_length_categorical(
                 max_length=self.hyperparameters['kmer_size'],
                 **self.input_encoding_hyperparameter_defaults.subselect(
                     self.hyperparameters))
-        else:
-            encoded = encoder.variable_length_to_fixed_length_one_hot(
+        elif (
+                self.hyperparameters['peptide_amino_acid_encoding'] in
+                available_vector_encodings()):
+            encoded = encoder.variable_length_to_fixed_length_vector_encoding(
+                self.hyperparameters['peptide_amino_acid_encoding'],
                 max_length=self.hyperparameters['kmer_size'],
                 **self.input_encoding_hyperparameter_defaults.subselect(
                     self.hyperparameters))
+        else:
+            raise ValueError("Unsupported peptide_amino_acid_encoding: %s" %
+                self.hyperparameters['peptide_amino_acid_encoding'])
         assert len(encoded) == len(peptides)
         return encoded
-
     @property
     def supported_peptide_lengths(self):
         """
@@ -302,7 +312,8 @@ class Class1NeuralNetwork(object):
         if self.hyperparameters['pseudosequence_use_embedding']:
             encoded = encoder.fixed_length_categorical()
         else:
-            encoded = encoder.fixed_length_one_hot()
+            raise NotImplementedError
+            # encoded = encoder.fixed_length_one_hot()
         assert len(encoded) == len(pseudosequences)
         return encoded
 
@@ -492,7 +503,8 @@ class Class1NeuralNetwork(object):
             pseudosequences_input = self.pseudosequence_to_network_input(
                 allele_pseudosequences)
             x_dict['pseudosequence'] = pseudosequences_input
-        (predictions,) = numpy.array(self.network(borrow=True).predict(x_dict)).T
+        (predictions,) = numpy.array(
+            self.network(borrow=True).predict(x_dict), dtype="float64").T
         return to_ic50(predictions)
 
     def compile(self):
@@ -507,6 +519,7 @@ class Class1NeuralNetwork(object):
     def make_network(
             pseudosequence_length,
             kmer_size,
+            peptide_amino_acid_encoding,
            use_embedding,
             embedding_input_dim,
             embedding_output_dim,
@@ -524,7 +537,7 @@ class Class1NeuralNetwork(object):
         """
        Helper function to make a keras network for class1 affinity prediction.
         """
-        if use_embedding:
+        if use_embedding or peptide_amino_acid_encoding == "embedding":
             peptide_input = Input(
                 shape=(kmer_size,), dtype='int32', name='peptide')
             current_layer = Embedding(
@@ -535,7 +548,11 @@ class Class1NeuralNetwork(object):
                 name="peptide_embedding")(peptide_input)
         else:
             peptide_input = Input(
-                shape=(kmer_size, 21), dtype='float32', name='peptide')
+                shape=(
+                    kmer_size,
+                    vector_encoding_length(peptide_amino_acid_encoding)),
+                dtype='float32',
+                name='peptide')
             current_layer = peptide_input
 
         inputs = [peptide_input]
@@ -609,4 +626,7 @@ class Class1NeuralNetwork(object):
             inputs=inputs,
             outputs=[output],
             name="predictor")
+
+        print("*** ARCHITECTURE ***")
+        model.summary()
         return model
diff --git a/mhcflurry/class1_affinity_prediction/train_allele_specific_models_command.py b/mhcflurry/class1_affinity_prediction/train_allele_specific_models_command.py
index 6f0aa97529613321e2f3986a9d38201f3bd27983..1128ca99a3dfaef999d2b131ea3f6865faf53127 100644
--- a/mhcflurry/class1_affinity_prediction/train_allele_specific_models_command.py
+++ b/mhcflurry/class1_affinity_prediction/train_allele_specific_models_command.py
@@ -5,7 +5,7 @@ Train Class1 single allele models.
 import os
 import sys
 import argparse
-import json
+import yaml
 
 import pandas
 
@@ -31,7 +31,7 @@ parser.add_argument(
     "--hyperparameters",
     metavar="FILE.json",
     required=True,
-    help="JSON of hyperparameters")
+    help="JSON or YAML of hyperparameters")
 parser.add_argument(
     "--allele",
     default=None,
@@ -61,7 +61,7 @@ def run(argv=sys.argv[1:]):
 
     configure_logging(verbose=args.verbosity > 1)
 
-    hyperparameters_lst = json.load(open(args.hyperparameters))
+    hyperparameters_lst = yaml.load(open(args.hyperparameters))
     assert isinstance(hyperparameters_lst, list)
     print("Loaded hyperparameters list: %s" % str(hyperparameters_lst))
 
diff --git a/mhcflurry/encodable_sequences.py b/mhcflurry/encodable_sequences.py
index d5e9c7a0b0e1cd4c18054d546f2fa04c456a5645..4abc94a1032860594bc091ba9043cc94320ffd30 100644
--- a/mhcflurry/encodable_sequences.py
+++ b/mhcflurry/encodable_sequences.py
@@ -22,11 +22,75 @@ import math
 
 import pandas
 import numpy
+from six import StringIO
 
 import typechecks
 
 from . import amino_acid
 
+AMINO_ACIDS = list(amino_acid.COMMON_AMINO_ACIDS_WITH_UNKNOWN.keys())
+
+BLOSUM62_MATRIX = pandas.read_table(StringIO("""
+   A  R  N  D  C  Q  E  G  H  I  L  K  M  F  P  S  T  W  Y  V  X
+A  4 -1 -2 -2  0 -1 -1  0 -2 -1 -1 -1 -1 -2 -1  1  0 -3 -2  0  0
+R -1  5  0 -2 -3  1  0 -2  0 -3 -2  2 -1 -3 -2 -1 -1 -3 -2 -3  0
+N -2  0  6  1 -3  0  0  0  1 -3 -3  0 -2 -3 -2  1  0 -4 -2 -3  0
+D -2 -2  1  6 -3  0  2 -1 -1 -3 -4 -1 -3 -3 -1  0 -1 -4 -3 -3  0
+C  0 -3 -3 -3  9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1  0
+Q -1  1  0  0 -3  5  2 -2  0 -3 -2  1  0 -3 -1  0 -1 -2 -1 -2  0
+E -1  0  0  2 -4  2  5 -2  0 -3 -3  1 -2 -3 -1  0 -1 -3 -2 -2  0
+G  0 -2  0 -1 -3 -2 -2  6 -2 -4 -4 -2 -3 -3 -2  0 -2 -2 -3 -3  0
+H -2  0  1 -1 -3  0  0 -2  8 -3 -3 -1 -2 -1 -2 -1 -2 -2  2 -3  0
+I -1 -3 -3 -3 -1 -3 -3 -4 -3  4  2 -3  1  0 -3 -2 -1 -3 -1  3  0
+L -1 -2 -3 -4 -1 -2 -3 -4 -3  2  4 -2  2  0 -3 -2 -1 -2 -1  1  0
+K -1  2  0 -1 -3  1  1 -2 -1 -3 -2  5 -1 -3 -1  0 -1 -3 -2 -2  0
+M -1 -1 -2 -3 -1  0 -2 -3 -2  1  2 -1  5  0 -2 -1 -1 -1 -1  1  0
+F -2 -3 -3 -3 -2 -3 -3 -3 -1  0  0 -3  0  6 -4 -2 -2  1  3 -1  0
+P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4  7 -1 -1 -4 -3 -2  0
+S  1 -1  1  0 -1  0  0  0 -1 -2 -2  0 -1 -2 -1  4  1 -3 -2 -2  0
+T  0 -1  0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1  1  5 -2 -2  0  0
+W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1  1 -4 -3 -2 11  2 -3  0
+Y -2 -2 -2 -3 -2 -1 -2 -3  2 -1 -1 -2 -1  3 -3 -2 -2  2  7 -1  0
+V  0 -3 -3 -3 -1 -2 -2 -3 -3  3  1 -2  1 -1 -2 -2  0 -3 -1  4  0
+X  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  1
+"""), sep='\s+').loc[AMINO_ACIDS, AMINO_ACIDS]
+assert (BLOSUM62_MATRIX == BLOSUM62_MATRIX.T).all().all()
+
+ENCODING_DFS = {
+    "BLOSUM62": BLOSUM62_MATRIX,
+    "one-hot": pandas.DataFrame([
+        [1 if i == j else 0 for i in range(len(AMINO_ACIDS))]
+        for j in range(len(AMINO_ACIDS))
+    ], index=AMINO_ACIDS, columns=AMINO_ACIDS)
+}
+
+
+def available_vector_encodings():
+    """
+    Return list of supported amino acid vector encodings.
+
+    Returns
+    -------
+    list of string
+
+    """
+    return list(ENCODING_DFS)
+
+
+def vector_encoding_length(name):
+    """
+    Return the length of the given vector encoding.
+
+    Parameters
+    ----------
+    name : string
+
+    Returns
+    -------
+    int
+    """
+    return ENCODING_DFS[name].shape[1]
+
 
 def index_encoding(sequences, letter_to_index_dict):
     """
@@ -47,38 +111,23 @@ def index_encoding(sequences, letter_to_index_dict):
     return result.values
 
 
-def one_hot_encoding(index_encoded, alphabet_size):
+def fixed_vectors_encoding(sequences, letter_to_vector_function):
     """
-    Given an n * k array of integers in the range [0, alphabet_size), return
-    an n * k * alphabet_size array where element (i, k, j) is 1 if element
-    (i, k) == j in the input array and zero otherwise.
-
+    Given a sequence of n strings all of length k, return an n * k * m array
+    where the (i, j)th element is letter_to_vector_function(sequence[i][j]).
+
     Parameters
     ----------
-    index_encoded : numpy.array of integers with shape (n, k)
-    alphabet_size : int
+    sequences : list of length n of strings of length k
+    letter_to_vector_function : function : string -> vector of length m
 
     Returns
     -------
-    numpy.array of integers of shape (n, k, alphabet_size)
-
+    numpy.array of integers with shape (n, k, m)
     """
-    alphabet_size = int(alphabet_size)
-    (num_sequences, sequence_length) = index_encoded.shape
-    result = numpy.zeros(
-        (num_sequences, sequence_length, alphabet_size),
-        dtype='int32')
-
-    # Transform the index encoded array into an array of indices into the
-    # flattened result, which we will set to 1.
-    flattened_indices = (
-        index_encoded +
-        (
-            sequence_length * alphabet_size * numpy.arange(num_sequences)
-        ).reshape((-1, 1)) +
-        numpy.tile(numpy.arange(sequence_length),
-                   (num_sequences, 1)) * alphabet_size)
-    result.put(flattened_indices, 1)
+    arr = numpy.array([list(s) for s in sequences])
+    result = numpy.vectorize(
+        letter_to_vector_function, signature='()->(n)')(arr)
     return result
 
 
@@ -114,40 +163,6 @@ class EncodableSequences(object):
     def __len__(self):
         return len(self.sequences)
 
-    def fixed_length_categorical(self):
-        """
-        Returns a categorical encoding (i.e. integers 0 <= x < 21) of the
-        sequences, which must already be all the same length.
-
-        Returns
-        -------
-        numpy.array of integers
-        """
-        cache_key = ("categorical",)
-        if cache_key not in self.encoding_cache:
-            assert self.fixed_sequence_length
-            self.encoding_cache[cache_key] = index_encoding(
-                self.sequences, amino_acid.AMINO_ACID_INDEX)
-        return self.encoding_cache[cache_key]
-
-    def fixed_length_one_hot(self):
-        """
-        Returns a binary one-hot encoding of the sequences, which must already
-        be all the same length.
-
-        Returns
-        -------
-        numpy.array of integers
-        """
-        cache_key = ("one_hot",)
-        if cache_key not in self.encoding_cache:
-            assert self.fixed_sequence_length
-            encoded = self.categorical_encoding()
-            result = one_hot_encoding(
-                encoded, alphabet_size=len(amino_acid.AMINO_ACID_INDEX))
-            self.encoding_cache[cache_key] = result
-        return self.encoding_cache[cache_key]
-
     def variable_length_to_fixed_length_categorical(
             self, left_edge=4, right_edge=4, max_length=15):
         """
@@ -187,8 +202,8 @@ class EncodableSequences(object):
                 fixed_length_sequences, amino_acid.AMINO_ACID_INDEX)
         return self.encoding_cache[cache_key]
 
-    def variable_length_to_fixed_length_one_hot(
-            self, left_edge=4, right_edge=4, max_length=15):
+    def variable_length_to_fixed_length_vector_encoding(
+            self, vector_encoding_name, left_edge=4, right_edge=4, max_length=15):
         """
         Encode variable-length sequences using a fixed-length encoding designed
         for preserving the anchor positions of class I peptides.
@@ -198,35 +213,43 @@ class EncodableSequences(object):
 
         Parameters
         ----------
+        vector_encoding_name : string
+            How to represent amino acids.
+            One of "BLOSUM62", "one-hot", etc. Full list of supported vector
+            encodings is given by available_vector_encodings().
         left_edge : int, size of fixed-position left side
         right_edge : int, size of the fixed-position right side
         max_length : sequence length of the resulting encoding
 
         Returns
         -------
-        binary numpy.array with shape (num sequences, max_length, 21)
-        """
+        numpy.array with shape (num sequences, max_length, m) where m is
+        vector_encoding_length(vector_encoding_name)
+        """
         cache_key = (
-            "fixed_length_one_hot",
+            "fixed_length_vector_encoding",
+            vector_encoding_name,
             left_edge,
             right_edge,
             max_length)
-
         if cache_key not in self.encoding_cache:
-            encoded = self.variable_length_to_fixed_length_categorical(
-                left_edge=left_edge,
-                right_edge=right_edge,
-                max_length=max_length)
-            result = one_hot_encoding(
-                encoded, alphabet_size=len(amino_acid.AMINO_ACID_INDEX))
-            assert result.shape == (
-                len(self.sequences),
-                encoded.shape[1],
-                len(amino_acid.AMINO_ACID_INDEX))
+            fixed_length_sequences = [
+                self.sequence_to_fixed_length_string(
+                    sequence,
+                    left_edge=left_edge,
+                    right_edge=right_edge,
+                    max_length=max_length)
+                for sequence in self.sequences
+            ]
+            result = fixed_vectors_encoding(
+                fixed_length_sequences,
+                ENCODING_DFS[vector_encoding_name].loc.__getitem__)
+            assert result.shape[0] == len(self.sequences)
             self.encoding_cache[cache_key] = result
         return self.encoding_cache[cache_key]
 
+
     @classmethod
     def sequence_to_fixed_length_string(
             klass, sequence, left_edge=4, right_edge=4, max_length=15):
diff --git a/requirements.txt b/requirements.txt
index 51a6f3a0b6275aaed36f757a1695f2150e59acd8..8d31007f08081ee473243222f7edf4eef1b2a4b4 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,10 @@
 six
-numpy>= 1.11
+numpy>=1.11
 pandas>=0.13.1
-Keras==2.0.8
+Keras==2.0.9
+tensorflow>=1.1.0
 appdirs
-tensorflow
 scikit-learn
 typechecks
 mhcnames
+pyyaml
diff --git a/setup.py b/setup.py
index ae7b0a20d219218b0e7980dff534a08887f6a55c..6f45d03764462b9b7248bd59b61e402893aace5b 100644
--- a/setup.py
+++ b/setup.py
@@ -30,14 +30,14 @@ try:
     with open(readme_filename, 'r') as f:
         readme = f.read()
 except:
-    logging.warn("Failed to load %s" % readme_filename)
+    logging.warning("Failed to load %s" % readme_filename)
     readme = ""
 
 try:
     import pypandoc
     readme = pypandoc.convert(readme, to='rst', format='md')
 except:
-    logging.warn("Conversion of long_description from MD to RST failed")
+    logging.warning("Conversion of long_description from MD to RST failed")
     pass
 
 with open('mhcflurry/__init__.py', 'r') as f:
@@ -51,12 +51,13 @@ if __name__ == '__main__':
         'six',
         'numpy>=1.11',
         'pandas>=0.13.1',
-        'Keras==2.0.8',
+        'Keras==2.0.9',
         'appdirs',
-        'tensorflow',
+        'tensorflow>=1.1.0',
         'scikit-learn',
         'typechecks',
         'mhcnames',
+        'pyyaml',
     ]
     if PY2:
         # concurrent.futures is a standard library in Py3 but Py2
diff --git a/test/test_encodable_sequences.py b/test/test_encodable_sequences.py
index 8252f00ac72fcdc7d58aed6caa4357a08f696379..cc47290a08eb05e3aa8f282f7cefbe57af669b5a 100644
--- a/test/test_encodable_sequences.py
+++ b/test/test_encodable_sequences.py
@@ -1,6 +1,7 @@
 from mhcflurry import encodable_sequences
 from nose.tools import eq_
 from numpy.testing import assert_equal
+import numpy
 
 letter_to_index_dict = {
     'A': 0,
@@ -18,7 +19,13 @@ def test_index_and_one_hot_encoding():
         [0, 0, 0, 0],
         [0, 1, 2, 0],
     ])
-    one_hot = encodable_sequences.one_hot_encoding(index_encoding, 3)
+    one_hot = encodable_sequences.fixed_vectors_encoding(
+        index_encoding,
+        {
+            0: numpy.array([1, 0, 0]),
+            1: numpy.array([0, 1, 0]),
+            2: numpy.array([0, 0, 1]),
+        }.get)
     eq_(one_hot.shape, (2, 4, 3))
     assert_equal(
         one_hot[0],
diff --git a/test/test_train_allele_specific_models_command.py b/test/test_train_allele_specific_models_command.py
index 41fa9e3f88d3acdb365fbb2eb957c87573ddead8..5c3fbcfc961d1e2690c6dc46ca79262b11fe3451 100644
--- a/test/test_train_allele_specific_models_command.py
+++ b/test/test_train_allele_specific_models_command.py
@@ -53,7 +53,7 @@ def test_run():
     try:
         models_dir = tempfile.mkdtemp(prefix="mhcflurry-test-models")
         hyperparameters_filename = os.path.join(
-            models_dir, "hyperparameters.json")
+            models_dir, "hyperparameters.yaml")
         with open(hyperparameters_filename, "w") as fd:
             json.dump(HYPERPARAMETERS, fd)
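
For orientation, here is a minimal sketch of the vector-encoding API this patch introduces. The helper names come from `mhcflurry/encodable_sequences.py` above; the sketch assumes a checkout with this patch applied and uses arbitrary example peptides:

```
# Sketch only: exercises the encoding helpers added in this patch.
from mhcflurry.encodable_sequences import (
    EncodableSequences,
    available_vector_encodings,
    vector_encoding_length)

# Supported encodings are the keys of ENCODING_DFS.
print(available_vector_encodings())        # ['BLOSUM62', 'one-hot']

# Each residue maps to a vector over the 20 amino acids plus unknown ("X").
print(vector_encoding_length("BLOSUM62"))  # 21

# Variable-length peptides are first mapped to fixed 15-mer strings that
# preserve the anchor positions, then each letter is replaced by its row
# of the chosen matrix.
encoder = EncodableSequences.create(["SIINFEKL", "SIINFEKLL"])
encoded = encoder.variable_length_to_fixed_length_vector_encoding("BLOSUM62")
print(encoded.shape)                       # (2, 15, 21)
```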