From 6866818808a8189200695385945852577482df66 Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Tue, 14 Jan 2020 07:21:22 -0500 Subject: [PATCH] Working on cleavage predictor --- mhcflurry/class1_cleavage_neural_network.py | 165 +++++++++--- mhcflurry/class1_cleavage_predictor.py | 262 ++++++++++++++++++++ mhcflurry/downloads.py | 34 +++ test/test_class1_cleavage_neural_network.py | 17 +- test/test_class1_cleavage_predictor.py | 62 +++++ 5 files changed, 499 insertions(+), 41 deletions(-) create mode 100644 mhcflurry/class1_cleavage_predictor.py create mode 100644 test/test_class1_cleavage_predictor.py diff --git a/mhcflurry/class1_cleavage_neural_network.py b/mhcflurry/class1_cleavage_neural_network.py index 0f98fd0b..1d105658 100644 --- a/mhcflurry/class1_cleavage_neural_network.py +++ b/mhcflurry/class1_cleavage_neural_network.py @@ -80,27 +80,11 @@ from __future__ import print_function import time import collections -from six import string_types - import numpy -import pandas -import mhcnames -import hashlib from .hyperparameters import HyperparameterDefaults -from .class1_neural_network import Class1NeuralNetwork, DEFAULT_PREDICT_BATCH_SIZE +from .class1_neural_network import DEFAULT_PREDICT_BATCH_SIZE from .encodable_sequences import EncodableSequences -from .regression_target import from_ic50, to_ic50 -from .random_negative_peptides import RandomNegativePeptides -from .allele_encoding import MultipleAlleleEncoding, AlleleEncoding -from .auxiliary_input import AuxiliaryInputEncoder -from .batch_generator import BatchGenerator -from .custom_loss import ( - MSEWithInequalities, - TransformPredictionsLossWrapper, - MultiallelicMassSpecLoss) - - class Class1CleavageNeuralNetwork(object): @@ -111,7 +95,11 @@ class Class1CleavageNeuralNetwork(object): c_flank_length=10, vector_encoding_name="BLOSUM62", flanking_averages=False, - convoluational_kernel_l1=0.001, + convolutional_filters=16, + convolutional_kernel_size=8, + convolutional_activation="relu", + convoluational_kernel_l1_l2=(0.001, 0.001), + dropout_rate=0.5, ) """ Hyperparameters (and their default values) that affect the neural network @@ -121,14 +109,15 @@ class Class1CleavageNeuralNetwork(object): fit_hyperparameter_defaults = HyperparameterDefaults( max_epochs=500, validation_split=0.1, - early_stopping=True + early_stopping=True, + minibatch_size=256, ) """ Hyperparameters for neural network training. """ early_stopping_hyperparameter_defaults = HyperparameterDefaults( - patience=5, + patience=30, min_delta=0.0, ) """ @@ -229,7 +218,7 @@ class Class1CleavageNeuralNetwork(object): x_list, targets, validation_split=self.hyperparameters['validation_split'], - shuffle=True, + batch_size=self.hyperparameters['minibatch_size'], epochs=i + 1, sample_weight=sample_weights, initial_epoch=i, @@ -288,6 +277,12 @@ class Class1CleavageNeuralNetwork(object): fit_info["num_points"] = len(peptides) self.fit_info.append(dict(fit_info)) + if verbose: + print( + "Output weights", + *numpy.array( + self.network.get_layer("output").get_weights()).flatten()) + def predict( self, peptides, @@ -300,7 +295,7 @@ class Class1CleavageNeuralNetwork(object): peptides, n_flanks, c_flanks) raw_predictions = self.network.predict( x_list, batch_size=batch_size) - predictions = numpy.array(raw_predictions, dtype="float64") + predictions = numpy.array(raw_predictions, dtype="float64")[:,0] return predictions def peptides_and_flanking_to_network_input(self, peptides, n_flanks, c_flanks): @@ -353,7 +348,11 @@ class Class1CleavageNeuralNetwork(object): c_flank_length, vector_encoding_name, flanking_averages, - convoluational_kernel_l1): + convolutional_filters, + convolutional_kernel_size, + convolutional_activation, + convoluational_kernel_l1_l2, + dropout_rate): """ Helper function to make a keras network """ @@ -378,8 +377,6 @@ class Class1CleavageNeuralNetwork(object): print((peptides_empty, _, n_flanks_empty, c_flanks_empty)) - #import ipdb ; ipdb.set_trace() - peptide_input1 = Input( shape=peptides_empty.shape[1:], dtype='float32', @@ -404,19 +401,26 @@ class Class1CleavageNeuralNetwork(object): for input_pair in [(n_flank_input, peptide_input1), (peptide_input2, c_flank_input)]: # need to stack them together current_layer = Concatenate(axis=1)(list(input_pair)) - for i in range(1): - current_layer = keras.layers.Conv1D( - filters=int(16 / 2**i), - padding="same", - kernel_size=8, - kernel_regularizer=keras.regularizers.l1(convoluational_kernel_l1), - activation="relu")(current_layer) + current_layer = keras.layers.Conv1D( + filters=convolutional_filters, + kernel_size=convolutional_kernel_size, + kernel_regularizer=keras.regularizers.l1_l2( + *convoluational_kernel_l1_l2), + padding="same", + activation=convolutional_activation)(current_layer) + if dropout_rate > 0: + current_layer = keras.layers.Dropout( + rate=dropout_rate, + noise_shape=( + None, 1, int(current_layer.get_shape()[-1])))( + current_layer) conv_outputs.append(current_layer) current_layer = keras.layers.Conv1D( filters=1, kernel_size=1, - kernel_regularizer=keras.regularizers.l1(convoluational_kernel_l1), - activation="relu")(current_layer) + kernel_regularizer=keras.regularizers.l1_l2( + *convoluational_kernel_l1_l2), + activation=convolutional_activation)(current_layer) single_outputs.append(current_layer) extracted_layers = [] @@ -435,7 +439,9 @@ class Class1CleavageNeuralNetwork(object): :, (n_flank_length + 1) : ])(single_outputs[0]) extracted_layers.append( - keras.layers.pooling.GlobalMaxPooling1D()(peptide_n_cleavage)) + keras.layers.Lambda(lambda x: -x)( + keras.layers.pooling.GlobalMaxPooling1D()( + peptide_n_cleavage))) extracted_layers.append( keras.layers.Lambda( @@ -452,13 +458,16 @@ class Class1CleavageNeuralNetwork(object): :, 0 : peptide_max_length ])(single_outputs[1]) extracted_layers.append( - keras.layers.pooling.GlobalMaxPooling1D()(peptide_c_cleavage)) + keras.layers.Lambda(lambda x: -x)( + keras.layers.pooling.GlobalMaxPooling1D()(peptide_c_cleavage))) current_layer = Concatenate()(extracted_layers) output = Dense( 1, activation="sigmoid", - name="output")(current_layer) + name="output", + kernel_initializer=keras.initializers.Ones(), + )(current_layer) model = keras.models.Model( inputs=inputs, outputs=[output], @@ -466,3 +475,85 @@ class Class1CleavageNeuralNetwork(object): return model + def __getstate__(self): + """ + serialize to a dict. Model weights are included. For pickle support. + + Returns + ------- + dict + + """ + result = self.get_config() + result['network_weights'] = self.get_weights() + return result + + def __setstate__(self, state): + """ + Deserialize. For pickle support. + """ + network_json = state.pop("network_json") + network_weights = state.pop("network_weights") + self.__dict__.update(state) + if network_json is not None: + import keras.models + self.network = keras.models.model_from_json(network_json) + if network_weights is not None: + self.network.set_weights(network_weights) + + def get_weights(self): + """ + Get the network weights + + Returns + ------- + list of numpy.array giving weights for each layer or None if there is no + network + """ + if self.network is None: + return None + return self.network.get_weights() + + def get_config(self): + """ + serialize to a dict all attributes except model weights + + Returns + ------- + dict + """ + result = dict(self.__dict__) + del result['network'] + result['network_json'] = None + if self.network: + result['network_json'] = self.network.to_json() + return result + + @classmethod + def from_config(cls, config, weights=None): + """ + deserialize from a dict returned by get_config(). + + Parameters + ---------- + config : dict + weights : list of array, optional + Network weights to restore + weights_loader : callable, optional + Function to call (no arguments) to load weights when needed + + Returns + ------- + Class1CleavageNeuralNetwork + """ + config = dict(config) + instance = cls(**config.pop('hyperparameters')) + network_json = config.pop('network_json') + instance.__dict__.update(config) + assert instance.network is None + if network_json is not None: + import keras.models + instance.network = keras.models.model_from_json(network_json) + if weights is not None: + instance.network.set_weights(weights) + return instance \ No newline at end of file diff --git a/mhcflurry/class1_cleavage_predictor.py b/mhcflurry/class1_cleavage_predictor.py new file mode 100644 index 00000000..ac44de84 --- /dev/null +++ b/mhcflurry/class1_cleavage_predictor.py @@ -0,0 +1,262 @@ +from __future__ import print_function + +from os.path import join, exists, abspath +from os import mkdir +from socket import gethostname +from getpass import getuser + +import time +import json +import hashlib +import logging +from six import string_types + + +import numpy +import pandas + +from .version import __version__ +from .class1_neural_network import DEFAULT_PREDICT_BATCH_SIZE +from .encodable_sequences import EncodableSequences +from .downloads import get_default_class1_cleavage_models_dir +from .class1_cleavage_neural_network import Class1CleavageNeuralNetwork +from .common import save_weights, load_weights, NumpyJSONEncoder + + +class Class1CleavagePredictor(object): + def __init__( + self, + models, + manifest_df=None, + metadata_dataframes=None): + self.models = models + self._manifest_df = manifest_df + self.metadata_dataframes = ( + dict(metadata_dataframes) if metadata_dataframes else {}) + + @property + def manifest_df(self): + """ + A pandas.DataFrame describing the models included in this predictor. + + Returns + ------- + pandas.DataFrame + """ + if self._manifest_df is None: + rows = [] + for (i, model) in enumerate(self.models): + model_config = model.get_config() + rows.append(( + self.model_name(i), + json.dumps(model_config, cls=NumpyJSONEncoder), + model + )) + self._manifest_df = pandas.DataFrame( + rows, + columns=["model_name", "config_json", "model"]) + return self._manifest_df + + @staticmethod + def model_name(num): + """ + Generate a model name + + Returns + ------- + string + + """ + random_string = hashlib.sha1( + str(time.time()).encode()).hexdigest()[:16] + return "CLEAVAGE-CLASSI-%d-%s" % ( + num, + random_string) + + @staticmethod + def weights_path(models_dir, model_name): + """ + Generate the path to the weights file for a model + + Parameters + ---------- + models_dir : string + model_name : string + + Returns + ------- + string + """ + return join(models_dir, "weights_%s.npz" % model_name) + + def predict( + self, + peptides, + n_flanks, + c_flanks, + batch_size=DEFAULT_PREDICT_BATCH_SIZE): + return self.predict_to_dataframe( + peptides=peptides, + n_flanks=n_flanks, + c_flanks=c_flanks, + batch_size=batch_size).score.values + + def predict_to_dataframe( + self, + peptides, + n_flanks, + c_flanks, + batch_size=DEFAULT_PREDICT_BATCH_SIZE): + + for (name, value) in [ + ("peptides", peptides), + ("n_flanks", n_flanks), + ("c_flanks", c_flanks)]: + if isinstance(value, string_types): + raise TypeError( + "%s must be a list or array, not a string" % name) + + peptides = EncodableSequences.create(peptides) + n_flanks = EncodableSequences.create(n_flanks) + c_flanks = EncodableSequences.create(c_flanks) + + score_array = [] + + for (i, network) in enumerate(self.models): + predictions = network.predict( + peptides=peptides, + n_flanks=n_flanks, + c_flanks=c_flanks, + batch_size=batch_size) + score_array.append(predictions) + + score_array = numpy.array(score_array) + + result_df = pandas.DataFrame({ + "peptide": peptides.sequences, + "n_flank": n_flanks.sequences, + "c_flank": c_flanks.sequences, + "score": numpy.mean(score_array, axis=0), + }) + return result_df + + def check_consistency(self): + """ + Verify that self.manifest_df is consistent with instance variables. + + Currently only checks for agreement on the total number of models. + + Throws AssertionError if inconsistent. + """ + assert len(self.manifest_df) == len(self.models), ( + "Manifest seems out of sync with models: %d vs %d entries: \n%s"% ( + len(self.manifest_df), + len(self.models), + str(self.manifest_df))) + + def save(self, models_dir, model_names_to_write=None, write_metadata=True): + """ + Serialize the predictor to a directory on disk. If the directory does + not exist it will be created. + + The serialization format consists of a file called "manifest.csv" with + the configurations of each Class1CleavageNeuralNetwork, along with + per-network files giving the model weights. + + Parameters + ---------- + models_dir : string + Path to directory. It will be created if it doesn't exist. + """ + self.check_consistency() + + if model_names_to_write is None: + # Write all models + model_names_to_write = self.manifest_df.model_name.values + + if not exists(models_dir): + mkdir(models_dir) + + sub_manifest_df = self.manifest_df.loc[ + self.manifest_df.model_name.isin(model_names_to_write) + ].copy() + + # Network JSON configs may have changed since the models were added, + # so we update the JSON configs here also. + updated_network_config_jsons = [] + for (_, row) in sub_manifest_df.iterrows(): + updated_network_config_jsons.append( + json.dumps(row.model.get_config(), cls=NumpyJSONEncoder)) + weights_path = self.weights_path(models_dir, row.model_name) + save_weights(row.model.get_weights(), weights_path) + logging.info("Wrote: %s", weights_path) + sub_manifest_df["config_json"] = updated_network_config_jsons + self.manifest_df.loc[ + sub_manifest_df.index, + "config_json" + ] = updated_network_config_jsons + + write_manifest_df = self.manifest_df[[ + c for c in self.manifest_df.columns if c != "model" + ]] + manifest_path = join(models_dir, "manifest.csv") + write_manifest_df.to_csv(manifest_path, index=False) + logging.info("Wrote: %s", manifest_path) + + if write_metadata: + # Write "info.txt" + info_path = join(models_dir, "info.txt") + rows = [ + ("trained on", time.asctime()), + ("package ", "mhcflurry %s" % __version__), + ("hostname ", gethostname()), + ("user ", getuser()), + ] + pandas.DataFrame(rows).to_csv( + info_path, sep="\t", header=False, index=False) + + if self.metadata_dataframes: + for (name, df) in self.metadata_dataframes.items(): + metadata_df_path = join(models_dir, "%s.csv.bz2" % name) + df.to_csv(metadata_df_path, index=False, compression="bz2") + + @classmethod + def load(cls, models_dir=None, max_models=None): + """ + Deserialize a predictor from a directory on disk. + + Parameters + ---------- + models_dir : string + Path to directory. If unspecified the default downloaded models are + used. + + max_models : int, optional + Maximum number of models to load + + Returns + ------- + `Class1CleavagePredictor` instance + """ + if models_dir is None: + models_dir = get_default_class1_cleavage_models_dir() + + manifest_path = join(models_dir, "manifest.csv") + manifest_df = pandas.read_csv(manifest_path, nrows=max_models) + + models = [] + for (_, row) in manifest_df.iterrows(): + weights_filename = cls.weights_path(models_dir, row.model_name) + config = json.loads(row.config_json) + model = Class1CleavageNeuralNetwork.from_config( + config, + weights=load_weights(abspath(weights_filename))) + models.append(model) + + manifest_df["model"] = models + + logging.info("Loaded %d class1 cleavage models", len(models)) + result = cls( + models=models, + manifest_df=manifest_df) + return result diff --git a/mhcflurry/downloads.py b/mhcflurry/downloads.py index 1380ade1..33e3116c 100644 --- a/mhcflurry/downloads.py +++ b/mhcflurry/downloads.py @@ -32,6 +32,8 @@ _MHCFLURRY_DEFAULT_CLASS1_MODELS_DIR = environ.get( "MHCFLURRY_DEFAULT_CLASS1_MODELS") _MHCFLURRY_DEFAULT_CLASS1_PRESENTATION_MODELS_DIR = environ.get( "MHCFLURRY_DEFAULT_CLASS1_PRESENTATION_MODELS_DIR") +_MHCFLURRY_DEFAULT_CLASS1_CLEAVAGE_MODELS_DIR = environ.get( + "MHCFLURRY_DEFAULT_CLASS1_CLEAVAGE_MODELS_DIR") def get_downloads_dir(): @@ -120,6 +122,38 @@ def get_default_class1_presentation_models_dir(test_exists=True): "models_class1_pan_refined", "presentation", test_exists=test_exists) +def get_default_class1_cleavage_models_dir(test_exists=True): + """ + Return the absolute path to the default class1 cleavage models dir. + + See `get_default_class1_models_dir`. + + If environment variable MHCFLURRY_DEFAULT_CLASS1_CLEAVAGE_MODELS is set + to an absolute path, return that path. If it's set to a relative path (does + not start with /) then return that path taken to be relative to the mhcflurry + downloads dir. + + Parameters + ---------- + + test_exists : boolean, optional + Whether to raise an exception of the path does not exist + + Returns + ------- + string : absolute path + """ + if _MHCFLURRY_DEFAULT_CLASS1_CLEAVAGE_MODELS_DIR: + result = join( + get_downloads_dir(), + _MHCFLURRY_DEFAULT_CLASS1_CLEAVAGE_MODELS_DIR) + if test_exists and not exists(result): + raise IOError("No such directory: %s" % result) + return result + return get_path( + "models_class1_cleavage", "models", test_exists=test_exists) + + def get_current_release_downloads(): """ Return a dict of all available downloads in the current release. diff --git a/test/test_class1_cleavage_neural_network.py b/test/test_class1_cleavage_neural_network.py index 1ebd24d5..22ce5652 100644 --- a/test/test_class1_cleavage_neural_network.py +++ b/test/test_class1_cleavage_neural_network.py @@ -22,17 +22,20 @@ teardown = cleanup setup = startup -def test_basic(): - hyperparameters = dict() +def test_big(): + train_basic_network(num=100000) - num = 100000 +def test_small(): + train_basic_network(num=10000) + + +def train_basic_network(num, do_assertions=True, **hyperparameters): df = pandas.DataFrame({ "n_flank": random_peptides(num, 10), "c_flank": random_peptides(num, 10), "peptide": random_peptides(num, 9), }) - #df["hit"] = df.peptide.str.get(0).isin(["A", "I", "L"]) n_cleavage_regex = "[AILQSV].[MNPQYK]" @@ -75,3 +78,9 @@ def test_basic(): print("Train auc", train_auc) print("Test auc", test_auc) + + if do_assertions: + assert_greater(train_auc, 0.9) + assert_greater(test_auc, 0.85) + + return network diff --git a/test/test_class1_cleavage_predictor.py b/test/test_class1_cleavage_predictor.py new file mode 100644 index 00000000..1ade3f22 --- /dev/null +++ b/test/test_class1_cleavage_predictor.py @@ -0,0 +1,62 @@ +import logging +logging.getLogger('tensorflow').disabled = True +logging.getLogger('matplotlib').disabled = True + +import pandas +import tempfile +import pickle + +from numpy.testing import assert_, assert_equal, assert_allclose, assert_array_equal +from nose.tools import assert_greater, assert_less +import numpy + +from mhcflurry.class1_cleavage_predictor import Class1CleavagePredictor + +from mhcflurry.common import random_peptides + +from mhcflurry.testing_utils import cleanup, startup + +from .test_class1_cleavage_neural_network import train_basic_network + +AFFINITY_PREDICTOR = None + +def setup(): + pass + + +def teardown(): + pass + + +def test_basic(): + network = train_basic_network(num=10000, do_assertions=False, max_epochs=10) + predictor = Class1CleavagePredictor(models=[network]) + + num=10000 + df = pandas.DataFrame({ + "n_flank": random_peptides(num, 10), + "c_flank": random_peptides(num, 10), + "peptide": random_peptides(num, 9), + }) + df["score"] = predictor.predict(df.peptide, df.n_flank, df.c_flank) + + # Test saving and loading + models_dir = tempfile.mkdtemp("_models") + print(models_dir) + predictor.save(models_dir) + predictor2 = Class1CleavagePredictor.load(models_dir) + + df2 = predictor2.predict_to_dataframe( + peptides=df.peptide.values, + n_flanks=df.n_flank.values, + c_flanks=df.c_flank.values) + assert_array_equal(df.score.values, df2.score.values) + + # Test pickling + predictor3 = pickle.loads( + pickle.dumps(predictor, protocol=pickle.HIGHEST_PROTOCOL)) + df3 = predictor3.predict_to_dataframe( + peptides=df.peptide.values, + n_flanks=df.n_flank.values, + c_flanks=df.c_flank.values) + assert_array_equal(df.score.values, df3.score.values) -- GitLab