diff --git a/mhcflurry/class1_cleavage_neural_network.py b/mhcflurry/class1_cleavage_neural_network.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6f2077d7ebab51901123f24bd8dc11a9bae0281
--- /dev/null
+++ b/mhcflurry/class1_cleavage_neural_network.py
@@ -0,0 +1,443 @@
+"""
+Idea:
+
+Fully convolutional network with sigmoid output. Let it take a 35mer:
+- N flank [10 aa]
+- peptide [7-15 aa]
+- C flank [10 aa]
+
+Train on monoallelic mass spec. Match positive examples (hits) to negatives
+(decoys) from the same sample by finding unobserved peptides with as close as
+possible a match for predicted binding affinity.
+
+In the final layer, take the max cleavage score over the peptide and the
+individual score for the position right before the peptide terminus, and
+compute the ratio of these (or its inverse). Hits get label 1, decoys get 0.
+
+For a hit with sequence
+
+NNNNNNNNNNPPPPPPPPPCCCCCCCCCC
+
+penalize on:
+
+[----------100000000----------]
+
+For a decoy with the same sequence, penalize it on:
+
+[----------0------------------]
+
+Train separate models for N- and C-terminal cleavage.
+
+Issue:
+- it will learn mass spec biases in the peptide
+
+Possible fix:
+- Also provide the amino acid counts of the peptide as auxiliary inputs. After
+training, set the cysteine value to 0.
+
+Architecture (for N-terminal; for C-terminal, reverse the sequences):
+
+input of length S=25 [flank + left-aligned peptide]
+*** conv [vector of length S] ***
+*** [more convs and local pools] ***
+*** output [vector of length S] ***
+*** extract: position 10 and max of peptide positions [2-vector] ***
+*** concat: [position 10, max of peptide positions,
+    number of A (alanine), ..., number of Y (tyrosine) in peptide] ***
+*** single dense node, sigmoid activation [1-vector] ***
+
+Train on monoallelic.
+
+Decoys are length-matched to hits and sampled from the same transcripts,
+selecting an unobserved peptide with a predicted affinity as close as
+possible to the hit's.
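+
+A minimal sketch of this affinity-matched decoy selection (hypothetical
+helper, not part of this module; assumes DataFrames with an "affinity"
+column of predicted affinities, and ignores the length- and
+transcript-matching described above):
+
+    import pandas
+
+    def sample_matched_decoys(hits_df, candidates_df):
+        # For each hit, take the not-yet-used candidate with the closest
+        # predicted affinity (sampling without replacement).
+        available = candidates_df.reset_index(drop=True)
+        decoys = []
+        for affinity in hits_df.affinity:
+            index = (available.affinity - affinity).abs().idxmin()
+            decoys.append(available.loc[index])
+            available = available.drop(index)
+        return pandas.DataFrame(decoys).reset_index(drop=True)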
+ """ + + fit_hyperparameter_defaults = HyperparameterDefaults( + max_epochs=500, + validation_split=0.1, + early_stopping=True + ) + """ + Hyperparameters for neural network training. + """ + + early_stopping_hyperparameter_defaults = HyperparameterDefaults( + patience=5, + min_delta=0.0, + ) + """ + Hyperparameters for early stopping. + """ + + compile_hyperparameter_defaults = HyperparameterDefaults( + optimizer="rmsprop", + learning_rate=None, + ) + """ + Loss and optimizer hyperparameters. Any values supported by keras may be + used. + """ + + auxiliary_input_hyperparameter_defaults = HyperparameterDefaults( + ) + """ + Allele feature hyperparameters. + """ + + hyperparameter_defaults = network_hyperparameter_defaults.extend( + fit_hyperparameter_defaults).extend( + early_stopping_hyperparameter_defaults).extend( + compile_hyperparameter_defaults).extend( + auxiliary_input_hyperparameter_defaults) + + def __init__(self, **hyperparameters): + self.hyperparameters = self.hyperparameter_defaults.with_defaults( + hyperparameters) + self.network = None + self.fit_info = [] + + def fit( + self, + peptides, + n_flanks, + c_flanks, + targets, + sample_weights=None, + shuffle_permutation=None, + verbose=1, + progress_callback=None, + progress_preamble="", + progress_print_interval=5.0): + """ + + + Parameters + ---------- + peptides + n_flanks + c_flanks + targets : array of {0, 1} indicating hits (1) or decoys (0) + + Returns + ------- + + """ + import keras.backend as K + + peptides = EncodableSequences.create(peptides) + n_flanks = EncodableSequences.create(n_flanks) + c_flanks = EncodableSequences.create(c_flanks) + + x_list = self.peptides_and_flanking_to_network_input( + peptides, n_flanks, c_flanks) + + # Shuffle + if shuffle_permutation is None: + shuffle_permutation = numpy.random.permutation(len(targets)) + targets = targets[shuffle_permutation] + assert numpy.isnan(targets).sum() == 0, targets + if sample_weights is not None: + sample_weights = numpy.array(sample_weights)[shuffle_permutation] + x_list = [x[shuffle_permutation] for x in x_list] + + fit_info = collections.defaultdict(list) + + if self.network is None: + self.network = self.make_network( + **self.network_hyperparameter_defaults.subselect( + self.hyperparameters)) + if verbose > 0: + self.network.summary() + + self.network.compile( + loss="binary_crossentropy", + optimizer=self.hyperparameters['optimizer']) + + last_progress_print = None + min_val_loss_iteration = None + min_val_loss = None + start = time.time() + for i in range(self.hyperparameters['max_epochs']): + epoch_start = time.time() + fit_history = self.network.fit( + x_list, + targets, + validation_split=self.hyperparameters['validation_split'], + shuffle=True, + epochs=i + 1, + sample_weight=sample_weights, + initial_epoch=i, + verbose=verbose) + epoch_time = time.time() - epoch_start + + for (key, value) in fit_history.history.items(): + fit_info[key].extend(value) + + # Print progress no more often than once every few seconds. + if progress_print_interval is not None and ( + not last_progress_print or ( + time.time() - last_progress_print + > progress_print_interval)): + print((progress_preamble + " " + + "Epoch %3d / %3d [%0.2f sec]: loss=%g. 
" + "Min val loss (%s) at epoch %s" % ( + i, + self.hyperparameters['max_epochs'], + epoch_time, + fit_info['loss'][-1], + str(min_val_loss), + min_val_loss_iteration)).strip()) + last_progress_print = time.time() + + if self.hyperparameters['validation_split']: + val_loss = fit_info['val_loss'][-1] + + if min_val_loss is None or ( + val_loss < min_val_loss - self.hyperparameters['min_delta']): + min_val_loss = val_loss + min_val_loss_iteration = i + + if self.hyperparameters['early_stopping']: + threshold = ( + min_val_loss_iteration + + self.hyperparameters['patience']) + if i > threshold: + if progress_print_interval is not None: + print((progress_preamble + " " + + "Stopping at epoch %3d / %3d: loss=%g. " + "Min val loss (%g) at epoch %s" % ( + i, + self.hyperparameters['max_epochs'], + fit_info['loss'][-1], + ( + min_val_loss if min_val_loss is not None + else numpy.nan), + min_val_loss_iteration)).strip()) + break + + if progress_callback: + progress_callback() + + fit_info["time"] = time.time() - start + fit_info["num_points"] = len(peptides) + self.fit_info.append(dict(fit_info)) + + def predict( + self, + peptides, + n_flanks, + c_flanks, + batch_size=DEFAULT_PREDICT_BATCH_SIZE): + """ + """ + x_list = self.peptides_and_flanking_to_network_input( + peptides, n_flanks, c_flanks) + raw_predictions = self.network.predict( + x_list, batch_size=batch_size) + predictions = numpy.array(raw_predictions, dtype="float64") + return predictions + + def peptides_and_flanking_to_network_input(self, peptides, n_flanks, c_flanks): + """ + Encode peptides to the fixed-length encoding expected by the neural + network (which depends on the architecture). + + Parameters + ---------- + peptides : EncodableSequences or list of string + + Returns + ------- + numpy.array + """ + peptides = EncodableSequences.create(peptides) + n_flanks = EncodableSequences.create(n_flanks) + c_flanks = EncodableSequences.create(c_flanks) + + peptide_encoded1 = peptides.variable_length_to_fixed_length_vector_encoding( + vector_encoding_name=self.hyperparameters['vector_encoding_name'], + max_length=self.hyperparameters['peptide_max_length'], + alignment_method='right_pad') + peptide_encoded2 = peptides.variable_length_to_fixed_length_vector_encoding( + vector_encoding_name=self.hyperparameters['vector_encoding_name'], + max_length=self.hyperparameters['peptide_max_length'], + alignment_method='left_pad') + n_flanks_encoded = n_flanks.variable_length_to_fixed_length_vector_encoding( + vector_encoding_name=self.hyperparameters['vector_encoding_name'], + max_length=self.hyperparameters['n_flank_length'], + alignment_method='right_pad') + c_flanks_encoded = c_flanks.variable_length_to_fixed_length_vector_encoding( + vector_encoding_name=self.hyperparameters['vector_encoding_name'], + max_length=self.hyperparameters['c_flank_length'], + alignment_method='left_pad') + + return [ + peptide_encoded1, + peptide_encoded2, + n_flanks_encoded, + c_flanks_encoded + ] + + + def make_network( + self, + amino_acid_encoding, + peptide_max_length, + n_flank_length, + c_flank_length, + vector_encoding_name): + """ + Helper function to make a keras network + """ + + # We import keras here to avoid tensorflow debug output, etc. unless we + # are actually about to use Keras. 
+        """
+        peptides = EncodableSequences.create(peptides)
+        n_flanks = EncodableSequences.create(n_flanks)
+        c_flanks = EncodableSequences.create(c_flanks)
+
+        peptide_encoded1 = peptides.variable_length_to_fixed_length_vector_encoding(
+            vector_encoding_name=self.hyperparameters['vector_encoding_name'],
+            max_length=self.hyperparameters['peptide_max_length'],
+            alignment_method='right_pad')
+        peptide_encoded2 = peptides.variable_length_to_fixed_length_vector_encoding(
+            vector_encoding_name=self.hyperparameters['vector_encoding_name'],
+            max_length=self.hyperparameters['peptide_max_length'],
+            alignment_method='left_pad')
+        n_flanks_encoded = n_flanks.variable_length_to_fixed_length_vector_encoding(
+            vector_encoding_name=self.hyperparameters['vector_encoding_name'],
+            max_length=self.hyperparameters['n_flank_length'],
+            alignment_method='right_pad')
+        c_flanks_encoded = c_flanks.variable_length_to_fixed_length_vector_encoding(
+            vector_encoding_name=self.hyperparameters['vector_encoding_name'],
+            max_length=self.hyperparameters['c_flank_length'],
+            alignment_method='left_pad')
+
+        return [
+            peptide_encoded1,
+            peptide_encoded2,
+            n_flanks_encoded,
+            c_flanks_encoded,
+        ]
+
+    def make_network(
+            self,
+            amino_acid_encoding,
+            peptide_max_length,
+            n_flank_length,
+            c_flank_length,
+            vector_encoding_name):
+        """
+        Helper function to make a keras network given the architecture
+        hyperparameters.
+        """
+
+        # We import keras here to avoid tensorflow debug output, etc. unless
+        # we are actually about to use Keras.
+        from keras.layers import Input
+        import keras.layers
+        import keras.layers.pooling
+        from keras.layers.core import Dense
+        from keras.layers.merge import Concatenate
+
+        # Encode an empty batch to determine the input shapes.
+        (peptides_empty, _, n_flanks_empty, c_flanks_empty) = (
+            self.peptides_and_flanking_to_network_input(
+                peptides=[],
+                n_flanks=[],
+                c_flanks=[]))
+
+        peptide_input1 = Input(
+            shape=peptides_empty.shape[1:],
+            dtype='float32',
+            name='peptide1')
+        peptide_input2 = Input(
+            shape=peptides_empty.shape[1:],
+            dtype='float32',
+            name='peptide2')
+        n_flank_input = Input(
+            shape=n_flanks_empty.shape[1:],
+            dtype='float32',
+            name='n_flank')
+        c_flank_input = Input(
+            shape=c_flanks_empty.shape[1:],
+            dtype='float32',
+            name='c_flank')
+
+        inputs = [peptide_input1, peptide_input2, n_flank_input, c_flank_input]
+
+        # One convolutional sub-network over [n_flank, peptide] for
+        # N-terminal cleavage, and one over [peptide, c_flank] for C-terminal
+        # cleavage.
+        sub_networks = []
+        for input_pair in [
+                (n_flank_input, peptide_input1),
+                (peptide_input2, c_flank_input)]:
+            # Stack flank and peptide into a single sequence.
+            current_layer = Concatenate(axis=1)(list(input_pair))
+            for i in range(1):
+                current_layer = keras.layers.Conv1D(
+                    filters=int(16 / 2**i),
+                    kernel_size=8,
+                    activation="tanh")(current_layer)
+                #current_layer = keras.layers.pooling.MaxPooling1D(
+                #    pool_size=4)(current_layer)
+            sub_networks.append(current_layer)
+
+        # With "valid" convolutions, output position i corresponds to the
+        # window starting at input position i. For each sub-network, extract
+        # the score at the position of the relevant peptide terminus and the
+        # max score over the remaining peptide positions.
+        extracted_layers = []
+        extracted_layers.append(
+            keras.layers.Lambda(lambda x: x[:, n_flank_length])(
+                sub_networks[0]))
+        peptide_n_cleavage = keras.layers.Lambda(
+            lambda x: x[:, (n_flank_length + 1):])(sub_networks[0])
+        extracted_layers.append(
+            keras.layers.pooling.GlobalMaxPooling1D()(peptide_n_cleavage))
+        extracted_layers.append(
+            keras.layers.Lambda(lambda x: x[:, peptide_max_length])(
+                sub_networks[1]))
+        peptide_c_cleavage = keras.layers.Lambda(
+            lambda x: x[:, 0:peptide_max_length])(sub_networks[1])
+        extracted_layers.append(
+            keras.layers.pooling.GlobalMaxPooling1D()(peptide_c_cleavage))
+
+        current_layer = Concatenate()(extracted_layers)
+
+        output = Dense(
+            1,
+            activation="sigmoid",
+            name="output")(current_layer)
+        model = keras.models.Model(
+            inputs=inputs,
+            outputs=[output],
+            name="predictor")
+
+        return model
diff --git a/mhcflurry/encodable_sequences.py b/mhcflurry/encodable_sequences.py
index 19696e23b9f901811e0fb01b9f16f58a7c0ac613..9cd04e0df1cc9b109d1d754e182ef397fdb6b24e 100644
--- a/mhcflurry/encodable_sequences.py
+++ b/mhcflurry/encodable_sequences.py
@@ -382,7 +382,48 @@ class EncodableSequences(object):
                     sub_df.index, center_left_offset : center_left_offset + length
                 ] = fixed_length_sequences
+        elif alignment_method in ("right_pad", "left_pad"):
+            # We arbitrarily set a minimum length of 5, although this
+            # encoding could handle smaller peptides.
+            min_length = 5
+
+            # Result array is int32, filled with the index of X (the null
+            # amino acid).
+            result = numpy.full(
+                fill_value=amino_acid.AMINO_ACID_INDEX['X'],
+                shape=(len(sequences), max_length),
+                dtype="int32")
+
+            df = pandas.DataFrame({"peptide": sequences}, dtype=numpy.object_)
+
+            # For efficiency we handle each supported peptide length using
+            # bulk array operations.
+            for (length, sub_df) in df.groupby(df.peptide.str.len()):
+                if length < min_length or length > max_length:
+                    raise EncodingError(
+                        "Sequence '%s' (length %d) unsupported. There are %d "
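+                # Example (illustrative): a length-5 peptide "ACDEF" with
+                # max_length=8 is stored as the index encoding of
+                # "ACDEFXXX" (right_pad) or "XXXACDEF" (left_pad).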
+                        "total peptides with this length." % (
+                            sub_df.iloc[0].peptide,
+                            length,
+                            len(sub_df)), supported_peptide_lengths=(
+                            min_length, max_length))
+
+                # Array of shape (num peptides, length) giving the amino
+                # acid index encoding of each peptide of the current length.
+                fixed_length_sequences = numpy.stack(sub_df.peptide.map(
+                    lambda s: numpy.array([
+                        amino_acid.AMINO_ACID_INDEX[char] for char in s
+                    ])).values)
+
+                if alignment_method == "right_pad":
+                    # Left-align (i.e. pad on the right): set the left edge.
+                    result[sub_df.index, :length] = fixed_length_sequences
+                else:
+                    # Right-align (i.e. pad on the left): set the right edge.
+                    result[sub_df.index, -length:] = fixed_length_sequences
+
         else:
             raise NotImplementedError(
                 "Unsupported alignment method: %s" % alignment_method)
+
+        return result
diff --git a/test/test_class1_cleavage_neural_network.py b/test/test_class1_cleavage_neural_network.py
new file mode 100644
index 0000000000000000000000000000000000000000..051a48c30503e80376de84805252dd90fb28b043
--- /dev/null
+++ b/test/test_class1_cleavage_neural_network.py
@@ -0,0 +1,65 @@
+import logging
+logging.getLogger('tensorflow').disabled = True
+logging.getLogger('matplotlib').disabled = True
+
+import numpy
+numpy.random.seed(0)
+from tensorflow import set_random_seed
+set_random_seed(2)
+
+from sklearn.metrics import roc_auc_score
+
+from nose.tools import assert_greater
+
+import pandas
+
+from mhcflurry.class1_cleavage_neural_network import Class1CleavageNeuralNetwork
+from mhcflurry.common import random_peptides
+
+from mhcflurry.testing_utils import cleanup, startup
+teardown = cleanup
+setup = startup
+
+
+def test_basic():
+    hyperparameters = dict()
+
+    num = 10000
+
+    df = pandas.DataFrame({
+        "n_flank": random_peptides(num, 10),
+        "c_flank": random_peptides(num, 10),
+        "peptide": random_peptides(num, 9),
+    })
+    # Hits are defined by a simple rule the network should be able to learn:
+    # the first peptide residue is A, I, or L.
+    df["hit"] = df.peptide.str.get(0).isin(["A", "I", "L"])
+
+    train_df = df.sample(frac=0.1)
+    test_df = df.loc[~df.index.isin(train_df.index)]
+
+    print(
+        "Generated dataset",
+        len(df),
+        "hits:",
+        df.hit.sum(),
+        "frac:",
+        df.hit.mean())
+
+    network = Class1CleavageNeuralNetwork(**hyperparameters)
+    network.fit(
+        train_df.peptide.values,
+        train_df.n_flank.values,
+        train_df.c_flank.values,
+        train_df.hit.values)
+
+    for df in [train_df, test_df]:
+        df["predictions"] = network.predict(
+            df.peptide.values,
+            df.n_flank.values,
+            df.c_flank.values)
+
+    train_auc = roc_auc_score(train_df.hit.values, train_df.predictions.values)
+    test_auc = roc_auc_score(test_df.hit.values, test_df.predictions.values)
+
+    print("Train auc", train_auc)
+    print("Test auc", test_auc)
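+
+    # The hit rule above is trivially learnable, so both AUCs should be near
+    # 1.0. The thresholds here are loose, assumed values.
+    assert_greater(train_auc, 0.9)
+    assert_greater(test_auc, 0.9)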