From 2e8dc426edc3565d7158ff01777434c4557ca2d4 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Thu, 14 Nov 2019 18:14:45 -0500
Subject: [PATCH] working on ligandome

---
 mhcflurry/class1_ligandome_predictor.py | 122 ++++++++++++-------
 mhcflurry/custom_loss.py                |  76 ++++++++++--
 test/test_class1_ligandome_predictor.py | 148 ++++++++++++------------
 3 files changed, 223 insertions(+), 123 deletions(-)

diff --git a/mhcflurry/class1_ligandome_predictor.py b/mhcflurry/class1_ligandome_predictor.py
index 0f7790ab..4d433e48 100644
--- a/mhcflurry/class1_ligandome_predictor.py
+++ b/mhcflurry/class1_ligandome_predictor.py
@@ -5,11 +5,17 @@
 import collections
 from functools import partial
 
 import numpy
+import pandas
 
 from .hyperparameters import HyperparameterDefaults
 from .class1_neural_network import Class1NeuralNetwork, DEFAULT_PREDICT_BATCH_SIZE
 from .encodable_sequences import EncodableSequences
-
+from .regression_target import from_ic50, to_ic50
+from .random_negative_peptides import RandomNegativePeptides
+from .custom_loss import (
+    MSEWithInequalities,
+    MSEWithInequalitiesAndMultipleOutputs,
+    MultiallelicMassSpecLoss)
 
 
 class Class1LigandomePredictor(object):
@@ -19,6 +25,8 @@ class Class1LigandomePredictor(object):
             'alignment_method': 'left_pad_centered_right_pad',
             'max_length': 15,
         },
+        additional_dense_layers=[],
+        additional_dense_activation="sigmoid",
     )
     """
     Hyperparameters (and their default values) that affect the neural network
@@ -29,9 +37,8 @@ class Class1LigandomePredictor(object):
         max_epochs=500,
         validation_split=0.1,
         early_stopping=True,
-        minibatch_size=128,
-        random_negative_rate=0.0,
-        random_negative_constant=0,
+        minibatch_size=128,).extend(
+        RandomNegativePeptides.hyperparameter_defaults
     )
     """
     Hyperparameters for neural network training.
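The hyperparameter blocks in this file are HyperparameterDefaults objects composed
with extend(), as the hunk above does with RandomNegativePeptides.hyperparameter_defaults.
A minimal sketch of that composition, assuming the extend()/with_defaults() semantics
of mhcflurry.hyperparameters (the keys here are invented):

    # Sketch of HyperparameterDefaults composition, with invented keys.
    # Assumes extend() merges two defaults sets and with_defaults() fills in
    # any keys the caller does not override.
    from mhcflurry.hyperparameters import HyperparameterDefaults

    network_defaults = HyperparameterDefaults(peptide_max_length=15)
    fit_defaults = HyperparameterDefaults(max_epochs=500, minibatch_size=128)
    combined = network_defaults.extend(fit_defaults)

    hyperparameters = combined.with_defaults({"max_epochs": 10})
    # -> {'peptide_max_length': 15, 'max_epochs': 10, 'minibatch_size': 128}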
+ """ + hyperparameter_defaults = network_hyperparameter_defaults.extend( fit_hyperparameter_defaults).extend( early_stopping_hyperparameter_defaults).extend( - compile_hyperparameter_defaults) + compile_hyperparameter_defaults).extend( + allele_features_hyperparameter_defaults) def __init__( self, @@ -87,8 +101,15 @@ class Class1LigandomePredictor(object): @staticmethod def make_network(pan_allele_class1_neural_networks, hyperparameters): import keras.backend as K - from keras.layers import Input, TimeDistributed, Lambda, Flatten, RepeatVector, concatenate, Dropout, Reshape, Embedding - from keras.activations import sigmoid + from keras.layers import ( + Input, + TimeDistributed, + Dense, + Flatten, + RepeatVector, + concatenate, + Reshape, + Embedding) from keras.models import Model networks = [model.network() for model in pan_allele_class1_neural_networks] @@ -163,39 +184,28 @@ class Class1LigandomePredictor(object): layer_name_to_new_node[layer.name] = node + affinity_predictor_output = node + + for (i, layer_size) in enumerate( + hyperparameters['additional_dense_layers']): + layer = Dense( + layer_size, + activation=hyperparameters['additional_dense_activation']) + lifted = TimeDistributed(layer) + node = lifted(node) + + layer = Dense(1, activation="sigmoid") + lifted = TimeDistributed(layer) + ligandome_output = lifted(node) + network = Model( inputs=[input_peptides, input_alleles], - outputs=node, + outputs=[affinity_predictor_output, ligandome_output], name="ligandome", ) network.summary() return network - @staticmethod - def loss(y_true, y_pred, sample_weight=None, delta=0.2, alpha=None): - """ - Loss function for ligandome prediction. - """ - import tensorflow as tf - - y_pred = tf.squeeze(y_pred, axis=-1) - y_true = tf.reshape(tf.cast(y_true, tf.bool), (-1,)) - - pos = tf.boolean_mask(y_pred, y_true) - - if alpha is None: - pos_max = tf.reduce_max(pos, axis=1) - else: - # Smooth maximum - exp_alpha_x = tf.exp(alpha * pos) - numerator = tf.reduce_sum(tf.multiply(pos, exp_alpha_x), axis=1) - denominator = tf.reduce_sum(exp_alpha_x, axis=1) - pos_max = numerator / denominator - neg = tf.boolean_mask(y_pred, tf.logical_not(y_true)) - result = tf.reduce_sum( - tf.maximum(0.0, tf.reshape(neg, (-1, 1)) - pos_max + delta) ** 2) - return result - def peptides_to_network_input(self, peptides): """ Encode peptides to the fixed-length encoding expected by the neural @@ -241,6 +251,8 @@ class Class1LigandomePredictor(object): peptides, labels, allele_encoding, + affinities_mask=None, # True when a peptide/label is actually a peptide and an affinity + inequalities=None, # interpreted only for elements where affinities_mask is True, otherwise ignored shuffle_permutation=None, verbose=1, progress_callback=None, @@ -275,14 +287,25 @@ class Class1LigandomePredictor(object): 'allele': allele_encoding_input, } + loss = MSEWithInequalitiesAndMultipleOutputs(losses=[ + MSEWithInequalities(), + MultiallelicMassSpecLoss( + delta=self.hyperparameters['loss_delta']), + ]) + + y_values_pre_encoding = labels.copy() + if affinities_mask is not None: + y_values_pre_encoding[affinities_mask] = from_ic50(labels) + y_values = loss.encode_y( + y_values_pre_encoding, + inequalities=inequalities[affinities_mask] if inequalities is not None else None, + output_indices=(~affinities_mask).astype(int) if affinities_mask is not None else numpy.ones(len(y_values_pre_encoding), dtype=int)) + fit_info = collections.defaultdict(list) self.set_allele_representations(allele_representations) self.network.compile( - 
@@ -241,6 +251,8 @@ class Class1LigandomePredictor(object):
             peptides,
             labels,
             allele_encoding,
+            affinities_mask=None,  # True where the label is an affinity (not a mass spec hit)
+            inequalities=None,  # interpreted only where affinities_mask is True; ignored elsewhere
             shuffle_permutation=None,
             verbose=1,
             progress_callback=None,
@@ -275,14 +287,25 @@ class Class1LigandomePredictor(object):
             'allele': allele_encoding_input,
         }
 
+        loss = MSEWithInequalitiesAndMultipleOutputs(losses=[
+            MSEWithInequalities(),
+            MultiallelicMassSpecLoss(
+                delta=self.hyperparameters['loss_delta']),
+        ])
+
+        y_values_pre_encoding = labels.copy()
+        if affinities_mask is not None:
+            y_values_pre_encoding[affinities_mask] = from_ic50(labels[affinities_mask])
+        y_values = loss.encode_y(
+            y_values_pre_encoding,
+            inequalities=inequalities[affinities_mask] if inequalities is not None else None,
+            output_indices=(~affinities_mask).astype(int) if affinities_mask is not None else numpy.ones(len(y_values_pre_encoding), dtype=int))
+
         fit_info = collections.defaultdict(list)
 
         self.set_allele_representations(allele_representations)
         self.network.compile(
-            loss=partial(
-                self.loss,
-                delta=self.hyperparameters['loss_delta'],
-                alpha=self.hyperparameters['loss_alpha']),
+            loss=loss.loss,
             optimizer=self.hyperparameters['optimizer'])
         if self.hyperparameters['learning_rate'] is not None:
             K.set_value(
@@ -371,6 +394,7 @@ class Class1LigandomePredictor(object):
             self,
             peptides,
             allele_encoding,
+            output="affinities",
             batch_size=DEFAULT_PREDICT_BATCH_SIZE):
         (allele_encoding_input, allele_representations) = (
             self.allele_encoding_to_network_input(allele_encoding.compact()))
@@ -380,12 +404,16 @@ class Class1LigandomePredictor(object):
             'allele': allele_encoding_input,
         }
         predictions = self.network.predict(x_dict, batch_size=batch_size)
+        if output == "affinities":
+            predictions = to_ic50(predictions[0])
+        elif output == "ligandome_presentation":
+            predictions = predictions[1]
+        elif output == "both":
+            pass
+        else:
+            raise NotImplementedError("Unknown output", output)
         return numpy.squeeze(predictions)
 
-    #def predict(self):
-
-
-
     def set_allele_representations(self, allele_representations):
         """
         """
@@ -440,3 +468,13 @@ class Class1LigandomePredictor(object):
             original_model.fit_generator = throw
 
         layer.set_weights([reshaped])
+
+    @staticmethod
+    def allele_features(allele_names, hyperparameters):
+        df = pandas.DataFrame({"allele_name": allele_names})
+        if hyperparameters['allele_features_include_gene']:
+            # TODO: support other organisms.
+            for gene in ["A", "B", "C"]:
+                df[gene] = df.allele_name.str.startswith(
+                    "HLA-%s" % gene).astype(float)
+        return df
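The allele_features helper added at the end of this file one-hot encodes the
HLA gene parsed from each allele name. A quick pandas illustration of the
features it produces (the alleles are examples):

    # Gene indicator features, as built by allele_features: one column per
    # HLA gene, 1.0 where the allele name starts with that gene.
    import pandas

    df = pandas.DataFrame(
        {"allele_name": ["HLA-A*02:01", "HLA-B*57:01", "HLA-C*07:02"]})
    for gene in ["A", "B", "C"]:
        df[gene] = df.allele_name.str.startswith("HLA-%s" % gene).astype(float)
    print(df)
    #    allele_name    A    B    C
    # 0  HLA-A*02:01  1.0  0.0  0.0
    # 1  HLA-B*57:01  0.0  1.0  0.0
    # 2  HLA-C*07:02  0.0  0.0  1.0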
diff --git a/mhcflurry/custom_loss.py b/mhcflurry/custom_loss.py
index 8c523c0a..2b520423 100644
--- a/mhcflurry/custom_loss.py
+++ b/mhcflurry/custom_loss.py
@@ -182,8 +182,10 @@ class MSEWithInequalitiesAndMultipleOutputs(Loss):
     supports_inequalities = True
     supports_multiple_outputs = True
 
-    @staticmethod
-    def encode_y(y, inequalities=None, output_indices=None):
+    def __init__(self, losses=MSEWithInequalities()):
+        self.losses = losses
+
+    def encode_y(self, y, inequalities=None, output_indices=None):
         y = array(y, dtype="float32")
         if isnan(y).any():
             raise ValueError("y contains NaN", y)
@@ -192,8 +194,25 @@ class MSEWithInequalitiesAndMultipleOutputs(Loss):
         if (y < 0.0).any():
             raise ValueError("y contains values < 0.0", y)
 
-        encoded = MSEWithInequalities.encode_y(
-            y, inequalities=inequalities)
+        if isinstance(self.losses, Loss):
+            # Single loss applied to all outputs.
+            encoded = self.losses.encode_y(y, inequalities=inequalities)
+        else:
+            assert output_indices is not None
+            df = pandas.DataFrame({
+                "y": y,
+                "inequality": inequalities,
+                "output_index": output_indices,
+            })
+            encoded = y.copy()
+            encoded[:] = numpy.nan
+            for (output_index, sub_df) in df.groupby("output_index"):
+                loss = self.losses[output_index]
+                loss_kwargs = {}
+                if not sub_df.inequality.isnull().all():
+                    loss_kwargs['inequalities'] = sub_df.inequality.values
+                encoded[sub_df.index.values] = loss.encode_y(
+                    sub_df.y.values, **loss_kwargs)
 
         if output_indices is not None:
             output_indices = numpy.array(output_indices)
@@ -205,8 +224,7 @@ class MSEWithInequalitiesAndMultipleOutputs(Loss):
 
         return encoded
 
-    @staticmethod
-    def loss(y_true, y_pred):
+    def loss(self, y_true, y_pred):
         from keras import backend as K
 
         y_true = K.flatten(y_true)
@@ -230,7 +248,50 @@ class MSEWithInequalitiesAndMultipleOutputs(Loss):
         #    ], axis=-1)
         #updated_y_pred = tf.gather_nd(y_pred, indexer)
 
-        return MSEWithInequalities.loss(updated_y_true, updated_y_pred)
+        if isinstance(self.losses, Loss):
+            # Single loss for all outputs.
+            return self.losses.loss(updated_y_true, updated_y_pred)
+        else:
+            # TODO: make this more efficient?
+            result = None
+            for (i, loss) in enumerate(self.losses):
+                values = (
+                    loss.loss(updated_y_true, updated_y_pred) *
+                    K.cast(K.equal(output_indices, i), "float32"))
+                if result is None:
+                    result = values
+                else:
+                    result += values
+            return result
+
+
+class MultiallelicMassSpecLoss(Loss):
+    """
+    Hinge-like loss over hits and decoys for multiallelic mass spec data.
+    """
+    name = "multiallelic_mass_spec_loss"
+    supports_inequalities = True
+    supports_multiple_outputs = False
+
+    def __init__(self, delta=0.2):
+        self.delta = delta
+
+    def encode_y(self, y):
+        return y
+
+    def loss(self, y_true, y_pred):
+        import tensorflow as tf
+
+        #y_pred = tf.squeeze(y_pred, axis=-1)
+        y_true = tf.reshape(tf.cast(y_true, tf.bool), (-1,))
+
+        pos = tf.boolean_mask(y_pred, y_true)
+        pos_max = tf.reduce_max(pos, axis=1)
+        neg = tf.boolean_mask(y_pred, tf.logical_not(y_true))
+        result = tf.reduce_sum(
+            tf.maximum(
+                0.0, tf.reshape(neg, (-1, 1)) - pos_max + self.delta) ** 2)
+        return result
 
 
 def check_shape(name, arr, expected_shape):
@@ -250,5 +310,5 @@ def check_shape(name, arr, expected_shape):
 
 
 # Register custom losses.
-for cls in [MSEWithInequalities, MSEWithInequalitiesAndMultipleOutputs]:
+for cls in [MSEWithInequalities, MSEWithInequalitiesAndMultipleOutputs, MultiallelicMassSpecLoss]:
     CUSTOM_LOSSES[cls.name] = cls()
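Taken together, these loss classes let fit() pack affinity measurements and
mass spec hit labels into one y vector, with output_indices routing each row
to the matching loss. A small sketch of that packing, using the classes above
with invented values:

    # Three examples: one affinity measurement (routed to output 0) and a
    # mass spec hit/decoy pair (routed to output 1). Mirrors the encoding in
    # Class1LigandomePredictor.fit; the data is made up.
    import numpy
    from mhcflurry.custom_loss import (
        MSEWithInequalities,
        MSEWithInequalitiesAndMultipleOutputs,
        MultiallelicMassSpecLoss)
    from mhcflurry.regression_target import from_ic50

    loss = MSEWithInequalitiesAndMultipleOutputs(losses=[
        MSEWithInequalities(),
        MultiallelicMassSpecLoss(delta=0.2),
    ])

    labels = numpy.array([100.0, 1.0, 0.0])  # nM affinity, hit, decoy
    affinities_mask = numpy.array([True, False, False])

    y_values = labels.copy()
    y_values[affinities_mask] = from_ic50(labels[affinities_mask])

    encoded = loss.encode_y(
        y_values,
        inequalities=["<", None, None],  # read only for the affinity row
        output_indices=(~affinities_mask).astype(int))
    # encoded now carries both the inequality information and the output
    # routing, which loss.loss() decodes at train time.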
diff --git a/test/test_class1_ligandome_predictor.py b/test/test_class1_ligandome_predictor.py
index 5e0daf26..b8348192 100644
--- a/test/test_class1_ligandome_predictor.py
+++ b/test/test_class1_ligandome_predictor.py
@@ -38,6 +38,7 @@ from mhcflurry.regression_target import from_ic50
 from mhcflurry.common import random_peptides, positional_frequency_matrix
 from mhcflurry.testing_utils import cleanup, startup
 from mhcflurry.amino_acid import COMMON_AMINO_ACIDS
+from mhcflurry.custom_loss import MultiallelicMassSpecLoss
 
 COMMON_AMINO_ACIDS = sorted(COMMON_AMINO_ACIDS)
 
@@ -110,73 +111,66 @@ def evaluate_loss(loss, y_true, y_pred):
 
 def test_loss():
     for delta in [0.0, 0.3]:
-        for alpha in [None, 1.0, 20.0]:
-            print("delta", delta)
-            print("alpha", alpha)
-            # Hit labels
-            y_true = [
-                1.0,
-                0.0,
-                1.0,
-                1.0,
-                0.0
-            ]
-            y_true = numpy.array(y_true)
-            y_pred = [
-                [0.3, 0.7, 0.5],
-                [0.2, 0.4, 0.6],
-                [0.1, 0.5, 0.3],
-                [0.1, 0.7, 0.1],
-                [0.8, 0.2, 0.4],
-            ]
-            y_pred = numpy.array(y_pred)
-
-            # reference implementation 1
-
-            def smooth_max(x, alpha):
-                x = numpy.array(x)
-                alpha = numpy.array([alpha])
-                return (x * numpy.exp(x * alpha)).sum() / (
-                    numpy.exp(x * alpha)).sum()
-
-            if alpha is None:
-                max_func = max
-            else:
-                max_func = partial(smooth_max, alpha=alpha)
-
-            contributions = []
-            for i in range(len(y_true)):
-                if y_true[i] == 1.0:
-                    for j in range(len(y_true)):
-                        if y_true[j] == 0.0:
-                            tightest_i = max_func(y_pred[i])
-                            contribution = sum(
-                                max(0, y_pred[j, k] - tightest_i + delta)**2
-                                for k in range(y_pred.shape[1])
-                            )
-                            contributions.append(contribution)
-            contributions = numpy.array(contributions)
-            expected1 = contributions.sum()
-
-            # reference implementation 2: numpy
-            pos = numpy.array([
-                max_func(y_pred[i])
-                for i in range(len(y_pred))
-                if y_true[i] == 1.0
-            ])
-
-            neg = y_pred[~y_true.astype(bool)]
-            expected2 = (
-                numpy.maximum(0, neg.reshape((-1, 1)) - pos + delta)**2).sum()
-
-            yield numpy.testing.assert_almost_equal, expected1, expected2, 4
-
-            computed = evaluate_loss(
-                partial(Class1LigandomePredictor.loss, delta=delta, alpha=alpha),
-                y_true,
-                y_pred.reshape(y_pred.shape + (1,)))
-
-            yield numpy.testing.assert_almost_equal, computed, expected1, 4
+        print("delta", delta)
+        # Hit labels
+        y_true = [
+            1.0,
+            0.0,
+            1.0,
+            1.0,
+            0.0
+        ]
+        y_true = numpy.array(y_true)
+        y_pred = [
+            [0.3, 0.7, 0.5],
+            [0.2, 0.4, 0.6],
+            [0.1, 0.5, 0.3],
+            [0.1, 0.7, 0.1],
+            [0.8, 0.2, 0.4],
+        ]
+        y_pred = numpy.array(y_pred)
+
+        # reference implementation 1
+
+        def smooth_max(x, alpha):
+            x = numpy.array(x)
+            alpha = numpy.array([alpha])
+            return (x * numpy.exp(x * alpha)).sum() / (
+                numpy.exp(x * alpha)).sum()
+
+        contributions = []
+        for i in range(len(y_true)):
+            if y_true[i] == 1.0:
+                for j in range(len(y_true)):
+                    if y_true[j] == 0.0:
+                        tightest_i = max(y_pred[i])
+                        contribution = sum(
+                            max(0, y_pred[j, k] - tightest_i + delta)**2
+                            for k in range(y_pred.shape[1])
+                        )
+                        contributions.append(contribution)
+        contributions = numpy.array(contributions)
+        expected1 = contributions.sum()
+
+        # reference implementation 2: numpy
+        pos = numpy.array([
+            max(y_pred[i])
+            for i in range(len(y_pred))
+            if y_true[i] == 1.0
+        ])
+
+        neg = y_pred[~y_true.astype(bool)]
+        expected2 = (
+            numpy.maximum(0, neg.reshape((-1, 1)) - pos + delta)**2).sum()
+
+        yield numpy.testing.assert_almost_equal, expected1, expected2, 4
+
+        computed = evaluate_loss(
+            MultiallelicMassSpecLoss(delta=delta).loss,
+            y_true,
+            y_pred.reshape(y_pred.shape + (1,)))
+
+        yield numpy.testing.assert_almost_equal, computed, expected1, 4
 
 
 AA_DIST = pandas.Series(
@@ -284,6 +278,7 @@ def test_synthetic_allele_refinement(max_epochs=10):
 
     predictor = Class1LigandomePredictor(
         PAN_ALLELE_PREDICTOR_NO_MASS_SPEC,
+        additional_dense_layers=[8, 1],
         max_ensemble_size=1,
         max_epochs=max_epochs,
         learning_rate=0.0001,
@@ -299,9 +294,11 @@ def test_synthetic_allele_refinement(max_epochs=10):
         allele_to_sequence=PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.allele_to_sequence,
     ).compact()
 
-    pre_predictions = predictor.predict(
-        peptides=train_df.peptide.values,
-        allele_encoding=allele_encoding)
+    pre_predictions = from_ic50(
+        predictor.predict(
+            output="affinities",
+            peptides=train_df.peptide.values,
+            allele_encoding=allele_encoding))
 
     (model,) = PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.class1_pan_allele_models
     expected_pre_predictions = from_ic50(
@@ -310,11 +307,13 @@ def test_synthetic_allele_refinement(max_epochs=10):
             allele_encoding=allele_encoding.allele_encoding,
         )).reshape((-1, len(alleles)))
 
+    #import ipdb ; ipdb.set_trace()
+
     train_df["pre_max_prediction"] = pre_predictions.max(1)
     pre_auc = roc_auc_score(train_df.hit.values, train_df.pre_max_prediction.values)
     print("PRE_AUC", pre_auc)
 
-    assert_allclose(pre_predictions, expected_pre_predictions)
+    assert_allclose(pre_predictions, expected_pre_predictions, rtol=1e-4)
 
     motifs_history = []
     random_peptides_encodable = make_random_peptides(10000, [9])
@@ -328,8 +327,11 @@ def test_synthetic_allele_refinement(max_epochs=10):
     metric_rows = []
 
     def progress():
-        predictions = predictor.predict(peptides=train_df.peptide.values,
-            allele_encoding=allele_encoding, )
+        predictions = from_ic50(
+            predictor.predict(
+                output="affinities",
+                peptides=train_df.peptide.values,
+                allele_encoding=allele_encoding))
 
         train_df["max_prediction"] = predictions.max(1)
         train_df["predicted_allele"] = pandas.Series(alleles).loc[
--
GitLab
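For reference, the quantity exercised by test_loss above can be written as one
compact NumPy function, equivalent to the test's two reference implementations:

    # Compact NumPy statement of MultiallelicMassSpecLoss: take each hit's
    # best score across alleles; every decoy allele score within `delta` of
    # a hit's best pays a squared hinge penalty.
    import numpy

    def reference_loss(y_true, y_pred, delta=0.2):
        y_true = numpy.asarray(y_true, dtype=bool)  # True = mass spec hit
        y_pred = numpy.asarray(y_pred)              # (samples, alleles)
        pos_max = y_pred[y_true].max(axis=1)        # best score per hit
        neg = y_pred[~y_true].reshape((-1, 1))      # every decoy score
        return (numpy.maximum(0.0, neg - pos_max + delta) ** 2).sum()

    y_true = [1.0, 0.0, 1.0, 1.0, 0.0]
    y_pred = [
        [0.3, 0.7, 0.5],
        [0.2, 0.4, 0.6],
        [0.1, 0.5, 0.3],
        [0.1, 0.7, 0.1],
        [0.8, 0.2, 0.4],
    ]
    print(reference_loss(y_true, y_pred, delta=0.3))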