diff --git a/mhcflurry/class1_neural_network.py b/mhcflurry/class1_neural_network.py
index 7667da157b04f48a1e2786126dfe82842f404359..fa4f4e41000b85444680e9644f1f60f3972a69d6 100644
--- a/mhcflurry/class1_neural_network.py
+++ b/mhcflurry/class1_neural_network.py
@@ -55,6 +55,7 @@ class Class1NeuralNetwork(object):
                 "kernel_size": 3
             }
         ],
+        num_outputs=1,
    )
    """
    Hyperparameters (and their default values) that affect the neural network
@@ -424,6 +425,7 @@ class Class1NeuralNetwork(object):
            affinities,
            allele_encoding=None,
            inequalities=None,
+            output_indices=None,
            sample_weights=None,
            shuffle_permutation=None,
            verbose=1,
@@ -536,7 +538,7 @@ class Class1NeuralNetwork(object):
            sample_weights = sample_weights[shuffle_permutation]

        if self.hyperparameters['loss'].startswith("custom:"):
-            # Using a custom loss that supports inequalities
+            # Using a custom loss
            try:
                custom_loss = CUSTOM_LOSSES[
                    self.hyperparameters['loss'].replace("custom:", "")
@@ -550,11 +552,13 @@ class Class1NeuralNetwork(object):
                ])))
            loss_name_or_function = custom_loss.loss
            loss_supports_inequalities = custom_loss.supports_inequalities
+            loss_supports_multiple_outputs = custom_loss.supports_multiple_outputs
            loss_encode_y_function = custom_loss.encode_y
        else:
-            # Using a regular keras loss. No inequalities supported.
+            # Using a regular keras loss.
            loss_name_or_function = self.hyperparameters['loss']
            loss_supports_inequalities = False
+            loss_supports_multiple_outputs = False
            loss_encode_y_function = None

        if not loss_supports_inequalities and (
@@ -562,6 +566,18 @@ class Class1NeuralNetwork(object):
            raise ValueError("Loss %s does not support inequalities" % (
                loss_name_or_function))

+        if (
+                not loss_supports_multiple_outputs and
+                output_indices is not None and
+                (output_indices != 0).any()):
+            raise ValueError("Loss %s does not support multiple outputs" % (
+                loss_name_or_function))
+
+        if self.hyperparameters['num_outputs'] != 1:
+            if output_indices is None:
+                raise ValueError(
+                    "Must supply output_indices for multi-output predictor")
+
        if self.network() is None:
            self._network = self.make_network(
                allele_representations=allele_representations,
@@ -621,10 +637,26 @@ class Class1NeuralNetwork(object):
        else:
            sample_weights_with_random_negatives = None

+        if output_indices is not None:
+            output_indices_with_random_negatives = numpy.concatenate([
+                numpy.zeros(int(num_random_negative.sum()), dtype=int),
+                output_indices
+            ])
+        else:
+            output_indices_with_random_negatives = None
+
        if loss_encode_y_function is not None:
+            encode_y_kwargs = {}
+            if adjusted_inequalities_with_random_negatives is not None:
+                encode_y_kwargs["inequalities"] = (
+                    adjusted_inequalities_with_random_negatives)
+            if output_indices_with_random_negatives is not None:
+                encode_y_kwargs["output_indices"] = (
+                    output_indices_with_random_negatives)
+
            y_dict_with_random_negatives['output'] = loss_encode_y_function(
                y_dict_with_random_negatives['output'],
-                adjusted_inequalities_with_random_negatives)
+                **encode_y_kwargs)

        val_losses = []
        min_val_loss_iteration = None
@@ -741,7 +773,12 @@ class Class1NeuralNetwork(object):
        fit_info["num_points"] = len(peptides)
        self.fit_info.append(dict(fit_info))

-    def predict(self, peptides, allele_encoding=None, batch_size=4096):
+    def predict(
+            self,
+            peptides,
+            allele_encoding=None,
+            batch_size=4096,
+            output_index=0):
        """
        Predict affinities.

@@ -784,7 +821,9 @@ class Class1NeuralNetwork(object):
        else:
            network = self.network(borrow=True)
        raw_predictions = network.predict(x_dict, batch_size=batch_size)
-        predictions = numpy.array(raw_predictions, dtype = "float64")[:,0]
+        predictions = numpy.array(raw_predictions, dtype="float64")
+        if output_index is not None:
+            predictions = predictions[:, output_index]
        result = to_ic50(predictions)
        if use_cache:
            self.prediction_cache[peptides] = result
@@ -807,6 +846,7 @@ class Class1NeuralNetwork(object):
            dropout_probability,
            batch_normalization,
            locally_connected_layers,
+            num_outputs=1,
            allele_representations=None):
        """
        Helper function to make a keras network for class1 affinity prediction.
@@ -913,7 +953,7 @@ class Class1NeuralNetwork(object):
                name="dropout_%d" % i)(current_layer)

        output = Dense(
-            1,
+            num_outputs,
            kernel_initializer=init,
            activation=output_activation,
            name="output")(current_layer)
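
For orientation, here is a sketch (hypothetical, not part of the patch) of how the predict() API extended above is meant to be called. Per the code in the hunk at line 821, the default output_index=0 preserves the previous single-output behavior, while output_index=None returns one column per output:

    # Sketch only: assumes `predictor` is a Class1NeuralNetwork fit with
    # num_outputs=2 and a loss supporting multiple outputs.
    peptides = ["SIINFEKLL", "QYDDAVYKL"]  # hypothetical 9-mers

    ic50s = predictor.predict(peptides)  # default output_index=0; shape (2,)
    matrix = predictor.predict(peptides, output_index=None)  # shape (2, 2)
    first_output = matrix[:, 0]  # the same output selected by the default
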
diff --git a/mhcflurry/custom_loss.py b/mhcflurry/custom_loss.py
index cc885e003af6ec5105db900efaf3709df429f88e..7cad898b11e423e3969cf09b95b90fa6f3027b51 100644
--- a/mhcflurry/custom_loss.py
+++ b/mhcflurry/custom_loss.py
@@ -5,8 +5,9 @@ For losses supporting inequalities, each training data point is associated
 with one of (=), (<), or (>). For e.g. (>) inequalities, penalization is
 applied only if the prediction is less than the given value.
 """
-
+from __future__ import division
 import pandas
+import numpy
 from numpy import isnan, array

 CUSTOM_LOSSES = {}
@@ -39,6 +40,7 @@ class MSEWithInequalities(object):
    """
    name = "mse_with_inequalities"
    supports_inequalities = True
+    supports_multiple_outputs = False

    @staticmethod
    def encode_y(y, inequalities=None):
@@ -70,6 +72,8 @@ class MSEWithInequalities(object):
        # without tensorflow debug output, etc.
        from keras import backend as K

+        y_pred = K.flatten(y_pred)
+
        # Handle (=) inequalities
        diff1 = y_pred - y_true
        diff1 *= K.cast(y_true >= 0.0, "float32")
@@ -89,8 +93,78 @@ class MSEWithInequalities(object):
        return (
            K.sum(K.square(diff1), axis=-1) +
            K.sum(K.square(diff2), axis=-1) +
-            K.sum(K.square(diff3), axis=-1))
+            K.sum(K.square(diff3), axis=-1)) / K.cast(K.shape(y_true)[0], "float32")
+
+
+class MSEWithInequalitiesAndMultipleOutputs(object):
+    """
+    Loss supporting inequalities and multiple outputs.
+
+    Inequality encoding is delegated to MSEWithInequalities, which yields
+    encoded values less than 10. The output index for each data point is
+    then packed into the tens place (encoded += 10 * output_index) and
+    unpacked again inside the loss.
+    """
+    name = "mse_with_inequalities_and_multiple_outputs"
+    supports_inequalities = True
+    supports_multiple_outputs = True
+
+    @staticmethod
+    def encode_y(y, inequalities=None, output_indices=None):
+        y = array(y, dtype="float32")
+        if isnan(y).any():
+            raise ValueError("y contains NaN")
+        if (y > 1.0).any():
+            raise ValueError("y contains values > 1.0")
+        if (y < 0.0).any():
+            raise ValueError("y contains values < 0.0")
+
+        encoded = MSEWithInequalities.encode_y(
+            y, inequalities=inequalities)
+
+        if output_indices is not None:
+            output_indices = numpy.array(output_indices)
+            check_shape("output_indices", output_indices, (len(encoded),))
+            if (output_indices < 0).any():
+                raise ValueError("Invalid output indices: %s" % output_indices)
+
+            encoded += output_indices * 10
+
+        return encoded
+
+    @staticmethod
+    def loss(y_true, y_pred):
+        from keras import backend as K
+
+        output_indices = y_true // 10
+        updated_y_true = y_true - (10 * output_indices)
+
+        # We index into y_pred using flattened indices, since the Keras
+        # backend supports gather but has no equivalent of tf.gather_nd:
+        ordinals = K.arange(K.shape(y_true)[0])
+        flattened_indices = (
+            ordinals * y_pred.shape[1] + K.cast(output_indices, "int32"))
+        updated_y_pred = K.gather(K.flatten(y_pred), flattened_indices)
+
+        # Alternative implementation using tensorflow, which could be used
+        # if we drop support for other backends:
+        # import tensorflow as tf
+        # indexer = K.stack([
+        #     ordinals,
+        #     K.cast(output_indices, "int32")
+        # ], axis=-1)
+        # updated_y_pred = tf.gather_nd(y_pred, indexer)
+
+        return MSEWithInequalities.loss(updated_y_true, updated_y_pred)
+
+
+def check_shape(name, arr, expected_shape):
+    """Raise ValueError if arr does not have the expected shape."""
+    if arr.shape != expected_shape:
+        raise ValueError("Expected %s to have shape %s not %s" % (
+            name, str(expected_shape), str(arr.shape)))
+

 # Register custom losses.
-for cls in [MSEWithInequalities]:
+for cls in [MSEWithInequalities, MSEWithInequalitiesAndMultipleOutputs]:
     CUSTOM_LOSSES[cls.name] = cls()
\ No newline at end of file
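
To make the packed y_true encoding concrete, a small worked example. The exact numbers assume the pre-existing MSEWithInequalities.encode_y offsets of +0/+2/+4 for (=)/(>)/(<); the packing scheme above only requires that encoding to stay below 10:

    from mhcflurry.custom_loss import CUSTOM_LOSSES

    loss_obj = CUSTOM_LOSSES["mse_with_inequalities_and_multiple_outputs"]
    encoded = loss_obj.encode_y(
        [0.2, 0.9],
        inequalities=["=", ">"],  # assumed offsets: (=) +0, (>) +2, (<) +4
        output_indices=[0, 1])
    # encoded ~= [0.2, 12.9]: the tens place carries the output index and
    # the remainder the inequality-encoded target, so the loss recovers
    # them via y_true // 10 and y_true - 10 * output_indices.
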
diff --git a/test/test_custom_loss.py b/test/test_custom_loss.py
index f3644c66652136ca7c9f05299f1a20e6995fc8fd..005418ad5f475446af8c563754445c808fe678d8 100644
--- a/test/test_custom_loss.py
+++ b/test/test_custom_loss.py
@@ -13,6 +13,14 @@ from mhcflurry.custom_loss import CUSTOM_LOSSES


 def evaluate_loss(loss, y_true, y_pred):
+    y_true = numpy.array(y_true)
+    y_pred = numpy.array(y_pred)
+    if y_pred.ndim == 1:
+        y_pred = y_pred.reshape((len(y_pred), 1))
+
+    assert y_true.ndim == 1
+    assert y_pred.ndim == 2
+
    if K.backend() == "tensorflow":
        session = K.get_session()
        y_true_var = K.constant(y_true, name="y_true")
@@ -28,14 +36,13 @@ def evaluate_loss(loss, y_true, y_pred):
        raise ValueError("Unsupported backend: %s" % K.backend())


-def test_mse_with_inequalities():
-
-    loss_obj = CUSTOM_LOSSES['mse_with_inequalities']
-
+def test_mse_with_inequalities(loss_obj=CUSTOM_LOSSES['mse_with_inequalities']):
    y_values = [0.0, 0.5, 0.8, 1.0]
    adjusted_y = loss_obj.encode_y(y_values)
+    print(adjusted_y)
    loss0 = evaluate_loss(loss_obj.loss, adjusted_y, y_values)
+    print(loss0)
    eq_(loss0, 0.0)

    adjusted_y = loss_obj.encode_y(y_values, [">", ">", ">", ">"])
@@ -64,9 +71,70 @@ def test_mse_with_inequalities():

    adjusted_y = loss_obj.encode_y(y_values, ["=", "<", ">", ">"])
    loss0 = evaluate_loss(loss_obj.loss, adjusted_y, [0.1, 0.6, 0.9, 1.0])
-    assert_almost_equal(loss0, 0.02)
+    assert_almost_equal(loss0, 0.02 / 4)

    adjusted_y = loss_obj.encode_y(y_values, ["=", "<", "=", ">"])
    loss0 = evaluate_loss(loss_obj.loss, adjusted_y, [0.1, 0.6, 0.9, 1.0])
-    assert_almost_equal(loss0, 0.03)
+    assert_almost_equal(loss0, 0.03 / 4)
+
+
+def test_mse_with_inequalities_and_multiple_outputs():
+    loss_obj = CUSTOM_LOSSES['mse_with_inequalities_and_multiple_outputs']
+    test_mse_with_inequalities(loss_obj)
+
+    y_values = [0.0, 0.5, 0.8, 1.0]
+    adjusted_y = loss_obj.encode_y(
+        y_values, output_indices=[0, 1, 1, 1])
+    loss0 = evaluate_loss(
+        loss_obj.loss,
+        adjusted_y,
+        [
+            [0.0, 1000],
+            [2000, 0.5],
+            [3000, 0.8],
+            [4000, 1.0],
+        ])
+    assert_almost_equal(loss0, 0.0)
+
+    y_values = [0.0, 0.5, 0.8, 1.0]
+    adjusted_y = loss_obj.encode_y(
+        y_values, output_indices=[0, 1, 1, 0])
+    loss0 = evaluate_loss(
+        loss_obj.loss,
+        adjusted_y,
+        [
+            [0.1, 1000],
+            [2000, 0.6],
+            [3000, 0.8],
+            [1.0, 4000],
+        ])
+    assert_almost_equal(loss0, 0.02 / 4)
+
+    y_values = [0.0, 0.5, 0.8, 1.0]
+    adjusted_y = loss_obj.encode_y(
+        y_values, output_indices=[0, 1, 1, 0], inequalities=["=", ">", "<", "<"])
+    loss0 = evaluate_loss(
+        loss_obj.loss,
+        adjusted_y,
+        [
+            [0.1, 1000],
+            [2000, 0.6],
+            [3000, 0.8],
+            [1.0, 4000],
+        ])
+    assert_almost_equal(loss0, 0.01 / 4)
+
+    y_values = [0.0, 0.5, 0.8, 1.0]
+    adjusted_y = loss_obj.encode_y(
+        y_values, output_indices=[0, 1, 1, 0], inequalities=["=", "<", "<", "<"])
+    loss0 = evaluate_loss(
+        loss_obj.loss,
+        adjusted_y,
+        [
+            [0.1, 1000],
+            [2000, 0.6],
+            [3000, 0.8],
+            [1.0, 4000],
+        ])
+    assert_almost_equal(loss0, 0.02 / 4)
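
The / 4 appearing in the updated expectations reflects the loss's new normalization by the number of points in the batch (the division by K.shape(y_true)[0] added above). Checking the ["=", "<", ">", ">"] case by hand:

    # (=) target 0.0, prediction 0.1         -> squared error 0.01
    # (<) bound  0.5, prediction 0.6 (over)  -> squared error 0.01
    # (>) bounds 0.8 and 1.0 are satisfied   -> no penalty
    expected = (0.1 ** 2 + 0.1 ** 2) / 4  # == 0.02 / 4 == 0.005
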
diff --git a/test/test_multi_output.py b/test/test_multi_output.py
new file mode 100644
index 0000000000000000000000000000000000000000..436c4c9de792ac0413d95d9803bde11287981cc9
--- /dev/null
+++ b/test/test_multi_output.py
@@ -0,0 +1,96 @@
+from nose.tools import assert_less
+
+import numpy
+import pandas
+
+numpy.random.seed(0)
+
+import logging
+logging.getLogger('tensorflow').disabled = True
+
+from mhcflurry.class1_neural_network import Class1NeuralNetwork
+from mhcflurry.common import random_peptides
+
+
+def test_multi_output():
+    # Memorize the dataset.
+    hyperparameters = dict(
+        loss="custom:mse_with_inequalities_and_multiple_outputs",
+        activation="tanh",
+        layer_sizes=[16],
+        max_epochs=50,
+        minibatch_size=32,
+        random_negative_rate=0.0,
+        random_negative_constant=0.0,
+        early_stopping=False,
+        validation_split=0.0,
+        locally_connected_layers=[],
+        dense_layer_l1_regularization=0.0,
+        dropout_probability=0.0,
+        num_outputs=2)
+
+    # Each output tracks its own sequence motif: output1 is determined by
+    # the residue at (0-based) position 4, output2 by position 3.
+    df = pandas.DataFrame()
+    df["peptide"] = random_peptides(10000, length=9)
+    df["output1"] = df.peptide.map(lambda s: s[4] == 'K').astype(int) * 10000 + 0.01
+    df["output2"] = df.peptide.map(lambda s: s[3] == 'Q').astype(int) * 10000 + 0.01
+
+    print("output1 mean", df.output1.mean())
+    print("output2 mean", df.output2.mean())
+
+    # Reshape to the long format fit() expects: one row per
+    # (peptide, output) measurement.
+    stacked = df.set_index("peptide").stack().reset_index()
+    stacked.columns = ['peptide', 'output_name', 'value']
+    stacked["output_index"] = stacked.output_name.map({
+        "output1": 0,
+        "output2": 1,
+    })
+    assert not stacked.output_index.isnull().any(), stacked
+
+    fit_kwargs = {
+        'verbose': 1,
+    }
+
+    predictor = Class1NeuralNetwork(**hyperparameters)
+    predictor.fit(
+        stacked.peptide.values,
+        stacked.value.values,
+        output_indices=stacked.output_index.values,
+        **fit_kwargs)
+    result = predictor.predict(df.peptide.values, output_index=None)
+    print(df.shape, result.shape)
+    print(result)
+
+    df["prediction1"] = result[:, 0]
+    df["prediction2"] = result[:, 1]
+
+    # "Binders" for an output are the peptides assigned the tight (0.01 nM)
+    # affinity for that output.
+    df["binder1"] = df.output1 < 1000
+    df["binder2"] = df.output2 < 1000
+
+    print("***** output1: *****")
+    print(df.groupby("binder1").prediction1.describe())
+
+    print("***** output2: *****")
+    print(df.groupby("binder2").prediction2.describe())
+
+    # Each output should assign tighter (lower) predicted affinities to its
+    # own binders than to its own non-binders.
+    assert_less(
+        df.loc[df.binder1].prediction1.mean(),
+        df.loc[~df.binder1].prediction1.mean())
+    assert_less(
+        df.loc[df.binder2].prediction2.mean(),
+        df.loc[~df.binder2].prediction2.mean())
+
+    # The outputs should be independent: within the binders for one output,
+    # the other output's predictions should still track its own motif.
+    assert_less(
+        df.loc[df.binder2 & df.binder1].prediction1.mean(),
+        df.loc[df.binder2 & ~df.binder1].prediction1.mean())
+    assert_less(
+        df.loc[df.binder1 & df.binder2].prediction2.mean(),
+        df.loc[df.binder1 & ~df.binder2].prediction2.mean())
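
For reference, the set_index/stack reshape used in the new test is what converts a wide, one-row-per-peptide table into the long, one-row-per-measurement format that fit() consumes alongside output_indices. With hypothetical values:

    import pandas

    wide = pandas.DataFrame({
        "peptide": ["AAAAKAAAA", "AAAQAAAAA"],  # hypothetical 9-mers
        "output1": [0.01, 10000.01],
        "output2": [10000.01, 0.01],
    })
    stacked = wide.set_index("peptide").stack().reset_index()
    stacked.columns = ["peptide", "output_name", "value"]
    stacked["output_index"] = stacked.output_name.map(
        {"output1": 0, "output2": 1})
    # stacked now has four rows, one per (peptide, output) measurement:
    #      peptide output_name     value  output_index
    #    AAAAKAAAA     output1      0.01             0
    #    AAAAKAAAA     output2  10000.01             1
    #    AAAQAAAAA     output1  10000.01             0
    #    AAAQAAAAA     output2      0.01             1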