From 289dbb090bc76cab5fd2ca1555e9a5988a43623f Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Tue, 16 Jan 2018 14:15:19 -0500
Subject: [PATCH] Fix bug in inequality support, bump to version 1.1.0, add
 test

---
 mhcflurry/__init__.py               |   2 +-
 mhcflurry/class1_neural_network.py  |  24 +++---
 mhcflurry/custom_loss.py            |  96 ++++++++++++++++++++++
 mhcflurry/downloads.yml             |  29 ++++++-
 mhcflurry/loss_with_inequalities.py |  78 ------------------
 test/test_class1_neural_network.py  | 118 ++++++++++++++++++++++++++--
 6 files changed, 252 insertions(+), 95 deletions(-)
 create mode 100644 mhcflurry/custom_loss.py
 delete mode 100644 mhcflurry/loss_with_inequalities.py

diff --git a/mhcflurry/__init__.py b/mhcflurry/__init__.py
index 600882fd..3e0abd04 100644
--- a/mhcflurry/__init__.py
+++ b/mhcflurry/__init__.py
@@ -1,7 +1,7 @@
 from mhcflurry.class1_affinity_predictor import Class1AffinityPredictor
 from mhcflurry.class1_neural_network import Class1NeuralNetwork
 
-__version__ = "1.0.0"
+__version__ = "1.1.0"
 
 __all__ = [
     "__version__",
diff --git a/mhcflurry/class1_neural_network.py b/mhcflurry/class1_neural_network.py
index 58349987..73405fdb 100644
--- a/mhcflurry/class1_neural_network.py
+++ b/mhcflurry/class1_neural_network.py
@@ -11,7 +11,7 @@ from .encodable_sequences import EncodableSequences
 from .amino_acid import available_vector_encodings, vector_encoding_length
 from .regression_target import to_ic50, from_ic50
 from .common import random_peptides, amino_acid_distribution
-from .loss_with_inequalities import encode_y, LOSSES
+from .custom_loss import CUSTOM_LOSSES
 
 
 class Class1NeuralNetwork(object):
@@ -447,23 +447,29 @@ class Class1NeuralNetwork(object):
         if self.hyperparameters['loss'].startswith("custom:"):
             # Using a custom loss that supports inequalities
             try:
-                loss_name_or_function = LOSSES[
+                custom_loss = CUSTOM_LOSSES[
                     self.hyperparameters['loss'].replace("custom:", "")
                 ]
             except KeyError:
                 raise ValueError(
                     "No such custom loss function: %s. Supported losses are: %s" % (
                         self.hyperparameters['loss'],
-                        ", ".join(["custom:" + loss_name for loss_name in LOSSES])))
-            loss_supports_inequalities = True
+                        ", ".join([
+                            "custom:" + loss_name for loss_name in CUSTOM_LOSSES
+                        ])))
+            loss_name_or_function = custom_loss.loss
+            loss_supports_inequalities = custom_loss.supports_inequalities
+            loss_encode_y_function = custom_loss.encode_y
         else:
             # Using a regular keras loss. No inequalities supported.
             loss_name_or_function = self.hyperparameters['loss']
             loss_supports_inequalities = False
+            loss_encode_y_function = None
 
-        if any(inequality != "=" for inequality in adjusted_inequalities):
-            raise ValueError("Loss %s does not support inequalities" % (
-                loss_name_or_function))
+        if not loss_supports_inequalities and (
+                any(inequality != "=" for inequality in adjusted_inequalities)):
+            raise ValueError("Loss %s does not support inequalities" % (
+                loss_name_or_function))
 
         if self.network() is None:
             self._network = self.make_network(
@@ -512,8 +518,8 @@ class Class1NeuralNetwork(object):
         else:
             sample_weights_with_random_negatives = None
 
-        if loss_supports_inequalities:
-            y_dict_with_random_negatives['output'] = encode_y(
+        if loss_encode_y_function is not None:
+            y_dict_with_random_negatives['output'] = loss_encode_y_function(
                 y_dict_with_random_negatives['output'],
                 adjusted_inequalities_with_random_negatives)
 
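The fit() changes above stop hard-coding a single loss function: anything looked up in CUSTOM_LOSSES is now expected to expose a loss callable, a supports_inequalities flag, and an encode_y method. A minimal sketch of that implied interface, with a hypothetical ExampleCustomLoss that is not part of this patch:

    # Sketch of the duck-typed interface fit() assumes for CUSTOM_LOSSES
    # entries. The class name and body here are illustrative only.
    class ExampleCustomLoss(object):
        name = "example_loss"          # selected as loss="custom:example_loss"
        supports_inequalities = False  # fit() consults this before allowing < or >

        @staticmethod
        def encode_y(y, inequalities=None):
            # Pack any per-sample inequality information into the y array
            # itself, since a Keras loss only ever sees (y_true, y_pred).
            return y

        @staticmethod
        def loss(y_true, y_pred):
            # Must be Keras-compatible: tensors in, tensor out.
            from keras import backend as K
            return K.mean(K.square(y_pred - y_true), axis=-1)
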
diff --git a/mhcflurry/custom_loss.py b/mhcflurry/custom_loss.py
new file mode 100644
index 00000000..cc885e00
--- /dev/null
+++ b/mhcflurry/custom_loss.py
@@ -0,0 +1,96 @@
+"""
+Custom loss functions.
+
+For losses supporting inequalities, each training data point is associated with
+one of (=), (<), or (>). For e.g. (>) inequalities, penalization is applied only
+if the prediction is less than the given value.
+"""
+
+import pandas
+from numpy import isnan, array
+
+CUSTOM_LOSSES = {}
+
+
+class MSEWithInequalities(object):
+    """
+    Supports training a regressor on data that includes inequalities
+    (e.g. x < 100). Mean square error is used as the loss for elements with
+    an (=) inequality. For elements with e.g. a (> 0.5) inequality, the loss
+    for that element is (y - 0.5)^2 (standard MSE) if y < 0.5 and 0 otherwise.
+
+    This loss assumes that the normal range for y_true and y_pred is 0 - 1. As a
+    hack, the implementation uses other intervals for y_pred to encode the
+    inequality information.
+
+    y_true is interpreted as follows:
+
+    between 0 - 1
+        Regular MSE loss is used. Penalty (y_pred - y_true)**2 is applied if
+        y_pred is greater or less than y_true.
+
+    between 2 - 3:
+        Treated as a ">" inequality. Penalty (y_pred - (y_true - 2))**2 is
+        applied only if y_pred is less than y_true - 2.
+
+    between 4 - 5:
+        Treated as a "<" inequality. Penalty (y_pred - (y_true - 4))**2 is
+        applied only if y_pred is greater than y_true - 4.
+    """
+    name = "mse_with_inequalities"
+    supports_inequalities = True
+
+    @staticmethod
+    def encode_y(y, inequalities=None):
+        y = array(y, dtype="float32")
+        if isnan(y).any():
+            raise ValueError("y contains NaN")
+        if (y > 1.0).any():
+            raise ValueError("y contains values > 1.0")
+        if (y < 0.0).any():
+            raise ValueError("y contains values < 0.0")
+
+        if inequalities is None:
+            encoded = y
+        else:
+            offsets = pandas.Series(inequalities).map({
+                '=': 0,
+                '>': 2,
+                '<': 4,
+            }).values
+            if isnan(offsets).any():
+                raise ValueError("Invalid inequality. Must be =, <, or >")
+            encoded = y + offsets
+        assert not isnan(encoded).any()
+        return encoded
+
+    @staticmethod
+    def loss(y_true, y_pred):
+        # We always delay import of Keras so that mhcflurry can be imported
+        # initially without tensorflow debug output, etc.
+        from keras import backend as K
+
+        # Handle (=) inequalities
+        diff1 = y_pred - y_true
+        diff1 *= K.cast(y_true >= 0.0, "float32")
+        diff1 *= K.cast(y_true <= 1.0, "float32")
+
+        # Handle (>) inequalities
+        diff2 = y_pred - (y_true - 2.0)
+        diff2 *= K.cast(y_true >= 2.0, "float32")
+        diff2 *= K.cast(y_true <= 3.0, "float32")
+        diff2 *= K.cast(diff2 < 0.0, "float32")
+
+        # Handle (<) inequalities
+        diff3 = y_pred - (y_true - 4.0)
+        diff3 *= K.cast(y_true >= 4.0, "float32")
+        diff3 *= K.cast(diff3 > 0.0, "float32")
+
+        return (
+            K.sum(K.square(diff1), axis=-1) +
+            K.sum(K.square(diff2), axis=-1) +
+            K.sum(K.square(diff3), axis=-1))
+
+# Register custom losses.
+for cls in [MSEWithInequalities]:
+    CUSTOM_LOSSES[cls.name] = cls()
\ No newline at end of file
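The interval encoding in encode_y is easiest to see with concrete numbers. A standalone sketch (not part of the patch) using the registry added above:

    from mhcflurry.custom_loss import CUSTOM_LOSSES

    loss_obj = CUSTOM_LOSSES["mse_with_inequalities"]

    # One measurement per inequality kind, all in the 0-1 regression space.
    y = [0.9, 0.9, 0.1]
    inequalities = ["=", ">", "<"]

    encoded = loss_obj.encode_y(y, inequalities)

    # "=" keeps its value; ">" is shifted into [2, 3]; "<" into [4, 5].
    print(encoded)  # approximately [0.9, 2.9, 4.1]

At loss time a prediction of 0.95 then incurs the usual MSE penalty against the first point, no penalty on the second (0.95 is above 0.9), and a penalty on the third (0.95 is above 0.1).
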
diff --git a/mhcflurry/downloads.yml b/mhcflurry/downloads.yml
index cf7709b9..26f46fb5 100644
--- a/mhcflurry/downloads.yml
+++ b/mhcflurry/downloads.yml
@@ -8,7 +8,7 @@
 # by name, the downloads with "default=true" are downloaded.
 # This should usually be the latest release.
 
-current-release: 1.0.0
+current-release: 1.1.0
 
 # An integer indicating what models the current MHCflurry code base is compatible
 # with. Increment this integer when changes are made to MHCflurry that would break
@@ -17,6 +17,33 @@ current-compatibility-version: 2
 
 # Add new releases here as they are made.
 releases:
+    1.1.0:
+        compatibility-version: 2
+        downloads:
+            - name: models_class1
+              url: http://github.com/hammerlab/mhcflurry/releases/download/pre-1.1/models_class1.20180116.tar.bz2
+              default: true
+
+            - name: models_class1_experiments1
+              url: http://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/models_class1_experiments1.tar.bz2
+              default: false
+
+            - name: cross_validation_class1
+              url: http://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/cross_validation_class1.tar.bz2
+              default: false
+
+            - name: data_iedb
+              url: https://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/data_iedb.tar.bz2
+              default: false
+
+            - name: data_kim2014
+              url: http://github.com/hammerlab/mhcflurry/releases/download/0.9.1/data_kim2014.tar.bz2
+              default: false
+
+            - name: data_curated
+              url: https://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/data_curated.tar.bz2
+              default: true
+
     1.0.0:
         compatibility-version: 2
         downloads:
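These YAML entries are what mhcflurry.downloads.get_path resolves against at runtime once a release has been fetched. A short usage sketch (not part of the patch; the printed path is illustrative and depends on the local data directory):

    from mhcflurry.downloads import get_path

    # Resolve a file inside the "models_class1" download for the configured
    # release. The download must already be present locally, e.g. fetched
    # with the mhcflurry-downloads command.
    models_dir = get_path("models_class1", "models")
    print(models_dir)  # e.g. ~/.local/share/mhcflurry/2/1.1.0/models_class1/models
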
-""" - -from keras import backend as K - -import pandas -from numpy import isnan, array - -LOSSES = {} - - -def encode_y(y, inequalities=None): - y = array(y, dtype="float32") - if isnan(y).any(): - raise ValueError("y contains NaN") - if (y > 1.0).any(): - raise ValueError("y contains values > 1.0") - if (y < 0.0).any(): - raise ValueError("y contains values < 0.0") - - if inequalities is None: - encoded = y - else: - offsets = pandas.Series(inequalities).map({ - '=': 0, - '<': 2, - '>': 4, - }).values - if isnan(offsets).any(): - raise ValueError("Invalid inequality. Must be =, <, or >") - encoded = y + offsets - assert not isnan(encoded).any() - return encoded - - -def mse_with_inequalities(y_true, y_pred): - # Handle (=) inequalities - diff1 = y_pred - y_true - diff1 *= K.cast(y_true >= 0.0, "float32") - diff1 *= K.cast(y_true <= 1.0, "float32") - - # Handle (>) inequalities - diff2 = y_pred - (y_true - 2.0) - diff2 *= K.cast(y_true >= 2.0, "float32") - diff2 *= K.cast(y_true <= 3.0, "float32") - diff2 *= K.cast(diff2 < 0.0, "float32") - - # Handle (<) inequalities - diff3 = y_pred - (y_true - 4.0) - diff3 *= K.cast(y_true >= 4.0, "float32") - diff3 *= K.cast(diff3 > 0.0, "float32") - - return ( - K.sum(K.square(diff1), axis=-1) + - K.sum(K.square(diff2), axis=-1) + - K.sum(K.square(diff3), axis=-1)) -LOSSES["mse_with_inequalities"] = mse_with_inequalities \ No newline at end of file diff --git a/test/test_class1_neural_network.py b/test/test_class1_neural_network.py index 835c632d..da83940b 100644 --- a/test/test_class1_neural_network.py +++ b/test/test_class1_neural_network.py @@ -1,16 +1,19 @@ +from nose.tools import eq_, assert_less, assert_greater, assert_almost_equal + import numpy import pandas -numpy.random.seed(0) +from numpy import testing -from mhcflurry.class1_neural_network import Class1NeuralNetwork +numpy.random.seed(0) -from nose.tools import eq_ -from numpy import testing +import logging +logging.getLogger('tensorflow').disabled = True +from mhcflurry.class1_neural_network import Class1NeuralNetwork from mhcflurry.downloads import get_path +from mhcflurry.common import random_peptides - -def test_class1_affinity_predictor_a0205_training_accuracy(): +def test_class1_neural_network_a0205_training_accuracy(): # Memorize the dataset. hyperparameters = dict( activation="tanh", @@ -80,3 +83,106 @@ def test_class1_affinity_predictor_a0205_training_accuracy(): predictor2.fit(df.peptide.values, df.measurement_value.values) eq_(predictor.network().to_json(), predictor2.network().to_json()) + +def test_inequalities(): + # Memorize the dataset. 
--
GitLab