From 289dbb090bc76cab5fd2ca1555e9a5988a43623f Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Tue, 16 Jan 2018 14:15:19 -0500
Subject: [PATCH] Fix bug in inequality support, bump to version 1.1.0, add
 test

---
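[Reviewer notes below this line are not part of the commit.]

The bug: in 1.0.0, encode_y mapped '<' to the 2-3 interval and '>' to the
4-5 interval, while the loss function treats 2-3 as (>) and 4-5 as (<), so
inequality constraints were enforced in the wrong direction. The renamed
custom_loss.py flips the mapping to agree with the loss. A quick sketch of
the corrected encoding (assumes mhcflurry with this patch applied is
importable):

    from mhcflurry.custom_loss import CUSTOM_LOSSES

    loss = CUSTOM_LOSSES["mse_with_inequalities"]
    # (=) passes through; (>) shifts into 2-3; (<) shifts into 4-5.
    print(loss.encode_y([0.9, 0.1, 0.5], inequalities=["=", ">", "<"]))
    # -> approximately [0.9, 2.1, 4.5]
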
 mhcflurry/__init__.py               |   2 +-
 mhcflurry/class1_neural_network.py  |  24 +++---
 mhcflurry/custom_loss.py            |  96 ++++++++++++++++++++++
 mhcflurry/downloads.yml             |  29 ++++++-
 mhcflurry/loss_with_inequalities.py |  78 ------------------
 test/test_class1_neural_network.py  | 118 ++++++++++++++++++++++++++--
 6 files changed, 252 insertions(+), 95 deletions(-)
 create mode 100644 mhcflurry/custom_loss.py
 delete mode 100644 mhcflurry/loss_with_inequalities.py

diff --git a/mhcflurry/__init__.py b/mhcflurry/__init__.py
index 600882fd..3e0abd04 100644
--- a/mhcflurry/__init__.py
+++ b/mhcflurry/__init__.py
@@ -1,7 +1,7 @@
 from mhcflurry.class1_affinity_predictor import Class1AffinityPredictor
 from mhcflurry.class1_neural_network import Class1NeuralNetwork
 
-__version__ = "1.0.0"
+__version__ = "1.1.0"
 
 __all__ = [
     "__version__",
diff --git a/mhcflurry/class1_neural_network.py b/mhcflurry/class1_neural_network.py
index 58349987..73405fdb 100644
--- a/mhcflurry/class1_neural_network.py
+++ b/mhcflurry/class1_neural_network.py
@@ -11,7 +11,7 @@ from .encodable_sequences import EncodableSequences
 from .amino_acid import available_vector_encodings, vector_encoding_length
 from .regression_target import to_ic50, from_ic50
 from .common import random_peptides, amino_acid_distribution
-from .loss_with_inequalities import encode_y, LOSSES
+from .custom_loss import CUSTOM_LOSSES
 
 
 class Class1NeuralNetwork(object):
@@ -447,23 +447,29 @@ class Class1NeuralNetwork(object):
         if self.hyperparameters['loss'].startswith("custom:"):
             # Using a custom loss that supports inequalities
             try:
-                loss_name_or_function = LOSSES[
+                custom_loss = CUSTOM_LOSSES[
                     self.hyperparameters['loss'].replace("custom:", "")
                 ]
             except KeyError:
                 raise ValueError(
                     "No such custom loss function: %s. Supported losses are: %s" % (
                         self.hyperparameters['loss'],
-                        ", ".join(["custom:" + loss_name for loss_name in LOSSES])))
-            loss_supports_inequalities = True
+                        ", ".join([
+                            "custom:" + loss_name for loss_name in CUSTOM_LOSSES
+                        ])))
+            loss_name_or_function = custom_loss.loss
+            loss_supports_inequalities = custom_loss.supports_inequalities
+            loss_encode_y_function = custom_loss.encode_y
         else:
             # Using a regular keras loss. No inequalities supported.
             loss_name_or_function = self.hyperparameters['loss']
             loss_supports_inequalities = False
+            loss_encode_y_function = None
 
-            if any(inequality != "=" for inequality in adjusted_inequalities):
-                raise ValueError("Loss %s does not support inequalities" % (
-                    loss_name_or_function))
+        if not loss_supports_inequalities and (
+                any(inequality != "=" for inequality in adjusted_inequalities)):
+            raise ValueError("Loss %s does not support inequalities" % (
+                loss_name_or_function))
 
         if self.network() is None:
             self._network = self.make_network(
@@ -512,8 +518,8 @@ class Class1NeuralNetwork(object):
         else:
             sample_weights_with_random_negatives = None
 
-        if loss_supports_inequalities:
-            y_dict_with_random_negatives['output'] = encode_y(
+        if loss_encode_y_function is not None:
+            y_dict_with_random_negatives['output'] = loss_encode_y_function(
                 y_dict_with_random_negatives['output'],
                 adjusted_inequalities_with_random_negatives)
 
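
Reviewer note, not part of the patch: a sketch of the loss-resolution path the
hunks above implement. A "custom:"-prefixed loss name is looked up in the
CUSTOM_LOSSES registry; the returned object bundles the Keras loss function,
an inequality-support flag, and the encode_y preprocessing hook:

    from mhcflurry.custom_loss import CUSTOM_LOSSES

    loss_name = "custom:mse_with_inequalities"
    custom_loss = CUSTOM_LOSSES[loss_name.replace("custom:", "")]
    assert custom_loss.supports_inequalities
    # custom_loss.loss is ultimately handed to Keras at compile time;
    # custom_loss.encode_y rewrites y_true just before fitting.
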
diff --git a/mhcflurry/custom_loss.py b/mhcflurry/custom_loss.py
new file mode 100644
index 00000000..cc885e00
--- /dev/null
+++ b/mhcflurry/custom_loss.py
@@ -0,0 +1,96 @@
+"""
+Custom loss functions.
+
+For losses supporting inequalities, each training data point is associated with
+one of (=), (<), or (>). For a (>) inequality, for example, a penalty is applied
+only if the prediction is less than the given value.
+"""
+
+import pandas
+from numpy import isnan, array
+
+CUSTOM_LOSSES = {}
+
+
+class MSEWithInequalities(object):
+    """
+    Supports training a regressor on data that includes inequalities
+    (e.g. x < 100). Mean square error is used as the loss for elements with
+    an (=) inequality. For elements with e.g. a (> 0.5) inequality, the loss
+    for that element is (y_pred - 0.5)**2 (standard MSE) if y_pred < 0.5, else 0.
+
+    This loss assumes that the normal range for y_true and y_pred is 0 - 1. As a
+    hack, the implementation uses other intervals for y_pred to encode the
+    inequality information.
+
+    y_true is interpreted as follows:
+
+    between 0 - 1:
+       Regular MSE loss is used. Penalty (y_pred - y_true)**2 is applied
+       whether y_pred is greater or less than y_true.
+
+    between 2 - 3:
+       Treated as a ">" inequality. Penalty (y_pred - (y_true - 2))**2 is
+       applied only if y_pred is less than y_true - 2.
+
+    between 4 - 5:
+       Treated as a "<" inequality. Penalty (y_pred - (y_true - 4))**2 is
+       applied only if y_pred is greater than y_true - 4.
+    """
+    name = "mse_with_inequalities"
+    supports_inequalities = True
+
+    @staticmethod
+    def encode_y(y, inequalities=None):
+        y = array(y, dtype="float32")
+        if isnan(y).any():
+            raise ValueError("y contains NaN")
+        if (y > 1.0).any():
+            raise ValueError("y contains values > 1.0")
+        if (y < 0.0).any():
+            raise ValueError("y contains values < 0.0")
+
+        if inequalities is None:
+            encoded = y
+        else:
+            offsets = pandas.Series(inequalities).map({
+                '=': 0,
+                '>': 2,
+                '<': 4,
+            }).values
+            if isnan(offsets).any():
+                raise ValueError("Invalid inequality. Must be =, <, or >")
+            encoded = y + offsets
+        assert not isnan(encoded).any()
+        return encoded
+
+    @staticmethod
+    def loss(y_true, y_pred):
+        # We always delay the import of Keras so that mhcflurry can be
+        # imported without triggering tensorflow's startup/debug output.
+        from keras import backend as K
+
+        # Handle (=) inequalities
+        diff1 = y_pred - y_true
+        diff1 *= K.cast(y_true >= 0.0, "float32")
+        diff1 *= K.cast(y_true <= 1.0, "float32")
+
+        # Handle (>) inequalities
+        diff2 = y_pred - (y_true - 2.0)
+        diff2 *= K.cast(y_true >= 2.0, "float32")
+        diff2 *= K.cast(y_true <= 3.0, "float32")
+        diff2 *= K.cast(diff2 < 0.0, "float32")
+
+        # Handle (<) inequalities
+        diff3 = y_pred - (y_true - 4.0)
+        diff3 *= K.cast(y_true >= 4.0, "float32")
+        diff3 *= K.cast(diff3 > 0.0, "float32")
+
+        return (
+            K.sum(K.square(diff1), axis=-1) +
+            K.sum(K.square(diff2), axis=-1) +
+            K.sum(K.square(diff3), axis=-1))
+
+# Register custom losses.
+for cls in [MSEWithInequalities]:
+    CUSTOM_LOSSES[cls.name] = cls()
\ No newline at end of file
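
Reviewer note, not part of the patch: a pure-numpy mirror of the loss above,
handy for sanity-checking the masking logic outside a Keras session. It
follows the three masked terms line for line:

    import numpy as np

    def mse_with_inequalities_np(y_true, y_pred):
        y_true = np.asarray(y_true, dtype="float32")
        y_pred = np.asarray(y_pred, dtype="float32")
        # (=): y_true in [0, 1], penalize in both directions.
        diff1 = (y_pred - y_true) * (y_true >= 0.0) * (y_true <= 1.0)
        # (>): y_true in [2, 3], penalize underprediction only.
        diff2 = (y_pred - (y_true - 2.0)) * (y_true >= 2.0) * (y_true <= 3.0)
        diff2 = diff2 * (diff2 < 0.0)
        # (<): y_true >= 4, penalize overprediction only.
        diff3 = (y_pred - (y_true - 4.0)) * (y_true >= 4.0)
        diff3 = diff3 * (diff3 > 0.0)
        return np.sum(diff1 ** 2 + diff2 ** 2 + diff3 ** 2, axis=-1)

    # A "> 0.1" example (encoded as 2.1) is penalized only when the
    # prediction falls below 0.1:
    print(mse_with_inequalities_np([2.1], [0.5]))   # 0.0
    print(mse_with_inequalities_np([2.1], [0.05]))  # approx 0.0025
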
diff --git a/mhcflurry/downloads.yml b/mhcflurry/downloads.yml
index cf7709b9..26f46fb5 100644
--- a/mhcflurry/downloads.yml
+++ b/mhcflurry/downloads.yml
@@ -8,7 +8,7 @@
 # by name, the downloads with "default=true" are downloaded.
 
 # This should usually be the latest release.
-current-release: 1.0.0
+current-release: 1.1.0
 
 # An integer indicating what models the current MHCflurry code base is compatible
 # with. Increment this integer when changes are made to MHCflurry that would break
@@ -17,6 +17,33 @@ current-compatibility-version: 2
 
 # Add new releases here as they are made.
 releases:
+    1.1.0:
+        compatibility-version: 2
+        downloads:
+            - name: models_class1
+              url: http://github.com/hammerlab/mhcflurry/releases/download/pre-1.1/models_class1.20180116.tar.bz2
+              default: true
+
+            - name: models_class1_experiments1
+              url: http://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/models_class1_experiments1.tar.bz2
+              default: false
+
+            - name: cross_validation_class1
+              url: http://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/cross_validation_class1.tar.bz2
+              default: false
+
+            - name: data_iedb
+              url: https://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/data_iedb.tar.bz2
+              default: false
+
+            - name: data_kim2014
+              url: http://github.com/hammerlab/mhcflurry/releases/download/0.9.1/data_kim2014.tar.bz2
+              default: false
+
+            - name: data_curated
+              url: https://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/data_curated.tar.bz2
+              default: true
+
     1.0.0:
         compatibility-version: 2
         downloads:
diff --git a/mhcflurry/loss_with_inequalities.py b/mhcflurry/loss_with_inequalities.py
deleted file mode 100644
index 98a0383e..00000000
--- a/mhcflurry/loss_with_inequalities.py
+++ /dev/null
@@ -1,78 +0,0 @@
-"""
-Custom loss functions.
-
-Supports training a regressor on data that includes inequalities (e.g. x < 100).
-
-This loss assumes that the normal range for y_true and y_pred is 0 - 1. As a
-hack, the implementation uses other intervals for y_pred to encode the
-inequality information.
-
-y_true is interpreted as follows:
-
-between 0 - 1
-   Regular MSE loss is used. Penality (y_pred - y_true)**2 is applied if
-   y_pred is greater or less than y_true.
-
-between 2 - 3:
-   Treated as a "<" inequality. Penality (y_pred - (y_true - 2))**2 is
-   applied only if y_pred is greater than y_true - 2.
-
-between 4 - 5:
-   Treated as a ">" inequality. Penality (y_pred - (y_true - 4))**2 is
-   applied only if y_pred is less than y_true - 4.
-"""
-
-from keras import backend as K
-
-import pandas
-from numpy import isnan, array
-
-LOSSES = {}
-
-
-def encode_y(y, inequalities=None):
-    y = array(y, dtype="float32")
-    if isnan(y).any():
-        raise ValueError("y contains NaN")
-    if (y > 1.0).any():
-        raise ValueError("y contains values > 1.0")
-    if (y < 0.0).any():
-        raise ValueError("y contains values < 0.0")
-
-    if inequalities is None:
-        encoded = y
-    else:
-        offsets = pandas.Series(inequalities).map({
-            '=': 0,
-            '<': 2,
-            '>': 4,
-        }).values
-        if isnan(offsets).any():
-            raise ValueError("Invalid inequality. Must be =, <, or >")
-        encoded = y + offsets
-    assert not isnan(encoded).any()
-    return encoded
-
-
-def mse_with_inequalities(y_true, y_pred):
-    # Handle (=) inequalities
-    diff1 = y_pred - y_true
-    diff1 *= K.cast(y_true >= 0.0, "float32")
-    diff1 *= K.cast(y_true <= 1.0, "float32")
-
-    # Handle (>) inequalities
-    diff2 = y_pred - (y_true - 2.0)
-    diff2 *= K.cast(y_true >= 2.0, "float32")
-    diff2 *= K.cast(y_true <= 3.0, "float32")
-    diff2 *= K.cast(diff2 < 0.0, "float32")
-
-    # Handle (<) inequalities
-    diff3 = y_pred - (y_true - 4.0)
-    diff3 *= K.cast(y_true >= 4.0, "float32")
-    diff3 *= K.cast(diff3 > 0.0, "float32")
-
-    return (
-        K.sum(K.square(diff1), axis=-1) +
-        K.sum(K.square(diff2), axis=-1) +
-        K.sum(K.square(diff3), axis=-1))
-LOSSES["mse_with_inequalities"] = mse_with_inequalities
\ No newline at end of file
diff --git a/test/test_class1_neural_network.py b/test/test_class1_neural_network.py
index 835c632d..da83940b 100644
--- a/test/test_class1_neural_network.py
+++ b/test/test_class1_neural_network.py
@@ -1,16 +1,19 @@
+from nose.tools import eq_, assert_less, assert_greater, assert_almost_equal
+
 import numpy
 import pandas
-numpy.random.seed(0)
+from numpy import testing
 
-from mhcflurry.class1_neural_network import Class1NeuralNetwork
+numpy.random.seed(0)
 
-from nose.tools import eq_
-from numpy import testing
+import logging
+logging.getLogger('tensorflow').disabled = True
 
+from mhcflurry.class1_neural_network import Class1NeuralNetwork
 from mhcflurry.downloads import get_path
+from mhcflurry.common import random_peptides
 
-
-def test_class1_affinity_predictor_a0205_training_accuracy():
+def test_class1_neural_network_a0205_training_accuracy():
     # Memorize the dataset.
     hyperparameters = dict(
         activation="tanh",
@@ -80,3 +83,106 @@ def test_class1_affinity_predictor_a0205_training_accuracy():
     predictor2.fit(df.peptide.values, df.measurement_value.values)
     eq_(predictor.network().to_json(), predictor2.network().to_json())
 
+
+def test_inequalities():
+    # Memorize the dataset.
+    hyperparameters = dict(
+        loss="custom:mse_with_inequalities",
+        activation="tanh",
+        layer_sizes=[16],
+        max_epochs=50,
+        early_stopping=False,
+        validation_split=0.0,
+        locally_connected_layers=[
+            {
+                "filters": 8,
+                "activation": "tanh",
+                "kernel_size": 3
+            }
+        ],
+        dense_layer_l1_regularization=0.0,
+        dropout_probability=0.0)
+
+    df = pandas.DataFrame()
+    df["peptide"] = random_peptides(1000, length=9)
+
+    # First half are binders
+    df["binder"] = df.index < len(df) / 2
+    df["value"] = df.binder.map({True: 100, False: 5000})
+    df.loc[:10, "value"] = 1.0  # some strong binders
+    df["inequality1"] = "="
+    df["inequality2"] = df.binder.map({True: "<", False: "="})
+    df["inequality3"] = df.binder.map({True: "=", False: ">"})
+
+    # Binders get "A" at the start of the peptide; non-binders get "C"
+    df["peptide"] = [
+        ("C" if not row.binder else "A") + row.peptide[1:]
+        for _, row in df.iterrows()
+    ]
+
+    fit_kwargs = {'verbose': 0}
+
+    # Prediction1 uses no inequalities (i.e. all are (=))
+    predictor = Class1NeuralNetwork(**hyperparameters)
+    predictor.fit(
+        df.peptide.values,
+        df.value.values,
+        inequalities=df.inequality1.values,
+        **fit_kwargs)
+    df["prediction1"] = predictor.predict(df.peptide.values)
+
+    # Prediction2 has a (<) inequality on binders and an (=) on non-binders
+    predictor = Class1NeuralNetwork(**hyperparameters)
+    predictor.fit(
+        df.peptide.values,
+        df.value.values,
+        inequalities=df.inequality2.values,
+        **fit_kwargs)
+    df["prediction2"] = predictor.predict(df.peptide.values)
+
+    # Prediction3 has an (=) inequality on binders and a (>) on non-binders
+    predictor = Class1NeuralNetwork(**hyperparameters)
+    predictor.fit(
+        df.peptide.values,
+        df.value.values,
+        inequalities=df.inequality3.values,
+        **fit_kwargs)
+    df["prediction3"] = predictor.predict(df.peptide.values)
+
+    df_binders = df.loc[df.binder]
+    df_nonbinders = df.loc[~df.binder]
+
+    print("***** Binders: *****")
+    print(df_binders.head(5))
+
+    print("***** Non-binders: *****")
+    print(df_nonbinders.head(5))
+
+    # Binders should always be given tighter predicted affinity than non-binders
+    assert_less(df_binders.prediction1.mean(), df_nonbinders.prediction1.mean())
+    assert_less(df_binders.prediction2.mean(), df_nonbinders.prediction2.mean())
+    assert_less(df_binders.prediction3.mean(), df_nonbinders.prediction3.mean())
+
+    # prediction2 binders should be tighter on average than prediction1
+    # binders, since prediction2 has a (<) inequality for binders.
+    # Non-binders should be about the same between prediction2 and prediction1
+    assert_less(df_binders.prediction2.mean(), df_binders.prediction1.mean())
+    assert_almost_equal(
+        df_nonbinders.prediction2.mean(),
+        df_nonbinders.prediction1.mean(),
+        delta=2000)
+
+    # prediction3 non-binders should be weaker on average than prediction2 (or 1)
+    # non-binders, since prediction3 has a (>) inequality for these peptides.
+    # Binders should be about the same.
+    assert_greater(
+        df_nonbinders.prediction3.mean(),
+        df_nonbinders.prediction2.mean())
+    assert_greater(
+        df_nonbinders.prediction3.mean(),
+        df_nonbinders.prediction1.mean())
+    assert_almost_equal(
+        df_binders.prediction3.mean(),
+        df_binders.prediction1.mean(),
+        delta=2000)
+
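
Reviewer note, not part of the patch: a minimal end-to-end sketch of the API
the new test exercises, assuming a Keras backend is installed. The peptide and
affinity values are illustrative only:

    import numpy
    from mhcflurry import Class1NeuralNetwork
    from mhcflurry.common import random_peptides

    peptides = random_peptides(100, length=9)
    affinities = numpy.concatenate([
        numpy.full(50, 100.0),    # binders (nM)
        numpy.full(50, 5000.0),   # non-binders (nM)
    ])
    inequalities = ["<"] * 50 + ["="] * 50  # binder values are upper bounds

    predictor = Class1NeuralNetwork(
        loss="custom:mse_with_inequalities", max_epochs=5)
    predictor.fit(peptides, affinities, inequalities=inequalities, verbose=0)
    print(predictor.predict(peptides[:5]))
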
-- 
GitLab