Commit 99207e78 authored by Tim O'Donnell

Add support for multiple outputs. Tests still failing though.

parent 5cf8563f
@@ -55,6 +55,7 @@ class Class1NeuralNetwork(object):
"kernel_size": 3
}
],
num_outputs=1,
)
"""
Hyperparameters (and their default values) that affect the neural network
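A minimal sketch of how the new hyperparameter is meant to be used, mirroring the test added in this commit (the loss name and the value of 2 are taken from that test):

    predictor = Class1NeuralNetwork(
        loss="custom:mse_with_inequalities_and_multiple_outputs",
        num_outputs=2)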
@@ -424,6 +425,7 @@ class Class1NeuralNetwork(object):
affinities,
allele_encoding=None,
inequalities=None,
output_indices=None,
sample_weights=None,
shuffle_permutation=None,
verbose=1,
@@ -536,7 +538,7 @@ class Class1NeuralNetwork(object):
sample_weights = sample_weights[shuffle_permutation]
if self.hyperparameters['loss'].startswith("custom:"):
# Using a custom loss
try:
custom_loss = CUSTOM_LOSSES[
self.hyperparameters['loss'].replace("custom:", "")
@@ -550,11 +552,13 @@ class Class1NeuralNetwork(object):
])))
loss_name_or_function = custom_loss.loss
loss_supports_inequalities = custom_loss.supports_inequalities
loss_supports_multiple_outputs = custom_loss.supports_multiple_outputs
loss_encode_y_function = custom_loss.encode_y
else:
# Using a regular keras loss.
loss_name_or_function = self.hyperparameters['loss']
loss_supports_inequalities = False
loss_supports_multiple_outputs = False
loss_encode_y_function = None
if not loss_supports_inequalities and (
@@ -562,6 +566,18 @@ class Class1NeuralNetwork(object):
raise ValueError("Loss %s does not support inequalities" % (
loss_name_or_function))
if (
not loss_supports_multiple_outputs and
output_indices is not None and
(output_indices != 0).any()):
raise ValueError("Loss %s does not support multiple outputs" % (
output_indices))
if self.hyperparameters['num_outputs'] != 1:
if output_indices is None:
raise ValueError(
"Must supply output_indices for multi-output predictor")
if self.network() is None:
self._network = self.make_network(
allele_representations=allele_representations,
@@ -621,10 +637,26 @@ class Class1NeuralNetwork(object):
else:
sample_weights_with_random_negatives = None
if output_indices is not None:
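# Random negative peptides carry no output assignment of their own;
# they are all attributed to output index 0 here.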
output_indices_with_random_negatives = numpy.concatenate([
numpy.zeros(int(num_random_negative.sum()), dtype=int),
output_indices
])
else:
output_indices_with_random_negatives = None
if loss_encode_y_function is not None:
encode_y_kwargs = {}
if adjusted_inequalities_with_random_negatives is not None:
encode_y_kwargs["inequalities"] = (
adjusted_inequalities_with_random_negatives)
if output_indices_with_random_negatives is not None:
encode_y_kwargs["output_indices"] = (
output_indices_with_random_negatives)
y_dict_with_random_negatives['output'] = loss_encode_y_function(
y_dict_with_random_negatives['output'],
**encode_y_kwargs)
val_losses = []
min_val_loss_iteration = None
@@ -741,7 +773,12 @@ class Class1NeuralNetwork(object):
fit_info["num_points"] = len(peptides)
self.fit_info.append(dict(fit_info))
def predict(
self,
peptides,
allele_encoding=None,
batch_size=4096,
output_index=0):
"""
Predict affinities.
@@ -784,7 +821,9 @@ class Class1NeuralNetwork(object):
else:
network = self.network(borrow=True)
raw_predictions = network.predict(x_dict, batch_size=batch_size)
predictions = numpy.array(raw_predictions, dtype="float64")
if output_index is not None:
predictions = predictions[:, output_index]
result = to_ic50(predictions)
if use_cache:
self.prediction_cache[peptides] = result
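With the new output_index argument, passing output_index=None returns the full matrix of predictions rather than a single column, as exercised by the multi-output test added in this commit. A sketch, assuming a predictor trained with num_outputs=2:

    matrix = predictor.predict(peptides, output_index=None)  # shape (len(peptides), 2)
    first = matrix[:, 0]   # predictions for output index 0
    second = matrix[:, 1]  # predictions for output index 1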
@@ -807,6 +846,7 @@ class Class1NeuralNetwork(object):
dropout_probability,
batch_normalization,
locally_connected_layers,
num_outputs=1,
allele_representations=None):
"""
Helper function to make a keras network for class1 affinity prediction.
@@ -913,7 +953,7 @@ class Class1NeuralNetwork(object):
name="dropout_%d" % i)(current_layer)
output = Dense(
num_outputs,
kernel_initializer=init,
activation=output_activation,
name="output")(current_layer)
@@ -5,8 +5,9 @@ For losses supporting inequalities, each training data point is associated with
one of (=), (<), or (>). For example, with (>) inequalities, a penalty is applied
only if the prediction is less than the given value.
"""
from __future__ import division
import pandas
import numpy
from numpy import isnan, array
CUSTOM_LOSSES = {}
@@ -39,6 +40,7 @@ class MSEWithInequalities(object):
"""
name = "mse_with_inequalities"
supports_inequalities = True
supports_multiple_outputs = False
@staticmethod
def encode_y(y, inequalities=None):
@@ -70,6 +72,8 @@ class MSEWithInequalities(object):
# without tensorflow debug output, etc.
from keras import backend as K
y_pred = K.flatten(y_pred)
# Handle (=) inequalities
diff1 = y_pred - y_true
diff1 *= K.cast(y_true >= 0.0, "float32")
@@ -89,8 +93,69 @@ class MSEWithInequalities(object):
# Normalize by the number of samples so the result is a mean over the
# batch rather than a sum (the tests below assert values divided by the
# number of points).
return (
K.sum(K.square(diff1), axis=-1) +
K.sum(K.square(diff2), axis=-1) +
K.sum(K.square(diff3), axis=-1)) / K.cast(K.shape(y_true)[0], "float32")
class MSEWithInequalitiesAndMultipleOutputs(object):
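"""
Extension of MSEWithInequalities for networks with multiple outputs.

Each sample is associated with one of the network's outputs. encode_y
folds the output index into the target by adding 10 * output_index; the
loss recovers the index, selects the matching column of y_pred, and
delegates to MSEWithInequalities.loss.
"""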
name = "mse_with_inequalities_and_multiple_outputs"
supports_inequalities = True
supports_multiple_outputs = True
@staticmethod
def encode_y(y, inequalities=None, output_indices=None):
y = array(y, dtype="float32")
if isnan(y).any():
raise ValueError("y contains NaN")
if (y > 1.0).any():
raise ValueError("y contains values > 1.0")
if (y < 0.0).any():
raise ValueError("y contains values < 0.0")
encoded = MSEWithInequalities.encode_y(
y, inequalities=inequalities)
if output_indices is not None:
output_indices = numpy.array(output_indices)
check_shape("output_indices", output_indices, (len(encoded),))
if (output_indices < 0).any():
raise ValueError("Invalid output indices: ", output_indices)
encoded += output_indices * 10
return encoded
@staticmethod
def loss(y_true, y_pred):
from keras import backend as K
output_indices = y_true // 10
updated_y_true = y_true - (10 * output_indices)
# We index into y_pred using flattened indices since Keras backend
# supports gather but has no equivalent of tf.gather_nd:
ordinals = K.arange(K.shape(y_true)[0])
flattened_indices = (
ordinals * y_pred.shape[1] + K.cast(output_indices, "int32"))
updated_y_pred = K.gather(K.flatten(y_pred), flattened_indices)
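# For example, with 2 outputs and output_indices [0, 1, 1], ordinals are
# [0, 1, 2] and the flattened indices are [0, 3, 5] into the row-major
# flattening of y_pred.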
# Alternative implementation using tensorflow, which could be used if
# we drop support for other backends:
# import tensorflow as tf
# indexer = K.stack([
# ordinals,
# K.cast(output_indices, "int32")
# ], axis=-1)
# updated_y_pred = tf.gather_nd(y_pred, indexer)
return MSEWithInequalities.loss(updated_y_true, updated_y_pred)
def check_shape(name, arr, expected_shape):
if arr.shape != expected_shape:
raise ValueError("Expected %s to have shape %s not %s" % (
name, str(expected_shape), str(arr.shape)))
# Register custom losses.
for cls in [MSEWithInequalities, MSEWithInequalitiesAndMultipleOutputs]:
CUSTOM_LOSSES[cls.name] = cls()
@@ -13,6 +13,14 @@ from mhcflurry.custom_loss import CUSTOM_LOSSES
def evaluate_loss(loss, y_true, y_pred):
y_true = numpy.array(y_true)
y_pred = numpy.array(y_pred)
if y_pred.ndim == 1:
y_pred = y_pred.reshape((len(y_pred), 1))
assert y_true.ndim == 1
assert y_pred.ndim == 2
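# Columns of y_pred correspond to network outputs; a single-output loss
# is evaluated against a one-column matrix.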
if K.backend() == "tensorflow":
session = K.get_session()
y_true_var = K.constant(y_true, name="y_true")
@@ -28,14 +36,13 @@ def evaluate_loss(loss, y_true, y_pred):
raise ValueError("Unsupported backend: %s" % K.backend())
def test_mse_with_inequalities(loss_obj=CUSTOM_LOSSES['mse_with_inequalities']):
y_values = [0.0, 0.5, 0.8, 1.0]
adjusted_y = loss_obj.encode_y(y_values)
print(adjusted_y)
loss0 = evaluate_loss(loss_obj.loss, adjusted_y, y_values)
print(loss0)
eq_(loss0, 0.0)
adjusted_y = loss_obj.encode_y(y_values, [">", ">", ">", ">"])
@@ -64,9 +71,70 @@ def test_mse_with_inequalities():
adjusted_y = loss_obj.encode_y(y_values, ["=", "<", ">", ">"])
loss0 = evaluate_loss(loss_obj.loss, adjusted_y, [0.1, 0.6, 0.9, 1.0])
assert_almost_equal(loss0, 0.02 / 4)
adjusted_y = loss_obj.encode_y(y_values, ["=", "<", "=", ">"])
loss0 = evaluate_loss(loss_obj.loss, adjusted_y, [0.1, 0.6, 0.9, 1.0])
assert_almost_equal(loss0, 0.03 / 4)
def test_mse_with_inequalities_and_multiple_outputs():
loss_obj = CUSTOM_LOSSES['mse_with_inequalities_and_multiple_outputs']
test_mse_with_inequalities(loss_obj)
y_values = [0.0, 0.5, 0.8, 1.0]
adjusted_y = loss_obj.encode_y(
y_values, output_indices=[0, 1, 1, 1])
loss0 = evaluate_loss(
loss_obj.loss,
adjusted_y,
[
[0.0, 1000],
[2000, 0.5],
[3000, 0.8],
[4000, 1.0],
])
assert_almost_equal(loss0, 0.0)
y_values = [0.0, 0.5, 0.8, 1.0]
adjusted_y = loss_obj.encode_y(
y_values, output_indices=[0, 1, 1, 0])
loss0 = evaluate_loss(
loss_obj.loss,
adjusted_y,
[
[0.1, 1000],
[2000, 0.6],
[3000, 0.8],
[1.0, 4000],
])
assert_almost_equal(loss0, 0.02 / 4)
y_values = [0.0, 0.5, 0.8, 1.0]
adjusted_y = loss_obj.encode_y(
y_values, output_indices=[0, 1, 1, 0], inequalities=["=", ">", "<", "<"])
loss0 = evaluate_loss(
loss_obj.loss,
adjusted_y,
[
[0.1, 1000],
[2000, 0.6],
[3000, 0.8],
[1.0, 4000],
])
assert_almost_equal(loss0, 0.01 / 4)
y_values = [0.0, 0.5, 0.8, 1.0]
adjusted_y = loss_obj.encode_y(
y_values, output_indices=[0, 1, 1, 0], inequalities=["=", "<", "<", "<"])
loss0 = evaluate_loss(
loss_obj.loss,
adjusted_y,
[
[0.1, 1000],
[2000, 0.6],
[3000, 0.8],
[1.0, 4000],
])
assert_almost_equal(loss0, 0.02 / 4)
from nose.tools import eq_, assert_less, assert_greater, assert_almost_equal
import numpy
import pandas
from numpy import testing
numpy.random.seed(0)
import logging
logging.getLogger('tensorflow').disabled = True
from mhcflurry.class1_neural_network import Class1NeuralNetwork
from mhcflurry.downloads import get_path
from mhcflurry.common import random_peptides
def test_multi_output():
# Memorize the dataset.
hyperparameters = dict(
loss="custom:mse_with_inequalities_and_multiple_outputs",
activation="tanh",
layer_sizes=[16],
max_epochs=50,
minibatch_size=32,
random_negative_rate=0.0,
random_negative_constant=0.0,
early_stopping=False,
validation_split=0.0,
locally_connected_layers=[
],
dense_layer_l1_regularization=0.0,
dropout_probability=0.0,
num_outputs=2)
df = pandas.DataFrame()
df["peptide"] = random_peptides(10000, length=9)
df["output1"] = df.peptide.map(lambda s: s[4] == 'K').astype(int) * 10000 + 0.01
df["output2"] = df.peptide.map(lambda s: s[3] == 'Q').astype(int) * 10000 + 0.01
print("output1 mean", df.output1.mean())
print("output2 mean", df.output2.mean())
stacked = df.set_index("peptide").stack().reset_index()
stacked.columns = ['peptide', 'output_name', 'value']
stacked["output_index"] = stacked.output_name.map({
"output1": 0,
"output2": 1,
})
assert not stacked.output_index.isnull().any(), stacked
fit_kwargs = {
'verbose': 1,
}
predictor = Class1NeuralNetwork(**hyperparameters)
predictor.fit(
stacked.peptide.values,
stacked.value.values,
output_indices=stacked.output_index.values,
**fit_kwargs)
result = predictor.predict(df.peptide.values, output_index=None)
print(df.shape, result.shape)
print(result)
df["prediction1"] = result[:,0]
df["prediction2"] = result[:,1]
# Each output should have memorized its own signal: peptides trained with
# the large target (high IC50, i.e. weak binding) on a given output should
# receive weaker predictions there than the remaining peptides.
df["is_weak1"] = df.peptide.str.get(4) == 'K'
df["is_weak2"] = df.peptide.str.get(3) == 'Q'
print("***** Output 1: *****")
print(df.groupby("is_weak1").prediction1.mean())
print("***** Output 2: *****")
print(df.groupby("is_weak2").prediction2.mean())
assert_greater(
df.loc[df.is_weak1].prediction1.mean(),
df.loc[~df.is_weak1].prediction1.mean())
assert_greater(
df.loc[df.is_weak2].prediction2.mean(),
df.loc[~df.is_weak2].prediction2.mean())