From 2e8dc426edc3565d7158ff01777434c4557ca2d4 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Thu, 14 Nov 2019 18:14:45 -0500
Subject: [PATCH] working on ligandome

---
 mhcflurry/class1_ligandome_predictor.py | 122 ++++++++++++-------
 mhcflurry/custom_loss.py                |  76 ++++++++++--
 test/test_class1_ligandome_predictor.py | 148 ++++++++++++------------
 3 files changed, 223 insertions(+), 123 deletions(-)

diff --git a/mhcflurry/class1_ligandome_predictor.py b/mhcflurry/class1_ligandome_predictor.py
index 0f7790ab..4d433e48 100644
--- a/mhcflurry/class1_ligandome_predictor.py
+++ b/mhcflurry/class1_ligandome_predictor.py
@@ -5,11 +5,17 @@ import collections
 from functools import partial
 
 import numpy
+import pandas
 
 from .hyperparameters import HyperparameterDefaults
 from .class1_neural_network import Class1NeuralNetwork, DEFAULT_PREDICT_BATCH_SIZE
 from .encodable_sequences import EncodableSequences
-
+from .regression_target import from_ic50, to_ic50
+from .random_negative_peptides import RandomNegativePeptides
+from .custom_loss import (
+    MSEWithInequalities,
+    MSEWithInequalitiesAndMultipleOutputs,
+    MultiallelicMassSpecLoss)
 
 class Class1LigandomePredictor(object):
     network_hyperparameter_defaults = HyperparameterDefaults(
@@ -19,6 +25,8 @@ class Class1LigandomePredictor(object):
             'alignment_method': 'left_pad_centered_right_pad',
             'max_length': 15,
         },
+        additional_dense_layers=[],
+        additional_dense_activation="sigmoid",
     )
     """
     Hyperparameters (and their default values) that affect the neural network
@@ -29,9 +37,8 @@ class Class1LigandomePredictor(object):
         max_epochs=500,
         validation_split=0.1,
         early_stopping=True,
-        minibatch_size=128,
-        random_negative_rate=0.0,
-        random_negative_constant=0,
+        minibatch_size=128,).extend(
+        RandomNegativePeptides.hyperparameter_defaults
     )
     """
     Hyperparameters for neural network training.
@@ -47,7 +54,6 @@ class Class1LigandomePredictor(object):
 
     compile_hyperparameter_defaults = HyperparameterDefaults(
         loss_delta=0.2,
-        loss_alpha=None,
         optimizer="rmsprop",
         learning_rate=None,
     )
@@ -56,10 +62,18 @@ class Class1LigandomePredictor(object):
     used.
     """
 
+    allele_features_hyperparameter_defaults = HyperparameterDefaults(
+        allele_features_include_gene=True,
+    )
+    """
+    Allele feature hyperparameters.
+    """
+
     hyperparameter_defaults = network_hyperparameter_defaults.extend(
         fit_hyperparameter_defaults).extend(
         early_stopping_hyperparameter_defaults).extend(
-        compile_hyperparameter_defaults)
+        compile_hyperparameter_defaults).extend(
+        allele_features_hyperparameter_defaults)
 
     def __init__(
             self,
@@ -87,8 +101,15 @@ class Class1LigandomePredictor(object):
     @staticmethod
     def make_network(pan_allele_class1_neural_networks, hyperparameters):
         import keras.backend as K
-        from keras.layers import Input, TimeDistributed, Lambda, Flatten, RepeatVector, concatenate, Dropout, Reshape, Embedding
-        from keras.activations import sigmoid
+        from keras.layers import (
+            Input,
+            TimeDistributed,
+            Dense,
+            Flatten,
+            RepeatVector,
+            concatenate,
+            Reshape,
+            Embedding)
         from keras.models import Model
 
         networks = [model.network() for model in pan_allele_class1_neural_networks]
@@ -163,39 +184,28 @@ class Class1LigandomePredictor(object):
 
             layer_name_to_new_node[layer.name] = node
 
+        affinity_predictor_output = node
+
+        for (i, layer_size) in enumerate(
+                hyperparameters['additional_dense_layers']):
+            layer = Dense(
+                layer_size,
+                activation=hyperparameters['additional_dense_activation'])
+            lifted = TimeDistributed(layer)
+            node = lifted(node)
+
+        layer = Dense(1, activation="sigmoid")
+        lifted = TimeDistributed(layer)
+        ligandome_output = lifted(node)
+
         network = Model(
             inputs=[input_peptides, input_alleles],
-            outputs=node,
+            outputs=[affinity_predictor_output, ligandome_output],
             name="ligandome",
         )
         network.summary()
         return network
 
-    @staticmethod
-    def loss(y_true, y_pred, sample_weight=None, delta=0.2, alpha=None):
-        """
-        Loss function for ligandome prediction.
-        """
-        import tensorflow as tf
-
-        y_pred = tf.squeeze(y_pred, axis=-1)
-        y_true = tf.reshape(tf.cast(y_true, tf.bool), (-1,))
-
-        pos = tf.boolean_mask(y_pred, y_true)
-
-        if alpha is None:
-            pos_max = tf.reduce_max(pos, axis=1)
-        else:
-            # Smooth maximum
-            exp_alpha_x = tf.exp(alpha * pos)
-            numerator = tf.reduce_sum(tf.multiply(pos, exp_alpha_x), axis=1)
-            denominator = tf.reduce_sum(exp_alpha_x, axis=1)
-            pos_max = numerator / denominator
-        neg = tf.boolean_mask(y_pred, tf.logical_not(y_true))
-        result = tf.reduce_sum(
-            tf.maximum(0.0, tf.reshape(neg, (-1, 1)) - pos_max + delta) ** 2)
-        return result
-
     def peptides_to_network_input(self, peptides):
         """
         Encode peptides to the fixed-length encoding expected by the neural
@@ -241,6 +251,8 @@ class Class1LigandomePredictor(object):
             peptides,
             labels,
             allele_encoding,
+            affinities_mask=None,  # True when a peptide/label is actually a peptide and an affinity
+            inequalities=None,  # interpreted only for elements where affinities_mask is True, otherwise ignored
             shuffle_permutation=None,
             verbose=1,
             progress_callback=None,
@@ -275,14 +287,25 @@ class Class1LigandomePredictor(object):
             'allele': allele_encoding_input,
         }
 
+        loss = MSEWithInequalitiesAndMultipleOutputs(losses=[
+            MSEWithInequalities(),
+            MultiallelicMassSpecLoss(
+                delta=self.hyperparameters['loss_delta']),
+        ])
+
+        y_values_pre_encoding = labels.copy()
+        if affinities_mask is not None:
+            y_values_pre_encoding[affinities_mask] = from_ic50(labels)
+        y_values = loss.encode_y(
+            y_values_pre_encoding,
+            inequalities=inequalities[affinities_mask] if inequalities is not None else None,
+            output_indices=(~affinities_mask).astype(int) if affinities_mask is not None else numpy.ones(len(y_values_pre_encoding), dtype=int))
+
         fit_info = collections.defaultdict(list)
 
         self.set_allele_representations(allele_representations)
         self.network.compile(
-            loss=partial(
-                self.loss,
-                delta=self.hyperparameters['loss_delta'],
-                alpha=self.hyperparameters['loss_alpha']),
+            loss=loss.loss,
             optimizer=self.hyperparameters['optimizer'])
         if self.hyperparameters['learning_rate'] is not None:
             K.set_value(
@@ -371,6 +394,7 @@ class Class1LigandomePredictor(object):
             self,
             peptides,
             allele_encoding,
+            output="affinities",
             batch_size=DEFAULT_PREDICT_BATCH_SIZE):
         (allele_encoding_input, allele_representations) = (
                 self.allele_encoding_to_network_input(allele_encoding.compact()))
@@ -380,12 +404,16 @@ class Class1LigandomePredictor(object):
             'allele': allele_encoding_input,
         }
         predictions = self.network.predict(x_dict, batch_size=batch_size)
+        if output == "affinities":
+            predictions = to_ic50(predictions[0])
+        elif output == "ligandome_presentation":
+            predictions = predictions[1]
+        elif output == "both":
+            pass
+        else:
+            raise NotImplementedError("Unknown output", output)
         return numpy.squeeze(predictions)
 
-    #def predict(self):
-
-
-
     def set_allele_representations(self, allele_representations):
         """
         """
@@ -440,3 +468,13 @@ class Class1LigandomePredictor(object):
                 original_model.fit_generator = throw
 
         layer.set_weights([reshaped])
+
+    @staticmethod
+    def allele_features(allele_names, hyperparameters):
+        df = pandas.DataFrame({"allele_name": allele_names})
+        if hyperparameters['allele_features_include_gene']:
+            # TODO: support other organisms.
+            for gene in ["A", "B", "C"]:
+                df[gene] = df.allele_name.str.startswith(
+                    "HLA-%s" % gene).astype(float)
+        return gene
diff --git a/mhcflurry/custom_loss.py b/mhcflurry/custom_loss.py
index 8c523c0a..2b520423 100644
--- a/mhcflurry/custom_loss.py
+++ b/mhcflurry/custom_loss.py
@@ -182,8 +182,10 @@ class MSEWithInequalitiesAndMultipleOutputs(Loss):
     supports_inequalities = True
     supports_multiple_outputs = True
 
-    @staticmethod
-    def encode_y(y, inequalities=None, output_indices=None):
+    def __init__(self, losses=MSEWithInequalities):
+        self.losses = losses
+
+    def encode_y(self, y, inequalities=None, output_indices=None):
         y = array(y, dtype="float32")
         if isnan(y).any():
             raise ValueError("y contains NaN", y)
@@ -192,8 +194,25 @@ class MSEWithInequalitiesAndMultipleOutputs(Loss):
         if (y < 0.0).any():
             raise ValueError("y contains values < 0.0", y)
 
-        encoded = MSEWithInequalities.encode_y(
-            y, inequalities=inequalities)
+        if isinstance(self.losses, Loss):
+            # Single loss applied to all outputs
+            encoded = MSEWithInequalities.encode_y(y, inequalities=inequalities)
+        else:
+            assert output_indices is not None
+            df = pandas.DataFrame({
+                "y": y,
+                "inequality": inequalities,
+                "output_index": output_indices,
+            })
+            encoded = y.copy()
+            encoded[:] = numpy.nan
+            for (output_index, sub_df) in df.groupby("output_index"):
+                loss = self.losses[output_index]
+                loss_kwargs = {}
+                if not sub_df.inequality.isnull().all():
+                    loss_kwargs['inequalities'] = sub_df.inequality.values
+                encoded[sub_df.index.values] = loss.encode_y(
+                    sub_df.y.values, **loss_kwargs)
 
         if output_indices is not None:
             output_indices = numpy.array(output_indices)
@@ -205,8 +224,7 @@ class MSEWithInequalitiesAndMultipleOutputs(Loss):
 
         return encoded
 
-    @staticmethod
-    def loss(y_true, y_pred):
+    def loss(self, y_true, y_pred):
         from keras import backend as K
 
         y_true = K.flatten(y_true)
@@ -230,7 +248,49 @@ class MSEWithInequalitiesAndMultipleOutputs(Loss):
         # ], axis=-1)
         #updated_y_pred = tf.gather_nd(y_pred, indexer)
 
-        return MSEWithInequalities.loss(updated_y_true, updated_y_pred)
+        if isinstance(self.losses, Loss):
+            # Single loss for all outputs.
+            return self.losses.loss(updated_y_true, updated_y_pred)
+        else:
+            # TODO: make this more efficient?
+            result = None
+            for (i, loss) in enumerate(self.losses):
+                values = (
+                    loss.loss(updated_y_true, updated_y_pred) *
+                    K.cast(K.equal(output_indices, i), "float32"))
+                if result is None:
+                    result = values
+                else:
+                    result += values
+            return result
+
+
+class MultiallelicMassSpecLoss(Loss):
+    """
+    """
+    name = "multiallelic_mass_spec_loss"
+    supports_inequalities = True
+    supports_multiple_outputs = False
+
+    def __init__(self, delta=0.2):
+        self.delta = delta
+
+    def encode_y(self, y):
+        return y
+
+    def loss(self, y_true, y_pred):
+        import tensorflow as tf
+
+        #y_pred = tf.squeeze(y_pred, axis=-1)
+        y_true = tf.reshape(tf.cast(y_true, tf.bool), (-1,))
+
+        pos = tf.boolean_mask(y_pred, y_true)
+        pos_max = tf.reduce_max(pos, axis=1)
+        neg = tf.boolean_mask(y_pred, tf.logical_not(y_true))
+        result = tf.reduce_sum(
+            tf.maximum(
+                0.0, tf.reshape(neg, (-1, 1)) - pos_max + self.delta) ** 2)
+        return result
 
 
 def check_shape(name, arr, expected_shape):
@@ -250,5 +310,5 @@ def check_shape(name, arr, expected_shape):
 
 
 # Register custom losses.
-for cls in [MSEWithInequalities, MSEWithInequalitiesAndMultipleOutputs]:
+for cls in [MSEWithInequalities, MSEWithInequalitiesAndMultipleOutputs, MultiallelicMassSpecLoss]:
     CUSTOM_LOSSES[cls.name] = cls()
diff --git a/test/test_class1_ligandome_predictor.py b/test/test_class1_ligandome_predictor.py
index 5e0daf26..b8348192 100644
--- a/test/test_class1_ligandome_predictor.py
+++ b/test/test_class1_ligandome_predictor.py
@@ -38,6 +38,7 @@ from mhcflurry.regression_target import from_ic50
 from mhcflurry.common import random_peptides, positional_frequency_matrix
 from mhcflurry.testing_utils import cleanup, startup
 from mhcflurry.amino_acid import COMMON_AMINO_ACIDS
+from mhcflurry.custom_loss import MultiallelicMassSpecLoss
 
 COMMON_AMINO_ACIDS = sorted(COMMON_AMINO_ACIDS)
 
@@ -110,73 +111,66 @@ def evaluate_loss(loss, y_true, y_pred):
 
 def test_loss():
     for delta in [0.0, 0.3]:
-        for alpha in [None, 1.0, 20.0]:
-            print("delta", delta)
-            print("alpha", alpha)
-            # Hit labels
-            y_true = [
-                1.0,
-                0.0,
-                1.0,
-                1.0,
-                0.0
-            ]
-            y_true = numpy.array(y_true)
-            y_pred = [
-                [0.3, 0.7, 0.5],
-                [0.2, 0.4, 0.6],
-                [0.1, 0.5, 0.3],
-                [0.1, 0.7, 0.1],
-                [0.8, 0.2, 0.4],
-            ]
-            y_pred = numpy.array(y_pred)
-
-            # reference implementation 1
-
-            def smooth_max(x, alpha):
-                x = numpy.array(x)
-                alpha = numpy.array([alpha])
-                return (x * numpy.exp(x * alpha)).sum() / (
-                    numpy.exp(x * alpha)).sum()
-
-            if alpha is None:
-                max_func = max
-            else:
-                max_func = partial(smooth_max, alpha=alpha)
-
-            contributions = []
-            for i in range(len(y_true)):
-                if y_true[i] == 1.0:
-                    for j in range(len(y_true)):
-                        if y_true[j] == 0.0:
-                            tightest_i = max_func(y_pred[i])
-                            contribution = sum(
-                                max(0, y_pred[j, k] - tightest_i + delta)**2
-                                for k in range(y_pred.shape[1])
-                            )
-                            contributions.append(contribution)
-            contributions = numpy.array(contributions)
-            expected1 = contributions.sum()
-
-            # reference implementation 2: numpy
-            pos = numpy.array([
-                max_func(y_pred[i])
-                for i in range(len(y_pred))
-                if y_true[i] == 1.0
-            ])
-
-            neg = y_pred[~y_true.astype(bool)]
-            expected2 = (
-                    numpy.maximum(0, neg.reshape((-1, 1)) - pos + delta)**2).sum()
-
-            yield numpy.testing.assert_almost_equal, expected1, expected2, 4
-
-            computed = evaluate_loss(
-                partial(Class1LigandomePredictor.loss, delta=delta, alpha=alpha),
-                y_true,
-                y_pred.reshape(y_pred.shape + (1,)))
-
-            yield numpy.testing.assert_almost_equal, computed, expected1, 4
+        print("delta", delta)
+        # Hit labels
+        y_true = [
+            1.0,
+            0.0,
+            1.0,
+            1.0,
+            0.0
+        ]
+        y_true = numpy.array(y_true)
+        y_pred = [
+            [0.3, 0.7, 0.5],
+            [0.2, 0.4, 0.6],
+            [0.1, 0.5, 0.3],
+            [0.1, 0.7, 0.1],
+            [0.8, 0.2, 0.4],
+        ]
+        y_pred = numpy.array(y_pred)
+
+        # reference implementation 1
+
+        def smooth_max(x, alpha):
+            x = numpy.array(x)
+            alpha = numpy.array([alpha])
+            return (x * numpy.exp(x * alpha)).sum() / (
+                numpy.exp(x * alpha)).sum()
+
+        contributions = []
+        for i in range(len(y_true)):
+            if y_true[i] == 1.0:
+                for j in range(len(y_true)):
+                    if y_true[j] == 0.0:
+                        tightest_i = max(y_pred[i])
+                        contribution = sum(
+                            max(0, y_pred[j, k] - tightest_i + delta)**2
+                            for k in range(y_pred.shape[1])
+                        )
+                        contributions.append(contribution)
+        contributions = numpy.array(contributions)
+        expected1 = contributions.sum()
+
+        # reference implementation 2: numpy
+        pos = numpy.array([
+            max(y_pred[i])
+            for i in range(len(y_pred))
+            if y_true[i] == 1.0
+        ])
+
+        neg = y_pred[~y_true.astype(bool)]
+        expected2 = (
+                numpy.maximum(0, neg.reshape((-1, 1)) - pos + delta)**2).sum()
+
+        yield numpy.testing.assert_almost_equal, expected1, expected2, 4
+
+        computed = evaluate_loss(
+            MultiallelicMassSpecLoss(delta=delta).loss,
+            y_true,
+            y_pred.reshape(y_pred.shape + (1,)))
+
+        yield numpy.testing.assert_almost_equal, computed, expected1, 4
 
 
 AA_DIST = pandas.Series(
@@ -284,6 +278,7 @@ def test_synthetic_allele_refinement(max_epochs=10):
 
     predictor = Class1LigandomePredictor(
         PAN_ALLELE_PREDICTOR_NO_MASS_SPEC,
+        additional_dense_layers=[8, 1],
         max_ensemble_size=1,
         max_epochs=max_epochs,
         learning_rate=0.0001,
@@ -299,9 +294,11 @@ def test_synthetic_allele_refinement(max_epochs=10):
         allele_to_sequence=PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.allele_to_sequence,
     ).compact()
 
-    pre_predictions = predictor.predict(
-        peptides=train_df.peptide.values,
-        allele_encoding=allele_encoding)
+    pre_predictions = from_ic50(
+        predictor.predict(
+            output="affinities",
+            peptides=train_df.peptide.values,
+            allele_encoding=allele_encoding))
 
     (model,) = PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.class1_pan_allele_models
     expected_pre_predictions = from_ic50(
@@ -310,11 +307,13 @@ def test_synthetic_allele_refinement(max_epochs=10):
             allele_encoding=allele_encoding.allele_encoding,
     )).reshape((-1, len(alleles)))
 
+    #import ipdb ; ipdb.set_trace()
+
     train_df["pre_max_prediction"] = pre_predictions.max(1)
     pre_auc = roc_auc_score(train_df.hit.values, train_df.pre_max_prediction.values)
     print("PRE_AUC", pre_auc)
 
-    assert_allclose(pre_predictions, expected_pre_predictions)
+    assert_allclose(pre_predictions, expected_pre_predictions, rtol=1e-4)
 
     motifs_history = []
     random_peptides_encodable = make_random_peptides(10000, [9])
@@ -328,8 +327,11 @@ def test_synthetic_allele_refinement(max_epochs=10):
     metric_rows = []
 
     def progress():
-        predictions = predictor.predict(peptides=train_df.peptide.values,
-            allele_encoding=allele_encoding, )
+        predictions = from_ic50(
+            predictor.predict(
+                output="affinities",
+                peptides=train_df.peptide.values,
+                allele_encoding=allele_encoding))
 
         train_df["max_prediction"] = predictions.max(1)
         train_df["predicted_allele"] = pandas.Series(alleles).loc[
-- 
GitLab