Commit 2e8dc426 authored by Tim O'Donnell

working on ligandome

parent 0eb0a4e1
@@ -5,11 +5,17 @@ import collections
 from functools import partial

 import numpy
+import pandas

 from .hyperparameters import HyperparameterDefaults
 from .class1_neural_network import Class1NeuralNetwork, DEFAULT_PREDICT_BATCH_SIZE
 from .encodable_sequences import EncodableSequences
+from .regression_target import from_ic50, to_ic50
+from .random_negative_peptides import RandomNegativePeptides
+from .custom_loss import (
+    MSEWithInequalities,
+    MSEWithInequalitiesAndMultipleOutputs,
+    MultiallelicMassSpecLoss)

 class Class1LigandomePredictor(object):
     network_hyperparameter_defaults = HyperparameterDefaults(
@@ -19,6 +25,8 @@ class Class1LigandomePredictor(object):
             'alignment_method': 'left_pad_centered_right_pad',
             'max_length': 15,
         },
+        additional_dense_layers=[],
+        additional_dense_activation="sigmoid",
     )
     """
     Hyperparameters (and their default values) that affect the neural network
@@ -29,9 +37,8 @@ class Class1LigandomePredictor(object):
         max_epochs=500,
         validation_split=0.1,
         early_stopping=True,
-        minibatch_size=128,
-        random_negative_rate=0.0,
-        random_negative_constant=0,
+        minibatch_size=128,).extend(
+        RandomNegativePeptides.hyperparameter_defaults
     )
     """
     Hyperparameters for neural network training.
@@ -47,7 +54,6 @@ class Class1LigandomePredictor(object):
     compile_hyperparameter_defaults = HyperparameterDefaults(
         loss_delta=0.2,
-        loss_alpha=None,
         optimizer="rmsprop",
         learning_rate=None,
     )
@@ -56,10 +62,18 @@ class Class1LigandomePredictor(object):
     used.
     """

+    allele_features_hyperparameter_defaults = HyperparameterDefaults(
+        allele_features_include_gene=True,
+    )
+    """
+    Allele feature hyperparameters.
+    """
+
     hyperparameter_defaults = network_hyperparameter_defaults.extend(
         fit_hyperparameter_defaults).extend(
         early_stopping_hyperparameter_defaults).extend(
-        compile_hyperparameter_defaults)
+        compile_hyperparameter_defaults).extend(
+        allele_features_hyperparameter_defaults)

     def __init__(
             self,
@@ -87,8 +101,15 @@ class Class1LigandomePredictor(object):
     @staticmethod
     def make_network(pan_allele_class1_neural_networks, hyperparameters):
         import keras.backend as K
-        from keras.layers import Input, TimeDistributed, Lambda, Flatten, RepeatVector, concatenate, Dropout, Reshape, Embedding
-        from keras.activations import sigmoid
+        from keras.layers import (
+            Input,
+            TimeDistributed,
+            Dense,
+            Flatten,
+            RepeatVector,
+            concatenate,
+            Reshape,
+            Embedding)
         from keras.models import Model

         networks = [model.network() for model in pan_allele_class1_neural_networks]
@@ -163,39 +184,28 @@ class Class1LigandomePredictor(object):
             layer_name_to_new_node[layer.name] = node

+        affinity_predictor_output = node
+
+        for (i, layer_size) in enumerate(
+                hyperparameters['additional_dense_layers']):
+            layer = Dense(
+                layer_size,
+                activation=hyperparameters['additional_dense_activation'])
+            lifted = TimeDistributed(layer)
+            node = lifted(node)
+
+        layer = Dense(1, activation="sigmoid")
+        lifted = TimeDistributed(layer)
+        ligandome_output = lifted(node)
+
         network = Model(
             inputs=[input_peptides, input_alleles],
-            outputs=node,
+            outputs=[affinity_predictor_output, ligandome_output],
             name="ligandome",
         )
         network.summary()
         return network

-    @staticmethod
-    def loss(y_true, y_pred, sample_weight=None, delta=0.2, alpha=None):
-        """
-        Loss function for ligandome prediction.
-        """
-        import tensorflow as tf
-        y_pred = tf.squeeze(y_pred, axis=-1)
-        y_true = tf.reshape(tf.cast(y_true, tf.bool), (-1,))
-        pos = tf.boolean_mask(y_pred, y_true)
-        if alpha is None:
-            pos_max = tf.reduce_max(pos, axis=1)
-        else:
-            # Smooth maximum
-            exp_alpha_x = tf.exp(alpha * pos)
-            numerator = tf.reduce_sum(tf.multiply(pos, exp_alpha_x), axis=1)
-            denominator = tf.reduce_sum(exp_alpha_x, axis=1)
-            pos_max = numerator / denominator
-        neg = tf.boolean_mask(y_pred, tf.logical_not(y_true))
-        result = tf.reduce_sum(
-            tf.maximum(0.0, tf.reshape(neg, (-1, 1)) - pos_max + delta) ** 2)
-        return result
-
     def peptides_to_network_input(self, peptides):
         """
         Encode peptides to the fixed-length encoding expected by the neural
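The hunk above replaces the old margin-loss staticmethod with a second network head: the per-allele affinity signal is kept as one output, and a small TimeDistributed dense stack produces a per-allele presentation score as a second output. A minimal standalone sketch of that pattern (shapes and names are invented for illustration, not mhcflurry's actual graph):

    # Sketch only: a stand-in input plays the role of the per-allele
    # affinity signal produced by the pan-allele networks.
    from keras.layers import Dense, Input, TimeDistributed
    from keras.models import Model

    n_alleles = 6
    per_allele = Input(shape=(n_alleles, 1), name="per_allele_affinity")
    node = per_allele
    for layer_size in [8]:  # plays the role of additional_dense_layers=[8]
        node = TimeDistributed(Dense(layer_size, activation="sigmoid"))(node)
    # One sigmoid presentation score per allele.
    ligandome_output = TimeDistributed(Dense(1, activation="sigmoid"))(node)
    model = Model(inputs=per_allele, outputs=ligandome_output)
    model.summary()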
@@ -241,6 +251,8 @@ class Class1LigandomePredictor(object):
             peptides,
             labels,
             allele_encoding,
+            affinities_mask=None,  # True when a peptide/label is actually a peptide and an affinity
+            inequalities=None,  # interpreted only for elements where affinities_mask is True, otherwise ignored
             shuffle_permutation=None,
             verbose=1,
             progress_callback=None,
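The two new arguments let a single fit call mix mass spec hit/decoy rows with quantitative affinity rows. A hypothetical way the inputs could be assembled (values invented for illustration):

    import numpy

    # Two hit/decoy labels followed by two affinity measurements (nM).
    labels = numpy.array([1.0, 0.0, 100.0, 5000.0])
    affinities_mask = numpy.array([False, False, True, True])
    # Read only where affinities_mask is True; "=" elsewhere is ignored.
    inequalities = numpy.array(["=", "=", "<", "="])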
@@ -275,14 +287,25 @@ class Class1LigandomePredictor(object):
             'allele': allele_encoding_input,
         }

+        loss = MSEWithInequalitiesAndMultipleOutputs(losses=[
+            MSEWithInequalities(),
+            MultiallelicMassSpecLoss(
+                delta=self.hyperparameters['loss_delta']),
+        ])
+
+        y_values_pre_encoding = labels.copy()
+        if affinities_mask is not None:
+            y_values_pre_encoding[affinities_mask] = from_ic50(
+                labels[affinities_mask])
+        y_values = loss.encode_y(
+            y_values_pre_encoding,
+            inequalities=inequalities[affinities_mask] if inequalities is not None else None,
+            output_indices=(~affinities_mask).astype(int) if affinities_mask is not None else numpy.ones(len(y_values_pre_encoding), dtype=int))
+
         fit_info = collections.defaultdict(list)

         self.set_allele_representations(allele_representations)
         self.network.compile(
-            loss=partial(
-                self.loss,
-                delta=self.hyperparameters['loss_delta'],
-                alpha=self.hyperparameters['loss_alpha']),
+            loss=loss.loss,
             optimizer=self.hyperparameters['optimizer'])
         if self.hyperparameters['learning_rate'] is not None:
             K.set_value(
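Note how rows are routed to the two losses in the compile step above: output_indices is 0 for affinity rows and 1 for mass spec rows, matching the order of the losses list. For example:

    import numpy

    affinities_mask = numpy.array([True, False, True])
    output_indices = (~affinities_mask).astype(int)
    # -> array([0, 1, 0]): index-0 rows are scored by MSEWithInequalities,
    #    index-1 rows by MultiallelicMassSpecLoss.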
@@ -371,6 +394,7 @@ class Class1LigandomePredictor(object):
             self,
             peptides,
             allele_encoding,
+            output="affinities",
             batch_size=DEFAULT_PREDICT_BATCH_SIZE):
         (allele_encoding_input, allele_representations) = (
             self.allele_encoding_to_network_input(allele_encoding.compact()))
@@ -380,12 +404,16 @@ class Class1LigandomePredictor(object):
             'allele': allele_encoding_input,
         }
         predictions = self.network.predict(x_dict, batch_size=batch_size)
+        if output == "affinities":
+            predictions = to_ic50(predictions[0])
+        elif output == "ligandome_presentation":
+            predictions = predictions[1]
+        elif output == "both":
+            pass
+        else:
+            raise NotImplementedError("Unknown output", output)
         return numpy.squeeze(predictions)

-    #def predict(self):
-
     def set_allele_representations(self, allele_representations):
         """
        """
@@ -440,3 +468,13 @@ class Class1LigandomePredictor(object):
         original_model.fit_generator = throw

         layer.set_weights([reshaped])
+
+    @staticmethod
+    def allele_features(allele_names, hyperparameters):
+        df = pandas.DataFrame({"allele_name": allele_names})
+        if hyperparameters['allele_features_include_gene']:
+            # TODO: support other organisms.
+            for gene in ["A", "B", "C"]:
+                df[gene] = df.allele_name.str.startswith(
+                    "HLA-%s" % gene).astype(float)
+        return df
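Expected behavior of the new helper, as a sketch (allele names invented):

    features = Class1LigandomePredictor.allele_features(
        ["HLA-A*02:01", "HLA-B*57:01", "HLA-C*07:02"],
        hyperparameters={"allele_features_include_gene": True})
    # features has columns allele_name, A, B, C with one-hot gene indicators:
    # A = [1.0, 0.0, 0.0], B = [0.0, 1.0, 0.0], C = [0.0, 0.0, 1.0]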
@@ -182,8 +182,10 @@ class MSEWithInequalitiesAndMultipleOutputs(Loss):
     supports_inequalities = True
     supports_multiple_outputs = True

-    @staticmethod
-    def encode_y(y, inequalities=None, output_indices=None):
+    def __init__(self, losses=MSEWithInequalities):
+        self.losses = losses
+
+    def encode_y(self, y, inequalities=None, output_indices=None):
         y = array(y, dtype="float32")
         if isnan(y).any():
             raise ValueError("y contains NaN", y)
@@ -192,8 +194,25 @@ class MSEWithInequalitiesAndMultipleOutputs(Loss):
         if (y < 0.0).any():
             raise ValueError("y contains values < 0.0", y)

-        encoded = MSEWithInequalities.encode_y(
-            y, inequalities=inequalities)
+        if isinstance(self.losses, Loss):
+            # Single loss applied to all outputs.
+            encoded = MSEWithInequalities.encode_y(y, inequalities=inequalities)
+        else:
+            assert output_indices is not None
+            df = pandas.DataFrame({
+                "y": y,
+                "inequality": inequalities,
+                "output_index": output_indices,
+            })
+            encoded = y.copy()
+            encoded[:] = numpy.nan
+            for (output_index, sub_df) in df.groupby("output_index"):
+                loss = self.losses[output_index]
+                loss_kwargs = {}
+                if not sub_df.inequality.isnull().all():
+                    loss_kwargs['inequalities'] = sub_df.inequality.values
+                encoded[sub_df.index.values] = loss.encode_y(
+                    sub_df.y.values, **loss_kwargs)

         if output_indices is not None:
             output_indices = numpy.array(output_indices)
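A sketch of the per-output routing introduced above (values invented; assumes these classes are importable from mhcflurry.custom_loss):

    import numpy
    from mhcflurry.custom_loss import (
        MSEWithInequalities,
        MSEWithInequalitiesAndMultipleOutputs,
        MultiallelicMassSpecLoss)

    loss = MSEWithInequalitiesAndMultipleOutputs(losses=[
        MSEWithInequalities(), MultiallelicMassSpecLoss(delta=0.2)])
    # First two rows: affinities on the 0-1 scale; last two: hit labels.
    y = numpy.array([0.8, 0.1, 1.0, 0.0])
    output_indices = numpy.array([0, 0, 1, 1])  # row -> loss index
    encoded = loss.encode_y(y, output_indices=output_indices)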
@@ -205,8 +224,7 @@ class MSEWithInequalitiesAndMultipleOutputs(Loss):
         return encoded

-    @staticmethod
-    def loss(y_true, y_pred):
+    def loss(self, y_true, y_pred):
         from keras import backend as K

         y_true = K.flatten(y_true)
@@ -230,7 +248,49 @@ class MSEWithInequalitiesAndMultipleOutputs(Loss):
         #     ], axis=-1)
         #updated_y_pred = tf.gather_nd(y_pred, indexer)

-        return MSEWithInequalities.loss(updated_y_true, updated_y_pred)
+        if isinstance(self.losses, Loss):
+            # Single loss for all outputs.
+            return self.losses.loss(updated_y_true, updated_y_pred)
+        else:
+            # TODO: make this more efficient?
+            result = None
+            for (i, loss) in enumerate(self.losses):
+                values = (
+                    loss.loss(updated_y_true, updated_y_pred) *
+                    K.cast(K.equal(output_indices, i), "float32"))
+                if result is None:
+                    result = values
+                else:
+                    result += values
+            return result
+
+
+class MultiallelicMassSpecLoss(Loss):
+    """
+    Hinge-like loss for multiallelic mass spec data: each hit's best
+    per-allele prediction should exceed every decoy prediction by delta.
+    """
+    name = "multiallelic_mass_spec_loss"
+    supports_inequalities = True
+    supports_multiple_outputs = False
+
+    def __init__(self, delta=0.2):
+        self.delta = delta
+
+    def encode_y(self, y):
+        return y
+
+    def loss(self, y_true, y_pred):
+        import tensorflow as tf
+        #y_pred = tf.squeeze(y_pred, axis=-1)
+        y_true = tf.reshape(tf.cast(y_true, tf.bool), (-1,))
+        pos = tf.boolean_mask(y_pred, y_true)
+        pos_max = tf.reduce_max(pos, axis=1)
+        neg = tf.boolean_mask(y_pred, tf.logical_not(y_true))
+        result = tf.reduce_sum(
+            tf.maximum(
+                0.0, tf.reshape(neg, (-1, 1)) - pos_max + self.delta) ** 2)
+        return result


 def check_shape(name, arr, expected_shape):
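For reference, a NumPy restatement of MultiallelicMassSpecLoss.loss (a sketch mirroring the tensorflow code above, handy for checking small cases by hand):

    import numpy

    def multiallelic_mass_spec_loss_reference(y_true, y_pred, delta=0.2):
        hits = numpy.asarray(y_true).astype(bool)
        y_pred = numpy.asarray(y_pred)
        pos_max = y_pred[hits].max(axis=1)  # best allele score per hit
        neg = y_pred[~hits]                 # every per-allele decoy score
        return (numpy.maximum(
            0.0, neg.reshape((-1, 1)) - pos_max + delta) ** 2).sum()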
@@ -250,5 +310,5 @@ def check_shape(name, arr, expected_shape):

 # Register custom losses.
-for cls in [MSEWithInequalities, MSEWithInequalitiesAndMultipleOutputs]:
+for cls in [MSEWithInequalities, MSEWithInequalitiesAndMultipleOutputs, MultiallelicMassSpecLoss]:
     CUSTOM_LOSSES[cls.name] = cls()
@@ -38,6 +38,7 @@ from mhcflurry.regression_target import from_ic50
 from mhcflurry.common import random_peptides, positional_frequency_matrix
 from mhcflurry.testing_utils import cleanup, startup
 from mhcflurry.amino_acid import COMMON_AMINO_ACIDS
+from mhcflurry.custom_loss import MultiallelicMassSpecLoss

 COMMON_AMINO_ACIDS = sorted(COMMON_AMINO_ACIDS)
@@ -110,73 +111,66 @@ def evaluate_loss(loss, y_true, y_pred):
 def test_loss():
     for delta in [0.0, 0.3]:
-        for alpha in [None, 1.0, 20.0]:
-            print("delta", delta)
-            print("alpha", alpha)
-            # Hit labels
-            y_true = [
-                1.0,
-                0.0,
-                1.0,
-                1.0,
-                0.0
-            ]
-            y_true = numpy.array(y_true)
-            y_pred = [
-                [0.3, 0.7, 0.5],
-                [0.2, 0.4, 0.6],
-                [0.1, 0.5, 0.3],
-                [0.1, 0.7, 0.1],
-                [0.8, 0.2, 0.4],
-            ]
-            y_pred = numpy.array(y_pred)
-
-            # reference implementation 1
-            def smooth_max(x, alpha):
-                x = numpy.array(x)
-                alpha = numpy.array([alpha])
-                return (x * numpy.exp(x * alpha)).sum() / (
-                    numpy.exp(x * alpha)).sum()
-
-            if alpha is None:
-                max_func = max
-            else:
-                max_func = partial(smooth_max, alpha=alpha)
-
-            contributions = []
-            for i in range(len(y_true)):
-                if y_true[i] == 1.0:
-                    for j in range(len(y_true)):
-                        if y_true[j] == 0.0:
-                            tightest_i = max_func(y_pred[i])
-                            contribution = sum(
-                                max(0, y_pred[j, k] - tightest_i + delta)**2
-                                for k in range(y_pred.shape[1])
-                            )
-                            contributions.append(contribution)
-            contributions = numpy.array(contributions)
-            expected1 = contributions.sum()
-
-            # reference implementation 2: numpy
-            pos = numpy.array([
-                max_func(y_pred[i])
-                for i in range(len(y_pred))
-                if y_true[i] == 1.0
-            ])
-            neg = y_pred[~y_true.astype(bool)]
-            expected2 = (
-                numpy.maximum(0, neg.reshape((-1, 1)) - pos + delta)**2).sum()
-
-            yield numpy.testing.assert_almost_equal, expected1, expected2, 4
-
-            computed = evaluate_loss(
-                partial(Class1LigandomePredictor.loss, delta=delta, alpha=alpha),
-                y_true,
-                y_pred.reshape(y_pred.shape + (1,)))
-
-            yield numpy.testing.assert_almost_equal, computed, expected1, 4
+        print("delta", delta)
+        # Hit labels
+        y_true = [
+            1.0,
+            0.0,
+            1.0,
+            1.0,
+            0.0
+        ]
+        y_true = numpy.array(y_true)
+        y_pred = [
+            [0.3, 0.7, 0.5],
+            [0.2, 0.4, 0.6],
+            [0.1, 0.5, 0.3],
+            [0.1, 0.7, 0.1],
+            [0.8, 0.2, 0.4],
+        ]
+        y_pred = numpy.array(y_pred)
+
+        # reference implementation 1
+        def smooth_max(x, alpha):
+            x = numpy.array(x)
+            alpha = numpy.array([alpha])
+            return (x * numpy.exp(x * alpha)).sum() / (
+                numpy.exp(x * alpha)).sum()
+
+        contributions = []
+        for i in range(len(y_true)):
+            if y_true[i] == 1.0:
+                for j in range(len(y_true)):
+                    if y_true[j] == 0.0:
+                        tightest_i = max(y_pred[i])
+                        contribution = sum(
+                            max(0, y_pred[j, k] - tightest_i + delta)**2
+                            for k in range(y_pred.shape[1])
+                        )
+                        contributions.append(contribution)
+        contributions = numpy.array(contributions)
+        expected1 = contributions.sum()
+
+        # reference implementation 2: numpy
+        pos = numpy.array([
+            max(y_pred[i])
+            for i in range(len(y_pred))
+            if y_true[i] == 1.0
+        ])
+        neg = y_pred[~y_true.astype(bool)]
+        expected2 = (
+            numpy.maximum(0, neg.reshape((-1, 1)) - pos + delta)**2).sum()
+
+        yield numpy.testing.assert_almost_equal, expected1, expected2, 4
+
+        computed = evaluate_loss(
+            MultiallelicMassSpecLoss(delta=delta).loss,
+            y_true,
+            y_pred.reshape(y_pred.shape + (1,)))
+
+        yield numpy.testing.assert_almost_equal, computed, expected1, 4


 AA_DIST = pandas.Series(
@@ -284,6 +278,7 @@ def test_synthetic_allele_refinement(max_epochs=10):
     predictor = Class1LigandomePredictor(
         PAN_ALLELE_PREDICTOR_NO_MASS_SPEC,
+        additional_dense_layers=[8, 1],
         max_ensemble_size=1,
         max_epochs=max_epochs,
         learning_rate=0.0001,
@@ -299,9 +294,11 @@ def test_synthetic_allele_refinement(max_epochs=10):
         allele_to_sequence=PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.allele_to_sequence,
     ).compact()

-    pre_predictions = predictor.predict(
-        peptides=train_df.peptide.values,
-        allele_encoding=allele_encoding)
+    pre_predictions = from_ic50(
+        predictor.predict(
+            output="affinities",
+            peptides=train_df.peptide.values,
+            allele_encoding=allele_encoding))

     (model,) = PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.class1_pan_allele_models
     expected_pre_predictions = from_ic50(
@@ -310,11 +307,13 @@ def test_synthetic_allele_refinement(max_epochs=10):
         allele_encoding=allele_encoding.allele_encoding,
     )).reshape((-1, len(alleles)))

+    #import ipdb ; ipdb.set_trace()
+
     train_df["pre_max_prediction"] = pre_predictions.max(1)
     pre_auc = roc_auc_score(train_df.hit.values, train_df.pre_max_prediction.values)
     print("PRE_AUC", pre_auc)

-    assert_allclose(pre_predictions, expected_pre_predictions)
+    assert_allclose(pre_predictions, expected_pre_predictions, rtol=1e-4)

     motifs_history = []
     random_peptides_encodable = make_random_peptides(10000, [9])
@@ -328,8 +327,11 @@ def test_synthetic_allele_refinement(max_epochs=10):
     metric_rows = []

     def progress():
-        predictions = predictor.predict(peptides=train_df.peptide.values,
-            allele_encoding=allele_encoding, )
+        predictions = from_ic50(
+            predictor.predict(
+                output="affinities",
+                peptides=train_df.peptide.values,
+                allele_encoding=allele_encoding))

         train_df["max_prediction"] = predictions.max(1)
         train_df["predicted_allele"] = pandas.Series(alleles).loc[
...