fixes

3d5ba47e · Tim O'Donnell · 024618ec · 3d5ba47e · 3d5ba47e · 3d5ba47e
Commit 3d5ba47e authored 5 years ago by Tim O'Donnell
--- a/mhcflurry/class1_presentation_neural_network.py
+++ b/mhcflurry/class1_presentation_neural_network.py
@@ -89,7 +89,7 @@ class Class1PresentationNeuralNetwork(object):
        self.fit_info = []
        self.allele_representation_hash = None
-    def lift_from_class1_neural_network(self, class1_neural_network):
+    def load_from_class1_neural_network(self, class1_neural_network):
        import keras.backend as K
        from keras.layers import (
            Input,
@@ -101,8 +101,13 @@ class Class1PresentationNeuralNetwork(object):
            Activation,
            Lambda,
            Add,
+            Multiply,
            Embedding)
        from keras.models import Model
+        from keras.initializers import Zeros
+        if isinstance(class1_neural_network, Class1NeuralNetwork):
+            class1_neural_network = class1_neural_network.network()
        peptide_shape = tuple(
            int(x) for x in K.int_shape(class1_neural_network.inputs[0])[1:])
@@ -173,8 +178,21 @@ class Class1PresentationNeuralNetwork(object):
                node = layer(input_nodes)
            layer_name_to_new_node[layer.name] = node
-        affinity_predictor_matrix_output = node
+        pre_mask_affinity_predictor_matrix_output = node
+        # Apply allele mask: zero out all outputs corresponding to alleles
+        # with the special index 0.
+        affinity_predictor_matrix_output = Multiply(
+                name="affinity_matrix_output")([
+            Lambda(
+                lambda x: K.cast(
+                    K.expand_dims(K.not_equal(x, 0.0)),
+                    "float32"))(input_alleles),
+            pre_mask_affinity_predictor_matrix_output
+        ])
+        # First allele (i.e. the first column of the alleles matrix) is given
+        # its own output. This is used for the affinity prediction loss.
        affinity_predictor_output = Lambda(
            lambda x: x[:, 0], name="affinity_output")(
                affinity_predictor_matrix_output)
@@ -195,11 +213,17 @@ class Class1PresentationNeuralNetwork(object):
                [node, auxiliary_input], name="affinities_with_auxiliary")
        layer = Dense(8, activation="tanh")
-        lifted = TimeDistributed(layer, name="presentation_hidden1")
+        lifted = TimeDistributed(layer, name="presentation_adjustment_hidden1")
        node = lifted(node)
-        layer = Dense(1, activation="tanh")
+        # By initializing to zero we ensure that before training the
-        lifted = TimeDistributed(layer, name="presentation_hidden2")
+        # presentation output is the same as the affinity output.
+        layer = Dense(
+            1,
+            activation="tanh",
+            kernel_initializer=Zeros(),
+            bias_initializer=Zeros())
+        lifted = TimeDistributed(layer, name="presentation_adjustment")
        presentation_adjustment = lifted(node)
        def logit(x):
@@ -210,8 +234,19 @@ class Class1PresentationNeuralNetwork(object):
            Lambda(logit)(affinity_predictor_matrix_output),
            presentation_adjustment,
        ])
-        presentation_output = Activation("sigmoid", name="presentation_output")(
+        pre_mask_presentation_output = Activation(
-            presentation_output_pre_sigmoid)
+            "sigmoid", name="unmasked_presentation_output")(
+                presentation_output_pre_sigmoid)
+        # Apply allele mask: zero out all outputs corresponding to alleles
+        # with the special index 0.
+        presentation_output = Multiply(name="presentation_output")([
+            Lambda(
+                lambda x: K.cast(
+                    K.expand_dims(K.not_equal(x, 0.0)),
+                    "float32"))(input_alleles),
+            pre_mask_presentation_output
+        ])
        self.network = Model(
            inputs=[
@@ -578,7 +613,7 @@ class Class1PresentationNeuralNetwork(object):
            batch_size=DEFAULT_PREDICT_BATCH_SIZE):
        peptides = EncodableSequences.create(peptides)
-        assert isinstance(AlleleEncoding, MultipleAlleleEncoding)
+        assert isinstance(allele_encoding, MultipleAlleleEncoding)
        (allele_encoding_input, allele_representations) = (
                self.allele_encoding_to_network_input(allele_encoding.compact()))

--- a/mhcflurry/class1_presentation_predictor.py
+++ b/mhcflurry/class1_presentation_predictor.py
@@ -75,7 +75,7 @@ class Class1PresentationPredictor(object):
    def max_alleles(self):
        max_alleles = self.models[0].hyperparameters['max_alleles']
        assert all(
-            n.hyperparameters['max_alleles'] == self.max_alleles
+            n.hyperparameters['max_alleles'] == max_alleles
            for n in self.models)
        return max_alleles
@@ -170,11 +170,12 @@ class Class1PresentationPredictor(object):
        ensemble_scores = numpy.mean(score_array, axis=0)
        ensemble_affinity = numpy.mean(affinity_array, axis=0)
        top_allele_index = numpy.argmax(ensemble_scores, axis=-1)
-        top_score = ensemble_scores[top_allele_index]
+        top_allele_flat_indices = (
-        top_affinity = ensemble_affinity[top_allele_index]
+            numpy.arange(len(peptides)) * self.max_alleles + top_allele_index)
+        top_score = ensemble_scores.flatten()[top_allele_flat_indices]
+        top_affinity = ensemble_affinity.flatten()[top_allele_flat_indices]
        result_df = pandas.DataFrame({"peptide": peptides.sequences})
-        result_df["allele"] = alleles.alleles[top_allele_index]
+        result_df["allele"] = alleles.alleles.flatten()[top_allele_flat_indices]
        result_df["score"] = top_score
        result_df["affinity"] = to_ic50(top_affinity)

--- a/test/test_class1_presentation_predictor.py
+++ b/test/test_class1_presentation_predictor.py
@@ -24,16 +24,17 @@ import sys
 import copy
 import os
-from numpy.testing import assert_, assert_equal, assert_allclose
+from numpy.testing import assert_, assert_equal, assert_allclose, assert_array_equal
 from nose.tools import assert_greater, assert_less
 import numpy
 from random import shuffle
 from sklearn.metrics import roc_auc_score
-from mhcflurry import Class1AffinityPredictor, Class1NeuralNetwork
+from mhcflurry import Class1AffinityPredictor
 from mhcflurry.allele_encoding import MultipleAlleleEncoding
 from mhcflurry.class1_presentation_neural_network import Class1PresentationNeuralNetwork
+from mhcflurry.class1_presentation_predictor import Class1PresentationPredictor
 from mhcflurry.encodable_sequences import EncodableSequences
 from mhcflurry.downloads import get_path
 from mhcflurry.regression_target import from_ic50
@@ -41,6 +42,7 @@ from mhcflurry.common import random_peptides, positional_frequency_matrix
 from mhcflurry.testing_utils import cleanup, startup
 from mhcflurry.amino_acid import COMMON_AMINO_ACIDS
 from mhcflurry.custom_loss import MultiallelicMassSpecLoss
+from mhcflurry.regression_target import to_ic50
 COMMON_AMINO_ACIDS = sorted(COMMON_AMINO_ACIDS)
@@ -87,6 +89,40 @@ def teardown():
    cleanup()
+def test_basic():
+    affinity_predictor = PAN_ALLELE_PREDICTOR_NO_MASS_SPEC
+    models = []
+    for affinity_network in affinity_predictor.class1_pan_allele_models:
+        presentation_network = Class1PresentationNeuralNetwork()
+        presentation_network.load_from_class1_neural_network(affinity_network)
+        models.append(presentation_network)
+    predictor = Class1PresentationPredictor(
+        models=models,
+        allele_to_sequence=affinity_predictor.allele_to_sequence)
+    alleles = ["HLA-A*02:01", "HLA-B*27:01", "HLA-C*07:02"]
+    df = pandas.DataFrame(index=numpy.unique(random_peptides(1000, length=9)))
+    for allele in alleles:
+        df[allele] = affinity_predictor.predict(
+            df.index.values, allele=allele)
+    df["tightest_affinity"] = df.min(1)
+    df["tightest_allele"] = df.idxmin(1)
+    df2 = predictor.predict_to_dataframe(
+        peptides=df.index.values,
+        alleles=alleles)
+    merged_df = pandas.merge(
+        df, df2.set_index("peptide"), left_index=True, right_index=True)
+    assert_array_equal(merged_df["tightest_affinity"], merged_df["affinity"])
+    assert_array_equal(merged_df["tightest_affinity"], to_ic50(merged_df["score"]))
+    assert_array_equal(merged_df["tightest_allele"], merged_df["allele"])
+    # TODO: test fitting, saving, and loading
 def scramble_peptide(peptide):
    lst = list(peptide)
    shuffle(lst)
@@ -118,7 +154,7 @@ def evaluate_loss(loss, y_true, y_pred):
        raise ValueError("Unsupported backend: %s" % K.backend())
-def test_loss():
+def Xtest_loss():
    for delta in [0.0, 0.3]:
        print("delta", delta)
        # Hit labels
@@ -236,7 +272,7 @@ def make_motif(allele, peptides, frac=0.01):
    return matrix
-def test_real_data_multiallelic_refinement(max_epochs=10):
+def Xtest_real_data_multiallelic_refinement(max_epochs=10):
    ms_df = pandas.read_csv(
        get_path("data_mass_spec_annotated", "annotated_ms.csv.bz2"))
    ms_df = ms_df.loc[
@@ -349,7 +385,7 @@ def test_real_data_multiallelic_refinement(max_epochs=10):
    import ipdb ; ipdb.set_trace()
-def test_synthetic_allele_refinement_with_affinity_data(max_epochs=10):
+def Xtest_synthetic_allele_refinement_with_affinity_data(max_epochs=10):
    refine_allele = "HLA-C*01:02"
    alleles = [
        "HLA-A*02:01", "HLA-B*27:01", "HLA-C*07:01",
@@ -562,7 +598,7 @@ def test_synthetic_allele_refinement_with_affinity_data(max_epochs=10):
-def test_synthetic_allele_refinement(max_epochs=10):
+def Xtest_synthetic_allele_refinement(max_epochs=10):
    refine_allele = "HLA-C*01:02"
    alleles = [
        "HLA-A*02:01", "HLA-B*27:01", "HLA-C*07:01",
@@ -752,7 +788,7 @@ def test_synthetic_allele_refinement(max_epochs=10):
    return (predictor, predictions, metrics, motifs)
-def test_batch_generator(sample_rate=0.1):
+def Xtest_batch_generator(sample_rate=0.1):
    multi_train_df = pandas.read_csv(
        data_path("multiallelic_ms.benchmark1.csv.bz2"))
    multi_train_df["label"] = multi_train_df.hit