From 288e0db70b116466acf5256f14157018314099d2 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Wed, 18 Sep 2019 00:17:20 -0400
Subject: [PATCH] updates

---
 mhcflurry/class1_ligandome_predictor.py |  58 ++++-
 test/test_class1_ligandome_predictor.py | 303 ++++++++++++++++++------
 2 files changed, 281 insertions(+), 80 deletions(-)

diff --git a/mhcflurry/class1_ligandome_predictor.py b/mhcflurry/class1_ligandome_predictor.py
index 7eba3357..5be86637 100644
--- a/mhcflurry/class1_ligandome_predictor.py
+++ b/mhcflurry/class1_ligandome_predictor.py
@@ -146,7 +146,52 @@ class Class1LigandomePredictor(object):
         return network
 
     @staticmethod
-    def loss(y_true, y_pred):
+    def loss(y_true, y_pred, lmbda=0.001):
+        import keras.backend as K
+        import tensorflow as tf
+
+        y_pred = tf.squeeze(y_pred, axis=-1)
+
+        #y_pred = tf.Print(y_pred, [y_pred, tf.shape(y_pred)], "y_pred", summarize=20)
+        #y_true = tf.Print(y_true, [y_true, tf.shape(y_true)], "y_true", summarize=20)
+
+        y_true = tf.reshape(tf.cast(y_true, tf.bool), (-1,))
+
+        pos = tf.boolean_mask(y_pred, y_true)
+        pos_max = tf.reduce_max(pos, axis=1)
+        #pos_max = tf.reduce_logsumexp(tf.boolean_mask(y_pred, y_true), axis=1)
+        neg = tf.boolean_mask(y_pred, tf.logical_not(y_true))
+
+        result = tf.reduce_sum(
+            tf.maximum(0.0, tf.reshape(neg, (-1, 1)) - pos_max)**2)
+
+        term2 = tf.reduce_sum(
+            tf.minimum(0.0, tf.reshape(neg, (-1, 1)) - pos_max))
+        result = result + lmbda * term2
+
+        #differences = tf.reshape(neg, (-1, 1)) - pos
+
+        #result = tf.reduce_sum(tf.sign(differences) * differences**2)
+        #result = tf.Print(result, [result], "result", summarize=20)
+
+        #term2 = lmbda * tf.reduce_mean((1 - pos)**2)
+        #result = result + term2
+        return result
+
+        """
+        pos = tf.boolean_mask(y_pred, y_true)
+
+        pos = y_pred[y_true.astype(bool)].max(1)
+        neg = y_pred[~y_true.astype(bool)]
+        expected2 = (numpy.maximum(0,
+            neg.flatten().reshape((-1, 1)) - pos) ** 2).sum()
+        """
+
+
+
+
+    @staticmethod
+    def loss_old(y_true, y_pred):
         """Binary cross entropy after taking logsumexp over predictions"""
         import keras.backend as K
         import tensorflow as tf
@@ -230,6 +275,11 @@ class Class1LigandomePredictor(object):
 
         import keras.backend as K
 
+        #for layer in self.network._layers[:8]:
+        #    print("Setting non trainable", layer)
+        #    layer.trainable = False
+        #    import ipdb ; ipdb.set_trace()
+
         peptides = EncodableSequences.create(peptides)
         peptide_encoding = self.peptides_to_network_input(peptides)
 
@@ -273,6 +323,9 @@ class Class1LigandomePredictor(object):
         start = time.time()
         for i in range(self.hyperparameters['max_epochs']):
             epoch_start = time.time()
+
+            # TODO: need to use fit_generator to keep each minibatch corresponding
+            # to a single experiment
             fit_history = self.network.fit(
                 x_dict,
                 labels,
@@ -281,7 +334,8 @@ class Class1LigandomePredictor(object):
                 verbose=verbose,
                 epochs=i + 1,
                 initial_epoch=i,
-                validation_split=self.hyperparameters['validation_split'])
+                validation_split=self.hyperparameters['validation_split'],
+            )
             epoch_time = time.time() - epoch_start
 
             for (key, value) in fit_history.history.items():
diff --git a/test/test_class1_ligandome_predictor.py b/test/test_class1_ligandome_predictor.py
index 568ff97e..411d9c06 100644
--- a/test/test_class1_ligandome_predictor.py
+++ b/test/test_class1_ligandome_predictor.py
@@ -14,7 +14,10 @@ Idea:
 
 """
 
-from sklearn.metrics import roc_auc_score
+import logging
+logging.getLogger('tensorflow').disabled = True
+logging.getLogger('matplotlib').disabled = True
+
 import pandas
 import argparse
 import sys
@@ -25,12 +28,13 @@ from random import shuffle
 
 from sklearn.metrics import roc_auc_score
 
-from mhcflurry import Class1AffinityPredictor,Class1NeuralNetwork
+from mhcflurry import Class1AffinityPredictor, Class1NeuralNetwork
 from mhcflurry.allele_encoding import MultipleAlleleEncoding
 from mhcflurry.class1_ligandome_predictor import Class1LigandomePredictor
+from mhcflurry.encodable_sequences import EncodableSequences
 from mhcflurry.downloads import get_path
 from mhcflurry.regression_target import from_ic50
-
+from mhcflurry.common import random_peptides, positional_frequency_matrix
 from mhcflurry.testing_utils import cleanup, startup
 from mhcflurry.amino_acid import COMMON_AMINO_ACIDS
 
@@ -72,29 +76,140 @@ def teardown():
     cleanup()
 
 
-def sample_peptides_from_pssm(pssm, count):
-    result = pandas.DataFrame(
-        index=numpy.arange(count),
-        columns=pssm.index,
-        dtype=object,
-    )
-
-    for (position, vector) in pssm.iterrows():
-        result.loc[:, position] = numpy.random.choice(
-            pssm.columns,
-            size=count,
-            replace=True,
-            p=vector.values)
-
-    return result.apply("".join, axis=1)
-
-
 def scramble_peptide(peptide):
     lst = list(peptide)
     shuffle(lst)
     return "".join(lst)
 
 
+def evaluate_loss(loss, y_true, y_pred):
+    import keras.backend as K
+
+    y_true = numpy.array(y_true)
+    y_pred = numpy.array(y_pred)
+    if y_pred.ndim == 1:
+        y_pred = y_pred.reshape((len(y_pred), 1))
+    if y_true.ndim == 1:
+        y_true = y_true.reshape((len(y_true), 1))
+
+    if K.backend() == "tensorflow":
+        session = K.get_session()
+        y_true_var = K.constant(y_true, name="y_true")
+        y_pred_var = K.constant(y_pred, name="y_pred")
+        result = loss(y_true_var, y_pred_var)
+        return result.eval(session=session)
+    elif K.backend() == "theano":
+        y_true_var = K.constant(y_true, name="y_true")
+        y_pred_var = K.constant(y_pred, name="y_pred")
+        result = loss(y_true_var, y_pred_var)
+        return result.eval()
+    else:
+        raise ValueError("Unsupported backend: %s" % K.backend())
+
+
+def Xtest_loss():
+    # Hit labels
+    y_true = [
+        1.0,
+        0.0,
+        1.0,
+        1.0,
+        0.0
+    ]
+    y_true = numpy.array(y_true)
+    y_pred = [
+        [0.3, 0.7, 0.5],
+        [0.2, 0.4, 0.6],
+        [0.1, 0.5, 0.3],
+        [0.1, 0.7, 0.1],
+        [0.8, 0.2, 0.4],
+    ]
+    y_pred = numpy.array(y_pred)
+
+    # reference implementation 1
+    contributions = []
+    for i in range(len(y_true)):
+        if y_true[i] == 1.0:
+            for j in range(len(y_true)):
+                if y_true[j] == 0.0:
+                    tightest_i = max(y_pred[i])
+                    contribution = sum(
+                        max(0, y_pred[j, k] - tightest_i)**2
+                        for k in range(y_pred.shape[1])
+                    )
+                    contributions.append(contribution)
+    contributions = numpy.array(contributions)
+    expected1 = contributions.sum()
+
+    # reference implementation 2: numpy
+    pos = y_pred[y_true.astype(bool)].max(1)
+    neg = y_pred[~y_true.astype(bool)]
+    expected2 = (
+            numpy.maximum(0, neg.reshape((-1, 1)) - pos)**2).sum()
+
+    numpy.testing.assert_almost_equal(expected1, expected2)
+
+    computed = evaluate_loss(
+        Class1LigandomePredictor.loss,
+        y_true,
+        y_pred.reshape(y_pred.shape + (1,)))
+    numpy.testing.assert_almost_equal(computed, expected1)
+
+
+AA_DIST = pandas.Series(
+    dict((line.split()[0], float(line.split()[1])) for line in """
+A    0.071732
+E    0.060102
+N    0.034679
+D    0.039601
+T    0.055313
+L    0.115337
+V    0.070498
+S    0.071882
+Q    0.040436
+F    0.050178
+G    0.053176
+C    0.005429
+H    0.025487
+I    0.056312
+W    0.013593
+K    0.057832
+M    0.021079
+Y    0.043372
+R    0.060330
+P    0.053632
+""".strip().split("\n")))
+print(AA_DIST)
+
+
+def make_random_peptides(num_peptides_per_length=10000, lengths=[9]):
+    peptides = []
+    for length in lengths:
+        peptides.extend(
+            random_peptides
+                (num_peptides_per_length, length=length, distribution=AA_DIST))
+    return EncodableSequences.create(peptides)
+
+
+def make_motif(allele, peptides, frac=0.01):
+    peptides = EncodableSequences.create(peptides)
+    predictions = PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.predict(
+        peptides=peptides,
+        allele=allele,
+    )
+
+    random_predictions_df = pandas.DataFrame({"peptide": peptides.sequences})
+    random_predictions_df["prediction"] = predictions
+    random_predictions_df = random_predictions_df.sort_values(
+        "prediction", ascending=True)
+    #print("Random peptide predictions", allele)
+    #print(random_predictions_df)
+    top = random_predictions_df.iloc[:int(len(random_predictions_df) * frac)]
+    matrix = positional_frequency_matrix(top.peptide.values)
+    #print("Matrix")
+    return matrix
+
+
 def test_synthetic_allele_refinement():
     refine_allele = "HLA-C*01:02"
     alleles = [
@@ -151,8 +266,10 @@ def test_synthetic_allele_refinement():
     predictor = Class1LigandomePredictor(
         PAN_ALLELE_PREDICTOR_NO_MASS_SPEC,
         max_ensemble_size=1,
-        max_epochs=100,
-        patience=5)
+        max_epochs=10,
+        learning_rate=0.00001,
+        patience=5,
+        min_delta=0.0)
 
     allele_encoding = MultipleAlleleEncoding(
         experiment_names=["experiment1"] * len(train_df),
@@ -182,85 +299,115 @@ def test_synthetic_allele_refinement():
 
     assert_allclose(pre_predictions, expected_pre_predictions)
 
-    predictor.fit(
-        peptides=train_df.peptide.values,
-        labels=train_df.hit.values,
-        allele_encoding=allele_encoding
-    )
+    motifs_history = []
+    random_peptides_encodable = make_random_peptides(10000, [9])
 
-    predictions = predictor.predict(
-        peptides=train_df.peptide.values,
-        allele_encoding=allele_encoding,
-    )
 
-    train_df["max_prediction"] = predictions.max(1)
-    train_df["predicted_allele"] = pandas.Series(alleles).loc[
-        predictions.argmax(1).flatten()
-    ].values
+    def update_motifs():
+        for allele in alleles:
+            motif = make_motif(allele, random_peptides_encodable)
+            motifs_history.append((allele, motif))
 
-    print(predictions)
+    metric_rows = []
 
-    auc = roc_auc_score(train_df.hit.values, train_df.max_prediction.values)
-    print("AUC", auc)
+    def progress():
+        predictions = predictor.predict(peptides=train_df.peptide.values,
+            allele_encoding=allele_encoding, )
 
-    import ipdb ; ipdb.set_trace()
+        train_df["max_prediction"] = predictions.max(1)
+        train_df["predicted_allele"] = pandas.Series(alleles).loc[
+            predictions.argmax(1).flatten()].values
 
-    return predictions
+        print(predictions)
 
+        mean_predictions_for_hit = train_df.loc[
+            train_df.hit == 1.0
+        ].max_prediction.mean()
+        mean_predictions_for_decoy = train_df.loc[
+            train_df.hit == 0.0
+        ].max_prediction.mean()
+        correct_allele_fraction = (
+                train_df.loc[train_df.hit == 1.0].predicted_allele ==
+                train_df.loc[train_df.hit == 1.0].true_allele
+        ).mean()
+        auc = roc_auc_score(train_df.hit.values, train_df.max_prediction.values)
 
+        print("Mean prediction for hit", mean_predictions_for_hit)
+        print("Mean prediction for decoy", mean_predictions_for_decoy)
+        print("Correct predicted allele fraction", correct_allele_fraction)
+        print("AUC", auc)
 
-"""
-def test_simple_synethetic(
-        num_peptide_per_allele_and_length=100, lengths=[8,9,10,11]):
-    alleles = [
-        "HLA-A*02:01", "HLA-B*52:01", "HLA-C*07:01",
-        "HLA-A*03:01", "HLA-B*57:02", "HLA-C*03:01",
-    ]
-    cutoff = PAN_ALLELE_MOTIFS_DF.cutoff_fraction.min()
-    peptides_and_alleles = []
-    for allele in alleles:
-        sub_df = PAN_ALLELE_MOTIFS_DF.loc[
-            (PAN_ALLELE_MOTIFS_DF.allele == allele) &
-            (PAN_ALLELE_MOTIFS_DF.cutoff_fraction == cutoff)
+        metric_rows.append((
+            mean_predictions_for_hit,
+            mean_predictions_for_decoy,
+            correct_allele_fraction,
+            auc,
+        ))
+
+        update_motifs()
+
+        return (predictions, auc)
+
+    print("Pre fitting:")
+    progress()
+    update_motifs()
+    print("Fitting...")
+
+    predictor.fit(
+        peptides=train_df.peptide.values,
+        labels=train_df.hit.values,
+        allele_encoding=allele_encoding,
+        progress_callback=progress,
+    )
+
+    (predictions, final_auc) = progress()
+    print("Final AUC", final_auc)
+
+    update_motifs()
+
+    motifs = pandas.DataFrame(
+        motifs_history,
+        columns=[
+            "allele",
+            "motif",
         ]
-        assert len(sub_df) > 0, allele
-        for length in lengths:
-            pssm = sub_df.loc[
-                sub_df.length == length
-            ].set_index("position")[COMMON_AMINO_ACIDS]
-            peptides = sample_peptides_from_pssm(pssm, num_peptide_per_allele_and_length)
-            for peptide in peptides:
-                peptides_and_alleles.append((peptide, allele))
-
-    hits_df = pandas.DataFrame(
-        peptides_and_alleles,
-        columns=["peptide", "allele"]
     )
-    hits_df["hit"] = 1
 
-    decoys = hits_df.copy()
-    decoys["peptide"] = decoys.peptide.map(scramble_peptide)
-    decoys["hit"] = 0.0
+    metrics = pandas.DataFrame(
+        metric_rows,
+        columns=[
+            "mean_predictions_for_hit",
+            "mean_predictions_for_decoy",
+            "correct_allele_fraction",
+            "auc"
+        ])
 
-    train_df = pandas.concat([hits_df, decoys], ignore_index=True)
+    #import ipdb ; ipdb.set_trace()
+
+    return (predictor, predictions, metrics, motifs)
 
-    return train_df
-    return
-    pass
-"""
 
 parser = argparse.ArgumentParser(usage=__doc__)
 parser.add_argument(
-    "--alleles",
-    nargs="+",
+    "--out-metrics-csv",
+    default=None,
+    help="Metrics output")
+parser.add_argument(
+    "--out-motifs-pickle",
     default=None,
-    help="Which alleles to test")
+    help="Metrics output")
+
 
 if __name__ == '__main__':
     # If run directly from python, leave the user in a shell to explore results.
     setup()
     args = parser.parse_args(sys.argv[1:])
-    result = test_synthetic_allele_refinement()
+    (predictor, predictions, metrics, motifs) = test_synthetic_allele_refinement()
+
+    if args.out_metrics_csv:
+        metrics.to_csv(args.out_metrics_csv)
+    if args.out_motifs_pickle:
+        motifs.to_pickle(args.out_motifs_pickle)
 
     # Leave in ipython
     import ipdb  # pylint: disable=import-error
-- 
GitLab