fixes

694e6634 · Tim O'Donnell · 7205db5c · 694e6634 · 694e6634 · 694e6634
Commit 694e6634 authored 5 years ago by Tim O'Donnell
--- a/mhcflurry/allele_encoding.py
+++ b/mhcflurry/allele_encoding.py
@@ -150,8 +150,8 @@ class AlleleEncoding(object):
 class MultipleAlleleEncoding(object):
    def __init__(
            self,
-            experiment_names,
-            experiment_to_allele_list,
+            experiment_names=[],
+            experiment_to_allele_list={},
            max_alleles_per_experiment=6,
            allele_to_sequence=None,
            borrow_from=None):
@@ -194,6 +194,12 @@ class MultipleAlleleEncoding(object):
        return self.allele_encoding.indices.values.reshape(
            (-1, self.max_alleles_per_experiment))

+    @property
+    def alleles(self):  # 2-D array with max_alleles_per_experiment columns
+        return numpy.reshape(
+            self.allele_encoding.alleles.values,
+            (-1, self.max_alleles_per_experiment))
+
    def compact(self):
        result = copy(self)
        result.allele_encoding = self.allele_encoding.compact()

--- a/mhcflurry/class1_ligandome_predictor.py
+++ b/mhcflurry/class1_ligandome_predictor.py
--- a/mhcflurry/custom_loss.py
+++ b/mhcflurry/custom_loss.py
@@ -92,12 +92,12 @@ class MSEWithInequalities(Loss):
       y_pred is greater or less than y_true.

    between 2 - 3:
-       Treated as a "<" inequality. Penalty (y_pred - (y_true - 2))**2 is
-       applied only if y_pred is greater than y_true - 2.
+       Treated as a ">" inequality. Penalty (y_pred - (y_true - 2))**2 is
+       applied only if y_pred is less than y_true - 2.

    between 4 - 5:
-       Treated as a ">" inequality. Penalty (y_pred - (y_true - 4))**2 is
-       applied only if y_pred is less than y_true - 4.
+       Treated as a "<" inequality. Penalty (y_pred - (y_true - 4))**2 is
+       applied only if y_pred is greater than y_true - 4.
    """
    name = "mse_with_inequalities"
    supports_inequalities = True
@@ -240,11 +240,12 @@ class MultiallelicMassSpecLoss(Loss):
    supports_inequalities = True
    supports_multiple_outputs = False

-    def __init__(self, delta=0.2):
+    def __init__(self, delta=0.2, multiplier=1.0):
        self.delta = delta
+        self.multiplier = multiplier

    @staticmethod
-    def encode_y(y, affinities_mask=None, inequalities=None):
+    def encode_y(y):
        encoded = pandas.Series(y, dtype="float32", copy=True)
        assert all(item in (-1.0, 1.0, 0.0) for item in encoded), set(y)
        print(
@@ -262,9 +263,25 @@ class MultiallelicMassSpecLoss(Loss):
        pos_max = tf.reduce_max(pos, axis=1)
        neg = tf.boolean_mask(y_pred, tf.math.equal(y_true, 0.0))
        term = tf.reshape(neg, (-1, 1)) - pos_max + self.delta
-        result = tf.reduce_sum(tf.maximum(0.0, term) ** 2)
-        return result
+        result = tf.reduce_sum(tf.maximum(0.0, term) ** 2) / tf.cast(
+            tf.shape(term)[0], tf.float32) * self.multiplier
+        return tf.where(tf.is_nan(result), 0.0, result)
+
+
+class ZeroLoss(Loss):
+    """Loss named "zero_loss": encode_y returns y as-is; loss is always 0.0.
+    """
+    name = "zero_loss"
+    supports_inequalities = False
+    supports_multiple_outputs = False

+    @staticmethod
+    def encode_y(y):
+        return y
+
+    @staticmethod
+    def loss(y_true, y_pred):
+        return 0.0


 def check_shape(name, arr, expected_shape):
@@ -284,5 +301,7 @@ def check_shape(name, arr, expected_shape):


 # Register custom losses.
-for cls in [MSEWithInequalities, MSEWithInequalitiesAndMultipleOutputs]:
+for cls in [MSEWithInequalities, MSEWithInequalitiesAndMultipleOutputs, MultiallelicMassSpecLoss, ZeroLoss]:
    CUSTOM_LOSSES[cls.name] = cls()
+
+
--- a/test/test_class1_ligandome_predictor.py
+++ b/test/test_class1_ligandome_predictor.py
@@ -21,6 +21,7 @@ logging.getLogger('matplotlib').disabled = True
 import pandas
 import argparse
 import sys
+import copy
 from functools import partial

 from numpy.testing import assert_, assert_equal, assert_allclose
@@ -153,7 +154,7 @@ def test_loss():
                        )
                        contributions.append(contribution)
        contributions = numpy.array(contributions)
-        expected1 = contributions.sum()
+        expected1 = contributions.sum() / len(contributions)

        # reference implementation 2: numpy
        pos = numpy.array([
@@ -164,7 +165,8 @@ def test_loss():

        neg = y_pred[(y_true == 0.0).astype(bool)]
        expected2 = (
-                numpy.maximum(0, neg.reshape((-1, 1)) - pos + delta)**2).sum()
+                numpy.maximum(0, neg.reshape((-1, 1)) - pos + delta)**2).sum() / (
+            len(pos) * len(neg))

        yield numpy.testing.assert_almost_equal, expected1, expected2, 4

@@ -226,7 +228,7 @@ def make_motif(allele, peptides, frac=0.01):
    return matrix


-def test_real_data_multiallelic_refinement(max_epochs=10):
+def Xtest_real_data_multiallelic_refinement(max_epochs=10):
    ms_df = pandas.read_csv(
        get_path("data_mass_spec_annotated", "annotated_ms.csv.bz2"))
    ms_df = ms_df.loc[
@@ -240,10 +242,20 @@ def test_real_data_multiallelic_refinement(max_epochs=10):
            del sample_table[col]
    sample_table["alleles"] = sample_table.hla.str.split()

-    multi_train_df = ms_df.loc[
+    multi_train_hit_df = ms_df.loc[
        ms_df.sample_id  == "RA957"
    ].drop_duplicates("peptide")[["peptide", "sample_id"]].reset_index(drop=True)
-    multi_train_df["label"] = 1.0
+    multi_train_hit_df["label"] = 1.0
+
+    multi_train_decoy_df = ms_df.loc[
+        (ms_df.sample_id  == "CD165") &
+        (~ms_df.peptide.isin(multi_train_hit_df.peptide.unique()))
+    ].drop_duplicates("peptide")[["peptide"]]
+    (multi_train_decoy_df["sample_id"],) = multi_train_hit_df.sample_id.unique()
+    multi_train_decoy_df["label"] = 0.0
+
+    multi_train_df = pandas.concat(
+        [multi_train_hit_df, multi_train_decoy_df], ignore_index=True)
    multi_train_df["is_affinity"] = False

    multi_train_alleles = set()
@@ -281,6 +293,7 @@ def test_real_data_multiallelic_refinement(max_epochs=10):

    ligandome_predictor = Class1LigandomePredictor(
        pan_predictor,
+        auxiliary_input_features=[],
        max_ensemble_size=1,
        max_epochs=50,
        learning_rate=0.0001,
@@ -292,7 +305,7 @@ def test_real_data_multiallelic_refinement(max_epochs=10):
        ligandome_predictor.predict(
            output="affinities",
            peptides=combined_train_df.peptide.values,
-            allele_encoding=allele_encoding))
+            alleles=allele_encoding))

    (model,) = pan_predictor.class1_pan_allele_models
    expected_pre_predictions = from_ic50(
@@ -325,11 +338,220 @@ def test_real_data_multiallelic_refinement(max_epochs=10):
        progress_callback=update_motifs,
    )

+    import ipdb ; ipdb.set_trace()
+
+
+def test_synthetic_allele_refinement_with_affinity_data(max_epochs=10):
+    refine_allele = "HLA-C*01:02"
+    alleles = [
+        "HLA-A*02:01", "HLA-B*27:01", "HLA-C*07:01",
+        "HLA-A*03:01", "HLA-B*15:01", refine_allele
+    ]
+    peptides_per_allele = [
+        2000, 1000, 500,
+        1500, 1200, 800,
+    ]
+
+    allele_to_peptides = dict(zip(alleles, peptides_per_allele))
+
+    length = 9
+
+    train_with_ms = pandas.read_csv(
+        get_path("data_curated", "curated_training_data.with_mass_spec.csv.bz2"))
+    train_no_ms = pandas.read_csv(get_path("data_curated",
+        "curated_training_data.no_mass_spec.csv.bz2"))
+
+    def filter_df(df):
+        df = df.loc[
+            (df.allele.isin(alleles)) &
+            (df.peptide.str.len() == length)
+        ]
+        return df
+
+    train_with_ms = filter_df(train_with_ms)
+    train_no_ms = filter_df(train_no_ms)
+
+    ms_specific = train_with_ms.loc[
+        ~train_with_ms.peptide.isin(train_no_ms.peptide)
+    ]
+
+    train_peptides = []
+    train_true_alleles = []
+    for allele in alleles:
+        peptides = ms_specific.loc[ms_specific.allele == allele].peptide.sample(
+            n=allele_to_peptides[allele])
+        train_peptides.extend(peptides)
+        train_true_alleles.extend([allele] * len(peptides))
+
+    hits_df = pandas.DataFrame({"peptide": train_peptides})
+    hits_df["true_allele"] = train_true_alleles
+    hits_df["hit"] = 1.0
+
+    decoys_df = hits_df.copy()
+    decoys_df["peptide"] = decoys_df.peptide.map(scramble_peptide)
+    decoys_df["true_allele"] = ""
+    decoys_df["hit"] = 0.0
+
+    mms_train_df = pandas.concat([hits_df, decoys_df], ignore_index=True)
+    mms_train_df["label"] =  mms_train_df.hit
+    mms_train_df["is_affinity"] = False
+
+    affinity_train_df = pandas.read_csv(
+        get_path(
+            "models_class1_pan", "models.with_mass_spec/train_data.csv.bz2"))
+    affinity_train_df = affinity_train_df.loc[
+        affinity_train_df.allele.isin(alleles),
+        ["peptide", "allele",  "measurement_inequality", "measurement_value"]]
+
+    affinity_train_df["label"] = affinity_train_df["measurement_value"]
+    del affinity_train_df["measurement_value"]
+    affinity_train_df["is_affinity"] = True
+
+    predictor = Class1LigandomePredictor(
+        PAN_ALLELE_PREDICTOR_NO_MASS_SPEC,
+        auxiliary_input_features=["gene"],
+        max_ensemble_size=1,
+        max_epochs=max_epochs,
+        learning_rate=0.0001,
+        patience=5,
+        min_delta=0.0,
+        random_negative_rate=1.0,
+        random_negative_constant=25)
+
+    mms_allele_encoding = MultipleAlleleEncoding(
+        experiment_names=["experiment1"] * len(mms_train_df),
+        experiment_to_allele_list={
+            "experiment1": alleles,
+        },
+        max_alleles_per_experiment=6,
+        allele_to_sequence=PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.allele_to_sequence,
+    )
+    allele_encoding = copy.deepcopy(mms_allele_encoding)
+    allele_encoding.append_alleles(affinity_train_df.allele.values)
+    allele_encoding = allele_encoding.compact()
+
+    train_df = pandas.concat(
+        [mms_train_df, affinity_train_df], ignore_index=True, sort=False)
+
+    pre_predictions = from_ic50(
+        predictor.predict(
+            output="affinities_matrix",
+            peptides=mms_train_df.peptide.values,
+            alleles=mms_allele_encoding))
+
+    (model,) = PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.class1_pan_allele_models
+    expected_pre_predictions = from_ic50(
+        model.predict(
+            peptides=numpy.repeat(mms_train_df.peptide.values, len(alleles)),
+            allele_encoding=mms_allele_encoding.allele_encoding,
+    )).reshape((-1, len(alleles)))
+
    #import ipdb ; ipdb.set_trace()

+    mms_train_df["pre_max_prediction"] = pre_predictions.max(1)
+    pre_auc = roc_auc_score(mms_train_df.hit.values, mms_train_df.pre_max_prediction.values)
+    print("PRE_AUC", pre_auc)
+
+    assert_allclose(pre_predictions, expected_pre_predictions, rtol=1e-4)

+    motifs_history = []
+    random_peptides_encodable = make_random_peptides(10000, [9])


+    def update_motifs():
+        for allele in alleles:
+            motif = make_motif(allele, random_peptides_encodable)
+            motifs_history.append((allele, motif))
+
+    metric_rows = []
+
+    def progress():
+        (_, ligandome_prediction, affinities_predictions) = (
+            predictor.predict(
+                output="all",
+                peptides=mms_train_df.peptide.values,
+                alleles=mms_allele_encoding))
+        affinities_predictions = from_ic50(affinities_predictions)
+        for (kind, predictions) in [
+                ("affinities", affinities_predictions),
+                ("ligandome", ligandome_prediction)]:
+
+            mms_train_df["max_prediction"] = predictions.max(1)
+            mms_train_df["predicted_allele"] = pandas.Series(alleles).loc[
+                predictions.argmax(1).flatten()
+            ].values
+
+            print(kind)
+            print(predictions)
+
+            mean_predictions_for_hit = mms_train_df.loc[
+                mms_train_df.hit == 1.0
+            ].max_prediction.mean()
+            mean_predictions_for_decoy = mms_train_df.loc[
+                mms_train_df.hit == 0.0
+            ].max_prediction.mean()
+            correct_allele_fraction = (
+                    mms_train_df.loc[mms_train_df.hit == 1.0].predicted_allele ==
+                    mms_train_df.loc[mms_train_df.hit == 1.0].true_allele
+            ).mean()
+            auc = roc_auc_score(mms_train_df.hit.values, mms_train_df.max_prediction.values)
+
+            print(kind, "Mean prediction for hit", mean_predictions_for_hit)
+            print(kind, "Mean prediction for decoy", mean_predictions_for_decoy)
+            print(kind, "Correct predicted allele fraction", correct_allele_fraction)
+            print(kind, "AUC", auc)
+
+            metric_rows.append((
+                kind,
+                mean_predictions_for_hit,
+                mean_predictions_for_decoy,
+                correct_allele_fraction,
+                auc,
+            ))
+
+            update_motifs()
+
+        return (ligandome_prediction, auc)
+
+    print("Pre fitting:")
+    progress()
+    update_motifs()
+    print("Fitting...")
+
+    predictor.fit(
+        peptides=train_df.peptide.values,
+        labels=train_df.label.values,
+        inequalities=train_df.measurement_inequality.values,
+        affinities_mask=train_df.is_affinity.values,
+        allele_encoding=allele_encoding,
+        progress_callback=progress,
+    )
+
+    (predictions, final_auc) = progress()
+    print("Final AUC", final_auc)
+
+    update_motifs()
+
+    motifs = pandas.DataFrame(
+        motifs_history,
+        columns=[
+            "allele",
+            "motif",
+        ]
+    )
+
+    metrics = pandas.DataFrame(
+        metric_rows,
+        columns=[
+            "output",
+            "mean_predictions_for_hit",
+            "mean_predictions_for_decoy",
+            "correct_allele_fraction",
+            "auc"
+        ])
+
+    return (predictor, predictions, metrics, motifs)
+


 def Xtest_synthetic_allele_refinement(max_epochs=10):
@@ -387,12 +609,13 @@ def Xtest_synthetic_allele_refinement(max_epochs=10):

    predictor = Class1LigandomePredictor(
        PAN_ALLELE_PREDICTOR_NO_MASS_SPEC,
-        additional_dense_layers=[8, 1],
        max_ensemble_size=1,
        max_epochs=max_epochs,
        learning_rate=0.0001,
        patience=5,
-        min_delta=0.0)
+        min_delta=0.0,
+        random_negative_rate=0.0,
+        random_negative_constant=0)

    allele_encoding = MultipleAlleleEncoding(
        experiment_names=["experiment1"] * len(train_df),
@@ -405,9 +628,9 @@ def Xtest_synthetic_allele_refinement(max_epochs=10):

    pre_predictions = from_ic50(
        predictor.predict(
-            output="affinities",
+            output="affinities_matrix",
            peptides=train_df.peptide.values,
-            allele_encoding=allele_encoding))
+            alleles=allele_encoding))

    (model,) = PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.class1_pan_allele_models
    expected_pre_predictions = from_ic50(
@@ -436,45 +659,52 @@ def Xtest_synthetic_allele_refinement(max_epochs=10):
    metric_rows = []

    def progress():
-        predictions = from_ic50(
+        (_, ligandome_prediction, affinities_predictions) = (
            predictor.predict(
-                output="affinities",
+                output="all",
                peptides=train_df.peptide.values,
-                allele_encoding=allele_encoding))
-
-        train_df["max_prediction"] = predictions.max(1)
-        train_df["predicted_allele"] = pandas.Series(alleles).loc[
-            predictions.argmax(1).flatten()].values
-
-        print(predictions)
-
-        mean_predictions_for_hit = train_df.loc[
-            train_df.hit == 1.0
-        ].max_prediction.mean()
-        mean_predictions_for_decoy = train_df.loc[
-            train_df.hit == 0.0
-        ].max_prediction.mean()
-        correct_allele_fraction = (
-                train_df.loc[train_df.hit == 1.0].predicted_allele ==
-                train_df.loc[train_df.hit == 1.0].true_allele
-        ).mean()
-        auc = roc_auc_score(train_df.hit.values, train_df.max_prediction.values)
-
-        print("Mean prediction for hit", mean_predictions_for_hit)
-        print("Mean prediction for decoy", mean_predictions_for_decoy)
-        print("Correct predicted allele fraction", correct_allele_fraction)
-        print("AUC", auc)
-
-        metric_rows.append((
-            mean_predictions_for_hit,
-            mean_predictions_for_decoy,
-            correct_allele_fraction,
-            auc,
-        ))
-
-        update_motifs()
-
-        return (predictions, auc)
+                alleles=allele_encoding))
+        affinities_predictions = from_ic50(affinities_predictions)
+        for (kind, predictions) in [
+                ("affinities", affinities_predictions),
+                ("ligandome", ligandome_prediction)]:
+
+            train_df["max_prediction"] = predictions.max(1)
+            train_df["predicted_allele"] = pandas.Series(alleles).loc[
+                predictions.argmax(1).flatten()
+            ].values
+
+            print(kind)
+            print(predictions)
+
+            mean_predictions_for_hit = train_df.loc[
+                train_df.hit == 1.0
+            ].max_prediction.mean()
+            mean_predictions_for_decoy = train_df.loc[
+                train_df.hit == 0.0
+            ].max_prediction.mean()
+            correct_allele_fraction = (
+                    train_df.loc[train_df.hit == 1.0].predicted_allele ==
+                    train_df.loc[train_df.hit == 1.0].true_allele
+            ).mean()
+            auc = roc_auc_score(train_df.hit.values, train_df.max_prediction.values)
+
+            print(kind, "Mean prediction for hit", mean_predictions_for_hit)
+            print(kind, "Mean prediction for decoy", mean_predictions_for_decoy)
+            print(kind, "Correct predicted allele fraction", correct_allele_fraction)
+            print(kind, "AUC", auc)
+
+            metric_rows.append((
+                kind,
+                mean_predictions_for_hit,
+                mean_predictions_for_decoy,
+                correct_allele_fraction,
+                auc,
+            ))
+
+            update_motifs()
+
+        return (ligandome_prediction, auc)

    print("Pre fitting:")
    progress()
@@ -504,6 +734,7 @@ def Xtest_synthetic_allele_refinement(max_epochs=10):
    metrics = pandas.DataFrame(
        metric_rows,
        columns=[
+            "output",
            "mean_predictions_for_hit",
            "mean_predictions_for_decoy",
            "correct_allele_fraction",