From 7205db5c099c12d752cbf7183c995ac9b985b330 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Tue, 19 Nov 2019 18:05:11 -0500
Subject: [PATCH] fixes

---
 mhcflurry/allele_encoding.py            |  14 +++
 mhcflurry/class1_ligandome_predictor.py | 124 ++++++++++++++++++++----
 mhcflurry/custom_loss.py                |  26 ++---
 mhcflurry/random_negative_peptides.py   |   6 ++
 test/test_class1_ligandome_predictor.py |   5 +-
 5 files changed, 134 insertions(+), 41 deletions(-)

diff --git a/mhcflurry/allele_encoding.py b/mhcflurry/allele_encoding.py
index 9974baca..273f8fcb 100644
--- a/mhcflurry/allele_encoding.py
+++ b/mhcflurry/allele_encoding.py
@@ -208,3 +208,17 @@ class MultipleAlleleEncoding(object):
 
     def fixed_length_vector_encoded_sequences(self, encoding_name):
         raise NotImplementedError()
+
+    def shuffle_in_place(self, shuffle_permutation=None):
+        flattened_alleles = self.allele_encoding.alleles.values
+        reshaped_alleles = numpy.reshape(
+            flattened_alleles, (-1, self.max_alleles_per_experiment))
+
+        if shuffle_permutation is None:
+            shuffle_permutation = numpy.random.permutation(
+                len(reshaped_alleles))
+
+        self.allele_encoding = AlleleEncoding(
+            alleles=reshaped_alleles[shuffle_permutation].flatten(),
+            borrow_from=self.allele_encoding
+        )
\ No newline at end of file
diff --git a/mhcflurry/class1_ligandome_predictor.py b/mhcflurry/class1_ligandome_predictor.py
index f1d34505..32c2c1df 100644
--- a/mhcflurry/class1_ligandome_predictor.py
+++ b/mhcflurry/class1_ligandome_predictor.py
@@ -12,11 +12,12 @@ from .class1_neural_network import Class1NeuralNetwork, DEFAULT_PREDICT_BATCH_SI
 from .encodable_sequences import EncodableSequences
 from .regression_target import from_ic50, to_ic50
 from .random_negative_peptides import RandomNegativePeptides
+from .allele_encoding import MultipleAlleleEncoding
 from .custom_loss import (
     MSEWithInequalities,
-    MSEWithInequalitiesAndMultipleOutputs,
     MultiallelicMassSpecLoss)
 
+
 class Class1LigandomePredictor(object):
     network_hyperparameter_defaults = HyperparameterDefaults(
         allele_amino_acid_encoding="BLOSUM62",
@@ -37,7 +38,8 @@ class Class1LigandomePredictor(object):
         max_epochs=500,
         validation_split=0.1,
         early_stopping=True,
-        minibatch_size=128,).extend(
+        minibatch_size=128,
+        random_negative_affinity_min=20000.0,).extend(
         RandomNegativePeptides.hyperparameter_defaults
     )
     """
@@ -273,26 +275,47 @@ class Class1LigandomePredictor(object):
         #    layer.trainable = False
         #    import ipdb ; ipdb.set_trace()
 
-        peptides = EncodableSequences.create(peptides)
-        peptide_encoding = self.peptides_to_network_input(peptides)
-
-        # Optional optimization
-        allele_encoding = allele_encoding.compact()
-
-        (allele_encoding_input, allele_representations) = (
-            self.allele_encoding_to_network_input(allele_encoding))
+        encodable_peptides = EncodableSequences.create(peptides)
+        if labels is not None:
+            labels = numpy.array(labels, copy=False)
+        if affinities_mask is not None:
+            affinities_mask = numpy.array(affinities_mask, copy=False)
+        if inequalities is not None:
+            inequalities = numpy.array(inequalities, copy=False)
+
+        random_negatives_planner = RandomNegativePeptides(
+            **RandomNegativePeptides.hyperparameter_defaults.subselect(
+                self.hyperparameters))
+        random_negatives_planner.plan(
+            peptides=encodable_peptides.sequences[affinities_mask],
+            affinities=(labels[affinities_mask]),
+            alleles=numpy.reshape(
+                allele_encoding.allele_encoding.alleles.values,
+                (-1, allele_encoding.max_alleles_per_experiment))[
+                    affinities_mask, 0
+            ].flatten(),
+            inequalities=inequalities[affinities_mask])
+
+        peptide_input = self.peptides_to_network_input(encodable_peptides)
 
         # Shuffle
         if shuffle_permutation is None:
             shuffle_permutation = numpy.random.permutation(len(labels))
-        peptide_encoding = peptide_encoding[shuffle_permutation]
-        allele_encoding_input = allele_encoding_input[shuffle_permutation]
+        peptide_input = peptide_input[shuffle_permutation]
+        allele_encoding.shuffle_in_place(shuffle_permutation)
         labels = labels[shuffle_permutation]
         inequalities = inequalities[shuffle_permutation]
         affinities_mask = affinities_mask[shuffle_permutation]
+        del peptides
+        del encodable_peptides
 
-        x_dict = {
-            'peptide': peptide_encoding,
+        # Optional optimization
+        allele_encoding = allele_encoding.compact()
+        (allele_encoding_input, allele_representations) = (
+            self.allele_encoding_to_network_input(allele_encoding))
+
+        x_dict_without_random_negatives = {
+            'peptide': peptide_input,
             'allele': allele_encoding_input,
         }
 
@@ -300,6 +323,18 @@ class Class1LigandomePredictor(object):
         if affinities_mask is not None:
             y1[affinities_mask] = from_ic50(labels[affinities_mask])
 
+        random_negatives_allele_encoding = None
+        if allele_encoding is not None:
+            random_negative_alleles = random_negatives_planner.get_alleles()
+            random_negatives_allele_encoding = MultipleAlleleEncoding(
+                experiment_names=random_negative_alleles,
+                experiment_to_allele_list=dict(
+                    (a, [a]) for a in random_negative_alleles),
+                max_alleles_per_experiment=(
+                    allele_encoding.max_alleles_per_experiment),
+                borrow_from=allele_encoding.allele_encoding)
+        num_random_negatives = random_negatives_planner.get_total_count()
+
         if inequalities is not None:
             # Reverse inequalities because from_ic50() flips the direction
             # (i.e. lower affinity results in higher y values).
@@ -313,16 +348,35 @@ class Class1LigandomePredictor(object):
 
         adjusted_inequalities[~affinities_mask] = ">"
 
+        # Note: we are using "<" here not ">" because the inequalities are
+        # now in target-space (0-1) not affinity-space.
+        adjusted_inequalities_with_random_negative = numpy.concatenate([
+            numpy.tile("<", num_random_negatives),
+            adjusted_inequalities
+        ])
+        random_negative_ic50 = self.hyperparameters[
+            'random_negative_affinity_min'
+        ]
+        y1_with_random_negatives = numpy.concatenate([
+            numpy.tile(
+                from_ic50(random_negative_ic50), num_random_negatives),
+            y1,
+        ])
+
         affinities_loss = MSEWithInequalities()
         encoded_y1 = affinities_loss.encode_y(
-            y1, inequalities=adjusted_inequalities)
+            y1_with_random_negatives,
+            inequalities=adjusted_inequalities_with_random_negative)
 
         mms_loss = MultiallelicMassSpecLoss(
             delta=self.hyperparameters['loss_delta'])
-
         y2 = labels.copy()
         y2[affinities_mask] = -1
-        encoded_y2 = mms_loss.encode_y(y2)
+        y2_with_random_negatives = numpy.concatenate([
+            numpy.tile(0.0, num_random_negatives),
+            y2,
+        ])
+        encoded_y2 = mms_loss.encode_y(y2_with_random_negatives)
 
         fit_info = collections.defaultdict(list)
 
@@ -344,13 +398,45 @@ class Class1LigandomePredictor(object):
         min_val_loss = None
         last_progress_print = 0
         start = time.time()
+        x_dict_with_random_negatives = {}
         for i in range(self.hyperparameters['max_epochs']):
             epoch_start = time.time()
 
+            random_negative_peptides = EncodableSequences.create(
+                random_negatives_planner.get_peptides())
+            random_negative_peptides_encoding = (
+                self.peptides_to_network_input(random_negative_peptides))
+
+            if not x_dict_with_random_negatives:
+                if len(random_negative_peptides) > 0:
+                    x_dict_with_random_negatives[
+                        "peptide"
+                    ] = numpy.concatenate([
+                        random_negative_peptides_encoding,
+                        x_dict_without_random_negatives['peptide'],
+                    ])
+                    x_dict_with_random_negatives[
+                        'allele'
+                    ] = numpy.concatenate([
+                        self.allele_encoding_to_network_input(
+                            random_negatives_allele_encoding)[0],
+                        x_dict_without_random_negatives['allele']
+                    ])
+                else:
+                    x_dict_with_random_negatives = (
+                        x_dict_without_random_negatives)
+            else:
+                # Update x_dict_with_random_negatives in place.
+                # This is more memory efficient than recreating it as above.
+                if len(random_negative_peptides) > 0:
+                    x_dict_with_random_negatives[
+                        "peptide"
+                    ][:num_random_negatives] = random_negative_peptides_encoding
+
             # TODO: need to use fit_generator to keep each minibatch corresponding
             # to a single experiment
             fit_history = self.network.fit(
-                x_dict,
+                x_dict_with_random_negatives,
                 [encoded_y1, encoded_y2],
                 shuffle=True,
                 batch_size=self.hyperparameters['minibatch_size'],
@@ -410,7 +496,7 @@ class Class1LigandomePredictor(object):
                 progress_callback()
 
         fit_info["time"] = time.time() - start
-        fit_info["num_points"] = len(peptides)
+        fit_info["num_points"] = len(encodable_peptides)
         self.fit_info.append(dict(fit_info))
 
     def predict(
diff --git a/mhcflurry/custom_loss.py b/mhcflurry/custom_loss.py
index 84a83459..2434bfaf 100644
--- a/mhcflurry/custom_loss.py
+++ b/mhcflurry/custom_loss.py
@@ -235,12 +235,6 @@ class MSEWithInequalitiesAndMultipleOutputs(Loss):
 
 class MultiallelicMassSpecLoss(Loss):
     """
-    Affiniities are mapped according to MSEWithInequalities, then by:
-        x -> x + 10.
-
-    Mass spec hit vs. decoy are kept at [0, 1].
-
-
     """
     name = "multiallelic_mass_spec_loss"
     supports_inequalities = True
@@ -251,20 +245,12 @@ class MultiallelicMassSpecLoss(Loss):
 
     @staticmethod
     def encode_y(y, affinities_mask=None, inequalities=None):
-        y = array(y, dtype="float32")
-        if isnan(y).any():
-            raise ValueError("y contains NaN", y)
-        if (y > 1.0).any():
-            raise ValueError("y contains values > 1.0", y)
-        if (y < 0.0).any():
-            raise ValueError("y contains values < 0.0", y)
-
-        encoded = y.copy()
-        if affinities_mask is not None:
-            encoded[affinities_mask] = MSEWithInequalities.encode_y(
-                encoded[affinities_mask], inequalities=inequalities) + 10.0
-
-        return encoded
+        encoded = pandas.Series(y, dtype="float32", copy=True)
+        assert all(item in (-1.0, 1.0, 0.0) for item in encoded), set(y)
+        print(
+            "MultiallelicMassSpecLoss: y-value counts:\n",
+            encoded.value_counts())
+        return encoded.values
 
     def loss(self, y_true, y_pred):
         import tensorflow as tf
diff --git a/mhcflurry/random_negative_peptides.py b/mhcflurry/random_negative_peptides.py
index b29adc9c..978a93e3 100644
--- a/mhcflurry/random_negative_peptides.py
+++ b/mhcflurry/random_negative_peptides.py
@@ -70,6 +70,12 @@ class RandomNegativePeptides(object):
         pandas.DataFrame indicating number of random negatives for each length
         and allele.
         """
+        numpy.testing.assert_equal(len(peptides), len(affinities))
+        if alleles is not None:
+            numpy.testing.assert_equal(len(peptides), len(alleles))
+        if inequalities is not None:
+            numpy.testing.assert_equal(len(peptides), len(inequalities))
+
         peptides = pandas.Series(peptides, copy=False)
         peptide_lengths = peptides.str.len()
 
diff --git a/test/test_class1_ligandome_predictor.py b/test/test_class1_ligandome_predictor.py
index f081d09b..41ff42ff 100644
--- a/test/test_class1_ligandome_predictor.py
+++ b/test/test_class1_ligandome_predictor.py
@@ -282,10 +282,11 @@ def test_real_data_multiallelic_refinement(max_epochs=10):
     ligandome_predictor = Class1LigandomePredictor(
         pan_predictor,
         max_ensemble_size=1,
-        max_epochs=500,
+        max_epochs=50,
         learning_rate=0.0001,
         patience=5,
-        min_delta=0.0)
+        min_delta=0.0,
+        random_negative_rate=1.0)
 
     pre_predictions = from_ic50(
         ligandome_predictor.predict(
-- 
GitLab