From f1d17bf092a4e37205a81740c8b949bc5ba6ebb4 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Thu, 26 Sep 2019 15:07:50 -0400
Subject: [PATCH] better random negative peptides

---
 mhcflurry/class1_neural_network.py    | 229 ++------------------------
 mhcflurry/random_negative_peptides.py | 179 ++++++++++++++++++++
 test/test_random_negative_peptides.py |  90 +++++-----
 3 files changed, 235 insertions(+), 263 deletions(-)
 create mode 100644 mhcflurry/random_negative_peptides.py

diff --git a/mhcflurry/class1_neural_network.py b/mhcflurry/class1_neural_network.py
index e6d33a57..1299dd8d 100644
--- a/mhcflurry/class1_neural_network.py
+++ b/mhcflurry/class1_neural_network.py
@@ -18,6 +18,7 @@ from .regression_target import to_ic50, from_ic50
 from .common import random_peptides, amino_acid_distribution
 from .custom_loss import get_loss
 from .data_dependent_weights_initialization import lsuv_init
+from .random_negative_peptides import RandomNegativePeptides
 
 
 DEFAULT_PREDICT_BATCH_SIZE = 4096
@@ -91,16 +92,10 @@ class Class1NeuralNetwork(object):
         early_stopping=True,
         minibatch_size=128,
         data_dependent_initialization_method=None,
-        random_negative_rate=0.0,
-        random_negative_constant=25,
         random_negative_affinity_min=20000.0,
         random_negative_affinity_max=50000.0,
-        random_negative_match_distribution=True,
-        random_negative_distribution_smoothing=0.0,
-        random_negative_output_indices=None,
-        random_negative_method="by_length",
-        random_negative_binder_threshold=None,
-        random_negative_lengths=[8,9,10,11,12,13,14,15])
+        random_negative_output_indices=None).extend(
+            RandomNegativePeptides.hyperparameter_defaults)
     """
     Hyperparameters for neural network training.
     """
@@ -677,202 +672,6 @@ class Class1NeuralNetwork(object):
         fit_info["num_points"] = mutable_generator_state["yielded_values"]
         self.fit_info.append(dict(fit_info))
 
-    def random_negatives_generator(
-            self,
-            encodable_peptides,
-            affinities,
-            allele_encoding,
-            inequalities):
-
-        random_negative_lengths = self.hyperparameters['random_negative_lengths']
-
-        df = pandas.DataFrame({
-            "peptide": encodable_peptides.sequences,
-            "affinity": affinities,
-        })
-        if allele_encoding is not None:
-            df["allele"] = allele_encoding.alleles
-        df["length"] = df.peptide.str.len()
-        if inequalities is None:
-            df["inequality"] = "="
-        else:
-            df["inequality"] = inequalities
-        df_nonbinders = None
-        if self.hyperparameters['random_negative_binder_threshold']:
-            df_nonbinders = df.loc[
-                (df.inequality != "<") &
-                (df.affinity > self.hyperparameters[
-                    'random_negative_binder_threshold'
-                ])
-            ]
-            df = df.loc[
-                (df.inequality != ">") &
-                (df.affinity <= self.hyperparameters[
-                    'random_negative_binder_threshold'
-                ])
-            ]
-
-        aa_distribution = None
-        if self.hyperparameters['random_negative_match_distribution']:
-            aa_distribution = amino_acid_distribution(
-                encodable_peptides.sequences,
-                smoothing=self.hyperparameters[
-                    'random_negative_distribution_smoothing'
-                ])
-            logging.info(
-                "Using amino acid distribution for random negative:\n%s" % (
-                    str(aa_distribution.to_dict())))
-
-        random_negative_alleles = None
-        if self.hyperparameters["random_negative_method"] == "by_length":
-            # Different numbers of random negatives per length. Alleles are
-            # sampled proportionally to the number of times they are used in
-            # the training data.
-            length_to_num_random_negative = {}
-            random_negative_lengths = self.hyperparameters[
-                'random_negative_lengths'
-            ]
-
-            length_counts = df.length.value_counts().to_dict()
-            for length in random_negative_lengths:
-                length_to_num_random_negative[length] = int(
-                    length_counts.get(length, 0) *
-                    self.hyperparameters['random_negative_rate'] +
-                    self.hyperparameters['random_negative_constant'])
-            length_to_num_random_negative = pandas.Series(
-                length_to_num_random_negative)
-            total_random_negatives = length_to_num_random_negative.sum()
-            logging.info("Random negative counts per length:\n%s" % (
-                str(length_to_num_random_negative.to_dict())))
-
-            if allele_encoding is not None:
-                random_negative_alleles = df.allele.sample(
-                    n=total_random_negatives, replace=True).values
-
-            def sample_peptides():
-                peptides = []
-                for (length, count) in length_to_num_random_negative.items():
-                    peptides.extend(
-                        random_peptides(
-                            count,
-                            length=length,
-                            distribution=aa_distribution))
-                random.shuffle(peptides) # important
-                return EncodableSequences.create(peptides)
-        elif self.hyperparameters["random_negative_method"] == "by_allele":
-            # For each allele, a particular number of random negatives are used
-            # for all lengths. Across alleles, the number of random negatives
-            # varies; within an allele, the number of random negatives for each
-            # length is a constant
-            allele_to_num_per_length = {}
-            total_random_peptides_per_length = 0
-            for (allele, sub_df) in df.groupby("allele"):
-                num_for_allele = len(sub_df) * (
-                    self.hyperparameters['random_negative_rate']
-                ) + self.hyperparameters['random_negative_constant']
-                num_per_length = int(math.ceil(
-                    num_for_allele / len(random_negative_lengths)))
-                total_random_peptides_per_length += num_per_length
-                allele_to_num_per_length[allele] = num_per_length
-
-            if allele_encoding is not None:
-                random_negative_alleles = []
-                for _ in random_negative_lengths:
-                    for (allele, num) in allele_to_num_per_length.items():
-                        random_negative_alleles.extend([allele] * num)
-
-            numpy.testing.assert_equal(
-                len(random_negative_alleles),
-                total_random_peptides_per_length * len(random_negative_lengths))
-
-            logging.info(
-                "Random negative counts for each length by allele:\n%s" % (
-                    str(allele_to_num_per_length)))
-
-            def sample_peptides():
-                peptides = []
-                for length in random_negative_lengths:
-                    peptides.extend(
-                        random_peptides(
-                            total_random_peptides_per_length,
-                            length=length,
-                            distribution=aa_distribution))
-                # important NOT to shuffle peptides, since they correspond with
-                # specific alleles.
-                return EncodableSequences.create(peptides)
-        elif self.hyperparameters["random_negative_method"] == "by_allele_equalize_nonbinders":
-            assert df_nonbinders is not None
-
-            allele_and_length_to_num_random_negative = {}
-
-            # First pass
-            allele_to_num_per_length_first_pass = {}
-            for (allele, sub_df) in df.groupby("allele"):
-                num_for_allele = len(sub_df) * (
-                    self.hyperparameters['random_negative_rate']
-                ) + self.hyperparameters['random_negative_constant']
-                num_per_length = int(math.ceil(
-                    num_for_allele / len(random_negative_lengths)))
-                allele_to_num_per_length_first_pass[allele] = num_per_length
-
-            # Second pass
-            for (allele, sub_df) in df_nonbinders.groupby("allele"):
-                nonbinders_by_length = sub_df.peptide.str.len().value_counts()
-                nonbinders_by_length = nonbinders_by_length.reindex(
-                    random_negative_lengths
-                ).fillna(0)
-
-                total_nonbinders_by_length = (
-                    nonbinders_by_length +
-                    allele_to_num_per_length_first_pass[allele])
-                max_total_nonbinders_by_length = total_nonbinders_by_length.max()
-                for (length, total) in total_nonbinders_by_length.items():
-                    allele_and_length_to_num_random_negative[
-                        (allele, length)
-                    ] = math.ceil(
-                        allele_to_num_per_length_first_pass[allele] +
-                        max_total_nonbinders_by_length -
-                        total)
-
-            plan_df = []
-            for (
-                    (allele, length), num_random_negative) in (
-                    allele_and_length_to_num_random_negative.items()):
-                plan_df.append((allele, length, num_random_negative))
-            plan_df = pandas.DataFrame(
-                plan_df, columns=["allele", "length", "num"])
-
-            logging.info("Random negative plan:\n%s", plan_df)
-
-            if allele_encoding is not None:
-                random_negative_alleles = []
-                for _, row in plan_df.iterrows():
-                    random_negative_alleles.extend([row.allele] * int(row.num))
-
-            def sample_peptides():
-                peptides = []
-                for _, row in plan_df.iterrows():
-                    peptides.extend(
-                        random_peptides(
-                            row.num,
-                            length=row.length,
-                            distribution=aa_distribution))
-                # important NOT to shuffle peptides, since they correspond with
-                # specific alleles.
-                return EncodableSequences.create(peptides)
-        else:
-            raise NotImplementedError(
-                self.hyperparameters["random_negative_method"])
-
-        random_negative_allele_encoding = None
-        if random_negative_alleles is not None:
-            random_negative_allele_encoding = AlleleEncoding(
-                random_negative_alleles, borrow_from=allele_encoding)
-
-        yield random_negative_allele_encoding
-        while True:
-            yield sample_peptides()
-
     def fit(
             self,
             peptides,
@@ -937,14 +736,21 @@ class Class1NeuralNetwork(object):
         peptide_encoding = self.peptides_to_network_input(encodable_peptides)
         fit_info = collections.defaultdict(list)
 
-        random_negatives_generator = self.random_negatives_generator(
-            encodable_peptides=encodable_peptides,
+        random_negatives_planner = RandomNegativePeptides(
+            **RandomNegativePeptides.hyperparameter_defaults.subselect(
+                self.hyperparameters))
+        random_negatives_planner.plan(
+            peptides=encodable_peptides.sequences,
             affinities=affinities,
-            allele_encoding=allele_encoding,
+            alleles=allele_encoding.alleles if allele_encoding else None,
             inequalities=inequalities)
-        random_negatives_allele_encoding = next(random_negatives_generator)
-        num_random_negatives = len(
-            next(random_negatives_generator).sequences)
+
+        random_negatives_allele_encoding = None
+        if allele_encoding is not None:
+            random_negatives_allele_encoding = AlleleEncoding(
+                random_negatives_planner.get_alleles(),
+                borrow_from=allele_encoding)
+        num_random_negatives = random_negatives_planner.get_total_count()
 
         y_values = from_ic50(numpy.array(affinities, copy=False))
         assert numpy.isnan(y_values).sum() == 0, y_values
@@ -1107,7 +913,8 @@ class Class1NeuralNetwork(object):
         last_progress_print = None
         x_dict_with_random_negatives = {}
         for i in range(self.hyperparameters['max_epochs']):
-            random_negative_peptides = next(random_negatives_generator)
+            random_negative_peptides = EncodableSequences.create(
+                random_negatives_planner.get_peptides())
             random_negative_peptides_encoding = (
                 self.peptides_to_network_input(random_negative_peptides))
 
diff --git a/mhcflurry/random_negative_peptides.py b/mhcflurry/random_negative_peptides.py
new file mode 100644
index 00000000..80fe7ab4
--- /dev/null
+++ b/mhcflurry/random_negative_peptides.py
@@ -0,0 +1,179 @@
+import logging
+import math
+
+import numpy
+import pandas
+
+from .hyperparameters import HyperparameterDefaults
+from .common import amino_acid_distribution, random_peptides
+
+class RandomNegativePeptides(object):
+    hyperparameter_defaults = HyperparameterDefaults(
+        random_negative_rate=0.0,
+        random_negative_constant=25,
+        random_negative_match_distribution=True,
+        random_negative_distribution_smoothing=0.0,
+        random_negative_method="by_length",
+        random_negative_binder_threshold=None,
+        random_negative_lengths=[8,9,10,11,12,13,14,15])
+
+
+    def __init__(self, **hyperparameters):
+        self.hyperparameters = self.hyperparameter_defaults.with_defaults(
+            hyperparameters)
+        self.plan_df = None
+        self.aa_distribution = None
+
+    def plan(self, peptides, affinities, alleles=None, inequalities=None):
+        peptides = pandas.Series(peptides, copy=False)
+        peptide_lengths = peptides.str.len()
+
+        if self.hyperparameters['random_negative_match_distribution']:
+            self.aa_distribution = amino_acid_distribution(
+                peptides.values,
+                smoothing=self.hyperparameters[
+                    'random_negative_distribution_smoothing'
+                ])
+            logging.info(
+                "Using amino acid distribution for random negative:\n%s" % (
+                    str(self.aa_distribution.to_dict())))
+
+        df_all = pandas.DataFrame({
+            'length': peptide_lengths,
+            'affinity': affinities,
+        })
+        df_all["allele"] = "" if alleles is None else alleles
+        df_all["inequality"] = "=" if inequalities is None else inequalities
+
+        df_binders = None
+        df_nonbinders = None
+        if self.hyperparameters['random_negative_binder_threshold']:
+            df_nonbinders = df_all.loc[
+                (df_all.inequality != "<") &
+                (df_all.affinity > self.hyperparameters[
+                    'random_negative_binder_threshold'
+                ])
+            ]
+            df_binders = df_all.loc[
+                (df_all.inequality != ">") &
+                (df_all.affinity <= self.hyperparameters[
+                    'random_negative_binder_threshold'
+                ])
+            ]
+
+        method = self.hyperparameters['random_negative_method']
+        function = {
+            'by_length': self.plan_by_length,
+            'by_allele': self.plan_by_allele,
+            'by_allele_equalize_nonbinders':
+                self.plan_by_allele_equalize_nonbinders,
+        }[method]
+        function(df_all, df_binders, df_nonbinders)
+        assert self.plan_df is not None
+        logging.info("Random negative plan [%s]:\n%s", method, self.plan_df)
+        return self.plan_df
+
+    def plan_by_length(self, df_all, df_binders=None, df_nonbinders=None):
+        """
+        Different numbers of random negatives per length. Alleles are sampled
+        proportionally to the number of times they are used in the training
+        data.
+
+        Used for allele-specific predictors. Does not work well for pan-allele.
+        """
+        assert list(df_all.allele.unique()) == [""], (
+            "by_length only recommended for allele specific prediction")
+
+        df = df_all if df_binders is None else df_binders
+        lengths = self.hyperparameters['random_negative_lengths']
+
+        length_to_num_random_negative = {}
+        length_counts = df.length.value_counts().to_dict()
+        for length in lengths:
+            length_to_num_random_negative[length] = int(
+                length_counts.get(length, 0) *
+                self.hyperparameters['random_negative_rate'] +
+                self.hyperparameters['random_negative_constant'])
+
+        plan_df = pandas.DataFrame(index=sorted(df.allele.unique()))
+        for length in lengths:
+            plan_df[length] = length_to_num_random_negative[length]
+        self.plan_df = plan_df.astype(int)
+
+    def plan_by_allele(self, df_all, df_binders=None, df_nonbinders=None):
+        """
+        For each allele, a particular number of random negatives are used
+        for all lengths. Across alleles, the number of random negatives
+        varies; within an allele, the number of random negatives for each
+        length is a constant
+        """
+        allele_to_num_per_length = {}
+        total_random_peptides_per_length = 0
+        df = df_all if df_binders is None else df_binders
+        lengths = self.hyperparameters['random_negative_lengths']
+        all_alleles = df_all.allele.unique()
+        for allele in all_alleles:
+            sub_df = df.loc[df.allele == allele]
+            num_for_allele = len(sub_df) * (
+                self.hyperparameters['random_negative_rate']
+            ) + self.hyperparameters['random_negative_constant']
+            num_per_length = int(math.ceil(
+                num_for_allele / len(lengths)))
+            total_random_peptides_per_length += num_per_length
+            allele_to_num_per_length[allele] = num_per_length
+
+        plan_df = pandas.DataFrame(index=sorted(df.allele.unique()))
+        for length in lengths:
+            plan_df[length] = plan_df.index.map(allele_to_num_per_length)
+        self.plan_df = plan_df.astype(int)
+
+    def plan_by_allele_equalize_nonbinders(
+            self, df_all, df_binders, df_nonbinders):
+        """
+        """
+        assert df_binders is not None
+        assert df_nonbinders is not None
+
+        lengths = self.hyperparameters['random_negative_lengths']
+
+        self.plan_by_allele(df_all, df_binders, df_nonbinders)
+        first_pass_plan = self.plan_df
+        self.plan_df = None
+
+        new_plan = first_pass_plan.copy()
+        new_plan[:] = numpy.nan
+
+        for (allele, first_pass_per_length) in first_pass_plan.iterrows():
+            real_nonbinders_by_length = df_nonbinders.loc[
+                df_nonbinders.allele == allele
+            ].length.value_counts().reindex(lengths).fillna(0)
+            total_nonbinders_by_length = (
+                real_nonbinders_by_length + first_pass_per_length)
+            new_plan.loc[allele] = first_pass_per_length + (
+                total_nonbinders_by_length.max() - total_nonbinders_by_length)
+
+        self.plan_df = new_plan.astype(int)
+
+    def get_alleles(self):
+        assert self.plan_df is not None, "Call plan() first"
+        alleles = []
+        for allele, row in self.plan_df.iterrows():
+            alleles.extend([allele] * int(row.sum()))
+        assert len(alleles) == self.get_total_count()
+        return alleles
+
+    def get_peptides(self):
+        assert self.plan_df is not None, "Call plan() first"
+        peptides = []
+        for allele, row in self.plan_df.iterrows():
+            for (length, num) in row.items():
+                peptides.extend(
+                    random_peptides(
+                        num,
+                        length=length,
+                        distribution=self.aa_distribution))
+        assert len(peptides) == self.get_total_count()
+        return peptides
+
+    def get_total_count(self):
+        return self.plan_df.sum().sum()
\ No newline at end of file
diff --git a/test/test_random_negative_peptides.py b/test/test_random_negative_peptides.py
index c3c978f2..06765cb4 100644
--- a/test/test_random_negative_peptides.py
+++ b/test/test_random_negative_peptides.py
@@ -1,3 +1,7 @@
+import logging
+logging.getLogger('matplotlib').disabled = True
+logging.getLogger('tensorflow').disabled = True
+
 from mhcflurry import amino_acid
 from nose.tools import eq_
 from numpy.testing import assert_equal
@@ -9,20 +13,16 @@ from mhcflurry import Class1NeuralNetwork
 from mhcflurry.encodable_sequences import EncodableSequences
 from mhcflurry.allele_encoding import AlleleEncoding
 from mhcflurry.common import random_peptides
+from mhcflurry.random_negative_peptides import RandomNegativePeptides
 
 
 def test_random_negative_peptides_by_allele():
-    network = Class1NeuralNetwork(
+    planner = RandomNegativePeptides(
         random_negative_method="by_allele",
         random_negative_binder_threshold=500,
         random_negative_rate=1.0,
         random_negative_constant=2)
 
-    allele_to_sequence = {
-        "HLA-A*02:01": "AAAAA",
-        "HLA-B*44:02": "CCCCC",
-        "HLA-C*07:02": "EEEEE",
-    }
     data_rows = [
         ("HLA-A*02:01", "SIINFEKL", 400, "="),
         ("HLA-A*02:01", "SIINFEKLL", 300, "="),
@@ -42,25 +42,18 @@ def test_random_negative_peptides_by_allele():
         columns=["allele", "peptide", "affinity", "inequality"])
     data["length"] = data.peptide.str.len()
 
-    peptides = EncodableSequences.create(data.peptide.values)
-    allele_encoding = AlleleEncoding(data.allele.values, allele_to_sequence)
-
-    results = network.random_negatives_generator(
-        peptides,
-        data.affinity.values,
-        allele_encoding,
-        data.inequality.values
-    )
-    result_allele_encoding = next(results)
-    first_peptide_sample = next(results)
-    second_peptide_sample = next(results)
-
-    result_df1 = pandas.DataFrame({
-        "allele": result_allele_encoding.alleles,
-        "peptide": first_peptide_sample.sequences,
+    planner.plan(
+        peptides=data.peptide.values,
+        affinities=data.affinity.values,
+        alleles=data.allele.values,
+        inequalities=data.inequality.values)
+    result_df = pandas.DataFrame({
+        "allele": planner.get_alleles(),
+        "peptide": planner.get_peptides(),
     })
-    result_df1["length"] = result_df1.peptide.str.len()
-    random_negatives = result_df1.groupby(["allele", "length"]).peptide.count().unstack()
+
+    result_df["length"] = result_df.peptide.str.len()
+    random_negatives = result_df.groupby(["allele", "length"]).peptide.count().unstack()
     real_data = data.groupby(["allele", "length"]).peptide.count().unstack().fillna(0)
     real_binders = data.loc[
         data.affinity <= 500
@@ -77,23 +70,20 @@ def test_random_negative_peptides_by_allele():
 
 
 def test_random_negative_peptides_by_allele():
-    network = Class1NeuralNetwork(
+    planner = RandomNegativePeptides(
         random_negative_method="by_allele_equalize_nonbinders",
         random_negative_binder_threshold=500,
         random_negative_rate=1.0,
         random_negative_constant=2)
-
-    allele_to_sequence = {
-        "HLA-A*02:01": "AAAAA",
-        "HLA-B*44:02": "CCCCC",
-        "HLA-C*07:02": "EEEEE",
-    }
     data_rows = [
         ("HLA-A*02:01", "SIINFEKL", 400, "="),
         ("HLA-A*02:01", "SIINFEKLL", 300, "="),
         ("HLA-A*02:01", "SIINFEKLL", 300, "="),
         ("HLA-A*02:01", "SIINFEKLQ", 1000, "="),
         ("HLA-A*02:01", "SIINFEKLZZ", 12000, ">"),
+        ("HLA-C*01:02", "SIINFEKLQ", 100, "="),  # only binders
+        ("HLA-C*07:02", "SIINFEKLL", 1000, "=")   # only non-binders
+
     ]
     for peptide in random_peptides(1000, length=9):
         data_rows.append(("HLA-B*44:02", peptide, 100, "="))
@@ -107,25 +97,17 @@ def test_random_negative_peptides_by_allele():
         columns=["allele", "peptide", "affinity", "inequality"])
     data["length"] = data.peptide.str.len()
 
-    peptides = EncodableSequences.create(data.peptide.values)
-    allele_encoding = AlleleEncoding(data.allele.values, allele_to_sequence)
-
-    results = network.random_negatives_generator(
-        peptides,
-        data.affinity.values,
-        allele_encoding,
-        data.inequality.values
-    )
-    result_allele_encoding = next(results)
-    first_peptide_sample = next(results)
-    second_peptide_sample = next(results)
-
-    result_df1 = pandas.DataFrame({
-        "allele": result_allele_encoding.alleles,
-        "peptide": first_peptide_sample.sequences,
+    planner.plan(
+        peptides=data.peptide.values,
+        affinities=data.affinity.values,
+        alleles=data.allele.values,
+        inequalities=data.inequality.values)
+    result_df = pandas.DataFrame({
+        "allele": planner.get_alleles(),
+        "peptide": planner.get_peptides(),
     })
-    result_df1["length"] = result_df1.peptide.str.len()
-    random_negatives = result_df1.groupby(["allele", "length"]).peptide.count().unstack()
+    result_df["length"] = result_df.peptide.str.len()
+    random_negatives = result_df.groupby(["allele", "length"]).peptide.count().unstack()
     real_data = data.groupby(["allele", "length"]).peptide.count().unstack().fillna(0)
     real_binders = data.loc[
         data.affinity <= 500
@@ -136,7 +118,11 @@ def test_random_negative_peptides_by_allele():
     for length in random_negatives.columns:
         if length not in real_nonbinders.columns:
             real_nonbinders[length] = 0
-    total_nonbinders = random_negatives + real_nonbinders
+    total_nonbinders = (
+            random_negatives.reindex(real_data.index).fillna(0) +
+            real_nonbinders.reindex(real_data.index).fillna(0))
+
+    assert (total_nonbinders.loc["HLA-A*02:01"] == 2.0).all(), total_nonbinders
+    assert (total_nonbinders.loc["HLA-B*44:02"] == 1126).all(), total_nonbinders
 
-    assert (total_nonbinders.loc["HLA-A*02:01"] == 2.0).all()
-    assert (total_nonbinders.loc["HLA-B*44:02"] == 1126).all()
+    assert not total_nonbinders.isnull().any().any()
-- 
GitLab