From a0b0a8a994de54c7c6f31fc60a7faa77221711b6 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Wed, 25 Sep 2019 18:09:28 -0400
Subject: [PATCH] add random negative peptide method
 by_allele_equalize_nonbinders

---
 .../generate_hyperparameters.py               |   2 +-
 mhcflurry/class1_neural_network.py            |  72 ++++++++-
 test/test_random_negative_peptides.py         | 142 ++++++++++++++++++
 3 files changed, 213 insertions(+), 3 deletions(-)
 create mode 100644 test/test_random_negative_peptides.py

diff --git a/downloads-generation/models_class1_pan_unselected/generate_hyperparameters.py b/downloads-generation/models_class1_pan_unselected/generate_hyperparameters.py
index a6d3297e..08873dfc 100644
--- a/downloads-generation/models_class1_pan_unselected/generate_hyperparameters.py
+++ b/downloads-generation/models_class1_pan_unselected/generate_hyperparameters.py
@@ -41,7 +41,7 @@ base_hyperparameters = {
     'random_negative_distribution_smoothing': 0.0,
     'random_negative_match_distribution': True,
     'random_negative_rate': 1.0,
-    'random_negative_method': 'by_allele',
+    'random_negative_method': 'by_allele_equalize_nonbinders',
     'train_data': {
         'pretrain': True,
         'pretrain_peptides_per_epoch': 64,
diff --git a/mhcflurry/class1_neural_network.py b/mhcflurry/class1_neural_network.py
index 8b8f19f8..e6d33a57 100644
--- a/mhcflurry/class1_neural_network.py
+++ b/mhcflurry/class1_neural_network.py
@@ -6,6 +6,7 @@ import itertools
 import os
 import logging
 import random
+import math
 
 import numpy
 import pandas
@@ -696,7 +697,14 @@ class Class1NeuralNetwork(object):
             df["inequality"] = "="
         else:
             df["inequality"] = inequalities
+        df_nonbinders = None
         if self.hyperparameters['random_negative_binder_threshold']:
+            df_nonbinders = df.loc[
+                (df.inequality != "<") &
+                (df.affinity > self.hyperparameters[
+                    'random_negative_binder_threshold'
+                ])
+            ]
             df = df.loc[
                 (df.inequality != ">") &
                 (df.affinity <= self.hyperparameters[
@@ -762,8 +770,8 @@ class Class1NeuralNetwork(object):
                 num_for_allele = len(sub_df) * (
                     self.hyperparameters['random_negative_rate']
                 ) + self.hyperparameters['random_negative_constant']
-                num_per_length = int(
-                    num_for_allele / len(random_negative_lengths))
+                num_per_length = int(math.ceil(
+                    num_for_allele / len(random_negative_lengths)))
                 total_random_peptides_per_length += num_per_length
                 allele_to_num_per_length[allele] = num_per_length
 
@@ -792,6 +800,66 @@ class Class1NeuralNetwork(object):
                 # important NOT to shuffle peptides, since they correspond with
                 # specific alleles.
                 return EncodableSequences.create(peptides)
+        elif self.hyperparameters["random_negative_method"] == "by_allele_equalize_nonbinders":
+            assert df_nonbinders is not None
+
+            allele_and_length_to_num_random_negative = {}
+
+            # First pass
+            allele_to_num_per_length_first_pass = {}
+            for (allele, sub_df) in df.groupby("allele"):
+                num_for_allele = len(sub_df) * (
+                    self.hyperparameters['random_negative_rate']
+                ) + self.hyperparameters['random_negative_constant']
+                num_per_length = int(math.ceil(
+                    num_for_allele / len(random_negative_lengths)))
+                allele_to_num_per_length_first_pass[allele] = num_per_length
+
+            # Second pass
+            for (allele, sub_df) in df_nonbinders.groupby("allele"):
+                nonbinders_by_length = sub_df.peptide.str.len().value_counts()
+                nonbinders_by_length = nonbinders_by_length.reindex(
+                    random_negative_lengths
+                ).fillna(0)
+
+                total_nonbinders_by_length = (
+                    nonbinders_by_length +
+                    allele_to_num_per_length_first_pass[allele])
+                max_total_nonbinders_by_length = total_nonbinders_by_length.max()
+                for (length, total) in total_nonbinders_by_length.items():
+                    allele_and_length_to_num_random_negative[
+                        (allele, length)
+                    ] = math.ceil(
+                        allele_to_num_per_length_first_pass[allele] +
+                        max_total_nonbinders_by_length -
+                        total)
+
+            plan_df = []
+            for (
+                    (allele, length), num_random_negative) in (
+                    allele_and_length_to_num_random_negative.items()):
+                plan_df.append((allele, length, num_random_negative))
+            plan_df = pandas.DataFrame(
+                plan_df, columns=["allele", "length", "num"])
+
+            logging.info("Random negative plan:\n%s", plan_df)
+
+            if allele_encoding is not None:
+                random_negative_alleles = []
+                for _, row in plan_df.iterrows():
+                    random_negative_alleles.extend([row.allele] * int(row.num))
+
+            def sample_peptides():
+                peptides = []
+                for _, row in plan_df.iterrows():
+                    peptides.extend(
+                        random_peptides(
+                            row.num,
+                            length=row.length,
+                            distribution=aa_distribution))
+                # important NOT to shuffle peptides, since they correspond with
+                # specific alleles.
+                return EncodableSequences.create(peptides)
         else:
             raise NotImplementedError(
                 self.hyperparameters["random_negative_method"])
diff --git a/test/test_random_negative_peptides.py b/test/test_random_negative_peptides.py
new file mode 100644
index 00000000..c3c978f2
--- /dev/null
+++ b/test/test_random_negative_peptides.py
@@ -0,0 +1,142 @@
+from mhcflurry import amino_acid
+from nose.tools import eq_
+from numpy.testing import assert_equal
+import numpy
+import pandas
+import math
+
+from mhcflurry import Class1NeuralNetwork
+from mhcflurry.encodable_sequences import EncodableSequences
+from mhcflurry.allele_encoding import AlleleEncoding
+from mhcflurry.common import random_peptides
+
+
+def test_random_negative_peptides_by_allele():
+    network = Class1NeuralNetwork(
+        random_negative_method="by_allele",
+        random_negative_binder_threshold=500,
+        random_negative_rate=1.0,
+        random_negative_constant=2)
+
+    allele_to_sequence = {
+        "HLA-A*02:01": "AAAAA",
+        "HLA-B*44:02": "CCCCC",
+        "HLA-C*07:02": "EEEEE",
+    }
+    data_rows = [
+        ("HLA-A*02:01", "SIINFEKL", 400, "="),
+        ("HLA-A*02:01", "SIINFEKLL", 300, "="),
+        ("HLA-A*02:01", "SIINFEKLL", 300, "="),
+        ("HLA-A*02:01", "SIINFEKLQ", 1000, "="),
+        ("HLA-A*02:01", "SIINFEKLZZ", 12000, ">"),
+    ]
+    for peptide in random_peptides(1000, length=9):
+        data_rows.append(("HLA-B*44:02", peptide, 100, "="))
+    for peptide in random_peptides(1000, length=9):
+        data_rows.append(("HLA-B*44:02", peptide, 1000, "="))
+    for peptide in random_peptides(5, length=10):
+        data_rows.append(("HLA-B*44:02", peptide, 100, "="))
+
+    data = pandas.DataFrame(
+        data_rows,
+        columns=["allele", "peptide", "affinity", "inequality"])
+    data["length"] = data.peptide.str.len()
+
+    peptides = EncodableSequences.create(data.peptide.values)
+    allele_encoding = AlleleEncoding(data.allele.values, allele_to_sequence)
+
+    results = network.random_negatives_generator(
+        peptides,
+        data.affinity.values,
+        allele_encoding,
+        data.inequality.values
+    )
+    result_allele_encoding = next(results)
+    first_peptide_sample = next(results)
+    second_peptide_sample = next(results)
+
+    result_df1 = pandas.DataFrame({
+        "allele": result_allele_encoding.alleles,
+        "peptide": first_peptide_sample.sequences,
+    })
+    result_df1["length"] = result_df1.peptide.str.len()
+    random_negatives = result_df1.groupby(["allele", "length"]).peptide.count().unstack()
+    real_data = data.groupby(["allele", "length"]).peptide.count().unstack().fillna(0)
+    real_binders = data.loc[
+        data.affinity <= 500
+    ].groupby(["allele", "length"]).peptide.count().unstack().fillna(0)
+    real_nonbinders = data.loc[
+        data.affinity > 500
+    ].groupby(["allele", "length"]).peptide.count().unstack().fillna(0)
+    total_nonbinders = random_negatives + real_nonbinders
+
+    assert (random_negatives.loc["HLA-A*02:01"] == 1.0).all()
+    assert (random_negatives.loc["HLA-B*44:02"] == math.ceil(1007 / 8)).all(), (
+        random_negatives.loc["HLA-B*44:02"], math.ceil(1007 / 8))
+
+
+
+def test_random_negative_peptides_by_allele():
+    network = Class1NeuralNetwork(
+        random_negative_method="by_allele_equalize_nonbinders",
+        random_negative_binder_threshold=500,
+        random_negative_rate=1.0,
+        random_negative_constant=2)
+
+    allele_to_sequence = {
+        "HLA-A*02:01": "AAAAA",
+        "HLA-B*44:02": "CCCCC",
+        "HLA-C*07:02": "EEEEE",
+    }
+    data_rows = [
+        ("HLA-A*02:01", "SIINFEKL", 400, "="),
+        ("HLA-A*02:01", "SIINFEKLL", 300, "="),
+        ("HLA-A*02:01", "SIINFEKLL", 300, "="),
+        ("HLA-A*02:01", "SIINFEKLQ", 1000, "="),
+        ("HLA-A*02:01", "SIINFEKLZZ", 12000, ">"),
+    ]
+    for peptide in random_peptides(1000, length=9):
+        data_rows.append(("HLA-B*44:02", peptide, 100, "="))
+    for peptide in random_peptides(1000, length=9):
+        data_rows.append(("HLA-B*44:02", peptide, 1000, "="))
+    for peptide in random_peptides(5, length=10):
+        data_rows.append(("HLA-B*44:02", peptide, 100, "="))
+
+    data = pandas.DataFrame(
+        data_rows,
+        columns=["allele", "peptide", "affinity", "inequality"])
+    data["length"] = data.peptide.str.len()
+
+    peptides = EncodableSequences.create(data.peptide.values)
+    allele_encoding = AlleleEncoding(data.allele.values, allele_to_sequence)
+
+    results = network.random_negatives_generator(
+        peptides,
+        data.affinity.values,
+        allele_encoding,
+        data.inequality.values
+    )
+    result_allele_encoding = next(results)
+    first_peptide_sample = next(results)
+    second_peptide_sample = next(results)
+
+    result_df1 = pandas.DataFrame({
+        "allele": result_allele_encoding.alleles,
+        "peptide": first_peptide_sample.sequences,
+    })
+    result_df1["length"] = result_df1.peptide.str.len()
+    random_negatives = result_df1.groupby(["allele", "length"]).peptide.count().unstack()
+    real_data = data.groupby(["allele", "length"]).peptide.count().unstack().fillna(0)
+    real_binders = data.loc[
+        data.affinity <= 500
+    ].groupby(["allele", "length"]).peptide.count().unstack().fillna(0)
+    real_nonbinders = data.loc[
+        data.affinity > 500
+    ].groupby(["allele", "length"]).peptide.count().unstack().fillna(0)
+    for length in random_negatives.columns:
+        if length not in real_nonbinders.columns:
+            real_nonbinders[length] = 0
+    total_nonbinders = random_negatives + real_nonbinders
+
+    assert (total_nonbinders.loc["HLA-A*02:01"] == 2.0).all()
+    assert (total_nonbinders.loc["HLA-B*44:02"] == 1126).all()
-- 
GitLab