add random negative peptide method by_allele_equalize_nonbinders

a0b0a8a9 · Tim O'Donnell · 4192c780 · a0b0a8a9 · a0b0a8a9 · a0b0a8a9
Commit a0b0a8a9 authored 5 years ago by Tim O'Donnell
--- a/downloads-generation/models_class1_pan_unselected/generate_hyperparameters.py
+++ b/downloads-generation/models_class1_pan_unselected/generate_hyperparameters.py
@@ -41,7 +41,7 @@ base_hyperparameters = {
    'random_negative_distribution_smoothing': 0.0,
    'random_negative_match_distribution': True,
    'random_negative_rate': 1.0,
-    'random_negative_method': 'by_allele',
+    'random_negative_method': 'by_allele_equalize_nonbinders',
    'train_data': {
        'pretrain': True,
        'pretrain_peptides_per_epoch': 64,

--- a/mhcflurry/class1_neural_network.py
+++ b/mhcflurry/class1_neural_network.py
@@ -6,6 +6,7 @@ import itertools
 import os
 import logging
 import random
+import math
 import numpy
 import pandas
@@ -696,7 +697,14 @@ class Class1NeuralNetwork(object):
            df["inequality"] = "="
        else:
            df["inequality"] = inequalities
+        df_nonbinders = None
        if self.hyperparameters['random_negative_binder_threshold']:
+            df_nonbinders = df.loc[
+                (df.inequality != "<") &
+                (df.affinity > self.hyperparameters[
+                    'random_negative_binder_threshold'
+                ])
+            ]
            df = df.loc[
                (df.inequality != ">") &
                (df.affinity <= self.hyperparameters[
@@ -762,8 +770,8 @@ class Class1NeuralNetwork(object):
                num_for_allele = len(sub_df) * (
                    self.hyperparameters['random_negative_rate']
                ) + self.hyperparameters['random_negative_constant']
-                num_per_length = int(
+                num_per_length = int(math.ceil(
-                    num_for_allele / len(random_negative_lengths))
+                    num_for_allele / len(random_negative_lengths)))
                total_random_peptides_per_length += num_per_length
                allele_to_num_per_length[allele] = num_per_length
@@ -792,6 +800,66 @@ class Class1NeuralNetwork(object):
                # important NOT to shuffle peptides, since they correspond with
                # specific alleles.
                return EncodableSequences.create(peptides)
+        elif self.hyperparameters["random_negative_method"] == "by_allele_equalize_nonbinders":
+            assert df_nonbinders is not None
+            allele_and_length_to_num_random_negative = {}
+            # First pass
+            allele_to_num_per_length_first_pass = {}
+            for (allele, sub_df) in df.groupby("allele"):
+                num_for_allele = len(sub_df) * (
+                    self.hyperparameters['random_negative_rate']
+                ) + self.hyperparameters['random_negative_constant']
+                num_per_length = int(math.ceil(
+                    num_for_allele / len(random_negative_lengths)))
+                allele_to_num_per_length_first_pass[allele] = num_per_length
+            # Second pass
+            for (allele, sub_df) in df_nonbinders.groupby("allele"):
+                nonbinders_by_length = sub_df.peptide.str.len().value_counts()
+                nonbinders_by_length = nonbinders_by_length.reindex(
+                    random_negative_lengths
+                ).fillna(0)
+                total_nonbinders_by_length = (
+                    nonbinders_by_length +
+                    allele_to_num_per_length_first_pass[allele])
+                max_total_nonbinders_by_length = total_nonbinders_by_length.max()
+                for (length, total) in total_nonbinders_by_length.items():
+                    allele_and_length_to_num_random_negative[
+                        (allele, length)
+                    ] = math.ceil(
+                        allele_to_num_per_length_first_pass[allele] +
+                        max_total_nonbinders_by_length -
+                        total)
+            plan_df = []
+            for (
+                    (allele, length), num_random_negative) in (
+                    allele_and_length_to_num_random_negative.items()):
+                plan_df.append((allele, length, num_random_negative))
+            plan_df = pandas.DataFrame(
+                plan_df, columns=["allele", "length", "num"])
+            logging.info("Random negative plan:\n%s", plan_df)
+            if allele_encoding is not None:
+                random_negative_alleles = []
+                for _, row in plan_df.iterrows():
+                    random_negative_alleles.extend([row.allele] * int(row.num))
+            def sample_peptides():
+                peptides = []
+                for _, row in plan_df.iterrows():
+                    peptides.extend(
+                        random_peptides(
+                            row.num,
+                            length=row.length,
+                            distribution=aa_distribution))
+                # important NOT to shuffle peptides, since they correspond with
+                # specific alleles.
+                return EncodableSequences.create(peptides)
        else:
            raise NotImplementedError(
                self.hyperparameters["random_negative_method"])

--- a/test/test_random_negative_peptides.py
+++ b/test/test_random_negative_peptides.py
+from mhcflurry import amino_acid
+from nose.tools import eq_
+from numpy.testing import assert_equal
+import numpy
+import pandas
+import math
+from mhcflurry import Class1NeuralNetwork
+from mhcflurry.encodable_sequences import EncodableSequences
+from mhcflurry.allele_encoding import AlleleEncoding
+from mhcflurry.common import random_peptides
+def test_random_negative_peptides_by_allele():
+    network = Class1NeuralNetwork(
+        random_negative_method="by_allele",
+        random_negative_binder_threshold=500,
+        random_negative_rate=1.0,
+        random_negative_constant=2)
+    allele_to_sequence = {
+        "HLA-A*02:01": "AAAAA",
+        "HLA-B*44:02": "CCCCC",
+        "HLA-C*07:02": "EEEEE",
+    }
+    data_rows = [
+        ("HLA-A*02:01", "SIINFEKL", 400, "="),
+        ("HLA-A*02:01", "SIINFEKLL", 300, "="),
+        ("HLA-A*02:01", "SIINFEKLL", 300, "="),
+        ("HLA-A*02:01", "SIINFEKLQ", 1000, "="),
+        ("HLA-A*02:01", "SIINFEKLZZ", 12000, ">"),
+    ]
+    for peptide in random_peptides(1000, length=9):
+        data_rows.append(("HLA-B*44:02", peptide, 100, "="))
+    for peptide in random_peptides(1000, length=9):
+        data_rows.append(("HLA-B*44:02", peptide, 1000, "="))
+    for peptide in random_peptides(5, length=10):
+        data_rows.append(("HLA-B*44:02", peptide, 100, "="))
+    data = pandas.DataFrame(
+        data_rows,
+        columns=["allele", "peptide", "affinity", "inequality"])
+    data["length"] = data.peptide.str.len()
+    peptides = EncodableSequences.create(data.peptide.values)
+    allele_encoding = AlleleEncoding(data.allele.values, allele_to_sequence)
+    results = network.random_negatives_generator(
+        peptides,
+        data.affinity.values,
+        allele_encoding,
+        data.inequality.values
+    )
+    result_allele_encoding = next(results)
+    first_peptide_sample = next(results)
+    second_peptide_sample = next(results)
+    result_df1 = pandas.DataFrame({
+        "allele": result_allele_encoding.alleles,
+        "peptide": first_peptide_sample.sequences,
+    })
+    result_df1["length"] = result_df1.peptide.str.len()
+    random_negatives = result_df1.groupby(["allele", "length"]).peptide.count().unstack()
+    real_data = data.groupby(["allele", "length"]).peptide.count().unstack().fillna(0)
+    real_binders = data.loc[
+        data.affinity <= 500
+    ].groupby(["allele", "length"]).peptide.count().unstack().fillna(0)
+    real_nonbinders = data.loc[
+        data.affinity > 500
+    ].groupby(["allele", "length"]).peptide.count().unstack().fillna(0)
+    total_nonbinders = random_negatives + real_nonbinders
+    assert (random_negatives.loc["HLA-A*02:01"] == 1.0).all()
+    assert (random_negatives.loc["HLA-B*44:02"] == math.ceil(1007 / 8)).all(), (
+        random_negatives.loc["HLA-B*44:02"], math.ceil(1007 / 8))
+def test_random_negative_peptides_by_allele():
+    network = Class1NeuralNetwork(
+        random_negative_method="by_allele_equalize_nonbinders",
+        random_negative_binder_threshold=500,
+        random_negative_rate=1.0,
+        random_negative_constant=2)
+    allele_to_sequence = {
+        "HLA-A*02:01": "AAAAA",
+        "HLA-B*44:02": "CCCCC",
+        "HLA-C*07:02": "EEEEE",
+    }
+    data_rows = [
+        ("HLA-A*02:01", "SIINFEKL", 400, "="),
+        ("HLA-A*02:01", "SIINFEKLL", 300, "="),
+        ("HLA-A*02:01", "SIINFEKLL", 300, "="),
+        ("HLA-A*02:01", "SIINFEKLQ", 1000, "="),
+        ("HLA-A*02:01", "SIINFEKLZZ", 12000, ">"),
+    ]
+    for peptide in random_peptides(1000, length=9):
+        data_rows.append(("HLA-B*44:02", peptide, 100, "="))
+    for peptide in random_peptides(1000, length=9):
+        data_rows.append(("HLA-B*44:02", peptide, 1000, "="))
+    for peptide in random_peptides(5, length=10):
+        data_rows.append(("HLA-B*44:02", peptide, 100, "="))
+    data = pandas.DataFrame(
+        data_rows,
+        columns=["allele", "peptide", "affinity", "inequality"])
+    data["length"] = data.peptide.str.len()
+    peptides = EncodableSequences.create(data.peptide.values)
+    allele_encoding = AlleleEncoding(data.allele.values, allele_to_sequence)
+    results = network.random_negatives_generator(
+        peptides,
+        data.affinity.values,
+        allele_encoding,
+        data.inequality.values
+    )
+    result_allele_encoding = next(results)
+    first_peptide_sample = next(results)
+    second_peptide_sample = next(results)
+    result_df1 = pandas.DataFrame({
+        "allele": result_allele_encoding.alleles,
+        "peptide": first_peptide_sample.sequences,
+    })
+    result_df1["length"] = result_df1.peptide.str.len()
+    random_negatives = result_df1.groupby(["allele", "length"]).peptide.count().unstack()
+    real_data = data.groupby(["allele", "length"]).peptide.count().unstack().fillna(0)
+    real_binders = data.loc[
+        data.affinity <= 500
+    ].groupby(["allele", "length"]).peptide.count().unstack().fillna(0)
+    real_nonbinders = data.loc[
+        data.affinity > 500
+    ].groupby(["allele", "length"]).peptide.count().unstack().fillna(0)
+    for length in random_negatives.columns:
+        if length not in real_nonbinders.columns:
+            real_nonbinders[length] = 0
+    total_nonbinders = random_negatives + real_nonbinders
+    assert (total_nonbinders.loc["HLA-A*02:01"] == 2.0).all()
+    assert (total_nonbinders.loc["HLA-B*44:02"] == 1126).all()