diff --git a/downloads-generation/models_class1_pan_unselected/generate_hyperparameters.py b/downloads-generation/models_class1_pan_unselected/generate_hyperparameters.py index a6d3297e0318a0a0ad7e0076083b0a9e8b87fef3..08873dfc7f0d710f2293b0d04be9379c64383575 100644 --- a/downloads-generation/models_class1_pan_unselected/generate_hyperparameters.py +++ b/downloads-generation/models_class1_pan_unselected/generate_hyperparameters.py @@ -41,7 +41,7 @@ base_hyperparameters = { 'random_negative_distribution_smoothing': 0.0, 'random_negative_match_distribution': True, 'random_negative_rate': 1.0, - 'random_negative_method': 'by_allele', + 'random_negative_method': 'by_allele_equalize_nonbinders', 'train_data': { 'pretrain': True, 'pretrain_peptides_per_epoch': 64, diff --git a/mhcflurry/class1_neural_network.py b/mhcflurry/class1_neural_network.py index 8b8f19f88d3b172dd00d800f8a12e5ba90d7bab5..e6d33a5727075a5db42727af1e381d589769bf6e 100644 --- a/mhcflurry/class1_neural_network.py +++ b/mhcflurry/class1_neural_network.py @@ -6,6 +6,7 @@ import itertools import os import logging import random +import math import numpy import pandas @@ -696,7 +697,14 @@ class Class1NeuralNetwork(object): df["inequality"] = "=" else: df["inequality"] = inequalities + df_nonbinders = None if self.hyperparameters['random_negative_binder_threshold']: + df_nonbinders = df.loc[ + (df.inequality != "<") & + (df.affinity > self.hyperparameters[ + 'random_negative_binder_threshold' + ]) + ] df = df.loc[ (df.inequality != ">") & (df.affinity <= self.hyperparameters[ @@ -762,8 +770,8 @@ class Class1NeuralNetwork(object): num_for_allele = len(sub_df) * ( self.hyperparameters['random_negative_rate'] ) + self.hyperparameters['random_negative_constant'] - num_per_length = int( - num_for_allele / len(random_negative_lengths)) + num_per_length = int(math.ceil( + num_for_allele / len(random_negative_lengths))) total_random_peptides_per_length += num_per_length allele_to_num_per_length[allele] = num_per_length @@ -792,6 +800,66 @@ class Class1NeuralNetwork(object): # important NOT to shuffle peptides, since they correspond with # specific alleles. return EncodableSequences.create(peptides) + elif self.hyperparameters["random_negative_method"] == "by_allele_equalize_nonbinders": + assert df_nonbinders is not None + + allele_and_length_to_num_random_negative = {} + + # First pass + allele_to_num_per_length_first_pass = {} + for (allele, sub_df) in df.groupby("allele"): + num_for_allele = len(sub_df) * ( + self.hyperparameters['random_negative_rate'] + ) + self.hyperparameters['random_negative_constant'] + num_per_length = int(math.ceil( + num_for_allele / len(random_negative_lengths))) + allele_to_num_per_length_first_pass[allele] = num_per_length + + # Second pass + for (allele, sub_df) in df_nonbinders.groupby("allele"): + nonbinders_by_length = sub_df.peptide.str.len().value_counts() + nonbinders_by_length = nonbinders_by_length.reindex( + random_negative_lengths + ).fillna(0) + + total_nonbinders_by_length = ( + nonbinders_by_length + + allele_to_num_per_length_first_pass[allele]) + max_total_nonbinders_by_length = total_nonbinders_by_length.max() + for (length, total) in total_nonbinders_by_length.items(): + allele_and_length_to_num_random_negative[ + (allele, length) + ] = math.ceil( + allele_to_num_per_length_first_pass[allele] + + max_total_nonbinders_by_length - + total) + + plan_df = [] + for ( + (allele, length), num_random_negative) in ( + allele_and_length_to_num_random_negative.items()): + plan_df.append((allele, length, num_random_negative)) + plan_df = pandas.DataFrame( + plan_df, columns=["allele", "length", "num"]) + + logging.info("Random negative plan:\n%s", plan_df) + + if allele_encoding is not None: + random_negative_alleles = [] + for _, row in plan_df.iterrows(): + random_negative_alleles.extend([row.allele] * int(row.num)) + + def sample_peptides(): + peptides = [] + for _, row in plan_df.iterrows(): + peptides.extend( + random_peptides( + row.num, + length=row.length, + distribution=aa_distribution)) + # important NOT to shuffle peptides, since they correspond with + # specific alleles. + return EncodableSequences.create(peptides) else: raise NotImplementedError( self.hyperparameters["random_negative_method"]) diff --git a/test/test_random_negative_peptides.py b/test/test_random_negative_peptides.py new file mode 100644 index 0000000000000000000000000000000000000000..c3c978f27c38d15ccbaac536eab13de5640b95d0 --- /dev/null +++ b/test/test_random_negative_peptides.py @@ -0,0 +1,142 @@ +from mhcflurry import amino_acid +from nose.tools import eq_ +from numpy.testing import assert_equal +import numpy +import pandas +import math + +from mhcflurry import Class1NeuralNetwork +from mhcflurry.encodable_sequences import EncodableSequences +from mhcflurry.allele_encoding import AlleleEncoding +from mhcflurry.common import random_peptides + + +def test_random_negative_peptides_by_allele(): + network = Class1NeuralNetwork( + random_negative_method="by_allele", + random_negative_binder_threshold=500, + random_negative_rate=1.0, + random_negative_constant=2) + + allele_to_sequence = { + "HLA-A*02:01": "AAAAA", + "HLA-B*44:02": "CCCCC", + "HLA-C*07:02": "EEEEE", + } + data_rows = [ + ("HLA-A*02:01", "SIINFEKL", 400, "="), + ("HLA-A*02:01", "SIINFEKLL", 300, "="), + ("HLA-A*02:01", "SIINFEKLL", 300, "="), + ("HLA-A*02:01", "SIINFEKLQ", 1000, "="), + ("HLA-A*02:01", "SIINFEKLZZ", 12000, ">"), + ] + for peptide in random_peptides(1000, length=9): + data_rows.append(("HLA-B*44:02", peptide, 100, "=")) + for peptide in random_peptides(1000, length=9): + data_rows.append(("HLA-B*44:02", peptide, 1000, "=")) + for peptide in random_peptides(5, length=10): + data_rows.append(("HLA-B*44:02", peptide, 100, "=")) + + data = pandas.DataFrame( + data_rows, + columns=["allele", "peptide", "affinity", "inequality"]) + data["length"] = data.peptide.str.len() + + peptides = EncodableSequences.create(data.peptide.values) + allele_encoding = AlleleEncoding(data.allele.values, allele_to_sequence) + + results = network.random_negatives_generator( + peptides, + data.affinity.values, + allele_encoding, + data.inequality.values + ) + result_allele_encoding = next(results) + first_peptide_sample = next(results) + second_peptide_sample = next(results) + + result_df1 = pandas.DataFrame({ + "allele": result_allele_encoding.alleles, + "peptide": first_peptide_sample.sequences, + }) + result_df1["length"] = result_df1.peptide.str.len() + random_negatives = result_df1.groupby(["allele", "length"]).peptide.count().unstack() + real_data = data.groupby(["allele", "length"]).peptide.count().unstack().fillna(0) + real_binders = data.loc[ + data.affinity <= 500 + ].groupby(["allele", "length"]).peptide.count().unstack().fillna(0) + real_nonbinders = data.loc[ + data.affinity > 500 + ].groupby(["allele", "length"]).peptide.count().unstack().fillna(0) + total_nonbinders = random_negatives + real_nonbinders + + assert (random_negatives.loc["HLA-A*02:01"] == 1.0).all() + assert (random_negatives.loc["HLA-B*44:02"] == math.ceil(1007 / 8)).all(), ( + random_negatives.loc["HLA-B*44:02"], math.ceil(1007 / 8)) + + + +def test_random_negative_peptides_by_allele(): + network = Class1NeuralNetwork( + random_negative_method="by_allele_equalize_nonbinders", + random_negative_binder_threshold=500, + random_negative_rate=1.0, + random_negative_constant=2) + + allele_to_sequence = { + "HLA-A*02:01": "AAAAA", + "HLA-B*44:02": "CCCCC", + "HLA-C*07:02": "EEEEE", + } + data_rows = [ + ("HLA-A*02:01", "SIINFEKL", 400, "="), + ("HLA-A*02:01", "SIINFEKLL", 300, "="), + ("HLA-A*02:01", "SIINFEKLL", 300, "="), + ("HLA-A*02:01", "SIINFEKLQ", 1000, "="), + ("HLA-A*02:01", "SIINFEKLZZ", 12000, ">"), + ] + for peptide in random_peptides(1000, length=9): + data_rows.append(("HLA-B*44:02", peptide, 100, "=")) + for peptide in random_peptides(1000, length=9): + data_rows.append(("HLA-B*44:02", peptide, 1000, "=")) + for peptide in random_peptides(5, length=10): + data_rows.append(("HLA-B*44:02", peptide, 100, "=")) + + data = pandas.DataFrame( + data_rows, + columns=["allele", "peptide", "affinity", "inequality"]) + data["length"] = data.peptide.str.len() + + peptides = EncodableSequences.create(data.peptide.values) + allele_encoding = AlleleEncoding(data.allele.values, allele_to_sequence) + + results = network.random_negatives_generator( + peptides, + data.affinity.values, + allele_encoding, + data.inequality.values + ) + result_allele_encoding = next(results) + first_peptide_sample = next(results) + second_peptide_sample = next(results) + + result_df1 = pandas.DataFrame({ + "allele": result_allele_encoding.alleles, + "peptide": first_peptide_sample.sequences, + }) + result_df1["length"] = result_df1.peptide.str.len() + random_negatives = result_df1.groupby(["allele", "length"]).peptide.count().unstack() + real_data = data.groupby(["allele", "length"]).peptide.count().unstack().fillna(0) + real_binders = data.loc[ + data.affinity <= 500 + ].groupby(["allele", "length"]).peptide.count().unstack().fillna(0) + real_nonbinders = data.loc[ + data.affinity > 500 + ].groupby(["allele", "length"]).peptide.count().unstack().fillna(0) + for length in random_negatives.columns: + if length not in real_nonbinders.columns: + real_nonbinders[length] = 0 + total_nonbinders = random_negatives + real_nonbinders + + assert (total_nonbinders.loc["HLA-A*02:01"] == 2.0).all() + assert (total_nonbinders.loc["HLA-B*44:02"] == 1126).all()