diff --git a/mhcflurry/class1_neural_network.py b/mhcflurry/class1_neural_network.py index e6d33a5727075a5db42727af1e381d589769bf6e..1299dd8deb907d59fb174cfa31cd0786778577db 100644 --- a/mhcflurry/class1_neural_network.py +++ b/mhcflurry/class1_neural_network.py @@ -18,6 +18,7 @@ from .regression_target import to_ic50, from_ic50 from .common import random_peptides, amino_acid_distribution from .custom_loss import get_loss from .data_dependent_weights_initialization import lsuv_init +from .random_negative_peptides import RandomNegativePeptides DEFAULT_PREDICT_BATCH_SIZE = 4096 @@ -91,16 +92,10 @@ class Class1NeuralNetwork(object): early_stopping=True, minibatch_size=128, data_dependent_initialization_method=None, - random_negative_rate=0.0, - random_negative_constant=25, random_negative_affinity_min=20000.0, random_negative_affinity_max=50000.0, - random_negative_match_distribution=True, - random_negative_distribution_smoothing=0.0, - random_negative_output_indices=None, - random_negative_method="by_length", - random_negative_binder_threshold=None, - random_negative_lengths=[8,9,10,11,12,13,14,15]) + random_negative_output_indices=None).extend( + RandomNegativePeptides.hyperparameter_defaults) """ Hyperparameters for neural network training. """ @@ -677,202 +672,6 @@ class Class1NeuralNetwork(object): fit_info["num_points"] = mutable_generator_state["yielded_values"] self.fit_info.append(dict(fit_info)) - def random_negatives_generator( - self, - encodable_peptides, - affinities, - allele_encoding, - inequalities): - - random_negative_lengths = self.hyperparameters['random_negative_lengths'] - - df = pandas.DataFrame({ - "peptide": encodable_peptides.sequences, - "affinity": affinities, - }) - if allele_encoding is not None: - df["allele"] = allele_encoding.alleles - df["length"] = df.peptide.str.len() - if inequalities is None: - df["inequality"] = "=" - else: - df["inequality"] = inequalities - df_nonbinders = None - if self.hyperparameters['random_negative_binder_threshold']: - df_nonbinders = df.loc[ - (df.inequality != "<") & - (df.affinity > self.hyperparameters[ - 'random_negative_binder_threshold' - ]) - ] - df = df.loc[ - (df.inequality != ">") & - (df.affinity <= self.hyperparameters[ - 'random_negative_binder_threshold' - ]) - ] - - aa_distribution = None - if self.hyperparameters['random_negative_match_distribution']: - aa_distribution = amino_acid_distribution( - encodable_peptides.sequences, - smoothing=self.hyperparameters[ - 'random_negative_distribution_smoothing' - ]) - logging.info( - "Using amino acid distribution for random negative:\n%s" % ( - str(aa_distribution.to_dict()))) - - random_negative_alleles = None - if self.hyperparameters["random_negative_method"] == "by_length": - # Different numbers of random negatives per length. Alleles are - # sampled proportionally to the number of times they are used in - # the training data. - length_to_num_random_negative = {} - random_negative_lengths = self.hyperparameters[ - 'random_negative_lengths' - ] - - length_counts = df.length.value_counts().to_dict() - for length in random_negative_lengths: - length_to_num_random_negative[length] = int( - length_counts.get(length, 0) * - self.hyperparameters['random_negative_rate'] + - self.hyperparameters['random_negative_constant']) - length_to_num_random_negative = pandas.Series( - length_to_num_random_negative) - total_random_negatives = length_to_num_random_negative.sum() - logging.info("Random negative counts per length:\n%s" % ( - str(length_to_num_random_negative.to_dict()))) - - if allele_encoding is not None: - random_negative_alleles = df.allele.sample( - n=total_random_negatives, replace=True).values - - def sample_peptides(): - peptides = [] - for (length, count) in length_to_num_random_negative.items(): - peptides.extend( - random_peptides( - count, - length=length, - distribution=aa_distribution)) - random.shuffle(peptides) # important - return EncodableSequences.create(peptides) - elif self.hyperparameters["random_negative_method"] == "by_allele": - # For each allele, a particular number of random negatives are used - # for all lengths. Across alleles, the number of random negatives - # varies; within an allele, the number of random negatives for each - # length is a constant - allele_to_num_per_length = {} - total_random_peptides_per_length = 0 - for (allele, sub_df) in df.groupby("allele"): - num_for_allele = len(sub_df) * ( - self.hyperparameters['random_negative_rate'] - ) + self.hyperparameters['random_negative_constant'] - num_per_length = int(math.ceil( - num_for_allele / len(random_negative_lengths))) - total_random_peptides_per_length += num_per_length - allele_to_num_per_length[allele] = num_per_length - - if allele_encoding is not None: - random_negative_alleles = [] - for _ in random_negative_lengths: - for (allele, num) in allele_to_num_per_length.items(): - random_negative_alleles.extend([allele] * num) - - numpy.testing.assert_equal( - len(random_negative_alleles), - total_random_peptides_per_length * len(random_negative_lengths)) - - logging.info( - "Random negative counts for each length by allele:\n%s" % ( - str(allele_to_num_per_length))) - - def sample_peptides(): - peptides = [] - for length in random_negative_lengths: - peptides.extend( - random_peptides( - total_random_peptides_per_length, - length=length, - distribution=aa_distribution)) - # important NOT to shuffle peptides, since they correspond with - # specific alleles. - return EncodableSequences.create(peptides) - elif self.hyperparameters["random_negative_method"] == "by_allele_equalize_nonbinders": - assert df_nonbinders is not None - - allele_and_length_to_num_random_negative = {} - - # First pass - allele_to_num_per_length_first_pass = {} - for (allele, sub_df) in df.groupby("allele"): - num_for_allele = len(sub_df) * ( - self.hyperparameters['random_negative_rate'] - ) + self.hyperparameters['random_negative_constant'] - num_per_length = int(math.ceil( - num_for_allele / len(random_negative_lengths))) - allele_to_num_per_length_first_pass[allele] = num_per_length - - # Second pass - for (allele, sub_df) in df_nonbinders.groupby("allele"): - nonbinders_by_length = sub_df.peptide.str.len().value_counts() - nonbinders_by_length = nonbinders_by_length.reindex( - random_negative_lengths - ).fillna(0) - - total_nonbinders_by_length = ( - nonbinders_by_length + - allele_to_num_per_length_first_pass[allele]) - max_total_nonbinders_by_length = total_nonbinders_by_length.max() - for (length, total) in total_nonbinders_by_length.items(): - allele_and_length_to_num_random_negative[ - (allele, length) - ] = math.ceil( - allele_to_num_per_length_first_pass[allele] + - max_total_nonbinders_by_length - - total) - - plan_df = [] - for ( - (allele, length), num_random_negative) in ( - allele_and_length_to_num_random_negative.items()): - plan_df.append((allele, length, num_random_negative)) - plan_df = pandas.DataFrame( - plan_df, columns=["allele", "length", "num"]) - - logging.info("Random negative plan:\n%s", plan_df) - - if allele_encoding is not None: - random_negative_alleles = [] - for _, row in plan_df.iterrows(): - random_negative_alleles.extend([row.allele] * int(row.num)) - - def sample_peptides(): - peptides = [] - for _, row in plan_df.iterrows(): - peptides.extend( - random_peptides( - row.num, - length=row.length, - distribution=aa_distribution)) - # important NOT to shuffle peptides, since they correspond with - # specific alleles. - return EncodableSequences.create(peptides) - else: - raise NotImplementedError( - self.hyperparameters["random_negative_method"]) - - random_negative_allele_encoding = None - if random_negative_alleles is not None: - random_negative_allele_encoding = AlleleEncoding( - random_negative_alleles, borrow_from=allele_encoding) - - yield random_negative_allele_encoding - while True: - yield sample_peptides() - def fit( self, peptides, @@ -937,14 +736,21 @@ class Class1NeuralNetwork(object): peptide_encoding = self.peptides_to_network_input(encodable_peptides) fit_info = collections.defaultdict(list) - random_negatives_generator = self.random_negatives_generator( - encodable_peptides=encodable_peptides, + random_negatives_planner = RandomNegativePeptides( + **RandomNegativePeptides.hyperparameter_defaults.subselect( + self.hyperparameters)) + random_negatives_planner.plan( + peptides=encodable_peptides.sequences, affinities=affinities, - allele_encoding=allele_encoding, + alleles=allele_encoding.alleles if allele_encoding else None, inequalities=inequalities) - random_negatives_allele_encoding = next(random_negatives_generator) - num_random_negatives = len( - next(random_negatives_generator).sequences) + + random_negatives_allele_encoding = None + if allele_encoding is not None: + random_negatives_allele_encoding = AlleleEncoding( + random_negatives_planner.get_alleles(), + borrow_from=allele_encoding) + num_random_negatives = random_negatives_planner.get_total_count() y_values = from_ic50(numpy.array(affinities, copy=False)) assert numpy.isnan(y_values).sum() == 0, y_values @@ -1107,7 +913,8 @@ class Class1NeuralNetwork(object): last_progress_print = None x_dict_with_random_negatives = {} for i in range(self.hyperparameters['max_epochs']): - random_negative_peptides = next(random_negatives_generator) + random_negative_peptides = EncodableSequences.create( + random_negatives_planner.get_peptides()) random_negative_peptides_encoding = ( self.peptides_to_network_input(random_negative_peptides)) diff --git a/mhcflurry/random_negative_peptides.py b/mhcflurry/random_negative_peptides.py new file mode 100644 index 0000000000000000000000000000000000000000..80fe7ab4e4cbad42376596550c14f7998a16ac0f --- /dev/null +++ b/mhcflurry/random_negative_peptides.py @@ -0,0 +1,179 @@ +import logging +import math + +import numpy +import pandas + +from .hyperparameters import HyperparameterDefaults +from .common import amino_acid_distribution, random_peptides + +class RandomNegativePeptides(object): + hyperparameter_defaults = HyperparameterDefaults( + random_negative_rate=0.0, + random_negative_constant=25, + random_negative_match_distribution=True, + random_negative_distribution_smoothing=0.0, + random_negative_method="by_length", + random_negative_binder_threshold=None, + random_negative_lengths=[8,9,10,11,12,13,14,15]) + + + def __init__(self, **hyperparameters): + self.hyperparameters = self.hyperparameter_defaults.with_defaults( + hyperparameters) + self.plan_df = None + self.aa_distribution = None + + def plan(self, peptides, affinities, alleles=None, inequalities=None): + peptides = pandas.Series(peptides, copy=False) + peptide_lengths = peptides.str.len() + + if self.hyperparameters['random_negative_match_distribution']: + self.aa_distribution = amino_acid_distribution( + peptides.values, + smoothing=self.hyperparameters[ + 'random_negative_distribution_smoothing' + ]) + logging.info( + "Using amino acid distribution for random negative:\n%s" % ( + str(self.aa_distribution.to_dict()))) + + df_all = pandas.DataFrame({ + 'length': peptide_lengths, + 'affinity': affinities, + }) + df_all["allele"] = "" if alleles is None else alleles + df_all["inequality"] = "=" if inequalities is None else inequalities + + df_binders = None + df_nonbinders = None + if self.hyperparameters['random_negative_binder_threshold']: + df_nonbinders = df_all.loc[ + (df_all.inequality != "<") & + (df_all.affinity > self.hyperparameters[ + 'random_negative_binder_threshold' + ]) + ] + df_binders = df_all.loc[ + (df_all.inequality != ">") & + (df_all.affinity <= self.hyperparameters[ + 'random_negative_binder_threshold' + ]) + ] + + method = self.hyperparameters['random_negative_method'] + function = { + 'by_length': self.plan_by_length, + 'by_allele': self.plan_by_allele, + 'by_allele_equalize_nonbinders': + self.plan_by_allele_equalize_nonbinders, + }[method] + function(df_all, df_binders, df_nonbinders) + assert self.plan_df is not None + logging.info("Random negative plan [%s]:\n%s", method, self.plan_df) + return self.plan_df + + def plan_by_length(self, df_all, df_binders=None, df_nonbinders=None): + """ + Different numbers of random negatives per length. Alleles are sampled + proportionally to the number of times they are used in the training + data. + + Used for allele-specific predictors. Does not work well for pan-allele. + """ + assert list(df_all.allele.unique()) == [""], ( + "by_length only recommended for allele specific prediction") + + df = df_all if df_binders is None else df_binders + lengths = self.hyperparameters['random_negative_lengths'] + + length_to_num_random_negative = {} + length_counts = df.length.value_counts().to_dict() + for length in lengths: + length_to_num_random_negative[length] = int( + length_counts.get(length, 0) * + self.hyperparameters['random_negative_rate'] + + self.hyperparameters['random_negative_constant']) + + plan_df = pandas.DataFrame(index=sorted(df.allele.unique())) + for length in lengths: + plan_df[length] = length_to_num_random_negative[length] + self.plan_df = plan_df.astype(int) + + def plan_by_allele(self, df_all, df_binders=None, df_nonbinders=None): + """ + For each allele, a particular number of random negatives are used + for all lengths. Across alleles, the number of random negatives + varies; within an allele, the number of random negatives for each + length is a constant + """ + allele_to_num_per_length = {} + total_random_peptides_per_length = 0 + df = df_all if df_binders is None else df_binders + lengths = self.hyperparameters['random_negative_lengths'] + all_alleles = df_all.allele.unique() + for allele in all_alleles: + sub_df = df.loc[df.allele == allele] + num_for_allele = len(sub_df) * ( + self.hyperparameters['random_negative_rate'] + ) + self.hyperparameters['random_negative_constant'] + num_per_length = int(math.ceil( + num_for_allele / len(lengths))) + total_random_peptides_per_length += num_per_length + allele_to_num_per_length[allele] = num_per_length + + plan_df = pandas.DataFrame(index=sorted(df.allele.unique())) + for length in lengths: + plan_df[length] = plan_df.index.map(allele_to_num_per_length) + self.plan_df = plan_df.astype(int) + + def plan_by_allele_equalize_nonbinders( + self, df_all, df_binders, df_nonbinders): + """ + """ + assert df_binders is not None + assert df_nonbinders is not None + + lengths = self.hyperparameters['random_negative_lengths'] + + self.plan_by_allele(df_all, df_binders, df_nonbinders) + first_pass_plan = self.plan_df + self.plan_df = None + + new_plan = first_pass_plan.copy() + new_plan[:] = numpy.nan + + for (allele, first_pass_per_length) in first_pass_plan.iterrows(): + real_nonbinders_by_length = df_nonbinders.loc[ + df_nonbinders.allele == allele + ].length.value_counts().reindex(lengths).fillna(0) + total_nonbinders_by_length = ( + real_nonbinders_by_length + first_pass_per_length) + new_plan.loc[allele] = first_pass_per_length + ( + total_nonbinders_by_length.max() - total_nonbinders_by_length) + + self.plan_df = new_plan.astype(int) + + def get_alleles(self): + assert self.plan_df is not None, "Call plan() first" + alleles = [] + for allele, row in self.plan_df.iterrows(): + alleles.extend([allele] * int(row.sum())) + assert len(alleles) == self.get_total_count() + return alleles + + def get_peptides(self): + assert self.plan_df is not None, "Call plan() first" + peptides = [] + for allele, row in self.plan_df.iterrows(): + for (length, num) in row.items(): + peptides.extend( + random_peptides( + num, + length=length, + distribution=self.aa_distribution)) + assert len(peptides) == self.get_total_count() + return peptides + + def get_total_count(self): + return self.plan_df.sum().sum() \ No newline at end of file diff --git a/test/test_random_negative_peptides.py b/test/test_random_negative_peptides.py index c3c978f27c38d15ccbaac536eab13de5640b95d0..06765cb4aba05ad43901d72e014e79d4ca73bec2 100644 --- a/test/test_random_negative_peptides.py +++ b/test/test_random_negative_peptides.py @@ -1,3 +1,7 @@ +import logging +logging.getLogger('matplotlib').disabled = True +logging.getLogger('tensorflow').disabled = True + from mhcflurry import amino_acid from nose.tools import eq_ from numpy.testing import assert_equal @@ -9,20 +13,16 @@ from mhcflurry import Class1NeuralNetwork from mhcflurry.encodable_sequences import EncodableSequences from mhcflurry.allele_encoding import AlleleEncoding from mhcflurry.common import random_peptides +from mhcflurry.random_negative_peptides import RandomNegativePeptides def test_random_negative_peptides_by_allele(): - network = Class1NeuralNetwork( + planner = RandomNegativePeptides( random_negative_method="by_allele", random_negative_binder_threshold=500, random_negative_rate=1.0, random_negative_constant=2) - allele_to_sequence = { - "HLA-A*02:01": "AAAAA", - "HLA-B*44:02": "CCCCC", - "HLA-C*07:02": "EEEEE", - } data_rows = [ ("HLA-A*02:01", "SIINFEKL", 400, "="), ("HLA-A*02:01", "SIINFEKLL", 300, "="), @@ -42,25 +42,18 @@ def test_random_negative_peptides_by_allele(): columns=["allele", "peptide", "affinity", "inequality"]) data["length"] = data.peptide.str.len() - peptides = EncodableSequences.create(data.peptide.values) - allele_encoding = AlleleEncoding(data.allele.values, allele_to_sequence) - - results = network.random_negatives_generator( - peptides, - data.affinity.values, - allele_encoding, - data.inequality.values - ) - result_allele_encoding = next(results) - first_peptide_sample = next(results) - second_peptide_sample = next(results) - - result_df1 = pandas.DataFrame({ - "allele": result_allele_encoding.alleles, - "peptide": first_peptide_sample.sequences, + planner.plan( + peptides=data.peptide.values, + affinities=data.affinity.values, + alleles=data.allele.values, + inequalities=data.inequality.values) + result_df = pandas.DataFrame({ + "allele": planner.get_alleles(), + "peptide": planner.get_peptides(), }) - result_df1["length"] = result_df1.peptide.str.len() - random_negatives = result_df1.groupby(["allele", "length"]).peptide.count().unstack() + + result_df["length"] = result_df.peptide.str.len() + random_negatives = result_df.groupby(["allele", "length"]).peptide.count().unstack() real_data = data.groupby(["allele", "length"]).peptide.count().unstack().fillna(0) real_binders = data.loc[ data.affinity <= 500 @@ -77,23 +70,20 @@ def test_random_negative_peptides_by_allele(): def test_random_negative_peptides_by_allele(): - network = Class1NeuralNetwork( + planner = RandomNegativePeptides( random_negative_method="by_allele_equalize_nonbinders", random_negative_binder_threshold=500, random_negative_rate=1.0, random_negative_constant=2) - - allele_to_sequence = { - "HLA-A*02:01": "AAAAA", - "HLA-B*44:02": "CCCCC", - "HLA-C*07:02": "EEEEE", - } data_rows = [ ("HLA-A*02:01", "SIINFEKL", 400, "="), ("HLA-A*02:01", "SIINFEKLL", 300, "="), ("HLA-A*02:01", "SIINFEKLL", 300, "="), ("HLA-A*02:01", "SIINFEKLQ", 1000, "="), ("HLA-A*02:01", "SIINFEKLZZ", 12000, ">"), + ("HLA-C*01:02", "SIINFEKLQ", 100, "="), # only binders + ("HLA-C*07:02", "SIINFEKLL", 1000, "=") # only non-binders + ] for peptide in random_peptides(1000, length=9): data_rows.append(("HLA-B*44:02", peptide, 100, "=")) @@ -107,25 +97,17 @@ def test_random_negative_peptides_by_allele(): columns=["allele", "peptide", "affinity", "inequality"]) data["length"] = data.peptide.str.len() - peptides = EncodableSequences.create(data.peptide.values) - allele_encoding = AlleleEncoding(data.allele.values, allele_to_sequence) - - results = network.random_negatives_generator( - peptides, - data.affinity.values, - allele_encoding, - data.inequality.values - ) - result_allele_encoding = next(results) - first_peptide_sample = next(results) - second_peptide_sample = next(results) - - result_df1 = pandas.DataFrame({ - "allele": result_allele_encoding.alleles, - "peptide": first_peptide_sample.sequences, + planner.plan( + peptides=data.peptide.values, + affinities=data.affinity.values, + alleles=data.allele.values, + inequalities=data.inequality.values) + result_df = pandas.DataFrame({ + "allele": planner.get_alleles(), + "peptide": planner.get_peptides(), }) - result_df1["length"] = result_df1.peptide.str.len() - random_negatives = result_df1.groupby(["allele", "length"]).peptide.count().unstack() + result_df["length"] = result_df.peptide.str.len() + random_negatives = result_df.groupby(["allele", "length"]).peptide.count().unstack() real_data = data.groupby(["allele", "length"]).peptide.count().unstack().fillna(0) real_binders = data.loc[ data.affinity <= 500 @@ -136,7 +118,11 @@ def test_random_negative_peptides_by_allele(): for length in random_negatives.columns: if length not in real_nonbinders.columns: real_nonbinders[length] = 0 - total_nonbinders = random_negatives + real_nonbinders + total_nonbinders = ( + random_negatives.reindex(real_data.index).fillna(0) + + real_nonbinders.reindex(real_data.index).fillna(0)) + + assert (total_nonbinders.loc["HLA-A*02:01"] == 2.0).all(), total_nonbinders + assert (total_nonbinders.loc["HLA-B*44:02"] == 1126).all(), total_nonbinders - assert (total_nonbinders.loc["HLA-A*02:01"] == 2.0).all() - assert (total_nonbinders.loc["HLA-B*44:02"] == 1126).all() + assert not total_nonbinders.isnull().any().any()