diff --git a/mhcflurry/random_negative_peptides.py b/mhcflurry/random_negative_peptides.py index 96f4866512ffa086b32b554751e53765e17b5bd3..f81e29ddc052ecc73c355b28c64e18bcb6f3203d 100644 --- a/mhcflurry/random_negative_peptides.py +++ b/mhcflurry/random_negative_peptides.py @@ -9,6 +9,11 @@ from .common import amino_acid_distribution, random_peptides class RandomNegativePeptides(object): + """ + Generate random negative (peptide, allele) pairs. These are used during + model training, where they are resampled at each epoch. + """ + hyperparameter_defaults = HyperparameterDefaults( random_negative_rate=0.0, random_negative_constant=25, @@ -17,7 +22,30 @@ class RandomNegativePeptides(object): random_negative_method="recommended", random_negative_binder_threshold=None, random_negative_lengths=[8,9,10,11,12,13,14,15]) - + """ + Hyperperameters for random negative peptides. + + Number of random negatives will be: + random_negative_rate * (num measurements) + random_negative_constant + + where the exact meaning of (num measurements) depends on the particular + random_negative_method in use. + + If random_negative_match_distribution is True, then the amino acid + frequencies of the training data peptides are used to generate the + random peptides. + + Valid values for random_negative_method are: + "by_length": used for allele-specific prediction. See description in + `RandomNegativePeptides.plan_by_length` method. + "by_allele": used for pan-allele prediction. See + `RandomNegativePeptides.plan_by_allele` method. + "by_allele_equalize_nonbinders": used for pan-allele prediction. See + `RandomNegativePeptides.plan_by_allele_equalize_nonbinders` method. + "recommended": the default. Use by_length if the predictor is allele- + specific and by_allele if it's pan-allele. + + """ def __init__(self, **hyperparameters): self.hyperparameters = self.hyperparameter_defaults.with_defaults( @@ -26,6 +54,22 @@ class RandomNegativePeptides(object): self.aa_distribution = None def plan(self, peptides, affinities, alleles=None, inequalities=None): + """ + Calculate the number of random negatives for each allele and peptide + length. Call this once after instantiating the object. + + Parameters + ---------- + peptides : list of string + affinities : list of float + alleles : list of string, optional + inequalities : list of string (">", "<", or "="), optional + + Returns + ------- + pandas.DataFrame indicating number of random negatives for each length + and allele. + """ peptides = pandas.Series(peptides, copy=False) peptide_lengths = peptides.str.len() @@ -83,11 +127,15 @@ class RandomNegativePeptides(object): def plan_by_length(self, df_all, df_binders=None, df_nonbinders=None): """ + Generate a random negative plan using the "by_length" policy. + + Parameters are as in the `plan` method. No return value. + + Used for allele-specific predictors. Does not work well for pan-allele. + Different numbers of random negatives per length. Alleles are sampled proportionally to the number of times they are used in the training data. - - Used for allele-specific predictors. Does not work well for pan-allele. """ assert list(df_all.allele.unique()) == [""], ( "by_length only recommended for allele specific prediction") @@ -110,6 +158,10 @@ class RandomNegativePeptides(object): def plan_by_allele(self, df_all, df_binders=None, df_nonbinders=None): """ + Generate a random negative plan using the "by_allele" policy. + + Parameters are as in the `plan` method. No return value. + For each allele, a particular number of random negatives are used for all lengths. Across alleles, the number of random negatives varies; within an allele, the number of random negatives for each @@ -138,6 +190,19 @@ class RandomNegativePeptides(object): def plan_by_allele_equalize_nonbinders( self, df_all, df_binders, df_nonbinders): """ + Generate a random negative plan using the "by_allele" policy. + + Parameters are as in the `plan` method. No return value. + + Requires that the random_negative_binder_threshold hyperparameter is set. + + In a first step, the number of random negatives selected by the + "by_allele" method are added (see `plan_by_allele`). Then, the total + number of non-binders are calculated for each allele and length. This + total includes non-binder measurements in the training data plus the + random negative peptides added in the first step. In a second step, + additional random negative peptides are added so that for each allele, + all peptide lengths have the same total number of non-binders. """ assert df_binders is not None assert df_nonbinders is not None @@ -163,6 +228,15 @@ class RandomNegativePeptides(object): self.plan_df = new_plan.astype(int) def get_alleles(self): + """ + Get the list of alleles corresponding to each random negative peptide + as returned by `get_peptides`. This does NOT change and can be safely + called once and reused. + + Returns + ------- + list of string + """ assert self.plan_df is not None, "Call plan() first" alleles = [] for allele, row in self.plan_df.iterrows(): @@ -171,6 +245,15 @@ class RandomNegativePeptides(object): return alleles def get_peptides(self): + """ + Get the list of random negative peptides. This will be different each + time the method is called. + + Returns + ------- + list of string + + """ assert self.plan_df is not None, "Call plan() first" peptides = [] for allele, row in self.plan_df.iterrows(): @@ -184,4 +267,11 @@ class RandomNegativePeptides(object): return peptides def get_total_count(self): + """ + Total number of planned random negative peptides. + + Returns + ------- + int + """ return self.plan_df.sum().sum() \ No newline at end of file