From ec75a7b635f0aa301ef00dcd140156d1f7eac48a Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Sat, 11 Feb 2017 10:32:40 -0500 Subject: [PATCH] Add test for PresentationModel --- .../percent_rank_transform.py | 4 +- .../mhcflurry_trained_on_hits.py | 24 ++++--- .../presentation_model.py | 15 ++--- test/test_antigen_presentation.py | 67 ++++++++++++++++--- 4 files changed, 83 insertions(+), 27 deletions(-) diff --git a/mhcflurry/antigen_presentation/percent_rank_transform.py b/mhcflurry/antigen_presentation/percent_rank_transform.py index abfbf07f..0b21298f 100644 --- a/mhcflurry/antigen_presentation/percent_rank_transform.py +++ b/mhcflurry/antigen_presentation/percent_rank_transform.py @@ -33,4 +33,6 @@ class PercentRankTransform(object): assert self.cdf is not None assert self.bin_edges is not None indices = numpy.searchsorted(self.bin_edges, values) - return self.cdf[indices] + result = self.cdf[indices] + assert len(result) == len(values) + return result diff --git a/mhcflurry/antigen_presentation/presentation_component_models/mhcflurry_trained_on_hits.py b/mhcflurry/antigen_presentation/presentation_component_models/mhcflurry_trained_on_hits.py index 5eb8153c..7ea2272f 100644 --- a/mhcflurry/antigen_presentation/presentation_component_models/mhcflurry_trained_on_hits.py +++ b/mhcflurry/antigen_presentation/presentation_component_models/mhcflurry_trained_on_hits.py @@ -165,7 +165,8 @@ class MHCflurryTrainedOnHits(PresentationComponentModel): (allele,) = alleles mhcflurry_allele = normalize_allele_name(allele) assert allele not in self.allele_to_model, \ - "TODO: Support training on >1 experiments with same allele" + "TODO: Support training on >1 experiments with same allele " \ + + str(self.allele_to_model) extra_hits = hit_list = set(hit_list) @@ -237,18 +238,22 @@ class MHCflurryTrainedOnHits(PresentationComponentModel): def predict_for_experiment(self, experiment_name, peptides): assert self.allele_to_model is not None, "Must fit first" + peptides_deduped = pandas.unique(peptides) + print(len(peptides_deduped)) + alleles = self.experiment_to_alleles[experiment_name] - predictions = pandas.DataFrame(index=peptides) + predictions = pandas.DataFrame(index=peptides_deduped) for allele in alleles: predictions[allele] = self.predict_affinity_for_allele( - allele, peptides) + allele, peptides_deduped) result = { - self.column_name_affinity(): predictions.min(axis=1).values + self.column_name_affinity(): ( + predictions.min(axis=1).ix[peptides].values) } if self.percent_rank_transforms is not None: self.fit_percentile_rank_if_needed(alleles) - percentile_ranks = pandas.DataFrame(index=peptides) + percentile_ranks = pandas.DataFrame(index=peptides_deduped) for allele in alleles: percentile_ranks[allele] = ( self.percent_rank_transforms[allele] @@ -256,10 +261,13 @@ class MHCflurryTrainedOnHits(PresentationComponentModel): result[self.column_name_percentile_rank()] = ( percentile_ranks.min(axis=1).ix[peptides].values) assert all(len(x) == len(peptides) for x in result.values()), ( - "Result lengths don't match peptide lengths. %s:\n%s" % ( + "Result lengths don't match peptide lengths. peptides=%d, " + "peptides_deduped=%d, %s" % ( + len(peptides), + len(peptides_deduped), ", ".join( - "%s=%d" % (key, value) for (key, value) in result.items()), - result)) + "%s=%d" % (key, len(value)) + for (key, value) in result.items()))) return result def get_fit(self): diff --git a/mhcflurry/antigen_presentation/presentation_model.py b/mhcflurry/antigen_presentation/presentation_model.py index 4b0aafe0..7f6bc651 100644 --- a/mhcflurry/antigen_presentation/presentation_model.py +++ b/mhcflurry/antigen_presentation/presentation_model.py @@ -12,13 +12,8 @@ from sklearn.linear_model import LogisticRegression from ..common import assert_no_null, drop_nulls_and_warn -EVAL_CONTEXT = { - 'log': numpy.log, - 'exp': numpy.exp, -} - -def build_presentation_models_from_formulas(term_dict, formulas, **kwargs): +def build_presentation_models(term_dict, formulas, **kwargs): """ Convenience function for creating multiple final models based on shared terms. @@ -164,14 +159,16 @@ class PresentationModel(object): self.fit_experiments = set(hits_df.experiment_name.unique()) if self.component_models_require_fitting and not self.ensemble_size: - print("Using 2-fold fit.") # Use two fold CV to train model inputs then final models. cv = StratifiedKFold( n_splits=2, shuffle=True, random_state=self.random_state) self.trained_component_models = [] self.presentation_models_predictors = [] + fold_num = 1 for (fold1, fold2) in cv.split(hits_df, hits_df.experiment_name): + print("Two fold fit: fitting fold %d" % fold_num) + fold_num += 1 assert len(fold1) > 0 assert len(fold2) > 0 model_input_training_hits_df = hits_df.iloc[fold1] @@ -282,7 +279,9 @@ class PresentationModel(object): def evaluate_expressions(self, input_df): result = pandas.DataFrame() for expression in self.feature_expressions: - values = eval(expression, EVAL_CONTEXT, input_df) + # We use numpy module as globals here so math functions + # like log, log1p, exp, are in scope. + values = eval(expression, numpy.__dict__, input_df) assert len(values) == len(input_df), expression if hasattr(values, 'values'): values = values.values diff --git a/test/test_antigen_presentation.py b/test/test_antigen_presentation.py index 33d1f590..1f69f2f7 100644 --- a/test/test_antigen_presentation.py +++ b/test/test_antigen_presentation.py @@ -1,12 +1,17 @@ from nose.tools import eq_, assert_less import numpy -from numpy import testing import pandas from mhcflurry import amino_acid -from mhcflurry.antigen_presentation import presentation_component_models +from mhcflurry.antigen_presentation import ( + decoy_strategies, + presentation_component_models, + presentation_model) +###################### +# Helper functions + def make_random_peptides(num, length=9): return [ ''.join(peptide_sequence) @@ -16,6 +21,14 @@ def make_random_peptides(num, length=9): ] +def hit_criterion(experiment_name, peptide): + # Peptides with 'A' are always hits. Easy for model to learn. + return 'A' in peptide + + +###################### +# Small test dataset + PEPTIDES = make_random_peptides(100, 9) TRANSCRIPTS = [ @@ -42,11 +55,6 @@ PEPTIDES_AND_TRANSCRIPTS_DF = TRANSCIPTS_DF.stack().to_frame().reset_index() PEPTIDES_AND_TRANSCRIPTS_DF.columns = ["peptide", "group", "transcript"] del PEPTIDES_AND_TRANSCRIPTS_DF["group"] - -def hit_criterion(experiment_name, peptide): - return 'A' in peptide - - PEPTIDES_DF = pandas.DataFrame({"peptide": PEPTIDES}) PEPTIDES_DF["experiment_name"] = "exp1" PEPTIDES_DF["hit"] = [ @@ -58,9 +66,12 @@ PEPTIDES_DF["hit"] = [ HITS_DF = PEPTIDES_DF.ix[PEPTIDES_DF.hit].reset_index().copy() del HITS_DF["hit"] +###################### +# Tests + def test_mhcflurry_trained_on_hits(): - model = presentation_component_models.MHCflurryTrainedOnHits( + mhcflurry_model = presentation_component_models.MHCflurryTrainedOnHits( "basic", experiment_to_alleles=EXPERIMENT_TO_ALLELES, experiment_to_expression_group=EXPERIMENT_TO_EXPRESSION_GROUP, @@ -68,10 +79,10 @@ def test_mhcflurry_trained_on_hits(): peptides_and_transcripts=PEPTIDES_AND_TRANSCRIPTS_DF, random_peptides_for_percent_rank=make_random_peptides(10000, 9), ) - model.fit(HITS_DF) + mhcflurry_model.fit(HITS_DF) peptides = PEPTIDES_DF.copy() - predictions = model.predict(peptides) + predictions = mhcflurry_model.predict(peptides) peptides["affinity"] = predictions["mhcflurry_basic_affinity"] peptides["percent_rank"] = predictions["mhcflurry_basic_percentile_rank"] assert_less( @@ -80,3 +91,39 @@ def test_mhcflurry_trained_on_hits(): assert_less( peptides.percent_rank[peptides.hit].mean(), peptides.percent_rank[~peptides.hit].mean()) + + +def test_presentation_model(): + mhcflurry_model = presentation_component_models.MHCflurryTrainedOnHits( + "basic", + experiment_to_alleles=EXPERIMENT_TO_ALLELES, + experiment_to_expression_group=EXPERIMENT_TO_EXPRESSION_GROUP, + transcripts=TRANSCIPTS_DF, + peptides_and_transcripts=PEPTIDES_AND_TRANSCRIPTS_DF, + random_peptides_for_percent_rank=make_random_peptides(10000, 9), + ) + + decoys = decoy_strategies.UniformRandom( + make_random_peptides(10000, 9), + decoys_per_hit=50) + + terms = { + 'A_ms': ( + [mhcflurry_model], + ["log1p(mhcflurry_basic_affinity)"]), + } + + models = presentation_model.build_presentation_models( + terms, + ["A_ms"], + decoy_strategy=decoys) + eq_(len(models), 1) + + model = models["A_ms"] + model.fit(HITS_DF.ix[HITS_DF.experiment_name == "exp1"]) + + peptides = PEPTIDES_DF.copy() + peptides["prediction"] = model.predict(peptides) + assert_less( + peptides.prediction[~peptides.hit].mean(), + peptides.prediction[peptides.hit].mean()) -- GitLab