From 14cf5749afb2320dc7dd6548b73df1ff4b640516 Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Sat, 11 Feb 2017 11:13:36 -0500 Subject: [PATCH] codes cleanup --- .../percent_rank_transform.py | 5 +++-- .../mhcflurry_trained_on_hits.py | 2 ++ .../presentation_component_model.py | 9 ++++++--- .../presentation_model.py | 20 ++++--------------- test/test_antigen_presentation.py | 15 ++++++++++++-- 5 files changed, 28 insertions(+), 23 deletions(-) diff --git a/mhcflurry/antigen_presentation/percent_rank_transform.py b/mhcflurry/antigen_presentation/percent_rank_transform.py index 0b21298f..5d729f1c 100644 --- a/mhcflurry/antigen_presentation/percent_rank_transform.py +++ b/mhcflurry/antigen_presentation/percent_rank_transform.py @@ -20,10 +20,11 @@ class PercentRankTransform(object): assert self.bin_edges is None assert len(values) > 0 (hist, self.bin_edges) = numpy.histogram(values, bins=self.n_bins) - self.cdf = numpy.ones(len(hist) + 2) * numpy.nan + self.cdf = numpy.ones(len(hist) + 3) * numpy.nan self.cdf[0] = 0.0 + self.cdf[1] = 0.0 self.cdf[-1] = 100.0 - numpy.cumsum(hist / numpy.sum(hist) * 100.0, out=self.cdf[1:-1]) + numpy.cumsum(hist / numpy.sum(hist) * 100.0, out=self.cdf[2:-1]) assert not numpy.isnan(self.cdf).any() def transform(self, values): diff --git a/mhcflurry/antigen_presentation/presentation_component_models/mhcflurry_trained_on_hits.py b/mhcflurry/antigen_presentation/presentation_component_models/mhcflurry_trained_on_hits.py index 7ea2272f..70f933c2 100644 --- a/mhcflurry/antigen_presentation/presentation_component_models/mhcflurry_trained_on_hits.py +++ b/mhcflurry/antigen_presentation/presentation_component_models/mhcflurry_trained_on_hits.py @@ -144,6 +144,8 @@ class MHCflurryTrainedOnHits(PresentationComponentModel): def fit(self, hits_df): assert 'experiment_name' in hits_df.columns assert 'peptide' in hits_df.columns + if 'hit' in hits_df.columns: + assert (hits_df.hit == 1).all() grouped = hits_df.groupby("experiment_name") for (experiment_name, sub_df) in grouped: diff --git a/mhcflurry/antigen_presentation/presentation_component_models/presentation_component_model.py b/mhcflurry/antigen_presentation/presentation_component_models/presentation_component_model.py index 76bcb7f7..388f1bf9 100644 --- a/mhcflurry/antigen_presentation/presentation_component_models/presentation_component_model.py +++ b/mhcflurry/antigen_presentation/presentation_component_models/presentation_component_model.py @@ -53,7 +53,7 @@ class PresentationComponentModel(object): """ raise NotImplementedError(str(self)) - def clone_and_fit(self, peptides_df): + def clone_and_fit(self, hits_df): """ Clone the object and fit to given dataset with a weakref cache. """ @@ -65,12 +65,12 @@ class PresentationComponentModel(object): result = None else: key = dataframe_cryptographic_hash( - peptides_df[["experiment_name", "peptide"]]) + hits_df[["experiment_name", "peptide"]]) result = self.cached_fits.get(key) if result is None: print("Cache miss in clone_and_fit: %s" % str(self)) result = self.clone() - result.fit(peptides_df) + result.fit(hits_df) if self.cached_fits is not None: self.cached_fits[key] = result else: @@ -229,6 +229,9 @@ class PresentationComponentModel(object): self.cached_predictions[cache_key] = return_value return return_value + def fit_ensemble_and_predict(peptides_df): + raise NotImplementedError + def reset_cache(self): for key in PresentationComponentModel.cache_fields: obj_type = type(getattr(self, key)) diff --git a/mhcflurry/antigen_presentation/presentation_model.py b/mhcflurry/antigen_presentation/presentation_model.py index 7f6bc651..f538ac2e 100644 --- a/mhcflurry/antigen_presentation/presentation_model.py +++ b/mhcflurry/antigen_presentation/presentation_model.py @@ -321,7 +321,6 @@ class PresentationModel(object): assert len(self.presentation_models_predictors) == \ len(self.trained_component_models) - # peptides_df = peptides_df.reset_index(drop=True) prediction_cols = [] presentation_model_predictions = {} zipped = enumerate( @@ -343,20 +342,6 @@ class PresentationModel(object): x_df = self.evaluate_expressions(df) assert_no_null(x_df) - """ - with pandas.option_context('mode.use_inf_as_null', True): - null_x = x_df.ix[x_df.isnull().sum(axis=1) > 0] - if len(null_x) > 0: - null_x = null_x.copy() - null_x["peptide"] = peptides_df.ix[null_x.index, "peptide"] - null_x["experiment_name"] = peptides_df.ix[ - null_x.index, "experiment_name" - ] - raise ValueError( - "Null values in features:\n%s\ninputs:\n%s" % ( - str(null_x), str(df.iloc[null_x.index]))) - """ - prediction_col = "Prediction (Model %d)" % (i + 1) assert prediction_col not in presentation_model_predictions presentation_model_predictions[prediction_col] = ( @@ -370,7 +355,10 @@ class PresentationModel(object): del presentation_model_predictions[prediction_cols[0]] else: presentation_model_predictions["Prediction"] = numpy.mean( - [presentation_model_predictions[col] for col in prediction_cols], + [ + presentation_model_predictions[col] + for col in prediction_cols + ], axis=0) return pandas.DataFrame(presentation_model_predictions) diff --git a/test/test_antigen_presentation.py b/test/test_antigen_presentation.py index 1f69f2f7..16d31bc0 100644 --- a/test/test_antigen_presentation.py +++ b/test/test_antigen_presentation.py @@ -1,10 +1,12 @@ from nose.tools import eq_, assert_less import numpy +from numpy.testing import assert_almost_equal import pandas from mhcflurry import amino_acid from mhcflurry.antigen_presentation import ( decoy_strategies, + percent_rank_transform, presentation_component_models, presentation_model) @@ -62,6 +64,7 @@ PEPTIDES_DF["hit"] = [ for _, row in PEPTIDES_DF.iterrows() ] +print("Hit rate: %0.3f" % PEPTIDES_DF.hit.mean()) HITS_DF = PEPTIDES_DF.ix[PEPTIDES_DF.hit].reset_index().copy() del HITS_DF["hit"] @@ -70,6 +73,14 @@ del HITS_DF["hit"] # Tests +def test_percent_rank_transform(): + model = percent_rank_transform.PercentRankTransform() + model.fit(numpy.arange(1000)) + assert_almost_equal( + model.transform([-2, 0, 50, 100, 2000]), + [0.0, 0.0, 5.0, 10.0, 100.0]) + + def test_mhcflurry_trained_on_hits(): mhcflurry_model = presentation_component_models.MHCflurryTrainedOnHits( "basic", @@ -100,11 +111,11 @@ def test_presentation_model(): experiment_to_expression_group=EXPERIMENT_TO_EXPRESSION_GROUP, transcripts=TRANSCIPTS_DF, peptides_and_transcripts=PEPTIDES_AND_TRANSCRIPTS_DF, - random_peptides_for_percent_rank=make_random_peptides(10000, 9), + random_peptides_for_percent_rank=make_random_peptides(1000, 9), ) decoys = decoy_strategies.UniformRandom( - make_random_peptides(10000, 9), + make_random_peptides(1000, 9), decoys_per_hit=50) terms = { -- GitLab