From 14cf5749afb2320dc7dd6548b73df1ff4b640516 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Sat, 11 Feb 2017 11:13:36 -0500
Subject: [PATCH] codes cleanup

---
 .../percent_rank_transform.py                 |  5 +++--
 .../mhcflurry_trained_on_hits.py              |  2 ++
 .../presentation_component_model.py           |  9 ++++++---
 .../presentation_model.py                     | 20 ++++---------------
 test/test_antigen_presentation.py             | 15 ++++++++++++--
 5 files changed, 28 insertions(+), 23 deletions(-)

diff --git a/mhcflurry/antigen_presentation/percent_rank_transform.py b/mhcflurry/antigen_presentation/percent_rank_transform.py
index 0b21298f..5d729f1c 100644
--- a/mhcflurry/antigen_presentation/percent_rank_transform.py
+++ b/mhcflurry/antigen_presentation/percent_rank_transform.py
@@ -20,10 +20,11 @@ class PercentRankTransform(object):
         assert self.bin_edges is None
         assert len(values) > 0
         (hist, self.bin_edges) = numpy.histogram(values, bins=self.n_bins)
-        self.cdf = numpy.ones(len(hist) + 2) * numpy.nan
+        self.cdf = numpy.ones(len(hist) + 3) * numpy.nan
         self.cdf[0] = 0.0
+        self.cdf[1] = 0.0
         self.cdf[-1] = 100.0
-        numpy.cumsum(hist / numpy.sum(hist) * 100.0, out=self.cdf[1:-1])
+        numpy.cumsum(hist / numpy.sum(hist) * 100.0, out=self.cdf[2:-1])
         assert not numpy.isnan(self.cdf).any()
 
     def transform(self, values):
diff --git a/mhcflurry/antigen_presentation/presentation_component_models/mhcflurry_trained_on_hits.py b/mhcflurry/antigen_presentation/presentation_component_models/mhcflurry_trained_on_hits.py
index 7ea2272f..70f933c2 100644
--- a/mhcflurry/antigen_presentation/presentation_component_models/mhcflurry_trained_on_hits.py
+++ b/mhcflurry/antigen_presentation/presentation_component_models/mhcflurry_trained_on_hits.py
@@ -144,6 +144,8 @@ class MHCflurryTrainedOnHits(PresentationComponentModel):
     def fit(self, hits_df):
         assert 'experiment_name' in hits_df.columns
         assert 'peptide' in hits_df.columns
+        if 'hit' in hits_df.columns:
+            assert (hits_df.hit == 1).all()
 
         grouped = hits_df.groupby("experiment_name")
         for (experiment_name, sub_df) in grouped:
diff --git a/mhcflurry/antigen_presentation/presentation_component_models/presentation_component_model.py b/mhcflurry/antigen_presentation/presentation_component_models/presentation_component_model.py
index 76bcb7f7..388f1bf9 100644
--- a/mhcflurry/antigen_presentation/presentation_component_models/presentation_component_model.py
+++ b/mhcflurry/antigen_presentation/presentation_component_models/presentation_component_model.py
@@ -53,7 +53,7 @@ class PresentationComponentModel(object):
         """
         raise NotImplementedError(str(self))
 
-    def clone_and_fit(self, peptides_df):
+    def clone_and_fit(self, hits_df):
         """
         Clone the object and fit to given dataset with a weakref cache.
         """
@@ -65,12 +65,12 @@ class PresentationComponentModel(object):
             result = None
         else:
             key = dataframe_cryptographic_hash(
-                peptides_df[["experiment_name", "peptide"]])
+                hits_df[["experiment_name", "peptide"]])
             result = self.cached_fits.get(key)
         if result is None:
             print("Cache miss in clone_and_fit: %s" % str(self))
             result = self.clone()
-            result.fit(peptides_df)
+            result.fit(hits_df)
             if self.cached_fits is not None:
                 self.cached_fits[key] = result
         else:
@@ -229,6 +229,9 @@ class PresentationComponentModel(object):
             self.cached_predictions[cache_key] = return_value
         return return_value
 
+    def fit_ensemble_and_predict(self, peptides_df):  # was missing `self`
+        raise NotImplementedError(str(self))  # stub: not implemented here
+
     def reset_cache(self):
         for key in PresentationComponentModel.cache_fields:
             obj_type = type(getattr(self, key))
diff --git a/mhcflurry/antigen_presentation/presentation_model.py b/mhcflurry/antigen_presentation/presentation_model.py
index 7f6bc651..f538ac2e 100644
--- a/mhcflurry/antigen_presentation/presentation_model.py
+++ b/mhcflurry/antigen_presentation/presentation_model.py
@@ -321,7 +321,6 @@ class PresentationModel(object):
         assert len(self.presentation_models_predictors) == \
             len(self.trained_component_models)
 
-        # peptides_df = peptides_df.reset_index(drop=True)
         prediction_cols = []
         presentation_model_predictions = {}
         zipped = enumerate(
@@ -343,20 +342,6 @@ class PresentationModel(object):
             x_df = self.evaluate_expressions(df)
             assert_no_null(x_df)
 
-            """
-            with pandas.option_context('mode.use_inf_as_null', True):
-                null_x = x_df.ix[x_df.isnull().sum(axis=1) > 0]
-                if len(null_x) > 0:
-                    null_x = null_x.copy()
-                    null_x["peptide"] = peptides_df.ix[null_x.index, "peptide"]
-                    null_x["experiment_name"] = peptides_df.ix[
-                        null_x.index, "experiment_name"
-                    ]
-                    raise ValueError(
-                        "Null values in features:\n%s\ninputs:\n%s" % (
-                            str(null_x), str(df.iloc[null_x.index])))
-            """
-
             prediction_col = "Prediction (Model %d)" % (i + 1)
             assert prediction_col not in presentation_model_predictions
             presentation_model_predictions[prediction_col] = (
@@ -370,7 +355,10 @@ class PresentationModel(object):
             del presentation_model_predictions[prediction_cols[0]]
         else:
             presentation_model_predictions["Prediction"] = numpy.mean(
-                [presentation_model_predictions[col] for col in prediction_cols],
+                [
+                    presentation_model_predictions[col]
+                    for col in prediction_cols
+                ],
                 axis=0)
 
         return pandas.DataFrame(presentation_model_predictions)
diff --git a/test/test_antigen_presentation.py b/test/test_antigen_presentation.py
index 1f69f2f7..16d31bc0 100644
--- a/test/test_antigen_presentation.py
+++ b/test/test_antigen_presentation.py
@@ -1,10 +1,12 @@
 from nose.tools import eq_, assert_less
 
 import numpy
+from numpy.testing import assert_almost_equal
 import pandas
 from mhcflurry import amino_acid
 from mhcflurry.antigen_presentation import (
     decoy_strategies,
+    percent_rank_transform,
     presentation_component_models,
     presentation_model)
 
@@ -62,6 +64,7 @@ PEPTIDES_DF["hit"] = [
     for _, row in
     PEPTIDES_DF.iterrows()
 ]
+print("Hit rate: %0.3f" % PEPTIDES_DF.hit.mean())
 
 HITS_DF = PEPTIDES_DF.ix[PEPTIDES_DF.hit].reset_index().copy()
 del HITS_DF["hit"]
@@ -70,6 +73,14 @@ del HITS_DF["hit"]
 # Tests
 
 
+def test_percent_rank_transform():  # checks transform() against known ranks
+    model = percent_rank_transform.PercentRankTransform()
+    model.fit(numpy.arange(1000))  # fit on the integers 0..999
+    assert_almost_equal(
+        model.transform([-2, 0, 50, 100, 2000]),  # -2 / 2000 lie outside fit
+        [0.0, 0.0, 5.0, 10.0, 100.0])  # expected percent ranks, in order
+
+
 def test_mhcflurry_trained_on_hits():
     mhcflurry_model = presentation_component_models.MHCflurryTrainedOnHits(
         "basic",
@@ -100,11 +111,11 @@ def test_presentation_model():
         experiment_to_expression_group=EXPERIMENT_TO_EXPRESSION_GROUP,
         transcripts=TRANSCIPTS_DF,
         peptides_and_transcripts=PEPTIDES_AND_TRANSCRIPTS_DF,
-        random_peptides_for_percent_rank=make_random_peptides(10000, 9),
+        random_peptides_for_percent_rank=make_random_peptides(1000, 9),
     )
 
     decoys = decoy_strategies.UniformRandom(
-        make_random_peptides(10000, 9),
+        make_random_peptides(1000, 9),
         decoys_per_hit=50)
 
     terms = {
-- 
GitLab