From ec75a7b635f0aa301ef00dcd140156d1f7eac48a Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Sat, 11 Feb 2017 10:32:40 -0500
Subject: [PATCH] Add test for PresentationModel

---
 .../percent_rank_transform.py                 |  4 +-
 .../mhcflurry_trained_on_hits.py              | 24 ++++---
 .../presentation_model.py                     | 15 ++---
 test/test_antigen_presentation.py             | 67 ++++++++++++++++---
 4 files changed, 83 insertions(+), 27 deletions(-)

diff --git a/mhcflurry/antigen_presentation/percent_rank_transform.py b/mhcflurry/antigen_presentation/percent_rank_transform.py
index abfbf07f..0b21298f 100644
--- a/mhcflurry/antigen_presentation/percent_rank_transform.py
+++ b/mhcflurry/antigen_presentation/percent_rank_transform.py
@@ -33,4 +33,6 @@ class PercentRankTransform(object):
         assert self.cdf is not None
         assert self.bin_edges is not None
         indices = numpy.searchsorted(self.bin_edges, values)
-        return self.cdf[indices]
+        result = self.cdf[indices]
+        assert len(result) == len(values)
+        return result
diff --git a/mhcflurry/antigen_presentation/presentation_component_models/mhcflurry_trained_on_hits.py b/mhcflurry/antigen_presentation/presentation_component_models/mhcflurry_trained_on_hits.py
index 5eb8153c..7ea2272f 100644
--- a/mhcflurry/antigen_presentation/presentation_component_models/mhcflurry_trained_on_hits.py
+++ b/mhcflurry/antigen_presentation/presentation_component_models/mhcflurry_trained_on_hits.py
@@ -165,7 +165,8 @@ class MHCflurryTrainedOnHits(PresentationComponentModel):
         (allele,) = alleles
         mhcflurry_allele = normalize_allele_name(allele)
         assert allele not in self.allele_to_model, \
-            "TODO: Support training on >1 experiments with same allele"
+            "TODO: Support training on >1 experiments with same allele " \
+            + str(self.allele_to_model)
 
         extra_hits = hit_list = set(hit_list)
 
@@ -237,18 +238,22 @@ class MHCflurryTrainedOnHits(PresentationComponentModel):
     def predict_for_experiment(self, experiment_name, peptides):
         assert self.allele_to_model is not None, "Must fit first"
 
+        peptides_deduped = pandas.unique(peptides)
+        print(len(peptides_deduped))
+
         alleles = self.experiment_to_alleles[experiment_name]
-        predictions = pandas.DataFrame(index=peptides)
+        predictions = pandas.DataFrame(index=peptides_deduped)
         for allele in alleles:
             predictions[allele] = self.predict_affinity_for_allele(
-                allele, peptides)
+                allele, peptides_deduped)
 
         result = {
-            self.column_name_affinity(): predictions.min(axis=1).values
+            self.column_name_affinity(): (
+                predictions.min(axis=1).ix[peptides].values)
         }
         if self.percent_rank_transforms is not None:
             self.fit_percentile_rank_if_needed(alleles)
-            percentile_ranks = pandas.DataFrame(index=peptides)
+            percentile_ranks = pandas.DataFrame(index=peptides_deduped)
             for allele in alleles:
                 percentile_ranks[allele] = (
                     self.percent_rank_transforms[allele]
@@ -256,10 +261,13 @@ class MHCflurryTrainedOnHits(PresentationComponentModel):
             result[self.column_name_percentile_rank()] = (
                 percentile_ranks.min(axis=1).ix[peptides].values)
         assert all(len(x) == len(peptides) for x in result.values()), (
-            "Result lengths don't match peptide lengths. %s:\n%s" % (
+            "Result lengths don't match peptide lengths. peptides=%d, "
+            "peptides_deduped=%d, %s" % (
+                len(peptides),
+                len(peptides_deduped),
                 ", ".join(
-                    "%s=%d" % (key, value) for (key, value) in result.items()),
-                result))
+                    "%s=%d" % (key, len(value))
+                    for (key, value) in result.items())))
         return result
 
     def get_fit(self):
diff --git a/mhcflurry/antigen_presentation/presentation_model.py b/mhcflurry/antigen_presentation/presentation_model.py
index 4b0aafe0..7f6bc651 100644
--- a/mhcflurry/antigen_presentation/presentation_model.py
+++ b/mhcflurry/antigen_presentation/presentation_model.py
@@ -12,13 +12,8 @@ from sklearn.linear_model import LogisticRegression
 
 from ..common import assert_no_null, drop_nulls_and_warn
 
-EVAL_CONTEXT = {
-    'log': numpy.log,
-    'exp': numpy.exp,
-}
 
-
-def build_presentation_models_from_formulas(term_dict, formulas, **kwargs):
+def build_presentation_models(term_dict, formulas, **kwargs):
     """
     Convenience function for creating multiple final models based on
     shared terms.
@@ -164,14 +159,16 @@ class PresentationModel(object):
         self.fit_experiments = set(hits_df.experiment_name.unique())
 
         if self.component_models_require_fitting and not self.ensemble_size:
-            print("Using 2-fold fit.")
             # Use two fold CV to train model inputs then final models.
             cv = StratifiedKFold(
                 n_splits=2, shuffle=True, random_state=self.random_state)
 
             self.trained_component_models = []
             self.presentation_models_predictors = []
+            fold_num = 1
             for (fold1, fold2) in cv.split(hits_df, hits_df.experiment_name):
+                print("Two fold fit: fitting fold %d" % fold_num)
+                fold_num += 1
                 assert len(fold1) > 0
                 assert len(fold2) > 0
                 model_input_training_hits_df = hits_df.iloc[fold1]
@@ -282,7 +279,9 @@ class PresentationModel(object):
     def evaluate_expressions(self, input_df):
         result = pandas.DataFrame()
         for expression in self.feature_expressions:
-            values = eval(expression, EVAL_CONTEXT, input_df)
+            # We use numpy module as globals here so math functions
+            # like log, log1p, exp, are in scope.
+            values = eval(expression, numpy.__dict__, input_df)
             assert len(values) == len(input_df), expression
             if hasattr(values, 'values'):
                 values = values.values
diff --git a/test/test_antigen_presentation.py b/test/test_antigen_presentation.py
index 33d1f590..1f69f2f7 100644
--- a/test/test_antigen_presentation.py
+++ b/test/test_antigen_presentation.py
@@ -1,12 +1,17 @@
 from nose.tools import eq_, assert_less
 
 import numpy
-from numpy import testing
 import pandas
 from mhcflurry import amino_acid
-from mhcflurry.antigen_presentation import presentation_component_models
+from mhcflurry.antigen_presentation import (
+    decoy_strategies,
+    presentation_component_models,
+    presentation_model)
 
 
+######################
+# Helper functions
+
 def make_random_peptides(num, length=9):
     return [
         ''.join(peptide_sequence)
@@ -16,6 +21,14 @@ def make_random_peptides(num, length=9):
     ]
 
 
+def hit_criterion(experiment_name, peptide):
+    # Peptides with 'A' are always hits. Easy for model to learn.
+    return 'A' in peptide
+
+
+######################
+# Small test dataset
+
 PEPTIDES = make_random_peptides(100, 9)
 
 TRANSCRIPTS = [
@@ -42,11 +55,6 @@ PEPTIDES_AND_TRANSCRIPTS_DF = TRANSCIPTS_DF.stack().to_frame().reset_index()
 PEPTIDES_AND_TRANSCRIPTS_DF.columns = ["peptide", "group", "transcript"]
 del PEPTIDES_AND_TRANSCRIPTS_DF["group"]
 
-
-def hit_criterion(experiment_name, peptide):
-    return 'A' in peptide
-
-
 PEPTIDES_DF = pandas.DataFrame({"peptide": PEPTIDES})
 PEPTIDES_DF["experiment_name"] = "exp1"
 PEPTIDES_DF["hit"] = [
@@ -58,9 +66,12 @@ PEPTIDES_DF["hit"] = [
 HITS_DF = PEPTIDES_DF.ix[PEPTIDES_DF.hit].reset_index().copy()
 del HITS_DF["hit"]
 
+######################
+# Tests
+
 
 def test_mhcflurry_trained_on_hits():
-    model = presentation_component_models.MHCflurryTrainedOnHits(
+    mhcflurry_model = presentation_component_models.MHCflurryTrainedOnHits(
         "basic",
         experiment_to_alleles=EXPERIMENT_TO_ALLELES,
         experiment_to_expression_group=EXPERIMENT_TO_EXPRESSION_GROUP,
@@ -68,10 +79,10 @@ def test_mhcflurry_trained_on_hits():
         peptides_and_transcripts=PEPTIDES_AND_TRANSCRIPTS_DF,
         random_peptides_for_percent_rank=make_random_peptides(10000, 9),
     )
-    model.fit(HITS_DF)
+    mhcflurry_model.fit(HITS_DF)
 
     peptides = PEPTIDES_DF.copy()
-    predictions = model.predict(peptides)
+    predictions = mhcflurry_model.predict(peptides)
     peptides["affinity"] = predictions["mhcflurry_basic_affinity"]
     peptides["percent_rank"] = predictions["mhcflurry_basic_percentile_rank"]
     assert_less(
@@ -80,3 +91,39 @@ def test_mhcflurry_trained_on_hits():
     assert_less(
         peptides.percent_rank[peptides.hit].mean(),
         peptides.percent_rank[~peptides.hit].mean())
+
+
+def test_presentation_model():
+    mhcflurry_model = presentation_component_models.MHCflurryTrainedOnHits(
+        "basic",
+        experiment_to_alleles=EXPERIMENT_TO_ALLELES,
+        experiment_to_expression_group=EXPERIMENT_TO_EXPRESSION_GROUP,
+        transcripts=TRANSCIPTS_DF,
+        peptides_and_transcripts=PEPTIDES_AND_TRANSCRIPTS_DF,
+        random_peptides_for_percent_rank=make_random_peptides(10000, 9),
+    )
+
+    decoys = decoy_strategies.UniformRandom(
+        make_random_peptides(10000, 9),
+        decoys_per_hit=50)
+
+    terms = {
+        'A_ms': (
+            [mhcflurry_model],
+            ["log1p(mhcflurry_basic_affinity)"]),
+    }
+
+    models = presentation_model.build_presentation_models(
+        terms,
+        ["A_ms"],
+        decoy_strategy=decoys)
+    eq_(len(models), 1)
+
+    model = models["A_ms"]
+    model.fit(HITS_DF.ix[HITS_DF.experiment_name == "exp1"])
+
+    peptides = PEPTIDES_DF.copy()
+    peptides["prediction"] = model.predict(peptides)
+    assert_less(
+        peptides.prediction[~peptides.hit].mean(),
+        peptides.prediction[peptides.hit].mean())
-- 
GitLab