Skip to content
Snippets Groups Projects
Commit ec75a7b6 authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

Add test for PresentationModel

parent dfdbd8a4
No related merge requests found
......@@ -33,4 +33,6 @@ class PercentRankTransform(object):
assert self.cdf is not None
assert self.bin_edges is not None
indices = numpy.searchsorted(self.bin_edges, values)
return self.cdf[indices]
result = self.cdf[indices]
assert len(result) == len(values)
return result
......@@ -165,7 +165,8 @@ class MHCflurryTrainedOnHits(PresentationComponentModel):
(allele,) = alleles
mhcflurry_allele = normalize_allele_name(allele)
assert allele not in self.allele_to_model, \
"TODO: Support training on >1 experiments with same allele"
"TODO: Support training on >1 experiments with same allele " \
+ str(self.allele_to_model)
extra_hits = hit_list = set(hit_list)
......@@ -237,18 +238,22 @@ class MHCflurryTrainedOnHits(PresentationComponentModel):
def predict_for_experiment(self, experiment_name, peptides):
assert self.allele_to_model is not None, "Must fit first"
peptides_deduped = pandas.unique(peptides)
print(len(peptides_deduped))
alleles = self.experiment_to_alleles[experiment_name]
predictions = pandas.DataFrame(index=peptides)
predictions = pandas.DataFrame(index=peptides_deduped)
for allele in alleles:
predictions[allele] = self.predict_affinity_for_allele(
allele, peptides)
allele, peptides_deduped)
result = {
self.column_name_affinity(): predictions.min(axis=1).values
self.column_name_affinity(): (
predictions.min(axis=1).ix[peptides].values)
}
if self.percent_rank_transforms is not None:
self.fit_percentile_rank_if_needed(alleles)
percentile_ranks = pandas.DataFrame(index=peptides)
percentile_ranks = pandas.DataFrame(index=peptides_deduped)
for allele in alleles:
percentile_ranks[allele] = (
self.percent_rank_transforms[allele]
......@@ -256,10 +261,13 @@ class MHCflurryTrainedOnHits(PresentationComponentModel):
result[self.column_name_percentile_rank()] = (
percentile_ranks.min(axis=1).ix[peptides].values)
assert all(len(x) == len(peptides) for x in result.values()), (
"Result lengths don't match peptide lengths. %s:\n%s" % (
"Result lengths don't match peptide lengths. peptides=%d, "
"peptides_deduped=%d, %s" % (
len(peptides),
len(peptides_deduped),
", ".join(
"%s=%d" % (key, value) for (key, value) in result.items()),
result))
"%s=%d" % (key, len(value))
for (key, value) in result.items())))
return result
def get_fit(self):
......
......@@ -12,13 +12,8 @@ from sklearn.linear_model import LogisticRegression
from ..common import assert_no_null, drop_nulls_and_warn
EVAL_CONTEXT = {
'log': numpy.log,
'exp': numpy.exp,
}
def build_presentation_models_from_formulas(term_dict, formulas, **kwargs):
def build_presentation_models(term_dict, formulas, **kwargs):
"""
Convenience function for creating multiple final models based on
shared terms.
......@@ -164,14 +159,16 @@ class PresentationModel(object):
self.fit_experiments = set(hits_df.experiment_name.unique())
if self.component_models_require_fitting and not self.ensemble_size:
print("Using 2-fold fit.")
# Use two fold CV to train model inputs then final models.
cv = StratifiedKFold(
n_splits=2, shuffle=True, random_state=self.random_state)
self.trained_component_models = []
self.presentation_models_predictors = []
fold_num = 1
for (fold1, fold2) in cv.split(hits_df, hits_df.experiment_name):
print("Two fold fit: fitting fold %d" % fold_num)
fold_num += 1
assert len(fold1) > 0
assert len(fold2) > 0
model_input_training_hits_df = hits_df.iloc[fold1]
......@@ -282,7 +279,9 @@ class PresentationModel(object):
def evaluate_expressions(self, input_df):
result = pandas.DataFrame()
for expression in self.feature_expressions:
values = eval(expression, EVAL_CONTEXT, input_df)
# We use numpy module as globals here so math functions
# like log, log1p, exp, are in scope.
values = eval(expression, numpy.__dict__, input_df)
assert len(values) == len(input_df), expression
if hasattr(values, 'values'):
values = values.values
......
from nose.tools import eq_, assert_less
import numpy
from numpy import testing
import pandas
from mhcflurry import amino_acid
from mhcflurry.antigen_presentation import presentation_component_models
from mhcflurry.antigen_presentation import (
decoy_strategies,
presentation_component_models,
presentation_model)
######################
# Helper functions
def make_random_peptides(num, length=9):
return [
''.join(peptide_sequence)
......@@ -16,6 +21,14 @@ def make_random_peptides(num, length=9):
]
def hit_criterion(experiment_name, peptide):
# Peptides with 'A' are always hits. Easy for model to learn.
return 'A' in peptide
######################
# Small test dataset
PEPTIDES = make_random_peptides(100, 9)
TRANSCRIPTS = [
......@@ -42,11 +55,6 @@ PEPTIDES_AND_TRANSCRIPTS_DF = TRANSCIPTS_DF.stack().to_frame().reset_index()
PEPTIDES_AND_TRANSCRIPTS_DF.columns = ["peptide", "group", "transcript"]
del PEPTIDES_AND_TRANSCRIPTS_DF["group"]
def hit_criterion(experiment_name, peptide):
return 'A' in peptide
PEPTIDES_DF = pandas.DataFrame({"peptide": PEPTIDES})
PEPTIDES_DF["experiment_name"] = "exp1"
PEPTIDES_DF["hit"] = [
......@@ -58,9 +66,12 @@ PEPTIDES_DF["hit"] = [
HITS_DF = PEPTIDES_DF.ix[PEPTIDES_DF.hit].reset_index().copy()
del HITS_DF["hit"]
######################
# Tests
def test_mhcflurry_trained_on_hits():
model = presentation_component_models.MHCflurryTrainedOnHits(
mhcflurry_model = presentation_component_models.MHCflurryTrainedOnHits(
"basic",
experiment_to_alleles=EXPERIMENT_TO_ALLELES,
experiment_to_expression_group=EXPERIMENT_TO_EXPRESSION_GROUP,
......@@ -68,10 +79,10 @@ def test_mhcflurry_trained_on_hits():
peptides_and_transcripts=PEPTIDES_AND_TRANSCRIPTS_DF,
random_peptides_for_percent_rank=make_random_peptides(10000, 9),
)
model.fit(HITS_DF)
mhcflurry_model.fit(HITS_DF)
peptides = PEPTIDES_DF.copy()
predictions = model.predict(peptides)
predictions = mhcflurry_model.predict(peptides)
peptides["affinity"] = predictions["mhcflurry_basic_affinity"]
peptides["percent_rank"] = predictions["mhcflurry_basic_percentile_rank"]
assert_less(
......@@ -80,3 +91,39 @@ def test_mhcflurry_trained_on_hits():
assert_less(
peptides.percent_rank[peptides.hit].mean(),
peptides.percent_rank[~peptides.hit].mean())
def test_presentation_model():
mhcflurry_model = presentation_component_models.MHCflurryTrainedOnHits(
"basic",
experiment_to_alleles=EXPERIMENT_TO_ALLELES,
experiment_to_expression_group=EXPERIMENT_TO_EXPRESSION_GROUP,
transcripts=TRANSCIPTS_DF,
peptides_and_transcripts=PEPTIDES_AND_TRANSCRIPTS_DF,
random_peptides_for_percent_rank=make_random_peptides(10000, 9),
)
decoys = decoy_strategies.UniformRandom(
make_random_peptides(10000, 9),
decoys_per_hit=50)
terms = {
'A_ms': (
[mhcflurry_model],
["log1p(mhcflurry_basic_affinity)"]),
}
models = presentation_model.build_presentation_models(
terms,
["A_ms"],
decoy_strategy=decoys)
eq_(len(models), 1)
model = models["A_ms"]
model.fit(HITS_DF.ix[HITS_DF.experiment_name == "exp1"])
peptides = PEPTIDES_DF.copy()
peptides["prediction"] = model.predict(peptides)
assert_less(
peptides.prediction[~peptides.hit].mean(),
peptides.prediction[peptides.hit].mean())
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment