Skip to content
Snippets Groups Projects
Commit ec75a7b6 authored by Tim O'Donnell
Browse files

Add test for PresentationModel

parent dfdbd8a4
No related branches found
No related tags found
No related merge requests found
......@@ -33,4 +33,6 @@ class PercentRankTransform(object):
assert self.cdf is not None
assert self.bin_edges is not None
indices = numpy.searchsorted(self.bin_edges, values)
return self.cdf[indices]
result = self.cdf[indices]
assert len(result) == len(values)
return result
......@@ -165,7 +165,8 @@ class MHCflurryTrainedOnHits(PresentationComponentModel):
(allele,) = alleles
mhcflurry_allele = normalize_allele_name(allele)
assert allele not in self.allele_to_model, \
"TODO: Support training on >1 experiments with same allele"
"TODO: Support training on >1 experiments with same allele " \
+ str(self.allele_to_model)
extra_hits = hit_list = set(hit_list)
......@@ -237,18 +238,22 @@ class MHCflurryTrainedOnHits(PresentationComponentModel):
def predict_for_experiment(self, experiment_name, peptides):
assert self.allele_to_model is not None, "Must fit first"
peptides_deduped = pandas.unique(peptides)
print(len(peptides_deduped))
alleles = self.experiment_to_alleles[experiment_name]
predictions = pandas.DataFrame(index=peptides)
predictions = pandas.DataFrame(index=peptides_deduped)
for allele in alleles:
predictions[allele] = self.predict_affinity_for_allele(
allele, peptides)
allele, peptides_deduped)
result = {
self.column_name_affinity(): predictions.min(axis=1).values
self.column_name_affinity(): (
predictions.min(axis=1).ix[peptides].values)
}
if self.percent_rank_transforms is not None:
self.fit_percentile_rank_if_needed(alleles)
percentile_ranks = pandas.DataFrame(index=peptides)
percentile_ranks = pandas.DataFrame(index=peptides_deduped)
for allele in alleles:
percentile_ranks[allele] = (
self.percent_rank_transforms[allele]
......@@ -256,10 +261,13 @@ class MHCflurryTrainedOnHits(PresentationComponentModel):
result[self.column_name_percentile_rank()] = (
percentile_ranks.min(axis=1).ix[peptides].values)
assert all(len(x) == len(peptides) for x in result.values()), (
"Result lengths don't match peptide lengths. %s:\n%s" % (
"Result lengths don't match peptide lengths. peptides=%d, "
"peptides_deduped=%d, %s" % (
len(peptides),
len(peptides_deduped),
", ".join(
"%s=%d" % (key, value) for (key, value) in result.items()),
result))
"%s=%d" % (key, len(value))
for (key, value) in result.items())))
return result
def get_fit(self):
......
......@@ -12,13 +12,8 @@ from sklearn.linear_model import LogisticRegression
from ..common import assert_no_null, drop_nulls_and_warn
EVAL_CONTEXT = {
'log': numpy.log,
'exp': numpy.exp,
}
def build_presentation_models_from_formulas(term_dict, formulas, **kwargs):
def build_presentation_models(term_dict, formulas, **kwargs):
"""
Convenience function for creating multiple final models based on
shared terms.
......@@ -164,14 +159,16 @@ class PresentationModel(object):
self.fit_experiments = set(hits_df.experiment_name.unique())
if self.component_models_require_fitting and not self.ensemble_size:
print("Using 2-fold fit.")
# Use two fold CV to train model inputs then final models.
cv = StratifiedKFold(
n_splits=2, shuffle=True, random_state=self.random_state)
self.trained_component_models = []
self.presentation_models_predictors = []
fold_num = 1
for (fold1, fold2) in cv.split(hits_df, hits_df.experiment_name):
print("Two fold fit: fitting fold %d" % fold_num)
fold_num += 1
assert len(fold1) > 0
assert len(fold2) > 0
model_input_training_hits_df = hits_df.iloc[fold1]
......@@ -282,7 +279,9 @@ class PresentationModel(object):
def evaluate_expressions(self, input_df):
result = pandas.DataFrame()
for expression in self.feature_expressions:
values = eval(expression, EVAL_CONTEXT, input_df)
# We use numpy module as globals here so math functions
# like log, log1p, exp, are in scope.
values = eval(expression, numpy.__dict__, input_df)
assert len(values) == len(input_df), expression
if hasattr(values, 'values'):
values = values.values
......
from nose.tools import eq_, assert_less
import numpy
from numpy import testing
import pandas
from mhcflurry import amino_acid
from mhcflurry.antigen_presentation import presentation_component_models
from mhcflurry.antigen_presentation import (
decoy_strategies,
presentation_component_models,
presentation_model)
######################
# Helper functions
def make_random_peptides(num, length=9):
return [
''.join(peptide_sequence)
......@@ -16,6 +21,14 @@ def make_random_peptides(num, length=9):
]
def hit_criterion(experiment_name, peptide):
    """
    Decide whether `peptide` counts as a hit in the synthetic test data.

    Peptides with 'A' are always hits. Easy for model to learn.

    Parameters
    ----------
    experiment_name : string
        Accepted as part of the signature but not used by the rule.
    peptide : string
        Peptide sequence to classify.

    Returns
    -------
    bool
        True if the peptide contains 'A', False otherwise.
    """
    return 'A' in peptide
######################
# Small test dataset
PEPTIDES = make_random_peptides(100, 9)
TRANSCRIPTS = [
......@@ -42,11 +55,6 @@ PEPTIDES_AND_TRANSCRIPTS_DF = TRANSCIPTS_DF.stack().to_frame().reset_index()
PEPTIDES_AND_TRANSCRIPTS_DF.columns = ["peptide", "group", "transcript"]
del PEPTIDES_AND_TRANSCRIPTS_DF["group"]
def hit_criterion(experiment_name, peptide):
return 'A' in peptide
PEPTIDES_DF = pandas.DataFrame({"peptide": PEPTIDES})
PEPTIDES_DF["experiment_name"] = "exp1"
PEPTIDES_DF["hit"] = [
......@@ -58,9 +66,12 @@ PEPTIDES_DF["hit"] = [
HITS_DF = PEPTIDES_DF.ix[PEPTIDES_DF.hit].reset_index().copy()
del HITS_DF["hit"]
######################
# Tests
def test_mhcflurry_trained_on_hits():
model = presentation_component_models.MHCflurryTrainedOnHits(
mhcflurry_model = presentation_component_models.MHCflurryTrainedOnHits(
"basic",
experiment_to_alleles=EXPERIMENT_TO_ALLELES,
experiment_to_expression_group=EXPERIMENT_TO_EXPRESSION_GROUP,
......@@ -68,10 +79,10 @@ def test_mhcflurry_trained_on_hits():
peptides_and_transcripts=PEPTIDES_AND_TRANSCRIPTS_DF,
random_peptides_for_percent_rank=make_random_peptides(10000, 9),
)
model.fit(HITS_DF)
mhcflurry_model.fit(HITS_DF)
peptides = PEPTIDES_DF.copy()
predictions = model.predict(peptides)
predictions = mhcflurry_model.predict(peptides)
peptides["affinity"] = predictions["mhcflurry_basic_affinity"]
peptides["percent_rank"] = predictions["mhcflurry_basic_percentile_rank"]
assert_less(
......@@ -80,3 +91,39 @@ def test_mhcflurry_trained_on_hits():
assert_less(
peptides.percent_rank[peptides.hit].mean(),
peptides.percent_rank[~peptides.hit].mean())
def test_presentation_model():
    """
    Fit a single-term presentation model on the small test dataset and
    check that hits receive higher predictions than non-hits on average.

    The model has one term, "A_ms", built from an MHCflurryTrainedOnHits
    component and the feature expression "log1p(mhcflurry_basic_affinity)".
    Decoys are drawn with decoy_strategies.UniformRandom from 10000 random
    9-mers, 50 decoys per hit.
    """
    mhcflurry_model = presentation_component_models.MHCflurryTrainedOnHits(
        "basic",
        experiment_to_alleles=EXPERIMENT_TO_ALLELES,
        experiment_to_expression_group=EXPERIMENT_TO_EXPRESSION_GROUP,
        transcripts=TRANSCIPTS_DF,
        peptides_and_transcripts=PEPTIDES_AND_TRANSCRIPTS_DF,
        random_peptides_for_percent_rank=make_random_peptides(10000, 9),
    )
    decoys = decoy_strategies.UniformRandom(
        make_random_peptides(10000, 9),
        decoys_per_hit=50)

    # Each term maps a name to (component models, feature expressions).
    terms = {
        'A_ms': (
            [mhcflurry_model],
            ["log1p(mhcflurry_basic_affinity)"]),
    }

    models = presentation_model.build_presentation_models(
        terms,
        ["A_ms"],
        decoy_strategy=decoys)
    eq_(len(models), 1)

    model = models["A_ms"]
    # Select rows with a boolean mask via .loc; the older .ix indexer was
    # deprecated in pandas 0.20 and removed in pandas 1.0.
    model.fit(HITS_DF.loc[HITS_DF.experiment_name == "exp1"])

    peptides = PEPTIDES_DF.copy()
    peptides["prediction"] = model.predict(peptides)

    # Non-hits should score lower than hits on average.
    assert_less(
        peptides.prediction[~peptides.hit].mean(),
        peptides.prediction[peptides.hit].mean())
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment