Skip to content
Snippets Groups Projects
Commit ec75a7b6 authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

Add test for PresentationModel

parent dfdbd8a4
No related merge requests found
...@@ -33,4 +33,6 @@ class PercentRankTransform(object): ...@@ -33,4 +33,6 @@ class PercentRankTransform(object):
assert self.cdf is not None assert self.cdf is not None
assert self.bin_edges is not None assert self.bin_edges is not None
indices = numpy.searchsorted(self.bin_edges, values) indices = numpy.searchsorted(self.bin_edges, values)
return self.cdf[indices] result = self.cdf[indices]
assert len(result) == len(values)
return result
...@@ -165,7 +165,8 @@ class MHCflurryTrainedOnHits(PresentationComponentModel): ...@@ -165,7 +165,8 @@ class MHCflurryTrainedOnHits(PresentationComponentModel):
(allele,) = alleles (allele,) = alleles
mhcflurry_allele = normalize_allele_name(allele) mhcflurry_allele = normalize_allele_name(allele)
assert allele not in self.allele_to_model, \ assert allele not in self.allele_to_model, \
"TODO: Support training on >1 experiments with same allele" "TODO: Support training on >1 experiments with same allele " \
+ str(self.allele_to_model)
extra_hits = hit_list = set(hit_list) extra_hits = hit_list = set(hit_list)
...@@ -237,18 +238,22 @@ class MHCflurryTrainedOnHits(PresentationComponentModel): ...@@ -237,18 +238,22 @@ class MHCflurryTrainedOnHits(PresentationComponentModel):
def predict_for_experiment(self, experiment_name, peptides): def predict_for_experiment(self, experiment_name, peptides):
assert self.allele_to_model is not None, "Must fit first" assert self.allele_to_model is not None, "Must fit first"
peptides_deduped = pandas.unique(peptides)
print(len(peptides_deduped))
alleles = self.experiment_to_alleles[experiment_name] alleles = self.experiment_to_alleles[experiment_name]
predictions = pandas.DataFrame(index=peptides) predictions = pandas.DataFrame(index=peptides_deduped)
for allele in alleles: for allele in alleles:
predictions[allele] = self.predict_affinity_for_allele( predictions[allele] = self.predict_affinity_for_allele(
allele, peptides) allele, peptides_deduped)
result = { result = {
self.column_name_affinity(): predictions.min(axis=1).values self.column_name_affinity(): (
predictions.min(axis=1).ix[peptides].values)
} }
if self.percent_rank_transforms is not None: if self.percent_rank_transforms is not None:
self.fit_percentile_rank_if_needed(alleles) self.fit_percentile_rank_if_needed(alleles)
percentile_ranks = pandas.DataFrame(index=peptides) percentile_ranks = pandas.DataFrame(index=peptides_deduped)
for allele in alleles: for allele in alleles:
percentile_ranks[allele] = ( percentile_ranks[allele] = (
self.percent_rank_transforms[allele] self.percent_rank_transforms[allele]
...@@ -256,10 +261,13 @@ class MHCflurryTrainedOnHits(PresentationComponentModel): ...@@ -256,10 +261,13 @@ class MHCflurryTrainedOnHits(PresentationComponentModel):
result[self.column_name_percentile_rank()] = ( result[self.column_name_percentile_rank()] = (
percentile_ranks.min(axis=1).ix[peptides].values) percentile_ranks.min(axis=1).ix[peptides].values)
assert all(len(x) == len(peptides) for x in result.values()), ( assert all(len(x) == len(peptides) for x in result.values()), (
"Result lengths don't match peptide lengths. %s:\n%s" % ( "Result lengths don't match peptide lengths. peptides=%d, "
"peptides_deduped=%d, %s" % (
len(peptides),
len(peptides_deduped),
", ".join( ", ".join(
"%s=%d" % (key, value) for (key, value) in result.items()), "%s=%d" % (key, len(value))
result)) for (key, value) in result.items())))
return result return result
def get_fit(self): def get_fit(self):
......
...@@ -12,13 +12,8 @@ from sklearn.linear_model import LogisticRegression ...@@ -12,13 +12,8 @@ from sklearn.linear_model import LogisticRegression
from ..common import assert_no_null, drop_nulls_and_warn from ..common import assert_no_null, drop_nulls_and_warn
EVAL_CONTEXT = {
'log': numpy.log,
'exp': numpy.exp,
}
def build_presentation_models(term_dict, formulas, **kwargs):
def build_presentation_models_from_formulas(term_dict, formulas, **kwargs):
""" """
Convenience function for creating multiple final models based on Convenience function for creating multiple final models based on
shared terms. shared terms.
...@@ -164,14 +159,16 @@ class PresentationModel(object): ...@@ -164,14 +159,16 @@ class PresentationModel(object):
self.fit_experiments = set(hits_df.experiment_name.unique()) self.fit_experiments = set(hits_df.experiment_name.unique())
if self.component_models_require_fitting and not self.ensemble_size: if self.component_models_require_fitting and not self.ensemble_size:
print("Using 2-fold fit.")
# Use two fold CV to train model inputs then final models. # Use two fold CV to train model inputs then final models.
cv = StratifiedKFold( cv = StratifiedKFold(
n_splits=2, shuffle=True, random_state=self.random_state) n_splits=2, shuffle=True, random_state=self.random_state)
self.trained_component_models = [] self.trained_component_models = []
self.presentation_models_predictors = [] self.presentation_models_predictors = []
fold_num = 1
for (fold1, fold2) in cv.split(hits_df, hits_df.experiment_name): for (fold1, fold2) in cv.split(hits_df, hits_df.experiment_name):
print("Two fold fit: fitting fold %d" % fold_num)
fold_num += 1
assert len(fold1) > 0 assert len(fold1) > 0
assert len(fold2) > 0 assert len(fold2) > 0
model_input_training_hits_df = hits_df.iloc[fold1] model_input_training_hits_df = hits_df.iloc[fold1]
...@@ -282,7 +279,9 @@ class PresentationModel(object): ...@@ -282,7 +279,9 @@ class PresentationModel(object):
def evaluate_expressions(self, input_df): def evaluate_expressions(self, input_df):
result = pandas.DataFrame() result = pandas.DataFrame()
for expression in self.feature_expressions: for expression in self.feature_expressions:
values = eval(expression, EVAL_CONTEXT, input_df) # We use numpy module as globals here so math functions
# like log, log1p, exp, are in scope.
values = eval(expression, numpy.__dict__, input_df)
assert len(values) == len(input_df), expression assert len(values) == len(input_df), expression
if hasattr(values, 'values'): if hasattr(values, 'values'):
values = values.values values = values.values
......
from nose.tools import eq_, assert_less from nose.tools import eq_, assert_less
import numpy import numpy
from numpy import testing
import pandas import pandas
from mhcflurry import amino_acid from mhcflurry import amino_acid
from mhcflurry.antigen_presentation import presentation_component_models from mhcflurry.antigen_presentation import (
decoy_strategies,
presentation_component_models,
presentation_model)
######################
# Helper functions
def make_random_peptides(num, length=9): def make_random_peptides(num, length=9):
return [ return [
''.join(peptide_sequence) ''.join(peptide_sequence)
...@@ -16,6 +21,14 @@ def make_random_peptides(num, length=9): ...@@ -16,6 +21,14 @@ def make_random_peptides(num, length=9):
] ]
def hit_criterion(experiment_name, peptide):
# Peptides with 'A' are always hits. Easy for model to learn.
return 'A' in peptide
######################
# Small test dataset
PEPTIDES = make_random_peptides(100, 9) PEPTIDES = make_random_peptides(100, 9)
TRANSCRIPTS = [ TRANSCRIPTS = [
...@@ -42,11 +55,6 @@ PEPTIDES_AND_TRANSCRIPTS_DF = TRANSCIPTS_DF.stack().to_frame().reset_index() ...@@ -42,11 +55,6 @@ PEPTIDES_AND_TRANSCRIPTS_DF = TRANSCIPTS_DF.stack().to_frame().reset_index()
PEPTIDES_AND_TRANSCRIPTS_DF.columns = ["peptide", "group", "transcript"] PEPTIDES_AND_TRANSCRIPTS_DF.columns = ["peptide", "group", "transcript"]
del PEPTIDES_AND_TRANSCRIPTS_DF["group"] del PEPTIDES_AND_TRANSCRIPTS_DF["group"]
def hit_criterion(experiment_name, peptide):
return 'A' in peptide
PEPTIDES_DF = pandas.DataFrame({"peptide": PEPTIDES}) PEPTIDES_DF = pandas.DataFrame({"peptide": PEPTIDES})
PEPTIDES_DF["experiment_name"] = "exp1" PEPTIDES_DF["experiment_name"] = "exp1"
PEPTIDES_DF["hit"] = [ PEPTIDES_DF["hit"] = [
...@@ -58,9 +66,12 @@ PEPTIDES_DF["hit"] = [ ...@@ -58,9 +66,12 @@ PEPTIDES_DF["hit"] = [
HITS_DF = PEPTIDES_DF.ix[PEPTIDES_DF.hit].reset_index().copy() HITS_DF = PEPTIDES_DF.ix[PEPTIDES_DF.hit].reset_index().copy()
del HITS_DF["hit"] del HITS_DF["hit"]
######################
# Tests
def test_mhcflurry_trained_on_hits(): def test_mhcflurry_trained_on_hits():
model = presentation_component_models.MHCflurryTrainedOnHits( mhcflurry_model = presentation_component_models.MHCflurryTrainedOnHits(
"basic", "basic",
experiment_to_alleles=EXPERIMENT_TO_ALLELES, experiment_to_alleles=EXPERIMENT_TO_ALLELES,
experiment_to_expression_group=EXPERIMENT_TO_EXPRESSION_GROUP, experiment_to_expression_group=EXPERIMENT_TO_EXPRESSION_GROUP,
...@@ -68,10 +79,10 @@ def test_mhcflurry_trained_on_hits(): ...@@ -68,10 +79,10 @@ def test_mhcflurry_trained_on_hits():
peptides_and_transcripts=PEPTIDES_AND_TRANSCRIPTS_DF, peptides_and_transcripts=PEPTIDES_AND_TRANSCRIPTS_DF,
random_peptides_for_percent_rank=make_random_peptides(10000, 9), random_peptides_for_percent_rank=make_random_peptides(10000, 9),
) )
model.fit(HITS_DF) mhcflurry_model.fit(HITS_DF)
peptides = PEPTIDES_DF.copy() peptides = PEPTIDES_DF.copy()
predictions = model.predict(peptides) predictions = mhcflurry_model.predict(peptides)
peptides["affinity"] = predictions["mhcflurry_basic_affinity"] peptides["affinity"] = predictions["mhcflurry_basic_affinity"]
peptides["percent_rank"] = predictions["mhcflurry_basic_percentile_rank"] peptides["percent_rank"] = predictions["mhcflurry_basic_percentile_rank"]
assert_less( assert_less(
...@@ -80,3 +91,39 @@ def test_mhcflurry_trained_on_hits(): ...@@ -80,3 +91,39 @@ def test_mhcflurry_trained_on_hits():
assert_less( assert_less(
peptides.percent_rank[peptides.hit].mean(), peptides.percent_rank[peptides.hit].mean(),
peptides.percent_rank[~peptides.hit].mean()) peptides.percent_rank[~peptides.hit].mean())
def test_presentation_model():
mhcflurry_model = presentation_component_models.MHCflurryTrainedOnHits(
"basic",
experiment_to_alleles=EXPERIMENT_TO_ALLELES,
experiment_to_expression_group=EXPERIMENT_TO_EXPRESSION_GROUP,
transcripts=TRANSCIPTS_DF,
peptides_and_transcripts=PEPTIDES_AND_TRANSCRIPTS_DF,
random_peptides_for_percent_rank=make_random_peptides(10000, 9),
)
decoys = decoy_strategies.UniformRandom(
make_random_peptides(10000, 9),
decoys_per_hit=50)
terms = {
'A_ms': (
[mhcflurry_model],
["log1p(mhcflurry_basic_affinity)"]),
}
models = presentation_model.build_presentation_models(
terms,
["A_ms"],
decoy_strategy=decoys)
eq_(len(models), 1)
model = models["A_ms"]
model.fit(HITS_DF.ix[HITS_DF.experiment_name == "exp1"])
peptides = PEPTIDES_DF.copy()
peptides["prediction"] = model.predict(peptides)
assert_less(
peptides.prediction[~peptides.hit].mean(),
peptides.prediction[peptides.hit].mean())
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment