Skip to content
Snippets Groups Projects
Commit dec7c6dc authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

more tests

parent dfe6f88f
No related merge requests found
......@@ -22,10 +22,11 @@ def cache_dict_for_policy(policy):
class PresentationComponentModel(object):
'''
Base class for inputs to a "final model". By "final model" we mean
something like a logistic regression model that takes as input expression,
mhc binding affinity, cleavage, etc. By "final model input" we mean
the predictors for expression, mhc binding affinity, etc.
Base class for component models to a presentation model.
The component models are things like mhc binding affinity and cleavage,
and the presentation model is typically a logistic regression model
over these.
'''
def __init__(
self, fit_cache_policy="weak", predictions_cache_policy="weak"):
......
......@@ -52,7 +52,7 @@ def build_presentation_models(term_dict, formulas, **kwargs):
(term_inputs, term_expressions) = term_dict[name]
inputs.extend(term_inputs)
expressions.extend(term_expressions)
assert len(set(expressions)) == len(expressions)
assert len(set(expressions)) == len(expressions), expressions
presentation_model = PresentationModel(
inputs,
expressions,
......@@ -301,7 +301,11 @@ class PresentationModel(object):
for expression in self.feature_expressions:
# We use numpy module as globals here so math functions
# like log, log1p, exp, are in scope.
values = eval(expression, numpy.__dict__, input_df)
try:
values = eval(expression, numpy.__dict__, input_df)
except SyntaxError:
logging.error("Syntax error in expression: %s" % expression)
raise
assert len(values) == len(input_df), expression
if hasattr(values, 'values'):
values = values.values
......@@ -419,7 +423,6 @@ class PresentationModel(object):
assert 'hit' in peptides_df.columns
peptides_df["prediction"] = self.predict(peptides_df)
# print(sorted(peptides_df.prediction[peptides_df.hit].values))
top_n = float(peptides_df.hit.sum())
if not include_hit_indices:
......@@ -503,9 +506,8 @@ class PresentationModel(object):
"those of this PresentationModel: '%s'" % (
feature_expressions, self.feature_expressions))
assert not fit, "Unhandled data in fit: %s" % fit
assert len(model_input_fits) == (
2 if self.component_models_require_fitting else 1), (
"Wrong length: %s" % model_input_fits)
assert (
len(model_input_fits) == len(self.presentation_models_predictors))
self.trained_component_models = []
for model_input_fits_for_fold in model_input_fits:
......
......@@ -201,10 +201,10 @@ def describe_nulls(df, related_df_with_same_index_to_describe=None):
def raise_or_debug(exception):
"""
Raise the exception unless the NEON_DEBUG environment variable is set,
Raise the exception unless the MHCFLURRY_DEBUG environment variable is set,
in which case drop into ipython debugger (ipdb).
"""
if environ.get("NEON_DEBUG"):
if environ.get("MHCFLURRY_DEBUG"):
import ipdb
ipdb.set_trace()
raise exception
......
import pickle
from nose.tools import eq_, assert_less
import numpy
from numpy.testing import assert_allclose
from numpy.testing import assert_allclose, assert_array_equal
import pandas
from mhcflurry import amino_acid
from mhcflurry.antigen_presentation import (
......@@ -10,6 +12,8 @@ from mhcflurry.antigen_presentation import (
presentation_component_models,
presentation_model)
from mhcflurry.amino_acid import common_amino_acid_letters
######################
# Helper functions
......@@ -31,7 +35,8 @@ def hit_criterion(experiment_name, peptide):
######################
# Small test dataset
PEPTIDES = make_random_peptides(100, 9)
PEPTIDES = make_random_peptides(1000, 9)
OTHER_PEPTIDES = make_random_peptides(1000, 9)
TRANSCRIPTS = [
"transcript-%d" % i
......@@ -66,6 +71,15 @@ PEPTIDES_DF["hit"] = [
]
print("Hit rate: %0.3f" % PEPTIDES_DF.hit.mean())
AA_COMPOSITION_DF = pandas.DataFrame({
'peptide': sorted(set(PEPTIDES).union(set(OTHER_PEPTIDES))),
})
for aa in sorted(common_amino_acid_letters):
AA_COMPOSITION_DF[aa] = AA_COMPOSITION_DF.peptide.str.count(aa)
AA_COMPOSITION_DF.index = AA_COMPOSITION_DF.peptide
del AA_COMPOSITION_DF['peptide']
HITS_DF = PEPTIDES_DF.ix[PEPTIDES_DF.hit].reset_index().copy()
del HITS_DF["hit"]
......@@ -89,7 +103,7 @@ def test_mhcflurry_trained_on_hits():
experiment_to_expression_group=EXPERIMENT_TO_EXPRESSION_GROUP,
transcripts=TRANSCIPTS_DF,
peptides_and_transcripts=PEPTIDES_AND_TRANSCRIPTS_DF,
random_peptides_for_percent_rank=make_random_peptides(10000, 9),
random_peptides_for_percent_rank=OTHER_PEPTIDES,
)
mhcflurry_model.fit(HITS_DF)
......@@ -112,32 +126,67 @@ def test_presentation_model():
experiment_to_expression_group=EXPERIMENT_TO_EXPRESSION_GROUP,
transcripts=TRANSCIPTS_DF,
peptides_and_transcripts=PEPTIDES_AND_TRANSCRIPTS_DF,
random_peptides_for_percent_rank=make_random_peptides(1000, 9),
random_peptides_for_percent_rank=OTHER_PEPTIDES,
)
aa_content_model = (
presentation_component_models.FixedPerPeptideQuantity(
"aa composition",
numpy.log1p(AA_COMPOSITION_DF)))
decoys = decoy_strategies.UniformRandom(
make_random_peptides(1000, 9),
OTHER_PEPTIDES,
decoys_per_hit=50)
terms = {
'A_ms': (
[mhcflurry_model],
["log1p(mhcflurry_basic_affinity)"]),
'P': (
[aa_content_model],
list(AA_COMPOSITION_DF.columns)),
}
for kwargs in [{}, {'ensemble_size': 6}]:
for kwargs in [{}, {'ensemble_size': 3}]:
models = presentation_model.build_presentation_models(
terms,
["A_ms"],
["A_ms", "A_ms + P"],
decoy_strategy=decoys,
**kwargs)
eq_(len(models), 1)
eq_(len(models), 2)
model = models["A_ms"]
unfit_model = models["A_ms"]
model = unfit_model.clone()
model.fit(HITS_DF.ix[HITS_DF.experiment_name == "exp1"])
peptides = PEPTIDES_DF.copy()
peptides["prediction"] = model.predict(peptides)
print(peptides)
print("Hit mean", peptides.prediction[peptides.hit].mean())
print("Decoy mean", peptides.prediction[~peptides.hit].mean())
assert_less(
peptides.prediction[~peptides.hit].mean(),
peptides.prediction[peptides.hit].mean())
model2 = pickle.loads(pickle.dumps(model))
assert_array_equal(
model.predict(peptides), model2.predict(peptides))
model3 = unfit_model.clone()
assert not model3.has_been_fit
model3.restore_fit(model2.get_fit())
assert_array_equal(
model.predict(peptides), model3.predict(peptides))
better_unfit_model = models["A_ms + P"]
model = better_unfit_model.clone()
model.fit(HITS_DF.ix[HITS_DF.experiment_name == "exp1"])
peptides["prediction_better"] = model.predict(peptides)
assert_less(
peptides.prediction_better[~peptides.hit].mean(),
peptides.prediction[~peptides.hit].mean())
assert_less(
peptides.prediction[peptides.hit].mean(),
peptides.prediction_better[peptides.hit].mean())
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment