Skip to content
Snippets Groups Projects
Commit 14cf5749 authored by Tim O'Donnell
Browse files

codes cleanup

parent ec75a7b6
No related branches found
No related tags found
No related merge requests found
......@@ -20,10 +20,11 @@ class PercentRankTransform(object):
assert self.bin_edges is None
assert len(values) > 0
(hist, self.bin_edges) = numpy.histogram(values, bins=self.n_bins)
self.cdf = numpy.ones(len(hist) + 2) * numpy.nan
self.cdf = numpy.ones(len(hist) + 3) * numpy.nan
self.cdf[0] = 0.0
self.cdf[1] = 0.0
self.cdf[-1] = 100.0
numpy.cumsum(hist / numpy.sum(hist) * 100.0, out=self.cdf[1:-1])
numpy.cumsum(hist / numpy.sum(hist) * 100.0, out=self.cdf[2:-1])
assert not numpy.isnan(self.cdf).any()
def transform(self, values):
......
......@@ -144,6 +144,8 @@ class MHCflurryTrainedOnHits(PresentationComponentModel):
def fit(self, hits_df):
assert 'experiment_name' in hits_df.columns
assert 'peptide' in hits_df.columns
if 'hit' in hits_df.columns:
assert (hits_df.hit == 1).all()
grouped = hits_df.groupby("experiment_name")
for (experiment_name, sub_df) in grouped:
......
......@@ -53,7 +53,7 @@ class PresentationComponentModel(object):
"""
raise NotImplementedError(str(self))
def clone_and_fit(self, peptides_df):
def clone_and_fit(self, hits_df):
"""
Clone the object and fit to given dataset with a weakref cache.
"""
......@@ -65,12 +65,12 @@ class PresentationComponentModel(object):
result = None
else:
key = dataframe_cryptographic_hash(
peptides_df[["experiment_name", "peptide"]])
hits_df[["experiment_name", "peptide"]])
result = self.cached_fits.get(key)
if result is None:
print("Cache miss in clone_and_fit: %s" % str(self))
result = self.clone()
result.fit(peptides_df)
result.fit(hits_df)
if self.cached_fits is not None:
self.cached_fits[key] = result
else:
......@@ -229,6 +229,9 @@ class PresentationComponentModel(object):
self.cached_predictions[cache_key] = return_value
return return_value
def fit_ensemble_and_predict(self, peptides_df):
    """
    Placeholder for fitting an ensemble and predicting on `peptides_df`.

    This base implementation does nothing useful: it always raises, so a
    caller learns immediately that the operation is not available on this
    object.

    Parameters
    ----------
    peptides_df : object
        Unused here; accepted so the signature is in place for overrides.

    Raises
    ------
    NotImplementedError
        Always. The message is str(self), which identifies the object
        the call was made on.
    """
    # `self` was missing from the signature, so calling this on an instance
    # failed with a TypeError about argument count instead of reaching the
    # intended NotImplementedError.
    raise NotImplementedError(str(self))
def reset_cache(self):
for key in PresentationComponentModel.cache_fields:
obj_type = type(getattr(self, key))
......
......@@ -321,7 +321,6 @@ class PresentationModel(object):
assert len(self.presentation_models_predictors) == \
len(self.trained_component_models)
# peptides_df = peptides_df.reset_index(drop=True)
prediction_cols = []
presentation_model_predictions = {}
zipped = enumerate(
......@@ -343,20 +342,6 @@ class PresentationModel(object):
x_df = self.evaluate_expressions(df)
assert_no_null(x_df)
"""
with pandas.option_context('mode.use_inf_as_null', True):
null_x = x_df.ix[x_df.isnull().sum(axis=1) > 0]
if len(null_x) > 0:
null_x = null_x.copy()
null_x["peptide"] = peptides_df.ix[null_x.index, "peptide"]
null_x["experiment_name"] = peptides_df.ix[
null_x.index, "experiment_name"
]
raise ValueError(
"Null values in features:\n%s\ninputs:\n%s" % (
str(null_x), str(df.iloc[null_x.index])))
"""
prediction_col = "Prediction (Model %d)" % (i + 1)
assert prediction_col not in presentation_model_predictions
presentation_model_predictions[prediction_col] = (
......@@ -370,7 +355,10 @@ class PresentationModel(object):
del presentation_model_predictions[prediction_cols[0]]
else:
presentation_model_predictions["Prediction"] = numpy.mean(
[presentation_model_predictions[col] for col in prediction_cols],
[
presentation_model_predictions[col]
for col in prediction_cols
],
axis=0)
return pandas.DataFrame(presentation_model_predictions)
......
from nose.tools import eq_, assert_less
import numpy
from numpy.testing import assert_almost_equal
import pandas
from mhcflurry import amino_acid
from mhcflurry.antigen_presentation import (
decoy_strategies,
percent_rank_transform,
presentation_component_models,
presentation_model)
......@@ -62,6 +64,7 @@ PEPTIDES_DF["hit"] = [
for _, row in
PEPTIDES_DF.iterrows()
]
print("Hit rate: %0.3f" % PEPTIDES_DF.hit.mean())
HITS_DF = PEPTIDES_DF.ix[PEPTIDES_DF.hit].reset_index().copy()
del HITS_DF["hit"]
......@@ -70,6 +73,14 @@ del HITS_DF["hit"]
# Tests
def test_percent_rank_transform():
    # Fit the transform on the integers 0..999, then check the percent
    # ranks returned for a handful of query values, including values
    # below and above the fitted range.
    transformer = percent_rank_transform.PercentRankTransform()
    training_values = numpy.arange(1000)
    transformer.fit(training_values)

    queries = [-2, 0, 50, 100, 2000]
    expected_ranks = [0.0, 0.0, 5.0, 10.0, 100.0]
    observed_ranks = transformer.transform(queries)
    assert_almost_equal(observed_ranks, expected_ranks)
def test_mhcflurry_trained_on_hits():
mhcflurry_model = presentation_component_models.MHCflurryTrainedOnHits(
"basic",
......@@ -100,11 +111,11 @@ def test_presentation_model():
experiment_to_expression_group=EXPERIMENT_TO_EXPRESSION_GROUP,
transcripts=TRANSCIPTS_DF,
peptides_and_transcripts=PEPTIDES_AND_TRANSCRIPTS_DF,
random_peptides_for_percent_rank=make_random_peptides(10000, 9),
random_peptides_for_percent_rank=make_random_peptides(1000, 9),
)
decoys = decoy_strategies.UniformRandom(
make_random_peptides(10000, 9),
make_random_peptides(1000, 9),
decoys_per_hit=50)
terms = {
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment