# Tests for training, saving/loading and querying Class1AffinityPredictor.
from nose.tools import eq_, assert_raises
from numpy import testing
from mhcflurry.downloads import get_path
# To hunt down a weird warning we were seeing in pandas.
def warn_with_traceback(message, category, filename, lineno, file=None, line=None):
log = file if hasattr(file,'write') else sys.stderr
traceback.print_stack(file=log)
log.write(warnings.formatwarning(message, category, filename, lineno, line))
warnings.showwarning = warn_with_traceback
def debug():
    """Print the predictor's prediction dataframe for the peptide under test.

    Reads ``predictor``, ``peptide`` and ``allele`` from the surrounding
    scope and asks for the individual model predictions to be included.
    Returns ``None``; it exists only for its printed output.
    """
    print("\n%s" % (
        predictor.predict_to_dataframe(
            peptides=[peptide],
            allele=allele,
            include_individual_model_predictions=True)))
# NOTE(review): these statements look like the tail of predict_and_check(),
# whose ``def`` line is not present in this view -- confirm against the file.
# Predict the single peptide; the tuple unpacking requires exactly one result.
(prediction,) = predictor.predict(allele=allele, peptides=[peptide])
# The prediction must lie within expected_range (inclusive at both ends).
# debug() sits in the assert message, so it is only called (and only prints)
# when a bound check fails.
assert prediction >= expected_range[0], (predictor, prediction, debug())
assert prediction <= expected_range[1], (predictor, prediction, debug())
# Train a two-model allele-specific ensemble for HLA-A*01:01, then check that
# the prediction for a fixed peptide stays in range for (1) the freshly
# trained predictor, (2) a save/load round trip of it, (3) a predictor built
# from only its first model, and (4) a save/load round trip of that one.
allele = "HLA-A*01:01"
df = pandas.read_csv(
    get_path(
        "data_curated", "curated_training_data.csv.bz2"))
# ``DataFrame.ix`` was removed in pandas 1.0; boolean masks belong in ``.loc``.
df = df.loc[
    (df.allele == allele) &
    (df.peptide.str.len() >= 8) &
    (df.peptide.str.len() <= 15)
]

hyperparameters = {
    "max_epochs": 100,
    "patience": 10,
    "early_stopping": True,
    "validation_split": 0.2,

    "random_negative_rate": 0.0,
    "random_negative_constant": 25,

    "peptide_amino_acid_encoding": "BLOSUM62",
    "use_embedding": False,
    "kmer_size": 15,
    "batch_normalization": False,
    "locally_connected_layers": [
        {
            "filters": 8,
            "activation": "tanh",
            "kernel_size": 3
        }
    ],
    "activation": "relu",
    "output_activation": "sigmoid",
    "layer_sizes": [
        32
    ],
    "random_negative_affinity_min": 20000.0,
    "random_negative_affinity_max": 50000.0,
    "dense_layer_l1_regularization": 0.001,
    "dropout_probability": 0.0
}

predictor = Class1AffinityPredictor()
predictor.fit_allele_specific_predictors(
    n_models=2,
    architecture_hyperparameters=hyperparameters,
    allele=allele,
    peptides=df.peptide.values,
    affinities=df.measurement_value.values,
)
predict_and_check("HLA-A*01:01", "EVDPIGHLY", predictor=predictor)

# Round trip through disk. The scratch directory is removed even when the
# check fails, so failing runs do not leave model files behind.
models_dir = tempfile.mkdtemp("_models")
print(models_dir)
try:
    predictor.save(models_dir)
    predictor2 = Class1AffinityPredictor.load(models_dir)
    predict_and_check("HLA-A*01:01", "EVDPIGHLY", predictor=predictor2)
finally:
    shutil.rmtree(models_dir)

# A predictor holding just the first trained model must also pass the check.
predictor3 = Class1AffinityPredictor(
    allele_to_allele_specific_models={
        allele: [predictor.allele_to_allele_specific_models[allele][0]]
    })
predict_and_check("HLA-A*01:01", "EVDPIGHLY", predictor=predictor3)

# Same disk round trip for the single-model predictor. Previously this second
# scratch directory was never deleted.
models_dir = tempfile.mkdtemp("_models")
print(models_dir)
try:
    predictor3.save(models_dir)
    predictor4 = Class1AffinityPredictor.load(models_dir)
    predict_and_check("HLA-A*01:01", "EVDPIGHLY", predictor=predictor4)
finally:
    shutil.rmtree(models_dir)
def test_class1_affinity_predictor_a0205_memorize_training_data():
    """Fit HLA-A*02:05 9-mers and check the ensemble reproduces its training data.

    The hyperparameters switch off early stopping, the validation split,
    L1 regularization and dropout, so the networks are free to memorize the
    training set. The test then checks that:

    * predictions on the training peptides match the measured values in log
      space (``rtol=0.2``, ``atol=0.2``);
    * after calibration, ``predict_to_dataframe`` returns a
      ``prediction_percentile`` column with no nulls;
    * an unsupported allele and peptides of unsupported length raise
      ``ValueError``, or give NaN when ``throw=False``.
    """
    # Memorize the dataset.
    hyperparameters = dict(
        activation="tanh",
        layer_sizes=[64],
        max_epochs=100,
        early_stopping=False,
        validation_split=0.0,
        locally_connected_layers=[],
        dense_layer_l1_regularization=0.0,
        dropout_probability=0.0)

    # Only a Class1AffinityPredictor is exercised in this test.
    allele = "HLA-A*02:05"

    df = pandas.read_csv(
        get_path(
            "data_curated", "curated_training_data.csv.bz2"))
    # ``DataFrame.ix`` was removed in pandas 1.0; boolean masks belong in
    # ``.loc``.
    # NOTE(review): the first filter was an empty ``df.ix[ ]`` in the text
    # this edit was made from. It is restored here as the allele filter (the
    # same ``df.allele == allele`` form used for HLA-A*01:01 elsewhere in
    # this file) -- confirm against version control.
    df = df.loc[
        (df.allele == allele) &
        (df.peptide.str.len() == 9) &
        (df.measurement_type == "quantitative") &
        (df.measurement_source == "kim2014")
    ]

    predictor = Class1AffinityPredictor()
    predictor.fit_allele_specific_predictors(
        n_models=2,
        architecture_hyperparameters=hyperparameters,
        allele=allele,
        peptides=df.peptide.values,
        affinities=df.measurement_value.values,
    )
    predictor.calibrate_percentile_ranks(num_peptides_per_length=1000)

    ic50_pred = predictor.predict(df.peptide.values, allele=allele)
    ic50_true = df.measurement_value.values
    eq_(len(ic50_pred), len(ic50_true))
    # Compare the values in log space.
    testing.assert_allclose(
        numpy.log(ic50_pred),
        numpy.log(ic50_true),
        rtol=0.2,
        atol=0.2)

    ic50_pred_df = predictor.predict_to_dataframe(
        df.peptide.values, allele=allele)
    print(ic50_pred_df)
    assert 'prediction_percentile' in ic50_pred_df.columns
    assert ic50_pred_df.prediction_percentile.isnull().sum() == 0

    ic50_pred_df2 = predictor.predict_to_dataframe(
        df.peptide.values,
        allele=allele,
        include_individual_model_predictions=True)
    print(ic50_pred_df2)

    # Test an unknown allele
    print("Starting unknown allele check")
    eq_(predictor.supported_alleles, [allele])
    ic50_pred = predictor.predict(
        df.peptide.values,
        allele="HLA-A*02:01",
        throw=False)
    assert numpy.isnan(ic50_pred).all()

    assert_raises(
        ValueError,
        predictor.predict,
        df.peptide.values,
        allele="HLA-A*02:01")

    # The failed lookups above must not have changed the supported alleles.
    eq_(predictor.supported_alleles, [allele])
    assert_raises(
        ValueError,
        predictor.predict,
        ["AAAAA"],  # too short
        allele=allele)
    assert_raises(
        ValueError,
        predictor.predict,
        ["AAAAAAAAAAAAAAAAAAAA"],  # too long
        allele=allele)
    # With throw=False only the unsupported lengths come back as NaN.
    ic50_pred = predictor.predict(
        ["AAAAA", "AAAAAAAAA", "AAAAAAAAAAAAAAAAAAAA"],
        allele=allele,
        throw=False)
    assert numpy.isnan(ic50_pred[0])
    assert not numpy.isnan(ic50_pred[1])
    assert numpy.isnan(ic50_pred[2])