"""Tests for Class1EnsembleMultiAllelePredictor and the ensemble train command."""
import tempfile
import shutil
import os
import json
from os.path import join
from os import mkdir
from numpy.testing import assert_allclose, assert_equal
from nose.tools import eq_
from mhcflurry.class1_allele_specific import scoring
from mhcflurry.measurement_collection import MeasurementCollection
from mhcflurry.class1_allele_specific_ensemble import train_command
from mhcflurry.dataset import Dataset
from mhcflurry.downloads import get_path
from mhcflurry \
.class1_allele_specific_ensemble \
.class1_ensemble_multi_allele_predictor import (
Class1EnsembleMultiAllelePredictor,
HYPERPARAMETER_DEFAULTS)
def test_basic():
    """
    End-to-end check of Class1EnsembleMultiAllelePredictor.

    Fits two identically configured ensembles on the 9-mer measurements for
    HLA-A0101 and HLA-A0205, checks that both record the same train / test /
    imputed data hashes in their manifests, requires an AUC above 0.85 when
    predicting the data used for fitting, and finally round-trips the first
    model through write_fit / load_fit and compares the restored model's
    configuration and predictions with the original.
    """
    model_hyperparameters = HYPERPARAMETER_DEFAULTS.models_grid(
        impute=[False, True],
        activation=["tanh"],
        embedding_output_dim=[16],
        dropout_probability=[.25],
        n_training_epochs=[20])

    model = Class1EnsembleMultiAllelePredictor(
        ensemble_size=3,
        hyperparameters_to_search=model_hyperparameters)
    print(model)

    dataset = Dataset.from_csv(get_path(
        "data_combined_iedb_kim2014", "combined_human_class1_dataset.csv"))

    # Boolean-mask row selection. `.loc` is used because the `.ix` indexer
    # no longer exists in current pandas releases.
    keep = (
        dataset._df.allele.isin(["HLA-A0101", "HLA-A0205"]) &
        (dataset._df.peptide.str.len() == 9))
    sub_dataset = Dataset(dataset._df.loc[keep])
    mc = MeasurementCollection.from_dataset(sub_dataset)

    print(model.description())
    print("Now fitting.")
    # `model` must be fitted here: its manifest and predictions are read below.
    model.fit(mc)

    # A second, identically configured ensemble fitted on the same data; its
    # manifest is compared against the first one's.
    model2 = Class1EnsembleMultiAllelePredictor(
        ensemble_size=3,
        hyperparameters_to_search=model_hyperparameters)
    model2.fit(mc)

    print(set(model.manifest_df.all_alleles_train_data_hash))
    assert_equal(
        set(model.manifest_df.all_alleles_train_data_hash),
        set(model2.manifest_df.all_alleles_train_data_hash))
    assert_equal(
        set(model.manifest_df.all_alleles_test_data_hash),
        set(model2.manifest_df.all_alleles_test_data_hash))
    assert_equal(
        set(model.manifest_df.all_alleles_imputed_data_hash),
        set(model2.manifest_df.all_alleles_imputed_data_hash))

    ic50_pred = model.predict(mc)
    ic50_true = mc.df.measurement_value

    scores = scoring.make_scores(ic50_true, ic50_pred)
    print(scores)
    assert scores['auc'] > 0.85, "Expected higher AUC"

    # test save and restore
    # The directory is created before the `try` so that the `finally` clause
    # never sees an unbound `tmpdir` if mkdtemp itself fails.
    tmpdir = tempfile.mkdtemp(prefix="mhcflurry-test")
    try:
        manifest_path = join(tmpdir, "manifest.csv")
        # NOTE(review): the write_fit / load_fit arguments were missing from
        # this file. They are assumed to be (manifest csv path, models
        # directory), mirroring the --out-manifest / --out-models pair of the
        # train command; an already-existing directory is passed. Confirm
        # against the predictor's signature.
        model.write_fit(manifest_path, tmpdir)
        model2 = Class1EnsembleMultiAllelePredictor.load_fit(
            manifest_path, tmpdir)
    finally:
        shutil.rmtree(tmpdir)

    eq_(model.ensemble_size, model2.ensemble_size)
    eq_(model.supported_alleles(), model2.supported_alleles())
    eq_(model.hyperparameters_to_search, model2.hyperparameters_to_search)

    # Predict with the *restored* model so the round trip is actually tested.
    ic50_pred2 = model2.predict(mc)
    assert_allclose(ic50_pred, ic50_pred2, rtol=1e-06)
def test_train_command():
    """
    Smoke-test train_command.run on a small hyperparameter grid.

    Writes the model grid to a JSON file inside a scratch directory, runs the
    train command for HLA-A0201 and HLA-A0301 with the local-threads backend,
    and passes if the command returns without raising. The scratch directory
    is removed afterwards whether or not the command succeeded.
    """
    base_temp_dir = tempfile.mkdtemp()
    try:
        temp_dir = join(base_temp_dir, "models_class1_allele_specific_single")
        mkdir(temp_dir)

        def write_json(payload, filename):
            """Dump payload as JSON to temp_dir/filename and return that path."""
            path = join(temp_dir, filename)
            with open(path, 'w') as fd:
                json.dump(payload, fd)
            return path

        models = HYPERPARAMETER_DEFAULTS.models_grid(
            impute=[False, True],
            activation=["tanh"],
            layer_sizes=[[4], [8]],
            embedding_output_dim=[16],
            dropout_probability=[.25],
            n_training_epochs=[10],
            imputer_args=[{"n_burn_in": 2, "n_imputations": 10}],
            impute_min_observations_per_peptide=[1],
            impute_min_observations_per_allele=[1],
        )
        print("Model selection will be over %d models" % len(models))

        bdata2009 = get_path(
            "data_kim2014", "bdata.2009.mhci.public.1.txt")

        # The output models directory is created up front and handed to the
        # command via --out-models.
        mkdir(join(temp_dir, "models"))

        args = [
            '--parallel-backend', 'local-threads',
            "--model-architectures", write_json(models, "models.json"),
            "--train-data", bdata2009,
            "--out-manifest", join(temp_dir, "models.csv"),
            "--out-models", join(temp_dir, "models"),
            "--alleles", "HLA-A0201", "HLA-A0301",
            "--verbose",
            "--num-local-threads", "1",
            "--ensemble-size", "2",
            "--target-tasks", "1",
        ]
        print("Running train command with args: %s " % str(args))
        train_command.run(args)
    finally:
        # mkdtemp leaves deletion to the caller. Cleanup is best-effort so a
        # removal problem never hides the real test failure.
        shutil.rmtree(base_temp_dir, ignore_errors=True)