From 956efdba71ca92a8e7e2dbb58e82222429c2d3fb Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com> Date: Fri, 29 Apr 2016 12:45:52 -0400 Subject: [PATCH] adding test to make sure imputation improves predictive accuracy on a small allele, added allow_unknown_amino_acids to from_hyperparameters --- .travis.yml | 8 +++---- mhcflurry/class1_binding_predictor.py | 5 +++- ...rry-train-class1-allele-specific-models.py | 8 +------ test/test_imputation.py | 24 ++++++++++++++++++- 4 files changed, 32 insertions(+), 13 deletions(-) diff --git a/.travis.yml b/.travis.yml index 5cc2b664..bbd40a5a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -39,14 +39,14 @@ script: # download training data - script/download-kim-2013-dataset.sh - script/download-iedb.sh - # only install data for A*01:01 and A*02:01 for testing - - script/create-iedb-class1-dataset.py --alleles HLA-A*01:01 HLA-A*02:01 + # only install data for A*01:01, A*02:01, HLA-A*02:05 for testing + - script/create-iedb-class1-dataset.py --alleles HLA-A*01:01 HLA-A*02:01 HLA-A*02:05 - script/create-combined-class1-dataset.py - # only installing A0101 and A0201 for testing purposes and with very limited + # only installing A0101, A0201, A0205 for testing purposes and with very limited # training - > mhcflurry-train-class1-allele-specific-models.py - --alleles HLA-A0101 HLA-A0201 + --alleles HLA-A0101 HLA-A0201 HLA-A0205 --embedding-size 10 --hidden-layer-size 10 --training-epochs 100 diff --git a/mhcflurry/class1_binding_predictor.py b/mhcflurry/class1_binding_predictor.py index 366a5a40..28378df3 100644 --- a/mhcflurry/class1_binding_predictor.py +++ b/mhcflurry/class1_binding_predictor.py @@ -91,7 +91,8 @@ class Class1BindingPredictor(PredictorBase): name=None, max_ic50=MAX_IC50, peptide_length=9, - embedding_input_dim=20, + n_amino_acids=20, + allow_unknown_amino_acids=True, embedding_output_dim=20, layer_sizes=[50], activation="tanh", @@ -104,6 +105,7 @@ class Class1BindingPredictor(PredictorBase): """ Create untrained predictor with the given hyperparameters. """ + embedding_input_dim = n_amino_acids + int(allow_unknown_amino_acids) model = make_embedding_network( peptide_length=peptide_length, embedding_input_dim=embedding_input_dim, @@ -119,6 +121,7 @@ class Class1BindingPredictor(PredictorBase): name=name, max_ic50=max_ic50, model=model, + allow_unknown_amino_acids=allow_unknown_amino_acids, **kwargs) def _combine_training_data( diff --git a/script/mhcflurry-train-class1-allele-specific-models.py b/script/mhcflurry-train-class1-allele-specific-models.py index 7a1475e9..fddf9241 100755 --- a/script/mhcflurry-train-class1-allele-specific-models.py +++ b/script/mhcflurry-train-class1-allele-specific-models.py @@ -159,18 +159,12 @@ if __name__ == "__main__": name=allele_name, peptide_length=9, max_ic50=args.max_ic50, - # 21 instead of 20 amino acids since we're also allowing - # an explicit unknown "X" - embedding_input_dim=21, embedding_output_dim=args.embedding_size, layer_sizes=(args.hidden_layer_size,), activation=args.activation, init=args.initialization, dropout_probability=args.dropout, - learning_rate=args.learning_rate, - # this argument isn't a model hyperparameter but gets passed on to - # the initializer method - allow_unknown_amino_acids=True) + learning_rate=args.learning_rate) json_filename = allele_name + ".json" json_path = join(args.output_dir, json_filename) diff --git a/test/test_imputation.py b/test/test_imputation.py index fcc2ccb7..23c3bbb2 100644 --- a/test/test_imputation.py +++ b/test/test_imputation.py @@ -1,7 +1,12 @@ from mhcflurry.imputation import ( create_imputed_datasets, ) -from mhcflurry.data import create_allele_data_from_peptide_to_ic50_dict +from mhcflurry.data import ( + create_allele_data_from_peptide_to_ic50_dict, + load_allele_datasets +) +from mhcflurry.paths import CLASS1_DATA_CSV_PATH +from mhcflurry import Class1BindingPredictor from fancyimpute import MICE from nose.tools import eq_ @@ -28,3 +33,20 @@ def test_create_imputed_datasets_two_alleles(): print(allele_name) print(allele_data) eq_(set(allele_data.peptides), expected_peptides) + +def test_performance_improves_for_A0205_with_pretraining(): + # test to make sure that imputation improves predictive accuracy after a + # small number of training iterations (5 epochs) + allele_data_dict = load_allele_datasets(CLASS1_DATA_CSV_PATH) + a0205_data_without_imputation = allele_data_dict["A0205"] + predictor_without_imputation = \ + Class1BindingPredictor.from_hyperparameters(name="A0205-no-impute") + predictor_without_imputation.fit( + X=a0205_data_without_imputation.X_index, + Y=a0205_data_without_imputation.Y) + + predictor_with_imputation = \ + Class1BindingPredictor.from_hyperparameters(name="A0205-impute") + predictor_with_imputation.fit( + X=a0205_data_without_imputation.X_index, + Y=a0205_data_without_imputation.Y) -- GitLab