diff --git a/.travis.yml b/.travis.yml index 5cc2b66481affc2e19978e3481a5dccc68e2f701..bbd40a5af803f3b0afd5331dba69063788ae2292 100644 --- a/.travis.yml +++ b/.travis.yml @@ -39,14 +39,14 @@ script: # download training data - script/download-kim-2013-dataset.sh - script/download-iedb.sh - # only install data for A*01:01 and A*02:01 for testing - - script/create-iedb-class1-dataset.py --alleles HLA-A*01:01 HLA-A*02:01 + # only install data for A*01:01, A*02:01 and A*02:05 for testing + - script/create-iedb-class1-dataset.py --alleles HLA-A*01:01 HLA-A*02:01 HLA-A*02:05 - script/create-combined-class1-dataset.py - # only installing A0101 and A0201 for testing purposes and with very limited + # only installing A0101, A0201, A0205 for testing purposes and with very limited # training - > mhcflurry-train-class1-allele-specific-models.py - --alleles HLA-A0101 HLA-A0201 + --alleles HLA-A0101 HLA-A0201 HLA-A0205 --embedding-size 10 --hidden-layer-size 10 --training-epochs 100 diff --git a/mhcflurry/class1_binding_predictor.py b/mhcflurry/class1_binding_predictor.py index 366a5a40b5850f33b3d89e246f04efa6e338f232..28378df3b3c3a58d328546d1a1cb09cce5cd9323 100644 --- a/mhcflurry/class1_binding_predictor.py +++ b/mhcflurry/class1_binding_predictor.py @@ -91,7 +91,8 @@ class Class1BindingPredictor(PredictorBase): name=None, max_ic50=MAX_IC50, peptide_length=9, - embedding_input_dim=20, + n_amino_acids=20, + allow_unknown_amino_acids=True, embedding_output_dim=20, layer_sizes=[50], activation="tanh", @@ -104,6 +105,7 @@ class Class1BindingPredictor(PredictorBase): """ Create untrained predictor with the given hyperparameters. 
""" + embedding_input_dim = n_amino_acids + int(allow_unknown_amino_acids) model = make_embedding_network( peptide_length=peptide_length, embedding_input_dim=embedding_input_dim, @@ -119,6 +121,7 @@ class Class1BindingPredictor(PredictorBase): name=name, max_ic50=max_ic50, model=model, + allow_unknown_amino_acids=allow_unknown_amino_acids, **kwargs) def _combine_training_data( diff --git a/script/mhcflurry-train-class1-allele-specific-models.py b/script/mhcflurry-train-class1-allele-specific-models.py index 7a1475e9d672ba62e612492ec6d78023b168865d..fddf924124f6bc3ecdf04210489b03f3f34d916f 100755 --- a/script/mhcflurry-train-class1-allele-specific-models.py +++ b/script/mhcflurry-train-class1-allele-specific-models.py @@ -159,18 +159,12 @@ if __name__ == "__main__": name=allele_name, peptide_length=9, max_ic50=args.max_ic50, - # 21 instead of 20 amino acids since we're also allowing - # an explicit unknown "X" - embedding_input_dim=21, embedding_output_dim=args.embedding_size, layer_sizes=(args.hidden_layer_size,), activation=args.activation, init=args.initialization, dropout_probability=args.dropout, - learning_rate=args.learning_rate, - # this argument isn't a model hyperparameter but gets passed on to - # the initializer method - allow_unknown_amino_acids=True) + learning_rate=args.learning_rate) json_filename = allele_name + ".json" json_path = join(args.output_dir, json_filename) diff --git a/test/test_imputation.py b/test/test_imputation.py index fcc2ccb7d6dc90e65df42d8607130d2a7f87ffa7..23c3bbb234f3cd55a0d3025c48a49dd62863af8f 100644 --- a/test/test_imputation.py +++ b/test/test_imputation.py @@ -1,7 +1,12 @@ from mhcflurry.imputation import ( create_imputed_datasets, ) -from mhcflurry.data import create_allele_data_from_peptide_to_ic50_dict +from mhcflurry.data import ( + create_allele_data_from_peptide_to_ic50_dict, + load_allele_datasets +) +from mhcflurry.paths import CLASS1_DATA_CSV_PATH +from mhcflurry import Class1BindingPredictor from fancyimpute 
import MICE from nose.tools import eq_ @@ -28,3 +33,20 @@ def test_create_imputed_datasets_two_alleles(): print(allele_name) print(allele_data) eq_(set(allele_data.peptides), expected_peptides) + +def test_performance_improves_for_A0205_with_pretraining(): + # TODO: should verify that imputation improves accuracy after 5 epochs, but + # both predictors are fit on the same non-imputed data and nothing is asserted + allele_data_dict = load_allele_datasets(CLASS1_DATA_CSV_PATH) + a0205_data_without_imputation = allele_data_dict["A0205"] + predictor_without_imputation = \ + Class1BindingPredictor.from_hyperparameters(name="A0205-no-impute") + predictor_without_imputation.fit( + X=a0205_data_without_imputation.X_index, + Y=a0205_data_without_imputation.Y) + + predictor_with_imputation = \ + Class1BindingPredictor.from_hyperparameters(name="A0205-impute") + predictor_with_imputation.fit( + X=a0205_data_without_imputation.X_index, + Y=a0205_data_without_imputation.Y)