adding test to make sure imputation improves predictive accuracy on a small...

adding test to make sure imputation improves predictive accuracy on a small allele, added allow_unknown_amino_acids to from_hyperparameters

adding test to make sure imputation improves predictive accuracy on a small...
adding test to make sure imputation improves predictive accuracy on a small allele, added allow_unknown_amino_acids to from_hyperparameters
956efdba · Alex Rubinsteyn · e6bef3d1 · 956efdba · 956efdba · 956efdba
Commit 956efdba authored 8 years ago by Alex Rubinsteyn
--- a/.travis.yml
+++ b/.travis.yml
@@ -39,14 +39,14 @@ script:
  # download training data
  - script/download-kim-2013-dataset.sh
  - script/download-iedb.sh
-  # only install data for A*01:01 and A*02:01 for testing
-  - script/create-iedb-class1-dataset.py --alleles HLA-A*01:01 HLA-A*02:01
+  # only install data for A*01:01, A*02:01, and A*02:05 for testing
+  - script/create-iedb-class1-dataset.py --alleles HLA-A*01:01 HLA-A*02:01 HLA-A*02:05
  - script/create-combined-class1-dataset.py
-  # only installing A0101 and A0201 for testing purposes and with very limited
+  # only installing A0101, A0201, A0205 for testing purposes and with very limited
  # training
  - >
      mhcflurry-train-class1-allele-specific-models.py
-      --alleles HLA-A0101 HLA-A0201
+      --alleles HLA-A0101 HLA-A0201 HLA-A0205
      --embedding-size 10
      --hidden-layer-size 10
      --training-epochs 100

--- a/mhcflurry/class1_binding_predictor.py
+++ b/mhcflurry/class1_binding_predictor.py
@@ -91,7 +91,8 @@ class Class1BindingPredictor(PredictorBase):
            name=None,
            max_ic50=MAX_IC50,
            peptide_length=9,
-            embedding_input_dim=20,
+            n_amino_acids=20,
+            allow_unknown_amino_acids=True,
            embedding_output_dim=20,
            layer_sizes=[50],
            activation="tanh",
@@ -104,6 +105,7 @@ class Class1BindingPredictor(PredictorBase):
        """
        Create untrained predictor with the given hyperparameters.
        """
+        embedding_input_dim = n_amino_acids + int(allow_unknown_amino_acids)
        model = make_embedding_network(
            peptide_length=peptide_length,
            embedding_input_dim=embedding_input_dim,
@@ -119,6 +121,7 @@ class Class1BindingPredictor(PredictorBase):
            name=name,
            max_ic50=max_ic50,
            model=model,
+            allow_unknown_amino_acids=allow_unknown_amino_acids,
            **kwargs)

    def _combine_training_data(

--- a/script/mhcflurry-train-class1-allele-specific-models.py
+++ b/script/mhcflurry-train-class1-allele-specific-models.py
@@ -159,18 +159,12 @@ if __name__ == "__main__":
            name=allele_name,
            peptide_length=9,
            max_ic50=args.max_ic50,
-            # 21 instead of 20 amino acids since we're also allowing
-            # an explicit unknown "X"
-            embedding_input_dim=21,
            embedding_output_dim=args.embedding_size,
            layer_sizes=(args.hidden_layer_size,),
            activation=args.activation,
            init=args.initialization,
            dropout_probability=args.dropout,
-            learning_rate=args.learning_rate,
-            # this argument isn't a model hyperparameter but gets passed on to
-            # the initializer method
-            allow_unknown_amino_acids=True)
+            learning_rate=args.learning_rate)

        json_filename = allele_name + ".json"
        json_path = join(args.output_dir, json_filename)

--- a/test/test_imputation.py
+++ b/test/test_imputation.py
 from mhcflurry.imputation import (
    create_imputed_datasets,
 )
-from mhcflurry.data import create_allele_data_from_peptide_to_ic50_dict
+from mhcflurry.data import (
+    create_allele_data_from_peptide_to_ic50_dict,
+    load_allele_datasets
+)
+from mhcflurry.paths import CLASS1_DATA_CSV_PATH
+from mhcflurry import Class1BindingPredictor

 from fancyimpute import MICE
 from nose.tools import eq_
@@ -28,3 +33,20 @@ def test_create_imputed_datasets_two_alleles():
        print(allele_name)
        print(allele_data)
        eq_(set(allele_data.peptides), expected_peptides)
+
+def test_performance_improves_for_A0205_with_pretraining():
+    # Intended check: imputation should improve predictive accuracy after a
+    # few training epochs (5). TODO: fit on imputed data and compare both models.
+    allele_data_dict = load_allele_datasets(CLASS1_DATA_CSV_PATH)
+    a0205_data_without_imputation = allele_data_dict["A0205"]
+    predictor_without_imputation = \
+        Class1BindingPredictor.from_hyperparameters(name="A0205-no-impute")
+    predictor_without_imputation.fit(
+        X=a0205_data_without_imputation.X_index,
+        Y=a0205_data_without_imputation.Y)
+
+    predictor_with_imputation = \
+        Class1BindingPredictor.from_hyperparameters(name="A0205-impute")
+    predictor_with_imputation.fit(
+        X=a0205_data_without_imputation.X_index,
+        Y=a0205_data_without_imputation.Y)