From 6d52392f4964420f6074376e3dddd50ca2c9886f Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Mon, 18 Apr 2016 19:25:32 -0400
Subject: [PATCH] adding options to training script and updating to work with
 new predictor class

---
 .../class1_allele_specific_hyperparameters.py | 49 ++++++++++++++++++-
 .../train-class1-allele-specific-models.py    | 37 ++++++--------
 test/test_known_class1_epitopes.py            |  0
 3 files changed, 63 insertions(+), 23 deletions(-)
 create mode 100644 test/test_known_class1_epitopes.py

diff --git a/mhcflurry/class1_allele_specific_hyperparameters.py b/mhcflurry/class1_allele_specific_hyperparameters.py
index 6d2a4f2d..c39cc60f 100644
--- a/mhcflurry/class1_allele_specific_hyperparameters.py
+++ b/mhcflurry/class1_allele_specific_hyperparameters.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-N_PRETRAIN_EPOCHS = 5
 N_EPOCHS = 250
 ACTIVATION = "tanh"
 INITIALIZATION_METHOD = "lecun_uniform"
@@ -20,3 +19,51 @@ EMBEDDING_DIM = 32
 HIDDEN_LAYER_SIZE = 200
 DROPOUT_PROBABILITY = 0.25
 MAX_IC50 = 50000.0
+
+def add_hyperparameter_arguments_to_parser(parser):
+    """
+    Extend an argument parser with the following options:
+        --training-epochs
+        --activation
+        --initialization
+        --embedding-size
+        --hidden-layer-size
+        --dropout
+        --max-ic50
+    """
+    parser.add_argument(
+        "--training-epochs",
+        default=N_EPOCHS, type=int,
+        help="Number of training epochs")
+
+    parser.add_argument(
+        "--initialization",
+        default=INITIALIZATION_METHOD,
+        help="Initialization for neural network weights")
+
+    parser.add_argument(
+        "--activation",
+        default=ACTIVATION,
+        help="Activation function for neural network layers")
+
+    parser.add_argument(
+        "--embedding-size",
+        default=EMBEDDING_DIM, type=int,
+        help="Size of vector representations for embedding amino acids")
+
+    parser.add_argument(
+        "--hidden-layer-size",
+        default=HIDDEN_LAYER_SIZE, type=int,
+        help="Size of hidden neural network layer")
+
+    parser.add_argument(
+        "--dropout",
+        default=DROPOUT_PROBABILITY, type=float,
+        help="Dropout probability after neural network layers")
+
+    parser.add_argument(
+        "--max-ic50",
+        default=MAX_IC50, type=float,
+        help="Largest IC50 represented by neural network output")
+
+    return parser
diff --git a/scripts/train-class1-allele-specific-models.py b/scripts/train-class1-allele-specific-models.py
index 6342e05e..5b25db86 100755
--- a/scripts/train-class1-allele-specific-models.py
+++ b/scripts/train-class1-allele-specific-models.py
@@ -47,14 +47,7 @@ from mhcflurry.common import normalize_allele_name
 from mhcflurry.feedforward import make_network
 from mhcflurry.data_helpers import load_allele_datasets
 from mhcflurry.class1_allele_specific_hyperparameters import (
-    N_PRETRAIN_EPOCHS,
-    N_EPOCHS,
-    ACTIVATION,
-    INITIALIZATION_METHOD,
-    EMBEDDING_DIM,
-    HIDDEN_LAYER_SIZE,
-    DROPOUT_PROBABILITY,
-    MAX_IC50
+    add_hyperparameter_arguments_to_parser
 )
 from mhcflurry.paths import (
     CLASS1_MODEL_DIRECTORY,
@@ -70,6 +63,7 @@ parser.add_argument(
     default=CLASS1_MODEL_DIRECTORY,
     help="Output directory for allele-specific predictor HDF weights files")
 
+
 parser.add_argument(
     "--overwrite",
     default=False,
@@ -87,6 +81,9 @@ parser.add_argument(
     help="Don't train predictors for alleles with fewer samples than this",
     type=int)
 
+# add options for neural network hyperparameters
+parser = add_hyperparameter_arguments_to_parser(parser)
+
 if __name__ == "__main__":
     args = parser.parse_args()
 
@@ -97,7 +94,7 @@ if __name__ == "__main__":
         args.binding_data_csv_path,
         peptide_length=9,
         binary_encoding=False,
-        max_ic50=MAX_IC50,
+        max_ic50=args.max_ic50,
         sep=",",
         peptide_column_name="peptide")
 
@@ -107,18 +104,15 @@ if __name__ == "__main__":
     Y_all = np.concatenate([group.Y for group in allele_groups.values()])
     print("Total Dataset size = %d" % len(Y_all))
 
-    model = make_network(
-        input_size=9,
-        embedding_input_dim=20,
-        embedding_output_dim=EMBEDDING_DIM,
-        layer_sizes=(HIDDEN_LAYER_SIZE,),
-        activation=ACTIVATION,
-        init=INITIALIZATION_METHOD,
-        dropout_probability=DROPOUT_PROBABILITY)
-    print("Model config: %s" % (model.get_config(),))
-    model.fit(X_all, Y_all, nb_epoch=N_PRETRAIN_EPOCHS)
-    old_weights = model.get_weights()
     for allele_name, allele_data in allele_groups.items():
+        model = make_network(
+            input_size=9,
+            embedding_input_dim=20,
+            embedding_output_dim=args.embedding_size,
+            layer_sizes=(args.hidden_layer_size,),
+            activation=args.activation,
+            init=args.initialization,
+            dropout_probability=args.dropout)
         allele_name = normalize_allele_name(allele_name)
         if allele_name.isdigit():
             print("Skipping allele %s" % (allele_name,))
@@ -147,11 +141,10 @@ if __name__ == "__main__":
             print("-- removing old weights file %s" % hdf_path)
             remove(hdf_path)
 
-        model.set_weights(old_weights)
         model.fit(
             allele_data.X,
             allele_data.Y,
-            nb_epoch=N_EPOCHS,
+            nb_epoch=args.training_epochs,
             show_accuracy=True)
         print("Saving model description for %s to %s" % (
             allele_name, json_path))
diff --git a/test/test_known_class1_epitopes.py b/test/test_known_class1_epitopes.py
new file mode 100644
index 00000000..e69de29b
-- 
GitLab