From 6d52392f4964420f6074376e3dddd50ca2c9886f Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com> Date: Mon, 18 Apr 2016 19:25:32 -0400 Subject: [PATCH] adding options to training script and updating to work with new predictor class --- .../class1_allele_specific_hyperparameters.py | 49 ++++++++++++++++++- .../train-class1-allele-specific-models.py | 37 ++++++-------- test/test_known_class1_epitopes.py | 0 3 files changed, 63 insertions(+), 23 deletions(-) create mode 100644 test/test_known_class1_epitopes.py diff --git a/mhcflurry/class1_allele_specific_hyperparameters.py b/mhcflurry/class1_allele_specific_hyperparameters.py index 6d2a4f2d..c39cc60f 100644 --- a/mhcflurry/class1_allele_specific_hyperparameters.py +++ b/mhcflurry/class1_allele_specific_hyperparameters.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -N_PRETRAIN_EPOCHS = 5 N_EPOCHS = 250 ACTIVATION = "tanh" INITIALIZATION_METHOD = "lecun_uniform" @@ -20,3 +19,51 @@ EMBEDDING_DIM = 32 HIDDEN_LAYER_SIZE = 200 DROPOUT_PROBABILITY = 0.25 MAX_IC50 = 50000.0 + +def add_hyperparameter_arguments_to_parser(parser): + """ + Extend an argument parser with the following options: + --training-epochs + --activation + --initialization + --embedding-size + --hidden-layer-size + --dropout + --max-ic50 + """ + parser.add_argument( + "--training-epochs", + default=N_EPOCHS, + help="Number of training epochs") + + parser.add_argument( + "--initialization", + default=INITIALIZATION_METHOD, + help="Initialization for neural network weights") + + parser.add_argument( + "--activation", + default=ACTIVATION, + help="Activation function for neural network layers") + + parser.add_argument( + "--embedding-size", + default=EMBEDDING_DIM, + help="Size of vector representations for embedding amino acids") + + parser.add_argument( + "--hidden-layer-size", + default=HIDDEN_LAYER_SIZE, + help="Size of hidden neural network layer") + + 
parser.add_argument( + "--dropout", + default=DROPOUT_PROBABILITY, + help="Dropout probability after neural network layers") + + parser.add_argument( + "--max-ic50", + default=MAX_IC50, + help="Largest IC50 represented by neural network output") + + return parser diff --git a/scripts/train-class1-allele-specific-models.py b/scripts/train-class1-allele-specific-models.py index 6342e05e..5b25db86 100755 --- a/scripts/train-class1-allele-specific-models.py +++ b/scripts/train-class1-allele-specific-models.py @@ -47,14 +47,7 @@ from mhcflurry.common import normalize_allele_name from mhcflurry.feedforward import make_network from mhcflurry.data_helpers import load_allele_datasets from mhcflurry.class1_allele_specific_hyperparameters import ( - N_PRETRAIN_EPOCHS, - N_EPOCHS, - ACTIVATION, - INITIALIZATION_METHOD, - EMBEDDING_DIM, - HIDDEN_LAYER_SIZE, - DROPOUT_PROBABILITY, - MAX_IC50 + add_hyperparameter_arguments_to_parser ) from mhcflurry.paths import ( CLASS1_MODEL_DIRECTORY, @@ -70,6 +63,7 @@ parser.add_argument( default=CLASS1_MODEL_DIRECTORY, help="Output directory for allele-specific predictor HDF weights files") + parser.add_argument( "--overwrite", default=False, @@ -87,6 +81,9 @@ parser.add_argument( help="Don't train predictors for alleles with fewer samples than this", type=int) +# add options for neural network hyperparameters +parser = add_hyperparameter_arguments_to_parser(parser) + if __name__ == "__main__": args = parser.parse_args() @@ -97,7 +94,7 @@ if __name__ == "__main__": args.binding_data_csv_path, peptide_length=9, binary_encoding=False, - max_ic50=MAX_IC50, + max_ic50=args.max_ic50, sep=",", peptide_column_name="peptide") @@ -107,18 +104,15 @@ if __name__ == "__main__": Y_all = np.concatenate([group.Y for group in allele_groups.values()]) print("Total Dataset size = %d" % len(Y_all)) - model = make_network( - input_size=9, - embedding_input_dim=20, - embedding_output_dim=EMBEDDING_DIM, - layer_sizes=(HIDDEN_LAYER_SIZE,), - activation=ACTIVATION, - 
init=INITIALIZATION_METHOD, - dropout_probability=DROPOUT_PROBABILITY) - print("Model config: %s" % (model.get_config(),)) - model.fit(X_all, Y_all, nb_epoch=N_PRETRAIN_EPOCHS) - old_weights = model.get_weights() for allele_name, allele_data in allele_groups.items(): + model = make_network( + input_size=9, + embedding_input_dim=20, + embedding_output_dim=args.embedding_size, + layer_sizes=(args.hidden_layer_size,), + activation=args.activation, + init=args.initialization, + dropout_probability=args.dropout) allele_name = normalize_allele_name(allele_name) if allele_name.isdigit(): print("Skipping allele %s" % (allele_name,)) @@ -147,11 +141,10 @@ if __name__ == "__main__": print("-- removing old weights file %s" % hdf_path) remove(hdf_path) - model.set_weights(old_weights) model.fit( allele_data.X, allele_data.Y, - nb_epoch=N_EPOCHS, + nb_epoch=args.training_epochs, show_accuracy=True) print("Saving model description for %s to %s" % ( allele_name, json_path)) diff --git a/test/test_known_class1_epitopes.py b/test/test_known_class1_epitopes.py new file mode 100644 index 00000000..e69de29b -- GitLab