From 21dc2918db86c6b9937f7caecb1bad480c031dbe Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Tue, 16 Jul 2019 12:23:28 -0400
Subject: [PATCH] Rework pretrain early stopping; cycle pretrain data iterator

Replace the Keras EarlyStopping callback around fit_generator() with an
explicit per-epoch training loop supporting min_epochs, patience, and
throttled progress reporting; add pretrain_min_epochs and
pretrain_max_epochs hyperparameters; make pretrain_data_iterator() cycle
through the synthetic data file indefinitely; request an exclusive GPU in
the LSF submit header; and fix the "Min val loss" log formatting
(%s -> %g, guarding against None).
---
 .../GENERATE.WITH_HPC_CLUSTER.sh              |  2 +-
 .../cluster_submit_script_header.mssm_hpc.lsf |  1 +
 .../generate_hyperparameters.py               |  4 +-
 mhcflurry/class1_neural_network.py            | 96 +++++++++++++------
 mhcflurry/train_pan_allele_models_command.py  | 30 +++---
 test/expensive_test_pretrain_optimizable.py   |  5 +-
 6 files changed, 91 insertions(+), 47 deletions(-)

diff --git a/downloads-generation/models_class1_pan_unselected/GENERATE.WITH_HPC_CLUSTER.sh b/downloads-generation/models_class1_pan_unselected/GENERATE.WITH_HPC_CLUSTER.sh
index 84801026..2e4dc910 100755
--- a/downloads-generation/models_class1_pan_unselected/GENERATE.WITH_HPC_CLUSTER.sh
+++ b/downloads-generation/models_class1_pan_unselected/GENERATE.WITH_HPC_CLUSTER.sh
@@ -42,7 +42,7 @@ do
         --hyperparameters hyperparameters.yaml \
         --out-models-dir models.${kind} \
         --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
-        --verbosity 1 \
+        --verbosity 0 \
         --cluster-parallelism \
         --cluster-submit-command bsub \
         --cluster-results-workdir ~/mhcflurry-scratch \
diff --git a/downloads-generation/models_class1_pan_unselected/cluster_submit_script_header.mssm_hpc.lsf b/downloads-generation/models_class1_pan_unselected/cluster_submit_script_header.mssm_hpc.lsf
index 16a8411d..f312bb07 100644
--- a/downloads-generation/models_class1_pan_unselected/cluster_submit_script_header.mssm_hpc.lsf
+++ b/downloads-generation/models_class1_pan_unselected/cluster_submit_script_header.mssm_hpc.lsf
@@ -2,6 +2,7 @@
 #BSUB -J mhcf-{work_item_num} # Job name
 #BSUB -P acc_nkcancer # allocation account or Unix group
 #BSUB -q gpu # queue
+#BSUB -R rusage[ngpus_excl_p=1] # 1 exclusive GPU
 #BSUB -R span[hosts=1] # one node
 #BSUB -n 1 # number of compute cores
 #BSUB -W 36:00 # walltime in HH:MM
diff --git a/downloads-generation/models_class1_pan_unselected/generate_hyperparameters.py b/downloads-generation/models_class1_pan_unselected/generate_hyperparameters.py
index 12d5748c..47101408 100644
--- a/downloads-generation/models_class1_pan_unselected/generate_hyperparameters.py
+++ b/downloads-generation/models_class1_pan_unselected/generate_hyperparameters.py
@@ -44,9 +44,11 @@ base_hyperparameters = {
         'pretrain': True,
         'pretrain_peptides_per_epoch': 64,
         'pretrain_steps_per_epoch': 256,
-        'pretrain_patience': 5,
+        'pretrain_patience': 2,
         'pretrain_min_delta': 0.0001,
         'pretrain_max_val_loss': 0.10,
+        'pretrain_max_epochs': 50,
+        'pretrain_min_epochs': 5,
     },
     'validation_split': 0.1,
     'data_dependent_initialization_method': "lsuv",
diff --git a/mhcflurry/class1_neural_network.py b/mhcflurry/class1_neural_network.py
index 40efdbc2..55729a44 100644
--- a/mhcflurry/class1_neural_network.py
+++ b/mhcflurry/class1_neural_network.py
@@ -457,9 +457,13 @@ class Class1NeuralNetwork(object):
             validation_output_indices=None,
             steps_per_epoch=10,
             epochs=1000,
+            min_epochs=0,
             patience=10,
             min_delta=0.0,
-            verbose=1):
+            verbose=1,
+            progress_callback=None,
+            progress_preamble="",
+            progress_print_interval=5.0):
         """
         Fit using a generator. Does not support many of the features of
         fit(), such as random negative peptides.
@@ -569,31 +573,63 @@ class Class1NeuralNetwork(object):
                 verbose=verbose)
             iterator = itertools.chain([first_chunk], iterator)
 
-        def progress_update(epoch, logs):
-            if verbose:
-                print(
-                    "Cumulative training points:",
-                    mutable_generator_state['yielded_values'])
-
-        fit_history = network.fit_generator(
-            iterator,
-            steps_per_epoch=steps_per_epoch,
-            epochs=epochs,
-            use_multiprocessing=False,
-            workers=1,
-            validation_data=(validation_x_dict, validation_y_dict),
-            verbose=verbose,
-            callbacks=[
-                keras.callbacks.EarlyStopping(
-                    monitor="val_loss",
-                    patience=patience,
-                    min_delta=min_delta,
-                    verbose=verbose),
-                keras.callbacks.LambdaCallback(on_epoch_end=progress_update),
-            ]
-        )
-        for (key, value) in fit_history.history.items():
-            fit_info[key].extend(value)
+        min_val_loss_iteration = None
+        min_val_loss = None
+        last_progress_print = 0
+        epoch = 1
+        while True:
+            epoch_start_time = time.time()
+            fit_history = network.fit_generator(
+                iterator,
+                steps_per_epoch=steps_per_epoch,
+                initial_epoch=epoch - 1,
+                epochs=epoch,
+                use_multiprocessing=False,
+                workers=1,
+                validation_data=(validation_x_dict, validation_y_dict),
+                verbose=verbose,
+            )
+            epoch_time = time.time() - epoch_start_time
+            for (key, value) in fit_history.history.items():
+                fit_info[key].extend(value)
+            val_loss = fit_info['val_loss'][-1]
+
+            if min_val_loss is None or val_loss < min_val_loss - min_delta:
+                min_val_loss = val_loss
+                min_val_loss_iteration = epoch
+
+            patience_epoch_threshold = min(
+                epochs, max(min_val_loss_iteration + patience, min_epochs))
+
+            progress_message = (
+                "epoch %3d / %3d [%0.2f sec.]: loss=%g val_loss=%g. Min val "
+                "loss (%g) at epoch %s. Cumulative training points: %d. "
+                "Earliest stop epoch: %d." % (
+                    epoch,
+                    epochs,
+                    epoch_time,
+                    fit_info['loss'][-1],
+                    val_loss,
+                    min_val_loss,
+                    min_val_loss_iteration,
+                    mutable_generator_state['yielded_values'],
+                    patience_epoch_threshold,
+                )).strip()
+
+            # Print progress no more often than once every few seconds.
+            if progress_print_interval is not None and (
+                    time.time() - last_progress_print > progress_print_interval):
+                print(progress_preamble, progress_message)
+                last_progress_print = time.time()
+
+            if progress_callback:
+                progress_callback()
+
+            if epoch >= patience_epoch_threshold:
+                if progress_print_interval is not None:
+                    print(progress_preamble, "STOPPING", progress_message)
+                break
+            epoch += 1
 
         fit_info["time"] = time.time() - start
         fit_info["num_points"] = mutable_generator_state["yielded_values"]
@@ -828,7 +864,6 @@ class Class1NeuralNetwork(object):
                 y_dict_with_random_negatives['output'],
                 **encode_y_kwargs)
 
-        val_losses = []
         min_val_loss_iteration = None
         min_val_loss = None
 
@@ -929,7 +964,6 @@ class Class1NeuralNetwork(object):
 
             if self.hyperparameters['validation_split']:
                 val_loss = fit_info['val_loss'][-1]
-                val_losses.append(val_loss)
 
                 if min_val_loss is None or (
                         val_loss < min_val_loss - self.hyperparameters['min_delta']):
@@ -944,11 +978,13 @@ class Class1NeuralNetwork(object):
                     if progress_print_interval is not None:
                         print((progress_preamble + " " +
                             "Stopping at epoch %3d / %3d: loss=%g. "
" - "Min val loss (%s) at epoch %s" % ( + "Min val loss (%g) at epoch %s" % ( i, self.hyperparameters['max_epochs'], fit_info['loss'][-1], - str(min_val_loss), + ( + min_val_loss if min_val_loss is not None + else numpy.nan), min_val_loss_iteration)).strip()) break diff --git a/mhcflurry/train_pan_allele_models_command.py b/mhcflurry/train_pan_allele_models_command.py index ca5a5a75..78c29da6 100644 --- a/mhcflurry/train_pan_allele_models_command.py +++ b/mhcflurry/train_pan_allele_models_command.py @@ -195,19 +195,20 @@ def pretrain_data_iterator( numpy.tile(usable_alleles, peptides_per_chunk), borrow_from=master_allele_encoding) - synthetic_iter = pandas.read_csv( - filename, index_col=0, chunksize=peptides_per_chunk) - for (k, df) in enumerate(synthetic_iter): - if len(df) != peptides_per_chunk: - continue + while True: + synthetic_iter = pandas.read_csv( + filename, index_col=0, chunksize=peptides_per_chunk) + for (k, df) in enumerate(synthetic_iter): + if len(df) != peptides_per_chunk: + continue - df = df[usable_alleles] - encodable_peptides = EncodableSequences( - numpy.repeat( - df.index.values, - len(usable_alleles))) + df = df[usable_alleles] + encodable_peptides = EncodableSequences( + numpy.repeat( + df.index.values, + len(usable_alleles))) - yield (allele_encoding, encodable_peptides, df.stack().values) + yield (allele_encoding, encodable_peptides, df.stack().values) def run(argv=sys.argv[1:]): @@ -493,8 +494,8 @@ def train_model( pretrain_min_delta = get_train_param("pretrain_min_delta", 0.0) pretrain_steps_per_epoch = get_train_param( "pretrain_steps_per_epoch", 10) - pretrain_max_epochs = get_train_param( - "pretrain_max_epochs", 1000) + pretrain_max_epochs = get_train_param("pretrain_max_epochs", 1000) + pretrain_min_epochs = get_train_param("pretrain_min_epochs", 0) pretrain_peptides_per_step = get_train_param( "pretrain_peptides_per_step", 1024) max_val_loss = get_train_param("pretrain_max_val_loss", None) @@ -527,7 +528,10 @@ def train_model( min_delta=pretrain_min_delta, steps_per_epoch=pretrain_steps_per_epoch, epochs=pretrain_max_epochs, + min_epochs=pretrain_min_epochs, verbose=verbose, + progress_preamble=progress_preamble + "PRETRAIN", + progress_print_interval=progress_print_interval, ) model.fit_info[-1].setdefault( "training_info", {})["pretrain_attempt"] = attempt diff --git a/test/expensive_test_pretrain_optimizable.py b/test/expensive_test_pretrain_optimizable.py index 027a88bb..99f9497e 100644 --- a/test/expensive_test_pretrain_optimizable.py +++ b/test/expensive_test_pretrain_optimizable.py @@ -51,8 +51,9 @@ HYPERPARAMTERS = { 'random_negative_match_distribution': True, 'random_negative_rate': 0.2, 'train_data': {'pretrain': True, 'pretrain_max_epochs': 30, - 'pretrain_patience': 5, - 'pretrain_peptides_per_step': 32, + 'pretrain_min_epochs': 5, + 'pretrain_patience': 3, + 'pretrain_peptides_per_step': 8, 'pretrain_steps_per_epoch': 256}, 'validation_split': 0.1, 'data_dependent_initialization_method': "lsuv", -- GitLab