diff --git a/downloads-generation/models_class1_pan_unselected/GENERATE.WITH_HPC_CLUSTER.sh b/downloads-generation/models_class1_pan_unselected/GENERATE.WITH_HPC_CLUSTER.sh
index 848010267a95868d94b188a31fd0748be38d180b..2e4dc9109bcba2571ac3340eff78f2748bc01d3b 100755
--- a/downloads-generation/models_class1_pan_unselected/GENERATE.WITH_HPC_CLUSTER.sh
+++ b/downloads-generation/models_class1_pan_unselected/GENERATE.WITH_HPC_CLUSTER.sh
@@ -42,7 +42,7 @@ do
         --hyperparameters hyperparameters.yaml \
         --out-models-dir models.${kind} \
         --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
-        --verbosity 1 \
+        --verbosity 0 \
         --cluster-parallelism \
         --cluster-submit-command bsub \
         --cluster-results-workdir ~/mhcflurry-scratch \
diff --git a/downloads-generation/models_class1_pan_unselected/cluster_submit_script_header.mssm_hpc.lsf b/downloads-generation/models_class1_pan_unselected/cluster_submit_script_header.mssm_hpc.lsf
index 16a8411daa08990bd3f030509118eda8bce75a87..f312bb0709afa97c395a627df568d06383cfb30a 100644
--- a/downloads-generation/models_class1_pan_unselected/cluster_submit_script_header.mssm_hpc.lsf
+++ b/downloads-generation/models_class1_pan_unselected/cluster_submit_script_header.mssm_hpc.lsf
@@ -2,6 +2,7 @@
 #BSUB -J mhcf-{work_item_num} # Job name
 #BSUB -P acc_nkcancer # allocation account or Unix group
 #BSUB -q gpu # queue
+#BSUB -R rusage[ngpus_excl_p=1] # 1 exclusive GPU
 #BSUB -R span[hosts=1] # one node
 #BSUB -n 1 # number of compute cores
 #BSUB -W 36:00 # walltime in HH:MM
diff --git a/downloads-generation/models_class1_pan_unselected/generate_hyperparameters.py b/downloads-generation/models_class1_pan_unselected/generate_hyperparameters.py
index 12d5748c329cb59aaf4f1d19d0d24effaf4a6007..471014080f062c980c5403bc515bd9a109c35189 100644
--- a/downloads-generation/models_class1_pan_unselected/generate_hyperparameters.py
+++ b/downloads-generation/models_class1_pan_unselected/generate_hyperparameters.py
@@ -44,9 +44,11 @@ base_hyperparameters = {
         'pretrain': True,
         'pretrain_peptides_per_epoch': 64,
         'pretrain_steps_per_epoch': 256,
-        'pretrain_patience': 5,
+        'pretrain_patience': 2,
         'pretrain_min_delta': 0.0001,
         'pretrain_max_val_loss': 0.10,
+        'pretrain_max_epochs': 50,
+        'pretrain_min_epochs': 5,
     },
     'validation_split': 0.1,
     'data_dependent_initialization_method': "lsuv",
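The two new keys interact with pretrain_patience through the stopping rule added to fit_generator in mhcflurry/class1_neural_network.py below: training may stop `patience` epochs after the best validation loss, but never before pretrain_min_epochs and never past pretrain_max_epochs. A minimal sketch of that arithmetic (the helper name is illustrative, not part of the codebase):

    def earliest_stop_epoch(best_epoch, patience, min_epochs, max_epochs):
        # Stop once the current epoch reaches this threshold: wait `patience`
        # epochs past the epoch with the best val_loss, but never stop before
        # `min_epochs` nor continue past `max_epochs`.
        return min(max_epochs, max(best_epoch + patience, min_epochs))

    # With the values above (patience=2, min_epochs=5, max_epochs=50): a best
    # val_loss at epoch 1 still trains through epoch 5; a best val_loss at
    # epoch 49 stops at the epoch-50 cap.
    assert earliest_stop_epoch(1, patience=2, min_epochs=5, max_epochs=50) == 5
    assert earliest_stop_epoch(49, patience=2, min_epochs=5, max_epochs=50) == 50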
diff --git a/mhcflurry/class1_neural_network.py b/mhcflurry/class1_neural_network.py
index 40efdbc22299616dd12b290a808f49850473a315..55729a44c0a973242e4d143637ece369858e4c1c 100644
--- a/mhcflurry/class1_neural_network.py
+++ b/mhcflurry/class1_neural_network.py
@@ -457,9 +457,13 @@ class Class1NeuralNetwork(object):
             validation_output_indices=None,
             steps_per_epoch=10,
             epochs=1000,
+            min_epochs=0,
             patience=10,
             min_delta=0.0,
-            verbose=1):
+            verbose=1,
+            progress_callback=None,
+            progress_preamble="",
+            progress_print_interval=5.0):
         """
         Fit using a generator. Does not support many of the features of fit(),
         such as random negative peptides.
@@ -569,31 +573,63 @@ class Class1NeuralNetwork(object):
                 verbose=verbose)
             iterator = itertools.chain([first_chunk], iterator)
 
-        def progress_update(epoch, logs):
-            if verbose:
-                print(
-                    "Cumulative training points:",
-                    mutable_generator_state['yielded_values'])
-
-        fit_history = network.fit_generator(
-            iterator,
-            steps_per_epoch=steps_per_epoch,
-            epochs=epochs,
-            use_multiprocessing=False,
-            workers=1,
-            validation_data=(validation_x_dict, validation_y_dict),
-            verbose=verbose,
-            callbacks=[
-                keras.callbacks.EarlyStopping(
-                    monitor="val_loss",
-                    patience=patience,
-                    min_delta=min_delta,
-                    verbose=verbose),
-                keras.callbacks.LambdaCallback(on_epoch_end=progress_update),
-            ]
-        )
-        for (key, value) in fit_history.history.items():
-            fit_info[key].extend(value)
+        min_val_loss_iteration = None
+        min_val_loss = None
+        last_progress_print = 0
+        epoch = 1
+        while True:
+            epoch_start_time = time.time()
+            fit_history = network.fit_generator(
+                iterator,
+                steps_per_epoch=steps_per_epoch,
+                initial_epoch=epoch - 1,
+                epochs=epoch,
+                use_multiprocessing=False,
+                workers=1,
+                validation_data=(validation_x_dict, validation_y_dict),
+                verbose=verbose,
+            )
+            epoch_time = time.time() - epoch_start_time
+            for (key, value) in fit_history.history.items():
+                fit_info[key].extend(value)
+            val_loss = fit_info['val_loss'][-1]
+
+            if min_val_loss is None or val_loss < min_val_loss - min_delta:
+                min_val_loss = val_loss
+                min_val_loss_iteration = epoch
+
+            patience_epoch_threshold = min(
+                epochs, max(min_val_loss_iteration + patience, min_epochs))
+
+            progress_message = (
+                "epoch %3d / %3d [%0.2f sec.]: loss=%g val_loss=%g. Min val "
+                "loss (%g) at epoch %s. Cumulative training points: %d. "
+                "Earliest stop epoch: %d." % (
+                    epoch,
+                    epochs,
+                    epoch_time,
+                    fit_info['loss'][-1],
+                    val_loss,
+                    min_val_loss,
+                    min_val_loss_iteration,
+                    mutable_generator_state['yielded_values'],
+                    patience_epoch_threshold,
+                )).strip()
+
+            # Print progress no more often than once every few seconds.
+            if progress_print_interval is not None and (
+                    time.time() - last_progress_print > progress_print_interval):
+                print(progress_preamble, progress_message)
+                last_progress_print = time.time()
+
+            if progress_callback:
+                progress_callback()
+
+            if epoch >= patience_epoch_threshold:
+                if progress_print_interval is not None:
+                    print(progress_preamble, "STOPPING", progress_message)
+                break
+            epoch += 1
 
         fit_info["time"] = time.time() - start
         fit_info["num_points"] = mutable_generator_state["yielded_values"]
@@ -828,7 +864,6 @@ class Class1NeuralNetwork(object):
                     y_dict_with_random_negatives['output'],
                     **encode_y_kwargs)
 
-        val_losses = []
         min_val_loss_iteration = None
         min_val_loss = None
 
@@ -929,7 +964,6 @@ class Class1NeuralNetwork(object):
 
             if self.hyperparameters['validation_split']:
                 val_loss = fit_info['val_loss'][-1]
-                val_losses.append(val_loss)
 
                 if min_val_loss is None or (
                         val_loss < min_val_loss - self.hyperparameters['min_delta']):
@@ -944,11 +978,13 @@ class Class1NeuralNetwork(object):
                     if progress_print_interval is not None:
                         print((progress_preamble + " " +
                             "Stopping at epoch %3d / %3d: loss=%g. "
-                            "Min val loss (%s) at epoch %s" % (
+                            "Min val loss (%g) at epoch %s" % (
                                 i,
                                 self.hyperparameters['max_epochs'],
                                 fit_info['loss'][-1],
-                                str(min_val_loss),
+                                (
+                                    min_val_loss if min_val_loss is not None
+                                    else numpy.nan),
                                 min_val_loss_iteration)).strip())
                     break
 
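The rewritten loop drops keras's EarlyStopping callback in favor of calling fit_generator one epoch at a time (via initial_epoch/epochs), so the min_epochs floor and the epochs cap can be enforced alongside patience, with rate-limited progress printing. A framework-free sketch of the same control flow, where the hypothetical run_epoch callable and simulated losses stand in for the keras call:

    import time

    def fit_with_manual_early_stopping(
            run_epoch, epochs=1000, min_epochs=0, patience=10, min_delta=0.0,
            progress_print_interval=5.0):
        # Run one epoch at a time and apply the combined
        # patience / min_epochs / max-epochs stopping rule.
        min_val_loss = None
        min_val_loss_iteration = None
        last_progress_print = 0
        epoch = 1
        while True:
            val_loss = run_epoch(epoch)  # stands in for network.fit_generator(...)
            if min_val_loss is None or val_loss < min_val_loss - min_delta:
                min_val_loss = val_loss
                min_val_loss_iteration = epoch
            threshold = min(
                epochs, max(min_val_loss_iteration + patience, min_epochs))
            # Print progress no more often than once every few seconds.
            if progress_print_interval is not None and (
                    time.time() - last_progress_print > progress_print_interval):
                print("epoch %3d: val_loss=%g. Min val loss (%g) at epoch %d. "
                      "Earliest stop epoch: %d." % (
                          epoch, val_loss, min_val_loss,
                          min_val_loss_iteration, threshold))
                last_progress_print = time.time()
            if epoch >= threshold:
                return (min_val_loss, min_val_loss_iteration)
            epoch += 1

    # Simulated val losses improve until epoch 4, then plateau; with
    # patience=3 the loop stops at epoch 7 (4 + 3), past min_epochs=5.
    losses = {1: 0.9, 2: 0.5, 3: 0.3, 4: 0.2}
    assert fit_with_manual_early_stopping(
        lambda e: losses.get(e, 0.25),
        epochs=50, min_epochs=5, patience=3) == (0.2, 4)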
" - "Min val loss (%s) at epoch %s" % ( + "Min val loss (%g) at epoch %s" % ( i, self.hyperparameters['max_epochs'], fit_info['loss'][-1], - str(min_val_loss), + ( + min_val_loss if min_val_loss is not None + else numpy.nan), min_val_loss_iteration)).strip()) break diff --git a/mhcflurry/train_pan_allele_models_command.py b/mhcflurry/train_pan_allele_models_command.py index ca5a5a753a376f62010826542b451e34bdfcdcf0..78c29da608131097071bff805d5ba42bbce9b2d5 100644 --- a/mhcflurry/train_pan_allele_models_command.py +++ b/mhcflurry/train_pan_allele_models_command.py @@ -195,19 +195,20 @@ def pretrain_data_iterator( numpy.tile(usable_alleles, peptides_per_chunk), borrow_from=master_allele_encoding) - synthetic_iter = pandas.read_csv( - filename, index_col=0, chunksize=peptides_per_chunk) - for (k, df) in enumerate(synthetic_iter): - if len(df) != peptides_per_chunk: - continue + while True: + synthetic_iter = pandas.read_csv( + filename, index_col=0, chunksize=peptides_per_chunk) + for (k, df) in enumerate(synthetic_iter): + if len(df) != peptides_per_chunk: + continue - df = df[usable_alleles] - encodable_peptides = EncodableSequences( - numpy.repeat( - df.index.values, - len(usable_alleles))) + df = df[usable_alleles] + encodable_peptides = EncodableSequences( + numpy.repeat( + df.index.values, + len(usable_alleles))) - yield (allele_encoding, encodable_peptides, df.stack().values) + yield (allele_encoding, encodable_peptides, df.stack().values) def run(argv=sys.argv[1:]): @@ -493,8 +494,8 @@ def train_model( pretrain_min_delta = get_train_param("pretrain_min_delta", 0.0) pretrain_steps_per_epoch = get_train_param( "pretrain_steps_per_epoch", 10) - pretrain_max_epochs = get_train_param( - "pretrain_max_epochs", 1000) + pretrain_max_epochs = get_train_param("pretrain_max_epochs", 1000) + pretrain_min_epochs = get_train_param("pretrain_min_epochs", 0) pretrain_peptides_per_step = get_train_param( "pretrain_peptides_per_step", 1024) max_val_loss = get_train_param("pretrain_max_val_loss", None) @@ -527,7 +528,10 @@ def train_model( min_delta=pretrain_min_delta, steps_per_epoch=pretrain_steps_per_epoch, epochs=pretrain_max_epochs, + min_epochs=pretrain_min_epochs, verbose=verbose, + progress_preamble=progress_preamble + "PRETRAIN", + progress_print_interval=progress_print_interval, ) model.fit_info[-1].setdefault( "training_info", {})["pretrain_attempt"] = attempt diff --git a/test/expensive_test_pretrain_optimizable.py b/test/expensive_test_pretrain_optimizable.py index 027a88bb1f0e83400e4019569c745147b9dcbfd8..99f9497eb22e87aa7242cf1fe76e5a1ee4b8a337 100644 --- a/test/expensive_test_pretrain_optimizable.py +++ b/test/expensive_test_pretrain_optimizable.py @@ -51,8 +51,9 @@ HYPERPARAMTERS = { 'random_negative_match_distribution': True, 'random_negative_rate': 0.2, 'train_data': {'pretrain': True, 'pretrain_max_epochs': 30, - 'pretrain_patience': 5, - 'pretrain_peptides_per_step': 32, + 'pretrain_min_epochs': 5, + 'pretrain_patience': 3, + 'pretrain_peptides_per_step': 8, 'pretrain_steps_per_epoch': 256}, 'validation_split': 0.1, 'data_dependent_initialization_method': "lsuv",