From 21dc2918db86c6b9937f7caecb1bad480c031dbe Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Tue, 16 Jul 2019 12:23:28 -0400
Subject: [PATCH] Pretraining: min-epoch bound, explicit early stopping, looping data iterator

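Pretraining previously relied on keras.callbacks.EarlyStopping, which gave
no way to require a minimum number of epochs before stopping or to throttle
progress output. Replace that callback with an explicit epoch-by-epoch loop
in fit_generator() supporting min_epochs, an optional progress_callback, and
rate-limited progress printing. Also loop the pretraining data iterator over
its input file indefinitely so it never runs dry mid-training, request an
exclusive GPU in the LSF submit header, and reduce verbosity in the HPC
generation script.

The new pretraining bounds live in the 'train_data' hyperparameters section;
for example, the values now set in generate_hyperparameters.py include:

    'train_data': {
        'pretrain': True,
        'pretrain_patience': 2,
        'pretrain_min_epochs': 5,
        'pretrain_max_epochs': 50,
    },
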
---
 .../GENERATE.WITH_HPC_CLUSTER.sh              |  2 +-
 .../cluster_submit_script_header.mssm_hpc.lsf |  1 +
 .../generate_hyperparameters.py               |  5 +-
 mhcflurry/class1_neural_network.py            | 101 ++++++++++++++------
 mhcflurry/train_pan_allele_models_command.py  | 32 +++---
 test/expensive_test_pretrain_optimizable.py   |  5 +-
 6 files changed, 99 insertions(+), 47 deletions(-)

diff --git a/downloads-generation/models_class1_pan_unselected/GENERATE.WITH_HPC_CLUSTER.sh b/downloads-generation/models_class1_pan_unselected/GENERATE.WITH_HPC_CLUSTER.sh
index 84801026..2e4dc910 100755
--- a/downloads-generation/models_class1_pan_unselected/GENERATE.WITH_HPC_CLUSTER.sh
+++ b/downloads-generation/models_class1_pan_unselected/GENERATE.WITH_HPC_CLUSTER.sh
@@ -42,7 +42,7 @@ do
         --hyperparameters hyperparameters.yaml \
         --out-models-dir models.${kind} \
         --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
-        --verbosity 1 \
+        --verbosity 0 \
         --cluster-parallelism \
         --cluster-submit-command bsub \
         --cluster-results-workdir ~/mhcflurry-scratch \
diff --git a/downloads-generation/models_class1_pan_unselected/cluster_submit_script_header.mssm_hpc.lsf b/downloads-generation/models_class1_pan_unselected/cluster_submit_script_header.mssm_hpc.lsf
index 16a8411d..f312bb07 100644
--- a/downloads-generation/models_class1_pan_unselected/cluster_submit_script_header.mssm_hpc.lsf
+++ b/downloads-generation/models_class1_pan_unselected/cluster_submit_script_header.mssm_hpc.lsf
@@ -2,6 +2,7 @@
 #BSUB -J mhcf-{work_item_num} # Job name
 #BSUB -P acc_nkcancer # allocation account or Unix group
 #BSUB -q gpu # queue
+#BSUB -R rusage[ngpus_excl_p=1]  # 1 exclusive GPU
 #BSUB -R span[hosts=1] # one node
 #BSUB -n 1 # number of compute cores
 #BSUB -W 36:00 # walltime in HH:MM
diff --git a/downloads-generation/models_class1_pan_unselected/generate_hyperparameters.py b/downloads-generation/models_class1_pan_unselected/generate_hyperparameters.py
index 12d5748c..47101408 100644
--- a/downloads-generation/models_class1_pan_unselected/generate_hyperparameters.py
+++ b/downloads-generation/models_class1_pan_unselected/generate_hyperparameters.py
@@ -44,9 +44,12 @@ base_hyperparameters = {
         'pretrain': True,
         'pretrain_peptides_per_epoch': 64,
         'pretrain_steps_per_epoch': 256,
-        'pretrain_patience': 5,
+        'pretrain_patience': 2,
         'pretrain_min_delta': 0.0001,
         'pretrain_max_val_loss': 0.10,
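+        # Bound the number of pretraining epochs from both sides.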
+        'pretrain_max_epochs': 50,
+        'pretrain_min_epochs': 5,
     },
     'validation_split': 0.1,
     'data_dependent_initialization_method': "lsuv",
diff --git a/mhcflurry/class1_neural_network.py b/mhcflurry/class1_neural_network.py
index 40efdbc2..55729a44 100644
--- a/mhcflurry/class1_neural_network.py
+++ b/mhcflurry/class1_neural_network.py
@@ -457,9 +457,13 @@ class Class1NeuralNetwork(object):
             validation_output_indices=None,
             steps_per_epoch=10,
             epochs=1000,
+            min_epochs=0,
             patience=10,
             min_delta=0.0,
-            verbose=1):
+            verbose=1,
+            progress_callback=None,
+            progress_preamble="",
+            progress_print_interval=5.0):
         """
         Fit using a generator. Does not support many of the features of fit(),
         such as random negative peptides.
@@ -569,31 +573,68 @@ class Class1NeuralNetwork(object):
                 verbose=verbose)
             iterator = itertools.chain([first_chunk], iterator)
 
-        def progress_update(epoch, logs):
-            if verbose:
-                print(
-                    "Cumulative training points:",
-                    mutable_generator_state['yielded_values'])
-
-        fit_history = network.fit_generator(
-            iterator,
-            steps_per_epoch=steps_per_epoch,
-            epochs=epochs,
-            use_multiprocessing=False,
-            workers=1,
-            validation_data=(validation_x_dict, validation_y_dict),
-            verbose=verbose,
-            callbacks=[
-                keras.callbacks.EarlyStopping(
-                    monitor="val_loss",
-                    patience=patience,
-                    min_delta=min_delta,
-                    verbose=verbose),
-                keras.callbacks.LambdaCallback(on_epoch_end=progress_update),
-            ]
-        )
-        for (key, value) in fit_history.history.items():
-            fit_info[key].extend(value)
+        min_val_loss_iteration = None
+        min_val_loss = None
+        last_progress_print = 0
+        epoch = 1
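+        # Early stopping is implemented manually here (rather than with
+        # keras.callbacks.EarlyStopping) so that min_epochs is honored and
+        # progress printing can be rate-limited.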
+        while True:
+            epoch_start_time = time.time()
+            fit_history = network.fit_generator(
+                iterator,
+                steps_per_epoch=steps_per_epoch,
+                initial_epoch=epoch - 1,
+                epochs=epoch,
+                use_multiprocessing=False,
+                workers=1,
+                validation_data=(validation_x_dict, validation_y_dict),
+                verbose=verbose,
+            )
+            epoch_time = time.time() - epoch_start_time
+            for (key, value) in fit_history.history.items():
+                fit_info[key].extend(value)
+            val_loss = fit_info['val_loss'][-1]
+
+            if min_val_loss is None or val_loss < min_val_loss - min_delta:
+                min_val_loss = val_loss
+                min_val_loss_iteration = epoch
+
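+            # Earliest epoch at which stopping is allowed: `patience` epochs
+            # past the best val_loss, clamped to [min_epochs, epochs].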
+            patience_epoch_threshold = min(
+                epochs, max(min_val_loss_iteration + patience, min_epochs))
+
+            progress_message = (
+                "epoch %3d / %3d [%0.2f sec.]: loss=%g val_loss=%g. Min val "
+                "loss (%g) at epoch %s. Cumulative training points: %d. "
+                "Earliest stop epoch: %d." % (
+                    epoch,
+                    epochs,
+                    epoch_time,
+                    fit_info['loss'][-1],
+                    val_loss,
+                    min_val_loss,
+                    min_val_loss_iteration,
+                    mutable_generator_state['yielded_values'],
+                    patience_epoch_threshold,
+                )).strip()
+
+            # Print progress at most once per progress_print_interval seconds.
+            if progress_print_interval is not None and (
+                    time.time() - last_progress_print > progress_print_interval):
+                print(progress_preamble, progress_message)
+                last_progress_print = time.time()
+
+            if progress_callback:
+                progress_callback()
+
+            if epoch >= patience_epoch_threshold:
+                if progress_print_interval is not None:
+                    print(progress_preamble, "STOPPING", progress_message)
+                break
+            epoch += 1
 
         fit_info["time"] = time.time() - start
         fit_info["num_points"] = mutable_generator_state["yielded_values"]
@@ -828,7 +869,6 @@ class Class1NeuralNetwork(object):
             y_dict_with_random_negatives['output'],
             **encode_y_kwargs)
 
-        val_losses = []
         min_val_loss_iteration = None
         min_val_loss = None
 
@@ -929,7 +969,6 @@
 
             if self.hyperparameters['validation_split']:
                 val_loss = fit_info['val_loss'][-1]
-                val_losses.append(val_loss)
 
                 if min_val_loss is None or (
                         val_loss < min_val_loss - self.hyperparameters['min_delta']):
@@ -944,11 +983,13 @@
                         if progress_print_interval is not None:
                             print((progress_preamble + " " +
                                 "Stopping at epoch %3d / %3d: loss=%g. "
-                                "Min val loss (%s) at epoch %s" % (
+                                "Min val loss (%g) at epoch %s" % (
                                     i,
                                     self.hyperparameters['max_epochs'],
                                     fit_info['loss'][-1],
-                                    str(min_val_loss),
+                                    (
+                                        min_val_loss if min_val_loss is not None
+                                        else numpy.nan),
                                     min_val_loss_iteration)).strip())
                         break
 
diff --git a/mhcflurry/train_pan_allele_models_command.py b/mhcflurry/train_pan_allele_models_command.py
index ca5a5a75..78c29da6 100644
--- a/mhcflurry/train_pan_allele_models_command.py
+++ b/mhcflurry/train_pan_allele_models_command.py
@@ -195,19 +195,22 @@ def pretrain_data_iterator(
         numpy.tile(usable_alleles, peptides_per_chunk),
         borrow_from=master_allele_encoding)
 
-    synthetic_iter = pandas.read_csv(
-        filename, index_col=0, chunksize=peptides_per_chunk)
-    for (k, df) in enumerate(synthetic_iter):
-        if len(df) != peptides_per_chunk:
-            continue
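+    # Loop over the synthetic measurements file indefinitely so the iterator
+    # never runs dry, no matter how many pretraining epochs are requested.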
+    while True:
+        synthetic_iter = pandas.read_csv(
+            filename, index_col=0, chunksize=peptides_per_chunk)
+        for (k, df) in enumerate(synthetic_iter):
+            if len(df) != peptides_per_chunk:
+                continue
 
-        df = df[usable_alleles]
-        encodable_peptides = EncodableSequences(
-            numpy.repeat(
-                df.index.values,
-                len(usable_alleles)))
+            df = df[usable_alleles]
+            encodable_peptides = EncodableSequences(
+                numpy.repeat(
+                    df.index.values,
+                    len(usable_alleles)))
 
-        yield (allele_encoding, encodable_peptides, df.stack().values)
+            yield (allele_encoding, encodable_peptides, df.stack().values)
 
 
 def run(argv=sys.argv[1:]):
@@ -493,8 +496,8 @@ def train_model(
         pretrain_min_delta = get_train_param("pretrain_min_delta", 0.0)
         pretrain_steps_per_epoch = get_train_param(
             "pretrain_steps_per_epoch", 10)
-        pretrain_max_epochs = get_train_param(
-            "pretrain_max_epochs", 1000)
+        pretrain_max_epochs = get_train_param("pretrain_max_epochs", 1000)
+        pretrain_min_epochs = get_train_param("pretrain_min_epochs", 0)
         pretrain_peptides_per_step = get_train_param(
             "pretrain_peptides_per_step", 1024)
         max_val_loss = get_train_param("pretrain_max_val_loss", None)
@@ -527,7 +530,10 @@ def train_model(
                 min_delta=pretrain_min_delta,
                 steps_per_epoch=pretrain_steps_per_epoch,
                 epochs=pretrain_max_epochs,
+                min_epochs=pretrain_min_epochs,
                 verbose=verbose,
+                progress_preamble=progress_preamble + "PRETRAIN",
+                progress_print_interval=progress_print_interval,
             )
             model.fit_info[-1].setdefault(
                 "training_info", {})["pretrain_attempt"] = attempt
diff --git a/test/expensive_test_pretrain_optimizable.py b/test/expensive_test_pretrain_optimizable.py
index 027a88bb..99f9497e 100644
--- a/test/expensive_test_pretrain_optimizable.py
+++ b/test/expensive_test_pretrain_optimizable.py
@@ -51,8 +51,9 @@ HYPERPARAMTERS = {
     'random_negative_match_distribution': True, 'random_negative_rate': 0.2,
     'train_data': {'pretrain': True,
                    'pretrain_max_epochs': 30,
-                   'pretrain_patience': 5,
-                   'pretrain_peptides_per_step': 32,
+                   'pretrain_min_epochs': 5,
+                   'pretrain_patience': 3,
+                   'pretrain_peptides_per_step': 8,
                    'pretrain_steps_per_epoch': 256},
     'validation_split': 0.1,
     'data_dependent_initialization_method': "lsuv",
-- 
GitLab