From f472d4b71a24f43fea7add5ffe62b3ac0dddfa00 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Fri, 9 Feb 2018 17:31:32 -0500
Subject: [PATCH] update comments

---
 mhcflurry/class1_affinity_predictor.py       | 18 +++++++---
 mhcflurry/class1_neural_network.py           | 29 +++++++++------
 .../train_allele_specific_models_command.py  | 35 ++++++++++++-------
 3 files changed, 56 insertions(+), 26 deletions(-)

diff --git a/mhcflurry/class1_affinity_predictor.py b/mhcflurry/class1_affinity_predictor.py
index 5cf8bf5f..aa41f0cf 100644
--- a/mhcflurry/class1_affinity_predictor.py
+++ b/mhcflurry/class1_affinity_predictor.py
@@ -400,7 +400,8 @@ class Class1AffinityPredictor(object):
             inequalities=None,
             models_dir_for_save=None,
             verbose=0,
-            progress_preamble=""):
+            progress_preamble="",
+            progress_print_interval=5.0):
         """
         Fit one or more allele specific predictors for a single allele using a
         single neural network architecture.
@@ -438,6 +439,9 @@ class Class1AffinityPredictor(object):
         progress_preamble : string
             Optional string of information to include in each progress update
 
+        progress_print_interval : float
+            How often (in seconds) to print progress. Set to None to disable.
+
         Returns
         -------
         list of `Class1NeuralNetwork`
@@ -488,7 +492,8 @@ class Class1AffinityPredictor(object):
                     model_num=model_num + 1,
                     n_models=n_models,
                     architecture_num=architecture_num + 1,
-                    n_architectures=n_architectures))
+                    n_architectures=n_architectures),
+                progress_print_interval=progress_print_interval)
 
             if n_architectures > 1:
                 # We require val_loss (i.e. a validation set) if we have
@@ -532,7 +537,8 @@ class Class1AffinityPredictor(object):
             inequalities,
             models_dir_for_save=None,
             verbose=1,
-            progress_preamble=""):
+            progress_preamble="",
+            progress_print_interval=5.0):
         """
         Fit one or more pan-allele predictors using a single neural network
         architecture.
@@ -568,6 +574,9 @@ class Class1AffinityPredictor(object):
         progress_preamble : string
             Optional string of information to include in each progress update
 
+        progress_print_interval : float
+            How often (in seconds) to print progress. Set to None to disable.
+
         Returns
         -------
         list of `Class1NeuralNetwork`
@@ -589,7 +598,8 @@ class Class1AffinityPredictor(object):
                 inequalities=inequalities,
                 allele_encoding=allele_encoding,
                 verbose=verbose,
-                progress_preamble=progress_preamble)
+                progress_preamble=progress_preamble,
+                progress_print_interval=progress_print_interval)
 
             model_name = self.model_name("pan-class1", i)
             self.class1_pan_allele_models.append(model)
diff --git a/mhcflurry/class1_neural_network.py b/mhcflurry/class1_neural_network.py
index 6ce50ffc..4b9e7878 100644
--- a/mhcflurry/class1_neural_network.py
+++ b/mhcflurry/class1_neural_network.py
@@ -418,7 +418,8 @@ class Class1NeuralNetwork(object):
             sample_weights=None,
             shuffle_permutation=None,
             verbose=1,
-            progress_preamble=""):
+            progress_preamble="",
+            progress_print_interval=5.0):
         """
         Fit the neural network.
 
@@ -454,6 +455,10 @@ class Class1NeuralNetwork(object):
         progress_preamble : string
             Optional string of information to include in each progress update
+
+        progress_print_interval : float
+            How often (in seconds) to print progress update. Set to None to
+            disable.
         """
         self.fit_num_points = len(peptides)
 
@@ -673,7 +678,10 @@ class Class1NeuralNetwork(object):
                 self.loss_history[key].extend(value)
 
             # Print progress no more often than once every few seconds.
-            if not last_progress_print or time.time() - last_progress_print > 5:
+            if progress_print_interval is not None and (
+                    not last_progress_print or (
+                        time.time() - last_progress_print
+                        > progress_print_interval)):
                 print((progress_preamble + " " +
                        "Epoch %3d / %3d: loss=%g. "
                        "Min val loss (%s) at epoch %s" % (
@@ -697,14 +705,15 @@ class Class1NeuralNetwork(object):
                     min_val_loss_iteration + self.hyperparameters['patience'])
 
                 if i > threshold:
-                    print((progress_preamble + " " +
-                           "Stopping at epoch %3d / %3d: loss=%g. "
-                           "Min val loss (%s) at epoch %s" % (
-                               i,
-                               self.hyperparameters['max_epochs'],
-                               self.loss_history['loss'][-1],
-                               str(min_val_loss),
-                               min_val_loss_iteration)).strip())
+                    if progress_print_interval is not None:
+                        print((progress_preamble + " " +
+                               "Stopping at epoch %3d / %3d: loss=%g. "
+                               "Min val loss (%s) at epoch %s" % (
+                                   i,
+                                   self.hyperparameters['max_epochs'],
+                                   self.loss_history['loss'][-1],
+                                   str(min_val_loss),
+                                   min_val_loss_iteration)).strip())
                     break
 
         self.fit_seconds = time.time() - start
diff --git a/mhcflurry/train_allele_specific_models_command.py b/mhcflurry/train_allele_specific_models_command.py
index 4b525127..9396230b 100644
--- a/mhcflurry/train_allele_specific_models_command.py
+++ b/mhcflurry/train_allele_specific_models_command.py
@@ -26,7 +26,7 @@ from .common import configure_logging, set_keras_backend
 # parallel, we use this global variable as a place to store data. Data that is
 # stored here before creating the thread pool will be inherited to the child
 # processes upon fork() call, allowing us to share large data with the workers
-# efficiently.
+# via shared memory.
 GLOBAL_DATA = {}
 
 
@@ -119,7 +119,11 @@ parser.add_argument(
     help="Keras backend. If not specified will use system default.")
 parser.add_argument(
     "--gpus",
-    type=int)
+    type=int,
+    metavar="N",
+    help="Number of GPUs to attempt to parallelize across. Requires running "
+    "in parallel.")
+
 
 def run(argv=sys.argv[1:]):
     global GLOBAL_DATA
@@ -174,28 +178,30 @@ def run(argv=sys.argv[1:]):
     predictor = Class1AffinityPredictor()
 
     if args.num_jobs[0] == 1:
-        # Serial run
+        # Serial run.
        print("Running in serial.")
         worker_pool = None
         if args.backend:
             set_keras_backend(args.backend)
     else:
+        # Parallel run.
         env_queue = None
         if args.gpus:
+            print("Attempting to round-robin assign each worker a GPU.")
+
+            # We assign each worker to a GPU using the CUDA_VISIBLE_DEVICES
+            # environment variable. To do this, we push environment variables
+            # onto a queue. Each worker reads a single item from the queue,
+            # which is a list of environment variables to set.
             next_device = itertools.cycle([
-                "%d" % num
-                for num in range(args.gpus)
+                "%d" % num for num in range(args.gpus)
             ])
-            queue_items = []
+            env_queue = Queue()
             for num in range(args.num_jobs[0]):
-                queue_items.append([
+                item = [
                     ("CUDA_VISIBLE_DEVICES", next(next_device)),
-                ])
-
-            print("Attempting to round-robin assign each worker a GPU", queue_items)
-            env_queue = Queue()
-            for item in queue_items:
+                ]
                 env_queue.put(item)
 
         worker_pool = Pool(
@@ -238,6 +244,7 @@ def run(argv=sys.argv[1:]):
             'data': None,  # subselect from GLOBAL_DATA["train_data"]
             'hyperparameters': hyperparameters,
             'verbose': args.verbosity,
+            'progress_print_interval': None if worker_pool else 5.0,
             'predictor': predictor if not worker_pool else None,
             'save_to': args.out_models_dir if not worker_pool else None,
         }
@@ -361,6 +368,7 @@ def train_model(
         data,
         hyperparameters,
         verbose,
+        progress_print_interval,
         predictor,
         save_to):
 
@@ -395,6 +403,7 @@ def train_model(
             if "measurement_inequality" in train_data.columns
             else None),
         models_dir_for_save=save_to,
         progress_preamble=progress_preamble,
+        progress_print_interval=progress_print_interval,
         verbose=verbose)
 
     if allele_num == 0 and model_group == 0:
@@ -427,6 +436,8 @@ def calibrate_percentile_ranks(allele, predictor, peptides=None):
 
 def worker_init(env_queue=None):
     global GLOBAL_DATA
+    # The env_queue provides a way for each worker to be configured with a
+    # specific set of environment variables. We use it to assign GPUs to workers.
     if env_queue:
         settings = env_queue.get()
         print("Setting: ", settings)
--
GitLab
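
Note on the new progress_print_interval behavior: the gate added in class1_neural_network.py prints progress at most once per interval and skips printing entirely when the interval is None (which the training command now passes for pool workers). The following is a standalone sketch of that throttling logic, not mhcflurry code; the run_epochs function and its timings are invented for illustration.

    import time

    def run_epochs(n_epochs, progress_preamble="", progress_print_interval=5.0):
        # Stand-in training loop; each iteration represents one epoch.
        last_progress_print = None
        for i in range(n_epochs):
            time.sleep(0.01)  # pretend to train for a bit
            # Same gate as the patch: never print when the interval is None,
            # otherwise print at most once per interval.
            if progress_print_interval is not None and (
                    not last_progress_print or (
                        time.time() - last_progress_print
                        > progress_print_interval)):
                print((progress_preamble + " Epoch %3d / %3d" % (
                    i + 1, n_epochs)).strip())
                last_progress_print = time.time()

    run_epochs(50, progress_preamble="demo", progress_print_interval=0.2)
    run_epochs(50, progress_print_interval=None)  # silent, as in pool workers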
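The GPU round-robin change in train_allele_specific_models_command.py is easier to read as straight code than as a diff. Below is a self-contained sketch of the same pattern: CUDA_VISIBLE_DEVICES values are cycled onto a queue and each pool worker pops one item in its initializer. The show_device helper and the GPU/job counts are invented for the example, and, like the command itself, it assumes the fork start method, since multiprocessing.Queue objects are shared with pool workers by inheritance rather than pickling.

    import itertools
    import os
    from multiprocessing import Pool, Queue

    def worker_init(env_queue=None):
        # Each worker pops one item: a list of (name, value) environment variables.
        if env_queue:
            settings = env_queue.get()
            print("Setting: ", settings)
            for name, value in settings:
                os.environ[name] = value

    def show_device(_):
        # Report which GPU this worker process was assigned.
        return os.getpid(), os.environ.get("CUDA_VISIBLE_DEVICES")

    if __name__ == "__main__":
        gpus, num_jobs = 2, 4
        next_device = itertools.cycle(["%d" % num for num in range(gpus)])
        env_queue = Queue()
        for _ in range(num_jobs):
            env_queue.put([("CUDA_VISIBLE_DEVICES", next(next_device))])
        with Pool(processes=num_jobs, initializer=worker_init,
                  initargs=(env_queue,)) as pool:
            print(pool.map(show_device, range(num_jobs)))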