From 038704d2d10a0f7414bbad4a173bcc32ba4e6faa Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Sun, 28 Jan 2018 22:50:30 -0500
Subject: [PATCH] specify parallelization for train and calibrate percentile
 ranks seperately

---
 .../models_class1/GENERATE.sh                 |  2 +-
 .../train_allele_specific_models_command.py   | 41 +++++++++++--------
 2 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/downloads-generation/models_class1/GENERATE.sh b/downloads-generation/models_class1/GENERATE.sh
index 79a48cf7..d2bce04f 100755
--- a/downloads-generation/models_class1/GENERATE.sh
+++ b/downloads-generation/models_class1/GENERATE.sh
@@ -37,7 +37,7 @@ time mhcflurry-class1-train-allele-specific-models \
     --out-models-dir models \
     --percent-rank-calibration-num-peptides-per-length 1000000 \
     --min-measurements-per-allele 75 \
-    --num-jobs 32
+    --num-jobs 32 8
 
 cp $SCRIPT_ABSOLUTE_PATH .
 bzip2 LOG.txt
diff --git a/mhcflurry/train_allele_specific_models_command.py b/mhcflurry/train_allele_specific_models_command.py
index 87c6beef..7894af14 100644
--- a/mhcflurry/train_allele_specific_models_command.py
+++ b/mhcflurry/train_allele_specific_models_command.py
@@ -103,11 +103,13 @@ parser.add_argument(
     default=0)
 parser.add_argument(
     "--num-jobs",
-    default=1,
+    default=[1],
     type=int,
     metavar="N",
+    nargs="+",
     help="Number of processes to parallelize training and percent rank "
-    "calibration over. Experimental. "
+    "calibration over, respectively. Experimental. If only one value is specified "
+    "then the same number of jobs is used for both phases."
     "Set to 1 for serial run. Set to 0 to use number of cores. Default: %(default)s.")
 parser.add_argument(
     "--backend",
@@ -169,15 +171,15 @@ def run(argv=sys.argv[1:]):
     GLOBAL_DATA["train_data"] = df
 
     predictor = Class1AffinityPredictor()
-    if args.num_jobs == 1:
+    if args.num_jobs[0] == 1:
         # Serial run
         print("Running in serial.")
         worker_pool = None
     else:
         worker_pool = Pool(
             processes=(
-                args.num_jobs
-                if args.num_jobs else None))
+                args.num_jobs[0]
+                if args.num_jobs[0] else None))
         print("Using worker pool: %s" % str(worker_pool))
 
     if args.out_models_dir and not os.path.exists(args.out_models_dir):
@@ -219,12 +221,19 @@ def run(argv=sys.argv[1:]):
 
     if worker_pool:
         print("Processing %d work items in parallel." % len(work_items))
-        predictors = list(
-            tqdm.tqdm(
-                worker_pool.imap_unordered(
-                    train_model_entrypoint, work_items, chunksize=1),
-                ascii=True,
-                total=len(work_items)))
+
+        # We sort here so the predictors are in order of hyperparameter set num.
+        # This is convenient so that the neural networks get merged for each
+        # allele in the same order.
+        predictors = [
+            predictor for (_, predictor)
+            in sorted(
+                tqdm.tqdm(
+                    worker_pool.imap_unordered(
+                        train_model_entrypoint, work_items, chunksize=1),
+                    ascii=True,
+                    total=len(work_items)))
+        ]
 
         print("Merging %d predictors fit in parallel." % (len(predictors)))
         predictor = Class1AffinityPredictor.merge([predictor] + predictors)
@@ -237,7 +246,7 @@ def run(argv=sys.argv[1:]):
         start = time.time()
         for _ in tqdm.trange(len(work_items)):
             item = work_items.pop(0)  # want to keep freeing up memory
-            work_predictor = train_model_entrypoint(item)
+            (_, work_predictor) = train_model_entrypoint(item)
             assert work_predictor is predictor
         assert not work_items
 
@@ -270,7 +279,7 @@ def run(argv=sys.argv[1:]):
             time.time() - start))
         print("Calibrating percent rank calibration for %d alleles." % len(alleles))
 
-        if args.num_jobs == 1:
+        if args.num_jobs[-1] == 1:
             # Serial run
             print("Running in serial.")
             worker_pool = None
@@ -287,8 +296,8 @@ def run(argv=sys.argv[1:]):
             GLOBAL_DATA["calibration_peptides"] = encoded_peptides
             worker_pool = Pool(
                 processes=(
-                    args.num_jobs
-                    if args.num_jobs else None))
+                    args.num_jobs[-1]
+                    if args.num_jobs[-1] else None))
             print("Using worker pool: %s" % str(worker_pool))
             results = worker_pool.imap_unordered(
                 partial(
@@ -374,7 +383,7 @@ def train_model(
               (hyperparameter_set_num + 1))
         model.network(borrow=True).summary()
 
-    return predictor
+    return (hyperparameter_set_num, predictor)
 
 
 def calibrate_percentile_ranks(allele, predictor, peptides=None):
-- 
GitLab