From 5186ccf059c2c2ffad74429785d9999916d5d2e0 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Sat, 27 Jan 2018 16:15:58 -0500
Subject: [PATCH] Support calibrating percentile ranks in parallel
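
Add a worker_pool argument to
Class1AffinityPredictor.calibrate_percentile_ranks so that percent rank
calibration can be distributed across alleles using a
multiprocessing.Pool. The train_allele_specific_models command passes
its existing worker pool through and now shuts the pool down only after
calibration completes.

A minimal usage sketch (the pool size and allele list here are
illustrative, not part of this change):

    from multiprocessing import Pool

    from mhcflurry import Class1AffinityPredictor

    predictor = Class1AffinityPredictor.load()
    pool = Pool(processes=4)
    predictor.calibrate_percentile_ranks(
        alleles=["HLA-A*02:01", "HLA-B*57:01"],
        worker_pool=pool)
    pool.close()
    pool.join()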

---
 mhcflurry/class1_affinity_predictor.py        | 115 ++++++++++++++++++-
 .../train_allele_specific_models_command.py   |  30 ++---
 2 files changed, 125 insertions(+), 20 deletions(-)

diff --git a/mhcflurry/class1_affinity_predictor.py b/mhcflurry/class1_affinity_predictor.py
index 162a5fbe..10ac0e55 100644
--- a/mhcflurry/class1_affinity_predictor.py
+++ b/mhcflurry/class1_affinity_predictor.py
@@ -9,10 +9,12 @@ from os.path import join, exists
 from os import mkdir
 from socket import gethostname
 from getpass import getuser
+from functools import partial
 
 import mhcnames
 import numpy
 import pandas
+import tqdm  # progress bars
 from numpy.testing import assert_equal
 from six import string_types
 
@@ -588,7 +590,8 @@ class Class1AffinityPredictor(object):
             num_peptides_per_length=int(1e5),
             alleles=None,
             bins=None,
-            quiet=False):
+            quiet=False,
+            worker_pool=None):
         """
         Compute the cumulative distribution of ic50 values for a set of alleles
         over a large universe of random peptides, to enable computing quantiles in
@@ -948,3 +951,113 @@ class Class1AffinityPredictor(object):
         ]
         loaded.close()
         return weights
+
+    def calibrate_percentile_ranks(
+            self,
+            peptides=None,
+            num_peptides_per_length=int(1e5),
+            alleles=None,
+            bins=None,
+            worker_pool=None):
+        """
+        Compute the cumulative distribution of ic50 values for a set of alleles
+        over a large universe of random peptides, to enable computing quantiles in
+        this distribution later.
+
+        Parameters
+        ----------
+        peptides : sequence of string, optional
+            Peptides to use
+        num_peptides_per_length : int, optional
+            If peptides argument is not specified, then num_peptides_per_length
+            peptides are randomly sampled from a uniform distribution for each
+            supported length
+        alleles : sequence of string, optional
+            Alleles to perform calibration for. If not specified, all
+            supported alleles will be calibrated.
+        bins : object
+            Anything that can be passed to numpy.histogram's "bins" argument
+            can be used here, i.e. either an integer or a sequence giving bin
+            edges. This is in ic50 space.
+        worker_pool : multiprocessing.Pool, optional
+            If specified, alleles are calibrated in parallel using this pool.
+        """
+        if bins is None:
+            bins = to_ic50(numpy.linspace(1, 0, 1000))
+
+        if alleles is None:
+            alleles = self.supported_alleles
+
+        if peptides is None:
+            peptides = []
+            lengths = range(
+                self.supported_peptide_lengths[0],
+                self.supported_peptide_lengths[1] + 1)
+            for length in lengths:
+                peptides.extend(
+                    random_peptides(num_peptides_per_length, length))
+
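+        # Wrap the peptides in an EncodableSequences object so that
+        # encodings can be computed once and cached.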
+        encoded_peptides = EncodableSequences.create(peptides)
+
+        if worker_pool and len(alleles) > 1:
+            # Run in parallel
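+            # Each task calibrates a single allele and returns a dict
+            # keyed by allele, so results can be merged in any order.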
+            do_work = partial(
+                _calibrate_percentile_ranks,
+                predictor=self,
+                peptides=encoded_peptides,
+                bins=bins)
+            list_of_singleton_alleles = [[allele] for allele in alleles]
+            results = worker_pool.imap_unordered(
+                do_work, list_of_singleton_alleles, chunksize=1)
+
+            # Add progress bar
+            results = tqdm.tqdm(results, ascii=True, total=len(alleles))
+
+            # Merge results
+            for partial_dict in results:
+                self.allele_to_percent_rank_transform.update(partial_dict)
+        else:
+            # Run in serial
+            self.allele_to_percent_rank_transform.update(
+                _calibrate_percentile_ranks(
+                    alleles=alleles,
+                    predictor=self,
+                    peptides=encoded_peptides,
+                    bins=bins))
+
+
+def _calibrate_percentile_ranks(alleles, predictor, peptides, bins):
+    """
+    Private helper function.
+
+    Parameters
+    ----------
+    alleles
+    predictor
+    peptides
+    bins
+
+    Returns
+    -------
+
+    """
+    result = {}
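+    # Predict the full peptide universe for each allele and fit an
+    # empirical IC50 -> percent rank transform to the distribution.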
+    for allele in alleles:
+        predictions = predictor.predict(peptides, allele=allele)
+        transform = PercentRankTransform()
+        transform.fit(predictions, bins=bins)
+        result[allele] = transform
+    return result
diff --git a/mhcflurry/train_allele_specific_models_command.py b/mhcflurry/train_allele_specific_models_command.py
index c52ca700..4cc6fe5f 100644
--- a/mhcflurry/train_allele_specific_models_command.py
+++ b/mhcflurry/train_allele_specific_models_command.py
@@ -215,36 +215,28 @@ def run(argv=sys.argv[1:]):
             # as it goes so no saving is required at the end.
             start = time.time()
             data_trained_on = 0
-            while work_items:
-                item = work_items.pop(0)
+
+            for _ in tqdm.trange(len(work_items)):
+                item = work_items.pop(0)  # pop to free memory as we go
                 work_predictor = work_entrypoint(item)
                 assert work_predictor is predictor
 
-                # When running in serial we try to estimate time remaining.
-                data_trained_on += len(item['data'])
-                progress = float(data_trained_on) / total_data_to_train_on
-                time_elapsed = time.time() - start
-                total_time = time_elapsed / progress
-                print(
-                    "Estimated total training time: %0.2f min, "
-                    "remaining: %0.2f min" % (
-                        total_time / 60,
-                        (total_time - time_elapsed) / 60))
-
-
-    if worker_pool:
-        worker_pool.close()
-        worker_pool.join()
-
     if args.percent_rank_calibration_num_peptides_per_length > 0:
         start = time.time()
         print("Performing percent rank calibration.")
         predictor.calibrate_percentile_ranks(
-            num_peptides_per_length=args.percent_rank_calibration_num_peptides_per_length)
+            num_peptides_per_length=args.percent_rank_calibration_num_peptides_per_length,
+            worker_pool=worker_pool)
         print("Finished calibrating percent ranks in %0.2f sec." % (
             time.time() - start))
         predictor.save(args.out_models_dir, model_names_to_write=[])
 
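+    # The pool is also used for percent rank calibration above, so it is
+    # shut down only after all work is complete.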
+    if worker_pool:
+        worker_pool.close()
+        worker_pool.join()
+
 
 def work_entrypoint(item):
     return process_work(**item)
-- 
GitLab