Support calibrating percentile ranks in parallel

5186ccf0 · Tim O'Donnell · 67f17c27 · 5186ccf0 · 5186ccf0
Commit 5186ccf0 authored 7 years ago by Tim O'Donnell
--- a/mhcflurry/class1_affinity_predictor.py
+++ b/mhcflurry/class1_affinity_predictor.py
@@ -9,10 +9,12 @@ from os.path import join, exists
 from os import mkdir
 from socket import gethostname
 from getpass import getuser
+from functools import partial

 import mhcnames
 import numpy
 import pandas
+import tqdm  # progress bars
 from numpy.testing import assert_equal
 from six import string_types

@@ -588,7 +590,8 @@ class Class1AffinityPredictor(object):
            num_peptides_per_length=int(1e5),
            alleles=None,
            bins=None,
-            quiet=False):
+            quiet=False,
+            worker_pool=None):
        """
        Compute the cumulative distribution of ic50 values for a set of alleles
        over a large universe of random peptides, to enable computing quantiles in
@@ -627,6 +630,8 @@ class Class1AffinityPredictor(object):
                peptides.extend(
                    random_peptides(num_peptides_per_length, length))

+
+
        if quiet:
            def msg(s):
                pass
@@ -948,3 +953,100 @@ class Class1AffinityPredictor(object):
        ]
        loaded.close()
        return weights
+
+    def calibrate_percentile_ranks(
+            self,
+            peptides=None,
+            num_peptides_per_length=int(1e5),
+            alleles=None,
+            bins=None,
+            worker_pool=None):
+        """
+        Compute the cumulative distribution of ic50 values for a set of alleles
+        over a large universe of random peptides, to enable computing quantiles in
+        this distribution later.
+
+        Parameters
+        ----------
+        peptides : sequence of string, optional
+            Peptides to use
+        num_peptides_per_length : int, optional
+            If peptides argument is not specified, then num_peptides_per_length
+            peptides are randomly sampled from a uniform distribution for each
+            supported length
+        alleles : sequence of string, optional
+            Alleles to perform calibration for. If not specified all supported
+            alleles will be calibrated.
+        bins : object
+            Anything that can be passed to numpy.histogram's "bins" argument
+            can be used here, i.e. either an integer or a sequence giving bin
+            edges. This is in ic50 space.
+        worker_pool : multiprocessing.Pool, optional
+            If specified multiple alleles will be calibrated in parallel
+        """
+        if bins is None:
+            bins = to_ic50(numpy.linspace(1, 0, 1000))
+
+        if alleles is None:
+            alleles = self.supported_alleles
+
+        if peptides is None:
+            peptides = []
+            lengths = range(
+                self.supported_peptide_lengths[0],
+                self.supported_peptide_lengths[1] + 1)
+            for length in lengths:
+                peptides.extend(
+                    random_peptides(num_peptides_per_length, length))
+
+        encoded_peptides = EncodableSequences.create(peptides)
+
+        if worker_pool and len(alleles) > 1:
+            # Run in parallel
+            do_work = partial(
+                _calibrate_percentile_ranks,
+                predictor=self,
+                peptides=encoded_peptides,
+                bins=bins)
+            list_of_singleton_alleles = [ [allele] for allele in alleles ]
+            results = worker_pool.imap_unordered(
+                do_work, list_of_singleton_alleles, chunksize=1)
+
+            # Add progress bar
+            results = tqdm.tqdm(results, ascii=True, total=len(alleles))
+
+            # Merge results
+            for partial_dict in results:
+                self.allele_to_percent_rank_transform.update(partial_dict)
+        else:
+            # Run in serial
+            self.allele_to_percent_rank_transform.update(
+                _calibrate_percentile_ranks(
+                    alleles=alleles,
+                    predictor=self,
+                    peptides=encoded_peptides,
+                    bins=bins))
+
+
+def _calibrate_percentile_ranks(alleles, predictor, peptides, bins):
+    """
+    Private helper function.
+
+    Parameters
+    ----------
+    alleles
+    predictor
+    peptides
+    bins
+
+    Returns
+    -------
+
+    """
+    result = {}
+    for (i, allele) in enumerate(alleles):
+        predictions = predictor.predict(peptides, allele=allele)
+        transform = PercentRankTransform()
+        transform.fit(predictions, bins=bins)
+        result[allele] = transform
+    return result
--- a/mhcflurry/train_allele_specific_models_command.py
+++ b/mhcflurry/train_allele_specific_models_command.py
@@ -215,36 +215,26 @@ def run(argv=sys.argv[1:]):
            # as it goes so no saving is required at the end.
            start = time.time()
            data_trained_on = 0
-            while work_items:
-                item = work_items.pop(0)
+
+            for _ in tqdm.trange(len(work_items)):
+                item = work_items.pop(0)  # want to keep freeing up memory
                work_predictor = work_entrypoint(item)
                assert work_predictor is predictor

-                # When running in serial we try to estimate time remaining.
-                data_trained_on += len(item['data'])
-                progress = float(data_trained_on) / total_data_to_train_on
-                time_elapsed = time.time() - start
-                total_time = time_elapsed / progress
-                print(
-                    "Estimated total training time: %0.2f min, "
-                    "remaining: %0.2f min" % (
-                        total_time / 60,
-                        (total_time - time_elapsed) / 60))
-
-
-    if worker_pool:
-        worker_pool.close()
-        worker_pool.join()
-
    if args.percent_rank_calibration_num_peptides_per_length > 0:
        start = time.time()
        print("Performing percent rank calibration.")
        predictor.calibrate_percentile_ranks(
-            num_peptides_per_length=args.percent_rank_calibration_num_peptides_per_length)
+            num_peptides_per_length=args.percent_rank_calibration_num_peptides_per_length,
+            worker_pool=worker_pool)
        print("Finished calibrating percent ranks in %0.2f sec." % (
            time.time() - start))
        predictor.save(args.out_models_dir, model_names_to_write=[])

+    if worker_pool:
+        worker_pool.close()
+        worker_pool.join()
+

 def work_entrypoint(item):
    return process_work(**item)