From 5186ccf059c2c2ffad74429785d9999916d5d2e0 Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Sat, 27 Jan 2018 16:15:58 -0500 Subject: [PATCH] Support calibrating percentile ranks in parallel --- mhcflurry/class1_affinity_predictor.py | 104 +++++++++++++++++- .../train_allele_specific_models_command.py | 28 ++--- 2 files changed, 112 insertions(+), 20 deletions(-) diff --git a/mhcflurry/class1_affinity_predictor.py b/mhcflurry/class1_affinity_predictor.py index 162a5fbe..10ac0e55 100644 --- a/mhcflurry/class1_affinity_predictor.py +++ b/mhcflurry/class1_affinity_predictor.py @@ -9,10 +9,12 @@ from os.path import join, exists from os import mkdir from socket import gethostname from getpass import getuser +from functools import partial import mhcnames import numpy import pandas +import tqdm # progress bars from numpy.testing import assert_equal from six import string_types @@ -588,7 +590,8 @@ class Class1AffinityPredictor(object): num_peptides_per_length=int(1e5), alleles=None, bins=None, - quiet=False): + quiet=False, + worker_pool=None): """ Compute the cumulative distribution of ic50 values for a set of alleles over a large universe of random peptides, to enable computing quantiles in @@ -627,6 +630,8 @@ class Class1AffinityPredictor(object): peptides.extend( random_peptides(num_peptides_per_length, length)) + + if quiet: def msg(s): pass @@ -948,3 +953,100 @@ class Class1AffinityPredictor(object): ] loaded.close() return weights + + def calibrate_percentile_ranks( + self, + peptides=None, + num_peptides_per_length=int(1e5), + alleles=None, + bins=None, + worker_pool=None): + """ + Compute the cumulative distribution of ic50 values for a set of alleles + over a large universe of random peptides, to enable computing quantiles in + this distribution later. + + Parameters + ---------- + peptides : sequence of string, optional + Peptides to use + num_peptides_per_length : int, optional + If peptides argument is not specified, then num_peptides_per_length + peptides are randomly sampled from a uniform distribution for each + supported length + alleles : sequence of string, optional + Alleles to perform calibration for. If not specified all supported + alleles will be calibrated. + bins : object + Anything that can be passed to numpy.histogram's "bins" argument + can be used here, i.e. either an integer or a sequence giving bin + edges. This is in ic50 space. + worker_pool : multiprocessing.Pool, optional + If specified multiple alleles will be calibrated in parallel + """ + if bins is None: + bins = to_ic50(numpy.linspace(1, 0, 1000)) + + if alleles is None: + alleles = self.supported_alleles + + if peptides is None: + peptides = [] + lengths = range( + self.supported_peptide_lengths[0], + self.supported_peptide_lengths[1] + 1) + for length in lengths: + peptides.extend( + random_peptides(num_peptides_per_length, length)) + + encoded_peptides = EncodableSequences.create(peptides) + + if worker_pool and len(alleles) > 1: + # Run in parallel + do_work = partial( + _calibrate_percentile_ranks, + predictor=self, + peptides=encoded_peptides, + bins=bins) + list_of_singleton_alleles = [ [allele] for allele in alleles ] + results = worker_pool.imap_unordered( + do_work, list_of_singleton_alleles, chunksize=1) + + # Add progress bar + results = tqdm.tqdm(results, ascii=True, total=len(alleles)) + + # Merge results + for partial_dict in results: + self.allele_to_percent_rank_transform.update(partial_dict) + else: + # Run in serial + self.allele_to_percent_rank_transform.update( + _calibrate_percentile_ranks( + alleles=alleles, + predictor=self, + peptides=encoded_peptides, + bins=bins)) + + +def _calibrate_percentile_ranks(alleles, predictor, peptides, bins): + """ + Private helper function. + + Parameters + ---------- + alleles + predictor + peptides + bins + + Returns + ------- + + """ + result = {} + for (i, allele) in enumerate(alleles): + predictions = predictor.predict(peptides, allele=allele) + transform = PercentRankTransform() + transform.fit(predictions, bins=bins) + result[allele] = transform + return result diff --git a/mhcflurry/train_allele_specific_models_command.py b/mhcflurry/train_allele_specific_models_command.py index c52ca700..4cc6fe5f 100644 --- a/mhcflurry/train_allele_specific_models_command.py +++ b/mhcflurry/train_allele_specific_models_command.py @@ -215,36 +215,26 @@ def run(argv=sys.argv[1:]): # as it goes so no saving is required at the end. start = time.time() data_trained_on = 0 - while work_items: - item = work_items.pop(0) + + for _ in tqdm.trange(len(work_items)): + item = work_items.pop(0) # want to keep freeing up memory work_predictor = work_entrypoint(item) assert work_predictor is predictor - # When running in serial we try to estimate time remaining. - data_trained_on += len(item['data']) - progress = float(data_trained_on) / total_data_to_train_on - time_elapsed = time.time() - start - total_time = time_elapsed / progress - print( - "Estimated total training time: %0.2f min, " - "remaining: %0.2f min" % ( - total_time / 60, - (total_time - time_elapsed) / 60)) - - - if worker_pool: - worker_pool.close() - worker_pool.join() - if args.percent_rank_calibration_num_peptides_per_length > 0: start = time.time() print("Performing percent rank calibration.") predictor.calibrate_percentile_ranks( - num_peptides_per_length=args.percent_rank_calibration_num_peptides_per_length) + num_peptides_per_length=args.percent_rank_calibration_num_peptides_per_length, + worker_pool=worker_pool) print("Finished calibrating percent ranks in %0.2f sec." % ( time.time() - start)) predictor.save(args.out_models_dir, model_names_to_write=[]) + if worker_pool: + worker_pool.close() + worker_pool.join() + def work_entrypoint(item): return process_work(**item) -- GitLab