From ced183f7ca06f649864ac5d248d85d74b83880d9 Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Wed, 31 Jul 2019 10:58:11 -0400 Subject: [PATCH] fixes --- .../models_class1_pan/GENERATE.sh | 5 +- .../calibrate_percentile_ranks_command.py | 101 +++++++++++++++--- mhcflurry/class1_affinity_predictor.py | 74 ++++++++++++- mhcflurry/common.py | 14 +++ mhcflurry/local_parallelism.py | 2 +- 5 files changed, 176 insertions(+), 20 deletions(-) diff --git a/downloads-generation/models_class1_pan/GENERATE.sh b/downloads-generation/models_class1_pan/GENERATE.sh index c3aa7fad..2c2dd104 100755 --- a/downloads-generation/models_class1_pan/GENERATE.sh +++ b/downloads-generation/models_class1_pan/GENERATE.sh @@ -56,7 +56,10 @@ do time mhcflurry-calibrate-percentile-ranks \ --models-dir models.${kind} \ - --num-peptides-per-length 10000 \ + --match-amino-acid-distribution-data "$MODELS_DIR/train_data.csv.bz2" \ + --motif-summary \ + --num-peptides-per-length 100000 \ + --verbosity 1 \ --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1 done diff --git a/mhcflurry/calibrate_percentile_ranks_command.py b/mhcflurry/calibrate_percentile_ranks_command.py index 43cc9d86..5cd1334e 100644 --- a/mhcflurry/calibrate_percentile_ranks_command.py +++ b/mhcflurry/calibrate_percentile_ranks_command.py @@ -7,14 +7,20 @@ import signal import sys import time import traceback +import collections from functools import partial +import pandas +import numpy + from mhcnames import normalize_allele_name import tqdm # progress bar tqdm.monitor_interval = 0 # see https://github.com/tqdm/tqdm/issues/481 from .class1_affinity_predictor import Class1AffinityPredictor -from .common import configure_logging +from .encodable_sequences import EncodableSequences +from .common import configure_logging, random_peptides, amino_acid_distribution +from .amino_acid import BLOSUM62_MATRIX from .local_parallelism import ( add_local_parallelism_args, worker_pool_with_gpu_assignments_from_args, @@ -41,6 +47,11 @@ parser.add_argument( nargs="+", help="Alleles to calibrate percentile ranks for. If not specified all " "alleles are used") +parser.add_argument( + "--match-amino-acid-distribution-data", + help="Sample random peptides from the amino acid distribution of the " + "peptides listed in the supplied CSV file, which must have a 'peptide' " + "column. If not specified a uniform distribution is used.") parser.add_argument( "--num-peptides-per-length", type=int, @@ -48,6 +59,25 @@ parser.add_argument( default=int(1e5), help="Number of peptides per length to use to calibrate percent ranks. " "Default: %(default)s.") +parser.add_argument( + "--motif-summary", + default=False, + action="store_true", + help="Calculate motifs and length preferences for each allele") +parser.add_argument( + "--summary-top-peptide-fraction", + default=0.001, + type=float, + metavar="X", + help="The top X fraction of predictions (i.e. tightest binders) to use to " + "generate motifs and length preferences. Default: %(default)s") +parser.add_argument( + "--length-range", + default=(8, 15), + type=int, + nargs=2, + help="Min and max peptide length to calibrate, inclusive. " + "Default: %(default)s") parser.add_argument( "--verbosity", type=int, @@ -77,13 +107,26 @@ def run(argv=sys.argv[1:]): else: alleles = predictor.supported_alleles + distribution = None + if args.match_amino_acid_distribution_data: + distribution_peptides = pandas.read_csv( + args.match_amino_acid_distribution_data).peptide.unique() + distribution = amino_acid_distribution(distribution_peptides) + print("Using amino acid distribution:") + print(distribution) + start = time.time() - print("Performing percent rank calibration for %d alleles: encoding peptides" % ( + print("Percent rank calibration for %d alleles. Encoding peptides." % ( len(alleles))) - encoded_peptides = predictor.calibrate_percentile_ranks( - alleles=[], # don't actually do any calibration, just return peptides - num_peptides_per_length=args.num_peptides_per_length) + + peptides = [] + lengths = range(args.length_range[0], args.length_range[1] + 1) + for length in lengths: + peptides.extend( + random_peptides( + args.num_peptides_per_length, length, distribution=distribution)) + encoded_peptides = EncodableSequences.create(peptides) # Now we encode the peptides for each neural network, so the encoding # becomes cached. @@ -100,6 +143,12 @@ def run(argv=sys.argv[1:]): worker_pool = worker_pool_with_gpu_assignments_from_args(args) + constant_kwargs = { + 'motif_summary': args.motif_summary, + 'summary_top_peptide_fraction': args.summary_top_peptide_fraction, + 'verbose': args.verbosity > 0, + } + if worker_pool is None: # Serial run print("Running in serial.") @@ -107,20 +156,34 @@ def run(argv=sys.argv[1:]): calibrate_percentile_ranks( allele=allele, predictor=predictor, - peptides=encoded_peptides) + peptides=encoded_peptides, + **constant_kwargs, + ) for allele in alleles) else: # Parallel run results = worker_pool.imap_unordered( partial( partial(call_wrapped, calibrate_percentile_ranks), - predictor=predictor), + predictor=predictor, + **constant_kwargs), alleles, chunksize=1) - for result in tqdm.tqdm(results, total=len(alleles)): - predictor.allele_to_percent_rank_transform.update(result) - print("Done calibrating %d additional alleles." % len(alleles)) + summary_results_lists = collections.defaultdict(list) + for (transforms, summary_results) in tqdm.tqdm(results, total=len(alleles)): + predictor.allele_to_percent_rank_transform.update(transforms) + if summary_results is not None: + for (item, value) in summary_results.items(): + summary_results_lists[item].append(value) + print("Done calibrating %d alleles." % len(alleles)) + if summary_results_lists: + for (name, lst) in summary_results_lists.items(): + df = pandas.concat(lst, ignore_index=True) + predictor.metadata_dataframes[name] = df + print("Including summary result: %s" % name) + print(df) + predictor.save(args.models_dir, model_names_to_write=[]) percent_rank_calibration_time = time.time() - start @@ -134,19 +197,29 @@ def run(argv=sys.argv[1:]): print("Predictor written to: %s" % args.models_dir) -def calibrate_percentile_ranks(allele, predictor, peptides=None): +def calibrate_percentile_ranks( + allele, + predictor, + peptides=None, + motif_summary=False, + summary_top_peptide_fraction=0.001, + verbose=False): """ Private helper function. """ global GLOBAL_DATA if peptides is None: peptides = GLOBAL_DATA["calibration_peptides"] - predictor.calibrate_percentile_ranks( + summary_results = predictor.calibrate_percentile_ranks( peptides=peptides, - alleles=[allele]) - return { + alleles=[allele], + motif_summary=motif_summary, + summary_top_peptide_fraction=summary_top_peptide_fraction, + verbose=verbose) + transforms = { allele: predictor.allele_to_percent_rank_transform[allele], } + return (transforms, summary_results) if __name__ == '__main__': diff --git a/mhcflurry/class1_affinity_predictor.py b/mhcflurry/class1_affinity_predictor.py index 0c383739..1f2507e8 100644 --- a/mhcflurry/class1_affinity_predictor.py +++ b/mhcflurry/class1_affinity_predictor.py @@ -17,7 +17,7 @@ from numpy.testing import assert_equal from six import string_types from .class1_neural_network import Class1NeuralNetwork -from .common import random_peptides +from .common import random_peptides, positional_frequency_matrix from .downloads import get_default_class1_models_dir from .encodable_sequences import EncodableSequences from .percent_rank_transform import PercentRankTransform @@ -95,7 +95,8 @@ class Class1AffinityPredictor(object): if not allele_to_percent_rank_transform: allele_to_percent_rank_transform = {} self.allele_to_percent_rank_transform = allele_to_percent_rank_transform - self.metadata_dataframes = metadata_dataframes + self.metadata_dataframes = ( + dict(metadata_dataframes) if metadata_dataframes else {}) self._cache = {} assert isinstance( self.allele_to_allele_specific_models, dict) @@ -1153,7 +1154,10 @@ class Class1AffinityPredictor(object): peptides=None, num_peptides_per_length=int(1e5), alleles=None, - bins=None): + bins=None, + motif_summary=False, + summary_top_peptide_fraction=0.001, + verbose=False): """ Compute the cumulative distribution of ic50 values for a set of alleles over a large universe of random peptides, to enable computing quantiles in @@ -1196,13 +1200,75 @@ class Class1AffinityPredictor(object): encoded_peptides = EncodableSequences.create(peptides) + if motif_summary: + frequency_matrices = [] + length_distributions = [] + else: + frequency_matrices = None + length_distributions = None for (i, allele) in enumerate(alleles): + start = time.time() predictions = self.predict(encoded_peptides, allele=allele) + if verbose: + elapsed = time.time() - start + print( + "Generated %d predictions for allele %s in %0.2f sec: " + "%0.2f predictions / sec" % ( + len(encoded_peptides.sequences), + allele, + elapsed, + len(encoded_peptides.sequences) / elapsed)) transform = PercentRankTransform() transform.fit(predictions, bins=bins) self.allele_to_percent_rank_transform[allele] = transform - return encoded_peptides + if frequency_matrices is not None: + predictions_df = pandas.DataFrame({ + 'peptide': encoded_peptides.sequences, + 'prediction': predictions + }).drop_duplicates('peptide').set_index("peptide") + predictions_df["length"] = predictions_df.index.str.len() + for (length, sub_df) in predictions_df.groupby("length"): + selected = sub_df.prediction.nsmallest( + max( + int(len(sub_df) * summary_top_peptide_fraction), + 1)).index.values + matrix = positional_frequency_matrix(selected).reset_index() + original_columns = list(matrix.columns) + matrix["length"] = length + matrix["allele"] = allele + matrix = matrix[["allele", "length"] + original_columns] + frequency_matrices.append(matrix) + + # Length distribution + length_distribution = predictions_df.prediction.nsmallest( + max( + int(len(predictions_df) * summary_top_peptide_fraction), + 1)).index.str.len().value_counts() + length_distribution.index.name = "length" + length_distribution /= length_distribution.sum() + length_distribution = length_distribution.to_frame() + length_distribution.columns = ["fraction"] + length_distribution = length_distribution.reset_index() + length_distribution["allele"] = allele + length_distribution = length_distribution[ + ["allele", "length", "fraction"] + ].sort_values("length") + length_distributions.append(length_distribution) + + if frequency_matrices is not None: + frequency_matrices = pandas.concat( + frequency_matrices, ignore_index=True) + + if length_distributions is not None: + length_distributions = pandas.concat( + length_distributions, ignore_index=True) + + if motif_summary: + return { + 'frequency_matrices': frequency_matrices, + 'length_distributions': length_distributions, + } def filter_networks(self, predicate): """ diff --git a/mhcflurry/common.py b/mhcflurry/common.py index b5120b52..68a51084 100644 --- a/mhcflurry/common.py +++ b/mhcflurry/common.py @@ -147,3 +147,17 @@ def random_peptides(num, length=9, distribution=None): p=distribution.values, size=(int(num), int(length))) ] + + +def positional_frequency_matrix(peptides): + length = len(peptides[0]) + assert all(len(peptide) == length for peptide in peptides) + counts = pandas.DataFrame( + index=[a for a in amino_acid.BLOSUM62_MATRIX.index if a != 'X'], + columns=numpy.arange(1, length + 1), + ) + for i in range(length): + counts[i + 1] = pandas.Series([p[i] for p in peptides]).value_counts() + result = (counts / len(peptides)).fillna(0.0).T + result.index.name = 'position' + return result \ No newline at end of file diff --git a/mhcflurry/local_parallelism.py b/mhcflurry/local_parallelism.py index 5a3d311b..85806b71 100644 --- a/mhcflurry/local_parallelism.py +++ b/mhcflurry/local_parallelism.py @@ -18,7 +18,7 @@ def add_local_parallelism_args(parser): group.add_argument( "--num-jobs", - default=1, + default=0, type=int, metavar="N", help="Number of processes to parallelize training over. Experimental. " -- GitLab