Skip to content
Snippets Groups Projects
Commit ced183f7 authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

fixes

parent b7763e97
No related merge requests found
......@@ -56,7 +56,10 @@ do
time mhcflurry-calibrate-percentile-ranks \
--models-dir models.${kind} \
--num-peptides-per-length 10000 \
--match-amino-acid-distribution-data "$MODELS_DIR/train_data.csv.bz2" \
--motif-summary \
--num-peptides-per-length 100000 \
--verbosity 1 \
--num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1
done
......
......@@ -7,14 +7,20 @@ import signal
import sys
import time
import traceback
import collections
from functools import partial
import pandas
import numpy
from mhcnames import normalize_allele_name
import tqdm # progress bar
tqdm.monitor_interval = 0 # see https://github.com/tqdm/tqdm/issues/481
from .class1_affinity_predictor import Class1AffinityPredictor
from .common import configure_logging
from .encodable_sequences import EncodableSequences
from .common import configure_logging, random_peptides, amino_acid_distribution
from .amino_acid import BLOSUM62_MATRIX
from .local_parallelism import (
add_local_parallelism_args,
worker_pool_with_gpu_assignments_from_args,
......@@ -41,6 +47,11 @@ parser.add_argument(
nargs="+",
help="Alleles to calibrate percentile ranks for. If not specified all "
"alleles are used")
parser.add_argument(
"--match-amino-acid-distribution-data",
help="Sample random peptides from the amino acid distribution of the "
"peptides listed in the supplied CSV file, which must have a 'peptide' "
"column. If not specified a uniform distribution is used.")
parser.add_argument(
"--num-peptides-per-length",
type=int,
......@@ -48,6 +59,25 @@ parser.add_argument(
default=int(1e5),
help="Number of peptides per length to use to calibrate percent ranks. "
"Default: %(default)s.")
parser.add_argument(
"--motif-summary",
default=False,
action="store_true",
help="Calculate motifs and length preferences for each allele")
parser.add_argument(
"--summary-top-peptide-fraction",
default=0.001,
type=float,
metavar="X",
help="The top X fraction of predictions (i.e. tightest binders) to use to "
"generate motifs and length preferences. Default: %(default)s")
parser.add_argument(
"--length-range",
default=(8, 15),
type=int,
nargs=2,
help="Min and max peptide length to calibrate, inclusive. "
"Default: %(default)s")
parser.add_argument(
"--verbosity",
type=int,
......@@ -77,13 +107,26 @@ def run(argv=sys.argv[1:]):
else:
alleles = predictor.supported_alleles
distribution = None
if args.match_amino_acid_distribution_data:
distribution_peptides = pandas.read_csv(
args.match_amino_acid_distribution_data).peptide.unique()
distribution = amino_acid_distribution(distribution_peptides)
print("Using amino acid distribution:")
print(distribution)
start = time.time()
print("Performing percent rank calibration for %d alleles: encoding peptides" % (
print("Percent rank calibration for %d alleles. Encoding peptides." % (
len(alleles)))
encoded_peptides = predictor.calibrate_percentile_ranks(
alleles=[], # don't actually do any calibration, just return peptides
num_peptides_per_length=args.num_peptides_per_length)
peptides = []
lengths = range(args.length_range[0], args.length_range[1] + 1)
for length in lengths:
peptides.extend(
random_peptides(
args.num_peptides_per_length, length, distribution=distribution))
encoded_peptides = EncodableSequences.create(peptides)
# Now we encode the peptides for each neural network, so the encoding
# becomes cached.
......@@ -100,6 +143,12 @@ def run(argv=sys.argv[1:]):
worker_pool = worker_pool_with_gpu_assignments_from_args(args)
constant_kwargs = {
'motif_summary': args.motif_summary,
'summary_top_peptide_fraction': args.summary_top_peptide_fraction,
'verbose': args.verbosity > 0,
}
if worker_pool is None:
# Serial run
print("Running in serial.")
......@@ -107,20 +156,34 @@ def run(argv=sys.argv[1:]):
calibrate_percentile_ranks(
allele=allele,
predictor=predictor,
peptides=encoded_peptides)
peptides=encoded_peptides,
**constant_kwargs,
)
for allele in alleles)
else:
# Parallel run
results = worker_pool.imap_unordered(
partial(
partial(call_wrapped, calibrate_percentile_ranks),
predictor=predictor),
predictor=predictor,
**constant_kwargs),
alleles,
chunksize=1)
for result in tqdm.tqdm(results, total=len(alleles)):
predictor.allele_to_percent_rank_transform.update(result)
print("Done calibrating %d additional alleles." % len(alleles))
summary_results_lists = collections.defaultdict(list)
for (transforms, summary_results) in tqdm.tqdm(results, total=len(alleles)):
predictor.allele_to_percent_rank_transform.update(transforms)
if summary_results is not None:
for (item, value) in summary_results.items():
summary_results_lists[item].append(value)
print("Done calibrating %d alleles." % len(alleles))
if summary_results_lists:
for (name, lst) in summary_results_lists.items():
df = pandas.concat(lst, ignore_index=True)
predictor.metadata_dataframes[name] = df
print("Including summary result: %s" % name)
print(df)
predictor.save(args.models_dir, model_names_to_write=[])
percent_rank_calibration_time = time.time() - start
......@@ -134,19 +197,29 @@ def run(argv=sys.argv[1:]):
print("Predictor written to: %s" % args.models_dir)
def calibrate_percentile_ranks(allele, predictor, peptides=None):
def calibrate_percentile_ranks(
allele,
predictor,
peptides=None,
motif_summary=False,
summary_top_peptide_fraction=0.001,
verbose=False):
"""
Private helper function.
"""
global GLOBAL_DATA
if peptides is None:
peptides = GLOBAL_DATA["calibration_peptides"]
predictor.calibrate_percentile_ranks(
summary_results = predictor.calibrate_percentile_ranks(
peptides=peptides,
alleles=[allele])
return {
alleles=[allele],
motif_summary=motif_summary,
summary_top_peptide_fraction=summary_top_peptide_fraction,
verbose=verbose)
transforms = {
allele: predictor.allele_to_percent_rank_transform[allele],
}
return (transforms, summary_results)
if __name__ == '__main__':
......
......@@ -17,7 +17,7 @@ from numpy.testing import assert_equal
from six import string_types
from .class1_neural_network import Class1NeuralNetwork
from .common import random_peptides
from .common import random_peptides, positional_frequency_matrix
from .downloads import get_default_class1_models_dir
from .encodable_sequences import EncodableSequences
from .percent_rank_transform import PercentRankTransform
......@@ -95,7 +95,8 @@ class Class1AffinityPredictor(object):
if not allele_to_percent_rank_transform:
allele_to_percent_rank_transform = {}
self.allele_to_percent_rank_transform = allele_to_percent_rank_transform
self.metadata_dataframes = metadata_dataframes
self.metadata_dataframes = (
dict(metadata_dataframes) if metadata_dataframes else {})
self._cache = {}
assert isinstance( self.allele_to_allele_specific_models, dict)
......@@ -1153,7 +1154,10 @@ class Class1AffinityPredictor(object):
peptides=None,
num_peptides_per_length=int(1e5),
alleles=None,
bins=None):
bins=None,
motif_summary=False,
summary_top_peptide_fraction=0.001,
verbose=False):
"""
Compute the cumulative distribution of ic50 values for a set of alleles
over a large universe of random peptides, to enable computing quantiles in
......@@ -1196,13 +1200,75 @@ class Class1AffinityPredictor(object):
encoded_peptides = EncodableSequences.create(peptides)
if motif_summary:
frequency_matrices = []
length_distributions = []
else:
frequency_matrices = None
length_distributions = None
for (i, allele) in enumerate(alleles):
start = time.time()
predictions = self.predict(encoded_peptides, allele=allele)
if verbose:
elapsed = time.time() - start
print(
"Generated %d predictions for allele %s in %0.2f sec: "
"%0.2f predictions / sec" % (
len(encoded_peptides.sequences),
allele,
elapsed,
len(encoded_peptides.sequences) / elapsed))
transform = PercentRankTransform()
transform.fit(predictions, bins=bins)
self.allele_to_percent_rank_transform[allele] = transform
return encoded_peptides
if frequency_matrices is not None:
predictions_df = pandas.DataFrame({
'peptide': encoded_peptides.sequences,
'prediction': predictions
}).drop_duplicates('peptide').set_index("peptide")
predictions_df["length"] = predictions_df.index.str.len()
for (length, sub_df) in predictions_df.groupby("length"):
selected = sub_df.prediction.nsmallest(
max(
int(len(sub_df) * summary_top_peptide_fraction),
1)).index.values
matrix = positional_frequency_matrix(selected).reset_index()
original_columns = list(matrix.columns)
matrix["length"] = length
matrix["allele"] = allele
matrix = matrix[["allele", "length"] + original_columns]
frequency_matrices.append(matrix)
# Length distribution
length_distribution = predictions_df.prediction.nsmallest(
max(
int(len(predictions_df) * summary_top_peptide_fraction),
1)).index.str.len().value_counts()
length_distribution.index.name = "length"
length_distribution /= length_distribution.sum()
length_distribution = length_distribution.to_frame()
length_distribution.columns = ["fraction"]
length_distribution = length_distribution.reset_index()
length_distribution["allele"] = allele
length_distribution = length_distribution[
["allele", "length", "fraction"]
].sort_values("length")
length_distributions.append(length_distribution)
if frequency_matrices is not None:
frequency_matrices = pandas.concat(
frequency_matrices, ignore_index=True)
if length_distributions is not None:
length_distributions = pandas.concat(
length_distributions, ignore_index=True)
if motif_summary:
return {
'frequency_matrices': frequency_matrices,
'length_distributions': length_distributions,
}
def filter_networks(self, predicate):
"""
......
......@@ -147,3 +147,17 @@ def random_peptides(num, length=9, distribution=None):
p=distribution.values,
size=(int(num), int(length)))
]
def positional_frequency_matrix(peptides):
length = len(peptides[0])
assert all(len(peptide) == length for peptide in peptides)
counts = pandas.DataFrame(
index=[a for a in amino_acid.BLOSUM62_MATRIX.index if a != 'X'],
columns=numpy.arange(1, length + 1),
)
for i in range(length):
counts[i + 1] = pandas.Series([p[i] for p in peptides]).value_counts()
result = (counts / len(peptides)).fillna(0.0).T
result.index.name = 'position'
return result
\ No newline at end of file
......@@ -18,7 +18,7 @@ def add_local_parallelism_args(parser):
group.add_argument(
"--num-jobs",
default=1,
default=0,
type=int,
metavar="N",
help="Number of processes to parallelize training over. Experimental. "
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment