Skip to content
Snippets Groups Projects
Commit ba4b1cfb authored by Tim O'Donnell
Browse files

Updates pre cleavage -> processing rename

parent e6861c3f
No related merge requests found
......@@ -46,18 +46,21 @@ script:
$(mhcflurry-downloads url data_curated)
$(mhcflurry-downloads url data_mass_spec_annotated)
$(mhcflurry-downloads url models_class1)
$(mhcflurry-downloads url models_class1_presentation)
$(mhcflurry-downloads url models_class1_pan)
$(mhcflurry-downloads url models_class1_pan_variants)
$(mhcflurry-downloads url allele_sequences)
-P /tmp/downloads
- ls -lh /tmp/downloads
-
mhcflurry-downloads fetch
data_curated
data_mass_spec_annotated
models_class1
models_class1_pan
models_class1_pan_variants
allele_sequences
--already-downloaded-dir /tmp/downloads
data_curated
data_mass_spec_annotated
models_class1
models_class1_presentation
models_class1_pan
models_class1_pan_variants
allele_sequences
--already-downloaded-dir /tmp/downloads
- mhcflurry-downloads info # just to test this command works
- nosetests --with-timer -sv test
......@@ -892,11 +892,15 @@ class Class1AffinityPredictor(object):
numpy.array of float
"""
if allele is not None:
normalized_allele = mhcnames.normalize_allele_name(allele)
try:
transform = self.allele_to_percent_rank_transform[allele]
transform = self.allele_to_percent_rank_transform[normalized_allele]
return transform.transform(affinities)
except KeyError:
msg = "Allele %s has no percentile rank information" % allele
msg = "Allele %s has no percentile rank information" % (
allele + (
"" if allele == normalized_allele
else " (normalized to %s)" % normalized_allele))
if throw:
raise ValueError(msg)
warnings.warn(msg)
......
......@@ -58,9 +58,22 @@ class Class1PresentationPredictor(object):
dict(metadata_dataframes) if metadata_dataframes else {})
self._models_cache = {}
def get_affinity_predictions(
self, peptides, experiment_names, alleles, verbose=1):
@property
def supported_alleles(self):
return self.affinity_predictor.supported_alleles
@property
def supported_peptide_lengths(self):
return self.affinity_predictor.supported_peptide_lengths
def predict_affinity(
self,
peptides,
experiment_names,
alleles,
include_affinity_percentile=False,
verbose=1,
throw=True):
df = pandas.DataFrame({
"peptide": numpy.array(peptides, copy=False),
"experiment_name": numpy.array(experiment_names, copy=False),
......@@ -80,17 +93,25 @@ class Class1PresentationPredictor(object):
predictions_df[allele] = self.affinity_predictor.predict(
peptides=experiment_peptides,
allele=allele,
model_kwargs={'batch_size': PREDICT_BATCH_SIZE})
model_kwargs={'batch_size': PREDICT_BATCH_SIZE},
throw=throw)
df.loc[
sub_df.index, "tightest_affinity"
sub_df.index, "affinity"
] = predictions_df.min(1).values
df.loc[
sub_df.index, "tightest_affinity_allele"
sub_df.index, "best_allele"
] = predictions_df.idxmin(1).values
if include_affinity_percentile:
df.loc[sub_df.index, "affinity_percentile"] = (
self.affinity_predictor.percentile_ranks(
df.loc[sub_df.index, "affinity"].values,
alleles=df.loc[sub_df.index, "best_allele"].values,
throw=False))
return df
def get_cleavage_predictions(
def predict_cleavage(
self, peptides, n_flanks=None, c_flanks=None, verbose=1):
if verbose > 0:
......@@ -128,12 +149,12 @@ class Class1PresentationPredictor(object):
c_flanks=None,
verbose=1):
df = self.get_affinity_predictions(
df = self.predict_affinity(
peptides=peptides,
experiment_names=experiment_names,
alleles=alleles,
verbose=verbose)
df["affinity_score"] = from_ic50(df.tightest_affinity)
df["affinity_score"] = from_ic50(df.affinity)
df["target"] = numpy.array(targets, copy=False)
if (n_flanks is None) != (c_flanks is None):
......@@ -157,7 +178,7 @@ class Class1PresentationPredictor(object):
if verbose > 0:
print("Training variant", model_name)
df["cleavage_prediction"] = self.get_cleavage_predictions(
df["cleavage_prediction"] = self.predict_cleavage(
peptides=df.peptide.values,
n_flanks=n_flanks if with_flanks else None,
c_flanks=c_flanks if with_flanks else None,
......@@ -206,7 +227,7 @@ class Class1PresentationPredictor(object):
experiment_names=experiment_names,
n_flanks=n_flanks,
c_flanks=c_flanks,
verbose=verbose).score.values
verbose=verbose).presentation_score.values
def predict_to_dataframe(
self,
......@@ -215,7 +236,9 @@ class Class1PresentationPredictor(object):
experiment_names=None,
n_flanks=None,
c_flanks=None,
verbose=1):
include_affinity_percentile=False,
verbose=1,
throw=True):
if isinstance(peptides, string_types):
raise TypeError("peptides must be a list not a string")
......@@ -246,17 +269,19 @@ class Class1PresentationPredictor(object):
"experiment1": alleles,
}
df = self.get_affinity_predictions(
df = self.predict_affinity(
peptides=peptides,
experiment_names=experiment_names,
alleles=alleles,
verbose=verbose)
df["affinity_score"] = from_ic50(df.tightest_affinity)
include_affinity_percentile=include_affinity_percentile,
verbose=verbose,
throw=throw)
df["affinity_score"] = from_ic50(df.affinity)
if (n_flanks is None) != (c_flanks is None):
raise ValueError("Specify both or neither of n_flanks, c_flanks")
df["cleavage_prediction"] = self.get_cleavage_predictions(
df["cleavage_prediction"] = self.predict_cleavage(
peptides=df.peptide.values,
n_flanks=n_flanks,
c_flanks=c_flanks,
......@@ -264,7 +289,9 @@ class Class1PresentationPredictor(object):
model_name = 'with_flanks' if n_flanks is not None else "without_flanks"
model = self.get_model(model_name)
df["score"] = model.predict_proba(df[self.model_inputs].values)[:,1]
df["presentation_score"] = model.predict_proba(
df[self.model_inputs].values)[:,1]
del df["affinity_score"]
return df
def save(self, models_dir):
......
......@@ -120,7 +120,7 @@ def get_default_class1_presentation_models_dir(test_exists=True):
raise IOError("No such directory: %s" % result)
return result
return get_path(
"models_class1_pan_refined", "presentation", test_exists=test_exists)
"models_class1_presentation", "models", test_exists=test_exists)
def get_default_class1_cleavage_models_dir(test_exists=True):
......
......@@ -26,16 +26,19 @@ from __future__ import (
division,
absolute_import,
)
import sys
import argparse
import itertools
import logging
import os
import pandas
from .common import set_keras_backend
from .downloads import get_default_class1_models_dir
from .downloads import get_default_class1_models_dir, get_default_class1_presentation_models_dir
from .class1_affinity_predictor import Class1AffinityPredictor
from .class1_presentation_predictor import Class1PresentationPredictor
from .version import __version__
......@@ -79,13 +82,12 @@ input_args.add_argument(
"--alleles",
metavar="ALLELE",
nargs="+",
help="Alleles to predict (exclusive with --input)")
help="Alleles to predict (exclusive with passing an input CSV)")
input_args.add_argument(
"--peptides",
metavar="PEPTIDE",
nargs="+",
help="Peptides to predict (exclusive with --input)")
help="Peptides to predict (exclusive with passing an input CSV)")
input_mod_args = parser.add_argument_group(title="Input options")
input_mod_args.add_argument(
......@@ -98,13 +100,22 @@ input_mod_args.add_argument(
metavar="NAME",
default="peptide",
help="Input column name for peptides. Default: '%(default)s'")
input_mod_args.add_argument(
"--n-flank-column",
metavar="NAME",
default="n_flank",
help="Column giving N-terminal flanking sequence. Default: '%(default)s'")
input_mod_args.add_argument(
"--c-flank-column",
metavar="NAME",
default="c_flank",
help="Column giving C-terminal flanking sequence. Default: '%(default)s'")
input_mod_args.add_argument(
"--no-throw",
action="store_true",
default=False,
help="Return NaNs for unsupported alleles or peptides instead of raising")
output_args = parser.add_argument_group(title="Output options")
output_args.add_argument(
"--out",
......@@ -121,11 +132,16 @@ output_args.add_argument(
default=",",
help="Delimiter character for results. Default: '%(default)s'")
output_args.add_argument(
"--include-individual-model-predictions",
"--no-affinity-percentile",
default=False,
action="store_true",
help="Do not include affinity percentile rank")
output_args.add_argument(
"--always-include-best-allele",
default=False,
help="Include predictions from each model in the ensemble"
)
action="store_true",
help="Always include the best_allele column even when it is identical "
"to the allele column (i.e. all queries are monoallelic).")
model_args = parser.add_argument_group(title="Model options")
model_args.add_argument(
......@@ -134,29 +150,26 @@ model_args.add_argument(
default=None,
help="Directory containing models. "
"Default: %s" % get_default_class1_models_dir(test_exists=False))
implementation_args = parser.add_argument_group(title="Implementation options")
implementation_args.add_argument(
"--backend",
choices=("tensorflow-gpu", "tensorflow-cpu", "tensorflow-default"),
help="Keras backend. If not specified will use system default.")
implementation_args.add_argument(
"--threads",
metavar="N",
type=int,
help="Num threads for tensorflow to use. If unspecified, tensorflow will "
"pick a value based on the number of cores.")
model_args.add_argument(
"--affinity-only",
action="store_true",
default=False,
help="Affinity prediction only (no cleavage or presentation)")
model_args.add_argument(
"--no-flanking",
action="store_true",
default=False,
help="Do not use flanking sequence information even when available")
def run(argv=sys.argv[1:]):
logging.getLogger('tensorflow').disabled = True
if not argv:
parser.print_help()
parser.exit(1)
args = parser.parse_args(argv)
set_keras_backend(backend=args.backend, num_threads=args.threads)
# It's hard to pass a tab in a shell, so we correct a common error:
if args.output_delimiter == "\\t":
args.output_delimiter = "\t"
......@@ -166,12 +179,24 @@ def run(argv=sys.argv[1:]):
# The reason we set the default here instead of in the argument parser
# is that we want to test_exists at this point, so the user gets a
# message instructing them to download the models if needed.
models_dir = get_default_class1_models_dir(test_exists=True)
predictor = Class1AffinityPredictor.load(models_dir)
models_dir = get_default_class1_presentation_models_dir(test_exists=True)
if os.path.exists(os.path.join(models_dir, "weights.csv")):
# Using a presentation predictor.
predictor = Class1PresentationPredictor.load(models_dir)
else:
# Using just an affinity predictor.
affinity_predictor = Class1AffinityPredictor.load(models_dir)
predictor = Class1PresentationPredictor(
affinity_predictor=affinity_predictor)
if not args.affinity_only:
logging.warning(
"Specified models are an affinity predictor, which implies "
"--affinity-only. Specify this argument to silence this warning.")
args.affinity_only = True
# The following two are informative commands that can come
# if a wrapper would like to incorporate input validation
# to not delibaretly make mhcflurry fail
# if a wrapper would like to incorporate input validation.
if args.list_supported_alleles:
print("\n".join(predictor.supported_alleles))
return
......@@ -180,7 +205,6 @@ def run(argv=sys.argv[1:]):
min_len, max_len = predictor.supported_peptide_lengths
print("\n".join([str(l) for l in range(min_len, max_len+1)]))
return
# End of early terminating routines
if args.input:
if args.alleles or args.peptides:
......@@ -200,19 +224,8 @@ def run(argv=sys.argv[1:]):
parser.error(
"Specify either an input CSV file or both the "
"--alleles and --peptides arguments")
# split user specified allele and peptide strings in case they
# contain multiple entries separated by commas
alleles = []
for allele_string in args.alleles:
alleles.extend([s.strip() for s in allele_string.split(",")])
peptides = []
for peptide in args.peptides:
peptides.extend(peptide.strip() for p in peptide.split(","))
for peptide in peptides:
if not peptide.isalpha():
raise ValueError(
"Unexpected character(s) in peptide '%s'" % peptide)
pairs = list(itertools.product(alleles, peptides))
pairs = list(itertools.product(args.alleles, args.peptides))
df = pandas.DataFrame({
"allele": [p[0] for p in pairs],
"peptide": [p[1] for p in pairs],
......@@ -221,15 +234,48 @@ def run(argv=sys.argv[1:]):
"Predicting for %d alleles and %d peptides = %d predictions" % (
len(args.alleles), len(args.peptides), len(df)))
predictions = predictor.predict_to_dataframe(
peptides=df[args.peptide_column].values,
alleles=df[args.allele_column].values,
include_individual_model_predictions=(
args.include_individual_model_predictions),
throw=not args.no_throw)
allele_string_to_alleles = (
df.drop_duplicates(args.allele_column).set_index(
args.allele_column, drop=False)[
args.allele_column
].str.split(r"[,\s]+")).to_dict()
if args.affinity_only:
predictions = predictor.predict_affinity(
peptides=df[args.peptide_column].values,
alleles=allele_string_to_alleles,
experiment_names=df[args.allele_column],
throw=not args.no_throw,
include_affinity_percentile=not args.no_affinity_percentile)
else:
n_flanks = None
c_flanks = None
if not args.no_flanking:
if args.n_flank_column in df.columns and args.c_flank_column in df.columns:
n_flanks = df[args.n_flank_column]
c_flanks = df[args.c_flank_column]
else:
logging.warning(
"No flanking information provided. Specify --no-flanking "
"to silence this warning")
predictions = predictor.predict_to_dataframe(
peptides=df[args.peptide_column].values,
n_flanks=n_flanks,
c_flanks=c_flanks,
alleles=allele_string_to_alleles,
experiment_names=df[args.allele_column],
throw=not args.no_throw,
include_affinity_percentile=not args.no_affinity_percentile)
# If each query is just for a single allele, the "best_allele" column
# is redundant so we remove it.
if not args.always_include_best_allele:
if all(len(a) == 1 for a in allele_string_to_alleles.values()):
del predictions["best_allele"]
for col in predictions.columns:
if col not in ("allele", "peptide"):
if col not in ("allele", "peptide", "experiment_name"):
df[args.prediction_column_prefix + col] = predictions[col]
if args.out:
......
......@@ -63,6 +63,7 @@ def test_no_csv():
print(result)
assert_equal(result.shape, (6, 6))
sub_result1 = result.loc[result.peptide == "SIINFEKL"].set_index("allele")
print(sub_result1)
assert (
sub_result1.loc["H-2-Kb"].mhcflurry1_prediction <
sub_result1.loc["HLA-A0201"].mhcflurry1_prediction)
sub_result1.loc["H-2-Kb"].mhcflurry1_affinity <
sub_result1.loc["HLA-A0201"].mhcflurry1_affinity)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment