From ba4b1cfb16a600fda7c67f647d64726e345d284b Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Sat, 25 Jan 2020 16:38:44 -0500
Subject: [PATCH] Updates pre cleavage -> processing rename

---
 .travis.yml                                | 17 ++-
 mhcflurry/class1_affinity_predictor.py     |   8 +-
 mhcflurry/class1_presentation_predictor.py |  59 ++++++---
 mhcflurry/downloads.py                     |   2 +-
 mhcflurry/predict_command.py               | 142 ++++++++++++++-------
 test/test_predict_command.py               |   5 +-
 6 files changed, 157 insertions(+), 76 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index b4b2f9f5..c0f95d62 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -46,18 +46,21 @@ script:
       $(mhcflurry-downloads url data_curated)
       $(mhcflurry-downloads url data_mass_spec_annotated)
       $(mhcflurry-downloads url models_class1)
+      $(mhcflurry-downloads url models_class1_presentation)
       $(mhcflurry-downloads url models_class1_pan)
+      $(mhcflurry-downloads url models_class1_pan_variants)
       $(mhcflurry-downloads url allele_sequences)
       -P /tmp/downloads
   - ls -lh /tmp/downloads
   - mhcflurry-downloads fetch
-    data_curated
-    data_mass_spec_annotated
-    models_class1
-    models_class1_pan
-    models_class1_pan_variants
-    allele_sequences
-    --already-downloaded-dir /tmp/downloads
+      data_curated
+      data_mass_spec_annotated
+      models_class1
+      models_class1_presentation
+      models_class1_pan
+      models_class1_pan_variants
+      allele_sequences
+      --already-downloaded-dir /tmp/downloads
   - mhcflurry-downloads info # just to test this command works
   - nosetests --with-timer -sv test
diff --git a/mhcflurry/class1_affinity_predictor.py b/mhcflurry/class1_affinity_predictor.py
index 6b16ef07..9a3258cf 100644
--- a/mhcflurry/class1_affinity_predictor.py
+++ b/mhcflurry/class1_affinity_predictor.py
@@ -892,11 +892,15 @@ class Class1AffinityPredictor(object):
             numpy.array of float
         """
         if allele is not None:
+            normalized_allele = mhcnames.normalize_allele_name(allele)
             try:
-                transform = self.allele_to_percent_rank_transform[allele]
+                transform = self.allele_to_percent_rank_transform[normalized_allele]
                 return transform.transform(affinities)
             except KeyError:
-                msg = "Allele %s has no percentile rank information" % allele
+                msg = "Allele %s has no percentile rank information" % (
+                    allele + (
+                        "" if allele == normalized_allele
+                        else " (normalized to %s)" % normalized_allele))
                 if throw:
                     raise ValueError(msg)
                 warnings.warn(msg)
diff --git a/mhcflurry/class1_presentation_predictor.py b/mhcflurry/class1_presentation_predictor.py
index e1181a15..a7430cb4 100644
--- a/mhcflurry/class1_presentation_predictor.py
+++ b/mhcflurry/class1_presentation_predictor.py
@@ -58,9 +58,22 @@ class Class1PresentationPredictor(object):
             dict(metadata_dataframes) if metadata_dataframes else {})
         self._models_cache = {}
 
-    def get_affinity_predictions(
-            self, peptides, experiment_names, alleles, verbose=1):
+    @property
+    def supported_alleles(self):
+        return self.affinity_predictor.supported_alleles
+    @property
+    def supported_peptide_lengths(self):
+        return self.affinity_predictor.supported_peptide_lengths
+
+    def predict_affinity(
+            self,
+            peptides,
+            experiment_names,
+            alleles,
+            include_affinity_percentile=False,
+            verbose=1,
+            throw=True):
         df = pandas.DataFrame({
             "peptide": numpy.array(peptides, copy=False),
             "experiment_name": numpy.array(experiment_names, copy=False),
@@ -80,17 +93,25 @@ class Class1PresentationPredictor(object):
             predictions_df[allele] = self.affinity_predictor.predict(
                 peptides=experiment_peptides,
                 allele=allele,
-                model_kwargs={'batch_size': PREDICT_BATCH_SIZE})
+                model_kwargs={'batch_size': PREDICT_BATCH_SIZE},
+                throw=throw)
             df.loc[
-                sub_df.index, "tightest_affinity"
+                sub_df.index, "affinity"
             ] = predictions_df.min(1).values
             df.loc[
-                sub_df.index, "tightest_affinity_allele"
+                sub_df.index, "best_allele"
             ] = predictions_df.idxmin(1).values
+            if include_affinity_percentile:
+                df.loc[sub_df.index, "affinity_percentile"] = (
+                    self.affinity_predictor.percentile_ranks(
+                        df.loc[sub_df.index, "affinity"].values,
+                        alleles=df.loc[sub_df.index, "best_allele"].values,
+                        throw=False))
+
         return df
 
-    def get_cleavage_predictions(
+    def predict_cleavage(
             self, peptides, n_flanks=None, c_flanks=None, verbose=1):
 
         if verbose > 0:
@@ -128,12 +149,12 @@ class Class1PresentationPredictor(object):
             c_flanks=None,
             verbose=1):
 
-        df = self.get_affinity_predictions(
+        df = self.predict_affinity(
             peptides=peptides,
             experiment_names=experiment_names,
             alleles=alleles,
             verbose=verbose)
-        df["affinity_score"] = from_ic50(df.tightest_affinity)
+        df["affinity_score"] = from_ic50(df.affinity)
         df["target"] = numpy.array(targets, copy=False)
 
         if (n_flanks is None) != (c_flanks is None):
@@ -157,7 +178,7 @@ class Class1PresentationPredictor(object):
             if verbose > 0:
                 print("Training variant", model_name)
 
-            df["cleavage_prediction"] = self.get_cleavage_predictions(
+            df["cleavage_prediction"] = self.predict_cleavage(
                 peptides=df.peptide.values,
                 n_flanks=n_flanks if with_flanks else None,
                 c_flanks=c_flanks if with_flanks else None,
@@ -206,7 +227,7 @@ class Class1PresentationPredictor(object):
             experiment_names=experiment_names,
             n_flanks=n_flanks,
             c_flanks=c_flanks,
-            verbose=verbose).score.values
+            verbose=verbose).presentation_score.values
 
     def predict_to_dataframe(
             self,
@@ -215,7 +236,9 @@ class Class1PresentationPredictor(object):
             experiment_names=None,
             n_flanks=None,
             c_flanks=None,
-            verbose=1):
+            include_affinity_percentile=False,
+            verbose=1,
+            throw=True):
 
         if isinstance(peptides, string_types):
             raise TypeError("peptides must be a list not a string")
@@ -246,17 +269,19 @@ class Class1PresentationPredictor(object):
                 "experiment1": alleles,
             }
 
-        df = self.get_affinity_predictions(
+        df = self.predict_affinity(
             peptides=peptides,
             experiment_names=experiment_names,
             alleles=alleles,
-            verbose=verbose)
-        df["affinity_score"] = from_ic50(df.tightest_affinity)
+            include_affinity_percentile=include_affinity_percentile,
+            verbose=verbose,
+            throw=throw)
+        df["affinity_score"] = from_ic50(df.affinity)
 
         if (n_flanks is None) != (c_flanks is None):
             raise ValueError("Specify both or neither of n_flanks, c_flanks")
 
-        df["cleavage_prediction"] = self.get_cleavage_predictions(
+        df["cleavage_prediction"] = self.predict_cleavage(
             peptides=df.peptide.values,
             n_flanks=n_flanks,
             c_flanks=c_flanks,
@@ -264,7 +289,9 @@ class Class1PresentationPredictor(object):
 
         model_name = 'with_flanks' if n_flanks is not None else "without_flanks"
         model = self.get_model(model_name)
-        df["score"] = model.predict_proba(df[self.model_inputs].values)[:,1]
+        df["presentation_score"] = model.predict_proba(
+            df[self.model_inputs].values)[:,1]
+        del df["affinity_score"]
         return df
 
     def save(self, models_dir):
diff --git a/mhcflurry/downloads.py b/mhcflurry/downloads.py
index abeedadb..3f8522ec 100644
--- a/mhcflurry/downloads.py
+++ b/mhcflurry/downloads.py
@@ -120,7 +120,7 @@ def get_default_class1_presentation_models_dir(test_exists=True):
             raise IOError("No such directory: %s" % result)
         return result
     return get_path(
-        "models_class1_pan_refined", "presentation", test_exists=test_exists)
+        "models_class1_presentation", "models", test_exists=test_exists)
 
 
 def get_default_class1_cleavage_models_dir(test_exists=True):
diff --git a/mhcflurry/predict_command.py b/mhcflurry/predict_command.py
index 17a687f5..facbe0d7 100644
--- a/mhcflurry/predict_command.py
+++ b/mhcflurry/predict_command.py
@@ -26,16 +26,19 @@ from __future__ import (
     division,
     absolute_import,
 )
+
 import sys
 import argparse
 import itertools
 import logging
+import os
 
 import pandas
 
 from .common import set_keras_backend
-from .downloads import get_default_class1_models_dir
+from .downloads import get_default_class1_models_dir, get_default_class1_presentation_models_dir
 from .class1_affinity_predictor import Class1AffinityPredictor
+from .class1_presentation_predictor import Class1PresentationPredictor
 from .version import __version__
@@ -79,13 +82,12 @@ input_args.add_argument(
     "--alleles",
     metavar="ALLELE",
     nargs="+",
-    help="Alleles to predict (exclusive with --input)")
+    help="Alleles to predict (exclusive with passing an input CSV)")
 input_args.add_argument(
     "--peptides",
     metavar="PEPTIDE",
     nargs="+",
-    help="Peptides to predict (exclusive with --input)")
-
+    help="Peptides to predict (exclusive with passing an input CSV)")
 
 input_mod_args = parser.add_argument_group(title="Input options")
 input_mod_args.add_argument(
@@ -98,13 +100,22 @@ input_mod_args.add_argument(
     metavar="NAME",
     default="peptide",
     help="Input column name for peptides. Default: '%(default)s'")
+input_mod_args.add_argument(
+    "--n-flank-column",
+    metavar="NAME",
+    default="n_flank",
+    help="Column giving N-terminal flanking sequence. Default: '%(default)s'")
+input_mod_args.add_argument(
+    "--c-flank-column",
+    metavar="NAME",
+    default="c_flank",
+    help="Column giving C-terminal flanking sequence. Default: '%(default)s'")
 input_mod_args.add_argument(
     "--no-throw",
     action="store_true",
     default=False,
     help="Return NaNs for unsupported alleles or peptides instead of raising")
-
 
 output_args = parser.add_argument_group(title="Output options")
 output_args.add_argument(
     "--out",
@@ -121,11 +132,16 @@ output_args.add_argument(
     default=",",
     help="Delimiter character for results. Default: '%(default)s'")
 output_args.add_argument(
-    "--include-individual-model-predictions",
+    "--no-affinity-percentile",
+    default=False,
     action="store_true",
+    help="Do not include affinity percentile rank")
+output_args.add_argument(
+    "--always-include-best-allele",
     default=False,
-    help="Include predictions from each model in the ensemble"
-)
+    action="store_true",
+    help="Always include the best_allele column even when it is identical "
+    "to the allele column (i.e. all queries are monoallelic).")
 
 model_args = parser.add_argument_group(title="Model options")
 model_args.add_argument(
@@ -134,29 +150,26 @@ model_args.add_argument(
     default=None,
     help="Directory containing models. "
    "Default: %s" % get_default_class1_models_dir(test_exists=False))
-
-implementation_args = parser.add_argument_group(title="Implementation options")
-implementation_args.add_argument(
-    "--backend",
-    choices=("tensorflow-gpu", "tensorflow-cpu", "tensorflow-default"),
-    help="Keras backend. If not specified will use system default.")
-implementation_args.add_argument(
-    "--threads",
-    metavar="N",
-    type=int,
-    help="Num threads for tensorflow to use. If unspecified, tensorflow will "
-    "pick a value based on the number of cores.")
-
+model_args.add_argument(
+    "--affinity-only",
+    action="store_true",
+    default=False,
+    help="Affinity prediction only (no cleavage or presentation)")
+model_args.add_argument(
+    "--no-flanking",
+    action="store_true",
+    default=False,
+    help="Do not use flanking sequence information even when available")
 
 
 def run(argv=sys.argv[1:]):
+    logging.getLogger('tensorflow').disabled = True
+
     if not argv:
         parser.print_help()
         parser.exit(1)
     args = parser.parse_args(argv)
 
-    set_keras_backend(backend=args.backend, num_threads=args.threads)
-
     # It's hard to pass a tab in a shell, so we correct a common error:
     if args.output_delimiter == "\\t":
         args.output_delimiter = "\t"
@@ -166,12 +179,24 @@ def run(argv=sys.argv[1:]):
         # The reason we set the default here instead of in the argument parser
         # is that we want to test_exists at this point, so the user gets a
         # message instructing them to download the models if needed.
-        models_dir = get_default_class1_models_dir(test_exists=True)
-        predictor = Class1AffinityPredictor.load(models_dir)
+        models_dir = get_default_class1_presentation_models_dir(test_exists=True)
+
+    if os.path.exists(os.path.join(models_dir, "weights.csv")):
+        # Using a presentation predictor.
+        predictor = Class1PresentationPredictor.load(models_dir)
+    else:
+        # Using just an affinity predictor.
+        affinity_predictor = Class1AffinityPredictor.load(models_dir)
+        predictor = Class1PresentationPredictor(
+            affinity_predictor=affinity_predictor)
+        if not args.affinity_only:
+            logging.warning(
+                "Specified models are an affinity predictor, which implies "
+                "--affinity-only. Specify this argument to silence this warning.")
+            args.affinity_only = True
 
     # The following two are informative commands that can come
-    # if a wrapper would like to incorporate input validation
-    # to not delibaretly make mhcflurry fail
+    # if a wrapper would like to incorporate input validation.
     if args.list_supported_alleles:
         print("\n".join(predictor.supported_alleles))
         return
@@ -180,7 +205,6 @@ def run(argv=sys.argv[1:]):
         min_len, max_len = predictor.supported_peptide_lengths
         print("\n".join([str(l) for l in range(min_len, max_len+1)]))
         return
-    # End of early terminating routines
 
     if args.input:
         if args.alleles or args.peptides:
@@ -200,19 +224,8 @@ def run(argv=sys.argv[1:]):
             parser.error(
                 "Specify either an input CSV file or both the "
                 "--alleles and --peptides arguments")
-        # split user specified allele and peptide strings in case they
-        # contain multiple entries separated by commas
-        alleles = []
-        for allele_string in args.alleles:
-            alleles.extend([s.strip() for s in allele_string.split(",")])
-        peptides = []
-        for peptide in args.peptides:
-            peptides.extend(peptide.strip() for p in peptide.split(","))
-        for peptide in peptides:
-            if not peptide.isalpha():
-                raise ValueError(
-                    "Unexpected character(s) in peptide '%s'" % peptide)
-        pairs = list(itertools.product(alleles, peptides))
+
+        pairs = list(itertools.product(args.alleles, args.peptides))
         df = pandas.DataFrame({
             "allele": [p[0] for p in pairs],
             "peptide": [p[1] for p in pairs],
@@ -221,15 +234,48 @@ def run(argv=sys.argv[1:]):
             "Predicting for %d alleles and %d peptides = %d predictions" % (
                 len(args.alleles), len(args.peptides), len(df)))
 
-    predictions = predictor.predict_to_dataframe(
-        peptides=df[args.peptide_column].values,
-        alleles=df[args.allele_column].values,
-        include_individual_model_predictions=(
-            args.include_individual_model_predictions),
-        throw=not args.no_throw)
+    allele_string_to_alleles = (
+        df.drop_duplicates(args.allele_column).set_index(
+            args.allele_column, drop=False)[
+            args.allele_column
+        ].str.split(r"[,\s]+")).to_dict()
+
+    if args.affinity_only:
+        predictions = predictor.predict_affinity(
+            peptides=df[args.peptide_column].values,
+            alleles=allele_string_to_alleles,
+            experiment_names=df[args.allele_column],
+            throw=not args.no_throw,
+            include_affinity_percentile=not args.no_affinity_percentile)
+    else:
+        n_flanks = None
+        c_flanks = None
+        if not args.no_flanking:
+            if args.n_flank_column in df.columns and args.c_flank_column in df.columns:
+                n_flanks = df[args.n_flank_column]
+                c_flanks = df[args.c_flank_column]
+            else:
+                logging.warning(
+                    "No flanking information provided. Specify --no-flanking "
+                    "to silence this warning")
+
+        predictions = predictor.predict_to_dataframe(
+            peptides=df[args.peptide_column].values,
+            n_flanks=n_flanks,
+            c_flanks=c_flanks,
+            alleles=allele_string_to_alleles,
+            experiment_names=df[args.allele_column],
+            throw=not args.no_throw,
+            include_affinity_percentile=not args.no_affinity_percentile)
+
+    # If each query is just for a single allele, the "best_allele" column
+    # is redundant so we remove it.
+    if not args.always_include_best_allele:
+        if all(len(a) == 1 for a in allele_string_to_alleles.values()):
+            del predictions["best_allele"]
 
     for col in predictions.columns:
-        if col not in ("allele", "peptide"):
+        if col not in ("allele", "peptide", "experiment_name"):
            df[args.prediction_column_prefix + col] = predictions[col]
 
     if args.out:
diff --git a/test/test_predict_command.py b/test/test_predict_command.py
index c3f0a5c1..a4fbdc51 100644
--- a/test/test_predict_command.py
+++ b/test/test_predict_command.py
@@ -63,6 +63,7 @@ def test_no_csv():
     print(result)
     assert_equal(result.shape, (6, 6))
     sub_result1 = result.loc[result.peptide == "SIINFEKL"].set_index("allele")
+    print(sub_result1)
     assert (
-        sub_result1.loc["H-2-Kb"].mhcflurry1_prediction <
-        sub_result1.loc["HLA-A0201"].mhcflurry1_prediction)
+        sub_result1.loc["H-2-Kb"].mhcflurry1_affinity <
+        sub_result1.loc["HLA-A0201"].mhcflurry1_affinity)
--
GitLab
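For reference, a minimal sketch of how the renamed Python API introduced by this patch might be called. The class, loader, parameter names, and output column names below come from the patch itself; the peptides, flanking sequences, and allele names are illustrative placeholders only, and passing a plain allele list (rather than a per-experiment dict) follows the default handling shown in predict_to_dataframe.

# Sketch only: exercises the renamed Class1PresentationPredictor methods from
# this patch. Peptides, flanks, and alleles are made-up example inputs.
from mhcflurry.class1_presentation_predictor import Class1PresentationPredictor
from mhcflurry.downloads import get_default_class1_presentation_models_dir

# Load the presentation models from the default download location.
predictor = Class1PresentationPredictor.load(
    get_default_class1_presentation_models_dir())

df = predictor.predict_to_dataframe(
    peptides=["SIINFEKL", "SYFPEITHI"],   # example peptides
    alleles=["HLA-A0201", "H-2-Kb"],      # candidate alleles for all peptides
    n_flanks=["AAA", "CCC"],              # per the patch: give both flanks or neither
    c_flanks=["TTT", "GGG"],
    include_affinity_percentile=True,
    throw=False)

# Per the renames in this patch, the result includes "affinity", "best_allele",
# "affinity_percentile", and "presentation_score" columns.
print(df)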