''' Run MHCflurry predictor on specified peptides. By default, the presentation predictor is used, and predictions for MHC I binding affinity, antigen processing, and the composite presentation score are returned. If you just want binding affinity predictions, pass --affinity-only. Examples: Write a CSV file containing the contents of INPUT.csv plus additional columns giving MHCflurry predictions: $ mhcflurry-predict INPUT.csv --out RESULT.csv The input CSV file is expected to contain columns "allele", "peptide", and, optionally, "n_flank", and "c_flank". If `--out` is not specified, results are written to stdout. You can also run on alleles and peptides specified on the commandline, in which case predictions are written for *all combinations* of alleles and peptides: $ mhcflurry-predict --alleles HLA-A0201 H-2Kb --peptides SIINFEKL DENDREKLLL Instead of individual alleles (in a CSV or on the command line), you can also give a comma separated list of alleles giving a sample genotype. In this case, the tightest binding affinity across the alleles for the sample will be returned. For example: $ mhcflurry-predict --peptides SIINFEKL DENDREKLLL \ --alleles \ HLA-A*02:01,HLA-A*03:01,HLA-B*57:01,HLA-B*45:01,HLA-C*02:01,HLA-C*07:02 \ HLA-A*01:01,HLA-A*02:06,HLA-B*44:02,HLA-B*07:02,HLA-C*01:01,HLA-C*03:01 will give the tightest predicted affinities across alleles for each of the two genotypes specified for each peptide. ''' from __future__ import ( print_function, division, absolute_import, ) import sys import argparse import itertools import logging import os import pandas from .downloads import get_default_class1_presentation_models_dir from .class1_affinity_predictor import Class1AffinityPredictor from .class1_presentation_predictor import Class1PresentationPredictor from .version import __version__ parser = argparse.ArgumentParser( description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter, add_help=False) helper_args = parser.add_argument_group(title="Help") helper_args.add_argument( "-h", "--help", action="help", help="Show this help message and exit" ) helper_args.add_argument( "--list-supported-alleles", action="store_true", default=False, help="Prints the list of supported alleles and exits" ) helper_args.add_argument( "--list-supported-peptide-lengths", action="store_true", default=False, help="Prints the list of supported peptide lengths and exits" ) helper_args.add_argument( "--version", action="version", version="mhcflurry %s" % __version__, ) input_args = parser.add_argument_group(title="Input (required)") input_args.add_argument( "input", metavar="INPUT.csv", nargs="?", help="Input CSV") input_args.add_argument( "--alleles", metavar="ALLELE", nargs="+", help="Alleles to predict (exclusive with passing an input CSV)") input_args.add_argument( "--peptides", metavar="PEPTIDE", nargs="+", help="Peptides to predict (exclusive with passing an input CSV)") input_mod_args = parser.add_argument_group(title="Input options") input_mod_args.add_argument( "--allele-column", metavar="NAME", default="allele", help="Input column name for alleles. Default: '%(default)s'") input_mod_args.add_argument( "--peptide-column", metavar="NAME", default="peptide", help="Input column name for peptides. Default: '%(default)s'") input_mod_args.add_argument( "--n-flank-column", metavar="NAME", default="n_flank", help="Column giving N-terminal flanking sequence. Default: '%(default)s'") input_mod_args.add_argument( "--c-flank-column", metavar="NAME", default="c_flank", help="Column giving C-terminal flanking sequence. Default: '%(default)s'") input_mod_args.add_argument( "--no-throw", action="store_true", default=False, help="Return NaNs for unsupported alleles or peptides instead of raising") output_args = parser.add_argument_group(title="Output options") output_args.add_argument( "--out", metavar="OUTPUT.csv", help="Output CSV") output_args.add_argument( "--prediction-column-prefix", metavar="NAME", default="mhcflurry_", help="Prefix for output column names. Default: '%(default)s'") output_args.add_argument( "--output-delimiter", metavar="CHAR", default=",", help="Delimiter character for results. Default: '%(default)s'") output_args.add_argument( "--no-affinity-percentile", default=False, action="store_true", help="Do not include affinity percentile rank") output_args.add_argument( "--always-include-best-allele", default=False, action="store_true", help="Always include the best_allele column even when it is identical " "to the allele column (i.e. all queries are monoallelic).") model_args = parser.add_argument_group(title="Model options") model_args.add_argument( "--models", metavar="DIR", default=None, help="Directory containing models. Either a binding affinity predictor or " "a presentation predictor can be used. " "Default: %s" % get_default_class1_presentation_models_dir( test_exists=False)) model_args.add_argument( "--affinity-only", action="store_true", default=False, help="Affinity prediction only (no antigen processing or presentation)") model_args.add_argument( "--no-flanking", action="store_true", default=False, help="Do not use flanking sequence information even when available") def run(argv=sys.argv[1:]): logging.getLogger('tensorflow').disabled = True if not argv: parser.print_help() parser.exit(1) args = parser.parse_args(argv) # It's hard to pass a tab in a shell, so we correct a common error: if args.output_delimiter == "\\t": args.output_delimiter = "\t" models_dir = args.models if models_dir is None: # The reason we set the default here instead of in the argument parser # is that we want to test_exists at this point, so the user gets a # message instructing them to download the models if needed. models_dir = get_default_class1_presentation_models_dir(test_exists=True) if os.path.exists(os.path.join(models_dir, "weights.csv")): # Using a presentation predictor. predictor = Class1PresentationPredictor.load(models_dir) else: # Using just an affinity predictor. affinity_predictor = Class1AffinityPredictor.load(models_dir) predictor = Class1PresentationPredictor( affinity_predictor=affinity_predictor) if not args.affinity_only: logging.warning( "Specified models are an affinity predictor, which implies " "--affinity-only. Specify this argument to silence this warning.") args.affinity_only = True if args.list_supported_alleles: print("\n".join(predictor.supported_alleles)) return if args.list_supported_peptide_lengths: min_len, max_len = predictor.supported_peptide_lengths print("\n".join([str(l) for l in range(min_len, max_len+1)])) return if args.input: if args.alleles or args.peptides: parser.error( "If an input file is specified, do not specify --alleles " "or --peptides") df = pandas.read_csv(args.input) print("Read input CSV with %d rows, columns are: %s" % ( len(df), ", ".join(df.columns))) for col in [args.allele_column, args.peptide_column]: if col not in df.columns: raise ValueError( "No such column '%s' in CSV. Columns are: %s" % ( col, ", ".join(["'%s'" % c for c in df.columns]))) else: if not args.alleles or not args.peptides: parser.error( "Specify either an input CSV file or both the " "--alleles and --peptides arguments") pairs = list(itertools.product(args.alleles, args.peptides)) df = pandas.DataFrame({ "allele": [p[0] for p in pairs], "peptide": [p[1] for p in pairs], }) logging.info( "Predicting for %d alleles and %d peptides = %d predictions" % ( len(args.alleles), len(args.peptides), len(df))) allele_string_to_alleles = ( df.drop_duplicates(args.allele_column).set_index( args.allele_column, drop=False)[ args.allele_column ].str.split(r"[,\s]+")).to_dict() if args.affinity_only: predictions = predictor.predict_affinity( peptides=df[args.peptide_column].values, alleles=allele_string_to_alleles, experiment_names=df[args.allele_column], throw=not args.no_throw, include_affinity_percentile=not args.no_affinity_percentile) else: n_flanks = None c_flanks = None if not args.no_flanking: if args.n_flank_column in df.columns and args.c_flank_column in df.columns: n_flanks = df[args.n_flank_column] c_flanks = df[args.c_flank_column] else: logging.warning( "No flanking information provided. Specify --no-flanking " "to silence this warning") predictions = predictor.predict_to_dataframe( peptides=df[args.peptide_column].values, n_flanks=n_flanks, c_flanks=c_flanks, alleles=allele_string_to_alleles, experiment_names=df[args.allele_column], throw=not args.no_throw, include_affinity_percentile=not args.no_affinity_percentile) # If each query is just for a single allele, the "best_allele" column # is redundant so we remove it. if not args.always_include_best_allele: if all(len(a) == 1 for a in allele_string_to_alleles.values()): del predictions["best_allele"] for col in predictions.columns: if col not in ("allele", "peptide", "experiment_name", "peptide_num"): df[args.prediction_column_prefix + col] = predictions[col] if args.out: df.to_csv(args.out, index=False, sep=args.output_delimiter) print("Wrote: %s" % args.out) else: df.to_csv(sys.stdout, index=False, sep=args.output_delimiter)