''' Scan protein sequences using the MHCflurry presentation predictor. By default, sub-sequences (peptides) with affinity percentile ranks less than 2.0 are returned. You can also specify --results-all to return predictions for all peptides, or --results-best to return the top peptide for each sequence. Examples: Scan a set of sequences in a FASTA file for binders to any alleles in a MHC I genotype: $ mhcflurry-predict-scan \ test/data/example.fasta \ --alleles HLA-A*02:01,HLA-A*03:01,HLA-B*57:01,HLA-B*45:01,HLA-C*02:01,HLA-C*07:02 Instead of a FASTA, you can also pass a CSV that has "sequence_id" and "sequence" columns. You can also specify multiple MHC I genotypes to scan as space-separated arguments to the --alleles option: $ mhcflurry-predict-scan \ test/data/example.fasta \ --alleles \ HLA-A*02:01,HLA-A*03:01,HLA-B*57:01,HLA-B*45:01,HLA-C*02:02,HLA-C*07:02 \ HLA-A*01:01,HLA-A*02:06,HLA-B*44:02,HLA-B*07:02,HLA-C*01:02,HLA-C*03:01 If `--out` is not specified, results are written to standard out. You can also specify sequences on the commandline: mhcflurry-predict-scan \ --sequences MGYINVFAFPFTIYSLLLCRMNSRNYIAQVDVVNFNLT \ --alleles HLA-A*02:01,HLA-A*03:01,HLA-B*57:01,HLA-B*45:01,HLA-C*02:02,HLA-C*07:02 ''' from __future__ import ( print_function, division, absolute_import, ) import sys import argparse import itertools import logging import os import pandas from .downloads import get_default_class1_presentation_models_dir from .class1_presentation_predictor import Class1PresentationPredictor from .fasta import read_fasta_to_dataframe from .version import __version__ parser = argparse.ArgumentParser( description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter, add_help=False) helper_args = parser.add_argument_group(title="Help") helper_args.add_argument( "-h", "--help", action="help", help="Show this help message and exit" ) helper_args.add_argument( "--list-supported-alleles", action="store_true", default=False, help="Print the list of supported alleles and exits" ) helper_args.add_argument( "--list-supported-peptide-lengths", action="store_true", default=False, help="Print the list of supported peptide lengths and exits" ) helper_args.add_argument( "--version", action="version", version="mhcflurry %s" % __version__, ) input_args = parser.add_argument_group(title="Input options") input_args.add_argument( "input", metavar="INPUT", nargs="?", help="Input CSV or FASTA") input_args.add_argument( "--input-format", choices=("guess", "csv", "fasta"), default="guess", help="Format of input file. By default, it is guessed from the file " "extension.") input_args.add_argument( "--alleles", metavar="ALLELE", nargs="+", help="Alleles to predict") input_args.add_argument( "--sequences", metavar="SEQ", nargs="+", help="Sequences to predict (exclusive with passing an input file)") input_args.add_argument( "--sequence-id-column", metavar="NAME", default="sequence_id", help="Input CSV column name for sequence IDs. Default: '%(default)s'") input_args.add_argument( "--sequence-column", metavar="NAME", default="sequence", help="Input CSV column name for sequences. Default: '%(default)s'") input_args.add_argument( "--no-throw", action="store_true", default=False, help="Return NaNs for unsupported alleles or peptides instead of raising") results_args = parser.add_argument_group(title="Result options") results_args.add_argument( "--peptide-lengths", type=int, nargs="+", default=[8, 9, 10, 11], help="Peptide lengths to consider. Default: %(default)s.") comparison_quantities = [ "presentation_score", "processing_score", "affinity", "affinity_percentile", ] results_args.add_argument( "--results-all", action="store_true", default=False, help="Return results for all peptides regardless of affinity, etc.") results_args.add_argument( "--results-best", choices=comparison_quantities, help="Take the top result for each sequence according to the specified " "predicted quantity") results_args.add_argument( "--results-filtered", choices=comparison_quantities, help="Filter results by the specified quantity.") results_args.add_argument( "--threshold-presentation-score", type=float, default=0.7, help="Threshold if filtering by presentation score. Default: %(default)s") results_args.add_argument( "--threshold-processing-score", type=float, default=0.5, help="Threshold if filtering by processing score. Default: %(default)s") results_args.add_argument( "--threshold-affinity", type=float, default=500, help="Threshold if filtering by affinity. Default: %(default)s") results_args.add_argument( "--threshold-affinity-percentile", type=float, default=2.0, help="Threshold if filtering by affinity percentile. Default: %(default)s") output_args = parser.add_argument_group(title="Output options") output_args.add_argument( "--out", metavar="OUTPUT.csv", help="Output CSV") output_args.add_argument( "--output-delimiter", metavar="CHAR", default=",", help="Delimiter character for results. Default: '%(default)s'") output_args.add_argument( "--no-affinity-percentile", default=False, action="store_true", help="Do not include affinity percentile rank") model_args = parser.add_argument_group(title="Model options") model_args.add_argument( "--models", metavar="DIR", default=None, help="Directory containing presentation models." "Default: %s" % get_default_class1_presentation_models_dir( test_exists=False)) model_args.add_argument( "--no-flanking", action="store_true", default=False, help="Do not use flanking sequence information in predictions") def run(argv=sys.argv[1:]): logging.getLogger('tensorflow').disabled = True if not argv: parser.print_help() parser.exit(1) args = parser.parse_args(argv) # It's hard to pass a tab in a shell, so we correct a common error: if args.output_delimiter == "\\t": args.output_delimiter = "\t" result_args = { "all": args.results_all, "best": args.results_best, "filtered": args.results_filtered, } if all([not bool(arg) for arg in result_args.values()]): result_args["filtered"] = "affinity_percentile" if sum([bool(arg) for arg in result_args.values()]) > 1: parser.error( "Specify at most one of --results-all, --results-best, " "--results-filtered") (result,) = [key for (key, value) in result_args.items() if value] result_comparison_quantity = ( None if result == "all" else result_args[result]) result_filter_value = None if result != "filtered" else { "presentation_score": args.threshold_presentation_score, "processing_score": args.threshold_processing_score, "affinity": args.threshold_affinity, "affinity_percentile": args.threshold_affinity_percentile, }[result_comparison_quantity] models_dir = args.models if models_dir is None: # The reason we set the default here instead of in the argument parser # is that we want to test_exists at this point, so the user gets a # message instructing them to download the models if needed. models_dir = get_default_class1_presentation_models_dir(test_exists=True) predictor = Class1PresentationPredictor.load(models_dir) if args.list_supported_alleles: print("\n".join(predictor.supported_alleles)) return if args.list_supported_peptide_lengths: min_len, max_len = predictor.supported_peptide_lengths print("\n".join([str(l) for l in range(min_len, max_len+1)])) return if args.input: if args.sequences: parser.error( "If an input file is specified, do not specify --sequences") input_format = args.input_format if input_format == "guess": extension = args.input.lower().split(".")[-1] if extension in ["gz", "bzip2"]: extension = args.input.lower().split(".")[-2] if extension == "csv": input_format = "csv" elif extension in ["fasta", "fa"]: input_format = "fasta" else: parser.error( "Couldn't guess input format from file extension: %s\n" "Pass the --input-format argument to specify if it is a " "CSV or fasta file" % args.input) print("Guessed input file format:", input_format) if input_format == "csv": df = pandas.read_csv(args.input) print("Read input CSV with %d rows, columns are: %s" % ( len(df), ", ".join(df.columns))) for col in [args.sequence_column,]: if col not in df.columns: raise ValueError( "No such column '%s' in CSV. Columns are: %s" % ( col, ", ".join(["'%s'" % c for c in df.columns]))) elif input_format == "fasta": df = read_fasta_to_dataframe(args.input) print("Read input fasta with %d sequences" % len(df)) print(df) else: raise ValueError("Unsupported input format", input_format) else: if not args.sequences: parser.error( "Specify either an input file or the --sequences argument") df = pandas.DataFrame({ args.sequence_column: args.sequences, }) if args.sequence_id_column not in df: df[args.sequence_id_column] = "sequence_" + df.index.astype(str) df = df.set_index(args.sequence_id_column) genotypes = pandas.Series(args.alleles).str.split(r"[,\s]+") genotypes.index = genotypes.index.map(lambda i: "genotype_%02d" % i) result_df = predictor.predict_sequences( sequences=df[args.sequence_column].to_dict(), alleles=genotypes.to_dict(), result=result, comparison_quantity=result_comparison_quantity, filter_value=result_filter_value, peptide_lengths=args.peptide_lengths, use_flanks=not args.no_flanking, include_affinity_percentile=not args.no_affinity_percentile, throw=not args.no_throw) if args.out: result_df.to_csv(args.out, index=False, sep=args.output_delimiter) print("Wrote: %s" % args.out) else: result_df.to_csv(sys.stdout, index=False, sep=args.output_delimiter)