diff --git a/mhcflurry/class1_presentation_predictor.py b/mhcflurry/class1_presentation_predictor.py index 6bbeb08de509319ad81b4d0f29cf124ecbfdf3ff..124ec6218243cd137719072aae9490627b4c8f8b 100644 --- a/mhcflurry/class1_presentation_predictor.py +++ b/mhcflurry/class1_presentation_predictor.py @@ -113,6 +113,7 @@ class Class1PresentationPredictor(object): df = pandas.DataFrame({ "peptide": numpy.array(peptides, copy=False), }) + df["peptide_num"] = df.index if experiment_names is None: peptides = EncodableSequences.create(peptides) all_alleles = set() @@ -141,9 +142,13 @@ class Class1PresentationPredictor(object): new_df["affinity"] = predictions_df[ experiment_alleles ].min(1).values - new_df["best_allele"] = predictions_df[ - experiment_alleles - ].idxmin(1).values + if len(df) == 0: + new_df["best_allele"] = [] + else: + new_df["best_allele"] = predictions_df[ + experiment_alleles + ].idxmin(1).values + dfs.append(new_df) result_df = pandas.concat(dfs, ignore_index=True) else: @@ -177,8 +182,8 @@ class Class1PresentationPredictor(object): if include_affinity_percentile: result_df["affinity_percentile"] = ( self.affinity_predictor.percentile_ranks( - df.affinity.values, - alleles=df.best_alleles.values, + result_df.affinity.values, + alleles=result_df.best_allele.values, throw=False)) return result_df @@ -215,6 +220,9 @@ class Class1PresentationPredictor(object): raise ValueError("No processing predictor with flanks") predictor = self.processing_predictor_with_flanks + if len(peptides) == 0: + return numpy.array([], dtype=float) + num_chunks = int(numpy.ceil(float(len(peptides)) / PREDICT_CHUNK_SIZE)) peptide_chunks = numpy.array_split(peptides, num_chunks) n_flank_chunks = numpy.array_split(n_flanks, num_chunks) @@ -384,6 +392,13 @@ class Class1PresentationPredictor(object): Presentation scores for each peptide. Scores range from 0 to 1, with higher values indicating more favorable presentation likelihood. """ + if isinstance(alleles, dict): + if experiment_names is None: + raise ValueError( + "experiment_names must be supplied when alleles is a dict. " + "Alternatively, call predict_to_dataframe to predict over " + "all experiments") + return self.predict_to_dataframe( peptides=peptides, alleles=alleles, @@ -416,15 +431,17 @@ class Class1PresentationPredictor(object): ---------- peptides : list of string Peptide sequences - alleles : list of string or string -> string dict + alleles : list of string or string -> list of string dict If you are predicting for a single sample, pass a list of strings (up to 6) indicating the genotype. If you are predicting across multiple samples, pass a dict where the keys are (arbitrary) experiment names and the values are the alleles to predict for that sample. experiment_names : list of string [same length as peptides] - If you are passing a dict for 'alleles', use this argument to - specify which peptides go with which experiments. + If you are passing a dict for 'alleles', you can use this argument to + specify which peptides go with which experiments. If it is None, + then predictions will be performed for each peptide across all + experiments. n_flanks : list of string [same length as peptides] Upstream sequences before the peptide. Sequences of any length can be given and a suffix of the size supported by the model will be @@ -453,14 +470,12 @@ class Class1PresentationPredictor(object): if isinstance(alleles, string_types): raise TypeError("alleles must be a list or dict") - if isinstance(alleles, dict): - if experiment_names is None: - raise ValueError( - "experiment_names must be supplied when alleles is a dict") - else: + if not isinstance(alleles, dict): + # Make alleles into a dict. if experiment_names is not None: raise ValueError( "alleles must be a dict when experiment_names is specified") + alleles = numpy.array(alleles, copy=False) if len(alleles) > MAX_ALLELES_PER_SAMPLE: raise ValueError( @@ -472,7 +487,6 @@ class Class1PresentationPredictor(object): "is expected to be the same length as peptides." % MAX_ALLELES_PER_SAMPLE) - experiment_names = ["experiment1"] * len(peptides) alleles = { "experiment1": alleles, } @@ -488,22 +502,27 @@ class Class1PresentationPredictor(object): df = self.predict_affinity( peptides=peptides, - experiment_names=experiment_names, alleles=alleles, + experiment_names=experiment_names, # might be None include_affinity_percentile=include_affinity_percentile, verbose=verbose, throw=throw) + df["affinity_score"] = from_ic50(df.affinity) - df["processing_score"] = processing_scores + df["processing_score"] = df.peptide_num.map( + pandas.Series(processing_scores)) if c_flanks is not None: - df.insert(1, "c_flank", c_flanks) + df.insert(1, "c_flank", df.peptide_num.map(pandas.Series(c_flanks))) if n_flanks is not None: - df.insert(1, "n_flank", n_flanks) + df.insert(1, "n_flank", df.peptide_num.map(pandas.Series(n_flanks))) model_name = 'with_flanks' if n_flanks is not None else "without_flanks" model = self.get_model(model_name) - df["presentation_score"] = model.predict_proba( - df[self.model_inputs].values)[:,1] + if len(df) > 0: + df["presentation_score"] = model.predict_proba( + df[self.model_inputs].values)[:,1] + else: + df["presentation_score"] = [] del df["affinity_score"] return df @@ -514,9 +533,9 @@ class Class1PresentationPredictor(object): result="best", comparison_quantity="presentation_score", filter_value=None, - peptide_lengths=[8, 9, 10, 11], + peptide_lengths=(8, 9, 10, 11), use_flanks=True, - include_affinity_percentile=False, + include_affinity_percentile=True, verbose=1, throw=True): """ @@ -527,22 +546,21 @@ class Class1PresentationPredictor(object): sequences : str, list of string, or string -> string dict Protein sequences. If a dict is given, the keys are arbitrary ( e.g. protein names), and the values are the amino acid sequences. - alleles : str, list of string, list of list of string, or string -> string dict + alleles : list of string, list of list of string, or dict of string -> list of string MHC I alleles. Can be: (1) a string (a single allele), (2) a list of strings (a single genotype), (3) a list of list of strings (multiple genotypes, where the total number of genotypes must equal - the number of sequences), or (4) a dict (in which case the keys must - match the sequences dict keys). + the number of sequences), or (4) a dict giving multiple genotypes, + which will each be run over the sequences. result : string - One of: - - "best": return the strongest peptide for each sequence - - "all": return predictions for all peptides - - "filtered": return predictions where comparison_quantity is - stronger (i.e (<) for affinity, (>) for scores) than filter_value. + Specify 'best' to return the strongest peptide for each sequence, + 'all' to return predictions for all peptides, or 'filtered' to + return predictions where the comparison_quantity is stronger + (i.e (<) for affinity, (>) for scores) than filter_value. comparison_quantity : string - One of "presentation_score", "processing_score", or "affinity". - Quantity to use to rank (if result is "best") or filter (if result - is "filtered") results. + One of "presentation_score", "processing_score", "affinity", or + "affinity_percentile". Prediction to use to rank (if result is + "best") or filter (if result is "filtered") results. filter_value : float Threshold value to use, only relevant when result is "filtered". If comparison_quantity is "affinity", then all results less than @@ -589,42 +607,62 @@ class Class1PresentationPredictor(object): ("sequence_%04d" % (i + 1), sequence) for (i, sequence) in enumerate(sequences)) + cross_product = True if isinstance(alleles, string_types): + # Case (1) - alleles is a string alleles = [alleles] - if not isinstance(alleles, dict): - if all([isinstance(item, string_types) for item in alleles]): - alleles = dict((name, alleles) for name in sequences.keys()) - elif len(alleles) != len(sequences): + if isinstance(alleles, dict): + if any([isinstance(v, string_types) for v in alleles.values()]): raise ValueError( - "alleles must be (1) a string (a single allele), (2) a list of " - "strings (a single genotype), (3) a list of list of strings (" - "(multiple genotypes, where the total number of genotypes " - "must equal the number of sequences), or (4) a dict (in which " - "case the keys must match the sequences dict keys). Here " - "it seemed like option (3) was being used, but the length " - "of alleles (%d) did not match the length of sequences (%d)." - % (len(alleles), len(sequences))) + "The values in the alleles dict must be lists, not strings") + else: + if all(isinstance(a, string_types) for a in alleles): + # Case (2) - a simple list of alleles + alleles = { + 'genotype': alleles + } else: - alleles = dict(zip(sequences.keys(), alleles)) + # Case (3) - a list of lists + alleles = collections.OrderedDict( + ("genotype_%04d" % (i + 1), genotype) + for (i, genotype) in enumerate(alleles)) + cross_product = False + + if len(alleles) != len(sequences): + raise ValueError( + "When passing a list of lists for the alleles argument " + "the length of the list (%d) must match the length of " + "the sequences being predicted (%d)" % ( + len(alleles), len(sequences))) - missing = [key for key in sequences if key not in alleles] - if missing: - raise ValueError( - "Sequence names missing from alleles dict: ", missing) + if not isinstance(alleles, dict): + raise ValueError("Invalid type for alleles: ", type(alleles)) + + experiment_names = None if cross_product else [] + genotype_names = list(alleles) + position_in_sequence = [] + for (i, (name, sequence)) in enumerate(sequences.items()): + genotype_name = None if cross_product else genotype_names[i] - for (name, sequence) in sequences.items(): if not isinstance(sequence, string_types): raise ValueError("Expected string, not %s (%s)" % ( sequence, type(sequence))) - for peptide_start in range(len(sequence) - min(peptide_lengths)): + for peptide_start in range(len(sequence) - min(peptide_lengths) + 1): n_flank_start = max(0, peptide_start - n_flank_length) for peptide_length in peptide_lengths: + peptide = sequence[ + peptide_start: peptide_start + peptide_length + ] + if len(peptide) != peptide_length: + continue c_flank_end = ( peptide_start + peptide_length + c_flank_length) sequence_names.append(name) - peptides.append( - sequence[peptide_start: peptide_start + peptide_length]) + position_in_sequence.append(peptide_start) + if not cross_product: + experiment_names.append(genotype_name) + peptides.append(peptide) if use_flanks: n_flanks.append( sequence[n_flank_start : peptide_start]) @@ -636,13 +674,20 @@ class Class1PresentationPredictor(object): alleles=alleles, n_flanks=n_flanks, c_flanks=c_flanks, - experiment_names=sequence_names, + experiment_names=experiment_names, include_affinity_percentile=include_affinity_percentile, verbose=verbose, throw=throw) - result_df = result_df.rename( - columns={"experiment_name": "sequence_name"}) + result_df.insert( + 0, + "sequence_name", + result_df.peptide_num.map(pandas.Series(sequence_names))) + result_df.insert( + 1, + "position_in_sequence", + result_df.peptide_num.map(pandas.Series(position_in_sequence))) + del result_df["peptide_num"] comparison_is_score = comparison_quantity.endswith("score") @@ -652,7 +697,8 @@ class Class1PresentationPredictor(object): if result == "best": result_df = result_df.drop_duplicates( - "sequence_name", keep="first").sort_values("sequence_name") + ["sequence_name", "experiment_name"], keep="first" + ).sort_values("sequence_name") elif result == "filtered": if comparison_is_score: result_df = result_df.loc[ diff --git a/mhcflurry/fasta.py b/mhcflurry/fasta.py new file mode 100644 index 0000000000000000000000000000000000000000..88b87092b04a0d10bca73a90cce6a88f2e07aeea --- /dev/null +++ b/mhcflurry/fasta.py @@ -0,0 +1,120 @@ +""" +Adapted from pyensembl, github.com/openvax/pyensembl +Original implementation by Alex Rubinsteyn. + +The worse sin in bioinformatics is to write your own FASTA parser. +We're doing this to avoid adding another dependency to MHCflurry, however. +""" + +from __future__ import print_function, division, absolute_import + +from gzip import GzipFile +import logging + +from six import binary_type, PY3 + +import pandas + + +def read_fasta_to_dataframe(filename): + reader = FastaParser() + rows = reader.iterate_over_file(filename) + return pandas.DataFrame( + rows, + columns=["sequence_id", "sequence"]) + +class FastaParser(object): + """ + FastaParser object consumes lines of a FASTA file incrementally. + """ + def __init__(self): + self.current_id = None + self.current_lines = [] + + def iterate_over_file(self, fasta_path): + """ + Generator that yields identifiers paired with sequences. + """ + with self.open_file(fasta_path) as f: + for line in f: + line = line.rstrip() + + if len(line) == 0: + continue + + # have to slice into a bytes object or else get a single integer + first_char = line[0:1] + + if first_char == b">": + previous_entry = self._current_entry() + self.current_id = self._parse_header_id(line) + + if len(self.current_id) == 0: + logging.warning( + "Unable to parse ID from header line: %s", line) + + self.current_lines = [] + + if previous_entry is not None: + yield previous_entry + + elif first_char == b";": + # semicolon are comment characters + continue + else: + self.current_lines.append(line) + + # the last sequence is still in the lines buffer after we're done with + # the file so make sure to yield it + id_and_seq = self._current_entry() + if id_and_seq is not None: + yield id_and_seq + + def _current_entry(self): + # when we hit a new entry, if this isn't the first + # entry of the file then put the last one in the dictionary + if self.current_id: + if len(self.current_lines) == 0: + logging.warning("No sequence data for '%s'", self.current_id) + else: + sequence = b"".join(self.current_lines) + if PY3: + # only decoding into an ASCII str for Python 3 since + # the binary sequence type for Python 2 is already 'str' + # and the unicode representation is inefficient + # (using either 16 or 32 bits per character depends on build) + sequence = sequence.decode("ascii") + return self.current_id, sequence + + @staticmethod + def open_file(fasta_path): + """ + Open either a text file or compressed gzip file as a stream of bytes. + """ + if fasta_path.endswith("gz") or fasta_path.endswith("gzip"): + return GzipFile(fasta_path, 'rb') + else: + return open(fasta_path, 'rb') + + @staticmethod + def _parse_header_id(line): + """ + Pull the transcript or protein identifier from the header line + which starts with '>' + """ + if type(line) is not binary_type: + raise TypeError("Expected header line to be of type %s but got %s" % ( + binary_type, type(line))) + + if len(line) <= 1: + raise ValueError("No identifier on FASTA line") + + # split line at first space to get the unique identifier for + # this sequence + space_index = line.find(b" ") + if space_index >= 0: + identifier = line[1:space_index] + else: + identifier = line[1:] + + return identifier.decode("ascii") \ No newline at end of file diff --git a/mhcflurry/predict_scan_command.py b/mhcflurry/predict_scan_command.py new file mode 100644 index 0000000000000000000000000000000000000000..f535e6c43f5bdff485b46baaeabbf85b8e461132 --- /dev/null +++ b/mhcflurry/predict_scan_command.py @@ -0,0 +1,328 @@ +''' +Scan protein sequences using the MHCflurry presentation predictor. + +By default, subsequences with affinity percentile ranks less than 2.0 are +returned. You can also specify --results-all to return predictions for all +subsequences, or --results-best to return the top subsequence for each sequence. + +Examples: + +Scan a set of sequences in a FASTA file for binders to any alleles in a MHC I +genotype: + + mhcflurry-predict-scan \ + test/data/example.fasta \ + --alleles HLA-A*02:01,HLA-A*03:01,HLA-B*57:01,HLA-B*45:01,HLA-C*02:01,HLA-C*07:02 + +Instead of a FASTA, you can also pass a CSV that has "sequence_id" and "sequence" +columns. + +You can also specify multiple MHC I genotypes to scan: + + mhcflurry-predict-scan \ + test/data/example.fasta \ + --alleles \ + HLA-A*02:01,HLA-A*03:01,HLA-B*57:01,HLA-B*45:01,HLA-C*02:01,HLA-C*07:02 \ + HLA-A*01:01,HLA-A*02:06,HLA-B*68:01,HLA-B*07:02,HLA-C*01:01,HLA-C*03:01 + +If `--out` is not specified, results are written to standard out. + +You can also run on sequences specified on the commandline: + +mhcflurry-predict-scan \ + --sequences MGYINVFAFPFTIYSLLLCRMNSRNYIAQVDVVNFNLT \ + --alleles HLA-A0201 H-2Kb +''' +from __future__ import ( + print_function, + division, + absolute_import, +) + +import sys +import argparse +import itertools +import logging +import os + +import pandas + +from .downloads import get_default_class1_presentation_models_dir +from .class1_affinity_predictor import Class1AffinityPredictor +from .class1_presentation_predictor import Class1PresentationPredictor +from .fasta import read_fasta_to_dataframe +from .version import __version__ + + +parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + add_help=False) + + +helper_args = parser.add_argument_group(title="Help") +helper_args.add_argument( + "-h", "--help", + action="help", + help="Show this help message and exit" +) +helper_args.add_argument( + "--list-supported-alleles", + action="store_true", + default=False, + help="Prints the list of supported alleles and exits" +) +helper_args.add_argument( + "--list-supported-peptide-lengths", + action="store_true", + default=False, + help="Prints the list of supported peptide lengths and exits" +) +helper_args.add_argument( + "--version", + action="version", + version="mhcflurry %s" % __version__, +) + +input_args = parser.add_argument_group(title="Input options") +input_args.add_argument( + "input", + metavar="INPUT.csv", + nargs="?", + help="Input CSV or FASTA") +input_args.add_argument( + "--input-format", + choices=("guess", "csv", "fasta"), + default="guess", + help="Format of input file. By default, it is guessed from the file " + "extension.") +input_args.add_argument( + "--alleles", + metavar="ALLELE", + nargs="+", + help="Alleles to predict") +input_args.add_argument( + "--sequences", + metavar="SEQ", + nargs="+", + help="Sequences to predict (exclusive with passing an input file)") +input_args.add_argument( + "--sequence-id-column", + metavar="NAME", + default="sequence_id", + help="Input CSV column name for sequence IDs. Default: '%(default)s'") +input_args.add_argument( + "--sequence-column", + metavar="NAME", + default="sequence", + help="Input CSV column name for sequences. Default: '%(default)s'") +input_args.add_argument( + "--no-throw", + action="store_true", + default=False, + help="Return NaNs for unsupported alleles or peptides instead of raising") + +results_args = parser.add_argument_group(title="Result options") +results_args.add_argument( + "--peptide-lengths", + type=int, + nargs="+", + default=[8, 9, 10, 11], + help="Peptide lengths to consider. Default: %(default)s.") +comparison_quantities = [ + "presentation_score", + "processing_score", + "affinity", + "affinity_percentile", +] +results_args.add_argument( + "--results-all", + action="store_true", + default=False, + help="") +results_args.add_argument( + "--results-best", + choices=comparison_quantities, + help="Take the top result for each sequence according to the specified " + "predicted quantity") +results_args.add_argument( + "--results-filtered", + choices=comparison_quantities, + help="Filter results by the specified quantity.") +results_args.add_argument( + "--threshold-presentation-score", + type=float, + default=0.7, + help="Threshold if filtering by presentation score. Default: %(default)s") +results_args.add_argument( + "--threshold-processing-score", + type=float, + default=0.5, + help="Threshold if filtering by processing score. Default: %(default)s") +results_args.add_argument( + "--threshold-affinity", + type=float, + default=500, + help="Threshold if filtering by affinity. Default: %(default)s") +results_args.add_argument( + "--threshold-affinity-percentile", + type=float, + default=2.0, + help="Threshold if filtering by affinity percentile. Default: %(default)s") + + +output_args = parser.add_argument_group(title="Output options") +output_args.add_argument( + "--out", + metavar="OUTPUT.csv", + help="Output CSV") +output_args.add_argument( + "--output-delimiter", + metavar="CHAR", + default=",", + help="Delimiter character for results. Default: '%(default)s'") +output_args.add_argument( + "--no-affinity-percentile", + default=False, + action="store_true", + help="Do not include affinity percentile rank") + +model_args = parser.add_argument_group(title="Model options") +model_args.add_argument( + "--models", + metavar="DIR", + default=None, + help="Directory containing presentation models." + "Default: %s" % get_default_class1_presentation_models_dir( + test_exists=False)) +model_args.add_argument( + "--no-flanking", + action="store_true", + default=False, + help="Do not use flanking sequence information in predictions") + + +def run(argv=sys.argv[1:]): + logging.getLogger('tensorflow').disabled = True + + if not argv: + parser.print_help() + parser.exit(1) + + args = parser.parse_args(argv) + + # It's hard to pass a tab in a shell, so we correct a common error: + if args.output_delimiter == "\\t": + args.output_delimiter = "\t" + + result_args = { + "all": args.results_all, + "best": args.results_best, + "filtered": args.results_filtered, + } + if all([not bool(arg) for arg in result_args.values()]): + result_args["filtered"] = "affinity_percentile" + + if sum([bool(arg) for arg in result_args.values()]) > 1: + parser.error( + "Specify at most one of --results-all, --results-best, " + "--results-filtered") + + (result,) = [key for (key, value) in result_args.items() if value] + result_comparison_quantity = result_args[result] + result_filter_value = None if result != "filtered" else { + "presentation_score": args.threshold_presentation_score, + "processing_score": args.threshold_processing_score, + "affinity": args.threshold_affinity, + "affinity_percentile": args.threshold_affinity_percentile, + }[result_comparison_quantity] + + models_dir = args.models + if models_dir is None: + # The reason we set the default here instead of in the argument parser + # is that we want to test_exists at this point, so the user gets a + # message instructing them to download the models if needed. + models_dir = get_default_class1_presentation_models_dir(test_exists=True) + + predictor = Class1PresentationPredictor.load(models_dir) + + if args.list_supported_alleles: + print("\n".join(predictor.supported_alleles)) + return + + if args.list_supported_peptide_lengths: + min_len, max_len = predictor.supported_peptide_lengths + print("\n".join([str(l) for l in range(min_len, max_len+1)])) + return + + if args.input: + if args.sequences: + parser.error( + "If an input file is specified, do not specify --sequences") + + input_format = args.input_format + if input_format == "guess": + extension = args.input.lower().split(".")[-1] + if extension in ["gz", "bzip2"]: + extension = args.input.lower().split(".")[-2] + + if extension == "csv": + input_format = "csv" + elif extension in ["fasta", "fa"]: + input_format = "fasta" + else: + parser.error( + "Couldn't guess input format from file extension: %s\n" + "Pass the --input-format argument to specify if it is a " + "CSV or fasta file" % args.input) + print("Guessed input file format:", input_format) + + if input_format == "csv": + df = pandas.read_csv(args.input) + print("Read input CSV with %d rows, columns are: %s" % ( + len(df), ", ".join(df.columns))) + for col in [args.sequence_column,]: + if col not in df.columns: + raise ValueError( + "No such column '%s' in CSV. Columns are: %s" % ( + col, ", ".join(["'%s'" % c for c in df.columns]))) + + elif input_format == "fasta": + df = read_fasta_to_dataframe(args.input) + print("Read input fasta with %d sequences" % len(df)) + print(df) + else: + raise ValueError("Unsupported input format", input_format) + else: + if not args.sequences: + parser.error( + "Specify either an input file or the --sequences argument") + + df = pandas.DataFrame({ + args.sequence_column: args.sequences, + }) + + if args.sequence_id_column not in df: + df[args.sequence_id_column] = "sequence_" + df.index.astype(str) + + df = df.set_index(args.sequence_id_column) + + genotypes = pandas.Series(args.alleles).str.split(r"[,\s]+") + genotypes.index = genotypes.index.map(lambda i: "genotype_%02d" % i) + + result_df = predictor.predict_sequences( + sequences=df[args.sequence_column].to_dict(), + alleles=genotypes.to_dict(), + result=result, + comparison_quantity=result_comparison_quantity, + filter_value=result_filter_value, + peptide_lengths=args.peptide_lengths, + use_flanks=not args.no_flanking, + include_affinity_percentile=not args.no_affinity_percentile, + throw=not args.no_throw) + + if args.out: + result_df.to_csv(args.out, index=False, sep=args.output_delimiter) + print("Wrote: %s" % args.out) + else: + result_df.to_csv(sys.stdout, index=False, sep=args.output_delimiter) diff --git a/setup.py b/setup.py index c2dbed429600e8ec0767a9181c6a983c67b573b7..aaaf760380d6bca5521825b79f86189d6ce9c935 100644 --- a/setup.py +++ b/setup.py @@ -73,6 +73,7 @@ if __name__ == '__main__': 'console_scripts': [ 'mhcflurry-downloads = mhcflurry.downloads_command:run', 'mhcflurry-predict = mhcflurry.predict_command:run', + 'mhcflurry-predict-scan = mhcflurry.predict_scan_command:run', 'mhcflurry-class1-train-allele-specific-models = ' 'mhcflurry.train_allele_specific_models_command:run', 'mhcflurry-class1-train-pan-allele-models = ' diff --git a/test/data/example.fasta b/test/data/example.fasta new file mode 100644 index 0000000000000000000000000000000000000000..56b0ab3274e7fe359e16bff3c06b2eb2b7e54d09 --- /dev/null +++ b/test/data/example.fasta @@ -0,0 +1,26 @@ +>QHN73810.1 surface glycoprotein [Severe acute respiratory syndrome coronavirus 2] +MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHV +SGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPF +LGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPI +NLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYN +ENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASV +YAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIAD +YNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYF +PLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFL +PFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQDVNCTEVPVAIHADQLT +PTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLG +AENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGI +AVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDC +LGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIG +VTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDI +LSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLM +SFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNT +FVSGNCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVA +KNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDD +SEPVLKGVKLHYT +>protein1 +MDSKGSSQKGSRLLLLLVVSNLLLCQGVVSTPVCPNGPGNCQV +EMFNEFDKRYAQGKGFITMALNSCHTSSLPTPEDKEQAQQTHH +>protein2 +VTEVRGMKGAPDAILSRAIEIEEENKRLLEGMEMIFGQVIPGA +ARYSAFYNLLHCLRRDSSKIDTYLKLLNCRIIYNNNC diff --git a/test/test_class1_presentation_predictor.py b/test/test_class1_presentation_predictor.py index 242f60875341995ec3c9e82e5382ec4a87ab47bd..b0ab1612664f948e7a7cab7d983daa164729f304 100644 --- a/test/test_class1_presentation_predictor.py +++ b/test/test_class1_presentation_predictor.py @@ -132,6 +132,87 @@ def test_basic(): numpy.testing.assert_array_almost_equal( test_df["prediction2"], other_test_df["prediction2"], decimal=6) + +def test_downloaded_predictor_small(): + global PRESENTATION_PREDICTOR + + # Test sequence scanning + scan_results = PRESENTATION_PREDICTOR.predict_sequences( + sequences=[ + "MESLVPGFN", + "QPYVFIKRS", + "AGGHSYGAD", + ], + alleles={ + "HLA-A*02:01": ["HLA-A*02:01"], + "HLA-C*02:01": ["HLA-C*02:01"], + }, + peptide_lengths=[9], + result="best") + print(scan_results) + assert_equal(len(scan_results), 6) + + scan_results = PRESENTATION_PREDICTOR.predict_sequences( + sequences=[ + "MESLVPGFN", + "QPYVFIKRS", + "AGGHSYGAD", + ], + alleles={ + "HLA-A*02:01": ["HLA-A*02:01"], + "HLA-C*02:01": ["HLA-C*02:01"], + }, + peptide_lengths=[8, 9], + result="best") + print(scan_results) + assert_equal(len(scan_results), 6) + + scan_results = PRESENTATION_PREDICTOR.predict_sequences( + sequences=[ + "MESLVPGFN", + "QPYVFIKRS", + "AGGHSYGAD", + ], + alleles={ + "HLA-A*02:01": ["HLA-A*02:01"], + "HLA-C*02:01": ["HLA-C*02:01"], + }, + peptide_lengths=[9], + result="all") + print(scan_results) + assert_equal(len(scan_results), 6) + + scan_results = PRESENTATION_PREDICTOR.predict_sequences( + sequences=[ + "MESLVPGFN", + "QPYVFIKRS", + "AGGHSYGAD", + ], + alleles={ + "HLA-A*02:01": ["HLA-A*02:01"], + "HLA-C*02:01": ["HLA-C*02:01"], + }, + peptide_lengths=[8, 9], + result="all") + print(scan_results) + assert_equal(len(scan_results), 18) + + scan_results = PRESENTATION_PREDICTOR.predict_sequences( + sequences=[ + "MESLVPGFN", + "QPYVFIKRS", + "AGGHSYGAD", + ], + alleles={ + "HLA-A*02:01": ["HLA-A*02:01"], + "HLA-C*02:01": ["HLA-C*02:01"], + }, + peptide_lengths=[10], + result="all") + print(scan_results) + assert_equal(len(scan_results), 0) + + def test_downloaded_predictor(): global PRESENTATION_PREDICTOR @@ -220,3 +301,32 @@ def test_downloaded_predictor(): assert len(scan_results4) > 200, len(scan_results4) assert_less(scan_results4.iloc[0].affinity, 100) + + scan_results5 = PRESENTATION_PREDICTOR.predict_sequences( + result="all", + comparison_quantity="affinity", + sequences={ + "seq1": "MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHLKDGTCGLVEVEKGVLPQLE", + "seq2": "QPYVFIKRSDARTAPHGHVMVELVAELEGIQYGRSGETLGVLVPHVGEIPVAYRKVLLRKNGNKG", + "seq3": "AGGHSYGADLKSFDLGDELGTDPYEDFQENWNTKHSSGVTRELMRELNGGAYTRYVDNNFCGPDG", + }, + alleles={ + "sample1": [ + "HLA-A*02:01", + "HLA-A*03:01", + "HLA-B*57:01", + "HLA-B*44:02", + "HLA-C*02:01", + "HLA-C*07:01", + ], + "sample2": [ + "HLA-A*01:01", + "HLA-A*02:06", + "HLA-B*07:02", + "HLA-B*44:02", + "HLA-C*03:01", + "HLA-C*07:02", + ], + }) + print(scan_results5) + assert_equal(len(scan_results5), len(scan_results4) * 2)