Sequence scanning in presentation predictor

5e100c1f · Tim O'Donnell · d05e772e · 5e100c1f · 5e100c1f · 5e100c1f
Commit 5e100c1f authored 5 years ago by Tim O'Donnell
--- a/mhcflurry/class1_presentation_predictor.py
+++ b/mhcflurry/class1_presentation_predictor.py
@@ -113,6 +113,7 @@ class Class1PresentationPredictor(object):
        df = pandas.DataFrame({
            "peptide": numpy.array(peptides, copy=False),
        })
+        df["peptide_num"] = df.index
        if experiment_names is None:
            peptides = EncodableSequences.create(peptides)
            all_alleles = set()
@@ -141,9 +142,13 @@ class Class1PresentationPredictor(object):
                new_df["affinity"] = predictions_df[
                    experiment_alleles
                ].min(1).values
-                new_df["best_allele"] = predictions_df[
-                    experiment_alleles
-                ].idxmin(1).values
+                if len(df) == 0:
+                    new_df["best_allele"] = []
+                else:
+                    new_df["best_allele"] = predictions_df[
+                        experiment_alleles
+                    ].idxmin(1).values
+                dfs.append(new_df)

            result_df = pandas.concat(dfs, ignore_index=True)
        else:
@@ -177,8 +182,8 @@ class Class1PresentationPredictor(object):
        if include_affinity_percentile:
            result_df["affinity_percentile"] = (
                self.affinity_predictor.percentile_ranks(
-                    df.affinity.values,
-                    alleles=df.best_alleles.values,
+                    result_df.affinity.values,
+                    alleles=result_df.best_allele.values,
                    throw=False))

        return result_df
@@ -215,6 +220,9 @@ class Class1PresentationPredictor(object):
                raise ValueError("No processing predictor with flanks")
            predictor = self.processing_predictor_with_flanks

+        if len(peptides) == 0:
+            return numpy.array([], dtype=float)
+
        num_chunks = int(numpy.ceil(float(len(peptides)) / PREDICT_CHUNK_SIZE))
        peptide_chunks = numpy.array_split(peptides, num_chunks)
        n_flank_chunks = numpy.array_split(n_flanks, num_chunks)
@@ -384,6 +392,13 @@ class Class1PresentationPredictor(object):
        Presentation scores for each peptide. Scores range from 0 to 1, with
        higher values indicating more favorable presentation likelihood.
        """
+        if isinstance(alleles, dict):
+            if experiment_names is None:
+                raise ValueError(
+                    "experiment_names must be supplied when alleles is a dict. "
+                    "Alternatively, call predict_to_dataframe to predict over "
+                    "all experiments")
+
        return self.predict_to_dataframe(
            peptides=peptides,
            alleles=alleles,
@@ -416,15 +431,17 @@ class Class1PresentationPredictor(object):
        ----------
        peptides : list of string
            Peptide sequences
-        alleles : list of string or string -> string dict
+        alleles : list of string or string -> list of string dict
            If you are predicting for a single sample, pass a list of strings
            (up to 6) indicating the genotype. If you are predicting across
            multiple samples, pass a dict where the keys are (arbitrary)
            experiment names and the values are the alleles to predict for that
            sample.
        experiment_names : list of string [same length as peptides]
-            If you are passing a dict for 'alleles', use this argument to
-            specify which peptides go with which experiments.
+            If you are passing a dict for 'alleles', you can use this argument to
+            specify which peptides go with which experiments. If it is None,
+            then predictions will be performed for each peptide across all
+            experiments.
        n_flanks : list of string [same length as peptides]
            Upstream sequences before the peptide. Sequences of any length can
            be given and a suffix of the size supported by the model will be
@@ -453,14 +470,12 @@ class Class1PresentationPredictor(object):
        if isinstance(alleles, string_types):
            raise TypeError("alleles must be a list or dict")

-        if isinstance(alleles, dict):
-            if experiment_names is None:
-                raise ValueError(
-                    "experiment_names must be supplied when alleles is a dict")
-        else:
+        if not isinstance(alleles, dict):
+            # Make alleles into a dict.
            if experiment_names is not None:
                raise ValueError(
                    "alleles must be a dict when experiment_names is specified")
+
            alleles = numpy.array(alleles, copy=False)
            if len(alleles) > MAX_ALLELES_PER_SAMPLE:
                raise ValueError(
@@ -472,7 +487,6 @@ class Class1PresentationPredictor(object):
                    "is expected to be the same length as peptides."
                    % MAX_ALLELES_PER_SAMPLE)

-            experiment_names = ["experiment1"] * len(peptides)
            alleles = {
                "experiment1": alleles,
            }
@@ -488,22 +502,27 @@ class Class1PresentationPredictor(object):

        df = self.predict_affinity(
            peptides=peptides,
-            experiment_names=experiment_names,
            alleles=alleles,
+            experiment_names=experiment_names,  # might be None
            include_affinity_percentile=include_affinity_percentile,
            verbose=verbose,
            throw=throw)
+
        df["affinity_score"] = from_ic50(df.affinity)
-        df["processing_score"] = processing_scores
+        df["processing_score"] = df.peptide_num.map(
+            pandas.Series(processing_scores))
        if c_flanks is not None:
-            df.insert(1, "c_flank", c_flanks)
+            df.insert(1, "c_flank", df.peptide_num.map(pandas.Series(c_flanks)))
        if n_flanks is not None:
-            df.insert(1, "n_flank", n_flanks)
+            df.insert(1, "n_flank", df.peptide_num.map(pandas.Series(n_flanks)))

        model_name = 'with_flanks' if n_flanks is not None else "without_flanks"
        model = self.get_model(model_name)
-        df["presentation_score"] = model.predict_proba(
-            df[self.model_inputs].values)[:,1]
+        if len(df) > 0:
+            df["presentation_score"] = model.predict_proba(
+                df[self.model_inputs].values)[:,1]
+        else:
+            df["presentation_score"] = []
        del df["affinity_score"]
        return df

@@ -514,9 +533,9 @@ class Class1PresentationPredictor(object):
            result="best",
            comparison_quantity="presentation_score",
            filter_value=None,
-            peptide_lengths=[8, 9, 10, 11],
+            peptide_lengths=(8, 9, 10, 11),
            use_flanks=True,
-            include_affinity_percentile=False,
+            include_affinity_percentile=True,
            verbose=1,
            throw=True):
        """
@@ -527,22 +546,21 @@ class Class1PresentationPredictor(object):
        sequences : str, list of string, or string -> string dict
            Protein sequences. If a dict is given, the keys are arbitrary (
            e.g. protein names), and the values are the amino acid sequences.
-        alleles : str, list of string, list of list of string, or string -> string dict
+        alleles : list of string, list of list of string, or dict of string -> list of string
            MHC I alleles. Can be: (1) a string (a single allele), (2) a list of
            strings (a single genotype), (3) a list of list of strings
            (multiple genotypes, where the total number of genotypes must equal
-            the number of sequences), or (4) a dict (in which case the keys must
-            match the sequences dict keys).
+            the number of sequences), or (4) a dict giving multiple genotypes,
+            which will each be run over the sequences.
        result : string
-            One of:
-             - "best": return the strongest peptide for each sequence
-             - "all": return predictions for all peptides
-             - "filtered": return predictions where comparison_quantity is
-             stronger (i.e (<) for affinity, (>) for scores) than filter_value.
+            Specify 'best' to return the strongest peptide for each sequence,
+            'all' to return predictions for all peptides, or 'filtered' to
+            return predictions where the comparison_quantity is stronger
+            (i.e (<) for affinity, (>) for scores) than filter_value.
        comparison_quantity : string
-            One of "presentation_score", "processing_score", or "affinity".
-            Quantity to use to rank (if result is "best") or filter (if result
-            is "filtered") results.
+            One of "presentation_score", "processing_score", "affinity", or
+            "affinity_percentile". Prediction to use to rank (if result is
+            "best") or filter (if result is "filtered") results.
        filter_value : float
            Threshold value to use, only relevant when result is "filtered".
            If comparison_quantity is "affinity", then all results less than
@@ -589,42 +607,62 @@ class Class1PresentationPredictor(object):
                ("sequence_%04d" % (i + 1), sequence)
                for (i, sequence) in enumerate(sequences))

+        cross_product = True
        if isinstance(alleles, string_types):
+            # Case (1) - alleles is a string
            alleles = [alleles]

-        if not isinstance(alleles, dict):
-            if all([isinstance(item, string_types) for item in alleles]):
-                alleles = dict((name, alleles) for name in sequences.keys())
-            elif len(alleles) != len(sequences):
+        if isinstance(alleles, dict):
+            if any([isinstance(v, string_types) for v in alleles.values()]):
                raise ValueError(
-                    "alleles must be (1) a string (a single allele), (2) a list of "
-                    "strings (a single genotype), (3) a list of list of strings ("
-                    "(multiple genotypes, where the total number of genotypes "
-                    "must equal the number of sequences), or (4) a dict (in which "
-                    "case the keys must match the sequences dict keys). Here "
-                    "it seemed like option (3) was being used, but the length "
-                    "of alleles (%d) did not match the length of sequences (%d)."
-                    % (len(alleles), len(sequences)))
+                    "The values in the alleles dict must be lists, not strings")
+        else:
+            if all(isinstance(a, string_types) for a in alleles):
+                # Case (2) - a simple list of alleles
+                alleles = {
+                    'genotype': alleles
+                }
            else:
-                alleles = dict(zip(sequences.keys(), alleles))
+                # Case (3) - a list of lists
+                alleles = collections.OrderedDict(
+                    ("genotype_%04d" % (i + 1), genotype)
+                    for (i, genotype) in enumerate(alleles))
+                cross_product = False
+
+                if len(alleles) != len(sequences):
+                    raise ValueError(
+                        "When passing a list of lists for the alleles argument "
+                        "the length of the list (%d) must match the length of "
+                        "the sequences being predicted (%d)" % (
+                            len(alleles), len(sequences)))

-        missing = [key for key in sequences if key not in alleles]
-        if missing:
-            raise ValueError(
-                "Sequence names missing from alleles dict: ", missing)
+        if not isinstance(alleles, dict):
+            raise ValueError("Invalid type for alleles: ", type(alleles))
+
+        experiment_names = None if cross_product else []
+        genotype_names = list(alleles)
+        position_in_sequence = []
+        for (i, (name, sequence)) in enumerate(sequences.items()):
+            genotype_name = None if cross_product else genotype_names[i]

-        for (name, sequence) in sequences.items():
            if not isinstance(sequence, string_types):
                raise ValueError("Expected string, not %s (%s)" % (
                    sequence, type(sequence)))
-            for peptide_start in range(len(sequence) - min(peptide_lengths)):
+            for peptide_start in range(len(sequence) - min(peptide_lengths) + 1):
                n_flank_start = max(0, peptide_start - n_flank_length)
                for peptide_length in peptide_lengths:
+                    peptide = sequence[
+                        peptide_start: peptide_start + peptide_length
+                    ]
+                    if len(peptide) != peptide_length:
+                        continue
                    c_flank_end = (
                        peptide_start + peptide_length + c_flank_length)
                    sequence_names.append(name)
-                    peptides.append(
-                        sequence[peptide_start: peptide_start + peptide_length])
+                    position_in_sequence.append(peptide_start)
+                    if not cross_product:
+                        experiment_names.append(genotype_name)
+                    peptides.append(peptide)
                    if use_flanks:
                        n_flanks.append(
                            sequence[n_flank_start : peptide_start])
@@ -636,13 +674,20 @@ class Class1PresentationPredictor(object):
            alleles=alleles,
            n_flanks=n_flanks,
            c_flanks=c_flanks,
-            experiment_names=sequence_names,
+            experiment_names=experiment_names,
            include_affinity_percentile=include_affinity_percentile,
            verbose=verbose,
            throw=throw)

-        result_df = result_df.rename(
-            columns={"experiment_name": "sequence_name"})
+        result_df.insert(
+            0,
+            "sequence_name",
+            result_df.peptide_num.map(pandas.Series(sequence_names)))
+        result_df.insert(
+            1,
+            "position_in_sequence",
+            result_df.peptide_num.map(pandas.Series(position_in_sequence)))
+        del result_df["peptide_num"]

        comparison_is_score = comparison_quantity.endswith("score")

@@ -652,7 +697,8 @@ class Class1PresentationPredictor(object):

        if result == "best":
            result_df = result_df.drop_duplicates(
-                "sequence_name", keep="first").sort_values("sequence_name")
+                ["sequence_name", "experiment_name"], keep="first"
+            ).sort_values("sequence_name")
        elif result == "filtered":
            if comparison_is_score:
                result_df = result_df.loc[

--- a/mhcflurry/fasta.py
+++ b/mhcflurry/fasta.py
+"""
+Adapted from pyensembl, github.com/openvax/pyensembl
+Original implementation by Alex Rubinsteyn.
+
+The worse sin in bioinformatics is to write your own FASTA parser.
+We're doing this to avoid adding another dependency to MHCflurry, however.
+"""
+
+from __future__ import print_function, division, absolute_import
+
+from gzip import GzipFile
+import logging
+
+from six import binary_type, PY3
+
+import pandas
+
+
+def read_fasta_to_dataframe(filename):
+    reader = FastaParser()
+    rows = reader.iterate_over_file(filename)
+    return pandas.DataFrame(
+        rows,
+        columns=["sequence_id", "sequence"])
+
+class FastaParser(object):
+    """
+    FastaParser object consumes lines of a FASTA file incrementally.
+    """
+    def __init__(self):
+        self.current_id = None
+        self.current_lines = []
+
+    def iterate_over_file(self, fasta_path):
+        """
+        Generator that yields identifiers paired with sequences.
+        """
+        with self.open_file(fasta_path) as f:
+            for line in f:
+                line = line.rstrip()
+
+                if len(line) == 0:
+                    continue
+
+                # have to slice into a bytes object or else get a single integer
+                first_char = line[0:1]
+
+                if first_char == b">":
+                    previous_entry = self._current_entry()
+                    self.current_id = self._parse_header_id(line)
+
+                    if len(self.current_id) == 0:
+                        logging.warning(
+                            "Unable to parse ID from header line: %s", line)
+
+                    self.current_lines = []
+
+                    if previous_entry is not None:
+                        yield previous_entry
+
+                elif first_char == b";":
+                    # semicolon are comment characters
+                    continue
+                else:
+                    self.current_lines.append(line)
+
+        # the last sequence is still in the lines buffer after we're done with
+        # the file so make sure to yield it
+        id_and_seq = self._current_entry()
+        if id_and_seq is not None:
+            yield id_and_seq
+
+    def _current_entry(self):
+        # when we hit a new entry, if this isn't the first
+        # entry of the file then put the last one in the dictionary
+        if self.current_id:
+            if len(self.current_lines) == 0:
+                logging.warning("No sequence data for '%s'", self.current_id)
+            else:
+                sequence = b"".join(self.current_lines)
+                if PY3:
+                    # only decoding into an ASCII str for Python 3 since
+                    # the binary sequence type for Python 2 is already 'str'
+                    # and the unicode representation is inefficient
+                    # (using either 16 or 32 bits per character depends on build)
+                    sequence = sequence.decode("ascii")
+                return self.current_id, sequence
+
+    @staticmethod
+    def open_file(fasta_path):
+        """
+        Open either a text file or compressed gzip file as a stream of bytes.
+        """
+        if fasta_path.endswith("gz") or fasta_path.endswith("gzip"):
+            return GzipFile(fasta_path, 'rb')
+        else:
+            return open(fasta_path, 'rb')
+
+    @staticmethod
+    def _parse_header_id(line):
+        """
+        Pull the transcript or protein identifier from the header line
+        which starts with '>'
+        """
+        if type(line) is not binary_type:
+            raise TypeError("Expected header line to be of type %s but got %s" % (
+                binary_type, type(line)))
+
+        if len(line) <= 1:
+            raise ValueError("No identifier on FASTA line")
+
+        # split line at first space to get the unique identifier for
+        # this sequence
+        space_index = line.find(b" ")
+        if space_index >= 0:
+            identifier = line[1:space_index]
+        else:
+            identifier = line[1:]
+
+        return identifier.decode("ascii")
\ No newline at end of file
--- a/mhcflurry/predict_scan_command.py
+++ b/mhcflurry/predict_scan_command.py
+'''
+Scan protein sequences using the MHCflurry presentation predictor.
+
+By default, subsequences with affinity percentile ranks less than 2.0 are
+returned. You can also specify --results-all to return predictions for all
+subsequences, or --results-best to return the top subsequence for each sequence.
+
+Examples:
+
+Scan a set of sequences in a FASTA file for binders to any alleles in a MHC I
+genotype:
+
+    mhcflurry-predict-scan \
+        test/data/example.fasta \
+        --alleles HLA-A*02:01,HLA-A*03:01,HLA-B*57:01,HLA-B*45:01,HLA-C*02:01,HLA-C*07:02
+
+Instead of a FASTA, you can also pass a CSV that has "sequence_id" and "sequence"
+columns.
+
+You can also specify multiple MHC I genotypes to scan:
+
+    mhcflurry-predict-scan \
+        test/data/example.fasta \
+        --alleles \
+            HLA-A*02:01,HLA-A*03:01,HLA-B*57:01,HLA-B*45:01,HLA-C*02:01,HLA-C*07:02 \
+            HLA-A*01:01,HLA-A*02:06,HLA-B*68:01,HLA-B*07:02,HLA-C*01:01,HLA-C*03:01
+
+If `--out` is not specified, results are written to standard out.
+
+You can also run on sequences specified on the commandline:
+
+mhcflurry-predict-scan \
+    --sequences MGYINVFAFPFTIYSLLLCRMNSRNYIAQVDVVNFNLT \
+    --alleles HLA-A0201 H-2Kb
+'''
+from __future__ import (
+    print_function,
+    division,
+    absolute_import,
+)
+
+import sys
+import argparse
+import itertools
+import logging
+import os
+
+import pandas
+
+from .downloads import get_default_class1_presentation_models_dir
+from .class1_affinity_predictor import Class1AffinityPredictor
+from .class1_presentation_predictor import Class1PresentationPredictor
+from .fasta import read_fasta_to_dataframe
+from .version import __version__
+
+
+parser = argparse.ArgumentParser(
+    description=__doc__,
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    add_help=False)
+
+
+helper_args = parser.add_argument_group(title="Help")
+helper_args.add_argument(
+    "-h", "--help",
+    action="help",
+    help="Show this help message and exit"
+)
+helper_args.add_argument(
+    "--list-supported-alleles",
+    action="store_true",
+    default=False,
+    help="Prints the list of supported alleles and exits"
+)
+helper_args.add_argument(
+    "--list-supported-peptide-lengths",
+    action="store_true",
+    default=False,
+    help="Prints the list of supported peptide lengths and exits"
+)
+helper_args.add_argument(
+    "--version",
+    action="version",
+    version="mhcflurry %s" % __version__,
+)
+
+input_args = parser.add_argument_group(title="Input options")
+input_args.add_argument(
+    "input",
+    metavar="INPUT.csv",
+    nargs="?",
+    help="Input CSV or FASTA")
+input_args.add_argument(
+    "--input-format",
+    choices=("guess", "csv", "fasta"),
+    default="guess",
+    help="Format of input file. By default, it is guessed from the file "
+         "extension.")
+input_args.add_argument(
+    "--alleles",
+    metavar="ALLELE",
+    nargs="+",
+    help="Alleles to predict")
+input_args.add_argument(
+    "--sequences",
+    metavar="SEQ",
+    nargs="+",
+    help="Sequences to predict (exclusive with passing an input file)")
+input_args.add_argument(
+    "--sequence-id-column",
+    metavar="NAME",
+    default="sequence_id",
+    help="Input CSV column name for sequence IDs. Default: '%(default)s'")
+input_args.add_argument(
+    "--sequence-column",
+    metavar="NAME",
+    default="sequence",
+    help="Input CSV column name for sequences. Default: '%(default)s'")
+input_args.add_argument(
+    "--no-throw",
+    action="store_true",
+    default=False,
+    help="Return NaNs for unsupported alleles or peptides instead of raising")
+
+results_args = parser.add_argument_group(title="Result options")
+results_args.add_argument(
+    "--peptide-lengths",
+    type=int,
+    nargs="+",
+    default=[8, 9, 10, 11],
+    help="Peptide lengths to consider. Default: %(default)s.")
+comparison_quantities = [
+    "presentation_score",
+    "processing_score",
+    "affinity",
+    "affinity_percentile",
+]
+results_args.add_argument(
+    "--results-all",
+    action="store_true",
+    default=False,
+    help="")
+results_args.add_argument(
+    "--results-best",
+    choices=comparison_quantities,
+    help="Take the top result for each sequence according to the specified "
+    "predicted quantity")
+results_args.add_argument(
+    "--results-filtered",
+    choices=comparison_quantities,
+    help="Filter results by the specified quantity.")
+results_args.add_argument(
+    "--threshold-presentation-score",
+    type=float,
+    default=0.7,
+    help="Threshold if filtering by presentation score. Default: %(default)s")
+results_args.add_argument(
+    "--threshold-processing-score",
+    type=float,
+    default=0.5,
+    help="Threshold if filtering by processing score. Default: %(default)s")
+results_args.add_argument(
+    "--threshold-affinity",
+    type=float,
+    default=500,
+    help="Threshold if filtering by affinity. Default: %(default)s")
+results_args.add_argument(
+    "--threshold-affinity-percentile",
+    type=float,
+    default=2.0,
+    help="Threshold if filtering by affinity percentile. Default: %(default)s")
+
+
+output_args = parser.add_argument_group(title="Output options")
+output_args.add_argument(
+    "--out",
+    metavar="OUTPUT.csv",
+    help="Output CSV")
+output_args.add_argument(
+    "--output-delimiter",
+    metavar="CHAR",
+    default=",",
+    help="Delimiter character for results. Default: '%(default)s'")
+output_args.add_argument(
+    "--no-affinity-percentile",
+    default=False,
+    action="store_true",
+    help="Do not include affinity percentile rank")
+
+model_args = parser.add_argument_group(title="Model options")
+model_args.add_argument(
+    "--models",
+    metavar="DIR",
+    default=None,
+    help="Directory containing presentation models."
+    "Default: %s" % get_default_class1_presentation_models_dir(
+        test_exists=False))
+model_args.add_argument(
+    "--no-flanking",
+    action="store_true",
+    default=False,
+    help="Do not use flanking sequence information in predictions")
+
+
+def run(argv=sys.argv[1:]):
+    logging.getLogger('tensorflow').disabled = True
+
+    if not argv:
+        parser.print_help()
+        parser.exit(1)
+
+    args = parser.parse_args(argv)
+
+    # It's hard to pass a tab in a shell, so we correct a common error:
+    if args.output_delimiter == "\\t":
+        args.output_delimiter = "\t"
+
+    result_args = {
+        "all": args.results_all,
+        "best": args.results_best,
+        "filtered": args.results_filtered,
+    }
+    if all([not bool(arg) for arg in result_args.values()]):
+        result_args["filtered"] = "affinity_percentile"
+
+    if sum([bool(arg) for arg in result_args.values()]) > 1:
+        parser.error(
+            "Specify at most one of --results-all, --results-best, "
+            "--results-filtered")
+
+    (result,) = [key for (key, value) in result_args.items() if value]
+    result_comparison_quantity = result_args[result]
+    result_filter_value = None if result != "filtered" else {
+        "presentation_score": args.threshold_presentation_score,
+        "processing_score": args.threshold_processing_score,
+        "affinity": args.threshold_affinity,
+        "affinity_percentile": args.threshold_affinity_percentile,
+    }[result_comparison_quantity]
+
+    models_dir = args.models
+    if models_dir is None:
+        # The reason we set the default here instead of in the argument parser
+        # is that we want to test_exists at this point, so the user gets a
+        # message instructing them to download the models if needed.
+        models_dir = get_default_class1_presentation_models_dir(test_exists=True)
+
+    predictor = Class1PresentationPredictor.load(models_dir)
+
+    if args.list_supported_alleles:
+        print("\n".join(predictor.supported_alleles))
+        return
+
+    if args.list_supported_peptide_lengths:
+        min_len, max_len = predictor.supported_peptide_lengths
+        print("\n".join([str(l) for l in range(min_len, max_len+1)]))
+        return
+
+    if args.input:
+        if args.sequences:
+            parser.error(
+                "If an input file is specified, do not specify --sequences")
+
+        input_format = args.input_format
+        if input_format == "guess":
+            extension = args.input.lower().split(".")[-1]
+            if extension in ["gz", "bzip2"]:
+                extension = args.input.lower().split(".")[-2]
+
+            if extension == "csv":
+                input_format = "csv"
+            elif extension in ["fasta", "fa"]:
+                input_format = "fasta"
+            else:
+                parser.error(
+                    "Couldn't guess input format from file extension: %s\n"
+                    "Pass the --input-format argument to specify if it is a "
+                    "CSV or fasta file" % args.input)
+            print("Guessed input file format:", input_format)
+
+        if input_format == "csv":
+            df = pandas.read_csv(args.input)
+            print("Read input CSV with %d rows, columns are: %s" % (
+                len(df), ", ".join(df.columns)))
+            for col in [args.sequence_column,]:
+                if col not in df.columns:
+                    raise ValueError(
+                        "No such column '%s' in CSV. Columns are: %s" % (
+                            col, ", ".join(["'%s'" % c for c in df.columns])))
+
+        elif input_format == "fasta":
+            df = read_fasta_to_dataframe(args.input)
+            print("Read input fasta with %d sequences" % len(df))
+            print(df)
+        else:
+            raise ValueError("Unsupported input format", input_format)
+    else:
+        if not args.sequences:
+            parser.error(
+                "Specify either an input file or the --sequences argument")
+
+        df = pandas.DataFrame({
+            args.sequence_column: args.sequences,
+        })
+
+    if args.sequence_id_column not in df:
+        df[args.sequence_id_column] = "sequence_" + df.index.astype(str)
+
+    df = df.set_index(args.sequence_id_column)
+
+    genotypes = pandas.Series(args.alleles).str.split(r"[,\s]+")
+    genotypes.index = genotypes.index.map(lambda i: "genotype_%02d" % i)
+
+    result_df = predictor.predict_sequences(
+        sequences=df[args.sequence_column].to_dict(),
+        alleles=genotypes.to_dict(),
+        result=result,
+        comparison_quantity=result_comparison_quantity,
+        filter_value=result_filter_value,
+        peptide_lengths=args.peptide_lengths,
+        use_flanks=not args.no_flanking,
+        include_affinity_percentile=not args.no_affinity_percentile,
+        throw=not args.no_throw)
+
+    if args.out:
+        result_df.to_csv(args.out, index=False, sep=args.output_delimiter)
+        print("Wrote: %s" % args.out)
+    else:
+        result_df.to_csv(sys.stdout, index=False, sep=args.output_delimiter)
--- a/setup.py
+++ b/setup.py
@@ -73,6 +73,7 @@ if __name__ == '__main__':
            'console_scripts': [
                'mhcflurry-downloads = mhcflurry.downloads_command:run',
                'mhcflurry-predict = mhcflurry.predict_command:run',
+                'mhcflurry-predict-scan = mhcflurry.predict_scan_command:run',
                'mhcflurry-class1-train-allele-specific-models = '
                    'mhcflurry.train_allele_specific_models_command:run',
                'mhcflurry-class1-train-pan-allele-models = '

--- a/test/data/example.fasta
+++ b/test/data/example.fasta
+>QHN73810.1 surface glycoprotein [Severe acute respiratory syndrome coronavirus 2]
+MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHV
+SGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPF
+LGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPI
+NLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYN
+ENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASV
+YAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIAD
+YNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYF
+PLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFL
+PFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQDVNCTEVPVAIHADQLT
+PTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLG
+AENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGI
+AVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDC
+LGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIG
+VTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDI
+LSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLM
+SFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNT
+FVSGNCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVA
+KNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDD
+SEPVLKGVKLHYT
+>protein1
+MDSKGSSQKGSRLLLLLVVSNLLLCQGVVSTPVCPNGPGNCQV
+EMFNEFDKRYAQGKGFITMALNSCHTSSLPTPEDKEQAQQTHH
+>protein2
+VTEVRGMKGAPDAILSRAIEIEEENKRLLEGMEMIFGQVIPGA
+ARYSAFYNLLHCLRRDSSKIDTYLKLLNCRIIYNNNC
--- a/test/test_class1_presentation_predictor.py
+++ b/test/test_class1_presentation_predictor.py
@@ -132,6 +132,87 @@ def test_basic():
        numpy.testing.assert_array_almost_equal(
            test_df["prediction2"], other_test_df["prediction2"], decimal=6)

+
+def test_downloaded_predictor_small():
+    global PRESENTATION_PREDICTOR
+
+    # Test sequence scanning
+    scan_results = PRESENTATION_PREDICTOR.predict_sequences(
+        sequences=[
+            "MESLVPGFN",
+            "QPYVFIKRS",
+            "AGGHSYGAD",
+        ],
+        alleles={
+            "HLA-A*02:01": ["HLA-A*02:01"],
+            "HLA-C*02:01": ["HLA-C*02:01"],
+        },
+        peptide_lengths=[9],
+        result="best")
+    print(scan_results)
+    assert_equal(len(scan_results), 6)
+
+    scan_results = PRESENTATION_PREDICTOR.predict_sequences(
+        sequences=[
+            "MESLVPGFN",
+            "QPYVFIKRS",
+            "AGGHSYGAD",
+        ],
+        alleles={
+            "HLA-A*02:01": ["HLA-A*02:01"],
+            "HLA-C*02:01": ["HLA-C*02:01"],
+        },
+        peptide_lengths=[8, 9],
+        result="best")
+    print(scan_results)
+    assert_equal(len(scan_results), 6)
+
+    scan_results = PRESENTATION_PREDICTOR.predict_sequences(
+        sequences=[
+            "MESLVPGFN",
+            "QPYVFIKRS",
+            "AGGHSYGAD",
+        ],
+        alleles={
+            "HLA-A*02:01": ["HLA-A*02:01"],
+            "HLA-C*02:01": ["HLA-C*02:01"],
+        },
+        peptide_lengths=[9],
+        result="all")
+    print(scan_results)
+    assert_equal(len(scan_results), 6)
+
+    scan_results = PRESENTATION_PREDICTOR.predict_sequences(
+        sequences=[
+            "MESLVPGFN",
+            "QPYVFIKRS",
+            "AGGHSYGAD",
+        ],
+        alleles={
+            "HLA-A*02:01": ["HLA-A*02:01"],
+            "HLA-C*02:01": ["HLA-C*02:01"],
+        },
+        peptide_lengths=[8, 9],
+        result="all")
+    print(scan_results)
+    assert_equal(len(scan_results), 18)
+
+    scan_results = PRESENTATION_PREDICTOR.predict_sequences(
+        sequences=[
+            "MESLVPGFN",
+            "QPYVFIKRS",
+            "AGGHSYGAD",
+        ],
+        alleles={
+            "HLA-A*02:01": ["HLA-A*02:01"],
+            "HLA-C*02:01": ["HLA-C*02:01"],
+        },
+        peptide_lengths=[10],
+        result="all")
+    print(scan_results)
+    assert_equal(len(scan_results), 0)
+
+
 def test_downloaded_predictor():
    global PRESENTATION_PREDICTOR

@@ -220,3 +301,32 @@ def test_downloaded_predictor():

    assert len(scan_results4) > 200, len(scan_results4)
    assert_less(scan_results4.iloc[0].affinity, 100)
+
+    scan_results5 = PRESENTATION_PREDICTOR.predict_sequences(
+        result="all",
+        comparison_quantity="affinity",
+        sequences={
+            "seq1": "MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHLKDGTCGLVEVEKGVLPQLE",
+            "seq2": "QPYVFIKRSDARTAPHGHVMVELVAELEGIQYGRSGETLGVLVPHVGEIPVAYRKVLLRKNGNKG",
+            "seq3": "AGGHSYGADLKSFDLGDELGTDPYEDFQENWNTKHSSGVTRELMRELNGGAYTRYVDNNFCGPDG",
+        },
+        alleles={
+            "sample1": [
+                "HLA-A*02:01",
+                "HLA-A*03:01",
+                "HLA-B*57:01",
+                "HLA-B*44:02",
+                "HLA-C*02:01",
+                "HLA-C*07:01",
+            ],
+            "sample2": [
+                "HLA-A*01:01",
+                "HLA-A*02:06",
+                "HLA-B*07:02",
+                "HLA-B*44:02",
+                "HLA-C*03:01",
+                "HLA-C*07:02",
+            ],
+        })
+    print(scan_results5)
+    assert_equal(len(scan_results5), len(scan_results4) * 2)