add test/test_predict_scan_command.py

4082613a · Tim O'Donnell · 3dc666cb · 4082613a · 4082613a · 4082613a
Commit 4082613a authored 5 years ago by Tim O'Donnell
--- a/mhcflurry/class1_presentation_predictor.py
+++ b/mhcflurry/class1_presentation_predictor.py
@@ -93,11 +93,13 @@ class Class1PresentationPredictor(object):
        Parameters
        ----------
        peptides : list of string
+        alleles : dict of string -> list of string
+            Keys are experiment names, values are the alleles (genotype) for
+            that sample
        experiment_names : list of string [same length as peptides]
            Sample names corresponding to each peptide. These are used to
-            lookup the alleles for each peptide in the alleles dict.
+            lookup the alleles for each peptide in the alleles dict. If not
-        alleles : dict of string -> list of string
+            specified, then all combinations of experiment names
-            Keys are experiment names, values are the alleles for that sample
        include_affinity_percentile : bool
            Whether to include affinity percentile ranks
        verbose : int
@@ -585,6 +587,8 @@ class Class1PresentationPredictor(object):
            peptide, n_flank, c_flank, sequence_name, affinity, best_allele,
            processing_score, presentation_score
        """
+        if comparison_quantity is None:
+            comparison_quantity = "presentation_score"
        processing_predictor = self.processing_predictor_with_flanks
        if not use_flanks or processing_predictor is None:

--- a/mhcflurry/predict_command.py
+++ b/mhcflurry/predict_command.py
@@ -11,7 +11,7 @@ Examples:
 Write a CSV file containing the contents of INPUT.csv plus additional columns
 giving MHCflurry predictions:
-    $ mhcflurry-predict INPUT.csv --out RESULT.csv
+$ mhcflurry-predict INPUT.csv --out RESULT.csv
 The input CSV file is expected to contain columns "allele", "peptide", and,
 optionally, "n_flank", and "c_flank".
@@ -19,10 +19,23 @@ optionally, "n_flank", and "c_flank".
 If `--out` is not specified, results are written to stdout.
 You can also run on alleles and peptides specified on the commandline, in
-which case predictions are written for all combinations of alleles and
+which case predictions are written for *all combinations* of alleles and
 peptides:
-    $ mhcflurry-predict --alleles HLA-A0201 H-2Kb --peptides SIINFEKL DENDREKLLL
+$ mhcflurry-predict --alleles HLA-A0201 H-2Kb --peptides SIINFEKL DENDREKLLL
+Instead of individual alleles (in a CSV or on the command line), you can also
+give a comma separated list of alleles giving a sample genotype. In this case,
+the tightest binding affinity across the alleles for the sample will be
+returned. For example:
+$ mhcflurry-predict --peptides SIINFEKL DENDREKLLL \
+    --alleles \
+        HLA-A*02:01,HLA-A*03:01,HLA-B*57:01,HLA-B*45:01,HLA-C*02:01,HLA-C*07:02 \
+        HLA-A*01:01,HLA-A*02:06,HLA-B*44:02,HLA-B*07:02,HLA-C*01:01,HLA-C*03:01
+will give the tightest predicted affinities across alleles for each of the two
+genotypes specified for each peptide.
 '''
 from __future__ import (
    print_function,

--- a/mhcflurry/predict_scan_command.py
+++ b/mhcflurry/predict_scan_command.py
 '''
 Scan protein sequences using the MHCflurry presentation predictor.
-By default, subsequences with affinity percentile ranks less than 2.0 are
+By default, sub-sequences (peptides) with affinity percentile ranks less than
-returned. You can also specify --results-all to return predictions for all
+2.0 are returned. You can also specify --results-all to return predictions for
-subsequences, or --results-best to return the top subsequence for each sequence.
+all peptides, or --results-best to return the top peptide for each sequence.
 Examples:
 Scan a set of sequences in a FASTA file for binders to any alleles in a MHC I
 genotype:
-    mhcflurry-predict-scan \
+$ mhcflurry-predict-scan \
-        test/data/example.fasta \
+    test/data/example.fasta \
-        --alleles HLA-A*02:01,HLA-A*03:01,HLA-B*57:01,HLA-B*45:01,HLA-C*02:01,HLA-C*07:02
+    --alleles HLA-A*02:01,HLA-A*03:01,HLA-B*57:01,HLA-B*45:01,HLA-C*02:01,HLA-C*07:02
 Instead of a FASTA, you can also pass a CSV that has "sequence_id" and "sequence"
 columns.
-You can also specify multiple MHC I genotypes to scan:
+You can also specify multiple MHC I genotypes to scan as space-separated
+arguments to the --alleles option:
-    mhcflurry-predict-scan \
+$ mhcflurry-predict-scan \
-        test/data/example.fasta \
+    test/data/example.fasta \
-        --alleles \
+    --alleles \
-            HLA-A*02:01,HLA-A*03:01,HLA-B*57:01,HLA-B*45:01,HLA-C*02:01,HLA-C*07:02 \
+        HLA-A*02:01,HLA-A*03:01,HLA-B*57:01,HLA-B*45:01,HLA-C*02:02,HLA-C*07:02 \
-            HLA-A*01:01,HLA-A*02:06,HLA-B*68:01,HLA-B*07:02,HLA-C*01:01,HLA-C*03:01
+        HLA-A*01:01,HLA-A*02:06,HLA-B*44:02,HLA-B*07:02,HLA-C*01:02,HLA-C*03:01
 If `--out` is not specified, results are written to standard out.
-You can also run on sequences specified on the commandline:
+You can also specify sequences on the commandline:
 mhcflurry-predict-scan \
    --sequences MGYINVFAFPFTIYSLLLCRMNSRNYIAQVDVVNFNLT \
-    --alleles HLA-A0201 H-2Kb
+    --alleles HLA-A*02:01,HLA-A*03:01,HLA-B*57:01,HLA-B*45:01,HLA-C*02:02,HLA-C*07:02
 '''
 from __future__ import (
    print_function,
@@ -48,7 +50,6 @@ import os
 import pandas
 from .downloads import get_default_class1_presentation_models_dir
-from .class1_affinity_predictor import Class1AffinityPredictor
 from .class1_presentation_predictor import Class1PresentationPredictor
 from .fasta import read_fasta_to_dataframe
 from .version import __version__
@@ -70,13 +71,13 @@ helper_args.add_argument(
    "--list-supported-alleles",
    action="store_true",
    default=False,
-    help="Prints the list of supported alleles and exits"
+    help="Print the list of supported alleles and exits"
 )
 helper_args.add_argument(
    "--list-supported-peptide-lengths",
    action="store_true",
    default=False,
-    help="Prints the list of supported peptide lengths and exits"
+    help="Print the list of supported peptide lengths and exits"
 )
 helper_args.add_argument(
    "--version",
@@ -87,7 +88,7 @@ helper_args.add_argument(
 input_args = parser.add_argument_group(title="Input options")
 input_args.add_argument(
    "input",
-    metavar="INPUT.csv",
+    metavar="INPUT",
    nargs="?",
    help="Input CSV or FASTA")
 input_args.add_argument(
@@ -139,7 +140,7 @@ results_args.add_argument(
    "--results-all",
    action="store_true",
    default=False,
-    help="")
+    help="Return results for all peptides regardless of affinity, etc.")
 results_args.add_argument(
    "--results-best",
    choices=comparison_quantities,
@@ -229,7 +230,8 @@ def run(argv=sys.argv[1:]):
            "--results-filtered")
    (result,) = [key for (key, value) in result_args.items() if value]
-    result_comparison_quantity = result_args[result]
+    result_comparison_quantity = (
+        None if result == "all" else result_args[result])
    result_filter_value = None if result != "filtered" else {
        "presentation_score": args.threshold_presentation_score,
        "processing_score": args.threshold_processing_score,

--- a/test/data/example.fasta
+++ b/test/data/example.fasta
->QHN73810.1 surface glycoprotein [Severe acute respiratory syndrome coronavirus 2]
+>QHN73810.1 surface glycoprotein [Severe acute respiratory syndrome coronavirus 2] prefix
 MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHV
 SGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPF
 LGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPI
-NLVRDLPQGFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFLLKYN
-ENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITNLCPFGEVFNATRFASV
-YAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCFTNVYADSFVIRGDEVRQIAPGQTGKIAD
-YNYKLPDDFTGCVIAWNSNNLDSKVGGNYNYLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYF
-PLQSYGFQPTNGVGYQPYRVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFL
-PFQQFGRDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQDVNCTEVPVAIHADQLT
-PTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPRRARSVASQSIIAYTMSLG
-AENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTMYICGDSTECSNLLLQYGSFCTQLNRALTGI
-AVEQDKNTQEVFAQVKQIYKTPPIKDFGGFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDC
-LGDIAARDLICAQKFNGLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIG
-VTQNVLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGAISSVLNDI
-LSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMSECVLGQSKRVDFCGKGYHLM
-SFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAHFPREGVFVSNGTHWFVTQRNFYEPQIITTDNT
-FVSGNCDVVIGIVNNTVYDPLQPELDSFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVA
-KNLNESLIDLQELGKYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDD
-SEPVLKGVKLHYT
 >protein1
 MDSKGSSQKGSRLLLLLVVSNLLLCQGVVSTPVCPNGPGNCQV
 EMFNEFDKRYAQGKGFITMALNSCHTSSLPTPEDKEQAQQTHH

--- a/test/test_predict_command.py
+++ b/test/test_predict_command.py
+import logging
+logging.getLogger('matplotlib').disabled = True
+logging.getLogger('tensorflow').disabled = True
 import tempfile
 import os

--- a/test/test_predict_scan_command.py
+++ b/test/test_predict_scan_command.py
+import logging
+logging.getLogger('matplotlib').disabled = True
+logging.getLogger('tensorflow').disabled = True
+import tempfile
+import os
+import pandas
+from numpy.testing import assert_equal, assert_array_less, assert_array_equal
+from mhcflurry import predict_scan_command
+from mhcflurry.testing_utils import cleanup, startup
+teardown = cleanup
+setup = startup
+from . import data_path
+def test_fasta():
+    args = [
+        data_path("example.fasta"),
+        "--alleles",
+        "HLA-A*02:01,HLA-A*03:01,HLA-B*57:01,HLA-B*45:01,HLA-C*02:02,HLA-C*07:02",
+    ]
+    deletes = []
+    try:
+        fd_out = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
+        deletes.append(fd_out.name)
+        full_args = args + ["--out", fd_out.name]
+        print("Running with args: %s" % full_args)
+        predict_scan_command.run(full_args)
+        result = pandas.read_csv(fd_out.name)
+        print(result)
+        assert not result.isnull().any().any()
+    finally:
+        for delete in deletes:
+            os.unlink(delete)
+    assert_equal(result.best_allele.nunique(), 6)
+    assert_equal(result.sequence_name.nunique(), 3)
+    assert_array_less(result.affinity_percentile, 2.0)
+def test_fasta_50nm():
+    args = [
+        data_path("example.fasta"),
+        "--alleles",
+        "HLA-A*02:01,HLA-A*03:01,HLA-B*57:01,HLA-B*45:01,HLA-C*02:02,HLA-C*07:02",
+        "--results-filtered", "affinity",
+        "--threshold-affinity", "50",
+    ]
+    deletes = []
+    try:
+        fd_out = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
+        deletes.append(fd_out.name)
+        full_args = args + ["--out", fd_out.name]
+        print("Running with args: %s" % full_args)
+        predict_scan_command.run(full_args)
+        result = pandas.read_csv(fd_out.name)
+        print(result)
+        assert not result.isnull().any().any()
+    finally:
+        for delete in deletes:
+            os.unlink(delete)
+    assert len(result) > 0
+    assert_array_less(result.affinity, 50)
+def test_fasta_best():
+    args = [
+        data_path("example.fasta"),
+        "--alleles",
+        "HLA-A*02:01,HLA-A*03:01,HLA-B*57:01,HLA-B*45:01,HLA-C*02:02,HLA-C*07:02",
+        "--results-best", "affinity_percentile",
+    ]
+    deletes = []
+    try:
+        fd_out = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
+        deletes.append(fd_out.name)
+        full_args = args + ["--out", fd_out.name]
+        print("Running with args: %s" % full_args)
+        predict_scan_command.run(full_args)
+        result = pandas.read_csv(fd_out.name)
+        print(result)
+        assert not result.isnull().any().any()
+    finally:
+        for delete in deletes:
+            os.unlink(delete)
+    assert len(result) > 0
+    assert_array_equal(
+        result.groupby(["sequence_name"]).peptide.count().values, 1)
+def test_commandline_sequences():
+    args = [
+        "--sequences", "ASDFGHKL", "QWERTYIPCVNM",
+        "--alleles", "HLA-A0201,HLA-A0301", "H-2-Kb",
+        "--peptide-lengths", "8",
+        "--results-all",
+    ]
+    deletes = []
+    try:
+        fd_out = tempfile.NamedTemporaryFile(delete=False, suffix=".csv")
+        deletes.append(fd_out.name)
+        full_args = args + ["--out", fd_out.name]
+        print("Running with args: %s" % full_args)
+        predict_scan_command.run(full_args)
+        result = pandas.read_csv(fd_out.name)
+        print(result)
+    finally:
+        for delete in deletes:
+            os.unlink(delete)
+    print(result)
+    assert_equal(result.sequence_name.nunique(), 2)
+    assert_equal(result.best_allele.nunique(), 3)
+    assert_equal(result.experiment_name.nunique(), 2)
+    assert_equal((result.peptide == "ASDFGHKL").sum(), 2)
+    assert_equal((result.peptide != "ASDFGHKL").sum(), 10)
\ No newline at end of file