From ba4b1cfb16a600fda7c67f647d64726e345d284b Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Sat, 25 Jan 2020 16:38:44 -0500
Subject: [PATCH] Updates preceding the cleavage -> processing rename

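This patch renames get_affinity_predictions / get_cleavage_predictions to
predict_affinity / predict_cleavage and renames the output columns
tightest_affinity, tightest_affinity_allele, and score to affinity,
best_allele, and presentation_score. A minimal usage sketch of the renamed
API follows. It is illustrative only: the peptide and allele values are
placeholders taken from the test below, and the no-argument load() falling
back to the default presentation models is an assumption; this patch itself
only exercises load(models_dir).

    from mhcflurry.class1_presentation_predictor import (
        Class1PresentationPredictor)

    # Assumption: load() without arguments picks up the default presentation
    # models directory; otherwise pass an explicit models_dir.
    predictor = Class1PresentationPredictor.load()

    # peptides must be a list, not a bare string; alleles may be a flat list
    # (treated as a single experiment) or a dict of experiment -> allele list.
    df = predictor.predict_to_dataframe(
        peptides=["SIINFEKL", "SIINFEKD"],
        alleles=["HLA-A0201", "H-2-Kb"],
        include_affinity_percentile=True)

    # Result columns now include "affinity", "best_allele", and
    # "presentation_score" (formerly "tightest_affinity",
    # "tightest_affinity_allele", and "score").
    print(df[["peptide", "best_allele", "affinity", "presentation_score"]])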
---
 .travis.yml                                |  17 ++-
 mhcflurry/class1_affinity_predictor.py     |   8 +-
 mhcflurry/class1_presentation_predictor.py |  59 ++++++---
 mhcflurry/downloads.py                     |   2 +-
 mhcflurry/predict_command.py               | 142 ++++++++++++++-------
 test/test_predict_command.py               |   5 +-
 6 files changed, 157 insertions(+), 76 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index b4b2f9f5..c0f95d62 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -46,18 +46,21 @@ script:
       $(mhcflurry-downloads url data_curated)
       $(mhcflurry-downloads url data_mass_spec_annotated)
       $(mhcflurry-downloads url models_class1)
+      $(mhcflurry-downloads url models_class1_presentation)
       $(mhcflurry-downloads url models_class1_pan)
+      $(mhcflurry-downloads url models_class1_pan_variants)
       $(mhcflurry-downloads url allele_sequences)
       -P /tmp/downloads
   - ls -lh /tmp/downloads
   -
     mhcflurry-downloads fetch
-    data_curated
-    data_mass_spec_annotated
-    models_class1
-    models_class1_pan
-    models_class1_pan_variants
-    allele_sequences
-    --already-downloaded-dir /tmp/downloads
+      data_curated
+      data_mass_spec_annotated
+      models_class1
+      models_class1_presentation
+      models_class1_pan
+      models_class1_pan_variants
+      allele_sequences
+      --already-downloaded-dir /tmp/downloads
   - mhcflurry-downloads info  # just to test this command works
   - nosetests --with-timer -sv test
diff --git a/mhcflurry/class1_affinity_predictor.py b/mhcflurry/class1_affinity_predictor.py
index 6b16ef07..9a3258cf 100644
--- a/mhcflurry/class1_affinity_predictor.py
+++ b/mhcflurry/class1_affinity_predictor.py
@@ -892,11 +892,15 @@ class Class1AffinityPredictor(object):
         numpy.array of float
         """
         if allele is not None:
+            normalized_allele = mhcnames.normalize_allele_name(allele)
             try:
-                transform = self.allele_to_percent_rank_transform[allele]
+                transform = self.allele_to_percent_rank_transform[normalized_allele]
                 return transform.transform(affinities)
             except KeyError:
-                msg = "Allele %s has no percentile rank information" % allele
+                msg = "Allele %s has no percentile rank information" % (
+                    allele + (
+                        "" if allele == normalized_allele
+                        else " (normalized to %s)" % normalized_allele))
                 if throw:
                     raise ValueError(msg)
                 warnings.warn(msg)
diff --git a/mhcflurry/class1_presentation_predictor.py b/mhcflurry/class1_presentation_predictor.py
index e1181a15..a7430cb4 100644
--- a/mhcflurry/class1_presentation_predictor.py
+++ b/mhcflurry/class1_presentation_predictor.py
@@ -58,9 +58,22 @@ class Class1PresentationPredictor(object):
             dict(metadata_dataframes) if metadata_dataframes else {})
         self._models_cache = {}
 
-    def get_affinity_predictions(
-            self, peptides, experiment_names, alleles, verbose=1):
+    @property
+    def supported_alleles(self):
+        return self.affinity_predictor.supported_alleles
 
+    @property
+    def supported_peptide_lengths(self):
+        return self.affinity_predictor.supported_peptide_lengths
+
+    def predict_affinity(
+            self,
+            peptides,
+            experiment_names,
+            alleles,
+            include_affinity_percentile=False,
+            verbose=1,
+            throw=True):
         df = pandas.DataFrame({
             "peptide": numpy.array(peptides, copy=False),
             "experiment_name": numpy.array(experiment_names, copy=False),
@@ -80,17 +93,25 @@ class Class1PresentationPredictor(object):
                 predictions_df[allele] = self.affinity_predictor.predict(
                     peptides=experiment_peptides,
                     allele=allele,
-                    model_kwargs={'batch_size': PREDICT_BATCH_SIZE})
+                    model_kwargs={'batch_size': PREDICT_BATCH_SIZE},
+                    throw=throw)
             df.loc[
-                sub_df.index, "tightest_affinity"
+                sub_df.index, "affinity"
             ] = predictions_df.min(1).values
             df.loc[
-                sub_df.index, "tightest_affinity_allele"
+                sub_df.index, "best_allele"
             ] = predictions_df.idxmin(1).values
 
+            if include_affinity_percentile:
+                df.loc[sub_df.index, "affinity_percentile"] = (
+                    self.affinity_predictor.percentile_ranks(
+                        df.loc[sub_df.index, "affinity"].values,
+                        alleles=df.loc[sub_df.index, "best_allele"].values,
+                        throw=False))
+
         return df
 
-    def get_cleavage_predictions(
+    def predict_cleavage(
             self, peptides, n_flanks=None, c_flanks=None, verbose=1):
 
         if verbose > 0:
@@ -128,12 +149,12 @@ class Class1PresentationPredictor(object):
             c_flanks=None,
             verbose=1):
 
-        df = self.get_affinity_predictions(
+        df = self.predict_affinity(
             peptides=peptides,
             experiment_names=experiment_names,
             alleles=alleles,
             verbose=verbose)
-        df["affinity_score"] = from_ic50(df.tightest_affinity)
+        df["affinity_score"] = from_ic50(df.affinity)
         df["target"] = numpy.array(targets, copy=False)
 
         if (n_flanks is None) != (c_flanks is None):
@@ -157,7 +178,7 @@ class Class1PresentationPredictor(object):
             if verbose > 0:
                 print("Training variant", model_name)
 
-            df["cleavage_prediction"] = self.get_cleavage_predictions(
+            df["cleavage_prediction"] = self.predict_cleavage(
                 peptides=df.peptide.values,
                 n_flanks=n_flanks if with_flanks else None,
                 c_flanks=c_flanks if with_flanks else None,
@@ -206,7 +227,7 @@ class Class1PresentationPredictor(object):
             experiment_names=experiment_names,
             n_flanks=n_flanks,
             c_flanks=c_flanks,
-            verbose=verbose).score.values
+            verbose=verbose).presentation_score.values
 
     def predict_to_dataframe(
             self,
@@ -215,7 +236,9 @@ class Class1PresentationPredictor(object):
             experiment_names=None,
             n_flanks=None,
             c_flanks=None,
-            verbose=1):
+            include_affinity_percentile=False,
+            verbose=1,
+            throw=True):
 
         if isinstance(peptides, string_types):
             raise TypeError("peptides must be a list not a string")
@@ -246,17 +269,19 @@ class Class1PresentationPredictor(object):
                 "experiment1": alleles,
             }
 
-        df = self.get_affinity_predictions(
+        df = self.predict_affinity(
             peptides=peptides,
             experiment_names=experiment_names,
             alleles=alleles,
-            verbose=verbose)
-        df["affinity_score"] = from_ic50(df.tightest_affinity)
+            include_affinity_percentile=include_affinity_percentile,
+            verbose=verbose,
+            throw=throw)
+        df["affinity_score"] = from_ic50(df.affinity)
 
         if (n_flanks is None) != (c_flanks is None):
             raise ValueError("Specify both or neither of n_flanks, c_flanks")
 
-        df["cleavage_prediction"] = self.get_cleavage_predictions(
+        df["cleavage_prediction"] = self.predict_cleavage(
             peptides=df.peptide.values,
             n_flanks=n_flanks,
             c_flanks=c_flanks,
@@ -264,7 +289,9 @@ class Class1PresentationPredictor(object):
 
         model_name = 'with_flanks' if n_flanks is not None else "without_flanks"
         model = self.get_model(model_name)
-        df["score"] = model.predict_proba(df[self.model_inputs].values)[:,1]
+        df["presentation_score"] = model.predict_proba(
+            df[self.model_inputs].values)[:,1]
+        del df["affinity_score"]
         return df
 
     def save(self, models_dir):
diff --git a/mhcflurry/downloads.py b/mhcflurry/downloads.py
index abeedadb..3f8522ec 100644
--- a/mhcflurry/downloads.py
+++ b/mhcflurry/downloads.py
@@ -120,7 +120,7 @@ def get_default_class1_presentation_models_dir(test_exists=True):
             raise IOError("No such directory: %s" % result)
         return result
     return get_path(
-        "models_class1_pan_refined", "presentation", test_exists=test_exists)
+        "models_class1_presentation", "models", test_exists=test_exists)
 
 
 def get_default_class1_cleavage_models_dir(test_exists=True):
diff --git a/mhcflurry/predict_command.py b/mhcflurry/predict_command.py
index 17a687f5..facbe0d7 100644
--- a/mhcflurry/predict_command.py
+++ b/mhcflurry/predict_command.py
@@ -26,16 +26,19 @@ from __future__ import (
     division,
     absolute_import,
 )
+
 import sys
 import argparse
 import itertools
 import logging
+import os
 
 import pandas
 
 from .common import set_keras_backend
-from .downloads import get_default_class1_models_dir
+from .downloads import get_default_class1_models_dir, get_default_class1_presentation_models_dir
 from .class1_affinity_predictor import Class1AffinityPredictor
+from .class1_presentation_predictor import Class1PresentationPredictor
 from .version import __version__
 
 
@@ -79,13 +82,12 @@ input_args.add_argument(
     "--alleles",
     metavar="ALLELE",
     nargs="+",
-    help="Alleles to predict (exclusive with --input)")
+    help="Alleles to predict (exclusive with passing an input CSV)")
 input_args.add_argument(
     "--peptides",
     metavar="PEPTIDE",
     nargs="+",
-    help="Peptides to predict (exclusive with --input)")
-
+    help="Peptides to predict (exclusive with passing an input CSV)")
 
 input_mod_args = parser.add_argument_group(title="Input options")
 input_mod_args.add_argument(
@@ -98,13 +100,22 @@ input_mod_args.add_argument(
     metavar="NAME",
     default="peptide",
     help="Input column name for peptides. Default: '%(default)s'")
+input_mod_args.add_argument(
+    "--n-flank-column",
+    metavar="NAME",
+    default="n_flank",
+    help="Column giving N-terminal flanking sequence. Default: '%(default)s'")
+input_mod_args.add_argument(
+    "--c-flank-column",
+    metavar="NAME",
+    default="c_flank",
+    help="Column giving C-terminal flanking sequence. Default: '%(default)s'")
 input_mod_args.add_argument(
     "--no-throw",
     action="store_true",
     default=False,
     help="Return NaNs for unsupported alleles or peptides instead of raising")
 
-
 output_args = parser.add_argument_group(title="Output options")
 output_args.add_argument(
     "--out",
@@ -121,11 +132,16 @@ output_args.add_argument(
     default=",",
     help="Delimiter character for results. Default: '%(default)s'")
 output_args.add_argument(
-    "--include-individual-model-predictions",
+    "--no-affinity-percentile",
+    default=False,
     action="store_true",
+    help="Do not include affinity percentile rank")
+output_args.add_argument(
+    "--always-include-best-allele",
     default=False,
-    help="Include predictions from each model in the ensemble"
-)
+    action="store_true",
+    help="Always include the best_allele column even when it is identical "
+    "to the allele column (i.e. all queries are monoallelic).")
 
 model_args = parser.add_argument_group(title="Model options")
 model_args.add_argument(
@@ -134,29 +150,26 @@ model_args.add_argument(
     default=None,
     help="Directory containing models. "
     "Default: %s" % get_default_class1_models_dir(test_exists=False))
-
-implementation_args = parser.add_argument_group(title="Implementation options")
-implementation_args.add_argument(
-    "--backend",
-    choices=("tensorflow-gpu", "tensorflow-cpu", "tensorflow-default"),
-    help="Keras backend. If not specified will use system default.")
-implementation_args.add_argument(
-    "--threads",
-    metavar="N",
-    type=int,
-    help="Num threads for tensorflow to use. If unspecified, tensorflow will "
-    "pick a value based on the number of cores.")
-
+model_args.add_argument(
+    "--affinity-only",
+    action="store_true",
+    default=False,
+    help="Affinity prediction only (no cleavage or presentation)")
+model_args.add_argument(
+    "--no-flanking",
+    action="store_true",
+    default=False,
+    help="Do not use flanking sequence information even when available")
 
 def run(argv=sys.argv[1:]):
+    logging.getLogger('tensorflow').disabled = True
+
     if not argv:
         parser.print_help()
         parser.exit(1)
 
     args = parser.parse_args(argv)
 
-    set_keras_backend(backend=args.backend, num_threads=args.threads)
-
     # It's hard to pass a tab in a shell, so we correct a common error:
     if args.output_delimiter == "\\t":
         args.output_delimiter = "\t"
@@ -166,12 +179,24 @@ def run(argv=sys.argv[1:]):
         # The reason we set the default here instead of in the argument parser
         # is that we want to test_exists at this point, so the user gets a
         # message instructing them to download the models if needed.
-        models_dir = get_default_class1_models_dir(test_exists=True)
-    predictor = Class1AffinityPredictor.load(models_dir)
+        models_dir = get_default_class1_presentation_models_dir(test_exists=True)
+
+    if os.path.exists(os.path.join(models_dir, "weights.csv")):
+        # Using a presentation predictor.
+        predictor = Class1PresentationPredictor.load(models_dir)
+    else:
+        # Using just an affinity predictor.
+        affinity_predictor = Class1AffinityPredictor.load(models_dir)
+        predictor = Class1PresentationPredictor(
+            affinity_predictor=affinity_predictor)
+        if not args.affinity_only:
+            logging.warning(
+                "Specified models are an affinity predictor, which implies "
+                "--affinity-only. Specify this argument to silence this warning.")
+            args.affinity_only = True
 
     # The following two are informative commands that can come 
-    # if a wrapper would like to incorporate input validation 
-    # to not delibaretly make mhcflurry fail
+    # in handy if a wrapper would like to incorporate input validation.
     if args.list_supported_alleles:
         print("\n".join(predictor.supported_alleles))
         return
@@ -180,7 +205,6 @@ def run(argv=sys.argv[1:]):
         min_len, max_len = predictor.supported_peptide_lengths
         print("\n".join([str(l) for l in range(min_len, max_len+1)]))
         return
-    # End of early terminating routines
 
     if args.input:
         if args.alleles or args.peptides:
@@ -200,19 +224,8 @@ def run(argv=sys.argv[1:]):
             parser.error(
                 "Specify either an input CSV file or both the "
                 "--alleles and --peptides arguments")
-        # split user specified allele and peptide strings in case they
-        # contain multiple entries separated by commas
-        alleles = []
-        for allele_string in args.alleles:
-            alleles.extend([s.strip() for s in allele_string.split(",")])
-        peptides = []
-        for peptide in args.peptides:
-            peptides.extend(peptide.strip() for p in peptide.split(","))
-        for peptide in peptides:
-            if not peptide.isalpha():
-                raise ValueError(
-                    "Unexpected character(s) in peptide '%s'" % peptide)
-        pairs = list(itertools.product(alleles, peptides))
+
+        pairs = list(itertools.product(args.alleles, args.peptides))
         df = pandas.DataFrame({
             "allele": [p[0] for p in pairs],
             "peptide": [p[1] for p in pairs],
@@ -221,15 +234,48 @@ def run(argv=sys.argv[1:]):
             "Predicting for %d alleles and %d peptides = %d predictions" % (
                 len(args.alleles), len(args.peptides), len(df)))
 
-    predictions = predictor.predict_to_dataframe(
-        peptides=df[args.peptide_column].values,
-        alleles=df[args.allele_column].values,
-        include_individual_model_predictions=(
-            args.include_individual_model_predictions),
-        throw=not args.no_throw)
+    allele_string_to_alleles = (
+        df.drop_duplicates(args.allele_column).set_index(
+            args.allele_column, drop=False)[
+                args.allele_column
+        ].str.split(r"[,\s]+")).to_dict()
+
+    if args.affinity_only:
+        predictions = predictor.predict_affinity(
+            peptides=df[args.peptide_column].values,
+            alleles=allele_string_to_alleles,
+            experiment_names=df[args.allele_column],
+            throw=not args.no_throw,
+            include_affinity_percentile=not args.no_affinity_percentile)
+    else:
+        n_flanks = None
+        c_flanks = None
+        if not args.no_flanking:
+            if args.n_flank_column in df.columns and args.c_flank_column in df.columns:
+                n_flanks = df[args.n_flank_column]
+                c_flanks = df[args.c_flank_column]
+            else:
+                logging.warning(
+                    "No flanking information provided. Specify --no-flanking "
+                    "to silence this warning.")
+
+        predictions = predictor.predict_to_dataframe(
+            peptides=df[args.peptide_column].values,
+            n_flanks=n_flanks,
+            c_flanks=c_flanks,
+            alleles=allele_string_to_alleles,
+            experiment_names=df[args.allele_column],
+            throw=not args.no_throw,
+            include_affinity_percentile=not args.no_affinity_percentile)
+
+    # If each query is just for a single allele, the "best_allele" column
+    # is redundant, so we remove it.
+    if not args.always_include_best_allele:
+        if all(len(a) == 1 for a in allele_string_to_alleles.values()):
+            del predictions["best_allele"]
 
     for col in predictions.columns:
-        if col not in ("allele", "peptide"):
+        if col not in ("allele", "peptide", "experiment_name"):
             df[args.prediction_column_prefix + col] = predictions[col]
 
     if args.out:
diff --git a/test/test_predict_command.py b/test/test_predict_command.py
index c3f0a5c1..a4fbdc51 100644
--- a/test/test_predict_command.py
+++ b/test/test_predict_command.py
@@ -63,6 +63,7 @@ def test_no_csv():
     print(result)
     assert_equal(result.shape, (6, 6))
     sub_result1 = result.loc[result.peptide == "SIINFEKL"].set_index("allele")
+    print(sub_result1)
     assert (
-        sub_result1.loc["H-2-Kb"].mhcflurry1_prediction <
-        sub_result1.loc["HLA-A0201"].mhcflurry1_prediction)
+        sub_result1.loc["H-2-Kb"].mhcflurry1_affinity <
+        sub_result1.loc["HLA-A0201"].mhcflurry1_affinity)
-- 
GitLab