Updates prior to the cleavage -> processing rename

ba4b1cfb · Tim O'Donnell · e6861c3f · ba4b1cfb · ba4b1cfb · ba4b1cfb
Commit ba4b1cfb authored 5 years ago by Tim O'Donnell
--- a/.travis.yml
+++ b/.travis.yml
@@ -46,18 +46,21 @@ script:
      $(mhcflurry-downloads url data_curated)
      $(mhcflurry-downloads url data_mass_spec_annotated)
      $(mhcflurry-downloads url models_class1)
+      $(mhcflurry-downloads url models_class1_presentation)
      $(mhcflurry-downloads url models_class1_pan)
+      $(mhcflurry-downloads url models_class1_pan_variants)
      $(mhcflurry-downloads url allele_sequences)
      -P /tmp/downloads
  - ls -lh /tmp/downloads
  -
    mhcflurry-downloads fetch
-    data_curated
-    data_mass_spec_annotated
-    models_class1
-    models_class1_pan
-    models_class1_pan_variants
-    allele_sequences
-    --already-downloaded-dir /tmp/downloads
+      data_curated
+      data_mass_spec_annotated
+      models_class1
+      models_class1_presentation
+      models_class1_pan
+      models_class1_pan_variants
+      allele_sequences
+      --already-downloaded-dir /tmp/downloads
  - mhcflurry-downloads info  # just to test this command works
  - nosetests --with-timer -sv test
--- a/mhcflurry/class1_affinity_predictor.py
+++ b/mhcflurry/class1_affinity_predictor.py
@@ -892,11 +892,15 @@ class Class1AffinityPredictor(object):
        numpy.array of float
        """
        if allele is not None:
+            normalized_allele = mhcnames.normalize_allele_name(allele)
            try:
-                transform = self.allele_to_percent_rank_transform[allele]
+                transform = self.allele_to_percent_rank_transform[normalized_allele]
                return transform.transform(affinities)
            except KeyError:
-                msg = "Allele %s has no percentile rank information" % allele
+                msg = "Allele %s has no percentile rank information" % (
+                    allele + (
+                        "" if allele == normalized_allele
+                        else " (normalized to %s)" % normalized_allele))
                if throw:
                    raise ValueError(msg)
                warnings.warn(msg)

--- a/mhcflurry/class1_presentation_predictor.py
+++ b/mhcflurry/class1_presentation_predictor.py
@@ -58,9 +58,22 @@ class Class1PresentationPredictor(object):
            dict(metadata_dataframes) if metadata_dataframes else {})
        self._models_cache = {}

-    def get_affinity_predictions(
-            self, peptides, experiment_names, alleles, verbose=1):
+    @property
+    def supported_alleles(self):
+        return self.affinity_predictor.supported_alleles

+    @property
+    def supported_peptide_lengths(self):
+        return self.affinity_predictor.supported_peptide_lengths
+
+    def predict_affinity(
+            self,
+            peptides,
+            experiment_names,
+            alleles,
+            include_affinity_percentile=False,
+            verbose=1,
+            throw=True):
        df = pandas.DataFrame({
            "peptide": numpy.array(peptides, copy=False),
            "experiment_name": numpy.array(experiment_names, copy=False),
@@ -80,17 +93,25 @@ class Class1PresentationPredictor(object):
                predictions_df[allele] = self.affinity_predictor.predict(
                    peptides=experiment_peptides,
                    allele=allele,
-                    model_kwargs={'batch_size': PREDICT_BATCH_SIZE})
+                    model_kwargs={'batch_size': PREDICT_BATCH_SIZE},
+                    throw=throw)
            df.loc[
-                sub_df.index, "tightest_affinity"
+                sub_df.index, "affinity"
            ] = predictions_df.min(1).values
            df.loc[
-                sub_df.index, "tightest_affinity_allele"
+                sub_df.index, "best_allele"
            ] = predictions_df.idxmin(1).values

+            if include_affinity_percentile:
+                df.loc[sub_df.index, "affinity_percentile"] = (
+                    self.affinity_predictor.percentile_ranks(
+                        df.loc[sub_df.index, "affinity"].values,
+                        alleles=df.loc[sub_df.index, "best_allele"].values,
+                        throw=False))
+
        return df

-    def get_cleavage_predictions(
+    def predict_cleavage(
            self, peptides, n_flanks=None, c_flanks=None, verbose=1):

        if verbose > 0:
@@ -128,12 +149,12 @@ class Class1PresentationPredictor(object):
            c_flanks=None,
            verbose=1):

-        df = self.get_affinity_predictions(
+        df = self.predict_affinity(
            peptides=peptides,
            experiment_names=experiment_names,
            alleles=alleles,
            verbose=verbose)
-        df["affinity_score"] = from_ic50(df.tightest_affinity)
+        df["affinity_score"] = from_ic50(df.affinity)
        df["target"] = numpy.array(targets, copy=False)

        if (n_flanks is None) != (c_flanks is None):
@@ -157,7 +178,7 @@ class Class1PresentationPredictor(object):
            if verbose > 0:
                print("Training variant", model_name)

-            df["cleavage_prediction"] = self.get_cleavage_predictions(
+            df["cleavage_prediction"] = self.predict_cleavage(
                peptides=df.peptide.values,
                n_flanks=n_flanks if with_flanks else None,
                c_flanks=c_flanks if with_flanks else None,
@@ -206,7 +227,7 @@ class Class1PresentationPredictor(object):
            experiment_names=experiment_names,
            n_flanks=n_flanks,
            c_flanks=c_flanks,
-            verbose=verbose).score.values
+            verbose=verbose).presentation_score.values

    def predict_to_dataframe(
            self,
@@ -215,7 +236,9 @@ class Class1PresentationPredictor(object):
            experiment_names=None,
            n_flanks=None,
            c_flanks=None,
-            verbose=1):
+            include_affinity_percentile=False,
+            verbose=1,
+            throw=True):

        if isinstance(peptides, string_types):
            raise TypeError("peptides must be a list not a string")
@@ -246,17 +269,19 @@ class Class1PresentationPredictor(object):
                "experiment1": alleles,
            }

-        df = self.get_affinity_predictions(
+        df = self.predict_affinity(
            peptides=peptides,
            experiment_names=experiment_names,
            alleles=alleles,
-            verbose=verbose)
-        df["affinity_score"] = from_ic50(df.tightest_affinity)
+            include_affinity_percentile=include_affinity_percentile,
+            verbose=verbose,
+            throw=throw)
+        df["affinity_score"] = from_ic50(df.affinity)

        if (n_flanks is None) != (c_flanks is None):
            raise ValueError("Specify both or neither of n_flanks, c_flanks")

-        df["cleavage_prediction"] = self.get_cleavage_predictions(
+        df["cleavage_prediction"] = self.predict_cleavage(
            peptides=df.peptide.values,
            n_flanks=n_flanks,
            c_flanks=c_flanks,
@@ -264,7 +289,9 @@ class Class1PresentationPredictor(object):

        model_name = 'with_flanks' if n_flanks is not None else "without_flanks"
        model = self.get_model(model_name)
-        df["score"] = model.predict_proba(df[self.model_inputs].values)[:,1]
+        df["presentation_score"] = model.predict_proba(
+            df[self.model_inputs].values)[:,1]
+        del df["affinity_score"]
        return df

    def save(self, models_dir):

--- a/mhcflurry/downloads.py
+++ b/mhcflurry/downloads.py
@@ -120,7 +120,7 @@ def get_default_class1_presentation_models_dir(test_exists=True):
            raise IOError("No such directory: %s" % result)
        return result
    return get_path(
-        "models_class1_pan_refined", "presentation", test_exists=test_exists)
+        "models_class1_presentation", "models", test_exists=test_exists)


 def get_default_class1_cleavage_models_dir(test_exists=True):

--- a/mhcflurry/predict_command.py
+++ b/mhcflurry/predict_command.py
@@ -26,16 +26,19 @@ from __future__ import (
    division,
    absolute_import,
 )
+
 import sys
 import argparse
 import itertools
 import logging
+import os

 import pandas

 from .common import set_keras_backend
-from .downloads import get_default_class1_models_dir
+from .downloads import get_default_class1_models_dir, get_default_class1_presentation_models_dir
 from .class1_affinity_predictor import Class1AffinityPredictor
+from .class1_presentation_predictor import Class1PresentationPredictor
 from .version import __version__


@@ -79,13 +82,12 @@ input_args.add_argument(
    "--alleles",
    metavar="ALLELE",
    nargs="+",
-    help="Alleles to predict (exclusive with --input)")
+    help="Alleles to predict (exclusive with passing an input CSV)")
 input_args.add_argument(
    "--peptides",
    metavar="PEPTIDE",
    nargs="+",
-    help="Peptides to predict (exclusive with --input)")
-
+    help="Peptides to predict (exclusive with passing an input CSV)")

 input_mod_args = parser.add_argument_group(title="Input options")
 input_mod_args.add_argument(
@@ -98,13 +100,22 @@ input_mod_args.add_argument(
    metavar="NAME",
    default="peptide",
    help="Input column name for peptides. Default: '%(default)s'")
+input_mod_args.add_argument(
+    "--n-flank-column",
+    metavar="NAME",
+    default="n_flank",
+    help="Column giving N-terminal flanking sequence. Default: '%(default)s'")
+input_mod_args.add_argument(
+    "--c-flank-column",
+    metavar="NAME",
+    default="c_flank",
+    help="Column giving C-terminal flanking sequence. Default: '%(default)s'")
 input_mod_args.add_argument(
    "--no-throw",
    action="store_true",
    default=False,
    help="Return NaNs for unsupported alleles or peptides instead of raising")

-
 output_args = parser.add_argument_group(title="Output options")
 output_args.add_argument(
    "--out",
@@ -121,11 +132,16 @@ output_args.add_argument(
    default=",",
    help="Delimiter character for results. Default: '%(default)s'")
 output_args.add_argument(
-    "--include-individual-model-predictions",
+    "--no-affinity-percentile",
+    default=False,
    action="store_true",
+    help="Do not include affinity percentile rank")
+output_args.add_argument(
+    "--always-include-best-allele",
    default=False,
-    help="Include predictions from each model in the ensemble"
-)
+    action="store_true",
+    help="Always include the best_allele column even when it is identical "
+    "to the allele column (i.e. all queries are monoallelic).")

 model_args = parser.add_argument_group(title="Model options")
 model_args.add_argument(
@@ -134,29 +150,26 @@ model_args.add_argument(
    default=None,
    help="Directory containing models. "
    "Default: %s" % get_default_class1_models_dir(test_exists=False))
-
-implementation_args = parser.add_argument_group(title="Implementation options")
-implementation_args.add_argument(
-    "--backend",
-    choices=("tensorflow-gpu", "tensorflow-cpu", "tensorflow-default"),
-    help="Keras backend. If not specified will use system default.")
-implementation_args.add_argument(
-    "--threads",
-    metavar="N",
-    type=int,
-    help="Num threads for tensorflow to use. If unspecified, tensorflow will "
-    "pick a value based on the number of cores.")
-
+model_args.add_argument(
+    "--affinity-only",
+    action="store_true",
+    default=False,
+    help="Affinity prediction only (no cleavage or presentation)")
+model_args.add_argument(
+    "--no-flanking",
+    action="store_true",
+    default=False,
+    help="Do not use flanking sequence information even when available")

 def run(argv=sys.argv[1:]):
+    logging.getLogger('tensorflow').disabled = True
+
    if not argv:
        parser.print_help()
        parser.exit(1)

    args = parser.parse_args(argv)

-    set_keras_backend(backend=args.backend, num_threads=args.threads)
-
    # It's hard to pass a tab in a shell, so we correct a common error:
    if args.output_delimiter == "\\t":
        args.output_delimiter = "\t"
@@ -166,12 +179,24 @@ def run(argv=sys.argv[1:]):
        # The reason we set the default here instead of in the argument parser
        # is that we want to test_exists at this point, so the user gets a
        # message instructing them to download the models if needed.
-        models_dir = get_default_class1_models_dir(test_exists=True)
-    predictor = Class1AffinityPredictor.load(models_dir)
+        models_dir = get_default_class1_presentation_models_dir(test_exists=True)
+
+    if os.path.exists(os.path.join(models_dir, "weights.csv")):
+        # Using a presentation predictor.
+        predictor = Class1PresentationPredictor.load(models_dir)
+    else:
+        # Using just an affinity predictor.
+        affinity_predictor = Class1AffinityPredictor.load(models_dir)
+        predictor = Class1PresentationPredictor(
+            affinity_predictor=affinity_predictor)
+        if not args.affinity_only:
+            logging.warning(
+                "Specified models are an affinity predictor, which implies "
+                "--affinity-only. Specify this argument to silence this warning.")
+            args.affinity_only = True

    # The following two are informative commands that can come 
-    # if a wrapper would like to incorporate input validation 
-    # to not delibaretly make mhcflurry fail
+    # in handy if a wrapper would like to incorporate input validation.
    if args.list_supported_alleles:
        print("\n".join(predictor.supported_alleles))
        return
@@ -180,7 +205,6 @@ def run(argv=sys.argv[1:]):
        min_len, max_len = predictor.supported_peptide_lengths
        print("\n".join([str(l) for l in range(min_len, max_len+1)]))
        return
-    # End of early terminating routines

    if args.input:
        if args.alleles or args.peptides:
@@ -200,19 +224,8 @@ def run(argv=sys.argv[1:]):
            parser.error(
                "Specify either an input CSV file or both the "
                "--alleles and --peptides arguments")
-        # split user specified allele and peptide strings in case they
-        # contain multiple entries separated by commas
-        alleles = []
-        for allele_string in args.alleles:
-            alleles.extend([s.strip() for s in allele_string.split(",")])
-        peptides = []
-        for peptide in args.peptides:
-            peptides.extend(peptide.strip() for p in peptide.split(","))
-        for peptide in peptides:
-            if not peptide.isalpha():
-                raise ValueError(
-                    "Unexpected character(s) in peptide '%s'" % peptide)
-        pairs = list(itertools.product(alleles, peptides))
+
+        pairs = list(itertools.product(args.alleles, args.peptides))
        df = pandas.DataFrame({
            "allele": [p[0] for p in pairs],
            "peptide": [p[1] for p in pairs],
@@ -221,15 +234,48 @@ def run(argv=sys.argv[1:]):
            "Predicting for %d alleles and %d peptides = %d predictions" % (
                len(args.alleles), len(args.peptides), len(df)))

-    predictions = predictor.predict_to_dataframe(
-        peptides=df[args.peptide_column].values,
-        alleles=df[args.allele_column].values,
-        include_individual_model_predictions=(
-            args.include_individual_model_predictions),
-        throw=not args.no_throw)
+    allele_string_to_alleles = (
+        df.drop_duplicates(args.allele_column).set_index(
+            args.allele_column, drop=False)[
+                args.allele_column
+        ].str.split(r"[,\s]+")).to_dict()
+
+    if args.affinity_only:
+        predictions = predictor.predict_affinity(
+            peptides=df[args.peptide_column].values,
+            alleles=allele_string_to_alleles,
+            experiment_names=df[args.allele_column],
+            throw=not args.no_throw,
+            include_affinity_percentile=not args.no_affinity_percentile)
+    else:
+        n_flanks = None
+        c_flanks = None
+        if not args.no_flanking:
+            if args.n_flank_column in df.columns and args.c_flank_column in df.columns:
+                n_flanks = df[args.n_flank_column]
+                c_flanks = df[args.c_flank_column]
+            else:
+                logging.warning(
+                    "No flanking information provided. Specify --no-flanking "
+                    "to silence this warning")
+
+        predictions = predictor.predict_to_dataframe(
+            peptides=df[args.peptide_column].values,
+            n_flanks=n_flanks,
+            c_flanks=c_flanks,
+            alleles=allele_string_to_alleles,
+            experiment_names=df[args.allele_column],
+            throw=not args.no_throw,
+            include_affinity_percentile=not args.no_affinity_percentile)
+
+    # If each query is just for a single allele, the "best_allele" column
+    # is redundant so we remove it.
+    if not args.always_include_best_allele:
+        if all(len(a) == 1 for a in allele_string_to_alleles.values()):
+            del predictions["best_allele"]

    for col in predictions.columns:
-        if col not in ("allele", "peptide"):
+        if col not in ("allele", "peptide", "experiment_name"):
            df[args.prediction_column_prefix + col] = predictions[col]

    if args.out:

--- a/test/test_predict_command.py
+++ b/test/test_predict_command.py
@@ -63,6 +63,7 @@ def test_no_csv():
    print(result)
    assert_equal(result.shape, (6, 6))
    sub_result1 = result.loc[result.peptide == "SIINFEKL"].set_index("allele")
+    print(sub_result1)
    assert (
-        sub_result1.loc["H-2-Kb"].mhcflurry1_prediction <
-        sub_result1.loc["HLA-A0201"].mhcflurry1_prediction)
+        sub_result1.loc["H-2-Kb"].mhcflurry1_affinity <
+        sub_result1.loc["HLA-A0201"].mhcflurry1_affinity)