From 1fcf39fc73702cd4a66b063ccf2282de5a3d8a65 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Thu, 3 Oct 2019 16:57:48 -0400
Subject: [PATCH] Document GENERATE.sh usage and limit prediction columns per predictor

---
 .../data_mass_spec_benchmark/GENERATE.sh      | 25 +++++++++++++++++--
 .../run_predictors.py                         | 21 +++++++++++-----
 2 files changed, 38 insertions(+), 8 deletions(-)

diff --git a/downloads-generation/data_mass_spec_benchmark/GENERATE.sh b/downloads-generation/data_mass_spec_benchmark/GENERATE.sh
index bab80ffe..d67bb086 100755
--- a/downloads-generation/data_mass_spec_benchmark/GENERATE.sh
+++ b/downloads-generation/data_mass_spec_benchmark/GENERATE.sh
@@ -1,6 +1,27 @@
 #!/bin/bash
 #
-# GENERATE.sh <local|cluster> <reuse-all|reuse-none|reuse-predictions>
+# This download includes predictions for MHCflurry and NetMHCpan 4.0 over a
+# large number of peptides encompassing almost the full proteome.
+#
+# Usage:
+# GENERATE.sh <local|cluster> <reuse-all|reuse-none|reuse-predictions|reuse-predictions-except-mhcflurry>
+#
+# The first choice listed above for each argument is the default.
+#
+# Meanings for these arguments:
+#
+# FIRST ARGUMENT: where to run
+# local             - run locally using NUM_JOBS cores.
+# cluster           - run on cluster.
+#
+# SECOND ARGUMENT: whether to reuse predictions from existing downloaded data
+# reuse-all         - reuse predictions and peptide / allele lists from existing
+#                     downloaded data_mass_spec_benchmark.
+# reuse-none        - fully self-contained run; do not reuse anything.
+# reuse-predictions - reuse predictions but not peptide or allele lists.
+#                     Predictions are computed for any peptides not already covered.
+# reuse-predictions-except-mhcflurry
+#                   - reuse predictions except for mhcflurry, which is recomputed.
 #
 set -e
 set -x
@@ -105,7 +126,7 @@ do
         then
             REUSE_ARG="--reuse-predictions predictions/chr1.mhcflurry.${kind}"
         fi
-        if [ "${2:-reuse-none}" != "reuse-none" ]
+        if [ "${2:-reuse-none}" != "reuse-none" ] && [ "${2:-reuse-none}" != "reuse-predictions-except-mhcflurry" ]
         then
             REUSE_ARG+="--reuse-predictions" "$EXISTING_DATA/$OUT_DIR"
         fi
diff --git a/downloads-generation/data_mass_spec_benchmark/run_predictors.py b/downloads-generation/data_mass_spec_benchmark/run_predictors.py
index a86ce0da..ab1d6d1a 100644
--- a/downloads-generation/data_mass_spec_benchmark/run_predictors.py
+++ b/downloads-generation/data_mass_spec_benchmark/run_predictors.py
@@ -81,6 +81,11 @@ parser.add_argument(
 add_local_parallelism_args(parser)
 add_cluster_parallelism_args(parser)
 
+PREDICTOR_TO_COLS = {
+    "mhcflurry": ["affinity"],
+    "netmhcpan4": ["affinity", "percentile_rank", "elution_score"],
+}
+
 
 def load_results(dirname, result_df=None):
     peptides = pandas.read_csv(
@@ -188,6 +193,7 @@ def run(argv=sys.argv[1:]):
 
     GLOBAL_DATA["predictor"] = args.predictor
     GLOBAL_DATA["args"] = args
+    GLOBAL_DATA["cols"] = PREDICTOR_TO_COLS[args.predictor]
 
     # Write peptide and allele lists to out dir.
     out_peptides = os.path.abspath(os.path.join(args.out, "peptides.csv"))
@@ -196,7 +202,7 @@ def run(argv=sys.argv[1:]):
 
     manifest_df = []
     for allele in alleles:
-        for col in ["affinity", "percentile_rank", "elution_score"]:
+        for col in PREDICTOR_TO_COLS[args.predictor]:
             manifest_df.append((allele, col))
     manifest_df = pandas.DataFrame(
         manifest_df, columns=["allele", "kind"])
@@ -222,7 +228,7 @@ def run(argv=sys.argv[1:]):
                 result_df.notnull().values.mean()))
 
         # We rerun any alleles have nulls for any kind of values
-        # (affinity, percentile rank, elution score).
+        # (e.g. affinity, percentile rank, elution score).
         print("Computing blocks.")
         start = time.time()
         blocks = blocks_of_ones(result_df.isnull().values)
@@ -327,7 +333,8 @@ def run(argv=sys.argv[1:]):
         prediction_time / 60.0))
 
 
-def do_predictions_mhctools(work_item_num, peptides, alleles, constant_data=None):
+def do_predictions_mhctools(
+        work_item_num, peptides, alleles, constant_data=None):
     # This may run on the cluster in a way that misses all top level imports,
     # so we have to re-import everything here.
     import time
@@ -345,6 +352,8 @@ def do_predictions_mhctools(work_item_num, peptides, alleles, constant_data=None
     else:
         raise ValueError("Unsupported", predictor_name)
 
+    cols = constant_data['cols']
+
     start = time.time()
     df = predictor.predict_peptides_dataframe(peptides)
     print("Generated predictions for %d peptides x %d alleles in %0.2f sec." % (
@@ -352,9 +361,8 @@ def do_predictions_mhctools(work_item_num, peptides, alleles, constant_data=None
 
     results = {}
     for (allele, sub_df) in df.groupby("allele"):
-        for col in ["affinity", "percentile_rank", "elution_score"]:
-            results["%s %s" % (allele, col)] = sub_df[col].values.astype(
-                'float32')
+        for col in cols:
+            results["%s %s" % (allele, col)] = sub_df[col].values.astype('float32')
     return (work_item_num, results)
 
 
@@ -371,6 +379,7 @@ def do_predictions_mhcflurry(work_item_num, peptides, alleles, constant_data=Non
     args = constant_data['args']
 
     assert args.predictor == "mhcflurry"
+    assert constant_data['cols'] == ["affinity"]
 
     predictor = Class1AffinityPredictor.load(args.mhcflurry_models_dir)
 
-- 
GitLab