fixes

1fcf39fc · Tim O'Donnell · 798336f5 · 1fcf39fc · 1fcf39fc
Commit 1fcf39fc authored 5 years ago by Tim O'Donnell
--- a/downloads-generation/data_mass_spec_benchmark/GENERATE.sh
+++ b/downloads-generation/data_mass_spec_benchmark/GENERATE.sh
 #!/bin/bash
 #
-# GENERATE.sh <local|cluster> <reuse-all|reuse-none|reuse-predictions>
+# This download includes predictions for MHCflurry and NetMHCpan 4.0 over a
+# large number of peptides encompassing almost the full proteome.
+#
+# Usage:
+# GENERATE.sh <local|cluster> <reuse-all|reuse-none|reuse-predictions|reuse-predictions-except-mhcflurry>
+#
+# The first choice listed above for each argument is the default.
+#
+# Meanings for these arguments:
+#
+# FIRST ARGUMENT: where to run
+# local             - run locally using NUM_JOBS cores.
+# cluster           - run on cluster.
+#
+# SECOND ARGUMENT: whether to reuse predictions from existing downloaded data
+# reuse-all         - reuse predictions and peptide / allele lists from existing
+#                     downloaded data_mass_spec_benchmark.
+# reuse-none        - fully self-contained run; do not reuse anything.
+# reuse-predictions - reuse predictions but not peptide or allele lists. Any
+#                     new peptides not already included will be run.
+# reuse-predictions-except-mhcflurry
+#                   - Reuse predictions except for mhcflurry.
 #
 set -e
 set -x
@@ -105,7 +126,7 @@ do
        then
            REUSE_ARG="--reuse-predictions predictions/chr1.mhcflurry.${kind}"
        fi
-        if [ "${2:-reuse-none}" != "reuse-none" ]
+        if [ "${2:-reuse-none}" != "reuse-none" ] && [ "${2:-reuse-none}" != "reuse-predictions-except-mhcflurry" ]
        then
            REUSE_ARG+="--reuse-predictions" "$EXISTING_DATA/$OUT_DIR"
        fi

--- a/downloads-generation/data_mass_spec_benchmark/run_predictors.py
+++ b/downloads-generation/data_mass_spec_benchmark/run_predictors.py
@@ -81,6 +81,11 @@ parser.add_argument(
 add_local_parallelism_args(parser)
 add_cluster_parallelism_args(parser)

+PREDICTOR_TO_COLS = {
+    "mhcflurry": ["affinity"],
+    "netmhcpan4": ["affinity", "percentile_rank", "elution_score"],
+}
+

 def load_results(dirname, result_df=None):
    peptides = pandas.read_csv(
@@ -188,6 +193,7 @@ def run(argv=sys.argv[1:]):

    GLOBAL_DATA["predictor"] = args.predictor
    GLOBAL_DATA["args"] = args
+    GLOBAL_DATA["cols"] = PREDICTOR_TO_COLS[args.predictor]

    # Write peptide and allele lists to out dir.
    out_peptides = os.path.abspath(os.path.join(args.out, "peptides.csv"))
@@ -196,7 +202,7 @@ def run(argv=sys.argv[1:]):

    manifest_df = []
    for allele in alleles:
-        for col in ["affinity", "percentile_rank", "elution_score"]:
+        for col in PREDICTOR_TO_COLS[args.predictor]:
            manifest_df.append((allele, col))
    manifest_df = pandas.DataFrame(
        manifest_df, columns=["allele", "kind"])
@@ -222,7 +228,7 @@ def run(argv=sys.argv[1:]):
                result_df.notnull().values.mean()))

        # We rerun any alleles have nulls for any kind of values
-        # (affinity, percentile rank, elution score).
+        # (e.g. affinity, percentile rank, elution score).
        print("Computing blocks.")
        start = time.time()
        blocks = blocks_of_ones(result_df.isnull().values)
@@ -327,7 +333,8 @@ def run(argv=sys.argv[1:]):
        prediction_time / 60.0))


-def do_predictions_mhctools(work_item_num, peptides, alleles, constant_data=None):
+def do_predictions_mhctools(
+        work_item_num, peptides, alleles, constant_data=None):
    # This may run on the cluster in a way that misses all top level imports,
    # so we have to re-import everything here.
    import time
@@ -345,6 +352,8 @@ def do_predictions_mhctools(work_item_num, peptides, alleles, constant_data=None
    else:
        raise ValueError("Unsupported", predictor_name)

+    cols = constant_data['cols']
+
    start = time.time()
    df = predictor.predict_peptides_dataframe(peptides)
    print("Generated predictions for %d peptides x %d alleles in %0.2f sec." % (
@@ -352,9 +361,8 @@ def do_predictions_mhctools(work_item_num, peptides, alleles, constant_data=None

    results = {}
    for (allele, sub_df) in df.groupby("allele"):
-        for col in ["affinity", "percentile_rank", "elution_score"]:
-            results["%s %s" % (allele, col)] = sub_df[col].values.astype(
-                'float32')
+        for col in cols:
+            results["%s %s" % (allele, col)] = sub_df[col].values.astype('float32')
    return (work_item_num, results)


@@ -371,6 +379,7 @@ def do_predictions_mhcflurry(work_item_num, peptides, alleles, constant_data=Non
    args = constant_data['args']

    assert args.predictor == "mhcflurry"
+    assert constant_data['cols'] == ["affinity"]

    predictor = Class1AffinityPredictor.load(args.mhcflurry_models_dir)