From 339e7caf02402aeb7bf583fee1c139003b511ff8 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Sun, 26 Jan 2020 22:20:10 -0500
Subject: [PATCH] update

---
 .../data_evaluation/GENERATE.sh               |  18 ++-
 .../data_evaluation/join_with_precomputed.py  | 128 ++++++++++++++++++
 2 files changed, 145 insertions(+), 1 deletion(-)
 create mode 100644 downloads-generation/data_evaluation/join_with_precomputed.py

diff --git a/downloads-generation/data_evaluation/GENERATE.sh b/downloads-generation/data_evaluation/GENERATE.sh
index 4f1cb778..a004431b 100755
--- a/downloads-generation/data_evaluation/GENERATE.sh
+++ b/downloads-generation/data_evaluation/GENERATE.sh
@@ -40,6 +40,7 @@ cd $SCRATCH_DIR/$DOWNLOAD_NAME
 
 export OMP_NUM_THREADS=1
 export PYTHONUNBUFFERED=1
+export MHCFLURRY_DEFAULT_PREDICT_BATCH_SIZE=16384
 
 if [ "$2" == "continue-incomplete" ] && [ -f "hits_with_tpm.csv.bz2" ]
 then
@@ -67,7 +68,6 @@ else
         --only-format MONOALLELIC \
         --out "$(pwd)/benchmark.monoallelic.csv"
     bzip2 -f benchmark.monoallelic.csv
-    rm -f benchmark.monoallelic.predictions.csv.bz2
 fi
 
 ### GENERATE BENCHMARK: MULTIALLELIC
@@ -174,6 +174,22 @@ else
     echo bzip2 -f "$(pwd)/benchmark.multiallelic.presentation_without_flanks.csv" >> commands/multiallelic.presentation_without_flanks.sh
 fi
 
+### PRECOMPUTED ####
+for variant in netmhcpan4.ba netmhcpan4.el mixmhcpred
+do
+    if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.multiallelic.${variant}.csv.bz2" ]
+    then
+        echo "Reusing existing multiallelic ${variant}"
+    else
+        echo time python join_with_precomputed.py \
+            \""$(pwd)/benchmark.multiallelic.csv.bz2"\" \
+            \""$(mhcflurry-downloads path data_mass_spec_benchmark)/predictions/all.${variant}"\" \
+            ${variant} \
+            --out "$(pwd)/benchmark.multiallelic.${variant}.csv" >> commands/multiallelic.${variant}.sh
+        echo bzip2 -f "$(pwd)/benchmark.multiallelic.${variant}.csv" >> commands/multiallelic.${variant}.sh
+    fi
+done
+
 ls -lh commands
 
 if [ "$1" != "cluster" ]
diff --git a/downloads-generation/data_evaluation/join_with_precomputed.py b/downloads-generation/data_evaluation/join_with_precomputed.py
new file mode 100644
index 00000000..12ba4d92
--- /dev/null
+++ b/downloads-generation/data_evaluation/join_with_precomputed.py
@@ -0,0 +1,128 @@
+"""
+Join benchmark with precomputed predictions.
+"""
+import sys
+import argparse
+import os
+import numpy
+import collections
+
+import pandas
+import tqdm
+
+import mhcflurry
+from mhcflurry.downloads import get_path
+
+parser = argparse.ArgumentParser(usage=__doc__)
+
+parser.add_argument(
+    "benchmark")
+parser.add_argument(
+    "precomputed_data")
+parser.add_argument(
+    "predictors",
+    nargs="+",
+    choices=("netmhcpan4.ba", "netmhcpan4.el", "mixmhcpred"))
+parser.add_argument(
+    "--out",
+    metavar="CSV",
+    required=True,
+    help="File to write")
+
+
+def load_results(dirname, result_df=None, columns=None):
+    peptides = pandas.read_csv(os.path.join(dirname, "peptides.csv")).peptide
+    manifest_df = pandas.read_csv(os.path.join(dirname, "alleles.csv"))
+
+    print("Loading results. Existing data has", len(peptides), "peptides and",
+        len(manifest_df), "columns")
+
+    if columns is None:
+        columns = manifest_df.col.values
+
+    if result_df is None:
+        result_df = pandas.DataFrame(index=peptides, columns=columns,
+            dtype="float32")
+        result_df[:] = numpy.nan
+        peptides_to_assign = peptides
+        mask = None
+    else:
+        mask = (peptides.isin(result_df.index)).values
+        peptides_to_assign = peptides[mask]
+
+    manifest_df = manifest_df.loc[manifest_df.col.isin(result_df.columns)]
+
+    print("Will load", len(peptides), "peptides and", len(manifest_df), "cols")
+
+    for _, row in tqdm.tqdm(manifest_df.iterrows(), total=len(manifest_df)):
+        with open(os.path.join(dirname, row.path), "rb") as fd:
+            value = numpy.load(fd)['arr_0']
+            if mask is not None:
+                value = value[mask]
+            result_df.loc[peptides_to_assign, row.col] = value
+
+    return result_df
+
+
+def run():
+    args = parser.parse_args(sys.argv[1:])
+    df = pandas.read_csv(args.benchmark)
+
+    df["alleles"] = df.hla.str.split()
+
+    peptides = df.peptide.unique()
+    alleles = set()
+    for some in df.hla.unique():
+        alleles.update(some.split())
+
+    predictions_dfs = {}
+
+    if 'netmhcpan4.ba' in args.predictors:
+        predictions_dfs['netmhcpan4.ba'] = load_results(
+            get_path("data_mass_spec_benchmark", "predictions/all.netmhcpan4.ba"),
+            result_df=pandas.DataFrame(
+                index=peptides,
+                columns=["%s affinity" % a for a in alleles])).rename(
+                    columns=lambda s: s.replace("affinity", "").strip())
+        predictions_dfs['netmhcpan4.ba'] *= -1
+
+    if 'netmhcpan4.el' in args.predictors:
+        predictions_dfs['netmhcpan4.el'] = load_results(
+            get_path("data_mass_spec_benchmark", "predictions/all.netmhcpan4.el"),
+            result_df=pandas.DataFrame(
+                index=peptides,
+                columns=["%s score" % a for a in alleles])).rename(
+                    columns=lambda s: s.replace("score", "").strip())
+
+    if 'mixmhcpred' in args.predictors:
+        predictions_dfs['mixmhcpred'] = load_results(
+            get_path("data_mass_spec_benchmark", "predictions/all.mixmhcpred"),
+            result_df=pandas.DataFrame(
+                index=peptides,
+                columns=["%s score" % a for a in alleles])).rename(
+                    columns=lambda s: s.replace("score", "").strip())
+
+    skip_experiments = set()
+
+    for hla_text, sub_df in tqdm.tqdm(df.groupby("hla"), total=df.hla.nunique()):
+        hla = hla_text.split()
+        for (name, precomputed_df) in predictions_dfs.items():
+            df.loc[sub_df.index, name] = numpy.nan
+            prediction_df = pandas.DataFrame(index=sub_df.peptide)
+            for allele in hla:
+                if allele not in precomputed_df.columns or precomputed_df[allele].isnull().all():
+                    print(sub_df.sample_id.unique(), hla)
+                    skip_experiments.update(sub_df.sample_id.unique())
+                prediction_df[allele] = precomputed_df.loc[sub_df.peptide, allele].values
+            df.loc[sub_df.index, name] = prediction_df.max(1, skipna=False).values
+            df.loc[sub_df.index, name + " allele"] = prediction_df.idxmax(1, skipna=False).values
+
+    print("Skip experiments", skip_experiments)
+    print("results")
+    print(df)
+
+    df.to_csv(args.out)
+    print("Wrote", args.out)
+
+if __name__ == '__main__':
+    run()
--
GitLab
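Note (not part of the patch): load_results() above assumes each precomputed predictions directory (e.g. predictions/all.netmhcpan4.el under the data_mass_spec_benchmark download) contains a peptides.csv with a peptide column, an alleles.csv manifest with col and path columns, and one compressed numpy file per manifest row whose 'arr_0' array is aligned with peptides.csv. The following is a minimal, illustrative sketch for inspecting such a directory under that assumed layout; the inspect_precomputed.py name and inspect() helper are hypothetical and are not introduced by this commit.

# inspect_precomputed.py -- illustrative sketch only, not part of this patch.
# Assumes the directory layout read by load_results() in
# join_with_precomputed.py: peptides.csv, an alleles.csv manifest with
# "col" and "path" columns, and one .npz file per manifest row whose
# 'arr_0' array has one value per peptide (NaN where no prediction exists).
import os
import sys

import numpy
import pandas


def inspect(dirname, max_cols=5):
    peptides = pandas.read_csv(os.path.join(dirname, "peptides.csv")).peptide
    manifest_df = pandas.read_csv(os.path.join(dirname, "alleles.csv"))
    print("Peptides:", len(peptides), "Columns:", len(manifest_df))
    for _, row in manifest_df.head(max_cols).iterrows():
        with open(os.path.join(dirname, row.path), "rb") as fd:
            values = numpy.load(fd)["arr_0"]
        print(
            row.col,
            "length:", len(values),
            "fraction missing:", float(numpy.isnan(values).mean()))


if __name__ == "__main__":
    # Example: python inspect_precomputed.py \
    #     "$(mhcflurry-downloads path data_mass_spec_benchmark)/predictions/all.netmhcpan4.el"
    inspect(sys.argv[1])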