From fb4bb8a2d1d07c1427a06965d31e47872ea3a76d Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Fri, 3 Jan 2020 15:41:48 -0500
Subject: [PATCH] Add mixmhcpred to mass spec benchmark

---
 .../data_mass_spec_benchmark/GENERATE.sh      | 47 +++++++++++--------
 .../run_predictors.py                         |  8 +++-
 2 files changed, 34 insertions(+), 21 deletions(-)

diff --git a/downloads-generation/data_mass_spec_benchmark/GENERATE.sh b/downloads-generation/data_mass_spec_benchmark/GENERATE.sh
index 5710ee0c..21d18639 100755
--- a/downloads-generation/data_mass_spec_benchmark/GENERATE.sh
+++ b/downloads-generation/data_mass_spec_benchmark/GENERATE.sh
@@ -96,9 +96,7 @@ else
 fi
 
 # Write out and process peptides.
-# First just chr1 peptides, then all peptides.
-# TODO: switch this back
-for subset in chr1 all
+for subset in all
 do
     if [ "$2" == "reuse-all" ]
     then
@@ -118,19 +116,33 @@ do
         bzip2 proteome_peptides.$subset.csv
     fi
 
+    # Run mixmhcpred
+    OUT_DIR=predictions/${subset}.mixmhcpred
+    REUSE=""
+    if [ "${2:-reuse-none}" != "reuse-none" ]
+    then
+        REUSE="$EXISTING_DATA"/$OUT_DIR
+    fi
+
+    python run_predictors.py \
+        proteome_peptides.$subset.csv.bz2 \
+        --result-dtype "float16" \
+        --predictor mixmhcpred \
+        --chunk-size 500000 \
+        --allele $(cat alleles.txt) \
+        --out "$OUT_DIR" \
+        --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
+        --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.nogpu.lsf \
+        --reuse-predictions "$REUSE" $EXTRA_ARGS
+
     # Run netmhcpan4
     for kind in el ba
     do
         OUT_DIR=predictions/${subset}.netmhcpan4.$kind
-        REUSE1=""
-        REUSE2=""
-        if [ "$subset" == "all" ]
-        then
-            REUSE1="predictions/chr1.netmhcpan4.$kind"
-        fi
+        REUSE=""
         if [ "${2:-reuse-none}" != "reuse-none" ]
         then
-            REUSE2="$EXISTING_DATA"/$OUT_DIR
+            REUSE="$EXISTING_DATA"/$OUT_DIR
         fi
 
         python run_predictors.py \
@@ -138,11 +150,11 @@ do
             --result-dtype "float16" \
             --predictor netmhcpan4-$kind \
             --chunk-size 1000 \
-            --allele $(cat alleles.txt | grep -v '31:0102') \
+            --allele $(cat alleles.txt) \
             --out "$OUT_DIR" \
             --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
             --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.nogpu.lsf \
-            --reuse-predictions "$REUSE1" "$REUSE2" $EXTRA_ARGS
+            --reuse-predictions "$REUSE" $EXTRA_ARGS
     done
 
 
@@ -150,15 +162,10 @@ do
     for kind in combined
     do
         OUT_DIR=predictions/${subset}.mhcflurry.${kind}
-        REUSE1=""
-        REUSE2=""
-        if [ "$subset" == "all" ]
-        then
-            REUSE1="predictions/chr1.mhcflurry.${kind}"
-        fi
+        REUSE=""
         if [ "${2:-reuse-none}" != "reuse-none" ] && [ "${2:-reuse-none}" != "reuse-predictions-except-mhcflurry" ]
         then
-            REUSE2="$EXISTING_DATA"/$OUT_DIR
+            REUSE="$EXISTING_DATA"/$OUT_DIR
         fi
 
         python run_predictors.py \
@@ -172,7 +179,7 @@ do
             --out "$OUT_DIR" \
             --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
             --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.gpu.lsf \
-            --reuse-predictions "$REUSE1" "$REUSE2" $EXTRA_ARGS
+            --reuse-predictions "$REUSE" $EXTRA_ARGS
     done
 done
 
diff --git a/downloads-generation/data_mass_spec_benchmark/run_predictors.py b/downloads-generation/data_mass_spec_benchmark/run_predictors.py
index 482ed8ab..4b934501 100644
--- a/downloads-generation/data_mass_spec_benchmark/run_predictors.py
+++ b/downloads-generation/data_mass_spec_benchmark/run_predictors.py
@@ -43,7 +43,7 @@ parser.add_argument(
 parser.add_argument(
     "--predictor",
     required=True,
-    choices=("mhcflurry", "netmhcpan4-ba", "netmhcpan4-el"))
+    choices=("mhcflurry", "netmhcpan4-ba", "netmhcpan4-el", "mixmhcpred"))
 parser.add_argument(
     "--mhcflurry-models-dir",
     metavar="DIR",
@@ -90,6 +90,7 @@ PREDICTOR_TO_COLS = {
     "mhcflurry": ["affinity"],
     "netmhcpan4-ba": ["affinity", "percentile_rank"],
     "netmhcpan4-el": ["elution_score"],
+    "mixmhcpred": ["elution_score"],
 }
 
 
@@ -392,6 +393,11 @@ def do_predictions_mhctools(work_item_dicts, constant_data=None):
                 alleles=alleles,
                 program_name="netMHCpan-4.0",
                 mode="elution_score")
+        elif predictor_name == "mixmhcpred":
+            # MixMHCpred is a separate tool from netMHCpan: do not pass the
+            # netMHCpan program name or `mode`; rely on the wrapper defaults.
+            predictor = mhctools.MixMHCpred(
+                alleles=alleles)
         else:
             raise ValueError("Unsupported", predictor_name)
 
-- 
GitLab