From c20680131e3352ae0869cbf96474fc9a8687e138 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Wed, 5 Feb 2020 18:22:00 -0500
Subject: [PATCH] modify data_evaluation to run both with and without training
 data excluded. Also update presentation download

---
 .../data_evaluation/GENERATE.sh               | 258 ++++++++++--------
 .../make_benchmark.py                         |   4 +-
 mhcflurry/downloads.yml                       |   2 +-
 3 files changed, 143 insertions(+), 121 deletions(-)

diff --git a/downloads-generation/data_evaluation/GENERATE.sh b/downloads-generation/data_evaluation/GENERATE.sh
index 31f97353..b39d70c2 100755
--- a/downloads-generation/data_evaluation/GENERATE.sh
+++ b/downloads-generation/data_evaluation/GENERATE.sh
@@ -55,150 +55,170 @@ else
 fi
 
 ### GENERATE BENCHMARK: MONOALLELIC
-if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.monoallelic.csv.bz2" ]
-then
-    echo "Reusing existing monoallelic benchmark"
-else
-    cp $SCRIPT_DIR/make_benchmark.py .
-    time python make_benchmark.py \
-        --hits "$(pwd)/hits_with_tpm.csv.bz2" \
-        --proteome-peptides "$(mhcflurry-downloads path data_mass_spec_benchmark)/proteome_peptides.all.csv.bz2" \
-        --decoys-per-hit 110 \
-        --exclude-train-data "$(mhcflurry-downloads path models_class1_pan_variants)/models.no_additional_ms/train_data.csv.bz2" \
-        --only-format MONOALLELIC \
-        --out "$(pwd)/benchmark.monoallelic.csv"
-    bzip2 -f benchmark.monoallelic.csv
-fi
+for kind in train_excluded all
+do
+    EXCLUDE_TRAIN_DATA=""
+    if [ "$kind" == "train_excluded" ]
+    then
+        EXCLUDE_TRAIN_DATA="$(mhcflurry-downloads path models_class1_pan_variants)/models.no_additional_ms/train_data.csv.bz2"
+    fi
+
+    if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.monoallelic.$kind.csv.bz2" ]
+    then
+        echo "Reusing existing monoallelic benchmark: benchmark.monoallelic.$kind.csv.bz2"
+    else
+        cp $SCRIPT_DIR/make_benchmark.py .
+        time python make_benchmark.py \
+            --hits "$(pwd)/hits_with_tpm.csv.bz2" \
+            --proteome-peptides "$(mhcflurry-downloads path data_mass_spec_benchmark)/proteome_peptides.all.csv.bz2" \
+            --decoys-per-hit 110 \
+            --exclude-train-data "$EXCLUDE_TRAIN_DATA" \
+            --only-format MONOALLELIC \
+            --out "$(pwd)/benchmark.monoallelic.$kind.csv"
+        bzip2 -f benchmark.monoallelic.$kind.csv
+    fi
+done
 
 ### GENERATE BENCHMARK: MULTIALLELIC
-if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.multiallelic.csv.bz2" ]
-then
-    echo "Reusing existing multiallelic benchmark"
-else
-    cp $SCRIPT_DIR/make_benchmark.py .
-    time python make_benchmark.py \
-        --hits "$(pwd)/hits_with_tpm.csv.bz2" \
-        --proteome-peptides "$(mhcflurry-downloads path data_mass_spec_benchmark)/proteome_peptides.all.csv.bz2" \
-        --exclude-train-data "$(mhcflurry-downloads path models_class1_pan)/models.combined/train_data.csv.bz2" \
-        --decoys-per-hit 110 \
-        --only-format MULTIALLELIC \
-        --out "$(pwd)/benchmark.multiallelic.csv"
-    bzip2 -f benchmark.multiallelic.csv
-fi
+for kind in train_excluded all  # one benchmark with training data excluded, one without any exclusion
+do
+    EXCLUDE_TRAIN_DATA=""  # stays empty for kind=all; make_benchmark.py skips empty --exclude-train-data entries
+    if [ "$kind" == "train_excluded" ]
+    then
+        EXCLUDE_TRAIN_DATA="$(mhcflurry-downloads path models_class1_pan)/models.combined/train_data.csv.bz2"
+    fi
+
+    if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.multiallelic.$kind.csv.bz2" ]
+    then
+        echo "Reusing existing multiallelic benchmark: benchmark.multiallelic.$kind.csv.bz2"
+    else
+        cp $SCRIPT_DIR/make_benchmark.py .
+        time python make_benchmark.py \
+            --hits "$(pwd)/hits_with_tpm.csv.bz2" \
+            --proteome-peptides "$(mhcflurry-downloads path data_mass_spec_benchmark)/proteome_peptides.all.csv.bz2" \
+            --decoys-per-hit 110 \
+            --exclude-train-data "$EXCLUDE_TRAIN_DATA" \
+            --only-format MULTIALLELIC \
+            --out "$(pwd)/benchmark.multiallelic.$kind.csv"
+        bzip2 -f benchmark.multiallelic.$kind.csv
+    fi
+done
 
 rm -rf commands
 mkdir commands
 
-### AFFINITY PREDICTOR VARIANT: MONOALLELIC
-if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.monoallelic.predictions.csv.bz2" ]
-then
-    echo "Reusing existing monoallelic benchmark predictions"
-else
-    echo time mhcflurry-predict \
-        "$(pwd)/benchmark.monoallelic.csv.bz2" \
-        --allele-column hla \
-        --prediction-column-prefix no_additional_ms_ \
-        --models \""$(mhcflurry-downloads path models_class1_pan_variants)/models.no_additional_ms"\" \
-        --affinity-only \
-        --no-affinity-percentile \
-        --out "$(pwd)/benchmark.monoallelic.no_additional_ms.csv" \
-        --no-throw >> commands/monoallelic.sh
-    echo bzip2 -f "$(pwd)/benchmark.monoallelic.predictions.csv" >> commands/monoallelic.sh
-fi
-
-
-### AFFINITY PREDICTORS: MULTIALLELIC
-if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.multiallelic.production.csv.bz2" ]
-then
-    echo "Reusing existing multiallelic predictions"
-else
-    echo time mhcflurry-predict \
-        "$(pwd)/benchmark.multiallelic.csv.bz2" \
-        --allele-column hla \
-        --prediction-column-prefix mhcflurry_production_ \
-        --models \""$(mhcflurry-downloads path models_class1_pan)/models.combined"\" \
-        --affinity-only \
-        --no-affinity-percentile \
-        --out "$(pwd)/benchmark.multiallelic.production.csv" >> commands/multiallelic.production.sh
-    echo bzip2 -f "$(pwd)/benchmark.multiallelic.production.csv" >> commands/multiallelic.production.sh
-fi
-
-for variant in no_additional_ms compact_peptide affinity_only no_pretrain single_hidden_no_pretrain
+for kind in train_excluded all
 do
-    if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.multiallelic.${variant}.csv.bz2" ]
+    ### AFFINITY PREDICTOR VARIANT: MONOALLELIC (writes benchmark.monoallelic.no_additional_ms.$kind.csv.bz2)
+    if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.monoallelic.no_additional_ms.$kind.csv.bz2" ]
     then
-        echo "Reusing existing multiallelic predictions: ${variant}"
+        echo "Reusing existing monoallelic benchmark predictions: benchmark.monoallelic.no_additional_ms.$kind.csv.bz2"
     else
         echo time mhcflurry-predict \
-            "$(pwd)/benchmark.multiallelic.csv.bz2" \
+            "$(pwd)/benchmark.monoallelic.$kind.csv.bz2" \
             --allele-column hla \
-            --prediction-column-prefix "${variant}_" \
-            --models \""$(mhcflurry-downloads path models_class1_pan_variants)/models.$variant"\" \
+            --prediction-column-prefix no_additional_ms_ \
+            --models \""$(mhcflurry-downloads path models_class1_pan_variants)/models.no_additional_ms"\" \
             --affinity-only \
             --no-affinity-percentile \
-            --out "$(pwd)/benchmark.multiallelic.${variant}.csv" >> commands/multiallelic.${variant}.sh
-        echo bzip2 -f "$(pwd)/benchmark.multiallelic.${variant}.csv" >> commands/multiallelic.${variant}.sh
+            --out "$(pwd)/benchmark.monoallelic.no_additional_ms.$kind.csv" \
+            --no-throw >> commands/monoallelic.$kind.sh
+        echo bzip2 -f "$(pwd)/benchmark.monoallelic.no_additional_ms.$kind.csv" >> commands/monoallelic.$kind.sh
     fi
-done
 
 
-### PRESENTATION: WITH FLANKS
-if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.multiallelic.presentation_with_flanks.csv.bz2" ]
-then
-    echo "Reusing existing multiallelic presentation with flanks"
-else
-    echo time mhcflurry-predict \
-        "$(pwd)/benchmark.multiallelic.csv.bz2" \
-        --allele-column hla \
-        --prediction-column-prefix presentation_with_flanks_ \
-        --models \""$(mhcflurry-downloads path models_class1_presentation)/models"\" \
-        --no-affinity-percentile \
-        --out "$(pwd)/benchmark.multiallelic.presentation_with_flanks.csv" >> commands/multiallelic.presentation_with_flanks.sh
-    echo bzip2 -f "$(pwd)/benchmark.multiallelic.presentation_with_flanks.csv"  >> commands/multiallelic.presentation_with_flanks.sh
-fi
+    ### AFFINITY PREDICTORS: MULTIALLELIC
+    if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.multiallelic.production.$kind.csv.bz2" ]
+    then
+        echo "Reusing existing multiallelic predictions"
+    else
+        echo time mhcflurry-predict \
+            "$(pwd)/benchmark.multiallelic.$kind.csv.bz2" \
+            --allele-column hla \
+            --prediction-column-prefix mhcflurry_production_ \
+            --models \""$(mhcflurry-downloads path models_class1_pan)/models.combined"\" \
+            --affinity-only \
+            --no-affinity-percentile \
+            --out "$(pwd)/benchmark.multiallelic.production.$kind.csv" >> commands/multiallelic.production.$kind.sh
+        echo bzip2 -f "$(pwd)/benchmark.multiallelic.production.$kind.csv" >> commands/multiallelic.production.$kind.sh
+    fi
 
-### PRESENTATION: NO FLANKS
-if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.multiallelic.presentation_without_flanks.csv.bz2" ]
-then
-    echo "Reusing existing multiallelic presentation without flanks"
-else
-    echo time mhcflurry-predict \
-        "$(pwd)/benchmark.multiallelic.csv.bz2" \
-        --allele-column hla \
-        --prediction-column-prefix presentation_without_flanks_ \
-        --models \""$(mhcflurry-downloads path models_class1_presentation)/models"\" \
-        --no-affinity-percentile \
-        --no-flanking \
-        --out "$(pwd)/benchmark.multiallelic.presentation_without_flanks.csv" >> commands/multiallelic.presentation_without_flanks.sh
-    echo bzip2 -f "$(pwd)/benchmark.multiallelic.presentation_without_flanks.csv"  >> commands/multiallelic.presentation_without_flanks.sh
-fi
+    for variant in no_additional_ms compact_peptide affinity_only no_pretrain single_hidden_no_pretrain
+    do
+        if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.multiallelic.${variant}.$kind.csv.bz2" ]
+        then
+            echo "Reusing existing multiallelic predictions: ${variant}"
+        else
+            echo time mhcflurry-predict \
+                "$(pwd)/benchmark.multiallelic.$kind.csv.bz2" \
+                --allele-column hla \
+                --prediction-column-prefix "${variant}_" \
+                --models \""$(mhcflurry-downloads path models_class1_pan_variants)/models.$variant"\" \
+                --affinity-only \
+                --no-affinity-percentile \
+                --out "$(pwd)/benchmark.multiallelic.${variant}.$kind.csv" >> commands/multiallelic.${variant}.$kind.sh
+            echo bzip2 -f "$(pwd)/benchmark.multiallelic.${variant}.$kind.csv" >> commands/multiallelic.${variant}.$kind.sh
+        fi
+    done
 
-### PRECOMPUTED ####
-for variant in netmhcpan4.ba netmhcpan4.el mixmhcpred
-do
-    if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.monoallelic.${variant}.csv.bz2" ]
+    ### PRESENTATION: WITH FLANKS
+    if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.multiallelic.presentation_with_flanks.$kind.csv.bz2" ]
     then
-        echo "Reusing existing monoallelic ${variant}"
+        echo "Reusing existing multiallelic presentation with flanks"
     else
-        cp $SCRIPT_DIR/join_with_precomputed.py .
-        echo time python join_with_precomputed.py \
-            \""$(pwd)/benchmark.monoallelic.csv.bz2"\" \
-            ${variant} \
-            --out "$(pwd)/benchmark.monoallelic.${variant}.csv" >> commands/monoallelic.${variant}.sh
-        echo bzip2 -f "$(pwd)/benchmark.monoallelic.${variant}.csv"  >> commands/monoallelic.${variant}.sh
+        echo time mhcflurry-predict \
+            "$(pwd)/benchmark.multiallelic.$kind.csv.bz2" \
+            --allele-column hla \
+            --prediction-column-prefix presentation_with_flanks_ \
+            --models \""$(mhcflurry-downloads path models_class1_presentation)/models"\" \
+            --no-affinity-percentile \
+            --out "$(pwd)/benchmark.multiallelic.presentation_with_flanks.$kind.csv" >> commands/multiallelic.presentation_with_flanks.$kind.sh
+        echo bzip2 -f "$(pwd)/benchmark.multiallelic.presentation_with_flanks.$kind.csv"  >> commands/multiallelic.presentation_with_flanks.$kind.sh
     fi
 
-    if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.multiallelic.${variant}.csv.bz2" ]
+    ### PRESENTATION: NO FLANKS
+    if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.multiallelic.presentation_without_flanks.$kind.csv.bz2" ]
     then
-        echo "Reusing existing multiallelic ${variant}"
+        echo "Reusing existing multiallelic presentation without flanks"
     else
-        cp $SCRIPT_DIR/join_with_precomputed.py .
-        echo time python join_with_precomputed.py \
-            \""$(pwd)/benchmark.multiallelic.csv.bz2"\" \
-            ${variant} \
-            --out "$(pwd)/benchmark.multiallelic.${variant}.csv" >> commands/multiallelic.${variant}.sh
-        echo bzip2 -f "$(pwd)/benchmark.multiallelic.${variant}.csv"  >> commands/multiallelic.${variant}.sh
+        echo time mhcflurry-predict \
+            "$(pwd)/benchmark.multiallelic.$kind.csv.bz2" \
+            --allele-column hla \
+            --prediction-column-prefix presentation_without_flanks_ \
+            --models \""$(mhcflurry-downloads path models_class1_presentation)/models"\" \
+            --no-affinity-percentile \
+            --no-flanking \
+            --out "$(pwd)/benchmark.multiallelic.presentation_without_flanks.$kind.csv" >> commands/multiallelic.presentation_without_flanks.$kind.sh
+        echo bzip2 -f "$(pwd)/benchmark.multiallelic.presentation_without_flanks.$kind.csv"  >> commands/multiallelic.presentation_without_flanks.$kind.sh
     fi
+
+    ### PRECOMPUTED ####
+    for variant in netmhcpan4.ba netmhcpan4.el mixmhcpred
+    do
+        if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.monoallelic.${variant}.$kind.csv.bz2" ]
+        then
+            echo "Reusing existing monoallelic ${variant}"
+        else
+            cp $SCRIPT_DIR/join_with_precomputed.py .
+            echo time python join_with_precomputed.py \
+                \""$(pwd)/benchmark.monoallelic.$kind.csv.bz2"\" \
+                ${variant} \
+                --out "$(pwd)/benchmark.monoallelic.${variant}.$kind.csv" >> commands/monoallelic.${variant}.$kind.sh
+            echo bzip2 -f "$(pwd)/benchmark.monoallelic.${variant}.$kind.csv"  >> commands/monoallelic.${variant}.$kind.sh
+        fi
+
+        if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.multiallelic.${variant}.$kind.csv.bz2" ]
+        then
+            echo "Reusing existing multiallelic ${variant}"
+        else
+            cp $SCRIPT_DIR/join_with_precomputed.py .
+            echo time python join_with_precomputed.py \
+                \""$(pwd)/benchmark.multiallelic.$kind.csv.bz2"\" \
+                ${variant} \
+                --out "$(pwd)/benchmark.multiallelic.${variant}.$kind.csv" >> commands/multiallelic.${variant}.$kind.sh
+            echo bzip2 -f "$(pwd)/benchmark.multiallelic.${variant}.$kind.csv"  >> commands/multiallelic.${variant}.$kind.sh
+        fi
+    done
 done
 
 ls -lh commands
diff --git a/downloads-generation/models_class1_presentation/make_benchmark.py b/downloads-generation/models_class1_presentation/make_benchmark.py
index 7db98598..80c8cd9a 100644
--- a/downloads-generation/models_class1_presentation/make_benchmark.py
+++ b/downloads-generation/models_class1_presentation/make_benchmark.py
@@ -37,7 +37,7 @@ parser.add_argument(
     help="Exclude given PMID")
 parser.add_argument(
     "--only-pmid",
-    nargs="+",
+    nargs="*",
     default=[],
     help="Include only the given PMID")
 parser.add_argument(
@@ -101,6 +101,8 @@ def run():
 
     allele_to_excluded_peptides = collections.defaultdict(set)
     for train_dataset in args.exclude_train_data:
+        if not train_dataset:
+            continue
         print("Excluding hits from", train_dataset)
         train_df = pandas.read_csv(train_dataset)
         for (allele, peptides) in train_df.groupby("allele").peptide.unique().iteritems():
diff --git a/mhcflurry/downloads.yml b/mhcflurry/downloads.yml
index 576259d5..fe7cc5f4 100644
--- a/mhcflurry/downloads.yml
+++ b/mhcflurry/downloads.yml
@@ -25,7 +25,7 @@ releases:
               default: false
 
             - name: models_class1_presentation
-              url: https://github.com/openvax/mhcflurry/releases/download/1.6.0/models_class1_presentation.20200125.tar.bz2
+              url: https://github.com/openvax/mhcflurry/releases/download/1.6.0/models_class1_presentation.20200205.tar.bz2
               default: true
 
             - name: models_class1_processing
-- 
GitLab