diff --git a/downloads-generation/data_evaluation/GENERATE.WITH_HPC_CLUSTER.sh b/downloads-generation/data_evaluation/GENERATE.WITH_HPC_CLUSTER.sh
new file mode 100755
index 0000000000000000000000000000000000000000..53125eb7bec329ecbbd0d230b8afe809c1064204
--- /dev/null
+++ b/downloads-generation/data_evaluation/GENERATE.WITH_HPC_CLUSTER.sh
@@ -0,0 +1 @@
+bash GENERATE.sh cluster
diff --git a/downloads-generation/data_evaluation/GENERATE.sh b/downloads-generation/data_evaluation/GENERATE.sh
new file mode 100755
index 0000000000000000000000000000000000000000..8023c90fc4ec4ed6a629a6d9685c522c51565e1d
--- /dev/null
+++ b/downloads-generation/data_evaluation/GENERATE.sh
@@ -0,0 +1,175 @@
+#!/bin/bash
+#
+#
+# Usage: GENERATE.sh <local|cluster> <fresh|continue-incomplete>
+#
+# Cluster mode uses an HPC cluster (the Mount Sinai chimera cluster, which uses the
+# LSF job scheduler). This would need to be modified for other sites.
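+#
+# Examples:
+#   bash GENERATE.sh cluster                    # as run by GENERATE.WITH_HPC_CLUSTER.sh
+#   bash GENERATE.sh local continue-incomplete  # resume a partially completed local run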
+#
+set -e
+set -x
+
+DOWNLOAD_NAME=data_evaluation
+SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
+SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
+SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
+
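+# Local mode: size the worker pool from the detected GPUs/CPUs.
+# Cluster mode: submit tasks through LSF (bsub) using the site-specific submit script header.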
+if [ "$1" != "cluster" ]
+then
+    GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0
+    echo "Detected GPUS: $GPUS"
+
+    PROCESSORS=$(getconf _NPROCESSORS_ONLN)
+    echo "Detected processors: $PROCESSORS"
+
+    if [ "$GPUS" -eq "0" ]; then
+        NUM_JOBS=${NUM_JOBS-1}
+    else
+        NUM_JOBS=${NUM_JOBS-$GPUS}
+    fi
+    echo "Num jobs: $NUM_JOBS"
+    PARALLELISM_ARGS+=" --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1"
+else
+    PARALLELISM_ARGS+=" --cluster-parallelism --cluster-max-retries 3 --cluster-submit-command bsub --cluster-results-workdir $HOME/mhcflurry-scratch --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.lsf"
+fi
+
+mkdir -p "$SCRATCH_DIR"
+if [ "$2" != "continue-incomplete" ]
+then
+    echo "Fresh run"
+    rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
+    mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
+else
+    echo "Continuing incomplete run"
+fi
+
+# Send stdout and stderr to a logfile included with the archive.
+LOG="$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.$(date +%s).txt"
+exec >  >(tee -ia "$LOG")
+exec 2> >(tee -ia "$LOG" >&2)
+
+# Log some environment info
+echo "Invocation: $0 $@"
+date
+pip freeze
+git status
+mhcflurry-downloads info
+
+cd $SCRATCH_DIR/$DOWNLOAD_NAME
+
+export OMP_NUM_THREADS=1
+export PYTHONUNBUFFERED=1
+
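+### ANNOTATE HITS WITH EXPRESSION
+# Attach RNA-seq expression (TPM) to the annotated mass spec hits; the benchmark
+# steps below read the resulting hits_with_tpm.csv.bz2.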
+if [ "$2" == "continue-incomplete" ] && [ -f "hits_with_tpm.csv.bz2" ]
+then
+    echo "Reusing existing expression-annotated hits data"
+else
+    cp $SCRIPT_DIR/annotate_hits_with_expression.py .
+    time python annotate_hits_with_expression.py \
+        --hits "$(mhcflurry-downloads path data_mass_spec_annotated)/annotated_ms.csv.bz2" \
+        --expression "$(mhcflurry-downloads path data_curated)/rna_expression.csv.bz2" \
+        --out "$(pwd)/hits_with_tpm.csv"
+    bzip2 -f hits_with_tpm.csv
+fi
+
+### GENERATE BENCHMARK: MONOALLELIC
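+# Hits from monoallelic experiments plus 99 proteome decoys sampled per hit.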
+if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.monoallelic.csv.bz2" ]
+then
+    echo "Reusing existing monoallelic benchmark"
+else
+    cp $SCRIPT_DIR/make_benchmark.py .
+    time python make_benchmark.py \
+        --hits "$(pwd)/hits_with_tpm.csv.bz2" \
+        --proteome-peptides "$(mhcflurry-downloads path data_mass_spec_benchmark)/proteome_peptides.all.csv.bz2" \
+        --decoys-per-hit 99 \
+        --only-format MONOALLELIC \
+        --out "$(pwd)/benchmark.monoallelic.csv"
+    bzip2 -f benchmark.monoallelic.csv
+    rm -f benchmark.monoallelic.predictions.csv.bz2
+fi
+
+### AFFINITY PREDICTOR VARIANT: MONOALLELIC
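+# Score the monoallelic benchmark with the models.no_additional_ms variant of the
+# pan-allele affinity predictor.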
+if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.monoallelic.predictions.csv.bz2" ]
+then
+    echo "Reusing existing monoallelic benchmark predictions"
+else
+    time mhcflurry-predict \
+        benchmark.monoallelic.csv.bz2 \
+        --allele-column hla \
+        --models "$(mhcflurry-downloads path models_class1_pan_variants)/models.no_additional_ms" \
+        --out benchmark.monoallelic.predictions.csv
+    bzip2 -f benchmark.monoallelic.predictions.csv
+    ls -lh benchmark.monoallelic.predictions.csv.bz2
+fi
+
+### GENERATE BENCHMARK: MULTIALLELIC
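+# Hits from the three multiallelic studies selected by PMID, plus 99 proteome decoys per hit.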
+if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.multiallelic.csv.bz2" ]
+then
+    echo "Reusing existing multiallelic benchmark"
+else
+    cp $SCRIPT_DIR/make_benchmark.py .
+    time python make_benchmark.py \
+        --hits "$(pwd)/hits_with_tpm.csv.bz2" \
+        --proteome-peptides "$(mhcflurry-downloads path data_mass_spec_benchmark)/proteome_peptides.all.csv.bz2" \
+        --decoys-per-hit 99 \
+        --only-pmid 31844290 31495665 31154438 \
+        --only-format MULTIALLELIC \
+        --out "$(pwd)/benchmark.multiallelic.csv"
+    bzip2 -f benchmark.multiallelic.csv
+    rm -f benchmark.multiallelic.predictions1.csv.bz2
+fi
+
+### AFFINITY PREDICTORS: MULTIALLELIC
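+# Score the multiallelic benchmark with the combined pan-allele predictor
+# (models.combined) and all pan-allele variants.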
+if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.multiallelic.predictions1.csv.bz2" ]
+then
+    echo "Reusing existing multiallelic predictions"
+else
+    cp $SCRIPT_DIR/predict.py .
+    time python predict.py \
+        benchmark.multiallelic.csv.bz2 \
+        --models \
+            "$(mhcflurry-downloads path models_class1_pan)/models.combined" \
+            "$(mhcflurry-downloads path models_class1_pan_variants)/models.*" \
+        --out "$(pwd)/benchmark.multiallelic.predictions1.csv"
+    bzip2 -f benchmark.multiallelic.predictions1.csv
+    rm -f benchmark.multiallelic.predictions2.csv.bz2
+fi
+
+### PRESENTATION: WITH FLANKS
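+# Presentation predictions using flanking sequences (presentation_with_flanks column).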
+if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.multiallelic.predictions2.csv.bz2" ]
+then
+    echo "Reusing existing multiallelic predictions2"
+else
+    mhcflurry-predict-presentation \
+        "$(pwd)/benchmark.multiallelic.predictions1.csv.bz2" \
+        --out "$(pwd)/benchmark.multiallelic.predictions2.csv" \
+        --models "$(mhcflurry-downloads path models_class1_presentation)/models" \
+        --include-details \
+        --prediction-col presentation_with_flanks
+    bzip2 -f benchmark.multiallelic.predictions2.csv
+    rm -f benchmark.multiallelic.predictions3.csv.bz2
+fi
+
+### PRESENTATION: NO FLANKS
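+# Presentation predictions ignoring flanking sequences (presentation_without_flanks column).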
+if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.multiallelic.predictions3.csv.bz2" ]
+then
+    echo "Reusing existing multiallelic predictions3"
+else
+    mhcflurry-predict-presentation \
+        "$(pwd)/benchmark.multiallelic.predictions2.csv.bz2" \
+        --out "$(pwd)/benchmark.multiallelic.predictions3.csv" \
+        --models "$(mhcflurry-downloads path models_class1_presentation)/models" \
+        --include-details \
+        --prediction-col presentation_without_flanks \
+        --no-flanks
+    bzip2 -f benchmark.multiallelic.predictions3.csv
+fi
+
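+# Bundle this script, the logs, and all outputs into the final archive.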
+cp $SCRIPT_ABSOLUTE_PATH .
+bzip2 -f "$LOG"
+for i in $(ls LOG-worker.*.txt) ; do bzip2 -f $i ; done
+RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
+tar -cjf "$RESULT" *
+echo "Created archive: $RESULT"
diff --git a/downloads-generation/data_evaluation/annotate_hits_with_expression.py b/downloads-generation/data_evaluation/annotate_hits_with_expression.py
new file mode 120000
index 0000000000000000000000000000000000000000..7c6bd5c63d2feb8a237e40812b31b8fcefbf6017
--- /dev/null
+++ b/downloads-generation/data_evaluation/annotate_hits_with_expression.py
@@ -0,0 +1 @@
+../models_class1_presentation/annotate_hits_with_expression.py
\ No newline at end of file
diff --git a/downloads-generation/data_evaluation/make_benchmark.py b/downloads-generation/data_evaluation/make_benchmark.py
new file mode 120000
index 0000000000000000000000000000000000000000..bc530cf623121f27e4b6e3085d66dcb373406e63
--- /dev/null
+++ b/downloads-generation/data_evaluation/make_benchmark.py
@@ -0,0 +1 @@
+../models_class1_presentation/make_benchmark.py
\ No newline at end of file
diff --git a/downloads-generation/data_evaluation/predict.py b/downloads-generation/data_evaluation/predict.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391