Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#!/bin/bash
#
#
# Usage: GENERATE.sh <local|cluster> <fresh|continue-incomplete>
#
# cluster mode uses an HPC cluster (Mount Sinai chimera cluster, which uses lsf job
# scheduler). This would need to be modified for other sites.
#
set -e
set -x
DOWNLOAD_NAME=data_evaluation
SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
mkdir -p "$SCRATCH_DIR"
if [ "$2" != "continue-incomplete" ]
then
echo "Fresh run"
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
else
echo "Continuing incomplete run"
fi
# Send stdout and stderr to a logfile included with the archive.
LOG="$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.$(date +%s).txt"
exec > >(tee -ia "$LOG")
exec 2> >(tee -ia "$LOG" >&2)
# Log some environment info
echo "Invocation: $0 $@"
date
pip freeze
git status
mhcflurry-downloads info
cd $SCRATCH_DIR/$DOWNLOAD_NAME
export OMP_NUM_THREADS=1
export PYTHONUNBUFFERED=1
if [ "$2" == "continue-incomplete" ] && [ -f "hits_with_tpm.csv.bz2" ]
then
echo "Reusing existing expression-annotated hits data"
else
cp $SCRIPT_DIR/annotate_hits_with_expression.py .
time python annotate_hits_with_expression.py \
--hits "$(mhcflurry-downloads path data_mass_spec_annotated)/annotated_ms.csv.bz2" \
--expression "$(mhcflurry-downloads path data_curated)/rna_expression.csv.bz2" \
--out "$(pwd)/hits_with_tpm.csv"
bzip2 -f hits_with_tpm.csv
fi
### GENERATE BENCHMARK: MONOALLELIC
Tim O'Donnell
committed
for kind in train_excluded all
do
EXCLUDE_TRAIN_DATA=""
if [ "$kind" == "train_excluded" ]
then
EXCLUDE_TRAIN_DATA="$(mhcflurry-downloads path models_class1_pan_variants)/models.no_additional_ms/train_data.csv.bz2"
fi
if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.monoallelic.$kind.csv.bz2" ]
then
echo "Reusing existing monoallelic benchmark: benchmark.monoallelic.$kind.csv.bz2"
else
cp $SCRIPT_DIR/make_benchmark.py .
time python make_benchmark.py \
--hits "$(pwd)/hits_with_tpm.csv.bz2" \
--proteome-peptides "$(mhcflurry-downloads path data_mass_spec_benchmark)/proteome_peptides.all.csv.bz2" \
--decoys-per-hit 110 \
--exclude-train-data "$EXCLUDE_TRAIN_DATA" \
--only-format MONOALLELIC \
--out "$(pwd)/benchmark.monoallelic.$kind.csv"
bzip2 -f benchmark.monoallelic.$kind.csv
fi
done
### GENERATE BENCHMARK: MULTIALLELIC
Tim O'Donnell
committed
for kind in train_excluded all
do
EXCLUDE_TRAIN_DATA=""
if [ "$kind" == "train_excluded" ]
then
EXCLUDE_TRAIN_DATA="$(mhcflurry-downloads path models_class1_pan)/models.combined/train_data.csv.bz2"
fi
if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.multiallelic.$kind.csv.bz2" ]
then
echo "Reusing existing multiallelic benchmark"
else
cp $SCRIPT_DIR/make_benchmark.py .
time python make_benchmark.py \
--hits "$(pwd)/hits_with_tpm.csv.bz2" \
--proteome-peptides "$(mhcflurry-downloads path data_mass_spec_benchmark)/proteome_peptides.all.csv.bz2" \
--decoys-per-hit 110 \
--exclude-train-data "$EXCLUDE_TRAIN_DATA" \
--only-format MULTIALLELIC \
--out "$(pwd)/benchmark.multiallelic.$kind.csv"
bzip2 -f benchmark.multiallelic.$kind.csv
fi
done
Tim O'Donnell
committed
for kind in train_excluded all
Tim O'Donnell
committed
### AFFINITY PREDICTOR VARIANT: MONOALLELIC
if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.monoallelic.no_additional_ms.$kind.csv.bz2" ]
Tim O'Donnell
committed
echo "Reusing existing monoallelic benchmark predictions"
Tim O'Donnell
committed
"$(pwd)/benchmark.monoallelic.$kind.csv.bz2" \
Tim O'Donnell
committed
--prediction-column-prefix no_additional_ms_ \
--models \""$(mhcflurry-downloads path models_class1_pan_variants)/models.no_additional_ms"\" \
Tim O'Donnell
committed
--out "$(pwd)/benchmark.monoallelic.no_additional_ms.$kind.csv" \
--no-throw >> commands/monoallelic.$kind.sh
echo bzip2 -f "$(pwd)/benchmark.monoallelic.no_additional_ms.$kind.csv" >> commands/monoallelic.$kind.sh
Tim O'Donnell
committed
### AFFINITY PREDICTORS: MULTIALLELIC
if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.multiallelic.production.$kind.csv.bz2" ]
then
echo "Reusing existing multiallelic predictions"
else
echo time mhcflurry-predict \
"$(pwd)/benchmark.multiallelic.$kind.csv.bz2" \
--allele-column hla \
--prediction-column-prefix mhcflurry_production_ \
--models \""$(mhcflurry-downloads path models_class1_pan)/models.combined"\" \
--affinity-only \
--no-affinity-percentile \
--out "$(pwd)/benchmark.multiallelic.production.$kind.csv" >> commands/multiallelic.production.$kind.sh
echo bzip2 -f "$(pwd)/benchmark.multiallelic.production.$kind.csv" >> commands/multiallelic.production.$kind.sh
fi
for variant in no_additional_ms compact_peptide affinity_only no_pretrain single_hidden_no_pretrain 500nm
Tim O'Donnell
committed
do
if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.multiallelic.${variant}.$kind.csv.bz2" ]
then
echo "Reusing existing multiallelic predictions: ${variant}"
else
echo time mhcflurry-predict \
"$(pwd)/benchmark.multiallelic.$kind.csv.bz2" \
--allele-column hla \
--prediction-column-prefix "${variant}_" \
--models \""$(mhcflurry-downloads path models_class1_pan_variants)/models.$variant"\" \
--affinity-only \
--no-affinity-percentile \
--out "$(pwd)/benchmark.multiallelic.${variant}.$kind.csv" >> commands/multiallelic.${variant}.$kind.sh
echo bzip2 -f "$(pwd)/benchmark.multiallelic.${variant}.$kind.csv" >> commands/multiallelic.${variant}.$kind.sh
fi
done
Tim O'Donnell
committed
### PRESENTATION: WITH FLANKS
if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.multiallelic.presentation_with_flanks.$kind.csv.bz2" ]
Tim O'Donnell
committed
echo "Reusing existing multiallelic presentation with flanks"
Tim O'Donnell
committed
echo time mhcflurry-predict \
"$(pwd)/benchmark.multiallelic.$kind.csv.bz2" \
--allele-column hla \
--prediction-column-prefix presentation_with_flanks_ \
--models \""$(mhcflurry-downloads path models_class1_presentation)/models"\" \
--no-affinity-percentile \
--out "$(pwd)/benchmark.multiallelic.presentation_with_flanks.$kind.csv" >> commands/multiallelic.presentation_with_flanks.$kind.sh
echo bzip2 -f "$(pwd)/benchmark.multiallelic.presentation_with_flanks.$kind.csv" >> commands/multiallelic.presentation_with_flanks.$kind.sh
Tim O'Donnell
committed
### PRESENTATION: NO FLANKS
if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.multiallelic.presentation_without_flanks.$kind.csv.bz2" ]
Tim O'Donnell
committed
echo "Reusing existing multiallelic presentation without flanks"
Tim O'Donnell
committed
echo time mhcflurry-predict \
"$(pwd)/benchmark.multiallelic.$kind.csv.bz2" \
--allele-column hla \
--prediction-column-prefix presentation_without_flanks_ \
--models \""$(mhcflurry-downloads path models_class1_presentation)/models"\" \
--no-affinity-percentile \
--no-flanking \
--out "$(pwd)/benchmark.multiallelic.presentation_without_flanks.$kind.csv" >> commands/multiallelic.presentation_without_flanks.$kind.sh
echo bzip2 -f "$(pwd)/benchmark.multiallelic.presentation_without_flanks.$kind.csv" >> commands/multiallelic.presentation_without_flanks.$kind.sh
Tim O'Donnell
committed
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
### PRECOMPUTED ####
for variant in netmhcpan4.ba netmhcpan4.el mixmhcpred
do
if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.monoallelic.${variant}.$kind.csv.bz2" ]
then
echo "Reusing existing monoallelic ${variant}"
else
cp $SCRIPT_DIR/join_with_precomputed.py .
echo time python join_with_precomputed.py \
\""$(pwd)/benchmark.monoallelic.$kind.csv.bz2"\" \
${variant} \
--out "$(pwd)/benchmark.monoallelic.${variant}.$kind.csv" >> commands/monoallelic.${variant}.$kind.sh
echo bzip2 -f "$(pwd)/benchmark.monoallelic.${variant}.$kind.csv" >> commands/monoallelic.${variant}.$kind.sh
fi
if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.multiallelic.${variant}.$kind.csv.bz2" ]
then
echo "Reusing existing multiallelic ${variant}"
else
cp $SCRIPT_DIR/join_with_precomputed.py .
echo time python join_with_precomputed.py \
\""$(pwd)/benchmark.multiallelic.$kind.csv.bz2"\" \
${variant} \
--out "$(pwd)/benchmark.multiallelic.${variant}.$kind.csv" >> commands/multiallelic.${variant}.$kind.sh
echo bzip2 -f "$(pwd)/benchmark.multiallelic.${variant}.$kind.csv" >> commands/multiallelic.${variant}.$kind.sh
fi
done
ls -lh commands
if [ "$1" != "cluster" ]
then
echo "Running locally"
for i in $(ls commands/*.sh)
do
echo "# *******"
echo "# Command $i"
cat $i
bash $i
done
else
echo "Running on cluster"
for i in $(ls commands/*.sh)
do
echo "# *******"
echo "# Command $i"
cat $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.lsf > ${i}.lsf
echo cd "$(pwd)" >> ${i}.lsf
cat $i >> ${i}.lsf
cat ${i}.lsf
cp $SCRIPT_ABSOLUTE_PATH .
bzip2 -f "$LOG"
for i in $(ls LOG-worker.*.txt) ; do bzip2 -f $i ; done
RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
tar -cjf "$RESULT" *
echo "Created archive: $RESULT"