#!/bin/bash
#
# This download includes predictions for MHCflurry and NetMHCpan 4.0 over a
# large number of peptides encompassing almost the full proteome.
#
# Usage:
# GENERATE.sh <local|cluster> <reuse-all|reuse-none|reuse-predictions|reuse-predictions-except-mhcflurry>
#
# The first choice listed above for each argument is the default.
#
# Meanings for these arguments:
#
# FIRST ARGUMENT: where to run
# local - run locally using NUM_JOBS cores.
# cluster - run on cluster.
#
# SECOND ARGUMENT: whether to reuse predictions from existing downloaded data
# reuse-all - reuse predictions and peptide / allele lists from existing
#     downloaded data_mass_spec_benchmark.
# reuse-none - fully self-contained run; do not reuse anything.
# reuse-predictions - reuse predictions but not peptide or allele lists. Any
#     new peptides not already included will be run.
# reuse-predictions-except-mhcflurry
#     - Reuse predictions except for mhcflurry.
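#
# Example invocations (illustrative, using the defaults documented above):
#   GENERATE.sh                          # local run, reuse-all
#   GENERATE.sh local reuse-none         # fully self-contained local run
#   GENERATE.sh cluster reuse-predictions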
#
set -e
set -x
DOWNLOAD_NAME=data_mass_spec_benchmark
SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
export PYTHONUNBUFFERED=1
mkdir -p "$SCRATCH_DIR"
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive.
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
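# (tee -a appends rather than truncating; -i ignores SIGINT, so an interrupted
# run still leaves the log intact for the archive.)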
# Log some environment info
date
pip freeze
git status
cd $SCRATCH_DIR/$DOWNLOAD_NAME
cp $SCRIPT_DIR/write_proteome_peptides.py .
cp $SCRIPT_DIR/write_allele_list.py .
cp $SCRIPT_DIR/run_predictors.py .
if [ "$1" != "cluster" ]
then
    GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0
    echo "Detected GPUS: $GPUS"

    PROCESSORS=$(getconf _NPROCESSORS_ONLN)
    echo "Detected processors: $PROCESSORS"

    if [ "$GPUS" -eq "0" ]; then
        NUM_JOBS=${NUM_JOBS-1}
    else
        NUM_JOBS=${NUM_JOBS-$GPUS}
    fi
    echo "Num jobs: $NUM_JOBS"
    EXTRA_ARGS+=" --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1"
else
    EXTRA_ARGS+=" --cluster-parallelism --cluster-max-retries 3 --cluster-submit-command bsub --cluster-results-workdir $HOME/mhcflurry-scratch"
fi
PEPTIDES=$(mhcflurry-downloads path data_mass_spec_annotated)/annotated_ms.csv.bz2
REFERENCES_DIR=$(mhcflurry-downloads path data_references)
if [ "${2:-reuse-all}" != "reuse-none" ]
then
    EXISTING_DATA="$(mhcflurry-downloads path $DOWNLOAD_NAME)"
    echo "Will reuse data from $EXISTING_DATA"
else
    EXISTING_DATA=""
    echo "Will NOT reuse any data"
fi
# Write out alleles
if [ "$2" == "reuse-all" ]
then
echo "Reusing allele list"
cp "$EXISTING_DATA/alleles.txt" .
else
echo "Generating allele list"
python write_allele_list.py "$PEPTIDES" --out alleles.txt
fi
# Write out and process peptides, then run the predictors over them.
# First just chr1 peptides, then all peptides.
for subset in chr1 all
do
    if [ "${2:-reuse-all}" == "reuse-all" ]
    then
        echo "Reusing peptide list"
        cp "$EXISTING_DATA/proteome_peptides.$subset.csv.bz2" .
    else
        echo "Generating peptide list"
        SUBSET_ARG=""
        if [ "$subset" == "chr1" ]
        then
            SUBSET_ARG="--chromosome 1"
        fi
        python write_proteome_peptides.py \
            "$PEPTIDES" \
            "${REFERENCES_DIR}/uniprot_proteins.csv.bz2" \
            --out proteome_peptides.$subset.csv $SUBSET_ARG
        bzip2 proteome_peptides.$subset.csv
    fi
    # Run NetMHCpan 4.0 (el = eluted ligand, ba = binding affinity)
    for kind in el ba
    do
        OUT_DIR=predictions/${subset}.netmhcpan4.$kind
        REUSE1=""
        REUSE2=""
        if [ "$subset" == "all" ]
        then
            REUSE1="predictions/chr1.netmhcpan4.$kind"
        fi
        if [ "${2:-reuse-all}" != "reuse-none" ]
        then
            REUSE2="$EXISTING_DATA"/$OUT_DIR
        fi

        python run_predictors.py \
            proteome_peptides.$subset.csv.bz2 \
            --result-dtype "float16" \
            --predictor netmhcpan4-$kind \
            --allele $(cat alleles.txt) \
            --out "$OUT_DIR" \
            --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
            --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.nogpu.lsf \
            --reuse-predictions "$REUSE1" "$REUSE2" $EXTRA_ARGS
    done
    # Run MHCflurry
    for kind in with_mass_spec no_mass_spec
    do
        OUT_DIR=predictions/${subset}.mhcflurry.${kind}
        REUSE1=""
        REUSE2=""
        if [ "$subset" == "all" ]
        then
            REUSE1="predictions/chr1.mhcflurry.${kind}"
        fi
        if [ "${2:-reuse-all}" != "reuse-none" ] && [ "${2:-reuse-all}" != "reuse-predictions-except-mhcflurry" ]
        then
            REUSE2="$EXISTING_DATA"/$OUT_DIR
        fi

        python run_predictors.py \
            proteome_peptides.${subset}.csv.bz2 \
            --predictor mhcflurry \
            --chunk-size 500000 \
            --mhcflurry-batch-size 65536 \
            --mhcflurry-models-dir "$(mhcflurry-downloads path models_class1_pan)/models.$kind" \
            --allele $(cat alleles.txt) \
            --out "$OUT_DIR" \
            --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
            --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.gpu.lsf \
            --reuse-predictions "$REUSE1" "$REUSE2" $EXTRA_ARGS
    done
done
cp $SCRIPT_ABSOLUTE_PATH .
bzip2 LOG.txt
RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
tar -cjf "$RESULT" *
echo "Created archive: $RESULT"
# Split into <2GB chunks for GitHub
PARTS="${RESULT}.part."
# Check for pre-existing part files and rename them.
for i in $(ls "${PARTS}"* )
do
    DEST="${i}.OLD.$(date +%s)"
    echo "WARNING: already exists: $i . Moving to $DEST"
    mv $i $DEST
done

split -b 2000M "$RESULT" "$PARTS"
echo "Split into parts:"
ls -lh "${PARTS}"*
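# The parts can be reassembled later with plain cat (standard `split` output),
# e.g.:
#   cat data_mass_spec_benchmark.YYYYMMDD.tar.bz2.part.* > data_mass_spec_benchmark.YYYYMMDD.tar.bz2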