Skip to content
Snippets Groups Projects
Commit 526b93ee authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

fixes

parent c9b683e9
No related merge requests found
#!/bin/bash
#
# Generate the data_mass_spec_benchmark download: run MHC binding predictors
# over proteome peptides and archive the results.
#
set -e
set -x
# Name of this download and a scratch working area (honors $TMPDIR).
DOWNLOAD_NAME=data_mass_spec_benchmark
SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
# Absolute path to this script so it can be archived alongside the output.
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
export PYTHONUNBUFFERED=1
# Start from a clean per-download scratch directory.
mkdir -p "$SCRATCH_DIR"
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive.
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
# Log some environment info
date
pip freeze
git status
# Work inside the scratch directory; copy in the helper scripts so the
# archive records exactly what was run.
# (Quoted all path expansions: unquoted $SCRATCH_DIR/$SCRIPT_DIR would break
# on whitespace in TMPDIR or the checkout path.)
cd "$SCRATCH_DIR/$DOWNLOAD_NAME"
cp "$SCRIPT_DIR/write_proteome_peptides.py" .
cp "$SCRIPT_DIR/run_mhcflurry.py" .
cp "$SCRIPT_DIR/run_thirdparty_predictors.py" .
cp "$SCRIPT_DIR/write_allele_list.py" .
# Inputs: annotated mass-spec peptides and the reference proteome download.
PEPTIDES=$(mhcflurry-downloads path data_mass_spec_annotated)/annotated_ms.csv.bz2
REFERENCES_DIR=$(mhcflurry-downloads path data_references)
python write_allele_list.py "$PEPTIDES" --out alleles.txt
mkdir predictions
# First just chr1 peptides
# (--chromosome 1 restricts to chromosome 1 for a quick, smaller first pass).
python write_proteome_peptides.py \
"$PEPTIDES" \
"${REFERENCES_DIR}/uniprot_proteins.csv.bz2" \
--chromosome 1 \
--out proteome_peptides.chr1.csv
# MHCflurry predictions over chr1 are disabled here; kept for reference.
#for kind in with_mass_spec no_mass_spec
#do
# python run_mhcflurry.py \
# proteome_peptides.chr1.csv \
# --chunk-size 100000 \
# --batch-size 65536 \
# --models-dir "$(mhcflurry-downloads path models_class1_pan)/models.$kind" \
# --allele $(cat alleles.txt) \
# --out "predictions/chr1.mhcflurry.$kind" \
# --verbosity 1 \
# --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
# --cluster-parallelism \
# --cluster-max-retries 15 \
# --cluster-submit-command bsub \
# --cluster-results-workdir ~/mhcflurry-scratch \
# --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.lsf
#done
# Run netmhcpan4 on the chr1 peptides via the cluster (LSF/bsub), using the
# no-GPU submission header.
python run_thirdparty_predictors.py \
proteome_peptides.chr1.csv \
--predictor netmhcpan4 \
--chunk-size 10000 \
--allele $(cat alleles.txt) \
--out "predictions/chr1.netmhcpan4" \
--worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
--cluster-parallelism \
--cluster-max-retries 3 \
--cluster-submit-command bsub \
--cluster-results-workdir ~/mhcflurry-scratch \
--cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.nogpu.lsf
# Now all peptides
python write_proteome_peptides.py \
"$PEPTIDES" \
"${REFERENCES_DIR}/uniprot_proteins.csv.bz2" \
--out proteome_peptides.all.csv
# MHCflurry predictions over all peptides are disabled here; kept for reference.
#for kind in with_mass_spec no_mass_spec
#do
# python run_mhcflurry.py \
# proteome_peptides.all.csv \
# --chunk-size 500000 \
# --batch-size 65536 \
# --models-dir "$(mhcflurry-downloads path models_class1_pan)/models.$kind" \
# --allele $(cat alleles.txt) \
# --out "predictions/all.mhcflurry.$kind" \
# --verbosity 1 \
# --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
# --cluster-parallelism \
# --cluster-max-retries 15 \
# --cluster-submit-command bsub \
# --cluster-results-workdir ~/mhcflurry-scratch \
# --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.lsf
#done
# Run netmhcpan4 on all peptides via the cluster.
# BUG FIX: the cluster script prefix path was given as a bare relative name,
# but the script has cd'd into the scratch dir where that file does not exist;
# prefix with $SCRIPT_DIR, matching the chr1 invocation above.
python run_thirdparty_predictors.py \
proteome_peptides.all.csv \
--predictor netmhcpan4 \
--chunk-size 10000 \
--allele $(cat alleles.txt) \
--out "predictions/all.netmhcpan4" \
--worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
--cluster-parallelism \
--cluster-max-retries 3 \
--cluster-submit-command bsub \
--cluster-results-workdir ~/mhcflurry-scratch \
--cluster-script-prefix-path "$SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.nogpu.lsf"
# Compress outputs and bundle everything in the scratch dir into the archive.
bzip2 proteome_peptides.chr1.csv
bzip2 proteome_peptides.all.csv
# Include this script itself for provenance (quoted: path may contain spaces).
cp "$SCRIPT_ABSOLUTE_PATH" .
# NOTE(review): the tee processes are still appending to LOG.txt; this
# captures the log only up to this point — confirm that is intended.
bzip2 LOG.txt
RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
tar -cjf "$RESULT" *
echo "Created archive: $RESULT"
# Example invocation: bash GENERATE.sh cluster reuse-predictions
#!/bin/bash
#
# GENERATE.sh <local|cluster> <reuse-all|reuse-none|reuse-predictions>
#
# Generate the data_mass_spec_benchmark download, either locally or via an
# LSF cluster, optionally reusing artifacts from a previous run.
set -e
set -x
# Name of this download and a scratch working area (honors $TMPDIR).
DOWNLOAD_NAME=data_mass_spec_benchmark
SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
# Absolute path to this script so it can be archived alongside the output.
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
export PYTHONUNBUFFERED=1
# Start from a clean per-download scratch directory.
mkdir -p "$SCRATCH_DIR"
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive.
# (The scrape interleaved the removed commented-out copies of these exec
# lines with the live ones; only the live pair is kept.)
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
# Log some environment info
date
pip freeze
git status
# Work inside the scratch directory; copy in the helper scripts so the
# archive records exactly what was run.
# (The removed "cp run_mhcflurry.py" line was diff residue: this version of
# the script invokes run_predictors.py only.)
cd "$SCRATCH_DIR/$DOWNLOAD_NAME"
cp "$SCRIPT_DIR/write_proteome_peptides.py" .
cp "$SCRIPT_DIR/write_allele_list.py" .
cp "$SCRIPT_DIR/run_predictors.py" .
# Detect GPUs (0 when nvidia-smi is absent or fails).
GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0
echo "Detected GPUS: $GPUS"
# Build EXTRA_ARGS for run_predictors.py: local multiprocessing settings for
# a local run, cluster (LSF/bsub) settings otherwise.
# (The scrape interleaved the removed pre-refactor lines with the new ones,
# leaving an unbalanced if/fi; this is the reconstructed coherent version.)
if [ "$1" != "cluster" ]
then
    PROCESSORS=$(getconf _NPROCESSORS_ONLN)
    echo "Detected processors: $PROCESSORS"
    # One job per GPU; fall back to a single job on CPU-only machines.
    # NUM_JOBS may be overridden from the environment.
    if [ "$GPUS" -eq "0" ]; then
        NUM_JOBS=${NUM_JOBS-1}
    else
        NUM_JOBS=${NUM_JOBS-$GPUS}
    fi
    echo "Num jobs: $NUM_JOBS"
    EXTRA_ARGS+=" --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1"
else
    EXTRA_ARGS+=" --cluster-parallelism --cluster-max-retries 3 --cluster-submit-command bsub --cluster-results-workdir ~/mhcflurry-scratch"
fi
# Inputs: annotated mass-spec peptides and the reference proteome download.
PEPTIDES=$(mhcflurry-downloads path data_mass_spec_annotated)/annotated_ms.csv.bz2
REFERENCES_DIR=$(mhcflurry-downloads path data_references)
# Generating the full proteome peptide list is slow; link a precomputed copy.
# NOTE(review): hard-coded personal Dropbox path — not reproducible on other
# machines. Re-enable the commented write_proteome_peptides.py invocation to
# regenerate from scratch.
#python write_proteome_peptides.py \
#    "$PEPTIDES" \
#    "${REFERENCES_DIR}/uniprot_proteins.csv.bz2" \
#    --out proteome_peptides.csv
#ls -lh proteome_peptides.csv
#bzip2 proteome_peptides.csv
ln -s ~/Dropbox/sinai/projects/201808-mhcflurry-pan/20190622-models/proteome_peptides.csv.bz2 proteome_peptides.csv.bz2
# Locate a previous run of this download if any reuse mode was requested.
# BUG FIX: the message echoed $REFERENCES_DIR instead of $EXISTING_DATA.
if [ "${2:-reuse-none}" != "reuse-none" ]
then
    EXISTING_DATA=$(mhcflurry-downloads path "$DOWNLOAD_NAME")
    echo "Will reuse data from $EXISTING_DATA"
else
    EXISTING_DATA=""
    echo "Will NOT reuse any data"
fi
mkdir predictions
# Write out alleles: reuse from a previous run if requested, else regenerate.
# (A stray "for kind in with_mass_spec no_mass_spec" diff-residue line with no
# matching do/done was removed from this span.)
if [ "$2" == "reuse-all" ]
then
    echo "Reusing allele list"
    cp "$EXISTING_DATA/alleles.txt" .
else
    echo "Generating allele list"
    python write_allele_list.py "$PEPTIDES" --out alleles.txt
fi
# Write out and process peptides.
# First just chr1 peptides (fast smoke pass), then all peptides.
for subset in chr1 all
do
    # Peptide list for this subset: reuse from a previous run or generate.
    if [ "$2" == "reuse-all" ]
    then
        echo "Reusing peptide list"
        cp "$EXISTING_DATA/proteome_peptides.$subset.csv.bz2" .
    else
        echo "Generating peptide list"
        SUBSET_ARG=""
        if [ "$subset" == "chr1" ]
        then
            SUBSET_ARG="--chromosome 1"
        fi
        python write_proteome_peptides.py \
            "$PEPTIDES" \
            "${REFERENCES_DIR}/uniprot_proteins.csv.bz2" \
            --out "proteome_peptides.$subset.csv" $SUBSET_ARG
        bzip2 "proteome_peptides.$subset.csv"
    fi

    # Run MHCflurry (one model set per training-data kind).
    for kind in with_mass_spec no_mass_spec
    do
        OUT_DIR=predictions/${subset}.mhcflurry.${kind}
        REUSE_ARG=""
        if [ "$subset" == "all" ]
        then
            # The all-peptides run can reuse the chr1 predictions.
            REUSE_ARG="--reuse-predictions predictions/chr1.mhcflurry.${kind}"
        fi
        if [ "${2:-reuse-none}" != "reuse-none" ]
        then
            # BUG FIX: this was REUSE_ARG+="--reuse-predictions" "$EXISTING_DATA/$OUT_DIR",
            # which executes $EXISTING_DATA/$OUT_DIR as a command with REUSE_ARG
            # in its environment. Append both words to the variable instead.
            REUSE_ARG+=" --reuse-predictions $EXISTING_DATA/$OUT_DIR"
        fi
        python run_predictors.py \
            "proteome_peptides.${subset}.csv.bz2" \
            --predictor mhcflurry \
            --chunk-size 500000 \
            --mhcflurry-batch-size 65536 \
            --mhcflurry-models-dir "$(mhcflurry-downloads path models_class1_pan)/models.$kind" \
            --allele $(cat alleles.txt) \
            --out "$OUT_DIR" \
            --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
            --cluster-script-prefix-path "$SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.lsf" \
            $REUSE_ARG $EXTRA_ARGS
    done

    # Run netmhcpan4 (CPU-only cluster header).
    OUT_DIR=predictions/${subset}.netmhcpan4
    REUSE_ARG=""
    if [ "$subset" == "all" ]
    then
        REUSE_ARG="--reuse-predictions predictions/chr1.netmhcpan4"
    fi
    if [ "${2:-reuse-none}" != "reuse-none" ]
    then
        # Same append-vs-command fix as above.
        REUSE_ARG+=" --reuse-predictions $EXISTING_DATA/$OUT_DIR"
    fi
    python run_predictors.py \
        "proteome_peptides.$subset.csv.bz2" \
        --predictor netmhcpan4 \
        --chunk-size 10000 \
        --allele $(cat alleles.txt) \
        --out "$OUT_DIR" \
        --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
        --cluster-script-prefix-path "$SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.nogpu.lsf" \
        $REUSE_ARG $EXTRA_ARGS
done
# Include this script itself for provenance (quoted: path may contain spaces).
cp "$SCRIPT_ABSOLUTE_PATH" .
# NOTE(review): the tee processes are still appending to LOG.txt; this
# captures the log only up to this point — confirm that is intended.
bzip2 LOG.txt
RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
tar -cjf "$RESULT" *
echo "Created archive: $RESULT"
# Split into <2GB chunks for GitHub
PARTS="${RESULT}.part."
# Check for pre-existing part files and rename them out of the way.
# (Glob instead of parsing ls output; skip the literal pattern when no
# part files exist, so a fresh run does not warn or fail.)
for i in "${PARTS}"*
do
    [ -e "$i" ] || continue
    DEST="${i}.OLD.$(date +%s)"
    echo "WARNING: already exists: $i . Moving to $DEST"
    mv "$i" "$DEST"
done
split -b 2000M "$RESULT" "$PARTS"
echo "Split into parts:"
ls -lh "${PARTS}"*
#!/bin/bash
# LSF (bsub) submission-script header. The {work_item_num} and {work_dir}
# placeholders are filled in by the submitting process before submission.
# NOTE(review): there are two "-q" and two "-R rusage[mem=...]" directives
# below — this looks like diff residue (old vs. new values from the scraped
# commit page); confirm which queue and memory request is intended.
#BSUB -J MHCf-{work_item_num} # Job name
#BSUB -P acc_nkcancer # allocation account or Unix group
#BSUB -q express # queue
#BSUB -q premium # queue
#BSUB -R span[hosts=1] # one node
#BSUB -n 1 # number of compute cores
#BSUB -W 12:00 # walltime in HH:MM
#BSUB -R rusage[mem=20000] # mb memory requested
#BSUB -R rusage[mem=4000] # mb memory requested
#BSUB -o {work_dir}/%J.stdout # output log (%J : JobID)
#BSUB -eo {work_dir}/STDERR # error log
#BSUB -L /bin/bash # Initialize the execution environment
......
......@@ -75,7 +75,7 @@ parser.add_argument(
parser.add_argument(
"--reuse-predictions",
metavar="DIR",
nargs="+",
action="append",
help="Take predictions from indicated DIR instead of re-running them")
add_local_parallelism_args(parser)
......@@ -385,7 +385,9 @@ def do_predictions_mhcflurry(work_item_num, peptides, alleles, constant_data=Non
peptides=peptides,
allele=allele,
throw=False,
model_kwargs={'batch_size': args.batch_size}).astype('float32')
model_kwargs={
'batch_size': args.mhcflurry_batch_size,
}).astype('float32')
print("Done predicting in", time.time() - start, "sec")
return (work_item_num, results)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment