Skip to content
Snippets Groups Projects
Commit a2d950e9 authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

fixes

parent d3159f17
No related branches found
No related tags found
No related merge requests found
#!/bin/bash
#
# Generate the data_mass_spec_benchmark download: run predictors over
# proteome peptides and bundle the results into a tarball.
#
# Example invocation (as recorded in the commit):
#   bash GENERATE.sh cluster reuse-predictions
# NOTE(review): this version of the script does not actually read $1/$2 —
# the arguments appear to be consumed only by a later revision; confirm.
#
set -e  # abort on first failing command
set -x  # echo each command for the log
# Identity of this download and the scratch area it is built in.
DOWNLOAD_NAME=data_mass_spec_benchmark
SCRATCH_DIR="${TMPDIR-/tmp}/mhcflurry-downloads-generation"

# Absolute path of this script and the directory that contains it, so the
# helper scripts next to it can be located regardless of the caller's cwd.
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR="$(dirname "$SCRIPT_ABSOLUTE_PATH")"

export PYTHONUNBUFFERED=1

# Start from a fresh, empty working directory.
mkdir -p "$SCRATCH_DIR"
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"

# Send stdout and stderr to a logfile included with the archive.
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)

# Record some environment info in the log.
date
pip freeze
git status
# Work inside the scratch directory; copy the helper scripts in so they are
# captured in the generated archive. All expansions are quoted so the script
# survives a TMPDIR (and hence SCRATCH_DIR) containing spaces.
cd "$SCRATCH_DIR/$DOWNLOAD_NAME"
cp "$SCRIPT_DIR/write_proteome_peptides.py" .
cp "$SCRIPT_DIR/run_mhcflurry.py" .
cp "$SCRIPT_DIR/run_thirdparty_predictors.py" .
cp "$SCRIPT_DIR/write_allele_list.py" .

# Inputs: annotated mass-spec peptides and the reference proteome download.
PEPTIDES="$(mhcflurry-downloads path data_mass_spec_annotated)/annotated_ms.csv.bz2"
REFERENCES_DIR="$(mhcflurry-downloads path data_references)"

# Derive the allele list to predict over from the peptide dataset.
python write_allele_list.py "$PEPTIDES" --out alleles.txt

mkdir predictions
# First pass: restrict to chromosome 1 peptides (a smaller pilot set).
python write_proteome_peptides.py \
    "$PEPTIDES" \
    "${REFERENCES_DIR}/uniprot_proteins.csv.bz2" \
    --chromosome 1 \
    --out proteome_peptides.chr1.csv

# MHCflurry predictions are currently disabled; uncomment to re-enable.
#for kind in with_mass_spec no_mass_spec
#do
#    python run_mhcflurry.py \
#        proteome_peptides.chr1.csv \
#        --chunk-size 100000 \
#        --batch-size 65536 \
#        --models-dir "$(mhcflurry-downloads path models_class1_pan)/models.$kind" \
#        --allele $(cat alleles.txt) \
#        --out "predictions/chr1.mhcflurry.$kind" \
#        --verbosity 1 \
#        --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
#        --cluster-parallelism \
#        --cluster-max-retries 15 \
#        --cluster-submit-command bsub \
#        --cluster-results-workdir ~/mhcflurry-scratch \
#        --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.lsf
#done

# NetMHCpan 4 predictions for the chr1 peptides, submitted via bsub.
# $(cat alleles.txt) is intentionally unquoted: each allele must become a
# separate argument to --allele.
python run_thirdparty_predictors.py \
    proteome_peptides.chr1.csv \
    --predictor netmhcpan4 \
    --chunk-size 10000 \
    --allele $(cat alleles.txt) \
    --out "predictions/chr1.netmhcpan4" \
    --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
    --cluster-parallelism \
    --cluster-max-retries 3 \
    --cluster-submit-command bsub \
    --cluster-results-workdir ~/mhcflurry-scratch \
    --cluster-script-prefix-path "$SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.nogpu.lsf"
# Second pass: all peptides (the full proteome, no chromosome filter).
python write_proteome_peptides.py \
    "$PEPTIDES" \
    "${REFERENCES_DIR}/uniprot_proteins.csv.bz2" \
    --out proteome_peptides.all.csv

# MHCflurry predictions are currently disabled; uncomment to re-enable.
#for kind in with_mass_spec no_mass_spec
#do
#    python run_mhcflurry.py \
#        proteome_peptides.all.csv \
#        --chunk-size 500000 \
#        --batch-size 65536 \
#        --models-dir "$(mhcflurry-downloads path models_class1_pan)/models.$kind" \
#        --allele $(cat alleles.txt) \
#        --out "predictions/all.mhcflurry.$kind" \
#        --verbosity 1 \
#        --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
#        --cluster-parallelism \
#        --cluster-max-retries 15 \
#        --cluster-submit-command bsub \
#        --cluster-results-workdir ~/mhcflurry-scratch \
#        --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.lsf
#done

# NetMHCpan 4 predictions for all peptides.
# BUGFIX: the cluster script header must be referenced via $SCRIPT_DIR (as in
# the chr1 invocation above) — the .lsf files are never copied into the
# working directory, so a bare relative path would not resolve.
python run_thirdparty_predictors.py \
    proteome_peptides.all.csv \
    --predictor netmhcpan4 \
    --chunk-size 10000 \
    --allele $(cat alleles.txt) \
    --out "predictions/all.netmhcpan4" \
    --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
    --cluster-parallelism \
    --cluster-max-retries 3 \
    --cluster-submit-command bsub \
    --cluster-results-workdir ~/mhcflurry-scratch \
    --cluster-script-prefix-path "$SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.nogpu.lsf"
# Compress the (large) peptide lists before archiving.
bzip2 proteome_peptides.chr1.csv
bzip2 proteome_peptides.all.csv

# Include this script itself in the archive for provenance.
cp "$SCRIPT_ABSOLUTE_PATH" .

# NOTE(review): the tee processes started by the exec redirections above may
# still hold LOG.txt open here; the tail of the log can be truncated in the
# compressed copy — confirm this is acceptable.
bzip2 LOG.txt

# Bundle everything into a dated tarball and report its location.
RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
tar -cjf "$RESULT" *
echo "Created archive: $RESULT"
#!/bin/bash #!/bin/bash
# #
# GENERATE.sh <local|cluster> <reuse-all|reuse-none|reuse-predictions>
# #
set -e set -e
set -x set -x
...@@ -15,8 +16,8 @@ rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME" ...@@ -15,8 +16,8 @@ rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME" mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive. # Send stdout and stderr to a logfile included with the archive.
#exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt") exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
#exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2) exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
# Log some environment info # Log some environment info
date date
...@@ -26,48 +27,123 @@ git status ...@@ -26,48 +27,123 @@ git status
cd $SCRATCH_DIR/$DOWNLOAD_NAME cd $SCRATCH_DIR/$DOWNLOAD_NAME
cp $SCRIPT_DIR/write_proteome_peptides.py . cp $SCRIPT_DIR/write_proteome_peptides.py .
cp $SCRIPT_DIR/run_mhcflurry.py .
cp $SCRIPT_DIR/write_allele_list.py . cp $SCRIPT_DIR/write_allele_list.py .
cp $SCRIPT_DIR/run_predictors.py .
GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0 if [ "$1" != "cluster" ]
echo "Detected GPUS: $GPUS" then
PROCESSORS=$(getconf _NPROCESSORS_ONLN) GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0
echo "Detected processors: $PROCESSORS" echo "Detected GPUS: $GPUS"
if [ "$GPUS" -eq "0" ]; then PROCESSORS=$(getconf _NPROCESSORS_ONLN)
NUM_JOBS=${NUM_JOBS-1} echo "Detected processors: $PROCESSORS"
if [ "$GPUS" -eq "0" ]; then
NUM_JOBS=${NUM_JOBS-1}
else
NUM_JOBS=${NUM_JOBS-$GPUS}
fi
echo "Num jobs: $NUM_JOBS"
EXTRA_ARGS+=" --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1"
else else
NUM_JOBS=${NUM_JOBS-$GPUS} EXTRA_ARGS+=" --cluster-parallelism --cluster-max-retries 3 --cluster-submit-command bsub --cluster-results-workdir ~/mhcflurry-scratch"
fi fi
echo "Num jobs: $NUM_JOBS"
PEPTIDES=$(mhcflurry-downloads path data_mass_spec_annotated)/annotated_ms.csv.bz2 PEPTIDES=$(mhcflurry-downloads path data_mass_spec_annotated)/annotated_ms.csv.bz2
REFERENCES_DIR=$(mhcflurry-downloads path data_references) REFERENCES_DIR=$(mhcflurry-downloads path data_references)
#python write_proteome_peptides.py \ if [ "${2:-reuse-none}" != "reuse-none" ]
# "$PEPTIDES" \ then
# "${REFERENCES_DIR}/uniprot_proteins.csv.bz2" \ EXISTING_DATA=$(mhcflurry-downloads path $DOWNLOAD_NAME)
# --out proteome_peptides.csv echo "Will reuse data from $REFERENCES_DIR"
#ls -lh proteome_peptides.csv else
#bzip2 proteome_peptides.csv EXISTING_DATA=""
ln -s ~/Dropbox/sinai/projects/201808-mhcflurry-pan/20190622-models/proteome_peptides.csv.bz2 proteome_peptides.csv.bz2 echo "Will NOT reuse any data"
fi
python write_allele_list.py "$PEPTIDES" --out alleles.txt
mkdir predictions mkdir predictions
for kind in with_mass_spec no_mass_spec # Write out alleles
if [ "$2" == "reuse-all" ]
then
echo "Reusing allele list"
cp "$EXISTING_DATA/alleles.txt" .
else
echo "Generating allele list"
python write_allele_list.py "$PEPTIDES" --out alleles.txt
fi
# Write out and process peptides.
# First just chr1 peptides, then all peptides.
for subset in chr1 all
do do
python run_mhcflurry.py \ if [ "$2" == "reuse-all" ]
proteome_peptides.csv.bz2 \ then
--chunk-size 100000 \ echo "Reusing peptide list"
--models-dir "$(mhcflurry-downloads path models_class1_pan)/models.$kind" \ cp "$EXISTING_DATA/proteome_peptides.$subset.csv.bz2" .
--batch-size 65536 \ else
echo "Generating peptide list"
SUBSET_ARG=""
if [ "$subset" == "chr1" ]
then
SUBSET_ARG="--chromosome 1"
fi
python write_proteome_peptides.py \
"$PEPTIDES" \
"${REFERENCES_DIR}/uniprot_proteins.csv.bz2" \
--out proteome_peptides.$subset.csv $SUBSET_ARG
bzip2 proteome_peptides.$subset.csv
fi
# Run MHCflurry
for kind in with_mass_spec no_mass_spec
do
OUT_DIR=predictions/${subset}.mhcflurry.${kind}
REUSE_ARG=""
if [ "$subset" == "all" ]
then
REUSE_ARG="--reuse-predictions predictions/chr1.mhcflurry.${kind}"
fi
if [ "${2:-reuse-none}" != "reuse-none" ]
then
REUSE_ARG+="--reuse-predictions" "$EXISTING_DATA/$OUT_DIR"
fi
python run_predictors.py \
proteome_peptides.${subset}.csv.bz2 \
--predictor mhcflurry \
--chunk-size 500000 \
--mhcflurry-batch-size 65536 \
--mhcflurry-models-dir "$(mhcflurry-downloads path models_class1_pan)/models.$kind" \
--allele $(cat alleles.txt) \
--out "$OUT_DIR" \
--worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
--cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.lsf \
$REUSE_ARG $EXTRA_ARGS
done
# Run netmhcpan4
OUT_DIR=predictions/${subset}.netmhcpan4
REUSE_ARG=""
if [ "$subset" == "all" ]
then
REUSE_ARG="--reuse-predictions predictions/chr1.netmhcpan4"
fi
if [ "${2:-reuse-none}" != "reuse-none" ]
then
REUSE_ARG+="--reuse-predictions" "$EXISTING_DATA/$OUT_DIR"
fi
python run_predictors.py \
proteome_peptides.$subset.csv.bz2 \
--predictor netmhcpan4 \
--chunk-size 10000 \
--allele $(cat alleles.txt) \ --allele $(cat alleles.txt) \
--out "predictions/mhcflurry.$kind" \ --out "$OUT_DIR" \
--num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1 --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
--cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.nogpu.lsf \
$REUSE_ARG $EXTRA_ARGS
done done
cp $SCRIPT_ABSOLUTE_PATH . cp $SCRIPT_ABSOLUTE_PATH .
...@@ -75,3 +151,16 @@ bzip2 LOG.txt ...@@ -75,3 +151,16 @@ bzip2 LOG.txt
RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2" RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
tar -cjf "$RESULT" * tar -cjf "$RESULT" *
echo "Created archive: $RESULT" echo "Created archive: $RESULT"
# Split into <2GB chunks for GitHub
PARTS="${RESULT}.part."
# Check for pre-existing part files and rename them.
for i in $(ls "${PARTS}"* )
do
DEST="${i}.OLD.$(date +%s)"
echo "WARNING: already exists: $i . Moving to $DEST"
mv $i $DEST
done
split -b 2000M "$RESULT" "$PARTS"
echo "Split into parts:"
ls -lh "${PARTS}"*
#!/bin/bash #!/bin/bash
#BSUB -J MHCf-{work_item_num} # Job name #BSUB -J MHCf-{work_item_num} # Job name
#BSUB -P acc_nkcancer # allocation account or Unix group #BSUB -P acc_nkcancer # allocation account or Unix group
#BSUB -q express # queue #BSUB -q premium # queue
#BSUB -R span[hosts=1] # one node #BSUB -R span[hosts=1] # one node
#BSUB -n 1 # number of compute cores #BSUB -n 1 # number of compute cores
#BSUB -W 12:00 # walltime in HH:MM #BSUB -W 12:00 # walltime in HH:MM
#BSUB -R rusage[mem=20000] # mb memory requested #BSUB -R rusage[mem=4000] # mb memory requested
#BSUB -o {work_dir}/%J.stdout # output log (%J : JobID) #BSUB -o {work_dir}/%J.stdout # output log (%J : JobID)
#BSUB -eo {work_dir}/STDERR # error log #BSUB -eo {work_dir}/STDERR # error log
#BSUB -L /bin/bash # Initialize the execution environment #BSUB -L /bin/bash # Initialize the execution environment
......
...@@ -75,7 +75,7 @@ parser.add_argument( ...@@ -75,7 +75,7 @@ parser.add_argument(
parser.add_argument( parser.add_argument(
"--reuse-predictions", "--reuse-predictions",
metavar="DIR", metavar="DIR",
nargs="+", action="append",
help="Take predictions from indicated DIR instead of re-running them") help="Take predictions from indicated DIR instead of re-running them")
add_local_parallelism_args(parser) add_local_parallelism_args(parser)
...@@ -385,7 +385,9 @@ def do_predictions_mhcflurry(work_item_num, peptides, alleles, constant_data=Non ...@@ -385,7 +385,9 @@ def do_predictions_mhcflurry(work_item_num, peptides, alleles, constant_data=Non
peptides=peptides, peptides=peptides,
allele=allele, allele=allele,
throw=False, throw=False,
model_kwargs={'batch_size': args.batch_size}).astype('float32') model_kwargs={
'batch_size': args.mhcflurry_batch_size,
}).astype('float32')
print("Done predicting in", time.time() - start, "sec") print("Done predicting in", time.time() - start, "sec")
return (work_item_num, results) return (work_item_num, results)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment