diff --git a/downloads-generation/data_mass_spec_benchmark/GENERATE.WITH_HPC_CLUSTER.sh b/downloads-generation/data_mass_spec_benchmark/GENERATE.WITH_HPC_CLUSTER.sh
index 5f73e767a74d5ef569dcc3a389207d9deb44510e..20127e1348afea83e0b3ca9007686bd7c9a58d6c 100755
--- a/downloads-generation/data_mass_spec_benchmark/GENERATE.WITH_HPC_CLUSTER.sh
+++ b/downloads-generation/data_mass_spec_benchmark/GENERATE.WITH_HPC_CLUSTER.sh
@@ -1,122 +1 @@
-#!/bin/bash
-#
-#
-set -e
-set -x
-
-DOWNLOAD_NAME=data_mass_spec_benchmark
-SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
-SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
-SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
-export PYTHONUNBUFFERED=1
-
-mkdir -p "$SCRATCH_DIR"
-rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
-mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
-
-# Send stdout and stderr to a logfile included with the archive.
-exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
-exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
-
-# Log some environment info
-date
-pip freeze
-git status
-
-cd $SCRATCH_DIR/$DOWNLOAD_NAME
-
-cp $SCRIPT_DIR/write_proteome_peptides.py .
-cp $SCRIPT_DIR/run_mhcflurry.py .
-cp $SCRIPT_DIR/run_thirdparty_predictors.py .
-cp $SCRIPT_DIR/write_allele_list.py .
-
-PEPTIDES=$(mhcflurry-downloads path data_mass_spec_annotated)/annotated_ms.csv.bz2
-REFERENCES_DIR=$(mhcflurry-downloads path data_references)
-
-python write_allele_list.py "$PEPTIDES" --out alleles.txt
-mkdir predictions
-
-# First just chr1 peptides
-python write_proteome_peptides.py \
-    "$PEPTIDES" \
-    "${REFERENCES_DIR}/uniprot_proteins.csv.bz2" \
-    --chromosome 1 \
-    --out proteome_peptides.chr1.csv
-
-#for kind in with_mass_spec no_mass_spec
-#do
-#    python run_mhcflurry.py \
-#        proteome_peptides.chr1.csv \
-#        --chunk-size 100000 \
-#        --batch-size 65536 \
-#        --models-dir "$(mhcflurry-downloads path models_class1_pan)/models.$kind" \
-#        --allele $(cat alleles.txt) \
-#        --out "predictions/chr1.mhcflurry.$kind" \
-#        --verbosity 1 \
-#        --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
-#        --cluster-parallelism \
-#        --cluster-max-retries 15 \
-#        --cluster-submit-command bsub \
-#        --cluster-results-workdir ~/mhcflurry-scratch \
-#        --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.lsf
-#done
-
-python run_thirdparty_predictors.py \
-    proteome_peptides.chr1.csv \
-    --predictor netmhcpan4 \
-    --chunk-size 10000 \
-    --allele $(cat alleles.txt) \
-    --out "predictions/chr1.netmhcpan4" \
-    --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
-    --cluster-parallelism \
-    --cluster-max-retries 3 \
-    --cluster-submit-command bsub \
-    --cluster-results-workdir ~/mhcflurry-scratch \
-    --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.nogpu.lsf
-
-# Now all peptides
-python write_proteome_peptides.py \
-    "$PEPTIDES" \
-    "${REFERENCES_DIR}/uniprot_proteins.csv.bz2" \
-    --out proteome_peptides.all.csv
-
-#for kind in with_mass_spec no_mass_spec
-#do
-#    python run_mhcflurry.py \
-#        proteome_peptides.all.csv \
-#        --chunk-size 500000 \
-#        --batch-size 65536 \
-#        --models-dir "$(mhcflurry-downloads path models_class1_pan)/models.$kind" \
-#        --allele $(cat alleles.txt) \
-#        --out "predictions/all.mhcflurry.$kind" \
-#        --verbosity 1 \
-#        --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
-#        --cluster-parallelism \
-#        --cluster-max-retries 15 \
-#        --cluster-submit-command bsub \
-#        --cluster-results-workdir ~/mhcflurry-scratch \
-#        --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.lsf
-#done
-
-python run_thirdparty_predictors.py \
-    proteome_peptides.all.csv \
-    --predictor netmhcpan4 \
-    --chunk-size 10000 \
-    --allele $(cat alleles.txt) \
-    --out "predictions/all.netmhcpan4" \
-    --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
-    --cluster-parallelism \
-    --cluster-max-retries 3 \
-    --cluster-submit-command bsub \
-    --cluster-results-workdir ~/mhcflurry-scratch \
-    --cluster-script-prefix-path cluster_submit_script_header.mssm_hpc.nogpu.lsf
-
-
-bzip2 proteome_peptides.chr1.csv
-bzip2 proteome_peptides.all.csv
-
-cp $SCRIPT_ABSOLUTE_PATH .
-bzip2 LOG.txt
-RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
-tar -cjf "$RESULT" *
-echo "Created archive: $RESULT"
+bash GENERATE.sh cluster reuse-predictions
\ No newline at end of file
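Note on the change above: the HPC entry point is reduced to a one-line delegation to GENERATE.sh, which now takes a parallelism mode and a reuse policy as positional arguments (per the usage line added to GENERATE.sh below). A sketch of the intended invocations, assuming the defaulting behavior visible in that script:

    # Run everything locally from scratch; the reuse checks in GENERATE.sh
    # treat a missing second argument as reuse-none.
    bash GENERATE.sh local reuse-none

    # Run under the LSF cluster, reusing previously computed predictions
    # while regenerating the allele and peptide lists; this is exactly what
    # the wrapper now executes.
    bash GENERATE.sh cluster reuse-predictions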
diff --git a/downloads-generation/data_mass_spec_benchmark/GENERATE.sh b/downloads-generation/data_mass_spec_benchmark/GENERATE.sh
index 1818ddbf168341aa812dc9395e678bae65fd4ce7..b1bc0029859d768700bac7b0846d9f551fcaf71c 100755
--- a/downloads-generation/data_mass_spec_benchmark/GENERATE.sh
+++ b/downloads-generation/data_mass_spec_benchmark/GENERATE.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
 #
+# GENERATE.sh <local|cluster> <reuse-all|reuse-none|reuse-predictions>
 #
 set -e
 set -x
@@ -15,8 +16,8 @@ rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
 mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
 
 # Send stdout and stderr to a logfile included with the archive.
-#exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
-#exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
+exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
+exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
 
 # Log some environment info
 date
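The log capture above, previously commented out, is re-enabled. For readers unfamiliar with the idiom, a generic sketch of what these two lines do (LOGFILE stands in for the script's actual log path):

    # Process substitution routes stdout through tee, which appends (-a) to
    # the logfile while still echoing to the terminal; -i tells tee to
    # ignore SIGINT so an interrupt does not cut the log short.
    exec > >(tee -ia "$LOGFILE")
    # Same for stderr; the trailing >&2 keeps the passthrough copy on
    # stderr instead of folding it into stdout.
    exec 2> >(tee -ia "$LOGFILE" >&2)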
@@ -26,48 +27,123 @@ git status
 
 cd $SCRATCH_DIR/$DOWNLOAD_NAME
 
 cp $SCRIPT_DIR/write_proteome_peptides.py .
-cp $SCRIPT_DIR/run_mhcflurry.py .
 cp $SCRIPT_DIR/write_allele_list.py .
+cp $SCRIPT_DIR/run_predictors.py .
 
-GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0
-echo "Detected GPUS: $GPUS"
+if [ "$1" != "cluster" ]
+then
 
-PROCESSORS=$(getconf _NPROCESSORS_ONLN)
-echo "Detected processors: $PROCESSORS"
+    GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0
+    echo "Detected GPUS: $GPUS"
 
-if [ "$GPUS" -eq "0" ]; then
-   NUM_JOBS=${NUM_JOBS-1}
+    PROCESSORS=$(getconf _NPROCESSORS_ONLN)
+    echo "Detected processors: $PROCESSORS"
+
+    if [ "$GPUS" -eq "0" ]; then
+        NUM_JOBS=${NUM_JOBS-1}
+    else
+        NUM_JOBS=${NUM_JOBS-$GPUS}
+    fi
+    echo "Num jobs: $NUM_JOBS"
+    EXTRA_ARGS+=" --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1"
 else
-    NUM_JOBS=${NUM_JOBS-$GPUS}
+    EXTRA_ARGS+=" --cluster-parallelism --cluster-max-retries 3 --cluster-submit-command bsub --cluster-results-workdir ~/mhcflurry-scratch"
 fi
-echo "Num jobs: $NUM_JOBS"
-
 PEPTIDES=$(mhcflurry-downloads path data_mass_spec_annotated)/annotated_ms.csv.bz2
 REFERENCES_DIR=$(mhcflurry-downloads path data_references)
 
-#python write_proteome_peptides.py \
-#    "$PEPTIDES" \
-#    "${REFERENCES_DIR}/uniprot_proteins.csv.bz2" \
-#    --out proteome_peptides.csv
-#ls -lh proteome_peptides.csv
-#bzip2 proteome_peptides.csv
-ln -s ~/Dropbox/sinai/projects/201808-mhcflurry-pan/20190622-models/proteome_peptides.csv.bz2 proteome_peptides.csv.bz2
-
-python write_allele_list.py "$PEPTIDES" --out alleles.txt
+if [ "${2:-reuse-none}" != "reuse-none" ]
+then
+    EXISTING_DATA=$(mhcflurry-downloads path $DOWNLOAD_NAME)
+    echo "Will reuse data from $EXISTING_DATA"
+else
+    EXISTING_DATA=""
+    echo "Will NOT reuse any data"
+fi
 
 mkdir predictions
 
-for kind in with_mass_spec no_mass_spec
+# Write out alleles
+if [ "$2" == "reuse-all" ]
+then
+    echo "Reusing allele list"
+    cp "$EXISTING_DATA/alleles.txt" .
+else
+    echo "Generating allele list"
+    python write_allele_list.py "$PEPTIDES" --out alleles.txt
+fi
+
+# Write out and process peptides.
+# First just chr1 peptides, then all peptides.
+for subset in chr1 all
 do
-    python run_mhcflurry.py \
-        proteome_peptides.csv.bz2 \
-        --chunk-size 100000 \
-        --models-dir "$(mhcflurry-downloads path models_class1_pan)/models.$kind" \
-        --batch-size 65536 \
+    if [ "$2" == "reuse-all" ]
+    then
+        echo "Reusing peptide list"
+        cp "$EXISTING_DATA/proteome_peptides.$subset.csv.bz2" .
+    else
+        echo "Generating peptide list"
+        SUBSET_ARG=""
+        if [ "$subset" == "chr1" ]
+        then
+            SUBSET_ARG="--chromosome 1"
+        fi
+        python write_proteome_peptides.py \
+            "$PEPTIDES" \
+            "${REFERENCES_DIR}/uniprot_proteins.csv.bz2" \
+            --out proteome_peptides.$subset.csv $SUBSET_ARG
+        bzip2 proteome_peptides.$subset.csv
+    fi
+
+    # Run MHCflurry
+    for kind in with_mass_spec no_mass_spec
+    do
+        OUT_DIR=predictions/${subset}.mhcflurry.${kind}
+        REUSE_ARG=""
+        if [ "$subset" == "all" ]
+        then
+            REUSE_ARG="--reuse-predictions predictions/chr1.mhcflurry.${kind}"
+        fi
+        if [ "${2:-reuse-none}" != "reuse-none" ]
+        then
+            REUSE_ARG+=" --reuse-predictions $EXISTING_DATA/$OUT_DIR"
+        fi
+
+        python run_predictors.py \
+            proteome_peptides.${subset}.csv.bz2 \
+            --predictor mhcflurry \
+            --chunk-size 500000 \
+            --mhcflurry-batch-size 65536 \
+            --mhcflurry-models-dir "$(mhcflurry-downloads path models_class1_pan)/models.$kind" \
+            --allele $(cat alleles.txt) \
+            --out "$OUT_DIR" \
+            --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
+            --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.lsf \
+            $REUSE_ARG $EXTRA_ARGS
+    done
+
+    # Run netmhcpan4
+    OUT_DIR=predictions/${subset}.netmhcpan4
+    REUSE_ARG=""
+    if [ "$subset" == "all" ]
+    then
+        REUSE_ARG="--reuse-predictions predictions/chr1.netmhcpan4"
+    fi
+    if [ "${2:-reuse-none}" != "reuse-none" ]
+    then
+        REUSE_ARG+=" --reuse-predictions $EXISTING_DATA/$OUT_DIR"
+    fi
+
+    python run_predictors.py \
+        proteome_peptides.$subset.csv.bz2 \
+        --predictor netmhcpan4 \
+        --chunk-size 10000 \
         --allele $(cat alleles.txt) \
-        --out "predictions/mhcflurry.$kind" \
-        --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1
+        --out "$OUT_DIR" \
+        --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
+        --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.nogpu.lsf \
+        $REUSE_ARG $EXTRA_ARGS
 done
 
 cp $SCRIPT_ABSOLUTE_PATH .
@@ -75,3 +151,16 @@ bzip2 LOG.txt
 RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
 tar -cjf "$RESULT" *
 echo "Created archive: $RESULT"
+
+# Split into <2GB chunks for GitHub
+PARTS="${RESULT}.part."
+# Check for pre-existing part files and rename them.
+for i in $(ls "${PARTS}"* )
+do
+    DEST="${i}.OLD.$(date +%s)"
+    echo "WARNING: already exists: $i . Moving to $DEST"
+    mv $i $DEST
+done
+split -b 2000M "$RESULT" "$PARTS"
+echo "Split into parts:"
+ls -lh "${PARTS}"*
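The last hunk splits the archive into sub-2 GB pieces so the parts can be hosted on GitHub. Reassembly is the standard split/cat round trip; a sketch, where the dated filename is a hypothetical example:

    # split(1) names the pieces .part.aa, .part.ab, ...; shell globs sort
    # lexicographically, so concatenation restores the tar.bz2 byte-for-byte.
    cat data_mass_spec_benchmark.20190801.tar.bz2.part.* \
        > data_mass_spec_benchmark.20190801.tar.bz2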
Moving to $DEST" + mv $i $DEST +done +split -b 2000M "$RESULT" "$PARTS" +echo "Split into parts:" +ls -lh "${PARTS}"* diff --git a/downloads-generation/data_mass_spec_benchmark/cluster_submit_script_header.mssm_hpc.nogpu.lsf b/downloads-generation/data_mass_spec_benchmark/cluster_submit_script_header.mssm_hpc.nogpu.lsf index 275d650bdaba109ee56f85c5dbe440823427acf6..444d2c1157a6d40d33315c7ffe1ad5d5b808c62a 100644 --- a/downloads-generation/data_mass_spec_benchmark/cluster_submit_script_header.mssm_hpc.nogpu.lsf +++ b/downloads-generation/data_mass_spec_benchmark/cluster_submit_script_header.mssm_hpc.nogpu.lsf @@ -1,11 +1,11 @@ #!/bin/bash #BSUB -J MHCf-{work_item_num} # Job name #BSUB -P acc_nkcancer # allocation account or Unix group -#BSUB -q express # queue +#BSUB -q premium # queue #BSUB -R span[hosts=1] # one node #BSUB -n 1 # number of compute cores #BSUB -W 12:00 # walltime in HH:MM -#BSUB -R rusage[mem=20000] # mb memory requested +#BSUB -R rusage[mem=4000] # mb memory requested #BSUB -o {work_dir}/%J.stdout # output log (%J : JobID) #BSUB -eo {work_dir}/STDERR # error log #BSUB -L /bin/bash # Initialize the execution environment diff --git a/downloads-generation/data_mass_spec_benchmark/run_predictors.py b/downloads-generation/data_mass_spec_benchmark/run_predictors.py index fd871ff64ad60e98c79e1c0c04ebcab02b1952c9..a86ce0da976fd6266868eead1aa975661cfff7af 100644 --- a/downloads-generation/data_mass_spec_benchmark/run_predictors.py +++ b/downloads-generation/data_mass_spec_benchmark/run_predictors.py @@ -75,7 +75,7 @@ parser.add_argument( parser.add_argument( "--reuse-predictions", metavar="DIR", - nargs="+", + action="append", help="Take predictions from indicated DIR instead of re-running them") add_local_parallelism_args(parser) @@ -385,7 +385,9 @@ def do_predictions_mhcflurry(work_item_num, peptides, alleles, constant_data=Non peptides=peptides, allele=allele, throw=False, - model_kwargs={'batch_size': args.batch_size}).astype('float32') + model_kwargs={ + 'batch_size': args.mhcflurry_batch_size, + }).astype('float32') print("Done predicting in", time.time() - start, "sec") return (work_item_num, results)