Skip to content
Snippets Groups Projects
Commit a2d950e9 authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

fixes

parent d3159f17
No related branches found
No related tags found
No related merge requests found
#!/bin/bash
#
# Generate the data_mass_spec_benchmark download: run predictors over
# proteome peptides and bundle the results into a tarball.
#
# Example invocation (as recorded in the commit):
#   bash GENERATE.sh cluster reuse-predictions
# NOTE(review): this version of the script does not actually read $1/$2 —
# the arguments appear to be consumed only by a later revision; confirm.
#
set -e  # abort on first failing command
set -x  # echo each command for the log
# Identity of this download and the scratch area it is built in.
DOWNLOAD_NAME=data_mass_spec_benchmark
SCRATCH_DIR="${TMPDIR-/tmp}/mhcflurry-downloads-generation"

# Absolute path of this script and the directory that contains it, so the
# helper scripts next to it can be located regardless of the caller's cwd.
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR="$(dirname "$SCRIPT_ABSOLUTE_PATH")"

export PYTHONUNBUFFERED=1

# Start from a fresh, empty working directory.
mkdir -p "$SCRATCH_DIR"
rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"

# Send stdout and stderr to a logfile included with the archive.
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)

# Record some environment info in the log.
date
pip freeze
git status
# Work inside the scratch directory; copy the helper scripts in so they are
# captured in the generated archive. All expansions are quoted so the script
# survives a TMPDIR (and hence SCRATCH_DIR) containing spaces.
cd "$SCRATCH_DIR/$DOWNLOAD_NAME"
cp "$SCRIPT_DIR/write_proteome_peptides.py" .
cp "$SCRIPT_DIR/run_mhcflurry.py" .
cp "$SCRIPT_DIR/run_thirdparty_predictors.py" .
cp "$SCRIPT_DIR/write_allele_list.py" .

# Inputs: annotated mass-spec peptides and the reference proteome download.
PEPTIDES="$(mhcflurry-downloads path data_mass_spec_annotated)/annotated_ms.csv.bz2"
REFERENCES_DIR="$(mhcflurry-downloads path data_references)"

# Derive the allele list to predict over from the peptide dataset.
python write_allele_list.py "$PEPTIDES" --out alleles.txt

mkdir predictions
# First pass: restrict to chromosome 1 peptides (a smaller pilot set).
python write_proteome_peptides.py \
    "$PEPTIDES" \
    "${REFERENCES_DIR}/uniprot_proteins.csv.bz2" \
    --chromosome 1 \
    --out proteome_peptides.chr1.csv

# MHCflurry predictions are currently disabled; uncomment to re-enable.
#for kind in with_mass_spec no_mass_spec
#do
#    python run_mhcflurry.py \
#        proteome_peptides.chr1.csv \
#        --chunk-size 100000 \
#        --batch-size 65536 \
#        --models-dir "$(mhcflurry-downloads path models_class1_pan)/models.$kind" \
#        --allele $(cat alleles.txt) \
#        --out "predictions/chr1.mhcflurry.$kind" \
#        --verbosity 1 \
#        --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
#        --cluster-parallelism \
#        --cluster-max-retries 15 \
#        --cluster-submit-command bsub \
#        --cluster-results-workdir ~/mhcflurry-scratch \
#        --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.lsf
#done

# NetMHCpan 4 predictions for the chr1 peptides, submitted via bsub.
# $(cat alleles.txt) is intentionally unquoted: each allele must become a
# separate argument to --allele.
python run_thirdparty_predictors.py \
    proteome_peptides.chr1.csv \
    --predictor netmhcpan4 \
    --chunk-size 10000 \
    --allele $(cat alleles.txt) \
    --out "predictions/chr1.netmhcpan4" \
    --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
    --cluster-parallelism \
    --cluster-max-retries 3 \
    --cluster-submit-command bsub \
    --cluster-results-workdir ~/mhcflurry-scratch \
    --cluster-script-prefix-path "$SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.nogpu.lsf"
# Second pass: all peptides (the full proteome, no chromosome filter).
python write_proteome_peptides.py \
    "$PEPTIDES" \
    "${REFERENCES_DIR}/uniprot_proteins.csv.bz2" \
    --out proteome_peptides.all.csv

# MHCflurry predictions are currently disabled; uncomment to re-enable.
#for kind in with_mass_spec no_mass_spec
#do
#    python run_mhcflurry.py \
#        proteome_peptides.all.csv \
#        --chunk-size 500000 \
#        --batch-size 65536 \
#        --models-dir "$(mhcflurry-downloads path models_class1_pan)/models.$kind" \
#        --allele $(cat alleles.txt) \
#        --out "predictions/all.mhcflurry.$kind" \
#        --verbosity 1 \
#        --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
#        --cluster-parallelism \
#        --cluster-max-retries 15 \
#        --cluster-submit-command bsub \
#        --cluster-results-workdir ~/mhcflurry-scratch \
#        --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.lsf
#done

# NetMHCpan 4 predictions for all peptides.
# BUGFIX: the cluster script header must be referenced via $SCRIPT_DIR (as in
# the chr1 invocation above) — the .lsf files are never copied into the
# working directory, so a bare relative path would not resolve.
python run_thirdparty_predictors.py \
    proteome_peptides.all.csv \
    --predictor netmhcpan4 \
    --chunk-size 10000 \
    --allele $(cat alleles.txt) \
    --out "predictions/all.netmhcpan4" \
    --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
    --cluster-parallelism \
    --cluster-max-retries 3 \
    --cluster-submit-command bsub \
    --cluster-results-workdir ~/mhcflurry-scratch \
    --cluster-script-prefix-path "$SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.nogpu.lsf"
# Compress the (large) peptide lists before archiving.
bzip2 proteome_peptides.chr1.csv
bzip2 proteome_peptides.all.csv

# Include this script itself in the archive for provenance.
cp "$SCRIPT_ABSOLUTE_PATH" .

# NOTE(review): the tee processes started by the exec redirections above may
# still hold LOG.txt open here; the tail of the log can be truncated in the
# compressed copy — confirm this is acceptable.
bzip2 LOG.txt

# Bundle everything into a dated tarball and report its location.
RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
tar -cjf "$RESULT" *
echo "Created archive: $RESULT"
#!/bin/bash #!/bin/bash
# #
# GENERATE.sh <local|cluster> <reuse-all|reuse-none|reuse-predictions>
# #
set -e set -e
set -x set -x
...@@ -15,8 +16,8 @@ rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME" ...@@ -15,8 +16,8 @@ rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME" mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive. # Send stdout and stderr to a logfile included with the archive.
#exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt") exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
#exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2) exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
# Log some environment info # Log some environment info
date date
...@@ -26,48 +27,123 @@ git status ...@@ -26,48 +27,123 @@ git status
cd $SCRATCH_DIR/$DOWNLOAD_NAME cd $SCRATCH_DIR/$DOWNLOAD_NAME
cp $SCRIPT_DIR/write_proteome_peptides.py . cp $SCRIPT_DIR/write_proteome_peptides.py .
cp $SCRIPT_DIR/run_mhcflurry.py .
cp $SCRIPT_DIR/write_allele_list.py . cp $SCRIPT_DIR/write_allele_list.py .
cp $SCRIPT_DIR/run_predictors.py .
GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0 if [ "$1" != "cluster" ]
echo "Detected GPUS: $GPUS" then
PROCESSORS=$(getconf _NPROCESSORS_ONLN) GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0
echo "Detected processors: $PROCESSORS" echo "Detected GPUS: $GPUS"
if [ "$GPUS" -eq "0" ]; then PROCESSORS=$(getconf _NPROCESSORS_ONLN)
NUM_JOBS=${NUM_JOBS-1} echo "Detected processors: $PROCESSORS"
if [ "$GPUS" -eq "0" ]; then
NUM_JOBS=${NUM_JOBS-1}
else
NUM_JOBS=${NUM_JOBS-$GPUS}
fi
echo "Num jobs: $NUM_JOBS"
EXTRA_ARGS+=" --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1"
else else
NUM_JOBS=${NUM_JOBS-$GPUS} EXTRA_ARGS+=" --cluster-parallelism --cluster-max-retries 3 --cluster-submit-command bsub --cluster-results-workdir ~/mhcflurry-scratch"
fi fi
echo "Num jobs: $NUM_JOBS"
PEPTIDES=$(mhcflurry-downloads path data_mass_spec_annotated)/annotated_ms.csv.bz2 PEPTIDES=$(mhcflurry-downloads path data_mass_spec_annotated)/annotated_ms.csv.bz2
REFERENCES_DIR=$(mhcflurry-downloads path data_references) REFERENCES_DIR=$(mhcflurry-downloads path data_references)
#python write_proteome_peptides.py \ if [ "${2:-reuse-none}" != "reuse-none" ]
# "$PEPTIDES" \ then
# "${REFERENCES_DIR}/uniprot_proteins.csv.bz2" \ EXISTING_DATA=$(mhcflurry-downloads path $DOWNLOAD_NAME)
# --out proteome_peptides.csv echo "Will reuse data from $REFERENCES_DIR"
#ls -lh proteome_peptides.csv else
#bzip2 proteome_peptides.csv EXISTING_DATA=""
ln -s ~/Dropbox/sinai/projects/201808-mhcflurry-pan/20190622-models/proteome_peptides.csv.bz2 proteome_peptides.csv.bz2 echo "Will NOT reuse any data"
fi
python write_allele_list.py "$PEPTIDES" --out alleles.txt
mkdir predictions mkdir predictions
for kind in with_mass_spec no_mass_spec # Write out alleles
if [ "$2" == "reuse-all" ]
then
echo "Reusing allele list"
cp "$EXISTING_DATA/alleles.txt" .
else
echo "Generating allele list"
python write_allele_list.py "$PEPTIDES" --out alleles.txt
fi
# Write out and process peptides.
# First just chr1 peptides, then all peptides.
for subset in chr1 all
do do
python run_mhcflurry.py \ if [ "$2" == "reuse-all" ]
proteome_peptides.csv.bz2 \ then
--chunk-size 100000 \ echo "Reusing peptide list"
--models-dir "$(mhcflurry-downloads path models_class1_pan)/models.$kind" \ cp "$EXISTING_DATA/proteome_peptides.$subset.csv.bz2" .
--batch-size 65536 \ else
echo "Generating peptide list"
SUBSET_ARG=""
if [ "$subset" == "chr1" ]
then
SUBSET_ARG="--chromosome 1"
fi
python write_proteome_peptides.py \
"$PEPTIDES" \
"${REFERENCES_DIR}/uniprot_proteins.csv.bz2" \
--out proteome_peptides.$subset.csv $SUBSET_ARG
bzip2 proteome_peptides.$subset.csv
fi
# Run MHCflurry
for kind in with_mass_spec no_mass_spec
do
OUT_DIR=predictions/${subset}.mhcflurry.${kind}
REUSE_ARG=""
if [ "$subset" == "all" ]
then
REUSE_ARG="--reuse-predictions predictions/chr1.mhcflurry.${kind}"
fi
if [ "${2:-reuse-none}" != "reuse-none" ]
then
REUSE_ARG+="--reuse-predictions" "$EXISTING_DATA/$OUT_DIR"
fi
python run_predictors.py \
proteome_peptides.${subset}.csv.bz2 \
--predictor mhcflurry \
--chunk-size 500000 \
--mhcflurry-batch-size 65536 \
--mhcflurry-models-dir "$(mhcflurry-downloads path models_class1_pan)/models.$kind" \
--allele $(cat alleles.txt) \
--out "$OUT_DIR" \
--worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
--cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.lsf \
$REUSE_ARG $EXTRA_ARGS
done
# Run netmhcpan4
OUT_DIR=predictions/${subset}.netmhcpan4
REUSE_ARG=""
if [ "$subset" == "all" ]
then
REUSE_ARG="--reuse-predictions predictions/chr1.netmhcpan4"
fi
if [ "${2:-reuse-none}" != "reuse-none" ]
then
REUSE_ARG+="--reuse-predictions" "$EXISTING_DATA/$OUT_DIR"
fi
python run_predictors.py \
proteome_peptides.$subset.csv.bz2 \
--predictor netmhcpan4 \
--chunk-size 10000 \
--allele $(cat alleles.txt) \ --allele $(cat alleles.txt) \
--out "predictions/mhcflurry.$kind" \ --out "$OUT_DIR" \
--num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1 --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
--cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.nogpu.lsf \
$REUSE_ARG $EXTRA_ARGS
done done
cp $SCRIPT_ABSOLUTE_PATH . cp $SCRIPT_ABSOLUTE_PATH .
...@@ -75,3 +151,16 @@ bzip2 LOG.txt ...@@ -75,3 +151,16 @@ bzip2 LOG.txt
RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2" RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
tar -cjf "$RESULT" * tar -cjf "$RESULT" *
echo "Created archive: $RESULT" echo "Created archive: $RESULT"
# Split into <2GB chunks for GitHub
PARTS="${RESULT}.part."
# Check for pre-existing part files and rename them.
for i in $(ls "${PARTS}"* )
do
DEST="${i}.OLD.$(date +%s)"
echo "WARNING: already exists: $i . Moving to $DEST"
mv $i $DEST
done
split -b 2000M "$RESULT" "$PARTS"
echo "Split into parts:"
ls -lh "${PARTS}"*
#!/bin/bash #!/bin/bash
#BSUB -J MHCf-{work_item_num} # Job name #BSUB -J MHCf-{work_item_num} # Job name
#BSUB -P acc_nkcancer # allocation account or Unix group #BSUB -P acc_nkcancer # allocation account or Unix group
#BSUB -q express # queue #BSUB -q premium # queue
#BSUB -R span[hosts=1] # one node #BSUB -R span[hosts=1] # one node
#BSUB -n 1 # number of compute cores #BSUB -n 1 # number of compute cores
#BSUB -W 12:00 # walltime in HH:MM #BSUB -W 12:00 # walltime in HH:MM
#BSUB -R rusage[mem=20000] # mb memory requested #BSUB -R rusage[mem=4000] # mb memory requested
#BSUB -o {work_dir}/%J.stdout # output log (%J : JobID) #BSUB -o {work_dir}/%J.stdout # output log (%J : JobID)
#BSUB -eo {work_dir}/STDERR # error log #BSUB -eo {work_dir}/STDERR # error log
#BSUB -L /bin/bash # Initialize the execution environment #BSUB -L /bin/bash # Initialize the execution environment
......
...@@ -75,7 +75,7 @@ parser.add_argument( ...@@ -75,7 +75,7 @@ parser.add_argument(
parser.add_argument( parser.add_argument(
"--reuse-predictions", "--reuse-predictions",
metavar="DIR", metavar="DIR",
nargs="+", action="append",
help="Take predictions from indicated DIR instead of re-running them") help="Take predictions from indicated DIR instead of re-running them")
add_local_parallelism_args(parser) add_local_parallelism_args(parser)
...@@ -385,7 +385,9 @@ def do_predictions_mhcflurry(work_item_num, peptides, alleles, constant_data=Non ...@@ -385,7 +385,9 @@ def do_predictions_mhcflurry(work_item_num, peptides, alleles, constant_data=Non
peptides=peptides, peptides=peptides,
allele=allele, allele=allele,
throw=False, throw=False,
model_kwargs={'batch_size': args.batch_size}).astype('float32') model_kwargs={
'batch_size': args.mhcflurry_batch_size,
}).astype('float32')
print("Done predicting in", time.time() - start, "sec") print("Done predicting in", time.time() - start, "sec")
return (work_item_num, results) return (work_item_num, results)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment