Commit 4033fb4d authored by Timothy ODonnell

fix

parent 89db282d
@@ -51,8 +51,9 @@ UNSELECTED_PATH="$(mhcflurry-downloads path models_class1_pan_unselected)"
 for kind in with_mass_spec no_mass_spec
 do
-    # Model selection is always done locally. It's fast enough that it
-    # doesn't make sense to put it on the cluster.
+    # Model selection is run on the cluster, although for any reasonable
+    # machine it could be run locally. We run on the cluster because our
+    # cluster login nodes are often overloaded.
     MODELS_DIR="$UNSELECTED_PATH/models.${kind}"
     time mhcflurry-class1-select-pan-allele-models \
         --data "$MODELS_DIR/train_data.csv.bz2" \
@@ -60,8 +61,14 @@ do
         --out-models-dir models.${kind} \
         --min-models 2 \
         --max-models 8 \
-        --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1
+        --verbosity 1 \
+        --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
+        --cluster-parallelism \
+        --cluster-max-retries 15 \
+        --cluster-submit-command bsub \
+        --cluster-results-workdir ~/mhcflurry-scratch \
+        --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.lsf
     cp "$MODELS_DIR/train_data.csv.bz2" "models.${kind}/"
     # Percentile rank calibration is run on the cluster.
@@ -72,12 +79,13 @@ do
         --models-dir models.${kind} \
         --match-amino-acid-distribution-data "$MODELS_DIR/train_data.csv.bz2" \
         --motif-summary \
-        --num-peptides-per-length 1000000 \
+        --num-peptides-per-length 100000 \
         --allele $(bzcat "$MODELS_DIR/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq) \
         --verbosity 1 \
         --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
         --prediction-batch-size 524288 \
         --cluster-parallelism \
+        --cluster-max-retries 15 \
         --cluster-submit-command bsub \
         --cluster-results-workdir ~/mhcflurry-scratch \
         --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.lsf
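The new --cluster-* flags above switch model selection from local multiprocessing to cluster submission: each work item gets a directory under --cluster-results-workdir, a job script is formed by prepending the header file given by --cluster-script-prefix-path, and the script is piped to --cluster-submit-command (LSF's bsub here), with failed jobs resubmitted up to --cluster-max-retries times. A minimal sketch of that submission flow follows; the helper and worker names are hypothetical illustrations, not mhcflurry's internal API:

    # Sketch only: shows how a site header plus a worker command could be
    # piped to bsub. submit_work_item and the worker command are made up.
    import subprocess

    def submit_work_item(work_item_num, work_dir, prefix_path, submit_command="bsub"):
        # Fill the {work_item_num} / {work_dir} placeholders in the site header.
        header = open(prefix_path).read().format(
            work_item_num=work_item_num, work_dir=work_dir)
        worker = "some-worker-entry-point '%s'" % work_dir  # stand-in command
        script = header + "\n" + worker + "\n"
        # bsub reads the job script from stdin, equivalent to `bsub < job.lsf`.
        subprocess.run([submit_command], input=script.encode(), check=True)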
@@ -64,7 +64,7 @@ do
         --models-dir models.${kind} \
         --match-amino-acid-distribution-data "$MODELS_DIR/train_data.csv.bz2" \
         --motif-summary \
-        --num-peptides-per-length 1000000 \
+        --num-peptides-per-length 100000 \
         --allele $(bzcat "$MODELS_DIR/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq) \
         --verbosity 1 \
         --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1
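Both calibration commands also drop --num-peptides-per-length from 1,000,000 to 100,000, a 10x cut in the random-peptide workload per allele. Rough scale, assuming calibration covers the usual class I peptide lengths 8 through 15 (an assumption, not something stated in this diff):

    # Back-of-envelope calibration cost per allele (lengths 8-15 assumed).
    lengths = range(8, 16)
    print(1_000_000 * len(lengths))  # 8,000,000 predictions per allele before
    print(100_000 * len(lengths))    # 800,000 after: 10x less work per allele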
-../models_class1_pan_unselected/cluster_submit_script_header.mssm_hpc.lsf
\ No newline at end of file
+#!/bin/bash
+#BSUB -J MHCf-{work_item_num}     # Job name
+#BSUB -P acc_nkcancer             # Allocation account or Unix group
+#BSUB -q gpu                      # Queue
+#BSUB -R rusage[ngpus_excl_p=1]   # 1 exclusive GPU
+#BSUB -R span[hosts=1]            # One node
+#BSUB -n 1                        # Number of compute cores
+#BSUB -W 46:00                    # Walltime in HH:MM
+#BSUB -R rusage[mem=30000]        # Memory requested, in MB
+#BSUB -o {work_dir}/%J.stdout     # Output log (%J: job ID)
+#BSUB -eo {work_dir}/STDERR       # Error log
+#BSUB -L /bin/bash                # Initialize the execution environment
+#
+set -e
+set -x
+echo "Subsequent stderr output redirected to stdout" >&2
+exec 2>&1
+export TMPDIR=/local/JOBS/mhcflurry-{work_item_num}
+export PATH=$HOME/.conda/envs/py36b/bin/:$PATH
+export PYTHONUNBUFFERED=1
+export KMP_SETTINGS=1
+free -m
+module add cuda/10.0.130 cudnn/7.1.1
+module list
+# python -c 'import tensorflow as tf ; print("GPU AVAILABLE" if tf.test.is_gpu_available() else "GPU NOT AVAILABLE")'
+env
+cd {work_dir}
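This change replaces the old file, a symlink to the models_class1_pan_unselected copy, with a standalone LSF header. In it, {work_item_num} and {work_dir} are str.format-style placeholders filled in per job, and exec 2>&1 merges stderr into stdout so each job leaves a single %J.stdout log. A quick way to check that a site-specific header renders cleanly before pointing --cluster-script-prefix-path at it (a sketch; the sample values are arbitrary):

    # Render the LSF header with sample values to verify the placeholders.
    template = open("cluster_submit_script_header.mssm_hpc.lsf").read()
    print(template.format(work_item_num=0, work_dir="/tmp/mhcflurry-smoke-test"))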
@@ -54,6 +54,7 @@ def add_cluster_parallelism_args(parser):
     )
     group.add_argument(
         '--cluster-max-retries',
+        type=int,
         help="How many times to rerun failing jobs. Default: %(default)s",
         default=3)
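The type=int addition matters because argparse applies type conversion only to values parsed from the command line: without it, --cluster-max-retries 15 arrives as the string '15' while the default stays the integer 3, so retry-count comparisons fail exactly when the flag is actually used. A minimal reproduction:

    import argparse

    parser = argparse.ArgumentParser()
    # As in the pre-fix code: no type=int, so command-line values stay strings.
    parser.add_argument('--cluster-max-retries', default=3)
    args = parser.parse_args(['--cluster-max-retries', '15'])
    print(repr(args.cluster_max_retries))  # '15' -- a str; the default is int 3

    try:
        0 < args.cluster_max_retries  # e.g. a retry-counter comparison
    except TypeError as exc:
        print("breaks only when the flag is passed:", exc)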