From 4033fb4dc99edb605a54021e2d6a7925a2e720bd Mon Sep 17 00:00:00 2001 From: Timothy ODonnell <odonnt02@li03c03.chimera.hpc.mssm.edu> Date: Wed, 18 Sep 2019 21:38:52 -0400 Subject: [PATCH] fix --- .../GENERATE.WITH_HPC_CLUSTER.sh | 18 ++++++--- .../models_class1_pan/GENERATE.sh | 2 +- .../cluster_submit_script_header.mssm_hpc.lsf | 37 ++++++++++++++++++- mhcflurry/cluster_parallelism.py | 1 + 4 files changed, 51 insertions(+), 7 deletions(-) mode change 120000 => 100644 downloads-generation/models_class1_pan/cluster_submit_script_header.mssm_hpc.lsf diff --git a/downloads-generation/models_class1_pan/GENERATE.WITH_HPC_CLUSTER.sh b/downloads-generation/models_class1_pan/GENERATE.WITH_HPC_CLUSTER.sh index 807770e6..0dc15cda 100755 --- a/downloads-generation/models_class1_pan/GENERATE.WITH_HPC_CLUSTER.sh +++ b/downloads-generation/models_class1_pan/GENERATE.WITH_HPC_CLUSTER.sh @@ -51,8 +51,9 @@ UNSELECTED_PATH="$(mhcflurry-downloads path models_class1_pan_unselected)" for kind in with_mass_spec no_mass_spec do - # Model selection is always done locally. It's fast enough that it - # doesn't make sense to put it on the cluster. + # Model selection is run on the cluster, although for any reasonable + # machine it could be run locally. We run on the cluster because our + # cluster login nodes are often overloaded. MODELS_DIR="$UNSELECTED_PATH/models.${kind}" time mhcflurry-class1-select-pan-allele-models \ --data "$MODELS_DIR/train_data.csv.bz2" \ @@ -60,8 +61,14 @@ do --out-models-dir models.${kind} \ --min-models 2 \ --max-models 8 \ - --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1 - + --verbosity 1 \ + --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \ + --cluster-parallelism \ + --cluster-max-retries 15 \ + --cluster-submit-command bsub \ + --cluster-results-workdir ~/mhcflurry-scratch \ + --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.lsf + cp "$MODELS_DIR/train_data.csv.bz2" "models.${kind}/" # Percentile rank calibration is run on the cluster. @@ -72,12 +79,13 @@ do --models-dir models.${kind} \ --match-amino-acid-distribution-data "$MODELS_DIR/train_data.csv.bz2" \ --motif-summary \ - --num-peptides-per-length 1000000 \ + --num-peptides-per-length 100000 \ --allele $(bzcat "$MODELS_DIR/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq) \ --verbosity 1 \ --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \ --prediction-batch-size 524288 \ --cluster-parallelism \ + --cluster-max-retries 15 \ --cluster-submit-command bsub \ --cluster-results-workdir ~/mhcflurry-scratch \ --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.lsf diff --git a/downloads-generation/models_class1_pan/GENERATE.sh b/downloads-generation/models_class1_pan/GENERATE.sh index 9b4e07b4..fa767514 100755 --- a/downloads-generation/models_class1_pan/GENERATE.sh +++ b/downloads-generation/models_class1_pan/GENERATE.sh @@ -64,7 +64,7 @@ do --models-dir models.${kind} \ --match-amino-acid-distribution-data "$MODELS_DIR/train_data.csv.bz2" \ --motif-summary \ - --num-peptides-per-length 1000000 \ + --num-peptides-per-length 100000 \ --allele $(bzcat "$MODELS_DIR/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq) \ --verbosity 1 \ --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1 diff --git a/downloads-generation/models_class1_pan/cluster_submit_script_header.mssm_hpc.lsf b/downloads-generation/models_class1_pan/cluster_submit_script_header.mssm_hpc.lsf deleted file mode 120000 index 09aeb92d..00000000 --- a/downloads-generation/models_class1_pan/cluster_submit_script_header.mssm_hpc.lsf +++ /dev/null @@ -1 +0,0 @@ -../models_class1_pan_unselected/cluster_submit_script_header.mssm_hpc.lsf \ No newline at end of file diff --git a/downloads-generation/models_class1_pan/cluster_submit_script_header.mssm_hpc.lsf b/downloads-generation/models_class1_pan/cluster_submit_script_header.mssm_hpc.lsf new file mode 100644 index 00000000..efa3d10e --- /dev/null +++ b/downloads-generation/models_class1_pan/cluster_submit_script_header.mssm_hpc.lsf @@ -0,0 +1,36 @@ +#!/bin/bash +#BSUB -J MHCf-{work_item_num} # Job name +#BSUB -P acc_nkcancer # allocation account or Unix group +#BSUB -q gpu # queue +#BSUB -R rusage[ngpus_excl_p=1] # 1 exclusive GPU +#BSUB -R span[hosts=1] # one node +#BSUB -n 1 # number of compute cores +#BSUB -W 46:00 # walltime in HH:MM +#BSUB -R rusage[mem=30000] # mb memory requested +#BSUB -o {work_dir}/%J.stdout # output log (%J : JobID) +#BSUB -eo {work_dir}/STDERR # error log +#BSUB -L /bin/bash # Initialize the execution environment +# + +set -e +set -x + +echo "Subsequent stderr output redirected to stdout" >&2 +exec 2>&1 + +export TMPDIR=/local/JOBS/mhcflurry-{work_item_num} +export PATH=$HOME/.conda/envs/py36b/bin/:$PATH +export PYTHONUNBUFFERED=1 +export KMP_SETTINGS=1 + +free -m + +module add cuda/10.0.130 cudnn/7.1.1 +module list + +# python -c 'import tensorflow as tf ; print("GPU AVAILABLE" if tf.test.is_gpu_available() else "GPU NOT AVAILABLE")' + +env + +cd {work_dir} + diff --git a/mhcflurry/cluster_parallelism.py b/mhcflurry/cluster_parallelism.py index 186e7649..f2360506 100644 --- a/mhcflurry/cluster_parallelism.py +++ b/mhcflurry/cluster_parallelism.py @@ -54,6 +54,7 @@ def add_cluster_parallelism_args(parser): ) group.add_argument( '--cluster-max-retries', + type=int, help="How many times to rerun failing jobs. Default: %(default)s", default=3) -- GitLab