From a7e94a17927b69fb02e29178f798fe9dff970ace Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Wed, 1 Jan 2020 18:28:59 -0500 Subject: [PATCH] fix --- .../models_class1_pan/GENERATE.sh | 1 + .../GENERATE.WITH_HPC_CLUSTER.sh | 1 - .../models_class1_pan_select_only/GENERATE.sh | 93 ------------------- .../additional_alleles.txt | 3 - .../cluster_submit_script_header.mssm_hpc.lsf | 36 ------- 5 files changed, 1 insertion(+), 133 deletions(-) delete mode 100755 downloads-generation/models_class1_pan_select_only/GENERATE.WITH_HPC_CLUSTER.sh delete mode 100755 downloads-generation/models_class1_pan_select_only/GENERATE.sh delete mode 100644 downloads-generation/models_class1_pan_select_only/additional_alleles.txt delete mode 100644 downloads-generation/models_class1_pan_select_only/cluster_submit_script_header.mssm_hpc.lsf diff --git a/downloads-generation/models_class1_pan/GENERATE.sh b/downloads-generation/models_class1_pan/GENERATE.sh index 4a0fba61..85669765 100755 --- a/downloads-generation/models_class1_pan/GENERATE.sh +++ b/downloads-generation/models_class1_pan/GENERATE.sh @@ -79,6 +79,7 @@ do ALLELE_SEQUENCES="$(mhcflurry-downloads path allele_sequences)/allele_sequences.csv" TRAINING_DATA="$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" + HYPERPARAMETERS="hyperparameters.yaml" mhcflurry-class1-train-pan-allele-models \ --data "$TRAINING_DATA" \ diff --git a/downloads-generation/models_class1_pan_select_only/GENERATE.WITH_HPC_CLUSTER.sh b/downloads-generation/models_class1_pan_select_only/GENERATE.WITH_HPC_CLUSTER.sh deleted file mode 100755 index 53125eb7..00000000 --- a/downloads-generation/models_class1_pan_select_only/GENERATE.WITH_HPC_CLUSTER.sh +++ /dev/null @@ -1 +0,0 @@ -bash GENERATE.sh cluster diff --git a/downloads-generation/models_class1_pan_select_only/GENERATE.sh b/downloads-generation/models_class1_pan_select_only/GENERATE.sh deleted file mode 100755 index 2ef26eec..00000000 --- a/downloads-generation/models_class1_pan_select_only/GENERATE.sh +++ /dev/null @@ -1,93 +0,0 @@ -#!/bin/bash -# Model select pan-allele MHCflurry Class I models and calibrate percentile ranks. -# -# Usage: GENERATE.sh <local|cluster> -# -set -e -set -x - -DOWNLOAD_NAME=models_class1_pan -SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation -SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")" -SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH") - -mkdir -p "$SCRATCH_DIR" -rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME" -mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME" - -# Send stdout and stderr to a logfile included with the archive. -exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt") -exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2) - -# Log some environment info -date -pip freeze -git status - -cd $SCRATCH_DIR/$DOWNLOAD_NAME - -cp $SCRIPT_ABSOLUTE_PATH . -cp $SCRIPT_DIR/additional_alleles.txt . - - -if [ "$1" != "cluster" ] -then - GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0 - echo "Detected GPUS: $GPUS" - - PROCESSORS=$(getconf _NPROCESSORS_ONLN) - echo "Detected processors: $PROCESSORS" - - if [ "$GPUS" -eq "0" ]; then - NUM_JOBS=${NUM_JOBS-1} - else - NUM_JOBS=${NUM_JOBS-$GPUS} - fi - echo "Num jobs: $NUM_JOBS" - PARALLELISM_ARGS+=" --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1" -else - PARALLELISM_ARGS+=" --cluster-parallelism --cluster-max-retries 3 --cluster-submit-command bsub --cluster-results-workdir $HOME/mhcflurry-scratch --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.lsf" -fi - - -export PYTHONUNBUFFERED=1 - -UNSELECTED_PATH="$(mhcflurry-downloads path models_class1_pan_unselected)" - -# For now we calibrate percentile ranks only for alleles for which there -# is training data. Calibrating all alleles would be too slow. -# This could be improved though. -ALLELE_LIST=$(bzcat "$UNSELECTED_PATH/models.combined/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq) -ALLELE_LIST+=$(echo " " $(cat additional_alleles.txt | grep -v '#') ) - -for kind in combined -do - MODELS_DIR="$UNSELECTED_PATH/models.${kind}" - time mhcflurry-class1-select-pan-allele-models \ - --data "$MODELS_DIR/train_data.csv.bz2" \ - --models-dir "$MODELS_DIR" \ - --out-models-dir models.${kind} \ - --min-models 2 \ - --max-models 8 \ - $PARALLELISM_ARGS - - cp "$MODELS_DIR/train_data.csv.bz2" "models.${kind}/" - - # For now we calibrate percentile ranks only for alleles for which there - # is training data. Calibrating all alleles would be too slow. - # This could be improved though. - time mhcflurry-calibrate-percentile-ranks \ - --models-dir models.${kind} \ - --match-amino-acid-distribution-data "$MODELS_DIR/train_data.csv.bz2" \ - --motif-summary \ - --num-peptides-per-length 100000 \ - --allele $ALLELE_LIST \ - --verbosity 1 \ - $PARALLELISM_ARGS -done - -bzip2 LOG.txt -for i in $(ls LOG-worker.*.txt) ; do bzip2 $i ; done -RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2" -tar -cjf "$RESULT" * -echo "Created archive: $RESULT" diff --git a/downloads-generation/models_class1_pan_select_only/additional_alleles.txt b/downloads-generation/models_class1_pan_select_only/additional_alleles.txt deleted file mode 100644 index 37546e9e..00000000 --- a/downloads-generation/models_class1_pan_select_only/additional_alleles.txt +++ /dev/null @@ -1,3 +0,0 @@ -# Additional alleles besides those in the training data to include in percentile rank calibration -HLA-C*02:10 -HLA-A*02:20 \ No newline at end of file diff --git a/downloads-generation/models_class1_pan_select_only/cluster_submit_script_header.mssm_hpc.lsf b/downloads-generation/models_class1_pan_select_only/cluster_submit_script_header.mssm_hpc.lsf deleted file mode 100644 index ea6234dc..00000000 --- a/downloads-generation/models_class1_pan_select_only/cluster_submit_script_header.mssm_hpc.lsf +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash -#BSUB -J MHCf-{work_item_num} # Job name -#BSUB -P acc_nkcancer # allocation account or Unix group -#BSUB -q gpu # queue -#BSUB -R rusage[ngpus_excl_p=1] # 1 exclusive GPU -#BSUB -R span[hosts=1] # one node -#BSUB -n 1 # number of compute cores -#BSUB -W 46:00 # walltime in HH:MM -#BSUB -R rusage[mem=30000] # mb memory requested -#BSUB -o {work_dir}/%J.stdout # output log (%J : JobID) -#BSUB -eo {work_dir}/STDERR # error log -#BSUB -L /bin/bash # Initialize the execution environment -# - -set -e -set -x - -echo "Subsequent stderr output redirected to stdout" >&2 -exec 2>&1 - -export TMPDIR=/local/JOBS/mhcflurry-{work_item_num} -export PATH=$HOME/.conda/envs/py36b/bin/:$PATH -export PYTHONUNBUFFERED=1 -export KMP_SETTINGS=1 - -free -m - -module add cuda/10.0.130 cudnn/7.1.1 -module list - -python -c 'import tensorflow as tf ; print("GPU AVAILABLE" if tf.test.is_gpu_available() else "GPU NOT AVAILABLE")' - -env - -cd {work_dir} - -- GitLab