diff --git a/downloads-generation/models_class1_pan/GENERATE.WITH_HPC_CLUSTER.sh b/downloads-generation/models_class1_pan_select_only/GENERATE.WITH_HPC_CLUSTER.sh similarity index 100% rename from downloads-generation/models_class1_pan/GENERATE.WITH_HPC_CLUSTER.sh rename to downloads-generation/models_class1_pan_select_only/GENERATE.WITH_HPC_CLUSTER.sh diff --git a/downloads-generation/models_class1_pan/GENERATE.sh b/downloads-generation/models_class1_pan_select_only/GENERATE.sh similarity index 100% rename from downloads-generation/models_class1_pan/GENERATE.sh rename to downloads-generation/models_class1_pan_select_only/GENERATE.sh diff --git a/downloads-generation/models_class1_pan/additional_alleles.txt b/downloads-generation/models_class1_pan_select_only/additional_alleles.txt similarity index 100% rename from downloads-generation/models_class1_pan/additional_alleles.txt rename to downloads-generation/models_class1_pan_select_only/additional_alleles.txt diff --git a/downloads-generation/models_class1_pan/cluster_submit_script_header.mssm_hpc.lsf b/downloads-generation/models_class1_pan_select_only/cluster_submit_script_header.mssm_hpc.lsf similarity index 100% rename from downloads-generation/models_class1_pan/cluster_submit_script_header.mssm_hpc.lsf rename to downloads-generation/models_class1_pan_select_only/cluster_submit_script_header.mssm_hpc.lsf diff --git a/downloads-generation/models_class1_pan_unselected/GENERATE.WITH_HPC_CLUSTER.sh b/downloads-generation/models_class1_pan_unselected/GENERATE.WITH_HPC_CLUSTER.sh index 4706e28316bb6f279d7cc2dd0d23e36f929f776e..53125eb7bec329ecbbd0d230b8afe809c1064204 100755 --- a/downloads-generation/models_class1_pan_unselected/GENERATE.WITH_HPC_CLUSTER.sh +++ b/downloads-generation/models_class1_pan_unselected/GENERATE.WITH_HPC_CLUSTER.sh @@ -1,94 +1 @@ -#!/bin/bash -# -# Train pan-allele MHCflurry Class I models. Supports re-starting a failed run. 
-# -# Uses an HPC cluster (Mount Sinai chimera cluster, which uses lsf job -# scheduler). This would need to be modified for other sites. -# -set -e -set -x - -DOWNLOAD_NAME=models_class1_pan_unselected -SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation -SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")" -SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH") - -mkdir -p "$SCRATCH_DIR" -if [ "$1" != "continue-incomplete" ] -then - echo "Fresh run" - rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME" - mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME" -else - echo "Continuing incomplete run" -fi - -# Send stdout and stderr to a logfile included with the archive. -LOG="$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.$(date +%s).txt" -exec > >(tee -ia "$LOG") -exec 2> >(tee -ia "$LOG" >&2) - -# Log some environment info -echo "Invocation: $0 $@" -date -pip freeze -git status - -mhcflurry-downloads fetch data_curated allele_sequences random_peptide_predictions - -cd $SCRATCH_DIR/$DOWNLOAD_NAME - -export OMP_NUM_THREADS=1 -export PYTHONUNBUFFERED=1 - -if [ "$1" != "continue-incomplete" ] -then - cp $SCRIPT_DIR/generate_hyperparameters.py . 
- python generate_hyperparameters.py > hyperparameters.yaml -fi - -for kind in combined -do - EXTRA_TRAIN_ARGS="" - if [ "$1" == "continue-incomplete" ] && [ -d "models.${kind}" ] - then - echo "Will continue existing run: $kind" - EXTRA_TRAIN_ARGS="--continue-incomplete" - fi - - mhcflurry-class1-train-pan-allele-models \ - --data "$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" \ - --allele-sequences "$(mhcflurry-downloads path allele_sequences)/allele_sequences.csv" \ - --pretrain-data "$(mhcflurry-downloads path random_peptide_predictions)/predictions.csv.bz2" \ - --held-out-measurements-per-allele-fraction-and-max 0.25 100 \ - --num-folds 4 \ - --hyperparameters hyperparameters.yaml \ - --out-models-dir $(pwd)/models.${kind} \ - --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \ - --verbosity 0 \ - --cluster-parallelism \ - --cluster-submit-command bsub \ - --cluster-results-workdir ~/mhcflurry-scratch \ - --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.lsf \ - $EXTRA_TRAIN_ARGS -done - -cp $SCRIPT_ABSOLUTE_PATH . -bzip2 -f "$LOG" -for i in $(ls LOG-worker.*.txt) ; do bzip2 -f $i ; done -RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2" -tar -cjf "$RESULT" * -echo "Created archive: $RESULT" - -# Split into <2GB chunks for GitHub -PARTS="${RESULT}.part." -# Check for pre-existing part files and rename them. -for i in $(ls "${PARTS}"* ) -do - DEST="${i}.OLD.$(date +%s)" - echo "WARNING: already exists: $i . 
Moving to $DEST" - mv $i $DEST -done -split -b 2000M "$RESULT" "$PARTS" -echo "Split into parts:" -ls -lh "${PARTS}"* +bash GENERATE.sh cluster diff --git a/downloads-generation/models_class1_pan_unselected/GENERATE.sh b/downloads-generation/models_class1_pan_unselected/GENERATE.sh index b6f2efae4e5cb259ff36ff6db02ced02e5c7e5a9..a161aa0e0497290f246b0422a6c763db9377626a 100755 --- a/downloads-generation/models_class1_pan_unselected/GENERATE.sh +++ b/downloads-generation/models_class1_pan_unselected/GENERATE.sh @@ -2,6 +2,11 @@ # # Train pan-allele MHCflurry Class I models. Supports re-starting a failed run. # +# Usage: GENERATE.sh <local|cluster> <fresh|continue-incomplete> +# +# cluster mode uses an HPC cluster (Mount Sinai chimera cluster, which uses lsf job +# scheduler). This would need to be modified for other sites. +# set -e set -x @@ -10,8 +15,27 @@ SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")" SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH") +if [ "$1" != "cluster" ] +then + GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0 + echo "Detected GPUS: $GPUS" + + PROCESSORS=$(getconf _NPROCESSORS_ONLN) + echo "Detected processors: $PROCESSORS" + + if [ "$GPUS" -eq "0" ]; then + NUM_JOBS=${NUM_JOBS-1} + else + NUM_JOBS=${NUM_JOBS-$GPUS} + fi + echo "Num jobs: $NUM_JOBS" + PARALLELISM_ARGS+=" --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1" +else + PARALLELISM_ARGS+=" --cluster-parallelism --cluster-max-retries 3 --cluster-submit-command bsub --cluster-results-workdir $HOME/mhcflurry-scratch --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.gpu.lsf" +fi + mkdir -p "$SCRATCH_DIR" -if [ "$1" != "continue-incomplete" ] +if [ "$2" != "continue-incomplete" ] then echo "Fresh run" rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME" @@ -31,29 +55,14 @@ date pip freeze git status -mhcflurry-downloads fetch 
data_curated allele_sequences random_peptide_predictions - cd $SCRATCH_DIR/$DOWNLOAD_NAME -cp $SCRIPT_DIR/generate_hyperparameters.py . -python generate_hyperparameters.py > hyperparameters.yaml - -GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0 -echo "Detected GPUS: $GPUS" - -PROCESSORS=$(getconf _NPROCESSORS_ONLN) -echo "Detected processors: $PROCESSORS" - -if [ "$GPUS" -eq "0" ]; then - NUM_JOBS=${NUM_JOBS-1} -else - NUM_JOBS=${NUM_JOBS-$GPUS} -fi -echo "Num jobs: $NUM_JOBS" - +export OMP_NUM_THREADS=1 export PYTHONUNBUFFERED=1 -if [ "$1" != "continue-incomplete" ] +cp $SCRIPT_DIR/additional_alleles.txt . + +if [ "$2" != "continue-incomplete" ] then cp $SCRIPT_DIR/generate_hyperparameters.py . python generate_hyperparameters.py > hyperparameters.yaml @@ -61,25 +70,60 @@ fi for kind in combined do - EXTRA_TRAIN_ARGS="" - if [ "$1" == "continue-incomplete" ] && [ -d "models.${kind}" ] + CONTINUE_INCOMPLETE_ARGS="" + if [ "$2" == "continue-incomplete" ] && [ -d "models.unselected.${kind}" ] then echo "Will continue existing run: $kind" - EXTRA_TRAIN_ARGS="--continue-incomplete" + CONTINUE_INCOMPLETE_ARGS="--continue-incomplete" fi + ALLELE_SEQUENCES="$(mhcflurry-downloads path allele_sequences)/allele_sequences.csv" + TRAINING_DATA="$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" + mhcflurry-class1-train-pan-allele-models \ - --data "$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" \ - --allele-sequences "$(mhcflurry-downloads path allele_sequences)/allele_sequences.csv" \ + --data "$TRAINING_DATA" \ + --allele-sequences "$ALLELE_SEQUENCES" \ --pretrain-data "$(mhcflurry-downloads path random_peptide_predictions)/predictions.csv.bz2" \ --held-out-measurements-per-allele-fraction-and-max 0.25 100 \ --num-folds 4 \ - --hyperparameters hyperparameters.yaml \ - --out-models-dir models.${kind} \ + --hyperparameters hyperparameters.yaml \ + --out-models-dir $(pwd)/models.unselected.${kind} \ --worker-log-dir 
"$SCRATCH_DIR/$DOWNLOAD_NAME" \ - --verbosity 0 \ - --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1 \ - $EXTRA_TRAIN_ARGS + $PARALLELISM_ARGS $CONTINUE_INCOMPLETE_ARGS +done + +echo "Done training. Beginning model selection." + +for kind in combined +do + MODELS_DIR="models.unselected.${kind}" + + # For now we calibrate percentile ranks only for alleles for which there + # is training data. Calibrating all alleles would be too slow. + # This could be improved though. + ALLELE_LIST=$(bzcat "$MODELS_DIR/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq) + ALLELE_LIST+=$(echo " " $(cat additional_alleles.txt | grep -v '#') ) + + mhcflurry-class1-select-pan-allele-models \ + --data "$MODELS_DIR/train_data.csv.bz2" \ + --models-dir "$MODELS_DIR" \ + --out-models-dir models.${kind} \ + --min-models 2 \ + --max-models 8 \ + $PARALLELISM_ARGS + cp "$MODELS_DIR/train_data.csv.bz2" "models.${kind}/train_data.csv.bz2" + + # For now we calibrate percentile ranks only for alleles for which there + # is training data. Calibrating all alleles would be too slow. + # This could be improved though. + time mhcflurry-calibrate-percentile-ranks \ + --models-dir models.${kind} \ + --match-amino-acid-distribution-data "$MODELS_DIR/train_data.csv.bz2" \ + --motif-summary \ + --num-peptides-per-length 100000 \ + --allele $ALLELE_LIST \ + --verbosity 1 \ + $PARALLELISM_ARGS done cp $SCRIPT_ABSOLUTE_PATH . @@ -102,3 +146,11 @@ split -b 2000M "$RESULT" "$PARTS" echo "Split into parts:" ls -lh "${PARTS}"* +# Write out just the selected models +# Move unselected into a hidden dir so it is excluded in the glob (*). +mkdir .ignored +mv models.unselected.* .ignored/ +RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.selected.$(date +%Y%m%d).tar.bz2" +tar -cjf "$RESULT" * +mv .ignored/* . 
&& rmdir .ignored +echo "Created archive: $RESULT" diff --git a/downloads-generation/models_class1_pan_unselected/additional_alleles.txt b/downloads-generation/models_class1_pan_unselected/additional_alleles.txt new file mode 100644 index 0000000000000000000000000000000000000000..37546e9e67154a141494f743ec5278900ca973cd --- /dev/null +++ b/downloads-generation/models_class1_pan_unselected/additional_alleles.txt @@ -0,0 +1,3 @@ +# Additional alleles besides those in the training data to include in percentile rank calibration +HLA-C*02:10 +HLA-A*02:20 \ No newline at end of file