Skip to content
Snippets Groups Projects
Commit 0464808e authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

update

parent 160376eb
No related branches found
No related tags found
No related merge requests found
#!/bin/bash
#
# Model select pan-allele MHCflurry Class I models and calibrate percentile ranks.
#
# Usage: bash GENERATE.sh cluster
#
# Uses an HPC cluster (Mount Sinai chimera cluster, which uses the LSF job
# scheduler). This would need to be modified for other sites.
#

# Abort on the first failing command and trace every command that is run.
set -e
set -x

DOWNLOAD_NAME=models_class1_pan
SCRATCH_DIR="${TMPDIR-/tmp}/mhcflurry-downloads-generation"
SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")

# Start from an empty output directory. The ":?" guards make the rm abort
# instead of touching an unintended path if either variable is ever empty.
mkdir -p "$SCRATCH_DIR"
rm -rf "${SCRATCH_DIR:?}/${DOWNLOAD_NAME:?}"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"

# Send stdout and stderr to a logfile included with the archive.
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)

# Log some environment info. "$*" joins the arguments into the single string
# being echoed ("$@" inside a larger quoted string splits it into several).
echo "Invocation: $0 $*"
date
pip freeze
git status

# Everything below runs inside the output directory, so relative paths such
# as models.<kind> and LOG.txt resolve there.
cd "$SCRATCH_DIR/$DOWNLOAD_NAME"

export OMP_NUM_THREADS=1
export PYTHONUNBUFFERED=1

# Keep a copy of this script and of the extra allele list with the outputs.
cp "$SCRIPT_ABSOLUTE_PATH" .
cp "$SCRIPT_DIR/additional_alleles.txt" .
# Count GPUs as the number of lines printed by `nvidia-smi -L`; its stderr is
# discarded. Fall back to 0 if the counting pipeline itself reports failure.
if ! GPUS=$(nvidia-smi -L 2> /dev/null | wc -l); then
    GPUS=0
fi
echo "Detected GPUS: $GPUS"

# Number of processors currently online, as reported by getconf.
PROCESSORS=$(getconf _NPROCESSORS_ONLN)
echo "Detected processors: $PROCESSORS"

# A NUM_JOBS value already present in the environment always wins. Otherwise
# default to one job when no GPU was found, or one job per GPU.
if [ "$GPUS" -eq "0" ]; then
    : "${NUM_JOBS=1}"
else
    : "${NUM_JOBS=$GPUS}"
fi
echo "Num local jobs for model selection: $NUM_JOBS"
# Location of the unselected pan-allele models, as reported by
# mhcflurry-downloads.
UNSELECTED_PATH="$(mhcflurry-downloads path models_class1_pan_unselected)"

# Percentile ranks are currently calibrated only for alleles that appear in
# the training data, because calibrating every allele would be too slow.
# There is room for improvement here.
#
# Take the first comma-separated column of the training CSV, drop lines
# containing "allele", and reduce the rest to a sorted list of unique values.
ALLELE_LIST=$(
    bzcat "$UNSELECTED_PATH/models.with_mass_spec/train_data.csv.bz2" \
        | cut -d , -f 1 \
        | grep -v allele \
        | uniq \
        | sort \
        | uniq
)

# Append the entries of additional_alleles.txt (lines containing '#' are
# skipped). The inner substitution is left unquoted on purpose so that the
# entries are collapsed onto one space-separated line.
ALLELE_LIST="${ALLELE_LIST}$(echo " " $(grep -v '#' < additional_alleles.txt) )"
# Flags shared by both cluster-backed commands below. Keeping them in one
# array avoids the two copies drifting apart and keeps the script-prefix path
# intact even if SCRIPT_DIR contains whitespace.
CLUSTER_ARGS=(
    --cluster-parallelism
    --cluster-max-retries 15
    --cluster-submit-command bsub
    --cluster-results-workdir ~/mhcflurry-scratch
    --cluster-script-prefix-path "$SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.lsf"
)

for kind in with_mass_spec no_mass_spec
do
    # Model selection is run on the cluster, although for any reasonable
    # machine it could be run locally. We run on the cluster because our
    # cluster login nodes are often overloaded.
    MODELS_DIR="$UNSELECTED_PATH/models.${kind}"
    time mhcflurry-class1-select-pan-allele-models \
        --data "$MODELS_DIR/train_data.csv.bz2" \
        --models-dir "$MODELS_DIR" \
        --out-models-dir "models.${kind}" \
        --min-models 2 \
        --max-models 8 \
        --verbosity 1 \
        --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
        "${CLUSTER_ARGS[@]}"

    # Ship the training data alongside the selected models.
    cp "$MODELS_DIR/train_data.csv.bz2" "models.${kind}/"

    # Percentile rank calibration is run on the cluster.
    # ALLELE_LIST is a whitespace-separated list and must be split into one
    # argument per allele, so it is deliberately left unquoted.
    # shellcheck disable=SC2086
    time mhcflurry-calibrate-percentile-ranks \
        --models-dir "models.${kind}" \
        --match-amino-acid-distribution-data "$MODELS_DIR/train_data.csv.bz2" \
        --motif-summary \
        --num-peptides-per-length 100000 \
        --allele $ALLELE_LIST \
        --verbosity 1 \
        --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
        --prediction-batch-size 524288 \
        "${CLUSTER_ARGS[@]}"
done
# Compress the main log and every per-worker log in the current directory.
bzip2 LOG.txt

# Iterate with a glob rather than parsing `ls` output, so file names are never
# word-split. When nothing matches, the pattern stays literal and the -e test
# skips it instead of handing a non-existent name to bzip2.
for worker_log in LOG-worker.*.txt; do
    [ -e "$worker_log" ] || continue
    bzip2 "$worker_log"
done

# Bundle everything in the current directory into a date-stamped archive
# placed in the scratch directory.
RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
tar -cjf "$RESULT" *
echo "Created archive: $RESULT"
#!/bin/bash #!/bin/bash
# Model select pan-allele MHCflurry Class I models and calibrate percentile ranks. # Model select pan-allele MHCflurry Class I models and calibrate percentile ranks.
# #
# Usage: GENERATE.sh <local|cluster>
#
set -e set -e
set -x set -x
...@@ -27,18 +29,26 @@ cd $SCRATCH_DIR/$DOWNLOAD_NAME ...@@ -27,18 +29,26 @@ cd $SCRATCH_DIR/$DOWNLOAD_NAME
cp $SCRIPT_ABSOLUTE_PATH . cp $SCRIPT_ABSOLUTE_PATH .
cp $SCRIPT_DIR/additional_alleles.txt . cp $SCRIPT_DIR/additional_alleles.txt .
GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0
echo "Detected GPUS: $GPUS"
PROCESSORS=$(getconf _NPROCESSORS_ONLN) if [ "$1" != "cluster" ]
echo "Detected processors: $PROCESSORS" then
GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0
echo "Detected GPUS: $GPUS"
PROCESSORS=$(getconf _NPROCESSORS_ONLN)
echo "Detected processors: $PROCESSORS"
if [ "$GPUS" -eq "0" ]; then if [ "$GPUS" -eq "0" ]; then
NUM_JOBS=${NUM_JOBS-1} NUM_JOBS=${NUM_JOBS-1}
else
NUM_JOBS=${NUM_JOBS-$GPUS}
fi
echo "Num jobs: $NUM_JOBS"
PARALLELISM_ARGS+=" --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1"
else else
NUM_JOBS=${NUM_JOBS-$GPUS} PARALLELISM_ARGS+=" --cluster-parallelism --cluster-max-retries 3 --cluster-submit-command bsub --cluster-results-workdir $HOME/mhcflurry-scratch --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.lsf"
fi fi
echo "Num jobs: $NUM_JOBS"
export PYTHONUNBUFFERED=1 export PYTHONUNBUFFERED=1
...@@ -50,7 +60,7 @@ UNSELECTED_PATH="$(mhcflurry-downloads path models_class1_pan_unselected)" ...@@ -50,7 +60,7 @@ UNSELECTED_PATH="$(mhcflurry-downloads path models_class1_pan_unselected)"
ALLELE_LIST=$(bzcat "$UNSELECTED_PATH/models.with_mass_spec/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq) ALLELE_LIST=$(bzcat "$UNSELECTED_PATH/models.with_mass_spec/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq)
ALLELE_LIST+=$(echo " " $(cat additional_alleles.txt | grep -v '#') ) ALLELE_LIST+=$(echo " " $(cat additional_alleles.txt | grep -v '#') )
for kind in with_mass_spec no_mass_spec for kind in combined
do do
MODELS_DIR="$UNSELECTED_PATH/models.${kind}" MODELS_DIR="$UNSELECTED_PATH/models.${kind}"
time mhcflurry-class1-select-pan-allele-models \ time mhcflurry-class1-select-pan-allele-models \
...@@ -59,8 +69,7 @@ do ...@@ -59,8 +69,7 @@ do
--out-models-dir models.${kind} \ --out-models-dir models.${kind} \
--min-models 2 \ --min-models 2 \
--max-models 8 \ --max-models 8 \
--num-jobs 0 \ $PARALLELISM_ARGS
--num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1
cp "$MODELS_DIR/train_data.csv.bz2" "models.${kind}/" cp "$MODELS_DIR/train_data.csv.bz2" "models.${kind}/"
...@@ -74,7 +83,7 @@ do ...@@ -74,7 +83,7 @@ do
--num-peptides-per-length 100000 \ --num-peptides-per-length 100000 \
--allele $ALLELE_LIST \ --allele $ALLELE_LIST \
--verbosity 1 \ --verbosity 1 \
--num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1 $PARALLELISM_ARGS
done done
bzip2 LOG.txt bzip2 LOG.txt
......
...@@ -26,7 +26,7 @@ releases: ...@@ -26,7 +26,7 @@ releases:
- name: models_class1_pan_unselected - name: models_class1_pan_unselected
part_urls: part_urls:
- https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/models_class1_pan_unselected.20190924.tar.bz2.part.aa - https://github.com/openvax/mhcflurry/releases/download/1.4.0/models_class1_pan_unselected.20191221.tar.bz2.part.aa
default: false default: false
- name: models_class1_pan_refined - name: models_class1_pan_refined
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment