From 5ddf981a030890dd47c4c394f47fd8e95a9c7368 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Wed, 25 Sep 2019 13:06:53 -0400
Subject: [PATCH] tweak model selection alleles

---
 .../models_class1_pan/GENERATE.WITH_HPC_CLUSTER.sh | 13 +++++++++----
 downloads-generation/models_class1_pan/GENERATE.sh |  9 ++++++++-
 mhcflurry/calibrate_percentile_ranks_command.py    |  2 ++
 mhcflurry/select_pan_allele_models_command.py      |  4 +++-
 4 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/downloads-generation/models_class1_pan/GENERATE.WITH_HPC_CLUSTER.sh b/downloads-generation/models_class1_pan/GENERATE.WITH_HPC_CLUSTER.sh
index 0dc15cda..3ae964c2 100755
--- a/downloads-generation/models_class1_pan/GENERATE.WITH_HPC_CLUSTER.sh
+++ b/downloads-generation/models_class1_pan/GENERATE.WITH_HPC_CLUSTER.sh
@@ -33,6 +33,7 @@ export OMP_NUM_THREADS=1
 export PYTHONUNBUFFERED=1
 
 cp $SCRIPT_ABSOLUTE_PATH .
+cp $SCRIPT_DIR/additional_alleles.txt .
 
 GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0
 echo "Detected GPUS: $GPUS"
@@ -49,6 +50,12 @@ echo "Num local jobs for model selection: $NUM_JOBS"
 
 UNSELECTED_PATH="$(mhcflurry-downloads path models_class1_pan_unselected)"
 
+# For now we calibrate percentile ranks only for alleles for which there
+# is training data. Calibrating all alleles would be too slow.
+# This could be improved though.
+ALLELE_LIST=$(bzcat "$UNSELECTED_PATH/models.with_mass_spec/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq)
+ALLELE_LIST+=$(echo " " $(cat additional_alleles.txt | grep -v '#') )
+
 for kind in with_mass_spec no_mass_spec
 do
     # Model selection is run on the cluster, although for any reasonable
@@ -72,15 +79,13 @@ do
     cp "$MODELS_DIR/train_data.csv.bz2" "models.${kind}/"
 
     # Percentile rank calibration is run on the cluster.
-    # For now we calibrate percentile ranks only for alleles for which there
-    # is training data. Calibrating all alleles would be too slow.
-    # This could be improved though.
+
     time mhcflurry-calibrate-percentile-ranks \
         --models-dir models.${kind} \
         --match-amino-acid-distribution-data "$MODELS_DIR/train_data.csv.bz2" \
         --motif-summary \
         --num-peptides-per-length 100000 \
-        --allele $(bzcat "$MODELS_DIR/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq) \
+        --allele $ALLELE_LIST \
         --verbosity 1 \
         --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
         --prediction-batch-size 524288 \
diff --git a/downloads-generation/models_class1_pan/GENERATE.sh b/downloads-generation/models_class1_pan/GENERATE.sh
index fa767514..8de15f51 100755
--- a/downloads-generation/models_class1_pan/GENERATE.sh
+++ b/downloads-generation/models_class1_pan/GENERATE.sh
@@ -25,6 +25,7 @@ git status
 cd $SCRATCH_DIR/$DOWNLOAD_NAME
 
 cp $SCRIPT_ABSOLUTE_PATH .
+cp $SCRIPT_DIR/additional_alleles.txt .
 
 GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0
 echo "Detected GPUS: $GPUS"
@@ -43,6 +44,12 @@ export PYTHONUNBUFFERED=1
 
 UNSELECTED_PATH="$(mhcflurry-downloads path models_class1_pan_unselected)"
 
+# For now we calibrate percentile ranks only for alleles for which there
+# is training data. Calibrating all alleles would be too slow.
+# This could be improved though.
+ALLELE_LIST=$(bzcat "$UNSELECTED_PATH/models.with_mass_spec/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq)
+ALLELE_LIST+=$(echo " " $(cat additional_alleles.txt | grep -v '#') )
+
 for kind in with_mass_spec no_mass_spec
 do
     MODELS_DIR="$UNSELECTED_PATH/models.${kind}"
@@ -65,7 +72,7 @@ do
         --match-amino-acid-distribution-data "$MODELS_DIR/train_data.csv.bz2" \
         --motif-summary \
         --num-peptides-per-length 100000 \
-        --allele $(bzcat "$MODELS_DIR/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq) \
+        --allele $ALLELE_LIST \
         --verbosity 1 \
         --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1
 done
diff --git a/mhcflurry/calibrate_percentile_ranks_command.py b/mhcflurry/calibrate_percentile_ranks_command.py
index 9ced4161..b441262f 100644
--- a/mhcflurry/calibrate_percentile_ranks_command.py
+++ b/mhcflurry/calibrate_percentile_ranks_command.py
@@ -115,6 +115,8 @@ def run(argv=sys.argv[1:]):
     else:
         alleles = predictor.supported_alleles
 
+    alleles = sorted(set(alleles))
+
     distribution = None
     if args.match_amino_acid_distribution_data:
         distribution_peptides = pandas.read_csv(
diff --git a/mhcflurry/select_pan_allele_models_command.py b/mhcflurry/select_pan_allele_models_command.py
index b99657f4..9167ac13 100644
--- a/mhcflurry/select_pan_allele_models_command.py
+++ b/mhcflurry/select_pan_allele_models_command.py
@@ -288,7 +288,9 @@ def run(argv=sys.argv[1:]):
         worker_pool.join()
 
     print("Model selection time %0.2f min." % (model_selection_time / 60.0))
-    print("Predictor written to: %s" % args.out_models_dir)
+    print("Predictor [%d models] written to: %s" % (
+        len(result_predictor.neural_networks),
+        args.out_models_dir))
 
 
 def do_model_select_task(item, constant_data=GLOBAL_DATA):
-- 
GitLab
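For context, a minimal sketch (not part of the patch) of how the GENERATE scripts assemble the calibration allele list after this change: the alleles seen in the unselected predictor's training data are combined with the entries of additional_alleles.txt. The contents of additional_alleles.txt are an assumption here (one allele name per line, lines containing '#' treated as comments, matching the grep -v '#' filter in the patch).

    #!/bin/bash
    # Sketch under the assumptions above; run from a directory containing
    # additional_alleles.txt, with the models_class1_pan_unselected download installed.
    set -e

    UNSELECTED_PATH="$(mhcflurry-downloads path models_class1_pan_unselected)"

    # Alleles observed in the training data (drop the "allele" header, de-duplicate).
    ALLELE_LIST=$(bzcat "$UNSELECTED_PATH/models.with_mass_spec/train_data.csv.bz2" \
        | cut -f 1 -d , | grep -v allele | sort | uniq)

    # Append the extra alleles to calibrate even though they lack training data.
    ALLELE_LIST+=$(echo " " $(grep -v '#' additional_alleles.txt))

    echo "Would calibrate $(echo $ALLELE_LIST | wc -w) alleles, for example:"
    echo $ALLELE_LIST | tr ' ' '\n' | head

Any duplicates introduced by combining the two sources are harmless, since the patched calibrate_percentile_ranks_command.py now applies sorted(set(alleles)) before calibration.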