From 5ddf981a030890dd47c4c394f47fd8e95a9c7368 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Wed, 25 Sep 2019 13:06:53 -0400
Subject: [PATCH] tweak model selection alleles
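
Calibrate percentile ranks for the alleles listed in additional_alleles.txt
in addition to the alleles present in the training data, deduplicate the
allele list in mhcflurry-calibrate-percentile-ranks, and report the number
of models written by pan-allele model selection.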

---
 .../models_class1_pan/GENERATE.WITH_HPC_CLUSTER.sh  | 13 +++++++++----
 downloads-generation/models_class1_pan/GENERATE.sh  |  9 ++++++++-
 mhcflurry/calibrate_percentile_ranks_command.py     |  2 ++
 mhcflurry/select_pan_allele_models_command.py       |  4 +++-
 4 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/downloads-generation/models_class1_pan/GENERATE.WITH_HPC_CLUSTER.sh b/downloads-generation/models_class1_pan/GENERATE.WITH_HPC_CLUSTER.sh
index 0dc15cda..3ae964c2 100755
--- a/downloads-generation/models_class1_pan/GENERATE.WITH_HPC_CLUSTER.sh
+++ b/downloads-generation/models_class1_pan/GENERATE.WITH_HPC_CLUSTER.sh
@@ -33,6 +33,7 @@ export OMP_NUM_THREADS=1
 export PYTHONUNBUFFERED=1
 
 cp $SCRIPT_ABSOLUTE_PATH .
+cp $SCRIPT_DIR/additional_alleles.txt .
 
 GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0
 echo "Detected GPUS: $GPUS"
@@ -49,6 +50,12 @@ echo "Num local jobs for model selection: $NUM_JOBS"
 
 UNSELECTED_PATH="$(mhcflurry-downloads path models_class1_pan_unselected)"
 
+# For now we calibrate percentile ranks only for alleles with training data
+# plus any alleles listed in additional_alleles.txt. Calibrating all
+# supported alleles would be too slow, although this could be improved.
+ALLELE_LIST=$(bzcat "$UNSELECTED_PATH/models.with_mass_spec/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq)
+ALLELE_LIST+=$(echo " " $(cat additional_alleles.txt | grep -v '#') )
+
 for kind in with_mass_spec no_mass_spec
 do
     # Model selection is run on the cluster, although for any reasonable
@@ -72,15 +79,13 @@ do
     cp "$MODELS_DIR/train_data.csv.bz2" "models.${kind}/"
 
     # Percentile rank calibration is run on the cluster.
-    # For now we calibrate percentile ranks only for alleles for which there
-    # is training data. Calibrating all alleles would be too slow.
-    # This could be improved though.
+
     time mhcflurry-calibrate-percentile-ranks \
         --models-dir models.${kind} \
         --match-amino-acid-distribution-data "$MODELS_DIR/train_data.csv.bz2" \
         --motif-summary \
         --num-peptides-per-length 100000 \
-        --allele $(bzcat "$MODELS_DIR/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq) \
+        --allele $ALLELE_LIST \
         --verbosity 1 \
         --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
         --prediction-batch-size 524288 \
diff --git a/downloads-generation/models_class1_pan/GENERATE.sh b/downloads-generation/models_class1_pan/GENERATE.sh
index fa767514..8de15f51 100755
--- a/downloads-generation/models_class1_pan/GENERATE.sh
+++ b/downloads-generation/models_class1_pan/GENERATE.sh
@@ -25,6 +25,7 @@ git status
 cd $SCRATCH_DIR/$DOWNLOAD_NAME
 
 cp $SCRIPT_ABSOLUTE_PATH .
+cp $SCRIPT_DIR/additional_alleles.txt .
 
 GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0
 echo "Detected GPUS: $GPUS"
@@ -43,6 +44,12 @@ export PYTHONUNBUFFERED=1
 
 UNSELECTED_PATH="$(mhcflurry-downloads path models_class1_pan_unselected)"
 
+# For now we calibrate percentile ranks only for alleles with training data
+# plus any alleles listed in additional_alleles.txt. Calibrating all
+# supported alleles would be too slow, although this could be improved.
+ALLELE_LIST=$(bzcat "$UNSELECTED_PATH/models.with_mass_spec/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq)
+ALLELE_LIST+=$(echo " " $(cat additional_alleles.txt | grep -v '#') )
+
 for kind in with_mass_spec no_mass_spec
 do
     MODELS_DIR="$UNSELECTED_PATH/models.${kind}"
@@ -65,7 +72,7 @@ do
         --match-amino-acid-distribution-data "$MODELS_DIR/train_data.csv.bz2" \
         --motif-summary \
         --num-peptides-per-length 100000 \
-        --allele $(bzcat "$MODELS_DIR/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq) \
+        --allele $ALLELE_LIST \
         --verbosity 1 \
         --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1
 done
diff --git a/mhcflurry/calibrate_percentile_ranks_command.py b/mhcflurry/calibrate_percentile_ranks_command.py
index 9ced4161..b441262f 100644
--- a/mhcflurry/calibrate_percentile_ranks_command.py
+++ b/mhcflurry/calibrate_percentile_ranks_command.py
@@ -115,6 +115,8 @@ def run(argv=sys.argv[1:]):
     else:
         alleles = predictor.supported_alleles
 
+    alleles = sorted(set(alleles))
+
     distribution = None
     if args.match_amino_acid_distribution_data:
         distribution_peptides = pandas.read_csv(
diff --git a/mhcflurry/select_pan_allele_models_command.py b/mhcflurry/select_pan_allele_models_command.py
index b99657f4..9167ac13 100644
--- a/mhcflurry/select_pan_allele_models_command.py
+++ b/mhcflurry/select_pan_allele_models_command.py
@@ -288,7 +288,9 @@ def run(argv=sys.argv[1:]):
         worker_pool.join()
 
     print("Model selection time %0.2f min." % (model_selection_time / 60.0))
-    print("Predictor written to: %s" % args.out_models_dir)
+    print("Predictor [%d models] written to: %s" % (
+        len(result_predictor.neural_networks),
+        args.out_models_dir))
 
 
 def do_model_select_task(item, constant_data=GLOBAL_DATA):
-- 
GitLab