tweak model selection alleles

5ddf981a · Tim O'Donnell · 931e7304 · 5ddf981a · 5ddf981a · 5ddf981a
Commit 5ddf981a authored 5 years ago by Tim O'Donnell
--- a/downloads-generation/models_class1_pan/GENERATE.WITH_HPC_CLUSTER.sh
+++ b/downloads-generation/models_class1_pan/GENERATE.WITH_HPC_CLUSTER.sh
@@ -33,6 +33,7 @@ export OMP_NUM_THREADS=1
 export PYTHONUNBUFFERED=1

 cp $SCRIPT_ABSOLUTE_PATH .
+cp $SCRIPT_DIR/additional_alleles.txt .

 GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0
 echo "Detected GPUS: $GPUS"
@@ -49,6 +50,12 @@ echo "Num local jobs for model selection: $NUM_JOBS"

 UNSELECTED_PATH="$(mhcflurry-downloads path models_class1_pan_unselected)"

+# For now we calibrate percentile ranks only for alleles that appear in the
+# with_mass_spec training data, plus those listed in additional_alleles.txt.
+# Calibrating all alleles would be too slow. This could be improved though.
+ALLELE_LIST=$(bzcat "$UNSELECTED_PATH/models.with_mass_spec/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq)
+ALLELE_LIST+=$(echo " " $(cat additional_alleles.txt | grep -v '#') )
+
 for kind in with_mass_spec no_mass_spec
 do
    # Model selection is run on the cluster, although for any reasonable
@@ -72,15 +79,13 @@ do
    cp "$MODELS_DIR/train_data.csv.bz2" "models.${kind}/"

    # Percentile rank calibration is run on the cluster.
-    # For now we calibrate percentile ranks only for alleles for which there
-    # is training data. Calibrating all alleles would be too slow.
-    # This could be improved though.
+
    time mhcflurry-calibrate-percentile-ranks \
        --models-dir models.${kind} \
        --match-amino-acid-distribution-data "$MODELS_DIR/train_data.csv.bz2" \
        --motif-summary \
        --num-peptides-per-length 100000 \
-        --allele $(bzcat "$MODELS_DIR/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq) \
+        --allele $ALLELE_LIST \
        --verbosity 1 \
        --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
        --prediction-batch-size 524288 \

--- a/downloads-generation/models_class1_pan/GENERATE.sh
+++ b/downloads-generation/models_class1_pan/GENERATE.sh
@@ -25,6 +25,7 @@ git status
 cd $SCRATCH_DIR/$DOWNLOAD_NAME

 cp $SCRIPT_ABSOLUTE_PATH .
+cp $SCRIPT_DIR/additional_alleles.txt .

 GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0
 echo "Detected GPUS: $GPUS"
@@ -43,6 +44,12 @@ export PYTHONUNBUFFERED=1

 UNSELECTED_PATH="$(mhcflurry-downloads path models_class1_pan_unselected)"

+# For now we calibrate percentile ranks only for alleles that appear in the
+# with_mass_spec training data, plus those listed in additional_alleles.txt.
+# Calibrating all alleles would be too slow. This could be improved though.
+ALLELE_LIST=$(bzcat "$UNSELECTED_PATH/models.with_mass_spec/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq)
+ALLELE_LIST+=$(echo " " $(cat additional_alleles.txt | grep -v '#') )
+
 for kind in with_mass_spec no_mass_spec
 do
    MODELS_DIR="$UNSELECTED_PATH/models.${kind}"
@@ -65,7 +72,7 @@ do
        --match-amino-acid-distribution-data "$MODELS_DIR/train_data.csv.bz2" \
        --motif-summary \
        --num-peptides-per-length 100000 \
-        --allele $(bzcat "$MODELS_DIR/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq) \
+        --allele $ALLELE_LIST \
        --verbosity 1 \
        --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1
 done

--- a/mhcflurry/calibrate_percentile_ranks_command.py
+++ b/mhcflurry/calibrate_percentile_ranks_command.py
@@ -115,6 +115,8 @@ def run(argv=sys.argv[1:]):
    else:
        alleles = predictor.supported_alleles

+    alleles = sorted(set(alleles))
+
    distribution = None
    if args.match_amino_acid_distribution_data:
        distribution_peptides = pandas.read_csv(

--- a/mhcflurry/select_pan_allele_models_command.py
+++ b/mhcflurry/select_pan_allele_models_command.py
@@ -288,7 +288,9 @@ def run(argv=sys.argv[1:]):
        worker_pool.join()

    print("Model selection time %0.2f min." % (model_selection_time / 60.0))
-    print("Predictor written to: %s" % args.out_models_dir)
+    print("Predictor [%d models] written to: %s" % (
+        len(result_predictor.neural_networks),
+        args.out_models_dir))


 def do_model_select_task(item, constant_data=GLOBAL_DATA):