From 3fb579aee2c61884164f4fccd93e78d797da767c Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Tue, 27 Aug 2019 13:12:03 -0400
Subject: [PATCH] add test_calibrate_percentile_ranks_command.py; support multiple summary cutoff fractions

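Also:

* Increase --num-peptides-per-length from 100000 to 1000000 in the
  models_class1_pan GENERATE scripts, and re-enable the no_mass_spec
  kind in the GENERATE.sh model selection/calibration loop.
* Change --summary-top-peptide-fraction to accept multiple values
  (nargs="+"). Motif frequency matrices and length distributions are
  now computed once per cutoff fraction and annotated with
  cutoff_fraction and cutoff_count columns.
* Rename the summary_top_peptide_fraction argument of
  Class1AffinityPredictor.calibrate_percentile_ranks to
  summary_top_peptide_fractions, which takes a list.
* Pass constant_data explicitly to do_calibrate_percentile_ranks so
  the command also works with cluster parallelism.
* Fix cluster_parallelism.worker_entry_point to write pickled results
  to args.result_out rather than args.out.

The new test exercises the command serially, with two worker
processes, and with simulated cluster parallelism. Example invocation
with multiple cutoff fractions (models dir and values illustrative;
flags as used in the new test):

    mhcflurry-calibrate-percentile-ranks \
        --models-dir models \
        --motif-summary \
        --summary-top-peptide-fraction 0.0001 0.001 0.01 \
        --num-peptides-per-length 1000 \
        --allele "HLA-A*02:01" "HLA-B*07:02" \
        --num-jobs 2
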
---
 .../GENERATE.WITH_HPC_CLUSTER.sh              |  2 +-
 .../models_class1_pan/GENERATE.sh             |  4 +-
 .../calibrate_percentile_ranks_command.py     | 23 +++---
 mhcflurry/class1_affinity_predictor.py        | 63 +++++++++------
 mhcflurry/cluster_parallelism.py              |  2 +-
 ...test_calibrate_percentile_ranks_command.py | 78 +++++++++++++++++++
 test/test_train_pan_allele_models_command.py  |  8 +-
 7 files changed, 132 insertions(+), 48 deletions(-)
 create mode 100644 test/test_calibrate_percentile_ranks_command.py

diff --git a/downloads-generation/models_class1_pan/GENERATE.WITH_HPC_CLUSTER.sh b/downloads-generation/models_class1_pan/GENERATE.WITH_HPC_CLUSTER.sh
index f2ec20b6..7a31a652 100755
--- a/downloads-generation/models_class1_pan/GENERATE.WITH_HPC_CLUSTER.sh
+++ b/downloads-generation/models_class1_pan/GENERATE.WITH_HPC_CLUSTER.sh
@@ -70,7 +70,7 @@ do
         --models-dir models.${kind} \
         --match-amino-acid-distribution-data "$MODELS_DIR/train_data.csv.bz2" \
         --motif-summary \
-        --num-peptides-per-length 100000 \
+        --num-peptides-per-length 1000000 \
         --allele $(bzcat "$MODELS_DIR/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq) \
         --verbosity 1 \
         --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
diff --git a/downloads-generation/models_class1_pan/GENERATE.sh b/downloads-generation/models_class1_pan/GENERATE.sh
index f0699f52..216942dc 100755
--- a/downloads-generation/models_class1_pan/GENERATE.sh
+++ b/downloads-generation/models_class1_pan/GENERATE.sh
@@ -43,7 +43,7 @@ export PYTHONUNBUFFERED=1
 
 UNSELECTED_PATH="$(mhcflurry-downloads path models_class1_pan_unselected)"
 
-for kind in with_mass_spec #no_mass_spec
+for kind in with_mass_spec no_mass_spec
 do
     MODELS_DIR="$UNSELECTED_PATH/models.${kind}"
     time mhcflurry-class1-select-pan-allele-models \
@@ -62,7 +62,7 @@ do
         --models-dir models.${kind} \
         --match-amino-acid-distribution-data "$MODELS_DIR/train_data.csv.bz2" \
         --motif-summary \
-        --num-peptides-per-length 100000 \
+        --num-peptides-per-length 1000000 \
         --allele $(bzcat "$MODELS_DIR/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq) \
         --verbosity 1 \
         --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1
diff --git a/mhcflurry/calibrate_percentile_ranks_command.py b/mhcflurry/calibrate_percentile_ranks_command.py
index c0a01965..3ba3f7ab 100644
--- a/mhcflurry/calibrate_percentile_ranks_command.py
+++ b/mhcflurry/calibrate_percentile_ranks_command.py
@@ -67,7 +67,8 @@ parser.add_argument(
     help="Calculate motifs and length preferences for each allele")
 parser.add_argument(
     "--summary-top-peptide-fraction",
-    default=0.001,
+    default=[0.0001, 0.001, 0.01, 0.1, 1.0],
+    nargs="+",
     type=float,
     metavar="X",
     help="The top X fraction of predictions (i.e. tightest binders) to use to "
@@ -145,7 +146,7 @@ def run(argv=sys.argv[1:]):
     GLOBAL_DATA["predictor"] = predictor
     GLOBAL_DATA["args"] = {
         'motif_summary': args.motif_summary,
-        'summary_top_peptide_fraction': args.summary_top_peptide_fraction,
+        'summary_top_peptide_fractions': args.summary_top_peptide_fraction,
         'verbose': args.verbosity > 0
     }
 
@@ -203,12 +204,12 @@ def run(argv=sys.argv[1:]):
     print("Predictor written to: %s" % args.models_dir)
 
 
-def do_calibrate_percentile_ranks(allele):
+def do_calibrate_percentile_ranks(allele, constant_data=GLOBAL_DATA):
     return calibrate_percentile_ranks(
         allele,
-        GLOBAL_DATA['predictor'],
-        peptides=GLOBAL_DATA['calibration_peptides'],
-        **GLOBAL_DATA["args"])
+        constant_data['predictor'],
+        peptides=constant_data['calibration_peptides'],
+        **constant_data["args"])
 
 
 def calibrate_percentile_ranks(
@@ -216,19 +217,13 @@ def calibrate_percentile_ranks(
         predictor,
         peptides=None,
         motif_summary=False,
-        summary_top_peptide_fraction=0.001,
+        summary_top_peptide_fractions=[0.001],
         verbose=False):
-    """
-    Private helper function.
-    """
-    global GLOBAL_DATA
-    if peptides is None:
-        peptides = GLOBAL_DATA["calibration_peptides"]
     summary_results = predictor.calibrate_percentile_ranks(
         peptides=peptides,
         alleles=[allele],
         motif_summary=motif_summary,
-        summary_top_peptide_fraction=summary_top_peptide_fraction,
+        summary_top_peptide_fractions=summary_top_peptide_fractions,
         verbose=verbose)
     transforms = {
         allele: predictor.allele_to_percent_rank_transform[allele],
diff --git a/mhcflurry/class1_affinity_predictor.py b/mhcflurry/class1_affinity_predictor.py
index 1f2507e8..241560d9 100644
--- a/mhcflurry/class1_affinity_predictor.py
+++ b/mhcflurry/class1_affinity_predictor.py
@@ -1156,7 +1156,7 @@ class Class1AffinityPredictor(object):
             alleles=None,
             bins=None,
             motif_summary=False,
-            summary_top_peptide_fraction=0.001,
+            summary_top_peptide_fractions=[0.001],
             verbose=False):
         """
         Compute the cumulative distribution of ic50 values for a set of alleles
@@ -1229,32 +1229,45 @@ class Class1AffinityPredictor(object):
                 }).drop_duplicates('peptide').set_index("peptide")
                 predictions_df["length"] = predictions_df.index.str.len()
                 for (length, sub_df) in predictions_df.groupby("length"):
-                    selected = sub_df.prediction.nsmallest(
-                        max(
-                            int(len(sub_df) * summary_top_peptide_fraction),
-                            1)).index.values
-                    matrix = positional_frequency_matrix(selected).reset_index()
-                    original_columns = list(matrix.columns)
-                    matrix["length"] = length
-                    matrix["allele"] = allele
-                    matrix = matrix[["allele", "length"] + original_columns]
-                    frequency_matrices.append(matrix)
+                    for cutoff_fraction in summary_top_peptide_fractions:
+                        selected = sub_df.prediction.nsmallest(
+                            max(
+                                int(len(sub_df) * cutoff_fraction),
+                                1)).index.values
+                        matrix = positional_frequency_matrix(selected).reset_index()
+                        original_columns = list(matrix.columns)
+                        matrix["allele"] = allele
+                        matrix["length"] = length
+                        matrix["cutoff_fraction"] = cutoff_fraction
+                        matrix["cutoff_count"] = len(selected)
+                        matrix = matrix[
+                            ["allele", "length", "cutoff_fraction", "cutoff_count"]
+                            + original_columns
+                        ]
+                        frequency_matrices.append(matrix)
 
                 # Length distribution
-                length_distribution = predictions_df.prediction.nsmallest(
-                    max(
-                        int(len(predictions_df) * summary_top_peptide_fraction),
-                        1)).index.str.len().value_counts()
-                length_distribution.index.name = "length"
-                length_distribution /= length_distribution.sum()
-                length_distribution = length_distribution.to_frame()
-                length_distribution.columns = ["fraction"]
-                length_distribution = length_distribution.reset_index()
-                length_distribution["allele"] = allele
-                length_distribution = length_distribution[
-                    ["allele", "length", "fraction"]
-                ].sort_values("length")
-                length_distributions.append(length_distribution)
+                for cutoff_fraction in summary_top_peptide_fractions:
+                    cutoff_count = max(
+                        int(len(predictions_df) * cutoff_fraction), 1)
+                    length_distribution = predictions_df.prediction.nsmallest(
+                        cutoff_count).index.str.len().value_counts()
+                    length_distribution.index.name = "length"
+                    length_distribution /= length_distribution.sum()
+                    length_distribution = length_distribution.to_frame()
+                    length_distribution.columns = ["fraction"]
+                    length_distribution = length_distribution.reset_index()
+                    length_distribution["allele"] = allele
+                    length_distribution["cutoff_fraction"] = cutoff_fraction
+                    length_distribution["cutoff_count"] = cutoff_count
+                    length_distribution = length_distribution[[
+                        "allele",
+                        "cutoff_fraction",
+                        "cutoff_count",
+                        "length",
+                        "fraction"
+                    ]].sort_values(["cutoff_fraction", "length"])
+                    length_distributions.append(length_distribution)
 
         if frequency_matrices is not None:
             frequency_matrices = pandas.concat(
diff --git a/mhcflurry/cluster_parallelism.py b/mhcflurry/cluster_parallelism.py
index 49e71c37..c756239c 100644
--- a/mhcflurry/cluster_parallelism.py
+++ b/mhcflurry/cluster_parallelism.py
@@ -246,7 +246,7 @@ def worker_entry_point(argv=sys.argv[1:]):
         if args.result_serialization_method == 'save_predictor':
             result.save(args.result_out)
         else:
-            with open(args.out, "wb") as fd:
+            with open(args.result_out, "wb") as fd:
                 pickle.dump(result, fd, pickle.HIGHEST_PROTOCOL)
         print("Wrote:", args.result_out)
     except Exception as e:
diff --git a/test/test_calibrate_percentile_ranks_command.py b/test/test_calibrate_percentile_ranks_command.py
new file mode 100644
index 00000000..2a87c192
--- /dev/null
+++ b/test/test_calibrate_percentile_ranks_command.py
@@ -0,0 +1,78 @@
+"""
+Tests for the mhcflurry-calibrate-percentile-ranks command.
+"""
+
+import os
+import shutil
+import tempfile
+import subprocess
+
+from numpy.testing import assert_equal
+
+from mhcflurry import Class1AffinityPredictor
+from mhcflurry.downloads import get_path
+
+os.environ["CUDA_VISIBLE_DEVICES"] = ""
+
+
+def run_and_check(n_jobs=0, delete=True, additional_args=[]):
+    source_models_dir = get_path("models_class1_pan", "models.with_mass_spec")
+    dest_models_dir = tempfile.mkdtemp(prefix="mhcflurry-test-models")
+
+    # Save a new predictor that has no percent rank calibration data.
+    original_predictor = Class1AffinityPredictor.load(source_models_dir)
+    print("Loaded predictor", source_models_dir)
+    new_predictor = Class1AffinityPredictor(
+        class1_pan_allele_models=original_predictor.class1_pan_allele_models,
+        allele_to_sequence=original_predictor.allele_to_sequence,
+    )
+    new_predictor.save(dest_models_dir)
+    print("Saved predictor to", dest_models_dir)
+
+    new_predictor = Class1AffinityPredictor.load(dest_models_dir)
+    assert_equal(len(new_predictor.allele_to_percent_rank_transform), 0)
+
+    args = [
+        "mhcflurry-calibrate-percentile-ranks",
+        "--models-dir", dest_models_dir,
+        "--match-amino-acid-distribution-data", get_path(
+            "data_curated", "curated_training_data.no_mass_spec.csv.bz2"),
+        "--motif-summary",
+        "--num-peptides-per-length", "1000",
+        "--allele", "HLA-A*02:01", "HLA-B*07:02",
+        "--verbosity", "1",
+        "--num-jobs", str(n_jobs),
+    ] + additional_args
+    print("Running with args: %s" % args)
+    subprocess.check_call(args)
+
+    new_predictor = Class1AffinityPredictor.load(dest_models_dir)
+    assert_equal(len(new_predictor.allele_to_percent_rank_transform), 2)
+
+    if delete:
+        print("Deleting: %s" % dest_models_dir)
+        shutil.rmtree(dest_models_dir)
+    else:
+        print("Not deleting: %s" % dest_models_dir)
+
+
+def test_run_serial():
+    run_and_check(n_jobs=0)
+
+
+def test_run_parallel():
+    run_and_check(n_jobs=2)
+
+
+def test_run_cluster_parallelism(delete=True):
+    run_and_check(n_jobs=0, additional_args=[
+        '--cluster-parallelism',
+        '--cluster-results-workdir', '/tmp/',
+        '--cluster-max-retries', '0',
+    ], delete=delete)
+
+
+if __name__ == "__main__":
+    run_and_check(n_jobs=0, delete=False)
+    # run_and_check(n_jobs=2, delete=False)
+    # test_run_cluster_parallelism(delete=False)
diff --git a/test/test_train_pan_allele_models_command.py b/test/test_train_pan_allele_models_command.py
index cb00de4d..2c600346 100644
--- a/test/test_train_pan_allele_models_command.py
+++ b/test/test_train_pan_allele_models_command.py
@@ -7,15 +7,12 @@ import os
 import shutil
 import tempfile
 import subprocess
-from copy import deepcopy
 
-from sklearn.metrics import roc_auc_score
 import pandas
 
-from numpy.testing import assert_, assert_equal, assert_array_less
+from numpy.testing import assert_equal, assert_array_less
 
 from mhcflurry import Class1AffinityPredictor,Class1NeuralNetwork
-from mhcflurry.allele_encoding import AlleleEncoding
 from mhcflurry.downloads import get_path
 
 os.environ["CUDA_VISIBLE_DEVICES"] = ""
@@ -145,6 +142,7 @@ def run_and_check(n_jobs=0, delete=True, additional_args=[]):
         print("Deleting: %s" % models_dir)
         shutil.rmtree(models_dir)
 
+
 if os.environ.get("KERAS_BACKEND") != "theano":
     def test_run_parallel():
         run_and_check(n_jobs=1)
@@ -163,5 +161,5 @@ def test_run_cluster_parallelism():
 
 
 if __name__ == "__main__":
-    #run_and_check(n_jobs=0, delete=False)
+    # run_and_check(n_jobs=0, delete=False)
     test_run_cluster_parallelism()
-- 
GitLab