Commit 3fb579ae authored by Tim O'Donnell

add test test_calibrate_percentile_ranks_command.py

parent fcb1af8c
@@ -70,7 +70,7 @@ do
         --models-dir models.${kind} \
         --match-amino-acid-distribution-data "$MODELS_DIR/train_data.csv.bz2" \
         --motif-summary \
-        --num-peptides-per-length 100000 \
+        --num-peptides-per-length 1000000 \
         --allele $(bzcat "$MODELS_DIR/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq) \
         --verbosity 1 \
         --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
...
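(A quick sanity check on this tenfold increase, mine rather than the commit's: the smallest summary fraction introduced later in this commit is 0.0001, and max(int(1000000 * 0.0001), 1) = 100, so even the tightest motif summary is computed from roughly 100 peptides per length instead of 10.)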
@@ -43,7 +43,7 @@ export PYTHONUNBUFFERED=1
 UNSELECTED_PATH="$(mhcflurry-downloads path models_class1_pan_unselected)"
 
-for kind in with_mass_spec #no_mass_spec
+for kind in with_mass_spec no_mass_spec
 do
     MODELS_DIR="$UNSELECTED_PATH/models.${kind}"
     time mhcflurry-class1-select-pan-allele-models \
@@ -62,7 +62,7 @@ do
         --models-dir models.${kind} \
         --match-amino-acid-distribution-data "$MODELS_DIR/train_data.csv.bz2" \
         --motif-summary \
-        --num-peptides-per-length 100000 \
+        --num-peptides-per-length 1000000 \
         --allele $(bzcat "$MODELS_DIR/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq) \
         --verbosity 1 \
         --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1
...
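For reference, the `--allele $(bzcat ... | uniq | sort | uniq)` construction in both scripts simply passes every distinct allele appearing in the training data. A rough Python equivalent (illustrative only, reusing the train_data.csv.bz2 name from above):

import pandas

# Mirror of: bzcat train_data.csv.bz2 | cut -f 1 -d , | grep -v allele | uniq | sort | uniq
train_df = pandas.read_csv("train_data.csv.bz2")  # pandas infers bz2 compression
alleles = sorted(train_df["allele"].unique())
print(len(alleles), "alleles")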
@@ -67,7 +67,8 @@ parser.add_argument(
     help="Calculate motifs and length preferences for each allele")
 parser.add_argument(
     "--summary-top-peptide-fraction",
-    default=0.001,
+    default=[0.0001, 0.001, 0.01, 0.1, 1.0],
+    nargs="+",
     type=float,
     metavar="X",
     help="The top X fraction of predictions (i.e. tightest binders) to use to "
@@ -145,7 +146,7 @@ def run(argv=sys.argv[1:]):
     GLOBAL_DATA["predictor"] = predictor
     GLOBAL_DATA["args"] = {
         'motif_summary': args.motif_summary,
-        'summary_top_peptide_fraction': args.summary_top_peptide_fraction,
+        'summary_top_peptide_fractions': args.summary_top_peptide_fraction,
         'verbose': args.verbosity > 0
     }
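Note on the `nargs="+"` change above (my commentary, not part of the commit): with `nargs="+"` the flag accepts one or more values and `args.summary_top_peptide_fraction` is always a list, which is why the value is stored under the pluralized `summary_top_peptide_fractions` key. A minimal standalone sketch of the parsing behaviour, using only stdlib argparse:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--summary-top-peptide-fraction",
    default=[0.0001, 0.001, 0.01, 0.1, 1.0],
    nargs="+",
    type=float)

# Omitting the flag keeps the list default; passing it collects 1+ floats.
print(parser.parse_args([]).summary_top_peptide_fraction)
# [0.0001, 0.001, 0.01, 0.1, 1.0]
print(parser.parse_args(
    ["--summary-top-peptide-fraction", "0.001", "0.01"]
).summary_top_peptide_fraction)
# [0.001, 0.01]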
@@ -203,12 +204,12 @@ def run(argv=sys.argv[1:]):
     print("Predictor written to: %s" % args.models_dir)
 
 
-def do_calibrate_percentile_ranks(allele):
+def do_calibrate_percentile_ranks(allele, constant_data=GLOBAL_DATA):
     return calibrate_percentile_ranks(
         allele,
-        GLOBAL_DATA['predictor'],
-        peptides=GLOBAL_DATA['calibration_peptides'],
-        **GLOBAL_DATA["args"])
+        constant_data['predictor'],
+        peptides=constant_data['calibration_peptides'],
+        **constant_data["args"])
 
 
 def calibrate_percentile_ranks(
@@ -216,19 +217,13 @@ def calibrate_percentile_ranks(
     predictor,
     peptides=None,
     motif_summary=False,
-    summary_top_peptide_fraction=0.001,
+    summary_top_peptide_fractions=[0.001],
     verbose=False):
-    """
-    Private helper function.
-    """
-    global GLOBAL_DATA
-    if peptides is None:
-        peptides = GLOBAL_DATA["calibration_peptides"]
     summary_results = predictor.calibrate_percentile_ranks(
         peptides=peptides,
         alleles=[allele],
         motif_summary=motif_summary,
-        summary_top_peptide_fraction=summary_top_peptide_fraction,
+        summary_top_peptide_fractions=summary_top_peptide_fractions,
         verbose=verbose)
     transforms = {
         allele: predictor.allele_to_percent_rank_transform[allele],
...
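The `constant_data=GLOBAL_DATA` default in `do_calibrate_percentile_ranks` is worth a note (mine, not the commit's): a default argument captures a reference to the module-level dict at definition time, so workers that populate `GLOBAL_DATA` in-process still see the data, while cluster entry points that unpickle their inputs can pass an explicit mapping instead; the helper `calibrate_percentile_ranks` no longer reaches for globals at all. A self-contained sketch of the pattern, with hypothetical names:

GLOBAL_DATA = {}

def do_work(item, constant_data=GLOBAL_DATA):
    # The default binds the dict object itself, not a copy, so mutations
    # made after definition are visible here.
    return (constant_data["predictor"], item)

GLOBAL_DATA["predictor"] = "predictor-loaded-at-runtime"
print(do_work("HLA-A*02:01"))                       # uses the global dict
print(do_work("HLA-A*02:01", {"predictor": "p2"}))  # uses injected data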
@@ -1156,7 +1156,7 @@ class Class1AffinityPredictor(object):
             alleles=None,
             bins=None,
             motif_summary=False,
-            summary_top_peptide_fraction=0.001,
+            summary_top_peptide_fractions=[0.001],
             verbose=False):
         """
         Compute the cumulative distribution of ic50 values for a set of alleles
@@ -1229,32 +1229,45 @@ class Class1AffinityPredictor(object):
             }).drop_duplicates('peptide').set_index("peptide")
             predictions_df["length"] = predictions_df.index.str.len()
             for (length, sub_df) in predictions_df.groupby("length"):
-                selected = sub_df.prediction.nsmallest(
-                    max(
-                        int(len(sub_df) * summary_top_peptide_fraction),
-                        1)).index.values
-                matrix = positional_frequency_matrix(selected).reset_index()
-                original_columns = list(matrix.columns)
-                matrix["length"] = length
-                matrix["allele"] = allele
-                matrix = matrix[["allele", "length"] + original_columns]
-                frequency_matrices.append(matrix)
+                for cutoff_fraction in summary_top_peptide_fractions:
+                    selected = sub_df.prediction.nsmallest(
+                        max(
+                            int(len(sub_df) * cutoff_fraction),
+                            1)).index.values
+                    matrix = positional_frequency_matrix(selected).reset_index()
+                    original_columns = list(matrix.columns)
+                    matrix["allele"] = allele
+                    matrix["length"] = length
+                    matrix["cutoff_fraction"] = cutoff_fraction
+                    matrix["cutoff_count"] = len(selected)
+                    matrix = matrix[
+                        ["allele", "length", "cutoff_fraction", "cutoff_count"]
+                        + original_columns
+                    ]
+                    frequency_matrices.append(matrix)
 
             # Length distribution
-            length_distribution = predictions_df.prediction.nsmallest(
-                max(
-                    int(len(predictions_df) * summary_top_peptide_fraction),
-                    1)).index.str.len().value_counts()
-            length_distribution.index.name = "length"
-            length_distribution /= length_distribution.sum()
-            length_distribution = length_distribution.to_frame()
-            length_distribution.columns = ["fraction"]
-            length_distribution = length_distribution.reset_index()
-            length_distribution["allele"] = allele
-            length_distribution = length_distribution[
-                ["allele", "length", "fraction"]
-            ].sort_values("length")
-            length_distributions.append(length_distribution)
+            for cutoff_fraction in summary_top_peptide_fractions:
+                cutoff_count = max(
+                    int(len(predictions_df) * cutoff_fraction), 1)
+                length_distribution = predictions_df.prediction.nsmallest(
+                    cutoff_count).index.str.len().value_counts()
+                length_distribution.index.name = "length"
+                length_distribution /= length_distribution.sum()
+                length_distribution = length_distribution.to_frame()
+                length_distribution.columns = ["fraction"]
+                length_distribution = length_distribution.reset_index()
+                length_distribution["allele"] = allele
+                length_distribution["cutoff_fraction"] = cutoff_fraction
+                length_distribution["cutoff_count"] = cutoff_count
+                length_distribution = length_distribution[[
+                    "allele",
+                    "cutoff_fraction",
+                    "cutoff_count",
+                    "length",
+                    "fraction"
+                ]].sort_values(["cutoff_fraction", "length"])
+                length_distributions.append(length_distribution)
 
     if frequency_matrices is not None:
         frequency_matrices = pandas.concat(
...
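With the loops above, a single motif-summary run now emits rows at every requested stringency, distinguished by the new `cutoff_fraction` and `cutoff_count` columns. A hedged sketch of pulling one motif back out (the column names are the ones added in this hunk; the file name is hypothetical):

import pandas

df = pandas.read_csv("frequency_matrices.csv.bz2")  # hypothetical output path
motif = df.loc[
    (df.allele == "HLA-A*02:01")
    & (df.length == 9)
    & (df.cutoff_fraction == 0.001)
]
print(motif)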
@@ -246,7 +246,7 @@ def worker_entry_point(argv=sys.argv[1:]):
         if args.result_serialization_method == 'save_predictor':
             result.save(args.result_out)
         else:
-            with open(args.out, "wb") as fd:
+            with open(args.result_out, "wb") as fd:
                 pickle.dump(result, fd, pickle.HIGHEST_PROTOCOL)
         print("Wrote:", args.result_out)
     except Exception as e:
...
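The one-line fix above corrects what looks like a simple attribute typo: the parser evidently defines `result_out`, so `args.out` would have raised an `AttributeError` (caught by the surrounding `except`) whenever the pickle serialization path ran. A minimal sketch of the round-trip the corrected code performs, with a stand-in payload:

import pickle

result = {"allele": "HLA-A*02:01", "transforms": {}}  # stand-in result object
with open("/tmp/result.pkl", "wb") as fd:
    pickle.dump(result, fd, pickle.HIGHEST_PROTOCOL)
with open("/tmp/result.pkl", "rb") as fd:
    assert pickle.load(fd) == result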
"""
Tests for calibrate percentile ranks command
"""
import os
import shutil
import tempfile
import subprocess
from numpy.testing import assert_equal
from mhcflurry import Class1AffinityPredictor
from mhcflurry.downloads import get_path
os.environ["CUDA_VISIBLE_DEVICES"] = ""
def run_and_check(n_jobs=0, delete=True, additional_args=[]):
source_models_dir = get_path("models_class1_pan", "models.with_mass_spec")
dest_models_dir = tempfile.mkdtemp(prefix="mhcflurry-test-models")
# Save a new predictor that has no percent rank calibration data.
original_predictor = Class1AffinityPredictor.load(source_models_dir)
print("Loaded predictor", source_models_dir)
new_predictor = Class1AffinityPredictor(
class1_pan_allele_models=original_predictor.class1_pan_allele_models,
allele_to_sequence=original_predictor.allele_to_sequence,
)
new_predictor.save(dest_models_dir)
print("Saved predictor to", dest_models_dir)
new_predictor = Class1AffinityPredictor.load(dest_models_dir)
assert_equal(len(new_predictor.allele_to_percent_rank_transform), 0)
args = [
"mhcflurry-calibrate-percentile-ranks",
"--models-dir", dest_models_dir,
"--match-amino-acid-distribution-data", get_path(
"data_curated", "curated_training_data.no_mass_spec.csv.bz2"),
"--motif-summary",
"--num-peptides-per-length", "1000",
"--allele", "HLA-A*02:01", "HLA-B*07:02",
"--verbosity", "1",
"--num-jobs", str(n_jobs),
] + additional_args
print("Running with args: %s" % args)
subprocess.check_call(args)
new_predictor = Class1AffinityPredictor.load(dest_models_dir)
assert_equal(len(new_predictor.allele_to_percent_rank_transform), 2)
if delete:
print("Deleting: %s" % dest_models_dir)
shutil.rmtree(dest_models_dir)
else:
print("Not deleting: %s" % dest_models_dir)
def test_run_serial():
run_and_check(n_jobs=0)
def test_run_parallel():
run_and_check(n_jobs=2)
def test_run_cluster_parallelism(delete=True):
run_and_check(n_jobs=0, additional_args=[
'--cluster-parallelism',
'--cluster-results-workdir', '/tmp/',
'--cluster-max-retries', '0',
], delete=delete)
if __name__ == "__main__":
run_and_check(n_jobs=0, delete=False)
# run_and_check(n_jobs=2, delete=False)
# test_run_cluster_parallelism(delete=False)
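A note on running the new test (my reading of the file above, not part of the commit): executed directly with `python test_calibrate_percentile_ranks_command.py`, the `__main__` block runs the serial path and keeps the temporary models directory for inspection (`delete=False`), while a test runner such as pytest or nose (an assumption; the project's runner is not shown here) would pick up the three `test_*` functions, which clean up after themselves. The test also assumes the `models_class1_pan` and `data_curated` downloads are available via `mhcflurry.downloads.get_path`.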
@@ -7,15 +7,12 @@ import os
 import shutil
 import tempfile
 import subprocess
-from copy import deepcopy
-from sklearn.metrics import roc_auc_score
 import pandas
-from numpy.testing import assert_, assert_equal, assert_array_less
+from numpy.testing import assert_equal, assert_array_less
 
 from mhcflurry import Class1AffinityPredictor,Class1NeuralNetwork
-from mhcflurry.allele_encoding import AlleleEncoding
 from mhcflurry.downloads import get_path
 
 os.environ["CUDA_VISIBLE_DEVICES"] = ""
@@ -145,6 +142,7 @@ def run_and_check(n_jobs=0, delete=True, additional_args=[]):
         print("Deleting: %s" % models_dir)
         shutil.rmtree(models_dir)
 
+
 if os.environ.get("KERAS_BACKEND") != "theano":
     def test_run_parallel():
         run_and_check(n_jobs=1)
@@ -163,5 +161,5 @@ def test_run_cluster_parallelism():
 
 if __name__ == "__main__":
-    #run_and_check(n_jobs=0, delete=False)
+    # run_and_check(n_jobs=0, delete=False)
     test_run_cluster_parallelism()