From 3fb579aee2c61884164f4fccd93e78d797da767c Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Tue, 27 Aug 2019 13:12:03 -0400
Subject: [PATCH] add test test_calibrate_percentile_ranks_command.py

---
 .../GENERATE.WITH_HPC_CLUSTER.sh              |  2 +-
 .../models_class1_pan/GENERATE.sh             |  4 +-
 .../calibrate_percentile_ranks_command.py     | 23 +++---
 mhcflurry/class1_affinity_predictor.py        | 63 +++++++++------
 mhcflurry/cluster_parallelism.py              |  2 +-
 ...test_calibrate_percentile_ranks_command.py | 78 +++++++++++++++++++
 test/test_train_pan_allele_models_command.py  |  8 +-
 7 files changed, 132 insertions(+), 48 deletions(-)
 create mode 100644 test/test_calibrate_percentile_ranks_command.py

diff --git a/downloads-generation/models_class1_pan/GENERATE.WITH_HPC_CLUSTER.sh b/downloads-generation/models_class1_pan/GENERATE.WITH_HPC_CLUSTER.sh
index f2ec20b6..7a31a652 100755
--- a/downloads-generation/models_class1_pan/GENERATE.WITH_HPC_CLUSTER.sh
+++ b/downloads-generation/models_class1_pan/GENERATE.WITH_HPC_CLUSTER.sh
@@ -70,7 +70,7 @@ do
         --models-dir models.${kind} \
         --match-amino-acid-distribution-data "$MODELS_DIR/train_data.csv.bz2" \
         --motif-summary \
-        --num-peptides-per-length 100000 \
+        --num-peptides-per-length 1000000 \
         --allele $(bzcat "$MODELS_DIR/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq) \
         --verbosity 1 \
         --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
diff --git a/downloads-generation/models_class1_pan/GENERATE.sh b/downloads-generation/models_class1_pan/GENERATE.sh
index f0699f52..216942dc 100755
--- a/downloads-generation/models_class1_pan/GENERATE.sh
+++ b/downloads-generation/models_class1_pan/GENERATE.sh
@@ -43,7 +43,7 @@ export PYTHONUNBUFFERED=1

 UNSELECTED_PATH="$(mhcflurry-downloads path models_class1_pan_unselected)"

-for kind in with_mass_spec #no_mass_spec
+for kind in with_mass_spec no_mass_spec
 do
     MODELS_DIR="$UNSELECTED_PATH/models.${kind}"
     time mhcflurry-class1-select-pan-allele-models \
@@ -62,7 +62,7 @@ do
         --models-dir models.${kind} \
         --match-amino-acid-distribution-data "$MODELS_DIR/train_data.csv.bz2" \
         --motif-summary \
-        --num-peptides-per-length 100000 \
+        --num-peptides-per-length 1000000 \
         --allele $(bzcat "$MODELS_DIR/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq) \
         --verbosity 1 \
         --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1
diff --git a/mhcflurry/calibrate_percentile_ranks_command.py b/mhcflurry/calibrate_percentile_ranks_command.py
index c0a01965..3ba3f7ab 100644
--- a/mhcflurry/calibrate_percentile_ranks_command.py
+++ b/mhcflurry/calibrate_percentile_ranks_command.py
@@ -67,7 +67,8 @@ parser.add_argument(
     help="Calculate motifs and length preferences for each allele")
 parser.add_argument(
     "--summary-top-peptide-fraction",
-    default=0.001,
+    default=[0.0001, 0.001, 0.01, 0.1, 1.0],
+    nargs="+",
     type=float,
     metavar="X",
     help="The top X fraction of predictions (i.e. tightest binders) to use to "
@@ -145,7 +146,7 @@ def run(argv=sys.argv[1:]):
     GLOBAL_DATA["predictor"] = predictor
     GLOBAL_DATA["args"] = {
         'motif_summary': args.motif_summary,
-        'summary_top_peptide_fraction': args.summary_top_peptide_fraction,
+        'summary_top_peptide_fractions': args.summary_top_peptide_fraction,
         'verbose': args.verbosity > 0
     }

@@ -203,12 +204,12 @@
     print("Predictor written to: %s" % args.models_dir)


-def do_calibrate_percentile_ranks(allele):
+def do_calibrate_percentile_ranks(allele, constant_data=GLOBAL_DATA):
     return calibrate_percentile_ranks(
         allele,
-        GLOBAL_DATA['predictor'],
-        peptides=GLOBAL_DATA['calibration_peptides'],
-        **GLOBAL_DATA["args"])
+        constant_data['predictor'],
+        peptides=constant_data['calibration_peptides'],
+        **constant_data["args"])


 def calibrate_percentile_ranks(
@@ -216,19 +217,13 @@
         predictor,
         peptides=None,
         motif_summary=False,
-        summary_top_peptide_fraction=0.001,
+        summary_top_peptide_fractions=[0.001],
         verbose=False):
-    """
-    Private helper function.
-    """
-    global GLOBAL_DATA
-    if peptides is None:
-        peptides = GLOBAL_DATA["calibration_peptides"]
     summary_results = predictor.calibrate_percentile_ranks(
         peptides=peptides,
         alleles=[allele],
         motif_summary=motif_summary,
-        summary_top_peptide_fraction=summary_top_peptide_fraction,
+        summary_top_peptide_fractions=summary_top_peptide_fractions,
         verbose=verbose)
     transforms = {
         allele: predictor.allele_to_percent_rank_transform[allele],
diff --git a/mhcflurry/class1_affinity_predictor.py b/mhcflurry/class1_affinity_predictor.py
index 1f2507e8..241560d9 100644
--- a/mhcflurry/class1_affinity_predictor.py
+++ b/mhcflurry/class1_affinity_predictor.py
@@ -1156,7 +1156,7 @@
             alleles=None,
             bins=None,
             motif_summary=False,
-            summary_top_peptide_fraction=0.001,
+            summary_top_peptide_fractions=[0.001],
             verbose=False):
         """
         Compute the cumulative distribution of ic50 values for a set of alleles
@@ -1229,32 +1229,45 @@
                 }).drop_duplicates('peptide').set_index("peptide")
                 predictions_df["length"] = predictions_df.index.str.len()
                 for (length, sub_df) in predictions_df.groupby("length"):
-                    selected = sub_df.prediction.nsmallest(
-                        max(
-                            int(len(sub_df) * summary_top_peptide_fraction),
-                            1)).index.values
-                    matrix = positional_frequency_matrix(selected).reset_index()
-                    original_columns = list(matrix.columns)
-                    matrix["length"] = length
-                    matrix["allele"] = allele
-                    matrix = matrix[["allele", "length"] + original_columns]
-                    frequency_matrices.append(matrix)
+                    for cutoff_fraction in summary_top_peptide_fractions:
+                        selected = sub_df.prediction.nsmallest(
+                            max(
+                                int(len(sub_df) * cutoff_fraction),
+                                1)).index.values
+                        matrix = positional_frequency_matrix(selected).reset_index()
+                        original_columns = list(matrix.columns)
+                        matrix["allele"] = allele
+                        matrix["length"] = length
+                        matrix["cutoff_fraction"] = cutoff_fraction
+                        matrix["cutoff_count"] = len(selected)
+                        matrix = matrix[
+                            ["allele", "length", "cutoff_fraction", "cutoff_count"]
+                            + original_columns
+                        ]
+                        frequency_matrices.append(matrix)

                 # Length distribution
-                length_distribution = predictions_df.prediction.nsmallest(
-                    max(
-                        int(len(predictions_df) * summary_top_peptide_fraction),
-                        1)).index.str.len().value_counts()
-                length_distribution.index.name = "length"
-                length_distribution /= length_distribution.sum()
-                length_distribution = length_distribution.to_frame()
-                length_distribution.columns = ["fraction"]
-                length_distribution = length_distribution.reset_index()
-                length_distribution["allele"] = allele
-                length_distribution = length_distribution[
-                    ["allele", "length", "fraction"]
-                ].sort_values("length")
-                length_distributions.append(length_distribution)
+                for cutoff_fraction in summary_top_peptide_fractions:
+                    cutoff_count = max(
+                        int(len(predictions_df) * cutoff_fraction), 1)
+                    length_distribution = predictions_df.prediction.nsmallest(
+                        cutoff_count).index.str.len().value_counts()
+                    length_distribution.index.name = "length"
+                    length_distribution /= length_distribution.sum()
+                    length_distribution = length_distribution.to_frame()
+                    length_distribution.columns = ["fraction"]
+                    length_distribution = length_distribution.reset_index()
+                    length_distribution["allele"] = allele
+                    length_distribution["cutoff_fraction"] = cutoff_fraction
+                    length_distribution["cutoff_count"] = cutoff_count
+                    length_distribution = length_distribution[[
+                        "allele",
+                        "cutoff_fraction",
+                        "cutoff_count",
+                        "length",
+                        "fraction"
+                    ]].sort_values(["cutoff_fraction", "length"])
+                    length_distributions.append(length_distribution)

         if frequency_matrices is not None:
             frequency_matrices = pandas.concat(
diff --git a/mhcflurry/cluster_parallelism.py b/mhcflurry/cluster_parallelism.py
index 49e71c37..c756239c 100644
--- a/mhcflurry/cluster_parallelism.py
+++ b/mhcflurry/cluster_parallelism.py
@@ -246,7 +246,7 @@ def worker_entry_point(argv=sys.argv[1:]):
         if args.result_serialization_method == 'save_predictor':
             result.save(args.result_out)
         else:
-            with open(args.out, "wb") as fd:
+            with open(args.result_out, "wb") as fd:
                 pickle.dump(result, fd, pickle.HIGHEST_PROTOCOL)
             print("Wrote:", args.result_out)
     except Exception as e:
diff --git a/test/test_calibrate_percentile_ranks_command.py b/test/test_calibrate_percentile_ranks_command.py
new file mode 100644
index 00000000..2a87c192
--- /dev/null
+++ b/test/test_calibrate_percentile_ranks_command.py
@@ -0,0 +1,78 @@
+"""
+Tests for calibrate percentile ranks command
+"""
+
+import os
+import shutil
+import tempfile
+import subprocess
+
+from numpy.testing import assert_equal
+
+from mhcflurry import Class1AffinityPredictor
+from mhcflurry.downloads import get_path
+
+os.environ["CUDA_VISIBLE_DEVICES"] = ""
+
+
+def run_and_check(n_jobs=0, delete=True, additional_args=[]):
+    source_models_dir = get_path("models_class1_pan", "models.with_mass_spec")
+    dest_models_dir = tempfile.mkdtemp(prefix="mhcflurry-test-models")
+
+    # Save a new predictor that has no percent rank calibration data.
+    original_predictor = Class1AffinityPredictor.load(source_models_dir)
+    print("Loaded predictor", source_models_dir)
+    new_predictor = Class1AffinityPredictor(
+        class1_pan_allele_models=original_predictor.class1_pan_allele_models,
+        allele_to_sequence=original_predictor.allele_to_sequence,
+    )
+    new_predictor.save(dest_models_dir)
+    print("Saved predictor to", dest_models_dir)
+
+    new_predictor = Class1AffinityPredictor.load(dest_models_dir)
+    assert_equal(len(new_predictor.allele_to_percent_rank_transform), 0)
+
+    args = [
+        "mhcflurry-calibrate-percentile-ranks",
+        "--models-dir", dest_models_dir,
+        "--match-amino-acid-distribution-data", get_path(
+            "data_curated", "curated_training_data.no_mass_spec.csv.bz2"),
+        "--motif-summary",
+        "--num-peptides-per-length", "1000",
+        "--allele", "HLA-A*02:01", "HLA-B*07:02",
+        "--verbosity", "1",
+        "--num-jobs", str(n_jobs),
+    ] + additional_args
+    print("Running with args: %s" % args)
+    subprocess.check_call(args)
+
+    new_predictor = Class1AffinityPredictor.load(dest_models_dir)
+    assert_equal(len(new_predictor.allele_to_percent_rank_transform), 2)
+
+    if delete:
+        print("Deleting: %s" % dest_models_dir)
+        shutil.rmtree(dest_models_dir)
+    else:
+        print("Not deleting: %s" % dest_models_dir)
+
+
+def test_run_serial():
+    run_and_check(n_jobs=0)
+
+
+def test_run_parallel():
+    run_and_check(n_jobs=2)
+
+
+def test_run_cluster_parallelism(delete=True):
+    run_and_check(n_jobs=0, additional_args=[
+        '--cluster-parallelism',
+        '--cluster-results-workdir', '/tmp/',
+        '--cluster-max-retries', '0',
+    ], delete=delete)
+
+
+if __name__ == "__main__":
+    run_and_check(n_jobs=0, delete=False)
+    # run_and_check(n_jobs=2, delete=False)
+    # test_run_cluster_parallelism(delete=False)
diff --git a/test/test_train_pan_allele_models_command.py b/test/test_train_pan_allele_models_command.py
index cb00de4d..2c600346 100644
--- a/test/test_train_pan_allele_models_command.py
+++ b/test/test_train_pan_allele_models_command.py
@@ -7,15 +7,12 @@
 import os
 import shutil
 import tempfile
 import subprocess
-from copy import deepcopy

-from sklearn.metrics import roc_auc_score
 import pandas
-from numpy.testing import assert_, assert_equal, assert_array_less
+from numpy.testing import assert_equal, assert_array_less

 from mhcflurry import Class1AffinityPredictor,Class1NeuralNetwork
-from mhcflurry.allele_encoding import AlleleEncoding
 from mhcflurry.downloads import get_path

 os.environ["CUDA_VISIBLE_DEVICES"] = ""
@@ -145,6 +142,7 @@ def run_and_check(n_jobs=0, delete=True, additional_args=[]):
         print("Deleting: %s" % models_dir)
         shutil.rmtree(models_dir)

+
 if os.environ.get("KERAS_BACKEND") != "theano":
     def test_run_parallel():
         run_and_check(n_jobs=1)
@@ -163,5 +161,5 @@


 if __name__ == "__main__":
-    #run_and_check(n_jobs=0, delete=False)
+    # run_and_check(n_jobs=0, delete=False)
    test_run_cluster_parallelism()
--
GitLab
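
Note (appended for reference; not part of the commit): a minimal sketch of driving the recalibration above from Python rather than through the CLI. It assumes the models_class1_pan download is installed, that mhcflurry.common.random_peptides exists with a (num, length) signature, and uses an illustrative allele and peptide count; treat it as a sketch under those assumptions, not the project's documented usage.

# Load a full pan-allele predictor from the standard download location.
from mhcflurry import Class1AffinityPredictor
from mhcflurry.common import random_peptides
from mhcflurry.downloads import get_path

predictor = Class1AffinityPredictor.load(
    get_path("models_class1_pan", "models.with_mass_spec"))

# Draw random calibration peptides across lengths, as the command does
# (far fewer here than the command's --num-peptides-per-length 1000000).
peptides = []
for length in range(8, 16):
    peptides.extend(random_peptides(1000, length=length))

# One motif / length-distribution summary is produced per cutoff in
# summary_top_peptide_fractions -- the behavior this patch introduces.
summary = predictor.calibrate_percentile_ranks(
    peptides=peptides,
    alleles=["HLA-A*02:01"],
    motif_summary=True,
    summary_top_peptide_fractions=[0.001, 0.01],
    verbose=True)

# Once calibrated, affinity predictions (nM) map to percentile ranks.
print(predictor.percentile_ranks([100.0, 500.0], allele="HLA-A*02:01"))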