Commit 3fb579ae authored by Tim O'Donnell

add test test_calibrate_percentile_ranks_command.py

parent fcb1af8c
@@ -70,7 +70,7 @@ do
         --models-dir models.${kind} \
         --match-amino-acid-distribution-data "$MODELS_DIR/train_data.csv.bz2" \
         --motif-summary \
-        --num-peptides-per-length 100000 \
+        --num-peptides-per-length 1000000 \
         --allele $(bzcat "$MODELS_DIR/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq) \
         --verbosity 1 \
         --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
...
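(A quick sanity check on this tenfold increase, mine rather than the commit's: the smallest summary fraction introduced later in this commit is 0.0001, and max(int(1000000 * 0.0001), 1) = 100, so even the tightest motif summary is computed from roughly 100 peptides per length instead of 10.)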
@@ -43,7 +43,7 @@ export PYTHONUNBUFFERED=1
 UNSELECTED_PATH="$(mhcflurry-downloads path models_class1_pan_unselected)"
 
-for kind in with_mass_spec #no_mass_spec
+for kind in with_mass_spec no_mass_spec
 do
     MODELS_DIR="$UNSELECTED_PATH/models.${kind}"
     time mhcflurry-class1-select-pan-allele-models \
@@ -62,7 +62,7 @@ do
         --models-dir models.${kind} \
         --match-amino-acid-distribution-data "$MODELS_DIR/train_data.csv.bz2" \
         --motif-summary \
-        --num-peptides-per-length 100000 \
+        --num-peptides-per-length 1000000 \
         --allele $(bzcat "$MODELS_DIR/train_data.csv.bz2" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq) \
         --verbosity 1 \
         --num-jobs $NUM_JOBS --max-tasks-per-worker 1 --gpus $GPUS --max-workers-per-gpu 1
...
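For reference, the `--allele $(bzcat ... | uniq | sort | uniq)` construction in both scripts simply passes every distinct allele appearing in the training data. A rough Python equivalent (illustrative only, reusing the train_data.csv.bz2 name from above):

import pandas

# Mirror of: bzcat train_data.csv.bz2 | cut -f 1 -d , | grep -v allele | uniq | sort | uniq
train_df = pandas.read_csv("train_data.csv.bz2")  # pandas infers bz2 compression
alleles = sorted(train_df["allele"].unique())
print(len(alleles), "alleles")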
@@ -67,7 +67,8 @@ parser.add_argument(
     help="Calculate motifs and length preferences for each allele")
 parser.add_argument(
     "--summary-top-peptide-fraction",
-    default=0.001,
+    default=[0.0001, 0.001, 0.01, 0.1, 1.0],
+    nargs="+",
     type=float,
     metavar="X",
     help="The top X fraction of predictions (i.e. tightest binders) to use to "
@@ -145,7 +146,7 @@ def run(argv=sys.argv[1:]):
     GLOBAL_DATA["predictor"] = predictor
     GLOBAL_DATA["args"] = {
         'motif_summary': args.motif_summary,
-        'summary_top_peptide_fraction': args.summary_top_peptide_fraction,
+        'summary_top_peptide_fractions': args.summary_top_peptide_fraction,
         'verbose': args.verbosity > 0
     }
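Note on the `nargs="+"` change above (my commentary, not part of the commit): with `nargs="+"` the flag accepts one or more values and `args.summary_top_peptide_fraction` is always a list, which is why the value is stored under the pluralized `summary_top_peptide_fractions` key. A minimal standalone sketch of the parsing behaviour, using only stdlib argparse:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--summary-top-peptide-fraction",
    default=[0.0001, 0.001, 0.01, 0.1, 1.0],
    nargs="+",
    type=float)

# Omitting the flag keeps the list default; passing it collects 1+ floats.
print(parser.parse_args([]).summary_top_peptide_fraction)
# [0.0001, 0.001, 0.01, 0.1, 1.0]
print(parser.parse_args(
    ["--summary-top-peptide-fraction", "0.001", "0.01"]
).summary_top_peptide_fraction)
# [0.001, 0.01]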
@@ -203,12 +204,12 @@ def run(argv=sys.argv[1:]):
     print("Predictor written to: %s" % args.models_dir)
 
 
-def do_calibrate_percentile_ranks(allele):
+def do_calibrate_percentile_ranks(allele, constant_data=GLOBAL_DATA):
     return calibrate_percentile_ranks(
         allele,
-        GLOBAL_DATA['predictor'],
-        peptides=GLOBAL_DATA['calibration_peptides'],
-        **GLOBAL_DATA["args"])
+        constant_data['predictor'],
+        peptides=constant_data['calibration_peptides'],
+        **constant_data["args"])
 
 
 def calibrate_percentile_ranks(
@@ -216,19 +217,13 @@ def calibrate_percentile_ranks(
     predictor,
     peptides=None,
     motif_summary=False,
-    summary_top_peptide_fraction=0.001,
+    summary_top_peptide_fractions=[0.001],
     verbose=False):
-    """
-    Private helper function.
-    """
-    global GLOBAL_DATA
-    if peptides is None:
-        peptides = GLOBAL_DATA["calibration_peptides"]
     summary_results = predictor.calibrate_percentile_ranks(
         peptides=peptides,
         alleles=[allele],
         motif_summary=motif_summary,
-        summary_top_peptide_fraction=summary_top_peptide_fraction,
+        summary_top_peptide_fractions=summary_top_peptide_fractions,
         verbose=verbose)
     transforms = {
         allele: predictor.allele_to_percent_rank_transform[allele],
...
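The `constant_data=GLOBAL_DATA` default in `do_calibrate_percentile_ranks` is worth a note (mine, not the commit's): a default argument captures a reference to the module-level dict at definition time, so workers that populate `GLOBAL_DATA` in-process still see the data, while cluster entry points that unpickle their inputs can pass an explicit mapping instead; the helper `calibrate_percentile_ranks` no longer reaches for globals at all. A self-contained sketch of the pattern, with hypothetical names:

GLOBAL_DATA = {}

def do_work(item, constant_data=GLOBAL_DATA):
    # The default binds the dict object itself, not a copy, so mutations
    # made after definition are visible here.
    return (constant_data["predictor"], item)

GLOBAL_DATA["predictor"] = "predictor-loaded-at-runtime"
print(do_work("HLA-A*02:01"))                       # uses the global dict
print(do_work("HLA-A*02:01", {"predictor": "p2"}))  # uses injected data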
@@ -1156,7 +1156,7 @@ class Class1AffinityPredictor(object):
             alleles=None,
             bins=None,
             motif_summary=False,
-            summary_top_peptide_fraction=0.001,
+            summary_top_peptide_fractions=[0.001],
             verbose=False):
         """
         Compute the cumulative distribution of ic50 values for a set of alleles
@@ -1229,32 +1229,45 @@ class Class1AffinityPredictor(object):
             }).drop_duplicates('peptide').set_index("peptide")
             predictions_df["length"] = predictions_df.index.str.len()
             for (length, sub_df) in predictions_df.groupby("length"):
-                selected = sub_df.prediction.nsmallest(
-                    max(
-                        int(len(sub_df) * summary_top_peptide_fraction),
-                        1)).index.values
-                matrix = positional_frequency_matrix(selected).reset_index()
-                original_columns = list(matrix.columns)
-                matrix["length"] = length
-                matrix["allele"] = allele
-                matrix = matrix[["allele", "length"] + original_columns]
-                frequency_matrices.append(matrix)
+                for cutoff_fraction in summary_top_peptide_fractions:
+                    selected = sub_df.prediction.nsmallest(
+                        max(
+                            int(len(sub_df) * cutoff_fraction),
+                            1)).index.values
+                    matrix = positional_frequency_matrix(selected).reset_index()
+                    original_columns = list(matrix.columns)
+                    matrix["allele"] = allele
+                    matrix["length"] = length
+                    matrix["cutoff_fraction"] = cutoff_fraction
+                    matrix["cutoff_count"] = len(selected)
+                    matrix = matrix[
+                        ["allele", "length", "cutoff_fraction", "cutoff_count"]
+                        + original_columns
+                    ]
+                    frequency_matrices.append(matrix)
 
             # Length distribution
-            length_distribution = predictions_df.prediction.nsmallest(
-                max(
-                    int(len(predictions_df) * summary_top_peptide_fraction),
-                    1)).index.str.len().value_counts()
-            length_distribution.index.name = "length"
-            length_distribution /= length_distribution.sum()
-            length_distribution = length_distribution.to_frame()
-            length_distribution.columns = ["fraction"]
-            length_distribution = length_distribution.reset_index()
-            length_distribution["allele"] = allele
-            length_distribution = length_distribution[
-                ["allele", "length", "fraction"]
-            ].sort_values("length")
-            length_distributions.append(length_distribution)
+            for cutoff_fraction in summary_top_peptide_fractions:
+                cutoff_count = max(
+                    int(len(predictions_df) * cutoff_fraction), 1)
+                length_distribution = predictions_df.prediction.nsmallest(
+                    cutoff_count).index.str.len().value_counts()
+                length_distribution.index.name = "length"
+                length_distribution /= length_distribution.sum()
+                length_distribution = length_distribution.to_frame()
+                length_distribution.columns = ["fraction"]
+                length_distribution = length_distribution.reset_index()
+                length_distribution["allele"] = allele
+                length_distribution["cutoff_fraction"] = cutoff_fraction
+                length_distribution["cutoff_count"] = cutoff_count
+                length_distribution = length_distribution[[
+                    "allele",
+                    "cutoff_fraction",
+                    "cutoff_count",
+                    "length",
+                    "fraction"
+                ]].sort_values(["cutoff_fraction", "length"])
+                length_distributions.append(length_distribution)
 
     if frequency_matrices is not None:
         frequency_matrices = pandas.concat(
...
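With the loops above, a single motif-summary run now emits rows at every requested stringency, distinguished by the new `cutoff_fraction` and `cutoff_count` columns. A hedged sketch of pulling one motif back out (the column names are the ones added in this hunk; the file name is hypothetical):

import pandas

df = pandas.read_csv("frequency_matrices.csv.bz2")  # hypothetical output path
motif = df.loc[
    (df.allele == "HLA-A*02:01")
    & (df.length == 9)
    & (df.cutoff_fraction == 0.001)
]
print(motif)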
@@ -246,7 +246,7 @@ def worker_entry_point(argv=sys.argv[1:]):
         if args.result_serialization_method == 'save_predictor':
             result.save(args.result_out)
         else:
-            with open(args.out, "wb") as fd:
+            with open(args.result_out, "wb") as fd:
                 pickle.dump(result, fd, pickle.HIGHEST_PROTOCOL)
         print("Wrote:", args.result_out)
     except Exception as e:
...
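The one-line fix above corrects what looks like a simple attribute typo: the parser evidently defines `result_out`, so `args.out` would have raised an `AttributeError` (caught by the surrounding `except`) whenever the pickle serialization path ran. A minimal sketch of the round-trip the corrected code performs, with a stand-in payload:

import pickle

result = {"allele": "HLA-A*02:01", "transforms": {}}  # stand-in result object
with open("/tmp/result.pkl", "wb") as fd:
    pickle.dump(result, fd, pickle.HIGHEST_PROTOCOL)
with open("/tmp/result.pkl", "rb") as fd:
    assert pickle.load(fd) == result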
"""
Tests for calibrate percentile ranks command
"""
import os
import shutil
import tempfile
import subprocess
from numpy.testing import assert_equal
from mhcflurry import Class1AffinityPredictor
from mhcflurry.downloads import get_path
os.environ["CUDA_VISIBLE_DEVICES"] = ""
def run_and_check(n_jobs=0, delete=True, additional_args=[]):
source_models_dir = get_path("models_class1_pan", "models.with_mass_spec")
dest_models_dir = tempfile.mkdtemp(prefix="mhcflurry-test-models")
# Save a new predictor that has no percent rank calibration data.
original_predictor = Class1AffinityPredictor.load(source_models_dir)
print("Loaded predictor", source_models_dir)
new_predictor = Class1AffinityPredictor(
class1_pan_allele_models=original_predictor.class1_pan_allele_models,
allele_to_sequence=original_predictor.allele_to_sequence,
)
new_predictor.save(dest_models_dir)
print("Saved predictor to", dest_models_dir)
new_predictor = Class1AffinityPredictor.load(dest_models_dir)
assert_equal(len(new_predictor.allele_to_percent_rank_transform), 0)
args = [
"mhcflurry-calibrate-percentile-ranks",
"--models-dir", dest_models_dir,
"--match-amino-acid-distribution-data", get_path(
"data_curated", "curated_training_data.no_mass_spec.csv.bz2"),
"--motif-summary",
"--num-peptides-per-length", "1000",
"--allele", "HLA-A*02:01", "HLA-B*07:02",
"--verbosity", "1",
"--num-jobs", str(n_jobs),
] + additional_args
print("Running with args: %s" % args)
subprocess.check_call(args)
new_predictor = Class1AffinityPredictor.load(dest_models_dir)
assert_equal(len(new_predictor.allele_to_percent_rank_transform), 2)
if delete:
print("Deleting: %s" % dest_models_dir)
shutil.rmtree(dest_models_dir)
else:
print("Not deleting: %s" % dest_models_dir)
def test_run_serial():
run_and_check(n_jobs=0)
def test_run_parallel():
run_and_check(n_jobs=2)
def test_run_cluster_parallelism(delete=True):
run_and_check(n_jobs=0, additional_args=[
'--cluster-parallelism',
'--cluster-results-workdir', '/tmp/',
'--cluster-max-retries', '0',
], delete=delete)
if __name__ == "__main__":
run_and_check(n_jobs=0, delete=False)
# run_and_check(n_jobs=2, delete=False)
# test_run_cluster_parallelism(delete=False)
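A note on running the new test (my reading of the file above, not part of the commit): executed directly with `python test_calibrate_percentile_ranks_command.py`, the `__main__` block runs the serial path and keeps the temporary models directory for inspection (`delete=False`), while a test runner such as pytest or nose (an assumption; the project's runner is not shown here) would pick up the three `test_*` functions, which clean up after themselves. The test also assumes the `models_class1_pan` and `data_curated` downloads are available via `mhcflurry.downloads.get_path`.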
@@ -7,15 +7,12 @@ import os
 import shutil
 import tempfile
 import subprocess
-from copy import deepcopy
-from sklearn.metrics import roc_auc_score
 import pandas
-from numpy.testing import assert_, assert_equal, assert_array_less
+from numpy.testing import assert_equal, assert_array_less
 
 from mhcflurry import Class1AffinityPredictor,Class1NeuralNetwork
-from mhcflurry.allele_encoding import AlleleEncoding
 from mhcflurry.downloads import get_path
 
 os.environ["CUDA_VISIBLE_DEVICES"] = ""
@@ -145,6 +142,7 @@ def run_and_check(n_jobs=0, delete=True, additional_args=[]):
         print("Deleting: %s" % models_dir)
         shutil.rmtree(models_dir)
 
+
 if os.environ.get("KERAS_BACKEND") != "theano":
     def test_run_parallel():
         run_and_check(n_jobs=1)
@@ -163,5 +161,5 @@ def test_run_cluster_parallelism():
 
 if __name__ == "__main__":
-    #run_and_check(n_jobs=0, delete=False)
+    # run_and_check(n_jobs=0, delete=False)
     test_run_cluster_parallelism()