"""
Calibrate percentile ranks for models. Runs in-place.
"""
import argparse
import os
import signal
import sys
import time
import traceback
import random
from functools import partial
import numpy
import pandas
import yaml
from sklearn.metrics.pairwise import cosine_similarity
from mhcnames import normalize_allele_name
import tqdm # progress bar
tqdm.monitor_interval = 0 # see https://github.com/tqdm/tqdm/issues/481
from .class1_affinity_predictor import Class1AffinityPredictor
from .common import configure_logging
from .parallelism import (
add_worker_pool_args,
worker_pool_with_gpu_assignments_from_args,
call_wrapped)
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# To avoid pickling large matrices to send to child processes when running in
# parallel, we use this module-level dict as a place to store data. Data that
# is stored here before creating the worker pool will be inherited by the
# child processes upon the fork() call, allowing us to share large data with
# the workers via shared memory.
#
# run() stores the encoded calibration peptides under the
# "calibration_peptides" key; calibrate_percentile_ranks() reads that key when
# it is not handed peptides explicitly.
GLOBAL_DATA = {}
# Command-line interface for the calibration command. The module docstring is
# used as the usage text shown by --help.
parser = argparse.ArgumentParser(usage=__doc__)

parser.add_argument(
    "--models-dir",
    metavar="DIR",
    required=True,
    help="Directory to read and write models")

# When omitted, run() falls back to every allele the loaded predictor
# supports, so the help text describes calibration rather than training.
parser.add_argument(
    "--allele",
    default=None,
    nargs="+",
    help="Alleles to calibrate percentile ranks for. If not specified, all "
    "alleles supported by the predictor will be used.")

parser.add_argument(
    "--num-peptides-per-length",
    type=int,
    metavar="N",
    default=int(1e5),
    help="Number of peptides per length to use to calibrate percent ranks. "
    "Default: %(default)s.")

# NOTE: within this module the value is only used by run() to switch on
# verbose logging when it is greater than 1.
parser.add_argument(
    "--verbosity",
    type=int,
    help="Keras verbosity. Default: %(default)s",
    default=0)

# Registers the options later read by
# worker_pool_with_gpu_assignments_from_args() in run().
add_worker_pool_args(parser)
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
def run(argv=sys.argv[1:]):
    """
    Entry point for the percent rank calibration command.

    Loads the predictor found in ``--models-dir``, calibrates percentile
    ranks for the requested alleles (every allele the predictor supports
    when ``--allele`` is not given) and saves the predictor back to the same
    directory. The per-allele work runs serially, or on a worker pool when
    the worker pool arguments yield one.

    Parameters
    ----------
    argv : list of string
        Command-line arguments, not including the program name.
    """
    # Sending SIGUSR1 to this process prints its current stack trace.
    print("To show stack trace, run:\nkill -s USR1 %d" % os.getpid())
    signal.signal(signal.SIGUSR1, lambda sig, frame: traceback.print_stack())

    args = parser.parse_args(argv)
    args.models_dir = os.path.abspath(args.models_dir)
    configure_logging(verbose=args.verbosity > 1)

    predictor = Class1AffinityPredictor.load(args.models_dir)
    alleles = (
        [normalize_allele_name(name) for name in args.allele]
        if args.allele
        else predictor.supported_alleles)

    start = time.time()

    print("Performing percent rank calibration. Encoding peptides.")
    # Passing an empty allele list performs no calibration; the call is made
    # only for the peptides it returns.
    encoded_peptides = predictor.calibrate_percentile_ranks(
        alleles=[],
        num_peptides_per_length=args.num_peptides_per_length)

    # Run the peptides through every network's input encoding once, up
    # front, so that the encodings are cached on the peptides object.
    for network in predictor.neural_networks:
        network.peptides_to_network_input(encoded_peptides)
    assert encoded_peptides.encoding_cache  # must have cached the encoding

    encoding_seconds = time.time() - start
    print("Finished encoding peptides for percent ranks in %0.2f sec." % (
        encoding_seconds))
    print("Calibrating percent rank calibration for %d alleles." % len(alleles))

    # Park the peptides in the module-level dict so that, in a parallel run,
    # workers see them after fork instead of receiving a pickled copy.
    GLOBAL_DATA["calibration_peptides"] = encoded_peptides

    worker_pool = worker_pool_with_gpu_assignments_from_args(args)
    if worker_pool is not None:
        # Parallel run: one task per allele, results arrive in any order.
        wrapped_calibrate = partial(call_wrapped, calibrate_percentile_ranks)
        per_allele_task = partial(wrapped_calibrate, predictor=predictor)
        results = worker_pool.imap_unordered(
            per_allele_task,
            alleles,
            chunksize=1)
    else:
        # Serial run: a lazy generator, so each allele is calibrated as the
        # loop below consumes it.
        print("Running in serial.")
        results = (
            calibrate_percentile_ranks(
                allele=allele,
                predictor=predictor,
                peptides=encoded_peptides)
            for allele in alleles)

    # Each result is a one-entry dict of allele -> transform; merge it into
    # the predictor held by this process.
    for transform_update in tqdm.tqdm(results, total=len(alleles)):
        predictor.allele_to_percent_rank_transform.update(transform_update)
    print("Done calibrating %d additional alleles." % len(alleles))

    predictor.save(args.models_dir, model_names_to_write=[])

    # Measured before the pool is shut down, so shutdown time is excluded.
    percent_rank_calibration_time = time.time() - start

    if worker_pool:
        worker_pool.close()
        worker_pool.join()

    print("Percent rank calibration time: %0.2f min." % (
        percent_rank_calibration_time / 60.0))
    print("Predictor written to: %s" % args.models_dir)
def calibrate_percentile_ranks(allele, predictor, peptides=None):
    """
    Calibrate percentile ranks for a single allele.

    Private helper used by run() for both the serial and the parallel path.

    Parameters
    ----------
    allele : allele name
        The allele to calibrate.
    predictor : object
        Must provide a ``calibrate_percentile_ranks(peptides=, alleles=)``
        method and an ``allele_to_percent_rank_transform`` mapping.
    peptides : optional
        Peptides to calibrate with. When None, the peptides stored under
        ``GLOBAL_DATA["calibration_peptides"]`` are used instead.

    Returns
    -------
    dict
        Single-entry mapping of ``allele`` to the transform found in
        ``predictor.allele_to_percent_rank_transform`` after calibration.
    """
    calibration_peptides = (
        GLOBAL_DATA["calibration_peptides"] if peptides is None else peptides)
    predictor.calibrate_percentile_ranks(
        peptides=calibration_peptides,
        alleles=[allele])
    transform = predictor.allele_to_percent_rank_transform[allele]
    return {allele: transform}
# Allow the module to be executed directly as a script.
if __name__ == "__main__":
    run()