"""
Generate certain RST files used in documentation.
"""
import sys
import argparse
import json
from textwrap import wrap
from collections import OrderedDict, defaultdict
from os.path import join

import pypandoc
import pandas
from tabulate import tabulate

from keras.utils.vis_utils import plot_model

from mhcflurry import __version__
from mhcflurry.downloads import get_path
from mhcflurry.class1_affinity_predictor import Class1AffinityPredictor
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
"--cv-summary-csv",
metavar="FILE.csv",
default=get_path(
"cross_validation_class1", "summary.all.csv", test_exists=False),
help="Cross validation scores summary. Default: %(default)s",
)
parser.add_argument(
"--class1-models-dir",
metavar="DIR",
default=get_path(
"models_class1", "models", test_exists=False),
help="Class1 models. Default: %(default)s",
)
parser.add_argument(
"--class1-unselected-models-dir",
metavar="DIR",
default=get_path(
"models_class1_unselected", "models", test_exists=False),
help="Class1 unselected models. Default: %(default)s",
)
parser.add_argument(
"--out-alleles-info-rst",
metavar="FILE.rst",
help="rst output file",
)
parser.add_argument(
"--out-models-info-rst",
metavar="FILE.rst",
help="rst output file",
)
parser.add_argument(
"--out-models-architecture-png",
metavar="FILE.png",
help="png output file",
)
parser.add_argument(
"--out-models-supported-alleles-rst",
metavar="FILE.png",
help="png output file",
)
def go(argv):
args = parser.parse_args(argv)
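
    # Predictors are loaded lazily below: each is loaded at most once, and
    # only if an output that needs it was requested.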
    predictor = None
    unselected_predictor = None
if args.out_models_supported_alleles_rst:
# Supported alleles rst
if predictor is None:
predictor = Class1AffinityPredictor.load(args.class1_models_dir)
with open(args.out_models_supported_alleles_rst, "w") as fd:
fd.write(
"Models released with the current version of MHCflurry (%s) "
"support peptides of "
"length %d-%d and the following %d alleles:\n\n::\n\n\t%s\n\n" % (
__version__,
predictor.supported_peptide_lengths[0],
predictor.supported_peptide_lengths[1],
len(predictor.supported_alleles),
"\n\t".join(
wrap(", ".join(predictor.supported_alleles)))))
print("Wrote: %s" % args.out_models_supported_alleles_rst)
if args.out_models_architecture_png:
# Architecture diagram
if predictor is None:
predictor = Class1AffinityPredictor.load(args.class1_models_dir)
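        # Plot the first model's Keras graph as a representative diagram;
        # the ensemble may contain several distinct architectures.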
network = predictor.neural_networks[0].network()
plot_model(
network,
to_file=args.out_models_architecture_png,
show_layer_names=True,
show_shapes=True)
print("Wrote: %s" % args.out_models_architecture_png)
if args.out_models_info_rst:
# Architecture information rst
if predictor is None:
predictor = Class1AffinityPredictor.load(args.class1_models_dir)
if unselected_predictor is None:
unselected_predictor = Class1AffinityPredictor.load(
args.class1_unselected_models_dir)
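        # Group models by hyperparameter configuration. Each config dict is
        # serialized to a JSON string so it can be used as a dict key.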
config_to_network = {}
config_to_alleles = {}
for (allele, networks) in unselected_predictor.allele_to_allele_specific_models.items():
for network in networks:
config = json.dumps(network.hyperparameters)
if config not in config_to_network:
config_to_network[config] = network
config_to_alleles[config] = []
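        # Record, for each config, the alleles whose selected ensembles
        # include a model with that config.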
for (allele, networks) in predictor.allele_to_allele_specific_models.items():
for network in networks:
config = json.dumps(network.hyperparameters)
assert config in config_to_network
config_to_alleles[config].append(allele)
        all_hyperparameters = [
            network.hyperparameters for network in config_to_network.values()
        ]
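        # Split hyperparameters into those shared by every architecture and
        # those that vary; the varying ones are used below to name each
        # architecture.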
hyperparameter_keys = all_hyperparameters[0].keys()
assert all(
hyperparameters.keys() == hyperparameter_keys
for hyperparameters in all_hyperparameters)
constant_hyperparameter_keys = [
k for k in hyperparameter_keys
if all([
hyperparameters[k] == all_hyperparameters[0][k]
for hyperparameters in all_hyperparameters
])
]
        constant_hyperparameters = dict(
            (key, all_hyperparameters[0][key])
            for key in sorted(constant_hyperparameter_keys)
        )
def write_hyperparameters(fd, hyperparameters):
rows = []
for key in sorted(hyperparameters.keys()):
rows.append((key, json.dumps(hyperparameters[key])))
fd.write("\n")
fd.write(
tabulate(rows, ["Hyperparameter", "Value"], tablefmt="grid"))
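        # write_hyperparameters emits a tabulate "grid" table, for example
        # (hyperparameter name and value are illustrative):
        #
        #     +---------------------+-------+
        #     | Hyperparameter      | Value |
        #     +=====================+=======+
        #     | dropout_probability | 0.0   |
        #     +---------------------+-------+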
with open(args.out_models_info_rst, "w") as fd:
fd.write("Hyperparameters shared by all %d architectures:\n" %
write_hyperparameters(fd, constant_hypeparameters)
fd.write("\n")
configs = sorted(
config_to_alleles,
key=lambda s: len(config_to_alleles[s]),
reverse=True)
for (i, config) in enumerate(configs):
network = config_to_network[config]
lines = []
network.network().summary(print_fn=lines.append)
specific_hyperparameters = dict(
(key, value)
for (key, value) in network.hyperparameters.items()
                if key not in constant_hyperparameters)
def name_component(key, value):
if key == "locally_connected_layers":
return "lc=%d" % len(value)
elif key == "train_data":
return value["subset"] + "-data"
elif key == "layer_sizes":
(value,) = value
key = "size"
elif key == "dense_layer_l1_regularization":
if value == 0:
return "no-reg"
key = "reg"
return "%s=%s" % (key, value)
def sort_key(component):
if "lc" in component:
return (1, component)
if "reg" in component:
return (2, component)
return (0, component)
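                # Name components are ordered: sizes/data first, then locally
                # connected layers ("lc"), then regularization ("reg").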
components = [
name_component(key, value)
for (key, value) in specific_hyperparameters.items()
]
name = ",".join(sorted(components, key=sort_key))
fd.write("Architecture %d / %d %s\n" % (
(i + 1, len(config_to_network), name)))
fd.write(
"Selected in the ensembles for %d alleles: *%s*.\n\n" % (
len(config_to_alleles[config]),
", ".join(
sorted(config_to_alleles[config]))))
fd.write("\n\n::\n\n")
for line in lines:
fd.write(" ")
fd.write(line)
fd.write("\n")
print("Wrote: %s" % args.out_models_info_rst)
    if args.out_alleles_info_rst:
        # Alleles info rst
df = pandas.read_csv(
join(args.class1_models_dir, "unselected_summary.csv.bz2"))
train_df = pandas.read_csv(
join(args.class1_unselected_models_dir, "train_data.csv.bz2"))
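        # Per-allele training set sizes: all measurements, and the
        # quantitative-only subset.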
quantitative_train_measurements_by_allele = train_df.loc[
train_df.measurement_type == "quantitative"
].allele.value_counts()
train_measurements_by_allele = train_df.allele.value_counts()
df = df.sort_values("allele").copy()
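        # Strip parenthesized numeric annotations, e.g. "(|100.0|)"
        # (illustrative value, inferred from the regex), from the score plan.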
df["scoring"] = df.unselected_score_plan.str.replace(
"\\(\\|[0-9.]+\\|\\)", "")
df["models selected"] = df["num_models"]
df["sanitized_scoring"] = df.scoring.map(
lambda s: s.replace("mass-spec", "").replace("mse", "").replace("(", "").replace(")", "").strip()
)
df["mass spec scoring"] = df.sanitized_scoring.map(
lambda s: s.split(",")[0] if "," in s else ""
)
df["mean square error scoring"] = df.sanitized_scoring.map(
lambda s: s.split(",")[-1]
)
df["unselected percentile"] = df.unselected_accuracy_score_percentile
df["train data (all)"] = df.allele.map(train_measurements_by_allele)
df["train data (quantitative)"] = df.allele.map(
quantitative_train_measurements_by_allele)
def write_df(df, fd):
rows = [
row for (_, row) in df.iterrows()
]
fd.write("\n")
fd.write(
tabulate(rows,
[col.replace("_", " ") for col in df.columns],
tablefmt="grid"))
fd.write("\n\n")
with open(args.out_alleles_info_rst, "w") as fd:
fd.write("Supported alleles\n")
fd.write("+" * 80)
fd.write("\n\n")
common_cols = [
"allele",
"train data (all)",
"train data (quantitative)",
"mass spec scoring",
"mean square error scoring",
"unselected percentile",
"unselected_score",
"unselected_score_scrambled_mean",
]
sub_df = df.loc[df.retained][common_cols + [
"models selected",
]]
write_df(sub_df, fd)
fd.write("Rejected alleles\n")
fd.write("+" * 80)
fd.write("\n\n")
"Training for the following alleles was attempted but the "
"resulting models were excluded due to inadequate performance on "
"held out data.")
sub_df = df.loc[~df.retained][common_cols].sort_values("allele")
write_df(sub_df, fd)
print("Wrote: %s" % args.out_alleles_info_rst)
if __name__ == "__main__":
go(sys.argv[1:])
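
# Example invocation (script name and output paths are illustrative):
#
#     python generate.py \
#         --out-models-supported-alleles-rst _build/supported_alleles.rst \
#         --out-models-architecture-png _build/architecture.png \
#         --out-models-info-rst _build/models_info.rst \
#         --out-alleles-info-rst _build/alleles_info.rst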