From 0ac4daecd10edb6f033c99d71d0975b3fded085f Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Tue, 6 Feb 2018 16:54:05 -0500
Subject: [PATCH] update docs

---
 docs/Makefile                      |  1 -
 docs/commandline_tutorial.rst      |  2 +-
 docs/conf.py                       |  2 +-
 docs/generate.py                   | 69 ++++++++++++++++++++++++++----
 docs/models.rst                    | 43 ++++++++-----------
 docs/python_tutorial.rst           |  2 +-
 docs/requirements.txt              |  1 +
 mhcflurry/class1_neural_network.py |  7 +++
 8 files changed, 89 insertions(+), 38 deletions(-)

diff --git a/docs/Makefile b/docs/Makefile
index 3dfdf85d..8c1bf9d5 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -56,7 +56,6 @@ generate:
 	mhcflurry-downloads fetch models_class1 cross_validation_class1
 	python generate.py \
 	    --out-models-cv-rst _build/_models_cv.rst \
-	    --out-models-architecture-png _build/_models_architecture.png \
 	    --out-models-info-rst _build/_models_info.rst \
 	    --out-models-supported-alleles-rst _build/_models_supported_alleles.rst
 
diff --git a/docs/commandline_tutorial.rst b/docs/commandline_tutorial.rst
index 6dc803c8..ade306d6 100644
--- a/docs/commandline_tutorial.rst
+++ b/docs/commandline_tutorial.rst
@@ -111,7 +111,7 @@ training data. The data we use for our released predictors can be downloaded wit
 It looks like this:
 
 .. command-output::
-    bzcat "$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" | head -n 3
+    bzcat "$(mhcflurry-downloads path data_curated)/curated_training_data.no_mass_spec.csv.bz2" | head -n 3
     :shell:
     :nostderr:
 
diff --git a/docs/conf.py b/docs/conf.py
index be1d9ae6..98df4244 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -86,7 +86,7 @@ author = 'Timothy O\'Donnell'
 # The short X.Y version.
 
 # Added by Tim: reading version from mhcflurry __init__.py as in setup.py
-with open('../mhcflurry/__init__.py', 'r') as f:
+with open('../mhcflurry/version.py', 'r') as f:
     version = re.search(
         r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]',
         f.read(),
diff --git a/docs/generate.py b/docs/generate.py
index 848d04ce..fb39ade2 100644
--- a/docs/generate.py
+++ b/docs/generate.py
@@ -4,11 +4,14 @@ Generate certain RST files used in documentation.
 import sys
 import argparse
+import json
 from textwrap import wrap
+from collections import OrderedDict
 
 import pypandoc
 import pandas
 from keras.utils.vis_utils import plot_model
+from tabulate import tabulate
 
 from mhcflurry import __version__
 from mhcflurry.downloads import get_path
 
@@ -89,18 +92,66 @@ def go(argv):
         # Architecture information rst
         if predictor is None:
             predictor = Class1AffinityPredictor.load(args.class1_models_dir)
-        network = predictor.neural_networks[0].network()
-        lines = []
-        network.summary(print_fn=lines.append)
+
+        representative_networks = OrderedDict()
+        for network in predictor.neural_networks:
+            config = json.dumps(network.hyperparameters)
+            if config not in representative_networks:
+                representative_networks[config] = network
+
+        all_hyperparameters = [
+            network.hyperparameters for network in representative_networks.values()
+        ]
+        hyperparameter_keys = all_hyperparameters[0].keys()
+        assert all(
+            hyperparameters.keys() == hyperparameter_keys
+            for hyperparameters in all_hyperparameters)
+
+        constant_hyperparameter_keys = [
+            k for k in hyperparameter_keys
+            if all([
+                hyperparameters[k] == all_hyperparameters[0][k]
+                for hyperparameters in all_hyperparameters
+            ])
+        ]
+        constant_hyperparameters = dict(
+            (key, all_hyperparameters[0][key])
+            for key in sorted(constant_hyperparameter_keys)
+        )
+
+        def write_hyperparameters(fd, hyperparameters):
+            rows = []
+            for key in sorted(hyperparameters.keys()):
+                rows.append((key, json.dumps(hyperparameters[key])))
+            fd.write("\n")
+            fd.write(
+                tabulate(rows, ["Hyperparameter", "Value"], tablefmt="grid"))
 
         with open(args.out_models_info_rst, "w") as fd:
-            fd.write("Layers and parameters summary: ")
-            fd.write("\n\n::\n\n")
-            for line in lines:
-                fd.write("    ")
-                fd.write(line)
+            fd.write("Hyperparameters shared by all %d architectures:\n" %
+                len(representative_networks))
+            write_hyperparameters(fd, constant_hyperparameters)
+            fd.write("\n")
+            for (i, network) in enumerate(representative_networks.values()):
+                lines = []
+                network.network().summary(print_fn=lines.append)
+
+                fd.write("Architecture %d / %d:\n" % (
+                    (i + 1, len(representative_networks))))
+                fd.write("+" * 40)
                 fd.write("\n")
-            print("Wrote: %s" % args.out_models_info_rst)
+                write_hyperparameters(
+                    fd,
+                    dict(
+                        (key, value)
+                        for (key, value) in network.hyperparameters.items()
+                        if key not in constant_hyperparameters))
+                fd.write("\n\n::\n\n")
+                for line in lines:
+                    fd.write("    ")
+                    fd.write(line)
+                    fd.write("\n")
+            print("Wrote: %s" % args.out_models_info_rst)
 
     if args.out_models_cv_rst:
         # Models cv output
diff --git a/docs/models.rst b/docs/models.rst
index 4f7dee9e..96f2d124 100644
--- a/docs/models.rst
+++ b/docs/models.rst
@@ -1,35 +1,28 @@
 Details on the released models
 ===============================
 
-The released MHCflurry predictor consists of an ensemble of eight models for each
-supported allele. Each model in the ensemble was trained on a random 80% sample
-of the data for the allele, and the remaining 20% was used for early stopping.
-All models use the same architecture. The predictions are taken to be the geometric
-mean of the nM binding affinity predictions of the individual models. The script
-we run to train these models is in "downloads-generation/models_class1/GENERATE.sh"
-in the repository.
-
-Neural network architecture
+The released MHCflurry predictor consists of an ensemble of models for each
+supported allele. Each model in the ensemble was trained on a random 90% sample
+of the data for the allele, and the remaining data was used for early stopping.
+The predictions are taken to be the geometric mean of the nM binding affinity
+predictions of the individual models whose predictions fall in the middle 50% of
+values for a given prediction. The script we run to train these models is in
+"downloads-generation/models_class1/GENERATE.sh" in the repository.
+
+Neural network architectures
 -------------------------------------------------------------
 
-The neural network architecture is quite simple, consisting of a locally
-connected layer, a dense layer, and a sigmoid output.
-
 .. include:: /_build/_models_info.rst
 
-Architecture diagram:
-
-.. image:: /_build/_models_architecture.png
-
-Cross validation performance
--------------------------------------------------------------
+.. Cross validation performance
+.. -------------------------------------------------------------
 
-The accuracy of the MHCflurry downloadable models was estimated using 5-fold cross
-validation on the training data. The values shown here are the mean cross validation
-scores across folds.
+.. The accuracy of the MHCflurry downloadable models was estimated using 5-fold cross
+.. validation on the training data. The values shown here are the mean cross validation
+.. scores across folds.
 
-The AUC and F1 estimates use a 500 nM cutoff for distinguishing strong-binders
-from weak- or non-binders. The Kendall Tau score gives the rank correlation
-between the predicted and measured affinities; it uses no cutoff.
+.. The AUC and F1 estimates use a 500 nM cutoff for distinguishing strong-binders
+.. from weak- or non-binders. The Kendall Tau score gives the rank correlation
+.. between the predicted and measured affinities; it uses no cutoff.
 
-.. include:: /_build/_models_cv.rst
+.. .. include:: /_build/_models_cv.rst
diff --git a/docs/python_tutorial.rst b/docs/python_tutorial.rst
index 9b3d8b88..5840db46 100644
--- a/docs/python_tutorial.rst
+++ b/docs/python_tutorial.rst
@@ -65,7 +65,7 @@ We can get the path to this data from Python using `mhcflurry.downloads.get_path
 .. runblock:: pycon
 
     >>> from mhcflurry.downloads import get_path
-    >>> data_path = get_path("data_curated", "curated_training_data.csv.bz2")
+    >>> data_path = get_path("data_curated", "curated_training_data.no_mass_spec.csv.bz2")
     >>> data_path
 
 Now let's load it with pandas and filter to reasonably-sized peptides:
diff --git a/docs/requirements.txt b/docs/requirements.txt
index a8854750..af205bbb 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -7,3 +7,4 @@ numpydoc
 pypandoc
 mhctools
 pydot
+tabulate
diff --git a/mhcflurry/class1_neural_network.py b/mhcflurry/class1_neural_network.py
index c48b051f..936e7511 100644
--- a/mhcflurry/class1_neural_network.py
+++ b/mhcflurry/class1_neural_network.py
@@ -249,6 +249,7 @@ class Class1NeuralNetwork(object):
         result = dict(self.__dict__)
         result['_network'] = None
         result['network_weights'] = None
+        result['network_weights_loader'] = None
         return result
 
     @classmethod
@@ -277,6 +278,12 @@ class Class1NeuralNetwork(object):
         return instance
 
     def load_weights(self):
+        """
+        Load weights by evaluating self.network_weights_loader, if needed.
+
+        After calling this, self.network_weights_loader will be None and
+        self.network_weights will be the weights list, if available.
+        """
         if self.network_weights_loader:
             self.network_weights = self.network_weights_loader()
             self.network_weights_loader = None
-- 
GitLab
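
For reference, the ensemble aggregation described in the updated docs/models.rst
(geometric mean of the per-model nM predictions falling in the middle 50% of
values) can be sketched roughly as follows. This is an illustration only, not
MHCflurry's actual implementation; the function name and percentile handling
are assumptions::

    import numpy as np

    def aggregate_ensemble_nm(per_model_nm):
        # Hypothetical helper: keep the middle 50% of the per-model nM
        # predictions (25th-75th percentile by rank) and return their
        # geometric mean.
        values = np.sort(np.asarray(per_model_nm, dtype=float))
        low = int(np.floor(len(values) * 0.25))
        high = int(np.ceil(len(values) * 0.75))
        middle = values[low:high]
        return float(np.exp(np.log(middle).mean()))

    # Example: aggregate_ensemble_nm([40.0, 55.0, 60.0, 900.0]) keeps 55 and 60
    # and returns their geometric mean, about 57.4 nM.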
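
The hyperparameter tables written by the new generate.py code use tabulate's
"grid" format, which emits reStructuredText-compatible grid tables (hence the
tabulate entry added to docs/requirements.txt). A minimal standalone call, with
made-up example rows::

    from tabulate import tabulate

    rows = [
        ("dropout_probability", "0.0"),
        ("layer_sizes", "[16]"),
    ]
    # Prints a +---+---+ style grid table that Sphinx renders as an RST table.
    print(tabulate(rows, ["Hyperparameter", "Value"], tablefmt="grid"))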
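
The __getstate__ and load_weights changes in mhcflurry/class1_neural_network.py
document a lazy-loading pattern: network_weights_loader holds a callable that is
evaluated at most once, after which the resolved weights are cached and the
loader is dropped. A stripped-down sketch of that pattern (a simplified
stand-in, not the real class)::

    class LazyWeightsExample(object):
        def __init__(self, weights_loader):
            # weights_loader is a callable returning the weights list.
            self.network_weights = None
            self.network_weights_loader = weights_loader

        def load_weights(self):
            # Evaluate the loader once, cache the result, and drop the loader.
            if self.network_weights_loader:
                self.network_weights = self.network_weights_loader()
                self.network_weights_loader = None

        def __getstate__(self):
            # As in the patch: neither the loader nor the loaded weights are
            # pickled; they must be re-resolved after unpickling.
            result = dict(self.__dict__)
            result['network_weights'] = None
            result['network_weights_loader'] = None
            return result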