diff --git a/docs/Makefile b/docs/Makefile index d38388c2d53992779a1cfc53a73be98fe2344233..3fa742c5c2b917820fbab839e91d562007c198af 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -53,12 +53,8 @@ help: .PHONY: generate generate: sphinx-apidoc -M -f -o _build/ ../mhcflurry - mhcflurry-downloads fetch models_class1 models_class1_unselected - python generate.py \ - --out-alleles-info-rst _build/_alleles_info.rst \ - --out-models-info-rst _build/_models_info.rst \ - --out-models-supported-alleles-rst _build/_models_supported_alleles.rst - + mhcflurry-downloads fetch models_class1_pan + python generate_class1_pan.py --out-dir model-info # Added by Tim: .PHONY: readme @@ -76,6 +72,7 @@ clean: mv $(BUILDDIR)/html /tmp/html-bk rm -rf $(BUILDDIR)/* mv /tmp/html-bk $(BUILDDIR)/html + rm -rf model-info .PHONY: html html: diff --git a/docs/generate_class1_pan.py b/docs/generate_class1_pan.py new file mode 100644 index 0000000000000000000000000000000000000000..080421fea06fe6fd70ecc9eb210e356a5f3b3007 --- /dev/null +++ b/docs/generate_class1_pan.py @@ -0,0 +1,244 @@ +""" +Generate certain RST files used in documentation. +""" +from __future__ import print_function +import sys +import argparse +from collections import OrderedDict, defaultdict +from os.path import join, exists +from os import mkdir + +import pandas +import logomaker + +from matplotlib import pyplot + +from mhcflurry.downloads import get_path +from mhcflurry.amino_acid import COMMON_AMINO_ACIDS + +AMINO_ACIDS = sorted(COMMON_AMINO_ACIDS) + +parser = argparse.ArgumentParser(usage=__doc__) +parser.add_argument( + "--class1-models-dir-with-ms", + metavar="DIR", + default=get_path( + "models_class1_pan", "models.with_mass_spec", test_exists=False), + help="Class1 models. Default: %(default)s", +) +parser.add_argument( + "--class1-models-dir-no-ms", + metavar="DIR", + default=get_path( + "models_class1_pan", "models.no_mass_spec", test_exists=False), + help="Class1 models. Default: %(default)s", +) +parser.add_argument( + "--logo-cutoff", + default=0.01, + type=float, + help="Fraction of top to use for motifs", +) +parser.add_argument( + "--length-cutoff", + default=0.0001, + type=float, + help="Fraction of top to use for length distribution", +) +parser.add_argument( + "--lengths", + default=[8, 9, 10, 11], + type=int, + help="Peptide lengths", +) +parser.add_argument( + "--out-dir", + metavar="DIR", + required=True, + help="Directory to write RSTs and images to", +) +parser.add_argument( + "--max-alleles", + type=int, + metavar="N", + help="Only use N alleles (for testing)", +) + + +def model_info(models_dir): + length_distributions_df = pandas.read_csv( + join(models_dir, "length_distributions.csv.bz2")) + frequency_matrices_df = pandas.read_csv( + join(models_dir, "frequency_matrices.csv.bz2")) + + distribution = frequency_matrices_df.loc[ + (frequency_matrices_df.cutoff_fraction == 1.0), AMINO_ACIDS + ].mean(0) + + normalized_frequency_matrices = frequency_matrices_df.copy() + normalized_frequency_matrices.loc[:, AMINO_ACIDS] = ( + normalized_frequency_matrices[AMINO_ACIDS] / distribution) + + return { + 'length_distributions': length_distributions_df, + 'normalized_frequency_matrices': normalized_frequency_matrices, + } + + +def write_logo( + normalized_frequency_matrices, + allele, + length, + cutoff, + models_label, + out_dir): + matrix = normalized_frequency_matrices.loc[ + (normalized_frequency_matrices.allele == allele) & + (normalized_frequency_matrices.length == length) & + (normalized_frequency_matrices.cutoff_fraction == cutoff) + ].set_index("position")[AMINO_ACIDS] + if matrix.shape[0] == 0: + return None + + matrix = (matrix.T / matrix.sum(1)).T # row normalize + + fig = pyplot.figure(figsize=(8,4)) + ss_logo = logomaker.Logo( + matrix, + width=.8, + vpad=.05, + fade_probabilities=True, + stack_order='small_on_top', + # color_scheme='dodgerblue', + # ax=ax, + ) + pyplot.title("%s %d-mer (%s)" % (allele, length, models_label)) + pyplot.xticks(matrix.index.values) + name = "%s_%dmer.%s.png" % (allele, length, models_label) + pyplot.savefig(join(out_dir, name)) + print("Wrote: ", name) + fig.clear() + pyplot.close(fig) + return name + + +def write_length_distribution( + length_distributions_df, allele, lengths, cutoff, models_label, out_dir): + length_distribution = length_distributions_df.loc[ + (length_distributions_df.allele == allele) & + (length_distributions_df.cutoff_fraction == cutoff) + ] + if length_distribution.shape[0] == 0: + return None + + length_distribution = length_distribution.set_index( + "length").reindex(lengths).fillna(0.0).reset_index() + + fig = pyplot.figure(figsize=(8, 2)) + length_distribution.plot(x="length", y="fraction", kind="bar", color="black") + pyplot.title("%s (%s)" % (allele, models_label)) + pyplot.xlabel("") + pyplot.xticks(rotation=0) + pyplot.gca().get_legend().remove() + name = "%s.lengths.%s.png" % (allele, models_label) + pyplot.savefig(join(out_dir, name)) + print("Wrote: ", name) + fig.clear() + pyplot.close(fig) + return name + + +def go(argv): + args = parser.parse_args(argv) + + if not exists(args.out_dir): + mkdir(args.out_dir) + + predictors = [ + ("with_mass_spec", args.class1_models_dir_with_ms), + ("no_mass_spec", args.class1_models_dir_no_ms), + ] + info_per_predictor = OrderedDict() + alleles = set() + for (label, models_dir) in predictors: + if not models_dir: + continue + info_per_predictor[label] = model_info(models_dir) + alleles.update( + info_per_predictor[label]["normalized_frequency_matrices"].allele.unique()) + + lines = [] + + def w(*pieces): + lines.extend(pieces) + + w('Motifs and length distributions from the pan-allele predictor') + w('=' * 80, "") + + w( + "Length distributions and binding motifs were calculated by ranking a " + "large set of random peptides (an equal number of peptides for each " + "length 8-15) by predicted affinity for each allele. " + "For length distribution, the top %g%% of peptides were collected and " + "their length distributions plotted. For sequence motifs, sequence " + "logos for the top %g%% " + "peptides for each length are shown.\n" % ( + args.length_cutoff * 100.0, + args.logo_cutoff * 100.0, + )) + + w(".. contents:: :local:", "") + + + def image(name): + if name is None: + return "" + return '.. image:: %s\n' % name + + alleles = sorted(alleles, key=lambda a: ("HLA" not in a, a)) + if args.max_alleles: + alleles = alleles[:args.max_alleles] + + for allele in alleles: + w(allele, "-" * 80, "") + for (label, info) in info_per_predictor.items(): + length_distribution = info["length_distributions"] + normalized_frequency_matrices = info["normalized_frequency_matrices"] + + length_distribution_image_path = write_length_distribution( + length_distributions_df=length_distribution, + allele=allele, + lengths=args.lengths, + cutoff=args.length_cutoff, + out_dir=args.out_dir, + models_label=label) + if not length_distribution_image_path: + continue + + w( + "*" + ( + "With mass-spec" if label == "with_mass_spec" else "Affinities only") + + "*\n") + w(image(length_distribution_image_path)) + + for length in args.lengths: + w(image(write_logo( + normalized_frequency_matrices=normalized_frequency_matrices, + allele=allele, + length=length, + cutoff=args.logo_cutoff, + out_dir=args.out_dir, + models_label=label, + ))) + w("") + + document_path = join(args.out_dir, "allele_motifs.rst") + with open(document_path, "w") as fd: + for line in lines: + fd.write(line) + fd.write("\n") + print("Wrote", document_path) + + +if __name__ == "__main__": + go(sys.argv[1:]) diff --git a/docs/index.rst b/docs/index.rst index 82e479bd80d781c0c7f30b3e24997b84b7f7cd18..2c892035c9ff9589dd3af293af769b19cb764577 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -7,6 +7,7 @@ MHCflurry documentation intro commandline_tutorial python_tutorial + model-info/allele_motifs commandline_tools api diff --git a/docs/requirements.txt b/docs/requirements.txt index af205bbbf1b5613450cffece92b58334dd06ea34..d5fe8445fec45be54234f7b691ff0aa82f13d954 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -8,3 +8,4 @@ pypandoc mhctools pydot tabulate +logomaker