From a84ef111c3f580f0893b1425c357835e7f685951 Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Thu, 19 Mar 2020 10:42:53 -0400 Subject: [PATCH] working on docs --- docs/Makefile | 20 +++++---- docs/README.md | 3 -- docs/commandline_tools.rst | 20 +++------ docs/commandline_tutorial.rst | 81 ++++++++++++++++++----------------- docs/conf.py | 4 +- docs/generate_class1_pan.py | 39 +++++++++-------- docs/intro.rst | 3 +- docs/requirements.txt | 3 +- 8 files changed, 83 insertions(+), 90 deletions(-) diff --git a/docs/Makefile b/docs/Makefile index 3fa742c5..42553515 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -49,21 +49,19 @@ help: @echo " coverage to run coverage check of the documentation (if enabled)" @echo " dummy to check syntax errors of document sources" -# Added by Tim: +# Added by Tim .PHONY: generate generate: sphinx-apidoc -M -f -o _build/ ../mhcflurry mhcflurry-downloads fetch models_class1_pan python generate_class1_pan.py --out-dir model-info -# Added by Tim: -.PHONY: readme -readme: text - rm -f package_readme/readme.generated.txt - cat package_readme/readme_header.rst \ - _build/text/package_readme/readme.template.txt \ - > package_readme/readme.generated.txt - chmod 444 package_readme/readme.generated.txt # read only +# Added by Tim +.PHONY: generate_model_info +generate_model_info: + sphinx-apidoc -M -f -o _build/ ../mhcflurry + mhcflurry-downloads fetch models_class1_pan + python generate_class1_pan.py --out-dir model-info .PHONY: clean clean: @@ -72,6 +70,10 @@ clean: mv $(BUILDDIR)/html /tmp/html-bk rm -rf $(BUILDDIR)/* mv /tmp/html-bk $(BUILDDIR)/html + +# Added by Tim +.PHONY: clean_model_info +clean_model_info: rm -rf model-info .PHONY: html diff --git a/docs/README.md b/docs/README.md index b6330f59..615880b4 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,8 +1,5 @@ # MHCflurry documentation -Due to our use of `sphinxcontrib-autorun2` we unfortunately require Python 2.7 -to build to the docs. 
Python 3 is not supported. - To generate Sphinx documentation, from this directory run: ``` diff --git a/docs/commandline_tools.rst b/docs/commandline_tools.rst index e2f378b3..20f7cbe5 100644 --- a/docs/commandline_tools.rst +++ b/docs/commandline_tools.rst @@ -8,24 +8,14 @@ See also the :ref:`tutorial <commandline_tutorial>`. .. autoprogram:: mhcflurry.predict_command:parser :prog: mhcflurry-predict +.. _mhcflurry-predict-scan: + +.. autoprogram:: mhcflurry.predict_scan_command:parser + :prog: mhcflurry-predict-scan + .. _mhcflurry-downloads: .. autoprogram:: mhcflurry.downloads_command:parser :prog: mhcflurry-downloads -.. _mhcflurry-class1-train-allele-specific-models: - -.. autoprogram:: mhcflurry.train_allele_specific_models_command:parser - :prog: mhcflurry-class1-train-allele-specific-models - -.. _mhcflurry-calibrate-percentile-ranks: - -.. autoprogram:: mhcflurry.calibrate_percentile_ranks_command:parser - :prog: mhcflurry-calibrate-percentile-ranks - -.. _mhcflurry-class1-select-allele-specific-models: - -.. autoprogram:: mhcflurry.select_allele_specific_models_command:parser - :prog: mhcflurry-class1-select-allele-specific-models - diff --git a/docs/commandline_tutorial.rst b/docs/commandline_tutorial.rst index 8ccf28c2..bbcbe7fb 100644 --- a/docs/commandline_tutorial.rst +++ b/docs/commandline_tutorial.rst @@ -14,12 +14,12 @@ are distributed separately from the pip package and may be downloaded with the .. code-block:: shell - $ mhcflurry-downloads fetch models_class1 + $ mhcflurry-downloads fetch models_class1_presentation Files downloaded with :ref:`mhcflurry-downloads` are stored in a platform-specific directory. To get the path to downloaded data, you can use: -.. command-output:: mhcflurry-downloads path models_class1 +.. command-output:: mhcflurry-downloads path models_class1_presentation :nostderr: We also release a few other "downloads," such as curated training data and some @@ -28,6 +28,10 @@ experimental models. 
To see what's available and what you have downloaded, run: .. command-output:: mhcflurry-downloads info :nostderr: +Most users will only need ``models_class1_presentation``, however, as the +presentation predictor includes a peptide / MHC I binding affinity (BA) predictor +as well as an antigen processing (AP) predictor. + .. note:: The code we use for *generating* the downloads is in the @@ -37,8 +41,9 @@ experimental models. To see what's available and what you have downloaded, run: Generating predictions ---------------------- -The :ref:`mhcflurry-predict` command generates predictions from the command-line. -By default it will use the pre-trained models you downloaded above; other +The :ref:`mhcflurry-predict` command generates predictions for individual peptides +(as opposed to scanning protein sequences for epitopes). +By default it will use the pre-trained models you downloaded above. Other models can be used by specifying the ``--models`` argument. Running: @@ -68,6 +73,38 @@ on the Keras backend and other details. In most cases you'll want to specify the input as a CSV file instead of passing peptides and alleles as commandline arguments. See :ref:`mhcflurry-predict` docs. +Scanning protein sequences for predicted MHC I ligands +------------------------------------------------------ + +Starting in version 1.6.0, MHCflurry supports scanning proteins for MHC I binding +peptides using the ``mhcflurry-predict-scan`` command. + +We'll generate predictions across ``example.fasta``, a FASTA file with two short +sequences: + +.. literalinclude:: /example.fasta + +Here's the ``mhctools`` invocation. + +.. command-output:: + mhctools + --mhc-predictor mhcflurry + --input-fasta-file example.fasta + --mhc-alleles A02:01,A03:01 + --mhc-peptide-lengths 8,9,10,11 + --extract-subsequences + --output-csv /tmp/subsequence_predictions.csv + :ellipsis: 2,-2 + :nostderr: + +This will write a file giving predictions for all subsequences of the specified lengths: + +.. 
command-output:: + head -n 3 /tmp/subsequence_predictions.csv + +See the :ref:`mhcflurry-predict-scan` docs for more options. + + Fitting your own models ----------------------- @@ -115,42 +152,6 @@ It looks like this: :nostderr: -Scanning protein sequences for predicted epitopes -------------------------------------------------- - -The `mhctools <https://github.com/hammerlab/mhctools>`__ package -provides support for scanning protein sequences to find predicted -epitopes. It supports MHCflurry as well as other binding predictors. -Here is an example. - -First, install ``mhctools`` if it is not already installed: - -.. code-block:: shell - - $ pip install mhctools - -We'll generate predictions across ``example.fasta``, a FASTA file with two short -sequences: - -.. literalinclude:: /example.fasta - -Here's the ``mhctools`` invocation. See ``mhctools -h`` for more information. - -.. command-output:: - mhctools - --mhc-predictor mhcflurry - --input-fasta-file example.fasta - --mhc-alleles A02:01,A03:01 - --mhc-peptide-lengths 8,9,10,11 - --extract-subsequences - --output-csv /tmp/subsequence_predictions.csv - :ellipsis: 2,-2 - :nostderr: - -This will write a file giving predictions for all subsequences of the specified lengths: - -.. command-output:: - head -n 3 /tmp/subsequence_predictions.csv Environment variables diff --git a/docs/conf.py b/docs/conf.py index 3cc86178..29e2d3ea 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -54,7 +54,7 @@ extensions = [ 'sphinx.ext.viewcode', 'sphinx.ext.githubpages', 'numpydoc', - 'sphinxcontrib.autorun2', + 'sphinx_autorun', 'sphinxcontrib.programoutput', 'sphinxcontrib.autoprogram', 'sphinx.ext.githubpages', @@ -76,7 +76,7 @@ master_doc = 'index' # General information about the project. 
project = 'MHCflurry' -copyright = '2019, Timothy O\'Donnell' +copyright = 'Timothy O\'Donnell' author = 'Timothy O\'Donnell' # The version info for the project you're documenting, acts as replacement for diff --git a/docs/generate_class1_pan.py b/docs/generate_class1_pan.py index 785ecc3d..ef868a84 100644 --- a/docs/generate_class1_pan.py +++ b/docs/generate_class1_pan.py @@ -12,36 +12,24 @@ from os import mkdir import pandas import logomaker +import tqdm + from matplotlib import pyplot from mhcflurry.downloads import get_path from mhcflurry.amino_acid import COMMON_AMINO_ACIDS +from mhcflurry.class1_affinity_predictor import Class1AffinityPredictor AMINO_ACIDS = sorted(COMMON_AMINO_ACIDS) parser = argparse.ArgumentParser(usage=__doc__) parser.add_argument( - "--class1-models-dir-with-ms", - "--class1-models", + "--class1-models-dir", metavar="DIR", default=get_path( "models_class1_pan", "models.combined", test_exists=False), help="Class1 models. Default: %(default)s", ) -parser.add_argument( - "--class1-models-dir-no-ms", - metavar="DIR", - default=get_path( - "models_class1_pan", "models.no_mass_spec", test_exists=False), - help="Class1 models. Default: %(default)s", -) -parser.add_argument( - "--class1-models-dir-refined", - metavar="DIR", - default=get_path( - "models_class1_pan_refined", "models.affinity", test_exists=False), - help="Class1 refined models. 
Default: %(default)s", -) parser.add_argument( "--logo-cutoff", default=0.01, @@ -84,6 +72,9 @@ parser.add_argument( def model_info(models_dir): + allele_to_sequence = Class1AffinityPredictor.load( + models_dir).allele_to_sequence + length_distributions_df = pandas.read_csv( join(models_dir, "length_distributions.csv.bz2")) frequency_matrices_df = pandas.read_csv( @@ -104,10 +95,21 @@ def model_info(models_dir): normalized_frequency_matrices.loc[:, AMINO_ACIDS] = ( normalized_frequency_matrices[AMINO_ACIDS] / distribution) + sequence_to_alleles = defaultdict(list) + for allele in normalized_frequency_matrices.allele.unique(): + sequence = allele_to_sequence[allele] + sequence_to_alleles[sequence].append(allele) + + allele_equivalance_classes = sorted([ + sorted(equivalence_group) + for equivalence_group in sequence_to_alleles.values() + ], key=lambda equivalence_group: equivalence_group[0]) + return { 'length_distributions': length_distributions_df, 'normalized_frequency_matrices': normalized_frequency_matrices, 'observations_per_allele': observations_per_allele, + 'allele_equivalance_classes': allele_equivalance_classes, } @@ -191,7 +193,7 @@ def go(argv): mkdir(args.out_dir) predictors = [ - ("combined", args.class1_models_dir_with_ms), + ("combined", args.class1_models_dir), ] info_per_predictor = OrderedDict() alleles = set() @@ -224,7 +226,6 @@ def go(argv): w(".. 
contents:: :local:", "") - def image(name): if name is None: return "" @@ -234,7 +235,7 @@ def go(argv): if args.max_alleles: alleles = alleles[:args.max_alleles] - for allele in alleles: + for allele in tqdm.tqdm(alleles): w(allele, "-" * 80, "") for (label, info) in info_per_predictor.items(): length_distribution = info["length_distributions"] diff --git a/docs/intro.rst b/docs/intro.rst index 6c0c260d..dbd82ac4 100644 --- a/docs/intro.rst +++ b/docs/intro.rst @@ -5,7 +5,8 @@ MHCflurry is an open source package for peptide/MHC I binding affinity predictio provides competitive accuracy with a fast and documented implementation. You can download pre-trained MHCflurry models fit to affinity measurements -deposited in IEDB or train a MHCflurry predictor on your own data. +deposited in IEDB (and a few other sources) +or train a MHCflurry predictor on your own data. Currently only allele-specific prediction is implemented, in which separate models are trained for each allele. The released models therefore support a fixed set of common diff --git a/docs/requirements.txt b/docs/requirements.txt index d5fe8445..d47a6bca 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,5 +1,5 @@ sphinx -sphinxcontrib-autorun2 +sphinx-autorun sphinxcontrib-programoutput sphinxcontrib-autoprogram sphinx-rtd-theme @@ -9,3 +9,4 @@ mhctools pydot tabulate logomaker +tqdm -- GitLab