From a84ef111c3f580f0893b1425c357835e7f685951 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Thu, 19 Mar 2020 10:42:53 -0400
Subject: [PATCH] working on docs

---
 docs/Makefile                 | 20 +++++----
 docs/README.md                |  3 --
 docs/commandline_tools.rst    | 20 +++------
 docs/commandline_tutorial.rst | 81 ++++++++++++++++++-----------------
 docs/conf.py                  |  4 +-
 docs/generate_class1_pan.py   | 39 +++++++++--------
 docs/intro.rst                |  3 +-
 docs/requirements.txt         |  3 +-
 8 files changed, 83 insertions(+), 90 deletions(-)

diff --git a/docs/Makefile b/docs/Makefile
index 3fa742c5..42553515 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -49,21 +49,19 @@ help:
 	@echo "  coverage   to run coverage check of the documentation (if enabled)"
 	@echo "  dummy      to check syntax errors of document sources"
 
-# Added by Tim:
+# Added by Tim
 .PHONY: generate
 generate:
 	sphinx-apidoc -M -f -o _build/ ../mhcflurry
 	mhcflurry-downloads fetch models_class1_pan
 	python generate_class1_pan.py --out-dir model-info
 
-# Added by Tim:
-.PHONY: readme
-readme: text
-	rm -f package_readme/readme.generated.txt
-	cat package_readme/readme_header.rst \
-	    _build/text/package_readme/readme.template.txt \
-	    > package_readme/readme.generated.txt
-	chmod 444 package_readme/readme.generated.txt  # read only
+# Added by Tim
+.PHONY: generate_model_info
+generate_model_info:
+	sphinx-apidoc -M -f -o _build/ ../mhcflurry
+	mhcflurry-downloads fetch models_class1_pan
+	python generate_class1_pan.py --out-dir model-info
 
 .PHONY: clean
 clean:
@@ -72,6 +70,10 @@ clean:
 	mv $(BUILDDIR)/html /tmp/html-bk
 	rm -rf $(BUILDDIR)/*
 	mv /tmp/html-bk $(BUILDDIR)/html
+
+# Added by Tim
+.PHONY: clean_model_info
+clean_model_info:
 	rm -rf model-info
 
 .PHONY: html
diff --git a/docs/README.md b/docs/README.md
index b6330f59..615880b4 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,8 +1,5 @@
 # MHCflurry documentation
 
-Due to our use of `sphinxcontrib-autorun2` we unfortunately require Python 2.7
-to build to the docs. Python 3 is not supported.
-
 To generate Sphinx documentation, from this directory run:
 
 ```
diff --git a/docs/commandline_tools.rst b/docs/commandline_tools.rst
index e2f378b3..20f7cbe5 100644
--- a/docs/commandline_tools.rst
+++ b/docs/commandline_tools.rst
@@ -8,24 +8,14 @@ See also the :ref:`tutorial <commandline_tutorial>`.
 .. autoprogram:: mhcflurry.predict_command:parser
     :prog: mhcflurry-predict
 
+.. _mhcflurry-predict-scan:
+
+.. autoprogram:: mhcflurry.predict_scan_command:parser
+    :prog: mhcflurry-predict-scan
+
 .. _mhcflurry-downloads:
 
 .. autoprogram:: mhcflurry.downloads_command:parser
     :prog: mhcflurry-downloads
 
-.. _mhcflurry-class1-train-allele-specific-models:
-
-.. autoprogram:: mhcflurry.train_allele_specific_models_command:parser
-    :prog: mhcflurry-class1-train-allele-specific-models
-
-.. _mhcflurry-calibrate-percentile-ranks:
-
-.. autoprogram:: mhcflurry.calibrate_percentile_ranks_command:parser
-    :prog: mhcflurry-calibrate-percentile-ranks
-
-.. _mhcflurry-class1-select-allele-specific-models:
-
-.. autoprogram:: mhcflurry.select_allele_specific_models_command:parser
-    :prog: mhcflurry-class1-select-allele-specific-models
-
 
diff --git a/docs/commandline_tutorial.rst b/docs/commandline_tutorial.rst
index 8ccf28c2..bbcbe7fb 100644
--- a/docs/commandline_tutorial.rst
+++ b/docs/commandline_tutorial.rst
@@ -14,12 +14,12 @@ are distributed separately from the pip package and may be downloaded with the
 
 .. code-block:: shell
 
-    $ mhcflurry-downloads fetch models_class1
+    $ mhcflurry-downloads fetch models_class1_presentation
 
 Files downloaded with :ref:`mhcflurry-downloads` are stored in a platform-specific
 directory. To get the path to downloaded data, you can use:
 
-.. command-output:: mhcflurry-downloads path models_class1
+.. command-output:: mhcflurry-downloads path models_class1_presentation
     :nostderr:
 
 We also release a few other "downloads," such as curated training data and some
@@ -28,6 +28,10 @@ experimental models. To see what's available and what you have downloaded, run:
 .. command-output:: mhcflurry-downloads info
     :nostderr:
 
+Most users will only need ``models_class1_presentation``, however, as the
+presentation predictor includes a peptide / MHC I binding affinity (BA) predictor
+as well as an antigen processing (AP) predictor.
+
 .. note::
 
     The code we use for *generating* the downloads is in the
@@ -37,8 +41,9 @@ experimental models. To see what's available and what you have downloaded, run:
 Generating predictions
 ----------------------
 
-The :ref:`mhcflurry-predict` command generates predictions from the command-line.
-By default it will use the pre-trained models you downloaded above; other
+The :ref:`mhcflurry-predict` command generates predictions for individual peptides
+(as opposed to scanning protein sequences for epitopes).
+By default it will use the pre-trained models you downloaded above. Other
 models can be used by specifying the ``--models`` argument.
 
 Running:
@@ -68,6 +73,38 @@ on the Keras backend and other details.
 In most cases you'll want to specify the input as a CSV file instead of passing
 peptides and alleles as commandline arguments. See :ref:`mhcflurry-predict` docs.
 
+Scanning protein sequences for predicted MHC I ligands
+------------------------------------------------------
+
+Starting in version 1.6.0, MHCflurry supports scanning proteins for MHC I binding
+peptides using the ``mhcflurry-predict-scan`` command.
+
+We'll generate predictions across ``example.fasta``, a FASTA file with two short
+sequences:
+
+.. literalinclude:: /example.fasta
+
+Here's the ``mhctools`` invocation (install it with ``pip install mhctools`` if needed; see ``mhctools -h`` for more information).
+
+.. command-output::
+    mhctools
+        --mhc-predictor mhcflurry
+        --input-fasta-file example.fasta
+        --mhc-alleles A02:01,A03:01
+        --mhc-peptide-lengths 8,9,10,11
+        --extract-subsequences
+        --output-csv /tmp/subsequence_predictions.csv
+    :ellipsis: 2,-2
+    :nostderr:
+
+This will write a file giving predictions for all subsequences of the specified lengths:
+
+.. command-output::
+    head -n 3 /tmp/subsequence_predictions.csv
+
+See the :ref:`mhcflurry-predict-scan` docs for more options.
+
+
 Fitting your own models
 -----------------------
 
@@ -115,42 +152,6 @@ It looks like this:
     :nostderr:
 
 
-Scanning protein sequences for predicted epitopes
--------------------------------------------------
-
-The `mhctools <https://github.com/hammerlab/mhctools>`__ package
-provides support for scanning protein sequences to find predicted
-epitopes. It supports MHCflurry as well as other binding predictors.
-Here is an example.
-
-First, install ``mhctools`` if it is not already installed:
-
-.. code-block:: shell
-
-    $ pip install mhctools
-
-We'll generate predictions across ``example.fasta``, a FASTA file with two short
-sequences:
-
-.. literalinclude:: /example.fasta
-
-Here's the ``mhctools`` invocation. See ``mhctools -h`` for more information.
-
-.. command-output::
-    mhctools
-        --mhc-predictor mhcflurry
-        --input-fasta-file example.fasta
-        --mhc-alleles A02:01,A03:01
-        --mhc-peptide-lengths 8,9,10,11
-        --extract-subsequences
-        --output-csv /tmp/subsequence_predictions.csv
-    :ellipsis: 2,-2
-    :nostderr:
-
-This will write a file giving predictions for all subsequences of the specified lengths:
-
-.. command-output::
-    head -n 3 /tmp/subsequence_predictions.csv
 
 
 Environment variables
diff --git a/docs/conf.py b/docs/conf.py
index 3cc86178..29e2d3ea 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -54,7 +54,7 @@ extensions = [
     'sphinx.ext.viewcode',
     'sphinx.ext.githubpages',
     'numpydoc',
-    'sphinxcontrib.autorun2',
+    'sphinx_autorun',
     'sphinxcontrib.programoutput',
     'sphinxcontrib.autoprogram',
     'sphinx.ext.githubpages',
@@ -76,7 +76,7 @@ master_doc = 'index'
 
 # General information about the project.
 project = 'MHCflurry'
-copyright = '2019, Timothy O\'Donnell'
+copyright = 'Timothy O\'Donnell'
 author = 'Timothy O\'Donnell'
 
 # The version info for the project you're documenting, acts as replacement for
diff --git a/docs/generate_class1_pan.py b/docs/generate_class1_pan.py
index 785ecc3d..ef868a84 100644
--- a/docs/generate_class1_pan.py
+++ b/docs/generate_class1_pan.py
@@ -12,36 +12,24 @@ from os import mkdir
 import pandas
 import logomaker
 
+import tqdm
+
 from matplotlib import pyplot
 
 from mhcflurry.downloads import get_path
 from mhcflurry.amino_acid import COMMON_AMINO_ACIDS
+from mhcflurry.class1_affinity_predictor import Class1AffinityPredictor
 
 AMINO_ACIDS = sorted(COMMON_AMINO_ACIDS)
 
 parser = argparse.ArgumentParser(usage=__doc__)
 parser.add_argument(
-    "--class1-models-dir-with-ms",
-    "--class1-models",
+    "--class1-models-dir",
     metavar="DIR",
     default=get_path(
         "models_class1_pan", "models.combined", test_exists=False),
     help="Class1 models. Default: %(default)s",
 )
-parser.add_argument(
-    "--class1-models-dir-no-ms",
-    metavar="DIR",
-    default=get_path(
-        "models_class1_pan", "models.no_mass_spec", test_exists=False),
-    help="Class1 models. Default: %(default)s",
-)
-parser.add_argument(
-    "--class1-models-dir-refined",
-    metavar="DIR",
-    default=get_path(
-        "models_class1_pan_refined", "models.affinity", test_exists=False),
-    help="Class1 refined models. Default: %(default)s",
-)
 parser.add_argument(
     "--logo-cutoff",
     default=0.01,
@@ -84,6 +72,9 @@ parser.add_argument(
 
 
 def model_info(models_dir):
+    allele_to_sequence = Class1AffinityPredictor.load(
+        models_dir).allele_to_sequence
+
     length_distributions_df = pandas.read_csv(
         join(models_dir, "length_distributions.csv.bz2"))
     frequency_matrices_df = pandas.read_csv(
@@ -104,10 +95,21 @@ def model_info(models_dir):
     normalized_frequency_matrices.loc[:, AMINO_ACIDS] = (
             normalized_frequency_matrices[AMINO_ACIDS] / distribution)
 
+    sequence_to_alleles = defaultdict(list)
+    for allele in normalized_frequency_matrices.allele.unique():
+        sequence = allele_to_sequence[allele]
+        sequence_to_alleles[sequence].append(allele)
+
+    allele_equivalance_classes = sorted([
+        sorted(equivalence_group)
+        for equivalence_group in sequence_to_alleles.values()
+    ], key=lambda equivalence_group: equivalence_group[0])
+
     return {
         'length_distributions': length_distributions_df,
         'normalized_frequency_matrices': normalized_frequency_matrices,
         'observations_per_allele': observations_per_allele,
+        'allele_equivalance_classes': allele_equivalance_classes,
     }
 
 
@@ -191,7 +193,7 @@ def go(argv):
         mkdir(args.out_dir)
 
     predictors = [
-        ("combined", args.class1_models_dir_with_ms),
+        ("combined", args.class1_models_dir),
     ]
     info_per_predictor = OrderedDict()
     alleles = set()
@@ -224,7 +226,6 @@ def go(argv):
 
     w(".. contents:: :local:", "")
 
-
     def image(name):
         if name is None:
             return ""
@@ -234,7 +235,7 @@ def go(argv):
     if args.max_alleles:
         alleles = alleles[:args.max_alleles]
 
-    for allele in alleles:
+    for allele in tqdm.tqdm(alleles):
         w(allele, "-" * 80, "")
         for (label, info) in info_per_predictor.items():
             length_distribution = info["length_distributions"]
diff --git a/docs/intro.rst b/docs/intro.rst
index 6c0c260d..dbd82ac4 100644
--- a/docs/intro.rst
+++ b/docs/intro.rst
@@ -5,7 +5,8 @@ MHCflurry is an open source package for peptide/MHC I binding affinity predictio
 provides competitive accuracy with a fast and documented implementation.
 
 You can download pre-trained MHCflurry models fit to affinity measurements
-deposited in IEDB or train a MHCflurry predictor on your own data.
+deposited in IEDB (and a few other sources)
+or train a MHCflurry predictor on your own data.
 
 Currently only allele-specific prediction is implemented, in which separate models
 are trained for each allele. The released models therefore support a fixed set of common
diff --git a/docs/requirements.txt b/docs/requirements.txt
index d5fe8445..d47a6bca 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,5 +1,5 @@
 sphinx
-sphinxcontrib-autorun2
+sphinx-autorun
 sphinxcontrib-programoutput
 sphinxcontrib-autoprogram
 sphinx-rtd-theme
@@ -9,3 +9,4 @@ mhctools
 pydot
 tabulate
 logomaker
+tqdm
-- 
GitLab