From 3dca1947c6b8a320245827609e83b92a458cde23 Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Fri, 20 Mar 2020 16:18:55 -0400 Subject: [PATCH] python tutorial --- .travis.yml | 1 + docs/Makefile | 16 +- docs/README.md | 8 + docs/commandline_tutorial.rst | 115 ++++---- docs/conf.py | 12 +- docs/doctest.sh | 7 + docs/generate.py | 301 --------------------- docs/generate_class1_pan.py | 278 ------------------- docs/intro.rst | 39 +-- docs/python_tutorial.rst | 284 ++++++++++--------- docs/requirements.txt | 3 +- mhcflurry/__init__.py | 5 +- mhcflurry/class1_presentation_predictor.py | 70 +---- mhcflurry/predict_command.py | 2 +- 14 files changed, 282 insertions(+), 859 deletions(-) create mode 100755 docs/doctest.sh delete mode 100644 docs/generate.py delete mode 100644 docs/generate_class1_pan.py diff --git a/.travis.yml b/.travis.yml index 3dcd0481..db7d5c23 100644 --- a/.travis.yml +++ b/.travis.yml @@ -68,3 +68,4 @@ script: --already-downloaded-dir /tmp/downloads - mhcflurry-downloads info # just to test this command works - nosetests --with-timer -sv test + - cd docs && bash ./doctest.sh diff --git a/docs/Makefile b/docs/Makefile index 42553515..9dfd2cce 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -15,7 +15,7 @@ endif # Internal variables. PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter -ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +ALLSPHINXOPTS = -v -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . # the i18n builder cannot share the environment and doctrees with the others I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
@@ -53,15 +53,6 @@ help: .PHONY: generate generate: sphinx-apidoc -M -f -o _build/ ../mhcflurry - mhcflurry-downloads fetch models_class1_pan - python generate_class1_pan.py --out-dir model-info - -# Added by Tim -.PHONY: generate_model_info -generate_model_info: - sphinx-apidoc -M -f -o _build/ ../mhcflurry - mhcflurry-downloads fetch models_class1_pan - python generate_class1_pan.py --out-dir model-info .PHONY: clean clean: @@ -71,11 +62,6 @@ clean: rm -rf $(BUILDDIR)/* mv /tmp/html-bk $(BUILDDIR)/html -# Added by Tim -.PHONY: clean_model_info -clean_model_info: - rm -rf model-info - .PHONY: html html: $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html diff --git a/docs/README.md b/docs/README.md index 615880b4..2f712d4c 100644 --- a/docs/README.md +++ b/docs/README.md @@ -9,3 +9,11 @@ $ make generate html Documentation is written to the _build/ directory. These files should not be checked into the repo. + +To test example code: +``` +$ make doctest +``` + +Then take a look at _build/doctest for detailed output. + diff --git a/docs/commandline_tutorial.rst b/docs/commandline_tutorial.rst index bbcbe7fb..2014ec55 100644 --- a/docs/commandline_tutorial.rst +++ b/docs/commandline_tutorial.rst @@ -22,11 +22,9 @@ directory. To get the path to downloaded data, you can use: .. command-output:: mhcflurry-downloads path models_class1_presentation :nostderr: -We also release a few other "downloads," such as curated training data and some -experimental models. To see what's available and what you have downloaded, run: - -.. command-output:: mhcflurry-downloads info - :nostderr: +We also release a number of other "downloads," such as curated training data and some +experimental models. To see what's available and what you have downloaded, run +``mhcflurry-downloads info``. 
Most users will only need ``models_class1_presentation``, however, as the presentation predictor includes a peptide / MHC I binding affinity (BA) predictor @@ -42,8 +40,8 @@ Generating predictions ---------------------- The :ref:`mhcflurry-predict` command generates predictions for individual peptides -(as opposed to scanning protein sequences for epitopes). -By default it will use the pre-trained models you downloaded above. Other +(see the next section for how to scan protein sequences for epitopes). By +default it will use the pre-trained models you downloaded above. Other models can be used by specifying the ``--models`` argument. Running: @@ -60,23 +58,39 @@ results in a file like this: .. command-output:: cat /tmp/predictions.csv -The predictions are given as affinities (KD) in nM in the ``mhcflurry_prediction`` -column. The other fields give the 5-95 percentile predictions across -the models in the ensemble and the quantile of the affinity prediction among -a large number of random peptides tested on that allele. +The binding affinity predictions are given as affinities (KD) in nM in the +``mhcflurry_affinity`` column. Lower values indicate stronger binders. A commonly-used +threshold for peptides with a reasonable chance of being immunogenic is 500 nM. -The predictions shown above were generated with MHCflurry |version|. Different versions of -MHCflurry can give considerably different results. Even -on the same version, exact predictions may vary (up to about 1 nM) depending -on the Keras backend and other details. +The ``mhcflurry_affinity_percentile`` gives the quantile of the affinity +prediction among a large number of random peptides tested on that allele. Lower +is stronger. Two percent is a commonly-used threshold. + +The last two columns give the antigen processing and presentation scores, +respectively. These range from 0 to 1 with higher values indicating more +favorable processing or presentation. + +.. 
note:: + + The processing predictor is experimental and under + development. It models allele-independent effects that influence whether a + peptide will be detected in a mass spec experiment. The presentation score is + a simple logistic regression model that combines the (log) binding affinity + prediction with the processing score to give a composite prediction. The resulting + prediction is appropriate for prioritizing potential epitopes to test, but no + thresholds have yet been established for what constitutes a "high enough" + presentation score. In most cases you'll want to specify the input as a CSV file instead of passing -peptides and alleles as commandline arguments. See :ref:`mhcflurry-predict` docs. +peptides and alleles as commandline arguments. If you're relying on the +processing or presentation scores, you may also want to pass the upstream and +downstream sequences of the peptides from their source proteins for potentially more +accurate cleavage prediction. See the :ref:`mhcflurry-predict` docs. Scanning protein sequences for predicted MHC I ligands ------------------------------------------------- -Starting in version 1.6.0, MHCflurry supports scanning proteins for MHC I binding +Starting in version 1.6.0, MHCflurry supports scanning proteins for MHC-binding peptides using the ``mhcflurry-predict-scan`` command. We'll generate predictions across ``example.fasta``, a FASTA file with two short @@ -84,33 +98,31 @@ sequences: .. literalinclude:: /example.fasta -Here's the ``mhctools`` invocation. +Here's the ``mhcflurry-predict-scan`` invocation to scan the proteins for +binders to either of two MHC I genotypes: .. 
command-output:: - mhctools - --mhc-predictor mhcflurry - --input-fasta-file example.fasta - --mhc-alleles A02:01,A03:01 - --mhc-peptide-lengths 8,9,10,11 - --extract-subsequences - --output-csv /tmp/subsequence_predictions.csv - :ellipsis: 2,-2 + mhcflurry-predict-scan + example.fasta + --alleles + HLA-A*02:01,HLA-A*03:01,HLA-B*57:01,HLA-B*45:01,HLA-C*02:02,HLA-C*07:02 + HLA-A*01:01,HLA-A*02:06,HLA-B*44:02,HLA-B*07:02,HLA-C*01:02,HLA-C*03:01 + --results-filtered affinity_percentile + --threshold-affinity-percentile 1.0 :nostderr: -This will write a file giving predictions for all subsequences of the specified lengths: - -.. command-output:: - head -n 3 /tmp/subsequence_predictions.csv - See the :ref:`mhcflurry-predict-scan` docs for more options. Fitting your own models ----------------------- -The :ref:`mhcflurry-class1-train-allele-specific-models` command is used to -fit models to training data. The models we release with MHCflurry are trained -with a command like: +If you have your own data and want to fit your own MHCflurry models, you have +a few options. If you have data for only one or a few MHC I alleles, the best +approach is to use the +:ref:`mhcflurry-class1-train-allele-specific-models` command to fit an +"allele-specific" predictor, in which separate neural networks are used for +each allele. Here's an example: .. code-block:: shell @@ -120,21 +132,23 @@ with a command like: --min-measurements-per-allele 75 \ --out-models-dir models -MHCflurry predictors are serialized to disk as many files in a directory. The -command above will write the models to the output directory specified by the -``--out-models-dir`` argument. This directory has files like: +.. note:: -.. program-output:: - ls "$(mhcflurry-downloads path models_class1)/models" - :shell: - :nostderr: - :ellipsis: 4,-4 + MHCflurry predictors are serialized to disk as many files in a directory. 
The + command above will write the models to the output directory specified by the + ``--out-models-dir`` argument. This directory has files like: + + .. program-output:: + ls "$(mhcflurry-downloads path models_class1)/models" + :shell: + :nostderr: + :ellipsis: 4,-4 -The ``manifest.csv`` file gives metadata for all the models used in the predictor. -There will be a ``weights_...`` file for each model giving its weights -(the parameters for the neural network). The ``percent_ranks.csv`` stores a -histogram of model predictions for each allele over a large number of random -peptides. It is used for generating the percent ranks at prediction time. + The ``manifest.csv`` file gives metadata for all the models used in the predictor. + There will be a ``weights_...`` file for each model giving its weights + (the parameters for the neural network). The ``percent_ranks.csv`` stores a + histogram of model predictions for each allele over a large number of random + peptides. It is used for generating the percent ranks at prediction time. To call :ref:`mhcflurry-class1-train-allele-specific-models` you'll need some training data. The data we use for our released predictors can be downloaded with @@ -151,7 +165,12 @@ It looks like this: :shell: :nostderr: - +To fit pan-allele models like the ones released with MHCflurry, you can use +a similar tool, ``mhcflurry-class1-train-pan-allele-models``. You'll probably +also want to take a look at the scripts used to generate the production models, +which are available in the *downloads-generation* directory in the MHCflurry +repository. The production MHCflurry models were fit using a cluster with several +dozen GPUs over a period of about two days. 
Environment variables diff --git a/docs/conf.py b/docs/conf.py index 29e2d3ea..533a841e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -54,12 +54,22 @@ extensions = [ 'sphinx.ext.viewcode', 'sphinx.ext.githubpages', 'numpydoc', - 'sphinx_autorun', 'sphinxcontrib.programoutput', 'sphinxcontrib.autoprogram', 'sphinx.ext.githubpages', ] +doctest_global_setup = ''' +import logging +logging.getLogger('matplotlib').disabled = True +logging.getLogger('tensorflow').disabled = True +import numpy +import pandas +import mhcflurry +''' + +doctest_test_doctest_blocks = '' + # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] diff --git a/docs/doctest.sh b/docs/doctest.sh new file mode 100755 index 00000000..04841694 --- /dev/null +++ b/docs/doctest.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +make doctest +RETVAL=$? +echo doctest returned $RETVAL +cat _build/doctest/output.txt +exit $RETVAL diff --git a/docs/generate.py b/docs/generate.py deleted file mode 100644 index 46b93080..00000000 --- a/docs/generate.py +++ /dev/null @@ -1,301 +0,0 @@ -""" -Generate certain RST files used in documentation. -""" - -import sys -import argparse -import json -from textwrap import wrap -from collections import OrderedDict, defaultdict -from os.path import join - -import pypandoc -import pandas -from keras.utils.vis_utils import plot_model -from tabulate import tabulate - -from mhcflurry import __version__ -from mhcflurry.downloads import get_path -from mhcflurry.class1_affinity_predictor import Class1AffinityPredictor - -parser = argparse.ArgumentParser(usage=__doc__) -parser.add_argument( - "--cv-summary-csv", - metavar="FILE.csv", - default=get_path( - "cross_validation_class1", "summary.all.csv", test_exists=False), - help="Cross validation scores summary. Default: %(default)s", -) -parser.add_argument( - "--class1-models-dir", - metavar="DIR", - default=get_path( - "models_class1", "models", test_exists=False), - help="Class1 models. 
Default: %(default)s", -) -parser.add_argument( - "--class1-unselected-models-dir", - metavar="DIR", - default=get_path( - "models_class1_unselected", "models", test_exists=False), - help="Class1 unselected models. Default: %(default)s", -) -parser.add_argument( - "--out-alleles-info-rst", - metavar="FILE.rst", - help="rst output file", -) -parser.add_argument( - "--out-models-info-rst", - metavar="FILE.rst", - help="rst output file", -) -parser.add_argument( - "--out-models-architecture-png", - metavar="FILE.png", - help="png output file", -) -parser.add_argument( - "--out-models-supported-alleles-rst", - metavar="FILE.png", - help="png output file", -) - - -def go(argv): - args = parser.parse_args(argv) - - predictor = None - unselected_predictor = None - - if args.out_models_supported_alleles_rst: - # Supported alleles rst - if predictor is None: - predictor = Class1AffinityPredictor.load(args.class1_models_dir) - with open(args.out_models_supported_alleles_rst, "w") as fd: - fd.write( - "Models released with the current version of MHCflurry (%s) " - "support peptides of " - "length %d-%d and the following %d alleles:\n\n::\n\n\t%s\n\n" % ( - __version__, - predictor.supported_peptide_lengths[0], - predictor.supported_peptide_lengths[1], - len(predictor.supported_alleles), - "\n\t".join( - wrap(", ".join(predictor.supported_alleles))))) - print("Wrote: %s" % args.out_models_supported_alleles_rst) - - if args.out_models_architecture_png: - # Architecture diagram - raise NotImplementedError() # for now - if predictor is None: - predictor = Class1AffinityPredictor.load(args.class1_models_dir) - network = predictor.neural_networks[0].network() - plot_model( - network, - to_file=args.out_models_architecture_png, - show_layer_names=True, - show_shapes=True) - print("Wrote: %s" % args.out_models_architecture_png) - - if args.out_models_info_rst: - # Architecture information rst - if predictor is None: - predictor = Class1AffinityPredictor.load(args.class1_models_dir) - 
if unselected_predictor is None: - unselected_predictor = Class1AffinityPredictor.load( - args.class1_unselected_models_dir) - - config_to_network = {} - config_to_alleles = {} - for (allele, networks) in unselected_predictor.allele_to_allele_specific_models.items(): - for network in networks: - config = json.dumps(network.hyperparameters) - if config not in config_to_network: - config_to_network[config] = network - config_to_alleles[config] = [] - - for (allele, networks) in predictor.allele_to_allele_specific_models.items(): - for network in networks: - config = json.dumps(network.hyperparameters) - assert config in config_to_network - config_to_alleles[config].append(allele) - - all_hyperparameters = [ - network.hyperparameters for network in config_to_network.values() - ] - hyperparameter_keys = all_hyperparameters[0].keys() - assert all( - hyperparameters.keys() == hyperparameter_keys - for hyperparameters in all_hyperparameters) - - constant_hyperparameter_keys = [ - k for k in hyperparameter_keys - if all([ - hyperparameters[k] == all_hyperparameters[0][k] - for hyperparameters in all_hyperparameters - ]) - ] - constant_hypeparameters = dict( - (key, all_hyperparameters[0][key]) - for key in sorted(constant_hyperparameter_keys) - ) - - def write_hyperparameters(fd, hyperparameters): - rows = [] - for key in sorted(hyperparameters.keys()): - rows.append((key, json.dumps(hyperparameters[key]))) - fd.write("\n") - fd.write( - tabulate(rows, ["Hyperparameter", "Value"], tablefmt="grid")) - - with open(args.out_models_info_rst, "w") as fd: - fd.write("Hyperparameters shared by all %d architectures:\n" % - len(config_to_network)) - write_hyperparameters(fd, constant_hypeparameters) - fd.write("\n") - - configs = sorted( - config_to_alleles, - key=lambda s: len(config_to_alleles[s]), - reverse=True) - - for (i, config) in enumerate(configs): - network = config_to_network[config] - lines = [] - network.network().summary(print_fn=lines.append) - - 
specific_hyperparameters = dict( - (key, value) - for (key, value) in network.hyperparameters.items() - if key not in constant_hypeparameters) - - def name_component(key, value): - if key == "locally_connected_layers": - return "lc=%d" % len(value) - elif key == "train_data": - return value["subset"] + "-data" - elif key == "layer_sizes": - (value,) = value - key = "size" - elif key == "dense_layer_l1_regularization": - if value == 0: - return "no-reg" - key = "reg" - return "%s=%s" % (key, value) - - def sort_key(component): - if "lc" in component: - return (1, component) - if "reg" in component: - return (2, component) - return (0, component) - - components = [ - name_component(key, value) - for (key, value) in specific_hyperparameters.items() - ] - name = ",".join(sorted(components, key=sort_key)) - - fd.write("Architecture %d / %d %s\n" % ( - (i + 1, len(config_to_network), name))) - fd.write("+" * 40) - fd.write("\n") - fd.write( - "Selected in the ensembles for %d alleles: *%s*.\n\n" % ( - len(config_to_alleles[config]), - ", ".join( - sorted(config_to_alleles[config])))) - write_hyperparameters( - fd, - specific_hyperparameters) - fd.write("\n\n::\n\n") - for line in lines: - fd.write(" ") - fd.write(line) - fd.write("\n") - print("Wrote: %s" % args.out_models_info_rst) - - if args.out_alleles_info_rst: - # Models cv output - df = pandas.read_csv( - join(args.class1_models_dir, "unselected_summary.csv.bz2")) - - train_df = pandas.read_csv( - join(args.class1_unselected_models_dir, "train_data.csv.bz2")) - - quantitative_train_measurements_by_allele = train_df.loc[ - train_df.measurement_type == "quantitative" - ].allele.value_counts() - - train_measurements_by_allele = train_df.allele.value_counts() - - df = df.sort_values("allele").copy() - - df["scoring"] = df.unselected_score_plan.str.replace( - "\\(\\|[0-9.]+\\|\\)", "") - df["models selected"] = df["num_models"] - - df["sanitized_scoring"] = df.scoring.map( - lambda s: s.replace("mass-spec", 
"").replace("mse", "").replace("(", "").replace(")", "").strip() - ) - - df["mass spec scoring"] = df.sanitized_scoring.map( - lambda s: s.split(",")[0] if "," in s else "" - ) - df["mean square error scoring"] = df.sanitized_scoring.map( - lambda s: s.split(",")[-1] - ) - df["unselected percentile"] = df.unselected_accuracy_score_percentile - - df["train data (all)"] = df.allele.map(train_measurements_by_allele) - df["train data (quantitative)"] = df.allele.map( - quantitative_train_measurements_by_allele) - - def write_df(df, fd): - rows = [ - row for (_, row) in df.iterrows() - ] - fd.write("\n") - fd.write( - tabulate(rows, - [col.replace("_", " ") for col in df.columns], - tablefmt="grid")) - fd.write("\n\n") - - with open(args.out_alleles_info_rst, "w") as fd: - fd.write("Supported alleles\n") - fd.write("+" * 80) - fd.write("\n\n") - - common_cols = [ - "allele", - "train data (all)", - "train data (quantitative)", - "mass spec scoring", - "mean square error scoring", - "unselected percentile", - "unselected_score", - "unselected_score_scrambled_mean", - ] - - sub_df = df.loc[df.retained][common_cols + [ - "models selected", - ]] - write_df(sub_df, fd) - - fd.write("Rejected alleles\n") - fd.write("+" * 80) - fd.write("\n\n") - fd.write( - "Training for the following alleles was attempted but the " - "resulting models were excluded due to inadequate performance on " - "held out data.") - fd.write("\n\n") - - sub_df = df.loc[~df.retained][common_cols].sort_values("allele") - write_df(sub_df, fd) - print("Wrote: %s" % args.out_alleles_info_rst) - -if __name__ == "__main__": - go(sys.argv[1:]) \ No newline at end of file diff --git a/docs/generate_class1_pan.py b/docs/generate_class1_pan.py deleted file mode 100644 index ef868a84..00000000 --- a/docs/generate_class1_pan.py +++ /dev/null @@ -1,278 +0,0 @@ -""" -Generate certain RST files used in documentation. 
-""" -from __future__ import print_function -import sys -import argparse -from collections import OrderedDict, defaultdict -import os -from os.path import join, exists -from os import mkdir - -import pandas -import logomaker - -import tqdm - -from matplotlib import pyplot - -from mhcflurry.downloads import get_path -from mhcflurry.amino_acid import COMMON_AMINO_ACIDS -from mhcflurry.class1_affinity_predictor import Class1AffinityPredictor - -AMINO_ACIDS = sorted(COMMON_AMINO_ACIDS) - -parser = argparse.ArgumentParser(usage=__doc__) -parser.add_argument( - "--class1-models-dir", - metavar="DIR", - default=get_path( - "models_class1_pan", "models.combined", test_exists=False), - help="Class1 models. Default: %(default)s", -) -parser.add_argument( - "--logo-cutoff", - default=0.01, - type=float, - help="Fraction of top to use for motifs", -) -parser.add_argument( - "--length-cutoff", - default=0.01, - type=float, - help="Fraction of top to use for length distribution", -) -parser.add_argument( - "--length-distribution-lengths", - nargs="+", - default=[8, 9, 10, 11, 12, 13, 14, 15], - type=int, - help="Peptide lengths for length distribution plots", -) -parser.add_argument( - "--motif-lengths", - nargs="+", - default=[8, 9, 10, 11], - type=int, - help="Peptide lengths for motif plots", -) -parser.add_argument( - "--out-dir", - metavar="DIR", - required=True, - help="Directory to write RSTs and images to", -) -parser.add_argument( - "--max-alleles", - default=None, - type=int, - metavar="N", - help="Only use N alleles (for testing)", -) - - -def model_info(models_dir): - allele_to_sequence = Class1AffinityPredictor.load( - models_dir).allele_to_sequence - - length_distributions_df = pandas.read_csv( - join(models_dir, "length_distributions.csv.bz2")) - frequency_matrices_df = pandas.read_csv( - join(models_dir, "frequency_matrices.csv.bz2")) - try: - train_data_df = pandas.read_csv( - join(models_dir, "train_data.csv.bz2")) - observations_per_allele = ( - 
train_data_df.groupby("allele").peptide.nunique().to_dict()) - except IOError: - observations_per_allele = None - - distribution = frequency_matrices_df.loc[ - (frequency_matrices_df.cutoff_fraction == 1.0), AMINO_ACIDS - ].mean(0) - - normalized_frequency_matrices = frequency_matrices_df.copy() - normalized_frequency_matrices.loc[:, AMINO_ACIDS] = ( - normalized_frequency_matrices[AMINO_ACIDS] / distribution) - - sequence_to_alleles = defaultdict(list) - for allele in normalized_frequency_matrices.allele.unique(): - sequence = allele_to_sequence[allele] - sequence_to_alleles[sequence].append(allele) - - allele_equivalance_classes = sorted([ - sorted(equivalence_group) - for equivalence_group in sequence_to_alleles.values() - ], key=lambda equivalence_group: equivalence_group[0]) - - return { - 'length_distributions': length_distributions_df, - 'normalized_frequency_matrices': normalized_frequency_matrices, - 'observations_per_allele': observations_per_allele, - 'allele_equivalance_classes': allele_equivalance_classes, - } - - -def write_logo( - normalized_frequency_matrices, - allele, - lengths, - cutoff, - models_label, - out_dir): - - fig = pyplot.figure(figsize=(8,10)) - - for (i, length) in enumerate(lengths): - ax = pyplot.subplot(len(lengths), 1, i + 1) - matrix = normalized_frequency_matrices.loc[ - (normalized_frequency_matrices.allele == allele) & - (normalized_frequency_matrices.length == length) & - (normalized_frequency_matrices.cutoff_fraction == cutoff) - ].set_index("position")[AMINO_ACIDS] - if matrix.shape[0] == 0: - return None - - matrix = (matrix.T / matrix.sum(1)).T # row normalize - - ss_logo = logomaker.Logo( - matrix, - width=.8, - vpad=.05, - fade_probabilities=True, - stack_order='small_on_top', - ax=ax, - ) - pyplot.title( - "%s %d-mer (%s)" % (allele, length, models_label), y=0.85) - pyplot.xticks(matrix.index.values) - pyplot.tight_layout() - name = "%s.motifs.%s.png" % ( - allele.replace("*", "-").replace(":", "-"), models_label) - 
filename = os.path.abspath(join(out_dir, name)) - pyplot.savefig(filename) - print("Wrote: ", filename) - fig.clear() - pyplot.close(fig) - return name - - -def write_length_distribution( - length_distributions_df, allele, lengths, cutoff, models_label, out_dir): - length_distribution = length_distributions_df.loc[ - (length_distributions_df.allele == allele) & - (length_distributions_df.cutoff_fraction == cutoff) - ] - if length_distribution.shape[0] == 0: - return None - - length_distribution = length_distribution.set_index( - "length").reindex(lengths).fillna(0.0).reset_index() - - fig = pyplot.figure(figsize=(8, 2)) - length_distribution.plot(x="length", y="fraction", kind="bar", color="black") - pyplot.title("%s (%s)" % (allele, models_label)) - pyplot.xlabel("") - pyplot.xticks(rotation=0) - pyplot.gca().get_legend().remove() - name = "%s.lengths.%s.png" % ( - allele.replace("*", "-").replace(":", "-"), models_label) - - filename = os.path.abspath(join(out_dir, name)) - pyplot.savefig(filename) - print("Wrote: ", filename) - fig.clear() - pyplot.close(fig) - return name - - -def go(argv): - args = parser.parse_args(argv) - - if not exists(args.out_dir): - mkdir(args.out_dir) - - predictors = [ - ("combined", args.class1_models_dir), - ] - info_per_predictor = OrderedDict() - alleles = set() - for (label, models_dir) in predictors: - if not models_dir: - continue - info_per_predictor[label] = model_info(models_dir) - alleles.update( - info_per_predictor[label]["normalized_frequency_matrices"].allele.unique()) - - lines = [] - - def w(*pieces): - lines.extend(pieces) - - w('Motifs and length distributions from the pan-allele predictor') - w('=' * 80, "") - - w( - "Length distributions and binding motifs were calculated by ranking a " - "large set of random peptides (an equal number of peptides for each " - "length 8-15) by predicted affinity for each allele. 
" - "For length distribution, the top %g%% of peptides were collected and " - "their length distributions plotted. For sequence motifs, sequence " - "logos for the top %g%% " - "peptides for each length are shown.\n" % ( - args.length_cutoff * 100.0, - args.logo_cutoff * 100.0, - )) - - w(".. contents:: :local:", "") - - def image(name): - if name is None: - return "" - return '.. image:: %s\n' % name - - alleles = sorted(alleles, key=lambda a: ("HLA" not in a, a)) - if args.max_alleles: - alleles = alleles[:args.max_alleles] - - for allele in tqdm.tqdm(alleles): - w(allele, "-" * 80, "") - for (label, info) in info_per_predictor.items(): - length_distribution = info["length_distributions"] - normalized_frequency_matrices = info["normalized_frequency_matrices"] - - length_distribution_image_path = write_length_distribution( - length_distributions_df=length_distribution, - allele=allele, - lengths=args.length_distribution_lengths, - cutoff=args.length_cutoff, - out_dir=args.out_dir, - models_label=label) - if not length_distribution_image_path: - continue - w("*%s*\n") - if info['observations_per_allele'] is not None: - w("Training observations (unique peptides): %d" % ( - info['observations_per_allele'].get(allele, 0))) - w("\n") - w(image(length_distribution_image_path)) - w(image(write_logo( - normalized_frequency_matrices=normalized_frequency_matrices, - allele=allele, - lengths=args.motif_lengths, - cutoff=args.logo_cutoff, - out_dir=args.out_dir, - models_label=label, - ))) - w("") - - document_path = join(args.out_dir, "allele_motifs.rst") - with open(document_path, "w") as fd: - for line in lines: - fd.write(line) - fd.write("\n") - print("Wrote", document_path) - - -if __name__ == "__main__": - go(sys.argv[1:]) diff --git a/docs/intro.rst b/docs/intro.rst index dbd82ac4..5b5f3581 100644 --- a/docs/intro.rst +++ b/docs/intro.rst @@ -2,26 +2,33 @@ Introduction and setup ======================= MHCflurry is an open source package for peptide/MHC I binding 
affinity prediction. It -provides competitive accuracy with a fast and documented implementation. - -You can download pre-trained MHCflurry models fit to affinity measurements -deposited in IEDB (and a few other sources) -or train a MHCflurry predictor on your own data. - -Currently only allele-specific prediction is implemented, in which separate models -are trained for each allele. The released models therefore support a fixed set of common -class I alleles for which sufficient published training data is available -(see :ref:`models_supported_alleles`\ ). - -MHCflurry supports Python versions 2.7 and 3.4+. It uses the `keras <https://keras.io>`__ +attempts to provide competitive accuracy with a fast and documented implementation. + +You can download pre-trained MHCflurry models fit to mass spec-identified MHC I +ligands and peptide/MHC affinity measurements deposited in IEDB (plus a few other +sources) or train a MHCflurry predictor on your own data. + +Starting in version 1.6.0, the default MHCflurry binding affinity predictors +are "pan-allele" models that support most sequenced MHC I alleles across humans +and a few other species (about 14,000 alleles in total). This version also +introduces two experimental predictors, an "antigen processing" predictor +that attempts to model MHC allele-independent effects such as proteosomal +cleavage and a "presentation" predictor that integrates processing predictions +with binding affinity predictions to give a composite "presentation score." Both +models are trained on mass spec-identified MHC ligands. + +MHCflurry supports Python 3.4+. It uses the `keras <https://keras.io>`__ neural network library via either the Tensorflow or Theano backends. GPUs may -optionally be used for a generally modest speed improvement. +optionally be used for a modest speed improvement. If you find MHCflurry useful in your research please cite: - O'Donnell, T. et al., 2017. MHCflurry: open-source class I MHC - binding affinity prediction. 
bioRxiv. Available at: - http://www.biorxiv.org/content/early/2017/08/09/174243. + T. J. O’Donnell, et al., "MHCflurry: Open-Source Class I MHC Binding Affinity + Prediction," *Cell Systems*, 2018. + https://www.cell.com/cell-systems/fulltext/S2405-4712(18)30232-1. + +If you have questions or encounter problems, please file an issue at the +MHCflurry github repo: https://github.com/openvax/mhcflurry Installation (pip) diff --git a/docs/python_tutorial.rst b/docs/python_tutorial.rst index 132cea7c..b906c29f 100644 --- a/docs/python_tutorial.rst +++ b/docs/python_tutorial.rst @@ -1,159 +1,183 @@ Python library tutorial ======================= -Predicting ----------- - The MHCflurry Python API exposes additional options and features beyond those -supported by the commandline tools. This tutorial gives a basic overview -of the most important functionality. See the :ref:`API-documentation` for further details. - -The `~mhcflurry.Class1AffinityPredictor` class is the primary user-facing interface. -Use the `~mhcflurry.Class1AffinityPredictor.load` static method to load a +supported by the commandline tools and can be more convenient for interactive +analyses and bioinformatic pipelines. This tutorial gives a basic overview +of the most important functionality. See the :ref:`API-documentation` for further +details. + +Loading a predictor +---------------------------------- + +Most prediction tasks can be performed using the +`~mhcflurry.Class1PresentationPredictor` class, which provides a programmatic API +to the functionality in the :ref:`mhcflurry-predict` and +:ref:`mhcflurry-predict-scan` commands. + +Instances of `~mhcflurry.Class1PresentationPredictor` wrap a +`~mhcflurry.Class1AffinityPredictor` to generate binding affinity predictions +and a `~mhcflurry.Class1ProcessingPredictor` to generate antigen processing +predictions. The presentation score is computed using a logistic regression +model over binding affinity and processing predictions. 
+ +Use the `~mhcflurry.Class1PresentationPredictor.load` static method to load a trained predictor from disk. With no arguments this method will load the predictor released with MHCflurry (see :ref:`downloading`\ ). If you pass a path to a models directory, then it will load that predictor instead. -.. runblock:: pycon +.. doctest:: - >>> from mhcflurry import Class1AffinityPredictor - >>> predictor = Class1AffinityPredictor.load() - >>> predictor.supported_alleles[:10] + >>> from mhcflurry import Class1PresentationPredictor + >>> predictor = Class1PresentationPredictor.load() + >>> predictor.supported_alleles[:5] + ['Atbe-B*01:01', 'Atbe-E*03:01', 'Atbe-G*03:01', 'Atbe-G*03:02', 'Atbe-G*06:01'] + +Predicting for individual peptides +---------------------------------- -With a predictor loaded we can now generate some binding predictions: +To generate predictions for individual peptides, we can use the +`~mhcflurry.Class1AffinityPredictor.predict` method of the `~mhcflurry.Class1PresentationPredictor`, +loaded above. This method returns a `pandas.DataFrame` with binding affinity, processing, and presentation +predictions: -.. runblock:: pycon +.. doctest:: - >>> predictor.predict(allele="HLA-A0201", peptides=["SIINFEKL", "SIINFEQL"]) + >>> predictor.predict( + ... peptides=["SIINFEKL", "NLVPMVATV"], + ... alleles=["HLA-A0201", "HLA-A0301"], + ... verbose=0) + peptide peptide_num sample_name affinity best_allele processing_score presentation_score + 0 SIINFEKL 0 sample1 12906.786173 HLA-A0201 0.101473 0.012503 + 1 NLVPMVATV 1 sample1 15.038358 HLA-A0201 0.676289 0.975463 + +Here, the list of alleles is taken to be an individual's MHC I genotype (i.e. up +to 6 alleles), and the strongest binder across alleles for each peptide is +reported. .. note:: - MHCflurry normalizes allele names using the `mhcnames <https://github.com/hammerlab/mhcnames>`__ + MHCflurry normalizes allele names using the `mhcnames <https://github.com/openvax/mhcnames>`__ package. 
Names like ``HLA-A0201`` or ``A*02:01`` will be normalized to ``HLA-A*02:01``, so most naming conventions can be used - with methods such as `~mhcflurry.Class1AffinityPredictor.predict`. - -For more detailed results, we can use -`~mhcflurry.Class1AffinityPredictor.predict_to_dataframe`. - -.. runblock:: pycon - - >>> predictor.predict_to_dataframe(allele="HLA-A0201", peptides=["SIINFEKL", "SIINFEQL"]) - -Instead of a single allele and multiple peptides, we may need predictions for -allele/peptide pairs. We can predict across pairs by specifying -the `alleles` argument instead of `allele`. The list of alleles -must be the same length as the list of peptides (i.e. it is predicting over pairs, -*not* taking the cross product). - -.. runblock:: pycon - - >>> predictor.predict(alleles=["HLA-A0201", "HLA-B*57:01"], peptides=["SIINFEKL", "SIINFEQL"]) - -Training --------- - -Let's fit our own MHCflurry predictor. First we need some training data. If you -haven't already, run this in a shell to download the MHCflurry training data: - -.. code-block:: shell - - $ mhcflurry-downloads fetch data_curated - -We can get the path to this data from Python using `mhcflurry.downloads.get_path`: - -.. runblock:: pycon - - >>> from mhcflurry.downloads import get_path - >>> data_path = get_path("data_curated", "curated_training_data.no_mass_spec.csv.bz2") - >>> data_path - -Now let's load it with pandas and filter to reasonably-sized peptides: - -.. runblock:: pycon - - >>> import pandas - >>> df = pandas.read_csv(data_path) - >>> df = df.loc[(df.peptide.str.len() >= 8) & (df.peptide.str.len() <= 15)] - >>> df.head(5) - -We'll make an untrained `~mhcflurry.Class1AffinityPredictor` and then call -`~mhcflurry.Class1AffinityPredictor.fit_allele_specific_predictors` to fit -some models. - -.. 
runblock:: pycon - - >>> new_predictor = Class1AffinityPredictor() - >>> single_allele_train_data = df.loc[df.allele == "HLA-B*57:01"].sample(100) - >>> new_predictor.fit_allele_specific_predictors( - ... n_models=1, - ... architecture_hyperparameters_list=[{ - ... "layer_sizes": [16], - ... "max_epochs": 5, - ... "random_negative_constant": 5, - ... }], - ... peptides=single_allele_train_data.peptide.values, - ... affinities=single_allele_train_data.measurement_value.values, - ... allele="HLA-B*57:01") - - -The `~mhcflurry.Class1AffinityPredictor.fit_allele_specific_predictors` method -can be called any number of times on the same instance to build up ensembles -of models across alleles. The architecture hyperparameters we specified are -for demonstration purposes; to fit real models you would usually train for -more epochs. - -Now we can generate predictions: - -.. runblock:: pycon - - >>> new_predictor.predict(["SYNPEPII"], allele="HLA-B*57:01") - -We can save our predictor to the specified directory on disk by running: - -.. runblock:: pycon - - >>> new_predictor.save("/tmp/new-predictor") - -and restore it: + with methods such as `~mhcflurry.Class1PresentationPredictor.predict`. + +If you have multiple sample genotypes, you can pass a dict, where the +keys are arbitrary sample names: + +.. doctest:: + + >>> predictor.predict( + ... peptides=["KSEYMTSWFY", "NLVPMVATV"], + ... alleles={ + ... "sample1": ["A0201", "A0301", "B0702", "B4402", "C0201", "C0702"], + ... "sample2": ["A0101", "A0206", "B5701", "C0202"], + ... }, + ... verbose=0) + peptide peptide_num sample_name affinity best_allele processing_score presentation_score + 0 KSEYMTSWFY 0 sample1 16737.745268 A0301 0.381632 0.026550 + 1 NLVPMVATV 1 sample1 15.038358 A0201 0.676289 0.975463 + 2 KSEYMTSWFY 0 sample2 62.540779 A0101 0.381632 0.796731 + 3 NLVPMVATV 1 sample2 15.765500 A0206 0.676289 0.974439 + +Here the strongest binder for each sample / peptide pair is returned. 
+
+Many users will focus on the binding affinity predictions, as the
+processing and presentation predictions are experimental. If you do use the latter
+scores, however, when available you should provide the upstream (N-flank)
+and downstream (C-flank) sequences from the source proteins of the peptides for
+a small boost in accuracy. To do so, specify the ``n_flanks`` and ``c_flanks``
+arguments, which give the flanking sequences for the corresponding peptides:
+
+.. doctest::
+
+    >>> predictor.predict(
+    ...    peptides=["KSEYMTSWFY", "NLVPMVATV"],
+    ...    n_flanks=["NNNNNNN", "SSSSSSSS"],
+    ...    c_flanks=["CCCCCCCC", "YYYAAAA"],
+    ...    alleles={
+    ...        "sample1": ["A0201", "A0301", "B0702", "B4402", "C0201", "C0702"],
+    ...        "sample2": ["A0101", "A0206", "B5701", "C0202"],
+    ...    },
+    ...    verbose=0)
+          peptide  n_flank   c_flank  peptide_num sample_name      affinity best_allele  processing_score  presentation_score
+    0  KSEYMTSWFY  NNNNNNN  CCCCCCCC            0     sample1  16737.745268       A0301          0.605816            0.056190
+    1   NLVPMVATV SSSSSSSS   YYYAAAA            1     sample1     15.038358       A0201          0.824994            0.986719
+    2  KSEYMTSWFY  NNNNNNN  CCCCCCCC            0     sample2     62.540779       A0101          0.605816            0.897493
+    3   NLVPMVATV SSSSSSSS   YYYAAAA            1     sample2     15.765500       A0206          0.824994            0.986155
+
+Scanning protein sequences
+--------------------------
+
+The `~mhcflurry.Class1PresentationPredictor.predict_sequences` method supports
+scanning protein sequences for MHC ligands. Here's an example to identify all
+peptides with a predicted binding affinity of 500 nM or tighter to any allele
+across two sample genotypes and two short peptide sequences.
+
+.. doctest::
+
+    >>> predictor.predict_sequences(
+    ...    sequences={
+    ...        'protein1': "MDSKGSSQKGSRLLLLLVVSNLL",
+    ...        'protein2': "SSLPTPEDKEQAQQTHH",
+    ...    },
+    ...    alleles={
+    ...        "sample1": ["A0201", "A0301", "B0702"],
+    ...        "sample2": ["A0101", "C0202"],
+    ...    },
+    ...    result="filtered",
+    ...    comparison_quantity="affinity",
+    ...    filter_value=500,
+    ...    
verbose=0) + sequence_name pos peptide n_flank c_flank sample_name affinity best_allele affinity_percentile processing_score presentation_score + 0 protein1 13 LLLLVVSNL MDSKGSSQKGSRL L sample1 38.206225 A0201 0.380125 0.017644 0.571060 + 1 protein1 14 LLLVVSNLL MDSKGSSQKGSRLL sample1 42.243472 A0201 0.420250 0.090984 0.619213 + 2 protein1 5 SSQKGSRLL MDSKG LLLVVSNLL sample2 66.749223 C0202 0.803375 0.383608 0.774468 + 3 protein1 6 SQKGSRLLL MDSKGS LLVVSNLL sample2 178.033467 C0202 1.820000 0.275019 0.482206 + 4 protein1 13 LLLLVVSNLL MDSKGSSQKGSRL sample1 202.208167 A0201 1.112500 0.058782 0.261320 + 5 protein1 12 LLLLLVVSNL MDSKGSSQKGSR L sample1 202.506582 A0201 1.112500 0.010025 0.225648 + 6 protein2 0 SSLPTPEDK EQAQQTHH sample1 335.529377 A0301 1.011750 0.010443 0.156798 + 7 protein2 0 SSLPTPEDK EQAQQTHH sample2 353.451759 C0202 2.674250 0.010443 0.150753 + 8 protein1 8 KGSRLLLLL MDSKGSSQ VVSNLL sample2 410.327286 C0202 2.887000 0.121374 0.194081 + 9 protein1 5 SSQKGSRL MDSKG LLLLVVSNLL sample2 477.285937 C0202 3.107375 0.111982 0.168572 -.. runblock:: pycon +When using ``predict_sequences``, the flanking sequences for each peptide are +automatically included in the processing and presentation predictions. - >>> new_predictor2 = Class1AffinityPredictor.load("/tmp/new-predictor") - >>> new_predictor2.supported_alleles +See the documentation for `~mhcflurry.Class1PresentationPredictor` for other +useful methods. -Lower level interface ---------------------- +Lower level interfaces +---------------------------------- -The high-level `Class1AffinityPredictor` delegates to low-level -`~mhcflurry.Class1NeuralNetwork` objects, each of which represents -a single neural network. The purpose of `~mhcflurry.Class1AffinityPredictor` -is to implement several important features: +The `~mhcflurry.Class1PresentationPredictor` predictor delegates to a +`~mhcflurry.Class1AffinityPredictor` instance for binding affinity predictions. 
+
+If all you need are binding affinities, you can use this instance directly.
 
-ensembles
-    More than one neural network can be used to generate each prediction. The
-    predictions returned to the user are the geometric mean of the individual
-    model predictions. This gives higher accuracy in most situations
+Here's an example:
 
-multiple alleles
-    A `~mhcflurry.Class1NeuralNetwork` generates predictions for only a single
-    allele. The `~mhcflurry.Class1AffinityPredictor` maps alleles to the
-    relevant `~mhcflurry.Class1NeuralNetwork` instances
+.. doctest::
 
-serialization
-    Loading and saving predictors is implemented in `~mhcflurry.Class1AffinityPredictor`.
+    >>> from mhcflurry import Class1AffinityPredictor
+    >>> predictor = Class1AffinityPredictor.load()
+    >>> predictor.predict_to_dataframe(allele="HLA-A0201", peptides=["SIINFEKL", "SIINFEQL"])
+        peptide     allele    prediction  prediction_low  prediction_high  prediction_percentile
+    0  SIINFEKL  HLA-A0201  12906.786173     8829.460289     18029.923061               6.566375
+    1  SIINFEQL  HLA-A0201  13025.300796     9050.056312     18338.004869               6.623625
 
-Sometimes it's easiest to work directly with `~mhcflurry.Class1NeuralNetwork`.
-Here is a simple example of doing so:
+The ``prediction_low`` and ``prediction_high`` fields give the 5-95 percentile
+predictions across the models in the ensemble. This detailed information is not
+available through the higher-level `~mhcflurry.Class1PresentationPredictor`
+interface.
 
-.. runblock:: pycon
+Under the hood, `Class1AffinityPredictor` itself delegates to an ensemble
+of `~mhcflurry.Class1NeuralNetwork` instances, which implement the neural network
+models used for prediction. To fit your own affinity prediction models, call
+`~mhcflurry.Class1NeuralNetwork.fit`.
 
-    >>> from mhcflurry import Class1NeuralNetwork
-    >>> network = Class1NeuralNetwork()
-    >>> network.fit(
-    ...    single_allele_train_data.peptide.values,
-    ...    single_allele_train_data.measurement_value.values,
-    ...    
verbose=0)
-    >>> network.predict(["SIINFEKLL"])
+You can similarly use `~mhcflurry.Class1ProcessingPredictor` directly for
+antigen processing prediction, and there is a low-level
+`~mhcflurry.Class1ProcessingNeuralNetwork` with a `~mhcflurry.Class1ProcessingNeuralNetwork.fit` method.
+See the API documentation of these classes for details.
\ No newline at end of file
diff --git a/docs/requirements.txt b/docs/requirements.txt
index d47a6bca..faadab82 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,10 +1,9 @@
 sphinx
-sphinxcontrib-autorun
 sphinxcontrib-programoutput
 sphinxcontrib-autoprogram
 sphinx-rtd-theme
 numpydoc
-pypandoc
+pypandoc
 mhctools
 pydot
 tabulate
diff --git a/mhcflurry/__init__.py b/mhcflurry/__init__.py
index c66586cb..9c72a602 100644
--- a/mhcflurry/__init__.py
+++ b/mhcflurry/__init__.py
@@ -5,6 +5,7 @@ Class I MHC ligand prediction package
 from .class1_affinity_predictor import Class1AffinityPredictor
 from .class1_neural_network import Class1NeuralNetwork
 from .class1_processing_predictor import Class1ProcessingPredictor
+from .class1_processing_neural_network import Class1ProcessingNeuralNetwork
 from .class1_presentation_predictor import Class1PresentationPredictor
 from .version import __version__
 
@@ -12,6 +13,8 @@ from .version import __version__
 __all__ = [
     "__version__",
     "Class1AffinityPredictor",
-    "Class1NeuralNetwork", "Class1ProcessingPredictor",
+    "Class1NeuralNetwork",
+    "Class1ProcessingPredictor",
+    "Class1ProcessingNeuralNetwork",
     "Class1PresentationPredictor",
 ]
diff --git a/mhcflurry/class1_presentation_predictor.py b/mhcflurry/class1_presentation_predictor.py
index bdb39705..37fe175e 100644
--- a/mhcflurry/class1_presentation_predictor.py
+++ b/mhcflurry/class1_presentation_predictor.py
@@ -46,7 +46,7 @@ class Class1PresentationPredictor(object):
     a "presentation score" prediction.
 
     
Most users will call the `load` static method to get an instance of this - class, then call the `predict_to_dataframe` method to generate predictions. + class, then call the `predict` method to generate predictions. """ model_inputs = ["affinity_score", "processing_score"] @@ -85,7 +85,7 @@ class Class1PresentationPredictor(object): peptides, alleles, sample_names=None, - include_affinity_percentile=False, + include_affinity_percentile=True, verbose=1, throw=True): """ @@ -391,68 +391,6 @@ class Class1PresentationPredictor(object): return model def predict( - self, - peptides, - alleles, - sample_names=None, - n_flanks=None, - c_flanks=None, - verbose=1): - """ - Predict presentation scores across a set of peptides. - - Presentation scores combine predictions for MHC I binding affinity - and antigen processing. - - For intermediate results, see the `predict_to_dataframe` method. - - Parameters - ---------- - peptides : list of string - Peptide sequences - alleles : list of string or string -> string dict - If you are predicting for a single sample, pass a list of strings - (up to 6) indicating the genotype. If you are predicting across - multiple samples, pass a dict where the keys are (arbitrary) - sample names and the values are the alleles to predict for that - sample. - sample_names : list of string [same length as peptides] - If you are passing a dict for 'alleles', use this argument to - specify which peptides go with which sample. - n_flanks : list of string [same length as peptides] - Upstream sequences before the peptide. Sequences of any length can - be given and a suffix of the size supported by the model will be - used. - c_flanks : list of string [same length as peptides] - Downstream sequences after the peptide. Sequences of any length can - be given and a prefix of the size supported by the model will be - used. - verbose : int - Set to 0 for quiet mode. - - Returns - ------- - numpy.array - - Presentation scores for each peptide. 
Scores range from 0 to 1, with - higher values indicating more favorable presentation likelihood. - """ - if isinstance(alleles, dict): - if sample_names is None: - raise ValueError( - "sample_names must be supplied when alleles is a dict. " - "Alternatively, call predict_to_dataframe to predict over " - "all samples") - - return self.predict_to_dataframe( - peptides=peptides, - alleles=alleles, - sample_names=sample_names, - n_flanks=n_flanks, - c_flanks=c_flanks, - verbose=verbose).presentation_score.values - - def predict_to_dataframe( self, peptides, alleles, @@ -475,7 +413,7 @@ class Class1PresentationPredictor(object): Example: >>> predictor = Class1PresentationPredictor.load() - >>> predictor.predict_to_dataframe( + >>> predictor.predict( ... peptides=["SIINFEKL", "PEPTIDE"], ... n_flanks=["NNN", "SNS"], ... c_flanks=["CCC", "CNC"], @@ -766,7 +704,7 @@ class Class1PresentationPredictor(object): c_flanks.append( sequence[peptide_start + peptide_length : c_flank_end]) - result_df = self.predict_to_dataframe( + result_df = self.predict( peptides=peptides, alleles=alleles, n_flanks=n_flanks, diff --git a/mhcflurry/predict_command.py b/mhcflurry/predict_command.py index ecb35872..b054b696 100644 --- a/mhcflurry/predict_command.py +++ b/mhcflurry/predict_command.py @@ -275,7 +275,7 @@ def run(argv=sys.argv[1:]): "No flanking information provided. Specify --no-flanking " "to silence this warning") - predictions = predictor.predict_to_dataframe( + predictions = predictor.predict( peptides=df[args.peptide_column].values, n_flanks=n_flanks, c_flanks=c_flanks, -- GitLab