From 3dca1947c6b8a320245827609e83b92a458cde23 Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Fri, 20 Mar 2020 16:18:55 -0400 Subject: [PATCH] python tutorial --- .travis.yml | 1 + docs/Makefile | 16 +- docs/README.md | 8 + docs/commandline_tutorial.rst | 115 ++++---- docs/conf.py | 12 +- docs/doctest.sh | 7 + docs/generate.py | 301 --------------------- docs/generate_class1_pan.py | 278 ------------------- docs/intro.rst | 39 +-- docs/python_tutorial.rst | 284 ++++++++++--------- docs/requirements.txt | 3 +- mhcflurry/__init__.py | 5 +- mhcflurry/class1_presentation_predictor.py | 70 +---- mhcflurry/predict_command.py | 2 +- 14 files changed, 282 insertions(+), 859 deletions(-) create mode 100755 docs/doctest.sh delete mode 100644 docs/generate.py delete mode 100644 docs/generate_class1_pan.py diff --git a/.travis.yml b/.travis.yml index 3dcd0481..db7d5c23 100644 --- a/.travis.yml +++ b/.travis.yml @@ -68,3 +68,4 @@ script: --already-downloaded-dir /tmp/downloads - mhcflurry-downloads info # just to test this command works - nosetests --with-timer -sv test + - cd docs && bash ./doctest.sh diff --git a/docs/Makefile b/docs/Makefile index 42553515..9dfd2cce 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -15,7 +15,7 @@ endif # Internal variables. PAPEROPT_a4 = -D latex_paper_size=a4 PAPEROPT_letter = -D latex_paper_size=letter -ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +ALLSPHINXOPTS = -v -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . # the i18n builder cannot share the environment and doctrees with the others I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
@@ -53,15 +53,6 @@ help: .PHONY: generate generate: sphinx-apidoc -M -f -o _build/ ../mhcflurry - mhcflurry-downloads fetch models_class1_pan - python generate_class1_pan.py --out-dir model-info - -# Added by Tim -.PHONY: generate_model_info -generate_model_info: - sphinx-apidoc -M -f -o _build/ ../mhcflurry - mhcflurry-downloads fetch models_class1_pan - python generate_class1_pan.py --out-dir model-info .PHONY: clean clean: @@ -71,11 +62,6 @@ clean: rm -rf $(BUILDDIR)/* mv /tmp/html-bk $(BUILDDIR)/html -# Added by Tim -.PHONY: clean_model_info -clean_model_info: - rm -rf model-info - .PHONY: html html: $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html diff --git a/docs/README.md b/docs/README.md index 615880b4..2f712d4c 100644 --- a/docs/README.md +++ b/docs/README.md @@ -9,3 +9,11 @@ $ make generate html Documentation is written to the _build/ directory. These files should not be checked into the repo. + +To test example code: +``` +$ make doctest +``` + +Then take a look at _build/doctest for detailed output. + diff --git a/docs/commandline_tutorial.rst b/docs/commandline_tutorial.rst index bbcbe7fb..2014ec55 100644 --- a/docs/commandline_tutorial.rst +++ b/docs/commandline_tutorial.rst @@ -22,11 +22,9 @@ directory. To get the path to downloaded data, you can use: .. command-output:: mhcflurry-downloads path models_class1_presentation :nostderr: -We also release a few other "downloads," such as curated training data and some -experimental models. To see what's available and what you have downloaded, run: - -.. command-output:: mhcflurry-downloads info - :nostderr: +We also release a number of other "downloads," such as curated training data and some +experimental models. To see what's available and what you have downloaded, run +``mhcflurry-downloads info``. 
Most users will only need ``models_class1_presentation``, however, as the presentation predictor includes a peptide / MHC I binding affinity (BA) predictor @@ -42,8 +40,8 @@ Generating predictions ---------------------- The :ref:`mhcflurry-predict` command generates predictions for individual peptides -(as opposed to scanning protein sequences for epitopes). -By default it will use the pre-trained models you downloaded above. Other +(see the next section for how to scan protein sequences for epitopes). By +default it will use the pre-trained models you downloaded above. Other models can be used by specifying the ``--models`` argument. Running: @@ -60,23 +58,39 @@ results in a file like this: .. command-output:: cat /tmp/predictions.csv -The predictions are given as affinities (KD) in nM in the ``mhcflurry_prediction`` -column. The other fields give the 5-95 percentile predictions across -the models in the ensemble and the quantile of the affinity prediction among -a large number of random peptides tested on that allele. +The binding affinity predictions are given as affinities (KD) in nM in the +``mhcflurry_affinity`` column. Lower values indicate stronger binders. A commonly-used +threshold for peptides with a reasonable chance of being immunogenic is 500 nM. -The predictions shown above were generated with MHCflurry |version|. Different versions of -MHCflurry can give considerably different results. Even -on the same version, exact predictions may vary (up to about 1 nM) depending -on the Keras backend and other details. +The ``mhcflurry_affinity_percentile`` gives the quantile of the affinity +prediction among a large number of random peptides tested on that allele. Lower +is stronger. Two percent is a commonly-used threshold. + +The last two columns give the antigen processing and presentation scores, +respectively. These range from 0 to 1 with higher values indicating more +favorable processing or presentation. + +.. 
note:: + + The processing predictor is experimental and under + development. It models allele-independent effects that influence whether a + peptide will be detected in a mass spec experiment. The presentation score is + a simple logistic regression model that combines the (log) binding affinity + prediction with the processing score to give a composite prediction. The resulting + prediction is appropriate for prioritizing potential epitopes to test, but no + thresholds have yet been established for what constitutes a "high enough" + presentation score. In most cases you'll want to specify the input as a CSV file instead of passing -peptides and alleles as commandline arguments. See :ref:`mhcflurry-predict` docs. +peptides and alleles as commandline arguments. If you're relying on the +processing or presentation scores, you may also want to pass the upstream and +downstream sequences of the peptides from their source proteins for potentially more +accurate cleavage prediction. See the :ref:`mhcflurry-predict` docs. Scanning protein sequences for predicted MHC I ligands ------------------------------------------------- -Starting in version 1.6.0, MHCflurry supports scanning proteins for MHC I binding +Starting in version 1.6.0, MHCflurry supports scanning proteins for MHC-binding peptides using the ``mhcflurry-predict-scan`` command. We'll generate predictions across ``example.fasta``, a FASTA file with two short @@ -84,33 +98,31 @@ sequences: .. literalinclude:: /example.fasta -Here's the ``mhctools`` invocation. +Here's the ``mhcflurry-predict-scan`` invocation to scan the proteins for +binders to either of two MHC I genotypes: .. 
command-output:: - mhctools - --mhc-predictor mhcflurry - --input-fasta-file example.fasta - --mhc-alleles A02:01,A03:01 - --mhc-peptide-lengths 8,9,10,11 - --extract-subsequences - --output-csv /tmp/subsequence_predictions.csv - :ellipsis: 2,-2 + mhcflurry-predict-scan + example.fasta + --alleles + HLA-A*02:01,HLA-A*03:01,HLA-B*57:01,HLA-B*45:01,HLA-C*02:02,HLA-C*07:02 + HLA-A*01:01,HLA-A*02:06,HLA-B*44:02,HLA-B*07:02,HLA-C*01:02,HLA-C*03:01 + --results-filtered affinity_percentile + --threshold-affinity-percentile 1.0 :nostderr: -This will write a file giving predictions for all subsequences of the specified lengths: - -.. command-output:: - head -n 3 /tmp/subsequence_predictions.csv - See the :ref:`mhcflurry-predict-scan` docs for more options. Fitting your own models ----------------------- -The :ref:`mhcflurry-class1-train-allele-specific-models` command is used to -fit models to training data. The models we release with MHCflurry are trained -with a command like: +If you have your own data and want to fit your own MHCflurry models, you have +a few options. If you have data for only one or a few MHC I alleles, the best +approach is to use the +:ref:`mhcflurry-class1-train-allele-specific-models` command to fit an +"allele-specific" predictor, in which separate neural networks are used for +each allele. Here's an example: .. code-block:: shell @@ -120,21 +132,23 @@ with a command like: --min-measurements-per-allele 75 \ --out-models-dir models -MHCflurry predictors are serialized to disk as many files in a directory. The -command above will write the models to the output directory specified by the -``--out-models-dir`` argument. This directory has files like: +.. note:: -.. program-output:: - ls "$(mhcflurry-downloads path models_class1)/models" - :shell: - :nostderr: - :ellipsis: 4,-4 + MHCflurry predictors are serialized to disk as many files in a directory. 
The + command above will write the models to the output directory specified by the + ``--out-models-dir`` argument. This directory has files like: + + .. program-output:: + ls "$(mhcflurry-downloads path models_class1)/models" + :shell: + :nostderr: + :ellipsis: 4,-4 -The ``manifest.csv`` file gives metadata for all the models used in the predictor. -There will be a ``weights_...`` file for each model giving its weights -(the parameters for the neural network). The ``percent_ranks.csv`` stores a -histogram of model predictions for each allele over a large number of random -peptides. It is used for generating the percent ranks at prediction time. + The ``manifest.csv`` file gives metadata for all the models used in the predictor. + There will be a ``weights_...`` file for each model giving its weights + (the parameters for the neural network). The ``percent_ranks.csv`` stores a + histogram of model predictions for each allele over a large number of random + peptides. It is used for generating the percent ranks at prediction time. To call :ref:`mhcflurry-class1-train-allele-specific-models` you'll need some training data. The data we use for our released predictors can be downloaded with @@ -151,7 +165,12 @@ It looks like this: :shell: :nostderr: - +To fit pan-allele models like the ones released with MHCflurry, you can use +a similar tool, ``mhcflurry-class1-train-pan-allele-models``. You'll probably +also want to take a look at the scripts used to generate the production models, +which are available in the *downloads-generation* directory in the MHCflurry +repository. The production MHCflurry models were fit using a cluster with several +dozen GPUs over a period of about two days. 
Environment variables diff --git a/docs/conf.py b/docs/conf.py index 29e2d3ea..533a841e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -54,12 +54,22 @@ extensions = [ 'sphinx.ext.viewcode', 'sphinx.ext.githubpages', 'numpydoc', - 'sphinx_autorun', 'sphinxcontrib.programoutput', 'sphinxcontrib.autoprogram', 'sphinx.ext.githubpages', ] +doctest_global_setup = ''' +import logging +logging.getLogger('matplotlib').disabled = True +logging.getLogger('tensorflow').disabled = True +import numpy +import pandas +import mhcflurry +''' + +doctest_test_doctest_blocks = '' + # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] diff --git a/docs/doctest.sh b/docs/doctest.sh new file mode 100755 index 00000000..04841694 --- /dev/null +++ b/docs/doctest.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +make doctest +RETVAL=$? +echo doctest returned $RETVAL +cat _build/doctest/output.txt +exit $RETVAL diff --git a/docs/generate.py b/docs/generate.py deleted file mode 100644 index 46b93080..00000000 --- a/docs/generate.py +++ /dev/null @@ -1,301 +0,0 @@ -""" -Generate certain RST files used in documentation. -""" - -import sys -import argparse -import json -from textwrap import wrap -from collections import OrderedDict, defaultdict -from os.path import join - -import pypandoc -import pandas -from keras.utils.vis_utils import plot_model -from tabulate import tabulate - -from mhcflurry import __version__ -from mhcflurry.downloads import get_path -from mhcflurry.class1_affinity_predictor import Class1AffinityPredictor - -parser = argparse.ArgumentParser(usage=__doc__) -parser.add_argument( - "--cv-summary-csv", - metavar="FILE.csv", - default=get_path( - "cross_validation_class1", "summary.all.csv", test_exists=False), - help="Cross validation scores summary. Default: %(default)s", -) -parser.add_argument( - "--class1-models-dir", - metavar="DIR", - default=get_path( - "models_class1", "models", test_exists=False), - help="Class1 models. 
Default: %(default)s", -) -parser.add_argument( - "--class1-unselected-models-dir", - metavar="DIR", - default=get_path( - "models_class1_unselected", "models", test_exists=False), - help="Class1 unselected models. Default: %(default)s", -) -parser.add_argument( - "--out-alleles-info-rst", - metavar="FILE.rst", - help="rst output file", -) -parser.add_argument( - "--out-models-info-rst", - metavar="FILE.rst", - help="rst output file", -) -parser.add_argument( - "--out-models-architecture-png", - metavar="FILE.png", - help="png output file", -) -parser.add_argument( - "--out-models-supported-alleles-rst", - metavar="FILE.png", - help="png output file", -) - - -def go(argv): - args = parser.parse_args(argv) - - predictor = None - unselected_predictor = None - - if args.out_models_supported_alleles_rst: - # Supported alleles rst - if predictor is None: - predictor = Class1AffinityPredictor.load(args.class1_models_dir) - with open(args.out_models_supported_alleles_rst, "w") as fd: - fd.write( - "Models released with the current version of MHCflurry (%s) " - "support peptides of " - "length %d-%d and the following %d alleles:\n\n::\n\n\t%s\n\n" % ( - __version__, - predictor.supported_peptide_lengths[0], - predictor.supported_peptide_lengths[1], - len(predictor.supported_alleles), - "\n\t".join( - wrap(", ".join(predictor.supported_alleles))))) - print("Wrote: %s" % args.out_models_supported_alleles_rst) - - if args.out_models_architecture_png: - # Architecture diagram - raise NotImplementedError() # for now - if predictor is None: - predictor = Class1AffinityPredictor.load(args.class1_models_dir) - network = predictor.neural_networks[0].network() - plot_model( - network, - to_file=args.out_models_architecture_png, - show_layer_names=True, - show_shapes=True) - print("Wrote: %s" % args.out_models_architecture_png) - - if args.out_models_info_rst: - # Architecture information rst - if predictor is None: - predictor = Class1AffinityPredictor.load(args.class1_models_dir) - 
if unselected_predictor is None: - unselected_predictor = Class1AffinityPredictor.load( - args.class1_unselected_models_dir) - - config_to_network = {} - config_to_alleles = {} - for (allele, networks) in unselected_predictor.allele_to_allele_specific_models.items(): - for network in networks: - config = json.dumps(network.hyperparameters) - if config not in config_to_network: - config_to_network[config] = network - config_to_alleles[config] = [] - - for (allele, networks) in predictor.allele_to_allele_specific_models.items(): - for network in networks: - config = json.dumps(network.hyperparameters) - assert config in config_to_network - config_to_alleles[config].append(allele) - - all_hyperparameters = [ - network.hyperparameters for network in config_to_network.values() - ] - hyperparameter_keys = all_hyperparameters[0].keys() - assert all( - hyperparameters.keys() == hyperparameter_keys - for hyperparameters in all_hyperparameters) - - constant_hyperparameter_keys = [ - k for k in hyperparameter_keys - if all([ - hyperparameters[k] == all_hyperparameters[0][k] - for hyperparameters in all_hyperparameters - ]) - ] - constant_hypeparameters = dict( - (key, all_hyperparameters[0][key]) - for key in sorted(constant_hyperparameter_keys) - ) - - def write_hyperparameters(fd, hyperparameters): - rows = [] - for key in sorted(hyperparameters.keys()): - rows.append((key, json.dumps(hyperparameters[key]))) - fd.write("\n") - fd.write( - tabulate(rows, ["Hyperparameter", "Value"], tablefmt="grid")) - - with open(args.out_models_info_rst, "w") as fd: - fd.write("Hyperparameters shared by all %d architectures:\n" % - len(config_to_network)) - write_hyperparameters(fd, constant_hypeparameters) - fd.write("\n") - - configs = sorted( - config_to_alleles, - key=lambda s: len(config_to_alleles[s]), - reverse=True) - - for (i, config) in enumerate(configs): - network = config_to_network[config] - lines = [] - network.network().summary(print_fn=lines.append) - - 
specific_hyperparameters = dict( - (key, value) - for (key, value) in network.hyperparameters.items() - if key not in constant_hypeparameters) - - def name_component(key, value): - if key == "locally_connected_layers": - return "lc=%d" % len(value) - elif key == "train_data": - return value["subset"] + "-data" - elif key == "layer_sizes": - (value,) = value - key = "size" - elif key == "dense_layer_l1_regularization": - if value == 0: - return "no-reg" - key = "reg" - return "%s=%s" % (key, value) - - def sort_key(component): - if "lc" in component: - return (1, component) - if "reg" in component: - return (2, component) - return (0, component) - - components = [ - name_component(key, value) - for (key, value) in specific_hyperparameters.items() - ] - name = ",".join(sorted(components, key=sort_key)) - - fd.write("Architecture %d / %d %s\n" % ( - (i + 1, len(config_to_network), name))) - fd.write("+" * 40) - fd.write("\n") - fd.write( - "Selected in the ensembles for %d alleles: *%s*.\n\n" % ( - len(config_to_alleles[config]), - ", ".join( - sorted(config_to_alleles[config])))) - write_hyperparameters( - fd, - specific_hyperparameters) - fd.write("\n\n::\n\n") - for line in lines: - fd.write(" ") - fd.write(line) - fd.write("\n") - print("Wrote: %s" % args.out_models_info_rst) - - if args.out_alleles_info_rst: - # Models cv output - df = pandas.read_csv( - join(args.class1_models_dir, "unselected_summary.csv.bz2")) - - train_df = pandas.read_csv( - join(args.class1_unselected_models_dir, "train_data.csv.bz2")) - - quantitative_train_measurements_by_allele = train_df.loc[ - train_df.measurement_type == "quantitative" - ].allele.value_counts() - - train_measurements_by_allele = train_df.allele.value_counts() - - df = df.sort_values("allele").copy() - - df["scoring"] = df.unselected_score_plan.str.replace( - "\\(\\|[0-9.]+\\|\\)", "") - df["models selected"] = df["num_models"] - - df["sanitized_scoring"] = df.scoring.map( - lambda s: s.replace("mass-spec", 
"").replace("mse", "").replace("(", "").replace(")", "").strip() - ) - - df["mass spec scoring"] = df.sanitized_scoring.map( - lambda s: s.split(",")[0] if "," in s else "" - ) - df["mean square error scoring"] = df.sanitized_scoring.map( - lambda s: s.split(",")[-1] - ) - df["unselected percentile"] = df.unselected_accuracy_score_percentile - - df["train data (all)"] = df.allele.map(train_measurements_by_allele) - df["train data (quantitative)"] = df.allele.map( - quantitative_train_measurements_by_allele) - - def write_df(df, fd): - rows = [ - row for (_, row) in df.iterrows() - ] - fd.write("\n") - fd.write( - tabulate(rows, - [col.replace("_", " ") for col in df.columns], - tablefmt="grid")) - fd.write("\n\n") - - with open(args.out_alleles_info_rst, "w") as fd: - fd.write("Supported alleles\n") - fd.write("+" * 80) - fd.write("\n\n") - - common_cols = [ - "allele", - "train data (all)", - "train data (quantitative)", - "mass spec scoring", - "mean square error scoring", - "unselected percentile", - "unselected_score", - "unselected_score_scrambled_mean", - ] - - sub_df = df.loc[df.retained][common_cols + [ - "models selected", - ]] - write_df(sub_df, fd) - - fd.write("Rejected alleles\n") - fd.write("+" * 80) - fd.write("\n\n") - fd.write( - "Training for the following alleles was attempted but the " - "resulting models were excluded due to inadequate performance on " - "held out data.") - fd.write("\n\n") - - sub_df = df.loc[~df.retained][common_cols].sort_values("allele") - write_df(sub_df, fd) - print("Wrote: %s" % args.out_alleles_info_rst) - -if __name__ == "__main__": - go(sys.argv[1:]) \ No newline at end of file diff --git a/docs/generate_class1_pan.py b/docs/generate_class1_pan.py deleted file mode 100644 index ef868a84..00000000 --- a/docs/generate_class1_pan.py +++ /dev/null @@ -1,278 +0,0 @@ -""" -Generate certain RST files used in documentation. 
-""" -from __future__ import print_function -import sys -import argparse -from collections import OrderedDict, defaultdict -import os -from os.path import join, exists -from os import mkdir - -import pandas -import logomaker - -import tqdm - -from matplotlib import pyplot - -from mhcflurry.downloads import get_path -from mhcflurry.amino_acid import COMMON_AMINO_ACIDS -from mhcflurry.class1_affinity_predictor import Class1AffinityPredictor - -AMINO_ACIDS = sorted(COMMON_AMINO_ACIDS) - -parser = argparse.ArgumentParser(usage=__doc__) -parser.add_argument( - "--class1-models-dir", - metavar="DIR", - default=get_path( - "models_class1_pan", "models.combined", test_exists=False), - help="Class1 models. Default: %(default)s", -) -parser.add_argument( - "--logo-cutoff", - default=0.01, - type=float, - help="Fraction of top to use for motifs", -) -parser.add_argument( - "--length-cutoff", - default=0.01, - type=float, - help="Fraction of top to use for length distribution", -) -parser.add_argument( - "--length-distribution-lengths", - nargs="+", - default=[8, 9, 10, 11, 12, 13, 14, 15], - type=int, - help="Peptide lengths for length distribution plots", -) -parser.add_argument( - "--motif-lengths", - nargs="+", - default=[8, 9, 10, 11], - type=int, - help="Peptide lengths for motif plots", -) -parser.add_argument( - "--out-dir", - metavar="DIR", - required=True, - help="Directory to write RSTs and images to", -) -parser.add_argument( - "--max-alleles", - default=None, - type=int, - metavar="N", - help="Only use N alleles (for testing)", -) - - -def model_info(models_dir): - allele_to_sequence = Class1AffinityPredictor.load( - models_dir).allele_to_sequence - - length_distributions_df = pandas.read_csv( - join(models_dir, "length_distributions.csv.bz2")) - frequency_matrices_df = pandas.read_csv( - join(models_dir, "frequency_matrices.csv.bz2")) - try: - train_data_df = pandas.read_csv( - join(models_dir, "train_data.csv.bz2")) - observations_per_allele = ( - 
train_data_df.groupby("allele").peptide.nunique().to_dict()) - except IOError: - observations_per_allele = None - - distribution = frequency_matrices_df.loc[ - (frequency_matrices_df.cutoff_fraction == 1.0), AMINO_ACIDS - ].mean(0) - - normalized_frequency_matrices = frequency_matrices_df.copy() - normalized_frequency_matrices.loc[:, AMINO_ACIDS] = ( - normalized_frequency_matrices[AMINO_ACIDS] / distribution) - - sequence_to_alleles = defaultdict(list) - for allele in normalized_frequency_matrices.allele.unique(): - sequence = allele_to_sequence[allele] - sequence_to_alleles[sequence].append(allele) - - allele_equivalance_classes = sorted([ - sorted(equivalence_group) - for equivalence_group in sequence_to_alleles.values() - ], key=lambda equivalence_group: equivalence_group[0]) - - return { - 'length_distributions': length_distributions_df, - 'normalized_frequency_matrices': normalized_frequency_matrices, - 'observations_per_allele': observations_per_allele, - 'allele_equivalance_classes': allele_equivalance_classes, - } - - -def write_logo( - normalized_frequency_matrices, - allele, - lengths, - cutoff, - models_label, - out_dir): - - fig = pyplot.figure(figsize=(8,10)) - - for (i, length) in enumerate(lengths): - ax = pyplot.subplot(len(lengths), 1, i + 1) - matrix = normalized_frequency_matrices.loc[ - (normalized_frequency_matrices.allele == allele) & - (normalized_frequency_matrices.length == length) & - (normalized_frequency_matrices.cutoff_fraction == cutoff) - ].set_index("position")[AMINO_ACIDS] - if matrix.shape[0] == 0: - return None - - matrix = (matrix.T / matrix.sum(1)).T # row normalize - - ss_logo = logomaker.Logo( - matrix, - width=.8, - vpad=.05, - fade_probabilities=True, - stack_order='small_on_top', - ax=ax, - ) - pyplot.title( - "%s %d-mer (%s)" % (allele, length, models_label), y=0.85) - pyplot.xticks(matrix.index.values) - pyplot.tight_layout() - name = "%s.motifs.%s.png" % ( - allele.replace("*", "-").replace(":", "-"), models_label) - 
filename = os.path.abspath(join(out_dir, name)) - pyplot.savefig(filename) - print("Wrote: ", filename) - fig.clear() - pyplot.close(fig) - return name - - -def write_length_distribution( - length_distributions_df, allele, lengths, cutoff, models_label, out_dir): - length_distribution = length_distributions_df.loc[ - (length_distributions_df.allele == allele) & - (length_distributions_df.cutoff_fraction == cutoff) - ] - if length_distribution.shape[0] == 0: - return None - - length_distribution = length_distribution.set_index( - "length").reindex(lengths).fillna(0.0).reset_index() - - fig = pyplot.figure(figsize=(8, 2)) - length_distribution.plot(x="length", y="fraction", kind="bar", color="black") - pyplot.title("%s (%s)" % (allele, models_label)) - pyplot.xlabel("") - pyplot.xticks(rotation=0) - pyplot.gca().get_legend().remove() - name = "%s.lengths.%s.png" % ( - allele.replace("*", "-").replace(":", "-"), models_label) - - filename = os.path.abspath(join(out_dir, name)) - pyplot.savefig(filename) - print("Wrote: ", filename) - fig.clear() - pyplot.close(fig) - return name - - -def go(argv): - args = parser.parse_args(argv) - - if not exists(args.out_dir): - mkdir(args.out_dir) - - predictors = [ - ("combined", args.class1_models_dir), - ] - info_per_predictor = OrderedDict() - alleles = set() - for (label, models_dir) in predictors: - if not models_dir: - continue - info_per_predictor[label] = model_info(models_dir) - alleles.update( - info_per_predictor[label]["normalized_frequency_matrices"].allele.unique()) - - lines = [] - - def w(*pieces): - lines.extend(pieces) - - w('Motifs and length distributions from the pan-allele predictor') - w('=' * 80, "") - - w( - "Length distributions and binding motifs were calculated by ranking a " - "large set of random peptides (an equal number of peptides for each " - "length 8-15) by predicted affinity for each allele. 
" - "For length distribution, the top %g%% of peptides were collected and " - "their length distributions plotted. For sequence motifs, sequence " - "logos for the top %g%% " - "peptides for each length are shown.\n" % ( - args.length_cutoff * 100.0, - args.logo_cutoff * 100.0, - )) - - w(".. contents:: :local:", "") - - def image(name): - if name is None: - return "" - return '.. image:: %s\n' % name - - alleles = sorted(alleles, key=lambda a: ("HLA" not in a, a)) - if args.max_alleles: - alleles = alleles[:args.max_alleles] - - for allele in tqdm.tqdm(alleles): - w(allele, "-" * 80, "") - for (label, info) in info_per_predictor.items(): - length_distribution = info["length_distributions"] - normalized_frequency_matrices = info["normalized_frequency_matrices"] - - length_distribution_image_path = write_length_distribution( - length_distributions_df=length_distribution, - allele=allele, - lengths=args.length_distribution_lengths, - cutoff=args.length_cutoff, - out_dir=args.out_dir, - models_label=label) - if not length_distribution_image_path: - continue - w("*%s*\n") - if info['observations_per_allele'] is not None: - w("Training observations (unique peptides): %d" % ( - info['observations_per_allele'].get(allele, 0))) - w("\n") - w(image(length_distribution_image_path)) - w(image(write_logo( - normalized_frequency_matrices=normalized_frequency_matrices, - allele=allele, - lengths=args.motif_lengths, - cutoff=args.logo_cutoff, - out_dir=args.out_dir, - models_label=label, - ))) - w("") - - document_path = join(args.out_dir, "allele_motifs.rst") - with open(document_path, "w") as fd: - for line in lines: - fd.write(line) - fd.write("\n") - print("Wrote", document_path) - - -if __name__ == "__main__": - go(sys.argv[1:]) diff --git a/docs/intro.rst b/docs/intro.rst index dbd82ac4..5b5f3581 100644 --- a/docs/intro.rst +++ b/docs/intro.rst @@ -2,26 +2,33 @@ Introduction and setup ======================= MHCflurry is an open source package for peptide/MHC I binding 
affinity prediction. It -provides competitive accuracy with a fast and documented implementation. - -You can download pre-trained MHCflurry models fit to affinity measurements -deposited in IEDB (and a few other sources) -or train a MHCflurry predictor on your own data. - -Currently only allele-specific prediction is implemented, in which separate models -are trained for each allele. The released models therefore support a fixed set of common -class I alleles for which sufficient published training data is available -(see :ref:`models_supported_alleles`\ ). - -MHCflurry supports Python versions 2.7 and 3.4+. It uses the `keras <https://keras.io>`__ +attempts to provide competitive accuracy with a fast and documented implementation. + +You can download pre-trained MHCflurry models fit to mass spec-identified MHC I +ligands and peptide/MHC affinity measurements deposited in IEDB (plus a few other +sources) or train a MHCflurry predictor on your own data. + +Starting in version 1.6.0, the default MHCflurry binding affinity predictors +are "pan-allele" models that support most sequenced MHC I alleles across humans +and a few other species (about 14,000 alleles in total). This version also +introduces two experimental predictors, an "antigen processing" predictor +that attempts to model MHC allele-independent effects such as proteosomal +cleavage and a "presentation" predictor that integrates processing predictions +with binding affinity predictions to give a composite "presentation score." Both +models are trained on mass spec-identified MHC ligands. + +MHCflurry supports Python 3.4+. It uses the `keras <https://keras.io>`__ neural network library via either the Tensorflow or Theano backends. GPUs may -optionally be used for a generally modest speed improvement. +optionally be used for a modest speed improvement. If you find MHCflurry useful in your research please cite: - O'Donnell, T. et al., 2017. MHCflurry: open-source class I MHC - binding affinity prediction. 
bioRxiv. Available at: - http://www.biorxiv.org/content/early/2017/08/09/174243. + T. J. O’Donnell, et al., "MHCflurry: Open-Source Class I MHC Binding Affinity + Prediction," *Cell Systems*, 2018. + https://www.cell.com/cell-systems/fulltext/S2405-4712(18)30232-1. + +If you have questions or encounter problems, please file an issue at the +MHCflurry github repo: https://github.com/openvax/mhcflurry Installation (pip) diff --git a/docs/python_tutorial.rst b/docs/python_tutorial.rst index 132cea7c..b906c29f 100644 --- a/docs/python_tutorial.rst +++ b/docs/python_tutorial.rst @@ -1,159 +1,183 @@ Python library tutorial ======================= -Predicting ----------- - The MHCflurry Python API exposes additional options and features beyond those -supported by the commandline tools. This tutorial gives a basic overview -of the most important functionality. See the :ref:`API-documentation` for further details. - -The `~mhcflurry.Class1AffinityPredictor` class is the primary user-facing interface. -Use the `~mhcflurry.Class1AffinityPredictor.load` static method to load a +supported by the commandline tools and can be more convenient for interactive +analyses and bioinformatic pipelines. This tutorial gives a basic overview +of the most important functionality. See the :ref:`API-documentation` for further +details. + +Loading a predictor +---------------------------------- + +Most prediction tasks can be performed using the +`~mhcflurry.Class1PresentationPredictor` class, which provides a programmatic API +to the functionality in the :ref:`mhcflurry-predict` and +:ref:`mhcflurry-predict-scan` commands. + +Instances of `~mhcflurry.Class1PresentationPredictor` wrap a +`~mhcflurry.Class1AffinityPredictor` to generate binding affinity predictions +and a `~mhcflurry.Class1ProcessingPredictor` to generate antigen processing +predictions. The presentation score is computed using a logistic regression +model over binding affinity and processing predictions. 
+ +Use the `~mhcflurry.Class1PresentationPredictor.load` static method to load a trained predictor from disk. With no arguments this method will load the predictor released with MHCflurry (see :ref:`downloading`\ ). If you pass a path to a models directory, then it will load that predictor instead. -.. runblock:: pycon +.. doctest:: - >>> from mhcflurry import Class1AffinityPredictor - >>> predictor = Class1AffinityPredictor.load() - >>> predictor.supported_alleles[:10] + >>> from mhcflurry import Class1PresentationPredictor + >>> predictor = Class1PresentationPredictor.load() + >>> predictor.supported_alleles[:5] + ['Atbe-B*01:01', 'Atbe-E*03:01', 'Atbe-G*03:01', 'Atbe-G*03:02', 'Atbe-G*06:01'] + +Predicting for individual peptides +---------------------------------- -With a predictor loaded we can now generate some binding predictions: +To generate predictions for individual peptides, we can use the +`~mhcflurry.Class1AffinityPredictor.predict` method of the `~mhcflurry.Class1PresentationPredictor`, +loaded above. This method returns a `pandas.DataFrame` with binding affinity, processing, and presentation +predictions: -.. runblock:: pycon +.. doctest:: - >>> predictor.predict(allele="HLA-A0201", peptides=["SIINFEKL", "SIINFEQL"]) + >>> predictor.predict( + ... peptides=["SIINFEKL", "NLVPMVATV"], + ... alleles=["HLA-A0201", "HLA-A0301"], + ... verbose=0) + peptide peptide_num sample_name affinity best_allele processing_score presentation_score + 0 SIINFEKL 0 sample1 12906.786173 HLA-A0201 0.101473 0.012503 + 1 NLVPMVATV 1 sample1 15.038358 HLA-A0201 0.676289 0.975463 + +Here, the list of alleles is taken to be an individual's MHC I genotype (i.e. up +to 6 alleles), and the strongest binder across alleles for each peptide is +reported. .. note:: - MHCflurry normalizes allele names using the `mhcnames <https://github.com/hammerlab/mhcnames>`__ + MHCflurry normalizes allele names using the `mhcnames <https://github.com/openvax/mhcnames>`__ package. 
Names like ``HLA-A0201`` or ``A*02:01`` will be normalized to ``HLA-A*02:01``, so most naming conventions can be used - with methods such as `~mhcflurry.Class1AffinityPredictor.predict`. - -For more detailed results, we can use -`~mhcflurry.Class1AffinityPredictor.predict_to_dataframe`. - -.. runblock:: pycon - - >>> predictor.predict_to_dataframe(allele="HLA-A0201", peptides=["SIINFEKL", "SIINFEQL"]) - -Instead of a single allele and multiple peptides, we may need predictions for -allele/peptide pairs. We can predict across pairs by specifying -the `alleles` argument instead of `allele`. The list of alleles -must be the same length as the list of peptides (i.e. it is predicting over pairs, -*not* taking the cross product). - -.. runblock:: pycon - - >>> predictor.predict(alleles=["HLA-A0201", "HLA-B*57:01"], peptides=["SIINFEKL", "SIINFEQL"]) - -Training --------- - -Let's fit our own MHCflurry predictor. First we need some training data. If you -haven't already, run this in a shell to download the MHCflurry training data: - -.. code-block:: shell - - $ mhcflurry-downloads fetch data_curated - -We can get the path to this data from Python using `mhcflurry.downloads.get_path`: - -.. runblock:: pycon - - >>> from mhcflurry.downloads import get_path - >>> data_path = get_path("data_curated", "curated_training_data.no_mass_spec.csv.bz2") - >>> data_path - -Now let's load it with pandas and filter to reasonably-sized peptides: - -.. runblock:: pycon - - >>> import pandas - >>> df = pandas.read_csv(data_path) - >>> df = df.loc[(df.peptide.str.len() >= 8) & (df.peptide.str.len() <= 15)] - >>> df.head(5) - -We'll make an untrained `~mhcflurry.Class1AffinityPredictor` and then call -`~mhcflurry.Class1AffinityPredictor.fit_allele_specific_predictors` to fit -some models. - -.. 
runblock:: pycon - - >>> new_predictor = Class1AffinityPredictor() - >>> single_allele_train_data = df.loc[df.allele == "HLA-B*57:01"].sample(100) - >>> new_predictor.fit_allele_specific_predictors( - ... n_models=1, - ... architecture_hyperparameters_list=[{ - ... "layer_sizes": [16], - ... "max_epochs": 5, - ... "random_negative_constant": 5, - ... }], - ... peptides=single_allele_train_data.peptide.values, - ... affinities=single_allele_train_data.measurement_value.values, - ... allele="HLA-B*57:01") - - -The `~mhcflurry.Class1AffinityPredictor.fit_allele_specific_predictors` method -can be called any number of times on the same instance to build up ensembles -of models across alleles. The architecture hyperparameters we specified are -for demonstration purposes; to fit real models you would usually train for -more epochs. - -Now we can generate predictions: - -.. runblock:: pycon - - >>> new_predictor.predict(["SYNPEPII"], allele="HLA-B*57:01") - -We can save our predictor to the specified directory on disk by running: - -.. runblock:: pycon - - >>> new_predictor.save("/tmp/new-predictor") - -and restore it: + with methods such as `~mhcflurry.Class1PresentationPredictor.predict`. + +If you have multiple sample genotypes, you can pass a dict, where the +keys are arbitrary sample names: + +.. doctest:: + + >>> predictor.predict( + ... peptides=["KSEYMTSWFY", "NLVPMVATV"], + ... alleles={ + ... "sample1": ["A0201", "A0301", "B0702", "B4402", "C0201", "C0702"], + ... "sample2": ["A0101", "A0206", "B5701", "C0202"], + ... }, + ... verbose=0) + peptide peptide_num sample_name affinity best_allele processing_score presentation_score + 0 KSEYMTSWFY 0 sample1 16737.745268 A0301 0.381632 0.026550 + 1 NLVPMVATV 1 sample1 15.038358 A0201 0.676289 0.975463 + 2 KSEYMTSWFY 0 sample2 62.540779 A0101 0.381632 0.796731 + 3 NLVPMVATV 1 sample2 15.765500 A0206 0.676289 0.974439 + +Here the strongest binder for each sample / peptide pair is returned. 
+
+Many users will focus on the binding affinity predictions, as the
+processing and presentation predictions are experimental. If you do use the latter
+scores, however, when available you should provide the upstream (N-flank)
+and downstream (C-flank) sequences from the source proteins of the peptides for
+a small boost in accuracy. To do so, specify the ``n_flanks`` and ``c_flanks``
+arguments, which give the flanking sequences for the corresponding peptides:
+
+.. doctest::
+
+    >>> predictor.predict(
+    ...    peptides=["KSEYMTSWFY", "NLVPMVATV"],
+    ...    n_flanks=["NNNNNNN", "SSSSSSSS"],
+    ...    c_flanks=["CCCCCCCC", "YYYAAAA"],
+    ...    alleles={
+    ...        "sample1": ["A0201", "A0301", "B0702", "B4402", "C0201", "C0702"],
+    ...        "sample2": ["A0101", "A0206", "B5701", "C0202"],
+    ...    },
+    ...    verbose=0)
+          peptide  n_flank   c_flank  peptide_num sample_name      affinity best_allele  processing_score  presentation_score
+    0  KSEYMTSWFY  NNNNNNN  CCCCCCCC            0     sample1  16737.745268       A0301          0.605816            0.056190
+    1   NLVPMVATV SSSSSSSS   YYYAAAA            1     sample1     15.038358       A0201          0.824994            0.986719
+    2  KSEYMTSWFY  NNNNNNN  CCCCCCCC            0     sample2     62.540779       A0101          0.605816            0.897493
+    3   NLVPMVATV SSSSSSSS   YYYAAAA            1     sample2     15.765500       A0206          0.824994            0.986155
+
+Scanning protein sequences
+--------------------------
+
+The `~mhcflurry.Class1PresentationPredictor.predict_sequences` method supports
+scanning protein sequences for MHC ligands. Here's an example to identify all
+peptides with a predicted binding affinity of 500 nM or tighter to any allele
+across two sample genotypes and two short peptide sequences.
+
+.. doctest::
+
+    >>> predictor.predict_sequences(
+    ...    sequences={
+    ...        'protein1': "MDSKGSSQKGSRLLLLLVVSNLL",
+    ...        'protein2': "SSLPTPEDKEQAQQTHH",
+    ...    },
+    ...    alleles={
+    ...        "sample1": ["A0201", "A0301", "B0702"],
+    ...        "sample2": ["A0101", "C0202"],
+    ...    },
+    ...    result="filtered",
+    ...    comparison_quantity="affinity",
+    ...    filter_value=500,
+    ...    
verbose=0) + sequence_name pos peptide n_flank c_flank sample_name affinity best_allele affinity_percentile processing_score presentation_score + 0 protein1 13 LLLLVVSNL MDSKGSSQKGSRL L sample1 38.206225 A0201 0.380125 0.017644 0.571060 + 1 protein1 14 LLLVVSNLL MDSKGSSQKGSRLL sample1 42.243472 A0201 0.420250 0.090984 0.619213 + 2 protein1 5 SSQKGSRLL MDSKG LLLVVSNLL sample2 66.749223 C0202 0.803375 0.383608 0.774468 + 3 protein1 6 SQKGSRLLL MDSKGS LLVVSNLL sample2 178.033467 C0202 1.820000 0.275019 0.482206 + 4 protein1 13 LLLLVVSNLL MDSKGSSQKGSRL sample1 202.208167 A0201 1.112500 0.058782 0.261320 + 5 protein1 12 LLLLLVVSNL MDSKGSSQKGSR L sample1 202.506582 A0201 1.112500 0.010025 0.225648 + 6 protein2 0 SSLPTPEDK EQAQQTHH sample1 335.529377 A0301 1.011750 0.010443 0.156798 + 7 protein2 0 SSLPTPEDK EQAQQTHH sample2 353.451759 C0202 2.674250 0.010443 0.150753 + 8 protein1 8 KGSRLLLLL MDSKGSSQ VVSNLL sample2 410.327286 C0202 2.887000 0.121374 0.194081 + 9 protein1 5 SSQKGSRL MDSKG LLLLVVSNLL sample2 477.285937 C0202 3.107375 0.111982 0.168572 -.. runblock:: pycon +When using ``predict_sequences``, the flanking sequences for each peptide are +automatically included in the processing and presentation predictions. - >>> new_predictor2 = Class1AffinityPredictor.load("/tmp/new-predictor") - >>> new_predictor2.supported_alleles +See the documentation for `~mhcflurry.Class1PresentationPredictor` for other +useful methods. -Lower level interface ---------------------- +Lower level interfaces +---------------------------------- -The high-level `Class1AffinityPredictor` delegates to low-level -`~mhcflurry.Class1NeuralNetwork` objects, each of which represents -a single neural network. The purpose of `~mhcflurry.Class1AffinityPredictor` -is to implement several important features: +The `~mhcflurry.Class1PresentationPredictor` predictor delegates to a +`~mhcflurry.Class1AffinityPredictor` instance for binding affinity predictions. 
+
+If all you need are binding affinities, you can use this instance directly.
 
-ensembles
-    More than one neural network can be used to generate each prediction. The
-    predictions returned to the user are the geometric mean of the individual
-    model predictions. This gives higher accuracy in most situations
+Here's an example:
 
-multiple alleles
-    A `~mhcflurry.Class1NeuralNetwork` generates predictions for only a single
-    allele. The `~mhcflurry.Class1AffinityPredictor` maps alleles to the
-    relevant `~mhcflurry.Class1NeuralNetwork` instances
+.. doctest::
 
-serialization
-    Loading and saving predictors is implemented in `~mhcflurry.Class1AffinityPredictor`.
+    >>> from mhcflurry import Class1AffinityPredictor
+    >>> predictor = Class1AffinityPredictor.load()
+    >>> predictor.predict_to_dataframe(allele="HLA-A0201", peptides=["SIINFEKL", "SIINFEQL"])
+        peptide     allele    prediction  prediction_low  prediction_high  prediction_percentile
+    0  SIINFEKL  HLA-A0201  12906.786173     8829.460289     18029.923061               6.566375
+    1  SIINFEQL  HLA-A0201  13025.300796     9050.056312     18338.004869               6.623625
 
-Sometimes it's easiest to work directly with `~mhcflurry.Class1NeuralNetwork`.
-Here is a simple example of doing so:
+The ``prediction_low`` and ``prediction_high`` fields give the 5-95 percentile
+predictions across the models in the ensemble. This detailed information is not
+available through the higher-level `~mhcflurry.Class1PresentationPredictor`
+interface.
 
-.. runblock:: pycon
+Under the hood, `Class1AffinityPredictor` itself delegates to an ensemble
+of `~mhcflurry.Class1NeuralNetwork` instances, which implement the neural network
+models used for prediction. To fit your own affinity prediction models, call
+`~mhcflurry.Class1NeuralNetwork.fit`.
 
-    >>> from mhcflurry import Class1NeuralNetwork
-    >>> network = Class1NeuralNetwork()
-    >>> network.fit(
-    ...    single_allele_train_data.peptide.values,
-    ...    single_allele_train_data.measurement_value.values,
-    ...    
verbose=0)
-    >>> network.predict(["SIINFEKLL"])
+You can similarly use `~mhcflurry.Class1ProcessingPredictor` directly for
+antigen processing prediction, and there is a low-level
+`~mhcflurry.Class1ProcessingNeuralNetwork` with a `~mhcflurry.Class1ProcessingNeuralNetwork.fit` method.
+See the API documentation of these classes for details.
\ No newline at end of file
diff --git a/docs/requirements.txt b/docs/requirements.txt
index d47a6bca..faadab82 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,10 +1,9 @@
 sphinx
-sphinxcontrib-autorun
 sphinxcontrib-programoutput
 sphinxcontrib-autoprogram
 sphinx-rtd-theme
 numpydoc
-pypandoc
+pypandoc
 mhctools
 pydot
 tabulate
diff --git a/mhcflurry/__init__.py b/mhcflurry/__init__.py
index c66586cb..9c72a602 100644
--- a/mhcflurry/__init__.py
+++ b/mhcflurry/__init__.py
@@ -5,6 +5,7 @@ Class I MHC ligand prediction package
 from .class1_affinity_predictor import Class1AffinityPredictor
 from .class1_neural_network import Class1NeuralNetwork
 from .class1_processing_predictor import Class1ProcessingPredictor
+from .class1_processing_neural_network import Class1ProcessingNeuralNetwork
 from .class1_presentation_predictor import Class1PresentationPredictor
 from .version import __version__
 
@@ -12,6 +13,8 @@ from .version import __version__
 __all__ = [
     "__version__",
     "Class1AffinityPredictor",
-    "Class1NeuralNetwork", "Class1ProcessingPredictor",
+    "Class1NeuralNetwork",
+    "Class1ProcessingPredictor",
+    "Class1ProcessingNeuralNetwork",
     "Class1PresentationPredictor",
 ]
diff --git a/mhcflurry/class1_presentation_predictor.py b/mhcflurry/class1_presentation_predictor.py
index bdb39705..37fe175e 100644
--- a/mhcflurry/class1_presentation_predictor.py
+++ b/mhcflurry/class1_presentation_predictor.py
@@ -46,7 +46,7 @@ class Class1PresentationPredictor(object):
     a "presentation score" prediction.
 
     
Most users will call the `load` static method to get an instance of this - class, then call the `predict_to_dataframe` method to generate predictions. + class, then call the `predict` method to generate predictions. """ model_inputs = ["affinity_score", "processing_score"] @@ -85,7 +85,7 @@ class Class1PresentationPredictor(object): peptides, alleles, sample_names=None, - include_affinity_percentile=False, + include_affinity_percentile=True, verbose=1, throw=True): """ @@ -391,68 +391,6 @@ class Class1PresentationPredictor(object): return model def predict( - self, - peptides, - alleles, - sample_names=None, - n_flanks=None, - c_flanks=None, - verbose=1): - """ - Predict presentation scores across a set of peptides. - - Presentation scores combine predictions for MHC I binding affinity - and antigen processing. - - For intermediate results, see the `predict_to_dataframe` method. - - Parameters - ---------- - peptides : list of string - Peptide sequences - alleles : list of string or string -> string dict - If you are predicting for a single sample, pass a list of strings - (up to 6) indicating the genotype. If you are predicting across - multiple samples, pass a dict where the keys are (arbitrary) - sample names and the values are the alleles to predict for that - sample. - sample_names : list of string [same length as peptides] - If you are passing a dict for 'alleles', use this argument to - specify which peptides go with which sample. - n_flanks : list of string [same length as peptides] - Upstream sequences before the peptide. Sequences of any length can - be given and a suffix of the size supported by the model will be - used. - c_flanks : list of string [same length as peptides] - Downstream sequences after the peptide. Sequences of any length can - be given and a prefix of the size supported by the model will be - used. - verbose : int - Set to 0 for quiet mode. - - Returns - ------- - numpy.array - - Presentation scores for each peptide. 
Scores range from 0 to 1, with - higher values indicating more favorable presentation likelihood. - """ - if isinstance(alleles, dict): - if sample_names is None: - raise ValueError( - "sample_names must be supplied when alleles is a dict. " - "Alternatively, call predict_to_dataframe to predict over " - "all samples") - - return self.predict_to_dataframe( - peptides=peptides, - alleles=alleles, - sample_names=sample_names, - n_flanks=n_flanks, - c_flanks=c_flanks, - verbose=verbose).presentation_score.values - - def predict_to_dataframe( self, peptides, alleles, @@ -475,7 +413,7 @@ class Class1PresentationPredictor(object): Example: >>> predictor = Class1PresentationPredictor.load() - >>> predictor.predict_to_dataframe( + >>> predictor.predict( ... peptides=["SIINFEKL", "PEPTIDE"], ... n_flanks=["NNN", "SNS"], ... c_flanks=["CCC", "CNC"], @@ -766,7 +704,7 @@ class Class1PresentationPredictor(object): c_flanks.append( sequence[peptide_start + peptide_length : c_flank_end]) - result_df = self.predict_to_dataframe( + result_df = self.predict( peptides=peptides, alleles=alleles, n_flanks=n_flanks, diff --git a/mhcflurry/predict_command.py b/mhcflurry/predict_command.py index ecb35872..b054b696 100644 --- a/mhcflurry/predict_command.py +++ b/mhcflurry/predict_command.py @@ -275,7 +275,7 @@ def run(argv=sys.argv[1:]): "No flanking information provided. Specify --no-flanking " "to silence this warning") - predictions = predictor.predict_to_dataframe( + predictions = predictor.predict( peptides=df[args.peptide_column].values, n_flanks=n_flanks, c_flanks=c_flanks, -- GitLab