From d52134fed62c8dddc3461c3e0b666197ba48ba50 Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Thu, 21 Dec 2017 16:10:32 -0500 Subject: [PATCH] update docs --- docs/commandline_tools.rst | 14 ++ docs/commandline_tutorial.rst | 117 ++++++++++--- docs/conf.py | 1 + docs/index.rst | 1 + docs/package_readme/readme.generated.rst | 156 +++++++++++++----- docs/python_tutorial.rst | 4 +- docs/requirements.txt | 1 + mhcflurry/downloads_command.py | 13 -- mhcflurry/predict_command.py | 17 +- .../train_allele_specific_models_command.py | 1 - 10 files changed, 225 insertions(+), 100 deletions(-) create mode 100644 docs/commandline_tools.rst diff --git a/docs/commandline_tools.rst b/docs/commandline_tools.rst new file mode 100644 index 00000000..3b36a80a --- /dev/null +++ b/docs/commandline_tools.rst @@ -0,0 +1,14 @@ +Command-line reference +============================ + +.. _mhcflurry-predict: +.. autoprogram:: mhcflurry.predict_command:parser + :prog: mhcflurry-predict + +.. _mhcflurry-class1-train-allele-specific-models: +.. autoprogram:: mhcflurry.train_allele_specific_models_command:parser + :prog: mhcflurry-class1-train-allele-specific-models + +.. _mhcflurry-downloads: +.. autoprogram:: mhcflurry.downloads_command:parser + :prog: mhcflurry-downloads diff --git a/docs/commandline_tutorial.rst b/docs/commandline_tutorial.rst index ab5fdaff..0f33229f 100644 --- a/docs/commandline_tutorial.rst +++ b/docs/commandline_tutorial.rst @@ -1,49 +1,112 @@ -Command-line usage -================== +Command-line tutorial +===================== Downloading models ------------------ Most users will use pre-trained MHCflurry models that we release. These models -are distributed separately from the source code and may be downloaded with the -following command: +are distributed separately from the pip package and may be downloaded with the +:ref:`mhcflurry-downloads` tool: + +.. code-block:: shell -.. 
code: shell $ mhcflurry-downloads fetch models_class1 -We also release other "downloads," such as curated training data and some +We also release a few other "downloads," such as curated training data and some experimental models. To see what you have downloaded, run: -.. code: shell - $ mhcflurry-downloads info +.. command-output:: mhcflurry-downloads info + :nostderr: + +Files downloaded with :ref:`mhcflurry-downloads` are stored in a platform-specific +directory. To get the path to downloaded data, you can use: +.. command-output:: mhcflurry-downloads path models_class1 + :nostderr: -mhcflurry-predict ------------------ -The ``mhcflurry-predict`` command generates predictions from the command-line. -It defaults to using the pre-trained models you downloaded above but this can -be customized with the ``--models`` argument. See ``mhcflurry-predict -h`` for -details. +Generating predictions +---------------------- -.. command-output:: mhcflurry-predict --alleles HLA-A0201 HLA-A0301 --peptides SIINFEKL SIINFEKD SIINFEKQ +The :ref:`mhcflurry-predict` command generates predictions from the command-line. +By default it will use the pre-trained models you downloaded above but other +models can be used by specifying the ``--models`` argument. + +Running: + +.. command-output:: + mhcflurry-predict + --alleles HLA-A0201 HLA-A0301 + --peptides SIINFEKL SIINFEKD SIINFEKQ + --out /tmp/predictions.csv :nostderr: -The predictions returned are affinities (KD) in nM. The ``prediction_low`` and -``prediction_high`` fields give the 5-95 percentile predictions across -the models in the ensemble. The predictions above were generated with MHCflurry -|version|. +results in a file like this: -Your exact predictions may vary slightly from these (up to about 1 nM) depending -on the Keras backend in use and other numerical details. Different versions of -MHCflurry can of course give results considerably different from these. +.. 
command-output:: + head -n 3 /tmp/predictions.csv + +The predictions are given as affinities (KD) in nM in the ``mhcflurry_prediction`` +column. The other fields give the 5-95 percentile predictions across +the models in the ensemble and the quantile of the affinity prediction among +a large number of random peptides tested on that allele. + +The predictions shown above were generated with MHCflurry |version|. Different versions of +MHCflurry can give considerably different results. Even +on the same version, your exact predictions may vary (up to about 1 nM) depending +on the Keras backend and other details. -You can also specify the input and output as CSV files. Run -``mhcflurry-predict -h`` for details. +In most cases you'll want to specify the input as a CSV file instead of passing +peptides and alleles as command-line arguments. See the :ref:`mhcflurry-predict` docs. Fitting your own models ----------------------- +The :ref:`mhcflurry-class1-train-allele-specific-models` command is used to +fit models to training data. The models we release with MHCflurry are trained +with a command like: + +.. code-block:: shell + + $ mhcflurry-class1-train-allele-specific-models \ + --data TRAINING_DATA.csv \ + --hyperparameters hyperparameters.yaml \ + --percent-rank-calibration-num-peptides-per-length 1000000 \ + --min-measurements-per-allele 75 \ + --out-models-dir models + +MHCflurry predictors are serialized to disk as many files in a directory. The +command above will write the models to the output directory specified by the +``--out-models-dir`` argument. This directory has files like: + +.. program-output:: + ls "$(mhcflurry-downloads path models_class1)/models" + :shell: + :nostderr: + :ellipsis: 3,-3 + +The ``manifest.csv`` file gives metadata for all the models used in the predictor. +There will be a ``weights_...`` file for each model giving its weights +(the parameters for the neural network). 
The ``percent_ranks.csv`` file stores a +histogram of model predictions for each allele over a large number of random +peptides. It is used for generating the percent ranks at prediction time. + +To call :ref:`mhcflurry-class1-train-allele-specific-models` you'll need some +training data. The data we use for our released predictors can be downloaded with +:ref:`mhcflurry-downloads`: + +.. code-block:: shell + + $ mhcflurry-downloads fetch data_curated + +It looks like this: + +.. command-output:: + bzcat "$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" | head -n 3 + :shell: + :nostderr: + + Scanning protein sequences for predicted epitopes ------------------------------------------------- @@ -54,7 +117,7 @@ Here is an example. First, install ``mhctools`` if it is not already installed: -.. code:: shell +.. code-block:: shell $ pip install mhctools @@ -72,11 +135,11 @@ Here's the ``mhctools`` invocation. See ``mhctools -h`` for more information. --mhc-alleles A02:01,A03:01 --mhc-peptide-lengths 8,9,10,11 --extract-subsequences - --output-csv /tmp/result.csv + --output-csv /tmp/subsequence_predictions.csv :ellipsis: 2,-2 :nostderr: This will write a file giving predictions for all subsequences of the specified lengths: .. command-output:: - head -n 3 /tmp/result.csv + head -n 3 /tmp/subsequence_predictions.csv diff --git a/docs/conf.py b/docs/conf.py index ffd7f081..7df14e7a 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -40,6 +40,7 @@ extensions = [ 'numpydoc', 'sphinx_autorun', 'sphinxcontrib.programoutput', + 'sphinxcontrib.autoprogram', ] # Add any paths that contain templates here, relative to this directory. 
diff --git a/docs/index.rst b/docs/index.rst index 750e3f6b..b98a0d89 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -11,6 +11,7 @@ Contents: python_tutorial models_supported_alleles models + commandline_tools api diff --git a/docs/package_readme/readme.generated.rst b/docs/package_readme/readme.generated.rst index d590e1da..e431863f 100644 --- a/docs/package_readme/readme.generated.rst +++ b/docs/package_readme/readme.generated.rst @@ -70,55 +70,133 @@ Then continue as above: mhcflurry-downloads fetch -Command-line usage -================== +Command-line tutorial +===================== Downloading models ****************** Most users will use pre-trained MHCflurry models that we release. -These models are distributed separately from the source code and may -be downloaded with the following command: +These models are distributed separately from the pip package and may +be downloaded with the mhcflurry-downloads tool: -We also release other “downloads,” such as curated training data and -some experimental models. To see what you have downloaded, run: + $ mhcflurry-downloads fetch models_class1 +We also release a few other “downloads,” such as curated training data +and some experimental models. To see what you have downloaded, run: -mhcflurry-predict -***************** + $ mhcflurry-downloads info + Environment variables + MHCFLURRY_DATA_DIR [unset or empty] + MHCFLURRY_DOWNLOADS_CURRENT_RELEASE [unset or empty] + MHCFLURRY_DOWNLOADS_DIR [unset or empty] -The "mhcflurry-predict" command generates predictions from the -command-line. It defaults to using the pre-trained models you -downloaded above but this can be customized with the "--models" -argument. See "mhcflurry-predict -h" for details. + Configuration + current release = 1.0.0 + downloads dir = '/Users/tim/Library/Application Support/mhcflurry/4/1.0.0' [exists] - $ mhcflurry-predict --alleles HLA-A0201 HLA-A0301 --peptides SIINFEKL SIINFEKD SIINFEKQ + DOWNLOAD NAME DOWNLOADED? DEFAULT? 
URL + models_class1 YES YES http://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/models_class1.tar.bz2 + models_class1_experiments1 NO NO http://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/models_class1_experiments1.tar.bz2 + cross_validation_class1 YES NO http://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/cross_validation_class1.tar.bz2 + data_iedb NO NO https://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/data_iedb.tar.bz2 + data_kim2014 NO NO http://github.com/hammerlab/mhcflurry/releases/download/0.9.1/data_kim2014.tar.bz2 + data_curated YES YES https://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/data_curated.tar.bz2 + +Files downloaded with mhcflurry-downloads are stored in a platform- +specific directory. To get the path to downloaded data, you can use: + + $ mhcflurry-downloads path models_class1 + /Users/tim/Library/Application Support/mhcflurry/4/1.0.0/models_class1/ + + +Generating predictions +********************** + +The mhcflurry-predict command generates predictions from the command- +line. By default it will use the pre-trained models you downloaded +above but other models can be used by specifying the "--models" +argument. 
+ +Running: + + $ mhcflurry-predict + --alleles HLA-A0201 HLA-A0301 + --peptides SIINFEKL SIINFEKD SIINFEKQ + --out /tmp/predictions.csv + Wrote: /tmp/predictions.csv + +results in a file like this: + + $ head -n 3 /tmp/predictions.csv allele,peptide,mhcflurry_prediction,mhcflurry_prediction_low,mhcflurry_prediction_high,mhcflurry_prediction_percentile HLA-A0201,SIINFEKL,4899.047843425702,2767.7636539507857,7269.683642935029,6.509787499999997 HLA-A0201,SIINFEKD,21050.420242970613,16834.65859138968,24129.046091695887,34.297175 - HLA-A0201,SIINFEKQ,21048.47265780004,16736.561254929948,24111.013114442652,34.297175 - HLA-A0301,SIINFEKL,28227.298909150148,24826.30790978725,32714.28597399942,33.95121249999998 - HLA-A0301,SIINFEKD,30816.721218383507,27685.50847082019,36037.32590461623,41.22577499999998 - HLA-A0301,SIINFEKQ,24183.021046496786,19346.154182011513,32263.71247531383,24.81096249999999 -The predictions returned are affinities (KD) in nM. The -"prediction_low" and "prediction_high" fields give the 5-95 percentile -predictions across the models in the ensemble. The predictions above -were generated with MHCflurry 1.0.0. +The predictions are given as affinities (KD) in nM in the +"mhcflurry_prediction" column. The other fields give the 5-95 +percentile predictions across the models in the ensemble and the +quantile of the affinity prediction among a large number of random +peptides tested on that allele. -Your exact predictions may vary slightly from these (up to about 1 nM) -depending on the Keras backend in use and other numerical details. -Different versions of MHCflurry can of course give results -considerably different from these. +The predictions shown above were generated with MHCflurry 1.0.0. +Different versions of MHCflurry can give considerably different +results. Even on the same version, your exact predictions may vary (up +to about 1 nM) depending on the Keras backend and other details. -You can also specify the input and output as CSV files. 
Run -"mhcflurry-predict -h" for details. +In most cases you’ll want to specify the input as a CSV file instead +of passing peptides and alleles as commandline arguments. See +mhcflurry-predict docs. Fitting your own models *********************** +The mhcflurry-class1-train-allele-specific-models command is used to +fit models to training data. The models we release with MHCflurry are +trained with a command like: + + $ mhcflurry-class1-train-allele-specific-models \ + --data TRAINING_DATA.csv \ + --hyperparameters hyperparameters.yaml \ + --percent-rank-calibration-num-peptides-per-length 1000000 \ + --min-measurements-per-allele 75 \ + --out-models-dir models + +MHCflurry predictors are serialized to disk as many files in a +directory. The command above will write the models to the output +directory specified by the "--out-models-dir" argument. This directory +has files like: + + manifest.csv + percent_ranks.csv + weights_BOLA-6*13:01-0-1e6e7c0610ac68f8.npz + ... + weights_PATR-B*24:01-0-e12e0ee723833110.npz + weights_PATR-B*24:01-0-ec4a36529321d868.npz + weights_PATR-B*24:01-0-fd5a340098d3a9f4.npz + +The "manifest.csv" file gives metadata for all the models used in the +predictor. There will be a "weights_..." file for each model giving +its weights (the parameters for the neural network). The +"percent_ranks.csv" stores a histogram of model predictions for each +allele over a large number of random peptides. It is used for +generating the percent ranks at prediction time. + +To call mhcflurry-class1-train-allele-specific-models you’ll need some +training data. 
The data we use for our released predictors can be +downloaded with mhcflurry-downloads: + + $ mhcflurry-downloads fetch data_curated + +It looks like this: + + $ bzcat "$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" | head -n 3 + allele,peptide,measurement_value,measurement_type,measurement_source,original_allele + BoLA-1*21:01,AENDTLVVSV,7817.0,quantitative,Barlow - purified MHC/competitive/fluorescence,BoLA-1*02101 + BoLA-1*21:01,NQFNGGCLLV,1086.0,quantitative,Barlow - purified MHC/direct/fluorescence,BoLA-1*02101 + Scanning protein sequences for predicted epitopes ************************************************* @@ -150,24 +228,24 @@ information. --mhc-alleles A02:01,A03:01 --mhc-peptide-lengths 8,9,10,11 --extract-subsequences - --output-csv /tmp/result.csv - 2017-12-21 14:13:47,847 - mhctools.cli.args - INFO - Building MHC binding prediction type for alleles ['HLA-A*02:01', 'HLA-A*03:01'] and epitope lengths [8, 9, 10, 11] - 2017-12-21 14:13:52,753 - mhctools.cli.script - INFO - + --output-csv /tmp/subsequence_predictions.csv + 2017-12-21 14:26:39,143 - mhctools.cli.args - INFO - Building MHC binding prediction type for alleles ['HLA-A*02:01', 'HLA-A*03:01'] and epitope lengths [8, 9, 10, 11] + 2017-12-21 14:26:45,471 - mhctools.cli.script - INFO - ... 
[1192 rows x 8 columns] - Wrote: /tmp/result.csv + Wrote: /tmp/subsequence_predictions.csv This will write a file giving predictions for all subsequences of the specified lengths: - $ head -n 3 /tmp/result.csv + $ head -n 3 /tmp/subsequence_predictions.csv source_sequence_name,offset,peptide,allele,affinity,percentile_rank,prediction_method_name,length protein2,42,AARYSAFY,HLA-A*02:01,33829.639361000336,73.7865875,mhcflurry,8 protein2,42,AARYSAFYN,HLA-A*02:01,29747.41688667342,60.34871249999998,mhcflurry,9 -Library usage -============= +Python library tutorial +======================= The MHCflurry Python API exposes additional options and features beyond those supported by the commandline tools. This tutorial gives a @@ -178,14 +256,8 @@ The "Class1AffinityPredictor" class is the primary user-facing interface. - >>> import mhcflurry - >>> print("MHCflurry version: %s" % (mhcflurry.__version__)) - MHCflurry version: 1.0.0 - >>> - >>> # Load downloaded predictor - >>> predictor = mhcflurry.Class1AffinityPredictor.load() - >>> print(predictor.supported_alleles) - ['BoLA-6*13:01', 'Eqca-1*01:01', 'H-2-Db', 'H-2-Dd', 'H-2-Kb', 'H-2-Kd', 'H-2-Kk', 'H-2-Ld', 'HLA-A*01:01', 'HLA-A*02:01', 'HLA-A*02:02', 'HLA-A*02:03', 'HLA-A*02:05', 'HLA-A*02:06', 'HLA-A*02:07', 'HLA-A*02:11', 'HLA-A*02:12', 'HLA-A*02:16', 'HLA-A*02:17', 'HLA-A*02:19', 'HLA-A*02:50', 'HLA-A*03:01', 'HLA-A*11:01', 'HLA-A*23:01', 'HLA-A*24:01', 'HLA-A*24:02', 'HLA-A*24:03', 'HLA-A*25:01', 'HLA-A*26:01', 'HLA-A*26:02', 'HLA-A*26:03', 'HLA-A*29:02', 'HLA-A*30:01', 'HLA-A*30:02', 'HLA-A*31:01', 'HLA-A*32:01', 'HLA-A*32:07', 'HLA-A*33:01', 'HLA-A*66:01', 'HLA-A*68:01', 'HLA-A*68:02', 'HLA-A*68:23', 'HLA-A*69:01', 'HLA-A*80:01', 'HLA-B*07:01', 'HLA-B*07:02', 'HLA-B*08:01', 'HLA-B*08:02', 'HLA-B*08:03', 'HLA-B*14:02', 'HLA-B*15:01', 'HLA-B*15:02', 'HLA-B*15:03', 'HLA-B*15:09', 'HLA-B*15:17', 'HLA-B*15:42', 'HLA-B*18:01', 'HLA-B*27:01', 'HLA-B*27:03', 'HLA-B*27:04', 'HLA-B*27:05', 'HLA-B*27:06', 'HLA-B*27:20', 
'HLA-B*35:01', 'HLA-B*35:03', 'HLA-B*35:08', 'HLA-B*37:01', 'HLA-B*38:01', 'HLA-B*39:01', 'HLA-B*40:01', 'HLA-B*40:02', 'HLA-B*42:01', 'HLA-B*44:01', 'HLA-B*44:02', 'HLA-B*44:03', 'HLA-B*45:01', 'HLA-B*45:06', 'HLA-B*46:01', 'HLA-B*48:01', 'HLA-B*51:01', 'HLA-B*53:01', 'HLA-B*54:01', 'HLA-B*57:01', 'HLA-B*58:01', 'HLA-B*73:01', 'HLA-B*83:01', 'HLA-C*03:03', 'HLA-C*03:04', 'HLA-C*04:01', 'HLA-C*05:01', 'HLA-C*06:02', 'HLA-C*07:01', 'HLA-C*07:02', 'HLA-C*08:02', 'HLA-C*12:03', 'HLA-C*14:02', 'HLA-C*15:02', 'Mamu-A*01:01', 'Mamu-A*02:01', 'Mamu-A*02:0102', 'Mamu-A*07:01', 'Mamu-A*07:0103', 'Mamu-A*11:01', 'Mamu-A*22:01', 'Mamu-A*26:01', 'Mamu-B*01:01', 'Mamu-B*03:01', 'Mamu-B*08:01', 'Mamu-B*10:01', 'Mamu-B*17:01', 'Mamu-B*17:04', 'Mamu-B*39:01', 'Mamu-B*52:01', 'Mamu-B*66:01', 'Mamu-B*83:01', 'Mamu-B*87:01', 'Patr-A*01:01', 'Patr-A*03:01', 'Patr-A*04:01', 'Patr-A*07:01', 'Patr-A*09:01', 'Patr-B*01:01', 'Patr-B*13:01', 'Patr-B*24:01'] + /Users/tim/miniconda3/envs/py3k/lib/python3.5/site-packages/matplotlib/__init__.py:913: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter. + warnings.warn(self.msg_depr % (key, alt_key)) # coding: utf-8 diff --git a/docs/python_tutorial.rst b/docs/python_tutorial.rst index 7222e201..7d130148 100644 --- a/docs/python_tutorial.rst +++ b/docs/python_tutorial.rst @@ -1,5 +1,5 @@ -Library usage -============= +Python library tutorial +======================= The MHCflurry Python API exposes additional options and features beyond those supported by the commandline tools. 
This tutorial gives a basic overview diff --git a/docs/requirements.txt b/docs/requirements.txt index a37e6504..c4a411cc 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,5 +1,6 @@ sphinx-autorun sphinxcontrib-programoutput +sphinxcontrib-autoprogram sphinx numpydoc pypandoc diff --git a/mhcflurry/downloads_command.py b/mhcflurry/downloads_command.py index 6ddf11ca..08648a99 100644 --- a/mhcflurry/downloads_command.py +++ b/mhcflurry/downloads_command.py @@ -1,16 +1,3 @@ -# Copyright (c) 2016. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ''' Download MHCflurry released datasets and trained models. diff --git a/mhcflurry/predict_command.py b/mhcflurry/predict_command.py index 0b35d348..d7fdc0a2 100644 --- a/mhcflurry/predict_command.py +++ b/mhcflurry/predict_command.py @@ -1,16 +1,3 @@ -# Copyright (c) 2016. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
''' Run MHCflurry predictor on specified peptide/allele pairs. @@ -19,7 +6,7 @@ Examples: Write a CSV file containing the contents of INPUT.csv plus an additional column giving MHCflurry binding affinity predictions: - mhcflurry-predict INPUT.csv --out RESULT.csv + $ mhcflurry-predict INPUT.csv --out RESULT.csv The input CSV file is expected to contain columns 'allele' and 'peptide'. The predictions are written to a column called 'mhcflurry_prediction'. @@ -32,7 +19,7 @@ You can also run on alleles and peptides specified on the commandline, in which case predictions are written for all combinations of alleles and peptides: - mhcflurry-predict --alleles HLA-A0201 H-2Kb --peptides SIINFEKL DENDREKLLL + $ mhcflurry-predict --alleles HLA-A0201 H-2Kb --peptides SIINFEKL DENDREKLLL ''' from __future__ import ( print_function, diff --git a/mhcflurry/train_allele_specific_models_command.py b/mhcflurry/train_allele_specific_models_command.py index 8345ec9f..6602761b 100644 --- a/mhcflurry/train_allele_specific_models_command.py +++ b/mhcflurry/train_allele_specific_models_command.py @@ -1,6 +1,5 @@ """ Train Class1 single allele models. - """ import argparse import os -- GitLab