diff --git a/.gitignore b/.gitignore index e44e39073dba1cfc73a4f18cab410469ce0737b3..f09f2154082bb5325c7d291fa082ed530789a0e0 100644 --- a/.gitignore +++ b/.gitignore @@ -58,6 +58,8 @@ coverage.xml # Sphinx documentation docs/_build/ +docs/_static +docs/_templates # PyBuilder target/ diff --git a/README.md b/README.md index bf02a88b45a5e4d4a3de740685a082dfc4c8105c..e6552afb6a2a686c6137e3e11b513925e901edd7 100644 --- a/README.md +++ b/README.md @@ -1,138 +1,42 @@ [](https://travis-ci.org/hammerlab/mhcflurry) [](https://coveralls.io/github/hammerlab/mhcflurry?branch=master) # mhcflurry -Open source neural network models for peptide-MHC binding affinity prediction - -The [adaptive immune system](https://en.wikipedia.org/wiki/Adaptive_immune_system) -depends on the presentation of protein fragments by [MHC](https://en.wikipedia.org/wiki/Major_histocompatibility_complex) -molecules. Machine learning models of this interaction are used in studies of -infectious diseases, autoimmune diseases, vaccine development, and cancer -immunotherapy. +[MHC I](https://en.wikipedia.org/wiki/MHC_class_I) ligand +prediction package with competitive accuracy and a fast and +[documented](http://www.hammerlab.org/mhcflurry/) implementation. MHCflurry supports Class I peptide/MHC binding affinity prediction using -ensembles of allele-specific models. You can fit MHCflurry models to your own data or download models that we fit to data from -[IEDB](http://www.iedb.org/home_v3.php) and [Kim 2014](http://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-15-241). -Our combined dataset is available for download [here](https://github.com/hammerlab/mhcflurry/releases/download/pre-1.0.0-alpha/data_curated.tar.bz2). - -Pan-allelic prediction is supported in principle but is not yet performing -accurately. Infrastructure for modeling other aspects of antigen -processing is also implemented but experimental. +ensembles of allele-specific models. It runs on Python 2.7 and 3.4+ using +the [keras](https://keras.io) neural network library. It exposes [command-line](http://www.hammerlab.org/mhcflurry/commandline_tutorial.html) +and [Python library](http://www.hammerlab.org/mhcflurry/python_tutorial.html) interfaces. If you find MHCflurry useful in your research please cite: > O'Donnell, T. et al., 2017. MHCflurry: open-source class I MHC binding affinity prediction. bioRxiv. Available at: http://www.biorxiv.org/content/early/2017/08/09/174243. -## Setup (pip) +## Installation (pip) Install the package: ``` -pip install mhcflurry +$ pip install mhcflurry ``` Then download our datasets and trained models: ``` -mhcflurry-downloads fetch -``` - -From a checkout you can run the unit tests with: - -``` -nosetests . -``` - -The MHCflurry predictors are implemented in Python using [keras](https://keras.io). - -MHCflurry works with both the tensorflow and theano keras backends. The -tensorflow backend gives faster model-loading time but is undergoing more -rapid development and sometimes hits issues. If you encounter tensorflow errors -running MHCflurry, try setting this environment variable to switch to the theano -backend: - -``` -export KERAS_BACKEND=theano -``` - -You may also needs to `pip install theano`. 
-
-## Setup (conda)
-
-You can alternatively get up and running with a [conda](https://conda.io/docs/)
-environment as follows:
-
-```
-conda create -q -n mhcflurry-env python=3.6 'tensorflow>=1.1.0'
-source activate mhcflurry-env
+$ mhcflurry-downloads fetch
 ```
 
-Then continue as above:
+You can now generate predictions:
 
 ```
-pip install mhcflurry
-mhcflurry-downloads fetch
-```
-
-If you wish to test your installation, you can install `nose` and run the tests
-from a checkout:
-
+$ mhcflurry-predict \
+    --alleles HLA-A0201 HLA-A0301 \
+    --peptides SIINFEKL SIINFEKD SIINFEKQ \
+    --out /tmp/predictions.csv
+
+Wrote: /tmp/predictions.csv
 ```
-pip install nose
-nosetests .
-```
-
-
-
-## Making predictions from the command-line
-
-```shell
-$ mhcflurry-predict --alleles HLA-A0201 HLA-A0301 --peptides SIINFEKL SIINFEKD SIINFEKQ
-allele,peptide,mhcflurry_prediction,mhcflurry_prediction_low,mhcflurry_prediction_high
-HLA-A0201,SIINFEKL,5326.541919062165,3757.86675352994,7461.37693353508
-HLA-A0201,SIINFEKD,18763.70298522213,13140.82000240037,23269.82139560844
-HLA-A0201,SIINFEKQ,18620.10057358322,13096.425874678192,23223.148184869413
-HLA-A0301,SIINFEKL,24481.726678691946,21035.52779725433,27245.371837497867
-HLA-A0301,SIINFEKD,24687.529360239587,21582.590014592537,27749.39869616437
-HLA-A0301,SIINFEKQ,25923.062203902562,23522.5793450799,28079.456657427705
-```
-
-The predictions returned are affinities (KD) in nM. The `prediction_low` and
-`prediction_high` fields give the 5-95 percentile predictions across the models
-in the ensemble. The predictions above were generated with MHCflurry 0.9.2.
-Your exact predictions may vary slightly from these (up to about 1 nM)
-depending on the Keras backend in use and other numerical details.
-Different versions of MHCflurry can of course give results considerably
-different from these.
-
-You can also specify the input and output as CSV files.
-Run `mhcflurry-predict -h` for details.
-
-
-## Making predictions from Python
-
-```python
->>> from mhcflurry import Class1AffinityPredictor
->>> predictor = Class1AffinityPredictor.load()
->>> predictor.predict_to_dataframe(peptides=['SIINFEKL'], allele='A0201')
-
-
-    allele   peptide   prediction  prediction_low  prediction_high
-    A0201  SIINFEKL  6029.084473     4474.103253      7771.297702
-```
-
-See the [class1_allele_specific_models.ipynb](https://github.com/hammerlab/mhcflurry/blob/master/examples/class1_allele_specific_models.ipynb)
-notebook for an overview of the Python API, including fitting your own predictors.
-
-
-## Details on the downloadable models
-
-An ensemble of eight single-allele models was trained for each allele with at least
-100 measurements in the training set (118 alleles). The models were trained on a
-random 80% sample of the data for the allele and the remaining 20% was used for
-early stopping. All models use the same [architecture](downloads-generation/models_class1/hyperparameters.yaml). The
-predictions are taken to be the geometric mean of the nM binding affinity
-predictions of the individual models. The training script is [here](downloads-generation/models_class1/GENERATE.sh).
-
-## Environment variables
-The path where MHCflurry looks for model weights and data can be set with the `MHCFLURRY_DOWNLOADS_DIR` environment variable. This directory should contain subdirectories like "models_class1".
+See the [documentation](http://www.hammerlab.org/mhcflurry/) for more details.
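+
+As a quick check from Python (a minimal sketch; the `Class1AffinityPredictor`
+calls below are the ones covered in the Python tutorial, and assume you have
+already run `mhcflurry-downloads fetch`):
+
+```python
+from mhcflurry import Class1AffinityPredictor
+
+# With no arguments, load() uses the release models fetched by mhcflurry-downloads.
+predictor = Class1AffinityPredictor.load()
+
+# Returns predicted affinities (KD) in nM; lower means stronger binding.
+print(predictor.predict(allele="HLA-A0201", peptides=["SIINFEKL"]))
+```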
diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..e6118afe4c56ccaa0c30f8d14f84218605857e52 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,254 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# User-friendly check for sphinx-build +ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) + $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don\'t have Sphinx installed, grab it from http://sphinx-doc.org/) +endif + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . + +.PHONY: help +help: + @echo "Please use \`make <target>' where <target> is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " applehelp to make an Apple Help Book" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " epub3 to make an epub3" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " xml to make Docutils-native XML files" + @echo " pseudoxml to make pseudoxml-XML files for display purposes" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + @echo " coverage to run coverage check of the documentation (if enabled)" + @echo " dummy to check syntax errors of document sources" + +# Added by Tim: +.PHONY: generate +generate: + sphinx-apidoc -M -f -o _build/ ../mhcflurry + python generate.py \ + --out-models-cv-rst _build/_models_cv.rst \ + --out-models-architecture-png _build/_models_architecture.png \ + --out-models-info-rst _build/_models_info.rst \ + --out-models-supported-alleles-rst _build/_models_supported_alleles.rst + + +# Added by Tim: +.PHONY: readme +readme: text + rm -f package_readme/readme.generated.txt + cat package_readme/readme_header.rst \ + _build/text/package_readme/readme.template.txt \ + > package_readme/readme.generated.txt + chmod 444 package_readme/readme.generated.txt # read only + +.PHONY: clean +clean: + # Added by tim: preserve html/.git + rm -rf $(BUILDDIR)/html/* + mv $(BUILDDIR)/html /tmp/html-bk + rm -rf $(BUILDDIR)/* + mv /tmp/html-bk $(BUILDDIR)/html + +.PHONY: html +html: 
+ $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +.PHONY: dirhtml +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +.PHONY: singlehtml +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +.PHONY: pickle +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +.PHONY: json +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +.PHONY: htmlhelp +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +.PHONY: qthelp +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/MHCflurry.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/MHCflurry.qhc" + +.PHONY: applehelp +applehelp: + $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp + @echo + @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." + @echo "N.B. You won't be able to view it unless you put it in" \ + "~/Library/Documentation/Help or install it in your application" \ + "bundle." + +.PHONY: devhelp +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." + @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/MHCflurry" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/MHCflurry" + @echo "# devhelp" + +.PHONY: epub +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +.PHONY: epub3 +epub3: + $(SPHINXBUILD) -b epub3 $(ALLSPHINXOPTS) $(BUILDDIR)/epub3 + @echo + @echo "Build finished. The epub3 file is in $(BUILDDIR)/epub3." + +.PHONY: latex +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +.PHONY: latexpdf +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +.PHONY: latexpdfja +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +.PHONY: text +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +.PHONY: man +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 
+
+.PHONY: texinfo
+texinfo:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo
+	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+	@echo "Run \`make' in that directory to run these through makeinfo" \
+	      "(use \`make info' here to do that automatically)."
+
+.PHONY: info
+info:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo "Running Texinfo files through makeinfo..."
+	make -C $(BUILDDIR)/texinfo info
+	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+.PHONY: gettext
+gettext:
+	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+	@echo
+	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+.PHONY: changes
+changes:
+	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+	@echo
+	@echo "The overview file is in $(BUILDDIR)/changes."
+
+.PHONY: linkcheck
+linkcheck:
+	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+	@echo
+	@echo "Link check complete; look for any errors in the above output " \
+	      "or in $(BUILDDIR)/linkcheck/output.txt."
+
+.PHONY: doctest
+doctest:
+	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+	@echo "Testing of doctests in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/doctest/output.txt."
+
+.PHONY: coverage
+coverage:
+	$(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
+	@echo "Testing of coverage in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/coverage/python.txt."
+
+.PHONY: xml
+xml:
+	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
+	@echo
+	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
+
+.PHONY: pseudoxml
+pseudoxml:
+	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
+	@echo
+	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
+
+.PHONY: dummy
+dummy:
+	$(SPHINXBUILD) -b dummy $(ALLSPHINXOPTS) $(BUILDDIR)/dummy
+	@echo
+	@echo "Build finished. Dummy builder generates no files."
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b6330f5975fb52d9f09eccc4bfcb415c9dce67bc
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,14 @@
+# MHCflurry documentation
+
+Due to our use of `sphinxcontrib-autorun2` we unfortunately require Python 2.7
+to build the docs. Python 3 is not supported.
+
+To generate Sphinx documentation, from this directory run:
+
+```
+$ pip install -r requirements.txt # for the first time you generate docs
+$ make generate html
+```
+
+Documentation is written to the _build/ directory. These files should not be
+checked into the repo.
diff --git a/docs/api.rst b/docs/api.rst
new file mode 100644
index 0000000000000000000000000000000000000000..05561300d79c22f7cd92b50ec26bf1027805ac25
--- /dev/null
+++ b/docs/api.rst
@@ -0,0 +1,7 @@
+.. _api-documentation:
+
+API Documentation
+=================
+
+.. include:: _build/mhcflurry.rst
+    :start-line: 2
\ No newline at end of file
diff --git a/docs/commandline_tools.rst b/docs/commandline_tools.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2b3c64567c81e432de053c04670c680c3c76b4fa
--- /dev/null
+++ b/docs/commandline_tools.rst
@@ -0,0 +1,21 @@
+Command-line reference
+============================
+
+See also the :ref:`tutorial <commandline_tutorial>`.
+
+.. _mhcflurry-predict:
+
+.. autoprogram:: mhcflurry.predict_command:parser
+   :prog: mhcflurry-predict
+
+.. _mhcflurry-class1-train-allele-specific-models:
+
+..
autoprogram:: mhcflurry.train_allele_specific_models_command:parser + :prog: mhcflurry-class1-train-allele-specific-models + + +.. _mhcflurry-downloads: + +.. autoprogram:: mhcflurry.downloads_command:parser + :prog: mhcflurry-downloads + diff --git a/docs/commandline_tutorial.rst b/docs/commandline_tutorial.rst new file mode 100644 index 0000000000000000000000000000000000000000..6dc803c81b40657c2be83f8d5106a46654fb1c73 --- /dev/null +++ b/docs/commandline_tutorial.rst @@ -0,0 +1,154 @@ +.. _commandline_tutorial: + +Command-line tutorial +===================== + +.. _downloading: + +Downloading models +------------------ + +Most users will use pre-trained MHCflurry models that we release. These models +are distributed separately from the pip package and may be downloaded with the +:ref:`mhcflurry-downloads` tool: + +.. code-block:: shell + + $ mhcflurry-downloads fetch models_class1 + +Files downloaded with :ref:`mhcflurry-downloads` are stored in a platform-specific +directory. To get the path to downloaded data, you can use: + +.. command-output:: mhcflurry-downloads path models_class1 + :nostderr: + +We also release a few other "downloads," such as curated training data and some +experimental models. To see what's available and what you have downloaded, run: + +.. command-output:: mhcflurry-downloads info + :nostderr: + +.. note:: + + The code we use for *generating* the downloads is in the + ``downloads_generation`` directory in the repository. + + +Generating predictions +---------------------- + +The :ref:`mhcflurry-predict` command generates predictions from the command-line. +By default it will use the pre-trained models you downloaded above; other +models can be used by specifying the ``--models`` argument. + +Running: + +.. command-output:: + mhcflurry-predict + --alleles HLA-A0201 HLA-A0301 + --peptides SIINFEKL SIINFEKD SIINFEKQ + --out /tmp/predictions.csv + :nostderr: + +results in a file like this: + +.. command-output:: + cat /tmp/predictions.csv + +The predictions are given as affinities (KD) in nM in the ``mhcflurry_prediction`` +column. The other fields give the 5-95 percentile predictions across +the models in the ensemble and the quantile of the affinity prediction among +a large number of random peptides tested on that allele. + +The predictions shown above were generated with MHCflurry |version|. Different versions of +MHCflurry can give considerably different results. Even +on the same version, exact predictions may vary (up to about 1 nM) depending +on the Keras backend and other details. + +In most cases you'll want to specify the input as a CSV file instead of passing +peptides and alleles as commandline arguments. See :ref:`mhcflurry-predict` docs. + +Fitting your own models +----------------------- + +The :ref:`mhcflurry-class1-train-allele-specific-models` command is used to +fit models to training data. The models we release with MHCflurry are trained +with a command like: + +.. code-block:: shell + + $ mhcflurry-class1-train-allele-specific-models \ + --data TRAINING_DATA.csv \ + --hyperparameters hyperparameters.yaml \ + --percent-rank-calibration-num-peptides-per-length 1000000 \ + --min-measurements-per-allele 75 \ + --out-models-dir models + +MHCflurry predictors are serialized to disk as many files in a directory. The +command above will write the models to the output directory specified by the +``--out-models-dir`` argument. This directory has files like: + +.. 
program-output:: + ls "$(mhcflurry-downloads path models_class1)/models" + :shell: + :nostderr: + :ellipsis: 3,-3 + +The ``manifest.csv`` file gives metadata for all the models used in the predictor. +There will be a ``weights_...`` file for each model giving its weights +(the parameters for the neural network). The ``percent_ranks.csv`` stores a +histogram of model predictions for each allele over a large number of random +peptides. It is used for generating the percent ranks at prediction time. + +To call :ref:`mhcflurry-class1-train-allele-specific-models` you'll need some +training data. The data we use for our released predictors can be downloaded with +:ref:`mhcflurry-downloads`: + +.. code-block:: shell + + $ mhcflurry-downloads fetch data_curated + +It looks like this: + +.. command-output:: + bzcat "$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" | head -n 3 + :shell: + :nostderr: + + +Scanning protein sequences for predicted epitopes +------------------------------------------------- + +The `mhctools <https://github.com/hammerlab/mhctools>`__ package +provides support for scanning protein sequences to find predicted +epitopes. It supports MHCflurry as well as other binding predictors. +Here is an example. + +First, install ``mhctools`` if it is not already installed: + +.. code-block:: shell + + $ pip install mhctools + +We'll generate predictions across ``example.fasta``, a FASTA file with two short +sequences: + +.. literalinclude:: /example.fasta + +Here's the ``mhctools`` invocation. See ``mhctools -h`` for more information. + +.. command-output:: + mhctools + --mhc-predictor mhcflurry + --input-fasta-file example.fasta + --mhc-alleles A02:01,A03:01 + --mhc-peptide-lengths 8,9,10,11 + --extract-subsequences + --output-csv /tmp/subsequence_predictions.csv + :ellipsis: 2,-2 + :nostderr: + +This will write a file giving predictions for all subsequences of the specified lengths: + +.. command-output:: + head -n 3 /tmp/subsequence_predictions.csv diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000000000000000000000000000000000000..be2e10f48ca71e6332c6ae97636321d6fa7eca7a --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,326 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# MHCflurry documentation build configuration file, created by +# sphinx-quickstart on Sun Dec 10 20:25:16 2017. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys +import os +import re +import textwrap +import logging + +# Hack added by tim for bug in autoprogram extension under Python 2. +from sphinx.util.pycompat import indent # pylint: disable=import-error +textwrap.indent = indent + +# Disable logging (added by tim) +logging.disable(logging.ERROR) + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +sys.path.insert(0, os.path.abspath('.')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. 
They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    'sphinx.ext.autodoc',
+    'sphinx.ext.doctest',
+    'sphinx.ext.coverage',
+    'sphinx.ext.ifconfig',
+    'sphinx.ext.viewcode',
+    'sphinx.ext.githubpages',
+    'numpydoc',
+    'sphinxcontrib.autorun2',
+    'sphinxcontrib.programoutput',
+    'sphinxcontrib.autoprogram',
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+# source_suffix = ['.rst', '.md']
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = 'MHCflurry'
+copyright = '2017, Timothy O\'Donnell'
+author = 'Timothy O\'Donnell'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+
+# The short X.Y version.
+# Added by Tim: reading version from mhcflurry __init__.py as in setup.py
+with open('../mhcflurry/__init__.py', 'r') as f:
+    version = re.search(
+        r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]',
+        f.read(),
+        re.MULTILINE).group(1)
+
+# The full version, including alpha/beta/rc tags.
+release = version
+
+# Added by tim
+autodoc_member_order = 'bysource'
+autoclass_content = 'both'
+
+# Added by tim
+suppress_warnings = ['image.nonlocal_uri']
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This patterns also effect to html_static_path and html_extra_path
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+default_role = 'py:obj'
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+#keep_warnings = False
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = False
+
+# Added by Tim
+# http://stackoverflow.com/questions/12206334/sphinx-autosummary-toctree-contains-reference-to-nonexisting-document-warnings
+numpydoc_show_class_members = False
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. 
See the documentation for +# a list of builtin themes. +html_theme = 'sphinx_rtd_theme' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. +# "<project> v<release> documentation" by default. +#html_title = 'MHCflurry v1.0.0' + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (relative to this directory) to use as a favicon of +# the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +# html_static_path = ['_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. +#html_extra_path = [] + +# If not None, a 'Last updated on:' timestamp is inserted at every page +# bottom, using the given strftime format. +# The empty string is equivalent to '%b %d, %Y'. +html_last_updated_fmt = "" + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +html_domain_indices = False + +# If false, no index is generated. +html_use_index = False + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a <link> tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Language to be used for generating the HTML full-text search index. +# Sphinx supports the following languages: +# 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' +# 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr', 'zh' +#html_search_language = 'en' + +# A dictionary with options for the search language support, empty by default. +# 'ja' uses this config value. +# 'zh' user can custom change `jieba` dictionary path. +#html_search_options = {'type': 'default'} + +# The name of a javascript file (relative to the configuration directory) that +# implements a search results scorer. If empty, the default will be used. 
+#html_search_scorer = 'scorer.js' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'MHCflurrydoc' + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', + +# Latex figure (float) alignment +#'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'MHCflurry.tex', 'MHCflurry Documentation', + 'Timothy O\'Donnell', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'mhcflurry', 'MHCflurry Documentation', + [author], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'MHCflurry', 'MHCflurry Documentation', + author, 'MHCflurry', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +#texinfo_no_detailmenu = False diff --git a/docs/example.fasta b/docs/example.fasta new file mode 100644 index 0000000000000000000000000000000000000000..ea095115509e108927979079136d5ff5b358d864 --- /dev/null +++ b/docs/example.fasta @@ -0,0 +1,6 @@ +>protein1 +MDSKGSSQKGSRLLLLLVVSNLLLCQGVVSTPVCPNGPGNCQV +EMFNEFDKRYAQGKGFITMALNSCHTSSLPTPEDKEQAQQTHH +>protein2 +VTEVRGMKGAPDAILSRAIEIEEENKRLLEGMEMIFGQVIPGA +ARYSAFYNLLHCLRRDSSKIDTYLKLLNCRIIYNNNC diff --git a/docs/generate.py b/docs/generate.py new file mode 100644 index 0000000000000000000000000000000000000000..848d04ce5a15c0d8391a4c1bd81e90b4a5229939 --- /dev/null +++ b/docs/generate.py @@ -0,0 +1,131 @@ +""" +Generate certain RST files used in documentation. 
+""" + +import sys +import argparse +from textwrap import wrap + +import pypandoc +import pandas +from keras.utils.vis_utils import plot_model + +from mhcflurry import __version__ +from mhcflurry.downloads import get_path +from mhcflurry.class1_affinity_predictor import Class1AffinityPredictor + +parser = argparse.ArgumentParser(usage=__doc__) +parser.add_argument( + "--cv-summary-csv", + metavar="FILE.csv", + default=get_path( + "cross_validation_class1", "summary.all.csv", test_exists=False), + help="Cross validation scores summary. Default: %(default)s", +) +parser.add_argument( + "--class1-models-dir", + metavar="DIR", + default=get_path( + "models_class1", "models", test_exists=False), + help="Class1 models. Default: %(default)s", +) +parser.add_argument( + "--out-models-cv-rst", + metavar="FILE.rst", + help="rst output file", +) +parser.add_argument( + "--out-models-info-rst", + metavar="FILE.rst", + help="rst output file", +) +parser.add_argument( + "--out-models-architecture-png", + metavar="FILE.png", + help="png output file", +) +parser.add_argument( + "--out-models-supported-alleles-rst", + metavar="FILE.png", + help="png output file", +) + + +def go(argv): + args = parser.parse_args(argv) + + predictor = None + + if args.out_models_supported_alleles_rst: + # Supported alleles rst + if predictor is None: + predictor = Class1AffinityPredictor.load(args.class1_models_dir) + with open(args.out_models_supported_alleles_rst, "w") as fd: + fd.write( + "Models released with the current version of MHCflurry (%s) " + "support peptides of " + "length %d-%d and the following %d alleles:\n\n::\n\n\t%s\n\n" % ( + __version__, + predictor.supported_peptide_lengths[0], + predictor.supported_peptide_lengths[1], + len(predictor.supported_alleles), + "\n\t".join( + wrap(", ".join(predictor.supported_alleles))))) + print("Wrote: %s" % args.out_models_supported_alleles_rst) + + if args.out_models_architecture_png: + # Architecture diagram + if predictor is None: + predictor = Class1AffinityPredictor.load(args.class1_models_dir) + network = predictor.neural_networks[0].network() + plot_model( + network, + to_file=args.out_models_architecture_png, + show_layer_names=True, + show_shapes=True) + print("Wrote: %s" % args.out_models_architecture_png) + + if args.out_models_info_rst: + # Architecture information rst + if predictor is None: + predictor = Class1AffinityPredictor.load(args.class1_models_dir) + network = predictor.neural_networks[0].network() + lines = [] + network.summary(print_fn=lines.append) + + with open(args.out_models_info_rst, "w") as fd: + fd.write("Layers and parameters summary: ") + fd.write("\n\n::\n\n") + for line in lines: + fd.write(" ") + fd.write(line) + fd.write("\n") + print("Wrote: %s" % args.out_models_info_rst) + + if args.out_models_cv_rst: + # Models cv output + df = pandas.read_csv(args.cv_summary_csv) + sub_df = df.loc[ + df.kind == "ensemble" + ].sort_values("allele").copy().reset_index(drop=True) + sub_df["Allele"] = sub_df.allele + sub_df["CV Training Size"] = sub_df.train_size.astype(int) + sub_df["AUC"] = sub_df.auc + sub_df["F1"] = sub_df.f1 + sub_df["Kendall Tau"] = sub_df.tau + sub_df = sub_df[sub_df.columns[-5:]] + html = sub_df.to_html( + index=False, + float_format=lambda v: "%0.3f" % v, + justify="left") + rst = pypandoc.convert_text(html, format="html", to="rst") + + with open(args.out_models_cv_rst, "w") as fd: + fd.write( + "Showing estimated performance for %d alleles." 
% len(sub_df)) + fd.write("\n\n") + fd.write(rst) + print("Wrote: %s" % args.out_models_cv_rst) + +if __name__ == "__main__": + go(sys.argv[1:]) \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..11190d9fe16b57c86218cd55dd9f2a81d44110e2 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,14 @@ +MHCflurry documentation +===================================== + +.. toctree:: + :maxdepth: 3 + + intro + commandline_tutorial + python_tutorial + models_supported_alleles + models + commandline_tools + api + diff --git a/docs/intro.rst b/docs/intro.rst new file mode 100644 index 0000000000000000000000000000000000000000..2644c317bf22b1583db285b0af2370ac86747ec5 --- /dev/null +++ b/docs/intro.rst @@ -0,0 +1,67 @@ +Introduction and setup +======================= + +MHCflurry is an open source package for peptide/MHC I binding affinity prediction. It +provides competitive accuracy with a fast and documented implementation. + +You can download pre-trained MHCflurry models fit to affinity measurements +deposited in IEDB or train a MHCflurry predictor on your own data. + +Currently only allele-specific prediction is implemented, in which separate models +are trained for each allele. The released models therefore support a fixed set of common +class I alleles for which sufficient published training data is available +(see :ref:`models_supported_alleles`\ ). + +MHCflurry supports Python versions 2.7 and 3.4+. It uses the `keras <https://keras.io>`__ +neural network library via either the Tensorflow or Theano backends. GPUs may +optionally be used for a generally modest speed improvement. + +If you find MHCflurry useful in your research please cite: + + O'Donnell, T. et al., 2017. MHCflurry: open-source class I MHC + binding affinity prediction. bioRxiv. Available at: + http://www.biorxiv.org/content/early/2017/08/09/174243. + + +Installation (pip) +------------------- + +Install the package: + +.. code-block:: shell + + $ pip install mhcflurry + +Then download our datasets and trained models: + +.. code-block:: shell + + $ mhcflurry-downloads fetch + +From a checkout you can run the unit tests with: + +.. code-block:: shell + + $ pip install nose + $ nosetests . + + +Using conda +------------- + +You can alternatively get up and running with a `conda <https://conda.io/docs/>`__ +environment as follows. Some users have reported that this can avoid problems installing +tensorflow. + +.. code-block:: shell + + $ conda create -q -n mhcflurry-env python=3.6 'tensorflow>=1.1.2' + $ source activate mhcflurry-env + +Then continue as above: + +.. code-block:: shell + + $ pip install mhcflurry + $ mhcflurry-downloads fetch + diff --git a/docs/models.rst b/docs/models.rst new file mode 100644 index 0000000000000000000000000000000000000000..4f7dee9edb93dcf686b830795aad79bf93c74bd7 --- /dev/null +++ b/docs/models.rst @@ -0,0 +1,35 @@ +Details on the released models +=============================== + +The released MHCflurry predictor consists of an ensemble of eight models for each +supported allele. Each model in the ensemble was trained on a random 80% sample +of the data for the allele, and the remaining 20% was used for early stopping. +All models use the same architecture. The predictions are taken to be the geometric +mean of the nM binding affinity predictions of the individual models. The script +we run to train these models is in "downloads-generation/models_class1/GENERATE.sh" +in the repository. 
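+
+For illustration, the combination step amounts to the following (the affinity
+values here are hypothetical; the actual implementation lives in the predictor
+class):
+
+.. code-block:: python
+
+    import numpy
+
+    # Per-model affinity predictions (nM) for one peptide/allele pair.
+    model_predictions = [120.0, 95.0, 150.0, 110.0, 130.0, 80.0, 105.0, 140.0]
+
+    # The ensemble prediction is the geometric mean of the individual models.
+    ensemble_nm = numpy.exp(numpy.mean(numpy.log(model_predictions)))
+    print(ensemble_nm)  # approximately 114 nM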
+ +Neural network architecture +------------------------------------------------------------- + +The neural network architecture is quite simple, consisting of a locally +connected layer, a dense layer, and a sigmoid output. + +.. include:: /_build/_models_info.rst + +Architecture diagram: + +.. image:: /_build/_models_architecture.png + +Cross validation performance +------------------------------------------------------------- + +The accuracy of the MHCflurry downloadable models was estimated using 5-fold cross +validation on the training data. The values shown here are the mean cross validation +scores across folds. + +The AUC and F1 estimates use a 500 nM cutoff for distinguishing strong-binders +from weak- or non-binders. The Kendall Tau score gives the rank correlation +between the predicted and measured affinities; it uses no cutoff. + +.. include:: /_build/_models_cv.rst diff --git a/docs/models_supported_alleles.rst b/docs/models_supported_alleles.rst new file mode 100644 index 0000000000000000000000000000000000000000..0dd3c94ed86545a3781a904cdcf3b064b8189648 --- /dev/null +++ b/docs/models_supported_alleles.rst @@ -0,0 +1,6 @@ +.. _models_supported_alleles: + +Supported alleles and peptide lengths +===================================== + +.. include:: /_build/_models_supported_alleles.rst \ No newline at end of file diff --git a/docs/python_tutorial.rst b/docs/python_tutorial.rst new file mode 100644 index 0000000000000000000000000000000000000000..f8a9379984b9fec2ce7a31a41ac60b29bb8c8881 --- /dev/null +++ b/docs/python_tutorial.rst @@ -0,0 +1,158 @@ +Python library tutorial +======================= + +Predicting +---------- + +The MHCflurry Python API exposes additional options and features beyond those +supported by the commandline tools. This tutorial gives a basic overview +of the most important functionality. See the :ref:`API-documentation` for further details. + +The `~mhcflurry.Class1AffinityPredictor` class is the primary user-facing interface. +Use the `~mhcflurry.Class1AffinityPredictor.load` static method to load a +trained predictor from disk. With no arguments this method will load the predictor +released with MHCflurry (see :ref:`downloading`\ ). If you pass a path to a +models directory, then it will load that predictor instead. + +.. runblock:: pycon + + >>> from mhcflurry import Class1AffinityPredictor + >>> predictor = Class1AffinityPredictor.load() + >>> predictor.supported_alleles[:10] + +With a predictor loaded we can now generate some binding predictions: + +.. runblock:: pycon + + >>> predictor.predict(allele="HLA-A0201", peptides=["SIINFEKL", "SIINFEQL"]) + +.. note:: + + MHCflurry normalizes allele names using the `mhcnames <https://github.com/hammerlab/mhcnames>`__ + package. Names like ``HLA-A0201`` or ``A*02:01`` will be + normalized to ``HLA-A*02:01``, so most naming conventions can be used + with methods such as `~mhcflurry.Class1AffinityPredictor.predict`. + +For more detailed results, we can use +`~mhcflurry.Class1AffinityPredictor.predict_to_dataframe`. + +.. runblock:: pycon + + >>> predictor.predict_to_dataframe(allele="HLA-A0201", peptides=["SIINFEKL", "SIINFEQL"]) + +Instead of a single allele and multiple peptides, we may need predictions for +allele/peptide pairs. We can predict across pairs by specifying +the `alleles` argument instead of `allele`. The list of alleles +must be the same length as the list of peptides (i.e. it is predicting over pairs, +*not* taking the cross product). + +.. 
runblock:: pycon + + >>> predictor.predict(alleles=["HLA-A0201", "HLA-B*57:01"], peptides=["SIINFEKL", "SIINFEQL"]) + +Training +-------- + +Let's fit our own MHCflurry predictor. First we need some training data. If you +haven't already, run this in a shell to download the MHCflurry training data: + +.. code-block:: shell + + $ mhcflurry-downloads fetch data_curated + +We can get the path to this data from Python using `mhcflurry.downloads.get_path`: + +.. runblock:: pycon + + >>> from mhcflurry.downloads import get_path + >>> data_path = get_path("data_curated", "curated_training_data.csv.bz2") + >>> data_path + +Now let's load it with pandas and filter to reasonably-sized peptides: + +.. runblock:: pycon + + >>> import pandas + >>> df = pandas.read_csv(data_path) + >>> df = df.loc[(df.peptide.str.len() >= 8) & (df.peptide.str.len() <= 15)] + >>> df.head(5) + +We'll make an untrained `~mhcflurry.Class1AffinityPredictor` and then call +`~mhcflurry.Class1AffinityPredictor.fit_allele_specific_predictors` to fit +some models. + +.. runblock:: pycon + + >>> new_predictor = Class1AffinityPredictor() + >>> single_allele_train_data = df.loc[df.allele == "HLA-B*57:01"].sample(100) + >>> new_predictor.fit_allele_specific_predictors( + ... n_models=1, + ... architecture_hyperparameters={ + ... "layer_sizes": [16], + ... "max_epochs": 5, + ... "random_negative_constant": 5, + ... }, + ... peptides=single_allele_train_data.peptide.values, + ... affinities=single_allele_train_data.measurement_value.values, + ... allele="HLA-B*57:01") + +The `~mhcflurry.Class1AffinityPredictor.fit_allele_specific_predictors` method +can be called any number of times on the same instance to build up ensembles +of models across alleles. The `architecture_hyperparameters` we specified are +for demonstration purposes; to fit real models you would usually train for +more epochs. + +Now we can generate predictions: + +.. runblock:: pycon + + >>> new_predictor.predict(["SYNPEPII"], allele="HLA-B*57:01") + +We can save our predictor to the specified directory on disk by running: + +.. runblock:: pycon + + >>> new_predictor.save("/tmp/new-predictor") + +and restore it: + +.. runblock:: pycon + + >>> new_predictor2 = Class1AffinityPredictor.load("/tmp/new-predictor") + >>> new_predictor2.supported_alleles + + +Lower level interface +--------------------- + +The high-level `Class1AffinityPredictor` delegates to low-level +`~mhcflurry.Class1NeuralNetwork` objects, each of which represents +a single neural network. The purpose of `~mhcflurry.Class1AffinityPredictor` +is to implement several important features: + +ensembles + More than one neural network can be used to generate each prediction. The + predictions returned to the user are the geometric mean of the individual + model predictions. This gives higher accuracy in most situations + +multiple alleles + A `~mhcflurry.Class1NeuralNetwork` generates predictions for only a single + allele. The `~mhcflurry.Class1AffinityPredictor` maps alleles to the + relevant `~mhcflurry.Class1NeuralNetwork` instances + +serialization + Loading and saving predictors is implemented in `~mhcflurry.Class1AffinityPredictor`. + +Sometimes it's easiest to work directly with `~mhcflurry.Class1NeuralNetwork`. +Here is a simple example of doing so: + +.. runblock:: pycon + + >>> from mhcflurry import Class1NeuralNetwork + >>> network = Class1NeuralNetwork() + >>> network.fit( + ... single_allele_train_data.peptide.values, + ... single_allele_train_data.measurement_value.values, + ... 
verbose=0) + >>> network.predict(["SIINFEKLL"]) + diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..a88547504c1d7ede3af6ca6d67010fc0614ade96 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,9 @@ +sphinx +sphinxcontrib-autorun2 +sphinxcontrib-programoutput +sphinxcontrib-autoprogram +sphinx-rtd-theme +numpydoc +pypandoc +mhctools +pydot diff --git a/downloads-generation/cross_validation_class1/GENERATE.sh b/downloads-generation/cross_validation_class1/GENERATE.sh new file mode 100755 index 0000000000000000000000000000000000000000..c35e0862a0b97ddfa20e65e6117c8cdf8917d8da --- /dev/null +++ b/downloads-generation/cross_validation_class1/GENERATE.sh @@ -0,0 +1,88 @@ +#!/bin/bash +# +# Cross validation using the standard class I models. +# Splits training data into 5 folds (stratifying on allele), trains and tests a +# predictor on each (train, test) fold, and writes a summary CSV giving +# performance for each allele on each fold. +# +set -e +set -x + +DOWNLOAD_NAME=cross_validation_class1 +SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation +SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")" +SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH") + +NFOLDS=5 + +mkdir -p "$SCRATCH_DIR" +rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME" +mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME" + +# Send stdout and stderr to a logfile included with the archive. +exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt") +exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2) + +# Log some environment info +date +pip freeze +git status + +cd $SCRATCH_DIR/$DOWNLOAD_NAME + +cp $SCRIPT_DIR/hyperparameters.yaml . +cp $SCRIPT_DIR/split_folds.py . +cp $SCRIPT_DIR/score.py . + +time python split_folds.py \ + "$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" \ + --min-measurements-per-allele 75 \ + --folds $NFOLDS \ + --random-state 1 \ + --output-pattern-test "./test.fold_{}.csv" \ + --output-pattern-train "./train.fold_{}.csv" + +# Kill child processes if parent exits: +trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM EXIT + +for fold in $(seq 0 $(expr $NFOLDS - 1)) +do + mhcflurry-class1-train-allele-specific-models \ + --data train.fold_${fold}.csv \ + --hyperparameters hyperparameters.yaml \ + --out-models-dir models.fold_${fold} \ + --min-measurements-per-allele 0 \ + --percent-rank-calibration-num-peptides-per-length 0 \ + 2>&1 | tee -a LOG.train.fold_${fold}.txt & +done +wait + +echo "DONE TRAINING. NOW PREDICTING." + +for fold in $(seq 0 $(expr $NFOLDS - 1)) +do + mhcflurry-predict \ + test.fold_${fold}.csv \ + --models models.fold_${fold} \ + --no-throw \ + --include-individual-model-predictions \ + --out predictions.fold_${fold}.csv & +done +wait + +time python score.py \ + predictions.fold_*.csv \ + --out-combined predictions.combined.csv \ + --out-scores scores.csv \ + --out-summary summary.all.csv + +grep -v single summary.all.csv > summary.ensemble.csv + +cp $SCRIPT_ABSOLUTE_PATH . 
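+
+# Compress the per-fold logs and bundle everything into the download archive.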
+for i in $(ls *.txt) +do + bzip2 $i +done +tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" * + +echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2" diff --git a/downloads-generation/cross_validation_class1/README.md b/downloads-generation/cross_validation_class1/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f7584e324e3b8f2e24a8682a2cde2fde86fa94ac --- /dev/null +++ b/downloads-generation/cross_validation_class1/README.md @@ -0,0 +1,7 @@ +# Cross validation of standard Class I models + +This download contains cross validation results and intermediate data for +class I allele-specific MHCflurry models. + +This exists to track the exact steps used to generate cross-validation results. +Users will probably not interact with this directly. \ No newline at end of file diff --git a/downloads-generation/cross_validation_class1/hyperparameters.yaml b/downloads-generation/cross_validation_class1/hyperparameters.yaml new file mode 120000 index 0000000000000000000000000000000000000000..f32feef1682d757437fe1f0cee2a31c030d7508f --- /dev/null +++ b/downloads-generation/cross_validation_class1/hyperparameters.yaml @@ -0,0 +1 @@ +../models_class1/hyperparameters.yaml \ No newline at end of file diff --git a/downloads-generation/cross_validation_class1/score.py b/downloads-generation/cross_validation_class1/score.py new file mode 100644 index 0000000000000000000000000000000000000000..7af791c4ccf2bc6e7d4c77cda4d30999d7d40a2a --- /dev/null +++ b/downloads-generation/cross_validation_class1/score.py @@ -0,0 +1,103 @@ +""" +Scoring script for cross-validation. +""" +import argparse +import sys +import collections + +import pandas +from mhcflurry.scoring import make_scores + + +parser = argparse.ArgumentParser(usage = __doc__) + +parser.add_argument( + "input", metavar="INPUT.csv", help="Input CSV", nargs="+") + +parser.add_argument( + "--out-scores", + metavar="RESULT.csv") + +parser.add_argument( + "--out-combined", + metavar="COMBINED.csv") + +parser.add_argument( + "--out-summary", + metavar="RESULT.csv") + +def run(argv): + args = parser.parse_args(argv) + + df = None + for (i, filename) in enumerate(args.input): + input_df = pandas.read_csv(filename) + assert not input_df.mhcflurry_prediction.isnull().any() + + cols_to_merge = [] + input_df["prediction_%d" % i] = input_df.mhcflurry_prediction + cols_to_merge.append(input_df.columns[-1]) + if 'mhcflurry_model_single_0' in input_df.columns: + input_df["prediction_single_%d" % i] = input_df.mhcflurry_model_single_0 + cols_to_merge.append(input_df.columns[-1]) + + if df is None: + df = input_df[ + ["allele", "peptide", "measurement_value"] + cols_to_merge + ].copy() + else: + df = pandas.merge( + df, + input_df[['allele', 'peptide'] + cols_to_merge], + on=['allele', 'peptide'], + how='outer') + + print("Loaded data:") + print(df.head(5)) + + if args.out_combined: + df.to_csv(args.out_combined, index=False) + print("Wrote: %s" % args.out_combined) + + prediction_cols = [ + c + for c in df.columns + if c.startswith("prediction_") + ] + + scores_rows = [] + for (allele, allele_df) in df.groupby("allele"): + for prediction_col in prediction_cols: + sub_df = allele_df.loc[~allele_df[prediction_col].isnull()] + scores = collections.OrderedDict() + scores['allele'] = allele + scores['fold'] = prediction_col.replace("prediction_", "").replace("single_", "") + scores['kind'] = "single" if "single" in prediction_col else "ensemble" + scores['train_size'] = allele_df[prediction_col].isnull().sum() + scores['test_size'] = len(sub_df) + + # 
make_scores returns a dict with entries "auc", "f1", "tau" + scores.update( + make_scores( + sub_df.measurement_value, sub_df[prediction_col])) + scores_rows.append(scores) + scores_df = pandas.DataFrame(scores_rows) + print(scores_df) + + if args.out_scores: + scores_df.to_csv(args.out_scores, index=False) + print("Wrote: %s" % args.out_scores) + + summary_df = scores_df.groupby(["allele", "kind"])[ + ["train_size", "test_size", "auc", "f1", "tau"] + ].mean().reset_index() + print("Summary:") + print(summary_df) + + if args.out_summary: + summary_df.to_csv(args.out_summary, index=False) + print("Wrote: %s" % args.out_summary) + +if __name__ == '__main__': + run(sys.argv[1:]) + diff --git a/downloads-generation/cross_validation_class1/split_folds.py b/downloads-generation/cross_validation_class1/split_folds.py new file mode 100644 index 0000000000000000000000000000000000000000..dd49085fd6818a9a751419edce1aec38cb2f2eaf --- /dev/null +++ b/downloads-generation/cross_validation_class1/split_folds.py @@ -0,0 +1,121 @@ +""" +Split training data into CV folds. +""" +import argparse +import sys +from os.path import abspath + +import pandas +import numpy +from sklearn.model_selection import StratifiedKFold + +parser = argparse.ArgumentParser(usage = __doc__) + +parser.add_argument( + "input", metavar="INPUT.csv", help="Input CSV") + +parser.add_argument( + "--folds", metavar="N", type=int, default=5) + +parser.add_argument( + "--allele", + nargs="+", + help="Include only the specified allele(s)") + +parser.add_argument( + "--min-measurements-per-allele", + type=int, + metavar="N", + help="Use only alleles with >=N measurements.") + +parser.add_argument( + "--subsample", + type=int, + metavar="N", + help="Subsample to first N rows") + +parser.add_argument( + "--random-state", + metavar="N", + type=int, + help="Specify an int for deterministic splitting") + +parser.add_argument( + "--output-pattern-train", + default="./train.fold_{}.csv", + help="Pattern to use to generate output filename. Default: %(default)s") + +parser.add_argument( + "--output-pattern-test", + default="./test.fold_{}.csv", + help="Pattern to use to generate output filename. Default: %(default)s") + + +def run(argv): + args = parser.parse_args(argv) + + df = pandas.read_csv(args.input) + print("Loaded data with shape: %s" % str(df.shape)) + + df = df.ix[ + (df.peptide.str.len() >= 8) & (df.peptide.str.len() <= 15) + ] + print("Subselected to 8-15mers: %s" % (str(df.shape))) + + allele_counts = df.allele.value_counts() + + if args.allele: + alleles = args.allele + else: + alleles = list( + allele_counts.ix[ + allele_counts > args.min_measurements_per_allele + ].index) + + df = df.loc[df.allele.isin(alleles)].copy() + print("Potentially subselected by allele to: %s" % str(df.shape)) + + print("Data has %d alleles: %s" % ( + df.allele.nunique(), " ".join(df.allele.unique()))) + + print(df.head()) + + # Take log before taking median (in case of even number of samples). + df["measurement_value"] = numpy.log1p(df.measurement_value) + df = df.groupby(["allele", "peptide"]).measurement_value.median().reset_index() + df["measurement_value"] = numpy.expm1(df.measurement_value) + print("Took median for each duplicate peptide/allele pair: %s" % str(df.shape)) + + print(df.head()) + + if args.subsample: + df = df.head(args.subsample) + print("Subsampled to: %s" % str(df.shape)) + + kf = StratifiedKFold( + n_splits=args.folds, + shuffle=True, + random_state=args.random_state) + + # Stratify by both allele and binder vs. nonbinder. 
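+    # Each key looks like e.g. "HLA-A*02:01_binder", using a 500 nM cutoff,
+    # so every fold gets a similar per-allele mix of binders and non-binders.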
+    df["key"] = [ +        "%s_%s" % ( +            row.allele, +            "binder" if row.measurement_value < 500 else "nonbinder") +        for (_, row) in df.iterrows() +    ] + +    for i, (train, test) in enumerate(kf.split(df, df.key)): +        train_filename = args.output_pattern_train.format(i) +        test_filename = args.output_pattern_test.format(i) + +        df.iloc[train].to_csv(train_filename, index=False) +        print("Wrote: %s" % abspath(train_filename)) + +        df.iloc[test].to_csv(test_filename, index=False) +        print("Wrote: %s" % abspath(test_filename)) + + +if __name__ == '__main__': +    run(sys.argv[1:]) + diff --git a/downloads-generation/data_curated/GENERATE.sh b/downloads-generation/data_curated/GENERATE.sh index 89982e6a9c0b1f7ea598912270fd428f9a5019c6..f0a3d54aece76755ab92f1a6764243f014f112b4 100755 --- a/downloads-generation/data_curated/GENERATE.sh +++ b/downloads-generation/data_curated/GENERATE.sh @@ -1,5 +1,9 @@ #!/bin/bash - +# +# Create "curated" training data, which combines an IEDB download with additional +# published data, removes unusable entries, normalizes allele names, and performs +# other filtering and standardization. +# set -e set -x diff --git a/downloads-generation/data_iedb/GENERATE.sh b/downloads-generation/data_iedb/GENERATE.sh index 55156647d17eb21d90e3b136519f151bb3bd5cd9..a6067a36eb10c18ad1b26a17edb38f656dd106ae 100755 --- a/downloads-generation/data_iedb/GENERATE.sh +++ b/downloads-generation/data_iedb/GENERATE.sh @@ -1,5 +1,7 @@ #!/bin/bash - +# +# Download the latest MHC I ligand data from IEDB. +# set -e set -x diff --git a/downloads-generation/data_kim2014/GENERATE.sh b/downloads-generation/data_kim2014/GENERATE.sh index baf8a3a453804e47296fd0dcc7aff85b9118ae7e..dbda0fe8c12df076e5e8f3b96728eefda51f23c6 100755 --- a/downloads-generation/data_kim2014/GENERATE.sh +++ b/downloads-generation/data_kim2014/GENERATE.sh @@ -1,5 +1,8 @@ #!/bin/bash - +# +# Download some published MHC I ligand data from a location on Dropbox. +# +# set -e set -x diff --git a/downloads-generation/models_class1/GENERATE.sh b/downloads-generation/models_class1/GENERATE.sh index ae98f51ebe9a09381c3dd497d49dfcdd2643dfdb..b72334b536cca453300b52257eb8e9c65c7e0dd3 100755 --- a/downloads-generation/models_class1/GENERATE.sh +++ b/downloads-generation/models_class1/GENERATE.sh @@ -1,5 +1,9 @@ #!/bin/bash - +# +# Train standard MHCflurry Class I models. +# Calls mhcflurry-class1-train-allele-specific-models on curated training data +# using the hyperparameters in "hyperparameters.yaml". +# set -e set -x @@ -31,7 +35,8 @@ time mhcflurry-class1-train-allele-specific-models \ --data "$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" \ --hyperparameters hyperparameters.yaml \ --out-models-dir models \ - --min-measurements-per-allele 200 + --percent-rank-calibration-num-peptides-per-length 1000000 \ + --min-measurements-per-allele 75 cp $SCRIPT_ABSOLUTE_PATH .
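The `--percent-rank-calibration-num-peptides-per-length` flag added above makes training precompute, for each allele, the distribution of predicted affinities over a large universe of random peptides, so that predictions can later be reported as percentile ranks. As a hypothetical usage sketch (not part of this commit; method and argument names follow the `help()` output reproduced in the notebook later in this diff), this is how the calibrated ranks surface in the Python API:

```python
from mhcflurry import Class1AffinityPredictor

# Load the trained "models_class1" download (fetched via mhcflurry-downloads).
predictor = Class1AffinityPredictor.load()

# predict_to_dataframe() appends a "prediction_percentile" column when the
# loaded models carry calibrated percent-rank transforms.
df = predictor.predict_to_dataframe(
    peptides=["SIINFEKL", "SIINFEQL"],
    allele="HLA-A0201",
    include_percentile_ranks=True)
print(df)

# Percentile ranks can also be computed directly from nM affinities:
print(predictor.percentile_ranks([100.0, 5000.0], allele="HLA-A0201"))
```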
bzip2 LOG.txt diff --git a/downloads-generation/models_class1/hyperparameters.yaml b/downloads-generation/models_class1/hyperparameters.yaml index 0e114b320231a585d8ef78df925caff74dbf43c6..9d38ad1af3f7c39b576e60e218fd9fb85eb4cd5f 100644 --- a/downloads-generation/models_class1/hyperparameters.yaml +++ b/downloads-generation/models_class1/hyperparameters.yaml @@ -2,15 +2,16 @@ ########################################## # ENSEMBLE SIZE ########################################## -"n_models": 12, +"n_models": 8, ########################################## # OPTIMIZATION ########################################## "max_epochs": 500, -"patience": 10, +"patience": 20, "early_stopping": true, "validation_split": 0.2, +"minibatch_size": 128, ########################################## # RANDOM NEGATIVE PEPTIDES @@ -26,17 +27,13 @@ # One of "one-hot", "embedding", or "BLOSUM62". "peptide_amino_acid_encoding": "BLOSUM62", "use_embedding": false, # maintained for backward compatability +"embedding_output_dim": 8, # only used if using embedding "kmer_size": 15, ########################################## # NEURAL NETWORK ARCHITECTURE ########################################## "locally_connected_layers": [ - { - "filters": 8, - "activation": "tanh", - "kernel_size": 3 - }, { "filters": 8, "activation": "tanh", @@ -45,9 +42,7 @@ ], "activation": "relu", "output_activation": "sigmoid", -"layer_sizes": [ - 32 -], +"layer_sizes": [16], "dense_layer_l1_regularization": 0.001, "batch_normalization": false, "dropout_probability": 0.0, diff --git a/downloads-generation/models_class1_experiments1/GENERATE.sh b/downloads-generation/models_class1_experiments1/GENERATE.sh index 50d39f496e7fe929ad577f804b025dbd8f96b1cb..89921d924e819b8ac41c6e274730182664013e22 100755 --- a/downloads-generation/models_class1_experiments1/GENERATE.sh +++ b/downloads-generation/models_class1_experiments1/GENERATE.sh @@ -1,5 +1,9 @@ #!/bin/bash - +# +# Train "experimental" models using various hyperparameter combinations. +# This trains models only for a small number of alleles for which we have good +# mass-spec validation data. +# set -e set -x @@ -8,6 +12,9 @@ SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")" SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH") +# Terminate children on exit +trap "trap - SIGTERM && kill -- -$$" SIGINT SIGTERM EXIT + mkdir -p "$SCRATCH_DIR" rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME" mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME" @@ -23,31 +30,38 @@ git status cd $SCRATCH_DIR/$DOWNLOAD_NAME +ALLELES="HLA-A*01:01 HLA-A*02:01 HLA-A*02:03 HLA-A*02:07 HLA-A*03:01 HLA-A*11:01 HLA-A*24:02 HLA-A*29:02 HLA-A*31:01 HLA-A*68:02 HLA-B*07:02 HLA-B*15:01 HLA-B*35:01 HLA-B*44:02 HLA-B*44:03 HLA-B*51:01 HLA-B*54:01 HLA-B*57:01" + # Standard architecture on quantitative only -cp $SCRIPT_DIR/hyperparameters-standard.json . +cp $SCRIPT_DIR/hyperparameters-standard.yaml . 
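+# Note: hyperparameters-standard.yaml is a symlink to ../models_class1/hyperparameters.yaml (created later in this diff), so the "standard" arm trains the production architecture, restricted here to quantitative measurements.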
mkdir models-standard-quantitative time mhcflurry-class1-train-allele-specific-models \ --data "$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" \ --only-quantitative \ - --hyperparameters hyperparameters-standard.json \ + --hyperparameters hyperparameters-standard.yaml \ --out-models-dir models-standard-quantitative \ - --min-measurements-per-allele 100 & + --percent-rank-calibration-num-peptides-per-length 0 \ + --allele $ALLELES 2>&1 | tee -a LOG.standard.txt & # Model variations on qualitative + quantitative -for mod in 0local_noL1 0local 1local dense16 dense64 noL1 +for mod in 0local_noL1 0local 2local widelocal dense8 dense32 noL1 onehot embedding do - cp $SCRIPT_DIR/hyperparameters-${mod}.json . + cp $SCRIPT_DIR/hyperparameters-${mod}.yaml . mkdir models-${mod} time mhcflurry-class1-train-allele-specific-models \ --data "$(mhcflurry-downloads path data_curated)/curated_training_data.csv.bz2" \ - --hyperparameters hyperparameters-${mod}.json \ + --hyperparameters hyperparameters-${mod}.yaml \ --out-models-dir models-${mod} \ - --min-measurements-per-allele 100 & + --percent-rank-calibration-num-peptides-per-length 0 \ + --allele $ALLELES 2>&1 | tee -a LOG.${mod}.txt & done wait cp $SCRIPT_ABSOLUTE_PATH . -bzip2 LOG.txt +for i in $(ls *.txt) +do + bzip2 $i +done tar -cjf "../${DOWNLOAD_NAME}.tar.bz2" * echo "Created archive: $SCRATCH_DIR/$DOWNLOAD_NAME.tar.bz2" diff --git a/downloads-generation/models_class1_experiments1/hyperparameters-0local.json b/downloads-generation/models_class1_experiments1/hyperparameters-0local.json deleted file mode 100644 index d6ddd3d1991f6d69aa6199f045b3798bd39dde1b..0000000000000000000000000000000000000000 --- a/downloads-generation/models_class1_experiments1/hyperparameters-0local.json +++ /dev/null @@ -1,26 +0,0 @@ -[ - { - "n_models": 12, - "max_epochs": 500, - "patience": 10, - "early_stopping": true, - "validation_split": 0.2, - - "random_negative_rate": 0.0, - "random_negative_constant": 25, - - "use_embedding": false, - "kmer_size": 15, - "batch_normalization": false, - "locally_connected_layers": [], - "activation": "relu", - "output_activation": "sigmoid", - "layer_sizes": [ - 32 - ], - "random_negative_affinity_min": 20000.0, - "random_negative_affinity_max": 50000.0, - "dense_layer_l1_regularization": 0.001, - "dropout_probability": 0.0 - } -] diff --git a/downloads-generation/models_class1_experiments1/hyperparameters-0local.yaml b/downloads-generation/models_class1_experiments1/hyperparameters-0local.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e39d96d6cdb7c2ff555294049bd6eb690692ef81 --- /dev/null +++ b/downloads-generation/models_class1_experiments1/hyperparameters-0local.yaml @@ -0,0 +1,45 @@ +[{ +########################################## +# ENSEMBLE SIZE +########################################## +"n_models": 8, + +########################################## +# OPTIMIZATION +########################################## +"max_epochs": 500, +"patience": 10, +"early_stopping": true, +"validation_split": 0.2, +"minibatch_size": 128, + +########################################## +# RANDOM NEGATIVE PEPTIDES +########################################## +"random_negative_rate": 0.0, +"random_negative_constant": 25, +"random_negative_affinity_min": 20000.0, +"random_negative_affinity_max": 50000.0, + +########################################## +# PEPTIDE REPRESENTATION +########################################## +# One of "one-hot", "embedding", or "BLOSUM62". 
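+# (Encoding semantics, as assumed here: "one-hot" uses indicator vectors, "embedding" learns a dense representation whose width is set by embedding_output_dim, and "BLOSUM62" represents each residue by its row of the BLOSUM62 substitution matrix.)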
+"peptide_amino_acid_encoding": "BLOSUM62", +"use_embedding": false, # maintained for backward compatability +"kmer_size": 15, + +########################################## +# NEURAL NETWORK ARCHITECTURE +########################################## +"locally_connected_layers": [ +], +"activation": "relu", +"output_activation": "sigmoid", +"layer_sizes": [ + 16 +], +"dense_layer_l1_regularization": 0.001, +"batch_normalization": false, +"dropout_probability": 0.0, +}] diff --git a/downloads-generation/models_class1_experiments1/hyperparameters-0local_noL1.json b/downloads-generation/models_class1_experiments1/hyperparameters-0local_noL1.json deleted file mode 100644 index 52fdeee8e3cc8f9fe07fa9f28c562120e715c556..0000000000000000000000000000000000000000 --- a/downloads-generation/models_class1_experiments1/hyperparameters-0local_noL1.json +++ /dev/null @@ -1,26 +0,0 @@ -[ - { - "n_models": 12, - "max_epochs": 500, - "patience": 10, - "early_stopping": true, - "validation_split": 0.2, - - "random_negative_rate": 0.0, - "random_negative_constant": 25, - - "use_embedding": false, - "kmer_size": 15, - "batch_normalization": false, - "locally_connected_layers": [], - "activation": "relu", - "output_activation": "sigmoid", - "layer_sizes": [ - 32 - ], - "random_negative_affinity_min": 20000.0, - "random_negative_affinity_max": 50000.0, - "dense_layer_l1_regularization": 0.0, - "dropout_probability": 0.0 - } -] diff --git a/downloads-generation/models_class1_experiments1/hyperparameters-0local_noL1.yaml b/downloads-generation/models_class1_experiments1/hyperparameters-0local_noL1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..abe4d296fa4c3d0a6d6bd3967ea28facfcff4e6a --- /dev/null +++ b/downloads-generation/models_class1_experiments1/hyperparameters-0local_noL1.yaml @@ -0,0 +1,45 @@ +[{ +########################################## +# ENSEMBLE SIZE +########################################## +"n_models": 8, + +########################################## +# OPTIMIZATION +########################################## +"max_epochs": 500, +"patience": 10, +"early_stopping": true, +"validation_split": 0.2, +"minibatch_size": 128, + +########################################## +# RANDOM NEGATIVE PEPTIDES +########################################## +"random_negative_rate": 0.0, +"random_negative_constant": 25, +"random_negative_affinity_min": 20000.0, +"random_negative_affinity_max": 50000.0, + +########################################## +# PEPTIDE REPRESENTATION +########################################## +# One of "one-hot", "embedding", or "BLOSUM62". 
+"peptide_amino_acid_encoding": "BLOSUM62", +"use_embedding": false, # maintained for backward compatability +"kmer_size": 15, + +########################################## +# NEURAL NETWORK ARCHITECTURE +########################################## +"locally_connected_layers": [ +], +"activation": "relu", +"output_activation": "sigmoid", +"layer_sizes": [ + 16 +], +"dense_layer_l1_regularization": 0.0, +"batch_normalization": false, +"dropout_probability": 0.0, +}] diff --git a/downloads-generation/models_class1_experiments1/hyperparameters-1local.json b/downloads-generation/models_class1_experiments1/hyperparameters-1local.json deleted file mode 100644 index dde317fa2e2f1601b71e54629c11c2f8ef660d41..0000000000000000000000000000000000000000 --- a/downloads-generation/models_class1_experiments1/hyperparameters-1local.json +++ /dev/null @@ -1,32 +0,0 @@ -[ - { - "n_models": 12, - "max_epochs": 500, - "patience": 10, - "early_stopping": true, - "validation_split": 0.2, - - "random_negative_rate": 0.0, - "random_negative_constant": 25, - - "use_embedding": false, - "kmer_size": 15, - "batch_normalization": false, - "locally_connected_layers": [ - { - "filters": 8, - "activation": "tanh", - "kernel_size": 3 - } - ], - "activation": "relu", - "output_activation": "sigmoid", - "layer_sizes": [ - 32 - ], - "random_negative_affinity_min": 20000.0, - "random_negative_affinity_max": 50000.0, - "dense_layer_l1_regularization": 0.001, - "dropout_probability": 0.0 - } -] diff --git a/downloads-generation/models_class1_experiments1/hyperparameters-2local.yaml b/downloads-generation/models_class1_experiments1/hyperparameters-2local.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4ce0eea23cf9097e9bdf6fa771f3046f154deabb --- /dev/null +++ b/downloads-generation/models_class1_experiments1/hyperparameters-2local.yaml @@ -0,0 +1,55 @@ +[{ +########################################## +# ENSEMBLE SIZE +########################################## +"n_models": 8, + +########################################## +# OPTIMIZATION +########################################## +"max_epochs": 500, +"patience": 10, +"early_stopping": true, +"validation_split": 0.2, +"minibatch_size": 128, + +########################################## +# RANDOM NEGATIVE PEPTIDES +########################################## +"random_negative_rate": 0.0, +"random_negative_constant": 25, +"random_negative_affinity_min": 20000.0, +"random_negative_affinity_max": 50000.0, + +########################################## +# PEPTIDE REPRESENTATION +########################################## +# One of "one-hot", "embedding", or "BLOSUM62". 
+"peptide_amino_acid_encoding": "BLOSUM62", +"use_embedding": false, # maintained for backward compatability +"kmer_size": 15, + +########################################## +# NEURAL NETWORK ARCHITECTURE +########################################## +"locally_connected_layers": [ + { + "filters": 8, + "activation": "tanh", + "kernel_size": 3 + }, + { + "filters": 8, + "activation": "tanh", + "kernel_size": 3 + } +], +"activation": "relu", +"output_activation": "sigmoid", +"layer_sizes": [ + 16 +], +"dense_layer_l1_regularization": 0.001, +"batch_normalization": false, +"dropout_probability": 0.0, +}] diff --git a/downloads-generation/models_class1_experiments1/hyperparameters-dense16.json b/downloads-generation/models_class1_experiments1/hyperparameters-dense16.json deleted file mode 100644 index c25383f5428ffef261b11bc3041a5d7481983018..0000000000000000000000000000000000000000 --- a/downloads-generation/models_class1_experiments1/hyperparameters-dense16.json +++ /dev/null @@ -1,37 +0,0 @@ -[ - { - "n_models": 12, - "max_epochs": 500, - "patience": 10, - "early_stopping": true, - "validation_split": 0.2, - - "random_negative_rate": 0.0, - "random_negative_constant": 25, - - "use_embedding": false, - "kmer_size": 15, - "batch_normalization": false, - "locally_connected_layers": [ - { - "filters": 8, - "activation": "tanh", - "kernel_size": 3 - }, - { - "filters": 8, - "activation": "tanh", - "kernel_size": 3 - } - ], - "activation": "relu", - "output_activation": "sigmoid", - "layer_sizes": [ - 16 - ], - "random_negative_affinity_min": 20000.0, - "random_negative_affinity_max": 50000.0, - "dense_layer_l1_regularization": 0.001, - "dropout_probability": 0.0 - } -] diff --git a/downloads-generation/models_class1_experiments1/hyperparameters-dense32.yaml b/downloads-generation/models_class1_experiments1/hyperparameters-dense32.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ef6b334a1155888d9f3d6847b6ec32c996c98f4f --- /dev/null +++ b/downloads-generation/models_class1_experiments1/hyperparameters-dense32.yaml @@ -0,0 +1,50 @@ +[{ +########################################## +# ENSEMBLE SIZE +########################################## +"n_models": 8, + +########################################## +# OPTIMIZATION +########################################## +"max_epochs": 500, +"patience": 10, +"early_stopping": true, +"validation_split": 0.2, +"minibatch_size": 128, + +########################################## +# RANDOM NEGATIVE PEPTIDES +########################################## +"random_negative_rate": 0.0, +"random_negative_constant": 25, +"random_negative_affinity_min": 20000.0, +"random_negative_affinity_max": 50000.0, + +########################################## +# PEPTIDE REPRESENTATION +########################################## +# One of "one-hot", "embedding", or "BLOSUM62". 
+"peptide_amino_acid_encoding": "BLOSUM62", +"use_embedding": false, # maintained for backward compatability +"kmer_size": 15, + +########################################## +# NEURAL NETWORK ARCHITECTURE +########################################## +"locally_connected_layers": [ + { + "filters": 8, + "activation": "tanh", + "kernel_size": 3 + } +], +"activation": "relu", +"output_activation": "sigmoid", +"layer_sizes": [ + 32 +], +"dense_layer_l1_regularization": 0.001, +"batch_normalization": false, +"dropout_probability": 0.0, +}] diff --git a/downloads-generation/models_class1_experiments1/hyperparameters-dense64.json b/downloads-generation/models_class1_experiments1/hyperparameters-dense64.json deleted file mode 100644 index e54f47b837927211e98e6e3b4810356c9d5d7b49..0000000000000000000000000000000000000000 --- a/downloads-generation/models_class1_experiments1/hyperparameters-dense64.json +++ /dev/null @@ -1,37 +0,0 @@ -[ - { - "n_models": 12, - "max_epochs": 500, - "patience": 10, - "early_stopping": true, - "validation_split": 0.2, - - "random_negative_rate": 0.0, - "random_negative_constant": 25, - - "use_embedding": false, - "kmer_size": 15, - "batch_normalization": false, - "locally_connected_layers": [ - { - "filters": 8, - "activation": "tanh", - "kernel_size": 3 - }, - { - "filters": 8, - "activation": "tanh", - "kernel_size": 3 - } - ], - "activation": "relu", - "output_activation": "sigmoid", - "layer_sizes": [ - 64 - ], - "random_negative_affinity_min": 20000.0, - "random_negative_affinity_max": 50000.0, - "dense_layer_l1_regularization": 0.001, - "dropout_probability": 0.0 - } -] diff --git a/downloads-generation/models_class1_experiments1/hyperparameters-dense8.yaml b/downloads-generation/models_class1_experiments1/hyperparameters-dense8.yaml new file mode 100644 index 0000000000000000000000000000000000000000..b16983a8ad6a06c671e4f9b63c47cf97f15e3ef3 --- /dev/null +++ b/downloads-generation/models_class1_experiments1/hyperparameters-dense8.yaml @@ -0,0 +1,50 @@ +[{ +########################################## +# ENSEMBLE SIZE +########################################## +"n_models": 8, + +########################################## +# OPTIMIZATION +########################################## +"max_epochs": 500, +"patience": 10, +"early_stopping": true, +"validation_split": 0.2, +"minibatch_size": 128, + +########################################## +# RANDOM NEGATIVE PEPTIDES +########################################## +"random_negative_rate": 0.0, +"random_negative_constant": 25, +"random_negative_affinity_min": 20000.0, +"random_negative_affinity_max": 50000.0, + +########################################## +# PEPTIDE REPRESENTATION +########################################## +# One of "one-hot", "embedding", or "BLOSUM62". 
+"peptide_amino_acid_encoding": "BLOSUM62", +"use_embedding": false, # maintained for backward compatability +"kmer_size": 15, + +########################################## +# NEURAL NETWORK ARCHITECTURE +########################################## +"locally_connected_layers": [ + { + "filters": 8, + "activation": "tanh", + "kernel_size": 3 + } +], +"activation": "relu", +"output_activation": "sigmoid", +"layer_sizes": [ + 8 +], +"dense_layer_l1_regularization": 0.001, +"batch_normalization": false, +"dropout_probability": 0.0, +}] diff --git a/downloads-generation/models_class1_experiments1/hyperparameters-embedding.yaml b/downloads-generation/models_class1_experiments1/hyperparameters-embedding.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ea30d9eb0fcc9bb80d5c59a3424a8073ff0695cf --- /dev/null +++ b/downloads-generation/models_class1_experiments1/hyperparameters-embedding.yaml @@ -0,0 +1,51 @@ +[{ +########################################## +# ENSEMBLE SIZE +########################################## +"n_models": 8, + +########################################## +# OPTIMIZATION +########################################## +"max_epochs": 500, +"patience": 10, +"early_stopping": true, +"validation_split": 0.2, +"minibatch_size": 128, + +########################################## +# RANDOM NEGATIVE PEPTIDES +########################################## +"random_negative_rate": 0.0, +"random_negative_constant": 25, +"random_negative_affinity_min": 20000.0, +"random_negative_affinity_max": 50000.0, + +########################################## +# PEPTIDE REPRESENTATION +########################################## +# One of "one-hot", "embedding", or "BLOSUM62". +"peptide_amino_acid_encoding": "embedding", +"use_embedding": true, # maintained for backward compatability +"embedding_output_dim": 8, # only used if using embedding +"kmer_size": 15, + +########################################## +# NEURAL NETWORK ARCHITECTURE +########################################## +"locally_connected_layers": [ + { + "filters": 8, + "activation": "tanh", + "kernel_size": 3 + } +], +"activation": "relu", +"output_activation": "sigmoid", +"layer_sizes": [ + 16 +], +"dense_layer_l1_regularization": 0.001, +"batch_normalization": false, +"dropout_probability": 0.0, +}] diff --git a/downloads-generation/models_class1_experiments1/hyperparameters-noL1.json b/downloads-generation/models_class1_experiments1/hyperparameters-noL1.json deleted file mode 100644 index d4e1a4b5832808ab60473340d9659ffbdbd5865e..0000000000000000000000000000000000000000 --- a/downloads-generation/models_class1_experiments1/hyperparameters-noL1.json +++ /dev/null @@ -1,37 +0,0 @@ -[ - { - "n_models": 12, - "max_epochs": 500, - "patience": 10, - "early_stopping": true, - "validation_split": 0.2, - - "random_negative_rate": 0.0, - "random_negative_constant": 25, - - "use_embedding": false, - "kmer_size": 15, - "batch_normalization": false, - "locally_connected_layers": [ - { - "filters": 8, - "activation": "tanh", - "kernel_size": 3 - }, - { - "filters": 8, - "activation": "tanh", - "kernel_size": 3 - } - ], - "activation": "relu", - "output_activation": "sigmoid", - "layer_sizes": [ - 32 - ], - "random_negative_affinity_min": 20000.0, - "random_negative_affinity_max": 50000.0, - "dense_layer_l1_regularization": 0.0, - "dropout_probability": 0.0 - } -] diff --git a/downloads-generation/models_class1_experiments1/hyperparameters-noL1.yaml b/downloads-generation/models_class1_experiments1/hyperparameters-noL1.yaml new file mode 
100644 index 0000000000000000000000000000000000000000..25c1942bc9ac420f9f12851049ff08856fa8de36 --- /dev/null +++ b/downloads-generation/models_class1_experiments1/hyperparameters-noL1.yaml @@ -0,0 +1,50 @@ +[{ +########################################## +# ENSEMBLE SIZE +########################################## +"n_models": 8, + +########################################## +# OPTIMIZATION +########################################## +"max_epochs": 500, +"patience": 10, +"early_stopping": true, +"validation_split": 0.2, +"minibatch_size": 128, + +########################################## +# RANDOM NEGATIVE PEPTIDES +########################################## +"random_negative_rate": 0.0, +"random_negative_constant": 25, +"random_negative_affinity_min": 20000.0, +"random_negative_affinity_max": 50000.0, + +########################################## +# PEPTIDE REPRESENTATION +########################################## +# One of "one-hot", "embedding", or "BLOSUM62". +"peptide_amino_acid_encoding": "BLOSUM62", +"use_embedding": false, # maintained for backward compatability +"kmer_size": 15, + +########################################## +# NEURAL NETWORK ARCHITECTURE +########################################## +"locally_connected_layers": [ + { + "filters": 8, + "activation": "tanh", + "kernel_size": 3 + } +], +"activation": "relu", +"output_activation": "sigmoid", +"layer_sizes": [ + 16 +], +"dense_layer_l1_regularization": 0.0, +"batch_normalization": false, +"dropout_probability": 0.0, +}] diff --git a/downloads-generation/models_class1_experiments1/hyperparameters-onehot.yaml b/downloads-generation/models_class1_experiments1/hyperparameters-onehot.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d6c83f9388d4d40665da8e0f1bdf1e99c011e799 --- /dev/null +++ b/downloads-generation/models_class1_experiments1/hyperparameters-onehot.yaml @@ -0,0 +1,50 @@ +[{ +########################################## +# ENSEMBLE SIZE +########################################## +"n_models": 8, + +########################################## +# OPTIMIZATION +########################################## +"max_epochs": 500, +"patience": 10, +"early_stopping": true, +"validation_split": 0.2, +"minibatch_size": 128, + +########################################## +# RANDOM NEGATIVE PEPTIDES +########################################## +"random_negative_rate": 0.0, +"random_negative_constant": 25, +"random_negative_affinity_min": 20000.0, +"random_negative_affinity_max": 50000.0, + +########################################## +# PEPTIDE REPRESENTATION +########################################## +# One of "one-hot", "embedding", or "BLOSUM62". 
+"peptide_amino_acid_encoding": "one-hot", +"use_embedding": false, # maintained for backward compatability +"kmer_size": 15, + +########################################## +# NEURAL NETWORK ARCHITECTURE +########################################## +"locally_connected_layers": [ + { + "filters": 8, + "activation": "tanh", + "kernel_size": 3 + } +], +"activation": "relu", +"output_activation": "sigmoid", +"layer_sizes": [ + 16 +], +"dense_layer_l1_regularization": 0.001, +"batch_normalization": false, +"dropout_probability": 0.0, +}] diff --git a/downloads-generation/models_class1_experiments1/hyperparameters-standard.json b/downloads-generation/models_class1_experiments1/hyperparameters-standard.json deleted file mode 120000 index 8d78d6311cca714aa8ba4f3e806a2bafeb793aac..0000000000000000000000000000000000000000 --- a/downloads-generation/models_class1_experiments1/hyperparameters-standard.json +++ /dev/null @@ -1 +0,0 @@ -../models_class1/hyperparameters.json \ No newline at end of file diff --git a/downloads-generation/models_class1_experiments1/hyperparameters-standard.yaml b/downloads-generation/models_class1_experiments1/hyperparameters-standard.yaml new file mode 120000 index 0000000000000000000000000000000000000000..f32feef1682d757437fe1f0cee2a31c030d7508f --- /dev/null +++ b/downloads-generation/models_class1_experiments1/hyperparameters-standard.yaml @@ -0,0 +1 @@ +../models_class1/hyperparameters.yaml \ No newline at end of file diff --git a/downloads-generation/models_class1_experiments1/hyperparameters-widelocal.yaml b/downloads-generation/models_class1_experiments1/hyperparameters-widelocal.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6123c79fb88c99ab596804e307caf2c294dcbcba --- /dev/null +++ b/downloads-generation/models_class1_experiments1/hyperparameters-widelocal.yaml @@ -0,0 +1,51 @@ +[{ +########################################## +# ENSEMBLE SIZE +########################################## +"n_models": 8, + +########################################## +# OPTIMIZATION +########################################## +"max_epochs": 500, +"patience": 10, +"early_stopping": true, +"validation_split": 0.2, +"minibatch_size": 128, + +########################################## +# RANDOM NEGATIVE PEPTIDES +########################################## +"random_negative_rate": 0.0, +"random_negative_constant": 25, +"random_negative_affinity_min": 20000.0, +"random_negative_affinity_max": 50000.0, + +########################################## +# PEPTIDE REPRESENTATION +########################################## +# One of "one-hot", "embedding", or "BLOSUM62". 
+"peptide_amino_acid_encoding": "BLOSUM62", +"use_embedding": false, # maintained for backward compatability +"embedding_output_dim": 8, # only used if using embedding +"kmer_size": 15, + +########################################## +# NEURAL NETWORK ARCHITECTURE +########################################## +"locally_connected_layers": [ + { + "filters": 8, + "activation": "tanh", + "kernel_size": 5 + } +], +"activation": "relu", +"output_activation": "sigmoid", +"layer_sizes": [ + 16 +], +"dense_layer_l1_regularization": 0.001, +"batch_normalization": false, +"dropout_probability": 0.0, +}] diff --git a/examples/class1_allele_specific_models.ipynb b/examples/class1_allele_specific_models.ipynb index 45d85d04b2906aa7e19bcfc8d637c238e7d3212c..e6857e7c23eada48232dcbcf595a0d294ec279a2 100644 --- a/examples/class1_allele_specific_models.ipynb +++ b/examples/class1_allele_specific_models.ipynb @@ -2,25 +2,16 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 22, "metadata": { "collapsed": false }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/tim/miniconda3/envs/py3k/lib/python3.5/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.\n", - " warnings.warn(self.msg_depr % (key, alt_key))\n", - "Using Theano backend.\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "MHCflurry version: 0.9.0\n" + "MHCflurry version: 1.0.0\n" ] } ], @@ -54,15 +45,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "/Users/tim/miniconda3/envs/py3k/lib/python3.5/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.\n", - " warnings.warn(self.msg_depr % (key, alt_key))\n", - "Using Theano backend.\n", - "Fetching 0/4 downloads from release 1.0.0\n", - "DOWNLOAD NAME ALREADY DOWNLOADED? WILL DOWNLOAD NOW? URL \n", - "models_class1 YES NO http://github.com/hammerlab/mhcflurry/releases/download/pre-1.0.0-alpha/models_class1.tar.bz2 \n", - "data_curated YES NO https://github.com/hammerlab/mhcflurry/releases/download/pre-1.0.0-alpha/data_curated.tar.bz2 \n", - "data_kim2014 YES NO http://github.com/hammerlab/mhcflurry/releases/download/0.0.8/data_kim2014.tar.bz2 \n", - "data_iedb YES NO https://github.com/hammerlab/mhcflurry/releases/download/pre-1.0.0-alpha/data_iedb.tar.bz2 \n" + "Fetching 0/6 downloads from release 1.0.0\r\n", + "DOWNLOAD NAME ALREADY DOWNLOADED? WILL DOWNLOAD NOW? 
URL \r\n", + "models_class1 YES NO http://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/models_class1.tar.bz2 \r\n", + "models_class1_experiments1 NO NO http://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/models_class1_experiments1.tar.bz2 \r\n", + "cross_validation_class1 NO NO http://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/cross_validation_class1.tar.bz2 \r\n", + "data_iedb NO NO https://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/data_iedb.tar.bz2 \r\n", + "data_kim2014 NO NO http://github.com/hammerlab/mhcflurry/releases/download/0.9.1/data_kim2014.tar.bz2 \r\n", + "data_curated YES NO https://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/data_curated.tar.bz2 \r\n" ] } ], @@ -88,7 +78,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Help on class Class1AffinityPredictor in module mhcflurry.class1_affinity_prediction.class1_affinity_predictor:\n", + "Help on class Class1AffinityPredictor in module mhcflurry.class1_affinity_predictor:\n", "\n", "class Class1AffinityPredictor(builtins.object)\n", " | High-level interface for peptide/MHC I binding affinity prediction.\n", @@ -101,7 +91,7 @@ " | \n", " | Methods defined here:\n", " | \n", - " | __init__(self, allele_to_allele_specific_models=None, class1_pan_allele_models=None, allele_to_pseudosequence=None, manifest_df=None)\n", + " | __init__(self, allele_to_allele_specific_models=None, class1_pan_allele_models=None, allele_to_pseudosequence=None, manifest_df=None, allele_to_percent_rank_transform=None)\n", " | Parameters\n", " | ----------\n", " | allele_to_allele_specific_models : dict of string -> list of Class1NeuralNetwork\n", @@ -118,8 +108,34 @@ " | Only required if you want to update an existing serialization of a\n", " | Class1AffinityPredictor. Otherwise this dataframe will be generated\n", " | automatically based on the supplied models.\n", + " | \n", + " | allele_to_percent_rank_transform : dict of string -> PercentRankTransform, optional\n", + " | PercentRankTransform instances to use for each allele\n", + " | \n", + " | calibrate_percentile_ranks(self, peptides=None, num_peptides_per_length=100000, alleles=None, bins=None, quiet=False)\n", + " | Compute the cumulative distribution of ic50 values for a set of alleles\n", + " | over a large universe of random peptides, to enable computing quantiles in\n", + " | this distribution later.\n", + " | \n", + " | Parameters\n", + " | ----------\n", + " | peptides : sequence of string, optional\n", + " | Peptides to use\n", + " | num_peptides_per_length : int, optional\n", + " | If peptides argument is not specified, then num_peptides_per_length\n", + " | peptides are randomly sampled from a uniform distribution for each\n", + " | supported length\n", + " | alleles : sequence of string, optional\n", + " | Alleles to perform calibration for. If not specified all supported\n", + " | alleles will be calibrated.\n", + " | bins : object\n", + " | Anything that can be passed to numpy.histogram's \"bins\" argument\n", + " | can be used here, i.e. either an integer or a sequence giving bin\n", + " | edges. 
This is in ic50 space.\n", + " | quiet : boolean\n", + " | If False (default), status updates will be printed to stdout.\n", " | \n", - " | fit_allele_specific_predictors(self, n_models, architecture_hyperparameters, allele, peptides, affinities, models_dir_for_save=None, verbose=1)\n", + " | fit_allele_specific_predictors(self, n_models, architecture_hyperparameters, allele, peptides, affinities, models_dir_for_save=None, verbose=1, progress_preamble='')\n", " | Fit one or more allele specific predictors for a single allele using a\n", " | single neural network architecture.\n", " | \n", @@ -147,11 +163,14 @@ " | verbose : int\n", " | Keras verbosity\n", " | \n", + " | progress_preamble : string\n", + " | Optional string of information to include in each progress update\n", + " | \n", " | Returns\n", " | -------\n", " | list of Class1NeuralNetwork\n", " | \n", - " | fit_class1_pan_allele_models(self, n_models, architecture_hyperparameters, alleles, peptides, affinities, models_dir_for_save=None, verbose=1)\n", + " | fit_class1_pan_allele_models(self, n_models, architecture_hyperparameters, alleles, peptides, affinities, models_dir_for_save=None, verbose=1, progress_preamble='')\n", " | Fit one or more pan-allele predictors using a single neural network\n", " | architecture.\n", " | \n", @@ -180,10 +199,34 @@ " | verbose : int\n", " | Keras verbosity\n", " | \n", + " | progress_preamble : string\n", + " | Optional string of information to include in each progress update\n", + " | \n", " | Returns\n", " | -------\n", " | list of Class1NeuralNetwork\n", " | \n", + " | percentile_ranks(self, affinities, allele=None, alleles=None, throw=True)\n", + " | Return percentile ranks for the given ic50 affinities and alleles.\n", + " | \n", + " | The 'allele' and 'alleles' argument are as in the predict() method.\n", + " | Specify one of these.\n", + " | \n", + " | Parameters\n", + " | ----------\n", + " | affinities : sequence of float\n", + " | nM affinities\n", + " | allele : string\n", + " | alleles : sequence of string\n", + " | throw : boolean\n", + " | If True, a ValueError will be raised in the case of unsupported\n", + " | alleles. If False, a warning will be logged and NaN will be returned\n", + " | for those percentile ranks.\n", + " | \n", + " | Returns\n", + " | -------\n", + " | numpy.array of float\n", + " | \n", " | predict(self, peptides, alleles=None, allele=None, throw=True)\n", " | Predict nM binding affinities.\n", " | \n", @@ -209,7 +252,7 @@ " | -------\n", " | numpy.array of predictions\n", " | \n", - " | predict_to_dataframe(self, peptides, alleles=None, allele=None, throw=True, include_individual_model_predictions=False)\n", + " | predict_to_dataframe(self, peptides, alleles=None, allele=None, throw=True, include_individual_model_predictions=False, include_percentile_ranks=True)\n", " | Predict nM binding affinities. Gives more detailed output than `predict`\n", " | method, including 5-95% prediction intervals.\n", " | \n", @@ -226,13 +269,17 @@ " | peptides : EncodableSequences or list of string\n", " | alleles : list of string\n", " | allele : string\n", - " | include_individual_model_predictions : boolean\n", - " | If True, the predictions of each individual model are incldued as\n", - " | columns in the result dataframe.\n", " | throw : boolean\n", " | If True, a ValueError will be raised in the case of unsupported\n", " | alleles or peptide lengths. 
If False, a warning will be logged and\n", " | the predictions for the unsupported alleles or peptides will be NaN.\n", + " | include_individual_model_predictions : boolean\n", + " | If True, the predictions of each individual model are included as\n", + " | columns in the result dataframe.\n", + " | include_percentile_ranks : boolean, default True\n", + " | If True, a \"prediction_percentile\" column will be included giving the\n", + " | percentile ranks. If no percentile rank information is available,\n", + " | this will be ignored with a warning.\n", " | \n", " | Returns\n", " | -------\n", @@ -257,6 +304,23 @@ " | incremental updates during training.\n", " | \n", " | ----------------------------------------------------------------------\n", + " | Class methods defined here:\n", + " | \n", + " | merge(predictors) from builtins.type\n", + " | Merge the ensembles of two or more Class1AffinityPredictor instances.\n", + " | \n", + " | Note: the resulting merged predictor will NOT have calibrated percentile\n", + " | ranks. Call calibrate_percentile_ranks() on it if these are needed.\n", + " | \n", + " | Parameters\n", + " | ----------\n", + " | predictors : sequence of Class1AffinityPredictor\n", + " | \n", + " | Returns\n", + " | -------\n", + " | Class1AffinityPredictor\n", + " | \n", + " | ----------------------------------------------------------------------\n", " | Static methods defined here:\n", " | \n", " | load(models_dir=None, max_models=None)\n", @@ -333,6 +397,15 @@ " | __weakref__\n", " | list of weak references to the object (if defined)\n", " | \n", + " | neural_networks\n", + " | List of the neural networks in the ensemble.\n", + " | \n", + " | Returns\n", + " | -------\n", + " | list of Class1NeuralNetwork\n", + " | \n", + " | num_networks\n", + " | \n", " | supported_alleles\n", " | Alleles for which predictions can be made.\n", " | \n", @@ -373,10 +446,17 @@ "collapsed": false }, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using TensorFlow backend.\n" + ] + }, { "data": { "text/plain": [ - "array([ 6029.07861328, 4798.79443359], dtype=float32)" + "array([ 4899.04784343, 5685.25682682])" ] }, "execution_count": 5, @@ -399,6 +479,19 @@ "data": { "text/html": [ "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", @@ -408,6 +501,7 @@ " <th>prediction</th>\n", " <th>prediction_low</th>\n", " <th>prediction_high</th>\n", + " <th>prediction_percentile</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", @@ -415,26 +509,32 @@ " <th>0</th>\n", " <td>HLA-A0201</td>\n", " <td>SIINFEKL</td>\n", - " <td>6029.078613</td>\n", - " <td>4474.103253</td>\n", - " <td>7771.292885</td>\n", + " <td>4899.047843</td>\n", + " <td>2767.763654</td>\n", + " <td>7269.683643</td>\n", + " <td>6.509787</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>HLA-A0201</td>\n", " <td>SIINFEQL</td>\n", - " <td>4798.794434</td>\n", - " <td>3089.979654</td>\n", - " <td>6757.660606</td>\n", + " <td>5685.256827</td>\n", + " <td>3815.923563</td>\n", + " <td>7476.714466</td>\n", + " <td>7.436687</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ - " allele peptide prediction prediction_low prediction_high\n", - "0 HLA-A0201 
SIINFEKL 6029.078613 4474.103253 7771.292885\n", - "1 HLA-A0201 SIINFEQL 4798.794434 3089.979654 6757.660606" + " allele peptide prediction prediction_low prediction_high \\\n", + "0 HLA-A0201 SIINFEKL 4899.047843 2767.763654 7269.683643 \n", + "1 HLA-A0201 SIINFEQL 5685.256827 3815.923563 7476.714466 \n", + "\n", + " prediction_percentile \n", + "0 6.509787 \n", + "1 7.436687 " ] }, "execution_count": 6, @@ -457,6 +557,19 @@ "data": { "text/html": [ "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", @@ -466,6 +579,7 @@ " <th>prediction</th>\n", " <th>prediction_low</th>\n", " <th>prediction_high</th>\n", + " <th>prediction_percentile</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", @@ -473,26 +587,32 @@ " <th>0</th>\n", " <td>HLA-A0201</td>\n", " <td>SIINFEKL</td>\n", - " <td>6029.080248</td>\n", - " <td>4474.103332</td>\n", - " <td>7771.295550</td>\n", + " <td>4899.047942</td>\n", + " <td>2767.763654</td>\n", + " <td>7269.683643</td>\n", + " <td>6.509787</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>HLA-B*57:01</td>\n", " <td>SIINFEQL</td>\n", - " <td>26494.172574</td>\n", - " <td>23963.167585</td>\n", - " <td>28407.840921</td>\n", + " <td>26704.220115</td>\n", + " <td>23198.059394</td>\n", + " <td>30635.114610</td>\n", + " <td>47.611925</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ - " allele peptide prediction prediction_low prediction_high\n", - "0 HLA-A0201 SIINFEKL 6029.080248 4474.103332 7771.295550\n", - "1 HLA-B*57:01 SIINFEQL 26494.172574 23963.167585 28407.840921" + " allele peptide prediction prediction_low prediction_high \\\n", + "0 HLA-A0201 SIINFEKL 4899.047942 2767.763654 7269.683643 \n", + "1 HLA-B*57:01 SIINFEQL 26704.220115 23198.059394 30635.114610 \n", + "\n", + " prediction_percentile \n", + "0 6.509787 \n", + "1 47.611925 " ] }, "execution_count": 7, @@ -515,6 +635,19 @@ "data": { "text/html": [ "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", @@ -532,6 +665,7 @@ " <th>model_single_5</th>\n", " <th>model_single_6</th>\n", " <th>model_single_7</th>\n", + " <th>prediction_percentile</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", @@ -539,33 +673,35 @@ " <th>0</th>\n", " <td>HLA-A0201</td>\n", " <td>SIINFEKL</td>\n", - " <td>6029.078613</td>\n", - " <td>4474.103253</td>\n", - " <td>7771.292885</td>\n", - " <td>6342.672852</td>\n", - " <td>5768.515625</td>\n", - " <td>6045.880371</td>\n", - " <td>6485.166016</td>\n", - " <td>4922.885742</td>\n", - " <td>4249.643066</td>\n", - " <td>7165.508301</td>\n", - " <td>8118.428711</td>\n", + " <td>4899.047843</td>\n", + " <td>2767.763654</td>\n", + " <td>7269.683643</td>\n", + " <td>5245.313773</td>\n", + " <td>4131.368053</td>\n", + " <td>4599.034976</td>\n", + " <td>7350.344042</td>\n", + " <td>2230.774145</td>\n", + " <td>6754.462616</td>\n", + " <td>4220.768251</td>\n", + " <td>7122.226730</td>\n", + " 
<td>6.509787</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>HLA-A0201</td>\n", " <td>SIINFEQL</td>\n", - " <td>4798.794434</td>\n", - " <td>3089.979654</td>\n", - " <td>6757.660606</td>\n", - " <td>4940.594727</td>\n", - " <td>5556.735352</td>\n", - " <td>4746.480469</td>\n", - " <td>4656.201172</td>\n", - " <td>3591.689453</td>\n", - " <td>2849.514893</td>\n", - " <td>6637.148926</td>\n", - " <td>6823.454590</td>\n", + " <td>5685.256827</td>\n", + " <td>3815.923563</td>\n", + " <td>7476.714466</td>\n", + " <td>5711.583293</td>\n", + " <td>5718.509616</td>\n", + " <td>5459.967973</td>\n", + " <td>7709.914896</td>\n", + " <td>3394.800793</td>\n", + " <td>7062.179615</td>\n", + " <td>4741.495660</td>\n", + " <td>6983.180148</td>\n", + " <td>7.436687</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", @@ -573,16 +709,20 @@ ], "text/plain": [ " allele peptide prediction prediction_low prediction_high \\\n", - "0 HLA-A0201 SIINFEKL 6029.078613 4474.103253 7771.292885 \n", - "1 HLA-A0201 SIINFEQL 4798.794434 3089.979654 6757.660606 \n", + "0 HLA-A0201 SIINFEKL 4899.047843 2767.763654 7269.683643 \n", + "1 HLA-A0201 SIINFEQL 5685.256827 3815.923563 7476.714466 \n", "\n", " model_single_0 model_single_1 model_single_2 model_single_3 \\\n", - "0 6342.672852 5768.515625 6045.880371 6485.166016 \n", - "1 4940.594727 5556.735352 4746.480469 4656.201172 \n", + "0 5245.313773 4131.368053 4599.034976 7350.344042 \n", + "1 5711.583293 5718.509616 5459.967973 7709.914896 \n", + "\n", + " model_single_4 model_single_5 model_single_6 model_single_7 \\\n", + "0 2230.774145 6754.462616 4220.768251 7122.226730 \n", + "1 3394.800793 7062.179615 4741.495660 6983.180148 \n", "\n", - " model_single_4 model_single_5 model_single_6 model_single_7 \n", - "0 4922.885742 4249.643066 7165.508301 8118.428711 \n", - "1 3591.689453 2849.514893 6637.148926 6823.454590 " + " prediction_percentile \n", + "0 6.509787 \n", + "1 7.436687 " ] }, "execution_count": 8, @@ -608,6 +748,19 @@ "data": { "text/html": [ "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", @@ -617,6 +770,7 @@ " <th>prediction</th>\n", " <th>prediction_low</th>\n", " <th>prediction_high</th>\n", + " <th>prediction_percentile</th>\n", " </tr>\n", " </thead>\n", " <tbody>\n", @@ -624,17 +778,19 @@ " <th>0</th>\n", " <td>HLA-A0201</td>\n", " <td>SIINFEKL</td>\n", - " <td>6029.079750</td>\n", - " <td>4474.103332</td>\n", - " <td>7771.292208</td>\n", + " <td>4899.047843</td>\n", + " <td>2767.763654</td>\n", + " <td>7269.683643</td>\n", + " <td>6.509787</td>\n", " </tr>\n", " <tr>\n", " <th>1</th>\n", " <td>HLA-A0201</td>\n", " <td>SIINFEQL</td>\n", - " <td>4798.795518</td>\n", - " <td>3089.980068</td>\n", - " <td>6757.660130</td>\n", + " <td>5685.256827</td>\n", + " <td>3815.923563</td>\n", + " <td>7476.714466</td>\n", + " <td>7.436687</td>\n", " </tr>\n", " <tr>\n", " <th>2</th>\n", @@ -643,16 +799,22 @@ " <td>NaN</td>\n", " <td>NaN</td>\n", " <td>NaN</td>\n", + " <td>100.000000</td>\n", " </tr>\n", " </tbody>\n", "</table>\n", "</div>" ], "text/plain": [ - " allele peptide prediction prediction_low prediction_high\n", - "0 HLA-A0201 SIINFEKL 6029.079750 4474.103332 7771.292208\n", - "1 HLA-A0201 SIINFEQL 4798.795518 
3089.980068 6757.660130\n", - "2 HLA-A0201 TAAAALANGGGGGGGG NaN NaN NaN" + " allele peptide prediction prediction_low prediction_high \\\n", + "0 HLA-A0201 SIINFEKL 4899.047843 2767.763654 7269.683643 \n", + "1 HLA-A0201 SIINFEQL 5685.256827 3815.923563 7476.714466 \n", + "2 HLA-A0201 TAAAALANGGGGGGGG NaN NaN NaN \n", + "\n", + " prediction_percentile \n", + "0 6.509787 \n", + "1 7.436687 \n", + "2 100.000000 " ] }, "execution_count": 9, @@ -707,7 +869,7 @@ { "data": { "text/plain": [ - "array([ 25589.67773438, 29587.88476562, 35768.203125 ], dtype=float32)" + "array([ 28227.29890915, 26568.72745054, 39043.95304442])" ] }, "execution_count": 11, @@ -763,6 +925,19 @@ "data": { "text/html": [ "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", "<table border=\"1\" class=\"dataframe\">\n", " <thead>\n", " <tr style=\"text-align: right;\">\n", @@ -1056,7 +1231,7 @@ " <td>...</td>\n", " </tr>\n", " <tr>\n", - " <th>241522</th>\n", + " <th>239917</th>\n", " <td>SLA-2*04:01</td>\n", " <td>MTAHITVPY</td>\n", " <td>50.0</td>\n", @@ -1065,7 +1240,7 @@ " <td>SLA-2*0401</td>\n", " </tr>\n", " <tr>\n", - " <th>241523</th>\n", + " <th>239918</th>\n", " <td>SLA-2*04:01</td>\n", " <td>NMTAHITVPY</td>\n", " <td>50000.0</td>\n", @@ -1074,7 +1249,7 @@ " <td>SLA-2*0401</td>\n", " </tr>\n", " <tr>\n", - " <th>241524</th>\n", + " <th>239919</th>\n", " <td>SLA-2*04:01</td>\n", " <td>NTYLSGIAQY</td>\n", " <td>50.0</td>\n", @@ -1083,7 +1258,7 @@ " <td>SLA-2*0401</td>\n", " </tr>\n", " <tr>\n", - " <th>241525</th>\n", + " <th>239920</th>\n", " <td>SLA-2*04:01</td>\n", " <td>QSSVGVTHGY</td>\n", " <td>50000.0</td>\n", @@ -1092,7 +1267,7 @@ " <td>SLA-2*0401</td>\n", " </tr>\n", " <tr>\n", - " <th>241526</th>\n", + " <th>239921</th>\n", " <td>SLA-2*04:01</td>\n", " <td>SSVGVTHGY</td>\n", " <td>50.0</td>\n", @@ -1101,7 +1276,7 @@ " <td>SLA-2*0401</td>\n", " </tr>\n", " <tr>\n", - " <th>241527</th>\n", + " <th>239922</th>\n", " <td>SLA-2*04:01</td>\n", " <td>TVYNGTSKY</td>\n", " <td>50000.0</td>\n", @@ -1110,7 +1285,7 @@ " <td>SLA-2*0401</td>\n", " </tr>\n", " <tr>\n", - " <th>241528</th>\n", + " <th>239923</th>\n", " <td>SLA-2*04:01</td>\n", " <td>YLSGIAQYY</td>\n", " <td>50.0</td>\n", @@ -1119,7 +1294,7 @@ " <td>SLA-2*0401</td>\n", " </tr>\n", " <tr>\n", - " <th>241529</th>\n", + " <th>239924</th>\n", " <td>SLA-3*02:02</td>\n", " <td>KANTQFTAV</td>\n", " <td>100.0</td>\n", @@ -1128,7 +1303,7 @@ " <td>SLA-3*0202</td>\n", " </tr>\n", " <tr>\n", - " <th>241530</th>\n", + " <th>239925</th>\n", " <td>SLA-3*02:02</td>\n", " <td>KENTQFTAV</td>\n", " <td>5000.0</td>\n", @@ -1137,7 +1312,7 @@ " <td>SLA-3*0202</td>\n", " </tr>\n", " <tr>\n", - " <th>241531</th>\n", + " <th>239926</th>\n", " <td>SLA-3*02:02</td>\n", " <td>KFNTQFTAV</td>\n", " <td>5000.0</td>\n", @@ -1146,7 +1321,7 @@ " <td>SLA-3*0202</td>\n", " </tr>\n", " <tr>\n", - " <th>241532</th>\n", + " <th>239927</th>\n", " <td>SLA-3*02:02</td>\n", " <td>KINTQFTAV</td>\n", " <td>5000.0</td>\n", @@ -1155,7 +1330,7 @@ " <td>SLA-3*0202</td>\n", " </tr>\n", " <tr>\n", - " <th>241533</th>\n", + " <th>239928</th>\n", " <td>SLA-3*02:02</td>\n", " <td>KLNTQFTAV</td>\n", " <td>5000.0</td>\n", @@ -1164,7 +1339,7 @@ " <td>SLA-3*0202</td>\n", " </tr>\n", " <tr>\n", - " <th>241534</th>\n", + " <th>239929</th>\n", " <td>SLA-3*02:02</td>\n", " 
<td>KMATQFTAV</td>\n", " <td>100.0</td>\n", @@ -1173,7 +1348,7 @@ " <td>SLA-3*0202</td>\n", " </tr>\n", " <tr>\n", - " <th>241535</th>\n", + " <th>239930</th>\n", " <td>SLA-3*02:02</td>\n", " <td>KMETQFTAV</td>\n", " <td>5000.0</td>\n", @@ -1182,7 +1357,7 @@ " <td>SLA-3*0202</td>\n", " </tr>\n", " <tr>\n", - " <th>241536</th>\n", + " <th>239931</th>\n", " <td>SLA-3*02:02</td>\n", " <td>KMFTQFTAV</td>\n", " <td>5000.0</td>\n", @@ -1191,7 +1366,7 @@ " <td>SLA-3*0202</td>\n", " </tr>\n", " <tr>\n", - " <th>241537</th>\n", + " <th>239932</th>\n", " <td>SLA-3*02:02</td>\n", " <td>KMNTAFTAV</td>\n", " <td>100.0</td>\n", @@ -1200,7 +1375,7 @@ " <td>SLA-3*0202</td>\n", " </tr>\n", " <tr>\n", - " <th>241538</th>\n", + " <th>239933</th>\n", " <td>SLA-3*02:02</td>\n", " <td>KMNTQFTAA</td>\n", " <td>100.0</td>\n", @@ -1209,7 +1384,7 @@ " <td>SLA-3*0202</td>\n", " </tr>\n", " <tr>\n", - " <th>241539</th>\n", + " <th>239934</th>\n", " <td>SLA-3*02:02</td>\n", " <td>KMNTQFTAF</td>\n", " <td>5000.0</td>\n", @@ -1218,7 +1393,7 @@ " <td>SLA-3*0202</td>\n", " </tr>\n", " <tr>\n", - " <th>241540</th>\n", + " <th>239935</th>\n", " <td>SLA-3*02:02</td>\n", " <td>KMNTQFTAI</td>\n", " <td>500.0</td>\n", @@ -1227,7 +1402,7 @@ " <td>SLA-3*0202</td>\n", " </tr>\n", " <tr>\n", - " <th>241541</th>\n", + " <th>239936</th>\n", " <td>SLA-3*02:02</td>\n", " <td>KMNTQFTAL</td>\n", " <td>5000.0</td>\n", @@ -1236,7 +1411,7 @@ " <td>SLA-3*0202</td>\n", " </tr>\n", " <tr>\n", - " <th>241542</th>\n", + " <th>239937</th>\n", " <td>SLA-3*02:02</td>\n", " <td>KMNTQFTAV</td>\n", " <td>100.0</td>\n", @@ -1245,7 +1420,7 @@ " <td>SLA-3*0202</td>\n", " </tr>\n", " <tr>\n", - " <th>241543</th>\n", + " <th>239938</th>\n", " <td>SLA-3*02:02</td>\n", " <td>KMNTQFTAV</td>\n", " <td>100.0</td>\n", @@ -1254,7 +1429,7 @@ " <td>SLA-3*0202</td>\n", " </tr>\n", " <tr>\n", - " <th>241544</th>\n", + " <th>239939</th>\n", " <td>SLA-3*02:02</td>\n", " <td>KMNTQFTAV</td>\n", " <td>100.0</td>\n", @@ -1263,7 +1438,7 @@ " <td>SLA-3*0202</td>\n", " </tr>\n", " <tr>\n", - " <th>241545</th>\n", + " <th>239940</th>\n", " <td>SLA-3*02:02</td>\n", " <td>KMQTQFTAV</td>\n", " <td>100.0</td>\n", @@ -1272,7 +1447,7 @@ " <td>SLA-3*0202</td>\n", " </tr>\n", " <tr>\n", - " <th>241546</th>\n", + " <th>239941</th>\n", " <td>SLA-3*02:02</td>\n", " <td>KMRTQFTAV</td>\n", " <td>500.0</td>\n", @@ -1281,7 +1456,7 @@ " <td>SLA-3*0202</td>\n", " </tr>\n", " <tr>\n", - " <th>241547</th>\n", + " <th>239942</th>\n", " <td>SLA-3*02:02</td>\n", " <td>KRNTQFTAV</td>\n", " <td>500.0</td>\n", @@ -1290,7 +1465,7 @@ " <td>SLA-3*0202</td>\n", " </tr>\n", " <tr>\n", - " <th>241548</th>\n", + " <th>239943</th>\n", " <td>SLA-3*02:02</td>\n", " <td>KTNTQFTAV</td>\n", " <td>5000.0</td>\n", @@ -1299,7 +1474,7 @@ " <td>SLA-3*0202</td>\n", " </tr>\n", " <tr>\n", - " <th>241549</th>\n", + " <th>239944</th>\n", " <td>SLA-3*02:02</td>\n", " <td>KVNTQFTAV</td>\n", " <td>5000.0</td>\n", @@ -1308,7 +1483,7 @@ " <td>SLA-3*0202</td>\n", " </tr>\n", " <tr>\n", - " <th>241550</th>\n", + " <th>239945</th>\n", " <td>SLA-3*02:02</td>\n", " <td>MQFSSLTV</td>\n", " <td>100.0</td>\n", @@ -1317,7 +1492,7 @@ " <td>SLA-3*0202</td>\n", " </tr>\n", " <tr>\n", - " <th>241551</th>\n", + " <th>239946</th>\n", " <td>SLA-3*02:02</td>\n", " <td>RRNYFTAEV</td>\n", " <td>100.0</td>\n", @@ -1327,7 +1502,7 @@ " </tr>\n", " </tbody>\n", "</table>\n", - "<p>241552 rows × 6 columns</p>\n", + "<p>239947 rows × 6 columns</p>\n", "</div>" ], "text/plain": [ @@ -1363,36 +1538,36 @@ "28 BoLA-2*12:01 KKSHGMGKVGK 50000.0 qualitative 
\n", "29 BoLA-2*12:01 KLHGMGKVGK 50000.0 qualitative \n", "... ... ... ... ... \n", - "241522 SLA-2*04:01 MTAHITVPY 50.0 qualitative \n", - "241523 SLA-2*04:01 NMTAHITVPY 50000.0 qualitative \n", - "241524 SLA-2*04:01 NTYLSGIAQY 50.0 qualitative \n", - "241525 SLA-2*04:01 QSSVGVTHGY 50000.0 qualitative \n", - "241526 SLA-2*04:01 SSVGVTHGY 50.0 qualitative \n", - "241527 SLA-2*04:01 TVYNGTSKY 50000.0 qualitative \n", - "241528 SLA-2*04:01 YLSGIAQYY 50.0 qualitative \n", - "241529 SLA-3*02:02 KANTQFTAV 100.0 qualitative \n", - "241530 SLA-3*02:02 KENTQFTAV 5000.0 qualitative \n", - "241531 SLA-3*02:02 KFNTQFTAV 5000.0 qualitative \n", - "241532 SLA-3*02:02 KINTQFTAV 5000.0 qualitative \n", - "241533 SLA-3*02:02 KLNTQFTAV 5000.0 qualitative \n", - "241534 SLA-3*02:02 KMATQFTAV 100.0 qualitative \n", - "241535 SLA-3*02:02 KMETQFTAV 5000.0 qualitative \n", - "241536 SLA-3*02:02 KMFTQFTAV 5000.0 qualitative \n", - "241537 SLA-3*02:02 KMNTAFTAV 100.0 qualitative \n", - "241538 SLA-3*02:02 KMNTQFTAA 100.0 qualitative \n", - "241539 SLA-3*02:02 KMNTQFTAF 5000.0 qualitative \n", - "241540 SLA-3*02:02 KMNTQFTAI 500.0 qualitative \n", - "241541 SLA-3*02:02 KMNTQFTAL 5000.0 qualitative \n", - "241542 SLA-3*02:02 KMNTQFTAV 100.0 qualitative \n", - "241543 SLA-3*02:02 KMNTQFTAV 100.0 qualitative \n", - "241544 SLA-3*02:02 KMNTQFTAV 100.0 qualitative \n", - "241545 SLA-3*02:02 KMQTQFTAV 100.0 qualitative \n", - "241546 SLA-3*02:02 KMRTQFTAV 500.0 qualitative \n", - "241547 SLA-3*02:02 KRNTQFTAV 500.0 qualitative \n", - "241548 SLA-3*02:02 KTNTQFTAV 5000.0 qualitative \n", - "241549 SLA-3*02:02 KVNTQFTAV 5000.0 qualitative \n", - "241550 SLA-3*02:02 MQFSSLTV 100.0 qualitative \n", - "241551 SLA-3*02:02 RRNYFTAEV 100.0 qualitative \n", + "239917 SLA-2*04:01 MTAHITVPY 50.0 qualitative \n", + "239918 SLA-2*04:01 NMTAHITVPY 50000.0 qualitative \n", + "239919 SLA-2*04:01 NTYLSGIAQY 50.0 qualitative \n", + "239920 SLA-2*04:01 QSSVGVTHGY 50000.0 qualitative \n", + "239921 SLA-2*04:01 SSVGVTHGY 50.0 qualitative \n", + "239922 SLA-2*04:01 TVYNGTSKY 50000.0 qualitative \n", + "239923 SLA-2*04:01 YLSGIAQYY 50.0 qualitative \n", + "239924 SLA-3*02:02 KANTQFTAV 100.0 qualitative \n", + "239925 SLA-3*02:02 KENTQFTAV 5000.0 qualitative \n", + "239926 SLA-3*02:02 KFNTQFTAV 5000.0 qualitative \n", + "239927 SLA-3*02:02 KINTQFTAV 5000.0 qualitative \n", + "239928 SLA-3*02:02 KLNTQFTAV 5000.0 qualitative \n", + "239929 SLA-3*02:02 KMATQFTAV 100.0 qualitative \n", + "239930 SLA-3*02:02 KMETQFTAV 5000.0 qualitative \n", + "239931 SLA-3*02:02 KMFTQFTAV 5000.0 qualitative \n", + "239932 SLA-3*02:02 KMNTAFTAV 100.0 qualitative \n", + "239933 SLA-3*02:02 KMNTQFTAA 100.0 qualitative \n", + "239934 SLA-3*02:02 KMNTQFTAF 5000.0 qualitative \n", + "239935 SLA-3*02:02 KMNTQFTAI 500.0 qualitative \n", + "239936 SLA-3*02:02 KMNTQFTAL 5000.0 qualitative \n", + "239937 SLA-3*02:02 KMNTQFTAV 100.0 qualitative \n", + "239938 SLA-3*02:02 KMNTQFTAV 100.0 qualitative \n", + "239939 SLA-3*02:02 KMNTQFTAV 100.0 qualitative \n", + "239940 SLA-3*02:02 KMQTQFTAV 100.0 qualitative \n", + "239941 SLA-3*02:02 KMRTQFTAV 500.0 qualitative \n", + "239942 SLA-3*02:02 KRNTQFTAV 500.0 qualitative \n", + "239943 SLA-3*02:02 KTNTQFTAV 5000.0 qualitative \n", + "239944 SLA-3*02:02 KVNTQFTAV 5000.0 qualitative \n", + "239945 SLA-3*02:02 MQFSSLTV 100.0 qualitative \n", + "239946 SLA-3*02:02 RRNYFTAEV 100.0 qualitative \n", "\n", " measurement_source original_allele \n", "0 Barlow - purified MHC/competitive/fluorescence BoLA-1*02101 \n", @@ -1426,38 +1601,38 @@ 
"28 Morrison - cellular MHC/T cell inhibition BoLA-2*01201 \n", "29 Morrison - cellular MHC/T cell inhibition BoLA-2*01201 \n", "... ... ... \n", - "241522 Golde - purified MHC SLA-2*0401 \n", - "241523 Golde - purified MHC SLA-2*0401 \n", - "241524 Golde - purified MHC SLA-2*0401 \n", - "241525 Golde - purified MHC SLA-2*0401 \n", - "241526 Golde - purified MHC SLA-2*0401 \n", - "241527 Golde - purified MHC SLA-2*0401 \n", - "241528 Golde - purified MHC SLA-2*0401 \n", - "241529 Xia - purified MHC/direct/fluorescence SLA-3*0202 \n", - "241530 Xia - purified MHC SLA-3*0202 \n", - "241531 Xia - purified MHC SLA-3*0202 \n", - "241532 Xia - purified MHC SLA-3*0202 \n", - "241533 Xia - purified MHC SLA-3*0202 \n", - "241534 Xia - purified MHC/direct/fluorescence SLA-3*0202 \n", - "241535 Xia - purified MHC SLA-3*0202 \n", - "241536 Xia - purified MHC SLA-3*0202 \n", - "241537 Xia - purified MHC/direct/fluorescence SLA-3*0202 \n", - "241538 Xia - purified MHC/direct/fluorescence SLA-3*0202 \n", - "241539 Xia - purified MHC SLA-3*0202 \n", - "241540 Xia - purified MHC SLA-3*0202 \n", - "241541 Xia - purified MHC SLA-3*0202 \n", - "241542 Xia - x-ray crystallography SLA-3*0202 \n", - "241543 Xia - purified MHC SLA-3*0202 \n", - "241544 Xia - purified MHC/direct/fluorescence SLA-3*0202 \n", - "241545 Xia - purified MHC SLA-3*0202 \n", - "241546 Xia - purified MHC SLA-3*0202 \n", - "241547 Xia - purified MHC SLA-3*0202 \n", - "241548 Xia - purified MHC SLA-3*0202 \n", - "241549 Xia - purified MHC SLA-3*0202 \n", - "241550 Xia - purified MHC SLA-3*0202 \n", - "241551 Xia - purified MHC SLA-3*0202 \n", + "239917 Golde - purified MHC SLA-2*0401 \n", + "239918 Golde - purified MHC SLA-2*0401 \n", + "239919 Golde - purified MHC SLA-2*0401 \n", + "239920 Golde - purified MHC SLA-2*0401 \n", + "239921 Golde - purified MHC SLA-2*0401 \n", + "239922 Golde - purified MHC SLA-2*0401 \n", + "239923 Golde - purified MHC SLA-2*0401 \n", + "239924 Xia - purified MHC/direct/fluorescence SLA-3*0202 \n", + "239925 Xia - purified MHC SLA-3*0202 \n", + "239926 Xia - purified MHC SLA-3*0202 \n", + "239927 Xia - purified MHC SLA-3*0202 \n", + "239928 Xia - purified MHC SLA-3*0202 \n", + "239929 Xia - purified MHC/direct/fluorescence SLA-3*0202 \n", + "239930 Xia - purified MHC SLA-3*0202 \n", + "239931 Xia - purified MHC SLA-3*0202 \n", + "239932 Xia - purified MHC/direct/fluorescence SLA-3*0202 \n", + "239933 Xia - purified MHC/direct/fluorescence SLA-3*0202 \n", + "239934 Xia - purified MHC SLA-3*0202 \n", + "239935 Xia - purified MHC SLA-3*0202 \n", + "239936 Xia - purified MHC SLA-3*0202 \n", + "239937 Xia - x-ray crystallography SLA-3*0202 \n", + "239938 Xia - purified MHC SLA-3*0202 \n", + "239939 Xia - purified MHC/direct/fluorescence SLA-3*0202 \n", + "239940 Xia - purified MHC SLA-3*0202 \n", + "239941 Xia - purified MHC SLA-3*0202 \n", + "239942 Xia - purified MHC SLA-3*0202 \n", + "239943 Xia - purified MHC SLA-3*0202 \n", + "239944 Xia - purified MHC SLA-3*0202 \n", + "239945 Xia - purified MHC SLA-3*0202 \n", + "239946 Xia - purified MHC SLA-3*0202 \n", "\n", - "[241552 rows x 6 columns]" + "[239947 rows x 6 columns]" ] }, "execution_count": 13, @@ -1502,16 +1677,17 @@ " 'left_edge': 4,\n", " 'locally_connected_layers': [{'activation': 'tanh',\n", " 'filters': 8,\n", - " 'kernel_size': 3},\n", - " {'activation': 'tanh', 'filters': 8, 'kernel_size': 3}],\n", + " 'kernel_size': 3}],\n", " 'loss': 'mse',\n", " 'max_epochs': 500,\n", " 'min_delta': 0,\n", + " 'minibatch_size': 128,\n", " 'mode': 'auto',\n", " 
'monitor': 'val_loss',\n", " 'optimizer': 'rmsprop',\n", " 'output_activation': 'sigmoid',\n", " 'patience': 10,\n", + " 'peptide_amino_acid_encoding': 'one-hot',\n", " 'pseudosequence_use_embedding': False,\n", " 'random_negative_affinity_max': 50000.0,\n", " 'random_negative_affinity_min': 20000.0,\n", @@ -1539,7 +1715,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "metadata": { "collapsed": false }, @@ -1548,76 +1724,54 @@ "name": "stdout", "output_type": "stream", "text": [ - "Train on 2489 samples, validate on 623 samples\n", - "Epoch 1/1\n", - "2489/2489 [==============================] - 0s - loss: 0.1778 - val_loss: 0.1229\n", - "Train on 2489 samples, validate on 623 samples\n", - "Epoch 1/1\n", - "2489/2489 [==============================] - 0s - loss: 0.0903 - val_loss: 0.0624\n", - "Train on 2489 samples, validate on 623 samples\n", - "Epoch 1/1\n", - "2489/2489 [==============================] - 0s - loss: 0.0452 - val_loss: 0.0367\n", - "Train on 2489 samples, validate on 623 samples\n", - "Epoch 1/1\n", - "2489/2489 [==============================] - 0s - loss: 0.0298 - val_loss: 0.0283\n", - "Train on 2489 samples, validate on 623 samples\n", - "Epoch 1/1\n", - "2489/2489 [==============================] - 0s - loss: 0.0255 - val_loss: 0.0259\n", - "Train on 2489 samples, validate on 623 samples\n", + "Train on 2491 samples, validate on 623 samples\n", "Epoch 1/1\n", - "2489/2489 [==============================] - 0s - loss: 0.0233 - val_loss: 0.0250\n", - "Train on 2489 samples, validate on 623 samples\n", + "2491/2491 [==============================] - 0s 36us/step - loss: 0.0173 - val_loss: 0.0270\n", + "Epoch 0 / 500: loss=0.0173307. Min val loss (None) at epoch None\n", + "Train on 2491 samples, validate on 623 samples\n", "Epoch 1/1\n", - "2489/2489 [==============================] - 0s - loss: 0.0219 - val_loss: 0.0242\n", - "Train on 2489 samples, validate on 623 samples\n", + "2491/2491 [==============================] - 0s 41us/step - loss: 0.0175 - val_loss: 0.0262\n", + "Train on 2491 samples, validate on 623 samples\n", "Epoch 1/1\n", - "2489/2489 [==============================] - 0s - loss: 0.0211 - val_loss: 0.0254\n", - "Train on 2489 samples, validate on 623 samples\n", + "2491/2491 [==============================] - 0s 39us/step - loss: 0.0170 - val_loss: 0.0278\n", + "Train on 2491 samples, validate on 623 samples\n", "Epoch 1/1\n", - "2489/2489 [==============================] - 0s - loss: 0.0206 - val_loss: 0.0275\n", - "Train on 2489 samples, validate on 623 samples\n", + "2491/2491 [==============================] - 0s 35us/step - loss: 0.0175 - val_loss: 0.0284\n", + "Train on 2491 samples, validate on 623 samples\n", "Epoch 1/1\n", - "2489/2489 [==============================] - 0s - loss: 0.0197 - val_loss: 0.0241\n", - "Train on 2489 samples, validate on 623 samples\n", + "2491/2491 [==============================] - 0s 34us/step - loss: 0.0174 - val_loss: 0.0276\n", + "Train on 2491 samples, validate on 623 samples\n", "Epoch 1/1\n", - "2489/2489 [==============================] - 0s - loss: 0.0194 - val_loss: 0.0252\n", - "Train on 2489 samples, validate on 623 samples\n", + "2491/2491 [==============================] - 0s 59us/step - loss: 0.0171 - val_loss: 0.0272\n", + "Train on 2491 samples, validate on 623 samples\n", "Epoch 1/1\n", - "2489/2489 [==============================] - 0s - loss: 0.0190 - val_loss: 0.0273\n", - "Train on 2489 samples, validate on 623 samples\n", + "2491/2491 
[==============================] - 0s 33us/step - loss: 0.0171 - val_loss: 0.0273\n", + "Train on 2491 samples, validate on 623 samples\n", "Epoch 1/1\n", - "2489/2489 [==============================] - 0s - loss: 0.0187 - val_loss: 0.0247\n", - "Train on 2489 samples, validate on 623 samples\n", + "2491/2491 [==============================] - 0s 49us/step - loss: 0.0169 - val_loss: 0.0268\n", + "Train on 2491 samples, validate on 623 samples\n", "Epoch 1/1\n", - "2489/2489 [==============================] - 0s - loss: 0.0180 - val_loss: 0.0254\n", - "Train on 2489 samples, validate on 623 samples\n", + "2491/2491 [==============================] - 0s 66us/step - loss: 0.0171 - val_loss: 0.0273\n", + "Train on 2491 samples, validate on 623 samples\n", "Epoch 1/1\n", - "2489/2489 [==============================] - 0s - loss: 0.0179 - val_loss: 0.0244\n", - "Train on 2489 samples, validate on 623 samples\n", + "2491/2491 [==============================] - 0s 35us/step - loss: 0.0169 - val_loss: 0.0268\n", + "Train on 2491 samples, validate on 623 samples\n", "Epoch 1/1\n", - "2489/2489 [==============================] - 0s - loss: 0.0177 - val_loss: 0.0258\n", - "Train on 2489 samples, validate on 623 samples\n", + "2491/2491 [==============================] - 0s 31us/step - loss: 0.0168 - val_loss: 0.0275\n", + "Train on 2491 samples, validate on 623 samples\n", "Epoch 1/1\n", - "2489/2489 [==============================] - 0s - loss: 0.0176 - val_loss: 0.0278\n", - "Train on 2489 samples, validate on 623 samples\n", + "2491/2491 [==============================] - 0s 41us/step - loss: 0.0169 - val_loss: 0.0275\n", + "Train on 2491 samples, validate on 623 samples\n", "Epoch 1/1\n", - "2489/2489 [==============================] - 0s - loss: 0.0171 - val_loss: 0.0269\n", - "Train on 2489 samples, validate on 623 samples\n", - "Epoch 1/1\n", - "2489/2489 [==============================] - 0s - loss: 0.0169 - val_loss: 0.0277\n", - "Train on 2489 samples, validate on 623 samples\n", - "Epoch 1/1\n", - "2489/2489 [==============================] - 0s - loss: 0.0169 - val_loss: 0.0300\n", - "Train on 2489 samples, validate on 623 samples\n", - "Epoch 1/1\n", - "2489/2489 [==============================] - 0s - loss: 0.0166 - val_loss: 0.0263\n", - "CPU times: user 9.31 s, sys: 204 ms, total: 9.52 s\n", - "Wall time: 10.7 s\n" + "2491/2491 [==============================] - 0s 47us/step - loss: 0.0169 - val_loss: 0.0298\n", + "Early stopping at epoch 12 / 500: loss=0.0168712. 
Min val loss (0.0261514389179) at epoch 1\n", + "CPU times: user 1.92 s, sys: 167 ms, total: 2.09 s\n", + "Wall time: 1.72 s\n" ] } ], "source": [ - "train_data = data_df.ix[\n", + "train_data = data_df.loc[\n", " (data_df.allele == \"HLA-B*57:01\") &\n", " (data_df.peptide.str.len() >= 8) &\n", " (data_df.peptide.str.len() <= 15)\n", @@ -1627,7 +1781,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": { "collapsed": false }, @@ -1635,10 +1789,10 @@ { "data": { "text/plain": [ - "array([ 25132.52929688], dtype=float32)" + "array([ 26802.58186135])" ] }, - "execution_count": 16, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1656,7 +1810,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": { "collapsed": false }, @@ -1665,45 +1819,46 @@ "name": "stdout", "output_type": "stream", "text": [ - "Train on 2489 samples, validate on 623 samples\n", + "Train on 2491 samples, validate on 623 samples\n", "Epoch 1/1\n", - "2489/2489 [==============================] - 0s - loss: 0.1773 - val_loss: 0.1200\n", - "Train on 2489 samples, validate on 623 samples\n", + "2491/2491 [==============================] - 1s 204us/step - loss: 0.2511 - val_loss: 0.2062\n", + "Epoch 0 / 10: loss=0.251054. Min val loss (None) at epoch None\n", + "Train on 2491 samples, validate on 623 samples\n", "Epoch 1/1\n", - "2489/2489 [==============================] - 0s - loss: 0.0896 - val_loss: 0.0625\n", - "Train on 2489 samples, validate on 623 samples\n", + "2491/2491 [==============================] - 0s 56us/step - loss: 0.1951 - val_loss: 0.1758\n", + "Train on 2491 samples, validate on 623 samples\n", "Epoch 1/1\n", - "2489/2489 [==============================] - 0s - loss: 0.0451 - val_loss: 0.0360\n", - "Train on 2489 samples, validate on 623 samples\n", + "2491/2491 [==============================] - 0s 52us/step - loss: 0.1658 - val_loss: 0.1494\n", + "Train on 2491 samples, validate on 623 samples\n", "Epoch 1/1\n", - "2489/2489 [==============================] - 0s - loss: 0.0297 - val_loss: 0.0287\n", - "Train on 2489 samples, validate on 623 samples\n", + "2491/2491 [==============================] - 0s 50us/step - loss: 0.1401 - val_loss: 0.1259\n", + "Train on 2491 samples, validate on 623 samples\n", "Epoch 1/1\n", - "2489/2489 [==============================] - 0s - loss: 0.0253 - val_loss: 0.0271\n", - "Train on 2489 samples, validate on 623 samples\n", + "2491/2491 [==============================] - 0s 48us/step - loss: 0.1171 - val_loss: 0.1059\n", + "Train on 2491 samples, validate on 623 samples\n", "Epoch 1/1\n", - "2489/2489 [==============================] - 0s - loss: 0.0234 - val_loss: 0.0257\n", - "Train on 2489 samples, validate on 623 samples\n", + "2491/2491 [==============================] - 0s 52us/step - loss: 0.0965 - val_loss: 0.0869\n", + "Train on 2491 samples, validate on 623 samples\n", "Epoch 1/1\n", - "2489/2489 [==============================] - 0s - loss: 0.0218 - val_loss: 0.0249\n", - "Train on 2489 samples, validate on 623 samples\n", + "2491/2491 [==============================] - 0s 40us/step - loss: 0.0783 - val_loss: 0.0716\n", + "Train on 2491 samples, validate on 623 samples\n", "Epoch 1/1\n", - "2489/2489 [==============================] - 0s - loss: 0.0213 - val_loss: 0.0240\n", - "Train on 2489 samples, validate on 623 samples\n", + "2491/2491 [==============================] - 0s 46us/step - loss: 0.0631 - val_loss: 0.0586\n", + "Train on 2491 samples, 
validate on 623 samples\n", "Epoch 1/1\n", - "2489/2489 [==============================] - 0s - loss: 0.0200 - val_loss: 0.0238\n", - "Train on 2489 samples, validate on 623 samples\n", + "2491/2491 [==============================] - 0s 54us/step - loss: 0.0514 - val_loss: 0.0486\n", + "Train on 2491 samples, validate on 623 samples\n", "Epoch 1/1\n", - "2489/2489 [==============================] - 0s - loss: 0.0197 - val_loss: 0.0243\n" + "2491/2491 [==============================] - 0s 36us/step - loss: 0.0433 - val_loss: 0.0421\n" ] }, { "data": { "text/plain": [ - "<generator object Class1AffinityPredictor._fit_predictors at 0x1252b5fc0>" + "[<mhcflurry.class1_neural_network.Class1NeuralNetwork at 0x124ad7d30>]" ] }, - "execution_count": 17, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1723,7 +1878,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "metadata": { "collapsed": false }, @@ -1731,10 +1886,10 @@ { "data": { "text/plain": [ - "array([ 25200.29882812], dtype=float32)" + "array([ 17405.26823281])" ] }, - "execution_count": 18, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -1752,7 +1907,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": { "collapsed": false }, @@ -1761,11 +1916,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "mkdir: /tmp/saved-affinity-predictor: File exists\n", - "manifest.csv\n", - "weights_HLA-B*57:01-0-7d2f64641ccdd312.npz\n", - "weights_HLA-B*57:01-0-93498abc9bbd5291.npz\n", - "weights_HLA-B*57:01-0-9e5317fef54dc1f6.npz\n" + "manifest.csv\r\n", + "weights_HLA-B*57:01-0-7be58b1094489f2d.npz\r\n" ] } ], @@ -1777,7 +1929,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "metadata": { "collapsed": false }, @@ -1785,10 +1937,10 @@ { "data": { "text/plain": [ - "array([ 25200.29882812], dtype=float32)" + "array([ 17405.26823281])" ] }, - "execution_count": 20, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } diff --git a/mhcflurry/__init__.py b/mhcflurry/__init__.py index cf13e1e6339778da3c658e5d9b64afddfac31ecb..600882fd9693f56f8cc89f45c91f8906ee899d3d 100644 --- a/mhcflurry/__init__.py +++ b/mhcflurry/__init__.py @@ -1,26 +1,10 @@ -# Copyright (c) 2015. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
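Two API changes are visible in the regenerated notebook output above: `fit_allele_specific_predictors` now returns a list of `Class1NeuralNetwork` instances rather than a generator, and the deprecated `pandas.DataFrame.ix` indexer is replaced with `.loc`. A minimal sketch of the `.loc` pattern the notebook now uses, on synthetic stand-in data:

```python
import pandas

# Toy stand-in for the curated training dataframe used in the notebook.
data_df = pandas.DataFrame({
    "allele": ["HLA-B*57:01", "HLA-B*57:01", "HLA-A*02:01"],
    "peptide": ["AAAWYLWEV", "KKK", "SIINFEKL"],
})

# Boolean-mask selection with .loc; .ix is deprecated in modern pandas.
train_data = data_df.loc[
    (data_df.allele == "HLA-B*57:01") &
    (data_df.peptide.str.len() >= 8) &
    (data_df.peptide.str.len() <= 15)
]
print(train_data)  # keeps only the 8-15mer HLA-B*57:01 row
```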
+from mhcflurry.class1_affinity_predictor import Class1AffinityPredictor +from mhcflurry.class1_neural_network import Class1NeuralNetwork -from .class1_affinity_prediction.class1_neural_network import ( - Class1NeuralNetwork) -from .class1_affinity_prediction.class1_affinity_predictor import ( - Class1AffinityPredictor) - -__version__ = "0.9.3" +__version__ = "1.0.0" __all__ = [ - "Class1NeuralNetwork", - "Class1AffinityPredictor", "__version__", + "Class1AffinityPredictor", + "Class1NeuralNetwork", ] diff --git a/mhcflurry/amino_acid.py b/mhcflurry/amino_acid.py index e5e59cf1acb5f84777fbd756b84448ad0ba62665..1077c64443c505ecb6961df50250e8ea89e586ee 100644 --- a/mhcflurry/amino_acid.py +++ b/mhcflurry/amino_acid.py @@ -1,16 +1,7 @@ -# Copyright (c) 2016. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +""" +Functions for encoding fixed length sequences of amino acids into various +vector representations, such as one-hot and BLOSUM62. +""" from __future__ import ( print_function, @@ -20,6 +11,10 @@ from __future__ import ( import collections from copy import copy +import pandas +from six import StringIO + + COMMON_AMINO_ACIDS = collections.OrderedDict(sorted({ "A": "Alanine", "R": "Arginine", @@ -47,3 +42,118 @@ COMMON_AMINO_ACIDS_WITH_UNKNOWN["X"] = "Unknown" AMINO_ACID_INDEX = dict( (letter, i) for (i, letter) in enumerate(COMMON_AMINO_ACIDS_WITH_UNKNOWN)) + +AMINO_ACIDS = list(COMMON_AMINO_ACIDS_WITH_UNKNOWN.keys()) + +BLOSUM62_MATRIX = pandas.read_table(StringIO(""" + A R N D C Q E G H I L K M F P S T W Y V X +A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 0 +R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 0 +N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 0 +D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 0 +C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 0 +Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 +E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 0 +G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 0 +H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 +I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 0 +L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 0 +K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 +M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 0 +F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 0 +P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 0 +S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 +T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 0 +W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 0 +Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 0 +V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 0 +X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 +"""), sep='\s+').loc[AMINO_ACIDS, AMINO_ACIDS] +assert (BLOSUM62_MATRIX == BLOSUM62_MATRIX.T).all().all() + +ENCODING_DATA_FRAMES = { + "BLOSUM62": BLOSUM62_MATRIX, + "one-hot": pandas.DataFrame([ + [1 if i == j else 0 for i in 
range(len(AMINO_ACIDS))]
+        for j in range(len(AMINO_ACIDS))
+    ], index=AMINO_ACIDS, columns=AMINO_ACIDS)
+}
+
+
+def available_vector_encodings():
+    """
+    Return list of supported amino acid vector encodings.
+
+    Returns
+    -------
+    list of string
+
+    """
+    return list(ENCODING_DATA_FRAMES)
+
+
+def vector_encoding_length(name):
+    """
+    Return the length of the given vector encoding.
+
+    Parameters
+    ----------
+    name : string
+
+    Returns
+    -------
+    int
+    """
+    return ENCODING_DATA_FRAMES[name].shape[1]
+
+
+def index_encoding(sequences, letter_to_index_dict):
+    """
+    Encode a sequence of same-length strings to a matrix of integers of the
+    same shape. The map from characters to integers is given by
+    `letter_to_index_dict`.
+
+    Given a sequence of `n` strings all of length `k`, return an `n * k` array where
+    the (`i`, `j`)th element is `letter_to_index_dict[sequences[i][j]]`.
+
+    Parameters
+    ----------
+    sequences : list of length n of strings of length k
+    letter_to_index_dict : dict : string -> int
+
+    Returns
+    -------
+    numpy.array of integers with shape (`n`, `k`)
+    """
+    df = pandas.DataFrame(iter(s) for s in sequences)
+    result = df.replace(letter_to_index_dict)
+    return result.values
+
+
+def fixed_vectors_encoding(index_encoded_sequences, letter_to_vector_df):
+    """
+    Given an `n` x `k` matrix of integers such as that returned by `index_encoding()` and
+    a dataframe mapping each index to an arbitrary vector, return an `n * k * m`
+    array where the (`i`, `j`)'th element is `letter_to_vector_df.iloc[index_encoded_sequences[i][j]]`.
+
+    The dataframe index and column names are ignored here; the indexing is done
+    entirely by integer position in the dataframe.
+
+    Parameters
+    ----------
+    index_encoded_sequences : `n` x `k` array of integers
+
+    letter_to_vector_df : pandas.DataFrame of shape (`alphabet size`, `m`)
+
+    Returns
+    -------
+    numpy.array with shape (`n`, `k`, `m`)
+    """
+    (num_sequences, sequence_length) = index_encoded_sequences.shape
+    target_shape = (
+        num_sequences, sequence_length, letter_to_vector_df.shape[1])
+    result = letter_to_vector_df.iloc[
+        index_encoded_sequences.flat
+    ].values.reshape(target_shape)
+    return result
+
diff --git a/mhcflurry/class1_affinity_prediction/__init__.py b/mhcflurry/class1_affinity_prediction/__init__.py
deleted file mode 100644
index 96707e88331792c6a4cc0d0214ef656b7522bfff..0000000000000000000000000000000000000000
--- a/mhcflurry/class1_affinity_prediction/__init__.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from __future__ import absolute_import
-
-from .class1_neural_network import Class1NeuralNetwork
-from .class1_affinity_predictor import Class1AffinityPredictor
-
-__all__ = [
-    'Class1NeuralNetwork',
-    'Class1AffinityPredictor',
-]
diff --git a/mhcflurry/class1_affinity_prediction/train_allele_specific_models_command.py b/mhcflurry/class1_affinity_prediction/train_allele_specific_models_command.py
deleted file mode 100644
index 1128ca99a3dfaef999d2b131ea3f6865faf53127..0000000000000000000000000000000000000000
--- a/mhcflurry/class1_affinity_prediction/train_allele_specific_models_command.py
+++ /dev/null
@@ -1,131 +0,0 @@
-"""
-Train Class1 single allele models.
-
-"""
-import os
-import sys
-import argparse
-import yaml
-
-import pandas
-
-from .class1_affinity_predictor import Class1AffinityPredictor
-from ..common import configure_logging
-
-
-parser = argparse.ArgumentParser(usage=__doc__)
-
-parser.add_argument(
-    "--data",
-    metavar="FILE.csv",
-    required=True,
-    help=(
-        "Training data CSV. 
Expected columns: " - "allele, peptide, measurement_value")) -parser.add_argument( - "--out-models-dir", - metavar="DIR", - required=True, - help="Directory to write models and manifest") -parser.add_argument( - "--hyperparameters", - metavar="FILE.json", - required=True, - help="JSON or YAML of hyperparameters") -parser.add_argument( - "--allele", - default=None, - nargs="+", - help="Alleles to train models for. If not specified, all alleles with " - "enough measurements will be used.") -parser.add_argument( - "--min-measurements-per-allele", - type=int, - metavar="N", - default=50, - help="Train models for alleles with >=N measurements.") -parser.add_argument( - "--only-quantitative", - action="store_true", - default=False, - help="Use only quantitative training data") -parser.add_argument( - "--verbosity", - type=int, - help="Keras verbosity. Default: %(default)s", - default=1) - - -def run(argv=sys.argv[1:]): - args = parser.parse_args(argv) - - configure_logging(verbose=args.verbosity > 1) - - hyperparameters_lst = yaml.load(open(args.hyperparameters)) - assert isinstance(hyperparameters_lst, list) - print("Loaded hyperparameters list: %s" % str(hyperparameters_lst)) - - df = pandas.read_csv(args.data) - print("Loaded training data: %s" % (str(df.shape))) - - df = df.ix[ - (df.peptide.str.len() >= 8) & (df.peptide.str.len() <= 15) - ] - print("Subselected to 8-15mers: %s" % (str(df.shape))) - - if args.only_quantitative: - df = df.loc[ - df.measurement_type == "quantitative" - ] - print("Subselected to quantitative: %s" % (str(df.shape))) - - allele_counts = df.allele.value_counts() - - if args.allele: - alleles = args.allele - df = df.ix[df.allele.isin(alleles)] - else: - alleles = list(allele_counts.ix[ - allele_counts > args.min_measurements_per_allele - ].index) - - print("Selected %d alleles: %s" % (len(alleles), ' '.join(alleles))) - print("Training data: %s" % (str(df.shape))) - - predictor = Class1AffinityPredictor() - - if args.out_models_dir and not os.path.exists(args.out_models_dir): - print("Attempting to create directory: %s" % args.out_models_dir) - os.mkdir(args.out_models_dir) - print("Done.") - - for (h, hyperparameters) in enumerate(hyperparameters_lst): - n_models = hyperparameters.pop("n_models") - - for model_group in range(n_models): - for (i, allele) in enumerate(alleles): - print( - "[%2d / %2d hyperparameters] " - "[%2d / %2d replicates] " - "[%4d / %4d alleles]: %s" % ( - h + 1, - len(hyperparameters_lst), - model_group + 1, - n_models, - i + 1, - len(alleles), allele)) - - train_data = df.ix[df.allele == allele].dropna().sample( - frac=1.0) - - predictor.fit_allele_specific_predictors( - n_models=1, - architecture_hyperparameters=hyperparameters, - allele=allele, - peptides=train_data.peptide.values, - affinities=train_data.measurement_value.values, - models_dir_for_save=args.out_models_dir) - - -if __name__ == '__main__': - run() diff --git a/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py b/mhcflurry/class1_affinity_predictor.py similarity index 64% rename from mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py rename to mhcflurry/class1_affinity_predictor.py index bfba10cb52c896dd1bd139d08b5fe67e3918510c..1c523e35be470a8eecdf5ae218a9a110b29ec075 100644 --- a/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py +++ b/mhcflurry/class1_affinity_predictor.py @@ -1,55 +1,63 @@ import collections -import time import hashlib import json -from os.path import join, exists -from six import string_types import logging 
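Stepping back to the encoding helpers added in `mhcflurry/amino_acid.py` above: peptides of a common length are first index-encoded, then expanded to per-residue vectors. A minimal sketch, assuming the module layout this diff introduces:

```python
from mhcflurry.amino_acid import (
    AMINO_ACID_INDEX,
    ENCODING_DATA_FRAMES,
    fixed_vectors_encoding,
    index_encoding,
)

peptides = ["SIINFEKL", "SIINFEKD"]  # must all have the same length

# Shape (2, 8): one integer index per residue.
indices = index_encoding(peptides, AMINO_ACID_INDEX)

# Shape (2, 8, 21): one BLOSUM62 row per residue (21 = 20 amino acids + X).
blosum = fixed_vectors_encoding(indices, ENCODING_DATA_FRAMES["BLOSUM62"])
print(indices.shape, blosum.shape)
```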
+import sys +import time +import warnings +from os.path import join, exists +from os import mkdir +import mhcnames import numpy import pandas +from numpy.testing import assert_equal +from six import string_types -import mhcnames - -from ..encodable_sequences import EncodableSequences -from ..downloads import get_path - -from .class1_neural_network import Class1NeuralNetwork +from mhcflurry.class1_neural_network import Class1NeuralNetwork +from mhcflurry.common import random_peptides +from mhcflurry.downloads import get_path +from mhcflurry.encodable_sequences import EncodableSequences +from mhcflurry.percent_rank_transform import PercentRankTransform +from mhcflurry.regression_target import to_ic50 class Class1AffinityPredictor(object): """ High-level interface for peptide/MHC I binding affinity prediction. - - This is the class most users will want to use. - - This class delegates to one or more `Class1NeuralNetwork` instances. - It supports prediction across multiple alleles using ensembles of single- - or pan-allele predictors. + + This class manages low-level `Class1NeuralNetwork` instances, each of which + wraps a single Keras network. The purpose of `Class1AffinityPredictor` is to + implement ensembles, handling of multiple alleles, and predictor loading and + saving. """ def __init__( self, allele_to_allele_specific_models=None, class1_pan_allele_models=None, allele_to_pseudosequence=None, - manifest_df=None): + manifest_df=None, + allele_to_percent_rank_transform=None): """ Parameters ---------- - allele_to_allele_specific_models : dict of string -> list of Class1NeuralNetwork + allele_to_allele_specific_models : dict of string -> list of `Class1NeuralNetwork` Ensemble of single-allele models to use for each allele. - class1_pan_allele_models : list of Class1NeuralNetwork + class1_pan_allele_models : list of `Class1NeuralNetwork` Ensemble of pan-allele models. allele_to_pseudosequence : dict of string -> string Required only if class1_pan_allele_models is specified. - manifest_df : pandas.DataFrame, optional + manifest_df : `pandas.DataFrame`, optional Must have columns: model_name, allele, config_json, model. Only required if you want to update an existing serialization of a Class1AffinityPredictor. Otherwise this dataframe will be generated automatically based on the supplied models. + + allele_to_percent_rank_transform : dict of string -> `PercentRankTransform`, optional + `PercentRankTransform` instances to use for each allele """ if allele_to_allele_specific_models is None: @@ -86,6 +94,63 @@ class Class1AffinityPredictor(object): columns=["model_name", "allele", "config_json", "model"]) self.manifest_df = manifest_df + if not allele_to_percent_rank_transform: + allele_to_percent_rank_transform = {} + self.allele_to_percent_rank_transform = allele_to_percent_rank_transform + + @property + def neural_networks(self): + """ + List of the neural networks in the ensemble. + + Returns + ------- + list of `Class1NeuralNetwork` + """ + result = [] + for models in self.allele_to_allele_specific_models.values(): + result.extend(models) + result.extend(self.class1_pan_allele_models) + return result + + @classmethod + def merge(cls, predictors): + """ + Merge the ensembles of two or more `Class1AffinityPredictor` instances. + + Note: the resulting merged predictor will NOT have calibrated percentile + ranks. Call `calibrate_percentile_ranks` on it if these are needed. 
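An illustrative sketch of the new `merge` classmethod (whose parameters and body continue below) together with the `neural_networks` property; the model paths here are hypothetical. As the docstring notes, merging drops percentile-rank calibration, so re-calibrate if ranks are needed:

```python
from mhcflurry import Class1AffinityPredictor

predictor_a = Class1AffinityPredictor.load("/path/to/models-a")
predictor_b = Class1AffinityPredictor.load("/path/to/models-b")

# Pool the ensembles; per-allele model lists are concatenated.
merged = Class1AffinityPredictor.merge([predictor_a, predictor_b])
print(len(merged.neural_networks))

# Merging does not carry over percentile-rank calibration.
merged.calibrate_percentile_ranks(
    alleles=["HLA-A*02:01"], num_peptides_per_length=10000)

# save() now creates the target directory if it does not exist.
merged.save("/tmp/merged-models")
```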
+ + Parameters + ---------- + predictors : sequence of `Class1AffinityPredictor` + + Returns + ------- + `Class1AffinityPredictor` instance + + """ + assert len(predictors) > 0 + if len(predictors) == 1: + return predictors[0] + + allele_to_allele_specific_models = collections.defaultdict(list) + class1_pan_allele_models = [] + allele_to_pseudosequence = predictors[0].allele_to_pseudosequence + + for predictor in predictors: + for (allele, networks) in ( + predictor.allele_to_allele_specific_models.items()): + allele_to_allele_specific_models[allele].extend(networks) + class1_pan_allele_models.extend( + predictor.class1_pan_allele_models) + + return Class1AffinityPredictor( + allele_to_allele_specific_models=allele_to_allele_specific_models, + class1_pan_allele_models=class1_pan_allele_models, + allele_to_pseudosequence=allele_to_pseudosequence + ) + @property def supported_alleles(self): """ @@ -121,7 +186,8 @@ class Class1AffinityPredictor(object): def save(self, models_dir, model_names_to_write=None): """ - Serialize the predictor to a directory on disk. + Serialize the predictor to a directory on disk. If the directory does + not exist it will be created. The serialization format consists of a file called "manifest.csv" with the configurations of each Class1NeuralNetwork, along with per-network @@ -148,6 +214,9 @@ class Class1AffinityPredictor(object): # Write all models model_names_to_write = self.manifest_df.model_name.values + if not exists(models_dir): + mkdir(models_dir) + sub_manifest_df = self.manifest_df.ix[ self.manifest_df.model_name.isin(model_names_to_write) ] @@ -165,6 +234,21 @@ class Class1AffinityPredictor(object): write_manifest_df.to_csv(manifest_path, index=False) logging.info("Wrote: %s" % manifest_path) + if self.allele_to_percent_rank_transform: + percent_ranks_df = None + for (allele, transform) in self.allele_to_percent_rank_transform.items(): + series = transform.to_series() + if percent_ranks_df is None: + percent_ranks_df = pandas.DataFrame(index=series.index) + assert_equal(series.index.values, percent_ranks_df.index.values) + percent_ranks_df[allele] = series + percent_ranks_path = join(models_dir, "percent_ranks.csv") + percent_ranks_df.to_csv( + percent_ranks_path, + index=True, + index_label="bin") + logging.info("Wrote: %s" % percent_ranks_path) + @staticmethod def load(models_dir=None, max_models=None): """ @@ -176,11 +260,11 @@ class Class1AffinityPredictor(object): Path to directory max_models : int, optional - Maximum number of Class1NeuralNetwork instances to load + Maximum number of `Class1NeuralNetwork` instances to load Returns ------- - Class1AffinityPredictor + `Class1AffinityPredictor` instance """ if models_dir is None: models_dir = get_path("models_class1", "models") @@ -211,11 +295,20 @@ class Class1AffinityPredictor(object): join(models_dir, "pseudosequences.csv"), index_col="allele").to_dict() + allele_to_percent_rank_transform = {} + percent_ranks_path = join(models_dir, "percent_ranks.csv") + if exists(percent_ranks_path): + percent_ranks_df = pandas.read_csv(percent_ranks_path, index_col=0) + for allele in percent_ranks_df.columns: + allele_to_percent_rank_transform[allele] = ( + PercentRankTransform.from_series(percent_ranks_df[allele])) + logging.info( - "Loaded %d class1 pan allele predictors, %d pseudosequences, and " - "%d allele specific models: %s" % ( + "Loaded %d class1 pan allele predictors, %d pseudosequences, " + "%d percent rank distributions, and %d allele specific models: %s" % ( len(class1_pan_allele_models), 
len(pseudosequences) if pseudosequences else 0, + len(allele_to_percent_rank_transform), sum(len(v) for v in allele_to_allele_specific_models.values()), ", ".join( "%s (%d)" % (allele, len(v)) @@ -226,7 +319,9 @@ class Class1AffinityPredictor(object): allele_to_allele_specific_models=allele_to_allele_specific_models, class1_pan_allele_models=class1_pan_allele_models, allele_to_pseudosequence=pseudosequences, - manifest_df=manifest_df) + manifest_df=manifest_df, + allele_to_percent_rank_transform=allele_to_percent_rank_transform, + ) return result @staticmethod @@ -272,7 +367,8 @@ class Class1AffinityPredictor(object): peptides, affinities, models_dir_for_save=None, - verbose=1): + verbose=1, + progress_preamble=""): """ Fit one or more allele specific predictors for a single allele using a single neural network architecture. @@ -289,7 +385,7 @@ class Class1AffinityPredictor(object): allele : string - peptides : EncodableSequences or list of string + peptides : `EncodableSequences` or list of string affinities : list of float nM affinities @@ -301,9 +397,12 @@ class Class1AffinityPredictor(object): verbose : int Keras verbosity + progress_preamble : string + Optional string of information to include in each progress update + Returns ------- - list of Class1NeuralNetwork + list of `Class1NeuralNetwork` """ allele = mhcnames.normalize_allele_name(allele) @@ -313,7 +412,8 @@ class Class1AffinityPredictor(object): peptides=peptides, affinities=affinities, allele_pseudosequences=None, - verbose=verbose) + verbose=verbose, + progress_preamble=progress_preamble) if allele not in self.allele_to_allele_specific_models: self.allele_to_allele_specific_models[allele] = [] @@ -334,7 +434,7 @@ class Class1AffinityPredictor(object): if models_dir_for_save: self.save( models_dir_for_save, model_names_to_write=[model_name]) - return models + return models_list def fit_class1_pan_allele_models( self, @@ -344,7 +444,8 @@ class Class1AffinityPredictor(object): peptides, affinities, models_dir_for_save=None, - verbose=1): + verbose=1, + progress_preamble=""): """ Fit one or more pan-allele predictors using a single neural network architecture. 
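The deleted command-line trainer shown earlier wrapped exactly this fitting API; its core loop can be expressed directly, now with the new `progress_preamble` argument prefixed to each training progress message. A condensed sketch with hypothetical file paths and toy hyperparameters:

```python
import pandas

from mhcflurry import Class1AffinityPredictor

# Hypothetical CSV with allele, peptide, measurement_value columns (the
# schema the deleted CLI trainer expected).
df = pandas.read_csv("training_data.csv")
df = df.loc[(df.peptide.str.len() >= 8) & (df.peptide.str.len() <= 15)]

predictor = Class1AffinityPredictor()
for allele in ["HLA-A*02:01"]:  # the deleted trainer derived this list from the data
    train = df.loc[df.allele == allele].dropna().sample(frac=1.0)
    predictor.fit_allele_specific_predictors(
        n_models=1,
        architecture_hyperparameters={"max_epochs": 5},  # toy setting
        allele=allele,
        peptides=train.peptide.values,
        affinities=train.measurement_value.values,
        models_dir_for_save="/tmp/models",
        progress_preamble="allele %s" % allele)  # new in this change
```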
@@ -362,7 +463,7 @@ class Class1AffinityPredictor(object): alleles : list of string Allele names (not pseudosequences) corresponding to each peptide - peptides : EncodableSequences or list of string + peptides : `EncodableSequences` or list of string affinities : list of float nM affinities @@ -374,9 +475,12 @@ class Class1AffinityPredictor(object): verbose : int Keras verbosity + progress_preamble : string + Optional string of information to include in each progress update + Returns ------- - list of Class1NeuralNetwork + list of `Class1NeuralNetwork` """ alleles = pandas.Series(alleles).map(mhcnames.normalize_allele_name) @@ -388,10 +492,13 @@ class Class1AffinityPredictor(object): peptides=peptides, affinities=affinities, allele_pseudosequences=allele_pseudosequences, - verbose=verbose) + verbose=verbose, + progress_preamble=progress_preamble) + models_list = [] for (i, model) in enumerate(models): model_name = self.model_name("pan-class1", i) + models_list.append(model) # models is a generator self.class1_pan_allele_models.append(model) row = pandas.Series(collections.OrderedDict([ ("model_name", model_name), @@ -404,7 +511,7 @@ class Class1AffinityPredictor(object): if models_dir_for_save: self.save( models_dir_for_save, model_names_to_write=[model_name]) - return models + return models_list def _fit_predictors( self, @@ -413,7 +520,8 @@ class Class1AffinityPredictor(object): peptides, affinities, allele_pseudosequences, - verbose=1): + verbose=1, + progress_preamble = ""): """ Private helper method @@ -425,10 +533,12 @@ class Class1AffinityPredictor(object): affinities : list of float allele_pseudosequences : EncodableSequences or list of string verbose : int + progress_preamble : string + Optional string of information to include in each progress update Returns ------- - generator of Class1NeuralNetwork + generator of `Class1NeuralNetwork` """ encodable_peptides = EncodableSequences.create(peptides) for i in range(n_models): @@ -438,9 +548,123 @@ class Class1AffinityPredictor(object): encodable_peptides, affinities, allele_pseudosequences=allele_pseudosequences, - verbose=verbose) + verbose=verbose, + progress_preamble=progress_preamble) yield model + def calibrate_percentile_ranks( + self, + peptides=None, + num_peptides_per_length=int(1e5), + alleles=None, + bins=None, + quiet=False): + """ + Compute the cumulative distribution of ic50 values for a set of alleles + over a large universe of random peptides, to enable computing quantiles in + this distribution later. + + Parameters + ---------- + peptides : sequence of string, optional + Peptides to use + num_peptides_per_length : int, optional + If peptides argument is not specified, then num_peptides_per_length + peptides are randomly sampled from a uniform distribution for each + supported length + alleles : sequence of string, optional + Alleles to perform calibration for. If not specified all supported + alleles will be calibrated. + bins : object + Anything that can be passed to numpy.histogram's "bins" argument + can be used here, i.e. either an integer or a sequence giving bin + edges. This is in ic50 space. + quiet : boolean + If False (default), status updates will be printed to stdout. 
+ """ + if bins is None: + bins = to_ic50(numpy.linspace(1, 0, 1000)) + + if alleles is None: + alleles = self.supported_alleles + + if peptides is None: + peptides = [] + lengths = range( + self.supported_peptide_lengths[0], + self.supported_peptide_lengths[1] + 1) + for length in lengths: + peptides.extend( + random_peptides(num_peptides_per_length, length)) + + if quiet: + def msg(s): + pass + else: + def msg(s): + print(s) + sys.stdout.flush() + + encoded_peptides = EncodableSequences.create(peptides) + for (i, allele) in enumerate(alleles): + msg("Calibrating percentile ranks for allele %03d/%03d: %s" % ( + i + 1, len(alleles), allele)) + start = time.time() + predictions = self.predict(encoded_peptides, allele=allele) + msg("Generated %d predictions in %0.2f sec." % ( + len(predictions), time.time() - start)) + transform = PercentRankTransform() + transform.fit(predictions, bins=bins) + self.allele_to_percent_rank_transform[allele] = transform + msg("Done calibrating allele %s in %0.2f sec." % ( + allele, time.time() - start)) + + def percentile_ranks(self, affinities, allele=None, alleles=None, throw=True): + """ + Return percentile ranks for the given ic50 affinities and alleles. + + The 'allele' and 'alleles' argument are as in the `predict` method. + Specify one of these. + + Parameters + ---------- + affinities : sequence of float + nM affinities + allele : string + alleles : sequence of string + throw : boolean + If True, a ValueError will be raised in the case of unsupported + alleles. If False, a warning will be logged and NaN will be returned + for those percentile ranks. + + Returns + ------- + numpy.array of float + """ + if allele is not None: + try: + transform = self.allele_to_percent_rank_transform[allele] + return transform.transform(affinities) + except KeyError: + msg = "Allele %s has no percentile rank information" % allele + if throw: + raise ValueError(msg) + else: + warnings.warn(msg) + # Return NaNs + return numpy.ones(len(affinities)) * numpy.nan + + if alleles is None: + raise ValueError("Specify allele or alleles") + + df = pandas.DataFrame({"affinity": affinities}) + df["allele"] = alleles + df["result"] = numpy.nan + for (allele, sub_df) in df.groupby("allele"): + df.loc[sub_df.index, "result"] = self.percentile_ranks( + sub_df.affinity, allele=allele, throw=throw) + return df.result.values + def predict(self, peptides, alleles=None, allele=None, throw=True): """ Predict nM binding affinities. @@ -455,7 +679,7 @@ class Class1AffinityPredictor(object): Parameters ---------- - peptides : EncodableSequences or list of string + peptides : `EncodableSequences` or list of string alleles : list of string allele : string throw : boolean @@ -472,6 +696,7 @@ class Class1AffinityPredictor(object): alleles=alleles, allele=allele, throw=throw, + include_percentile_ranks=False, ) return df.prediction.values @@ -481,7 +706,8 @@ class Class1AffinityPredictor(object): alleles=None, allele=None, throw=True, - include_individual_model_predictions=False): + include_individual_model_predictions=False, + include_percentile_ranks=True): """ Predict nM binding affinities. Gives more detailed output than `predict` method, including 5-95% prediction intervals. 
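These additions establish the percentile-rank workflow end to end: calibrate per-allele ic50 distributions over random peptides, then surface ranks either directly or as a `prediction_percentile` column from `predict_to_dataframe` (whose expanded options appear in the next hunk). A sketch assuming the released models have been fetched with `mhcflurry-downloads fetch`:

```python
from mhcflurry import Class1AffinityPredictor

predictor = Class1AffinityPredictor.load()
predictor.calibrate_percentile_ranks(
    alleles=["HLA-A*02:01"],
    num_peptides_per_length=10000)  # smaller than the 1e5 default, for speed

# Direct query: strong binders (low nM) map to low percentiles.
print(predictor.percentile_ranks(
    [100.0, 5000.0, 25000.0], allele="HLA-A*02:01"))

# Or as a column in the detailed output (include_percentile_ranks, below,
# defaults to True and warns if no calibration is available).
df = predictor.predict_to_dataframe(
    peptides=["SIINFEKL", "SIINFEKD"], allele="HLA-A*02:01")
print(df.prediction_percentile)
```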
@@ -496,20 +722,24 @@ class Class1AffinityPredictor(object): Parameters ---------- - peptides : EncodableSequences or list of string + peptides : `EncodableSequences` or list of string alleles : list of string allele : string - include_individual_model_predictions : boolean - If True, the predictions of each individual model are incldued as - columns in the result dataframe. throw : boolean If True, a ValueError will be raised in the case of unsupported alleles or peptide lengths. If False, a warning will be logged and the predictions for the unsupported alleles or peptides will be NaN. + include_individual_model_predictions : boolean + If True, the predictions of each individual model are included as + columns in the result dataframe. + include_percentile_ranks : boolean, default True + If True, a "prediction_percentile" column will be included giving the + percentile ranks. If no percentile rank information is available, + this will be ignored with a warning. Returns ------- - pandas.DataFrame of predictions + `pandas.DataFrame` of predictions """ if isinstance(peptides, string_types): raise TypeError("peptides must be a list or array, not a string") @@ -527,6 +757,19 @@ class Class1AffinityPredictor(object): 'peptide': peptides.sequences, 'allele': alleles, }) + if len(df) == 0: + # No predictions. + logging.warning("Predicting for 0 peptides.") + empty_result = pandas.DataFrame( + columns=[ + 'peptide', + 'allele', + 'prediction', + 'prediction_low', + 'prediction_high' + ]) + return empty_result + df["normalized_allele"] = df.allele.map( mhcnames.normalize_allele_name) @@ -593,7 +836,11 @@ class Class1AffinityPredictor(object): mask = ( (df.normalized_allele == allele) & df.supported_peptide_length).values - if mask.sum() > 0: + if mask.all(): + # Common case optimization + for (i, model) in enumerate(models): + df["model_single_%d" % i] = model.predict(peptides) + elif mask.sum() > 0: allele_peptides = EncodableSequences.create( df.ix[mask].peptide.values) for (i, model) in enumerate(models): @@ -611,16 +858,25 @@ class Class1AffinityPredictor(object): df["prediction_low"] = numpy.exp(logs.quantile(0.05, axis=1)) df["prediction_high"] = numpy.exp(logs.quantile(0.95, axis=1)) - del df["normalized_allele"] - del df["supported_peptide_length"] if include_individual_model_predictions: columns = sorted(df.columns, key=lambda c: c.startswith('model_')) else: columns = [ c for c in df.columns if c not in df_predictions.columns ] - return df[columns] - + columns.remove("normalized_allele") + columns.remove("supported_peptide_length") + + if include_percentile_ranks: + if self.allele_to_percent_rank_transform: + df["prediction_percentile"] = self.percentile_ranks( + df.prediction, + alleles=df.normalized_allele.values, + throw=throw) + columns.append("prediction_percentile") + else: + warnings.warn("No percentile rank information available.") + return df[columns].copy() @staticmethod def save_weights(weights_list, filename): @@ -650,11 +906,9 @@ class Class1AffinityPredictor(object): ---------- filename : string Should end in ".npz". 
-
-
+
         Returns
         ----------
-        list of array
         """
         loaded = numpy.load(filename)
diff --git a/mhcflurry/class1_affinity_prediction/class1_neural_network.py b/mhcflurry/class1_neural_network.py
similarity index 86%
rename from mhcflurry/class1_affinity_prediction/class1_neural_network.py
rename to mhcflurry/class1_neural_network.py
index 13a7f55c8250255a3fdc43a4c5017505801bfe6a..eafbbf96398e58eb0028a8bfaadab62835377a67 100644
--- a/mhcflurry/class1_affinity_prediction/class1_neural_network.py
+++ b/mhcflurry/class1_neural_network.py
@@ -5,23 +5,12 @@ import logging
 
 import numpy
 import pandas
 
-import keras.models
-import keras.layers.pooling
-import keras.regularizers
-from keras.layers import Input
-import keras.layers
-from keras.layers.core import Dense, Flatten, Dropout
-from keras.layers.embeddings import Embedding
-from keras.layers.normalization import BatchNormalization
-
 from mhcflurry.hyperparameters import HyperparameterDefaults
-from ..encodable_sequences import (
-    EncodableSequences,
-    available_vector_encodings,
-    vector_encoding_length)
-from ..regression_target import to_ic50, from_ic50
-from ..common import random_peptides, amino_acid_distribution
+from mhcflurry.encodable_sequences import EncodableSequences
+from mhcflurry.amino_acid import available_vector_encodings, vector_encoding_length
+from mhcflurry.regression_target import to_ic50, from_ic50
+from mhcflurry.common import random_peptides, amino_acid_distribution
 
 
 class Class1NeuralNetwork(object):
@@ -52,11 +41,6 @@ class Class1NeuralNetwork(object):
         batch_normalization=False,
         embedding_init_method="glorot_uniform",
         locally_connected_layers=[
-            {
-                "filters": 8,
-                "activation": "tanh",
-                "kernel_size": 3
-            },
             {
                 "filters": 8,
                 "activation": "tanh",
@@ -64,27 +48,43 @@ class Class1NeuralNetwork(object):
             }
         ],
     )
+    """
+    Hyperparameters (and their default values) that affect the neural network
+    architecture.
+    """
 
     compile_hyperparameter_defaults = HyperparameterDefaults(
         loss="mse",
         optimizer="rmsprop",
     )
+    """
+    Loss and optimizer hyperparameters. Any values supported by keras may be
+    used.
+    """
 
     input_encoding_hyperparameter_defaults = HyperparameterDefaults(
         left_edge=4,
         right_edge=4)
+    """
+    Number of amino acid residues that are given fixed positions on each
+    side in the variable-length encoding.
+    """
 
     fit_hyperparameter_defaults = HyperparameterDefaults(
         max_epochs=500,
         take_best_epoch=False,  # currently unused
         validation_split=0.2,
         early_stopping=True,
+        minibatch_size=128,
         random_negative_rate=0.0,
         random_negative_constant=25,
         random_negative_affinity_min=20000.0,
        random_negative_affinity_max=50000.0,
        random_negative_match_distribution=True,
        random_negative_distribution_smoothing=0.0)
+    """
+    Hyperparameters for neural network training.
+    """
 
     early_stopping_hyperparameter_defaults = HyperparameterDefaults(
         patience=10,
@@ -93,12 +93,18 @@
         verbose=1,  # currently unused
         mode='auto'  # currently unused
     )
+    """
+    Hyperparameters for early stopping.
+    """
 
     hyperparameter_defaults = network_hyperparameter_defaults.extend(
         compile_hyperparameter_defaults).extend(
         input_encoding_hyperparameter_defaults).extend(
         fit_hyperparameter_defaults).extend(
         early_stopping_hyperparameter_defaults)
+    """
+    Combined set of all supported hyperparameters and their default values.
+ """ def __init__(self, **hyperparameters): self.hyperparameters = self.hyperparameter_defaults.with_defaults( @@ -112,9 +118,11 @@ class Class1NeuralNetwork(object): self.fit_seconds = None self.fit_num_points = None - # Process-wide keras model cache. - # architecture JSON string -> (Keras model, existing network weights) KERAS_MODELS_CACHE = {} + """ + Process-wide keras model cache, a map from: architecture JSON string to + (Keras model, existing network weights) + """ @classmethod def borrow_cached_network(klass, network_json, network_weights): @@ -141,6 +149,7 @@ class Class1NeuralNetwork(object): assert network_weights is not None if network_json not in klass.KERAS_MODELS_CACHE: # Cache miss. + import keras.models network = keras.models.model_from_json(network_json) existing_weights = None else: @@ -171,6 +180,7 @@ class Class1NeuralNetwork(object): self.network_json, self.network_weights) else: + import keras.models self._network = keras.models.model_from_json(self.network_json) if self.network_weights is not None: self._network.set_weights(self.network_weights) @@ -323,7 +333,8 @@ class Class1NeuralNetwork(object): affinities, allele_pseudosequences=None, sample_weights=None, - verbose=1): + verbose=1, + progress_preamble=""): """ Fit the neural network. @@ -344,6 +355,9 @@ class Class1NeuralNetwork(object): verbose : int Keras verbosity level + + progress_preamble : string + Optional string of information to include in each progress update """ self.fit_num_points = len(peptides) @@ -421,6 +435,7 @@ class Class1NeuralNetwork(object): self.loss_history = collections.defaultdict(list) start = time.time() + last_progress_print = None for i in range(self.hyperparameters['max_epochs']): random_negative_peptides_list = [] for (length, count) in num_random_negative.iteritems(): @@ -449,6 +464,7 @@ class Class1NeuralNetwork(object): x_dict_with_random_negatives, y_dict_with_random_negatives, shuffle=True, + batch_size=self.hyperparameters['minibatch_size'], verbose=verbose, epochs=1, validation_split=self.hyperparameters['validation_split'], @@ -457,12 +473,17 @@ class Class1NeuralNetwork(object): for (key, value) in fit_history.history.items(): self.loss_history[key].extend(value) - logging.info( - "Epoch %3d / %3d: loss=%g. Min val loss at epoch %s" % ( - i, - self.hyperparameters['max_epochs'], - self.loss_history['loss'][-1], - min_val_loss_iteration)) + # Print progress no more often than once every few seconds. + if not last_progress_print or time.time() - last_progress_print > 5: + print((progress_preamble + " " + + "Epoch %3d / %3d: loss=%g. " + "Min val loss (%s) at epoch %s" % ( + i, + self.hyperparameters['max_epochs'], + self.loss_history['loss'][-1], + str(min_val_loss), + min_val_loss_iteration)).strip()) + last_progress_print = time.time() if self.hyperparameters['validation_split']: val_loss = self.loss_history['val_loss'][-1] @@ -477,11 +498,18 @@ class Class1NeuralNetwork(object): min_val_loss_iteration + self.hyperparameters['patience']) if i > threshold: - logging.info("Early stopping") + print((progress_preamble + " " + + "Early stopping at epoch %3d / %3d: loss=%g. 
" + "Min val loss (%s) at epoch %s" % ( + i, + self.hyperparameters['max_epochs'], + self.loss_history['loss'][-1], + str(min_val_loss), + min_val_loss_iteration)).strip()) break self.fit_seconds = time.time() - start - def predict(self, peptides, allele_pseudosequences=None): + def predict(self, peptides, allele_pseudosequences=None, batch_size=4096): """ Predict affinities @@ -492,6 +520,9 @@ class Class1NeuralNetwork(object): allele_pseudosequences : EncodableSequences or list of string, optional Only required when this model is a pan-allele model + batch_size : int + batch_size passed to Keras + Returns ------- numpy.array of nM affinity predictions @@ -503,8 +534,10 @@ class Class1NeuralNetwork(object): pseudosequences_input = self.pseudosequence_to_network_input( allele_pseudosequences) x_dict['pseudosequence'] = pseudosequences_input - (predictions,) = numpy.array( - self.network(borrow=True).predict(x_dict), dtype="float64").T + + network = self.network(borrow=True) + raw_predictions = network.predict(x_dict, batch_size=batch_size) + predictions = numpy.array(raw_predictions, dtype = "float64")[:,0] return to_ic50(predictions) def compile(self): @@ -537,6 +570,16 @@ class Class1NeuralNetwork(object): """ Helper function to make a keras network for class1 affinity prediction. """ + + # We import keras here to avoid tensorflow debug output, etc. unless we + # are actually about to use Keras. + + from keras.layers import Input + import keras.layers + from keras.layers.core import Dense, Flatten, Dropout + from keras.layers.embeddings import Embedding + from keras.layers.normalization import BatchNormalization + if use_embedding or peptide_amino_acid_encoding == "embedding": peptide_input = Input( shape=(kmer_size,), dtype='int32', name='peptide') @@ -626,7 +669,4 @@ class Class1NeuralNetwork(object): inputs=inputs, outputs=[output], name="predictor") - - print("*** ARCHITECTURE ***") - model.summary() return model diff --git a/mhcflurry/common.py b/mhcflurry/common.py index f87d235d999e124aa036c80b95301005b167f7be..abc2a72c7b7efc1893f2f3630c48b3166169d5ef 100644 --- a/mhcflurry/common.py +++ b/mhcflurry/common.py @@ -1,25 +1,7 @@ -# Copyright (c) 2016. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - from __future__ import print_function, division, absolute_import -import itertools import collections import logging -import hashlib -import time import sys -from os import environ import numpy import pandas @@ -27,50 +9,15 @@ import pandas from . import amino_acid -def all_combinations(**dict_of_lists): - """ - Iterator that generates all combinations of parameters given in the - kwargs dictionary which is expected to map argument names to lists - of possible values. 
- """ - arg_names = dict_of_lists.keys() - value_lists = dict_of_lists.values() - for combination_of_values in itertools.product(*value_lists): - yield dict(zip(arg_names, combination_of_values)) - - -def dataframe_cryptographic_hash(df): +def configure_logging(verbose=False): """ - Return a cryptographic (i.e. collisions extremely unlikely) hash - of a dataframe. Suitible for using as a cache key. + Configure logging module using defaults. Parameters - ----------- - df : pandas.DataFrame or pandas.Series - - Returns - ----------- - string - """ - start = time.time() - result = hashlib.sha1(df.to_msgpack()).hexdigest() - logging.info( - "Generated dataframe hash in %0.2f sec" % (time.time() - start)) - return result - - -def freeze_object(o): - """ - Recursively convert nested dicts and lists into frozensets and tuples. + ---------- + verbose : boolean + If true, output will be at level DEBUG, otherwise, INFO. """ - if isinstance(o, dict): - return frozenset({k: freeze_object(v) for k, v in o.items()}.items()) - if isinstance(o, list): - return tuple(freeze_object(v) for v in o) - return o - - -def configure_logging(verbose=False): level = logging.DEBUG if verbose else logging.INFO logging.basicConfig( format="%(asctime)s.%(msecs)d %(levelname)s %(module)s - %(funcName)s:" @@ -80,72 +27,6 @@ def configure_logging(verbose=False): level=level) -def describe_nulls(df, related_df_with_same_index_to_describe=None): - """ - Return a string describing the positions of any nan or inf values - in a dataframe. - - If related_df_with_same_index_to_describe is specified, it should be - a dataframe with the same index as df. Positions corresponding to - where df has null values will also be printed from this dataframe. - """ - if isinstance(df, pandas.Series): - df = df.to_frame() - with pandas.option_context('mode.use_inf_as_null', True): - null_counts_by_col = pandas.isnull(df).sum(axis=0) - null_rows = pandas.isnull(df).sum(axis=1) > 0 - return ( - "Columns with nulls:\n%s, related rows with nulls:\n%s, " - "full df:\n%s" % ( - null_counts_by_col.index[null_counts_by_col > 0], - related_df_with_same_index_to_describe.ix[null_rows] - if related_df_with_same_index_to_describe is not None - else "(n/a)", - str(df.ix[null_rows]))) - - -def raise_or_debug(exception): - """ - Raise the exception unless the MHCFLURRY_DEBUG environment variable is set, - in which case drop into ipython debugger (ipdb). - """ - if environ.get("MHCFLURRY_DEBUG"): - import ipdb - ipdb.set_trace() - raise exception - - -def assert_no_null(df, message=''): - """ - Raise an assertion error if the given DataFrame has any nan or inf values. - """ - if hasattr(df, 'count'): - with pandas.option_context('mode.use_inf_as_null', True): - failed = df.count().sum() != df.size - else: - failed = numpy.isnan(df).sum() > 0 - if failed: - raise_or_debug( - AssertionError( - "%s %s" % (message, describe_nulls(df)))) - - -def drop_nulls_and_warn(df, related_df_with_same_index_to_describe=None): - """ - Return a new DataFrame that is a copy of the given DataFrame where any - rows with nulls have been removed, and a warning about them logged. 
- """ - with pandas.option_context('mode.use_inf_as_null', True): - new_df = df.dropna() - if df.shape != new_df.shape: - logging.warn( - "Dropped rows with null or inf: %s -> %s:\n%s" % ( - df.shape, - new_df.shape, - describe_nulls(df, related_df_with_same_index_to_describe))) - return new_df - - def amino_acid_distribution(peptides, smoothing=0.0): """ Compute the fraction of each amino acid across a collection of peptides. diff --git a/mhcflurry/downloads.py b/mhcflurry/downloads.py index 81ef467e51e2670096a7830e2b67a9f24a9a23df..e88675ed00b78718f0687e28ed4911473593e13c 100644 --- a/mhcflurry/downloads.py +++ b/mhcflurry/downloads.py @@ -1,16 +1,3 @@ -# Copyright (c) 2016. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. """ Manage local downloaded data. """ @@ -69,12 +56,13 @@ def get_current_release_downloads(): Return a dict of all available downloads in the current release. The dict keys are the names of the downloads. The values are a dict - with entries: - downloaded : bool - Whether the download is currently available locally + with two entries: - metadata : dict - Info about the download from downloads.yml such as URL + downloaded : bool + Whether the download is currently available locally + + metadata : dict + Info about the download from downloads.yml such as URL """ downloads = ( get_downloads_metadata() diff --git a/mhcflurry/downloads.yml b/mhcflurry/downloads.yml index 1c82f7b618c7e6738c2f1a255fa579c917a98206..cf7709b9edf7fb46c1f2ae5e7b23ebb681099380 100644 --- a/mhcflurry/downloads.yml +++ b/mhcflurry/downloads.yml @@ -8,7 +8,7 @@ # by name, the downloads with "default=true" are downloaded. # This should usually be the latest release. -current-release: 0.9.2 +current-release: 1.0.0 # An integer indicating what models the current MHCflurry code base is compatible # with. Increment this integer when changes are made to MHCflurry that would break @@ -17,6 +17,33 @@ current-compatibility-version: 2 # Add new releases here as they are made. 
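The release entries below are what `get_current_release_downloads()` (in `mhcflurry/downloads.py`, above) surfaces to callers. A minimal consumer sketch, assuming only the two documented dict keys (`downloaded` and `metadata`) and the `url` field that each entry below carries:

```python
from mhcflurry.downloads import get_current_release_downloads

# Show every download in the current release, whether it is already
# fetched locally, and where it would be fetched from.
for (name, info) in get_current_release_downloads().items():
    status = "downloaded" if info["downloaded"] else "not downloaded"
    print("%-35s %-16s %s" % (name, status, info["metadata"]["url"]))
```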
releases: + 1.0.0: + compatibility-version: 2 + downloads: + - name: models_class1 + url: http://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/models_class1.tar.bz2 + default: true + + - name: models_class1_experiments1 + url: http://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/models_class1_experiments1.tar.bz2 + default: false + + - name: cross_validation_class1 + url: http://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/cross_validation_class1.tar.bz2 + default: false + + - name: data_iedb + url: https://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/data_iedb.tar.bz2 + default: false + + - name: data_kim2014 + url: http://github.com/hammerlab/mhcflurry/releases/download/0.9.1/data_kim2014.tar.bz2 + default: false + + - name: data_curated + url: https://github.com/hammerlab/mhcflurry/releases/download/pre-1.0/data_curated.tar.bz2 + default: true + 0.9.2: compatibility-version: 2 downloads: diff --git a/mhcflurry/downloads_command.py b/mhcflurry/downloads_command.py index 6ddf11ca32c496261f267dbed03b57af37113e82..08648a99e8eb6d2c03967cfd3ebb0bd12bbdaab8 100644 --- a/mhcflurry/downloads_command.py +++ b/mhcflurry/downloads_command.py @@ -1,16 +1,3 @@ -# Copyright (c) 2016. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. ''' Download MHCflurry released datasets and trained models. diff --git a/mhcflurry/encodable_sequences.py b/mhcflurry/encodable_sequences.py index 4abc94a1032860594bc091ba9043cc94320ffd30..aab2b0fe1874b288033afa4419de1919e29f4e1e 100644 --- a/mhcflurry/encodable_sequences.py +++ b/mhcflurry/encodable_sequences.py @@ -1,17 +1,3 @@ -# Copyright (c) 2016. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - from __future__ import ( print_function, division, @@ -19,117 +5,13 @@ from __future__ import ( ) import math +from six import string_types -import pandas import numpy -from six import StringIO - -import typechecks +import pandas from . 
import amino_acid -AMINO_ACIDS = list(amino_acid.COMMON_AMINO_ACIDS_WITH_UNKNOWN.keys()) - -BLOSUM62_MATRIX = pandas.read_table(StringIO(""" - A R N D C Q E G H I L K M F P S T W Y V X -A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 0 -R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 0 -N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 0 -D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 0 -C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 0 -Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 -E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 0 -G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 0 -H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 -I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 0 -L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 0 -K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 -M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 0 -F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 0 -P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 0 -S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 -T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 0 -W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 0 -Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 0 -V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 0 -X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 -"""), sep='\s+').loc[AMINO_ACIDS, AMINO_ACIDS] -assert (BLOSUM62_MATRIX == BLOSUM62_MATRIX.T).all().all() - -ENCODING_DFS = { - "BLOSUM62": BLOSUM62_MATRIX, - "one-hot": pandas.DataFrame([ - [1 if i == j else 0 for i in range(len(AMINO_ACIDS))] - for j in range(len(AMINO_ACIDS)) - ], index=AMINO_ACIDS, columns=AMINO_ACIDS) -} - - -def available_vector_encodings(): - """ - Return list of supported amino acid vector encodings. - - Returns - ------- - list of string - - """ - return list(ENCODING_DFS) - - -def vector_encoding_length(name): - """ - Return the length of the given vector encoding. - - Parameters - ---------- - name : string - - Returns - ------- - int - """ - return ENCODING_DFS[name].shape[1] - - -def index_encoding(sequences, letter_to_index_dict): - """ - Given a sequence of n strings all of length k, return a k * n array where - the (i, j)th element is letter_to_index_dict[sequence[i][j]]. - - Parameters - ---------- - sequences : list of length n of strings of length k - letter_to_index_dict : dict : string -> int - - Returns - ------- - numpy.array of integers with shape (k, n) - """ - df = pandas.DataFrame(iter(s) for s in sequences) - result = df.replace(letter_to_index_dict) - return result.values - - -def fixed_vectors_encoding(sequences, letter_to_vector_function): - """ - Given a sequence of n strings all of length k, return a n * k * m array where - the (i, j)th element is letter_to_vector_function(sequence[i][j]). 
- - Parameters - ---------- - sequences : list of length n of strings of length k - letter_to_vector_function : function : string -> vector of length m - - Returns - ------- - numpy.array of integers with shape (n, k, m) - """ - arr = numpy.array([list(s) for s in sequences]) - result = numpy.vectorize( - letter_to_vector_function, signature='()->(n)')(arr) - return result - class EncodableSequences(object): """ @@ -151,8 +33,8 @@ class EncodableSequences(object): return klass(sequences) def __init__(self, sequences): - typechecks.require_iterable_of( - sequences, typechecks.string_types, "sequences") + if not all(isinstance(obj, string_types) for obj in sequences): + raise ValueError("Sequence of strings is required") self.sequences = numpy.array(sequences) self.encoding_cache = {} self.fixed_sequence_length = None @@ -190,16 +72,13 @@ class EncodableSequences(object): max_length) if cache_key not in self.encoding_cache: - fixed_length_sequences = [ - self.sequence_to_fixed_length_string( - sequence, + fixed_length_sequences = ( + self.sequences_to_fixed_length_index_encoded_array( + self.sequences, left_edge=left_edge, right_edge=right_edge, - max_length=max_length) - for sequence in self.sequences - ] - self.encoding_cache[cache_key] = index_encoding( - fixed_length_sequences, amino_acid.AMINO_ACID_INDEX) + max_length=max_length)) + self.encoding_cache[cache_key] = fixed_length_sequences return self.encoding_cache[cache_key] def variable_length_to_fixed_length_vector_encoding( @@ -234,40 +113,39 @@ class EncodableSequences(object): right_edge, max_length) if cache_key not in self.encoding_cache: - fixed_length_sequences = [ - self.sequence_to_fixed_length_string( - sequence, + fixed_length_sequences = ( + self.sequences_to_fixed_length_index_encoded_array( + self.sequences, left_edge=left_edge, right_edge=right_edge, - max_length=max_length) - for sequence in self.sequences - ] - result = fixed_vectors_encoding( + max_length=max_length)) + result = amino_acid.fixed_vectors_encoding( fixed_length_sequences, - ENCODING_DFS[vector_encoding_name].loc.__getitem__) + amino_acid.ENCODING_DATA_FRAMES[vector_encoding_name]) assert result.shape[0] == len(self.sequences) self.encoding_cache[cache_key] = result return self.encoding_cache[cache_key] - @classmethod - def sequence_to_fixed_length_string( - klass, sequence, left_edge=4, right_edge=4, max_length=15): + def sequences_to_fixed_length_index_encoded_array( + klass, sequences, left_edge=4, right_edge=4, max_length=15): """ - Transform a string of length at least left_edge + right_edge and at - most max_length into a string of length max_length using a scheme - designed to preserve the anchor positions of class I peptides. - + Transform a sequence of strings, where each string is of length at least + left_edge + right_edge and at most max_length into strings of length + max_length using a scheme designed to preserve the anchor positions of + class I peptides. + The first left_edge characters in the input always map to the first left_edge characters in the output. Similarly for the last right_edge characters. The middle characters are filled in based on the length, with the X character filling in the blanks. - + For example, using defaults: - + AAAACDDDD -> AAAAXXXCXXXDDDD - - + + The strings are also converted to int categorical amino acid indices. 
+
         Parameters
         ----------
-        sequence : string
+        sequences : list of string
@@ -277,30 +155,65 @@ class EncodableSequences(object):
 
         Returns
         -------
-        string of length max_length
-
+        numpy array of shape (len(sequences), max_length) and dtype int
         """
-        if len(sequence) < left_edge + right_edge:
-            raise ValueError(
-                "Sequence '%s' (length %d) unsupported: length must be at "
-                "least %d" % (sequence, len(sequence), left_edge + right_edge))
-        if len(sequence) > max_length:
-            raise ValueError(
-                "Sequence '%s' (length %d) unsupported: length must be at "
-                "most %d" % (sequence, len(sequence), max_length))
+
+        # Result array is int32, filled with the X (null amino acid) value.
+        result = numpy.full(
+            fill_value=amino_acid.AMINO_ACID_INDEX['X'],
+            shape=(len(sequences), max_length),
+            dtype="int32")
+
+        df = pandas.DataFrame({"peptide": sequences})
+        df["length"] = df.peptide.str.len()
 
         middle_length = max_length - left_edge - right_edge
-        num_null = max_length - len(sequence)
-        num_null_left = int(math.ceil(num_null / 2))
-        num_null_right = int(math.floor(num_null / 2))
-        num_not_null_middle = middle_length - num_null
-        string_encoding = "".join([
-            sequence[:left_edge],
-            klass.unknown_character * num_null_left,
-            sequence[left_edge:left_edge + num_not_null_middle],
-            klass.unknown_character * num_null_right,
-            sequence[-right_edge:],
-        ])
-        assert len(string_encoding) == max_length
-        return string_encoding
+
+        # For efficiency we handle each supported peptide length using bulk
+        # array operations.
+        for (length, sub_df) in df.groupby("length"):
+            if length < left_edge + right_edge:
+                raise ValueError(
+                    "Sequence '%s' (length %d) unsupported: length must be at "
+                    "least %d. There are %d total peptides with this length." % (
+                        sub_df.iloc[0].peptide, length, left_edge + right_edge,
+                        len(sub_df)))
+            if length > max_length:
+                raise ValueError(
+                    "Sequence '%s' (length %d) unsupported: length must be at "
+                    "most %d. There are %d total peptides with this length." % (
+                        sub_df.iloc[0].peptide, length, max_length,
+                        len(sub_df)))
+
+            # Array of shape (num peptides, length) giving the amino acid
+            # index encoding of each peptide of the current length.
+            fixed_length_sequences = numpy.stack(
+                sub_df.peptide.map(
+                    lambda s: numpy.array([
+                        amino_acid.AMINO_ACID_INDEX[char] for char in s
+                    ])).values)
+
+            num_null = max_length - length
+            num_null_left = int(math.ceil(num_null / 2))
+            num_middle_filled = middle_length - num_null
+            middle_start = left_edge + num_null_left
+
+            # Set left edge.
+            result[sub_df.index, :left_edge] = fixed_length_sequences[
+                :, :left_edge
+            ]
+
+            # Set middle.
+            result[
+                sub_df.index,
+                middle_start : middle_start + num_middle_filled
+            ] = fixed_length_sequences[
+                :, left_edge : left_edge + num_middle_filled
+            ]
+
+            # Set right edge.
+            result[
+                sub_df.index,
+                -right_edge:
+            ] = fixed_length_sequences[:, -right_edge:]
+        return result
diff --git a/mhcflurry/hyperparameters.py b/mhcflurry/hyperparameters.py
index c0ddaab4c7445d34671ec8b1df11c3a4c3433c04..cc5950d5c175a35c08eb9cfea0d44d64e3fdeb65 100644
--- a/mhcflurry/hyperparameters.py
+++ b/mhcflurry/hyperparameters.py
@@ -1,16 +1,3 @@
-# Copyright (c) 2016. Mount Sinai School of Medicine
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 from __future__ import (
     print_function,
     division,
diff --git a/mhcflurry/percent_rank_transform.py b/mhcflurry/percent_rank_transform.py
new file mode 100644
index 0000000000000000000000000000000000000000..7054736db6cfb48d903eb862d9697e77e2eb55d1
--- /dev/null
+++ b/mhcflurry/percent_rank_transform.py
@@ -0,0 +1,83 @@
+import numpy
+import pandas
+
+class PercentRankTransform(object):
+    """
+    Transform arbitrary values into percent ranks.
+    """
+
+    def __init__(self):
+        self.cdf = None
+        self.bin_edges = None
+
+    def fit(self, values, bins):
+        """
+        Fit the transform using the given values (in our case IC50s).
+
+        Parameters
+        ----------
+        values : array-like of float
+            The values to fit, e.g. IC50s.
+        bins : object
+            Bins for the cumulative distribution function.
+            Anything that can be passed to numpy.histogram's "bins" argument
+            can be used here.
+        """
+        assert self.cdf is None
+        assert self.bin_edges is None
+        assert len(values) > 0
+        (hist, self.bin_edges) = numpy.histogram(values, bins=bins)
+        self.cdf = numpy.ones(len(hist) + 3) * numpy.nan
+        self.cdf[0] = 0.0
+        self.cdf[1] = 0.0
+        self.cdf[-1] = 100.0
+        numpy.cumsum(hist * 100.0 / numpy.sum(hist), out=self.cdf[2:-1])
+        assert not numpy.isnan(self.cdf).any()
+
+    def transform(self, values):
+        """
+        Return percent ranks (range [0, 100]) for the given values.
+        """
+        assert self.cdf is not None
+        assert self.bin_edges is not None
+        indices = numpy.searchsorted(self.bin_edges, values)
+        result = self.cdf[indices]
+        assert len(result) == len(values)
+        return result
+
+    def to_series(self):
+        """
+        Serialize the fit to a pandas.Series.
+
+        The index of the series gives the bin edges and the values give the
+        CDF.
+
+        Returns
+        -------
+        pandas.Series
+
+        """
+        return pandas.Series(
+            self.cdf, index=[numpy.nan] + list(self.bin_edges) + [numpy.nan])
+
+    @staticmethod
+    def from_series(series):
+        """
+        Deserialize a PercentRankTransform from the given pandas.Series, as
+        returned by `to_series()`.
+
+        Parameters
+        ----------
+        series : pandas.Series
+
+        Returns
+        -------
+        PercentRankTransform
+
+        """
+        result = PercentRankTransform()
+        result.cdf = series.values
+        result.bin_edges = series.index.values[1:-1]
+        return result
+
+
+
+
diff --git a/mhcflurry/predict_command.py b/mhcflurry/predict_command.py
index f3a4a2016e35d3cc7ccd615fdf0624827d34d2fe..3eb9f16fe2b6fb7d40e37001ee306bcfd3b5fa11 100644
--- a/mhcflurry/predict_command.py
+++ b/mhcflurry/predict_command.py
@@ -1,16 +1,3 @@
-# Copyright (c) 2016. Mount Sinai School of Medicine
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 '''
 Run MHCflurry predictor on specified peptide/allele pairs.
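The `PercentRankTransform` class added above is self-contained enough to exercise directly. A usage sketch follows; the values here are synthetic stand-ins for predicted IC50s:

```python
import numpy

from mhcflurry.percent_rank_transform import PercentRankTransform

# Fit on a synthetic sample of IC50-like values, then map new values to
# percent ranks in [0, 100]; stronger (lower) affinities get lower ranks.
transform = PercentRankTransform()
transform.fit(numpy.random.uniform(1.0, 50000.0, size=100000), bins=100)
print(transform.transform(numpy.array([10.0, 500.0, 25000.0])))

# The fit round-trips through a pandas.Series, which makes it easy to
# persist alongside the model files.
restored = PercentRankTransform.from_series(transform.to_series())
```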
@@ -19,20 +6,20 @@ Examples: Write a CSV file containing the contents of INPUT.csv plus an additional column giving MHCflurry binding affinity predictions: - mhcflurry-predict INPUT.csv --out RESULT.csv + $ mhcflurry-predict INPUT.csv --out RESULT.csv -The input CSV file is expected to contain columns 'allele' and 'peptide'. -The predictions are written to a column called 'mhcflurry_prediction'. -These default column names may be changed with the --allele-column, ---peptide-column, and --prediction-column options. +The input CSV file is expected to contain columns ``allele`` and ``peptide``. +The predictions are written to a column called ``mhcflurry_prediction``. +These default column names may be changed with the `--allele-column`, +`--peptide-column`, and `--prediction-column` options. -If --out is not specified, results are writtent to standard out. +If `--out` is not specified, results are written to standard out. You can also run on alleles and peptides specified on the commandline, in which case predictions are written for all combinations of alleles and peptides: - mhcflurry-predict --alleles HLA-A0201 H-2Kb --peptides SIINFEKL DENDREKLLL + $ mhcflurry-predict --alleles HLA-A0201 H-2Kb --peptides SIINFEKL DENDREKLLL ''' from __future__ import ( print_function, @@ -47,7 +34,7 @@ import logging import pandas from .downloads import get_path -from .class1_affinity_prediction import Class1AffinityPredictor +from .class1_affinity_predictor import Class1AffinityPredictor parser = argparse.ArgumentParser( diff --git a/mhcflurry/regression_target.py b/mhcflurry/regression_target.py index f1d9d9f03506770e25a446d13b5a03e775098754..d1189178c54df06d32ae1b45b2ee23efeed9306e 100644 --- a/mhcflurry/regression_target.py +++ b/mhcflurry/regression_target.py @@ -1,17 +1,3 @@ -# Copyright (c) 2016. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - import numpy diff --git a/mhcflurry/scoring.py b/mhcflurry/scoring.py index fc2a926334cec48b176abc5ed51a379212ccec88..f6a256baa18e1bbe01968ce270466791588a34f1 100644 --- a/mhcflurry/scoring.py +++ b/mhcflurry/scoring.py @@ -4,7 +4,7 @@ from __future__ import ( absolute_import, ) import logging -import sklearn +import sklearn.metrics import numpy import scipy diff --git a/mhcflurry/train_allele_specific_models_command.py b/mhcflurry/train_allele_specific_models_command.py new file mode 100644 index 0000000000000000000000000000000000000000..6602761bf7c6a096d7c23438543f297fb8356bb5 --- /dev/null +++ b/mhcflurry/train_allele_specific_models_command.py @@ -0,0 +1,281 @@ +""" +Train Class1 single allele models. 
+""" +import argparse +import os +import signal +import sys +import time +import traceback +from multiprocessing import Pool + +import pandas +import yaml +from mhcnames import normalize_allele_name + +from mhcflurry.class1_affinity_predictor import Class1AffinityPredictor +from mhcflurry.common import configure_logging + +parser = argparse.ArgumentParser(usage=__doc__) + +parser.add_argument( + "--data", + metavar="FILE.csv", + required=True, + help=( + "Training data CSV. Expected columns: " + "allele, peptide, measurement_value")) +parser.add_argument( + "--out-models-dir", + metavar="DIR", + required=True, + help="Directory to write models and manifest") +parser.add_argument( + "--hyperparameters", + metavar="FILE.json", + required=True, + help="JSON or YAML of hyperparameters") +parser.add_argument( + "--allele", + default=None, + nargs="+", + help="Alleles to train models for. If not specified, all alleles with " + "enough measurements will be used.") +parser.add_argument( + "--min-measurements-per-allele", + type=int, + metavar="N", + default=50, + help="Train models for alleles with >=N measurements.") +parser.add_argument( + "--only-quantitative", + action="store_true", + default=False, + help="Use only quantitative training data") +parser.add_argument( + "--percent-rank-calibration-num-peptides-per-length", + type=int, + metavar="N", + default=int(1e5), + help="Number of peptides per length to use to calibrate percent ranks. " + "Set to 0 to disable percent rank calibration. The resulting models will " + "not support percent ranks. Default: %(default)s.") +parser.add_argument( + "--n-models", + type=int, + metavar="N", + help="Ensemble size, i.e. how many models to train for each architecture. " + "If specified here it overrides any 'n_models' specified in the " + "hyperparameters.") +parser.add_argument( + "--max-epochs", + type=int, + metavar="N", + help="Max training epochs. If specified here it overrides any 'max_epochs' " + "specified in the hyperparameters.") +parser.add_argument( + "--verbosity", + type=int, + help="Keras verbosity. Default: %(default)s", + default=0) +parser.add_argument( + "--parallelization-num-jobs", + default=1, + type=int, + metavar="N", + help="Parallelization jobs. Experimental. Does NOT work with tensorflow. " + "Set to 1 for serial run. Set to 0 to use number of cores. " + "Default: %(default)s.") + + +def run(argv=sys.argv[1:]): + # On sigusr1 print stack trace + print("To show stack trace, run:\nkill -s USR1 %d" % os.getpid()) + signal.signal(signal.SIGUSR1, lambda sig, frame: traceback.print_stack()) + + args = parser.parse_args(argv) + + configure_logging(verbose=args.verbosity > 1) + + hyperparameters_lst = yaml.load(open(args.hyperparameters)) + assert isinstance(hyperparameters_lst, list) + print("Loaded hyperparameters list: %s" % str(hyperparameters_lst)) + + df = pandas.read_csv(args.data) + print("Loaded training data: %s" % (str(df.shape))) + + df = df.ix[ + (df.peptide.str.len() >= 8) & (df.peptide.str.len() <= 15) + ] + print("Subselected to 8-15mers: %s" % (str(df.shape))) + + if args.only_quantitative: + df = df.loc[ + df.measurement_type == "quantitative" + ] + print("Subselected to quantitative: %s" % (str(df.shape))) + + allele_counts = df.allele.value_counts() + + if args.allele: + alleles = [normalize_allele_name(a) for a in args.allele] + else: + alleles = list(allele_counts.ix[ + allele_counts > args.min_measurements_per_allele + ].index) + + # Allele names in data are assumed to be already normalized. 
+    df = df.loc[df.allele.isin(alleles)].dropna()
+
+    print("Selected %d alleles: %s" % (len(alleles), ' '.join(alleles)))
+    print("Training data: %s" % (str(df.shape)))
+
+    predictor = Class1AffinityPredictor()
+    if args.parallelization_num_jobs == 1:
+        # Serial run
+        worker_pool = None
+    else:
+        worker_pool = Pool(
+            processes=(
+                args.parallelization_num_jobs
+                if args.parallelization_num_jobs else None))
+        print("Using worker pool: %s" % str(worker_pool))
+
+    if args.out_models_dir and not os.path.exists(args.out_models_dir):
+        print("Attempting to create directory: %s" % args.out_models_dir)
+        os.mkdir(args.out_models_dir)
+        print("Done.")
+
+    for (h, hyperparameters) in enumerate(hyperparameters_lst):
+        n_models = None
+        if 'n_models' in hyperparameters:
+            n_models = hyperparameters.pop("n_models")
+        if args.n_models:
+            n_models = args.n_models
+        if not n_models:
+            raise ValueError(
+                "Specify --n-models or the n_models hyperparameter")
+
+        if args.max_epochs:
+            hyperparameters['max_epochs'] = args.max_epochs
+
+        work_items = []
+        total_data_to_train_on = 0
+        for (i, (allele, sub_df)) in enumerate(df.groupby("allele")):
+            total_data_to_train_on += len(sub_df) * n_models
+            for model_group in range(n_models):
+                work_dict = {
+                    'model_group': model_group,
+                    'n_models': n_models,
+                    'allele_num': i,
+                    'n_alleles': len(alleles),
+                    'hyperparameter_set_num': h,
+                    'num_hyperparameter_sets': len(hyperparameters_lst),
+                    'allele': allele,
+                    'data': sub_df,
+                    'hyperparameters': hyperparameters,
+                    'verbose': args.verbosity,
+                    'predictor': predictor if not worker_pool else None,
+                    'save_to': args.out_models_dir if not worker_pool else None,
+                }
+                work_items.append(work_dict)
+
+        if worker_pool:
+            print("Processing %d work items in parallel." % len(work_items))
+            predictors = worker_pool.map(work_entrypoint, work_items, chunksize=1)
+            print("Merging %d predictors fit in parallel." % (len(predictors)))
+            predictor = Class1AffinityPredictor.merge([predictor] + predictors)
+            print("Saving merged predictor to: %s" % args.out_models_dir)
+            predictor.save(args.out_models_dir)
+        else:
+            # Run in serial. In this case, every worker is passed the same
+            # predictor, which it adds models to, so no merging is required.
+            # It also saves as it goes, so no saving is required at the end.
+            start = time.time()
+            data_trained_on = 0
+            while work_items:
+                item = work_items.pop(0)
+                work_predictor = work_entrypoint(item)
+                assert work_predictor is predictor
+
+                # When running in serial we try to estimate time remaining.
+                data_trained_on += len(item['data'])
+                progress = float(data_trained_on) / total_data_to_train_on
+                time_elapsed = time.time() - start
+                total_time = time_elapsed / progress
+                print(
+                    "Estimated total training time: %0.2f min, "
+                    "remaining: %0.2f min" % (
+                        total_time / 60,
+                        (total_time - time_elapsed) / 60))
+
+    if worker_pool:
+        worker_pool.close()
+        worker_pool.join()
+
+    if args.percent_rank_calibration_num_peptides_per_length > 0:
+        start = time.time()
+        print("Performing percent rank calibration.")
+        predictor.calibrate_percentile_ranks(
+            num_peptides_per_length=args.percent_rank_calibration_num_peptides_per_length)
+        print("Finished calibrating percent ranks in %0.2f sec."
% ( + time.time() - start)) + predictor.save(args.out_models_dir, model_names_to_write=[]) + + +def work_entrypoint(item): + return process_work(**item) + + +def process_work( + model_group, + n_models, + allele_num, + n_alleles, + hyperparameter_set_num, + num_hyperparameter_sets, + allele, + data, + hyperparameters, + verbose, + predictor, + save_to): + + if predictor is None: + predictor = Class1AffinityPredictor() + + progress_preamble = ( + "[%2d / %2d hyperparameters] " + "[%4d / %4d alleles] " + "[%2d / %2d replicates]: %s " % ( + hyperparameter_set_num + 1, + num_hyperparameter_sets, + allele_num + 1, + n_alleles, + model_group + 1, + n_models, + allele)) + + train_data = data.sample(frac=1.0) + (model,) = predictor.fit_allele_specific_predictors( + n_models=1, + architecture_hyperparameters=hyperparameters, + allele=allele, + peptides=train_data.peptide.values, + affinities=train_data.measurement_value.values, + models_dir_for_save=save_to, + progress_preamble=progress_preamble, + verbose=verbose) + + if allele_num == 0 and model_group == 0: + # For the first model for the first allele, print the architecture. + print("*** ARCHITECTURE FOR HYPERPARAMETER SET %d***" % + (hyperparameter_set_num + 1)) + model.network(borrow=True).summary() + + return predictor + + + +if __name__ == '__main__': + run() diff --git a/requirements.txt b/requirements.txt index 8d31007f08081ee473243222f7edf4eef1b2a4b4..746d97f072e3bc272758b9eb35c82e2e2f7040d0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,9 @@ six numpy>=1.11 -pandas>=0.13.1 -Keras==2.0.9 +pandas>=0.20.3 +Keras>=2.0.9 tensorflow>=1.1.0 appdirs scikit-learn -typechecks mhcnames pyyaml diff --git a/setup.py b/setup.py index 6f45d03764462b9b7248bd59b61e402893aace5b..4d0dfb95489129d8910c4f8bf0baaf80cf491df1 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,7 @@ from setuptools import setup PY2 = (sys.version_info.major == 2) readme_dir = os.path.dirname(__file__) -readme_filename = os.path.join(readme_dir, 'README.md') +readme_filename = os.path.join(readme_dir, 'README.rst') try: with open(readme_filename, 'r') as f: @@ -33,13 +33,6 @@ except: logging.warning("Failed to load %s" % readme_filename) readme = "" -try: - import pypandoc - readme = pypandoc.convert(readme, to='rst', format='md') -except: - logging.warning("Conversion of long_description from MD to RST failed") - pass - with open('mhcflurry/__init__.py', 'r') as f: version = re.search( r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', @@ -50,12 +43,11 @@ if __name__ == '__main__': required_packages = [ 'six', 'numpy>=1.11', - 'pandas>=0.13.1', - 'Keras==2.0.9', + 'pandas>=0.20.3', + 'Keras>=2.0.9', 'appdirs', 'tensorflow>=1.1.0', 'scikit-learn', - 'typechecks', 'mhcnames', 'pyyaml', ] @@ -77,8 +69,7 @@ if __name__ == '__main__': 'mhcflurry-downloads = mhcflurry.downloads_command:run', 'mhcflurry-predict = mhcflurry.predict_command:run', 'mhcflurry-class1-train-allele-specific-models = ' - 'mhcflurry.class1_affinity_prediction.' 
- 'train_allele_specific_models_command:run', + 'mhcflurry.train_allele_specific_models_command:run', ] }, classifiers=[ @@ -97,6 +88,5 @@ if __name__ == '__main__': long_description=readme, packages=[ 'mhcflurry', - 'mhcflurry.class1_affinity_prediction', ], ) diff --git a/test/test_encodable_sequences.py b/test/test_amino_acid.py similarity index 67% rename from test/test_encodable_sequences.py rename to test/test_amino_acid.py index cc47290a08eb05e3aa8f282f7cefbe57af669b5a..5f7eb83011dd7ada8b5d1cda3f5ff9f8c8a23032 100644 --- a/test/test_encodable_sequences.py +++ b/test/test_amino_acid.py @@ -1,7 +1,8 @@ -from mhcflurry import encodable_sequences +from mhcflurry import amino_acid from nose.tools import eq_ from numpy.testing import assert_equal import numpy +import pandas letter_to_index_dict = { 'A': 0, @@ -11,7 +12,15 @@ letter_to_index_dict = { def test_index_and_one_hot_encoding(): - index_encoding = encodable_sequences.index_encoding( + letter_to_vector_df = pandas.DataFrame( + [ + [1, 0, 0,], + [0, 1, 0,], + [0, 0, 1,] + ], columns=[0, 1, 2] + ) + + index_encoding = amino_acid.index_encoding( ["AAAA", "ABCA"], letter_to_index_dict) assert_equal( index_encoding, @@ -19,13 +28,9 @@ def test_index_and_one_hot_encoding(): [0, 0, 0, 0], [0, 1, 2, 0], ]) - one_hot = encodable_sequences.fixed_vectors_encoding( + one_hot = amino_acid.fixed_vectors_encoding( index_encoding, - { - 0: numpy.array([1, 0, 0]), - 1: numpy.array([0, 1, 0]), - 2: numpy.array([0, 0, 1]), - }.get) + letter_to_vector_df) eq_(one_hot.shape, (2, 4, 3)) assert_equal( one_hot[0], diff --git a/test/test_class1_affinity_predictor.py b/test/test_class1_affinity_predictor.py index 4d7e67d69c3511e6a9d80ba9515778bc4cd89005..65f1f587316b9c56c1c30dfa0ff4c828c88c9648 100644 --- a/test/test_class1_affinity_predictor.py +++ b/test/test_class1_affinity_predictor.py @@ -59,7 +59,7 @@ def test_a1_known_epitopes_in_newly_trained_model(): ] hyperparameters = { - "max_epochs": 500, + "max_epochs": 100, "patience": 10, "early_stopping": True, "validation_split": 0.2, @@ -67,15 +67,11 @@ def test_a1_known_epitopes_in_newly_trained_model(): "random_negative_rate": 0.0, "random_negative_constant": 25, + "peptide_amino_acid_encoding": "BLOSUM62", "use_embedding": False, "kmer_size": 15, "batch_normalization": False, "locally_connected_layers": [ - { - "filters": 8, - "activation": "tanh", - "kernel_size": 3 - }, { "filters": 8, "activation": "tanh", @@ -129,14 +125,13 @@ def test_class1_affinity_predictor_a0205_memorize_training_data(): hyperparameters = dict( activation="tanh", layer_sizes=[64], - max_epochs=500, + max_epochs=100, early_stopping=False, validation_split=0.0, locally_connected_layers=[], dense_layer_l1_regularization=0.0, dropout_probability=0.0) - # First test a Class1NeuralNetwork, then a Class1AffinityPredictor. 
allele = "HLA-A*02:05" df = pandas.read_csv( @@ -163,6 +158,7 @@ def test_class1_affinity_predictor_a0205_memorize_training_data(): peptides=df.peptide.values, affinities=df.measurement_value.values, ) + predictor.calibrate_percentile_ranks(num_peptides_per_length=1000) ic50_pred = predictor.predict(df.peptide.values, allele=allele) ic50_true = df.measurement_value.values eq_(len(ic50_pred), len(ic50_true)) @@ -175,6 +171,8 @@ def test_class1_affinity_predictor_a0205_memorize_training_data(): ic50_pred_df = predictor.predict_to_dataframe( df.peptide.values, allele=allele) print(ic50_pred_df) + assert 'prediction_percentile' in ic50_pred_df.columns + assert ic50_pred_df.prediction_percentile.isnull().sum() == 0 ic50_pred_df2 = predictor.predict_to_dataframe( df.peptide.values, diff --git a/test/test_class1_neural_network.py b/test/test_class1_neural_network.py index 8747f7e00be1b417ee76045e85522949b43e58c1..835c632d7c2c4dc6cbe1f454bc8e04ee754229d2 100644 --- a/test/test_class1_neural_network.py +++ b/test/test_class1_neural_network.py @@ -2,7 +2,7 @@ import numpy import pandas numpy.random.seed(0) -from mhcflurry import Class1NeuralNetwork +from mhcflurry.class1_neural_network import Class1NeuralNetwork from nose.tools import eq_ from numpy import testing diff --git a/test/test_hyperparameters.py b/test/test_hyperparameters.py index c1af6cacc0e27752ae842804362b61dd16ebc0c2..0b887236f8be6313630c89e046978969316d7401 100644 --- a/test/test_hyperparameters.py +++ b/test/test_hyperparameters.py @@ -1,6 +1,6 @@ from numpy.testing import assert_equal -from mhcflurry.class1_affinity_prediction import Class1NeuralNetwork +from mhcflurry.class1_neural_network import Class1NeuralNetwork def test_all_combinations_of_hyperparameters(): diff --git a/test/test_percent_rank_transform.py b/test/test_percent_rank_transform.py new file mode 100644 index 0000000000000000000000000000000000000000..e30aa7bb162b6611facfd9539043d9409dcd0b06 --- /dev/null +++ b/test/test_percent_rank_transform.py @@ -0,0 +1,24 @@ +import numpy + +from mhcflurry.percent_rank_transform import PercentRankTransform + +from numpy.testing import assert_allclose, assert_equal + + +def test_percent_rank_transform(): + model = PercentRankTransform() + model.fit(numpy.arange(1000), bins=100) + assert_allclose( + model.transform([-2, 0, 50, 100, 2000]), + [0.0, 0.0, 5.0, 10.0, 100.0], + err_msg=str(model.__dict__)) + + model2 = PercentRankTransform.from_series(model.to_series()) + assert_allclose( + model2.transform([-2, 0, 50, 100, 2000]), + [0.0, 0.0, 5.0, 10.0, 100.0], + err_msg=str(model.__dict__)) + + assert_equal(model.cdf, model2.cdf) + assert_equal(model.bin_edges, model2.bin_edges) + diff --git a/test/test_predict_command.py b/test/test_predict_command.py index bc8969e64c17e3d8d991e0ffa6b0dfb6792b5a0a..cfaf2a0a67b367e693b2130d40ab6cf25460c39b 100644 --- a/test/test_predict_command.py +++ b/test/test_predict_command.py @@ -32,7 +32,7 @@ def test_csv(): for delete in deletes: os.unlink(delete) - assert_equal(result.shape, (3, 6)) + assert_equal(result.shape, (3, 7)) def test_no_csv(): @@ -55,7 +55,8 @@ def test_no_csv(): for delete in deletes: os.unlink(delete) - assert_equal(result.shape, (6, 5)) + print(result) + assert_equal(result.shape, (6, 6)) sub_result1 = result.ix[result.peptide == "SIINFEKL"].set_index("allele") assert ( sub_result1.ix["H-2-Kb"].mhcflurry1_prediction < diff --git a/test/test_speed.py b/test/test_speed.py index b84e1f37ea4214d5421ca6ee030d6213cdbaa6ba..e80b612dce4e6fa2636e345eb52cd00c0d7f0c0e 100644 --- 
a/test/test_speed.py +++ b/test/test_speed.py @@ -3,21 +3,22 @@ numpy.random.seed(0) import time import cProfile import pstats +import collections import pandas from mhcflurry import Class1AffinityPredictor +from mhcflurry.encodable_sequences import EncodableSequences from mhcflurry.common import random_peptides -NUM = 10000 - DOWNLOADED_PREDICTOR = Class1AffinityPredictor.load() +NUM = 10000 def test_speed(profile=False): - starts = {} - timings = {} - profilers = {} + starts = collections.OrderedDict() + timings = collections.OrderedDict() + profilers = collections.OrderedDict() def start(name): starts[name] = time.time() @@ -30,6 +31,7 @@ def test_speed(profile=False): if profile: profilers[name].disable() + start("first") DOWNLOADED_PREDICTOR.predict(["SIINFEKL"], allele="HLA-A*02:01") end("first") @@ -39,6 +41,16 @@ def test_speed(profile=False): DOWNLOADED_PREDICTOR.predict(peptides, allele="HLA-A*02:01") end("pred_%d" % NUM) + NUM2 = 10000 + peptides = EncodableSequences.create(random_peptides(NUM2, length=13)) + start("encode_blosum_%d" % NUM2) + peptides.variable_length_to_fixed_length_vector_encoding("BLOSUM62") + end("encode_blosum_%d" % NUM2) + + start("pred_already_encoded_%d" % NUM2) + DOWNLOADED_PREDICTOR.predict(peptides, allele="HLA-A*02:01") + end("pred_already_encoded_%d" % NUM2) + print("SPEED BENCHMARK") print("Results:\n%s" % str(pandas.Series(timings))) @@ -55,4 +67,5 @@ if __name__ == '__main__': # Leave in ipython locals().update(result) - import ipdb ; ipdb.set_trace() + import ipdb # pylint: disable=import-error + ipdb.set_trace() diff --git a/test/test_train_allele_specific_models_command.py b/test/test_train_allele_specific_models_command.py index 5c3fbcfc961d1e2690c6dc46ca79262b11fe3451..affcde7569b0ae5766ae554f50a73122f3e3894f 100644 --- a/test/test_train_allele_specific_models_command.py +++ b/test/test_train_allele_specific_models_command.py @@ -1,15 +1,14 @@ -import tempfile -import shutil -import os import json +import os +import shutil +import tempfile from numpy.testing import assert_array_less, assert_equal -from mhcflurry.class1_affinity_prediction import ( - train_allele_specific_models_command, Class1AffinityPredictor) +from mhcflurry import train_allele_specific_models_command +from mhcflurry import Class1AffinityPredictor from mhcflurry.downloads import get_path - HYPERPARAMETERS = [ { "n_models": 2, @@ -21,6 +20,7 @@ HYPERPARAMETERS = [ "random_negative_rate": 0.0, "random_negative_constant": 25, + "peptide_amino_acid_encoding": "BLOSUM62", "use_embedding": False, "kmer_size": 15, "batch_normalization": False, @@ -49,30 +49,42 @@ HYPERPARAMETERS = [ ] -def test_run(): - try: - models_dir = tempfile.mkdtemp(prefix="mhcflurry-test-models") - hyperparameters_filename = os.path.join( - models_dir, "hyperparameters.yaml") - with open(hyperparameters_filename, "w") as fd: - json.dump(HYPERPARAMETERS, fd) +def run_and_check(n_jobs=0): + models_dir = tempfile.mkdtemp(prefix="mhcflurry-test-models") + hyperparameters_filename = os.path.join( + models_dir, "hyperparameters.yaml") + with open(hyperparameters_filename, "w") as fd: + json.dump(HYPERPARAMETERS, fd) - args = [ - "--data", get_path("data_curated", "curated_training_data.csv.bz2"), - "--hyperparameters", hyperparameters_filename, - "--min-measurements-per-allele", "9000", - "--out-models-dir", models_dir, - ] - print("Running with args: %s" % args) - train_allele_specific_models_command.run(args) + args = [ + "--data", get_path("data_curated", "curated_training_data.csv.bz2"), + 
"--hyperparameters", hyperparameters_filename, + "--allele", "HLA-A*02:01", "HLA-A*01:01", "HLA-A*03:01", + "--out-models-dir", models_dir, + "--percent-rank-calibration-num-peptides-per-length", "10000", + "--parallelization-num-jobs", str(n_jobs), + ] + print("Running with args: %s" % args) + train_allele_specific_models_command.run(args) - result = Class1AffinityPredictor.load(models_dir) - predictions = result.predict( + result = Class1AffinityPredictor.load(models_dir) + predictions = result.predict( + peptides=["SLYNTVATL"], + alleles=["HLA-A*02:01"]) + assert_equal(predictions.shape, (1,)) + assert_array_less(predictions, 500) + df = result.predict_to_dataframe( peptides=["SLYNTVATL"], alleles=["HLA-A*02:01"]) - assert_equal(predictions.shape, (1,)) - assert_array_less(predictions, 500) + print(df) + assert "prediction_percentile" in df.columns + + print("Deleting: %s" % models_dir) + shutil.rmtree(models_dir) + +def Xtest_run_parallel(): + run_and_check(n_jobs=3) + - finally: - print("Deleting: %s" % models_dir) - shutil.rmtree(models_dir) +def test_run_serial(): + run_and_check(n_jobs=1) \ No newline at end of file