diff --git a/README.rst b/README.rst
index 52dd66f68424e75f53840cb7c01bb2dd318fe16e..60223fb8b3c48667453076c1b5ca1dfe97913f5c 120000
--- a/README.rst
+++ b/README.rst
@@ -1 +1 @@
-docs/package_readme/readme.generated.rst
\ No newline at end of file
+docs/package_readme/readme.generated.txt
\ No newline at end of file
diff --git a/docs/Makefile b/docs/Makefile
index 72c597ba4e4c373485ee1a41a88b4265dc419525..83ec2f208799c51ae7761baa32a418c988a353a8 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -52,7 +52,7 @@ help:
 # Added by Tim:
 .PHONY: generate
 generate:
-	sphinx-apidoc -f -o _build/ ../mhcflurry
+	sphinx-apidoc -M -f -o _build/ ../mhcflurry
 	python generate.py \
 		--out-models-cv-rst _build/_models_cv.rst \
 		--out-models-architecture-png _build/_models_architecture.png \
@@ -63,11 +63,11 @@ generate:
 # Added by Tim:
 .PHONY: readme
 readme: text
-	rm -f package_readme/readme.generated.rst
+	rm -f package_readme/readme.generated.txt
 	cat package_readme/readme_header.rst \
 		_build/text/package_readme/readme.template.txt \
-		> package_readme/readme.generated.rst
-	chmod 444 package_readme/readme.generated.rst # read only
+		> package_readme/readme.generated.txt
+	chmod 444 package_readme/readme.generated.txt # read only
 
 .PHONY: clean
 clean:
diff --git a/docs/README.md b/docs/README.md
index e7b6e9539acfb51f11816e180f7001a05d0c2981..ddc5ffc657352a9cb3ebe1888f4d78dedc980ed0 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,5 +1,8 @@
 # MHCflurry documentation
 
+Due to our use of `sphinxcontrib-autorun2` we unfortunately require Python 2.7
+to build the docs. Python 3 is not supported.
+
 To generate Sphinx documentation, from this directory run:
 
 ```
@@ -17,6 +20,6 @@ To build this file, run:
 $ make readme
 ```
 
-This will write `docs/package_readme/readme.generated.rst`. The main
+This will write `docs/package_readme/readme.generated.txt`. The main
 [README.rst](../README.rst) is symlinked to this file.
 
diff --git a/docs/commandline_tools.rst b/docs/commandline_tools.rst
index 3b36a80a6ca54d309e50f9f72a78fa6b9b11d134..2b3c64567c81e432de053c04670c680c3c76b4fa 100644
--- a/docs/commandline_tools.rst
+++ b/docs/commandline_tools.rst
@@ -1,14 +1,21 @@
 Command-line reference
 ============================
 
+See also the :ref:`tutorial <commandline_tutorial>`.
+
 .. _mhcflurry-predict:
+
 .. autoprogram:: mhcflurry.predict_command:parser
-   :prog: mhcflurry-predict
+    :prog: mhcflurry-predict
 
 .. _mhcflurry-class1-train-allele-specific-models:
+
 .. autoprogram:: mhcflurry.train_allele_specific_models_command:parser
-   :prog: mhcflurry-class1-train-allele-specific-models
+    :prog: mhcflurry-class1-train-allele-specific-models
 
 .. _mhcflurry-downloads:
+
 .. autoprogram:: mhcflurry.downloads_command:parser
-   :prog: mhcflurry-downloads
+    :prog: mhcflurry-downloads
+
diff --git a/docs/commandline_tutorial.rst b/docs/commandline_tutorial.rst
index 0f33229f99eca522762b7c9607a87addd53bef21..253f417226b938ddf3c205f90896c168e29e6373 100644
--- a/docs/commandline_tutorial.rst
+++ b/docs/commandline_tutorial.rst
@@ -1,6 +1,10 @@
+.. _commandline_tutorial:
+
 Command-line tutorial
 =====================
 
+.. _downloading:
+
 Downloading models
 ------------------
 
@@ -12,17 +16,22 @@ are distributed separately from the pip package and may be downloaded with the
 
     $ mhcflurry-downloads fetch models_class1
 
+Files downloaded with :ref:`mhcflurry-downloads` are stored in a platform-specific
+directory. To get the path to downloaded data, you can use:
+
+.. command-output:: mhcflurry-downloads path models_class1
+   :nostderr:
+
 We also release a few other "downloads," such as curated training data and some
 experimental models. To see what you have downloaded, run:
 
 .. command-output:: mhcflurry-downloads info
    :nostderr:
 
-Files downloaded with :ref:`mhcflurry-downloads` are stored in a platform-specific
-directory. To get the path to downloaded data, you can use:
+.. note::
 
-.. command-output:: mhcflurry-downloads path models_class1
-   :nostderr:
+    The code we use for *generating* the downloads is in the
+    ``downloads_generation`` directory in the repository.
 
 
 Generating predictions
diff --git a/docs/conf.py b/docs/conf.py
index 7df14e7aab2cd62b9f08175b337e121eb961a672..84b8ec5e17e03d873ce05e8cb45e2db5c65e0da4 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -16,6 +16,13 @@ import sys
 import os
 import re
+import textwrap
+import logging
+
+# Hack added by tim for bug in autoprogram extension under Python 2.
+from sphinx.util.pycompat import indent
+textwrap.indent = indent
+logging.disable(logging.ERROR)
 
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
@@ -38,7 +45,8 @@ extensions = [
     'sphinx.ext.viewcode',
     'sphinx.ext.githubpages',
     'numpydoc',
-    'sphinx_autorun',
+    #'sphinx_autorun',
+    'sphinxcontrib.autorun2',
     'sphinxcontrib.programoutput',
     'sphinxcontrib.autoprogram',
 ]
@@ -80,6 +88,7 @@ release = version
 
 # Added by tim
 autodoc_member_order = 'bysource'
+autoclass_content = 'both'
 
 # Added by tim
 suppress_warnings = ['image.nonlocal_uri']
@@ -176,7 +185,7 @@ html_theme = 'sphinx_rtd_theme'
 # If not None, a 'Last updated on:' timestamp is inserted at every page
 # bottom, using the given strftime format.
 # The empty string is equivalent to '%b %d, %Y'.
-#html_last_updated_fmt = None
+html_last_updated_fmt = ""
 
 # If true, SmartyPants will be used to convert quotes and dashes to
 # typographically correct entities.
@@ -190,10 +199,10 @@ html_theme = 'sphinx_rtd_theme'
 #html_additional_pages = {}
 
 # If false, no module index is generated.
-#html_domain_indices = True
+html_domain_indices = False
 
 # If false, no index is generated.
-#html_use_index = True
+html_use_index = False
 
 # If true, the index is split into individual pages for each letter.
 #html_split_index = False
diff --git a/docs/generate.py b/docs/generate.py
index edce8f1e835bf41796ba83d71de7ebf4c52cf647..848d04ce5a15c0d8391a4c1bd81e90b4a5229939 100644
--- a/docs/generate.py
+++ b/docs/generate.py
@@ -107,7 +107,7 @@ def go(argv):
     df = pandas.read_csv(args.cv_summary_csv)
     sub_df = df.loc[
         df.kind == "ensemble"
-    ].sort_values("allele").dropna().copy().reset_index(drop=True)
+    ].sort_values("allele").copy().reset_index(drop=True)
     sub_df["Allele"] = sub_df.allele
     sub_df["CV Training Size"] = sub_df.train_size.astype(int)
     sub_df["AUC"] = sub_df.auc
diff --git a/docs/index.rst b/docs/index.rst
index b98a0d8906b77bdd8cc9ae98bd4704f187c8001f..11190d9fe16b57c86218cd55dd9f2a81d44110e2 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,8 +1,6 @@
 MHCflurry documentation
 =====================================
 
-Contents:
-
 .. toctree::
    :maxdepth: 3
 
@@ -14,11 +12,3 @@ Contents:
 
    commandline_tools
    api
-
-Indices and tables
-==================
-
-* :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
-
diff --git a/docs/intro.rst b/docs/intro.rst
index b3ff684e168aca51d4d79c4839100aa169014187..2644c317bf22b1583db285b0af2370ac86747ec5 100644
--- a/docs/intro.rst
+++ b/docs/intro.rst
@@ -1,21 +1,19 @@
 Introduction and setup
 =======================
 
-MHCflurry is a Python package for peptide/MHC I binding affinity prediction. It
-provides competitive accuracy with a fast, documented, open source
-implementation.
+MHCflurry is an open source package for peptide/MHC I binding affinity prediction. It
+provides competitive accuracy with a fast and documented implementation.
 
 You can download pre-trained MHCflurry models fit to affinity measurements
-deposited in IEDB. See the "downloads_generation/models_class1" directory in the
-repository for the workflow used to train these predictors. Users with their own
-data can also fit their own MHCflurry models.
+deposited in IEDB or train an MHCflurry predictor on your own data.
 
 Currently only allele-specific prediction is implemented, in which separate models
 are trained for each allele. The released models therefore support a fixed set of common
-class I alleles for which sufficient published training data is available.
+class I alleles for which sufficient published training data is available
+(see :ref:`models_supported_alleles`\ ).
 
-MHCflurry supports Python versions 2.7 and 3.4+. It uses the Keras neural
-network library via either the Tensorflow or Theano backends. GPUs may
+MHCflurry supports Python versions 2.7 and 3.4+. It uses the `Keras <https://keras.io>`__
+neural network library via either the Tensorflow or Theano backends. GPUs may
 optionally be used for a generally modest speed improvement.
 
 If you find MHCflurry useful in your research please cite:
@@ -30,22 +28,22 @@ Installation (pip)
 
 Install the package:
 
-::
+.. code-block:: shell
 
-    pip install mhcflurry
+    $ pip install mhcflurry
 
 Then download our datasets and trained models:
 
-::
+.. code-block:: shell
 
-    mhcflurry-downloads fetch
+    $ mhcflurry-downloads fetch
 
 From a checkout you can run the unit tests with:
 
-::
+.. code-block:: shell
 
-    pip install nose
-    nosetests .
+    $ pip install nose
+    $ nosetests .
 
 
 Using conda
 ===========
 
 You can alternatively get up and running with a `conda <https://conda.io/docs/>`
 environment as follows. Some users have reported that this can avoid problems
 installing tensorflow.
 
-::
+.. code-block:: shell
 
-    conda create -q -n mhcflurry-env python=3.6 'tensorflow>=1.1.2'
-    source activate mhcflurry-env
+    $ conda create -q -n mhcflurry-env python=3.6 'tensorflow>=1.1.2'
+    $ source activate mhcflurry-env
 
 Then continue as above:
 
-::
+.. code-block:: shell
 
-    pip install mhcflurry
-    mhcflurry-downloads fetch
+    $ pip install mhcflurry
+    $ mhcflurry-downloads fetch
 
diff --git a/docs/models.rst b/docs/models.rst
index 5287daf0c353b2f2a254a0b84c15efd650b4ad8b..4f7dee9edb93dcf686b830795aad79bf93c74bd7 100644
--- a/docs/models.rst
+++ b/docs/models.rst
@@ -21,7 +21,6 @@ Architecture diagram:
 
 .. image:: /_build/_models_architecture.png
-
 
 Cross validation performance
 -------------------------------------------------------------
 
diff --git a/docs/models_supported_alleles.rst b/docs/models_supported_alleles.rst
index 3bca50762a7670418db9e8e2d9ca93bb6d2a0b34..0dd3c94ed86545a3781a904cdcf3b064b8189648 100644
--- a/docs/models_supported_alleles.rst
+++ b/docs/models_supported_alleles.rst
@@ -1,3 +1,5 @@
+.. _models_supported_alleles:
+
 Supported alleles and peptide lengths
 =====================================
 
diff --git a/docs/package_readme/readme.generated.rst b/docs/package_readme/readme.generated.txt
similarity index 95%
rename from docs/package_readme/readme.generated.rst
rename to docs/package_readme/readme.generated.txt
index e431863ffeac03366c0b82af53aed69372250395..6f16f412cd2e08362c03ad4d1db0374e9a327e1f 100644
--- a/docs/package_readme/readme.generated.rst
+++ b/docs/package_readme/readme.generated.txt
@@ -11,20 +11,19 @@ mhcflurry
 Open source neural network models for peptide-MHC binding affinity
 prediction
 
-MHCflurry is a Python package for peptide/MHC I binding affinity
-prediction. It provides competitive accuracy with a fast, documented,
-open source implementation.
+MHCflurry is an open source package for peptide/MHC I binding affinity
+prediction. It provides competitive accuracy with a fast and
+documented implementation.
 
 You can download pre-trained MHCflurry models fit to affinity
-measurements deposited in IEDB. See the
-“downloads_generation/models_class1” directory in the repository for
-the workflow used to train these predictors. Users with their own data
-can also fit their own MHCflurry models.
+measurements deposited in IEDB or train an MHCflurry predictor on your
+own data.
 
 Currently only allele-specific prediction is implemented, in which
 separate models are trained for each allele. The released models
 therefore support a fixed set of common class I alleles for which
-sufficient published training data is available.
+sufficient published training data is available (see Supported alleles
+and peptide lengths).
 
 MHCflurry supports Python versions 2.7 and 3.4+. It uses the Keras
 neural network library via either the Tensorflow or Theano backends.
@@ -42,16 +41,16 @@ Installation (pip)
 
 Install the package:
 
-   pip install mhcflurry
+   $ pip install mhcflurry
 
 Then download our datasets and trained models:
 
-   mhcflurry-downloads fetch
+   $ mhcflurry-downloads fetch
 
 From a checkout you can run the unit tests with:
 
-   pip install nose
-   nosetests .
+   $ pip install nose
+   $ nosetests .
 
 
 Using conda
@@ -61,13 +60,13 @@ You can alternatively get up and running with a conda environment as
 follows. Some users have reported that this can avoid problems
 installing tensorflow.
 
-   conda create -q -n mhcflurry-env python=3.6 'tensorflow>=1.1.2'
-   source activate mhcflurry-env
+   $ conda create -q -n mhcflurry-env python=3.6 'tensorflow>=1.1.2'
+   $ source activate mhcflurry-env
 
 Then continue as above:
 
-   pip install mhcflurry
-   mhcflurry-downloads fetch
+   $ pip install mhcflurry
+   $ mhcflurry-downloads fetch
 
 
 Command-line tutorial
@@ -110,6 +109,9 @@ specific directory. To get the path to downloaded data, you can use:
    $ mhcflurry-downloads path models_class1
    /Users/tim/Library/Application Support/mhcflurry/4/1.0.0/models_class1/
 
+Note: The code we use for generating the downloads is in the
+  "downloads_generation" directory in the repository.
+
 Generating predictions
 **********************
@@ -229,8 +231,8 @@ information.
      --mhc-peptide-lengths 8,9,10,11
      --extract-subsequences
      --output-csv /tmp/subsequence_predictions.csv
-   2017-12-21 14:26:39,143 - mhctools.cli.args - INFO - Building MHC binding prediction type for alleles ['HLA-A*02:01', 'HLA-A*03:01'] and epitope lengths [8, 9, 10, 11]
-   2017-12-21 14:26:45,471 - mhctools.cli.script - INFO -
+   2017-12-21 16:29:58,003 - mhctools.cli.args - INFO - Building MHC binding prediction type for alleles ['HLA-A*02:01', 'HLA-A*03:01'] and epitope lengths [8, 9, 10, 11]
+   2017-12-21 16:30:03,062 - mhctools.cli.script - INFO -
    ...
    [1192 rows x 8 columns]
    Wrote: /tmp/subsequence_predictions.csv
@@ -240,8 +242,8 @@ specified lengths:
    $ head -n 3 /tmp/subsequence_predictions.csv
    source_sequence_name,offset,peptide,allele,affinity,percentile_rank,prediction_method_name,length
-   protein2,42,AARYSAFY,HLA-A*02:01,33829.639361000336,73.7865875,mhcflurry,8
-   protein2,42,AARYSAFYN,HLA-A*02:01,29747.41688667342,60.34871249999998,mhcflurry,9
+   protein2,42,AARYSAFY,HLA-A*03:01,5744.344274398671,4.739962499999998,mhcflurry,8
+   protein2,42,AARYSAFYN,HLA-A*03:01,10576.536440802967,8.399187499999996,mhcflurry,9
 
 
 Python library tutorial
 ***********************
@@ -445,8 +447,6 @@ peptides of length 8-15 and the following 124 alleles:
    Mamu-B*87:01, Patr-A*01:01, Patr-A*03:01, Patr-A*04:01, Patr-A*07:01,
    Patr-A*09:01, Patr-B*01:01, Patr-B*13:01, Patr-B*24:01
 
-[image: Build Status][image] [image: Coverage Status][image]
-
 mhcflurry
 =========
diff --git a/docs/package_readme/readme.template.rst b/docs/package_readme/readme.template.rst
index a3d7a410d42a08926f9a64ec3e690dc87f919501..69bb924d9649a54c1156328bdc88b6a3471c94e9 100644
--- a/docs/package_readme/readme.template.rst
+++ b/docs/package_readme/readme.template.rst
@@ -8,7 +8,6 @@
 .. include:: /python_tutorial.rst
 .. include:: /models_supported_alleles.rst
 
-|Build Status| |Coverage Status|
 
 mhcflurry
 =========
diff --git a/docs/python_tutorial.rst b/docs/python_tutorial.rst
index 7d13014862d731151ef155661a06c95b8dcdef3f..f8a9379984b9fec2ce7a31a41ac60b29bb8c8881 100644
--- a/docs/python_tutorial.rst
+++ b/docs/python_tutorial.rst
@@ -1,177 +1,189 @@
 Python library tutorial
 =======================
 
+Predicting
+----------
+
 The MHCflurry Python API exposes additional options and features beyond those
 supported by the commandline tools. This tutorial gives a basic overview of the
 most important functionality. See the :ref:`API-documentation` for further details.
 
 The `~mhcflurry.Class1AffinityPredictor` class is the primary user-facing interface.
-
+Use the `~mhcflurry.Class1AffinityPredictor.load` static method to load a
+trained predictor from disk. With no arguments this method will load the predictor
+released with MHCflurry (see :ref:`downloading`\ ). If you pass a path to a
+models directory, then it will load that predictor instead.
 
 .. runblock:: pycon
 
-    >>> import mhcflurry
-    >>> print("MHCflurry version: %s" % (mhcflurry.__version__))
-    >>>
-    >>> # Load downloaded predictor
-    >>> predictor = mhcflurry.Class1AffinityPredictor.load()
-    >>> print(predictor.supported_alleles)
-
-
-
-::
-
-
-    # coding: utf-8
-
-    # In[22]:
-
-    import pandas
-    import numpy
-    import seaborn
-    import logging
-    from matplotlib import pyplot
-
-    import mhcflurry
-
-
-
-    # # Download data and models
-
-    # In[2]:
-
-    get_ipython().system('mhcflurry-downloads fetch')
-
-
-    # # Making predictions with `Class1AffinityPredictor`
-
-    # In[3]:
-
-    help(mhcflurry.Class1AffinityPredictor)
-
-
-    # In[4]:
-
-    downloaded_predictor = mhcflurry.Class1AffinityPredictor.load()
-
+    >>> from mhcflurry import Class1AffinityPredictor
+    >>> predictor = Class1AffinityPredictor.load()
+    >>> predictor.supported_alleles[:10]
 
-    # In[5]:
+With a predictor loaded we can now generate some binding predictions:
 
-    downloaded_predictor.predict(allele="HLA-A0201", peptides=["SIINFEKL", "SIINFEQL"])
+.. runblock:: pycon
 
+    >>> predictor.predict(allele="HLA-A0201", peptides=["SIINFEKL", "SIINFEQL"])
 
-    # In[6]:
+.. note::
 
-    downloaded_predictor.predict_to_dataframe(allele="HLA-A0201", peptides=["SIINFEKL", "SIINFEQL"])
+    MHCflurry normalizes allele names using the `mhcnames <https://github.com/hammerlab/mhcnames>`__
+    package. Names like ``HLA-A0201`` or ``A*02:01`` will be
+    normalized to ``HLA-A*02:01``, so most naming conventions can be used
+    with methods such as `~mhcflurry.Class1AffinityPredictor.predict`.
 
-    # In[7]:
-
-    downloaded_predictor.predict_to_dataframe(alleles=["HLA-A0201", "HLA-B*57:01"], peptides=["SIINFEKL", "SIINFEQL"])
-
-
-    # In[8]:
-
-    downloaded_predictor.predict_to_dataframe(
-        allele="HLA-A0201",
-        peptides=["SIINFEKL", "SIINFEQL"],
-        include_individual_model_predictions=True)
-
-
-    # In[9]:
-
-    downloaded_predictor.predict_to_dataframe(
-        allele="HLA-A0201",
-        peptides=["SIINFEKL", "SIINFEQL", "TAAAALANGGGGGGGG"],
-        throw=False) # Without throw=False, you'll get a ValueError for invalid peptides or alleles
-
-
-    # # Instantiating a `Class1AffinityPredictor` from a saved model on disk
-
-    # In[10]:
-
-    models_dir = mhcflurry.downloads.get_path("models_class1", "models")
-    models_dir
+For more detailed results, we can use
+`~mhcflurry.Class1AffinityPredictor.predict_to_dataframe`.
 
+.. runblock:: pycon
 
-    # In[11]:
+    >>> predictor.predict_to_dataframe(allele="HLA-A0201", peptides=["SIINFEKL", "SIINFEQL"])
 
-    # This will be the same predictor we instantiated above. We're just being explicit about what models to load.
-    downloaded_predictor = mhcflurry.Class1AffinityPredictor.load(models_dir)
-    downloaded_predictor.predict(["SIINFEKL", "SIQNPEKP", "SYNFPEPI"], allele="HLA-A0301")
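+If you pass an invalid peptide (such as the 16-mer below) or an unsupported
+allele, you will get a ``ValueError``. Pass ``throw=False`` to suppress this
+error:
+
+.. runblock:: pycon
+
+    >>> predictor.predict_to_dataframe(
+    ...     allele="HLA-A0201",
+    ...     peptides=["SIINFEKL", "SIINFEQL", "TAAAALANGGGGGGGG"],
+    ...     throw=False)
+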
+Instead of a single allele and multiple peptides, we may need predictions for
+allele/peptide pairs. We can predict across pairs by specifying
+the `alleles` argument instead of `allele`. The list of alleles
+must be the same length as the list of peptides (i.e. it is predicting over pairs,
+*not* taking the cross product).
+
+.. runblock:: pycon
+
-    # # Fit a model: first load some data
+    >>> predictor.predict(alleles=["HLA-A0201", "HLA-B*57:01"], peptides=["SIINFEKL", "SIINFEQL"])
 
-    # In[12]:
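+`~mhcflurry.Class1AffinityPredictor.predict_to_dataframe` accepts allele/peptide
+pairs the same way:
+
+.. runblock:: pycon
+
+    >>> predictor.predict_to_dataframe(
+    ...     alleles=["HLA-A0201", "HLA-B*57:01"],
+    ...     peptides=["SIINFEKL", "SIINFEQL"])
+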
+Training
+--------
+
-    # This is the data the downloaded models were trained on
-    data_path = mhcflurry.downloads.get_path("data_curated", "curated_training_data.csv.bz2")
-    data_path
+Let's fit our own MHCflurry predictor. First we need some training data. If you
+haven't already, run this in a shell to download the MHCflurry training data:
+
+.. code-block:: shell
 
-    # In[13]:
+    $ mhcflurry-downloads fetch data_curated
 
-    data_df = pandas.read_csv(data_path)
-    data_df
+We can get the path to this data from Python using `mhcflurry.downloads.get_path`:
 
+.. runblock:: pycon
 
-    # # Fit a model: Low level `Class1NeuralNetwork` interface
+    >>> from mhcflurry.downloads import get_path
+    >>> data_path = get_path("data_curated", "curated_training_data.csv.bz2")
+    >>> data_path
 
-    # In[14]:
+Now let's load it with pandas and filter to reasonably-sized peptides:
 
-    # We'll use mostly the default hyperparameters here. Could also specify them as kwargs.
-    new_model = mhcflurry.Class1NeuralNetwork(layer_sizes=[16])
-    new_model.hyperparameters
+.. runblock:: pycon
 
+    >>> import pandas
+    >>> df = pandas.read_csv(data_path)
+    >>> df = df.loc[(df.peptide.str.len() >= 8) & (df.peptide.str.len() <= 15)]
+    >>> df.head(5)
 
-    # In[16]:
+We'll make an untrained `~mhcflurry.Class1AffinityPredictor` and then call
+`~mhcflurry.Class1AffinityPredictor.fit_allele_specific_predictors` to fit
+some models.
 
-    train_data = data_df.loc[
-        (data_df.allele == "HLA-B*57:01") &
-        (data_df.peptide.str.len() >= 8) &
-        (data_df.peptide.str.len() <= 15)
-    ]
-    get_ipython().magic('time new_model.fit(train_data.peptide.values, train_data.measurement_value.values)')
+.. runblock:: pycon
 
+    >>> new_predictor = Class1AffinityPredictor()
+    >>> single_allele_train_data = df.loc[df.allele == "HLA-B*57:01"].sample(100)
+    >>> new_predictor.fit_allele_specific_predictors(
+    ...     n_models=1,
+    ...     architecture_hyperparameters={
+    ...         "layer_sizes": [16],
+    ...         "max_epochs": 5,
+    ...         "random_negative_constant": 5,
+    ...     },
+    ...     peptides=single_allele_train_data.peptide.values,
+    ...     affinities=single_allele_train_data.measurement_value.values,
+    ...     allele="HLA-B*57:01")
+
+The `~mhcflurry.Class1AffinityPredictor.fit_allele_specific_predictors` method
+can be called any number of times on the same instance to build up ensembles
+of models across alleles. The `architecture_hyperparameters` we specified are
+for demonstration purposes; to fit real models you would usually train for
+more epochs.
+
+Now we can generate predictions:
 
-    # In[17]:
-
-    new_model.predict(["SYNPEPII"])
+.. runblock:: pycon
 
+    >>> new_predictor.predict(["SYNPEPII"], allele="HLA-B*57:01")
 
-    # # Fit a model: high level `Class1AffinityPredictor` interface
+We can save our predictor to a directory on disk by running:
 
-    # In[18]:
+.. runblock:: pycon
 
-    affinity_predictor = mhcflurry.Class1AffinityPredictor()
+    >>> new_predictor.save("/tmp/new-predictor")
 
-    # This can be called any number of times, for example on different alleles, to build up the ensembles.
-    affinity_predictor.fit_allele_specific_predictors(
-        n_models=1,
-        architecture_hyperparameters={"layer_sizes": [16], "max_epochs": 10},
-        peptides=train_data.peptide.values,
-        affinities=train_data.measurement_value.values,
-        allele="HLA-B*57:01",
-    )
+and restore it:
 
+.. runblock:: pycon
 
-    # In[19]:
+    >>> new_predictor2 = Class1AffinityPredictor.load("/tmp/new-predictor")
+    >>> new_predictor2.supported_alleles
 
-    affinity_predictor.predict(["SYNPEPII"], allele="HLA-B*57:01")
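+The released predictor uses an ensemble of models for each allele. To see the
+individual model predictions rather than just the ensemble summary, pass
+``include_individual_model_predictions=True``:
+
+.. runblock:: pycon
+
+    >>> predictor.predict_to_dataframe(
+    ...     allele="HLA-A0201",
+    ...     peptides=["SIINFEKL", "SIINFEQL"],
+    ...     include_individual_model_predictions=True)
+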
+Lower-level interface
+---------------------
+
+The high-level `~mhcflurry.Class1AffinityPredictor` delegates to low-level
+`~mhcflurry.Class1NeuralNetwork` objects, each of which represents
+a single neural network. The purpose of `~mhcflurry.Class1AffinityPredictor`
+is to implement several important features:
+
-    # # Save and restore the fit model
+ensembles
+    More than one neural network can be used to generate each prediction. The
+    predictions returned to the user are the geometric mean of the individual
+    model predictions. This gives higher accuracy in most situations.
+
-    # In[20]:
+multiple alleles
+    A `~mhcflurry.Class1NeuralNetwork` generates predictions for only a single
+    allele. The `~mhcflurry.Class1AffinityPredictor` maps alleles to the
+    relevant `~mhcflurry.Class1NeuralNetwork` instances.
+
-    get_ipython().system('mkdir /tmp/saved-affinity-predictor')
-    affinity_predictor.save("/tmp/saved-affinity-predictor")
-    get_ipython().system('ls /tmp/saved-affinity-predictor')
+serialization
+    Loading and saving predictors is implemented in `~mhcflurry.Class1AffinityPredictor`.
+
+Sometimes it's easiest to work directly with `~mhcflurry.Class1NeuralNetwork`.
+Here is a simple example of doing so:
 
-    # In[21]:
+.. runblock:: pycon
 
-    affinity_predictor2 = mhcflurry.Class1AffinityPredictor.load("/tmp/saved-affinity-predictor")
-    affinity_predictor2.predict(["SYNPEPII"], allele="HLA-B*57:01")
+    >>> from mhcflurry import Class1NeuralNetwork
+    >>> network = Class1NeuralNetwork()
+    >>> network.fit(
+    ...     single_allele_train_data.peptide.values,
+    ...     single_allele_train_data.measurement_value.values,
+    ...     verbose=0)
+    >>> network.predict(["SIINFEKLL"])
diff --git a/docs/requirements.txt b/docs/requirements.txt
index c4a411cc460e7e7f96605f7f8ef5df57e2c91429..a88547504c1d7ede3af6ca6d67010fc0614ade96 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,7 +1,9 @@
-sphinx-autorun
+sphinx
+sphinxcontrib-autorun2
 sphinxcontrib-programoutput
 sphinxcontrib-autoprogram
-sphinx
+sphinx-rtd-theme
 numpydoc
 pypandoc
 mhctools
+pydot
diff --git a/mhcflurry/__init__.py b/mhcflurry/__init__.py
index e743fb33ac4c3b80e66dd80f0e15ac167320316c..600882fd9693f56f8cc89f45c91f8906ee899d3d 100644
--- a/mhcflurry/__init__.py
+++ b/mhcflurry/__init__.py
@@ -1,26 +1,10 @@
-# Copyright (c) 2015. Mount Sinai School of Medicine
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from mhcflurry.class1_affinity_predictor import (
-    Class1AffinityPredictor)
-from mhcflurry.class1_neural_network import (
-    Class1NeuralNetwork)
+from mhcflurry.class1_affinity_predictor import Class1AffinityPredictor
+from mhcflurry.class1_neural_network import Class1NeuralNetwork
 
 __version__ = "1.0.0"
 
 __all__ = [
-    "Class1NeuralNetwork",
-    "Class1AffinityPredictor",
     "__version__",
+    "Class1AffinityPredictor",
+    "Class1NeuralNetwork",
 ]
diff --git a/mhcflurry/class1_affinity_predictor.py b/mhcflurry/class1_affinity_predictor.py
index 591419c0d63ac8331d18359f188bbe1eb569e32e..1c523e35be470a8eecdf5ae218a9a110b29ec075 100644
--- a/mhcflurry/class1_affinity_predictor.py
+++ b/mhcflurry/class1_affinity_predictor.py
@@ -6,6 +6,7 @@ import sys
 import time
 import warnings
 from os.path import join, exists
+from os import mkdir
 
 import mhcnames
 import numpy
@@ -24,12 +25,11 @@ from mhcflurry.regression_target import to_ic50
 class Class1AffinityPredictor(object):
     """
     High-level interface for peptide/MHC I binding affinity prediction.
-
-    This is the class most users will want to use.
-
-    This class delegates to one or more `Class1NeuralNetwork` instances.
-    It supports prediction across multiple alleles using ensembles of single-
-    or pan-allele predictors.
+
+    This class manages low-level `Class1NeuralNetwork` instances, each of which
+    wraps a single Keras network. The purpose of `Class1AffinityPredictor` is to
+    implement ensembles, handling of multiple alleles, and predictor loading and
+    saving.
     """
     def __init__(
         self,
@@ -186,7 +186,8 @@ class Class1AffinityPredictor(object):
 
     def save(self, models_dir, model_names_to_write=None):
         """
-        Serialize the predictor to a directory on disk.
+        Serialize the predictor to a directory on disk. If the directory does
+        not exist it will be created.
 
        The serialization format consists of a file called "manifest.csv" with
        the configurations of each Class1NeuralNetwork, along with per-network
@@ -213,6 +214,9 @@ class Class1AffinityPredictor(object):
             # Write all models
             model_names_to_write = self.manifest_df.model_name.values
 
+        if not exists(models_dir):
+            mkdir(models_dir)
+
         sub_manifest_df = self.manifest_df.ix[
             self.manifest_df.model_name.isin(model_names_to_write)
         ]
diff --git a/mhcflurry/common.py b/mhcflurry/common.py
index 10c349bf96a9538946e7aa0b935812bd743fe505..abc2a72c7b7efc1893f2f3630c48b3166169d5ef 100644
--- a/mhcflurry/common.py
+++ b/mhcflurry/common.py
@@ -1,25 +1,7 @@
-# Copyright (c) 2016. Mount Sinai School of Medicine
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 from __future__ import print_function, division, absolute_import
-import itertools
 import collections
 import logging
-import hashlib
-import time
 import sys
-from os import environ
 
 import numpy
 import pandas
diff --git a/mhcflurry/downloads.py b/mhcflurry/downloads.py
index 21800feba7d75582432c8ffc6fac9db8bf48bc5f..e88675ed00b78718f0687e28ed4911473593e13c 100644
--- a/mhcflurry/downloads.py
+++ b/mhcflurry/downloads.py
@@ -1,16 +1,3 @@
-# Copyright (c) 2016. Mount Sinai School of Medicine
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 """
 Manage local downloaded data.
 """
diff --git a/mhcflurry/encodable_sequences.py b/mhcflurry/encodable_sequences.py
index 64c1a89ad87e8c011b033c918944883bb86b9f71..aab2b0fe1874b288033afa4419de1919e29f4e1e 100644
--- a/mhcflurry/encodable_sequences.py
+++ b/mhcflurry/encodable_sequences.py
@@ -1,17 +1,3 @@
-# Copyright (c) 2016. Mount Sinai School of Medicine
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 from __future__ import (
     print_function,
     division,
@@ -19,6 +5,7 @@ from __future__ import (
 )
 
 import math
+from six import string_types
 
 import numpy
 import pandas
@@ -46,6 +33,8 @@ class EncodableSequences(object):
         return klass(sequences)
 
     def __init__(self, sequences):
+        if not all(isinstance(obj, string_types) for obj in sequences):
+            raise ValueError("Sequence of strings is required")
         self.sequences = numpy.array(sequences)
         self.encoding_cache = {}
         self.fixed_sequence_length = None
diff --git a/mhcflurry/hyperparameters.py b/mhcflurry/hyperparameters.py
index c0ddaab4c7445d34671ec8b1df11c3a4c3433c04..cc5950d5c175a35c08eb9cfea0d44d64e3fdeb65 100644
--- a/mhcflurry/hyperparameters.py
+++ b/mhcflurry/hyperparameters.py
@@ -1,16 +1,3 @@
-# Copyright (c) 2016. Mount Sinai School of Medicine
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
 from __future__ import (
     print_function,
     division,
diff --git a/mhcflurry/regression_target.py b/mhcflurry/regression_target.py
index f1d9d9f03506770e25a446d13b5a03e775098754..d1189178c54df06d32ae1b45b2ee23efeed9306e 100644
--- a/mhcflurry/regression_target.py
+++ b/mhcflurry/regression_target.py
@@ -1,17 +1,3 @@
-# Copyright (c) 2016. Mount Sinai School of Medicine
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 import numpy