From 95833a7953dff38b6bd66e4075b5b3c0d6a5f7fd Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Tue, 19 Dec 2017 18:05:46 -0500
Subject: [PATCH] Improve docs

---
 docs/_readme.md         |  4 ++--
 docs/intro.rst          |  8 ++++----
 mhcflurry/amino_acid.py | 44 ++++++++++++++++++++---------------------
 3 files changed, 27 insertions(+), 29 deletions(-)

diff --git a/docs/_readme.md b/docs/_readme.md
index f0289ece..00d2a726 100644
--- a/docs/_readme.md
+++ b/docs/_readme.md
@@ -10,9 +10,9 @@ Open source peptide/MHC I binding affinity prediction
 Introduction and setup
 ----------------------
 
-MHCflurry is a peptide/MHC I binding affinity prediction package written in Python. It aims to provide state of the art accuracy in a documented, fast, and open source implementation.
+MHCflurry is a peptide/MHC I binding affinity prediction package written in Python. It aims to provide state-of-the-art accuracy with a documented, fast, and open source implementation.
 
-MHCflurry users may download trained predictors fit to affinity measurements deposited in IEDB. The complete workflow to generate these models is available in the "downloads\_generation/models\_class1" directory in the repository. It is also easy for users with their own data to fit their own models.
+MHCflurry users may download trained predictors fit to affinity measurements deposited in IEDB. See the "downloads\_generation/models\_class1" directory in the repository for the workflow used to train these predictors. It is also easy for users with their own data to fit their own models.
 
 Currently only allele-specific prediction is implemented, in which separate models are trained for each allele. The released models therefore support a fixed set of common class I alleles for which sufficient published training data is available.
 
diff --git a/docs/intro.rst b/docs/intro.rst
index 71aac925..0ed0cff9 100644
--- a/docs/intro.rst
+++ b/docs/intro.rst
@@ -2,13 +2,13 @@ Introduction and setup
 =======================
 
 MHCflurry is a peptide/MHC I binding affinity prediction package written in
-Python. It aims to provide state of the art accuracy in a documented, fast, and
+Python. It aims to provide state-of-the-art accuracy with a documented, fast, and
 open source implementation.
 
 MHCflurry users may download trained predictors fit to affinity measurements
-deposited in IEDB. The complete workflow to generate these models
-is available in the "downloads_generation/models_class1" directory in the
-repository. It is also easy for users with their own data to fit their own models.
+deposited in IEDB. See the "downloads_generation/models_class1" directory in the
+repository for the workflow used to train these predictors. It is also easy
+for users with their own data to fit their own models.
 
 Currently only allele-specific prediction is implemented, in which separate models
 are trained for each allele. The released models therefore support a fixed set of common
diff --git a/mhcflurry/amino_acid.py b/mhcflurry/amino_acid.py
index ffa5cffc..1077c644 100644
--- a/mhcflurry/amino_acid.py
+++ b/mhcflurry/amino_acid.py
@@ -1,16 +1,7 @@
-# Copyright (c) 2016. Mount Sinai School of Medicine
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+"""
+Functions for encoding fixed length sequences of amino acids into various
+vector representations, such as one-hot and BLOSUM62.
+"""
 
 from __future__ import (
     print_function,
@@ -118,8 +109,12 @@ def vector_encoding_length(name):
 
 def index_encoding(sequences, letter_to_index_dict):
     """
-    Given a sequence of n strings all of length k, return a k * n array where
-    the (i, j)th element is letter_to_index_dict[sequence[i][j]].
+    Encode a sequence of same-length strings to a matrix of integers of the
+    same shape. The map from characters to integers is given by
+    `letter_to_index_dict`.
+
+    Given a sequence of `n` strings all of length `k`, return an `n * k` array
+    where the (`i`, `j`)th element is `letter_to_index_dict[sequences[i][j]]`.
 
     Parameters
     ----------
@@ -128,7 +123,7 @@ def index_encoding(sequences, letter_to_index_dict):
 
     Returns
     -------
-    numpy.array of integers with shape (k, n)
+    numpy.array of integers with shape (`n`, `k`)
     """
     df = pandas.DataFrame(iter(s) for s in sequences)
     result = df.replace(letter_to_index_dict)
@@ -137,19 +132,22 @@ def index_encoding(sequences, letter_to_index_dict):
 
 def fixed_vectors_encoding(index_encoded_sequences, letter_to_vector_df):
     """
-    Given a sequence of n strings all of length k, and a dataframe mapping each
-    character to an arbitrary vector, return a n * k * m array where
-    the (i, j)th element is letter_to_vector_df.loc[sequence[i][j]].
+    Given an `n` x `k` matrix of integers such as that returned by `index_encoding()` and
+    a dataframe mapping each index to an arbitrary vector, return an `n * k * m` array whose
+    (`i`, `j`)th element is `letter_to_vector_df.iloc[index_encoded_sequences[i][j]]`.
+
+    The dataframe index and column names are ignored here; the indexing is done
+    entirely by integer position in the dataframe.
 
     Parameters
     ----------
-    sequences : list of length n of strings of length k
-    letter_to_vector_df : pandas.DataFrame of shape (alphabet size, m)
-        The index of the dataframe should be amino acid characters.
+    index_encoded_sequences : `n` x `k` array of integers
+
+    letter_to_vector_df : pandas.DataFrame of shape (`alphabet size`, `m`)
 
     Returns
     -------
-    numpy.array of integers with shape (n, k, m)
+    numpy.array of integers with shape (`n`, `k`, `m`)
     """
     (num_sequences, sequence_length) = index_encoded_sequences.shape
     target_shape = (
-- 
GitLab