From 95833a7953dff38b6bd66e4075b5b3c0d6a5f7fd Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Tue, 19 Dec 2017 18:05:46 -0500 Subject: [PATCH] Improve docs --- docs/_readme.md | 4 ++-- docs/intro.rst | 8 ++++---- mhcflurry/amino_acid.py | 44 ++++++++++++++++++++--------------------- 3 files changed, 27 insertions(+), 29 deletions(-) diff --git a/docs/_readme.md b/docs/_readme.md index f0289ece..00d2a726 100644 --- a/docs/_readme.md +++ b/docs/_readme.md @@ -10,9 +10,9 @@ Open source peptide/MHC I binding affinity prediction Introduction and setup ---------------------- -MHCflurry is a peptide/MHC I binding affinity prediction package written in Python. It aims to provide state of the art accuracy in a documented, fast, and open source implementation. +MHCflurry is a peptide/MHC I binding affinity prediction package written in Python. It aims to provide state of the art accuracy with a documented, fast, and open source implementation. -MHCflurry users may download trained predictors fit to affinity measurements deposited in IEDB. The complete workflow to generate these models is available in the "downloads\_generation/models\_class1" directory in the repository. It is also easy for users with their own data to fit their own models. +MHCflurry users may download trained predictors fit to affinity measurements deposited in IEDB. See the "downloads\_generation/models\_class1" directory in the repository for the workflow used to train these predictors. It is also easy for users with their own data to fit their own models. Currently only allele-specific prediction is implemented, in which separate models are trained for each allele. The released models therefore support a fixed set of common class I alleles for which sufficient published training data is available. 
diff --git a/docs/intro.rst b/docs/intro.rst index 71aac925..0ed0cff9 100644 --- a/docs/intro.rst +++ b/docs/intro.rst @@ -2,13 +2,13 @@ Introduction and setup ======================= MHCflurry is a peptide/MHC I binding affinity prediction package written in -Python. It aims to provide state of the art accuracy in a documented, fast, and +Python. It aims to provide state of the art accuracy with a documented, fast, and open source implementation. MHCflurry users may download trained predictors fit to affinity measurements -deposited in IEDB. The complete workflow to generate these models -is available in the "downloads_generation/models_class1" directory in the -repository. It is also easy for users with their own data to fit their own models. +deposited in IEDB. See the "downloads_generation/models_class1" directory in the +repository for the workflow used to train these predictors. It is also easy +for users with their own data to fit their own models. Currently only allele-specific prediction is implemented, in which separate models are trained for each allele. The released models therefore support a fixed set of common diff --git a/mhcflurry/amino_acid.py b/mhcflurry/amino_acid.py index ffa5cffc..1077c644 100644 --- a/mhcflurry/amino_acid.py +++ b/mhcflurry/amino_acid.py @@ -1,16 +1,7 @@ -# Copyright (c) 2016. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
+""" +Functions for encoding fixed length sequences of amino acids into various +vector representations, such as one-hot and BLOSUM62. +""" from __future__ import ( print_function, @@ -118,8 +109,12 @@ def vector_encoding_length(name): def index_encoding(sequences, letter_to_index_dict): """ - Given a sequence of n strings all of length k, return a k * n array where - the (i, j)th element is letter_to_index_dict[sequence[i][j]]. + Encode a sequence of same-length strings to a matrix of integers of the + same shape. The map from characters to integers is given by + `letter_to_index_dict`. + + Given a sequence of `n` strings all of length `k`, return a `n * k` array where + the (`i`, `j`)th element is `letter_to_index_dict[sequence[i][j]]`. Parameters ---------- @@ -128,7 +123,7 @@ def index_encoding(sequences, letter_to_index_dict): Returns ------- - numpy.array of integers with shape (k, n) + numpy.array of integers with shape (`n`, `k`) """ df = pandas.DataFrame(iter(s) for s in sequences) result = df.replace(letter_to_index_dict) @@ -137,19 +132,22 @@ def index_encoding(sequences, letter_to_index_dict): def fixed_vectors_encoding(index_encoded_sequences, letter_to_vector_df): """ - Given a sequence of n strings all of length k, and a dataframe mapping each - character to an arbitrary vector, return a n * k * m array where - the (i, j)th element is letter_to_vector_df.loc[sequence[i][j]]. + Given an `n` x `k` matrix of integers such as that returned by `index_encoding()` and + a dataframe mapping each index to an arbitrary vector, return an `n * k * m` + array where the (`i`, `j`)th element is `letter_to_vector_df.iloc[sequence[i][j]]`. + + The dataframe index and column names are ignored here; the indexing is done + entirely by integer position in the dataframe.
Parameters ---------- - sequences : list of length n of strings of length k - letter_to_vector_df : pandas.DataFrame of shape (alphabet size, m) - The index of the dataframe should be amino acid characters. + index_encoded_sequences : `n` x `k` array of integers + + letter_to_vector_df : pandas.DataFrame of shape (`alphabet size`, `m`) Returns ------- - numpy.array of integers with shape (n, k, m) + numpy.array of integers with shape (`n`, `k`, `m`) """ (num_sequences, sequence_length) = index_encoded_sequences.shape target_shape = ( -- GitLab