Newer
Older
"""
Functions for encoding fixed length sequences of amino acids into various
vector representations, such as one-hot and BLOSUM62.
"""
from __future__ import (
print_function,
division,
absolute_import,
)
"A": "Alanine",
"R": "Arginine",
"N": "Asparagine",
"D": "Aspartic Acid",
"C": "Cysteine",
"E": "Glutamic Acid",
"Q": "Glutamine",
"G": "Glycine",
"H": "Histidine",
"I": "Isoleucine",
"L": "Leucine",
"K": "Lysine",
"M": "Methionine",
"F": "Phenylalanine",
"P": "Proline",
"S": "Serine",
"T": "Threonine",
"W": "Tryptophan",
"Y": "Tyrosine",
"V": "Valine",
}.items()))
# Same mapping as COMMON_AMINO_ACIDS plus the placeholder letter "X".
COMMON_AMINO_ACIDS_WITH_UNKNOWN = copy(COMMON_AMINO_ACIDS)
COMMON_AMINO_ACIDS_WITH_UNKNOWN["X"] = "Unknown"

# Letters in the iteration order of the mapping above.
AMINO_ACIDS = list(COMMON_AMINO_ACIDS_WITH_UNKNOWN)

# Letter -> integer position of that letter in AMINO_ACIDS.
AMINO_ACID_INDEX = {
    letter: position
    for (position, letter) in enumerate(AMINO_ACIDS)
}
# BLOSUM62 substitution scores as a DataFrame, parsed from the
# whitespace-separated table below. The table carries an extra "X" row and
# column that is 0 against every other letter and 1 against itself. After
# parsing, rows and columns are reordered to follow AMINO_ACIDS.
# The separator is a raw string so that `\s` reaches the regex engine verbatim
# instead of being treated as an (invalid) string escape sequence.
BLOSUM62_MATRIX = pandas.read_table(StringIO("""
A R N D C Q E G H I L K M F P S T W Y V X
A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 0
R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 0
N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 0
D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 0
C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 0
Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0
E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 0
G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 0
H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0
I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 0
L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 0
K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0
M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 0
F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 0
P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 0
S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0
T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 0
W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 0
Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 0
V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 0
X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
"""), sep=r'\s+').loc[AMINO_ACIDS, AMINO_ACIDS]

# Sanity check on the table above: the score matrix must be symmetric.
assert (BLOSUM62_MATRIX == BLOSUM62_MATRIX.T).all().all()
"BLOSUM62": BLOSUM62_MATRIX,
"one-hot": pandas.DataFrame([
[1 if i == j else 0 for i in range(len(AMINO_ACIDS))]
for j in range(len(AMINO_ACIDS))
], index=AMINO_ACIDS, columns=AMINO_ACIDS)
}
def available_vector_encodings():
    """
    Return list of supported amino acid vector encodings.

    Returns
    -------
    list of string
    """
    # NOTE(review): no statements follow the docstring in this copy of the
    # file, so as written the function returns None rather than the
    # documented list. The implementation line appears to be missing and
    # should be restored -- TODO confirm against the upstream source.
def vector_encoding_length(name):
    """
    Return the length of the given vector encoding.

    Parameters
    ----------
    name : string

    Returns
    -------
    int
    """
    # NOTE(review): no statements follow the docstring in this copy of the
    # file, so as written the function returns None rather than the
    # documented int. The implementation line appears to be missing and
    # should be restored -- TODO confirm against the upstream source.
def index_encoding(sequences, letter_to_index_dict):
    """
    Encode a sequence of same-length strings to a matrix of integers of the
    same shape. The map from characters to integers is given by
    `letter_to_index_dict`.

    Given a sequence of `n` strings all of length `k`, return a `n` x `k`
    array where the (`i`, `j`)th element is
    `letter_to_index_dict[sequences[i][j]]`.

    Parameters
    ----------
    sequences : list of length n of strings of length k
    letter_to_index_dict : dict : string -> int

    Returns
    -------
    numpy.array of shape (`n`, `k`)
        Characters that are not keys of `letter_to_index_dict` are not
        replaced and appear unchanged in the result.
    """
    # One row per sequence, one column per position. The frame is built with
    # object dtype so that `replace` behaves the same regardless of how the
    # installed pandas infers string columns.
    df = pandas.DataFrame([list(s) for s in sequences], dtype=object)
    result = df.replace(letter_to_index_dict)
    # Convert fully-replaced object columns to a numeric dtype explicitly
    # rather than relying on `replace` to downcast implicitly.
    return result.infer_objects().values
def fixed_vectors_encoding(index_encoded_sequences, letter_to_vector_df):
    """
    Given a `n` x `k` matrix of integers such as that returned by
    `index_encoding()` and a dataframe mapping each index to an arbitrary
    vector, return a `n * k * m` array where the (`i`, `j`)'th element is
    `letter_to_vector_df.iloc[sequence[i][j]]`.

    The dataframe index and columns names are ignored here; the indexing is
    done entirely by integer position in the dataframe.

    Parameters
    ----------
    index_encoded_sequences : `n` x `k` array of integers
    letter_to_vector_df : pandas.DataFrame of shape (`alphabet size`, `m`)

    Returns
    -------
    numpy.array of shape (`n`, `k`, `m`)
    """
    (num_sequences, sequence_length) = index_encoded_sequences.shape
    # The last axis holds one vector per letter, so its size is the number of
    # dataframe columns (`m`), not the number of rows (alphabet size). The two
    # only coincide for square dataframes.
    vector_length = letter_to_vector_df.shape[1]
    target_shape = (num_sequences, sequence_length, vector_length)
    # Look up one dataframe row per integer, in row-major order, then fold
    # the flat (n * k, m) result back into (n, k, m).
    result = letter_to_vector_df.iloc[
        index_encoded_sequences.flatten()
    ].values.reshape(target_shape)
    return result