Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
"""
Class for encoding variable-length peptides and their flanking sequences to
fixed-size numerical matrices
"""
from __future__ import (
print_function, division, absolute_import, )
from six import string_types
from collections import namedtuple
from .encodable_sequences import EncodingError, EncodableSequences
import numpy
import pandas
# Value produced by FlankingEncoding.encode: the encoded array together with
# the length of each (upper-cased) peptide, flanking sequences excluded.
EncodingResult = namedtuple(
    typename="EncodingResult",
    field_names=["array", "peptide_lengths"])
class FlankingEncoding(object):
    """
    Encode peptides together with their N- and C-terminal flanking sequences.

    Each entry consists of a peptide, an N-terminal flank and a C-terminal
    flank. They are stored in `self.dataframe` under the columns "peptide",
    "n_flank" and "c_flank". Results of `vector_encode` are memoized per
    instance in `self.encoding_cache`.
    """
    # Character used to pad flanking sequences that are shorter than the
    # requested flank length.
    unknown_character = "X"

    def __init__(self, peptides, n_flanks, c_flanks):
        """
        Parameters
        ----------
        peptides : sequence of string
        n_flanks : sequence of string
            N-terminal flanking sequence for each peptide
        c_flanks : sequence of string
            C-terminal flanking sequence for each peptide
        """
        self.dataframe = pandas.DataFrame({
            "peptide": peptides,
            "n_flank": n_flanks,
            "c_flank": c_flanks,
        }, dtype=str)
        # Maps the argument tuple of vector_encode to its EncodingResult.
        self.encoding_cache = {}

    def __len__(self):
        """Number of (peptide, n_flank, c_flank) entries."""
        return len(self.dataframe)

    def vector_encode(
            self,
            vector_encoding_name,
            peptide_max_length,
            n_flank_length,
            c_flank_length,
            allow_unsupported_amino_acids=True):
        """
        Encode the stored sequences to a fixed-size matrix, with caching.

        This is a memoizing wrapper around `encode`; see that method for
        how the sequences are assembled. The result for each distinct
        combination of arguments is computed once and then returned from
        `self.encoding_cache`, so callers should not modify it in place.

        Note that `allow_unsupported_amino_acids` defaults to True here but
        to False in `encode`.

        Parameters
        ----------
        vector_encoding_name : string
            How to represent amino acids (e.g. "BLOSUM62", "one-hot").
            Passed through to EncodableSequences.
        peptide_max_length : int
            Maximum supported peptide length
        n_flank_length : int
            Number of N-terminal flank residues to include
        c_flank_length : int
            Number of C-terminal flank residues to include
        allow_unsupported_amino_acids : bool
            Passed through to EncodableSequences.

        Returns
        -------
        EncodingResult
            Named tuple with fields `array` and `peptide_lengths`.
        """
        cache_key = (
            "vector_encode",
            vector_encoding_name,
            peptide_max_length,
            n_flank_length,
            c_flank_length,
            allow_unsupported_amino_acids)
        if cache_key not in self.encoding_cache:
            result = self.encode(
                vector_encoding_name=vector_encoding_name,
                df=self.dataframe,
                peptide_max_length=peptide_max_length,
                n_flank_length=n_flank_length,
                c_flank_length=c_flank_length,
                allow_unsupported_amino_acids=allow_unsupported_amino_acids)
            self.encoding_cache[cache_key] = result
        return self.encoding_cache[cache_key]

    @staticmethod
    def _join_flanks(df, n_flank_length, c_flank_length):
        """
        Build the upper-cased n_flank + peptide + c_flank strings.

        Flanks are trimmed or padded with `unknown_character` to exactly
        the requested lengths. The N-flank keeps the residues nearest the
        peptide (its right end); the C-flank keeps its left end.

        Returns
        -------
        (concatenated, peptides) : pair of pandas.Series
            Both share the index of `df`. `peptides` is the upper-cased
            peptide column.
        """
        fill = FlankingEncoding.unknown_character
        if n_flank_length > 0:
            n_flanks = df.n_flank.str.pad(
                n_flank_length,
                side="left",
                fillchar=fill).str.slice(-n_flank_length).str.upper()
        else:
            # slice(-0) would keep the whole flank, so a zero-length flank
            # needs explicit empty strings. They must carry df's index:
            # Series addition aligns on index labels, and a fresh
            # RangeIndex would produce NaN rows for any other index.
            n_flanks = pandas.Series(
                [""] * len(df), index=df.index, dtype=str)
        c_flanks = df.c_flank.str.pad(
            c_flank_length,
            side="right",
            fillchar=fill).str.slice(0, c_flank_length).str.upper()
        peptides = df.peptide.str.upper()
        return n_flanks + peptides + c_flanks, peptides

    @staticmethod
    def encode(
            vector_encoding_name,
            df,
            peptide_max_length,
            n_flank_length,
            c_flank_length,
            allow_unsupported_amino_acids=False):
        """
        Encode peptides and flanking sequences to a fixed-size matrix.

        For each row the N-flank, peptide and C-flank are upper-cased and
        concatenated into one string. The strings are then vector-encoded
        by EncodableSequences using "right_pad" alignment and a maximum
        length of n_flank_length + peptide_max_length + c_flank_length.

        Parameters
        ----------
        vector_encoding_name : string
            How to represent amino acids (e.g. "BLOSUM62", "one-hot").
        df : pandas.DataFrame
            Must have string columns "peptide", "n_flank" and "c_flank".
        peptide_max_length : int
            Maximum supported peptide length
        n_flank_length : int
            Number of N-terminal flank residues to include
        c_flank_length : int
            Number of C-terminal flank residues to include
        allow_unsupported_amino_acids : bool
            Passed through to EncodableSequences.

        Returns
        -------
        EncodingResult
            `array` is the encoded matrix returned by EncodableSequences;
            `peptide_lengths` is a numpy array giving the length of each
            upper-cased peptide (flanks excluded).

        Raises
        ------
        EncodingError
            If any peptide is empty or longer than peptide_max_length.
        """
        peptide_lengths = df.peptide.str.len()
        error_df = df.loc[
            (peptide_lengths > peptide_max_length) |
            (peptide_lengths < 1)
        ]
        if len(error_df) > 0:
            # The count reported covers every peptide with an unsupported
            # length, not only those matching the first offender's length.
            raise EncodingError(
                "Sequence '%s' (length %d) unsupported. There are %d "
                "total peptides with this length." % (
                    error_df.iloc[0].peptide,
                    len(error_df.iloc[0].peptide),
                    len(error_df)))

        concatenated, peptides = FlankingEncoding._join_flanks(
            df, n_flank_length, c_flank_length)

        encoder = EncodableSequences.create(concatenated.values)
        array = encoder.variable_length_to_fixed_length_vector_encoding(
            vector_encoding_name=vector_encoding_name,
            alignment_method="right_pad",
            max_length=n_flank_length + peptide_max_length + c_flank_length,
            allow_unsupported_amino_acids=allow_unsupported_amino_acids)
        result = EncodingResult(
            array, peptide_lengths=peptides.str.len().values)
        return result