Newer
Older
"""
Class for encoding variable-length peptides to fixed-size numerical matrices
"""
from __future__ import (
print_function,
division,
absolute_import,
)
import math
def __init__(self, message, supported_peptide_lengths):
self.supported_peptide_lengths = supported_peptide_lengths
ValueError.__init__(
self,
message + " Supported lengths: %s - %s." % supported_peptide_lengths)
Class for encoding variable-length peptides to fixed-size numerical matrices
This class caches various encodings of a list of sequences.
In practice this is used only for peptides. To encode MHC allele sequences,
see AlleleEncoding.
"""
unknown_character = "X"
@classmethod
def create(klass, sequences):
"""
Factory that returns an EncodableSequences given a list of
strings. As a convenience, you can also pass it an EncodableSequences
instance, in which case the object is returned unchanged.
"""
if isinstance(sequences, klass):
return sequences
return klass(sequences)
def __init__(self, sequences):
if not all(isinstance(obj, string_types) for obj in sequences):
raise ValueError("Sequence of strings is required")
lengths = pandas.Series(self.sequences, dtype=numpy.object_).str.len()
self.min_length = lengths.min()
self.max_length = lengths.max()
self.encoding_cache = {}
self.fixed_sequence_length = None
if len(self.sequences) > 0 and all(
len(s) == len(self.sequences[0]) for s in self.sequences):
self.fixed_sequence_length = len(self.sequences[0])
def __len__(self):
return len(self.sequences)
def variable_length_to_fixed_length_categorical(
self,
alignment_method="pad_middle",
left_edge=4,
right_edge=4,
max_length=15):
Encode variable-length sequences to a fixed-size index-encoded (integer)
matrix.
See `sequences_to_fixed_length_index_encoded_array` for details.
alignment_method : string
One of "pad_middle" or "left_pad_right_pad"
Only relevant for pad_middle alignment method
max_length : maximum supported peptide length
numpy.array of integers with shape (num sequences, encoded length)
For pad_middle, the encoded length is max_length. For left_pad_right_pad,
it's 3 * max_length.
left_edge,
right_edge,
max_length)
if cache_key not in self.encoding_cache:
fixed_length_sequences = (
self.sequences_to_fixed_length_index_encoded_array(
self.sequences,
alignment_method=alignment_method,
max_length=max_length))
self.encoding_cache[cache_key] = fixed_length_sequences
def variable_length_to_fixed_length_vector_encoding(
self,
vector_encoding_name,
alignment_method="pad_middle",
left_edge=4,
right_edge=4,
max_length=15):
Encode variable-length sequences to a fixed-size matrix. Amino acids
are encoded as specified by the vector_encoding_name argument.
See `sequences_to_fixed_length_index_encoded_array` for details.
vector_encoding_name : string
How to represent amino acids.
One of "BLOSUM62", "one-hot", etc. Full list of supported vector
encodings is given by available_vector_encodings().
alignment_method : string
One of "pad_middle" or "left_pad_right_pad"
Only relevant for pad_middle alignment method
max_length : maximum supported peptide length
numpy.array with shape (num sequences, encoded length, m)
where
- m is the vector encoding length (usually 21).
- encoded length is max_length if alignment_method is pad_middle;
3 * max_length if it's left_pad_right_pad.
"fixed_length_vector_encoding",
vector_encoding_name,
left_edge,
right_edge,
max_length)
if cache_key not in self.encoding_cache:
fixed_length_sequences = (
self.sequences_to_fixed_length_index_encoded_array(
self.sequences,
alignment_method=alignment_method,
left_edge=left_edge,
right_edge=right_edge,
max_length=max_length))
amino_acid.ENCODING_DATA_FRAMES[vector_encoding_name])
self.encoding_cache[cache_key] = result
return self.encoding_cache[cache_key]
@classmethod
def sequences_to_fixed_length_index_encoded_array(
klass,
sequences,
alignment_method="pad_middle",
left_edge=4,
right_edge=4,
max_length=15):
Encode variable-length sequences to a fixed-size index-encoded (integer)
matrix.
How variable length sequences get mapped to fixed length is set by the
"alignment_method" argument. Supported alignment methods are:
pad_middle
Encoding designed for preserving the anchor positions of class
I peptides. This is what is used in allele-specific models.
Each string must be of length at least left_edge + right_edge
and at most max_length. The first left_edge characters in the
input always map to the first left_edge characters in the
output. Similarly for the last right_edge characters. The
middle characters are filled in based on the length, with the
X character filling in the blanks.
left_pad_centered_right_pad
Encoding that makes no assumptions on anchor positions but is
3x larger than pad_middle, since it duplicates the peptide
(left aligned + centered + right aligned). This is what is used
for the pan-allele models.
Example:
AAAACDDDD -> AAAACDDDDXXXXXXXXXAAAACDDDDXXXXXXXXXAAAACDDDD
left_pad_right_pad
Same as left_pad_centered_right_pad but only includes left-
and right-padded peptide.
Example:
AAAACDDDD -> AAAACDDDDXXXXXXXXXXXXAAAACDDDD
sequences : list of string
alignment_method : string
One of "pad_middle" or "left_pad_right_pad"
left_edge : int, size of fixed-position left side
Only relevant for pad_middle alignment method
right_edge : int, size of the fixed-position right side
Only relevant for pad_middle alignment method
max_length : maximum supported peptide length
numpy.array of integers with shape (num sequences, encoded length)
For pad_middle, the encoded length is max_length. For left_pad_right_pad,
it's 2 * max_length. For left_pad_centered_right_pad, it's
3 * max_length.
result = None
if alignment_method == 'pad_middle':
# Result array is int32, filled with X (null amino acid) value.
result = numpy.full(
fill_value=amino_acid.AMINO_ACID_INDEX['X'],
shape=(len(sequences), max_length),
dtype="int32")
df = pandas.DataFrame({"peptide": sequences}, dtype=numpy.object_)
df["length"] = df.peptide.str.len()
middle_length = max_length - left_edge - right_edge
# For efficiency we handle each supported peptide length using bulk
# array operations.
for (length, sub_df) in df.groupby("length"):
if length < min_length or length > max_length:
raise EncodingError(
"Sequence '%s' (length %d) unsupported. There are %d "
"total peptides with this length." % (
sub_df.iloc[0].peptide,
length,
len(sub_df)), supported_peptide_lengths=(
min_length, max_length))
# Array of shape (num peptides, length) giving fixed-length
# amino acid encoding each peptide of the current length.
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
fixed_length_sequences = numpy.stack(
sub_df.peptide.map(
lambda s: numpy.array([
amino_acid.AMINO_ACID_INDEX[char] for char in s
])).values)
num_null = max_length - length
num_null_left = int(math.ceil(num_null / 2))
num_middle_filled = middle_length - num_null
middle_start = left_edge + num_null_left
# Set left edge
result[sub_df.index, :left_edge] = fixed_length_sequences[
:, :left_edge
]
# Set middle.
result[
sub_df.index,
middle_start : middle_start + num_middle_filled
] = fixed_length_sequences[
:, left_edge : left_edge + num_middle_filled
]
# Set right edge.
result[
sub_df.index,
-right_edge:
] = fixed_length_sequences[:, -right_edge:]
elif alignment_method == "left_pad_right_pad":
# We arbitrarily set a minimum length of 5, although this encoding
# could handle smaller peptides.
min_length = 5
# Result array is int32, filled with X (null amino acid) value.
result = numpy.full(
fill_value=amino_acid.AMINO_ACID_INDEX['X'],
shape=(len(sequences), max_length * 2),
dtype="int32")
df = pandas.DataFrame({"peptide": sequences}, dtype=numpy.object_)
# For efficiency we handle each supported peptide length using bulk
# array operations.
for (length, sub_df) in df.groupby(df.peptide.str.len()):
if length < min_length or length > max_length:
raise EncodingError(
"Sequence '%s' (length %d) unsupported. There are %d "
"total peptides with this length." % (
sub_df.iloc[0].peptide,
length,
len(sub_df)), supported_peptide_lengths=(
min_length, max_length))
# Array of shape (num peptides, length) giving fixed-length
# amino acid encoding each peptide of the current length.
fixed_length_sequences = numpy.stack(sub_df.peptide.map(
lambda s: numpy.array([
amino_acid.AMINO_ACID_INDEX[char] for char in s
])).values)
# Set left edge
result[sub_df.index, :length] = fixed_length_sequences
# Set right edge.
result[sub_df.index, -length:] = fixed_length_sequences
elif alignment_method == "left_pad_centered_right_pad":
# We arbitrarily set a minimum length of 5, although this encoding
# could handle smaller peptides.
min_length = 5
# Result array is int32, filled with X (null amino acid) value.
result = numpy.full(
fill_value=amino_acid.AMINO_ACID_INDEX['X'],
shape=(len(sequences), max_length * 3),
dtype="int32")
df = pandas.DataFrame({"peptide": sequences}, dtype=numpy.object_)
# For efficiency we handle each supported peptide length using bulk
# array operations.
for (length, sub_df) in df.groupby(df.peptide.str.len()):
if length < min_length or length > max_length:
raise EncodingError(
"Sequence '%s' (length %d) unsupported. There are %d "
"total peptides with this length." % (
sub_df.iloc[0].peptide,
length,
len(sub_df)), supported_peptide_lengths=(
min_length, max_length))
# Array of shape (num peptides, length) giving fixed-length
# amino acid encoding each peptide of the current length.
fixed_length_sequences = numpy.stack(sub_df.peptide.map(
lambda s: numpy.array([
amino_acid.AMINO_ACID_INDEX[char] for char in s
])).values)
# Set left edge
result[sub_df.index, :length] = fixed_length_sequences
# Set right edge.
result[sub_df.index, -length:] = fixed_length_sequences
# Set center.
center_left_padding = int(
math.floor((max_length - length) / 2))
center_left_offset = max_length + center_left_padding
result[
sub_df.index,
center_left_offset : center_left_offset + length
] = fixed_length_sequences
else:
raise NotImplementedError(
"Unsupported alignment method: %s" % alignment_method)