-
Tim O'Donnell authoredTim O'Donnell authored
encodable_sequences.py 9.25 KiB
# Copyright (c) 2016. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import (
print_function,
division,
absolute_import,
)
import math
import pandas
import numpy
from . import amino_acid
def index_encoding(sequences, letter_to_index_dict):
"""
Given a sequence of n strings all of length k, return a k * n array where
the (i, j)th element is letter_to_index_dict[sequence[i][j]].
Parameters
----------
sequences : list of length n of strings of length k
letter_to_index_dict : dict : string -> int
Returns
-------
numpy.array of integers with shape (k, n)
"""
df = pandas.DataFrame(iter(s) for s in sequences)
result = df.replace(letter_to_index_dict)
return result.values
def one_hot_encoding(index_encoded, alphabet_size):
"""
Given an n * k array of integers in the range [0, alphabet_size), return
an n * k * alphabet_size array where element (i, k, j) is 1 if element
(i, k) == j in the input array and zero otherwise.
Parameters
----------
index_encoded : numpy.array of integers with shape (n, k)
alphabet_size : int
Returns
-------
numpy.array of integers of shape (n, k, alphabet_size)
"""
(num_sequences, sequence_length) = index_encoded.shape
result = numpy.zeros(
(num_sequences, sequence_length, alphabet_size),
dtype='int32')
# Transform the index encoded array into an array of indices into the
# flattened result, which we will set to 1.
flattened_indices = (
index_encoded +
(
sequence_length * alphabet_size * numpy.arange(num_sequences)
).reshape((-1, 1)) +
numpy.tile(numpy.arange(sequence_length),
(num_sequences, 1)) * alphabet_size)
result.put(flattened_indices, 1)
return result
class EncodableSequences(object):
"""
Sequences of amino acids.
This class caches various encodings of a list of sequences.
"""
unknown_character = "X"
@classmethod
def create(klass, sequences):
"""
Factory that returns an EncodableSequences given a list of
strings. As a convenience, you can also pass it an EncodableSequences
instance, in which case the object is returned unchanged.
"""
if isinstance(sequences, klass):
return sequences
return klass(sequences)
def __init__(self, sequences):
self.sequences = sequences
self.encoding_cache = {}
self.fixed_sequence_length = None
if len(sequences) > 0 and all(
len(s) == len(sequences[0]) for s in sequences):
self.fixed_sequence_length = len(sequences[0])
def __len__(self):
return len(self.sequences)
def fixed_length_categorical(self):
"""
Returns a categorical encoding (i.e. integers 0 <= x < 21) of the
sequences, which must already be all the same length.
Returns
-------
numpy.array of integers
"""
cache_key = ("categorical",)
if cache_key not in self.encoding_cache:
assert self.fixed_sequence_length
self.encoding_cache[cache_key] = index_encoding(
self.sequences, amino_acid.AMINO_ACID_INDEX)
return self.encoding_cache[cache_key]
def fixed_length_one_hot(self):
"""
Returns a binary one-hot encoding of the sequences, which must already
be all the same length.
Returns
-------
numpy.array of integers
"""
cache_key = ("one_hot",)
if cache_key not in self.encoding_cache:
assert self.fixed_sequence_length
encoded = self.categorical_encoding()
result = one_hot_encoding(
encoded, alphabet_size=len(amino_acid.AMINO_ACID_INDEX))
self.encoding_cache[cache_key] = result
return self.encoding_cache[cache_key]
def variable_length_to_fixed_length_categorical(
self, left_edge=4, right_edge=4, max_length=15):
"""
Encode variable-length sequences using a fixed-length encoding designed
for preserving the anchor positions of class I peptides.
The sequences must be of length at least left_edge + right_edge, and at
most max_length.
Parameters
----------
left_edge : int, size of fixed-position left side
right_edge : int, size of the fixed-position right side
max_length : sequence length of the resulting encoding
Returns
-------
numpy.array of integers with shape (num sequences, max_length)
"""
cache_key = (
"fixed_length_categorical",
left_edge,
right_edge,
max_length)
if cache_key not in self.encoding_cache:
fixed_length_sequences = [
self.sequence_to_fixed_length_string(
sequence,
left_edge=left_edge,
right_edge=right_edge,
max_length=max_length)
for sequence in self.sequences
]
self.encoding_cache[cache_key] = index_encoding(
fixed_length_sequences, amino_acid.AMINO_ACID_INDEX)
return self.encoding_cache[cache_key]
def variable_length_to_fixed_length_one_hot(
self, left_edge=4, right_edge=4, max_length=15):
"""
Encode variable-length sequences using a fixed-length encoding designed
for preserving the anchor positions of class I peptides.
The sequences must be of length at least left_edge + right_edge, and at
most max_length.
Parameters
----------
left_edge : int, size of fixed-position left side
right_edge : int, size of the fixed-position right side
max_length : sequence length of the resulting encoding
Returns
-------
binary numpy.array with shape (num sequences, max_length, 21)
"""
cache_key = (
"fixed_length_one_hot",
left_edge,
right_edge,
max_length)
if cache_key not in self.encoding_cache:
encoded = self.variable_length_to_fixed_length_categorical(
left_edge=left_edge,
right_edge=right_edge,
max_length=max_length)
result = one_hot_encoding(
encoded, alphabet_size=len(amino_acid.AMINO_ACID_INDEX))
assert result.shape == (
len(self.sequences),
encoded.shape[1],
len(amino_acid.AMINO_ACID_INDEX))
self.encoding_cache[cache_key] = result
return self.encoding_cache[cache_key]
@classmethod
def sequence_to_fixed_length_string(
klass, sequence, left_edge=4, right_edge=4, max_length=15):
"""
Transform a string of length at least left_edge + right_edge and at
most max_length into a string of length max_length using a scheme
designed to preserve the anchor positions of class I peptides.
The first left_edge characters in the input always map to the first
left_edge characters in the output. Similarly for the last right_edge
characters. The middle characters are filled in based on the length,
with the X character filling in the blanks.
For example, using defaults:
AAAACDDDD -> AAAAXXXCXXXDDDD
Parameters
----------
sequence : string
left_edge : int
right_edge : int
max_length : int
Returns
-------
string of length max_length
"""
if len(sequence) < left_edge + right_edge:
raise ValueError(
"Sequence '%s' (length %d) unsupported: length must be at "
"least %d" % (sequence, len(sequence), left_edge + right_edge))
if len(sequence) > max_length:
raise ValueError(
"Sequence '%s' (length %d) unsupported: length must be at "
"most %d" % (sequence, len(sequence), max_length))
middle_length = max_length - left_edge - right_edge
num_null = max_length - len(sequence)
num_null_left = int(math.ceil(num_null / 2))
num_null_right = int(math.floor(num_null / 2))
num_not_null_middle = middle_length - num_null
string_encoding = "".join([
sequence[:left_edge],
klass.unknown_character * num_null_left,
sequence[left_edge:left_edge + num_not_null_middle],
klass.unknown_character * num_null_right,
sequence[-right_edge:],
])
assert len(string_encoding) == max_length
return string_encoding