Newer
Older
# Copyright (c) 2016. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import (
print_function,
division,
absolute_import,
)
import math
import numpy
from . import amino_acid
class EncodableSequences(object):
"""
Sequences of amino acids.
This class caches various encodings of a list of sequences.
"""
unknown_character = "X"
@classmethod
def create(klass, sequences):
    """
    Build an EncodableSequences from a list of strings.

    As a convenience, an object that is already an instance of this class
    may be passed instead; it is handed back as-is rather than re-wrapped.
    """
    already_wrapped = isinstance(sequences, klass)
    return sequences if already_wrapped else klass(sequences)
def __init__(self, sequences):
    """
    Validate and store the sequences, and start with an empty cache.

    Parameters
    ----------
    sequences : iterable of string
        Checked with typechecks.require_iterable_of, then stored as a
        numpy array in self.sequences.

    After construction, self.encoding_cache is an empty dict and
    self.fixed_sequence_length is the shared length when there is at least
    one sequence and every sequence has the same length, otherwise None.
    """
    typechecks.require_iterable_of(
        sequences, typechecks.string_types, "sequences")
    self.sequences = numpy.array(sequences)
    self.encoding_cache = {}
    self.fixed_sequence_length = None
    # Exactly one distinct length means "non-empty and all lengths equal".
    distinct_lengths = set(len(s) for s in self.sequences)
    if len(distinct_lengths) == 1:
        (self.fixed_sequence_length,) = distinct_lengths
def __len__(self):
return len(self.sequences)
def variable_length_to_fixed_length_categorical(
        self, left_edge=4, right_edge=4, max_length=15):
    """
    Encode variable-length sequences using a fixed-length encoding designed
    for preserving the anchor positions of class I peptides.

    The sequences must be of length at least left_edge + right_edge, and at
    most max_length.

    The result is stored in self.encoding_cache, keyed on all three
    parameters, so repeated calls with the same arguments return the same
    cached object.

    Parameters
    ----------
    left_edge : int, size of fixed-position left side
    right_edge : int, size of the fixed-position right side
    max_length : sequence length of the resulting encoding

    Returns
    -------
    numpy.array of integers with shape (num sequences, max_length)
    """
    cache_key = (
        "fixed_length_categorical",
        left_edge,
        right_edge,
        max_length)
    if cache_key not in self.encoding_cache:
        # Forward the edge sizes as well as max_length: they are part of
        # the cache key, so the cached array must actually depend on them.
        self.encoding_cache[cache_key] = (
            self.sequences_to_fixed_length_index_encoded_array(
                self.sequences,
                left_edge=left_edge,
                right_edge=right_edge,
                max_length=max_length))
    return self.encoding_cache[cache_key]
def variable_length_to_fixed_length_vector_encoding(
        self, vector_encoding_name, left_edge=4, right_edge=4, max_length=15):
    """
    Encode variable-length sequences using a fixed-length encoding designed
    for preserving the anchor positions of class I peptides.

    The sequences must be of length at least left_edge + right_edge, and at
    most max_length.

    The result is stored in self.encoding_cache, keyed on the encoding name
    and all three size parameters.

    Parameters
    ----------
    vector_encoding_name : string
        How to represent amino acids.
        One of "BLOSUM62", "one-hot", etc. Full list of supported vector
        encodings is given by available_vector_encodings().
    left_edge : int, size of fixed-position left side
    right_edge : int, size of the fixed-position right side
    max_length : sequence length of the resulting encoding

    Returns
    -------
    numpy.array with shape (num sequences, max_length, m) where m is
    vector_encoding_length(vector_encoding_name)
    """
    cache_key = (
        "fixed_length_vector_encoding",
        vector_encoding_name,
        left_edge,
        right_edge,
        max_length)
    if cache_key not in self.encoding_cache:
        fixed_length_sequences = (
            self.sequences_to_fixed_length_index_encoded_array(
                self.sequences,
                left_edge=left_edge,
                right_edge=right_edge,
                max_length=max_length))
        # NOTE(review): the head of this call was missing from the copy of
        # the file under review; only its last argument survived.
        # amino_acid.fixed_vectors_encoding is presumed to be the function
        # that maps index-encoded sequences through the chosen encoding
        # table -- confirm against the amino_acid module.
        result = amino_acid.fixed_vectors_encoding(
            fixed_length_sequences,
            amino_acid.ENCODING_DATA_FRAMES[vector_encoding_name])
        self.encoding_cache[cache_key] = result
    return self.encoding_cache[cache_key]
@classmethod
def sequences_to_fixed_length_index_encoded_array(
klass, sequences, left_edge=4, right_edge=4, max_length=15):
Transform a sequence of strings, where each string is of length at least
left_edge + right_edge and at most max_length into strings of length
max_length using a scheme designed to preserve the anchor positions of
class I peptides.
The first left_edge characters in the input always map to the first
left_edge characters in the output. Similarly for the last right_edge
characters. The middle characters are filled in based on the length,
with the X character filling in the blanks.
The strings are also converted to int categorical amino acid indices.
Parameters
----------
sequence : string
left_edge : int
right_edge : int
max_length : int
Returns
-------
numpy array of shape (len(sequences), max_length) and dtype int
# Result array is int32, filled with X (null amino acid) value.
result = numpy.full(
fill_value=amino_acid.AMINO_ACID_INDEX['X'],
shape=(len(sequences), max_length),
dtype="int32")
df = pandas.DataFrame({"peptide": sequences})
df["length"] = df.peptide.str.len()
middle_length = max_length - left_edge - right_edge
# For efficiency we handle each supported peptide length using bulk
# array operations.
for (length, sub_df) in df.groupby("length"):
if length < left_edge + right_edge:
raise ValueError(
"Sequence '%s' (length %d) unsupported: length must be at "
"least %d. There are %d total peptides with this length." % (
sub_df.iloc[0].peptide, length, left_edge + right_edge,
len(sub_df)))
if length > max_length:
raise ValueError(
"Sequence '%s' (length %d) unsupported: length must be at "
"most %d. There are %d total peptides with this length." % (
sub_df.iloc[0].peptide, length, max_length,
len(sub_df)))
# Array of shape (num peptides, length) giving fixed-length amino
# acid encoding each peptide of the current length.
lambda s: numpy.array([
amino_acid.AMINO_ACID_INDEX[char] for char in s
])).values)
num_null_left = int(math.ceil(num_null / 2))
num_middle_filled = middle_length - num_null
middle_start = left_edge + num_null_left
result[sub_df.index, :left_edge] = fixed_length_sequences[
:, :left_edge
]
# Set middle.
result[
sub_df.index,
middle_start : middle_start + num_middle_filled
] = fixed_length_sequences[
:, left_edge : left_edge + num_middle_filled
]
# Set right edge.
result[
sub_df.index,
-right_edge: