Skip to content
Snippets Groups Projects
Commit 873a60a5 authored by Tim O'Donnell
Browse files

More speed efficient variable-length peptide encoding

parent d65f7740
No related branches found
No related tags found
No related merge requests found
...@@ -135,7 +135,7 @@ def index_encoding(sequences, letter_to_index_dict): ...@@ -135,7 +135,7 @@ def index_encoding(sequences, letter_to_index_dict):
return result.values return result.values
def fixed_vectors_encoding(sequences, letter_to_vector_df): def fixed_vectors_encoding(index_encoded_sequences, letter_to_vector_df):
""" """
Given a sequence of n strings all of length k, and a dataframe mapping each Given a sequence of n strings all of length k, and a dataframe mapping each
character to an arbitrary vector, return a n * k * m array where character to an arbitrary vector, return a n * k * m array where
...@@ -152,10 +152,10 @@ def fixed_vectors_encoding(sequences, letter_to_vector_df): ...@@ -152,10 +152,10 @@ def fixed_vectors_encoding(sequences, letter_to_vector_df):
numpy.array of integers with shape (n, k, m) numpy.array of integers with shape (n, k, m)
""" """
target_shape = ( target_shape = (
len(sequences), len(index_encoded_sequences),
len(sequences[0]), len(index_encoded_sequences[0]),
letter_to_vector_df.shape[0]) letter_to_vector_df.shape[0])
result = letter_to_vector_df.loc[ result = letter_to_vector_df.iloc[
(letter for seq in sequences for letter in seq) index_encoded_sequences.flat
].values.reshape(target_shape) ].values.reshape(target_shape)
return result return result
...@@ -86,16 +86,13 @@ class EncodableSequences(object): ...@@ -86,16 +86,13 @@ class EncodableSequences(object):
max_length) max_length)
if cache_key not in self.encoding_cache: if cache_key not in self.encoding_cache:
fixed_length_sequences = [ fixed_length_sequences = (
self.sequence_to_fixed_length_string( self.sequences_to_fixed_length_index_encoded_array(
sequence, self.sequences,
left_edge=left_edge, left_edge=left_edge,
right_edge=right_edge, right_edge=right_edge,
max_length=max_length) max_length=max_length))
for sequence in self.sequences self.encoding_cache[cache_key] = fixed_length_sequences
]
self.encoding_cache[cache_key] = amino_acid.index_encoding(
fixed_length_sequences, amino_acid.AMINO_ACID_INDEX)
return self.encoding_cache[cache_key] return self.encoding_cache[cache_key]
def variable_length_to_fixed_length_vector_encoding( def variable_length_to_fixed_length_vector_encoding(
...@@ -130,14 +127,12 @@ class EncodableSequences(object): ...@@ -130,14 +127,12 @@ class EncodableSequences(object):
right_edge, right_edge,
max_length) max_length)
if cache_key not in self.encoding_cache: if cache_key not in self.encoding_cache:
fixed_length_sequences = [ fixed_length_sequences = (
self.sequence_to_fixed_length_string( self.sequences_to_fixed_length_index_encoded_array(
sequence, self.sequences,
left_edge=left_edge, left_edge=left_edge,
right_edge=right_edge, right_edge=right_edge,
max_length=max_length) max_length=max_length))
for sequence in self.sequences
]
result = amino_acid.fixed_vectors_encoding( result = amino_acid.fixed_vectors_encoding(
fixed_length_sequences, fixed_length_sequences,
amino_acid.ENCODING_DFS[vector_encoding_name]) amino_acid.ENCODING_DFS[vector_encoding_name])
...@@ -145,25 +140,26 @@ class EncodableSequences(object): ...@@ -145,25 +140,26 @@ class EncodableSequences(object):
self.encoding_cache[cache_key] = result self.encoding_cache[cache_key] = result
return self.encoding_cache[cache_key] return self.encoding_cache[cache_key]
@classmethod @classmethod
def sequence_to_fixed_length_string( def sequences_to_fixed_length_index_encoded_array(
klass, sequence, left_edge=4, right_edge=4, max_length=15): klass, sequences, left_edge=4, right_edge=4, max_length=15):
""" """
Transform a string of length at least left_edge + right_edge and at Transform a sequence of strings, where each string is of length at least
most max_length into a string of length max_length using a scheme left_edge + right_edge and at most max_length into strings of length
designed to preserve the anchor positions of class I peptides. max_length using a scheme designed to preserve the anchor positions of
class I peptides.
The first left_edge characters in the input always map to the first The first left_edge characters in the input always map to the first
left_edge characters in the output. Similarly for the last right_edge left_edge characters in the output. Similarly for the last right_edge
characters. The middle characters are filled in based on the length, characters. The middle characters are filled in based on the length,
with the X character filling in the blanks. with the X character filling in the blanks.
For example, using defaults: For example, using defaults:
AAAACDDDD -> AAAAXXXCXXXDDDD AAAACDDDD -> AAAAXXXCXXXDDDD
The strings are also converted to int categorical amino acid indices.
Parameters Parameters
---------- ----------
sequence : string sequence : string
...@@ -173,30 +169,37 @@ class EncodableSequences(object): ...@@ -173,30 +169,37 @@ class EncodableSequences(object):
Returns Returns
------- -------
string of length max_length numpy array of shape (len(sequences), max_length) and dtype int
""" """
if len(sequence) < left_edge + right_edge: result = numpy.ones(shape=(len(sequences), max_length), dtype=int) * -1
raise ValueError( fill_value = amino_acid.AMINO_ACID_INDEX['X']
"Sequence '%s' (length %d) unsupported: length must be at " for (i, sequence) in enumerate(sequences):
"least %d" % (sequence, len(sequence), left_edge + right_edge)) sequence_indexes = [
if len(sequence) > max_length: amino_acid.AMINO_ACID_INDEX[char] for char in sequence
raise ValueError( ]
"Sequence '%s' (length %d) unsupported: length must be at "
"most %d" % (sequence, len(sequence), max_length)) if len(sequence) < left_edge + right_edge:
raise ValueError(
middle_length = max_length - left_edge - right_edge "Sequence '%s' (length %d) unsupported: length must be at "
"least %d" % (
num_null = max_length - len(sequence) sequence, len(sequence), left_edge + right_edge))
num_null_left = int(math.ceil(num_null / 2)) if len(sequence) > max_length:
num_null_right = int(math.floor(num_null / 2)) raise ValueError(
num_not_null_middle = middle_length - num_null "Sequence '%s' (length %d) unsupported: length must be at "
string_encoding = "".join([ "most %d" % (sequence, len(sequence), max_length))
sequence[:left_edge],
klass.unknown_character * num_null_left, middle_length = max_length - left_edge - right_edge
sequence[left_edge:left_edge + num_not_null_middle], num_null = max_length - len(sequence)
klass.unknown_character * num_null_right, num_null_left = int(math.ceil(num_null / 2))
sequence[-right_edge:], num_null_right = int(math.floor(num_null / 2))
]) num_not_null_middle = middle_length - num_null
assert len(string_encoding) == max_length
return string_encoding result[i] = numpy.concatenate([
sequence_indexes[:left_edge],
numpy.ones(num_null_left) * fill_value,
sequence_indexes[left_edge:left_edge + num_not_null_middle],
numpy.ones(num_null_right) * fill_value,
sequence_indexes[-right_edge:],
])
assert len(result[i]) == max_length
return result
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment