Skip to content
Snippets Groups Projects
Commit 41985338 authored by Tim O'Donnell
Browse files

Much faster performance (~5X improvement) in amino acid encoding

parent 78cdb265
No related branches found
No related tags found
No related merge requests found
...@@ -21,7 +21,6 @@ import collections ...@@ -21,7 +21,6 @@ import collections
from copy import copy from copy import copy
import pandas import pandas
import numpy
from six import StringIO from six import StringIO
...@@ -136,21 +135,27 @@ def index_encoding(sequences, letter_to_index_dict): ...@@ -136,21 +135,27 @@ def index_encoding(sequences, letter_to_index_dict):
return result.values return result.values
def fixed_vectors_encoding(sequences, letter_to_vector_function): def fixed_vectors_encoding(sequences, letter_to_vector_df):
""" """
Given a sequence of n strings all of length k, return a n * k * m array where Given a sequence of n strings all of length k, and a dataframe mapping each
the (i, j)th element is letter_to_vector_function(sequence[i][j]). character to an arbitrary vector, return a n * k * m array where
the (i, j)th element is letter_to_vector_df.loc[sequence[i][j]].
Parameters Parameters
---------- ----------
sequences : list of length n of strings of length k sequences : list of length n of strings of length k
letter_to_vector_function : function : string -> vector of length m letter_to_vector_df : pandas.DataFrame of shape (alphabet size, m)
The index of the dataframe should be amino acid characters.
Returns Returns
------- -------
numpy.array of integers with shape (n, k, m) numpy.array of integers with shape (n, k, m)
""" """
arr = numpy.array([list(s) for s in sequences]) target_shape = (
result = numpy.vectorize( len(sequences),
letter_to_vector_function, signature='()->(n)')(arr) len(sequences[0]),
return result letter_to_vector_df.shape[0])
\ No newline at end of file result = letter_to_vector_df.loc[
(letter for seq in sequences for letter in seq)
].values.reshape(target_shape)
return result
...@@ -140,7 +140,7 @@ class EncodableSequences(object): ...@@ -140,7 +140,7 @@ class EncodableSequences(object):
] ]
result = amino_acid.fixed_vectors_encoding( result = amino_acid.fixed_vectors_encoding(
fixed_length_sequences, fixed_length_sequences,
amino_acid.ENCODING_DFS[vector_encoding_name].loc.__getitem__) amino_acid.ENCODING_DFS[vector_encoding_name])
assert result.shape[0] == len(self.sequences) assert result.shape[0] == len(self.sequences)
self.encoding_cache[cache_key] = result self.encoding_cache[cache_key] = result
return self.encoding_cache[cache_key] return self.encoding_cache[cache_key]
......
...@@ -2,6 +2,7 @@ from mhcflurry import amino_acid ...@@ -2,6 +2,7 @@ from mhcflurry import amino_acid
from nose.tools import eq_ from nose.tools import eq_
from numpy.testing import assert_equal from numpy.testing import assert_equal
import numpy import numpy
import pandas
letter_to_index_dict = { letter_to_index_dict = {
'A': 0, 'A': 0,
...@@ -11,6 +12,14 @@ letter_to_index_dict = { ...@@ -11,6 +12,14 @@ letter_to_index_dict = {
def test_index_and_one_hot_encoding(): def test_index_and_one_hot_encoding():
letter_to_vector_df = pandas.DataFrame(
[
[1, 0, 0,],
[0, 1, 0,],
[0, 0, 1,]
], columns=[0, 1, 2]
)
index_encoding = amino_acid.index_encoding( index_encoding = amino_acid.index_encoding(
["AAAA", "ABCA"], letter_to_index_dict) ["AAAA", "ABCA"], letter_to_index_dict)
assert_equal( assert_equal(
...@@ -21,11 +30,7 @@ def test_index_and_one_hot_encoding(): ...@@ -21,11 +30,7 @@ def test_index_and_one_hot_encoding():
]) ])
one_hot = amino_acid.fixed_vectors_encoding( one_hot = amino_acid.fixed_vectors_encoding(
index_encoding, index_encoding,
{ letter_to_vector_df)
0: numpy.array([1, 0, 0]),
1: numpy.array([0, 1, 0]),
2: numpy.array([0, 0, 1]),
}.get)
eq_(one_hot.shape, (2, 4, 3)) eq_(one_hot.shape, (2, 4, 3))
assert_equal( assert_equal(
one_hot[0], one_hot[0],
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment