Skip to content
Snippets Groups Projects
Commit 6a292951 authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

more efficient allele encoding

parent 70c28425
No related merge requests found
......@@ -54,8 +54,13 @@ class AlleleEncoding(object):
Returns
-------
numpy.array with shape (num sequences, sequence length, m) where m is
list of numpy arrays. Pass it to numpy.array to get an array with shape
(num sequences, sequence length, m) where m is
vector_encoding_length(vector_encoding_name)
The reason to return a list instead of an array is that the list can
use much less memory in the common case where many of the rows are
the same.
"""
cache_key = (
"fixed_length_vector_encoding",
......@@ -79,16 +84,10 @@ class AlleleEncoding(object):
vector_encoded = amino_acid.fixed_vectors_encoding(
index_encoded_matrix,
amino_acid.ENCODING_DATA_FRAMES[vector_encoding_name])
flattened = pandas.DataFrame(
vector_encoded.reshape(
(len(allele_to_fixed_length_sequence), -1)))
encoding_shape = vector_encoded.shape[1:]
else:
# Raw values
flattened = allele_to_fixed_length_sequence
encoding_shape = (allele_to_fixed_length_sequence.shape[1],)
result = flattened.iloc[indices].values.reshape(
(len(self.alleles),) + encoding_shape)
vector_encoded = allele_to_fixed_length_sequence.values
result = [vector_encoded[i] for i in indices]
self.encoding_cache[cache_key] = result
return self.encoding_cache[cache_key]
......
......@@ -55,4 +55,4 @@ def test_allele_encoding_raw_values():
[0, 1, -1],
[10, 11, 12],
[0, 1, -1],
], encoding1)
], numpy.array(encoding1))
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment