Skip to content
Snippets Groups Projects
Commit 8a67cac6 authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

Update AlleleEncoding to support arbitrary values

parent deb7c162
No related merge requests found
......@@ -8,7 +8,7 @@ class AlleleEncoding(object):
def __init__(
self,
alleles,
allele_to_fixed_length_sequence=None):
allele_to_fixed_length_sequence):
"""
A place to cache encodings for a (potentially large) sequence of alleles.
......@@ -18,22 +18,22 @@ class AlleleEncoding(object):
Allele names
allele_to_fixed_length_sequence : dict of str -> str
Allele name to fixed lengths sequence ("pseudosequence")
Allele name to fixed lengths sequence ("pseudosequence"), or a
pandas dataframe with allele names as the index and arbitrary values
to use for the encoding of those alleles
"""
alleles = pandas.Series(alleles)
self.alleles = pandas.Series(alleles)
all_alleles = list(sorted(alleles.unique()))
self.allele_to_index = dict(
(allele, i)
for (i, allele) in enumerate(all_alleles))
self.indices = alleles.map(self.allele_to_index)
self.fixed_length_sequences = pandas.Series(
[allele_to_fixed_length_sequence[a] for a in all_alleles],
index=all_alleles)
if isinstance(allele_to_fixed_length_sequence, dict):
self.allele_to_fixed_length_sequence = pandas.DataFrame(
index=allele_to_fixed_length_sequence)
self.allele_to_fixed_length_sequence["value"] = (
self.allele_to_fixed_length_sequence.index.map(
allele_to_fixed_length_sequence.get))
else:
assert isinstance(allele_to_fixed_length_sequence, pandas.DataFrame)
self.allele_to_fixed_length_sequence = allele_to_fixed_length_sequence
self.encoding_cache = {}
......@@ -48,6 +48,10 @@ class AlleleEncoding(object):
One of "BLOSUM62", "one-hot", etc. Full list of supported vector
encodings is given by available_vector_encodings() in amino_acid.
If a DataFrame was provided as `allele_to_fixed_length_sequence`
in the constructor, then those values will be used and this argument
will be ignored.
Returns
-------
numpy.array with shape (num sequences, sequence length, m) where m is
......@@ -57,13 +61,34 @@ class AlleleEncoding(object):
"fixed_length_vector_encoding",
vector_encoding_name)
if cache_key not in self.encoding_cache:
index_encoded_matrix = amino_acid.index_encoding(
self.fixed_length_sequences.values,
amino_acid.AMINO_ACID_INDEX)
vector_encoded = amino_acid.fixed_vectors_encoding(
index_encoded_matrix,
amino_acid.ENCODING_DATA_FRAMES[vector_encoding_name])
result = vector_encoded[self.indices]
all_alleles = list(sorted(self.alleles.unique()))
allele_to_index = dict(
(allele, i)
for (i, allele) in enumerate(all_alleles))
indices = self.alleles.map(allele_to_index)
allele_to_fixed_length_sequence = self.allele_to_fixed_length_sequence.loc[
all_alleles
].copy()
if list(allele_to_fixed_length_sequence) == ["value"]:
# Pseudosequence
index_encoded_matrix = amino_acid.index_encoding(
allele_to_fixed_length_sequence["value"].values,
amino_acid.AMINO_ACID_INDEX)
vector_encoded = amino_acid.fixed_vectors_encoding(
index_encoded_matrix,
amino_acid.ENCODING_DATA_FRAMES[vector_encoding_name])
flattened = pandas.DataFrame(
vector_encoded.reshape(
(len(allele_to_fixed_length_sequence), -1)))
encoding_shape = vector_encoded.shape[1:]
else:
# Raw values
flattened = allele_to_fixed_length_sequence
encoding_shape = (allele_to_fixed_length_sequence.shape[1],)
result = flattened.iloc[indices].values.reshape(
(len(self.alleles),) + encoding_shape)
self.encoding_cache[cache_key] = result
return self.encoding_cache[cache_key]
......
import time
from mhcflurry.allele_encoding import AlleleEncoding
from mhcflurry.amino_acid import BLOSUM62_MATRIX
from nose.tools import eq_
from numpy.testing import assert_equal
import numpy
import pandas
def test_allele_encoding_speed():
encoding = AlleleEncoding(
["A*02:01", "A*02:03", "A*02:01"],
{
"A*02:01": "AC",
"A*02:03": "AE",
}
)
start = time.time()
encoding1 = encoding.fixed_length_vector_encoded_sequences("BLOSUM62")
assert_equal(
[
[BLOSUM62_MATRIX["A"], BLOSUM62_MATRIX["C"]],
[BLOSUM62_MATRIX["A"], BLOSUM62_MATRIX["E"]],
[BLOSUM62_MATRIX["A"], BLOSUM62_MATRIX["C"]],
], encoding1)
print("Simple encoding in %0.2f sec." % (time.time() - start))
print(encoding1)
encoding = AlleleEncoding(
["A*02:01", "A*02:03", "A*02:01"] * int(1e5),
{
"A*02:01": "AC" * 16,
"A*02:03": "AE" * 16,
}
)
start = time.time()
encoding1 = encoding.fixed_length_vector_encoded_sequences("BLOSUM62")
print("Long encoding in %0.2f sec." % (time.time() - start))
def test_allele_encoding_raw_values():
encoding = AlleleEncoding(
["A*02:01", "A*02:03", "A*02:01"],
pandas.DataFrame(
[
[0, 1, -1],
[10, 11, 12],
],
index=["A*02:01", "A*02:03"]))
encoding1 = encoding.fixed_length_vector_encoded_sequences("BLOSUM62")
assert_equal(
[
[0, 1, -1],
[10, 11, 12],
[0, 1, -1],
], encoding1)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment