# Newer
# Older
import numpy
import pandas
from .encodable_sequences import EncodableSequences
from . import amino_acid
class AlleleEncoding(object):
    """
    Cache of vector encodings for a (potentially large) sequence of alleles.

    Encodings are computed once per unique allele and then expanded back to
    one row per input allele; results are memoized per encoding name in
    ``self.encoding_cache``.
    """

    def __init__(
            self,
            alleles,
            allele_to_fixed_length_sequence):
        """
        A place to cache encodings for a (potentially large) sequence of
        alleles.

        Parameters
        ----------
        alleles : list of string
            Allele names

        allele_to_fixed_length_sequence : dict of str -> str
            Allele name to fixed lengths sequence ("pseudosequence"), or a
            pandas dataframe with allele names as the index and arbitrary
            values to use for the encoding of those alleles
        """
        self.alleles = pandas.Series(alleles)

        if isinstance(allele_to_fixed_length_sequence, dict):
            # Normalize the dict into a one-column ("value") DataFrame indexed
            # by allele name, so both input kinds share one representation.
            self.allele_to_fixed_length_sequence = pandas.DataFrame(
                index=list(allele_to_fixed_length_sequence))
            self.allele_to_fixed_length_sequence["value"] = (
                self.allele_to_fixed_length_sequence.index.map(
                    allele_to_fixed_length_sequence.get))
        else:
            assert isinstance(
                allele_to_fixed_length_sequence, pandas.DataFrame)
            self.allele_to_fixed_length_sequence = (
                allele_to_fixed_length_sequence)

        # Maps (encoding kind, vector_encoding_name) -> encoded array. Must
        # exist before fixed_length_vector_encoded_sequences() is called.
        self.encoding_cache = {}

    def fixed_length_vector_encoded_sequences(self, vector_encoding_name):
        """
        Encode alleles.

        Parameters
        ----------
        vector_encoding_name : string
            How to represent amino acids.
            One of "BLOSUM62", "one-hot", etc. Full list of supported vector
            encodings is given by available_vector_encodings() in amino_acid.
            If a DataFrame was provided as `allele_to_fixed_length_sequence`
            in the constructor, then those values will be used and this
            argument will be ignored.

        Returns
        -------
        numpy.array with one leading entry per element of `alleles`.

        For pseudosequences the documented shape is
        (num sequences, sequence length, m) where m is
        vector_encoding_length(vector_encoding_name). For a user-supplied
        DataFrame the shape is (num sequences, number of columns).

        Raises
        ------
        KeyError if an allele has no entry in the allele table (raised by the
        pandas `.loc` lookup), or if `vector_encoding_name` is not a key of
        amino_acid.ENCODING_DATA_FRAMES (pseudosequence case only).
        """
        cache_key = (
            "fixed_length_vector_encoding",
            vector_encoding_name)
        if cache_key not in self.encoding_cache:
            # Encode each distinct allele only once; `indices` records, for
            # every input allele, which row of the per-unique-allele encoding
            # it corresponds to.
            all_alleles = sorted(self.alleles.unique())
            allele_to_index = {
                allele: i for (i, allele) in enumerate(all_alleles)
            }
            indices = self.alleles.map(allele_to_index)

            # Rows selected in the same (sorted) order used for the indices.
            allele_table = self.allele_to_fixed_length_sequence.loc[
                all_alleles
            ]

            if list(allele_table) == ["value"]:
                # Pseudosequence
                index_encoded_matrix = amino_acid.index_encoding(
                    allele_table["value"].values,
                    amino_acid.AMINO_ACID_INDEX)
                vector_encoded = amino_acid.fixed_vectors_encoding(
                    index_encoded_matrix,
                    amino_acid.ENCODING_DATA_FRAMES[vector_encoding_name])
            else:
                # Raw values
                vector_encoded = allele_table.values

            # Expand from one row per unique allele to one row per input
            # allele. The explicit int cast keeps fancy indexing valid when
            # `alleles` is empty (the mapped Series is then not int-typed).
            result = vector_encoded[numpy.asarray(indices, dtype=int)]
            self.encoding_cache[cache_key] = result
        return self.encoding_cache[cache_key]