# allele_encoding.py
import numpy
import pandas

from .encodable_sequences import EncodableSequences
from . import amino_acid

class AlleleEncoding(object):
    """
    Cache of vector encodings for a (potentially large) sequence of alleles.
    """

    def __init__(
            self,
            alleles,
            allele_to_fixed_length_sequence):
        """
        A place to cache encodings for a (potentially large) sequence of alleles.

        Parameters
        ----------
        alleles : list of string
            Allele names

        allele_to_fixed_length_sequence : dict of str -> str
            Allele name to fixed lengths sequence ("pseudosequence"), or a
            pandas dataframe with allele names as the index and arbitrary values
            to use for the encoding of those alleles

        Raises
        ------
        TypeError
            If `allele_to_fixed_length_sequence` is neither a dict nor a
            pandas.DataFrame.
        """
        self.alleles = pandas.Series(alleles)
        if isinstance(allele_to_fixed_length_sequence, dict):
            # Normalize the dict into a one-column DataFrame (column "value")
            # indexed by allele name, so both input kinds share one lookup path.
            self.allele_to_fixed_length_sequence = pandas.DataFrame(
                {"value": list(allele_to_fixed_length_sequence.values())},
                index=list(allele_to_fixed_length_sequence))
        elif isinstance(allele_to_fixed_length_sequence, pandas.DataFrame):
            self.allele_to_fixed_length_sequence = (
                allele_to_fixed_length_sequence)
        else:
            # Explicit exception rather than `assert`, which is removed when
            # Python runs with -O.
            raise TypeError(
                "allele_to_fixed_length_sequence must be a dict or a "
                "pandas.DataFrame, not %s" % (
                    type(allele_to_fixed_length_sequence).__name__))

        # Maps (encoding kind, vector_encoding_name) -> encoded numpy array.
        self.encoding_cache = {}

    def fixed_length_vector_encoded_sequences(self, vector_encoding_name):
        """
        Encode alleles.

        Results are cached per `vector_encoding_name`; repeated calls with the
        same name return the same array object.

        Parameters
        ----------
        vector_encoding_name : string
            How to represent amino acids.
            One of "BLOSUM62", "one-hot", etc. Full list of supported vector
            encodings is given by available_vector_encodings() in amino_acid.

            If a DataFrame was provided as `allele_to_fixed_length_sequence`
            in the constructor, then those values will be used and this argument
            will be ignored. Note that a DataFrame whose only column is named
            "value" is treated as holding pseudosequences and IS encoded
            using this argument.

        Returns
        -------
        numpy.array with shape
        (num sequences, sequence length, m) where m is
        vector_encoding_length(vector_encoding_name) when pseudosequences are
        encoded, or (num sequences, num columns) when raw DataFrame values are
        used. Row i corresponds to the i-th entry of `alleles`.
        """
        cache_key = (
            "fixed_length_vector_encoding",
            vector_encoding_name)
        if cache_key not in self.encoding_cache:
            # Encode each distinct allele once, then expand to the full
            # (possibly highly repetitive) allele sequence by row lookup.
            all_alleles = sorted(self.alleles.unique())
            allele_to_index = {
                allele: i for (i, allele) in enumerate(all_alleles)
            }
            # Explicit integer dtype so an empty `alleles` sequence still
            # yields a valid index array.
            indices = numpy.asarray(
                self.alleles.map(allele_to_index), dtype=int)

            allele_to_fixed_length_sequence = (
                self.allele_to_fixed_length_sequence.loc[all_alleles])

            if list(allele_to_fixed_length_sequence) == ["value"]:
                # Pseudosequence
                index_encoded_matrix = amino_acid.index_encoding(
                    allele_to_fixed_length_sequence["value"].values,
                    amino_acid.AMINO_ACID_INDEX)
                vector_encoded = amino_acid.fixed_vectors_encoding(
                    index_encoded_matrix,
                    amino_acid.ENCODING_DATA_FRAMES[vector_encoding_name])
            else:
                # Raw values
                vector_encoded = allele_to_fixed_length_sequence.values

            # Integer-array indexing gathers one row per input allele in a
            # single vectorized step and always produces a new array.
            result = numpy.asarray(vector_encoded)[indices]
            self.encoding_cache[cache_key] = result
        return self.encoding_cache[cache_key]