Skip to content
Snippets Groups Projects
allele_encoding.py 5.07 KiB
Newer Older
Tim O'Donnell's avatar
Tim O'Donnell committed
from six import callable

import pandas

from . import amino_acid
from .allele_encoding_transforms import TRANSFORMS
class AlleleEncoding(object):
    def __init__(
            self,
            alleles=None,
            allele_to_sequence=None,
            transforms=None,
            borrow_from=None):
Tim O'Donnell's avatar
Tim O'Donnell committed
        A place to cache encodings for a (potentially large) sequence of alleles.

        Parameters
        ----------
        alleles : list of string
            Allele names

        allele_to_sequence : dict of str -> str
            Allele name to amino acid sequence
        if alleles is not None:
            alleles = pandas.Series(alleles)
        self.borrow_from = borrow_from
        self.allele_to_sequence = allele_to_sequence

        if transforms is None:
            transforms = dict(
                (name, klass()) for (name, klass) in TRANSFORMS.items())
        self.transforms = transforms

        if self.borrow_from is None:
            assert allele_to_sequence is not None
            all_alleles = (
Tim O'Donnell's avatar
Tim O'Donnell committed
                sorted(allele_to_sequence))
                #if alleles is None
                #else list(sorted(alleles.unique())))
            self.allele_to_index = dict(
                (allele, i)
                for (i, allele) in enumerate(all_alleles))
            unpadded = pandas.Series(
                [allele_to_sequence[a] for a in all_alleles],
                index=all_alleles)
            self.sequences = unpadded.str.pad(
                unpadded.str.len().max(), fillchar="X")
        else:
            assert allele_to_sequence is None
            self.allele_to_index = borrow_from.allele_to_index
            self.sequences = borrow_from.sequences
            self.allele_to_sequence = borrow_from.allele_to_sequence
            self.transforms = borrow_from.transforms
Tim O'Donnell's avatar
Tim O'Donnell committed
                allele in self.allele_to_index for allele in alleles),\
                "Missing alleles: " + " ".join(set(
                    a for a in alleles if a not in self.allele_to_index))
            self.indices = alleles.map(self.allele_to_index)
            assert not self.indices.isnull().any()

        self.encoding_cache = {}

    def allele_representations(self, encoding_name):
            return self.borrow_from.allele_representations(encoding_name)
        if cache_key not in self.encoding_cache:
Tim O'Donnell's avatar
Tim O'Donnell committed
            if ":" in encoding_name:
                # Transform
                pieces = encoding_name.split(":", 3)
                if pieces[0] != "transform":
                    raise RuntimeError(
                        "Expected 'transform' but saw: %s" % pieces[0])
                if len(pieces) == 1:
                    raise RuntimeError("Expected: 'transform:<name>[:argument]")
                transform_name = pieces[1]
                argument = None if len(pieces) == 2 else pieces[2]
                try:
                    transform = self.transforms[transform_name]
                except KeyError:
                    raise KeyError(
                        "Unsupported transform: %s. Supported transforms: %s" % (
                            transform_name,
                            " ".join(self.transforms) if self.transforms else "(none)"))
Tim O'Donnell's avatar
Tim O'Donnell committed
                vector_encoded = (
                    transform.transform(self) if argument is None
                    else transform.transform(self, argument))
            else:
                # No transform.
                index_encoded_matrix = amino_acid.index_encoding(
                    self.sequences.values,
                    amino_acid.AMINO_ACID_INDEX)
                vector_encoded = amino_acid.fixed_vectors_encoding(
                    index_encoded_matrix,
                    amino_acid.ENCODING_DATA_FRAMES[encoding_name])
            self.encoding_cache[cache_key] = vector_encoded
        return self.encoding_cache[cache_key]

    def fixed_length_vector_encoded_sequences(self, encoding_name):
        """
        Encode alleles.
        Parameters
        ----------
        encoding_name : string
            How to represent amino acids.
            One of "BLOSUM62", "one-hot", etc. Full list of supported vector
            encodings is given by available_vector_encodings() in amino_acid.

            Also supported are names like pca:BLOSUM62, which would run the
            "pca" transform on BLOSUM62-encoded sequences.
        numpy.array with shape (num sequences, sequence length, m) where m is
        vector_encoding_length(vector_encoding_name)
        """
        cache_key = (
            "fixed_length_vector_encoding",
        if cache_key not in self.encoding_cache:
            vector_encoded = self.allele_representations(encoding_name)
            self.encoding_cache[cache_key] = result
        return self.encoding_cache[cache_key]