encodable_sequences.py

# Copyright (c) 2016. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import (
    print_function,
    division,
    absolute_import,
)

import math

import numpy
import pandas

import typechecks

from . import amino_acid


class EncodableSequences(object):
    """
    Sequences of amino acids.
    
    This class caches various encodings of a list of sequences.
    """
    unknown_character = "X"

    @classmethod
    def create(klass, sequences):
        """
        Factory that returns an EncodableSequences given a list of
        strings. As a convenience, you can also pass it an EncodableSequences
        instance, in which case the object is returned unchanged.
        """
        if isinstance(sequences, klass):
            return sequences
        return klass(sequences)

    def __init__(self, sequences):
        typechecks.require_iterable_of(
            sequences, typechecks.string_types, "sequences")
        self.sequences = numpy.array(sequences)
        self.encoding_cache = {}
        self.fixed_sequence_length = None
        if len(self.sequences) > 0 and all(
                len(s) == len(self.sequences[0]) for s in self.sequences):
            self.fixed_sequence_length = len(self.sequences[0])

    def __len__(self):
        return len(self.sequences)

    def variable_length_to_fixed_length_categorical(
            self, left_edge=4, right_edge=4, max_length=15):
        """
        Encode variable-length sequences using a fixed-length encoding designed
        for preserving the anchor positions of class I peptides.
        
        The sequences must be of length at least left_edge + right_edge, and at
        most max_length.
        
        Parameters
        ----------
        left_edge : int, size of fixed-position left side
        right_edge : int, size of the fixed-position right side
        max_length : sequence length of the resulting encoding

        Returns
        -------
        numpy.array of integers with shape (num sequences, max_length)
        """

        cache_key = (
            "fixed_length_categorical",
            left_edge,
            right_edge,
            max_length)

        if cache_key not in self.encoding_cache:
            fixed_length_sequences = (
                self.sequences_to_fixed_length_index_encoded_array(
                    self.sequences,
                    left_edge=left_edge,
                    right_edge=right_edge,
                    max_length=max_length))
            self.encoding_cache[cache_key] = fixed_length_sequences
        return self.encoding_cache[cache_key]

    def variable_length_to_fixed_length_vector_encoding(
            self, vector_encoding_name, left_edge=4, right_edge=4, max_length=15):
        """
        Encode variable-length sequences using a fixed-length encoding designed
        for preserving the anchor positions of class I peptides.

        The sequences must be of length at least left_edge + right_edge, and at
        most max_length.

        Parameters
        ----------
        vector_encoding_name : string
            How to represent amino acids.
            One of "BLOSUM62", "one-hot", etc. Full list of supported vector
            encodings is given by available_vector_encodings().
        left_edge : int, size of fixed-position left side
        right_edge : int, size of the fixed-position right side
        max_length : sequence length of the resulting encoding

        Returns
        -------
        numpy.array with shape (num sequences, max_length, m) where m is
        vector_encoding_length(vector_encoding_name)

        """
        cache_key = (
            "fixed_length_vector_encoding",
            vector_encoding_name,
            left_edge,
            right_edge,
            max_length)
        if cache_key not in self.encoding_cache:
            fixed_length_sequences = (
                self.sequences_to_fixed_length_index_encoded_array(
                    self.sequences,
                    left_edge=left_edge,
                    right_edge=right_edge,
                    max_length=max_length))
            result = amino_acid.fixed_vectors_encoding(
                fixed_length_sequences,
                amino_acid.ENCODING_DATA_FRAMES[vector_encoding_name])
            assert result.shape[0] == len(self.sequences)
            self.encoding_cache[cache_key] = result
        return self.encoding_cache[cache_key]

    @classmethod
    def sequences_to_fixed_length_index_encoded_array(
            klass, sequences, left_edge=4, right_edge=4, max_length=15):
        """
        Transform a sequence of strings, where each string is of length at least
        left_edge + right_edge and at most max_length into strings of length
        max_length using a scheme designed to preserve the anchor positions of
        class I peptides.

        The first left_edge characters in the input always map to the first
        left_edge characters in the output. Similarly for the last right_edge
        characters. The middle characters are filled in based on the length,
        with the X character filling in the blanks.

        For example, using defaults:

        AAAACDDDD -> AAAAXXXCXXXDDDD

        The strings are also converted to int categorical amino acid indices.

        Parameters
        ----------
        sequence : string
        left_edge : int
        right_edge : int
        max_length : int

        Returns
        -------
        numpy array of shape (len(sequences), max_length) and dtype int
        """

        # Result array is int32, filled with X (null amino acid) value.
        result = numpy.full(
            fill_value=amino_acid.AMINO_ACID_INDEX['X'],
            shape=(len(sequences), max_length),
            dtype="int32")

        df = pandas.DataFrame({"peptide": sequences})
        df["length"] = df.peptide.str.len()

        middle_length = max_length - left_edge - right_edge

        # For efficiency we handle each supported peptide length using bulk
        # array operations.
        for (length, sub_df) in df.groupby("length"):
            if length < left_edge + right_edge:
                raise ValueError(
                    "Sequence '%s' (length %d) unsupported: length must be at "
                    "least %d. There are %d total peptides with this length." % (
                        sub_df.iloc[0].peptide, length, left_edge + right_edge,
                        len(sub_df)))
            if length > max_length:
                raise ValueError(
                    "Sequence '%s' (length %d) unsupported: length must be at "
                    "most %d. There are %d total peptides with this length." % (
                        sub_df.iloc[0].peptide, length, max_length,
                        len(sub_df)))

            # Array of shape (num peptides, length) giving fixed-length amino
            # acid encoding each peptide of the current length.
            fixed_length_sequences = numpy.stack(
                sub_df.peptide.map(
                    lambda s: numpy.array([
                        amino_acid.AMINO_ACID_INDEX[char] for char in s
                    ])).values)

            num_null = max_length - length
            num_null_left = int(math.ceil(num_null / 2))
            num_middle_filled = middle_length - num_null
            middle_start = left_edge + num_null_left

            # Set left edge
            result[sub_df.index, :left_edge] = fixed_length_sequences[
                :, :left_edge
            ]

            # Set middle.
            result[
                sub_df.index,
                middle_start : middle_start + num_middle_filled
            ] = fixed_length_sequences[
                :, left_edge : left_edge + num_middle_filled
            ]

            # Set right edge.
            result[
                sub_df.index,
                -right_edge:
            ] = fixed_length_sequences[:, -right_edge:]
        return result