From 30f4442aeded4d9ca376e2d4534de253b7480ee8 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Tue, 9 Apr 2019 22:52:32 -0400
Subject: [PATCH] update encodable_sequences

---
 mhcflurry/encodable_sequences.py | 131 +++++++++++++++++--------------
 1 file changed, 71 insertions(+), 60 deletions(-)

diff --git a/mhcflurry/encodable_sequences.py b/mhcflurry/encodable_sequences.py
index 47d62f88..2560a8ef 100644
--- a/mhcflurry/encodable_sequences.py
+++ b/mhcflurry/encodable_sequences.py
@@ -132,7 +132,12 @@ class EncodableSequences(object):
 
     @classmethod
     def sequences_to_fixed_length_index_encoded_array(
-            klass, sequences, left_edge=4, right_edge=4, max_length=15):
+            klass,
+            sequences,
+            alignment_method="pad_middle",
+            left_edge=4,
+            right_edge=4,
+            max_length=15):
         """
         Transform a sequence of strings, where each string is of length at least
         left_edge + right_edge and at most max_length into strings of length
@@ -161,63 +166,69 @@ class EncodableSequences(object):
         -------
         numpy array of shape (len(sequences), max_length) and dtype int
         """
-
-        # Result array is int32, filled with X (null amino acid) value.
-        result = numpy.full(
-            fill_value=amino_acid.AMINO_ACID_INDEX['X'],
-            shape=(len(sequences), max_length),
-            dtype="int32")
-
-        df = pandas.DataFrame({"peptide": sequences})
-        df["length"] = df.peptide.str.len()
-
-        middle_length = max_length - left_edge - right_edge
-
-        # For efficiency we handle each supported peptide length using bulk
-        # array operations.
-        for (length, sub_df) in df.groupby("length"):
-            if length < left_edge + right_edge:
-                raise ValueError(
-                    "Sequence '%s' (length %d) unsupported: length must be at "
-                    "least %d. There are %d total peptides with this length." % (
-                        sub_df.iloc[0].peptide, length, left_edge + right_edge,
-                        len(sub_df)))
-            if length > max_length:
-                raise ValueError(
-                    "Sequence '%s' (length %d) unsupported: length must be at "
-                    "most %d. There are %d total peptides with this length." % (
-                        sub_df.iloc[0].peptide, length, max_length,
-                        len(sub_df)))
-
-            # Array of shape (num peptides, length) giving fixed-length amino
-            # acid encoding each peptide of the current length.
-            fixed_length_sequences = numpy.stack(
-                sub_df.peptide.map(
-                    lambda s: numpy.array([
-                        amino_acid.AMINO_ACID_INDEX[char] for char in s
-                    ])).values)
-
-            num_null = max_length - length
-            num_null_left = int(math.ceil(num_null / 2))
-            num_middle_filled = middle_length - num_null
-            middle_start = left_edge + num_null_left
-
-            # Set left edge
-            result[sub_df.index, :left_edge] = fixed_length_sequences[
-                :, :left_edge
-            ]
-
-            # Set middle.
-            result[
-                sub_df.index,
-                middle_start : middle_start + num_middle_filled
-            ] = fixed_length_sequences[
-                :, left_edge : left_edge + num_middle_filled
-            ]
-
-            # Set right edge.
-            result[
-                sub_df.index,
-                -right_edge:
-            ] = fixed_length_sequences[:, -right_edge:]
+        result = None
+        if alignment_method == 'pad_middle':
+            # Result array is int32, filled with X (null amino acid) value.
+            result = numpy.full(
+                fill_value=amino_acid.AMINO_ACID_INDEX['X'],
+                shape=(len(sequences), max_length),
+                dtype="int32")
+
+            df = pandas.DataFrame({"peptide": sequences})
+            df["length"] = df.peptide.str.len()
+
+            middle_length = max_length - left_edge - right_edge
+
+            # For efficiency we handle each supported peptide length using bulk
+            # array operations.
+            for (length, sub_df) in df.groupby("length"):
+                if length < left_edge + right_edge:
+                    raise ValueError(
+                        "Sequence '%s' (length %d) unsupported: length must be at "
+                        "least %d. There are %d total peptides with this length." % (
+                            sub_df.iloc[0].peptide, length, left_edge + right_edge,
+                            len(sub_df)))
+                if length > max_length:
+                    raise ValueError(
+                        "Sequence '%s' (length %d) unsupported: length must be at "
+                        "most %d. There are %d total peptides with this length." % (
+                            sub_df.iloc[0].peptide, length, max_length,
+                            len(sub_df)))
+
+                # Array of shape (num peptides, length) giving fixed-length amino
+                # acid encoding each peptide of the current length.
+                fixed_length_sequences = numpy.stack(
+                    sub_df.peptide.map(
+                        lambda s: numpy.array([
+                            amino_acid.AMINO_ACID_INDEX[char] for char in s
+                        ])).values)
+
+                num_null = max_length - length
+                num_null_left = int(math.ceil(num_null / 2))
+                num_middle_filled = middle_length - num_null
+                middle_start = left_edge + num_null_left
+
+                # Set left edge
+                result[sub_df.index, :left_edge] = fixed_length_sequences[
+                    :, :left_edge
+                ]
+
+                # Set middle.
+                result[
+                    sub_df.index,
+                    middle_start : middle_start + num_middle_filled
+                ] = fixed_length_sequences[
+                    :, left_edge : left_edge + num_middle_filled
+                ]
+
+                # Set right edge.
+                result[
+                    sub_df.index,
+                    -right_edge:
+                ] = fixed_length_sequences[:, -right_edge:]
+        elif alignment_method == "left_pad_right_pad":
+            raise NotImplementedError
+        else:
+            raise NotImplementedError(
+                "Unsupported alignment method: %s" % alignment_method)
         return result
-- 
GitLab