From 4b0fe361548d2da2337cbc5882e9ee82dfa2a8f6 Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Sat, 21 Nov 2015 16:31:27 -0500
Subject: [PATCH] added unit tests for peptide shortening and extension

---
 mhcflurry/__init__.py                  |   6 +-
 mhcflurry/{data_helpers.py => data.py} | 136 +++++++++---------
 mhcflurry/fixed_length_peptides.py     | 188 +++++++++++++++----------
 mhcflurry/mhc1_binding_predictor.py    |   3 +-
 mhcflurry/peptide_encoding.py          |  65 ++++++---
 test/test_fixed_length_peptides.py     | 109 ++++++++++++++
 6 files changed, 347 insertions(+), 160 deletions(-)
 rename mhcflurry/{data_helpers.py => data.py} (72%)
 create mode 100644 test/test_fixed_length_peptides.py

diff --git a/mhcflurry/__init__.py b/mhcflurry/__init__.py
index 5f7b272d..04e607f0 100644
--- a/mhcflurry/__init__.py
+++ b/mhcflurry/__init__.py
@@ -13,17 +13,19 @@
 # limitations under the License.
 
 from . import paths
-from . import data_helpers
+from . import data
 from . import feedforward
 from . import common
 from . import fixed_length_peptides
+from . import peptide_encoding
 from .mhc1_binding_predictor import Mhc1BindingPredictor
 
 __all__ = [
     "paths",
-    "data_helpers",
+    "data",
     "feedforward",
     "fixed_length_peptides",
+    "peptide_encoding",
     "common",
     "Mhc1BindingPredictor"
 ]
diff --git a/mhcflurry/data_helpers.py b/mhcflurry/data.py
similarity index 72%
rename from mhcflurry/data_helpers.py
rename to mhcflurry/data.py
index be40e8c6..a097e335 100644
--- a/mhcflurry/data_helpers.py
+++ b/mhcflurry/data.py
@@ -23,53 +23,25 @@ import pandas as pd
 import numpy as np
 
 from .common import normalize_allele_name
-from .amino_acid import amino_acid_letter_indices
-
-AlleleData = namedtuple("AlleleData", "X Y peptides ic50")
-
-
-def hotshot_encoding(peptides, peptide_length):
-    """
-    Encode a set of equal length peptides as a binary matrix,
-    where each letter is transformed into a length 20 vector with a single
-    element that is 1 (and the others are 0).
-    """
-    shape = (len(peptides), peptide_length, 20)
-    X = np.zeros(shape, dtype=bool)
-    for i, peptide in enumerate(peptides):
-        for j, amino_acid in enumerate(peptide):
-            k = amino_acid_letter_indices[amino_acid]
-            X[i, j, k] = 1
-    return X
-
-
-def index_encoding(peptides, peptide_length):
-    """
-    Encode a set of equal length peptides as a vector of their
-    amino acid indices.
-    """
-    X = np.zeros((len(peptides), peptide_length), dtype=int)
-    for i, peptide in enumerate(peptides):
-        for j, amino_acid in enumerate(peptide):
-            X[i, j] = amino_acid_letter_indices[amino_acid]
-    return X
-
-
-def indices_to_hotshot_encoding(X, n_indices=None, first_index_value=0):
-    """
-    Given an (n_samples, peptide_length) integer matrix
-    convert it to a binary encoding of shape:
-        (n_samples, peptide_length * n_indices)
-    """
-    (n_samples, peptide_length) = X.shape
-    if not n_indices:
-        n_indices = X.max() - first_index_value + 1
+from .peptide_encoding import (
+    fixed_length_index_encoding,
+    indices_to_hotshot_encoding,
+)
 
-    X_binary = np.zeros((n_samples, peptide_length * n_indices), dtype=bool)
-    for i, row in enumerate(X):
-        for j, xij in enumerate(row):
-            X_binary[i, n_indices * j + xij - first_index_value] = 1
-    return X_binary.astype(float)
+AlleleData = namedtuple(
+    "AlleleData",
+    [
+        "X_index",    # index-based featue encoding of fixed length peptides
+        "X_binary",  # binary encoding of fixed length peptides
+        "Y",     # regression encoding of IC50 (log scaled between 0..1)
+        "peptides",  # list of fixed length peptide string
+        "ic50",      # IC50 value associated with each entry
+        "original_peptides",  # original peptides may be of different lengths
+        "original_length",  # len(original_peptide)
+        "substring_count",  # how many substrings were extracted from
+                            # each original peptide string
+        "weight",    # 1.0 / count
+    ])
 
 
 def _infer_csv_separator(filename):
@@ -213,8 +185,8 @@ def load_allele_dicts(
 def load_allele_datasets(
         filename,
         peptide_length=9,
-        max_ic50=5000.0,
-        binary_encoding=True,
+        use_multiple_peptide_lengths=True,
+        max_ic50=50000.0,
         flatten_binary_encoding=True,
         sep=None,
         species_column_name="species",
@@ -241,14 +213,18 @@ def load_allele_datasets(
     peptide_length : int
         Which length peptides to use (default=9)
 
+    use_multiple_peptide_lengths : bool
+        If a peptide is shorter than `peptide_length`, expand it into many
+        peptides of the appropriate length by inserting all combinations of
+        amino acids. Similarly, if a peptide is longer than `peptide_length`,
+        shorten it by deleting stretches of contiguous amino acids at all
+        peptide positions.
+
     max_ic50 : float
         Treat IC50 scores above this value as all equally bad
         (transform them to 0.0 in the rescaled output)
 
-    binary_encoding : bool
-        Encode amino acids of each peptide as indices or binary vectors
-
-    flatten_features : bool
+    flatten_binary_encoding : bool
         If False, returns a (n_samples, peptide_length, 20) matrix, otherwise
         returns the 2D flattened version of the same data.
 
@@ -275,21 +251,53 @@ def load_allele_datasets(
 
     allele_groups = {}
     for allele, group in df.groupby(allele_column_name):
-        ic50 = np.array(group[ic50_column_name])
-        Y = np.array(group["regression_output"])
-        peptides = list(group[peptide_column_name])
-        if binary_encoding:
-            X = hotshot_encoding(peptides, peptide_length=peptide_length)
-            if flatten_binary_encoding:
-                # collapse 3D input into 2D matrix
-                X = X.reshape((X.shape[0], peptide_length * 20))
-        else:
-            X = index_encoding(peptides, peptide_length=peptide_length)
         assert allele not in allele_groups, \
             "Duplicate datasets for %s" % allele
+
+        raw_peptides = group[peptide_column_name]
+
+        # filter lengths in case user wants to drop peptides that are longer
+        # or shorter than the desired fixed length
+        if not use_multiple_peptide_lengths:
+            drop_mask = raw_peptides.str.len() != peptide_length
+            group = group[~drop_mask]
+            raw_peptides = raw_peptides[~drop_mask]
+
+        # convert from a Pandas column to a list, since that's easier to
+        # interact with later
+        raw_peptides = list(raw_peptides)
+        # convert numberical values from a Pandas column to arrays
+        ic50 = np.array(group[ic50_column_name])
+        Y = np.array(group["regression_output"])
+
+        X_index, original_peptides, counts = fixed_length_index_encoding(
+            peptides=raw_peptides,
+            desired_length=peptide_length)
+
+        X_binary = indices_to_hotshot_encoding(X_index, n_indices=20)
+        assert X_binary.shape[0] == X_index.shape[0], \
+            ("Mismatch between number of samples for index encoding (%d)"
+             " vs. binary encoding (%d)") % (
+                X_binary.shape[0],
+                X_index.shape[0])
+        n_samples = X_binary.shape[0]
+
+        if flatten_binary_encoding:
+            # collapse 3D input into 2D matrix
+            n_binary_features = peptide_length * 20
+            X_binary = X_binary.reshape((n_samples, n_binary_features))
+
+        # easier to work with counts when they're an array instead of list
+        counts = np.array(counts)
+
         allele_groups[allele] = AlleleData(
-            X=X,
+            X_index=X_index,
+            X_binary=X_binary,
             Y=Y,
             ic50=ic50,
-            peptides=peptides)
+            peptides=raw_peptides,
+            original_peptides=original_peptides,
+            original_length=[len(peptide) for peptide in original_peptides],
+            substring_count=counts,
+            weight=1.0 / counts)
     return allele_groups
diff --git a/mhcflurry/fixed_length_peptides.py b/mhcflurry/fixed_length_peptides.py
index d5ef0a69..edaa4548 100644
--- a/mhcflurry/fixed_length_peptides.py
+++ b/mhcflurry/fixed_length_peptides.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import itertools
+import logging
 
 from .amino_acid import amino_acid_letters
 
@@ -20,14 +21,25 @@ from .amino_acid import amino_acid_letters
 def all_kmers(k, alphabet=amino_acid_letters):
     """
     Generates all k-mer peptide sequences
+
+    Parameters
+    ----------
+    k : int
+
+    alphabet : str | list of characters
     """
+    alphabets = [alphabet] * k
     return [
         "".join(combination)
         for combination
-        in itertools.product(list(alphabet) * k)
+        in itertools.product(*alphabets)
     ]
 
 
+class CombinatorialExplosion(Exception):
+    pass
+
+
 def extend_peptide(
         peptide,
         desired_length,
@@ -35,18 +47,37 @@ def extend_peptide(
         end_offset,
         alphabet=amino_acid_letters):
     """Extend peptide by inserting every possible amino acid combination
-    if we're trying to e.g. turn an 8mer into 9mers
+    if we're trying to e.g. turn an 8mer into 9mers.
+
+    Parameters
+    ----------
+    peptide : str
+
+    desired_length : int
+
+    start_offset : int
+        How many characters (from the position before the start of the string)
+        to skip before inserting characters.
+
+
+    end_offset : int
+        Last character position from the end where we insert new characters,
+        where 0 is the position after the last character.
+
+    alphabet : str | list of character
     """
     n = len(peptide)
-    assert n < desired_length
+    assert n < desired_length, \
+        "%s (length = %d) is too long! Must be shorter than %d" % (
+            peptide, n, desired_length)
     n_missing = desired_length - n
     if n_missing > 3:
-        raise ValueError(
+        raise CombinatorialExplosion(
             "Cannot transform %s of length %d into a %d-mer peptide" % (
                 peptide, n, desired_length))
     return [
         peptide[:i] + extra + peptide[i:]
-        for i in range(start_offset, n - end_offset)
+        for i in range(start_offset, n - end_offset + 1)
         for extra in all_kmers(n_missing, alphabet=alphabet)
     ]
 
@@ -57,9 +88,25 @@ def shorten_peptide(
         start_offset,
         end_offset,
         alphabet=amino_acid_letters):
-    """Shorten peptide if trying to e.g. turn 10mer into 9mers"""
+    """Shorten peptide if trying to e.g. turn 10mer into 9mers
+
+    Parameters
+    ----------
+
+    peptide : str
+
+    desired_length : int
+
+    start_offset : int
+
+    end_offset : int
+
+    alphabet : str | list of characters
+    """
     n = len(peptide)
-    assert n > desired_length
+    assert n > desired_length, \
+        "%s (length = %d) is too short! Must be longer than %d" % (
+            peptide, n, desired_length)
     n_skip = n - desired_length
     assert n_skip > 0, \
         "Expected length of peptide %s %d to be greater than %d" % (
@@ -71,50 +118,6 @@ def shorten_peptide(
     ]
 
 
-def fixed_length_from_single_peptide(
-        peptide,
-        desired_length,
-        start_offset_extend=2,
-        end_offset_extend=1,
-        start_offset_shorten=2,
-        end_offset_shorten=0,
-        alphabet=amino_acid_letters):
-    """
-    Create a set of fixed-length k-mer peptides from a single peptide of any
-    length. Shorter peptides are filled in using all possible amino acids at any
-    insertion site between (start_offset, -end_offset).
-
-    We can recreate the methods from:
-       Accurate approximation method for prediction of class I MHC
-       affinities for peptides of length 8, 10 and 11 using prediction
-       tools trained on 9mers.
-    by Lundegaard et. al. (http://bioinformatics.oxfordjournals.org/content/24/11/1397)
-    with the following settings:
-        - desired_length = 9
-        - start_offset_extend = 2
-        - end_offset_extend = 1
-        - start_offset_shorten = 2
-        - end_offset_shorten = 0
-    """
-    n = len(peptide)
-    if n == desired_length:
-        return [peptide]
-    elif n < desired_length:
-        return extend_peptide(
-            peptide=peptide,
-            desired_length=desired_length,
-            start_offset=start_offset_extend,
-            end_offset=end_offset_extend,
-            alphabet=alphabet)
-    else:
-        return shorten_peptide(
-            peptide=peptide,
-            desired_length=desired_length,
-            start_offset=start_offset_shorten,
-            end_offset=end_offset_shorten,
-            alphabet=alphabet)
-
-
 def fixed_length_from_many_peptides(
         peptides,
         desired_length,
@@ -125,11 +128,19 @@ def fixed_length_from_many_peptides(
         alphabet=amino_acid_letters):
     """
     Create a set of fixed-length k-mer peptides from a collection of varying
-    length peptides. Shorter peptides are filled in using all possible amino
-    acids at any insertion site between
-        [start_offset_extend, length - end_offset_extend).
-    Longer peptides are made smaller by deleting contiguous residues between
-        [start_offset_shorten, length - end_offset_shorten)
+    length peptides.
+
+
+    Shorter peptides are filled in using all possible amino acids at any
+    insertion site between start_offset_shorten and -end_offset_shorten
+    where start_offset_extend=0 represents insertions before the string
+    and end_offset_extend=0 represents insertions after the string's ending.
+
+    Longer peptides are shortened by deleting contiguous residues, starting
+    from start_offset_shorten and ending with -end_offset_shorten. Unlike
+    peptide extensions, the offsets for shortening a peptide range between
+    the first and last positions (rather than between the positions *before*
+    the string starts and the position *after*).
 
     We can recreate the methods from:
        Accurate approximation method for prediction of class I MHC
@@ -138,10 +149,10 @@ def fixed_length_from_many_peptides(
     by Lundegaard et. al. (http://bioinformatics.oxfordjournals.org/content/24/11/1397)
     with the following settings:
         - desired_length = 9
-        - start_offset_extend = 2
-        - end_offset_extend = 1
-        - start_offset_shorten = 2
-        - end_offset_shorten = 0
+        - start_offset_extend = 3
+        - end_offset_extend = 2
+        - start_offset_shorten = 3
+        - end_offset_shorten = 1
 
     Returns three lists:
         - a list of fixed length peptides (all of length `desired_length`)
@@ -162,21 +173,52 @@ def fixed_length_from_many_peptides(
         kmers == ["BC", "AC", "AB", "AA", "BA", "CA", "AA", "AB", "AC"]
         original == ["ABC", "ABC", "ABC", "A", "A", "A", "A", "A", "A"]
         counts == [3, 3, 3, 6, 6, 6, 6, 6, 6]
+
+    Parameters
+    ----------
+    peptide : list of str
+
+    desired_length : int
+
+    start_offset_extend : int
+
+    end_offset_extend : int
+
+    start_offset_shorten : int
+
+    end_offset_shorten : int
+
+    alphabet : str | list of characters
     """
-    fixed_length_peptides = []
+    all_fixed_length_peptides = []
     original_peptide_sequences = []
-    number_expanded = []
+    counts = []
     for peptide in peptides:
-        fixed_length_peptides = fixed_length_from_single_peptide(
-            peptide,
-            desired_length=desired_length,
-            start_offset_extend=start_offset_extend,
-            end_offset_extend=end_offset_extend,
-            start_offset_shorten=start_offset_shorten,
-            end_offset_shorten=end_offset_shorten,
-            alphabet=alphabet)
+        n = len(peptide)
+        if n == desired_length:
+            fixed_length_peptides = [peptide]
+        elif n < desired_length:
+            try:
+                fixed_length_peptides = extend_peptide(
+                    peptide=peptide,
+                    desired_length=desired_length,
+                    start_offset=start_offset_extend,
+                    end_offset=end_offset_extend,
+                    alphabet=alphabet)
+            except CombinatorialExplosion:
+                logging.warn(
+                    "Peptide %s is too short to be extended to length %d" % (
+                        peptide, desired_length))
+                continue
+        else:
+            fixed_length_peptides = shorten_peptide(
+                peptide=peptide,
+                desired_length=desired_length,
+                start_offset=start_offset_shorten,
+                end_offset=end_offset_shorten,
+                alphabet=alphabet)
         n_fixed_length = len(fixed_length_peptides)
-        fixed_length_peptides.extend(fixed_length_peptides)
+        all_fixed_length_peptides.extend(fixed_length_peptides)
         original_peptide_sequences.extend([peptide] * n_fixed_length)
-        number_expanded.extend([n_fixed_length] * n_fixed_length)
-    return fixed_length_peptides, original_peptide_sequences, number_expanded
+        counts.extend([n_fixed_length] * n_fixed_length)
+    return all_fixed_length_peptides, original_peptide_sequences, counts
diff --git a/mhcflurry/mhc1_binding_predictor.py b/mhcflurry/mhc1_binding_predictor.py
index b037b2d6..312eb892 100644
--- a/mhcflurry/mhc1_binding_predictor.py
+++ b/mhcflurry/mhc1_binding_predictor.py
@@ -30,7 +30,8 @@ import pandas as pd
 from keras.models import model_from_config
 
 from .class1_allele_specific_hyperparameters import MAX_IC50
-from .data_helpers import index_encoding, normalize_allele_name
+from .common import normalize_allele_name
+from .peptide_encoding import index_encoding
 from .paths import CLASS1_MODEL_DIRECTORY
 from .fixed_length_peptides import fixed_length_from_many_peptides
 
diff --git a/mhcflurry/peptide_encoding.py b/mhcflurry/peptide_encoding.py
index db20c8ea..ac9f9620 100644
--- a/mhcflurry/peptide_encoding.py
+++ b/mhcflurry/peptide_encoding.py
@@ -13,7 +13,9 @@
 # limitations under the License.
 
 import numpy as np
+
 from .amino_acid import amino_acid_letter_indices
+from .fixed_length_peptides import fixed_length_from_many_peptides
 
 
 def hotshot_encoding(peptides, peptide_length):
@@ -43,26 +45,6 @@ def index_encoding(peptides, peptide_length):
     return X
 
 
-def index_encoding_of_substrings(
-        peptides,
-        substring_length,
-        delete_exclude_start=0,
-        delete_exclude_end=0):
-    """
-    Take peptides of varying lengths, chop them into substrings of fixed
-    length and apply index encoding to these substrings.
-
-    If a string is longer than the substring length, then it's reduced to
-    the desired length by deleting characters at all possible positions.
-    If positions at the start or end of a string should be exempt from deletion
-    then the number of exempt characters can be controlled via
-    `delete_exclude_start` and `delete_exclude_end`.
-
-    Returns feature matrix X and a vector of substring counts.
-    """
-    pass
-
-
 def indices_to_hotshot_encoding(X, n_indices=None, first_index_value=0):
     """
     Given an (n_samples, peptide_length) integer matrix
@@ -78,3 +60,46 @@ def indices_to_hotshot_encoding(X, n_indices=None, first_index_value=0):
         for j, xij in enumerate(row):
             X_binary[i, n_indices * j + xij - first_index_value] = 1
     return X_binary.astype(float)
+
+
+def fixed_length_index_encoding(
+        peptides,
+        desired_length,
+        start_offset_shorten=0,
+        end_offset_shorten=0,
+        start_offset_extend=0,
+        end_offset_extend=0):
+    """
+    Take peptides of varying lengths, chop them into substrings of fixed
+    length and apply index encoding to these substrings.
+
+    If a string is longer than the desired length, then it's reduced to
+    the desired length by deleting characters at all possible positions. When
+    positions at the start or end of a string should be exempt from deletion
+    then the number of exempt characters can be controlled via
+    `start_offset_shorten` and `end_offset_shorten`.
+
+    If a string is shorter than the desired length then it is filled
+    with all possible characters of the alphabet at all positions. The
+    parameters `start_offset_extend` and `end_offset_extend` control whether
+    certain positions are excluded from insertion. The positions are
+    in a "inter-residue" coordinate system, where `start_offset_extend` = 0
+    refers to the position *before* the start of a peptide and, similarly,
+    `end_offset_extend` = 0 refers to the position *after* the peptide.
+
+    Returns feature matrix X, a list of original peptides for each feature
+    vector, and a list of integer counts indicating how many rows share a
+    particular original peptide. When two rows are expanded out of a single
+    original peptide, they will both have a count of 2. These counts can
+    be useful for down-weighting the importance of multiple feature vectors
+    which originate from the same sample.
+    """
+    fixed_length, original, counts = fixed_length_from_many_peptides(
+        peptides=peptides,
+        desired_length=desired_length,
+        start_offset_shorten=start_offset_shorten,
+        end_offset_shorten=end_offset_shorten,
+        start_offset_extend=start_offset_extend,
+        end_offset_extend=end_offset_extend)
+    X = index_encoding(fixed_length, desired_length)
+    return X, original, counts
diff --git a/test/test_fixed_length_peptides.py b/test/test_fixed_length_peptides.py
new file mode 100644
index 00000000..ebe0f8e2
--- /dev/null
+++ b/test/test_fixed_length_peptides.py
@@ -0,0 +1,109 @@
+from mhcflurry.fixed_length_peptides import (
+    all_kmers,
+    extend_peptide,
+    shorten_peptide,
+    fixed_length_from_many_peptides,
+)
+from nose.tools import eq_
+
+
+def test_all_kmers():
+    kmers = all_kmers(2, alphabet=["A", "B"])
+    assert len(kmers) == 4, kmers
+    eq_(set(kmers), {"AA", "AB", "BA", "BB"})
+
+
+def test_all_kmers_string_alphabet():
+    kmers = all_kmers(2, alphabet="AB")
+    assert len(kmers) == 4, kmers
+    eq_(set(kmers), {"AA", "AB", "BA", "BB"})
+
+
+def test_extend_peptide_all_positions():
+    # insert 0 or 1 at every position
+    results = extend_peptide(
+        "111",
+        desired_length=4,
+        start_offset=0,
+        end_offset=0,
+        alphabet="01")
+
+    expected = [
+        "0111",
+        "1111",
+        "1011",
+        "1111",
+        "1101",
+        "1111",
+        "1110",
+        "1111",
+    ]
+    eq_(results, expected)
+
+
+def test_shorten_peptide_all_positions():
+    # insert 0 or 1 at every position
+    results = shorten_peptide(
+        "012",
+        desired_length=2,
+        start_offset=0,
+        end_offset=0,
+        alphabet="012")
+
+    expected = [
+        "12",
+        "02",
+        "01"
+    ]
+    eq_(results, expected)
+
+
+def test_shorten_peptide_all_positions_except_first():
+    # insert 0 or 1 at every position
+    results = shorten_peptide(
+        "012",
+        desired_length=2,
+        start_offset=1,
+        end_offset=0,
+        alphabet="012")
+
+    expected = [
+        "02",
+        "01",
+    ]
+    eq_(results, expected)
+
+
+def test_shorten_peptide_all_positions_except_last():
+    # insert 0 or 1 at every position
+    results = shorten_peptide(
+        "012",
+        desired_length=2,
+        start_offset=0,
+        end_offset=1,
+        alphabet="012")
+
+    expected = [
+        "12",
+        "02",
+    ]
+    eq_(results, expected)
+
+
+def test_fixed_length_from_many_peptides():
+    kmers, original, counts = fixed_length_from_many_peptides(
+        peptides=["ABC", "A"],
+        desired_length=2,
+        start_offset_extend=0,
+        end_offset_extend=0,
+        start_offset_shorten=0,
+        end_offset_shorten=0,
+        alphabet="ABC")
+    print(kmers)
+    print(original)
+    print(counts)
+    eq_(len(kmers), len(original))
+    eq_(len(kmers), len(counts))
+    eq_(kmers, ["BC", "AC", "AB", "AA", "BA", "CA", "AA", "AB", "AC"])
+    eq_(original, ["ABC", "ABC", "ABC", "A", "A", "A", "A", "A", "A"])
+    eq_(counts, [3, 3, 3, 6, 6, 6, 6, 6, 6])
-- 
GitLab