From ec233b8caca3c053d6b8372861b1348c90d9cc0d Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Fri, 6 May 2016 19:10:56 -0400
Subject: [PATCH] changing imputation logic to use Dataset

---
 ...llele_specific_kmer_ic50_predictor_base.py |  13 +-
 mhcflurry/data.py                             | 231 ------------------
 mhcflurry/dataset.py                          |  42 ++++
 mhcflurry/imputation.py                       |  50 +---
 mhcflurry/peptide_encoding.py                 |  42 ++--
 test/test_class1_binding_predictor_A0205.py   |   1 -
 test/test_load_dataframe.py                   |  73 ++----
 test/test_peptide_encoding.py                 |   8 +-
 8 files changed, 105 insertions(+), 355 deletions(-)
 delete mode 100644 mhcflurry/data.py

diff --git a/mhcflurry/class1_allele_specific_kmer_ic50_predictor_base.py b/mhcflurry/class1_allele_specific_kmer_ic50_predictor_base.py
index ee244da1..e8c6ff00 100644
--- a/mhcflurry/class1_allele_specific_kmer_ic50_predictor_base.py
+++ b/mhcflurry/class1_allele_specific_kmer_ic50_predictor_base.py
@@ -158,8 +158,17 @@ class Class1AlleleSpecificKmerIC50PredictorBase(IC50PredictorBase):
             Extra arguments are passed on to the fit_encoded_kmer_arrays()
             method.
         """
-        X, Y, sample_weights = dataset.encode()
-        X_pretrain, Y_pretrain, sample_weights_pretrain = pretraining_dataset.encode()
+        X, Y, sample_weights, _ = dataset.kmer_index_encoding(
+            kmer_size=self.kmer_size,
+            allow_unknown_amino_acids=self.allow_unknown_amino_acids)
+
+        if pretraining_dataset is None:
+            X_pretrain = Y_pretrain = sample_weights_pretrain = None
+        else:
+            X_pretrain, Y_pretrain, sample_weights_pretrain = \
+                pretraining_dataset.kmer_index_encoding(
+                    kmer_size=self.kmer_size,
+                    allow_unknown_amino_acids=self.allow_unknown_amino_acids)
         return self.fit_arrays(
             X=X,
             Y=Y,
diff --git a/mhcflurry/data.py b/mhcflurry/data.py
deleted file mode 100644
index 4792ce04..00000000
--- a/mhcflurry/data.py
+++ /dev/null
@@ -1,231 +0,0 @@
-# Copyright (c) 2016. Mount Sinai School of Medicine
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import (
-    print_function,
-    division,
-    absolute_import,
-)
-from collections import namedtuple, defaultdict
-
-import pandas as pd
-import numpy as np
-
-from .amino_acid import common_amino_acids
-from .peptide_encoding import (
-    indices_to_hotshot_encoding,
-    fixed_length_index_encoding,
-    check_valid_index_encoding_array,
-)
-
-
-index_encoding = common_amino_acids.index_encoding
-
-AlleleData = namedtuple(
-    "AlleleData",
-    [
-        "X_index",    # index-based featue encoding of fixed length peptides
-        "X_binary",  # binary encoding of fixed length peptides
-        "Y",     # regression encoding of IC50 (log scaled between 0..1)
-        "peptides",  # list of fixed length peptide string
-        "ic50",      # IC50 value associated with each entry
-        "original_peptides",  # original peptides may be of different lengths
-        "original_lengths",   # len(original_peptide)
-        "substring_counts",   # how many substrings were extracted from
-                              # each original peptide string
-        "weights",    # 1.0 / count
-        "max_ic50",   # maximum IC50 value used for encoding Y
-    ])
-
-
-
-
-
-def create_allele_data_from_peptide_to_ic50_dict(
-        peptide_to_ic50_dict,
-        max_ic50=MAX_IC50,
-        kmer_length=9,
-        flatten_binary_encoding=True):
-    """
-    Parameters
-    ----------
-    peptide_to_ic50_dict : dict
-        Dictionary mapping peptides of different lengths to IC50 binding
-        affinity values.
-
-    max_ic50 : float
-        Maximum IC50 value used as the cutoff for affinity of 0.0 when
-        transforming from IC50 to regression targets.
-
-    kmer_length : int
-        What length substrings will be fed to a fixed-length predictor?
-
-    flatten_binary_encoding : bool
-        Should hotshot encodings of amino acid inputs be flattened into a 1D
-        vector or have two dimensions (where the first represents position)?
-
-    Return an AlleleData object.
-    """
-    Y_dict = {
-        peptide: ic50_to_regression_target(ic50, max_ic50)
-        for (peptide, ic50)
-        in peptide_to_ic50_dict.items()
-    }
-    (kmer_peptides, original_peptides, counts, X_index, X_binary, Y_kmer) = \
-        encode_peptide_to_affinity_dict(
-            Y_dict,
-            peptide_length=kmer_length,
-            flatten_binary_encoding=flatten_binary_encoding)
-
-    ic50_array = np.array([peptide_to_ic50_dict[p] for p in original_peptides])
-    assert len(kmer_peptides) == len(ic50_array), \
-        "Mismatch between # of peptides %d and # IC50 outputs %d" % (
-            len(kmer_peptides), len(ic50_array))
-
-    return AlleleData(
-        X_index=X_index,
-        X_binary=X_binary,
-        Y=Y_kmer,
-        ic50=ic50_array,
-        peptides=np.array(kmer_peptides),
-        original_peptides=np.array(original_peptides),
-        original_lengths=np.array(
-            [len(peptide) for peptide in original_peptides]),
-        substring_counts=counts,
-        weights=1.0 / counts,
-        max_ic50=max_ic50)
-
-
-def load_allele_datasets(
-        filename,
-        peptide_length=9,
-        use_multiple_peptide_lengths=True,
-        max_ic50=MAX_IC50,
-        flatten_binary_encoding=True,
-        sep=None,
-        species_column_name="species",
-        allele_column_name="mhc",
-        peptide_column_name=None,
-        peptide_length_column_name="peptide_length",
-        ic50_column_name="meas",
-        only_human=False):
-    """
-    Loads an IEDB dataset, extracts "hot-shot" encoding of fixed length peptides
-    and log-transforms the IC50 measurement. Returns dictionary mapping allele
-    names to AlleleData objects (containing fields X, Y, ic50)
-
-    Parameters
-    ----------
-    filename : str
-        TSV filename with columns:
-            - 'species'
-            - 'mhc'
-            - 'peptide_length'
-            - 'sequence'
-            - 'meas'
-
-    peptide_length : int
-        Which length peptides to use (default=9)
-
-    use_multiple_peptide_lengths : bool
-        If a peptide is shorter than `peptide_length`, expand it into many
-        peptides of the appropriate length by inserting all combinations of
-        amino acids. Similarly, if a peptide is longer than `peptide_length`,
-        shorten it by deleting stretches of contiguous amino acids at all
-        peptide positions.
-
-    max_ic50 : float
-        Treat IC50 scores above this value as all equally bad
-        (transform them to 0.0 in the rescaled output)
-
-    flatten_binary_encoding : bool
-        If False, returns a (n_samples, peptide_length, 20) matrix, otherwise
-        returns the 2D flattened version of the same data.
-
-    sep : str, optional
-        Separator in CSV file, default is to let Pandas infer
-
-    peptide_column_name : str, optional
-        Default behavior is to try {"sequence", "peptide", "peptide_sequence"}
-
-    only_human : bool
-        Only load entries from human MHC alleles
-    """
-    df, peptide_column_name = load_dataframe(
-        filename=filename,
-        max_ic50=max_ic50,
-        sep=sep,
-        species_column_name=species_column_name,
-        allele_column_name=allele_column_name,
-        peptide_column_name=peptide_column_name,
-        filter_peptide_length=None if use_multiple_peptide_lengths else peptide_length,
-        ic50_column_name=ic50_column_name,
-        only_human=only_human)
-
-    allele_groups = {}
-    for allele, group in df.groupby(allele_column_name):
-        assert allele not in allele_groups, \
-            "Duplicate datasets for %s" % allele
-
-        raw_peptides = group[peptide_column_name]
-
-        # filter lengths in case user wants to drop peptides that are longer
-        # or shorter than the desired fixed length
-        if not use_multiple_peptide_lengths:
-            drop_mask = raw_peptides.str.len() != peptide_length
-            group = group[~drop_mask]
-            raw_peptides = raw_peptides[~drop_mask]
-
-        # convert from a Pandas column to a list, since that's easier to
-        # interact with later
-        raw_peptides = list(raw_peptides)
-
-        # create dictionaries of outputs from which we can look up values
-        # after peptides have been expanded
-        ic50_dict = {
-            peptide: ic50
-            for (peptide, ic50)
-            in zip(raw_peptides, group[ic50_column_name])
-        }
-        allele_groups[allele] = create_allele_data_from_peptide_to_ic50_dict(
-            ic50_dict,
-            max_ic50=max_ic50)
-    return allele_groups
-
-
-def collapse_multiple_peptide_entries(allele_datasets):
-    """
-    If an (allele, peptide) pair occurs multiple times then reduce it
-    to a single entry by taking the weighted average of affinity values.
-
-    Returns a dictionary of alleles, each of which maps to a dictionary of
-    peptides that map to affinity values.
-    """
-    allele_to_peptide_to_affinity = {}
-    for (allele, dataset) in allele_datasets.items():
-        multiple_affinities = defaultdict(list)
-        for (peptide, normalized_affinity, weight) in zip(
-                dataset.peptides, dataset.Y, dataset.weights):
-            multiple_affinities[peptide].append((normalized_affinity, weight))
-        weighted_averages = {}
-        for peptide, affinity_weight_tuples in multiple_affinities.items():
-            denom = 0.0
-            sum_weighted_affinities = 0.0
-            for affinity, weight in affinity_weight_tuples:
-                sum_weighted_affinities += affinity * weight
-                denom += weight
-            if denom > 0:
-                weighted_averages[peptide] = sum_weighted_affinities / denom
-        allele_to_peptide_to_affinity[allele] = weighted_averages
-    return allele_to_peptide_to_affinity
diff --git a/mhcflurry/dataset.py b/mhcflurry/dataset.py
index 9201053f..cfc8b611 100644
--- a/mhcflurry/dataset.py
+++ b/mhcflurry/dataset.py
@@ -28,6 +28,7 @@ from .dataset_helpers import (
     prepare_pMHC_affinity_arrays,
     load_dataframe
 )
+from .peptide_encoding import fixed_length_index_encoding
 
 class Dataset(object):
 
@@ -342,6 +343,47 @@ class Dataset(object):
         df = pd.DataFrame(new_data_dict)
         return self.__class__(df)
 
+    def kmer_index_encoding(
+            self,
+            kmer_size=9,
+            allow_unknown_amino_acids=True):
+        """
+        Encode peptides in this dataset using a fixed-length vector
+        representation.
+
+        Parameters
+        ----------
+        kmer_size : int
+            Length of encoding for each peptide
+
+        allow_unknown_amino_acids : bool
+            If True, then extend shorter amino acids using "X" character,
+            otherwise fill in all possible combinations of real amino acids.
+
+        Returns:
+            - 2d array of encoded kmers
+            - 1d array of affinity value corresponding to the source
+              peptide for each kmer
+            - sample_weights (1 / kmer count per peptide)
+            - indices of original peptides from which kmers were extracted
+        """
+        X_index, _, original_peptide_indices, counts = \
+            fixed_length_index_encoding(
+                peptides=self.peptides,
+                desired_length=kmer_size,
+                start_offset_shorten=0,
+                end_offset_shorten=0,
+                start_offset_extend=0,
+                end_offset_extend=0,
+                allow_unknown_amino_acids=allow_unknown_amino_acids)
+        original_peptide_indices = np.asarray(original_peptide_indices)
+        counts = np.asarray(counts)
+        assert len(original_peptide_indices) == len(self.affinities)
+        assert len(counts) == len(self.affinities)
+        kmer_affinities = self.affinities[original_peptide_indices]
+        sample_weights = 1.0 / counts
+        return X_index, kmer_affinities, sample_weights, original_peptide_indices
+
     def imputed_missing_values(
             self,
             imputation_method,
diff --git a/mhcflurry/imputation.py b/mhcflurry/imputation.py
index 018356f3..2f809e2f 100644
--- a/mhcflurry/imputation.py
+++ b/mhcflurry/imputation.py
@@ -28,10 +28,7 @@ from fancyimpute.soft_impute import SoftImpute
 from fancyimpute.mice import MICE
 from fancyimpute.dictionary_helpers import dense_matrix_from_nested_dictionary
 
-from .data import (
-    create_allele_data_from_peptide_to_ic50_dict,
-)
-from .regression_target import regression_target_to_ic50
+from .dataset import Dataset
 
 def _check_dense_pMHC_array(X, peptide_list, allele_list):
     if len(peptide_list) != len(set(peptide_list)):
@@ -164,15 +161,14 @@ def dense_pMHC_matrix_to_nested_dict(X, peptide_list, allele_list):
     return allele_to_peptide_to_ic50_dict
 
 def create_imputed_datasets(
-        allele_data_dict,
+        dataset,
         imputer,
-        min_observations_per_peptide=1,
-        min_observations_per_allele=1):
+        min_observations_per_peptide=2,
+        min_observations_per_allele=2):
     """
     Parameters
     ----------
-    allele_data_dict : dict
-        Dictionary mapping allele names to AlleleData objects
+    allele_data_dict : Dataset
 
     imputer : object
         Expected to have a method imputer.complete(X) which takes an array
@@ -187,51 +183,27 @@ def create_imputed_datasets(
     Returns dictionary mapping allele names to AlleleData objects containing
     imputed affinity values.
     """
-    if len(allele_data_dict) == 0:
-        return {}
-
-    # pick the max IC50 associated with a random AlleleData object and then
-    # make sure that they all agree
-    allele_data_objects = list(allele_data_dict.values())
-    max_ic50 = allele_data_objects[0].max_ic50
-    if not all(allele_data.max_ic50 == max_ic50 for allele_data in allele_data_objects):
-        raise ValueError("AlleleData objects must all use the same max IC50 (%f)" % (
-            max_ic50,))
     X_incomplete, peptide_list, allele_list = create_incomplete_dense_pMHC_matrix(
         allele_data_dict=allele_data_dict,
         min_observations_per_peptide=min_observations_per_peptide,
         min_observations_per_allele=min_observations_per_allele)
 
+    X_incomplete_log = np.log(X_incomplete)
+
     if np.isnan(X_incomplete).sum() == 0:
         # if all entries in the matrix are already filled in then don't
         # try using an imputation algorithm since it might raise an
         # exception.
         logging.warn("No missing values, using original data instead of imputation")
-        X_complete = X_incomplete
+        X_complete_log = X_incomplete_log
     else:
-        X_complete = imputer.complete(X_incomplete)
-
+        X_complete_log = imputer.complete(X_incomplete_log)
+    X_complete = np.exp(X_complete_log)
     allele_to_peptide_to_affinity_dict = dense_pMHC_matrix_to_nested_dict(
         X=X_complete,
         peptide_list=peptide_list,
         allele_list=allele_list)
-
-    # map all the normalized regression targets to IC50 values
-    allele_to_peptide_to_ic50_dict = {
-        allele: {
-            peptide: regression_target_to_ic50(y, max_ic50=max_ic50)
-            for (peptide, y)
-            in allele_dict.items()
-        }
-        for (allele, allele_dict)
-        in allele_to_peptide_to_affinity_dict.items()
-    }
-
-    return {
-        allele_name: create_allele_data_from_peptide_to_ic50_dict(allele_dict)
-        for (allele_name, allele_dict)
-        in allele_to_peptide_to_ic50_dict.items()
-    }
+    return Dataset.from_nested_dictionary(allele_to_peptide_to_affinity_dict)
 
 def imputer_from_name(imputation_method_name, **kwargs):
     """
diff --git a/mhcflurry/peptide_encoding.py b/mhcflurry/peptide_encoding.py
index cad5f53c..49698814 100644
--- a/mhcflurry/peptide_encoding.py
+++ b/mhcflurry/peptide_encoding.py
@@ -128,7 +128,6 @@ def shorten_peptide(
         for i in range(start_offset, end_range)
     ]
 
-
 def fixed_length_from_many_peptides(
         peptides,
         desired_length,
@@ -141,7 +140,6 @@ def fixed_length_from_many_peptides(
     Create a set of fixed-length k-mer peptides from a collection of varying
     length peptides.
 
-
     Shorter peptides are filled in using all possible amino acids at any
     insertion site between start_offset_shorten and -end_offset_shorten
     where start_offset_extend=0 represents insertions before the string
@@ -167,10 +165,10 @@ def fixed_length_from_many_peptides(
 
     Returns three lists:
         - a list of fixed length peptides (all of length `desired_length`)
-        - a list of the original peptides from which subsequences were
-          contracted or lengthened
-        - a list of integers indicating the number of fixed length peptides
-         generated from each variable length peptide.
+        - a list of indices of the original peptides from which subsequences
+          were contracted or lengthened
+        - a list of counts for each fixed length peptide indicating the
+          number extracted from its corresponding shorter/longer peptide
 
     Example:
         kmers, original, counts = fixed_length_from_many_peptides(
@@ -187,7 +185,7 @@ def fixed_length_from_many_peptides(
 
     Parameters
     ----------
-    peptide : list of str
+    peptides : list of str
 
     desired_length : int
 
@@ -202,9 +200,9 @@ def fixed_length_from_many_peptides(
     insert_amino_acid_letters : str | list of characters
     """
     all_fixed_length_peptides = []
-    original_peptide_sequences = []
+    indices = []
     counts = []
-    for peptide in peptides:
+    for i, peptide in enumerate(peptides):
         n = len(peptide)
         if n == desired_length:
             fixed_length_peptides = [peptide]
@@ -228,12 +226,11 @@ def fixed_length_from_many_peptides(
                 start_offset=start_offset_shorten,
                 end_offset=end_offset_shorten,
                 insert_amino_acid_letters=insert_amino_acid_letters)
-
         n_fixed_length = len(fixed_length_peptides)
         all_fixed_length_peptides.extend(fixed_length_peptides)
-        original_peptide_sequences.extend([peptide] * n_fixed_length)
+        indices.extend([i] * n_fixed_length)
         counts.extend([n_fixed_length] * n_fixed_length)
-    return all_fixed_length_peptides, original_peptide_sequences, counts
+    return all_fixed_length_peptides, indices, counts
 
 
 def indices_to_hotshot_encoding(X, n_indices=None, first_index_value=0):
@@ -296,16 +293,17 @@ def fixed_length_index_encoding(
         insert_letters = common_amino_acid_letters
         index_encoding = common_amino_acids.index_encoding
 
-    fixed_length, original, counts = fixed_length_from_many_peptides(
-        peptides=peptides,
-        desired_length=desired_length,
-        start_offset_shorten=start_offset_shorten,
-        end_offset_shorten=end_offset_shorten,
-        start_offset_extend=start_offset_extend,
-        end_offset_extend=end_offset_extend,
-        insert_amino_acid_letters=insert_letters)
-    X_index = index_encoding(fixed_length, desired_length)
-    return (X_index, fixed_length, original, counts)
+    fixed_length, original_peptide_indices, counts = \
+        fixed_length_from_many_peptides(
+            peptides=peptides,
+            desired_length=desired_length,
+            start_offset_shorten=start_offset_shorten,
+            end_offset_shorten=end_offset_shorten,
+            start_offset_extend=start_offset_extend,
+            end_offset_extend=end_offset_extend,
+            insert_amino_acid_letters=insert_letters)
+    X = index_encoding(fixed_length, desired_length)
+    return (X, fixed_length, original_peptide_indices, counts)
 
 def check_valid_index_encoding_array(X, allow_unknown_amino_acids=True):
         X = np.asarray(X)
diff --git a/test/test_class1_binding_predictor_A0205.py b/test/test_class1_binding_predictor_A0205.py
index 0383c40c..819bbd1c 100644
--- a/test/test_class1_binding_predictor_A0205.py
+++ b/test/test_class1_binding_predictor_A0205.py
@@ -7,7 +7,6 @@ import numpy as np
 
 
 def class1_binding_predictor_A0205_training_accuracy():
-
     dataset = Dataset.from_csv(CLASS1_DATA_CSV_PATH)
     dataset_a0205 = dataset.get_allele("HLA-A0205")
 
diff --git a/test/test_load_dataframe.py b/test/test_load_dataframe.py
index 64972e9f..ab19a94c 100644
--- a/test/test_load_dataframe.py
+++ b/test/test_load_dataframe.py
@@ -18,24 +18,20 @@ Testing dataframe loading from CSV:
     load_dataframe
     Input:
         filename,
-        peptide_length=None,
-        max_ic50=MAX_IC50,
-        sep=None,
-        species_column_name="species",
-        allele_column_name="mhc",
-        peptide_column_name=None,
-        peptide_length_column_name="peptide_length",
-        ic50_column_name="meas",
-        only_human=True
     Outputs:
-        Tuple with dataframe and name of peptide column
+        Tuple with following elements:
+            - dataframe
+            - name of allele column
+            - name of peptide column
+            - name of affinity column
 """
 
 
-from mhcflurry.data import load_dataframe
+from mhcflurry.dataset_helpers import load_dataframe
 import numpy as np
 from tempfile import NamedTemporaryFile
 from pandas import DataFrame
+from nose.tools import eq_
 
 def make_dummy_dataframe():
     dummy_ic50_values = np.array([
@@ -62,48 +58,13 @@ def test_load_dataframe():
         binding_data_path = f.name
         df_expected.to_csv(binding_data_path, index=False)
 
-        for max_ic50 in [500.0, 50000.0]:
-            df, peptide_column_name = load_dataframe(
-                filename=binding_data_path,
-                max_ic50=max_ic50,
-                only_human=False)
-            assert peptide_column_name == "sequence"
-            assert len(df) == n_expected, \
-                "Expected %d entries in DataFrame but got %d in %s" % (
-                    n_expected,
-                    len(df),
-                    df)
-            assert "regression_output" in df, df.columns
-            expected_values = np.minimum(
-                0,
-                np.log(df["meas"]) / np.log(max_ic50)
-            )
-            np.allclose(df["regression_output"], expected_values)
-
-def test_load_dataframe_human():
-    df_expected = make_dummy_dataframe()
-    human_mask = df_expected["species"] == "human"
-    df_expected = df_expected[human_mask]
-    n_expected = len(df_expected)
-
-    with NamedTemporaryFile(mode="w") as f:
-        binding_data_path = f.name
-        df_expected.to_csv(binding_data_path, index=False)
-
-        for max_ic50 in [500.0, 50000.0]:
-            df, peptide_column_name = load_dataframe(
-                filename=binding_data_path,
-                max_ic50=max_ic50,
-                only_human=True)
-            assert peptide_column_name == "sequence"
-            assert len(df) == n_expected, \
-                "Expected %d entries in DataFrame but got %d in %s" % (
-                    n_expected,
-                    len(df),
-                    df)
-            assert "regression_output" in df, df.columns
-            expected_values = np.minimum(
-                0,
-                np.log(df["meas"]) / np.log(max_ic50)
-            )
-            np.allclose(df["regression_output"], expected_values)
+        df, allele_column_name, peptide_column_name, affinity_column_name = \
+            load_dataframe(filename=binding_data_path)
+        eq_(allele_column_name, "mhc")
+        eq_(peptide_column_name, "sequence")
+        eq_(affinity_column_name, "meas")
+        assert len(df) == n_expected, \
+            "Expected %d entries in DataFrame but got %d in %s" % (
+                n_expected,
+                len(df),
+                df)
diff --git a/test/test_peptide_encoding.py b/test/test_peptide_encoding.py
index aa6ff804..e1e31e51 100644
--- a/test/test_peptide_encoding.py
+++ b/test/test_peptide_encoding.py
@@ -105,7 +105,7 @@ def test_shorten_peptide_all_positions_except_last():
 
 
 def test_fixed_length_from_many_peptides():
-    kmers, original, counts = fixed_length_from_many_peptides(
+    kmers, original_indices, counts = fixed_length_from_many_peptides(
         peptides=["ABC", "A"],
         desired_length=2,
         start_offset_extend=0,
@@ -114,10 +114,10 @@ def test_fixed_length_from_many_peptides():
         end_offset_shorten=0,
         insert_amino_acid_letters="ABC")
     print(kmers)
-    print(original)
+    print(original_indices)
     print(counts)
-    eq_(len(kmers), len(original))
+    eq_(len(kmers), len(original_indices))
     eq_(len(kmers), len(counts))
     eq_(kmers, ["BC", "AC", "AB", "AA", "BA", "CA", "AA", "AB", "AC"])
-    eq_(original, ["ABC", "ABC", "ABC", "A", "A", "A", "A", "A", "A"])
+    eq_(original_indices, [0] * 3 + [1] * 6)
     eq_(counts, [3, 3, 3, 6, 6, 6, 6, 6, 6])
-- 
GitLab