diff --git a/mhcflurry/class1_binding_predictor.py b/mhcflurry/class1_binding_predictor.py
index 27f8fc4cf1c1777be47ac0f3edc766e05db201ff..366a5a40b5850f33b3d89e246f04efa6e338f232 100644
--- a/mhcflurry/class1_binding_predictor.py
+++ b/mhcflurry/class1_binding_predictor.py
@@ -34,7 +34,7 @@ from .serialization_helpers import (
     load_keras_model_from_disk,
     save_keras_model_to_disk
 )
-
+from .peptide_encoding import check_valid_index_encoding_array
 from .class1_allele_specific_hyperparameters import MAX_IC50
 
 _allele_predictor_cache = {}
@@ -386,14 +386,7 @@ class Class1BindingPredictor(PredictorBase):
         Given an encoded array of amino acid indices, returns a vector
         of predicted log IC50 values.
         """
-        X = np.asarray(X)
-        if len(X.shape) != 2:
-            raise ValueError("Expected 2d input, got array with shape %s" % (
-                X.shape,))
-        max_expected_index = 20 if self.allow_unknown_amino_acids else 19
-        if X.max() > max_expected_index:
-            raise ValueError(
-                "Got index %d in peptide encoding, max expected %d" % (
-                    X.max(),
-                    max_expected_index))
+        X = check_valid_index_encoding_array(
+            X,
+            allow_unknown_amino_acids=self.allow_unknown_amino_acids)
         return self.model.predict(X, verbose=False).flatten()
diff --git a/mhcflurry/data.py b/mhcflurry/data.py
index bbf2e76c98044c29b08f21b5e6e3c379973a7167..57bc2c58dbecb06754ea9ff2dd6fd171bd604fc4 100644
--- a/mhcflurry/data.py
+++ b/mhcflurry/data.py
@@ -26,7 +26,8 @@ from .common import normalize_allele_name
 from .amino_acid import common_amino_acids
 from .peptide_encoding import (
     indices_to_hotshot_encoding,
-    fixed_length_from_many_peptides
+    fixed_length_index_encoding,
+    check_valid_index_encoding_array,
 )
 from .class1_allele_specific_hyperparameters import MAX_IC50
 
@@ -196,10 +197,32 @@ def load_allele_dicts(
 def encode_peptide_to_affinity_dict(
         peptide_to_affinity_dict,
         peptide_length=9,
-        flatten_binary_encoding=True):
+        flatten_binary_encoding=True,
+        allow_unknown_amino_acids=True):
     """
-    Given a dictionary mapping from peptide sequences to affinity values,
-    returns tuple with the following fields:
+    Given a dictionary mapping from peptide sequences to affinity values, return
+    both index and binary encodings of fixed length peptides, and
+    a vector of their affinities.
+
+    Parameters
+    ----------
+    peptide_to_affinity_dict : dict
+        Keys are peptide strings (of multiple lengths), each mapping to a
+        continuous affinity value.
+
+    peptide_length : int
+        Length of vector encoding
+
+    flatten_binary_encoding : bool
+        If True, flatten each peptide's binary encoding into a 1d vector;
+        otherwise keep it two-dimensional (peptide_length x n_indices).
+
+    allow_unknown_amino_acids : bool
+        When extending a short peptide to the desired peptide length,
+        insert the designated character "X" (an unknown amino acid) if
+        True; otherwise insert every possible amino acid.
+
+    Returns tuple with the following fields:
         - kmer_peptides: fixed length peptide strings
         - original_peptides: variable length peptide strings
         - counts: how many fixed length peptides were made from this original
@@ -208,24 +231,32 @@ def encode_peptide_to_affinity_dict(
         - Y: affinity values associated with original peptides
     """
     raw_peptides = list(sorted(peptide_to_affinity_dict.keys()))
-    kmer_peptides, original_peptides, counts = \
-        fixed_length_from_many_peptides(
+    X_index, kmer_peptides, original_peptides, counts = \
+        fixed_length_index_encoding(
             peptides=raw_peptides,
             desired_length=peptide_length,
             start_offset_shorten=0,
             end_offset_shorten=0,
             start_offset_extend=0,
-            end_offset_extend=0)
+            end_offset_extend=0,
+            allow_unknown_amino_acids=allow_unknown_amino_acids)
+
     n_samples = len(kmer_peptides)
+
     assert n_samples == len(original_peptides), \
         "Mismatch between # of samples (%d) and # of peptides (%d)" % (
             n_samples, len(original_peptides))
     assert n_samples == len(counts), \
         "Mismatch between # of samples (%d) and # of counts (%d)" % (
             n_samples, len(counts))
-
-    X_index = index_encoding(kmer_peptides, peptide_length)
-    X_binary = indices_to_hotshot_encoding(X_index, n_indices=20)
+    assert n_samples == len(X_index), \
+        "Mismatch between # of sample (%d) and index feature vectors (%d)" % (
+            n_samples, len(X_index))
+    X_index = check_valid_index_encoding_array(X_index, allow_unknown_amino_acids)
+    n_indices = 20 + allow_unknown_amino_acids
+    X_binary = indices_to_hotshot_encoding(
+        X_index,
+        n_indices=n_indices)
 
     assert X_binary.shape[0] == X_index.shape[0], \
         ("Mismatch between number of samples for index encoding (%d)"
@@ -235,7 +266,7 @@ def encode_peptide_to_affinity_dict(
 
     if flatten_binary_encoding:
         # collapse 3D input into 2D matrix
-        n_binary_features = peptide_length * 20
+        n_binary_features = peptide_length * n_indices
         X_binary = X_binary.reshape((n_samples, n_binary_features))
 
     # easier to work with counts when they're an array instead of list
@@ -260,7 +291,7 @@ def load_allele_datasets(
         peptide_column_name=None,
         peptide_length_column_name="peptide_length",
         ic50_column_name="meas",
-        only_human=True):
+        only_human=False):
     """
     Loads an IEDB dataset, extracts "hot-shot" encoding of fixed length peptides
     and log-transforms the IC50 measurement. Returns dictionary mapping allele
diff --git a/mhcflurry/peptide_encoding.py b/mhcflurry/peptide_encoding.py
index d07a41b8407179e62e4d8aebd39784be4c62b6b6..59bfaed5a3a94a3b7eb742d2c058ffe74b6bba56 100644
--- a/mhcflurry/peptide_encoding.py
+++ b/mhcflurry/peptide_encoding.py
@@ -273,12 +273,16 @@ def fixed_length_index_encoding(
     refers to the position *before* the start of a peptide and, similarly,
     `end_offset_extend` = 0 refers to the position *after* the peptide.
 
-    Returns feature matrix X, a list of original peptides for each feature
-    vector, and a list of integer counts indicating how many rows share a
-    particular original peptide. When two rows are expanded out of a single
-    original peptide, they will both have a count of 2. These counts can
-    be useful for down-weighting the importance of multiple feature vectors
-    which originate from the same sample.
+    Returns tuple with the following fields:
+        - index encoded feature matrix X
+        - list of fixed length peptides
+        - list of "original" peptides of varying lengths
+        - list of integer counts indicating how many rows came from
+          that original peptide.
+
+    When two rows are expanded out of a single original peptide, they will both
+    have a count of 2. These counts can be useful for down-weighting the
+    importance of multiple feature vectors which originate from the same sample.
     """
     if allow_unknown_amino_acids:
         insert_letters = ["X"]
@@ -295,5 +299,41 @@ def fixed_length_index_encoding(
         start_offset_extend=start_offset_extend,
         end_offset_extend=end_offset_extend,
         insert_amino_acid_letters=insert_letters)
-    X = index_encoding(fixed_length, desired_length)
-    return X, original, counts
+    X_index = index_encoding(fixed_length, desired_length)
+    return X_index, fixed_length, original, counts
+
+
+def check_valid_index_encoding_array(X, allow_unknown_amino_acids=True):
+    """
+    Validate an index encoding of fixed length peptides.
+
+    Parameters
+    ----------
+    X : array-like
+        Amino acid indices; must be convertible to a 2d array.
+
+    allow_unknown_amino_acids : bool
+        If True the largest accepted index is 20, otherwise 19.
+
+    Returns X converted to a numpy array.
+
+    Raises ValueError if X is not two-dimensional or contains an index
+    larger than the maximum expected value.
+    """
+    X = np.asarray(X)
+    if X.ndim != 2:
+        raise ValueError("Expected 2d input, got array with shape %s" % (
+            X.shape,))
+    if X.size == 0:
+        # max() of a zero-size array raises an unrelated numpy error, and
+        # an array without entries cannot contain an out-of-range index.
+        return X
+    max_expected_index = 20 if allow_unknown_amino_acids else 19
+    # compute the maximum once rather than again for the error message
+    max_index = X.max()
+    if max_index > max_expected_index:
+        raise ValueError(
+            "Got index %d in peptide encoding, max expected %d" % (
+                max_index,
+                max_expected_index))
+    return X
diff --git a/mhcflurry/predictor_base.py b/mhcflurry/predictor_base.py
index 1c6ed73a5aeedbaf1b174601dbfb7e061782e324..a82192fed8f1400532e287c7e937a092b0b7a385 100644
--- a/mhcflurry/predictor_base.py
+++ b/mhcflurry/predictor_base.py
@@ -67,7 +67,7 @@ class PredictorBase(object):
         indices = []
         encoded_matrices = []
         for i, peptide in enumerate(peptides):
-            matrix, _, _ = fixed_length_index_encoding(
+            matrix, _, _, _ = fixed_length_index_encoding(
                 peptides=[peptide],
                 desired_length=9,
                 allow_unknown_amino_acids=self.allow_unknown_amino_acids)