diff --git a/experiments/matrix-completion-hyperparameter-search.py b/experiments/matrix-completion-hyperparameter-search.py
old mode 100644
new mode 100755
index b62a778b9ac8f385418aeef36714ded408aa0cd5..c9559f09529d16119790c94f79c1887283f6d90e
--- a/experiments/matrix-completion-hyperparameter-search.py
+++ b/experiments/matrix-completion-hyperparameter-search.py
@@ -24,12 +24,13 @@ For each allele, perform 5-fold cross validation to
 import argparse
 from collections import defaultdict
 
-from fancyimpute import MICE, KNN
+from fancyimpute import MICE, KNN, SimpleFill
 import numpy as np
 import pandas as pd
-from mhcflurry.peptide_encoding import (
-    fixed_length_index_encoding,
-    index_encoding,
+from mhcflurry.peptide_encoding import fixed_length_index_encoding
+from mhcflurry.amino_acid import (
+    common_amino_acids,
+    amino_acids_with_unknown,
 )
 from mhcflurry import Class1BindingPredictor
 from sklearn.cross_validation import StratifiedKFold
@@ -109,10 +110,15 @@ parser.add_argument(
     default=100)
 
 parser.add_argument(
-    "--use-mice",
+    "--impute",
+    default="mice",
+    help="Use {'mice', 'knn', 'meanfill'} for imputing pre-training data")
+
+parser.add_argument(
+    "--unknown-amino-acids",
     default=False,
     action="store_true",
-    help="Use MICE for imputation instead of KNN")
+    help="When expanding 8mers into 9mers use 'X' instead of all possible AAs")
 
 
 def print_length_distribution(peptides, values, name):
@@ -179,6 +185,21 @@ if __name__ == "__main__":
             "activation"
         ])
 
+    if args.unknown_amino_acids:
+        index_encoding = amino_acids_with_unknown.index_encoding
+    else:
+        index_encoding = common_amino_acids.index_encoding
+
+    impute_method_name = args.impute.lower()
+    if impute_method_name.startswith("mice"):
+        imputer = MICE(n_burn_in=5, n_imputations=20, min_value=0, max_value=1)
+    elif impute_method_name.startswith("knn"):
+        imputer = KNN(k=1, orientation="columns", print_interval=10)
+    elif impute_method_name.startswith("mean"):
+        imputer = SimpleFill("mean")
+    else:
+        raise ValueError("Invalid imputation method: %s" % impute_method_name)
+
     # want at least 5 samples in each fold of CV
     # to make meaningful estimates of accuracy
     min_samples_per_cv_fold = 5 * args.n_folds
@@ -243,10 +264,6 @@ if __name__ == "__main__":
                 allele_list=allele_list,
                 min_observations_per_peptide=2,
                 min_observations_per_allele=min(10, min_samples_per_cv_fold))
-            if args.use_mice:
-                imputer = MICE(n_burn_in=5, n_imputations=20, min_value=0, max_value=1)
-            else:
-                imputer = KNN(k=1, orientation="columns")
 
             pMHC_affinity_matrix_fold_completed = imputer.complete(pMHC_affinity_matrix_fold)
             # keep synthetic data for 9mer peptides,
@@ -276,7 +293,8 @@ if __name__ == "__main__":
             X_train, training_row_peptides, training_counts = \
                 fixed_length_index_encoding(
                     peptides=train_peptides,
-                    desired_length=9)
+                    desired_length=9,
+                    allow_unknown_amino_acids=args.unknown_amino_acids)
             most_common_peptide_idx = np.argmax(training_counts)
             if args.verbose:
                 print("-- Most common peptide in training data: %s (length=%d, count=%d)" % (
@@ -298,9 +316,13 @@ if __name__ == "__main__":
                                 activation
                             )
                             print("\n-----")
-                            print("Training model for %s with hyperparameters: %s" % (
-                                allele,
-                                key))
+                            print(
+                                ("Training model for %s (# peptides = %d, # samples=%d)"
+                                 " with parameters: %s") % (
+                                    allele,
+                                    len(train_peptides),
+                                    len(X_train),
+                                    key))
                             print("-----")
                             predictor = Class1BindingPredictor.from_hyperparameters(
                                 embedding_output_dim=embedding_dim_size,
@@ -308,6 +330,9 @@ if __name__ == "__main__":
                                 activation=activation,
                                 output_activation="sigmoid",
                                 dropout_probability=dropout,
+                                verbose=args.verbose,
+                                allow_unknown_amino_acids=args.unknown_amino_acids,
+                                embedding_input_dim=21 if args.unknown_amino_acids else 20,
                             )
                             predictor.fit(
                                 X=X_train,
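
A minimal usage sketch of the revised command-line interface, assuming the diff above is applied; any other arguments defined elsewhere in the script are elided:

    # MICE imputation (the default), expanding 8mers with 'X' rather than all 20 letters
    ./experiments/matrix-completion-hyperparameter-search.py --impute mice --unknown-amino-acids

    # Mean-fill imputation with the standard 20-letter alphabet
    ./experiments/matrix-completion-hyperparameter-search.py --impute meanfill
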
diff --git a/mhcflurry/__init__.py b/mhcflurry/__init__.py
index ec1ba0edbe1165226e40327356677ce6b85974f9..4ab1d343a2cb6462e2e50506c00e727a731fd75f 100644
--- a/mhcflurry/__init__.py
+++ b/mhcflurry/__init__.py
@@ -16,16 +16,16 @@ from . import paths
 from . import data
 from . import feedforward
 from . import common
-from . import fixed_length_peptides
 from . import peptide_encoding
+from . import amino_acid
 from .class1_binding_predictor import Class1BindingPredictor
 
 __all__ = [
     "paths",
     "data",
     "feedforward",
-    "fixed_length_peptides",
     "peptide_encoding",
+    "amino_acid",
     "common",
     "Class1BindingPredictor"
 ]
diff --git a/mhcflurry/amino_acid.py b/mhcflurry/amino_acid.py
index c7607d4d712ff1929056e99836ef2aa06696a44e..1ba60648febd60a3da2dce17395c16ecbc7e2d6c 100644
--- a/mhcflurry/amino_acid.py
+++ b/mhcflurry/amino_acid.py
@@ -12,7 +12,81 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-amino_acids = {
+import numpy as np
+
+
+class Alphabet(object):
+    """
+    Used to track the order of amino acids used for peptide encodings
+    """
+
+    def __init__(self, **kwargs):
+        self.letters_to_names = {}
+        for (k, v) in kwargs.items():
+            self.add(k, v)
+
+    def add(self, letter, name):
+        assert letter not in self.letters_to_names
+        assert len(letter) == 1
+        self.letters_to_names[letter] = name
+
+    def letters(self):
+        return list(sorted(self.letters_to_names.keys()))
+
+    def names(self):
+        return [self.letters_to_names[k] for k in self.letters()]
+
+    def index_dict(self):
+        return {c: i for (i, c) in enumerate(self.letters())}
+
+    def copy(self):
+        return Alphabet(**self.letters_to_names)
+
+    def __getitem__(self, k):
+        return self.letters_to_names[k]
+
+    def __setitem__(self, k, v):
+        self.add(k, v)
+
+    def index_encoding_list(self, peptides):
+        index_dict = self.index_dict()
+        return [
+            [index_dict[amino_acid] for amino_acid in peptide]
+            for peptide in peptides
+        ]
+
+    def index_encoding(self, peptides, peptide_length):
+        """
+        Encode a set of equal length peptides as a matrix of their
+        amino acid indices.
+        """
+        X = np.zeros((len(peptides), peptide_length), dtype=int)
+        index_dict = self.index_dict()
+        for i, peptide in enumerate(peptides):
+            for j, amino_acid in enumerate(peptide):
+                X[i, j] = index_dict[amino_acid]
+        return X
+
+    def hotshot_encoding(
+            self,
+            peptides,
+            peptide_length):
+        """
+        Encode a set of equal length peptides as a binary matrix,
+        where each letter is transformed into a vector of the alphabet's
+        length with a single element that is 1 (and the others are 0).
+        """
+        index_dict = self.index_dict()
+        shape = (len(peptides), peptide_length, len(index_dict))
+        X = np.zeros(shape, dtype=bool)
+        for i, peptide in enumerate(peptides):
+            for j, amino_acid in enumerate(peptide):
+                k = index_dict[amino_acid]
+                X[i, j, k] = 1
+        return X
+
+
+common_amino_acids = Alphabet(**{
     "A": "Alanine",
     "R": "Arginine",
     "N": "Asparagine",
@@ -33,8 +107,7 @@ amino_acids = {
     "W": "Tryptophan",
     "Y": "Tyrosine",
     "V": "Valine",
-}
+})
 
-amino_acid_letters = list(sorted(amino_acids.keys()))
-amino_acid_names = [amino_acids[k] for k in amino_acid_letters]
-amino_acid_letter_indices = {c: i for (i, c) in enumerate(amino_acid_letters)}
+amino_acids_with_unknown = common_amino_acids.copy()
+amino_acids_with_unknown.add("X", "Unknown")
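
A minimal sketch of the new Alphabet-based encodings, assuming the diff above is applied; the peptide strings are arbitrary examples:

    from mhcflurry.amino_acid import common_amino_acids, amino_acids_with_unknown

    # 20-letter alphabet: indices run 0..19 (letters in sorted order)
    X = common_amino_acids.index_encoding(["SIINFEKLM"], 9)

    # 21-letter alphabet: 'X' (Unknown) is a valid letter, indices run 0..20
    X_unknown = amino_acids_with_unknown.index_encoding(["SIINFEKLX"], 9)

    # One-hot variant: shape is (n_peptides, peptide_length, alphabet size)
    H = common_amino_acids.hotshot_encoding(["SIINFEKLM"], 9)
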
diff --git a/mhcflurry/class1_binding_predictor.py b/mhcflurry/class1_binding_predictor.py
index f75bf24e4958d5a16f461eb4e311272dd87317fc..3931a5ab5158a29a5cb88d39295ad08777d83af8 100644
--- a/mhcflurry/class1_binding_predictor.py
+++ b/mhcflurry/class1_binding_predictor.py
@@ -29,21 +29,34 @@ import json
 import numpy as np
 from keras.models import model_from_config
 
+from .amino_acid import (
+    common_amino_acids,
+    amino_acids_with_unknown,
+)
 from .class1_allele_specific_hyperparameters import MAX_IC50
 from .common import normalize_allele_name
-from .peptide_encoding import index_encoding
 from .paths import CLASS1_MODEL_DIRECTORY
 from .feedforward import make_embedding_network
-from .fixed_length_peptides import fixed_length_from_many_peptides
+from .peptide_encoding import fixed_length_from_many_peptides
 
 _allele_predictor_cache = {}
 
+common_amino_acid_letters = common_amino_acids.letters()
+
 
 class Class1BindingPredictor(object):
-    def __init__(self, model, name=None, max_ic50=MAX_IC50):
+    def __init__(
+            self,
+            model,
+            name=None,
+            max_ic50=MAX_IC50,
+            allow_unknown_amino_acids=False,
+            verbose=False):
         self.model = model
         self.max_ic50 = max_ic50
         self.name = name
+        self.allow_unknown_amino_acids = allow_unknown_amino_acids
+        self.verbose = verbose
 
     @classmethod
     def from_disk(
@@ -92,7 +105,8 @@ class Class1BindingPredictor(object):
             loss="mse",
             output_activation="sigmoid",
             dropout_probability=0,
-            learning_rate=0.001):
+            learning_rate=0.001,
+            **kwargs):
         """
         Create untrained predictor with the given hyperparameters.
         """
@@ -110,7 +124,8 @@ class Class1BindingPredictor(object):
         return cls(
             name=name,
             max_ic50=max_ic50,
-            model=model)
+            model=model,
+            **kwargs)
 
     def _combine_training_data(
             self,
@@ -197,8 +212,8 @@ class Class1BindingPredictor(object):
         X_combined = np.vstack([X_pretrain, X])
         Y_combined = np.concatenate([Y_pretrain, Y])
         combined_weights = np.concatenate([
+            pretrain_sample_weights,
             sample_weights,
-            pretrain_sample_weights
         ])
         return X_combined, Y_combined, combined_weights, n_pretrain_samples
 
@@ -252,6 +267,13 @@ class Class1BindingPredictor(object):
         total_combined_sample_weight = (
             total_pretrain_sample_weight + total_train_sample_weight)
 
+        if self.verbose:
+            print("-- Total pretrain weight = %f (%f%%), sample weight = %f (%f%%)" % (
+                total_pretrain_sample_weight,
+                100 * total_pretrain_sample_weight / total_combined_sample_weight,
+                total_train_sample_weight,
+                100 * total_train_sample_weight / total_combined_sample_weight))
+
         for epoch in range(n_training_epochs):
             # weights for synthetic points can be shrunk as:
             #  ~ 1 / (1+epoch)**2
@@ -305,42 +327,6 @@ class Class1BindingPredictor(object):
                     nb_epoch=1,
                     verbose=0)
 
-    """
-    def fit_peptides(
-            self,
-            peptides,
-            affinity_values,
-            sample_weights=None,
-            pretrain_peptides=None,
-            pretrain_affinity_values=None,
-            pretrain_sample_weights=None,
-            n_training_epochs=200,
-            verbose=False):
-        '''
-        Train model from peptide sequences, expanding shorter or longer
-        peptides to make 9mers.
-        '''
-        X, original_peptides, counts = \
-                fixed_length_index_encoding(
-                    peptides=peptides,
-                    desired_length=9)
-        lookup = {k: v for (k, v) in zip}
-        Y = np.asarray(Y)
-        if sample_weights is None:
-            sample_weights = np.ones_like(Y)
-        else:
-            sample_weights = np.asarray(Y)
-
-        if pretrain_peptides is None:
-
-        if Y_pretrain
-        Y_pretrain = np.asarray(Y_pretrain)
-        if pretrain_sample_weights is None:
-            pretrain_sample_weights = np.ones_like(Y_pretrain)
-
-        train_weights = 1.0 / np.array(expanded_train_counts)
-    """
-
     def to_disk(self, model_json_path, weights_hdf_path, overwrite=False):
         if exists(model_json_path) and overwrite:
             logging.info(
@@ -435,13 +421,22 @@ class Class1BindingPredictor(object):
         """
         return self.max_ic50 ** (1.0 - log_value)
 
+    def encode_peptides(self, peptides):
+        if self.allow_unknown_amino_acids:
+            return amino_acids_with_unknown.index_encoding(peptides, 9)
+        else:
+            return common_amino_acids.index_encoding(peptides, 9)
+
     def _predict_9mer_peptides(self, peptides):
         """
         Predict binding affinity for 9mer peptides
         """
         if any(len(peptide) != 9 for peptide in peptides):
             raise ValueError("Can only predict 9mer peptides")
-        X = index_encoding(peptides, peptide_length=9)
+        X = self.encode_peptides(peptides)
+        max_expected_index = 20 if self.allow_unknown_amino_acids else 19
+        assert X.max() <= max_expected_index, \
+            "Got index %d in peptide encoding" % (X.max(),)
         return self.model.predict(X, verbose=False).flatten()
 
     def _predict_9mer_peptides_ic50(self, peptides):
@@ -457,9 +452,19 @@ class Class1BindingPredictor(object):
             group_peptides = list(group_peptides)
             expanded_peptides, _, _ = fixed_length_from_many_peptides(
                 peptides=group_peptides,
-                desired_length=9)
+                desired_length=9,
+                insert_amino_acid_letters=(
+                    ["X"] if self.allow_unknown_amino_acids
+                    else common_amino_acid_letters))
+
             n_group = len(group_peptides)
             n_expanded = len(expanded_peptides)
+            if self.verbose:
+                print(
+                    "[Class1BindingPredictor] Expanded %d peptides of length %d => %d" % (
+                        n_group,
+                        length,
+                        n_expanded))
             expansion_factor = int(n_expanded / n_group)
             raw_y = self._predict_9mer_peptides(expanded_peptides)
             if expansion_factor == 1:
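
A minimal sketch of the new encoding hook on the predictor, assuming the diff above is applied; model=None is used here only because encode_peptides does not touch the underlying network:

    from mhcflurry import Class1BindingPredictor

    predictor = Class1BindingPredictor(
        model=None,
        allow_unknown_amino_acids=True,
        verbose=True)

    # 9mers containing 'X' can now be encoded against the 21-letter alphabet
    X = predictor.encode_peptides(["SIINFEKLX"])
    assert X.max() <= 20
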
diff --git a/mhcflurry/data.py b/mhcflurry/data.py
index 75435c3b1de95f7d030452aa3406f53550035a5e..895edecdfe9db51affda0233f1defc59549eaafe 100644
--- a/mhcflurry/data.py
+++ b/mhcflurry/data.py
@@ -23,14 +23,14 @@ import pandas as pd
 import numpy as np
 
 from .common import normalize_allele_name
+from .amino_acid import common_amino_acids
 from .peptide_encoding import (
-    index_encoding,
     indices_to_hotshot_encoding,
-)
-from .fixed_length_peptides import (
     fixed_length_from_many_peptides
 )
 
+index_encoding = common_amino_acids.index_encoding
+
 AlleleData = namedtuple(
     "AlleleData",
     [
diff --git a/mhcflurry/feedforward.py b/mhcflurry/feedforward.py
index 7f09c73def7612508d33f44981663b7f8aa785f9..f2c7876bc520d6f6513b4922335293fba07aabfe 100644
--- a/mhcflurry/feedforward.py
+++ b/mhcflurry/feedforward.py
@@ -128,9 +128,10 @@ def make_hotshot_network(
         output_activation="sigmoid",
         dropout_probability=0.0,
         optimizer=None,
-        learning_rate=0.001):
+        learning_rate=0.001,
+        n_amino_acids=20):
     return make_network(
-        input_size=peptide_length * 20,
+        input_size=peptide_length * n_amino_acids,
         layer_sizes=layer_sizes,
         activation=activation,
         init=init,
diff --git a/mhcflurry/fixed_length_peptides.py b/mhcflurry/fixed_length_peptides.py
deleted file mode 100644
index 1ab37bdf4fd0a178c52be952f34888f1eaf652ad..0000000000000000000000000000000000000000
--- a/mhcflurry/fixed_length_peptides.py
+++ /dev/null
@@ -1,230 +0,0 @@
-# Copyright (c) 2015. Mount Sinai School of Medicine
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import (
-    print_function,
-    division,
-    absolute_import,
-)
-import itertools
-import logging
-
-from .amino_acid import amino_acid_letters
-
-
-def all_kmers(k, alphabet=amino_acid_letters):
-    """
-    Generates all k-mer peptide sequences
-
-    Parameters
-    ----------
-    k : int
-
-    alphabet : str | list of characters
-    """
-    alphabets = [alphabet] * k
-    return [
-        "".join(combination)
-        for combination
-        in itertools.product(*alphabets)
-    ]
-
-
-class CombinatorialExplosion(Exception):
-    pass
-
-
-def extend_peptide(
-        peptide,
-        desired_length,
-        start_offset,
-        end_offset,
-        alphabet=amino_acid_letters):
-    """Extend peptide by inserting every possible amino acid combination
-    if we're trying to e.g. turn an 8mer into 9mers.
-
-    Parameters
-    ----------
-    peptide : str
-
-    desired_length : int
-
-    start_offset : int
-        How many characters (from the position before the start of the string)
-        to skip before inserting characters.
-
-
-    end_offset : int
-        Last character position from the end where we insert new characters,
-        where 0 is the position after the last character.
-
-    alphabet : str | list of character
-    """
-    n = len(peptide)
-    assert n < desired_length, \
-        "%s (length = %d) is too long! Must be shorter than %d" % (
-            peptide, n, desired_length)
-    n_missing = desired_length - n
-    if n_missing > 3:
-        raise CombinatorialExplosion(
-            "Cannot transform %s of length %d into a %d-mer peptide" % (
-                peptide, n, desired_length))
-    return [
-        peptide[:i] + extra + peptide[i:]
-        for i in range(start_offset, n - end_offset + 1)
-        for extra in all_kmers(n_missing, alphabet=alphabet)
-    ]
-
-
-def shorten_peptide(
-        peptide,
-        desired_length,
-        start_offset,
-        end_offset,
-        alphabet=amino_acid_letters):
-    """Shorten peptide if trying to e.g. turn 10mer into 9mers
-
-    Parameters
-    ----------
-
-    peptide : str
-
-    desired_length : int
-
-    start_offset : int
-
-    end_offset : int
-
-    alphabet : str | list of characters
-    """
-    n = len(peptide)
-    assert n > desired_length, \
-        "%s (length = %d) is too short! Must be longer than %d" % (
-            peptide, n, desired_length)
-    n_skip = n - desired_length
-    assert n_skip > 0, \
-        "Expected length of peptide %s %d to be greater than %d" % (
-            peptide, n, desired_length)
-    end_range = n - end_offset - n_skip + 1
-    return [
-        peptide[:i] + peptide[i + n_skip:]
-        for i in range(start_offset, end_range)
-    ]
-
-
-def fixed_length_from_many_peptides(
-        peptides,
-        desired_length,
-        start_offset_extend=2,
-        end_offset_extend=1,
-        start_offset_shorten=2,
-        end_offset_shorten=0,
-        alphabet=amino_acid_letters):
-    """
-    Create a set of fixed-length k-mer peptides from a collection of varying
-    length peptides.
-
-
-    Shorter peptides are filled in using all possible amino acids at any
-    insertion site between start_offset_shorten and -end_offset_shorten
-    where start_offset_extend=0 represents insertions before the string
-    and end_offset_extend=0 represents insertions after the string's ending.
-
-    Longer peptides are shortened by deleting contiguous residues, starting
-    from start_offset_shorten and ending with -end_offset_shorten. Unlike
-    peptide extensions, the offsets for shortening a peptide range between
-    the first and last positions (rather than between the positions *before*
-    the string starts and the position *after*).
-
-    We can recreate the methods from:
-       Accurate approximation method for prediction of class I MHC
-       affinities for peptides of length 8, 10 and 11 using prediction
-       tools trained on 9mers.
-    by Lundegaard et. al. (http://bioinformatics.oxfordjournals.org/content/24/11/1397)
-    with the following settings:
-        - desired_length = 9
-        - start_offset_extend = 3
-        - end_offset_extend = 2
-        - start_offset_shorten = 3
-        - end_offset_shorten = 1
-
-    Returns three lists:
-        - a list of fixed length peptides (all of length `desired_length`)
-        - a list of the original peptides from which subsequences were
-          contracted or lengthened
-        - a list of integers indicating the number of fixed length peptides
-         generated from each variable length peptide.
-
-    Example:
-        kmers, original, counts = fixed_length_from_many_peptides(
-            peptides=["ABC", "A"]
-            desired_length=2,
-            start_offset_extend=0,
-            end_offset_extend=0,
-            start_offset_shorten=0,
-            end_offset_shorten=0,
-            alphabet="ABC")
-        kmers == ["BC", "AC", "AB", "AA", "BA", "CA", "AA", "AB", "AC"]
-        original == ["ABC", "ABC", "ABC", "A", "A", "A", "A", "A", "A"]
-        counts == [3, 3, 3, 6, 6, 6, 6, 6, 6]
-
-    Parameters
-    ----------
-    peptide : list of str
-
-    desired_length : int
-
-    start_offset_extend : int
-
-    end_offset_extend : int
-
-    start_offset_shorten : int
-
-    end_offset_shorten : int
-
-    alphabet : str | list of characters
-    """
-    all_fixed_length_peptides = []
-    original_peptide_sequences = []
-    counts = []
-    for peptide in peptides:
-        n = len(peptide)
-        if n == desired_length:
-            fixed_length_peptides = [peptide]
-        elif n < desired_length:
-            try:
-                fixed_length_peptides = extend_peptide(
-                    peptide=peptide,
-                    desired_length=desired_length,
-                    start_offset=start_offset_extend,
-                    end_offset=end_offset_extend,
-                    alphabet=alphabet)
-            except CombinatorialExplosion:
-                logging.warn(
-                    "Peptide %s is too short to be extended to length %d" % (
-                        peptide, desired_length))
-                continue
-        else:
-            fixed_length_peptides = shorten_peptide(
-                peptide=peptide,
-                desired_length=desired_length,
-                start_offset=start_offset_shorten,
-                end_offset=end_offset_shorten,
-                alphabet=alphabet)
-
-        n_fixed_length = len(fixed_length_peptides)
-        all_fixed_length_peptides.extend(fixed_length_peptides)
-        original_peptide_sequences.extend([peptide] * n_fixed_length)
-        counts.extend([n_fixed_length] * n_fixed_length)
-    return all_fixed_length_peptides, original_peptide_sequences, counts
diff --git a/mhcflurry/peptide_encoding.py b/mhcflurry/peptide_encoding.py
index ac9f96207a86e28e9129de87b97fa7d7842fa57f..c4698fc9d4a57d4bdf71cb76917483defd284957 100644
--- a/mhcflurry/peptide_encoding.py
+++ b/mhcflurry/peptide_encoding.py
@@ -12,37 +12,223 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import itertools
+import logging
+
 import numpy as np
 
-from .amino_acid import amino_acid_letter_indices
-from .fixed_length_peptides import fixed_length_from_many_peptides
+from .amino_acid import common_amino_acids, amino_acids_with_unknown
+
+common_amino_acid_letters = common_amino_acids.letters()
+
+
+def all_kmers(k, alphabet=common_amino_acid_letters):
+    """
+    Generates all k-mer peptide sequences
+
+    Parameters
+    ----------
+    k : int
+
+    alphabet : str | list of characters
+    """
+    alphabets = [alphabet] * k
+    return [
+        "".join(combination)
+        for combination
+        in itertools.product(*alphabets)
+    ]
+
+
+class CombinatorialExplosion(Exception):
+    pass
+
 
+def extend_peptide(
+        peptide,
+        desired_length,
+        start_offset,
+        end_offset,
+        insert_amino_acid_letters=common_amino_acid_letters):
+    """Extend peptide by inserting every possible amino acid combination
+    if we're trying to e.g. turn an 8mer into 9mers.
+
+    Parameters
+    ----------
+    peptide : str
+
+    desired_length : int
+
+    start_offset : int
+        How many characters (from the position before the start of the string)
+        to skip before inserting characters.
+
+
+    end_offset : int
+        Last character position from the end where we insert new characters,
+        where 0 is the position after the last character.
 
-def hotshot_encoding(peptides, peptide_length):
+    insert_amino_acid_letters : str | list of characters
     """
-    Encode a set of equal length peptides as a binary matrix,
-    where each letter is transformed into a length 20 vector with a single
-    element that is 1 (and the others are 0).
+    n = len(peptide)
+    assert n < desired_length, \
+        "%s (length = %d) is too long! Must be shorter than %d" % (
+            peptide, n, desired_length)
+    n_missing = desired_length - n
+    if n_missing > 3:
+        raise CombinatorialExplosion(
+            "Cannot transform %s of length %d into a %d-mer peptide" % (
+                peptide, n, desired_length))
+    return [
+        peptide[:i] + extra + peptide[i:]
+        for i in range(start_offset, n - end_offset + 1)
+        for extra in all_kmers(
+            n_missing,
+            alphabet=insert_amino_acid_letters)
+    ]
+
+
+def shorten_peptide(
+        peptide,
+        desired_length,
+        start_offset,
+        end_offset,
+        insert_amino_acid_letters=common_amino_acid_letters):
+    """Shorten peptide if trying to e.g. turn 10mer into 9mers
+
+    Parameters
+    ----------
+
+    peptide : str
+
+    desired_length : int
+
+    start_offset : int
+
+    end_offset : int
+
+    insert_amino_acid_letters : str | list of characters
     """
-    shape = (len(peptides), peptide_length, 20)
-    X = np.zeros(shape, dtype=bool)
-    for i, peptide in enumerate(peptides):
-        for j, amino_acid in enumerate(peptide):
-            k = amino_acid_letter_indices[amino_acid]
-            X[i, j, k] = 1
-    return X
+    n = len(peptide)
+    assert n > desired_length, \
+        "%s (length = %d) is too short! Must be longer than %d" % (
+            peptide, n, desired_length)
+    n_skip = n - desired_length
+    assert n_skip > 0, \
+        "Expected length of peptide %s %d to be greater than %d" % (
+            peptide, n, desired_length)
+    end_range = n - end_offset - n_skip + 1
+    return [
+        peptide[:i] + peptide[i + n_skip:]
+        for i in range(start_offset, end_range)
+    ]
 
 
-def index_encoding(peptides, peptide_length):
+def fixed_length_from_many_peptides(
+        peptides,
+        desired_length,
+        start_offset_extend=2,
+        end_offset_extend=1,
+        start_offset_shorten=2,
+        end_offset_shorten=0,
+        insert_amino_acid_letters=common_amino_acid_letters):
     """
-    Encode a set of equal length peptides as a vector of their
-    amino acid indices.
+    Create a set of fixed-length k-mer peptides from a collection of varying
+    length peptides.
+
+
+    Shorter peptides are filled in using insert_amino_acid_letters at any
+    insertion site between start_offset_extend and -end_offset_extend,
+    where start_offset_extend=0 represents insertions before the string
+    and end_offset_extend=0 represents insertions after the string's ending.
+
+    Longer peptides are shortened by deleting contiguous residues, starting
+    from start_offset_shorten and ending with -end_offset_shorten. Unlike
+    peptide extensions, the offsets for shortening a peptide range between
+    the first and last positions (rather than between the positions *before*
+    the string starts and the position *after*).
+
+    We can recreate the methods from:
+       Accurate approximation method for prediction of class I MHC
+       affinities for peptides of length 8, 10 and 11 using prediction
+       tools trained on 9mers.
+    by Lundegaard et al. (http://bioinformatics.oxfordjournals.org/content/24/11/1397)
+    with the following settings:
+        - desired_length = 9
+        - start_offset_extend = 3
+        - end_offset_extend = 2
+        - start_offset_shorten = 3
+        - end_offset_shorten = 1
+
+    Returns three lists:
+        - a list of fixed length peptides (all of length `desired_length`)
+        - a list of the original peptides from which subsequences were
+          contracted or lengthened
+        - a list of integers indicating the number of fixed length peptides
+         generated from each variable length peptide.
+
+    Example:
+        kmers, original, counts = fixed_length_from_many_peptides(
+            peptides=["ABC", "A"],
+            desired_length=2,
+            start_offset_extend=0,
+            end_offset_extend=0,
+            start_offset_shorten=0,
+            end_offset_shorten=0,
+            insert_amino_acid_letters="ABC")
+        kmers == ["BC", "AC", "AB", "AA", "BA", "CA", "AA", "AB", "AC"]
+        original == ["ABC", "ABC", "ABC", "A", "A", "A", "A", "A", "A"]
+        counts == [3, 3, 3, 6, 6, 6, 6, 6, 6]
+
+    Parameters
+    ----------
+    peptides : list of str
+
+    desired_length : int
+
+    start_offset_extend : int
+
+    end_offset_extend : int
+
+    start_offset_shorten : int
+
+    end_offset_shorten : int
+
+    insert_amino_acid_letters : str | list of characters
     """
-    X = np.zeros((len(peptides), peptide_length), dtype=int)
-    for i, peptide in enumerate(peptides):
-        for j, amino_acid in enumerate(peptide):
-            X[i, j] = amino_acid_letter_indices[amino_acid]
-    return X
+    all_fixed_length_peptides = []
+    original_peptide_sequences = []
+    counts = []
+    for peptide in peptides:
+        n = len(peptide)
+        if n == desired_length:
+            fixed_length_peptides = [peptide]
+        elif n < desired_length:
+            try:
+                fixed_length_peptides = extend_peptide(
+                    peptide=peptide,
+                    desired_length=desired_length,
+                    start_offset=start_offset_extend,
+                    end_offset=end_offset_extend,
+                    insert_amino_acid_letters=insert_amino_acid_letters)
+            except CombinatorialExplosion:
+                logging.warn(
+                    "Peptide %s is too short to be extended to length %d" % (
+                        peptide, desired_length))
+                continue
+        else:
+            fixed_length_peptides = shorten_peptide(
+                peptide=peptide,
+                desired_length=desired_length,
+                start_offset=start_offset_shorten,
+                end_offset=end_offset_shorten,
+                insert_amino_acid_letters=insert_amino_acid_letters)
+
+        n_fixed_length = len(fixed_length_peptides)
+        all_fixed_length_peptides.extend(fixed_length_peptides)
+        original_peptide_sequences.extend([peptide] * n_fixed_length)
+        counts.extend([n_fixed_length] * n_fixed_length)
+    return all_fixed_length_peptides, original_peptide_sequences, counts
 
 
 def indices_to_hotshot_encoding(X, n_indices=None, first_index_value=0):
@@ -68,7 +254,8 @@ def fixed_length_index_encoding(
         start_offset_shorten=0,
         end_offset_shorten=0,
         start_offset_extend=0,
-        end_offset_extend=0):
+        end_offset_extend=0,
+        allow_unknown_amino_acids=False):
     """
     Take peptides of varying lengths, chop them into substrings of fixed
     length and apply index encoding to these substrings.
@@ -94,12 +281,20 @@ def fixed_length_index_encoding(
     be useful for down-weighting the importance of multiple feature vectors
     which originate from the same sample.
     """
+    if allow_unknown_amino_acids:
+        insert_letters = ["X"]
+        index_encoding = amino_acids_with_unknown.index_encoding
+    else:
+        insert_letters = common_amino_acids.letters()
+        index_encoding = common_amino_acids.index_encoding
+
     fixed_length, original, counts = fixed_length_from_many_peptides(
         peptides=peptides,
         desired_length=desired_length,
         start_offset_shorten=start_offset_shorten,
         end_offset_shorten=end_offset_shorten,
         start_offset_extend=start_offset_extend,
-        end_offset_extend=end_offset_extend)
+        end_offset_extend=end_offset_extend,
+        insert_amino_acid_letters=insert_letters)
     X = index_encoding(fixed_length, desired_length)
     return X, original, counts
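
A minimal sketch of the relocated expansion helpers with the new insert_amino_acid_letters / allow_unknown_amino_acids options, assuming the diff above is applied:

    from mhcflurry.peptide_encoding import (
        fixed_length_from_many_peptides,
        fixed_length_index_encoding,
    )

    # Expand an 8mer into 9mers by inserting a single 'X' at each allowed position
    ninemers, originals, counts = fixed_length_from_many_peptides(
        peptides=["SIINFEKL"],
        desired_length=9,
        insert_amino_acid_letters=["X"])

    # Same expansion plus index encoding against the 21-letter alphabet
    X, originals, counts = fixed_length_index_encoding(
        peptides=["SIINFEKL"],
        desired_length=9,
        allow_unknown_amino_acids=True)
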
diff --git a/test/test_fixed_length_peptides.py b/test/test_fixed_length_peptides.py
index ebe0f8e25cc0ce2b4861275cfdb465b054c68ba8..733ef8a8f49737762df93479457c9b38399a5c7d 100644
--- a/test/test_fixed_length_peptides.py
+++ b/test/test_fixed_length_peptides.py
@@ -1,4 +1,4 @@
-from mhcflurry.fixed_length_peptides import (
+from mhcflurry.peptide_encoding import (
     all_kmers,
     extend_peptide,
     shorten_peptide,
@@ -26,7 +26,7 @@ def test_extend_peptide_all_positions():
         desired_length=4,
         start_offset=0,
         end_offset=0,
-        alphabet="01")
+        insert_amino_acid_letters="01")
 
     expected = [
         "0111",
@@ -48,7 +48,7 @@ def test_shorten_peptide_all_positions():
         desired_length=2,
         start_offset=0,
         end_offset=0,
-        alphabet="012")
+        insert_amino_acid_letters="012")
 
     expected = [
         "12",
@@ -65,7 +65,7 @@ def test_shorten_peptide_all_positions_except_first():
         desired_length=2,
         start_offset=1,
         end_offset=0,
-        alphabet="012")
+        insert_amino_acid_letters="012")
 
     expected = [
         "02",
@@ -81,7 +81,7 @@ def test_shorten_peptide_all_positions_except_last():
         desired_length=2,
         start_offset=0,
         end_offset=1,
-        alphabet="012")
+        insert_amino_acid_letters="012")
 
     expected = [
         "12",
@@ -98,7 +98,7 @@ def test_fixed_length_from_many_peptides():
         end_offset_extend=0,
         start_offset_shorten=0,
         end_offset_shorten=0,
-        alphabet="ABC")
+        insert_amino_acid_letters="ABC")
     print(kmers)
     print(original)
     print(counts)
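
A hypothetical extra test in the same style, assuming the renamed insert_amino_acid_letters keyword above; the expected list follows directly from inserting a single 'X' at each allowed position:

    def test_extend_peptide_with_unknown_amino_acid():
        # "AB" -> 3mers, inserting only 'X' at positions 0, 1 and 2
        results = extend_peptide(
            "AB",
            desired_length=3,
            start_offset=0,
            end_offset=0,
            insert_amino_acid_letters=["X"])
        assert results == ["XAB", "AXB", "ABX"]
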