diff --git a/downloads-generation/data_kim2014/GENERATE.sh b/downloads-generation/data_kim2014/GENERATE.sh
index 3ee0078cf12ed7963ef372f8bedd40ef25ba38f0..3ae9a6b70bdd76aba2b323ce7d1c168086d04235 100755
--- a/downloads-generation/data_kim2014/GENERATE.sh
+++ b/downloads-generation/data_kim2014/GENERATE.sh
@@ -18,7 +18,7 @@ exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
 # Log some environment info
 date
 pip freeze
-git rev-parse HEAD
+# git rev-parse HEAD
 git status
 
 cd $SCRATCH_DIR/$DOWNLOAD_NAME
diff --git a/downloads-generation/models_class1_allele_specific_single/GENERATE.sh b/downloads-generation/models_class1_allele_specific_single/GENERATE.sh
index f49fc99c20d62ee71e7502a1b0eee291a3384250..93463556a3f946d77522ab590d9b76bdf033fc76 100755
--- a/downloads-generation/models_class1_allele_specific_single/GENERATE.sh
+++ b/downloads-generation/models_class1_allele_specific_single/GENERATE.sh
@@ -25,9 +25,9 @@ exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
 
 # Log some environment info
 date
-pip freeze
-git rev-parse HEAD
-git status
+# pip freeze
+# git rev-parse HEAD
+# git status
 
 cd $SCRATCH_DIR/$DOWNLOAD_NAME
 
diff --git a/mhcflurry/__init__.py b/mhcflurry/__init__.py
index dc67b8bc3bff77fee5d5d845de4f7535eb73a480..a2b1d664cabb8f417700ef252abf27c73d01353c 100644
--- a/mhcflurry/__init__.py
+++ b/mhcflurry/__init__.py
@@ -15,9 +15,10 @@
 from .class1_allele_specific.class1_binding_predictor import (
     Class1BindingPredictor)
 from .predict import predict
-from .package_metadata import __version__
 from . import parallelism
 
+__version__ = "0.2.0"
+
 __all__ = [
     "Class1BindingPredictor",
     "predict",
diff --git a/mhcflurry/class1_allele_specific/cross_validation.py b/mhcflurry/class1_allele_specific/cross_validation.py
index 1bf0038fdde8f9ab2a6808693bc18d0e93fdba75..d2189377e0d9f28d4d608bda9186b9a6095b06e5 100644
--- a/mhcflurry/class1_allele_specific/cross_validation.py
+++ b/mhcflurry/class1_allele_specific/cross_validation.py
@@ -20,12 +20,15 @@ from __future__ import (
 import collections
 import logging
 
-import pepdata
+from pepdata.reduced_alphabet import make_alphabet_transformer, gbmr4
 
 from .train import impute_and_select_allele, AlleleSpecificTrainTestFold
 from ..parallelism import get_default_backend
 
-gbmr4_transformer = pepdata.reduced_alphabet.make_alphabet_transformer("gbmr4")
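+# gbmr4 is a reduced amino acid alphabet grouping the 20 residues into 4
+# classes; projecting peptides through it makes similar peptides collide,
+# which this module uses to detect overlap between train and test folds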
+gbmr4_transformer = make_alphabet_transformer(gbmr4)
 
 
 def default_projector(peptide):
@@ -70,7 +70,8 @@ def similar_peptides(set1, set2, projector=default_projector):
 
     Returns
     ----------
-    string list
+    list of strings : peptides which approximately overlap between the two
+    input sets (i.e. which project to the same reduced representation)
     """
     result = collections.defaultdict(lambda: ([], []))
     for (index, peptides) in enumerate([set1, set2]):
@@ -156,6 +157,10 @@ def cross_validation_folds(
                 )
 
             if peptides_to_remove:
+                # TODO: instead of dropping peptides, downweight the
+                # peptides which get grouped together
+                # For example, we could replace this code with
+                #   test_peptides, test_peptide_weights = ....
                 test_split = full_test_split.drop_allele_peptide_lists(
                     [allele] * len(peptides_to_remove),
                     peptides_to_remove)
diff --git a/mhcflurry/keras_layers/drop_mask.py b/mhcflurry/keras_layers/drop_mask.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ea799ca15ec7f5561f71d721fe6619036c74401
--- /dev/null
+++ b/mhcflurry/keras_layers/drop_mask.py
@@ -0,0 +1,26 @@
+from keras.layers import Layer
+
+class DropMask(Layer):
+    """
+    Sometimes we know that a mask is always going to contain 1s (and never 0s)
+    due to e.g. slicing the beginning of a sequence with a known min length.
+    In that case it can be useful to drop the sequence mask and feed the
+    activations to a layer which does not support masking (e.g. Dense).
+    """
+    supports_masking = True
+
+    def call(self, x, mask):
+        return x
+
+    def compute_mask(self, x, mask):
+        return None
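+
+# Minimal usage sketch (assumes the Keras 1.x functional API and the sibling
+# MaskedSlice layer; the surrounding layers and sizes are illustrative):
+#
+#   from keras.layers import Input, Embedding, Flatten, Dense
+#   from mhcflurry.keras_layers.masked_slice import MaskedSlice
+#   x = Embedding(21, 32, mask_zero=True)(Input(shape=(20,), dtype="int32"))
+#   x = MaskedSlice(0, 3)(x)   # first 4 timesteps exist for every sample
+#   x = DropMask()(x)          # their mask is all 1s, so discard it
+#   y = Dense(1)(Flatten()(x)) # Flatten/Dense do not support masking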
diff --git a/mhcflurry/keras_layers/masked_global_average_pooling.py b/mhcflurry/keras_layers/masked_global_average_pooling.py
new file mode 100644
index 0000000000000000000000000000000000000000..187cc9c862dfd4f3896c82680d249a6653350888
--- /dev/null
+++ b/mhcflurry/keras_layers/masked_global_average_pooling.py
@@ -0,0 +1,37 @@
+import keras.layers
+import keras.backend as K
+
+class MaskedGlobalAveragePooling1D(keras.layers.pooling._GlobalPooling1D):
+    """
+    Takes an embedded representation of a sentence with dims
+    (n_samples, max_length, n_dims)
+    where each sample is masked to allow for variable-length inputs.
+    Returns a tensor of shape (n_samples, n_dims) after averaging across
+    time in a mask-sensitive fashion.
+    """
+    supports_masking = True
+
+    def call(self, x, mask):
+        expanded_mask = K.expand_dims(mask)
+        # zero embedded vectors which come from masked characters
+        x_masked = x * expanded_mask
+        # how many non-masked characters are in each row?
+        mask_counts = K.sum(mask, axis=-1)
+        # add up the vector representations along the time dimension
+        # the result should have dimension (n_samples, n_embedding_dims)
+        x_sums = K.sum(x_masked, axis=1)
+        # cast the number of non-zero elements to float32 and
+        # give it an extra dimension so it can broadcast properly in
+        # an elementwise division
+        counts_cast = K.expand_dims(K.cast(mask_counts, "float32"))
+        return x_sums / counts_cast
+
+    def compute_mask(self, x, mask):
+        return None
+
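+# Minimal usage sketch (Keras 1.x functional API; sizes are illustrative):
+#
+#   from keras.layers import Input, Embedding
+#   x = Embedding(21, 32, mask_zero=True)(Input(shape=(15,), dtype="int32"))
+#   pooled = MaskedGlobalAveragePooling1D()(x)   # -> (n_samples, 32)
+#   (note: a row whose mask is entirely 0 would divide by zero above)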
diff --git a/mhcflurry/keras_layers/masked_global_max_pooling.py b/mhcflurry/keras_layers/masked_global_max_pooling.py
new file mode 100644
index 0000000000000000000000000000000000000000..5bec252a5a445cb691c9ed2503cc5402a3d7e0ad
--- /dev/null
+++ b/mhcflurry/keras_layers/masked_global_max_pooling.py
@@ -0,0 +1,28 @@
+import keras.layers
+import keras.backend as K
+
+class MaskedGlobalMaxPooling1D(keras.layers.pooling._GlobalPooling1D):
+    """
+    Takes an embedded representation of a sentence with dims
+    (n_samples, max_length, n_dims)
+    where each sample is masked to allow for variable-length inputs.
+    Returns a tensor of shape (n_samples, n_dims) after max pooling across
+    time in a mask-sensitive fashion.
+    """
+    supports_masking = True
+
+    def call(self, x, mask):
+        expanded_mask = K.expand_dims(mask)
+        # zero embedded vectors which come from masked characters
+        x_masked = x * expanded_mask
+
+        # one flaw here is that masked positions were zeroed above, so we
+        # return max(0, max(x[:, i])) instead of max(x[:, i]); this matters
+        # when all unmasked activations in a row are negative
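+        # (a standard workaround, not applied here, is to add a large
+        # negative constant to the masked positions before taking the max:
+        #   x_masked = x - (1 - expanded_mask) * 1e9 )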
+        return K.max(x_masked, axis=1)
+
+    def compute_mask(self, x, mask):
+        return None
diff --git a/mhcflurry/keras_layers/masked_slice.py b/mhcflurry/keras_layers/masked_slice.py
new file mode 100644
index 0000000000000000000000000000000000000000..022c7f79058763a878556adb42ad71971f02ff2b
--- /dev/null
+++ b/mhcflurry/keras_layers/masked_slice.py
@@ -0,0 +1,47 @@
+import keras.layers
+
+class MaskedSlice(keras.layers.Layer):
+    """
+    Takes an embedded representation of a sentence with dims
+    (n_samples, max_length, n_dims)
+    where each sample is masked to allow for variable-length inputs.
+    Returns a tensor of shape (n_samples, time_end - time_start + 1, n_dims)
+    containing the timesteps from time_start through time_end (inclusive)
+    of each sentence.
+    """
+    supports_masking = True
+
+    def __init__(
+            self,
+            time_start,
+            time_end,
+            *args,
+            **kwargs):
+        assert time_start >= 0
+        assert time_end >= time_start
+        self.time_start = time_start
+        self.time_end = time_end
+        super(MaskedSlice, self).__init__(*args, **kwargs)
+
+    def call(self, x, mask):
+        # time_end is inclusive, matching get_output_shape_for below
+        return x[:, self.time_start:self.time_end + 1, :]
+
+    def compute_mask(self, x, mask):
+        # the mask is 2D (n_samples, max_length): slice the time axis only
+        return mask[:, self.time_start:self.time_end + 1]
+
+    def get_output_shape_for(self, input_shape):
+        assert len(input_shape) == 3
+        output_shape = (
+            input_shape[0],
+            self.time_end - self.time_start + 1,
+            input_shape[2])
+        return output_shape
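+
+# Minimal usage sketch (Keras 1.x functional API; sizes are illustrative):
+#
+#   from keras.layers import Input, Embedding
+#   x = Embedding(21, 32, mask_zero=True)(Input(shape=(15,), dtype="int32"))
+#   first_three = MaskedSlice(0, 2)(x)   # inclusive bounds ->
+#                                        # (n_samples, 3, 32)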
diff --git a/mhcflurry/package_metadata.py b/mhcflurry/package_metadata.py
deleted file mode 100644
index 36e61c641d912318fc6e395d4913c21c77c544ff..0000000000000000000000000000000000000000
--- a/mhcflurry/package_metadata.py
+++ /dev/null
@@ -1,2 +0,0 @@
-
-__version__ = "0.1.0"
diff --git a/mhcflurry/parallelism.py b/mhcflurry/parallelism.py
index faecc871469210058ebb11f3323ae0a1ccdc20ce..df8dcfab8ff6b054ee65a2b1fc71df85f204b7bc 100644
--- a/mhcflurry/parallelism.py
+++ b/mhcflurry/parallelism.py
@@ -1,5 +1,5 @@
-from concurrent import futures
 import logging
+from concurrent import futures
 
 DEFAULT_BACKEND = None
 
diff --git a/setup.py b/setup.py
index 909ba1f78c4328e7374ce3243423e1ce72d35e1c..dd66f5813776311e7bcaa4269175373e26a8cfce 100644
--- a/setup.py
+++ b/setup.py
@@ -41,7 +41,7 @@ except:
     pass
 
 
-with open('mhcflurry/package_metadata.py', 'r') as f:
+with open('mhcflurry/__init__.py', 'r') as f:
     version = re.search(
         r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]',
         f.read(),