diff --git a/.travis.yml b/.travis.yml
index 6676b2e9eaf0dd4d3b932fb1bfd37700682a8c6d..a75ba01cd86ecf728eb4ff2affc80b7ed34e08ef 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -50,7 +50,7 @@ env:
     - KERAS_BACKEND=tensorflow
 script:
   # download data and models, then run tests
-  - mhcflurry-downloads fetch
+  - mhcflurry-downloads fetch data_curated models_class1 models_class1_pan
   - mhcflurry-downloads info  # just to test this command works
   - nosetests test -sv
   - ./lint.sh
diff --git a/mhcflurry/train_pan_allele_models_command.py b/mhcflurry/train_pan_allele_models_command.py
index cf6a86243dbc2a5ced64f10cbfd0d73a8fbef557..d925f991af0cc4d66cf4bbf081eaa6d0d698ec06 100644
--- a/mhcflurry/train_pan_allele_models_command.py
+++ b/mhcflurry/train_pan_allele_models_command.py
@@ -133,6 +133,35 @@ add_cluster_parallelism_args(parser)
 
 
 def assign_folds(df, num_folds, held_out_fraction, held_out_max):
+    """
+    Split training data into multiple test/train pairs, which we refer to as
+    folds. Note that a given data point may be assigned to multiple test or
+    train sets; these folds are NOT a non-overlapping partition as used in cross
+    validation.
+
+    A fold is defined by a boolean value for each data point, indicating whether
+    it is included in the training data for that fold. If it's not in the
+    training data, then it's in the test data.
+
+    Folds are balanced in terms of allele content.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+        training data
+    num_folds : int
+    held_out_fraction : float
+        Fraction of data to hold out as test data in each fold
+    held_out_max : int
+        For a given allele, do not hold out more than held_out_max number of
+        data points in any fold.
+
+    Returns
+    -------
+    pandas.DataFrame
+        index is same as df.index, columns are "fold_0", ... "fold_N" giving
+        whether the data point is in the training data for the fold
+    """
     result_df = pandas.DataFrame(index=df.index)
 
     for fold in range(num_folds):
@@ -183,6 +212,21 @@ def pretrain_data_iterator(
         filename,
         master_allele_encoding,
         peptides_per_chunk=1024):
+    """
+    Step through a CSV file giving predictions for a large number of peptides
+    (rows) and alleles (columns).
+
+    Parameters
+    ----------
+    filename : string
+    master_allele_encoding : AlleleEncoding
+    peptides_per_chunk : int
+
+    Returns
+    -------
+    Generator of (AlleleEncoding, EncodableSequences, float affinities) tuples
+
+    """
     empty = pandas.read_csv(filename, index_col=0, nrows=0)
     usable_alleles = [
         c for c in empty.columns