From 18d451d26a1fb45b46a8388fddef690b45c8fc77 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Thu, 5 Sep 2019 17:31:35 -0400
Subject: [PATCH] docs

---
 .travis.yml                                  |  2 +-
 mhcflurry/train_pan_allele_models_command.py | 44 ++++++++++++++++++++
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 6676b2e9..a75ba01c 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -50,7 +50,7 @@ env:
     - KERAS_BACKEND=tensorflow
 script:
   # download data and models, then run tests
-  - mhcflurry-downloads fetch
+  - mhcflurry-downloads fetch data_curated models_class1 models_class1_pan
   - mhcflurry-downloads info  # just to test this command works
   - nosetests test -sv
   - ./lint.sh
diff --git a/mhcflurry/train_pan_allele_models_command.py b/mhcflurry/train_pan_allele_models_command.py
index cf6a8624..d925f991 100644
--- a/mhcflurry/train_pan_allele_models_command.py
+++ b/mhcflurry/train_pan_allele_models_command.py
@@ -133,6 +133,35 @@ add_cluster_parallelism_args(parser)
 
 
 def assign_folds(df, num_folds, held_out_fraction, held_out_max):
+    """
+    Split training data into multiple test/train pairs, which we refer to as
+    folds. Note that a given data point may be assigned to multiple test or
+    train sets; these folds are NOT a non-overlapping partition as used in cross
+    validation.
+
+    A fold is defined by a boolean value for each data point, indicating whether
+    it is included in the training data for that fold. If it's not in the
+    training data, then it's in the test data.
+
+    Folds are balanced in terms of allele content.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+        training data
+    num_folds : int
+    held_out_fraction : float
+        Fraction of data to hold out as test data in each fold
+    held_out_max
+        For a given allele, do not hold out more than held_out_max number of
+        data points in any fold.
+
+    Returns
+    -------
+    pandas.DataFrame
+        index is same as df.index, columns are "fold_0", ... "fold_N" giving
+        whether the data point is in the training data for the fold
+    """
     result_df = pandas.DataFrame(index=df.index)
 
     for fold in range(num_folds):
@@ -183,6 +212,21 @@ def pretrain_data_iterator(
         filename,
         master_allele_encoding,
         peptides_per_chunk=1024):
+    """
+    Step through a CSV file giving predictions for a large number of peptides
+    (rows) and alleles (columns).
+
+    Parameters
+    ----------
+    filename : string
+    master_allele_encoding : AlleleEncoding
+    peptides_per_chunk : int
+
+    Returns
+    -------
+    Generator of (AlleleEncoding, EncodableSequences, float affinities) tuples
+
+    """
     empty = pandas.read_csv(filename, index_col=0, nrows=0)
     usable_alleles = [
         c for c in empty.columns
-- 
GitLab