From 18d451d26a1fb45b46a8388fddef690b45c8fc77 Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Thu, 5 Sep 2019 17:31:35 -0400 Subject: [PATCH] docs --- .travis.yml | 2 +- mhcflurry/train_pan_allele_models_command.py | 44 ++++++++++++++++++++ 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 6676b2e9..a75ba01c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -50,7 +50,7 @@ env: - KERAS_BACKEND=tensorflow script: # download data and models, then run tests - - mhcflurry-downloads fetch + - mhcflurry-downloads fetch data_curated models_class1 models_class1_pan - mhcflurry-downloads info # just to test this command works - nosetests test -sv - ./lint.sh diff --git a/mhcflurry/train_pan_allele_models_command.py b/mhcflurry/train_pan_allele_models_command.py index cf6a8624..d925f991 100644 --- a/mhcflurry/train_pan_allele_models_command.py +++ b/mhcflurry/train_pan_allele_models_command.py @@ -133,6 +133,35 @@ add_cluster_parallelism_args(parser) def assign_folds(df, num_folds, held_out_fraction, held_out_max): + """ + Split training data into multiple test/train pairs, which we refer to as + folds. Note that a given data point may be assigned to multiple test or + train sets; these folds are NOT a non-overlapping partition as used in cross + validation. + + A fold is defined by a boolean value for each data point, indicating whether + it is included in the training data for that fold. If it's not in the + training data, then it's in the test data. + + Folds are balanced in terms of allele content. + + Parameters + ---------- + df : pandas.DataFrame + training data + num_folds : int + held_out_fraction : float + Fraction of data to hold out as test data in each fold + held_out_max + For a given allele, do not hold out more than held_out_max number of + data points in any fold. + + Returns + ------- + pandas.DataFrame + index is same as df.index, columns are "fold_0", ... 
"fold_N" giving + whether the data point is in the training data for the fold + """ result_df = pandas.DataFrame(index=df.index) for fold in range(num_folds): @@ -183,6 +212,21 @@ def pretrain_data_iterator( filename, master_allele_encoding, peptides_per_chunk=1024): + """ + Step through a CSV file giving predictions for a large number of peptides + (rows) and alleles (columns). + + Parameters + ---------- + filename : string + master_allele_encoding : AlleleEncoding + peptides_per_chunk : int + + Returns + ------- + Generator of (AlleleEncoding, EncodableSequences, float affinities) tuples + + """ empty = pandas.read_csv(filename, index_col=0, nrows=0) usable_alleles = [ c for c in empty.columns -- GitLab