From 4935045d1a42653c96fdfdba1656287ab5adf97d Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Fri, 30 Oct 2015 11:59:48 -0400
Subject: [PATCH] use stratified k-fold to build ensembles for test accuracy

---
 experiments/extend-test-predictions.py | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/experiments/extend-test-predictions.py b/experiments/extend-test-predictions.py
index 3a856a2c..7da0751a 100755
--- a/experiments/extend-test-predictions.py
+++ b/experiments/extend-test-predictions.py
@@ -31,7 +31,7 @@ from itertools import groupby
 
 import pandas as pd
 import numpy as np
-from sklearn.cross_validation import KFold
+from sklearn.cross_validation import StratifiedKFold
 from mhcflurry.data_helpers import load_data, index_encoding, hotshot_encoding
 from mhcflurry.common import normalize_allele_name, expand_9mer_peptides
 
@@ -232,19 +232,31 @@ if __name__ == "__main__":
         if not training_epochs:
             training_epochs = max(1, int(10 ** 6 / len(Y_train)))
 
-        for i, (cv_train_indices, cv_test_indices) in KFold(args.ensemble_size):
+        for i, (cv_train_indices, cv_test_indices) in enumerate(StratifiedKFold(
+                y=(Y_train <= 500),
+                n_folds=args.ensemble_size,
+                shuffle=True)):
             for epoch in range(args.training_epochs):
                 models[i].fit(
-                    X_train[cv_train_indices],
+                    X_train[cv_train_indices, :],
                     Y_train[cv_train_indices],
                     nb_epoch=1,
                     batch_size=args.minibatch_size,
-                    shuffle=True)
-                fold_pred = models[i].predict(X_train[cv_test_indices])
-                print("Model #%d epoch #%d MSE=%0.4f" % (
+                    verbose=0)
+                cv_train_pred = models[i].predict(X_train[cv_train_indices, :])
+                cv_train_pred = cv_train_pred.flatten()
+                cv_train_mse = ((
+                    cv_train_pred - Y_train[cv_train_indices]) ** 2).mean()
+                cv_test_pred = models[i].predict(X_train[cv_test_indices, :])
+                cv_test_pred = cv_test_pred.flatten()
+                cv_test_mse = ((
+                    cv_test_pred - Y_train[cv_test_indices]) ** 2).mean()
+
+                print("Model #%d epoch #%d train MSE=%0.4f test MSE=%0.4f" % (
                     i + 1,
                     epoch + 1,
-                    ((fold_pred - Y_train[cv_test_indices]) ** 2).mean()
+                    cv_train_mse,
+                    cv_test_mse,
                 ))
 
         predictions = {}
-- 
GitLab