From e3e6f5eb9ce86dae8d94062b21b105cee192f513 Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Mon, 21 Dec 2015 12:39:55 -0500
Subject: [PATCH] pulling together more bits of isolated training scripts into
 helpers file
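
The script's inline stacking of actual and synthetic data (X, Y, sample
weights) moves into a new train_model_with_synthetic_data helper;
create_and_evaluate_model_with_synthetic_data now builds the embedding
network from the dropout / embedding / hidden-layer parameters and
delegates training to it. The script keeps two small new helpers
(rescale_ic50, load_synthetic_data) for turning synthetic IC50 values
into regression targets in [0, 1].

The rescaling is the same log-transform the script used inline before.
As a rough worked example (values approximate, using the helper's
default max_ic50 of 50000):

    import numpy as np

    max_ic50 = 50000.0
    ic50 = 500.0  # nM
    y = max(0.0, min(1.0, 1.0 - np.log(ic50) / np.log(max_ic50)))
    # y ~= 1 - 6.21 / 10.82 ~= 0.43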

---
 ...train-models-with-synthetic-pretraining.py |  93 ++++++------
 experiments/training_helpers.py               | 131 +++++++++++++-----
 2 files changed, 140 insertions(+), 84 deletions(-)
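
Reviewer note (below the diffstat, not part of the commit message): the
fitting loop keeps the schedule of down-weighting synthetic samples over
time. Per the comment inside the loop the decay is roughly
1 / (1 + epoch)**2, and synthetic data stops being used once its total
decayed weight falls below a thousandth of the actual data's total
weight. A minimal sketch of that schedule follows; the exact
decay_factor line sits in unchanged context, so its formula here is an
assumption taken from that comment:

    import numpy as np

    epoch = 31                             # 0-based; printed as "Epoch 32"
    decay_factor = 1.0 / (1 + epoch) ** 2  # assumed from the in-loop comment
    synthetic_sample_weights = np.ones(1000)  # made-up weights, equal totals
    original_sample_weights = np.ones(1000)
    synth_contribution = synthetic_sample_weights.sum() * decay_factor
    if synth_contribution < original_sample_weights.sum() / 1000:
        # fires once (1 + epoch)**2 > 1000, i.e. from epoch 31 onward here
        print("training this epoch on actual data only")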

diff --git a/experiments/train-models-with-synthetic-pretraining.py b/experiments/train-models-with-synthetic-pretraining.py
index 4d3f90e9..c21a9caa 100644
--- a/experiments/train-models-with-synthetic-pretraining.py
+++ b/experiments/train-models-with-synthetic-pretraining.py
@@ -29,6 +29,7 @@ from mhcflurry.data import (
 from arg_parsing import parse_int_list, parse_float_list
 from dataset_paths import PETERS2009_CSV_PATH
 from common import load_csv_binding_data_as_dict
+from training_helpers import create_and_evaluate_model_with_synthetic_data
 
 parser = argparse.ArgumentParser()
 
@@ -159,6 +160,25 @@ def data_augmentation(
     return aucs, f1s, n_originals
 
 
+def rescale_ic50(ic50, max_ic50):
+    log_ic50 = np.log(ic50) / np.log(max_ic50)
+    return max(0.0, min(1.0, 1.0 - log_ic50))
+
+
+def load_synthetic_data(csv_path, max_ic50):
+    synthetic_allele_to_peptide_to_ic50_dict = load_csv_binding_data_as_dict(
+        csv_path)
+    return {
+        allele: {
+            peptide: rescale_ic50(ic50, max_ic50=max_ic50)
+            for (peptide, ic50)
+            in allele_dict.items()
+        }
+        for (allele, allele_dict)
+        in synthetic_allele_to_peptide_to_ic50_dict.items()
+    }
+
+
 if __name__ == "__main__":
     args = parser.parse_args()
     print(args)
@@ -169,70 +189,39 @@ if __name__ == "__main__":
         max_ic50=args.max_ic50,
         only_human=False)
     print("Loading synthetic data from %s" % args.synthetic_data_csv)
-    synthetic_allele_to_peptide_to_ic50_dict = load_csv_binding_data_as_dict(
-        args.synthetic_data_csv)
-    synthetic_allele_to_peptide_to_y_dict = {
-        allele: {
-            peptide: max(
-                0.0,
-                min(
-                    1.0,
-                    1.0 - np.log(ic50) / np.log(args.max_ic50)))
-            for (peptide, ic50)
-            in allele_dict.items()
-        }
-        for (allele, allele_dict)
-        in synthetic_allele_to_peptide_to_ic50_dict.items()
-    }
+
+    synthetic_affinities = load_synthetic_data(
+        csv_path=args.synthetic_data_csv,
+        max_ic50=args.max_ic50)
 
     combined_allele_set = set(allele_datasets.keys()).union(
-        synthetic_allele_to_peptide_to_y_dict.keys())
+        synthetic_affinities.keys())
+
     combined_allele_list = list(sorted(combined_allele_set))
     for allele in combined_allele_list:
-        actual_dataset = allele_datasets[allele]
-        X_actual = actual_dataset.X_index
-        weights_actual = actual_dataset.weights
-        Y_actual = actual_dataset.Y
-
-        synthetic_dict = synthetic_allele_to_peptide_to_y_dict[allele]
-
-        _, _, C_synth, X_synth, _, Y_synth = encode_peptide_to_affinity_dict(
-            synthetic_dict)
-
-        n_actual_samples, n_actual_dims = X_actual.shape
-        n_synth_samples, n_synth_dims = X_synth.shape
-        assert n_actual_dims == n_synth_dims, \
-            "Mismatch between # of actual dims %d and synthetic dims %d" % (
-                n_actual_dims, n_synth_dims)
-        print("-- Using %d actual samples and %d synthetic samples for %s" % (
-            n_actual_samples, n_synth_samples, allele))
-        X = np.vstack([X_actual, X_synth])
-        print("-- X.shape = %s, dtype = %s" % (X.shape, X.dtype))
-        n_samples = n_actual_samples + n_synth_samples
-        assert X.shape[0] == n_samples, \
-            "Expected %d samples but got data array with shape %s" % (
-                n_actual_samples + n_synth_samples, X.shape)
-        Y = np.concatenate([Y_actual, Y_synth])
-        print("-- Y.shape = %s, dtype = %s" % (Y.shape, Y.dtype))
-        assert Y.min() >= 0, \
-            "Y should not contain negative numbers! Y.min() = %f" % (Y.min(),)
-        assert Y.max() <= 1, \
-            "Y should have max value 1.0, got Y.max() = %f" % (Y.max(),)
-        weights_synth = 1.0 / C_synth
-        weights = np.concatenate([weights_actual, weights_synth])
-        assert len(weights) == n_samples
-        print("-- weights.shape = %s, dtype = %s" % (
-            weights.shape, weights.dtype))
+        synthetic_allele_dict = synthetic_affinities[allele]
+        (_, _, Counts_synth, X_synth, _, Y_synth) = \
+            encode_peptide_to_affinity_dict(synthetic_allele_dict)
+        synthetic_sample_weights = 1.0 / Counts_synth
         scores = {}
         for dropout in args.dropouts:
             for embedding_dim_size in args.embedding_dim_sizes:
                 for hidden_layer_size in args.hidden_layer_sizes:
                     params = (
-                        ("dropout", dropout),
+                        ("dropout_probability", dropout),
                         ("embedding_dim_size", embedding_dim_size),
                         ("hidden_layer_size", hidden_layer_size),
                     )
-                    tau, auc, f1 = evaluate_model(**dict(params))
+                    tau, auc, f1 = create_and_evaluate_model_with_synthetic_data(
+                        X_original=allele_datasets[allele].X_index,
+                        Y_original=allele_datasets[allele].Y,
+                        X_synth=X_synth,
+                        Y_synth=Y_synth,
+                        original_sample_weights=allele_datasets[allele].weights,
+                        synthetic_sample_weights=synthetic_sample_weights,
+                        n_training_epochs=150,
+                        max_ic50=args.max_ic50,
+                        **dict(params))
                     scores[params] = (tau, auc, f1)
                     print("%s => tau=%f, AUC=%f, F1=%f" % (
                         params,
diff --git a/experiments/training_helpers.py b/experiments/training_helpers.py
index 5d1f2d3e..bc15780e 100644
--- a/experiments/training_helpers.py
+++ b/experiments/training_helpers.py
@@ -128,29 +128,47 @@ def train_model_and_return_scores(
     return (accuracy, auc, f1_score)
 
 
-def create_and_evaluate_model_with_synthetic_data(
-        X,
-        Y,
-        weights_synth,
-        weights_actual,
+def train_model_with_synthetic_data(
+        model,
         n_training_epochs,
-        dropout,
-        embedding_dim_size,
-        hidden_layer_size):
-    model = mhcflurry.feedforward.make_embedding_network(
-        peptide_length=9,
-        embedding_input_dim=20,
-        embedding_output_dim=4,
-        layer_sizes=[4],
-        activation="tanh",
-        init="lecun_uniform",
-        loss="mse",
-        output_activation="sigmoid",
-        dropout_probability=0.0,
-        optimizer=None,
-        learning_rate=0.001)
-    total_synth_weights = weights_synth.sum()
-    total_actual_weights = weights_actual.sum()
+        max_ic50,
+        X_original,
+        Y_original,
+        X_synth,
+        Y_synth,
+        original_sample_weights,
+        synthetic_sample_weights):
+    total_synth_weights = synthetic_sample_weights.sum()
+    total_original_weights = original_sample_weights.sum()
+    print("Mean Y=%f, Y_synth=%f, weight=%f, weight_synth=%f" % (
+        np.mean(Y_original),
+        np.mean(Y_synth),
+        np.mean(original_sample_weights),
+        np.mean(synthetic_sample_weights)))
+    n_actual_samples, n_actual_dims = X_original.shape
+    n_synth_samples, n_synth_dims = X_synth.shape
+    assert n_actual_dims == n_synth_dims, \
+        "Mismatch between # of actual dims %d and synthetic dims %d" % (
+            n_actual_dims, n_synth_dims)
+    X_combined = np.vstack([X_original, X_synth])
+    n_combined_samples = n_actual_samples + n_synth_samples
+    assert X_combined.shape[0] == n_combined_samples, \
+        "Expected %d samples but got data array with shape %s" % (
+            n_actual_samples + n_synth_samples, X_combined.shape)
+    Y_combined = np.concatenate([Y_original, Y_synth])
+    assert Y_combined.min() >= 0, \
+        "Y should not contain negative numbers! Y.min() = %f" % (
+            Y_combined.min(),)
+    assert Y_combined.max() <= 1, \
+        "Y should have max value 1.0, got Y.max() = %f" % (
+            Y_combined.max(),)
+    combined_weights = np.concatenate([
+        original_sample_weights,
+        synthetic_sample_weights
+    ])
+
+    assert len(combined_weights) == n_combined_samples
+
     for epoch in range(n_training_epochs):
         # weights for synthetic points can be shrunk as:
         #  ~ 1 / (1+epoch)**2
@@ -160,20 +178,69 @@ def create_and_evaluate_model_with_synthetic_data(
         # if the contribution of synthetic samples is less than a
         # thousandth of the actual data, then stop using it
         synth_contribution = total_synth_weights * decay_factor
-        if synth_contribution < total_actual_weights / 1000:
+        if synth_contribution < total_original_weights / 1000:
             print("Epoch %d, using only actual data" % (epoch + 1,))
             model.fit(
-                X_actual,
-                Y_actual,
-                sample_weight=weights_actual,
-                nb_epoch=1)
+                X_original,
+                Y_original,
+                sample_weight=original_sample_weights,
+                nb_epoch=1,
+                verbose=0)
         else:
             print("Epoch %d, synth decay factor = %f" % (
                 epoch + 1, decay_factor))
-            weights[n_actual_samples:] = weights_synth * decay_factor
-            model.fit(X, Y, sample_weight=weights, nb_epoch=1)
-        Y_pred = model.predict(X_actual)
-        print("Training MSE %0.4f" % ((Y_actual - Y_pred) ** 2).mean())
-
+            combined_weights[n_actual_samples:] = (
+                synthetic_sample_weights * decay_factor)
+            model.fit(
+                X_combined,
+                Y_combined,
+                sample_weight=combined_weights,
+                nb_epoch=1,
+                verbose=0)
+        Y_pred = model.predict(X_original)
+        training_mse = ((Y_original - Y_pred) ** 2).mean()
+        print(
+            "-- Epoch %d/%d Training MSE %0.4f" % (
+                epoch + 1,
+                n_training_epochs,
+                training_mse))
 
 
+def create_and_evaluate_model_with_synthetic_data(
+        X_original,
+        Y_original,
+        X_synth,
+        Y_synth,
+        original_sample_weights=None,
+        synthetic_sample_weights=None,
+        n_training_epochs=150,
+        embedding_dim_size=16,
+        hidden_layer_size=50,
+        dropout_probability=0.0,
+        max_ic50=50000.0):
+    if original_sample_weights is None:
+        original_sample_weights = np.ones(len(X_original), dtype=float)
+    if synthetic_sample_weights is None:
+        synthetic_sample_weights = np.ones(len(X_synth), dtype=float)
+    model = mhcflurry.feedforward.make_embedding_network(
+        peptide_length=9,
+        embedding_input_dim=20,
+        embedding_output_dim=embedding_dim_size,
+        layer_sizes=[hidden_layer_size],
+        activation="tanh",
+        init="lecun_uniform",
+        loss="mse",
+        output_activation="sigmoid",
+        dropout_probability=dropout_probability,
+        optimizer=None,
+        learning_rate=0.001)
+    train_model_with_synthetic_data(
+        model=model,
+        n_training_epochs=n_training_epochs,
+        max_ic50=max_ic50,
+        X_original=X_original,
+        Y_original=Y_original,
+        X_synth=X_synth,
+        Y_synth=Y_synth,
+        original_sample_weights=original_sample_weights,
+        synthetic_sample_weights=synthetic_sample_weights)
-- 
GitLab