From 5aa591b01ce5862d5e45d93ca8c590c82a11555c Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Thu, 17 Dec 2015 14:52:55 -0500
Subject: [PATCH] split up CV code for checking best smoothing coefs for first
 type of data synthesis

---
 .../best-synthetic-data-hyperparams.py        | 247 ++++++++++--------
 experiments/model-selection.py                |   2 +
 ...train-models-with-synthetic-pretraining.py |  47 ----
 experiments/training_helpers.py               |   1 +
 mhcflurry/predict.py                          |  33 ++-
 5 files changed, 158 insertions(+), 172 deletions(-)

diff --git a/experiments/best-synthetic-data-hyperparams.py b/experiments/best-synthetic-data-hyperparams.py
index 0c523e9e..36df4f46 100755
--- a/experiments/best-synthetic-data-hyperparams.py
+++ b/experiments/best-synthetic-data-hyperparams.py
@@ -25,6 +25,7 @@ averaged across all given alleles.
 """
 
 import argparse
+from collections import defaultdict
 
 from mhcflurry.data import load_allele_dicts
 from scipy import stats
@@ -69,43 +70,32 @@ parser.add_argument(
     help="Affinities are synthesized by adding up y_ip * sim(i,j) ** exponent")
 
 
-def evaluate_synthetic_data(
+def generate_cross_validation_datasets(
         allele_to_peptide_to_affinity,
-        smoothing_coef,
-        exponent,
-        max_ic50,
         n_folds=4):
-    """
-    Use cross-validation over entries of each allele to determine the predictive
-    accuracy of data synthesis on known affinities.
-    """
-    taus = []
-    f1_scores = []
-    aucs = []
-
-    peptide_to_affinities = create_reverse_lookup_from_simple_dicts(
-        allele_to_peptide_to_affinity)
-    for allele, dataset in allele_to_peptide_to_affinity.items():
+    for allele, dataset in sorted(allele_to_peptide_to_affinity.items()):
         peptides = list(dataset.keys())
         affinities = list(dataset.values())
-        all_indices = np.arange(len(peptides))
-        np.random.shuffle(all_indices)
         n_samples = len(peptides)
-        cv_aucs = []
-        cv_taus = []
-        cv_f1_scores = []
-
-        if len(peptides) < n_folds:
+        print("Generating similarities for folds of %s data (n=%d)" % (
+            allele,
+            n_samples))
+
+        if n_samples < n_folds:
+            print("Too few samples (%d) for %d-fold cross-validation" % (
+                n_samples,
+                n_folds))
             continue
 
         kfold_iterator = enumerate(
             KFold(n_samples, n_folds=n_folds, shuffle=True))
 
         for fold_number, (train_indices, test_indices) in kfold_iterator:
+            print("-- Fold %d for %s" % (fold_number + 1, allele))
             train_peptides = [peptides[i] for i in train_indices]
             train_affinities = [affinities[i] for i in train_indices]
             test_peptide_set = set([peptides[i] for i in test_indices])
-
+            print("-- Test peptides = %s" % (list(test_peptide_set),))
             # copy the affinity data for all alleles other than this one
             fold_affinity_dict = {
                 allele_key: affinity_dict
@@ -125,97 +115,126 @@ def evaluate_synthetic_data(
                     allele_to_peptide_to_affinity=fold_affinity_dict,
                     min_weight=0.1)
             this_allele_similarities = allele_similarities[allele]
-            # create a peptide -> list[(allele, affinity, weight)] dictionary
-            # restricted only to the peptides for which we want to test accuracy
-            test_reverse_lookup = {
-                peptide: triplets
-                for (peptide, triplets) in peptide_to_affinities.items()
-                if peptide in test_peptide_set
-            }
-            synthetic_values = synthesize_affinities_for_single_allele(
-                similarities=this_allele_similarities,
-                peptide_to_affinities=test_reverse_lookup,
-                smoothing=smoothing_coef,
-                exponent=exponent,
-                exclude_alleles=[allele])
-
-            synthetic_peptide_set = set(synthetic_values.keys())
-            # set of peptides for which we have both true and synthetic
-            # affinity values
-            combined_peptide_set = synthetic_peptide_set.intersection(
-                test_peptide_set)
 
-            combined_peptide_list = list(sorted(combined_peptide_set))
+            yield (
+                allele,
+                dataset,
+                fold_number,
+                this_allele_similarities,
+                test_peptide_set
+            )
+        if allele.startswith("HLA"):
+            break
 
-            # print("-- allele = %s, n_actual = %d, n_synthetic = %d, intersection = %d" % (
-            #    allele,
-            #    len(dataset),
-            #    len(synthetic_peptide_set),
-            #    len(combined_peptide_list)))
 
-            if len(combined_peptide_list) < 2:
-                continue
+def evaluate_synthetic_data(
+        allele_to_peptide_to_affinity,
+        smoothing_coef,
+        exponent,
+        max_ic50,
+        n_folds=4):
+    """
+    Use cross-validation over entries of each allele to determine the predictive
+    accuracy of data synthesis on known affinities.
+    """
 
-            synthetic_affinity_list = [
-                synthetic_values[peptide]
-                for peptide in combined_peptide_list
-            ]
+    peptide_to_affinities = create_reverse_lookup_from_simple_dicts(
+        allele_to_peptide_to_affinity)
 
-            true_affinity_list = [
-                dataset[peptide]
-                for peptide in combined_peptide_list
-            ]
-            assert len(true_affinity_list) == len(synthetic_affinity_list)
-            if all(x == true_affinity_list[0] for x in true_affinity_list):
-                continue
-
-            tau, _ = stats.kendalltau(
-                synthetic_affinity_list,
-                true_affinity_list)
-            assert not np.isnan(tau)
-            print("==> %s (CV fold %d/%d) tau = %f" % (
-                allele,
-                fold_number + 1,
-                n_folds,
-                tau))
-            cv_taus.append(tau)
-
-            true_ic50s = max_ic50 ** (1.0 - np.array(true_affinity_list))
-            predicted_ic50s = max_ic50 ** (1.0 - np.array(synthetic_affinity_list))
-
-            true_binding_label = true_ic50s <= 500
-            if true_binding_label.all() or not true_binding_label.any():
-                # can't compute AUC or F1 without both negative and positive cases
-                continue
-            auc = sklearn.metrics.roc_auc_score(
-                true_binding_label,
-                synthetic_affinity_list)
-            print("==> %s (CV fold %d/%d) AUC = %f" % (
-                allele,
-                fold_number + 1,
-                n_folds,
-                auc))
-            cv_aucs.append(auc)
-
-            predicted_binding_label = predicted_ic50s <= 500
-            if predicted_binding_label.all() or not predicted_binding_label.any():
-                # can't compute F1 without both positive and negative predictions
-                continue
-            f1_score = sklearn.metrics.f1_score(
-                true_binding_label,
-                predicted_binding_label)
-            print("==> %s (CV fold %d/%d) F1 = %f" % (
-                allele,
-                fold_number + 1,
+    taus = defaultdict(list)
+    f1_scores = defaultdict(list)
+    aucs = defaultdict(list)
+
+    for (allele, allele_dataset, fold_num, fold_sims, fold_test_peptides) in \
+            generate_cross_validation_datasets(
+                allele_to_peptide_to_affinity,
+                n_folds=n_folds):
+        # create a peptide -> list[(allele, affinity, weight)] dictionary
+        # restricted only to the peptides for which we want to test accuracy
+        test_reverse_lookup = {
+            peptide: triplets
+            for (peptide, triplets) in peptide_to_affinities.items()
+            if peptide in fold_test_peptides
+        }
+
+        synthetic_values = synthesize_affinities_for_single_allele(
+            similarities=fold_sims,
+            peptide_to_affinities=test_reverse_lookup,
+            smoothing=smoothing_coef,
+            exponent=exponent,
+            exclude_alleles=[allele])
+
+        synthetic_peptide_set = set(synthetic_values.keys())
+        # set of peptides for which we have both true and synthetic
+        # affinity values
+        combined_peptide_set = synthetic_peptide_set.intersection(
+            fold_test_peptides)
+
+        combined_peptide_list = list(sorted(combined_peptide_set))
+
+        if len(combined_peptide_list) < 2:
+            print("Too few peptides in combined set %s for fold %d/%d of %s" % (
+                combined_peptide_list,
+                fold_num + 1,
                 n_folds,
-                f1_score))
-            cv_f1_scores.append(f1_score)
-        if len(cv_taus) > 0:
-            taus.append(np.mean(cv_taus))
-        if len(cv_aucs) > 0:
-            aucs.append(np.mean(cv_aucs))
-        if len(f1_scores) > 0:
-            f1_scores.append(np.mean(f1_scores))
+                allele))
+            continue
+
+        synthetic_affinity_list = [
+            synthetic_values[peptide]
+            for peptide in combined_peptide_list
+        ]
+
+        true_affinity_list = [
+            allele_to_peptide_to_affinity[allele][peptide]
+            for peptide in combined_peptide_list
+        ]
+        assert len(true_affinity_list) == len(synthetic_affinity_list)
+        if all(x == true_affinity_list[0] for x in true_affinity_list):
+            continue
+
+        tau, _ = stats.kendalltau(
+            synthetic_affinity_list,
+            true_affinity_list)
+        assert not np.isnan(tau)
+        print("==> %s (CV fold %d/%d) tau = %f" % (
+            allele,
+            fold_num + 1,
+            n_folds,
+            tau))
+        taus[allele].append(tau)
+
+        true_ic50s = max_ic50 ** (1.0 - np.array(true_affinity_list))
+        predicted_ic50s = max_ic50 ** (1.0 - np.array(synthetic_affinity_list))
+
+        true_binding_label = true_ic50s <= 500
+        if true_binding_label.all() or not true_binding_label.any():
+            # can't compute AUC or F1 without both negative and positive cases
+            continue
+        auc = sklearn.metrics.roc_auc_score(
+            true_binding_label,
+            synthetic_affinity_list)
+        print("==> %s (CV fold %d/%d) AUC = %f" % (
+            allele,
+            fold_num + 1,
+            n_folds,
+            auc))
+        aucs[allele].append(auc)
+
+        predicted_binding_label = predicted_ic50s <= 500
+        if predicted_binding_label.all() or not predicted_binding_label.any():
+            # can't compute F1 without both positive and negative predictions
+            continue
+        f1_score = sklearn.metrics.f1_score(
+            true_binding_label,
+            predicted_binding_label)
+        print("==> %s (CV fold %d/%d) F1 = %f" % (
+            allele,
+            fold_num + 1,
+            n_folds,
+            f1_score))
+        f1_scores[allele].append(f1_score)
+
     return taus, aucs, f1_scores
 
 
@@ -245,9 +264,15 @@ if __name__ == "__main__":
                 smoothing_coef=smoothing_coef,
                 exponent=exponent,
                 max_ic50=args.max_ic50)
-            median_tau = np.median(taus)
-            median_f1 = np.median(f1_scores)
-            median_auc = np.median(aucs)
+            allele_keys = list(sorted(taus.keys()))
+            tau_values = [taus[allele] for allele in allele_keys]
+            auc_values = [aucs.get(allele, 0.5) for allele in allele_keys]
+            f1_score_values = [
+                f1_scores.get(allele, 0.0) for allele in allele_keys
+            ]
+            median_tau = np.median(tau_values)
+            median_f1 = np.median(f1_score_values)
+            median_auc = np.median(auc_values)
             print(
                 "Exp=%f, Coef=%f, tau=%0.4f, AUC = %0.4f, F1 = %0.4f" % (
                     exponent,
diff --git a/experiments/model-selection.py b/experiments/model-selection.py
index c46d735d..df52f7e3 100755
--- a/experiments/model-selection.py
+++ b/experiments/model-selection.py
@@ -143,6 +143,8 @@ parser.add_argument(
     help="Comma separated list of optimization methods")
 
 
+
+
 if __name__ == "__main__":
     args = parser.parse_args()
     configs = generate_all_model_configs(
diff --git a/experiments/train-models-with-synthetic-pretraining.py b/experiments/train-models-with-synthetic-pretraining.py
index 7f5e6a67..4d3f90e9 100644
--- a/experiments/train-models-with-synthetic-pretraining.py
+++ b/experiments/train-models-with-synthetic-pretraining.py
@@ -159,53 +159,6 @@ def data_augmentation(
     return aucs, f1s, n_originals
 
 
-def evaluate_model(
-        X,
-        Y,
-        weights_synth,
-        weights_actual,
-        dropout,
-        embedding_dim_size,
-        hidden_layer_size):
-    model = mhcflurry.feedforward.make_embedding_network(
-        peptide_length=9,
-        embedding_input_dim=20,
-        embedding_output_dim=4,
-        layer_sizes=[4],
-        activation="tanh",
-        init="lecun_uniform",
-        loss="mse",
-        output_activation="sigmoid",
-        dropout_probability=0.0,
-        optimizer=None,
-        learning_rate=0.001)
-    total_synth_weights = weights_synth.sum()
-    total_actual_weights = weights_actual.sum()
-    for epoch in range(args.training_epochs):
-        # weights for synthetic points can be shrunk as:
-        #  ~ 1 / (1+epoch)**2
-        # or
-        # 2.0 ** -epoch
-        decay_factor = 2.0 ** -epoch
-        # if the contribution of synthetic samples is less than a
-        # thousandth of the actual data, then stop using it
-        synth_contribution = total_synth_weights * decay_factor
-        if synth_contribution < total_actual_weights / 1000:
-            print("Epoch %d, using only actual data" % (epoch + 1,))
-            model.fit(
-                X_actual,
-                Y_actual,
-                sample_weight=weights_actual,
-                nb_epoch=1)
-        else:
-            print("Epoch %d, synth decay factor = %f" % (
-                epoch + 1, decay_factor))
-            weights[n_actual_samples:] = weights_synth * decay_factor
-            model.fit(X, Y, sample_weight=weights, nb_epoch=1)
-        Y_pred = model.predict(X_actual)
-        print("Training MSE %0.4f" % ((Y_actual - Y_pred) ** 2).mean())
-
-
 if __name__ == "__main__":
     args = parser.parse_args()
     print(args)
diff --git a/experiments/training_helpers.py b/experiments/training_helpers.py
index cebfc237..5d1f2d3e 100644
--- a/experiments/training_helpers.py
+++ b/experiments/training_helpers.py
@@ -17,6 +17,7 @@ from collections import OrderedDict
 import numpy as np
 from sklearn.metrics import roc_auc_score
 
+import mhcflurry
 from mhcflurry.common import normalize_allele_name
 from mhcflurry.data_helpers import indices_to_hotshot_encoding
 
diff --git a/mhcflurry/predict.py b/mhcflurry/predict.py
index aa099d62..1aef5dca 100644
--- a/mhcflurry/predict.py
+++ b/mhcflurry/predict.py
@@ -1,24 +1,29 @@
-import pandas as pd
+# Copyright (c) 2015. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
+from collections import OrderedDict
+
+import pandas as pd
 
 from .paths import CLASS1_MODEL_DIRECTORY
 from .mhc1_binding_predictor import Mhc1BindingPredictor
 
 def predict(alleles, peptides):
-    allele_dataframes = []
+    allele_dataframes = OrderedDict([])
     for allele in alleles:
         model = Mhc1BindingPredictor(allele=allele)
-        df = model.predict_peptides(peptides)
+        result_dictionary = model.predict_peptides(peptides)
         allele_dataframes.append(df)
     return pd.concat(allele_dataframes)
-
-def supported_alleles():
-    alleles = []
-    for filename in os.listdir(CLASS1_MODEL_DIRECTORY):
-        allele = filename.replace(".hdf", "")
-        if len(allele) < 5:
-            # skipping serotype names like A2 or B7
-            continue
-        allele = "HLA-%s*%s:%s" % (allele[0], allele[1:3], allele[3:])
-        alleles.append(allele)
-    return alleles
-- 
GitLab