split up CV code for checking best smoothing coefs for first type of data synthesis

5aa591b0 · Alex Rubinsteyn · 2142e440 · 5aa591b0 · 5aa591b0 · 5aa591b0
Commit 5aa591b0 authored 9 years ago by Alex Rubinsteyn
--- a/experiments/best-synthetic-data-hyperparams.py
+++ b/experiments/best-synthetic-data-hyperparams.py
@@ -25,6 +25,7 @@ averaged across all given alleles.
 """
 import argparse
+from collections import defaultdict
 from mhcflurry.data import load_allele_dicts
 from scipy import stats
@@ -69,43 +70,32 @@ parser.add_argument(
    help="Affinities are synthesized by adding up y_ip * sim(i,j) ** exponent")
-def evaluate_synthetic_data(
+def generate_cross_validation_datasets(
        allele_to_peptide_to_affinity,
-        smoothing_coef,
-        exponent,
-        max_ic50,
        n_folds=4):
-    """
+    for allele, dataset in sorted(allele_to_peptide_to_affinity.items()):
-    Use cross-validation over entries of each allele to determine the predictive
-    accuracy of data synthesis on known affinities.
-    """
-    taus = []
-    f1_scores = []
-    aucs = []
-    peptide_to_affinities = create_reverse_lookup_from_simple_dicts(
-        allele_to_peptide_to_affinity)
-    for allele, dataset in allele_to_peptide_to_affinity.items():
        peptides = list(dataset.keys())
        affinities = list(dataset.values())
-        all_indices = np.arange(len(peptides))
-        np.random.shuffle(all_indices)
        n_samples = len(peptides)
-        cv_aucs = []
+        print("Generating similarities for folds of %s data (n=%d)" % (
-        cv_taus = []
+            allele,
-        cv_f1_scores = []
+            n_samples))
-        if len(peptides) < n_folds:
+        if n_samples < n_folds:
+            print("Too few samples (%d) for %d-fold cross-validation" % (
+                n_samples,
+                n_folds))
            continue
        kfold_iterator = enumerate(
            KFold(n_samples, n_folds=n_folds, shuffle=True))
        for fold_number, (train_indices, test_indices) in kfold_iterator:
+            print("-- Fold %d for %s" % (fold_number + 1, allele))
            train_peptides = [peptides[i] for i in train_indices]
            train_affinities = [affinities[i] for i in train_indices]
            test_peptide_set = set([peptides[i] for i in test_indices])
+            print("-- Test peptides = %s" % (list(test_peptide_set),))
            # copy the affinity data for all alleles other than this one
            fold_affinity_dict = {
                allele_key: affinity_dict
@@ -125,97 +115,126 @@ def evaluate_synthetic_data(
                    allele_to_peptide_to_affinity=fold_affinity_dict,
                    min_weight=0.1)
            this_allele_similarities = allele_similarities[allele]
-            # create a peptide -> list[(allele, affinity, weight)] dictionary
-            # restricted only to the peptides for which we want to test accuracy
-            test_reverse_lookup = {
-                peptide: triplets
-                for (peptide, triplets) in peptide_to_affinities.items()
-                if peptide in test_peptide_set
-            }
-            synthetic_values = synthesize_affinities_for_single_allele(
-                similarities=this_allele_similarities,
-                peptide_to_affinities=test_reverse_lookup,
-                smoothing=smoothing_coef,
-                exponent=exponent,
-                exclude_alleles=[allele])
-            synthetic_peptide_set = set(synthetic_values.keys())
-            # set of peptides for which we have both true and synthetic
-            # affinity values
-            combined_peptide_set = synthetic_peptide_set.intersection(
-                test_peptide_set)
-            combined_peptide_list = list(sorted(combined_peptide_set))
+            yield (
+                allele,
+                dataset,
+                fold_number,
+                this_allele_similarities,
+                test_peptide_set
+            )
+        if allele.startswith("HLA"):
+            break
-            # print("-- allele = %s, n_actual = %d, n_synthetic = %d, intersection = %d" % (
-            #    allele,
-            #    len(dataset),
-            #    len(synthetic_peptide_set),
-            #    len(combined_peptide_list)))
-            if len(combined_peptide_list) < 2:
+def evaluate_synthetic_data(
-                continue
+        allele_to_peptide_to_affinity,
+        smoothing_coef,
+        exponent,
+        max_ic50,
+        n_folds=4):
+    """
+    Use cross-validation over entries of each allele to determine the predictive
+    accuracy of data synthesis on known affinities.
+    """
-            synthetic_affinity_list = [
+    peptide_to_affinities = create_reverse_lookup_from_simple_dicts(
-                synthetic_values[peptide]
+        allele_to_peptide_to_affinity)
-                for peptide in combined_peptide_list
-            ]
-            true_affinity_list = [
+    taus = defaultdict(list)
-                dataset[peptide]
+    f1_scores = defaultdict(list)
-                for peptide in combined_peptide_list
+    aucs = defaultdict(list)
-            ]
-            assert len(true_affinity_list) == len(synthetic_affinity_list)
+    for (allele, allele_dataset, fold_num, fold_sims, fold_test_peptides) in \
-            if all(x == true_affinity_list[0] for x in true_affinity_list):
+            generate_cross_validation_datasets(
-                continue
+                allele_to_peptide_to_affinity,
+                n_folds=n_folds):
-            tau, _ = stats.kendalltau(
+        # create a peptide -> list[(allele, affinity, weight)] dictionary
-                synthetic_affinity_list,
+        # restricted only to the peptides for which we want to test accuracy
-                true_affinity_list)
+        test_reverse_lookup = {
-            assert not np.isnan(tau)
+            peptide: triplets
-            print("==> %s (CV fold %d/%d) tau = %f" % (
+            for (peptide, triplets) in peptide_to_affinities.items()
-                allele,
+            if peptide in fold_test_peptides
-                fold_number + 1,
+        }
-                n_folds,
-                tau))
+        synthetic_values = synthesize_affinities_for_single_allele(
-            cv_taus.append(tau)
+            similarities=fold_sims,
+            peptide_to_affinities=test_reverse_lookup,
-            true_ic50s = max_ic50 ** (1.0 - np.array(true_affinity_list))
+            smoothing=smoothing_coef,
-            predicted_ic50s = max_ic50 ** (1.0 - np.array(synthetic_affinity_list))
+            exponent=exponent,
+            exclude_alleles=[allele])
-            true_binding_label = true_ic50s <= 500
-            if true_binding_label.all() or not true_binding_label.any():
+        synthetic_peptide_set = set(synthetic_values.keys())
-                # can't compute AUC or F1 without both negative and positive cases
+        # set of peptides for which we have both true and synthetic
-                continue
+        # affinity values
-            auc = sklearn.metrics.roc_auc_score(
+        combined_peptide_set = synthetic_peptide_set.intersection(
-                true_binding_label,
+            fold_test_peptides)
-                synthetic_affinity_list)
-            print("==> %s (CV fold %d/%d) AUC = %f" % (
+        combined_peptide_list = list(sorted(combined_peptide_set))
-                allele,
-                fold_number + 1,
+        if len(combined_peptide_list) < 2:
-                n_folds,
+            print("Too few peptides in combined set %s for fold %d/%d of %s" % (
-                auc))
+                combined_peptide_list,
-            cv_aucs.append(auc)
+                fold_num + 1,
-            predicted_binding_label = predicted_ic50s <= 500
-            if predicted_binding_label.all() or not predicted_binding_label.any():
-                # can't compute F1 without both positive and negative predictions
-                continue
-            f1_score = sklearn.metrics.f1_score(
-                true_binding_label,
-                predicted_binding_label)
-            print("==> %s (CV fold %d/%d) F1 = %f" % (
-                allele,
-                fold_number + 1,
                n_folds,
-                f1_score))
+                allele))
-            cv_f1_scores.append(f1_score)
+            continue
-        if len(cv_taus) > 0:
-            taus.append(np.mean(cv_taus))
+        synthetic_affinity_list = [
-        if len(cv_aucs) > 0:
+            synthetic_values[peptide]
-            aucs.append(np.mean(cv_aucs))
+            for peptide in combined_peptide_list
-        if len(f1_scores) > 0:
+        ]
-            f1_scores.append(np.mean(f1_scores))
+        true_affinity_list = [
+            allele_to_peptide_to_affinity[allele][peptide]
+            for peptide in combined_peptide_list
+        ]
+        assert len(true_affinity_list) == len(synthetic_affinity_list)
+        if all(x == true_affinity_list[0] for x in true_affinity_list):
+            continue
+        tau, _ = stats.kendalltau(
+            synthetic_affinity_list,
+            true_affinity_list)
+        assert not np.isnan(tau)
+        print("==> %s (CV fold %d/%d) tau = %f" % (
+            allele,
+            fold_num + 1,
+            n_folds,
+            tau))
+        taus[allele].append(tau)
+        true_ic50s = max_ic50 ** (1.0 - np.array(true_affinity_list))
+        predicted_ic50s = max_ic50 ** (1.0 - np.array(synthetic_affinity_list))
+        true_binding_label = true_ic50s <= 500
+        if true_binding_label.all() or not true_binding_label.any():
+            # can't compute AUC or F1 without both negative and positive cases
+            continue
+        auc = sklearn.metrics.roc_auc_score(
+            true_binding_label,
+            synthetic_affinity_list)
+        print("==> %s (CV fold %d/%d) AUC = %f" % (
+            allele,
+            fold_num + 1,
+            n_folds,
+            auc))
+        aucs[allele].append(auc)
+        predicted_binding_label = predicted_ic50s <= 500
+        if predicted_binding_label.all() or not predicted_binding_label.any():
+            # can't compute F1 without both positive and negative predictions
+            continue
+        f1_score = sklearn.metrics.f1_score(
+            true_binding_label,
+            predicted_binding_label)
+        print("==> %s (CV fold %d/%d) F1 = %f" % (
+            allele,
+            fold_num + 1,
+            n_folds,
+            f1_score))
+        f1_scores[allele].append(f1_score)
    return taus, aucs, f1_scores
@@ -245,9 +264,15 @@ if __name__ == "__main__":
                smoothing_coef=smoothing_coef,
                exponent=exponent,
                max_ic50=args.max_ic50)
-            median_tau = np.median(taus)
+            allele_keys = list(sorted(taus.keys()))
-            median_f1 = np.median(f1_scores)
+            tau_values = [taus[allele] for allele in allele_keys]
-            median_auc = np.median(aucs)
+            auc_values = [aucs.get(allele, 0.5) for allele in allele_keys]
+            f1_score_values = [
+                f1_scores.get(allele, 0.0) for allele in allele_keys
+            ]
+            median_tau = np.median(tau_values)
+            median_f1 = np.median(f1_score_values)
+            median_auc = np.median(auc_values)
            print(
                "Exp=%f, Coef=%f, tau=%0.4f, AUC = %0.4f, F1 = %0.4f" % (
                    exponent,

--- a/experiments/model-selection.py
+++ b/experiments/model-selection.py
@@ -143,6 +143,8 @@ parser.add_argument(
    help="Comma separated list of optimization methods")
 if __name__ == "__main__":
    args = parser.parse_args()
    configs = generate_all_model_configs(

--- a/experiments/train-models-with-synthetic-pretraining.py
+++ b/experiments/train-models-with-synthetic-pretraining.py
@@ -159,53 +159,6 @@ def data_augmentation(
    return aucs, f1s, n_originals
-def evaluate_model(
-        X,
-        Y,
-        weights_synth,
-        weights_actual,
-        dropout,
-        embedding_dim_size,
-        hidden_layer_size):
-    model = mhcflurry.feedforward.make_embedding_network(
-        peptide_length=9,
-        embedding_input_dim=20,
-        embedding_output_dim=4,
-        layer_sizes=[4],
-        activation="tanh",
-        init="lecun_uniform",
-        loss="mse",
-        output_activation="sigmoid",
-        dropout_probability=0.0,
-        optimizer=None,
-        learning_rate=0.001)
-    total_synth_weights = weights_synth.sum()
-    total_actual_weights = weights_actual.sum()
-    for epoch in range(args.training_epochs):
-        # weights for synthetic points can be shrunk as:
-        #  ~ 1 / (1+epoch)**2
-        # or
-        # 2.0 ** -epoch
-        decay_factor = 2.0 ** -epoch
-        # if the contribution of synthetic samples is less than a
-        # thousandth of the actual data, then stop using it
-        synth_contribution = total_synth_weights * decay_factor
-        if synth_contribution < total_actual_weights / 1000:
-            print("Epoch %d, using only actual data" % (epoch + 1,))
-            model.fit(
-                X_actual,
-                Y_actual,
-                sample_weight=weights_actual,
-                nb_epoch=1)
-        else:
-            print("Epoch %d, synth decay factor = %f" % (
-                epoch + 1, decay_factor))
-            weights[n_actual_samples:] = weights_synth * decay_factor
-            model.fit(X, Y, sample_weight=weights, nb_epoch=1)
-        Y_pred = model.predict(X_actual)
-        print("Training MSE %0.4f" % ((Y_actual - Y_pred) ** 2).mean())
 if __name__ == "__main__":
    args = parser.parse_args()
    print(args)

--- a/experiments/training_helpers.py
+++ b/experiments/training_helpers.py
@@ -17,6 +17,7 @@ from collections import OrderedDict
 import numpy as np
 from sklearn.metrics import roc_auc_score
+import mhcflurry
 from mhcflurry.common import normalize_allele_name
 from mhcflurry.data_helpers import indices_to_hotshot_encoding

--- a/mhcflurry/predict.py
+++ b/mhcflurry/predict.py
-import pandas as pd
+# Copyright (c) 2015. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import os
+from collections import OrderedDict
+import pandas as pd
 from .paths import CLASS1_MODEL_DIRECTORY
 from .mhc1_binding_predictor import Mhc1BindingPredictor
 def predict(alleles, peptides):
-    allele_dataframes = []
+    allele_dataframes = OrderedDict([])
    for allele in alleles:
        model = Mhc1BindingPredictor(allele=allele)
-        df = model.predict_peptides(peptides)
+        result_dictionary = model.predict_peptides(peptides)
        allele_dataframes.append(df)
    return pd.concat(allele_dataframes)
-def supported_alleles():
-    alleles = []
-    for filename in os.listdir(CLASS1_MODEL_DIRECTORY):
-        allele = filename.replace(".hdf", "")
-        if len(allele) < 5:
-            # skipping serotype names like A2 or B7
-            continue
-        allele = "HLA-%s*%s:%s" % (allele[0], allele[1:3], allele[3:])
-        alleles.append(allele)
-    return alleles