From 5aa591b01ce5862d5e45d93ca8c590c82a11555c Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com> Date: Thu, 17 Dec 2015 14:52:55 -0500 Subject: [PATCH] split up CV code for checking best smoothing coefs for first type of data synthesis --- .../best-synthetic-data-hyperparams.py | 247 ++++++++++-------- experiments/model-selection.py | 2 + ...train-models-with-synthetic-pretraining.py | 47 ---- experiments/training_helpers.py | 1 + mhcflurry/predict.py | 33 ++- 5 files changed, 158 insertions(+), 172 deletions(-) diff --git a/experiments/best-synthetic-data-hyperparams.py b/experiments/best-synthetic-data-hyperparams.py index 0c523e9e..36df4f46 100755 --- a/experiments/best-synthetic-data-hyperparams.py +++ b/experiments/best-synthetic-data-hyperparams.py @@ -25,6 +25,7 @@ averaged across all given alleles. """ import argparse +from collections import defaultdict from mhcflurry.data import load_allele_dicts from scipy import stats @@ -69,43 +70,32 @@ parser.add_argument( help="Affinities are synthesized by adding up y_ip * sim(i,j) ** exponent") -def evaluate_synthetic_data( +def generate_cross_validation_datasets( allele_to_peptide_to_affinity, - smoothing_coef, - exponent, - max_ic50, n_folds=4): - """ - Use cross-validation over entries of each allele to determine the predictive - accuracy of data synthesis on known affinities. - """ - taus = [] - f1_scores = [] - aucs = [] - - peptide_to_affinities = create_reverse_lookup_from_simple_dicts( - allele_to_peptide_to_affinity) - for allele, dataset in allele_to_peptide_to_affinity.items(): + for allele, dataset in sorted(allele_to_peptide_to_affinity.items()): peptides = list(dataset.keys()) affinities = list(dataset.values()) - all_indices = np.arange(len(peptides)) - np.random.shuffle(all_indices) n_samples = len(peptides) - cv_aucs = [] - cv_taus = [] - cv_f1_scores = [] - - if len(peptides) < n_folds: + print("Generating similarities for folds of %s data (n=%d)" % ( + allele, + n_samples)) + + if n_samples < n_folds: + print("Too few samples (%d) for %d-fold cross-validation" % ( + n_samples, + n_folds)) continue kfold_iterator = enumerate( KFold(n_samples, n_folds=n_folds, shuffle=True)) for fold_number, (train_indices, test_indices) in kfold_iterator: + print("-- Fold %d for %s" % (fold_number + 1, allele)) train_peptides = [peptides[i] for i in train_indices] train_affinities = [affinities[i] for i in train_indices] test_peptide_set = set([peptides[i] for i in test_indices]) - + print("-- Test peptides = %s" % (list(test_peptide_set),)) # copy the affinity data for all alleles other than this one fold_affinity_dict = { allele_key: affinity_dict @@ -125,97 +115,126 @@ def evaluate_synthetic_data( allele_to_peptide_to_affinity=fold_affinity_dict, min_weight=0.1) this_allele_similarities = allele_similarities[allele] - # create a peptide -> list[(allele, affinity, weight)] dictionary - # restricted only to the peptides for which we want to test accuracy - test_reverse_lookup = { - peptide: triplets - for (peptide, triplets) in peptide_to_affinities.items() - if peptide in test_peptide_set - } - synthetic_values = synthesize_affinities_for_single_allele( - similarities=this_allele_similarities, - peptide_to_affinities=test_reverse_lookup, - smoothing=smoothing_coef, - exponent=exponent, - exclude_alleles=[allele]) - - synthetic_peptide_set = set(synthetic_values.keys()) - # set of peptides for which we have both true and synthetic - # affinity values - combined_peptide_set = synthetic_peptide_set.intersection( - test_peptide_set) - combined_peptide_list = list(sorted(combined_peptide_set)) + yield ( + allele, + dataset, + fold_number, + this_allele_similarities, + test_peptide_set + ) + if allele.startswith("HLA"): + break - # print("-- allele = %s, n_actual = %d, n_synthetic = %d, intersection = %d" % ( - # allele, - # len(dataset), - # len(synthetic_peptide_set), - # len(combined_peptide_list))) - if len(combined_peptide_list) < 2: - continue +def evaluate_synthetic_data( + allele_to_peptide_to_affinity, + smoothing_coef, + exponent, + max_ic50, + n_folds=4): + """ + Use cross-validation over entries of each allele to determine the predictive + accuracy of data synthesis on known affinities. + """ - synthetic_affinity_list = [ - synthetic_values[peptide] - for peptide in combined_peptide_list - ] + peptide_to_affinities = create_reverse_lookup_from_simple_dicts( + allele_to_peptide_to_affinity) - true_affinity_list = [ - dataset[peptide] - for peptide in combined_peptide_list - ] - assert len(true_affinity_list) == len(synthetic_affinity_list) - if all(x == true_affinity_list[0] for x in true_affinity_list): - continue - - tau, _ = stats.kendalltau( - synthetic_affinity_list, - true_affinity_list) - assert not np.isnan(tau) - print("==> %s (CV fold %d/%d) tau = %f" % ( - allele, - fold_number + 1, - n_folds, - tau)) - cv_taus.append(tau) - - true_ic50s = max_ic50 ** (1.0 - np.array(true_affinity_list)) - predicted_ic50s = max_ic50 ** (1.0 - np.array(synthetic_affinity_list)) - - true_binding_label = true_ic50s <= 500 - if true_binding_label.all() or not true_binding_label.any(): - # can't compute AUC or F1 without both negative and positive cases - continue - auc = sklearn.metrics.roc_auc_score( - true_binding_label, - synthetic_affinity_list) - print("==> %s (CV fold %d/%d) AUC = %f" % ( - allele, - fold_number + 1, - n_folds, - auc)) - cv_aucs.append(auc) - - predicted_binding_label = predicted_ic50s <= 500 - if predicted_binding_label.all() or not predicted_binding_label.any(): - # can't compute F1 without both positive and negative predictions - continue - f1_score = sklearn.metrics.f1_score( - true_binding_label, - predicted_binding_label) - print("==> %s (CV fold %d/%d) F1 = %f" % ( - allele, - fold_number + 1, + taus = defaultdict(list) + f1_scores = defaultdict(list) + aucs = defaultdict(list) + + for (allele, allele_dataset, fold_num, fold_sims, fold_test_peptides) in \ + generate_cross_validation_datasets( + allele_to_peptide_to_affinity, + n_folds=n_folds): + # create a peptide -> list[(allele, affinity, weight)] dictionary + # restricted only to the peptides for which we want to test accuracy + test_reverse_lookup = { + peptide: triplets + for (peptide, triplets) in peptide_to_affinities.items() + if peptide in fold_test_peptides + } + + synthetic_values = synthesize_affinities_for_single_allele( + similarities=fold_sims, + peptide_to_affinities=test_reverse_lookup, + smoothing=smoothing_coef, + exponent=exponent, + exclude_alleles=[allele]) + + synthetic_peptide_set = set(synthetic_values.keys()) + # set of peptides for which we have both true and synthetic + # affinity values + combined_peptide_set = synthetic_peptide_set.intersection( + fold_test_peptides) + + combined_peptide_list = list(sorted(combined_peptide_set)) + + if len(combined_peptide_list) < 2: + print("Too few peptides in combined set %s for fold %d/%d of %s" % ( + combined_peptide_list, + fold_num + 1, n_folds, - f1_score)) - cv_f1_scores.append(f1_score) - if len(cv_taus) > 0: - taus.append(np.mean(cv_taus)) - if len(cv_aucs) > 0: - aucs.append(np.mean(cv_aucs)) - if len(f1_scores) > 0: - f1_scores.append(np.mean(f1_scores)) + allele)) + continue + + synthetic_affinity_list = [ + synthetic_values[peptide] + for peptide in combined_peptide_list + ] + + true_affinity_list = [ + allele_to_peptide_to_affinity[allele][peptide] + for peptide in combined_peptide_list + ] + assert len(true_affinity_list) == len(synthetic_affinity_list) + if all(x == true_affinity_list[0] for x in true_affinity_list): + continue + + tau, _ = stats.kendalltau( + synthetic_affinity_list, + true_affinity_list) + assert not np.isnan(tau) + print("==> %s (CV fold %d/%d) tau = %f" % ( + allele, + fold_num + 1, + n_folds, + tau)) + taus[allele].append(tau) + + true_ic50s = max_ic50 ** (1.0 - np.array(true_affinity_list)) + predicted_ic50s = max_ic50 ** (1.0 - np.array(synthetic_affinity_list)) + + true_binding_label = true_ic50s <= 500 + if true_binding_label.all() or not true_binding_label.any(): + # can't compute AUC or F1 without both negative and positive cases + continue + auc = sklearn.metrics.roc_auc_score( + true_binding_label, + synthetic_affinity_list) + print("==> %s (CV fold %d/%d) AUC = %f" % ( + allele, + fold_num + 1, + n_folds, + auc)) + aucs[allele].append(auc) + + predicted_binding_label = predicted_ic50s <= 500 + if predicted_binding_label.all() or not predicted_binding_label.any(): + # can't compute F1 without both positive and negative predictions + continue + f1_score = sklearn.metrics.f1_score( + true_binding_label, + predicted_binding_label) + print("==> %s (CV fold %d/%d) F1 = %f" % ( + allele, + fold_num + 1, + n_folds, + f1_score)) + f1_scores[allele].append(f1_score) + return taus, aucs, f1_scores @@ -245,9 +264,15 @@ if __name__ == "__main__": smoothing_coef=smoothing_coef, exponent=exponent, max_ic50=args.max_ic50) - median_tau = np.median(taus) - median_f1 = np.median(f1_scores) - median_auc = np.median(aucs) + allele_keys = list(sorted(taus.keys())) + tau_values = [taus[allele] for allele in allele_keys] + auc_values = [aucs.get(allele, 0.5) for allele in allele_keys] + f1_score_values = [ + f1_scores.get(allele, 0.0) for allele in allele_keys + ] + median_tau = np.median(tau_values) + median_f1 = np.median(f1_score_values) + median_auc = np.median(auc_values) print( "Exp=%f, Coef=%f, tau=%0.4f, AUC = %0.4f, F1 = %0.4f" % ( exponent, diff --git a/experiments/model-selection.py b/experiments/model-selection.py index c46d735d..df52f7e3 100755 --- a/experiments/model-selection.py +++ b/experiments/model-selection.py @@ -143,6 +143,8 @@ parser.add_argument( help="Comma separated list of optimization methods") + + if __name__ == "__main__": args = parser.parse_args() configs = generate_all_model_configs( diff --git a/experiments/train-models-with-synthetic-pretraining.py b/experiments/train-models-with-synthetic-pretraining.py index 7f5e6a67..4d3f90e9 100644 --- a/experiments/train-models-with-synthetic-pretraining.py +++ b/experiments/train-models-with-synthetic-pretraining.py @@ -159,53 +159,6 @@ def data_augmentation( return aucs, f1s, n_originals -def evaluate_model( - X, - Y, - weights_synth, - weights_actual, - dropout, - embedding_dim_size, - hidden_layer_size): - model = mhcflurry.feedforward.make_embedding_network( - peptide_length=9, - embedding_input_dim=20, - embedding_output_dim=4, - layer_sizes=[4], - activation="tanh", - init="lecun_uniform", - loss="mse", - output_activation="sigmoid", - dropout_probability=0.0, - optimizer=None, - learning_rate=0.001) - total_synth_weights = weights_synth.sum() - total_actual_weights = weights_actual.sum() - for epoch in range(args.training_epochs): - # weights for synthetic points can be shrunk as: - # ~ 1 / (1+epoch)**2 - # or - # 2.0 ** -epoch - decay_factor = 2.0 ** -epoch - # if the contribution of synthetic samples is less than a - # thousandth of the actual data, then stop using it - synth_contribution = total_synth_weights * decay_factor - if synth_contribution < total_actual_weights / 1000: - print("Epoch %d, using only actual data" % (epoch + 1,)) - model.fit( - X_actual, - Y_actual, - sample_weight=weights_actual, - nb_epoch=1) - else: - print("Epoch %d, synth decay factor = %f" % ( - epoch + 1, decay_factor)) - weights[n_actual_samples:] = weights_synth * decay_factor - model.fit(X, Y, sample_weight=weights, nb_epoch=1) - Y_pred = model.predict(X_actual) - print("Training MSE %0.4f" % ((Y_actual - Y_pred) ** 2).mean()) - - if __name__ == "__main__": args = parser.parse_args() print(args) diff --git a/experiments/training_helpers.py b/experiments/training_helpers.py index cebfc237..5d1f2d3e 100644 --- a/experiments/training_helpers.py +++ b/experiments/training_helpers.py @@ -17,6 +17,7 @@ from collections import OrderedDict import numpy as np from sklearn.metrics import roc_auc_score +import mhcflurry from mhcflurry.common import normalize_allele_name from mhcflurry.data_helpers import indices_to_hotshot_encoding diff --git a/mhcflurry/predict.py b/mhcflurry/predict.py index aa099d62..1aef5dca 100644 --- a/mhcflurry/predict.py +++ b/mhcflurry/predict.py @@ -1,24 +1,29 @@ -import pandas as pd +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import os +from collections import OrderedDict + +import pandas as pd from .paths import CLASS1_MODEL_DIRECTORY from .mhc1_binding_predictor import Mhc1BindingPredictor def predict(alleles, peptides): - allele_dataframes = [] + allele_dataframes = OrderedDict([]) for allele in alleles: model = Mhc1BindingPredictor(allele=allele) - df = model.predict_peptides(peptides) + result_dictionary = model.predict_peptides(peptides) allele_dataframes.append(df) return pd.concat(allele_dataframes) - -def supported_alleles(): - alleles = [] - for filename in os.listdir(CLASS1_MODEL_DIRECTORY): - allele = filename.replace(".hdf", "") - if len(allele) < 5: - # skipping serotype names like A2 or B7 - continue - allele = "HLA-%s*%s:%s" % (allele[0], allele[1:3], allele[3:]) - alleles.append(allele) - return alleles -- GitLab