From e3e6f5eb9ce86dae8d94062b21b105cee192f513 Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Mon, 21 Dec 2015 12:39:55 -0500
Subject: [PATCH] pulling together more bits of isolated training scripts into
 helpers file

---
 ...train-models-with-synthetic-pretraining.py |  93 ++++++------
 experiments/training_helpers.py               | 135 +++++++++++++-----
 2 files changed, 144 insertions(+), 84 deletions(-)

diff --git a/experiments/train-models-with-synthetic-pretraining.py b/experiments/train-models-with-synthetic-pretraining.py
index 4d3f90e9..c21a9caa 100644
--- a/experiments/train-models-with-synthetic-pretraining.py
+++ b/experiments/train-models-with-synthetic-pretraining.py
@@ -29,6 +29,7 @@ from mhcflurry.data import (
 from arg_parsing import parse_int_list, parse_float_list
 from dataset_paths import PETERS2009_CSV_PATH
 from common import load_csv_binding_data_as_dict
+from training_helpers import create_and_evaluate_model_with_synthetic_data
 
 parser = argparse.ArgumentParser()
 
@@ -159,6 +160,25 @@ def data_augmentation(
     return aucs, f1s, n_originals
 
 
+def rescale_ic50(ic50, max_ic50):
+    log_ic50 = np.log(ic50) / np.log(max_ic50)
+    return max(0.0, min(1.0, 1.0 - log_ic50))
+
+
+def load_synthetic_data(csv_path, max_ic50):
+    synthetic_allele_to_peptide_to_ic50_dict = load_csv_binding_data_as_dict(
+        csv_path)
+    return {
+        allele: {
+            peptide: rescale_ic50(ic50, max_ic50=max_ic50)
+            for (peptide, ic50)
+            in allele_dict.items()
+        }
+        for (allele, allele_dict)
+        in synthetic_allele_to_peptide_to_ic50_dict.items()
+    }
+
+
 if __name__ == "__main__":
     args = parser.parse_args()
     print(args)
@@ -169,70 +189,39 @@ if __name__ == "__main__":
         max_ic50=args.max_ic50,
         only_human=False)
     print("Loading synthetic data from %s" % args.synthetic_data_csv)
-    synthetic_allele_to_peptide_to_ic50_dict = load_csv_binding_data_as_dict(
-        args.synthetic_data_csv)
-    synthetic_allele_to_peptide_to_y_dict = {
-        allele: {
-            peptide: max(
-                0.0,
-                min(
-                    1.0,
-                    1.0 - np.log(ic50) / np.log(args.max_ic50)))
-            for (peptide, ic50)
-            in allele_dict.items()
-        }
-        for (allele, allele_dict)
-        in synthetic_allele_to_peptide_to_ic50_dict.items()
-    }
+
+    synthetic_affinities = load_synthetic_data(
+        csv_path=args.synthetic_data_csv,
+        max_ic50=args.max_ic50)
 
     combined_allele_set = set(allele_datasets.keys()).union(
-        synthetic_allele_to_peptide_to_y_dict.keys())
+        synthetic_affinities.keys())
+
     combined_allele_list = list(sorted(combined_allele_set))
     for allele in combined_allele_list:
-        actual_dataset = allele_datasets[allele]
-        X_actual = actual_dataset.X_index
-        weights_actual = actual_dataset.weights
-        Y_actual = actual_dataset.Y
-
-        synthetic_dict = synthetic_allele_to_peptide_to_y_dict[allele]
-
-        _, _, C_synth, X_synth, _, Y_synth = encode_peptide_to_affinity_dict(
-            synthetic_dict)
-
-        n_actual_samples, n_actual_dims = X_actual.shape
-        n_synth_samples, n_synth_dims = X_synth.shape
-        assert n_actual_dims == n_synth_dims, \
-            "Mismatch between # of actual dims %d and synthetic dims %d" % (
-                n_actual_dims, n_synth_dims)
-        print("-- Using %d actual samples and %d synthetic samples for %s" % (
-            n_actual_samples, n_synth_samples, allele))
-        X = np.vstack([X_actual, X_synth])
-        print("-- X.shape = %s, dtype = %s" % (X.shape, X.dtype))
-        n_samples = n_actual_samples + n_synth_samples
-        assert X.shape[0] == n_samples, \
-            "Expected %d samples but got data array with shape %s" % (
-                n_actual_samples + n_synth_samples, X.shape)
-        Y = np.concatenate([Y_actual, Y_synth])
-        print("-- Y.shape = %s, dtype = %s" % (Y.shape, Y.dtype))
-        assert Y.min() >= 0, \
-            "Y should not contain negative numbers! Y.min() = %f" % (Y.min(),)
-        assert Y.max() <= 1, \
-            "Y should have max value 1.0, got Y.max() = %f" % (Y.max(),)
-        weights_synth = 1.0 / C_synth
-        weights = np.concatenate([weights_actual, weights_synth])
-        assert len(weights) == n_samples
-        print("-- weights.shape = %s, dtype = %s" % (
-            weights.shape, weights.dtype))
+        synthetic_allele_dict = synthetic_affinities[allele]
+        (_, _, Counts_synth, X_synth, _, Y_synth) = \
+            encode_peptide_to_affinity_dict(synthetic_allele_dict)
+        synthetic_sample_weights = 1.0 / Counts_synth
         scores = {}
         for dropout in args.dropouts:
             for embedding_dim_size in args.embedding_dim_sizes:
                 for hidden_layer_size in args.hidden_layer_sizes:
                     params = (
-                        ("dropout", dropout),
+                        ("dropout_probability", dropout),
                         ("embedding_dim_size", embedding_dim_size),
                         ("hidden_layer_size", hidden_layer_size),
                     )
-                    tau, auc, f1 = evaluate_model(**dict(params))
+                    tau, auc, f1 = create_and_evaluate_model_with_synthetic_data(
+                        X_original=allele_datasets[allele].X_index,
+                        Y_original=allele_datasets[allele].Y,
+                        X_synth=X_synth,
+                        Y_synth=Y_synth,
+                        original_sample_weights=allele_datasets[allele].weights,
+                        synthetic_sample_weights=synthetic_sample_weights,
+                        n_training_epochs=150,
+                        max_ic50=args.max_ic50,
+                        **dict(params))
                     scores[params] = (tau, auc, f1)
                     print("%s => tau=%f, AUC=%f, F1=%f" % (
                         params,
diff --git a/experiments/training_helpers.py b/experiments/training_helpers.py
index 5d1f2d3e..bc15780e 100644
--- a/experiments/training_helpers.py
+++ b/experiments/training_helpers.py
@@ -128,29 +128,51 @@ def train_model_and_return_scores(
     return (accuracy, auc, f1_score)
 
 
-def create_and_evaluate_model_with_synthetic_data(
-        X,
-        Y,
-        weights_synth,
-        weights_actual,
+def train_model_with_synthetic_data(
+        model,
         n_training_epochs,
-        dropout,
-        embedding_dim_size,
-        hidden_layer_size):
-    model = mhcflurry.feedforward.make_embedding_network(
-        peptide_length=9,
-        embedding_input_dim=20,
-        embedding_output_dim=4,
-        layer_sizes=[4],
-        activation="tanh",
-        init="lecun_uniform",
-        loss="mse",
-        output_activation="sigmoid",
-        dropout_probability=0.0,
-        optimizer=None,
-        learning_rate=0.001)
-    total_synth_weights = weights_synth.sum()
-    total_actual_weights = weights_actual.sum()
+        max_ic50,
+        X_original,
+        Y_original,
+        X_synth,
+        Y_synth,
+        original_sample_weights,
+        synthetic_sample_weights):
+    total_synth_weights = synthetic_sample_weights.sum()
+    total_original_weights = original_sample_weights.sum()
+    print("Mean Y=%f, Y_synth=%f, weight=%f, weight_synth=%f" % (
+        np.mean(Y_original),
+        np.mean(Y_synth),
+        np.mean(original_sample_weights),
+        np.mean(synthetic_sample_weights)))
+    combined_weights = np.concatenate([
+        original_sample_weights,
+        synthetic_sample_weights
+    ])
+    n_actual_samples, n_actual_dims = X_original.shape
+    n_synth_samples, n_synth_dims = X_synth.shape
+    assert n_actual_dims == n_synth_dims, \
+        "Mismatch between # of actual dims %d and synthetic dims %d" % (
+            n_actual_dims, n_synth_dims)
+    X_combined = np.vstack([X_original, X_synth])
+    n_combined_samples = n_actual_samples + n_synth_samples
+    assert X_combined.shape[0] == n_combined_samples, \
+        "Expected %d samples but got data array with shape %s" % (
+            n_actual_samples + n_synth_samples, X_combined.shape)
+    Y_combined = np.concatenate([Y_original, Y_synth])
+    assert Y_combined.min() >= 0, \
+        "Y should not contain negative numbers! Y.min() = %f" % (
+            Y_combined.min(),)
+    assert Y_combined.max() <= 1, \
+        "Y should have max value 1.0, got Y.max() = %f" % (
+            Y_combined.max(),)
+    combined_weights = np.concatenate([
+        original_sample_weights,
+        synthetic_sample_weights
+    ])
+
+    assert len(combined_weights) == n_combined_samples
+
     for epoch in range(n_training_epochs):
         # weights for synthetic points can be shrunk as:
         #  ~ 1 / (1+epoch)**2
@@ -160,20 +182,69 @@ def create_and_evaluate_model_with_synthetic_data(
         # if the contribution of synthetic samples is less than a
         # thousandth of the actual data, then stop using it
         synth_contribution = total_synth_weights * decay_factor
-        if synth_contribution < total_actual_weights / 1000:
+        if synth_contribution < total_original_weights / 1000:
             print("Epoch %d, using only actual data" % (epoch + 1,))
             model.fit(
-                X_actual,
-                Y_actual,
-                sample_weight=weights_actual,
-                nb_epoch=1)
+                X_original,
+                Y_original,
+                sample_weight=original_sample_weights,
+                nb_epoch=1,
+                verbose=0)
         else:
             print("Epoch %d, synth decay factor = %f" % (
                 epoch + 1, decay_factor))
-            weights[n_actual_samples:] = weights_synth * decay_factor
-            model.fit(X, Y, sample_weight=weights, nb_epoch=1)
-        Y_pred = model.predict(X_actual)
-        print("Training MSE %0.4f" % ((Y_actual - Y_pred) ** 2).mean())
-
+            combined_weights[n_actual_samples:] = (
+                synthetic_sample_weights * decay_factor)
+            model.fit(
+                X_combined,
+                Y_combined,
+                sample_weight=combined_weights,
+                nb_epoch=1,
+                verbose=0)
+        Y_pred = model.predict(X_original)
+        training_mse = ((Y_original - Y_pred) ** 2).mean()
+        print(
+            "-- Epoch %d/%d Training MSE %0.4f" % (
+                epoch + 1,
+                n_training_epochs,
+                training_mse))
+
+
+def create_and_evaluate_model_with_synthetic_data(
+        X_original,
+        Y_original,
+        X_synth,
+        Y_synth,
+        original_sample_weights=None,
+        synthetic_sample_weights=None,
+        n_training_epochs=150,
+        embedding_dim_size=16,
+        hidden_layer_size=50,
+        dropout_probability=0.0,
+        max_ic50=50000.0):
+    if original_sample_weights is None:
+        original_sample_weights = np.ones(len(X_original), dtype=float)
+    if synthetic_sample_weights is None:
+        synthetic_sample_weights = np.ones(len(X_synth), dtype=float)
+    model = mhcflurry.feedforward.make_embedding_network(
+        peptide_length=9,
+        embedding_input_dim=20,
+        embedding_output_dim=embedding_dim_size,
+        layer_sizes=[hidden_layer_size],
+        activation="tanh",
+        init="lecun_uniform",
+        loss="mse",
+        output_activation="sigmoid",
+        dropout_probability=dropout_probability,
+        optimizer=None,
+        learning_rate=0.001)
+    train_model_with_synthetic_data(
+        model=model,
+        n_training_epochs=n_training_epochs,
+        max_ic50=max_ic50,
+        X_original=X_original,
+        Y_original=Y_original,
+        X_synth=X_synth,
+        Y_synth=Y_synth,
+        original_sample_weights=original_sample_weights,
+        synthetic_sample_weights=synthetic_sample_weights)

--
GitLab
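Note on the patch above: the per-epoch down-weighting of synthetic samples in train_model_with_synthetic_data is the core of this refactor. The decay_factor assignment itself falls outside the hunks shown; assuming it follows the comment visible in the hunk context (weights shrink as ~ 1 / (1 + epoch) ** 2), the standalone Python sketch below illustrates the schedule and the cutoff used by the "using only actual data" branch. It is illustrative only, not part of the patch, and the weight arrays are made-up placeholders.

    import numpy as np

    # Made-up example: 3 synthetic samples, 100 real samples.
    synthetic_sample_weights = np.array([0.5, 0.25, 1.0])
    original_sample_weights = np.ones(100)

    total_synth_weights = synthetic_sample_weights.sum()
    total_original_weights = original_sample_weights.sum()

    for epoch in range(10):
        # Assumed schedule, per the comment in the patch:
        # synthetic weights shrink as ~ 1 / (1 + epoch) ** 2.
        decay_factor = 1.0 / (1 + epoch) ** 2
        synth_contribution = total_synth_weights * decay_factor
        # Mirrors the patch: once synthetic data contributes less than a
        # thousandth of the real data's total weight, train on real data only.
        use_synth = synth_contribution >= total_original_weights / 1000
        print("epoch=%2d decay=%.4f synth_contribution=%.4f use_synth=%s" % (
            epoch + 1, decay_factor, synth_contribution, use_synth))

With these example numbers the synthetic samples stop contributing from epoch 5 onward, at which point the training loop in the patch switches to fitting on the original measurements alone.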