From 8308c397e6bbc86d64838b0e45a1a413ed16bb0b Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com> Date: Wed, 3 Feb 2016 17:32:52 -0500 Subject: [PATCH] skip alleles with less than n_folds positive examples --- ...matrix-completion-hyperparameter-search.py | 293 +++++++++++------- experiments/matrix_completion_helpers.py | 18 +- 2 files changed, 189 insertions(+), 122 deletions(-) diff --git a/experiments/matrix-completion-hyperparameter-search.py b/experiments/matrix-completion-hyperparameter-search.py index 65d4fd6b..ba025c34 100755 --- a/experiments/matrix-completion-hyperparameter-search.py +++ b/experiments/matrix-completion-hyperparameter-search.py @@ -39,7 +39,7 @@ from dataset_paths import PETERS2009_CSV_PATH from score_set import ScoreSet from matrix_completion_helpers import load_data, evaluate_predictions -from arg_parsing import parse_int_list, parse_float_list +from arg_parsing import parse_int_list, parse_float_list, parse_string_list from matrix_completion_helpers import prune_data @@ -130,6 +130,13 @@ parser.add_argument( help="When expanding 8mers into 9mers use 'X' instead of all possible AAs") +parser.add_argument( + "--alleles", + default=None, + type=parse_string_list, + help="Only evaluate these alleles (by default use all in the dataset)") + + def print_length_distribution(peptides, values, name): print("Length distribution for %s (n=%d):" % (name, len(peptides))) grouped_affinity_dict = defaultdict(list) @@ -195,6 +202,8 @@ if __name__ == "__main__": scores = ScoreSet( index=[ "allele", + "peptide_count", + "sample_count", "dropout_probability", "embedding_dim_size", "hidden_layer_size1", @@ -217,48 +226,58 @@ if __name__ == "__main__": else: raise ValueError("Invalid imputation method: %s" % impute_method_name) - # to avoid recompiling and initializing a lot of neural networks - # just save all the models up front along with their initial - # weight matrices predictors = {} initial_weights = {} initial_optimizer_states = {} - for dropout in args.dropouts: - for embedding_dim_size in args.embedding_dim_sizes: - for hidden_layer_size1 in args.first_hidden_layer_sizes: - for hidden_layer_size2 in args.second_hidden_layer_sizes: - for activation in args.activation_functions: - key = "%f,%d,%d,%d,%s" % ( - dropout, - embedding_dim_size, - hidden_layer_size1, - hidden_layer_size2, - activation - ) - layer_sizes = [hidden_layer_size1] - if hidden_layer_size2: - layer_sizes.append(hidden_layer_size2) - if args.verbose: - print("-- Creating predictor for hyperparameters: %s" % key) - predictor = Class1BindingPredictor.from_hyperparameters( - embedding_output_dim=embedding_dim_size, - layer_sizes=layer_sizes, - activation=activation, - output_activation="sigmoid", - dropout_probability=dropout, - verbose=args.verbose, - allow_unknown_amino_acids=args.unknown_amino_acids, - embedding_input_dim=21 if args.unknown_amino_acids else 20, - ) - predictors[key] = predictor - initial_weights[key] = predictor.model.get_weights() - initial_optimizer_states[key] = predictor.model.optimizer.get_state() + + def generate_predictors(): + """ + Generator of all possible predictors generated by combinations of + hyperparameters. + + To avoid recompiling and initializing a lot of neural networks + just save all the models up front along with their initial weight matrices + """ + for dropout in args.dropouts: + for embedding_dim_size in args.embedding_dim_sizes: + for hidden_layer_size1 in args.first_hidden_layer_sizes: + for hidden_layer_size2 in args.second_hidden_layer_sizes: + for activation in args.activation_functions: + key = "%f,%d,%d,%d,%s" % ( + dropout, + embedding_dim_size, + hidden_layer_size1, + hidden_layer_size2, + activation + ) + if key not in predictors: + + layer_sizes = [hidden_layer_size1] + if hidden_layer_size2: + layer_sizes.append(hidden_layer_size2) + + predictor = Class1BindingPredictor.from_hyperparameters( + embedding_output_dim=embedding_dim_size, + layer_sizes=layer_sizes, + activation=activation, + output_activation="sigmoid", + dropout_probability=dropout, + verbose=args.verbose, + allow_unknown_amino_acids=args.unknown_amino_acids, + embedding_input_dim=21 if args.unknown_amino_acids else 20, + ) + weights = predictor.model.get_weights() + opt_state = predictor.model.optimizer.get_state() + yield (key, predictor, weights, opt_state) # want at least 5 samples in each fold of CV # to make meaningful estimates of accuracy min_samples_per_cv_fold = 5 min_samples_per_allele = min_samples_per_cv_fold * args.n_folds for allele_idx, allele in enumerate(allele_list): + if args.alleles and allele not in args.alleles: + # if user specifies an allele list then skip anything which isn't included + continue if n_observed_per_allele[allele_idx] < min_samples_per_allele: print("-- Skipping allele %s which only has %d samples (need %d)" % ( allele, @@ -286,110 +305,144 @@ if __name__ == "__main__": "Evaluating allele %s (n=%d)" % ( allele, len(observed_row_indices))) - median_value = np.median(observed_values) + ic50_values = args.max_ic50 ** (1 - observed_values) + + below_500nM = ic50_values <= 500 + if below_500nM.all(): + print("-- Skipping %s due to all negative predictions" % allele) + continue + if below_500nM.sum() < args.n_folds: + print("-- Skipping %s due to insufficient positive examples (%d)" % ( + allele, + below_500nM.sum())) + continue + observed_peptides = [peptide_list[i] for i in observed_row_indices] print_length_distribution(observed_peptides, observed_values, name=allele) - # k-fold cross validation stratified to keep an even balance of low - # vs. high-binding peptides in each fold - for fold_idx, (train_indices, test_indices) in enumerate( - StratifiedKFold( - y=observed_values < median_value, + for key, predictor, predictor_weights, optimizer_state in generate_predictors(): + # k-fold cross validation stratified to keep an even balance of low + # vs. high-binding peptides in each fold + for fold_idx, (train_indices, test_indices) in enumerate(StratifiedKFold( + y=below_500nM, n_folds=args.n_folds, shuffle=True)): - train_peptides = [observed_peptides[i] for i in train_indices] - train_values = [observed_values[i] for i in train_indices] - train_dict = {k: v for (k, v) in zip(train_peptides, train_values)} - - test_peptides = [observed_peptides[i] for i in test_indices] - test_values = [observed_values[i] for i in test_indices] - test_dict = {k: v for (k, v) in zip(test_peptides, test_values)} - - # drop the test peptides from the full matrix and then - # run completion on it to get synthesized affinities - pMHC_affinity_matrix_fold = pMHC_affinity_matrix.copy() - test_indices_among_all_rows = observed_row_indices[test_indices] - pMHC_affinity_matrix_fold[test_indices_among_all_rows, allele_idx] = np.nan - - # drop peptides with fewer than 2 measurements and alleles - # with fewer than 10 peptides - pMHC_affinity_matrix_fold_pruned, pruned_peptides_fold, pruned_alleles_fold = \ - prune_data( - X=pMHC_affinity_matrix_fold, - peptide_list=peptide_list, - allele_list=allele_list, - min_observations_per_peptide=2, - min_observations_per_allele=min_samples_per_cv_fold) - - pMHC_affinity_matrix_fold_completed = imputer.complete( - pMHC_affinity_matrix_fold_pruned) - - # keep synthetic data for 9mer peptides, - # otherwise we can an explosion of low weight samples - # that are expanded from e.g. 11mers - # In the future: we'll use an neural network that - # takes multiple input lengths - ninemer_mask = np.array([len(p) == 9 for p in pruned_peptides_fold]) - ninemer_indices = np.where(ninemer_mask)[0] - pMHC_affinity_matrix_fold_completed_9mers = \ - pMHC_affinity_matrix_fold_completed[ninemer_indices] - pretrain_peptides = [pruned_peptides_fold[i] for i in ninemer_indices] - X_pretrain = index_encoding(pretrain_peptides, 9) - - if args.verbose: - print("-- pMHC matrix for all peptides shape = %s" % ( - pMHC_affinity_matrix_fold_completed.shape,)) - print("-- pMHC matrix for only 9mers shape = %s" % ( - pMHC_affinity_matrix_fold_completed_9mers.shape,)) - print("-- 9mer indices n=%d max=%d" % ( - len(ninemer_indices), ninemer_indices.max())) - print("-- X_pretrain.shape = %s" % (X_pretrain.shape,)) - - # since we may have dropped some of the columns in the completed - # pMHC matrix need to find which column corresponds to the - # same name we're currently predicting - if allele in pruned_alleles_fold: - pMHC_fold_allele_idx = pruned_alleles_fold.index(allele) - Y_pretrain = pMHC_affinity_matrix_fold_completed_9mers[ - :, pMHC_fold_allele_idx] - else: - print( - "WARNING: Couldn't find allele %s in pre-training matrix" % allele) - column = pMHC_affinity_matrix_fold[:, allele_idx] - column_mean = np.nanmean(column) - print("-- Setting pre-training target value to nanmean = %f" % column_mean) - n_pretrain = len(pretrain_peptides) - Y_pretrain = np.ones(n_pretrain) * column_mean - - X_train, training_row_peptides, training_counts = \ - fixed_length_index_encoding( - peptides=train_peptides, - desired_length=9, - allow_unknown_amino_acids=args.unknown_amino_acids) - training_sample_weights = 1.0 / np.array(training_counts) - Y_train = np.array([ - train_dict[p] for p in training_row_peptides]) - for key, predictor in sorted(predictors.items()): + predictor.model.set_weights([w.copy() for w in predictor_weights]) + predictor.model.optimizer.set_state([s.copy() for s in optimizer_state]) + train_peptides_fold = [observed_peptides[i] for i in train_indices] + train_values_fold = [observed_values[i] for i in train_indices] + train_dict_fold = {k: v for (k, v) in zip(train_peptides_fold, train_values_fold)} + + if args.verbose: + print("Training peptides for CV fold %d/%d:" % ( + fold_idx + 1, + args.n_folds)) + for p in train_peptides_fold: + aff = train_dict_fold[p] + print("-- %s: %f (%f nM)" % ( + p, + aff, + args.max_ic50 ** (1 - aff))) + + test_peptides = [observed_peptides[i] for i in test_indices] + test_values = [observed_values[i] for i in test_indices] + test_dict = {k: v for (k, v) in zip(test_peptides, test_values)} + + # drop the test peptides from the full matrix and then + # run completion on it to get synthesized affinities + pMHC_affinities_fold_incomplete = pMHC_affinity_matrix.copy() + test_indices_among_all_rows = observed_row_indices[test_indices] + pMHC_affinities_fold_incomplete[ + test_indices_among_all_rows, allele_idx] = np.nan + + # drop peptides with fewer than 2 measurements and alleles + # with fewer than 10 peptides + pMHC_affinities_fold_pruned, pruned_peptides_fold, pruned_alleles_fold = \ + prune_data( + X=pMHC_affinities_fold_incomplete, + peptide_list=peptide_list, + allele_list=allele_list, + min_observations_per_peptide=2, + min_observations_per_allele=min_samples_per_cv_fold) + + pMHC_affinities_fold_pruned_imputed = imputer.complete(pMHC_affinities_fold_pruned) + + if not args.unknown_amino_acids: + # if we're expanding 8mers to >100 9mers + # then the pretraining dataset becomes + # unmanageably large, so let's only use 9mers for pre-training + + # In the future: we'll use an neural network that + # takes multiple input lengths + ninemer_mask = np.array([len(p) == 9 for p in pruned_peptides_fold]) + ninemer_indices = np.where(ninemer_mask)[0] + pMHC_affinities_fold_pruned_imputed = \ + pMHC_affinities_fold_pruned_imputed[ninemer_indices] + pruned_peptides_fold = [pruned_peptides_fold[i] for i in ninemer_indices] + + # since we may have dropped some of the columns in the completed + # pMHC matrix need to find which column corresponds to the + # same name we're currently predicting + if allele in pruned_alleles_fold: + pMHC_fold_allele_idx = pruned_alleles_fold.index(allele) + pMHC_allele_values = pMHC_affinities_fold_pruned_imputed[ + :, pMHC_fold_allele_idx] + if pMHC_allele_values.std() == 0: + print("WARNING: unexpected zero-variance in pretraining affinity values") + else: + print( + "WARNING: Couldn't find allele %s in pre-training matrix" % allele) + column = pMHC_affinities_fold_incomplete[:, allele_idx] + column_mean = np.nanmean(column) + print("-- Setting pre-training target value to nanmean = %f" % column_mean) + pMHC_allele_values = np.ones(len(pruned_peptides_fold)) * column_mean + assert False + + assert len(pruned_peptides_fold) == len(pMHC_allele_values) + + # dictionary mapping peptides to imputed affinity values + pretrain_dict = { + pi: yi + for (pi, yi) + in zip(pruned_peptides_fold, pMHC_allele_values) + } + + X_pretrain, pretrain_row_peptides, pretrain_counts = \ + fixed_length_index_encoding( + peptides=pruned_peptides_fold, + desired_length=9, + allow_unknown_amino_acids=args.unknown_amino_acids) + pretrain_sample_weights = 1.0 / np.array(pretrain_counts) + Y_pretrain = np.array( + [pretrain_dict[p] for p in pretrain_row_peptides]) + + X_train, training_row_peptides, training_counts = \ + fixed_length_index_encoding( + peptides=train_peptides_fold, + desired_length=9, + allow_unknown_amino_acids=args.unknown_amino_acids) + training_sample_weights = 1.0 / np.array(training_counts) + Y_train = np.array([train_dict_fold[p] for p in training_row_peptides]) print("\n-----") print( - ("Training model for %s (# peptides = %d, # samples=%d)" + ("Training CV fold %d/%d for %s (# peptides = %d, # samples=%d)" " with parameters: %s") % ( + fold_idx + 1, + args.n_folds, allele, - len(train_peptides), + len(train_peptides_fold), len(X_train), key)) print("-----") - predictor.model.set_weights([ - w.copy() for w in initial_weights[key]]) - predictor.model.optimizer.set_state([ - s.copy() for s in initial_optimizer_states[key]]) + predictor.fit( X=X_train, Y=Y_train, sample_weights=training_sample_weights, X_pretrain=X_pretrain, Y_pretrain=Y_pretrain, + pretrain_sample_weights=pretrain_sample_weights, n_training_epochs=args.training_epochs, verbose=args.verbose, batch_size=args.batch_size) @@ -403,7 +456,7 @@ if __name__ == "__main__": y_pred=y_pred, max_ic50=args.max_ic50) scores.add_many( - ("%s," % allele) + key, + ("%s,%d,%d," % (allele, len(train_peptides_fold), len(X_train))) + key, mae=mae, tau=tau, f1_score=f1_score, diff --git a/experiments/matrix_completion_helpers.py b/experiments/matrix_completion_helpers.py index 3290cbb8..bb6d803e 100644 --- a/experiments/matrix_completion_helpers.py +++ b/experiments/matrix_completion_helpers.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging + from fancyimpute.dictionary_helpers import ( dense_matrix_from_nested_dictionary, transpose_nested_dictionary, @@ -37,24 +39,36 @@ def evaluate_predictions( # if all predictions are the same if (y_pred[0] == y_pred).all(): + logging.warn("All predicted values were the same: %f" % y_pred[0]) return (mae, 0, 0.5, 0) tau, _ = stats.kendalltau( y_pred, y_true) + if np.isnan(tau): + logging.warn("Value for Kendall's tau was NaN (n=%d)" % (len(y_true),)) + tau = 0.0 true_ic50s = max_ic50 ** (1.0 - np.array(y_true)) predicted_ic50s = max_ic50 ** (1.0 - np.array(y_pred)) true_binding_label = true_ic50s <= 500 - if true_binding_label.all() or not true_binding_label.any(): + if true_binding_label.all(): + logging.warn("All true labels are binders, can't compute AUC") + return (mae, tau, 0.5, 0) + elif not true_binding_label.any(): + logging.warn("No true labels are binders, can't compute AUC") # can't compute AUC or F1 without both negative and positive cases return (mae, tau, 0.5, 0) auc = sklearn.metrics.roc_auc_score(true_binding_label, y_pred) predicted_binding_label = predicted_ic50s <= 500 - if predicted_binding_label.all() or not predicted_binding_label.any(): + if predicted_binding_label.all(): + logging.warn("All predicted labels are binders, can't compute F1") + return (mae, tau, auc, 0) + elif not predicted_binding_label.any(): + logging.warn("No predicted labels are binders, can't compute F1") # can't compute F1 without both positive and negative predictions return (mae, tau, auc, 0) -- GitLab