From 80d8c161b0278a77c72dc0d713c38211256e8f1b Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com> Date: Thu, 4 Feb 2016 18:23:59 -0500 Subject: [PATCH] run each fold of data over every possible predictor, should be faster --- ...matrix-completion-hyperparameter-search.py | 239 ++++++++++-------- 1 file changed, 127 insertions(+), 112 deletions(-) diff --git a/experiments/matrix-completion-hyperparameter-search.py b/experiments/matrix-completion-hyperparameter-search.py index ba025c34..f2de96dd 100755 --- a/experiments/matrix-completion-hyperparameter-search.py +++ b/experiments/matrix-completion-hyperparameter-search.py @@ -129,6 +129,11 @@ parser.add_argument( action="store_true", help="When expanding 8mers into 9mers use 'X' instead of all possible AAs") +parser.add_argument( + "--pretrain-only-9mers", + default=False, + action="store_true", + help="Exclude expanded samples from e.g. 8mers or 10mers") parser.add_argument( "--alleles", @@ -136,6 +141,8 @@ parser.add_argument( type=parse_string_list, help="Only evaluate these alleles (by default use all in the dataset)") +parser.add_argument("--min-samples-per-cv-fold", type=int, default=5) + def print_length_distribution(peptides, values, name): print("Length distribution for %s (n=%d):" % (name, len(peptides))) @@ -251,7 +258,6 @@ if __name__ == "__main__": activation ) if key not in predictors: - layer_sizes = [hidden_layer_size1] if hidden_layer_size2: layer_sizes.append(hidden_layer_size2) @@ -268,12 +274,21 @@ if __name__ == "__main__": ) weights = predictor.model.get_weights() opt_state = predictor.model.optimizer.get_state() - yield (key, predictor, weights, opt_state) - - # want at least 5 samples in each fold of CV - # to make meaningful estimates of accuracy - min_samples_per_cv_fold = 5 - min_samples_per_allele = min_samples_per_cv_fold * args.n_folds + predictors[key] = predictor + initial_weights[key] = weights + initial_optimizer_states[key] = opt_state + else: + predictor = predictors[key] + weights = initial_weights[key] + opt_state = initial_optimizer_states[key] + # reset the predictor to its initial condition + predictor.model.set_weights([ + w.copy() for w in weights]) + predictor.model.optimizer.set_state([ + s.copy() for s in opt_state]) + yield (key, predictor) + + min_samples_per_allele = args.min_samples_per_cv_fold * args.n_folds for allele_idx, allele in enumerate(allele_list): if args.alleles and allele not in args.alleles: # if user specifies an allele list then skip anything which isn't included @@ -320,110 +335,111 @@ if __name__ == "__main__": observed_peptides = [peptide_list[i] for i in observed_row_indices] print_length_distribution(observed_peptides, observed_values, name=allele) - for key, predictor, predictor_weights, optimizer_state in generate_predictors(): - # k-fold cross validation stratified to keep an even balance of low - # vs. 
high-binding peptides in each fold - for fold_idx, (train_indices, test_indices) in enumerate(StratifiedKFold( - y=below_500nM, - n_folds=args.n_folds, - shuffle=True)): - predictor.model.set_weights([w.copy() for w in predictor_weights]) - predictor.model.optimizer.set_state([s.copy() for s in optimizer_state]) - train_peptides_fold = [observed_peptides[i] for i in train_indices] - train_values_fold = [observed_values[i] for i in train_indices] - train_dict_fold = {k: v for (k, v) in zip(train_peptides_fold, train_values_fold)} - - if args.verbose: - print("Training peptides for CV fold %d/%d:" % ( - fold_idx + 1, - args.n_folds)) - for p in train_peptides_fold: - aff = train_dict_fold[p] - print("-- %s: %f (%f nM)" % ( - p, - aff, - args.max_ic50 ** (1 - aff))) - - test_peptides = [observed_peptides[i] for i in test_indices] - test_values = [observed_values[i] for i in test_indices] - test_dict = {k: v for (k, v) in zip(test_peptides, test_values)} - - # drop the test peptides from the full matrix and then - # run completion on it to get synthesized affinities - pMHC_affinities_fold_incomplete = pMHC_affinity_matrix.copy() - test_indices_among_all_rows = observed_row_indices[test_indices] - pMHC_affinities_fold_incomplete[ - test_indices_among_all_rows, allele_idx] = np.nan - - # drop peptides with fewer than 2 measurements and alleles - # with fewer than 10 peptides - pMHC_affinities_fold_pruned, pruned_peptides_fold, pruned_alleles_fold = \ - prune_data( - X=pMHC_affinities_fold_incomplete, - peptide_list=peptide_list, - allele_list=allele_list, - min_observations_per_peptide=2, - min_observations_per_allele=min_samples_per_cv_fold) - - pMHC_affinities_fold_pruned_imputed = imputer.complete(pMHC_affinities_fold_pruned) - - if not args.unknown_amino_acids: - # if we're expanding 8mers to >100 9mers - # then the pretraining dataset becomes - # unmanageably large, so let's only use 9mers for pre-training - - # In the future: we'll use an neural network that - # takes multiple input lengths - ninemer_mask = np.array([len(p) == 9 for p in pruned_peptides_fold]) - ninemer_indices = np.where(ninemer_mask)[0] - pMHC_affinities_fold_pruned_imputed = \ - pMHC_affinities_fold_pruned_imputed[ninemer_indices] - pruned_peptides_fold = [pruned_peptides_fold[i] for i in ninemer_indices] - - # since we may have dropped some of the columns in the completed - # pMHC matrix need to find which column corresponds to the - # same name we're currently predicting - if allele in pruned_alleles_fold: - pMHC_fold_allele_idx = pruned_alleles_fold.index(allele) - pMHC_allele_values = pMHC_affinities_fold_pruned_imputed[ - :, pMHC_fold_allele_idx] - if pMHC_allele_values.std() == 0: - print("WARNING: unexpected zero-variance in pretraining affinity values") - else: - print( - "WARNING: Couldn't find allele %s in pre-training matrix" % allele) - column = pMHC_affinities_fold_incomplete[:, allele_idx] - column_mean = np.nanmean(column) - print("-- Setting pre-training target value to nanmean = %f" % column_mean) - pMHC_allele_values = np.ones(len(pruned_peptides_fold)) * column_mean - assert False - - assert len(pruned_peptides_fold) == len(pMHC_allele_values) - - # dictionary mapping peptides to imputed affinity values - pretrain_dict = { - pi: yi - for (pi, yi) - in zip(pruned_peptides_fold, pMHC_allele_values) - } - - X_pretrain, pretrain_row_peptides, pretrain_counts = \ - fixed_length_index_encoding( - peptides=pruned_peptides_fold, - desired_length=9, - allow_unknown_amino_acids=args.unknown_amino_acids) - 
pretrain_sample_weights = 1.0 / np.array(pretrain_counts)
-                Y_pretrain = np.array(
-                    [pretrain_dict[p] for p in pretrain_row_peptides])
-
-                X_train, training_row_peptides, training_counts = \
-                    fixed_length_index_encoding(
-                        peptides=train_peptides_fold,
-                        desired_length=9,
-                        allow_unknown_amino_acids=args.unknown_amino_acids)
-                training_sample_weights = 1.0 / np.array(training_counts)
-                Y_train = np.array([train_dict_fold[p] for p in training_row_peptides])
-
+        # k-fold cross validation stratified to keep an even balance of low
+        # vs. high-binding peptides in each fold
+        for fold_idx, (train_indices, test_indices) in enumerate(StratifiedKFold(
+                y=below_500nM,
+                n_folds=args.n_folds,
+                shuffle=True)):
+
+            train_peptides_fold = [observed_peptides[i] for i in train_indices]
+            train_values_fold = [observed_values[i] for i in train_indices]
+            train_dict_fold = {k: v for (k, v) in zip(train_peptides_fold, train_values_fold)}
+
+            """
+            if args.verbose:
+                print("Training peptides for CV fold %d/%d:" % (
+                    fold_idx + 1,
+                    args.n_folds))
+                for p in train_peptides_fold:
+                    aff = train_dict_fold[p]
+                    print("-- %s: %f (%f nM)" % (
+                        p,
+                        aff,
+                        args.max_ic50 ** (1 - aff)))
+            """
+            test_peptides = [observed_peptides[i] for i in test_indices]
+            test_values = [observed_values[i] for i in test_indices]
+            test_dict = {k: v for (k, v) in zip(test_peptides, test_values)}
+
+            # drop the test peptides from the full matrix and then
+            # run completion on it to get synthesized affinities
+            pMHC_affinities_fold_incomplete = pMHC_affinity_matrix.copy()
+            test_indices_among_all_rows = observed_row_indices[test_indices]
+            pMHC_affinities_fold_incomplete[
+                test_indices_among_all_rows, allele_idx] = np.nan
+
+            # drop peptides with fewer than 2 measurements and alleles
+            # with fewer peptides than the per-fold sample minimum
+            pMHC_affinities_fold_pruned, pruned_peptides_fold, pruned_alleles_fold = \
+                prune_data(
+                    X=pMHC_affinities_fold_incomplete,
+                    peptide_list=peptide_list,
+                    allele_list=allele_list,
+                    min_observations_per_peptide=2,
+                    min_observations_per_allele=args.min_samples_per_cv_fold)
+
+            pMHC_affinities_fold_pruned_imputed = imputer.complete(
+                pMHC_affinities_fold_pruned)
+
+            if args.pretrain_only_9mers:
+                # if we're expanding 8mers to >100 9mers
+                # then the pretraining dataset becomes
+                # unmanageably large, so let's only use 9mers for pre-training
+
+                # In the future: we'll use a neural network that
+                # takes multiple input lengths
+                ninemer_mask = np.array([len(p) == 9 for p in pruned_peptides_fold])
+                ninemer_indices = np.where(ninemer_mask)[0]
+                pMHC_affinities_fold_pruned_imputed = \
+                    pMHC_affinities_fold_pruned_imputed[ninemer_indices]
+                pruned_peptides_fold = [pruned_peptides_fold[i] for i in ninemer_indices]
+
+            # since we may have dropped some of the columns in the completed
+            # pMHC matrix, we need to find which column corresponds to
+            # the allele we're currently predicting
+            if allele in pruned_alleles_fold:
+                pMHC_fold_allele_idx = pruned_alleles_fold.index(allele)
+                pMHC_allele_values = pMHC_affinities_fold_pruned_imputed[
+                    :, pMHC_fold_allele_idx]
+                if pMHC_allele_values.std() == 0:
+                    print("WARNING: unexpected zero-variance in pretraining affinity values")
+            else:
+                print(
+                    "WARNING: Couldn't find allele %s in pre-training matrix" % allele)
+                column = pMHC_affinities_fold_incomplete[:, allele_idx]
+                column_mean = np.nanmean(column)
+                print("-- Setting pre-training target value to nanmean = %f" % column_mean)
+                pMHC_allele_values = np.ones(len(pruned_peptides_fold)) * column_mean
+                assert False
+
+            assert 
len(pruned_peptides_fold) == len(pMHC_allele_values) + + # dictionary mapping peptides to imputed affinity values + pretrain_dict = { + pi: yi + for (pi, yi) + in zip(pruned_peptides_fold, pMHC_allele_values) + } + + X_pretrain, pretrain_row_peptides, pretrain_counts = \ + fixed_length_index_encoding( + peptides=pruned_peptides_fold, + desired_length=9, + allow_unknown_amino_acids=args.unknown_amino_acids) + pretrain_sample_weights = 1.0 / np.array(pretrain_counts) + Y_pretrain = np.array( + [pretrain_dict[p] for p in pretrain_row_peptides]) + + X_train, training_row_peptides, training_counts = \ + fixed_length_index_encoding( + peptides=train_peptides_fold, + desired_length=9, + allow_unknown_amino_acids=args.unknown_amino_acids) + training_sample_weights = 1.0 / np.array(training_counts) + Y_train = np.array([train_dict_fold[p] for p in training_row_peptides]) + + for key, predictor in generate_predictors(): print("\n-----") print( ("Training CV fold %d/%d for %s (# peptides = %d, # samples=%d)" @@ -435,7 +451,6 @@ if __name__ == "__main__": len(X_train), key)) print("-----") - predictor.fit( X=X_train, Y=Y_train, -- GitLab
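
The core change above: each predictor is now built once, its untrained weights and optimizer state are cached, and generate_predictors() resets the cached model before yielding it, so the cross-validation fold loop can sit outside the predictor loop. A minimal sketch of that cache-and-reset pattern, with a hypothetical TinyPredictor standing in for the script's Keras models (which use model.get_weights()/set_weights() and optimizer.get_state()/set_state()):

from copy import deepcopy

import numpy as np


class TinyPredictor:
    """Hypothetical stand-in for one of the script's Keras predictors."""

    def __init__(self, hidden_size, seed=0):
        rng = np.random.RandomState(seed)
        self.state = {"weights": rng.randn(9, hidden_size), "optimizer_step": 0}

    def get_state(self):
        return deepcopy(self.state)

    def set_state(self, state):
        self.state = deepcopy(state)

    def fit(self, X, Y):
        # pretend training mutates both the weights and the optimizer state
        self.state["weights"] += 0.01 * np.mean(Y) * np.sign(self.state["weights"])
        self.state["optimizer_step"] += len(X)


# build every hyperparameter combination once, up front, and remember
# each predictor's untrained state (weights plus optimizer state)
predictors = {"hidden=%d" % n: TinyPredictor(n) for n in (4, 8)}
initial_states = {key: p.get_state() for key, p in predictors.items()}


def generate_predictors():
    # reset each cached predictor to its initial state before yielding it,
    # instead of rebuilding (and recompiling) the model for every fold
    for key, predictor in predictors.items():
        predictor.set_state(initial_states[key])
        yield key, predictor


# loop order after the patch: folds on the outside, predictors on the inside
for fold_idx in range(3):
    X_fold = np.random.RandomState(fold_idx).randn(10, 9)
    Y_fold = np.random.RandomState(fold_idx).rand(10)
    for key, predictor in generate_predictors():
        predictor.fit(X_fold, Y_fold)
        print(fold_idx, key, predictor.state["optimizer_step"])

With folds on the outside, the per-fold masking, pruning, imputation, and encoding run once per fold instead of once per (fold, predictor) pair, which is where the speedup in the subject line comes from. The new --min-samples-per-cv-fold flag also replaces the old hard-coded minimum of 5, so an allele needs at least min_samples_per_cv_fold * n_folds measurements to be evaluated (for example, 5 samples per fold with 3 folds requires 15 measurements).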
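
Each fold's pre-training data is likewise assembled once and shared by every predictor: the fold's held-out measurements are blanked to NaN, the matrix is re-imputed, the imputed column for the current allele becomes the pre-training target, and rows produced by expanding shorter or longer peptides are down-weighted by 1/count. A small numpy-only sketch of that flow; the column-mean fill is only an illustrative stand-in for the script's imputer.complete, and all array values and shapes are made up:

import numpy as np

rng = np.random.RandomState(0)
affinity_matrix = rng.rand(6, 3)             # peptides x alleles, rescaled affinities in [0, 1]
allele_idx = 1                               # column for the allele being evaluated
test_rows = np.array([2, 4])                 # rows held out for this CV fold

fold_matrix = affinity_matrix.copy()
fold_matrix[test_rows, allele_idx] = np.nan  # hide test measurements before imputing

# stand-in for imputer.complete(): fill each NaN with its allele's column mean
col_means = np.nanmean(fold_matrix, axis=0)
imputed = np.where(np.isnan(fold_matrix), col_means[None, :], fold_matrix)

# imputed affinities for this allele become the pre-training targets
pretrain_targets = imputed[:, allele_idx]

# a peptide expanded into `count` fixed-length encodings contributes `count`
# rows, so each row is weighted by 1/count to keep its total influence at 1
expansion_counts = np.array([1, 1, 20, 1, 20, 1])
sample_weights = 1.0 / expansion_counts

print(pretrain_targets)
print(sample_weights)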