diff --git a/experiments/matrix-completion-accuracy.py b/experiments/matrix-completion-accuracy.py index dc2923b4130241b0b2af43c52bb0513ddd368d00..b3310ae9a3e79a6f87578378aebd99ff2d4dec96 100644 --- a/experiments/matrix-completion-accuracy.py +++ b/experiments/matrix-completion-accuracy.py @@ -59,7 +59,7 @@ parser.add_argument( parser.add_argument( "--output-file", - default="matrix-completion-accuracy-results.csv") + default="imputation-accuracy-comparison.csv") parser.add_argument( "--normalize-columns", @@ -144,12 +144,13 @@ if __name__ == "__main__": ) print("Imputation methods: %s" % imputation_methods) - X, observed_mask, peptide_list, allele_list = load_data( + X, peptide_list, allele_list = load_data( binding_data_csv=args.binding_data_csv, max_ic50=args.max_ic50, only_human=args.only_human, min_observations_per_allele=args.n_folds, min_observations_per_peptide=args.min_observations_per_peptide) + observed_mask = np.isfinite(X) print("Loaded binding data, shape: %s, n_observed=%d/%d (%0.2f%%)" % ( X.shape, observed_mask.sum(), diff --git a/experiments/matrix-completion-hyperparameter-search.py b/experiments/matrix-completion-hyperparameter-search.py index fd220639c87cce00b09938fd17f1d989bf44ec07..b324465b9b27bf9ac46b4d9b90121ea1cdc1a88e 100755 --- a/experiments/matrix-completion-hyperparameter-search.py +++ b/experiments/matrix-completion-hyperparameter-search.py @@ -24,7 +24,7 @@ For each allele, perform 5-fold cross validation to import argparse from collections import defaultdict -from fancyimpute import MICE, KNN, SimpleFill +from fancyimpute import MICE, KNN, SimpleFill, IterativeSVD, SoftImpute import numpy as np import pandas as pd from mhcflurry.peptide_encoding import fixed_length_index_encoding @@ -119,7 +119,7 @@ parser.add_argument( parser.add_argument( "--impute", default="mice", - help="Use {'mice', 'knn', 'meanfill'} for imputing pre-training data") + help="Use {'mice', 'knn', 'meanfill', 'none', 'svt', 'svd'} for imputing pre-training data") parser.add_argument("--batch-size", type=int, default=64) @@ -228,8 +228,14 @@ if __name__ == "__main__": imputer = MICE(n_burn_in=5, n_imputations=20, min_value=0, max_value=1) elif impute_method_name.startswith("knn"): imputer = KNN(k=1, orientation="columns", print_interval=10) + elif impute_method_name.startswith("svd"): + imputer = IterativeSVD(rank=10) + elif impute_method_name.startswith("svt"): + imputer = SoftImpute() elif impute_method_name.startswith("mean"): imputer = SimpleFill("mean") + elif impute_method_name == "none": + imputer = None else: raise ValueError("Invalid imputation method: %s" % impute_method_name) @@ -362,74 +368,78 @@ if __name__ == "__main__": test_peptides = [observed_peptides[i] for i in test_indices] test_values = [observed_values[i] for i in test_indices] test_dict = {k: v for (k, v) in zip(test_peptides, test_values)} - - # drop the test peptides from the full matrix and then - # run completion on it to get synthesized affinities - pMHC_affinities_fold_incomplete = pMHC_affinity_matrix.copy() - test_indices_among_all_rows = observed_row_indices[test_indices] - pMHC_affinities_fold_incomplete[ - test_indices_among_all_rows, allele_idx] = np.nan - - # drop peptides with fewer than 2 measurements and alleles - # with fewer than 10 peptides - pMHC_affinities_fold_pruned, pruned_peptides_fold, pruned_alleles_fold = \ - prune_data( - X=pMHC_affinities_fold_incomplete, - peptide_list=peptide_list, - allele_list=allele_list, - min_observations_per_peptide=2, - min_observations_per_allele=args.min_samples_per_cv_fold) - - pMHC_affinities_fold_pruned_imputed = imputer.complete( - pMHC_affinities_fold_pruned) - - if args.pretrain_only_9mers: - # if we're expanding 8mers to >100 9mers - # then the pretraining dataset becomes - # unmanageably large, so let's only use 9mers for pre-training - - # In the future: we'll use an neural network that - # takes multiple input lengths - ninemer_mask = np.array([len(p) == 9 for p in pruned_peptides_fold]) - ninemer_indices = np.where(ninemer_mask)[0] - pMHC_affinities_fold_pruned_imputed = \ - pMHC_affinities_fold_pruned_imputed[ninemer_indices] - pruned_peptides_fold = [pruned_peptides_fold[i] for i in ninemer_indices] - - # since we may have dropped some of the columns in the completed - # pMHC matrix need to find which column corresponds to the - # same name we're currently predicting - if allele in pruned_alleles_fold: - pMHC_fold_allele_idx = pruned_alleles_fold.index(allele) - pMHC_allele_values = pMHC_affinities_fold_pruned_imputed[ - :, pMHC_fold_allele_idx] - if pMHC_allele_values.std() == 0: - print("WARNING: unexpected zero-variance in pretraining affinity values") + if imputer is None: + X_pretrain = None + Y_pretrain = None + pretrain_sample_weights = None else: - print( - "WARNING: Couldn't find allele %s in pre-training matrix" % allele) - column = pMHC_affinities_fold_incomplete[:, allele_idx] - column_mean = np.nanmean(column) - print("-- Setting pre-training target value to nanmean = %f" % column_mean) - pMHC_allele_values = np.ones(len(pruned_peptides_fold)) * column_mean - - assert len(pruned_peptides_fold) == len(pMHC_allele_values) - - # dictionary mapping peptides to imputed affinity values - pretrain_dict = { - pi: yi - for (pi, yi) - in zip(pruned_peptides_fold, pMHC_allele_values) - } - - X_pretrain, pretrain_row_peptides, pretrain_counts = \ - fixed_length_index_encoding( - peptides=pruned_peptides_fold, - desired_length=9, - allow_unknown_amino_acids=args.unknown_amino_acids) - pretrain_sample_weights = 1.0 / np.array(pretrain_counts) - Y_pretrain = np.array( - [pretrain_dict[p] for p in pretrain_row_peptides]) + # drop the test peptides from the full matrix and then + # run completion on it to get synthesized affinities + pMHC_affinities_fold_incomplete = pMHC_affinity_matrix.copy() + test_indices_among_all_rows = observed_row_indices[test_indices] + pMHC_affinities_fold_incomplete[ + test_indices_among_all_rows, allele_idx] = np.nan + + # drop peptides with fewer than 2 measurements and alleles + # with fewer than 10 peptides + pMHC_affinities_fold_pruned, pruned_peptides_fold, pruned_alleles_fold = \ + prune_data( + X=pMHC_affinities_fold_incomplete, + peptide_list=peptide_list, + allele_list=allele_list, + min_observations_per_peptide=2, + min_observations_per_allele=args.min_samples_per_cv_fold) + + pMHC_affinities_fold_pruned_imputed = imputer.complete( + pMHC_affinities_fold_pruned) + + if args.pretrain_only_9mers: + # if we're expanding 8mers to >100 9mers + # then the pretraining dataset becomes + # unmanageably large, so let's only use 9mers for pre-training + + # In the future: we'll use an neural network that + # takes multiple input lengths + ninemer_mask = np.array([len(p) == 9 for p in pruned_peptides_fold]) + ninemer_indices = np.where(ninemer_mask)[0] + pMHC_affinities_fold_pruned_imputed = \ + pMHC_affinities_fold_pruned_imputed[ninemer_indices] + pruned_peptides_fold = [pruned_peptides_fold[i] for i in ninemer_indices] + + # since we may have dropped some of the columns in the completed + # pMHC matrix need to find which column corresponds to the + # same name we're currently predicting + if allele in pruned_alleles_fold: + pMHC_fold_allele_idx = pruned_alleles_fold.index(allele) + pMHC_allele_values = pMHC_affinities_fold_pruned_imputed[ + :, pMHC_fold_allele_idx] + if pMHC_allele_values.std() == 0: + print("WARNING: unexpected zero-variance in pretraining affinity values") + else: + print( + "WARNING: Couldn't find allele %s in pre-training matrix" % allele) + column = pMHC_affinities_fold_incomplete[:, allele_idx] + column_mean = np.nanmean(column) + print("-- Setting pre-training target value to nanmean = %f" % column_mean) + pMHC_allele_values = np.ones(len(pruned_peptides_fold)) * column_mean + + assert len(pruned_peptides_fold) == len(pMHC_allele_values) + + # dictionary mapping peptides to imputed affinity values + pretrain_dict = { + pi: yi + for (pi, yi) + in zip(pruned_peptides_fold, pMHC_allele_values) + } + + X_pretrain, pretrain_row_peptides, pretrain_counts = \ + fixed_length_index_encoding( + peptides=pruned_peptides_fold, + desired_length=9, + allow_unknown_amino_acids=args.unknown_amino_acids) + pretrain_sample_weights = 1.0 / np.array(pretrain_counts) + Y_pretrain = np.array( + [pretrain_dict[p] for p in pretrain_row_peptides]) X_train, training_row_peptides, training_counts = \ fixed_length_index_encoding( @@ -439,16 +449,21 @@ if __name__ == "__main__": training_sample_weights = 1.0 / np.array(training_counts) Y_train = np.array([train_dict_fold[p] for p in training_row_peptides]) + n_pretrain = len(X_pretrain) + n_train_unique = len(train_peptides_fold) + n_train = len(X_train) for key, predictor in generate_predictors(): print("\n-----") print( - ("Training CV fold %d/%d for %s (# peptides = %d, # samples=%d)" + ("Training CV fold %d/%d for %s " + "(# peptides = %d, # samples=%d, # pretrain samples=%d)" " with parameters: %s") % ( fold_idx + 1, args.n_folds, allele, - len(train_peptides_fold), - len(X_train), + n_train_unique, + n_train, + n_pretrain, key)) print("-----") predictor.fit( @@ -471,7 +486,7 @@ if __name__ == "__main__": y_pred=y_pred, max_ic50=args.max_ic50) scores.add_many( - ("%s,%d,%d," % (allele, len(train_peptides_fold), len(X_train))) + key, + ("%s,%d,%d," % (allele, n_train_unique, n_train)) + key, mae=mae, tau=tau, f1_score=f1_score,