From db6511c2daa0f6240ee4fed43701d46ea10cd78e Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com> Date: Wed, 27 Jan 2016 17:34:21 -0500 Subject: [PATCH] added option to save incomplete matrix; use 20 rounds of MICE for matrix completion --- experiments/matrix-completion-accuracy.py | 31 ++++++++++++++++++----- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/experiments/matrix-completion-accuracy.py b/experiments/matrix-completion-accuracy.py index 2553c9e8..d09cc714 100644 --- a/experiments/matrix-completion-accuracy.py +++ b/experiments/matrix-completion-accuracy.py @@ -35,6 +35,7 @@ import sklearn.metrics from sklearn.cross_validation import StratifiedKFold from scipy import stats import numpy as np +import pandas as pd from dataset_paths import PETERS2009_CSV_PATH @@ -50,6 +51,11 @@ parser.add_argument( default=50000.0, type=float) +parser.add_argument( + "--save-incomplete-affinity-matrix", + default=None, + help="Path to CSV which will contains the incomplete affinity matrix") + parser.add_argument( "--only-human", default=False, @@ -127,12 +133,12 @@ imputation_methods = { "svdImpute-5": IterativeSVD(5, verbose=VERBOSE), "svdImpute-10": IterativeSVD(10, verbose=VERBOSE), "svdImpute-20": IterativeSVD(20, verbose=VERBOSE), - "colSims": SimilarityWeightedAveraging( + "similarityWeightedAveraging": SimilarityWeightedAveraging( orientation="columns", verbose=VERBOSE), "meanFill": SimpleFill("mean"), "zeroFill": SimpleFill("zero"), - "MICE": MICE(verbose=VERBOSE), + "MICE": MICE(n_burn_in=5, n_imputations=20, verbose=VERBOSE), "knnImpute-3": KNN(3, orientation="columns", verbose=VERBOSE, print_interval=1), "knnImpute-7": KNN(7, orientation="columns", verbose=VERBOSE, print_interval=1), "knnImpute-15": KNN(15, orientation="columns", verbose=VERBOSE, print_interval=1), @@ -221,6 +227,17 @@ if __name__ == "__main__": X, peptide_order, allele_order = \ dense_matrix_from_nested_dictionary(peptide_to_allele_to_affinity) + if 
args.save_incomplete_affinity_matrix: + print("Saving incomplete data to %s" % args.save_incomplete_affinity_matrix) + column_names = [None] * len(allele_order) + for (name, position) in allele_order.items(): + column_names[position] = name + row_names = [None] * len(peptide_order) + for (name, position) in peptide_order.items(): + row_names[position] = name + df = pd.DataFrame(X, columns=column_names, index=row_names) + df.to_csv(args.save_incomplete_affinity_matrix, index_label="peptide") + scores = ScoreSet() missing_mask = np.isnan(X) @@ -236,11 +253,6 @@ if __name__ == "__main__": assert len(observed_indices) == n_observed kfold = StratifiedKFold(observed_y, n_folds=5, shuffle=True) - biscaler = BiScaler( - scale_rows=args.normalize_rows, - center_rows=args.normalize_rows, - scale_columns=args.normalize_columns, - center_columns=args.normalize_rows) for fold_idx, (_, indirect_test_indices) in enumerate(kfold): @@ -268,6 +280,11 @@ if __name__ == "__main__": empty_col_mask.sum())) X_fold_reduced = X_fold[ok_mesh] + biscaler = BiScaler( + scale_rows=args.normalize_rows, + center_rows=args.normalize_rows, + scale_columns=args.normalize_columns, + center_columns=args.normalize_columns) X_fold_reduced_scaled = biscaler.fit_transform(X=X_fold_reduced) for (method_name, solver) in sorted(imputation_methods.items()): print("CV fold %d/%d, running %s" % ( -- GitLab