def create_imputation_methods(
        verbose=False,
        clip_imputed_values=False,
        knn_print_interval=20,
        knn_params=(1, 3, 5),
        softimpute_params=(1, 5, 10),
        svd_params=(5, 10, 20)):
    """Build the dictionary of imputation methods to benchmark.

    Parameters
    ----------
    verbose : bool
        Passed through to every imputation object.
    clip_imputed_values : bool
        When True, constrain imputed affinities to the [0, 1] range;
        when False the solvers are unconstrained (min/max of None).
    knn_print_interval : int
        Progress-print interval for the KNN imputers.
    knn_params : iterable of int
        Neighborhood sizes; one "knnImpute-<k>" entry is created per value.
    softimpute_params : iterable of int
        Shrinkage thresholds; one "softImpute-<t>" entry per value.
    svd_params : iterable of int
        SVD ranks; one "svdImpute-<r>" entry per value.

    Returns
    -------
    dict mapping method name -> imputation object.
    """
    # NOTE: the parameter defaults are tuples rather than lists so that an
    # accidental in-place mutation cannot leak between calls (the classic
    # mutable-default-argument pitfall).
    min_value = 0 if clip_imputed_values else None
    max_value = 1 if clip_imputed_values else None
    result_dict = {
        "meanFill": SimpleFill("mean"),
        "zeroFill": SimpleFill("zero"),
        "mice": MICE(
            n_burn_in=5,
            n_imputations=25,
            min_value=min_value,
            max_value=max_value,
            verbose=verbose),
        "similarityWeightedAveraging": SimilarityWeightedAveraging(
            orientation="columns",
            verbose=verbose),
    }
    for threshold in softimpute_params:
        result_dict["softImpute-%d" % threshold] = SoftImpute(
            threshold,
            verbose=verbose,
            min_value=min_value,
            max_value=max_value)
    for rank in svd_params:
        result_dict["svdImpute-%d" % rank] = IterativeSVD(
            rank,
            verbose=verbose,
            min_value=min_value,
            max_value=max_value)
    for k in knn_params:
        result_dict["knnImpute-%d" % k] = KNN(
            k,
            orientation="columns",
            verbose=verbose,
            print_interval=knn_print_interval)
    return result_dict
observed_mask.sum(axis=0) + min_observed_per_allele = n_observed_per_allele.min() + min_allele_indices = np.where( + n_observed_per_allele == min_observed_per_allele)[0] + print("%d alleles with %d observations: %s" % ( + len(min_allele_indices), + min_observed_per_allele, + [allele_list[i] for i in min_allele_indices])) + return X, missing_mask, observed_mask, peptide_list, allele_list + + +def index_counts(indices): + max_index = indices.max() + counts = np.zeros(max_index + 1, dtype=int) + for index in indices: + counts[index] += 1 + return counts + + +def stratified_cross_validation(X, observed_mask, n_folds=10): n_observed = observed_mask.sum() (observed_peptide_index, observed_allele_index) = np.where(observed_mask) @@ -200,18 +235,26 @@ if __name__ == "__main__": assert len(observed_indices) == n_observed + observed_allele_counts = observed_mask.sum(axis=0) + print("# observed per allele: %s" % (observed_allele_counts,)) + assert (index_counts(observed_allele_index) == observed_allele_counts).all() + kfold = StratifiedKFold( observed_allele_index, - n_folds=args.n_folds, + n_folds=n_folds, shuffle=True) - for fold_idx, (_, indirect_test_indices) in enumerate(kfold): - + for (_, indirect_test_indices) in kfold: test_linear_indices = observed_indices[indirect_test_indices] test_coords = np.unravel_index( test_linear_indices, dims=observed_mask.shape) - y_true = X[test_coords] + + test_allele_counts = index_counts(test_coords[1]) + allele_fractions = test_allele_counts / observed_allele_counts.astype(float) + print("Fraction of each allele in this CV fold: %s" % (allele_fractions,)) + + X_test_vector = X[test_coords] X_fold = X.copy() X_fold[test_coords] = np.nan @@ -229,7 +272,34 @@ if __name__ == "__main__": print("Dropping %d empty rows, %d empty columns" % ( empty_row_mask.sum(), empty_col_mask.sum())) + yield (X_fold, ok_mesh, test_coords, X_test_vector) + +if __name__ == "__main__": + args = parser.parse_args() + print(args) + imputation_methods = 
create_imputation_methods( + verbose=args.verbose, + clip_imputed_values=not (args.normalize_rows or args.normalize_columns), + ) + print("Imputation methods: %s" % imputation_methods) + X, missing_mask, observed_mask, peptide_list, allele_list = load_data( + binding_data_csv=args.binding_data_csv, + max_ic50=args.max_ic50, + only_human=args.only_human, + min_allele_size=args.n_folds) + + if args.save_incomplete_affinity_matrix: + print("Saving incomplete data to %s" % args.save_incomplete_affinity_matrix) + df = pd.DataFrame(X, columns=allele_list, index=peptide_list) + df.to_csv(args.save_incomplete_affinity_matrix, index_label="peptide") + + scores = ScoreSet() + kfold = stratified_cross_validation( + X=X, + observed_mask=observed_mask, + n_folds=args.n_folds) + for fold_idx, (X_fold, ok_mesh, test_coords, X_test_vector) in enumerate(kfold): X_fold_reduced = X_fold[ok_mesh] biscaler = BiScaler( scale_rows=args.normalize_rows, @@ -248,15 +318,12 @@ if __name__ == "__main__": X_completed = np.zeros_like(X) X_completed[ok_mesh] = X_completed_reduced y_pred = X_completed[test_coords] - mae, tau, auc, f1_score = evaluate_predictions( - y_true=y_true, y_pred=y_pred, max_ic50=args.max_ic50) - + y_true=X_test_vector, y_pred=y_pred, max_ic50=args.max_ic50) scores.add_many( method_name, mae=mae, tau=tau, f1_score=f1_score, auc=auc) - scores.to_csv(args.output_file) diff --git a/mhcflurry/data.py b/mhcflurry/data.py index 83bdfa27173ca04c1eee99b758ec83f1f2ff6d4c..75435c3b1de95f7d030452aa3406f53550035a5e 100644 --- a/mhcflurry/data.py +++ b/mhcflurry/data.py @@ -156,7 +156,8 @@ def load_allele_dicts( peptide_column_name=None, peptide_length_column_name="peptide_length", ic50_column_name="meas", - only_human=True): + only_human=True, + min_allele_size=1): """ Parsing CSV of binding data into dictionary of dictionaries.
The outer key is an allele name, the inner key is a peptide sequence, @@ -187,6 +188,7 @@ def load_allele_dicts( } for (allele_name, group) in binding_df.groupby(allele_column_name) + if len(group) >= min_allele_size }