From 68dd95d9041cf92b5b91b4bff40bb32280dc34b4 Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com> Date: Fri, 29 Jan 2016 13:13:24 -0500 Subject: [PATCH] filter out rows of matrix with only one observation --- experiments/matrix-completion-accuracy.py | 172 +++----------------- experiments/matrix_completion_helpers.py | 181 ++++++++++++++++++++++ 2 files changed, 200 insertions(+), 153 deletions(-) create mode 100644 experiments/matrix_completion_helpers.py diff --git a/experiments/matrix-completion-accuracy.py b/experiments/matrix-completion-accuracy.py index 13ff9533..dc2923b4 100644 --- a/experiments/matrix-completion-accuracy.py +++ b/experiments/matrix-completion-accuracy.py @@ -25,19 +25,16 @@ from fancyimpute import ( BiScaler, SimpleFill, ) -from fancyimpute.dictionary_helpers import ( - dense_matrix_from_nested_dictionary, - transpose_nested_dictionary, -) -from mhcflurry.data import load_allele_dicts -import sklearn.metrics -from sklearn.cross_validation import StratifiedKFold -from scipy import stats import numpy as np import pandas as pd from dataset_paths import PETERS2009_CSV_PATH from score_set import ScoreSet +from matrix_completion_helpers import ( + load_data, + evaluate_predictions, + stratified_cross_validation +) parser = argparse.ArgumentParser() @@ -77,6 +74,12 @@ parser.add_argument( action="store_true", help="Center and rescale rows of affinity matrix") +parser.add_argument( + "--min-observations-per-peptide", + type=int, + default=2, + help="Drop peptide entries with fewer than this number of measured affinities") + parser.add_argument( "--n-folds", default=10, @@ -89,48 +92,6 @@ parser.add_argument( action="store_true") -def evaluate_predictions( - y_true, - y_pred, - max_ic50): - """ - Return mean absolute error, Kendall tau, AUC and F1 score - """ - if len(y_pred) != len(y_true): - raise ValueError("y_pred must have same number of elements as y_true") - - mae = np.mean(np.abs(y_true - y_pred)) - - # if all 
predictions are the same - if (y_pred[0] == y_pred).all(): - return (mae, 0, 0.5, 0) - - tau, _ = stats.kendalltau( - y_pred, - y_true) - - true_ic50s = max_ic50 ** (1.0 - np.array(y_true)) - predicted_ic50s = max_ic50 ** (1.0 - np.array(y_pred)) - - true_binding_label = true_ic50s <= 500 - if true_binding_label.all() or not true_binding_label.any(): - # can't compute AUC or F1 without both negative and positive cases - return (mae, tau, 0.5, 0) - - auc = sklearn.metrics.roc_auc_score(true_binding_label, y_pred) - - predicted_binding_label = predicted_ic50s <= 500 - if predicted_binding_label.all() or not predicted_binding_label.any(): - # can't compute F1 without both positive and negative predictions - return (mae, tau, auc, 0) - - f1_score = sklearn.metrics.f1_score( - true_binding_label, - predicted_binding_label) - - return mae, tau, auc, f1_score - - def create_imputation_methods( verbose=False, clip_imputed_values=False, @@ -174,106 +135,6 @@ def create_imputation_methods( return result_dict -def load_data(binding_data_csv, max_ic50, only_human=False, min_allele_size=1): - allele_to_peptide_to_affinity = load_allele_dicts( - binding_data_csv, - max_ic50=max_ic50, - only_human=only_human, - regression_output=True, - min_allele_size=min_allele_size) - peptide_to_allele_to_affinity = transpose_nested_dictionary( - allele_to_peptide_to_affinity) - n_binding_values = sum( - len(allele_dict) - for allele_dict in - allele_to_peptide_to_affinity.values() - ) - print("Loaded %d binding values for %d alleles" % ( - n_binding_values, - len(allele_to_peptide_to_affinity))) - - X, peptide_list, allele_list = \ - dense_matrix_from_nested_dictionary(peptide_to_allele_to_affinity) - - missing_mask = np.isnan(X) - observed_mask = ~missing_mask - - n_observed_per_peptide = observed_mask.sum(axis=1) - min_observed_per_peptide = n_observed_per_peptide.min() - min_peptide_indices = np.where( - n_observed_per_peptide == min_observed_per_peptide)[0] - print("%d peptides with %d 
observations" % ( - len(min_peptide_indices), - min_observed_per_peptide)) - - n_observed_per_allele = observed_mask.sum(axis=0) - min_observed_per_allele = n_observed_per_allele.min() - min_allele_indices = np.where( - n_observed_per_allele == min_observed_per_allele)[0] - print("%d alleles with %d observations: %s" % ( - len(min_allele_indices), - min_observed_per_allele, - [allele_list[i] for i in min_allele_indices])) - return X, missing_mask, observed_mask, peptide_list, allele_list - - -def index_counts(indices): - max_index = indices.max() - counts = np.zeros(max_index + 1, dtype=int) - for index in indices: - counts[index] += 1 - return counts - - -def stratified_cross_validation(X, observed_mask, n_folds=10): - n_observed = observed_mask.sum() - - (observed_peptide_index, observed_allele_index) = np.where(observed_mask) - observed_indices = np.ravel_multi_index( - (observed_peptide_index, observed_allele_index), - dims=observed_mask.shape) - - assert len(observed_indices) == n_observed - - observed_allele_counts = observed_mask.sum(axis=0) - print("# observed per allele: %s" % (observed_allele_counts,)) - assert (index_counts(observed_allele_index) == observed_allele_counts).all() - - kfold = StratifiedKFold( - observed_allele_index, - n_folds=n_folds, - shuffle=True) - - for (_, indirect_test_indices) in kfold: - test_linear_indices = observed_indices[indirect_test_indices] - test_coords = np.unravel_index( - test_linear_indices, - dims=observed_mask.shape) - - test_allele_counts = index_counts(test_coords[1]) - allele_fractions = test_allele_counts / observed_allele_counts.astype(float) - print("Fraction of each allele in this CV fold: %s" % (allele_fractions,)) - - X_test_vector = X[test_coords] - - X_fold = X.copy() - X_fold[test_coords] = np.nan - - empty_row_mask = np.isfinite(X_fold).sum(axis=1) == 0 - ok_row_mask = ~empty_row_mask - ok_row_indices = np.where(ok_row_mask)[0] - - empty_col_mask = np.isfinite(X_fold).sum(axis=0) == 0 - ok_col_mask = 
~empty_col_mask - ok_col_indices = np.where(ok_col_mask)[0] - - ok_mesh = np.ix_(ok_row_indices, ok_col_indices) - - print("Dropping %d empty rows, %d empty columns" % ( - empty_row_mask.sum(), - empty_col_mask.sum())) - yield (X_fold, ok_mesh, test_coords, X_test_vector) - if __name__ == "__main__": args = parser.parse_args() print(args) @@ -283,12 +144,17 @@ if __name__ == "__main__": ) print("Imputation methods: %s" % imputation_methods) - X, missing_mask, observed_mask, peptide_list, allele_list = load_data( + X, observed_mask, peptide_list, allele_list = load_data( binding_data_csv=args.binding_data_csv, max_ic50=args.max_ic50, only_human=args.only_human, - min_allele_size=args.n_folds) - + min_observations_per_allele=args.n_folds, + min_observations_per_peptide=args.min_observations_per_peptide) + print("Loaded binding data, shape: %s, n_observed=%d/%d (%0.2f%%)" % ( + X.shape, + observed_mask.sum(), + X.size, + 100.0 * observed_mask.sum() / X.size)) if args.save_incomplete_affinity_matrix: print("Saving incomplete data to %s" % args.save_incomplete_affinity_matrix) df = pd.DataFrame(X, columns=allele_list, index=peptide_list) diff --git a/experiments/matrix_completion_helpers.py b/experiments/matrix_completion_helpers.py new file mode 100644 index 00000000..562cb020 --- /dev/null +++ b/experiments/matrix_completion_helpers.py @@ -0,0 +1,181 @@ +# Copyright (c) 2015. Mount Sinai School of Medicine +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from fancyimpute.dictionary_helpers import ( + dense_matrix_from_nested_dictionary, + transpose_nested_dictionary, +) +from mhcflurry.data import load_allele_dicts +import sklearn.metrics +from sklearn.cross_validation import StratifiedKFold +from scipy import stats +import numpy as np + + +def evaluate_predictions( + y_true, + y_pred, + max_ic50): + """ + Return mean absolute error, Kendall tau, AUC and F1 score + """ + if len(y_pred) != len(y_true): + raise ValueError("y_pred must have same number of elements as y_true") + + mae = np.mean(np.abs(y_true - y_pred)) + + # if all predictions are the same + if (y_pred[0] == y_pred).all(): + return (mae, 0, 0.5, 0) + + tau, _ = stats.kendalltau( + y_pred, + y_true) + + true_ic50s = max_ic50 ** (1.0 - np.array(y_true)) + predicted_ic50s = max_ic50 ** (1.0 - np.array(y_pred)) + + true_binding_label = true_ic50s <= 500 + if true_binding_label.all() or not true_binding_label.any(): + # can't compute AUC or F1 without both negative and positive cases + return (mae, tau, 0.5, 0) + + auc = sklearn.metrics.roc_auc_score(true_binding_label, y_pred) + + predicted_binding_label = predicted_ic50s <= 500 + if predicted_binding_label.all() or not predicted_binding_label.any(): + # can't compute F1 without both positive and negative predictions + return (mae, tau, auc, 0) + + f1_score = sklearn.metrics.f1_score( + true_binding_label, + predicted_binding_label) + + return mae, tau, auc, f1_score + + +def load_data( + binding_data_csv, + max_ic50, + only_human=False, + min_observations_per_allele=1, + min_observations_per_peptide=1): + allele_to_peptide_to_affinity = load_allele_dicts( + binding_data_csv, + max_ic50=max_ic50, + only_human=only_human, + regression_output=True) + peptide_to_allele_to_affinity = transpose_nested_dictionary( + allele_to_peptide_to_affinity) + n_binding_values = sum( + len(allele_dict) + for allele_dict in + allele_to_peptide_to_affinity.values() + ) + print("Loaded %d binding values for %d alleles" % ( 
+ n_binding_values,
+ len(allele_to_peptide_to_affinity)))
+
+ X, peptide_list, allele_list = \
+ dense_matrix_from_nested_dictionary(peptide_to_allele_to_affinity)
+
+ observed_mask = np.isfinite(X)
+
+ n_observed_per_peptide = observed_mask.sum(axis=1)
+ too_few_peptide_observations = (
+ n_observed_per_peptide < min_observations_per_peptide)
+ if too_few_peptide_observations.any():
+ drop_peptide_indices = np.where(too_few_peptide_observations)[0]
+ keep_peptide_indices = np.where(~too_few_peptide_observations)[0]
+ print("Dropping %d peptides with <%d observations" % (
+ len(drop_peptide_indices),
+ min_observations_per_peptide))
+ X = X[keep_peptide_indices]
+ observed_mask = observed_mask[keep_peptide_indices]
+ peptide_list = [peptide_list[i] for i in keep_peptide_indices]
+
+ n_observed_per_allele = observed_mask.sum(axis=0)
+ too_few_allele_observations = (
+ n_observed_per_allele < min_observations_per_allele)
+ if too_few_allele_observations.any():
+ drop_allele_indices = np.where(too_few_allele_observations)[0]
+ keep_allele_indices = np.where(~too_few_allele_observations)[0]
+
+ print("Dropping %d alleles with <%d observations: %s" % (
+ len(drop_allele_indices),
+ min_observations_per_allele,
+ [allele_list[i] for i in drop_allele_indices]))
+ X = X[:, keep_allele_indices]
+ observed_mask = observed_mask[:, keep_allele_indices]
+ allele_list = [allele_list[i] for i in keep_allele_indices]
+
+ return X, observed_mask, peptide_list, allele_list
+
+
+def index_counts(indices):
+ max_index = indices.max()
+ counts = np.zeros(max_index + 1, dtype=int)
+ for index in indices:
+ counts[index] += 1
+ return counts
+
+
+def stratified_cross_validation(X, observed_mask, n_folds=10):
+ n_observed = observed_mask.sum()
+
+ (observed_peptide_index, observed_allele_index) = np.where(observed_mask)
+ observed_indices = np.ravel_multi_index(
+ (observed_peptide_index, observed_allele_index),
+ dims=observed_mask.shape)
+
+ assert len(observed_indices) == n_observed
+ + observed_allele_counts = observed_mask.sum(axis=0) + print("# observed per allele: %s" % (observed_allele_counts,)) + assert (index_counts(observed_allele_index) == observed_allele_counts).all() + + kfold = StratifiedKFold( + observed_allele_index, + n_folds=n_folds, + shuffle=True) + + for (_, indirect_test_indices) in kfold: + test_linear_indices = observed_indices[indirect_test_indices] + test_coords = np.unravel_index( + test_linear_indices, + dims=observed_mask.shape) + + test_allele_counts = index_counts(test_coords[1]) + allele_fractions = test_allele_counts / observed_allele_counts.astype(float) + print("Fraction of each allele in this CV fold: %s" % (allele_fractions,)) + + X_test_vector = X[test_coords] + + X_fold = X.copy() + X_fold[test_coords] = np.nan + + empty_row_mask = np.isfinite(X_fold).sum(axis=1) == 0 + ok_row_mask = ~empty_row_mask + ok_row_indices = np.where(ok_row_mask)[0] + + empty_col_mask = np.isfinite(X_fold).sum(axis=0) == 0 + ok_col_mask = ~empty_col_mask + ok_col_indices = np.where(ok_col_mask)[0] + + ok_mesh = np.ix_(ok_row_indices, ok_col_indices) + + print("Dropping %d empty rows, %d empty columns" % ( + empty_row_mask.sum(), + empty_col_mask.sum())) + yield (X_fold, ok_mesh, test_coords, X_test_vector) -- GitLab