Skip to content
Snippets Groups Projects
Commit 005b8174 authored by Alex Rubinsteyn's avatar Alex Rubinsteyn
Browse files

matrix completion accuracy test with fancyimpute

parent ec6a49d9
No related branches found
No related tags found
No related merge requests found
......@@ -20,7 +20,7 @@ Helper functions for computing pairwise similarities between alleles
import numpy as np
import seaborn
from fancyimpute import ConvexSolver
from fancyimpute import NuclearNormMinimization
from common import matrix_to_dictionary, curry_dictionary
......@@ -181,7 +181,7 @@ def fill_in_similarities(
sims_matrix.shape,
missing.sum()))
solver = ConvexSolver(
solver = NuclearNormMinimization(
require_symmetric_solution=True,
min_value=0.0,
max_value=1.0,
......
......@@ -31,7 +31,6 @@ from mhcflurry.data import load_allele_dicts
from scipy import stats
import numpy as np
import sklearn.metrics
from sklearn.cross_validation import KFold
from dataset_paths import PETERS2009_CSV_PATH
from synthetic_data import (
......
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from sklearn.cross_validation import KFold
def generate_cross_validation_datasets(
allele_to_peptide_to_affinity,
n_folds=4):
for allele, dataset in sorted(allele_to_peptide_to_affinity.items()):
peptides = list(dataset.keys())
affinities = list(dataset.values())
n_samples = len(peptides)
print("Generating similarities for folds of %s data (n=%d)" % (
allele,
n_samples))
if n_samples < n_folds:
print("Too few samples (%d) for %d-fold cross-validation" % (
n_samples,
n_folds))
continue
kfold_iterator = enumerate(
KFold(n_samples, n_folds=n_folds, shuffle=True))
for fold_number, (train_indices, test_indices) in kfold_iterator:
train_peptides = [peptides[i] for i in train_indices]
train_affinities = [affinities[i] for i in train_indices]
test_peptide_set = set([peptides[i] for i in test_indices])
# copy the affinity data for all alleles other than this one
fold_affinity_dict = {
allele_key: affinity_dict
for (allele_key, affinity_dict)
in allele_to_peptide_to_affinity.items()
if allele_key != allele
}
# include an affinity dictionary for this allele which
# only uses training data
fold_affinity_dict[allele] = {
train_peptide: train_affinity
for (train_peptide, train_affinity)
in zip(train_peptides, train_affinities)
}
allele_similarities, overlap_counts, overlap_weights = \
compute_allele_similarities(
allele_to_peptide_to_affinity=fold_affinity_dict,
min_weight=0.1)
this_allele_similarities = allele_similarities[allele]
yield (
allele,
dataset,
fold_number,
this_allele_similarities,
test_peptide_set
)
\ No newline at end of file
#!/usr/bin/env python
#
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from collections import OrderedDict
from fancyimpute import (
SoftImpute,
IterativeSVD,
SimilarityWeightedAveraging,
DenseKNN,
MICE,
BiScaler,
SimpleFill,
)
from fancyimpute.dictionary_helpers import (
dense_matrix_from_nested_dictionary,
transpose_nested_dictionary,
)
from mhcflurry.data import load_allele_dicts
import sklearn.metrics
from sklearn.cross_validation import StratifiedKFold
from scipy import stats
import numpy as np
from dataset_paths import PETERS2009_CSV_PATH
parser = argparse.ArgumentParser()
parser.add_argument(
"--binding-data-csv",
default=PETERS2009_CSV_PATH)
parser.add_argument(
"--max-ic50",
default=50000.0,
type=float)
parser.add_argument(
"--only-human",
default=False,
action="store_true")
parser.add_argument(
"--output-file",
default="matrix-completion-accuracy-results.csv")
parser.add_argument(
"--biscale",
default=False,
action="store_true",
help="Normalize rows and columns of affinity matrix simultaneously")
parser.add_argument(
"--n-folds",
default=5,
type=int,
help="Number of cross-validation folds")
def evaluate_predictions(
y_true,
y_pred,
max_ic50):
"""
Return mean absolute error, Kendall tau, AUC and F1 score
"""
if len(y_pred) != len(y_true):
raise ValueError("y_pred must have same number of elements as y_true")
mae = np.mean(np.abs(y_true - y_pred))
# if all predictions are the same
if (y_pred[0] == y_pred).all():
return (mae, 0, 0.5, 0)
tau, _ = stats.kendalltau(
y_pred,
y_true)
true_ic50s = max_ic50 ** (1.0 - np.array(y_true))
predicted_ic50s = max_ic50 ** (1.0 - np.array(y_pred))
true_binding_label = true_ic50s <= 500
if true_binding_label.all() or not true_binding_label.any():
# can't compute AUC or F1 without both negative and positive cases
return (mae, tau, 0.5, 0)
auc = sklearn.metrics.roc_auc_score(true_binding_label, y_pred)
predicted_binding_label = predicted_ic50s <= 500
if predicted_binding_label.all() or not predicted_binding_label.any():
# can't compute F1 without both positive and negative predictions
return (mae, tau, auc, 0)
f1_score = sklearn.metrics.f1_score(
true_binding_label,
predicted_binding_label)
return mae, tau, auc, f1_score
VERBOSE = True
imputation_methods = {
#"softImpute": SoftImpute(verbose=VERBOSE),
#"svdImpute-5": IterativeSVD(5, verbose=VERBOSE),
#"svdImpute-10": IterativeSVD(10, verbose=VERBOSE),
#"svdImpute-20": IterativeSVD(20, verbose=VERBOSE),
# "colSims": SimilarityWeightedAveraging(
# orientation="columns",
# verbose=VERBOSE),
"meanFill": SimpleFill("mean"),
"zeroFill": SimpleFill("zero"),
# "MICE": MICE(verbose=VERBOSE),
"knnImpute-3": DenseKNN(3, orientation="columns", verbose=VERBOSE, print_interval=1),
"knnImpute-7": DenseKNN(7, orientation="columns", verbose=VERBOSE, print_interval=1),
"knnImpute-15": DenseKNN(15, orientation="columns", verbose=VERBOSE, print_interval=1),
}
class ScoreSet(object):
"""
Useful for keeping a collection of score dictionaries
which map name->score type->list of values.
"""
def __init__(self, verbose=True):
self.groups = {}
self.verbose = verbose
def add_many(self, group, **kwargs):
for (k, v) in sorted(kwargs.items()):
self.add(group, k, v)
def add(self, group, score_type, value):
if group not in self.groups:
self.groups[group] = {}
if score_type not in self.groups[group]:
self.groups[group][score_type] = []
self.groups[group][score_type].append(value)
if self.verbose:
print("--> %s:%s %0.4f" % (group, score_type, value))
def averages(self):
return {
group:
OrderedDict([
(score_type, np.mean(scores))
for (score_type, scores)
in sorted(score_dict.items())
])
for (group, score_dict)
in self.groups.items()
}
def score_types(self):
result = set([])
for (g, d) in sorted(self.groups.items()):
for score_type in sorted(d.keys()):
result.add(score_type)
return list(result)
def to_csv(self, filename):
with open(filename, "w") as f:
score_types = scores.score_types()
header_list = ["name"] + list(score_types)
header_line = ",".join(header_list) + "\n"
if self.verbose:
print(header_line)
f.write(header_line)
for name, score_type_dict in sorted(scores.averages().items()):
line_elements = [name]
for _, value in sorted(score_type_dict.items()):
line_elements.append("%0.4f" % value)
line = ",".join(line_elements) + "\n"
if self.verbose:
print(line)
f.write(line)
if __name__ == "__main__":
args = parser.parse_args()
print(args)
allele_to_peptide_to_affinity = load_allele_dicts(
args.binding_data_csv,
max_ic50=args.max_ic50,
only_human=args.only_human,
regression_output=True)
peptide_to_allele_to_affinity = transpose_nested_dictionary(
allele_to_peptide_to_affinity)
n_binding_values = sum(
len(allele_dict)
for allele_dict in
allele_to_peptide_to_affinity.values()
)
print("Loaded %d binding values for %d alleles" % (
n_binding_values,
len(allele_to_peptide_to_affinity)))
X, peptide_order, allele_order = \
dense_matrix_from_nested_dictionary(peptide_to_allele_to_affinity)
scores = ScoreSet()
missing_mask = np.isnan(X)
observed_mask = ~missing_mask
n_observed = observed_mask.sum()
(observed_x, observed_y) = np.where(observed_mask)
observed_indices = np.ravel_multi_index(
(observed_x, observed_y),
dims=observed_mask.shape)
assert len(observed_indices) == n_observed
kfold = StratifiedKFold(observed_y, n_folds=5, shuffle=True)
biscaler = BiScaler(scale_rows=args.biscale, center_rows=args.biscale)
for fold_idx, (_, indirect_test_indices) in enumerate(kfold):
test_linear_indices = observed_indices[indirect_test_indices]
test_coords = np.unravel_index(
test_linear_indices,
dims=observed_mask.shape)
y_true = X[test_coords]
X_fold = X.copy()
X_fold[test_coords] = np.nan
empty_row_mask = np.isfinite(X_fold).sum(axis=1) == 0
ok_row_mask = ~empty_row_mask
ok_row_indices = np.where(ok_row_mask)[0]
empty_col_mask = np.isfinite(X_fold).sum(axis=0) == 0
ok_col_mask = ~empty_col_mask
ok_col_indices = np.where(ok_col_mask)[0]
ok_mesh = np.ix_(ok_row_indices, ok_col_indices)
print("Dropping %d empty rows, %d empty columns" % (
empty_row_mask.sum(),
empty_col_mask.sum()))
X_fold_reduced = X_fold[ok_mesh]
X_fold_reduced_scaled = biscaler.fit_transform(X=X_fold_reduced)
for (method_name, solver) in sorted(imputation_methods.items()):
print("CV fold %d/%d, running %s" % (
fold_idx + 1,
args.n_folds,
method_name))
X_completed_reduced_scaled = solver.complete(X_fold_reduced)
X_completed_reduced = biscaler.inverse_transform(
X_completed_reduced_scaled)
X_completed = np.zeros_like(X)
X_completed[ok_mesh] = X_completed_reduced
y_pred = X_completed[test_coords]
mae, tau, auc, f1_score = evaluate_predictions(
y_true=y_true, y_pred=y_pred, max_ic50=args.max_ic50)
scores.add_many(
method_name,
mae=mae,
tau=tau,
f1_score=f1_score,
auc=auc)
scores.to_csv(args.output_file)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment