matrix completion accuracy test with fancyimpute

005b8174 · Alex Rubinsteyn · ec6a49d9 · 005b8174 · 005b8174 · 005b8174
Commit 005b8174 authored 9 years ago by Alex Rubinsteyn
--- a/experiments/allele_similarities.py
+++ b/experiments/allele_similarities.py
@@ -20,7 +20,7 @@ Helper functions for computing pairwise similarities between alleles

 import numpy as np
 import seaborn
-from fancyimpute import ConvexSolver
+from fancyimpute import NuclearNormMinimization

 from common import matrix_to_dictionary, curry_dictionary

@@ -181,7 +181,7 @@ def fill_in_similarities(
        sims_matrix.shape,
        missing.sum()))

-    solver = ConvexSolver(
+    solver = NuclearNormMinimization(
        require_symmetric_solution=True,
        min_value=0.0,
        max_value=1.0,

--- a/experiments/best-synthetic-data-hyperparams.py
+++ b/experiments/best-synthetic-data-hyperparams.py
@@ -31,7 +31,6 @@ from mhcflurry.data import load_allele_dicts
 from scipy import stats
 import numpy as np
 import sklearn.metrics
-from sklearn.cross_validation import KFold

 from dataset_paths import PETERS2009_CSV_PATH
 from synthetic_data import (

--- a/experiments/cross_validation_affinity_data.py
+++ b/experiments/cross_validation_affinity_data.py
+# Copyright (c) 2015. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from sklearn.cross_validation import KFold
+
+
+def generate_cross_validation_datasets(
+        allele_to_peptide_to_affinity,
+        n_folds=4):
+    for allele, dataset in sorted(allele_to_peptide_to_affinity.items()):
+        peptides = list(dataset.keys())
+        affinities = list(dataset.values())
+        n_samples = len(peptides)
+        print("Generating similarities for folds of %s data (n=%d)" % (
+            allele,
+            n_samples))
+
+        if n_samples < n_folds:
+            print("Too few samples (%d) for %d-fold cross-validation" % (
+                n_samples,
+                n_folds))
+            continue
+
+        kfold_iterator = enumerate(
+            KFold(n_samples, n_folds=n_folds, shuffle=True))
+
+        for fold_number, (train_indices, test_indices) in kfold_iterator:
+            train_peptides = [peptides[i] for i in train_indices]
+            train_affinities = [affinities[i] for i in train_indices]
+            test_peptide_set = set([peptides[i] for i in test_indices])
+            # copy the affinity data for all alleles other than this one
+            fold_affinity_dict = {
+                allele_key: affinity_dict
+                for (allele_key, affinity_dict)
+                in allele_to_peptide_to_affinity.items()
+                if allele_key != allele
+            }
+            # include an affinity dictionary for this allele which
+            # only uses training data
+            fold_affinity_dict[allele] = {
+                train_peptide: train_affinity
+                for (train_peptide, train_affinity)
+                in zip(train_peptides, train_affinities)
+            }
+            allele_similarities, overlap_counts, overlap_weights = \
+                compute_allele_similarities(
+                    allele_to_peptide_to_affinity=fold_affinity_dict,
+                    min_weight=0.1)
+            this_allele_similarities = allele_similarities[allele]
+
+            yield (
+                allele,
+                dataset,
+                fold_number,
+                this_allele_similarities,
+                test_peptide_set
+            )
\ No newline at end of file
--- a/experiments/matrix-completion-accuracy.py
+++ b/experiments/matrix-completion-accuracy.py
+#!/usr/bin/env python
+#
+# Copyright (c) 2015. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+from collections import OrderedDict
+
+from fancyimpute import (
+    SoftImpute,
+    IterativeSVD,
+    SimilarityWeightedAveraging,
+    DenseKNN,
+    MICE,
+    BiScaler,
+    SimpleFill,
+)
+from fancyimpute.dictionary_helpers import (
+    dense_matrix_from_nested_dictionary,
+    transpose_nested_dictionary,
+)
+from mhcflurry.data import load_allele_dicts
+import sklearn.metrics
+from sklearn.cross_validation import StratifiedKFold
+from scipy import stats
+import numpy as np
+
+from dataset_paths import PETERS2009_CSV_PATH
+
+
+parser = argparse.ArgumentParser()
+
+parser.add_argument(
+    "--binding-data-csv",
+    default=PETERS2009_CSV_PATH)
+
+parser.add_argument(
+    "--max-ic50",
+    default=50000.0,
+    type=float)
+
+parser.add_argument(
+    "--only-human",
+    default=False,
+    action="store_true")
+
+parser.add_argument(
+    "--output-file",
+    default="matrix-completion-accuracy-results.csv")
+
+parser.add_argument(
+    "--biscale",
+    default=False,
+    action="store_true",
+    help="Normalize rows and columns of affinity matrix simultaneously")
+
+parser.add_argument(
+    "--n-folds",
+    default=5,
+    type=int,
+    help="Number of cross-validation folds")
+
+
+def evaluate_predictions(
+        y_true,
+        y_pred,
+        max_ic50):
+    """
+    Return mean absolute error, Kendall tau, AUC and F1 score
+    """
+    if len(y_pred) != len(y_true):
+        raise ValueError("y_pred must have same number of elements as y_true")
+
+    mae = np.mean(np.abs(y_true - y_pred))
+
+    # if all predictions are the same
+    if (y_pred[0] == y_pred).all():
+        return (mae, 0, 0.5, 0)
+
+    tau, _ = stats.kendalltau(
+        y_pred,
+        y_true)
+
+    true_ic50s = max_ic50 ** (1.0 - np.array(y_true))
+    predicted_ic50s = max_ic50 ** (1.0 - np.array(y_pred))
+
+    true_binding_label = true_ic50s <= 500
+    if true_binding_label.all() or not true_binding_label.any():
+        # can't compute AUC or F1 without both negative and positive cases
+        return (mae, tau, 0.5, 0)
+
+    auc = sklearn.metrics.roc_auc_score(true_binding_label, y_pred)
+
+    predicted_binding_label = predicted_ic50s <= 500
+    if predicted_binding_label.all() or not predicted_binding_label.any():
+        # can't compute F1 without both positive and negative predictions
+        return (mae, tau, auc, 0)
+
+    f1_score = sklearn.metrics.f1_score(
+        true_binding_label,
+        predicted_binding_label)
+
+    return mae, tau, auc, f1_score
+
+VERBOSE = True
+
+imputation_methods = {
+    #"softImpute": SoftImpute(verbose=VERBOSE),
+    #"svdImpute-5": IterativeSVD(5, verbose=VERBOSE),
+    #"svdImpute-10": IterativeSVD(10, verbose=VERBOSE),
+    #"svdImpute-20": IterativeSVD(20, verbose=VERBOSE),
+    # "colSims": SimilarityWeightedAveraging(
+    #    orientation="columns",
+    #    verbose=VERBOSE),
+    "meanFill": SimpleFill("mean"),
+    "zeroFill": SimpleFill("zero"),
+    # "MICE": MICE(verbose=VERBOSE),
+    "knnImpute-3": DenseKNN(3, orientation="columns", verbose=VERBOSE, print_interval=1),
+    "knnImpute-7": DenseKNN(7, orientation="columns", verbose=VERBOSE, print_interval=1),
+    "knnImpute-15": DenseKNN(15, orientation="columns", verbose=VERBOSE, print_interval=1),
+}
+
+
+class ScoreSet(object):
+    """
+    Useful for keeping a collection of score dictionaries
+    which map name->score type->list of values.
+    """
+    def __init__(self, verbose=True):
+        self.groups = {}
+        self.verbose = verbose
+
+    def add_many(self, group, **kwargs):
+        for (k, v) in sorted(kwargs.items()):
+            self.add(group, k, v)
+
+    def add(self, group, score_type, value):
+        if group not in self.groups:
+            self.groups[group] = {}
+        if score_type not in self.groups[group]:
+            self.groups[group][score_type] = []
+        self.groups[group][score_type].append(value)
+        if self.verbose:
+            print("--> %s:%s %0.4f" % (group, score_type, value))
+
+    def averages(self):
+        return {
+            group:
+                OrderedDict([
+                    (score_type, np.mean(scores))
+                    for (score_type, scores)
+                    in sorted(score_dict.items())
+                ])
+            for (group, score_dict)
+            in self.groups.items()
+        }
+
+    def score_types(self):
+        result = set([])
+        for (g, d) in sorted(self.groups.items()):
+            for score_type in sorted(d.keys()):
+                result.add(score_type)
+        return list(result)
+
+    def to_csv(self, filename):
+        with open(filename, "w") as f:
+            score_types = scores.score_types()
+            header_list = ["name"] + list(score_types)
+            header_line = ",".join(header_list) + "\n"
+            if self.verbose:
+                print(header_line)
+            f.write(header_line)
+            for name, score_type_dict in sorted(scores.averages().items()):
+                line_elements = [name]
+                for _, value in sorted(score_type_dict.items()):
+                    line_elements.append("%0.4f" % value)
+                line = ",".join(line_elements) + "\n"
+                if self.verbose:
+                    print(line)
+                f.write(line)
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    print(args)
+
+    allele_to_peptide_to_affinity = load_allele_dicts(
+        args.binding_data_csv,
+        max_ic50=args.max_ic50,
+        only_human=args.only_human,
+        regression_output=True)
+    peptide_to_allele_to_affinity = transpose_nested_dictionary(
+        allele_to_peptide_to_affinity)
+    n_binding_values = sum(
+        len(allele_dict)
+        for allele_dict in
+        allele_to_peptide_to_affinity.values()
+    )
+    print("Loaded %d binding values for %d alleles" % (
+        n_binding_values,
+        len(allele_to_peptide_to_affinity)))
+
+    X, peptide_order, allele_order = \
+        dense_matrix_from_nested_dictionary(peptide_to_allele_to_affinity)
+
+    scores = ScoreSet()
+
+    missing_mask = np.isnan(X)
+    observed_mask = ~missing_mask
+
+    n_observed = observed_mask.sum()
+
+    (observed_x, observed_y) = np.where(observed_mask)
+    observed_indices = np.ravel_multi_index(
+        (observed_x, observed_y),
+        dims=observed_mask.shape)
+
+    assert len(observed_indices) == n_observed
+
+    kfold = StratifiedKFold(observed_y, n_folds=5, shuffle=True)
+    biscaler = BiScaler(scale_rows=args.biscale, center_rows=args.biscale)
+
+    for fold_idx, (_, indirect_test_indices) in enumerate(kfold):
+
+        test_linear_indices = observed_indices[indirect_test_indices]
+        test_coords = np.unravel_index(
+            test_linear_indices,
+            dims=observed_mask.shape)
+        y_true = X[test_coords]
+
+        X_fold = X.copy()
+        X_fold[test_coords] = np.nan
+
+        empty_row_mask = np.isfinite(X_fold).sum(axis=1) == 0
+        ok_row_mask = ~empty_row_mask
+        ok_row_indices = np.where(ok_row_mask)[0]
+
+        empty_col_mask = np.isfinite(X_fold).sum(axis=0) == 0
+        ok_col_mask = ~empty_col_mask
+        ok_col_indices = np.where(ok_col_mask)[0]
+
+        ok_mesh = np.ix_(ok_row_indices, ok_col_indices)
+
+        print("Dropping %d empty rows, %d empty columns" % (
+            empty_row_mask.sum(),
+            empty_col_mask.sum()))
+
+        X_fold_reduced = X_fold[ok_mesh]
+        X_fold_reduced_scaled = biscaler.fit_transform(X=X_fold_reduced)
+        for (method_name, solver) in sorted(imputation_methods.items()):
+            print("CV fold %d/%d, running %s" % (
+                fold_idx + 1,
+                args.n_folds,
+                method_name))
+            X_completed_reduced_scaled = solver.complete(X_fold_reduced)
+            X_completed_reduced = biscaler.inverse_transform(
+                X_completed_reduced_scaled)
+            X_completed = np.zeros_like(X)
+            X_completed[ok_mesh] = X_completed_reduced
+            y_pred = X_completed[test_coords]
+
+            mae, tau, auc, f1_score = evaluate_predictions(
+                y_true=y_true, y_pred=y_pred, max_ic50=args.max_ic50)
+
+            scores.add_many(
+                method_name,
+                mae=mae,
+                tau=tau,
+                f1_score=f1_score,
+                auc=auc)
+
+    scores.to_csv(args.output_file)