From 32d6b97048c0387ee0d47a6829917beababb7452 Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Mon, 19 Oct 2015 18:11:35 -0400
Subject: [PATCH] Add script to summarize performance of multiple classifiers
 on test data

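The new experiments/evaluate-predictions.py script reads one <allele>.csv
file per allele from --test-data-dir, treats the column named by
--true-ic50-column-name (default "meas") as the measured IC50, drops the
peptide sequence and length columns, and scores every remaining column as a
predictor's IC50 output. It prints per-allele AUC, accuracy, and F1 at the
500 nM binder cutoff, plus aggregate (median/mean/IQR) AUC and F1 per
predictor.

For reference, the per-allele metrics reduce to the following minimal
standalone sketch (toy IC50 values, purely illustrative):

    import numpy as np
    from sklearn.metrics import roc_auc_score

    # Toy measured and predicted IC50 values (nM) for four peptides.
    true_ic50 = np.array([50.0, 2000.0, 300.0, 10000.0])
    pred_ic50 = np.array([80.0, 900.0, 450.0, 7000.0])

    # Binders are defined by the 500 nM cutoff used throughout the script.
    true_label = true_ic50 <= 500
    pred_label = pred_ic50 <= 500

    # AUC is computed on -pred_ic50 because lower IC50 means stronger binding.
    auc = roc_auc_score(true_label, -pred_ic50)
    accuracy = np.mean(true_label == pred_label)

    # F1 from precision and recall, as in f1_score in model_selection_helpers.
    tp = (true_label & pred_label).sum()
    fp = ((~true_label) & pred_label).sum()
    fn = (true_label & (~pred_label)).sum()
    precision = tp / float(tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / float(tp + fn) if (tp + fn) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

    print("AUC=%0.4f, acc=%0.4f, F1=%0.4f" % (auc, accuracy, f1))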
---
 experiments/evaluate-predictions.py    | 132 +++++++++++++++++++++++++
 experiments/evaluate-test-set.py       |  68 -------------
 experiments/model_selection_helpers.py |  32 +++---
 3 files changed, 146 insertions(+), 86 deletions(-)
 create mode 100644 experiments/evaluate-predictions.py
 delete mode 100644 experiments/evaluate-test-set.py

diff --git a/experiments/evaluate-predictions.py b/experiments/evaluate-predictions.py
new file mode 100644
index 00000000..4df3cb46
--- /dev/null
+++ b/experiments/evaluate-predictions.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python
+#
+# Copyright (c) 2015. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Compute accuracy, AUC, and F1 score of multiple predictors on
+allele-specific test datasets.
+"""
+
+from os import listdir
+from os.path import join
+from argparse import ArgumentParser
+from collections import defaultdict, OrderedDict
+
+import pandas as pd
+import numpy as np
+from sklearn.metrics import roc_auc_score
+
+from model_selection_helpers import f1_score
+
+parser = ArgumentParser()
+
+parser.add_argument(
+    "--test-data-dir",
+    help="Directory which contains one CSV file per allele",
+    required=True)
+
+parser.add_argument(
+    "--true-ic50-column-name",
+    default="meas")
+
+parser.add_argument(
+    "--peptide-sequence-column-name",
+    default="sequence")
+
+parser.add_argument(
+    "--peptide-length-column-name",
+    default="length")
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+
+    # maps each predictor name to an OrderedDict of per-allele metric lists
+    results = defaultdict(lambda: OrderedDict([
+        ("allele", []),
+        ("length", []),
+        ("auc", []),
+        ("accuracy", []),
+        ("f1", [])]
+    ))
+
+    for filename in listdir(args.test_data_dir):
+        filepath = join(args.test_data_dir, filename)
+        parts = filename.split(".")
+        if len(parts) != 2:
+            print("Skipping %s" % filepath)
+            continue
+        allele, ext = parts
+        if ext != "csv":
+            print("Skipping %s, only reading CSV files" % filepath)
+            continue
+        df = pd.read_csv(filepath)
+        columns = set(df.columns)
+        drop_columns = {
+            args.true_ic50_column_name,
+            args.peptide_length_column_name,
+            args.peptide_sequence_column_name,
+        }
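+        # every remaining column is treated as a predictor's IC50 predictions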
+        predictor_names = columns.difference(drop_columns)
+        true_ic50 = df[args.true_ic50_column_name]
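+        # peptides with measured IC50 <= 500 nM are counted as binders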
+        true_label = true_ic50 <= 500
+        n = len(df)
+        print("%s (total = %d, n_pos = %d, n_neg = %d)" % (
+            allele,
+            n,
+            true_label.sum(),
+            n - true_label.sum()))
+
+        for predictor in sorted(predictor_names):
+            pred_ic50 = df[predictor]
+            pred_label = pred_ic50 <= 500
+            if true_label.std() == 0:
+                # can't compute AUC from single class
+                auc = np.nan
+            else:
+                # negate IC50 so that larger scores mean stronger predicted binding
+                auc = roc_auc_score(true_label, -pred_ic50)
+
+            f1 = f1_score(true_label, pred_label)
+            accuracy = np.mean(true_label == pred_label)
+            print("-- %s AUC=%0.4f, acc=%0.4f, F1=%0.4f" % (
+                predictor,
+                auc,
+                accuracy,
+                f1))
+            results[predictor]["allele"].append(allele)
+            results[predictor]["length"].append(n)
+            results[predictor]["f1"].append(f1)
+            results[predictor]["accuracy"].append(accuracy)
+            results[predictor]["auc"].append(auc)
+
+    print("\n === Aggregate Results ===\n")
+    for (predictor, data) in sorted(results.items()):
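+        # summarize this predictor's per-allele metrics across all test alleles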
+        df = pd.DataFrame(data)
+        print(predictor)
+        aucs = df["auc"]
+        auc_lower = aucs.quantile(0.25)
+        auc_upper = aucs.quantile(0.75)
+        auc_iqr = auc_upper - auc_lower
+        print("-- AUC: median=%0.4f, mean=%0.4f, iqr=%0.4f" % (
+            aucs.median(),
+            aucs.mean(),
+            auc_iqr))
+        f1s = df["f1"]
+        f1_lower = f1s.quantile(0.25)
+        f1_upper = f1s.quantile(0.75)
+        f1_iqr = f1_upper - f1_lower
+        print("-- F1: median=%0.4f, mean=%0.4f, iqr=%0.4f" % (
+            f1s.median(),
+            f1s.mean(),
+            f1_iqr))
diff --git a/experiments/evaluate-test-set.py b/experiments/evaluate-test-set.py
deleted file mode 100644
index 8de3d192..00000000
--- a/experiments/evaluate-test-set.py
+++ /dev/null
@@ -1,68 +0,0 @@
-#!/usr/bin/env python
-#
-# Copyright (c) 2015. Mount Sinai School of Medicine
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from os.path import join, exists
-from os import makedirs
-from argparse import ArgumentParser
-
-from test_data import load_test_data
-
-parser = ArgumentParser()
-
-
-parser.add_argument(
-    "--test-data-input-dirs",
-    nargs='*',
-    type=str,
-    help="Multiple directories from other predictors",
-    required=True)
-
-parser.add_argument(
-    "--test-data-input-sep",
-    default="\s+",
-    help="Separator to use for loading test data CSV/TSV files",
-    type=str)
-
-parser.add_argument(
-    "--test-data-output-dir",
-    help="Save combined test datasets to this directory",
-    required=True)
-
-if __name__ == "__main__":
-    args = parser.parse_args()
-    dataframes, predictor_names = load_test_data(args.test_data_input_dirs)
-    if not exists(args.test_data_output_dir):
-        makedirs(args.test_data_output_dir)
-
-    print("Loaded test data:")
-    for (allele, df) in dataframes.items():
-        df.index.name = "sequence"
-        print("%s: %d results" % (allele, len(df)))
-        filename = "blind-%s.csv" % allele
-        filepath = join(args.test_data_output_dir, filename)
-        df.to_csv(filepath)
-
-    assert False
-    """
-    combined_df = evaluate_model_configs(
-        configs=configs,
-        results_filename=args.output,
-        train_fn=lambda config: evaluate_model_config_train_vs_test(
-            config,
-            training_allele_datasets=training_datasets,
-            testing_allele_datasets=testing_datasets,
-            min_samples_per_allele=5))
-    """
diff --git a/experiments/model_selection_helpers.py b/experiments/model_selection_helpers.py
index ce08c823..1b8bf21c 100644
--- a/experiments/model_selection_helpers.py
+++ b/experiments/model_selection_helpers.py
@@ -19,7 +19,6 @@ from __future__ import (
     unicode_literals
 )
 from collections import OrderedDict
-import logging
 
 import numpy as np
 import sklearn
@@ -34,30 +33,27 @@ from mhcflurry.data_helpers import indices_to_hotshot_encoding
 from score_collection import ScoreCollection
 
 
-def score_predictions(predicted_log_ic50, true_label, max_ic50):
-    """Computes accuracy, AUC, and F1 score of predictions"""
-    auc = sklearn.metrics.roc_auc_score(true_label, predicted_log_ic50)
-    ic50_pred = max_ic50 ** (1.0 - predicted_log_ic50)
-    label_pred = (ic50_pred <= 500)
-    same_mask = true_label == label_pred
-    accuracy = np.mean(same_mask)
+def f1_score(true_label, label_pred):
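+    """Compute the F1 score from boolean true and predicted binder labels."""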
     tp = (true_label & label_pred).sum()
     fp = ((~true_label) & label_pred).sum()
-    tn = ((~true_label) & (~label_pred)).sum()
     fn = (true_label & (~label_pred)).sum()
     sensitivity = (tp / float(tp + fn)) if (tp + fn) > 0 else 0.0
     precision = (tp / float(tp + fp)) if (tp + fp) > 0 else 0.0
     if (precision + sensitivity) > 0:
-        f1_score = (2 * precision * sensitivity) / (precision + sensitivity)
+        return (2 * precision * sensitivity) / (precision + sensitivity)
     else:
-        f1_score = 0.0
-    # sanity check that we're computing accuracy correctly
-    accuracy_estimate2 = (tp + tn) / float(tp + fp + tn + fn)
-    if abs(accuracy - accuracy_estimate2) > 0.00001:
-        logging.warn(
-            "!!! Conflicting accuracy estimates! (%0.5f vs. %0.5f)" % (
-                accuracy, accuracy_estimate2))
-    return accuracy, auc, f1_score
+        return 0.0
+
+
+def score_predictions(predicted_log_ic50, true_label, max_ic50):
+    """Computes accuracy, AUC, and F1 score of predictions"""
+    auc = sklearn.metrics.roc_auc_score(true_label, predicted_log_ic50)
+    ic50_pred = max_ic50 ** (1.0 - predicted_log_ic50)
+    label_pred = (ic50_pred <= 500)
+    same_mask = true_label == label_pred
+    accuracy = np.mean(same_mask)
+    f1 = f1_score(true_label, label_pred)
+    return accuracy, auc, f1
 
 
 def train_model_and_return_scores(
-- 
GitLab