From 32d6b97048c0387ee0d47a6829917beababb7452 Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Mon, 19 Oct 2015 18:11:35 -0400
Subject: [PATCH] added script to summarize performance on test data of
 multiple classifiers

---
 experiments/evaluate-predictions.py    | 132 +++++++++++++++++++++++++
 experiments/evaluate-test-set.py       |  68 -------------
 experiments/model_selection_helpers.py |  32 +++---
 3 files changed, 146 insertions(+), 86 deletions(-)
 create mode 100644 experiments/evaluate-predictions.py
 delete mode 100644 experiments/evaluate-test-set.py

diff --git a/experiments/evaluate-predictions.py b/experiments/evaluate-predictions.py
new file mode 100644
index 00000000..4df3cb46
--- /dev/null
+++ b/experiments/evaluate-predictions.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python
+#
+# Copyright (c) 2015. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Compute accuracy, AUC, and F1 score for allele-specific test datasets
+"""
+
+from os import listdir
+from os.path import join
+from argparse import ArgumentParser
+from collections import defaultdict, OrderedDict
+
+import pandas as pd
+import numpy as np
+from sklearn.metrics import roc_auc_score
+
+from model_selection_helpers import f1_score
+
+parser = ArgumentParser()
+
+parser.add_argument(
+    "--test-data-dir",
+    help="Directory which contains one CSV file per allele",
+    required=True)
+
+parser.add_argument(
+    "--true-ic50-column-name",
+    default="meas")
+
+parser.add_argument(
+    "--peptide-sequence-column-name",
+    default="sequence")
+
+parser.add_argument(
+    "--peptide-length-column-name",
+    default="length")
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+
+    # mapping from predictor names to dictionaries
+    results = defaultdict(lambda: OrderedDict([
+        ("allele", []),
+        ("length", []),
+        ("auc", []),
+        ("accuracy", []),
+        ("f1", [])]
+    ))
+
+    for filename in listdir(args.test_data_dir):
+        filepath = join(args.test_data_dir, filename)
+        parts = filename.split(".")
+        if len(parts) != 2:
+            print("Skipping %s" % filepath)
+            continue
+        allele, ext = parts
+        if ext != "csv":
+            print("Skipping %s, only reading CSV files" % filepath)
+            continue
+        df = pd.read_csv(filepath)
+        columns = set(df.columns)
+        drop_columns = {
+            args.true_ic50_column_name,
+            args.peptide_length_column_name,
+            args.peptide_sequence_column_name,
+        }
+        predictor_names = columns.difference(drop_columns)
+        true_ic50 = df[args.true_ic50_column_name]
+        true_label = true_ic50 <= 500
+        n = len(df)
+        print("%s (total = %d, n_pos = %d, n_neg = %d)" % (
+            allele,
+            n,
+            true_label.sum(),
+            n - true_label.sum()))
+
+        for predictor in sorted(predictor_names):
+            pred_ic50 = df[predictor]
+            pred_label = pred_ic50 <= 500
+            if true_label.std() == 0:
+                # can't compute AUC from single class
+                auc = np.nan
+            else:
+                # using negative IC50 since it's inversely related to binding
+                auc = roc_auc_score(true_label, -pred_ic50)
+
+            f1 = f1_score(true_label, pred_label)
+            accuracy = np.mean(true_label == pred_label)
+            print("-- %s AUC=%0.4f, acc=%0.4f, F1=%0.4f" % (
+                predictor,
+                auc,
+                accuracy,
+                f1))
+            results[predictor]["allele"].append(allele)
+            results[predictor]["length"].append(n)
+            results[predictor]["f1"].append(f1)
+            results[predictor]["accuracy"].append(accuracy)
+            results[predictor]["auc"].append(auc)
+
+    print("\n === Aggregate Results ===\n")
+    for (predictor, data) in sorted(results.items()):
+        df = pd.DataFrame(data)
+        print(predictor)
+        aucs = df["auc"]
+        auc_lower = aucs.quantile(0.25)
+        auc_upper = aucs.quantile(0.75)
+        auc_iqr = auc_upper - auc_lower
+        print("-- AUC: median=%0.4f, mean=%0.4f, iqr=%0.4f" % (
+            aucs.median(),
+            aucs.mean(),
+            auc_iqr))
+        f1s = df["f1"]
+        f1_lower = f1s.quantile(0.25)
+        f1_upper = f1s.quantile(0.75)
+        f1_iqr = f1_upper - f1_lower
+        print("-- F1: median=%0.4f, mean=%0.4f, iqr=%0.4f" % (
+            f1s.median(),
+            f1s.mean(),
+            f1_iqr))
diff --git a/experiments/evaluate-test-set.py b/experiments/evaluate-test-set.py
deleted file mode 100644
index 8de3d192..00000000
--- a/experiments/evaluate-test-set.py
+++ /dev/null
@@ -1,68 +0,0 @@
-#!/usr/bin/env python
-#
-# Copyright (c) 2015. Mount Sinai School of Medicine
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#      http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from os.path import join, exists
-from os import makedirs
-from argparse import ArgumentParser
-
-from test_data import load_test_data
-
-parser = ArgumentParser()
-
-
-parser.add_argument(
-    "--test-data-input-dirs",
-    nargs='*',
-    type=str,
-    help="Multiple directories from other predictors",
-    required=True)
-
-parser.add_argument(
-    "--test-data-input-sep",
-    default="\s+",
-    help="Separator to use for loading test data CSV/TSV files",
-    type=str)
-
-parser.add_argument(
-    "--test-data-output-dir",
-    help="Save combined test datasets to this directory",
-    required=True)
-
-if __name__ == "__main__":
-    args = parser.parse_args()
-    dataframes, predictor_names = load_test_data(args.test_data_input_dirs)
-    if not exists(args.test_data_output_dir):
-        makedirs(args.test_data_output_dir)
-
-    print("Loaded test data:")
-    for (allele, df) in dataframes.items():
-        df.index.name = "sequence"
-        print("%s: %d results" % (allele, len(df)))
-        filename = "blind-%s.csv" % allele
-        filepath = join(args.test_data_output_dir, filename)
-        df.to_csv(filepath)
-
-    assert False
-    """
-    combined_df = evaluate_model_configs(
-        configs=configs,
-        results_filename=args.output,
-        train_fn=lambda config: evaluate_model_config_train_vs_test(
-            config,
-            training_allele_datasets=training_datasets,
-            testing_allele_datasets=testing_datasets,
-            min_samples_per_allele=5))
-    """
diff --git a/experiments/model_selection_helpers.py b/experiments/model_selection_helpers.py
index ce08c823..1b8bf21c 100644
--- a/experiments/model_selection_helpers.py
+++ b/experiments/model_selection_helpers.py
@@ -19,7 +19,6 @@ from __future__ import (
     unicode_literals
 )
 from collections import OrderedDict
-import logging
 
 import numpy as np
 import sklearn
@@ -34,30 +33,27 @@ from mhcflurry.data_helpers import indices_to_hotshot_encoding
 from score_collection import ScoreCollection
 
 
-def score_predictions(predicted_log_ic50, true_label, max_ic50):
-    """Computes accuracy, AUC, and F1 score of predictions"""
-    auc = sklearn.metrics.roc_auc_score(true_label, predicted_log_ic50)
-    ic50_pred = max_ic50 ** (1.0 - predicted_log_ic50)
-    label_pred = (ic50_pred <= 500)
-    same_mask = true_label == label_pred
-    accuracy = np.mean(same_mask)
+def f1_score(true_label, label_pred, cutoff=500):
     tp = (true_label & label_pred).sum()
     fp = ((~true_label) & label_pred).sum()
-    tn = ((~true_label) & (~label_pred)).sum()
     fn = (true_label & (~label_pred)).sum()
     sensitivity = (tp / float(tp + fn)) if (tp + fn) > 0 else 0.0
     precision = (tp / float(tp + fp)) if (tp + fp) > 0 else 0.0
     if (precision + sensitivity) > 0:
-        f1_score = (2 * precision * sensitivity) / (precision + sensitivity)
+        return (2 * precision * sensitivity) / (precision + sensitivity)
     else:
-        f1_score = 0.0
-    # sanity check that we're computing accuracy correctly
-    accuracy_estimate2 = (tp + tn) / float(tp + fp + tn + fn)
-    if abs(accuracy - accuracy_estimate2) > 0.00001:
-        logging.warn(
-            "!!! Conflicting accuracy estimates! (%0.5f vs. %0.5f)" % (
-                accuracy, accuracy_estimate2))
-    return accuracy, auc, f1_score
+        return 0.0
+
+
+def score_predictions(predicted_log_ic50, true_label, max_ic50):
+    """Computes accuracy, AUC, and F1 score of predictions"""
+    auc = sklearn.metrics.roc_auc_score(true_label, predicted_log_ic50)
+    ic50_pred = max_ic50 ** (1.0 - predicted_log_ic50)
+    label_pred = (ic50_pred <= 500)
+    same_mask = true_label == label_pred
+    accuracy = np.mean(same_mask)
+    f1 = f1_score(true_label, label_pred)
+    return accuracy, auc, f1
 
 
 def train_model_and_return_scores(
-- 
GitLab
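
Usage note (not part of the patch itself): the sketch below illustrates the per-allele CSV layout that the new evaluate-predictions.py script expects, using its default column names ("meas", "sequence", "length") and its 500 nM binder cutoff. The allele name "A0201" and the predictor column "netmhc" are illustrative placeholders, not names taken from the patch.

    # Hypothetical helper, for illustration only: writes one toy per-allele
    # test-set CSV in the format consumed by evaluate-predictions.py.
    import os
    import tempfile

    import pandas as pd

    test_dir = tempfile.mkdtemp()
    toy = pd.DataFrame({
        "sequence": ["SIINFEKL", "GILGFVFTL", "NLVPMVATV"],  # peptide sequences
        "length": [8, 9, 9],                                 # peptide lengths
        "meas": [25.0, 1200.0, 80.0],   # measured IC50 (nM); <= 500 nM counts as a binder
        "netmhc": [40.0, 900.0, 60.0],  # hypothetical predictor's IC50 output
    })
    toy.to_csv(os.path.join(test_dir, "A0201.csv"), index=False)

    # The evaluation script would then be invoked as, e.g.:
    #   python evaluate-predictions.py --test-data-dir <test_dir>
    print("wrote toy test set to", test_dir)

Because the script drops only the three configured columns and treats every remaining column as a predictor, several predictors can be compared side by side by adding one column per predictor to the same file.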