From 6964235d43133d0d1a540c5e0d893fd6b8a0fbc8 Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Sat, 17 Oct 2015 17:52:09 -0400
Subject: [PATCH] moved script for evaluating predictors on separate data to
 different file

---
 experiments/evaluate-test-set.py    | 69 ++++++++++++++++++++++
 experiments/model-selection.py      | 93 +++--------------------------
 experiments/test_data.py            | 85 +++++++++++++++++++++++++++
 mhcflurry/common.py                 | 20 ++++++-
 mhcflurry/mhc1_binding_predictor.py |  2 +-
 5 files changed, 183 insertions(+), 86 deletions(-)
 create mode 100644 experiments/evaluate-test-set.py
 create mode 100644 experiments/test_data.py

diff --git a/experiments/evaluate-test-set.py b/experiments/evaluate-test-set.py
new file mode 100644
index 00000000..9d082789
--- /dev/null
+++ b/experiments/evaluate-test-set.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+#
+# Copyright (c) 2015. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from os.path import join, exists
+from os import makedirs
+from argparse import ArgumentParser
+
+from test_data import load_test_data
+
+parser = ArgumentParser()
+
+
+parser.add_argument(
+    "--test-data-input-dirs",
+    nargs='*',
+    type=str,
+    help="Directories containing test data files from other predictors",
+    required=True)
+
+parser.add_argument(
+    "--test-data-input-sep",
+    default="\s+",
+    help="Separator to use for loading test data CSV/TSV files",
+    type=str)
+
+parser.add_argument(
+    "--test-data-output-dir",
+    help="Save combined test datasets to this directory",
+    required=True)
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    dataframes, predictor_names = load_test_data(args.test_data_input_dirs)
+    if not exists(args.test_data_output_dir):
+        makedirs(args.test_data_output_dir)
+
+    print("Loaded test data:")
+    for (allele, df) in dataframes.items():
+        df.index.name = "sequence"
+        print("%s: %d results" % (allele, len(df)))
+        filename = "blind-%s.csv" % allele
+        filepath = join(args.test_data_output_dir, filename)
+        df.to_csv(filepath)
+
+    # TODO: evaluate model configurations against the combined test data,
+    # along the lines of:
+    """
+    combined_df = evaluate_model_configs(
+        configs=configs,
+        results_filename=args.output,
+        train_fn=lambda config: evaluate_model_config_train_vs_test(
+            config,
+            training_allele_datasets=training_datasets,
+            testing_allele_datasets=testing_datasets,
+            min_samples_per_allele=5))
+    """
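For orientation, the flow of the new script expressed directly against the `load_test_data` helper added in experiments/test_data.py below. The command line and directory names here are illustrative only, not part of the patch:

    # Hypothetical invocation (paths are made up for illustration):
    #   python experiments/evaluate-test-set.py \
    #       --test-data-input-dirs netmhc-results/ smm-results/ \
    #       --test-data-output-dir combined-test-data/
    from os import makedirs
    from os.path import exists, join

    from test_data import load_test_data

    dataframes, predictor_names = load_test_data(
        ["netmhc-results/", "smm-results/"])
    if not exists("combined-test-data"):
        makedirs("combined-test-data")
    for allele, df in dataframes.items():
        df.index.name = "sequence"
        # one CSV per allele, one prediction column per predictor
        df.to_csv(join("combined-test-data", "blind-%s.csv" % allele))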
diff --git a/experiments/model-selection.py b/experiments/model-selection.py
index c2aa5a97..72eab84f 100755
--- a/experiments/model-selection.py
+++ b/experiments/model-selection.py
@@ -20,7 +20,6 @@ from __future__ import (
     absolute_import,
     unicode_literals
 )
-from os import listdir
 from os.path import join
 import argparse
 from time import time
@@ -50,12 +49,12 @@ from model_configs import (
 )
 from model_selection_helpers import (
     evaluate_model_config_by_cross_validation,
-    evaluate_model_config_train_vs_test,
 )
 
 from summarize_model_results import hyperparameter_performance
 
 from arg_parsing import parse_int_list, parse_float_list, parse_string_list
+
 
 PETERS2009_CSV_FILENAME = "bdata.2009.mhci.public.1.txt"
 PETERS2009_CSV_PATH = join(CLASS1_DATA_DIRECTORY, PETERS2009_CSV_FILENAME)
@@ -162,18 +161,6 @@ parser.add_argument(
     help="Comma separated list of optimization methods")
 
-parser.add_argument(
-    "--test-data-dir",
-    nargs='*',
-    type=str)
-
-parser.add_argument(
-    "--test-data-sep",
-    default="\s+",
-    help="Separator to use for loading test data CSV/TSV files",
-    type=str)
-
-
 
 def evaluate_model_configs(configs, results_filename, train_fn):
     all_dataframes = []
     all_elapsed_times = []
@@ -204,47 +191,6 @@ def evaluate_model_configs(configs, results_filename, train_fn):
     return pd.concat(all_dataframes)
 
 
-def load_test_data(dirpaths, sep="\s+", column_per_predictor=True):
-    """
-    Load all allele-specific datasets from the given path assuming filenames
-    have the form:
-        pred.PREDICTOR_NAME.CV_METHOD.ALLELE-LENGTH.xls
-    Example:
-        pred.netmhc.blind.HLA-A-3201-9.xls
-    where ALLELE could be HLA-A-0201 and LENGTH is an integer
-
-    Combines all loaded files into a single DataFrame.
-
-    If `column_per_predictor` is True then reshape the DataFrame to have
-    multiple prediction columns, one per distinct predictor.
-    """
-
-    dataframes = []
-    for dirpath in dirpaths:
-        for filename in listdir(dirpath):
-            dot_parts = filename.split(".")
-            if len(dot_parts) != 5:
-                continue
-            _, predictor_name, cv_method, suffix, ext = dot_parts
-            dash_parts = suffix.split("-")
-            if len(dash_parts) != 2:
-                continue
-            allele = "-".join(dash_parts[:-1])
-            length = int(dash_parts[-1])
-            filepath = join(dirpath, filename)
-            df = pd.read_csv(filepath, sep=sep)
-            df["dirpath"] = dirpath
-            df["predictor"] = predictor_name
-            df["cv_method"] = cv_method
-            df["allele"] = allele
-            df["length"] = length
-            dataframes.append(df)
-    combined = pd.concat(dataframes)
-    if column_per_predictor:
-        assert False
-    else:
-        return combined
-
-
 if __name__ == "__main__":
     args = parser.parse_args()
     configs = generate_all_model_configs(
@@ -265,33 +211,12 @@ if __name__ == "__main__":
         args.binding_data_csv_path,
         peptide_length=9,
         binary_encoding=False)
-    if args.test_data_dir:
-        test_dataframes = []
-        for subdir in args.test_data_dir:
-            test_dataframes.append(pd.read_csv(subdir, sep=args.test_data_sep))
-        test_data = pd.concat(test_dataframes)
-        print(test_data)
-        assert False
-        testing_datasets, _ = load_data(
-            BLIND_2013_CSV_PATH,
-            peptide_length=9,
-            binary_encoding=False)
-        combined_df = evaluate_model_configs(
-            configs=configs,
-            results_filename=args.output,
-            train_fn=lambda config: evaluate_model_config_train_vs_test(
-                config,
-                training_allele_datasets=training_datasets,
-                testing_allele_datasets=testing_datasets,
-                min_samples_per_allele=5))
-    else:
-        combined_df = evaluate_model_configs(
-            configs=configs,
-            results_filename=args.output,
-            train_fn=lambda config: evaluate_model_config_by_cross_validation(
-                config,
-                training_datasets,
-                min_samples_per_allele=args.min_samples_per_allele,
-                cv_folds=args.cv_folds))
-
+    combined_df = evaluate_model_configs(
+        configs=configs,
+        results_filename=args.output,
+        train_fn=lambda config: evaluate_model_config_by_cross_validation(
+            config,
+            training_datasets,
+            min_samples_per_allele=args.min_samples_per_allele,
+            cv_folds=args.cv_folds))
     hyperparameter_performance(combined_df)
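Only the first and last lines of `evaluate_model_configs` are visible in the hunks above; this is a minimal sketch of what the elided loop presumably does, inferred from the accumulators and the final `pd.concat`. The incremental CSV write is an assumption, not shown in the patch:

    from time import time

    import pandas as pd

    def evaluate_model_configs(configs, results_filename, train_fn):
        # Sketch: evaluate each hyperparameter config, track elapsed time,
        # and collect per-config result DataFrames into one table.
        all_dataframes = []
        all_elapsed_times = []
        for config in configs:
            start_time = time()
            result_df = train_fn(config)
            all_elapsed_times.append(time() - start_time)
            all_dataframes.append(result_df)
            # assumed: persist partial results after each config so an
            # interrupted run loses at most one configuration's work
            pd.concat(all_dataframes).to_csv(results_filename, index=False)
        return pd.concat(all_dataframes)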
diff --git a/experiments/test_data.py b/experiments/test_data.py
new file mode 100644
index 00000000..5eb60606
--- /dev/null
+++ b/experiments/test_data.py
@@ -0,0 +1,85 @@
+# Copyright (c) 2015. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict, OrderedDict
+from os import listdir
+from os.path import join
+
+import pandas as pd
+from mhcflurry.common import normalize_allele_name
+
+
+def load_test_data(dirpaths, sep="\s+", ic50_base=10.0, comment_char="B"):
+    """
+    Load all allele-specific datasets from the given paths, assuming
+    filenames have the form:
+        pred.PREDICTOR_NAME.CV_METHOD.ALLELE-LENGTH.xls
+    Example:
+        pred.netmhc.blind.HLA-A-3201-9.xls
+    where ALLELE could be HLA-A-0201 and LENGTH is an integer.
+
+    Returns one DataFrame per allele, with a prediction column for each
+    distinct predictor, along with the set of predictor names.
+
+    If `ic50_base` is not None, transforms the log-scale "pred" and "meas"
+    columns back to IC50 via ic50_base ** value.
+    """
+    # nested dictionary mapping allele -> sequence -> dictionary of binding
+    # predictions and the actual measurement, called "meas"
+    test_datasets = {}
+    predictor_names = set([])
+
+    for dirpath in dirpaths:
+        for filename in listdir(dirpath):
+            filepath = join(dirpath, filename)
+            dot_parts = filename.split(".")
+            if len(dot_parts) != 5:
+                print("Skipping %s" % filepath)
+                continue
+            _, predictor_name, cv_method, suffix, ext = dot_parts
+            dash_parts = suffix.split("-")
+            if len(dash_parts) < 2:
+                print("Skipping %s due to incorrect format" % filepath)
+                continue
+            predictor_names.add(predictor_name)
+            print("Reading %s" % filepath)
+            allele = normalize_allele_name("-".join(dash_parts[:-1]))
+            length = int(dash_parts[-1])
+            df = pd.read_csv(filepath, sep=sep, comment=comment_char)
+            df["dirpath"] = dirpath
+            df["predictor"] = predictor_name
+            df["cv_method"] = cv_method
+            df["allele"] = allele
+            df["length"] = length
+            if ic50_base is not None:
+                df["pred"] = ic50_base ** df["pred"]
+                df["meas"] = ic50_base ** df["meas"]
+
+            if allele not in test_datasets:
+                test_datasets[allele] = defaultdict(OrderedDict)
+
+            dataset_dict = test_datasets[allele]
+            for _, row in df.iterrows():
+                sequence = row["sequence"]
+                dataset_dict[sequence]["length"] = length
+                dataset_dict[sequence]["meas"] = row["meas"]
+                dataset_dict[sequence][predictor_name] = row["pred"]
+    test_dataframes = {
+        allele: pd.DataFrame.from_dict(
+            ic50_values, orient="index")
+        for (allele, ic50_values) in test_datasets.items()
+    }
+    return test_dataframes, predictor_names
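A small worked example of the filename convention and the `ic50_base ** value` rescaling performed above. The filename is the one from the docstring; the numeric prediction value is made up:

    # Parsing "pred.netmhc.blind.HLA-A-3201-9.xls" the way load_test_data does:
    filename = "pred.netmhc.blind.HLA-A-3201-9.xls"
    _, predictor_name, cv_method, suffix, ext = filename.split(".")
    dash_parts = suffix.split("-")      # ["HLA", "A", "3201", "9"]
    allele = "-".join(dash_parts[:-1])  # "HLA-A-3201", then normalized
    length = int(dash_parts[-1])        # peptide length 9

    # The "pred" and "meas" columns are apparently stored on a log scale,
    # so with the default ic50_base=10.0 a stored value of 3.2 maps back
    # to an IC50 of 10.0 ** 3.2, roughly 1585 nM.
    ic50_base = 10.0
    print(predictor_name, cv_method, allele, length)  # netmhc blind HLA-A-3201 9
    print(ic50_base ** 3.2)                           # ~1584.89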
diff --git a/mhcflurry/common.py b/mhcflurry/common.py
index 7ff1bdcc..8623aa97 100644
--- a/mhcflurry/common.py
+++ b/mhcflurry/common.py
@@ -18,26 +18,44 @@ from __future__ import (
     absolute_import,
 )
 
+
 def parse_int_list(s):
     return [int(part.strip()) for part in s.split(",")]
 
+
 def split_uppercase_sequences(s):
     return [part.strip().upper() for part in s.split(",")]
 
 
 def normalize_allele_name(allele_name):
+    """
+    Only works for mouse, human, and rhesus monkey alleles.
+
+    TODO: use the same logic as mhctools for MHC name parsing.
+    Possibly even worth its own small repo called something like "mhcnames"
+    """
     allele_name = allele_name.upper()
+    if allele_name.startswith("MAMU"):
+        prefix = "Mamu-"
+    elif allele_name.startswith("H-2") or allele_name.startswith("H2"):
+        prefix = "H-2-"
+    else:
+        prefix = ""
     # old school HLA-C serotypes look like "Cw"
     allele_name = allele_name.replace("CW", "C")
     patterns = [
         "HLA-",
+        "H-2",
+        "H2",
+        "MAMU",
         "-",
         "*",
         ":"
     ]
     for pattern in patterns:
         allele_name = allele_name.replace(pattern, "")
-    return allele_name
+    return "%s%s" % (prefix, allele_name)
+
 
 def split_allele_names(s):
     return [
diff --git a/mhcflurry/mhc1_binding_predictor.py b/mhcflurry/mhc1_binding_predictor.py
index 95f92565..baa567a9 100644
--- a/mhcflurry/mhc1_binding_predictor.py
+++ b/mhcflurry/mhc1_binding_predictor.py
@@ -85,7 +85,7 @@ class Mhc1BindingPredictor(object):
     def _log_to_ic50(self, log_value):
         """
         Convert neural network output to IC50 values between 0.0 and
-        self.max_ic50 (typically 5000, 20000 or 50000)
+        self.max_ic50 (typically 5000, 20000, or 50000)
         """
         return self.max_ic50 ** (1.0 - log_value)
-- 
GitLab
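To make the new normalization and the output transform concrete, a few expected values traced by hand from the code above (not taken from a test run, so treat them as a sketch of intended behavior):

    from mhcflurry.common import normalize_allele_name

    # Hand-traced through the patched function:
    assert normalize_allele_name("HLA-A*02:01") == "A0201"
    assert normalize_allele_name("hla-cw-1601") == "C1601"     # Cw serotype -> C
    assert normalize_allele_name("H2-Kb") == "H-2-KB"          # mouse prefix kept
    assert normalize_allele_name("Mamu-A*01") == "Mamu-A01"    # rhesus prefix kept

    # _log_to_ic50 maps network output y in [0, 1] to max_ic50 ** (1 - y):
    max_ic50 = 50000.0
    assert max_ic50 ** (1.0 - 1.0) == 1.0       # y = 1 -> strongest binder (1 nM)
    assert max_ic50 ** (1.0 - 0.0) == max_ic50  # y = 0 -> weakest (max_ic50 nM)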