Commit 6964235d authored by Alex Rubinsteyn

moved script for evaluating predictors on separate data to different file

parent e2cb6c17
#!/usr/bin/env python
#
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from os.path import join, exists
from os import makedirs
from argparse import ArgumentParser
from test_data import load_test_data
parser = ArgumentParser()

parser.add_argument(
    "--test-data-input-dirs",
    nargs='*',
    type=str,
    required=True,
    help="Directories of test-set predictions from other predictors")

parser.add_argument(
    "--test-data-input-sep",
    default=r"\s+",
    type=str,
    help="Separator to use for loading test data CSV/TSV files")

parser.add_argument(
    "--test-data-output-dir",
    required=True,
    help="Save combined test datasets to this directory")
if __name__ == "__main__":
    args = parser.parse_args()
    dataframes, predictor_names = load_test_data(
        args.test_data_input_dirs,
        sep=args.test_data_input_sep)

    if not exists(args.test_data_output_dir):
        makedirs(args.test_data_output_dir)

    print("Loaded test data:")
    for (allele, df) in dataframes.items():
        df.index.name = "sequence"
        print("%s: %d results" % (allele, len(df)))
        filename = "blind-%s.csv" % allele
        filepath = join(args.test_data_output_dir, filename)
        df.to_csv(filepath)

# Not yet wired up here: the train-vs-test evaluation carried over from the
# model selection script (see the diff below).
"""
combined_df = evaluate_model_configs(
    configs=configs,
    results_filename=args.output,
    train_fn=lambda config: evaluate_model_config_train_vs_test(
        config,
        training_allele_datasets=training_datasets,
        testing_allele_datasets=testing_datasets,
        min_samples_per_allele=5))
"""
@@ -20,7 +20,6 @@ from __future__ import (
     absolute_import,
     unicode_literals
 )
-from os import listdir
 from os.path import join
 import argparse
 from time import time
@@ -50,12 +49,12 @@ from model_configs import (
 )
 from model_selection_helpers import (
     evaluate_model_config_by_cross_validation,
-    evaluate_model_config_train_vs_test,
 )
 from summarize_model_results import hyperparameter_performance
 from arg_parsing import parse_int_list, parse_float_list, parse_string_list

 PETERS2009_CSV_FILENAME = "bdata.2009.mhci.public.1.txt"
 PETERS2009_CSV_PATH = join(CLASS1_DATA_DIRECTORY, PETERS2009_CSV_FILENAME)
@@ -162,18 +161,6 @@ parser.add_argument(
     help="Comma separated list of optimization methods")

-parser.add_argument(
-    "--test-data-dir",
-    nargs='*',
-    type=str)
-
-parser.add_argument(
-    "--test-data-sep",
-    default="\s+",
-    help="Separator to use for loading test data CSV/TSV files",
-    type=str)

 def evaluate_model_configs(configs, results_filename, train_fn):
     all_dataframes = []
     all_elapsed_times = []
@@ -204,47 +191,6 @@ def evaluate_model_configs(configs, results_filename, train_fn):
     return pd.concat(all_dataframes)

-def load_test_data(dirpaths, sep="\s+", column_per_predictor=True):
-    """
-    Load all allele-specific datasets from the given path assuming filenames
-    have the form:
-        pred.PREDICTOR_NAME.CV_METHOD.ALLELE-LENGTH.xls
-    Example:
-        pred.netmhc.blind.HLA-A-3201-9.xls
-    where ALLELE could be HLA-A-0201 and LENGTH is an integer
-
-    Combines all loaded files into a single DataFrame.
-
-    If `column_per_predictor` is True then reshape the DataFrame to have
-    multiple prediction columns, one per distinct predictor.
-    """
-    dataframes = []
-    for dirpath in dirpaths:
-        for filename in listdir(dirpath):
-            dot_parts = filename.split(".")
-            if len(dot_parts) != 5:
-                continue
-            _, predictor_name, cv_method, suffix, ext = dot_parts
-            dash_parts = suffix.split("-")
-            if len(dash_parts) != 2:
-                continue
-            allele = "-".join(dash_parts[:-1])
-            length = int(dash_parts[-1])
-            filepath = join(dirpath, filename)
-            df = pd.read_csv(filepath, sep=sep)
-            df["dirpath"] = dirpath
-            df["predictor"] = predictor_name
-            df["cv_method"] = cv_method
-            df["allele"] = allele
-            df["length"] = length
-            dataframes.append(df)
-    combined = pd.concat(dataframes)
-    if column_per_predictor:
-        assert False
-    else:
-        return combined

 if __name__ == "__main__":
     args = parser.parse_args()
     configs = generate_all_model_configs(
@@ -265,33 +211,12 @@ if __name__ == "__main__":
         args.binding_data_csv_path,
         peptide_length=9,
         binary_encoding=False)

-    if args.test_data_dir:
-        test_dataframes = []
-        for subdir in args.test_data_dir:
-            test_dataframes.append(pd.read_csv(subdir, sep=args.test_data_sep))
-        test_data = pd.concat(test_dataframes)
-        print(test_data)
-        assert False
-
-        testing_datasets, _ = load_data(
-            BLIND_2013_CSV_PATH,
-            peptide_length=9,
-            binary_encoding=False)
-
-        combined_df = evaluate_model_configs(
-            configs=configs,
-            results_filename=args.output,
-            train_fn=lambda config: evaluate_model_config_train_vs_test(
-                config,
-                training_allele_datasets=training_datasets,
-                testing_allele_datasets=testing_datasets,
-                min_samples_per_allele=5))
-    else:
-        combined_df = evaluate_model_configs(
-            configs=configs,
-            results_filename=args.output,
-            train_fn=lambda config: evaluate_model_config_by_cross_validation(
-                config,
-                training_datasets,
-                min_samples_per_allele=args.min_samples_per_allele,
-                cv_folds=args.cv_folds))
+    combined_df = evaluate_model_configs(
+        configs=configs,
+        results_filename=args.output,
+        train_fn=lambda config: evaluate_model_config_by_cross_validation(
+            config,
+            training_datasets,
+            min_samples_per_allele=args.min_samples_per_allele,
+            cv_folds=args.cv_folds))

     hyperparameter_performance(combined_df)
# Copyright (c) 2015. Mount Sinai School of Medicine
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict, OrderedDict
from os import listdir
from os.path import join
import pandas as pd
from mhcflurry.common import normalize_allele_name
def load_test_data(dirpaths, sep=r"\s+", ic50_base=10.0, comment_char="B"):
    """
    Load all allele-specific datasets from the given directories, assuming
    filenames have the form:
        pred.PREDICTOR_NAME.CV_METHOD.ALLELE-LENGTH.xls
    Example:
        pred.netmhc.blind.HLA-A-3201-9.xls
    where ALLELE could be HLA-A-0201 and LENGTH is an integer.

    Returns a dictionary mapping each allele to a DataFrame with one
    prediction column per distinct predictor, along with the set of
    predictor names encountered.

    If ic50_base is not None, then transform the "pred" and "meas" columns
    from log-scale back to nM via ic50_base ** value.
    """
    # dictionary mapping each allele to a dictionary from peptide sequence
    # to binding predictions and the actual measurement, called "meas"
    test_datasets = {}
    predictor_names = set([])
    for dirpath in dirpaths:
        for filename in listdir(dirpath):
            filepath = join(dirpath, filename)
            dot_parts = filename.split(".")
            if len(dot_parts) != 5:
                print("Skipping %s" % filepath)
                continue
            _, predictor_name, cv_method, suffix, ext = dot_parts
            dash_parts = suffix.split("-")
            if len(dash_parts) < 2:
                print("Skipping %s due to incorrect format" % filepath)
                continue
            predictor_names.add(predictor_name)
            print("Reading %s" % filepath)
            allele = normalize_allele_name("-".join(dash_parts[:-1]))
            length = int(dash_parts[-1])
            df = pd.read_csv(filepath, sep=sep, comment=comment_char)
            df["dirpath"] = dirpath
            df["predictor"] = predictor_name
            df["cv_method"] = cv_method
            df["allele"] = allele
            df["length"] = length
            if ic50_base is not None:
                df["pred"] = ic50_base ** df["pred"]
                df["meas"] = ic50_base ** df["meas"]
            if allele not in test_datasets:
                test_datasets[allele] = defaultdict(OrderedDict)
            dataset_dict = test_datasets[allele]
            for _, row in df.iterrows():
                sequence = row["sequence"]
                dataset_dict[sequence]["length"] = length
                dataset_dict[sequence]["meas"] = row["meas"]
                dataset_dict[sequence][predictor_name] = row["pred"]
    test_dataframes = {
        allele: pd.DataFrame.from_dict(ic50_values, orient="index")
        for (allele, ic50_values) in test_datasets.items()
    }
    return test_dataframes, predictor_names
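
A minimal sketch of how the loader's output might look, assuming a single
directory of well-formed prediction files (the directory name, peptide, and
numbers are illustrative only):

    dataframes, predictor_names = load_test_data(["netmhc-output"])
    df = dataframes["A0201"]
    # df is indexed by peptide sequence, with a "length" column, the
    # measured IC50 in "meas", and one prediction column per predictor:
    #
    #             length    meas    netmhc
    # LLFGYPVYV        9   230.1     180.0
    print(predictor_names)  # e.g. {"netmhc"}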
@@ -18,26 +18,44 @@ from __future__ import (
     absolute_import,
 )

 def parse_int_list(s):
     return [int(part.strip()) for part in s.split(",")]

 def split_uppercase_sequences(s):
     return [part.strip().upper() for part in s.split(",")]

 def normalize_allele_name(allele_name):
+    """
+    Only works for mouse, human, and rhesus monkey alleles.
+
+    TODO: use the same logic as mhctools for MHC name parsing.
+    Possibly even worth its own small repo called something like "mhcnames"
+    """
     allele_name = allele_name.upper()
+    if allele_name.startswith("MAMU"):
+        prefix = "Mamu-"
+    elif allele_name.startswith("H-2") or allele_name.startswith("H2"):
+        prefix = "H-2-"
+    else:
+        prefix = ""
+    # old school HLA-C serotypes look like "Cw"
+    allele_name = allele_name.replace("CW", "C")
     patterns = [
         "HLA-",
         "H-2",
         "H2",
         "MAMU",
         "-",
         "*",
         ":"
     ]
     for pattern in patterns:
         allele_name = allele_name.replace(pattern, "")
-    return allele_name
+    return "%s%s" % (prefix, allele_name)

 def split_allele_names(s):
     return [
......
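
Tracing the helpers above on a few inputs (expected outputs derived by hand
from the code in this hunk, not taken from the commit):

    parse_int_list("8, 9,10")              # -> [8, 9, 10]
    normalize_allele_name("HLA-A*02:01")   # -> "A0201"
    normalize_allele_name("hla-Cw*06:02")  # -> "C0602" (Cw collapses to C)
    normalize_allele_name("Mamu-A*01")     # -> "Mamu-A01"
    normalize_allele_name("H2-Kb")         # -> "H-2-KB"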
@@ -85,7 +85,7 @@ class Mhc1BindingPredictor(object):
     def _log_to_ic50(self, log_value):
         """
         Convert neural network output to IC50 values between 0.0 and
         self.max_ic50 (typically 5000, 20000 or 50000)
         """
         return self.max_ic50 ** (1.0 - log_value)
......
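
To make the transform concrete: a network output of 1.0 maps to
max_ic50 ** 0.0 = 1 nM (a strong binder), while an output of 0.0 maps to
max_ic50 ** 1.0 = max_ic50 nM (a non-binder). A sketch of the corresponding
inverse, derived from the formula above (hypothetical helper, not code from
this commit):

    import numpy as np

    def ic50_to_log(ic50, max_ic50=50000.0):
        # invert ic50 = max_ic50 ** (1.0 - log_value)
        return 1.0 - np.log(ic50) / np.log(max_ic50)

    ic50_to_log(1.0)      # -> 1.0
    ic50_to_log(50000.0)  # -> 0.0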