From 6964235d43133d0d1a540c5e0d893fd6b8a0fbc8 Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Sat, 17 Oct 2015 17:52:09 -0400
Subject: [PATCH] moved script for evaluating predictors on separate data to a
 different file

---
 experiments/evaluate-test-set.py    | 69 +++++++++++++++++++++
 experiments/model-selection.py      | 93 +++--------------------------
 experiments/test_data.py            | 85 ++++++++++++++++++++++++++
 mhcflurry/common.py                 | 20 ++++++-
 mhcflurry/mhc1_binding_predictor.py |  2 +-
 5 files changed, 183 insertions(+), 86 deletions(-)
 create mode 100644 experiments/evaluate-test-set.py
 create mode 100644 experiments/test_data.py

diff --git a/experiments/evaluate-test-set.py b/experiments/evaluate-test-set.py
new file mode 100644
index 00000000..9d082789
--- /dev/null
+++ b/experiments/evaluate-test-set.py
@@ -0,0 +1,69 @@
+#!/usr/bin/env python
+#
+# Copyright (c) 2015. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from os.path import join, exists
+from os import makedirs
+from argparse import ArgumentParser
+
+from test_data import load_test_data
+
+parser = ArgumentParser()
+
+parser.add_argument(
+    "--test-data-input-dirs",
+    nargs='*',
+    type=str,
+    help="Multiple directories from other predictors",
+    required=True)
+
+parser.add_argument(
+    "--test-data-input-sep",
+    default="\s+",
+    help="Separator to use for loading test data CSV/TSV files",
+    type=str)
+
+parser.add_argument(
+    "--test-data-output-dir",
+    help="Save combined test datasets to this directory",
+    required=True)
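+
+# Example invocation (directory names are hypothetical):
+#   python evaluate-test-set.py \
+#       --test-data-input-dirs netmhc-output/ smm-output/ \
+#       --test-data-output-dir combined-test-data/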
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    dataframes, predictor_names = load_test_data(args.test_data_input_dirs)
+    if not exists(args.test_data_output_dir):
+        makedirs(args.test_data_output_dir)
+
+    print("Loaded test data:")
+    for (allele, df) in dataframes.items():
+        df.index.name = "sequence"
+        print("%s: %d results" % (allele, len(df)))
+        filename = "blind-%s.csv" % allele
+        filepath = join(args.test_data_output_dir, filename)
+        df.to_csv(filepath)
+
+    # TODO: evaluate each trained predictor against these blind datasets
+    # (the train-vs-test evaluation formerly in model-selection.py)
diff --git a/experiments/model-selection.py b/experiments/model-selection.py
index c2aa5a97..72eab84f 100755
--- a/experiments/model-selection.py
+++ b/experiments/model-selection.py
@@ -20,7 +20,6 @@ from __future__ import (
     absolute_import,
     unicode_literals
 )
-from os import listdir
 from os.path import join
 import argparse
 from time import time
@@ -50,12 +49,12 @@ from model_configs import (
 )
 from model_selection_helpers import (
     evaluate_model_config_by_cross_validation,
-    evaluate_model_config_train_vs_test,
 )
 
 from summarize_model_results import hyperparameter_performance
 from arg_parsing import parse_int_list, parse_float_list, parse_string_list
 
+
 PETERS2009_CSV_FILENAME = "bdata.2009.mhci.public.1.txt"
 PETERS2009_CSV_PATH = join(CLASS1_DATA_DIRECTORY, PETERS2009_CSV_FILENAME)
 
@@ -162,18 +161,6 @@ parser.add_argument(
     help="Comma separated list of optimization methods")
 
 
-parser.add_argument(
-    "--test-data-dir",
-    nargs='*',
-    type=str)
-
-parser.add_argument(
-    "--test-data-sep",
-    default="\s+",
-    help="Separator to use for loading test data CSV/TSV files",
-    type=str)
-
-
 def evaluate_model_configs(configs, results_filename, train_fn):
     all_dataframes = []
     all_elapsed_times = []
@@ -204,47 +191,6 @@ def evaluate_model_configs(configs, results_filename, train_fn):
     return pd.concat(all_dataframes)
 
 
-def load_test_data(dirpaths, sep="\s+", column_per_predictor=True):
-    """
-    Load all allele-specific datasets from the given path assuming filenames
-    have the form:
-        pred.PREDICTOR_NAME.CV_METHOD.ALLELE-LENGTH.xls
-    Example:
-        pred.netmhc.blind.HLA-A-3201-9.xls
-    where ALLELE could be HLA-A-0201 and LENGTH is an integer
-
-    Combines all loaded files into a single DataFrame.
-
-    If `column_per_predictor` is True then reshape the DataFrame to have
-    multiple prediction columns, one per distinct predictor.
-    """
-
-    dataframes = []
-    for dirpath in dirpaths:
-        for filename in listdir(dirpath):
-            dot_parts = filename.split(".")
-            if len(dot_parts) != 5:
-                continue
-            _, predictor_name, cv_method, suffix, ext = dot_parts
-            dash_parts = suffix.split("-")
-            if len(dash_parts) != 2:
-                continue
-            allele = "-".join(dash_parts[:-1])
-            length = int(dash_parts[-1])
-            filepath = join(dirpath, filename)
-            df = pd.read_csv(filepath, sep=sep)
-            df["dirpath"] = dirpath
-            df["predictor"] = predictor_name
-            df["cv_method"] = cv_method
-            df["allele"] = allele
-            df["length"] = length
-            dataframes.append(df)
-    combined = pd.concat(dataframes)
-    if column_per_predictor:
-        assert False
-    else:
-        return combined
-
 if __name__ == "__main__":
     args = parser.parse_args()
     configs = generate_all_model_configs(
@@ -265,33 +211,12 @@ if __name__ == "__main__":
         args.binding_data_csv_path,
         peptide_length=9,
         binary_encoding=False)
-    if args.test_data_dir:
-        test_dataframes = []
-        for subdir in args.test_data_dir:
-            test_dataframes.append(pd.read_csv(subdir, sep=args.test_data_sep))
-        test_data = pd.concat(test_dataframes)
-        print(test_data)
-        assert False
-        testing_datasets, _ = load_data(
-            BLIND_2013_CSV_PATH,
-            peptide_length=9,
-            binary_encoding=False)
-        combined_df = evaluate_model_configs(
-            configs=configs,
-            results_filename=args.output,
-            train_fn=lambda config: evaluate_model_config_train_vs_test(
-                config,
-                training_allele_datasets=training_datasets,
-                testing_allele_datasets=testing_datasets,
-                min_samples_per_allele=5))
-    else:
-        combined_df = evaluate_model_configs(
-            configs=configs,
-            results_filename=args.output,
-            train_fn=lambda config: evaluate_model_config_by_cross_validation(
-                config,
-                training_datasets,
-                min_samples_per_allele=args.min_samples_per_allele,
-                cv_folds=args.cv_folds))
-
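+    # Cross-validate each hyperparameter configuration on the training data;
+    # evaluation on held-out blind test sets now lives in
+    # experiments/evaluate-test-set.py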
+    combined_df = evaluate_model_configs(
+        configs=configs,
+        results_filename=args.output,
+        train_fn=lambda config: evaluate_model_config_by_cross_validation(
+            config,
+            training_datasets,
+            min_samples_per_allele=args.min_samples_per_allele,
+            cv_folds=args.cv_folds))
     hyperparameter_performance(combined_df)
diff --git a/experiments/test_data.py b/experiments/test_data.py
new file mode 100644
index 00000000..5eb60606
--- /dev/null
+++ b/experiments/test_data.py
@@ -0,0 +1,85 @@
+# Copyright (c) 2015. Mount Sinai School of Medicine
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict, OrderedDict
+from os import listdir
+from os.path import join
+
+import pandas as pd
+from mhcflurry.common import normalize_allele_name
+
+
+def load_test_data(dirpaths, sep=r"\s+", ic50_base=10.0, comment_char="B"):
+    """
+    Load all allele-specific datasets from the given directories, assuming
+    filenames have the form:
+        pred.PREDICTOR_NAME.CV_METHOD.ALLELE-LENGTH.xls
+    Example:
+        pred.netmhc.blind.HLA-A-3201-9.xls
+    where ALLELE could be HLA-A-0201 and LENGTH is a peptide length such as 9.
+
+    Returns a dictionary mapping each allele to a DataFrame with one row per
+    peptide sequence and one prediction column per distinct predictor, along
+    with the set of predictor names encountered.
+
+    If `ic50_base` is not None, transform the "pred" and "meas" columns via
+    ic50_base ** value. Rows whose line starts with `comment_char` are
+    skipped while parsing.
+    """
+
+    # dictionary mapping each allele to a per-sequence dictionary of binding
+    # predictions and the actual measurement, called "meas"
+    test_datasets = {}
+    predictor_names = set([])
+
+    for dirpath in dirpaths:
+        for filename in listdir(dirpath):
+            filepath = join(dirpath, filename)
+            dot_parts = filename.split(".")
+            if len(dot_parts) != 5:
+                print("Skipping %s" % filepath)
+                continue
+            _, predictor_name, cv_method, suffix, ext = dot_parts
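+            # e.g. "pred.netmhc.blind.HLA-A-3201-9.xls" unpacks to
+            # predictor_name="netmhc", cv_method="blind",
+            # suffix="HLA-A-3201-9", ext="xls"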
+            dash_parts = suffix.split("-")
+            if len(dash_parts) < 2:
+                print("Skipping %s due to incorrect format" % filepath)
+                continue
+            predictor_names.add(predictor_name)
+            print("Reading %s" % filepath)
+            allele = normalize_allele_name("-".join(dash_parts[:-1]))
+            length = int(dash_parts[-1])
+            df = pd.read_csv(filepath, sep=sep, comment=comment_char)
+            df["dirpath"] = dirpath
+            df["predictor"] = predictor_name
+            df["cv_method"] = cv_method
+            df["allele"] = allele
+            df["length"] = length
+            if ic50_base is not None:
+                df["pred"] = ic50_base ** df["pred"]
+                df["meas"] = ic50_base ** df["meas"]
+
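+            # merge rows across predictors/files so that each peptide ends up
+            # with one row containing a column per predictor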
+            if allele not in test_datasets:
+                test_datasets[allele] = defaultdict(OrderedDict)
+
+            dataset_dict = test_datasets[allele]
+            for _, row in df.iterrows():
+                sequence = row["sequence"]
+                dataset_dict[sequence]["length"] = length
+                dataset_dict[sequence]["meas"] = row["meas"]
+                dataset_dict[sequence][predictor_name] = row["pred"]
+    test_dataframes = {
+        allele: pd.DataFrame.from_dict(
+            ic50_values, orient="index")
+        for (allele, ic50_values) in test_datasets.items()
+    }
+    return test_dataframes, predictor_names
diff --git a/mhcflurry/common.py b/mhcflurry/common.py
index 7ff1bdcc..8623aa97 100644
--- a/mhcflurry/common.py
+++ b/mhcflurry/common.py
@@ -18,26 +18,44 @@ from __future__ import (
     absolute_import,
 )
 
+
 def parse_int_list(s):
     return [int(part.strip()) for part in s.split(",")]
 
+
 def split_uppercase_sequences(s):
     return [part.strip().upper() for part in s.split(",")]
 
 
 def normalize_allele_name(allele_name):
+    """
+    Only works for mouse, human, and rhesus monkey alleles.
+
+    TODO: use the same logic as mhctools for MHC name parsing.
+    Possibly even worth its own small repo called something like "mhcnames"
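+
+    Example normalizations (hand-checked against the rules below):
+        "HLA-A*02:01" -> "A0201"
+        "H-2-Kb"      -> "H-2-KB"
+        "Mamu-A*01"   -> "Mamu-A01"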
+    """
     allele_name = allele_name.upper()
+    if allele_name.startswith("MAMU"):
+        prefix = "Mamu-"
+    elif allele_name.startswith("H-2") or allele_name.startswith("H2"):
+        prefix = "H-2-"
+    else:
+        prefix = ""
     # old school HLA-C serotypes look like "Cw"
     allele_name = allele_name.replace("CW", "C")
     patterns = [
         "HLA-",
+        "H-2",
+        "H2",
+        "MAMU",
         "-",
         "*",
         ":"
     ]
     for pattern in patterns:
         allele_name = allele_name.replace(pattern, "")
-    return allele_name
+    return "%s%s" % (prefix, allele_name)
+
 
 def split_allele_names(s):
     return [
diff --git a/mhcflurry/mhc1_binding_predictor.py b/mhcflurry/mhc1_binding_predictor.py
index 95f92565..baa567a9 100644
--- a/mhcflurry/mhc1_binding_predictor.py
+++ b/mhcflurry/mhc1_binding_predictor.py
@@ -85,7 +85,7 @@ class Mhc1BindingPredictor(object):
     def _log_to_ic50(self, log_value):
         """
         Convert neural network output to IC50 values between 0.0 and
-        self.max_ic50 (typically 5000, 20000 or 50000)
+        self.max_ic50 (typically 5000, 20000 or 50000)
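+
+        For example, with max_ic50 = 50000 a network output of 1.0 maps to
+        50000 ** (1.0 - 1.0) = 1 nM (strong binder), while an output of 0.0
+        maps to 50000 ** 1.0 = 50000 nM (non-binder).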
         """
         return self.max_ic50 ** (1.0 - log_value)
 
-- 
GitLab