From 1d2d7228a3b59261cc080872cfd0fb5da9189707 Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Tue, 19 Apr 2016 21:39:18 -0400
Subject: [PATCH] split up filtering of IEDB MHC dataframe

---
 script/create-iedb-class1-dataset.py | 138 +++++++++++++++++++--------
 1 file changed, 99 insertions(+), 39 deletions(-)

diff --git a/script/create-iedb-class1-dataset.py b/script/create-iedb-class1-dataset.py
index abc88ca7..f6f147c1 100755
--- a/script/create-iedb-class1-dataset.py
+++ b/script/create-iedb-class1-dataset.py
@@ -45,47 +45,76 @@ parser.add_argument(
     default=CLASS1_DATA_DIRECTORY,
     help="Directory to write output pickle file")
 
-
 parser.add_argument(
     "--output-pickle-filename",
     default=PICKLE_OUTPUT_FILENAME,
     help="Path to .pickle file containing dictionary of IEDB assay datasets")
 
-if __name__ == "__main__":
-    args = parser.parse_args()
-    df = pd.read_csv(
-        args.input_csv,
-        error_bad_lines=False,
-        encoding="latin-1",
-        header=[0, 1])
+parser.add_argument(
+    "--alleles",
+    nargs="+",
+    default=[],
+    help="Restrict dataset to specified alleles")
+
+def filter_class1_alleles(df):
+    mhc_class = df["MHC"]["MHC allele class"]
+    print("MHC class counts: \n%s" % (mhc_class.value_counts(),))
+    class1_mask = mhc_class == "I"
+    return df[class1_mask]
+
+def filter_allele_names(df):
     alleles = df["MHC"]["Allele Name"]
-    n = len(alleles)
-    print("# total: %d" % n)
-
-    mask = np.zeros(n, dtype=bool)
-    patterns = [
-        "HLA-A",
-        "HLA-B",
-        "HLA-C",
-        "H-2",
+    invalid_allele_mask = alleles.str.contains(" ") | alleles.str.contains("/")
+    invalid_alleles = alleles[invalid_allele_mask]
+    print("-- Invalid allele names: %s" % (list(sorted(set(invalid_alleles)))))
+    print("Dropping %d with complex alleles (e.g. descriptions of mutations)" % (
+        len(invalid_alleles),))
+    return df[~invalid_allele_mask]
+
+def filter_affinity_values(df):
+    affinities = df["Assay"]["Quantitative measurement"]
+    finite_affinity_mask = ~affinities.isnull() & np.isfinite(affinities)
+    invalid_affinity_mask = ~finite_affinity_mask
+
+    print("Dropping %d rows without finite affinity measurements" % (
+        invalid_affinity_mask.sum(),))
+    return df[finite_affinity_mask]
+
+def filter_mhc_dataframe(df):
+    filter_functions = [
+        filter_class1_alleles,
+        filter_allele_names,
+        filter_affinity_values,
     ]
-    for pattern in patterns:
-        pattern_mask = alleles.str.startswith(pattern)
-        print("# %s: %d" % (pattern, pattern_mask.sum()))
-        mask |= pattern_mask
-    df = df[mask]
-    print("# entries matching allele masks: %d" % (len(df)))
+
+    for fn in filter_functions:
+        df = fn(df)
+
+    return df
+
+
+def groupby_assay(df):
     assay_group = df["Assay"]["Assay Group"]
     assay_method = df["Assay"]["Method/Technique"]
     groups = df.groupby([assay_group, assay_method])
+
+    # speed up repeated calls to np.log by caching log affinities as a column
+    # in the dataframe
+    df["_log_affinity"] = np.log(df["Assay"]["Quantitative measurement"])
+
+    # speed up computing percent positive with the helper column
+    qualitative = df["Assay"]["Qualitative Measure"]
+    df["_qualitative_positive"] = qualitative.str.startswith("Positive")
     print("---")
     print("Assays")
     assay_dataframes = {}
     # create a dataframe for every distinct kind of assay which is used
     # by IEDB submitters to measure peptide-MHC affinity or stability
     for (assay_group, assay_method), group_data in sorted(
-            groups, key=lambda x: len(x[1]), reverse=True):
-        print("%s (%s): %d" % (assay_group, assay_method, len(group_data)))
+            groups,
+            key=lambda x: len(x[1]),
+            reverse=True):
+        print("- %s (%s): %d" % (assay_group, assay_method, len(group_data)))
         group_alleles = group_data["MHC"]["Allele Name"]
         group_peptides = group_data["Epitope"]["Description"]
         distinct_pmhc = group_data.groupby([group_alleles, group_peptides])
@@ -93,19 +122,16 @@ if __name__ == "__main__":
         for (allele, peptide), pmhc_group in distinct_pmhc:
             columns["mhc"].append(allele)
             columns["peptide"].append(peptide)
-            # performing median in log space since in two datapoint case
-            # we don't want to take e.g. (10 + 1000) / 2.0 = 505
-            # but would prefer something like 10 ** ( (1 + 3) / 2.0) = 100
-            columns["value"].append(
-                np.exp(
-                    np.median(
-                        np.log(
-                            pmhc_group["Assay"]["Quantitative measurement"]))))
-            qualitative = pmhc_group["Assay"]["Qualitative Measure"]
-            columns["percent_positive"].append(
-                qualitative.str.startswith("Positive").mean())
-            columns["count"].append(
-                pmhc_group["Assay"]["Quantitative measurement"].count())
+            positive = pmhc_group["_qualitative_positive"]
+            count = len(pmhc_group)
+            if count == 1:
+                ic50 = pmhc_group["Assay"]["Quantitative measurement"].mean()
+            else:
+                ic50 = np.exp(np.mean(pmhc_group["_log_affinity"]))
+            # mean of log affinities = geometric mean, which preserves orders of magnitude (e.g. 10 and 1000 -> 100, not 505)
+            columns["value"].append(ic50)
+            columns["percent_positive"].append(positive.mean())
+            columns["count"].append(count)
         assay_dataframes[(assay_group, assay_method)] = pd.DataFrame(
             columns,
             columns=[
@@ -115,10 +141,44 @@ if __name__ == "__main__":
                 "percent_positive",
                 "count"])
         print("# distinct pMHC entries: %d" % len(columns["mhc"]))
+    return assay_dataframes
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    df = pd.read_csv(
+        args.input_csv,
+        error_bad_lines=False,
+        encoding="latin-1",
+        header=[0, 1])
+
+    df = filter_mhc_dataframe(df)
+
+    alleles = df["MHC"]["Allele Name"]
+
+    n = len(alleles)
+
+    print("# Class I rows: %d" % n)
+    print("# Class I alleles: %d" % len(set(alleles)))
+    print("Unique alleles: %s" % list(sorted(set(alleles))))
+
+    if args.alleles:
+        print("User-supplied allele whitelist: %s" % (args.alleles,))
+        mask = np.zeros(n, dtype=bool)
+        for pattern in args.alleles:
+            pattern_mask = alleles.str.startswith(pattern)
+            print("# %s: %d" % (pattern, pattern_mask.sum()))
+            mask |= pattern_mask
+        df = df[mask]
+        print("# entries matching alleles %s: %d" % (
+            args.alleles,
+            len(df)))
+
+    assay_dataframes = groupby_assay(df)
+
     if not exists(args.output_dir):
         makedirs(args.output_dir)
 
     output_path = join(args.output_dir, args.output_pickle_filename)
 
-    with open(args.output, "wb") as f:
+    with open(output_path, "wb") as f:
         pickle.dump(assay_dataframes, f, pickle.HIGHEST_PROTOCOL)
-- 
GitLab