From 1d2d7228a3b59261cc080872cfd0fb5da9189707 Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com> Date: Tue, 19 Apr 2016 21:39:18 -0400 Subject: [PATCH] split up filtering of IEDB MHC dataframe --- script/create-iedb-class1-dataset.py | 138 +++++++++++++++++++-------- 1 file changed, 99 insertions(+), 39 deletions(-) diff --git a/script/create-iedb-class1-dataset.py b/script/create-iedb-class1-dataset.py index abc88ca7..f6f147c1 100755 --- a/script/create-iedb-class1-dataset.py +++ b/script/create-iedb-class1-dataset.py @@ -45,47 +45,76 @@ parser.add_argument( default=CLASS1_DATA_DIRECTORY, help="Directory to write output pickle file") - parser.add_argument( "--output-pickle-filename", default=PICKLE_OUTPUT_FILENAME, help="Path to .pickle file containing dictionary of IEDB assay datasets") -if __name__ == "__main__": - args = parser.parse_args() - df = pd.read_csv( - args.input_csv, - error_bad_lines=False, - encoding="latin-1", - header=[0, 1]) +parser.add_argument( + "--alleles", + nargs="+", + default=[], + help="Restrict dataset to specified alleles") + +def filter_class1_alleles(df): + mhc_class = df["MHC"]["MHC allele class"] + print("MHC class counts: \n%s" % (mhc_class.value_counts(),)) + class1_mask = mhc_class == "I" + return df[class1_mask] + +def filter_allele_names(df): alleles = df["MHC"]["Allele Name"] - n = len(alleles) - print("# total: %d" % n) - - mask = np.zeros(n, dtype=bool) - patterns = [ - "HLA-A", - "HLA-B", - "HLA-C", - "H-2", + invalid_allele_mask = alleles.str.contains(" ") | alleles.str.contains("/") + invalid_alleles = alleles[invalid_allele_mask] + print("-- Invalid allele names: %s" % (list(sorted(set(invalid_alleles))))) + print("Dropping %d with complex alleles (e.g. descriptions of mutations)" % ( + len(invalid_alleles),)) + return df[~invalid_allele_mask] + +def filter_affinity_values(df): + affinities = df["Assay"]["Quantitative measurement"] + finite_affinity_mask = ~affinities.isnull() & np.isfinite(affinities) + invalid_affinity_mask = ~finite_affinity_mask + + print("Dropping %d rows without finite affinity measurements" % ( + invalid_affinity_mask.sum(),)) + return df[finite_affinity_mask] + +def filter_mhc_dataframe(df): + filter_functions = [ + filter_class1_alleles, + filter_allele_names, + filter_affinity_values, ] - for pattern in patterns: - pattern_mask = alleles.str.startswith(pattern) - print("# %s: %d" % (pattern, pattern_mask.sum())) - mask |= pattern_mask - df = df[mask] - print("# entries matching allele masks: %d" % (len(df))) + + for fn in filter_functions: + df = fn(df) + + return df + + +def groupby_assay(df): assay_group = df["Assay"]["Assay Group"] assay_method = df["Assay"]["Method/Technique"] groups = df.groupby([assay_group, assay_method]) + + # speed up repeated calls to np.log by caching log affinities as a column + # in the dataframe + df["_log_affinity"] = np.log(df["Assay"]["Quantitative measurement"]) + + # speed up computing percent positive with the helper column + qualitative = df["Assay"]["Qualitative Measure"] + df["_qualitative_positive"] = qualitative.str.startswith("Positive") print("---") print("Assays") assay_dataframes = {} # create a dataframe for every distinct kind of assay which is used # by IEDB submitters to measure peptide-MHC affinity or stability for (assay_group, assay_method), group_data in sorted( - groups, key=lambda x: len(x[1]), reverse=True): - print("%s (%s): %d" % (assay_group, assay_method, len(group_data))) + groups, + key=lambda x: len(x[1]), + reverse=True): + print("- %s (%s): %d" % (assay_group, assay_method, len(group_data))) group_alleles = group_data["MHC"]["Allele Name"] group_peptides = group_data["Epitope"]["Description"] distinct_pmhc = group_data.groupby([group_alleles, group_peptides]) @@ -93,19 +122,16 @@ if __name__ == "__main__": for (allele, peptide), pmhc_group in distinct_pmhc: columns["mhc"].append(allele) columns["peptide"].append(peptide) - # performing median in log space since in two datapoint case - # we don't want to take e.g. (10 + 1000) / 2.0 = 505 - # but would prefer something like 10 ** ( (1 + 3) / 2.0) = 100 - columns["value"].append( - np.exp( - np.median( - np.log( - pmhc_group["Assay"]["Quantitative measurement"])))) - qualitative = pmhc_group["Assay"]["Qualitative Measure"] - columns["percent_positive"].append( - qualitative.str.startswith("Positive").mean()) - columns["count"].append( - pmhc_group["Assay"]["Quantitative measurement"].count()) + positive = pmhc_group["_qualitative_positive"] + count = len(pmhc_group) + if count == 1: + ic50 = pmhc_group["Assay"]["Quantitative measurement"].mean() + else: + ic50 = np.exp(np.mean(pmhc_group["_log_affinity"])) + # averaging the log affinities preserves orders of magnitude better + columns["value"].append(ic50) + columns["percent_positive"].append(positive.mean()) + columns["count"].append(count) assay_dataframes[(assay_group, assay_method)] = pd.DataFrame( columns, columns=[ @@ -115,10 +141,44 @@ if __name__ == "__main__": "percent_positive", "count"]) print("# distinct pMHC entries: %d" % len(columns["mhc"])) + return assay_dataframes + +if __name__ == "__main__": + args = parser.parse_args() + df = pd.read_csv( + args.input_csv, + error_bad_lines=False, + encoding="latin-1", + header=[0, 1]) + + df = filter_mhc_dataframe(df) + + alleles = df["MHC"]["Allele Name"] + + n = len(alleles) + + print("# Class I rows: %d" % n) + print("# Class I alleles: %d" % len(set(alleles))) + print("Unique alleles: %s" % list(sorted(set(alleles)))) + + if args.alleles: + print("User-supplied allele whitelist: %s" % (args.alleles,)) + mask = np.zeros(n, dtype=bool) + for pattern in args.alleles: + pattern_mask = alleles.str.startswith(pattern) + print("# %s: %d" % (pattern, pattern_mask.sum())) + mask |= pattern_mask + df = df[mask] + print("# entries matching alleles %s: %d" % ( + args.alleles, + len(df))) + + assay_dataframes = groupby_assay(df) + if not exists(args.output_dir): makedirs(args.output_dir) output_path = join(args.output_dir, args.output_pickle_filename) - with open(args.output, "wb") as f: + with open(output_path, "wb") as f: pickle.dump(assay_dataframes, f, pickle.HIGHEST_PROTOCOL) -- GitLab