Skip to content
Snippets Groups Projects
Commit 1d2d7228 authored by Alex Rubinsteyn's avatar Alex Rubinsteyn
Browse files

split up filtering of IEDB MHC dataframe

parent 6859d990
No related merge requests found
......@@ -45,47 +45,76 @@ parser.add_argument(
default=CLASS1_DATA_DIRECTORY,
help="Directory to write output pickle file")
parser.add_argument(
"--output-pickle-filename",
default=PICKLE_OUTPUT_FILENAME,
help="Path to .pickle file containing dictionary of IEDB assay datasets")
if __name__ == "__main__":
args = parser.parse_args()
df = pd.read_csv(
args.input_csv,
error_bad_lines=False,
encoding="latin-1",
header=[0, 1])
parser.add_argument(
"--alleles",
nargs="+",
default=[],
help="Restrict dataset to specified alleles")
def filter_class1_alleles(df):
mhc_class = df["MHC"]["MHC allele class"]
print("MHC class counts: \n%s" % (mhc_class.value_counts(),))
class1_mask = mhc_class == "I"
return df[class1_mask]
def filter_allele_names(df):
alleles = df["MHC"]["Allele Name"]
n = len(alleles)
print("# total: %d" % n)
mask = np.zeros(n, dtype=bool)
patterns = [
"HLA-A",
"HLA-B",
"HLA-C",
"H-2",
invalid_allele_mask = alleles.str.contains(" ") | alleles.str.contains("/")
invalid_alleles = alleles[invalid_allele_mask]
print("-- Invalid allele names: %s" % (list(sorted(set(invalid_alleles)))))
print("Dropping %d with complex alleles (e.g. descriptions of mutations)" % (
len(invalid_alleles),))
return df[~invalid_allele_mask]
def filter_affinity_values(df):
affinities = df["Assay"]["Quantitative measurement"]
finite_affinity_mask = ~affinities.isnull() & np.isfinite(affinities)
invalid_affinity_mask = ~finite_affinity_mask
print("Dropping %d rows without finite affinity measurements" % (
invalid_affinity_mask.sum(),))
return df[finite_affinity_mask]
def filter_mhc_dataframe(df):
filter_functions = [
filter_class1_alleles,
filter_allele_names,
filter_affinity_values,
]
for pattern in patterns:
pattern_mask = alleles.str.startswith(pattern)
print("# %s: %d" % (pattern, pattern_mask.sum()))
mask |= pattern_mask
df = df[mask]
print("# entries matching allele masks: %d" % (len(df)))
for fn in filter_functions:
df = fn(df)
return df
def groupby_assay(df):
assay_group = df["Assay"]["Assay Group"]
assay_method = df["Assay"]["Method/Technique"]
groups = df.groupby([assay_group, assay_method])
# speed up repeated calls to np.log by caching log affinities as a column
# in the dataframe
df["_log_affinity"] = np.log(df["Assay"]["Quantitative measurement"])
# speed up computing percent positive with the helper column
qualitative = df["Assay"]["Qualitative Measure"]
df["_qualitative_positive"] = qualitative.str.startswith("Positive")
print("---")
print("Assays")
assay_dataframes = {}
# create a dataframe for every distinct kind of assay which is used
# by IEDB submitters to measure peptide-MHC affinity or stability
for (assay_group, assay_method), group_data in sorted(
groups, key=lambda x: len(x[1]), reverse=True):
print("%s (%s): %d" % (assay_group, assay_method, len(group_data)))
groups,
key=lambda x: len(x[1]),
reverse=True):
print("- %s (%s): %d" % (assay_group, assay_method, len(group_data)))
group_alleles = group_data["MHC"]["Allele Name"]
group_peptides = group_data["Epitope"]["Description"]
distinct_pmhc = group_data.groupby([group_alleles, group_peptides])
......@@ -93,19 +122,16 @@ if __name__ == "__main__":
for (allele, peptide), pmhc_group in distinct_pmhc:
columns["mhc"].append(allele)
columns["peptide"].append(peptide)
# performing median in log space since in two datapoint case
# we don't want to take e.g. (10 + 1000) / 2.0 = 505
# but would prefer something like 10 ** ( (1 + 3) / 2.0) = 100
columns["value"].append(
np.exp(
np.median(
np.log(
pmhc_group["Assay"]["Quantitative measurement"]))))
qualitative = pmhc_group["Assay"]["Qualitative Measure"]
columns["percent_positive"].append(
qualitative.str.startswith("Positive").mean())
columns["count"].append(
pmhc_group["Assay"]["Quantitative measurement"].count())
positive = pmhc_group["_qualitative_positive"]
count = len(pmhc_group)
if count == 1:
ic50 = pmhc_group["Assay"]["Quantitative measurement"].mean()
else:
ic50 = np.exp(np.mean(pmhc_group["_log_affinity"]))
# averaging the log affinities preserves orders of magnitude better
columns["value"].append(ic50)
columns["percent_positive"].append(positive.mean())
columns["count"].append(count)
assay_dataframes[(assay_group, assay_method)] = pd.DataFrame(
columns,
columns=[
......@@ -115,10 +141,44 @@ if __name__ == "__main__":
"percent_positive",
"count"])
print("# distinct pMHC entries: %d" % len(columns["mhc"]))
return assay_dataframes
if __name__ == "__main__":
args = parser.parse_args()
df = pd.read_csv(
args.input_csv,
error_bad_lines=False,
encoding="latin-1",
header=[0, 1])
df = filter_mhc_dataframe(df)
alleles = df["MHC"]["Allele Name"]
n = len(alleles)
print("# Class I rows: %d" % n)
print("# Class I alleles: %d" % len(set(alleles)))
print("Unique alleles: %s" % list(sorted(set(alleles))))
if args.alleles:
print("User-supplied allele whitelist: %s" % (args.alleles,))
mask = np.zeros(n, dtype=bool)
for pattern in args.alleles:
pattern_mask = alleles.str.startswith(pattern)
print("# %s: %d" % (pattern, pattern_mask.sum()))
mask |= pattern_mask
df = df[mask]
print("# entries matching alleles %s: %d" % (
args.alleles,
len(df)))
assay_dataframes = groupby_assay(df)
if not exists(args.output_dir):
makedirs(args.output_dir)
output_path = join(args.output_dir, args.output_pickle_filename)
with open(args.output, "wb") as f:
with open(output_path, "wb") as f:
pickle.dump(assay_dataframes, f, pickle.HIGHEST_PROTOCOL)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment