From 3d69d3eae7a99d33fa2b58bfdcaa8b4371d5b161 Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com> Date: Mon, 29 Jun 2015 16:57:55 -0400 Subject: [PATCH] added scripts to download, normalize, and combine datasets --- README.md | 9 +++ scripts/build-iedb-class1-dataset.py | 86 +++++++++++++++++++++++ scripts/create-combined-class1-dataset.py | 70 ++++++++++++++++++ scripts/create-iedb-class1-dataset.py | 86 +++++++++++++++++++++++ scripts/download-iedb.sh | 7 ++ scripts/download-peters-2013-dataset.sh | 8 +++ scripts/print-data-dir.sh | 3 + 7 files changed, 269 insertions(+) create mode 100644 scripts/build-iedb-class1-dataset.py create mode 100644 scripts/create-combined-class1-dataset.py create mode 100644 scripts/create-iedb-class1-dataset.py create mode 100755 scripts/download-iedb.sh create mode 100755 scripts/download-peters-2013-dataset.sh create mode 100755 scripts/print-data-dir.sh diff --git a/README.md b/README.md index 9a655c55..bbfa7cf1 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,11 @@ # mhcflurry Peptide-MHC binding affinity prediction + +## Getting Started: Download, Normalize, and Combine Training Data + +``` +scripts/download-iedb.sh +scripts/download-peters-2013-dataset.sh +python scripts/create-iedb-class1-dataset.py +python scripts/create-combined-class1-dataset.py +``` \ No newline at end of file diff --git a/scripts/build-iedb-class1-dataset.py b/scripts/build-iedb-class1-dataset.py new file mode 100644 index 00000000..70a7fa73 --- /dev/null +++ b/scripts/build-iedb-class1-dataset.py @@ -0,0 +1,86 @@ +""" +Turn a raw CSV snapshot of the IEDB contents into a usable +class I binding prediction dataset by grouping all unique pMHCs +""" +from collections import defaultdict +from os.path import join +import pickle + +import numpy as np +import pandas as pd + +from mhcflurry.paths import CLASS1_DATA_DIRECTORY + +IEDB_SOURCE_FILENAME = "mhc_ligand_full.csv" +IEDB_SOURCE_PATH = join(CLASS1_DATA_DIRECTORY, 
IEDB_SOURCE_FILENAME) +print(IEDB_SOURCE_PATH) +OUTPUT_FILENAME = "iedb_human_class1_assay_datasets.pickle" +OUTPUT_PATH = join(CLASS1_DATA_DIRECTORY, OUTPUT_FILENAME) + +if __name__ == "__main__": + df = pd.read_csv( + IEDB_SOURCE_PATH, + error_bad_lines=False, + encoding="latin-1", + header=[0, 1]) + alleles = df["MHC"]["Allele Name"] + n = len(alleles) + print("# total: %d" % n) + + mask = np.zeros(n, dtype=bool) + patterns = [ + "HLA-A", + "HLA-B", + "HLA-C", + # "H-2-D", + # "H-2-K", + # "H-2-L", + ] + for pattern in patterns: + pattern_mask = alleles.str.startswith(pattern) + print("# %s: %d" % (pattern, pattern_mask.sum())) + mask |= pattern_mask + df = df[mask] + print("# entries matching allele masks: %d" % (len(df))) + assay_group = df["Assay"]["Assay Group"] + assay_method = df["Assay"]["Method/Technique"] + groups = df.groupby([assay_group, assay_method]) + print("---") + print("Assays") + assay_dataframes = {} + # create a dataframe for every distinct kind of assay which is used + # by IEDB submitters to measure peptide-MHC affinity or stability + for (assay_group, assay_method), group_data in sorted( + groups, key=lambda x: len(x[1]), reverse=True): + print("%s (%s): %d" % (assay_group, assay_method, len(group_data))) + group_alleles = group_data["MHC"]["Allele Name"] + group_peptides = group_data["Epitope"]["Description"] + distinct_pmhc = group_data.groupby([group_alleles, group_peptides]) + columns = defaultdict(list) + for (allele, peptide), pmhc_group in distinct_pmhc: + columns["mhc"].append(allele) + columns["peptide"].append(peptide) + # performing median in log space since in two datapoint case + # we don't want to take e.g. 
(10 + 1000) / 2.0 = 505 + # but would prefer something like 10 ** ( (1 + 3) / 2.0) = 100 + columns["value"].append( + np.exp( + np.median( + np.log( + pmhc_group["Assay"]["Quantitative measurement"])))) + qualitative = pmhc_group["Assay"]["Qualitative Measure"] + columns["percent_positive"].append( + qualitative.str.startswith("Positive").mean()) + columns["count"].append( + pmhc_group["Assay"]["Quantitative measurement"].count()) + assay_dataframes[(assay_group, assay_method)] = pd.DataFrame( + columns, + columns=[ + "mhc", + "peptide", + "value", + "percent_positive", + "count"]) + print("# distinct pMHC entries: %d" % len(columns["mhc"])) + with open(OUTPUT_PATH, "wb") as f: + pickle.dump(assay_dataframes, f, pickle.HIGHEST_PROTOCOL) diff --git a/scripts/create-combined-class1-dataset.py b/scripts/create-combined-class1-dataset.py new file mode 100644 index 00000000..51b8ad6b --- /dev/null +++ b/scripts/create-combined-class1-dataset.py @@ -0,0 +1,70 @@ +""" +Combine 2013 Kim/Peters NetMHCpan dataset[*] with more recent IEDB entries + +* = "Dataset size and composition impact the reliability of performance benchmarks for peptide-MHC binding predictions" 
+""" +import pickle +import pandas as pd +from collections import Counter + +if __name__ == "__main__": + with open("iedb_human_class1_assay_datasets.pickle", "r'") as f: + iedb_datasets = pickle.load(f) + nielsen_data = pd.read_csv("bdata.20130222.mhci.public.1.txt", sep="\t") + print("Size of 2013 Nielsen dataset: %d" % len(nielsen_data)) + new_allele_counts = Counter() + combined_columns = { + "species": list(nielsen_data["species"]), + "mhc": list(nielsen_data["mhc"]), + "peptide": list(nielsen_data["sequence"]), + "peptide_length": list(nielsen_data["peptide_length"]), + "meas": list(nielsen_data["meas"]), + } + + for assay, assay_dataset in sorted(iedb_datasets.items(), key=lambda x: len(x[1])): + joined = nielsen_data.merge( + assay_dataset, + left_on=["mhc", "sequence"], + right_on=["mhc", "peptide"], + how="outer") + + if len(joined) == 0: + continue + # drop NaN binding values and entries without values in both datasets + + left_missing = joined["meas"].isnull() + right_missing = joined["value"].isnull() + overlap_filter_mask = ~(left_missing | right_missing) + filtered = joined[overlap_filter_mask] + n_overlap = len(filtered) + if n_overlap == 0: + continue + # let's count what fraction of this IEDB assay is within 1% of the values in the + # Nielsen dataset + similar_values = ( + (filtered["value"] - filtered["meas"]).abs() <= (filtered["meas"] / 100.0)) + fraction_similar = similar_values.mean() + print("Assay=%s, count=%d" % (assay, len(assay_dataset))) + print(" # entries w/ values in both data sets: %d" % n_overlap) + print(" fraction similar binding values=%0.4f" % fraction_similar) + new_peptides = joined[left_missing & ~right_missing] + if fraction_similar > 0.9: + combined_columns["mhc"].extend(new_peptides["mhc"]) + combined_columns["peptide"].extend(new_peptides["peptide"]) + combined_columns["peptide_length"].extend(new_peptides["peptide"].str.len()) + combined_columns["meas"].extend(new_peptides["value"]) + # TODO: make this work for 
non-human data + combined_columns["species"].extend(["human"] * len(new_peptides)) + for allele in new_peptides["mhc"]: + new_allele_counts[allele] += 1 + + combined_df = pd.DataFrame( + combined_columns, + columns=["species", "mhc", "peptide", "peptide_length", "meas"]) + print("New entry allele distribution") + for (allele, count) in new_allele_counts.most_common(): + print("%s: %d" % (allele, count)) + print("Combined DataFrame size: %d (+%d)" % ( + len(combined_df), + len(combined_df) - len(nielsen_data))) + combined_df.to_csv("combined_human_class1_dataset.csv", index=False) diff --git a/scripts/create-iedb-class1-dataset.py b/scripts/create-iedb-class1-dataset.py new file mode 100644 index 00000000..0897e7aa --- /dev/null +++ b/scripts/create-iedb-class1-dataset.py @@ -0,0 +1,86 @@ +""" +Turn a raw CSV snapshot of the IEDB contents into a usable +class I binding prediction dataset by grouping all unique pMHCs +""" +from collections import defaultdict +from os.path import join +import pickle + +import numpy as np +import pandas as pd + +from mhcflurry.paths import CLASS1_DATA_DIRECTORY + +IEDB_SOURCE_FILENAME = "mhc_ligand_full.csv" +IEDB_SOURCE_PATH = join(CLASS1_DATA_DIRECTORY, IEDB_SOURCE_FILENAME) + +OUTPUT_FILENAME = "iedb_human_class1_assay_datasets.pickle" +OUTPUT_PATH = join(CLASS1_DATA_DIRECTORY, OUTPUT_FILENAME) + +if __name__ == "__main__": + df = pd.read_csv( + IEDB_SOURCE_PATH, + error_bad_lines=False, + encoding="latin-1", + header=[0, 1]) + alleles = df["MHC"]["Allele Name"] + n = len(alleles) + print("# total: %d" % n) + + mask = np.zeros(n, dtype=bool) + patterns = [ + "HLA-A", + "HLA-B", + "HLA-C", + # "H-2-D", + # "H-2-K", + # "H-2-L", + ] + for pattern in patterns: + pattern_mask = alleles.str.startswith(pattern) + print("# %s: %d" % (pattern, pattern_mask.sum())) + mask |= pattern_mask + df = df[mask] + print("# entries matching allele masks: %d" % (len(df))) + assay_group = df["Assay"]["Assay Group"] + assay_method = 
df["Assay"]["Method/Technique"] + groups = df.groupby([assay_group, assay_method]) + print("---") + print("Assays") + assay_dataframes = {} + # create a dataframe for every distinct kind of assay which is used + # by IEDB submitters to measure peptide-MHC affinity or stability + for (assay_group, assay_method), group_data in sorted( + groups, key=lambda x: len(x[1]), reverse=True): + print("%s (%s): %d" % (assay_group, assay_method, len(group_data))) + group_alleles = group_data["MHC"]["Allele Name"] + group_peptides = group_data["Epitope"]["Description"] + distinct_pmhc = group_data.groupby([group_alleles, group_peptides]) + columns = defaultdict(list) + for (allele, peptide), pmhc_group in distinct_pmhc: + columns["mhc"].append(allele) + columns["peptide"].append(peptide) + # performing median in log space since in two datapoint case + # we don't want to take e.g. (10 + 1000) / 2.0 = 505 + # but would prefer something like 10 ** ( (1 + 3) / 2.0) = 100 + columns["value"].append( + np.exp( + np.median( + np.log( + pmhc_group["Assay"]["Quantitative measurement"])))) + qualitative = pmhc_group["Assay"]["Qualitative Measure"] + columns["percent_positive"].append( + qualitative.str.startswith("Positive").mean()) + columns["count"].append( + pmhc_group["Assay"]["Quantitative measurement"].count()) + assay_dataframes[(assay_group, assay_method)] = pd.DataFrame( + columns, + columns=[ + "mhc", + "peptide", + "value", + "percent_positive", + "count"]) + print("# distinct pMHC entries: %d" % len(columns["mhc"])) + with open(OUTPUT_PATH, "w") as f: + pickle.dump(assay_dataframes, f, pickle.HIGHEST_PROTOCOL) diff --git a/scripts/download-iedb.sh b/scripts/download-iedb.sh new file mode 100755 index 00000000..bc962deb --- /dev/null +++ b/scripts/download-iedb.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash +rm -f mhc_ligand_full* +wget http://www.iedb.org/doc/mhc_ligand_full.zip +unzip mhc_ligand_full.zip +DATA_DIR=`python -c "import mhcflurry; 
print(mhcflurry.paths.CLASS1_DATA_DIRECTORY)"` +mkdir -p -- "$DATA_DIR" +mv mhc_ligand_full.csv "$DATA_DIR" \ No newline at end of file diff --git a/scripts/download-peters-2013-dataset.sh b/scripts/download-peters-2013-dataset.sh new file mode 100755 index 00000000..76d814aa --- /dev/null +++ b/scripts/download-peters-2013-dataset.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash + +# Download dataset from Kim/Peters 2013 "Dataset size and composition" paper +rm -f bdata.20130222.mhci.public* +wget https://dl.dropboxusercontent.com/u/3967524/bdata.20130222.mhci.public.1.txt +DATA_DIR=`python -c "import mhcflurry; print(mhcflurry.paths.CLASS1_DATA_DIRECTORY)"` +mkdir -p -- "$DATA_DIR" +mv bdata.20130222.mhci.public.1.txt "$DATA_DIR" diff --git a/scripts/print-data-dir.sh b/scripts/print-data-dir.sh new file mode 100755 index 00000000..8d0c2da2 --- /dev/null +++ b/scripts/print-data-dir.sh @@ -0,0 +1,3 @@ +#!/usr/bin/env bash +DATA_DIR=`python -c "import mhcflurry; print(mhcflurry.paths.CLASS1_DATA_DIRECTORY)"` +echo "$DATA_DIR" \ No newline at end of file -- GitLab