Skip to content
Snippets Groups Projects
Commit 3d69d3ea authored by Alex Rubinsteyn's avatar Alex Rubinsteyn
Browse files

added scripts to download, normalize, and combine datasets

parent c112b3fd
No related merge requests found
# mhcflurry
Peptide-MHC binding affinity prediction
## Getting Started: Download, Normalize, and Combine Training Data
```sh
scripts/download-iedb.sh
scripts/download-peters-2013-dataset.sh
python scripts/create-iedb-class1-dataset.py
python scripts/create-combined-class1-dataset.py
```
\ No newline at end of file
"""
Turn a raw CSV snapshot of the IEDB contents into a usable
class I binding prediction dataset by grouping all unique pMHCs
"""
from collections import defaultdict
from os.path import join
import pickle
import numpy as np
import pandas as pd
from mhcflurry.paths import CLASS1_DATA_DIRECTORY
IEDB_SOURCE_FILENAME = "mhc_ligand_full.csv"
IEDB_SOURCE_PATH = join(CLASS1_DATA_DIRECTORY, IEDB_SOURCE_FILENAME)
print(IEDB_SOURCE_PATH)
OUTPUT_FILENAME = "iedb_human_class1_assay_datasets.pickle"
OUTPUT_PATH = join(CLASS1_DATA_DIRECTORY, OUTPUT_FILENAME)
if __name__ == "__main__":
df = pd.read_csv(
IEDB_SOURCE_PATH,
error_bad_lines=False,
encoding="latin-1",
header=[0, 1])
alleles = df["MHC"]["Allele Name"]
n = len(alleles)
print("# total: %d" % n)
mask = np.zeros(n, dtype=bool)
patterns = [
"HLA-A",
"HLA-B",
"HLA-C",
# "H-2-D",
# "H-2-K",
# "H-2-L",
]
for pattern in patterns:
pattern_mask = alleles.str.startswith(pattern)
print("# %s: %d" % (pattern, pattern_mask.sum()))
mask |= pattern_mask
df = df[mask]
print("# entries matching allele masks: %d" % (len(df)))
assay_group = df["Assay"]["Assay Group"]
assay_method = df["Assay"]["Method/Technique"]
groups = df.groupby([assay_group, assay_method])
print("---")
print("Assays")
assay_dataframes = {}
# create a dataframe for every distinct kind of assay which is used
# by IEDB submitters to measure peptide-MHC affinity or stability
for (assay_group, assay_method), group_data in sorted(
groups, key=lambda x: len(x[1]), reverse=True):
print("%s (%s): %d" % (assay_group, assay_method, len(group_data)))
group_alleles = group_data["MHC"]["Allele Name"]
group_peptides = group_data["Epitope"]["Description"]
distinct_pmhc = group_data.groupby([group_alleles, group_peptides])
columns = defaultdict(list)
for (allele, peptide), pmhc_group in distinct_pmhc:
columns["mhc"].append(allele)
columns["peptide"].append(peptide)
# performing median in log space since in two datapoint case
# we don't want to take e.g. (10 + 1000) / 2.0 = 505
# but would prefer something like 10 ** ( (1 + 3) / 2.0) = 100
columns["value"].append(
np.exp(
np.median(
np.log(
pmhc_group["Assay"]["Quantitative measurement"]))))
qualitative = pmhc_group["Assay"]["Qualitative Measure"]
columns["percent_positive"].append(
qualitative.str.startswith("Positive").mean())
columns["count"].append(
pmhc_group["Assay"]["Quantitative measurement"].count())
assay_dataframes[(assay_group, assay_method)] = pd.DataFrame(
columns,
columns=[
"mhc",
"peptide",
"value",
"percent_positive",
"count"])
print("# distinct pMHC entries: %d" % len(columns["mhc"]))
with open(OUTPUT_PATH, "w") as f:
pickle.dump(assay_dataframes, f, pickle.HIGHEST_PROTOCOL)
"""
Combine 2013 Kim/Peters NetMHCpan dataset[*] with more recent IEDB entries
* = "Dataset size and composition impact the reliability..."
"""
import pickle
import pandas as pd
from collections import Counter
if __name__ == "__main__":
with open("iedb_human_class1_assay_datasets.pickle", "r'") as f:
iedb_datasets = pickle.load(f)
nielsen_data = pd.read_csv("bdata.20130222.mhci.public.1.txt", sep="\t")
print("Size of 2013 Nielsen dataset: %d" % len(nielsen_data))
new_allele_counts = Counter()
combined_columns = {
"species": list(nielsen_data["species"]),
"mhc": list(nielsen_data["mhc"]),
"peptide": list(nielsen_data["sequence"]),
"peptide_length": list(nielsen_data["peptide_length"]),
"meas": list(nielsen_data["meas"]),
}
for assay, assay_dataset in sorted(iedb_datasets.items(), key=lambda x: len(x[1])):
joined = nielsen_data.merge(
assay_dataset,
left_on=["mhc", "sequence"],
right_on=["mhc", "peptide"],
how="outer")
if len(joined) == 0:
continue
# drop NaN binding values and entries without values in both datasets
left_missing = joined["meas"].isnull()
right_missing = joined["value"].isnull()
overlap_filter_mask = ~(left_missing | right_missing)
filtered = joined[overlap_filter_mask]
n_overlap = len(filtered)
if n_overlap == 0:
continue
# let's count what fraction of this IEDB assay is within 1% of the values in the
# Nielsen dataset
similar_values = (
(filtered["value"] - filtered["meas"]).abs() <= (filtered["meas"] / 100.0))
fraction_similar = similar_values.mean()
print("Assay=%s, count=%d" % (assay, len(assay_dataset)))
print(" # entries w/ values in both data sets: %d" % n_overlap)
print(" fraction similar binding values=%0.4f" % fraction_similar)
new_peptides = joined[left_missing & ~right_missing]
if fraction_similar > 0.9:
combined_columns["mhc"].extend(new_peptides["mhc"])
combined_columns["peptide"].extend(new_peptides["peptide"])
combined_columns["peptide_length"].extend(new_peptides["peptide"].str.len())
combined_columns["meas"].extend(new_peptides["value"])
# TODO: make this work for non-human data
combined_columns["species"].extend(["human"] * len(new_peptides))
for allele in new_peptides["mhc"]:
new_allele_counts[allele] += 1
combined_df = pd.DataFrame(
combined_columns,
columns=["species", "mhc", "peptide", "peptide_length", "meas"])
print("New entry allele distribution")
for (allele, count) in new_allele_counts.most_common():
print("%s: %d" % (allele, count))
print("Combined DataFrame size: %d (+%d)" % (
len(combined_df),
len(combined_df) - len(nielsen_data)))
combined_df.to_csv("combined_human_class1_dataset.csv", index=False)
"""
Turn a raw CSV snapshot of the IEDB contents into a usable
class I binding prediction dataset by grouping all unique pMHCs
"""
from collections import defaultdict
from os.path import join
import pickle
import numpy as np
import pandas as pd
from mhcflurry.paths import CLASS1_DATA_DIRECTORY
IEDB_SOURCE_FILENAME = "mhc_ligand_full.csv"
IEDB_SOURCE_PATH = join(CLASS1_DATA_DIRECTORY, IEDB_SOURCE_FILENAME)
OUTPUT_FILENAME = "iedb_human_class1_assay_datasets.pickle"
OUTPUT_PATH = join(CLASS1_DATA_DIRECTORY, OUTPUT_FILENAME)
if __name__ == "__main__":
df = pd.read_csv(
IEDB_SOURCE_PATH,
error_bad_lines=False,
encoding="latin-1",
header=[0, 1])
alleles = df["MHC"]["Allele Name"]
n = len(alleles)
print("# total: %d" % n)
mask = np.zeros(n, dtype=bool)
patterns = [
"HLA-A",
"HLA-B",
"HLA-C",
# "H-2-D",
# "H-2-K",
# "H-2-L",
]
for pattern in patterns:
pattern_mask = alleles.str.startswith(pattern)
print("# %s: %d" % (pattern, pattern_mask.sum()))
mask |= pattern_mask
df = df[mask]
print("# entries matching allele masks: %d" % (len(df)))
assay_group = df["Assay"]["Assay Group"]
assay_method = df["Assay"]["Method/Technique"]
groups = df.groupby([assay_group, assay_method])
print("---")
print("Assays")
assay_dataframes = {}
# create a dataframe for every distinct kind of assay which is used
# by IEDB submitters to measure peptide-MHC affinity or stability
for (assay_group, assay_method), group_data in sorted(
groups, key=lambda x: len(x[1]), reverse=True):
print("%s (%s): %d" % (assay_group, assay_method, len(group_data)))
group_alleles = group_data["MHC"]["Allele Name"]
group_peptides = group_data["Epitope"]["Description"]
distinct_pmhc = group_data.groupby([group_alleles, group_peptides])
columns = defaultdict(list)
for (allele, peptide), pmhc_group in distinct_pmhc:
columns["mhc"].append(allele)
columns["peptide"].append(peptide)
# performing median in log space since in two datapoint case
# we don't want to take e.g. (10 + 1000) / 2.0 = 505
# but would prefer something like 10 ** ( (1 + 3) / 2.0) = 100
columns["value"].append(
np.exp(
np.median(
np.log(
pmhc_group["Assay"]["Quantitative measurement"]))))
qualitative = pmhc_group["Assay"]["Qualitative Measure"]
columns["percent_positive"].append(
qualitative.str.startswith("Positive").mean())
columns["count"].append(
pmhc_group["Assay"]["Quantitative measurement"].count())
assay_dataframes[(assay_group, assay_method)] = pd.DataFrame(
columns,
columns=[
"mhc",
"peptide",
"value",
"percent_positive",
"count"])
print("# distinct pMHC entries: %d" % len(columns["mhc"]))
with open(OUTPUT_PATH, "w") as f:
pickle.dump(assay_dataframes, f, pickle.HIGHEST_PROTOCOL)
#!/usr/bin/env bash
# Download the full IEDB MHC ligand snapshot and install it into the
# mhcflurry class I data directory.
#
# set -e/-u/pipefail: without it, a failed wget or unzip would still
# run the final mv and silently install a stale or missing file.
set -euo pipefail

rm -f mhc_ligand_full*
wget http://www.iedb.org/doc/mhc_ligand_full.zip
unzip mhc_ligand_full.zip
# Ask mhcflurry where its class I data directory lives
DATA_DIR=$(python -c "import mhcflurry; print(mhcflurry.paths.CLASS1_DATA_DIRECTORY)")
mkdir -p -- "$DATA_DIR"
mv mhc_ligand_full.csv "$DATA_DIR"
\ No newline at end of file
#!/usr/bin/env bash
# Download dataset from Kim/Peters 2013 "Dataset size and composition" paper
# and install it into the mhcflurry class I data directory.
#
# set -e/-u/pipefail: without it, a failed wget would still run the
# final mv and silently install a missing file.
set -euo pipefail

rm -f bdata.20130222.mhci.public*
wget https://dl.dropboxusercontent.com/u/3967524/bdata.20130222.mhci.public.1.txt
# Ask mhcflurry where its class I data directory lives
DATA_DIR=$(python -c "import mhcflurry; print(mhcflurry.paths.CLASS1_DATA_DIRECTORY)")
mkdir -p -- "$DATA_DIR"
mv bdata.20130222.mhci.public.1.txt "$DATA_DIR"
#!/usr/bin/env bash
# Print the mhcflurry class I data directory (helper for other scripts
# and for debugging installs). $() replaces the legacy backtick form.
DATA_DIR=$(python -c "import mhcflurry; print(mhcflurry.paths.CLASS1_DATA_DIRECTORY)")
echo "$DATA_DIR"
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment