From b60d22a8a955e4e6f49356f11c5452b9c008b22b Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com> Date: Mon, 29 Jun 2015 17:19:48 -0400 Subject: [PATCH] fixed script to create combined dataset --- README.md | 13 +++++-- scripts/build-iedb-class1-dataset.py | 0 scripts/create-combined-class1-dataset.py | 34 ++++++++++++++++--- scripts/create-iedb-class1-dataset.py | 2 ++ .../train-class1-allele-specific-models.py | 2 ++ 5 files changed, 43 insertions(+), 8 deletions(-) mode change 100644 => 100755 scripts/build-iedb-class1-dataset.py mode change 100644 => 100755 scripts/create-combined-class1-dataset.py mode change 100644 => 100755 scripts/create-iedb-class1-dataset.py mode change 100644 => 100755 scripts/train-class1-allele-specific-models.py diff --git a/README.md b/README.md index bbfa7cf1..e3d0d505 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,13 @@ Peptide-MHC binding affinity prediction ``` scripts/download-iedb.sh scripts/download-peters-2013-dataset.sh -python scripts/create-iedb-class1-dataset.py -python scripts/create-combined-class1-dataset.py -``` \ No newline at end of file +scripts/create-iedb-class1-dataset.py +scripts/create-combined-class1-dataset.py +``` + +## Getting Started: Train Neural Network Models + +``` +scripts/train-class1-allele-specific-models.py +``` + diff --git a/scripts/build-iedb-class1-dataset.py b/scripts/build-iedb-class1-dataset.py old mode 100644 new mode 100755 diff --git a/scripts/create-combined-class1-dataset.py b/scripts/create-combined-class1-dataset.py old mode 100644 new mode 100755 index 51b8ad6b..4544b819 --- a/scripts/create-combined-class1-dataset.py +++ b/scripts/create-combined-class1-dataset.py @@ -1,17 +1,37 @@ +#!/usr/bin/env python + """ Combine 2013 Kim/Peters NetMHCpan dataset[*] with more recent IEDB entries * = "Dataset size and composition impact the reliability..." """ + +from os.path import join import pickle -import pandas as pd from collections import Counter +import pandas as pd + +from mhcflurry.paths import CLASS1_DATA_DIRECTORY + +IEDB_PICKLE_FILENAME = "iedb_human_class1_assay_datasets.pickle" +IEDB_PICKLE_PATH = join(CLASS1_DATA_DIRECTORY, IEDB_PICKLE_FILENAME) + +PETERS_CSV_FILENAME = "bdata.20130222.mhci.public.1.txt" +PETERS_CSV_PATH = join(CLASS1_DATA_DIRECTORY, PETERS_CSV_FILENAME) + +OUTPUT_CSV_FILENAME = "combined_human_class1_dataset.csv" +OUTPUT_CSV_PATH = join(CLASS1_DATA_DIRECTORY, OUTPUT_CSV_FILENAME) + if __name__ == "__main__": - with open("iedb_human_class1_assay_datasets.pickle", "r'") as f: + print("Reading %s..." % IEDB_PICKLE_PATH) + with open(IEDB_PICKLE_PATH, "r'") as f: iedb_datasets = pickle.load(f) - nielsen_data = pd.read_csv("bdata.20130222.mhci.public.1.txt", sep="\t") - print("Size of 2013 Nielsen dataset: %d" % len(nielsen_data)) + + print("Reading %s..." % PETERS_CSV_PATH) + nielsen_data = pd.read_csv(PETERS_CSV_PATH, sep="\t") + print("Size of 2013 Peters dataset: %d" % len(nielsen_data)) + new_allele_counts = Counter() combined_columns = { "species": list(nielsen_data["species"]), @@ -49,6 +69,9 @@ if __name__ == "__main__": print(" fraction similar binding values=%0.4f" % fraction_similar) new_peptides = joined[left_missing & ~right_missing] if fraction_similar > 0.9: + print("---") + print("\t using assay: %s" % (assay,)) + print("---") combined_columns["mhc"].extend(new_peptides["mhc"]) combined_columns["peptide"].extend(new_peptides["peptide"]) combined_columns["peptide_length"].extend(new_peptides["peptide"].str.len()) @@ -67,4 +90,5 @@ if __name__ == "__main__": print("Combined DataFrame size: %d (+%d)" % ( len(combined_df), len(combined_df) - len(nielsen_data))) - combined_df.to_csv("combined_human_class1_dataset.csv", index=False) + print("Writing %s..." % OUTPUT_CSV_PATH) + combined_df.to_csv(OUTPUT_CSV_PATH, index=False) diff --git a/scripts/create-iedb-class1-dataset.py b/scripts/create-iedb-class1-dataset.py old mode 100644 new mode 100755 index 0897e7aa..22728b8a --- a/scripts/create-iedb-class1-dataset.py +++ b/scripts/create-iedb-class1-dataset.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python + """ Turn a raw CSV snapshot of the IEDB contents into a usable class I binding prediction dataset by grouping all unique pMHCs diff --git a/scripts/train-class1-allele-specific-models.py b/scripts/train-class1-allele-specific-models.py old mode 100644 new mode 100755 index 97ece893..a50ba56f --- a/scripts/train-class1-allele-specific-models.py +++ b/scripts/train-class1-allele-specific-models.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python + """ Train one neural network for every allele w/ more than 50 data points in our dataset. -- GitLab