From b60d22a8a955e4e6f49356f11c5452b9c008b22b Mon Sep 17 00:00:00 2001
From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com>
Date: Mon, 29 Jun 2015 17:19:48 -0400
Subject: [PATCH] fixed script to create combined dataset

---
 README.md                                     | 13 +++++--
 scripts/build-iedb-class1-dataset.py          |  0
 scripts/create-combined-class1-dataset.py     | 34 ++++++++++++++++---
 scripts/create-iedb-class1-dataset.py         |  2 ++
 .../train-class1-allele-specific-models.py    |  2 ++
 5 files changed, 43 insertions(+), 8 deletions(-)
 mode change 100644 => 100755 scripts/build-iedb-class1-dataset.py
 mode change 100644 => 100755 scripts/create-combined-class1-dataset.py
 mode change 100644 => 100755 scripts/create-iedb-class1-dataset.py
 mode change 100644 => 100755 scripts/train-class1-allele-specific-models.py

diff --git a/README.md b/README.md
index bbfa7cf1..e3d0d505 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,13 @@ Peptide-MHC binding affinity prediction
 ```
 scripts/download-iedb.sh
 scripts/download-peters-2013-dataset.sh
-python scripts/create-iedb-class1-dataset.py
-python scripts/create-combined-class1-dataset.py
-```
\ No newline at end of file
+scripts/create-iedb-class1-dataset.py
+scripts/create-combined-class1-dataset.py
+```
+
+## Getting Started: Train Neural Network Models
+
+```
+scripts/train-class1-allele-specific-models.py
+```
+
diff --git a/scripts/build-iedb-class1-dataset.py b/scripts/build-iedb-class1-dataset.py
old mode 100644
new mode 100755
diff --git a/scripts/create-combined-class1-dataset.py b/scripts/create-combined-class1-dataset.py
old mode 100644
new mode 100755
index 51b8ad6b..4544b819
--- a/scripts/create-combined-class1-dataset.py
+++ b/scripts/create-combined-class1-dataset.py
@@ -1,17 +1,37 @@
+#!/usr/bin/env python
+
 """
 Combine 2013 Kim/Peters NetMHCpan dataset[*] with more recent IEDB entries
 
 * = "Dataset size and composition impact the reliability..."
 """
+
+from os.path import join
 import pickle
-import pandas as pd
 from collections import Counter
 
+import pandas as pd
+
+from mhcflurry.paths import CLASS1_DATA_DIRECTORY
+
+IEDB_PICKLE_FILENAME = "iedb_human_class1_assay_datasets.pickle"
+IEDB_PICKLE_PATH = join(CLASS1_DATA_DIRECTORY, IEDB_PICKLE_FILENAME)
+
+PETERS_CSV_FILENAME = "bdata.20130222.mhci.public.1.txt"
+PETERS_CSV_PATH = join(CLASS1_DATA_DIRECTORY, PETERS_CSV_FILENAME)
+
+OUTPUT_CSV_FILENAME = "combined_human_class1_dataset.csv"
+OUTPUT_CSV_PATH = join(CLASS1_DATA_DIRECTORY, OUTPUT_CSV_FILENAME)
+
 if __name__ == "__main__":
-    with open("iedb_human_class1_assay_datasets.pickle", "r'") as f:
+    print("Reading %s..." % IEDB_PICKLE_PATH)
+    with open(IEDB_PICKLE_PATH, "rb") as f:  # pickle data must be read as bytes
         iedb_datasets = pickle.load(f)
-    nielsen_data = pd.read_csv("bdata.20130222.mhci.public.1.txt", sep="\t")
-    print("Size of 2013 Nielsen dataset: %d" % len(nielsen_data))
+
+    print("Reading %s..." % PETERS_CSV_PATH)
+    nielsen_data = pd.read_csv(PETERS_CSV_PATH, sep="\t")
+    print("Size of 2013 Peters dataset: %d" % len(nielsen_data))
+
     new_allele_counts = Counter()
     combined_columns = {
         "species": list(nielsen_data["species"]),
@@ -49,6 +69,9 @@ if __name__ == "__main__":
         print("  fraction similar binding values=%0.4f" % fraction_similar)
         new_peptides = joined[left_missing & ~right_missing]
         if fraction_similar > 0.9:
+            print("---")
+            print("\t using assay: %s" % (assay,))
+            print("---")
             combined_columns["mhc"].extend(new_peptides["mhc"])
             combined_columns["peptide"].extend(new_peptides["peptide"])
             combined_columns["peptide_length"].extend(new_peptides["peptide"].str.len())
@@ -67,4 +90,5 @@ if __name__ == "__main__":
     print("Combined DataFrame size: %d (+%d)" % (
             len(combined_df),
             len(combined_df) - len(nielsen_data)))
-    combined_df.to_csv("combined_human_class1_dataset.csv", index=False)
+    print("Writing %s..." % OUTPUT_CSV_PATH)
+    combined_df.to_csv(OUTPUT_CSV_PATH, index=False)
diff --git a/scripts/create-iedb-class1-dataset.py b/scripts/create-iedb-class1-dataset.py
old mode 100644
new mode 100755
index 0897e7aa..22728b8a
--- a/scripts/create-iedb-class1-dataset.py
+++ b/scripts/create-iedb-class1-dataset.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 """
 Turn a raw CSV snapshot of the IEDB contents into a usable
 class I binding prediction dataset by grouping all unique pMHCs
diff --git a/scripts/train-class1-allele-specific-models.py b/scripts/train-class1-allele-specific-models.py
old mode 100644
new mode 100755
index 97ece893..a50ba56f
--- a/scripts/train-class1-allele-specific-models.py
+++ b/scripts/train-class1-allele-specific-models.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
 """
 Train one neural network for every allele w/ more than 50 data points in
 our dataset.
-- 
GitLab