print dataset size for each allele

cf1c077e · Alex Rubinsteyn · 92abfb6e · cf1c077e · cf1c077e · cf1c077e
Commit cf1c077e authored 9 years ago by Alex Rubinsteyn
--- a/mhcflurry/paths.py
+++ b/mhcflurry/paths.py
@@ -17,4 +17,7 @@ from appdirs import user_data_dir
 BASE_DIRECTORY = user_data_dir("mhcflurry", version="0.1")
 CLASS1_DATA_DIRECTORY = join(BASE_DIRECTORY, "class1_data")
 CLASS1_MODEL_DIRECTORY = join(BASE_DIRECTORY, "class1_models")
\ No newline at end of file
+CLASS1_DATA_CSV_FILENAME = "combined_human_class1_dataset.csv"
+CLASS1_DATA_CSV_PATH = join(CLASS1_DATA_DIRECTORY, CLASS1_DATA_CSV_FILENAME)
--- a/scripts/create-combined-class1-dataset.py
+++ b/scripts/create-combined-class1-dataset.py
@@ -19,7 +19,7 @@ import argparse
 import pandas as pd
-from mhcflurry.paths import CLASS1_DATA_DIRECTORY
+from mhcflurry.paths import CLASS1_DATA_DIRECTORY, CLASS1_DATA_CSV_PATH
 IEDB_PICKLE_FILENAME = "iedb_human_class1_assay_datasets.pickle"
 IEDB_PICKLE_PATH = join(CLASS1_DATA_DIRECTORY, IEDB_PICKLE_FILENAME)
@@ -27,9 +27,6 @@ IEDB_PICKLE_PATH = join(CLASS1_DATA_DIRECTORY, IEDB_PICKLE_FILENAME)
 PETERS_CSV_FILENAME = "bdata.20130222.mhci.public.1.txt"
 PETERS_CSV_PATH = join(CLASS1_DATA_DIRECTORY, PETERS_CSV_FILENAME)
-OUTPUT_CSV_FILENAME = "combined_human_class1_dataset.csv"
-OUTPUT_CSV_PATH = join(CLASS1_DATA_DIRECTORY, OUTPUT_CSV_FILENAME)
 parser = argparse.ArgumentParser()
 parser.add_argument("--ic50-fraction-tolerance",
@@ -59,7 +56,7 @@ parser.add_argument("--netmhcpan-csv-path",
    help="Path to CSV with NetMHCpan dataset from 2013 Peters paper")
 parser.add_argument("--output-csv-path",
-    default=OUTPUT_CSV_PATH,
+    default=CLASS1_DATA_CSV_PATH,
    help="Path to CSV of combined assay results")
 parser.add_argument("--extra-dataset-csv-path",

--- a/scripts/print-class1-alleles.py
+++ b/scripts/print-class1-alleles.py
@@ -22,7 +22,9 @@ trained models are available
 import argparse
 import os
-from mhcflurry.paths import CLASS1_MODEL_DIRECTORY
+import pandas as pd
+from mhcflurry.paths import CLASS1_MODEL_DIRECTORY, CLASS1_DATA_CSV_PATH
 parser = argparse.ArgumentParser()
 parser.add_argument(
@@ -30,17 +32,39 @@ parser.add_argument(
    default=False,
    action="store_true")
+parser.add_argument("--with-dataset-size",
+    default=False,
+    action="store_true")
+parser.add_argument("--all",
+    default=False,
+    action="store_true",
+    help="Include serotypes (like 'A2') which include multiple 4-digit types")
 if __name__ == "__main__":
    args = parser.parse_args()
+    if args.with_dataset_size:
+        df = pd.read_csv(CLASS1_DATA_CSV_PATH)
+        allele_sizes = {
+            allele: len(group) for (allele, group) in df.groupby("mhc")
+        }
+    else:
+        allele_sizes = None
    for filename in os.listdir(CLASS1_MODEL_DIRECTORY):
        allele = filename.replace(".hdf", "")
-        if len(allele) < 5:
+        if len(allele) >= 5:
+            allele = "HLA-%s*%s:%s" % (allele[0], allele[1:3], allele[3:])
+        elif args.all:
+            allele = "HLA-%s" % allele
+        else:
            # skipping serotype names like A2 or B7
            continue
-        allele = "HLA-%s*%s:%s" % (allele[0], allele[1:3], allele[3:])
+        line = allele
        if args.with_peptide_lengths:
-            print("%s\t8,9,10,11,12" % allele)
+            line += "\t8,9,10,11,12"
-        else:
+        if args.with_dataset_size:
-            print(allele)
+            line += "\t%d" % allele_sizes[allele]
\ No newline at end of file
+        print(line)
\ No newline at end of file