Skip to content
Snippets Groups Projects
Commit cf1c077e authored by Alex Rubinsteyn
Browse files

print dataset size for each allele

parent 92abfb6e
No related branches found
No related tags found
No related merge requests found
...@@ -17,4 +17,7 @@ from appdirs import user_data_dir ...@@ -17,4 +17,7 @@ from appdirs import user_data_dir
BASE_DIRECTORY = user_data_dir("mhcflurry", version="0.1") BASE_DIRECTORY = user_data_dir("mhcflurry", version="0.1")
CLASS1_DATA_DIRECTORY = join(BASE_DIRECTORY, "class1_data") CLASS1_DATA_DIRECTORY = join(BASE_DIRECTORY, "class1_data")
CLASS1_MODEL_DIRECTORY = join(BASE_DIRECTORY, "class1_models") CLASS1_MODEL_DIRECTORY = join(BASE_DIRECTORY, "class1_models")
\ No newline at end of file
CLASS1_DATA_CSV_FILENAME = "combined_human_class1_dataset.csv"
CLASS1_DATA_CSV_PATH = join(CLASS1_DATA_DIRECTORY, CLASS1_DATA_CSV_FILENAME)
...@@ -19,7 +19,7 @@ import argparse ...@@ -19,7 +19,7 @@ import argparse
import pandas as pd import pandas as pd
from mhcflurry.paths import CLASS1_DATA_DIRECTORY from mhcflurry.paths import CLASS1_DATA_DIRECTORY, CLASS1_DATA_CSV_PATH
IEDB_PICKLE_FILENAME = "iedb_human_class1_assay_datasets.pickle" IEDB_PICKLE_FILENAME = "iedb_human_class1_assay_datasets.pickle"
IEDB_PICKLE_PATH = join(CLASS1_DATA_DIRECTORY, IEDB_PICKLE_FILENAME) IEDB_PICKLE_PATH = join(CLASS1_DATA_DIRECTORY, IEDB_PICKLE_FILENAME)
...@@ -27,9 +27,6 @@ IEDB_PICKLE_PATH = join(CLASS1_DATA_DIRECTORY, IEDB_PICKLE_FILENAME) ...@@ -27,9 +27,6 @@ IEDB_PICKLE_PATH = join(CLASS1_DATA_DIRECTORY, IEDB_PICKLE_FILENAME)
PETERS_CSV_FILENAME = "bdata.20130222.mhci.public.1.txt" PETERS_CSV_FILENAME = "bdata.20130222.mhci.public.1.txt"
PETERS_CSV_PATH = join(CLASS1_DATA_DIRECTORY, PETERS_CSV_FILENAME) PETERS_CSV_PATH = join(CLASS1_DATA_DIRECTORY, PETERS_CSV_FILENAME)
OUTPUT_CSV_FILENAME = "combined_human_class1_dataset.csv"
OUTPUT_CSV_PATH = join(CLASS1_DATA_DIRECTORY, OUTPUT_CSV_FILENAME)
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--ic50-fraction-tolerance", parser.add_argument("--ic50-fraction-tolerance",
...@@ -59,7 +56,7 @@ parser.add_argument("--netmhcpan-csv-path", ...@@ -59,7 +56,7 @@ parser.add_argument("--netmhcpan-csv-path",
help="Path to CSV with NetMHCpan dataset from 2013 Peters paper") help="Path to CSV with NetMHCpan dataset from 2013 Peters paper")
parser.add_argument("--output-csv-path", parser.add_argument("--output-csv-path",
default=OUTPUT_CSV_PATH, default=CLASS1_DATA_CSV_PATH,
help="Path to CSV of combined assay results") help="Path to CSV of combined assay results")
parser.add_argument("--extra-dataset-csv-path", parser.add_argument("--extra-dataset-csv-path",
......
...@@ -22,7 +22,9 @@ trained models are available ...@@ -22,7 +22,9 @@ trained models are available
import argparse import argparse
import os import os
from mhcflurry.paths import CLASS1_MODEL_DIRECTORY import pandas as pd
from mhcflurry.paths import CLASS1_MODEL_DIRECTORY, CLASS1_DATA_CSV_PATH
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument( parser.add_argument(
...@@ -30,17 +32,39 @@ parser.add_argument( ...@@ -30,17 +32,39 @@ parser.add_argument(
default=False, default=False,
action="store_true") action="store_true")
parser.add_argument("--with-dataset-size",
default=False,
action="store_true")
parser.add_argument("--all",
default=False,
action="store_true",
help="Include serotypes (like 'A2') which include multiple 4-digit types")
if __name__ == "__main__": if __name__ == "__main__":
args = parser.parse_args() args = parser.parse_args()
if args.with_dataset_size:
df = pd.read_csv(CLASS1_DATA_CSV_PATH)
allele_sizes = {
allele: len(group) for (allele, group) in df.groupby("mhc")
}
else:
allele_sizes = None
for filename in os.listdir(CLASS1_MODEL_DIRECTORY): for filename in os.listdir(CLASS1_MODEL_DIRECTORY):
allele = filename.replace(".hdf", "") allele = filename.replace(".hdf", "")
if len(allele) < 5: if len(allele) >= 5:
allele = "HLA-%s*%s:%s" % (allele[0], allele[1:3], allele[3:])
elif args.all:
allele = "HLA-%s" % allele
else:
# skipping serotype names like A2 or B7 # skipping serotype names like A2 or B7
continue continue
allele = "HLA-%s*%s:%s" % (allele[0], allele[1:3], allele[3:])
line = allele
if args.with_peptide_lengths: if args.with_peptide_lengths:
print("%s\t8,9,10,11,12" % allele) line += "\t8,9,10,11,12"
else: if args.with_dataset_size:
print(allele) line += "\t%d" % allele_sizes[allele]
\ No newline at end of file print(line)
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment