Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
"""
Filter and combine various peptide/MHC datasets to derive a composite training set,
optionally including eluted peptides identified by mass-spec.
"""
import sys
import argparse
import pandas
import mhcnames
def normalize_allele_name(s):
    """
    Return the mhcnames-normalized form of allele name `s`.

    Any exception raised by mhcnames during normalization is swallowed and
    the placeholder string "UNKNOWN" is returned instead.
    """
    try:
        normalized = mhcnames.normalize_allele_name(s)
    except Exception:
        normalized = "UNKNOWN"
    return normalized
# Command-line interface: a PMID, one or more input files, an optional output
# path, and a flag for dropping into a debugger on unsupported PMIDs.
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument("pmid", metavar="PMID", help="PMID of dataset to curate")
parser.add_argument(
    "files", metavar="FILE", nargs="+", help="File paths of data to curate")
parser.add_argument("--out", metavar="OUT.csv", help="Out file path")
parser.add_argument(
    "--debug",
    action="store_true",
    help="Leave user in pdb if PMID is unsupported")

# Maps a PMID string to the function that curates that publication's files.
HANDLERS = {}
def load(filenames, **kwargs):
    """
    Read each file into memory, keyed by its filename.

    Names ending in ".csv" are parsed with pandas.read_csv and names ending
    in ".xlsx" or ".xls" with pandas.read_excel; `kwargs` are forwarded to
    whichever reader is used. Any other filename is mapped to itself,
    unread.

    Returns a dict of filename -> DataFrame (or the filename string).
    """
    loaded = {}
    for path in filenames:
        if path.endswith(".csv"):
            contents = pandas.read_csv(path, **kwargs)
        elif path.endswith((".xlsx", ".xls")):
            contents = pandas.read_excel(path, **kwargs)
        else:
            # Unrecognized extension: pass the path through untouched.
            contents = path
        loaded[path] = contents
    return loaded
def debug(*filenames):
    """
    Load the given files and open an interactive ipdb session.

    The files are read with load(); the result is bound to the local name
    `loaded` before the debugger is started.
    """
    # Not referenced again in this function; presumably kept so the loaded
    # data can be inspected from the debugger prompt.
    loaded = load(filenames)
    # Imported inside the function, so ipdb is only needed when debug() is
    # actually called.
    import ipdb
    ipdb.set_trace()
def pmid_27600516(filename):
    """
    Curate the peptide list for PMID 27600516.

    The CSV at `filename` must have a `peptide` column. A row whose value
    starts with "#" opens a sample (the text after the "#" is the sample
    id); every following row, up to the next "#" row, is a peptide of that
    sample. Peptides are whitespace-stripped and upper-cased.

    Returns a DataFrame with columns "sample_id" and "peptide" holding the
    distinct peptides of each sample, sorted within the sample; samples
    appear in the order they are first seen in the file.

    Raises ValueError if a peptide row appears before any "#" sample row.
    """
    df = pandas.read_csv(filename)

    sample_to_peptides = {}
    current_sample = None
    # Blank cells are read by pandas as NaN (a float), which has no string
    # methods; they carry no peptide, so drop them up front.
    for peptide in df.peptide.dropna():
        if peptide.startswith("#"):
            current_sample = peptide[1:]
            # setdefault rather than plain assignment: a sample header that
            # shows up again must not discard peptides already collected.
            sample_to_peptides.setdefault(current_sample, [])
        elif current_sample is None:
            # Explicit raise instead of assert so the check survives -O.
            raise ValueError(
                "Peptide %r appears before any '#' sample header in %s"
                % (peptide, filename))
        else:
            sample_to_peptides[current_sample].append(peptide.strip().upper())

    rows = [
        [sample, peptide]
        for (sample, peptides) in sample_to_peptides.items()
        for peptide in sorted(set(peptides))
    ]
    return pandas.DataFrame(rows, columns=["sample_id", "peptide"])
# Register the handler under its PMID so run() can dispatch to it.
HANDLERS["27600516"] = pmid_27600516
def run():
    """
    Command-line entry point.

    Parses sys.argv, looks up the handler registered for the given PMID and
    writes the DataFrame it returns to the --out path as CSV (no index
    column).

    If no handler is registered for the PMID: with --debug the input files
    are loaded and an interactive debugger is opened, after which the
    function returns without writing anything; otherwise
    NotImplementedError is raised.
    """
    args = parser.parse_args(sys.argv[1:])

    handler = HANDLERS.get(args.pmid)
    if handler is None:
        if args.debug:
            debug(*args.files)
            # Nothing was curated, so there is no DataFrame to write.
            return
        raise NotImplementedError(args.pmid)

    if args.out is None:
        # DataFrame.to_csv(None) returns the CSV text instead of writing a
        # file, so without this check the result would be silently lost.
        parser.error("--out is required to write the curated dataset")

    df = handler(*args.files)
    df.to_csv(args.out, index=False)
    print("Wrote: %s" % args.out)
# Run the command-line entry point only when executed as a script.
if __name__ == '__main__':
    run()