Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
"""
"""
import sys
import argparse
import os
import time
import collections
import re
from six.moves import StringIO
import pandas
import tqdm # progress bar
tqdm.monitor_interval = 0 # see https://github.com/tqdm/tqdm/issues/481
import shellinford
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
"input",
metavar="FILE.csv",
help="CSV of annotated mass spec hits")
parser.add_argument(
"reference_csv",
metavar="FILE.csv",
help="CSV of protein sequences")
parser.add_argument(
"--out",
metavar="OUT.csv",
help="Out file path")
parser.add_argument(
"--debug-max-rows",
metavar="N",
type=int,
default=10,
help="Max rows to process. Useful for debugging. If specified an ipdb "
"debugging session is also opened at the end of the script")
parser.add_argument(
"--lengths",
metavar="N",
type=int,
nargs="+",
default=[8,9,10,11],
help="Peptide lengths")
def run():
args = parser.parse_args(sys.argv[1:])
df_original = pandas.read_csv(args.input)
df = df_original
print("Read peptides", df.shape, *df.columns.tolist())
reference_df = pandas.read_csv(args.reference_csv, index_col=0)
reference_df = reference_df.set_index("accession")
print("Read proteins", reference_df.shape, *reference_df.columns.tolist())
print("Subselecting to MHC I hits. Before: ", len(df))
df = df.loc[df.mhc_class == "I"]
print("After: ", len(df))
print("Subselecting to gene-associated hits. Before: ", len(df))
df = df.loc[~df.protein_ensembl_primary.isnull()]
print("After: ", len(df))
(flanking_length,) = list(
set(df.n_flank.str.len().unique()).union(
set(df.n_flank.str.len().unique())))
print("Flanking length", flanking_length)
proteins = df.protein_accession.unique()
if args.debug_max_rows:
proteins = proteins[:args.debug_max_rows]
print("Writing decoys for %d proteins" % len(proteins))
reference_df = reference_df.loc[proteins]
lengths = sorted(args.lengths)
rows = []
total = len(reference_df)
for (accession, info) in tqdm.tqdm(reference_df.iterrows(), total=total):
seq = info.seq
for start in range(0, len(seq) - min(args.lengths)):
for length in lengths:
end_pos = start + length
if end_pos > len(seq):
break
n_flank = seq[
max(start - flanking_length, 0) : start
].rjust(flanking_length, 'X')
c_flank = seq[
end_pos : (end_pos + flanking_length)
].ljust(flanking_length, 'X')
peptide = seq[start : start + length]
rows.append((
accession,
peptide,
n_flank,
c_flank,
start
))
result_df = pandas.DataFrame(
rows,
columns=[
"protein_accession",
"peptide",
"n_flank",
"c_flank",
"start_position",
])
result_df.to_csv(args.out, index=False)
print("Wrote: %s" % os.path.abspath(args.out))
if args.debug_max_rows:
# Leave user in a debugger
import ipdb
ipdb.set_trace()
if __name__ == '__main__':
run()