Newer
Older
"""
"""
import sys
import argparse
import os
import time
import collections
import re
from six.moves import StringIO
import pandas
import tqdm # progress bar
tqdm.monitor_interval = 0 # see https://github.com/tqdm/tqdm/issues/481
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
"input",
metavar="FILE.csv",
help="CSV of annotated mass spec hits")
parser.add_argument(
"reference_csv",
metavar="FILE.csv",
help="CSV of protein sequences")
parser.add_argument(
"--out",
metavar="OUT.csv",
help="Out file path")
parser.add_argument(
"--debug-max-rows",
metavar="N",
type=int,
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
help="Max rows to process. Useful for debugging. If specified an ipdb "
"debugging session is also opened at the end of the script")
parser.add_argument(
"--lengths",
metavar="N",
type=int,
nargs="+",
default=[8,9,10,11],
help="Peptide lengths")
def run():
args = parser.parse_args(sys.argv[1:])
df_original = pandas.read_csv(args.input)
df = df_original
print("Read peptides", df.shape, *df.columns.tolist())
reference_df = pandas.read_csv(args.reference_csv, index_col=0)
reference_df = reference_df.set_index("accession")
print("Read proteins", reference_df.shape, *reference_df.columns.tolist())
print("Subselecting to MHC I hits. Before: ", len(df))
df = df.loc[df.mhc_class == "I"]
print("After: ", len(df))
print("Subselecting to gene-associated hits. Before: ", len(df))
df = df.loc[~df.protein_ensembl_primary.isnull()]
print("After: ", len(df))
(flanking_length,) = list(
set(df.n_flank.str.len().unique()).union(
set(df.n_flank.str.len().unique())))
print("Flanking length", flanking_length)
proteins = df.protein_accession.unique()
if args.debug_max_rows:
proteins = proteins[:args.debug_max_rows]
print("Writing decoys for %d proteins" % len(proteins))
reference_df = reference_df.loc[proteins]
lengths = sorted(args.lengths)
rows = []
total = len(reference_df)
for (accession, info) in tqdm.tqdm(reference_df.iterrows(), total=total):
seq = info.seq
for start in range(0, len(seq) - min(args.lengths)):
for length in lengths:
end_pos = start + length
if end_pos > len(seq):
break
n_flank = seq[
max(start - flanking_length, 0) : start
].rjust(flanking_length, 'X')
c_flank = seq[
end_pos : (end_pos + flanking_length)
].ljust(flanking_length, 'X')
peptide = seq[start : start + length]
rows.append((
accession,
peptide,
n_flank,
c_flank,
start
))
result_df = pandas.DataFrame(
rows,
columns=[
"protein_accession",
"peptide",
"n_flank",
"c_flank",
"start_position",
])
result_df.to_csv(args.out, index=False)
print("Wrote: %s" % os.path.abspath(args.out))
if args.debug_max_rows:
# Leave user in a debugger
import ipdb
ipdb.set_trace()
if __name__ == '__main__':
run()