Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
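"""
Index the sequences found in gzip-compressed FASTA files.

Every record of every input file is read, its sequence is upper-cased, and
two outputs are written: a CSV table with the columns name, description and
seq, and a shellinford FM-index built over the sequences.
"""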
import sys
import argparse
import os
import gzip
import pandas
import shellinford
from Bio import SeqIO
# Command-line interface. The module docstring is reused as the usage text.
parser = argparse.ArgumentParser(usage=__doc__)

# One or more input files, collected into a list.
parser.add_argument("input_paths", help="Fasta files to process", nargs="+")

# Both output destinations are mandatory.
parser.add_argument(
    "--out-csv", metavar="FILE.csv", help="CSV output", required=True)
parser.add_argument(
    "--out-index", metavar="FILE.fm", help="Index output", required=True)
def _open_fasta(path):
    """Open a FASTA file for text reading, decompressing it when gzipped.

    The file is treated as gzip-compressed when it starts with the gzip
    magic bytes (0x1f 0x8b); otherwise it is opened as plain text.
    """
    with open(path, "rb") as probe:
        is_gzipped = probe.read(2) == b"\x1f\x8b"
    if is_gzipped:
        return gzip.open(path, "rt")
    return open(path, "rt")


def run():
    """Read the FASTA inputs, then write the FM-index and the CSV table.

    Command-line arguments are taken from ``sys.argv`` via the module-level
    ``parser``. For every record of every input file the upper-cased
    sequence is appended both to the table and to the FM-index, in the same
    order, so the row index written to the CSV matches the order in which
    the sequences were pushed into the index.

    Input files may be gzip-compressed or plain text.
    """
    args = parser.parse_args(sys.argv[1:])

    fm = shellinford.FMIndex()
    rows = []
    for path in args.input_paths:
        print("Processing", path)
        with _open_fasta(path) as fd:
            for record in SeqIO.parse(fd, format='fasta'):
                seq = str(record.seq).upper()
                rows.append((record.name, record.description, seq))
                # Each sequence is wrapped in "$" sentinels before indexing
                # (presumably so queries can anchor on sequence boundaries
                # -- TODO confirm against the code that queries the index).
                fm.push_back("$" + seq + "$")

    df = pandas.DataFrame(rows, columns=["name", "description", "seq"])
    print("Done reading fastas")
    print(df)

    print("Building index")
    fm.build()
    fm.write(args.out_index)
    print("Wrote: ", os.path.abspath(args.out_index))

    # index=True keeps the row number as the first CSV column.
    df.to_csv(args.out_csv, index=True)
    print("Wrote: ", os.path.abspath(args.out_csv))
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    run()