Skip to content
Snippets Groups Projects
process.py 1.3 KiB
Newer Older
"""
"""
import sys
import argparse
import os
import gzip

import pandas
import shellinford
from Bio import SeqIO

parser = argparse.ArgumentParser(usage=__doc__)

parser.add_argument(
    "input_paths",
    nargs="+",
    help="Fasta files to process")
parser.add_argument(
    "--out-csv",
    required=True,
    metavar="FILE.csv",
    help="CSV output")
parser.add_argument(
    "--out-index",
    required=True,
    metavar="FILE.fm",
    help="Index output")


def run():
    args = parser.parse_args(sys.argv[1:])

    fm = shellinford.FMIndex()
    df = []
    for f in args.input_paths:
        print("Processing", f)
        with gzip.open(f, "rt") as fd:
            records = SeqIO.parse(fd, format='fasta')
            for (i, record) in enumerate(records):
                seq = str(record.seq).upper()
                df.append((record.name, record.description, seq))
                fm.push_back("$" + seq + "$")  # include sentinels
    df = pandas.DataFrame(df, columns=["name", "description", "seq"])

    print("Done reading fastas")
    print(df)

    print("Building index")
    fm.build()
    fm.write(args.out_index)
    print("Wrote: ", os.path.abspath((args.out_index)))

    df.to_csv(args.out_csv, index=True)
    print("Wrote: ", os.path.abspath((args.out_csv)))


if __name__ == '__main__':
    run()