Skip to content
Snippets Groups Projects
annotate.py 2.18 KiB
Newer Older
"""
"""
import sys
import argparse
import os
Tim O'Donnell's avatar
Tim O'Donnell committed
import time
import collections
from six.moves import StringIO

import pandas
Tim O'Donnell's avatar
Tim O'Donnell committed
import tqdm  # progress bar
tqdm.monitor_interval = 0  # see https://github.com/tqdm/tqdm/issues/481
Tim O'Donnell's avatar
Tim O'Donnell committed
import shellinford


parser = argparse.ArgumentParser(usage=__doc__)

parser.add_argument(
Tim O'Donnell's avatar
Tim O'Donnell committed
    "peptides",
    metavar="FILE.csv",
    help="CSV of mass spec hits")
parser.add_argument(
    "reference_csv",
    metavar="FILE.csv",
    help="CSV of protein sequences")
parser.add_argument(
    "reference_index",
    metavar="FILE.fm",
    help="shellinford index over protein sequences")
parser.add_argument(
    "--out",
    metavar="OUT.csv",
    help="Out file path")


Tim O'Donnell's avatar
Tim O'Donnell committed
def run():
    args = parser.parse_args(sys.argv[1:])
Tim O'Donnell's avatar
Tim O'Donnell committed
    df = pandas.read_csv(args.peptides)
    df["hit_id"] = "hit." + df.index.map(str)
    df = df.set_index("hit_id")
    print("Read peptides", df.shape, *df.columns.tolist())
Tim O'Donnell's avatar
Tim O'Donnell committed
    reference_df = pandas.read_csv(args.reference_csv, index_col=0)
    reference_df = reference_df.set_index("accession")
    print("Read proteins", reference_df.shape, *reference_df.columns.tolist())
Tim O'Donnell's avatar
Tim O'Donnell committed
    fm = shellinford.FMIndex()
    fm.read(args.reference_index)
    print("Read proteins index")
Tim O'Donnell's avatar
Tim O'Donnell committed
    join_df = []
    for (hit_id, row) in tqdm.tqdm(df.iterrows(), total=len(df)):
        matches = fm.search(row.peptide)
        for match in matches:
            join_df.append((hit_id, match.doc_id, len(matches)))
Tim O'Donnell's avatar
Tim O'Donnell committed
    join_df = pandas.DataFrame(
        join_df,
        columns=["hit_id", "match_index", "num_proteins"],
    )
Tim O'Donnell's avatar
Tim O'Donnell committed
    join_df["protein_accession"] = join_df.match_index.map(
        reference_df.index.to_series().reset_index(drop=True))
Tim O'Donnell's avatar
Tim O'Donnell committed
    del join_df["match_index"]

    protein_cols = [
        c for c in reference_df.columns
        if c not in ["name", "description", "seq"]
    ]
    for col in protein_cols:
        join_df["protein_%s" % col] = join_df.protein_accession.map(
            reference_df[col])

    merged_df = pandas.merge(
        join_df,
        df,
        how="left",
        left_on="hit_id",
        right_index=True)

    merged_df.to_csv(args.out, index=False)
    print("Wrote: %s" % os.path.abspath(args.out))


if __name__ == '__main__':
    run()