Skip to content
Snippets Groups Projects
annotate.py 3.88 KiB
Newer Older
"""
"""
import sys
import argparse
import os
Tim O'Donnell's avatar
Tim O'Donnell committed
import time
Tim O'Donnell's avatar
Tim O'Donnell committed
import re
from six.moves import StringIO

import pandas
Tim O'Donnell's avatar
Tim O'Donnell committed
import tqdm  # progress bar
tqdm.monitor_interval = 0  # see https://github.com/tqdm/tqdm/issues/481
Tim O'Donnell's avatar
Tim O'Donnell committed
import shellinford


parser = argparse.ArgumentParser(usage=__doc__)

parser.add_argument(
Tim O'Donnell's avatar
Tim O'Donnell committed
    "peptides",
    metavar="FILE.csv",
    help="CSV of mass spec hits")
parser.add_argument(
    "reference_csv",
    metavar="FILE.csv",
    help="CSV of protein sequences")
parser.add_argument(
    "reference_index",
    metavar="FILE.fm",
    help="shellinford index over protein sequences")
parser.add_argument(
    "--out",
    metavar="OUT.csv",
    help="Out file path")
Tim O'Donnell's avatar
Tim O'Donnell committed
parser.add_argument(
    "--flanking-length",
    metavar="N",
    type=int,
    default=15,
    help="Length of flanking sequence to include")
parser.add_argument(
    "--debug-max-rows",
    metavar="N",
    type=int,
    default=None,
    help="Max rows to process. Useful for debugging. If specified an ipdb "
    "debugging session is also opened at the end of the script")
Tim O'Donnell's avatar
Tim O'Donnell committed
def run():
    args = parser.parse_args(sys.argv[1:])
Tim O'Donnell's avatar
Tim O'Donnell committed
    df = pandas.read_csv(args.peptides)
Tim O'Donnell's avatar
Tim O'Donnell committed
    df["hit_id"] = "hit." + df.index.map('{0:07d}'.format)
Tim O'Donnell's avatar
Tim O'Donnell committed
    df = df.set_index("hit_id")
    print("Read peptides", df.shape, *df.columns.tolist())
Tim O'Donnell's avatar
Tim O'Donnell committed
    reference_df = pandas.read_csv(args.reference_csv, index_col=0)
    reference_df = reference_df.set_index("accession")
    print("Read proteins", reference_df.shape, *reference_df.columns.tolist())
Tim O'Donnell's avatar
Tim O'Donnell committed
    fm = shellinford.FMIndex()
    fm.read(args.reference_index)
    print("Read proteins index")
Tim O'Donnell's avatar
Tim O'Donnell committed
    join_df = []
    for (hit_id, row) in tqdm.tqdm(df.iterrows(), total=len(df)):
        matches = fm.search(row.peptide)
        for match in matches:
Tim O'Donnell's avatar
Tim O'Donnell committed
            reference_row = reference_df.iloc[match.doc_id]
            starts = [
                m.start() for m in
                re.finditer(row.peptide, reference_row.seq)
            ]
            assert len(starts) > 0, (row.peptide, reference_row.seq)
            for start in starts:
                end_pos = start + len(row.peptide)
                n_flank = reference_row.seq[
                    max(start - args.flanking_length, 0) : start
                ].rjust(args.flanking_length, 'X')
                c_flank = reference_row.seq[
                    end_pos : (end_pos + args.flanking_length)
                ].ljust(args.flanking_length, 'X')
                join_df.append((
                    hit_id,
                    match.doc_id,
                    len(matches),
                    len(starts),
                    start,
                    start / len(reference_row.seq),
                    n_flank,
                    c_flank,
                ))

        if args.debug_max_rows and len(join_df) > args.debug_max_rows:
            break
Tim O'Donnell's avatar
Tim O'Donnell committed
    join_df = pandas.DataFrame(
        join_df,
Tim O'Donnell's avatar
Tim O'Donnell committed
        columns=[
            "hit_id",
            "match_index",
            "num_proteins",
            "num_occurrences_in_protein",
            "start_position",
            "start_fraction_in_protein",
            "n_flank",
            "c_flank",
        ]).drop_duplicates()
Tim O'Donnell's avatar
Tim O'Donnell committed
    join_df["protein_accession"] = join_df.match_index.map(
        reference_df.index.to_series().reset_index(drop=True))
Tim O'Donnell's avatar
Tim O'Donnell committed
    del join_df["match_index"]

    protein_cols = [
        c for c in reference_df.columns
        if c not in ["name", "description", "seq"]
    ]
    for col in protein_cols:
        join_df["protein_%s" % col] = join_df.protein_accession.map(
            reference_df[col])

    merged_df = pandas.merge(
        join_df,
        df,
        how="left",
        left_on="hit_id",
        right_index=True)

    merged_df.to_csv(args.out, index=False)
    print("Wrote: %s" % os.path.abspath(args.out))

Tim O'Donnell's avatar
Tim O'Donnell committed
    if args.debug_max_rows:
        # Leave user in a debugger
        import ipdb
        ipdb.set_trace()