""" """ import sys import argparse import os import time import collections import re from six.moves import StringIO import pandas import tqdm # progress bar tqdm.monitor_interval = 0 # see https://github.com/tqdm/tqdm/issues/481 import shellinford parser = argparse.ArgumentParser(usage=__doc__) parser.add_argument( "peptides", metavar="FILE.csv", help="CSV of mass spec hits") parser.add_argument( "reference_csv", metavar="FILE.csv", help="CSV of protein sequences") parser.add_argument( "reference_index", metavar="FILE.fm", help="shellinford index over protein sequences") parser.add_argument( "--out", metavar="OUT.csv", help="Out file path") parser.add_argument( "--flanking-length", metavar="N", type=int, default=15, help="Length of flanking sequence to include") parser.add_argument( "--debug-max-rows", metavar="N", type=int, default=None, help="Max rows to process. Useful for debugging. If specified an ipdb " "debugging session is also opened at the end of the script") def run(): args = parser.parse_args(sys.argv[1:]) df = pandas.read_csv(args.peptides) df["hit_id"] = "hit." + df.index.map('{0:07d}'.format) df = df.set_index("hit_id") print("Read peptides", df.shape, *df.columns.tolist()) reference_df = pandas.read_csv(args.reference_csv, index_col=0) reference_df = reference_df.set_index("accession") print("Read proteins", reference_df.shape, *reference_df.columns.tolist()) fm = shellinford.FMIndex() fm.read(args.reference_index) print("Read proteins index") join_df = [] for (hit_id, row) in tqdm.tqdm(df.iterrows(), total=len(df)): matches = fm.search(row.peptide) for match in matches: reference_row = reference_df.iloc[match.doc_id] starts = [ m.start() for m in re.finditer(row.peptide, reference_row.seq) ] assert len(starts) > 0, (row.peptide, reference_row.seq) for start in starts: end_pos = start + len(row.peptide) n_flank = reference_row.seq[ max(start - args.flanking_length, 0) : start ].rjust(args.flanking_length, 'X') c_flank = reference_row.seq[ end_pos : (end_pos + args.flanking_length) ].ljust(args.flanking_length, 'X') join_df.append(( hit_id, match.doc_id, len(matches), len(starts), start, start / len(reference_row.seq), n_flank, c_flank, )) if args.debug_max_rows and len(join_df) > args.debug_max_rows: break join_df = pandas.DataFrame( join_df, columns=[ "hit_id", "match_index", "num_proteins", "num_occurrences_in_protein", "start_position", "start_fraction_in_protein", "n_flank", "c_flank", ]).drop_duplicates() join_df["protein_accession"] = join_df.match_index.map( reference_df.index.to_series().reset_index(drop=True)) del join_df["match_index"] protein_cols = [ c for c in reference_df.columns if c not in ["name", "description", "seq"] ] for col in protein_cols: join_df["protein_%s" % col] = join_df.protein_accession.map( reference_df[col]) merged_df = pandas.merge( join_df, df, how="left", left_on="hit_id", right_index=True) merged_df.to_csv(args.out, index=False) print("Wrote: %s" % os.path.abspath(args.out)) if args.debug_max_rows: # Leave user in a debugger import ipdb ipdb.set_trace() if __name__ == '__main__': run()