From 11b13122c033a3b0221a5542edb0632c8be0777f Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Mon, 30 Sep 2019 11:45:02 -0400
Subject: [PATCH] add flanking sequence

---
 .../data_mass_spec_annotated/GENERATE.sh      |  4 +-
 .../data_mass_spec_annotated/annotate.py      | 61 +++++++++++++++++--
 2 files changed, 59 insertions(+), 6 deletions(-)

diff --git a/downloads-generation/data_mass_spec_annotated/GENERATE.sh b/downloads-generation/data_mass_spec_annotated/GENERATE.sh
index e5851c1e..f1897cf5 100755
--- a/downloads-generation/data_mass_spec_annotated/GENERATE.sh
+++ b/downloads-generation/data_mass_spec_annotated/GENERATE.sh
@@ -15,8 +15,8 @@ rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
 mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
 
 # Send stdout and stderr to a logfile included with the archive.
-exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
-exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
+#exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
+#exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
 
 # Log some environment info
 date
diff --git a/downloads-generation/data_mass_spec_annotated/annotate.py b/downloads-generation/data_mass_spec_annotated/annotate.py
index 56b6b3a6..4dc93fac 100755
--- a/downloads-generation/data_mass_spec_annotated/annotate.py
+++ b/downloads-generation/data_mass_spec_annotated/annotate.py
@@ -5,6 +5,7 @@ import argparse
 import os
 import time
 import collections
+import re
 
 from six.moves import StringIO
 import pandas
@@ -32,13 +33,26 @@ parser.add_argument(
     "--out",
     metavar="OUT.csv",
     help="Out file path")
+parser.add_argument(
+    "--flanking-length",
+    metavar="N",
+    type=int,
+    default=15,
+    help="Length of flanking sequence to include")
+parser.add_argument(
+    "--debug-max-rows",
+    metavar="N",
+    type=int,
+    default=None,
+    help="Max rows to process. Useful for debugging. If specified an ipdb "
+    "debugging session is also opened at the end of the script")
 
 
 def run():
     args = parser.parse_args(sys.argv[1:])
 
     df = pandas.read_csv(args.peptides)
-    df["hit_id"] = "hit." + df.index.map(str)
+    df["hit_id"] = "hit." + df.index.map('{0:07d}'.format)
     df = df.set_index("hit_id")
     print("Read peptides", df.shape, *df.columns.tolist())
 
@@ -54,12 +68,46 @@ def run():
     for (hit_id, row) in tqdm.tqdm(df.iterrows(), total=len(df)):
         matches = fm.search(row.peptide)
         for match in matches:
-            join_df.append((hit_id, match.doc_id, len(matches)))
+            reference_row = reference_df.iloc[match.doc_id]
+            starts = [
+                m.start() for m in
+                re.finditer(row.peptide, reference_row.seq)
+            ]
+            assert len(starts) > 0, (row.peptide, reference_row.seq)
+            for start in starts:
+                end_pos = start + len(row.peptide)
+                n_flank = reference_row.seq[
+                    max(start - args.flanking_length, 0) : start
+                ].rjust(args.flanking_length, 'X')
+                c_flank = reference_row.seq[
+                    end_pos : (end_pos + args.flanking_length)
+                ].ljust(args.flanking_length, 'X')
+                join_df.append((
+                    hit_id,
+                    match.doc_id,
+                    len(matches),
+                    len(starts),
+                    start,
+                    start / len(reference_row.seq),
+                    n_flank,
+                    c_flank,
+                ))
+
+        if args.debug_max_rows and len(join_df) > args.debug_max_rows:
+            break
 
     join_df = pandas.DataFrame(
         join_df,
-        columns=["hit_id", "match_index", "num_proteins"],
-    )
+        columns=[
+            "hit_id",
+            "match_index",
+            "num_proteins",
+            "num_occurrences_in_protein",
+            "start_position",
+            "start_fraction_in_protein",
+            "n_flank",
+            "c_flank",
+        ]).drop_duplicates()
 
     join_df["protein_accession"] = join_df.match_index.map(
         reference_df.index.to_series().reset_index(drop=True))
@@ -84,6 +132,11 @@ def run():
     merged_df.to_csv(args.out, index=False)
     print("Wrote: %s" % os.path.abspath(args.out))
 
+    if args.debug_max_rows:
+        # Leave user in a debugger
+        import ipdb
+        ipdb.set_trace()
+
 
 if __name__ == '__main__':
     run()
-- 
GitLab