From 11b13122c033a3b0221a5542edb0632c8be0777f Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Mon, 30 Sep 2019 11:45:02 -0400
Subject: [PATCH] add flanking sequence

---
 .../data_mass_spec_annotated/GENERATE.sh      |  4 +-
 .../data_mass_spec_annotated/annotate.py      | 61 +++++++++++++++++--
 2 files changed, 59 insertions(+), 6 deletions(-)

diff --git a/downloads-generation/data_mass_spec_annotated/GENERATE.sh b/downloads-generation/data_mass_spec_annotated/GENERATE.sh
index e5851c1e..f1897cf5 100755
--- a/downloads-generation/data_mass_spec_annotated/GENERATE.sh
+++ b/downloads-generation/data_mass_spec_annotated/GENERATE.sh
@@ -15,8 +15,8 @@ rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
 mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
 
 # Send stdout and stderr to a logfile included with the archive.
-exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
-exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
+#exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
+#exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
 
 # Log some environment info
 date
diff --git a/downloads-generation/data_mass_spec_annotated/annotate.py b/downloads-generation/data_mass_spec_annotated/annotate.py
index 56b6b3a6..4dc93fac 100755
--- a/downloads-generation/data_mass_spec_annotated/annotate.py
+++ b/downloads-generation/data_mass_spec_annotated/annotate.py
@@ -5,6 +5,7 @@ import argparse
 import os
 import time
 import collections
+import re
 
 from six.moves import StringIO
 import pandas
@@ -32,13 +33,26 @@ parser.add_argument(
     "--out",
     metavar="OUT.csv",
     help="Out file path")
+parser.add_argument(
+    "--flanking-length",
+    metavar="N",
+    type=int,
+    default=15,
+    help="Length of flanking sequence to include")
+parser.add_argument(
+    "--debug-max-rows",
+    metavar="N",
+    type=int,
+    default=None,
+    help="Max rows to process. Useful for debugging. If specified an ipdb "
+    "debugging session is also opened at the end of the script")
 
 
 def run():
     args = parser.parse_args(sys.argv[1:])
 
     df = pandas.read_csv(args.peptides)
-    df["hit_id"] = "hit." + df.index.map(str)
+    df["hit_id"] = "hit." + df.index.map('{0:07d}'.format)
     df = df.set_index("hit_id")
     print("Read peptides", df.shape, *df.columns.tolist())
 
@@ -54,12 +68,46 @@ def run():
     for (hit_id, row) in tqdm.tqdm(df.iterrows(), total=len(df)):
         matches = fm.search(row.peptide)
         for match in matches:
-            join_df.append((hit_id, match.doc_id, len(matches)))
+            reference_row = reference_df.iloc[match.doc_id]
+            starts = [
+                m.start() for m in
+                re.finditer(row.peptide, reference_row.seq)
+            ]
+            assert len(starts) > 0, (row.peptide, reference_row.seq)
+            for start in starts:
+                end_pos = start + len(row.peptide)
+                n_flank = reference_row.seq[
+                    max(start - args.flanking_length, 0) : start
+                ].rjust(args.flanking_length, 'X')
+                c_flank = reference_row.seq[
+                    end_pos : (end_pos + args.flanking_length)
+                ].ljust(args.flanking_length, 'X')
+                join_df.append((
+                    hit_id,
+                    match.doc_id,
+                    len(matches),
+                    len(starts),
+                    start,
+                    start / len(reference_row.seq),
+                    n_flank,
+                    c_flank,
+                ))
+
+        if args.debug_max_rows and len(join_df) > args.debug_max_rows:
+            break
 
     join_df = pandas.DataFrame(
         join_df,
-        columns=["hit_id", "match_index", "num_proteins"],
-    )
+        columns=[
+            "hit_id",
+            "match_index",
+            "num_proteins",
+            "num_occurrences_in_protein",
+            "start_position",
+            "start_fraction_in_protein",
+            "n_flank",
+            "c_flank",
+        ]).drop_duplicates()
 
     join_df["protein_accession"] = join_df.match_index.map(
         reference_df.index.to_series().reset_index(drop=True))
@@ -84,6 +132,11 @@ def run():
     merged_df.to_csv(args.out, index=False)
     print("Wrote: %s" % os.path.abspath(args.out))
 
+    if args.debug_max_rows:
+        # Leave user in a debugger
+        import ipdb
+        ipdb.set_trace()
+
 
 if __name__ == '__main__':
     run()
-- 
GitLab