Skip to content
Snippets Groups Projects
Commit 11b13122 authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

add flanking sequence

parent c9b30342
No related merge requests found
......@@ -15,8 +15,8 @@ rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive.
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
#exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
#exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
# Log some environment info
date
......
......@@ -5,6 +5,7 @@ import argparse
import os
import time
import collections
import re
from six.moves import StringIO
import pandas
......@@ -32,13 +33,26 @@ parser.add_argument(
"--out",
metavar="OUT.csv",
help="Out file path")
# Number of residues to capture on each side of a peptide match in the
# reference protein; shorter contexts near a terminus are presumably
# padded with 'X' (see the rjust/ljust calls below) — TODO confirm.
parser.add_argument(
"--flanking-length",
metavar="N",
type=int,
default=15,
help="Length of flanking sequence to include")
# Debugging aid: caps how many join rows are accumulated and, when set,
# drops into an ipdb session after the output is written.
parser.add_argument(
"--debug-max-rows",
metavar="N",
type=int,
default=None,
help="Max rows to process. Useful for debugging. If specified an ipdb "
"debugging session is also opened at the end of the script")
def run():
args = parser.parse_args(sys.argv[1:])
df = pandas.read_csv(args.peptides)
df["hit_id"] = "hit." + df.index.map(str)
df["hit_id"] = "hit." + df.index.map('{0:07d}'.format)
df = df.set_index("hit_id")
print("Read peptides", df.shape, *df.columns.tolist())
......@@ -54,12 +68,46 @@ def run():
for (hit_id, row) in tqdm.tqdm(df.iterrows(), total=len(df)):
matches = fm.search(row.peptide)
for match in matches:
join_df.append((hit_id, match.doc_id, len(matches)))
reference_row = reference_df.iloc[match.doc_id]
starts = [
m.start() for m in
re.finditer(row.peptide, reference_row.seq)
]
assert len(starts) > 0, (row.peptide, reference_row.seq)
for start in starts:
end_pos = start + len(row.peptide)
n_flank = reference_row.seq[
max(start - args.flanking_length, 0) : start
].rjust(args.flanking_length, 'X')
c_flank = reference_row.seq[
end_pos : (end_pos + args.flanking_length)
].ljust(args.flanking_length, 'X')
join_df.append((
hit_id,
match.doc_id,
len(matches),
len(starts),
start,
start / len(reference_row.seq),
n_flank,
c_flank,
))
if args.debug_max_rows and len(join_df) > args.debug_max_rows:
break
join_df = pandas.DataFrame(
join_df,
columns=["hit_id", "match_index", "num_proteins"],
)
columns=[
"hit_id",
"match_index",
"num_proteins",
"num_occurrences_in_protein",
"start_position",
"start_fraction_in_protein",
"n_flank",
"c_flank",
]).drop_duplicates()
join_df["protein_accession"] = join_df.match_index.map(
reference_df.index.to_series().reset_index(drop=True))
......@@ -84,6 +132,11 @@ def run():
merged_df.to_csv(args.out, index=False)
print("Wrote: %s" % os.path.abspath(args.out))
if args.debug_max_rows:
# Leave user in a debugger
import ipdb
ipdb.set_trace()
if __name__ == '__main__':
run()
0% Loading — or the page failed to load; please reload.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment