Skip to content
Snippets Groups Projects
Commit 11b13122 authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

add flanking sequence

parent c9b30342
No related merge requests found
......@@ -15,8 +15,8 @@ rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive.
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
#exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
#exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
# Log some environment info
date
......
......@@ -5,6 +5,7 @@ import argparse
import os
import time
import collections
import re
from six.moves import StringIO
import pandas
......@@ -32,13 +33,26 @@ parser.add_argument(
"--out",
metavar="OUT.csv",
help="Out file path")
# Number of residues to capture on each side of a peptide match in the
# reference protein; shorter contexts near a terminus are presumably
# padded with 'X' (see the rjust/ljust calls below) — TODO confirm.
parser.add_argument(
"--flanking-length",
metavar="N",
type=int,
default=15,
help="Length of flanking sequence to include")
# Debugging aid: caps how many join rows are accumulated and, when set,
# drops into an ipdb session after the output is written.
parser.add_argument(
"--debug-max-rows",
metavar="N",
type=int,
default=None,
help="Max rows to process. Useful for debugging. If specified an ipdb "
"debugging session is also opened at the end of the script")
def run():
args = parser.parse_args(sys.argv[1:])
df = pandas.read_csv(args.peptides)
df["hit_id"] = "hit." + df.index.map(str)
df["hit_id"] = "hit." + df.index.map('{0:07d}'.format)
df = df.set_index("hit_id")
print("Read peptides", df.shape, *df.columns.tolist())
......@@ -54,12 +68,46 @@ def run():
for (hit_id, row) in tqdm.tqdm(df.iterrows(), total=len(df)):
matches = fm.search(row.peptide)
for match in matches:
join_df.append((hit_id, match.doc_id, len(matches)))
reference_row = reference_df.iloc[match.doc_id]
starts = [
m.start() for m in
re.finditer(row.peptide, reference_row.seq)
]
assert len(starts) > 0, (row.peptide, reference_row.seq)
for start in starts:
end_pos = start + len(row.peptide)
n_flank = reference_row.seq[
max(start - args.flanking_length, 0) : start
].rjust(args.flanking_length, 'X')
c_flank = reference_row.seq[
end_pos : (end_pos + args.flanking_length)
].ljust(args.flanking_length, 'X')
join_df.append((
hit_id,
match.doc_id,
len(matches),
len(starts),
start,
start / len(reference_row.seq),
n_flank,
c_flank,
))
if args.debug_max_rows and len(join_df) > args.debug_max_rows:
break
join_df = pandas.DataFrame(
join_df,
columns=["hit_id", "match_index", "num_proteins"],
)
columns=[
"hit_id",
"match_index",
"num_proteins",
"num_occurrences_in_protein",
"start_position",
"start_fraction_in_protein",
"n_flank",
"c_flank",
]).drop_duplicates()
join_df["protein_accession"] = join_df.match_index.map(
reference_df.index.to_series().reset_index(drop=True))
......@@ -84,6 +132,11 @@ def run():
merged_df.to_csv(args.out, index=False)
print("Wrote: %s" % os.path.abspath(args.out))
if args.debug_max_rows:
# Leave user in a debugger
import ipdb
ipdb.set_trace()
if __name__ == '__main__':
run()
0% Loading — or the page failed to load; please reload.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment