Skip to content
Snippets Groups Projects
Commit 8de5d73d authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

add flanking sequence

parent ec163a5a
No related branches found
No related tags found
No related merge requests found
......@@ -15,8 +15,8 @@ rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive.
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
#exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
#exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
# Log some environment info
date
......
......@@ -5,6 +5,7 @@ import argparse
import os
import time
import collections
import re
from six.moves import StringIO
import pandas
......@@ -32,13 +33,26 @@ parser.add_argument(
"--out",
metavar="OUT.csv",
help="Out file path")
parser.add_argument(
"--flanking-length",
metavar="N",
type=int,
default=15,
help="Length of flanking sequence to include")
parser.add_argument(
"--debug-max-rows",
metavar="N",
type=int,
default=None,
help="Max rows to process. Useful for debugging. If specified an ipdb "
"debugging session is also opened at the end of the script")
def run():
args = parser.parse_args(sys.argv[1:])
df = pandas.read_csv(args.peptides)
df["hit_id"] = "hit." + df.index.map(str)
df["hit_id"] = "hit." + df.index.map('{0:07d}'.format)
df = df.set_index("hit_id")
print("Read peptides", df.shape, *df.columns.tolist())
......@@ -54,12 +68,46 @@ def run():
for (hit_id, row) in tqdm.tqdm(df.iterrows(), total=len(df)):
matches = fm.search(row.peptide)
for match in matches:
join_df.append((hit_id, match.doc_id, len(matches)))
reference_row = reference_df.iloc[match.doc_id]
starts = [
m.start() for m in
re.finditer(row.peptide, reference_row.seq)
]
assert len(starts) > 0, (row.peptide, reference_row.seq)
for start in starts:
end_pos = start + len(row.peptide)
n_flank = reference_row.seq[
max(start - args.flanking_length, 0) : start
].rjust(args.flanking_length, 'X')
c_flank = reference_row.seq[
end_pos : (end_pos + args.flanking_length)
].ljust(args.flanking_length, 'X')
join_df.append((
hit_id,
match.doc_id,
len(matches),
len(starts),
start,
start / len(reference_row.seq),
n_flank,
c_flank,
))
if args.debug_max_rows and len(join_df) > args.debug_max_rows:
break
join_df = pandas.DataFrame(
join_df,
columns=["hit_id", "match_index", "num_proteins"],
)
columns=[
"hit_id",
"match_index",
"num_proteins",
"num_occurrences_in_protein",
"start_position",
"start_fraction_in_protein",
"n_flank",
"c_flank",
]).drop_duplicates()
join_df["protein_accession"] = join_df.match_index.map(
reference_df.index.to_series().reset_index(drop=True))
......@@ -84,6 +132,11 @@ def run():
merged_df.to_csv(args.out, index=False)
print("Wrote: %s" % os.path.abspath(args.out))
if args.debug_max_rows:
# Leave user in a debugger
import ipdb
ipdb.set_trace()
if __name__ == '__main__':
run()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment