Skip to content
Snippets Groups Projects
Commit ec163a5a authored by Tim O'Donnell
Browse files

fix

parent f000e398
No related branches found
No related tags found
No related merge requests found
...@@ -15,8 +15,8 @@ rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME" ...@@ -15,8 +15,8 @@ rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME" mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
# Send stdout and stderr to a logfile included with the archive. # Send stdout and stderr to a logfile included with the archive.
#exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt") exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
#exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2) exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
# Log some environment info # Log some environment info
date date
...@@ -27,11 +27,14 @@ cd $SCRATCH_DIR/$DOWNLOAD_NAME ...@@ -27,11 +27,14 @@ cd $SCRATCH_DIR/$DOWNLOAD_NAME
cp $SCRIPT_DIR/annotate.py . cp $SCRIPT_DIR/annotate.py .
INPUT=$(mhcflurry-downloads path data_curated)/nontraining_curated.by_pmid.csv.bz2 PEPTIDES=$(mhcflurry-downloads path data_curated)/nontraining_curated.by_pmid.csv.bz2
REFERENCES_DIR=$(mhcflurry-downloads path data_references)
python annotate.py "$INPUT" --out annotated_ms.csv python annotate.py \
"$PEPTIDES" \
exit 1 "${REFERENCES_DIR}/uniprot_proteins.csv.bz2" \
"${REFERENCES_DIR}/uniprot_proteins.fm" \
--out annotated_ms.csv
bzip2 annotated_ms.csv bzip2 annotated_ms.csv
......
...@@ -3,94 +3,85 @@ ...@@ -3,94 +3,85 @@
import sys import sys
import argparse import argparse
import os import os
import time
import collections import collections
from six.moves import StringIO from six.moves import StringIO
import pandas import pandas
import tqdm # progress bar
tqdm.monitor_interval = 0 # see https://github.com/tqdm/tqdm/issues/481
import mhcnames import shellinford
def normalize_allele_name(s):
try:
return mhcnames.normalize_allele_name(s)
except Exception:
return "UNKNOWN"
parser = argparse.ArgumentParser(usage=__doc__) parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument( parser.add_argument(
"input_path", "peptides",
help="Item to curate: PMID and list of files") metavar="FILE.csv",
help="CSV of mass spec hits")
parser.add_argument(
"reference_csv",
metavar="FILE.csv",
help="CSV of protein sequences")
parser.add_argument(
"reference_index",
metavar="FILE.fm",
help="shellinford index over protein sequences")
parser.add_argument( parser.add_argument(
"--out", "--out",
metavar="OUT.csv", metavar="OUT.csv",
help="Out file path") help="Out file path")
# Build index def run():
PREBUILT_INDEX = "datasets/uniprot-proteome_UP000005640.fasta.gz.fm" args = parser.parse_args(sys.argv[1:])
USE_PREBUILT_INDEX = os.path.exists(PREBUILT_INDEX)
print("Using prebuilt index", USE_PREBUILT_INDEX)
fm = shellinford.FMIndex()
if USE_PREBUILT_INDEX:
fm.read(PREBUILT_INDEX)
fm_keys = []
protein_database = "datasets/uniprot-proteome_UP000005640.fasta.gz"
start = time.time()
proteome_df = []
with gzip.open(protein_database, "rt") as fd:
records = SeqIO.parse(fd, format='fasta')
for (i, record) in enumerate(records):
if i % 10000 == 0:
print(i, time.time() - start)
fm_keys.append(record.name)
proteome_df.append((record.name, record.description, str(record.seq)))
if not USE_PREBUILT_INDEX:
fm.push_back("$" + str(record.seq) + "$") # include sentinels
if not USE_PREBUILT_INDEX:
print("Building")
start = time.time()
fm.build()
print("Done building", time.time() - start)
fm.write(PREBUILT_INDEX)
proteome_df = pandas.DataFrame(proteome_df, columns=["name", "description", "seq"]).set_index("name")
proteome_df
SEARCH_CACHE = {}
def search(peptide, fm=fm):
if peptide in SEARCH_CACHE:
return SEARCH_CACHE[peptide]
hits = fm.search(peptide)
result = proteome_df.iloc[
[hit.doc_id for hit in hits]
]
assert result.seq.str.contains(peptide).all(), (peptide, result)
names = result.index.tolist()
SEARCH_CACHE[peptide] = names
return names
print(search("SIINFEKL"))
print(search("AAAAAKVPA"))
print(search("AAAAALQAK"))
print(search("DEGPLDVSM"))
df = pandas.read_csv(args.peptides)
df["hit_id"] = "hit." + df.index.map(str)
df = df.set_index("hit_id")
print("Read peptides", df.shape, *df.columns.tolist())
reference_df = pandas.read_csv(args.reference_csv, index_col=0)
reference_df = reference_df.set_index("accession")
print("Read proteins", reference_df.shape, *reference_df.columns.tolist())
fm = shellinford.FMIndex()
fm.read(args.reference_index)
print("Read proteins index")
def run(): join_df = []
args = parser.parse_args(sys.argv[1:]) for (hit_id, row) in tqdm.tqdm(df.iterrows(), total=len(df)):
matches = fm.search(row.peptide)
for match in matches:
join_df.append((hit_id, match.doc_id, len(matches)))
df = pandas.read_csv(args.input_path) join_df = pandas.DataFrame(
print("Read input", df.shape) join_df,
columns=["hit_id", "match_index", "num_proteins"],
)
import ipdb ; ipdb.set_trace() join_df["protein_accession"] = join_df.match_index.map(
reference_df.index.to_series().reset_index(drop=True))
df.to_csv(args.out, index=False) del join_df["match_index"]
protein_cols = [
c for c in reference_df.columns
if c not in ["name", "description", "seq"]
]
for col in protein_cols:
join_df["protein_%s" % col] = join_df.protein_accession.map(
reference_df[col])
merged_df = pandas.merge(
join_df,
df,
how="left",
left_on="hit_id",
right_index=True)
merged_df.to_csv(args.out, index=False)
print("Wrote: %s" % os.path.abspath(args.out)) print("Wrote: %s" % os.path.abspath(args.out))
......
shellinford
...@@ -29,6 +29,10 @@ releases: ...@@ -29,6 +29,10 @@ releases:
- https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/models_class1_pan_unselected.20190924.tar.bz2.part.aa - https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/models_class1_pan_unselected.20190924.tar.bz2.part.aa
default: false default: false
- name: data_references
url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_references.20190927.tar.bz2
default: false
- name: data_iedb - name: data_iedb
url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_iedb.20190916.tar.bz2 url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_iedb.20190916.tar.bz2
default: false default: false
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment