Commit ec163a5a authored by Tim O'Donnell

fix

parent f000e398
@@ -15,8 +15,8 @@ rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
 mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
 
 # Send stdout and stderr to a logfile included with the archive.
-#exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
-#exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
+exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
+exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
 
 # Log some environment info
 date
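(Re-enabling the two exec/tee lines duplicates the script's stdout and stderr into LOG.txt inside the download archive while keeping normal console output; tee -ia appends rather than truncates and ignores interrupt signals.)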
@@ -27,11 +27,14 @@ cd $SCRATCH_DIR/$DOWNLOAD_NAME
 cp $SCRIPT_DIR/annotate.py .
 
-INPUT=$(mhcflurry-downloads path data_curated)/nontraining_curated.by_pmid.csv.bz2
+PEPTIDES=$(mhcflurry-downloads path data_curated)/nontraining_curated.by_pmid.csv.bz2
+REFERENCES_DIR=$(mhcflurry-downloads path data_references)
 
-python annotate.py "$INPUT" --out annotated_ms.csv
-exit 1
+python annotate.py \
+    "$PEPTIDES" \
+    "${REFERENCES_DIR}/uniprot_proteins.csv.bz2" \
+    "${REFERENCES_DIR}/uniprot_proteins.fm" \
+    --out annotated_ms.csv
 
 bzip2 annotated_ms.csv
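Note: the script assumes both inputs have already been staged locally; with standard mhcflurry tooling that would be `mhcflurry-downloads fetch data_curated data_references` before running it. The removed `exit 1` appears to have been a debugging early-exit that prevented the annotation and bzip2 steps from ever running.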
@@ -3,94 +3,85 @@
 import sys
 import argparse
 import os
 import time
 import collections
 from six.moves import StringIO
 
 import pandas
 import tqdm  # progress bar
 tqdm.monitor_interval = 0  # see https://github.com/tqdm/tqdm/issues/481
 
 import mhcnames
 
 
 def normalize_allele_name(s):
     try:
         return mhcnames.normalize_allele_name(s)
     except Exception:
         return "UNKNOWN"
 
 import shellinford
 
 parser = argparse.ArgumentParser(usage=__doc__)
 parser.add_argument(
-    "input_path",
-    help="Item to curate: PMID and list of files")
+    "peptides",
+    metavar="FILE.csv",
+    help="CSV of mass spec hits")
+parser.add_argument(
+    "reference_csv",
+    metavar="FILE.csv",
+    help="CSV of protein sequences")
+parser.add_argument(
+    "reference_index",
+    metavar="FILE.fm",
+    help="shellinford index over protein sequences")
 parser.add_argument(
     "--out",
     metavar="OUT.csv",
     help="Out file path")
 
-# Build index
-PREBUILT_INDEX = "datasets/uniprot-proteome_UP000005640.fasta.gz.fm"
-USE_PREBUILT_INDEX = os.path.exists(PREBUILT_INDEX)
-print("Using prebuilt index", USE_PREBUILT_INDEX)
-fm = shellinford.FMIndex()
-if USE_PREBUILT_INDEX:
-    fm.read(PREBUILT_INDEX)
-fm_keys = []
-protein_database = "datasets/uniprot-proteome_UP000005640.fasta.gz"
-start = time.time()
-proteome_df = []
-with gzip.open(protein_database, "rt") as fd:
-    records = SeqIO.parse(fd, format='fasta')
-    for (i, record) in enumerate(records):
-        if i % 10000 == 0:
-            print(i, time.time() - start)
-        fm_keys.append(record.name)
-        proteome_df.append((record.name, record.description, str(record.seq)))
-        if not USE_PREBUILT_INDEX:
-            fm.push_back("$" + str(record.seq) + "$")  # include sentinels
-if not USE_PREBUILT_INDEX:
-    print("Building")
-    start = time.time()
-    fm.build()
-    print("Done building", time.time() - start)
-    fm.write(PREBUILT_INDEX)
-proteome_df = pandas.DataFrame(
-    proteome_df, columns=["name", "description", "seq"]).set_index("name")
-proteome_df
-
-SEARCH_CACHE = {}
-
-def search(peptide, fm=fm):
-    if peptide in SEARCH_CACHE:
-        return SEARCH_CACHE[peptide]
-    hits = fm.search(peptide)
-    result = proteome_df.iloc[
-        [hit.doc_id for hit in hits]
-    ]
-    assert result.seq.str.contains(peptide).all(), (peptide, result)
-    names = result.index.tolist()
-    SEARCH_CACHE[peptide] = names
-    return names
-
-print(search("SIINFEKL"))
-print(search("AAAAAKVPA"))
-print(search("AAAAALQAK"))
-print(search("DEGPLDVSM"))
-
-def run():
-    args = parser.parse_args(sys.argv[1:])
-    df = pandas.read_csv(args.input_path)
-    print("Read input", df.shape)
-    import ipdb ; ipdb.set_trace()
-    df.to_csv(args.out, index=False)
+def run():
+    args = parser.parse_args(sys.argv[1:])
+
+    df = pandas.read_csv(args.peptides)
+    df["hit_id"] = "hit." + df.index.map(str)
+    df = df.set_index("hit_id")
+    print("Read peptides", df.shape, *df.columns.tolist())
+
+    reference_df = pandas.read_csv(args.reference_csv, index_col=0)
+    reference_df = reference_df.set_index("accession")
+    print("Read proteins", reference_df.shape, *reference_df.columns.tolist())
+
+    fm = shellinford.FMIndex()
+    fm.read(args.reference_index)
+    print("Read proteins index")
+
+    join_df = []
+    for (hit_id, row) in tqdm.tqdm(df.iterrows(), total=len(df)):
+        matches = fm.search(row.peptide)
+        for match in matches:
+            join_df.append((hit_id, match.doc_id, len(matches)))
+
+    join_df = pandas.DataFrame(
+        join_df,
+        columns=["hit_id", "match_index", "num_proteins"],
+    )
+    join_df["protein_accession"] = join_df.match_index.map(
+        reference_df.index.to_series().reset_index(drop=True))
+    del join_df["match_index"]
+
+    protein_cols = [
+        c for c in reference_df.columns
+        if c not in ["name", "description", "seq"]
+    ]
+    for col in protein_cols:
+        join_df["protein_%s" % col] = join_df.protein_accession.map(
+            reference_df[col])
+
+    merged_df = pandas.merge(
+        join_df,
+        df,
+        how="left",
+        left_on="hit_id",
+        right_index=True)
+
+    merged_df.to_csv(args.out, index=False)
     print("Wrote: %s" % os.path.abspath(args.out))
+shellinford
@@ -29,6 +29,10 @@ releases:
         - https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/models_class1_pan_unselected.20190924.tar.bz2.part.aa
       default: false
 
+    - name: data_references
+      url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_references.20190927.tar.bz2
+      default: false
+
     - name: data_iedb
       url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_iedb.20190916.tar.bz2
       default: false
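Once this entry is in place, data_references behaves like any other optional download: it can be staged with `mhcflurry-downloads fetch data_references`, after which the `mhcflurry-downloads path data_references` call in the generation script above resolves to its local directory.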