Skip to content
Snippets Groups Projects
Commit ec163a5a authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

fix

parent f000e398
No related branches found
No related tags found
No related merge requests found
......@@ -15,8 +15,8 @@ rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"

# Send stdout and stderr to a logfile included with the archive.
# (tee -ia: append, ignore SIGINT, so Ctrl-C still stops the script cleanly.)
exec > >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)

# Log some environment info
date
......@@ -27,11 +27,14 @@ cd $SCRATCH_DIR/$DOWNLOAD_NAME
cp "$SCRIPT_DIR/annotate.py" .

# Inputs: curated mass-spec hits plus the reference proteome (CSV + FM-index).
PEPTIDES="$(mhcflurry-downloads path data_curated)/nontraining_curated.by_pmid.csv.bz2"
REFERENCES_DIR="$(mhcflurry-downloads path data_references)"

# Annotate each peptide hit with the reference proteins that contain it.
python annotate.py \
    "$PEPTIDES" \
    "${REFERENCES_DIR}/uniprot_proteins.csv.bz2" \
    "${REFERENCES_DIR}/uniprot_proteins.fm" \
    --out annotated_ms.csv

bzip2 annotated_ms.csv
......
......@@ -3,94 +3,85 @@
import sys
import argparse
import os
import time
import collections
from six.moves import StringIO
import pandas
import tqdm # progress bar
tqdm.monitor_interval = 0 # see https://github.com/tqdm/tqdm/issues/481
import mhcnames
def normalize_allele_name(s):
    """Normalize an MHC allele name, falling back to "UNKNOWN" on any parse failure."""
    try:
        normalized = mhcnames.normalize_allele_name(s)
    except Exception:
        # Deliberately broad: any unparseable allele string maps to the sentinel.
        normalized = "UNKNOWN"
    return normalized
import shellinford
# Command-line interface: peptide hits, reference proteome CSV, and the
# prebuilt shellinford FM-index over those protein sequences.
# (The stale "input_path" positional from the previous revision is removed;
# it duplicated the first positional argument and broke the parser.)
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
    "peptides",
    metavar="FILE.csv",
    help="CSV of mass spec hits")
parser.add_argument(
    "reference_csv",
    metavar="FILE.csv",
    help="CSV of protein sequences")
parser.add_argument(
    "reference_index",
    metavar="FILE.fm",
    help="shellinford index over protein sequences")
parser.add_argument(
    "--out",
    metavar="OUT.csv",
    help="Out file path")
# Build index
# Load (or build and cache) an FM-index over the human reference proteome,
# and collect every protein's name/description/sequence into a DataFrame.
PREBUILT_INDEX = "datasets/uniprot-proteome_UP000005640.fasta.gz.fm"
USE_PREBUILT_INDEX = os.path.exists(PREBUILT_INDEX)
print("Using prebuilt index", USE_PREBUILT_INDEX)
fm = shellinford.FMIndex()
if USE_PREBUILT_INDEX:
    fm.read(PREBUILT_INDEX)
fm_keys = []
protein_database = "datasets/uniprot-proteome_UP000005640.fasta.gz"
start = time.time()
proteome_df = []
# NOTE(review): `gzip` and `SeqIO` (presumably Bio.SeqIO) are not imported in
# the visible portion of this file — confirm they are imported elsewhere.
with gzip.open(protein_database, "rt") as fd:
    records = SeqIO.parse(fd, format='fasta')
    for (i, record) in enumerate(records):
        if i % 10000 == 0:
            # Periodic progress report with elapsed seconds.
            print(i, time.time() - start)
        fm_keys.append(record.name)
        proteome_df.append((record.name, record.description, str(record.seq)))
        if not USE_PREBUILT_INDEX:
            fm.push_back("$" + str(record.seq) + "$")  # include sentinels
if not USE_PREBUILT_INDEX:
    print("Building")
    start = time.time()
    fm.build()
    print("Done building", time.time() - start)
    # Cache the built index so subsequent runs skip the build step.
    fm.write(PREBUILT_INDEX)
proteome_df = pandas.DataFrame(proteome_df, columns=["name", "description", "seq"]).set_index("name")
# NOTE(review): bare expression below is a no-op outside a notebook/REPL.
proteome_df
SEARCH_CACHE = {}

def search(peptide, fm=fm):
    """Return names of proteome entries containing `peptide` (memoized)."""
    try:
        return SEARCH_CACHE[peptide]
    except KeyError:
        pass
    hits = fm.search(peptide)
    matched = proteome_df.iloc[[hit.doc_id for hit in hits]]
    # Sanity check: every FM-index hit must actually contain the peptide.
    assert matched.seq.str.contains(peptide).all(), (peptide, matched)
    names = matched.index.tolist()
    SEARCH_CACHE[peptide] = names
    return names
# Spot-check the index with a few known peptides.
for probe in ("SIINFEKL", "AAAAAKVPA", "AAAAALQAK", "DEGPLDVSM"):
    print(search(probe))
def run():
    """Annotate mass-spec peptide hits with the reference proteins containing them.

    Reads the peptides CSV, the reference protein CSV, and the prebuilt
    shellinford FM-index; writes one output row per (hit, matching protein)
    pair to ``args.out``.
    """
    # Reconstructed to drop lines left over from the previous revision that the
    # diff fused in: a duplicated `def run():` header, a read of the removed
    # `args.input_path` argument, a stray `ipdb.set_trace()` debug breakpoint,
    # and a premature `df.to_csv` before annotation.
    args = parser.parse_args(sys.argv[1:])

    df = pandas.read_csv(args.peptides)
    # Give each hit a stable identifier usable as a join key.
    df["hit_id"] = "hit." + df.index.map(str)
    df = df.set_index("hit_id")
    print("Read peptides", df.shape, *df.columns.tolist())

    reference_df = pandas.read_csv(args.reference_csv, index_col=0)
    reference_df = reference_df.set_index("accession")
    print("Read proteins", reference_df.shape, *reference_df.columns.tolist())

    fm = shellinford.FMIndex()
    fm.read(args.reference_index)
    print("Read proteins index")

    # One row per (hit, matching protein). num_proteins records how many
    # proteins matched the hit's peptide.
    # NOTE(review): assumes the peptides CSV has a "peptide" column — confirm.
    join_df = []
    for (hit_id, row) in tqdm.tqdm(df.iterrows(), total=len(df)):
        matches = fm.search(row.peptide)
        for match in matches:
            join_df.append((hit_id, match.doc_id, len(matches)))

    join_df = pandas.DataFrame(
        join_df,
        columns=["hit_id", "match_index", "num_proteins"])

    # Map FM-index document positions back to protein accessions. This assumes
    # the index documents are in the same order as reference_df's rows.
    join_df["protein_accession"] = join_df.match_index.map(
        reference_df.index.to_series().reset_index(drop=True))
    del join_df["match_index"]

    # Carry over reference metadata columns (excluding the bulky/name fields),
    # prefixed so they don't collide with peptide columns.
    protein_cols = [
        c for c in reference_df.columns
        if c not in ["name", "description", "seq"]
    ]
    for col in protein_cols:
        join_df["protein_%s" % col] = join_df.protein_accession.map(
            reference_df[col])

    merged_df = pandas.merge(
        join_df,
        df,
        how="left",
        left_on="hit_id",
        right_index=True)
    merged_df.to_csv(args.out, index=False)
    print("Wrote: %s" % os.path.abspath(args.out))
......
shellinford
......@@ -29,6 +29,10 @@ releases:
- https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/models_class1_pan_unselected.20190924.tar.bz2.part.aa
default: false
- name: data_references
url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_references.20190927.tar.bz2
default: false
- name: data_iedb
url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_iedb.20190916.tar.bz2
default: false
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment