fix

ec163a5a · Tim O'Donnell · f000e398 · ec163a5a · ec163a5a · ec163a5a
Commit ec163a5a authored 5 years ago by Tim O'Donnell
--- a/downloads-generation/data_mass_spec_annotated/GENERATE.sh
+++ b/downloads-generation/data_mass_spec_annotated/GENERATE.sh
@@ -15,8 +15,8 @@ rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
 mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
 # Send stdout and stderr to a logfile included with the archive.
-#exec >  >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
+exec >  >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
-#exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
+exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
 # Log some environment info
 date
@@ -27,11 +27,14 @@ cd $SCRATCH_DIR/$DOWNLOAD_NAME
 cp $SCRIPT_DIR/annotate.py .
-INPUT=$(mhcflurry-downloads path data_curated)/nontraining_curated.by_pmid.csv.bz2
+PEPTIDES=$(mhcflurry-downloads path data_curated)/nontraining_curated.by_pmid.csv.bz2
+REFERENCES_DIR=$(mhcflurry-downloads path data_references)
-python annotate.py "$INPUT" --out annotated_ms.csv
+python annotate.py \
+    "$PEPTIDES" \
-exit 1
+    "${REFERENCES_DIR}/uniprot_proteins.csv.bz2" \
+    "${REFERENCES_DIR}/uniprot_proteins.fm" \
+    --out annotated_ms.csv
 bzip2 annotated_ms.csv

--- a/downloads-generation/data_mass_spec_annotated/annotate.py
+++ b/downloads-generation/data_mass_spec_annotated/annotate.py
@@ -3,94 +3,85 @@
 import sys
 import argparse
 import os
+import time
 import collections
 from six.moves import StringIO
 import pandas
+import tqdm  # progress bar
+tqdm.monitor_interval = 0  # see https://github.com/tqdm/tqdm/issues/481
-import mhcnames
+import shellinford
-def normalize_allele_name(s):
-    try:
-        return mhcnames.normalize_allele_name(s)
-    except Exception:
-        return "UNKNOWN"
 parser = argparse.ArgumentParser(usage=__doc__)
 parser.add_argument(
-    "input_path",
+    "peptides",
-    help="Item to curate: PMID and list of files")
+    metavar="FILE.csv",
+    help="CSV of mass spec hits")
+parser.add_argument(
+    "reference_csv",
+    metavar="FILE.csv",
+    help="CSV of protein sequences")
+parser.add_argument(
+    "reference_index",
+    metavar="FILE.fm",
+    help="shellinford index over protein sequences")
 parser.add_argument(
    "--out",
    metavar="OUT.csv",
    help="Out file path")
-# Build index
+def run():
-PREBUILT_INDEX = "datasets/uniprot-proteome_UP000005640.fasta.gz.fm"
+    args = parser.parse_args(sys.argv[1:])
-USE_PREBUILT_INDEX = os.path.exists(PREBUILT_INDEX)
-print("Using prebuilt index", USE_PREBUILT_INDEX)
-fm = shellinford.FMIndex()
-if USE_PREBUILT_INDEX:
-    fm.read(PREBUILT_INDEX)
-fm_keys = []
-protein_database = "datasets/uniprot-proteome_UP000005640.fasta.gz"
-start = time.time()
-proteome_df = []
-with gzip.open(protein_database, "rt") as fd:
-    records = SeqIO.parse(fd, format='fasta')
-    for (i, record) in enumerate(records):
-        if i % 10000 == 0:
-            print(i, time.time() - start)
-        fm_keys.append(record.name)
-        proteome_df.append((record.name, record.description, str(record.seq)))
-        if not USE_PREBUILT_INDEX:
-            fm.push_back("$" + str(record.seq) + "$")  # include sentinels
-if not USE_PREBUILT_INDEX:
-    print("Building")
-    start = time.time()
-    fm.build()
-    print("Done building", time.time() - start)
-    fm.write(PREBUILT_INDEX)
-proteome_df = pandas.DataFrame(proteome_df, columns=["name", "description", "seq"]).set_index("name")
-proteome_df
-SEARCH_CACHE = {}
-def search(peptide, fm=fm):
-    if peptide in SEARCH_CACHE:
-        return SEARCH_CACHE[peptide]
-    hits = fm.search(peptide)
-    result = proteome_df.iloc[
-        [hit.doc_id for hit in hits]
-    ]
-    assert result.seq.str.contains(peptide).all(), (peptide, result)
-    names = result.index.tolist()
-    SEARCH_CACHE[peptide] = names
-    return names
-print(search("SIINFEKL"))
-print(search("AAAAAKVPA"))
-print(search("AAAAALQAK"))
-print(search("DEGPLDVSM"))
+    df = pandas.read_csv(args.peptides)
+    df["hit_id"] = "hit." + df.index.map(str)
+    df = df.set_index("hit_id")
+    print("Read peptides", df.shape, *df.columns.tolist())
+    reference_df = pandas.read_csv(args.reference_csv, index_col=0)
+    reference_df = reference_df.set_index("accession")
+    print("Read proteins", reference_df.shape, *reference_df.columns.tolist())
+    fm = shellinford.FMIndex()
+    fm.read(args.reference_index)
+    print("Read proteins index")
-def run():
+    join_df = []
-    args = parser.parse_args(sys.argv[1:])
+    for (hit_id, row) in tqdm.tqdm(df.iterrows(), total=len(df)):
+        matches = fm.search(row.peptide)
+        for match in matches:
+            join_df.append((hit_id, match.doc_id, len(matches)))
-    df = pandas.read_csv(args.input_path)
+    join_df = pandas.DataFrame(
-    print("Read input", df.shape)
+        join_df,
+        columns=["hit_id", "match_index", "num_proteins"],
+    )
-    import ipdb ; ipdb.set_trace()
+    join_df["protein_accession"] = join_df.match_index.map(
+        reference_df.index.to_series().reset_index(drop=True))
-    df.to_csv(args.out, index=False)
+    del join_df["match_index"]
+    protein_cols = [
+        c for c in reference_df.columns
+        if c not in ["name", "description", "seq"]
+    ]
+    for col in protein_cols:
+        join_df["protein_%s" % col] = join_df.protein_accession.map(
+            reference_df[col])
+    merged_df = pandas.merge(
+        join_df,
+        df,
+        how="left",
+        left_on="hit_id",
+        right_index=True)
+    merged_df.to_csv(args.out, index=False)
    print("Wrote: %s" % os.path.abspath(args.out))

--- a/downloads-generation/data_mass_spec_annotated/requirements.txt
+++ b/downloads-generation/data_mass_spec_annotated/requirements.txt
+shellinford
--- a/mhcflurry/downloads.yml
+++ b/mhcflurry/downloads.yml
@@ -29,6 +29,10 @@ releases:
                - https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/models_class1_pan_unselected.20190924.tar.bz2.part.aa
              default: false
+            - name: data_references
+              url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_references.20190927.tar.bz2
+              default: false
            - name: data_iedb
              url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_iedb.20190916.tar.bz2
              default: false