diff --git a/downloads-generation/data_mass_spec_annotated/GENERATE.sh b/downloads-generation/data_mass_spec_annotated/GENERATE.sh
index 96d11498cacec8c152cd0c6418b775724cabf3e4..e5851c1ea15f6e106a121edd26e115807bc37199 100755
--- a/downloads-generation/data_mass_spec_annotated/GENERATE.sh
+++ b/downloads-generation/data_mass_spec_annotated/GENERATE.sh
@@ -15,8 +15,8 @@ rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
 mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
 
 # Send stdout and stderr to a logfile included with the archive.
-#exec >  >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
-#exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
+exec >  >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
+exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
 
 # Log some environment info
 date
@@ -27,11 +27,14 @@ cd $SCRATCH_DIR/$DOWNLOAD_NAME
 
 cp $SCRIPT_DIR/annotate.py .
 
-INPUT=$(mhcflurry-downloads path data_curated)/nontraining_curated.by_pmid.csv.bz2
+PEPTIDES=$(mhcflurry-downloads path data_curated)/nontraining_curated.by_pmid.csv.bz2
+REFERENCES_DIR=$(mhcflurry-downloads path data_references)
 
-python annotate.py "$INPUT" --out annotated_ms.csv
-
-exit 1
+python annotate.py \
+    "$PEPTIDES" \
+    "${REFERENCES_DIR}/uniprot_proteins.csv.bz2" \
+    "${REFERENCES_DIR}/uniprot_proteins.fm" \
+    --out annotated_ms.csv
 
 bzip2 annotated_ms.csv
 
diff --git a/downloads-generation/data_mass_spec_annotated/annotate.py b/downloads-generation/data_mass_spec_annotated/annotate.py
index 98cdabb076691f5ef6d564c834bdc17e79b9ea73..56b6b3a654252c270ec8c851753ed199cdfc00c6 100755
--- a/downloads-generation/data_mass_spec_annotated/annotate.py
+++ b/downloads-generation/data_mass_spec_annotated/annotate.py
@@ -3,94 +3,85 @@
 import sys
 import argparse
 import os
+import time
 import collections
 from six.moves import StringIO
 
 import pandas
+import tqdm  # progress bar
+tqdm.tqdm.monitor_interval = 0  # see https://github.com/tqdm/tqdm/issues/481
 
-import mhcnames
-
-
-def normalize_allele_name(s):
-    try:
-        return mhcnames.normalize_allele_name(s)
-    except Exception:
-        return "UNKNOWN"
+import shellinford
 
 
 parser = argparse.ArgumentParser(usage=__doc__)
 
 parser.add_argument(
-    "input_path",
-    help="Item to curate: PMID and list of files")
+    "peptides",
+    metavar="FILE.csv",
+    help="CSV of mass spec hits")
+parser.add_argument(
+    "reference_csv",
+    metavar="FILE.csv",
+    help="CSV of protein sequences")
+parser.add_argument(
+    "reference_index",
+    metavar="FILE.fm",
+    help="shellinford index over protein sequences")
 parser.add_argument(
     "--out",
     metavar="OUT.csv",
     help="Out file path")
 
 
-# Build index
-PREBUILT_INDEX = "datasets/uniprot-proteome_UP000005640.fasta.gz.fm"
-USE_PREBUILT_INDEX = os.path.exists(PREBUILT_INDEX)
-print("Using prebuilt index", USE_PREBUILT_INDEX)
-
-fm = shellinford.FMIndex()
-if USE_PREBUILT_INDEX:
-    fm.read(PREBUILT_INDEX)
-
-fm_keys = []
-protein_database = "datasets/uniprot-proteome_UP000005640.fasta.gz"
-start = time.time()
-proteome_df = []
-with gzip.open(protein_database, "rt") as fd:
-    records = SeqIO.parse(fd, format='fasta')
-    for (i, record) in enumerate(records):
-        if i % 10000 == 0:
-            print(i, time.time() - start)
-        fm_keys.append(record.name)
-        proteome_df.append((record.name, record.description, str(record.seq)))
-        if not USE_PREBUILT_INDEX:
-            fm.push_back("$" + str(record.seq) + "$")  # include sentinels
-
-if not USE_PREBUILT_INDEX:
-    print("Building")
-    start = time.time()
-    fm.build()
-    print("Done building", time.time() - start)
-    fm.write(PREBUILT_INDEX)
-
-proteome_df = pandas.DataFrame(proteome_df, columns=["name", "description", "seq"]).set_index("name")
-proteome_df
-
-SEARCH_CACHE = {}
-def search(peptide, fm=fm):
-    if peptide in SEARCH_CACHE:
-        return SEARCH_CACHE[peptide]
-    hits = fm.search(peptide)
-    result = proteome_df.iloc[
-        [hit.doc_id for hit in hits]
-    ]
-    assert result.seq.str.contains(peptide).all(), (peptide, result)
-    names = result.index.tolist()
-    SEARCH_CACHE[peptide] = names
-    return names
-print(search("SIINFEKL"))
-print(search("AAAAAKVPA"))
-print(search("AAAAALQAK"))
-print(search("DEGPLDVSM"))
+def run():
+    args = parser.parse_args(sys.argv[1:])
 
+    df = pandas.read_csv(args.peptides)
+    df["hit_id"] = "hit." + df.index.map(str)
+    df = df.set_index("hit_id")
+    print("Read peptides", df.shape, *df.columns.tolist())
 
+    reference_df = pandas.read_csv(args.reference_csv, index_col=0)
+    reference_df = reference_df.set_index("accession")
+    print("Read proteins", reference_df.shape, *reference_df.columns.tolist())
 
+    fm = shellinford.FMIndex()
+    fm.read(args.reference_index)
+    print("Read proteins index")
 
-def run():
-    args = parser.parse_args(sys.argv[1:])
+    join_df = []
+    for (hit_id, row) in tqdm.tqdm(df.iterrows(), total=len(df)):
+        matches = fm.search(row.peptide)
+        for match in matches:
+            join_df.append((hit_id, match.doc_id, len(matches)))
 
-    df = pandas.read_csv(args.input_path)
-    print("Read input", df.shape)
+    join_df = pandas.DataFrame(
+        join_df,
+        columns=["hit_id", "match_index", "num_proteins"],
+    )
 
-    import ipdb ; ipdb.set_trace()
+    join_df["protein_accession"] = join_df.match_index.map(
+        reference_df.index.to_series().reset_index(drop=True))
 
-    df.to_csv(args.out, index=False)
+    del join_df["match_index"]
+
+    protein_cols = [
+        c for c in reference_df.columns
+        if c not in ["name", "description", "seq"]
+    ]
+    for col in protein_cols:
+        join_df["protein_%s" % col] = join_df.protein_accession.map(
+            reference_df[col])
+
+    merged_df = pandas.merge(
+        join_df,
+        df,
+        how="left",
+        left_on="hit_id",
+        right_index=True)
+
+    merged_df.to_csv(args.out, index=False)
     print("Wrote: %s" % os.path.abspath(args.out))
 
 
diff --git a/downloads-generation/data_mass_spec_annotated/requirements.txt b/downloads-generation/data_mass_spec_annotated/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..917704c491264bd5002fd01ca2647b7df40585b5
--- /dev/null
+++ b/downloads-generation/data_mass_spec_annotated/requirements.txt
@@ -0,0 +1,2 @@
+shellinford
+
diff --git a/mhcflurry/downloads.yml b/mhcflurry/downloads.yml
index 3e05c91e5b5ddf92045db2feff828e7dde51f132..e8499087b796430ae08f0b73d846e2a6886ec0ac 100644
--- a/mhcflurry/downloads.yml
+++ b/mhcflurry/downloads.yml
@@ -29,6 +29,10 @@ releases:
                 - https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/models_class1_pan_unselected.20190924.tar.bz2.part.aa
               default: false
 
+            - name: data_references
+              url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_references.20190927.tar.bz2
+              default: false
+
             - name: data_iedb
               url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_iedb.20190916.tar.bz2
               default: false