diff --git a/downloads-generation/data_mass_spec_annotated/GENERATE.sh b/downloads-generation/data_mass_spec_annotated/GENERATE.sh
index e5851c1ea15f6e106a121edd26e115807bc37199..6fd9539030e657a7ead2ebf2c6e46480f868c3c4 100755
--- a/downloads-generation/data_mass_spec_annotated/GENERATE.sh
+++ b/downloads-generation/data_mass_spec_annotated/GENERATE.sh
@@ -35,7 +35,6 @@ python annotate.py \
     "${REFERENCES_DIR}/uniprot_proteins.csv.bz2" \
     "${REFERENCES_DIR}/uniprot_proteins.fm" \
     --out annotated_ms.csv
-
 bzip2 annotated_ms.csv
 
 cp $SCRIPT_ABSOLUTE_PATH .
diff --git a/downloads-generation/data_mass_spec_benchmark/GENERATE.sh b/downloads-generation/data_mass_spec_benchmark/GENERATE.sh
new file mode 100755
index 0000000000000000000000000000000000000000..46e3804e84e1b8c463cebc8f9f0edb930e6137a0
--- /dev/null
+++ b/downloads-generation/data_mass_spec_benchmark/GENERATE.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+#
+#
+set -e
+set -x
+
+DOWNLOAD_NAME=data_mass_spec_benchmark
+SCRATCH_DIR=${TMPDIR-/tmp}/mhcflurry-downloads-generation
+SCRIPT_ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/$(basename "${BASH_SOURCE[0]}")"
+SCRIPT_DIR=$(dirname "$SCRIPT_ABSOLUTE_PATH")
+export PYTHONUNBUFFERED=1
+
+mkdir -p "$SCRATCH_DIR"
+rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
+mkdir "$SCRATCH_DIR/$DOWNLOAD_NAME"
+
+# Send stdout and stderr to a logfile included with the archive.
+#exec >  >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt")
+#exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
+
+# Log some environment info
+date
+pip freeze
+git status
+
+cd $SCRATCH_DIR/$DOWNLOAD_NAME
+
+cp $SCRIPT_DIR/write_proteome_peptides.py .
+
+PEPTIDES=$(mhcflurry-downloads path data_mass_spec_annotated)/all_sequences.csv.bz2
+REFERENCES_DIR=$(mhcflurry-downloads path data_references)
+
+python write_proteome_peptides.py \
+    "$PEPTIDES" \
+    "${REFERENCES_DIR}/uniprot_proteins.csv.bz2" \
+    --out proteome_peptides.csv
+ls -lh proteome_peptides.csv
+bzip2 proteome_peptides.csv
+
+cp $SCRIPT_ABSOLUTE_PATH .
+bzip2 LOG.txt
+RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
+tar -cjf "$RESULT" *
+echo "Created archive: $RESULT"
diff --git a/downloads-generation/data_mass_spec_benchmark/write_proteome_peptides.py b/downloads-generation/data_mass_spec_benchmark/write_proteome_peptides.py
new file mode 100644
index 0000000000000000000000000000000000000000..76fa9bb2e63c96a7ad8fb3e2f863818e1225e515
--- /dev/null
+++ b/downloads-generation/data_mass_spec_benchmark/write_proteome_peptides.py
@@ -0,0 +1,127 @@
+"""
+"""
+import sys
+import argparse
+import os
+import time
+import collections
+import re
+from six.moves import StringIO
+
+import pandas
+import tqdm  # progress bar
+tqdm.monitor_interval = 0  # see https://github.com/tqdm/tqdm/issues/481
+
+import shellinford
+
+
+parser = argparse.ArgumentParser(usage=__doc__)
+
+parser.add_argument(
+    "input",
+    metavar="FILE.csv",
+    help="CSV of annotated mass spec hits")
+parser.add_argument(
+    "reference_csv",
+    metavar="FILE.csv",
+    help="CSV of protein sequences")
+parser.add_argument(
+    "--out",
+    metavar="OUT.csv",
+    help="Out file path")
+parser.add_argument(
+    "--debug-max-rows",
+    metavar="N",
+    type=int,
+    default=10,
+    help="Max rows to process. Useful for debugging. If specified an ipdb "
+    "debugging session is also opened at the end of the script")
+parser.add_argument(
+    "--lengths",
+    metavar="N",
+    type=int,
+    nargs="+",
+    default=[8,9,10,11],
+    help="Peptide lengths")
+
+
+def run():
+    args = parser.parse_args(sys.argv[1:])
+
+    df_original = pandas.read_csv(args.input)
+    df = df_original
+    print("Read peptides", df.shape, *df.columns.tolist())
+
+    reference_df = pandas.read_csv(args.reference_csv, index_col=0)
+    reference_df = reference_df.set_index("accession")
+    print("Read proteins", reference_df.shape, *reference_df.columns.tolist())
+
+    print("Subselecting to MHC I hits. Before: ", len(df))
+    df = df.loc[df.mhc_class == "I"]
+    print("After: ", len(df))
+
+    print("Subselecting to gene-associated hits. Before: ", len(df))
+    df = df.loc[~df.protein_ensembl_primary.isnull()]
+    print("After: ", len(df))
+
+    (flanking_length,) = list(
+        set(df.n_flank.str.len().unique()).union(
+            set(df.n_flank.str.len().unique())))
+    print("Flanking length", flanking_length)
+
+    proteins = df.protein_accession.unique()
+
+    if args.debug_max_rows:
+        proteins = proteins[:args.debug_max_rows]
+
+    print("Writing decoys for %d proteins" % len(proteins))
+
+    reference_df = reference_df.loc[proteins]
+
+    lengths = sorted(args.lengths)
+    rows = []
+    total = len(reference_df)
+    for (accession, info) in tqdm.tqdm(reference_df.iterrows(), total=total):
+        seq = info.seq
+        for start in range(0, len(seq) - min(args.lengths)):
+            for length in lengths:
+                end_pos = start + length
+                if end_pos > len(seq):
+                    break
+                n_flank = seq[
+                    max(start - flanking_length, 0) : start
+                ].rjust(flanking_length, 'X')
+                c_flank = seq[
+                    end_pos : (end_pos + flanking_length)
+                ].ljust(flanking_length, 'X')
+                peptide = seq[start : start + length]
+
+                rows.append((
+                    accession,
+                    peptide,
+                    n_flank,
+                    c_flank,
+                    start
+                ))
+
+    result_df = pandas.DataFrame(
+        rows,
+        columns=[
+            "protein_accession",
+            "peptide",
+            "n_flank",
+            "c_flank",
+            "start_position",
+        ])
+
+    result_df.to_csv(args.out, index=False)
+    print("Wrote: %s" % os.path.abspath(args.out))
+
+    if args.debug_max_rows:
+        # Leave user in a debugger
+        import ipdb
+        ipdb.set_trace()
+
+
+if __name__ == '__main__':
+    run()
diff --git a/mhcflurry/downloads.yml b/mhcflurry/downloads.yml
index e8499087b796430ae08f0b73d846e2a6886ec0ac..6ae2b54971bc7bda301e4e59f2576178cbc2500c 100644
--- a/mhcflurry/downloads.yml
+++ b/mhcflurry/downloads.yml
@@ -29,6 +29,10 @@ releases:
                 - https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/models_class1_pan_unselected.20190924.tar.bz2.part.aa
               default: false
 
+            - name: data_mass_spec_annotated
+              url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_mass_spec_annotated.20190930.tar.bz2
+              default: false
+
             - name: data_references
               url: https://github.com/openvax/mhcflurry/releases/download/pre-1.4.0/data_references.20190927.tar.bz2
               default: false