diff --git a/downloads-generation/data_mass_spec_benchmark/GENERATE.sh b/downloads-generation/data_mass_spec_benchmark/GENERATE.sh
index d67bb086e86162f91704b630b2a49518ec959c78..f6dd0a692b18bf731a06e008e743e8a717c9168f 100755
--- a/downloads-generation/data_mass_spec_benchmark/GENERATE.sh
+++ b/downloads-generation/data_mass_spec_benchmark/GENERATE.sh
@@ -76,8 +76,8 @@ REFERENCES_DIR=$(mhcflurry-downloads path data_references)
 
 if [ "${2:-reuse-none}" != "reuse-none" ]
 then
-    EXISTING_DATA=$(mhcflurry-downloads path $DOWNLOAD_NAME)
-    echo "Will reuse data from $REFERENCES_DIR"
+    EXISTING_DATA="$(mhcflurry-downloads path $DOWNLOAD_NAME)"
+    echo "Will reuse data from $EXISTING_DATA"
 else
     EXISTING_DATA=""
     echo "Will NOT reuse any data"
@@ -121,14 +121,15 @@ do
     for kind in with_mass_spec no_mass_spec
     do
         OUT_DIR=predictions/${subset}.mhcflurry.${kind}
-        REUSE_ARG=""
+        REUSE1=""
+        REUSE2=""
         if [ "$subset" == "all" ]
         then
-            REUSE_ARG="--reuse-predictions predictions/chr1.mhcflurry.${kind}"
+            REUSE1="predictions/chr1.mhcflurry.${kind}"
         fi
         if [ "${2:-reuse-none}" != "reuse-none" ] && [ "${2:-reuse-none}" != "reuse-predictions-except-mhcflurry" ]
         then
-            REUSE_ARG+="--reuse-predictions" "$EXISTING_DATA/$OUT_DIR"
+            REUSE2="$EXISTING_DATA/$OUT_DIR"
         fi
 
         python run_predictors.py \
@@ -141,19 +142,20 @@ do
             --out "$OUT_DIR" \
             --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
             --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.gpu.lsf \
-            $REUSE_ARG $EXTRA_ARGS
+            --reuse-predictions "$REUSE1" "$REUSE2" $EXTRA_ARGS
     done
 
     # Run netmhcpan4
     OUT_DIR=predictions/${subset}.netmhcpan4
-    REUSE_ARG=""
+    REUSE1=""
+    REUSE2=""
     if [ "$subset" == "all" ]
     then
-        REUSE_ARG="--reuse-predictions predictions/chr1.netmhcpan4"
+        REUSE1="predictions/chr1.netmhcpan4"
     fi
     if [ "${2:-reuse-none}" != "reuse-none" ]
     then
-        REUSE_ARG+="--reuse-predictions" "$EXISTING_DATA/$OUT_DIR"
+        REUSE2="$EXISTING_DATA/$OUT_DIR"
     fi
 
     python run_predictors.py \
@@ -164,7 +166,7 @@ do
         --out "$OUT_DIR" \
         --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
         --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.nogpu.lsf \
-        $REUSE_ARG $EXTRA_ARGS
+        --reuse-predictions "$REUSE1" "$REUSE2" $EXTRA_ARGS
 done
 
 cp $SCRIPT_ABSOLUTE_PATH .
@@ -182,6 +184,6 @@ do
     echo "WARNING: already exists: $i . Moving to $DEST"
     mv $i $DEST
 done
-split -b 2000M "$RESULT" "$PARTS"
+split -b 2000m "$RESULT" "$PARTS"
 echo "Split into parts:"
 ls -lh "${PARTS}"*
diff --git a/downloads-generation/data_mass_spec_benchmark/cluster_submit_script_header.mssm_hpc.nogpu.lsf b/downloads-generation/data_mass_spec_benchmark/cluster_submit_script_header.mssm_hpc.nogpu.lsf
index 444d2c1157a6d40d33315c7ffe1ad5d5b808c62a..363d02be3d0fddf2a3a6e174b87e21325f973ac2 100644
--- a/downloads-generation/data_mass_spec_benchmark/cluster_submit_script_header.mssm_hpc.nogpu.lsf
+++ b/downloads-generation/data_mass_spec_benchmark/cluster_submit_script_header.mssm_hpc.nogpu.lsf
@@ -5,7 +5,7 @@
 #BSUB -R span[hosts=1] # one node
 #BSUB -n 1 # number of compute cores
 #BSUB -W 12:00 # walltime in HH:MM
-#BSUB -R rusage[mem=4000] # mb memory requested
+#BSUB -R rusage[mem=10000] # mb memory requested
 #BSUB -o {work_dir}/%J.stdout # output log (%J : JobID)
 #BSUB -eo {work_dir}/STDERR # error log
 #BSUB -L /bin/bash # Initialize the execution environment
diff --git a/downloads-generation/data_mass_spec_benchmark/run_predictors.py b/downloads-generation/data_mass_spec_benchmark/run_predictors.py
index ab1d6d1a9265cf7c06f3fc25fdfaecd14e874c52..7b58966f114f35309aea3b1c6f8a8ffc9ae6138a 100644
--- a/downloads-generation/data_mass_spec_benchmark/run_predictors.py
+++ b/downloads-generation/data_mass_spec_benchmark/run_predictors.py
@@ -75,7 +75,7 @@ parser.add_argument(
 parser.add_argument(
     "--reuse-predictions",
     metavar="DIR",
-    action="append",
+    nargs="*",
     help="Take predictions from indicated DIR instead of re-running them")
 
 add_local_parallelism_args(parser)
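
Note: switching --reuse-predictions from action="append" to nargs="*" means a
single occurrence of the flag now accepts zero or more directories, matching
how GENERATE.sh invokes it above (--reuse-predictions "$REUSE1" "$REUSE2",
either of which may be empty). A minimal standalone sketch of the parsing
behavior, separate from the patch:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--reuse-predictions", metavar="DIR", nargs="*")

    # One flag, several directories; empty strings survive parsing and are
    # filtered out later in run().
    args = parser.parse_args(["--reuse-predictions", "", "old/predictions"])
    print(args.reuse_predictions)  # ['', 'old/predictions']
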
@@ -99,21 +99,31 @@ def load_results(dirname, result_df=None):
         len(manifest_df),
         "columns")
 
+    # Backward-compatibility adjustments for old-style data; can be removed later.
+    if "kind" not in manifest_df.columns:
+        manifest_df["kind"] = "affinity"
+    if "col" not in manifest_df.columns:
+        manifest_df["col"] = manifest_df.allele + " " + manifest_df.kind
+
     if result_df is None:
         result_df = pandas.DataFrame(
             index=peptides, columns=manifest_df.col.values, dtype="float32")
         result_df[:] = numpy.nan
+        peptides_to_assign = peptides
+        mask = None
     else:
         manifest_df = manifest_df.loc[manifest_df.col.isin(result_df.columns)]
-        peptides = peptides[peptides.isin(result_df.index)]
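+        # Boolean mask in on-disk peptide order, used below to align each
+        # loaded array with the rows already present in result_df.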
+        mask = (peptides.isin(result_df.index)).values
+        peptides_to_assign = peptides[mask]
 
     print("Will load", len(peptides), "peptides and", len(manifest_df), "cols")
 
-    for _, row in manifest_df.iterrows():
+    for _, row in tqdm.tqdm(manifest_df.iterrows(), total=len(manifest_df)):
         with open(os.path.join(dirname, row.path), "rb") as fd:
-            result_df.loc[
-                peptides.values, row.col
-            ] = numpy.load(fd)['arr_0']
+            value = numpy.load(fd)['arr_0']
+            if mask is not None:
+                value = value[mask]
+            result_df.loc[peptides_to_assign, row.col] = value
 
     return result_df
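
Note: the mask introduced above fixes reuse when result_df is missing some of
the on-disk peptides: the old code filtered the peptide index but still
assigned the full-length saved array, which breaks as soon as any peptide is
filtered out. A small self-contained illustration with hypothetical peptides
and a hypothetical column name:

    import numpy
    import pandas

    peptides = pandas.Series(["SIINFEKL", "AAAWYLWEV", "GILGFVFTL"])
    result_df = pandas.DataFrame(
        index=["SIINFEKL", "GILGFVFTL"],
        columns=["HLA-A*02:01 affinity"],
        dtype="float32")

    saved = numpy.array([100.0, 50.0, 25.0])  # one value per on-disk peptide

    mask = peptides.isin(result_df.index).values  # [True, False, True]
    peptides_to_assign = peptides[mask]

    # Subsetting the saved array with the same mask keeps it aligned with
    # the peptides that remain.
    result_df.loc[peptides_to_assign, "HLA-A*02:01 affinity"] = saved[mask]
    print(result_df)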
 
@@ -222,10 +232,15 @@ def run(argv=sys.argv[1:]):
 
     if args.reuse_predictions:
         for dirname in args.reuse_predictions:
-            print("Loading predictions", dirname)
-            result_df = load_results(dirname, result_df)
-            print("Existing data filled %f%% entries" % (
-                result_df.notnull().values.mean()))
+            if not dirname:
+                continue  # ignore empty strings
+            if os.path.exists(dirname):
+                print("Loading predictions", dirname)
+                result_df = load_results(dirname, result_df)
+                print("Existing data filled %f%% entries" % (
+                    result_df.notnull().values.mean()))
+            else:
+                print("WARNING: skipping because does not exist", dirname)
 
         # We rerun any alleles that have nulls for any kind of value
         # (e.g. affinity, percentile rank, elution score).
@@ -306,6 +321,17 @@ def run(argv=sys.argv[1:]):
     for allele in alleles:
         allele_to_chunk_index_to_predictions[allele] = {}
 
+    last_write_time_per_column = {col: 0.0 for col in result_df.columns}
+
+    def write_col(col):
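+        # Write the current values of one result_df column to its .npz file
+        # under args.out, reporting the column's null percentage.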
+        out_path = os.path.join(
+            args.out, col_to_filename[col])
+        numpy.savez(out_path, result_df[col].values)
+        print(
+            "Wrote [%f%% null]:" % (
+                result_df[col].isnull().mean() * 100.0),
+            out_path)
+
     for (work_item_num, col_to_predictions) in tqdm.tqdm(
             results, total=len(work_items)):
         for (col, predictions) in col_to_predictions.items():
@@ -313,13 +339,13 @@ def run(argv=sys.argv[1:]):
                 work_items[work_item_num]['peptides'],
                 col
             ] = predictions
-            out_path = os.path.join(
-                args.out, col_to_filename[col])
-            numpy.savez(out_path, result_df[col].values)
-            print(
-                "Wrote [%f%% null]:" % (
-                    result_df[col].isnull().mean() * 100.0),
-                out_path)
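+            # Flush this column at most once every 180 seconds; a final
+            # unconditional write per column happens after the loop.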
+            if time.time() - last_write_time_per_column[col] > 180:
+                write_col(col)
+                last_write_time_per_column[col] = time.time()
+
+    print("Done processing. Final write for each column.")
+    for col in result_df.columns:
+        write_col(col)
 
     print("Overall null rate (should be 0): %f" % (
         100.0 * result_df.isnull().values.flatten().mean()))
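
Note: the throttled flush above assumes `time` is imported at the top of
run_predictors.py (the import is outside these hunks). The pattern itself,
sketched standalone with illustrative names; the unconditional per-column
write after the loop is what guarantees no updates are lost inside the
throttle window:

    import time

    FLUSH_INTERVAL_SECONDS = 180.0
    last_flush = {}  # column name -> time of last write

    def maybe_flush(col, write_fn):
        # Write a column at most once per interval; the caller must still
        # flush every column once after the work loop finishes.
        now = time.time()
        if now - last_flush.get(col, 0.0) > FLUSH_INTERVAL_SECONDS:
            write_fn(col)
            last_flush[col] = now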