From b8e76a1fde6c57385198120380212e1ef950a9a3 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Sun, 13 Oct 2019 14:59:33 -0400
Subject: [PATCH] Halve netmhcpan4 chunk size; write all result columns before reading results

---
 .../data_mass_spec_benchmark/GENERATE.sh                 | 2 +-
 .../data_mass_spec_benchmark/run_predictors.py           | 9 +++++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/downloads-generation/data_mass_spec_benchmark/GENERATE.sh b/downloads-generation/data_mass_spec_benchmark/GENERATE.sh
index fbb1db71..b798aa58 100755
--- a/downloads-generation/data_mass_spec_benchmark/GENERATE.sh
+++ b/downloads-generation/data_mass_spec_benchmark/GENERATE.sh
@@ -137,7 +137,7 @@ do
             proteome_peptides.$subset.csv.bz2 \
             --result-dtype "float16" \
             --predictor netmhcpan4-$kind \
-            --chunk-size 10000 \
+            --chunk-size 5000 \
             --allele $(cat alleles.txt) \
             --out "$OUT_DIR" \
             --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
diff --git a/downloads-generation/data_mass_spec_benchmark/run_predictors.py b/downloads-generation/data_mass_spec_benchmark/run_predictors.py
index af6a7868..482ed8ab 100644
--- a/downloads-generation/data_mass_spec_benchmark/run_predictors.py
+++ b/downloads-generation/data_mass_spec_benchmark/run_predictors.py
@@ -311,8 +311,6 @@ def run(argv=sys.argv[1:]):
     for allele in alleles:
         allele_to_chunk_index_to_predictions[allele] = {}
 
-    last_write_time_per_column = dict((col, 0.0) for col in result_df.columns)
-
     def write_col(col):
         out_path = os.path.join(
             args.out, col_to_filename[col])
@@ -322,6 +320,13 @@ def run(argv=sys.argv[1:]):
                 result_df[col].isnull().mean() * 100.0),
             out_path)
 
+    print("Writing all columns.")
+    last_write_time_per_column = {}
+    for col in result_df.columns:
+        write_col(col)
+        last_write_time_per_column[col] = time.time()
+    print("Done writing all columns. Reading results.")
+
     for worker_results in tqdm.tqdm(results, total=len(work_items)):
         for (work_item_num, col_to_predictions) in worker_results:
             for (col, predictions) in col_to_predictions.items():
-- 
GitLab