diff --git a/downloads-generation/data_mass_spec_benchmark/GENERATE.sh b/downloads-generation/data_mass_spec_benchmark/GENERATE.sh index fbb1db71e6985a94a3beaebb31bf6d36b766ca72..b798aa58ab74d33e4446715554751d677e83678d 100755 --- a/downloads-generation/data_mass_spec_benchmark/GENERATE.sh +++ b/downloads-generation/data_mass_spec_benchmark/GENERATE.sh @@ -137,7 +137,7 @@ do proteome_peptides.$subset.csv.bz2 \ --result-dtype "float16" \ --predictor netmhcpan4-$kind \ - --chunk-size 10000 \ + --chunk-size 5000 \ --allele $(cat alleles.txt) \ --out "$OUT_DIR" \ --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \ diff --git a/downloads-generation/data_mass_spec_benchmark/run_predictors.py b/downloads-generation/data_mass_spec_benchmark/run_predictors.py index af6a7868ed0e70ac54fe51cd93f4be6a60c90ca4..482ed8ab58d0ce78ee059e7985a73248109bb58f 100644 --- a/downloads-generation/data_mass_spec_benchmark/run_predictors.py +++ b/downloads-generation/data_mass_spec_benchmark/run_predictors.py @@ -311,8 +311,6 @@ def run(argv=sys.argv[1:]): for allele in alleles: allele_to_chunk_index_to_predictions[allele] = {} - last_write_time_per_column = dict((col, 0.0) for col in result_df.columns) - def write_col(col): out_path = os.path.join( args.out, col_to_filename[col]) @@ -322,6 +320,13 @@ def run(argv=sys.argv[1:]): result_df[col].isnull().mean() * 100.0), out_path) + print("Writing all columns.") + last_write_time_per_column = {} + for col in result_df.columns: + write_col(col) + last_write_time_per_column[col] = time.time() + print("Done writing all columns. Reading results.") + for worker_results in tqdm.tqdm(results, total=len(work_items)): for (work_item_num, col_to_predictions) in worker_results: for (col, predictions) in col_to_predictions.items():