diff --git a/downloads-generation/data_mass_spec_benchmark/GENERATE.sh b/downloads-generation/data_mass_spec_benchmark/GENERATE.sh index d67bb086e86162f91704b630b2a49518ec959c78..f6dd0a692b18bf731a06e008e743e8a717c9168f 100755 --- a/downloads-generation/data_mass_spec_benchmark/GENERATE.sh +++ b/downloads-generation/data_mass_spec_benchmark/GENERATE.sh @@ -76,8 +76,8 @@ REFERENCES_DIR=$(mhcflurry-downloads path data_references) if [ "${2:-reuse-none}" != "reuse-none" ] then - EXISTING_DATA=$(mhcflurry-downloads path $DOWNLOAD_NAME) - echo "Will reuse data from $REFERENCES_DIR" + EXISTING_DATA="$(mhcflurry-downloads path $DOWNLOAD_NAME)" + echo "Will reuse data from $EXISTING_DATA" else EXISTING_DATA="" echo "Will NOT reuse any data" @@ -121,14 +121,15 @@ do for kind in with_mass_spec no_mass_spec do OUT_DIR=predictions/${subset}.mhcflurry.${kind} - REUSE_ARG="" + REUSE1="" + REUSE2="" if [ "$subset" == "all" ] then - REUSE_ARG="--reuse-predictions predictions/chr1.mhcflurry.${kind}" + REUSE1="predictions/chr1.mhcflurry.${kind}" fi if [ "${2:-reuse-none}" != "reuse-none" ] && [ "${2:-reuse-none}" != "reuse-predictions-except-mhcflurry" ] then - REUSE_ARG+="--reuse-predictions" "$EXISTING_DATA/$OUT_DIR" + REUSE2="$EXISTING_DATA"/$OUT_DIR fi python run_predictors.py \ @@ -141,19 +142,20 @@ do --out "$OUT_DIR" \ --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \ --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.gpu.lsf \ - $REUSE_ARG $EXTRA_ARGS + --reuse-predictions "$REUSE1" "$REUSE2" $EXTRA_ARGS done # Run netmhcpan4 OUT_DIR=predictions/${subset}.netmhcpan4 - REUSE_ARG="" + REUSE1="" + REUSE2="" if [ "$subset" == "all" ] then - REUSE_ARG="--reuse-predictions predictions/chr1.netmhcpan4" + REUSE1="predictions/chr1.netmhcpan4" fi if [ "${2:-reuse-none}" != "reuse-none" ] then - REUSE_ARG+="--reuse-predictions" "$EXISTING_DATA/$OUT_DIR" + REUSE2="$EXISTING_DATA"/$OUT_DIR fi python run_predictors.py \ @@ -164,7 +166,7 @@ do --out "$OUT_DIR" \ --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \ --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.nogpu.lsf \ - $REUSE_ARG $EXTRA_ARGS + --reuse-predictions "$REUSE1" "$REUSE2" $EXTRA_ARGS done cp $SCRIPT_ABSOLUTE_PATH . @@ -182,6 +184,6 @@ do echo "WARNING: already exists: $i . Moving to $DEST" mv $i $DEST done -split -b 2000M "$RESULT" "$PARTS" +split -b 2000m "$RESULT" "$PARTS" echo "Split into parts:" ls -lh "${PARTS}"* diff --git a/downloads-generation/data_mass_spec_benchmark/cluster_submit_script_header.mssm_hpc.nogpu.lsf b/downloads-generation/data_mass_spec_benchmark/cluster_submit_script_header.mssm_hpc.nogpu.lsf index 444d2c1157a6d40d33315c7ffe1ad5d5b808c62a..363d02be3d0fddf2a3a6e174b87e21325f973ac2 100644 --- a/downloads-generation/data_mass_spec_benchmark/cluster_submit_script_header.mssm_hpc.nogpu.lsf +++ b/downloads-generation/data_mass_spec_benchmark/cluster_submit_script_header.mssm_hpc.nogpu.lsf @@ -5,7 +5,7 @@ #BSUB -R span[hosts=1] # one node #BSUB -n 1 # number of compute cores #BSUB -W 12:00 # walltime in HH:MM -#BSUB -R rusage[mem=4000] # mb memory requested +#BSUB -R rusage[mem=10000] # mb memory requested #BSUB -o {work_dir}/%J.stdout # output log (%J : JobID) #BSUB -eo {work_dir}/STDERR # error log #BSUB -L /bin/bash # Initialize the execution environment diff --git a/downloads-generation/data_mass_spec_benchmark/run_predictors.py b/downloads-generation/data_mass_spec_benchmark/run_predictors.py index ab1d6d1a9265cf7c06f3fc25fdfaecd14e874c52..7b58966f114f35309aea3b1c6f8a8ffc9ae6138a 100644 --- a/downloads-generation/data_mass_spec_benchmark/run_predictors.py +++ b/downloads-generation/data_mass_spec_benchmark/run_predictors.py @@ -75,7 +75,7 @@ parser.add_argument( parser.add_argument( "--reuse-predictions", metavar="DIR", - action="append", + nargs="*", help="Take predictions from indicated DIR instead of re-running them") add_local_parallelism_args(parser) @@ -99,21 +99,31 @@ def load_results(dirname, result_df=None): len(manifest_df), "columns") + # Make adjustments for old style data. Can be removed later. + if "kind" not in manifest_df.columns: + manifest_df["kind"] = "affinity" + if "col" not in manifest_df.columns: + manifest_df["col"] = manifest_df.allele + " " + manifest_df.kind + if result_df is None: result_df = pandas.DataFrame( index=peptides, columns=manifest_df.col.values, dtype="float32") result_df[:] = numpy.nan + peptides_to_assign = peptides + mask = None else: manifest_df = manifest_df.loc[manifest_df.col.isin(result_df.columns)] - peptides = peptides[peptides.isin(result_df.index)] + mask = (peptides.isin(result_df.index)).values + peptides_to_assign = peptides[mask] print("Will load", len(peptides), "peptides and", len(manifest_df), "cols") - for _, row in manifest_df.iterrows(): + for _, row in tqdm.tqdm(manifest_df.iterrows(), total=len(manifest_df)): with open(os.path.join(dirname, row.path), "rb") as fd: - result_df.loc[ - peptides.values, row.col - ] = numpy.load(fd)['arr_0'] + value = numpy.load(fd)['arr_0'] + if mask is not None: + value = value[mask] + result_df.loc[peptides_to_assign, row.col] = value return result_df @@ -222,10 +232,15 @@ def run(argv=sys.argv[1:]): if args.reuse_predictions: for dirname in args.reuse_predictions: - print("Loading predictions", dirname) - result_df = load_results(dirname, result_df) - print("Existing data filled %f%% entries" % ( - result_df.notnull().values.mean())) + if not dirname: + continue # ignore empty strings + if os.path.exists(dirname): + print("Loading predictions", dirname) + result_df = load_results(dirname, result_df) + print("Existing data filled %f%% entries" % ( + result_df.notnull().values.mean())) + else: + print("WARNING: skipping because does not exist", dirname) # We rerun any alleles have nulls for any kind of values # (e.g. affinity, percentile rank, elution score). @@ -306,6 +321,17 @@ def run(argv=sys.argv[1:]): for allele in alleles: allele_to_chunk_index_to_predictions[allele] = {} + last_write_time_per_column = dict((col, 0.0) for col in result_df.columns) + + def write_col(col): + out_path = os.path.join( + args.out, col_to_filename[col]) + numpy.savez(out_path, result_df[col].values) + print( + "Wrote [%f%% null]:" % ( + result_df[col].isnull().mean() * 100.0), + out_path) + for (work_item_num, col_to_predictions) in tqdm.tqdm( results, total=len(work_items)): for (col, predictions) in col_to_predictions.items(): @@ -313,13 +339,13 @@ def run(argv=sys.argv[1:]): work_items[work_item_num]['peptides'], col ] = predictions - out_path = os.path.join( - args.out, col_to_filename[col]) - numpy.savez(out_path, result_df[col].values) - print( - "Wrote [%f%% null]:" % ( - result_df[col].isnull().mean() * 100.0), - out_path) + if time.time() - last_write_time_per_column[col] > 180: + write_col(col) + last_write_time_per_column[col] = time.time() + + print("Done processing. Final write for each column.") + for col in result_df.columns: + write_col(col) print("Overall null rate (should be 0): %f" % ( 100.0 * result_df.isnull().values.flatten().mean()))