diff --git a/downloads-generation/data_mass_spec_benchmark/GENERATE.sh b/downloads-generation/data_mass_spec_benchmark/GENERATE.sh
index 9c26a55a2331209598a3f92fb14b56207ffea6aa..f42013b3761e01b7dc439ace6bdef9325da4dc12 100755
--- a/downloads-generation/data_mass_spec_benchmark/GENERATE.sh
+++ b/downloads-generation/data_mass_spec_benchmark/GENERATE.sh
@@ -97,7 +97,9 @@ fi
 
 # Write out and process peptides.
 # First just chr1 peptides, then all peptides.
-for subset in chr1 all
+# TODO: switch this back
+#for subset in chr1 all
+for subset in all chr1
 do
     if [ "$2" == "reuse-all" ]
     then
@@ -123,7 +125,9 @@ do
     REUSE2=""
     if [ "$subset" == "all" ]
     then
-        REUSE1="predictions/chr1.netmhcpan4"
+        #REUSE1="predictions/chr1.netmhcpan4"
+        # TODO: switch this back
+        REUSE1="$EXISTING_DATA"/predictions/chr1.netmhcpan4
     fi
     if [ "${2:-reuse-none}" != "reuse-none" ]
     then
@@ -132,6 +136,7 @@ do
 
     python run_predictors.py \
        proteome_peptides.$subset.csv.bz2 \
+        --result-dtype "float16" \
        --predictor netmhcpan4 \
        --chunk-size 10000 \
        --allele $(cat alleles.txt) \
@@ -148,7 +153,9 @@ do
        REUSE2=""
        if [ "$subset" == "all" ]
        then
-            REUSE1="predictions/chr1.mhcflurry.${kind}"
+            #REUSE1="predictions/chr1.mhcflurry.${kind}"
+            # TODO: switch this back
+            REUSE1="$EXISTING_DATA"/predictions/chr1.mhcflurry.${kind}
        fi
        if [ "${2:-reuse-none}" != "reuse-none" ] && [ "${2:-reuse-none}" != "reuse-predictions-except-mhcflurry" ]
        then
@@ -157,6 +164,7 @@ do
 
        python run_predictors.py \
            proteome_peptides.${subset}.csv.bz2 \
+            --result-dtype "float16" \
            --predictor mhcflurry \
            --chunk-size 500000 \
            --mhcflurry-batch-size 65536 \
diff --git a/downloads-generation/data_mass_spec_benchmark/run_predictors.py b/downloads-generation/data_mass_spec_benchmark/run_predictors.py
index b2b9103e61f4898940de0d841727ce2ba8347e75..d5eabd07407c8b2c99d0dd32b1a4da05e1ebab6f 100644
--- a/downloads-generation/data_mass_spec_benchmark/run_predictors.py
+++ b/downloads-generation/data_mass_spec_benchmark/run_predictors.py
@@ -77,6 +77,10 @@ parser.add_argument(
     metavar="DIR",
     nargs="*",
     help="Take predictions from indicated DIR instead of re-running them")
+parser.add_argument(
+    "--result-dtype",
+    default="float32",
+    help="Numpy dtype of result. Default: %(default)s.")
 
 add_local_parallelism_args(parser)
 add_cluster_parallelism_args(parser)
@@ -87,7 +91,7 @@ PREDICTOR_TO_COLS = {
 }
 
 
-def load_results(dirname, result_df=None):
+def load_results(dirname, result_df=None, dtype="float32"):
     peptides = pandas.read_csv(
         os.path.join(dirname, "peptides.csv")).peptide
     manifest_df = pandas.read_csv(os.path.join(dirname, "alleles.csv"))
@@ -107,7 +111,9 @@
 
     if result_df is None:
         result_df = pandas.DataFrame(
-            index=peptides, columns=manifest_df.col.values, dtype="float32")
+            index=peptides,
+            columns=manifest_df.col.values,
+            dtype=dtype)
         result_df[:] = numpy.nan
         peptides_to_assign = peptides
         mask = None
@@ -227,23 +233,28 @@ def run(argv=sys.argv[1:]):
     print("Wrote: ", out_manifest)
 
     result_df = pandas.DataFrame(
-        index=peptides, columns=manifest_df.col.values, dtype="float32")
+        index=peptides, columns=manifest_df.col.values, dtype=args.result_dtype)
     result_df[:] = numpy.nan
 
     if args.reuse_predictions:
+        # Allocating this here to hit any memory errors as early as possible.
+        is_null_matrix = pandas.DataFrame(
+            columns=alleles,
+            index=result_df.index,
+            dtype="int8")
+
         for dirname in args.reuse_predictions:
             if not dirname:
                 continue  # ignore empty strings
             if os.path.exists(dirname):
                 print("Loading predictions", dirname)
-                result_df = load_results(dirname, result_df)
+                result_df = load_results(
+                    dirname, result_df, dtype=args.result_dtype)
             else:
                 print("WARNING: skipping because does not exist", dirname)
 
         # We rerun any alleles have nulls for any kind of values
         # (e.g. affinity, percentile rank, elution score).
-        is_null_matrix = pandas.DataFrame(
-            columns=alleles, index=result_df.index, dtype="int8")
         for (allele, sub_df) in manifest_df.groupby("allele"):
             is_null_matrix[allele] = result_df[sub_df.col.values].isnull().any(1)
         print("Fraction null", is_null_matrix.values.mean())
@@ -424,7 +435,8 @@ def do_predictions_mhctools(work_item_dicts, constant_data=None):
         for (allele, sub_df) in df.groupby("allele"):
             for col in cols:
                 result["%s %s" % (allele, col)] = (
-                    sub_df[col].values.astype('float32'))
+                    sub_df[col].values.astype(
+                        constant_data['args'].result_dtype))
 
     return results
 
@@ -471,7 +483,7 @@ def do_predictions_mhcflurry(work_item_dicts, constant_data=None):
                throw=False,
                model_kwargs={
                    'batch_size': args.mhcflurry_batch_size,
-                }).astype('float32')
+                }).astype(constant_data['args'].result_dtype)
        print("Done predicting in", time.time() - start, "sec")
 
    return results
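
For context on why the new `--result-dtype` knob (and the early `is_null_matrix` allocation) matters: the full-proteome run materializes a dense peptides x (alleles x prediction-columns) result matrix up front, so halving the element size roughly halves peak memory. Below is a minimal back-of-the-envelope sketch; the matrix dimensions are hypothetical placeholders, not taken from the repo (the real counts come from `proteome_peptides.all.csv.bz2` and `alleles.txt`).

```python
import numpy

# Hypothetical sizes for illustration only; the actual run derives these
# from proteome_peptides.all.csv.bz2 and alleles.txt.
num_peptides = 50_000_000
num_cols = 300  # alleles x prediction columns (affinity, percentile rank, ...)

for dtype in ["float32", "float16"]:
    itemsize = numpy.dtype(dtype).itemsize  # bytes per matrix element
    gib = num_peptides * num_cols * itemsize / 2**30
    print("%s: %.0f GiB for the dense result matrix" % (dtype, gib))
```

Moving the `is_null_matrix` allocation ahead of the reuse-loading loop follows the same fail-fast reasoning as the in-diff comment: if the int8 matrix cannot be allocated, the job dies immediately rather than after the slow loading of existing predictions.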