From 1fcf39fc73702cd4a66b063ccf2282de5a3d8a65 Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Thu, 3 Oct 2019 16:57:48 -0400 Subject: [PATCH] fixes --- .../data_mass_spec_benchmark/GENERATE.sh | 25 +++++++++++++++++-- .../run_predictors.py | 21 +++++++++++----- 2 files changed, 38 insertions(+), 8 deletions(-) diff --git a/downloads-generation/data_mass_spec_benchmark/GENERATE.sh b/downloads-generation/data_mass_spec_benchmark/GENERATE.sh index bab80ffe..d67bb086 100755 --- a/downloads-generation/data_mass_spec_benchmark/GENERATE.sh +++ b/downloads-generation/data_mass_spec_benchmark/GENERATE.sh @@ -1,6 +1,27 @@ #!/bin/bash # -# GENERATE.sh <local|cluster> <reuse-all|reuse-none|reuse-predictions> +# This download includes predictions for MHCflurry and NetMHCpan 4.0 over a +# large number of peptides encompassing almost the full proteome. +# +# Usage: +# GENERATE.sh <local|cluster> <reuse-all|reuse-none|reuse-predictions|reuse-predictions-except-mhcflurry> +# +# The first choice listed above for each argument is the default. +# +# Meanings for these arguments: +# +# FIRST ARGUMENT: where to run +# local - run locally using NUM_JOBS cores. +# cluster - run on cluster. +# +# SECOND ARGUMENT: whether to reuse predictions from existing downloaded data +# reuse-all - reuse predictions and peptide / allele lists from existing +# downloaded data_mass_spec_benchmark. +# reuse-none - fully self-contained run; do not reuse anything. +# reuse-predictions - reuse predictions but not peptide or allele lists. Any +# new peptides not already included will be run. +# reuse-predictions-except-mhcflurry +# - Reuse predictions except for mhcflurry. # set -e set -x @@ -105,7 +126,7 @@ do then REUSE_ARG="--reuse-predictions predictions/chr1.mhcflurry.${kind}" fi - if [ "${2:-reuse-none}" != "reuse-none" ] + if [ "${2:-reuse-none}" != "reuse-none" ] && [ "${2:-reuse-none}" != "reuse-predictions-except-mhcflurry" ] then REUSE_ARG+="--reuse-predictions" "$EXISTING_DATA/$OUT_DIR" fi diff --git a/downloads-generation/data_mass_spec_benchmark/run_predictors.py b/downloads-generation/data_mass_spec_benchmark/run_predictors.py index a86ce0da..ab1d6d1a 100644 --- a/downloads-generation/data_mass_spec_benchmark/run_predictors.py +++ b/downloads-generation/data_mass_spec_benchmark/run_predictors.py @@ -81,6 +81,11 @@ parser.add_argument( add_local_parallelism_args(parser) add_cluster_parallelism_args(parser) +PREDICTOR_TO_COLS = { + "mhcflurry": ["affinity"], + "netmhcpan4": ["affinity", "percentile_rank", "elution_score"], +} + def load_results(dirname, result_df=None): peptides = pandas.read_csv( @@ -188,6 +193,7 @@ def run(argv=sys.argv[1:]): GLOBAL_DATA["predictor"] = args.predictor GLOBAL_DATA["args"] = args + GLOBAL_DATA["cols"] = PREDICTOR_TO_COLS[args.predictor] # Write peptide and allele lists to out dir. out_peptides = os.path.abspath(os.path.join(args.out, "peptides.csv")) @@ -196,7 +202,7 @@ def run(argv=sys.argv[1:]): manifest_df = [] for allele in alleles: - for col in ["affinity", "percentile_rank", "elution_score"]: + for col in PREDICTOR_TO_COLS[args.predictor]: manifest_df.append((allele, col)) manifest_df = pandas.DataFrame( manifest_df, columns=["allele", "kind"]) @@ -222,7 +228,7 @@ def run(argv=sys.argv[1:]): result_df.notnull().values.mean())) # We rerun any alleles have nulls for any kind of values - # (affinity, percentile rank, elution score). + # (e.g. affinity, percentile rank, elution score). print("Computing blocks.") start = time.time() blocks = blocks_of_ones(result_df.isnull().values) @@ -327,7 +333,8 @@ def run(argv=sys.argv[1:]): prediction_time / 60.0)) -def do_predictions_mhctools(work_item_num, peptides, alleles, constant_data=None): +def do_predictions_mhctools( + work_item_num, peptides, alleles, constant_data=None): # This may run on the cluster in a way that misses all top level imports, # so we have to re-import everything here. import time @@ -345,6 +352,8 @@ def do_predictions_mhctools(work_item_num, peptides, alleles, constant_data=None else: raise ValueError("Unsupported", predictor_name) + cols = constant_data['cols'] + start = time.time() df = predictor.predict_peptides_dataframe(peptides) print("Generated predictions for %d peptides x %d alleles in %0.2f sec." % ( @@ -352,9 +361,8 @@ def do_predictions_mhctools(work_item_num, peptides, alleles, constant_data=None results = {} for (allele, sub_df) in df.groupby("allele"): - for col in ["affinity", "percentile_rank", "elution_score"]: - results["%s %s" % (allele, col)] = sub_df[col].values.astype( - 'float32') + for col in cols: + results["%s %s" % (allele, col)] = sub_df[col].values.astype('float32') return (work_item_num, results) @@ -371,6 +379,7 @@ def do_predictions_mhcflurry(work_item_num, peptides, alleles, constant_data=Non args = constant_data['args'] assert args.predictor == "mhcflurry" + assert constant_data['cols'] == ["affinity"] predictor = Class1AffinityPredictor.load(args.mhcflurry_models_dir) -- GitLab