diff --git a/downloads-generation/data_mass_spec_benchmark/GENERATE.sh b/downloads-generation/data_mass_spec_benchmark/GENERATE.sh
index 7189ccb2f62f794ff0fa4c46848c42fababec0be..9c26a55a2331209598a3f92fb14b56207ffea6aa 100755
--- a/downloads-generation/data_mass_spec_benchmark/GENERATE.sh
+++ b/downloads-generation/data_mass_spec_benchmark/GENERATE.sh
@@ -117,6 +117,29 @@ do
         bzip2 proteome_peptides.$subset.csv
     fi
 
+    # Run netmhcpan4
+    OUT_DIR=predictions/${subset}.netmhcpan4
+    REUSE1=""
+    REUSE2=""
+    if [ "$subset" == "all" ]
+    then
+        REUSE1="predictions/chr1.netmhcpan4"
+    fi
+    if [ "${2:-reuse-none}" != "reuse-none" ]
+    then
+        REUSE2="$EXISTING_DATA"/$OUT_DIR
+    fi
+
+    python run_predictors.py \
+        proteome_peptides.$subset.csv.bz2 \
+        --predictor netmhcpan4 \
+        --chunk-size 10000 \
+        --allele $(cat alleles.txt) \
+        --out "$OUT_DIR" \
+        --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
+        --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.nogpu.lsf \
+        --reuse-predictions "$REUSE1" "$REUSE2" $EXTRA_ARGS
+
     # Run MHCflurry
     for kind in with_mass_spec no_mass_spec
     do
@@ -144,29 +167,6 @@ do
             --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.gpu.lsf \
             --reuse-predictions "$REUSE1" "$REUSE2" $EXTRA_ARGS
     done
-
-    # Run netmhcpan4
-    OUT_DIR=predictions/${subset}.netmhcpan4
-    REUSE1=""
-    REUSE2=""
-    if [ "$subset" == "all" ]
-    then
-        REUSE1="predictions/chr1.netmhcpan4"
-    fi
-    if [ "${2:-reuse-none}" != "reuse-none" ]
-    then
-        REUSE2="$EXISTING_DATA"/$OUT_DIR
-    fi
-
-    python run_predictors.py \
-        proteome_peptides.$subset.csv.bz2 \
-        --predictor netmhcpan4 \
-        --chunk-size 10000 \
-        --allele $(cat alleles.txt) \
-        --out "$OUT_DIR" \
-        --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
-        --cluster-script-prefix-path $SCRIPT_DIR/cluster_submit_script_header.mssm_hpc.nogpu.lsf \
-        --reuse-predictions "$REUSE1" "$REUSE2" $EXTRA_ARGS
 done
 
 cp $SCRIPT_ABSOLUTE_PATH .
diff --git a/downloads-generation/data_mass_spec_benchmark/run_predictors.py b/downloads-generation/data_mass_spec_benchmark/run_predictors.py
index 3c1a747f9f7a0f26b787194c64a3b8b837b0cc48..4dca1d20732ba0d758693fbffd4f0436ad9efd89 100644
--- a/downloads-generation/data_mass_spec_benchmark/run_predictors.py
+++ b/downloads-generation/data_mass_spec_benchmark/run_predictors.py
@@ -244,7 +244,8 @@ def run(argv=sys.argv[1:]):
 
         # We rerun any alleles have nulls for any kind of values
         # (e.g. affinity, percentile rank, elution score).
-        is_null_matrix = pandas.DataFrame(columns=alleles, dtype="int8")
+        is_null_matrix = pandas.DataFrame(
+            columns=alleles, index=result_df.index, dtype="int8")
         for (allele, sub_df) in manifest_df.groupby("allele"):
             is_null_matrix[allele] = result_df[sub_df.col.values].isnull().any(1)
         print("Fraction null", is_null_matrix.values.mean())