From 339e7caf02402aeb7bf583fee1c139003b511ff8 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Sun, 26 Jan 2020 22:20:10 -0500
Subject: [PATCH] update

---
 .../data_evaluation/GENERATE.sh               |  18 ++-
 .../data_evaluation/join_with_precomputed.py  | 128 ++++++++++++++++++
 2 files changed, 145 insertions(+), 1 deletion(-)
 create mode 100644 downloads-generation/data_evaluation/join_with_precomputed.py

diff --git a/downloads-generation/data_evaluation/GENERATE.sh b/downloads-generation/data_evaluation/GENERATE.sh
index 4f1cb778..a004431b 100755
--- a/downloads-generation/data_evaluation/GENERATE.sh
+++ b/downloads-generation/data_evaluation/GENERATE.sh
@@ -40,6 +40,7 @@ cd $SCRATCH_DIR/$DOWNLOAD_NAME
 
 export OMP_NUM_THREADS=1
 export PYTHONUNBUFFERED=1
+export MHCFLURRY_DEFAULT_PREDICT_BATCH_SIZE=16384
 
 if [ "$2" == "continue-incomplete" ] && [ -f "hits_with_tpm.csv.bz2" ]
 then
@@ -67,7 +68,6 @@ else
         --only-format MONOALLELIC \
         --out "$(pwd)/benchmark.monoallelic.csv"
     bzip2 -f benchmark.monoallelic.csv
-    rm -f benchmark.monoallelic.predictions.csv.bz2
 fi
 
 ### GENERATE BENCHMARK: MULTIALLELIC
@@ -174,6 +174,22 @@ else
     echo bzip2 -f "$(pwd)/benchmark.multiallelic.presentation_without_flanks.csv" >> commands/multiallelic.presentation_without_flanks.sh
 fi
 
+### PRECOMPUTED ####
+for variant in netmhcpan4.ba netmhcpan4.el mixmhcpred
+do
+    if [ "$2" == "continue-incomplete" ] && [ -f "benchmark.multiallelic.${variant}.csv.bz2" ]
+    then
+        echo "Reusing existing multiallelic ${variant}"
+    else
+        echo time python join_with_precomputed.py \
+            \""$(pwd)/benchmark.multiallelic.csv.bz2"\" \
+            \""$(mhcflurry-downloads path data_mass_spec_benchmark)/predictions/all.${variant}"\" \
+            ${variant} \
+            --out "$(pwd)/benchmark.multiallelic.${variant}.csv" >> commands/multiallelic.${variant}.sh
+        echo bzip2 -f "$(pwd)/benchmark.multiallelic.${variant}.csv" >> commands/multiallelic.${variant}.sh
+    fi
+done
+
 ls -lh commands
 
 if [ "$1" != "cluster" ]
diff --git a/downloads-generation/data_evaluation/join_with_precomputed.py b/downloads-generation/data_evaluation/join_with_precomputed.py
new file mode 100644
index 00000000..12ba4d92
--- /dev/null
+++ b/downloads-generation/data_evaluation/join_with_precomputed.py
@@ -0,0 +1,128 @@
+"""
+Join benchmark with precomputed predictions.
+"""
+import sys
+import argparse
+import os
+import numpy
+import collections
+
+import pandas
+import tqdm
+
+import mhcflurry
+from mhcflurry.downloads import get_path
+
+parser = argparse.ArgumentParser(usage=__doc__)
+
+parser.add_argument(
+    "benchmark")
+parser.add_argument(
+    "precomputed_data")
+parser.add_argument(
+    "predictors",
+    nargs="+",
+    choices=("netmhcpan4.ba", "netmhcpan4.el", "mixmhcpred"))
+parser.add_argument(
+    "--out",
+    metavar="CSV",
+    required=True,
+    help="File to write")
+
+
+def load_results(dirname, result_df=None, columns=None):
+    peptides = pandas.read_csv(os.path.join(dirname, "peptides.csv")).peptide
+    manifest_df = pandas.read_csv(os.path.join(dirname, "alleles.csv"))
+
+    print("Loading results. Existing data has", len(peptides), "peptides and",
+        len(manifest_df), "columns")
+
+    if columns is None:
+        columns = manifest_df.col.values
+
+    if result_df is None:
+        result_df = pandas.DataFrame(index=peptides, columns=columns,
+            dtype="float32")
+        result_df[:] = numpy.nan
+        peptides_to_assign = peptides
+        mask = None
+    else:
+        mask = (peptides.isin(result_df.index)).values
+        peptides_to_assign = peptides[mask]
+
+    manifest_df = manifest_df.loc[manifest_df.col.isin(result_df.columns)]
+
+    print("Will load", len(peptides), "peptides and", len(manifest_df), "cols")
+
+    for _, row in tqdm.tqdm(manifest_df.iterrows(), total=len(manifest_df)):
+        with open(os.path.join(dirname, row.path), "rb") as fd:
+            value = numpy.load(fd)['arr_0']
+            if mask is not None:
+                value = value[mask]
+            result_df.loc[peptides_to_assign, row.col] = value
+
+    return result_df
+
+
+def run():
+    args = parser.parse_args(sys.argv[1:])
+    df = pandas.read_csv(args.benchmark)
+
+    df["alleles"] = df.hla.str.split()
+
+    peptides = df.peptide.unique()
+    alleles = set()
+    for some in df.hla.unique():
+        alleles.update(some.split())
+
+    predictions_dfs = {}
+
+    if 'netmhcpan4.ba' in args.predictors:
+        predictions_dfs['netmhcpan4.ba'] = load_results(
+            get_path("data_mass_spec_benchmark", "predictions/all.netmhcpan4.ba"),
+            result_df=pandas.DataFrame(
+                index=peptides,
+                columns=["%s affinity" % a for a in alleles])).rename(
+                    columns=lambda s: s.replace("affinity", "").strip())
+        predictions_dfs['netmhcpan4.ba'] *= -1
+
+    if 'netmhcpan4.el' in args.predictors:
+        predictions_dfs['netmhcpan4.el'] = load_results(
+            get_path("data_mass_spec_benchmark", "predictions/all.netmhcpan4.el"),
+            result_df=pandas.DataFrame(
+                index=peptides,
+                columns=["%s score" % a for a in alleles])).rename(
+                    columns=lambda s: s.replace("score", "").strip())
+
+    if 'mixmhcpred' in args.predictors:
+        predictions_dfs['mixmhcpred'] = load_results(
+            get_path("data_mass_spec_benchmark", "predictions/all.mixmhcpred"),
+            result_df=pandas.DataFrame(
+                index=peptides,
+                columns=["%s score" % a for a in alleles])).rename(
+                    columns=lambda s: s.replace("score", "").strip())
+
+    skip_experiments = set()
+
+    for hla_text, sub_df in tqdm.tqdm(df.groupby("hla"), total=df.hla.nunique()):
+        hla = hla_text.split()
+        for (name, precomputed_df) in predictions_dfs.items():
+            df.loc[sub_df.index, name] = numpy.nan
+            prediction_df = pandas.DataFrame(index=sub_df.peptide)
+            for allele in hla:
+                if allele not in precomputed_df.columns or precomputed_df[allele].isnull().all():
+                    print(sub_df.sample_id.unique(), hla)
+                    skip_experiments.update(sub_df.sample_id.unique())
+                prediction_df[allele] = precomputed_df.loc[sub_df.peptide, allele].values
+            df.loc[sub_df.index, name] = prediction_df.max(1, skipna=False).values
+            df.loc[sub_df.index, name + " allele"] = prediction_df.idxmax(1, skipna=False).values
+
+    print("Skip experiments", skip_experiments)
+    print("results")
+    print(df)
+
+    df.to_csv(args.out)
+    print("Wrote", args.out)
+
+if __name__ == '__main__':
+    run()
--
GitLab
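Note (not part of the patch): load_results() above assumes each precomputed predictions directory (e.g. predictions/all.netmhcpan4.el under the data_mass_spec_benchmark download) contains a peptides.csv with a peptide column, an alleles.csv manifest with col and path columns, and one compressed numpy file per manifest row whose 'arr_0' array is aligned with peptides.csv. The following is a minimal, illustrative sketch for inspecting such a directory under that assumed layout; the inspect_precomputed.py name and inspect() helper are hypothetical and are not introduced by this commit.

# inspect_precomputed.py -- illustrative sketch only, not part of this patch.
# Assumes the directory layout read by load_results() in
# join_with_precomputed.py: peptides.csv, an alleles.csv manifest with
# "col" and "path" columns, and one .npz file per manifest row whose
# 'arr_0' array has one value per peptide (NaN where no prediction exists).
import os
import sys

import numpy
import pandas


def inspect(dirname, max_cols=5):
    peptides = pandas.read_csv(os.path.join(dirname, "peptides.csv")).peptide
    manifest_df = pandas.read_csv(os.path.join(dirname, "alleles.csv"))
    print("Peptides:", len(peptides), "Columns:", len(manifest_df))
    for _, row in manifest_df.head(max_cols).iterrows():
        with open(os.path.join(dirname, row.path), "rb") as fd:
            values = numpy.load(fd)["arr_0"]
        print(
            row.col,
            "length:", len(values),
            "fraction missing:", float(numpy.isnan(values).mean()))


if __name__ == "__main__":
    # Example: python inspect_precomputed.py \
    #     "$(mhcflurry-downloads path data_mass_spec_benchmark)/predictions/all.netmhcpan4.el"
    inspect(sys.argv[1])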