From 66c3c43b8f80b3b38b441c83b5355176c6b7183b Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Sun, 26 Jan 2020 14:04:43 -0500 Subject: [PATCH] fix --- .../data_evaluation/GENERATE.sh | 4 +- .../make_benchmark.py | 74 ++++++++++++++----- 2 files changed, 57 insertions(+), 21 deletions(-) diff --git a/downloads-generation/data_evaluation/GENERATE.sh b/downloads-generation/data_evaluation/GENERATE.sh index 010139c7..4f1cb778 100755 --- a/downloads-generation/data_evaluation/GENERATE.sh +++ b/downloads-generation/data_evaluation/GENERATE.sh @@ -63,6 +63,7 @@ else --hits "$(pwd)/hits_with_tpm.csv.bz2" \ --proteome-peptides "$(mhcflurry-downloads path data_mass_spec_benchmark)/proteome_peptides.all.csv.bz2" \ --decoys-per-hit 99 \ + --exclude-train-data "$(mhcflurry-downloads path models_class1_pan_variants)/models.no_additional_ms/train_data.csv.bz2" \ --only-format MONOALLELIC \ --out "$(pwd)/benchmark.monoallelic.csv" bzip2 -f benchmark.monoallelic.csv @@ -77,7 +78,8 @@ else cp $SCRIPT_DIR/make_benchmark.py . time python make_benchmark.py \ --hits "$(pwd)/hits_with_tpm.csv.bz2" \ - --proteome-peptides \""$(mhcflurry-downloads path data_mass_spec_benchmark)/proteome_peptides.all.csv.bz2"\" \ + --proteome-peptides "$(mhcflurry-downloads path data_mass_spec_benchmark)/proteome_peptides.all.csv.bz2" \ + --exclude-train-data "$(mhcflurry-downloads path models_class1_pan)/models.combined/train_data.csv.bz2" \ --decoys-per-hit 99 \ --only-format MULTIALLELIC \ --out "$(pwd)/benchmark.multiallelic.csv" diff --git a/downloads-generation/models_class1_presentation/make_benchmark.py b/downloads-generation/models_class1_presentation/make_benchmark.py index 6d094eb4..7db98598 100644 --- a/downloads-generation/models_class1_presentation/make_benchmark.py +++ b/downloads-generation/models_class1_presentation/make_benchmark.py @@ -5,6 +5,7 @@ import sys import argparse import os import numpy +import collections import pandas import tqdm @@ -39,6 +40,11 @@ parser.add_argument( nargs="+", default=[], help="Include only the given PMID") +parser.add_argument( + "--exclude-train-data", + nargs="+", + default=[], + help="Remove hits and decoys included in the given training data") parser.add_argument( "--only-format", choices=("MONOALLELIC", "MULTIALLELIC"), @@ -64,6 +70,7 @@ def run(): (hit_df.peptide.str.match("^[%s]+$" % "".join( mhcflurry.amino_acid.COMMON_AMINO_ACIDS))) ] + hit_df['alleles'] = hit_df.hla.str.split().map(tuple) print("Loaded hits from %d samples" % hit_df.sample_id.nunique()) if args.only_format: hit_df = hit_df.loc[hit_df.format == args.only_format].copy() @@ -92,6 +99,31 @@ def run(): hit_df = new_hit_df.copy() print("Subselected by pmid to %d samples" % hit_df.sample_id.nunique()) + allele_to_excluded_peptides = collections.defaultdict(set) + for train_dataset in args.exclude_train_data: + print("Excluding hits from", train_dataset) + train_df = pandas.read_csv(train_dataset) + for (allele, peptides) in train_df.groupby("allele").peptide.unique().iteritems(): + allele_to_excluded_peptides[allele].update(peptides) + train_counts = train_df.groupby( + ["allele", "peptide"]).measurement_value.count().to_dict() + hit_no_train = hit_df.loc[ + [ + not any([ + train_counts.get((allele, row.peptide)) + for allele in row.alleles + ]) + for _, row in tqdm.tqdm(hit_df.iterrows(), total=len(hit_df))] + ] + print( + "Excluding hits from", + train_dataset, + "reduced dataset from", + len(hit_df), + "to", + len(hit_no_train)) + hit_df = hit_no_train + sample_table = hit_df.drop_duplicates("sample_id").set_index("sample_id") grouped = hit_df.groupby("sample_id").nunique() for col in sample_table.columns: @@ -112,26 +144,13 @@ def run(): all_peptides_by_length = dict(iter(all_peptides_df.groupby("length"))) - columns_to_keep = [ - "hit_id", - "protein_accession", - "n_flank", - "c_flank", - "peptide", - "sample_id", - "affinity_prediction", - "hit", - ] - print("Selecting decoys.") lengths = [8, 9, 10, 11] result_df = [] for sample_id, sub_df in tqdm.tqdm( - hit_df.loc[ - (hit_df.peptide.str.len() <= 11) & (hit_df.peptide.str.len() >= 8) - ].groupby("sample_id"), total=hit_df.sample_id.nunique()): + hit_df.groupby("sample_id"), total=hit_df.sample_id.nunique()): result_df.append( sub_df[[ "protein_accession", @@ -141,14 +160,29 @@ def run(): "c_flank", ]].copy()) result_df[-1]["hit"] = 1 + + excluded_peptides = set() + for allele in sample_table.loc[sample_id].alleles: + excluded_peptides.update(allele_to_excluded_peptides[allele]) + print( + sample_id, + "will exclude", + len(excluded_peptides), + "peptides from decoy universe") + for length in lengths: universe = all_peptides_by_length[length] - result_df.append(universe.loc[ - (~universe.peptide.isin(sub_df.peptide.unique())) & ( - universe.protein_accession.isin( - sub_df.protein_accession.unique()))].sample( - n=int(len(sub_df) * args.decoys_per_hit / len(lengths)))[ - ["protein_accession", "peptide", "n_flank", "c_flank"]]) + possible_universe = universe.loc[ + (~universe.peptide.isin(sub_df.peptide.unique())) & + (~universe.peptide.isin(excluded_peptides)) & + (universe.protein_accession.isin(sub_df.protein_accession.unique())) + ] + selected_decoys = possible_universe.sample( + n=int(len(sub_df) * args.decoys_per_hit / len(lengths))) + + result_df.append(selected_decoys[ + ["protein_accession", "peptide", "n_flank", "c_flank"] + ].copy()) result_df[-1]["hit"] = 0 result_df[-1]["sample_id"] = sample_id -- GitLab