From 66c3c43b8f80b3b38b441c83b5355176c6b7183b Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Sun, 26 Jan 2020 14:04:43 -0500
Subject: [PATCH] fix

---
 .../data_evaluation/GENERATE.sh               |  4 +-
 .../make_benchmark.py                         | 74 ++++++++++++++-----
 2 files changed, 57 insertions(+), 21 deletions(-)

diff --git a/downloads-generation/data_evaluation/GENERATE.sh b/downloads-generation/data_evaluation/GENERATE.sh
index 010139c7..4f1cb778 100755
--- a/downloads-generation/data_evaluation/GENERATE.sh
+++ b/downloads-generation/data_evaluation/GENERATE.sh
@@ -63,6 +63,7 @@ else
         --hits "$(pwd)/hits_with_tpm.csv.bz2" \
         --proteome-peptides "$(mhcflurry-downloads path data_mass_spec_benchmark)/proteome_peptides.all.csv.bz2" \
         --decoys-per-hit 99 \
+        --exclude-train-data "$(mhcflurry-downloads path models_class1_pan_variants)/models.no_additional_ms/train_data.csv.bz2" \
         --only-format MONOALLELIC \
         --out "$(pwd)/benchmark.monoallelic.csv"
     bzip2 -f benchmark.monoallelic.csv
@@ -77,7 +78,8 @@ else
     cp $SCRIPT_DIR/make_benchmark.py .
     time python make_benchmark.py \
         --hits "$(pwd)/hits_with_tpm.csv.bz2" \
-        --proteome-peptides \""$(mhcflurry-downloads path data_mass_spec_benchmark)/proteome_peptides.all.csv.bz2"\" \
+        --proteome-peptides "$(mhcflurry-downloads path data_mass_spec_benchmark)/proteome_peptides.all.csv.bz2" \
+        --exclude-train-data "$(mhcflurry-downloads path models_class1_pan)/models.combined/train_data.csv.bz2" \
         --decoys-per-hit 99 \
         --only-format MULTIALLELIC \
         --out "$(pwd)/benchmark.multiallelic.csv"
diff --git a/downloads-generation/models_class1_presentation/make_benchmark.py b/downloads-generation/models_class1_presentation/make_benchmark.py
index 6d094eb4..7db98598 100644
--- a/downloads-generation/models_class1_presentation/make_benchmark.py
+++ b/downloads-generation/models_class1_presentation/make_benchmark.py
@@ -5,6 +5,7 @@ import sys
 import argparse
 import os
 import numpy
+import collections
 
 import pandas
 import tqdm
@@ -39,6 +40,11 @@ parser.add_argument(
     nargs="+",
     default=[],
     help="Include only the given PMID")
+parser.add_argument(
+    "--exclude-train-data",
+    nargs="+",
+    default=[],
+    help="Remove hits and decoys included in the given training data")
 parser.add_argument(
     "--only-format",
     choices=("MONOALLELIC", "MULTIALLELIC"),
@@ -64,6 +70,7 @@ def run():
         (hit_df.peptide.str.match("^[%s]+$" % "".join(
             mhcflurry.amino_acid.COMMON_AMINO_ACIDS)))
     ]
+    hit_df['alleles'] = hit_df.hla.str.split().map(tuple)
     print("Loaded hits from %d samples" % hit_df.sample_id.nunique())
     if args.only_format:
         hit_df = hit_df.loc[hit_df.format == args.only_format].copy()
@@ -92,6 +99,31 @@ def run():
         hit_df = new_hit_df.copy()
         print("Subselected by pmid to %d samples" % hit_df.sample_id.nunique())
 
+    allele_to_excluded_peptides = collections.defaultdict(set)
+    for train_dataset in args.exclude_train_data:
+        print("Excluding hits from", train_dataset)
+        train_df = pandas.read_csv(train_dataset)
+        for (allele, peptides) in train_df.groupby("allele").peptide.unique().iteritems():
+            allele_to_excluded_peptides[allele].update(peptides)
+        train_counts = train_df.groupby(
+            ["allele", "peptide"]).measurement_value.count().to_dict()
+        hit_no_train = hit_df.loc[
+            [
+                not any([
+                    train_counts.get((allele, row.peptide))
+                    for allele in row.alleles
+                ])
+            for _, row in tqdm.tqdm(hit_df.iterrows(), total=len(hit_df))]
+        ]
+        print(
+            "Excluding hits from",
+            train_dataset,
+            "reduced dataset from",
+            len(hit_df),
+            "to",
+            len(hit_no_train))
+        hit_df = hit_no_train
+
     sample_table = hit_df.drop_duplicates("sample_id").set_index("sample_id")
     grouped = hit_df.groupby("sample_id").nunique()
     for col in sample_table.columns:
@@ -112,26 +144,13 @@ def run():
 
     all_peptides_by_length = dict(iter(all_peptides_df.groupby("length")))
 
-    columns_to_keep = [
-        "hit_id",
-        "protein_accession",
-        "n_flank",
-        "c_flank",
-        "peptide",
-        "sample_id",
-        "affinity_prediction",
-        "hit",
-    ]
-
     print("Selecting decoys.")
 
     lengths = [8, 9, 10, 11]
     result_df = []
 
     for sample_id, sub_df in tqdm.tqdm(
-            hit_df.loc[
-                (hit_df.peptide.str.len() <= 11) & (hit_df.peptide.str.len() >= 8)
-            ].groupby("sample_id"), total=hit_df.sample_id.nunique()):
+            hit_df.groupby("sample_id"), total=hit_df.sample_id.nunique()):
         result_df.append(
             sub_df[[
                 "protein_accession",
@@ -141,14 +160,29 @@ def run():
                 "c_flank",
             ]].copy())
         result_df[-1]["hit"] = 1
+
+        excluded_peptides = set()
+        for allele in sample_table.loc[sample_id].alleles:
+            excluded_peptides.update(allele_to_excluded_peptides[allele])
+        print(
+            sample_id,
+            "will exclude",
+            len(excluded_peptides),
+            "peptides from decoy universe")
+
         for length in lengths:
             universe = all_peptides_by_length[length]
-            result_df.append(universe.loc[
-                (~universe.peptide.isin(sub_df.peptide.unique())) & (
-                    universe.protein_accession.isin(
-                        sub_df.protein_accession.unique()))].sample(
-                n=int(len(sub_df) * args.decoys_per_hit / len(lengths)))[
-                ["protein_accession", "peptide", "n_flank", "c_flank"]])
+            possible_universe = universe.loc[
+                (~universe.peptide.isin(sub_df.peptide.unique())) &
+                (~universe.peptide.isin(excluded_peptides)) &
+                (universe.protein_accession.isin(sub_df.protein_accession.unique()))
+            ]
+            selected_decoys = possible_universe.sample(
+                n=int(len(sub_df) * args.decoys_per_hit / len(lengths)))
+
+            result_df.append(selected_decoys[
+                ["protein_accession", "peptide", "n_flank", "c_flank"]
+            ].copy())
             result_df[-1]["hit"] = 0
             result_df[-1]["sample_id"] = sample_id
 
-- 
GitLab