Commit 66c3c43b authored by Tim O'Donnell

fix

parent d3184487
@@ -63,6 +63,7 @@ else
         --hits "$(pwd)/hits_with_tpm.csv.bz2" \
         --proteome-peptides "$(mhcflurry-downloads path data_mass_spec_benchmark)/proteome_peptides.all.csv.bz2" \
         --decoys-per-hit 99 \
+        --exclude-train-data "$(mhcflurry-downloads path models_class1_pan_variants)/models.no_additional_ms/train_data.csv.bz2" \
         --only-format MONOALLELIC \
         --out "$(pwd)/benchmark.monoallelic.csv"
     bzip2 -f benchmark.monoallelic.csv
@@ -77,7 +78,8 @@ else
     cp $SCRIPT_DIR/make_benchmark.py .
     time python make_benchmark.py \
         --hits "$(pwd)/hits_with_tpm.csv.bz2" \
-        --proteome-peptides \""$(mhcflurry-downloads path data_mass_spec_benchmark)/proteome_peptides.all.csv.bz2"\" \
+        --proteome-peptides "$(mhcflurry-downloads path data_mass_spec_benchmark)/proteome_peptides.all.csv.bz2" \
+        --exclude-train-data "$(mhcflurry-downloads path models_class1_pan)/models.combined/train_data.csv.bz2" \
         --decoys-per-hit 99 \
         --only-format MULTIALLELIC \
         --out "$(pwd)/benchmark.multiallelic.csv"
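Two things change in the driver script above. First, the --proteome-peptides argument loses a stray layer of escaped quotes: the old \""..."\" form passed literal double-quote characters through to Python as part of the filename, so the path did not resolve to the real file. Second, both benchmark invocations gain an --exclude-train-data flag pointing at the training data of the model variant being benchmarked. A minimal sketch of the quoting problem, using Python's shlex (which follows POSIX shell quoting rules) and a hypothetical path:

    import shlex

    path = "/downloads/proteome_peptides.all.csv.bz2"  # hypothetical path
    # old form: \"..."\" leaves literal quote characters inside the argument
    broken = shlex.split(r'--proteome-peptides \""%s"\"' % path)
    # fixed form: plain double quotes are stripped by the shell
    fixed = shlex.split('--proteome-peptides "%s"' % path)
    print(broken[1])  # "/downloads/proteome_peptides.all.csv.bz2"  (quotes kept)
    print(fixed[1])   # /downloads/proteome_peptides.all.csv.bz2

The remaining hunks are to make_benchmark.py, the script invoked above, which gains the corresponding --exclude-train-data option.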
@@ -5,6 +5,7 @@ import sys
 import argparse
 import os
 import numpy
+import collections
 import pandas
 import tqdm
@@ -39,6 +40,11 @@ parser.add_argument(
     nargs="+",
     default=[],
     help="Include only the given PMID")
+parser.add_argument(
+    "--exclude-train-data",
+    nargs="+",
+    default=[],
+    help="Remove hits and decoys included in the given training data")
 parser.add_argument(
     "--only-format",
     choices=("MONOALLELIC", "MULTIALLELIC"),
@@ -64,6 +70,7 @@ def run():
         (hit_df.peptide.str.match("^[%s]+$" % "".join(
             mhcflurry.amino_acid.COMMON_AMINO_ACIDS)))
     ]
+    hit_df['alleles'] = hit_df.hla.str.split().map(tuple)
     print("Loaded hits from %d samples" % hit_df.sample_id.nunique())
     if args.only_format:
         hit_df = hit_df.loc[hit_df.format == args.only_format].copy()
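The added line parses the space-separated hla column into a per-row tuple of alleles, which the exclusion pass below iterates over. For example (the allele string is illustrative):

    import pandas

    hit_df = pandas.DataFrame({"hla": ["HLA-A*02:01 HLA-B*07:02"]})
    hit_df["alleles"] = hit_df.hla.str.split().map(tuple)
    print(hit_df.alleles[0])  # ('HLA-A*02:01', 'HLA-B*07:02')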
@@ -92,6 +99,31 @@ def run():
         hit_df = new_hit_df.copy()
         print("Subselected by pmid to %d samples" % hit_df.sample_id.nunique())
 
+    allele_to_excluded_peptides = collections.defaultdict(set)
+    for train_dataset in args.exclude_train_data:
+        print("Excluding hits from", train_dataset)
+        train_df = pandas.read_csv(train_dataset)
+        for (allele, peptides) in train_df.groupby("allele").peptide.unique().iteritems():
+            allele_to_excluded_peptides[allele].update(peptides)
+        train_counts = train_df.groupby(
+            ["allele", "peptide"]).measurement_value.count().to_dict()
+        hit_no_train = hit_df.loc[
+            [
+                not any([
+                    train_counts.get((allele, row.peptide))
+                    for allele in row.alleles
+                ])
+                for _, row in tqdm.tqdm(hit_df.iterrows(), total=len(hit_df))]
+        ]
+        print(
+            "Excluding hits from",
+            train_dataset,
+            "reduced dataset from",
+            len(hit_df),
+            "to",
+            len(hit_no_train))
+        hit_df = hit_no_train
+
     sample_table = hit_df.drop_duplicates("sample_id").set_index("sample_id")
     grouped = hit_df.groupby("sample_id").nunique()
     for col in sample_table.columns:
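This block is the substance of the fix: any hit whose peptide occurs in a supplied training set for any of the sample's alleles is dropped, so the benchmark cannot credit a model for memorizing its own training data. A toy sketch of the same filtering (made-up alleles and peptides; the real code keys on train_counts, a dict of measurement counts per allele-peptide pair, where this sketch uses plain set membership):

    import collections
    import pandas

    train_df = pandas.DataFrame({
        "allele": ["HLA-A*02:01", "HLA-A*02:01", "HLA-B*07:02"],
        "peptide": ["SIINFEKL", "LLFGYPVYV", "APRGPHGGA"],
    })
    hit_df = pandas.DataFrame({
        "peptide": ["SIINFEKL", "GILGFVFTL"],
        "alleles": [("HLA-A*02:01", "HLA-B*07:02")] * 2,
    })

    # allele -> set of training peptides, accumulated across datasets
    allele_to_excluded = collections.defaultdict(set)
    for allele, peptides in train_df.groupby("allele").peptide.unique().items():
        allele_to_excluded[allele].update(peptides)

    # keep a hit only if none of the sample's alleles saw its peptide in training
    keep = [
        not any(row.peptide in allele_to_excluded[a] for a in row.alleles)
        for _, row in hit_df.iterrows()
    ]
    print(hit_df.loc[keep].peptide.tolist())  # ['GILGFVFTL']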
@@ -112,26 +144,13 @@ def run():
 
     all_peptides_by_length = dict(iter(all_peptides_df.groupby("length")))
 
-    columns_to_keep = [
-        "hit_id",
-        "protein_accession",
-        "n_flank",
-        "c_flank",
-        "peptide",
-        "sample_id",
-        "affinity_prediction",
-        "hit",
-    ]
-
     print("Selecting decoys.")
 
     lengths = [8, 9, 10, 11]
 
     result_df = []
     for sample_id, sub_df in tqdm.tqdm(
-            hit_df.loc[
-                (hit_df.peptide.str.len() <= 11) & (hit_df.peptide.str.len() >= 8)
-            ].groupby("sample_id"), total=hit_df.sample_id.nunique()):
+            hit_df.groupby("sample_id"), total=hit_df.sample_id.nunique()):
         result_df.append(
             sub_df[[
                 "protein_accession",
@@ -141,14 +160,29 @@ def run():
                 "c_flank",
             ]].copy())
         result_df[-1]["hit"] = 1
+
+        excluded_peptides = set()
+        for allele in sample_table.loc[sample_id].alleles:
+            excluded_peptides.update(allele_to_excluded_peptides[allele])
+        print(
+            sample_id,
+            "will exclude",
+            len(excluded_peptides),
+            "peptides from decoy universe")
+
         for length in lengths:
             universe = all_peptides_by_length[length]
-            result_df.append(universe.loc[
-                (~universe.peptide.isin(sub_df.peptide.unique())) & (
-                    universe.protein_accession.isin(
-                        sub_df.protein_accession.unique()))].sample(
-                n=int(len(sub_df) * args.decoys_per_hit / len(lengths)))[
-                ["protein_accession", "peptide", "n_flank", "c_flank"]])
+            possible_universe = universe.loc[
+                (~universe.peptide.isin(sub_df.peptide.unique())) &
+                (~universe.peptide.isin(excluded_peptides)) &
+                (universe.protein_accession.isin(sub_df.protein_accession.unique()))
+            ]
+            selected_decoys = possible_universe.sample(
+                n=int(len(sub_df) * args.decoys_per_hit / len(lengths)))
+            result_df.append(selected_decoys[
+                ["protein_accession", "peptide", "n_flank", "c_flank"]
+            ].copy())
             result_df[-1]["hit"] = 0
             result_df[-1]["sample_id"] = sample_id
...
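Decoy selection now samples from possible_universe, which drops the sample's own hit peptides and the allele-specific training peptides collected above, while still restricting decoys to proteins that produced at least one hit in the sample. With --decoys-per-hit 99 spread over the four lengths 8 through 11, each per-length draw is int(len(sub_df) * 99 / 4). A toy sketch of the candidate filtering (made-up nine-mers and accessions):

    import pandas

    universe = pandas.DataFrame({  # stand-in for all_peptides_by_length[9]
        "protein_accession": ["P1", "P1", "P2", "P3"],
        "peptide": ["AAAAAAAAA", "CCCCCCCCC", "DDDDDDDDD", "EEEEEEEEE"],
        "n_flank": ["M", "K", "R", "L"],
        "c_flank": ["G", "A", "S", "T"],
    })
    hit_peptides = {"AAAAAAAAA"}       # this sample's hits
    hit_proteins = {"P1", "P2"}        # proteins with a hit in this sample
    excluded_peptides = {"DDDDDDDDD"}  # seen in training data

    candidates = universe.loc[
        (~universe.peptide.isin(hit_peptides)) &
        (~universe.peptide.isin(excluded_peptides)) &
        universe.protein_accession.isin(hit_proteins)
    ]
    print(candidates.peptide.tolist())  # ['CCCCCCCCC']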