Patrick Skillman-Lawrence / mhc_rank / Commits / 66c3c43b

Commit 66c3c43b authored 5 years ago by Tim O'Donnell
fix

parent d3184487

No related branches, tags, or merge requests found.
Showing 2 changed files with 57 additions and 21 deletions:

downloads-generation/data_evaluation/GENERATE.sh: 3 additions, 1 deletion
downloads-generation/models_class1_presentation/make_benchmark.py: 54 additions, 20 deletions
downloads-generation/data_evaluation/GENERATE.sh (+3, −1)
@@ -63,6 +63,7 @@ else
     --hits "$(pwd)/hits_with_tpm.csv.bz2" \
     --proteome-peptides "$(mhcflurry-downloads path data_mass_spec_benchmark)/proteome_peptides.all.csv.bz2" \
     --decoys-per-hit 99 \
+    --exclude-train-data "$(mhcflurry-downloads path models_class1_pan_variants)/models.no_additional_ms/train_data.csv.bz2" \
     --only-format MONOALLELIC \
     --out "$(pwd)/benchmark.monoallelic.csv"
 bzip2 -f benchmark.monoallelic.csv
@@ -77,7 +78,8 @@ else
 cp $SCRIPT_DIR/make_benchmark.py .
 time python make_benchmark.py \
     --hits "$(pwd)/hits_with_tpm.csv.bz2" \
-    --proteome-peptides \""$(mhcflurry-downloads path data_mass_spec_benchmark)/proteome_peptides.all.csv.bz2"\" \
+    --proteome-peptides "$(mhcflurry-downloads path data_mass_spec_benchmark)/proteome_peptides.all.csv.bz2" \
+    --exclude-train-data "$(mhcflurry-downloads path models_class1_pan)/models.combined/train_data.csv.bz2" \
     --decoys-per-hit 99 \
     --only-format MULTIALLELIC \
     --out "$(pwd)/benchmark.multiallelic.csv"
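The commit makes two changes in this script: it removes the stray backslash-escaped quotes that had wrapped the --proteome-peptides argument in the multiallelic invocation, and it passes a new --exclude-train-data path to make_benchmark.py in both invocations. That flag corresponds to the repeatable argparse option added to make_benchmark.py in the second file below. A minimal, self-contained sketch of how such an option parses; the option name and help text come from the diff, while the parser label and the example file names here are placeholders:

# Sketch only: the nargs="+" option added to make_benchmark.py below.
# Option name and help text are from the diff; file names are placeholders.
import argparse

parser = argparse.ArgumentParser("make_benchmark")
parser.add_argument(
    "--exclude-train-data",
    nargs="+",
    default=[],
    help="Remove hits and decoys included in the given training data")

# nargs="+" collects one or more paths into a list; when the flag is omitted,
# the default empty list means the exclusion loop in the script does nothing.
args = parser.parse_args(
    ["--exclude-train-data", "train_data_a.csv.bz2", "train_data_b.csv.bz2"])
print(args.exclude_train_data)  # ['train_data_a.csv.bz2', 'train_data_b.csv.bz2']

Because the default is an empty list, invocations that do not pass the flag (as in the pre-commit script) behave as before.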
downloads-generation/models_class1_presentation/make_benchmark.py (+54, −20)
@@ -5,6 +5,7 @@ import sys
 import argparse
 import os
 import numpy
+import collections
 import pandas
 import tqdm
@@ -39,6 +40,11 @@ parser.add_argument(
     nargs="+",
     default=[],
     help="Include only the given PMID")
+parser.add_argument(
+    "--exclude-train-data",
+    nargs="+",
+    default=[],
+    help="Remove hits and decoys included in the given training data")
 parser.add_argument(
     "--only-format",
     choices=("MONOALLELIC", "MULTIALLELIC"),
@@ -64,6 +70,7 @@ def run():
         (hit_df.peptide.str.match("^[%s]+$" % "".join(
             mhcflurry.amino_acid.COMMON_AMINO_ACIDS)))
     ]
+    hit_df['alleles'] = hit_df.hla.str.split().map(tuple)
     print("Loaded hits from %d samples" % hit_df.sample_id.nunique())
     if args.only_format:
         hit_df = hit_df.loc[hit_df.format == args.only_format].copy()
@@ -92,6 +99,31 @@ def run():
         hit_df = new_hit_df.copy()
         print("Subselected by pmid to %d samples" % hit_df.sample_id.nunique())

+    allele_to_excluded_peptides = collections.defaultdict(set)
+    for train_dataset in args.exclude_train_data:
+        print("Excluding hits from", train_dataset)
+        train_df = pandas.read_csv(train_dataset)
+        for (allele, peptides) in train_df.groupby("allele").peptide.unique().iteritems():
+            allele_to_excluded_peptides[allele].update(peptides)
+        train_counts = train_df.groupby(
+            ["allele", "peptide"]).measurement_value.count().to_dict()
+        hit_no_train = hit_df.loc[
+            [not any([train_counts.get((allele, row.peptide))
+                      for allele in row.alleles])
+             for _, row in tqdm.tqdm(hit_df.iterrows(), total=len(hit_df))]
+        ]
+        print(
+            "Excluding hits from", train_dataset, "reduced dataset from",
+            len(hit_df), "to", len(hit_no_train))
+        hit_df = hit_no_train
+
     sample_table = hit_df.drop_duplicates("sample_id").set_index("sample_id")
     grouped = hit_df.groupby("sample_id").nunique()
     for col in sample_table.columns:
@@ -112,26 +144,13 @@ def run():
     all_peptides_by_length = dict(iter(all_peptides_df.groupby("length")))

-    columns_to_keep = [
-        "hit_id",
-        "protein_accession",
-        "n_flank",
-        "c_flank",
-        "peptide",
-        "sample_id",
-        "affinity_prediction",
-        "hit",
-    ]
-
     print("Selecting decoys.")

     lengths = [8, 9, 10, 11]
     result_df = []

     for sample_id, sub_df in tqdm.tqdm(
-            hit_df.groupby("sample_id"), total=hit_df.sample_id.nunique()):
+            hit_df.loc[
+                (hit_df.peptide.str.len() <= 11) & (hit_df.peptide.str.len() >= 8)
+            ].groupby("sample_id"), total=hit_df.sample_id.nunique()):
         result_df.append(
             sub_df[[
                 "protein_accession",
@@ -141,14 +160,29 @@ def run():
                 "c_flank",
             ]].copy())
         result_df[-1]["hit"] = 1

+        excluded_peptides = set()
+        for allele in sample_table.loc[sample_id].alleles:
+            excluded_peptides.update(allele_to_excluded_peptides[allele])
+        print(
+            sample_id, "will exclude", len(excluded_peptides),
+            "peptides from decoy universe")
+
         for length in lengths:
             universe = all_peptides_by_length[length]
-            result_df.append(universe.loc[
-                ((~universe.peptide.isin(sub_df.peptide.unique())) &
-                universe.protein_accession.isin(
-                    sub_df.protein_accession.unique()))].sample(
-                n=int(len(sub_df) * args.decoys_per_hit / len(lengths)))[
-                ["protein_accession", "peptide", "n_flank", "c_flank"]])
+            possible_universe = universe.loc[
+                (~universe.peptide.isin(sub_df.peptide.unique())) &
+                (~universe.peptide.isin(excluded_peptides)) &
+                (universe.protein_accession.isin(sub_df.protein_accession.unique()))
+            ]
+            selected_decoys = possible_universe.sample(
+                n=int(len(sub_df) * args.decoys_per_hit / len(lengths)))
+            result_df.append(selected_decoys[
+                ["protein_accession", "peptide", "n_flank", "c_flank"]
+            ].copy())
             result_df[-1]["hit"] = 0
             result_df[-1]["sample_id"] = sample_id
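The Python-side change builds, for each training CSV passed via --exclude-train-data, a per-allele set of training peptides, then (a) drops mass spec hits whose peptide appears in the training data for any of the sample's alleles and (b) later bars those peptides from each sample's decoy universe. A rough, self-contained sketch of that logic on toy data; column names follow the diff, the alleles, peptides, and values are invented, and pandas' current Series.items() stands in for the deprecated .iteritems() seen in the diff:

# Illustrative sketch of the --exclude-train-data logic, not the commit's code.
import collections

import pandas

# Toy "training data": (allele, peptide) pairs a predictor was trained on.
train_df = pandas.DataFrame({
    "allele": ["HLA-A*02:01", "HLA-A*02:01", "HLA-B*07:02"],
    "peptide": ["SIINFEKL", "LLFGYPVYV", "APRGPHGGA"],
    "measurement_value": [100.0, 50.0, 25.0],
})

# Toy mass spec "hits": each row carries the tuple of alleles for its sample.
hit_df = pandas.DataFrame({
    "peptide": ["LLFGYPVYV", "GILGFVFTL", "APRGPHGGA"],
    "alleles": [
        ("HLA-A*02:01", "HLA-B*07:02"),
        ("HLA-A*02:01",),
        ("HLA-A*03:01",),
    ],
})

# (1) Allele -> set of peptides seen in training, kept for decoy exclusion.
allele_to_excluded_peptides = collections.defaultdict(set)
for (allele, peptides) in train_df.groupby("allele").peptide.unique().items():
    allele_to_excluded_peptides[allele].update(peptides)

# (2) Drop hits whose peptide occurs in training data for any sample allele.
train_counts = train_df.groupby(
    ["allele", "peptide"]).measurement_value.count().to_dict()
hit_no_train = hit_df.loc[[
    not any(train_counts.get((allele, row.peptide)) for allele in row.alleles)
    for _, row in hit_df.iterrows()
]]
print(hit_no_train.peptide.tolist())  # ['GILGFVFTL', 'APRGPHGGA']

# (3) For one sample, the peptides barred from its decoy universe.
sample_alleles = ("HLA-A*02:01", "HLA-B*07:02")  # invented sample
excluded_peptides = set()
for allele in sample_alleles:
    excluded_peptides.update(allele_to_excluded_peptides[allele])
print(sorted(excluded_peptides))  # ['APRGPHGGA', 'LLFGYPVYV', 'SIINFEKL']

In the actual script this filtering runs once per training CSV for the hits and once per sample when sampling decoys, which is why the commit also adds the hit_df['alleles'] tuple column and the collections import.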