Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
mhc_rank
Manage
Activity
Members
Labels
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Patrick Skillman-Lawrence
mhc_rank
Commits
1d2d7228
Commit
1d2d7228
authored
8 years ago
by
Alex Rubinsteyn
Browse files
Options
Downloads
Patches
Plain Diff
split up filtering of IEDB MHC dataframe
parent
6859d990
Loading
Loading
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
script/create-iedb-class1-dataset.py
+99
-39
99 additions, 39 deletions
script/create-iedb-class1-dataset.py
with
99 additions
and
39 deletions
script/create-iedb-class1-dataset.py
+
99
−
39
View file @
1d2d7228
...
...
@@ -45,47 +45,76 @@ parser.add_argument(
default
=
CLASS1_DATA_DIRECTORY
,
help
=
"
Directory to write output pickle file
"
)
parser
.
add_argument
(
"
--output-pickle-filename
"
,
default
=
PICKLE_OUTPUT_FILENAME
,
help
=
"
Path to .pickle file containing dictionary of IEDB assay datasets
"
)
if
__name__
==
"
__main__
"
:
args
=
parser
.
parse_args
()
df
=
pd
.
read_csv
(
args
.
input_csv
,
error_bad_lines
=
False
,
encoding
=
"
latin-1
"
,
header
=
[
0
,
1
])
# Optional whitelist: restrict the dataset to allele names matching
# one or more user-supplied prefixes (empty list means "keep all").
parser.add_argument(
    "--alleles",
    nargs="+",
    default=[],
    help="Restrict dataset to specified alleles")
def filter_class1_alleles(df):
    """Keep only rows of the IEDB dataframe whose MHC allele class is "I".

    Expects two-level columns (as produced by reading the IEDB export with
    header=[0, 1]), i.e. df["MHC"]["MHC allele class"]. Prints the class
    distribution as a side effect and returns the filtered dataframe.
    """
    allele_classes = df["MHC"]["MHC allele class"]
    print("MHC class counts:\n%s" % (allele_classes.value_counts(),))
    return df[allele_classes == "I"]
def filter_allele_names(df):
    """Drop rows whose MHC allele name is "complex".

    An allele name containing a space or a "/" typically describes a
    mutation or a multi-chain combination rather than a plain allele,
    so those rows are removed. Prints the invalid names and the number
    of dropped rows as a side effect, and returns the filtered dataframe.

    NOTE(review): reconstructed from a token-mangled diff. This span also
    contained fragments of a deleted revision (a prefix-pattern mask built
    from a truncated ``patterns = ["HLA-A", "HLA-B", ...]`` list); those
    lines referenced names removed by the commit and are dropped here.
    The ``str.contains`` argument was rendered as a bare quoted blank in
    the scrape — presumed to be a single space; confirm against the repo.
    """
    alleles = df["MHC"]["Allele Name"]
    invalid_allele_mask = alleles.str.contains(" ") | alleles.str.contains("/")
    invalid_alleles = alleles[invalid_allele_mask]
    print("-- Invalid allele names: %s" % (list(sorted(set(invalid_alleles)))))
    print("Dropping %d with complex alleles (e.g. descriptions of mutations)" % (
        len(invalid_alleles),))
    return df[~invalid_allele_mask]
def filter_affinity_values(df):
    """Drop rows lacking a finite quantitative affinity measurement.

    A measurement is kept when it is non-null AND finite (NaN and +/-inf
    are both rejected). Prints how many rows were dropped as a side
    effect and returns the filtered dataframe.
    """
    affinities = df["Assay"]["Quantitative measurement"]
    finite_affinity_mask = ~affinities.isnull() & np.isfinite(affinities)
    n_dropped = (~finite_affinity_mask).sum()
    print("Dropping %d rows without finite affinity measurements" % (n_dropped,))
    return df[finite_affinity_mask]
def filter_mhc_dataframe(df):
    """Apply the full class I filtering pipeline to the raw IEDB dataframe.

    Runs each filter in order, feeding the result of one into the next:
    class I rows only, then simple allele names only, then rows with
    finite affinity measurements only. Returns the filtered dataframe.

    NOTE(review): this span of the scraped diff also contained a deleted
    revision's prefix-mask loop (``for pattern in patterns: ...``) that
    referenced names (``patterns``, ``alleles``, ``mask``) undefined in
    the new revision; those dead lines are removed here.
    """
    filter_functions = [
        filter_class1_alleles,
        filter_allele_names,
        filter_affinity_values,
    ]
    for filter_fn in filter_functions:
        df = filter_fn(df)
    return df
def
groupby_assay
(
df
):
assay_group
=
df
[
"
Assay
"
][
"
Assay Group
"
]
assay_method
=
df
[
"
Assay
"
][
"
Method/Technique
"
]
groups
=
df
.
groupby
([
assay_group
,
assay_method
])
# speed up repeated calls to np.log by caching log affinities as a column
# in the dataframe
df
[
"
_log_affinity
"
]
=
np
.
log
(
df
[
"
Assay
"
][
"
Quantitative measurement
"
])
# speed up computing percent positive with the helper column
qualitative
=
df
[
"
Assay
"
][
"
Qualitative Measure
"
]
df
[
"
_qualitative_positive
"
]
=
qualitative
.
str
.
startswith
(
"
Positive
"
)
print
(
"
---
"
)
print
(
"
Assays
"
)
assay_dataframes
=
{}
# create a dataframe for every distinct kind of assay which is used
# by IEDB submitters to measure peptide-MHC affinity or stability
for
(
assay_group
,
assay_method
),
group_data
in
sorted
(
groups
,
key
=
lambda
x
:
len
(
x
[
1
]),
reverse
=
True
):
print
(
"
%s (%s): %d
"
%
(
assay_group
,
assay_method
,
len
(
group_data
)))
groups
,
key
=
lambda
x
:
len
(
x
[
1
]),
reverse
=
True
):
print
(
"
- %s (%s): %d
"
%
(
assay_group
,
assay_method
,
len
(
group_data
)))
group_alleles
=
group_data
[
"
MHC
"
][
"
Allele Name
"
]
group_peptides
=
group_data
[
"
Epitope
"
][
"
Description
"
]
distinct_pmhc
=
group_data
.
groupby
([
group_alleles
,
group_peptides
])
...
...
@@ -93,19 +122,16 @@ if __name__ == "__main__":
for
(
allele
,
peptide
),
pmhc_group
in
distinct_pmhc
:
columns
[
"
mhc
"
].
append
(
allele
)
columns
[
"
peptide
"
].
append
(
peptide
)
# performing median in log space since in two datapoint case
# we don't want to take e.g. (10 + 1000) / 2.0 = 505
# but would prefer something like 10 ** ( (1 + 3) / 2.0) = 100
columns
[
"
value
"
].
append
(
np
.
exp
(
np
.
median
(
np
.
log
(
pmhc_group
[
"
Assay
"
][
"
Quantitative measurement
"
]))))
qualitative
=
pmhc_group
[
"
Assay
"
][
"
Qualitative Measure
"
]
columns
[
"
percent_positive
"
].
append
(
qualitative
.
str
.
startswith
(
"
Positive
"
).
mean
())
columns
[
"
count
"
].
append
(
pmhc_group
[
"
Assay
"
][
"
Quantitative measurement
"
].
count
())
positive
=
pmhc_group
[
"
_qualitative_positive
"
]
count
=
len
(
pmhc_group
)
if
count
==
1
:
ic50
=
pmhc_group
[
"
Assay
"
][
"
Quantitative measurement
"
].
mean
()
else
:
ic50
=
np
.
exp
(
np
.
mean
(
pmhc_group
[
"
_log_affinity
"
]))
# averaging the log affinities preserves orders of magnitude better
columns
[
"
value
"
].
append
(
ic50
)
columns
[
"
percent_positive
"
].
append
(
positive
.
mean
())
columns
[
"
count
"
].
append
(
count
)
assay_dataframes
[(
assay_group
,
assay_method
)]
=
pd
.
DataFrame
(
columns
,
columns
=
[
...
...
@@ -115,10 +141,44 @@ if __name__ == "__main__":
"
percent_positive
"
,
"
count
"
])
print
(
"
# distinct pMHC entries: %d
"
%
len
(
columns
[
"
mhc
"
]))
return
assay_dataframes
if __name__ == "__main__":
    args = parser.parse_args()
    # The IEDB CSV export has two header rows (grouped columns), hence
    # header=[0, 1] producing two-level columns like df["MHC"]["Allele Name"].
    # error_bad_lines=False skips malformed rows instead of raising
    # (argument name as of the pandas version this script targeted).
    df = pd.read_csv(
        args.input_csv,
        error_bad_lines=False,
        encoding="latin-1",
        header=[0, 1])
    df = filter_mhc_dataframe(df)
    alleles = df["MHC"]["Allele Name"]
    n = len(alleles)
    print("# Class I rows: %d" % n)
    print("# Class I alleles: %d" % len(set(alleles)))
    print("Unique alleles: %s" % list(sorted(set(alleles))))
    if args.alleles:
        # Restrict to rows whose allele name starts with any of the
        # user-supplied prefixes.
        print("User-supplied allele whitelist: %s" % (args.alleles,))
        mask = np.zeros(n, dtype=bool)
        for pattern in args.alleles:
            pattern_mask = alleles.str.startswith(pattern)
            print("# %s: %d" % (pattern, pattern_mask.sum()))
            mask |= pattern_mask
        df = df[mask]
        print("# entries matching alleles %s: %d" % (args.alleles, len(df)))
    assay_dataframes = groupby_assay(df)
    if not exists(args.output_dir):
        makedirs(args.output_dir)
    output_path = join(args.output_dir, args.output_pickle_filename)
    # NOTE(review): the scraped diff interleaved the deleted revision's
    # ``with open(args.output, "wb")`` here; the added revision writes to
    # output_dir/output_pickle_filename, which is what is kept.
    with open(output_path, "wb") as f:
        pickle.dump(assay_dataframes, f, pickle.HIGHEST_PROTOCOL)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment