mhc_rank, commit 9e3c4c8b
Authored 8 years ago by Alex Rubinsteyn

    moving encoding to dataset

Parent: 59e3b559
Changes: 4 changed files, with 243 additions and 235 deletions

  mhcflurry/common.py             +15    -0
  mhcflurry/data.py                +2  -221
  mhcflurry/dataset.py            +26    -1
  mhcflurry/dataset_helpers.py   +200   -13
mhcflurry/common.py  (+15 -0)

@@ -17,6 +17,7 @@ from __future__ import (
    division,
    absolute_import,
)
from math import exp, log


def parse_int_list(s):
    return [int(part.strip() for part in s.split(","))]

@@ -64,3 +65,17 @@ def split_allele_names(s):
        for part in s.split(",")
    ]


def geometric_mean(xs, weights=None):
    """
    Geometric mean of a collection of affinity measurements, with optional
    sample weights.
    """
    if len(xs) == 1:
        return xs[0]
    elif weights is None:
        return exp(sum(log(xi) for xi in xs))
    sum_weighted_log = sum(log(xi) * wi for (xi, wi) in zip(xs, weights))
    denom = sum(weights)
    return exp(sum_weighted_log / denom)
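
For orientation, here is a minimal usage sketch of the geometric_mean helper added above. The measurement values and weights are invented for illustration; only the function and its import path come from this commit.

    # Combine duplicate affinity measurements for one peptide with geometric_mean.
    # Illustrative values only, not from the repository.
    from math import exp, log

    from mhcflurry.common import geometric_mean

    measurements = [120.0, 95.0, 300.0]   # repeated IC50 values (nM) for a peptide
    weights = [2.0, 1.0, 1.0]             # e.g. how many assays back each value

    combined = geometric_mean(measurements, weights=weights)
    # The weighted branch computes exp(sum(w * log(x)) / sum(w)):
    expected = exp(
        sum(w * log(x) for x, w in zip(measurements, weights)) / sum(weights))
    assert abs(combined - expected) < 1e-9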
mhcflurry/data.py  (+2 -221)

@@ -22,14 +22,13 @@ from collections import namedtuple, defaultdict
import pandas as pd
import numpy as np

from .common import normalize_allele_name
from .amino_acid import common_amino_acids
from .peptide_encoding import (
    indices_to_hotshot_encoding,
    fixed_length_index_encoding,
    check_valid_index_encoding_array,
)
from .regression_target import MAX_IC50, ic50_to_regression_target

index_encoding = common_amino_acids.index_encoding

@@ -50,225 +49,8 @@ AlleleData = namedtuple(
    ])


def _infer_csv_separator(filename):
    """
    Determine if file is separated by comma, tab, or whitespace.
    Default to whitespace if the others are not detected.

    Returns (sep, delim_whitespace)
    """
    for candidate in [",", "\t"]:
        with open(filename, "r") as f:
            for line in f:
                if line.startswith("#"):
                    continue
                if candidate in line:
                    return candidate, False
    return None, True


def load_dataframe(
        filename,
        max_ic50=MAX_IC50,
        sep=None,
        species_column_name="species",
        allele_column_name="mhc",
        peptide_column_name=None,
        filter_peptide_length=None,
        ic50_column_name="meas",
        only_human=False):
    """
    Load a dataframe of peptide-MHC affinity measurements

    filename : str
        TSV filename with columns:
            - 'species'
            - 'mhc'
            - 'peptide_length'
            - 'sequence'
            - 'meas'

    max_ic50 : float
        Treat IC50 scores above this value as all equally bad
        (transform them to 0.0 in the regression output)

    sep : str, optional
        Separator in CSV file, default is to let Pandas infer

    peptide_column_name : str, optional
        Default behavior is to try {"sequence", "peptide", "peptide_sequence"}

    filter_peptide_length : int, optional
        Which length peptides to use (default=load all lengths)

    only_human : bool
        Only load entries from human MHC alleles

    Returns DataFrame augmented with extra column:
        - "regression_output" : 1.0 - log(ic50)/log(max_ic50), limited to [0,1]
    """
    if sep is None:
        sep, delim_whitespace = _infer_csv_separator(filename)
    else:
        delim_whitespace = False
    df = pd.read_csv(
        filename,
        sep=sep,
        delim_whitespace=delim_whitespace,
        engine="c")
    # hack: get around variability of column naming by checking if
    # the peptide_column_name is actually present and if not try "peptide"
    if peptide_column_name is None:
        columns = set(df.keys())
        for candidate in ["sequence", "peptide", "peptide_sequence"]:
            if candidate in columns:
                peptide_column_name = candidate
                break
        if peptide_column_name is None:
            raise ValueError(
                "Couldn't find peptide column name, candidates: %s" % (
                    columns))
    if only_human:
        human_mask = df[species_column_name] == "human"
        df = df[human_mask]
    if filter_peptide_length:
        length_mask = df[peptide_column_name].str.len() == filter_peptide_length
        df = df[length_mask]
    df[allele_column_name] = df[allele_column_name].map(normalize_allele_name)
    ic50 = np.array(df[ic50_column_name])
    df["regression_output"] = ic50_to_regression_target(ic50, max_ic50=max_ic50)
    return df, peptide_column_name


def load_allele_dicts(
        filename,
        max_ic50=MAX_IC50,
        regression_output=False,
        sep=None,
        species_column_name="species",
        allele_column_name="mhc",
        peptide_column_name=None,
        ic50_column_name="meas",
        only_human=False,
        min_allele_size=1):
    """
    Parsing CSV of binding data into dictionary of dictionaries.
    The outer key is an allele name, the inner key is a peptide sequence,
    and the inner value is an IC50 or log-transformed value between [0,1]
    """
    binding_df, peptide_column_name = load_dataframe(
        filename=filename,
        max_ic50=max_ic50,
        sep=sep,
        species_column_name=species_column_name,
        allele_column_name=allele_column_name,
        peptide_column_name=peptide_column_name,
        ic50_column_name=ic50_column_name,
        only_human=only_human)
    # map peptides to either the raw IC50 or rescaled log IC50 depending
    # on the `regression_output` parameter
    output_column_name = (
        "regression_output"
        if regression_output
        else ic50_column_name
    )
    return {
        allele_name: {
            peptide: output
            for (peptide, output)
            in zip(group[peptide_column_name], group[output_column_name])
        }
        for (allele_name, group)
        in binding_df.groupby(allele_column_name)
        if len(group) >= min_allele_size
    }


def encode_peptide_to_affinity_dict(
        peptide_to_affinity_dict,
        peptide_length=9,
        flatten_binary_encoding=True,
        allow_unknown_amino_acids=True):
    """
    Given a dictionary mapping from peptide sequences to affinity values, return
    both index and binary encodings of fixed length peptides, and
    a vector of their affinities.

    Parameters
    ----------
    peptide_to_affinity_dict : dict
        Keys are peptide strings (of multiple lengths), each mapping to a
        continuous affinity value.

    peptide_length : int
        Length of vector encoding

    flatten_binary_encoding : bool
        Should the binary encoding of a peptide be two-dimensional (9x20)
        or a flattened 1d vector

    allow_unknown_amino_acids : bool
        When extending a short vector to the desired peptide length, should
        we insert every possible amino acid or a designated character "X"
        indicating an unknown amino acid.

    Returns tuple with the following fields:
        - kmer_peptides: fixed length peptide strings
        - original_peptides: variable length peptide strings
        - counts: how many fixed length peptides were made from this original
        - X_index: index encoding of fixed length peptides
        - X_binary: binary encoding of fixed length peptides
        - Y: affinity values associated with original peptides
    """
    raw_peptides = list(sorted(peptide_to_affinity_dict.keys()))
    X_index, kmer_peptides, original_peptides, counts = \
        fixed_length_index_encoding(
            peptides=raw_peptides,
            desired_length=peptide_length,
            start_offset_shorten=0,
            end_offset_shorten=0,
            start_offset_extend=0,
            end_offset_extend=0,
            allow_unknown_amino_acids=allow_unknown_amino_acids)
    n_samples = len(kmer_peptides)
    assert n_samples == len(original_peptides), \
        "Mismatch between # of samples (%d) and # of peptides (%d)" % (
            n_samples, len(original_peptides))
    assert n_samples == len(counts), \
        "Mismatch between # of samples (%d) and # of counts (%d)" % (
            n_samples, len(counts))
    assert n_samples == len(X_index), \
        "Mismatch between # of sample (%d) and index feature vectors (%d)" % (
            n_samples, len(X_index))
    X_index = check_valid_index_encoding_array(X_index, allow_unknown_amino_acids)
    n_indices = 20 + allow_unknown_amino_acids
    X_binary = indices_to_hotshot_encoding(X_index, n_indices=n_indices)
    assert X_binary.shape[0] == X_index.shape[0], \
        ("Mismatch between number of samples for index encoding (%d)"
         " vs. binary encoding (%d)") % (
            X_binary.shape[0], X_index.shape[0])
    if flatten_binary_encoding:
        # collapse 3D input into 2D matrix
        n_binary_features = peptide_length * n_indices
        X_binary = X_binary.reshape((n_samples, n_binary_features))
    # easier to work with counts when they're an array instead of list
    counts = np.array(counts)
    Y = np.array([peptide_to_affinity_dict[p] for p in original_peptides])
    assert n_samples == len(Y), \
        "Mismatch between # peptides %d and # regression outputs %d" % (
            n_samples, len(Y))
    return (kmer_peptides, original_peptides, counts, X_index, X_binary, Y)


def create_allele_data_from_peptide_to_ic50_dict(
        peptide_to_ic50_dict,

@@ -337,8 +119,7 @@ def load_allele_datasets(
        peptide_column_name=None,
        peptide_length_column_name="peptide_length",
        ic50_column_name="meas",
        only_human=False,
        shuffle=True):
        only_human=False):
    """
    Loads an IEDB dataset, extracts "hot-shot" encoding of fixed length peptides
    and log-transforms the IC50 measurement. Returns dictionary mapping allele
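
The docstring removed above defines the regression target as 1.0 - log(ic50)/log(max_ic50), limited to [0, 1]. The short sketch below restates that formula for illustration only; the actual code calls ic50_to_regression_target from mhcflurry.regression_target, and the max_ic50 default of 50000 nM is an assumption rather than something shown in this diff.

    # Illustrative restatement of the regression-output transform described in
    # the removed load_dataframe docstring: 1.0 - log(ic50)/log(max_ic50),
    # clipped to the interval [0, 1].
    import numpy as np

    def regression_target(ic50, max_ic50=50000.0):  # max_ic50 value is assumed
        y = 1.0 - np.log(ic50) / np.log(max_ic50)
        return np.clip(y, 0.0, 1.0)

    print(regression_target(np.array([1.0, 500.0, 50000.0, 100000.0])))
    # Strong binders (low IC50) map near 1.0; IC50 >= max_ic50 maps to 0.0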
mhcflurry/dataset.py  (+26 -1)

@@ -23,7 +23,11 @@ from six import string_types
import pandas as pd
import numpy as np

from .dataset_helpers import prepare_pMHC_affinity_arrays, geometric_mean
from .common import geometric_mean
from .dataset_helpers import (
    prepare_pMHC_affinity_arrays,
    load_dataframe
)


class Dataset(object):

@@ -250,6 +254,27 @@ class Dataset(object):
        return cls.from_allele_to_peptide_to_affinity_dictionary(
            {allele_name: peptide_to_affinity_dict})

    @classmethod
    def from_csv(
            cls,
            filename,
            sep=None,
            allele_column_name=None,
            peptide_column_name=None,
            affinity_column_name=None):
        df, allele_column_name, peptide_column_name, affinity_column_name = \
            load_dataframe(
                filename=filename,
                sep=sep,
                allele_column_name=allele_column_name,
                peptide_column_name=peptide_column_name,
                affinity_column_name=affinity_column_name)
        df = df.rename(columns={
            allele_column_name: "allele",
            peptide_column_name: "peptide",
            affinity_column_name: "affinity"})
        return cls(df)

    def get_allele(self, allele_name):
        """
        Get Dataset for a single allele
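
A usage sketch for the new Dataset.from_csv classmethod added in this hunk. The file name, separator, and allele name below are hypothetical; column names are inferred by load_dataframe from candidates such as "mhc", "sequence", and "meas".

    # Load a binding-affinity CSV into a Dataset via the new from_csv classmethod.
    # "bdata.csv" and the allele name are placeholders for this sketch.
    from mhcflurry.dataset import Dataset

    dataset = Dataset.from_csv("bdata.csv", sep=",")
    # from_csv renames the inferred columns to the canonical "allele",
    # "peptide", and "affinity" names before wrapping the DataFrame.
    single_allele = dataset.get_allele("HLA-A0201")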
mhcflurry/dataset_helpers.py  (+200 -13)

@@ -20,20 +20,14 @@ from __future__ import (
from typechecks import require_instance
import numpy as np
from math import exp, log
import pandas as pd


def geometric_mean(xs, weights=None):
    """
    Geometric mean of a collection of affinity measurements, with optional
    sample weights.
    """
    if len(xs) == 1:
        return xs[0]
    elif weights is None:
        return exp(sum(log(xi) for xi in xs))
    sum_weighted_log = sum(log(xi) * wi for (xi, wi) in zip(xs, weights))
    denom = sum(weights)
    return exp(sum_weighted_log / denom)

from .common import normalize_allele_name
from .peptide_encoding import (
    indices_to_hotshot_encoding,
    fixed_length_index_encoding,
    check_valid_index_encoding_array,
)


def check_pMHC_affinity_arrays(alleles, peptides, affinities, sample_weights):
    """

@@ -82,3 +76,196 @@ def prepare_pMHC_affinity_arrays(alleles, peptides, affinities, sample_weights=N
        affinities=affinities,
        sample_weights=sample_weights)
    return alleles, peptides, affinities, sample_weights


def infer_csv_separator(filename):
    """
    Determine if file is separated by comma, tab, or whitespace.
    Default to whitespace if the others are not detected.

    Returns (sep, delim_whitespace)
    """
    for candidate in [",", "\t"]:
        with open(filename, "r") as f:
            for line in f:
                if line.startswith("#"):
                    continue
                if candidate in line:
                    return candidate, False
    return None, True


def load_dataframe(
        filename,
        sep=None,
        allele_column_name=None,
        peptide_column_name=None,
        affinity_column_name=None,
        filter_peptide_length=None,
        normalize_allele_names=True):
    """
    Load a dataframe of peptide-MHC affinity measurements

    filename : str
        TSV filename with columns:
            - 'species'
            - 'mhc'
            - 'peptide_length'
            - 'sequence'
            - 'meas'

    sep : str, optional
        Separator in CSV file, default is to let Pandas infer

    allele_column_name : str, optional
        Default behavior is to try {"mhc", "allele", "hla"}

    peptide_column_name : str, optional
        Default behavior is to try {"sequence", "peptide", "peptide_sequence"}

    affinity_column_name : str, optional
        Default behavior is to try {"meas", "ic50", "affinity", "aff"}

    filter_peptide_length : int, optional
        Which length peptides to use (default=load all lengths)

    normalize_allele_names : bool
        Normalize MHC names or leave them alone

    Returns:
        - DataFrame
        - peptide column name
        - allele column name
        - affinity column name
    """
    if sep is None:
        sep, delim_whitespace = infer_csv_separator(filename)
    else:
        delim_whitespace = False
    df = pd.read_csv(
        filename,
        sep=sep,
        delim_whitespace=delim_whitespace,
        engine="c")
    columns = set(df.keys())
    if allele_column_name is None:
        for candidate in ["mhc", "allele", "hla"]:
            if candidate in columns:
                allele_column_name = candidate
                break
        if allele_column_name is None:
            raise ValueError(
                "Couldn't find alleles, available columns: %s" % (columns,))
    if peptide_column_name is None:
        for candidate in ["sequence", "peptide", "peptide_sequence"]:
            if candidate in columns:
                peptide_column_name = candidate
                break
        if peptide_column_name is None:
            raise ValueError(
                "Couldn't find peptides, available columns: %s" % (columns,))
    if affinity_column_name is None:
        for candidate in ["meas", "ic50", "affinity"]:
            if candidate in columns:
                affinity_column_name = candidate
                break
        if affinity_column_name is None:
            raise ValueError(
                "Couldn't find affinity values, available columns: %s" % (
                    columns,))
    if filter_peptide_length:
        length_mask = df[peptide_column_name].str.len() == filter_peptide_length
        df = df[length_mask]
    df[allele_column_name] = df[allele_column_name].map(normalize_allele_name)
    return df, allele_column_name, peptide_column_name, affinity_column_name


def encode_peptide_to_affinity_dict(
        peptide_to_affinity_dict,
        peptide_length=9,
        flatten_binary_encoding=True,
        allow_unknown_amino_acids=True):
    """
    Given a dictionary mapping from peptide sequences to affinity values, return
    both index and binary encodings of fixed length peptides, and
    a vector of their affinities.

    Parameters
    ----------
    peptide_to_affinity_dict : dict
        Keys are peptide strings (of multiple lengths), each mapping to a
        continuous affinity value.

    peptide_length : int
        Length of vector encoding

    flatten_binary_encoding : bool
        Should the binary encoding of a peptide be two-dimensional (9x20)
        or a flattened 1d vector

    allow_unknown_amino_acids : bool
        When extending a short vector to the desired peptide length, should
        we insert every possible amino acid or a designated character "X"
        indicating an unknown amino acid.

    Returns tuple with the following fields:
        - kmer_peptides: fixed length peptide strings
        - original_peptides: variable length peptide strings
        - counts: how many fixed length peptides were made from this original
        - X_index: index encoding of fixed length peptides
        - X_binary: binary encoding of fixed length peptides
        - Y: affinity values associated with original peptides
    """
    raw_peptides = list(sorted(peptide_to_affinity_dict.keys()))
    X_index, kmer_peptides, original_peptides, counts = \
        fixed_length_index_encoding(
            peptides=raw_peptides,
            desired_length=peptide_length,
            start_offset_shorten=0,
            end_offset_shorten=0,
            start_offset_extend=0,
            end_offset_extend=0,
            allow_unknown_amino_acids=allow_unknown_amino_acids)
    n_samples = len(kmer_peptides)
    assert n_samples == len(original_peptides), \
        "Mismatch between # of samples (%d) and # of peptides (%d)" % (
            n_samples, len(original_peptides))
    assert n_samples == len(counts), \
        "Mismatch between # of samples (%d) and # of counts (%d)" % (
            n_samples, len(counts))
    assert n_samples == len(X_index), \
        "Mismatch between # of sample (%d) and index feature vectors (%d)" % (
            n_samples, len(X_index))
    X_index = check_valid_index_encoding_array(X_index, allow_unknown_amino_acids)
    n_indices = 20 + allow_unknown_amino_acids
    X_binary = indices_to_hotshot_encoding(X_index, n_indices=n_indices)
    assert X_binary.shape[0] == X_index.shape[0], \
        ("Mismatch between number of samples for index encoding (%d)"
         " vs. binary encoding (%d)") % (
            X_binary.shape[0], X_index.shape[0])
    if flatten_binary_encoding:
        # collapse 3D input into 2D matrix
        n_binary_features = peptide_length * n_indices
        X_binary = X_binary.reshape((n_samples, n_binary_features))
    # easier to work with counts when they're an array instead of list
    counts = np.array(counts)
    Y = np.array([peptide_to_affinity_dict[p] for p in original_peptides])
    assert n_samples == len(Y), \
        "Mismatch between # peptides %d and # regression outputs %d" % (
            n_samples, len(Y))
    return (kmer_peptides, original_peptides, counts, X_index, X_binary, Y)
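
Finally, a sketch of calling the relocated encode_peptide_to_affinity_dict helper directly. The peptide-to-affinity dictionary is invented; the shape comments follow from the code above (peptide_length * n_indices columns, with n_indices = 20 + 1 when unknown amino acids are allowed).

    # Encode an invented peptide-to-affinity dictionary into fixed-length 9-mer
    # index and binary (hotshot) encodings using the helper moved in this commit.
    from mhcflurry.dataset_helpers import encode_peptide_to_affinity_dict

    peptide_to_affinity = {
        "SIINFEKL": 120.0,     # 8-mer: extended to 9-mers with "X" insertions
        "SIINFEKLM": 95.0,     # 9-mer: kept as-is
        "SIINFEKLMQ": 300.0,   # 10-mer: shortened to 9-mers
    }

    kmer_peptides, original_peptides, counts, X_index, X_binary, Y = \
        encode_peptide_to_affinity_dict(
            peptide_to_affinity,
            peptide_length=9,
            flatten_binary_encoding=True,
            allow_unknown_amino_acids=True)

    # Each row of X_index holds 9 amino-acid indices; with unknown amino acids
    # allowed there are 20 + 1 = 21 indices, so the flattened binary encoding
    # has 9 * 21 = 189 columns per k-mer.
    print(X_index.shape, X_binary.shape, Y.shape, counts)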