Commit b09ed3b4 authored 5 years ago by Tim O'Donnell
cleanup hyperparameters
parent d77d099b
Showing 2 changed files with 69 additions and 75 deletions:

mhcflurry/class1_neural_network.py: 34 additions, 58 deletions
mhcflurry/encodable_sequences.py: 35 additions, 17 deletions
mhcflurry/class1_neural_network.py (+34, −58)
@@ -9,7 +9,7 @@ import pandas
 from .hyperparameters import HyperparameterDefaults
-from .encodable_sequences import EncodableSequences
+from .encodable_sequences import EncodableSequences, EncodingError
 from .amino_acid import available_vector_encodings, vector_encoding_length
 from .regression_target import to_ic50, from_ic50
 from .common import random_peptides, amino_acid_distribution
@@ -28,12 +28,15 @@ class Class1NeuralNetwork(object):
     """
     network_hyperparameter_defaults = HyperparameterDefaults(
-        kmer_size=15,
-        peptide_amino_acid_encoding="BLOSUM62",
         allele_amino_acid_encoding="BLOSUM62",
-        embedding_input_dim=21,
-        embedding_output_dim=8,
         allele_dense_layer_sizes=[],
+        peptide_encoding={
+            'vector_encoding_name': 'BLOSUM62',
+            'alignment_method': 'pad_middle',
+            'left_edge': 4,
+            'right_edge': 4,
+            'max_length': 15,
+        },
         peptide_dense_layer_sizes=[],
         peptide_allele_merge_method="multiply",
         peptide_allele_merge_activation="",
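The former top-level peptide-encoding hyperparameters (kmer_size, peptide_amino_acid_encoding, and the embedding_* settings) are consolidated into the single nested peptide_encoding dict. A minimal usage sketch, assuming Class1NeuralNetwork accepts hyperparameter overrides as constructor keyword arguments; the values shown simply restate the defaults above:

    # Illustrative only: the nested peptide_encoding hyperparameter is
    # overridden as a whole dict rather than via separate top-level keys.
    from mhcflurry.class1_neural_network import Class1NeuralNetwork

    model = Class1NeuralNetwork(
        peptide_encoding={
            'vector_encoding_name': 'BLOSUM62',
            'alignment_method': 'pad_middle',
            'left_edge': 4,
            'right_edge': 4,
            'max_length': 15,
        })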
@@ -45,7 +48,6 @@ class Class1NeuralNetwork(object):
         output_activation="sigmoid",
         dropout_probability=0.0,
         batch_normalization=False,
-        embedding_init_method="glorot_uniform",
         locally_connected_layers=[
             {
                 "filters": 8,
@@ -69,15 +71,6 @@ class Class1NeuralNetwork(object):
     used.
     """
-    input_encoding_hyperparameter_defaults = HyperparameterDefaults(
-        alignment_method="pad_middle",
-        left_edge=4,
-        right_edge=4)
-    """
-    Number of amino acid residues that are given fixed positions on the each
-    side in the variable length encoding.
-    """
     fit_hyperparameter_defaults = HyperparameterDefaults(
         max_epochs=500,
         validation_split=0.1,
@@ -110,7 +103,6 @@ class Class1NeuralNetwork(object):
     hyperparameter_defaults = network_hyperparameter_defaults.extend(
         compile_hyperparameter_defaults).extend(
-        input_encoding_hyperparameter_defaults).extend(
         fit_hyperparameter_defaults).extend(
         early_stopping_hyperparameter_defaults).extend(
         miscelaneous_hyperparameter_defaults
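For readers unfamiliar with the chained extend() calls above, here is a minimal sketch of the defaults-merging pattern, assuming extend() simply combines two groups of defaults into one; this is not the library's HyperparameterDefaults implementation:

    # Minimal stand-in for the defaults-merging pattern (names hypothetical).
    class Defaults(object):
        def __init__(self, **defaults):
            self.defaults = dict(defaults)

        def extend(self, other):
            # Return a new collection containing both groups of defaults.
            combined = dict(self.defaults)
            combined.update(other.defaults)
            return Defaults(**combined)

    network = Defaults(allele_amino_acid_encoding="BLOSUM62")
    fit = Defaults(max_epochs=500, validation_split=0.1)
    combined = network.extend(fit)
    assert sorted(combined.defaults) == [
        'allele_amino_acid_encoding', 'max_epochs', 'validation_split']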
@@ -132,6 +124,13 @@ class Class1NeuralNetwork(object):
         "verbose": None,
         "mode": None,
         "take_best_epoch": None,
+        'kmer_size': None,
+        'peptide_amino_acid_encoding': None,
+        'embedding_input_dim': None,
+        'embedding_output_dim': None,
+        'embedding_init_method': None,
+        'left_edge': None,
+        'right_edge': None,
     }

     @classmethod
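The retired hyperparameter names are added to this mapping with None values. The diff does not show how the mapping is consumed, but a table of {old_name: new_name_or_None} entries is typically applied when loading older saved hyperparameters, renaming mapped keys and dropping those mapped to None; a hedged sketch of that idea, with upgrade_hyperparameters being a hypothetical helper:

    # Hypothetical helper: rename keys per the mapping, drop keys mapped to None.
    def upgrade_hyperparameters(hyperparameters, renames):
        upgraded = {}
        for (key, value) in hyperparameters.items():
            if key in renames:
                key = renames[key]
                if key is None:
                    continue  # hyperparameter was removed entirely
            upgraded[key] = value
        return upgraded

    old = {"kmer_size": 15, "max_epochs": 500}
    assert upgrade_hyperparameters(old, {"kmer_size": None}) == {"max_epochs": 500}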
@@ -375,22 +374,8 @@ class Class1NeuralNetwork(object):
         numpy.array
         """
         encoder = EncodableSequences.create(peptides)
-        if (self.hyperparameters['peptide_amino_acid_encoding'] == "embedding"):
-            encoded = encoder.variable_length_to_fixed_length_categorical(
-                max_length=self.hyperparameters['kmer_size'],
-                **self.input_encoding_hyperparameter_defaults.subselect(
-                    self.hyperparameters))
-        elif (self.hyperparameters['peptide_amino_acid_encoding'] in
-                available_vector_encodings()):
-            encoded = encoder.variable_length_to_fixed_length_vector_encoding(
-                self.hyperparameters['peptide_amino_acid_encoding'],
-                max_length=self.hyperparameters['kmer_size'],
-                **self.input_encoding_hyperparameter_defaults.subselect(
-                    self.hyperparameters))
-        else:
-            raise ValueError(
-                "Unsupported peptide_amino_acid_encoding: %s" %
-                self.hyperparameters['peptide_amino_acid_encoding'])
+        encoded = encoder.variable_length_to_fixed_length_vector_encoding(
+            **self.hyperparameters['peptide_encoding'])
         assert len(encoded) == len(peptides)
         return encoded
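With the settings gathered under peptide_encoding, the three-way branch on peptide_amino_acid_encoding collapses into a single call that splats the dict into the encoder. A sketch of the simplified call path, assuming the defaults shown earlier; the peptide list is illustrative:

    from mhcflurry.encodable_sequences import EncodableSequences

    peptide_encoding = {
        'vector_encoding_name': 'BLOSUM62',
        'alignment_method': 'pad_middle',
        'left_edge': 4,
        'right_edge': 4,
        'max_length': 15,
    }
    encoder = EncodableSequences.create(["SIINFEKL", "GILGFVFTL"])
    # **peptide_encoding expands the dict into keyword arguments, equivalent to
    # variable_length_to_fixed_length_vector_encoding(
    #     vector_encoding_name='BLOSUM62', alignment_method='pad_middle', ...)
    encoded = encoder.variable_length_to_fixed_length_vector_encoding(
        **peptide_encoding)
    # Expected shape: (num peptides, max_length, per-residue vector length).
    print(encoded.shape)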
@@ -404,10 +389,16 @@ class Class1NeuralNetwork(object):
         (int, int) tuple
         """
-        return (
-            self.hyperparameters['left_edge'] +
-            self.hyperparameters['right_edge'],
-            self.hyperparameters['kmer_size'])
+        # We currently have an arbitrary hard floor of 5, even if the underlying
+        # peptide encoding supports smaller lengths.
+        #
+        # We empirically find the supported peptide lengths based on the
+        # lengths for which peptides_to_network_input throws ValueError.
+        try:
+            self.peptides_to_network_input([""])
+        except EncodingError as e:
+            return e.supported_peptide_lengths
+        raise RuntimeError("peptides_to_network_input did not raise")

     def allele_encoding_to_network_input(self, allele_encoding):
         """
@@ -799,11 +790,8 @@ class Class1NeuralNetwork(object):
     def make_network(
             self,
-            kmer_size,
+            peptide_encoding,
             allele_amino_acid_encoding,
-            peptide_amino_acid_encoding,
-            embedding_input_dim,
-            embedding_output_dim,
             allele_dense_layer_sizes,
             peptide_dense_layer_sizes,
             peptide_allele_merge_method,
@@ -816,7 +804,6 @@ class Class1NeuralNetwork(object):
             output_activation,
             dropout_probability,
             batch_normalization,
-            embedding_init_method,
             locally_connected_layers,
             allele_representations=None):
         """
@@ -832,23 +819,12 @@ class Class1NeuralNetwork(object):
         from keras.layers.embeddings import Embedding
         from keras.layers.normalization import BatchNormalization

-        if peptide_amino_acid_encoding == "embedding":
-            peptide_input = Input(
-                shape=(kmer_size,), dtype='int32', name='peptide')
-            current_layer = Embedding(
-                input_dim=embedding_input_dim,
-                output_dim=embedding_output_dim,
-                input_length=kmer_size,
-                embeddings_initializer=embedding_init_method,
-                name="peptide_embedding")(peptide_input)
-        else:
-            peptide_input = Input(
-                shape=(
-                    kmer_size,
-                    vector_encoding_length(peptide_amino_acid_encoding)),
-                dtype='float32',
-                name='peptide')
-            current_layer = peptide_input
+        peptide_encoding_shape = self.peptides_to_network_input([]).shape[1:]
+        peptide_input = Input(
+            shape=peptide_encoding_shape,
+            dtype='float32',
+            name='peptide')
+        current_layer = peptide_input

         inputs = [peptide_input]
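Instead of computing the Keras input shape from kmer_size and the encoding's vector length, the network now asks the encoder: encoding an empty batch yields an array whose trailing dimensions are the per-peptide feature shape. A framework-agnostic sketch of that idea (infer_input_shape and the stand-in encoder are hypothetical):

    import numpy

    def infer_input_shape(encode_fn):
        # Encode zero peptides and read the per-peptide feature shape off the
        # result, so the network input always matches the configured encoding.
        return encode_fn([]).shape[1:]

    # Stand-in encoder producing (num peptides, 15, 21) arrays.
    fake_encoder = lambda peptides: numpy.zeros((len(peptides), 15, 21))
    assert infer_input_shape(fake_encoder) == (15, 21)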
mhcflurry/encodable_sequences.py (+35, −17)
@@ -13,6 +13,14 @@ import pandas
 from . import amino_acid

+class EncodingError(ValueError):
+    def __init__(self, message, supported_peptide_lengths):
+        self.supported_peptide_lengths = supported_peptide_lengths
+        ValueError.__init__(
+            self,
+            message + " Supported lengths: %s - %s." % supported_peptide_lengths)
+
+
 class EncodableSequences(object):
     """
     Sequences of amino acids.
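A small detail in the new constructor: % binds tighter than +, so the format expression applies only to the trailing string, and the two-element supported_peptide_lengths tuple fills both %s placeholders. A quick sketch of the resulting message, with illustrative values:

    message = "Sequence 'SIINFEK' (length 7) unsupported."
    supported_peptide_lengths = (8, 15)
    # The tuple feeds both %s placeholders of the trailing format string.
    full = message + " Supported lengths: %s - %s." % supported_peptide_lengths
    assert full == (
        "Sequence 'SIINFEK' (length 7) unsupported. Supported lengths: 8 - 15.")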
@@ -36,7 +44,7 @@ class EncodableSequences(object):
         if not all(isinstance(obj, string_types) for obj in sequences):
             raise ValueError("Sequence of strings is required")
         self.sequences = numpy.array(sequences)
-        lengths = pandas.Series(self.sequences).str.len()
+        lengths = pandas.Series(self.sequences, dtype=numpy.object_).str.len()
         self.min_length = lengths.min()
         self.max_length = lengths.max()
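self.sequences is a numpy array of strings, which numpy stores with a fixed-width unicode dtype; the changed line forces the pandas Series to object dtype before using the .str accessor. The diff does not state the motivation, but a short sketch shows the dtypes involved (values illustrative):

    import numpy
    import pandas

    sequences = numpy.array(["SIINFEKL", "GILGFVFTL"])
    print(sequences.dtype)  # fixed-width unicode, e.g. <U9

    # Force object dtype, as the new code does, then take per-string lengths.
    lengths = pandas.Series(sequences, dtype=numpy.object_).str.len()
    print(lengths.tolist())  # [8, 9]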
@@ -187,26 +195,23 @@ class EncodableSequences(object):
                 shape=(len(sequences), max_length),
                 dtype="int32")

-            df = pandas.DataFrame({"peptide": sequences})
+            df = pandas.DataFrame({"peptide": sequences}, dtype=numpy.object_)
             df["length"] = df.peptide.str.len()

             middle_length = max_length - left_edge - right_edge
+            min_length = left_edge + right_edge

             # For efficiency we handle each supported peptide length using bulk
             # array operations.
             for (length, sub_df) in df.groupby("length"):
-                if length < left_edge + right_edge:
-                    raise ValueError(
-                        "Sequence '%s' (length %d) unsupported: length must be at "
-                        "least %d. There are %d total peptides with this length." % (
-                            sub_df.iloc[0].peptide, length, left_edge + right_edge,
-                            len(sub_df)))
-                if length > max_length:
-                    raise ValueError(
-                        "Sequence '%s' (length %d) unsupported: length must be at "
-                        "most %d. There are %d total peptides with this length." % (
-                            sub_df.iloc[0].peptide, length, max_length,
-                            len(sub_df)))
+                if length < min_length or length > max_length:
+                    raise EncodingError(
+                        "Sequence '%s' (length %d) unsupported. There are %d "
+                        "total peptides with this length." % (
+                            sub_df.iloc[0].peptide, length, len(sub_df)),
+                        supported_peptide_lengths=(min_length, max_length))

                 # Array of shape (num peptides, length) giving fixed-length amino
                 # acid encoding each peptide of the current length.
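The loop follows the bulk-encoding pattern noted in the comment: group peptides by length, convert each same-length group to an index matrix in one numpy operation, and write it into the preallocated result. A simplified, self-contained sketch of that pattern; the amino-acid index and the plain left-aligned fill are illustrative, not the library's pad_middle layout:

    import numpy
    import pandas

    # Hypothetical index; mhcflurry's amino_acid.AMINO_ACID_INDEX plays this role.
    AMINO_ACID_INDEX = {c: i for (i, c) in enumerate("ACDEFGHIKLMNPQRSTVWYX")}

    def indices_by_length(sequences, max_length=15):
        # Preallocate, filled with the X (null amino acid) index.
        result = numpy.full(
            fill_value=AMINO_ACID_INDEX['X'],
            shape=(len(sequences), max_length),
            dtype="int32")
        df = pandas.DataFrame({"peptide": sequences}, dtype=numpy.object_)
        for (length, sub_df) in df.groupby(df.peptide.str.len()):
            # One bulk conversion per peptide length.
            fixed = numpy.stack(
                sub_df.peptide.map(
                    lambda s: numpy.array(
                        [AMINO_ACID_INDEX[char] for char in s])).values)
            result[sub_df.index, :length] = fixed  # left-aligned fill
        return result

    print(indices_by_length(["SIINFEKL", "GILGFVFTL"]).shape)  # (2, 15)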
@@ -240,17 +245,30 @@ class EncodableSequences(object):
                     -right_edge:] = fixed_length_sequences[:, -right_edge:]
         elif alignment_method == "left_pad_right_pad":
+            # We arbitrarily set a minimum length of 5, although this encoding
+            # could handle smaller peptides.
+            min_length = 5
+
             # Result array is int32, filled with X (null amino acid) value.
             result = numpy.full(
                 fill_value=amino_acid.AMINO_ACID_INDEX['X'],
                 shape=(len(sequences), max_length * 2),
                 dtype="int32")
-            df = pandas.DataFrame({"peptide": sequences})
+            df = pandas.DataFrame({"peptide": sequences}, dtype=numpy.object_)

             # For efficiency we handle each supported peptide length using bulk
             # array operations.
             for (length, sub_df) in df.groupby(df.peptide.str.len()):
-                # Array of shape (num peptides, length) giving fixed-length amino
-                # acid encoding each peptide of the current length.
+                if length < min_length or length > max_length:
+                    raise EncodingError(
+                        "Sequence '%s' (length %d) unsupported. There are %d "
+                        "total peptides with this length." % (
+                            sub_df.iloc[0].peptide, length, len(sub_df)),
+                        supported_peptide_lengths=(min_length, max_length))
+
+                # Array of shape (num peptides, length) giving fixed-length
+                # amino acid encoding each peptide of the current length.
                 fixed_length_sequences = numpy.stack(
                     sub_df.peptide.map(
                         lambda s: numpy.array([
                             amino_acid.AMINO_ACID_INDEX[char] for char in
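Together with the pad_middle branch earlier in this file, each alignment method now reports a supported length range through EncodingError: pad_middle supports left_edge + right_edge through max_length, while left_pad_right_pad hard-codes a floor of 5. A summary sketch (not library code) using the default hyperparameters:

    # Supported peptide-length ranges implied by the two branches, with the
    # defaults left_edge=4, right_edge=4, max_length=15.
    def supported_lengths(alignment_method, left_edge=4, right_edge=4,
                          max_length=15):
        if alignment_method == "pad_middle":
            return (left_edge + right_edge, max_length)
        elif alignment_method == "left_pad_right_pad":
            return (5, max_length)  # arbitrary floor of 5, per the comment above
        raise ValueError("Unknown alignment method: %s" % alignment_method)

    assert supported_lengths("pad_middle") == (8, 15)
    assert supported_lengths("left_pad_right_pad") == (5, 15)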