Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
mhc_rank
Manage
Activity
Members
Labels
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Patrick Skillman-Lawrence
mhc_rank
Commits
8b01a3aa
Commit
8b01a3aa
authored
7 years ago
by
Tim O'Donnell
Browse files
Options
Downloads
Patches
Plain Diff
update experimental pan-allele infra
parent
b3840422
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
mhcflurry/allele_encoding.py
+71
-0
71 additions, 0 deletions
mhcflurry/allele_encoding.py
mhcflurry/class1_neural_network.py
+147
-75
147 additions, 75 deletions
mhcflurry/class1_neural_network.py
mhcflurry/encodable_sequences.py
+0
-1
0 additions, 1 deletion
mhcflurry/encodable_sequences.py
with
218 additions
and
76 deletions
mhcflurry/allele_encoding.py
0 → 100644
+
71
−
0
View file @
8b01a3aa
import
numpy
import
pandas
from
.encodable_sequences
import
EncodableSequences
from
.
import
amino_acid
class AlleleEncoding(object):
    """
    A place to cache encodings for a (potentially large) sequence of alleles.
    """

    def __init__(self, alleles, allele_to_fixed_length_sequence=None):
        """
        Parameters
        ----------
        alleles : list of string
            Allele names

        allele_to_fixed_length_sequence : dict of str -> str, optional
            Allele name to fixed lengths sequence ("pseudosequence"). If not
            specified, fixed_length_sequences() is unavailable.
        """
        alleles = pandas.Series(alleles)

        # Deduplicate and sort so each allele is encoded once and the
        # allele -> integer index assignment is deterministic.
        all_alleles = sorted(alleles.unique())

        self.allele_to_index = dict(
            (allele, i) for (i, allele) in enumerate(all_alleles))

        # Integer index into all_alleles for each entry of `alleles`.
        self.indices = alleles.map(self.allele_to_index)

        # BUGFIX: stored under a private name. The original implementation
        # assigned this Series to `self.fixed_length_sequences`, which
        # shadowed the method of the same name below, making
        # encoding.fixed_length_sequences("BLOSUM62") raise TypeError.
        if allele_to_fixed_length_sequence is None:
            # Originally an unconditional lookup, which crashed with an
            # obscure TypeError when the default (None) was used.
            self._fixed_length_sequences = None
        else:
            self._fixed_length_sequences = pandas.Series(
                [allele_to_fixed_length_sequence[a] for a in all_alleles],
                index=all_alleles)

        # Maps (encoding kind, vector encoding name) -> encoded array.
        self.encoding_cache = {}

    def fixed_length_sequences(self, vector_encoding_name):
        """
        Encode alleles.

        Parameters
        ----------
        vector_encoding_name : string
            How to represent amino acids.
            One of "BLOSUM62", "one-hot", etc. Full list of supported vector
            encodings is given by available_vector_encodings() in amino_acid.

        Returns
        -------
        numpy.array with shape (num sequences, sequence length, m) where m is
        vector_encoding_length(vector_encoding_name)

        Raises
        ------
        ValueError
            If allele_to_fixed_length_sequence was not given at construction.
        """
        if self._fixed_length_sequences is None:
            raise ValueError(
                "allele_to_fixed_length_sequence was not specified")
        cache_key = ("fixed_length_vector_encoding", vector_encoding_name)
        if cache_key not in self.encoding_cache:
            # NOTE(review): passing raw sequence strings to
            # fixed_vectors_encoding here looks suspect (an index encoding
            # step may be intended first) — confirm against amino_acid's API.
            index_encoded_matrix = amino_acid.fixed_vectors_encoding(
                self._fixed_length_sequences.values,
                amino_acid.COMMON_AMINO_ACIDS_WITH_UNKNOWN)
            vector_encoded = amino_acid.fixed_vectors_encoding(
                index_encoded_matrix,
                amino_acid.ENCODING_DATA_FRAMES[vector_encoding_name])
            # BUGFIX: index the (num alleles, length, m) array directly with
            # numpy. The original wrapped it in pandas.DataFrame, which
            # raises ValueError for arrays with more than 2 dimensions.
            result = numpy.asarray(vector_encoded)[self.indices]
            self.encoding_cache[cache_key] = result
        return self.encoding_cache[cache_key]
This diff is collapsed.
Click to expand it.
mhcflurry/class1_neural_network.py
+
147
−
75
View file @
8b01a3aa
...
...
@@ -27,11 +27,13 @@ class Class1NeuralNetwork(object):
network_hyperparameter_defaults
=
HyperparameterDefaults
(
kmer_size
=
15
,
use_embedding
=
False
,
peptide_amino_acid_encoding
=
"
one-hot
"
,
embedding_input_dim
=
21
,
embedding_output_dim
=
8
,
pseudosequence_use_embedding
=
False
,
allele_dense_layer_sizes
=
[],
peptide_dense_layer_sizes
=
[],
peptide_allele_merge_method
=
"
multiply
"
,
peptide_allele_merge_activation
=
""
,
layer_sizes
=
[
32
],
dense_layer_l1_regularization
=
0.001
,
dense_layer_l2_regularization
=
0.0
,
...
...
@@ -107,9 +109,41 @@ class Class1NeuralNetwork(object):
Combined set of all supported hyperparameters and their default values.
"""
# Mapping of old hyperparameter name -> current name, updated as new
# versions are developed. This gives new code a primitive way to load
# models trained with older code. A value of None means the
# hyperparameter has been dropped entirely.
hyperparameter_renames = {
    "use_embedding": None,
    "pseudosequence_use_embedding": None,
}

@classmethod
def apply_hyperparameter_renames(cls, hyperparameters):
    """
    Rewrite a hyperparameters dict in terms of current hyperparameter names.

    Old names listed in `hyperparameter_renames` are removed in place;
    when a rename target exists the popped value is re-stored under the
    new name, and dropped hyperparameters (target of None) are discarded.

    Parameters
    ----------
    hyperparameters : dict

    Returns
    -------
    dict : updated hyperparameters
    """
    for old_name, new_name in cls.hyperparameter_renames.items():
        if old_name not in hyperparameters:
            continue
        removed_value = hyperparameters.pop(old_name)
        if new_name:
            hyperparameters[new_name] = removed_value
    return hyperparameters
def
__init__
(
self
,
**
hyperparameters
):
self
.
hyperparameters
=
self
.
hyperparameter_defaults
.
with_defaults
(
hyperparameters
)
self
.
apply_hyperparameter_renames
(
hyperparameters
)
)
self
.
_network
=
None
self
.
network_json
=
None
...
...
@@ -278,8 +312,7 @@ class Class1NeuralNetwork(object):
numpy.array
"""
encoder
=
EncodableSequences
.
create
(
peptides
)
if
(
self
.
hyperparameters
[
'
use_embedding
'
]
or
self
.
hyperparameters
[
'
peptide_amino_acid_encoding
'
]
==
"
embedding
"
):
if
(
self
.
hyperparameters
[
'
peptide_amino_acid_encoding
'
]
==
"
embedding
"
):
encoded
=
encoder
.
variable_length_to_fixed_length_categorical
(
max_length
=
self
.
hyperparameters
[
'
kmer_size
'
],
**
self
.
input_encoding_hyperparameter_defaults
.
subselect
(
...
...
@@ -313,33 +346,26 @@ class Class1NeuralNetwork(object):
self
.
hyperparameters
[
'
right_edge
'
],
self
.
hyperparameters
[
'
kmer_size
'
])
def
pseudosequence
_to_network_input
(
self
,
pseudosequences
):
def
allele_encoding
_to_network_input
(
self
,
allele_encoding
):
"""
Encode
pseudosequenc
es to the fixed-length encoding expected by the neural
Encode
allel
es to the fixed-length encoding expected by the neural
network (which depends on the architecture).
Parameters
----------
pseudosequences : EncodableSequences or list of str
ing
allele_encoding : AlleleEncod
ing
Returns
-------
numpy.array
"""
encoder
=
EncodableSequences
.
create
(
pseudosequences
)
if
self
.
hyperparameters
[
'
pseudosequence_use_embedding
'
]:
encoded
=
encoder
.
fixed_length_categorical
()
else
:
raise
NotImplementedError
# encoded = encoder.fixed_length_one_hot()
assert
len
(
encoded
)
==
len
(
pseudosequences
)
return
encoded
return
allele_encoding
.
fixed_length_sequences
(
"
BLOSUM62
"
)
def
fit
(
self
,
peptides
,
affinities
,
allele_
pseudosequences
=
None
,
allele_
encoding
=
None
,
inequalities
=
None
,
sample_weights
=
None
,
shuffle_permutation
=
None
,
...
...
@@ -355,7 +381,7 @@ class Class1NeuralNetwork(object):
affinities : list of float
nM affinities. Must be same length of as peptides.
allele_
pseudosequences : EncodableSequences or list of str
ing, optional
allele_
encoding : AlleleEncod
ing, optional
If not specified, the model will be a single-allele predictor.
inequalities : list of string, each element one of
"
>
"
,
"
<
"
, or
"
=
"
.
...
...
@@ -429,13 +455,12 @@ class Class1NeuralNetwork(object):
x_dict_without_random_negatives
=
{
'
peptide
'
:
peptide_encoding
,
}
pseudosequence_length
=
None
if
allele_pseudosequences
is
not
None
:
pseudosequences_input
=
self
.
pseudosequence_to_network_input
(
allele_pseudosequences
)
pseudosequence_length
=
len
(
pseudosequences_input
[
0
])
x_dict_without_random_negatives
[
'
pseudosequence
'
]
=
(
pseudosequences_input
)
allele_encoding_dims
=
None
if
allele_encoding
is
not
None
:
allele_encoding_input
=
self
.
allele_encoding_to_network_input
(
allele_encoding
)
allele_encoding_dims
=
allele_encoding_input
.
shape
[
1
:]
x_dict_without_random_negatives
[
'
allele
'
]
=
allele_encoding_input
# Shuffle y_values and the contents of x_dict_without_random_negatives
# This ensures different data is used for the test set for early stopping
...
...
@@ -480,7 +505,7 @@ class Class1NeuralNetwork(object):
if
self
.
network
()
is
None
:
self
.
_network
=
self
.
make_network
(
pseudosequence_length
=
pseudosequence_length
,
allele_encoding_dims
=
allele_encoding_dims
,
**
self
.
network_hyperparameter_defaults
.
subselect
(
self
.
hyperparameters
))
self
.
network
().
compile
(
...
...
@@ -537,6 +562,7 @@ class Class1NeuralNetwork(object):
self
.
loss_history
=
collections
.
defaultdict
(
list
)
start
=
time
.
time
()
last_progress_print
=
None
x_dict_with_random_negatives
=
{}
for
i
in
range
(
self
.
hyperparameters
[
'
max_epochs
'
]):
random_negative_peptides_list
=
[]
for
(
length
,
count
)
in
num_random_negative
.
iteritems
():
...
...
@@ -545,21 +571,45 @@ class Class1NeuralNetwork(object):
count
,
length
=
length
,
distribution
=
aa_distribution
))
random_negative_peptides
=
EncodableSequences
.
create
(
random_negative_peptides_list
)
random_negative_peptides_encoding
=
(
self
.
peptides_to_network_input
(
random_negative_peptides_list
))
x_dict_with_random_negatives
=
{
"
peptide
"
:
numpy
.
concatenate
([
random_negative_peptides_encoding
,
peptide_encoding
,
])
if
len
(
random_negative_peptides_encoding
)
>
0
else
peptide_encoding
}
if
pseudosequence_length
:
# TODO: add random pseudosequences for random negative peptides
raise
NotImplementedError
(
"
Allele pseudosequences unsupported with random negatives
"
)
self
.
peptides_to_network_input
(
random_negative_peptides
))
if
not
x_dict_with_random_negatives
:
if
len
(
random_negative_peptides
)
>
0
:
x_dict_with_random_negatives
[
"
peptide
"
]
=
numpy
.
concatenate
([
random_negative_peptides_encoding
,
peptide_encoding
,
])
if
'
allele
'
in
x_dict_without_random_negatives
:
x_dict_with_random_negatives
[
'
allele
'
]
=
numpy
.
concatenate
([
x_dict_without_random_negatives
[
'
allele
'
][
numpy
.
random
.
choice
(
x_dict_without_random_negatives
[
'
allele
'
].
shape
[
0
],
size
=
len
(
random_negative_peptides_list
))],
x_dict_without_random_negatives
[
'
allele
'
]
])
else
:
x_dict_with_random_negatives
=
(
x_dict_without_random_negatives
)
else
:
# Update x_dict_with_random_negatives in place.
# This is more memory efficient than recreating it as above.
if
len
(
random_negative_peptides
)
>
0
:
x_dict_with_random_negatives
[
"
peptide
"
][:
len
(
random_negative_peptides
)]
=
(
random_negative_peptides_encoding
)
if
'
allele
'
in
x_dict_with_random_negatives
:
x_dict_with_random_negatives
[
'
allele
'
][:
len
(
random_negative_peptides
)]
=
(
x_dict_with_random_negatives
[
'
allele
'
][
len
(
random_negative_peptides
)
+
numpy
.
random
.
choice
(
x_dict_with_random_negatives
[
'
allele
'
].
shape
[
0
]
-
len
(
random_negative_peptides
),
size
=
len
(
random_negative_peptides
))
]
)
fit_history
=
self
.
network
().
fit
(
x_dict_with_random_negatives
,
...
...
@@ -610,7 +660,7 @@ class Class1NeuralNetwork(object):
break
self
.
fit_seconds
=
time
.
time
()
-
start
def
predict
(
self
,
peptides
,
allele_
pseudosequences
=
None
,
batch_size
=
4096
):
def
predict
(
self
,
peptides
,
allele_
encoding
=
None
,
batch_size
=
4096
):
"""
Predict affinities
...
...
@@ -618,7 +668,7 @@ class Class1NeuralNetwork(object):
----------
peptides : EncodableSequences or list of string
allele_pseudosequences :
EncodableSequences or list of str
ing, optional
allele_pseudosequences :
AlleleEncod
ing, optional
Only required when this model is a pan-allele model
batch_size : int
...
...
@@ -631,7 +681,7 @@ class Class1NeuralNetwork(object):
x_dict
=
{
'
peptide
'
:
self
.
peptides_to_network_input
(
peptides
)
}
if
allele_
pseudosequences
is
not
None
:
if
allele_
encoding
is
not
None
:
pseudosequences_input
=
self
.
pseudosequence_to_network_input
(
allele_pseudosequences
)
x_dict
[
'
pseudosequence
'
]
=
pseudosequences_input
...
...
@@ -643,13 +693,15 @@ class Class1NeuralNetwork(object):
@staticmethod
def
make_network
(
pseudosequence_length
,
allele_encoding_dims
,
kmer_size
,
peptide_amino_acid_encoding
,
use_embedding
,
embedding_input_dim
,
embedding_output_dim
,
pseudosequence_use_embedding
,
allele_dense_layer_sizes
,
peptide_dense_layer_sizes
,
peptide_allele_merge_method
,
peptide_allele_merge_activation
,
layer_sizes
,
dense_layer_l1_regularization
,
dense_layer_l2_regularization
,
...
...
@@ -673,7 +725,7 @@ class Class1NeuralNetwork(object):
from
keras.layers.embeddings
import
Embedding
from
keras.layers.normalization
import
BatchNormalization
if
use_embedding
or
peptide_amino_acid_encoding
==
"
embedding
"
:
if
peptide_amino_acid_encoding
==
"
embedding
"
:
peptide_input
=
Input
(
shape
=
(
kmer_size
,),
dtype
=
'
int32
'
,
name
=
'
peptide
'
)
current_layer
=
Embedding
(
...
...
@@ -693,6 +745,12 @@ class Class1NeuralNetwork(object):
inputs
=
[
peptide_input
]
kernel_regularizer
=
None
l1
=
dense_layer_l1_regularization
l2
=
dense_layer_l2_regularization
if
l1
>
0
or
l2
>
0
:
kernel_regularizer
=
keras
.
regularizers
.
l1_l2
(
l1
,
l2
)
for
(
i
,
locally_connected_params
)
in
enumerate
(
locally_connected_layers
):
current_layer
=
keras
.
layers
.
LocallyConnected1D
(
name
=
"
lc_%d
"
%
i
,
...
...
@@ -700,6 +758,13 @@ class Class1NeuralNetwork(object):
current_layer
=
Flatten
(
name
=
"
flattened_0
"
)(
current_layer
)
for
(
i
,
layer_size
)
in
enumerate
(
peptide_dense_layer_sizes
):
current_layer
=
Dense
(
layer_size
,
name
=
"
peptide_dense_%d
"
%
i
,
kernel_regularizer
=
kernel_regularizer
,
activation
=
activation
)(
current_layer
)
if
batch_normalization
:
current_layer
=
BatchNormalization
(
name
=
"
batch_norm_early
"
)(
current_layer
)
...
...
@@ -708,37 +773,44 @@ class Class1NeuralNetwork(object):
current_layer
=
Dropout
(
dropout_probability
,
name
=
"
dropout_early
"
)(
current_layer
)
if
pseudosequence_length
:
if
pseudosequence_use_embedding
:
pseudosequence_input
=
Input
(
shape
=
(
pseudosequence_length
,),
dtype
=
'
int32
'
,
name
=
'
pseudosequence
'
)
pseudo_embedding_layer
=
Embedding
(
input_dim
=
embedding_input_dim
,
output_dim
=
embedding_output_dim
,
input_length
=
pseudosequence_length
,
embeddings_initializer
=
embedding_init_method
)(
pseudosequence_input
)
if
allele_encoding_dims
:
allele_input
=
Input
(
shape
=
allele_encoding_dims
,
dtype
=
'
int32
'
,
name
=
'
peptide
'
)
inputs
.
append
(
allele_input
)
allele_embedding_layer
=
Flatten
(
name
=
"
allele_flat
"
)(
allele_input
)
for
(
i
,
layer_size
)
in
enumerate
(
allele_dense_layer_sizes
):
allele_embedding_layer
=
Dense
(
layer_size
,
name
=
"
allele_dense_%d
"
%
i
,
kernel_regularizer
=
kernel_regularizer
,
activation
=
activation
)(
allele_embedding_layer
)
if
peptide_allele_merge_method
==
'
concatenate
'
:
current_layer
=
keras
.
layers
.
concatenate
([
current_layer
,
allele_embedding_layer
],
name
=
"
allele_peptide_merged
"
)
elif
peptide_allele_merge_method
==
'
multiply
'
:
current_layer
=
keras
.
layers
.
multiply
([
current_layer
,
allele_embedding_layer
],
name
=
"
allele_peptide_merged
"
)
current_layer
=
keras
.
layers
.
concatenate
(
[
current_layer
,
allele_embedding_layer
],
name
=
"
concatenated_0
"
)
else
:
pseudosequence_input
=
Input
(
shape
=
(
pseudosequence_length
,
21
),
dtype
=
'
float32
'
,
name
=
'
peptide
'
)
pseudo_embedding_layer
=
pseudosequence_input
inputs
.
append
(
pseudosequence_input
)
pseudo_embedding_layer
=
Flatten
(
name
=
"
flattened_1
"
)(
pseudo_embedding_layer
)
current_layer
=
keras
.
layers
.
concatenate
([
current_layer
,
pseudo_embedding_layer
],
name
=
"
concatenated_0
"
)
raise
ValueError
(
"
Unsupported peptide_allele_encoding_merge_method: %s
"
%
peptide_allele_merge_method
)
if
peptide_allele_merge_activation
:
current_layer
=
keras
.
layers
.
Activation
(
peptide_allele_merge_activation
,
name
=
"
alelle_peptide_merged_%s
"
%
peptide_allele_merge_activation
)(
current_layer
)
for
(
i
,
layer_size
)
in
enumerate
(
layer_sizes
):
kernel_regularizer
=
None
l1
=
dense_layer_l1_regularization
l2
=
dense_layer_l2_regularization
if
l1
>
0
or
l2
>
0
:
kernel_regularizer
=
keras
.
regularizers
.
l1_l2
(
l1
,
l2
)
current_layer
=
Dense
(
layer_size
,
activation
=
activation
,
...
...
This diff is collapsed.
Click to expand it.
mhcflurry/encodable_sequences.py
+
0
−
1
View file @
8b01a3aa
...
...
@@ -104,7 +104,6 @@ class EncodableSequences(object):
-------
numpy.array with shape (num sequences, max_length, m) where m is
vector_encoding_length(vector_encoding_name)
"""
cache_key
=
(
"
fixed_length_vector_encoding
"
,
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment