Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
mhc_rank
Manage
Activity
Members
Labels
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Patrick Skillman-Lawrence
mhc_rank
Commits
984120cf
Commit
984120cf
authored
7 years ago
by
Tim O'Donnell
Browse files
Options
Downloads
Patches
Plain Diff
Support BLOSUM62 encoding
parent
ecc169e8
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
mhcflurry/class1_affinity_prediction/class1_neural_network.py
+27
-8
27 additions, 8 deletions
...lurry/class1_affinity_prediction/class1_neural_network.py
mhcflurry/encodable_sequences.py
+98
-75
98 additions, 75 deletions
mhcflurry/encodable_sequences.py
with
125 additions
and
83 deletions
mhcflurry/class1_affinity_prediction/class1_neural_network.py
+
27
−
8
View file @
984120cf
...
@@ -16,7 +16,10 @@ from keras.layers.normalization import BatchNormalization
...
@@ -16,7 +16,10 @@ from keras.layers.normalization import BatchNormalization
from
mhcflurry.hyperparameters
import
HyperparameterDefaults
from
mhcflurry.hyperparameters
import
HyperparameterDefaults
from
..encodable_sequences
import
EncodableSequences
from
..encodable_sequences
import
(
EncodableSequences
,
available_vector_encodings
,
vector_encoding_length
)
from
..regression_target
import
to_ic50
,
from_ic50
from
..regression_target
import
to_ic50
,
from_ic50
from
..common
import
random_peptides
,
amino_acid_distribution
from
..common
import
random_peptides
,
amino_acid_distribution
...
@@ -35,6 +38,7 @@ class Class1NeuralNetwork(object):
...
@@ -35,6 +38,7 @@ class Class1NeuralNetwork(object):
network_hyperparameter_defaults
=
HyperparameterDefaults
(
network_hyperparameter_defaults
=
HyperparameterDefaults
(
kmer_size
=
15
,
kmer_size
=
15
,
use_embedding
=
False
,
use_embedding
=
False
,
peptide_amino_acid_encoding
=
"
one-hot
"
,
embedding_input_dim
=
21
,
embedding_input_dim
=
21
,
embedding_output_dim
=
8
,
embedding_output_dim
=
8
,
pseudosequence_use_embedding
=
False
,
pseudosequence_use_embedding
=
False
,
...
@@ -256,20 +260,26 @@ class Class1NeuralNetwork(object):
...
@@ -256,20 +260,26 @@ class Class1NeuralNetwork(object):
numpy.array
numpy.array
"""
"""
encoder
=
EncodableSequences
.
create
(
peptides
)
encoder
=
EncodableSequences
.
create
(
peptides
)
if
self
.
hyperparameters
[
'
use_embedding
'
]:
if
(
self
.
hyperparameters
[
'
use_embedding
'
]
or
self
.
hyperparameters
[
'
peptide_amino_acid_encoding
'
]
==
"
embedding
"
):
encoded
=
encoder
.
variable_length_to_fixed_length_categorical
(
encoded
=
encoder
.
variable_length_to_fixed_length_categorical
(
max_length
=
self
.
hyperparameters
[
'
kmer_size
'
],
max_length
=
self
.
hyperparameters
[
'
kmer_size
'
],
**
self
.
input_encoding_hyperparameter_defaults
.
subselect
(
**
self
.
input_encoding_hyperparameter_defaults
.
subselect
(
self
.
hyperparameters
))
self
.
hyperparameters
))
else
:
elif
(
encoded
=
encoder
.
variable_length_to_fixed_length_one_hot
(
self
.
hyperparameters
[
'
peptide_amino_acid_encoding
'
]
in
available_vector_encodings
()):
encoded
=
encoder
.
variable_length_to_fixed_length_vector_encoding
(
self
.
hyperparameters
[
'
peptide_amino_acid_encoding
'
],
max_length
=
self
.
hyperparameters
[
'
kmer_size
'
],
max_length
=
self
.
hyperparameters
[
'
kmer_size
'
],
**
self
.
input_encoding_hyperparameter_defaults
.
subselect
(
**
self
.
input_encoding_hyperparameter_defaults
.
subselect
(
self
.
hyperparameters
))
self
.
hyperparameters
))
else
:
raise
ValueError
(
"
Unsupported peptide_amino_acid_encoding: %s
"
%
self
.
hyperparameters
[
'
peptide_amino_acid_encoding
'
])
assert
len
(
encoded
)
==
len
(
peptides
)
assert
len
(
encoded
)
==
len
(
peptides
)
return
encoded
return
encoded
@property
@property
def
supported_peptide_lengths
(
self
):
def
supported_peptide_lengths
(
self
):
"""
"""
...
@@ -492,7 +502,8 @@ class Class1NeuralNetwork(object):
...
@@ -492,7 +502,8 @@ class Class1NeuralNetwork(object):
pseudosequences_input
=
self
.
pseudosequence_to_network_input
(
pseudosequences_input
=
self
.
pseudosequence_to_network_input
(
allele_pseudosequences
)
allele_pseudosequences
)
x_dict
[
'
pseudosequence
'
]
=
pseudosequences_input
x_dict
[
'
pseudosequence
'
]
=
pseudosequences_input
(
predictions
,)
=
numpy
.
array
(
self
.
network
(
borrow
=
True
).
predict
(
x_dict
)).
T
(
predictions
,)
=
numpy
.
array
(
self
.
network
(
borrow
=
True
).
predict
(
x_dict
),
dtype
=
"
float64
"
).
T
return
to_ic50
(
predictions
)
return
to_ic50
(
predictions
)
def
compile
(
self
):
def
compile
(
self
):
...
@@ -507,6 +518,7 @@ class Class1NeuralNetwork(object):
...
@@ -507,6 +518,7 @@ class Class1NeuralNetwork(object):
def
make_network
(
def
make_network
(
pseudosequence_length
,
pseudosequence_length
,
kmer_size
,
kmer_size
,
peptide_amino_acid_encoding
,
use_embedding
,
use_embedding
,
embedding_input_dim
,
embedding_input_dim
,
embedding_output_dim
,
embedding_output_dim
,
...
@@ -524,7 +536,7 @@ class Class1NeuralNetwork(object):
...
@@ -524,7 +536,7 @@ class Class1NeuralNetwork(object):
"""
"""
Helper function to make a keras network for class1 affinity prediction.
Helper function to make a keras network for class1 affinity prediction.
"""
"""
if
use_embedding
:
if
use_embedding
or
peptide_amino_acid_encoding
==
"
embedding
"
:
peptide_input
=
Input
(
peptide_input
=
Input
(
shape
=
(
kmer_size
,),
dtype
=
'
int32
'
,
name
=
'
peptide
'
)
shape
=
(
kmer_size
,),
dtype
=
'
int32
'
,
name
=
'
peptide
'
)
current_layer
=
Embedding
(
current_layer
=
Embedding
(
...
@@ -535,7 +547,11 @@ class Class1NeuralNetwork(object):
...
@@ -535,7 +547,11 @@ class Class1NeuralNetwork(object):
name
=
"
peptide_embedding
"
)(
peptide_input
)
name
=
"
peptide_embedding
"
)(
peptide_input
)
else
:
else
:
peptide_input
=
Input
(
peptide_input
=
Input
(
shape
=
(
kmer_size
,
21
),
dtype
=
'
float32
'
,
name
=
'
peptide
'
)
shape
=
(
kmer_size
,
vector_encoding_length
(
peptide_amino_acid_encoding
)),
dtype
=
'
float32
'
,
name
=
'
peptide
'
)
current_layer
=
peptide_input
current_layer
=
peptide_input
inputs
=
[
peptide_input
]
inputs
=
[
peptide_input
]
...
@@ -609,4 +625,7 @@ class Class1NeuralNetwork(object):
...
@@ -609,4 +625,7 @@ class Class1NeuralNetwork(object):
inputs
=
inputs
,
inputs
=
inputs
,
outputs
=
[
output
],
outputs
=
[
output
],
name
=
"
predictor
"
)
name
=
"
predictor
"
)
print
(
"
*** ARCHITECTURE ***
"
)
model
.
summary
()
return
model
return
model
This diff is collapsed.
Click to expand it.
mhcflurry/encodable_sequences.py
+
98
−
75
View file @
984120cf
...
@@ -22,11 +22,75 @@ import math
...
@@ -22,11 +22,75 @@ import math
import
pandas
import
pandas
import
numpy
import
numpy
from
six
import
StringIO
import
typechecks
import
typechecks
from
.
import
amino_acid
from
.
import
amino_acid
AMINO_ACIDS
=
list
(
amino_acid
.
COMMON_AMINO_ACIDS_WITH_UNKNOWN
.
keys
())
BLOSUM62_MATRIX
=
pandas
.
read_table
(
StringIO
(
"""
A R N D C Q E G H I L K M F P S T W Y V X
A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 0
R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 0
N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 0
D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 0
C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 0
Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0
E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 0
G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 0
H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0
I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 0
L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 0
K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0
M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 0
F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 0
P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 0
S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0
T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 0
W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 0
Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 0
V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 0
X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
"""
),
sep
=
'
\s+
'
).
loc
[
AMINO_ACIDS
,
AMINO_ACIDS
]
assert
(
BLOSUM62_MATRIX
==
BLOSUM62_MATRIX
.
T
).
all
().
all
()
ENCODING_DFS
=
{
"
BLOSUM62
"
:
BLOSUM62_MATRIX
,
"
one-hot
"
:
pandas
.
DataFrame
([
[
1
if
i
==
j
else
0
for
i
in
range
(
len
(
AMINO_ACIDS
))]
for
j
in
range
(
len
(
AMINO_ACIDS
))
],
index
=
AMINO_ACIDS
,
columns
=
AMINO_ACIDS
)
}
def
available_vector_encodings
():
"""
Return list of supported amino acid vector encodings.
Returns
-------
list of string
"""
return
list
(
ENCODING_DFS
)
def
vector_encoding_length
(
name
):
"""
Return the length of the given vector encoding.
Parameters
----------
name : string
Returns
-------
int
"""
return
ENCODING_DFS
[
name
].
shape
[
1
]
def
index_encoding
(
sequences
,
letter_to_index_dict
):
def
index_encoding
(
sequences
,
letter_to_index_dict
):
"""
"""
...
@@ -47,38 +111,23 @@ def index_encoding(sequences, letter_to_index_dict):
...
@@ -47,38 +111,23 @@ def index_encoding(sequences, letter_to_index_dict):
return
result
.
values
return
result
.
values
def
one_hot_encoding
(
index_encoded
,
alphabet_size
):
def
fixed_vectors_encoding
(
sequences
,
letter_to_vector_function
):
"""
"""
Given an n * k array of integers in the range [0, alphabet_size), return
Given a sequence of n strings all of length k, return a n * k * m array where
an n * k * alphabet_size array where element (i, k, j) is 1 if element
the (i, j)th element is letter_to_vector_function(sequence[i][j]).
(i, k) == j in the input array and zero otherwise.
Parameters
Parameters
----------
----------
index_encoded : numpy.array of integers with shape (n,
k
)
sequences : list of length n of strings of length
k
alphabet_size : int
letter_to_vector_function : function : string -> vector of length m
Returns
Returns
-------
-------
numpy.array of integers of shape (n, k, alphabet_size)
numpy.array of integers with shape (n, k, m)
"""
"""
alphabet_size
=
int
(
alphabet_size
)
arr
=
numpy
.
array
([
list
(
s
)
for
s
in
sequences
])
(
num_sequences
,
sequence_length
)
=
index_encoded
.
shape
result
=
numpy
.
vectorize
(
result
=
numpy
.
zeros
(
letter_to_vector_function
,
signature
=
'
()->(n)
'
)(
arr
)
(
num_sequences
,
sequence_length
,
alphabet_size
),
dtype
=
'
int32
'
)
# Transform the index encoded array into an array of indices into the
# flattened result, which we will set to 1.
flattened_indices
=
(
index_encoded
+
(
sequence_length
*
alphabet_size
*
numpy
.
arange
(
num_sequences
)
).
reshape
((
-
1
,
1
))
+
numpy
.
tile
(
numpy
.
arange
(
sequence_length
),
(
num_sequences
,
1
))
*
alphabet_size
)
result
.
put
(
flattened_indices
,
1
)
return
result
return
result
...
@@ -114,40 +163,6 @@ class EncodableSequences(object):
...
@@ -114,40 +163,6 @@ class EncodableSequences(object):
def
__len__
(
self
):
def
__len__
(
self
):
return
len
(
self
.
sequences
)
return
len
(
self
.
sequences
)
def
fixed_length_categorical
(
self
):
"""
Returns a categorical encoding (i.e. integers 0 <= x < 21) of the
sequences, which must already be all the same length.
Returns
-------
numpy.array of integers
"""
cache_key
=
(
"
categorical
"
,)
if
cache_key
not
in
self
.
encoding_cache
:
assert
self
.
fixed_sequence_length
self
.
encoding_cache
[
cache_key
]
=
index_encoding
(
self
.
sequences
,
amino_acid
.
AMINO_ACID_INDEX
)
return
self
.
encoding_cache
[
cache_key
]
def
fixed_length_one_hot
(
self
):
"""
Returns a binary one-hot encoding of the sequences, which must already
be all the same length.
Returns
-------
numpy.array of integers
"""
cache_key
=
(
"
one_hot
"
,)
if
cache_key
not
in
self
.
encoding_cache
:
assert
self
.
fixed_sequence_length
encoded
=
self
.
categorical_encoding
()
result
=
one_hot_encoding
(
encoded
,
alphabet_size
=
len
(
amino_acid
.
AMINO_ACID_INDEX
))
self
.
encoding_cache
[
cache_key
]
=
result
return
self
.
encoding_cache
[
cache_key
]
def
variable_length_to_fixed_length_categorical
(
def
variable_length_to_fixed_length_categorical
(
self
,
left_edge
=
4
,
right_edge
=
4
,
max_length
=
15
):
self
,
left_edge
=
4
,
right_edge
=
4
,
max_length
=
15
):
"""
"""
...
@@ -187,8 +202,8 @@ class EncodableSequences(object):
...
@@ -187,8 +202,8 @@ class EncodableSequences(object):
fixed_length_sequences
,
amino_acid
.
AMINO_ACID_INDEX
)
fixed_length_sequences
,
amino_acid
.
AMINO_ACID_INDEX
)
return
self
.
encoding_cache
[
cache_key
]
return
self
.
encoding_cache
[
cache_key
]
def
variable_length_to_fixed_length_
one_hot
(
def
variable_length_to_fixed_length_
vector_encoding
(
self
,
left_edge
=
4
,
right_edge
=
4
,
max_length
=
15
):
self
,
vector_encoding_name
,
left_edge
=
4
,
right_edge
=
4
,
max_length
=
15
):
"""
"""
Encode variable-length sequences using a fixed-length encoding designed
Encode variable-length sequences using a fixed-length encoding designed
for preserving the anchor positions of class I peptides.
for preserving the anchor positions of class I peptides.
...
@@ -198,35 +213,43 @@ class EncodableSequences(object):
...
@@ -198,35 +213,43 @@ class EncodableSequences(object):
Parameters
Parameters
----------
----------
vector_encoding_name : string
How to represent amino acids.
One of
"
BLOSUM62
"
,
"
one-hot
"
, etc. Full list of supported vector
encodings is given by available_vector_encodings().
left_edge : int, size of fixed-position left side
left_edge : int, size of fixed-position left side
right_edge : int, size of the fixed-position right side
right_edge : int, size of the fixed-position right side
max_length : sequence length of the resulting encoding
max_length : sequence length of the resulting encoding
Returns
Returns
-------
-------
binary
numpy.array with shape (num sequences, max_length,
21)
numpy.array with shape (num sequences, max_length,
m) where m is
"""
vector_encoding_length(vector_encoding_name)
"""
cache_key
=
(
cache_key
=
(
"
fixed_length_one_hot
"
,
"
fixed_length_vector_encoding
"
,
vector_encoding_name
,
left_edge
,
left_edge
,
right_edge
,
right_edge
,
max_length
)
max_length
)
if
cache_key
not
in
self
.
encoding_cache
:
if
cache_key
not
in
self
.
encoding_cache
:
encoded
=
self
.
variable_length_to_fixed_length_categorical
(
fixed_length_sequences
=
[
left_edge
=
left_edge
,
self
.
sequence_to_fixed_length_string
(
right_edge
=
right_edge
,
sequence
,
max_length
=
max_length
)
left_edge
=
left_edge
,
result
=
one_hot_encoding
(
right_edge
=
right_edge
,
encoded
,
alphabet_size
=
len
(
amino_acid
.
AMINO_ACID_INDEX
))
max_length
=
max_length
)
assert
result
.
shape
==
(
for
sequence
in
self
.
sequences
len
(
self
.
sequences
),
]
encoded
.
shape
[
1
],
result
=
fixed_vectors_encoding
(
len
(
amino_acid
.
AMINO_ACID_INDEX
))
fixed_length_sequences
,
ENCODING_DFS
[
vector_encoding_name
].
loc
.
__getitem__
)
assert
result
.
shape
[
0
]
==
len
(
self
.
sequences
)
self
.
encoding_cache
[
cache_key
]
=
result
self
.
encoding_cache
[
cache_key
]
=
result
return
self
.
encoding_cache
[
cache_key
]
return
self
.
encoding_cache
[
cache_key
]
@classmethod
@classmethod
def
sequence_to_fixed_length_string
(
def
sequence_to_fixed_length_string
(
klass
,
sequence
,
left_edge
=
4
,
right_edge
=
4
,
max_length
=
15
):
klass
,
sequence
,
left_edge
=
4
,
right_edge
=
4
,
max_length
=
15
):
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment