Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
M
mhc_rank
Manage
Activity
Members
Labels
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Model registry
Operate
Environments
Terraform modules
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Patrick Skillman-Lawrence
mhc_rank
Commits
984120cf
Commit
984120cf
authored
7 years ago
by
Tim O'Donnell
Browse files
Options
Downloads
Patches
Plain Diff
Support BLOSUM62 encoding
parent
ecc169e8
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
mhcflurry/class1_affinity_prediction/class1_neural_network.py
+27
-8
27 additions, 8 deletions
...lurry/class1_affinity_prediction/class1_neural_network.py
mhcflurry/encodable_sequences.py
+98
-75
98 additions, 75 deletions
mhcflurry/encodable_sequences.py
with
125 additions
and
83 deletions
mhcflurry/class1_affinity_prediction/class1_neural_network.py
+
27
−
8
View file @
984120cf
...
...
@@ -16,7 +16,10 @@ from keras.layers.normalization import BatchNormalization
from
mhcflurry.hyperparameters
import
HyperparameterDefaults
from
..encodable_sequences
import
EncodableSequences
from
..encodable_sequences
import
(
EncodableSequences
,
available_vector_encodings
,
vector_encoding_length
)
from
..regression_target
import
to_ic50
,
from_ic50
from
..common
import
random_peptides
,
amino_acid_distribution
...
...
@@ -35,6 +38,7 @@ class Class1NeuralNetwork(object):
network_hyperparameter_defaults
=
HyperparameterDefaults
(
kmer_size
=
15
,
use_embedding
=
False
,
peptide_amino_acid_encoding
=
"
one-hot
"
,
embedding_input_dim
=
21
,
embedding_output_dim
=
8
,
pseudosequence_use_embedding
=
False
,
...
...
@@ -256,20 +260,26 @@ class Class1NeuralNetwork(object):
numpy.array
"""
encoder
=
EncodableSequences
.
create
(
peptides
)
if
self
.
hyperparameters
[
'
use_embedding
'
]:
if
(
self
.
hyperparameters
[
'
use_embedding
'
]
or
self
.
hyperparameters
[
'
peptide_amino_acid_encoding
'
]
==
"
embedding
"
):
encoded
=
encoder
.
variable_length_to_fixed_length_categorical
(
max_length
=
self
.
hyperparameters
[
'
kmer_size
'
],
**
self
.
input_encoding_hyperparameter_defaults
.
subselect
(
self
.
hyperparameters
))
else
:
encoded
=
encoder
.
variable_length_to_fixed_length_one_hot
(
elif
(
self
.
hyperparameters
[
'
peptide_amino_acid_encoding
'
]
in
available_vector_encodings
()):
encoded
=
encoder
.
variable_length_to_fixed_length_vector_encoding
(
self
.
hyperparameters
[
'
peptide_amino_acid_encoding
'
],
max_length
=
self
.
hyperparameters
[
'
kmer_size
'
],
**
self
.
input_encoding_hyperparameter_defaults
.
subselect
(
self
.
hyperparameters
))
else
:
raise
ValueError
(
"
Unsupported peptide_amino_acid_encoding: %s
"
%
self
.
hyperparameters
[
'
peptide_amino_acid_encoding
'
])
assert
len
(
encoded
)
==
len
(
peptides
)
return
encoded
@property
def
supported_peptide_lengths
(
self
):
"""
...
...
@@ -492,7 +502,8 @@ class Class1NeuralNetwork(object):
pseudosequences_input
=
self
.
pseudosequence_to_network_input
(
allele_pseudosequences
)
x_dict
[
'
pseudosequence
'
]
=
pseudosequences_input
(
predictions
,)
=
numpy
.
array
(
self
.
network
(
borrow
=
True
).
predict
(
x_dict
)).
T
(
predictions
,)
=
numpy
.
array
(
self
.
network
(
borrow
=
True
).
predict
(
x_dict
),
dtype
=
"
float64
"
).
T
return
to_ic50
(
predictions
)
def
compile
(
self
):
...
...
@@ -507,6 +518,7 @@ class Class1NeuralNetwork(object):
def
make_network
(
pseudosequence_length
,
kmer_size
,
peptide_amino_acid_encoding
,
use_embedding
,
embedding_input_dim
,
embedding_output_dim
,
...
...
@@ -524,7 +536,7 @@ class Class1NeuralNetwork(object):
"""
Helper function to make a keras network for class1 affinity prediction.
"""
if
use_embedding
:
if
use_embedding
or
peptide_amino_acid_encoding
==
"
embedding
"
:
peptide_input
=
Input
(
shape
=
(
kmer_size
,),
dtype
=
'
int32
'
,
name
=
'
peptide
'
)
current_layer
=
Embedding
(
...
...
@@ -535,7 +547,11 @@ class Class1NeuralNetwork(object):
name
=
"
peptide_embedding
"
)(
peptide_input
)
else
:
peptide_input
=
Input
(
shape
=
(
kmer_size
,
21
),
dtype
=
'
float32
'
,
name
=
'
peptide
'
)
shape
=
(
kmer_size
,
vector_encoding_length
(
peptide_amino_acid_encoding
)),
dtype
=
'
float32
'
,
name
=
'
peptide
'
)
current_layer
=
peptide_input
inputs
=
[
peptide_input
]
...
...
@@ -609,4 +625,7 @@ class Class1NeuralNetwork(object):
inputs
=
inputs
,
outputs
=
[
output
],
name
=
"
predictor
"
)
print
(
"
*** ARCHITECTURE ***
"
)
model
.
summary
()
return
model
This diff is collapsed.
Click to expand it.
mhcflurry/encodable_sequences.py
+
98
−
75
View file @
984120cf
...
...
@@ -22,11 +22,75 @@ import math
import
pandas
import
numpy
from
six
import
StringIO
import
typechecks
from
.
import
amino_acid
AMINO_ACIDS
=
list
(
amino_acid
.
COMMON_AMINO_ACIDS_WITH_UNKNOWN
.
keys
())
BLOSUM62_MATRIX
=
pandas
.
read_table
(
StringIO
(
"""
A R N D C Q E G H I L K M F P S T W Y V X
A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 0
R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 0
N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 0
D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 0
C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 0
Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0
E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 0
G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 0
H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0
I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 0
L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 0
K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0
M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 0
F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 0
P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 0
S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0
T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 0
W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 0
Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 0
V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 0
X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
"""
),
sep
=
'
\s+
'
).
loc
[
AMINO_ACIDS
,
AMINO_ACIDS
]
assert
(
BLOSUM62_MATRIX
==
BLOSUM62_MATRIX
.
T
).
all
().
all
()
ENCODING_DFS
=
{
"
BLOSUM62
"
:
BLOSUM62_MATRIX
,
"
one-hot
"
:
pandas
.
DataFrame
([
[
1
if
i
==
j
else
0
for
i
in
range
(
len
(
AMINO_ACIDS
))]
for
j
in
range
(
len
(
AMINO_ACIDS
))
],
index
=
AMINO_ACIDS
,
columns
=
AMINO_ACIDS
)
}
def
available_vector_encodings
():
"""
Return list of supported amino acid vector encodings.
Returns
-------
list of string
"""
return
list
(
ENCODING_DFS
)
def
vector_encoding_length
(
name
):
"""
Return the length of the given vector encoding.
Parameters
----------
name : string
Returns
-------
int
"""
return
ENCODING_DFS
[
name
].
shape
[
1
]
def
index_encoding
(
sequences
,
letter_to_index_dict
):
"""
...
...
@@ -47,38 +111,23 @@ def index_encoding(sequences, letter_to_index_dict):
return
result
.
values
def
one_hot_encoding
(
index_encoded
,
alphabet_size
):
def
fixed_vectors_encoding
(
sequences
,
letter_to_vector_function
):
"""
Given an n * k array of integers in the range [0, alphabet_size), return
an n * k * alphabet_size array where element (i, k, j) is 1 if element
(i, k) == j in the input array and zero otherwise.
Given a sequence of n strings all of length k, return a n * k * m array where
the (i, j)th element is letter_to_vector_function(sequence[i][j]).
Parameters
----------
index_encoded : numpy.array of integers with shape (n,
k
)
alphabet_size : int
sequences : list of length n of strings of length
k
letter_to_vector_function : function : string -> vector of length m
Returns
-------
numpy.array of integers of shape (n, k, alphabet_size)
numpy.array of integers with shape (n, k, m)
"""
alphabet_size
=
int
(
alphabet_size
)
(
num_sequences
,
sequence_length
)
=
index_encoded
.
shape
result
=
numpy
.
zeros
(
(
num_sequences
,
sequence_length
,
alphabet_size
),
dtype
=
'
int32
'
)
# Transform the index encoded array into an array of indices into the
# flattened result, which we will set to 1.
flattened_indices
=
(
index_encoded
+
(
sequence_length
*
alphabet_size
*
numpy
.
arange
(
num_sequences
)
).
reshape
((
-
1
,
1
))
+
numpy
.
tile
(
numpy
.
arange
(
sequence_length
),
(
num_sequences
,
1
))
*
alphabet_size
)
result
.
put
(
flattened_indices
,
1
)
arr
=
numpy
.
array
([
list
(
s
)
for
s
in
sequences
])
result
=
numpy
.
vectorize
(
letter_to_vector_function
,
signature
=
'
()->(n)
'
)(
arr
)
return
result
...
...
@@ -114,40 +163,6 @@ class EncodableSequences(object):
def
__len__
(
self
):
return
len
(
self
.
sequences
)
def
fixed_length_categorical
(
self
):
"""
Returns a categorical encoding (i.e. integers 0 <= x < 21) of the
sequences, which must already be all the same length.
Returns
-------
numpy.array of integers
"""
cache_key
=
(
"
categorical
"
,)
if
cache_key
not
in
self
.
encoding_cache
:
assert
self
.
fixed_sequence_length
self
.
encoding_cache
[
cache_key
]
=
index_encoding
(
self
.
sequences
,
amino_acid
.
AMINO_ACID_INDEX
)
return
self
.
encoding_cache
[
cache_key
]
def
fixed_length_one_hot
(
self
):
"""
Returns a binary one-hot encoding of the sequences, which must already
be all the same length.
Returns
-------
numpy.array of integers
"""
cache_key
=
(
"
one_hot
"
,)
if
cache_key
not
in
self
.
encoding_cache
:
assert
self
.
fixed_sequence_length
encoded
=
self
.
categorical_encoding
()
result
=
one_hot_encoding
(
encoded
,
alphabet_size
=
len
(
amino_acid
.
AMINO_ACID_INDEX
))
self
.
encoding_cache
[
cache_key
]
=
result
return
self
.
encoding_cache
[
cache_key
]
def
variable_length_to_fixed_length_categorical
(
self
,
left_edge
=
4
,
right_edge
=
4
,
max_length
=
15
):
"""
...
...
@@ -187,8 +202,8 @@ class EncodableSequences(object):
fixed_length_sequences
,
amino_acid
.
AMINO_ACID_INDEX
)
return
self
.
encoding_cache
[
cache_key
]
def
variable_length_to_fixed_length_
one_hot
(
self
,
left_edge
=
4
,
right_edge
=
4
,
max_length
=
15
):
def
variable_length_to_fixed_length_
vector_encoding
(
self
,
vector_encoding_name
,
left_edge
=
4
,
right_edge
=
4
,
max_length
=
15
):
"""
Encode variable-length sequences using a fixed-length encoding designed
for preserving the anchor positions of class I peptides.
...
...
@@ -198,35 +213,43 @@ class EncodableSequences(object):
Parameters
----------
vector_encoding_name : string
How to represent amino acids.
One of
"
BLOSUM62
"
,
"
one-hot
"
, etc. Full list of supported vector
encodings is given by available_vector_encodings().
left_edge : int, size of fixed-position left side
right_edge : int, size of the fixed-position right side
max_length : sequence length of the resulting encoding
Returns
-------
binary
numpy.array with shape (num sequences, max_length,
21)
"""
numpy.array with shape (num sequences, max_length,
m) where m is
vector_encoding_length(vector_encoding_name)
"""
cache_key
=
(
"
fixed_length_one_hot
"
,
"
fixed_length_vector_encoding
"
,
vector_encoding_name
,
left_edge
,
right_edge
,
max_length
)
if
cache_key
not
in
self
.
encoding_cache
:
encoded
=
self
.
variable_length_to_fixed_length_categorical
(
left_edge
=
left_edge
,
right_edge
=
right_edge
,
max_length
=
max_length
)
result
=
one_hot_encoding
(
encoded
,
alphabet_size
=
len
(
amino_acid
.
AMINO_ACID_INDEX
))
assert
result
.
shape
==
(
len
(
self
.
sequences
),
encoded
.
shape
[
1
],
len
(
amino_acid
.
AMINO_ACID_INDEX
))
fixed_length_sequences
=
[
self
.
sequence_to_fixed_length_string
(
sequence
,
left_edge
=
left_edge
,
right_edge
=
right_edge
,
max_length
=
max_length
)
for
sequence
in
self
.
sequences
]
result
=
fixed_vectors_encoding
(
fixed_length_sequences
,
ENCODING_DFS
[
vector_encoding_name
].
loc
.
__getitem__
)
assert
result
.
shape
[
0
]
==
len
(
self
.
sequences
)
self
.
encoding_cache
[
cache_key
]
=
result
return
self
.
encoding_cache
[
cache_key
]
@classmethod
def
sequence_to_fixed_length_string
(
klass
,
sequence
,
left_edge
=
4
,
right_edge
=
4
,
max_length
=
15
):
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment