Skip to content
Snippets Groups Projects
Commit 984120cf authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

Support BLOSUM62 encoding

parent ecc169e8
No related branches found
No related tags found
No related merge requests found
...@@ -16,7 +16,10 @@ from keras.layers.normalization import BatchNormalization ...@@ -16,7 +16,10 @@ from keras.layers.normalization import BatchNormalization
from mhcflurry.hyperparameters import HyperparameterDefaults from mhcflurry.hyperparameters import HyperparameterDefaults
from ..encodable_sequences import EncodableSequences from ..encodable_sequences import (
EncodableSequences,
available_vector_encodings,
vector_encoding_length)
from ..regression_target import to_ic50, from_ic50 from ..regression_target import to_ic50, from_ic50
from ..common import random_peptides, amino_acid_distribution from ..common import random_peptides, amino_acid_distribution
...@@ -35,6 +38,7 @@ class Class1NeuralNetwork(object): ...@@ -35,6 +38,7 @@ class Class1NeuralNetwork(object):
network_hyperparameter_defaults = HyperparameterDefaults( network_hyperparameter_defaults = HyperparameterDefaults(
kmer_size=15, kmer_size=15,
use_embedding=False, use_embedding=False,
peptide_amino_acid_encoding="one-hot",
embedding_input_dim=21, embedding_input_dim=21,
embedding_output_dim=8, embedding_output_dim=8,
pseudosequence_use_embedding=False, pseudosequence_use_embedding=False,
...@@ -256,20 +260,26 @@ class Class1NeuralNetwork(object): ...@@ -256,20 +260,26 @@ class Class1NeuralNetwork(object):
numpy.array numpy.array
""" """
encoder = EncodableSequences.create(peptides) encoder = EncodableSequences.create(peptides)
if self.hyperparameters['use_embedding']: if (self.hyperparameters['use_embedding'] or
self.hyperparameters['peptide_amino_acid_encoding'] == "embedding"):
encoded = encoder.variable_length_to_fixed_length_categorical( encoded = encoder.variable_length_to_fixed_length_categorical(
max_length=self.hyperparameters['kmer_size'], max_length=self.hyperparameters['kmer_size'],
**self.input_encoding_hyperparameter_defaults.subselect( **self.input_encoding_hyperparameter_defaults.subselect(
self.hyperparameters)) self.hyperparameters))
else: elif (
encoded = encoder.variable_length_to_fixed_length_one_hot( self.hyperparameters['peptide_amino_acid_encoding'] in
available_vector_encodings()):
encoded = encoder.variable_length_to_fixed_length_vector_encoding(
self.hyperparameters['peptide_amino_acid_encoding'],
max_length=self.hyperparameters['kmer_size'], max_length=self.hyperparameters['kmer_size'],
**self.input_encoding_hyperparameter_defaults.subselect( **self.input_encoding_hyperparameter_defaults.subselect(
self.hyperparameters)) self.hyperparameters))
else:
raise ValueError("Unsupported peptide_amino_acid_encoding: %s" %
self.hyperparameters['peptide_amino_acid_encoding'])
assert len(encoded) == len(peptides) assert len(encoded) == len(peptides)
return encoded return encoded
@property @property
def supported_peptide_lengths(self): def supported_peptide_lengths(self):
""" """
...@@ -492,7 +502,8 @@ class Class1NeuralNetwork(object): ...@@ -492,7 +502,8 @@ class Class1NeuralNetwork(object):
pseudosequences_input = self.pseudosequence_to_network_input( pseudosequences_input = self.pseudosequence_to_network_input(
allele_pseudosequences) allele_pseudosequences)
x_dict['pseudosequence'] = pseudosequences_input x_dict['pseudosequence'] = pseudosequences_input
(predictions,) = numpy.array(self.network(borrow=True).predict(x_dict)).T (predictions,) = numpy.array(
self.network(borrow=True).predict(x_dict), dtype="float64").T
return to_ic50(predictions) return to_ic50(predictions)
def compile(self): def compile(self):
...@@ -507,6 +518,7 @@ class Class1NeuralNetwork(object): ...@@ -507,6 +518,7 @@ class Class1NeuralNetwork(object):
def make_network( def make_network(
pseudosequence_length, pseudosequence_length,
kmer_size, kmer_size,
peptide_amino_acid_encoding,
use_embedding, use_embedding,
embedding_input_dim, embedding_input_dim,
embedding_output_dim, embedding_output_dim,
...@@ -524,7 +536,7 @@ class Class1NeuralNetwork(object): ...@@ -524,7 +536,7 @@ class Class1NeuralNetwork(object):
""" """
Helper function to make a keras network for class1 affinity prediction. Helper function to make a keras network for class1 affinity prediction.
""" """
if use_embedding: if use_embedding or peptide_amino_acid_encoding == "embedding":
peptide_input = Input( peptide_input = Input(
shape=(kmer_size,), dtype='int32', name='peptide') shape=(kmer_size,), dtype='int32', name='peptide')
current_layer = Embedding( current_layer = Embedding(
...@@ -535,7 +547,11 @@ class Class1NeuralNetwork(object): ...@@ -535,7 +547,11 @@ class Class1NeuralNetwork(object):
name="peptide_embedding")(peptide_input) name="peptide_embedding")(peptide_input)
else: else:
peptide_input = Input( peptide_input = Input(
shape=(kmer_size, 21), dtype='float32', name='peptide') shape=(
kmer_size,
vector_encoding_length(peptide_amino_acid_encoding)),
dtype='float32',
name='peptide')
current_layer = peptide_input current_layer = peptide_input
inputs = [peptide_input] inputs = [peptide_input]
...@@ -609,4 +625,7 @@ class Class1NeuralNetwork(object): ...@@ -609,4 +625,7 @@ class Class1NeuralNetwork(object):
inputs=inputs, inputs=inputs,
outputs=[output], outputs=[output],
name="predictor") name="predictor")
print("*** ARCHITECTURE ***")
model.summary()
return model return model
...@@ -22,11 +22,75 @@ import math ...@@ -22,11 +22,75 @@ import math
import pandas import pandas
import numpy import numpy
from six import StringIO
import typechecks import typechecks
from . import amino_acid from . import amino_acid
AMINO_ACIDS = list(amino_acid.COMMON_AMINO_ACIDS_WITH_UNKNOWN.keys())
BLOSUM62_MATRIX = pandas.read_table(StringIO("""
A R N D C Q E G H I L K M F P S T W Y V X
A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 0
R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 0
N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 0
D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 0
C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 0
Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0
E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 0
G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 0
H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0
I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 0
L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 0
K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0
M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 0
F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 0
P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 0
S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0
T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 0
W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 0
Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 0
V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 0
X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
"""), sep='\s+').loc[AMINO_ACIDS, AMINO_ACIDS]
assert (BLOSUM62_MATRIX == BLOSUM62_MATRIX.T).all().all()
ENCODING_DFS = {
"BLOSUM62": BLOSUM62_MATRIX,
"one-hot": pandas.DataFrame([
[1 if i == j else 0 for i in range(len(AMINO_ACIDS))]
for j in range(len(AMINO_ACIDS))
], index=AMINO_ACIDS, columns=AMINO_ACIDS)
}
def available_vector_encodings():
"""
Return list of supported amino acid vector encodings.
Returns
-------
list of string
"""
return list(ENCODING_DFS)
def vector_encoding_length(name):
"""
Return the length of the given vector encoding.
Parameters
----------
name : string
Returns
-------
int
"""
return ENCODING_DFS[name].shape[1]
def index_encoding(sequences, letter_to_index_dict): def index_encoding(sequences, letter_to_index_dict):
""" """
...@@ -47,38 +111,23 @@ def index_encoding(sequences, letter_to_index_dict): ...@@ -47,38 +111,23 @@ def index_encoding(sequences, letter_to_index_dict):
return result.values return result.values
def one_hot_encoding(index_encoded, alphabet_size): def fixed_vectors_encoding(sequences, letter_to_vector_function):
""" """
Given an n * k array of integers in the range [0, alphabet_size), return Given a sequence of n strings all of length k, return a n * k * m array where
an n * k * alphabet_size array where element (i, k, j) is 1 if element the (i, j)th element is letter_to_vector_function(sequence[i][j]).
(i, k) == j in the input array and zero otherwise.
Parameters Parameters
---------- ----------
index_encoded : numpy.array of integers with shape (n, k) sequences : list of length n of strings of length k
alphabet_size : int letter_to_vector_function : function : string -> vector of length m
Returns Returns
------- -------
numpy.array of integers of shape (n, k, alphabet_size) numpy.array of integers with shape (n, k, m)
""" """
alphabet_size = int(alphabet_size) arr = numpy.array([list(s) for s in sequences])
(num_sequences, sequence_length) = index_encoded.shape result = numpy.vectorize(
result = numpy.zeros( letter_to_vector_function, signature='()->(n)')(arr)
(num_sequences, sequence_length, alphabet_size),
dtype='int32')
# Transform the index encoded array into an array of indices into the
# flattened result, which we will set to 1.
flattened_indices = (
index_encoded +
(
sequence_length * alphabet_size * numpy.arange(num_sequences)
).reshape((-1, 1)) +
numpy.tile(numpy.arange(sequence_length),
(num_sequences, 1)) * alphabet_size)
result.put(flattened_indices, 1)
return result return result
...@@ -114,40 +163,6 @@ class EncodableSequences(object): ...@@ -114,40 +163,6 @@ class EncodableSequences(object):
def __len__(self): def __len__(self):
return len(self.sequences) return len(self.sequences)
def fixed_length_categorical(self):
"""
Returns a categorical encoding (i.e. integers 0 <= x < 21) of the
sequences, which must already be all the same length.
Returns
-------
numpy.array of integers
"""
cache_key = ("categorical",)
if cache_key not in self.encoding_cache:
assert self.fixed_sequence_length
self.encoding_cache[cache_key] = index_encoding(
self.sequences, amino_acid.AMINO_ACID_INDEX)
return self.encoding_cache[cache_key]
def fixed_length_one_hot(self):
"""
Returns a binary one-hot encoding of the sequences, which must already
be all the same length.
Returns
-------
numpy.array of integers
"""
cache_key = ("one_hot",)
if cache_key not in self.encoding_cache:
assert self.fixed_sequence_length
encoded = self.categorical_encoding()
result = one_hot_encoding(
encoded, alphabet_size=len(amino_acid.AMINO_ACID_INDEX))
self.encoding_cache[cache_key] = result
return self.encoding_cache[cache_key]
def variable_length_to_fixed_length_categorical( def variable_length_to_fixed_length_categorical(
self, left_edge=4, right_edge=4, max_length=15): self, left_edge=4, right_edge=4, max_length=15):
""" """
...@@ -187,8 +202,8 @@ class EncodableSequences(object): ...@@ -187,8 +202,8 @@ class EncodableSequences(object):
fixed_length_sequences, amino_acid.AMINO_ACID_INDEX) fixed_length_sequences, amino_acid.AMINO_ACID_INDEX)
return self.encoding_cache[cache_key] return self.encoding_cache[cache_key]
def variable_length_to_fixed_length_one_hot( def variable_length_to_fixed_length_vector_encoding(
self, left_edge=4, right_edge=4, max_length=15): self, vector_encoding_name, left_edge=4, right_edge=4, max_length=15):
""" """
Encode variable-length sequences using a fixed-length encoding designed Encode variable-length sequences using a fixed-length encoding designed
for preserving the anchor positions of class I peptides. for preserving the anchor positions of class I peptides.
...@@ -198,35 +213,43 @@ class EncodableSequences(object): ...@@ -198,35 +213,43 @@ class EncodableSequences(object):
Parameters Parameters
---------- ----------
vector_encoding_name : string
How to represent amino acids.
One of "BLOSUM62", "one-hot", etc. Full list of supported vector
encodings is given by available_vector_encodings().
left_edge : int, size of fixed-position left side left_edge : int, size of fixed-position left side
right_edge : int, size of the fixed-position right side right_edge : int, size of the fixed-position right side
max_length : sequence length of the resulting encoding max_length : sequence length of the resulting encoding
Returns Returns
------- -------
binary numpy.array with shape (num sequences, max_length, 21) numpy.array with shape (num sequences, max_length, m) where m is
""" vector_encoding_length(vector_encoding_name)
"""
cache_key = ( cache_key = (
"fixed_length_one_hot", "fixed_length_vector_encoding",
vector_encoding_name,
left_edge, left_edge,
right_edge, right_edge,
max_length) max_length)
if cache_key not in self.encoding_cache: if cache_key not in self.encoding_cache:
encoded = self.variable_length_to_fixed_length_categorical( fixed_length_sequences = [
left_edge=left_edge, self.sequence_to_fixed_length_string(
right_edge=right_edge, sequence,
max_length=max_length) left_edge=left_edge,
result = one_hot_encoding( right_edge=right_edge,
encoded, alphabet_size=len(amino_acid.AMINO_ACID_INDEX)) max_length=max_length)
assert result.shape == ( for sequence in self.sequences
len(self.sequences), ]
encoded.shape[1], result = fixed_vectors_encoding(
len(amino_acid.AMINO_ACID_INDEX)) fixed_length_sequences,
ENCODING_DFS[vector_encoding_name].loc.__getitem__)
assert result.shape[0] == len(self.sequences)
self.encoding_cache[cache_key] = result self.encoding_cache[cache_key] = result
return self.encoding_cache[cache_key] return self.encoding_cache[cache_key]
@classmethod @classmethod
def sequence_to_fixed_length_string( def sequence_to_fixed_length_string(
klass, sequence, left_edge=4, right_edge=4, max_length=15): klass, sequence, left_edge=4, right_edge=4, max_length=15):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment