Commit 984120cf authored by Tim O'Donnell

Support BLOSUM62 encoding

parent ecc169e8
@@ -16,7 +16,10 @@ from keras.layers.normalization import BatchNormalization
from mhcflurry.hyperparameters import HyperparameterDefaults
from ..encodable_sequences import EncodableSequences
from ..encodable_sequences import (
EncodableSequences,
available_vector_encodings,
vector_encoding_length)
from ..regression_target import to_ic50, from_ic50
from ..common import random_peptides, amino_acid_distribution
@@ -35,6 +38,7 @@ class Class1NeuralNetwork(object):
network_hyperparameter_defaults = HyperparameterDefaults(
kmer_size=15,
use_embedding=False,
peptide_amino_acid_encoding="one-hot",
embedding_input_dim=21,
embedding_output_dim=8,
pseudosequence_use_embedding=False,
@@ -256,20 +260,26 @@ class Class1NeuralNetwork(object):
numpy.array
"""
encoder = EncodableSequences.create(peptides)
if self.hyperparameters['use_embedding']:
if (self.hyperparameters['use_embedding'] or
self.hyperparameters['peptide_amino_acid_encoding'] == "embedding"):
encoded = encoder.variable_length_to_fixed_length_categorical(
max_length=self.hyperparameters['kmer_size'],
**self.input_encoding_hyperparameter_defaults.subselect(
self.hyperparameters))
else:
encoded = encoder.variable_length_to_fixed_length_one_hot(
elif (
self.hyperparameters['peptide_amino_acid_encoding'] in
available_vector_encodings()):
encoded = encoder.variable_length_to_fixed_length_vector_encoding(
self.hyperparameters['peptide_amino_acid_encoding'],
max_length=self.hyperparameters['kmer_size'],
**self.input_encoding_hyperparameter_defaults.subselect(
self.hyperparameters))
else:
raise ValueError("Unsupported peptide_amino_acid_encoding: %s" %
self.hyperparameters['peptide_amino_acid_encoding'])
assert len(encoded) == len(peptides)
return encoded
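
The hunk above makes peptides_to_network_input dispatch on the new peptide_amino_acid_encoding hyperparameter: "embedding" (or the legacy use_embedding flag) produces integer indices for an Embedding layer, while any name returned by available_vector_encodings() produces precomputed per-residue vectors. A rough sketch of the two output shapes, assuming the module is importable as mhcflurry.encodable_sequences (consistent with the relative imports above) and the default kmer_size of 15:

from mhcflurry.encodable_sequences import EncodableSequences

encoder = EncodableSequences.create(["SIINFEKL", "SIINFEKLL"])

# "embedding" branch: integer indices, shape (2, 15)
categorical = encoder.variable_length_to_fixed_length_categorical(
    max_length=15)

# vector-encoding branch: per-residue vectors, shape (2, 15, 21)
vectors = encoder.variable_length_to_fixed_length_vector_encoding(
    "BLOSUM62", max_length=15)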
@property
def supported_peptide_lengths(self):
"""
@@ -492,7 +502,8 @@ class Class1NeuralNetwork(object):
pseudosequences_input = self.pseudosequence_to_network_input(
allele_pseudosequences)
x_dict['pseudosequence'] = pseudosequences_input
(predictions,) = numpy.array(self.network(borrow=True).predict(x_dict)).T
(predictions,) = numpy.array(
self.network(borrow=True).predict(x_dict), dtype="float64").T
return to_ic50(predictions)
def compile(self):
@@ -507,6 +518,7 @@ class Class1NeuralNetwork(object):
def make_network(
pseudosequence_length,
kmer_size,
peptide_amino_acid_encoding,
use_embedding,
embedding_input_dim,
embedding_output_dim,
@@ -524,7 +536,7 @@ class Class1NeuralNetwork(object):
"""
Helper function to make a keras network for class1 affinity prediction.
"""
if use_embedding:
if use_embedding or peptide_amino_acid_encoding == "embedding":
peptide_input = Input(
shape=(kmer_size,), dtype='int32', name='peptide')
current_layer = Embedding(
@@ -535,7 +547,11 @@ class Class1NeuralNetwork(object):
name="peptide_embedding")(peptide_input)
else:
peptide_input = Input(
shape=(kmer_size, 21), dtype='float32', name='peptide')
shape=(
kmer_size,
vector_encoding_length(peptide_amino_acid_encoding)),
dtype='float32',
name='peptide')
current_layer = peptide_input
inputs = [peptide_input]
@@ -609,4 +625,7 @@ class Class1NeuralNetwork(object):
inputs=inputs,
outputs=[output],
name="predictor")
print("*** ARCHITECTURE ***")
model.summary()
return model
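
For reference, a condensed sketch of the two peptide input heads the revised make_network can build (Keras functional API as used in the diff; downstream layers elided, kmer_size fixed to 15 for illustration):

from keras.layers import Input

kmer_size = 15

# peptide_amino_acid_encoding == "embedding" (or use_embedding): integer
# indices go in; an Embedding layer maps them to dense vectors in-network.
embedding_head = Input(shape=(kmer_size,), dtype='int32', name='peptide')

# Any vector encoding ("one-hot", "BLOSUM62", ...): residues arrive as dense
# vectors, so the input gains a trailing dimension of
# vector_encoding_length(...), which is 21 for both encodings defined below.
vector_head = Input(shape=(kmer_size, 21), dtype='float32', name='peptide')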
@@ -22,11 +22,75 @@ import math
import pandas
import numpy
from six import StringIO
import typechecks
from . import amino_acid
AMINO_ACIDS = list(amino_acid.COMMON_AMINO_ACIDS_WITH_UNKNOWN.keys())
BLOSUM62_MATRIX = pandas.read_table(StringIO("""
A R N D C Q E G H I L K M F P S T W Y V X
A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 0
R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 0
N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 0
D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 0
C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 0
Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0
E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 0
G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 0
H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0
I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 0
L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 0
K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0
M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 0
F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 0
P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 0
S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0
T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 0
W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 0
Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 0
V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 0
X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
"""), sep='\s+').loc[AMINO_ACIDS, AMINO_ACIDS]
assert (BLOSUM62_MATRIX == BLOSUM62_MATRIX.T).all().all()
ENCODING_DFS = {
"BLOSUM62": BLOSUM62_MATRIX,
"one-hot": pandas.DataFrame([
[1 if i == j else 0 for i in range(len(AMINO_ACIDS))]
for j in range(len(AMINO_ACIDS))
], index=AMINO_ACIDS, columns=AMINO_ACIDS)
}
def available_vector_encodings():
"""
Return list of supported amino acid vector encodings.
Returns
-------
list of string
"""
return list(ENCODING_DFS)
def vector_encoding_length(name):
"""
Return the length of the given vector encoding.
Parameters
----------
name : string
Returns
-------
int
"""
return ENCODING_DFS[name].shape[1]
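
A few illustrative lookups against these structures (scalar values read directly from the matrix above; the list returned by available_vector_encodings() follows dict iteration order, which may vary across Python versions):

>>> available_vector_encodings()
['BLOSUM62', 'one-hot']
>>> vector_encoding_length("BLOSUM62")
21
>>> BLOSUM62_MATRIX.loc["W", "W"]  # tryptophan's self-substitution score
11
>>> BLOSUM62_MATRIX.loc["X", "X"]  # the unknown residue is its own indicator
1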
def index_encoding(sequences, letter_to_index_dict):
"""
@@ -47,38 +111,23 @@ def index_encoding(sequences, letter_to_index_dict):
return result.values
def one_hot_encoding(index_encoded, alphabet_size):
def fixed_vectors_encoding(sequences, letter_to_vector_function):
"""
Given an n * k array of integers in the range [0, alphabet_size), return
an n * k * alphabet_size array where element (i, k, j) is 1 if element
(i, k) == j in the input array and zero otherwise.
Given a list of n strings, each of length k, return an n * k * m array where
element (i, j) is the length-m vector letter_to_vector_function(sequences[i][j]).
Parameters
----------
index_encoded : numpy.array of integers with shape (n, k)
alphabet_size : int
sequences : list of length n of strings of length k
letter_to_vector_function : function : string -> vector of length m
Returns
-------
numpy.array of integers of shape (n, k, alphabet_size)
numpy.array of integers with shape (n, k, m)
"""
alphabet_size = int(alphabet_size)
(num_sequences, sequence_length) = index_encoded.shape
result = numpy.zeros(
(num_sequences, sequence_length, alphabet_size),
dtype='int32')
# Transform the index encoded array into an array of indices into the
# flattened result, which we will set to 1.
flattened_indices = (
index_encoded +
(
sequence_length * alphabet_size * numpy.arange(num_sequences)
).reshape((-1, 1)) +
numpy.tile(numpy.arange(sequence_length),
(num_sequences, 1)) * alphabet_size)
result.put(flattened_indices, 1)
arr = numpy.array([list(s) for s in sequences])
result = numpy.vectorize(
letter_to_vector_function, signature='()->(n)')(arr)
return result
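
A self-contained illustration of the function defined above, using a hypothetical two-letter alphabet (toy data, not part of this diff); the real caller passes ENCODING_DFS[name].loc.__getitem__ as letter_to_vector_function:

import numpy

toy_vectors = {"A": [1, 0], "B": [0, 1]}
encoded = fixed_vectors_encoding(
    ["AB", "BA"], lambda letter: toy_vectors[letter])
assert encoded.shape == (2, 2, 2)  # n=2 sequences, k=2 letters, m=2 dims
assert (encoded[0] == [[1, 0], [0, 1]]).all()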
@@ -114,40 +163,6 @@ class EncodableSequences(object):
def __len__(self):
return len(self.sequences)
def fixed_length_categorical(self):
"""
Returns a categorical encoding (i.e. integers 0 <= x < 21) of the
sequences, which must already be all the same length.
Returns
-------
numpy.array of integers
"""
cache_key = ("categorical",)
if cache_key not in self.encoding_cache:
assert self.fixed_sequence_length
self.encoding_cache[cache_key] = index_encoding(
self.sequences, amino_acid.AMINO_ACID_INDEX)
return self.encoding_cache[cache_key]
def fixed_length_one_hot(self):
"""
Returns a binary one-hot encoding of the sequences, which must already
be all the same length.
Returns
-------
numpy.array of integers
"""
cache_key = ("one_hot",)
if cache_key not in self.encoding_cache:
assert self.fixed_sequence_length
encoded = self.categorical_encoding()
result = one_hot_encoding(
encoded, alphabet_size=len(amino_acid.AMINO_ACID_INDEX))
self.encoding_cache[cache_key] = result
return self.encoding_cache[cache_key]
def variable_length_to_fixed_length_categorical(
self, left_edge=4, right_edge=4, max_length=15):
"""
@@ -187,8 +202,8 @@
fixed_length_sequences, amino_acid.AMINO_ACID_INDEX)
return self.encoding_cache[cache_key]
def variable_length_to_fixed_length_one_hot(
self, left_edge=4, right_edge=4, max_length=15):
def variable_length_to_fixed_length_vector_encoding(
self, vector_encoding_name, left_edge=4, right_edge=4, max_length=15):
"""
Encode variable-length sequences using a fixed-length encoding designed
for preserving the anchor positions of class I peptides.
@@ -198,35 +213,43 @@
Parameters
----------
vector_encoding_name : string
How to represent amino acids.
One of "BLOSUM62", "one-hot", etc. Full list of supported vector
encodings is given by available_vector_encodings().
left_edge : int, size of fixed-position left side
right_edge : int, size of the fixed-position right side
max_length : sequence length of the resulting encoding
Returns
-------
binary numpy.array with shape (num sequences, max_length, 21)
"""
numpy.array with shape (num sequences, max_length, m) where m is
vector_encoding_length(vector_encoding_name)
"""
cache_key = (
"fixed_length_one_hot",
"fixed_length_vector_encoding",
vector_encoding_name,
left_edge,
right_edge,
max_length)
if cache_key not in self.encoding_cache:
encoded = self.variable_length_to_fixed_length_categorical(
left_edge=left_edge,
right_edge=right_edge,
max_length=max_length)
result = one_hot_encoding(
encoded, alphabet_size=len(amino_acid.AMINO_ACID_INDEX))
assert result.shape == (
len(self.sequences),
encoded.shape[1],
len(amino_acid.AMINO_ACID_INDEX))
fixed_length_sequences = [
self.sequence_to_fixed_length_string(
sequence,
left_edge=left_edge,
right_edge=right_edge,
max_length=max_length)
for sequence in self.sequences
]
result = fixed_vectors_encoding(
fixed_length_sequences,
ENCODING_DFS[vector_encoding_name].loc.__getitem__)
assert result.shape[0] == len(self.sequences)
self.encoding_cache[cache_key] = result
return self.encoding_cache[cache_key]
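
One behavioral note on the rewritten method: results are memoized in encoding_cache under a key that now includes the encoding name, so repeated calls with identical arguments return the same array object:

from mhcflurry.encodable_sequences import EncodableSequences

encoder = EncodableSequences.create(["SIINFEKL"])
first = encoder.variable_length_to_fixed_length_vector_encoding("BLOSUM62")
second = encoder.variable_length_to_fixed_length_vector_encoding("BLOSUM62")
assert first is second  # second call is served from the cache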
@classmethod
def sequence_to_fixed_length_string(
klass, sequence, left_edge=4, right_edge=4, max_length=15):