From 984120cf6be654298d1b13f8ea6e610e687a77eb Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Tue, 14 Nov 2017 14:52:02 -0500
Subject: [PATCH] Support BLOSUM62 encoding

---
 .../class1_neural_network.py     |  35 +++-
 mhcflurry/encodable_sequences.py | 173 ++++++++++--------
 2 files changed, 125 insertions(+), 83 deletions(-)

diff --git a/mhcflurry/class1_affinity_prediction/class1_neural_network.py b/mhcflurry/class1_affinity_prediction/class1_neural_network.py
index 8e2a638e..e22cf8dc 100644
--- a/mhcflurry/class1_affinity_prediction/class1_neural_network.py
+++ b/mhcflurry/class1_affinity_prediction/class1_neural_network.py
@@ -16,7 +16,10 @@ from keras.layers.normalization import BatchNormalization
 
 from mhcflurry.hyperparameters import HyperparameterDefaults
 
-from ..encodable_sequences import EncodableSequences
+from ..encodable_sequences import (
+    EncodableSequences,
+    available_vector_encodings,
+    vector_encoding_length)
 from ..regression_target import to_ic50, from_ic50
 from ..common import random_peptides, amino_acid_distribution
 
@@ -35,6 +38,7 @@ class Class1NeuralNetwork(object):
     network_hyperparameter_defaults = HyperparameterDefaults(
         kmer_size=15,
         use_embedding=False,
+        peptide_amino_acid_encoding="one-hot",
         embedding_input_dim=21,
         embedding_output_dim=8,
         pseudosequence_use_embedding=False,
@@ -256,20 +260,26 @@ class Class1NeuralNetwork(object):
         numpy.array
         """
         encoder = EncodableSequences.create(peptides)
-        if self.hyperparameters['use_embedding']:
+        if (self.hyperparameters['use_embedding'] or
+                self.hyperparameters['peptide_amino_acid_encoding'] == "embedding"):
             encoded = encoder.variable_length_to_fixed_length_categorical(
                 max_length=self.hyperparameters['kmer_size'],
                 **self.input_encoding_hyperparameter_defaults.subselect(
                     self.hyperparameters))
-        else:
-            encoded = encoder.variable_length_to_fixed_length_one_hot(
+        elif (
+                self.hyperparameters['peptide_amino_acid_encoding'] in
+                available_vector_encodings()):
+            encoded = encoder.variable_length_to_fixed_length_vector_encoding(
+                self.hyperparameters['peptide_amino_acid_encoding'],
                 max_length=self.hyperparameters['kmer_size'],
                 **self.input_encoding_hyperparameter_defaults.subselect(
                     self.hyperparameters))
+        else:
+            raise ValueError("Unsupported peptide_amino_acid_encoding: %s" %
+                self.hyperparameters['peptide_amino_acid_encoding'])
         assert len(encoded) == len(peptides)
         return encoded
 
-
     @property
     def supported_peptide_lengths(self):
         """
@@ -492,7 +502,8 @@ class Class1NeuralNetwork(object):
             pseudosequences_input = self.pseudosequence_to_network_input(
                 allele_pseudosequences)
             x_dict['pseudosequence'] = pseudosequences_input
-        (predictions,) = numpy.array(self.network(borrow=True).predict(x_dict)).T
+        (predictions,) = numpy.array(
+            self.network(borrow=True).predict(x_dict), dtype="float64").T
         return to_ic50(predictions)
 
     def compile(self):
@@ -507,6 +518,7 @@ class Class1NeuralNetwork(object):
     def make_network(
             pseudosequence_length,
             kmer_size,
+            peptide_amino_acid_encoding,
             use_embedding,
             embedding_input_dim,
             embedding_output_dim,
@@ -524,7 +536,7 @@
         """
         Helper function to make a keras network for class1 affinity prediction.
""" - if use_embedding: + if use_embedding or peptide_amino_acid_encoding == "embedding": peptide_input = Input( shape=(kmer_size,), dtype='int32', name='peptide') current_layer = Embedding( @@ -535,7 +547,11 @@ class Class1NeuralNetwork(object): name="peptide_embedding")(peptide_input) else: peptide_input = Input( - shape=(kmer_size, 21), dtype='float32', name='peptide') + shape=( + kmer_size, + vector_encoding_length(peptide_amino_acid_encoding)), + dtype='float32', + name='peptide') current_layer = peptide_input inputs = [peptide_input] @@ -609,4 +625,7 @@ class Class1NeuralNetwork(object): inputs=inputs, outputs=[output], name="predictor") + + print("*** ARCHITECTURE ***") + model.summary() return model diff --git a/mhcflurry/encodable_sequences.py b/mhcflurry/encodable_sequences.py index d5e9c7a0..4abc94a1 100644 --- a/mhcflurry/encodable_sequences.py +++ b/mhcflurry/encodable_sequences.py @@ -22,11 +22,75 @@ import math import pandas import numpy +from six import StringIO import typechecks from . import amino_acid +AMINO_ACIDS = list(amino_acid.COMMON_AMINO_ACIDS_WITH_UNKNOWN.keys()) + +BLOSUM62_MATRIX = pandas.read_table(StringIO(""" + A R N D C Q E G H I L K M F P S T W Y V X +A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 0 +R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 0 +N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 0 +D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 0 +C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 0 +Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 +E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 0 +G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 0 +H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 +I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 0 +L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 0 +K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 +M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 0 +F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 0 +P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 0 +S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 +T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 0 +W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 0 +Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 0 +V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 0 +X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 +"""), sep='\s+').loc[AMINO_ACIDS, AMINO_ACIDS] +assert (BLOSUM62_MATRIX == BLOSUM62_MATRIX.T).all().all() + +ENCODING_DFS = { + "BLOSUM62": BLOSUM62_MATRIX, + "one-hot": pandas.DataFrame([ + [1 if i == j else 0 for i in range(len(AMINO_ACIDS))] + for j in range(len(AMINO_ACIDS)) + ], index=AMINO_ACIDS, columns=AMINO_ACIDS) +} + + +def available_vector_encodings(): + """ + Return list of supported amino acid vector encodings. + + Returns + ------- + list of string + + """ + return list(ENCODING_DFS) + + +def vector_encoding_length(name): + """ + Return the length of the given vector encoding. 
+
+    Parameters
+    ----------
+    name : string
+
+    Returns
+    -------
+    int
+    """
+    return ENCODING_DFS[name].shape[1]
+
 
 def index_encoding(sequences, letter_to_index_dict):
     """
@@ -47,38 +111,23 @@
     return result.values
 
 
-def one_hot_encoding(index_encoded, alphabet_size):
+def fixed_vectors_encoding(sequences, letter_to_vector_function):
     """
-    Given an n * k array of integers in the range [0, alphabet_size), return
-    an n * k * alphabet_size array where element (i, k, j) is 1 if element
-    (i, k) == j in the input array and zero otherwise.
-
+    Given a list of n strings, all of length k, return an n * k * m array
+    where the (i, j)th vector is letter_to_vector_function(sequences[i][j]).
+
     Parameters
     ----------
-    index_encoded : numpy.array of integers with shape (n, k)
-    alphabet_size : int
+    sequences : list of length n of strings of length k
+    letter_to_vector_function : function : string -> vector of length m
 
     Returns
    -------
-    numpy.array of integers of shape (n, k, alphabet_size)
-
+    numpy.array of integers with shape (n, k, m)
     """
-    alphabet_size = int(alphabet_size)
-    (num_sequences, sequence_length) = index_encoded.shape
-    result = numpy.zeros(
-        (num_sequences, sequence_length, alphabet_size),
-        dtype='int32')
-
-    # Transform the index encoded array into an array of indices into the
-    # flattened result, which we will set to 1.
-    flattened_indices = (
-        index_encoded +
-        (
-            sequence_length * alphabet_size * numpy.arange(num_sequences)
-        ).reshape((-1, 1)) +
-        numpy.tile(numpy.arange(sequence_length),
-                   (num_sequences, 1)) * alphabet_size)
-    result.put(flattened_indices, 1)
+    arr = numpy.array([list(s) for s in sequences])
+    result = numpy.vectorize(
+        letter_to_vector_function, signature='()->(n)')(arr)
     return result
 
 
@@ -114,40 +163,6 @@ class EncodableSequences(object):
     def __len__(self):
         return len(self.sequences)
 
-    def fixed_length_categorical(self):
-        """
-        Returns a categorical encoding (i.e. integers 0 <= x < 21) of the
-        sequences, which must already be all the same length.
-
-        Returns
-        -------
-        numpy.array of integers
-        """
-        cache_key = ("categorical",)
-        if cache_key not in self.encoding_cache:
-            assert self.fixed_sequence_length
-            self.encoding_cache[cache_key] = index_encoding(
-                self.sequences, amino_acid.AMINO_ACID_INDEX)
-        return self.encoding_cache[cache_key]
-
-    def fixed_length_one_hot(self):
-        """
-        Returns a binary one-hot encoding of the sequences, which must already
-        be all the same length.
-
-        Returns
-        -------
-        numpy.array of integers
-        """
-        cache_key = ("one_hot",)
-        if cache_key not in self.encoding_cache:
-            assert self.fixed_sequence_length
-            encoded = self.categorical_encoding()
-            result = one_hot_encoding(
-                encoded, alphabet_size=len(amino_acid.AMINO_ACID_INDEX))
-            self.encoding_cache[cache_key] = result
-        return self.encoding_cache[cache_key]
-
     def variable_length_to_fixed_length_categorical(
             self, left_edge=4, right_edge=4, max_length=15):
         """
@@ -187,8 +202,8 @@
             fixed_length_sequences, amino_acid.AMINO_ACID_INDEX)
         return self.encoding_cache[cache_key]
 
-    def variable_length_to_fixed_length_one_hot(
-            self, left_edge=4, right_edge=4, max_length=15):
+    def variable_length_to_fixed_length_vector_encoding(
+            self, vector_encoding_name, left_edge=4, right_edge=4, max_length=15):
         """
         Encode variable-length sequences using a fixed-length encoding designed
         for preserving the anchor positions of class I peptides.
@@ -198,35 +213,43 @@
 
         Parameters
         ----------
+        vector_encoding_name : string
+            How to represent amino acids.
+            One of "BLOSUM62", "one-hot", etc. The full list of supported
+            vector encodings is given by available_vector_encodings().
         left_edge : int, size of fixed-position left side
         right_edge : int, size of the fixed-position right side
         max_length : sequence length of the resulting encoding
 
         Returns
         -------
-        binary numpy.array with shape (num sequences, max_length, 21)
-        """
+        numpy.array with shape (num sequences, max_length, m) where m is
+        vector_encoding_length(vector_encoding_name)
+        """
         cache_key = (
-            "fixed_length_one_hot",
+            "fixed_length_vector_encoding",
+            vector_encoding_name,
             left_edge,
             right_edge,
             max_length)
         if cache_key not in self.encoding_cache:
-            encoded = self.variable_length_to_fixed_length_categorical(
-                left_edge=left_edge,
-                right_edge=right_edge,
-                max_length=max_length)
-            result = one_hot_encoding(
-                encoded, alphabet_size=len(amino_acid.AMINO_ACID_INDEX))
-            assert result.shape == (
-                len(self.sequences),
-                encoded.shape[1],
-                len(amino_acid.AMINO_ACID_INDEX))
+            fixed_length_sequences = [
+                self.sequence_to_fixed_length_string(
+                    sequence,
+                    left_edge=left_edge,
+                    right_edge=right_edge,
+                    max_length=max_length)
+                for sequence in self.sequences
+            ]
+            result = fixed_vectors_encoding(
+                fixed_length_sequences,
+                ENCODING_DFS[vector_encoding_name].loc.__getitem__)
+            assert result.shape[0] == len(self.sequences)
             self.encoding_cache[cache_key] = result
         return self.encoding_cache[cache_key]
 
+
     @classmethod
     def sequence_to_fixed_length_string(
             klass, sequence, left_edge=4, right_edge=4, max_length=15):
-- 
GitLab
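
Reviewer note, not part of the patch: a minimal usage sketch of the new
encoding API, assuming this revision of mhcflurry is importable. The two
peptides are made up for illustration; the expected shapes follow from the
default kmer_size of 15 and the 21-letter alphabet (20 amino acids plus X).

from mhcflurry.encodable_sequences import (
    EncodableSequences,
    available_vector_encodings,
    vector_encoding_length)

# The registry exposes the two encodings defined in this patch.
print(available_vector_encodings())        # ['BLOSUM62', 'one-hot'] (dict order)
print(vector_encoding_length("BLOSUM62"))  # 21

# Each peptide is first stretched to a length-15 string that preserves the
# left and right anchor positions, then each residue is replaced by its row
# of the BLOSUM62 matrix.
encoder = EncodableSequences.create(["SIINFEKL", "SYFPEITHI"])
encoded = encoder.variable_length_to_fixed_length_vector_encoding(
    "BLOSUM62", max_length=15)
print(encoded.shape)  # (2, 15, 21): peptides x positions x vector length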
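
The mechanism inside fixed_vectors_encoding is numpy.vectorize with a
generalized-ufunc signature: '()->(n)' maps each scalar letter to a
length-m vector, so an (n, k) array of characters becomes an (n, k, m)
array. A self-contained toy version of the same pattern, with a made-up
two-letter alphabet standing in for the real BLOSUM62 matrix:

import numpy
import pandas

# Toy letter -> vector table; the values are arbitrary placeholders.
toy_matrix = pandas.DataFrame(
    [[1, 0], [0, 1]], index=["A", "C"], columns=["x", "y"])

sequences = ["AC", "CA"]
arr = numpy.array([list(s) for s in sequences])  # (2, 2) array of letters

# Each scalar letter is looked up as a row of the table; the resulting
# vectors stack into a third dimension.
result = numpy.vectorize(
    toy_matrix.loc.__getitem__, signature='()->(n)')(arr)
print(result.shape)  # (2, 2, 2): sequences x positions x vector length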
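
At the model level, peptide_amino_acid_encoding defaults to "one-hot" and
accepts "embedding" or any name returned by available_vector_encodings();
anything else raises ValueError in peptides_to_network_input. Assuming the
Class1NeuralNetwork constructor accepts hyperparameters as keyword arguments
(as the HyperparameterDefaults machinery suggests), selecting BLOSUM62 input
encoding would look like:

from mhcflurry.class1_affinity_prediction.class1_neural_network import (
    Class1NeuralNetwork)

# Assumption: hyperparameters are passed as constructor keyword arguments.
model = Class1NeuralNetwork(peptide_amino_acid_encoding="BLOSUM62")
print(model.hyperparameters["peptide_amino_acid_encoding"])  # BLOSUM62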