From b09ed3b403785cffe9dfc4bdada6ec8c6fb3a115 Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Mon, 22 Apr 2019 13:46:06 -0400 Subject: [PATCH] cleanup hyperparameters --- mhcflurry/class1_neural_network.py | 92 +++++++++++------------------- mhcflurry/encodable_sequences.py | 52 +++++++++++------ 2 files changed, 69 insertions(+), 75 deletions(-) diff --git a/mhcflurry/class1_neural_network.py b/mhcflurry/class1_neural_network.py index f47a3eeb..a9c1edb5 100644 --- a/mhcflurry/class1_neural_network.py +++ b/mhcflurry/class1_neural_network.py @@ -9,7 +9,7 @@ import pandas from .hyperparameters import HyperparameterDefaults -from .encodable_sequences import EncodableSequences +from .encodable_sequences import EncodableSequences, EncodingError from .amino_acid import available_vector_encodings, vector_encoding_length from .regression_target import to_ic50, from_ic50 from .common import random_peptides, amino_acid_distribution @@ -28,12 +28,15 @@ class Class1NeuralNetwork(object): """ network_hyperparameter_defaults = HyperparameterDefaults( - kmer_size=15, - peptide_amino_acid_encoding="BLOSUM62", allele_amino_acid_encoding="BLOSUM62", - embedding_input_dim=21, - embedding_output_dim=8, allele_dense_layer_sizes=[], + peptide_encoding={ + 'vector_encoding_name': 'BLOSUM62', + 'alignment_method': 'pad_middle', + 'left_edge': 4, + 'right_edge': 4, + 'max_length': 15, + }, peptide_dense_layer_sizes=[], peptide_allele_merge_method="multiply", peptide_allele_merge_activation="", @@ -45,7 +48,6 @@ class Class1NeuralNetwork(object): output_activation="sigmoid", dropout_probability=0.0, batch_normalization=False, - embedding_init_method="glorot_uniform", locally_connected_layers=[ { "filters": 8, @@ -69,15 +71,6 @@ class Class1NeuralNetwork(object): used. """ - input_encoding_hyperparameter_defaults = HyperparameterDefaults( - alignment_method="pad_middle", - left_edge=4, - right_edge=4) - """ - Number of amino acid residues that are given fixed positions on the each - side in the variable length encoding. - """ - fit_hyperparameter_defaults = HyperparameterDefaults( max_epochs=500, validation_split=0.1, @@ -110,7 +103,6 @@ class Class1NeuralNetwork(object): hyperparameter_defaults = network_hyperparameter_defaults.extend( compile_hyperparameter_defaults).extend( - input_encoding_hyperparameter_defaults).extend( fit_hyperparameter_defaults).extend( early_stopping_hyperparameter_defaults).extend( miscelaneous_hyperparameter_defaults @@ -132,6 +124,13 @@ class Class1NeuralNetwork(object): "verbose": None, "mode": None, "take_best_epoch": None, + 'kmer_size': None, + 'peptide_amino_acid_encoding': None, + 'embedding_input_dim': None, + 'embedding_output_dim': None, + 'embedding_init_method': None, + 'left_edge': None, + 'right_edge': None, } @classmethod @@ -375,22 +374,8 @@ class Class1NeuralNetwork(object): numpy.array """ encoder = EncodableSequences.create(peptides) - if (self.hyperparameters['peptide_amino_acid_encoding'] == "embedding"): - encoded = encoder.variable_length_to_fixed_length_categorical( - max_length=self.hyperparameters['kmer_size'], - **self.input_encoding_hyperparameter_defaults.subselect( - self.hyperparameters)) - elif ( - self.hyperparameters['peptide_amino_acid_encoding'] in - available_vector_encodings()): - encoded = encoder.variable_length_to_fixed_length_vector_encoding( - self.hyperparameters['peptide_amino_acid_encoding'], - max_length=self.hyperparameters['kmer_size'], - **self.input_encoding_hyperparameter_defaults.subselect( - self.hyperparameters)) - else: - raise ValueError("Unsupported peptide_amino_acid_encoding: %s" % - self.hyperparameters['peptide_amino_acid_encoding']) + encoded = encoder.variable_length_to_fixed_length_vector_encoding( + **self.hyperparameters['peptide_encoding']) assert len(encoded) == len(peptides) return encoded @@ -404,10 +389,16 @@ class Class1NeuralNetwork(object): (int, int) tuple """ - return ( - self.hyperparameters['left_edge'] + - self.hyperparameters['right_edge'], - self.hyperparameters['kmer_size']) + # We currently have an arbitrary hard floor of 5, even if the underlying + # peptide encoding supports smaller lengths. + # + # We empirically find the supported peptide lengths based on the + # lengths for which peptides_to_network_input throws ValueError. + try: + self.peptides_to_network_input([""]) + except EncodingError as e: + return e.supported_peptide_lengths + raise RuntimeError("peptides_to_network_input did not raise") def allele_encoding_to_network_input(self, allele_encoding): """ @@ -799,11 +790,8 @@ class Class1NeuralNetwork(object): def make_network( self, - kmer_size, + peptide_encoding, allele_amino_acid_encoding, - peptide_amino_acid_encoding, - embedding_input_dim, - embedding_output_dim, allele_dense_layer_sizes, peptide_dense_layer_sizes, peptide_allele_merge_method, @@ -816,7 +804,6 @@ class Class1NeuralNetwork(object): output_activation, dropout_probability, batch_normalization, - embedding_init_method, locally_connected_layers, allele_representations=None): """ @@ -832,23 +819,12 @@ class Class1NeuralNetwork(object): from keras.layers.embeddings import Embedding from keras.layers.normalization import BatchNormalization - if peptide_amino_acid_encoding == "embedding": - peptide_input = Input( - shape=(kmer_size,), dtype='int32', name='peptide') - current_layer = Embedding( - input_dim=embedding_input_dim, - output_dim=embedding_output_dim, - input_length=kmer_size, - embeddings_initializer=embedding_init_method, - name="peptide_embedding")(peptide_input) - else: - peptide_input = Input( - shape=( - kmer_size, - vector_encoding_length(peptide_amino_acid_encoding)), - dtype='float32', - name='peptide') - current_layer = peptide_input + peptide_encoding_shape = self.peptides_to_network_input([]).shape[1:] + peptide_input = Input( + shape=peptide_encoding_shape, + dtype='float32', + name='peptide') + current_layer = peptide_input inputs = [peptide_input] diff --git a/mhcflurry/encodable_sequences.py b/mhcflurry/encodable_sequences.py index 84e9a2da..0f0fcbd9 100644 --- a/mhcflurry/encodable_sequences.py +++ b/mhcflurry/encodable_sequences.py @@ -13,6 +13,14 @@ import pandas from . import amino_acid +class EncodingError(ValueError): + def __init__(self, message, supported_peptide_lengths): + self.supported_peptide_lengths = supported_peptide_lengths + ValueError.__init__( + self, + message + " Supported lengths: %s - %s." % supported_peptide_lengths) + + class EncodableSequences(object): """ Sequences of amino acids. @@ -36,7 +44,7 @@ class EncodableSequences(object): if not all(isinstance(obj, string_types) for obj in sequences): raise ValueError("Sequence of strings is required") self.sequences = numpy.array(sequences) - lengths = pandas.Series(self.sequences).str.len() + lengths = pandas.Series(self.sequences, dtype=numpy.object_).str.len() self.min_length = lengths.min() self.max_length = lengths.max() @@ -187,26 +195,23 @@ class EncodableSequences(object): shape=(len(sequences), max_length), dtype="int32") - df = pandas.DataFrame({"peptide": sequences}) + df = pandas.DataFrame({"peptide": sequences}, dtype=numpy.object_) df["length"] = df.peptide.str.len() middle_length = max_length - left_edge - right_edge + min_length = left_edge + right_edge # For efficiency we handle each supported peptide length using bulk # array operations. for (length, sub_df) in df.groupby("length"): - if length < left_edge + right_edge: - raise ValueError( - "Sequence '%s' (length %d) unsupported: length must be at " - "least %d. There are %d total peptides with this length." % ( - sub_df.iloc[0].peptide, length, left_edge + right_edge, - len(sub_df))) - if length > max_length: - raise ValueError( - "Sequence '%s' (length %d) unsupported: length must be at " - "most %d. There are %d total peptides with this length." % ( - sub_df.iloc[0].peptide, length, max_length, - len(sub_df))) + if length < min_length or length > max_length: + raise EncodingError( + "Sequence '%s' (length %d) unsupported. There are %d " + "total peptides with this length." % ( + sub_df.iloc[0].peptide, + length, + len(sub_df)), supported_peptide_lengths=( + min_length, max_length)) # Array of shape (num peptides, length) giving fixed-length amino # acid encoding each peptide of the current length. @@ -240,17 +245,30 @@ class EncodableSequences(object): -right_edge: ] = fixed_length_sequences[:, -right_edge:] elif alignment_method == "left_pad_right_pad": + # We arbitrarily set a minimum length of 5, although this encoding + # could handle smaller peptides. + min_length = 5 + # Result array is int32, filled with X (null amino acid) value. result = numpy.full(fill_value=amino_acid.AMINO_ACID_INDEX['X'], shape=(len(sequences), max_length * 2), dtype="int32") - df = pandas.DataFrame({"peptide": sequences}) + df = pandas.DataFrame({"peptide": sequences}, dtype=numpy.object_) # For efficiency we handle each supported peptide length using bulk # array operations. for (length, sub_df) in df.groupby(df.peptide.str.len()): - # Array of shape (num peptides, length) giving fixed-length amino - # acid encoding each peptide of the current length. + if length < min_length or length > max_length: + raise EncodingError( + "Sequence '%s' (length %d) unsupported. There are %d " + "total peptides with this length." % ( + sub_df.iloc[0].peptide, + length, + len(sub_df)), supported_peptide_lengths=( + min_length, max_length)) + + # Array of shape (num peptides, length) giving fixed-length + # amino acid encoding each peptide of the current length. fixed_length_sequences = numpy.stack(sub_df.peptide.map( lambda s: numpy.array( [amino_acid.AMINO_ACID_INDEX[char] for char in -- GitLab