diff --git a/mhcflurry/allele_encoding.py b/mhcflurry/allele_encoding.py index fb33f4877ea3dad8aa66737d9874bb884945cb4d..d627993783e6c809fbc57b2b1fce51150d10c6f8 100644 --- a/mhcflurry/allele_encoding.py +++ b/mhcflurry/allele_encoding.py @@ -1,14 +1,10 @@ -import numpy import pandas -from .encodable_sequences import EncodableSequences from . import amino_acid + class AlleleEncoding(object): - def __init__( - self, - alleles, - allele_to_fixed_length_sequence): + def __init__(self, alleles=None, allele_to_sequence=None, borrow_from=None): """ A place to cache encodings for a (potentially large) sequence of alleles. @@ -17,74 +13,82 @@ class AlleleEncoding(object): alleles : list of string Allele names - allele_to_fixed_length_sequence : dict of str -> str - Allele name to fixed lengths sequence ("pseudosequence"), or a - pandas dataframe with allele names as the index and arbitrary values - to use for the encoding of those alleles + allele_to_sequence : dict of str -> str + Allele name to amino acid sequence """ - self.alleles = pandas.Series(alleles) + if alleles is not None: + alleles = pandas.Series(alleles) + self.borrow_from = borrow_from + self.allele_to_sequence = allele_to_sequence + + if self.borrow_from is None: + assert allele_to_sequence is not None + all_alleles = ( + sorted(allele_to_sequence) + if alleles is None + else list(sorted(alleles.unique()))) + self.allele_to_index = dict( + (allele, i) + for (i, allele) in enumerate(all_alleles)) + unpadded = pandas.Series( + [allele_to_sequence[a] for a in all_alleles], + index=all_alleles) + self.sequences = unpadded.str.pad( + unpadded.str.len().max(), fillchar="X") + else: + assert allele_to_sequence is None + self.allele_to_index = borrow_from.allele_to_index + self.sequences = borrow_from.sequences + self.allele_to_sequence = borrow_from.allele_to_sequence - if isinstance(allele_to_fixed_length_sequence, dict): - self.allele_to_fixed_length_sequence = pandas.DataFrame( - index=allele_to_fixed_length_sequence) - self.allele_to_fixed_length_sequence["value"] = ( - self.allele_to_fixed_length_sequence.index.map( - allele_to_fixed_length_sequence.get)) + if alleles is not None: + assert all( + allele in self.allele_to_index for allele in alleles) + self.indices = alleles.map(self.allele_to_index) + assert not self.indices.isnull().any() else: - assert isinstance(allele_to_fixed_length_sequence, pandas.DataFrame) - self.allele_to_fixed_length_sequence = allele_to_fixed_length_sequence + self.indices = None self.encoding_cache = {} + def allele_representations(self, vector_encoding_name): + if self.borrow_from is not None: + return self.borrow_from.allele_representations(vector_encoding_name) + + cache_key = ( + "allele_representations", + vector_encoding_name) + if cache_key not in self.encoding_cache: + index_encoded_matrix = amino_acid.index_encoding( + self.sequences.values, + amino_acid.AMINO_ACID_INDEX) + vector_encoded = amino_acid.fixed_vectors_encoding( + index_encoded_matrix, + amino_acid.ENCODING_DATA_FRAMES[vector_encoding_name]) + self.encoding_cache[cache_key] = vector_encoded + return self.encoding_cache[cache_key] + def fixed_length_vector_encoded_sequences(self, vector_encoding_name): """ Encode alleles. - Parameters ---------- vector_encoding_name : string How to represent amino acids. One of "BLOSUM62", "one-hot", etc. Full list of supported vector encodings is given by available_vector_encodings() in amino_acid. - - If a DataFrame was provided as `allele_to_fixed_length_sequence` - in the constructor, then those values will be used and this argument - will be ignored. - Returns ------- - numpy.array to get an array with shape - (num sequences, sequence length, m) where m is + numpy.array with shape (num sequences, sequence length, m) where m is vector_encoding_length(vector_encoding_name) """ cache_key = ( "fixed_length_vector_encoding", vector_encoding_name) if cache_key not in self.encoding_cache: - all_alleles = list(sorted(self.alleles.unique())) - allele_to_index = dict( - (allele, i) - for (i, allele) in enumerate(all_alleles)) - indices = self.alleles.map(allele_to_index) - - allele_to_fixed_length_sequence = self.allele_to_fixed_length_sequence.loc[ - all_alleles - ].copy() - - if list(allele_to_fixed_length_sequence) == ["value"]: - # Pseudosequence - index_encoded_matrix = amino_acid.index_encoding( - allele_to_fixed_length_sequence["value"].values, - amino_acid.AMINO_ACID_INDEX) - vector_encoded = amino_acid.fixed_vectors_encoding( - index_encoded_matrix, - amino_acid.ENCODING_DATA_FRAMES[vector_encoding_name]) - else: - # Raw values - vector_encoded = allele_to_fixed_length_sequence.values - result = numpy.array([vector_encoded[i] for i in indices]) + vector_encoded = self.allele_representations(vector_encoding_name) + result = vector_encoded[self.indices] self.encoding_cache[cache_key] = result return self.encoding_cache[cache_key] - diff --git a/mhcflurry/class1_affinity_predictor.py b/mhcflurry/class1_affinity_predictor.py index c8f1d0e9a1ac5057e557401fe11c3b7d2057f7a3..b70337849017c6ddaf719c798415b3d2340506b6 100644 --- a/mhcflurry/class1_affinity_predictor.py +++ b/mhcflurry/class1_affinity_predictor.py @@ -45,7 +45,7 @@ class Class1AffinityPredictor(object): self, allele_to_allele_specific_models=None, class1_pan_allele_models=None, - allele_to_fixed_length_sequence=None, + allele_to_sequence=None, manifest_df=None, allele_to_percent_rank_transform=None, metadata_dataframes=None): @@ -58,7 +58,7 @@ class Class1AffinityPredictor(object): class1_pan_allele_models : list of `Class1NeuralNetwork` Ensemble of pan-allele models. - allele_to_fixed_length_sequence : dict of string -> string + allele_to_sequence : dict of string -> string Required only if class1_pan_allele_models is specified. manifest_df : `pandas.DataFrame`, optional @@ -80,12 +80,14 @@ class Class1AffinityPredictor(object): if class1_pan_allele_models is None: class1_pan_allele_models = [] + + self.allele_to_sequence = allele_to_sequence + self.master_allele_encoding = None if class1_pan_allele_models: - assert allele_to_fixed_length_sequence, "Allele sequences required" + assert self.allele_to_sequence self.allele_to_allele_specific_models = allele_to_allele_specific_models self.class1_pan_allele_models = class1_pan_allele_models - self.allele_to_fixed_length_sequence = allele_to_fixed_length_sequence self._manifest_df = manifest_df if not allele_to_percent_rank_transform: @@ -129,7 +131,7 @@ class Class1AffinityPredictor(object): Users should call this after mutating any of the following: - class1_pan_allele_models - allele_to_allele_specific_models - - allele_to_fixed_length_sequence + - allele_to_sequence Methods that mutate these instance variables will call this method on their own if needed. @@ -174,7 +176,7 @@ class Class1AffinityPredictor(object): allele_to_allele_specific_models = collections.defaultdict(list) class1_pan_allele_models = [] - allele_to_fixed_length_sequence = predictors[0].allele_to_fixed_length_sequence + allele_to_sequence = predictors[0].allele_to_fixed_length_sequence for predictor in predictors: for (allele, networks) in ( @@ -186,7 +188,7 @@ class Class1AffinityPredictor(object): return Class1AffinityPredictor( allele_to_allele_specific_models=allele_to_allele_specific_models, class1_pan_allele_models=class1_pan_allele_models, - allele_to_fixed_length_sequence=allele_to_fixed_length_sequence + allele_to_sequence=allele_to_sequence ) def merge_in_place(self, others): @@ -251,8 +253,8 @@ class Class1AffinityPredictor(object): """ if 'supported_alleles' not in self._cache: result = set(self.allele_to_allele_specific_models) - if self.allele_to_fixed_length_sequence: - result = result.union(self.allele_to_fixed_length_sequence) + if self.allele_to_sequence: + result = result.union(self.allele_to_sequence) self._cache["supported_alleles"] = sorted(result) return self._cache["supported_alleles"] @@ -348,9 +350,9 @@ class Class1AffinityPredictor(object): metadata_df_path = join(models_dir, "%s.csv.bz2" % name) df.to_csv(metadata_df_path, index=False, compression="bz2") - if self.allele_to_fixed_length_sequence is not None: + if self.allele_to_sequence is not None: allele_to_sequence_df = pandas.DataFrame( - list(self.allele_to_fixed_length_sequence.items()), + list(self.allele_to_sequence.items()), columns=['allele', 'sequence'] ) allele_to_sequence_df.to_csv( @@ -446,7 +448,7 @@ class Class1AffinityPredictor(object): result = Class1AffinityPredictor( allele_to_allele_specific_models=allele_to_allele_specific_models, class1_pan_allele_models=class1_pan_allele_models, - allele_to_fixed_length_sequence=allele_to_fixed_length_sequence, + allele_to_sequence=allele_to_fixed_length_sequence, manifest_df=manifest_df, allele_to_percent_rank_transform=allele_to_percent_rank_transform, ) @@ -487,6 +489,14 @@ class Class1AffinityPredictor(object): """ return join(models_dir, "weights_%s.npz" % model_name) + def get_master_allele_encoding(self): + if (self.master_allele_encoding is None or + self.master_allele_encoding.allele_to_sequence != + self.allele_to_sequence): + self.master_allele_encoding = AlleleEncoding( + allele_to_sequence=self.allele_to_sequence) + return self.master_allele_encoding + def fit_allele_specific_predictors( self, n_models, @@ -685,7 +695,7 @@ class Class1AffinityPredictor(object): alleles = pandas.Series(alleles).map(mhcnames.normalize_allele_name) allele_encoding = AlleleEncoding( alleles, - allele_to_fixed_length_sequence=self.allele_to_fixed_length_sequence) + borrow_from=self.get_master_allele_encoding()) encodable_peptides = EncodableSequences.create(peptides) models = [] @@ -933,17 +943,18 @@ class Class1AffinityPredictor(object): predictions_array[:] = numpy.nan if self.class1_pan_allele_models: + master_allele_encoding = self.get_master_allele_encoding() unsupported_alleles = [ allele for allele in df.normalized_allele.unique() - if allele not in self.allele_to_fixed_length_sequence + if allele not in self.allele_to_sequence ] if unsupported_alleles: msg = ( "No sequences for allele(s): %s.\n" "Supported alleles: %s" % ( " ".join(unsupported_alleles), - " ".join(sorted(self.allele_to_fixed_length_sequence)))) + " ".join(sorted(self.allele_to_sequence)))) logging.warning(msg) if throw: raise ValueError(msg) @@ -951,7 +962,7 @@ class Class1AffinityPredictor(object): if mask.sum() > 0: masked_allele_encoding = AlleleEncoding( df.loc[mask].normalized_allele, - allele_to_fixed_length_sequence=self.allele_to_fixed_length_sequence) + borrow_from=master_allele_encoding) masked_peptides = peptides.sequences[mask] for (i, model) in enumerate(self.class1_pan_allele_models): predictions_array[mask, i] = model.predict( @@ -1153,7 +1164,7 @@ class Class1AffinityPredictor(object): return Class1AffinityPredictor( allele_to_allele_specific_models=allele_to_allele_specific_models, class1_pan_allele_models=class1_pan_allele_models, - allele_to_fixed_length_sequence=self.allele_to_fixed_length_sequence, + allele_to_sequence=self.allele_to_sequence, ) def model_select( diff --git a/mhcflurry/class1_neural_network.py b/mhcflurry/class1_neural_network.py index 8b17e78cbe97507519de706b5ef4a77ffb087126..582dac78e335d7a70b8d61b2b8ec0eddcb813817 100644 --- a/mhcflurry/class1_neural_network.py +++ b/mhcflurry/class1_neural_network.py @@ -329,8 +329,8 @@ class Class1NeuralNetwork(object): Returns ------- - list of numpy.array giving weights for each layer - or None if there is no network + list of numpy.array giving weights for each layer or None if there is no + network """ self.update_network_description() self.load_weights() @@ -420,7 +420,9 @@ class Class1NeuralNetwork(object): ------- numpy.array """ - return allele_encoding.fixed_length_vector_encoded_sequences("BLOSUM62") + return ( + allele_encoding.indices, + allele_encoding.allele_representations("BLOSUM62")) def fit( self, @@ -518,11 +520,10 @@ class Class1NeuralNetwork(object): x_dict_without_random_negatives = { 'peptide': peptide_encoding, } - allele_encoding_dims = None + allele_representations = None if allele_encoding is not None: - allele_encoding_input = self.allele_encoding_to_network_input( - allele_encoding) - allele_encoding_dims = allele_encoding_input.shape[1:] + (allele_encoding_input, allele_representations) = ( + self.allele_encoding_to_network_input(allele_encoding)) x_dict_without_random_negatives['allele'] = allele_encoding_input # Shuffle y_values and the contents of x_dict_without_random_negatives @@ -568,12 +569,16 @@ class Class1NeuralNetwork(object): if self.network() is None: self._network = self.make_network( - allele_encoding_dims=allele_encoding_dims, + allele_representations=allele_representations, **self.network_hyperparameter_defaults.subselect( self.hyperparameters)) - self.network().compile( - loss=loss_name_or_function, - optimizer=self.hyperparameters['optimizer']) + + if allele_representations is not None: + self.set_allele_representations(allele_representations) + + self.network().compile( + loss=loss_name_or_function, + optimizer=self.hyperparameters['optimizer']) if self.hyperparameters['learning_rate'] is not None: from keras import backend as K @@ -769,11 +774,15 @@ class Class1NeuralNetwork(object): x_dict = { 'peptide': self.peptides_to_network_input(peptides) } - if allele_encoding is not None: - allele_input = self.allele_encoding_to_network_input(allele_encoding) - x_dict['allele'] = allele_input - network = self.network(borrow=True) + if allele_encoding is not None: + (allele_encoding_input, allele_representations) = ( + self.allele_encoding_to_network_input(allele_encoding)) + x_dict['allele'] = allele_encoding_input + self.set_allele_representations(allele_representations) + network = self.network() + else: + network = self.network(borrow=True) raw_predictions = network.predict(x_dict, batch_size=batch_size) predictions = numpy.array(raw_predictions, dtype = "float64")[:,0] result = to_ic50(predictions) @@ -783,7 +792,7 @@ class Class1NeuralNetwork(object): @staticmethod def make_network( - allele_encoding_dims, + allele_representations, kmer_size, peptide_amino_acid_encoding, embedding_input_dim, @@ -863,28 +872,36 @@ class Class1NeuralNetwork(object): current_layer = Dropout(dropout_probability, name="dropout_early")( current_layer) - if allele_encoding_dims: + if allele_representations is not None: allele_input = Input( - shape=allele_encoding_dims, + shape=(1,), dtype='float32', name='allele') inputs.append(allele_input) - allele_embedding_layer = Flatten(name="allele_flat")(allele_input) + + allele_representation = Embedding( + name="allele_representation", + input_dim=allele_representations.shape[0], + output_dim=allele_representations.shape[1], + input_length=1, + trainable=False) + + allele_layer = Flatten(name="allele_flat")(allele_representation) for (i, layer_size) in enumerate(allele_dense_layer_sizes): - allele_embedding_layer = Dense( + allele_layer = Dense( layer_size, name="allele_dense_%d" % i, kernel_regularizer=kernel_regularizer, - activation=activation)(allele_embedding_layer) + activation=activation)(allele_layer) if peptide_allele_merge_method == 'concatenate': current_layer = keras.layers.concatenate([ - current_layer, allele_embedding_layer + current_layer, allele_layer ], name="allele_peptide_merged") elif peptide_allele_merge_method == 'multiply': current_layer = keras.layers.multiply([ - current_layer, allele_embedding_layer + current_layer, allele_layer ], name="allele_peptide_merged") else: raise ValueError( @@ -921,4 +938,22 @@ class Class1NeuralNetwork(object): inputs=inputs, outputs=[output], name="predictor") + return model + + @staticmethod + def set_allele_representations(self, allele_representations): + """ + + Parameters + ---------- + model + allele_representations + + """ + layer = self.network().get_layer("allele_representation") + (existing,) = layer.get_weights() + if existing.shape == allele_representations.shape: + layer.set_weights([allele_representations]) + else: + raise NotImplementedError("Network surgery required") diff --git a/test/test_allele_encoding.py b/test/test_allele_encoding.py index a3f270e033b1d1ac71cabb6626499f8fcb8f0be5..e8b772904b0f221a08f712579c909e093c68c0c2 100644 --- a/test/test_allele_encoding.py +++ b/test/test_allele_encoding.py @@ -37,22 +37,3 @@ def test_allele_encoding_speed(): start = time.time() encoding1 = encoding.fixed_length_vector_encoded_sequences("BLOSUM62") print("Long encoding in %0.2f sec." % (time.time() - start)) - - -def test_allele_encoding_raw_values(): - encoding = AlleleEncoding( - ["A*02:01", "A*02:03", "A*02:01"], - pandas.DataFrame( - [ - [0, 1, -1], - [10, 11, 12], - ], - index=["A*02:01", "A*02:03"])) - - encoding1 = encoding.fixed_length_vector_encoded_sequences("BLOSUM62") - assert_equal( - [ - [0, 1, -1], - [10, 11, 12], - [0, 1, -1], - ], numpy.array(encoding1))