diff --git a/mhcflurry/allele_encoding.py b/mhcflurry/allele_encoding.py index a2bc87156d71ec054b8a6e4a94dfa75caf2f6fd6..d95d2ce2c00baff5944298e0486afcd1bbc9a2fb 100644 --- a/mhcflurry/allele_encoding.py +++ b/mhcflurry/allele_encoding.py @@ -37,7 +37,7 @@ class AlleleEncoding(object): self.encoding_cache = {} - def fixed_length_sequences(self, vector_encoding_name): + def fixed_length_vector_encoded_sequences(self, vector_encoding_name): """ Encode alleles. @@ -57,14 +57,13 @@ class AlleleEncoding(object): "fixed_length_vector_encoding", vector_encoding_name) if cache_key not in self.encoding_cache: - index_encoded_matrix = amino_acid.fixed_vectors_encoding( + index_encoded_matrix = amino_acid.index_encoding( self.fixed_length_sequences.values, - amino_acid.COMMON_AMINO_ACIDS_WITH_UNKNOWN) + amino_acid.AMINO_ACID_INDEX) vector_encoded = amino_acid.fixed_vectors_encoding( index_encoded_matrix, amino_acid.ENCODING_DATA_FRAMES[vector_encoding_name]) - vector_encoded_df = pandas.DataFrame(vector_encoded) - result = vector_encoded_df.iloc[self.indices] + result = vector_encoded[self.indices] self.encoding_cache[cache_key] = result return self.encoding_cache[cache_key] diff --git a/mhcflurry/class1_affinity_predictor.py b/mhcflurry/class1_affinity_predictor.py index fad3aa11af26ed788d2f498aa0e81592e00902f2..5f82e9823554ab24a311f6eac38400e3747613f1 100644 --- a/mhcflurry/class1_affinity_predictor.py +++ b/mhcflurry/class1_affinity_predictor.py @@ -24,6 +24,7 @@ from .percent_rank_transform import PercentRankTransform from .regression_target import to_ic50 from .version import __version__ from .ensemble_centrality import CENTRALITY_MEASURES +from .allele_encoding import AlleleEncoding class Class1AffinityPredictor(object): @@ -39,7 +40,7 @@ class Class1AffinityPredictor(object): self, allele_to_allele_specific_models=None, class1_pan_allele_models=None, - allele_to_pseudosequence=None, + allele_to_fixed_length_sequence=None, manifest_df=None, allele_to_percent_rank_transform=None): """ @@ -51,7 +52,7 @@ class Class1AffinityPredictor(object): class1_pan_allele_models : list of `Class1NeuralNetwork` Ensemble of pan-allele models. - allele_to_pseudosequence : dict of string -> string + allele_to_fixed_length_sequence : dict of string -> string Required only if class1_pan_allele_models is specified. manifest_df : `pandas.DataFrame`, optional @@ -70,11 +71,11 @@ class Class1AffinityPredictor(object): class1_pan_allele_models = [] if class1_pan_allele_models: - assert allele_to_pseudosequence, "Pseudosequences required" + assert allele_to_fixed_length_sequence, "Allele sequences required" self.allele_to_allele_specific_models = allele_to_allele_specific_models self.class1_pan_allele_models = class1_pan_allele_models - self.allele_to_pseudosequence = allele_to_pseudosequence + self.allele_to_fixed_length_sequence = allele_to_fixed_length_sequence if manifest_df is None: rows = [] @@ -140,7 +141,7 @@ class Class1AffinityPredictor(object): allele_to_allele_specific_models = collections.defaultdict(list) class1_pan_allele_models = [] - allele_to_pseudosequence = predictors[0].allele_to_pseudosequence + allele_to_fixed_length_sequence = predictors[0].allele_to_fixed_length_sequence for predictor in predictors: for (allele, networks) in ( @@ -152,7 +153,7 @@ class Class1AffinityPredictor(object): return Class1AffinityPredictor( allele_to_allele_specific_models=allele_to_allele_specific_models, class1_pan_allele_models=class1_pan_allele_models, - allele_to_pseudosequence=allele_to_pseudosequence + allele_to_fixed_length_sequence=allele_to_fixed_length_sequence ) @property @@ -165,8 +166,8 @@ class Class1AffinityPredictor(object): list of string """ result = set(self.allele_to_allele_specific_models) - if self.allele_to_pseudosequence: - result = result.union(self.allele_to_pseudosequence) + if self.allele_to_fixed_length_sequence: + result = result.union(self.allele_to_fixed_length_sequence) return sorted(result) @property @@ -196,7 +197,7 @@ class Class1AffinityPredictor(object): The serialization format consists of a file called "manifest.csv" with the configurations of each Class1NeuralNetwork, along with per-network files giving the model weights. If there are pan-allele predictors in - the ensemble, the allele pseudosequences are also stored in the + the ensemble, the allele sequences are also stored in the directory. There is also a small file "index.txt" with basic metadata: when the models were trained, by whom, on what host. @@ -250,6 +251,15 @@ class Class1AffinityPredictor(object): pandas.DataFrame(rows).to_csv( info_path, sep="\t", header=False, index=False) + if self.allele_to_fixed_length_sequence is not None: + allele_to_sequence_df = pandas.DataFrame( + list(self.allele_to_fixed_length_sequence.items()), + columns=['allele', 'sequence'] + ) + allele_to_sequence_df.to_csv( + join(models_dir, "allele_sequences.csv"), index=False) + logging.info("Wrote: %s" % join(models_dir, "allele_sequences.csv")) + if self.allele_to_percent_rank_transform: percent_ranks_df = None for (allele, transform) in self.allele_to_percent_rank_transform.items(): @@ -310,10 +320,10 @@ class Class1AffinityPredictor(object): manifest_df["model"] = all_models - pseudosequences = None - if exists(join(models_dir, "pseudosequences.csv")): - pseudosequences = pandas.read_csv( - join(models_dir, "pseudosequences.csv"), + allele_to_fixed_length_sequence = None + if exists(join(models_dir, "allele_sequences.csv")): + allele_to_fixed_length_sequence = pandas.read_csv( + join(models_dir, "allele_sequences.csv"), index_col="allele").to_dict() allele_to_percent_rank_transform = {} @@ -325,10 +335,10 @@ class Class1AffinityPredictor(object): PercentRankTransform.from_series(percent_ranks_df[allele])) logging.info( - "Loaded %d class1 pan allele predictors, %d pseudosequences, " + "Loaded %d class1 pan allele predictors, %d allele sequences, " "%d percent rank distributions, and %d allele specific models: %s" % ( len(class1_pan_allele_models), - len(pseudosequences) if pseudosequences else 0, + len(allele_to_fixed_length_sequence) if allele_to_fixed_length_sequence else 0, len(allele_to_percent_rank_transform), sum(len(v) for v in allele_to_allele_specific_models.values()), ", ".join( @@ -339,7 +349,7 @@ class Class1AffinityPredictor(object): result = Class1AffinityPredictor( allele_to_allele_specific_models=allele_to_allele_specific_models, class1_pan_allele_models=class1_pan_allele_models, - allele_to_pseudosequence=pseudosequences, + allele_to_fixed_length_sequence=allele_to_fixed_length_sequence, manifest_df=manifest_df, allele_to_percent_rank_transform=allele_to_percent_rank_transform, ) @@ -516,6 +526,7 @@ class Class1AffinityPredictor(object): alleles, peptides, affinities, + inequalities, models_dir_for_save=None, verbose=1, progress_preamble=""): @@ -534,12 +545,15 @@ class Class1AffinityPredictor(object): architecture_hyperparameters : dict alleles : list of string - Allele names (not pseudosequences) corresponding to each peptide + Allele names (not sequences) corresponding to each peptide peptides : `EncodableSequences` or list of string affinities : list of float nM affinities + + inequalities : list of string, each element one of ">", "<", or "=" + See Class1NeuralNetwork.fit for details. models_dir_for_save : string, optional If specified, the Class1AffinityPredictor is (incrementally) written @@ -557,7 +571,9 @@ class Class1AffinityPredictor(object): """ alleles = pandas.Series(alleles).map(mhcnames.normalize_allele_name) - allele_pseudosequences = alleles.map(self.allele_to_pseudosequence) + allele_encoding = AlleleEncoding( + alleles, + allele_to_fixed_length_sequence=self.allele_to_fixed_length_sequence) encodable_peptides = EncodableSequences.create(peptides) models = [] @@ -567,7 +583,8 @@ class Class1AffinityPredictor(object): model.fit( encodable_peptides, affinities, - allele_pseudosequences=allele_pseudosequences, + inequalities=inequalities, + allele_encoding=allele_encoding, verbose=verbose, progress_preamble=progress_preamble) @@ -777,27 +794,27 @@ class Class1AffinityPredictor(object): unsupported_alleles = [ allele for allele in df.normalized_allele.unique() - if allele not in self.allele_to_pseudosequence + if allele not in self.allele_to_fixed_length_sequence ] if unsupported_alleles: msg = ( - "No pseudosequences for allele(s): %s.\n" + "No sequences for allele(s): %s.\n" "Supported alleles: %s" % ( " ".join(unsupported_alleles), - " ".join(sorted(self.allele_to_pseudosequence)))) + " ".join(sorted(self.allele_to_fixed_length_sequence)))) logging.warning(msg) if throw: raise ValueError(msg) mask = df.supported_peptide_length if mask.sum() > 0: - masked_allele_pseudosequences = ( - df.ix[mask].normalized_allele.map( - self.allele_to_pseudosequence)) + masked_allele_encoding = AlleleEncoding( + df.loc[mask].normalized_allele, + allele_to_fixed_length_sequence=self.allele_to_fixed_length_sequence) masked_peptides = peptides.sequences[mask] for (i, model) in enumerate(self.class1_pan_allele_models): df.loc[mask, "model_pan_%d" % i] = model.predict( masked_peptides, - allele_pseudosequences=masked_allele_pseudosequences) + allele_encoding=masked_allele_encoding) if self.allele_to_allele_specific_models: query_alleles = df.normalized_allele.unique() diff --git a/mhcflurry/class1_neural_network.py b/mhcflurry/class1_neural_network.py index 5cf87569b625d4243ee98ac7123933c7aa047525..c48b051ff22600e593676190226841eeb2f6df56 100644 --- a/mhcflurry/class1_neural_network.py +++ b/mhcflurry/class1_neural_network.py @@ -370,7 +370,7 @@ class Class1NeuralNetwork(object): ------- numpy.array """ - return allele_encoding.fixed_length_sequences("BLOSUM62") + return allele_encoding.fixed_length_vector_encoded_sequences("BLOSUM62") def fit( self, @@ -693,9 +693,8 @@ class Class1NeuralNetwork(object): 'peptide': self.peptides_to_network_input(peptides) } if allele_encoding is not None: - pseudosequences_input = self.pseudosequence_to_network_input( - allele_pseudosequences) - x_dict['pseudosequence'] = pseudosequences_input + allele_input = self.allele_encoding_to_network_input(allele_encoding) + x_dict['allele'] = allele_input network = self.network(borrow=True) raw_predictions = network.predict(x_dict, batch_size=batch_size) @@ -787,8 +786,8 @@ class Class1NeuralNetwork(object): if allele_encoding_dims: allele_input = Input( shape=allele_encoding_dims, - dtype='int32', - name='peptide') + dtype='float32', + name='allele') inputs.append(allele_input) allele_embedding_layer = Flatten(name="allele_flat")(allele_input) @@ -807,9 +806,6 @@ class Class1NeuralNetwork(object): current_layer = keras.layers.multiply([ current_layer, allele_embedding_layer ], name="allele_peptide_merged") - - current_layer = keras.layers.concatenate( - [current_layer, allele_embedding_layer], name="concatenated_0") else: raise ValueError( "Unsupported peptide_allele_encoding_merge_method: %s"