From dd062baa9aedff6f5fa23740234c3396387cc174 Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Sun, 21 May 2017 19:20:15 -0400 Subject: [PATCH] fixes --- .../class1_affinity_predictor.py | 115 ++++++++---------- .../class1_neural_network.py | 63 +++++++--- .../train_allele_specific_models_command.py | 73 ++--------- 3 files changed, 111 insertions(+), 140 deletions(-) diff --git a/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py b/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py index 7c6fb5ef..223ff08a 100644 --- a/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py +++ b/mhcflurry/class1_affinity_prediction/class1_affinity_predictor.py @@ -1,7 +1,7 @@ import collections -import pickle import time import hashlib +import json from os.path import join, exists import numpy @@ -17,11 +17,16 @@ from .class1_neural_network import Class1NeuralNetwork class Class1AffinityPredictor(object): def __init__( self, - allele_to_allele_specific_models={}, - class1_pan_allele_models=[], + allele_to_allele_specific_models=None, + class1_pan_allele_models=None, allele_to_pseudosequence=None, manifest_df=None): + if allele_to_allele_specific_models is None: + allele_to_allele_specific_models = {} + if class1_pan_allele_models is None: + class1_pan_allele_models = [] + if class1_pan_allele_models: assert allele_to_pseudosequence, "Pseudosequences required" @@ -32,14 +37,9 @@ class Class1AffinityPredictor(object): if manifest_df is None: manifest_df = pandas.DataFrame() - manifest_df["name"] = [] + manifest_df["model_name"] = [] manifest_df["allele"] = [] - manifest_df["hyperparameters"] = [] - manifest_df["history"] = [] - manifest_df["num_measurements"] = [] - manifest_df["random_negative_rate"] = [] - manifest_df["sources"] = [] - manifest_df["fit_seconds"] = [] + manifest_df["config_json"] = [] manifest_df["model"] = [] self.manifest_df = manifest_df @@ -52,17 +52,16 @@ class Class1AffinityPredictor(object): if model_names_to_write is None: # Write all models - models_names_to_write = self.manifest_df.model_name.values + model_names_to_write = self.manifest_df.model_name.values sub_manifest_df = self.manifest_df.ix[ - self.manifest_df.model_name.isin(models_names_to_write) + self.manifest_df.model_name.isin(model_names_to_write) ] for (_, row) in sub_manifest_df.iterrows(): - model_path = join(models_dir, "%s.pickle" % row.name) - with open(join(model_path), 'wb') as fd: - pickle.dump(row.model, fd, protocol=2) - print("Wrote: %s" % model_path) + weights_path = self.weights_path(models_dir, row.model_name) + row.model.save_weights(weights_path) + print("Wrote: %s" % weights_path) write_manifest_df = self.manifest_df[[ c for c in self.manifest_df.columns if c != "model" @@ -77,21 +76,30 @@ class Class1AffinityPredictor(object): str(time.time()).encode()).hexdigest()[:16] return "%s-%d-%s" % (allele, num, random_string) + @staticmethod + def weights_path(models_dir, model_name): + return join( + models_dir, + "%s.%s" % ( + model_name, Class1NeuralNetwork.weights_filename_extension)) + + @staticmethod def load(models_dir, max_models=None): manifest_path = join(models_dir, "manifest.csv") manifest_df = pandas.read_csv(manifest_path, nrows=max_models) - manifest_df["hyperparameters"] = manifest_df.hyperparameters.map(eval) - manifest_df["history"] = manifest_df.history.map(eval) allele_to_allele_specific_models = collections.defaultdict(list) class1_pan_allele_models = [] all_models = [] for (_, row) in manifest_df.iterrows(): - model_path = join(models_dir, "%s.pickle" % row["name"]) - print("Loading model: %s" % model_path) - with open(model_path, 'rb') as fd: - model = pickle.load(fd) + model = Class1NeuralNetwork.from_config( + json.loads(row.config_json)) + weights_path = Class1AffinityPredictor.weights_path( + models_dir, row.model_name) + print("Loading model weights: %s" % weights_path) + model.restore_weights(weights_path) + if row.allele == "pan-class1": class1_pan_allele_models.append(model) else: @@ -131,7 +139,6 @@ class Class1AffinityPredictor(object): allele, peptides, affinities, - output_assignments=None, models_dir_for_save=None, verbose=1): @@ -139,33 +146,30 @@ class Class1AffinityPredictor(object): models = self._fit_predictors( n_models=n_models, architecture_hyperparameters=architecture_hyperparameters, - peptide=peptides, + peptides=peptides, affinities=affinities, - output_assignments=output_assignments, allele_pseudosequences=None, verbose=verbose) + if allele not in self.allele_to_allele_specific_models: + self.allele_to_allele_specific_models[allele] = [] + models_list = [] for (i, model) in enumerate(models): - name = self.model_name(allele, i) + model_name = self.model_name(allele, i) models_list.append(model) # models is a generator - row = pandas.Series({ - "allele": allele, - "hyperparameters": architecture_hyperparameters, - "history": model.fit_history.history, - "name": name, - "num_measurements": len(peptides), - "fit_seconds": model.fit_seconds, - "model": model, - }).to_frame().T + row = pandas.Series(collections.OrderedDict([ + ("model_name", model_name), + ("allele", allele), + ("config_json", json.dumps(model.get_config())), + ("model", model), + ])).to_frame().T self.manifest_df = pandas.concat( [self.manifest_df, row], ignore_index=True) + self.allele_to_allele_specific_models[allele].append(model) if models_dir_for_save: - self.save(models_dir_for_save, model_names_to_write=[name]) - - if allele not in self.allele_to_allele_specific_models: - self.allele_to_allele_specific_models[allele] = [] - self.allele_to_allele_specific_models[allele].extend(models_list) + self.save( + models_dir_for_save, model_names_to_write=[model_name]) return models def fit_class1_pan_allele_models( @@ -175,7 +179,6 @@ class Class1AffinityPredictor(object): alleles, peptides, affinities, - output_assignments=None, models_dir_for_save=None, verbose=1): @@ -187,28 +190,22 @@ class Class1AffinityPredictor(object): architecture_hyperparameters=architecture_hyperparameters, peptides=peptides, affinities=affinities, - output_assignments=output_assignments, allele_pseudosequences=allele_pseudosequences) - models_list = [] for (i, model) in enumerate(models): - name = self.model_name("pan-class1", i) - models_list.append(model) # models is a generator - row = pandas.Series({ - "allele": "pan-class1", - "hyperparameters": architecture_hyperparameters, - "history": model.fit_history.history, - "name": name, - "num_measurements": len(peptides), - "fit_seconds": model.fit_seconds, - "model": model, - }).to_frame().T + model_name = self.model_name("pan-class1", i) + self.class1_pan_allele_models.append(model) + row = pandas.Series(collections.OrderedDict([ + ("model_name", model_name), + ("allele", "pan-class1"), + ("config_json", json.dumps(model.get_config())), + ("model", model), + ])).to_frame().T self.manifest_df = pandas.concat( [self.manifest_df, row], ignore_index=True) if models_dir_for_save: - self.save(models_dir_for_save, model_names_to_write=[name]) - - self.class1_pan_allele_models.extend(models_list) + self.save( + models_dir_for_save, model_names_to_write=[model_name]) return models def _fit_predictors( @@ -217,20 +214,16 @@ class Class1AffinityPredictor(object): architecture_hyperparameters, peptides, affinities, - output_assignments, allele_pseudosequences, verbose=1): encodable_peptides = EncodableSequences.create(peptides) - if output_assignments is None: - output_assignments = ["output"] * len(encodable_peptides.sequences) for i in range(n_models): print("Training model %d / %d" % (i + 1, n_models)) model = Class1NeuralNetwork(**architecture_hyperparameters) model.fit( encodable_peptides, affinities, - output_assignments=output_assignments, allele_pseudosequences=allele_pseudosequences, verbose=verbose) yield model diff --git a/mhcflurry/class1_affinity_prediction/class1_neural_network.py b/mhcflurry/class1_affinity_prediction/class1_neural_network.py index a6d93d2e..e5e382ed 100644 --- a/mhcflurry/class1_affinity_prediction/class1_neural_network.py +++ b/mhcflurry/class1_affinity_prediction/class1_neural_network.py @@ -22,6 +22,8 @@ from ..common import random_peptides, amino_acid_distribution class Class1NeuralNetwork(object): + weights_filename_extension = "npz" + network_hyperparameter_defaults = HyperparameterDefaults( kmer_size=15, use_embedding=True, @@ -38,6 +40,10 @@ class Class1NeuralNetwork(object): batch_normalization=True, embedding_init_method="glorot_uniform", locally_connected_layers=[], + ) + + compile_hyperparameter_defaults = HyperparameterDefaults( + loss="mse", optimizer="rmsprop", ) @@ -65,6 +71,7 @@ class Class1NeuralNetwork(object): mode='auto') hyperparameter_defaults = network_hyperparameter_defaults.extend( + compile_hyperparameter_defaults).extend( input_encoding_hyperparameter_defaults).extend( fit_hyperparameter_defaults).extend( early_stopping_hyperparameter_defaults) @@ -75,11 +82,25 @@ class Class1NeuralNetwork(object): self.network = None self.loss_history = None self.fit_seconds = None + self.fit_num_points = None - def __getstate__(self): + def get_config(self): result = dict(self.__dict__) del result['network'] result['network_json'] = self.network.to_json() + return result + + @classmethod + def from_config(cls, config): + config = dict(config) + instance = cls(**config.pop('hyperparameters')) + instance.network = keras.models.model_from_json( + config.pop('network_json')) + instance.__dict__.update(config) + return instance + + def __getstate__(self): + result = self.get_config() result['network_weights'] = self.get_weights() return result @@ -90,16 +111,19 @@ class Class1NeuralNetwork(object): self.network = keras.models.model_from_json(network_json) self.set_weights(network_weights) - def get_weights(self): - """ - Returns weights, which can be passed to set_weights later. - """ - return [x.copy() for x in self.network.get_weights()] - - def set_weights(self, weights): - """ - Reset the model weights. - """ + def save_weights(self, filename): + weights_list = self.network.get_weights() + numpy.savez( + filename, + **dict((("array_%d" % i), w) for (i, w) in enumerate(weights_list))) + + def restore_weights(self, filename): + loaded = numpy.load(filename) + weights = [ + loaded["array_%d" % i] + for i in range(len(loaded.keys())) + ] + loaded.close() self.network.set_weights(weights) def peptides_to_network_input(self, peptides): @@ -133,10 +157,11 @@ class Class1NeuralNetwork(object): allele_pseudosequences=None, sample_weights=None, verbose=1): + + self.fit_num_points = len(peptides) + encodable_peptides = EncodableSequences.create(peptides) peptide_encoding = self.peptides_to_network_input(encodable_peptides) - peptide_to_encoding = dict( - zip(encodable_peptides.sequences, peptide_encoding)) length_counts = ( pandas.Series(encodable_peptides.sequences) @@ -181,6 +206,7 @@ class Class1NeuralNetwork(object): pseudosequence_length=pseudosequence_length, **self.network_hyperparameter_defaults.subselect( self.hyperparameters)) + self.compile() y_dict_with_random_negatives = { "output": numpy.concatenate([ @@ -279,6 +305,11 @@ class Class1NeuralNetwork(object): (predictions,) = numpy.array(self.network.predict(x_dict)).T return to_ic50(predictions) + def compile(self): + self.network.compile( + **self.compile_hyperparameter_defaults.subselect( + self.hyperparameters)) + @staticmethod def make_network( pseudosequence_length, @@ -296,8 +327,7 @@ class Class1NeuralNetwork(object): dropout_probability, batch_normalization, embedding_init_method, - locally_connected_layers, - optimizer): + locally_connected_layers): if use_embedding: peptide_input = Input( @@ -374,7 +404,4 @@ class Class1NeuralNetwork(object): activation=output_activation, name="output")(current_layer) model = keras.models.Model(inputs=inputs, outputs=[output]) - model.compile( - loss="mse", - optimizer=optimizer) return model diff --git a/mhcflurry/class1_affinity_prediction/train_allele_specific_models_command.py b/mhcflurry/class1_affinity_prediction/train_allele_specific_models_command.py index e3d7f715..fd62529f 100644 --- a/mhcflurry/class1_affinity_prediction/train_allele_specific_models_command.py +++ b/mhcflurry/class1_affinity_prediction/train_allele_specific_models_command.py @@ -5,25 +5,14 @@ Train single allele models import sys import argparse import json -import os -import pickle import pandas -import mhcnames - -from .class1_neural_network import Class1NeuralNetwork +from .class1_affinity_predictor import Class1AffinityPredictor from ..common import configure_logging -def normalize_allele_name(s): - try: - return mhcnames.normalize_allele_name(s) - except Exception: - return "UNKNOWN" - - parser = argparse.ArgumentParser(usage=__doc__) parser.add_argument( @@ -62,9 +51,8 @@ parser.add_argument( default=1) - -def run(): - args = parser.parse_args(sys.argv[1:]) +def run(argv=sys.argv[1:]): + args = parser.parse_args(argv) configure_logging(verbose=args.verbosity > 1) @@ -93,20 +81,11 @@ def run(): print("Selected %d alleles: %s" % (len(alleles), ' '.join(alleles))) print("Training data: %s" % (str(df.shape))) - manifest = pandas.DataFrame() - manifest["name"] = [] - manifest["hyperparameters_index"] = [] - manifest["model_group"] = [] - manifest["allele"] = [] - manifest["hyperparameters"] = [] - manifest["history"] = [] - manifest["num_measurements"] = [] - manifest["fit_seconds"] = [] - - manifest_path = os.path.join(args.out_models_dir, "manifest.csv") + predictor = Class1AffinityPredictor() for (h, hyperparameters) in enumerate(hyperparameters_lst): n_models = hyperparameters.pop("n_models") + for model_group in range(n_models): for (i, allele) in enumerate(alleles): print( @@ -123,41 +102,13 @@ def run(): train_data = df.ix[df.allele == allele].dropna().sample( frac=1.0) - model = Class1NeuralNetwork( - verbose=args.verbosity, - **hyperparameters) - - model.fit( - train_data.peptide.values, - train_data.measurement_value.values) - print("Fit in %0.2f sec" % model.fit_seconds) - - name = "%s-%d-%d" % ( - allele.replace("*", "_"), - h, - model_group) - - row = pandas.Series({ - "hyperparameters_index": h, - "model_group": model_group, - "allele": allele, - "hyperparameters": hyperparameters, - "history": model.fit_history, - "name": name, - "num_measurements": len(train_data), - "fit_seconds": model.fit_seconds, - }).to_frame().T - manifest = pandas.concat([manifest, row], ignore_index=True) - print(manifest) - - manifest.to_csv(manifest_path, index=False) - print("Wrote: %s" % manifest_path) - - model_path = os.path.join( - args.out_models_dir, "%s.pickle" % name) - with open(model_path, 'wb') as fd: - pickle.dump(model, fd, protocol=2) - print("Wrote: %s" % model_path) + predictor.fit_allele_specific_predictors( + n_models=1, + architecture_hyperparameters=hyperparameters, + allele=allele, + peptides=train_data.peptide.values, + affinities=train_data.measurement_value.values, + models_dir_for_save=args.out_models_dir) if __name__ == '__main__': -- GitLab