Skip to content
Snippets Groups Projects
Commit dd062baa authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

fixes

parent fbbef344
No related merge requests found
import collections import collections
import pickle
import time import time
import hashlib import hashlib
import json
from os.path import join, exists from os.path import join, exists
import numpy import numpy
...@@ -17,11 +17,16 @@ from .class1_neural_network import Class1NeuralNetwork ...@@ -17,11 +17,16 @@ from .class1_neural_network import Class1NeuralNetwork
class Class1AffinityPredictor(object): class Class1AffinityPredictor(object):
def __init__( def __init__(
self, self,
allele_to_allele_specific_models={}, allele_to_allele_specific_models=None,
class1_pan_allele_models=[], class1_pan_allele_models=None,
allele_to_pseudosequence=None, allele_to_pseudosequence=None,
manifest_df=None): manifest_df=None):
if allele_to_allele_specific_models is None:
allele_to_allele_specific_models = {}
if class1_pan_allele_models is None:
class1_pan_allele_models = []
if class1_pan_allele_models: if class1_pan_allele_models:
assert allele_to_pseudosequence, "Pseudosequences required" assert allele_to_pseudosequence, "Pseudosequences required"
...@@ -32,14 +37,9 @@ class Class1AffinityPredictor(object): ...@@ -32,14 +37,9 @@ class Class1AffinityPredictor(object):
if manifest_df is None: if manifest_df is None:
manifest_df = pandas.DataFrame() manifest_df = pandas.DataFrame()
manifest_df["name"] = [] manifest_df["model_name"] = []
manifest_df["allele"] = [] manifest_df["allele"] = []
manifest_df["hyperparameters"] = [] manifest_df["config_json"] = []
manifest_df["history"] = []
manifest_df["num_measurements"] = []
manifest_df["random_negative_rate"] = []
manifest_df["sources"] = []
manifest_df["fit_seconds"] = []
manifest_df["model"] = [] manifest_df["model"] = []
self.manifest_df = manifest_df self.manifest_df = manifest_df
...@@ -52,17 +52,16 @@ class Class1AffinityPredictor(object): ...@@ -52,17 +52,16 @@ class Class1AffinityPredictor(object):
if model_names_to_write is None: if model_names_to_write is None:
# Write all models # Write all models
models_names_to_write = self.manifest_df.model_name.values model_names_to_write = self.manifest_df.model_name.values
sub_manifest_df = self.manifest_df.ix[ sub_manifest_df = self.manifest_df.ix[
self.manifest_df.model_name.isin(models_names_to_write) self.manifest_df.model_name.isin(model_names_to_write)
] ]
for (_, row) in sub_manifest_df.iterrows(): for (_, row) in sub_manifest_df.iterrows():
model_path = join(models_dir, "%s.pickle" % row.name) weights_path = self.weights_path(models_dir, row.model_name)
with open(join(model_path), 'wb') as fd: row.model.save_weights(weights_path)
pickle.dump(row.model, fd, protocol=2) print("Wrote: %s" % weights_path)
print("Wrote: %s" % model_path)
write_manifest_df = self.manifest_df[[ write_manifest_df = self.manifest_df[[
c for c in self.manifest_df.columns if c != "model" c for c in self.manifest_df.columns if c != "model"
...@@ -77,21 +76,30 @@ class Class1AffinityPredictor(object): ...@@ -77,21 +76,30 @@ class Class1AffinityPredictor(object):
str(time.time()).encode()).hexdigest()[:16] str(time.time()).encode()).hexdigest()[:16]
return "%s-%d-%s" % (allele, num, random_string) return "%s-%d-%s" % (allele, num, random_string)
@staticmethod
def weights_path(models_dir, model_name):
    """
    Return the path of the weights file for a named model.

    The file sits directly inside `models_dir` and is named
    "<model_name>.<ext>", where <ext> is
    Class1NeuralNetwork.weights_filename_extension.

    Parameters
    ----------
    models_dir : string
        Directory holding the saved models.
    model_name : string
        Name of the model whose weights file is wanted.

    Returns
    -------
    string
    """
    weights_filename = "%s.%s" % (
        model_name, Class1NeuralNetwork.weights_filename_extension)
    return join(models_dir, weights_filename)
@staticmethod @staticmethod
def load(models_dir, max_models=None): def load(models_dir, max_models=None):
manifest_path = join(models_dir, "manifest.csv") manifest_path = join(models_dir, "manifest.csv")
manifest_df = pandas.read_csv(manifest_path, nrows=max_models) manifest_df = pandas.read_csv(manifest_path, nrows=max_models)
manifest_df["hyperparameters"] = manifest_df.hyperparameters.map(eval)
manifest_df["history"] = manifest_df.history.map(eval)
allele_to_allele_specific_models = collections.defaultdict(list) allele_to_allele_specific_models = collections.defaultdict(list)
class1_pan_allele_models = [] class1_pan_allele_models = []
all_models = [] all_models = []
for (_, row) in manifest_df.iterrows(): for (_, row) in manifest_df.iterrows():
model_path = join(models_dir, "%s.pickle" % row["name"]) model = Class1NeuralNetwork.from_config(
print("Loading model: %s" % model_path) json.loads(row.config_json))
with open(model_path, 'rb') as fd: weights_path = Class1AffinityPredictor.weights_path(
model = pickle.load(fd) models_dir, row.model_name)
print("Loading model weights: %s" % weights_path)
model.restore_weights(weights_path)
if row.allele == "pan-class1": if row.allele == "pan-class1":
class1_pan_allele_models.append(model) class1_pan_allele_models.append(model)
else: else:
...@@ -131,7 +139,6 @@ class Class1AffinityPredictor(object): ...@@ -131,7 +139,6 @@ class Class1AffinityPredictor(object):
allele, allele,
peptides, peptides,
affinities, affinities,
output_assignments=None,
models_dir_for_save=None, models_dir_for_save=None,
verbose=1): verbose=1):
...@@ -139,33 +146,30 @@ class Class1AffinityPredictor(object): ...@@ -139,33 +146,30 @@ class Class1AffinityPredictor(object):
models = self._fit_predictors( models = self._fit_predictors(
n_models=n_models, n_models=n_models,
architecture_hyperparameters=architecture_hyperparameters, architecture_hyperparameters=architecture_hyperparameters,
peptide=peptides, peptides=peptides,
affinities=affinities, affinities=affinities,
output_assignments=output_assignments,
allele_pseudosequences=None, allele_pseudosequences=None,
verbose=verbose) verbose=verbose)
if allele not in self.allele_to_allele_specific_models:
self.allele_to_allele_specific_models[allele] = []
models_list = [] models_list = []
for (i, model) in enumerate(models): for (i, model) in enumerate(models):
name = self.model_name(allele, i) model_name = self.model_name(allele, i)
models_list.append(model) # models is a generator models_list.append(model) # models is a generator
row = pandas.Series({ row = pandas.Series(collections.OrderedDict([
"allele": allele, ("model_name", model_name),
"hyperparameters": architecture_hyperparameters, ("allele", allele),
"history": model.fit_history.history, ("config_json", json.dumps(model.get_config())),
"name": name, ("model", model),
"num_measurements": len(peptides), ])).to_frame().T
"fit_seconds": model.fit_seconds,
"model": model,
}).to_frame().T
self.manifest_df = pandas.concat( self.manifest_df = pandas.concat(
[self.manifest_df, row], ignore_index=True) [self.manifest_df, row], ignore_index=True)
self.allele_to_allele_specific_models[allele].append(model)
if models_dir_for_save: if models_dir_for_save:
self.save(models_dir_for_save, model_names_to_write=[name]) self.save(
models_dir_for_save, model_names_to_write=[model_name])
if allele not in self.allele_to_allele_specific_models:
self.allele_to_allele_specific_models[allele] = []
self.allele_to_allele_specific_models[allele].extend(models_list)
return models return models
def fit_class1_pan_allele_models( def fit_class1_pan_allele_models(
...@@ -175,7 +179,6 @@ class Class1AffinityPredictor(object): ...@@ -175,7 +179,6 @@ class Class1AffinityPredictor(object):
alleles, alleles,
peptides, peptides,
affinities, affinities,
output_assignments=None,
models_dir_for_save=None, models_dir_for_save=None,
verbose=1): verbose=1):
...@@ -187,28 +190,22 @@ class Class1AffinityPredictor(object): ...@@ -187,28 +190,22 @@ class Class1AffinityPredictor(object):
architecture_hyperparameters=architecture_hyperparameters, architecture_hyperparameters=architecture_hyperparameters,
peptides=peptides, peptides=peptides,
affinities=affinities, affinities=affinities,
output_assignments=output_assignments,
allele_pseudosequences=allele_pseudosequences) allele_pseudosequences=allele_pseudosequences)
models_list = []
for (i, model) in enumerate(models): for (i, model) in enumerate(models):
name = self.model_name("pan-class1", i) model_name = self.model_name("pan-class1", i)
models_list.append(model) # models is a generator self.class1_pan_allele_models.append(model)
row = pandas.Series({ row = pandas.Series(collections.OrderedDict([
"allele": "pan-class1", ("model_name", model_name),
"hyperparameters": architecture_hyperparameters, ("allele", "pan-class1"),
"history": model.fit_history.history, ("config_json", json.dumps(model.get_config())),
"name": name, ("model", model),
"num_measurements": len(peptides), ])).to_frame().T
"fit_seconds": model.fit_seconds,
"model": model,
}).to_frame().T
self.manifest_df = pandas.concat( self.manifest_df = pandas.concat(
[self.manifest_df, row], ignore_index=True) [self.manifest_df, row], ignore_index=True)
if models_dir_for_save: if models_dir_for_save:
self.save(models_dir_for_save, model_names_to_write=[name]) self.save(
models_dir_for_save, model_names_to_write=[model_name])
self.class1_pan_allele_models.extend(models_list)
return models return models
def _fit_predictors( def _fit_predictors(
...@@ -217,20 +214,16 @@ class Class1AffinityPredictor(object): ...@@ -217,20 +214,16 @@ class Class1AffinityPredictor(object):
architecture_hyperparameters, architecture_hyperparameters,
peptides, peptides,
affinities, affinities,
output_assignments,
allele_pseudosequences, allele_pseudosequences,
verbose=1): verbose=1):
encodable_peptides = EncodableSequences.create(peptides) encodable_peptides = EncodableSequences.create(peptides)
if output_assignments is None:
output_assignments = ["output"] * len(encodable_peptides.sequences)
for i in range(n_models): for i in range(n_models):
print("Training model %d / %d" % (i + 1, n_models)) print("Training model %d / %d" % (i + 1, n_models))
model = Class1NeuralNetwork(**architecture_hyperparameters) model = Class1NeuralNetwork(**architecture_hyperparameters)
model.fit( model.fit(
encodable_peptides, encodable_peptides,
affinities, affinities,
output_assignments=output_assignments,
allele_pseudosequences=allele_pseudosequences, allele_pseudosequences=allele_pseudosequences,
verbose=verbose) verbose=verbose)
yield model yield model
......
...@@ -22,6 +22,8 @@ from ..common import random_peptides, amino_acid_distribution ...@@ -22,6 +22,8 @@ from ..common import random_peptides, amino_acid_distribution
class Class1NeuralNetwork(object): class Class1NeuralNetwork(object):
weights_filename_extension = "npz"
network_hyperparameter_defaults = HyperparameterDefaults( network_hyperparameter_defaults = HyperparameterDefaults(
kmer_size=15, kmer_size=15,
use_embedding=True, use_embedding=True,
...@@ -38,6 +40,10 @@ class Class1NeuralNetwork(object): ...@@ -38,6 +40,10 @@ class Class1NeuralNetwork(object):
batch_normalization=True, batch_normalization=True,
embedding_init_method="glorot_uniform", embedding_init_method="glorot_uniform",
locally_connected_layers=[], locally_connected_layers=[],
)
compile_hyperparameter_defaults = HyperparameterDefaults(
loss="mse",
optimizer="rmsprop", optimizer="rmsprop",
) )
...@@ -65,6 +71,7 @@ class Class1NeuralNetwork(object): ...@@ -65,6 +71,7 @@ class Class1NeuralNetwork(object):
mode='auto') mode='auto')
hyperparameter_defaults = network_hyperparameter_defaults.extend( hyperparameter_defaults = network_hyperparameter_defaults.extend(
compile_hyperparameter_defaults).extend(
input_encoding_hyperparameter_defaults).extend( input_encoding_hyperparameter_defaults).extend(
fit_hyperparameter_defaults).extend( fit_hyperparameter_defaults).extend(
early_stopping_hyperparameter_defaults) early_stopping_hyperparameter_defaults)
...@@ -75,11 +82,25 @@ class Class1NeuralNetwork(object): ...@@ -75,11 +82,25 @@ class Class1NeuralNetwork(object):
self.network = None self.network = None
self.loss_history = None self.loss_history = None
self.fit_seconds = None self.fit_seconds = None
self.fit_num_points = None
def __getstate__(self): def get_config(self):
result = dict(self.__dict__) result = dict(self.__dict__)
del result['network'] del result['network']
result['network_json'] = self.network.to_json() result['network_json'] = self.network.to_json()
return result
@classmethod
def from_config(cls, config):
    """
    Build an instance from a config dict.

    The dict must contain a 'hyperparameters' entry (passed as keyword
    arguments to the constructor) and a 'network_json' entry (passed to
    keras.models.model_from_json). Every other entry is copied onto the
    new instance as an attribute. Network weights are not set here.

    Parameters
    ----------
    config : dict
        Not modified; a shallow copy is consumed instead.

    Returns
    -------
    Instance of `cls`.
    """
    remaining = dict(config)  # shallow copy so the caller's dict is untouched
    hyperparameters = remaining.pop('hyperparameters')
    instance = cls(**hyperparameters)
    network_json = remaining.pop('network_json')
    instance.network = keras.models.model_from_json(network_json)
    # Whatever is left is plain instance state.
    instance.__dict__.update(remaining)
    return instance
def __getstate__(self):
    """
    Return the picklable state of this instance.

    The state is the dict produced by get_config() plus a
    'network_weights' entry holding a copy of each weight array of
    the underlying network.
    """
    result = self.get_config()
    # This class no longer defines a get_weights() helper (weights are
    # persisted via save_weights / restore_weights), so read the arrays
    # straight from the network. Copy them so the pickled state does not
    # alias arrays the live network may still hand out.
    result['network_weights'] = [
        w.copy() for w in self.network.get_weights()]
    return result
...@@ -90,16 +111,19 @@ class Class1NeuralNetwork(object): ...@@ -90,16 +111,19 @@ class Class1NeuralNetwork(object):
self.network = keras.models.model_from_json(network_json) self.network = keras.models.model_from_json(network_json)
self.set_weights(network_weights) self.set_weights(network_weights)
def get_weights(self): def save_weights(self, filename):
""" weights_list = self.network.get_weights()
Returns weights, which can be passed to set_weights later. numpy.savez(
""" filename,
return [x.copy() for x in self.network.get_weights()] **dict((("array_%d" % i), w) for (i, w) in enumerate(weights_list)))
def set_weights(self, weights): def restore_weights(self, filename):
""" loaded = numpy.load(filename)
Reset the model weights. weights = [
""" loaded["array_%d" % i]
for i in range(len(loaded.keys()))
]
loaded.close()
self.network.set_weights(weights) self.network.set_weights(weights)
def peptides_to_network_input(self, peptides): def peptides_to_network_input(self, peptides):
...@@ -133,10 +157,11 @@ class Class1NeuralNetwork(object): ...@@ -133,10 +157,11 @@ class Class1NeuralNetwork(object):
allele_pseudosequences=None, allele_pseudosequences=None,
sample_weights=None, sample_weights=None,
verbose=1): verbose=1):
self.fit_num_points = len(peptides)
encodable_peptides = EncodableSequences.create(peptides) encodable_peptides = EncodableSequences.create(peptides)
peptide_encoding = self.peptides_to_network_input(encodable_peptides) peptide_encoding = self.peptides_to_network_input(encodable_peptides)
peptide_to_encoding = dict(
zip(encodable_peptides.sequences, peptide_encoding))
length_counts = ( length_counts = (
pandas.Series(encodable_peptides.sequences) pandas.Series(encodable_peptides.sequences)
...@@ -181,6 +206,7 @@ class Class1NeuralNetwork(object): ...@@ -181,6 +206,7 @@ class Class1NeuralNetwork(object):
pseudosequence_length=pseudosequence_length, pseudosequence_length=pseudosequence_length,
**self.network_hyperparameter_defaults.subselect( **self.network_hyperparameter_defaults.subselect(
self.hyperparameters)) self.hyperparameters))
self.compile()
y_dict_with_random_negatives = { y_dict_with_random_negatives = {
"output": numpy.concatenate([ "output": numpy.concatenate([
...@@ -279,6 +305,11 @@ class Class1NeuralNetwork(object): ...@@ -279,6 +305,11 @@ class Class1NeuralNetwork(object):
(predictions,) = numpy.array(self.network.predict(x_dict)).T (predictions,) = numpy.array(self.network.predict(x_dict)).T
return to_ic50(predictions) return to_ic50(predictions)
def compile(self):
    """
    Compile self.network.

    The keyword arguments passed to the network's compile() are the
    entries of self.hyperparameters selected by
    compile_hyperparameter_defaults.subselect.
    """
    compile_kwargs = self.compile_hyperparameter_defaults.subselect(
        self.hyperparameters)
    self.network.compile(**compile_kwargs)
@staticmethod @staticmethod
def make_network( def make_network(
pseudosequence_length, pseudosequence_length,
...@@ -296,8 +327,7 @@ class Class1NeuralNetwork(object): ...@@ -296,8 +327,7 @@ class Class1NeuralNetwork(object):
dropout_probability, dropout_probability,
batch_normalization, batch_normalization,
embedding_init_method, embedding_init_method,
locally_connected_layers, locally_connected_layers):
optimizer):
if use_embedding: if use_embedding:
peptide_input = Input( peptide_input = Input(
...@@ -374,7 +404,4 @@ class Class1NeuralNetwork(object): ...@@ -374,7 +404,4 @@ class Class1NeuralNetwork(object):
activation=output_activation, activation=output_activation,
name="output")(current_layer) name="output")(current_layer)
model = keras.models.Model(inputs=inputs, outputs=[output]) model = keras.models.Model(inputs=inputs, outputs=[output])
model.compile(
loss="mse",
optimizer=optimizer)
return model return model
...@@ -5,25 +5,14 @@ Train single allele models ...@@ -5,25 +5,14 @@ Train single allele models
import sys import sys
import argparse import argparse
import json import json
import os
import pickle
import pandas import pandas
import mhcnames
from .class1_affinity_predictor import Class1AffinityPredictor
from .class1_neural_network import Class1NeuralNetwork
from ..common import configure_logging from ..common import configure_logging
def normalize_allele_name(s):
try:
return mhcnames.normalize_allele_name(s)
except Exception:
return "UNKNOWN"
parser = argparse.ArgumentParser(usage=__doc__) parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument( parser.add_argument(
...@@ -62,9 +51,8 @@ parser.add_argument( ...@@ -62,9 +51,8 @@ parser.add_argument(
default=1) default=1)
def run(argv=sys.argv[1:]):
def run(): args = parser.parse_args(argv)
args = parser.parse_args(sys.argv[1:])
configure_logging(verbose=args.verbosity > 1) configure_logging(verbose=args.verbosity > 1)
...@@ -93,20 +81,11 @@ def run(): ...@@ -93,20 +81,11 @@ def run():
print("Selected %d alleles: %s" % (len(alleles), ' '.join(alleles))) print("Selected %d alleles: %s" % (len(alleles), ' '.join(alleles)))
print("Training data: %s" % (str(df.shape))) print("Training data: %s" % (str(df.shape)))
manifest = pandas.DataFrame() predictor = Class1AffinityPredictor()
manifest["name"] = []
manifest["hyperparameters_index"] = []
manifest["model_group"] = []
manifest["allele"] = []
manifest["hyperparameters"] = []
manifest["history"] = []
manifest["num_measurements"] = []
manifest["fit_seconds"] = []
manifest_path = os.path.join(args.out_models_dir, "manifest.csv")
for (h, hyperparameters) in enumerate(hyperparameters_lst): for (h, hyperparameters) in enumerate(hyperparameters_lst):
n_models = hyperparameters.pop("n_models") n_models = hyperparameters.pop("n_models")
for model_group in range(n_models): for model_group in range(n_models):
for (i, allele) in enumerate(alleles): for (i, allele) in enumerate(alleles):
print( print(
...@@ -123,41 +102,13 @@ def run(): ...@@ -123,41 +102,13 @@ def run():
train_data = df.ix[df.allele == allele].dropna().sample( train_data = df.ix[df.allele == allele].dropna().sample(
frac=1.0) frac=1.0)
model = Class1NeuralNetwork( predictor.fit_allele_specific_predictors(
verbose=args.verbosity, n_models=1,
**hyperparameters) architecture_hyperparameters=hyperparameters,
allele=allele,
model.fit( peptides=train_data.peptide.values,
train_data.peptide.values, affinities=train_data.measurement_value.values,
train_data.measurement_value.values) models_dir_for_save=args.out_models_dir)
print("Fit in %0.2f sec" % model.fit_seconds)
name = "%s-%d-%d" % (
allele.replace("*", "_"),
h,
model_group)
row = pandas.Series({
"hyperparameters_index": h,
"model_group": model_group,
"allele": allele,
"hyperparameters": hyperparameters,
"history": model.fit_history,
"name": name,
"num_measurements": len(train_data),
"fit_seconds": model.fit_seconds,
}).to_frame().T
manifest = pandas.concat([manifest, row], ignore_index=True)
print(manifest)
manifest.to_csv(manifest_path, index=False)
print("Wrote: %s" % manifest_path)
model_path = os.path.join(
args.out_models_dir, "%s.pickle" % name)
with open(model_path, 'wb') as fd:
pickle.dump(model, fd, protocol=2)
print("Wrote: %s" % model_path)
if __name__ == '__main__': if __name__ == '__main__':
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment