Skip to content
Snippets Groups Projects
Commit dd062baa authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

fixes

parent fbbef344
No related merge requests found
import collections
import pickle
import time
import hashlib
import json
from os.path import join, exists
import numpy
......@@ -17,11 +17,16 @@ from .class1_neural_network import Class1NeuralNetwork
class Class1AffinityPredictor(object):
def __init__(
self,
allele_to_allele_specific_models={},
class1_pan_allele_models=[],
allele_to_allele_specific_models=None,
class1_pan_allele_models=None,
allele_to_pseudosequence=None,
manifest_df=None):
if allele_to_allele_specific_models is None:
allele_to_allele_specific_models = {}
if class1_pan_allele_models is None:
class1_pan_allele_models = []
if class1_pan_allele_models:
assert allele_to_pseudosequence, "Pseudosequences required"
......@@ -32,14 +37,9 @@ class Class1AffinityPredictor(object):
if manifest_df is None:
manifest_df = pandas.DataFrame()
manifest_df["name"] = []
manifest_df["model_name"] = []
manifest_df["allele"] = []
manifest_df["hyperparameters"] = []
manifest_df["history"] = []
manifest_df["num_measurements"] = []
manifest_df["random_negative_rate"] = []
manifest_df["sources"] = []
manifest_df["fit_seconds"] = []
manifest_df["config_json"] = []
manifest_df["model"] = []
self.manifest_df = manifest_df
......@@ -52,17 +52,16 @@ class Class1AffinityPredictor(object):
if model_names_to_write is None:
# Write all models
models_names_to_write = self.manifest_df.model_name.values
model_names_to_write = self.manifest_df.model_name.values
sub_manifest_df = self.manifest_df.ix[
self.manifest_df.model_name.isin(models_names_to_write)
self.manifest_df.model_name.isin(model_names_to_write)
]
for (_, row) in sub_manifest_df.iterrows():
model_path = join(models_dir, "%s.pickle" % row.name)
with open(join(model_path), 'wb') as fd:
pickle.dump(row.model, fd, protocol=2)
print("Wrote: %s" % model_path)
weights_path = self.weights_path(models_dir, row.model_name)
row.model.save_weights(weights_path)
print("Wrote: %s" % weights_path)
write_manifest_df = self.manifest_df[[
c for c in self.manifest_df.columns if c != "model"
......@@ -77,21 +76,30 @@ class Class1AffinityPredictor(object):
str(time.time()).encode()).hexdigest()[:16]
return "%s-%d-%s" % (allele, num, random_string)
@staticmethod
def weights_path(models_dir, model_name):
    """
    Return the filesystem path where weights for the named model live.

    Parameters
    ----------
    models_dir : str
        Directory holding the persisted models.
    model_name : str
        Model name as recorded in the manifest.

    Returns
    -------
    str
    """
    # The extension (e.g. "npz") comes from the network class so that
    # save and load always agree on the filename.
    filename = "%s.%s" % (
        model_name, Class1NeuralNetwork.weights_filename_extension)
    return join(models_dir, filename)
@staticmethod
def load(models_dir, max_models=None):
manifest_path = join(models_dir, "manifest.csv")
manifest_df = pandas.read_csv(manifest_path, nrows=max_models)
manifest_df["hyperparameters"] = manifest_df.hyperparameters.map(eval)
manifest_df["history"] = manifest_df.history.map(eval)
allele_to_allele_specific_models = collections.defaultdict(list)
class1_pan_allele_models = []
all_models = []
for (_, row) in manifest_df.iterrows():
model_path = join(models_dir, "%s.pickle" % row["name"])
print("Loading model: %s" % model_path)
with open(model_path, 'rb') as fd:
model = pickle.load(fd)
model = Class1NeuralNetwork.from_config(
json.loads(row.config_json))
weights_path = Class1AffinityPredictor.weights_path(
models_dir, row.model_name)
print("Loading model weights: %s" % weights_path)
model.restore_weights(weights_path)
if row.allele == "pan-class1":
class1_pan_allele_models.append(model)
else:
......@@ -131,7 +139,6 @@ class Class1AffinityPredictor(object):
allele,
peptides,
affinities,
output_assignments=None,
models_dir_for_save=None,
verbose=1):
......@@ -139,33 +146,30 @@ class Class1AffinityPredictor(object):
models = self._fit_predictors(
n_models=n_models,
architecture_hyperparameters=architecture_hyperparameters,
peptide=peptides,
peptides=peptides,
affinities=affinities,
output_assignments=output_assignments,
allele_pseudosequences=None,
verbose=verbose)
if allele not in self.allele_to_allele_specific_models:
self.allele_to_allele_specific_models[allele] = []
models_list = []
for (i, model) in enumerate(models):
name = self.model_name(allele, i)
model_name = self.model_name(allele, i)
models_list.append(model) # models is a generator
row = pandas.Series({
"allele": allele,
"hyperparameters": architecture_hyperparameters,
"history": model.fit_history.history,
"name": name,
"num_measurements": len(peptides),
"fit_seconds": model.fit_seconds,
"model": model,
}).to_frame().T
row = pandas.Series(collections.OrderedDict([
("model_name", model_name),
("allele", allele),
("config_json", json.dumps(model.get_config())),
("model", model),
])).to_frame().T
self.manifest_df = pandas.concat(
[self.manifest_df, row], ignore_index=True)
self.allele_to_allele_specific_models[allele].append(model)
if models_dir_for_save:
self.save(models_dir_for_save, model_names_to_write=[name])
if allele not in self.allele_to_allele_specific_models:
self.allele_to_allele_specific_models[allele] = []
self.allele_to_allele_specific_models[allele].extend(models_list)
self.save(
models_dir_for_save, model_names_to_write=[model_name])
return models
def fit_class1_pan_allele_models(
......@@ -175,7 +179,6 @@ class Class1AffinityPredictor(object):
alleles,
peptides,
affinities,
output_assignments=None,
models_dir_for_save=None,
verbose=1):
......@@ -187,28 +190,22 @@ class Class1AffinityPredictor(object):
architecture_hyperparameters=architecture_hyperparameters,
peptides=peptides,
affinities=affinities,
output_assignments=output_assignments,
allele_pseudosequences=allele_pseudosequences)
models_list = []
for (i, model) in enumerate(models):
name = self.model_name("pan-class1", i)
models_list.append(model) # models is a generator
row = pandas.Series({
"allele": "pan-class1",
"hyperparameters": architecture_hyperparameters,
"history": model.fit_history.history,
"name": name,
"num_measurements": len(peptides),
"fit_seconds": model.fit_seconds,
"model": model,
}).to_frame().T
model_name = self.model_name("pan-class1", i)
self.class1_pan_allele_models.append(model)
row = pandas.Series(collections.OrderedDict([
("model_name", model_name),
("allele", "pan-class1"),
("config_json", json.dumps(model.get_config())),
("model", model),
])).to_frame().T
self.manifest_df = pandas.concat(
[self.manifest_df, row], ignore_index=True)
if models_dir_for_save:
self.save(models_dir_for_save, model_names_to_write=[name])
self.class1_pan_allele_models.extend(models_list)
self.save(
models_dir_for_save, model_names_to_write=[model_name])
return models
def _fit_predictors(
......@@ -217,20 +214,16 @@ class Class1AffinityPredictor(object):
architecture_hyperparameters,
peptides,
affinities,
output_assignments,
allele_pseudosequences,
verbose=1):
encodable_peptides = EncodableSequences.create(peptides)
if output_assignments is None:
output_assignments = ["output"] * len(encodable_peptides.sequences)
for i in range(n_models):
print("Training model %d / %d" % (i + 1, n_models))
model = Class1NeuralNetwork(**architecture_hyperparameters)
model.fit(
encodable_peptides,
affinities,
output_assignments=output_assignments,
allele_pseudosequences=allele_pseudosequences,
verbose=verbose)
yield model
......
......@@ -22,6 +22,8 @@ from ..common import random_peptides, amino_acid_distribution
class Class1NeuralNetwork(object):
weights_filename_extension = "npz"
network_hyperparameter_defaults = HyperparameterDefaults(
kmer_size=15,
use_embedding=True,
......@@ -38,6 +40,10 @@ class Class1NeuralNetwork(object):
batch_normalization=True,
embedding_init_method="glorot_uniform",
locally_connected_layers=[],
)
compile_hyperparameter_defaults = HyperparameterDefaults(
loss="mse",
optimizer="rmsprop",
)
......@@ -65,6 +71,7 @@ class Class1NeuralNetwork(object):
mode='auto')
hyperparameter_defaults = network_hyperparameter_defaults.extend(
compile_hyperparameter_defaults).extend(
input_encoding_hyperparameter_defaults).extend(
fit_hyperparameter_defaults).extend(
early_stopping_hyperparameter_defaults)
......@@ -75,11 +82,25 @@ class Class1NeuralNetwork(object):
self.network = None
self.loss_history = None
self.fit_seconds = None
self.fit_num_points = None
def __getstate__(self):
def get_config(self):
    """
    Return a serializable description of this object.

    The result is a copy of the instance ``__dict__`` with the live keras
    model removed and replaced by its JSON architecture string under the
    key ``'network_json'``.  Weights are NOT included; see __getstate__.
    """
    config = dict(vars(self))
    del config['network']
    config['network_json'] = self.network.to_json()
    return config
@classmethod
def from_config(cls, config):
    """
    Inverse of get_config: rebuild an instance from a config dict.

    Parameters
    ----------
    config : dict
        As produced by get_config(); must contain 'hyperparameters' and
        'network_json'.  The argument is not mutated.

    Returns
    -------
    instance of cls
    """
    remaining = dict(config)
    instance = cls(**remaining.pop('hyperparameters'))
    # Rebuild the (weight-less) keras model from its JSON architecture;
    # weights, if any, are restored separately (see restore_weights).
    instance.network = keras.models.model_from_json(
        remaining.pop('network_json'))
    instance.__dict__.update(remaining)
    return instance
def __getstate__(self):
    """
    Support pickling: serialize the config (which excludes the live keras
    model) together with a snapshot of the network weights.
    """
    state = self.get_config()
    state['network_weights'] = self.get_weights()
    return state
......@@ -90,16 +111,19 @@ class Class1NeuralNetwork(object):
self.network = keras.models.model_from_json(network_json)
self.set_weights(network_weights)
def get_weights(self):
    """
    Snapshot the network's current weights.

    Returns
    -------
    list of numpy.ndarray
        Copies of the weight arrays, so later training cannot mutate the
        caller's snapshot; suitable for passing to set_weights later.
    """
    return [array.copy() for array in self.network.get_weights()]
def set_weights(self, weights):
"""
Reset the model weights.
"""
def save_weights(self, filename):
    """
    Write the network weights to an uncompressed ``.npz`` archive.

    Arrays are stored under the keys "array_0", "array_1", ... so that
    restore_weights can reload them in their original order.

    Parameters
    ----------
    filename : str
        Destination path (numpy appends ".npz" if it is missing).
    """
    arrays = self.network.get_weights()
    named = dict(
        ("array_%d" % index, array)
        for (index, array) in enumerate(arrays))
    numpy.savez(filename, **named)
def restore_weights(self, filename):
    """
    Restore network weights previously written by save_weights.

    Parameters
    ----------
    filename : str
        Path to the ``.npz`` archive.
    """
    archive = numpy.load(filename)
    # Arrays were saved as "array_0" ... "array_<N-1>"; reload in order.
    weights = [
        archive["array_%d" % index]
        for index in range(len(archive.files))
    ]
    archive.close()
    self.network.set_weights(weights)
def peptides_to_network_input(self, peptides):
......@@ -133,10 +157,11 @@ class Class1NeuralNetwork(object):
allele_pseudosequences=None,
sample_weights=None,
verbose=1):
self.fit_num_points = len(peptides)
encodable_peptides = EncodableSequences.create(peptides)
peptide_encoding = self.peptides_to_network_input(encodable_peptides)
peptide_to_encoding = dict(
zip(encodable_peptides.sequences, peptide_encoding))
length_counts = (
pandas.Series(encodable_peptides.sequences)
......@@ -181,6 +206,7 @@ class Class1NeuralNetwork(object):
pseudosequence_length=pseudosequence_length,
**self.network_hyperparameter_defaults.subselect(
self.hyperparameters))
self.compile()
y_dict_with_random_negatives = {
"output": numpy.concatenate([
......@@ -279,6 +305,11 @@ class Class1NeuralNetwork(object):
(predictions,) = numpy.array(self.network.predict(x_dict)).T
return to_ic50(predictions)
def compile(self):
    """
    Compile the underlying keras model.

    Only the compile-related subset of this object's hyperparameters
    (those declared in compile_hyperparameter_defaults, e.g. loss and
    optimizer) is passed through to ``network.compile``.
    """
    compile_kwargs = self.compile_hyperparameter_defaults.subselect(
        self.hyperparameters)
    self.network.compile(**compile_kwargs)
@staticmethod
def make_network(
pseudosequence_length,
......@@ -296,8 +327,7 @@ class Class1NeuralNetwork(object):
dropout_probability,
batch_normalization,
embedding_init_method,
locally_connected_layers,
optimizer):
locally_connected_layers):
if use_embedding:
peptide_input = Input(
......@@ -374,7 +404,4 @@ class Class1NeuralNetwork(object):
activation=output_activation,
name="output")(current_layer)
model = keras.models.Model(inputs=inputs, outputs=[output])
model.compile(
loss="mse",
optimizer=optimizer)
return model
......@@ -5,25 +5,14 @@ Train single allele models
import sys
import argparse
import json
import os
import pickle
import pandas
import mhcnames
from .class1_neural_network import Class1NeuralNetwork
from .class1_affinity_predictor import Class1AffinityPredictor
from ..common import configure_logging
def normalize_allele_name(s):
    """
    Best-effort normalization of an MHC allele name via mhcnames.

    Returns the normalized name, or the sentinel string "UNKNOWN" when the
    input cannot be parsed.  Any exception from mhcnames is swallowed
    deliberately so that one malformed allele name does not abort a run.
    """
    try:
        normalized = mhcnames.normalize_allele_name(s)
    except Exception:
        return "UNKNOWN"
    return normalized
parser = argparse.ArgumentParser(usage=__doc__)
parser.add_argument(
......@@ -62,9 +51,8 @@ parser.add_argument(
default=1)
def run():
args = parser.parse_args(sys.argv[1:])
def run(argv=sys.argv[1:]):
args = parser.parse_args(argv)
configure_logging(verbose=args.verbosity > 1)
......@@ -93,20 +81,11 @@ def run():
print("Selected %d alleles: %s" % (len(alleles), ' '.join(alleles)))
print("Training data: %s" % (str(df.shape)))
manifest = pandas.DataFrame()
manifest["name"] = []
manifest["hyperparameters_index"] = []
manifest["model_group"] = []
manifest["allele"] = []
manifest["hyperparameters"] = []
manifest["history"] = []
manifest["num_measurements"] = []
manifest["fit_seconds"] = []
manifest_path = os.path.join(args.out_models_dir, "manifest.csv")
predictor = Class1AffinityPredictor()
for (h, hyperparameters) in enumerate(hyperparameters_lst):
n_models = hyperparameters.pop("n_models")
for model_group in range(n_models):
for (i, allele) in enumerate(alleles):
print(
......@@ -123,41 +102,13 @@ def run():
train_data = df.ix[df.allele == allele].dropna().sample(
frac=1.0)
model = Class1NeuralNetwork(
verbose=args.verbosity,
**hyperparameters)
model.fit(
train_data.peptide.values,
train_data.measurement_value.values)
print("Fit in %0.2f sec" % model.fit_seconds)
name = "%s-%d-%d" % (
allele.replace("*", "_"),
h,
model_group)
row = pandas.Series({
"hyperparameters_index": h,
"model_group": model_group,
"allele": allele,
"hyperparameters": hyperparameters,
"history": model.fit_history,
"name": name,
"num_measurements": len(train_data),
"fit_seconds": model.fit_seconds,
}).to_frame().T
manifest = pandas.concat([manifest, row], ignore_index=True)
print(manifest)
manifest.to_csv(manifest_path, index=False)
print("Wrote: %s" % manifest_path)
model_path = os.path.join(
args.out_models_dir, "%s.pickle" % name)
with open(model_path, 'wb') as fd:
pickle.dump(model, fd, protocol=2)
print("Wrote: %s" % model_path)
predictor.fit_allele_specific_predictors(
n_models=1,
architecture_hyperparameters=hyperparameters,
allele=allele,
peptides=train_data.peptide.values,
affinities=train_data.measurement_value.values,
models_dir_for_save=args.out_models_dir)
if __name__ == '__main__':
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment