Skip to content
Snippets Groups Projects
Commit 024618ec authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

presentation model saving and loading

parent 09dd0cd9
No related branches found
No related tags found
No related merge requests found
......@@ -26,6 +26,7 @@ from .regression_target import to_ic50
from .version import __version__
from .ensemble_centrality import CENTRALITY_MEASURES
from .allele_encoding import AlleleEncoding
from .common import save_weights, load_weights
# Default function for combining predictions across models in an ensemble.
......@@ -370,8 +371,7 @@ class Class1AffinityPredictor(object):
updated_network_config_jsons.append(
json.dumps(row.model.get_config()))
weights_path = self.weights_path(models_dir, row.model_name)
Class1AffinityPredictor.save_weights(
row.model.get_weights(), weights_path)
save_weights(row.model.get_weights(), weights_path)
logging.info("Wrote: %s", weights_path)
sub_manifest_df["config_json"] = updated_network_config_jsons
self.manifest_df.loc[
......@@ -469,9 +469,7 @@ class Class1AffinityPredictor(object):
# We will lazy-load weights when the network is used.
model = Class1NeuralNetwork.from_config(
config,
weights_loader=partial(
Class1AffinityPredictor.load_weights,
abspath(weights_filename)))
weights_loader=partial(load_weights, abspath(weights_filename)))
if row.allele == "pan-class1":
class1_pan_allele_models.append(model)
else:
......@@ -1235,46 +1233,6 @@ class Class1AffinityPredictor(object):
del df["normalized_allele"]
return df
@staticmethod
def save_weights(weights_list, filename):
"""
Save the model weights to the given filename using numpy's ".npz"
format.
Parameters
----------
weights_list : list of array
filename : string
Should end in ".npz".
"""
numpy.savez(
filename,
**dict((("array_%d" % i), w) for (i, w) in enumerate(weights_list)))
@staticmethod
def load_weights(filename):
"""
Restore model weights from the given filename, which should have been
created with `save_weights`.
Parameters
----------
filename : string
Should end in ".npz".
Returns
----------
list of array
"""
with numpy.load(filename) as loaded:
weights = [
loaded["array_%d" % i]
for i in range(len(loaded.keys()))
]
return weights
def calibrate_percentile_ranks(
self,
peptides=None,
......
......@@ -699,4 +699,49 @@ class Class1PresentationNeuralNetwork(object):
if network_weights is not None:
self.network.set_weights(network_weights)
def get_config(self):
"""
serialize to a dict all attributes except model weights
Returns
-------
dict
"""
result = dict(self.__dict__)
result['network'] = None
result['network_weights'] = None
result['network_json'] = None
if self.network:
result['network_weights'] = self.network.get_weights()
result['network_json'] = self.network.to_json()
return result
@classmethod
def from_config(cls, config, weights=None):
"""
deserialize from a dict returned by get_config().
Parameters
----------
config : dict
weights : list of array, optional
Network weights to restore
weights_loader : callable, optional
Function to call (no arguments) to load weights when needed
Returns
-------
Class1NeuralNetwork
"""
config = dict(config)
instance = cls(**config.pop('hyperparameters'))
network_json = config.pop('network_json')
network_weights = config.pop('network_weights')
instance.__dict__.update(config)
assert instance.network is None
if network_json is not None:
import keras.models
instance.network = keras.models.model_from_json(network_json)
if network_weights is not None:
instance.network.set_weights(network_weights)
return instance
\ No newline at end of file
......@@ -31,16 +31,19 @@ from .custom_loss import (
MSEWithInequalities,
MultiallelicMassSpecLoss,
ZeroLoss)
from .downloads import get_default_class1_presentation_models_dir
from .class1_presentation_neural_network import Class1PresentationNeuralNetwork
from .common import save_weights, load_weights
class Class1PresentationPredictor(object):
def __init__(
self,
class1_presentation_neural_networks,
models,
allele_to_sequence,
manifest_df=None,
metadata_dataframes=None):
self.networks = class1_presentation_neural_networks
self.models = models
self.allele_to_sequence = allele_to_sequence
self._manifest_df = manifest_df
self.metadata_dataframes = (
......@@ -57,7 +60,7 @@ class Class1PresentationPredictor(object):
"""
if self._manifest_df is None:
rows = []
for (i, model) in enumerate(self.networks):
for (i, model) in enumerate(self.models):
rows.append((
self.model_name(i),
json.dumps(model.get_config()),
......@@ -70,10 +73,10 @@ class Class1PresentationPredictor(object):
@property
def max_alleles(self):
max_alleles = self.networks[0].hyperparameters['max_alleles']
max_alleles = self.models[0].hyperparameters['max_alleles']
assert all(
n.hyperparameters['max_alleles'] == self.max_alleles
for n in self.networks)
for n in self.models)
return max_alleles
@staticmethod
......@@ -153,7 +156,7 @@ class Class1PresentationPredictor(object):
score_array = []
affinity_array = []
for (i, network) in enumerate(self.networks):
for (i, network) in enumerate(self.models):
predictions = network.predict(
peptides=peptides,
allele_encoding=alleles,
......@@ -191,24 +194,6 @@ class Class1PresentationPredictor(object):
numpy.percentile(affinity_array[:, :, i], 5.0, axis=0))
return result_df
@staticmethod
def save_weights(weights_list, filename):
"""
Save the model weights to the given filename using numpy's ".npz"
format.
Parameters
----------
weights_list : list of array
filename : string
Should end in ".npz".
"""
numpy.savez(
filename,
**dict((("array_%d" % i), w) for (i, w) in enumerate(weights_list)))
def check_consistency(self):
"""
Verify that self.manifest_df is consistent with instance variables.
......@@ -217,10 +202,10 @@ class Class1PresentationPredictor(object):
Throws AssertionError if inconsistent.
"""
assert len(self.manifest_df) == len(self.networks), (
assert len(self.manifest_df) == len(self.models), (
"Manifest seems out of sync with models: %d vs %d entries: \n%s"% (
len(self.manifest_df),
len(self.networks),
len(self.models),
str(self.manifest_df)))
def save(self, models_dir, model_names_to_write=None, write_metadata=True):
......@@ -301,8 +286,8 @@ class Class1PresentationPredictor(object):
join(models_dir, "allele_sequences.csv"), index=False)
logging.info("Wrote: %s", join(models_dir, "allele_sequences.csv"))
@staticmethod
def load(models_dir=None, max_models=None):
@classmethod
def load(cls, models_dir=None, max_models=None):
"""
Deserialize a predictor from a directory on disk.
......@@ -317,35 +302,24 @@ class Class1PresentationPredictor(object):
Returns
-------
`Class1AffinityPredictor` instance
`Class1PresentationPredictor` instance
"""
if models_dir is None:
models_dir = get_default_class1_models_dir()
models_dir = get_default_class1_presentation_models_dir()
manifest_path = join(models_dir, "manifest.csv")
manifest_df = pandas.read_csv(manifest_path, nrows=max_models)
allele_to_allele_specific_models = collections.defaultdict(list)
class1_pan_allele_models = []
all_models = []
models = []
for (_, row) in manifest_df.iterrows():
weights_filename = Class1AffinityPredictor.weights_path(
models_dir, row.model_name)
weights_filename = cls.weights_path(models_dir, row.model_name)
config = json.loads(row.config_json)
# We will lazy-load weights when the network is used.
model = Class1NeuralNetwork.from_config(
model = Class1PresentationNeuralNetwork.from_config(
config,
weights_loader=partial(
Class1AffinityPredictor.load_weights,
abspath(weights_filename)))
if row.allele == "pan-class1":
class1_pan_allele_models.append(model)
else:
allele_to_allele_specific_models[row.allele].append(model)
all_models.append(model)
weights=load_weights(abspath(weights_filename)))
models.append(model)
manifest_df["model"] = all_models
manifest_df["model"] = models
# Load allele sequences
allele_to_sequence = None
......@@ -354,40 +328,9 @@ class Class1PresentationPredictor(object):
join(models_dir, "allele_sequences.csv"),
index_col=0).iloc[:, 0].to_dict()
allele_to_percent_rank_transform = {}
percent_ranks_path = join(models_dir, "percent_ranks.csv")
if exists(percent_ranks_path):
percent_ranks_df = pandas.read_csv(percent_ranks_path, index_col=0)
for allele in percent_ranks_df.columns:
allele_to_percent_rank_transform[allele] = (
PercentRankTransform.from_series(percent_ranks_df[allele]))
logging.info(
"Loaded %d class1 pan allele predictors, %d allele sequences, "
"%d percent rank distributions, and %d allele specific models: %s",
len(class1_pan_allele_models),
len(allele_to_sequence) if allele_to_sequence else 0,
len(allele_to_percent_rank_transform),
sum(len(v) for v in allele_to_allele_specific_models.values()),
", ".join(
"%s (%d)" % (allele, len(v))
for (allele, v)
in sorted(allele_to_allele_specific_models.items())))
result = Class1AffinityPredictor(
allele_to_allele_specific_models=allele_to_allele_specific_models,
class1_pan_allele_models=class1_pan_allele_models,
logging.info("Loaded %d class1 presentation models", len(models))
result = cls(
models=models,
allele_to_sequence=allele_to_sequence,
manifest_df=manifest_df,
allele_to_percent_rank_transform=allele_to_percent_rank_transform,
)
if optimization_level >= 1:
optimized = result.optimize()
logging.info(
"Model optimization %s",
"succeeded" if optimized else "not supported for these models")
manifest_df=manifest_df)
return result
# TODO: implement saving and loading
\ No newline at end of file
......@@ -174,3 +174,35 @@ def positional_frequency_matrix(peptides):
result = (counts / len(peptides)).fillna(0.0).T
result.index.name = 'position'
return result
def save_weights(weights_list, filename):
"""
Save model weights to the given filename using numpy's ".npz" format.
Parameters
----------
weights_list : list of numpy array
filename : string
"""
numpy.savez(filename,
**dict((("array_%d" % i), w) for (i, w) in enumerate(weights_list)))
def load_weights(filename):
"""
Restore model weights from the given filename, which should have been
created with `save_weights`.
Parameters
----------
filename : string
Returns
----------
list of array
"""
with numpy.load(filename) as loaded:
weights = [loaded["array_%d" % i] for i in range(len(loaded.keys()))]
return weights
......@@ -28,6 +28,8 @@ _CURRENT_RELEASE = None
_METADATA = None
_MHCFLURRY_DEFAULT_CLASS1_MODELS_DIR = environ.get(
"MHCFLURRY_DEFAULT_CLASS1_MODELS")
_MHCFLURRY_DEFAULT_CLASS1_PRESENTATION_MODELS_DIR = environ.get(
"MHCFLURRY_DEFAULT_CLASS1_PRESENTATION_MODELS_DIR")
def get_downloads_dir():
......@@ -84,6 +86,38 @@ def get_default_class1_models_dir(test_exists=True):
return get_path("models_class1", "models", test_exists=test_exists)
def get_default_class1_presentation_models_dir(test_exists=True):
"""
Return the absolute path to the default class1 presentation models dir.
See `get_default_class1_models_dir`.
If environment variable MHCFLURRY_DEFAULT_CLASS1_PRESENTATION_MODELS is set
to an absolute path, return that path. If it's set to a relative path (does
not start with /) then return that path taken to be relative to the mhcflurry
downloads dir.
Parameters
----------
test_exists : boolean, optional
Whether to raise an exception of the path does not exist
Returns
-------
string : absolute path
"""
if _MHCFLURRY_DEFAULT_CLASS1_PRESENTATION_MODELS_DIR:
result = join(
get_downloads_dir(),
_MHCFLURRY_DEFAULT_CLASS1_PRESENTATION_MODELS_DIR)
if test_exists and not exists(result):
raise IOError("No such directory: %s" % result)
return result
return get_path(
"models_class1_pan_refined", "presentation", test_exists=test_exists)
def get_current_release_downloads():
"""
Return a dict of all available downloads in the current release.
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment