data_dependent_weights_initialization.py

"""
Layer-sequential unit-variance initialization for neural networks.

See:
    Mishkin and Matas, "All you need is a good init". 2016.
    https://arxiv.org/abs/1511.06422
"""
#
# LSUV initialization code in this file is adapted from:
#   https://github.com/ducha-aiki/LSUV-keras/blob/master/lsuv_init.py
# by Dmytro Mishkin
#
# Here is the license for the original code:
#
#
# Copyright (C) 2017, Dmytro Mishkin
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the
#    distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from __future__ import print_function
import numpy


def svd_orthonormal(shape):
    # Orthonormal init code is from Lasagne
    # https://github.com/Lasagne/Lasagne/blob/master/lasagne/init.py
    if len(shape) < 2:
        raise RuntimeError("Only shapes of length 2 or more are supported.")
    flat_shape = (shape[0], numpy.prod(shape[1:]))
    a = numpy.random.standard_normal(flat_shape).astype("float32")
    u, _, v = numpy.linalg.svd(a, full_matrices=False)
    q = u if u.shape == flat_shape else v
    q = q.reshape(shape)
    return q


def get_activations(model, layer, X_batch):
    from keras.models import Model
    intermediate_layer_model = Model(
        inputs=model.get_input_at(0),
        outputs=layer.get_output_at(0)
    )
    activations = intermediate_layer_model.predict(X_batch)
    return activations


def lsuv_init(model, batch, verbose=True, margin=0.1, max_iter=100):
    """
    Initialize neural network weights using layer-sequential unit-variance
    initialization.

    See:
        Mishkin and Matas, "All you need is a good init". 2016.
        https://arxiv.org/abs/1511.06422

    Parameters
    ----------
    model : keras.Model
    batch : dict
        Training data, as would be passed keras.Model.fit()
    verbose : boolean
        Whether to print progress to stdout
    margin : float
    max_iter : int

    Returns
    -------
    keras.Model
        Same as what was passed in.
    """
    from keras.layers import Dense, Convolution2D
    needed_variance = 1.0
    layers_inintialized = 0
    for layer in model.layers:
        if not isinstance(layer, (Dense, Convolution2D)):
            continue
        # avoid small layers where activation variance close to zero, esp.
        # for small batches_generator
        if numpy.prod(layer.get_output_shape_at(0)[1:]) < 32:
            if verbose:
                print('LSUV initialization skipping', layer.name)
            continue
        layers_inintialized += 1
        weights_and_biases = layer.get_weights()
        weights_and_biases[0] = svd_orthonormal(weights_and_biases[0].shape)
        layer.set_weights(weights_and_biases)
        activations = get_activations(model, layer, batch)
        variance = numpy.var(activations)
        iteration = 0
        if verbose:
            print(layer.name, variance)
        while abs(needed_variance - variance) > margin:
            if verbose:
                print(
                    'LSUV initialization',
                    layer.name,
                    iteration,
                    needed_variance,
                    margin,
                    variance)

            if numpy.abs(numpy.sqrt(variance)) < 1e-7:
                break  # avoid zero division

            weights_and_biases = layer.get_weights()
            weights_and_biases[0] /= numpy.sqrt(variance) / numpy.sqrt(
                needed_variance)
            layer.set_weights(weights_and_biases)
            activations = get_activations(model, layer, batch)
            variance = numpy.var(activations)

            iteration += 1
            if iteration >= max_iter:
                break
    if verbose:
        print('Done with LSUV: total layers initialized', layers_inintialized)
    return model