from __future__ import print_function

import time
import collections
from six import string_types

import numpy
import pandas
import mhcnames
import hashlib

from .hyperparameters import HyperparameterDefaults
from .class1_neural_network import Class1NeuralNetwork, DEFAULT_PREDICT_BATCH_SIZE
from .encodable_sequences import EncodableSequences
from .regression_target import from_ic50, to_ic50
from .random_negative_peptides import RandomNegativePeptides
from .allele_encoding import MultipleAlleleEncoding, AlleleEncoding
from .auxiliary_input import AuxiliaryInputEncoder
from .batch_generator import MultiallelicMassSpecBatchGenerator
from .custom_loss import (
    MSEWithInequalities,
    MultiallelicMassSpecLoss,
    ZeroLoss)

class Class1LigandomePredictor(object):
    """
    Predictor of multiallelic mass spec ("ligandome") hits, built on top of
    an ensemble of pan-allele Class1 affinity predictor networks.
    """
    network_hyperparameter_defaults = HyperparameterDefaults(
        allele_amino_acid_encoding="BLOSUM62",
        peptide_encoding={
            'vector_encoding_name': 'BLOSUM62',
            'alignment_method': 'left_pad_centered_right_pad',
            'max_length': 15,
        },
        max_alleles=6,
    )
    """
    Hyperparameters (and their default values) that affect the neural network
    architecture.
    """

    fit_hyperparameter_defaults = HyperparameterDefaults(
        max_epochs=500,
        early_stopping=True,
        random_negative_affinity_min=20000.0,).extend(
        RandomNegativePeptides.hyperparameter_defaults).extend(
        MultiallelicMassSpecBatchGenerator.hyperparameter_defaults
    )
    """
    Hyperparameters for neural network training.
    """

    early_stopping_hyperparameter_defaults = HyperparameterDefaults(
        patience=20,
        min_delta=0.0,
    )
    """
    Hyperparameters for early stopping.
    """

    compile_hyperparameter_defaults = HyperparameterDefaults(
        loss_multiallelic_mass_spec_delta=0.2,
        loss_multiallelic_mass_spec_multiplier=1.0,
        optimizer="rmsprop",
        learning_rate=None,
    )
    """
    Loss and optimizer hyperparameters. Any values supported by keras may be
    used.
    """

    auxiliary_input_hyperparameter_defaults = HyperparameterDefaults(
        auxiliary_input_features=["gene"],
        auxiliary_input_feature_parameters={},
    )
    """
    Allele feature hyperparameters.
    """

    # Union of all hyperparameter groups above.
    hyperparameter_defaults = network_hyperparameter_defaults.extend(
        fit_hyperparameter_defaults).extend(
        early_stopping_hyperparameter_defaults).extend(
        compile_hyperparameter_defaults).extend(
        auxiliary_input_hyperparameter_defaults)

    def __init__(
            self,
            class1_affinity_predictor,
            max_ensemble_size=None,
            **hyperparameters):
        """
        Construct a ligandome predictor from a pan-allele affinity predictor.

        Parameters
        ----------
        class1_affinity_predictor : Class1AffinityPredictor
            Must contain only pan-allele models (allele-specific models are
            not supported).
        max_ensemble_size : int, optional
            If given, use at most this many models from the ensemble.
        **hyperparameters
            Overrides for `hyperparameter_defaults`.

        Raises
        ------
        NotImplementedError
            If the predictor has no pan-allele models or has any
            allele-specific models.
        """
        if not class1_affinity_predictor.class1_pan_allele_models:
            raise NotImplementedError("Pan allele models required")
        if class1_affinity_predictor.allele_to_allele_specific_models:
            raise NotImplementedError("Only pan allele models are supported")

        self.hyperparameters = self.hyperparameter_defaults.with_defaults(
            hyperparameters)

        models = class1_affinity_predictor.class1_pan_allele_models
        if max_ensemble_size is not None:
            models = models[:max_ensemble_size]

        self.network = self.make_network(
            models,
            self.hyperparameters)

        # One dict of training statistics per call to fit().
        self.fit_info = []
        self.allele_to_sequence = class1_affinity_predictor.allele_to_sequence
        self.allele_representation_hash = None

    @staticmethod
    def make_network(pan_allele_class1_neural_networks, hyperparameters):
        """
        Build the ligandome keras network.

        The pan-allele affinity networks are merged (averaged) into a single
        ensemble, whose post-merge layers are lifted with TimeDistributed so
        the same affinity predictor runs across up to `max_alleles` alleles
        per sample. The ligandome output is the affinity prediction (in logit
        space) plus a small learned adjustment, passed through a sigmoid.

        Parameters
        ----------
        pan_allele_class1_neural_networks : list of Class1NeuralNetwork
        hyperparameters : dict

        Returns
        -------
        keras.models.Model
            Outputs: [affinity (first allele), ligandome scores matrix,
            affinities matrix].
        """
        import keras.backend as K
        from keras.layers import (
            Input,
            TimeDistributed,
            Dense,
            Flatten,
            RepeatVector,
            concatenate,
            Activation,
            Lambda,
            Add,
            Embedding)
        from keras.models import Model
        import keras.initializers

        networks = [
            model.network() for model in pan_allele_class1_neural_networks
        ]
        merged_ensemble = Class1NeuralNetwork.merge(
            networks,
            merge_method="average")

        peptide_shape = tuple(
            int(x) for x in K.int_shape(merged_ensemble.inputs[0])[1:])

        input_alleles = Input(
            shape=(hyperparameters['max_alleles'],), name="allele")
        input_peptides = Input(
            shape=peptide_shape,
            dtype='float32',
            name='peptide')

        peptides_flattened = Flatten()(input_peptides)
        peptides_repeated = RepeatVector(hyperparameters['max_alleles'])(
            peptides_flattened)

        allele_representation = Embedding(
            name="allele_representation",
            input_dim=64,  # arbitrary, how many alleles to have room for
            output_dim=1029,
            input_length=hyperparameters['max_alleles'],
            trainable=False,
            mask_zero=True)(input_alleles)

        #allele_flat = Reshape((6, -1), name="allele_flat")(allele_representation)
        allele_flat = allele_representation

        allele_peptide_merged = concatenate(
            [peptides_repeated, allele_flat], name="allele_peptide_merged")

        layer_names = [
            layer.name for layer in merged_ensemble.layers
        ]

        # Sanity-check the merged ensemble has the expected initial layers.
        pan_allele_layer_initial_names = [
            'allele', 'peptide',
            'allele_representation', 'flattened_0', 'allele_flat',
            'allele_peptide_merged', 'dense_0', 'dropout_0',
        ]

        def startswith(lst, prefix):
            # True if lst begins with the given prefix sequence.
            return lst[:len(prefix)] == prefix

        assert startswith(layer_names, pan_allele_layer_initial_names), layer_names

        # Re-wire every layer after allele_peptide_merged onto the new
        # multi-allele graph, lifting single-input layers with
        # TimeDistributed so they apply per-allele.
        layers = merged_ensemble.layers[
            pan_allele_layer_initial_names.index(
                "allele_peptide_merged") + 1:
        ]
        node = allele_peptide_merged
        layer_name_to_new_node = {
            "allele_peptide_merged": allele_peptide_merged,
        }
        for layer in layers:
            assert layer.name not in layer_name_to_new_node
            input_layer_names = []
            for inbound_node in layer._inbound_nodes:
                for inbound_layer in inbound_node.inbound_layers:
                    input_layer_names.append(inbound_layer.name)
            input_nodes = [
                layer_name_to_new_node[name]
                for name in input_layer_names
            ]

            if len(input_nodes) == 1:
                lifted = TimeDistributed(layer)
                node = lifted(input_nodes[0])
            else:
                # BUGFIX: this call previously ran unconditionally, clobbering
                # the TimeDistributed result for single-input layers.
                node = layer(input_nodes)

            layer_name_to_new_node[layer.name] = node

        affinity_predictor_matrix_output = node

        # Affinity prediction for the first allele, exposed as its own output.
        affinity_predictor_output = Lambda(
            lambda x: x[:, 0], name="affinity_output")(
                affinity_predictor_matrix_output)

        """
        layer = Dense(8, activation="sigmoid", kernel_initializer=keras.initializers.RandomNormal(mean=1.0/8.0, stddev=1e-5), use_bias=False)
        lifted = TimeDistributed(layer, name="ligandome_hidden1")
        node = lifted(affinity_predictor_matrix_output)
        """

        auxiliary_input = None
        if hyperparameters['auxiliary_input_features']:
            auxiliary_input = Input(
                shape=(
                    hyperparameters['max_alleles'],
                    len(
                        AuxiliaryInputEncoder.get_columns(
                            hyperparameters['auxiliary_input_features'],
                            feature_parameters=hyperparameters[
                                'auxiliary_input_feature_parameters']))),
                dtype="float32",
                name="auxiliary")
            node = concatenate(
                [node, auxiliary_input], name="affinities_with_auxiliary")

        #layer = Dense(1, activation="linear", kernel_initializer=keras.initializers.RandomNormal(mean=0.0, stddev=1e-5), use_bias=False)
        layer = Dense(8, activation="tanh")
        lifted = TimeDistributed(layer, name="ligandome_hidden1")
        node = lifted(node)

        layer = Dense(1, activation="tanh")
        lifted = TimeDistributed(layer, name="ligandome_output")
        ligandome_adjustment = lifted(node)

        """
        weights = layers[-1].get_weights()
        layer = Dense(1, activation="sigmoid", kernel_initializer=keras.initializers.Constant(weights[0]), bias_initializer=keras.initializers.Constant(weights[1]))
        lifted = TimeDistributed(layer, name="ligandome_output")
        ligandome_output = lifted(prev_node)
        """

        def logit(x):
            # Inverse sigmoid, computed in tensorflow.
            import tensorflow as tf
            return - tf.log(1. / x - 1.)

        # Ligandome score = sigmoid(logit(affinity) + adjustment).
        ligandome_output_pre_sigmoid = Add()([
            Lambda(logit)(affinity_predictor_matrix_output),
            ligandome_adjustment])
        ligandome_output = Activation("sigmoid")(ligandome_output_pre_sigmoid)

        #ligandome_output = affinity_predictor_matrix_output
        #output_node = concatenate([
        #    affinity_predictor_output, ligandome_output
        #], name="combined_output")

        network = Model(
            inputs=[
                input_peptides,
                input_alleles,
            ] + ([] if auxiliary_input is None else [auxiliary_input]),
            outputs=[
                affinity_predictor_output,
                ligandome_output,
                affinity_predictor_matrix_output
            ],
            name="ligandome",
        )
        network.summary()
        return network

    def peptides_to_network_input(self, peptides):
        """
        Encode peptides into the fixed-length representation expected by the
        neural network (which depends on the architecture).

        Parameters
        ----------
        peptides : EncodableSequences or list of string

        Returns
        -------
        numpy.array
        """
        sequences = EncodableSequences.create(peptides)
        encoding = sequences.variable_length_to_fixed_length_vector_encoding(
            **self.hyperparameters['peptide_encoding'])
        assert len(encoding) == len(peptides)
        return encoding

    def allele_encoding_to_network_input(self, allele_encoding):
        """
        Encode alleles into the fixed-length representation expected by the
        neural network (which depends on the architecture).

        Parameters
        ----------
        allele_encoding : AlleleEncoding

        Returns
        -------
        (numpy.array, numpy.array)

        Indices and allele representations.

        """
        encoding_name = self.hyperparameters['allele_amino_acid_encoding']
        representations = allele_encoding.allele_representations(encoding_name)
        return (allele_encooding_indices_helper(allele_encoding), representations) if False else (
            allele_encoding.indices, representations)

    def fit(
            self,
            peptides,
            labels,
            allele_encoding,
            affinities_mask=None,  # True when a peptide/label is actually a peptide and an affinity
            inequalities=None,  # interpreted only for elements where affinities_mask is True, otherwise ignored
            verbose=1,
            progress_callback=None,
            progress_preamble="",
            progress_print_interval=5.0):
        """
        Train the network on a mix of multiallelic mass spec hits and,
        optionally, affinity measurements, with random negative peptides
        regenerated every epoch.

        Parameters
        ----------
        peptides : EncodableSequences or list of string
        labels : list of float
            Mass spec hit indicators (0/1) or, where affinities_mask is True,
            affinities in nM.
        allele_encoding : MultipleAlleleEncoding
        affinities_mask : list of bool, optional
            True where the corresponding label is an affinity measurement.
        inequalities : list of string, optional
            One of "=", "<", ">" per element; used only where
            affinities_mask is True.
        verbose : int
        progress_callback : callable, optional
            Called with no arguments after each epoch.
        progress_preamble : string
        progress_print_interval : float or None
            Seconds between progress prints; None disables printing.

        Returns
        -------
        dict with keys 'batch_generator', 'last_x', 'last_y', 'fit_info'.
        """
        import keras.backend as K

        assert isinstance(allele_encoding, MultipleAlleleEncoding)
        assert (
            allele_encoding.max_alleles_per_experiment ==
            self.hyperparameters['max_alleles'])

        #for layer in self.network._layers[:8]:
        #    print("Setting non trainable", layer)
        #    layer.trainable = False
        #    import ipdb ; ipdb.set_trace()

        encodable_peptides = EncodableSequences.create(peptides)

        if labels is not None:
            labels = numpy.array(labels, copy=False)
        if inequalities is not None:
            inequalities = numpy.array(inequalities, copy=True)
        else:
            inequalities = numpy.tile("=", len(labels))
        if affinities_mask is not None:
            affinities_mask = numpy.array(affinities_mask, copy=False)
        else:
            affinities_mask = numpy.tile(False, len(labels))
        # Inequalities are meaningful only for affinity measurements.
        inequalities[~affinities_mask] = "="

        random_negatives_planner = RandomNegativePeptides(
            **RandomNegativePeptides.hyperparameter_defaults.subselect(
                self.hyperparameters))
        random_negatives_planner.plan(
            peptides=encodable_peptides.sequences,
            affinities=numpy.where(affinities_mask, labels, to_ic50(labels)),
            alleles=[
                numpy.random.choice(row[row != numpy.array(None)])
                for row in allele_encoding.alleles
            ],
            inequalities=inequalities)

        peptide_input = self.peptides_to_network_input(encodable_peptides)

        # Optional optimization
        (allele_encoding_input, allele_representations) = (
            self.allele_encoding_to_network_input(allele_encoding))

        x_dict_without_random_negatives = {
            'peptide': peptide_input,
            'allele': allele_encoding_input,
        }
        if self.hyperparameters['auxiliary_input_features']:
            auxiliary_encoder = AuxiliaryInputEncoder(
                alleles=allele_encoding.alleles,
                peptides=peptides)
            x_dict_without_random_negatives[
                'auxiliary'
            ] = auxiliary_encoder.get_array(
                features=self.hyperparameters['auxiliary_input_features'],
                feature_parameters=self.hyperparameters[
                    'auxiliary_input_feature_parameters'])

        # y1: affinity regression target in [0, 1]; zero where the label is
        # a mass spec hit rather than an affinity.
        y1 = numpy.zeros(shape=len(labels))
        y1[affinities_mask] = from_ic50(labels[affinities_mask])

        random_negative_alleles = random_negatives_planner.get_alleles()
        random_negatives_allele_encoding = MultipleAlleleEncoding(
            experiment_names=random_negative_alleles,
            experiment_to_allele_list=dict(
                (a, [a]) for a in random_negative_alleles),
            max_alleles_per_experiment=(
                allele_encoding.max_alleles_per_experiment),
            borrow_from=allele_encoding.allele_encoding)
        num_random_negatives = random_negatives_planner.get_total_count()

        # Reverse inequalities because from_ic50() flips the direction
        # (i.e. lower affinity results in higher y values).
        adjusted_inequalities = pandas.Series(inequalities).map({
            "=": "=",
            ">": "<",
            "<": ">",
        }).values
        adjusted_inequalities[~affinities_mask] = ">"

        # Note: we are using "<" here not ">" because the inequalities are
        # now in target-space (0-1) not affinity-space.
        adjusted_inequalities_with_random_negative = numpy.concatenate([
            numpy.tile("<", num_random_negatives),
            adjusted_inequalities
        ])
        random_negative_ic50 = self.hyperparameters[
            'random_negative_affinity_min'
        ]
        y1_with_random_negatives = numpy.concatenate([
            numpy.tile(
                from_ic50(random_negative_ic50), num_random_negatives),
            y1,
        ])

        affinities_loss = MSEWithInequalities()
        encoded_y1 = affinities_loss.encode_y(
            y1_with_random_negatives,
            inequalities=adjusted_inequalities_with_random_negative)

        mms_loss = MultiallelicMassSpecLoss(
            delta=self.hyperparameters['loss_multiallelic_mass_spec_delta'],
            multiplier=self.hyperparameters[
                'loss_multiallelic_mass_spec_multiplier'])
        # y2: mass spec target; -1 marks affinity rows so the mass spec loss
        # ignores them.
        y2 = labels.copy()
        y2[affinities_mask] = -1
        y2_with_random_negatives = numpy.concatenate([
            numpy.tile(0.0, num_random_negatives),
            y2,
        ])
        encoded_y2 = mms_loss.encode_y(y2_with_random_negatives)
        fit_info = collections.defaultdict(list)

        allele_representations_hash = self.set_allele_representations(
            allele_representations)
        self.network.compile(
            loss=[affinities_loss.loss, mms_loss.loss, ZeroLoss.loss],
            optimizer=self.hyperparameters['optimizer'])
        if self.hyperparameters['learning_rate'] is not None:
            K.set_value(
                self.network.optimizer.lr,
                self.hyperparameters['learning_rate'])
        fit_info["learning_rate"] = float(
            K.get_value(self.network.optimizer.lr))

        if verbose:
            self.network.summary()

        batch_generator = MultiallelicMassSpecBatchGenerator(
            MultiallelicMassSpecBatchGenerator.hyperparameter_defaults.subselect(
                self.hyperparameters))
        start = time.time()
        batch_generator.plan(
            affinities_mask=numpy.concatenate([
                numpy.tile(True, num_random_negatives),
                affinities_mask
            ]),
            experiment_names=numpy.concatenate([
                numpy.tile(None, num_random_negatives),
                allele_encoding.experiment_names
            ]),
            alleles_matrix=numpy.concatenate([
                random_negatives_allele_encoding.alleles,
                allele_encoding.alleles,
            ]),
            is_binder=numpy.concatenate([
                numpy.tile(False, num_random_negatives),
                numpy.where(affinities_mask, labels, to_ic50(labels)) < 1000.0
            ]),
            potential_validation_mask=numpy.concatenate([
                numpy.tile(False, num_random_negatives),
                numpy.tile(True, len(labels))
            ]),
        )
        if verbose:
            print("Generated batch generation plan in %0.2f sec." % (
                time.time() - start))
            print(batch_generator.summary())

        min_val_loss_iteration = None
        min_val_loss = None
        last_progress_print = 0
        start = time.time()
        x_dict_with_random_negatives = {}
        for i in range(self.hyperparameters['max_epochs']):
            epoch_start = time.time()

            # Fresh random negative peptides each epoch.
            random_negative_peptides = EncodableSequences.create(
                random_negatives_planner.get_peptides())
            random_negative_peptides_encoding = (
                self.peptides_to_network_input(random_negative_peptides))

            if not x_dict_with_random_negatives:
                if len(random_negative_peptides) > 0:
                    x_dict_with_random_negatives[
                        "peptide"
                    ] = numpy.concatenate([
                        random_negative_peptides_encoding,
                        x_dict_without_random_negatives['peptide'],
                    ])
                    x_dict_with_random_negatives[
                        'allele'
                    ] = numpy.concatenate([
                        self.allele_encoding_to_network_input(
                            random_negatives_allele_encoding)[0],
                        x_dict_without_random_negatives['allele']
                    ])
                    if 'auxiliary' in x_dict_without_random_negatives:
                        random_negative_auxiliary_encoder = AuxiliaryInputEncoder(
                            alleles=random_negatives_allele_encoding.alleles,
                            #peptides=random_negative_peptides.sequences
                        )
                        x_dict_with_random_negatives['auxiliary'] = (
                            numpy.concatenate([
                                random_negative_auxiliary_encoder.get_array(
                                    features=self.hyperparameters[
                                        'auxiliary_input_features'],
                                    feature_parameters=self.hyperparameters[
                                        'auxiliary_input_feature_parameters']),
                                x_dict_without_random_negatives['auxiliary']
                            ]))
                else:
                    x_dict_with_random_negatives = (
                        x_dict_without_random_negatives)
            else:
                # Update x_dict_with_random_negatives in place.
                # This is more memory efficient than recreating it as above.
                if len(random_negative_peptides) > 0:
                    x_dict_with_random_negatives[
                        "peptide"
                    ][:num_random_negatives] = random_negative_peptides_encoding

            (train_generator, test_generator) = (
                batch_generator.get_train_and_test_generators(
                    x_dict=x_dict_with_random_negatives,
                    y_list=[encoded_y1, encoded_y2, encoded_y2],
                    epochs=1))
            self.assert_allele_representations_hash(allele_representations_hash)
            fit_history = self.network.fit_generator(
                train_generator,
                steps_per_epoch=batch_generator.num_train_batches,
                epochs=i + 1,
                initial_epoch=i,
                verbose=verbose,
                use_multiprocessing=False,
                workers=0,
                validation_data=test_generator,
                validation_steps=batch_generator.num_test_batches)

            """
            fit_history = self.network.fit(
                x_dict_with_random_negatives,
                [encoded_y1, encoded_y2, encoded_y2],
                shuffle=True,
                batch_size=self.hyperparameters['minibatch_size'],
                verbose=verbose,
                epochs=i + 1,
                initial_epoch=i,
                validation_split=self.hyperparameters['validation_split'],
            )
            """
            epoch_time = time.time() - epoch_start

            for (key, value) in fit_history.history.items():
                fit_info[key].extend(value)

            # Print progress no more often than once every few seconds.
            if progress_print_interval is not None and (
                    not last_progress_print or (
                        time.time() - last_progress_print
                        > progress_print_interval)):
                print((progress_preamble + " " +
                       "Epoch %3d / %3d [%0.2f sec]: loss=%g. "
                       "Min val loss (%s) at epoch %s" % (
                           i,
                           self.hyperparameters['max_epochs'],
                           epoch_time,
                           fit_info['loss'][-1],
                           str(min_val_loss),
                           min_val_loss_iteration)).strip())
                last_progress_print = time.time()

            if batch_generator.num_test_batches:
                #import ipdb ; ipdb.set_trace()
                val_loss = fit_info['val_loss'][-1]
                if min_val_loss is None or (
                        val_loss < min_val_loss -
                        self.hyperparameters['min_delta']):
                    min_val_loss = val_loss
                    min_val_loss_iteration = i

                if self.hyperparameters['early_stopping']:
                    threshold = (
                        min_val_loss_iteration +
                        self.hyperparameters['patience'])
                    if i > threshold:
                        if progress_print_interval is not None:
                            print((progress_preamble + " " +
                                "Stopping at epoch %3d / %3d: loss=%g. "
                                "Min val loss (%g) at epoch %s" % (
                                    i,
                                    self.hyperparameters['max_epochs'],
                                    fit_info['loss'][-1],
                                    (
                                        min_val_loss if min_val_loss is not None
                                        else numpy.nan),
                                    min_val_loss_iteration)).strip())
                        break

            if progress_callback:
                progress_callback()

        fit_info["time"] = time.time() - start
        fit_info["num_points"] = len(labels)
        self.fit_info.append(dict(fit_info))

        return {
            'batch_generator': batch_generator,
            'last_x': x_dict_with_random_negatives,
            'last_y': [encoded_y1, encoded_y2, encoded_y2],
            'fit_info': fit_info,
        }

    def predict(
            self,
            peptides,
            allele=None,
            alleles=None,
            output="affinities",
            batch_size=DEFAULT_PREDICT_BATCH_SIZE):
        """
        Run the network on the given peptides and allele(s).

        Parameters
        ----------
        peptides : list of string or EncodableSequences
            A single string is rejected with TypeError.
        allele : string, optional
            A single allele, broadcast across all peptides. Mutually
            exclusive with `alleles`.
        alleles : list of string, AlleleEncoding, or MultipleAlleleEncoding, optional
        output : string
            One of "affinities", "ligandome", "affinities_matrix", "both",
            or "all"; selects which network output(s) are returned.
        batch_size : int
            Batch size passed to the underlying keras predict call.

        Returns
        -------
        numpy.ndarray, or list of numpy.ndarray when `output` selects more
        than one network output.
        """
        if isinstance(peptides, string_types):
            raise TypeError("peptides must be a list or array, not a string")
        if isinstance(alleles, string_types):
            raise TypeError(
                "alleles must be an iterable, AlleleEncoding, or "
                "MultipleAlleleEncoding")
        if allele is None and alleles is None:
            raise ValueError("Must specify 'allele' or 'alleles'.")

        if allele is not None:
            if alleles is not None:
                raise ValueError("Specify exactly one of allele or alleles")
            # Broadcast the single normalized allele across every peptide.
            alleles = [mhcnames.normalize_allele_name(allele)] * len(peptides)

        if not isinstance(alleles, MultipleAlleleEncoding):
            encoding = MultipleAlleleEncoding(
                allele_to_sequence=self.allele_to_sequence,
                max_alleles_per_experiment=self.hyperparameters['max_alleles'])
            encoding.append_alleles(alleles)
            alleles = encoding

        peptides = EncodableSequences.create(peptides)

        (allele_input, allele_representations) = (
            self.allele_encoding_to_network_input(alleles.compact()))
        self.set_allele_representations(allele_representations)

        network_inputs = {
            'peptide': self.peptides_to_network_input(peptides),
            'allele': allele_input,
        }
        if self.hyperparameters['auxiliary_input_features']:
            # Optional extra per-(allele, peptide) input features.
            aux_encoder = AuxiliaryInputEncoder(
                alleles=alleles.alleles,
                peptides=peptides.sequences)
            network_inputs['auxiliary'] = aux_encoder.get_array(
                features=self.hyperparameters['auxiliary_input_features'],
                feature_parameters=self.hyperparameters[
                    'auxiliary_input_feature_parameters'])

        raw_outputs = self.network.predict(
            network_inputs, batch_size=batch_size)
        predictions = [numpy.squeeze(array) for array in raw_outputs]
        # Outputs 0 and 2 are affinity regression outputs; convert them from
        # the network's 0-1 regression scale back to nM IC50.
        predictions[0] = to_ic50(predictions[0])
        predictions[2] = to_ic50(predictions[2])

        selectors = {
            "affinities": lambda values: values[0],
            "ligandome": lambda values: values[1],
            "affinities_matrix": lambda values: values[2],
            "both": lambda values: values[:2],
            "all": lambda values: values,
        }
        if output not in selectors:
            raise NotImplementedError("Unknown output", output)
        return selectors[output](predictions)
Tim O'Donnell's avatar
Tim O'Donnell committed

    def set_allele_representations(self, allele_representations):
        """
        Write the given allele representations into the model's
        "allele_representation" embedding layer, resizing the layer (via
        model cloning) if the number of alleles has changed.

        Parameters
        ----------
        allele_representations : numpy.ndarray
            One row per allele; each row is flattened before being stored
            as an embedding weight. Rows beyond those provided are padded
            with NaN when the existing layer is larger.

        Returns
        -------
        string : sha1 hex digest of the raw representations, also stored on
        self.allele_representation_hash for later verification.
        """
        from keras.models import clone_model

        reshaped = allele_representations.reshape(
            (allele_representations.shape[0], -1))
        original_model = self.network

        layer = original_model.get_layer("allele_representation")
        # NOTE(review): assumes an Embedding-style layer exposing
        # input_dim/output_dim — confirm against network construction.
        existing_weights_shape = (layer.input_dim, layer.output_dim)

        # Only changes to the number of supported alleles (not the length of
        # the allele sequences) are allowed.
        assert existing_weights_shape[1:] == reshaped.shape[1:]

        if existing_weights_shape[0] > reshaped.shape[0]:
            # Extend with NaNs so we can avoid having to reshape the weights
            # matrix, which is expensive.
            reshaped = numpy.append(
                reshaped,
                numpy.ones([
                    existing_weights_shape[0] - reshaped.shape[0],
                    reshaped.shape[1]
                ]) * numpy.nan,
                axis=0)

        if existing_weights_shape != reshaped.shape:
            print(
                "Performing network surgery", existing_weights_shape, reshaped.shape)
            # Network surgery required. Make a new network with this layer's
            # dimensions changed. Kind of a hack.
            layer.input_dim = reshaped.shape[0]
            new_model = clone_model(original_model)

            # copy weights for other layers over
            for layer in new_model.layers:
                if layer.name != "allele_representation":
                    layer.set_weights(
                        original_model.get_layer(name=layer.name).get_weights())

            self.network = new_model

            layer = new_model.get_layer("allele_representation")

            # Disable the old model to catch bugs.
            def throw(*args, **kwargs):
                raise RuntimeError("Using a disabled model!")
            original_model.predict = \
                original_model.fit = \
                original_model.fit_generator = throw

        layer.set_weights([reshaped])
        # Hash the *original* (unpadded) representations so callers can later
        # check, via assert_allele_representations_hash, that the layer still
        # holds the representations they expect.
        self.allele_representation_hash = hashlib.sha1(
            allele_representations.tobytes()).hexdigest()
        return self.allele_representation_hash

    def assert_allele_representations_hash(self, value):
        numpy.testing.assert_equal(self.allele_representation_hash, value)

    def __getstate__(self):
        """
        serialize to a dict. Model weights are included. For pickle support.

        Returns
        -------
        dict

        """
        result = dict(self.__dict__)
        result['network'] = None
        result['network_json'] = None
        result['network_weights'] = None

        if self.network is not None:
            result['network_json'] = self.network.to_json()
            result['network_weights'] = self.network.get_weights()
        return result

    def __setstate__(self, state):
        """
        Deserialize. For pickle support.
        """
        network_json = state.pop("network_json")
        network_weights = state.pop("network_weights")
        self.__dict__.update(state)
        if network_json is not None:
            import keras.models
            self.network = keras.models.model_from_json(network_json)
            if network_weights is not None:
                self.network.set_weights(network_weights)