Commit 694e6634 authored by Tim O'Donnell

fixes

parent 7205db5c
@@ -150,8 +150,8 @@ class AlleleEncoding(object):
class MultipleAlleleEncoding(object):
def __init__(
self,
experiment_names,
experiment_to_allele_list,
experiment_names=[],
experiment_to_allele_list={},
max_alleles_per_experiment=6,
allele_to_sequence=None,
borrow_from=None):
@@ -194,6 +194,12 @@ class MultipleAlleleEncoding(object):
return self.allele_encoding.indices.values.reshape(
(-1, self.max_alleles_per_experiment))
@property
def alleles(self):
return numpy.reshape(
self.allele_encoding.alleles.values,
(-1, self.max_alleles_per_experiment))
def compact(self):
result = copy(self)
result.allele_encoding = self.allele_encoding.compact()
......
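For reference, a minimal sketch of what the new `alleles` property returns, using hypothetical allele names and a padding width of 3 (the real code pads to max_alleles_per_experiment, default 6):

import numpy

flat_alleles = numpy.array([
    "HLA-A*02:01", "HLA-B*07:02", None,   # experiment 1, padded with None
    "HLA-A*03:01", None, None,            # experiment 2, padded with None
], dtype=object)

# Mirrors the property above: one row per experiment, one column per allele slot.
per_experiment = numpy.reshape(flat_alleles, (-1, 3))
print(per_experiment.shape)  # (2, 3)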
@@ -2,20 +2,24 @@ from __future__ import print_function
import time
import collections
from functools import partial
from six import string_types
import numpy
import pandas
import mhcnames
import hashlib
from .hyperparameters import HyperparameterDefaults
from .class1_neural_network import Class1NeuralNetwork, DEFAULT_PREDICT_BATCH_SIZE
from .encodable_sequences import EncodableSequences
from .regression_target import from_ic50, to_ic50
from .random_negative_peptides import RandomNegativePeptides
from .allele_encoding import MultipleAlleleEncoding
from .allele_encoding import MultipleAlleleEncoding, AlleleEncoding
from .auxiliary_input import AuxiliaryInputEncoder
from .custom_loss import (
MSEWithInequalities,
MultiallelicMassSpecLoss)
MultiallelicMassSpecLoss,
ZeroLoss)
class Class1LigandomePredictor(object):
@@ -26,8 +30,7 @@ class Class1LigandomePredictor(object):
'alignment_method': 'left_pad_centered_right_pad',
'max_length': 15,
},
additional_dense_layers=[],
additional_dense_activation="sigmoid",
max_alleles=6,
)
"""
Hyperparameters (and their default values) that affect the neural network
@@ -55,7 +58,8 @@ class Class1LigandomePredictor(object):
"""
compile_hyperparameter_defaults = HyperparameterDefaults(
loss_delta=0.2,
loss_multiallelic_mass_spec_delta=0.2,
loss_multiallelic_mass_spec_multiplier=1.0,
optimizer="rmsprop",
learning_rate=None,
)
@@ -64,8 +68,9 @@ class Class1LigandomePredictor(object):
used.
"""
allele_features_hyperparameter_defaults = HyperparameterDefaults(
allele_features_include_gene=True,
auxiliary_input_hyperparameter_defaults = HyperparameterDefaults(
auxiliary_input_features=["gene"],
auxiliary_input_feature_parameters={},
)
"""
Allele feature hyperparameters.
@@ -75,7 +80,7 @@ class Class1LigandomePredictor(object):
fit_hyperparameter_defaults).extend(
early_stopping_hyperparameter_defaults).extend(
compile_hyperparameter_defaults).extend(
allele_features_hyperparameter_defaults)
auxiliary_input_hyperparameter_defaults)
def __init__(
self,
@@ -99,6 +104,8 @@ class Class1LigandomePredictor(object):
self.hyperparameters)
self.fit_info = []
self.allele_to_sequence = class1_affinity_predictor.allele_to_sequence
self.allele_representation_hash = None
@staticmethod
def make_network(pan_allele_class1_neural_networks, hyperparameters):
@@ -110,12 +117,16 @@ class Class1LigandomePredictor(object):
Flatten,
RepeatVector,
concatenate,
Reshape,
Activation,
Lambda,
Add,
Embedding)
from keras.models import Model
import keras.initializers
networks = [model.network() for model in pan_allele_class1_neural_networks]
networks = [
model.network() for model in pan_allele_class1_neural_networks
]
merged_ensemble = Class1NeuralNetwork.merge(
networks,
merge_method="average")
@@ -123,20 +134,22 @@ class Class1LigandomePredictor(object):
peptide_shape = tuple(
int(x) for x in K.int_shape(merged_ensemble.inputs[0])[1:])
input_alleles = Input(shape=(6,), name="allele") # up to 6 alleles
input_alleles = Input(
shape=(hyperparameters['max_alleles'],), name="allele")
input_peptides = Input(
shape=peptide_shape,
dtype='float32',
name='peptide')
peptides_flattened = Flatten()(input_peptides)
peptides_repeated = RepeatVector(6)(peptides_flattened)
peptides_repeated = RepeatVector(hyperparameters['max_alleles'])(
peptides_flattened)
allele_representation = Embedding(
name="allele_representation",
input_dim=64, # arbitrary, how many alleles to have room for
output_dim=1029,
input_length=6,
input_length=hyperparameters['max_alleles'],
trainable=False,
mask_zero=True)(input_alleles)
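A standalone sketch of the frozen allele-representation lookup defined above: each integer allele index selects a fixed row of the representation matrix (populated later by set_allele_representations), with index 0 reserved for masking. Sizes follow the code above; the index values are illustrative.

import numpy
from keras.layers import Input, Embedding
from keras.models import Model

max_alleles = 6
allele_input = Input(shape=(max_alleles,), name="allele")
representations = Embedding(
    input_dim=64,       # room for up to 64 allele representations
    output_dim=1029,    # length of one flattened allele encoding
    input_length=max_alleles,
    trainable=False,
    mask_zero=True)(allele_input)
lookup = Model(inputs=allele_input, outputs=representations)

# Two alleles in this experiment; the remaining slots are masked with index 0.
indices = numpy.array([[1, 2, 0, 0, 0, 0]])
print(lookup.predict(indices).shape)  # (1, 6, 1029)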
@@ -189,27 +202,69 @@ class Class1LigandomePredictor(object):
layer_name_to_new_node[layer.name] = node
affinity_predictor_output = Lambda(lambda x: x[:, 0])(node)
affinity_predictor_matrix_output = node
affinity_predictor_output = Lambda(
lambda x: x[:, 0], name="affinity_output")(
affinity_predictor_matrix_output)
"""
layer = Dense(8, activation="sigmoid", kernel_initializer=keras.initializers.RandomNormal(mean=1.0/8.0, stddev=1e-5), use_bias=False)
lifted = TimeDistributed(layer, name="ligandome_hidden1")
node = lifted(affinity_predictor_matrix_output)
"""
auxiliary_input = None
if hyperparameters['auxiliary_input_features']:
auxiliary_input = Input(
shape=(
hyperparameters['max_alleles'],
len(
AuxiliaryInputEncoder.get_columns(
hyperparameters['auxiliary_input_features'],
feature_parameters=hyperparameters[
'auxiliary_input_feature_parameters']))),
dtype="float32",
name="auxiliary")
node = concatenate(
[node, auxiliary_input], name="affinities_with_auxiliary")
#layer = Dense(1, activation="linear", kernel_initializer=keras.initializers.RandomNormal(mean=0.0, stddev=1e-5), use_bias=False)
layer = Dense(1, activation="tanh")
lifted = TimeDistributed(layer, name="ligandome_output")
ligandome_adjustment = lifted(node)
for (i, layer_size) in enumerate(
hyperparameters['additional_dense_layers']):
layer = Dense(
layer_size,
activation=hyperparameters['additional_dense_activation'])
lifted = TimeDistributed(layer)
node = lifted(node)
"""
weights = layers[-1].get_weights()
layer = Dense(1, activation="sigmoid", kernel_initializer=keras.initializers.Constant(weights[0]), bias_initializer=keras.initializers.Constant(weights[1]))
lifted = TimeDistributed(layer, name="ligandome_output")
ligandome_output = lifted(prev_node)
"""
def logit(x):
import tensorflow as tf
return - tf.log(1. / x - 1.)
ligandome_output_pre_sigmoid = Add()([Lambda(logit)(affinity_predictor_matrix_output), ligandome_adjustment])
ligandome_output = Activation("sigmoid")(ligandome_output_pre_sigmoid)
#ligandome_output = affinity_predictor_matrix_output
layer = Dense(1, activation="sigmoid")
lifted = TimeDistributed(layer)
ligandome_output = lifted(node)
#output_node = concatenate([
# affinity_predictor_output, ligandome_output
#], name="combined_output")
network = Model(
inputs=[input_peptides, input_alleles],
outputs=[affinity_predictor_output, ligandome_output],
inputs=[
input_peptides,
input_alleles,
] + ([] if auxiliary_input is None else [auxiliary_input]),
outputs=[
affinity_predictor_output,
ligandome_output,
affinity_predictor_matrix_output
],
name="ligandome",
)
network.summary()
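The ligandome head above composes its output by adding a learned per-allele adjustment to the affinity prediction in logit space, then squashing the sum back through a sigmoid. A scalar numpy illustration of that composition (the values are made up):

import numpy

def logit(p):
    # Same transform as the Lambda above: -log(1/p - 1).
    return -numpy.log(1.0 / p - 1.0)

def sigmoid(x):
    return 1.0 / (1.0 + numpy.exp(-x))

affinity_prob = 0.8   # one entry of affinity_predictor_matrix_output
adjustment = -0.5     # corresponding entry of ligandome_adjustment
presentation = sigmoid(logit(affinity_prob) + adjustment)
print(round(presentation, 3))  # 0.708: the 0.8 affinity score nudged down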
@@ -270,31 +325,42 @@ class Class1LigandomePredictor(object):
import keras.backend as K
assert isinstance(allele_encoding, MultipleAlleleEncoding)
assert (
allele_encoding.max_alleles_per_experiment ==
self.hyperparameters['max_alleles'])
#for layer in self.network._layers[:8]:
# print("Setting non trainable", layer)
# layer.trainable = False
# import ipdb ; ipdb.set_trace()
encodable_peptides = EncodableSequences.create(peptides)
del peptides
if labels is not None:
labels = numpy.array(labels, copy=False)
if inequalities is not None:
inequalities = numpy.array(inequalities, copy=True)
else:
inequalities = numpy.tile("=", len(labels))
if affinities_mask is not None:
affinities_mask = numpy.array(affinities_mask, copy=False)
if inequalities is not None:
inequalities = numpy.array(inequalities, copy=False)
else:
affinities_mask = numpy.tile(False, len(labels))
inequalities[~affinities_mask] = "="
random_negatives_planner = RandomNegativePeptides(
**RandomNegativePeptides.hyperparameter_defaults.subselect(
self.hyperparameters))
random_negatives_planner.plan(
peptides=encodable_peptides.sequences[affinities_mask],
affinities=(labels[affinities_mask]),
alleles=numpy.reshape(
allele_encoding.allele_encoding.alleles.values,
(-1, allele_encoding.max_alleles_per_experiment))[
affinities_mask, 0
].flatten(),
inequalities=inequalities[affinities_mask])
peptides=encodable_peptides.sequences,
affinities=numpy.where(affinities_mask, labels, to_ic50(labels)),
alleles=[
numpy.random.choice(row[row != numpy.array(None)])
for row in allele_encoding.alleles
],
inequalities=inequalities)
peptide_input = self.peptides_to_network_input(encodable_peptides)
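The random-negative planner is now given one randomly drawn (non-None) allele per experiment row, as in the list comprehension above. A small self-contained version of that draw, with made-up alleles and a padding width of 3:

import numpy

alleles = numpy.array([
    ["HLA-A*02:01", "HLA-B*07:02", None],
    ["HLA-A*03:01", None, None],
], dtype=object)

chosen = [
    numpy.random.choice(row[row != numpy.array(None)])
    for row in alleles
]
print(chosen)  # one randomly selected allele per experiment row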
@@ -302,15 +368,14 @@ class Class1LigandomePredictor(object):
if shuffle_permutation is None:
shuffle_permutation = numpy.random.permutation(len(labels))
peptide_input = peptide_input[shuffle_permutation]
peptides = encodable_peptides.sequences[shuffle_permutation]
allele_encoding.shuffle_in_place(shuffle_permutation)
labels = labels[shuffle_permutation]
inequalities = inequalities[shuffle_permutation]
affinities_mask = affinities_mask[shuffle_permutation]
del peptides
del encodable_peptides
# Optional optimization
allele_encoding = allele_encoding.compact()
(allele_encoding_input, allele_representations) = (
self.allele_encoding_to_network_input(allele_encoding))
@@ -318,34 +383,37 @@ class Class1LigandomePredictor(object):
'peptide': peptide_input,
'allele': allele_encoding_input,
}
if self.hyperparameters['auxiliary_input_features']:
auxiliary_encoder = AuxiliaryInputEncoder(
alleles=allele_encoding.alleles,
peptides=peptides)
x_dict_without_random_negatives[
'auxiliary'
] = auxiliary_encoder.get_array(
features=self.hyperparameters['auxiliary_input_features'],
feature_parameters=self.hyperparameters[
'auxiliary_input_feature_parameters'])
y1 = numpy.zeros(shape=len(labels))
if affinities_mask is not None:
y1[affinities_mask] = from_ic50(labels[affinities_mask])
random_negatives_allele_encoding = None
if allele_encoding is not None:
random_negative_alleles = random_negatives_planner.get_alleles()
random_negatives_allele_encoding = MultipleAlleleEncoding(
experiment_names=random_negative_alleles,
experiment_to_allele_list=dict(
(a, [a]) for a in random_negative_alleles),
max_alleles_per_experiment=(
allele_encoding.max_alleles_per_experiment),
borrow_from=allele_encoding.allele_encoding)
y1[affinities_mask] = from_ic50(labels[affinities_mask])
random_negative_alleles = random_negatives_planner.get_alleles()
random_negatives_allele_encoding = MultipleAlleleEncoding(
experiment_names=random_negative_alleles,
experiment_to_allele_list=dict(
(a, [a]) for a in random_negative_alleles),
max_alleles_per_experiment=(
allele_encoding.max_alleles_per_experiment),
borrow_from=allele_encoding.allele_encoding)
num_random_negatives = random_negatives_planner.get_total_count()
if inequalities is not None:
# Reverse inequalities because from_ic50() flips the direction
# (i.e. lower affinity results in higher y values).
adjusted_inequalities = pandas.Series(inequalities).map({
"=": "=",
">": "<",
"<": ">",
}).values
else:
adjusted_inequalities = numpy.tile("=", len(y1))
# Reverse inequalities because from_ic50() flips the direction
# (i.e. lower affinity results in higher y values).
adjusted_inequalities = pandas.Series(inequalities).map({
"=": "=",
">": "<",
"<": ">",
}).values
adjusted_inequalities[~affinities_mask] = ">"
# Note: we are using "<" here not ">" because the inequalities are
@@ -369,7 +437,9 @@ class Class1LigandomePredictor(object):
inequalities=adjusted_inequalities_with_random_negative)
mms_loss = MultiallelicMassSpecLoss(
delta=self.hyperparameters['loss_delta'])
delta=self.hyperparameters['loss_multiallelic_mass_spec_delta'],
multiplier=self.hyperparameters[
'loss_multiallelic_mass_spec_multiplier'])
y2 = labels.copy()
y2[affinities_mask] = -1
y2_with_random_negatives = numpy.concatenate([
@@ -380,9 +450,10 @@ class Class1LigandomePredictor(object):
fit_info = collections.defaultdict(list)
self.set_allele_representations(allele_representations)
allele_representations_hash = self.set_allele_representations(
allele_representations)
self.network.compile(
loss=[affinities_loss.loss, mms_loss.loss],
loss=[affinities_loss.loss, mms_loss.loss, ZeroLoss.loss],
optimizer=self.hyperparameters['optimizer'])
if self.hyperparameters['learning_rate'] is not None:
K.set_value(
@@ -422,6 +493,20 @@ class Class1LigandomePredictor(object):
random_negatives_allele_encoding)[0],
x_dict_without_random_negatives['allele']
])
if 'auxiliary' in x_dict_without_random_negatives:
random_negative_auxiliary_encoder = AuxiliaryInputEncoder(
alleles=random_negatives_allele_encoding.alleles,
#peptides=random_negative_peptides.sequences
)
x_dict_with_random_negatives['auxiliary'] = (
numpy.concatenate([
random_negative_auxiliary_encoder.get_array(
features=self.hyperparameters[
'auxiliary_input_features'],
feature_parameters=self.hyperparameters[
'auxiliary_input_feature_parameters']),
x_dict_without_random_negatives['auxiliary']
]))
else:
x_dict_with_random_negatives = (
x_dict_without_random_negatives)
@@ -433,17 +518,27 @@ class Class1LigandomePredictor(object):
"peptide"
][:num_random_negatives] = random_negative_peptides_encoding
#def generator(x, ys, batch_size):
# # Each batch should have a mix of:
# # - random negative peptides
# # - affinity measurements (binder + non-binder)
# # - multiallelic mass spec
# TODO: need to use fit_generator to keep each minibatch corresponding
# to a single experiment
self.assert_allele_representations_hash(allele_representations_hash)
#import ipdb ; ipdb.set_trace()
fit_history = self.network.fit(
x_dict_with_random_negatives,
[encoded_y1, encoded_y2],
[encoded_y1, encoded_y2, encoded_y2],
shuffle=True,
batch_size=self.hyperparameters['minibatch_size'],
verbose=verbose,
epochs=i + 1,
initial_epoch=i,
validation_split=self.hyperparameters['validation_split'],
)
epoch_time = time.time() - epoch_start
@@ -467,6 +562,7 @@ class Class1LigandomePredictor(object):
last_progress_print = time.time()
if self.hyperparameters['validation_split']:
#import ipdb ; ipdb.set_trace()
val_loss = fit_info['val_loss'][-1]
if min_val_loss is None or (
val_loss < min_val_loss -
@@ -496,32 +592,78 @@ class Class1LigandomePredictor(object):
progress_callback()
fit_info["time"] = time.time() - start
fit_info["num_points"] = len(encodable_peptides)
fit_info["num_points"] = len(labels)
self.fit_info.append(dict(fit_info))
def predict(
self,
peptides,
allele_encoding,
allele=None,
alleles=None,
output="affinities",
batch_size=DEFAULT_PREDICT_BATCH_SIZE):
if isinstance(peptides, string_types):
raise TypeError("peptides must be a list or array, not a string")
if isinstance(alleles, string_types):
raise TypeError(
"alleles must be an iterable, AlleleEncoding, or "
"MultipleAlleleEncoding")
if allele is None and alleles is None:
raise ValueError("Must specify 'allele' or 'alleles'.")
if allele is not None:
if alleles is not None:
raise ValueError("Specify exactly one of allele or alleles")
normalized_allele = mhcnames.normalize_allele_name(allele)
alleles = [normalized_allele] * len(peptides)
if not isinstance(alleles, MultipleAlleleEncoding):
new_alleles = MultipleAlleleEncoding(
allele_to_sequence=self.allele_to_sequence,
max_alleles_per_experiment=self.hyperparameters['max_alleles'])
new_alleles.append_alleles(alleles)
alleles = new_alleles
peptides = EncodableSequences.create(peptides)
(allele_encoding_input, allele_representations) = (
self.allele_encoding_to_network_input(allele_encoding.compact()))
self.allele_encoding_to_network_input(alleles.compact()))
self.set_allele_representations(allele_representations)
x_dict = {
'peptide': self.peptides_to_network_input(peptides),
'allele': allele_encoding_input,
}
predictions = self.network.predict(x_dict, batch_size=batch_size)
if self.hyperparameters['auxiliary_input_features']:
auxiliary_encoder = AuxiliaryInputEncoder(
alleles=alleles.alleles,
peptides=peptides.sequences)
x_dict[
'auxiliary'
] = auxiliary_encoder.get_array(
features=self.hyperparameters['auxiliary_input_features'],
feature_parameters=self.hyperparameters[
'auxiliary_input_feature_parameters'])
predictions = [
numpy.squeeze(output)
for output in self.network.predict(x_dict, batch_size=batch_size)
]
predictions[0] = to_ic50(predictions[0])
predictions[2] = to_ic50(predictions[2])
if output == "affinities":
predictions = to_ic50(predictions[0])
elif output == "ligandome_presentation":
predictions = predictions[0]
elif output == "ligandome":
predictions = predictions[1]
elif output == "affinities_matrix":
predictions = predictions[2]
elif output == "both":
predictions = predictions[:2]
elif output == "all":
pass
else:
raise NotImplementedError("Unknown output", output)
return numpy.squeeze(predictions)
return predictions
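A hypothetical call pattern for the revised predict() signature, assuming `predictor` is an already-constructed Class1LigandomePredictor; the peptides and allele below are illustrative only:

peptides = ["SIINFEKL", "SIINFEKLL"]

# A single allele is normalized and applied to every peptide.
affinities = predictor.predict(peptides=peptides, allele="HLA-A*02:01")

# A list of alleles (or a MultipleAlleleEncoding) goes through `alleles`;
# output="all" returns all three network outputs.
everything = predictor.predict(
    peptides=peptides,
    alleles=["HLA-A*02:01"] * len(peptides),
    output="all")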
def set_allele_representations(self, allele_representations):
"""
@@ -577,13 +719,43 @@ class Class1LigandomePredictor(object):
original_model.fit_generator = throw
layer.set_weights([reshaped])
self.allele_representation_hash = hashlib.sha1(
allele_representations.tobytes()).hexdigest()
return self.allele_representation_hash
def assert_allele_representations_hash(self, value):
numpy.testing.assert_equal(self.allele_representation_hash, value)
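A minimal illustration of what the hash guard protects against: the SHA1 digest of the representation matrix's raw bytes changes whenever the matrix passed to set_allele_representations() differs from the one in effect at fit time.

import hashlib
import numpy

representations = numpy.zeros((64, 1029), dtype="float32")
before = hashlib.sha1(representations.tobytes()).hexdigest()

representations[0, 0] = 1.0  # any change to the matrix changes the digest
after = hashlib.sha1(representations.tobytes()).hexdigest()
assert before != after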
def __getstate__(self):
"""
serialize to a dict. Model weights are included. For pickle support.
Returns
-------
dict
"""
result = dict(self.__dict__)
result['network'] = None
result['network_json'] = None
result['network_weights'] = None
if self.network is not None:
result['network_json'] = self.network.to_json()
result['network_weights'] = self.network.get_weights()
return result
def __setstate__(self, state):
"""
Deserialize. For pickle support.
"""
network_json = state.pop("network_json")
network_weights = state.pop("network_weights")
self.__dict__.update(state)
if network_json is not None:
import keras.models
self.network = keras.models.model_from_json(network_json)
if network_weights is not None:
self.network.set_weights(network_weights)
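The __getstate__/__setstate__ pair exists because a compiled Keras model is not directly picklable, so the network is stored as architecture JSON plus a list of weight arrays. A hypothetical round trip (assumes `predictor` is a Class1LigandomePredictor instance):

import pickle

blob = pickle.dumps(predictor)    # network saved as JSON + weight arrays
restored = pickle.loads(blob)     # rebuilt via model_from_json() + set_weights()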
@staticmethod
def allele_features(allele_names, hyperparameters):
df = pandas.DataFrame({"allele_name": allele_names})
if hyperparameters['allele_features_include_gene']:
# TODO: support other organisms.
for gene in ["A", "B", "C"]:
df[gene] = df.allele_name.str.startswith(
"HLA-%s" % gene).astype(float)
return gene
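The removed allele_features() helper above is superseded by the "gene" auxiliary input feature. As a hedged sketch of what such a feature could look like (the actual AuxiliaryInputEncoder.get_array() column layout is not shown in this diff), here is a one-hot over the HLA gene derived from the allele name:

import pandas

def gene_one_hot(allele_names):
    # One column per HLA gene; 1.0 if the allele name starts with that gene.
    df = pandas.DataFrame({"allele_name": allele_names})
    for gene in ["A", "B", "C"]:  # the original notes a TODO for other organisms
        df[gene] = df.allele_name.str.startswith("HLA-%s" % gene).astype(float)
    return df[["A", "B", "C"]].values

print(gene_one_hot(["HLA-A*02:01", "HLA-B*07:02", "HLA-C*01:02"]))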
@@ -92,12 +92,12 @@ class MSEWithInequalities(Loss):
y_pred is greater or less than y_true.
between 2 - 3:
Treated as a "<" inequality. Penalty (y_pred - (y_true - 2))**2 is
applied only if y_pred is greater than y_true - 2.
Treated as a ">" inequality. Penalty (y_pred - (y_true - 2))**2 is
applied only if y_pred is less than y_true - 2.
between 4 - 5:
Treated as a ">" inequality. Penalty (y_pred - (y_true - 4))**2 is
applied only if y_pred is less than y_true - 4.
Treated as a "<" inequality. Penalty (y_pred - (y_true - 4))**2 is
applied only if y_pred is greater than y_true - 4.
"""
name = "mse_with_inequalities"
supports_inequalities = True
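The corrected docstring describes how a measurement and its inequality are packed into a single y_true value by offsetting it into the 0-1, 2-3, or 4-5 range. An illustrative encoder following that description (a sketch, not the library's encode_y):

import numpy

def encode_targets(y, inequalities):
    # "=" stays in [0, 1]; ">" shifts to [2, 3]; "<" shifts to [4, 5],
    # matching the ranges documented above.
    offsets = {"=": 0.0, ">": 2.0, "<": 4.0}
    y = numpy.asarray(y, dtype="float32")
    return y + numpy.array([offsets[i] for i in inequalities], dtype="float32")

print(encode_targets([0.3, 0.7, 0.5], ["=", ">", "<"]))  # [0.3 2.7 4.5]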
@@ -240,11 +240,12 @@ class MultiallelicMassSpecLoss(Loss):
supports_inequalities = True
supports_multiple_outputs = False
def __init__(self, delta=0.2):
def __init__(self, delta=0.2, multiplier=1.0):
self.delta = delta
self.multiplier = multiplier
@staticmethod
def encode_y(y, affinities_mask=None, inequalities=None):
def encode_y(y):
encoded = pandas.Series(y, dtype="float32", copy=True)
assert all(item in (-1.0, 1.0, 0.0) for item in encoded), set(y)
print(
@@ -262,9 +263,25 @@ class MultiallelicMassSpecLoss(Loss):
pos_max = tf.reduce_max(pos, axis=1)
neg = tf.boolean_mask(y_pred, tf.math.equal(y_true, 0.0))
term = tf.reshape(neg, (-1, 1)) - pos_max + self.delta
result = tf.reduce_sum(tf.maximum(0.0, term) ** 2)
return result
result = tf.reduce_sum(tf.maximum(0.0, term) ** 2) / tf.cast(
tf.shape(term)[0], tf.float32) * self.multiplier
return tf.where(tf.is_nan(result), 0.0, result)
class ZeroLoss(Loss):
"""
"""
name = "zero_loss"
supports_inequalities = False
supports_multiple_outputs = False
@staticmethod
def encode_y(y):
return y
@staticmethod
def loss(y_true, y_pred):
return 0.0
def check_shape(name, arr, expected_shape):
@@ -284,5 +301,7 @@ def check_shape(name, arr, expected_shape):
# Register custom losses.
for cls in [MSEWithInequalities, MSEWithInequalitiesAndMultipleOutputs]:
for cls in [MSEWithInequalities, MSEWithInequalitiesAndMultipleOutputs, MultiallelicMassSpecLoss, ZeroLoss]:
CUSTOM_LOSSES[cls.name] = cls()
@@ -21,6 +21,7 @@ logging.getLogger('matplotlib').disabled = True
import pandas
import argparse
import sys
import copy
from functools import partial
from numpy.testing import assert_, assert_equal, assert_allclose
@@ -153,7 +154,7 @@ def test_loss():
)
contributions.append(contribution)
contributions = numpy.array(contributions)
expected1 = contributions.sum()
expected1 = contributions.sum() / len(contributions)
# reference implementation 2: numpy
pos = numpy.array([
@@ -164,7 +165,8 @@ def test_loss():
neg = y_pred[(y_true == 0.0).astype(bool)]
expected2 = (
numpy.maximum(0, neg.reshape((-1, 1)) - pos + delta)**2).sum()
numpy.maximum(0, neg.reshape((-1, 1)) - pos + delta)**2).sum() / (
len(pos) * len(neg))
yield numpy.testing.assert_almost_equal, expected1, expected2, 4
@@ -226,7 +228,7 @@ def make_motif(allele, peptides, frac=0.01):
return matrix
def test_real_data_multiallelic_refinement(max_epochs=10):
def Xtest_real_data_multiallelic_refinement(max_epochs=10):
ms_df = pandas.read_csv(
get_path("data_mass_spec_annotated", "annotated_ms.csv.bz2"))
ms_df = ms_df.loc[
@@ -240,10 +242,20 @@ def test_real_data_multiallelic_refinement(max_epochs=10):
del sample_table[col]
sample_table["alleles"] = sample_table.hla.str.split()
multi_train_df = ms_df.loc[
multi_train_hit_df = ms_df.loc[
ms_df.sample_id == "RA957"
].drop_duplicates("peptide")[["peptide", "sample_id"]].reset_index(drop=True)
multi_train_df["label"] = 1.0
multi_train_hit_df["label"] = 1.0
multi_train_decoy_df = ms_df.loc[
(ms_df.sample_id == "CD165") &
(~ms_df.peptide.isin(multi_train_hit_df.peptide.unique()))
].drop_duplicates("peptide")[["peptide"]]
(multi_train_decoy_df["sample_id"],) = multi_train_hit_df.sample_id.unique()
multi_train_decoy_df["label"] = 0.0
multi_train_df = pandas.concat(
[multi_train_hit_df, multi_train_decoy_df], ignore_index=True)
multi_train_df["is_affinity"] = False
multi_train_alleles = set()
@@ -281,6 +293,7 @@ def test_real_data_multiallelic_refinement(max_epochs=10):
ligandome_predictor = Class1LigandomePredictor(
pan_predictor,
auxiliary_input_features=[],
max_ensemble_size=1,
max_epochs=50,
learning_rate=0.0001,
@@ -292,7 +305,7 @@ def test_real_data_multiallelic_refinement(max_epochs=10):
ligandome_predictor.predict(
output="affinities",
peptides=combined_train_df.peptide.values,
allele_encoding=allele_encoding))
alleles=allele_encoding))
(model,) = pan_predictor.class1_pan_allele_models
expected_pre_predictions = from_ic50(
@@ -325,11 +338,220 @@ def test_real_data_multiallelic_refinement(max_epochs=10):
progress_callback=update_motifs,
)
import ipdb ; ipdb.set_trace()
def test_synthetic_allele_refinement_with_affinity_data(max_epochs=10):
refine_allele = "HLA-C*01:02"
alleles = [
"HLA-A*02:01", "HLA-B*27:01", "HLA-C*07:01",
"HLA-A*03:01", "HLA-B*15:01", refine_allele
]
peptides_per_allele = [
2000, 1000, 500,
1500, 1200, 800,
]
allele_to_peptides = dict(zip(alleles, peptides_per_allele))
length = 9
train_with_ms = pandas.read_csv(
get_path("data_curated", "curated_training_data.with_mass_spec.csv.bz2"))
train_no_ms = pandas.read_csv(get_path("data_curated",
"curated_training_data.no_mass_spec.csv.bz2"))
def filter_df(df):
df = df.loc[
(df.allele.isin(alleles)) &
(df.peptide.str.len() == length)
]
return df
train_with_ms = filter_df(train_with_ms)
train_no_ms = filter_df(train_no_ms)
ms_specific = train_with_ms.loc[
~train_with_ms.peptide.isin(train_no_ms.peptide)
]
train_peptides = []
train_true_alleles = []
for allele in alleles:
peptides = ms_specific.loc[ms_specific.allele == allele].peptide.sample(
n=allele_to_peptides[allele])
train_peptides.extend(peptides)
train_true_alleles.extend([allele] * len(peptides))
hits_df = pandas.DataFrame({"peptide": train_peptides})
hits_df["true_allele"] = train_true_alleles
hits_df["hit"] = 1.0
decoys_df = hits_df.copy()
decoys_df["peptide"] = decoys_df.peptide.map(scramble_peptide)
decoys_df["true_allele"] = ""
decoys_df["hit"] = 0.0
mms_train_df = pandas.concat([hits_df, decoys_df], ignore_index=True)
mms_train_df["label"] = mms_train_df.hit
mms_train_df["is_affinity"] = False
affinity_train_df = pandas.read_csv(
get_path(
"models_class1_pan", "models.with_mass_spec/train_data.csv.bz2"))
affinity_train_df = affinity_train_df.loc[
affinity_train_df.allele.isin(alleles),
["peptide", "allele", "measurement_inequality", "measurement_value"]]
affinity_train_df["label"] = affinity_train_df["measurement_value"]
del affinity_train_df["measurement_value"]
affinity_train_df["is_affinity"] = True
predictor = Class1LigandomePredictor(
PAN_ALLELE_PREDICTOR_NO_MASS_SPEC,
auxiliary_input_features=["gene"],
max_ensemble_size=1,
max_epochs=max_epochs,
learning_rate=0.0001,
patience=5,
min_delta=0.0,
random_negative_rate=1.0,
random_negative_constant=25)
mms_allele_encoding = MultipleAlleleEncoding(
experiment_names=["experiment1"] * len(mms_train_df),
experiment_to_allele_list={
"experiment1": alleles,
},
max_alleles_per_experiment=6,
allele_to_sequence=PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.allele_to_sequence,
)
allele_encoding = copy.deepcopy(mms_allele_encoding)
allele_encoding.append_alleles(affinity_train_df.allele.values)
allele_encoding = allele_encoding.compact()
train_df = pandas.concat(
[mms_train_df, affinity_train_df], ignore_index=True, sort=False)
pre_predictions = from_ic50(
predictor.predict(
output="affinities_matrix",
peptides=mms_train_df.peptide.values,
alleles=mms_allele_encoding))
(model,) = PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.class1_pan_allele_models
expected_pre_predictions = from_ic50(
model.predict(
peptides=numpy.repeat(mms_train_df.peptide.values, len(alleles)),
allele_encoding=mms_allele_encoding.allele_encoding,
)).reshape((-1, len(alleles)))
#import ipdb ; ipdb.set_trace()
mms_train_df["pre_max_prediction"] = pre_predictions.max(1)
pre_auc = roc_auc_score(mms_train_df.hit.values, mms_train_df.pre_max_prediction.values)
print("PRE_AUC", pre_auc)
assert_allclose(pre_predictions, expected_pre_predictions, rtol=1e-4)
motifs_history = []
random_peptides_encodable = make_random_peptides(10000, [9])
def update_motifs():
for allele in alleles:
motif = make_motif(allele, random_peptides_encodable)
motifs_history.append((allele, motif))
metric_rows = []
def progress():
(_, ligandome_prediction, affinities_predictions) = (
predictor.predict(
output="all",
peptides=mms_train_df.peptide.values,
alleles=mms_allele_encoding))
affinities_predictions = from_ic50(affinities_predictions)
for (kind, predictions) in [
("affinities", affinities_predictions),
("ligandome", ligandome_prediction)]:
mms_train_df["max_prediction"] = predictions.max(1)
mms_train_df["predicted_allele"] = pandas.Series(alleles).loc[
predictions.argmax(1).flatten()
].values
print(kind)
print(predictions)
mean_predictions_for_hit = mms_train_df.loc[
mms_train_df.hit == 1.0
].max_prediction.mean()
mean_predictions_for_decoy = mms_train_df.loc[
mms_train_df.hit == 0.0
].max_prediction.mean()
correct_allele_fraction = (
mms_train_df.loc[mms_train_df.hit == 1.0].predicted_allele ==
mms_train_df.loc[mms_train_df.hit == 1.0].true_allele
).mean()
auc = roc_auc_score(mms_train_df.hit.values, mms_train_df.max_prediction.values)
print(kind, "Mean prediction for hit", mean_predictions_for_hit)
print(kind, "Mean prediction for decoy", mean_predictions_for_decoy)
print(kind, "Correct predicted allele fraction", correct_allele_fraction)
print(kind, "AUC", auc)
metric_rows.append((
kind,
mean_predictions_for_hit,
mean_predictions_for_decoy,
correct_allele_fraction,
auc,
))
update_motifs()
return (ligandome_prediction, auc)
print("Pre fitting:")
progress()
update_motifs()
print("Fitting...")
predictor.fit(
peptides=train_df.peptide.values,
labels=train_df.label.values,
inequalities=train_df.measurement_inequality.values,
affinities_mask=train_df.is_affinity.values,
allele_encoding=allele_encoding,
progress_callback=progress,
)
(predictions, final_auc) = progress()
print("Final AUC", final_auc)
update_motifs()
motifs = pandas.DataFrame(
motifs_history,
columns=[
"allele",
"motif",
]
)
metrics = pandas.DataFrame(
metric_rows,
columns=[
"output",
"mean_predictions_for_hit",
"mean_predictions_for_decoy",
"correct_allele_fraction",
"auc"
])
return (predictor, predictions, metrics, motifs)
def Xtest_synthetic_allele_refinement(max_epochs=10):
@@ -387,12 +609,13 @@ def Xtest_synthetic_allele_refinement(max_epochs=10):
predictor = Class1LigandomePredictor(
PAN_ALLELE_PREDICTOR_NO_MASS_SPEC,
additional_dense_layers=[8, 1],
max_ensemble_size=1,
max_epochs=max_epochs,
learning_rate=0.0001,
patience=5,
min_delta=0.0)
min_delta=0.0,
random_negative_rate=0.0,
random_negative_constant=0)
allele_encoding = MultipleAlleleEncoding(
experiment_names=["experiment1"] * len(train_df),
@@ -405,9 +628,9 @@ def Xtest_synthetic_allele_refinement(max_epochs=10):
pre_predictions = from_ic50(
predictor.predict(
output="affinities",
output="affinities_matrix",
peptides=train_df.peptide.values,
allele_encoding=allele_encoding))
alleles=allele_encoding))
(model,) = PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.class1_pan_allele_models
expected_pre_predictions = from_ic50(
@@ -436,45 +659,52 @@ def Xtest_synthetic_allele_refinement(max_epochs=10):
metric_rows = []
def progress():
predictions = from_ic50(
(_, ligandome_prediction, affinities_predictions) = (
predictor.predict(
output="affinities",
output="all",
peptides=train_df.peptide.values,
allele_encoding=allele_encoding))
train_df["max_prediction"] = predictions.max(1)
train_df["predicted_allele"] = pandas.Series(alleles).loc[
predictions.argmax(1).flatten()].values
print(predictions)
mean_predictions_for_hit = train_df.loc[
train_df.hit == 1.0
].max_prediction.mean()
mean_predictions_for_decoy = train_df.loc[
train_df.hit == 0.0
].max_prediction.mean()
correct_allele_fraction = (
train_df.loc[train_df.hit == 1.0].predicted_allele ==
train_df.loc[train_df.hit == 1.0].true_allele
).mean()
auc = roc_auc_score(train_df.hit.values, train_df.max_prediction.values)
print("Mean prediction for hit", mean_predictions_for_hit)
print("Mean prediction for decoy", mean_predictions_for_decoy)
print("Correct predicted allele fraction", correct_allele_fraction)
print("AUC", auc)
metric_rows.append((
mean_predictions_for_hit,
mean_predictions_for_decoy,
correct_allele_fraction,
auc,
))
update_motifs()
return (predictions, auc)
alleles=allele_encoding))
affinities_predictions = from_ic50(affinities_predictions)
for (kind, predictions) in [
("affinities", affinities_predictions),
("ligandome", ligandome_prediction)]:
train_df["max_prediction"] = predictions.max(1)
train_df["predicted_allele"] = pandas.Series(alleles).loc[
predictions.argmax(1).flatten()
].values
print(kind)
print(predictions)
mean_predictions_for_hit = train_df.loc[
train_df.hit == 1.0
].max_prediction.mean()
mean_predictions_for_decoy = train_df.loc[
train_df.hit == 0.0
].max_prediction.mean()
correct_allele_fraction = (
train_df.loc[train_df.hit == 1.0].predicted_allele ==
train_df.loc[train_df.hit == 1.0].true_allele
).mean()
auc = roc_auc_score(train_df.hit.values, train_df.max_prediction.values)
print(kind, "Mean prediction for hit", mean_predictions_for_hit)
print(kind, "Mean prediction for decoy", mean_predictions_for_decoy)
print(kind, "Correct predicted allele fraction", correct_allele_fraction)
print(kind, "AUC", auc)
metric_rows.append((
kind,
mean_predictions_for_hit,
mean_predictions_for_decoy,
correct_allele_fraction,
auc,
))
update_motifs()
return (ligandome_prediction, auc)
print("Pre fitting:")
progress()
@@ -504,6 +734,7 @@ def Xtest_synthetic_allele_refinement(max_epochs=10):
metrics = pandas.DataFrame(
metric_rows,
columns=[
"output",
"mean_predictions_for_hit",
"mean_predictions_for_decoy",
"correct_allele_fraction",
......