From 1a7264bae17067abbebbb005ea256e12d30e229c Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Sun, 14 Jan 2018 18:23:42 -0500
Subject: [PATCH] First cut at model selection and training on data with
 inequalities

---
 docs/python_tutorial.rst                      |   5 +-
 mhcflurry/class1_affinity_predictor.py        | 151 ++++++++++--------
 mhcflurry/class1_neural_network.py            | 118 ++++++++++----
 mhcflurry/loss_with_inequalities.py           |  78 +++++++++
 .../train_allele_specific_models_command.py   |   2 +-
 test/test_class1_affinity_predictor.py        |   4 +-
 6 files changed, 255 insertions(+), 103 deletions(-)
 create mode 100644 mhcflurry/loss_with_inequalities.py

diff --git a/docs/python_tutorial.rst b/docs/python_tutorial.rst
index f8a93799..9b3d8b88 100644
--- a/docs/python_tutorial.rst
+++ b/docs/python_tutorial.rst
@@ -87,15 +87,16 @@ some models.
     >>> single_allele_train_data = df.loc[df.allele == "HLA-B*57:01"].sample(100)
     >>> new_predictor.fit_allele_specific_predictors(
     ...    n_models=1,
-    ...    architecture_hyperparameters={
+    ...    architecture_hyperparameters_list=[{
     ...         "layer_sizes": [16],
     ...         "max_epochs": 5,
     ...         "random_negative_constant": 5,
-    ...    },
+    ...    }],
     ...    peptides=single_allele_train_data.peptide.values,
     ...    affinities=single_allele_train_data.measurement_value.values,
     ...    allele="HLA-B*57:01")
 
+
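+To try several architectures and keep the best one for each model, you can
+pass multiple hyperparameter dicts. Selection is by validation loss, so the
+sketch below assumes hyperparameters that produce a validation split:
+
+    >>> new_predictor.fit_allele_specific_predictors(
+    ...    n_models=1,
+    ...    architecture_hyperparameters_list=[
+    ...        {"layer_sizes": [4], "max_epochs": 5},
+    ...        {"layer_sizes": [16], "max_epochs": 5},
+    ...    ],
+    ...    peptides=single_allele_train_data.peptide.values,
+    ...    affinities=single_allele_train_data.measurement_value.values,
+    ...    allele="HLA-B*57:01")
+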
 The `~mhcflurry.Class1AffinityPredictor.fit_allele_specific_predictors` method
 can be called any number of times on the same instance to build up ensembles
-of models across alleles. The `architecture_hyperparameters` we specified are
+of models across alleles. The `architecture_hyperparameters_list` we specified are
diff --git a/mhcflurry/class1_affinity_predictor.py b/mhcflurry/class1_affinity_predictor.py
index 1c523e35..f56865eb 100644
--- a/mhcflurry/class1_affinity_predictor.py
+++ b/mhcflurry/class1_affinity_predictor.py
@@ -362,10 +362,11 @@ class Class1AffinityPredictor(object):
     def fit_allele_specific_predictors(
             self,
             n_models,
-            architecture_hyperparameters,
+            architecture_hyperparameters_list,
             allele,
             peptides,
             affinities,
+            inequalities=None,
             models_dir_for_save=None,
             verbose=1,
             progress_preamble=""):
@@ -381,7 +382,7 @@ class Class1AffinityPredictor(object):
         n_models : int
             Number of neural networks to fit
         
-        architecture_hyperparameters : dict 
+        architecture_hyperparameters_list : list of dict
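+            List of hyperparameter dicts to try for each model. When more
+            than one is given, the architecture with the lowest validation
+            loss is selected per model (this requires hyperparameters that
+            produce a validation split).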
                
         allele : string
         
@@ -389,6 +390,9 @@ class Class1AffinityPredictor(object):
         
         affinities : list of float
             nM affinities
+
+        inequalities : list of string, each element one of ">", "<", or "="
+            See Class1NeuralNetwork.fit for details.
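+            For example, inequalities=[">", "=", "<"] with
+            affinities=[5000.0, 100.0, 50.0] encodes the measurements
+            "> 5000 nM", "= 100 nM", and "< 50 nM".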
         
         models_dir_for_save : string, optional
             If specified, the Class1AffinityPredictor is (incrementally) written
@@ -406,35 +410,81 @@ class Class1AffinityPredictor(object):
         """
 
         allele = mhcnames.normalize_allele_name(allele)
-        models = self._fit_predictors(
-            n_models=n_models,
-            architecture_hyperparameters=architecture_hyperparameters,
-            peptides=peptides,
-            affinities=affinities,
-            allele_pseudosequences=None,
-            verbose=verbose,
-            progress_preamble=progress_preamble)
-
         if allele not in self.allele_to_allele_specific_models:
             self.allele_to_allele_specific_models[allele] = []
 
-        models_list = []
-        for (i, model) in enumerate(models):
-            model_name = self.model_name(allele, i)
-            models_list.append(model)  # models is a generator
+        encodable_peptides = EncodableSequences.create(peptides)
+
+        n_architectures = len(architecture_hyperparameters_list)
+        if n_models > 1 or n_architectures > 1:
+            # Adjust progress info to indicate number of models and
+            # architectures.
+            pieces = []
+            if n_models > 1:
+                pieces.append("Model {model_num:2d} / {n_models:2d}")
+            if n_architectures > 1:
+                pieces.append(
+                    "Architecture {architecture_num:2d} / {n_architectures:2d}"
+                    " (best so far: {best_num:2d)")
+            progress_preamble_template = "[ %s ] {user_progress_preamble}" % (
+                ", ".join(pieces))
+        else:
+            # Just use the user-provided progress message.
+            progress_preamble_template = "{user_progress_preamble}"
+
+        models = []
+        for model_num in range(n_models):
+            shuffle_permutation = numpy.random.permutation(len(affinities))
+
+            best_num = None
+            best_loss = None
+            best_model = None
+            for (architecture_num, architecture_hyperparameters) in enumerate(
+                    architecture_hyperparameters_list):
+                model = Class1NeuralNetwork(**architecture_hyperparameters)
+                model.fit(
+                    encodable_peptides,
+                    affinities,
+                    shuffle_permutation=shuffle_permutation,
+                    inequalities=inequalities,
+                    verbose=verbose,
+                    progress_preamble=progress_preamble_template.format(
+                        user_progress_preamble=progress_preamble,
+                        best_num=best_num,
+                        model_num=model_num,
+                        n_models=n_models,
+                        architecture_num=architecture_num,
+                        n_architectures=n_architectures))
+
+
+                if n_architectures > 1:
+                    # We require val_loss (i.e. a validation set) if we have
+                    # multiple architectures.
+                    loss = model.loss_history['val_loss'][-1]
+                else:
+                    loss = None
+                if loss is None or best_loss is None or best_loss > loss:
+                    best_loss = loss
+                    best_num = architecture_num
+                    best_model = model
+                del model
+
+            model_name = self.model_name(allele, model_num)
             row = pandas.Series(collections.OrderedDict([
                 ("model_name", model_name),
                 ("allele", allele),
-                ("config_json", json.dumps(model.get_config())),
-                ("model", model),
+                ("config_json", json.dumps(best_model.get_config())),
+                ("model", best_model),
             ])).to_frame().T
             self.manifest_df = pandas.concat(
                 [self.manifest_df, row], ignore_index=True)
-            self.allele_to_allele_specific_models[allele].append(model)
+            self.allele_to_allele_specific_models[allele].append(best_model)
             if models_dir_for_save:
                 self.save(
                     models_dir_for_save, model_names_to_write=[model_name])
-        return models_list
+            models.append(best_model)
+
+        return models
 
     def fit_class1_pan_allele_models(
             self,
@@ -486,19 +536,19 @@ class Class1AffinityPredictor(object):
         alleles = pandas.Series(alleles).map(mhcnames.normalize_allele_name)
         allele_pseudosequences = alleles.map(self.allele_to_pseudosequence)
 
-        models = self._fit_predictors(
-            n_models=n_models,
-            architecture_hyperparameters=architecture_hyperparameters,
-            peptides=peptides,
-            affinities=affinities,
-            allele_pseudosequences=allele_pseudosequences,
-            verbose=verbose,
-            progress_preamble=progress_preamble)
+        encodable_peptides = EncodableSequences.create(peptides)
+        models = []
+        for i in range(n_models):
+            logging.info("Training model %d / %d" % (i + 1, n_models))
+            model = Class1NeuralNetwork(**architecture_hyperparameters)
+            model.fit(
+                encodable_peptides,
+                affinities,
+                allele_pseudosequences=allele_pseudosequences,
+                verbose=verbose,
+                progress_preamble=progress_preamble)
 
-        models_list = []
-        for (i, model) in enumerate(models):
             model_name = self.model_name("pan-class1", i)
-            models_list.append(model)  # models is a generator
             self.class1_pan_allele_models.append(model)
             row = pandas.Series(collections.OrderedDict([
                 ("model_name", model_name),
@@ -511,46 +561,9 @@ class Class1AffinityPredictor(object):
             if models_dir_for_save:
                 self.save(
                     models_dir_for_save, model_names_to_write=[model_name])
-        return models_list
-
-    def _fit_predictors(
-            self,
-            n_models,
-            architecture_hyperparameters,
-            peptides,
-            affinities,
-            allele_pseudosequences,
-            verbose=1,
-            progress_preamble = ""):
-        """
-        Private helper method
-        
-        Parameters
-        ----------
-        n_models : int
-        architecture_hyperparameters : dict
-        peptides : EncodableSequences or list of string
-        affinities : list of float
-        allele_pseudosequences : EncodableSequences or list of string
-        verbose : int
-        progress_preamble : string
-            Optional string of information to include in each progress update
+            models.append(model)
 
-        Returns
-        -------
-        generator of `Class1NeuralNetwork`
-        """
-        encodable_peptides = EncodableSequences.create(peptides)
-        for i in range(n_models):
-            logging.info("Training model %d / %d" % (i + 1, n_models))
-            model = Class1NeuralNetwork(**architecture_hyperparameters)
-            model.fit(
-                encodable_peptides,
-                affinities,
-                allele_pseudosequences=allele_pseudosequences,
-                verbose=verbose,
-                progress_preamble=progress_preamble)
-            yield model
+        return models
 
     def calibrate_percentile_ranks(
             self,
diff --git a/mhcflurry/class1_neural_network.py b/mhcflurry/class1_neural_network.py
index eafad7c2..c8e674b2 100644
--- a/mhcflurry/class1_neural_network.py
+++ b/mhcflurry/class1_neural_network.py
@@ -5,12 +5,13 @@ import logging
 import numpy
 import pandas
 
-from mhcflurry.hyperparameters import HyperparameterDefaults
+from .hyperparameters import HyperparameterDefaults
 
-from mhcflurry.encodable_sequences import EncodableSequences
-from mhcflurry.amino_acid import available_vector_encodings, vector_encoding_length
-from mhcflurry.regression_target import to_ic50, from_ic50
-from mhcflurry.common import random_peptides, amino_acid_distribution
+from .encodable_sequences import EncodableSequences
+from .amino_acid import available_vector_encodings, vector_encoding_length
+from .regression_target import to_ic50, from_ic50
+from .common import random_peptides, amino_acid_distribution
+from .loss_with_inequalities import encode_y, LOSSES
 
 
 class Class1NeuralNetwork(object):
@@ -332,7 +333,9 @@ class Class1NeuralNetwork(object):
             peptides,
             affinities,
             allele_pseudosequences=None,
+            inequalities=None,
             sample_weights=None,
+            shuffle_permutation=None,
             verbose=1,
             progress_preamble=""):
         """
@@ -347,12 +350,24 @@ class Class1NeuralNetwork(object):
         
         allele_pseudosequences : EncodableSequences or list of string, optional
             If not specified, the model will be a single-allele predictor.
+
+        inequalities : list of string, optional
+            Inequalities to use for fitting, same length as affinities. Each
+            element must be one of ">", "<", or "=". For example, a ">"
+            indicates the true affinity is greater (in nM) than the given
+            value, so the model is penalized only if it predicts a stronger
+            affinity. Requires a custom loss that supports inequalities
+            (e.g. mse_with_inequalities). If None, all inequalities are
+            taken to be "=".
             
         sample_weights : list of float, optional
             If not specified, all samples (including random negatives added
             during training) will have equal weight. If specified, the random
             negatives will be assigned weight=1.0.
-        
+
+        shuffle_permutation : list of int, optional
+            Permutation (list of indices) of the same length as peptides and
+            affinities. If None, a random permutation will be generated.
+            Specifying a permutation lets the architectures compared during
+            model selection share the same train/validation split.
+
         verbose : int
             Keras verbosity level
 
@@ -391,6 +406,18 @@ class Class1NeuralNetwork(object):
 
         y_values = from_ic50(affinities)
         assert numpy.isnan(y_values).sum() == 0, numpy.isnan(y_values).sum()
+        if inequalities is not None:
+            # Reverse inequalities because from_ic50() flips the direction
+            # (i.e. lower affinity results in higher y values).
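+            # For example, a measurement of "< 100 nM" (a strong binder)
+            # means y_true lies above from_ic50(100), so "<" becomes ">".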
+            adjusted_inequalities = pandas.Series(inequalities).map({
+                "=": "=",
+                ">": "<",
+                "<": ">",
+            }).values
+        else:
+            adjusted_inequalities = numpy.tile("=", len(y_values))
+        if len(adjusted_inequalities) != len(y_values):
+            raise ValueError("Inequalities and y_values must have same length")
 
         x_dict_without_random_negatives = {
             'peptide': peptide_encoding,
@@ -406,34 +433,70 @@ class Class1NeuralNetwork(object):
         # Shuffle y_values and the contents of x_dict_without_random_negatives
         # This ensures different data is used for the test set for early stopping
         # when multiple models are trained.
-        shuffle_permutation = numpy.random.permutation(len(y_values))
+        if shuffle_permutation is None:
+            shuffle_permutation = numpy.random.permutation(len(y_values))
         y_values = y_values[shuffle_permutation]
         peptide_encoding = peptide_encoding[shuffle_permutation]
+        adjusted_inequalities = adjusted_inequalities[shuffle_permutation]
         for key in x_dict_without_random_negatives:
             x_dict_without_random_negatives[key] = (
                 x_dict_without_random_negatives[key][shuffle_permutation])
         if sample_weights is not None:
             sample_weights = sample_weights[shuffle_permutation]
 
+        if self.hyperparameters['loss'] in LOSSES:
+            # Using a custom loss that supports inequalities
+            loss_name_or_function = LOSSES[self.hyperparameters['loss']]
+            loss_supports_inequalities = True
+        else:
+            # Using a regular keras loss. No inequalities supported.
+            loss_name_or_function = self.hyperparameters['loss']
+            loss_supports_inequalities = False
+
+            if any(inequality != "=" for inequality in adjusted_inequalities):
+                raise ValueError("Loss %s does not support inequalities" % (
+                    loss_name_or_function))
+
         if self.network() is None:
             self._network = self.make_network(
                 pseudosequence_length=pseudosequence_length,
                 **self.network_hyperparameter_defaults.subselect(
                     self.hyperparameters))
-            self.compile()
-
-        y_dict_with_random_negatives = {
-            "output": numpy.concatenate([
-                from_ic50(
-                    numpy.random.uniform(
-                        self.hyperparameters[
-                            'random_negative_affinity_min'],
-                        self.hyperparameters[
-                            'random_negative_affinity_max'],
-                        int(num_random_negative.sum()))),
-                y_values,
-            ]),
-        }
+            self.network().compile(
+                loss=loss_name_or_function,
+                optimizer=self.hyperparameters['optimizer'])
+
+        if loss_supports_inequalities:
+            # Do not sample negative affinities: just use an inequality.
+            random_negative_ic50 = self.hyperparameters['random_negative_affinity_min']
+            random_negative_target = from_ic50(random_negative_ic50)
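+            # All random negatives share a single target with a "<"
+            # inequality: they are penalized only if predicted above
+            # (i.e. stronger than) from_ic50(random_negative_affinity_min).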
+
+            y_dict_with_random_negatives = {
+                "output": numpy.concatenate([
+                    numpy.tile(
+                        random_negative_target, int(num_random_negative.sum())),
+                    y_values,
+                ]),
+            }
+            # Note: we are using "<" here not ">" because the inequalities are
+            # now in target-space (0-1) not affinity-space.
+            adjusted_inequalities_with_random_negatives = (
+                ["<"] * int(num_random_negative.sum()) +
+                list(adjusted_inequalities))
+        else:
+            # Randomly sample random negative affinities
+            y_dict_with_random_negatives = {
+                "output": numpy.concatenate([
+                    from_ic50(
+                        numpy.random.uniform(
+                            self.hyperparameters[
+                                'random_negative_affinity_min'],
+                            self.hyperparameters[
+                                'random_negative_affinity_max'],
+                            int(num_random_negative.sum()))),
+                    y_values,
+                ]),
+            }
         if sample_weights is not None:
             sample_weights_with_random_negatives = numpy.concatenate([
                 numpy.ones(int(num_random_negative.sum())),
@@ -441,6 +504,11 @@ class Class1NeuralNetwork(object):
         else:
             sample_weights_with_random_negatives = None
 
+        if loss_supports_inequalities:
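+            # Fold the inequalities into the targets themselves; see
+            # loss_with_inequalities.encode_y for the interval encoding.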
+            y_dict_with_random_negatives['output'] = encode_y(
+                y_dict_with_random_negatives['output'],
+                adjusted_inequalities_with_random_negatives)
+
         val_losses = []
         min_val_loss_iteration = None
         min_val_loss = None
@@ -552,14 +620,6 @@ class Class1NeuralNetwork(object):
         predictions = numpy.array(raw_predictions, dtype = "float64")[:,0]
         return to_ic50(predictions)
 
-    def compile(self):
-        """
-        Compile the keras model. Used internally.
-        """
-        self.network().compile(
-            **self.compile_hyperparameter_defaults.subselect(
-                self.hyperparameters))
-
     @staticmethod
     def make_network(
             pseudosequence_length,
diff --git a/mhcflurry/loss_with_inequalities.py b/mhcflurry/loss_with_inequalities.py
new file mode 100644
index 00000000..a1efe068
--- /dev/null
+++ b/mhcflurry/loss_with_inequalities.py
@@ -0,0 +1,78 @@
+"""
+Custom loss functions.
+
+Supports training a regressor on data that includes inequalities (e.g. x < 100).
+
+This loss assumes that the normal range for y_true and y_pred is 0 - 1. As a
+hack, the implementation uses other intervals for y_true to encode the
+inequality information.
+
+y_true is interpreted as follows:
+
+between 0 - 1:
+   Regular MSE loss is used. Penalty (y_pred - y_true)**2 is applied if
+   y_pred is greater or less than y_true.
+
+between 2 - 3:
+   Treated as a "<" inequality. Penalty (y_pred - (y_true - 2))**2 is
+   applied only if y_pred is greater than y_true - 2.
+
+between 4 - 5:
+   Treated as a ">" inequality. Penalty (y_pred - (y_true - 4))**2 is
+   applied only if y_pred is less than y_true - 4.
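+
+For example, encode_y([0.1, 0.2, 0.3], inequalities=["=", "<", ">"]) yields
+approximately [0.1, 2.2, 4.3]: the second element is then penalized only if
+y_pred exceeds 0.2, and the third only if y_pred falls below 0.3.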
+"""
+
+from keras import backend as K
+
+import pandas
+import numpy
+
+LOSSES = {}
+
+
+def encode_y(y, inequalities=None):
+    y = numpy.array(y, dtype="float32")
+    if numpy.isnan(y).any():
+        raise ValueError("y contains NaN")
+    if (y > 1.0).any():
+        raise ValueError("y contains values > 1.0")
+    if (y < 0.0).any():
+        raise ValueError("y contains values < 0.0")
+
+    if inequalities is None:
+        encoded = y
+    else:
+        offsets = pandas.Series(inequalities).map({
+            '=': 0,
+            '<': 2,
+            '>': 4,
+        }).values
+        if numpy.isnan(offsets).any():
+            raise ValueError("Invalid inequality. Must be =, <, or >")
+        encoded = y + offsets
+    assert not numpy.isnan(encoded).any()
+    return encoded
+
+
+def mse_with_inequalities(y_true, y_pred):
+    # Handle (=) inequalities
+    diff1 = y_pred - y_true
+    diff1 *= K.cast(y_true >= 0.0, "float32")
+    diff1 *= K.cast(y_true <= 1.0, "float32")
+
+    # Handle (<) inequalities: penalize only if y_pred overshoots the bound.
+    diff2 = y_pred - (y_true - 2.0)
+    diff2 *= K.cast(y_true >= 2.0, "float32")
+    diff2 *= K.cast(y_true <= 3.0, "float32")
+    diff2 *= K.cast(diff2 > 0.0, "float32")
+
+    # Handle (>) inequalities: penalize only if y_pred undershoots the bound.
+    diff3 = y_pred - (y_true - 4.0)
+    diff3 *= K.cast(y_true >= 4.0, "float32")
+    diff3 *= K.cast(diff3 < 0.0, "float32")
+
+    return (
+        K.sum(K.square(diff1), axis=-1) +
+        K.sum(K.square(diff2), axis=-1) +
+        K.sum(K.square(diff3), axis=-1))
+LOSSES["mse_with_ineqalities"] = mse_with_ineqalities
\ No newline at end of file
diff --git a/mhcflurry/train_allele_specific_models_command.py b/mhcflurry/train_allele_specific_models_command.py
index 6602761b..b095e6a4 100644
--- a/mhcflurry/train_allele_specific_models_command.py
+++ b/mhcflurry/train_allele_specific_models_command.py
@@ -259,7 +259,7 @@ def process_work(
     train_data = data.sample(frac=1.0)
     (model,) = predictor.fit_allele_specific_predictors(
         n_models=1,
-        architecture_hyperparameters=hyperparameters,
+        architecture_hyperparameters_list=[hyperparameters],
         allele=allele,
         peptides=train_data.peptide.values,
         affinities=train_data.measurement_value.values,
diff --git a/test/test_class1_affinity_predictor.py b/test/test_class1_affinity_predictor.py
index 65f1f587..320b3961 100644
--- a/test/test_class1_affinity_predictor.py
+++ b/test/test_class1_affinity_predictor.py
@@ -92,7 +92,7 @@ def test_a1_known_epitopes_in_newly_trained_model():
     predictor = Class1AffinityPredictor()
     predictor.fit_allele_specific_predictors(
         n_models=2,
-        architecture_hyperparameters=hyperparameters,
+        architecture_hyperparameters_list=[hyperparameters],
         allele=allele,
         peptides=df.peptide.values,
         affinities=df.measurement_value.values,
@@ -153,7 +153,7 @@ def test_class1_affinity_predictor_a0205_memorize_training_data():
     predictor = Class1AffinityPredictor()
     predictor.fit_allele_specific_predictors(
         n_models=2,
-        architecture_hyperparameters=hyperparameters,
+        architecture_hyperparameters_list=[hyperparameters],
         allele=allele,
         peptides=df.peptide.values,
         affinities=df.measurement_value.values,
-- 
GitLab