From 26e6d92756b43c958fa04709d15162eda1df6e3d Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Sun, 28 Jan 2018 20:19:11 -0500
Subject: [PATCH] Add robust mean as centrality measure to combine predictions

---
 .../models_class1/GENERATE.sh                 |  4 +-
 .../models_class1/generate_hyperparameters.py | 76 +++++++++++++++++++
 .../models_class1/hyperparameters.yaml        |  0
 mhcflurry/class1_affinity_predictor.py        | 19 +++--
 mhcflurry/ensemble_centrality.py              | 37 +++++++++
 5 files changed, 129 insertions(+), 7 deletions(-)
 create mode 100644 downloads-generation/models_class1/generate_hyperparameters.py
 delete mode 100644 downloads-generation/models_class1/hyperparameters.yaml
 create mode 100644 mhcflurry/ensemble_centrality.py

diff --git a/downloads-generation/models_class1/GENERATE.sh b/downloads-generation/models_class1/GENERATE.sh
index cf77bcf7..79a48cf7 100755
--- a/downloads-generation/models_class1/GENERATE.sh
+++ b/downloads-generation/models_class1/GENERATE.sh
@@ -29,7 +29,7 @@ cd $SCRATCH_DIR/$DOWNLOAD_NAME
 
 mkdir models
 
-cp $SCRIPT_DIR/hyperparameters.yaml .
+python $SCRIPT_DIR/generate_hyperparameters.py > hyperparameters.yaml
 
 time mhcflurry-class1-train-allele-specific-models \
     --data "$(mhcflurry-downloads path data_curated)/curated_training_data.with_mass_spec.csv.bz2" \
@@ -37,7 +37,7 @@ time mhcflurry-class1-train-allele-specific-models \
     --out-models-dir models \
     --percent-rank-calibration-num-peptides-per-length 1000000 \
     --min-measurements-per-allele 75 \
-    --num-jobs 0
+    --num-jobs 32
 
 cp $SCRIPT_ABSOLUTE_PATH .
 bzip2 LOG.txt
diff --git a/downloads-generation/models_class1/generate_hyperparameters.py b/downloads-generation/models_class1/generate_hyperparameters.py
new file mode 100644
index 00000000..ae249a95
--- /dev/null
+++ b/downloads-generation/models_class1/generate_hyperparameters.py
@@ -0,0 +1,76 @@
+"""
+Generate grid of hyperparameters
+"""
+
+from sys import stdout
+from copy import deepcopy
+from yaml import dump
+
+base_hyperparameters = {
+    ##########################################
+    # ENSEMBLE SIZE
+    ##########################################
+    "n_models": 1,
+
+    ##########################################
+    # OPTIMIZATION
+    ##########################################
+    "max_epochs": 500,
+    "patience": 20,
+    "early_stopping": True,
+    "validation_split": 0.1,
+    "minibatch_size": 128,
+    "loss": "custom:mse_with_inequalities",
+
+    ##########################################
+    # RANDOM NEGATIVE PEPTIDES
+    ##########################################
+    "random_negative_rate": 0.2,
+    "random_negative_constant": 25,
+    "random_negative_affinity_min": 20000.0,
+    "random_negative_affinity_max": 50000.0,
+
+    ##########################################
+    # PEPTIDE REPRESENTATION
+    ##########################################
+    # One of "one-hot", "embedding", or "BLOSUM62".
+    "peptide_amino_acid_encoding": "BLOSUM62",
+    "use_embedding": False,  # maintained for backward compatibility
+    "embedding_output_dim": 8,  # only used if using embedding
+    "kmer_size": 15,
+
+    ##########################################
+    # NEURAL NETWORK ARCHITECTURE
+    ##########################################
+    "locally_connected_layers": [
+        {
+            "filters": 8,
+            "activation": "tanh",
+            "kernel_size": 3
+        }
+    ],
+    "activation": "relu",
+    "output_activation": "sigmoid",
+    "layer_sizes": [16],
+    "dense_layer_l1_regularization": 0.001,
+    "batch_normalization": False,
+    "dropout_probability": 0.0,
+}
+
+grid = []
+for dense_layer_size in [64, 16]:
+    for num_lc in [0, 1, 2]:
+        for lc_kernel_size in [3, 5]:
+            new = deepcopy(base_hyperparameters)
+            new["layer_sizes"] = [dense_layer_size]
+            (lc_layer,) = new["locally_connected_layers"]
+            lc_layer['kernel_size'] = lc_kernel_size
+            if num_lc == 0:
+                new["locally_connected_layers"] = []
+            elif num_lc == 1:
+                new["locally_connected_layers"] = [lc_layer]
+            elif num_lc == 2:
+                new["locally_connected_layers"] = [lc_layer, deepcopy(lc_layer)]
+            # Append for EVERY num_lc value (was wrongly nested under num_lc == 2,
+            # dropping all configs with 0 or 1 locally connected layers). The guard
+            # dedupes: num_lc == 0 makes lc_kernel_size a no-op.
+            if new not in grid: grid.append(new)
+
+dump(grid, stdout)
diff --git a/downloads-generation/models_class1/hyperparameters.yaml b/downloads-generation/models_class1/hyperparameters.yaml
deleted file mode 100644
index e69de29b..00000000
diff --git a/mhcflurry/class1_affinity_predictor.py b/mhcflurry/class1_affinity_predictor.py
index fa2023eb..46e8d080 100644
--- a/mhcflurry/class1_affinity_predictor.py
+++ b/mhcflurry/class1_affinity_predictor.py
@@ -9,12 +9,10 @@ from os.path import join, exists
 from os import mkdir
 from socket import gethostname
 from getpass import getuser
-from functools import partial
 
 import mhcnames
 import numpy
 import pandas
-import tqdm  # progress bars
 from numpy.testing import assert_equal
 from six import string_types
 
@@ -25,6 +23,7 @@ from .encodable_sequences import EncodableSequences
 from .percent_rank_transform import PercentRankTransform
 from .regression_target import to_ic50
 from .version import __version__
+from .ensemble_centrality import CENTRALITY_MEASURES
 
 
 class Class1AffinityPredictor(object):
@@ -672,7 +671,8 @@ class Class1AffinityPredictor(object):
             allele=None,
             throw=True,
             include_individual_model_predictions=False,
-            include_percentile_ranks=True):
+            include_percentile_ranks=True,
+            centrality_measure="robust_mean"):
         """
         Predict nM binding affinities. Gives more detailed output than `predict`
         method, including 5-95% prediction intervals.
@@ -701,6 +701,9 @@ class Class1AffinityPredictor(object):
             If True, a "prediction_percentile" column will be included giving the
             percentile ranks. If no percentile rank information is available,
             this will be ignored with a warning.
+        centrality_measure : string or callable
+            Measure of central tendency to use to combine predictions in the
+            ensemble.
 
         Returns
         -------
@@ -817,9 +820,15 @@ class Class1AffinityPredictor(object):
         df_predictions = df[
             [c for c in df.columns if c.startswith("model_")]
         ]
+
+        if callable(centrality_measure):
+            centrality_function = centrality_measure
+        else:
+            centrality_function = CENTRALITY_MEASURES[centrality_measure]
+
         logs = numpy.log(df_predictions)
-        log_means = logs.mean(1)
-        df["prediction"] = numpy.exp(log_means)
+        log_centers = centrality_function(logs.values)
+        df["prediction"] = numpy.exp(log_centers)
         df["prediction_low"] = numpy.exp(logs.quantile(0.05, axis=1))
         df["prediction_high"] = numpy.exp(logs.quantile(0.95, axis=1))
 
diff --git a/mhcflurry/ensemble_centrality.py b/mhcflurry/ensemble_centrality.py
new file mode 100644
index 00000000..8a812d76
--- /dev/null
+++ b/mhcflurry/ensemble_centrality.py
@@ -0,0 +1,37 @@
+"""
+Measures of centrality (e.g. mean) used to combine predictions across an
+ensemble. The input to these functions are log affinities, and they are expected
+to return a centrality measure also in log-space.
+"""
+
+import numpy
+from functools import partial
+
+
+def robust_mean(log_values):
+    """
+    Mean of values falling within the 25-75 percentiles.
+
+    Parameters
+    ----------
+    log_values : 2-d numpy.array
+        Center is computed along the second axis (i.e. per row).
+
+    Returns
+    -------
+    center : numpy.array of length log_values.shape[0]
+
+    """
+    if log_values.shape[1] <= 3:
+        # Too few values to use robust mean.
+        return numpy.nanmean(log_values, axis=1)
+    mask = (
+        (log_values <= numpy.nanpercentile(log_values, 75, axis=1).reshape((-1, 1))) &
+        (log_values >= numpy.nanpercentile(log_values, 25, axis=1).reshape((-1, 1))))
+    return (numpy.nan_to_num(log_values) * mask.astype(float)).sum(1) / mask.sum(1)
+
+CENTRALITY_MEASURES = {
+    "mean": partial(numpy.nanmean, axis=1),
+    "median": partial(numpy.nanmedian, axis=1),
+    "robust_mean": robust_mean,
+}
-- 
GitLab