Skip to content
Snippets Groups Projects
Commit 6287ee2b authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

Add robust mean as centrality measure to combine predictions

parent 2466408b
Branches
No related tags found
No related merge requests found
......@@ -29,7 +29,7 @@ cd $SCRATCH_DIR/$DOWNLOAD_NAME
mkdir models
cp $SCRIPT_DIR/hyperparameters.yaml .
python $SCRIPT_DIR/generate_hyperparameters.py > hyperparameters.yaml
time mhcflurry-class1-train-allele-specific-models \
--data "$(mhcflurry-downloads path data_curated)/curated_training_data.with_mass_spec.csv.bz2" \
......@@ -37,7 +37,7 @@ time mhcflurry-class1-train-allele-specific-models \
--out-models-dir models \
--percent-rank-calibration-num-peptides-per-length 1000000 \
--min-measurements-per-allele 75 \
--num-jobs 0
--num-jobs 32
cp $SCRIPT_ABSOLUTE_PATH .
bzip2 LOG.txt
......
"""
Generate grid of hyperparameters
"""
from sys import stdout
from copy import deepcopy
from yaml import dump
base_hyperparameters = {
##########################################
# ENSEMBLE SIZE
##########################################
"n_models": 1,
##########################################
# OPTIMIZATION
##########################################
"max_epochs": 500,
"patience": 20,
"early_stopping": True,
"validation_split": 0.1,
"minibatch_size": 128,
"loss": "custom:mse_with_inequalities",
##########################################
# RANDOM NEGATIVE PEPTIDES
##########################################
"random_negative_rate": 0.2,
"random_negative_constant": 25,
"random_negative_affinity_min": 20000.0,
"random_negative_affinity_max": 50000.0,
##########################################
# PEPTIDE REPRESENTATION
##########################################
# One of "one-hot", "embedding", or "BLOSUM62".
"peptide_amino_acid_encoding": "BLOSUM62",
"use_embedding": False, # maintained for backward compatability
"embedding_output_dim": 8, # only used if using embedding
"kmer_size": 15,
##########################################
# NEURAL NETWORK ARCHITECTURE
##########################################
"locally_connected_layers": [
{
"filters": 8,
"activation": "tanh",
"kernel_size": 3
}
],
"activation": "relu",
"output_activation": "sigmoid",
"layer_sizes": [16],
"dense_layer_l1_regularization": 0.001,
"batch_normalization": False,
"dropout_probability": 0.0,
}
grid = []
for dense_layer_size in [64, 16]:
for num_lc in [0, 1, 2]:
for lc_kernel_size in [3, 5]:
new = deepcopy(base_hyperparameters)
new["layer_sizes"] = [dense_layer_size]
(lc_layer,) = new["locally_connected_layers"]
lc_layer['kernel_size'] = lc_kernel_size
if num_lc == 0:
new["locally_connected_layers"] = []
elif num_lc == 1:
new["locally_connected_layers"] = [lc_layer]
elif num_lc == 2:
new["locally_connected_layers"] = [lc_layer, deepcopy(lc_layer)]
grid.append(new)
dump(grid, stdout)
\ No newline at end of file
......@@ -9,12 +9,10 @@ from os.path import join, exists
from os import mkdir
from socket import gethostname
from getpass import getuser
from functools import partial
import mhcnames
import numpy
import pandas
import tqdm # progress bars
from numpy.testing import assert_equal
from six import string_types
......@@ -25,6 +23,7 @@ from .encodable_sequences import EncodableSequences
from .percent_rank_transform import PercentRankTransform
from .regression_target import to_ic50
from .version import __version__
from .ensemble_centrality import CENTRALITY_MEASURES
class Class1AffinityPredictor(object):
......@@ -672,7 +671,8 @@ class Class1AffinityPredictor(object):
allele=None,
throw=True,
include_individual_model_predictions=False,
include_percentile_ranks=True):
include_percentile_ranks=True,
centrality_measure="robust_mean"):
"""
Predict nM binding affinities. Gives more detailed output than `predict`
method, including 5-95% prediction intervals.
......@@ -701,6 +701,9 @@ class Class1AffinityPredictor(object):
If True, a "prediction_percentile" column will be included giving the
percentile ranks. If no percentile rank information is available,
this will be ignored with a warning.
centrality_measure : string or callable
Measure of central tendency to use to combine predictions in the
ensemble.
Returns
-------
......@@ -817,9 +820,15 @@ class Class1AffinityPredictor(object):
df_predictions = df[
[c for c in df.columns if c.startswith("model_")]
]
if callable(centrality_measure):
centrality_function = centrality_measure
else:
centrality_function = CENTRALITY_MEASURES[centrality_measure]
logs = numpy.log(df_predictions)
log_means = logs.mean(1)
df["prediction"] = numpy.exp(log_means)
log_centers = centrality_function(logs.values)
df["prediction"] = numpy.exp(log_centers)
df["prediction_low"] = numpy.exp(logs.quantile(0.05, axis=1))
df["prediction_high"] = numpy.exp(logs.quantile(0.95, axis=1))
......
"""
Measures of centrality (e.g. mean) used to combine predictions across an
ensemble. The input to these functions are log affinities, and they are expected
to return a centrality measure also in log-space.
"""
import numpy
from functools import partial
def robust_mean(log_values):
"""
Mean of values falling within the 25-75 percentiles.
Parameters
----------
log_values : 2-d numpy.array
Center is computed along the second axis (i.e. per row).
Returns
-------
center : numpy.array of length log_values.shape[1]
"""
if log_values.shape[1] <= 3:
# Too few values to use robust mean.
return numpy.nanmean(log_values, axis=1)
mask = (
(log_values <= numpy.nanpercentile(log_values, 75, axis=1).reshape((-1, 1))) &
(log_values >= numpy.nanpercentile(log_values, 25, axis=1).reshape((-1, 1))))
return (log_values * mask.astype(float)).sum(1) / mask.sum(1)
CENTRALITY_MEASURES = {
"mean": partial(numpy.nanmean, axis=1),
"median": partial(numpy.nanmedian, axis=1),
"robust_mean": robust_mean,
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment