Skip to content
Snippets Groups Projects
Commit 26e6d927 authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

Add robust mean as centrality measure to combine predictions

parent 4f36f57b
No related branches found
No related tags found
No related merge requests found
...@@ -29,7 +29,7 @@ cd $SCRATCH_DIR/$DOWNLOAD_NAME ...@@ -29,7 +29,7 @@ cd $SCRATCH_DIR/$DOWNLOAD_NAME
mkdir models mkdir models
cp $SCRIPT_DIR/hyperparameters.yaml . python $SCRIPT_DIR/generate_hyperparameters.py > hyperparameters.yaml
time mhcflurry-class1-train-allele-specific-models \ time mhcflurry-class1-train-allele-specific-models \
--data "$(mhcflurry-downloads path data_curated)/curated_training_data.with_mass_spec.csv.bz2" \ --data "$(mhcflurry-downloads path data_curated)/curated_training_data.with_mass_spec.csv.bz2" \
...@@ -37,7 +37,7 @@ time mhcflurry-class1-train-allele-specific-models \ ...@@ -37,7 +37,7 @@ time mhcflurry-class1-train-allele-specific-models \
--out-models-dir models \ --out-models-dir models \
--percent-rank-calibration-num-peptides-per-length 1000000 \ --percent-rank-calibration-num-peptides-per-length 1000000 \
--min-measurements-per-allele 75 \ --min-measurements-per-allele 75 \
--num-jobs 0 --num-jobs 32
cp $SCRIPT_ABSOLUTE_PATH . cp $SCRIPT_ABSOLUTE_PATH .
bzip2 LOG.txt bzip2 LOG.txt
......
"""
Generate grid of hyperparameters
"""
from sys import stdout
from copy import deepcopy
from yaml import dump
# Default hyperparameters shared by every entry in the grid. Each grid entry
# is a deep copy of this dict with the architecture fields overridden below.
base_hyperparameters = {
    ##########################################
    # ENSEMBLE SIZE
    ##########################################
    "n_models": 1,

    ##########################################
    # OPTIMIZATION
    ##########################################
    "max_epochs": 500,
    "patience": 20,
    "early_stopping": True,
    "validation_split": 0.1,
    "minibatch_size": 128,
    "loss": "custom:mse_with_inequalities",

    ##########################################
    # RANDOM NEGATIVE PEPTIDES
    ##########################################
    "random_negative_rate": 0.2,
    "random_negative_constant": 25,
    "random_negative_affinity_min": 20000.0,
    "random_negative_affinity_max": 50000.0,

    ##########################################
    # PEPTIDE REPRESENTATION
    ##########################################
    # One of "one-hot", "embedding", or "BLOSUM62".
    "peptide_amino_acid_encoding": "BLOSUM62",
    "use_embedding": False,  # maintained for backward compatibility
    "embedding_output_dim": 8,  # only used if using embedding
    "kmer_size": 15,

    ##########################################
    # NEURAL NETWORK ARCHITECTURE
    ##########################################
    "locally_connected_layers": [
        {
            "filters": 8,
            "activation": "tanh",
            "kernel_size": 3
        }
    ],
    "activation": "relu",
    "output_activation": "sigmoid",
    "layer_sizes": [16],
    "dense_layer_l1_regularization": 0.001,
    "batch_normalization": False,
    "dropout_probability": 0.0,
}

# Build the architecture grid: dense layer size x number of locally connected
# layers x locally connected kernel size.
grid = []
for dense_layer_size in [64, 16]:
    for num_lc in [0, 1, 2]:
        for lc_kernel_size in [3, 5]:
            new = deepcopy(base_hyperparameters)
            new["layer_sizes"] = [dense_layer_size]
            (lc_layer,) = new["locally_connected_layers"]
            lc_layer["kernel_size"] = lc_kernel_size
            # Stack num_lc copies of the template layer; every layer is its
            # own dict so no two entries in the list share state.
            new["locally_connected_layers"] = [
                deepcopy(lc_layer) for _ in range(num_lc)
            ]
            # With num_lc == 0 the kernel size has no effect, so both kernel
            # sizes produce the same configuration. Keep only the first one
            # instead of emitting the same architecture twice.
            if new not in grid:
                grid.append(new)

# Only write the YAML when executed as a script, so that importing this module
# (e.g. to inspect `grid`) does not print to stdout.
if __name__ == "__main__":
    dump(grid, stdout)
\ No newline at end of file
...@@ -9,12 +9,10 @@ from os.path import join, exists ...@@ -9,12 +9,10 @@ from os.path import join, exists
from os import mkdir from os import mkdir
from socket import gethostname from socket import gethostname
from getpass import getuser from getpass import getuser
from functools import partial
import mhcnames import mhcnames
import numpy import numpy
import pandas import pandas
import tqdm # progress bars
from numpy.testing import assert_equal from numpy.testing import assert_equal
from six import string_types from six import string_types
...@@ -25,6 +23,7 @@ from .encodable_sequences import EncodableSequences ...@@ -25,6 +23,7 @@ from .encodable_sequences import EncodableSequences
from .percent_rank_transform import PercentRankTransform from .percent_rank_transform import PercentRankTransform
from .regression_target import to_ic50 from .regression_target import to_ic50
from .version import __version__ from .version import __version__
from .ensemble_centrality import CENTRALITY_MEASURES
class Class1AffinityPredictor(object): class Class1AffinityPredictor(object):
...@@ -672,7 +671,8 @@ class Class1AffinityPredictor(object): ...@@ -672,7 +671,8 @@ class Class1AffinityPredictor(object):
allele=None, allele=None,
throw=True, throw=True,
include_individual_model_predictions=False, include_individual_model_predictions=False,
include_percentile_ranks=True): include_percentile_ranks=True,
centrality_measure="robust_mean"):
""" """
Predict nM binding affinities. Gives more detailed output than `predict` Predict nM binding affinities. Gives more detailed output than `predict`
method, including 5-95% prediction intervals. method, including 5-95% prediction intervals.
...@@ -701,6 +701,9 @@ class Class1AffinityPredictor(object): ...@@ -701,6 +701,9 @@ class Class1AffinityPredictor(object):
If True, a "prediction_percentile" column will be included giving the If True, a "prediction_percentile" column will be included giving the
percentile ranks. If no percentile rank information is available, percentile ranks. If no percentile rank information is available,
this will be ignored with a warning. this will be ignored with a warning.
centrality_measure : string or callable
Measure of central tendency to use to combine predictions in the
ensemble.
Returns Returns
------- -------
...@@ -817,9 +820,15 @@ class Class1AffinityPredictor(object): ...@@ -817,9 +820,15 @@ class Class1AffinityPredictor(object):
df_predictions = df[ df_predictions = df[
[c for c in df.columns if c.startswith("model_")] [c for c in df.columns if c.startswith("model_")]
] ]
if callable(centrality_measure):
centrality_function = centrality_measure
else:
centrality_function = CENTRALITY_MEASURES[centrality_measure]
logs = numpy.log(df_predictions) logs = numpy.log(df_predictions)
log_means = logs.mean(1) log_centers = centrality_function(logs.values)
df["prediction"] = numpy.exp(log_means) df["prediction"] = numpy.exp(log_centers)
df["prediction_low"] = numpy.exp(logs.quantile(0.05, axis=1)) df["prediction_low"] = numpy.exp(logs.quantile(0.05, axis=1))
df["prediction_high"] = numpy.exp(logs.quantile(0.95, axis=1)) df["prediction_high"] = numpy.exp(logs.quantile(0.95, axis=1))
......
"""
Measures of centrality (e.g. mean) used to combine predictions across an
ensemble. The inputs to these functions are log affinities, and they are expected
to return a centrality measure also in log-space.
"""
import numpy
from functools import partial
def robust_mean(log_values):
    """
    Mean of values falling within the 25-75 percentiles.

    NaN entries are ignored both when computing the percentiles and when
    averaging. A row with no non-NaN values yields NaN.

    Parameters
    ----------
    log_values : 2-d numpy.array
        Center is computed along the second axis (i.e. per row).

    Returns
    -------
    center : numpy.array of length log_values.shape[0]
    """
    log_values = numpy.asarray(log_values)
    if log_values.shape[1] <= 3:
        # Too few values to use robust mean.
        return numpy.nanmean(log_values, axis=1)

    # One call gives an array of shape (2, n_rows): the per-row 25th and
    # 75th percentiles.
    lower, upper = numpy.nanpercentile(log_values, [25, 75], axis=1)

    # Comparisons against NaN are False, so NaN entries never enter the mask.
    mask = (log_values >= lower[:, None]) & (log_values <= upper[:, None])

    # Select with `where` rather than multiplying by the mask: NaN * 0 is NaN,
    # which would make the whole row's sum NaN even though the entry is
    # excluded by the mask.
    selected_sum = numpy.where(mask, log_values, 0.0).sum(axis=1)
    return selected_sum / mask.sum(axis=1)
# Registry of the available centrality measures, keyed by name. Every value is
# a callable that takes a 2-d array of log affinities and reduces it along
# axis 1 (one result per row).
CENTRALITY_MEASURES = dict(
    mean=partial(numpy.nanmean, axis=1),
    median=partial(numpy.nanmedian, axis=1),
    robust_mean=robust_mean,
)
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment