diff --git a/downloads-generation/models_class1/GENERATE.sh b/downloads-generation/models_class1/GENERATE.sh
index cf77bcf77d667e7b9266d02b6516bb4fe9015964..79a48cf7ce151913f8648043276efd56c4442b40 100755
--- a/downloads-generation/models_class1/GENERATE.sh
+++ b/downloads-generation/models_class1/GENERATE.sh
@@ -29,7 +29,7 @@ cd $SCRATCH_DIR/$DOWNLOAD_NAME
 
 mkdir models
 
-cp $SCRIPT_DIR/hyperparameters.yaml .
+python $SCRIPT_DIR/generate_hyperparameters.py > hyperparameters.yaml
 
 time mhcflurry-class1-train-allele-specific-models \
     --data "$(mhcflurry-downloads path data_curated)/curated_training_data.with_mass_spec.csv.bz2" \
@@ -37,7 +37,7 @@ time mhcflurry-class1-train-allele-specific-models \
     --out-models-dir models \
     --percent-rank-calibration-num-peptides-per-length 1000000 \
     --min-measurements-per-allele 75 \
-    --num-jobs 0
+    --num-jobs 32
 
 cp $SCRIPT_ABSOLUTE_PATH .
 bzip2 LOG.txt
diff --git a/downloads-generation/models_class1/generate_hyperparameters.py b/downloads-generation/models_class1/generate_hyperparameters.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae249a95e4d8c0b223fc0aca6aba12f167ec2494
--- /dev/null
+++ b/downloads-generation/models_class1/generate_hyperparameters.py
@@ -0,0 +1,79 @@
+"""
+Generate grid of hyperparameters
+"""
+
+from sys import stdout
+from copy import deepcopy
+from yaml import dump
+
+base_hyperparameters = {
+    ##########################################
+    # ENSEMBLE SIZE
+    ##########################################
+    "n_models": 1,
+
+    ##########################################
+    # OPTIMIZATION
+    ##########################################
+    "max_epochs": 500,
+    "patience": 20,
+    "early_stopping": True,
+    "validation_split": 0.1,
+    "minibatch_size": 128,
+    "loss": "custom:mse_with_inequalities",
+
+    ##########################################
+    # RANDOM NEGATIVE PEPTIDES
+    ##########################################
+    "random_negative_rate": 0.2,
+    "random_negative_constant": 25,
+    "random_negative_affinity_min": 20000.0,
+    "random_negative_affinity_max": 50000.0,
+
+    ##########################################
+    # PEPTIDE REPRESENTATION
+    ##########################################
+    # One of "one-hot", "embedding", or "BLOSUM62".
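+    # (BLOSUM62 encodes each residue as its row of substitution scores, so
+    # biochemically similar amino acids get similar vectors; "one-hot" treats
+    # all residues as unrelated, and "embedding" learns a representation.)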
+ "peptide_amino_acid_encoding": "BLOSUM62", + "use_embedding": False, # maintained for backward compatability + "embedding_output_dim": 8, # only used if using embedding + "kmer_size": 15, + + ########################################## + # NEURAL NETWORK ARCHITECTURE + ########################################## + "locally_connected_layers": [ + { + "filters": 8, + "activation": "tanh", + "kernel_size": 3 + } + ], + "activation": "relu", + "output_activation": "sigmoid", + "layer_sizes": [16], + "dense_layer_l1_regularization": 0.001, + "batch_normalization": False, + "dropout_probability": 0.0, +} + +grid = [] +for dense_layer_size in [64, 16]: + for num_lc in [0, 1, 2]: + for lc_kernel_size in [3, 5]: + new = deepcopy(base_hyperparameters) + new["layer_sizes"] = [dense_layer_size] + (lc_layer,) = new["locally_connected_layers"] + lc_layer['kernel_size'] = lc_kernel_size + if num_lc == 0: + new["locally_connected_layers"] = [] + elif num_lc == 1: + new["locally_connected_layers"] = [lc_layer] + elif num_lc == 2: + new["locally_connected_layers"] = [lc_layer, deepcopy(lc_layer)] + grid.append(new) + +dump(grid, stdout) \ No newline at end of file diff --git a/downloads-generation/models_class1/hyperparameters.yaml b/downloads-generation/models_class1/hyperparameters.yaml deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/mhcflurry/class1_affinity_predictor.py b/mhcflurry/class1_affinity_predictor.py index fa2023eb9e8f661b3d31b78ce25f735bbb2aa505..46e8d080e5e48b0188e05d7e20015d9f4536c23b 100644 --- a/mhcflurry/class1_affinity_predictor.py +++ b/mhcflurry/class1_affinity_predictor.py @@ -9,12 +9,10 @@ from os.path import join, exists from os import mkdir from socket import gethostname from getpass import getuser -from functools import partial import mhcnames import numpy import pandas -import tqdm # progress bars from numpy.testing import assert_equal from six import string_types @@ -25,6 +23,7 @@ from .encodable_sequences import EncodableSequences from .percent_rank_transform import PercentRankTransform from .regression_target import to_ic50 from .version import __version__ +from .ensemble_centrality import CENTRALITY_MEASURES class Class1AffinityPredictor(object): @@ -672,7 +671,8 @@ class Class1AffinityPredictor(object): allele=None, throw=True, include_individual_model_predictions=False, - include_percentile_ranks=True): + include_percentile_ranks=True, + centrality_measure="robust_mean"): """ Predict nM binding affinities. Gives more detailed output than `predict` method, including 5-95% prediction intervals. @@ -701,6 +701,9 @@ class Class1AffinityPredictor(object): If True, a "prediction_percentile" column will be included giving the percentile ranks. If no percentile rank information is available, this will be ignored with a warning. + centrality_measure : string or callable + Measure of central tendency to use to combine predictions in the + ensemble. 
 
         Returns
         -------
@@ -817,9 +824,15 @@
             df_predictions = df[
                 [c for c in df.columns if c.startswith("model_")]
             ]
+
+            if callable(centrality_measure):
+                centrality_function = centrality_measure
+            else:
+                centrality_function = CENTRALITY_MEASURES[centrality_measure]
+
             logs = numpy.log(df_predictions)
-            log_means = logs.mean(1)
-            df["prediction"] = numpy.exp(log_means)
+            log_centers = centrality_function(logs.values)
+            df["prediction"] = numpy.exp(log_centers)
             df["prediction_low"] = numpy.exp(logs.quantile(0.05, axis=1))
             df["prediction_high"] = numpy.exp(logs.quantile(0.95, axis=1))
 
diff --git a/mhcflurry/ensemble_centrality.py b/mhcflurry/ensemble_centrality.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a812d7656b40099f8bec67e7a0d5832c9cc677f
--- /dev/null
+++ b/mhcflurry/ensemble_centrality.py
@@ -0,0 +1,37 @@
+"""
+Measures of centrality (e.g. mean) used to combine predictions across an
+ensemble. The inputs to these functions are log affinities, and they are
+expected to return a centrality measure, also in log-space.
+"""
+
+import numpy
+from functools import partial
+
+
+def robust_mean(log_values):
+    """
+    Mean of the values falling within the 25-75 percentiles.
+
+    Parameters
+    ----------
+    log_values : 2-d numpy.array
+        Center is computed along the second axis (i.e. per row).
+
+    Returns
+    -------
+    center : numpy.array of length log_values.shape[0]
+
+    """
+    if log_values.shape[1] <= 3:
+        # Too few values to use the robust mean; fall back on the plain mean.
+        return numpy.nanmean(log_values, axis=1)
+    mask = (
+        (log_values <= numpy.nanpercentile(log_values, 75, axis=1).reshape((-1, 1))) &
+        (log_values >= numpy.nanpercentile(log_values, 25, axis=1).reshape((-1, 1))))
+    return (log_values * mask.astype(float)).sum(1) / mask.sum(1)
+
+CENTRALITY_MEASURES = {
+    "mean": partial(numpy.nanmean, axis=1),
+    "median": partial(numpy.nanmedian, axis=1),
+    "robust_mean": robust_mean,
+}
\ No newline at end of file
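
Below is a minimal usage sketch of the centrality measures added above. It is illustrative only: the `predictor` in the final comment is a hypothetical trained Class1AffinityPredictor, while the rest runs standalone against the new mhcflurry.ensemble_centrality module.

import numpy

from mhcflurry.ensemble_centrality import CENTRALITY_MEASURES, robust_mean

# Log affinities for two peptides across a five-model ensemble; the first
# row contains one outlier prediction (5000 nM).
logs = numpy.log([
    [100.0, 120.0, 110.0, 5000.0, 105.0],
    [300.0, 310.0, 290.0, 305.0, 295.0],
])

# robust_mean averages only the values inside the 25-75 percentile band,
# so the outlier is excluded; the plain mean is pulled toward it.
print(numpy.exp(robust_mean(logs)))                  # ~[111., 300.]
print(numpy.exp(CENTRALITY_MEASURES["mean"](logs)))  # ~[233., 300.]

# With a trained predictor (hypothetical variable), the measure is chosen
# per call, e.g.:
#     predictor.predict_to_dataframe(
#         peptides=["SIINFEKL"], allele="HLA-A0201",
#         centrality_measure="median")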