Commit 1a7264ba authored by Tim O'Donnell

First cut at model selection and training on data with inequalities

parent aea2f493
@@ -87,15 +87,16 @@ some models.
>>> single_allele_train_data = df.loc[df.allele == "HLA-B*57:01"].sample(100)
>>> new_predictor.fit_allele_specific_predictors(
... n_models=1,
... architecture_hyperparameters={
... architecture_hyperparameters_list=[{
... "layer_sizes": [16],
... "max_epochs": 5,
... "random_negative_constant": 5,
... },
... }],
... peptides=single_allele_train_data.peptide.values,
... affinities=single_allele_train_data.measurement_value.values,
... allele="HLA-B*57:01")
The `~mhcflurry.Class1AffinityPredictor.fit_allele_specific_predictors` method
can be called any number of times on the same instance to build up ensembles
of models across alleles. The `architecture_hyperparameters_list` we specified are
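
When more than one dict is given in `architecture_hyperparameters_list`, each
model slot trains one network per architecture and keeps the network with the
lowest final validation loss, so model selection requires a validation split
(otherwise no val_loss is recorded). A sketch with illustrative
hyperparameter values:

>>> new_predictor.fit_allele_specific_predictors(
...     n_models=1,
...     architecture_hyperparameters_list=[
...         {"layer_sizes": [4], "max_epochs": 5, "random_negative_constant": 5},
...         {"layer_sizes": [16], "max_epochs": 5, "random_negative_constant": 5},
...     ],
...     peptides=single_allele_train_data.peptide.values,
...     affinities=single_allele_train_data.measurement_value.values,
...     allele="HLA-B*57:01")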
@@ -362,10 +362,11 @@ class Class1AffinityPredictor(object):
def fit_allele_specific_predictors(
self,
n_models,
architecture_hyperparameters,
architecture_hyperparameters_list,
allele,
peptides,
affinities,
inequalities=None,
models_dir_for_save=None,
verbose=1,
progress_preamble=""):
@@ -381,7 +382,7 @@ class Class1AffinityPredictor(object):
n_models : int
Number of neural networks to fit
architecture_hyperparameters : dict
architecture_hyperparameters_list : list of dict
allele : string
@@ -389,6 +390,9 @@ class Class1AffinityPredictor(object):
affinities : list of float
nM affinities
inequalities : list of string, each element one of ">", "<", or "="
See Class1NeuralNetwork.fit for details.
models_dir_for_save : string, optional
If specified, the Class1AffinityPredictor is (incrementally) written
@@ -406,35 +410,81 @@ class Class1AffinityPredictor(object):
"""
allele = mhcnames.normalize_allele_name(allele)
models = self._fit_predictors(
n_models=n_models,
architecture_hyperparameters=architecture_hyperparameters,
peptides=peptides,
affinities=affinities,
allele_pseudosequences=None,
verbose=verbose,
progress_preamble=progress_preamble)
if allele not in self.allele_to_allele_specific_models:
self.allele_to_allele_specific_models[allele] = []
models_list = []
for (i, model) in enumerate(models):
model_name = self.model_name(allele, i)
models_list.append(model) # models is a generator
encodable_peptides = EncodableSequences.create(peptides)
n_architectures = len(architecture_hyperparameters_list)
if n_models > 1 or n_architectures > 1:
# Adjust progress info to indicate number of models and
# architectures.
pieces = []
if n_models > 1:
pieces.append("Model {model_num:2d} / {n_models:2d}")
if n_architectures > 1:
pieces.append(
"Architecture {architecture_num:2d} / {n_architectures:2d}"
" (best so far: {best_num:2d)")
progress_preamble_template = "[ %s ] {user_progress_preamble}" % (
", ".join(pieces))
else:
# Just use the user-provided progress message.
progress_preamble_template = "{user_progress_preamble}"
models = []
for model_num in range(n_models):
shuffle_permutation = numpy.random.permutation(len(affinities))
best_num = None
best_loss = None
best_model = None
for (architecture_num, architecture_hyperparameters) in enumerate(
architecture_hyperparameters_list):
model = Class1NeuralNetwork(**architecture_hyperparameters)
model.fit(
encodable_peptides,
affinities,
shuffle_permutation=shuffle_permutation,
inequalities=inequalities,
verbose=verbose,
progress_preamble=progress_preamble_template.format(
user_progress_preamble=progress_preamble,
best_num=best_num,
model_num=model_num,
n_models=n_models,
architecture_num=architecture_num,
n_architectures=n_architectures))
if n_architectures > 1:
# We require val_loss (i.e. a validation set) if we have
# multiple architectures.
loss = model.loss_history['val_loss'][-1]
else:
loss = None
if loss is None or best_loss is None or best_loss > loss:
                    best_loss = loss
best_num = architecture_num
best_model = model
del model
model_name = self.model_name(allele, model_num)
row = pandas.Series(collections.OrderedDict([
("model_name", model_name),
("allele", allele),
("config_json", json.dumps(model.get_config())),
("model", model),
("config_json", json.dumps(best_model.get_config())),
("model", best_model),
])).to_frame().T
self.manifest_df = pandas.concat(
[self.manifest_df, row], ignore_index=True)
self.allele_to_allele_specific_models[allele].append(model)
self.allele_to_allele_specific_models[allele].append(best_model)
if models_dir_for_save:
self.save(
models_dir_for_save, model_names_to_write=[model_name])
return models_list
models.append(best_model)
return models
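
For illustration (commentary, not part of the commit), here is a sketch of a
call exercising both new features: model selection across architectures and
inequality-aware fitting. The allele, peptides, affinities, and hyperparameter
values are hypothetical; inequality support requires a loss registered in
LOSSES, e.g. "mse_with_ineqalities":

predictor = Class1AffinityPredictor()
predictor.fit_allele_specific_predictors(
    n_models=1,
    architecture_hyperparameters_list=[
        # Hypothetical architectures; for each of the n_models slots, the
        # one with the lowest final val_loss is kept.
        {"layer_sizes": [8], "max_epochs": 5, "loss": "mse_with_ineqalities"},
        {"layer_sizes": [16], "max_epochs": 5, "loss": "mse_with_ineqalities"},
    ],
    allele="HLA-A*02:01",
    peptides=["SIINFEKLL", "LLFGYPVYV", "AAAWYLWEV"],
    affinities=[20000.0, 9.0, 25.0],
    # ">" means the measured IC50 exceeds 20000 nM (a non-binder).
    inequalities=[">", "=", "="])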
def fit_class1_pan_allele_models(
self,
@@ -486,19 +536,19 @@ class Class1AffinityPredictor(object):
alleles = pandas.Series(alleles).map(mhcnames.normalize_allele_name)
allele_pseudosequences = alleles.map(self.allele_to_pseudosequence)
models = self._fit_predictors(
n_models=n_models,
architecture_hyperparameters=architecture_hyperparameters,
peptides=peptides,
affinities=affinities,
allele_pseudosequences=allele_pseudosequences,
verbose=verbose,
progress_preamble=progress_preamble)
encodable_peptides = EncodableSequences.create(peptides)
models = []
for i in range(n_models):
logging.info("Training model %d / %d" % (i + 1, n_models))
model = Class1NeuralNetwork(**architecture_hyperparameters)
model.fit(
encodable_peptides,
affinities,
allele_pseudosequences=allele_pseudosequences,
verbose=verbose,
progress_preamble=progress_preamble)
models_list = []
for (i, model) in enumerate(models):
model_name = self.model_name("pan-class1", i)
models_list.append(model) # models is a generator
self.class1_pan_allele_models.append(model)
row = pandas.Series(collections.OrderedDict([
("model_name", model_name),
@@ -511,46 +561,9 @@ class Class1AffinityPredictor(object):
if models_dir_for_save:
self.save(
models_dir_for_save, model_names_to_write=[model_name])
            models.append(model)

        return models_list
        return models

    def _fit_predictors(
            self,
            n_models,
            architecture_hyperparameters,
            peptides,
            affinities,
            allele_pseudosequences,
            verbose=1,
            progress_preamble=""):
        """
        Private helper method

        Parameters
        ----------
        n_models : int
        architecture_hyperparameters : dict
        peptides : EncodableSequences or list of string
        affinities : list of float
        allele_pseudosequences : EncodableSequences or list of string
        verbose : int
        progress_preamble : string
            Optional string of information to include in each progress update

        Returns
        -------
        generator of `Class1NeuralNetwork`
        """
        encodable_peptides = EncodableSequences.create(peptides)
        for i in range(n_models):
            logging.info("Training model %d / %d" % (i + 1, n_models))
            model = Class1NeuralNetwork(**architecture_hyperparameters)
            model.fit(
                encodable_peptides,
                affinities,
                allele_pseudosequences=allele_pseudosequences,
                verbose=verbose,
                progress_preamble=progress_preamble)
            yield model
def calibrate_percentile_ranks(
self,
@@ -5,12 +5,13 @@ import logging
import numpy
import pandas
from mhcflurry.hyperparameters import HyperparameterDefaults
from .hyperparameters import HyperparameterDefaults
from mhcflurry.encodable_sequences import EncodableSequences
from mhcflurry.amino_acid import available_vector_encodings, vector_encoding_length
from mhcflurry.regression_target import to_ic50, from_ic50
from mhcflurry.common import random_peptides, amino_acid_distribution
from .encodable_sequences import EncodableSequences
from .amino_acid import available_vector_encodings, vector_encoding_length
from .regression_target import to_ic50, from_ic50
from .common import random_peptides, amino_acid_distribution
from .loss_with_inequalities import encode_y, LOSSES
class Class1NeuralNetwork(object):
@@ -332,7 +333,9 @@ class Class1NeuralNetwork(object):
peptides,
affinities,
allele_pseudosequences=None,
inequalities=None,
sample_weights=None,
shuffle_permutation=None,
verbose=1,
progress_preamble=""):
"""
@@ -347,12 +350,24 @@ class Class1NeuralNetwork(object):
allele_pseudosequences : EncodableSequences or list of string, optional
If not specified, the model will be a single-allele predictor.
inequalities : list of string, each element one of ">", "<", or "=".
            Inequalities to use for fitting. Same length as affinities.
            For example, a ">" trains with the constraint y_pred > y_true
            for that element of the training set. Requires a custom loss
            that supports inequalities (e.g. mse_with_ineqalities).
            If None, all inequalities are taken to be "=".
sample_weights : list of float, optional
If not specified, all samples (including random negatives added
during training) will have equal weight. If specified, the random
negatives will be assigned weight=1.0.
shuffle_permutation : list of int, optional
            Permutation (integer list) of the same length as peptides and
            affinities. If None, a random permutation will be generated.
verbose : int
Keras verbosity level
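
To make the direction flip in the implementation below concrete (commentary,
not part of the commit): mhcflurry regresses on from_ic50-transformed targets,
and the transform is decreasing in IC50, so measurement-space inequalities
reverse in target space. A sketch, assuming from_ic50's usual form, roughly
1 - log(ic50)/log(50000) clipped to [0, 1]:

import numpy
from mhcflurry.regression_target import from_ic50

# Tighter binders get higher regression targets:
print(from_ic50(numpy.array([100.0, 5000.0])))  # approx. [0.57, 0.21]

# A measurement "> 5000 nM" (true IC50 weaker than 5000 nM) therefore
# upper-bounds the target: y < from_ic50(5000). Hence fit() maps ">" to
# "<" (and vice versa) before encoding the targets.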
@@ -391,6 +406,18 @@ class Class1NeuralNetwork(object):
y_values = from_ic50(affinities)
assert numpy.isnan(y_values).sum() == 0, numpy.isnan(y_values).sum()
if inequalities is not None:
# Reverse inequalities because from_ic50() flips the direction
# (i.e. lower affinity results in higher y values).
adjusted_inequalities = pandas.Series(inequalities).map({
"=": "=",
">": "<",
"<": ">",
}).values
else:
adjusted_inequalities = numpy.tile("=", len(y_values))
if len(adjusted_inequalities) != len(y_values):
raise ValueError("Inequalities and y_values must have same length")
x_dict_without_random_negatives = {
'peptide': peptide_encoding,
@@ -406,34 +433,70 @@ class Class1NeuralNetwork(object):
        # Shuffle y_values and the contents of x_dict_without_random_negatives.
        # This ensures that different data lands in the early-stopping
        # validation set when multiple models are trained.
shuffle_permutation = numpy.random.permutation(len(y_values))
if shuffle_permutation is None:
shuffle_permutation = numpy.random.permutation(len(y_values))
y_values = y_values[shuffle_permutation]
peptide_encoding = peptide_encoding[shuffle_permutation]
adjusted_inequalities = adjusted_inequalities[shuffle_permutation]
for key in x_dict_without_random_negatives:
x_dict_without_random_negatives[key] = (
x_dict_without_random_negatives[key][shuffle_permutation])
if sample_weights is not None:
sample_weights = sample_weights[shuffle_permutation]
if self.hyperparameters['loss'] in LOSSES:
# Using a custom loss that supports inequalities
loss_name_or_function = LOSSES[self.hyperparameters['loss']]
loss_supports_inequalities = True
else:
# Using a regular keras loss. No inequalities supported.
loss_name_or_function = self.hyperparameters['loss']
loss_supports_inequalities = False
if any(inequality != "=" for inequality in adjusted_inequalities):
raise ValueError("Loss %s does not support inequalities" % (
loss_name_or_function))
if self.network() is None:
self._network = self.make_network(
pseudosequence_length=pseudosequence_length,
**self.network_hyperparameter_defaults.subselect(
self.hyperparameters))
self.compile()
y_dict_with_random_negatives = {
"output": numpy.concatenate([
from_ic50(
numpy.random.uniform(
self.hyperparameters[
'random_negative_affinity_min'],
self.hyperparameters[
'random_negative_affinity_max'],
int(num_random_negative.sum()))),
y_values,
]),
}
self.network().compile(
loss=loss_name_or_function,
optimizer=self.hyperparameters['optimizer'])
if loss_supports_inequalities:
# Do not sample negative affinities: just use an inequality.
random_negative_ic50 = self.hyperparameters['random_negative_affinity_min']
random_negative_target = from_ic50(random_negative_ic50)
y_dict_with_random_negatives = {
"output": numpy.concatenate([
numpy.tile(
random_negative_target, int(num_random_negative.sum())),
y_values,
]),
}
# Note: we are using "<" here not ">" because the inequalities are
# now in target-space (0-1) not affinity-space.
adjusted_inequalities_with_random_negatives = (
["<"] * int(num_random_negative.sum()) +
list(adjusted_inequalities))
else:
# Randomly sample random negative affinities
y_dict_with_random_negatives = {
"output": numpy.concatenate([
from_ic50(
numpy.random.uniform(
self.hyperparameters[
'random_negative_affinity_min'],
self.hyperparameters[
'random_negative_affinity_max'],
int(num_random_negative.sum()))),
y_values,
]),
}
if sample_weights is not None:
sample_weights_with_random_negatives = numpy.concatenate([
numpy.ones(int(num_random_negative.sum())),
@@ -441,6 +504,11 @@ class Class1NeuralNetwork(object):
else:
sample_weights_with_random_negatives = None
if loss_supports_inequalities:
y_dict_with_random_negatives['output'] = encode_y(
y_dict_with_random_negatives['output'],
adjusted_inequalities_with_random_negatives)
val_losses = []
min_val_loss_iteration = None
min_val_loss = None
@@ -552,14 +620,6 @@ class Class1NeuralNetwork(object):
        predictions = numpy.array(raw_predictions, dtype="float64")[:, 0]
return to_ic50(predictions)
def compile(self):
"""
Compile the keras model. Used internally.
"""
self.network().compile(
**self.compile_hyperparameter_defaults.subselect(
self.hyperparameters))
@staticmethod
def make_network(
pseudosequence_length,
"""
Custom loss functions.
Supports training a regressor on data that includes inequalities (e.g. x < 100).
This loss assumes that the normal range for y_true and y_pred is 0 - 1. As a
hack, the implementation uses other intervals for y_pred to encode the
inequality information.
y_true is interpreted as follows:
between 0 - 1
Regular MSE loss is used. Penality (y_pred - y_true)**2 is applied if
y_pred is greater or less than y_true.
between 2 - 3:
Treated as a "<" inequality. Penality (y_pred - (y_true - 2))**2 is
applied only if y_pred is greater than y_true - 2.
between 4 - 5:
Treated as a ">" inequality. Penality (y_pred - (y_true - 4))**2 is
applied only if y_pred is less than y_true - 4.
"""
from keras import backend as K
import pandas
import numpy
LOSSES = {}
def encode_y(y, inequalities=None):
    y = numpy.array(y, dtype="float32")
    if numpy.isnan(y).any():
        raise ValueError("y contains NaN")
    if (y > 1.0).any():
        raise ValueError("y contains values > 1.0")
    if (y < 0.0).any():
        raise ValueError("y contains values < 0.0")

    if inequalities is None:
        encoded = y
    else:
        offsets = pandas.Series(inequalities).map({
            '=': 0,
            '>': 2,
            '<': 4,
        }).values
        if numpy.isnan(offsets).any():
            raise ValueError("Invalid inequality. Must be =, <, or >")
        encoded = y + offsets
    assert not numpy.isnan(encoded).any()
    return encoded
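
A quick numeric illustration (commentary, not part of the commit): encode_y
packs the inequality into the target value itself via the interval offsets
above.

print(encode_y([0.9, 0.1, 0.2], inequalities=["=", ">", "<"]))
# -> approximately [0.9, 2.1, 4.2]: "=" targets are unchanged, ">" targets
# land in 2 - 3, and "<" targets land in 4 - 5.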
def mse_with_ineqalities(y_true, y_pred):
# Handle (=) inequalities
diff1 = y_pred - y_true
diff1 *= K.cast(y_true >= 0.0, "float32")
diff1 *= K.cast(y_true <= 1.0, "float32")
# Handle (>) inequalities
diff2 = y_pred - (y_true - 2.0)
diff2 *= K.cast(y_true >= 2.0, "float32")
diff2 *= K.cast(y_true <= 3.0, "float32")
diff2 *= K.cast(diff2 < 0.0, "float32")
# Handle (<) inequalities
diff3 = y_pred - (y_true - 4.0)
diff3 *= K.cast(y_true >= 4.0, "float32")
diff3 *= K.cast(diff3 > 0.0, "float32")
return (
K.sum(K.square(diff1), axis=-1) +
K.sum(K.square(diff2), axis=-1) +
K.sum(K.square(diff3), axis=-1))
LOSSES["mse_with_ineqalities"] = mse_with_ineqalities
\ No newline at end of file
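
For reference, a minimal standalone sketch of using this loss with Keras (the
toy model and data are hypothetical; Class1NeuralNetwork.fit does the
equivalent by passing LOSSES[...] to network().compile):

import numpy
from keras.models import Sequential
from keras.layers import Dense

# Toy one-output regressor: the sigmoid keeps y_pred in 0 - 1, while the
# encoded y_true may lie in 2 - 3 or 4 - 5 to carry the inequality.
model = Sequential([Dense(1, activation="sigmoid", input_shape=(10,))])
model.compile(loss=LOSSES["mse_with_ineqalities"], optimizer="rmsprop")

x = numpy.random.rand(32, 10)
y = encode_y(
    numpy.random.rand(32),
    inequalities=["="] * 16 + [">"] * 8 + ["<"] * 8)
model.fit(x, y, epochs=1, verbose=0)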
@@ -259,7 +259,7 @@ def process_work(
train_data = data.sample(frac=1.0)
(model,) = predictor.fit_allele_specific_predictors(
n_models=1,
architecture_hyperparameters=hyperparameters,
architecture_hyperparameters_list=[hyperparameters],
allele=allele,
peptides=train_data.peptide.values,
affinities=train_data.measurement_value.values,
@@ -92,7 +92,7 @@ def test_a1_known_epitopes_in_newly_trained_model():
predictor = Class1AffinityPredictor()
predictor.fit_allele_specific_predictors(
n_models=2,
architecture_hyperparameters=hyperparameters,
architecture_hyperparameters_list=[hyperparameters],
allele=allele,
peptides=df.peptide.values,
affinities=df.measurement_value.values,
@@ -153,7 +153,7 @@ def test_class1_affinity_predictor_a0205_memorize_training_data():
predictor = Class1AffinityPredictor()
predictor.fit_allele_specific_predictors(
n_models=2,
architecture_hyperparameters=hyperparameters,
architecture_hyperparameters_list=[hyperparameters],
allele=allele,
peptides=df.peptide.values,
affinities=df.measurement_value.values,