diff --git a/docs/python_tutorial.rst b/docs/python_tutorial.rst
index f8a9379984b9fec2ce7a31a41ac60b29bb8c8881..9b3d8b882a91bea6899cb6583382d691f717c7e8 100644
--- a/docs/python_tutorial.rst
+++ b/docs/python_tutorial.rst
@@ -87,15 +87,16 @@ some models.
 >>> single_allele_train_data = df.loc[df.allele == "HLA-B*57:01"].sample(100)
 >>> new_predictor.fit_allele_specific_predictors(
 ...    n_models=1,
-...    architecture_hyperparameters={
+...    architecture_hyperparameters_list=[{
 ...        "layer_sizes": [16],
 ...        "max_epochs": 5,
 ...        "random_negative_constant": 5,
-...    },
+...    }],
 ...    peptides=single_allele_train_data.peptide.values,
 ...    affinities=single_allele_train_data.measurement_value.values,
 ...    allele="HLA-B*57:01")
+
 
 The `~mhcflurry.Class1AffinityPredictor.fit_allele_specific_predictors` method
 can be called any number of times on the same instance to build up ensembles
 of models across alleles. The `architecture_hyperparameters` we specified are
diff --git a/mhcflurry/class1_affinity_predictor.py b/mhcflurry/class1_affinity_predictor.py
index 1c523e35be470a8eecdf5ae218a9a110b29ec075..f56865ebf9efacb619791fde6b34db906199d045 100644
--- a/mhcflurry/class1_affinity_predictor.py
+++ b/mhcflurry/class1_affinity_predictor.py
@@ -362,10 +362,11 @@ class Class1AffinityPredictor(object):
     def fit_allele_specific_predictors(
             self,
             n_models,
-            architecture_hyperparameters,
+            architecture_hyperparameters_list,
             allele,
             peptides,
             affinities,
+            inequalities=None,
             models_dir_for_save=None,
             verbose=1,
             progress_preamble=""):
@@ -381,7 +382,7 @@
         n_models : int
             Number of neural networks to fit
 
-        architecture_hyperparameters : dict
+        architecture_hyperparameters_list : list of dict
 
         allele : string
 
@@ -389,6 +390,9 @@
 
         affinities : list of float
             nM affinities
+
+        inequalities : list of string, each element one of ">", "<", or "="
+            See Class1NeuralNetwork.fit for details.
 
         models_dir_for_save : string, optional
             If specified, the Class1AffinityPredictor is (incrementally) written
@@ -406,35 +410,81 @@
         """
         allele = mhcnames.normalize_allele_name(allele)
 
-        models = self._fit_predictors(
-            n_models=n_models,
-            architecture_hyperparameters=architecture_hyperparameters,
-            peptides=peptides,
-            affinities=affinities,
-            allele_pseudosequences=None,
-            verbose=verbose,
-            progress_preamble=progress_preamble)
-
         if allele not in self.allele_to_allele_specific_models:
             self.allele_to_allele_specific_models[allele] = []
 
-        models_list = []
-        for (i, model) in enumerate(models):
-            model_name = self.model_name(allele, i)
-            models_list.append(model)  # models is a generator
+        encodable_peptides = EncodableSequences.create(peptides)
+
+        n_architectures = len(architecture_hyperparameters_list)
+        if n_models > 1 or n_architectures > 1:
+            # Adjust progress info to indicate number of models and
+            # architectures.
+            pieces = []
+            if n_models > 1:
+                pieces.append("Model {model_num:2d} / {n_models:2d}")
+            if n_architectures > 1:
+                pieces.append(
+                    "Architecture {architecture_num:2d} / {n_architectures:2d}"
+                    " (best so far: {best_num})")
+            progress_preamble_template = "[ %s ] {user_progress_preamble}" % (
+                ", ".join(pieces))
+        else:
+            # Just use the user-provided progress message.
+            progress_preamble_template = "{user_progress_preamble}"
+
+        models = []
+        for model_num in range(n_models):
+            shuffle_permutation = numpy.random.permutation(len(affinities))
+
+            best_num = None
+            best_loss = None
+            best_model = None
+            for (architecture_num, architecture_hyperparameters) in enumerate(
+                    architecture_hyperparameters_list):
+                model = Class1NeuralNetwork(**architecture_hyperparameters)
+                model.fit(
+                    encodable_peptides,
+                    affinities,
+                    shuffle_permutation=shuffle_permutation,
+                    inequalities=inequalities,
+                    verbose=verbose,
+                    progress_preamble=progress_preamble_template.format(
+                        user_progress_preamble=progress_preamble,
+                        best_num=best_num,
+                        model_num=model_num,
+                        n_models=n_models,
+                        architecture_num=architecture_num,
+                        n_architectures=n_architectures))
+
+
+                if n_architectures > 1:
+                    # We require val_loss (i.e. a validation set) if we have
+                    # multiple architectures.
+                    loss = model.loss_history['val_loss'][-1]
+                else:
+                    loss = None
+                if loss is None or best_loss is None or best_loss > loss:
+                    best_loss = loss
+                    best_num = architecture_num
+                    best_model = model
+                del model
+
+            model_name = self.model_name(allele, model_num)
             row = pandas.Series(collections.OrderedDict([
                 ("model_name", model_name),
                 ("allele", allele),
-                ("config_json", json.dumps(model.get_config())),
-                ("model", model),
+                ("config_json", json.dumps(best_model.get_config())),
+                ("model", best_model),
             ])).to_frame().T
             self.manifest_df = pandas.concat(
                 [self.manifest_df, row], ignore_index=True)
-            self.allele_to_allele_specific_models[allele].append(model)
+            self.allele_to_allele_specific_models[allele].append(best_model)
             if models_dir_for_save:
                 self.save(
                     models_dir_for_save, model_names_to_write=[model_name])
-        return models_list
+            models.append(best_model)
+
+        return models
 
     def fit_class1_pan_allele_models(
             self,
@@ -486,19 +536,19 @@
         alleles = pandas.Series(alleles).map(mhcnames.normalize_allele_name)
         allele_pseudosequences = alleles.map(self.allele_to_pseudosequence)
 
-        models = self._fit_predictors(
-            n_models=n_models,
-            architecture_hyperparameters=architecture_hyperparameters,
-            peptides=peptides,
-            affinities=affinities,
-            allele_pseudosequences=allele_pseudosequences,
-            verbose=verbose,
-            progress_preamble=progress_preamble)
+        encodable_peptides = EncodableSequences.create(peptides)
+        models = []
+        for i in range(n_models):
+            logging.info("Training model %d / %d" % (i + 1, n_models))
+            model = Class1NeuralNetwork(**architecture_hyperparameters)
+            model.fit(
+                encodable_peptides,
+                affinities,
+                allele_pseudosequences=allele_pseudosequences,
+                verbose=verbose,
+                progress_preamble=progress_preamble)
 
-        models_list = []
-        for (i, model) in enumerate(models):
             model_name = self.model_name("pan-class1", i)
-            models_list.append(model)  # models is a generator
             self.class1_pan_allele_models.append(model)
             row = pandas.Series(collections.OrderedDict([
                 ("model_name", model_name),
@@ -511,46 +561,9 @@
             if models_dir_for_save:
                 self.save(
                     models_dir_for_save, model_names_to_write=[model_name])
-        return models_list
-
-    def _fit_predictors(
-            self,
-            n_models,
-            architecture_hyperparameters,
-            peptides,
-            affinities,
-            allele_pseudosequences,
-            verbose=1,
-            progress_preamble = ""):
-        """
-        Private helper method
-
-        Parameters
-        ----------
-        n_models : int
-        architecture_hyperparameters : dict
-        peptides : EncodableSequences or list of string
-        affinities : list of float
-        allele_pseudosequences : EncodableSequences or list of string
-        verbose : int
-        progress_preamble : string
-            Optional string of information to include in each progress update
+            models.append(model)
 
-        Returns
-        -------
-        generator of `Class1NeuralNetwork`
-        """
-        encodable_peptides = EncodableSequences.create(peptides)
-        for i in range(n_models):
-            logging.info("Training model %d / %d" % (i + 1, n_models))
-            model = Class1NeuralNetwork(**architecture_hyperparameters)
-            model.fit(
-                encodable_peptides,
-                affinities,
-                allele_pseudosequences=allele_pseudosequences,
-                verbose=verbose,
-                progress_preamble=progress_preamble)
-            yield model
+        return models
 
     def calibrate_percentile_ranks(
             self,
diff --git a/mhcflurry/class1_neural_network.py b/mhcflurry/class1_neural_network.py
index eafad7c28f422f9c753613b680a1aceaf7a6ae2d..c8e674b2ab211df1417c76593f607bcd91190cbc 100644
--- a/mhcflurry/class1_neural_network.py
+++ b/mhcflurry/class1_neural_network.py
@@ -5,12 +5,13 @@ import logging
 
 import numpy
 import pandas
 
-from mhcflurry.hyperparameters import HyperparameterDefaults
+from .hyperparameters import HyperparameterDefaults
 
-from mhcflurry.encodable_sequences import EncodableSequences
-from mhcflurry.amino_acid import available_vector_encodings, vector_encoding_length
-from mhcflurry.regression_target import to_ic50, from_ic50
-from mhcflurry.common import random_peptides, amino_acid_distribution
+from .encodable_sequences import EncodableSequences
+from .amino_acid import available_vector_encodings, vector_encoding_length
+from .regression_target import to_ic50, from_ic50
+from .common import random_peptides, amino_acid_distribution
+from .loss_with_inequalities import encode_y, LOSSES
 
 class Class1NeuralNetwork(object):
@@ -332,7 +333,9 @@
             peptides,
             affinities,
             allele_pseudosequences=None,
+            inequalities=None,
             sample_weights=None,
+            shuffle_permutation=None,
             verbose=1,
             progress_preamble=""):
         """
@@ -347,12 +350,24 @@
 
         allele_pseudosequences : EncodableSequences or list of string, optional
             If not specified, the model will be a single-allele predictor.
+
+        inequalities : list of string, each element one of ">", "<", or "=".
+            Inequalities to use for fitting. Same length as affinities.
+            For example, a ">" will train on y_pred > y_true for that element
+            in the training set.
+            Requires using a custom loss that supports inequalities (e.g.
+            mse_with_ineqalities).
+            If None, all inequalities are taken to be "=".
 
         sample_weights : list of float, optional
             If not specified, all samples (including random negatives added
             during training) will have equal weight. If specified, the random
             negatives will be assigned weight=1.0.
-
+
+        shuffle_permutation : list of int, optional
+            Permutation (integer list) of same length as peptides and affinities.
+            If None, then a random permutation will be generated.
+
         verbose : int
             Keras verbosity level
 
@@ -391,6 +406,18 @@
 
         y_values = from_ic50(affinities)
         assert numpy.isnan(y_values).sum() == 0, numpy.isnan(y_values).sum()
+        if inequalities is not None:
+            # Reverse inequalities because from_ic50() flips the direction
+            # (i.e. lower affinity results in higher y values).
+            adjusted_inequalities = pandas.Series(inequalities).map({
+                "=": "=",
+                ">": "<",
+                "<": ">",
+            }).values
+        else:
+            adjusted_inequalities = numpy.tile("=", len(y_values))
+        if len(adjusted_inequalities) != len(y_values):
+            raise ValueError("Inequalities and y_values must have same length")
 
         x_dict_without_random_negatives = {
             'peptide': peptide_encoding,
@@ -406,34 +433,70 @@
         # Shuffle y_values and the contents of x_dict_without_random_negatives
         # This ensures different data is used for the test set for early stopping
         # when multiple models are trained.
-        shuffle_permutation = numpy.random.permutation(len(y_values))
+        if shuffle_permutation is None:
+            shuffle_permutation = numpy.random.permutation(len(y_values))
         y_values = y_values[shuffle_permutation]
         peptide_encoding = peptide_encoding[shuffle_permutation]
+        adjusted_inequalities = adjusted_inequalities[shuffle_permutation]
         for key in x_dict_without_random_negatives:
             x_dict_without_random_negatives[key] = (
                 x_dict_without_random_negatives[key][shuffle_permutation])
         if sample_weights is not None:
             sample_weights = sample_weights[shuffle_permutation]
+        if self.hyperparameters['loss'] in LOSSES:
+            # Using a custom loss that supports inequalities
+            loss_name_or_function = LOSSES[self.hyperparameters['loss']]
+            loss_supports_inequalities = True
+        else:
+            # Using a regular keras loss. No inequalities supported.
+            loss_name_or_function = self.hyperparameters['loss']
+            loss_supports_inequalities = False
+
+            if any(inequality != "=" for inequality in adjusted_inequalities):
+                raise ValueError("Loss %s does not support inequalities" % (
+                    loss_name_or_function))
 
         if self.network() is None:
             self._network = self.make_network(
                 pseudosequence_length=pseudosequence_length,
                 **self.network_hyperparameter_defaults.subselect(
                     self.hyperparameters))
-            self.compile()
-
-        y_dict_with_random_negatives = {
-            "output": numpy.concatenate([
-                from_ic50(
-                    numpy.random.uniform(
-                        self.hyperparameters[
-                            'random_negative_affinity_min'],
-                        self.hyperparameters[
-                            'random_negative_affinity_max'],
-                        int(num_random_negative.sum()))),
-                y_values,
-            ]),
-        }
+            self.network().compile(
+                loss=loss_name_or_function,
+                optimizer=self.hyperparameters['optimizer'])
+
+        if loss_supports_inequalities:
+            # Do not sample negative affinities: just use an inequality.
+            random_negative_ic50 = self.hyperparameters['random_negative_affinity_min']
+            random_negative_target = from_ic50(random_negative_ic50)
+
+            y_dict_with_random_negatives = {
+                "output": numpy.concatenate([
+                    numpy.tile(
+                        random_negative_target, int(num_random_negative.sum())),
+                    y_values,
+                ]),
+            }
+            # Note: we are using "<" here not ">" because the inequalities are
+            # now in target-space (0-1) not affinity-space.
+            adjusted_inequalities_with_random_negatives = (
+                ["<"] * int(num_random_negative.sum()) +
+                list(adjusted_inequalities))
+        else:
+            # Randomly sample random negative affinities
+            y_dict_with_random_negatives = {
+                "output": numpy.concatenate([
+                    from_ic50(
+                        numpy.random.uniform(
+                            self.hyperparameters[
+                                'random_negative_affinity_min'],
+                            self.hyperparameters[
+                                'random_negative_affinity_max'],
+                            int(num_random_negative.sum()))),
+                    y_values,
+                ]),
+            }
+
         if sample_weights is not None:
             sample_weights_with_random_negatives = numpy.concatenate([
                 numpy.ones(int(num_random_negative.sum())),
@@ -441,6 +504,11 @@
         else:
             sample_weights_with_random_negatives = None
 
+        if loss_supports_inequalities:
+            y_dict_with_random_negatives['output'] = encode_y(
+                y_dict_with_random_negatives['output'],
+                adjusted_inequalities_with_random_negatives)
+
         val_losses = []
         min_val_loss_iteration = None
         min_val_loss = None
@@ -552,14 +620,6 @@
         predictions = numpy.array(raw_predictions, dtype = "float64")[:,0]
         return to_ic50(predictions)
 
-    def compile(self):
-        """
-        Compile the keras model. Used internally.
-        """
-        self.network().compile(
-            **self.compile_hyperparameter_defaults.subselect(
-                self.hyperparameters))
-
     @staticmethod
     def make_network(
             pseudosequence_length,
diff --git a/mhcflurry/loss_with_inequalities.py b/mhcflurry/loss_with_inequalities.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1efe0682b28b7c62028a5f4bdaf0d581ef7cb39
--- /dev/null
+++ b/mhcflurry/loss_with_inequalities.py
@@ -0,0 +1,78 @@
+"""
+Custom loss functions.
+
+Supports training a regressor on data that includes inequalities (e.g. x < 100).
+
+This loss assumes that the normal range for y_true and y_pred is 0 - 1. As a
+hack, the implementation uses other intervals for y_true to encode the
+inequality information.
+
+y_true is interpreted as follows:
+
+between 0 - 1:
+    Regular MSE loss is used. Penalty (y_pred - y_true)**2 is applied if
+    y_pred is greater or less than y_true.
+
+between 2 - 3:
+    Treated as a ">" inequality. Penalty (y_pred - (y_true - 2))**2 is
+    applied only if y_pred is less than y_true - 2.
+
+between 4 - 5:
+    Treated as a "<" inequality. Penalty (y_pred - (y_true - 4))**2 is
+    applied only if y_pred is greater than y_true - 4.
+"""
+
+from keras import backend as K
+
+import pandas
+import numpy
+
+LOSSES = {}
+
+
+def encode_y(y, inequalities=None):
+    y = numpy.array(y, dtype="float32")
+    if numpy.isnan(y).any():
+        raise ValueError("y contains NaN")
+    if (y > 1.0).any():
+        raise ValueError("y contains values > 1.0")
+    if (y < 0.0).any():
+        raise ValueError("y contains values < 0.0")
+
+    if inequalities is None:
+        encoded = y
+    else:
+        offsets = pandas.Series(inequalities).map({
+            '=': 0,
+            '>': 2,
+            '<': 4,
+        }).values
+        if numpy.isnan(offsets).any():
+            raise ValueError("Invalid inequality. Must be =, <, or >")
Must be =, <, or >") + encoded = y + offsets + assert not encoded.isnan().any() + return encoded + + +def mse_with_ineqalities(y_true, y_pred): + # Handle (=) inequalities + diff1 = y_pred - y_true + diff1 *= K.cast(y_true >= 0.0, "float32") + diff1 *= K.cast(y_true <= 1.0, "float32") + + # Handle (>) inequalities + diff2 = y_pred - (y_true - 2.0) + diff2 *= K.cast(y_true >= 2.0, "float32") + diff2 *= K.cast(y_true <= 3.0, "float32") + diff2 *= K.cast(diff2 < 0.0, "float32") + + # Handle (<) inequalities + diff3 = y_pred - (y_true - 4.0) + diff3 *= K.cast(y_true >= 4.0, "float32") + diff3 *= K.cast(diff3 > 0.0, "float32") + + return ( + K.sum(K.square(diff1), axis=-1) + + K.sum(K.square(diff2), axis=-1) + + K.sum(K.square(diff3), axis=-1)) +LOSSES["mse_with_ineqalities"] = mse_with_ineqalities \ No newline at end of file diff --git a/mhcflurry/train_allele_specific_models_command.py b/mhcflurry/train_allele_specific_models_command.py index 6602761bf7c6a096d7c23438543f297fb8356bb5..b095e6a45ffdf64bacba6e3125a6a521d5493bb6 100644 --- a/mhcflurry/train_allele_specific_models_command.py +++ b/mhcflurry/train_allele_specific_models_command.py @@ -259,7 +259,7 @@ def process_work( train_data = data.sample(frac=1.0) (model,) = predictor.fit_allele_specific_predictors( n_models=1, - architecture_hyperparameters=hyperparameters, + architecture_hyperparameters_list=[hyperparameters], allele=allele, peptides=train_data.peptide.values, affinities=train_data.measurement_value.values, diff --git a/test/test_class1_affinity_predictor.py b/test/test_class1_affinity_predictor.py index 65f1f587316b9c56c1c30dfa0ff4c828c88c9648..320b3961beb6e5058463f7aabdf087762110036b 100644 --- a/test/test_class1_affinity_predictor.py +++ b/test/test_class1_affinity_predictor.py @@ -92,7 +92,7 @@ def test_a1_known_epitopes_in_newly_trained_model(): predictor = Class1AffinityPredictor() predictor.fit_allele_specific_predictors( n_models=2, - architecture_hyperparameters=hyperparameters, + architecture_hyperparameters_list=[hyperparameters], allele=allele, peptides=df.peptide.values, affinities=df.measurement_value.values, @@ -153,7 +153,7 @@ def test_class1_affinity_predictor_a0205_memorize_training_data(): predictor = Class1AffinityPredictor() predictor.fit_allele_specific_predictors( n_models=2, - architecture_hyperparameters=hyperparameters, + architecture_hyperparameters_list=[hyperparameters], allele=allele, peptides=df.peptide.values, affinities=df.measurement_value.values,