From 9e89b4fae7dcb1c0d37a89dfeedee9397c8d682e Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Thu, 23 Jan 2020 18:13:53 -0500
Subject: [PATCH] working on presentation model

---
 .travis.yml                                    |   1 +
 .../class1_presentation_neural_network.py      | 668 +++++++-----------
 mhcflurry/class1_presentation_predictor.py     |  41 +-
 mhcflurry/multiallelic_refinement_command.py   |   2 +-
 mhcflurry/random_negative_peptides.py          |   2 +-
 ...test_class1_presentation_neural_network.py  | 307 ++------
 test/test_class1_presentation_predictor.py     |   2 +-
 7 files changed, 342 insertions(+), 681 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 58b32a53..b4b2f9f5 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -56,6 +56,7 @@ script:
       data_mass_spec_annotated
       models_class1
       models_class1_pan
+      models_class1_pan_variants
      allele_sequences
       --already-downloaded-dir /tmp/downloads
   - mhcflurry-downloads info # just to test this command works
diff --git a/mhcflurry/class1_presentation_neural_network.py b/mhcflurry/class1_presentation_neural_network.py
index 32ab07bf..b8e30958 100644
--- a/mhcflurry/class1_presentation_neural_network.py
+++ b/mhcflurry/class1_presentation_neural_network.py
@@ -8,29 +8,19 @@ import numpy
 import pandas
 import mhcnames
 import hashlib
+from copy import copy
 
 from .hyperparameters import HyperparameterDefaults
 from .class1_neural_network import Class1NeuralNetwork, DEFAULT_PREDICT_BATCH_SIZE
+from .class1_cleavage_neural_network import Class1CleavageNeuralNetwork
 from .encodable_sequences import EncodableSequences
-from .regression_target import from_ic50, to_ic50
-from .random_negative_peptides import RandomNegativePeptides
 from .allele_encoding import MultipleAlleleEncoding, AlleleEncoding
 from .auxiliary_input import AuxiliaryInputEncoder
-from .batch_generator import BatchGenerator
-from .custom_loss import (
-    MSEWithInequalities,
-    TransformPredictionsLossWrapper,
-    MultiallelicMassSpecLoss)
+from .flanking_encoding import FlankingEncoding
 
 
 class Class1PresentationNeuralNetwork(object):
     network_hyperparameter_defaults = HyperparameterDefaults(
-        allele_amino_acid_encoding="BLOSUM62",
-        peptide_encoding={
-            'vector_encoding_name': 'BLOSUM62',
-            'alignment_method': 'left_pad_centered_right_pad',
-            'max_length': 15,
-        },
         max_alleles=6,
     )
     """
@@ -39,11 +29,12 @@ class Class1PresentationNeuralNetwork(object):
     fit_hyperparameter_defaults = HyperparameterDefaults(
+        trainable_cleavage_predictor=False,
+        trainable_affinity_predictor=False,
         max_epochs=500,
+        validation_split=0.1,
         early_stopping=True,
-        random_negative_affinity_min=20000.0,).extend(
-            RandomNegativePeptides.hyperparameter_defaults).extend(
-                BatchGenerator.hyperparameter_defaults
+        minibatch_size=256,
     )
     """
     Hyperparameters for neural network training.
@@ -58,8 +49,7 @@ class Class1PresentationNeuralNetwork(object):
     """
 
     compile_hyperparameter_defaults = HyperparameterDefaults(
-        loss_multiallelic_mass_spec_delta=0.2,
-        loss_multiallelic_mass_spec_multiplier=1.0,
+        loss="binary_crossentropy",
         optimizer="rmsprop",
         learning_rate=None,
     )
@@ -71,6 +61,7 @@ class Class1PresentationNeuralNetwork(object):
     auxiliary_input_hyperparameter_defaults = HyperparameterDefaults(
         auxiliary_input_features=["gene"],
         auxiliary_input_feature_parameters={},
+        include_cleavage=True,
     )
     """
     Allele feature hyperparameters.
@@ -88,54 +79,48 @@ class Class1PresentationNeuralNetwork(object):
         self.network = None
         self.fit_info = []
         self.allele_representation_hash = None
+        self.affinity_model = None
+        self.cleavage_model = None
 
-    def load_from_class1_neural_network(self, model):
+    def build(self, affinity_model, cleavage_model=None):
         import keras.backend as K
-        from keras.layers import (
-            Input,
-            TimeDistributed,
-            Dense,
-            Flatten,
-            RepeatVector,
-            Reshape,
-            concatenate,
-            Activation,
-            Lambda,
-            Add,
-            Multiply,
-            Embedding)
-        from keras.models import Model
-        from keras.initializers import Zeros
-
-        assert isinstance(model, Class1NeuralNetwork), model
-        affinity_network = model.network()
+        import keras.models
+
+        assert isinstance(affinity_model, Class1NeuralNetwork), affinity_model
+        affinity_model = copy(affinity_model)
+
+        self.affinity_model = affinity_model
+        affinity_network = affinity_model.network()
+
+        model_inputs = {}
 
         peptide_shape = tuple(
             int(x) for x in K.int_shape(affinity_network.inputs[0])[1:])
 
-        input_alleles = Input(
-            shape=(self.hyperparameters['max_alleles'],), name="allele")
-        input_peptides = Input(
+        model_inputs['allele_set'] = keras.layers.Input(
+            shape=(self.hyperparameters['max_alleles'],), name="allele_set")
+        model_inputs['peptide'] = keras.layers.Input(
             shape=peptide_shape,
             dtype='float32',
             name='peptide')
 
-        peptides_flattened = Flatten()(input_peptides)
-        peptides_repeated = RepeatVector(self.hyperparameters['max_alleles'])(
+        peptides_flattened = keras.layers.Flatten()(model_inputs['peptide'])
+        peptides_repeated = keras.layers.RepeatVector(
+            self.hyperparameters['max_alleles'])(
             peptides_flattened)
 
-        allele_representation = Embedding(
+        allele_representation = keras.layers.Embedding(
             name="allele_representation",
             input_dim=64,  # arbitrary, how many alleles to have room for
             output_dim=affinity_network.get_layer(
                 "allele_representation").output_shape[-1],
             input_length=self.hyperparameters['max_alleles'],
             trainable=False,
-            mask_zero=False)(input_alleles)
+            mask_zero=False)(model_inputs['allele_set'])
 
         allele_flat = allele_representation
 
-        allele_peptide_merged = concatenate(
+        allele_peptide_merged = keras.layers.concatenate(
             [peptides_repeated, allele_flat], name="allele_peptide_merged")
 
         layer_names = [
@@ -159,56 +144,61 @@ class Class1PresentationNeuralNetwork(object):
                 "allele_peptide_merged") + 1:
         ]
         node = allele_peptide_merged
-        layer_name_to_new_node = {
+        affinity_predictor_layer_name_to_new_node = {
             "allele_peptide_merged": allele_peptide_merged,
         }
         for layer in layers:
-            assert layer.name not in layer_name_to_new_node
+            assert layer.name not in affinity_predictor_layer_name_to_new_node
             input_layer_names = []
             for inbound_node in layer._inbound_nodes:
                 for inbound_layer in inbound_node.inbound_layers:
                     input_layer_names.append(inbound_layer.name)
             input_nodes = [
-                layer_name_to_new_node[name]
+                affinity_predictor_layer_name_to_new_node[name]
                 for name in input_layer_names
             ]
             if len(input_nodes) == 1:
-                lifted = TimeDistributed(layer)
+                lifted = keras.layers.TimeDistributed(layer, name=layer.name)
                 node = lifted(input_nodes[0])
             else:
                 node = layer(input_nodes)
-            layer_name_to_new_node[layer.name] = node
+            affinity_predictor_layer_name_to_new_node[layer.name] = node
 
-        node = Reshape(
-            (self.hyperparameters['max_alleles'],),
-            name="unmasked_affinity_matrix_output")(node)
-
-        pre_mask_affinity_predictor_matrix_output = node
+        def logit(x):
+            import tensorflow as tf
+            return -tf.log(1. / x - 1.)
+
+        #node = keras.layers.Lambda(logit, name="logit")(node)
+        affinity_prediction_and_other_signals = [node]
+        if self.hyperparameters['include_cleavage']:
+            assert isinstance(cleavage_model, Class1CleavageNeuralNetwork)
+            cleavage_model = copy(cleavage_model)
+            self.cleavage_model = cleavage_model
+            cleavage_network = cleavage_model.network()
+
+            model_inputs['sequence'] = keras.layers.Input(
+                shape=cleavage_network.get_layer("sequence").output_shape[1:],
+                dtype='float32',
+                name='sequence')
+            model_inputs['peptide_length'] = keras.layers.Input(
+                shape=(1,),
+                dtype='int32',
+                name='peptide_length')
+            cleavage_network.name = "cleavage_predictor"
+            cleavage_prediction = cleavage_network([
+                model_inputs['peptide_length'],
+                model_inputs['sequence'],
+            ])
+            cleavage_prediction.trainable = False
+            cleavage_prediction_repeated = keras.layers.RepeatVector(
+                self.hyperparameters['max_alleles'])(cleavage_prediction)
+            affinity_prediction_and_other_signals.append(
+                cleavage_prediction_repeated)
 
-        # Apply allele mask: zero out all outputs corresponding to alleles
-        # with the special index 0.
-        def alleles_to_mask(x):
-            import keras.backend as K
-            result = K.cast(K.not_equal(x, 0), "float32")
-            return result
-
-        allele_mask = Lambda(alleles_to_mask, name="allele_mask")(input_alleles)
-
-        affinity_predictor_matrix_output = Multiply(
-            name="affinity_matrix_output")([
-                allele_mask,
-                pre_mask_affinity_predictor_matrix_output
-            ])
-
-        node = Reshape(
-            (self.hyperparameters['max_alleles'], 1),
-            name="expand_dims_affinity_matrix_output")(
-                affinity_predictor_matrix_output)
-
-        auxiliary_input = None
         if self.hyperparameters['auxiliary_input_features']:
-            auxiliary_input = Input(
+            model_inputs['auxiliary'] = keras.layers.Input(
                 shape=(
                     self.hyperparameters['max_alleles'],
                     len(
@@ -218,116 +208,154 @@ class Class1PresentationNeuralNetwork(object):
                             'auxiliary_input_feature_parameters']))),
                 dtype="float32",
                 name="auxiliary")
-            node = concatenate(
-                [node, auxiliary_input], name="affinities_with_auxiliary")
-
-        layer = Dense(8, activation="tanh")
-        lifted = TimeDistributed(layer, name="presentation_adjustment_hidden1")
-        node = lifted(node)
-
-        # By initializing to zero we ensure that before training the
-        # presentation output is the same as the affinity output.
-        layer = Dense(
-            1,
-            activation="tanh",
-            kernel_initializer=Zeros(),
-            bias_initializer=Zeros())
-        lifted = TimeDistributed(layer, name="presentation_adjustment")
-        presentation_adjustment = lifted(node)
-        presentation_adjustment = Reshape(
-            target_shape=(self.hyperparameters['max_alleles'],),
-            name="reshaped_presentation_adjustment")(presentation_adjustment)
-
-
-        def logit(x):
-            import tensorflow as tf
-            return - tf.math.log(
-                tf.maximum(
-                    tf.math.divide_no_nan(1., x) - 1.,
-                    0.0))
-
-        presentation_output_pre_sigmoid = Add()([
-            Lambda(logit, name="logit")(affinity_predictor_matrix_output),
-            presentation_adjustment,
-        ])
-        pre_mask_presentation_output = Activation(
-            "sigmoid", name="unmasked_presentation_output")(
-            presentation_output_pre_sigmoid)
+            affinity_prediction_and_other_signals.append(
+                model_inputs['auxiliary'])
+
+        if len(affinity_prediction_and_other_signals) > 1:
+            node = keras.layers.concatenate(
+                affinity_prediction_and_other_signals,
+                name="affinity_prediction_and_other_signals")
+            layer = keras.layers.Dense(
+                1,
+                activation="sigmoid",
+                kernel_initializer=keras.initializers.Ones(),
+                name="combine")
+            lifted = keras.layers.TimeDistributed(layer, name="per_allele_output")
+            node = lifted(node)
+        else:
+            (node,) = affinity_prediction_and_other_signals
 
         # Apply allele mask: zero out all outputs corresponding to alleles
         # with the special index 0.
-        presentation_output = Multiply(name="presentation_output")([
-            allele_mask,
-            pre_mask_presentation_output
-        ])
-
-        self.network = Model(
-            inputs=[
-                input_peptides,
-                input_alleles,
-            ] + ([] if auxiliary_input is None else [auxiliary_input]),
-            outputs=[
-                affinity_predictor_matrix_output,
-                presentation_output,
-            ],
+        #def alleles_to_mask(x):
+        #    import keras.backend as K
+        #    result = K.expand_dims(
+        #        K.cast(K.not_equal(x, 0), "float32"), axis=-1)
+        #    return result
+
+        #allele_mask = keras.layers.Lambda(
+        #    alleles_to_mask, name="allele_mask")(model_inputs['allele_set'])
+
+        #node = keras.layers.Multiply(
+        #    name="masked_per_allele_outputs")(
+        #        [allele_mask, node])
+
+        presentation_output = keras.layers.Reshape(
+            target_shape=(self.hyperparameters['max_alleles'],))(
+                node)
+
+        self.network = keras.models.Model(
+            inputs=list(model_inputs.values()),
+            outputs=presentation_output,
             name="presentation",
         )
 
-    def copy_weights_to_affinity_model(self, model):
-        # We assume that the other model's layers are a prefix of ours.
-        self.clear_allele_representations()
-        model.clear_allele_representations()
-        model.network().set_weights(
-            self.get_weights()[:len(model.get_weights())])
+        if not self.hyperparameters['trainable_cleavage_predictor']:
+            if self.hyperparameters['include_cleavage']:
+                self.network.get_layer("cleavage_predictor").trainable = False
 
-    def peptides_to_network_input(self, peptides):
-        """
-        Encode peptides to the fixed-length encoding expected by the neural
-        network (which depends on the architecture).
+        self.affinity_predictor_layer_names = list(
+            affinity_predictor_layer_name_to_new_node)
 
-        Parameters
-        ----------
-        peptides : EncodableSequences or list of string
+        self.set_trainable(
+            trainable_affinity_predictor=(
+                self.hyperparameters['trainable_affinity_predictor']))
 
-        Returns
-        -------
-        numpy.array
-        """
-        encoder = EncodableSequences.create(peptides)
-        encoded = encoder.variable_length_to_fixed_length_vector_encoding(
-            **self.hyperparameters['peptide_encoding'])
-        assert len(encoded) == len(peptides)
-        return encoded
+    def set_trainable(self, trainable_affinity_predictor=None):
+        if trainable_affinity_predictor is not None:
+            for name in self.affinity_predictor_layer_names:
+                self.network.get_layer(name).trainable = (
+                    trainable_affinity_predictor)
 
-    def allele_encoding_to_network_input(self, allele_encoding):
+
+    @staticmethod
+    def loss(y_true, y_pred):
+        # Binary cross entropy applied to an aggregate over the per-allele
+        # outputs.
+        from keras import backend as K
+        import tensorflow as tf
+
+        y_pred = K.constant(y_pred) if not K.is_tensor(y_pred) else y_pred
+        y_true = K.cast(y_true, y_pred.dtype)
+
+        # Aggregate per-allele outputs into a single presentation score per
+        # example. An alternative is max-pooling over alleles:
+        #result = tf.reduce_max(y_pred, axis=-1)
+        result = tf.reduce_sum(y_pred, axis=-1)
 
+        return K.mean(
+            K.binary_crossentropy(y_true, result),
+            axis=-1)
+
+    def network_input(
+            self, peptides, allele_encoding, flanking_encoding=None):
         """
-        Encode alleles to the fixed-length encoding expected by the neural
-        network (which depends on the architecture).
+        Encode peptides, alleles, and (optionally) flanking sequences into
+        the inputs expected by the assembled presentation network.
 
         Parameters
         ----------
+        peptides : EncodableSequences or list of string
+
         allele_encoding : AlleleEncoding
 
-        Returns
-        -------
-        (numpy.array, numpy.array)
+        flanking_encoding : FlankingEncoding
+            Required when the model includes cleavage prediction.
 
-            Indices and allele representations.
+        Returns
+        -------
+        (dict, numpy.array)
+
+            Network inputs and allele representations.
         """
-        return (
-            allele_encoding.indices,
-            allele_encoding.allele_representations(
-                self.hyperparameters['allele_amino_acid_encoding']))
+        assert self.affinity_model is not None
+
+        (allele_input, allele_representations) = (
+            self.affinity_model.allele_encoding_to_network_input(
+                allele_encoding))
+        peptides = EncodableSequences.create(peptides)
+        x_dict = {
+            'peptide': self.affinity_model.peptides_to_network_input(peptides),
+            'allele_set': allele_input,
+        }
+        if self.hyperparameters['include_cleavage']:
+            assert self.cleavage_model is not None
+            if flanking_encoding is None:
+                raise RuntimeError("flanking_encoding required")
+            numpy.testing.assert_array_equal(
+                peptides.sequences,
+                flanking_encoding.dataframe.peptide.values)
+            cleavage_x_dict = self.cleavage_model.network_input(
+                flanking_encoding)
+            x_dict.update(cleavage_x_dict)
+        if self.hyperparameters['auxiliary_input_features']:
+            auxiliary_encoder = AuxiliaryInputEncoder(
+                alleles=allele_encoding.alleles,
+                peptides=peptides.sequences)
+            x_dict[
+                'auxiliary'
+            ] = auxiliary_encoder.get_array(
                features=self.hyperparameters['auxiliary_input_features'],
+                feature_parameters=self.hyperparameters[
+                    'auxiliary_input_feature_parameters']) * 0.01  # scale down
+        return (x_dict, allele_representations)
 
     def fit(
             self,
+            targets,
             peptides,
-            labels,
             allele_encoding,
-            affinities_mask=None,  # True when a peptide/label is actually a peptide and an affinity
-            inequalities=None,  # interpreted only for elements where affinities_mask is True, otherwise ignored
-            validation_weights=None,
+            flanking_encoding=None,
+            sample_weights=None,
+            shuffle_permutation=None,
             verbose=1,
             progress_callback=None,
             progress_preamble="",
@@ -340,260 +368,59 @@ class Class1PresentationNeuralNetwork(object):
             allele_encoding.max_alleles_per_experiment ==
             self.hyperparameters['max_alleles'])
 
-        encodable_peptides = EncodableSequences.create(peptides)
-
-        if labels is not None:
-            labels = numpy.array(labels, copy=False)
-        if inequalities is not None:
-            inequalities = numpy.array(inequalities, copy=True)
-        else:
-            inequalities = numpy.tile("=", len(labels))
-        if affinities_mask is not None:
-            affinities_mask = numpy.array(affinities_mask, copy=False)
-        else:
-            affinities_mask = numpy.tile(False, len(labels))
-        if validation_weights is None:
-            validation_weights = numpy.tile(1.0, len(labels))
-        else:
-            validation_weights = numpy.array(validation_weights, copy=False)
-        inequalities[~affinities_mask] = "="
-
-        random_negatives_planner = RandomNegativePeptides(
-            **RandomNegativePeptides.hyperparameter_defaults.subselect(
-                self.hyperparameters))
-        random_negatives_planner.plan(
-            peptides=encodable_peptides.sequences,
-            affinities=numpy.where(affinities_mask, labels, to_ic50(labels)),
-            alleles=[
-                numpy.random.choice(row[row != numpy.array(None)])
-                for row in allele_encoding.alleles
-            ],
-            inequalities=inequalities)
-
-        peptide_input = self.peptides_to_network_input(encodable_peptides)
-
-        # Optional optimization
-        (allele_encoding_input, allele_representations) = (
-            self.allele_encoding_to_network_input(allele_encoding))
-
-        x_dict_without_random_negatives = {
-            'peptide': peptide_input,
-            'allele': allele_encoding_input,
-        }
-        if self.hyperparameters['auxiliary_input_features']:
-            auxiliary_encoder = AuxiliaryInputEncoder(
-                alleles=allele_encoding.alleles,
-                peptides=peptides)
-            x_dict_without_random_negatives[
-                'auxiliary'
-            ] = auxiliary_encoder.get_array(
-                features=self.hyperparameters['auxiliary_input_features'],
-                feature_parameters=self.hyperparameters[
-                    'auxiliary_input_feature_parameters'])
-
-        y1 = numpy.zeros(shape=len(labels))
-        y1[affinities_mask] = from_ic50(labels[affinities_mask])
-
-        random_negative_alleles = random_negatives_planner.get_alleles()
-        random_negatives_allele_encoding = MultipleAlleleEncoding(
-            experiment_names=random_negative_alleles,
-            experiment_to_allele_list=dict(
-                (a, [a]) for a in random_negative_alleles),
-            max_alleles_per_experiment=(
-                allele_encoding.max_alleles_per_experiment),
-            borrow_from=allele_encoding.allele_encoding)
-        num_random_negatives = random_negatives_planner.get_total_count()
-
-        # Reverse inequalities because from_ic50() flips the direction
-        # (i.e. lower affinity results in higher y values).
-        adjusted_inequalities = pandas.Series(inequalities).map({
-            "=": "=",
-            ">": "<",
-            "<": ">",
-        }).values
-        adjusted_inequalities[~affinities_mask] = ">"
-
-        # Note: we are using "<" here not ">" because the inequalities are
-        # now in target-space (0-1) not affinity-space.
-        adjusted_inequalities_with_random_negative = numpy.concatenate([
-            numpy.tile("<", num_random_negatives),
-            adjusted_inequalities
-        ])
-        random_negative_ic50 = self.hyperparameters[
-            'random_negative_affinity_min'
-        ]
-        y1_with_random_negatives = numpy.concatenate([
-            numpy.tile(
-                from_ic50(random_negative_ic50), num_random_negatives),
-            y1,
-        ])
-
-        def tensor_max(matrix):
-            import keras.backend as K
-            return K.max(matrix, axis=1)
-
-        affinities_loss = TransformPredictionsLossWrapper(
-            loss=MSEWithInequalities(),
-            y_pred_transform=tensor_max)
-        encoded_y1 = affinities_loss.encode_y(
-            y1_with_random_negatives,
-            inequalities=adjusted_inequalities_with_random_negative)
-
-        mms_loss = MultiallelicMassSpecLoss(
-            delta=self.hyperparameters['loss_multiallelic_mass_spec_delta'],
-            multiplier=self.hyperparameters[
-                'loss_multiallelic_mass_spec_multiplier'])
-        y2 = labels.copy()
-        y2[affinities_mask] = -1
-        y2_with_random_negatives = numpy.concatenate([
-            numpy.tile(0.0, num_random_negatives),
-            y2,
-        ])
-        encoded_y2 = mms_loss.encode_y(y2_with_random_negatives)
+        (x_dict, allele_representations) = (
+            self.network_input(
+                peptides=peptides,
+                allele_encoding=allele_encoding,
+                flanking_encoding=flanking_encoding))
+
+        # Shuffle
+        if shuffle_permutation is None:
+            shuffle_permutation = numpy.random.permutation(len(targets))
+        targets = numpy.array(targets)[shuffle_permutation]
+        assert numpy.isnan(targets).sum() == 0, targets
+        if sample_weights is not None:
+            sample_weights = numpy.array(sample_weights)[shuffle_permutation]
+        for key in list(x_dict):
+            x_dict[key] = x_dict[key][shuffle_permutation]
+        del peptides
+        del allele_encoding
+        del flanking_encoding
 
         fit_info = collections.defaultdict(list)
 
         allele_representations_hash = self.set_allele_representations(
             allele_representations)
-        loss_reduction = "sum"
+
         self.network.compile(
-            loss=[
-                affinities_loss.get_keras_loss(reduction=loss_reduction),
-                mms_loss.get_keras_loss(reduction=loss_reduction),
-            ],
+            loss=self.loss,
             optimizer=self.hyperparameters['optimizer'])
         if self.hyperparameters['learning_rate'] is not None:
             K.set_value(
                 self.network.optimizer.lr,
                 self.hyperparameters['learning_rate'])
-        fit_info["learning_rate"] = float(
-            K.get_value(self.network.optimizer.lr))
+        fit_info["learning_rate"] = float(K.get_value(self.network.optimizer.lr))
 
         if verbose:
             self.network.summary()
 
-        batch_generator = BatchGenerator.create(
-            hyperparameters=BatchGenerator.hyperparameter_defaults.subselect(
-                self.hyperparameters))
-        start = time.time()
-        batch_generator.plan(
-            num=len(peptides) + num_random_negatives,
-            affinities_mask=numpy.concatenate([
-                numpy.tile(True, num_random_negatives),
-                affinities_mask
-            ]),
-            experiment_names=numpy.concatenate([
-                numpy.tile(None, num_random_negatives),
-                allele_encoding.experiment_names
-            ]),
-            alleles_matrix=numpy.concatenate([
-                random_negatives_allele_encoding.alleles,
-                allele_encoding.alleles,
-            ]),
-            is_binder=numpy.concatenate([
-                numpy.tile(False, num_random_negatives),
-                numpy.where(affinities_mask, labels, to_ic50(labels)) < 1000.0
-            ]),
-            validation_weights=numpy.concatenate([
-                numpy.tile(0.0, num_random_negatives),
-                validation_weights
-            ]),
-        )
-        if verbose:
-            print("Generated batch generation plan in %0.2f sec." % (
-                time.time() - start))
-            print(batch_generator.summary())
+        training_start = time.time()
 
         min_val_loss_iteration = None
         min_val_loss = None
         last_progress_print = 0
-        start = time.time()
-        x_dict_with_random_negatives = {}
         for i in range(self.hyperparameters['max_epochs']):
             epoch_start = time.time()
-
-            random_negative_peptides = EncodableSequences.create(
-                random_negatives_planner.get_peptides())
-            random_negative_peptides_encoding = (
-                self.peptides_to_network_input(random_negative_peptides))
-
-            if not x_dict_with_random_negatives:
-                if len(random_negative_peptides) > 0:
-                    x_dict_with_random_negatives[
-                        "peptide"
-                    ] = numpy.concatenate([
-                        random_negative_peptides_encoding,
-                        x_dict_without_random_negatives['peptide'],
-                    ])
-                    x_dict_with_random_negatives[
-                        'allele'
-                    ] = numpy.concatenate([
-                        self.allele_encoding_to_network_input(
-                            random_negatives_allele_encoding)[0],
-                        x_dict_without_random_negatives['allele']
-                    ])
-                    if 'auxiliary' in x_dict_without_random_negatives:
-                        random_negative_auxiliary_encoder = AuxiliaryInputEncoder(
-                            alleles=random_negatives_allele_encoding.alleles,
-                            #peptides=random_negative_peptides.sequences
-                        )
-                        x_dict_with_random_negatives['auxiliary'] = (
-                            numpy.concatenate([
-                                random_negative_auxiliary_encoder.get_array(
-                                    features=self.hyperparameters[
-                                        'auxiliary_input_features'],
-                                    feature_parameters=self.hyperparameters[
-                                        'auxiliary_input_feature_parameters']),
-                                x_dict_without_random_negatives['auxiliary']
-                            ]))
-                else:
-                    x_dict_with_random_negatives = (
-                        x_dict_without_random_negatives)
-            else:
-                # Update x_dict_with_random_negatives in place.
-                # This is more memory efficient than recreating it as above.
-                if len(random_negative_peptides) > 0:
-                    x_dict_with_random_negatives[
-                        "peptide"
-                    ][:num_random_negatives] = random_negative_peptides_encoding
-
-            if i == 0:
-                (train_generator, test_generator) = (
-                    batch_generator.get_train_and_test_generators(
-                        x_dict=x_dict_with_random_negatives,
-                        y_list=[encoded_y1, encoded_y2],
-                        epochs=1))
-                pairs = [
-                    ("train", train_generator, batch_generator.num_train_batches),
-                    ("test", test_generator, batch_generator.num_test_batches),
-                ]
-                for (kind, generator, steps) in pairs:
-                    self.assert_allele_representations_hash(
-                        allele_representations_hash)
-                    metrics = self.network.evaluate_generator(
-                        generator=generator,
-                        steps=steps,
-                        workers=0,
-                        use_multiprocessing=False)
-                    for (key, val) in zip(self.network.metrics_names, metrics):
-                        fit_info["pre_fit_%s_%s" % (kind, key)] = val
-            (train_generator, test_generator) = (
-                batch_generator.get_train_and_test_generators(
-                    x_dict=x_dict_with_random_negatives,
-                    y_list=[encoded_y1, encoded_y2],
-                    epochs=1))
             self.assert_allele_representations_hash(allele_representations_hash)
-            fit_history = self.network.fit_generator(
-                train_generator,
-                steps_per_epoch=batch_generator.num_train_batches,
+            fit_history = self.network.fit(
+                x_dict,
+                targets,
+                validation_split=self.hyperparameters['validation_split'],
+                batch_size=self.hyperparameters['minibatch_size'],
                 epochs=i + 1,
+                sample_weight=sample_weights,
                 initial_epoch=i,
-                verbose=verbose,
-                use_multiprocessing=False,
-                workers=0,
-                validation_data=test_generator,
-                validation_steps=batch_generator.num_test_batches)
-
+                verbose=verbose)
             epoch_time = time.time() - epoch_start
 
             for (key, value) in fit_history.history.items():
@@ -608,21 +435,25 @@ class Class1PresentationNeuralNetwork(object):
                     time.time() - last_progress_print > progress_print_interval)):
                 print((progress_preamble + " " +
-                       "Epoch %3d / %3d [%0.2f sec]: loss=%g. "
+                       "Epoch %3d / %3d [%0.2f sec]: loss=%g val_loss=%g. "
                        "Min val loss (%s) at epoch %s" % (
                            i,
                            self.hyperparameters['max_epochs'],
                            epoch_time,
                            fit_info['loss'][-1],
+                           (
+                               fit_info['val_loss'][-1]
+                               if 'val_loss' in fit_info else numpy.nan
+                           ),
                           str(min_val_loss),
                            min_val_loss_iteration)).strip())
                 last_progress_print = time.time()
 
-            if batch_generator.num_test_batches:
+            if self.hyperparameters['validation_split']:
                 val_loss = fit_info['val_loss'][-1]
+
                 if min_val_loss is None or (
-                        val_loss < min_val_loss -
-                        self.hyperparameters['min_delta']):
+                        val_loss < min_val_loss - self.hyperparameters['min_delta']):
                     min_val_loss = val_loss
                     min_val_loss_iteration = i
 
@@ -647,51 +478,28 @@ class Class1PresentationNeuralNetwork(object):
             if progress_callback:
                 progress_callback()
 
-        fit_info["time"] = time.time() - start
-        fit_info["num_points"] = len(labels)
+        fit_info["time"] = time.time() - training_start
+        fit_info["num_points"] = len(targets)
         self.fit_info.append(dict(fit_info))
 
-        return {
-            'batch_generator': batch_generator,
-            'last_x': x_dict_with_random_negatives,
-            'last_y': [encoded_y1, encoded_y2],
-            'fit_info': fit_info,
-        }
-
-    Predictions = collections.namedtuple(
-        "ligandone_neural_network_predictions",
-        "affinity score")
-
     def predict(
             self,
             peptides,
-            allele_encoding=None,
+            allele_encoding,
+            flanking_encoding=None,
             batch_size=DEFAULT_PREDICT_BATCH_SIZE):
         peptides = EncodableSequences.create(peptides)
         assert isinstance(allele_encoding, MultipleAlleleEncoding)
-        (allele_encoding_input, allele_representations) = (
-            self.allele_encoding_to_network_input(allele_encoding))
-        self.set_allele_representations(allele_representations)
-        x_dict = {
-            'peptide': self.peptides_to_network_input(peptides),
-            'allele': allele_encoding_input,
-        }
-        if self.hyperparameters['auxiliary_input_features']:
-            auxiliary_encoder = AuxiliaryInputEncoder(
-                alleles=allele_encoding.alleles,
-                peptides=peptides.sequences)
-            x_dict[
-                'auxiliary'
-            ] = auxiliary_encoder.get_array(
-                features=self.hyperparameters['auxiliary_input_features'],
-                feature_parameters=self.hyperparameters[
-                    'auxiliary_input_feature_parameters'])
+        (x_dict, allele_representations) = self.network_input(
+            peptides=peptides,
+            allele_encoding=allele_encoding,
+            flanking_encoding=flanking_encoding)
 
-        predictions = self.Predictions._make(
-            self.network.predict(x_dict, batch_size=batch_size))
-        return predictions
+        self.set_allele_representations(allele_representations)
+        raw_predictions = self.network.predict(x_dict, batch_size=batch_size)
+        return raw_predictions
 
     def clear_allele_representations(self):
         """
diff --git a/mhcflurry/class1_presentation_predictor.py b/mhcflurry/class1_presentation_predictor.py
index cc266f22..fcea5053 100644
--- a/mhcflurry/class1_presentation_predictor.py
+++ b/mhcflurry/class1_presentation_predictor.py
@@ -26,6 +26,7 @@ from .allele_encoding import MultipleAlleleEncoding
 from .downloads import get_default_class1_presentation_models_dir
 from .class1_presentation_neural_network import Class1PresentationNeuralNetwork
 from .common import save_weights, load_weights, NumpyJSONEncoder
+from .flanking_encoding import FlankingEncoding
 
 
 class Class1PresentationPredictor(object):
@@ -104,16 +105,27 @@ class Class1PresentationPredictor(object):
         """
         return join(models_dir, "weights_%s.npz" % model_name)
 
-    def predict(self, peptides, alleles, batch_size=DEFAULT_PREDICT_BATCH_SIZE):
+    def predict(
+            self,
+            peptides,
+            alleles,
+            n_flanks=None,
+            c_flanks=None,
+            batch_size=DEFAULT_PREDICT_BATCH_SIZE):
         return self.predict_to_dataframe(
             peptides=peptides,
             alleles=alleles,
+            n_flanks=n_flanks,
+            c_flanks=c_flanks,
             batch_size=batch_size).score.values
 
     def predict_to_dataframe(
             self,
             peptides,
             alleles,
+            n_flanks=None,
+            c_flanks=None,
+            flanking_encoding=None,
             include_details=False,
             batch_size=DEFAULT_PREDICT_BATCH_SIZE):
 
@@ -146,31 +158,38 @@ class Class1PresentationPredictor(object):
             allele_to_sequence=self.allele_to_sequence,
             max_alleles_per_experiment=self.max_alleles)
 
+        if n_flanks is not None:
+            if flanking_encoding is not None:
+                raise ValueError(
+                    "Specify either n_flanks/c_flanks or flanking_encoding, "
+                    "not both.")
+            if c_flanks is None:
+                raise ValueError(
+                    "Specify both n_flanks and c_flanks, or neither.")
+            flanking_encoding = FlankingEncoding(
+                peptides=peptides.sequences,
+                n_flanks=n_flanks,
+                c_flanks=c_flanks)
+
         score_array = []
-        affinity_array = []
 
         for (i, network) in enumerate(self.models):
             predictions = network.predict(
                 peptides=peptides,
                 allele_encoding=alleles,
+                flanking_encoding=flanking_encoding,
                 batch_size=batch_size)
-            score_array.append(predictions.score)
-            affinity_array.append(predictions.affinity)
+            score_array.append(predictions)
 
         score_array = numpy.array(score_array)
-        affinity_array = numpy.array(affinity_array)
 
         ensemble_scores = numpy.mean(score_array, axis=0)
-        ensemble_affinity = numpy.mean(affinity_array, axis=0)
         top_allele_index = numpy.argmax(ensemble_scores, axis=-1)
         top_allele_flat_indices = (
             numpy.arange(len(peptides)) * self.max_alleles + top_allele_index)
         top_score = ensemble_scores.flatten()[top_allele_flat_indices]
-        top_affinity = ensemble_affinity.flatten()[top_allele_flat_indices]
 
         result_df = pandas.DataFrame({"peptide": peptides.sequences})
         result_df["allele"] = alleles.alleles.flatten()[top_allele_flat_indices]
         result_df["score"] = top_score
-        result_df["affinity"] = to_ic50(top_affinity)
 
         if include_details:
             for i in range(self.max_alleles):
@@ -180,12 +199,6 @@ class Class1PresentationPredictor(object):
                     score_array[:, :, i], 5.0, axis=0)
                 result_df["allele%d score high" % (i + 1)] = numpy.percentile(
                     score_array[:, :, i], 95.0, axis=0)
-                result_df["allele%d affinity" % (i + 1)] = to_ic50(
-                    ensemble_affinity[:, i])
-                result_df["allele%d affinity low" % (i + 1)] = to_ic50(
-                    numpy.percentile(affinity_array[:, :, i], 95.0, axis=0))
-                result_df["allele%d affinity high" % (i + 1)] = to_ic50(
-                    numpy.percentile(affinity_array[:, :, i], 5.0, axis=0))
         return result_df
 
     def check_consistency(self):
diff --git a/mhcflurry/multiallelic_refinement_command.py b/mhcflurry/multiallelic_refinement_command.py
index 38cbb0e5..6165c4bc 100644
--- a/mhcflurry/multiallelic_refinement_command.py
+++ b/mhcflurry/multiallelic_refinement_command.py
@@ -310,7 +310,7 @@ def refine_model(
     presentation_model.load_from_class1_neural_network(affinity_model)
     presentation_model.fit(
         peptides=combined_train_df.peptide.values,
-        labels=combined_train_df.label.values,
+        targets=combined_train_df.label.values,
         allele_encoding=allele_encoding,
         affinities_mask=combined_train_df.is_affinity.values,
         inequalities=combined_train_df.measurement_inequality.values,
diff --git a/mhcflurry/random_negative_peptides.py b/mhcflurry/random_negative_peptides.py
index 978a93e3..684a8aa8 100644
--- a/mhcflurry/random_negative_peptides.py
+++ b/mhcflurry/random_negative_peptides.py
@@ -16,7 +16,7 @@ class RandomNegativePeptides(object):
     hyperparameter_defaults = HyperparameterDefaults(
         random_negative_rate=0.0,
-        random_negative_constant=25,
+        random_negative_constant=0,
         random_negative_match_distribution=True,
         random_negative_distribution_smoothing=0.0,
         random_negative_method="recommended",
diff --git a/test/test_class1_presentation_neural_network.py b/test/test_class1_presentation_neural_network.py
index 08056889..cb52127b 100644
--- a/test/test_class1_presentation_neural_network.py
+++ b/test/test_class1_presentation_neural_network.py
@@ -18,6 +18,7 @@ from random import shuffle
 from sklearn.metrics import roc_auc_score
 
 from mhcflurry import Class1AffinityPredictor
+from mhcflurry.class1_cleavage_predictor import Class1CleavagePredictor
 from mhcflurry.allele_encoding import MultipleAlleleEncoding
 from mhcflurry.class1_presentation_neural_network import Class1PresentationNeuralNetwork
 from mhcflurry.class1_presentation_predictor import Class1PresentationPredictor
@@ -32,7 +33,7 @@ from mhcflurry.regression_target import to_ic50
 
 # disable
-sys.exit(0)
+#sys.exit(0)
 
 ###################################################
 # SETUP
###################################################
@@ -41,19 +42,26 @@
 COMMON_AMINO_ACIDS = sorted(COMMON_AMINO_ACIDS)
 
 AFFINITY_PREDICTOR = None
+CLEAVAGE_PREDICTOR = None
+
 
 def setup():
     global AFFINITY_PREDICTOR
+    global CLEAVAGE_PREDICTOR
     startup()
     AFFINITY_PREDICTOR = Class1AffinityPredictor.load(
         get_path("models_class1_pan_variants", "models.affinity_only"),
         optimization_level=0,
         max_models=1)
+    CLEAVAGE_PREDICTOR = Class1CleavagePredictor.load(max_models=1)
+
 
 def teardown():
     global AFFINITY_PREDICTOR
+    global CLEAVAGE_PREDICTOR
     AFFINITY_PREDICTOR = None
+    CLEAVAGE_PREDICTOR = None
     cleanup()
 
@@ -64,6 +72,9 @@ def data_path(name):
     '''
     return os.path.join(os.path.dirname(__file__), "data", name)
 
+#disable
+#sys.exit(0)
+
 ###################################################
 # UTILITY FUNCTIONS
@@ -100,7 +111,22 @@ def make_motif(presentation_predictor, allele, peptides, frac=0.01, master_allel
 # TESTS
 ###################################################
 
-def Xtest_synthetic_allele_refinement_max_affinity(include_affinities=True):
+def Xtest_build():
+    global AFFINITY_PREDICTOR
+    global CLEAVAGE_PREDICTOR
+
+    for include_cleavage in [False, True, False, True]:
+        print("Include cleavage: %s" % include_cleavage)
+        model = Class1PresentationNeuralNetwork(
+            include_cleavage=include_cleavage)
+        model.build(
+            affinity_model=AFFINITY_PREDICTOR.class1_pan_allele_models[0],
+            cleavage_model=CLEAVAGE_PREDICTOR.models[0])
+        network = model.network
+        print(network.summary())
+
+
+def test_synthetic_allele_refinement():
     """
     Test that in a synthetic example the model is able to learn that
     HLA-C*01:02 prefers P at position 3.
@@ -140,79 +166,53 @@
     hits_df = pandas.DataFrame({"peptide": train_peptides})
     hits_df["true_allele"] = train_true_alleles
     hits_df["hit"] = 1.0
-    hits_df["label"] = 500
-    hits_df["measurement_inequality"] = "<"
 
     decoys_df = hits_df.copy()
     decoys_df["peptide"] = decoys_df.peptide.map(scramble_peptide)
     decoys_df["true_allele"] = ""
     decoys_df["hit"] = 0.0
-    decoys_df["label"] = 500
-    hits_df["measurement_inequality"] = ">"
-
-    mms_train_df = pandas.concat([hits_df, decoys_df], ignore_index=True)
-    mms_train_df["label"] = mms_train_df.hit
-    mms_train_df["is_affinity"] = True
-
-    if include_affinities:
-        affinity_train_df = pandas.read_csv(get_path("models_class1_pan",
-            "models.combined/train_data.csv.bz2"))
-        affinity_train_df = affinity_train_df.loc[
-            affinity_train_df.allele.isin(alleles),
-            ["peptide", "allele", "measurement_inequality", "measurement_value"]
-        ]
-
-        affinity_train_df["label"] = affinity_train_df["measurement_value"]
-        del affinity_train_df["measurement_value"]
-        affinity_train_df["is_affinity"] = True
-    else:
-        affinity_train_df = None
+
+    train_df = pandas.concat(
+        [hits_df, decoys_df], ignore_index=True).sample(frac=1.0)
 
     (affinity_model,) = AFFINITY_PREDICTOR.class1_pan_allele_models
     presentation_model = Class1PresentationNeuralNetwork(
+        include_cleavage=False,
+        trainable_affinity_predictor=False,
         auxiliary_input_features=["gene"],
-        batch_generator_batch_size=1024,
+        minibatch_size=1024,
         max_epochs=10,
         learning_rate=0.001,
         patience=5,
-        min_delta=0.0,
-        random_negative_rate=1.0,
-        random_negative_constant=25)
-    presentation_model.load_from_class1_neural_network(affinity_model)
+        min_delta=0.0)
+    presentation_model.build(affinity_model)
+    print(presentation_model.network.summary())
 
     presentation_predictor = Class1PresentationPredictor(
         models=[presentation_model],
         allele_to_sequence=AFFINITY_PREDICTOR.allele_to_sequence)
 
-    mms_allele_encoding = MultipleAlleleEncoding(
-        experiment_names=["experiment1"] * len(mms_train_df),
+    allele_encoding = MultipleAlleleEncoding(
+        experiment_names=["experiment1"] * len(train_df),
         experiment_to_allele_list={
             "experiment1": alleles,
         },
         max_alleles_per_experiment=6,
         allele_to_sequence=AFFINITY_PREDICTOR.allele_to_sequence,
     )
-    allele_encoding = copy.deepcopy(mms_allele_encoding)
-    if affinity_train_df is not None:
-        allele_encoding.append_alleles(affinity_train_df.allele.values)
-        train_df = pandas.concat([mms_train_df, affinity_train_df],
-            ignore_index=True, sort=False)
-    else:
-        train_df = mms_train_df
 
     allele_encoding = allele_encoding.compact()
-    mms_allele_encoding = mms_allele_encoding.compact()
 
     pre_predictions = presentation_model.predict(
-        peptides=mms_train_df.peptide.values,
-        allele_encoding=mms_allele_encoding).score
+        peptides=train_df.peptide.values,
+        allele_encoding=allele_encoding)
 
-    expected_pre_predictions = from_ic50(affinity_model.predict(
-        peptides=numpy.repeat(mms_train_df.peptide.values, len(alleles)),
-        allele_encoding=mms_allele_encoding.allele_encoding, )).reshape(
-            (-1, len(alleles)))
-    assert_allclose(pre_predictions, expected_pre_predictions, rtol=1e-4)
+    #expected_pre_predictions = from_ic50(affinity_model.predict(
+    #    peptides=numpy.repeat(train_df.peptide.values, len(alleles)),
+    #    allele_encoding=mms_allele_encoding.allele_encoding, )).reshape(
+    #        (-1, len(alleles)))
+    #assert_allclose(pre_predictions, expected_pre_predictions, rtol=1e-4)
 
     random_peptides_encodable = EncodableSequences.create(
-        random_peptides(10000, 9))
+        random_peptides(20000, 9))
 
     original_motif = make_motif(
         presentation_predictor=presentation_predictor,
@@ -229,209 +229,47 @@
         iteration_box[0] += 1
         print("*** iteration ", label, "***")
         predictions_df = presentation_predictor.predict_to_dataframe(
-            peptides=mms_train_df.peptide.values,
-            alleles=mms_allele_encoding)
-        merged_df = pandas.merge(mms_train_df, predictions_df, on="peptide")
+            peptides=train_df.peptide.values,
+            alleles=allele_encoding)
+        merged_df = pandas.merge(train_df, predictions_df, on="peptide")
         merged_hit_df = merged_df.loc[merged_df.hit == 1.0]
         correct_allele_fraction = (
             merged_hit_df.allele == merged_hit_df.true_allele).mean()
         print("Correct allele fraction", correct_allele_fraction)
         print(
-            "Mean score/affinity for hit",
-            merged_df.loc[merged_df.hit == 1.0].score.mean(),
-            merged_df.loc[merged_df.hit == 1.0].affinity.mean())
+            "Mean score for hit",
+            merged_df.loc[merged_df.hit == 1.0].score.mean())
         print(
-            "Mean score/affinity for decoy",
-            merged_df.loc[merged_df.hit == 0.0].score.mean(),
-            merged_df.loc[merged_df.hit == 0.0].affinity.mean())
+            "Mean score for decoy",
+            merged_df.loc[merged_df.hit == 0.0].score.mean())
+        print("Scores for hit",
+            merged_df.loc[merged_df.hit == 1.0].score.values)
+        print("Scores for decoy",
+            merged_df.loc[merged_df.hit == 0.0].score.values)
+        print("Weights", presentation_model.network.get_layer(
+            "per_allele_output").get_weights())
         auc = roc_auc_score(merged_df.hit.values, merged_df.score.values)
         print("AUC", auc)
 
         return (auc, correct_allele_fraction)
 
     (pre_auc, pre_correct_allele_fraction) = progress(label="Pre fitting")
-    presentation_model.fit(peptides=train_df.peptide.values,
-        labels=train_df.label.values,
-        inequalities=train_df.measurement_inequality.values,
-        affinities_mask=train_df.is_affinity.values,
+    presentation_model.fit(
+        peptides=train_df.peptide.values,
+        targets=train_df.hit.values,
         allele_encoding=allele_encoding,
         progress_callback=progress)
 
-    (post_auc, post_correct_allele_fraction) = progress(label="Done fitting")
-
-    final_motif = make_motif(
-        presentation_predictor=presentation_predictor,
-        peptides=random_peptides_encodable,
-        allele=refine_allele)
-    print("Final motif proline-3 rate: ", final_motif.loc[3, "P"])
-
-    assert_greater(post_auc, pre_auc)
-    assert_greater(
-        post_correct_allele_fraction, pre_correct_allele_fraction - 0.05)
-    assert_greater(final_motif.loc[3, "P"], original_motif.loc[3, "P"])
-
-
-
-def Xtest_synthetic_allele_refinement(include_affinities=True):
-    """
-    Test that in a synthetic example the model is able to learn that HLA-C*01:02
-    prefers P at position 3.
-    """
-    refine_allele = "HLA-C*01:02"
-    alleles = ["HLA-A*02:01", "HLA-B*27:01", "HLA-C*07:01", "HLA-A*03:01",
-        "HLA-B*15:01", refine_allele]
-    peptides_per_allele = [2000, 1000, 500, 1500, 1200, 800, ]
-
-    allele_to_peptides = dict(zip(alleles, peptides_per_allele))
-
-    length = 9
-
-    train_with_ms = pandas.read_csv(get_path("data_curated",
-        "curated_training_data.csv.bz2"))
-    train_no_ms = pandas.read_csv(
-        get_path("data_curated", "curated_training_data.affinity.csv.bz2"))
-
-    def filter_df(df):
-        return df.loc[
-            (df.allele.isin(alleles)) & (df.peptide.str.len() == length)]
-
-    train_with_ms = filter_df(train_with_ms)
-    train_no_ms = filter_df(train_no_ms)
-
-    ms_specific = train_with_ms.loc[
-        ~train_with_ms.peptide.isin(train_no_ms.peptide)]
-
-    train_peptides = []
-    train_true_alleles = []
-    for allele in alleles:
-        peptides = ms_specific.loc[ms_specific.allele == allele].peptide.sample(
-            n=allele_to_peptides[allele])
-        train_peptides.extend(peptides)
-        train_true_alleles.extend([allele] * len(peptides))
-
-    hits_df = pandas.DataFrame({"peptide": train_peptides})
-    hits_df["true_allele"] = train_true_alleles
-    hits_df["hit"] = 1.0
-
-    decoys_df = hits_df.copy()
-    decoys_df["peptide"] = decoys_df.peptide.map(scramble_peptide)
-    decoys_df["true_allele"] = ""
-    decoys_df["hit"] = 0.0
-
-    mms_train_df = pandas.concat([hits_df, decoys_df], ignore_index=True)
-    mms_train_df["label"] = mms_train_df.hit
-    mms_train_df["is_affinity"] = False
-    mms_train_df["measurement_inequality"] = None
-
-    if include_affinities:
-        affinity_train_df = pandas.read_csv(get_path("models_class1_pan",
-            "models.combined/train_data.csv.bz2"))
-        affinity_train_df = affinity_train_df.loc[
-            affinity_train_df.allele.isin(alleles), ["peptide", "allele",
-            "measurement_inequality", "measurement_value"]]
-
-        affinity_train_df["label"] = affinity_train_df["measurement_value"]
-        del affinity_train_df["measurement_value"]
-        affinity_train_df["is_affinity"] = True
-    else:
-        affinity_train_df = None
-
-    (affinity_model,) = AFFINITY_PREDICTOR.class1_pan_allele_models
-    presentation_model = Class1PresentationNeuralNetwork(
-        #batch_generator="multiallelic_mass_spec",
-        batch_generator="simple",
-        auxiliary_input_features=["gene"],
-        batch_generator_batch_size=1024,
-        max_epochs=10,
-        learning_rate=0.0001,
-        patience=5,
-        min_delta=0.0,
-        random_negative_rate=0,
-        random_negative_constant=0)
-    presentation_model.load_from_class1_neural_network(affinity_model)
-
-    presentation_predictor = Class1PresentationPredictor(
-        models=[presentation_model],
-        allele_to_sequence=AFFINITY_PREDICTOR.allele_to_sequence)
-
-    mms_allele_encoding = MultipleAlleleEncoding(
-        experiment_names=["experiment1"] * len(mms_train_df),
-        experiment_to_allele_list={
-            "experiment1": alleles,
-        }, max_alleles_per_experiment=6,
-        allele_to_sequence=AFFINITY_PREDICTOR.allele_to_sequence)
-    allele_encoding = copy.deepcopy(mms_allele_encoding)
-    if affinity_train_df is not None:
-        allele_encoding.append_alleles(affinity_train_df.allele.values)
-        train_df = pandas.concat([mms_train_df, affinity_train_df],
-            ignore_index=True, sort=False)
-    else:
-        train_df = mms_train_df
-
-    allele_encoding = allele_encoding.compact()
-    mms_allele_encoding = mms_allele_encoding.compact()
-
-    pre_predictions = presentation_model.predict(
-        peptides=mms_train_df.peptide.values,
-        allele_encoding=mms_allele_encoding).score
-
-    expected_pre_predictions = from_ic50(affinity_model.predict(
-        peptides=numpy.repeat(mms_train_df.peptide.values, len(alleles)),
-        allele_encoding=mms_allele_encoding.allele_encoding, )).reshape(
-            (-1, len(alleles)))
-    assert_allclose(pre_predictions, expected_pre_predictions, rtol=1e-4)
-
-    random_peptides_encodable = EncodableSequences.create(
-        random_peptides(10000, 9))
-
-    original_motif = make_motif(
-        presentation_predictor=presentation_predictor,
-        peptides=random_peptides_encodable,
-        allele=refine_allele)
-    print("Original motif proline-3 rate: ", original_motif.loc[3, "P"])
-    assert_less(original_motif.loc[3, "P"], 0.1)
-
-    iteration_box = [0]
+    progress(label="Done fitting first round")
 
-    def progress(label = None):
-        if label is None:
-            label = str(iteration_box[0])
-            iteration_box[0] += 1
-        print("*** iteration ", label, "***")
-        predictions_df = presentation_predictor.predict_to_dataframe(
-            peptides=mms_train_df.peptide.values,
-            alleles=mms_allele_encoding)
-        merged_df = pandas.merge(mms_train_df, predictions_df, on="peptide")
-        merged_hit_df = merged_df.loc[merged_df.hit == 1.0]
-        correct_allele_fraction = (
-            merged_hit_df.allele == merged_hit_df.true_allele).mean()
-        print("Correct allele fraction", correct_allele_fraction)
-        print(
-            "Mean score/affinity for hit",
-            merged_df.loc[merged_df.hit == 1.0].score.mean(),
-            merged_df.loc[merged_df.hit == 1.0].affinity.mean())
-        print(
-            "Mean score/affinity for decoy",
-            merged_df.loc[merged_df.hit == 0.0].score.mean(),
-            merged_df.loc[merged_df.hit == 0.0].affinity.mean())
-        auc = roc_auc_score(merged_df.hit.values, merged_df.score.values)
-        print("AUC", auc)
-
-        motif = make_motif(
-            presentation_predictor=presentation_predictor,
-            peptides=random_peptides_encodable,
-            allele=refine_allele,
-            master_allele_encoding=allele_encoding.allele_encoding)
-        print("Proline-3 rate: ", motif.loc[3, "P"])
-
-        return (auc, correct_allele_fraction)
-
-    (pre_auc, pre_correct_allele_fraction) = progress(label="Pre fitting")
-    presentation_model.fit(peptides=train_df.peptide.values,
-        labels=train_df.label.values,
-        inequalities=train_df.measurement_inequality.values,
-        affinities_mask=train_df.is_affinity.values,
+    presentation_model.set_trainable(trainable_affinity_predictor=True)
+    presentation_model.hyperparameters['learning_rate'] = 1e-4
+    presentation_model.fit(
+        peptides=train_df.peptide.values,
+        targets=train_df.hit.values,
         allele_encoding=allele_encoding,
         progress_callback=progress)
 
-    (post_auc, post_correct_allele_fraction) = progress(label="Done fitting")
+
+    (post_auc, post_correct_allele_fraction) = progress(
+        label="Done fitting second round")
 
     final_motif = make_motif(
         presentation_predictor=presentation_predictor,
@@ -445,6 +283,7 @@
     assert_greater(final_motif.loc[3, "P"], original_motif.loc[3, "P"])
 
 
+
 def Xtest_real_data_multiallelic_refinement(max_epochs=10):
     """
     Test on real data that we can learn that HLA-A*02:20 has a preference K at
@@ -550,7 +389,7 @@
 
     presentation_model.fit(
         peptides=combined_train_df.peptide.values,
-        labels=combined_train_df.label.values,
+        targets=combined_train_df.label.values,
         allele_encoding=allele_encoding,
         affinities_mask=combined_train_df.is_affinity.values,
         inequalities=combined_train_df.measurement_inequality.values,
diff --git a/test/test_class1_presentation_predictor.py b/test/test_class1_presentation_predictor.py
index 11e2a3e2..55d16014 100644
--- a/test/test_class1_presentation_predictor.py
+++ b/test/test_class1_presentation_predictor.py
@@ -130,7 +130,7 @@
         train_df.peptide.values, alleles=["HLA-A*02:20"])
     model.fit(
         peptides=train_df.peptide.values,
-        labels=train_df.label.values,
+        targets=train_df.label.values,
         allele_encoding=allele_encoding)
     train_df["updated_score"] = new_predictor.predict(
         train_df.peptide.values,
-- 
GitLab
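
Usage sketch (not part of the patch): a minimal example of the build/fit/predict
flow this commit introduces, based on the signatures visible in the diff and its
tests. It assumes the standard mhcflurry model downloads are installed; the
peptides, flanks, targets, and allele list below are hypothetical placeholders,
and a real fit would need many hits and decoys.

    from mhcflurry import Class1AffinityPredictor
    from mhcflurry.class1_cleavage_predictor import Class1CleavagePredictor
    from mhcflurry.class1_presentation_neural_network import (
        Class1PresentationNeuralNetwork)
    from mhcflurry.class1_presentation_predictor import (
        Class1PresentationPredictor)
    from mhcflurry.allele_encoding import MultipleAlleleEncoding
    from mhcflurry.flanking_encoding import FlankingEncoding

    # Pretrained component models (assumed available via mhcflurry-downloads).
    affinity_predictor = Class1AffinityPredictor.load(max_models=1)
    cleavage_predictor = Class1CleavagePredictor.load(max_models=1)

    # Assemble the presentation network around the component models.
    model = Class1PresentationNeuralNetwork(include_cleavage=True)
    model.build(
        affinity_model=affinity_predictor.class1_pan_allele_models[0],
        cleavage_model=cleavage_predictor.models[0])

    # Hypothetical multiallelic mass spec data: hits (1.0) and decoys (0.0).
    peptides = ["SIINFEKLL", "QLKNGGLFV"]
    targets = [1.0, 0.0]
    n_flanks = ["AGHK", "TRWQ"]
    c_flanks = ["LMNP", "GHAA"]
    alleles = ["HLA-A*02:01", "HLA-B*27:01", "HLA-C*07:01"]

    allele_encoding = MultipleAlleleEncoding(
        experiment_names=["experiment1"] * len(peptides),
        experiment_to_allele_list={"experiment1": alleles},
        max_alleles_per_experiment=6,
        allele_to_sequence=affinity_predictor.allele_to_sequence).compact()
    flanking_encoding = FlankingEncoding(
        peptides=peptides, n_flanks=n_flanks, c_flanks=c_flanks)

    model.fit(
        targets=targets,
        peptides=peptides,
        allele_encoding=allele_encoding,
        flanking_encoding=flanking_encoding)

    # Ensemble-level prediction: best allele and presentation score per peptide.
    predictor = Class1PresentationPredictor(
        models=[model],
        allele_to_sequence=affinity_predictor.allele_to_sequence)
    df = predictor.predict_to_dataframe(
        peptides=peptides,
        alleles=allele_encoding,
        n_flanks=n_flanks,
        c_flanks=c_flanks)
    print(df[["peptide", "allele", "score"]])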