diff --git a/mhcflurry/allele_encoding.py b/mhcflurry/allele_encoding.py
index 273f8fcb4f50c45b833274a6c955973ec72b2a0a..26e450819dceb12be6af4f94fb2f6ae8a67f7c6b 100644
--- a/mhcflurry/allele_encoding.py
+++ b/mhcflurry/allele_encoding.py
@@ -150,8 +150,8 @@ class AlleleEncoding(object):
 class MultipleAlleleEncoding(object):
     def __init__(
             self,
-            experiment_names,
-            experiment_to_allele_list,
+            experiment_names=[],
+            experiment_to_allele_list={},
             max_alleles_per_experiment=6,
             allele_to_sequence=None,
             borrow_from=None):
@@ -194,6 +194,12 @@ class MultipleAlleleEncoding(object):
         return self.allele_encoding.indices.values.reshape(
             (-1, self.max_alleles_per_experiment))

+    @property
+    def alleles(self):
+        return numpy.reshape(
+            self.allele_encoding.alleles.values,
+            (-1, self.max_alleles_per_experiment))
+
     def compact(self):
         result = copy(self)
         result.allele_encoding = self.allele_encoding.compact()
diff --git a/mhcflurry/class1_ligandome_predictor.py b/mhcflurry/class1_ligandome_predictor.py
index 32c2c1df042fb287043e57823cc1473951b05f9d..b20b334ac4a47a02ff1e969aa57947cea85f60dd 100644
--- a/mhcflurry/class1_ligandome_predictor.py
+++ b/mhcflurry/class1_ligandome_predictor.py
@@ -2,20 +2,24 @@ from __future__ import print_function

 import time
 import collections
-from functools import partial
+from six import string_types

 import numpy
 import pandas
+import mhcnames
+import hashlib

 from .hyperparameters import HyperparameterDefaults
 from .class1_neural_network import Class1NeuralNetwork, DEFAULT_PREDICT_BATCH_SIZE
 from .encodable_sequences import EncodableSequences
 from .regression_target import from_ic50, to_ic50
 from .random_negative_peptides import RandomNegativePeptides
-from .allele_encoding import MultipleAlleleEncoding
+from .allele_encoding import MultipleAlleleEncoding, AlleleEncoding
+from .auxiliary_input import AuxiliaryInputEncoder
 from .custom_loss import (
     MSEWithInequalities,
-    MultiallelicMassSpecLoss)
+    MultiallelicMassSpecLoss,
+    ZeroLoss)


 class Class1LigandomePredictor(object):
@@ -26,8 +30,7 @@ class Class1LigandomePredictor(object):
             'alignment_method': 'left_pad_centered_right_pad',
             'max_length': 15,
         },
-        additional_dense_layers=[],
-        additional_dense_activation="sigmoid",
+        max_alleles=6,
     )
     """
     Hyperparameters (and their default values) that affect the neural network
@@ -55,7 +58,8 @@ class Class1LigandomePredictor(object):
     """

     compile_hyperparameter_defaults = HyperparameterDefaults(
-        loss_delta=0.2,
+        loss_multiallelic_mass_spec_delta=0.2,
+        loss_multiallelic_mass_spec_multiplier=1.0,
         optimizer="rmsprop",
         learning_rate=None,
     )
@@ -64,8 +68,9 @@ class Class1LigandomePredictor(object):
     used.
     """

-    allele_features_hyperparameter_defaults = HyperparameterDefaults(
-        allele_features_include_gene=True,
+    auxiliary_input_hyperparameter_defaults = HyperparameterDefaults(
+        auxiliary_input_features=["gene"],
+        auxiliary_input_feature_parameters={},
     )
     """
     Allele feature hyperparameters.
@@ -75,7 +80,7 @@ class Class1LigandomePredictor(object):
         fit_hyperparameter_defaults).extend(
         early_stopping_hyperparameter_defaults).extend(
         compile_hyperparameter_defaults).extend(
-        allele_features_hyperparameter_defaults)
+        auxiliary_input_hyperparameter_defaults)

     def __init__(
             self,
@@ -99,6 +104,8 @@ class Class1LigandomePredictor(object):
             self.hyperparameters)

         self.fit_info = []
+        self.allele_to_sequence = class1_affinity_predictor.allele_to_sequence
+        self.allele_representation_hash = None

     @staticmethod
     def make_network(pan_allele_class1_neural_networks, hyperparameters):
@@ -110,12 +117,16 @@
             Flatten,
             RepeatVector,
             concatenate,
-            Reshape,
+            Activation,
             Lambda,
+            Add,
             Embedding)
         from keras.models import Model
+        import keras.initializers

-        networks = [model.network() for model in pan_allele_class1_neural_networks]
+        networks = [
+            model.network() for model in pan_allele_class1_neural_networks
+        ]
         merged_ensemble = Class1NeuralNetwork.merge(
             networks, merge_method="average")

@@ -123,20 +134,22 @@
         peptide_shape = tuple(
             int(x) for x in K.int_shape(merged_ensemble.inputs[0])[1:])

-        input_alleles = Input(shape=(6,), name="allele")  # up to 6 alleles
+        input_alleles = Input(
+            shape=(hyperparameters['max_alleles'],), name="allele")
         input_peptides = Input(
             shape=peptide_shape,
             dtype='float32',
             name='peptide')

         peptides_flattened = Flatten()(input_peptides)
-        peptides_repeated = RepeatVector(6)(peptides_flattened)
+        peptides_repeated = RepeatVector(hyperparameters['max_alleles'])(
+            peptides_flattened)

         allele_representation = Embedding(
             name="allele_representation",
             input_dim=64,  # arbitrary, how many alleles to have room for
             output_dim=1029,
-            input_length=6,
+            input_length=hyperparameters['max_alleles'],
             trainable=False,
             mask_zero=True)(input_alleles)

@@ -189,27 +202,69 @@
             layer_name_to_new_node[layer.name] = node

-        affinity_predictor_output = Lambda(lambda x: x[:, 0])(node)
+        affinity_predictor_matrix_output = node
+
+        affinity_predictor_output = Lambda(
+            lambda x: x[:, 0], name="affinity_output")(
+            affinity_predictor_matrix_output)
+
+        """
+        layer = Dense(8, activation="sigmoid", kernel_initializer=keras.initializers.RandomNormal(mean=1.0/8.0, stddev=1e-5), use_bias=False)
+        lifted = TimeDistributed(layer, name="ligandome_hidden1")
+        node = lifted(affinity_predictor_matrix_output)
+        """
+
+        auxiliary_input = None
+        if hyperparameters['auxiliary_input_features']:
+            auxiliary_input = Input(
+                shape=(
+                    hyperparameters['max_alleles'],
+                    len(
+                        AuxiliaryInputEncoder.get_columns(
+                            hyperparameters['auxiliary_input_features'],
+                            feature_parameters=hyperparameters[
+                                'auxiliary_input_feature_parameters']))),
+                dtype="float32",
+                name="auxiliary")
+            node = concatenate(
+                [node, auxiliary_input], name="affinities_with_auxiliary")
+
+        #layer = Dense(1, activation="linear", kernel_initializer=keras.initializers.RandomNormal(mean=0.0, stddev=1e-5), use_bias=False)
+        layer = Dense(1, activation="tanh")
+        lifted = TimeDistributed(layer, name="ligandome_output")
+        ligandome_adjustment = lifted(node)

-        for (i, layer_size) in enumerate(
-                hyperparameters['additional_dense_layers']):
-            layer = Dense(
-                layer_size,
-                activation=hyperparameters['additional_dense_activation'])
-            lifted = TimeDistributed(layer)
-            node = lifted(node)
+        """
+        weights = layers[-1].get_weights()
+        layer = Dense(1, activation="sigmoid", kernel_initializer=keras.initializers.Constant(weights[0]), bias_initializer=keras.initializers.Constant(weights[1]))
+        lifted = TimeDistributed(layer, name="ligandome_output")
+        ligandome_output = lifted(prev_node)
+        """
+
+        def logit(x):
+            import tensorflow as tf
+            return - tf.log(1. / x - 1.)
+
+        ligandome_output_pre_sigmoid = Add()([Lambda(logit)(affinity_predictor_matrix_output), ligandome_adjustment])
+        ligandome_output = Activation("sigmoid")(ligandome_output_pre_sigmoid)
+
+        #ligandome_output = affinity_predictor_matrix_output

-        layer = Dense(1, activation="sigmoid")
-        lifted = TimeDistributed(layer)
-        ligandome_output = lifted(node)

         #output_node = concatenate([
         #    affinity_predictor_output, ligandome_output
         #], name="combined_output")

         network = Model(
-            inputs=[input_peptides, input_alleles],
-            outputs=[affinity_predictor_output, ligandome_output],
+            inputs=[
+                input_peptides,
+                input_alleles,
+            ] + ([] if auxiliary_input is None else [auxiliary_input]),
+            outputs=[
+                affinity_predictor_output,
+                ligandome_output,
+                affinity_predictor_matrix_output
+            ],
             name="ligandome",
         )
         network.summary()
@@ -270,31 +325,42 @@

         import keras.backend as K

+        assert isinstance(allele_encoding, MultipleAlleleEncoding)
+        assert (
+            allele_encoding.max_alleles_per_experiment ==
+            self.hyperparameters['max_alleles'])
+
         #for layer in self.network._layers[:8]:
         #    print("Setting non trainable", layer)
         #    layer.trainable = False
         #import ipdb ; ipdb.set_trace()

         encodable_peptides = EncodableSequences.create(peptides)
+        del peptides
+
         if labels is not None:
             labels = numpy.array(labels, copy=False)
+        if inequalities is not None:
+            inequalities = numpy.array(inequalities, copy=True)
+        else:
+            inequalities = numpy.tile("=", len(labels))
         if affinities_mask is not None:
             affinities_mask = numpy.array(affinities_mask, copy=False)
-        if inequalities is not None:
-            inequalities = numpy.array(inequalities, copy=False)
+        else:
+            affinities_mask = numpy.tile(False, len(labels))
+        inequalities[~affinities_mask] = "="

         random_negatives_planner = RandomNegativePeptides(
             **RandomNegativePeptides.hyperparameter_defaults.subselect(
                 self.hyperparameters))
         random_negatives_planner.plan(
-            peptides=encodable_peptides.sequences[affinities_mask],
-            affinities=(labels[affinities_mask]),
-            alleles=numpy.reshape(
-                allele_encoding.allele_encoding.alleles.values,
-                (-1, allele_encoding.max_alleles_per_experiment))[
-                affinities_mask, 0
-            ].flatten(),
-            inequalities=inequalities[affinities_mask])
+            peptides=encodable_peptides.sequences,
+            affinities=numpy.where(affinities_mask, labels, to_ic50(labels)),
+            alleles=[
+                numpy.random.choice(row[row != numpy.array(None)])
+                for row in allele_encoding.alleles
+            ],
+            inequalities=inequalities)

         peptide_input = self.peptides_to_network_input(encodable_peptides)

@@ -302,15 +368,14 @@
         if shuffle_permutation is None:
             shuffle_permutation = numpy.random.permutation(len(labels))
         peptide_input = peptide_input[shuffle_permutation]
+        peptides = encodable_peptides.sequences[shuffle_permutation]
         allele_encoding.shuffle_in_place(shuffle_permutation)
         labels = labels[shuffle_permutation]
         inequalities = inequalities[shuffle_permutation]
         affinities_mask = affinities_mask[shuffle_permutation]
-        del peptides
         del encodable_peptides  # Optional optimization
-        allele_encoding = allele_encoding.compact()

         (allele_encoding_input, allele_representations) = (
             self.allele_encoding_to_network_input(allele_encoding))

@@ -318,34 +383,37 @@
             'peptide': peptide_input,
             'allele': allele_encoding_input,
         }
+        if self.hyperparameters['auxiliary_input_features']:
+            auxiliary_encoder = AuxiliaryInputEncoder(
+                alleles=allele_encoding.alleles,
+                peptides=peptides)
+            x_dict_without_random_negatives[
+                'auxiliary'
+            ] = auxiliary_encoder.get_array(
+                features=self.hyperparameters['auxiliary_input_features'],
+                feature_parameters=self.hyperparameters[
+                    'auxiliary_input_feature_parameters'])

         y1 = numpy.zeros(shape=len(labels))
-        if affinities_mask is not None:
-            y1[affinities_mask] = from_ic50(labels[affinities_mask])
-
-        random_negatives_allele_encoding = None
-        if allele_encoding is not None:
-            random_negative_alleles = random_negatives_planner.get_alleles()
-            random_negatives_allele_encoding = MultipleAlleleEncoding(
-                experiment_names=random_negative_alleles,
-                experiment_to_allele_list=dict(
-                    (a, [a]) for a in random_negative_alleles),
-                max_alleles_per_experiment=(
-                    allele_encoding.max_alleles_per_experiment),
-                borrow_from=allele_encoding.allele_encoding)
+        y1[affinities_mask] = from_ic50(labels[affinities_mask])
+
+        random_negative_alleles = random_negatives_planner.get_alleles()
+        random_negatives_allele_encoding = MultipleAlleleEncoding(
+            experiment_names=random_negative_alleles,
+            experiment_to_allele_list=dict(
+                (a, [a]) for a in random_negative_alleles),
+            max_alleles_per_experiment=(
+                allele_encoding.max_alleles_per_experiment),
+            borrow_from=allele_encoding.allele_encoding)

         num_random_negatives = random_negatives_planner.get_total_count()

-        if inequalities is not None:
-            # Reverse inequalities because from_ic50() flips the direction
-            # (i.e. lower affinity results in higher y values).
-            adjusted_inequalities = pandas.Series(inequalities).map({
-                "=": "=",
-                ">": "<",
-                "<": ">",
-            }).values
-        else:
-            adjusted_inequalities = numpy.tile("=", len(y1))
-
+        # Reverse inequalities because from_ic50() flips the direction
+        # (i.e. lower affinity results in higher y values).
+        adjusted_inequalities = pandas.Series(inequalities).map({
+            "=": "=",
+            ">": "<",
+            "<": ">",
+        }).values
         adjusted_inequalities[~affinities_mask] = ">"

         # Note: we are using "<" here not ">" because the inequalities are
@@ -369,7 +437,9 @@
             inequalities=adjusted_inequalities_with_random_negative)

         mms_loss = MultiallelicMassSpecLoss(
-            delta=self.hyperparameters['loss_delta'])
+            delta=self.hyperparameters['loss_multiallelic_mass_spec_delta'],
+            multiplier=self.hyperparameters[
+                'loss_multiallelic_mass_spec_multiplier'])
         y2 = labels.copy()
         y2[affinities_mask] = -1
         y2_with_random_negatives = numpy.concatenate([
@@ -380,9 +450,10 @@
         fit_info = collections.defaultdict(list)

-        self.set_allele_representations(allele_representations)
+        allele_representations_hash = self.set_allele_representations(
+            allele_representations)
         self.network.compile(
-            loss=[affinities_loss.loss, mms_loss.loss],
+            loss=[affinities_loss.loss, mms_loss.loss, ZeroLoss.loss],
             optimizer=self.hyperparameters['optimizer'])
         if self.hyperparameters['learning_rate'] is not None:
             K.set_value(
@@ -422,6 +493,20 @@
                     random_negatives_allele_encoding)[0],
                 x_dict_without_random_negatives['allele']
             ])
+            if 'auxiliary' in x_dict_without_random_negatives:
+                random_negative_auxiliary_encoder = AuxiliaryInputEncoder(
+                    alleles=random_negatives_allele_encoding.alleles,
+                    #peptides=random_negative_peptides.sequences
+                )
+                x_dict_with_random_negatives['auxiliary'] = (
+                    numpy.concatenate([
+                        random_negative_auxiliary_encoder.get_array(
+                            features=self.hyperparameters[
+                                'auxiliary_input_features'],
+                            feature_parameters=self.hyperparameters[
+                                'auxiliary_input_feature_parameters']),
+                        x_dict_without_random_negatives['auxiliary']
+                    ]))
         else:
             x_dict_with_random_negatives = (
                 x_dict_without_random_negatives)
@@ -433,17 +518,27 @@
                     "peptide"
                 ][:num_random_negatives] = random_negative_peptides_encoding

+            #def generator(x, ys, batch_size):
+            #    # Each batch should have a mix of:
+            #    # - random negative peptides
+            #    # - affinity measurements (binder + non-binder)
+            #    # - multiallelic mass spec
+
            # TODO: need to use fit_generator to keep each minibatch corresponding
            # to a single experiment
+            self.assert_allele_representations_hash(allele_representations_hash)
+            #import ipdb ; ipdb.set_trace()
             fit_history = self.network.fit(
                 x_dict_with_random_negatives,
-                [encoded_y1, encoded_y2],
+                [encoded_y1, encoded_y2, encoded_y2],
                 shuffle=True,
                 batch_size=self.hyperparameters['minibatch_size'],
                 verbose=verbose,
                 epochs=i + 1,
                 initial_epoch=i,
                 validation_split=self.hyperparameters['validation_split'],
+            )

             epoch_time = time.time() - epoch_start
@@ -467,6 +562,7 @@
                 last_progress_print = time.time()

             if self.hyperparameters['validation_split']:
+                #import ipdb ; ipdb.set_trace()
                 val_loss = fit_info['val_loss'][-1]
                 if min_val_loss is None or (
                         val_loss < min_val_loss -
@@ -496,32 +592,78 @@
                     progress_callback()

         fit_info["time"] = time.time() - start
-        fit_info["num_points"] = len(encodable_peptides)
+        fit_info["num_points"] = len(labels)
         self.fit_info.append(dict(fit_info))

     def predict(
             self,
             peptides,
-            allele_encoding,
+            allele=None,
+            alleles=None,
             output="affinities",
             batch_size=DEFAULT_PREDICT_BATCH_SIZE):
+
+        if isinstance(peptides, string_types):
+            raise TypeError("peptides must be a list or array, not a string")
+        if isinstance(alleles, string_types):
+            raise TypeError(
+                "alleles must be an iterable, AlleleEncoding, or "
+                "MultipleAlleleEncoding")
+        if allele is None and alleles is None:
+            raise ValueError("Must specify 'allele' or 'alleles'.")
+
+        if allele is not None:
+            if alleles is not None:
+                raise ValueError("Specify exactly one of allele or alleles")
+            normalized_allele = mhcnames.normalize_allele_name(allele)
+            alleles = [normalized_allele] * len(peptides)
+
+        if not isinstance(alleles, MultipleAlleleEncoding):
+            new_alleles = MultipleAlleleEncoding(
+                allele_to_sequence=self.allele_to_sequence,
+                max_alleles_per_experiment=self.hyperparameters['max_alleles'])
+            new_alleles.append_alleles(alleles)
+            alleles = new_alleles
+
+        peptides = EncodableSequences.create(peptides)
+
         (allele_encoding_input, allele_representations) = (
-            self.allele_encoding_to_network_input(allele_encoding.compact()))
+            self.allele_encoding_to_network_input(alleles.compact()))

         self.set_allele_representations(allele_representations)
         x_dict = {
             'peptide': self.peptides_to_network_input(peptides),
             'allele': allele_encoding_input,
         }
-        predictions = self.network.predict(x_dict, batch_size=batch_size)
+        if self.hyperparameters['auxiliary_input_features']:
+            auxiliary_encoder = AuxiliaryInputEncoder(
+                alleles=alleles.alleles,
+                peptides=peptides.sequences)
+            x_dict[
+                'auxiliary'
+            ] = auxiliary_encoder.get_array(
+                features=self.hyperparameters['auxiliary_input_features'],
+                feature_parameters=self.hyperparameters[
+                    'auxiliary_input_feature_parameters'])
+
+        predictions = [
+            numpy.squeeze(output)
+            for output in self.network.predict(x_dict, batch_size=batch_size)
+        ]
+        predictions[0] = to_ic50(predictions[0])
+        predictions[2] = to_ic50(predictions[2])

         if output == "affinities":
-            predictions = to_ic50(predictions[0])
-        elif output == "ligandome_presentation":
+            predictions = predictions[0]
+        elif output == "ligandome":
             predictions = predictions[1]
+        elif output == "affinities_matrix":
+            predictions = predictions[2]
         elif output == "both":
+            predictions = predictions[:2]
+        elif output == "all":
             pass
         else:
             raise NotImplementedError("Unknown output", output)
-        return numpy.squeeze(predictions)
+        return predictions

     def set_allele_representations(self, allele_representations):
         """
@@ -577,13 +719,43 @@
         original_model.fit_generator = throw

         layer.set_weights([reshaped])
+        self.allele_representation_hash = hashlib.sha1(
+            allele_representations.tobytes()).hexdigest()
+        return self.allele_representation_hash
+
+    def assert_allele_representations_hash(self, value):
+        numpy.testing.assert_equal(self.allele_representation_hash, value)
+
+    def __getstate__(self):
+        """
+        serialize to a dict. Model weights are included. For pickle support.
+
+        Returns
+        -------
+        dict
+
+        """
+        result = dict(self.__dict__)
+        result['network'] = None
+        result['network_json'] = None
+        result['network_weights'] = None
+
+        if self.network is not None:
+            result['network_json'] = self.network.to_json()
+            result['network_weights'] = self.network.get_weights()
+        return result
+
+    def __setstate__(self, state):
+        """
+        Deserialize. For pickle support.
+        """
+        network_json = state.pop("network_json")
+        network_weights = state.pop("network_weights")
+        self.__dict__.update(state)
+        if network_json is not None:
+            import keras.models
+            self.network = keras.models.model_from_json(network_json)
+            if network_weights is not None:
+                self.network.set_weights(network_weights)
+
-    @staticmethod
-    def allele_features(allele_names, hyperparameters):
-        df = pandas.DataFrame({"allele_name": allele_names})
-        if hyperparameters['allele_features_include_gene']:
-            # TODO: support other organisms.
-            for gene in ["A", "B", "C"]:
-                df[gene] = df.allele_name.str.startswith(
-                    "HLA-%s" % gene).astype(float)
-        return gene
diff --git a/mhcflurry/custom_loss.py b/mhcflurry/custom_loss.py
index 2434bfaff5b2c55bb9845ee533c79dfd68398e6f..65a6586b54b394f972596bc5d2e05df8f52a15a5 100644
--- a/mhcflurry/custom_loss.py
+++ b/mhcflurry/custom_loss.py
@@ -92,12 +92,12 @@
       y_pred is greater or less than y_true.

    between 2 - 3:
-       Treated as a "<" inequality. Penalty (y_pred - (y_true - 2))**2 is
-       applied only if y_pred is greater than y_true - 2.
+       Treated as a ">" inequality. Penalty (y_pred - (y_true - 2))**2 is
+       applied only if y_pred is less than y_true - 2.

    between 4 - 5:
-       Treated as a ">" inequality. Penalty (y_pred - (y_true - 4))**2 is
-       applied only if y_pred is less than y_true - 4.
+       Treated as a "<" inequality. Penalty (y_pred - (y_true - 4))**2 is
+       applied only if y_pred is greater than y_true - 4.
    """
    name = "mse_with_inequalities"
    supports_inequalities = True
@@ -240,11 +240,12 @@
    supports_inequalities = True
    supports_multiple_outputs = False

-    def __init__(self, delta=0.2):
+    def __init__(self, delta=0.2, multiplier=1.0):
         self.delta = delta
+        self.multiplier = multiplier

     @staticmethod
-    def encode_y(y, affinities_mask=None, inequalities=None):
+    def encode_y(y):
         encoded = pandas.Series(y, dtype="float32", copy=True)
         assert all(item in (-1.0, 1.0, 0.0) for item in encoded), set(y)
         print(
@@ -262,9 +263,25 @@
         pos_max = tf.reduce_max(pos, axis=1)
         neg = tf.boolean_mask(y_pred, tf.math.equal(y_true, 0.0))
         term = tf.reshape(neg, (-1, 1)) - pos_max + self.delta
-        result = tf.reduce_sum(tf.maximum(0.0, term) ** 2)
-        return result
+        result = tf.reduce_sum(tf.maximum(0.0, term) ** 2) / tf.cast(
+            tf.shape(term)[0], tf.float32) * self.multiplier
+        return tf.where(tf.is_nan(result), 0.0, result)
+
+
+class ZeroLoss(Loss):
+    """
+    """
+    name = "zero_loss"
+    supports_inequalities = False
+    supports_multiple_outputs = False
+    @staticmethod
+    def encode_y(y):
+        return y
+
+    @staticmethod
+    def loss(y_true, y_pred):
+        return 0.0


 def check_shape(name, arr, expected_shape):
@@ -284,5 +301,7 @@


 # Register custom losses.
-for cls in [MSEWithInequalities, MSEWithInequalitiesAndMultipleOutputs]:
+for cls in [MSEWithInequalities, MSEWithInequalitiesAndMultipleOutputs, MultiallelicMassSpecLoss, ZeroLoss]:
     CUSTOM_LOSSES[cls.name] = cls()
+
+
diff --git a/test/test_class1_ligandome_predictor.py b/test/test_class1_ligandome_predictor.py
index 41ff42ff12f711953498b7b727d2dc546e965d85..0561998ab4d114a320bd80d7b529baba4de766f0 100644
--- a/test/test_class1_ligandome_predictor.py
+++ b/test/test_class1_ligandome_predictor.py
@@ -21,6 +21,7 @@ logging.getLogger('matplotlib').disabled = True
 import pandas
 import argparse
 import sys
+import copy
 from functools import partial

 from numpy.testing import assert_, assert_equal, assert_allclose
@@ -153,7 +154,7 @@
         )
         contributions.append(contribution)
     contributions = numpy.array(contributions)
-    expected1 = contributions.sum()
+    expected1 = contributions.sum() / len(contributions)

     # reference implementation 2: numpy
     pos = numpy.array([
@@ -164,7 +165,8 @@
     neg = y_pred[(y_true == 0.0).astype(bool)]

     expected2 = (
-        numpy.maximum(0, neg.reshape((-1, 1)) - pos + delta)**2).sum()
+        numpy.maximum(0, neg.reshape((-1, 1)) - pos + delta)**2).sum() / (
+            len(pos) * len(neg))

     yield numpy.testing.assert_almost_equal, expected1, expected2, 4

@@ -226,7 +228,7 @@
     return matrix


-def test_real_data_multiallelic_refinement(max_epochs=10):
+def Xtest_real_data_multiallelic_refinement(max_epochs=10):
     ms_df = pandas.read_csv(
         get_path("data_mass_spec_annotated", "annotated_ms.csv.bz2"))
     ms_df = ms_df.loc[
@@ -240,10 +242,20 @@
         del sample_table[col]
     sample_table["alleles"] = sample_table.hla.str.split()

-    multi_train_df = ms_df.loc[
+    multi_train_hit_df = ms_df.loc[
         ms_df.sample_id == "RA957"
     ].drop_duplicates("peptide")[["peptide", "sample_id"]].reset_index(drop=True)
-    multi_train_df["label"] = 1.0
+    multi_train_hit_df["label"] = 1.0
+
+    multi_train_decoy_df = ms_df.loc[
+        (ms_df.sample_id == "CD165") &
+        (~ms_df.peptide.isin(multi_train_hit_df.peptide.unique()))
+    ].drop_duplicates("peptide")[["peptide"]]
+    (multi_train_decoy_df["sample_id"],) = multi_train_hit_df.sample_id.unique()
+    multi_train_decoy_df["label"] = 0.0
+
+    multi_train_df = pandas.concat(
+        [multi_train_hit_df, multi_train_decoy_df], ignore_index=True)
     multi_train_df["is_affinity"] = False

     multi_train_alleles = set()
@@ -281,6 +293,7 @@

     ligandome_predictor = Class1LigandomePredictor(
         pan_predictor,
+        auxiliary_input_features=[],
         max_ensemble_size=1,
         max_epochs=50,
         learning_rate=0.0001,
@@ -292,7 +305,7 @@
         ligandome_predictor.predict(
             output="affinities",
             peptides=combined_train_df.peptide.values,
-            allele_encoding=allele_encoding))
+            alleles=allele_encoding))

     (model,) = pan_predictor.class1_pan_allele_models
     expected_pre_predictions = from_ic50(
@@ -325,11 +338,220 @@
         progress_callback=update_motifs,
     )

+    import ipdb ; ipdb.set_trace()
+
+
+def test_synthetic_allele_refinement_with_affinity_data(max_epochs=10):
+    refine_allele = "HLA-C*01:02"
+    alleles = [
+        "HLA-A*02:01", "HLA-B*27:01", "HLA-C*07:01",
+        "HLA-A*03:01", "HLA-B*15:01", refine_allele
+    ]
+    peptides_per_allele = [
+        2000, 1000, 500,
+        1500, 1200, 800,
+    ]
+
+    allele_to_peptides = dict(zip(alleles, peptides_per_allele))
+
+    length = 9
+
+    train_with_ms = pandas.read_csv(
+        get_path("data_curated", "curated_training_data.with_mass_spec.csv.bz2"))
+    train_no_ms = pandas.read_csv(get_path("data_curated",
+        "curated_training_data.no_mass_spec.csv.bz2"))
+
+    def filter_df(df):
+        df = df.loc[
+            (df.allele.isin(alleles)) &
+            (df.peptide.str.len() == length)
+        ]
+        return df
+
+    train_with_ms = filter_df(train_with_ms)
+    train_no_ms = filter_df(train_no_ms)
+
+    ms_specific = train_with_ms.loc[
+        ~train_with_ms.peptide.isin(train_no_ms.peptide)
+    ]
+
+    train_peptides = []
+    train_true_alleles = []
+    for allele in alleles:
+        peptides = ms_specific.loc[ms_specific.allele == allele].peptide.sample(
+            n=allele_to_peptides[allele])
+        train_peptides.extend(peptides)
+        train_true_alleles.extend([allele] * len(peptides))
+
+    hits_df = pandas.DataFrame({"peptide": train_peptides})
+    hits_df["true_allele"] = train_true_alleles
+    hits_df["hit"] = 1.0
+
+    decoys_df = hits_df.copy()
+    decoys_df["peptide"] = decoys_df.peptide.map(scramble_peptide)
+    decoys_df["true_allele"] = ""
+    decoys_df["hit"] = 0.0
+
+    mms_train_df = pandas.concat([hits_df, decoys_df], ignore_index=True)
+    mms_train_df["label"] = mms_train_df.hit
+    mms_train_df["is_affinity"] = False
+
+    affinity_train_df = pandas.read_csv(
+        get_path(
+            "models_class1_pan", "models.with_mass_spec/train_data.csv.bz2"))
+    affinity_train_df = affinity_train_df.loc[
+        affinity_train_df.allele.isin(alleles),
+        ["peptide", "allele", "measurement_inequality", "measurement_value"]]
+
+    affinity_train_df["label"] = affinity_train_df["measurement_value"]
+    del affinity_train_df["measurement_value"]
+    affinity_train_df["is_affinity"] = True
+
+    predictor = Class1LigandomePredictor(
+        PAN_ALLELE_PREDICTOR_NO_MASS_SPEC,
+        auxiliary_input_features=["gene"],
+        max_ensemble_size=1,
+        max_epochs=max_epochs,
+        learning_rate=0.0001,
+        patience=5,
+        min_delta=0.0,
+        random_negative_rate=1.0,
+        random_negative_constant=25)
+
+    mms_allele_encoding = MultipleAlleleEncoding(
+        experiment_names=["experiment1"] * len(mms_train_df),
+        experiment_to_allele_list={
+            "experiment1": alleles,
+        },
+        max_alleles_per_experiment=6,
+        allele_to_sequence=PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.allele_to_sequence,
+    )
+    allele_encoding = copy.deepcopy(mms_allele_encoding)
+    allele_encoding.append_alleles(affinity_train_df.allele.values)
+    allele_encoding = allele_encoding.compact()
+
+    train_df = pandas.concat(
+        [mms_train_df, affinity_train_df], ignore_index=True, sort=False)
+
+    pre_predictions = from_ic50(
+        predictor.predict(
+            output="affinities_matrix",
+            peptides=mms_train_df.peptide.values,
+            alleles=mms_allele_encoding))
+
+    (model,) = PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.class1_pan_allele_models
+    expected_pre_predictions = from_ic50(
+        model.predict(
+            peptides=numpy.repeat(mms_train_df.peptide.values, len(alleles)),
+            allele_encoding=mms_allele_encoding.allele_encoding,
+        )).reshape((-1, len(alleles)))
+    #import ipdb ; ipdb.set_trace()
+    mms_train_df["pre_max_prediction"] = pre_predictions.max(1)
+    pre_auc = roc_auc_score(mms_train_df.hit.values, mms_train_df.pre_max_prediction.values)
+    print("PRE_AUC", pre_auc)
+
+    assert_allclose(pre_predictions, expected_pre_predictions, rtol=1e-4)
+    motifs_history = []
+    random_peptides_encodable = make_random_peptides(10000, [9])
+    def update_motifs():
+        for allele in alleles:
+            motif = make_motif(allele, random_peptides_encodable)
+            motifs_history.append((allele, motif))
+
+    metric_rows = []
+
+    def progress():
+        (_, ligandome_prediction, affinities_predictions) = (
+            predictor.predict(
+                output="all",
+                peptides=mms_train_df.peptide.values,
+                alleles=mms_allele_encoding))
+        affinities_predictions = from_ic50(affinities_predictions)
+        for (kind, predictions) in [
+                ("affinities", affinities_predictions),
+                ("ligandome", ligandome_prediction)]:
+
+            mms_train_df["max_prediction"] = predictions.max(1)
+            mms_train_df["predicted_allele"] = pandas.Series(alleles).loc[
+                predictions.argmax(1).flatten()
+            ].values
+
+            print(kind)
+            print(predictions)
+
+            mean_predictions_for_hit = mms_train_df.loc[
+                mms_train_df.hit == 1.0
+            ].max_prediction.mean()
+            mean_predictions_for_decoy = mms_train_df.loc[
+                mms_train_df.hit == 0.0
+            ].max_prediction.mean()
+            correct_allele_fraction = (
+                mms_train_df.loc[mms_train_df.hit == 1.0].predicted_allele ==
+                mms_train_df.loc[mms_train_df.hit == 1.0].true_allele
+            ).mean()
+            auc = roc_auc_score(mms_train_df.hit.values, mms_train_df.max_prediction.values)
+
+            print(kind, "Mean prediction for hit", mean_predictions_for_hit)
+            print(kind, "Mean prediction for decoy", mean_predictions_for_decoy)
+            print(kind, "Correct predicted allele fraction", correct_allele_fraction)
+            print(kind, "AUC", auc)
+
+            metric_rows.append((
+                kind,
+                mean_predictions_for_hit,
+                mean_predictions_for_decoy,
+                correct_allele_fraction,
+                auc,
+            ))
+
+            update_motifs()
+
+        return (ligandome_prediction, auc)
+
+    print("Pre fitting:")
+    progress()
+    update_motifs()
+    print("Fitting...")
+
+    predictor.fit(
+        peptides=train_df.peptide.values,
+        labels=train_df.label.values,
+        inequalities=train_df.measurement_inequality.values,
+        affinities_mask=train_df.is_affinity.values,
+        allele_encoding=allele_encoding,
+        progress_callback=progress,
+    )
+
+    (predictions, final_auc) = progress()
+    print("Final AUC", final_auc)
+
+    update_motifs()
+
+    motifs = pandas.DataFrame(
+        motifs_history,
+        columns=[
+            "allele",
+            "motif",
+        ]
+    )
+
+    metrics = pandas.DataFrame(
+        metric_rows,
+        columns=[
+            "output",
+            "mean_predictions_for_hit",
+            "mean_predictions_for_decoy",
+            "correct_allele_fraction",
+            "auc"
+        ])
+
+    return (predictor, predictions, metrics, motifs)
+

 def Xtest_synthetic_allele_refinement(max_epochs=10):
@@ -387,12 +609,13 @@
     predictor = Class1LigandomePredictor(
         PAN_ALLELE_PREDICTOR_NO_MASS_SPEC,
-        additional_dense_layers=[8, 1],
         max_ensemble_size=1,
         max_epochs=max_epochs,
         learning_rate=0.0001,
         patience=5,
-        min_delta=0.0)
+        min_delta=0.0,
+        random_negative_rate=0.0,
+        random_negative_constant=0)

     allele_encoding = MultipleAlleleEncoding(
         experiment_names=["experiment1"] * len(train_df),
@@ -405,9 +628,9 @@

     pre_predictions = from_ic50(
         predictor.predict(
-            output="affinities",
+            output="affinities_matrix",
             peptides=train_df.peptide.values,
-            allele_encoding=allele_encoding))
+            alleles=allele_encoding))

     (model,) = PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.class1_pan_allele_models
     expected_pre_predictions = from_ic50(
@@ -436,45 +659,52 @@
     metric_rows = []

     def progress():
-        predictions = from_ic50(
+        (_, ligandome_prediction, affinities_predictions) = (
             predictor.predict(
-                output="affinities",
+                output="all",
                 peptides=train_df.peptide.values,
-                allele_encoding=allele_encoding))
-
-        train_df["max_prediction"] = predictions.max(1)
-        train_df["predicted_allele"] = pandas.Series(alleles).loc[
-            predictions.argmax(1).flatten()].values
-
-        print(predictions)
-
-        mean_predictions_for_hit = train_df.loc[
-            train_df.hit == 1.0
-        ].max_prediction.mean()
-        mean_predictions_for_decoy = train_df.loc[
-            train_df.hit == 0.0
-        ].max_prediction.mean()
-        correct_allele_fraction = (
-            train_df.loc[train_df.hit == 1.0].predicted_allele ==
-            train_df.loc[train_df.hit == 1.0].true_allele
-        ).mean()
-        auc = roc_auc_score(train_df.hit.values, train_df.max_prediction.values)
-
-        print("Mean prediction for hit", mean_predictions_for_hit)
-        print("Mean prediction for decoy", mean_predictions_for_decoy)
-        print("Correct predicted allele fraction", correct_allele_fraction)
-        print("AUC", auc)
-
-        metric_rows.append((
-            mean_predictions_for_hit,
-            mean_predictions_for_decoy,
-            correct_allele_fraction,
-            auc,
-        ))
-
-        update_motifs()
-
-        return (predictions, auc)
+                alleles=allele_encoding))
+        affinities_predictions = from_ic50(affinities_predictions)
+        for (kind, predictions) in [
+                ("affinities", affinities_predictions),
+                ("ligandome", ligandome_prediction)]:
+
+            train_df["max_prediction"] = predictions.max(1)
+            train_df["predicted_allele"] = pandas.Series(alleles).loc[
+                predictions.argmax(1).flatten()
+            ].values
+
+            print(kind)
+            print(predictions)
+
+            mean_predictions_for_hit = train_df.loc[
+                train_df.hit == 1.0
+            ].max_prediction.mean()
+            mean_predictions_for_decoy = train_df.loc[
+                train_df.hit == 0.0
+            ].max_prediction.mean()
+            correct_allele_fraction = (
+                train_df.loc[train_df.hit == 1.0].predicted_allele ==
+                train_df.loc[train_df.hit == 1.0].true_allele
+            ).mean()
+            auc = roc_auc_score(train_df.hit.values, train_df.max_prediction.values)
+
+            print(kind, "Mean prediction for hit", mean_predictions_for_hit)
+            print(kind, "Mean prediction for decoy", mean_predictions_for_decoy)
+            print(kind, "Correct predicted allele fraction", correct_allele_fraction)
+            print(kind, "AUC", auc)
+
+            metric_rows.append((
+                kind,
+                mean_predictions_for_hit,
+                mean_predictions_for_decoy,
+                correct_allele_fraction,
+                auc,
+            ))
+
+            update_motifs()
+
+        return (ligandome_prediction, auc)

     print("Pre fitting:")
     progress()
@@ -504,6 +734,7 @@
     metrics = pandas.DataFrame(
         metric_rows,
         columns=[
+            "output",
            "mean_predictions_for_hit",
            "mean_predictions_for_decoy",
            "correct_allele_fraction",