Commit 2e8dc426 authored by Tim O'Donnell

working on ligandome

parent 0eb0a4e1
@@ -5,11 +5,17 @@ import collections
 from functools import partial

 import numpy
+import pandas

 from .hyperparameters import HyperparameterDefaults
 from .class1_neural_network import Class1NeuralNetwork, DEFAULT_PREDICT_BATCH_SIZE
 from .encodable_sequences import EncodableSequences
+from .regression_target import from_ic50, to_ic50
+from .random_negative_peptides import RandomNegativePeptides
+from .custom_loss import (
+    MSEWithInequalities,
+    MSEWithInequalitiesAndMultipleOutputs,
+    MultiallelicMassSpecLoss)

 class Class1LigandomePredictor(object):
     network_hyperparameter_defaults = HyperparameterDefaults(
@@ -19,6 +25,8 @@ class Class1LigandomePredictor(object):
             'alignment_method': 'left_pad_centered_right_pad',
             'max_length': 15,
         },
+        additional_dense_layers=[],
+        additional_dense_activation="sigmoid",
     )
     """
     Hyperparameters (and their default values) that affect the neural network
@@ -29,9 +37,8 @@ class Class1LigandomePredictor(object):
         max_epochs=500,
         validation_split=0.1,
         early_stopping=True,
-        minibatch_size=128,
-        random_negative_rate=0.0,
-        random_negative_constant=0,
+        minibatch_size=128,).extend(
+        RandomNegativePeptides.hyperparameter_defaults
     )
     """
     Hyperparameters for neural network training.
@@ -47,7 +54,6 @@ class Class1LigandomePredictor(object):
     compile_hyperparameter_defaults = HyperparameterDefaults(
         loss_delta=0.2,
-        loss_alpha=None,
         optimizer="rmsprop",
         learning_rate=None,
     )
@@ -56,10 +62,18 @@ class Class1LigandomePredictor(object):
     used.
     """

+    allele_features_hyperparameter_defaults = HyperparameterDefaults(
+        allele_features_include_gene=True,
+    )
+    """
+    Allele feature hyperparameters.
+    """
+
     hyperparameter_defaults = network_hyperparameter_defaults.extend(
         fit_hyperparameter_defaults).extend(
         early_stopping_hyperparameter_defaults).extend(
-        compile_hyperparameter_defaults)
+        compile_hyperparameter_defaults).extend(
+        allele_features_hyperparameter_defaults)

     def __init__(
             self,
@@ -87,8 +101,15 @@ class Class1LigandomePredictor(object):
     @staticmethod
     def make_network(pan_allele_class1_neural_networks, hyperparameters):
         import keras.backend as K
-        from keras.layers import Input, TimeDistributed, Lambda, Flatten, RepeatVector, concatenate, Dropout, Reshape, Embedding
-        from keras.activations import sigmoid
+        from keras.layers import (
+            Input,
+            TimeDistributed,
+            Dense,
+            Flatten,
+            RepeatVector,
+            concatenate,
+            Reshape,
+            Embedding)
         from keras.models import Model

         networks = [model.network() for model in pan_allele_class1_neural_networks]
@@ -163,39 +184,28 @@ class Class1LigandomePredictor(object):
             layer_name_to_new_node[layer.name] = node

+        affinity_predictor_output = node
+
+        for (i, layer_size) in enumerate(
+                hyperparameters['additional_dense_layers']):
+            layer = Dense(
+                layer_size,
+                activation=hyperparameters['additional_dense_activation'])
+            lifted = TimeDistributed(layer)
+            node = lifted(node)
+
+        layer = Dense(1, activation="sigmoid")
+        lifted = TimeDistributed(layer)
+        ligandome_output = lifted(node)
+
         network = Model(
             inputs=[input_peptides, input_alleles],
-            outputs=node,
+            outputs=[affinity_predictor_output, ligandome_output],
             name="ligandome",
         )
         network.summary()
         return network

-    @staticmethod
-    def loss(y_true, y_pred, sample_weight=None, delta=0.2, alpha=None):
-        """
-        Loss function for ligandome prediction.
-        """
-        import tensorflow as tf
-        y_pred = tf.squeeze(y_pred, axis=-1)
-        y_true = tf.reshape(tf.cast(y_true, tf.bool), (-1,))
-        pos = tf.boolean_mask(y_pred, y_true)
-        if alpha is None:
-            pos_max = tf.reduce_max(pos, axis=1)
-        else:
-            # Smooth maximum
-            exp_alpha_x = tf.exp(alpha * pos)
-            numerator = tf.reduce_sum(tf.multiply(pos, exp_alpha_x), axis=1)
-            denominator = tf.reduce_sum(exp_alpha_x, axis=1)
-            pos_max = numerator / denominator
-        neg = tf.boolean_mask(y_pred, tf.logical_not(y_true))
-        result = tf.reduce_sum(
-            tf.maximum(0.0, tf.reshape(neg, (-1, 1)) - pos_max + delta) ** 2)
-        return result
-
     def peptides_to_network_input(self, peptides):
         """
         Encode peptides to the fixed-length encoding expected by the neural
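The hunk above replaces the old margin-loss staticmethod with a second network head: the per-allele affinity signal is kept as one output, and a small TimeDistributed dense stack produces a per-allele presentation score as a second output. A minimal standalone sketch of that pattern (shapes and names are invented for illustration, not mhcflurry's actual graph):

    # Sketch only: a stand-in input plays the role of the per-allele
    # affinity signal produced by the pan-allele networks.
    from keras.layers import Dense, Input, TimeDistributed
    from keras.models import Model

    n_alleles = 6
    per_allele = Input(shape=(n_alleles, 1), name="per_allele_affinity")
    node = per_allele
    for layer_size in [8]:  # plays the role of additional_dense_layers=[8]
        node = TimeDistributed(Dense(layer_size, activation="sigmoid"))(node)
    # One sigmoid presentation score per allele.
    ligandome_output = TimeDistributed(Dense(1, activation="sigmoid"))(node)
    model = Model(inputs=per_allele, outputs=ligandome_output)
    model.summary()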
@@ -241,6 +251,8 @@ class Class1LigandomePredictor(object):
             peptides,
             labels,
             allele_encoding,
+            affinities_mask=None,  # True when a peptide/label is actually a peptide and an affinity
+            inequalities=None,  # interpreted only for elements where affinities_mask is True, otherwise ignored
             shuffle_permutation=None,
             verbose=1,
             progress_callback=None,
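The two new arguments let a single fit call mix mass spec hit/decoy rows with quantitative affinity rows. A hypothetical way the inputs could be assembled (values invented for illustration):

    import numpy

    # Two hit/decoy labels followed by two affinity measurements (nM).
    labels = numpy.array([1.0, 0.0, 100.0, 5000.0])
    affinities_mask = numpy.array([False, False, True, True])
    # Read only where affinities_mask is True; "=" elsewhere is ignored.
    inequalities = numpy.array(["=", "=", "<", "="])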
@@ -275,14 +287,25 @@ class Class1LigandomePredictor(object):
             'allele': allele_encoding_input,
         }

+        loss = MSEWithInequalitiesAndMultipleOutputs(losses=[
+            MSEWithInequalities(),
+            MultiallelicMassSpecLoss(
+                delta=self.hyperparameters['loss_delta']),
+        ])
+
+        y_values_pre_encoding = labels.copy()
+        if affinities_mask is not None:
+            y_values_pre_encoding[affinities_mask] = from_ic50(
+                labels[affinities_mask])
+        y_values = loss.encode_y(
+            y_values_pre_encoding,
+            inequalities=inequalities[affinities_mask] if inequalities is not None else None,
+            output_indices=(~affinities_mask).astype(int) if affinities_mask is not None else numpy.ones(len(y_values_pre_encoding), dtype=int))
+
         fit_info = collections.defaultdict(list)

         self.set_allele_representations(allele_representations)
         self.network.compile(
-            loss=partial(
-                self.loss,
-                delta=self.hyperparameters['loss_delta'],
-                alpha=self.hyperparameters['loss_alpha']),
+            loss=loss.loss,
             optimizer=self.hyperparameters['optimizer'])
         if self.hyperparameters['learning_rate'] is not None:
             K.set_value(
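Note how rows are routed to the two losses in the compile step above: output_indices is 0 for affinity rows and 1 for mass spec rows, matching the order of the losses list. For example:

    import numpy

    affinities_mask = numpy.array([True, False, True])
    output_indices = (~affinities_mask).astype(int)
    # -> array([0, 1, 0]): index-0 rows are scored by MSEWithInequalities,
    #    index-1 rows by MultiallelicMassSpecLoss.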
@@ -371,6 +394,7 @@ class Class1LigandomePredictor(object):
             self,
             peptides,
             allele_encoding,
+            output="affinities",
             batch_size=DEFAULT_PREDICT_BATCH_SIZE):
         (allele_encoding_input, allele_representations) = (
             self.allele_encoding_to_network_input(allele_encoding.compact()))
@@ -380,12 +404,16 @@ class Class1LigandomePredictor(object):
             'allele': allele_encoding_input,
         }
         predictions = self.network.predict(x_dict, batch_size=batch_size)
+        if output == "affinities":
+            predictions = to_ic50(predictions[0])
+        elif output == "ligandome_presentation":
+            predictions = predictions[1]
+        elif output == "both":
+            pass
+        else:
+            raise NotImplementedError("Unknown output", output)
         return numpy.squeeze(predictions)

-    #def predict(self):
-
     def set_allele_representations(self, allele_representations):
         """
        """
@@ -440,3 +468,13 @@ class Class1LigandomePredictor(object):
         original_model.fit_generator = throw

         layer.set_weights([reshaped])
+
+    @staticmethod
+    def allele_features(allele_names, hyperparameters):
+        df = pandas.DataFrame({"allele_name": allele_names})
+        if hyperparameters['allele_features_include_gene']:
+            # TODO: support other organisms.
+            for gene in ["A", "B", "C"]:
+                df[gene] = df.allele_name.str.startswith(
+                    "HLA-%s" % gene).astype(float)
+        return df
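Expected behavior of the new helper, as a sketch (allele names invented):

    features = Class1LigandomePredictor.allele_features(
        ["HLA-A*02:01", "HLA-B*57:01", "HLA-C*07:02"],
        hyperparameters={"allele_features_include_gene": True})
    # features has columns allele_name, A, B, C with one-hot gene indicators:
    # A = [1.0, 0.0, 0.0], B = [0.0, 1.0, 0.0], C = [0.0, 0.0, 1.0]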
@@ -182,8 +182,10 @@ class MSEWithInequalitiesAndMultipleOutputs(Loss):
     supports_inequalities = True
     supports_multiple_outputs = True

-    @staticmethod
-    def encode_y(y, inequalities=None, output_indices=None):
+    def __init__(self, losses=MSEWithInequalities):
+        self.losses = losses
+
+    def encode_y(self, y, inequalities=None, output_indices=None):
         y = array(y, dtype="float32")
         if isnan(y).any():
             raise ValueError("y contains NaN", y)
@@ -192,8 +194,25 @@ class MSEWithInequalitiesAndMultipleOutputs(Loss):
         if (y < 0.0).any():
             raise ValueError("y contains values < 0.0", y)

-        encoded = MSEWithInequalities.encode_y(
-            y, inequalities=inequalities)
+        if isinstance(self.losses, Loss):
+            # Single loss applied to all outputs.
+            encoded = MSEWithInequalities.encode_y(y, inequalities=inequalities)
+        else:
+            assert output_indices is not None
+            df = pandas.DataFrame({
+                "y": y,
+                "inequality": inequalities,
+                "output_index": output_indices,
+            })
+            encoded = y.copy()
+            encoded[:] = numpy.nan
+            for (output_index, sub_df) in df.groupby("output_index"):
+                loss = self.losses[output_index]
+                loss_kwargs = {}
+                if not sub_df.inequality.isnull().all():
+                    loss_kwargs['inequalities'] = sub_df.inequality.values
+                encoded[sub_df.index.values] = loss.encode_y(
+                    sub_df.y.values, **loss_kwargs)

         if output_indices is not None:
             output_indices = numpy.array(output_indices)
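A sketch of the per-output routing introduced above (values invented; assumes these classes are importable from mhcflurry.custom_loss):

    import numpy
    from mhcflurry.custom_loss import (
        MSEWithInequalities,
        MSEWithInequalitiesAndMultipleOutputs,
        MultiallelicMassSpecLoss)

    loss = MSEWithInequalitiesAndMultipleOutputs(losses=[
        MSEWithInequalities(), MultiallelicMassSpecLoss(delta=0.2)])
    # First two rows: affinities on the 0-1 scale; last two: hit labels.
    y = numpy.array([0.8, 0.1, 1.0, 0.0])
    output_indices = numpy.array([0, 0, 1, 1])  # row -> loss index
    encoded = loss.encode_y(y, output_indices=output_indices)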
@@ -205,8 +224,7 @@ class MSEWithInequalitiesAndMultipleOutputs(Loss):
         return encoded

-    @staticmethod
-    def loss(y_true, y_pred):
+    def loss(self, y_true, y_pred):
         from keras import backend as K

         y_true = K.flatten(y_true)
@@ -230,7 +248,49 @@ class MSEWithInequalitiesAndMultipleOutputs(Loss):
         #     ], axis=-1)
         #updated_y_pred = tf.gather_nd(y_pred, indexer)

-        return MSEWithInequalities.loss(updated_y_true, updated_y_pred)
+        if isinstance(self.losses, Loss):
+            # Single loss for all outputs.
+            return self.losses.loss(updated_y_true, updated_y_pred)
+        else:
+            # TODO: make this more efficient?
+            result = None
+            for (i, loss) in enumerate(self.losses):
+                values = (
+                    loss.loss(updated_y_true, updated_y_pred) *
+                    K.cast(K.equal(output_indices, i), "float32"))
+                if result is None:
+                    result = values
+                else:
+                    result += values
+            return result
+
+
+class MultiallelicMassSpecLoss(Loss):
+    """
+    Hinge-like loss for multiallelic mass spec data: each hit's best
+    per-allele prediction should exceed every decoy prediction by delta.
+    """
+    name = "multiallelic_mass_spec_loss"
+    supports_inequalities = True
+    supports_multiple_outputs = False
+
+    def __init__(self, delta=0.2):
+        self.delta = delta
+
+    def encode_y(self, y):
+        return y
+
+    def loss(self, y_true, y_pred):
+        import tensorflow as tf
+        #y_pred = tf.squeeze(y_pred, axis=-1)
+        y_true = tf.reshape(tf.cast(y_true, tf.bool), (-1,))
+        pos = tf.boolean_mask(y_pred, y_true)
+        pos_max = tf.reduce_max(pos, axis=1)
+        neg = tf.boolean_mask(y_pred, tf.logical_not(y_true))
+        result = tf.reduce_sum(
+            tf.maximum(
+                0.0, tf.reshape(neg, (-1, 1)) - pos_max + self.delta) ** 2)
+        return result


 def check_shape(name, arr, expected_shape):
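For reference, a NumPy restatement of MultiallelicMassSpecLoss.loss (a sketch mirroring the tensorflow code above, handy for checking small cases by hand):

    import numpy

    def multiallelic_mass_spec_loss_reference(y_true, y_pred, delta=0.2):
        hits = numpy.asarray(y_true).astype(bool)
        y_pred = numpy.asarray(y_pred)
        pos_max = y_pred[hits].max(axis=1)  # best allele score per hit
        neg = y_pred[~hits]                 # every per-allele decoy score
        return (numpy.maximum(
            0.0, neg.reshape((-1, 1)) - pos_max + delta) ** 2).sum()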
@@ -250,5 +310,5 @@ def check_shape(name, arr, expected_shape):

 # Register custom losses.
-for cls in [MSEWithInequalities, MSEWithInequalitiesAndMultipleOutputs]:
+for cls in [MSEWithInequalities, MSEWithInequalitiesAndMultipleOutputs, MultiallelicMassSpecLoss]:
     CUSTOM_LOSSES[cls.name] = cls()
@@ -38,6 +38,7 @@ from mhcflurry.regression_target import from_ic50
 from mhcflurry.common import random_peptides, positional_frequency_matrix
 from mhcflurry.testing_utils import cleanup, startup
 from mhcflurry.amino_acid import COMMON_AMINO_ACIDS
+from mhcflurry.custom_loss import MultiallelicMassSpecLoss

 COMMON_AMINO_ACIDS = sorted(COMMON_AMINO_ACIDS)
@@ -110,73 +111,66 @@ def evaluate_loss(loss, y_true, y_pred):
 def test_loss():
     for delta in [0.0, 0.3]:
-        for alpha in [None, 1.0, 20.0]:
-            print("delta", delta)
-            print("alpha", alpha)
-            # Hit labels
-            y_true = [
-                1.0,
-                0.0,
-                1.0,
-                1.0,
-                0.0
-            ]
-            y_true = numpy.array(y_true)
-            y_pred = [
-                [0.3, 0.7, 0.5],
-                [0.2, 0.4, 0.6],
-                [0.1, 0.5, 0.3],
-                [0.1, 0.7, 0.1],
-                [0.8, 0.2, 0.4],
-            ]
-            y_pred = numpy.array(y_pred)
-
-            # reference implementation 1
-            def smooth_max(x, alpha):
-                x = numpy.array(x)
-                alpha = numpy.array([alpha])
-                return (x * numpy.exp(x * alpha)).sum() / (
-                    numpy.exp(x * alpha)).sum()
-
-            if alpha is None:
-                max_func = max
-            else:
-                max_func = partial(smooth_max, alpha=alpha)
-
-            contributions = []
-            for i in range(len(y_true)):
-                if y_true[i] == 1.0:
-                    for j in range(len(y_true)):
-                        if y_true[j] == 0.0:
-                            tightest_i = max_func(y_pred[i])
-                            contribution = sum(
-                                max(0, y_pred[j, k] - tightest_i + delta)**2
-                                for k in range(y_pred.shape[1])
-                            )
-                            contributions.append(contribution)
-            contributions = numpy.array(contributions)
-            expected1 = contributions.sum()
-
-            # reference implementation 2: numpy
-            pos = numpy.array([
-                max_func(y_pred[i])
-                for i in range(len(y_pred))
-                if y_true[i] == 1.0
-            ])
-            neg = y_pred[~y_true.astype(bool)]
-            expected2 = (
-                numpy.maximum(0, neg.reshape((-1, 1)) - pos + delta)**2).sum()
-
-            yield numpy.testing.assert_almost_equal, expected1, expected2, 4
-
-            computed = evaluate_loss(
-                partial(Class1LigandomePredictor.loss, delta=delta, alpha=alpha),
-                y_true,
-                y_pred.reshape(y_pred.shape + (1,)))
-
-            yield numpy.testing.assert_almost_equal, computed, expected1, 4
+        print("delta", delta)
+        # Hit labels
+        y_true = [
+            1.0,
+            0.0,
+            1.0,
+            1.0,
+            0.0
+        ]
+        y_true = numpy.array(y_true)
+        y_pred = [
+            [0.3, 0.7, 0.5],
+            [0.2, 0.4, 0.6],
+            [0.1, 0.5, 0.3],
+            [0.1, 0.7, 0.1],
+            [0.8, 0.2, 0.4],
+        ]
+        y_pred = numpy.array(y_pred)
+
+        # reference implementation 1
+        def smooth_max(x, alpha):
+            x = numpy.array(x)
+            alpha = numpy.array([alpha])
+            return (x * numpy.exp(x * alpha)).sum() / (
+                numpy.exp(x * alpha)).sum()
+
+        contributions = []
+        for i in range(len(y_true)):
+            if y_true[i] == 1.0:
+                for j in range(len(y_true)):
+                    if y_true[j] == 0.0:
+                        tightest_i = max(y_pred[i])
+                        contribution = sum(
+                            max(0, y_pred[j, k] - tightest_i + delta)**2
+                            for k in range(y_pred.shape[1])
+                        )
+                        contributions.append(contribution)
+        contributions = numpy.array(contributions)
+        expected1 = contributions.sum()
+
+        # reference implementation 2: numpy
+        pos = numpy.array([
+            max(y_pred[i])
+            for i in range(len(y_pred))
+            if y_true[i] == 1.0
+        ])
+        neg = y_pred[~y_true.astype(bool)]
+        expected2 = (
+            numpy.maximum(0, neg.reshape((-1, 1)) - pos + delta)**2).sum()
+
+        yield numpy.testing.assert_almost_equal, expected1, expected2, 4
+
+        computed = evaluate_loss(
+            MultiallelicMassSpecLoss(delta=delta).loss,
+            y_true,
+            y_pred.reshape(y_pred.shape + (1,)))
+
+        yield numpy.testing.assert_almost_equal, computed, expected1, 4


 AA_DIST = pandas.Series(
@@ -284,6 +278,7 @@ def test_synthetic_allele_refinement(max_epochs=10):
     predictor = Class1LigandomePredictor(
         PAN_ALLELE_PREDICTOR_NO_MASS_SPEC,
+        additional_dense_layers=[8, 1],
         max_ensemble_size=1,
         max_epochs=max_epochs,
         learning_rate=0.0001,
@@ -299,9 +294,11 @@ def test_synthetic_allele_refinement(max_epochs=10):
         allele_to_sequence=PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.allele_to_sequence,
     ).compact()

-    pre_predictions = predictor.predict(
-        peptides=train_df.peptide.values,
-        allele_encoding=allele_encoding)
+    pre_predictions = from_ic50(
+        predictor.predict(
+            output="affinities",
+            peptides=train_df.peptide.values,
+            allele_encoding=allele_encoding))

     (model,) = PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.class1_pan_allele_models
     expected_pre_predictions = from_ic50(
@@ -310,11 +307,13 @@ def test_synthetic_allele_refinement(max_epochs=10):
         allele_encoding=allele_encoding.allele_encoding,
     )).reshape((-1, len(alleles)))

+    #import ipdb ; ipdb.set_trace()
+
     train_df["pre_max_prediction"] = pre_predictions.max(1)
     pre_auc = roc_auc_score(train_df.hit.values, train_df.pre_max_prediction.values)
     print("PRE_AUC", pre_auc)

-    assert_allclose(pre_predictions, expected_pre_predictions)
+    assert_allclose(pre_predictions, expected_pre_predictions, rtol=1e-4)

     motifs_history = []
     random_peptides_encodable = make_random_peptides(10000, [9])
@@ -328,8 +327,11 @@ def test_synthetic_allele_refinement(max_epochs=10):
     metric_rows = []

     def progress():
-        predictions = predictor.predict(peptides=train_df.peptide.values,
-            allele_encoding=allele_encoding, )
+        predictions = from_ic50(
+            predictor.predict(
+                output="affinities",
+                peptides=train_df.peptide.values,
+                allele_encoding=allele_encoding))

         train_df["max_prediction"] = predictions.max(1)
         train_df["predicted_allele"] = pandas.Series(alleles).loc[
...