Skip to content
Snippets Groups Projects
Commit a2cebd98 authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

updates

parent 2e8dc426
No related branches found
No related tags found
No related merge requests found
......@@ -16,7 +16,7 @@ class AlleleEncoding(object):
integer indices in a consistent way by keeping track of the universe
of alleles under use, i.e. a distinction is made between the universe
of supported alleles (what's in `allele_to_sequence`) and the actual
set of alleles used (what's in `alleles`).
set of alleles used for some task (what's in `alleles`).
Parameters
----------
......@@ -45,11 +45,12 @@ class AlleleEncoding(object):
sorted(allele_to_sequence))
self.allele_to_index = dict(
(allele, i)
for (i, allele) in enumerate(all_alleles))
self.allele_to_index[None] = -1 # special mask value
unpadded = pandas.Series(
[allele_to_sequence[a] for a in all_alleles],
index=all_alleles)
for (i, allele) in enumerate([None] + all_alleles))
unpadded = pandas.Series([
allele_to_sequence[a] if a is not None else ""
for a in [None] + all_alleles
],
index=[None] + all_alleles)
self.sequences = unpadded.str.pad(
unpadded.str.len().max(), fillchar="X")
else:
......@@ -85,7 +86,8 @@ class AlleleEncoding(object):
alleles=self.alleles,
allele_to_sequence=dict(
(allele, self.allele_to_sequence[allele])
for allele in self.alleles.unique()))
for allele in self.alleles.unique()
if allele is not None))
def allele_representations(self, encoding_name):
"""
......@@ -173,6 +175,20 @@ class MultipleAlleleEncoding(object):
)
self.max_alleles_per_experiment = max_alleles_per_experiment
def append_alleles(self, alleles):
extended_alleles = list(self.allele_encoding.alleles)
for allele in alleles:
extended_alleles.append(allele)
extended_alleles.extend(
[None] * (self.max_alleles_per_experiment - 1))
assert len(extended_alleles) % self.max_alleles_per_experiment == 0, (
len(extended_alleles))
self.allele_encoding = AlleleEncoding(
alleles=extended_alleles,
borrow_from=self.allele_encoding)
@property
def indices(self):
return self.allele_encoding.indices.values.reshape(
......
......@@ -109,6 +109,7 @@ class Class1LigandomePredictor(object):
RepeatVector,
concatenate,
Reshape,
Lambda,
Embedding)
from keras.models import Model
......@@ -134,9 +135,11 @@ class Class1LigandomePredictor(object):
input_dim=64, # arbitrary, how many alleles to have room for
output_dim=1029,
input_length=6,
trainable=False)(input_alleles)
trainable=False,
mask_zero=True)(input_alleles)
allele_flat = Reshape((6, -1), name="allele_flat")(allele_representation)
#allele_flat = Reshape((6, -1), name="allele_flat")(allele_representation)
allele_flat = allele_representation
allele_peptide_merged = concatenate(
[peptides_repeated, allele_flat], name="allele_peptide_merged")
......@@ -184,7 +187,7 @@ class Class1LigandomePredictor(object):
layer_name_to_new_node[layer.name] = node
affinity_predictor_output = node
affinity_predictor_output = Lambda(lambda x: x[:, 0])(node)
for (i, layer_size) in enumerate(
hyperparameters['additional_dense_layers']):
......@@ -198,6 +201,10 @@ class Class1LigandomePredictor(object):
lifted = TimeDistributed(layer)
ligandome_output = lifted(node)
#output_node = concatenate([
# affinity_predictor_output, ligandome_output
#], name="combined_output")
network = Model(
inputs=[input_peptides, input_alleles],
outputs=[affinity_predictor_output, ligandome_output],
......@@ -281,31 +288,47 @@ class Class1LigandomePredictor(object):
peptide_encoding = peptide_encoding[shuffle_permutation]
allele_encoding_input = allele_encoding_input[shuffle_permutation]
labels = labels[shuffle_permutation]
inequalities = inequalities[shuffle_permutation]
affinities_mask = affinities_mask[shuffle_permutation]
x_dict = {
'peptide': peptide_encoding,
'allele': allele_encoding_input,
}
loss = MSEWithInequalitiesAndMultipleOutputs(losses=[
MSEWithInequalities(),
MultiallelicMassSpecLoss(
delta=self.hyperparameters['loss_delta']),
])
y_values_pre_encoding = labels.copy()
y1 = numpy.zeros(shape=len(labels))
if affinities_mask is not None:
y_values_pre_encoding[affinities_mask] = from_ic50(labels)
y_values = loss.encode_y(
y_values_pre_encoding,
inequalities=inequalities[affinities_mask] if inequalities is not None else None,
output_indices=(~affinities_mask).astype(int) if affinities_mask is not None else numpy.ones(len(y_values_pre_encoding), dtype=int))
y1[affinities_mask] = from_ic50(labels[affinities_mask])
if inequalities is not None:
# Reverse inequalities because from_ic50() flips the direction
# (i.e. lower affinity results in higher y values).
adjusted_inequalities = pandas.Series(inequalities).map({
"=": "=",
">": "<",
"<": ">",
}).values
else:
adjusted_inequalities = numpy.tile("=", len(y1))
adjusted_inequalities[~affinities_mask] = ">"
affinities_loss = MSEWithInequalities()
encoded_y1 = affinities_loss.encode_y(
y1, inequalities=adjusted_inequalities)
mms_loss = MultiallelicMassSpecLoss(
delta=self.hyperparameters['loss_delta'])
y2 = labels.copy()
y2[affinities_mask] = -1
encoded_y2 = mms_loss.encode_y(y2)
fit_info = collections.defaultdict(list)
self.set_allele_representations(allele_representations)
self.network.compile(
loss=loss.loss,
loss=[affinities_loss.loss, mms_loss.loss],
optimizer=self.hyperparameters['optimizer'])
if self.hyperparameters['learning_rate'] is not None:
K.set_value(
......@@ -328,7 +351,7 @@ class Class1LigandomePredictor(object):
# to a single experiment
fit_history = self.network.fit(
x_dict,
labels,
[encoded_y1, encoded_y2],
shuffle=True,
batch_size=self.hyperparameters['minibatch_size'],
verbose=verbose,
......
......@@ -182,10 +182,8 @@ class MSEWithInequalitiesAndMultipleOutputs(Loss):
supports_inequalities = True
supports_multiple_outputs = True
def __init__(self, losses=MSEWithInequalities):
self.losses = losses
def encode_y(self, y, inequalities=None, output_indices=None):
@staticmethod
def encode_y(y, inequalities=None, output_indices=None):
y = array(y, dtype="float32")
if isnan(y).any():
raise ValueError("y contains NaN", y)
......@@ -194,25 +192,8 @@ class MSEWithInequalitiesAndMultipleOutputs(Loss):
if (y < 0.0).any():
raise ValueError("y contains values < 0.0", y)
if isinstance(self.losses, Loss):
# Single loss applied to all outputs
encoded = MSEWithInequalities.encode_y(y, inequalities=inequalities)
else:
assert output_indices is not None
df = pandas.DataFrame({
"y": y,
"inequality": inequalities,
"output_index": output_indices,
})
encoded = y.copy()
encoded[:] = numpy.nan
for (output_index, sub_df) in df.groupby("output_index"):
loss = self.losses[output_index]
loss_kwargs = {}
if not sub_df.inequality.isnull().all():
loss_kwargs['inequalities'] = sub_df.inequality.values
encoded[sub_df.index.values] = loss.encode_y(
sub_df.y.values, **loss_kwargs)
encoded = MSEWithInequalities.encode_y(
y, inequalities=inequalities)
if output_indices is not None:
output_indices = numpy.array(output_indices)
......@@ -224,7 +205,8 @@ class MSEWithInequalitiesAndMultipleOutputs(Loss):
return encoded
def loss(self, y_true, y_pred):
@staticmethod
def loss(y_true, y_pred):
from keras import backend as K
y_true = K.flatten(y_true)
......@@ -248,25 +230,17 @@ class MSEWithInequalitiesAndMultipleOutputs(Loss):
# ], axis=-1)
#updated_y_pred = tf.gather_nd(y_pred, indexer)
if isinstance(self.losses, Loss):
# Single loss for all outputs.
return self.losses.loss(updated_y_true, updated_y_pred)
else:
# TODO: make this more efficient?
result = None
for (i, loss) in enumerate(self.losses):
values = (
loss.loss(updated_y_true, updated_y_pred) *
K.cast(K.equal(output_indices, i), "float32"))
if result is None:
result = values
else:
result += values
return result
return MSEWithInequalities.loss(updated_y_true, updated_y_pred)
class MultiallelicMassSpecLoss(Loss):
"""
Affiniities are mapped according to MSEWithInequalities, then by:
x -> x + 10.
Mass spec hit vs. decoy are kept at [0, 1].
"""
name = "multiallelic_mass_spec_loss"
supports_inequalities = True
......@@ -275,24 +249,38 @@ class MultiallelicMassSpecLoss(Loss):
def __init__(self, delta=0.2):
self.delta = delta
def encode_y(self, y):
return y
@staticmethod
def encode_y(y, affinities_mask=None, inequalities=None):
y = array(y, dtype="float32")
if isnan(y).any():
raise ValueError("y contains NaN", y)
if (y > 1.0).any():
raise ValueError("y contains values > 1.0", y)
if (y < 0.0).any():
raise ValueError("y contains values < 0.0", y)
encoded = y.copy()
if affinities_mask is not None:
encoded[affinities_mask] = MSEWithInequalities.encode_y(
encoded[affinities_mask], inequalities=inequalities) + 10.0
return encoded
def loss(self, y_true, y_pred):
import tensorflow as tf
#y_pred = tf.squeeze(y_pred, axis=-1)
y_true = tf.reshape(tf.cast(y_true, tf.bool), (-1,))
y_pred = tf.squeeze(y_pred, axis=-1)
y_true = tf.reshape(y_true, (-1,))
pos = tf.boolean_mask(y_pred, y_true)
pos = tf.boolean_mask(y_pred, tf.math.equal(y_true, 1.0))
pos_max = tf.reduce_max(pos, axis=1)
neg = tf.boolean_mask(y_pred, tf.logical_not(y_true))
result = tf.reduce_sum(
tf.maximum(
0.0, tf.reshape(neg, (-1, 1)) - pos_max + self.delta) ** 2)
neg = tf.boolean_mask(y_pred, tf.math.equal(y_true, 0.0))
term = tf.reshape(neg, (-1, 1)) - pos_max + self.delta
result = tf.reduce_sum(tf.maximum(0.0, term) ** 2)
return result
def check_shape(name, arr, expected_shape):
"""
Raise ValueError if arr.shape != expected_shape.
......@@ -310,5 +298,5 @@ def check_shape(name, arr, expected_shape):
# Register custom losses.
for cls in [MSEWithInequalities, MSEWithInequalitiesAndMultipleOutputs, MultiallelicMassSpecLoss]:
for cls in [MSEWithInequalities, MSEWithInequalitiesAndMultipleOutputs]:
CUSTOM_LOSSES[cls.name] = cls()
......@@ -117,6 +117,7 @@ def test_loss():
1.0,
0.0,
1.0,
-1.0, # ignored
1.0,
0.0
]
......@@ -125,8 +126,10 @@ def test_loss():
[0.3, 0.7, 0.5],
[0.2, 0.4, 0.6],
[0.1, 0.5, 0.3],
[0.9, 0.1, 0.2],
[0.1, 0.7, 0.1],
[0.8, 0.2, 0.4],
]
y_pred = numpy.array(y_pred)
......@@ -159,7 +162,7 @@ def test_loss():
if y_true[i] == 1.0
])
neg = y_pred[~y_true.astype(bool)]
neg = y_pred[(y_true == 0.0).astype(bool)]
expected2 = (
numpy.maximum(0, neg.reshape((-1, 1)) - pos + delta)**2).sum()
......@@ -223,7 +226,112 @@ def make_motif(allele, peptides, frac=0.01):
return matrix
def test_synthetic_allele_refinement(max_epochs=10):
def test_real_data_multiallelic_refinement(max_epochs=10):
ms_df = pandas.read_csv(
get_path("data_mass_spec_annotated", "annotated_ms.csv.bz2"))
ms_df = ms_df.loc[
(ms_df.mhc_class == "I") & (~ms_df.protein_ensembl.isnull())].copy()
sample_table = ms_df.drop_duplicates(
"sample_id").set_index("sample_id").loc[ms_df.sample_id.unique()]
grouped = ms_df.groupby("sample_id").nunique()
for col in sample_table.columns:
if (grouped[col] > 1).any():
del sample_table[col]
sample_table["alleles"] = sample_table.hla.str.split()
multi_train_df = ms_df.loc[
ms_df.sample_id == "RA957"
].drop_duplicates("peptide")[["peptide", "sample_id"]].reset_index(drop=True)
multi_train_df["label"] = 1.0
multi_train_df["is_affinity"] = False
multi_train_alleles = set()
for alleles in sample_table.loc[multi_train_df.sample_id.unique()].alleles:
multi_train_alleles.update(alleles)
multi_train_alleles = sorted(multi_train_alleles)
pan_train_df = pandas.read_csv(
get_path(
"models_class1_pan", "models.with_mass_spec/train_data.csv.bz2"))
pan_sub_train_df = pan_train_df.loc[
pan_train_df.allele.isin(multi_train_alleles),
["peptide", "allele", "measurement_inequality", "measurement_value"]
]
pan_sub_train_df["label"] = pan_sub_train_df["measurement_value"]
del pan_sub_train_df["measurement_value"]
pan_sub_train_df["is_affinity"] = True
pan_predictor = Class1AffinityPredictor.load(
get_path("models_class1_pan", "models.with_mass_spec"),
optimization_level=0,
max_models=1)
allele_encoding = MultipleAlleleEncoding(
experiment_names=multi_train_df.sample_id.values,
experiment_to_allele_list=sample_table.alleles.to_dict(),
max_alleles_per_experiment=sample_table.alleles.str.len().max(),
allele_to_sequence=pan_predictor.allele_to_sequence,
)
allele_encoding.append_alleles(pan_sub_train_df.allele.values)
allele_encoding = allele_encoding.compact()
combined_train_df = pandas.concat([multi_train_df, pan_sub_train_df])
ligandome_predictor = Class1LigandomePredictor(
pan_predictor,
max_ensemble_size=1,
max_epochs=500,
learning_rate=0.0001,
patience=5,
min_delta=0.0)
pre_predictions = from_ic50(
ligandome_predictor.predict(
output="affinities",
peptides=combined_train_df.peptide.values,
allele_encoding=allele_encoding))
(model,) = pan_predictor.class1_pan_allele_models
expected_pre_predictions = from_ic50(
model.predict(
peptides=numpy.repeat(combined_train_df.peptide.values, len(alleles)),
allele_encoding=allele_encoding.allele_encoding,
)).reshape((-1, len(alleles)))[:,0]
assert_allclose(pre_predictions, expected_pre_predictions, rtol=1e-4)
motifs_history = []
random_peptides_encodable = make_random_peptides(10000, [9])
def update_motifs():
for allele in multi_train_alleles:
motif = make_motif(allele, random_peptides_encodable)
motifs_history.append((allele, motif))
print("Pre fitting:")
update_motifs()
print("Fitting...")
ligandome_predictor.fit(
peptides=combined_train_df.peptide.values,
labels=combined_train_df.label.values,
allele_encoding=allele_encoding,
affinities_mask=combined_train_df.is_affinity.values,
inequalities=combined_train_df.measurement_inequality.values,
progress_callback=update_motifs,
)
#import ipdb ; ipdb.set_trace()
def Xtest_synthetic_allele_refinement(max_epochs=10):
refine_allele = "HLA-C*01:02"
alleles = [
"HLA-A*02:01", "HLA-B*27:01", "HLA-C*07:01",
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment