Commit 694e6634 authored by Tim O'Donnell

fixes

parent 7205db5c
@@ -150,8 +150,8 @@ class AlleleEncoding(object):
class MultipleAlleleEncoding(object):
def __init__(
self,
experiment_names,
experiment_to_allele_list,
experiment_names=[],
experiment_to_allele_list={},
max_alleles_per_experiment=6,
allele_to_sequence=None,
borrow_from=None):
@@ -194,6 +194,12 @@ class MultipleAlleleEncoding(object):
return self.allele_encoding.indices.values.reshape(
(-1, self.max_alleles_per_experiment))
@property
def alleles(self):
return numpy.reshape(
self.allele_encoding.alleles.values,
(-1, self.max_alleles_per_experiment))
def compact(self):
result = copy(self)
result.allele_encoding = self.allele_encoding.compact()
......
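For reference, a minimal sketch of what the new `alleles` property returns, using hypothetical allele names and a padding width of 3 (the real code pads to max_alleles_per_experiment, default 6):

import numpy

flat_alleles = numpy.array([
    "HLA-A*02:01", "HLA-B*07:02", None,   # experiment 1, padded with None
    "HLA-A*03:01", None, None,            # experiment 2, padded with None
], dtype=object)

# Mirrors the property above: one row per experiment, one column per allele slot.
per_experiment = numpy.reshape(flat_alleles, (-1, 3))
print(per_experiment.shape)  # (2, 3)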
@@ -2,20 +2,24 @@ from __future__ import print_function
import time
import collections
from functools import partial
from six import string_types
import numpy
import pandas
import mhcnames
import hashlib
from .hyperparameters import HyperparameterDefaults
from .class1_neural_network import Class1NeuralNetwork, DEFAULT_PREDICT_BATCH_SIZE
from .encodable_sequences import EncodableSequences
from .regression_target import from_ic50, to_ic50
from .random_negative_peptides import RandomNegativePeptides
from .allele_encoding import MultipleAlleleEncoding
from .allele_encoding import MultipleAlleleEncoding, AlleleEncoding
from .auxiliary_input import AuxiliaryInputEncoder
from .custom_loss import (
MSEWithInequalities,
MultiallelicMassSpecLoss)
MultiallelicMassSpecLoss,
ZeroLoss)
class Class1LigandomePredictor(object):
@@ -26,8 +30,7 @@ class Class1LigandomePredictor(object):
'alignment_method': 'left_pad_centered_right_pad',
'max_length': 15,
},
additional_dense_layers=[],
additional_dense_activation="sigmoid",
max_alleles=6,
)
"""
Hyperparameters (and their default values) that affect the neural network
@@ -55,7 +58,8 @@ class Class1LigandomePredictor(object):
"""
compile_hyperparameter_defaults = HyperparameterDefaults(
loss_delta=0.2,
loss_multiallelic_mass_spec_delta=0.2,
loss_multiallelic_mass_spec_multiplier=1.0,
optimizer="rmsprop",
learning_rate=None,
)
@@ -64,8 +68,9 @@ class Class1LigandomePredictor(object):
used.
"""
allele_features_hyperparameter_defaults = HyperparameterDefaults(
allele_features_include_gene=True,
auxiliary_input_hyperparameter_defaults = HyperparameterDefaults(
auxiliary_input_features=["gene"],
auxiliary_input_feature_parameters={},
)
"""
Allele feature hyperparameters.
@@ -75,7 +80,7 @@ class Class1LigandomePredictor(object):
fit_hyperparameter_defaults).extend(
early_stopping_hyperparameter_defaults).extend(
compile_hyperparameter_defaults).extend(
allele_features_hyperparameter_defaults)
auxiliary_input_hyperparameter_defaults)
def __init__(
self,
@@ -99,6 +104,8 @@ class Class1LigandomePredictor(object):
self.hyperparameters)
self.fit_info = []
self.allele_to_sequence = class1_affinity_predictor.allele_to_sequence
self.allele_representation_hash = None
@staticmethod
def make_network(pan_allele_class1_neural_networks, hyperparameters):
@@ -110,12 +117,16 @@ class Class1LigandomePredictor(object):
Flatten,
RepeatVector,
concatenate,
Reshape,
Activation,
Lambda,
Add,
Embedding)
from keras.models import Model
import keras.initializers
networks = [model.network() for model in pan_allele_class1_neural_networks]
networks = [
model.network() for model in pan_allele_class1_neural_networks
]
merged_ensemble = Class1NeuralNetwork.merge(
networks,
merge_method="average")
@@ -123,20 +134,22 @@ class Class1LigandomePredictor(object):
peptide_shape = tuple(
int(x) for x in K.int_shape(merged_ensemble.inputs[0])[1:])
input_alleles = Input(shape=(6,), name="allele") # up to 6 alleles
input_alleles = Input(
shape=(hyperparameters['max_alleles'],), name="allele")
input_peptides = Input(
shape=peptide_shape,
dtype='float32',
name='peptide')
peptides_flattened = Flatten()(input_peptides)
peptides_repeated = RepeatVector(6)(peptides_flattened)
peptides_repeated = RepeatVector(hyperparameters['max_alleles'])(
peptides_flattened)
allele_representation = Embedding(
name="allele_representation",
input_dim=64, # arbitrary, how many alleles to have room for
output_dim=1029,
input_length=6,
input_length=hyperparameters['max_alleles'],
trainable=False,
mask_zero=True)(input_alleles)
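A standalone sketch of the frozen allele-representation lookup defined above: each integer allele index selects a fixed row of the representation matrix (populated later by set_allele_representations), with index 0 reserved for masking. Sizes follow the code above; the index values are illustrative.

import numpy
from keras.layers import Input, Embedding
from keras.models import Model

max_alleles = 6
allele_input = Input(shape=(max_alleles,), name="allele")
representations = Embedding(
    input_dim=64,       # room for up to 64 allele representations
    output_dim=1029,    # length of one flattened allele encoding
    input_length=max_alleles,
    trainable=False,
    mask_zero=True)(allele_input)
lookup = Model(inputs=allele_input, outputs=representations)

# Two alleles in this experiment; the remaining slots are masked with index 0.
indices = numpy.array([[1, 2, 0, 0, 0, 0]])
print(lookup.predict(indices).shape)  # (1, 6, 1029)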
@@ -189,27 +202,69 @@ class Class1LigandomePredictor(object):
layer_name_to_new_node[layer.name] = node
affinity_predictor_output = Lambda(lambda x: x[:, 0])(node)
affinity_predictor_matrix_output = node
affinity_predictor_output = Lambda(
lambda x: x[:, 0], name="affinity_output")(
affinity_predictor_matrix_output)
"""
layer = Dense(8, activation="sigmoid", kernel_initializer=keras.initializers.RandomNormal(mean=1.0/8.0, stddev=1e-5), use_bias=False)
lifted = TimeDistributed(layer, name="ligandome_hidden1")
node = lifted(affinity_predictor_matrix_output)
"""
auxiliary_input = None
if hyperparameters['auxiliary_input_features']:
auxiliary_input = Input(
shape=(
hyperparameters['max_alleles'],
len(
AuxiliaryInputEncoder.get_columns(
hyperparameters['auxiliary_input_features'],
feature_parameters=hyperparameters[
'auxiliary_input_feature_parameters']))),
dtype="float32",
name="auxiliary")
node = concatenate(
[node, auxiliary_input], name="affinities_with_auxiliary")
#layer = Dense(1, activation="linear", kernel_initializer=keras.initializers.RandomNormal(mean=0.0, stddev=1e-5), use_bias=False)
layer = Dense(1, activation="tanh")
lifted = TimeDistributed(layer, name="ligandome_output")
ligandome_adjustment = lifted(node)
for (i, layer_size) in enumerate(
hyperparameters['additional_dense_layers']):
layer = Dense(
layer_size,
activation=hyperparameters['additional_dense_activation'])
lifted = TimeDistributed(layer)
node = lifted(node)
"""
weights = layers[-1].get_weights()
layer = Dense(1, activation="sigmoid", kernel_initializer=keras.initializers.Constant(weights[0]), bias_initializer=keras.initializers.Constant(weights[1]))
lifted = TimeDistributed(layer, name="ligandome_output")
ligandome_output = lifted(prev_node)
"""
def logit(x):
import tensorflow as tf
return - tf.log(1. / x - 1.)
ligandome_output_pre_sigmoid = Add()([Lambda(logit)(affinity_predictor_matrix_output), ligandome_adjustment])
ligandome_output = Activation("sigmoid")(ligandome_output_pre_sigmoid)
#ligandome_output = affinity_predictor_matrix_output
layer = Dense(1, activation="sigmoid")
lifted = TimeDistributed(layer)
ligandome_output = lifted(node)
#output_node = concatenate([
# affinity_predictor_output, ligandome_output
#], name="combined_output")
network = Model(
inputs=[input_peptides, input_alleles],
outputs=[affinity_predictor_output, ligandome_output],
inputs=[
input_peptides,
input_alleles,
] + ([] if auxiliary_input is None else [auxiliary_input]),
outputs=[
affinity_predictor_output,
ligandome_output,
affinity_predictor_matrix_output
],
name="ligandome",
)
network.summary()
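The ligandome head above composes its output by adding a learned per-allele adjustment to the affinity prediction in logit space, then squashing the sum back through a sigmoid. A scalar numpy illustration of that composition (the values are made up):

import numpy

def logit(p):
    # Same transform as the Lambda above: -log(1/p - 1).
    return -numpy.log(1.0 / p - 1.0)

def sigmoid(x):
    return 1.0 / (1.0 + numpy.exp(-x))

affinity_prob = 0.8   # one entry of affinity_predictor_matrix_output
adjustment = -0.5     # corresponding entry of ligandome_adjustment
presentation = sigmoid(logit(affinity_prob) + adjustment)
print(round(presentation, 3))  # 0.708: the 0.8 affinity score nudged down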
@@ -270,31 +325,42 @@ class Class1LigandomePredictor(object):
import keras.backend as K
assert isinstance(allele_encoding, MultipleAlleleEncoding)
assert (
allele_encoding.max_alleles_per_experiment ==
self.hyperparameters['max_alleles'])
#for layer in self.network._layers[:8]:
# print("Setting non trainable", layer)
# layer.trainable = False
# import ipdb ; ipdb.set_trace()
encodable_peptides = EncodableSequences.create(peptides)
del peptides
if labels is not None:
labels = numpy.array(labels, copy=False)
if inequalities is not None:
inequalities = numpy.array(inequalities, copy=True)
else:
inequalities = numpy.tile("=", len(labels))
if affinities_mask is not None:
affinities_mask = numpy.array(affinities_mask, copy=False)
if inequalities is not None:
inequalities = numpy.array(inequalities, copy=False)
else:
affinities_mask = numpy.tile(False, len(labels))
inequalities[~affinities_mask] = "="
random_negatives_planner = RandomNegativePeptides(
**RandomNegativePeptides.hyperparameter_defaults.subselect(
self.hyperparameters))
random_negatives_planner.plan(
peptides=encodable_peptides.sequences[affinities_mask],
affinities=(labels[affinities_mask]),
alleles=numpy.reshape(
allele_encoding.allele_encoding.alleles.values,
(-1, allele_encoding.max_alleles_per_experiment))[
affinities_mask, 0
].flatten(),
inequalities=inequalities[affinities_mask])
peptides=encodable_peptides.sequences,
affinities=numpy.where(affinities_mask, labels, to_ic50(labels)),
alleles=[
numpy.random.choice(row[row != numpy.array(None)])
for row in allele_encoding.alleles
],
inequalities=inequalities)
peptide_input = self.peptides_to_network_input(encodable_peptides)
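The random-negative planner is now given one randomly drawn (non-None) allele per experiment row, as in the list comprehension above. A small self-contained version of that draw, with made-up alleles and a padding width of 3:

import numpy

alleles = numpy.array([
    ["HLA-A*02:01", "HLA-B*07:02", None],
    ["HLA-A*03:01", None, None],
], dtype=object)

chosen = [
    numpy.random.choice(row[row != numpy.array(None)])
    for row in alleles
]
print(chosen)  # one randomly selected allele per experiment row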
@@ -302,15 +368,14 @@ class Class1LigandomePredictor(object):
if shuffle_permutation is None:
shuffle_permutation = numpy.random.permutation(len(labels))
peptide_input = peptide_input[shuffle_permutation]
peptides = encodable_peptides.sequences[shuffle_permutation]
allele_encoding.shuffle_in_place(shuffle_permutation)
labels = labels[shuffle_permutation]
inequalities = inequalities[shuffle_permutation]
affinities_mask = affinities_mask[shuffle_permutation]
del peptides
del encodable_peptides
# Optional optimization
allele_encoding = allele_encoding.compact()
(allele_encoding_input, allele_representations) = (
self.allele_encoding_to_network_input(allele_encoding))
@@ -318,34 +383,37 @@ class Class1LigandomePredictor(object):
'peptide': peptide_input,
'allele': allele_encoding_input,
}
if self.hyperparameters['auxiliary_input_features']:
auxiliary_encoder = AuxiliaryInputEncoder(
alleles=allele_encoding.alleles,
peptides=peptides)
x_dict_without_random_negatives[
'auxiliary'
] = auxiliary_encoder.get_array(
features=self.hyperparameters['auxiliary_input_features'],
feature_parameters=self.hyperparameters[
'auxiliary_input_feature_parameters'])
y1 = numpy.zeros(shape=len(labels))
if affinities_mask is not None:
y1[affinities_mask] = from_ic50(labels[affinities_mask])
random_negatives_allele_encoding = None
if allele_encoding is not None:
random_negative_alleles = random_negatives_planner.get_alleles()
random_negatives_allele_encoding = MultipleAlleleEncoding(
experiment_names=random_negative_alleles,
experiment_to_allele_list=dict(
(a, [a]) for a in random_negative_alleles),
max_alleles_per_experiment=(
allele_encoding.max_alleles_per_experiment),
borrow_from=allele_encoding.allele_encoding)
y1[affinities_mask] = from_ic50(labels[affinities_mask])
random_negative_alleles = random_negatives_planner.get_alleles()
random_negatives_allele_encoding = MultipleAlleleEncoding(
experiment_names=random_negative_alleles,
experiment_to_allele_list=dict(
(a, [a]) for a in random_negative_alleles),
max_alleles_per_experiment=(
allele_encoding.max_alleles_per_experiment),
borrow_from=allele_encoding.allele_encoding)
num_random_negatives = random_negatives_planner.get_total_count()
if inequalities is not None:
# Reverse inequalities because from_ic50() flips the direction
# (i.e. lower affinity results in higher y values).
adjusted_inequalities = pandas.Series(inequalities).map({
"=": "=",
">": "<",
"<": ">",
}).values
else:
adjusted_inequalities = numpy.tile("=", len(y1))
# Reverse inequalities because from_ic50() flips the direction
# (i.e. lower affinity results in higher y values).
adjusted_inequalities = pandas.Series(inequalities).map({
"=": "=",
">": "<",
"<": ">",
}).values
adjusted_inequalities[~affinities_mask] = ">"
# Note: we are using "<" here not ">" because the inequalities are
@@ -369,7 +437,9 @@ class Class1LigandomePredictor(object):
inequalities=adjusted_inequalities_with_random_negative)
mms_loss = MultiallelicMassSpecLoss(
delta=self.hyperparameters['loss_delta'])
delta=self.hyperparameters['loss_multiallelic_mass_spec_delta'],
multiplier=self.hyperparameters[
'loss_multiallelic_mass_spec_multiplier'])
y2 = labels.copy()
y2[affinities_mask] = -1
y2_with_random_negatives = numpy.concatenate([
@@ -380,9 +450,10 @@ class Class1LigandomePredictor(object):
fit_info = collections.defaultdict(list)
self.set_allele_representations(allele_representations)
allele_representations_hash = self.set_allele_representations(
allele_representations)
self.network.compile(
loss=[affinities_loss.loss, mms_loss.loss],
loss=[affinities_loss.loss, mms_loss.loss, ZeroLoss.loss],
optimizer=self.hyperparameters['optimizer'])
if self.hyperparameters['learning_rate'] is not None:
K.set_value(
@@ -422,6 +493,20 @@ class Class1LigandomePredictor(object):
random_negatives_allele_encoding)[0],
x_dict_without_random_negatives['allele']
])
if 'auxiliary' in x_dict_without_random_negatives:
random_negative_auxiliary_encoder = AuxiliaryInputEncoder(
alleles=random_negatives_allele_encoding.alleles,
#peptides=random_negative_peptides.sequences
)
x_dict_with_random_negatives['auxiliary'] = (
numpy.concatenate([
random_negative_auxiliary_encoder.get_array(
features=self.hyperparameters[
'auxiliary_input_features'],
feature_parameters=self.hyperparameters[
'auxiliary_input_feature_parameters']),
x_dict_without_random_negatives['auxiliary']
]))
else:
x_dict_with_random_negatives = (
x_dict_without_random_negatives)
@@ -433,17 +518,27 @@ class Class1LigandomePredictor(object):
"peptide"
][:num_random_negatives] = random_negative_peptides_encoding
#def generator(x, ys, batch_size):
# # Each batch should have a mix of:
# # - random negative peptides
# # - affinity measurements (binder + non-binder)
# # - multiallelic mass spec
# TODO: need to use fit_generator to keep each minibatch corresponding
# to a single experiment
self.assert_allele_representations_hash(allele_representations_hash)
#import ipdb ; ipdb.set_trace()
fit_history = self.network.fit(
x_dict_with_random_negatives,
[encoded_y1, encoded_y2],
[encoded_y1, encoded_y2, encoded_y2],
shuffle=True,
batch_size=self.hyperparameters['minibatch_size'],
verbose=verbose,
epochs=i + 1,
initial_epoch=i,
validation_split=self.hyperparameters['validation_split'],
)
epoch_time = time.time() - epoch_start
@@ -467,6 +562,7 @@ class Class1LigandomePredictor(object):
last_progress_print = time.time()
if self.hyperparameters['validation_split']:
#import ipdb ; ipdb.set_trace()
val_loss = fit_info['val_loss'][-1]
if min_val_loss is None or (
val_loss < min_val_loss -
@@ -496,32 +592,78 @@ class Class1LigandomePredictor(object):
progress_callback()
fit_info["time"] = time.time() - start
fit_info["num_points"] = len(encodable_peptides)
fit_info["num_points"] = len(labels)
self.fit_info.append(dict(fit_info))
def predict(
self,
peptides,
allele_encoding,
allele=None,
alleles=None,
output="affinities",
batch_size=DEFAULT_PREDICT_BATCH_SIZE):
if isinstance(peptides, string_types):
raise TypeError("peptides must be a list or array, not a string")
if isinstance(alleles, string_types):
raise TypeError(
"alleles must be an iterable, AlleleEncoding, or "
"MultipleAlleleEncoding")
if allele is None and alleles is None:
raise ValueError("Must specify 'allele' or 'alleles'.")
if allele is not None:
if alleles is not None:
raise ValueError("Specify exactly one of allele or alleles")
normalized_allele = mhcnames.normalize_allele_name(allele)
alleles = [normalized_allele] * len(peptides)
if not isinstance(alleles, MultipleAlleleEncoding):
new_alleles = MultipleAlleleEncoding(
allele_to_sequence=self.allele_to_sequence,
max_alleles_per_experiment=self.hyperparameters['max_alleles'])
new_alleles.append_alleles(alleles)
alleles = new_alleles
peptides = EncodableSequences.create(peptides)
(allele_encoding_input, allele_representations) = (
self.allele_encoding_to_network_input(allele_encoding.compact()))
self.allele_encoding_to_network_input(alleles.compact()))
self.set_allele_representations(allele_representations)
x_dict = {
'peptide': self.peptides_to_network_input(peptides),
'allele': allele_encoding_input,
}
predictions = self.network.predict(x_dict, batch_size=batch_size)
if self.hyperparameters['auxiliary_input_features']:
auxiliary_encoder = AuxiliaryInputEncoder(
alleles=alleles.alleles,
peptides=peptides.sequences)
x_dict[
'auxiliary'
] = auxiliary_encoder.get_array(
features=self.hyperparameters['auxiliary_input_features'],
feature_parameters=self.hyperparameters[
'auxiliary_input_feature_parameters'])
predictions = [
numpy.squeeze(output)
for output in self.network.predict(x_dict, batch_size=batch_size)
]
predictions[0] = to_ic50(predictions[0])
predictions[2] = to_ic50(predictions[2])
if output == "affinities":
predictions = to_ic50(predictions[0])
elif output == "ligandome_presentation":
predictions = predictions[0]
elif output == "ligandome":
predictions = predictions[1]
elif output == "affinities_matrix":
predictions = predictions[2]
elif output == "both":
predictions = predictions[:2]
elif output == "all":
pass
else:
raise NotImplementedError("Unknown output", output)
return numpy.squeeze(predictions)
return predictions
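A hypothetical call pattern for the revised predict() signature, assuming `predictor` is an already-constructed Class1LigandomePredictor; the peptides and allele below are illustrative only:

peptides = ["SIINFEKL", "SIINFEKLL"]

# A single allele is normalized and applied to every peptide.
affinities = predictor.predict(peptides=peptides, allele="HLA-A*02:01")

# A list of alleles (or a MultipleAlleleEncoding) goes through `alleles`;
# output="all" returns all three network outputs.
everything = predictor.predict(
    peptides=peptides,
    alleles=["HLA-A*02:01"] * len(peptides),
    output="all")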
def set_allele_representations(self, allele_representations):
"""
@@ -577,13 +719,43 @@ class Class1LigandomePredictor(object):
original_model.fit_generator = throw
layer.set_weights([reshaped])
self.allele_representation_hash = hashlib.sha1(
allele_representations.tobytes()).hexdigest()
return self.allele_representation_hash
def assert_allele_representations_hash(self, value):
numpy.testing.assert_equal(self.allele_representation_hash, value)
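A minimal illustration of what the hash guard protects against: the SHA1 digest of the representation matrix's raw bytes changes whenever the matrix passed to set_allele_representations() differs from the one in effect at fit time.

import hashlib
import numpy

representations = numpy.zeros((64, 1029), dtype="float32")
before = hashlib.sha1(representations.tobytes()).hexdigest()

representations[0, 0] = 1.0  # any change to the matrix changes the digest
after = hashlib.sha1(representations.tobytes()).hexdigest()
assert before != after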
def __getstate__(self):
"""
serialize to a dict. Model weights are included. For pickle support.
Returns
-------
dict
"""
result = dict(self.__dict__)
result['network'] = None
result['network_json'] = None
result['network_weights'] = None
if self.network is not None:
result['network_json'] = self.network.to_json()
result['network_weights'] = self.network.get_weights()
return result
def __setstate__(self, state):
"""
Deserialize. For pickle support.
"""
network_json = state.pop("network_json")
network_weights = state.pop("network_weights")
self.__dict__.update(state)
if network_json is not None:
import keras.models
self.network = keras.models.model_from_json(network_json)
if network_weights is not None:
self.network.set_weights(network_weights)
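The __getstate__/__setstate__ pair exists because a compiled Keras model is not directly picklable, so the network is stored as architecture JSON plus a list of weight arrays. A hypothetical round trip (assumes `predictor` is a Class1LigandomePredictor instance):

import pickle

blob = pickle.dumps(predictor)    # network saved as JSON + weight arrays
restored = pickle.loads(blob)     # rebuilt via model_from_json() + set_weights()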
@staticmethod
def allele_features(allele_names, hyperparameters):
df = pandas.DataFrame({"allele_name": allele_names})
if hyperparameters['allele_features_include_gene']:
# TODO: support other organisms.
for gene in ["A", "B", "C"]:
df[gene] = df.allele_name.str.startswith(
"HLA-%s" % gene).astype(float)
return gene
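The removed allele_features() helper above is superseded by the "gene" auxiliary input feature. As a hedged sketch of what such a feature could look like (the actual AuxiliaryInputEncoder.get_array() column layout is not shown in this diff), here is a one-hot over the HLA gene derived from the allele name:

import pandas

def gene_one_hot(allele_names):
    # One column per HLA gene; 1.0 if the allele name starts with that gene.
    df = pandas.DataFrame({"allele_name": allele_names})
    for gene in ["A", "B", "C"]:  # the original notes a TODO for other organisms
        df[gene] = df.allele_name.str.startswith("HLA-%s" % gene).astype(float)
    return df[["A", "B", "C"]].values

print(gene_one_hot(["HLA-A*02:01", "HLA-B*07:02", "HLA-C*01:02"]))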
@@ -92,12 +92,12 @@ class MSEWithInequalities(Loss):
y_pred is greater or less than y_true.
between 2 - 3:
Treated as a "<" inequality. Penalty (y_pred - (y_true - 2))**2 is
applied only if y_pred is greater than y_true - 2.
Treated as a ">" inequality. Penalty (y_pred - (y_true - 2))**2 is
applied only if y_pred is less than y_true - 2.
between 4 - 5:
Treated as a ">" inequality. Penalty (y_pred - (y_true - 4))**2 is
applied only if y_pred is less than y_true - 4.
Treated as a "<" inequality. Penalty (y_pred - (y_true - 4))**2 is
applied only if y_pred is greater than y_true - 4.
"""
name = "mse_with_inequalities"
supports_inequalities = True
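The corrected docstring describes how a measurement and its inequality are packed into a single y_true value by offsetting it into the 0-1, 2-3, or 4-5 range. An illustrative encoder following that description (a sketch, not the library's encode_y):

import numpy

def encode_targets(y, inequalities):
    # "=" stays in [0, 1]; ">" shifts to [2, 3]; "<" shifts to [4, 5],
    # matching the ranges documented above.
    offsets = {"=": 0.0, ">": 2.0, "<": 4.0}
    y = numpy.asarray(y, dtype="float32")
    return y + numpy.array([offsets[i] for i in inequalities], dtype="float32")

print(encode_targets([0.3, 0.7, 0.5], ["=", ">", "<"]))  # [0.3 2.7 4.5]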
@@ -240,11 +240,12 @@ class MultiallelicMassSpecLoss(Loss):
supports_inequalities = True
supports_multiple_outputs = False
def __init__(self, delta=0.2):
def __init__(self, delta=0.2, multiplier=1.0):
self.delta = delta
self.multiplier = multiplier
@staticmethod
def encode_y(y, affinities_mask=None, inequalities=None):
def encode_y(y):
encoded = pandas.Series(y, dtype="float32", copy=True)
assert all(item in (-1.0, 1.0, 0.0) for item in encoded), set(y)
print(
@@ -262,9 +263,25 @@ class MultiallelicMassSpecLoss(Loss):
pos_max = tf.reduce_max(pos, axis=1)
neg = tf.boolean_mask(y_pred, tf.math.equal(y_true, 0.0))
term = tf.reshape(neg, (-1, 1)) - pos_max + self.delta
result = tf.reduce_sum(tf.maximum(0.0, term) ** 2)
return result
result = tf.reduce_sum(tf.maximum(0.0, term) ** 2) / tf.cast(
tf.shape(term)[0], tf.float32) * self.multiplier
return tf.where(tf.is_nan(result), 0.0, result)
class ZeroLoss(Loss):
"""
"""
name = "zero_loss"
supports_inequalities = False
supports_multiple_outputs = False
@staticmethod
def encode_y(y):
return y
@staticmethod
def loss(y_true, y_pred):
return 0.0
def check_shape(name, arr, expected_shape):
@@ -284,5 +301,7 @@ def check_shape(name, arr, expected_shape):
# Register custom losses.
for cls in [MSEWithInequalities, MSEWithInequalitiesAndMultipleOutputs]:
for cls in [MSEWithInequalities, MSEWithInequalitiesAndMultipleOutputs, MultiallelicMassSpecLoss, ZeroLoss]:
CUSTOM_LOSSES[cls.name] = cls()
@@ -21,6 +21,7 @@ logging.getLogger('matplotlib').disabled = True
import pandas
import argparse
import sys
import copy
from functools import partial
from numpy.testing import assert_, assert_equal, assert_allclose
@@ -153,7 +154,7 @@ def test_loss():
)
contributions.append(contribution)
contributions = numpy.array(contributions)
expected1 = contributions.sum()
expected1 = contributions.sum() / len(contributions)
# reference implementation 2: numpy
pos = numpy.array([
@@ -164,7 +165,8 @@ def test_loss():
neg = y_pred[(y_true == 0.0).astype(bool)]
expected2 = (
numpy.maximum(0, neg.reshape((-1, 1)) - pos + delta)**2).sum()
numpy.maximum(0, neg.reshape((-1, 1)) - pos + delta)**2).sum() / (
len(pos) * len(neg))
yield numpy.testing.assert_almost_equal, expected1, expected2, 4
@@ -226,7 +228,7 @@ def make_motif(allele, peptides, frac=0.01):
return matrix
def test_real_data_multiallelic_refinement(max_epochs=10):
def Xtest_real_data_multiallelic_refinement(max_epochs=10):
ms_df = pandas.read_csv(
get_path("data_mass_spec_annotated", "annotated_ms.csv.bz2"))
ms_df = ms_df.loc[
@@ -240,10 +242,20 @@ def test_real_data_multiallelic_refinement(max_epochs=10):
del sample_table[col]
sample_table["alleles"] = sample_table.hla.str.split()
multi_train_df = ms_df.loc[
multi_train_hit_df = ms_df.loc[
ms_df.sample_id == "RA957"
].drop_duplicates("peptide")[["peptide", "sample_id"]].reset_index(drop=True)
multi_train_df["label"] = 1.0
multi_train_hit_df["label"] = 1.0
multi_train_decoy_df = ms_df.loc[
(ms_df.sample_id == "CD165") &
(~ms_df.peptide.isin(multi_train_hit_df.peptide.unique()))
].drop_duplicates("peptide")[["peptide"]]
(multi_train_decoy_df["sample_id"],) = multi_train_hit_df.sample_id.unique()
multi_train_decoy_df["label"] = 0.0
multi_train_df = pandas.concat(
[multi_train_hit_df, multi_train_decoy_df], ignore_index=True)
multi_train_df["is_affinity"] = False
multi_train_alleles = set()
@@ -281,6 +293,7 @@ def test_real_data_multiallelic_refinement(max_epochs=10):
ligandome_predictor = Class1LigandomePredictor(
pan_predictor,
auxiliary_input_features=[],
max_ensemble_size=1,
max_epochs=50,
learning_rate=0.0001,
@@ -292,7 +305,7 @@ def test_real_data_multiallelic_refinement(max_epochs=10):
ligandome_predictor.predict(
output="affinities",
peptides=combined_train_df.peptide.values,
allele_encoding=allele_encoding))
alleles=allele_encoding))
(model,) = pan_predictor.class1_pan_allele_models
expected_pre_predictions = from_ic50(
@@ -325,11 +338,220 @@ def test_real_data_multiallelic_refinement(max_epochs=10):
progress_callback=update_motifs,
)
import ipdb ; ipdb.set_trace()
def test_synthetic_allele_refinement_with_affinity_data(max_epochs=10):
refine_allele = "HLA-C*01:02"
alleles = [
"HLA-A*02:01", "HLA-B*27:01", "HLA-C*07:01",
"HLA-A*03:01", "HLA-B*15:01", refine_allele
]
peptides_per_allele = [
2000, 1000, 500,
1500, 1200, 800,
]
allele_to_peptides = dict(zip(alleles, peptides_per_allele))
length = 9
train_with_ms = pandas.read_csv(
get_path("data_curated", "curated_training_data.with_mass_spec.csv.bz2"))
train_no_ms = pandas.read_csv(get_path("data_curated",
"curated_training_data.no_mass_spec.csv.bz2"))
def filter_df(df):
df = df.loc[
(df.allele.isin(alleles)) &
(df.peptide.str.len() == length)
]
return df
train_with_ms = filter_df(train_with_ms)
train_no_ms = filter_df(train_no_ms)
ms_specific = train_with_ms.loc[
~train_with_ms.peptide.isin(train_no_ms.peptide)
]
train_peptides = []
train_true_alleles = []
for allele in alleles:
peptides = ms_specific.loc[ms_specific.allele == allele].peptide.sample(
n=allele_to_peptides[allele])
train_peptides.extend(peptides)
train_true_alleles.extend([allele] * len(peptides))
hits_df = pandas.DataFrame({"peptide": train_peptides})
hits_df["true_allele"] = train_true_alleles
hits_df["hit"] = 1.0
decoys_df = hits_df.copy()
decoys_df["peptide"] = decoys_df.peptide.map(scramble_peptide)
decoys_df["true_allele"] = ""
decoys_df["hit"] = 0.0
mms_train_df = pandas.concat([hits_df, decoys_df], ignore_index=True)
mms_train_df["label"] = mms_train_df.hit
mms_train_df["is_affinity"] = False
affinity_train_df = pandas.read_csv(
get_path(
"models_class1_pan", "models.with_mass_spec/train_data.csv.bz2"))
affinity_train_df = affinity_train_df.loc[
affinity_train_df.allele.isin(alleles),
["peptide", "allele", "measurement_inequality", "measurement_value"]]
affinity_train_df["label"] = affinity_train_df["measurement_value"]
del affinity_train_df["measurement_value"]
affinity_train_df["is_affinity"] = True
predictor = Class1LigandomePredictor(
PAN_ALLELE_PREDICTOR_NO_MASS_SPEC,
auxiliary_input_features=["gene"],
max_ensemble_size=1,
max_epochs=max_epochs,
learning_rate=0.0001,
patience=5,
min_delta=0.0,
random_negative_rate=1.0,
random_negative_constant=25)
mms_allele_encoding = MultipleAlleleEncoding(
experiment_names=["experiment1"] * len(mms_train_df),
experiment_to_allele_list={
"experiment1": alleles,
},
max_alleles_per_experiment=6,
allele_to_sequence=PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.allele_to_sequence,
)
allele_encoding = copy.deepcopy(mms_allele_encoding)
allele_encoding.append_alleles(affinity_train_df.allele.values)
allele_encoding = allele_encoding.compact()
train_df = pandas.concat(
[mms_train_df, affinity_train_df], ignore_index=True, sort=False)
pre_predictions = from_ic50(
predictor.predict(
output="affinities_matrix",
peptides=mms_train_df.peptide.values,
alleles=mms_allele_encoding))
(model,) = PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.class1_pan_allele_models
expected_pre_predictions = from_ic50(
model.predict(
peptides=numpy.repeat(mms_train_df.peptide.values, len(alleles)),
allele_encoding=mms_allele_encoding.allele_encoding,
)).reshape((-1, len(alleles)))
#import ipdb ; ipdb.set_trace()
mms_train_df["pre_max_prediction"] = pre_predictions.max(1)
pre_auc = roc_auc_score(mms_train_df.hit.values, mms_train_df.pre_max_prediction.values)
print("PRE_AUC", pre_auc)
assert_allclose(pre_predictions, expected_pre_predictions, rtol=1e-4)
motifs_history = []
random_peptides_encodable = make_random_peptides(10000, [9])
def update_motifs():
for allele in alleles:
motif = make_motif(allele, random_peptides_encodable)
motifs_history.append((allele, motif))
metric_rows = []
def progress():
(_, ligandome_prediction, affinities_predictions) = (
predictor.predict(
output="all",
peptides=mms_train_df.peptide.values,
alleles=mms_allele_encoding))
affinities_predictions = from_ic50(affinities_predictions)
for (kind, predictions) in [
("affinities", affinities_predictions),
("ligandome", ligandome_prediction)]:
mms_train_df["max_prediction"] = predictions.max(1)
mms_train_df["predicted_allele"] = pandas.Series(alleles).loc[
predictions.argmax(1).flatten()
].values
print(kind)
print(predictions)
mean_predictions_for_hit = mms_train_df.loc[
mms_train_df.hit == 1.0
].max_prediction.mean()
mean_predictions_for_decoy = mms_train_df.loc[
mms_train_df.hit == 0.0
].max_prediction.mean()
correct_allele_fraction = (
mms_train_df.loc[mms_train_df.hit == 1.0].predicted_allele ==
mms_train_df.loc[mms_train_df.hit == 1.0].true_allele
).mean()
auc = roc_auc_score(mms_train_df.hit.values, mms_train_df.max_prediction.values)
print(kind, "Mean prediction for hit", mean_predictions_for_hit)
print(kind, "Mean prediction for decoy", mean_predictions_for_decoy)
print(kind, "Correct predicted allele fraction", correct_allele_fraction)
print(kind, "AUC", auc)
metric_rows.append((
kind,
mean_predictions_for_hit,
mean_predictions_for_decoy,
correct_allele_fraction,
auc,
))
update_motifs()
return (ligandome_prediction, auc)
print("Pre fitting:")
progress()
update_motifs()
print("Fitting...")
predictor.fit(
peptides=train_df.peptide.values,
labels=train_df.label.values,
inequalities=train_df.measurement_inequality.values,
affinities_mask=train_df.is_affinity.values,
allele_encoding=allele_encoding,
progress_callback=progress,
)
(predictions, final_auc) = progress()
print("Final AUC", final_auc)
update_motifs()
motifs = pandas.DataFrame(
motifs_history,
columns=[
"allele",
"motif",
]
)
metrics = pandas.DataFrame(
metric_rows,
columns=[
"output",
"mean_predictions_for_hit",
"mean_predictions_for_decoy",
"correct_allele_fraction",
"auc"
])
return (predictor, predictions, metrics, motifs)
def Xtest_synthetic_allele_refinement(max_epochs=10):
@@ -387,12 +609,13 @@ def Xtest_synthetic_allele_refinement(max_epochs=10):
predictor = Class1LigandomePredictor(
PAN_ALLELE_PREDICTOR_NO_MASS_SPEC,
additional_dense_layers=[8, 1],
max_ensemble_size=1,
max_epochs=max_epochs,
learning_rate=0.0001,
patience=5,
min_delta=0.0)
min_delta=0.0,
random_negative_rate=0.0,
random_negative_constant=0)
allele_encoding = MultipleAlleleEncoding(
experiment_names=["experiment1"] * len(train_df),
@@ -405,9 +628,9 @@ def Xtest_synthetic_allele_refinement(max_epochs=10):
pre_predictions = from_ic50(
predictor.predict(
output="affinities",
output="affinities_matrix",
peptides=train_df.peptide.values,
allele_encoding=allele_encoding))
alleles=allele_encoding))
(model,) = PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.class1_pan_allele_models
expected_pre_predictions = from_ic50(
@@ -436,45 +659,52 @@ def Xtest_synthetic_allele_refinement(max_epochs=10):
metric_rows = []
def progress():
predictions = from_ic50(
(_, ligandome_prediction, affinities_predictions) = (
predictor.predict(
output="affinities",
output="all",
peptides=train_df.peptide.values,
allele_encoding=allele_encoding))
train_df["max_prediction"] = predictions.max(1)
train_df["predicted_allele"] = pandas.Series(alleles).loc[
predictions.argmax(1).flatten()].values
print(predictions)
mean_predictions_for_hit = train_df.loc[
train_df.hit == 1.0
].max_prediction.mean()
mean_predictions_for_decoy = train_df.loc[
train_df.hit == 0.0
].max_prediction.mean()
correct_allele_fraction = (
train_df.loc[train_df.hit == 1.0].predicted_allele ==
train_df.loc[train_df.hit == 1.0].true_allele
).mean()
auc = roc_auc_score(train_df.hit.values, train_df.max_prediction.values)
print("Mean prediction for hit", mean_predictions_for_hit)
print("Mean prediction for decoy", mean_predictions_for_decoy)
print("Correct predicted allele fraction", correct_allele_fraction)
print("AUC", auc)
metric_rows.append((
mean_predictions_for_hit,
mean_predictions_for_decoy,
correct_allele_fraction,
auc,
))
update_motifs()
return (predictions, auc)
alleles=allele_encoding))
affinities_predictions = from_ic50(affinities_predictions)
for (kind, predictions) in [
("affinities", affinities_predictions),
("ligandome", ligandome_prediction)]:
train_df["max_prediction"] = predictions.max(1)
train_df["predicted_allele"] = pandas.Series(alleles).loc[
predictions.argmax(1).flatten()
].values
print(kind)
print(predictions)
mean_predictions_for_hit = train_df.loc[
train_df.hit == 1.0
].max_prediction.mean()
mean_predictions_for_decoy = train_df.loc[
train_df.hit == 0.0
].max_prediction.mean()
correct_allele_fraction = (
train_df.loc[train_df.hit == 1.0].predicted_allele ==
train_df.loc[train_df.hit == 1.0].true_allele
).mean()
auc = roc_auc_score(train_df.hit.values, train_df.max_prediction.values)
print(kind, "Mean prediction for hit", mean_predictions_for_hit)
print(kind, "Mean prediction for decoy", mean_predictions_for_decoy)
print(kind, "Correct predicted allele fraction", correct_allele_fraction)
print(kind, "AUC", auc)
metric_rows.append((
kind,
mean_predictions_for_hit,
mean_predictions_for_decoy,
correct_allele_fraction,
auc,
))
update_motifs()
return (ligandome_prediction, auc)
print("Pre fitting:")
progress()
@@ -504,6 +734,7 @@ def Xtest_synthetic_allele_refinement(max_epochs=10):
metrics = pandas.DataFrame(
metric_rows,
columns=[
"output",
"mean_predictions_for_hit",
"mean_predictions_for_decoy",
"correct_allele_fraction",
......