Skip to content
Snippets Groups Projects
Commit 694e6634 authored by Tim O'Donnell
Browse files

fixes

parent 7205db5c
No related branches found
No related tags found
No related merge requests found
...@@ -150,8 +150,8 @@ class AlleleEncoding(object): ...@@ -150,8 +150,8 @@ class AlleleEncoding(object):
class MultipleAlleleEncoding(object): class MultipleAlleleEncoding(object):
def __init__( def __init__(
self, self,
experiment_names, experiment_names=[],
experiment_to_allele_list, experiment_to_allele_list={},
max_alleles_per_experiment=6, max_alleles_per_experiment=6,
allele_to_sequence=None, allele_to_sequence=None,
borrow_from=None): borrow_from=None):
...@@ -194,6 +194,12 @@ class MultipleAlleleEncoding(object): ...@@ -194,6 +194,12 @@ class MultipleAlleleEncoding(object):
return self.allele_encoding.indices.values.reshape( return self.allele_encoding.indices.values.reshape(
(-1, self.max_alleles_per_experiment)) (-1, self.max_alleles_per_experiment))
@property
def alleles(self):
    """
    Alleles of the wrapped encoding, arranged as a 2-D array.

    Returns
    -------
    numpy.ndarray in which every row holds `max_alleles_per_experiment`
    entries taken from `self.allele_encoding.alleles`.
    """
    # -1 lets numpy infer the row count from the length of the flat
    # sequence of alleles held by the wrapped encoding.
    return numpy.reshape(
        self.allele_encoding.alleles.values,
        (-1, self.max_alleles_per_experiment))
def compact(self): def compact(self):
result = copy(self) result = copy(self)
result.allele_encoding = self.allele_encoding.compact() result.allele_encoding = self.allele_encoding.compact()
......
This diff is collapsed.
...@@ -92,12 +92,12 @@ class MSEWithInequalities(Loss): ...@@ -92,12 +92,12 @@ class MSEWithInequalities(Loss):
y_pred is greater or less than y_true. y_pred is greater or less than y_true.
between 2 - 3: between 2 - 3:
Treated as a "<" inequality. Penalty (y_pred - (y_true - 2))**2 is Treated as a ">" inequality. Penalty (y_pred - (y_true - 2))**2 is
applied only if y_pred is greater than y_true - 2. applied only if y_pred is less than y_true - 2.
between 4 - 5: between 4 - 5:
Treated as a ">" inequality. Penalty (y_pred - (y_true - 4))**2 is Treated as a "<" inequality. Penalty (y_pred - (y_true - 4))**2 is
applied only if y_pred is less than y_true - 4. applied only if y_pred is greater than y_true - 4.
""" """
name = "mse_with_inequalities" name = "mse_with_inequalities"
supports_inequalities = True supports_inequalities = True
...@@ -240,11 +240,12 @@ class MultiallelicMassSpecLoss(Loss): ...@@ -240,11 +240,12 @@ class MultiallelicMassSpecLoss(Loss):
supports_inequalities = True supports_inequalities = True
supports_multiple_outputs = False supports_multiple_outputs = False
def __init__(self, delta=0.2): def __init__(self, delta=0.2, multiplier=1.0):
self.delta = delta self.delta = delta
self.multiplier = multiplier
@staticmethod @staticmethod
def encode_y(y, affinities_mask=None, inequalities=None): def encode_y(y):
encoded = pandas.Series(y, dtype="float32", copy=True) encoded = pandas.Series(y, dtype="float32", copy=True)
assert all(item in (-1.0, 1.0, 0.0) for item in encoded), set(y) assert all(item in (-1.0, 1.0, 0.0) for item in encoded), set(y)
print( print(
...@@ -262,9 +263,25 @@ class MultiallelicMassSpecLoss(Loss): ...@@ -262,9 +263,25 @@ class MultiallelicMassSpecLoss(Loss):
pos_max = tf.reduce_max(pos, axis=1) pos_max = tf.reduce_max(pos, axis=1)
neg = tf.boolean_mask(y_pred, tf.math.equal(y_true, 0.0)) neg = tf.boolean_mask(y_pred, tf.math.equal(y_true, 0.0))
term = tf.reshape(neg, (-1, 1)) - pos_max + self.delta term = tf.reshape(neg, (-1, 1)) - pos_max + self.delta
result = tf.reduce_sum(tf.maximum(0.0, term) ** 2) result = tf.reduce_sum(tf.maximum(0.0, term) ** 2) / tf.cast(
return result tf.shape(term)[0], tf.float32) * self.multiplier
return tf.where(tf.is_nan(result), 0.0, result)
class ZeroLoss(Loss):
    """
    Loss that is identically zero, whatever the labels or predictions are.
    """
    name = "zero_loss"
    supports_inequalities = False
    supports_multiple_outputs = False

    @staticmethod
    def encode_y(y):
        """Return `y` unchanged; no label encoding is applied."""
        return y

    @staticmethod
    def loss(y_true, y_pred):
        """Return the constant 0.0, ignoring both arguments."""
        return 0.0
def check_shape(name, arr, expected_shape): def check_shape(name, arr, expected_shape):
...@@ -284,5 +301,7 @@ def check_shape(name, arr, expected_shape): ...@@ -284,5 +301,7 @@ def check_shape(name, arr, expected_shape):
# Register custom losses. # Register custom losses.
for cls in [MSEWithInequalities, MSEWithInequalitiesAndMultipleOutputs]: for cls in [MSEWithInequalities, MSEWithInequalitiesAndMultipleOutputs, MultiallelicMassSpecLoss, ZeroLoss]:
CUSTOM_LOSSES[cls.name] = cls() CUSTOM_LOSSES[cls.name] = cls()
...@@ -21,6 +21,7 @@ logging.getLogger('matplotlib').disabled = True ...@@ -21,6 +21,7 @@ logging.getLogger('matplotlib').disabled = True
import pandas import pandas
import argparse import argparse
import sys import sys
import copy
from functools import partial from functools import partial
from numpy.testing import assert_, assert_equal, assert_allclose from numpy.testing import assert_, assert_equal, assert_allclose
...@@ -153,7 +154,7 @@ def test_loss(): ...@@ -153,7 +154,7 @@ def test_loss():
) )
contributions.append(contribution) contributions.append(contribution)
contributions = numpy.array(contributions) contributions = numpy.array(contributions)
expected1 = contributions.sum() expected1 = contributions.sum() / len(contributions)
# reference implementation 2: numpy # reference implementation 2: numpy
pos = numpy.array([ pos = numpy.array([
...@@ -164,7 +165,8 @@ def test_loss(): ...@@ -164,7 +165,8 @@ def test_loss():
neg = y_pred[(y_true == 0.0).astype(bool)] neg = y_pred[(y_true == 0.0).astype(bool)]
expected2 = ( expected2 = (
numpy.maximum(0, neg.reshape((-1, 1)) - pos + delta)**2).sum() numpy.maximum(0, neg.reshape((-1, 1)) - pos + delta)**2).sum() / (
len(pos) * len(neg))
yield numpy.testing.assert_almost_equal, expected1, expected2, 4 yield numpy.testing.assert_almost_equal, expected1, expected2, 4
...@@ -226,7 +228,7 @@ def make_motif(allele, peptides, frac=0.01): ...@@ -226,7 +228,7 @@ def make_motif(allele, peptides, frac=0.01):
return matrix return matrix
def test_real_data_multiallelic_refinement(max_epochs=10): def Xtest_real_data_multiallelic_refinement(max_epochs=10):
ms_df = pandas.read_csv( ms_df = pandas.read_csv(
get_path("data_mass_spec_annotated", "annotated_ms.csv.bz2")) get_path("data_mass_spec_annotated", "annotated_ms.csv.bz2"))
ms_df = ms_df.loc[ ms_df = ms_df.loc[
...@@ -240,10 +242,20 @@ def test_real_data_multiallelic_refinement(max_epochs=10): ...@@ -240,10 +242,20 @@ def test_real_data_multiallelic_refinement(max_epochs=10):
del sample_table[col] del sample_table[col]
sample_table["alleles"] = sample_table.hla.str.split() sample_table["alleles"] = sample_table.hla.str.split()
multi_train_df = ms_df.loc[ multi_train_hit_df = ms_df.loc[
ms_df.sample_id == "RA957" ms_df.sample_id == "RA957"
].drop_duplicates("peptide")[["peptide", "sample_id"]].reset_index(drop=True) ].drop_duplicates("peptide")[["peptide", "sample_id"]].reset_index(drop=True)
multi_train_df["label"] = 1.0 multi_train_hit_df["label"] = 1.0
multi_train_decoy_df = ms_df.loc[
(ms_df.sample_id == "CD165") &
(~ms_df.peptide.isin(multi_train_hit_df.peptide.unique()))
].drop_duplicates("peptide")[["peptide"]]
(multi_train_decoy_df["sample_id"],) = multi_train_hit_df.sample_id.unique()
multi_train_decoy_df["label"] = 0.0
multi_train_df = pandas.concat(
[multi_train_hit_df, multi_train_decoy_df], ignore_index=True)
multi_train_df["is_affinity"] = False multi_train_df["is_affinity"] = False
multi_train_alleles = set() multi_train_alleles = set()
...@@ -281,6 +293,7 @@ def test_real_data_multiallelic_refinement(max_epochs=10): ...@@ -281,6 +293,7 @@ def test_real_data_multiallelic_refinement(max_epochs=10):
ligandome_predictor = Class1LigandomePredictor( ligandome_predictor = Class1LigandomePredictor(
pan_predictor, pan_predictor,
auxiliary_input_features=[],
max_ensemble_size=1, max_ensemble_size=1,
max_epochs=50, max_epochs=50,
learning_rate=0.0001, learning_rate=0.0001,
...@@ -292,7 +305,7 @@ def test_real_data_multiallelic_refinement(max_epochs=10): ...@@ -292,7 +305,7 @@ def test_real_data_multiallelic_refinement(max_epochs=10):
ligandome_predictor.predict( ligandome_predictor.predict(
output="affinities", output="affinities",
peptides=combined_train_df.peptide.values, peptides=combined_train_df.peptide.values,
allele_encoding=allele_encoding)) alleles=allele_encoding))
(model,) = pan_predictor.class1_pan_allele_models (model,) = pan_predictor.class1_pan_allele_models
expected_pre_predictions = from_ic50( expected_pre_predictions = from_ic50(
...@@ -325,11 +338,220 @@ def test_real_data_multiallelic_refinement(max_epochs=10): ...@@ -325,11 +338,220 @@ def test_real_data_multiallelic_refinement(max_epochs=10):
progress_callback=update_motifs, progress_callback=update_motifs,
) )
import ipdb ; ipdb.set_trace()
def test_synthetic_allele_refinement_with_affinity_data(max_epochs=10):
    """
    Fit a Class1LigandomePredictor on synthetic multiallelic mass-spec data
    combined with affinity measurements, and collect metrics along the way.

    Hits are 9-mer peptides that appear in the curated training data with
    mass spec but not in the data without it, sampled per allele. Decoys are
    scrambled copies of the hits. All hits and decoys are assigned to a
    single six-allele experiment.

    Before fitting, the predictor's "affinities_matrix" output is asserted
    to match the predictions of the underlying pan-allele model.

    Parameters
    ----------
    max_epochs : int
        Passed through to Class1LigandomePredictor.

    Returns
    -------
    tuple of (predictor, predictions, metrics, motifs), where `predictions`
    is the ligandome output from the final progress() call and `metrics` /
    `motifs` are DataFrames accumulated over all progress() calls.
    """
    refine_allele = "HLA-C*01:02"
    alleles = [
        "HLA-A*02:01", "HLA-B*27:01", "HLA-C*07:01",
        "HLA-A*03:01", "HLA-B*15:01", refine_allele
    ]
    # Number of hits to sample for each allele, in the same order as above.
    peptides_per_allele = [
        2000, 1000, 500,
        1500, 1200, 800,
    ]

    allele_to_peptides = dict(zip(alleles, peptides_per_allele))

    length = 9

    train_with_ms = pandas.read_csv(
        get_path("data_curated", "curated_training_data.with_mass_spec.csv.bz2"))
    train_no_ms = pandas.read_csv(get_path("data_curated",
        "curated_training_data.no_mass_spec.csv.bz2"))

    def filter_df(df):
        # Keep only rows for the alleles under test with peptides of the
        # selected length.
        df = df.loc[
            (df.allele.isin(alleles)) &
            (df.peptide.str.len() == length)
        ]
        return df

    train_with_ms = filter_df(train_with_ms)
    train_no_ms = filter_df(train_no_ms)

    # Peptides present only in the dataset that includes mass spec.
    ms_specific = train_with_ms.loc[
        ~train_with_ms.peptide.isin(train_no_ms.peptide)
    ]

    train_peptides = []
    train_true_alleles = []
    for allele in alleles:
        # NOTE(review): no random_state is passed, so the sample is only
        # reproducible if the global numpy RNG is seeded elsewhere.
        peptides = ms_specific.loc[ms_specific.allele == allele].peptide.sample(
            n=allele_to_peptides[allele])
        train_peptides.extend(peptides)
        train_true_alleles.extend([allele] * len(peptides))

    hits_df = pandas.DataFrame({"peptide": train_peptides})
    hits_df["true_allele"] = train_true_alleles
    hits_df["hit"] = 1.0

    # One decoy per hit: the same peptide passed through scramble_peptide.
    decoys_df = hits_df.copy()
    decoys_df["peptide"] = decoys_df.peptide.map(scramble_peptide)
    decoys_df["true_allele"] = ""
    decoys_df["hit"] = 0.0

    mms_train_df = pandas.concat([hits_df, decoys_df], ignore_index=True)
    mms_train_df["label"] = mms_train_df.hit
    mms_train_df["is_affinity"] = False

    affinity_train_df = pandas.read_csv(
        get_path(
            "models_class1_pan", "models.with_mass_spec/train_data.csv.bz2"))
    affinity_train_df = affinity_train_df.loc[
        affinity_train_df.allele.isin(alleles),
        ["peptide", "allele", "measurement_inequality", "measurement_value"]]

    # Rename measurement_value -> label so both frames share a label column.
    affinity_train_df["label"] = affinity_train_df["measurement_value"]
    del affinity_train_df["measurement_value"]
    affinity_train_df["is_affinity"] = True

    predictor = Class1LigandomePredictor(
        PAN_ALLELE_PREDICTOR_NO_MASS_SPEC,
        auxiliary_input_features=["gene"],
        max_ensemble_size=1,
        max_epochs=max_epochs,
        learning_rate=0.0001,
        patience=5,
        min_delta=0.0,
        random_negative_rate=1.0,
        random_negative_constant=25)

    mms_allele_encoding = MultipleAlleleEncoding(
        experiment_names=["experiment1"] * len(mms_train_df),
        experiment_to_allele_list={
            "experiment1": alleles,
        },
        max_alleles_per_experiment=6,
        allele_to_sequence=PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.allele_to_sequence,
    )

    # The encoding used for fitting covers the mass-spec rows followed by the
    # affinity rows; it is built on a deep copy so mms_allele_encoding stays
    # untouched for the prediction calls below.
    allele_encoding = copy.deepcopy(mms_allele_encoding)
    allele_encoding.append_alleles(affinity_train_df.allele.values)
    allele_encoding = allele_encoding.compact()

    train_df = pandas.concat(
        [mms_train_df, affinity_train_df], ignore_index=True, sort=False)

    pre_predictions = from_ic50(
        predictor.predict(
            output="affinities_matrix",
            peptides=mms_train_df.peptide.values,
            alleles=mms_allele_encoding))

    # Reference: run the single underlying pan-allele model directly, with
    # each peptide repeated once per allele.
    (model,) = PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.class1_pan_allele_models
    expected_pre_predictions = from_ic50(
        model.predict(
            peptides=numpy.repeat(mms_train_df.peptide.values, len(alleles)),
            allele_encoding=mms_allele_encoding.allele_encoding,
        )).reshape((-1, len(alleles)))

    mms_train_df["pre_max_prediction"] = pre_predictions.max(1)
    pre_auc = roc_auc_score(mms_train_df.hit.values, mms_train_df.pre_max_prediction.values)
    print("PRE_AUC", pre_auc)

    assert_allclose(pre_predictions, expected_pre_predictions, rtol=1e-4)

    motifs_history = []
    random_peptides_encodable = make_random_peptides(10000, [9])

    def update_motifs():
        # Append one (allele, motif) entry per allele to motifs_history.
        for allele in alleles:
            motif = make_motif(allele, random_peptides_encodable)
            motifs_history.append((allele, motif))

    metric_rows = []

    def progress():
        # Evaluate the predictor on the mass-spec rows and record one metrics
        # row per output kind. Returns (ligandome_prediction, auc), where auc
        # is the value from the last kind evaluated ("ligandome").
        (_, ligandome_prediction, affinities_predictions) = (
            predictor.predict(
                output="all",
                peptides=mms_train_df.peptide.values,
                alleles=mms_allele_encoding))
        affinities_predictions = from_ic50(affinities_predictions)

        # The hit column is not modified below, so the masks are computed once.
        is_hit = mms_train_df.hit == 1.0
        is_decoy = mms_train_df.hit == 0.0

        for (kind, predictions) in [
                ("affinities", affinities_predictions),
                ("ligandome", ligandome_prediction)]:

            mms_train_df["max_prediction"] = predictions.max(1)
            mms_train_df["predicted_allele"] = pandas.Series(alleles).loc[
                predictions.argmax(1).flatten()
            ].values

            print(kind)
            print(predictions)

            mean_predictions_for_hit = mms_train_df.loc[
                is_hit
            ].max_prediction.mean()
            mean_predictions_for_decoy = mms_train_df.loc[
                is_decoy
            ].max_prediction.mean()
            correct_allele_fraction = (
                mms_train_df.loc[is_hit].predicted_allele ==
                mms_train_df.loc[is_hit].true_allele
            ).mean()
            auc = roc_auc_score(mms_train_df.hit.values, mms_train_df.max_prediction.values)

            print(kind, "Mean prediction for hit", mean_predictions_for_hit)
            print(kind, "Mean prediction for decoy", mean_predictions_for_decoy)
            print(kind, "Correct predicted allele fraction", correct_allele_fraction)
            print(kind, "AUC", auc)

            metric_rows.append((
                kind,
                mean_predictions_for_hit,
                mean_predictions_for_decoy,
                correct_allele_fraction,
                auc,
            ))

        # NOTE(review): motifs are recorded once per progress() call here.
        # No fitting happens between the two loop iterations, so a per-kind
        # snapshot would presumably be identical -- confirm the intended
        # nesting against the upstream file.
        update_motifs()

        return (ligandome_prediction, auc)

    print("Pre fitting:")
    progress()
    update_motifs()
    print("Fitting...")

    predictor.fit(
        peptides=train_df.peptide.values,
        labels=train_df.label.values,
        inequalities=train_df.measurement_inequality.values,
        affinities_mask=train_df.is_affinity.values,
        allele_encoding=allele_encoding,
        progress_callback=progress,
    )

    (predictions, final_auc) = progress()
    print("Final AUC", final_auc)

    update_motifs()

    motifs = pandas.DataFrame(
        motifs_history,
        columns=[
            "allele",
            "motif",
        ]
    )

    metrics = pandas.DataFrame(
        metric_rows,
        columns=[
            "output",
            "mean_predictions_for_hit",
            "mean_predictions_for_decoy",
            "correct_allele_fraction",
            "auc"
        ])

    return (predictor, predictions, metrics, motifs)
def Xtest_synthetic_allele_refinement(max_epochs=10): def Xtest_synthetic_allele_refinement(max_epochs=10):
...@@ -387,12 +609,13 @@ def Xtest_synthetic_allele_refinement(max_epochs=10): ...@@ -387,12 +609,13 @@ def Xtest_synthetic_allele_refinement(max_epochs=10):
predictor = Class1LigandomePredictor( predictor = Class1LigandomePredictor(
PAN_ALLELE_PREDICTOR_NO_MASS_SPEC, PAN_ALLELE_PREDICTOR_NO_MASS_SPEC,
additional_dense_layers=[8, 1],
max_ensemble_size=1, max_ensemble_size=1,
max_epochs=max_epochs, max_epochs=max_epochs,
learning_rate=0.0001, learning_rate=0.0001,
patience=5, patience=5,
min_delta=0.0) min_delta=0.0,
random_negative_rate=0.0,
random_negative_constant=0)
allele_encoding = MultipleAlleleEncoding( allele_encoding = MultipleAlleleEncoding(
experiment_names=["experiment1"] * len(train_df), experiment_names=["experiment1"] * len(train_df),
...@@ -405,9 +628,9 @@ def Xtest_synthetic_allele_refinement(max_epochs=10): ...@@ -405,9 +628,9 @@ def Xtest_synthetic_allele_refinement(max_epochs=10):
pre_predictions = from_ic50( pre_predictions = from_ic50(
predictor.predict( predictor.predict(
output="affinities", output="affinities_matrix",
peptides=train_df.peptide.values, peptides=train_df.peptide.values,
allele_encoding=allele_encoding)) alleles=allele_encoding))
(model,) = PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.class1_pan_allele_models (model,) = PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.class1_pan_allele_models
expected_pre_predictions = from_ic50( expected_pre_predictions = from_ic50(
...@@ -436,45 +659,52 @@ def Xtest_synthetic_allele_refinement(max_epochs=10): ...@@ -436,45 +659,52 @@ def Xtest_synthetic_allele_refinement(max_epochs=10):
metric_rows = [] metric_rows = []
def progress(): def progress():
predictions = from_ic50( (_, ligandome_prediction, affinities_predictions) = (
predictor.predict( predictor.predict(
output="affinities", output="all",
peptides=train_df.peptide.values, peptides=train_df.peptide.values,
allele_encoding=allele_encoding)) alleles=allele_encoding))
affinities_predictions = from_ic50(affinities_predictions)
train_df["max_prediction"] = predictions.max(1) for (kind, predictions) in [
train_df["predicted_allele"] = pandas.Series(alleles).loc[ ("affinities", affinities_predictions),
predictions.argmax(1).flatten()].values ("ligandome", ligandome_prediction)]:
print(predictions) train_df["max_prediction"] = predictions.max(1)
train_df["predicted_allele"] = pandas.Series(alleles).loc[
mean_predictions_for_hit = train_df.loc[ predictions.argmax(1).flatten()
train_df.hit == 1.0 ].values
].max_prediction.mean()
mean_predictions_for_decoy = train_df.loc[ print(kind)
train_df.hit == 0.0 print(predictions)
].max_prediction.mean()
correct_allele_fraction = ( mean_predictions_for_hit = train_df.loc[
train_df.loc[train_df.hit == 1.0].predicted_allele == train_df.hit == 1.0
train_df.loc[train_df.hit == 1.0].true_allele ].max_prediction.mean()
).mean() mean_predictions_for_decoy = train_df.loc[
auc = roc_auc_score(train_df.hit.values, train_df.max_prediction.values) train_df.hit == 0.0
].max_prediction.mean()
print("Mean prediction for hit", mean_predictions_for_hit) correct_allele_fraction = (
print("Mean prediction for decoy", mean_predictions_for_decoy) train_df.loc[train_df.hit == 1.0].predicted_allele ==
print("Correct predicted allele fraction", correct_allele_fraction) train_df.loc[train_df.hit == 1.0].true_allele
print("AUC", auc) ).mean()
auc = roc_auc_score(train_df.hit.values, train_df.max_prediction.values)
metric_rows.append((
mean_predictions_for_hit, print(kind, "Mean prediction for hit", mean_predictions_for_hit)
mean_predictions_for_decoy, print(kind, "Mean prediction for decoy", mean_predictions_for_decoy)
correct_allele_fraction, print(kind, "Correct predicted allele fraction", correct_allele_fraction)
auc, print(kind, "AUC", auc)
))
metric_rows.append((
update_motifs() kind,
mean_predictions_for_hit,
return (predictions, auc) mean_predictions_for_decoy,
correct_allele_fraction,
auc,
))
update_motifs()
return (ligandome_prediction, auc)
print("Pre fitting:") print("Pre fitting:")
progress() progress()
...@@ -504,6 +734,7 @@ def Xtest_synthetic_allele_refinement(max_epochs=10): ...@@ -504,6 +734,7 @@ def Xtest_synthetic_allele_refinement(max_epochs=10):
metrics = pandas.DataFrame( metrics = pandas.DataFrame(
metric_rows, metric_rows,
columns=[ columns=[
"output",
"mean_predictions_for_hit", "mean_predictions_for_hit",
"mean_predictions_for_decoy", "mean_predictions_for_decoy",
"correct_allele_fraction", "correct_allele_fraction",
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment