Skip to content
Snippets Groups Projects
Commit 694e6634 authored by Tim O'Donnell
Browse files

fixes

parent 7205db5c
No related branches found
No related tags found
No related merge requests found
......@@ -150,8 +150,8 @@ class AlleleEncoding(object):
class MultipleAlleleEncoding(object):
def __init__(
self,
experiment_names,
experiment_to_allele_list,
experiment_names=[],
experiment_to_allele_list={},
max_alleles_per_experiment=6,
allele_to_sequence=None,
borrow_from=None):
......@@ -194,6 +194,12 @@ class MultipleAlleleEncoding(object):
return self.allele_encoding.indices.values.reshape(
(-1, self.max_alleles_per_experiment))
@property
def alleles(self):
    """
    Allele values of the underlying encoding, reshaped so that each row
    holds ``max_alleles_per_experiment`` entries.
    """
    flat_alleles = self.allele_encoding.alleles.values
    row_shape = (-1, self.max_alleles_per_experiment)
    return numpy.reshape(flat_alleles, row_shape)
def compact(self):
result = copy(self)
result.allele_encoding = self.allele_encoding.compact()
......
This diff is collapsed.
......@@ -92,12 +92,12 @@ class MSEWithInequalities(Loss):
y_pred is greater or less than y_true.
between 2 - 3:
Treated as a "<" inequality. Penalty (y_pred - (y_true - 2))**2 is
applied only if y_pred is greater than y_true - 2.
Treated as a ">" inequality. Penalty (y_pred - (y_true - 2))**2 is
applied only if y_pred is less than y_true - 2.
between 4 - 5:
Treated as a ">" inequality. Penalty (y_pred - (y_true - 4))**2 is
applied only if y_pred is less than y_true - 4.
Treated as a "<" inequality. Penalty (y_pred - (y_true - 4))**2 is
applied only if y_pred is greater than y_true - 4.
"""
name = "mse_with_inequalities"
supports_inequalities = True
......@@ -240,11 +240,12 @@ class MultiallelicMassSpecLoss(Loss):
supports_inequalities = True
supports_multiple_outputs = False
def __init__(self, delta=0.2):
def __init__(self, delta=0.2, multiplier=1.0):
self.delta = delta
self.multiplier = multiplier
@staticmethod
def encode_y(y, affinities_mask=None, inequalities=None):
def encode_y(y):
encoded = pandas.Series(y, dtype="float32", copy=True)
assert all(item in (-1.0, 1.0, 0.0) for item in encoded), set(y)
print(
......@@ -262,9 +263,25 @@ class MultiallelicMassSpecLoss(Loss):
pos_max = tf.reduce_max(pos, axis=1)
neg = tf.boolean_mask(y_pred, tf.math.equal(y_true, 0.0))
term = tf.reshape(neg, (-1, 1)) - pos_max + self.delta
result = tf.reduce_sum(tf.maximum(0.0, term) ** 2)
return result
result = tf.reduce_sum(tf.maximum(0.0, term) ** 2) / tf.cast(
tf.shape(term)[0], tf.float32) * self.multiplier
return tf.where(tf.is_nan(result), 0.0, result)
class ZeroLoss(Loss):
    """
    Loss whose value is the constant zero.

    ``encode_y`` hands the labels back untouched and ``loss`` ignores both
    of its arguments.
    """
    name = "zero_loss"
    supports_inequalities = False
    supports_multiple_outputs = False

    @staticmethod
    def encode_y(y):
        """Return ``y`` exactly as given; no encoding is applied."""
        return y

    @staticmethod
    def loss(y_true, y_pred):
        """Return ``0.0`` regardless of ``y_true`` and ``y_pred``."""
        return 0.0
def check_shape(name, arr, expected_shape):
......@@ -284,5 +301,7 @@ def check_shape(name, arr, expected_shape):
# Register custom losses.
for cls in [MSEWithInequalities, MSEWithInequalitiesAndMultipleOutputs]:
for cls in [MSEWithInequalities, MSEWithInequalitiesAndMultipleOutputs, MultiallelicMassSpecLoss, ZeroLoss]:
CUSTOM_LOSSES[cls.name] = cls()
......@@ -21,6 +21,7 @@ logging.getLogger('matplotlib').disabled = True
import pandas
import argparse
import sys
import copy
from functools import partial
from numpy.testing import assert_, assert_equal, assert_allclose
......@@ -153,7 +154,7 @@ def test_loss():
)
contributions.append(contribution)
contributions = numpy.array(contributions)
expected1 = contributions.sum()
expected1 = contributions.sum() / len(contributions)
# reference implementation 2: numpy
pos = numpy.array([
......@@ -164,7 +165,8 @@ def test_loss():
neg = y_pred[(y_true == 0.0).astype(bool)]
expected2 = (
numpy.maximum(0, neg.reshape((-1, 1)) - pos + delta)**2).sum()
numpy.maximum(0, neg.reshape((-1, 1)) - pos + delta)**2).sum() / (
len(pos) * len(neg))
yield numpy.testing.assert_almost_equal, expected1, expected2, 4
......@@ -226,7 +228,7 @@ def make_motif(allele, peptides, frac=0.01):
return matrix
def test_real_data_multiallelic_refinement(max_epochs=10):
def Xtest_real_data_multiallelic_refinement(max_epochs=10):
ms_df = pandas.read_csv(
get_path("data_mass_spec_annotated", "annotated_ms.csv.bz2"))
ms_df = ms_df.loc[
......@@ -240,10 +242,20 @@ def test_real_data_multiallelic_refinement(max_epochs=10):
del sample_table[col]
sample_table["alleles"] = sample_table.hla.str.split()
multi_train_df = ms_df.loc[
multi_train_hit_df = ms_df.loc[
ms_df.sample_id == "RA957"
].drop_duplicates("peptide")[["peptide", "sample_id"]].reset_index(drop=True)
multi_train_df["label"] = 1.0
multi_train_hit_df["label"] = 1.0
multi_train_decoy_df = ms_df.loc[
(ms_df.sample_id == "CD165") &
(~ms_df.peptide.isin(multi_train_hit_df.peptide.unique()))
].drop_duplicates("peptide")[["peptide"]]
(multi_train_decoy_df["sample_id"],) = multi_train_hit_df.sample_id.unique()
multi_train_decoy_df["label"] = 0.0
multi_train_df = pandas.concat(
[multi_train_hit_df, multi_train_decoy_df], ignore_index=True)
multi_train_df["is_affinity"] = False
multi_train_alleles = set()
......@@ -281,6 +293,7 @@ def test_real_data_multiallelic_refinement(max_epochs=10):
ligandome_predictor = Class1LigandomePredictor(
pan_predictor,
auxiliary_input_features=[],
max_ensemble_size=1,
max_epochs=50,
learning_rate=0.0001,
......@@ -292,7 +305,7 @@ def test_real_data_multiallelic_refinement(max_epochs=10):
ligandome_predictor.predict(
output="affinities",
peptides=combined_train_df.peptide.values,
allele_encoding=allele_encoding))
alleles=allele_encoding))
(model,) = pan_predictor.class1_pan_allele_models
expected_pre_predictions = from_ic50(
......@@ -325,11 +338,220 @@ def test_real_data_multiallelic_refinement(max_epochs=10):
progress_callback=update_motifs,
)
import ipdb ; ipdb.set_trace()
def test_synthetic_allele_refinement_with_affinity_data(max_epochs=10):
    # Builds a synthetic six-allele "experiment" from mass-spec-only hits
    # plus scrambled decoys, combines it with affinity measurements for the
    # same alleles, fits a Class1LigandomePredictor on the combined data and
    # records per-output metrics and motifs before, during and after fitting.
    #
    # Returns (predictor, predictions, metrics, motifs).
    refine_allele = "HLA-C*01:02"
    alleles = [
        "HLA-A*02:01",
        "HLA-B*27:01",
        "HLA-C*07:01",
        "HLA-A*03:01",
        "HLA-B*15:01",
        refine_allele,
    ]
    peptides_per_allele = [2000, 1000, 500, 1500, 1200, 800]
    allele_to_peptides = dict(zip(alleles, peptides_per_allele))
    length = 9

    def filter_df(df):
        # Keep rows for the selected alleles whose peptide has `length`
        # residues.
        wanted = (df.allele.isin(alleles)) & (df.peptide.str.len() == length)
        return df.loc[wanted]

    train_with_ms = filter_df(pandas.read_csv(get_path(
        "data_curated", "curated_training_data.with_mass_spec.csv.bz2")))
    train_no_ms = filter_df(pandas.read_csv(get_path(
        "data_curated", "curated_training_data.no_mass_spec.csv.bz2")))

    # Peptides that appear only in the data set that includes mass spec.
    also_in_no_ms = train_with_ms.peptide.isin(train_no_ms.peptide)
    ms_specific = train_with_ms.loc[~also_in_no_ms]

    train_peptides = []
    train_true_alleles = []
    for allele in alleles:
        candidates = ms_specific.loc[ms_specific.allele == allele].peptide
        sampled = candidates.sample(n=allele_to_peptides[allele])
        train_peptides.extend(sampled)
        train_true_alleles.extend([allele] * len(sampled))

    hits_df = pandas.DataFrame({"peptide": train_peptides})
    hits_df["true_allele"] = train_true_alleles
    hits_df["hit"] = 1.0

    # Decoys: one scrambled copy of every hit, with no true allele.
    decoys_df = hits_df.copy()
    decoys_df["peptide"] = decoys_df.peptide.map(scramble_peptide)
    decoys_df["true_allele"] = ""
    decoys_df["hit"] = 0.0

    mms_train_df = pandas.concat([hits_df, decoys_df], ignore_index=True)
    mms_train_df["label"] = mms_train_df.hit
    mms_train_df["is_affinity"] = False

    affinity_columns = [
        "peptide", "allele", "measurement_inequality", "measurement_value"]
    affinity_train_df = pandas.read_csv(
        get_path(
            "models_class1_pan", "models.with_mass_spec/train_data.csv.bz2"))
    affinity_train_df = affinity_train_df.loc[
        affinity_train_df.allele.isin(alleles), affinity_columns]
    # Move the measurement into the shared "label" column.
    affinity_train_df["label"] = affinity_train_df.pop("measurement_value")
    affinity_train_df["is_affinity"] = True

    predictor = Class1LigandomePredictor(
        PAN_ALLELE_PREDICTOR_NO_MASS_SPEC,
        auxiliary_input_features=["gene"],
        max_ensemble_size=1,
        max_epochs=max_epochs,
        learning_rate=0.0001,
        patience=5,
        min_delta=0.0,
        random_negative_rate=1.0,
        random_negative_constant=25)

    mms_allele_encoding = MultipleAlleleEncoding(
        experiment_names=["experiment1"] * len(mms_train_df),
        experiment_to_allele_list={"experiment1": alleles},
        max_alleles_per_experiment=6,
        allele_to_sequence=PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.allele_to_sequence,
    )

    # Encoding for the combined data: the multi-allelic rows followed by
    # the affinity rows.
    allele_encoding = copy.deepcopy(mms_allele_encoding)
    allele_encoding.append_alleles(affinity_train_df.allele.values)
    allele_encoding = allele_encoding.compact()

    train_df = pandas.concat(
        [mms_train_df, affinity_train_df], ignore_index=True, sort=False)

    raw_pre_predictions = predictor.predict(
        output="affinities_matrix",
        peptides=mms_train_df.peptide.values,
        alleles=mms_allele_encoding)
    pre_predictions = from_ic50(raw_pre_predictions)

    # Reference: the single pan-allele model applied to every
    # (peptide, allele) pair.
    (model,) = PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.class1_pan_allele_models
    raw_expected = model.predict(
        peptides=numpy.repeat(mms_train_df.peptide.values, len(alleles)),
        allele_encoding=mms_allele_encoding.allele_encoding,
    )
    expected_pre_predictions = from_ic50(raw_expected).reshape(
        (-1, len(alleles)))

    mms_train_df["pre_max_prediction"] = pre_predictions.max(1)
    pre_auc = roc_auc_score(
        mms_train_df.hit.values, mms_train_df.pre_max_prediction.values)
    print("PRE_AUC", pre_auc)

    assert_allclose(pre_predictions, expected_pre_predictions, rtol=1e-4)

    motifs_history = []
    random_peptides_encodable = make_random_peptides(10000, [9])

    def update_motifs():
        # Record the current motif of every allele.
        motifs_history.extend(
            (allele, make_motif(allele, random_peptides_encodable))
            for allele in alleles)

    metric_rows = []

    def progress():
        # Print and record metrics for both prediction outputs; returns the
        # ligandome predictions and the AUC of the last output evaluated.
        (_, ligandome_prediction, affinities_predictions) = (
            predictor.predict(
                output="all",
                peptides=mms_train_df.peptide.values,
                alleles=mms_allele_encoding))
        affinities_predictions = from_ic50(affinities_predictions)

        is_hit = mms_train_df.hit == 1.0
        is_decoy = mms_train_df.hit == 0.0
        outputs = [
            ("affinities", affinities_predictions),
            ("ligandome", ligandome_prediction),
        ]
        for (kind, predictions) in outputs:
            best_allele_index = predictions.argmax(1).flatten()
            mms_train_df["max_prediction"] = predictions.max(1)
            mms_train_df["predicted_allele"] = pandas.Series(alleles).loc[
                best_allele_index
            ].values

            print(kind)
            print(predictions)

            hit_rows = mms_train_df.loc[is_hit]
            decoy_rows = mms_train_df.loc[is_decoy]
            mean_predictions_for_hit = hit_rows.max_prediction.mean()
            mean_predictions_for_decoy = decoy_rows.max_prediction.mean()
            correct_allele_fraction = (
                hit_rows.predicted_allele == hit_rows.true_allele
            ).mean()
            auc = roc_auc_score(
                mms_train_df.hit.values, mms_train_df.max_prediction.values)

            print(kind, "Mean prediction for hit", mean_predictions_for_hit)
            print(kind, "Mean prediction for decoy", mean_predictions_for_decoy)
            print(kind, "Correct predicted allele fraction", correct_allele_fraction)
            print(kind, "AUC", auc)

            metric_rows.append((
                kind,
                mean_predictions_for_hit,
                mean_predictions_for_decoy,
                correct_allele_fraction,
                auc,
            ))

        update_motifs()

        return (ligandome_prediction, auc)

    print("Pre fitting:")
    progress()
    update_motifs()
    print("Fitting...")

    predictor.fit(
        peptides=train_df.peptide.values,
        labels=train_df.label.values,
        inequalities=train_df.measurement_inequality.values,
        affinities_mask=train_df.is_affinity.values,
        allele_encoding=allele_encoding,
        progress_callback=progress,
    )

    (predictions, final_auc) = progress()
    print("Final AUC", final_auc)

    update_motifs()

    motifs = pandas.DataFrame(motifs_history, columns=["allele", "motif"])
    metrics = pandas.DataFrame(
        metric_rows,
        columns=[
            "output",
            "mean_predictions_for_hit",
            "mean_predictions_for_decoy",
            "correct_allele_fraction",
            "auc",
        ])

    return (predictor, predictions, metrics, motifs)
def Xtest_synthetic_allele_refinement(max_epochs=10):
......@@ -387,12 +609,13 @@ def Xtest_synthetic_allele_refinement(max_epochs=10):
predictor = Class1LigandomePredictor(
PAN_ALLELE_PREDICTOR_NO_MASS_SPEC,
additional_dense_layers=[8, 1],
max_ensemble_size=1,
max_epochs=max_epochs,
learning_rate=0.0001,
patience=5,
min_delta=0.0)
min_delta=0.0,
random_negative_rate=0.0,
random_negative_constant=0)
allele_encoding = MultipleAlleleEncoding(
experiment_names=["experiment1"] * len(train_df),
......@@ -405,9 +628,9 @@ def Xtest_synthetic_allele_refinement(max_epochs=10):
pre_predictions = from_ic50(
predictor.predict(
output="affinities",
output="affinities_matrix",
peptides=train_df.peptide.values,
allele_encoding=allele_encoding))
alleles=allele_encoding))
(model,) = PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.class1_pan_allele_models
expected_pre_predictions = from_ic50(
......@@ -436,45 +659,52 @@ def Xtest_synthetic_allele_refinement(max_epochs=10):
metric_rows = []
def progress():
predictions = from_ic50(
(_, ligandome_prediction, affinities_predictions) = (
predictor.predict(
output="affinities",
output="all",
peptides=train_df.peptide.values,
allele_encoding=allele_encoding))
train_df["max_prediction"] = predictions.max(1)
train_df["predicted_allele"] = pandas.Series(alleles).loc[
predictions.argmax(1).flatten()].values
print(predictions)
mean_predictions_for_hit = train_df.loc[
train_df.hit == 1.0
].max_prediction.mean()
mean_predictions_for_decoy = train_df.loc[
train_df.hit == 0.0
].max_prediction.mean()
correct_allele_fraction = (
train_df.loc[train_df.hit == 1.0].predicted_allele ==
train_df.loc[train_df.hit == 1.0].true_allele
).mean()
auc = roc_auc_score(train_df.hit.values, train_df.max_prediction.values)
print("Mean prediction for hit", mean_predictions_for_hit)
print("Mean prediction for decoy", mean_predictions_for_decoy)
print("Correct predicted allele fraction", correct_allele_fraction)
print("AUC", auc)
metric_rows.append((
mean_predictions_for_hit,
mean_predictions_for_decoy,
correct_allele_fraction,
auc,
))
update_motifs()
return (predictions, auc)
alleles=allele_encoding))
affinities_predictions = from_ic50(affinities_predictions)
for (kind, predictions) in [
("affinities", affinities_predictions),
("ligandome", ligandome_prediction)]:
train_df["max_prediction"] = predictions.max(1)
train_df["predicted_allele"] = pandas.Series(alleles).loc[
predictions.argmax(1).flatten()
].values
print(kind)
print(predictions)
mean_predictions_for_hit = train_df.loc[
train_df.hit == 1.0
].max_prediction.mean()
mean_predictions_for_decoy = train_df.loc[
train_df.hit == 0.0
].max_prediction.mean()
correct_allele_fraction = (
train_df.loc[train_df.hit == 1.0].predicted_allele ==
train_df.loc[train_df.hit == 1.0].true_allele
).mean()
auc = roc_auc_score(train_df.hit.values, train_df.max_prediction.values)
print(kind, "Mean prediction for hit", mean_predictions_for_hit)
print(kind, "Mean prediction for decoy", mean_predictions_for_decoy)
print(kind, "Correct predicted allele fraction", correct_allele_fraction)
print(kind, "AUC", auc)
metric_rows.append((
kind,
mean_predictions_for_hit,
mean_predictions_for_decoy,
correct_allele_fraction,
auc,
))
update_motifs()
return (ligandome_prediction, auc)
print("Pre fitting:")
progress()
......@@ -504,6 +734,7 @@ def Xtest_synthetic_allele_refinement(max_epochs=10):
metrics = pandas.DataFrame(
metric_rows,
columns=[
"output",
"mean_predictions_for_hit",
"mean_predictions_for_decoy",
"correct_allele_fraction",
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment