Commit 288e0db7 authored by Tim O'Donnell

updates

parent 7f6fb1dc
@@ -146,7 +146,52 @@ class Class1LigandomePredictor(object):
         return network
 
     @staticmethod
-    def loss(y_true, y_pred):
+    def loss(y_true, y_pred, lmbda=0.001):
+        import keras.backend as K
+        import tensorflow as tf
+
+        y_pred = tf.squeeze(y_pred, axis=-1)
+        #y_pred = tf.Print(y_pred, [y_pred, tf.shape(y_pred)], "y_pred", summarize=20)
+        #y_true = tf.Print(y_true, [y_true, tf.shape(y_true)], "y_true", summarize=20)
+        y_true = tf.reshape(tf.cast(y_true, tf.bool), (-1,))
+
+        pos = tf.boolean_mask(y_pred, y_true)
+        pos_max = tf.reduce_max(pos, axis=1)
+        #pos_max = tf.reduce_logsumexp(tf.boolean_mask(y_pred, y_true), axis=1)
+        neg = tf.boolean_mask(y_pred, tf.logical_not(y_true))
+
+        result = tf.reduce_sum(
+            tf.maximum(0.0, tf.reshape(neg, (-1, 1)) - pos_max)**2)
+        term2 = tf.reduce_sum(
+            tf.minimum(0.0, tf.reshape(neg, (-1, 1)) - pos_max))
+        result = result + lmbda * term2
+        #differences = tf.reshape(neg, (-1, 1)) - pos
+        #result = tf.reduce_sum(tf.sign(differences) * differences**2)
+        #result = tf.Print(result, [result], "result", summarize=20)
+        #term2 = lmbda * tf.reduce_mean((1 - pos)**2)
+        #result = result + term2
+        return result
+
+        """
+        Reference numpy implementation of the hinge term:
+
+        pos = y_pred[y_true.astype(bool)].max(1)
+        neg = y_pred[~y_true.astype(bool)]
+        expected2 = (numpy.maximum(0,
+            neg.flatten().reshape((-1, 1)) - pos) ** 2).sum()
+        """
+
+    @staticmethod
+    def loss_old(y_true, y_pred):
         """Binary cross entropy after taking logsumexp over predictions"""
         import keras.backend as K
         import tensorflow as tf
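The new loss is a pairwise ranking hinge: every decoy prediction is penalized quadratically for exceeding the best per-allele prediction of any hit, plus a small lmbda-weighted linear term that keeps widening margins that are already satisfied. A minimal NumPy sketch of the same computation, handy for sanity-checking the TensorFlow version (the function name is illustrative; y_pred is assumed to have shape (num_peptides, num_alleles)):

    import numpy

    def ranking_loss_reference(y_true, y_pred, lmbda=0.001):
        # Best per-allele score for each hit peptide.
        y_true = numpy.asarray(y_true).astype(bool)
        y_pred = numpy.asarray(y_pred)
        pos_max = y_pred[y_true].max(1)
        # All predictions for decoy peptides, flattened across alleles.
        neg = y_pred[~y_true].flatten()
        differences = neg.reshape((-1, 1)) - pos_max
        # Squared hinge on violations, plus lmbda times the (negative)
        # already-satisfied margins.
        return ((numpy.maximum(0.0, differences) ** 2).sum()
                + lmbda * numpy.minimum(0.0, differences).sum())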
@@ -230,6 +275,11 @@ class Class1LigandomePredictor(object):
         import keras.backend as K
 
+        #for layer in self.network._layers[:8]:
+        #    print("Setting non trainable", layer)
+        #    layer.trainable = False
+        #    import ipdb ; ipdb.set_trace()
+
         peptides = EncodableSequences.create(peptides)
         peptide_encoding = self.peptides_to_network_input(peptides)
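The commented-out block above sketches freezing the first layers (the pretrained affinity-predictor trunk) so that only the new ligandome-specific weights update during fit. As a generic Keras pattern, not this repo's API, that would look roughly like the following; the layer count, optimizer, and loss are placeholders, and note that trainable flags only take effect after recompiling:

    def freeze_trunk(network, num_frozen=8):
        # Mark the pretrained trunk layers as non-trainable.
        for layer in network.layers[:num_frozen]:
            layer.trainable = False
        # Recompile so the new trainable flags are respected.
        network.compile(optimizer="rmsprop", loss="mse")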
@@ -273,6 +323,9 @@ class Class1LigandomePredictor(object):
         start = time.time()
         for i in range(self.hyperparameters['max_epochs']):
             epoch_start = time.time()
+
+            # TODO: need to use fit_generator to keep each minibatch
+            # corresponding to a single experiment.
             fit_history = self.network.fit(
                 x_dict,
                 labels,
@@ -281,7 +334,8 @@ class Class1LigandomePredictor(object):
                 verbose=verbose,
                 epochs=i + 1,
                 initial_epoch=i,
-                validation_split=self.hyperparameters['validation_split'])
+                validation_split=self.hyperparameters['validation_split'],
+            )
             epoch_time = time.time() - epoch_start
 
             for (key, value) in fit_history.history.items():
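On the TODO above: since the loss ranks hits against decoys within each minibatch, batches should not mix experiments. One possible shape for a generator that fit_generator could consume (experiment_names and the batching scheme are assumptions, not current code; pair it with an appropriate steps_per_epoch):

    import numpy

    def batches_by_experiment(x_dict, labels, experiment_names, batch_size=128):
        # Loop forever, yielding minibatches whose samples all come from a
        # single experiment, so within-batch ranking stays within-experiment.
        experiment_names = numpy.asarray(experiment_names)
        while True:
            for name in numpy.unique(experiment_names):
                (indices,) = numpy.where(experiment_names == name)
                numpy.random.shuffle(indices)
                for start in range(0, len(indices), batch_size):
                    batch = indices[start:start + batch_size]
                    yield (
                        dict((key, value[batch])
                             for (key, value) in x_dict.items()),
                        labels[batch])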
...
@@ -14,7 +14,10 @@ Idea:
 """
 from sklearn.metrics import roc_auc_score
+import logging
+logging.getLogger('tensorflow').disabled = True
+logging.getLogger('matplotlib').disabled = True
 import pandas
 import argparse
 import sys
@@ -25,12 +28,13 @@ from random import shuffle
 from sklearn.metrics import roc_auc_score
 
-from mhcflurry import Class1AffinityPredictor,Class1NeuralNetwork
+from mhcflurry import Class1AffinityPredictor, Class1NeuralNetwork
 from mhcflurry.allele_encoding import MultipleAlleleEncoding
 from mhcflurry.class1_ligandome_predictor import Class1LigandomePredictor
 from mhcflurry.encodable_sequences import EncodableSequences
 from mhcflurry.downloads import get_path
 from mhcflurry.regression_target import from_ic50
 from mhcflurry.common import random_peptides, positional_frequency_matrix
 from mhcflurry.testing_utils import cleanup, startup
+from mhcflurry.amino_acid import COMMON_AMINO_ACIDS
@@ -72,29 +76,140 @@ def teardown():
     cleanup()
 
 
 def sample_peptides_from_pssm(pssm, count):
     result = pandas.DataFrame(
         index=numpy.arange(count),
         columns=pssm.index,
         dtype=object,
     )
     for (position, vector) in pssm.iterrows():
         result.loc[:, position] = numpy.random.choice(
             pssm.columns,
             size=count,
             replace=True,
             p=vector.values)
     return result.apply("".join, axis=1)
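A toy usage sketch for sample_peptides_from_pssm (the matrix values are made up; rows are positions, columns are amino acids, and each row must sum to 1 since it is used as a sampling distribution):

    import pandas

    pssm = pandas.DataFrame(
        [[0.8, 0.1, 0.1],   # position 1 mostly A
         [0.1, 0.8, 0.1],   # position 2 mostly L
         [0.1, 0.1, 0.8]],  # position 3 mostly V
        index=[1, 2, 3],
        columns=["A", "L", "V"])
    print(sample_peptides_from_pssm(pssm, 5))  # e.g. "ALV", "ALA", "VLV", ...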
 
 def scramble_peptide(peptide):
     lst = list(peptide)
     shuffle(lst)
     return "".join(lst)
 
+def evaluate_loss(loss, y_true, y_pred):
+    """Evaluate a Keras loss function on concrete numpy arrays."""
+    import keras.backend as K
+
+    y_true = numpy.array(y_true)
+    y_pred = numpy.array(y_pred)
+    if y_pred.ndim == 1:
+        y_pred = y_pred.reshape((len(y_pred), 1))
+    if y_true.ndim == 1:
+        y_true = y_true.reshape((len(y_true), 1))
+
+    if K.backend() == "tensorflow":
+        session = K.get_session()
+        y_true_var = K.constant(y_true, name="y_true")
+        y_pred_var = K.constant(y_pred, name="y_pred")
+        result = loss(y_true_var, y_pred_var)
+        return result.eval(session=session)
+    elif K.backend() == "theano":
+        y_true_var = K.constant(y_true, name="y_true")
+        y_pred_var = K.constant(y_pred, name="y_pred")
+        result = loss(y_true_var, y_pred_var)
+        return result.eval()
+    else:
+        raise ValueError("Unsupported backend: %s" % K.backend())
 
+def Xtest_loss():
+    # Hit labels
+    y_true = [
+        1.0,
+        0.0,
+        1.0,
+        1.0,
+        0.0
+    ]
+    y_true = numpy.array(y_true)
+    y_pred = [
+        [0.3, 0.7, 0.5],
+        [0.2, 0.4, 0.6],
+        [0.1, 0.5, 0.3],
+        [0.1, 0.7, 0.1],
+        [0.8, 0.2, 0.4],
+    ]
+    y_pred = numpy.array(y_pred)
+
+    # reference implementation 1: explicit loop over (hit, decoy) pairs
+    contributions = []
+    for i in range(len(y_true)):
+        if y_true[i] == 1.0:
+            for j in range(len(y_true)):
+                if y_true[j] == 0.0:
+                    tightest_i = max(y_pred[i])
+                    contribution = sum(
+                        max(0, y_pred[j, k] - tightest_i)**2
+                        for k in range(y_pred.shape[1])
+                    )
+                    contributions.append(contribution)
+    contributions = numpy.array(contributions)
+    expected1 = contributions.sum()
+
+    # reference implementation 2: numpy
+    pos = y_pred[y_true.astype(bool)].max(1)
+    neg = y_pred[~y_true.astype(bool)]
+    expected2 = (
+        numpy.maximum(0, neg.reshape((-1, 1)) - pos)**2).sum()
+    numpy.testing.assert_almost_equal(expected1, expected2)
+
+    # Note: the references above omit the loss's lmbda * term2 contribution,
+    # so this comparison is only exact for lmbda == 0; the leading "X" keeps
+    # the test runner from collecting this test.
+    computed = evaluate_loss(
+        Class1LigandomePredictor.loss,
+        y_true,
+        y_pred.reshape(y_pred.shape + (1,)))
+    numpy.testing.assert_almost_equal(computed, expected1)
 
+AA_DIST = pandas.Series(
+    dict((line.split()[0], float(line.split()[1])) for line in """
+A 0.071732
+E 0.060102
+N 0.034679
+D 0.039601
+T 0.055313
+L 0.115337
+V 0.070498
+S 0.071882
+Q 0.040436
+F 0.050178
+G 0.053176
+C 0.005429
+H 0.025487
+I 0.056312
+W 0.013593
+K 0.057832
+M 0.021079
+Y 0.043372
+R 0.060330
+P 0.053632
+""".strip().split("\n")))
+
+print(AA_DIST)
 
+def make_random_peptides(num_peptides_per_length=10000, lengths=[9]):
+    peptides = []
+    for length in lengths:
+        peptides.extend(
+            random_peptides(
+                num_peptides_per_length, length=length, distribution=AA_DIST))
+    return EncodableSequences.create(peptides)
 
+def make_motif(allele, peptides, frac=0.01):
+    peptides = EncodableSequences.create(peptides)
+    predictions = PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.predict(
+        peptides=peptides,
+        allele=allele,
+    )
+    random_predictions_df = pandas.DataFrame({"peptide": peptides.sequences})
+    random_predictions_df["prediction"] = predictions
+    # Sort ascending: lower predicted affinity (nM) means stronger binding.
+    random_predictions_df = random_predictions_df.sort_values(
+        "prediction", ascending=True)
+    #print("Random peptide predictions", allele)
+    #print(random_predictions_df)
+
+    top = random_predictions_df.iloc[:int(len(random_predictions_df) * frac)]
+    matrix = positional_frequency_matrix(top.peptide.values)
+    #print("Matrix")
+    return matrix
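Usage sketch (the allele is one of those used later in this file; 10,000 random 9-mers matches the scale used in the test below):

    motif = make_motif("HLA-A*02:01", random_peptides(10000, length=9))
    # motif is a positions x amino-acids frequency matrix computed from the
    # top 1% of the random peptides, ranked by predicted affinity.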
 
 
 def test_synthetic_allele_refinement():
     refine_allele = "HLA-C*01:02"
     alleles = [
@@ -151,8 +266,10 @@ def test_synthetic_allele_refinement():
     predictor = Class1LigandomePredictor(
         PAN_ALLELE_PREDICTOR_NO_MASS_SPEC,
         max_ensemble_size=1,
-        max_epochs=100,
-        patience=5)
+        max_epochs=10,
+        learning_rate=0.00001,
+        patience=5,
+        min_delta=0.0)
 
     allele_encoding = MultipleAlleleEncoding(
         experiment_names=["experiment1"] * len(train_df),
@@ -182,85 +299,115 @@ def test_synthetic_allele_refinement():
 
     assert_allclose(pre_predictions, expected_pre_predictions)
 
-    predictor.fit(
-        peptides=train_df.peptide.values,
-        labels=train_df.hit.values,
-        allele_encoding=allele_encoding
-    )
-
-    predictions = predictor.predict(
-        peptides=train_df.peptide.values,
-        allele_encoding=allele_encoding,
-    )
-
-    train_df["max_prediction"] = predictions.max(1)
-    train_df["predicted_allele"] = pandas.Series(alleles).loc[
-        predictions.argmax(1).flatten()
-    ].values
-
-    print(predictions)
-
-    auc = roc_auc_score(train_df.hit.values, train_df.max_prediction.values)
-    print("AUC", auc)
-
-    return predictions
+    motifs_history = []
+    random_peptides_encodable = make_random_peptides(10000, [9])
+
+    def update_motifs():
+        for allele in alleles:
+            motif = make_motif(allele, random_peptides_encodable)
+            motifs_history.append((allele, motif))
+
+    metric_rows = []
+
+    def progress():
+        predictions = predictor.predict(
+            peptides=train_df.peptide.values,
+            allele_encoding=allele_encoding)
+        #import ipdb ; ipdb.set_trace()
+
+        train_df["max_prediction"] = predictions.max(1)
+        train_df["predicted_allele"] = pandas.Series(alleles).loc[
+            predictions.argmax(1).flatten()].values
+        print(predictions)
+
+        mean_predictions_for_hit = train_df.loc[
+            train_df.hit == 1.0
+        ].max_prediction.mean()
+        mean_predictions_for_decoy = train_df.loc[
+            train_df.hit == 0.0
+        ].max_prediction.mean()
+        correct_allele_fraction = (
+            train_df.loc[train_df.hit == 1.0].predicted_allele ==
+            train_df.loc[train_df.hit == 1.0].true_allele
+        ).mean()
+        auc = roc_auc_score(train_df.hit.values, train_df.max_prediction.values)
+
+        print("Mean prediction for hit", mean_predictions_for_hit)
+        print("Mean prediction for decoy", mean_predictions_for_decoy)
+        print("Correct predicted allele fraction", correct_allele_fraction)
+        print("AUC", auc)
+
+        metric_rows.append((
+            mean_predictions_for_hit,
+            mean_predictions_for_decoy,
+            correct_allele_fraction,
+            auc,
+        ))
+        update_motifs()
+
+        return (predictions, auc)
+
+    print("Pre fitting:")
+    progress()
+    update_motifs()
+
+    print("Fitting...")
+    predictor.fit(
+        peptides=train_df.peptide.values,
+        labels=train_df.hit.values,
+        allele_encoding=allele_encoding,
+        progress_callback=progress,
+    )
+
+    (predictions, final_auc) = progress()
+    print("Final AUC", final_auc)
+    update_motifs()
+
+    motifs = pandas.DataFrame(
+        motifs_history,
+        columns=[
+            "allele",
+            "motif",
+        ])
+
+    metrics = pandas.DataFrame(
+        metric_rows,
+        columns=[
+            "mean_predictions_for_hit",
+            "mean_predictions_for_decoy",
+            "correct_allele_fraction",
+            "auc"
+        ])
+
+    #import ipdb ; ipdb.set_trace()
+
+    return (predictor, predictions, metrics, motifs)
-
-
-"""
-def test_simple_synethetic(
-        num_peptide_per_allele_and_length=100, lengths=[8,9,10,11]):
-    alleles = [
-        "HLA-A*02:01", "HLA-B*52:01", "HLA-C*07:01",
-        "HLA-A*03:01", "HLA-B*57:02", "HLA-C*03:01",
-    ]
-    cutoff = PAN_ALLELE_MOTIFS_DF.cutoff_fraction.min()
-    peptides_and_alleles = []
-    for allele in alleles:
-        sub_df = PAN_ALLELE_MOTIFS_DF.loc[
-            (PAN_ALLELE_MOTIFS_DF.allele == allele) &
-            (PAN_ALLELE_MOTIFS_DF.cutoff_fraction == cutoff)
-        ]
-        assert len(sub_df) > 0, allele
-        for length in lengths:
-            pssm = sub_df.loc[
-                sub_df.length == length
-            ].set_index("position")[COMMON_AMINO_ACIDS]
-            peptides = sample_peptides_from_pssm(
-                pssm, num_peptide_per_allele_and_length)
-            for peptide in peptides:
-                peptides_and_alleles.append((peptide, allele))
-
-    hits_df = pandas.DataFrame(
-        peptides_and_alleles,
-        columns=["peptide", "allele"]
-    )
-    hits_df["hit"] = 1
-
-    decoys = hits_df.copy()
-    decoys["peptide"] = decoys.peptide.map(scramble_peptide)
-    decoys["hit"] = 0.0
-
-    train_df = pandas.concat([hits_df, decoys], ignore_index=True)
-    return train_df
-    return
-    pass
-"""
 
 
 parser = argparse.ArgumentParser(usage=__doc__)
 parser.add_argument(
-    "--alleles",
-    nargs="+",
-    default=None,
-    help="Which alleles to test")
+    "--out-metrics-csv",
+    default=None,
+    help="Metrics output")
+parser.add_argument(
+    "--out-motifs-pickle",
+    default=None,
+    help="Motifs output")
 
 
 if __name__ == '__main__':
     # If run directly from python, leave the user in a shell to explore results.
     setup()
     args = parser.parse_args(sys.argv[1:])
 
-    result = test_synthetic_allele_refinement()
+    (predictor, predictions, metrics, motifs) = test_synthetic_allele_refinement()
+    if args.out_metrics_csv:
+        metrics.to_csv(args.out_metrics_csv)
+    if args.out_motifs_pickle:
+        motifs.to_pickle(args.out_motifs_pickle)
 
     # Leave in ipython
     import ipdb  # pylint: disable=import-error
...
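Example invocation of the updated test script (the file name and output paths are illustrative):

    python test_class1_ligandome_predictor.py \
        --out-metrics-csv /tmp/metrics.csv \
        --out-motifs-pickle /tmp/motifs.pickle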