From 288e0db70b116466acf5256f14157018314099d2 Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Wed, 18 Sep 2019 00:17:20 -0400 Subject: [PATCH] updates --- mhcflurry/class1_ligandome_predictor.py | 58 ++++- test/test_class1_ligandome_predictor.py | 303 ++++++++++++++++++------ 2 files changed, 281 insertions(+), 80 deletions(-) diff --git a/mhcflurry/class1_ligandome_predictor.py b/mhcflurry/class1_ligandome_predictor.py index 7eba3357..5be86637 100644 --- a/mhcflurry/class1_ligandome_predictor.py +++ b/mhcflurry/class1_ligandome_predictor.py @@ -146,7 +146,52 @@ class Class1LigandomePredictor(object): return network @staticmethod - def loss(y_true, y_pred): + def loss(y_true, y_pred, lmbda=0.001): + import keras.backend as K + import tensorflow as tf + + y_pred = tf.squeeze(y_pred, axis=-1) + + #y_pred = tf.Print(y_pred, [y_pred, tf.shape(y_pred)], "y_pred", summarize=20) + #y_true = tf.Print(y_true, [y_true, tf.shape(y_true)], "y_true", summarize=20) + + y_true = tf.reshape(tf.cast(y_true, tf.bool), (-1,)) + + pos = tf.boolean_mask(y_pred, y_true) + pos_max = tf.reduce_max(pos, axis=1) + #pos_max = tf.reduce_logsumexp(tf.boolean_mask(y_pred, y_true), axis=1) + neg = tf.boolean_mask(y_pred, tf.logical_not(y_true)) + + result = tf.reduce_sum( + tf.maximum(0.0, tf.reshape(neg, (-1, 1)) - pos_max)**2) + + term2 = tf.reduce_sum( + tf.minimum(0.0, tf.reshape(neg, (-1, 1)) - pos_max)) + result = result + lmbda * term2 + + #differences = tf.reshape(neg, (-1, 1)) - pos + + #result = tf.reduce_sum(tf.sign(differences) * differences**2) + #result = tf.Print(result, [result], "result", summarize=20) + + #term2 = lmbda * tf.reduce_mean((1 - pos)**2) + #result = result + term2 + return result + + """ + pos = tf.boolean_mask(y_pred, y_true) + + pos = y_pred[y_true.astype(bool)].max(1) + neg = y_pred[~y_true.astype(bool)] + expected2 = (numpy.maximum(0, + neg.flatten().reshape((-1, 1)) - pos) ** 2).sum() + """ + + + + + @staticmethod + def loss_old(y_true, y_pred): """Binary cross entropy after taking logsumexp over predictions""" import keras.backend as K import tensorflow as tf @@ -230,6 +275,11 @@ class Class1LigandomePredictor(object): import keras.backend as K + #for layer in self.network._layers[:8]: + # print("Setting non trainable", layer) + # layer.trainable = False + # import ipdb ; ipdb.set_trace() + peptides = EncodableSequences.create(peptides) peptide_encoding = self.peptides_to_network_input(peptides) @@ -273,6 +323,9 @@ class Class1LigandomePredictor(object): start = time.time() for i in range(self.hyperparameters['max_epochs']): epoch_start = time.time() + + # TODO: need to use fit_generator to keep each minibatch corresponding + # to a single experiment fit_history = self.network.fit( x_dict, labels, @@ -281,7 +334,8 @@ class Class1LigandomePredictor(object): verbose=verbose, epochs=i + 1, initial_epoch=i, - validation_split=self.hyperparameters['validation_split']) + validation_split=self.hyperparameters['validation_split'], + ) epoch_time = time.time() - epoch_start for (key, value) in fit_history.history.items(): diff --git a/test/test_class1_ligandome_predictor.py b/test/test_class1_ligandome_predictor.py index 568ff97e..411d9c06 100644 --- a/test/test_class1_ligandome_predictor.py +++ b/test/test_class1_ligandome_predictor.py @@ -14,7 +14,10 @@ Idea: """ -from sklearn.metrics import roc_auc_score +import logging +logging.getLogger('tensorflow').disabled = True +logging.getLogger('matplotlib').disabled = True + import pandas import argparse import sys @@ -25,12 +28,13 @@ from random import shuffle from sklearn.metrics import roc_auc_score -from mhcflurry import Class1AffinityPredictor,Class1NeuralNetwork +from mhcflurry import Class1AffinityPredictor, Class1NeuralNetwork from mhcflurry.allele_encoding import MultipleAlleleEncoding from mhcflurry.class1_ligandome_predictor import Class1LigandomePredictor +from mhcflurry.encodable_sequences import EncodableSequences from mhcflurry.downloads import get_path from mhcflurry.regression_target import from_ic50 - +from mhcflurry.common import random_peptides, positional_frequency_matrix from mhcflurry.testing_utils import cleanup, startup from mhcflurry.amino_acid import COMMON_AMINO_ACIDS @@ -72,29 +76,140 @@ def teardown(): cleanup() -def sample_peptides_from_pssm(pssm, count): - result = pandas.DataFrame( - index=numpy.arange(count), - columns=pssm.index, - dtype=object, - ) - - for (position, vector) in pssm.iterrows(): - result.loc[:, position] = numpy.random.choice( - pssm.columns, - size=count, - replace=True, - p=vector.values) - - return result.apply("".join, axis=1) - - def scramble_peptide(peptide): lst = list(peptide) shuffle(lst) return "".join(lst) +def evaluate_loss(loss, y_true, y_pred): + import keras.backend as K + + y_true = numpy.array(y_true) + y_pred = numpy.array(y_pred) + if y_pred.ndim == 1: + y_pred = y_pred.reshape((len(y_pred), 1)) + if y_true.ndim == 1: + y_true = y_true.reshape((len(y_true), 1)) + + if K.backend() == "tensorflow": + session = K.get_session() + y_true_var = K.constant(y_true, name="y_true") + y_pred_var = K.constant(y_pred, name="y_pred") + result = loss(y_true_var, y_pred_var) + return result.eval(session=session) + elif K.backend() == "theano": + y_true_var = K.constant(y_true, name="y_true") + y_pred_var = K.constant(y_pred, name="y_pred") + result = loss(y_true_var, y_pred_var) + return result.eval() + else: + raise ValueError("Unsupported backend: %s" % K.backend()) + + +def Xtest_loss(): + # Hit labels + y_true = [ + 1.0, + 0.0, + 1.0, + 1.0, + 0.0 + ] + y_true = numpy.array(y_true) + y_pred = [ + [0.3, 0.7, 0.5], + [0.2, 0.4, 0.6], + [0.1, 0.5, 0.3], + [0.1, 0.7, 0.1], + [0.8, 0.2, 0.4], + ] + y_pred = numpy.array(y_pred) + + # reference implementation 1 + contributions = [] + for i in range(len(y_true)): + if y_true[i] == 1.0: + for j in range(len(y_true)): + if y_true[j] == 0.0: + tightest_i = max(y_pred[i]) + contribution = sum( + max(0, y_pred[j, k] - tightest_i)**2 + for k in range(y_pred.shape[1]) + ) + contributions.append(contribution) + contributions = numpy.array(contributions) + expected1 = contributions.sum() + + # reference implementation 2: numpy + pos = y_pred[y_true.astype(bool)].max(1) + neg = y_pred[~y_true.astype(bool)] + expected2 = ( + numpy.maximum(0, neg.reshape((-1, 1)) - pos)**2).sum() + + numpy.testing.assert_almost_equal(expected1, expected2) + + computed = evaluate_loss( + Class1LigandomePredictor.loss, + y_true, + y_pred.reshape(y_pred.shape + (1,))) + numpy.testing.assert_almost_equal(computed, expected1) + + +AA_DIST = pandas.Series( + dict((line.split()[0], float(line.split()[1])) for line in """ +A 0.071732 +E 0.060102 +N 0.034679 +D 0.039601 +T 0.055313 +L 0.115337 +V 0.070498 +S 0.071882 +Q 0.040436 +F 0.050178 +G 0.053176 +C 0.005429 +H 0.025487 +I 0.056312 +W 0.013593 +K 0.057832 +M 0.021079 +Y 0.043372 +R 0.060330 +P 0.053632 +""".strip().split("\n"))) +print(AA_DIST) + + +def make_random_peptides(num_peptides_per_length=10000, lengths=[9]): + peptides = [] + for length in lengths: + peptides.extend( + random_peptides + (num_peptides_per_length, length=length, distribution=AA_DIST)) + return EncodableSequences.create(peptides) + + +def make_motif(allele, peptides, frac=0.01): + peptides = EncodableSequences.create(peptides) + predictions = PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.predict( + peptides=peptides, + allele=allele, + ) + + random_predictions_df = pandas.DataFrame({"peptide": peptides.sequences}) + random_predictions_df["prediction"] = predictions + random_predictions_df = random_predictions_df.sort_values( + "prediction", ascending=True) + #print("Random peptide predictions", allele) + #print(random_predictions_df) + top = random_predictions_df.iloc[:int(len(random_predictions_df) * frac)] + matrix = positional_frequency_matrix(top.peptide.values) + #print("Matrix") + return matrix + + def test_synthetic_allele_refinement(): refine_allele = "HLA-C*01:02" alleles = [ @@ -151,8 +266,10 @@ def test_synthetic_allele_refinement(): predictor = Class1LigandomePredictor( PAN_ALLELE_PREDICTOR_NO_MASS_SPEC, max_ensemble_size=1, - max_epochs=100, - patience=5) + max_epochs=10, + learning_rate=0.00001, + patience=5, + min_delta=0.0) allele_encoding = MultipleAlleleEncoding( experiment_names=["experiment1"] * len(train_df), @@ -182,85 +299,115 @@ def test_synthetic_allele_refinement(): assert_allclose(pre_predictions, expected_pre_predictions) - predictor.fit( - peptides=train_df.peptide.values, - labels=train_df.hit.values, - allele_encoding=allele_encoding - ) + motifs_history = [] + random_peptides_encodable = make_random_peptides(10000, [9]) - predictions = predictor.predict( - peptides=train_df.peptide.values, - allele_encoding=allele_encoding, - ) - train_df["max_prediction"] = predictions.max(1) - train_df["predicted_allele"] = pandas.Series(alleles).loc[ - predictions.argmax(1).flatten() - ].values + def update_motifs(): + for allele in alleles: + motif = make_motif(allele, random_peptides_encodable) + motifs_history.append((allele, motif)) - print(predictions) + metric_rows = [] - auc = roc_auc_score(train_df.hit.values, train_df.max_prediction.values) - print("AUC", auc) + def progress(): + predictions = predictor.predict(peptides=train_df.peptide.values, + allele_encoding=allele_encoding, ) - import ipdb ; ipdb.set_trace() + train_df["max_prediction"] = predictions.max(1) + train_df["predicted_allele"] = pandas.Series(alleles).loc[ + predictions.argmax(1).flatten()].values - return predictions + print(predictions) + mean_predictions_for_hit = train_df.loc[ + train_df.hit == 1.0 + ].max_prediction.mean() + mean_predictions_for_decoy = train_df.loc[ + train_df.hit == 0.0 + ].max_prediction.mean() + correct_allele_fraction = ( + train_df.loc[train_df.hit == 1.0].predicted_allele == + train_df.loc[train_df.hit == 1.0].true_allele + ).mean() + auc = roc_auc_score(train_df.hit.values, train_df.max_prediction.values) + print("Mean prediction for hit", mean_predictions_for_hit) + print("Mean prediction for decoy", mean_predictions_for_decoy) + print("Correct predicted allele fraction", correct_allele_fraction) + print("AUC", auc) -""" -def test_simple_synethetic( - num_peptide_per_allele_and_length=100, lengths=[8,9,10,11]): - alleles = [ - "HLA-A*02:01", "HLA-B*52:01", "HLA-C*07:01", - "HLA-A*03:01", "HLA-B*57:02", "HLA-C*03:01", - ] - cutoff = PAN_ALLELE_MOTIFS_DF.cutoff_fraction.min() - peptides_and_alleles = [] - for allele in alleles: - sub_df = PAN_ALLELE_MOTIFS_DF.loc[ - (PAN_ALLELE_MOTIFS_DF.allele == allele) & - (PAN_ALLELE_MOTIFS_DF.cutoff_fraction == cutoff) + metric_rows.append(( + mean_predictions_for_hit, + mean_predictions_for_decoy, + correct_allele_fraction, + auc, + )) + + update_motifs() + + return (predictions, auc) + + print("Pre fitting:") + progress() + update_motifs() + print("Fitting...") + + predictor.fit( + peptides=train_df.peptide.values, + labels=train_df.hit.values, + allele_encoding=allele_encoding, + progress_callback=progress, + ) + + (predictions, final_auc) = progress() + print("Final AUC", final_auc) + + update_motifs() + + motifs = pandas.DataFrame( + motifs_history, + columns=[ + "allele", + "motif", ] - assert len(sub_df) > 0, allele - for length in lengths: - pssm = sub_df.loc[ - sub_df.length == length - ].set_index("position")[COMMON_AMINO_ACIDS] - peptides = sample_peptides_from_pssm(pssm, num_peptide_per_allele_and_length) - for peptide in peptides: - peptides_and_alleles.append((peptide, allele)) - - hits_df = pandas.DataFrame( - peptides_and_alleles, - columns=["peptide", "allele"] ) - hits_df["hit"] = 1 - decoys = hits_df.copy() - decoys["peptide"] = decoys.peptide.map(scramble_peptide) - decoys["hit"] = 0.0 + metrics = pandas.DataFrame( + metric_rows, + columns=[ + "mean_predictions_for_hit", + "mean_predictions_for_decoy", + "correct_allele_fraction", + "auc" + ]) - train_df = pandas.concat([hits_df, decoys], ignore_index=True) + #import ipdb ; ipdb.set_trace() + + return (predictor, predictions, metrics, motifs) - return train_df - return - pass -""" parser = argparse.ArgumentParser(usage=__doc__) parser.add_argument( - "--alleles", - nargs="+", + "--out-metrics-csv", + default=None, + help="Metrics output") +parser.add_argument( + "--out-motifs-pickle", default=None, - help="Which alleles to test") + help="Metrics output") + if __name__ == '__main__': # If run directly from python, leave the user in a shell to explore results. setup() args = parser.parse_args(sys.argv[1:]) - result = test_synthetic_allele_refinement() + (predictor, predictions, metrics, motifs) = test_synthetic_allele_refinement() + + if args.out_metrics_csv: + metrics.to_csv(args.out_metrics_csv) + if args.out_motifs_pickle: + motifs.to_pickle(args.out_motifs_pickle) # Leave in ipython import ipdb # pylint: disable=import-error -- GitLab