Skip to content
Snippets Groups Projects
Commit 3d5ba47e authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

fixes

parent 024618ec
No related branches found
No related tags found
No related merge requests found
...@@ -89,7 +89,7 @@ class Class1PresentationNeuralNetwork(object): ...@@ -89,7 +89,7 @@ class Class1PresentationNeuralNetwork(object):
self.fit_info = [] self.fit_info = []
self.allele_representation_hash = None self.allele_representation_hash = None
def lift_from_class1_neural_network(self, class1_neural_network): def load_from_class1_neural_network(self, class1_neural_network):
import keras.backend as K import keras.backend as K
from keras.layers import ( from keras.layers import (
Input, Input,
...@@ -101,8 +101,13 @@ class Class1PresentationNeuralNetwork(object): ...@@ -101,8 +101,13 @@ class Class1PresentationNeuralNetwork(object):
Activation, Activation,
Lambda, Lambda,
Add, Add,
Multiply,
Embedding) Embedding)
from keras.models import Model from keras.models import Model
from keras.initializers import Zeros
if isinstance(class1_neural_network, Class1NeuralNetwork):
class1_neural_network = class1_neural_network.network()
peptide_shape = tuple( peptide_shape = tuple(
int(x) for x in K.int_shape(class1_neural_network.inputs[0])[1:]) int(x) for x in K.int_shape(class1_neural_network.inputs[0])[1:])
...@@ -173,8 +178,21 @@ class Class1PresentationNeuralNetwork(object): ...@@ -173,8 +178,21 @@ class Class1PresentationNeuralNetwork(object):
node = layer(input_nodes) node = layer(input_nodes)
layer_name_to_new_node[layer.name] = node layer_name_to_new_node[layer.name] = node
affinity_predictor_matrix_output = node pre_mask_affinity_predictor_matrix_output = node
# Apply allele mask: zero out all outputs corresponding to alleles
# with the special index 0.
affinity_predictor_matrix_output = Multiply(
name="affinity_matrix_output")([
Lambda(
lambda x: K.cast(
K.expand_dims(K.not_equal(x, 0.0)),
"float32"))(input_alleles),
pre_mask_affinity_predictor_matrix_output
])
# First allele (i.e. the first column of the alleles matrix) is given
# its own output. This is used for the affinity prediction loss.
affinity_predictor_output = Lambda( affinity_predictor_output = Lambda(
lambda x: x[:, 0], name="affinity_output")( lambda x: x[:, 0], name="affinity_output")(
affinity_predictor_matrix_output) affinity_predictor_matrix_output)
...@@ -195,11 +213,17 @@ class Class1PresentationNeuralNetwork(object): ...@@ -195,11 +213,17 @@ class Class1PresentationNeuralNetwork(object):
[node, auxiliary_input], name="affinities_with_auxiliary") [node, auxiliary_input], name="affinities_with_auxiliary")
layer = Dense(8, activation="tanh") layer = Dense(8, activation="tanh")
lifted = TimeDistributed(layer, name="presentation_hidden1") lifted = TimeDistributed(layer, name="presentation_adjustment_hidden1")
node = lifted(node) node = lifted(node)
layer = Dense(1, activation="tanh") # By initializing to zero we ensure that before training the
lifted = TimeDistributed(layer, name="presentation_hidden2") # presentation output is the same as the affinity output.
layer = Dense(
1,
activation="tanh",
kernel_initializer=Zeros(),
bias_initializer=Zeros())
lifted = TimeDistributed(layer, name="presentation_adjustment")
presentation_adjustment = lifted(node) presentation_adjustment = lifted(node)
def logit(x): def logit(x):
...@@ -210,8 +234,19 @@ class Class1PresentationNeuralNetwork(object): ...@@ -210,8 +234,19 @@ class Class1PresentationNeuralNetwork(object):
Lambda(logit)(affinity_predictor_matrix_output), Lambda(logit)(affinity_predictor_matrix_output),
presentation_adjustment, presentation_adjustment,
]) ])
presentation_output = Activation("sigmoid", name="presentation_output")( pre_mask_presentation_output = Activation(
presentation_output_pre_sigmoid) "sigmoid", name="unmasked_presentation_output")(
presentation_output_pre_sigmoid)
# Apply allele mask: zero out all outputs corresponding to alleles
# with the special index 0.
presentation_output = Multiply(name="presentation_output")([
Lambda(
lambda x: K.cast(
K.expand_dims(K.not_equal(x, 0.0)),
"float32"))(input_alleles),
pre_mask_presentation_output
])
self.network = Model( self.network = Model(
inputs=[ inputs=[
...@@ -578,7 +613,7 @@ class Class1PresentationNeuralNetwork(object): ...@@ -578,7 +613,7 @@ class Class1PresentationNeuralNetwork(object):
batch_size=DEFAULT_PREDICT_BATCH_SIZE): batch_size=DEFAULT_PREDICT_BATCH_SIZE):
peptides = EncodableSequences.create(peptides) peptides = EncodableSequences.create(peptides)
assert isinstance(AlleleEncoding, MultipleAlleleEncoding) assert isinstance(allele_encoding, MultipleAlleleEncoding)
(allele_encoding_input, allele_representations) = ( (allele_encoding_input, allele_representations) = (
self.allele_encoding_to_network_input(allele_encoding.compact())) self.allele_encoding_to_network_input(allele_encoding.compact()))
......
...@@ -75,7 +75,7 @@ class Class1PresentationPredictor(object): ...@@ -75,7 +75,7 @@ class Class1PresentationPredictor(object):
def max_alleles(self): def max_alleles(self):
max_alleles = self.models[0].hyperparameters['max_alleles'] max_alleles = self.models[0].hyperparameters['max_alleles']
assert all( assert all(
n.hyperparameters['max_alleles'] == self.max_alleles n.hyperparameters['max_alleles'] == max_alleles
for n in self.models) for n in self.models)
return max_alleles return max_alleles
...@@ -170,11 +170,12 @@ class Class1PresentationPredictor(object): ...@@ -170,11 +170,12 @@ class Class1PresentationPredictor(object):
ensemble_scores = numpy.mean(score_array, axis=0) ensemble_scores = numpy.mean(score_array, axis=0)
ensemble_affinity = numpy.mean(affinity_array, axis=0) ensemble_affinity = numpy.mean(affinity_array, axis=0)
top_allele_index = numpy.argmax(ensemble_scores, axis=-1) top_allele_index = numpy.argmax(ensemble_scores, axis=-1)
top_score = ensemble_scores[top_allele_index] top_allele_flat_indices = (
top_affinity = ensemble_affinity[top_allele_index] numpy.arange(len(peptides)) * self.max_alleles + top_allele_index)
top_score = ensemble_scores.flatten()[top_allele_flat_indices]
top_affinity = ensemble_affinity.flatten()[top_allele_flat_indices]
result_df = pandas.DataFrame({"peptide": peptides.sequences}) result_df = pandas.DataFrame({"peptide": peptides.sequences})
result_df["allele"] = alleles.alleles[top_allele_index] result_df["allele"] = alleles.alleles.flatten()[top_allele_flat_indices]
result_df["score"] = top_score result_df["score"] = top_score
result_df["affinity"] = to_ic50(top_affinity) result_df["affinity"] = to_ic50(top_affinity)
......
...@@ -24,16 +24,17 @@ import sys ...@@ -24,16 +24,17 @@ import sys
import copy import copy
import os import os
from numpy.testing import assert_, assert_equal, assert_allclose from numpy.testing import assert_, assert_equal, assert_allclose, assert_array_equal
from nose.tools import assert_greater, assert_less from nose.tools import assert_greater, assert_less
import numpy import numpy
from random import shuffle from random import shuffle
from sklearn.metrics import roc_auc_score from sklearn.metrics import roc_auc_score
from mhcflurry import Class1AffinityPredictor, Class1NeuralNetwork from mhcflurry import Class1AffinityPredictor
from mhcflurry.allele_encoding import MultipleAlleleEncoding from mhcflurry.allele_encoding import MultipleAlleleEncoding
from mhcflurry.class1_presentation_neural_network import Class1PresentationNeuralNetwork from mhcflurry.class1_presentation_neural_network import Class1PresentationNeuralNetwork
from mhcflurry.class1_presentation_predictor import Class1PresentationPredictor
from mhcflurry.encodable_sequences import EncodableSequences from mhcflurry.encodable_sequences import EncodableSequences
from mhcflurry.downloads import get_path from mhcflurry.downloads import get_path
from mhcflurry.regression_target import from_ic50 from mhcflurry.regression_target import from_ic50
...@@ -41,6 +42,7 @@ from mhcflurry.common import random_peptides, positional_frequency_matrix ...@@ -41,6 +42,7 @@ from mhcflurry.common import random_peptides, positional_frequency_matrix
from mhcflurry.testing_utils import cleanup, startup from mhcflurry.testing_utils import cleanup, startup
from mhcflurry.amino_acid import COMMON_AMINO_ACIDS from mhcflurry.amino_acid import COMMON_AMINO_ACIDS
from mhcflurry.custom_loss import MultiallelicMassSpecLoss from mhcflurry.custom_loss import MultiallelicMassSpecLoss
from mhcflurry.regression_target import to_ic50
COMMON_AMINO_ACIDS = sorted(COMMON_AMINO_ACIDS) COMMON_AMINO_ACIDS = sorted(COMMON_AMINO_ACIDS)
...@@ -87,6 +89,40 @@ def teardown(): ...@@ -87,6 +89,40 @@ def teardown():
cleanup() cleanup()
def test_basic():
affinity_predictor = PAN_ALLELE_PREDICTOR_NO_MASS_SPEC
models = []
for affinity_network in affinity_predictor.class1_pan_allele_models:
presentation_network = Class1PresentationNeuralNetwork()
presentation_network.load_from_class1_neural_network(affinity_network)
models.append(presentation_network)
predictor = Class1PresentationPredictor(
models=models,
allele_to_sequence=affinity_predictor.allele_to_sequence)
alleles = ["HLA-A*02:01", "HLA-B*27:01", "HLA-C*07:02"]
df = pandas.DataFrame(index=numpy.unique(random_peptides(1000, length=9)))
for allele in alleles:
df[allele] = affinity_predictor.predict(
df.index.values, allele=allele)
df["tightest_affinity"] = df.min(1)
df["tightest_allele"] = df.idxmin(1)
df2 = predictor.predict_to_dataframe(
peptides=df.index.values,
alleles=alleles)
merged_df = pandas.merge(
df, df2.set_index("peptide"), left_index=True, right_index=True)
assert_array_equal(merged_df["tightest_affinity"], merged_df["affinity"])
assert_array_equal(merged_df["tightest_affinity"], to_ic50(merged_df["score"]))
assert_array_equal(merged_df["tightest_allele"], merged_df["allele"])
# TODO: test fitting, saving, and loading
def scramble_peptide(peptide): def scramble_peptide(peptide):
lst = list(peptide) lst = list(peptide)
shuffle(lst) shuffle(lst)
...@@ -118,7 +154,7 @@ def evaluate_loss(loss, y_true, y_pred): ...@@ -118,7 +154,7 @@ def evaluate_loss(loss, y_true, y_pred):
raise ValueError("Unsupported backend: %s" % K.backend()) raise ValueError("Unsupported backend: %s" % K.backend())
def test_loss(): def Xtest_loss():
for delta in [0.0, 0.3]: for delta in [0.0, 0.3]:
print("delta", delta) print("delta", delta)
# Hit labels # Hit labels
...@@ -236,7 +272,7 @@ def make_motif(allele, peptides, frac=0.01): ...@@ -236,7 +272,7 @@ def make_motif(allele, peptides, frac=0.01):
return matrix return matrix
def test_real_data_multiallelic_refinement(max_epochs=10): def Xtest_real_data_multiallelic_refinement(max_epochs=10):
ms_df = pandas.read_csv( ms_df = pandas.read_csv(
get_path("data_mass_spec_annotated", "annotated_ms.csv.bz2")) get_path("data_mass_spec_annotated", "annotated_ms.csv.bz2"))
ms_df = ms_df.loc[ ms_df = ms_df.loc[
...@@ -349,7 +385,7 @@ def test_real_data_multiallelic_refinement(max_epochs=10): ...@@ -349,7 +385,7 @@ def test_real_data_multiallelic_refinement(max_epochs=10):
import ipdb ; ipdb.set_trace() import ipdb ; ipdb.set_trace()
def test_synthetic_allele_refinement_with_affinity_data(max_epochs=10): def Xtest_synthetic_allele_refinement_with_affinity_data(max_epochs=10):
refine_allele = "HLA-C*01:02" refine_allele = "HLA-C*01:02"
alleles = [ alleles = [
"HLA-A*02:01", "HLA-B*27:01", "HLA-C*07:01", "HLA-A*02:01", "HLA-B*27:01", "HLA-C*07:01",
...@@ -562,7 +598,7 @@ def test_synthetic_allele_refinement_with_affinity_data(max_epochs=10): ...@@ -562,7 +598,7 @@ def test_synthetic_allele_refinement_with_affinity_data(max_epochs=10):
def test_synthetic_allele_refinement(max_epochs=10): def Xtest_synthetic_allele_refinement(max_epochs=10):
refine_allele = "HLA-C*01:02" refine_allele = "HLA-C*01:02"
alleles = [ alleles = [
"HLA-A*02:01", "HLA-B*27:01", "HLA-C*07:01", "HLA-A*02:01", "HLA-B*27:01", "HLA-C*07:01",
...@@ -752,7 +788,7 @@ def test_synthetic_allele_refinement(max_epochs=10): ...@@ -752,7 +788,7 @@ def test_synthetic_allele_refinement(max_epochs=10):
return (predictor, predictions, metrics, motifs) return (predictor, predictions, metrics, motifs)
def test_batch_generator(sample_rate=0.1): def Xtest_batch_generator(sample_rate=0.1):
multi_train_df = pandas.read_csv( multi_train_df = pandas.read_csv(
data_path("multiallelic_ms.benchmark1.csv.bz2")) data_path("multiallelic_ms.benchmark1.csv.bz2"))
multi_train_df["label"] = multi_train_df.hit multi_train_df["label"] = multi_train_df.hit
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment