Skip to content
Snippets Groups Projects
Commit c9841e49 authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

refactoring cleavage predictor

parent 6bcd1886
Branches
Tags
No related merge requests found
......@@ -85,7 +85,7 @@ import pandas
from .hyperparameters import HyperparameterDefaults
from .class1_neural_network import DEFAULT_PREDICT_BATCH_SIZE
from .encodable_sequences import EncodableSequences
from .flanking_encoding import FlankingEncoding
class Class1CleavageNeuralNetwork(object):
......@@ -94,12 +94,11 @@ class Class1CleavageNeuralNetwork(object):
peptide_max_length=15,
n_flank_length=10,
c_flank_length=10,
vector_encoding_name="BLOSUM62",
flanking_averages=False,
convolutional_filters=16,
convolutional_kernel_size=8,
convolutional_activation="relu",
convolutional_kernel_l1_l2=[0.001, 0.001],
convolutional_activation="tanh",
convolutional_kernel_l1_l2=[0.0001, 0.0001],
dropout_rate=0.5,
post_convolutional_dense_layer_sizes=[],
)
......@@ -181,9 +180,7 @@ class Class1CleavageNeuralNetwork(object):
def fit(
self,
peptides,
n_flanks,
c_flanks,
sequences,
targets,
sample_weights=None,
shuffle_permutation=None,
......@@ -205,12 +202,7 @@ class Class1CleavageNeuralNetwork(object):
-------
"""
peptides = EncodableSequences.create(peptides)
n_flanks = EncodableSequences.create(n_flanks)
c_flanks = EncodableSequences.create(c_flanks)
x_dict = self.peptides_and_flanking_to_network_input(
peptides, n_flanks, c_flanks)
x_dict = self.network_input(sequences)
# Shuffle
if shuffle_permutation is None:
......@@ -301,7 +293,7 @@ class Class1CleavageNeuralNetwork(object):
progress_callback()
fit_info["time"] = time.time() - start
fit_info["num_points"] = len(peptides)
fit_info["num_points"] = len(sequences.dataframe)
self.fit_info.append(dict(fit_info))
if verbose:
......@@ -317,16 +309,23 @@ class Class1CleavageNeuralNetwork(object):
n_flanks,
c_flanks,
batch_size=DEFAULT_PREDICT_BATCH_SIZE):
sequences = FlankingEncoding(
peptides=peptides, n_flanks=n_flanks, c_flanks=c_flanks)
return self.predict_encoded(sequences=sequences, batch_size=batch_size)
def predict_encoded(
self,
sequences,
batch_size=DEFAULT_PREDICT_BATCH_SIZE):
"""
"""
x_list = self.peptides_and_flanking_to_network_input(
peptides, n_flanks, c_flanks)
x_dict = self.network_input(sequences)
raw_predictions = self.network().predict(
x_list, batch_size=batch_size)
predictions = numpy.array(raw_predictions, dtype="float64")[:,0]
x_dict, batch_size=batch_size)
predictions = numpy.squeeze(raw_predictions).astype("float64")
return predictions
def peptides_and_flanking_to_network_input(self, peptides, n_flanks, c_flanks):
def network_input(self, sequences):
"""
Encode peptides to the fixed-length encoding expected by the neural
network (which depends on the architecture).
......@@ -339,118 +338,25 @@ class Class1CleavageNeuralNetwork(object):
-------
numpy.array
"""
peptides = EncodableSequences.create(peptides)
peptides_df = pandas.DataFrame({
"peptide": peptides.sequences,
})
if len(peptides_df) > 0:
peptides_df["length"] = peptides_df.peptide.str.len()
else:
peptides_df["length"] = []
network_peptide_size = int(
self.hyperparameters['peptide_max_length'] +
numpy.ceil(
self.hyperparameters['convolutional_kernel_size'] / 2))
result = {}
result['peptide_right_pad'] = (
peptides.variable_length_to_fixed_length_vector_encoding(
vector_encoding_name=self.hyperparameters['vector_encoding_name'],
max_length=network_peptide_size,
alignment_method='right_pad'))
result['peptide_left_pad'] = (
peptides.variable_length_to_fixed_length_vector_encoding(
vector_encoding_name=self.hyperparameters['vector_encoding_name'],
max_length=network_peptide_size,
alignment_method='left_pad'))
flank_needed_to_fill_peptide = (
network_peptide_size - peptides_df.length.min())
if self.hyperparameters['n_flank_length'] > 0:
n_flanks = EncodableSequences.create(n_flanks)
n_flank_encoded = (
n_flanks.variable_length_to_fixed_length_vector_encoding(
vector_encoding_name=self.hyperparameters['vector_encoding_name'],
max_length=max(
self.hyperparameters['n_flank_length'],
flank_needed_to_fill_peptide),
alignment_method='left_pad',
trim=True,
allow_unsupported_amino_acids=True))
if n_flank_encoded.shape[1] == self.hyperparameters['n_flank_length']:
result['n_flank'] = n_flank_encoded
else:
result['n_flank'] = n_flank_encoded[
:,
(-self.hyperparameters['n_flank_length']):,
:
]
assert (
result['n_flank'].shape[1] ==
self.hyperparameters['n_flank_length'])
# Switch out X for N-terminal residues in peptide_left_pad.
for (length, sub_df) in peptides_df.groupby("length"):
num_to_set = network_peptide_size - length
result['peptide_left_pad'][
sub_df.index.values,
:num_to_set,
:
] = n_flank_encoded[
sub_df.index.values,
(-num_to_set):,
:
]
if self.hyperparameters['c_flank_length'] > 0:
c_flanks = EncodableSequences.create(c_flanks)
c_flank_encoded = (
c_flanks.variable_length_to_fixed_length_vector_encoding(
vector_encoding_name=self.hyperparameters['vector_encoding_name'],
max_length=max(
self.hyperparameters['n_flank_length'],
flank_needed_to_fill_peptide),
alignment_method='right_pad',
trim=True,
allow_unsupported_amino_acids=True))
if c_flank_encoded.shape[1] == self.hyperparameters['c_flank_length']:
result['c_flank'] = c_flank_encoded
else:
result['c_flank'] = c_flank_encoded[
:,
:self.hyperparameters['c_flank_length'],
:
]
assert (
result['c_flank'].shape[1] ==
self.hyperparameters['c_flank_length'])
# Switch out X for C-terminal residues in peptide_right_pad.
for (length, sub_df) in peptides_df.groupby("length"):
num_to_set = network_peptide_size - length
result['peptide_right_pad'][
sub_df.index.values,
length:,
:
] = result['c_flank'][
sub_df.index.values,
:num_to_set,
:
]
encoded = sequences.vector_encode(
self.hyperparameters['amino_acid_encoding'],
self.hyperparameters['peptide_max_length'],
n_flank_length=self.hyperparameters['n_flank_length'],
c_flank_length=self.hyperparameters['c_flank_length'],
allow_unsupported_amino_acids=True)
result = {
"sequence": encoded.array,
"peptide_length": encoded.peptide_lengths,
}
return result
def make_network(
self,
amino_acid_encoding,
peptide_max_length,
n_flank_length,
c_flank_length,
vector_encoding_name,
flanking_averages,
convolutional_filters,
convolutional_kernel_size,
......@@ -467,38 +373,136 @@ class Class1CleavageNeuralNetwork(object):
from keras.layers import Input
import keras.layers.pooling
import keras.initializers
from keras.layers.core import Dense, Flatten, Dropout
from keras.layers.merge import Concatenate
empty_x_dict = self.peptides_and_flanking_to_network_input(
peptides=[], n_flanks=[], c_flanks=[])
model_inputs = {}
model_inputs['peptide_right_pad'] = Input(
shape=empty_x_dict['peptide_right_pad'].shape[1:],
dtype='float32',
name='peptide_right_pad')
model_inputs['peptide_left_pad'] = Input(
shape=empty_x_dict['peptide_left_pad'].shape[1:],
dtype='float32',
name='peptide_left_pad')
empty_x_dict = self.network_input(FlankingEncoding([], [], []))
sequence_dims = empty_x_dict['sequence'].shape[1:]
if 'n_flank' in empty_x_dict:
model_inputs['n_flank'] = Input(
shape=empty_x_dict['n_flank'].shape[1:],
dtype='float32',
name='n_flank')
numpy.testing.assert_equal(
sequence_dims[0],
peptide_max_length + n_flank_length + c_flank_length)
if 'c_flank' in empty_x_dict:
model_inputs['c_flank'] = Input(
shape=empty_x_dict['c_flank'].shape[1:],
dtype='float32',
name='c_flank')
model_inputs['sequence'] = Input(
shape=sequence_dims,
dtype='float32',
name='sequence')
model_inputs['peptide_length'] = Input(
shape=(1,),
dtype='int32',
name='peptide_length')
current_layer = model_inputs['sequence']
current_layer = keras.layers.Conv1D(
filters=convolutional_filters,
kernel_size=convolutional_kernel_size,
kernel_regularizer=keras.regularizers.l1_l2(
*convolutional_kernel_l1_l2),
padding="same",
activation=convolutional_activation,
name="conv1")(current_layer)
if dropout_rate > 0:
current_layer = keras.layers.Dropout(
name="conv1_dropout",
rate=dropout_rate,
noise_shape=(
None, 1, int(current_layer.get_shape()[-1])))(
current_layer)
convolutional_result = current_layer
outputs_for_final_dense = []
for flank in ["n_flank", "c_flank"]:
current_layer = convolutional_result
for (i, size) in enumerate(
list(post_convolutional_dense_layer_sizes) + [1]):
current_layer = keras.layers.Conv1D(
name="%s_post_%d" % (flank, i),
filters=size,
kernel_size=1,
kernel_regularizer=keras.regularizers.l1_l2(
*convolutional_kernel_l1_l2),
activation=convolutional_activation)(current_layer)
single_output_result = current_layer
if flank == "n_flank":
def cleavage_extractor(x):
return x[:, n_flank_length]
single_output_at_cleavage_position = keras.layers.Lambda(
cleavage_extractor, name="%s_cleaved" % flank)(
single_output_result)
else:
assert flank == "c_flank"
def cleavage_extractor(lst):
import tensorflow as tf
(x, peptide_length) = lst
indexer = peptide_length + n_flank_length - 1
result = tf.squeeze(
tf.gather(x, indexer, batch_dims=1, axis=1),
-1)
return result
single_output_at_cleavage_position = keras.layers.Lambda(
cleavage_extractor, name="%s_cleaved" % flank)([
single_output_result,
model_inputs['peptide_length']
])
outputs_for_final_dense.append(single_output_at_cleavage_position)
"""
# Single output at cleavage position
single_output_at_cleavage_position = keras.layers.Lambda(
cleavage_position_extractor)(single_output_result)
outputs_for_final_dense.append(single_output_at_cleavage_position)
# Max of single-output at non-cleaved (peptide) positions.
non_cleaved_single_outputs = keras.layers.Lambda(
noncleaved_peptide_extractor, name="%s_noncleaved" % flank)(
single_output_result)
non_cleaved_pooled = keras.layers.pooling.GlobalMaxPooling1D(
name="%s_noncleaved_pooled" % flank)(non_cleaved_single_outputs)
# We flip it so that initializing the final dense layer weights to
# 1s is reasonable.
non_cleaved_pooled_flipped = keras.layers.Lambda(
lambda x: -x,
name="%s_noncleaved_pooled_flip" % flank)(non_cleaved_pooled)
outputs_for_final_dense.append(non_cleaved_pooled_flipped)
if include_flank and flanking_averages:
# Also include average pooled of flanking sequences
extracted_flank = keras.layers.Lambda(
flanking_extractor, name="%s_extracted" % flank)(
convolutional_result)
pooled_flank = keras.layers.pooling.GlobalAveragePooling1D(
name="%s_avg" % flank,
)(extracted_flank)
dense_flank = Dense(
1, activation="tanh", name="%s_avg_dense" % flank)(
pooled_flank)
outputs_for_final_dense.append(dense_flank)
include_flank = flank in model_inputs
if flank == "n_flank":
......@@ -587,6 +591,7 @@ class Class1CleavageNeuralNetwork(object):
1, activation="tanh", name="%s_avg_dense" % flank)(
pooled_flank)
outputs_for_final_dense.append(dense_flank)
"""
if len(outputs_for_final_dense) == 1:
(current_layer,) = outputs_for_final_dense
......
......@@ -17,7 +17,7 @@ import pandas
from .version import __version__
from .class1_neural_network import DEFAULT_PREDICT_BATCH_SIZE
from .encodable_sequences import EncodableSequences
from .flanking_encoding import FlankingEncoding
from .downloads import get_default_class1_cleavage_models_dir
from .class1_cleavage_neural_network import Class1CleavageNeuralNetwork
from .common import save_weights, load_weights, NumpyJSONEncoder
......@@ -119,6 +119,7 @@ class Class1CleavagePredictor(object):
n_flanks,
c_flanks,
batch_size=DEFAULT_PREDICT_BATCH_SIZE):
return self.predict_to_dataframe(
peptides=peptides,
n_flanks=n_flanks,
......@@ -132,34 +133,27 @@ class Class1CleavagePredictor(object):
c_flanks,
batch_size=DEFAULT_PREDICT_BATCH_SIZE):
for (name, value) in [
("peptides", peptides),
("n_flanks", n_flanks),
("c_flanks", c_flanks)]:
if isinstance(value, string_types):
raise TypeError(
"%s must be a list or array, not a string" % name)
sequences = FlankingEncoding(
peptides=peptides, n_flanks=n_flanks, c_flanks=c_flanks)
return self.predict_to_dataframe_encoded(
sequences=sequences, batch_size=batch_size)
peptides = EncodableSequences.create(peptides)
n_flanks = EncodableSequences.create(n_flanks)
c_flanks = EncodableSequences.create(c_flanks)
def predict_to_dataframe_encoded(
self, sequences, batch_size=DEFAULT_PREDICT_BATCH_SIZE):
score_array = []
for (i, network) in enumerate(self.models):
predictions = network.predict(
peptides=peptides,
n_flanks=n_flanks,
c_flanks=c_flanks,
batch_size=batch_size)
predictions = network.predict_encoded(
encoded, batch_size=batch_size)
score_array.append(predictions)
score_array = numpy.array(score_array)
result_df = pandas.DataFrame({
"peptide": peptides.sequences,
"n_flank": n_flanks.sequences,
"c_flank": c_flanks.sequences,
"peptide": encoded.dataframe.peptide,
"n_flank": encoded.dataframe.n_flank,
"c_flank": encoded.dataframe.c_flank,
"score": numpy.mean(score_array, axis=0),
})
return result_df
......
"""
Class for encoding variable-length flanking and peptides to
fixed-size numerical matrices
"""
from __future__ import (
print_function, division, absolute_import, )
from six import string_types
from collections import namedtuple
from .encodable_sequences import EncodingError, EncodableSequences
import numpy
import pandas
# Result of vector-encoding a batch: the encoded array plus the length of
# each peptide (needed downstream to locate the C-terminal cleavage site).
EncodingResult = namedtuple(
    "EncodingResult", ["array", "peptide_lengths"])


class FlankingEncoding(object):
    """
    Encode a batch of peptides together with their N- and C-terminal
    flanking sequences into fixed-size numerical matrices.

    The sequences are held in a pandas DataFrame; encodings are memoized
    per parameter set in ``encoding_cache`` so repeated calls to
    `vector_encode` with the same arguments are free.
    """
    # Placeholder residue used to pad flanks/peptides that are shorter
    # than the requested fixed lengths.
    unknown_character = "X"

    def __init__(self, peptides, n_flanks, c_flanks):
        """
        Parameters
        ----------
        peptides : list of string
        n_flanks : list of string
            N-terminal flanking sequence of each peptide.
        c_flanks : list of string
            C-terminal flanking sequence of each peptide.
        """
        self.dataframe = pandas.DataFrame({
            "peptide": peptides,
            "n_flank": n_flanks,
            "c_flank": c_flanks,
        }, dtype=str)
        self.encoding_cache = {}

    def __len__(self):
        """Number of (peptide, n_flank, c_flank) entries."""
        return len(self.dataframe)

    def vector_encode(
            self,
            vector_encoding_name,
            peptide_max_length,
            n_flank_length,
            c_flank_length,
            allow_unsupported_amino_acids=True):
        """
        Encode the variable-length sequences to a fixed-size matrix,
        caching the result.

        Amino acids are represented as specified by the
        vector_encoding_name argument.

        Parameters
        ----------
        vector_encoding_name : string
            How to represent amino acids. One of "BLOSUM62", "one-hot",
            etc. Full list of supported vector encodings is given by
            available_vector_encodings().
        peptide_max_length : int
            Maximum supported peptide length. Longer (or empty) peptides
            cause an EncodingError.
        n_flank_length : int
            Number of N-flank residues to include (0 to omit the N flank).
        c_flank_length : int
            Number of C-flank residues to include.
        allow_unsupported_amino_acids : bool
            If True, map unsupported residues to the "X" placeholder
            rather than raising.

        Returns
        -------
        EncodingResult namedtuple, where
        - array has shape (num sequences, n_flank_length +
          peptide_max_length + c_flank_length, m), m being the vector
          encoding length (usually 21);
        - peptide_lengths is a numpy array of the peptide lengths.
        """
        cache_key = (
            "vector_encode",
            vector_encoding_name,
            peptide_max_length,
            n_flank_length,
            c_flank_length,
            allow_unsupported_amino_acids)
        if cache_key not in self.encoding_cache:
            result = self.encode(
                vector_encoding_name=vector_encoding_name,
                df=self.dataframe,
                peptide_max_length=peptide_max_length,
                n_flank_length=n_flank_length,
                c_flank_length=c_flank_length,
                allow_unsupported_amino_acids=allow_unsupported_amino_acids)
            self.encoding_cache[cache_key] = result
        return self.encoding_cache[cache_key]

    @staticmethod
    def encode(
            vector_encoding_name,
            df,
            peptide_max_length,
            n_flank_length,
            c_flank_length,
            allow_unsupported_amino_acids=False):
        """
        Encode the peptides/flanks in `df` (uncached helper for
        `vector_encode`; see that method for parameter documentation).

        Raises
        ------
        EncodingError
            If any peptide is empty or longer than peptide_max_length.

        Returns
        -------
        EncodingResult namedtuple
        """
        error_df = df.loc[
            (df.peptide.str.len() > peptide_max_length) |
            (df.peptide.str.len() < 1)
        ]
        if len(error_df) > 0:
            raise EncodingError(
                "Sequence '%s' (length %d) unsupported. There are %d "
                "total peptides with this length." % (
                    error_df.iloc[0].peptide,
                    len(error_df.iloc[0].peptide),
                    len(error_df)))

        if n_flank_length > 0:
            # Left-pad with "X" to exactly n_flank_length, keeping the
            # residues adjacent to the peptide.
            n_flanks = df.n_flank.str.pad(
                n_flank_length,
                side="left",
                fillchar="X").str.slice(-n_flank_length).str.upper()
        else:
            # index=df.index keeps the string concatenation below aligned
            # when df has a non-default index (a bare Series would align
            # on position 0..n-1 and produce NaN entries).
            n_flanks = pandas.Series([""] * len(df), index=df.index)

        if c_flank_length > 0:
            # Right-pad with "X" to exactly c_flank_length.
            c_flanks = df.c_flank.str.pad(
                c_flank_length,
                side="right",
                fillchar="X").str.slice(0, c_flank_length).str.upper()
        else:
            c_flanks = pandas.Series([""] * len(df), index=df.index)

        peptides = df.peptide.str.upper()

        # Single fixed-layout string per entry: [n_flank][peptide][c_flank],
        # right-padded by the encoder up to the full network input width.
        concatenated = n_flanks + peptides + c_flanks

        encoder = EncodableSequences.create(concatenated.values)
        array = encoder.variable_length_to_fixed_length_vector_encoding(
            vector_encoding_name=vector_encoding_name,
            alignment_method="right_pad",
            max_length=n_flank_length + peptide_max_length + c_flank_length,
            allow_unsupported_amino_acids=allow_unsupported_amino_acids)

        result = EncodingResult(
            array, peptide_lengths=peptides.str.len().values)
        return result
......@@ -18,17 +18,35 @@ import pprint
from mhcflurry.class1_cleavage_neural_network import Class1CleavageNeuralNetwork
from mhcflurry.common import random_peptides
from mhcflurry.amino_acid import BLOSUM62_MATRIX
from mhcflurry.flanking_encoding import FlankingEncoding
from mhcflurry.testing_utils import cleanup, startup
teardown = cleanup
setup = startup
def Xtest_neural_network_input():
table = BLOSUM62_MATRIX.apply(
tuple).reset_index().set_index(0).to_dict()['index']
def decode_matrix(array):
    """
    Invert a BLOSUM62 vector encoding back to amino-acid strings.

    Expects an array of shape (num sequences, length, 21); each length-21
    row vector is looked up in the module-level `table` mapping to recover
    its residue. Returns a list of decoded strings, one per sequence.
    """
    num, length, dim = array.shape
    assert dim == 21
    return [
        "".join(table[tuple(vector)] for vector in encoded_sequence)
        for encoded_sequence in array
    ]
def test_neural_network_input():
model = Class1CleavageNeuralNetwork(
peptide_max_length=12,
n_flank_length=8,
c_flank_length=8)
c_flank_length=5)
tests = [
{
......@@ -38,25 +56,66 @@ def Xtest_neural_network_input():
"c_flank": "FGHKLCVNMQWE",
# Expected results
"peptide_left_pad": "",
"peptide_right_pad": "",
"c_flank": "",
"sequence": "TYIPSDFGSIINFEKLFGHKLXXXX",
},
{
# Input
"peptide": "QCV",
"n_flank": "QWERTYIPSDFG",
"c_flank": "FGHKLCVNMQWE",
# Expected results
"sequence": "TYIPSDFGQCVFGHKLXXXXXXXXX",
},
{
# Input
"peptide": "QCVSQCVSQCVS",
"n_flank": "QWE",
"c_flank": "MNV",
# Expected results
"sequence": "XXXXXQWEQCVSQCVSQCVSMNVXX",
},
{
# Input
"peptide": "QCVSQCVSQCVS",
"n_flank": "",
"c_flank": "MNV",
# Expected results
"sequence": "XXXXXXXXQCVSQCVSQCVSMNVXX",
},
{
# Input
"peptide": "QCVSQCVSQCVS",
"n_flank": "",
"c_flank": "",
}
# Expected results
"sequence": "XXXXXXXXQCVSQCVSQCVSXXXXX",
},
]
for (i, d) in enumerate(tests):
print("Test", i + 1)
pprint.pprint(d)
results = model.peptides_and_flanking_to_network_input(
encoding = FlankingEncoding(
peptides=[d['peptide']],
n_flanks=[d['n_flank']],
c_flanks=[d['n_flank']])
import ipdb ; ipdb.set_trace()
c_flanks=[d['c_flank']])
results = model.network_input(encoding)
(decoded,) = decode_matrix(results['sequence'])
numpy.testing.assert_equal(decoded, d['sequence'])
numpy.testing.assert_equal(results['peptide_length'], len(d['peptide']))
model.peptides_and_flanking_to_network_input()
# Test all at once
df = pandas.DataFrame(tests)
encoding = FlankingEncoding(df.peptide, df.n_flank, df.c_flank)
results = model.network_input(encoding)
df["decoded"] = decode_matrix(results['sequence'])
numpy.testing.assert_array_equal(df.decoded.values, df.sequence.values)
numpy.testing.assert_equal(
results['peptide_length'], df.peptide.str.len().values)
def Xtest_big():
......@@ -77,9 +136,29 @@ def Xtest_more():
post_convolutional_dense_layer_sizes=[8])
def train_basic_network(num, do_assertions=True, **hyperparameters):
def test_basic_indexing(num=10000, do_assertions=True, **hyperparameters):
    """
    Train small networks on synthetic "hit" definitions that depend only
    on the peptide's terminal residues: first residue alone, last residue
    alone, then both criteria combined.
    """
    def hit_on_first(n_flank, c_flank, peptide):
        return peptide[0] in "SIQVL"

    def hit_on_last(n_flank, c_flank, peptide):
        return peptide[-1] in "YIPASD"

    def hit_on_both(n_flank, c_flank, peptide):
        return peptide[0] in "SIQVL" and peptide[-1] in "YIPASD"

    shared_hypers = {
        "convolutional_kernel_size": 1,
        "flanking_averages": False,
    }

    # Order matters for reproducibility: single-criterion cases first.
    for predicate in (hit_on_first, hit_on_last, hit_on_both):
        train_basic_network(num=10000, is_hit=predicate, **shared_hypers)
def train_basic_network(num, do_assertions=True, is_hit=None, **hyperparameters):
use_hyperparameters = {
"max_epochs": 30,
"max_epochs": 100,
"peptide_max_length": 12,
"n_flank_length": 8,
"c_flank_length": 8,
......@@ -93,12 +172,13 @@ def train_basic_network(num, do_assertions=True, **hyperparameters):
"peptide": random_peptides(num / 2, 11) + random_peptides(num / 2, 8),
}).sample(frac=1.0)
n_cleavage_regex = "[AILQSV][SINFEKLH][MNPQYK]"
if is_hit is None:
n_cleavage_regex = "[AILQSV][SINFEKLH][MNPQYK]"
def is_hit(n_flank, c_flank, peptide):
if re.search(n_cleavage_regex, peptide):
return False # peptide is cleaved
return bool(re.match(n_cleavage_regex, n_flank[-1:] + peptide))
def is_hit(n_flank, c_flank, peptide):
if re.search(n_cleavage_regex, peptide):
return False # peptide is cleaved
return bool(re.match(n_cleavage_regex, n_flank[-1:] + peptide))
df["hit"] = [
is_hit(row.n_flank, row.c_flank, row.peptide)
......@@ -118,18 +198,24 @@ def train_basic_network(num, do_assertions=True, **hyperparameters):
network = Class1CleavageNeuralNetwork(**use_hyperparameters)
network.fit(
train_df.peptide.values,
train_df.n_flank.values,
train_df.c_flank.values,
train_df.hit.values,
sequences=FlankingEncoding(
peptides=train_df.peptide.values,
n_flanks=train_df.n_flank.values,
c_flanks=train_df.c_flank.values),
targets=train_df.hit.values,
verbose=0)
network.network().summary()
for df in [train_df, test_df]:
df["predictions"] = network.predict(
df.peptide.values,
df.n_flank.values,
df.c_flank.values)
print(df)
print(df.loc[df.hit])
train_auc = roc_auc_score(train_df.hit.values, train_df.predictions.values)
test_auc = roc_auc_score(test_df.hit.values, test_df.predictions.values)
......@@ -141,3 +227,4 @@ def train_basic_network(num, do_assertions=True, **hyperparameters):
assert_greater(test_auc, 0.85)
return network
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment