Commit 1a257560 authored by Alex Rubinsteyn

adding keras layers for dealing with masks in variable length sequence inputs

parent 4e3ccd35
@@ -18,7 +18,7 @@ exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
 # Log some environment info
 date
 pip freeze
-git rev-parse HEAD
+# git rev-parse HEAD
 git status
 cd $SCRATCH_DIR/$DOWNLOAD_NAME
...
@@ -25,9 +25,9 @@ exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
 # Log some environment info
 date
-pip freeze
-git rev-parse HEAD
-git status
+# pip freeze
+# git rev-parse HEAD
+# git status
 cd $SCRATCH_DIR/$DOWNLOAD_NAME
...
@@ -20,12 +20,12 @@ from __future__ import (
 import collections
 import logging
-import pepdata
+from pepdata.reduced_alphabet import make_alphabet_transformer, gbmr4
 from .train import impute_and_select_allele, AlleleSpecificTrainTestFold
 from ..parallelism import get_default_backend
-gbmr4_transformer = pepdata.reduced_alphabet.make_alphabet_transformer("gbmr4")
+gbmr4_transformer = make_alphabet_transformer(gbmr4)
 def default_projector(peptide):
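The rewritten import above builds the projector directly from make_alphabet_transformer and the gbmr4 grouping. As a rough sketch of what such a reduced-alphabet projector does (the grouping, helper names, and peptides below are illustrative assumptions, not the actual pepdata definitions), each amino acid is mapped to a group label so that biochemically similar peptides collapse onto the same projected string, which is how similar_peptides can treat them as approximately overlapping:

# Hypothetical sketch of a reduced-alphabet projection; the grouping below
# is illustrative and not the exact gbmr4 definition from pepdata.
example_alphabet = {
    "A": 0, "G": 0,          # small residues collapse to one group
    "D": 1, "E": 1,          # acidic residues
    "K": 2, "R": 2,          # basic residues
    "L": 3, "I": 3, "V": 3,  # hydrophobic residues
}

def make_example_transformer(alphabet):
    # return a function mapping a peptide string to its projected string
    def transform(peptide):
        return "".join(str(alphabet.get(aa, 9)) for aa in peptide)
    return transform

transformer = make_example_transformer(example_alphabet)
# two peptides differing only within a group project to the same string
assert transformer("ADKL") == transformer("GEKI")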
@@ -70,7 +70,8 @@ def similar_peptides(set1, set2, projector=default_projector):
     Returns
     ----------
-    string list
+    string list of peptides which approximately overlap between the two input
+    sets.
     """
     result = collections.defaultdict(lambda: ([], []))
     for (index, peptides) in enumerate([set1, set2]):
@@ -155,6 +156,10 @@ def cross_validation_folds(
         )
         if peptides_to_remove:
+            # TODO: instead of dropping peptides, downweight the
+            # peptides which get grouped together
+            # For example, we could replace this code with
+            # test_peptides, test_peptide_weights = ....
             test_split = full_test_split.drop_allele_peptide_lists(
                 [allele] * len(peptides_to_remove),
                 peptides_to_remove)
...
from keras.layers import Layer
import keras.layers
import keras.backend as K


class DropMask(Layer):
    """
    Passes its input through unchanged but removes its mask, so that
    downstream layers which do not support masking can be used.
    """
    supports_masking = True

    def call(self, x, mask):
        return x

    def compute_mask(self, x, mask):
        return None


class MaskedGlobalAveragePooling1D(keras.layers.pooling._GlobalPooling1D):
    """
    Takes an embedded representation of a sentence with dims
    (n_samples, max_length, n_dims)
    where each sample is masked to allow for variable-length inputs.
    Returns a tensor of shape (n_samples, n_dims) after averaging across
    time in a mask-sensitive fashion.
    """
    supports_masking = True

    def call(self, x, mask):
        # cast the (n_samples, max_length) mask to float so it can be
        # multiplied against the embeddings and summed
        mask = K.cast(mask, "float32")
        expanded_mask = K.expand_dims(mask)
        # zero embedded vectors which come from masked characters
        x_masked = x * expanded_mask
        # how many non-masked characters are in each row?
        mask_counts = K.sum(mask, axis=-1)
        # add up the vector representations along the time dimension;
        # the result should have dimension (n_samples, n_embedding_dims)
        x_sums = K.sum(x_masked, axis=1)
        # give the counts an extra dimension so they broadcast properly in
        # an elementwise division
        expanded_counts = K.expand_dims(mask_counts)
        return x_sums / expanded_counts

    def compute_mask(self, x, mask):
        return None


class MaskedGlobalMaxPooling1D(keras.layers.pooling._GlobalPooling1D):
    """
    Takes an embedded representation of a sentence with dims
    (n_samples, max_length, n_dims)
    where each sample is masked to allow for variable-length inputs.
    Returns a tensor of shape (n_samples, n_dims) after taking the maximum
    across time in a mask-sensitive fashion.
    """
    supports_masking = True

    def call(self, x, mask):
        expanded_mask = K.expand_dims(K.cast(mask, "float32"))
        # zero embedded vectors which come from masked characters
        x_masked = x * expanded_mask
        # one flaw here is that we're returning max(0, max(x[:, i])) instead of
        # max(x[:, i]), since masked positions are zeroed rather than ignored
        return K.max(x_masked, axis=1)

    def compute_mask(self, x, mask):
        return None


class MaskedSlice(Layer):
    """
    Takes an embedded representation of a sentence with dims
    (n_samples, max_length, n_dims)
    where each sample is masked to allow for variable-length inputs.
    Returns a tensor of shape (n_samples, time_end - time_start, n_dims)
    containing the slice of timesteps [time_start, time_end) from each
    sentence.
    """
    supports_masking = True

    def __init__(
            self,
            time_start,
            time_end,
            *args,
            **kwargs):
        assert time_start >= 0
        assert time_end >= 0
        self.time_start = time_start
        self.time_end = time_end
        super(MaskedSlice, self).__init__(*args, **kwargs)

    def call(self, x, mask):
        return x[:, self.time_start:self.time_end, :]

    def compute_mask(self, x, mask):
        # the mask has shape (n_samples, max_length), so slice it along
        # the time dimension only
        return mask[:, self.time_start:self.time_end]

    def get_output_shape_for(self, input_shape):
        assert len(input_shape) == 3
        # the time slice excludes time_end, matching call() above
        output_shape = (
            input_shape[0],
            self.time_end - self.time_start,
            input_shape[2])
        return output_shape
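As a usage sketch for the layers above (assuming the Keras 1.x functional API; the layer sizes, variable names, and loss are illustrative, not part of this commit), variable-length integer-encoded sequences can be embedded with mask_zero=True and then pooled in a mask-sensitive way; MaskedGlobalAveragePooling1D consumes the mask and returns an unmasked (n_samples, n_dims) tensor that ordinary Dense layers can use:

# Minimal sketch under the assumptions stated above; MaskedGlobalAveragePooling1D
# is the class defined earlier in this file.
from keras.layers import Input, Embedding, Dense
from keras.models import Model

max_length = 20   # assumed maximum padded sequence length
n_symbols = 21    # assumed alphabet size (20 amino acids + padding index 0)

peptide_input = Input(shape=(max_length,), dtype="int32")
# index 0 is reserved for padding; mask_zero=True marks those positions
embedded = Embedding(
    input_dim=n_symbols,
    output_dim=32,
    mask_zero=True)(peptide_input)
# average only over the unmasked timesteps; the mask is dropped afterwards
pooled = MaskedGlobalAveragePooling1D()(embedded)
output = Dense(1, activation="sigmoid")(pooled)
model = Model(input=peptide_input, output=output)
model.compile(optimizer="adam", loss="binary_crossentropy")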
+from concurrent import futures
 import logging
-from six import PY2
-from concurrent import futures
 DEFAULT_BACKEND = None
...