Commit 1a257560 authored by Alex Rubinsteyn

adding keras layers for dealing with masks in variable length sequence inputs

parent 4e3ccd35
@@ -18,7 +18,7 @@ exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
 # Log some environment info
 date
 pip freeze
-git rev-parse HEAD
+# git rev-parse HEAD
 git status
 cd $SCRATCH_DIR/$DOWNLOAD_NAME
...
@@ -25,9 +25,9 @@ exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2)
 # Log some environment info
 date
-pip freeze
-git rev-parse HEAD
-git status
+# pip freeze
+# git rev-parse HEAD
+# git status
 cd $SCRATCH_DIR/$DOWNLOAD_NAME
...
@@ -20,12 +20,12 @@ from __future__ import (
 import collections
 import logging
-import pepdata
+from pepdata.reduced_alphabet import make_alphabet_transformer, gbmr4
 from .train import impute_and_select_allele, AlleleSpecificTrainTestFold
 from ..parallelism import get_default_backend
-gbmr4_transformer = pepdata.reduced_alphabet.make_alphabet_transformer("gbmr4")
+gbmr4_transformer = make_alphabet_transformer(gbmr4)
 def default_projector(peptide):
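The rewritten import above builds the projector directly from make_alphabet_transformer and the gbmr4 grouping. As a rough sketch of what such a reduced-alphabet projector does (the grouping, helper names, and peptides below are illustrative assumptions, not the actual pepdata definitions), each amino acid is mapped to a group label so that biochemically similar peptides collapse onto the same projected string, which is how similar_peptides can treat them as approximately overlapping:

# Hypothetical sketch of a reduced-alphabet projection; the grouping below
# is illustrative and not the exact gbmr4 definition from pepdata.
example_alphabet = {
    "A": 0, "G": 0,          # small residues collapse to one group
    "D": 1, "E": 1,          # acidic residues
    "K": 2, "R": 2,          # basic residues
    "L": 3, "I": 3, "V": 3,  # hydrophobic residues
}

def make_example_transformer(alphabet):
    # return a function mapping a peptide string to its projected string
    def transform(peptide):
        return "".join(str(alphabet.get(aa, 9)) for aa in peptide)
    return transform

transformer = make_example_transformer(example_alphabet)
# two peptides differing only within a group project to the same string
assert transformer("ADKL") == transformer("GEKI")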
@@ -70,7 +70,8 @@ def similar_peptides(set1, set2, projector=default_projector):
     Returns
     ----------
-    string list
+    string list of peptides which approximately overlap between the two input
+    sets.
     """
     result = collections.defaultdict(lambda: ([], []))
     for (index, peptides) in enumerate([set1, set2]):
@@ -155,6 +156,10 @@ def cross_validation_folds(
         )
         if peptides_to_remove:
+            # TODO: instead of dropping peptides, downweight the
+            # peptides which get grouped together
+            # For example, we could replace this code with
+            # test_peptides, test_peptide_weights = ....
             test_split = full_test_split.drop_allele_peptide_lists(
                 [allele] * len(peptides_to_remove),
                 peptides_to_remove)
...
from keras.layers import Layer
import keras.layers
import keras.backend as K


class DropMask(Layer):
    """
    Passes its input through unchanged but removes its mask, so that
    downstream layers which do not support masking can be used.
    """
    supports_masking = True

    def call(self, x, mask):
        return x

    def compute_mask(self, x, mask):
        return None


class MaskedGlobalAveragePooling1D(keras.layers.pooling._GlobalPooling1D):
    """
    Takes an embedded representation of a sentence with dims
    (n_samples, max_length, n_dims)
    where each sample is masked to allow for variable-length inputs.
    Returns a tensor of shape (n_samples, n_dims) after averaging across
    time in a mask-sensitive fashion.
    """
    supports_masking = True

    def call(self, x, mask):
        # cast the (n_samples, max_length) mask to float so it can be
        # multiplied against the embeddings and summed
        mask = K.cast(mask, "float32")
        expanded_mask = K.expand_dims(mask)
        # zero embedded vectors which come from masked characters
        x_masked = x * expanded_mask
        # how many non-masked characters are in each row?
        mask_counts = K.sum(mask, axis=-1)
        # add up the vector representations along the time dimension;
        # the result should have dimension (n_samples, n_embedding_dims)
        x_sums = K.sum(x_masked, axis=1)
        # give the counts an extra dimension so they broadcast properly in
        # an elementwise division
        expanded_counts = K.expand_dims(mask_counts)
        return x_sums / expanded_counts

    def compute_mask(self, x, mask):
        return None


class MaskedGlobalMaxPooling1D(keras.layers.pooling._GlobalPooling1D):
    """
    Takes an embedded representation of a sentence with dims
    (n_samples, max_length, n_dims)
    where each sample is masked to allow for variable-length inputs.
    Returns a tensor of shape (n_samples, n_dims) after taking the maximum
    across time in a mask-sensitive fashion.
    """
    supports_masking = True

    def call(self, x, mask):
        expanded_mask = K.expand_dims(K.cast(mask, "float32"))
        # zero embedded vectors which come from masked characters
        x_masked = x * expanded_mask
        # one flaw here is that we're returning max(0, max(x[:, i])) instead of
        # max(x[:, i]), since masked positions are zeroed rather than ignored
        return K.max(x_masked, axis=1)

    def compute_mask(self, x, mask):
        return None


class MaskedSlice(Layer):
    """
    Takes an embedded representation of a sentence with dims
    (n_samples, max_length, n_dims)
    where each sample is masked to allow for variable-length inputs.
    Returns a tensor of shape (n_samples, time_end - time_start, n_dims)
    containing the slice of timesteps [time_start, time_end) from each
    sentence.
    """
    supports_masking = True

    def __init__(
            self,
            time_start,
            time_end,
            *args,
            **kwargs):
        assert time_start >= 0
        assert time_end >= 0
        self.time_start = time_start
        self.time_end = time_end
        super(MaskedSlice, self).__init__(*args, **kwargs)

    def call(self, x, mask):
        return x[:, self.time_start:self.time_end, :]

    def compute_mask(self, x, mask):
        # the mask has shape (n_samples, max_length), so slice it along
        # the time dimension only
        return mask[:, self.time_start:self.time_end]

    def get_output_shape_for(self, input_shape):
        assert len(input_shape) == 3
        # the time slice excludes time_end, matching call() above
        output_shape = (
            input_shape[0],
            self.time_end - self.time_start,
            input_shape[2])
        return output_shape
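As a usage sketch for the layers above (assuming the Keras 1.x functional API; the layer sizes, variable names, and loss are illustrative, not part of this commit), variable-length integer-encoded sequences can be embedded with mask_zero=True and then pooled in a mask-sensitive way; MaskedGlobalAveragePooling1D consumes the mask and returns an unmasked (n_samples, n_dims) tensor that ordinary Dense layers can use:

# Minimal sketch under the assumptions stated above; MaskedGlobalAveragePooling1D
# is the class defined earlier in this file.
from keras.layers import Input, Embedding, Dense
from keras.models import Model

max_length = 20   # assumed maximum padded sequence length
n_symbols = 21    # assumed alphabet size (20 amino acids + padding index 0)

peptide_input = Input(shape=(max_length,), dtype="int32")
# index 0 is reserved for padding; mask_zero=True marks those positions
embedded = Embedding(
    input_dim=n_symbols,
    output_dim=32,
    mask_zero=True)(peptide_input)
# average only over the unmasked timesteps; the mask is dropped afterwards
pooled = MaskedGlobalAveragePooling1D()(embedded)
output = Dense(1, activation="sigmoid")(pooled)
model = Model(input=peptide_input, output=output)
model.compile(optimizer="adam", loss="binary_crossentropy")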
+from concurrent import futures
 import logging
-from six import PY2
-from concurrent import futures
 DEFAULT_BACKEND = None
...