diff --git a/downloads-generation/data_kim2014/GENERATE.sh b/downloads-generation/data_kim2014/GENERATE.sh index 3ee0078cf12ed7963ef372f8bedd40ef25ba38f0..3ae9a6b70bdd76aba2b323ce7d1c168086d04235 100755 --- a/downloads-generation/data_kim2014/GENERATE.sh +++ b/downloads-generation/data_kim2014/GENERATE.sh @@ -18,7 +18,7 @@ exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2) # Log some environment info date pip freeze -git rev-parse HEAD +# git rev-parse HEAD git status cd $SCRATCH_DIR/$DOWNLOAD_NAME diff --git a/downloads-generation/models_class1_allele_specific_single/GENERATE.sh b/downloads-generation/models_class1_allele_specific_single/GENERATE.sh index f49fc99c20d62ee71e7502a1b0eee291a3384250..93463556a3f946d77522ab590d9b76bdf033fc76 100755 --- a/downloads-generation/models_class1_allele_specific_single/GENERATE.sh +++ b/downloads-generation/models_class1_allele_specific_single/GENERATE.sh @@ -25,9 +25,9 @@ exec 2> >(tee -ia "$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.txt" >&2) # Log some environment info date -pip freeze -git rev-parse HEAD -git status +# pip freeze +# git rev-parse HEAD +# git status cd $SCRATCH_DIR/$DOWNLOAD_NAME diff --git a/mhcflurry/__init__.py b/mhcflurry/__init__.py index dc67b8bc3bff77fee5d5d845de4f7535eb73a480..a2b1d664cabb8f417700ef252abf27c73d01353c 100644 --- a/mhcflurry/__init__.py +++ b/mhcflurry/__init__.py @@ -15,9 +15,10 @@ from .class1_allele_specific.class1_binding_predictor import ( Class1BindingPredictor) from .predict import predict -from .package_metadata import __version__ from . 
import parallelism +__version__ = "0.2.0" + __all__ = [ "Class1BindingPredictor", "predict", diff --git a/mhcflurry/class1_allele_specific/cross_validation.py b/mhcflurry/class1_allele_specific/cross_validation.py index 1bf0038fdde8f9ab2a6808693bc18d0e93fdba75..d2189377e0d9f28d4d608bda9186b9a6095b06e5 100644 --- a/mhcflurry/class1_allele_specific/cross_validation.py +++ b/mhcflurry/class1_allele_specific/cross_validation.py @@ -20,12 +20,12 @@ from __future__ import ( import collections import logging -import pepdata +from pepdata.reduced_alphabet import make_alphabet_transformer, gbmr4 from .train import impute_and_select_allele, AlleleSpecificTrainTestFold from ..parallelism import get_default_backend -gbmr4_transformer = pepdata.reduced_alphabet.make_alphabet_transformer("gbmr4") +gbmr4_transformer = make_alphabet_transformer(gbmr4) def default_projector(peptide): @@ -70,7 +70,8 @@ def similar_peptides(set1, set2, projector=default_projector): Returns ---------- - string list + string list of peptides which approximately overlap between the two input + sets. """ result = collections.defaultdict(lambda: ([], [])) for (index, peptides) in enumerate([set1, set2]): @@ -156,6 +157,10 @@ def cross_validation_folds( ) if peptides_to_remove: + # TODO: instead of dropping peptides, downweight the + # peptides which get grouped together + # For example, we could replace this code with + # test_peptides, test_peptide_weights = .... test_split = full_test_split.drop_allele_peptide_lists( [allele] * len(peptides_to_remove), peptides_to_remove) diff --git a/mhcflurry/keras_layers/drop_mask.py b/mhcflurry/keras_layers/drop_mask.py new file mode 100644 index 0000000000000000000000000000000000000000..8ea799ca15ec7f5561f71d721fe6619036c74401 --- /dev/null +++ b/mhcflurry/keras_layers/drop_mask.py @@ -0,0 +1,16 @@ +from keras.layers import Layer + +class DropMask(Layer): + """ + Sometimes we know that a mask is always going to contain 1s (and never 0s) + due to e.g. 
slicing the beginning of a sequence with a known min length. + In that case it can be useful to drop the sequence mask and feed the + activations to a layer which does not support masking (e.g. Dense). + """ + supports_masking = True + + def call(self, x, mask): + return x + + def compute_mask(self, x, mask): + return None diff --git a/mhcflurry/keras_layers/masked_global_average_pooling.py b/mhcflurry/keras_layers/masked_global_average_pooling.py new file mode 100644 index 0000000000000000000000000000000000000000..187cc9c862dfd4f3896c82680d249a6653350888 --- /dev/null +++ b/mhcflurry/keras_layers/masked_global_average_pooling.py @@ -0,0 +1,31 @@ +import keras.layers +import keras.backend as K + +class MaskedGlobalAveragePooling1D(keras.layers.pooling._GlobalPooling1D): + """ + Takes an embedded representation of a sentence with dims + (n_samples, max_length, n_dims) + where each sample is masked to allow for variable-length inputs. + Returns a tensor of shape (n_samples, n_dims) after averaging across + time in a mask-sensitive fashion. + """ + supports_masking = True + + def call(self, x, mask): + expanded_mask = K.expand_dims(mask) + # zero embedded vectors which come from masked characters + x_masked = x * expanded_mask + # how many non-masked characters are in each row? 
+ mask_counts = K.sum(mask, axis=-1) + # add up the vector representations along the time dimension + # the result should have dimension (n_samples, n_embedding_dims) + x_sums = K.sum(x_masked, axis=1) + # cast the number of non-zero elements to float32 and + # give it an extra dimension so it can broadcast properly in + # an elementwise division + counts_cast = K.expand_dims(K.cast(mask_counts, "float32")) + return x_sums / counts_cast + + def compute_mask(self, x, mask): + return None + diff --git a/mhcflurry/keras_layers/masked_global_max_pooling.py b/mhcflurry/keras_layers/masked_global_max_pooling.py new file mode 100644 index 0000000000000000000000000000000000000000..5bec252a5a445cb691c9ed2503cc5402a3d7e0ad --- /dev/null +++ b/mhcflurry/keras_layers/masked_global_max_pooling.py @@ -0,0 +1,24 @@ +import keras.layers +import keras.backend as K + +class MaskedGlobalMaxPooling1D(keras.layers.pooling._GlobalPooling1D): + """ + Takes an embedded representation of a sentence with dims + (n_samples, max_length, n_dims) + where each sample is masked to allow for variable-length inputs. + Returns a tensor of shape (n_samples, n_dims) after max-pooling across + time in a mask-sensitive fashion. 
+ """ + supports_masking = True + + def call(self, x, mask): + expanded_mask = K.expand_dims(mask) + # zero embedded vectors which come from masked characters + x_masked = x * expanded_mask + + # one flaw here is that we're returning max(0, max(x[:, i])) instead of + # max(x[:, i]) + return K.max(x_masked, axis=1) + + def compute_mask(self, x, mask): + return None diff --git a/mhcflurry/keras_layers/masked_slice.py b/mhcflurry/keras_layers/masked_slice.py new file mode 100644 index 0000000000000000000000000000000000000000..022c7f79058763a878556adb42ad71971f02ff2b --- /dev/null +++ b/mhcflurry/keras_layers/masked_slice.py @@ -0,0 +1,37 @@ +import keras.layers + +class MaskedSlice(keras.layers.Lambda): + """ + Takes an embedded representation of a sentence with dims + (n_samples, max_length, n_dims) + where each sample is masked to allow for variable-length inputs. + Returns a tensor of shape (n_samples, time_end - time_start, n_dims) + holding the selected slice of time steps from each sentence. + """ + supports_masking = True + + def __init__( + self, + time_start, + time_end, + *args, + **kwargs): + assert time_start >= 0 + assert time_end >= 0 + self.time_start = time_start + self.time_end = time_end + super(MaskedSlice, self).__init__(*args, **kwargs) + + def call(self, x, mask): + return x[:, self.time_start:self.time_end, :] + + def compute_mask(self, x, mask): + return mask[:, self.time_start:self.time_end, :] + + def get_output_shape_for(self, input_shape): + assert len(input_shape) == 3 + output_shape = ( + input_shape[0], + self.time_end - self.time_start + 1, + input_shape[2]) + return output_shape diff --git a/mhcflurry/package_metadata.py b/mhcflurry/package_metadata.py deleted file mode 100644 index 36e61c641d912318fc6e395d4913c21c77c544ff..0000000000000000000000000000000000000000 --- a/mhcflurry/package_metadata.py +++ /dev/null @@ -1,2 +0,0 @@ - -__version__ = "0.1.0" diff --git a/mhcflurry/parallelism.py b/mhcflurry/parallelism.py index 
faecc871469210058ebb11f3323ae0a1ccdc20ce..df8dcfab8ff6b054ee65a2b1fc71df85f204b7bc 100644 --- a/mhcflurry/parallelism.py +++ b/mhcflurry/parallelism.py @@ -1,5 +1,5 @@ -from concurrent import futures import logging +from concurrent import futures DEFAULT_BACKEND = None diff --git a/setup.py b/setup.py index 909ba1f78c4328e7374ce3243423e1ce72d35e1c..dd66f5813776311e7bcaa4269175373e26a8cfce 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,7 @@ except: pass -with open('mhcflurry/package_metadata.py', 'r') as f: +with open('mhcflurry/__init__.py', 'r') as f: version = re.search( r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', f.read(),