From 024cb70eae4995e2238600d3358c035af26f50a0 Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Fri, 19 Apr 2019 14:33:13 -0400 Subject: [PATCH] add support for left_pad_right_pad in encodable_sequences --- mhcflurry/encodable_sequences.py | 37 +++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/mhcflurry/encodable_sequences.py b/mhcflurry/encodable_sequences.py index 2560a8ef..8dff8e8f 100644 --- a/mhcflurry/encodable_sequences.py +++ b/mhcflurry/encodable_sequences.py @@ -51,7 +51,11 @@ class EncodableSequences(object): return len(self.sequences) def variable_length_to_fixed_length_categorical( - self, left_edge=4, right_edge=4, max_length=15): + self, + alignment_method="pad_middle", + left_edge=4, + right_edge=4, + max_length=15): """ Encode variable-length sequences using a fixed-length encoding designed for preserving the anchor positions of class I peptides. @@ -80,6 +84,7 @@ class EncodableSequences(object): fixed_length_sequences = ( self.sequences_to_fixed_length_index_encoded_array( self.sequences, + alignment_method=alignment_method, left_edge=left_edge, right_edge=right_edge, max_length=max_length)) @@ -87,7 +92,12 @@ class EncodableSequences(object): return self.encoding_cache[cache_key] def variable_length_to_fixed_length_vector_encoding( - self, vector_encoding_name, left_edge=4, right_edge=4, max_length=15): + self, + vector_encoding_name, + alignment_method="pad_middle", + left_edge=4, + right_edge=4, + max_length=15): """ Encode variable-length sequences using a fixed-length encoding designed for preserving the anchor positions of class I peptides. @@ -120,6 +130,7 @@ class EncodableSequences(object): fixed_length_sequences = ( self.sequences_to_fixed_length_index_encoded_array( self.sequences, + alignment_method=alignment_method, left_edge=left_edge, right_edge=right_edge, max_length=max_length)) @@ -227,7 +238,27 @@ class EncodableSequences(object): -right_edge: ] = fixed_length_sequences[:, -right_edge:] elif alignment_method == "left_pad_right_pad": - raise NotImplementedError + # Result array is int32, filled with X (null amino acid) value. + result = numpy.full(fill_value=amino_acid.AMINO_ACID_INDEX['X'], + shape=(len(sequences), max_length * 2), dtype="int32") + + df = pandas.DataFrame({"peptide": sequences}) + + # For efficiency we handle each supported peptide length using bulk + # array operations. + for (length, sub_df) in df.groupby(df.peptide.str.len()): + # Array of shape (num peptides, length) giving fixed-length amino + # acid encoding each peptide of the current length. + fixed_length_sequences = numpy.stack(sub_df.peptide.map( + lambda s: numpy.array( + [amino_acid.AMINO_ACID_INDEX[char] for char in + s])).values) + + # Set left edge + result[sub_df.index, :length] = fixed_length_sequences + + # Set right edge. + result[sub_df.index, -length:] = fixed_length_sequences else: raise NotImplementedError( "Unsupported alignment method: %s" % alignment_method) -- GitLab