From b2b032c76a5a8999be8ffd6626865fbca9d916e9 Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Mon, 4 Dec 2017 19:14:56 -0500 Subject: [PATCH] Even faster peptide encoding --- mhcflurry/encodable_sequences.py | 74 +++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 24 deletions(-) diff --git a/mhcflurry/encodable_sequences.py b/mhcflurry/encodable_sequences.py index 8477938b..d2a06498 100644 --- a/mhcflurry/encodable_sequences.py +++ b/mhcflurry/encodable_sequences.py @@ -21,6 +21,7 @@ from __future__ import ( import math import numpy +import pandas import typechecks @@ -169,37 +170,62 @@ class EncodableSequences(object): Returns ------- - numpy array of shape (len(sequences), max_length, 21) and dtype int + numpy array of shape (len(sequences), max_length) and dtype int """ - result = numpy.ones(shape=(len(sequences), max_length), dtype=int) * -1 - fill_value = amino_acid.AMINO_ACID_INDEX['X'] - for (i, sequence) in enumerate(sequences): - sequence_indexes = [ - amino_acid.AMINO_ACID_INDEX[char] for char in sequence - ] - if len(sequence) < left_edge + right_edge: + # Result array is int32, filled with X (null amino acid) value. + result = numpy.full( + fill_value=amino_acid.AMINO_ACID_INDEX['X'], + shape=(len(sequences), max_length), + dtype="int32") + + df = pandas.DataFrame({"peptide": sequences}) + df["length"] = df.peptide.str.len() + + middle_length = max_length - left_edge - right_edge + + # For efficiency we handle each supported peptide length using bulk + # array operations. + for (length, sub_df) in df.groupby("length"): + if length < left_edge + right_edge: raise ValueError( "Sequence '%s' (length %d) unsupported: length must be at " - "least %d" % ( - sequence, len(sequence), left_edge + right_edge)) - if len(sequence) > max_length: + "least %d. There are %d total peptides with this length." % ( + sub_df.iloc[0].peptide, length, left_edge + right_edge, + len(sub_df))) + if length > max_length: raise ValueError( "Sequence '%s' (length %d) unsupported: length must be at " - "most %d" % (sequence, len(sequence), max_length)) - - middle_length = max_length - left_edge - right_edge - num_null = max_length - len(sequence) + "most %d. There are %d total peptides with this length." % ( + sub_df.iloc[0].peptide, length, max_length, + len(sub_df))) + + # Array of shape (num peptides, length) giving fixed-length amino + # acid encoding each peptide of the current length. + fixed_length_indices = numpy.stack( + sub_df.peptide.map( + lambda s: numpy.array( + [amino_acid.AMINO_ACID_INDEX[char] for char in + s])).values) + + num_null = max_length - length num_null_left = int(math.ceil(num_null / 2)) - num_null_right = int(math.floor(num_null / 2)) num_not_null_middle = middle_length - num_null - result[i] = numpy.concatenate([ - sequence_indexes[:left_edge], - numpy.ones(num_null_left) * fill_value, - sequence_indexes[left_edge:left_edge + num_not_null_middle], - numpy.ones(num_null_right) * fill_value, - sequence_indexes[-right_edge:], - ]) - assert len(result[i]) == max_length + # Set left edge + result[sub_df.index, :left_edge] = fixed_length_indices[ + :, :left_edge + ] + + # Set middle. + result[ + sub_df.index, + left_edge + num_null_left : left_edge + num_null_left + num_not_null_middle + ] = fixed_length_indices[:, left_edge:left_edge + num_not_null_middle] + + # Set right edge. + result[ + sub_df.index, + -right_edge: + ] = fixed_length_indices[:, -right_edge:] return result -- GitLab