From 4b0fe361548d2da2337cbc5882e9ee82dfa2a8f6 Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com> Date: Sat, 21 Nov 2015 16:31:27 -0500 Subject: [PATCH] added unit tests for peptide shortening and extension --- mhcflurry/__init__.py | 6 +- mhcflurry/{data_helpers.py => data.py} | 136 +++++++++--------- mhcflurry/fixed_length_peptides.py | 188 +++++++++++++++---------- mhcflurry/mhc1_binding_predictor.py | 3 +- mhcflurry/peptide_encoding.py | 65 ++++++--- test/test_fixed_length_peptides.py | 109 ++++++++++++++ 6 files changed, 347 insertions(+), 160 deletions(-) rename mhcflurry/{data_helpers.py => data.py} (72%) create mode 100644 test/test_fixed_length_peptides.py diff --git a/mhcflurry/__init__.py b/mhcflurry/__init__.py index 5f7b272d..04e607f0 100644 --- a/mhcflurry/__init__.py +++ b/mhcflurry/__init__.py @@ -13,17 +13,19 @@ # limitations under the License. from . import paths -from . import data_helpers +from . import data from . import feedforward from . import common from . import fixed_length_peptides +from . import peptide_encoding from .mhc1_binding_predictor import Mhc1BindingPredictor __all__ = [ "paths", - "data_helpers", + "data", "feedforward", "fixed_length_peptides", + "peptide_encoding", "common", "Mhc1BindingPredictor" ] diff --git a/mhcflurry/data_helpers.py b/mhcflurry/data.py similarity index 72% rename from mhcflurry/data_helpers.py rename to mhcflurry/data.py index be40e8c6..a097e335 100644 --- a/mhcflurry/data_helpers.py +++ b/mhcflurry/data.py @@ -23,53 +23,25 @@ import pandas as pd import numpy as np from .common import normalize_allele_name -from .amino_acid import amino_acid_letter_indices - -AlleleData = namedtuple("AlleleData", "X Y peptides ic50") - - -def hotshot_encoding(peptides, peptide_length): - """ - Encode a set of equal length peptides as a binary matrix, - where each letter is transformed into a length 20 vector with a single - element that is 1 (and the others are 0). - """ - shape = (len(peptides), peptide_length, 20) - X = np.zeros(shape, dtype=bool) - for i, peptide in enumerate(peptides): - for j, amino_acid in enumerate(peptide): - k = amino_acid_letter_indices[amino_acid] - X[i, j, k] = 1 - return X - - -def index_encoding(peptides, peptide_length): - """ - Encode a set of equal length peptides as a vector of their - amino acid indices. - """ - X = np.zeros((len(peptides), peptide_length), dtype=int) - for i, peptide in enumerate(peptides): - for j, amino_acid in enumerate(peptide): - X[i, j] = amino_acid_letter_indices[amino_acid] - return X - - -def indices_to_hotshot_encoding(X, n_indices=None, first_index_value=0): - """ - Given an (n_samples, peptide_length) integer matrix - convert it to a binary encoding of shape: - (n_samples, peptide_length * n_indices) - """ - (n_samples, peptide_length) = X.shape - if not n_indices: - n_indices = X.max() - first_index_value + 1 +from .peptide_encoding import ( + fixed_length_index_encoding, + indices_to_hotshot_encoding, +) - X_binary = np.zeros((n_samples, peptide_length * n_indices), dtype=bool) - for i, row in enumerate(X): - for j, xij in enumerate(row): - X_binary[i, n_indices * j + xij - first_index_value] = 1 - return X_binary.astype(float) +AlleleData = namedtuple( + "AlleleData", + [ + "X_index", # index-based featue encoding of fixed length peptides + "X_binary", # binary encoding of fixed length peptides + "Y", # regression encoding of IC50 (log scaled between 0..1) + "peptides", # list of fixed length peptide string + "ic50", # IC50 value associated with each entry + "original_peptides", # original peptides may be of different lengths + "original_length", # len(original_peptide) + "substring_count", # how many substrings were extracted from + # each original peptide string + "weight", # 1.0 / count + ]) def _infer_csv_separator(filename): @@ -213,8 +185,8 @@ def load_allele_dicts( def load_allele_datasets( filename, peptide_length=9, - max_ic50=5000.0, - binary_encoding=True, + use_multiple_peptide_lengths=True, + max_ic50=50000.0, flatten_binary_encoding=True, sep=None, species_column_name="species", @@ -241,14 +213,18 @@ def load_allele_datasets( peptide_length : int Which length peptides to use (default=9) + use_multiple_peptide_lengths : bool + If a peptide is shorter than `peptide_length`, expand it into many + peptides of the appropriate length by inserting all combinations of + amino acids. Similarly, if a peptide is longer than `peptide_length`, + shorten it by deleting stretches of contiguous amino acids at all + peptide positions. + max_ic50 : float Treat IC50 scores above this value as all equally bad (transform them to 0.0 in the rescaled output) - binary_encoding : bool - Encode amino acids of each peptide as indices or binary vectors - - flatten_features : bool + flatten_binary_encoding : bool If False, returns a (n_samples, peptide_length, 20) matrix, otherwise returns the 2D flattened version of the same data. @@ -275,21 +251,53 @@ def load_allele_datasets( allele_groups = {} for allele, group in df.groupby(allele_column_name): - ic50 = np.array(group[ic50_column_name]) - Y = np.array(group["regression_output"]) - peptides = list(group[peptide_column_name]) - if binary_encoding: - X = hotshot_encoding(peptides, peptide_length=peptide_length) - if flatten_binary_encoding: - # collapse 3D input into 2D matrix - X = X.reshape((X.shape[0], peptide_length * 20)) - else: - X = index_encoding(peptides, peptide_length=peptide_length) assert allele not in allele_groups, \ "Duplicate datasets for %s" % allele + + raw_peptides = group[peptide_column_name] + + # filter lengths in case user wants to drop peptides that are longer + # or shorter than the desired fixed length + if not use_multiple_peptide_lengths: + drop_mask = raw_peptides.str.len() != peptide_length + group = group[~drop_mask] + raw_peptides = raw_peptides[~drop_mask] + + # convert from a Pandas column to a list, since that's easier to + # interact with later + raw_peptides = list(raw_peptides) + # convert numberical values from a Pandas column to arrays + ic50 = np.array(group[ic50_column_name]) + Y = np.array(group["regression_output"]) + + X_index, original_peptides, counts = fixed_length_index_encoding( + peptides=raw_peptides, + desired_length=peptide_length) + + X_binary = indices_to_hotshot_encoding(X_index, n_indices=20) + assert X_binary.shape[0] == X_index.shape[0], \ + ("Mismatch between number of samples for index encoding (%d)" + " vs. binary encoding (%d)") % ( + X_binary.shape[0], + X_index.shape[0]) + n_samples = X_binary.shape[0] + + if flatten_binary_encoding: + # collapse 3D input into 2D matrix + n_binary_features = peptide_length * 20 + X_binary = X_binary.reshape((n_samples, n_binary_features)) + + # easier to work with counts when they're an array instead of list + counts = np.array(counts) + allele_groups[allele] = AlleleData( - X=X, + X_index=X_index, + X_binary=X_binary, Y=Y, ic50=ic50, - peptides=peptides) + peptides=raw_peptides, + original_peptides=original_peptides, + original_length=[len(peptide) for peptide in original_peptides], + substring_count=counts, + weight=1.0 / counts) return allele_groups diff --git a/mhcflurry/fixed_length_peptides.py b/mhcflurry/fixed_length_peptides.py index d5ef0a69..edaa4548 100644 --- a/mhcflurry/fixed_length_peptides.py +++ b/mhcflurry/fixed_length_peptides.py @@ -13,6 +13,7 @@ # limitations under the License. import itertools +import logging from .amino_acid import amino_acid_letters @@ -20,14 +21,25 @@ from .amino_acid import amino_acid_letters def all_kmers(k, alphabet=amino_acid_letters): """ Generates all k-mer peptide sequences + + Parameters + ---------- + k : int + + alphabet : str | list of characters """ + alphabets = [alphabet] * k return [ "".join(combination) for combination - in itertools.product(list(alphabet) * k) + in itertools.product(*alphabets) ] +class CombinatorialExplosion(Exception): + pass + + def extend_peptide( peptide, desired_length, @@ -35,18 +47,37 @@ def extend_peptide( end_offset, alphabet=amino_acid_letters): """Extend peptide by inserting every possible amino acid combination - if we're trying to e.g. turn an 8mer into 9mers + if we're trying to e.g. turn an 8mer into 9mers. + + Parameters + ---------- + peptide : str + + desired_length : int + + start_offset : int + How many characters (from the position before the start of the string) + to skip before inserting characters. + + + end_offset : int + Last character position from the end where we insert new characters, + where 0 is the position after the last character. + + alphabet : str | list of character """ n = len(peptide) - assert n < desired_length + assert n < desired_length, \ + "%s (length = %d) is too long! Must be shorter than %d" % ( + peptide, n, desired_length) n_missing = desired_length - n if n_missing > 3: - raise ValueError( + raise CombinatorialExplosion( "Cannot transform %s of length %d into a %d-mer peptide" % ( peptide, n, desired_length)) return [ peptide[:i] + extra + peptide[i:] - for i in range(start_offset, n - end_offset) + for i in range(start_offset, n - end_offset + 1) for extra in all_kmers(n_missing, alphabet=alphabet) ] @@ -57,9 +88,25 @@ def shorten_peptide( start_offset, end_offset, alphabet=amino_acid_letters): - """Shorten peptide if trying to e.g. turn 10mer into 9mers""" + """Shorten peptide if trying to e.g. turn 10mer into 9mers + + Parameters + ---------- + + peptide : str + + desired_length : int + + start_offset : int + + end_offset : int + + alphabet : str | list of characters + """ n = len(peptide) - assert n > desired_length + assert n > desired_length, \ + "%s (length = %d) is too short! Must be longer than %d" % ( + peptide, n, desired_length) n_skip = n - desired_length assert n_skip > 0, \ "Expected length of peptide %s %d to be greater than %d" % ( @@ -71,50 +118,6 @@ def shorten_peptide( ] -def fixed_length_from_single_peptide( - peptide, - desired_length, - start_offset_extend=2, - end_offset_extend=1, - start_offset_shorten=2, - end_offset_shorten=0, - alphabet=amino_acid_letters): - """ - Create a set of fixed-length k-mer peptides from a single peptide of any - length. Shorter peptides are filled in using all possible amino acids at any - insertion site between (start_offset, -end_offset). - - We can recreate the methods from: - Accurate approximation method for prediction of class I MHC - affinities for peptides of length 8, 10 and 11 using prediction - tools trained on 9mers. - by Lundegaard et. al. (http://bioinformatics.oxfordjournals.org/content/24/11/1397) - with the following settings: - - desired_length = 9 - - start_offset_extend = 2 - - end_offset_extend = 1 - - start_offset_shorten = 2 - - end_offset_shorten = 0 - """ - n = len(peptide) - if n == desired_length: - return [peptide] - elif n < desired_length: - return extend_peptide( - peptide=peptide, - desired_length=desired_length, - start_offset=start_offset_extend, - end_offset=end_offset_extend, - alphabet=alphabet) - else: - return shorten_peptide( - peptide=peptide, - desired_length=desired_length, - start_offset=start_offset_shorten, - end_offset=end_offset_shorten, - alphabet=alphabet) - - def fixed_length_from_many_peptides( peptides, desired_length, @@ -125,11 +128,19 @@ def fixed_length_from_many_peptides( alphabet=amino_acid_letters): """ Create a set of fixed-length k-mer peptides from a collection of varying - length peptides. Shorter peptides are filled in using all possible amino - acids at any insertion site between - [start_offset_extend, length - end_offset_extend). - Longer peptides are made smaller by deleting contiguous residues between - [start_offset_shorten, length - end_offset_shorten) + length peptides. + + + Shorter peptides are filled in using all possible amino acids at any + insertion site between start_offset_shorten and -end_offset_shorten + where start_offset_extend=0 represents insertions before the string + and end_offset_extend=0 represents insertions after the string's ending. + + Longer peptides are shortened by deleting contiguous residues, starting + from start_offset_shorten and ending with -end_offset_shorten. Unlike + peptide extensions, the offsets for shortening a peptide range between + the first and last positions (rather than between the positions *before* + the string starts and the position *after*). We can recreate the methods from: Accurate approximation method for prediction of class I MHC @@ -138,10 +149,10 @@ def fixed_length_from_many_peptides( by Lundegaard et. al. (http://bioinformatics.oxfordjournals.org/content/24/11/1397) with the following settings: - desired_length = 9 - - start_offset_extend = 2 - - end_offset_extend = 1 - - start_offset_shorten = 2 - - end_offset_shorten = 0 + - start_offset_extend = 3 + - end_offset_extend = 2 + - start_offset_shorten = 3 + - end_offset_shorten = 1 Returns three lists: - a list of fixed length peptides (all of length `desired_length`) @@ -162,21 +173,52 @@ def fixed_length_from_many_peptides( kmers == ["BC", "AC", "AB", "AA", "BA", "CA", "AA", "AB", "AC"] original == ["ABC", "ABC", "ABC", "A", "A", "A", "A", "A", "A"] counts == [3, 3, 3, 6, 6, 6, 6, 6, 6] + + Parameters + ---------- + peptide : list of str + + desired_length : int + + start_offset_extend : int + + end_offset_extend : int + + start_offset_shorten : int + + end_offset_shorten : int + + alphabet : str | list of characters """ - fixed_length_peptides = [] + all_fixed_length_peptides = [] original_peptide_sequences = [] - number_expanded = [] + counts = [] for peptide in peptides: - fixed_length_peptides = fixed_length_from_single_peptide( - peptide, - desired_length=desired_length, - start_offset_extend=start_offset_extend, - end_offset_extend=end_offset_extend, - start_offset_shorten=start_offset_shorten, - end_offset_shorten=end_offset_shorten, - alphabet=alphabet) + n = len(peptide) + if n == desired_length: + fixed_length_peptides = [peptide] + elif n < desired_length: + try: + fixed_length_peptides = extend_peptide( + peptide=peptide, + desired_length=desired_length, + start_offset=start_offset_extend, + end_offset=end_offset_extend, + alphabet=alphabet) + except CombinatorialExplosion: + logging.warn( + "Peptide %s is too short to be extended to length %d" % ( + peptide, desired_length)) + continue + else: + fixed_length_peptides = shorten_peptide( + peptide=peptide, + desired_length=desired_length, + start_offset=start_offset_shorten, + end_offset=end_offset_shorten, + alphabet=alphabet) n_fixed_length = len(fixed_length_peptides) - fixed_length_peptides.extend(fixed_length_peptides) + all_fixed_length_peptides.extend(fixed_length_peptides) original_peptide_sequences.extend([peptide] * n_fixed_length) - number_expanded.extend([n_fixed_length] * n_fixed_length) - return fixed_length_peptides, original_peptide_sequences, number_expanded + counts.extend([n_fixed_length] * n_fixed_length) + return all_fixed_length_peptides, original_peptide_sequences, counts diff --git a/mhcflurry/mhc1_binding_predictor.py b/mhcflurry/mhc1_binding_predictor.py index b037b2d6..312eb892 100644 --- a/mhcflurry/mhc1_binding_predictor.py +++ b/mhcflurry/mhc1_binding_predictor.py @@ -30,7 +30,8 @@ import pandas as pd from keras.models import model_from_config from .class1_allele_specific_hyperparameters import MAX_IC50 -from .data_helpers import index_encoding, normalize_allele_name +from .common import normalize_allele_name +from .peptide_encoding import index_encoding from .paths import CLASS1_MODEL_DIRECTORY from .fixed_length_peptides import fixed_length_from_many_peptides diff --git a/mhcflurry/peptide_encoding.py b/mhcflurry/peptide_encoding.py index db20c8ea..ac9f9620 100644 --- a/mhcflurry/peptide_encoding.py +++ b/mhcflurry/peptide_encoding.py @@ -13,7 +13,9 @@ # limitations under the License. import numpy as np + from .amino_acid import amino_acid_letter_indices +from .fixed_length_peptides import fixed_length_from_many_peptides def hotshot_encoding(peptides, peptide_length): @@ -43,26 +45,6 @@ def index_encoding(peptides, peptide_length): return X -def index_encoding_of_substrings( - peptides, - substring_length, - delete_exclude_start=0, - delete_exclude_end=0): - """ - Take peptides of varying lengths, chop them into substrings of fixed - length and apply index encoding to these substrings. - - If a string is longer than the substring length, then it's reduced to - the desired length by deleting characters at all possible positions. - If positions at the start or end of a string should be exempt from deletion - then the number of exempt characters can be controlled via - `delete_exclude_start` and `delete_exclude_end`. - - Returns feature matrix X and a vector of substring counts. - """ - pass - - def indices_to_hotshot_encoding(X, n_indices=None, first_index_value=0): """ Given an (n_samples, peptide_length) integer matrix @@ -78,3 +60,46 @@ def indices_to_hotshot_encoding(X, n_indices=None, first_index_value=0): for j, xij in enumerate(row): X_binary[i, n_indices * j + xij - first_index_value] = 1 return X_binary.astype(float) + + +def fixed_length_index_encoding( + peptides, + desired_length, + start_offset_shorten=0, + end_offset_shorten=0, + start_offset_extend=0, + end_offset_extend=0): + """ + Take peptides of varying lengths, chop them into substrings of fixed + length and apply index encoding to these substrings. + + If a string is longer than the desired length, then it's reduced to + the desired length by deleting characters at all possible positions. When + positions at the start or end of a string should be exempt from deletion + then the number of exempt characters can be controlled via + `start_offset_shorten` and `end_offset_shorten`. + + If a string is shorter than the desired length then it is filled + with all possible characters of the alphabet at all positions. The + parameters `start_offset_extend` and `end_offset_extend` control whether + certain positions are excluded from insertion. The positions are + in a "inter-residue" coordinate system, where `start_offset_extend` = 0 + refers to the position *before* the start of a peptide and, similarly, + `end_offset_extend` = 0 refers to the position *after* the peptide. + + Returns feature matrix X, a list of original peptides for each feature + vector, and a list of integer counts indicating how many rows share a + particular original peptide. When two rows are expanded out of a single + original peptide, they will both have a count of 2. These counts can + be useful for down-weighting the importance of multiple feature vectors + which originate from the same sample. + """ + fixed_length, original, counts = fixed_length_from_many_peptides( + peptides=peptides, + desired_length=desired_length, + start_offset_shorten=start_offset_shorten, + end_offset_shorten=end_offset_shorten, + start_offset_extend=start_offset_extend, + end_offset_extend=end_offset_extend) + X = index_encoding(fixed_length, desired_length) + return X, original, counts diff --git a/test/test_fixed_length_peptides.py b/test/test_fixed_length_peptides.py new file mode 100644 index 00000000..ebe0f8e2 --- /dev/null +++ b/test/test_fixed_length_peptides.py @@ -0,0 +1,109 @@ +from mhcflurry.fixed_length_peptides import ( + all_kmers, + extend_peptide, + shorten_peptide, + fixed_length_from_many_peptides, +) +from nose.tools import eq_ + + +def test_all_kmers(): + kmers = all_kmers(2, alphabet=["A", "B"]) + assert len(kmers) == 4, kmers + eq_(set(kmers), {"AA", "AB", "BA", "BB"}) + + +def test_all_kmers_string_alphabet(): + kmers = all_kmers(2, alphabet="AB") + assert len(kmers) == 4, kmers + eq_(set(kmers), {"AA", "AB", "BA", "BB"}) + + +def test_extend_peptide_all_positions(): + # insert 0 or 1 at every position + results = extend_peptide( + "111", + desired_length=4, + start_offset=0, + end_offset=0, + alphabet="01") + + expected = [ + "0111", + "1111", + "1011", + "1111", + "1101", + "1111", + "1110", + "1111", + ] + eq_(results, expected) + + +def test_shorten_peptide_all_positions(): + # insert 0 or 1 at every position + results = shorten_peptide( + "012", + desired_length=2, + start_offset=0, + end_offset=0, + alphabet="012") + + expected = [ + "12", + "02", + "01" + ] + eq_(results, expected) + + +def test_shorten_peptide_all_positions_except_first(): + # insert 0 or 1 at every position + results = shorten_peptide( + "012", + desired_length=2, + start_offset=1, + end_offset=0, + alphabet="012") + + expected = [ + "02", + "01", + ] + eq_(results, expected) + + +def test_shorten_peptide_all_positions_except_last(): + # insert 0 or 1 at every position + results = shorten_peptide( + "012", + desired_length=2, + start_offset=0, + end_offset=1, + alphabet="012") + + expected = [ + "12", + "02", + ] + eq_(results, expected) + + +def test_fixed_length_from_many_peptides(): + kmers, original, counts = fixed_length_from_many_peptides( + peptides=["ABC", "A"], + desired_length=2, + start_offset_extend=0, + end_offset_extend=0, + start_offset_shorten=0, + end_offset_shorten=0, + alphabet="ABC") + print(kmers) + print(original) + print(counts) + eq_(len(kmers), len(original)) + eq_(len(kmers), len(counts)) + eq_(kmers, ["BC", "AC", "AB", "AA", "BA", "CA", "AA", "AB", "AC"]) + eq_(original, ["ABC", "ABC", "ABC", "A", "A", "A", "A", "A", "A"]) + eq_(counts, [3, 3, 3, 6, 6, 6, 6, 6, 6]) -- GitLab