diff --git a/experiments/matrix-completion-hyperparameter-search.py b/experiments/matrix-completion-hyperparameter-search.py old mode 100644 new mode 100755 index b62a778b9ac8f385418aeef36714ded408aa0cd5..c9559f09529d16119790c94f79c1887283f6d90e --- a/experiments/matrix-completion-hyperparameter-search.py +++ b/experiments/matrix-completion-hyperparameter-search.py @@ -24,12 +24,13 @@ For each allele, perform 5-fold cross validation to import argparse from collections import defaultdict -from fancyimpute import MICE, KNN +from fancyimpute import MICE, KNN, SimpleFill import numpy as np import pandas as pd -from mhcflurry.peptide_encoding import ( - fixed_length_index_encoding, - index_encoding, +from mhcflurry.peptide_encoding import fixed_length_index_encoding +from mhcflurry.amino_acid import ( + common_amino_acids, + amino_acids_with_unknown, ) from mhcflurry import Class1BindingPredictor from sklearn.cross_validation import StratifiedKFold @@ -109,10 +110,15 @@ parser.add_argument( default=100) parser.add_argument( - "--use-mice", + "--impute", + default="mice", + help="Use {'mice', 'knn', 'meanfill'} for imputing pre-training data") + +parser.add_argument( + "--unknown-amino-acids", default=False, action="store_true", - help="Use MICE for imputation instead of KNN") + help="When expanding 8mers into 9mers use 'X' instead of all possible AAs") def print_length_distribution(peptides, values, name): @@ -179,6 +185,21 @@ if __name__ == "__main__": "activation" ]) + if args.unknown_amino_acids: + index_encoding = amino_acids_with_unknown.index_encoding + else: + index_encoding = common_amino_acids.index_encoding + + impute_method_name = args.impute.lower() + if impute_method_name.startswith("mice"): + imputer = MICE(n_burn_in=5, n_imputations=20, min_value=0, max_value=1) + elif impute_method_name.startswith("knn"): + imputer = KNN(k=1, orientation="columns", print_interval=10) + elif impute_method_name.startswith("mean"): + imputer = SimpleFill("mean") + else: + raise ValueError("Invalid imputation method: %s" % impute_method_name) + # want at least 5 samples in each fold of CV # to make meaningful estimates of accuracy min_samples_per_cv_fold = 5 * args.n_folds @@ -243,10 +264,6 @@ if __name__ == "__main__": allele_list=allele_list, min_observations_per_peptide=2, min_observations_per_allele=min(10, min_samples_per_cv_fold)) - if args.use_mice: - imputer = MICE(n_burn_in=5, n_imputations=20, min_value=0, max_value=1) - else: - imputer = KNN(k=1, orientation="columns") pMHC_affinity_matrix_fold_completed = imputer.complete(pMHC_affinity_matrix_fold) # keep synthetic data for 9mer peptides, @@ -276,7 +293,8 @@ if __name__ == "__main__": X_train, training_row_peptides, training_counts = \ fixed_length_index_encoding( peptides=train_peptides, - desired_length=9) + desired_length=9, + allow_unknown_amino_acids=args.unknown_amino_acids) most_common_peptide_idx = np.argmax(training_counts) if args.verbose: print("-- Most common peptide in training data: %s (length=%d, count=%d)" % ( @@ -298,9 +316,13 @@ if __name__ == "__main__": activation ) print("\n-----") - print("Training model for %s with hyperparameters: %s" % ( - allele, - key)) + print( + ("Training model for %s (# peptides = %d, # samples=%d)" + " with parameters: %s") % ( + allele, + len(train_peptides), + len(X_train), + key)) print("-----") predictor = Class1BindingPredictor.from_hyperparameters( embedding_output_dim=embedding_dim_size, @@ -308,6 +330,9 @@ if __name__ == "__main__": activation=activation, output_activation="sigmoid", dropout_probability=dropout, + verbose=args.verbose, + allow_unknown_amino_acids=args.unknown_amino_acids, + embedding_input_dim=21 if args.unknown_amino_acids else 20, ) predictor.fit( X=X_train, diff --git a/mhcflurry/__init__.py b/mhcflurry/__init__.py index ec1ba0edbe1165226e40327356677ce6b85974f9..4ab1d343a2cb6462e2e50506c00e727a731fd75f 100644 --- a/mhcflurry/__init__.py +++ b/mhcflurry/__init__.py @@ -16,16 +16,16 @@ from . import paths from . import data from . import feedforward from . import common -from . import fixed_length_peptides from . import peptide_encoding +from . import amino_acid from .class1_binding_predictor import Class1BindingPredictor __all__ = [ "paths", "data", "feedforward", - "fixed_length_peptides", "peptide_encoding", + "amino_acid", "common", "Class1BindingPredictor" ] diff --git a/mhcflurry/amino_acid.py b/mhcflurry/amino_acid.py index c7607d4d712ff1929056e99836ef2aa06696a44e..1ba60648febd60a3da2dce17395c16ecbc7e2d6c 100644 --- a/mhcflurry/amino_acid.py +++ b/mhcflurry/amino_acid.py @@ -12,7 +12,81 @@ # See the License for the specific language governing permissions and # limitations under the License. -amino_acids = { +import numpy as np + + +class Alphabet(object): + """ + Used to track the order of amino acids used for peptide encodings + """ + + def __init__(self, **kwargs): + self.letters_to_names = {} + for (k, v) in kwargs.items(): + self.add(k, v) + + def add(self, letter, name): + assert letter not in self.letters_to_names + assert len(letter) == 1 + self.letters_to_names[letter] = name + + def letters(self): + return list(sorted(self.letters_to_names.keys())) + + def names(self): + return [self.letters_to_names[k] for k in self.letters()] + + def index_dict(self): + return {c: i for (i, c) in enumerate(self.letters())} + + def copy(self): + return Alphabet(**self.letters_to_names) + + def __getitem__(self, k): + return self.letters_to_names[k] + + def __setitem__(self, k, v): + self.add(k, v) + + def index_encoding_list(self, peptides): + index_dict = self.index_dict() + return [ + [index_dict[amino_acid] for amino_acid in peptide] + for peptide in peptides + ] + + def index_encoding(self, peptides, peptide_length): + """ + Encode a set of equal length peptides as a matrix of their + amino acid indices. + """ + X = np.zeros((len(peptides), peptide_length), dtype=int) + index_dict = self.index_dict() + for i, peptide in enumerate(peptides): + for j, amino_acid in enumerate(peptide): + X[i, j] = index_dict[amino_acid] + return X + + def hotshot_encoding( + self, + peptides, + peptide_length): + """ + Encode a set of equal length peptides as a binary matrix, + where each letter is transformed into a length 20 vector with a single + element that is 1 (and the others are 0). + """ + shape = (len(peptides), peptide_length, 20) + index_dict = self.index_dict() + X = np.zeros(shape, dtype=bool) + for i, peptide in enumerate(peptides): + for j, amino_acid in enumerate(peptide): + k = index_dict[amino_acid] + X[i, j, k] = 1 + return X + + +common_amino_acids = Alphabet(**{ "A": "Alanine", "R": "Arginine", "N": "Asparagine", @@ -33,8 +107,7 @@ amino_acids = { "W": "Tryptophan", "Y": "Tyrosine", "V": "Valine", -} +}) -amino_acid_letters = list(sorted(amino_acids.keys())) -amino_acid_names = [amino_acids[k] for k in amino_acid_letters] -amino_acid_letter_indices = {c: i for (i, c) in enumerate(amino_acid_letters)} +amino_acids_with_unknown = common_amino_acids.copy() +amino_acids_with_unknown.add("X", "Unknown") diff --git a/mhcflurry/class1_binding_predictor.py b/mhcflurry/class1_binding_predictor.py index f75bf24e4958d5a16f461eb4e311272dd87317fc..3931a5ab5158a29a5cb88d39295ad08777d83af8 100644 --- a/mhcflurry/class1_binding_predictor.py +++ b/mhcflurry/class1_binding_predictor.py @@ -29,21 +29,34 @@ import json import numpy as np from keras.models import model_from_config +from .amino_acid import ( + common_amino_acids, + amino_acids_with_unknown, +) from .class1_allele_specific_hyperparameters import MAX_IC50 from .common import normalize_allele_name -from .peptide_encoding import index_encoding from .paths import CLASS1_MODEL_DIRECTORY from .feedforward import make_embedding_network -from .fixed_length_peptides import fixed_length_from_many_peptides +from .peptide_encoding import fixed_length_from_many_peptides _allele_predictor_cache = {} +common_amino_acid_letters = common_amino_acids.letters() + class Class1BindingPredictor(object): - def __init__(self, model, name=None, max_ic50=MAX_IC50): + def __init__( + self, + model, + name=None, + max_ic50=MAX_IC50, + allow_unknown_amino_acids=False, + verbose=False): self.model = model self.max_ic50 = max_ic50 self.name = name + self.allow_unknown_amino_acids = allow_unknown_amino_acids + self.verbose = verbose @classmethod def from_disk( @@ -92,7 +105,8 @@ class Class1BindingPredictor(object): loss="mse", output_activation="sigmoid", dropout_probability=0, - learning_rate=0.001): + learning_rate=0.001, + **kwargs): """ Create untrained predictor with the given hyperparameters. """ @@ -110,7 +124,8 @@ class Class1BindingPredictor(object): return cls( name=name, max_ic50=max_ic50, - model=model) + model=model, + **kwargs) def _combine_training_data( self, @@ -197,8 +212,8 @@ class Class1BindingPredictor(object): X_combined = np.vstack([X_pretrain, X]) Y_combined = np.concatenate([Y_pretrain, Y]) combined_weights = np.concatenate([ + pretrain_sample_weights, sample_weights, - pretrain_sample_weights ]) return X_combined, Y_combined, combined_weights, n_pretrain_samples @@ -252,6 +267,13 @@ class Class1BindingPredictor(object): total_combined_sample_weight = ( total_pretrain_sample_weight + total_train_sample_weight) + if self.verbose: + print("-- Total pretrain weight = %f (%f%%), sample weight = %f (%f%%)" % ( + total_pretrain_sample_weight, + 100 * total_pretrain_sample_weight / total_combined_sample_weight, + total_train_sample_weight, + 100 * total_train_sample_weight / total_combined_sample_weight)) + for epoch in range(n_training_epochs): # weights for synthetic points can be shrunk as: # ~ 1 / (1+epoch)**2 @@ -305,42 +327,6 @@ class Class1BindingPredictor(object): nb_epoch=1, verbose=0) - """ - def fit_peptides( - self, - peptides, - affinity_values, - sample_weights=None, - pretrain_peptides=None, - pretrain_affinity_values=None, - pretrain_sample_weights=None, - n_training_epochs=200, - verbose=False): - ''' - Train model from peptide sequences, expanding shorter or longer - peptides to make 9mers. - ''' - X, original_peptides, counts = \ - fixed_length_index_encoding( - peptides=peptides, - desired_length=9) - lookup = {k: v for (k, v) in zip} - Y = np.asarray(Y) - if sample_weights is None: - sample_weights = np.ones_like(Y) - else: - sample_weights = np.asarray(Y) - - if pretrain_peptides is None: - - if Y_pretrain - Y_pretrain = np.asarray(Y_pretrain) - if pretrain_sample_weights is None: - pretrain_sample_weights = np.ones_like(Y_pretrain) - - train_weights = 1.0 / np.array(expanded_train_counts) - """ - def to_disk(self, model_json_path, weights_hdf_path, overwrite=False): if exists(model_json_path) and overwrite: logging.info( @@ -435,13 +421,22 @@ class Class1BindingPredictor(object): """ return self.max_ic50 ** (1.0 - log_value) + def encode_peptides(self, peptides): + if self.allow_unknown_amino_acids: + return amino_acids_with_unknown.index_encoding(peptides, 9) + else: + return common_amino_acids.index_encoding(peptides, 9) + def _predict_9mer_peptides(self, peptides): """ Predict binding affinity for 9mer peptides """ if any(len(peptide) != 9 for peptide in peptides): raise ValueError("Can only predict 9mer peptides") - X = index_encoding(peptides, peptide_length=9) + X = self.encode_peptides(peptides) + max_expected_index = 20 if self.allow_unknown_amino_acids else 19 + assert X.max() <= max_expected_index, \ + "Got index %d in peptide encoding" % (X.max(),) return self.model.predict(X, verbose=False).flatten() def _predict_9mer_peptides_ic50(self, peptides): @@ -457,9 +452,19 @@ class Class1BindingPredictor(object): group_peptides = list(group_peptides) expanded_peptides, _, _ = fixed_length_from_many_peptides( peptides=group_peptides, - desired_length=9) + desired_length=9, + insert_amino_acid_letters=( + ["X"] if self.allow_unknown_amino_acids + else common_amino_acid_letters)) + n_group = len(group_peptides) n_expanded = len(expanded_peptides) + if self.verbose: + print( + "[Class1BindingPredictor] Expanded %d peptides of length %d => %d" % ( + n_group, + length, + n_expanded)) expansion_factor = int(n_expanded / n_group) raw_y = self._predict_9mer_peptides(expanded_peptides) if expansion_factor == 1: diff --git a/mhcflurry/data.py b/mhcflurry/data.py index 75435c3b1de95f7d030452aa3406f53550035a5e..895edecdfe9db51affda0233f1defc59549eaafe 100644 --- a/mhcflurry/data.py +++ b/mhcflurry/data.py @@ -23,14 +23,14 @@ import pandas as pd import numpy as np from .common import normalize_allele_name +from .amino_acid import common_amino_acids from .peptide_encoding import ( - index_encoding, indices_to_hotshot_encoding, -) -from .fixed_length_peptides import ( fixed_length_from_many_peptides ) +index_encoding = common_amino_acids.index_encoding + AlleleData = namedtuple( "AlleleData", [ diff --git a/mhcflurry/feedforward.py b/mhcflurry/feedforward.py index 7f09c73def7612508d33f44981663b7f8aa785f9..f2c7876bc520d6f6513b4922335293fba07aabfe 100644 --- a/mhcflurry/feedforward.py +++ b/mhcflurry/feedforward.py @@ -128,9 +128,10 @@ def make_hotshot_network( output_activation="sigmoid", dropout_probability=0.0, optimizer=None, - learning_rate=0.001): + learning_rate=0.001, + n_amino_acids=20): return make_network( - input_size=peptide_length * 20, + input_size=peptide_length * n_amino_acids, layer_sizes=layer_sizes, activation=activation, init=init, diff --git a/mhcflurry/fixed_length_peptides.py b/mhcflurry/fixed_length_peptides.py deleted file mode 100644 index 1ab37bdf4fd0a178c52be952f34888f1eaf652ad..0000000000000000000000000000000000000000 --- a/mhcflurry/fixed_length_peptides.py +++ /dev/null @@ -1,230 +0,0 @@ -# Copyright (c) 2015. Mount Sinai School of Medicine -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from __future__ import ( - print_function, - division, - absolute_import, -) -import itertools -import logging - -from .amino_acid import amino_acid_letters - - -def all_kmers(k, alphabet=amino_acid_letters): - """ - Generates all k-mer peptide sequences - - Parameters - ---------- - k : int - - alphabet : str | list of characters - """ - alphabets = [alphabet] * k - return [ - "".join(combination) - for combination - in itertools.product(*alphabets) - ] - - -class CombinatorialExplosion(Exception): - pass - - -def extend_peptide( - peptide, - desired_length, - start_offset, - end_offset, - alphabet=amino_acid_letters): - """Extend peptide by inserting every possible amino acid combination - if we're trying to e.g. turn an 8mer into 9mers. - - Parameters - ---------- - peptide : str - - desired_length : int - - start_offset : int - How many characters (from the position before the start of the string) - to skip before inserting characters. - - - end_offset : int - Last character position from the end where we insert new characters, - where 0 is the position after the last character. - - alphabet : str | list of character - """ - n = len(peptide) - assert n < desired_length, \ - "%s (length = %d) is too long! Must be shorter than %d" % ( - peptide, n, desired_length) - n_missing = desired_length - n - if n_missing > 3: - raise CombinatorialExplosion( - "Cannot transform %s of length %d into a %d-mer peptide" % ( - peptide, n, desired_length)) - return [ - peptide[:i] + extra + peptide[i:] - for i in range(start_offset, n - end_offset + 1) - for extra in all_kmers(n_missing, alphabet=alphabet) - ] - - -def shorten_peptide( - peptide, - desired_length, - start_offset, - end_offset, - alphabet=amino_acid_letters): - """Shorten peptide if trying to e.g. turn 10mer into 9mers - - Parameters - ---------- - - peptide : str - - desired_length : int - - start_offset : int - - end_offset : int - - alphabet : str | list of characters - """ - n = len(peptide) - assert n > desired_length, \ - "%s (length = %d) is too short! Must be longer than %d" % ( - peptide, n, desired_length) - n_skip = n - desired_length - assert n_skip > 0, \ - "Expected length of peptide %s %d to be greater than %d" % ( - peptide, n, desired_length) - end_range = n - end_offset - n_skip + 1 - return [ - peptide[:i] + peptide[i + n_skip:] - for i in range(start_offset, end_range) - ] - - -def fixed_length_from_many_peptides( - peptides, - desired_length, - start_offset_extend=2, - end_offset_extend=1, - start_offset_shorten=2, - end_offset_shorten=0, - alphabet=amino_acid_letters): - """ - Create a set of fixed-length k-mer peptides from a collection of varying - length peptides. - - - Shorter peptides are filled in using all possible amino acids at any - insertion site between start_offset_shorten and -end_offset_shorten - where start_offset_extend=0 represents insertions before the string - and end_offset_extend=0 represents insertions after the string's ending. - - Longer peptides are shortened by deleting contiguous residues, starting - from start_offset_shorten and ending with -end_offset_shorten. Unlike - peptide extensions, the offsets for shortening a peptide range between - the first and last positions (rather than between the positions *before* - the string starts and the position *after*). - - We can recreate the methods from: - Accurate approximation method for prediction of class I MHC - affinities for peptides of length 8, 10 and 11 using prediction - tools trained on 9mers. - by Lundegaard et. al. (http://bioinformatics.oxfordjournals.org/content/24/11/1397) - with the following settings: - - desired_length = 9 - - start_offset_extend = 3 - - end_offset_extend = 2 - - start_offset_shorten = 3 - - end_offset_shorten = 1 - - Returns three lists: - - a list of fixed length peptides (all of length `desired_length`) - - a list of the original peptides from which subsequences were - contracted or lengthened - - a list of integers indicating the number of fixed length peptides - generated from each variable length peptide. - - Example: - kmers, original, counts = fixed_length_from_many_peptides( - peptides=["ABC", "A"] - desired_length=2, - start_offset_extend=0, - end_offset_extend=0, - start_offset_shorten=0, - end_offset_shorten=0, - alphabet="ABC") - kmers == ["BC", "AC", "AB", "AA", "BA", "CA", "AA", "AB", "AC"] - original == ["ABC", "ABC", "ABC", "A", "A", "A", "A", "A", "A"] - counts == [3, 3, 3, 6, 6, 6, 6, 6, 6] - - Parameters - ---------- - peptide : list of str - - desired_length : int - - start_offset_extend : int - - end_offset_extend : int - - start_offset_shorten : int - - end_offset_shorten : int - - alphabet : str | list of characters - """ - all_fixed_length_peptides = [] - original_peptide_sequences = [] - counts = [] - for peptide in peptides: - n = len(peptide) - if n == desired_length: - fixed_length_peptides = [peptide] - elif n < desired_length: - try: - fixed_length_peptides = extend_peptide( - peptide=peptide, - desired_length=desired_length, - start_offset=start_offset_extend, - end_offset=end_offset_extend, - alphabet=alphabet) - except CombinatorialExplosion: - logging.warn( - "Peptide %s is too short to be extended to length %d" % ( - peptide, desired_length)) - continue - else: - fixed_length_peptides = shorten_peptide( - peptide=peptide, - desired_length=desired_length, - start_offset=start_offset_shorten, - end_offset=end_offset_shorten, - alphabet=alphabet) - - n_fixed_length = len(fixed_length_peptides) - all_fixed_length_peptides.extend(fixed_length_peptides) - original_peptide_sequences.extend([peptide] * n_fixed_length) - counts.extend([n_fixed_length] * n_fixed_length) - return all_fixed_length_peptides, original_peptide_sequences, counts diff --git a/mhcflurry/peptide_encoding.py b/mhcflurry/peptide_encoding.py index ac9f96207a86e28e9129de87b97fa7d7842fa57f..c4698fc9d4a57d4bdf71cb76917483defd284957 100644 --- a/mhcflurry/peptide_encoding.py +++ b/mhcflurry/peptide_encoding.py @@ -12,37 +12,223 @@ # See the License for the specific language governing permissions and # limitations under the License. +import itertools +import logging + import numpy as np -from .amino_acid import amino_acid_letter_indices -from .fixed_length_peptides import fixed_length_from_many_peptides +from .amino_acid import common_amino_acids, amino_acids_with_unknown + +common_amino_acid_letters = common_amino_acids.letters() + + +def all_kmers(k, alphabet=common_amino_acid_letters): + """ + Generates all k-mer peptide sequences + + Parameters + ---------- + k : int + + alphabet : str | list of characters + """ + alphabets = [alphabet] * k + return [ + "".join(combination) + for combination + in itertools.product(*alphabets) + ] + + +class CombinatorialExplosion(Exception): + pass + +def extend_peptide( + peptide, + desired_length, + start_offset, + end_offset, + insert_amino_acid_letters=common_amino_acid_letters): + """Extend peptide by inserting every possible amino acid combination + if we're trying to e.g. turn an 8mer into 9mers. + + Parameters + ---------- + peptide : str + + desired_length : int + + start_offset : int + How many characters (from the position before the start of the string) + to skip before inserting characters. + + + end_offset : int + Last character position from the end where we insert new characters, + where 0 is the position after the last character. -def hotshot_encoding(peptides, peptide_length): + insert_alphabet : str | list of character """ - Encode a set of equal length peptides as a binary matrix, - where each letter is transformed into a length 20 vector with a single - element that is 1 (and the others are 0). + n = len(peptide) + assert n < desired_length, \ + "%s (length = %d) is too long! Must be shorter than %d" % ( + peptide, n, desired_length) + n_missing = desired_length - n + if n_missing > 3: + raise CombinatorialExplosion( + "Cannot transform %s of length %d into a %d-mer peptide" % ( + peptide, n, desired_length)) + return [ + peptide[:i] + extra + peptide[i:] + for i in range(start_offset, n - end_offset + 1) + for extra in all_kmers( + n_missing, + alphabet=insert_amino_acid_letters) + ] + + +def shorten_peptide( + peptide, + desired_length, + start_offset, + end_offset, + insert_amino_acid_letters=common_amino_acid_letters): + """Shorten peptide if trying to e.g. turn 10mer into 9mers + + Parameters + ---------- + + peptide : str + + desired_length : int + + start_offset : int + + end_offset : int + + alphabet : str | list of characters """ - shape = (len(peptides), peptide_length, 20) - X = np.zeros(shape, dtype=bool) - for i, peptide in enumerate(peptides): - for j, amino_acid in enumerate(peptide): - k = amino_acid_letter_indices[amino_acid] - X[i, j, k] = 1 - return X + n = len(peptide) + assert n > desired_length, \ + "%s (length = %d) is too short! Must be longer than %d" % ( + peptide, n, desired_length) + n_skip = n - desired_length + assert n_skip > 0, \ + "Expected length of peptide %s %d to be greater than %d" % ( + peptide, n, desired_length) + end_range = n - end_offset - n_skip + 1 + return [ + peptide[:i] + peptide[i + n_skip:] + for i in range(start_offset, end_range) + ] -def index_encoding(peptides, peptide_length): +def fixed_length_from_many_peptides( + peptides, + desired_length, + start_offset_extend=2, + end_offset_extend=1, + start_offset_shorten=2, + end_offset_shorten=0, + insert_amino_acid_letters=common_amino_acid_letters): """ - Encode a set of equal length peptides as a vector of their - amino acid indices. + Create a set of fixed-length k-mer peptides from a collection of varying + length peptides. + + + Shorter peptides are filled in using all possible amino acids at any + insertion site between start_offset_shorten and -end_offset_shorten + where start_offset_extend=0 represents insertions before the string + and end_offset_extend=0 represents insertions after the string's ending. + + Longer peptides are shortened by deleting contiguous residues, starting + from start_offset_shorten and ending with -end_offset_shorten. Unlike + peptide extensions, the offsets for shortening a peptide range between + the first and last positions (rather than between the positions *before* + the string starts and the position *after*). + + We can recreate the methods from: + Accurate approximation method for prediction of class I MHC + affinities for peptides of length 8, 10 and 11 using prediction + tools trained on 9mers. + by Lundegaard et. al. (http://bioinformatics.oxfordjournals.org/content/24/11/1397) + with the following settings: + - desired_length = 9 + - start_offset_extend = 3 + - end_offset_extend = 2 + - start_offset_shorten = 3 + - end_offset_shorten = 1 + + Returns three lists: + - a list of fixed length peptides (all of length `desired_length`) + - a list of the original peptides from which subsequences were + contracted or lengthened + - a list of integers indicating the number of fixed length peptides + generated from each variable length peptide. + + Example: + kmers, original, counts = fixed_length_from_many_peptides( + peptides=["ABC", "A"] + desired_length=2, + start_offset_extend=0, + end_offset_extend=0, + start_offset_shorten=0, + end_offset_shorten=0, + insert_amino_acid_letters="ABC") + kmers == ["BC", "AC", "AB", "AA", "BA", "CA", "AA", "AB", "AC"] + original == ["ABC", "ABC", "ABC", "A", "A", "A", "A", "A", "A"] + counts == [3, 3, 3, 6, 6, 6, 6, 6, 6] + + Parameters + ---------- + peptide : list of str + + desired_length : int + + start_offset_extend : int + + end_offset_extend : int + + start_offset_shorten : int + + end_offset_shorten : int + + insert_amino_acid_letters : str | list of characters """ - X = np.zeros((len(peptides), peptide_length), dtype=int) - for i, peptide in enumerate(peptides): - for j, amino_acid in enumerate(peptide): - X[i, j] = amino_acid_letter_indices[amino_acid] - return X + all_fixed_length_peptides = [] + original_peptide_sequences = [] + counts = [] + for peptide in peptides: + n = len(peptide) + if n == desired_length: + fixed_length_peptides = [peptide] + elif n < desired_length: + try: + fixed_length_peptides = extend_peptide( + peptide=peptide, + desired_length=desired_length, + start_offset=start_offset_extend, + end_offset=end_offset_extend, + insert_amino_acid_letters=insert_amino_acid_letters) + except CombinatorialExplosion: + logging.warn( + "Peptide %s is too short to be extended to length %d" % ( + peptide, desired_length)) + continue + else: + fixed_length_peptides = shorten_peptide( + peptide=peptide, + desired_length=desired_length, + start_offset=start_offset_shorten, + end_offset=end_offset_shorten, + insert_amino_acid_letters=insert_amino_acid_letters) + + n_fixed_length = len(fixed_length_peptides) + all_fixed_length_peptides.extend(fixed_length_peptides) + original_peptide_sequences.extend([peptide] * n_fixed_length) + counts.extend([n_fixed_length] * n_fixed_length) + return all_fixed_length_peptides, original_peptide_sequences, counts def indices_to_hotshot_encoding(X, n_indices=None, first_index_value=0): @@ -68,7 +254,8 @@ def fixed_length_index_encoding( start_offset_shorten=0, end_offset_shorten=0, start_offset_extend=0, - end_offset_extend=0): + end_offset_extend=0, + allow_unknown_amino_acids=False): """ Take peptides of varying lengths, chop them into substrings of fixed length and apply index encoding to these substrings. @@ -94,12 +281,20 @@ def fixed_length_index_encoding( be useful for down-weighting the importance of multiple feature vectors which originate from the same sample. """ + if allow_unknown_amino_acids: + insert_letters = ["X"] + index_encoding = amino_acids_with_unknown.index_encoding + else: + insert_letters = common_amino_acids.letters() + index_encoding = common_amino_acids.index_encoding + fixed_length, original, counts = fixed_length_from_many_peptides( peptides=peptides, desired_length=desired_length, start_offset_shorten=start_offset_shorten, end_offset_shorten=end_offset_shorten, start_offset_extend=start_offset_extend, - end_offset_extend=end_offset_extend) + end_offset_extend=end_offset_extend, + insert_amino_acid_letters=insert_letters) X = index_encoding(fixed_length, desired_length) return X, original, counts diff --git a/test/test_fixed_length_peptides.py b/test/test_fixed_length_peptides.py index ebe0f8e25cc0ce2b4861275cfdb465b054c68ba8..733ef8a8f49737762df93479457c9b38399a5c7d 100644 --- a/test/test_fixed_length_peptides.py +++ b/test/test_fixed_length_peptides.py @@ -1,4 +1,4 @@ -from mhcflurry.fixed_length_peptides import ( +from mhcflurry.peptide_encoding import ( all_kmers, extend_peptide, shorten_peptide, @@ -26,7 +26,7 @@ def test_extend_peptide_all_positions(): desired_length=4, start_offset=0, end_offset=0, - alphabet="01") + insert_amino_acid_letters="01") expected = [ "0111", @@ -48,7 +48,7 @@ def test_shorten_peptide_all_positions(): desired_length=2, start_offset=0, end_offset=0, - alphabet="012") + insert_amino_acid_letters="012") expected = [ "12", @@ -65,7 +65,7 @@ def test_shorten_peptide_all_positions_except_first(): desired_length=2, start_offset=1, end_offset=0, - alphabet="012") + insert_amino_acid_letters="012") expected = [ "02", @@ -81,7 +81,7 @@ def test_shorten_peptide_all_positions_except_last(): desired_length=2, start_offset=0, end_offset=1, - alphabet="012") + insert_amino_acid_letters="012") expected = [ "12", @@ -98,7 +98,7 @@ def test_fixed_length_from_many_peptides(): end_offset_extend=0, start_offset_shorten=0, end_offset_shorten=0, - alphabet="ABC") + insert_amino_acid_letters="ABC") print(kmers) print(original) print(counts)