diff --git a/mhcflurry/class1_binding_predictor.py b/mhcflurry/class1_binding_predictor.py index 27f8fc4cf1c1777be47ac0f3edc766e05db201ff..366a5a40b5850f33b3d89e246f04efa6e338f232 100644 --- a/mhcflurry/class1_binding_predictor.py +++ b/mhcflurry/class1_binding_predictor.py @@ -34,7 +34,7 @@ from .serialization_helpers import ( load_keras_model_from_disk, save_keras_model_to_disk ) - +from .peptide_encoding import check_valid_index_encoding_array from .class1_allele_specific_hyperparameters import MAX_IC50 _allele_predictor_cache = {} @@ -386,14 +386,7 @@ class Class1BindingPredictor(PredictorBase): Given an encoded array of amino acid indices, returns a vector of predicted log IC50 values. """ - X = np.asarray(X) - if len(X.shape) != 2: - raise ValueError("Expected 2d input, got array with shape %s" % ( - X.shape,)) - max_expected_index = 20 if self.allow_unknown_amino_acids else 19 - if X.max() > max_expected_index: - raise ValueError( - "Got index %d in peptide encoding, max expected %d" % ( - X.max(), - max_expected_index)) + X = check_valid_index_encoding_array( + X, + allow_unknown_amino_acids=self.allow_unknown_amino_acids) return self.model.predict(X, verbose=False).flatten() diff --git a/mhcflurry/data.py b/mhcflurry/data.py index bbf2e76c98044c29b08f21b5e6e3c379973a7167..57bc2c58dbecb06754ea9ff2dd6fd171bd604fc4 100644 --- a/mhcflurry/data.py +++ b/mhcflurry/data.py @@ -26,7 +26,8 @@ from .common import normalize_allele_name from .amino_acid import common_amino_acids from .peptide_encoding import ( indices_to_hotshot_encoding, - fixed_length_from_many_peptides + fixed_length_index_encoding, + check_valid_index_encoding_array, ) from .class1_allele_specific_hyperparameters import MAX_IC50 @@ -196,10 +197,32 @@ def load_allele_dicts( def encode_peptide_to_affinity_dict( peptide_to_affinity_dict, peptide_length=9, - flatten_binary_encoding=True): + flatten_binary_encoding=True, + allow_unknown_amino_acids=True): """ - Given a dictionary mapping from peptide 
sequences to affinity values, - returns tuple with the following fields: + Given a dictionary mapping from peptide sequences to affinity values, return + both index and binary encodings of fixed length peptides, and + a vector of their affinities. + + Parameters + ---------- + peptide_to_affinity_dict : dict + Keys are peptide strings (of multiple lengths), each mapping to a + continuous affinity value. + + peptide_length : int + Length of vector encoding + + flatten_binary_encoding : bool + Should the binary encoding of a peptide be two-dimensional (9x20) + or a flattened 1d vector + + allow_unknown_amino_acids : bool + When extending a short vector to the desired peptide length, should + we insert every possible amino acid or a designated character "X" + indicating an unknown amino acid. + + Returns tuple with the following fields: - kmer_peptides: fixed length peptide strings - original_peptides: variable length peptide strings - counts: how many fixed length peptides were made from this original @@ -208,24 +231,32 @@ def encode_peptide_to_affinity_dict( - Y: affinity values associated with original peptides """ raw_peptides = list(sorted(peptide_to_affinity_dict.keys())) - kmer_peptides, original_peptides, counts = \ - fixed_length_from_many_peptides( + X_index, kmer_peptides, original_peptides, counts = \ + fixed_length_index_encoding( peptides=raw_peptides, desired_length=peptide_length, start_offset_shorten=0, end_offset_shorten=0, start_offset_extend=0, - end_offset_extend=0) + end_offset_extend=0, + allow_unknown_amino_acids=allow_unknown_amino_acids) + n_samples = len(kmer_peptides) + assert n_samples == len(original_peptides), \ "Mismatch between # of samples (%d) and # of peptides (%d)" % ( n_samples, len(original_peptides)) assert n_samples == len(counts), \ "Mismatch between # of samples (%d) and # of counts (%d)" % ( n_samples, len(counts)) - - X_index = index_encoding(kmer_peptides, peptide_length) - X_binary = indices_to_hotshot_encoding(X_index, 
n_indices=20) + assert n_samples == len(X_index), \ + "Mismatch between # of sample (%d) and index feature vectors (%d)" % ( + n_samples, len(X_index)) + X_index = check_valid_index_encoding_array(X_index, allow_unknown_amino_acids) + n_indices = 20 + allow_unknown_amino_acids + X_binary = indices_to_hotshot_encoding( + X_index, + n_indices=n_indices) assert X_binary.shape[0] == X_index.shape[0], \ ("Mismatch between number of samples for index encoding (%d)" @@ -235,7 +266,7 @@ def encode_peptide_to_affinity_dict( if flatten_binary_encoding: # collapse 3D input into 2D matrix - n_binary_features = peptide_length * 20 + n_binary_features = peptide_length * n_indices X_binary = X_binary.reshape((n_samples, n_binary_features)) # easier to work with counts when they're an array instead of list @@ -260,7 +291,7 @@ def load_allele_datasets( peptide_column_name=None, peptide_length_column_name="peptide_length", ic50_column_name="meas", - only_human=True): + only_human=False): """ Loads an IEDB dataset, extracts "hot-shot" encoding of fixed length peptides and log-transforms the IC50 measurement. Returns dictionary mapping allele diff --git a/mhcflurry/peptide_encoding.py b/mhcflurry/peptide_encoding.py index d07a41b8407179e62e4d8aebd39784be4c62b6b6..59bfaed5a3a94a3b7eb742d2c058ffe74b6bba56 100644 --- a/mhcflurry/peptide_encoding.py +++ b/mhcflurry/peptide_encoding.py @@ -273,12 +273,16 @@ def fixed_length_index_encoding( refers to the position *before* the start of a peptide and, similarly, `end_offset_extend` = 0 refers to the position *after* the peptide. - Returns feature matrix X, a list of original peptides for each feature - vector, and a list of integer counts indicating how many rows share a - particular original peptide. When two rows are expanded out of a single - original peptide, they will both have a count of 2. These counts can - be useful for down-weighting the importance of multiple feature vectors - which originate from the same sample. 
def check_valid_index_encoding_array(X, allow_unknown_amino_acids=True):
    """
    Validate an index encoding of peptides and return it as a numpy array.

    Parameters
    ----------
    X : array-like
        Two-dimensional collection of amino acid indices. Anything accepted
        by `np.asarray` is allowed (nested lists, existing arrays, ...).

    allow_unknown_amino_acids : bool
        When True the largest accepted index is 20, otherwise it is 19.
        (Presumably index 20 stands for the unknown residue "X" -- confirm
        against the index encoding used by the callers.)

    Returns
    -------
    The input converted to a numpy array. A two-dimensional array without
    any elements is returned as-is since there is nothing to range-check.

    Raises
    ------
    ValueError
        If the input is not two-dimensional, or if it contains an index
        which is negative or larger than the maximum expected index.
    """
    X = np.asarray(X)
    if X.ndim != 2:
        raise ValueError("Expected 2d input, got array with shape %s" % (
            X.shape,))

    if X.size == 0:
        # max()/min() of a zero-size array raise an unrelated numpy error
        # ("zero-size array to reduction operation"), so stop here: an empty
        # encoding cannot contain an invalid index.
        return X

    max_expected_index = 20 if allow_unknown_amino_acids else 19
    # compute the extrema once instead of once for the test and once more
    # for the error message
    max_index = X.max()
    if max_index > max_expected_index:
        raise ValueError(
            "Got index %d in peptide encoding, max expected %d" % (
                max_index,
                max_expected_index))

    min_index = X.min()
    if min_index < 0:
        # a negative value would silently wrap around if it were later used
        # to index into an array, so reject it up front
        raise ValueError(
            "Got negative index %d in peptide encoding" % (min_index,))
    return X