Skip to content
Snippets Groups Projects
Commit 115bfdd5 authored by Alex Rubinsteyn
Browse files

fixed propagation of allow_unknown_amino_acids through dataset loading

parent cc224e7a
No related merge requests found
......@@ -34,7 +34,7 @@ from .serialization_helpers import (
load_keras_model_from_disk,
save_keras_model_to_disk
)
from .peptide_encoding import check_valid_index_encoding_array
from .class1_allele_specific_hyperparameters import MAX_IC50
_allele_predictor_cache = {}
......@@ -386,14 +386,7 @@ class Class1BindingPredictor(PredictorBase):
Given an encoded array of amino acid indices, returns a vector
of predicted log IC50 values.
"""
X = np.asarray(X)
if len(X.shape) != 2:
raise ValueError("Expected 2d input, got array with shape %s" % (
X.shape,))
max_expected_index = 20 if self.allow_unknown_amino_acids else 19
if X.max() > max_expected_index:
raise ValueError(
"Got index %d in peptide encoding, max expected %d" % (
X.max(),
max_expected_index))
X = check_valid_index_encoding_array(
X,
allow_unknown_amino_acids=self.allow_unknown_amino_acids)
return self.model.predict(X, verbose=False).flatten()
......@@ -26,7 +26,8 @@ from .common import normalize_allele_name
from .amino_acid import common_amino_acids
from .peptide_encoding import (
indices_to_hotshot_encoding,
fixed_length_from_many_peptides
fixed_length_index_encoding,
check_valid_index_encoding_array,
)
from .class1_allele_specific_hyperparameters import MAX_IC50
......@@ -196,10 +197,32 @@ def load_allele_dicts(
def encode_peptide_to_affinity_dict(
peptide_to_affinity_dict,
peptide_length=9,
flatten_binary_encoding=True):
flatten_binary_encoding=True,
allow_unknown_amino_acids=True):
"""
Given a dictionary mapping from peptide sequences to affinity values,
returns tuple with the following fields:
Given a dictionary mapping from peptide sequences to affinity values, return
both index and binary encodings of fixed length peptides, and
a vector of their affinities.
Parameters
----------
peptide_to_affinity_dict : dict
Keys are peptide strings (of multiple lengths), each mapping to a
continuous affinity value.
peptide_length : int
Length of vector encoding
flatten_binary_encoding : bool
Should the binary encoding of a peptide be two-dimensional (9x20)
or a flattened 1d vector
allow_unknown_amino_acids : bool
When extending a short vector to the desired peptide length, should
we insert every possible amino acid or a designated character "X"
indicating an unknown amino acid.
Returns tuple with the following fields:
- kmer_peptides: fixed length peptide strings
- original_peptides: variable length peptide strings
- counts: how many fixed length peptides were made from this original
......@@ -208,24 +231,32 @@ def encode_peptide_to_affinity_dict(
- Y: affinity values associated with original peptides
"""
raw_peptides = list(sorted(peptide_to_affinity_dict.keys()))
kmer_peptides, original_peptides, counts = \
fixed_length_from_many_peptides(
X_index, kmer_peptides, original_peptides, counts = \
fixed_length_index_encoding(
peptides=raw_peptides,
desired_length=peptide_length,
start_offset_shorten=0,
end_offset_shorten=0,
start_offset_extend=0,
end_offset_extend=0)
end_offset_extend=0,
allow_unknown_amino_acids=allow_unknown_amino_acids)
n_samples = len(kmer_peptides)
assert n_samples == len(original_peptides), \
"Mismatch between # of samples (%d) and # of peptides (%d)" % (
n_samples, len(original_peptides))
assert n_samples == len(counts), \
"Mismatch between # of samples (%d) and # of counts (%d)" % (
n_samples, len(counts))
X_index = index_encoding(kmer_peptides, peptide_length)
X_binary = indices_to_hotshot_encoding(X_index, n_indices=20)
assert n_samples == len(X_index), \
"Mismatch between # of sample (%d) and index feature vectors (%d)" % (
n_samples, len(X_index))
X_index = check_valid_index_encoding_array(X_index, allow_unknown_amino_acids)
n_indices = 20 + allow_unknown_amino_acids
X_binary = indices_to_hotshot_encoding(
X_index,
n_indices=n_indices)
assert X_binary.shape[0] == X_index.shape[0], \
("Mismatch between number of samples for index encoding (%d)"
......@@ -235,7 +266,7 @@ def encode_peptide_to_affinity_dict(
if flatten_binary_encoding:
# collapse 3D input into 2D matrix
n_binary_features = peptide_length * 20
n_binary_features = peptide_length * n_indices
X_binary = X_binary.reshape((n_samples, n_binary_features))
# easier to work with counts when they're an array instead of list
......@@ -260,7 +291,7 @@ def load_allele_datasets(
peptide_column_name=None,
peptide_length_column_name="peptide_length",
ic50_column_name="meas",
only_human=True):
only_human=False):
"""
Loads an IEDB dataset, extracts "hot-shot" encoding of fixed length peptides
and log-transforms the IC50 measurement. Returns dictionary mapping allele
......
......@@ -273,12 +273,16 @@ def fixed_length_index_encoding(
refers to the position *before* the start of a peptide and, similarly,
`end_offset_extend` = 0 refers to the position *after* the peptide.
Returns feature matrix X, a list of original peptides for each feature
vector, and a list of integer counts indicating how many rows share a
particular original peptide. When two rows are expanded out of a single
original peptide, they will both have a count of 2. These counts can
be useful for down-weighting the importance of multiple feature vectors
which originate from the same sample.
Returns tuple with the following fields:
- index encoded feature matrix X
- list of fixed length peptides
- list of "original" peptides of varying lengths
- list of integer counts indicating how many rows came from
that original peptide.
When two rows are expanded out of a single original peptide, they will both
have a count of 2. These counts can be useful for down-weighting the
importance of multiple feature vectors which originate from the same sample.
"""
if allow_unknown_amino_acids:
insert_letters = ["X"]
......@@ -295,5 +299,18 @@ def fixed_length_index_encoding(
start_offset_extend=start_offset_extend,
end_offset_extend=end_offset_extend,
insert_amino_acid_letters=insert_letters)
X = index_encoding(fixed_length, desired_length)
return X, original, counts
X_index = index_encoding(fixed_length, desired_length)
return X_index, fixed_length, original, counts
def check_valid_index_encoding_array(X, allow_unknown_amino_acids=True):
    """
    Validate an index-encoded array and return it as a numpy array.

    Parameters
    ----------
    X : array-like
        Two-dimensional collection of integer indices.

    allow_unknown_amino_acids : bool
        When True the largest permitted index is 20, otherwise 19
        (index 20 is presumably the code for the unknown residue "X";
        confirm against the index encoding used by callers).

    Returns
    -------
    numpy.ndarray
        `X` after conversion with `np.asarray`.

    Raises
    ------
    ValueError
        If `X` is not two-dimensional, or if it contains an index that is
        negative or larger than the maximum expected index.
    """
    X = np.asarray(X)
    if X.ndim != 2:
        raise ValueError("Expected 2d input, got array with shape %s" % (
            X.shape,))
    if X.size == 0:
        # An empty 2d array has no indices to range-check, and calling
        # max()/min() on it would raise an unrelated numpy error.
        return X
    max_expected_index = 20 if allow_unknown_amino_acids else 19
    max_index = X.max()
    if max_index > max_expected_index:
        raise ValueError(
            "Got index %d in peptide encoding, max expected %d" % (
                max_index,
                max_expected_index))
    min_index = X.min()
    if min_index < 0:
        # Negative values would wrap around under numpy indexing rather
        # than fail, so reject them here.
        raise ValueError(
            "Got negative index %d in peptide encoding" % (min_index,))
    return X
......@@ -67,7 +67,7 @@ class PredictorBase(object):
indices = []
encoded_matrices = []
for i, peptide in enumerate(peptides):
matrix, _, _ = fixed_length_index_encoding(
matrix, _, _, _ = fixed_length_index_encoding(
peptides=[peptide],
desired_length=9,
allow_unknown_amino_acids=self.allow_unknown_amino_acids)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment