Newer
Older
Tim O'Donnell
committed
def __init__(self, alleles=None, allele_to_sequence=None, borrow_from=None):
A place to cache encodings for a sequence of alleles.
We frequently work with alleles by integer indices, for example as
inputs to neural networks. This class is used to map allele names to
integer indices in a consistent way by keeping track of the universe
of alleles under use, i.e. a distinction is made between the universe
of supported alleles (what's in `allele_to_sequence`) and the actual
set of alleles used (what's in `alleles`).
Parameters
----------
alleles : list of string
Allele names. If any allele is None instead of string, it will be
mapped to the special index value -1.
Tim O'Donnell
committed
allele_to_sequence : dict of str -> str
Allele name to amino acid sequence
borrow_from : AlleleEncoding, optional
If specified, do not specify allele_to_sequence. The sequences from
the provided instance are used. This guarantees that the mappings
from allele to index and from allele to sequence are the same
between the instances.
Tim O'Donnell
committed
if alleles is not None:
alleles = pandas.Series(alleles)
self.borrow_from = borrow_from
self.allele_to_sequence = allele_to_sequence
if self.borrow_from is None:
assert allele_to_sequence is not None
all_alleles = (
Tim O'Donnell
committed
self.allele_to_index = dict(
(allele, i)
for (i, allele) in enumerate(all_alleles))
Tim O'Donnell
committed
unpadded = pandas.Series(
[allele_to_sequence[a] for a in all_alleles],
index=all_alleles)
self.sequences = unpadded.str.pad(
unpadded.str.len().max(), fillchar="X")
else:
assert allele_to_sequence is None
self.allele_to_index = borrow_from.allele_to_index
self.sequences = borrow_from.sequences
self.allele_to_sequence = borrow_from.allele_to_sequence
Tim O'Donnell
committed
if alleles is not None:
assert all(
"Missing alleles: " + " ".join(set(
a for a in alleles if a not in self.allele_to_index))
Tim O'Donnell
committed
self.indices = alleles.map(self.allele_to_index)
assert not self.indices.isnull().any()
Tim O'Donnell
committed
self.indices = None
"""
Return a new AlleleEncoding in which the universe of supported alleles
is only the alleles actually used.
Returns
-------
AlleleEncoding
"""
return AlleleEncoding(
alleles=self.alleles,
allele_to_sequence=dict(
(allele, self.allele_to_sequence[allele])
for allele in self.alleles.unique()))
def allele_representations(self, encoding_name):
"""
Encode the universe of supported allele sequences to a matrix.
Parameters
----------
encoding_name : string
How to represent amino acids. Valid names are "BLOSUM62" or
"one-hot". See `amino_acid.ENCODING_DATA_FRAMES`.
Returns
-------
numpy.array of shape
(num alleles in universe, sequence length, vector size)
where vector size is usually 21 (20 amino acids + X character)
"""
Tim O'Donnell
committed
if self.borrow_from is not None:
return self.borrow_from.allele_representations(encoding_name)
Tim O'Donnell
committed
cache_key = (
"allele_representations",
Tim O'Donnell
committed
if cache_key not in self.encoding_cache:
index_encoded_matrix = amino_acid.index_encoding(
self.sequences.values,
amino_acid.AMINO_ACID_INDEX)
vector_encoded = amino_acid.fixed_vectors_encoding(
index_encoded_matrix,
amino_acid.ENCODING_DATA_FRAMES[encoding_name])
Tim O'Donnell
committed
self.encoding_cache[cache_key] = vector_encoded
return self.encoding_cache[cache_key]
def fixed_length_vector_encoded_sequences(self, encoding_name):
Encode allele sequences (not the universe of alleles) to a matrix.
encoding_name : string
How to represent amino acids. Valid names are "BLOSUM62" or
"one-hot". See `amino_acid.ENCODING_DATA_FRAMES`.
numpy.array with shape:
(num alleles, sequence length, vector size)
where vector size is usually 21 (20 amino acids + X character)
"""
cache_key = (
"fixed_length_vector_encoding",
if cache_key not in self.encoding_cache:
vector_encoded = self.allele_representations(encoding_name)
Tim O'Donnell
committed
result = vector_encoded[self.indices]
self.encoding_cache[cache_key] = result
return self.encoding_cache[cache_key]
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
class MultipleAlleleEncoding(object):
    """
    Allele encoding for experiments that each involve several alleles.

    Every experiment's allele list is right-padded with None up to
    `max_alleles_per_experiment`, the padded lists are concatenated in the
    order given by `experiment_names`, and the flat result is handed to a
    single `AlleleEncoding`, which is stored as `self.allele_encoding`.
    """

    def __init__(
            self,
            experiment_names,
            experiment_to_allele_list,
            max_alleles_per_experiment=6,
            allele_to_sequence=None,
            borrow_from=None):
        """
        Parameters
        ----------
        experiment_names : iterable
            Experiment names, one per row of `indices`. Every name must be
            a key of `experiment_to_allele_list` (KeyError otherwise).
        experiment_to_allele_list : dict of experiment name -> list
            Alleles for each experiment. Values must be lists because they
            are concatenated with a list of None padding. Every entry is
            checked, including ones not named in `experiment_names`: each
            list must be non-empty and no longer than
            `max_alleles_per_experiment` (AssertionError otherwise).
        max_alleles_per_experiment : int
            Length every allele list is padded to.
        allele_to_sequence : dict, optional
            Forwarded unchanged to `AlleleEncoding`.
        borrow_from : optional
            Forwarded unchanged to `AlleleEncoding`.
        """
        # Validate and pad all experiments up front, before any lookup by
        # name, so a bad entry is reported even if it is never referenced.
        padded_by_experiment = {}
        for (experiment, allele_list) in experiment_to_allele_list.items():
            num_alleles = len(allele_list)
            assert num_alleles > 0
            assert num_alleles <= max_alleles_per_experiment
            padding = [None] * (max_alleles_per_experiment - num_alleles)
            padded_by_experiment[experiment] = allele_list + padding

        # One fixed-size group of entries per experiment, in the caller's
        # order; `indices` relies on this layout to reshape into rows.
        flat_alleles = [
            allele
            for experiment in experiment_names
            for allele in padded_by_experiment[experiment]
        ]

        self.allele_encoding = AlleleEncoding(
            alleles=flat_alleles,
            allele_to_sequence=allele_to_sequence,
            borrow_from=borrow_from)
        self.max_alleles_per_experiment = max_alleles_per_experiment

    @property
    def indices(self):
        """
        Allele indices arranged as one row per experiment.

        Returns
        -------
        The `.values` of the wrapped encoding's `indices`, reshaped to
        (-1, max_alleles_per_experiment).
        """
        # NOTE(review): assumes allele_encoding.indices exposes `.values`
        # (e.g. a pandas.Series); confirm against AlleleEncoding.
        flat_indices = self.allele_encoding.indices.values
        return flat_indices.reshape((-1, self.max_alleles_per_experiment))

    def compact(self):
        """
        Return a shallow copy of this object whose wrapped encoding has been
        replaced by the result of calling `compact()` on it.

        The original instance is left unmodified.
        """
        compacted = copy(self)
        compacted.allele_encoding = self.allele_encoding.compact()
        return compacted

    def allele_representations(self, encoding_name):
        """
        Delegate to `allele_representations` of the wrapped encoding.

        Parameters
        ----------
        encoding_name : passed through unchanged
        """
        wrapped = self.allele_encoding
        return wrapped.allele_representations(encoding_name)

    @property
    def allele_to_sequence(self):
        """
        The `allele_to_sequence` attribute of the wrapped encoding.
        """
        wrapped = self.allele_encoding
        return wrapped.allele_to_sequence

    def fixed_length_vector_encoded_sequences(self, encoding_name):
        """
        Not supported for multiple-allele encodings.

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError()