Newer
Older
Tim O'Donnell
committed
master_allele_encoding = self.get_master_allele_encoding()
unsupported_alleles = [
allele for allele in
df.normalized_allele.unique()
Tim O'Donnell
committed
if allele not in self.allele_to_sequence
"Supported alleles: %s" % (
" ".join(unsupported_alleles),
Tim O'Donnell
committed
" ".join(sorted(self.allele_to_sequence))))
mask = df.supported_peptide_length & (
~df.normalized_allele.isin(unsupported_alleles))
if mask is None or mask.all():
# Common case optimization
allele_encoding = AlleleEncoding(
df.normalized_allele,
borrow_from=master_allele_encoding)
# The following line is a performance optimization that may be
# revisited. It causes the neural network to be set to include
# only the alleles actually being predicted for. This makes
# the network much smaller. However, subsequent calls to
# predict will need to reset these weights, so there is a
# tradeoff.
allele_encoding = allele_encoding.compact()
for (i, model) in enumerate(self.class1_pan_allele_models):
predictions_array[:, i] = (
model.predict(
peptides,
allele_encoding=allele_encoding,
**model_kwargs))
elif mask.sum() > 0:
masked_allele_encoding = AlleleEncoding(
df.loc[mask].normalized_allele,
Tim O'Donnell
committed
borrow_from=master_allele_encoding)
# See above performance note.
masked_allele_encoding = masked_allele_encoding.compact()
masked_peptides = peptides.sequences[mask]
for (i, model) in enumerate(self.class1_pan_allele_models):
predictions_array[mask, i] = model.predict(
if not self.allele_to_allele_specific_models.get(allele)
]
if unsupported_alleles:
msg = (
"No single-allele models for allele(s): %s.\n"
"Supported alleles are: %s" % (
mask = None
else:
mask = (
(df.normalized_allele == allele) &
df.supported_peptide_length).values
if mask is None or mask.all():
# Common case optimization
for (i, model) in enumerate(models):
predictions_array[:, num_pan_models + i] = (
peptides_for_allele = EncodableSequences.create(
df.ix[mask].peptide.values)
for (i, model) in enumerate(models):
predictions_array[
mask,
num_pan_models + i,
if callable(centrality_measure):
centrality_function = centrality_measure
else:
centrality_function = CENTRALITY_MEASURES[centrality_measure]
logs = numpy.log(predictions_array)
log_centers = centrality_function(logs)
df["prediction"] = numpy.exp(log_centers)
Tim O'Donnell
committed
df["prediction_low"] = numpy.exp(numpy.nanpercentile(logs, 5.0, axis=1))
df["prediction_high"] = numpy.exp(numpy.nanpercentile(logs, 95.0, axis=1))
for i in range(num_pan_models):
df["model_pan_%d" % i] = predictions_array[:, i]
for i in range(max_single_allele_models):
df["model_single_%d" % i] = predictions_array[
:, num_pan_models + i
]
if include_percentile_ranks:
if self.allele_to_percent_rank_transform:
df["prediction_percentile"] = self.percentile_ranks(
df.prediction,
alleles=df.normalized_allele.values,
throw=throw)
else:
warnings.warn("No percentile rank information available.")
del df["supported_peptide_length"]
del df["normalized_allele"]
return df
Save the model weights to the given filename using numpy's ".npz"
format.
numpy.savez(
filename,
**dict((("array_%d" % i), w) for (i, w) in enumerate(weights_list)))
Restore model weights from the given filename, which should have been
created with `save_weights`.
with numpy.load(filename) as loaded:
weights = [
loaded["array_%d" % i]
for i in range(len(loaded.keys()))
]
def calibrate_percentile_ranks(
self,
peptides=None,
num_peptides_per_length=int(1e5),
alleles=None,
summary_top_peptide_fractions=[0.001],
"""
Compute the cumulative distribution of ic50 values for a set of alleles
over a large universe of random peptides, to enable computing quantiles in
this distribution later.
Parameters
----------
peptides : sequence of string or EncodableSequences, optional
Peptides to use
num_peptides_per_length : int, optional
If peptides argument is not specified, then num_peptides_per_length
peptides are randomly sampled from a uniform distribution for each
supported length
alleles : sequence of string, optional
Alleles to perform calibration for. If not specified all supported
alleles will be calibrated.
bins : object
Anything that can be passed to numpy.histogram's "bins" argument
can be used here, i.e. either an integer or a sequence giving bin
edges. This is in ic50 space.
Returns
----------
EncodableSequences : peptides used for calibration
"""
if bins is None:
bins = to_ic50(numpy.linspace(1, 0, 1000))
if alleles is None:
alleles = self.supported_alleles
if peptides is None:
peptides = []
lengths = range(
self.supported_peptide_lengths[0],
self.supported_peptide_lengths[1] + 1)
for length in lengths:
peptides.extend(
random_peptides(num_peptides_per_length, length))
encoded_peptides = EncodableSequences.create(peptides)
if motif_summary:
frequency_matrices = []
length_distributions = []
else:
frequency_matrices = None
length_distributions = None
for (i, allele) in enumerate(alleles):
predictions = self.predict(
encoded_peptides, allele=allele, model_kwargs=model_kwargs)
if verbose:
elapsed = time.time() - start
print(
"Generated %d predictions for allele %s in %0.2f sec: "
"%0.2f predictions / sec" % (
len(encoded_peptides.sequences),
allele,
elapsed,
len(encoded_peptides.sequences) / elapsed))
transform = PercentRankTransform()
transform.fit(predictions, bins=bins)
self.allele_to_percent_rank_transform[allele] = transform
if frequency_matrices is not None:
predictions_df = pandas.DataFrame({
'peptide': encoded_peptides.sequences,
'prediction': predictions
}).drop_duplicates('peptide').set_index("peptide")
predictions_df["length"] = predictions_df.index.str.len()
for (length, sub_df) in predictions_df.groupby("length"):
for cutoff_fraction in summary_top_peptide_fractions:
selected = sub_df.prediction.nsmallest(
max(
int(len(sub_df) * cutoff_fraction),
1)).index.values
matrix = positional_frequency_matrix(selected).reset_index()
original_columns = list(matrix.columns)
matrix["allele"] = allele
matrix["length"] = length
matrix["cutoff_fraction"] = cutoff_fraction
matrix["cutoff_count"] = len(selected)
matrix = matrix[
["allele", "length", "cutoff_fraction", "cutoff_count"]
+ original_columns
]
frequency_matrices.append(matrix)
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
for cutoff_fraction in summary_top_peptide_fractions:
cutoff_count = max(
int(len(predictions_df) * cutoff_fraction), 1)
length_distribution = predictions_df.prediction.nsmallest(
cutoff_count).index.str.len().value_counts()
length_distribution.index.name = "length"
length_distribution /= length_distribution.sum()
length_distribution = length_distribution.to_frame()
length_distribution.columns = ["fraction"]
length_distribution = length_distribution.reset_index()
length_distribution["allele"] = allele
length_distribution["cutoff_fraction"] = cutoff_fraction
length_distribution["cutoff_count"] = cutoff_count
length_distribution = length_distribution[[
"allele",
"cutoff_fraction",
"cutoff_count",
"length",
"fraction"
]].sort_values(["cutoff_fraction", "length"])
length_distributions.append(length_distribution)
if frequency_matrices is not None:
frequency_matrices = pandas.concat(
frequency_matrices, ignore_index=True)
if length_distributions is not None:
length_distributions = pandas.concat(
length_distributions, ignore_index=True)
if motif_summary:
return {
'frequency_matrices': frequency_matrices,
'length_distributions': length_distributions,
}
def filter_networks(self, predicate):
    """
    Build a new Class1AffinityPredictor holding only the neural networks
    of this predictor that are accepted by `predicate`.

    Parameters
    ----------
    predicate : Class1NeuralNetwork -> boolean
        Applied to each allele-specific and pan-allele network; networks
        for which it returns a true value are kept.

    Returns
    -------
    Class1AffinityPredictor
    """
    # Every allele keeps its entry, even if none of its networks pass.
    kept_allele_specific = {
        allele: [network for network in networks if predicate(network)]
        for (allele, networks)
        in self.allele_to_allele_specific_models.items()
    }
    kept_pan_allele = [
        network
        for network in self.class1_pan_allele_models
        if predicate(network)
    ]
    return Class1AffinityPredictor(
        allele_to_allele_specific_models=kept_allele_specific,
        class1_pan_allele_models=kept_pan_allele,
        allele_to_sequence=self.allele_to_sequence,
    )
def model_select(
        self,
        score_function,
        alleles=None,
        min_models=1,
        max_models=10000):
    """
    Perform model selection using a user-specified scoring function.

    Model selection is done using a "step up" variable selection procedure,
    in which models are repeatedly added to an ensemble until the score
    stops improving. In each round every not-yet-selected model is scored
    together with the models selected so far, and the candidate with the
    highest score is added (ties are broken at random).

    Parameters
    ----------
    score_function : Class1AffinityPredictor -> float function
        Scoring function. Higher scores are treated as better.
    alleles : list of string, optional
        If not specified, model selection is performed for all alleles.
    min_models : int, optional
        Min models to select per allele
    max_models : int, optional
        Max models to select per allele

    Returns
    -------
    Class1AffinityPredictor : predictor containing the selected models.
        The per-round scores are passed to it as the "model_selection"
        metadata dataframe.
    """
    if alleles is None:
        alleles = self.supported_alleles

    dfs = []
    allele_to_allele_specific_models = {}
    for allele in alleles:
        df = pandas.DataFrame({
            'model': self.allele_to_allele_specific_models[allele]
        })
        df["model_num"] = df.index
        df["allele"] = allele
        df["selected"] = False

        round_num = 1
        while not df.selected.all() and df.selected.sum() < max_models:
            score_col = "score_%2d" % round_num
            prev_score_col = "score_%2d" % (round_num - 1)

            # Score each remaining candidate as part of the ensemble made
            # of itself plus everything already selected.
            existing_selected = list(df[df.selected].model)
            df[score_col] = [
                numpy.nan if row.selected else
                score_function(
                    Class1AffinityPredictor(
                        allele_to_allele_specific_models={
                            allele: [row.model] + existing_selected
                        }))
                for (_, row) in df.iterrows()
            ]

            # Stop once the best achievable score got worse than in the
            # previous round. A previous round only exists from round 2
            # on; without the max(..., 1) guard, min_models < 1 looked up
            # a non-existent score column in round 1 and raised KeyError.
            if round_num > max(min_models, 1) and (
                    df[score_col].max() < df[prev_score_col].max()):
                break

            # In case of a tie, pick a model at random.
            (best_model_index,) = df.loc[
                (df[score_col] == df[score_col].max())
            ].sample(1).index
            df.loc[best_model_index, "selected"] = True
            round_num += 1

        dfs.append(df)
        allele_to_allele_specific_models[allele] = list(
            df.loc[df.selected].model)

    df = pandas.concat(dfs, ignore_index=True)

    new_predictor = Class1AffinityPredictor(
        allele_to_allele_specific_models,
        metadata_dataframes={
            "model_selection": df,
        })
    return new_predictor