Newer
Older
raise ValueError("Must specify 'allele' or 'alleles'.")
peptides = EncodableSequences.create(peptides)
df = pandas.DataFrame({
'peptide': peptides.sequences
}, copy=False)
if allele is not None:
if alleles is not None:
raise ValueError("Specify exactly one of allele or alleles")
df["allele"] = allele
normalized_allele = mhcnames.normalize_allele_name(allele)
df["normalized_allele"] = normalized_allele
unique_alleles = [normalized_allele]
else:
df["allele"] = numpy.array(alleles)
df["normalized_allele"] = df.allele.map(
mhcnames.normalize_allele_name)
if len(df) == 0:
# No predictions.
logging.warning("Predicting for 0 peptides.")
empty_result = pandas.DataFrame(
columns=[
'peptide',
'allele',
'prediction',
'prediction_low',
'prediction_high'
])
return empty_result
(min_peptide_length, max_peptide_length) = (
self.supported_peptide_lengths)
if (peptides.min_length < min_peptide_length or
peptides.max_length > max_peptide_length):
# Only compute this if needed
all_peptide_lengths_supported = False
sequence_length = df.peptide.str.len()
(sequence_length >= min_peptide_length) &
(sequence_length <= max_peptide_length))
if (~df.supported_peptide_length).any():
msg = (
"%d peptides have lengths outside of supported range [%d, %d]: "
"%s" % (
(~df.supported_peptide_length).sum(),
min_peptide_length,
max_peptide_length,
str(df.loc[~df.supported_peptide_length].peptide.unique())))
logging.warning(msg)
if throw:
raise ValueError(msg)
else:
# Handle common case efficiently.
df["supported_peptide_length"] = True
all_peptide_lengths_supported = True
num_pan_models = (
len(self.class1_pan_allele_models)
if not self.optimization_info.get("pan_models_merged", False)
else self.optimization_info["num_pan_models_merged"])
max_single_allele_models = max(
len(self.allele_to_allele_specific_models.get(allele, []))
for allele in unique_alleles
)
predictions_array = numpy.zeros(
shape=(df.shape[0], num_pan_models + max_single_allele_models),
dtype="float64")
predictions_array[:] = numpy.nan
unsupported_alleles = [
allele for allele in
df.normalized_allele.unique()
Tim O'Donnell
committed
if allele not in self.allele_to_sequence
"Supported alleles: %s" % (
" ".join(unsupported_alleles),
Tim O'Donnell
committed
" ".join(sorted(self.allele_to_sequence))))
mask = df.supported_peptide_length & (
~df.normalized_allele.isin(unsupported_alleles))
row_slice = slice(None, None, None) # all rows
masked_allele_encoding = AlleleEncoding(
masked_peptides = peptides
elif mask.sum() > 0:
row_slice = mask
masked_allele_encoding = AlleleEncoding(
df.loc[mask].normalized_allele,
borrow_from=master_allele_encoding)
masked_peptides = peptides.sequences[mask]
# The following line is a performance optimization that may be
# revisited. It causes the neural network to be set to include
# only the alleles actually being predicted for. This makes
# the network much smaller. However, subsequent calls to
# predict will need to reset these weights, so there is a
# tradeoff.
masked_allele_encoding = masked_allele_encoding.compact()
if self.optimization_info.get("pan_models_merged"):
# Multiple pan-allele models have been merged into one
# at the tensorflow level.
assert len(self.class1_pan_allele_models) == 1
predictions = self.class1_pan_allele_models[0].predict(
predictions_array[row_slice, :num_pan_models] = predictions
else:
for (i, model) in enumerate(self.class1_pan_allele_models):
predictions_array[row_slice, i] = model.predict(
masked_peptides,
allele_encoding=masked_allele_encoding,
**model_kwargs)
if not self.allele_to_allele_specific_models.get(allele)
]
if unsupported_alleles:
msg = (
"No single-allele models for allele(s): %s.\n"
"Supported alleles are: %s" % (
mask = None
else:
mask = (
(df.normalized_allele == allele) &
df.supported_peptide_length).values
peptides_for_allele = peptides
row_slice = slice(None, None, None)
peptides_for_allele = EncodableSequences.create(
for (i, model) in enumerate(models):
predictions_array[
num_pan_models + i,
if callable(centrality_measure):
centrality_function = centrality_measure
else:
centrality_function = CENTRALITY_MEASURES[centrality_measure]
logs = numpy.log(predictions_array)
log_centers = centrality_function(logs)
df["prediction"] = numpy.exp(log_centers)
df["prediction_low"] = numpy.exp(
numpy.nanpercentile(logs, 5.0, axis=1))
df["prediction_high"] = numpy.exp(
numpy.nanpercentile(logs, 95.0, axis=1))
for i in range(num_pan_models):
df["model_pan_%d" % i] = predictions_array[:, i]
for i in range(max_single_allele_models):
df["model_single_%d" % i] = predictions_array[
:, num_pan_models + i
]
if include_percentile_ranks:
if self.allele_to_percent_rank_transform:
df["prediction_percentile"] = self.percentile_ranks(
df.prediction,
alleles=df.normalized_allele.values,
throw=throw)
else:
warnings.warn("No percentile rank information available.")
del df["supported_peptide_length"]
del df["normalized_allele"]
return df
Save the model weights to the given filename using numpy's ".npz"
format.
numpy.savez(
filename,
**dict((("array_%d" % i), w) for (i, w) in enumerate(weights_list)))
Restore model weights from the given filename, which should have been
created with `save_weights`.
with numpy.load(filename) as loaded:
weights = [
loaded["array_%d" % i]
for i in range(len(loaded.keys()))
]
def calibrate_percentile_ranks(
self,
peptides=None,
num_peptides_per_length=int(1e5),
alleles=None,
summary_top_peptide_fractions=[0.001],
"""
Compute the cumulative distribution of ic50 values for a set of alleles
over a large universe of random peptides, to enable taking quantiles
of this distribution later.
Parameters
----------
peptides : sequence of string or EncodableSequences, optional
Peptides to use
num_peptides_per_length : int, optional
If peptides argument is not specified, then num_peptides_per_length
peptides are randomly sampled from a uniform distribution for each
supported length
alleles : sequence of string, optional
Alleles to perform calibration for. If not specified all supported
alleles will be calibrated.
bins : object
Anything that can be passed to numpy.histogram's "bins" argument
can be used here, i.e. either an integer or a sequence giving bin
edges. This is in ic50 space.
motif_summary : bool
If True, the length distribution and per-position amino acid
frequencies are also calculated for the top x fraction of tightest-
binding peptides, where each value of x is given in the
summary_top_peptide_fractions list.
summary_top_peptide_fractions : list of float
Only used if motif_summary is True
verbose : boolean
Whether to print status updates to stdout
model_kwargs : dict
Additional low-level Class1NeuralNetwork.predict() kwargs.
Returns
-------
dict
If motif_summary is True, this will have keys "frequency_matrices" and
"length_distributions". Otherwise it will be empty.
"""
if bins is None:
bins = to_ic50(numpy.linspace(1, 0, 1000))
if alleles is None:
alleles = self.supported_alleles
if peptides is None:
peptides = []
lengths = range(
self.supported_peptide_lengths[0],
self.supported_peptide_lengths[1] + 1)
for length in lengths:
peptides.extend(
random_peptides(num_peptides_per_length, length))
encoded_peptides = EncodableSequences.create(peptides)
if motif_summary:
frequency_matrices = []
length_distributions = []
else:
frequency_matrices = None
length_distributions = None
predictions = self.predict(
encoded_peptides, allele=allele, model_kwargs=model_kwargs)
if verbose:
elapsed = time.time() - start
print(
"Generated %d predictions for allele %s in %0.2f sec: "
"%0.2f predictions / sec" % (
len(encoded_peptides.sequences),
allele,
elapsed,
len(encoded_peptides.sequences) / elapsed))
transform = PercentRankTransform()
transform.fit(predictions, bins=bins)
self.allele_to_percent_rank_transform[allele] = transform
if frequency_matrices is not None:
predictions_df = pandas.DataFrame({
'peptide': encoded_peptides.sequences,
'prediction': predictions
}).drop_duplicates('peptide').set_index("peptide")
predictions_df["length"] = predictions_df.index.str.len()
for (length, sub_df) in predictions_df.groupby("length"):
for cutoff_fraction in summary_top_peptide_fractions:
selected = sub_df.prediction.nsmallest(
max(
int(len(sub_df) * cutoff_fraction),
1)).index.values
matrix = positional_frequency_matrix(selected).reset_index()
original_columns = list(matrix.columns)
matrix["allele"] = allele
matrix["length"] = length
matrix["cutoff_fraction"] = cutoff_fraction
matrix["cutoff_count"] = len(selected)
matrix = matrix[
["allele", "length", "cutoff_fraction", "cutoff_count"]
+ original_columns
]
frequency_matrices.append(matrix)
# Length distribution of the tightest-binding peptides for this allele.
for cutoff_fraction in summary_top_peptide_fractions:
cutoff_count = max(
int(len(predictions_df) * cutoff_fraction), 1)
length_distribution = predictions_df.prediction.nsmallest(
cutoff_count).index.str.len().value_counts()
length_distribution.index.name = "length"
length_distribution /= length_distribution.sum()
length_distribution = length_distribution.to_frame()
length_distribution.columns = ["fraction"]
length_distribution = length_distribution.reset_index()
length_distribution["allele"] = allele
length_distribution["cutoff_fraction"] = cutoff_fraction
length_distribution["cutoff_count"] = cutoff_count
length_distribution = length_distribution[[
"allele",
"cutoff_fraction",
"cutoff_count",
"length",
"fraction"
]].sort_values(["cutoff_fraction", "length"])
length_distributions.append(length_distribution)
if frequency_matrices is not None:
frequency_matrices = pandas.concat(
frequency_matrices, ignore_index=True)
if length_distributions is not None:
length_distributions = pandas.concat(
length_distributions, ignore_index=True)
if motif_summary:
return {
'frequency_matrices': frequency_matrices,
'length_distributions': length_distributions,
}
def model_select(
        self,
        score_function,
        alleles=None,
        min_models=1,
        max_models=10000):
    """
    Perform model selection using a user-specified scoring function.

    This works only with allele-specific models, not pan-allele models.

    Model selection is done using a "step up" variable selection procedure,
    in which models are repeatedly added to an ensemble until the score
    stops improving.

    Parameters
    ----------
    score_function : Class1AffinityPredictor -> float function
        Scoring function. Higher scores are treated as better.

    alleles : list of string, optional
        If not specified, model selection is performed for all alleles.

    min_models : int, optional
        Min models to select per allele

    max_models : int, optional
        Max models to select per allele

    Returns
    -------
    Class1AffinityPredictor : predictor containing the selected models. The
        per-round scores are attached as the "model_selection" metadata
        dataframe.
    """
    if alleles is None:
        alleles = self.supported_alleles

    dfs = []
    allele_to_allele_specific_models = {}
    for allele in alleles:
        df = pandas.DataFrame({
            'model': self.allele_to_allele_specific_models[allele]
        })
        df["model_num"] = df.index
        df["allele"] = allele
        df["selected"] = False

        round_num = 1
        while not df.selected.all() and df.selected.sum() < max_models:
            # NOTE: "%2d" space-pads the round number (e.g. "score_ 1").
            # Kept unchanged because these column names are part of the
            # "model_selection" metadata dataframe returned to callers.
            score_col = "score_%2d" % round_num
            prev_score_col = "score_%2d" % (round_num - 1)

            existing_selected = list(df.loc[df.selected, "model"])

            # Score each not-yet-selected model as if it were added to the
            # current ensemble; already-selected models get NaN so they
            # can never be picked again.
            df[score_col] = [
                numpy.nan if selected else score_function(
                    Class1AffinityPredictor(
                        allele_to_allele_specific_models={
                            allele: [model] + existing_selected
                        }))
                for (model, selected) in zip(df["model"], df["selected"])
            ]

            # Stop once the minimum number of models has been selected and
            # the best candidate ensemble is worse than the previous
            # round's best. Round 1 has no previous score column, so it is
            # never compared (this also guards min_models < 1).
            if round_num > max(min_models, 1) and (
                    df[score_col].max() < df[prev_score_col].max()):
                break

            # In case of a tie, pick a model at random.
            (best_model_index,) = df.loc[
                (df[score_col] == df[score_col].max())
            ].sample(1).index
            df.loc[best_model_index, "selected"] = True
            round_num += 1

        dfs.append(df)
        allele_to_allele_specific_models[allele] = list(
            df.loc[df.selected, "model"])

    df = pandas.concat(dfs, ignore_index=True)

    new_predictor = Class1AffinityPredictor(
        allele_to_allele_specific_models,
        metadata_dataframes={
            "model_selection": df,
        })
    return new_predictor