Newer
Older
df["prediction_low"] = numpy.exp(numpy.percentile(logs, 5.0, axis=1))
df["prediction_high"] = numpy.exp(numpy.percentile(logs, 95.0, axis=1))
for i in range(num_pan_models):
df["model_pan_%d" % i] = predictions_array[:, i]
for i in range(max_single_allele_models):
df["model_single_%d" % i] = predictions_array[
:, num_pan_models + i
]
if include_percentile_ranks:
if self.allele_to_percent_rank_transform:
df["prediction_percentile"] = self.percentile_ranks(
df.prediction,
alleles=df.normalized_allele.values,
throw=throw)
else:
warnings.warn("No percentile rank information available.")
del df["supported_peptide_length"]
del df["normalized_allele"]
return df
Save the model weights to the given filename using numpy's ".npz"
format.
numpy.savez(
filename,
**dict((("array_%d" % i), w) for (i, w) in enumerate(weights_list)))
Restore model weights from the given filename, which should have been
created with `save_weights`.
loaded = numpy.load(filename)
weights = [
loaded["array_%d" % i]
for i in range(len(loaded.keys()))
]
loaded.close()
return weights
def calibrate_percentile_ranks(
self,
peptides=None,
num_peptides_per_length=int(1e5),
alleles=None,
bins=None):
"""
Compute the cumulative distribution of ic50 values for a set of alleles
over a large universe of random peptides, to enable computing quantiles in
this distribution later.
Parameters
----------
peptides : sequence of string or EncodableSequences, optional
Peptides to use
num_peptides_per_length : int, optional
If peptides argument is not specified, then num_peptides_per_length
peptides are randomly sampled from a uniform distribution for each
supported length
alleles : sequence of string, optional
Alleles to perform calibration for. If not specified all supported
alleles will be calibrated.
bins : object
Anything that can be passed to numpy.histogram's "bins" argument
can be used here, i.e. either an integer or a sequence giving bin
edges. This is in ic50 space.
Returns
----------
EncodableSequences : peptides used for calibration
"""
if bins is None:
bins = to_ic50(numpy.linspace(1, 0, 1000))
if alleles is None:
alleles = self.supported_alleles
if peptides is None:
peptides = []
lengths = range(
self.supported_peptide_lengths[0],
self.supported_peptide_lengths[1] + 1)
for length in lengths:
peptides.extend(
random_peptides(num_peptides_per_length, length))
encoded_peptides = EncodableSequences.create(peptides)
for (i, allele) in enumerate(alleles):
predictions = self.predict(encoded_peptides, allele=allele)
transform = PercentRankTransform()
transform.fit(predictions, bins=bins)
self.allele_to_percent_rank_transform[allele] = transform
return encoded_peptides
def filter_networks(self, predicate):
"""
Return a new Class1AffinityPredictor containing a subset of this
predictor's neural networks.
Parameters
----------
predicate : Class1NeuralNetwork -> boolean
Function specifying which neural networks to include
Returns
-------
Class1AffinityPredictor
"""
allele_to_allele_specific_models = {}
for (allele, models) in self.allele_to_allele_specific_models.items():
allele_to_allele_specific_models[allele] = [
m for m in models if predicate(m)
]
class1_pan_allele_models = [
m for m in self.class1_pan_allele_models if predicate(m)
]
return Class1AffinityPredictor(
allele_to_allele_specific_models=allele_to_allele_specific_models,
class1_pan_allele_models=class1_pan_allele_models,
allele_to_fixed_length_sequence=self.allele_to_fixed_length_sequence,
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
)
def model_select(
self,
score_function,
alleles=None,
min_models=1,
max_models=10000):
"""
Perform model selection using a user-specified scoring function.
Model selection is done using a "step up" variable selection procedure,
in which models are repeatedly added to an ensemble until the score
stops improving.
Parameters
----------
score_function : Class1AffinityPredictor -> float function
Scoring function
alleles : list of string, optional
If not specified, model selection is performed for all alleles.
min_models : int, optional
Min models to select per allele
max_models : int, optional
Max models to select per allele
Returns
-------
Class1AffinityPredictor : predictor containing the selected models
"""
if alleles is None:
alleles = self.supported_alleles
dfs = []
allele_to_allele_specific_models = {}
for allele in alleles:
df = pandas.DataFrame({
'model': self.allele_to_allele_specific_models[allele]
})
df["model_num"] = df.index
df["allele"] = allele
df["selected"] = False
round_num = 1
while not df.selected.all() and sum(df.selected) < max_models:
score_col = "score_%2d" % round_num
prev_score_col = "score_%2d" % (round_num - 1)
existing_selected = list(df[df.selected].model)
df[score_col] = [
numpy.nan if row.selected else
score_function(
Class1AffinityPredictor(
allele_to_allele_specific_models={
allele: [row.model] + existing_selected
}))
for (_, row) in df.iterrows()
]
if round_num > min_models and (
df[score_col].max() < df[prev_score_col].max()):
break
# In case of a tie, pick a model at random.
(best_model_index,) = df.loc[
(df[score_col] == df[score_col].max())
].sample(1).index
df.loc[best_model_index, "selected"] = True
round_num += 1
dfs.append(df)
print("Selected %d models for allele %s" % (
df.selected.sum(), allele))
allele_to_allele_specific_models[allele] = list(
df.loc[df.selected].model)
df = pandas.concat(dfs, ignore_index=True)
new_predictor = Class1AffinityPredictor(
allele_to_allele_specific_models,
metadata_dataframes={
"model_selection": df,
})
return new_predictor