# class1_affinity_predictor.py (10.96 KiB) -- authored by Tim O'Donnell
import collections
import time
import hashlib
import json
from os.path import join, exists
from six import string_types
import numpy
import pandas
import mhcnames
from ..encodable_sequences import EncodableSequences
from .class1_neural_network import Class1NeuralNetwork
class Class1AffinityPredictor(object):
    """
    Ensemble predictor built from allele-specific and/or pan-allele models.

    The predictor keeps three pieces of state in sync:

    * ``allele_to_allele_specific_models``: dict mapping an allele name to a
      list of models trained for that allele.
    * ``class1_pan_allele_models``: list of models that take an allele
      pseudosequence as an additional input.
    * ``manifest_df``: DataFrame with one row per model and the columns
      ``model_name``, ``allele``, ``config_json`` and ``model``.
    """

    def __init__(
            self,
            allele_to_allele_specific_models=None,
            class1_pan_allele_models=None,
            allele_to_pseudosequence=None,
            manifest_df=None):
        """
        Parameters
        ----------
        allele_to_allele_specific_models : dict of string -> list of models
            Defaults to an empty dict.

        class1_pan_allele_models : list of models
            Defaults to an empty list.

        allele_to_pseudosequence : mapping of allele -> pseudosequence
            Required (asserted) whenever pan-allele models are given.

        manifest_df : pandas.DataFrame, optional
            One row per model. When omitted an empty manifest with the
            expected columns is created.
        """
        if allele_to_allele_specific_models is None:
            allele_to_allele_specific_models = {}
        if class1_pan_allele_models is None:
            class1_pan_allele_models = []

        if class1_pan_allele_models:
            assert allele_to_pseudosequence, "Pseudosequences required"

        self.allele_to_allele_specific_models = (
            allele_to_allele_specific_models)
        self.class1_pan_allele_models = class1_pan_allele_models
        self.allele_to_pseudosequence = allele_to_pseudosequence

        if manifest_df is None:
            manifest_df = pandas.DataFrame(
                columns=["model_name", "allele", "config_json", "model"])
        self.manifest_df = manifest_df

    def save(self, models_dir, model_names_to_write=None):
        """
        Write model weights and the manifest to a directory.

        The manifest (``manifest.csv``) is always written in full, without
        the ``model`` column. Weights are written only for the selected
        models.

        Parameters
        ----------
        models_dir : string
            Directory to write to.

        model_names_to_write : list of string, optional
            Names of the models whose weights should be written. By default
            the weights of all models are written.
        """
        num_models = len(self.class1_pan_allele_models) + sum(
            len(v) for v in self.allele_to_allele_specific_models.values())
        assert len(self.manifest_df) == num_models, (
            "Manifest seems out of sync with models: %d vs %d entries" % (
                len(self.manifest_df), num_models))

        if model_names_to_write is None:
            # Write all models
            model_names_to_write = self.manifest_df.model_name.values

        # Boolean-mask row selection; `.ix` no longer exists in pandas >= 1.0.
        sub_manifest_df = self.manifest_df.loc[
            self.manifest_df.model_name.isin(model_names_to_write)
        ]

        for (_, row) in sub_manifest_df.iterrows():
            weights_path = self.weights_path(models_dir, row.model_name)
            row.model.save_weights(weights_path)
            print("Wrote: %s" % weights_path)

        # Model objects are not serializable to CSV, so drop that column.
        write_manifest_df = self.manifest_df[[
            c for c in self.manifest_df.columns if c != "model"
        ]]
        manifest_path = join(models_dir, "manifest.csv")
        write_manifest_df.to_csv(manifest_path, index=False)
        print("Wrote: %s" % manifest_path)

    @staticmethod
    def model_name(allele, num):
        """
        Generate a model name of the form ``<allele>-<num>-<16 hex chars>``.

        The hex suffix is the start of the SHA-1 digest of the current time,
        so names differ between calls made at different times.

        Parameters
        ----------
        allele : string
        num : int

        Returns
        -------
        string
        """
        random_string = hashlib.sha1(
            str(time.time()).encode()).hexdigest()[:16]
        return "%s-%d-%s" % (allele, num, random_string)

    @staticmethod
    def weights_path(models_dir, model_name):
        """
        Return the path of the weights file for the given model.

        Parameters
        ----------
        models_dir : string
        model_name : string

        Returns
        -------
        string
        """
        return join(
            models_dir,
            "weights_%s.%s" % (
                model_name, Class1NeuralNetwork.weights_filename_extension))

    @staticmethod
    def load(models_dir, max_models=None):
        """
        Load a predictor previously written with `save`.

        Parameters
        ----------
        models_dir : string
            Directory containing ``manifest.csv``, the weights files and,
            optionally, ``pseudosequences.csv``.

        max_models : int, optional
            If given, only the first `max_models` manifest rows are read.

        Returns
        -------
        Class1AffinityPredictor
        """
        manifest_path = join(models_dir, "manifest.csv")
        manifest_df = pandas.read_csv(manifest_path, nrows=max_models)

        allele_to_allele_specific_models = collections.defaultdict(list)
        class1_pan_allele_models = []
        all_models = []
        for (_, row) in manifest_df.iterrows():
            model = Class1NeuralNetwork.from_config(
                json.loads(row.config_json))
            weights_path = Class1AffinityPredictor.weights_path(
                models_dir, row.model_name)
            print("Loading model weights: %s" % weights_path)
            model.restore_weights(weights_path)

            if row.allele == "pan-class1":
                class1_pan_allele_models.append(model)
            else:
                allele_to_allele_specific_models[row.allele].append(model)
            all_models.append(model)

        # Same order as the manifest rows, so models line up with their rows.
        manifest_df["model"] = all_models

        pseudosequences = None
        pseudosequences_path = join(models_dir, "pseudosequences.csv")
        if exists(pseudosequences_path):
            # NOTE(review): DataFrame.to_dict() returns
            # {column -> {allele -> value}}, not {allele -> value}. Confirm
            # this is the mapping shape the pan-allele code expects.
            pseudosequences = pandas.read_csv(
                pseudosequences_path,
                index_col="allele").to_dict()

        print(
            "Loaded %d class1 pan allele predictors, %d pseudosequences, and "
            "%d allele specific models: %s" % (
                len(class1_pan_allele_models),
                len(pseudosequences) if pseudosequences else 0,
                sum(len(v) for v in allele_to_allele_specific_models.values()),
                ", ".join(
                    "%s (%d)" % (allele, len(v))
                    for (allele, v)
                    in sorted(allele_to_allele_specific_models.items()))))

        result = Class1AffinityPredictor(
            allele_to_allele_specific_models=allele_to_allele_specific_models,
            class1_pan_allele_models=class1_pan_allele_models,
            allele_to_pseudosequence=pseudosequences,
            manifest_df=manifest_df)
        return result

    def _add_manifest_row(self, model_name, allele, model):
        """Append one manifest row describing `model` to `manifest_df`."""
        row = pandas.Series(collections.OrderedDict([
            ("model_name", model_name),
            ("allele", allele),
            ("config_json", json.dumps(model.get_config())),
            ("model", model),
        ])).to_frame().T
        self.manifest_df = pandas.concat(
            [self.manifest_df, row], ignore_index=True)

    def fit_allele_specific_predictors(
            self,
            n_models,
            architecture_hyperparameters,
            allele,
            peptides,
            affinities,
            models_dir_for_save=None,
            verbose=1):
        """
        Fit one or more allele-specific models and add them to the ensemble.

        Parameters
        ----------
        n_models : int
            Number of models to train.

        architecture_hyperparameters : dict
            Keyword arguments for the `Class1NeuralNetwork` constructor.

        allele : string
            Normalized with `mhcnames.normalize_allele_name` before use.

        peptides : list of string or EncodableSequences

        affinities : list of float

        models_dir_for_save : string, optional
            If given, each model is saved to this directory as soon as it
            has been trained.

        verbose : int
            Passed through to `Class1NeuralNetwork.fit`.

        Returns
        -------
        list of Class1NeuralNetwork
            The newly trained models.
        """
        allele = mhcnames.normalize_allele_name(allele)

        models = self._fit_predictors(
            n_models=n_models,
            architecture_hyperparameters=architecture_hyperparameters,
            peptides=peptides,
            affinities=affinities,
            allele_pseudosequences=None,
            verbose=verbose)

        if allele not in self.allele_to_allele_specific_models:
            self.allele_to_allele_specific_models[allele] = []

        # `models` is a generator: each model is trained when the loop asks
        # for it, which lets us save incrementally. Collect the models so
        # the caller gets them back (the generator is spent after the loop).
        models_list = []
        for (i, model) in enumerate(models):
            model_name = self.model_name(allele, i)
            models_list.append(model)
            self._add_manifest_row(model_name, allele, model)
            self.allele_to_allele_specific_models[allele].append(model)
            if models_dir_for_save:
                self.save(
                    models_dir_for_save, model_names_to_write=[model_name])
        return models_list

    def fit_class1_pan_allele_models(
            self,
            n_models,
            architecture_hyperparameters,
            alleles,
            peptides,
            affinities,
            models_dir_for_save=None,
            verbose=1):
        """
        Fit one or more pan-allele models and add them to the ensemble.

        Parameters
        ----------
        n_models : int
            Number of models to train.

        architecture_hyperparameters : dict
            Keyword arguments for the `Class1NeuralNetwork` constructor.

        alleles : list of string
            One allele per peptide. Each is normalized with
            `mhcnames.normalize_allele_name` and then mapped through
            `self.allele_to_pseudosequence`.

        peptides : list of string or EncodableSequences

        affinities : list of float

        models_dir_for_save : string, optional
            If given, each model is saved to this directory as soon as it
            has been trained.

        verbose : int
            Passed through to `Class1NeuralNetwork.fit`.

        Returns
        -------
        list of Class1NeuralNetwork
            The newly trained models.
        """
        alleles = pandas.Series(alleles).map(mhcnames.normalize_allele_name)
        allele_pseudosequences = alleles.map(self.allele_to_pseudosequence)

        models = self._fit_predictors(
            n_models=n_models,
            architecture_hyperparameters=architecture_hyperparameters,
            peptides=peptides,
            affinities=affinities,
            allele_pseudosequences=allele_pseudosequences,
            verbose=verbose)

        # See fit_allele_specific_predictors: `models` is a lazy generator.
        models_list = []
        for (i, model) in enumerate(models):
            model_name = self.model_name("pan-class1", i)
            models_list.append(model)
            self.class1_pan_allele_models.append(model)
            self._add_manifest_row(model_name, "pan-class1", model)
            if models_dir_for_save:
                self.save(
                    models_dir_for_save, model_names_to_write=[model_name])
        return models_list

    def _fit_predictors(
            self,
            n_models,
            architecture_hyperparameters,
            peptides,
            affinities,
            allele_pseudosequences,
            verbose=1):
        """
        Generator that trains and yields `n_models` models one at a time.

        The peptides are encoded once and shared across all models.

        Parameters
        ----------
        n_models : int
        architecture_hyperparameters : dict
        peptides : list of string or EncodableSequences
        affinities : list of float
        allele_pseudosequences : pandas.Series or None
            None for allele-specific models.
        verbose : int

        Yields
        ------
        Class1NeuralNetwork
        """
        encodable_peptides = EncodableSequences.create(peptides)
        for i in range(n_models):
            print("Training model %d / %d" % (i + 1, n_models))
            model = Class1NeuralNetwork(**architecture_hyperparameters)
            model.fit(
                encodable_peptides,
                affinities,
                allele_pseudosequences=allele_pseudosequences,
                verbose=verbose)
            yield model

    def predict(self, peptides, alleles=None, allele=None):
        """
        Predict binding affinities.

        Specify exactly one of `alleles` (one per peptide) or `allele` (a
        single allele used for every peptide).

        Parameters
        ----------
        peptides : list of string
        alleles : list of string, optional
        allele : string, optional

        Returns
        -------
        numpy.array
            The ``prediction`` column of `predict_to_dataframe`.
        """
        df = self.predict_to_dataframe(
            peptides=peptides,
            alleles=alleles,
            allele=allele
        )
        return df.prediction.values

    def predict_to_dataframe(
            self,
            peptides,
            alleles=None,
            allele=None,
            include_individual_model_predictions=False):
        """
        Predict binding affinities and return them as a DataFrame.

        The ensemble prediction is the geometric mean of the individual
        model predictions. ``prediction_low`` and ``prediction_high`` are the
        5th and 95th percentiles across models, computed in log space.

        Parameters
        ----------
        peptides : list of string
        alleles : list of string, optional
            One allele per peptide.
        allele : string, optional
            Single allele used for every peptide.
        include_individual_model_predictions : boolean
            If True, the per-model ``model_*`` columns are kept and placed
            after all other columns.

        Returns
        -------
        pandas.DataFrame
            Columns ``peptide``, ``allele``, ``prediction``,
            ``prediction_low``, ``prediction_high`` and, optionally, the
            per-model prediction columns.

        Raises
        ------
        TypeError
            If `peptides` or `alleles` is a single string.
        ValueError
            If both or neither of `allele` and `alleles` are given.
        """
        if isinstance(peptides, string_types):
            raise TypeError("peptides must be a list or array, not a string")
        if isinstance(alleles, string_types):
            raise TypeError("alleles must be a list or array, not a string")
        if allele is not None:
            if alleles is not None:
                raise ValueError("Specify exactly one of allele or alleles")
            alleles = [allele] * len(peptides)
        elif alleles is None:
            raise ValueError("Specify exactly one of allele or alleles")

        df = pandas.DataFrame({
            'peptide': peptides,
            'allele': alleles,
        })
        df["normalized_allele"] = df.allele.map(
            mhcnames.normalize_allele_name)

        if self.class1_pan_allele_models:
            allele_pseudosequences = df.normalized_allele.map(
                self.allele_to_pseudosequence)
            encodable_peptides = EncodableSequences.create(
                df.peptide.values)
            for (i, model) in enumerate(self.class1_pan_allele_models):
                df["model_pan_%d" % i] = model.predict(
                    encodable_peptides,
                    allele_pseudosequences=allele_pseudosequences)

        if self.allele_to_allele_specific_models:
            for normalized_allele in df.normalized_allele.unique():
                mask = (df.normalized_allele == normalized_allele).values
                allele_peptides = EncodableSequences.create(
                    df.loc[mask, "peptide"].values)
                models = self.allele_to_allele_specific_models.get(
                    normalized_allele, [])
                for (i, model) in enumerate(models):
                    # Rows for other alleles are left as NaN in this column.
                    df.loc[mask, "model_single_%d" % i] = model.predict(
                        allele_peptides)

        # Geometric mean
        df_predictions = df[
            [c for c in df.columns if c.startswith("model_")]
        ]
        logs = numpy.log(df_predictions)
        log_means = logs.mean(1)
        df["prediction"] = numpy.exp(log_means)
        df["prediction_low"] = numpy.exp(logs.quantile(0.05, axis=1))
        df["prediction_high"] = numpy.exp(logs.quantile(0.95, axis=1))

        del df["normalized_allele"]

        if include_individual_model_predictions:
            # Stable sort on a boolean key: non-model columns keep their
            # order and come first, model_* columns follow.
            columns = sorted(df.columns, key=lambda c: c.startswith('model_'))
        else:
            columns = [
                c for c in df.columns if c not in df_predictions.columns
            ]
        return df[columns]