Newer
Older
from os.path import join, exists, abspath
from os import mkdir, environ
from socket import gethostname
from getpass import getuser
import sklearn
import sklearn.linear_model
try:
import tqdm
except ImportError:
tdqm = None
from .class1_affinity_predictor import Class1AffinityPredictor
from .class1_cleavage_predictor import Class1CleavagePredictor
from .class1_neural_network import DEFAULT_PREDICT_BATCH_SIZE
from .multiple_allele_encoding import MultipleAlleleEncoding
from .downloads import get_default_class1_presentation_models_dir
from .common import load_weights
MAX_ALLELES_PER_SAMPLE = 6
PREDICT_BATCH_SIZE = DEFAULT_PREDICT_BATCH_SIZE
model_inputs = ["affinity_score", "cleavage_prediction"]
affinity_predictor=None,
cleavage_predictor_with_flanks=None,
cleavage_predictor_without_flanks=None,
weights_dataframe=None,
self.affinity_predictor = affinity_predictor
self.cleavage_predictor_with_flanks = cleavage_predictor_with_flanks
self.cleavage_predictor_without_flanks = cleavage_predictor_without_flanks
self.weights_dataframe = weights_dataframe
self.metadata_dataframes = (
dict(metadata_dataframes) if metadata_dataframes else {})
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
self._models_cache = {}
def get_affinity_predictions(
self, peptides, experiment_names, alleles, verbose=1):
df = pandas.DataFrame({
"peptide": numpy.array(peptides, copy=False),
"experiment_name": numpy.array(experiment_names, copy=False),
})
iterator = df.groupby("experiment_name")
if verbose > 0:
print("Predicting affinities.")
if tqdm is not None:
iterator = tqdm.tqdm(
iterator, total=df.experiment_name.nunique())
for (experiment, sub_df) in iterator:
predictions_df = pandas.DataFrame(index=sub_df.index)
experiment_peptides = EncodableSequences.create(sub_df.peptide.values)
for allele in alleles[experiment]:
predictions_df[allele] = self.affinity_predictor.predict(
peptides=experiment_peptides,
allele=allele,
model_kwargs={'batch_size': PREDICT_BATCH_SIZE})
df.loc[
sub_df.index, "tightest_affinity"
] = predictions_df.min(1).values
df.loc[
sub_df.index, "tightest_affinity_allele"
] = predictions_df.idxmin(1).values
return df
def get_cleavage_predictions(
self, peptides, n_flanks=None, c_flanks=None, verbose=1):
if verbose > 0:
print("Predicting cleavage.")
if (n_flanks is None) != (c_flanks is None):
raise ValueError("Specify both or neither of n_flanks, c_flanks")
if n_flanks is None:
if self.cleavage_predictor_without_flanks is None:
raise ValueError("No cleavage predictor without flanks")
predictor = self.cleavage_predictor_without_flanks
n_flanks = [""] * len(peptides)
c_flanks = n_flanks
else:
if self.cleavage_predictor_with_flanks is None:
raise ValueError("No cleavage predictor with flanks")
predictor = self.cleavage_predictor_with_flanks
result = predictor.predict(
peptides=peptides,
n_flanks=n_flanks,
c_flanks=c_flanks,
batch_size=PREDICT_BATCH_SIZE)
def fit(
self,
targets,
peptides,
experiment_names,
alleles,
n_flanks=None,
c_flanks=None,
verbose=1):
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
df = self.get_affinity_predictions(
peptides=peptides,
experiment_names=experiment_names,
alleles=alleles,
verbose=verbose)
df["affinity_score"] = from_ic50(df.tightest_affinity)
df["target"] = numpy.array(targets, copy=False)
if (n_flanks is None) != (c_flanks is None):
raise ValueError("Specify both or neither of n_flanks, c_flanks")
with_flanks_list = []
if self.cleavage_predictor_without_flanks is not None:
with_flanks_list.append(False)
if n_flanks is not None and self.cleavage_predictor_with_flanks is not None:
with_flanks_list.append(True)
if not with_flanks_list:
raise RuntimeError("Can't fit any models")
if self.weights_dataframe is None:
self.weights_dataframe = pandas.DataFrame()
for with_flanks in with_flanks_list:
model_name = 'with_flanks' if with_flanks else "without_flanks"
if verbose > 0:
print("Training variant", model_name)
df["cleavage_prediction"] = self.get_cleavage_predictions(
peptides=df.peptide.values,
n_flanks=n_flanks if with_flanks else None,
c_flanks=c_flanks if with_flanks else None,
verbose=verbose)
model = self.get_model()
if verbose > 0:
print("Fitting LR model.")
print(df)
model.fit(
X=df[self.model_inputs].values,
y=df.target.astype(float))
self.weights_dataframe.loc[model_name, "intercept"] = model.intercept_
for (name, value) in zip(self.model_inputs, numpy.squeeze(model.coef_)):
self.weights_dataframe.loc[model_name, name] = value
self._models_cache[model_name] = model
def get_model(self, name=None):
if name is None or name not in self._models_cache:
model = sklearn.linear_model.LogisticRegression(solver="lbfgs")
if name is not None:
row = self.weights_dataframe.loc[name]
model.intercept_ = row.intercept
model.coef_ = numpy.expand_dims(
row[self.model_inputs].values, axis=0)
else:
model = self._models_cache[name]
return model
def predict_sequences(self, alleles, sequences):
raise NotImplementedError
def predict(
self,
peptides,
alleles,
return self.predict_to_dataframe(
peptides=peptides,
alleles=alleles,
raise TypeError("peptides must be a list not a string")
raise TypeError("alleles must be a list or dict")
if isinstance(alleles, dict):
if experiment_names is None:
raise ValueError(
"experiment_names must be supplied when alleles is a dict")
else:
if experiment_names is not None:
raise ValueError(
"alleles must be a dict when experiment_names is specified")
alleles = numpy.array(alleles, copy=False)
if len(alleles) > MAX_ALLELES_PER_SAMPLE:
raise ValueError(
"When alleles is a list, it must have at most %d elements. "
"These alleles are taken to be a genotype for an "
"individual, and the strongest prediction across alleles "
"will be taken for each peptide. Note that this differs "
"from Class1AffinityPredictor.predict(), where alleles "
"is expected to be the same length as peptides."
experiment_names = ["experiment1"] * len(peptides)
alleles = {
"experiment1": alleles,
}
df = self.get_affinity_predictions(
peptides=peptides,
experiment_names=experiment_names,
alleles=alleles,
verbose=verbose)
df["affinity_score"] = from_ic50(df.tightest_affinity)
if (n_flanks is None) != (c_flanks is None):
raise ValueError("Specify both or neither of n_flanks, c_flanks")
df["cleavage_prediction"] = self.get_cleavage_predictions(
peptides=df.peptide.values,
n_flanks=n_flanks,
c_flanks=c_flanks,
verbose=verbose)
model_name = 'with_flanks' if n_flanks is not None else "without_flanks"
model = self.get_model(model_name)
df["score"] = model.predict_proba(df[self.model_inputs].values)[:,1]
return df
def save(self, models_dir):
"""
Serialize the predictor to a directory on disk. If the directory does
not exist it will be created.
Parameters
----------
models_dir : string
Path to directory. It will be created if it doesn't exist.
"""
if self.weights_dataframe is None:
raise RuntimeError("Can't save before fitting")
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
# Save underlying predictors
self.affinity_predictor.save(join(models_dir, "affinity_predictor"))
if self.cleavage_predictor_with_flanks is not None:
self.cleavage_predictor_with_flanks.save(
join(models_dir, "cleavage_predictor_with_flanks"))
if self.cleavage_predictor_without_flanks is not None:
self.cleavage_predictor_without_flanks.save(
join(models_dir, "cleavage_predictor_without_flanks"))
# Save model coefficients.
self.weights_dataframe.to_csv(join(models_dir, "weights.csv"))
# Write "info.txt"
info_path = join(models_dir, "info.txt")
rows = [
("trained on", time.asctime()),
("package ", "mhcflurry %s" % __version__),
("hostname ", gethostname()),
("user ", getuser()),
]
pandas.DataFrame(rows).to_csv(
info_path, sep="\t", header=False, index=False)
if self.metadata_dataframes:
for (name, df) in self.metadata_dataframes.items():
metadata_df_path = join(models_dir, "%s.csv.bz2" % name)
df.to_csv(metadata_df_path, index=False, compression="bz2")
@classmethod
def load(cls, models_dir=None, max_models=None):
"""
Deserialize a predictor from a directory on disk.
Parameters
----------
models_dir : string
Path to directory. If unspecified the default downloaded models are
used.
max_models : int, optional
Maximum number of affinity and cleavage (counted separately)
models to load
`Class1PresentationPredictor` instance
models_dir = get_default_class1_presentation_models_dir()
affinity_predictor = Class1AffinityPredictor.load(
join(models_dir, "affinity_predictor"), max_models=max_models)
cleavage_predictor_with_flanks = None
if exists(join(models_dir, "cleavage_predictor_with_flanks")):
cleavage_predictor_with_flanks = Class1CleavagePredictor.load(
join(models_dir, "cleavage_predictor_with_flanks"),
max_models=max_models)
cleavage_predictor_without_flanks = None
if exists(join(models_dir, "cleavage_predictor_without_flanks")):
cleavage_predictor_without_flanks = Class1CleavagePredictor.load(
join(models_dir, "cleavage_predictor_without_flanks"),
max_models=max_models)
weights_dataframe = pandas.read_csv(
join(models_dir, "weights.csv"),
index_col=0)
affinity_predictor=affinity_predictor,
cleavage_predictor_with_flanks=cleavage_predictor_with_flanks,
cleavage_predictor_without_flanks=cleavage_predictor_without_flanks,
weights_dataframe=weights_dataframe)