from __future__ import print_function

from os.path import join, exists
from os import mkdir
from socket import gethostname
from getpass import getuser

import collections
import logging
import time

import numpy
import pandas
from six import string_types

import sklearn
import sklearn.linear_model

try:
    import tqdm
except ImportError:
    tqdm = None

from .version import __version__
from .class1_affinity_predictor import Class1AffinityPredictor
from .class1_processing_predictor import Class1ProcessingPredictor
from .class1_neural_network import DEFAULT_PREDICT_BATCH_SIZE
from .encodable_sequences import EncodableSequences
from .regression_target import from_ic50
from .downloads import get_default_class1_presentation_models_dir
MAX_ALLELES_PER_SAMPLE = 6
PREDICT_BATCH_SIZE = DEFAULT_PREDICT_BATCH_SIZE
PREDICT_CHUNK_SIZE = 100000 # currently used only for cleavage prediction
"""
A logistic regression model over predicted binding affinity (BA) and antigen
processing (AP) score.
See load() and predict() methods for basic usage.
"""
model_inputs = ["affinity_score", "processing_score"]
    def __init__(
            self,
            affinity_predictor=None,
            processing_predictor_with_flanks=None,
            processing_predictor_without_flanks=None,
            weights_dataframe=None,
            metadata_dataframes=None):
        self.affinity_predictor = affinity_predictor
        self.processing_predictor_with_flanks = processing_predictor_with_flanks
        self.processing_predictor_without_flanks = processing_predictor_without_flanks
        self.weights_dataframe = weights_dataframe
        self.metadata_dataframes = (
            dict(metadata_dataframes) if metadata_dataframes else {})
        self._models_cache = {}
@property
def supported_alleles(self):
"""
List of alleles supported by the underlying Class1AffinityPredictor
"""
return self.affinity_predictor.supported_alleles
@property
def supported_peptide_lengths(self):
"""
(min, max) of supported peptide lengths, inclusive.
"""
return self.affinity_predictor.supported_peptide_lengths
def predict_affinity(
self,
peptides,
experiment_names,
alleles,
include_affinity_percentile=False,
verbose=1,
throw=True):
"""
Parameters
----------
peptides
experiment_names
alleles
include_affinity_percentile
verbose
throw
Returns
-------
"""
df = pandas.DataFrame({
"peptide": numpy.array(peptides, copy=False),
"experiment_name": numpy.array(experiment_names, copy=False),
})
iterator = df.groupby("experiment_name")
if verbose > 0:
print("Predicting affinities.")
if tqdm is not None:
iterator = tqdm.tqdm(
iterator, total=df.experiment_name.nunique())
for (experiment, sub_df) in iterator:
predictions_df = pandas.DataFrame(index=sub_df.index)
experiment_peptides = EncodableSequences.create(sub_df.peptide.values)
for allele in alleles[experiment]:
predictions_df[allele] = self.affinity_predictor.predict(
peptides=experiment_peptides,
allele=allele,
model_kwargs={'batch_size': PREDICT_BATCH_SIZE},
throw=throw)
            df.loc[
                sub_df.index, "affinity"
            ] = predictions_df.min(1).values
            df.loc[
                sub_df.index, "best_allele"
            ] = predictions_df.idxmin(1).values
if include_affinity_percentile:
df.loc[sub_df.index, "affinity_percentile"] = (
self.affinity_predictor.percentile_ranks(
df.loc[sub_df.index, "affinity"].values,
alleles=df.loc[sub_df.index, "best_allele"].values,
throw=False))
        return df

    def predict_processing(
            self, peptides, n_flanks=None, c_flanks=None, verbose=1):
        """
        Predict antigen processing scores for individual peptides, optionally
        using flanking sequences for better cleavage prediction.

        Parameters
        ----------
        peptides : list of string
        n_flanks : list of string [same length as peptides]
        c_flanks : list of string [same length as peptides]
        verbose : int
            Set to 0 for quiet mode.

        Returns
        -------
        numpy.array : antigen processing scores, one per peptide
        """
if (n_flanks is None) != (c_flanks is None):
raise ValueError("Specify both or neither of n_flanks, c_flanks")
if n_flanks is None:
if self.processing_predictor_without_flanks is None:
raise ValueError("No processing predictor without flanks")
predictor = self.processing_predictor_without_flanks
n_flanks = [""] * len(peptides)
c_flanks = n_flanks
else:
if self.processing_predictor_with_flanks is None:
raise ValueError("No processing predictor with flanks")
predictor = self.processing_predictor_with_flanks
num_chunks = int(numpy.ceil(float(len(peptides)) / PREDICT_CHUNK_SIZE))
peptide_chunks = numpy.array_split(peptides, num_chunks)
n_flank_chunks = numpy.array_split(n_flanks, num_chunks)
c_flank_chunks = numpy.array_split(c_flanks, num_chunks)
iterator = zip(peptide_chunks, n_flank_chunks, c_flank_chunks)
if verbose > 0:
print("Predicting processing.")
if tqdm is not None:
iterator = tqdm.tqdm(iterator, total=len(peptide_chunks))
result_chunks = []
for (peptide_chunk, n_flank_chunk, c_flank_chunk) in iterator:
result_chunk = predictor.predict(
peptides=peptide_chunk,
n_flanks=n_flank_chunk,
c_flanks=c_flank_chunk,
batch_size=PREDICT_BATCH_SIZE)
result_chunks.append(result_chunk)
return numpy.concatenate(result_chunks)
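    # A minimal usage sketch for predict_processing (the peptides and flanks
    # here are hypothetical examples, not from any dataset). Flanks must be
    # given for both termini or omitted entirely:
    #
    #     predictor = Class1PresentationPredictor.load()
    #     scores = predictor.predict_processing(
    #         peptides=["SIINFEKL", "SYFPEITHI"],
    #         n_flanks=["AAAQT", "QLLMG"],
    #         c_flanks=["TTLMS", "GKVIL"],
    #         verbose=0)
    #     # scores is a numpy array with one processing score per peptide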
def fit(
self,
targets,
peptides,
experiment_names,
alleles,
n_flanks=None,
c_flanks=None,
verbose=1):
        """
        Fit the presentation score logistic regression model.

        Parameters
        ----------
        targets : list of int/float
            1 indicates hit, 0 indicates decoy
        peptides : list of string [same length as targets]
        experiment_names : list of string [same length as targets]
        alleles : dict of string -> list of string
            Keys are experiment names, values are the alleles for that
            experiment.
        n_flanks : list of string [same length as targets]
        c_flanks : list of string [same length as targets]
        verbose : int
        """
        df = self.predict_affinity(
            peptides=peptides,
experiment_names=experiment_names,
alleles=alleles,
verbose=verbose)
df["affinity_score"] = from_ic50(df.affinity)
df["target"] = numpy.array(targets, copy=False)
if (n_flanks is None) != (c_flanks is None):
raise ValueError("Specify both or neither of n_flanks, c_flanks")
with_flanks_list = []
        if self.processing_predictor_without_flanks is not None:
            with_flanks_list.append(False)
        if n_flanks is not None and self.processing_predictor_with_flanks is not None:
            with_flanks_list.append(True)
if not with_flanks_list:
raise RuntimeError("Can't fit any models")
if self.weights_dataframe is None:
self.weights_dataframe = pandas.DataFrame()
for with_flanks in with_flanks_list:
            model_name = "with_flanks" if with_flanks else "without_flanks"
if verbose > 0:
print("Training variant", model_name)
df["processing_score"] = self.predict_processing(
peptides=df.peptide.values,
n_flanks=n_flanks if with_flanks else None,
c_flanks=c_flanks if with_flanks else None,
verbose=verbose)
model = self.get_model()
if verbose > 0:
print("Fitting LR model.")
print(df)
model.fit(
X=df[self.model_inputs].values,
y=df.target.astype(float))
            self.weights_dataframe.loc[model_name, "intercept"] = (
                numpy.squeeze(model.intercept_))
for (name, value) in zip(self.model_inputs, numpy.squeeze(model.coef_)):
self.weights_dataframe.loc[model_name, name] = value
self._models_cache[model_name] = model
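    # Fitting sketch with toy data (hypothetical; real training uses large
    # mass spec hit/decoy datasets). Targets are 1 for hits, 0 for decoys:
    #
    #     predictor.fit(
    #         targets=[1, 0, 1, 0],
    #         peptides=["SIINFEKL", "QARTYLLK", "SYFPEITHI", "GGGGGGGGG"],
    #         experiment_names=["sample1"] * 4,
    #         alleles={"sample1": ["HLA-A*02:01", "HLA-B*07:02"]})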
    def get_model(self, name=None):
        """
        Load or instantiate a new logistic regression model. Private helper
        method.

        Parameters
        ----------
        name : string
            If None (the default) an un-fit LR model is returned. Otherwise
            the weights are loaded for the specified model.

        Returns
        -------
        sklearn.linear_model.LogisticRegression
        """
if name is None or name not in self._models_cache:
model = sklearn.linear_model.LogisticRegression(solver="lbfgs")
if name is not None:
row = self.weights_dataframe.loc[name]
model.intercept_ = row.intercept
model.coef_ = numpy.expand_dims(
row[self.model_inputs].values, axis=0)
else:
model = self._models_cache[name]
return model
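    # The stored weights fully determine the logistic regression: the
    # presentation score is sigmoid(intercept + coefficients . inputs). A
    # sketch of the equivalent manual computation (the model name and input
    # scores are hypothetical):
    #
    #     row = predictor.weights_dataframe.loc["with_flanks"]
    #     logit = (row.intercept +
    #         row.affinity_score * affinity_score +
    #         row.processing_score * processing_score)
    #     presentation_score = 1.0 / (1.0 + numpy.exp(-logit))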
    def predict(
            self,
            peptides,
            alleles,
            experiment_names=None,
            n_flanks=None,
            c_flanks=None,
            verbose=1):
        """
        Predict presentation scores (combined affinity and processing) for
        the given peptides. For intermediate results, see the
        predict_to_dataframe method.

        Returns
        -------
        numpy.array of presentation scores
        """
        return self.predict_to_dataframe(
            peptides=peptides,
            alleles=alleles,
            experiment_names=experiment_names,
            n_flanks=n_flanks,
            c_flanks=c_flanks,
            verbose=verbose).presentation_score.values
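    # Basic usage sketch for a single sample's genotype (peptides and alleles
    # are hypothetical examples):
    #
    #     predictor = Class1PresentationPredictor.load()
    #     scores = predictor.predict(
    #         peptides=["SIINFEKL", "SIINFEKD"],
    #         alleles=["HLA-A*02:01", "HLA-B*07:02"],
    #         verbose=0)
    #     # scores is a numpy array of presentation scores in [0, 1]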
    def predict_sequences(
            self,
            sequences,
            alleles,
            result="best",
            comparison_quantity="presentation_score",
            filter_value=None,
            peptide_lengths=[8, 9, 10, 11],
use_flanks=True,
include_affinity_percentile=False,
verbose=1,
throw=True):
"""
Predict across protein sequences.
Parameters
----------
sequences : str, list of string, or string -> string dict
Protein sequences. If a dict is given, the keys are arbitrary (
e.g. protein names), and the values are the amino acid sequences.
alleles : str, list of string, list of list of string, or string -> string dict
MHC I alleles. Can be: (1) a string (a single allele), (2) a list of
strings (a single genotype), (3) a list of list of strings
(multiple genotypes, where the total number of genotypes must equal
the number of sequences), or (4) a dict (in which case the keys must
match the sequences dict keys).
result : string
One of:
- "best": return the strongest peptide for each sequence
- "all": return predictions for all peptides
- "filtered": return predictions stronger where comparison_quantity
is stronger than filter_value.
comparison_quantity : string
One of "presentation_score", "processing_score", or "affinity".
Quantity to use to rank (if result is "best") or filter (if result
is "filtered") results.
filter_value : float
Threshold value to use, only relevant when result is "filtered".
If comparison_quantity is "affinity", then all results less than
(i.e. tighter than) the specified nM affinity are retained. If it's
"presentation_score" or "processing_score" then results greater than
the indicated filter_value are retained.
peptide_lengths : list of int
Peptide lengths to predict for.
use_flanks : bool
Whether to include flanking sequences when running the AP predictor
(for better cleavage prediction).
include_affinity_percentile : bool
Whether to include affinity percentile ranks in output.
verbose : int
Set to 0 for quiet mode.
throw : boolean
Whether to throw exceptions (vs. log warnings) on invalid inputs.
Returns
-------
pandas.DataFrame with columns:
peptide, n_flank, c_flank, sequence_name, affinity, best_allele,
processing_score, presentation_score
"""
processing_predictor = self.processing_predictor_with_flanks
if not use_flanks or processing_predictor is None:
processing_predictor = self.processing_predictor_without_flanks
supported_sequence_lengths = processing_predictor.sequence_lengths
n_flank_length = supported_sequence_lengths["n_flank"]
c_flank_length = supported_sequence_lengths["c_flank"]
sequence_names = []
n_flanks = [] if use_flanks else None
c_flanks = [] if use_flanks else None
peptides = []
if isinstance(sequences, string_types):
sequences = [sequences]
if not isinstance(sequences, dict):
sequences = collections.OrderedDict(
("sequence_%04d" % (i + 1), sequence)
for (i, sequence) in enumerate(sequences))
if not isinstance(alleles, dict):
if all([isinstance(item, string_types) for item in alleles]):
alleles = dict((name, alleles) for name in sequences.keys())
elif len(alleles) != len(sequences):
raise ValueError(
"alleles must be (1) a string (a single allele), (2) a list of "
"strings (a single genotype), (3) a list of list of strings ("
"(multiple genotypes, where the total number of genotypes "
"must equal the number of sequences), or (4) a dict (in which "
"case the keys must match the sequences dict keys). Here "
"it seemed like option (3) was being used, but the length "
"of alleles (%d) did not match the length of sequences (%d)."
% (len(alleles), len(sequences)))
else:
alleles = dict(zip(sequences.keys(), alleles))
missing = [key for key in sequences if key not in alleles]
if missing:
            raise ValueError(
                "Sequence names missing from alleles dict: %s" % missing)
for (name, sequence) in sequences.items():
if not isinstance(sequence, string_types):
raise ValueError("Expected string, not %s (%s)" % (
sequence, type(sequence)))
            for peptide_start in range(len(sequence) - min(peptide_lengths) + 1):
                n_flank_start = max(0, peptide_start - n_flank_length)
                for peptide_length in peptide_lengths:
                    peptide = sequence[
                        peptide_start : peptide_start + peptide_length]
                    if len(peptide) != peptide_length:
                        continue  # skip truncated peptides at the sequence end
                    c_flank_end = (
                        peptide_start + peptide_length + c_flank_length)
                    sequence_names.append(name)
                    peptides.append(peptide)
                    if use_flanks:
                        n_flanks.append(
                            sequence[n_flank_start : peptide_start])
                        c_flanks.append(
                            sequence[peptide_start + peptide_length : c_flank_end])
result_df = self.predict_to_dataframe(
peptides=peptides,
alleles=alleles,
n_flanks=n_flanks,
c_flanks=c_flanks,
experiment_names=sequence_names,
include_affinity_percentile=include_affinity_percentile,
verbose=verbose,
throw=throw)
result_df = result_df.rename(
columns={"experiment_name": "sequence_name"})
comparison_is_score = comparison_quantity.endswith("score")
result_df = result_df.sort_values(
comparison_quantity,
ascending=not comparison_is_score)
if result == "best":
result_df = result_df.drop_duplicates(
"sequence_name", keep="first").sort_values("sequence_name")
elif result == "filtered":
if comparison_is_score:
                result_df = result_df.loc[
                    result_df[comparison_quantity] >= filter_value
                ]
            else:
                result_df = result_df.loc[
                    result_df[comparison_quantity] <= filter_value
                ]
elif result == "all":
pass
else:
raise ValueError(
"Unknown result: %s. Valid choices are: best, filtered, all"
% result)
result_df = result_df.reset_index(drop=True)
result_df = result_df.copy()
return result_df
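    # Sequence-scanning sketch (the protein name and sequence are
    # hypothetical). With result="filtered" and comparison_quantity="affinity",
    # only peptides predicted to bind at <= filter_value nM are returned:
    #
    #     df = predictor.predict_sequences(
    #         sequences={"protein1": "MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDS"},
    #         alleles=["HLA-A*02:01", "HLA-B*07:02"],
    #         result="filtered",
    #         comparison_quantity="affinity",
    #         filter_value=500)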
    def predict_to_dataframe(
            self,
            peptides,
            alleles,
            experiment_names=None,
            n_flanks=None,
            c_flanks=None,
            include_affinity_percentile=False,
verbose=1,
            throw=True):
        """
        Predict presentation scores across a set of peptides, returning a
        DataFrame with the intermediate affinity and processing predictions
        as additional columns. See the predict method for parameter details.

        Returns
        -------
        pandas.DataFrame
        """
        if isinstance(peptides, string_types):
            raise TypeError("peptides must be a list not a string")
        if isinstance(alleles, string_types):
            raise TypeError("alleles must be a list or dict")
if isinstance(alleles, dict):
if experiment_names is None:
raise ValueError(
"experiment_names must be supplied when alleles is a dict")
else:
if experiment_names is not None:
raise ValueError(
"alleles must be a dict when experiment_names is specified")
alleles = numpy.array(alleles, copy=False)
if len(alleles) > MAX_ALLELES_PER_SAMPLE:
raise ValueError(
"When alleles is a list, it must have at most %d elements. "
"These alleles are taken to be a genotype for an "
"individual, and the strongest prediction across alleles "
"will be taken for each peptide. Note that this differs "
"from Class1AffinityPredictor.predict(), where alleles "
"is expected to be the same length as peptides."
experiment_names = ["experiment1"] * len(peptides)
alleles = {
"experiment1": alleles,
}
if (n_flanks is None) != (c_flanks is None):
raise ValueError("Specify both or neither of n_flanks, c_flanks")
processing_scores = self.predict_processing(
peptides=peptides,
n_flanks=n_flanks,
c_flanks=c_flanks,
verbose=verbose)
        df = self.predict_affinity(
            peptides=peptides,
experiment_names=experiment_names,
alleles=alleles,
include_affinity_percentile=include_affinity_percentile,
verbose=verbose,
throw=throw)
df["affinity_score"] = from_ic50(df.affinity)
if c_flanks is not None:
df.insert(1, "c_flank", c_flanks)
if n_flanks is not None:
df.insert(1, "n_flank", n_flanks)
        model_name = "with_flanks" if n_flanks is not None else "without_flanks"
model = self.get_model(model_name)
df["presentation_score"] = model.predict_proba(
df[self.model_inputs].values)[:,1]
del df["affinity_score"]
return df
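    # Multi-sample sketch for predict_to_dataframe: alleles is a dict of
    # genotypes and experiment_names assigns each peptide to a sample (all
    # values hypothetical):
    #
    #     df = predictor.predict_to_dataframe(
    #         peptides=["SIINFEKL", "SYFPEITHI"],
    #         experiment_names=["sample1", "sample2"],
    #         alleles={
    #             "sample1": ["HLA-A*02:01", "HLA-B*07:02"],
    #             "sample2": ["HLA-C*07:02"],
    #         })
    #     # columns include peptide, experiment_name, affinity, best_allele,
    #     # processing_score, and presentation_score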
def save(self, models_dir):
"""
Serialize the predictor to a directory on disk. If the directory does
not exist it will be created.
Parameters
----------
models_dir : string
Path to directory. It will be created if it doesn't exist.
"""
if self.weights_dataframe is None:
raise RuntimeError("Can't save before fitting")
# Save underlying predictors
self.affinity_predictor.save(join(models_dir, "affinity_predictor"))
if self.processing_predictor_with_flanks is not None:
self.processing_predictor_with_flanks.save(
join(models_dir, "processing_predictor_with_flanks"))
if self.processing_predictor_without_flanks is not None:
self.processing_predictor_without_flanks.save(
join(models_dir, "processing_predictor_without_flanks"))
# Save model coefficients.
self.weights_dataframe.to_csv(join(models_dir, "weights.csv"))
# Write "info.txt"
info_path = join(models_dir, "info.txt")
rows = [
("trained on", time.asctime()),
("package ", "mhcflurry %s" % __version__),
("hostname ", gethostname()),
("user ", getuser()),
]
pandas.DataFrame(rows).to_csv(
info_path, sep="\t", header=False, index=False)
if self.metadata_dataframes:
for (name, df) in self.metadata_dataframes.items():
metadata_df_path = join(models_dir, "%s.csv.bz2" % name)
df.to_csv(metadata_df_path, index=False, compression="bz2")
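    # Round-trip sketch: save() writes the underlying predictors plus
    # weights.csv and info.txt under models_dir; load() restores an
    # equivalent predictor (the path is hypothetical):
    #
    #     predictor.save("/tmp/my_presentation_models")
    #     reloaded = Class1PresentationPredictor.load(
    #         "/tmp/my_presentation_models")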
@classmethod
def load(cls, models_dir=None, max_models=None):
"""
Deserialize a predictor from a directory on disk.
Parameters
----------
models_dir : string
Path to directory. If unspecified the default downloaded models are
used.
max_models : int, optional
            Maximum number of affinity and processing (counted separately)
            models to load

        Returns
        -------
        `Class1PresentationPredictor` instance
        """
        if models_dir is None:
            models_dir = get_default_class1_presentation_models_dir()
affinity_predictor = Class1AffinityPredictor.load(
join(models_dir, "affinity_predictor"), max_models=max_models)
processing_predictor_with_flanks = None
if exists(join(models_dir, "processing_predictor_with_flanks")):
processing_predictor_with_flanks = Class1ProcessingPredictor.load(
join(models_dir, "processing_predictor_with_flanks"),
else:
logging.warning(
"Presentation predictor is missing processing predictor: %s",
join(models_dir, "processing_predictor_with_flanks"))
processing_predictor_without_flanks = None
if exists(join(models_dir, "processing_predictor_without_flanks")):
processing_predictor_without_flanks = Class1ProcessingPredictor.load(
join(models_dir, "processing_predictor_without_flanks"),
else:
logging.warning(
"Presentation predictor is missing processing predictor: %s",
join(models_dir, "processing_predictor_without_flanks"))
weights_dataframe = pandas.read_csv(
join(models_dir, "weights.csv"),
index_col=0)
        result = cls(
            affinity_predictor=affinity_predictor,
processing_predictor_with_flanks=processing_predictor_with_flanks,
processing_predictor_without_flanks=processing_predictor_without_flanks,
            weights_dataframe=weights_dataframe)
        return result
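    # Loading sketch: with no arguments, load() uses the default downloaded
    # models, which can be fetched beforehand with the mhcflurry-downloads
    # tool (e.g. "mhcflurry-downloads fetch models_class1_presentation"):
    #
    #     predictor = Class1PresentationPredictor.load()
    #     print(predictor.supported_alleles[:5])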