Skip to content
Snippets Groups Projects
Commit d05e772e authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

Working on docs

parent 8b08d482
No related branches found
No related tags found
No related merge requests found
......@@ -40,6 +40,11 @@ class Class1PresentationPredictor(object):
A logistic regression model over predicted binding affinity (BA) and antigen
processing (AP) score.
Instances of this class delegate to Class1AffinityPredictor and
Class1ProcessingPredictor instances to generate BA and AP predictions.
These predictions are combined using a logistic regression model to give
a "presentation score" prediction.
See load() and predict() methods for basic usage.
"""
model_inputs = ["affinity_score", "processing_score"]
......@@ -77,65 +82,124 @@ class Class1PresentationPredictor(object):
def predict_affinity(
        self,
        peptides,
        alleles,
        experiment_names=None,
        include_affinity_percentile=False,
        verbose=1,
        throw=True):
    """
    Predict binding affinities.

    For every (peptide, sample) pair, the strongest (lowest) predicted
    affinity across the sample's alleles is reported together with the
    allele that produced it.

    Two modes are supported:

    - `experiment_names` is None: every peptide is evaluated against every
      sample in `alleles`. The result has one row per (sample, peptide)
      combination.
    - `experiment_names` is given: each peptide is evaluated only against
      the alleles of its own sample. The result has one row per peptide.

    Parameters
    ----------
    peptides : list of string
    alleles : dict of string -> list of string
        Keys are experiment names, values are the alleles for that sample
    experiment_names : list of string [same length as peptides]
        Sample names corresponding to each peptide. These are used to
        lookup the alleles for each peptide in the alleles dict. If None,
        all peptides are predicted for all samples.
    include_affinity_percentile : bool
        Whether to include affinity percentile ranks
    verbose : int
        Set to 0 for quiet.
    throw : bool
        Whether to throw exception (vs. just log a warning) on invalid
        peptides, etc.

    Returns
    -------
    pandas.DataFrame : predictions
        Columns: "peptide", "experiment_name", "affinity", "best_allele"
        and, if requested, "affinity_percentile".
    """
    # numpy.asarray avoids a copy when possible; numpy.array(copy=False)
    # raises under NumPy >= 2 whenever a copy is required (e.g. list input).
    df = pandas.DataFrame({
        "peptide": numpy.asarray(peptides),
    })

    if experiment_names is None:
        # Predict each distinct allele once over all peptides, then reuse
        # those predictions for every sample that carries the allele.
        peptides = EncodableSequences.create(peptides)
        all_alleles = set()
        for lst in alleles.values():
            all_alleles.update(lst)

        iterator = sorted(all_alleles)
        if verbose > 0:
            print("Predicting affinities.")
            if tqdm is not None:
                iterator = tqdm.tqdm(iterator, total=len(all_alleles))

        predictions_df = pandas.DataFrame(index=df.index)
        for allele in iterator:
            predictions_df[allele] = self.affinity_predictor.predict(
                peptides=peptides,
                allele=allele,
                model_kwargs={'batch_size': PREDICT_BATCH_SIZE},
                throw=throw)

        dfs = []
        for (experiment_name, experiment_alleles) in alleles.items():
            # list() so that tuples / arrays select columns rather than
            # being interpreted as a single (multi-level) key.
            sample_predictions = predictions_df[list(experiment_alleles)]
            new_df = df.copy()
            new_df["experiment_name"] = experiment_name
            new_df["affinity"] = sample_predictions.min(axis=1).values
            new_df["best_allele"] = sample_predictions.idxmin(axis=1).values
            # Previously the per-sample frame was never collected, so the
            # concat below always failed on an empty list.
            dfs.append(new_df)

        result_df = pandas.concat(dfs, ignore_index=True)
    else:
        df["experiment_name"] = numpy.asarray(experiment_names)

        iterator = df.groupby("experiment_name")
        if verbose > 0:
            print("Predicting affinities.")
            if tqdm is not None:
                iterator = tqdm.tqdm(
                    iterator, total=df.experiment_name.nunique())

        for (experiment, sub_df) in iterator:
            predictions_df = pandas.DataFrame(index=sub_df.index)
            experiment_peptides = EncodableSequences.create(
                sub_df.peptide.values)
            for allele in alleles[experiment]:
                predictions_df[allele] = self.affinity_predictor.predict(
                    peptides=experiment_peptides,
                    allele=allele,
                    model_kwargs={'batch_size': PREDICT_BATCH_SIZE},
                    throw=throw)
            df.loc[
                sub_df.index, "affinity"
            ] = predictions_df.min(axis=1).values
            df.loc[
                sub_df.index, "best_allele"
            ] = predictions_df.idxmin(axis=1).values

        result_df = df

    if include_affinity_percentile:
        # Must read from result_df: in the all-samples branch `df` carries
        # no affinity / best_allele columns.
        result_df["affinity_percentile"] = (
            self.affinity_predictor.percentile_ranks(
                result_df.affinity.values,
                alleles=result_df.best_allele.values,
                throw=False))

    return result_df
def predict_processing(
self, peptides, n_flanks=None, c_flanks=None, verbose=1):
"""
Predict antigen processing scores for individual peptides, optionally
including flanking sequences for better cleavage prediction.
Parameters
----------
peptides : list of string
n_flanks : list of string [same length as peptides]
c_flanks : list of string [same length as peptides]
verbose : int
Returns
-------
numpy.array : Antigen processing scores for each peptide
"""
if (n_flanks is None) != (c_flanks is None):
raise ValueError("Specify both or neither of n_flanks, c_flanks")
......@@ -181,11 +245,26 @@ class Class1PresentationPredictor(object):
n_flanks=None,
c_flanks=None,
verbose=1):
"""
Fit the presentation score logistic regression model.
Parameters
----------
targets : list of int/float
1 indicates hit, 0 indicates decoy
peptides : list of string [same length as targets]
experiment_names : list of string [same length as targets]
alleles : dict of string -> list of string
Keys are experiment names, values are the alleles for that sample
n_flanks : list of string [same length as targets]
c_flanks : list of string [same length as targets]
verbose : int
"""
df = self.predict_affinity(
peptides=peptides,
experiment_names=experiment_names,
alleles=alleles,
experiment_names=experiment_names,
verbose=verbose)
df["affinity_score"] = from_ic50(df.affinity)
df["target"] = numpy.array(targets, copy=False)
......@@ -232,6 +311,20 @@ class Class1PresentationPredictor(object):
self._models_cache[model_name] = model
def get_model(self, name=None):
"""
Load or instantiate a new logistic regression model. Private helper
method.
Parameters
----------
name : string
If None (the default), an un-fit LR model is returned. Otherwise the
weights are loaded for the specified model.
Returns
-------
sklearn.linear_model.LogisticRegression
"""
if name is None or name not in self._models_cache:
model = sklearn.linear_model.LogisticRegression(solver="lbfgs")
if name is not None:
......@@ -255,18 +348,41 @@ class Class1PresentationPredictor(object):
"""
Predict presentation scores across a set of peptides.
Presentation scores combine predictions for MHC I binding affinity
and antigen processing.
For intermediate results, see the `predict_to_dataframe` method.
Parameters
----------
peptides : list of string, or EncodableSequences
peptides : list of string
Peptide sequences
alleles : list of string or dict of string -> list of string
experiment_names :
n_flanks
c_flanks
verbose
If you are predicting for a single sample, pass a list of strings
(up to 6) indicating the genotype. If you are predicting across
multiple samples, pass a dict where the keys are (arbitrary)
experiment names and the values are the alleles to predict for that
sample.
experiment_names : list of string [same length as peptides]
If you are passing a dict for 'alleles', use this argument to
specify which peptides go with which experiments.
n_flanks : list of string [same length as peptides]
Upstream sequences before the peptide. Sequences of any length can
be given and a suffix of the size supported by the model will be
used.
c_flanks : list of string [same length as peptides]
Downstream sequences after the peptide. Sequences of any length can
be given and a prefix of the size supported by the model will be
used.
verbose : int
Set to 0 for quiet mode.
Returns
-------
numpy.array
Presentation scores for each peptide. Scores range from 0 to 1, with
higher values indicating more favorable presentation likelihood.
"""
return self.predict_to_dataframe(
peptides=peptides,
......@@ -276,6 +392,121 @@ class Class1PresentationPredictor(object):
c_flanks=c_flanks,
verbose=verbose).presentation_score.values
def predict_to_dataframe(
        self,
        peptides,
        alleles,
        experiment_names=None,
        n_flanks=None,
        c_flanks=None,
        include_affinity_percentile=False,
        verbose=1,
        throw=True):
    """
    Predict presentation scores across a set of peptides.

    Presentation scores combine predictions for MHC I binding affinity
    and antigen processing.

    This method returns a pandas.DataFrame giving presentation scores plus
    the binding affinity and processing predictions and other intermediate
    results.

    Parameters
    ----------
    peptides : list of string
        Peptide sequences
    alleles : list of string or dict of string -> list of string
        If you are predicting for a single sample, pass a list of strings
        (at most MAX_ALLELES_PER_SAMPLE) indicating the genotype. If you
        are predicting across multiple samples, pass a dict where the keys
        are (arbitrary) experiment names and the values are the alleles to
        predict for that sample.
    experiment_names : list of string [same length as peptides]
        If you are passing a dict for 'alleles', use this argument to
        specify which peptides go with which experiments.
    n_flanks : list of string [same length as peptides]
        Upstream sequences before the peptide. Sequences of any length can
        be given and a suffix of the size supported by the model will be
        used.
    c_flanks : list of string [same length as peptides]
        Downstream sequences after the peptide. Sequences of any length can
        be given and a prefix of the size supported by the model will be
        used.
    include_affinity_percentile : bool
        Whether to include affinity percentile ranks
    verbose : int
        Set to 0 for quiet.
    throw : bool
        Whether to throw exception (vs. just log a warning) on invalid
        peptides, etc.

    Returns
    -------
    pandas.DataFrame
        Presentation scores and intermediate results.

    Raises
    ------
    TypeError
        If peptides or alleles is a single string.
    ValueError
        If alleles / experiment_names are inconsistent, too many alleles
        are given for a single sample, or only one of n_flanks / c_flanks
        is specified.
    """
    if isinstance(peptides, string_types):
        raise TypeError("peptides must be a list not a string")
    if isinstance(alleles, string_types):
        raise TypeError("alleles must be a list or dict")

    if isinstance(alleles, dict):
        if experiment_names is None:
            raise ValueError(
                "experiment_names must be supplied when alleles is a dict")
    else:
        if experiment_names is not None:
            raise ValueError(
                "alleles must be a dict when experiment_names is specified")
        # numpy.asarray instead of numpy.array(copy=False): the latter
        # raises under NumPy >= 2 when the input (e.g. a list) needs a copy.
        alleles = numpy.asarray(alleles)
        if len(alleles) > MAX_ALLELES_PER_SAMPLE:
            raise ValueError(
                "When alleles is a list, it must have at most %d elements. "
                "These alleles are taken to be a genotype for an "
                "individual, and the strongest prediction across alleles "
                "will be taken for each peptide. Note that this differs "
                "from Class1AffinityPredictor.predict(), where alleles "
                "is expected to be the same length as peptides."
                % MAX_ALLELES_PER_SAMPLE)

        # Normalize the single-sample case to the multi-sample form.
        experiment_names = ["experiment1"] * len(peptides)
        alleles = {
            "experiment1": alleles,
        }

    if (n_flanks is None) != (c_flanks is None):
        raise ValueError("Specify both or neither of n_flanks, c_flanks")

    processing_scores = self.predict_processing(
        peptides=peptides,
        n_flanks=n_flanks,
        c_flanks=c_flanks,
        verbose=verbose)

    df = self.predict_affinity(
        peptides=peptides,
        experiment_names=experiment_names,
        alleles=alleles,
        include_affinity_percentile=include_affinity_percentile,
        verbose=verbose,
        throw=throw)
    df["affinity_score"] = from_ic50(df.affinity)
    df["processing_score"] = processing_scores

    # Inserting c_flank first and then n_flank at the same position leaves
    # the columns ordered as: peptide, n_flank, c_flank, ...
    if c_flanks is not None:
        df.insert(1, "c_flank", c_flanks)
    if n_flanks is not None:
        df.insert(1, "n_flank", n_flanks)

    model_name = 'with_flanks' if n_flanks is not None else "without_flanks"
    model = self.get_model(model_name)
    df["presentation_score"] = model.predict_proba(
        df[self.model_inputs].values)[:,1]
    # affinity_score is only a model input; it is not part of the result.
    del df["affinity_score"]
    return df
def predict_sequences(
self,
sequences,
......@@ -306,8 +537,8 @@ class Class1PresentationPredictor(object):
One of:
- "best": return the strongest peptide for each sequence
- "all": return predictions for all peptides
- "filtered": return predictions stronger where comparison_quantity
is stronger than filter_value.
- "filtered": return predictions where comparison_quantity is
stronger (i.e., < for affinity, > for scores) than filter_value.
comparison_quantity : string
One of "presentation_score", "processing_score", or "affinity".
Quantity to use to rank (if result is "best") or filter (if result
......@@ -443,100 +674,14 @@ class Class1PresentationPredictor(object):
return result_df
def predict_to_dataframe(
        self,
        peptides,
        alleles,
        experiment_names=None,
        n_flanks=None,
        c_flanks=None,
        include_affinity_percentile=False,
        verbose=1,
        throw=True):
    """
    Predict presentation scores and return them with intermediate results.

    Parameters
    ----------
    peptides : list of string
        Peptide sequences
    alleles : list of string or dict of string -> list of string
        A list (at most MAX_ALLELES_PER_SAMPLE entries) is taken to be the
        genotype of a single sample. A dict maps experiment names to the
        alleles of each sample.
    experiment_names : list of string [same length as peptides]
        Required when alleles is a dict; must be None otherwise.
    n_flanks : list of string [same length as peptides]
        Upstream sequences. Specify both or neither of n_flanks, c_flanks.
    c_flanks : list of string [same length as peptides]
        Downstream sequences.
    include_affinity_percentile : bool
        Forwarded to predict_affinity.
    verbose : int
        Forwarded to predict_processing and predict_affinity.
    throw : bool
        Forwarded to predict_affinity.

    Returns
    -------
    pandas.DataFrame
        The predict_affinity result plus "processing_score" and
        "presentation_score" columns (and "n_flank" / "c_flank" when
        flanks were given).

    Raises
    ------
    TypeError
        If peptides or alleles is a single string.
    ValueError
        If alleles / experiment_names are inconsistent, too many alleles
        are given for a single sample, or only one of n_flanks / c_flanks
        is specified.
    """
    if isinstance(peptides, string_types):
        raise TypeError("peptides must be a list not a string")
    if isinstance(alleles, string_types):
        raise TypeError("alleles must be a list or dict")

    if isinstance(alleles, dict):
        if experiment_names is None:
            raise ValueError(
                "experiment_names must be supplied when alleles is a dict")
    else:
        if experiment_names is not None:
            raise ValueError(
                "alleles must be a dict when experiment_names is specified")
        # numpy.asarray instead of numpy.array(copy=False): the latter
        # raises under NumPy >= 2 when the input (e.g. a list) needs a copy.
        alleles = numpy.asarray(alleles)
        if len(alleles) > MAX_ALLELES_PER_SAMPLE:
            raise ValueError(
                "When alleles is a list, it must have at most %d elements. "
                "These alleles are taken to be a genotype for an "
                "individual, and the strongest prediction across alleles "
                "will be taken for each peptide. Note that this differs "
                "from Class1AffinityPredictor.predict(), where alleles "
                "is expected to be the same length as peptides."
                % MAX_ALLELES_PER_SAMPLE)

        # Normalize the single-sample case to the multi-sample form.
        experiment_names = ["experiment1"] * len(peptides)
        alleles = {
            "experiment1": alleles,
        }

    if (n_flanks is None) != (c_flanks is None):
        raise ValueError("Specify both or neither of n_flanks, c_flanks")

    processing_scores = self.predict_processing(
        peptides=peptides,
        n_flanks=n_flanks,
        c_flanks=c_flanks,
        verbose=verbose)

    df = self.predict_affinity(
        peptides=peptides,
        experiment_names=experiment_names,
        alleles=alleles,
        include_affinity_percentile=include_affinity_percentile,
        verbose=verbose,
        throw=throw)
    df["affinity_score"] = from_ic50(df.affinity)
    df["processing_score"] = processing_scores

    # Inserting c_flank first and then n_flank at the same position leaves
    # the columns ordered as: peptide, n_flank, c_flank, ...
    if c_flanks is not None:
        df.insert(1, "c_flank", c_flanks)
    if n_flanks is not None:
        df.insert(1, "n_flank", n_flanks)

    model_name = 'with_flanks' if n_flanks is not None else "without_flanks"
    model = self.get_model(model_name)
    df["presentation_score"] = model.predict_proba(
        df[self.model_inputs].values)[:,1]
    # affinity_score is only a model input; it is not part of the result.
    del df["affinity_score"]
    return df
def save(self, models_dir):
"""
Serialize the predictor to a directory on disk. If the directory does
Save the predictor to a directory on disk. If the directory does
not exist it will be created.
The wrapped Class1AffinityPredictor and Class1ProcessingPredictor
instances are included in the saved data.
Parameters
----------
models_dir : string
......@@ -583,6 +728,9 @@ class Class1PresentationPredictor(object):
"""
Deserialize a predictor from a directory on disk.
This will also load the wrapped Class1AffinityPredictor and
Class1ProcessingPredictor instances.
Parameters
----------
models_dir : string
......
"""
Antigen processing models
Antigen processing neural network implementation
"""
from __future__ import print_function
......@@ -7,7 +7,6 @@ from __future__ import print_function
import time
import collections
import numpy
import pandas
from .hyperparameters import HyperparameterDefaults
from .class1_neural_network import DEFAULT_PREDICT_BATCH_SIZE
......@@ -15,6 +14,9 @@ from .flanking_encoding import FlankingEncoding
class Class1ProcessingNeuralNetwork(object):
"""
A neural network for antigen processing prediction
"""
network_hyperparameter_defaults = HyperparameterDefaults(
amino_acid_encoding="BLOSUM62",
peptide_max_length=15,
......@@ -82,6 +84,16 @@ class Class1ProcessingNeuralNetwork(object):
@property
def sequence_lengths(self):
"""
Supported maximum sequence lengths
Returns
-------
dict of string -> int
Keys are "peptide", "n_flank", "c_flank". Values give the maximum
supported sequence length.
"""
return {
"peptide": self.hyperparameters['peptide_max_length'],
"n_flank": self.hyperparameters['n_flank_length'],
......@@ -119,18 +131,28 @@ class Class1ProcessingNeuralNetwork(object):
progress_preamble="",
progress_print_interval=5.0):
"""
Fit the neural network.
Parameters
----------
peptides
n_flanks
c_flanks
targets : array of {0, 1} indicating hits (1) or decoys (0)
Returns
-------
sequences : FlankingEncoding
Peptides and upstream/downstream flanking sequences
targets : list of float
1 indicates hit, 0 indicates decoy
sample_weights : list of float
If not specified all samples have equal weight.
shuffle_permutation : list of int
Permutation (integer list) of same length as sequences and targets.
If None, then a random permutation will be generated.
verbose : int
Keras verbosity level
progress_callback : function
No-argument function to call after each epoch.
progress_preamble : string
Optional string of information to include in each progress update
progress_print_interval : float
How often (in seconds) to print progress update. Set to None to
disable.
"""
x_dict = self.network_input(sequences)
......@@ -236,9 +258,35 @@ class Class1ProcessingNeuralNetwork(object):
def predict(
        self,
        peptides,
        n_flanks=None,
        c_flanks=None,
        batch_size=DEFAULT_PREDICT_BATCH_SIZE):
    """
    Predict antigen processing.

    Parameters
    ----------
    peptides : list of string
        Peptide sequences
    n_flanks : list of string [same length as peptides]
        Upstream sequence before each peptide. If None, an empty flank is
        used for every peptide.
    c_flanks : list of string [same length as peptides]
        Downstream sequence after each peptide. If None, an empty flank is
        used for every peptide.
    batch_size : int
        Prediction keras batch size.

    Returns
    -------
    numpy.array
        Processing scores. Range is 0-1, higher indicates more favorable
        processing.
    """
    # A missing flank is represented as an empty string per peptide so the
    # encoding always receives three equal-length lists.
    if n_flanks is None:
        n_flanks = [""] * len(peptides)
    if c_flanks is None:
        c_flanks = [""] * len(peptides)

    sequences = FlankingEncoding(
        peptides=peptides, n_flanks=n_flanks, c_flanks=c_flanks)
    return self.predict_encoded(sequences=sequences, batch_size=batch_size)
......@@ -248,6 +296,18 @@ class Class1ProcessingNeuralNetwork(object):
sequences,
batch_size=DEFAULT_PREDICT_BATCH_SIZE):
"""
Predict antigen processing.
Parameters
----------
sequences : FlankingEncoding
Peptides and flanking sequences
batch_size : int
Prediction keras batch size.
Returns
-------
numpy.array
"""
x_dict = self.network_input(sequences)
raw_predictions = self.network().predict(
......@@ -262,7 +322,8 @@ class Class1ProcessingNeuralNetwork(object):
Parameters
----------
peptides : EncodableSequences or list of string
sequences : FlankingEncoding
Peptides and flanking sequences
Returns
-------
......@@ -295,14 +356,13 @@ class Class1ProcessingNeuralNetwork(object):
dropout_rate,
post_convolutional_dense_layer_sizes):
"""
Helper function to make a keras network
Helper function to make a keras network given hyperparameters.
"""
# We import keras here to avoid tensorflow debug output, etc. unless we
# are actually about to use Keras.
from keras.layers import Input
import keras.layers.pooling
import keras.initializers
from keras.layers.core import Dense, Flatten, Dropout
from keras.layers.merge import Concatenate
......@@ -597,8 +657,6 @@ class Class1ProcessingNeuralNetwork(object):
config : dict
weights : list of array, optional
Network weights to restore
weights_loader : callable, optional
Function to call (no arguments) to load weights when needed
Returns
-------
......
......@@ -10,7 +10,6 @@ import json
import hashlib
import logging
import collections
from six import string_types
import numpy
import pandas
......@@ -24,11 +23,32 @@ from .common import save_weights, load_weights, NumpyJSONEncoder
class Class1ProcessingPredictor(object):
"""
User-facing interface to antigen processing prediction.
Delegates to an ensemble of Class1ProcessingNeuralNetwork instances.
"""
def __init__(
self,
models,
manifest_df=None,
metadata_dataframes=None):
"""
Instantiate a new Class1ProcessingPredictor
Users will generally call load() to restore a saved predictor rather
than using this constructor.
Parameters
----------
models : list of Class1ProcessingNeuralNetwork
Neural networks in the ensemble.
manifest_df : pandas.DataFrame
Manifest dataframe. If not specified a new one will be created when
needed.
metadata_dataframes : dict of string -> pandas.DataFrame
Arbitrary metadata associated with this predictor
"""
self.models = models
self._manifest_df = manifest_df
self.metadata_dataframes = (
......@@ -36,6 +56,22 @@ class Class1ProcessingPredictor(object):
@property
def sequence_lengths(self):
"""
Supported maximum sequence lengths.
Passing a peptide greater than the maximum supported length results
in an error.
Passing an N- or C-flank sequence greater than the maximum supported
length results in some part of it being ignored.
Returns
-------
dict of string -> int
Keys are "peptide", "n_flank", "c_flank". Values give the maximum
supported sequence length.
"""
df = pandas.DataFrame([model.sequence_lengths for model in self.models])
return {
"peptide": df.peptide.min(), # min: anything greater is error
......@@ -44,6 +80,19 @@ class Class1ProcessingPredictor(object):
}
def add_models(self, models):
"""
Add models to the ensemble (in-place).
Parameters
----------
models : list of Class1ProcessingNeuralNetwork
Returns
-------
list of string
Names of the new models.
"""
new_model_names = []
original_manifest = self.manifest_df
new_manifest_rows = []
......@@ -125,10 +174,30 @@ class Class1ProcessingPredictor(object):
def predict(
self,
peptides,
n_flanks,
c_flanks,
n_flanks=None,
c_flanks=None,
batch_size=DEFAULT_PREDICT_BATCH_SIZE):
"""
Predict antigen processing.
Parameters
----------
peptides : list of string
Peptide sequences
n_flanks : list of string
Upstream sequence before each peptide
c_flanks : list of string
Downstream sequence after each peptide
batch_size : int
Prediction keras batch size.
Returns
-------
numpy.array
Processing scores. Range is 0-1, higher indicates more favorable
processing.
"""
return self.predict_to_dataframe(
peptides=peptides,
n_flanks=n_flanks,
......@@ -138,9 +207,26 @@ class Class1ProcessingPredictor(object):
def predict_to_dataframe(
self,
peptides,
n_flanks,
c_flanks,
n_flanks=None,
c_flanks=None,
batch_size=DEFAULT_PREDICT_BATCH_SIZE):
"""
Predict antigen processing.
See `predict` method for parameter descriptions.
Returns
-------
pandas.DataFrame
Processing predictions are in the "score" column. Also includes
peptides and flanking sequences.
"""
if n_flanks is None:
n_flanks = [""] * len(peptides)
if c_flanks is None:
c_flanks = [""] * len(peptides)
sequences = FlankingEncoding(
peptides=peptides, n_flanks=n_flanks, c_flanks=c_flanks)
......@@ -149,6 +235,20 @@ class Class1ProcessingPredictor(object):
def predict_to_dataframe_encoded(
self, sequences, batch_size=DEFAULT_PREDICT_BATCH_SIZE):
"""
Predict antigen processing.
See `predict` method for more information.
Parameters
----------
sequences : FlankingEncoding
batch_size : int
Returns
-------
pandas.DataFrame
"""
score_array = []
......
......@@ -210,6 +210,9 @@ def load_weights(filename):
class NumpyJSONEncoder(json.JSONEncoder):
"""
JSON encoder (used with json module) that can handle numpy arrays.
"""
def default(self, obj):
if isinstance(obj, (
numpy.int_, numpy.intc, numpy.intp, numpy.int8,
......@@ -222,4 +225,4 @@ class NumpyJSONEncoder(json.JSONEncoder):
return float(obj)
if isinstance(obj, numpy.ndarray):
return obj.tolist()
return json.JSONEncoder.default(self, obj)
\ No newline at end of file
return json.JSONEncoder.default(self, obj)
......@@ -141,11 +141,20 @@ class EncodableSequences(object):
encodings is given by available_vector_encodings().
alignment_method : string
One of "pad_middle" or "left_pad_right_pad"
left_edge : int, size of fixed-position left side
left_edge : int
Size of fixed-position left side.
Only relevant for pad_middle alignment method
right_edge : int, size of the fixed-position right side
right_edge : int
Size of the fixed-position right side.
Only relevant for pad_middle alignment method
max_length : maximum supported peptide length
max_length : int
Maximum supported peptide length
trim : bool
If True, longer sequences will be trimmed to fit the maximum
supported length. Not supported for all alignment methods.
allow_unsupported_amino_acids : bool
If True, non-canonical amino acids will be replaced with the X
character before encoding.
Returns
-------
......@@ -237,11 +246,20 @@ class EncodableSequences(object):
sequences : list of string
alignment_method : string
One of "pad_middle" or "left_pad_right_pad"
left_edge : int, size of fixed-position left side
left_edge : int
Size of fixed-position left side.
Only relevant for pad_middle alignment method
right_edge : int, size of the fixed-position right side
right_edge : int
Size of the fixed-position right side.
Only relevant for pad_middle alignment method
max_length : maximum supported peptide length
max_length : int
Maximum supported peptide length
trim : bool
If True, longer sequences will be trimmed to fit the maximum
supported length. Not supported for all alignment methods.
allow_unsupported_amino_acids : bool
If True, non-canonical amino acids will be replaced with the X
character before encoding.
Returns
-------
......
......@@ -17,12 +17,33 @@ import pandas
EncodingResult = namedtuple(
"EncodingResult", ["array", "peptide_lengths"])
class FlankingEncoding(object):
"""
Encode peptides and optionally their N- and C-flanking sequences into fixed
size numerical matrices. Similar to EncodableSequences but with support
for flanking sequences and the encoding scheme used by the processing
predictor.
Instances of this class have an immutable list of peptides with
flanking sequences. Encodings are cached in the instances for faster
performance when the same set of peptides needs to be encoded more than once.
"""
unknown_character = "X"
def __init__(self, peptides, n_flanks, c_flanks):
"""
Constructor. Sequences of any lengths can be passed.
Parameters
----------
peptides : list of string
Peptide sequences
n_flanks : list of string [same length as peptides]
Upstream sequences
c_flanks : list of string [same length as peptides]
Downstream sequences
"""
self.dataframe = pandas.DataFrame({
"peptide": peptides,
"n_flank": n_flanks,
......@@ -31,6 +52,9 @@ class FlankingEncoding(object):
self.encoding_cache = {}
def __len__(self):
"""
Number of peptides.
"""
return len(self.dataframe)
def vector_encode(
......@@ -41,35 +65,31 @@ class FlankingEncoding(object):
c_flank_length,
allow_unsupported_amino_acids=True):
"""
Encode variable-length sequences to a fixed-size matrix. Amino acids
are encoded as specified by the vector_encoding_name argument.
See `sequences_to_fixed_length_index_encoded_array` for details.
See also: variable_length_to_fixed_length_categorical.
Encode variable-length sequences to a fixed-size matrix.
Parameters
----------
vector_encoding_name : string
How to represent amino acids.
One of "BLOSUM62", "one-hot", etc. Full list of supported vector
encodings is given by available_vector_encodings().
alignment_method : string
One of "pad_middle" or "left_pad_right_pad"
left_edge : int, size of fixed-position left side
Only relevant for pad_middle alignment method
right_edge : int, size of the fixed-position right side
Only relevant for pad_middle alignment method
max_length : maximum supported peptide length
How to represent amino acids. One of "BLOSUM62", "one-hot", etc.
See `amino_acid.available_vector_encodings()`.
peptide_max_length : int
Maximum supported peptide length.
n_flank_length : int
Maximum supported N-flank length
c_flank_length : int
Maximum supported C-flank length
allow_unsupported_amino_acids : bool
If True, non-canonical amino acids will be replaced with the X
character before encoding.
Returns
-------
numpy.array with shape (num sequences, encoded length, m)
numpy.array with shape (num sequences, length, m)
where
- num sequences is number of peptides, i.e. len(self)
- length is peptide_max_length + n_flank_length + c_flank_length
- m is the vector encoding length (usually 21).
- encoded length is max_length if alignment_method is pad_middle;
3 * max_length if it's left_pad_right_pad.
"""
cache_key = (
"vector_encode",
......@@ -91,13 +111,29 @@ class FlankingEncoding(object):
@staticmethod
def encode(
vector_encoding_name,
df,
peptide_max_length,
n_flank_length,
c_flank_length,
allow_unsupported_amino_acids=False):
vector_encoding_name,
df,
peptide_max_length,
n_flank_length,
c_flank_length,
allow_unsupported_amino_acids=False):
"""
Encode variable-length sequences to a fixed-size matrix.
Helper function. Users should use `vector_encode`.
Parameters
----------
vector_encoding_name : string
df : pandas.DataFrame
peptide_max_length : int
n_flank_length : int
c_flank_length : int
allow_unsupported_amino_acids : bool
Returns
-------
numpy.array
"""
error_df = df.loc[
(df.peptide.str.len() > peptide_max_length) |
......
import numpy
from copy import copy
from .allele_encoding import AlleleEncoding
class MultipleAlleleEncoding(object):
    """
    Allele encoding for samples ("experiments") that each carry several
    alleles.

    Wraps a single flat AlleleEncoding in which every sample occupies exactly
    `max_alleles_per_experiment` consecutive entries; unused entries are None.
    """

    def __init__(
            self,
            experiment_names=None,
            experiment_to_allele_list=None,
            max_alleles_per_experiment=6,
            allele_to_sequence=None,
            borrow_from=None):
        """
        Parameters
        ----------
        experiment_names : list of string
            Experiment name for each row. Defaults to no rows.
        experiment_to_allele_list : dict of string -> list of string
            Alleles for each experiment. Each list must have between 1 and
            max_alleles_per_experiment entries.
        max_alleles_per_experiment : int
            Number of allele slots per row.
        allele_to_sequence : passed through to AlleleEncoding
        borrow_from : passed through to AlleleEncoding
        """
        # None sentinels instead of mutable default arguments.
        if experiment_names is None:
            experiment_names = []
        if experiment_to_allele_list is None:
            experiment_to_allele_list = {}

        padded_experiment_to_allele_list = {}
        for (name, alleles) in experiment_to_allele_list.items():
            # list() so tuples and arrays can be padded by concatenation.
            alleles = list(alleles)
            assert len(alleles) > 0
            assert len(alleles) <= max_alleles_per_experiment
            alleles_with_mask = alleles + [None] * (
                max_alleles_per_experiment - len(alleles))
            padded_experiment_to_allele_list[name] = alleles_with_mask

        flattened_allele_list = []
        for name in experiment_names:
            flattened_allele_list.extend(padded_experiment_to_allele_list[name])

        self.allele_encoding = AlleleEncoding(
            alleles=flattened_allele_list,
            allele_to_sequence=allele_to_sequence,
            borrow_from=borrow_from
        )
        self.max_alleles_per_experiment = max_alleles_per_experiment
        self.experiment_names = numpy.array(experiment_names)

    def append_alleles(self, alleles):
        """
        Append one new row per given allele (in-place).

        Each new row holds the allele in its first slot and None in the
        remaining slots; its experiment name is None.

        Parameters
        ----------
        alleles : list of string
        """
        extended_alleles = list(self.allele_encoding.alleles)
        for allele in alleles:
            extended_alleles.append(allele)
            extended_alleles.extend(
                [None] * (self.max_alleles_per_experiment - 1))

        assert len(extended_alleles) % self.max_alleles_per_experiment == 0, (
            len(extended_alleles))

        self.allele_encoding = AlleleEncoding(
            alleles=extended_alleles,
            borrow_from=self.allele_encoding)

        self.experiment_names = numpy.concatenate([
            self.experiment_names,
            numpy.tile(None, len(alleles))
        ])

    @property
    def indices(self):
        """
        Allele indices reshaped to (num rows, max_alleles_per_experiment).
        """
        return self.allele_encoding.indices.values.reshape(
            (-1, self.max_alleles_per_experiment))

    @property
    def alleles(self):
        """
        Allele names reshaped to (num rows, max_alleles_per_experiment).
        """
        return numpy.reshape(
            self.allele_encoding.alleles.values,
            (-1, self.max_alleles_per_experiment))

    def compact(self):
        """
        Return a shallow copy whose wrapped AlleleEncoding has been replaced
        by the result of its compact() method.
        """
        result = copy(self)
        result.allele_encoding = self.allele_encoding.compact()
        return result

    def allele_representations(self, encoding_name):
        """
        Delegate to the wrapped AlleleEncoding.allele_representations.
        """
        return self.allele_encoding.allele_representations(encoding_name)

    @property
    def allele_to_sequence(self):
        """
        The allele_to_sequence attribute of the wrapped AlleleEncoding.
        """
        return self.allele_encoding.allele_to_sequence

    def fixed_length_vector_encoded_sequences(self, encoding_name):
        """
        Not supported for multiple-allele encodings.
        """
        raise NotImplementedError()

    def shuffle_in_place(self, shuffle_permutation=None):
        """
        Reorder rows (in-place), keeping each row's alleles together.

        Parameters
        ----------
        shuffle_permutation : list of int
            Row permutation. If None, a random permutation is generated.
        """
        alleles_matrix = self.alleles
        if shuffle_permutation is None:
            shuffle_permutation = numpy.random.permutation(len(alleles_matrix))
        self.allele_encoding = AlleleEncoding(
            alleles=alleles_matrix[shuffle_permutation].flatten(),
            borrow_from=self.allele_encoding
        )
        self.experiment_names = self.experiment_names[shuffle_permutation]
\ No newline at end of file
'''
Run MHCflurry predictor on specified peptide/allele pairs.
Run MHCflurry predictor on specified peptides.
By default, the presentation predictor is used, and predictions for
MHC I binding affinity, antigen processing, and the composite presentation score
are returned. If you just want binding affinity predictions, pass
--affinity-only.
Examples:
Write a CSV file containing the contents of INPUT.csv plus an
additional column giving MHCflurry binding affinity predictions:
Write a CSV file containing the contents of INPUT.csv plus additional columns
giving MHCflurry predictions:
$ mhcflurry-predict INPUT.csv --out RESULT.csv
The input CSV file is expected to contain columns ``allele`` and ``peptide``.
The predictions are written to a column called ``mhcflurry_prediction``.
These default column names may be changed with the `--allele-column`,
`--peptide-column`, and `--prediction-column` options.
The input CSV file is expected to contain columns "allele", "peptide", and,
optionally, "n_flank", and "c_flank".
If `--out` is not specified, results are written to standard out.
If `--out` is not specified, results are written to stdout.
You can also run on alleles and peptides specified on the commandline, in
which case predictions are written for all combinations of alleles and
......@@ -35,8 +38,7 @@ import os
import pandas
from .common import set_keras_backend
from .downloads import get_default_class1_models_dir, get_default_class1_presentation_models_dir
from .downloads import get_default_class1_presentation_models_dir
from .class1_affinity_predictor import Class1AffinityPredictor
from .class1_presentation_predictor import Class1PresentationPredictor
from .version import __version__
......@@ -148,8 +150,10 @@ model_args.add_argument(
"--models",
metavar="DIR",
default=None,
help="Directory containing models. "
"Default: %s" % get_default_class1_models_dir(test_exists=False))
help="Directory containing models. Either a binding affinity predictor or "
"a presentation predictor can be used. "
"Default: %s" % get_default_class1_presentation_models_dir(
test_exists=False))
model_args.add_argument(
"--affinity-only",
action="store_true",
......@@ -161,6 +165,7 @@ model_args.add_argument(
default=False,
help="Do not use flanking sequence information even when available")
def run(argv=sys.argv[1:]):
logging.getLogger('tensorflow').disabled = True
......@@ -195,8 +200,6 @@ def run(argv=sys.argv[1:]):
"--affinity-only. Specify this argument to silence this warning.")
args.affinity_only = True
# The following two are informative commands that can come
# if a wrapper would like to incorporate input validation.
if args.list_supported_alleles:
print("\n".join(predictor.supported_alleles))
return
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment