Skip to content
Snippets Groups Projects
Commit 9cce08f2 authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

add sequence scanning to presentation predictor

parent 64e7f044
No related branches found
No related tags found
No related merge requests found
...@@ -240,6 +240,111 @@ class Class1PresentationPredictor(object): ...@@ -240,6 +240,111 @@ class Class1PresentationPredictor(object):
c_flanks=c_flanks, c_flanks=c_flanks,
verbose=verbose).presentation_score.values verbose=verbose).presentation_score.values
def predict_scan(
        self,
        sequences,
        alleles,
        result="best",  # or "all" or "filtered"
        comparison_quantity="presentation_score",
        comparison_value=None,
        peptide_lengths=(8, 9, 10, 11),
        use_flanks=True,
        include_affinity_percentile=False,
        verbose=1,
        throw=True):
    """
    Scan protein sequences for predicted-presented peptides.

    Enumerates every subsequence of the requested lengths from each input
    sequence (together with its flanking residues when use_flanks is set),
    runs the presentation predictor over all of them, and returns the
    results according to `result`.

    Parameters
    ----------
    sequences : str, list of str, or dict of str -> str
        Protein sequence(s) to scan. A dict maps sequence names to
        sequences; otherwise names "sequence_0001", ... are generated.
    alleles : list of str, or dict of str -> list of str
        Alleles to predict across. A dict maps sequence names to allele
        lists; a plain list is applied to every sequence.
    result : str
        "best" (highest-scoring peptide per sequence), "all" (every
        peptide), or "filtered" (peptides passing comparison_value).
    comparison_quantity : str
        Column used for sorting/filtering. Quantities ending in "score"
        are treated as higher-is-better; others (e.g. affinity) as
        lower-is-better.
    comparison_value : float
        Threshold for result="filtered". Required in that case.
    peptide_lengths : sequence of int
        Peptide lengths to scan over.
    use_flanks : bool
        Use the with-flanks processing predictor when available.
    include_affinity_percentile : bool
        Include affinity percentile ranks in the output.
    verbose : int
        Verbosity passed through to the underlying predictors.
    throw : bool
        Raise on prediction errors; otherwise return NaNs.

    Returns
    -------
    pandas.DataFrame with one row per (peptide, sequence) prediction.

    Raises
    ------
    ValueError
        On missing allele entries, non-string sequences, an unknown
        `result`, or result="filtered" without a comparison_value.
    """
    # Prefer the with-flanks processing model; fall back when flanks are
    # disabled or that model was not loaded.
    processing_predictor = self.processing_predictor_with_flanks
    if not use_flanks or processing_predictor is None:
        processing_predictor = self.processing_predictor_without_flanks

    supported_sequence_lengths = processing_predictor.sequence_lengths
    n_flank_length = supported_sequence_lengths["n_flank"]
    c_flank_length = supported_sequence_lengths["c_flank"]

    sequence_names = []
    n_flanks = [] if use_flanks else None
    c_flanks = [] if use_flanks else None
    peptides = []

    if isinstance(sequences, string_types):
        sequences = [sequences]

    if not isinstance(sequences, dict):
        # Assign stable generated names so results can be grouped.
        sequences = collections.OrderedDict(
            ("sequence_%04d" % (i + 1), sequence)
            for (i, sequence) in enumerate(sequences))

    if not isinstance(alleles, dict):
        # Same allele list for every sequence.
        alleles = dict((name, alleles) for name in sequences.keys())

    missing = [key for key in sequences if key not in alleles]
    if missing:
        # Fixed: format the missing names into the message instead of
        # passing them as a second exception argument.
        raise ValueError(
            "Sequence names missing from alleles dict: %s" % missing)

    if result == "filtered" and comparison_value is None:
        raise ValueError(
            "comparison_value is required when result='filtered'")

    min_peptide_length = min(peptide_lengths)

    for (name, sequence) in sequences.items():
        if not isinstance(sequence, string_types):
            raise ValueError("Expected string, not %s (%s)" % (
                sequence, type(sequence)))
        # "+ 1" fixes an off-by-one: the final valid start position
        # (yielding the shortest peptide ending at the last residue) was
        # previously skipped.
        for peptide_start in range(len(sequence) - min_peptide_length + 1):
            n_flank_start = max(0, peptide_start - n_flank_length)
            for peptide_length in peptide_lengths:
                peptide_end = peptide_start + peptide_length
                if peptide_end > len(sequence):
                    # Slicing past the end would silently yield a
                    # truncated peptide of the wrong length; skip it.
                    continue
                c_flank_end = peptide_end + c_flank_length
                sequence_names.append(name)
                peptides.append(sequence[peptide_start : peptide_end])
                if use_flanks:
                    n_flanks.append(
                        sequence[n_flank_start : peptide_start])
                    c_flanks.append(
                        sequence[peptide_end : c_flank_end])

    result_df = self.predict_to_dataframe(
        peptides=peptides,
        alleles=alleles,
        n_flanks=n_flanks,
        c_flanks=c_flanks,
        experiment_names=sequence_names,
        include_affinity_percentile=include_affinity_percentile,
        verbose=verbose,
        throw=throw)

    result_df = result_df.rename(
        columns={"experiment_name": "sequence_name"})

    # Score-like quantities sort descending (higher is better); affinity
    # and similar sort ascending (lower is better).
    comparison_is_score = comparison_quantity.endswith("score")

    result_df = result_df.sort_values(
        comparison_quantity,
        ascending=not comparison_is_score)

    if result == "best":
        result_df = result_df.drop_duplicates(
            "sequence_name", keep="first").sort_values("sequence_name")
    elif result == "filtered":
        if comparison_is_score:
            result_df = result_df.loc[
                result_df[comparison_quantity] >= comparison_value
            ]
        else:
            result_df = result_df.loc[
                result_df[comparison_quantity] <= comparison_value
            ]
    elif result == "all":
        pass
    else:
        raise ValueError(
            "Unknown result: %s. Valid choices are: best, filtered, all"
            % result)

    result_df = result_df.reset_index(drop=True)
    result_df = result_df.copy()
    return result_df
def predict_to_dataframe( def predict_to_dataframe(
self, self,
peptides, peptides,
...@@ -298,7 +403,10 @@ class Class1PresentationPredictor(object): ...@@ -298,7 +403,10 @@ class Class1PresentationPredictor(object):
throw=throw) throw=throw)
df["affinity_score"] = from_ic50(df.affinity) df["affinity_score"] = from_ic50(df.affinity)
df["processing_score"] = processing_scores df["processing_score"] = processing_scores
if c_flanks is not None:
df.insert(1, "c_flank", c_flanks)
if n_flanks is not None:
df.insert(1, "n_flank", n_flanks)
model_name = 'with_flanks' if n_flanks is not None else "without_flanks" model_name = 'with_flanks' if n_flanks is not None else "without_flanks"
model = self.get_model(model_name) model = self.get_model(model_name)
...@@ -383,12 +491,20 @@ class Class1PresentationPredictor(object): ...@@ -383,12 +491,20 @@ class Class1PresentationPredictor(object):
processing_predictor_with_flanks = Class1ProcessingPredictor.load( processing_predictor_with_flanks = Class1ProcessingPredictor.load(
join(models_dir, "processing_predictor_with_flanks"), join(models_dir, "processing_predictor_with_flanks"),
max_models=max_models) max_models=max_models)
else:
logging.warning(
"Presentation predictor is missing processing predictor: %s",
join(models_dir, "processing_predictor_with_flanks"))
processing_predictor_without_flanks = None processing_predictor_without_flanks = None
if exists(join(models_dir, "processing_predictor_without_flanks")): if exists(join(models_dir, "processing_predictor_without_flanks")):
processing_predictor_without_flanks = Class1ProcessingPredictor.load( processing_predictor_without_flanks = Class1ProcessingPredictor.load(
join(models_dir, "processing_predictor_without_flanks"), join(models_dir, "processing_predictor_without_flanks"),
max_models=max_models) max_models=max_models)
else:
logging.warning(
"Presentation predictor is missing processing predictor: %s",
join(models_dir, "processing_predictor_without_flanks"))
weights_dataframe = pandas.read_csv( weights_dataframe = pandas.read_csv(
join(models_dir, "weights.csv"), join(models_dir, "weights.csv"),
......
...@@ -80,6 +80,14 @@ class Class1ProcessingNeuralNetwork(object): ...@@ -80,6 +80,14 @@ class Class1ProcessingNeuralNetwork(object):
self.network_weights = None self.network_weights = None
self.fit_info = [] self.fit_info = []
@property
def sequence_lengths(self):
    """
    Sequence lengths this network was configured with.

    Returns
    -------
    dict with keys "peptide", "n_flank", "c_flank", each mapping to the
    corresponding length (int) from the network's hyperparameters.
    """
    hyperparameters = self.hyperparameters
    return dict(
        peptide=hyperparameters['peptide_max_length'],
        n_flank=hyperparameters['n_flank_length'],
        c_flank=hyperparameters['c_flank_length'])
def network(self): def network(self):
""" """
Return the keras model associated with this network. Return the keras model associated with this network.
......
...@@ -34,6 +34,15 @@ class Class1ProcessingPredictor(object): ...@@ -34,6 +34,15 @@ class Class1ProcessingPredictor(object):
self.metadata_dataframes = ( self.metadata_dataframes = (
dict(metadata_dataframes) if metadata_dataframes else {}) dict(metadata_dataframes) if metadata_dataframes else {})
@property
def sequence_lengths(self):
    """
    Sequence lengths supported across the ensemble's models.

    Returns
    -------
    dict with keys "peptide", "n_flank", "c_flank". The peptide length
    is the minimum over models (anything greater is an error for at
    least one model); the flank lengths are the maximum over models
    (anything greater is ignored).
    """
    per_model = pandas.DataFrame(
        [model.sequence_lengths for model in self.models])
    return {
        "peptide": per_model["peptide"].min(),
        "n_flank": per_model["n_flank"].max(),
        "c_flank": per_model["c_flank"].max(),
    }
def add_models(self, models): def add_models(self, models):
new_model_names = [] new_model_names = []
original_manifest = self.manifest_df original_manifest = self.manifest_df
......
...@@ -26,12 +26,14 @@ from . import data_path ...@@ -26,12 +26,14 @@ from . import data_path
AFFINITY_PREDICTOR = None AFFINITY_PREDICTOR = None
CLEAVAGE_PREDICTOR = None CLEAVAGE_PREDICTOR = None
CLEAVAGE_PREDICTOR_NO_FLANKING = None CLEAVAGE_PREDICTOR_NO_FLANKING = None
PRESENTATION_PREDICTOR = None
def setup(): def setup():
global AFFINITY_PREDICTOR global AFFINITY_PREDICTOR
global CLEAVAGE_PREDICTOR global CLEAVAGE_PREDICTOR
global CLEAVAGE_PREDICTOR_NO_FLANKING global CLEAVAGE_PREDICTOR_NO_FLANKING
global PRESENTATION_PREDICTOR
startup() startup()
AFFINITY_PREDICTOR = Class1AffinityPredictor.load( AFFINITY_PREDICTOR = Class1AffinityPredictor.load(
get_path("models_class1_pan", "models.combined"), get_path("models_class1_pan", "models.combined"),
...@@ -42,15 +44,18 @@ def setup(): ...@@ -42,15 +44,18 @@ def setup():
CLEAVAGE_PREDICTOR_NO_FLANKING = Class1ProcessingPredictor.load( CLEAVAGE_PREDICTOR_NO_FLANKING = Class1ProcessingPredictor.load(
get_path("models_class1_processing_variants", "models.selected.no_flank"), get_path("models_class1_processing_variants", "models.selected.no_flank"),
max_models=1) max_models=1)
PRESENTATION_PREDICTOR = Class1PresentationPredictor.load()
def teardown(): def teardown():
global AFFINITY_PREDICTOR global AFFINITY_PREDICTOR
global CLEAVAGE_PREDICTOR global CLEAVAGE_PREDICTOR
global CLEAVAGE_PREDICTOR_NO_FLANKING global CLEAVAGE_PREDICTOR_NO_FLANKING
global PRESENTATION_PREDICTOR
AFFINITY_PREDICTOR = None AFFINITY_PREDICTOR = None
CLEAVAGE_PREDICTOR = None CLEAVAGE_PREDICTOR = None
CLEAVAGE_PREDICTOR_NO_FLANKING = None CLEAVAGE_PREDICTOR_NO_FLANKING = None
PRESENTATION_PREDICTOR = None
cleanup() cleanup()
...@@ -126,3 +131,92 @@ def test_basic(): ...@@ -126,3 +131,92 @@ def test_basic():
test_df["prediction1"], other_test_df["prediction1"], decimal=6) test_df["prediction1"], other_test_df["prediction1"], decimal=6)
numpy.testing.assert_array_almost_equal( numpy.testing.assert_array_almost_equal(
test_df["prediction2"], other_test_df["prediction2"], decimal=6) test_df["prediction2"], other_test_df["prediction2"], decimal=6)
def test_downloaded_predictor():
    """Exercise Class1PresentationPredictor.predict_scan end to end."""
    global PRESENTATION_PREDICTOR

    # Shared fixtures: the same alleles and sequences are used for each
    # predict_scan invocation below.
    alleles = [
        "HLA-A*02:01",
        "HLA-A*03:01",
        "HLA-B*57:01",
        "HLA-B*44:02",
        "HLA-C*02:01",
        "HLA-C*07:01",
    ]
    sequences = {
        "seq1": "MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHLKDGTCGLVEVEKGVLPQLE",
        "seq2": "QPYVFIKRSDARTAPHGHVMVELVAELEGIQYGRSGETLGVLVPHVGEIPVAYRKVLLRKNGNKG",
        "seq3": "AGGHSYGADLKSFDLGDELGTDPYEDFQENWNTKHSSGVTRELMRELNGGAYTRYVDNNFCGPDG",
    }

    # Default result ("best"): one top peptide per input sequence.
    results_best = PRESENTATION_PREDICTOR.predict_scan(
        sequences=list(sequences.values()),
        alleles=alleles)
    print(results_best)
    assert_equal(len(results_best), 3)
    assert (results_best.affinity < 200).all()
    assert (results_best.presentation_score > 0.7).all()

    # Filtered by an affinity (lower-is-better) threshold.
    results_affinity = PRESENTATION_PREDICTOR.predict_scan(
        result="filtered",
        comparison_value=500,
        comparison_quantity="affinity",
        sequences=sequences,
        alleles=alleles)
    print(results_affinity)
    assert len(results_affinity) > 10
    assert (results_affinity.affinity <= 500).all()

    # Filtered by a presentation-score (higher-is-better) threshold.
    results_score = PRESENTATION_PREDICTOR.predict_scan(
        result="filtered",
        comparison_value=0.9,
        comparison_quantity="presentation_score",
        sequences=sequences,
        alleles=alleles)
    print(results_score)
    assert len(results_score) > 5, len(results_score)
    assert (results_score.presentation_score >= 0.9).all()

    # All peptides, sorted ascending by affinity.
    results_all = PRESENTATION_PREDICTOR.predict_scan(
        result="all",
        comparison_quantity="affinity",
        sequences=sequences,
        alleles=alleles)
    print(results_all)
    assert len(results_all) > 200, len(results_all)
    assert_less(results_all.iloc[0].affinity, 100)
...@@ -67,3 +67,4 @@ def test_basic(): ...@@ -67,3 +67,4 @@ def test_basic():
n_flanks=df.n_flank.values, n_flanks=df.n_flank.values,
c_flanks=df.c_flank.values) c_flanks=df.c_flank.values)
assert_array_equal(df.score.values, df3.score.values) assert_array_equal(df.score.values, df3.score.values)
Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment