Commit 9cce08f2 authored by Tim O'Donnell

add sequence scanning to presentation predictor

parent 64e7f044
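A minimal usage sketch of the sequence-scanning API introduced in this commit (illustrative, not part of the diff). The sequences and alleles mirror the test suite below; the top-level import path is assumed, and the class 1 presentation models are assumed to already be downloaded locally so that Class1PresentationPredictor.load() succeeds with its default models directory.

# Illustrative sketch only, not part of this commit.
from mhcflurry import Class1PresentationPredictor  # top-level import path assumed

predictor = Class1PresentationPredictor.load()

# Scan one 65-mer for predicted MHC class I ligands. With result="best",
# predict_scan returns the top peptide per input sequence, ranked by
# presentation_score.
results = predictor.predict_scan(
    sequences={
        "seq1": "MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHLKDGTCGLVEVEKGVLPQLE",
    },
    alleles=["HLA-A*02:01", "HLA-B*57:01", "HLA-C*07:01"],
    result="best")
print(results)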
@@ -240,6 +240,111 @@ class Class1PresentationPredictor(object):
            c_flanks=c_flanks,
            verbose=verbose).presentation_score.values

    def predict_scan(
            self,
            sequences,
            alleles,
            result="best",  # or "all" or "filtered"
            comparison_quantity="presentation_score",
            comparison_value=None,
            peptide_lengths=[8, 9, 10, 11],
            use_flanks=True,
            include_affinity_percentile=False,
            verbose=1,
            throw=True):
        processing_predictor = self.processing_predictor_with_flanks
        if not use_flanks or processing_predictor is None:
            processing_predictor = self.processing_predictor_without_flanks

        supported_sequence_lengths = processing_predictor.sequence_lengths
        n_flank_length = supported_sequence_lengths["n_flank"]
        c_flank_length = supported_sequence_lengths["c_flank"]

        sequence_names = []
        n_flanks = [] if use_flanks else None
        c_flanks = [] if use_flanks else None
        peptides = []

        if isinstance(sequences, string_types):
            sequences = [sequences]

        if not isinstance(sequences, dict):
            sequences = collections.OrderedDict(
                ("sequence_%04d" % (i + 1), sequence)
                for (i, sequence) in enumerate(sequences))

        if not isinstance(alleles, dict):
            alleles = dict((name, alleles) for name in sequences.keys())

        missing = [key for key in sequences if key not in alleles]
        if missing:
            raise ValueError(
                "Sequence names missing from alleles dict: ", missing)

        for (name, sequence) in sequences.items():
            if not isinstance(sequence, string_types):
                raise ValueError("Expected string, not %s (%s)" % (
                    sequence, type(sequence)))
            for peptide_start in range(len(sequence) - min(peptide_lengths)):
                n_flank_start = max(0, peptide_start - n_flank_length)
                for peptide_length in peptide_lengths:
                    c_flank_end = (
                        peptide_start + peptide_length + c_flank_length)
                    sequence_names.append(name)
                    peptides.append(
                        sequence[peptide_start: peptide_start + peptide_length])
                    if use_flanks:
                        n_flanks.append(
                            sequence[n_flank_start : peptide_start])
                        c_flanks.append(
                            sequence[peptide_start + peptide_length : c_flank_end])

        result_df = self.predict_to_dataframe(
            peptides=peptides,
            alleles=alleles,
            n_flanks=n_flanks,
            c_flanks=c_flanks,
            experiment_names=sequence_names,
            include_affinity_percentile=include_affinity_percentile,
            verbose=verbose,
            throw=throw)

        result_df = result_df.rename(
            columns={"experiment_name": "sequence_name"})

        comparison_is_score = comparison_quantity.endswith("score")

        result_df = result_df.sort_values(
            comparison_quantity,
            ascending=not comparison_is_score)

        if result == "best":
            result_df = result_df.drop_duplicates(
                "sequence_name", keep="first").sort_values("sequence_name")
        elif result == "filtered":
            if comparison_is_score:
                result_df = result_df.loc[
                    result_df[comparison_quantity] >= comparison_value
                ]
            else:
                result_df = result_df.loc[
                    result_df[comparison_quantity] <= comparison_value
                ]
        elif result == "all":
            pass
        else:
            raise ValueError(
                "Unknown result: %s. Valid choices are: best, filtered, all"
                % result)

        result_df = result_df.reset_index(drop=True)
        result_df = result_df.copy()
        return result_df

    def predict_to_dataframe(
            self,
            peptides,
@@ -298,7 +403,10 @@ class Class1PresentationPredictor(object):
            throw=throw)

        df["affinity_score"] = from_ic50(df.affinity)
        df["processing_score"] = processing_scores
        if c_flanks is not None:
            df.insert(1, "c_flank", c_flanks)
        if n_flanks is not None:
            df.insert(1, "n_flank", n_flanks)

        model_name = 'with_flanks' if n_flanks is not None else "without_flanks"
        model = self.get_model(model_name)
@@ -383,12 +491,20 @@ class Class1PresentationPredictor(object):
            processing_predictor_with_flanks = Class1ProcessingPredictor.load(
                join(models_dir, "processing_predictor_with_flanks"),
                max_models=max_models)
        else:
            logging.warning(
                "Presentation predictor is missing processing predictor: %s",
                join(models_dir, "processing_predictor_with_flanks"))

        processing_predictor_without_flanks = None
        if exists(join(models_dir, "processing_predictor_without_flanks")):
            processing_predictor_without_flanks = Class1ProcessingPredictor.load(
                join(models_dir, "processing_predictor_without_flanks"),
                max_models=max_models)
        else:
            logging.warning(
                "Presentation predictor is missing processing predictor: %s",
                join(models_dir, "processing_predictor_without_flanks"))

        weights_dataframe = pandas.read_csv(
            join(models_dir, "weights.csv"),
......
@@ -80,6 +80,14 @@ class Class1ProcessingNeuralNetwork(object):
        self.network_weights = None
        self.fit_info = []

    @property
    def sequence_lengths(self):
        return {
            "peptide": self.hyperparameters['peptide_max_length'],
            "n_flank": self.hyperparameters['n_flank_length'],
            "c_flank": self.hyperparameters['c_flank_length'],
        }

    def network(self):
        """
        Return the keras model associated with this network.
......
@@ -34,6 +34,15 @@ class Class1ProcessingPredictor(object):
        self.metadata_dataframes = (
            dict(metadata_dataframes) if metadata_dataframes else {})

    @property
    def sequence_lengths(self):
        df = pandas.DataFrame([model.sequence_lengths for model in self.models])
        return {
            "peptide": df.peptide.min(),  # min: anything greater is error
            "n_flank": df.n_flank.max(),  # max: anything greater is ignored
            "c_flank": df.c_flank.max(),
        }

    def add_models(self, models):
        new_model_names = []
        original_manifest = self.manifest_df
......
@@ -26,12 +26,14 @@ from . import data_path

AFFINITY_PREDICTOR = None
CLEAVAGE_PREDICTOR = None
CLEAVAGE_PREDICTOR_NO_FLANKING = None
PRESENTATION_PREDICTOR = None


def setup():
    global AFFINITY_PREDICTOR
    global CLEAVAGE_PREDICTOR
    global CLEAVAGE_PREDICTOR_NO_FLANKING
    global PRESENTATION_PREDICTOR
    startup()
    AFFINITY_PREDICTOR = Class1AffinityPredictor.load(
        get_path("models_class1_pan", "models.combined"),
@@ -42,15 +44,18 @@ def setup():
    CLEAVAGE_PREDICTOR_NO_FLANKING = Class1ProcessingPredictor.load(
        get_path("models_class1_processing_variants", "models.selected.no_flank"),
        max_models=1)
    PRESENTATION_PREDICTOR = Class1PresentationPredictor.load()


def teardown():
    global AFFINITY_PREDICTOR
    global CLEAVAGE_PREDICTOR
    global CLEAVAGE_PREDICTOR_NO_FLANKING
    global PRESENTATION_PREDICTOR
    AFFINITY_PREDICTOR = None
    CLEAVAGE_PREDICTOR = None
    CLEAVAGE_PREDICTOR_NO_FLANKING = None
    PRESENTATION_PREDICTOR = None
    cleanup()
@@ -126,3 +131,92 @@ def test_basic():
        test_df["prediction1"], other_test_df["prediction1"], decimal=6)
    numpy.testing.assert_array_almost_equal(
        test_df["prediction2"], other_test_df["prediction2"], decimal=6)


def test_downloaded_predictor():
    global PRESENTATION_PREDICTOR

    # Test sequence scanning
    scan_results1 = PRESENTATION_PREDICTOR.predict_scan(
        sequences=[
            "MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHLKDGTCGLVEVEKGVLPQLE",
            "QPYVFIKRSDARTAPHGHVMVELVAELEGIQYGRSGETLGVLVPHVGEIPVAYRKVLLRKNGNKG",
            "AGGHSYGADLKSFDLGDELGTDPYEDFQENWNTKHSSGVTRELMRELNGGAYTRYVDNNFCGPDG",
        ],
        alleles=[
            "HLA-A*02:01",
            "HLA-A*03:01",
            "HLA-B*57:01",
            "HLA-B*44:02",
            "HLA-C*02:01",
            "HLA-C*07:01",
        ])
    print(scan_results1)

    assert_equal(len(scan_results1), 3)
    assert (scan_results1.affinity < 200).all()
    assert (scan_results1.presentation_score > 0.7).all()

    scan_results2 = PRESENTATION_PREDICTOR.predict_scan(
        result="filtered",
        comparison_value=500,
        comparison_quantity="affinity",
        sequences={
            "seq1": "MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHLKDGTCGLVEVEKGVLPQLE",
            "seq2": "QPYVFIKRSDARTAPHGHVMVELVAELEGIQYGRSGETLGVLVPHVGEIPVAYRKVLLRKNGNKG",
            "seq3": "AGGHSYGADLKSFDLGDELGTDPYEDFQENWNTKHSSGVTRELMRELNGGAYTRYVDNNFCGPDG",
        },
        alleles=[
            "HLA-A*02:01",
            "HLA-A*03:01",
            "HLA-B*57:01",
            "HLA-B*44:02",
            "HLA-C*02:01",
            "HLA-C*07:01",
        ])
    print(scan_results2)

    assert len(scan_results2) > 10
    assert (scan_results2.affinity <= 500).all()

    scan_results3 = PRESENTATION_PREDICTOR.predict_scan(
        result="filtered",
        comparison_value=0.9,
        comparison_quantity="presentation_score",
        sequences={
            "seq1": "MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHLKDGTCGLVEVEKGVLPQLE",
            "seq2": "QPYVFIKRSDARTAPHGHVMVELVAELEGIQYGRSGETLGVLVPHVGEIPVAYRKVLLRKNGNKG",
            "seq3": "AGGHSYGADLKSFDLGDELGTDPYEDFQENWNTKHSSGVTRELMRELNGGAYTRYVDNNFCGPDG",
        },
        alleles=[
            "HLA-A*02:01",
            "HLA-A*03:01",
            "HLA-B*57:01",
            "HLA-B*44:02",
            "HLA-C*02:01",
            "HLA-C*07:01",
        ])
    print(scan_results3)

    assert len(scan_results3) > 5, len(scan_results3)
    assert (scan_results3.presentation_score >= 0.9).all()

    scan_results4 = PRESENTATION_PREDICTOR.predict_scan(
        result="all",
        comparison_quantity="affinity",
        sequences={
            "seq1": "MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHLKDGTCGLVEVEKGVLPQLE",
            "seq2": "QPYVFIKRSDARTAPHGHVMVELVAELEGIQYGRSGETLGVLVPHVGEIPVAYRKVLLRKNGNKG",
            "seq3": "AGGHSYGADLKSFDLGDELGTDPYEDFQENWNTKHSSGVTRELMRELNGGAYTRYVDNNFCGPDG",
        },
        alleles=[
            "HLA-A*02:01",
            "HLA-A*03:01",
            "HLA-B*57:01",
            "HLA-B*44:02",
            "HLA-C*02:01",
            "HLA-C*07:01",
        ])
    print(scan_results4)

    assert len(scan_results4) > 200, len(scan_results4)
    assert_less(scan_results4.iloc[0].affinity, 100)
@@ -67,3 +67,4 @@ def test_basic():
        n_flanks=df.n_flank.values,
        c_flanks=df.c_flank.values)
    assert_array_equal(df.score.values, df3.score.values)
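For reference, the filtering direction in predict_scan (see the "filtered" branch in the diff above) depends on the comparison quantity: columns whose names end in "score" keep rows >= comparison_value, anything else (e.g. affinity in nM) keeps rows <= comparison_value. A brief illustrative sketch, not part of the commit; the import path is assumed and the threshold values are arbitrary examples, not recommendations.

# Illustrative sketch only, not part of this commit.
from mhcflurry import Class1PresentationPredictor  # top-level import path assumed

predictor = Class1PresentationPredictor.load()
seq = "MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHLKDGTCGLVEVEKGVLPQLE"

# "affinity" does not end in "score", so rows with affinity <= 500 nM are kept.
strong_binders = predictor.predict_scan(
    sequences={"seq1": seq},
    alleles=["HLA-A*02:01", "HLA-B*57:01"],
    result="filtered",
    comparison_quantity="affinity",
    comparison_value=500)

# "presentation_score" ends in "score", so rows with score >= 0.9 are kept.
well_presented = predictor.predict_scan(
    sequences={"seq1": seq},
    alleles=["HLA-A*02:01", "HLA-B*57:01"],
    result="filtered",
    comparison_quantity="presentation_score",
    comparison_value=0.9)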