Skip to content
Snippets Groups Projects
Commit db1d0e0f authored by Tim O'Donnell's avatar Tim O'Donnell
Browse files

better tests

parent 67987daf
No related branches found
No related tags found
No related merge requests found
...@@ -37,20 +37,20 @@ from mhcflurry.regression_target import to_ic50 ...@@ -37,20 +37,20 @@ from mhcflurry.regression_target import to_ic50
COMMON_AMINO_ACIDS = sorted(COMMON_AMINO_ACIDS) COMMON_AMINO_ACIDS = sorted(COMMON_AMINO_ACIDS)
PAN_ALLELE_PREDICTOR_NO_MASS_SPEC = None AFFINITY_PREDICTOR = None
def setup(): def setup():
global PAN_ALLELE_PREDICTOR_NO_MASS_SPEC global AFFINITY_PREDICTOR
startup() startup()
PAN_ALLELE_PREDICTOR_NO_MASS_SPEC = Class1AffinityPredictor.load( AFFINITY_PREDICTOR = Class1AffinityPredictor.load(
get_path("models_class1_pan", "models.no_mass_spec"), get_path("models_class1_pan", "models.no_mass_spec"),
optimization_level=0, optimization_level=0,
max_models=1) max_models=1)
def teardown(): def teardown():
global PAN_ALLELE_PREDICTOR_NO_MASS_SPEC global AFFINITY_PREDICTOR
PAN_ALLELE_PREDICTOR_NO_MASS_SPEC = None AFFINITY_PREDICTOR = None
cleanup() cleanup()
...@@ -91,26 +91,10 @@ def make_motif(presentation_predictor, allele, peptides, frac=0.01): ...@@ -91,26 +91,10 @@ def make_motif(presentation_predictor, allele, peptides, frac=0.01):
# TESTS # TESTS
################################################### ###################################################
def test_synthetic_allele_refinement_with_affinity_data(): def test_synthetic_allele_refinement(include_affinities=False):
test_synthetic_allele_refinement(include_affinities=True)
def test_synthetic_allele_refinement(max_epochs=10, include_affinities=False):
""" """
Idea: Test that in a synthetic example the model is able to learn that HLA-C*01:02
prefers P at position 3.
- take an allele where MS vs. no-MS trained predictors are very
different. One
possiblility is DLA-88*501:01 but human would be better
- generate synethetic multi-allele MS by combining single-allele MS for
differnet
alleles, including the selected allele
- train presentation predictor based on the no-ms pan-allele models on theis
synthetic dataset
- see if the pan-allele predictor learns the "correct" motif for the
selected
allele, i.e. updates to become more similar to the with-ms pan allele
predictor.
""" """
refine_allele = "HLA-C*01:02" refine_allele = "HLA-C*01:02"
alleles = ["HLA-A*02:01", "HLA-B*27:01", "HLA-C*07:01", "HLA-A*03:01", alleles = ["HLA-A*02:01", "HLA-B*27:01", "HLA-C*07:01", "HLA-A*03:01",
...@@ -171,24 +155,28 @@ def test_synthetic_allele_refinement(max_epochs=10, include_affinities=False): ...@@ -171,24 +155,28 @@ def test_synthetic_allele_refinement(max_epochs=10, include_affinities=False):
else: else:
affinity_train_df = None affinity_train_df = None
( (affinity_model,) = AFFINITY_PREDICTOR.class1_pan_allele_models
affinity_model,) = PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.class1_pan_allele_models
presentation_model = Class1PresentationNeuralNetwork( presentation_model = Class1PresentationNeuralNetwork(
auxiliary_input_features=["gene"], batch_generator_batch_size=1024, auxiliary_input_features=["gene"],
max_epochs=max_epochs, learning_rate=0.001, patience=5, min_delta=0.0, batch_generator_batch_size=1024,
random_negative_rate=1.0, random_negative_constant=25) max_epochs=10,
learning_rate=0.0001,
patience=5,
min_delta=0.0,
random_negative_rate=0,
random_negative_constant=0)
presentation_model.load_from_class1_neural_network(affinity_model) presentation_model.load_from_class1_neural_network(affinity_model)
presentation_predictor = Class1PresentationPredictor( presentation_predictor = Class1PresentationPredictor(
models=[presentation_model], models=[presentation_model],
allele_to_sequence=PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.allele_to_sequence) allele_to_sequence=AFFINITY_PREDICTOR.allele_to_sequence)
mms_allele_encoding = MultipleAlleleEncoding( mms_allele_encoding = MultipleAlleleEncoding(
experiment_names=["experiment1"] * len(mms_train_df), experiment_names=["experiment1"] * len(mms_train_df),
experiment_to_allele_list={ experiment_to_allele_list={
"experiment1": alleles, "experiment1": alleles,
}, max_alleles_per_experiment=6, }, max_alleles_per_experiment=6,
allele_to_sequence=PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.allele_to_sequence, ) allele_to_sequence=AFFINITY_PREDICTOR.allele_to_sequence, )
allele_encoding = copy.deepcopy(mms_allele_encoding) allele_encoding = copy.deepcopy(mms_allele_encoding)
if affinity_train_df is not None: if affinity_train_df is not None:
allele_encoding.append_alleles(affinity_train_df.allele.values) allele_encoding.append_alleles(affinity_train_df.allele.values)
...@@ -207,12 +195,6 @@ def test_synthetic_allele_refinement(max_epochs=10, include_affinities=False): ...@@ -207,12 +195,6 @@ def test_synthetic_allele_refinement(max_epochs=10, include_affinities=False):
peptides=numpy.repeat(mms_train_df.peptide.values, len(alleles)), peptides=numpy.repeat(mms_train_df.peptide.values, len(alleles)),
allele_encoding=mms_allele_encoding.allele_encoding, )).reshape( allele_encoding=mms_allele_encoding.allele_encoding, )).reshape(
(-1, len(alleles))) (-1, len(alleles)))
mms_train_df["pre_max_prediction"] = pre_predictions.max(1)
pre_auc = roc_auc_score(
mms_train_df.hit.values,
mms_train_df.pre_max_prediction.values)
print("PRE_AUC", pre_auc)
assert_allclose(pre_predictions, expected_pre_predictions, rtol=1e-4) assert_allclose(pre_predictions, expected_pre_predictions, rtol=1e-4)
random_peptides_encodable = EncodableSequences.create( random_peptides_encodable = EncodableSequences.create(
...@@ -223,101 +205,77 @@ def test_synthetic_allele_refinement(max_epochs=10, include_affinities=False): ...@@ -223,101 +205,77 @@ def test_synthetic_allele_refinement(max_epochs=10, include_affinities=False):
peptides=random_peptides_encodable, peptides=random_peptides_encodable,
allele=refine_allele) allele=refine_allele)
print("Original motif proline-3 rate: ", original_motif.loc[3, "P"]) print("Original motif proline-3 rate: ", original_motif.loc[3, "P"])
assert_less(original_motif.loc[3, "P"], 0.1)
metric_rows = []
iteration_box = [0]
def progress():
(_, presentation_prediction, affinities_predictions) = ( def progress(label = None):
predictor.predict( if label is None:
output="all", label = str(iteration_box[0])
peptides=mms_train_df.peptide.values, iteration_box[0] += 1
alleles=mms_allele_encoding)) print("*** iteration ", label, "***")
affinities_predictions = from_ic50(affinities_predictions) predictions_df = presentation_predictor.predict_to_dataframe(
for (kind, predictions) in [ peptides=mms_train_df.peptide.values,
("affinities", affinities_predictions), alleles=mms_allele_encoding)
("presentation", presentation_prediction)]: merged_df = pandas.merge(mms_train_df, predictions_df, on="peptide")
merged_hit_df = merged_df.loc[merged_df.hit == 1.0]
mms_train_df["max_prediction"] = predictions.max(1) correct_allele_fraction = (
mms_train_df["predicted_allele"] = pandas.Series(alleles).loc[ merged_hit_df.allele == merged_hit_df.true_allele).mean()
predictions.argmax(1).flatten() print("Correct allele fraction", correct_allele_fraction)
].values print(
"Mean score/affinity for hit",
print(kind) merged_df.loc[merged_df.hit == 1.0].score.mean(),
print(predictions) merged_df.loc[merged_df.hit == 1.0].affinity.mean())
print(
mean_predictions_for_hit = mms_train_df.loc[ "Mean score/affinity for decoy",
mms_train_df.hit == 1.0 merged_df.loc[merged_df.hit == 0.0].score.mean(),
].max_prediction.mean() merged_df.loc[merged_df.hit == 0.0].affinity.mean())
mean_predictions_for_decoy = mms_train_df.loc[ auc = roc_auc_score(merged_df.hit.values, merged_df.score.values)
mms_train_df.hit == 0.0 print("AUC", auc)
].max_prediction.mean() return (auc, correct_allele_fraction)
correct_allele_fraction = (
mms_train_df.loc[mms_train_df.hit == 1.0].predicted_allele == (pre_auc, pre_correct_allele_fraction) = progress(label="Pre fitting")
mms_train_df.loc[mms_train_df.hit == 1.0].true_allele
).mean()
auc = roc_auc_score(mms_train_df.hit.values, mms_train_df.max_prediction.values)
print(kind, "Mean prediction for hit", mean_predictions_for_hit)
print(kind, "Mean prediction for decoy", mean_predictions_for_decoy)
print(kind, "Correct predicted allele fraction", correct_allele_fraction)
print(kind, "AUC", auc)
metric_rows.append((
kind,
mean_predictions_for_hit,
mean_predictions_for_decoy,
correct_allele_fraction,
auc,
))
update_motifs()
return (presentation_prediction, auc)
print("Pre fitting:")
#progress()
presentation_model.fit(peptides=train_df.peptide.values, presentation_model.fit(peptides=train_df.peptide.values,
labels=train_df.label.values, labels=train_df.label.values,
inequalities=train_df.measurement_inequality.values, inequalities=train_df.measurement_inequality.values,
affinities_mask=train_df.is_affinity.values, affinities_mask=train_df.is_affinity.values,
allele_encoding=allele_encoding, ) allele_encoding=allele_encoding,
post_predictions = presentation_model.predict( progress_callback=progress)
peptides=mms_train_df.peptide.values, (post_auc, post_correct_allele_fraction) = progress(label="Done fitting")
allele_encoding=mms_allele_encoding).score
mms_train_df["post_max_prediction"] = pre_predictions.max(1)
post_auc = roc_auc_score(
mms_train_df.hit.values,
mms_train_df.post_max_prediction.values)
print("POST_AUC", post_auc)
final_motif = make_motif( final_motif = make_motif(
presentation_predictor=presentation_predictor, presentation_predictor=presentation_predictor,
peptides=random_peptides_encodable, peptides=random_peptides_encodable,
allele=refine_allele) allele=refine_allele)
print("Final motif proline-3 rate: ", final_motif.loc[3, "P"]) print("Final motif proline-3 rate: ", final_motif.loc[3, "P"])
import ipdb ; ipdb.set_trace()
assert_greater(post_auc, pre_auc)
# (predictions, final_auc) = progress() assert_greater(
# print("Final AUC", final_auc) post_correct_allele_fraction, pre_correct_allele_fraction - 0.05)
assert_greater(final_motif.loc[3, "P"], original_motif.loc[3, "P"])
""""
update_motifs()
def test_real_data_multiallelic_refinement(max_epochs=10):
metrics = pandas.DataFrame( """
metric_rows, Test on real data that we can learn that HLA-A*02:20 has a preference K at
columns=[ position 1.
"output",
"mean_predictions_for_hit",
"mean_predictions_for_decoy",
"correct_allele_fraction",
"auc"
])
""" """
(affinity_model,) = AFFINITY_PREDICTOR.class1_pan_allele_models
presentation_model = Class1PresentationNeuralNetwork(
auxiliary_input_features=["gene"],
batch_generator_batch_size=1024,
max_epochs=max_epochs,
learning_rate=0.0001,
patience=5,
min_delta=0.0,
random_negative_rate=0,
random_negative_constant=0)
presentation_model.load_from_class1_neural_network(affinity_model)
presentation_predictor = Class1PresentationPredictor(
models=[presentation_model],
allele_to_sequence=AFFINITY_PREDICTOR.allele_to_sequence)
def Xtest_real_data_multiallelic_refinement(max_epochs=10):
ms_df = pandas.read_csv( ms_df = pandas.read_csv(
get_path("data_mass_spec_annotated", "annotated_ms.csv.bz2")) get_path("data_mass_spec_annotated", "annotated_ms.csv.bz2"))
ms_df = ms_df.loc[ ms_df = ms_df.loc[
...@@ -364,67 +322,42 @@ def Xtest_real_data_multiallelic_refinement(max_epochs=10): ...@@ -364,67 +322,42 @@ def Xtest_real_data_multiallelic_refinement(max_epochs=10):
del pan_sub_train_df["measurement_value"] del pan_sub_train_df["measurement_value"]
pan_sub_train_df["is_affinity"] = True pan_sub_train_df["is_affinity"] = True
pan_predictor = Class1AffinityPredictor.load(
get_path("models_class1_pan", "models.with_mass_spec"),
optimization_level=0,
max_models=1)
allele_encoding = MultipleAlleleEncoding( allele_encoding = MultipleAlleleEncoding(
experiment_names=multi_train_df.sample_id.values, experiment_names=multi_train_df.sample_id.values,
experiment_to_allele_list=sample_table.alleles.to_dict(), experiment_to_allele_list=sample_table.alleles.to_dict(),
max_alleles_per_experiment=sample_table.alleles.str.len().max(), max_alleles_per_experiment=sample_table.alleles.str.len().max(),
allele_to_sequence=pan_predictor.allele_to_sequence, allele_to_sequence=AFFINITY_PREDICTOR.allele_to_sequence,
) )
allele_encoding.append_alleles(pan_sub_train_df.allele.values) allele_encoding.append_alleles(pan_sub_train_df.allele.values)
allele_encoding = allele_encoding.compact() allele_encoding = allele_encoding.compact()
combined_train_df = pandas.concat([multi_train_df, pan_sub_train_df]) combined_train_df = pandas.concat([multi_train_df, pan_sub_train_df])
presentation_predictor = Class1PresentationNeuralNetwork( refine_allele = "HLA-A*02:20"
pan_predictor, random_peptides_encodable = EncodableSequences.create(
auxiliary_input_features=[], random_peptides(10000, 9))
max_ensemble_size=1,
max_epochs=max_epochs,
learning_rate=0.0001,
patience=5,
min_delta=0.0,
random_negative_rate=1.0)
pre_predictions = from_ic50(
presentation_predictor.predict(
output="affinities",
peptides=combined_train_df.peptide.values,
alleles=allele_encoding))
(model,) = pan_predictor.class1_pan_allele_models
expected_pre_predictions = from_ic50(
model.predict(
peptides=numpy.repeat(combined_train_df.peptide.values, len(alleles)),
allele_encoding=allele_encoding.allele_encoding,
)).reshape((-1, len(alleles)))[:,0]
assert_allclose(pre_predictions, expected_pre_predictions, rtol=1e-4)
motifs_history = []
random_peptides_encodable = make_random_peptides(10000, [9])
def update_motifs():
for allele in multi_train_alleles:
motif = make_motif(allele, random_peptides_encodable)
motifs_history.append((allele, motif))
print("Pre fitting:") original_motif = make_motif(
update_motifs() presentation_predictor=presentation_predictor,
print("Fitting...") peptides=random_peptides_encodable,
allele=refine_allele)
print(
refine_allele,
"original motif lysine-1 rate: ",
original_motif.loc[1, "K"])
presentation_predictor.fit( presentation_model.fit(
peptides=combined_train_df.peptide.values, peptides=combined_train_df.peptide.values,
labels=combined_train_df.label.values, labels=combined_train_df.label.values,
allele_encoding=allele_encoding, allele_encoding=allele_encoding,
affinities_mask=combined_train_df.is_affinity.values, affinities_mask=combined_train_df.is_affinity.values,
inequalities=combined_train_df.measurement_inequality.values, inequalities=combined_train_df.measurement_inequality.values)
progress_callback=update_motifs,
) final_motif = make_motif(
presentation_predictor=presentation_predictor,
peptides=random_peptides_encodable,
allele=refine_allele)
print(refine_allele, "final motif lysine-1 rate: ", final_motif.loc[1, "K"])
import ipdb ; ipdb.set_trace() assert_greater(final_motif.loc[1, "K"], original_motif.loc[1, "K"])
\ No newline at end of file
\ No newline at end of file
...@@ -19,10 +19,10 @@ from mhcflurry.common import random_peptides ...@@ -19,10 +19,10 @@ from mhcflurry.common import random_peptides
from mhcflurry.testing_utils import cleanup, startup from mhcflurry.testing_utils import cleanup, startup
from mhcflurry.regression_target import to_ic50 from mhcflurry.regression_target import to_ic50
PAN_ALLELE_PREDICTOR_NO_MASS_SPEC = None AFFINITY_PREDICTOR = None
def setup(): def setup():
global PAN_ALLELE_PREDICTOR_NO_MASS_SPEC global AFFINITY_PREDICTOR
startup() startup()
PAN_ALLELE_PREDICTOR_NO_MASS_SPEC = Class1AffinityPredictor.load( PAN_ALLELE_PREDICTOR_NO_MASS_SPEC = Class1AffinityPredictor.load(
get_path("models_class1_pan", "models.no_mass_spec"), get_path("models_class1_pan", "models.no_mass_spec"),
...@@ -31,13 +31,13 @@ def setup(): ...@@ -31,13 +31,13 @@ def setup():
def teardown(): def teardown():
global PAN_ALLELE_PREDICTOR_NO_MASS_SPEC global AFFINITY_PREDICTOR
PAN_ALLELE_PREDICTOR_NO_MASS_SPEC = None PAN_ALLELE_PREDICTOR_NO_MASS_SPEC = None
cleanup() cleanup()
def test_basic(): def test_basic():
affinity_predictor = PAN_ALLELE_PREDICTOR_NO_MASS_SPEC affinity_predictor = AFFINITY_PREDICTOR
models = [] models = []
for affinity_network in affinity_predictor.class1_pan_allele_models: for affinity_network in affinity_predictor.class1_pan_allele_models:
presentation_network = Class1PresentationNeuralNetwork( presentation_network = Class1PresentationNeuralNetwork(
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment