from nose.tools import eq_, assert_less, assert_greater, assert_almost_equal

import numpy
import pandas
from numpy import testing

numpy.random.seed(0)

import logging
logging.getLogger('tensorflow').disabled = True

from mhcflurry.class1_neural_network import Class1NeuralNetwork
from mhcflurry.downloads import get_path
from mhcflurry.common import random_peptides
from mhcflurry.testing_utils import module_cleanup

teardown = module_cleanup


def test_class1_neural_network_a0205_training_accuracy():
    # Memorize the dataset.
    hyperparameters = dict(
        activation="tanh",
        layer_sizes=[16],
        max_epochs=500,
        early_stopping=False,
        validation_split=0.0,
        locally_connected_layers=[
            {
                "filters": 8,
                "activation": "tanh",
                "kernel_size": 3
            }
        ],
        dense_layer_l1_regularization=0.0,
        dropout_probability=0.0)

    # Train a Class1NeuralNetwork directly on quantitative 9-mer
    # measurements for a single allele.
    allele = "HLA-A*02:05"

    df = pandas.read_csv(
        get_path(
            "data_curated", "curated_training_data.no_mass_spec.csv.bz2"))
    df = df.loc[df.allele == allele]
    df = df.loc[df.peptide.str.len() == 9]
    df = df.loc[df.measurement_type == "quantitative"]
    df = df.loc[df.measurement_source == "kim2014"]

    predictor = Class1NeuralNetwork(**hyperparameters)
    predictor.fit(df.peptide.values, df.measurement_value.values)
    ic50_pred = predictor.predict(df.peptide.values)
    ic50_true = df.measurement_value.values
    eq_(len(ic50_pred), len(ic50_true))
    testing.assert_allclose(
        numpy.log(ic50_pred),
        numpy.log(ic50_true),
        rtol=0.2,
        atol=0.2)

    # Test that a second predictor with the same architecture (but a
    # different max_epochs) has identical architecture JSON. This is
    # important for an optimization we use to re-use predictors of the
    # same architecture at prediction time.
    hyperparameters2 = dict(
        activation="tanh",
        layer_sizes=[16],
        max_epochs=1,
        early_stopping=False,
        validation_split=0.0,
        locally_connected_layers=[
            {
                "filters": 8,
                "activation": "tanh",
                "kernel_size": 3
            }
        ],
        dense_layer_l1_regularization=0.0,
        dropout_probability=0.0)
    predictor2 = Class1NeuralNetwork(**hyperparameters2)
    predictor2.fit(df.peptide.values, df.measurement_value.values, verbose=0)
    eq_(predictor.network().to_json(), predictor2.network().to_json())
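
# --- Illustrative sketch, not part of the test suite ---
# Why the to_json() comparison above matters: predictors sharing an
# architecture can share one compiled network, with each predictor's weights
# swapped in at prediction time. The cache below is a hypothetical
# illustration of that idea; these names are invented here, and the real
# logic inside mhcflurry may differ.
_ILLUSTRATIVE_NETWORK_CACHE = {}


def borrow_network_sketch(architecture_json, build_network):
    """Return one shared network per architecture JSON, calling
    build_network() only on first use."""
    if architecture_json not in _ILLUSTRATIVE_NETWORK_CACHE:
        _ILLUSTRATIVE_NETWORK_CACHE[architecture_json] = build_network()
    return _ILLUSTRATIVE_NETWORK_CACHE[architecture_json]
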
def test_inequalities():
    # Memorize the dataset.
    hyperparameters = dict(
        peptide_amino_acid_encoding="one-hot",
        activation="tanh",
        layer_sizes=[64],
        max_epochs=200,
        minibatch_size=32,
        random_negative_rate=0.0,
        early_stopping=False,
        validation_split=0.0,
        locally_connected_layers=[
            {
                "filters": 8,
                "activation": "tanh",
                "kernel_size": 3
            }
        ],
        dense_layer_l1_regularization=0.0,
        dropout_probability=0.0,
        loss="custom:mse_with_inequalities_and_multiple_outputs")

    df = pandas.DataFrame()
    df["peptide"] = random_peptides(100, length=9)

    # First half are binders.
    df["binder"] = df.index < len(df) / 2
    df["value"] = df.binder.map({True: 100, False: 5000})
    df.loc[:10, "value"] = 1.0  # some strong binders

    df["inequality1"] = "="
    df["inequality2"] = df.binder.map({True: "<", False: "="})
    df["inequality3"] = df.binder.map({True: "=", False: ">"})

    # Binders start with "A"; non-binders start with "C". This gives the
    # network a trivially learnable sequence signal.
    df["peptide"] = [
        ("C" if not row.binder else "A") + row.peptide[1:]
        for _, row in df.iterrows()
    ]

    fit_kwargs = {'verbose': 0}

    # Prediction1 uses no inequalities (i.e. all are (=)).
    predictor = Class1NeuralNetwork(**hyperparameters)
    predictor.fit(
        df.peptide.values,
        df.value.values,
        inequalities=df.inequality1.values,
        **fit_kwargs)
    df["prediction1"] = predictor.predict(df.peptide.values)

    # Prediction2 has a (<) inequality on binders and an (=) on non-binders.
    predictor = Class1NeuralNetwork(**hyperparameters)
    predictor.fit(
        df.peptide.values,
        df.value.values,
        inequalities=df.inequality2.values,
        **fit_kwargs)
    df["prediction2"] = predictor.predict(df.peptide.values)

    # Prediction3 has an (=) inequality on binders and a (>) on non-binders.
    predictor = Class1NeuralNetwork(**hyperparameters)
    predictor.fit(
        df.peptide.values,
        df.value.values,
        inequalities=df.inequality3.values,
        **fit_kwargs)
    df["prediction3"] = predictor.predict(df.peptide.values)

    df_binders = df.loc[df.binder]
    df_nonbinders = df.loc[~df.binder]

    print("***** Binders: *****")
    print(df_binders.head(5))

    print("***** Non-binders: *****")
    print(df_nonbinders.head(5))

    # Binders should always be given tighter (lower) predicted affinity than
    # non-binders.
    assert_less(
        df_binders.prediction1.mean(), df_nonbinders.prediction1.mean())
    assert_less(
        df_binders.prediction2.mean(), df_nonbinders.prediction2.mean())
    assert_less(
        df_binders.prediction3.mean(), df_nonbinders.prediction3.mean())

    # prediction2 binders should be tighter on average than prediction1
    # binders, since prediction2 has a (<) inequality for binders.
    # Non-binders should be about the same between prediction2 and
    # prediction1.
    assert_less(df_binders.prediction2.mean(), df_binders.prediction1.mean())
    assert_almost_equal(
        df_nonbinders.prediction2.mean(),
        df_nonbinders.prediction1.mean(),
        delta=3000)

    # prediction3 non-binders should be weaker on average than prediction2
    # (or prediction1) non-binders, since prediction3 has a (>) inequality
    # for these peptides. Binders should be about the same.
    assert_greater(
        df_nonbinders.prediction3.mean(), df_nonbinders.prediction2.mean())
    assert_greater(
        df_nonbinders.prediction3.mean(), df_nonbinders.prediction1.mean())
    assert_almost_equal(
        df_binders.prediction3.mean(),
        df_binders.prediction1.mean(),
        delta=3000)
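
# --- Illustrative sketch, not part of the test suite ---
# A rough pure-Python rendition of the hinged-MSE idea behind the
# "custom:mse_with_inequalities..." losses exercised above. The real
# implementation lives in mhcflurry's custom loss module, operates on Keras
# tensors, and uses its own y_true encoding; this standalone version only
# shows the semantics the assertions above rely on: "=" is plain squared
# error, while "<" and ">" penalize only predictions on the wrong side of
# the measured value.
def _sketch_mse_with_inequalities(y_true, y_pred, inequalities):
    total = 0.0
    for true, pred, op in zip(y_true, y_pred, inequalities):
        if op == "=":
            total += (pred - true) ** 2
        elif op == "<":  # true value is below the measurement:
            total += max(pred - true, 0.0) ** 2  # penalize only overshoot
        elif op == ">":  # true value is above the measurement:
            total += max(true - pred, 0.0) ** 2  # penalize only undershoot
    return total / len(y_true)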