diff --git a/mhcflurry/batch_generator.py b/mhcflurry/batch_generator.py index ca51207201b98d83040a1ecefe37e35c23310666..96c1a15d7712f19d53fc92d73c487c4f9e48fa3e 100644 --- a/mhcflurry/batch_generator.py +++ b/mhcflurry/batch_generator.py @@ -80,9 +80,9 @@ class BatchPlan(object): ("Batch %5d: " % i) + ", ".join( "{key}[{value}]".format(key=key, value=value) for (key, value) in label_counts.iteritems())) - if i == 5: + if i == 5 and len(self.batch_compositions) > i + 3: lines.append("...") - i = len(self.batch_compositions) - 4 + i = len(self.batch_compositions) - i + 1 i += 1 indent_spaces = " " * indent diff --git a/mhcflurry/class1_presentation_neural_network.py b/mhcflurry/class1_presentation_neural_network.py index 26742d45a6251566a150e8e6d7df1c0f16061cbf..6ac7b1a220981410193e810a85a5f756305f0d0a 100644 --- a/mhcflurry/class1_presentation_neural_network.py +++ b/mhcflurry/class1_presentation_neural_network.py @@ -129,7 +129,7 @@ class Class1PresentationNeuralNetwork(object): output_dim=1029, input_length=self.hyperparameters['max_alleles'], trainable=False, - mask_zero=True)(input_alleles) + mask_zero=False)(input_alleles) allele_flat = allele_representation @@ -427,7 +427,8 @@ class Class1PresentationNeuralNetwork(object): allele_representations_hash = self.set_allele_representations( allele_representations) self.network.compile( - loss=[affinities_loss.loss, mms_loss.loss, ZeroLoss.loss], + loss=[affinities_loss.keras_wrapped(), mms_loss.keras_wrapped(), ZeroLoss().keras_wrapped()], + #loss_weights=[1.0, 1.0, 1.0], optimizer=self.hyperparameters['optimizer']) if self.hyperparameters['learning_rate'] is not None: K.set_value( @@ -545,6 +546,10 @@ class Class1PresentationNeuralNetwork(object): for (key, value) in fit_history.history.items(): fit_info[key].extend(value) + if numpy.isnan(fit_info['loss'][-1]): + import ipdb ; ipdb.set_trace() + raise ValueError("NaN loss") + # Print progress no more often than once every few seconds. if progress_print_interval is not None and ( not last_progress_print or ( diff --git a/mhcflurry/custom_loss.py b/mhcflurry/custom_loss.py index 65a6586b54b394f972596bc5d2e05df8f52a15a5..e6f72264ae5bc1a5f4ba4aa6d33fcf367cc948a4 100644 --- a/mhcflurry/custom_loss.py +++ b/mhcflurry/custom_loss.py @@ -57,6 +57,12 @@ class Loss(object): def __str__(self): return "<Loss: %s>" % self.name + def keras_wrapped(self, reduction="sum_over_batch_size"): + from keras.losses import LossFunctionWrapper + return LossFunctionWrapper( + self.loss, reduction=reduction, name=self.name) + + class StandardKerasLoss(Loss): """ @@ -132,6 +138,7 @@ class MSEWithInequalities(Loss): # We always delay import of Keras so that mhcflurry can be imported # initially without tensorflow debug output, etc. from keras import backend as K + import tensorflow as tf # Handle (=) inequalities diff1 = y_pred - y_true @@ -153,8 +160,7 @@ class MSEWithInequalities(Loss): K.sum(K.square(diff1)) + K.sum(K.square(diff2)) + K.sum(K.square(diff3))) / K.cast(K.shape(y_pred)[0], "float32") - - return result + return tf.where(tf.is_nan(result), tf.zeros_like(result), result) class MSEWithInequalitiesAndMultipleOutputs(Loss): @@ -281,7 +287,8 @@ class ZeroLoss(Loss): @staticmethod def loss(y_true, y_pred): - return 0.0 + import keras.backend as K + return K.constant(0.0) def check_shape(name, arr, expected_shape): diff --git a/test/test_class1_presentation_predictor.py b/test/test_class1_presentation_predictor.py index 9c04c618116342846cba659ab379b97e1a52c281..7cf4480c3499649d494a7456048587412195764e 100644 --- a/test/test_class1_presentation_predictor.py +++ b/test/test_class1_presentation_predictor.py @@ -24,6 +24,7 @@ import sys import copy import os import tempfile +import pickle from numpy.testing import assert_, assert_equal, assert_allclose, assert_array_equal from nose.tools import assert_greater, assert_less @@ -90,11 +91,14 @@ def teardown(): cleanup() -def test_basic(): +def Xtest_basic(): affinity_predictor = PAN_ALLELE_PREDICTOR_NO_MASS_SPEC models = [] for affinity_network in affinity_predictor.class1_pan_allele_models: - presentation_network = Class1PresentationNeuralNetwork() + presentation_network = Class1PresentationNeuralNetwork( + max_epochs=1, + learning_rate=0.0001, + batch_generator_batch_size=256) presentation_network.load_from_class1_neural_network(affinity_network) print(presentation_network.network.get_config()) models.append(presentation_network) @@ -112,13 +116,13 @@ def test_basic(): df["tightest_affinity"] = df.min(1) df["tightest_allele"] = df.idxmin(1) - df2 = predictor.predict_to_dataframe( + # Test untrained predictor gives predictions that match the affinity + # predictor + df_predictor = predictor.predict_to_dataframe( peptides=df.index.values, alleles=alleles) merged_df = pandas.merge( - df, df2.set_index("peptide"), left_index=True, right_index=True) - - #import ipdb ; ipdb.set_trace() + df, df_predictor.set_index("peptide"), left_index=True, right_index=True) assert_allclose( merged_df["tightest_affinity"], merged_df["affinity"], rtol=1e-5) @@ -126,18 +130,59 @@ def test_basic(): merged_df["tightest_affinity"], to_ic50(merged_df["score"]), rtol=1e-5) assert_array_equal(merged_df["tightest_allele"], merged_df["allele"]) + # Test saving and loading models_dir = tempfile.mkdtemp("_models") print(models_dir) predictor.save(models_dir) predictor2 = Class1PresentationPredictor.load(models_dir) - df3 = predictor2.predict_to_dataframe( + df_predictor2 = predictor2.predict_to_dataframe( peptides=df.index.values, alleles=alleles) - - assert_array_equal(df2.values, df3.values) - - # TODO: test fitting, saving, and loading + assert_array_equal(df_predictor.values, df_predictor2.values) + + # Test pickling + predictor3 = pickle.loads( + pickle.dumps(predictor, protocol=pickle.HIGHEST_PROTOCOL)) + predictor4 = pickle.loads( + pickle.dumps(predictor2, protocol=pickle.HIGHEST_PROTOCOL)) + df_predictor3 = predictor3.predict_to_dataframe( + peptides=df.index.values, + alleles=alleles) + df_predictor4 = predictor4.predict_to_dataframe( + peptides=df.index.values, + alleles=alleles) + assert_array_equal(df_predictor.values, df_predictor3.values) + assert_array_equal(df_predictor.values, df_predictor4.values) + + # Test that fitting a model changes the predictor but not the original model + train_df = pandas.DataFrame({ + "peptide": random_peptides(256), + }) + train_df["allele"] = "HLA-A*02:20" + train_df["experiment"] = "experiment1" + train_df["label"] = train_df.peptide.str.match("^[KSEQ]").astype("float32") + allele_encoding = MultipleAlleleEncoding( + experiment_names=train_df.experiment.values, + experiment_to_allele_list={ + "experiment1": ["HLA-A*02:20", "HLA-A*02:01"], + }, + allele_to_sequence=predictor4.allele_to_sequence) + model = predictor4.models[0] + new_predictor = Class1PresentationPredictor( + models=[model], + allele_to_sequence=predictor4.allele_to_sequence) + train_df["original_score"] = new_predictor.predict( + train_df.peptide.values, alleles=["HLA-A*02:20"]) + model.fit( + peptides=train_df.peptide.values, + labels=train_df.label.values, + allele_encoding=allele_encoding) + train_df["updated_score"] = new_predictor.predict( + train_df.peptide.values, + alleles=["HLA-A*02:20"]) + print(train_df) + import ipdb ; ipdb.set_trace() def scramble_peptide(peptide): @@ -402,7 +447,8 @@ def Xtest_real_data_multiallelic_refinement(max_epochs=10): import ipdb ; ipdb.set_trace() -def Xtest_synthetic_allele_refinement_with_affinity_data(max_epochs=10): +def test_synthetic_allele_refinement_with_affinity_data( + max_epochs=10, include_affinities=False): refine_allele = "HLA-C*01:02" alleles = [ "HLA-A*02:01", "HLA-B*27:01", "HLA-C*07:01", @@ -456,28 +502,34 @@ def Xtest_synthetic_allele_refinement_with_affinity_data(max_epochs=10): mms_train_df = pandas.concat([hits_df, decoys_df], ignore_index=True) mms_train_df["label"] = mms_train_df.hit mms_train_df["is_affinity"] = False + mms_train_df["measurement_inequality"] = None + + if include_affinities: + affinity_train_df = pandas.read_csv( + get_path( + "models_class1_pan", "models.with_mass_spec/train_data.csv.bz2")) + affinity_train_df = affinity_train_df.loc[ + affinity_train_df.allele.isin(alleles), + ["peptide", "allele", "measurement_inequality", "measurement_value"]] + + affinity_train_df["label"] = affinity_train_df["measurement_value"] + del affinity_train_df["measurement_value"] + affinity_train_df["is_affinity"] = True + else: + affinity_train_df = None - affinity_train_df = pandas.read_csv( - get_path( - "models_class1_pan", "models.with_mass_spec/train_data.csv.bz2")) - affinity_train_df = affinity_train_df.loc[ - affinity_train_df.allele.isin(alleles), - ["peptide", "allele", "measurement_inequality", "measurement_value"]] - - affinity_train_df["label"] = affinity_train_df["measurement_value"] - del affinity_train_df["measurement_value"] - affinity_train_df["is_affinity"] = True - - predictor = Class1PresentationNeuralNetwork( - PAN_ALLELE_PREDICTOR_NO_MASS_SPEC, + (affinity_model,) = PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.class1_pan_allele_models + presentation_model = Class1PresentationNeuralNetwork( auxiliary_input_features=["gene"], - max_ensemble_size=1, max_epochs=max_epochs, learning_rate=0.0001, patience=5, min_delta=0.0, - random_negative_rate=1.0, - random_negative_constant=25) + random_negative_rate=0.0, + random_negative_constant=0) # WHY DOES THIS BREAK WITH RANDOM NEG? + presentation_model.load_from_class1_neural_network(affinity_model) + + presentation_model = pickle.loads(pickle.dumps(presentation_model)) mms_allele_encoding = MultipleAlleleEncoding( experiment_names=["experiment1"] * len(mms_train_df), @@ -488,27 +540,24 @@ def Xtest_synthetic_allele_refinement_with_affinity_data(max_epochs=10): allele_to_sequence=PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.allele_to_sequence, ) allele_encoding = copy.deepcopy(mms_allele_encoding) - allele_encoding.append_alleles(affinity_train_df.allele.values) - allele_encoding = allele_encoding.compact() + if affinity_train_df is not None: + allele_encoding.append_alleles(affinity_train_df.allele.values) + train_df = pandas.concat( + [mms_train_df, affinity_train_df], ignore_index=True, sort=False) + else: + train_df = mms_train_df - train_df = pandas.concat( - [mms_train_df, affinity_train_df], ignore_index=True, sort=False) + allele_encoding = allele_encoding.compact() - pre_predictions = from_ic50( - predictor.predict( - output="affinities_matrix", + pre_predictions = presentation_model.predict( peptides=mms_train_df.peptide.values, - alleles=mms_allele_encoding)) + allele_encoding=mms_allele_encoding).score - (model,) = PAN_ALLELE_PREDICTOR_NO_MASS_SPEC.class1_pan_allele_models expected_pre_predictions = from_ic50( - model.predict( + affinity_model.predict( peptides=numpy.repeat(mms_train_df.peptide.values, len(alleles)), allele_encoding=mms_allele_encoding.allele_encoding, )).reshape((-1, len(alleles))) - - #import ipdb ; ipdb.set_trace() - mms_train_df["pre_max_prediction"] = pre_predictions.max(1) pre_auc = roc_auc_score(mms_train_df.hit.values, mms_train_df.pre_max_prediction.values) print("PRE_AUC", pre_auc) @@ -526,6 +575,7 @@ def Xtest_synthetic_allele_refinement_with_affinity_data(max_epochs=10): metric_rows = [] + """ def progress(): (_, presentation_prediction, affinities_predictions) = ( predictor.predict( @@ -573,24 +623,26 @@ def Xtest_synthetic_allele_refinement_with_affinity_data(max_epochs=10): update_motifs() return (presentation_prediction, auc) + print("Pre fitting:") progress() update_motifs() print("Fitting...") - - predictor.fit( + """ + presentation_model.fit( peptides=train_df.peptide.values, labels=train_df.label.values, inequalities=train_df.measurement_inequality.values, affinities_mask=train_df.is_affinity.values, allele_encoding=allele_encoding, - progress_callback=progress, + #progress_callback=progress, ) - (predictions, final_auc) = progress() - print("Final AUC", final_auc) + #(predictions, final_auc) = progress() + #print("Final AUC", final_auc) + """ update_motifs() motifs = pandas.DataFrame( @@ -612,6 +664,7 @@ def Xtest_synthetic_allele_refinement_with_affinity_data(max_epochs=10): ]) return (predictor, predictions, metrics, motifs) + """