From d1ef4aae5a49093a0fb82935f98f75a7140ae217 Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Thu, 16 May 2019 09:51:19 -0400 Subject: [PATCH] add test --- downloads-generation/data_curated/curate.py | 2 +- mhcflurry/downloads.yml | 2 +- mhcflurry/train_pan_allele_models_command.py | 6 -- test/test_class1_pan.py | 89 ++++++++++++++++++++ 4 files changed, 91 insertions(+), 8 deletions(-) create mode 100644 test/test_class1_pan.py diff --git a/downloads-generation/data_curated/curate.py b/downloads-generation/data_curated/curate.py index d61719f7..2c5f0319 100755 --- a/downloads-generation/data_curated/curate.py +++ b/downloads-generation/data_curated/curate.py @@ -145,7 +145,7 @@ def load_data_iedb(iedb_csv, include_qualitative=True, include_mass_spec=False): quantitative["measurement_type"] = "quantitative" quantitative["measurement_inequality"] = quantitative[ "Measurement Inequality" - ].fillna("=") + ].fillna("=").map(lambda s: {">=": ">", "<=": "<"}.get(s, s)) print("Quantitative measurements: %d" % len(quantitative)) qualitative = iedb_df.ix[iedb_df["Units"].isnull()].copy() diff --git a/mhcflurry/downloads.yml b/mhcflurry/downloads.yml index f1f3e980..7b87201e 100644 --- a/mhcflurry/downloads.yml +++ b/mhcflurry/downloads.yml @@ -41,7 +41,7 @@ releases: default: false - name: data_curated - url: https://github.com/openvax/mhcflurry/releases/download/pan-dev1/data_curated.20190514.tar.bz2 + url: https://github.com/openvax/mhcflurry/releases/download/pan-dev1/data_curated.20190516.tar.bz2 default: true # Older downloads diff --git a/mhcflurry/train_pan_allele_models_command.py b/mhcflurry/train_pan_allele_models_command.py index bc9bb9ab..1520502a 100644 --- a/mhcflurry/train_pan_allele_models_command.py +++ b/mhcflurry/train_pan_allele_models_command.py @@ -246,12 +246,6 @@ def main(args): print("Data inequalities:") print(df.measurement_inequality.value_counts()) - df.measurement_inequality = df.measurement_inequality.map(lambda s: { - ">=": ">", "<=": "<" - }.get(s, s)) - - print("Data inequalities after adjustment:") - print(df.measurement_inequality.value_counts()) if args.ignore_inequalities and "measurement_inequality" in df.columns: print("Dropping measurement_inequality column") diff --git a/test/test_class1_pan.py b/test/test_class1_pan.py new file mode 100644 index 00000000..acea90a2 --- /dev/null +++ b/test/test_class1_pan.py @@ -0,0 +1,89 @@ +""" +Tests for training and predicting using Class1 pan-allele models. +""" + +import json +import os +import shutil +import tempfile +import subprocess +from copy import deepcopy + +import pandas + +from numpy.testing import assert_array_less, assert_equal + +from mhcflurry import Class1AffinityPredictor,Class1NeuralNetwork +from mhcflurry.allele_encoding import AlleleEncoding +from mhcflurry.downloads import get_path + + +HYPERPARAMETERS = { + 'activation': 'tanh', + 'allele_dense_layer_sizes': [], + 'batch_normalization': False, + 'dense_layer_l1_regularization': 0.0, + 'dense_layer_l2_regularization': 0.0, + 'dropout_probability': 0.5, + 'early_stopping': True, + 'init': 'glorot_uniform', + 'layer_sizes': [64], + 'learning_rate': None, + 'locally_connected_layers': [], + 'loss': 'custom:mse_with_inequalities', + 'max_epochs': 5000, + 'minibatch_size': 128, + 'optimizer': 'rmsprop', + 'output_activation': 'sigmoid', + 'patience': 20, + 'peptide_allele_merge_activation': '', + 'peptide_allele_merge_method': 'concatenate', + 'peptide_amino_acid_encoding': 'BLOSUM62', + 'peptide_dense_layer_sizes': [], + 'peptide_encoding': { + 'alignment_method': 'left_pad_centered_right_pad', + 'max_length': 15, + 'vector_encoding_name': 'BLOSUM62', + }, + 'random_negative_affinity_max': 50000.0, + 'random_negative_affinity_min': 20000.0, + 'random_negative_constant': 25, + 'random_negative_distribution_smoothing': 0.0, + 'random_negative_match_distribution': True, + 'random_negative_rate': 0.2, + 'train_data': {}, + 'validation_split': 0.1, +} + + +ALLELE_TO_SEQUENCE = pandas.read_csv( + get_path( + "allele_sequences", "allele_sequences.csv"), + index_col=0).sequence.to_dict() + + +TRAIN_DF = pandas.read_csv( + get_path( + "data_curated", "curated_training_data.no_mass_spec.csv.bz2")) + +TRAIN_DF = TRAIN_DF.loc[TRAIN_DF.allele.isin(ALLELE_TO_SEQUENCE)] +TRAIN_DF = TRAIN_DF.loc[TRAIN_DF.peptide.str.len() >= 8] +TRAIN_DF = TRAIN_DF.loc[TRAIN_DF.peptide.str.len() <= 15] + + +def test_train_simple(): + network = Class1NeuralNetwork(**HYPERPARAMETERS) + allele_encoding = AlleleEncoding( + TRAIN_DF.allele.values, + allele_to_sequence=ALLELE_TO_SEQUENCE) + network.fit( + TRAIN_DF.peptide.values, + affinities=TRAIN_DF.measurement_value.values, + allele_encoding=allele_encoding, + inequalities=TRAIN_DF.measurement_inequality.values) + + predictions = network.predict( + peptides=TRAIN_DF.peptide.values, + allele_encoding=allele_encoding) + + print(pandas.Series(predictions).describe()) -- GitLab