diff --git a/downloads-generation/data_curated/curate.py b/downloads-generation/data_curated/curate.py index d61719f74d2132a9e251c434d50aa8e89e73e42c..2c5f031985050dfc4561c1a2f494b908f88bcc1d 100755 --- a/downloads-generation/data_curated/curate.py +++ b/downloads-generation/data_curated/curate.py @@ -145,7 +145,7 @@ def load_data_iedb(iedb_csv, include_qualitative=True, include_mass_spec=False): quantitative["measurement_type"] = "quantitative" quantitative["measurement_inequality"] = quantitative[ "Measurement Inequality" - ].fillna("=") + ].fillna("=").map(lambda s: {">=": ">", "<=": "<"}.get(s, s)) print("Quantitative measurements: %d" % len(quantitative)) qualitative = iedb_df.ix[iedb_df["Units"].isnull()].copy() diff --git a/mhcflurry/downloads.yml b/mhcflurry/downloads.yml index f1f3e980fe339eef1f8db34f5785f6e7807fc596..7b87201e83f43d5ee2446e3f2b57a633288118cf 100644 --- a/mhcflurry/downloads.yml +++ b/mhcflurry/downloads.yml @@ -41,7 +41,7 @@ releases: default: false - name: data_curated - url: https://github.com/openvax/mhcflurry/releases/download/pan-dev1/data_curated.20190514.tar.bz2 + url: https://github.com/openvax/mhcflurry/releases/download/pan-dev1/data_curated.20190516.tar.bz2 default: true # Older downloads diff --git a/mhcflurry/train_pan_allele_models_command.py b/mhcflurry/train_pan_allele_models_command.py index bc9bb9ab1cac1bd5142ffb322efe957ea33117eb..1520502a5fef4084be3b03c76c61c49c0c68e854 100644 --- a/mhcflurry/train_pan_allele_models_command.py +++ b/mhcflurry/train_pan_allele_models_command.py @@ -246,12 +246,6 @@ def main(args): print("Data inequalities:") print(df.measurement_inequality.value_counts()) - df.measurement_inequality = df.measurement_inequality.map(lambda s: { - ">=": ">", "<=": "<" - }.get(s, s)) - - print("Data inequalities after adjustment:") - print(df.measurement_inequality.value_counts()) if args.ignore_inequalities and "measurement_inequality" in df.columns: print("Dropping measurement_inequality column") diff --git a/test/test_class1_pan.py b/test/test_class1_pan.py new file mode 100644 index 0000000000000000000000000000000000000000..acea90a21e346f634a906824135a34972f92c766 --- /dev/null +++ b/test/test_class1_pan.py @@ -0,0 +1,89 @@ +""" +Tests for training and predicting using Class1 pan-allele models. +""" + +import json +import os +import shutil +import tempfile +import subprocess +from copy import deepcopy + +import pandas + +from numpy.testing import assert_array_less, assert_equal + +from mhcflurry import Class1AffinityPredictor,Class1NeuralNetwork +from mhcflurry.allele_encoding import AlleleEncoding +from mhcflurry.downloads import get_path + + +HYPERPARAMETERS = { + 'activation': 'tanh', + 'allele_dense_layer_sizes': [], + 'batch_normalization': False, + 'dense_layer_l1_regularization': 0.0, + 'dense_layer_l2_regularization': 0.0, + 'dropout_probability': 0.5, + 'early_stopping': True, + 'init': 'glorot_uniform', + 'layer_sizes': [64], + 'learning_rate': None, + 'locally_connected_layers': [], + 'loss': 'custom:mse_with_inequalities', + 'max_epochs': 5000, + 'minibatch_size': 128, + 'optimizer': 'rmsprop', + 'output_activation': 'sigmoid', + 'patience': 20, + 'peptide_allele_merge_activation': '', + 'peptide_allele_merge_method': 'concatenate', + 'peptide_amino_acid_encoding': 'BLOSUM62', + 'peptide_dense_layer_sizes': [], + 'peptide_encoding': { + 'alignment_method': 'left_pad_centered_right_pad', + 'max_length': 15, + 'vector_encoding_name': 'BLOSUM62', + }, + 'random_negative_affinity_max': 50000.0, + 'random_negative_affinity_min': 20000.0, + 'random_negative_constant': 25, + 'random_negative_distribution_smoothing': 0.0, + 'random_negative_match_distribution': True, + 'random_negative_rate': 0.2, + 'train_data': {}, + 'validation_split': 0.1, +} + + +ALLELE_TO_SEQUENCE = pandas.read_csv( + get_path( + "allele_sequences", "allele_sequences.csv"), + index_col=0).sequence.to_dict() + + +TRAIN_DF = pandas.read_csv( + get_path( + "data_curated", "curated_training_data.no_mass_spec.csv.bz2")) + +TRAIN_DF = TRAIN_DF.loc[TRAIN_DF.allele.isin(ALLELE_TO_SEQUENCE)] +TRAIN_DF = TRAIN_DF.loc[TRAIN_DF.peptide.str.len() >= 8] +TRAIN_DF = TRAIN_DF.loc[TRAIN_DF.peptide.str.len() <= 15] + + +def test_train_simple(): + network = Class1NeuralNetwork(**HYPERPARAMETERS) + allele_encoding = AlleleEncoding( + TRAIN_DF.allele.values, + allele_to_sequence=ALLELE_TO_SEQUENCE) + network.fit( + TRAIN_DF.peptide.values, + affinities=TRAIN_DF.measurement_value.values, + allele_encoding=allele_encoding, + inequalities=TRAIN_DF.measurement_inequality.values) + + predictions = network.predict( + peptides=TRAIN_DF.peptide.values, + allele_encoding=allele_encoding) + + print(pandas.Series(predictions).describe())