From df5ac4846d6ff811d3e514e9c63f129409fb7a2f Mon Sep 17 00:00:00 2001 From: Alex Rubinsteyn <alex.rubinsteyn@gmail.com> Date: Fri, 29 Apr 2016 12:17:22 -0400 Subject: [PATCH] fixed neural network test, fixed length filtering of data loading, started on adding imputation to training script --- mhcflurry/data.py | 17 ++++---- mhcflurry/imputation.py | 40 ++++++++++++++++++- ...rry-train-class1-allele-specific-models.py | 26 +++++++++--- test/test_neural_nets.py | 17 +++++--- 4 files changed, 80 insertions(+), 20 deletions(-) diff --git a/mhcflurry/data.py b/mhcflurry/data.py index eab00b13..1ea71212 100644 --- a/mhcflurry/data.py +++ b/mhcflurry/data.py @@ -68,13 +68,12 @@ def _infer_csv_separator(filename): def load_dataframe( filename, - peptide_length=None, max_ic50=MAX_IC50, sep=None, species_column_name="species", allele_column_name="mhc", peptide_column_name=None, - peptide_length_column_name="peptide_length", + filter_peptide_length=None, ic50_column_name="meas", only_human=False): """ @@ -88,9 +87,6 @@ def load_dataframe( - 'sequence' - 'meas' - peptide_length : int, optional - Which length peptides to use (default=load all lengths) - max_ic50 : float Treat IC50 scores above this value as all equally bad (transform them to 0.0 in the regression output) @@ -101,6 +97,9 @@ def load_dataframe( peptide_column_name : str, optional Default behavior is to try {"sequence", "peptide", "peptide_sequence"} + filter_peptide_length : int, optional + Which length peptides to use (default=load all lengths) + only_human : bool Only load entries from human MHC alleles @@ -131,8 +130,9 @@ def load_dataframe( if only_human: human_mask = df[species_column_name] == "human" df = df[human_mask] - if peptide_length is not None: - length_mask = df[peptide_length_column_name] == peptide_length + + if filter_peptide_length: + length_mask = df[peptide_column_name].str.len() == filter_peptide_length df = df[length_mask] df[allele_column_name] = df[allele_column_name].map(normalize_allele_name) @@ -383,11 +383,10 @@ def load_allele_datasets( filename=filename, max_ic50=max_ic50, sep=sep, - peptide_length=peptide_length, species_column_name=species_column_name, allele_column_name=allele_column_name, peptide_column_name=peptide_column_name, - peptide_length_column_name=peptide_length_column_name, + filter_peptide_length=None if use_multiple_peptide_lengths else peptide_length, ic50_column_name=ic50_column_name, only_human=only_human) diff --git a/mhcflurry/imputation.py b/mhcflurry/imputation.py index 145c6e9a..1bcb08dc 100644 --- a/mhcflurry/imputation.py +++ b/mhcflurry/imputation.py @@ -19,7 +19,16 @@ from __future__ import ( ) from collections import defaultdict import numpy as np -from fancyimpute.dictionary_helpers import dense_matrix_from_nested_dictionary +from fancyimpute.dictionary_helpers import ( + dense_matrix_from_nested_dictionary +) +from fancyimpute import ( + KNN, + IterativeSVD, + SimpleFill, + SoftImpute, + MICE +) from .data import ( create_allele_data_from_peptide_to_ic50_dict, @@ -190,3 +199,32 @@ def create_imputed_datasets( for (allele_name, allele_dict) in allele_to_peptide_to_affinity_dict.items() } + +def imputer_from_name(imputation_method_name, **kwargs): + """ + Helper function for constructing an imputation object from a name given + typically from a commandline argument. + """ + imputation_method_name = imputation_method_name.strip().lower() + if imputation_method_name == "mice": + kwargs["n_burn_in"] = kwargs.get("n_burn_in", 5) + kwargs["n_imputations"] = kwargs.get("n_imputations", 25) + kwargs["min_value"] = kwargs.get("min_value", 0) + kwargs["max_value"] = kwargs.get("max_value", 1) + return MICE(**kwargs) + elif imputation_method_name == "knn": + kwargs["k"] = kwargs.get("k", 3) + kwargs["orientation"] = kwargs.get("orientation", "columns") + kwargs["print_interval"] = kwargs.get("print_interval", 10) + return KNN(**kwargs) + elif imputation_method_name == "svd": + kwargs["rank"] = kwargs.get("rank", 10) + return IterativeSVD(**kwargs) + elif imputation_method_name == "svt" or imputation_method_name == "softimpute": + return SoftImpute(**kwargs) + elif imputation_method_name == "mean": + return SimpleFill("mean", **kwargs) + elif imputation_method_name == "none": + return None + else: + raise ValueError("Invalid imputation method: %s" % imputation_method_name) diff --git a/script/mhcflurry-train-class1-allele-specific-models.py b/script/mhcflurry-train-class1-allele-specific-models.py index dab69768..7a1475e9 100755 --- a/script/mhcflurry-train-class1-allele-specific-models.py +++ b/script/mhcflurry-train-class1-allele-specific-models.py @@ -53,6 +53,8 @@ from mhcflurry.paths import ( CLASS1_MODEL_DIRECTORY, CLASS1_DATA_DIRECTORY ) +from mhcflurry.imputation import imputer_from_name, create_imputed_datasets + CSV_FILENAME = "combined_human_class1_dataset.csv" CSV_PATH = join(CLASS1_DATA_DIRECTORY, CSV_FILENAME) @@ -92,16 +94,24 @@ parser.add_argument( nargs="+", type=normalize_allele_name) +parser.add_argument( + "--imputation-method", + default=None, + choices=("mice", "knn", "softimpute", "svd", "mean"), + type=imputer_from_name, + help="Use the given imputation method to generate data for pre-training models") + # add options for neural network hyperparameters parser = add_hyperparameter_arguments_to_parser(parser) if __name__ == "__main__": args = parser.parse_args() print(args) + if not exists(args.output_dir): makedirs(args.output_dir) - allele_groups = load_allele_datasets( + allele_data_dict = load_allele_datasets( filename=args.binding_data_csv, peptide_length=9, use_multiple_peptide_lengths=True, @@ -111,15 +121,21 @@ if __name__ == "__main__": # concatenate datasets from all alleles to use for pre-training of # allele-specific predictors - X_all = np.vstack([group.X_index for group in allele_groups.values()]) - Y_all = np.concatenate([group.Y for group in allele_groups.values()]) + X_all = np.vstack([group.X_index for group in allele_data_dict.values()]) + Y_all = np.concatenate([group.Y for group in allele_data_dict.values()]) print("Total Dataset size = %d" % len(Y_all)) + if args.imputation_method is not None: + # TODO: use imputed data for training + imputed_data_dict = create_imputed_datasets( + allele_data_dict, + args.imputation_method) + # if user didn't specify alleles then train models for all available alleles alleles = args.alleles if not alleles: - alleles = sorted(allele_groups.keys()) + alleles = sorted(allele_data_dict.keys()) for allele_name in alleles: allele_name = normalize_allele_name(allele_name) @@ -127,7 +143,7 @@ if __name__ == "__main__": print("Skipping allele %s" % (allele_name,)) continue - allele_data = allele_groups[allele_name] + allele_data = allele_data_dict[allele_name] X = allele_data.X_index Y = allele_data.Y diff --git a/test/test_neural_nets.py b/test/test_neural_nets.py index 99f8f774..966efdd5 100644 --- a/test/test_neural_nets.py +++ b/test/test_neural_nets.py @@ -11,7 +11,7 @@ def test_make_embedding_network(): activation="tanh", embedding_input_dim=3, embedding_output_dim=20, - learning_rate=0.1) + learning_rate=0.05) X_negative = np.array([ [0] * 3, @@ -33,7 +33,7 @@ def test_make_embedding_network(): ]) X_index = np.vstack([X_negative, X_positive]) Y = np.array([0.0] * len(X_negative) + [1.0] * len(X_positive)) - nn.fit(X_index, Y, nb_epoch=10) + nn.fit(X_index, Y, nb_epoch=20) Y_pred = nn.predict(X_index) print(Y) print(Y_pred) @@ -46,7 +46,7 @@ def test_make_hotshot_network(): activation="relu", layer_sizes=[4], n_amino_acids=2, - learning_rate=0.1) + learning_rate=0.05) X_binary = np.array([ [True, False, True, False, True, False], [True, False, True, False, False, True], @@ -56,9 +56,16 @@ def test_make_hotshot_network(): [False, True, True, False, False, True], ], dtype=bool) Y = np.array([0.0, 0.0, 0.0, 0.0, 1.0, 1.0]) - nn.fit(X_binary, Y, nb_epoch=10) + nn.fit(X_binary, Y, nb_epoch=20) Y_pred = nn.predict(X_binary) print(Y) print(Y_pred) for (Y_i, Y_pred_i) in zip(Y, Y_pred): - assert abs(Y_i - Y_pred_i) <= 0.25, (Y_i, Y_pred_i) + if Y_i: + assert Y_pred_i >= 0.6, "Expected higher value than %f" % Y_pred_i + else: + assert Y_pred_i <= 0.4, "Expected lower value than %f" % Y_pred_i + +if __name__ == "__main__": + test_make_hotshot_network() + test_make_embedding_network() -- GitLab