From 4a2f6a182546dda094c0fc1977f7760bc072ecb0 Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Wed, 18 Sep 2019 18:07:40 -0400 Subject: [PATCH] fix --- .../GENERATE.WITH_HPC_CLUSTER.sh | 2 +- .../models_class1_pan_unselected/GENERATE.sh | 2 +- mhcflurry/select_pan_allele_models_command.py | 28 +++--------- mhcflurry/train_pan_allele_models_command.py | 14 +++--- test/test_train_pan_allele_models_command.py | 43 +++++++++++++------ 5 files changed, 43 insertions(+), 46 deletions(-) diff --git a/downloads-generation/models_class1_pan_unselected/GENERATE.WITH_HPC_CLUSTER.sh b/downloads-generation/models_class1_pan_unselected/GENERATE.WITH_HPC_CLUSTER.sh index d3d232f7..d8b70a64 100755 --- a/downloads-generation/models_class1_pan_unselected/GENERATE.WITH_HPC_CLUSTER.sh +++ b/downloads-generation/models_class1_pan_unselected/GENERATE.WITH_HPC_CLUSTER.sh @@ -61,7 +61,7 @@ do --allele-sequences "$(mhcflurry-downloads path allele_sequences)/allele_sequences.csv" \ --pretrain-data "$(mhcflurry-downloads path random_peptide_predictions)/predictions.csv.bz2" \ --held-out-measurements-per-allele-fraction-and-max 0.25 100 \ - --ensemble-size 4 \ + --num-folds 4 \ --hyperparameters hyperparameters.yaml \ --out-models-dir $(pwd)/models.${kind} \ --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \ diff --git a/downloads-generation/models_class1_pan_unselected/GENERATE.sh b/downloads-generation/models_class1_pan_unselected/GENERATE.sh index 51676bf0..c5799bb9 100755 --- a/downloads-generation/models_class1_pan_unselected/GENERATE.sh +++ b/downloads-generation/models_class1_pan_unselected/GENERATE.sh @@ -73,7 +73,7 @@ do --allele-sequences "$(mhcflurry-downloads path allele_sequences)/allele_sequences.csv" \ --pretrain-data "$(mhcflurry-downloads path random_peptide_predictions)/predictions.csv.bz2" \ --held-out-measurements-per-allele-fraction-and-max 0.25 100 \ - --ensemble-size 4 \ + --num-folds 4 \ --hyperparameters hyperparameters.yaml \ --out-models-dir models.${kind} \ --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \ diff --git a/mhcflurry/select_pan_allele_models_command.py b/mhcflurry/select_pan_allele_models_command.py index c931632b..b99657f4 100644 --- a/mhcflurry/select_pan_allele_models_command.py +++ b/mhcflurry/select_pan_allele_models_command.py @@ -51,11 +51,6 @@ parser.add_argument( help=( "Model selection data CSV. Expected columns: " "allele, peptide, measurement_value")) -parser.add_argument( - "--folds", - metavar="FILE.csv", - required=False, - help=("")) parser.add_argument( "--models-dir", metavar="DIR", @@ -161,19 +156,6 @@ def run(argv=sys.argv[1:]): metadata_dfs = {} - if args.folds: - folds_df = pandas.read_csv(args.folds) - matches = all([ - len(folds_df) == len(df), - (folds_df.peptide == df.peptide).all(), - (folds_df.allele == df.allele).all(), - ]) - if not matches: - raise ValueError("Training data and fold data do not match") - fold_cols = [c for c in folds_df if c.startswith("fold_")] - for col in fold_cols: - df[col] = folds_df[col] - fold_cols = [c for c in df if c.startswith("fold_")] num_folds = len(fold_cols) if num_folds <= 1: @@ -193,8 +175,6 @@ def run(argv=sys.argv[1:]): df = df.loc[df.allele.isin(alleles)].dropna() print("Subselected to supported alleles: %s" % str(df.shape)) - print("Selected %d alleles: %s" % (len(alleles), ' '.join(alleles))) - metadata_dfs["model_selection_data"] = df df["mass_spec"] = df.measurement_source.str.contains( @@ -248,13 +228,13 @@ def run(argv=sys.argv[1:]): if serial_run: # Serial run print("Running in serial.") - results = (do_model_select_task(item) for item in work_items) + results = (model_select(**item) for item in work_items) elif args.cluster_parallelism: # Run using separate processes HPC cluster. print("Running on cluster.") results = cluster_results_from_args( args, - work_function=do_model_select_task, + work_function=model_select, work_items=work_items, constant_data=GLOBAL_DATA, result_serialization_method="pickle") @@ -268,7 +248,9 @@ def run(argv=sys.argv[1:]): # Parallel run results = worker_pool.imap_unordered( - do_model_select_task, work_items, chunksize=1) + do_model_select_task, + work_items, + chunksize=1) models_by_fold = {} summary_dfs = [] diff --git a/mhcflurry/train_pan_allele_models_command.py b/mhcflurry/train_pan_allele_models_command.py index f20c9e3e..5947183e 100644 --- a/mhcflurry/train_pan_allele_models_command.py +++ b/mhcflurry/train_pan_allele_models_command.py @@ -84,11 +84,11 @@ parser.add_argument( default=False, help="Do not use affinity value inequalities even when present in data") parser.add_argument( - "--ensemble-size", + "--num-folds", type=int, + default=4, metavar="N", - help="Ensemble size, i.e. how many models to retain the final predictor. " - "In the current implementation, this is also the number of training folds.") + help="Number of training folds.") parser.add_argument( "--num-replicates", type=int, @@ -296,7 +296,7 @@ def initialize_training(args): "data", "out_models_dir", "hyperparameters", - "ensemble_size", + "num_folds", ] for arg in required_arguments: if getattr(args, arg) is None: @@ -338,7 +338,7 @@ def initialize_training(args): folds_df = assign_folds( df=df, - num_folds=args.ensemble_size, + num_folds=args.num_folds, held_out_fraction=held_out_fraction, held_out_max=held_out_max) @@ -387,14 +387,14 @@ def initialize_training(args): if not args.pretrain_data: raise ValueError("--pretrain-data is required") - for fold in range(args.ensemble_size): + for fold in range(args.num_folds): for replicate in range(args.num_replicates): work_dict = { 'work_item_name': str(uuid.uuid4()), 'architecture_num': h, 'num_architectures': len(hyperparameters_lst), 'fold_num': fold, - 'num_folds': args.ensemble_size, + 'num_folds': args.num_folds, 'replicate_num': replicate, 'num_replicates': args.num_replicates, 'hyperparameters': hyperparameters, diff --git a/test/test_train_pan_allele_models_command.py b/test/test_train_pan_allele_models_command.py index 53c5bb25..c214ec56 100644 --- a/test/test_train_pan_allele_models_command.py +++ b/test/test_train_pan_allele_models_command.py @@ -2,6 +2,11 @@ Tests for training and predicting using Class1 pan-allele models. """ +import logging +logging.getLogger('tensorflow').disabled = True +logging.getLogger('matplotlib').disabled = True + + import json import os import shutil @@ -36,7 +41,7 @@ HYPERPARAMETERS_LIST = [ 'learning_rate': None, 'locally_connected_layers': [], 'loss': 'custom:mse_with_inequalities', - 'max_epochs': 5, + 'max_epochs': 0, # never selected 'minibatch_size': 256, 'optimizer': 'rmsprop', 'output_activation': 'sigmoid', @@ -100,7 +105,7 @@ HYPERPARAMETERS_LIST = [ }, 'validation_split': 0.1, }, -][1:] +] def run_and_check(n_jobs=0, delete=True, additional_args=[]): @@ -114,37 +119,47 @@ def run_and_check(n_jobs=0, delete=True, additional_args=[]): get_path("data_curated", "curated_training_data.no_mass_spec.csv.bz2")) selected_data_df = data_df.loc[data_df.allele.str.startswith("HLA-A")] selected_data_df.to_csv( - os.path.join(models_dir, "train_data.csv"), index=False) + os.path.join(models_dir, "_train_data.csv"), index=False) args = [ "mhcflurry-class1-train-pan-allele-models", - "--data", os.path.join(models_dir, "train_data.csv"), + "--data", os.path.join(models_dir, "_train_data.csv"), "--allele-sequences", get_path("allele_sequences", "allele_sequences.csv"), "--hyperparameters", hyperparameters_filename, "--out-models-dir", models_dir, "--num-jobs", str(n_jobs), - "--ensemble-size", "2", + "--num-folds", "2", "--verbosity", "1", - # "--pretrain-data", get_path( - # "random_peptide_predictions", "predictions.csv.bz2"), ] + additional_args print("Running with args: %s" % args) subprocess.check_call(args) - result = Class1AffinityPredictor.load(models_dir) - predictions = result.predict( - peptides=["SLYNTVATL"], + # Run model selection + models_dir_selected = tempfile.mkdtemp( + prefix="mhcflurry-test-models-selected") + args = [ + "mhcflurry-class1-select-pan-allele-models", + "--data", os.path.join(models_dir, "train_data.csv.bz2"), + "--models-dir", models_dir, + "--out-models-dir", models_dir_selected, + "--max-models", "1", + "--num-jobs", str(n_jobs), + ] + additional_args + print("Running with args: %s" % args) + subprocess.check_call(args) + + result = Class1AffinityPredictor.load( + models_dir_selected, optimization_level=0) + assert_equal(len(result.neural_networks), 2) + predictions = result.predict(peptides=["SLYNTVATL"], alleles=["HLA-A*02:01"]) assert_equal(predictions.shape, (1,)) assert_array_less(predictions, 1000) - df = result.predict_to_dataframe( - peptides=["SLYNTVATL"], - alleles=["HLA-A*02:01"]) - print(df) if delete: print("Deleting: %s" % models_dir) shutil.rmtree(models_dir) + shutil.rmtree(models_dir_selected) if os.environ.get("KERAS_BACKEND") != "theano": -- GitLab