From 548ad0253d3da923131a0409faf031c0fd48240b Mon Sep 17 00:00:00 2001 From: Tim O'Donnell <timodonnell@gmail.com> Date: Tue, 20 Feb 2018 15:05:09 -0500 Subject: [PATCH] Hold out 10% of data in models_class_unselected --- .../models_class1/GENERATE.sh | 27 +++--- .../models_class1/generate_hyperparameters.py | 88 ------------------ .../models_class1_unselected/GENERATE.sh | 3 +- .../generate_hyperparameters.py | 89 ++++++++++++++++++- .../train_allele_specific_models_command.py | 2 - 5 files changed, 107 insertions(+), 102 deletions(-) delete mode 100644 downloads-generation/models_class1/generate_hyperparameters.py mode change 120000 => 100644 downloads-generation/models_class1_unselected/generate_hyperparameters.py diff --git a/downloads-generation/models_class1/GENERATE.sh b/downloads-generation/models_class1/GENERATE.sh index 275e5c1b..642c5d96 100755 --- a/downloads-generation/models_class1/GENERATE.sh +++ b/downloads-generation/models_class1/GENERATE.sh @@ -1,8 +1,6 @@ #!/bin/bash # -# Train standard MHCflurry Class I models. -# Calls mhcflurry-class1-train-allele-specific-models on curated training data -# using the hyperparameters in "hyperparameters.yaml". +# Model select standard MHCflurry Class I models. # set -e set -x @@ -29,15 +27,24 @@ cd $SCRATCH_DIR/$DOWNLOAD_NAME mkdir models -python $SCRIPT_DIR/generate_hyperparameters.py > hyperparameters.yaml +GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0 +echo "Detected GPUS: $GPUS" -time mhcflurry-class1-train-allele-specific-models \ - --data "$(mhcflurry-downloads path data_curated)/curated_training_data.with_mass_spec.csv.bz2" \ - --hyperparameters hyperparameters.yaml \ +PROCESSORS=$(getconf _NPROCESSORS_ONLN) +echo "Detected processors: $PROCESSORS" + +time mhcflurry-class1-select-allele-specific-models \ + --models-dir "$(mhcflurry-downloads path models_class1_unselected)/models" \ --out-models-dir models \ - --percent-rank-calibration-num-peptides-per-length 100000 \ - --min-measurements-per-allele 75 \ - --num-jobs 32 16 + --scoring mass-spec consensus \ + --consensus-num-peptides-per-length 10000 \ + --min-models 8 \ + --num-jobs $(expr $PROCESSORS \* 2) --gpus $GPUS --max-workers-per-gpu 2 --max-tasks-per-worker 50 + +time mhcflurry-calibrate-percentile-ranks \ + --models-dir models \ + --num-peptides-per-length 100000 \ + --num-jobs $(expr $PROCESSORS \* 2) --gpus $GPUS --max-workers-per-gpu 2 --max-tasks-per-worker 50 cp $SCRIPT_ABSOLUTE_PATH . bzip2 LOG.txt diff --git a/downloads-generation/models_class1/generate_hyperparameters.py b/downloads-generation/models_class1/generate_hyperparameters.py deleted file mode 100644 index 4cbd1f2b..00000000 --- a/downloads-generation/models_class1/generate_hyperparameters.py +++ /dev/null @@ -1,88 +0,0 @@ -""" -Generate grid of hyperparameters -""" - -from sys import stdout -from copy import deepcopy -from yaml import dump - -base_hyperparameters = { - ########################################## - # ENSEMBLE SIZE - ########################################## - "n_models": 4, - - ########################################## - # OPTIMIZATION - ########################################## - "max_epochs": 500, - "patience": 20, - "early_stopping": True, - "validation_split": 0.1, - "minibatch_size": None, - "loss": "custom:mse_with_inequalities", - - ########################################## - # RANDOM NEGATIVE PEPTIDES - ########################################## - "random_negative_rate": 0.0, - "random_negative_constant": 25, - "random_negative_affinity_min": 20000.0, - "random_negative_affinity_max": 50000.0, - - ########################################## - # PEPTIDE REPRESENTATION - ########################################## - # One of "one-hot", "embedding", or "BLOSUM62". - "peptide_amino_acid_encoding": "BLOSUM62", - "use_embedding": False, # maintained for backward compatability - "embedding_output_dim": 8, # only used if using embedding - "kmer_size": 15, - - ########################################## - # NEURAL NETWORK ARCHITECTURE - ########################################## - "locally_connected_layers": [ - { - "filters": 8, - "activation": "tanh", - "kernel_size": 3 - } - ], - "activation": "tanh", - "output_activation": "sigmoid", - "layer_sizes": [16], - "dense_layer_l1_regularization": None, - "batch_normalization": False, - "dropout_probability": 0.0, - - ########################################## - # TRAINING Data - ########################################## - "train_data": {"subset": "all", "pretrain_min_points": 1000}, -} - -grid = [] -for train_subset in ["all", "quantitative"]: - for minibatch_size in [128]: - for dense_layer_size in [8, 16, 32, 64]: - for l1 in [0.0, 0.001]: - for num_lc in [0, 1, 2]: - for lc_kernel_size in [3, 5]: - new = deepcopy(base_hyperparameters) - new["minibatch_size"] = minibatch_size - new["train_data"]["subset"] = train_subset - new["layer_sizes"] = [dense_layer_size] - new["dense_layer_l1_regularization"] = l1 - (lc_layer,) = new["locally_connected_layers"] - lc_layer['kernel_size'] = lc_kernel_size - if num_lc == 0: - new["locally_connected_layers"] = [] - elif num_lc == 1: - new["locally_connected_layers"] = [lc_layer] - elif num_lc == 2: - new["locally_connected_layers"] = [lc_layer, deepcopy(lc_layer)] - if not grid or new not in grid: - grid.append(new) - -dump(grid, stdout) diff --git a/downloads-generation/models_class1_unselected/GENERATE.sh b/downloads-generation/models_class1_unselected/GENERATE.sh index ea9ba8a4..19dc08b4 100755 --- a/downloads-generation/models_class1_unselected/GENERATE.sh +++ b/downloads-generation/models_class1_unselected/GENERATE.sh @@ -45,7 +45,8 @@ time mhcflurry-class1-train-allele-specific-models \ --hyperparameters hyperparameters.yaml \ --out-models-dir models \ --percent-rank-calibration-num-peptides-per-length 0 \ - --min-measurements-per-allele 75 \ + --held-out-fraction-reciprocal 10 \ + --min-measurements-per-allele 50 \ --num-jobs $(expr $PROCESSORS \* 2) --gpus $GPUS --max-workers-per-gpu 2 --max-tasks-per-worker 50 cp $SCRIPT_ABSOLUTE_PATH . diff --git a/downloads-generation/models_class1_unselected/generate_hyperparameters.py b/downloads-generation/models_class1_unselected/generate_hyperparameters.py deleted file mode 120000 index 5f2599b4..00000000 --- a/downloads-generation/models_class1_unselected/generate_hyperparameters.py +++ /dev/null @@ -1 +0,0 @@ -../models_class1/generate_hyperparameters.py \ No newline at end of file diff --git a/downloads-generation/models_class1_unselected/generate_hyperparameters.py b/downloads-generation/models_class1_unselected/generate_hyperparameters.py new file mode 100644 index 00000000..4cbd1f2b --- /dev/null +++ b/downloads-generation/models_class1_unselected/generate_hyperparameters.py @@ -0,0 +1,88 @@ +""" +Generate grid of hyperparameters +""" + +from sys import stdout +from copy import deepcopy +from yaml import dump + +base_hyperparameters = { + ########################################## + # ENSEMBLE SIZE + ########################################## + "n_models": 4, + + ########################################## + # OPTIMIZATION + ########################################## + "max_epochs": 500, + "patience": 20, + "early_stopping": True, + "validation_split": 0.1, + "minibatch_size": None, + "loss": "custom:mse_with_inequalities", + + ########################################## + # RANDOM NEGATIVE PEPTIDES + ########################################## + "random_negative_rate": 0.0, + "random_negative_constant": 25, + "random_negative_affinity_min": 20000.0, + "random_negative_affinity_max": 50000.0, + + ########################################## + # PEPTIDE REPRESENTATION + ########################################## + # One of "one-hot", "embedding", or "BLOSUM62". + "peptide_amino_acid_encoding": "BLOSUM62", + "use_embedding": False, # maintained for backward compatability + "embedding_output_dim": 8, # only used if using embedding + "kmer_size": 15, + + ########################################## + # NEURAL NETWORK ARCHITECTURE + ########################################## + "locally_connected_layers": [ + { + "filters": 8, + "activation": "tanh", + "kernel_size": 3 + } + ], + "activation": "tanh", + "output_activation": "sigmoid", + "layer_sizes": [16], + "dense_layer_l1_regularization": None, + "batch_normalization": False, + "dropout_probability": 0.0, + + ########################################## + # TRAINING Data + ########################################## + "train_data": {"subset": "all", "pretrain_min_points": 1000}, +} + +grid = [] +for train_subset in ["all", "quantitative"]: + for minibatch_size in [128]: + for dense_layer_size in [8, 16, 32, 64]: + for l1 in [0.0, 0.001]: + for num_lc in [0, 1, 2]: + for lc_kernel_size in [3, 5]: + new = deepcopy(base_hyperparameters) + new["minibatch_size"] = minibatch_size + new["train_data"]["subset"] = train_subset + new["layer_sizes"] = [dense_layer_size] + new["dense_layer_l1_regularization"] = l1 + (lc_layer,) = new["locally_connected_layers"] + lc_layer['kernel_size'] = lc_kernel_size + if num_lc == 0: + new["locally_connected_layers"] = [] + elif num_lc == 1: + new["locally_connected_layers"] = [lc_layer] + elif num_lc == 2: + new["locally_connected_layers"] = [lc_layer, deepcopy(lc_layer)] + if not grid or new not in grid: + grid.append(new) + +dump(grid, stdout) diff --git a/mhcflurry/train_allele_specific_models_command.py b/mhcflurry/train_allele_specific_models_command.py index fdc1ae22..6084e874 100644 --- a/mhcflurry/train_allele_specific_models_command.py +++ b/mhcflurry/train_allele_specific_models_command.py @@ -60,14 +60,12 @@ parser.add_argument( metavar="FILE.json", required=True, help="JSON or YAML of hyperparameters") - parser.add_argument( "--allele", default=None, nargs="+", help="Alleles to train models for. If not specified, all alleles with " "enough measurements will be used.") - parser.add_argument( "--min-measurements-per-allele", type=int, -- GitLab