From 51dabf24215359e48eab042871517b5d6429e275 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Wed, 14 Feb 2018 16:21:06 -0500
Subject: [PATCH] first cut on multi-round training

---
 .../models_class1/generate_hyperparameters.py |   2 +-
 .../models_class1_unselected/GENERATE.sh      |   3 +
 .../class1_pseudosequences.csv                |   0
 mhcflurry/class1_affinity_predictor.py        | 139 ++++++++--------
 mhcflurry/class1_neural_network.py            |   6 +-
 .../train_allele_specific_models_command.py   | 156 +++++++++++++-----
 6 files changed, 191 insertions(+), 115 deletions(-)
 rename {mhcflurry/data => downloads-generation/models_class1_unselected}/class1_pseudosequences.csv (100%)

diff --git a/downloads-generation/models_class1/generate_hyperparameters.py b/downloads-generation/models_class1/generate_hyperparameters.py
index a805a32c..5b1bf5e0 100644
--- a/downloads-generation/models_class1/generate_hyperparameters.py
+++ b/downloads-generation/models_class1/generate_hyperparameters.py
@@ -59,7 +59,7 @@ base_hyperparameters = {
     ##########################################
     # TRAINING Data
     ##########################################
-    "train_data": {"subset": "all"},
+    "train_data": {"subset": "all", "pretrain_min_points": 10000},
 }
 
 grid = []
diff --git a/downloads-generation/models_class1_unselected/GENERATE.sh b/downloads-generation/models_class1_unselected/GENERATE.sh
index a3cdb5c0..c92baa02 100755
--- a/downloads-generation/models_class1_unselected/GENERATE.sh
+++ b/downloads-generation/models_class1_unselected/GENERATE.sh
@@ -29,6 +29,8 @@ cd $SCRATCH_DIR/$DOWNLOAD_NAME
 
 mkdir models
 
+cp $SCRIPT_DIR/class1_pseudosequences.csv .
+
 python $SCRIPT_DIR/generate_hyperparameters.py > hyperparameters.yaml
 
 GPUS=$(nvidia-smi -L 2> /dev/null | wc -l) || GPUS=0
@@ -39,6 +41,7 @@ echo "Detected processors: $PROCESSORS"
 
 time mhcflurry-class1-train-allele-specific-models \
     --data "$(mhcflurry-downloads path data_curated)/curated_training_data.no_mass_spec.csv.bz2" \
+    --allele-sequences class1_pseudosequences.csv \
    --hyperparameters hyperparameters.yaml \
    --out-models-dir models \
    --percent-rank-calibration-num-peptides-per-length 0 \
diff --git a/mhcflurry/data/class1_pseudosequences.csv b/downloads-generation/models_class1_unselected/class1_pseudosequences.csv
similarity index 100%
rename from mhcflurry/data/class1_pseudosequences.csv
rename to downloads-generation/models_class1_unselected/class1_pseudosequences.csv
diff --git a/mhcflurry/class1_affinity_predictor.py b/mhcflurry/class1_affinity_predictor.py
index 4773df95..2f0df6b8 100644
--- a/mhcflurry/class1_affinity_predictor.py
+++ b/mhcflurry/class1_affinity_predictor.py
@@ -448,13 +448,14 @@ class Class1AffinityPredictor(object):
             peptides,
             affinities,
             inequalities=None,
+            train_rounds=None,
             models_dir_for_save=None,
             verbose=0,
             progress_preamble="",
             progress_print_interval=5.0):
         """
-        Fit one or more allele specific predictors for a single allele using a
-        single neural network architecture.
+        Fit one or more allele specific predictors for a single allele using one
+        or more neural network architectures.
 
         The new predictors are saved in the Class1AffinityPredictor instance
         and will be used on subsequent calls to `predict`.
@@ -461,12 +462,10 @@
 
         Parameters
         ----------
         n_models : int
             Number of neural networks to fit
 
         architecture_hyperparameters_list : list of dict
-            List of hyperparameters. If more than one set of hyperparameters
-            are specified, model selection over them is performed and model
-            with the lowest validation loss is selected for each fold.
+            List of hyperparameter sets.
 
         allele : string
@@ -478,6 +477,10 @@
 
         inequalities : list of string, each element one of ">", "<", or "="
             See Class1NeuralNetwork.fit for details.
+
+        train_rounds : sequence of int
+            Each training point i will be used in training rounds 0 through
+            train_rounds[i] (inclusive); round 0 uses all the data.
 
         models_dir_for_save : string, optional
             If specified, the Class1AffinityPredictor is (incrementally) written
@@ -502,78 +505,76 @@
             self.allele_to_allele_specific_models[allele] = []
 
         encodable_peptides = EncodableSequences.create(peptides)
+        peptides_affinities_inequalities_per_round = [
+            (encodable_peptides, affinities, inequalities)
+        ]
+
+        if train_rounds is not None:
+            for round in range(1, train_rounds.max() + 1):
+                round_mask = train_rounds >= round
+                sub_encodable_peptides = EncodableSequences.create(
+                    encodable_peptides.sequences[round_mask])
+                peptides_affinities_inequalities_per_round.append((
+                    sub_encodable_peptides,
+                    affinities[round_mask],
+                    None if inequalities is None else inequalities[round_mask]))
 
+        n_rounds = len(peptides_affinities_inequalities_per_round)
         n_architectures = len(architecture_hyperparameters_list)
-        if n_models > 1 or n_architectures > 1:
-            # Adjust progress info to indicate number of models and
-            # architectures.
-            pieces = []
-            if n_models > 1:
-                pieces.append("Model {model_num:2d} / {n_models:2d}")
-            if n_architectures > 1:
-                pieces.append(
-                    "Architecture {architecture_num:2d} / {n_architectures:2d}"
-                    " (best so far: {best_num})")
-            progress_preamble_template = "[ %s ] {user_progress_preamble}" % (
-                ", ".join(pieces))
-        else:
-            # Just use the user-provided progress message.
-            progress_preamble_template = "{user_progress_preamble}"
+
+        # Adjust progress info to indicate number of models and
+        # architectures.
+        pieces = []
+        if n_models > 1:
+            pieces.append("Model {model_num:2d} / {n_models:2d}")
+        if n_architectures > 1:
+            pieces.append(
+                "Architecture {architecture_num:2d} / {n_architectures:2d}")
+        if len(peptides_affinities_inequalities_per_round) > 1:
+            pieces.append("Round {round:2d} / {n_rounds:2d}")
+        pieces.append("{n_peptides:4d} peptides")
+        progress_preamble_template = "[ %s ] {user_progress_preamble}" % (
+            ", ".join(pieces))
 
         models = []
         for model_num in range(n_models):
-            shuffle_permutation = numpy.random.permutation(len(affinities))
-
-            best_num = None
-            best_loss = None
-            best_model = None
             for (architecture_num, architecture_hyperparameters) in enumerate(
                     architecture_hyperparameters_list):
                 model = Class1NeuralNetwork(**architecture_hyperparameters)
-                model.fit(
-                    encodable_peptides,
-                    affinities,
-                    shuffle_permutation=shuffle_permutation,
-                    inequalities=inequalities,
-                    verbose=verbose,
-                    progress_preamble=progress_preamble_template.format(
-                        user_progress_preamble=progress_preamble,
-                        best_num="n/a" if best_num is None else best_num + 1,
-                        model_num=model_num + 1,
-                        n_models=n_models,
-                        architecture_num=architecture_num + 1,
-                        n_architectures=n_architectures),
-                    progress_print_interval=progress_print_interval)
-
-                if n_architectures > 1:
-                    # We require val_loss (i.e. a validation set) if we have
-                    # multiple architectures.
-                    loss = model.loss_history['val_loss'][-1]
-                else:
-                    loss = None
-                if loss is None or best_loss is None or best_loss > loss:
-                    best_loss = loss
-                    best_num = architecture_num
-                    best_model = model
-                del model
-
-            if n_architectures > 1:
-                print("Selected architecture %d." % (best_num + 1))
-
-            model_name = self.model_name(allele, model_num)
-            row = pandas.Series(collections.OrderedDict([
-                ("model_name", model_name),
-                ("allele", allele),
-                ("config_json", json.dumps(best_model.get_config())),
-                ("model", best_model),
-            ])).to_frame().T
-            self.manifest_df = pandas.concat(
-                [self.manifest_df, row], ignore_index=True)
-            self.allele_to_allele_specific_models[allele].append(best_model)
-            if models_dir_for_save:
-                self.save(
-                    models_dir_for_save, model_names_to_write=[model_name])
-            models.append(best_model)
+                for round_num in range(n_rounds):
+                    (round_peptides, round_affinities, round_inequalities) = (
+                        peptides_affinities_inequalities_per_round[round_num]
+                    )
+                    model.fit(
+                        round_peptides,
+                        round_affinities,
+                        inequalities=round_inequalities,
+                        verbose=verbose,
+                        progress_preamble=progress_preamble_template.format(
+                            n_peptides=len(round_peptides),
+                            round=round_num,
+                            n_rounds=n_rounds,
+                            user_progress_preamble=progress_preamble,
+                            model_num=model_num + 1,
+                            n_models=n_models,
+                            architecture_num=architecture_num + 1,
+                            n_architectures=n_architectures),
+                        progress_print_interval=progress_print_interval)
+
+                model_name = self.model_name(allele, model_num)
+                row = pandas.Series(collections.OrderedDict([
+                    ("model_name", model_name),
+                    ("allele", allele),
+                    ("config_json", json.dumps(model.get_config())),
+                    ("model", model),
+                ])).to_frame().T
+                self.manifest_df = pandas.concat(
+                    [self.manifest_df, row], ignore_index=True)
+                self.allele_to_allele_specific_models[allele].append(model)
+                if models_dir_for_save:
+                    self.save(
+                        models_dir_for_save, model_names_to_write=[model_name])
+                models.append(model)
 
         return models
 
diff --git a/mhcflurry/class1_neural_network.py b/mhcflurry/class1_neural_network.py
index 9b65a4aa..c81a3638 100644
--- a/mhcflurry/class1_neural_network.py
+++ b/mhcflurry/class1_neural_network.py
@@ -99,7 +99,7 @@ class Class1NeuralNetwork(object):
     """
 
     miscelaneous_hyperparameter_defaults = HyperparameterDefaults(
-        train_data={'subset': 'all'},
+        train_data={},
     )
     """
     Miscelaneous hyperaparameters. These parameters are not used by this class
@@ -165,7 +165,7 @@
 
         self.loss_history = None
         self.fit_seconds = None
-        self.fit_num_points = None
+        self.fit_num_points = []
 
         self.prediction_cache = weakref.WeakKeyDictionary()
 
@@ -472,7 +472,7 @@
             disable.
""" - self.fit_num_points = len(peptides) + self.fit_num_points.append(len(peptides)) encodable_peptides = EncodableSequences.create(peptides) peptide_encoding = self.peptides_to_network_input(encodable_peptides) diff --git a/mhcflurry/train_allele_specific_models_command.py b/mhcflurry/train_allele_specific_models_command.py index 7e4fdeb3..decd4b98 100644 --- a/mhcflurry/train_allele_specific_models_command.py +++ b/mhcflurry/train_allele_specific_models_command.py @@ -13,6 +13,7 @@ from functools import partial import numpy import pandas import yaml +from sklearn.metrics.pairwise import cosine_similarity from mhcnames import normalize_allele_name import tqdm # progress bar tqdm.monitor_interval = 0 # see https://github.com/tqdm/tqdm/issues/481 @@ -20,6 +21,8 @@ tqdm.monitor_interval = 0 # see https://github.com/tqdm/tqdm/issues/481 from .class1_affinity_predictor import Class1AffinityPredictor from .common import configure_logging, set_keras_backend from .parallelism import make_worker_pool, cpu_count +from .hyperparameters import HyperparameterDefaults +from .allele_encoding import AlleleEncoding # To avoid pickling large matrices to send to child processes when running in @@ -92,6 +95,10 @@ parser.add_argument( metavar="N", help="Max training epochs. If specified here it overrides any 'max_epochs' " "specified in the hyperparameters.") +parser.add_argument( + "--allele-sequences", + metavar="FILE.csv", + help="Allele sequences file. Used for computing allele similarity matrix.") parser.add_argument( "--verbosity", type=int, @@ -140,6 +147,12 @@ parser.add_argument( "leaks. Requires Python >=3.2.") +TRAIN_DATA_HYPERPARAMETER_DEFAULTS = HyperparameterDefaults( + subset="all", + pretrain_min_points=None, +) + + def run(argv=sys.argv[1:]): global GLOBAL_DATA @@ -187,16 +200,85 @@ def run(argv=sys.argv[1:]): print("Training data: %s" % (str(df.shape))) GLOBAL_DATA["train_data"] = df + GLOBAL_DATA["train_data_size_by_allele"] = df.allele.value_counts() GLOBAL_DATA["args"] = args + if not os.path.exists(args.out_models_dir): + print("Attempting to create directory: %s" % args.out_models_dir) + os.mkdir(args.out_models_dir) + print("Done.") + predictor = Class1AffinityPredictor() - if args.num_jobs[0] == 1: + serial_run = args.num_jobs[0] == 1 + + work_items = [] + for (h, hyperparameters) in enumerate(hyperparameters_lst): + n_models = None + if 'n_models' in hyperparameters: + n_models = hyperparameters.pop("n_models") + if args.n_models: + n_models = args.n_models + if not n_models: + raise ValueError("Specify --ensemble-size or n_models hyperparameter") + + if args.max_epochs: + hyperparameters['max_epochs'] = args.max_epochs + + hyperparameters['train_data'] = TRAIN_DATA_HYPERPARAMETER_DEFAULTS.with_defaults( + hyperparameters.get('train_data', {}) + ) + + if hyperparameters['train_data']['pretrain_min_points'] and ( + 'allele_similarity_matrix' not in GLOBAL_DATA): + print("Generating allele similarity matrix.") + if not args.allele_sequences: + parser.error( + "Allele sequences required when using pretrain_min_points") + allele_sequences = pandas.read_csv( + args.allele_sequences, + index_col="allele") + print("Read %d allele sequences" % len(allele_sequences)) + allele_sequences = allele_sequences.loc[ + allele_sequences.index.isin(df.allele.unique()) + ] + blosum_encoding = ( + AlleleEncoding( + allele_sequences.index.values, + allele_sequences.pseudosequence.to_dict()) + .fixed_length_vector_encoded_sequences("BLOSUM62")) + allele_similarity_matrix = pandas.DataFrame( + 
+                cosine_similarity(
+                    blosum_encoding.reshape((len(allele_sequences), -1))),
+                index=allele_sequences.index.values,
+                columns=allele_sequences.index.values)
+            GLOBAL_DATA['allele_similarity_matrix'] = allele_similarity_matrix
+            print("Computed allele similarity matrix")
+            print(allele_similarity_matrix)
+
+        for (i, allele) in enumerate(df.allele.unique()):
+            for model_num in range(n_models):
+                work_dict = {
+                    'n_models': 1,
+                    'allele_num': i,
+                    'n_alleles': len(alleles),
+                    'hyperparameter_set_num': h,
+                    'num_hyperparameter_sets': len(hyperparameters_lst),
+                    'allele': allele,
+                    'hyperparameters': hyperparameters,
+                    'verbose': args.verbosity,
+                    'progress_print_interval': None if not serial_run else 5.0,
+                    'predictor': predictor if serial_run else None,
+                    'save_to': args.out_models_dir if serial_run else None,
+                }
+                work_items.append(work_dict)
+
+    start = time.time()
+    if serial_run:
         # Serial run.
         print("Running in serial.")
         worker_pool = None
         if args.backend:
             set_keras_backend(args.backend)
-
     else:
         # Parallel run.
         num_workers = args.num_jobs[0] if args.num_jobs[0] else cpu_count()
@@ -238,43 +320,6 @@
             initializer_kwargs_per_process=worker_init_kwargs,
             max_tasks_per_worker=args.max_tasks_per_worker)
 
-    if not os.path.exists(args.out_models_dir):
-        print("Attempting to create directory: %s" % args.out_models_dir)
-        os.mkdir(args.out_models_dir)
-        print("Done.")
-
-    start = time.time()
-    work_items = []
-    for (h, hyperparameters) in enumerate(hyperparameters_lst):
-        n_models = None
-        if 'n_models' in hyperparameters:
-            n_models = hyperparameters.pop("n_models")
-        if args.n_models:
-            n_models = args.n_models
-        if not n_models:
-            raise ValueError("Specify --ensemble-size or n_models hyperparameter")
-
-        if args.max_epochs:
-            hyperparameters['max_epochs'] = args.max_epochs
-
-        for (i, allele) in enumerate(df.allele.unique()):
-            for model_num in range(n_models):
-                work_dict = {
-                    'n_models': 1,
-                    'allele_num': i,
-                    'n_alleles': len(alleles),
-                    'hyperparameter_set_num': h,
-                    'num_hyperparameter_sets': len(hyperparameters_lst),
-                    'allele': allele,
-                    'data': None,  # subselect from GLOBAL_DATA["train_data"]
-                    'hyperparameters': hyperparameters,
-                    'verbose': args.verbosity,
-                    'progress_print_interval': None if worker_pool else 5.0,
-                    'predictor': predictor if not worker_pool else None,
-                    'save_to': args.out_models_dir if not worker_pool else None,
-                }
-                work_items.append(work_dict)
-
 
     if worker_pool:
         print("Processing %d work items in parallel." % len(work_items))
@@ -400,6 +445,19 @@ def train_model_entry_point(item):
     return train_model(**item)
 
 
+def alleles_by_similarity(allele):
+    global GLOBAL_DATA
+    allele_similarity = GLOBAL_DATA['allele_similarity_matrix']
+    if allele not in allele_similarity.columns:
+        # No similarity info for this allele; fall back to a random ordering.
+        print("No similar alleles for: %s" % allele)
+        return [allele] + list(allele_similarity.columns.to_series().sample(frac=1.0))
+    return (
+        allele_similarity[allele] + (
+            allele_similarity.index == allele)  # ensure the specified allele sorts first
+    ).sort_values(ascending=False).index.tolist()
+
+
 def train_model(
         n_models,
         allele_num,
@@ -407,7 +465,6 @@
         hyperparameter_set_num,
         num_hyperparameter_sets,
         allele,
-        data,
         hyperparameters,
         verbose,
         progress_print_interval,
@@ -417,8 +474,22 @@
     if predictor is None:
         predictor = Class1AffinityPredictor()
 
-    if data is None:
-        full_data = GLOBAL_DATA["train_data"]
+    pretrain_min_points = hyperparameters['train_data']['pretrain_min_points']
+
+    full_data = GLOBAL_DATA["train_data"]
+    data_size_by_allele = GLOBAL_DATA["train_data_size_by_allele"]
+
+    if pretrain_min_points:
+        similar_alleles = alleles_by_similarity(allele)
+        alleles = []
+        while not alleles or data_size_by_allele.loc[alleles].sum() < pretrain_min_points:
+            alleles.append(similar_alleles.pop(0))
+        print(alleles)
+        data = full_data.loc[full_data.allele.isin(alleles)]
+        assert len(data) >= pretrain_min_points, (len(data), pretrain_min_points)
+        train_rounds = (data.allele == allele).astype(int).values
+    else:
+        train_rounds = None
         data = full_data.loc[full_data.allele == allele]
 
     subset = hyperparameters.get("train_data", {}).get("subset", "all")
@@ -450,6 +521,7 @@
         inequalities=(
             train_data.measurement_inequality.values
             if "measurement_inequality" in train_data.columns else None),
+        train_rounds=train_rounds,
         models_dir_for_save=save_to,
         progress_preamble=progress_preamble,
         progress_print_interval=progress_print_interval,
--
GitLab
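
Notes
-----

The multi-round scheme in fit_allele_specific_predictors can be seen in
miniature below. This is an illustrative sketch (the peptides, affinities,
and train_rounds values are invented): round 0 always trains on the full
data, and each later round r keeps only the points with train_rounds >= r,
so a point with train_rounds[i] = k participates in rounds 0 through k.

    import numpy

    peptides = numpy.array(["SIINFEKL", "SIINFEKQ", "QYDDAVYKL", "AAAWYLWEV"])
    affinities = numpy.array([120.0, 5000.0, 35.0, 400.0])

    # Points borrowed from similar alleles get 0 (pretraining only); points
    # from the target allele get 1, exactly as train_model derives them via
    # (data.allele == allele).astype(int).
    train_rounds = numpy.array([0, 0, 1, 1])

    subsets = [(peptides, affinities)]  # round 0: everything
    for r in range(1, train_rounds.max() + 1):
        mask = train_rounds >= r
        subsets.append((peptides[mask], affinities[mask]))

    for (r, (p, a)) in enumerate(subsets):
        print("round %d: %d points" % (r, len(p)))
    # round 0: 4 points
    # round 1: 2 points

Because the same Class1NeuralNetwork is fit once per round, the final round
amounts to fine-tuning on the target allele after pretraining on the pooled
neighbors.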
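
The allele similarity matrix is cosine similarity over flattened
per-position encodings of the allele pseudosequences. The patch uses
mhcflurry's AlleleEncoding with BLOSUM62 vectors; the standalone sketch
below substitutes a plain one-hot encoding and made-up, shortened
pseudosequences, to show the shape of the computation without the
mhcflurry internals.

    import numpy
    import pandas
    from sklearn.metrics.pairwise import cosine_similarity

    # Hypothetical, shortened pseudosequences; the real ones come from
    # class1_pseudosequences.csv.
    pseudosequences = pandas.Series({
        "HLA-A*01:01": "YFAMYGEKVA",
        "HLA-A*02:01": "YFAMYQENMA",
        "HLA-B*07:02": "YYSEYRNICA",
    })

    alphabet = "ACDEFGHIKLMNPQRSTVWY"
    aa_index = dict((aa, i) for (i, aa) in enumerate(alphabet))

    def encode(sequence):
        # One-hot (length x 20) matrix, flattened to a single vector.
        arr = numpy.zeros((len(sequence), len(alphabet)))
        for (pos, aa) in enumerate(sequence):
            arr[pos, aa_index[aa]] = 1.0
        return arr.reshape(-1)

    encoded = numpy.stack([encode(s) for s in pseudosequences])
    similarity = pandas.DataFrame(
        cosine_similarity(encoded),
        index=pseudosequences.index,
        columns=pseudosequences.index)
    print(similarity.round(2))

With BLOSUM62 rows in place of one-hot rows, as in the patch, related
residues contribute partial similarity instead of zero, which makes the
neighbor ordering less brittle.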
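
Given that matrix, train_model pools neighbors until it has enough data.
A sketch with invented per-allele training-set sizes, following the while
loop in the patch (alleles_by_similarity returns the target allele first,
then the rest by decreasing similarity):

    import pandas

    pretrain_min_points = 10000  # the value set in generate_hyperparameters.py

    data_size_by_allele = pandas.Series({
        "HLA-A*02:01": 6000,  # target allele, always first
        "HLA-A*02:06": 3500,
        "HLA-A*02:03": 2500,
        "HLA-A*68:02": 1500,
    })

    similar_alleles = list(data_size_by_allele.index)

    alleles = []
    while not alleles or data_size_by_allele.loc[alleles].sum() < pretrain_min_points:
        alleles.append(similar_alleles.pop(0))

    print(alleles)  # ['HLA-A*02:01', 'HLA-A*02:06', 'HLA-A*02:03'] -> 12000 points

One caveat: if the pooled data can never reach pretrain_min_points, the
patch's similar_alleles.pop(0) will eventually raise an IndexError, so a
guard on the remaining pool would make that failure mode explicit.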