diff --git a/mhcflurry/class1_neural_network.py b/mhcflurry/class1_neural_network.py index d2d367615b52e8cffe8f8f092e468701c98f0c71..9d9f00bd844d3e2e33da60c31d147f6d00b52549 100644 --- a/mhcflurry/class1_neural_network.py +++ b/mhcflurry/class1_neural_network.py @@ -458,6 +458,9 @@ class Class1NeuralNetwork(object): """ import keras + from keras import backend as K + + fit_info = collections.defaultdict(list) loss = get_loss(self.hyperparameters['loss']) @@ -478,6 +481,13 @@ class Class1NeuralNetwork(object): network._make_predict_function() self.set_allele_representations(allele_representations) + if self.hyperparameters['learning_rate'] is not None: + K.set_value( + self.network().optimizer.lr, + self.hyperparameters['learning_rate']) + fit_info["learning_rate"] = float( + K.get_value(self.network().optimizer.lr)) + validation_x_dict = { 'peptide': self.peptides_to_network_input( validation_peptide_encoding), @@ -513,7 +523,8 @@ class Class1NeuralNetwork(object): yielded_values_box[0] += len(affinities) start = time.time() - result = network.fit_generator( + + fit_history = network.fit_generator( wrapped_generator(), steps_per_epoch=steps_per_epoch, epochs=epochs, @@ -526,10 +537,12 @@ class Class1NeuralNetwork(object): patience=patience, verbose=verbose)] ) - if verbose > 0: - print("fit_generator completed in %0.2f sec (%d total points)" % ( - time.time() - start, yielded_values_box[0])) - return result + for (key, value) in fit_history.history.items(): + fit_info[key].extend(value) + + fit_info["time"] = time.time() - start + fit_info["num_points"] = yielded_values_box[0] + self.fit_info.append(dict(fit_info)) def fit( @@ -585,8 +598,10 @@ class Class1NeuralNetwork(object): How often (in seconds) to print progress update. Set to None to disable. """ + from keras import backend as K encodable_peptides = EncodableSequences.create(peptides) peptide_encoding = self.peptides_to_network_input(encodable_peptides) + fit_info = collections.defaultdict(list) length_counts = ( pandas.Series(encodable_peptides.sequences) @@ -687,10 +702,11 @@ class Class1NeuralNetwork(object): loss=loss.loss, optimizer=self.hyperparameters['optimizer']) if self.hyperparameters['learning_rate'] is not None: - from keras import backend as K K.set_value( self.network().optimizer.lr, self.hyperparameters['learning_rate']) + fit_info["learning_rate"] = float( + K.get_value(self.network().optimizer.lr)) if loss.supports_inequalities: # Do not sample negative affinities: just use an inequality. @@ -762,7 +778,6 @@ class Class1NeuralNetwork(object): min_val_loss_iteration = None min_val_loss = None - fit_info = collections.defaultdict(list) start = time.time() last_progress_print = None x_dict_with_random_negatives = {} diff --git a/mhcflurry/select_pan_allele_models_command.py b/mhcflurry/select_pan_allele_models_command.py index e44cdcdf9d28d6e743a84ce1f85c3bdf85448f86..510016bf846885119c5d9eac0f68cc7109c7049a 100644 --- a/mhcflurry/select_pan_allele_models_command.py +++ b/mhcflurry/select_pan_allele_models_command.py @@ -43,13 +43,6 @@ parser.add_argument( help=( "Model selection data CSV. Expected columns: " "allele, peptide, measurement_value")) -parser.add_argument( - "--exclude-data", - metavar="FILE.csv", - required=False, - help=( - "Data to EXCLUDE from model selection. Useful to specify the original " - "training data used")) parser.add_argument( "--models-dir", metavar="DIR", @@ -60,24 +53,6 @@ parser.add_argument( metavar="DIR", required=True, help="Directory to write selected models") -parser.add_argument( - "--out-unselected-predictions", - metavar="FILE.csv", - help="Write predictions for validation data using unselected predictor to " - "FILE.csv") -parser.add_argument( - "--unselected-accuracy-scorer", - metavar="SCORER", - default="combined:mass-spec,mse") -parser.add_argument( - "--unselected-accuracy-scorer-num-samples", - type=int, - default=1000) -parser.add_argument( - "--unselected-accuracy-percentile-threshold", - type=float, - metavar="X", - default=95) parser.add_argument( "--min-models", type=int, @@ -122,15 +97,14 @@ def run(argv=sys.argv[1:]): print("Loaded: %s" % input_predictor) alleles = input_predictor.supported_alleles + (min_peptide_length, max_peptide_length) = ( + input_predictor.supported_peptide_lengths) metadata_dfs = {} df = pandas.read_csv(args.data) print("Loaded data: %s" % (str(df.shape))) - (min_peptide_length, max_peptide_length) = ( - input_predictor.supported_peptide_lengths) - - df = df.ix[ + df = df.loc[ (df.peptide.str.len() >= min_peptide_length) & (df.peptide.str.len() <= max_peptide_length) ] @@ -141,26 +115,10 @@ def run(argv=sys.argv[1:]): # Allele names in data are assumed to be already normalized. df = df.loc[df.allele.isin(alleles)].dropna() - print("Selected %d alleles: %s" % (len(alleles), ' '.join(alleles))) - - if args.exclude_data: - exclude_df = pandas.read_csv(args.exclude_data) - metadata_dfs["model_selection_exclude"] = exclude_df - print("Loaded exclude data: %s" % (str(df.shape))) - - df["_key"] = df.allele + "__" + df.peptide - exclude_df["_key"] = exclude_df.allele + "__" + exclude_df.peptide - df["_excluded"] = df._key.isin(exclude_df._key.unique()) - print("Excluding measurements per allele (counts): ") - print(df.groupby("allele")._excluded.sum()) + print("Subselected to supported alleles: %s" % str(df.shape)) - print("Excluding measurements per allele (fractions): ") - print(df.groupby("allele")._excluded.mean()) - df = df.loc[~df._excluded] - del df["_excluded"] - del df["_key"] - print("Reduced data to: %s" % (str(df.shape))) + print("Selected %d alleles: %s" % (len(alleles), ' '.join(alleles))) metadata_dfs["model_selection_data"] = df @@ -168,101 +126,9 @@ def run(argv=sys.argv[1:]): args.mass_spec_regex) - if args.out_unselected_predictions: - df["unselected_prediction"] = input_predictor.predict( - alleles=df.allele.values, - peptides=df.peptide.values) - df.to_csv(args.out_unselected_predictions) - print("Wrote: %s" % args.out_unselected_predictions) - - selectors = {} - selector_to_model_selection_kwargs = {} - - def make_selector( - scoring, - combined_min_contribution_percent=args.combined_min_contribution_percent): - if scoring in selectors: - return ( - selectors[scoring], selector_to_model_selection_kwargs[scoring]) - - start = time.time() - if scoring.startswith("combined:"): - model_selection_kwargs = { - 'min_models': args.combined_min_models, - 'max_models': args.combined_max_models, - } - component_selectors = [] - for component_selector in scoring.split(":", 1)[1].split(","): - component_selectors.append( - make_selector( - component_selector)[0]) - selector = CombinedModelSelector( - component_selectors, - min_contribution_percent=combined_min_contribution_percent) - elif scoring == "mse": - model_selection_kwargs = { - 'min_models': args.mse_min_models, - 'max_models': args.mse_max_models, - } - min_measurements = args.mse_min_measurements - selector = MSEModelSelector( - df=df.loc[~df.mass_spec], - predictor=input_predictor, - min_measurements=min_measurements) - elif scoring == "mass-spec": - mass_spec_df = df.loc[df.mass_spec] - model_selection_kwargs = { - 'min_models': args.mass_spec_min_models, - 'max_models': args.mass_spec_max_models, - } - min_measurements = args.mass_spec_min_measurements - selector = MassSpecModelSelector( - df=mass_spec_df, - predictor=input_predictor, - min_measurements=min_measurements) - elif scoring == "consensus": - model_selection_kwargs = { - 'min_models': args.consensus_min_models, - 'max_models': args.consensus_max_models, - } - selector = ConsensusModelSelector( - predictor=input_predictor, - num_peptides_per_length=args.consensus_num_peptides_per_length) - else: - raise ValueError("Unsupported scoring method: %s" % scoring) - print("Instantiated model selector %s in %0.2f sec." % ( - scoring, time.time() - start)) - return (selector, model_selection_kwargs) - - for scoring in args.scoring: - (selector, model_selection_kwargs) = make_selector(scoring) - selectors[scoring] = selector - selector_to_model_selection_kwargs[scoring] = model_selection_kwargs - - unselected_accuracy_scorer = None - if args.unselected_accuracy_scorer: - # Force running all selectors by setting combined_min_contribution_percent=0. - unselected_accuracy_scorer = make_selector( - args.unselected_accuracy_scorer, - combined_min_contribution_percent=0.0)[0] - print("Using unselected accuracy scorer: %s" % unselected_accuracy_scorer) - GLOBAL_DATA["unselected_accuracy_scorer"] = unselected_accuracy_scorer - print("Selectors for alleles:") - allele_to_selector = {} - allele_to_model_selection_kwargs = {} - for allele in alleles: - selector = None - for possible_selector in args.scoring: - if selectors[possible_selector].usable_for_allele(allele=allele): - selector = selectors[possible_selector] - print("%20s %s" % (allele, selector.plan_summary(allele))) - break - if selector is None: - raise ValueError("No selectors usable for allele: %s" % allele) - allele_to_selector[allele] = selector - allele_to_model_selection_kwargs[allele] = ( - selector_to_model_selection_kwargs[possible_selector]) + + GLOBAL_DATA["args"] = args GLOBAL_DATA["input_predictor"] = input_predictor diff --git a/mhcflurry/train_pan_allele_models_command.py b/mhcflurry/train_pan_allele_models_command.py index bafb6f722644f86a6ceb25866d8daf0e1cf9aab0..1abf6712386f50726b41e582df113760d30d382e 100644 --- a/mhcflurry/train_pan_allele_models_command.py +++ b/mhcflurry/train_pan_allele_models_command.py @@ -9,6 +9,7 @@ import time import traceback import random import pprint +import hashlib from functools import partial import numpy @@ -130,6 +131,7 @@ add_worker_pool_args(parser) def assign_folds(df, num_folds, held_out_fraction, held_out_max): result_df = pandas.DataFrame(index=df.index) + for fold in range(num_folds): result_df["fold_%d" % fold] = True for (allele, sub_df) in df.groupby("allele"): @@ -172,6 +174,9 @@ def assign_folds(df, num_folds, held_out_fraction, held_out_max): print("Test points per fold") print((~result_df).sum()) + result_df["allele"] = df["allele"] + result_df["peptide"] = df["peptide"] + return result_df @@ -422,8 +427,6 @@ def train_model( progress_print_interval, predictor, save_to): - import keras.backend as K - import keras df = GLOBAL_DATA["train_data"] folds_df = GLOBAL_DATA["folds_df"] @@ -484,10 +487,10 @@ def train_model( epochs=pretrain_max_epochs, verbose=verbose, ) - if model.hyperparameters['learning_rate']: - model.hyperparameters['learning_rate'] /= 10 - else: - model.hyperparameters['learning_rate'] = 0.0001 + + # Use a smaller learning rate for training on real data + learning_rate = model.fit_info[-1]["learning_rate"] + model.hyperparameters['learning_rate'] = learning_rate / 10 model.fit( peptides=train_peptides, @@ -500,6 +503,20 @@ def train_model( progress_print_interval=progress_print_interval, verbose=verbose) + # Save model-specific training info + train_peptide_hash = hashlib.sha1() + for peptide in train_data.peptide.values: + train_peptide_hash.update(peptide.encode()) + model.fit_info[-1]["training_info"] = { + "fold_num": fold_num, + "num_folds": num_folds, + "replicate_num": replicate_num, + "num_replicates": num_replicates, + "architecture_num": architecture_num, + "num_architectures": num_architectures, + "train_peptide_hash": train_peptide_hash.hexdigest(), + } + numpy.testing.assert_equal( predictor.manifest_df.shape[0], len(predictor.class1_pan_allele_models)) predictor.add_pan_allele_model(model, models_dir_for_save=save_to) diff --git a/test/test_train_pan_allele_models_command.py b/test/test_train_pan_allele_models_command.py index f4142a1d869187298d039a1968a6bbe080d2df7b..f98599ffcc410e1d46ee2ba4958240f41b95a681 100644 --- a/test/test_train_pan_allele_models_command.py +++ b/test/test_train_pan_allele_models_command.py @@ -101,7 +101,7 @@ HYPERPARAMETERS_LIST = [ ][1:] -def run_and_check(n_jobs=0): +def run_and_check(n_jobs=0, delete=True): models_dir = tempfile.mkdtemp(prefix="mhcflurry-test-models") hyperparameters_filename = os.path.join( models_dir, "hyperparameters.yaml") @@ -140,8 +140,9 @@ def run_and_check(n_jobs=0): alleles=["HLA-A*02:01"]) print(df) - print("Deleting: %s" % models_dir) - shutil.rmtree(models_dir) + if delete: + print("Deleting: %s" % models_dir) + shutil.rmtree(models_dir) if os.environ.get("KERAS_BACKEND") != "theano": @@ -153,5 +154,6 @@ if os.environ.get("KERAS_BACKEND") != "theano": def test_run_serial(): run_and_check(n_jobs=0) + if __name__ == "__main__": - test_run_serial() \ No newline at end of file + run_and_check(n_jobs=0, delete=False)