diff --git a/mhcflurry/class1_neural_network.py b/mhcflurry/class1_neural_network.py index 9d9f00bd844d3e2e33da60c31d147f6d00b52549..d1add2b749d5de6b29e77c8078552f613e80a3a6 100644 --- a/mhcflurry/class1_neural_network.py +++ b/mhcflurry/class1_neural_network.py @@ -10,7 +10,6 @@ import pandas from .hyperparameters import HyperparameterDefaults from .encodable_sequences import EncodableSequences, EncodingError -from .amino_acid import available_vector_encodings, vector_encoding_length from .regression_target import to_ic50, from_ic50 from .common import random_peptides, amino_acid_distribution from .custom_loss import get_loss @@ -156,7 +155,6 @@ class Class1NeuralNetwork(object): hyperparameters[to_name] = value return hyperparameters - def __init__(self, **hyperparameters): self.hyperparameters = self.hyperparameter_defaults.with_defaults( self.apply_hyperparameter_renames(hyperparameters)) @@ -420,7 +418,6 @@ class Class1NeuralNetwork(object): allele_encoding.allele_representations( self.hyperparameters['allele_amino_acid_encoding'])) - def fit_generator( self, generator, @@ -544,7 +541,6 @@ class Class1NeuralNetwork(object): fit_info["num_points"] = yielded_values_box[0] self.fit_info.append(dict(fit_info)) - def fit( self, peptides, diff --git a/mhcflurry/select_pan_allele_models_command.py b/mhcflurry/select_pan_allele_models_command.py index eb3bca3df75e5cb18ab4a6d9528fd7d02647ce83..c564e7c189b9e9c90b711b16a9af0b0a3421429b 100644 --- a/mhcflurry/select_pan_allele_models_command.py +++ b/mhcflurry/select_pan_allele_models_command.py @@ -232,19 +232,31 @@ def run(argv=sys.argv[1:]): chunksize=1) models_by_fold = {} + summary_dfs = [] for result in tqdm.tqdm(results, total=len(work_items)): pprint(result) fold_num = result['fold_num'] + (all_models_for_fold, _) = folds_to_predictors[fold_num] models = [ - folds_to_predictors[fold_num][0][i] + all_models_for_fold[i] for i in result['selected_indices'] ] + summary_df = result['summary'].copy() + summary_df.index = summary_df.index.map( + lambda idx: all_models_for_fold[idx]) + summary_dfs.append(summary_df) + print("Selected %d models for fold %d: %s" % ( len(models), fold_num, result['selected_indices'])) models_by_fold[fold_num] = models for model in models: result_predictor.add_pan_allele_model(model) + summary_df = pandas.concat(summary_dfs, ignore_index=False) + summary_df["model_config"] = summary_df.index.map(lambda m: m.get_config()) + result_predictor.metadata_dataframes["model_selection_summary"] = ( + summary_df.reset_index(drop=True)) + result_predictor.save(args.out_models_dir) model_selection_time = time.time() - start @@ -312,11 +324,16 @@ def model_select(fold_num, models, min_models, max_models): break assert selected + + summary_df = pandas.Series(individual_model_scores)[ + numpy.arange(len(models)) + ].to_frame() + summary_df.columns = ['mse_score'] + return { 'fold_num': fold_num, 'selected_indices': selected, - 'individual_model_scores': pandas.Series( - individual_model_scores)[numpy.arange(len(models))], + 'summary': summary_df, # indexed by model index } diff --git a/mhcflurry/train_pan_allele_models_command.py b/mhcflurry/train_pan_allele_models_command.py index 0bee23e86cd1a8c98be64d5cf4de98976f3169ee..1ec1118c038a4aa8a431e60d0b56542df1bb2044 100644 --- a/mhcflurry/train_pan_allele_models_command.py +++ b/mhcflurry/train_pan_allele_models_command.py @@ -15,8 +15,6 @@ from functools import partial import numpy import pandas import yaml -from sklearn.metrics.pairwise import cosine_similarity -from sklearn.model_selection import StratifiedKFold from mhcnames import normalize_allele_name import tqdm # progress bar tqdm.monitor_interval = 0 # see https://github.com/tqdm/tqdm/issues/481 @@ -28,11 +26,8 @@ from .parallelism import ( add_worker_pool_args, worker_pool_with_gpu_assignments_from_args, call_wrapped_kwargs) -from .hyperparameters import HyperparameterDefaults from .allele_encoding import AlleleEncoding from .encodable_sequences import EncodableSequences -from .regression_target import to_ic50, from_ic50 -from .import custom_loss # To avoid pickling large matrices to send to child processes when running in @@ -173,10 +168,6 @@ def assign_folds(df, num_folds, held_out_fraction, held_out_max): print("Test points per fold") print((~result_df).sum()) - - result_df["allele"] = df["allele"] - result_df["peptide"] = df["peptide"] - return result_df @@ -304,10 +295,13 @@ def main(args): predictor = Class1AffinityPredictor( allele_to_sequence=allele_encoding.allele_to_sequence, metadata_dataframes={ - 'train_data': df, - 'training_folds': folds_df, + 'train_data': pandas.merge( + df, + folds_df, + left_index=True, + right_index=True) }) - serial_run = args.num_jobs == 1 + serial_run = args.num_jobs == 0 work_items = [] for (h, hyperparameters) in enumerate(hyperparameters_lst): @@ -353,6 +347,7 @@ def main(args): if worker_pool: print("Processing %d work items in parallel." % len(work_items)) + assert not serial_run results_generator = worker_pool.imap_unordered( partial(call_wrapped_kwargs, train_model), @@ -389,6 +384,7 @@ def main(args): # which it adds models to, so no merging is required. It also saves # as it goes so no saving is required at the end. print("Processing %d work items in serial." % len(work_items)) + assert serial_run for _ in tqdm.trange(len(work_items)): item = work_items.pop(0) # want to keep freeing up memory work_predictor = train_model(**item)