From 7a688ba54251e23f8599516223e462a88b64867a Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Mon, 8 Jul 2019 17:20:08 -0400
Subject: [PATCH] basic support for pan allele model selection

---
 .../calibrate_percentile_ranks_command.py     |   5 -
 mhcflurry/parallelism.py                      |   8 +-
 mhcflurry/select_pan_allele_models_command.py | 609 +++++------
 mhcflurry/train_pan_allele_models_command.py  |   2 +-
 4 files changed, 176 insertions(+), 448 deletions(-)

diff --git a/mhcflurry/calibrate_percentile_ranks_command.py b/mhcflurry/calibrate_percentile_ranks_command.py
index 4a986e5c..74dee9d4 100644
--- a/mhcflurry/calibrate_percentile_ranks_command.py
+++ b/mhcflurry/calibrate_percentile_ranks_command.py
@@ -7,13 +7,8 @@ import signal
 import sys
 import time
 import traceback
-import random
 from functools import partial
 
-import numpy
-import pandas
-import yaml
-from sklearn.metrics.pairwise import cosine_similarity
 from mhcnames import normalize_allele_name
 import tqdm  # progress bar
 tqdm.monitor_interval = 0  # see https://github.com/tqdm/tqdm/issues/481
diff --git a/mhcflurry/parallelism.py b/mhcflurry/parallelism.py
index 0e652a96..3786b9c6 100644
--- a/mhcflurry/parallelism.py
+++ b/mhcflurry/parallelism.py
@@ -72,14 +72,12 @@ def worker_pool_with_gpu_assignments(
         max_tasks_per_worker=None,
         worker_log_dir=None):
 
-    num_workers = num_jobs if num_jobs else cpu_count()
-
-    if num_workers == 0:
+    if num_jobs == 0:
         if backend:
             set_keras_backend(backend)
         return None
 
-    worker_init_kwargs = [{} for _ in range(num_workers)]
+    worker_init_kwargs = [{} for _ in range(num_jobs)]
     if num_gpus:
         print("Attempting to round-robin assign each worker a GPU.")
         if backend != "tensorflow-default":
@@ -115,7 +113,7 @@ def worker_pool_with_gpu_assignments(
             kwargs["worker_log_dir"] = worker_log_dir
 
     worker_pool = make_worker_pool(
-        processes=num_workers,
+        processes=num_jobs,
         initializer=worker_init,
         initializer_kwargs_per_process=worker_init_kwargs,
         max_tasks_per_worker=max_tasks_per_worker)
diff --git a/mhcflurry/select_pan_allele_models_command.py b/mhcflurry/select_pan_allele_models_command.py
index 510016bf..b7d2e5e3 100644
--- a/mhcflurry/select_pan_allele_models_command.py
+++ b/mhcflurry/select_pan_allele_models_command.py
@@ -8,19 +8,19 @@ import sys
 import time
 import traceback
 import random
+import hashlib
 from pprint import pprint
 
 import numpy
 import pandas
 from scipy.stats import kendalltau, percentileofscore, pearsonr
-from sklearn.metrics import roc_auc_score
-from mhcnames import normalize_allele_name
 import tqdm  # progress bar
 tqdm.monitor_interval = 0  # see https://github.com/tqdm/tqdm/issues/481
 
 from .class1_affinity_predictor import Class1AffinityPredictor
 from .encodable_sequences import EncodableSequences
+from .allele_encoding import AlleleEncoding
 from .common import configure_logging, random_peptides
 from .parallelism import worker_pool_with_gpu_assignments_from_args, add_worker_pool_args
 from .regression_target import from_ic50
@@ -43,6 +43,11 @@ parser.add_argument(
     help=(
         "Model selection data CSV. Expected columns: "
         "allele, peptide, measurement_value"))
+parser.add_argument(
+    "--folds",
+    metavar="FILE.csv",
+    required=False,
+    help="Cross-validation folds CSV: allele, peptide, fold_0 ... fold_N")
 parser.add_argument(
     "--models-dir",
     metavar="DIR",
@@ -54,17 +59,17 @@ parser.add_argument(
     required=True,
     help="Directory to write selected models")
 parser.add_argument(
-    "--min-models",
+    "--min-models-per-fold",
     type=int,
-    default=8,
+    default=2,
     metavar="N",
-    help="Min number of models to select")
+    help="Min number of models to select per fold")
 parser.add_argument(
-    "--max-models",
+    "--max-models-per-fold",
     type=int,
     default=1000,
     metavar="N",
-    help="Max number of models to select")
+    help="Max number of models to select per fold")
 parser.add_argument(
     "--mass-spec-regex",
     metavar="REGEX",
@@ -80,6 +85,31 @@ parser.add_argument(
 
 add_worker_pool_args(parser)
 
 
+def mse(
+        predictions,
+        actual,
+        inequalities=None,
+        affinities_are_already_01_transformed=False):
+    if not affinities_are_already_01_transformed:
+        predictions = from_ic50(predictions)
+        actual = from_ic50(actual)
+
+    deviations = (
+        numpy.array(predictions, copy=False) - numpy.array(actual, copy=False))
+
+    if inequalities is not None:
+        # Must reverse meaning of inequality since we are working with
+        # transformed 0-1 values, which are anti-correlated with the ic50s.
+        # The measurement_inequality column is given in terms of ic50s.
+        inequalities = numpy.array(inequalities, copy=False)
+        deviations[
+            ((inequalities == "<") & (deviations > 0)) | (
+                (inequalities == ">") & (deviations < 0))
+        ] = 0.0
+
+    return (deviations ** 2).mean()
+
+
 def run(argv=sys.argv[1:]):
     global GLOBAL_DATA
 
@@ -104,44 +134,86 @@ def run(argv=sys.argv[1:]):
     df = pandas.read_csv(args.data)
     print("Loaded data: %s" % (str(df.shape)))
 
-    df = df.loc[
-        (df.peptide.str.len() >= min_peptide_length) &
-        (df.peptide.str.len() <= max_peptide_length)
-    ]
-    print("Subselected to %d-%dmers: %s" % (
-        min_peptide_length, max_peptide_length, str(df.shape)))
-
-    import ipdb ; ipdb.set_trace()  # leaving off here
+    if args.folds:
+        folds_df = pandas.read_csv(args.folds)
+        matches = all([
+            len(folds_df) == len(df),
+            (folds_df.peptide == df.peptide).all(),
+            (folds_df.allele == df.allele).all(),
+        ])
+        if not matches:
+            raise ValueError("Training data and fold data do not match")
+        fold_cols = [c for c in folds_df if c.startswith("fold_")]
+        for col in fold_cols:
+            df[col] = folds_df[col]
+
+    fold_cols = [c for c in df if c.startswith("fold_")]
+    num_folds = len(fold_cols)
+    if num_folds <= 1:
+        raise ValueError("Too few folds: %d" % num_folds)
+
+    #df = df.loc[
+    #    (df.peptide.str.len() >= min_peptide_length) &
+    #    (df.peptide.str.len() <= max_peptide_length)
+    #]
+    #print("Subselected to %d-%dmers: %s" % (
+    #    min_peptide_length, max_peptide_length, str(df.shape)))
+
+    print("Num folds: ", num_folds, "fraction included:")
+    print(df[fold_cols].mean())
 
     # Allele names in data are assumed to be already normalized.
-    df = df.loc[df.allele.isin(alleles)].dropna()
-    print("Subselected to supported alleles: %s" % str(df.shape))
-
+    #df = df.loc[df.allele.isin(alleles)].dropna()
+    #print("Subselected to supported alleles: %s" % str(df.shape))
 
-    print("Selected %d alleles: %s" % (len(alleles), ' '.join(alleles)))
+    #print("Selected %d alleles: %s" % (len(alleles), ' '.join(alleles)))
 
     metadata_dfs["model_selection_data"] = df
 
     df["mass_spec"] = df.measurement_source.str.contains(
         args.mass_spec_regex)
-
-
-
-
-    GLOBAL_DATA["args"] = args
+    def make_train_peptide_hash(sub_df):
+        train_peptide_hash = hashlib.sha1()
+        for peptide in sorted(sub_df.peptide.values):
+            train_peptide_hash.update(peptide.encode())
+        return train_peptide_hash.hexdigest()
+
+    folds_to_predictors = dict(
+        (int(col.split("_")[-1]), (
+            [],
+            make_train_peptide_hash(df.loc[df[col] == 1])))
+        for col in fold_cols)
+    print(folds_to_predictors)
+    for model in input_predictor.class1_pan_allele_models:
+        training_info = model.fit_info[-1]['training_info']
+        fold_num = training_info['fold_num']
+        assert num_folds == training_info['num_folds']
+        (lst, expected_hash) = folds_to_predictors[fold_num]
+        train_peptide_hash = training_info['train_peptide_hash']
+        #numpy.testing.assert_equal(expected_hash, train_peptide_hash) #enable later
+        lst.append(model)
+
+    work_items = []
+    for (fold_num, (models, _)) in folds_to_predictors.items():
+        work_items.append({
+            'fold_num': fold_num,
+            'models': models,
+            'min_models': args.min_models_per_fold,
+            'max_models': args.max_models_per_fold,
+        })
+
+    GLOBAL_DATA["data"] = df
     GLOBAL_DATA["input_predictor"] = input_predictor
-    GLOBAL_DATA["unselected_accuracy_scorer"] = unselected_accuracy_scorer
-    GLOBAL_DATA["allele_to_selector"] = allele_to_selector
-    GLOBAL_DATA["allele_to_model_selection_kwargs"] = allele_to_model_selection_kwargs
 
     if not os.path.exists(args.out_models_dir):
         print("Attempting to create directory: %s" % args.out_models_dir)
         os.mkdir(args.out_models_dir)
         print("Done.")
 
-    result_predictor = Class1AffinityPredictor(metadata_dataframes=metadata_dfs)
+    result_predictor = Class1AffinityPredictor(
+        allele_to_sequence=input_predictor.allele_to_sequence,
+        metadata_dataframes=metadata_dfs)
 
     worker_pool = worker_pool_with_gpu_assignments_from_args(args)
 
@@ -150,43 +222,29 @@ def run(argv=sys.argv[1:]):
     if worker_pool is None:
         # Serial run
         print("Running in serial.")
-        results = (
-            model_select(allele) for allele in alleles)
+        results = (do_model_select_task(item) for item in work_items)
     else:
         # Parallel run
-        random.shuffle(alleles)
+        random.shuffle(work_items)
         results = worker_pool.imap_unordered(
-            model_select,
-            alleles,
+            do_model_select_task,
+            work_items,
             chunksize=1)
 
-    unselected_summary = []
-    model_selection_dfs = []
-    for result in tqdm.tqdm(results, total=len(alleles)):
+    models_by_fold = {}
+    for result in tqdm.tqdm(results, total=len(work_items)):
         pprint(result)
+        fold_num = result['fold_num']
+        models = [
+            folds_to_predictors[fold_num][0][i]
+            for i in result['selected_indices']
+        ]
+        print("Selected %d models for fold %d: %s" % (
+            len(models), fold_num, result['selected_indices']))
+        models_by_fold[fold_num] = models
+        for model in models:
+            result_predictor.add_pan_allele_model(model)
 
-        summary_dict = dict(result)
-        summary_dict["retained"] = result["selected"] is not None
-        del summary_dict["selected"]
-
-        unselected_summary.append(summary_dict)
-        if result['selected'] is not None:
-            model_selection_dfs.append(
-                result['selected'].metadata_dataframes['model_selection'])
-            result_predictor.merge_in_place([result['selected']])
-
-    if model_selection_dfs:
-        model_selection_df = pandas.concat(
-            model_selection_dfs, ignore_index=True)
-        model_selection_df["selector"] = model_selection_df.allele.map(
-            allele_to_selector)
-        result_predictor.metadata_dataframes["model_selection"] = (
-            model_selection_df)
-
-    result_predictor.metadata_dataframes["unselected_summary"] = (
-        pandas.DataFrame(unselected_summary))
-
-    print("Done model selecting for %d alleles." % len(alleles))
     result_predictor.save(args.out_models_dir)
 
     model_selection_time = time.time() - start
@@ -199,390 +257,69 @@ def run(argv=sys.argv[1:]):
     print("Predictor written to: %s" % args.out_models_dir)
 
 
-class ScrambledPredictor(object):
-    def __init__(self, predictor):
-        self.predictor = predictor
-        self._predictions = {}
-        self._allele = None
-
-    def predict(self, peptides, allele):
-        if peptides not in self._predictions:
-            self._predictions[peptides] = pandas.Series(
-                self.predictor.predict(peptides=peptides, allele=allele))
-            self._allele = allele
-        assert allele == self._allele
-        return self._predictions[peptides].sample(frac=1.0).values
+def do_model_select_task(item):
+    return model_select(**item)
 
 
-def model_select(allele):
+def model_select(fold_num, models, min_models, max_models):
+    """Greedily select models for one fold using held-out MSE."""
     global GLOBAL_DATA
-    unselected_accuracy_scorer = GLOBAL_DATA["unselected_accuracy_scorer"]
-    selector = GLOBAL_DATA["allele_to_selector"][allele]
-    model_selection_kwargs = GLOBAL_DATA[
-        "allele_to_model_selection_kwargs"
-    ][allele]
-    predictor = GLOBAL_DATA["input_predictor"]
-    args = GLOBAL_DATA["args"]
-    unselected_accuracy_scorer_samples = GLOBAL_DATA["args"].unselected_accuracy_scorer_num_samples
-
-    result_dict = {
-        "allele": allele
-    }
-
-    unselected_score = None
-    unselected_score_percentile = None
-    unselected_score_scrambled_mean = None
-    if unselected_accuracy_scorer:
-        unselected_score_function = (
-            unselected_accuracy_scorer.score_function(allele))
-
-        additional_metadata = {}
-        unselected_score = unselected_score_function(
-            predictor, additional_metadata_out=additional_metadata)
-        scrambled_predictor = ScrambledPredictor(predictor)
-        scrambled_scores = numpy.array([
-            unselected_score_function(
-                scrambled_predictor)
-            for _ in range(unselected_accuracy_scorer_samples)
-        ])
-        unselected_score_scrambled_mean = scrambled_scores.mean()
-        unselected_score_percentile = percentileofscore(
-            scrambled_scores, unselected_score)
-        print(
-            "Unselected score and percentile",
-            allele,
-            unselected_score,
-            unselected_score_percentile,
-            additional_metadata)
-        result_dict.update(
-            dict(("unselected_%s" % key, value)
-                 for (key, value)
-                 in additional_metadata.items()))
-
-    selected = None
-    threshold = args.unselected_accuracy_percentile_threshold
-    if unselected_score_percentile is None or unselected_score_percentile >= threshold:
-        selected = predictor.model_select(
-            score_function=selector.score_function(allele=allele),
-            alleles=[allele],
-            **model_selection_kwargs)
-
-    result_dict["unselected_score_plan"] = (
-        unselected_accuracy_scorer.plan_summary(allele)
-        if unselected_accuracy_scorer else None)
-    result_dict["selector_score_plan"] = selector.plan_summary(allele)
-    result_dict["unselected_accuracy_score_percentile"] = unselected_score_percentile
-    result_dict["unselected_score"] = unselected_score
-    result_dict["unselected_score_scrambled_mean"] = unselected_score_scrambled_mean
-    result_dict["selected"] = selected
-    result_dict["num_models"] = len(selected.neural_networks) if selected else None
-    return result_dict
-
-
-def cache_encoding(predictor, peptides):
-    # Encode the peptides for each neural network, so the encoding
-    # becomes cached.
-    for network in predictor.neural_networks:
-        network.peptides_to_network_input(peptides)
-
-
-class ScoreFunction(object):
-    """
-    Thin wrapper over a score function (Class1AffinityPredictor -> float).
-    Used to keep a summary string associated with the function.
-    """
-    def __init__(self, function, summary=None):
-        self.function = function
-        self.summary = summary if summary else "(n/a)"
-
-    def __call__(self, *args, **kwargs):
-        return self.function(*args, **kwargs)
-
-
-class CombinedModelSelector(object):
-    """
-    Model selector that computes a weighted average over other model selectors.
-    """
-    def __init__(self, model_selectors, weights=None, min_contribution_percent=1.0):
-        if weights is None:
-            weights = numpy.ones(shape=(len(model_selectors),))
-        self.model_selectors = model_selectors
-        self.selector_to_weight = dict(zip(self.model_selectors, weights))
-        self.min_contribution_percent = min_contribution_percent
-
-    def usable_for_allele(self, allele):
-        return any(
-            selector.usable_for_allele(allele)
-            for selector in self.model_selectors)
-
-    def plan_summary(self, allele):
-        return self.score_function(allele, dry_run=True).summary
-
-    def score_function(self, allele, dry_run=False):
-        selector_to_max_weighted_score = {}
-        for selector in self.model_selectors:
-            weight = self.selector_to_weight[selector]
-            if selector.usable_for_allele(allele):
-                max_weighted_score = selector.max_absolute_value(allele) * weight
-            else:
-                max_weighted_score = 0
-            selector_to_max_weighted_score[selector] = max_weighted_score
-        max_total_score = sum(selector_to_max_weighted_score.values())
-
-        # Use only selectors that can contribute >1% to the total score
-        selectors_to_use = [
-            selector
-            for selector in self.model_selectors
-            if (
-                selector_to_max_weighted_score[selector] >
-                max_total_score * self.min_contribution_percent / 100.0)
-        ]
-
-        summary = ", ".join([
-            "%s(|%.3f|)" % (
-                selector.plan_summary(allele),
-                selector_to_max_weighted_score[selector])
-            for selector in selectors_to_use
-        ])
-
-        if dry_run:
-            score = None
-        else:
-            score_functions_and_weights = [
-                (selector.score_function(allele=allele),
-                 self.selector_to_weight[selector])
-                for selector in selectors_to_use
-            ]
-
-            def score(predictor, additional_metadata_out=None):
-                scores = numpy.array([
-                    score_function(
-                        predictor,
-                        additional_metadata_out=additional_metadata_out) * weight
-                    for (score_function, weight) in score_functions_and_weights
-                ])
-                if additional_metadata_out is not None:
-                    additional_metadata_out["combined_score_terms"] = str(
-                        list(scores))
-
-                return scores.sum()
-        return ScoreFunction(score, summary=summary)
-
-
-class ConsensusModelSelector(object):
-    """
-    Model selector that scores sub-ensembles based on their Kendall tau
-    consistency with the full ensemble over a set of random peptides.
- """ - def __init__( - self, - predictor, - num_peptides_per_length=10000, - multiply_score_by_value=10.0): - - (min_length, max_length) = predictor.supported_peptide_lengths - peptides = [] - for length in range(min_length, max_length + 1): - peptides.extend( - random_peptides(num_peptides_per_length, length=length)) - - self.peptides = EncodableSequences.create(peptides) - self.predictor = predictor - self.multiply_score_by_value = multiply_score_by_value - cache_encoding(self.predictor, self.peptides) - - def usable_for_allele(self, allele): - return True - - def max_absolute_value(self, allele): - return self.multiply_score_by_value - - def plan_summary(self, allele): - return "consensus (%d points)" % len(self.peptides) - - def score_function(self, allele): - full_ensemble_predictions = self.predictor.predict( - allele=allele, - peptides=self.peptides) - - def score(predictor, additional_metadata_out=None): - predictions = predictor.predict( - allele=allele, - peptides=self.peptides, - ) - tau = kendalltau(predictions, full_ensemble_predictions).correlation - if additional_metadata_out is not None: - additional_metadata_out["score_consensus_tau"] = tau - return tau * self.multiply_score_by_value - - return ScoreFunction( - score, summary=self.plan_summary(allele)) - - -class MSEModelSelector(object): - """ - Model selector that uses mean-squared error to score models. Inequalities - are supported. - """ - def __init__( - self, - df, - predictor, - min_measurements=1, - multiply_score_by_data_size=True): - - self.df = df - self.predictor = predictor - self.min_measurements = min_measurements - self.multiply_score_by_data_size = multiply_score_by_data_size - - def usable_for_allele(self, allele): - return (self.df.allele == allele).sum() >= self.min_measurements - - def max_absolute_value(self, allele): - if self.multiply_score_by_data_size: - return (self.df.allele == allele).sum() - else: - return 1.0 - - def plan_summary(self, allele): - return self.score_function(allele).summary - - def score_function(self, allele): - sub_df = self.df.loc[self.df.allele == allele].reset_index(drop=True) - peptides = EncodableSequences.create(sub_df.peptide.values) - - def score(predictor, additional_metadata_out=None): - predictions = predictor.predict( - allele=allele, - peptides=peptides, - ) - deviations = from_ic50(predictions) - from_ic50( - sub_df.measurement_value) - - if 'measurement_inequality' in sub_df.columns: - # Must reverse meaning of inequality since we are working with - # transformed 0-1 values, which are anti-correlated with the ic50s. - # The measurement_inequality column is given in terms of ic50s. 
-                deviations.loc[
-                    (
-                        (sub_df.measurement_inequality == "<") & (deviations > 0)) |
-                    ((sub_df.measurement_inequality == ">") & (deviations < 0))
-                ] = 0.0
-
-            score_mse = (1 - (deviations ** 2).mean())
-            if additional_metadata_out is not None:
-                additional_metadata_out["score_MSE"] = 1 - score_mse
-
-                # We additionally include other scores on (=) measurements as
-                # a convenience
-                eq_df = sub_df
-                if 'measurement_inequality' in sub_df.columns:
-                    eq_df = sub_df.loc[
-                        sub_df.measurement_inequality == "="
-                    ]
-                additional_metadata_out["score_pearsonr"] = (
-                    pearsonr(
-                        numpy.log(eq_df.measurement_value.values),
-                        numpy.log(predictions[eq_df.index.values]))[0])
-
-                for threshold in [500, 5000, 15000]:
-                    if (eq_df.measurement_value < threshold).nunique() == 2:
-                        additional_metadata_out["score_AUC@%d" % threshold] = (
-                            roc_auc_score(
-                                (eq_df.measurement_value < threshold).values,
-                                -1 * predictions[eq_df.index.values]))
-
-            return score_mse * (
-                len(sub_df) if self.multiply_score_by_data_size else 1)
-
-        summary = "mse (%d points)" % (len(sub_df))
-        return ScoreFunction(score, summary=summary)
-
-
-class MassSpecModelSelector(object):
-    """
-    Model selector that uses PPV of differentiating decoys from hits from
-    mass-spec experiments.
-    """
-    def __init__(
-            self,
-            df,
-            predictor,
-            decoys_per_length=0,
-            min_measurements=100,
-            multiply_score_by_data_size=True):
-
-        # Index is peptide, columns are alleles
-        hit_matrix = df.groupby(
-            ["peptide", "allele"]).measurement_value.count().unstack().fillna(
-            0).astype(bool)
-
-        if decoys_per_length:
-            (min_length, max_length) = predictor.supported_peptide_lengths
-            decoys = []
-            for length in range(min_length, max_length + 1):
-                decoys.extend(
-                    random_peptides(decoys_per_length, length=length))
-
-            decoy_matrix = pandas.DataFrame(
-                index=decoys, columns=hit_matrix.columns, dtype=bool)
-            decoy_matrix[:] = False
-            full_matrix = pandas.concat([hit_matrix, decoy_matrix])
-        else:
-            full_matrix = hit_matrix
-
-        if len(full_matrix) > 0:
-            full_matrix = full_matrix.sample(frac=1.0).astype(float)
-
-        self.df = full_matrix
-        self.predictor = predictor
-        self.min_measurements = min_measurements
-        self.multiply_score_by_data_size = multiply_score_by_data_size
-
-        self.peptides = EncodableSequences.create(full_matrix.index.values)
-        cache_encoding(self.predictor, self.peptides)
-
-    @staticmethod
-    def ppv(y_true, predictions):
-        df = pandas.DataFrame({"prediction": predictions, "y_true": y_true})
-        return df.sort_values("prediction", ascending=True)[
-            : int(y_true.sum())
-        ].y_true.mean()
-
-    def usable_for_allele(self, allele):
-        return allele in self.df.columns and (
-            self.df[allele].sum() >= self.min_measurements)
+    full_data = GLOBAL_DATA["data"]
+    input_predictor = GLOBAL_DATA["input_predictor"]
+    # Score on this fold's held-out rows (fold_<num> == 0).
+    df = full_data.loc[
+        full_data["fold_%d" % fold_num] == 0
+    ]
 
-    def max_absolute_value(self, allele):
-        if self.multiply_score_by_data_size:
-            return self.df[allele].sum()
+    peptides = EncodableSequences.create(df.peptide.values)
+    alleles = AlleleEncoding(
+        df.allele.values,
+        borrow_from=input_predictor.get_master_allele_encoding())
+
+    predictions_df = df.copy()
+    for (i, model) in enumerate(models):
+        predictions_df[i] = from_ic50(model.predict(peptides, alleles))
+
+    actual = from_ic50(predictions_df.measurement_value)
+
+    selected = []
+    selected_score = 0
+    remaining_models = set(numpy.arange(len(models)))
+    individual_model_scores = {}
+    while remaining_models and len(selected) < max_models:
+        best_model = None
+        best_model_score = 0
+        for i in remaining_models:
+            possible_ensemble = list(selected) + [i]
+            predictions = predictions_df[possible_ensemble].mean(1)
+            mse_score = 1 - mse(
+                predictions,
+                actual,
+                inequalities=(
+                    predictions_df.measurement_inequality
+                    if 'measurement_inequality' in predictions_df.columns
+                    else None),
+                affinities_are_already_01_transformed=True)
+            if mse_score >= best_model_score:
+                best_model = i
+                best_model_score = mse_score
+            if not selected:
+                # First iteration. Store individual model scores.
+                individual_model_scores[i] = mse_score
+        if len(selected) < min_models or best_model_score > selected_score:
+            selected_score = best_model_score
+            remaining_models.remove(best_model)
+            selected.append(best_model)
         else:
-            return 1.0
-
-    def plan_summary(self, allele):
-        return self.score_function(allele).summary
-
-    def score_function(self, allele):
-        total_hits = self.df[allele].sum()
-        total_decoys = (self.df[allele] == 0).sum()
-        multiplier = total_hits if self.multiply_score_by_data_size else 1
-        def score(predictor, additional_metadata_out=None):
-            predictions = predictor.predict(
-                allele=allele,
-                peptides=self.peptides,
-            )
-            ppv = self.ppv(self.df[allele], predictions)
-            if additional_metadata_out is not None:
-                additional_metadata_out["score_mass_spec_PPV"] = ppv
-
-                # We additionally compute AUC score.
-                additional_metadata_out["score_mass_spec_AUC"] = roc_auc_score(
-                    self.df[allele].values, -1 * predictions)
-            return ppv * multiplier
-
-        summary = "mass-spec (%d hits / %d decoys)" % (total_hits, total_decoys)
-        return ScoreFunction(score, summary=summary)
+            break
+
+    assert selected
+    return {
+        'fold_num': fold_num,
+        'selected_indices': selected,
+        'individual_model_scores': pandas.Series(
+            individual_model_scores)[numpy.arange(len(models))],
+    }
 
 
 if __name__ == '__main__':
diff --git a/mhcflurry/train_pan_allele_models_command.py b/mhcflurry/train_pan_allele_models_command.py
index 1abf6712..0bee23e8 100644
--- a/mhcflurry/train_pan_allele_models_command.py
+++ b/mhcflurry/train_pan_allele_models_command.py
@@ -505,7 +505,7 @@ def train_model(
 
     # Save model-specific training info
     train_peptide_hash = hashlib.sha1()
-    for peptide in train_data.peptide.values:
+    for peptide in sorted(train_data.peptide.values):
        train_peptide_hash.update(peptide.encode())
     model.fit_info[-1]["training_info"] = {
         "fold_num": fold_num,
--
GitLab
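The core of the patch is the greedy forward selection in model_select: starting from an empty ensemble, it repeatedly adds the model whose inclusion maximizes 1 - MSE of the ensemble's mean prediction on the fold's held-out data, stopping early when no candidate improves the score (once min_models are chosen) or when max_models is reached. The following self-contained sketch mirrors that logic on plain numpy arrays; the names inequality_mse, greedy_select, and prediction_matrix are invented for illustration and do not appear in the patch.

    import numpy

    def inequality_mse(predictions, actual, inequalities=None):
        # Values are 0-1 transformed affinities (as from from_ic50), so the
        # "<" / ">" labels, which are given in IC50 terms, apply in reverse.
        deviations = numpy.asarray(predictions) - numpy.asarray(actual)
        if inequalities is not None:
            inequalities = numpy.asarray(inequalities)
            deviations[
                ((inequalities == "<") & (deviations > 0)) |
                ((inequalities == ">") & (deviations < 0))
            ] = 0.0
        return (deviations ** 2).mean()

    def greedy_select(prediction_matrix, actual, min_models, max_models):
        # prediction_matrix: (n_points, n_models) of 0-1 transformed predictions.
        selected = []
        selected_score = 0.0
        remaining = set(range(prediction_matrix.shape[1]))
        while remaining and len(selected) < max_models:
            best, best_score = None, 0.0
            for i in remaining:
                ensemble_pred = prediction_matrix[:, selected + [i]].mean(1)
                score = 1 - inequality_mse(ensemble_pred, actual)
                if score >= best_score:
                    best, best_score = i, score
            if len(selected) < min_models or best_score > selected_score:
                selected_score = best_score
                remaining.remove(best)
                selected.append(best)
            else:
                break  # no remaining model improves the ensemble
        return selected

    # Example: pick from 3 models scored on 5 held-out points.
    rng = numpy.random.RandomState(0)
    predictions = rng.uniform(size=(5, 3))
    actual = rng.uniform(size=5)
    print(greedy_select(predictions, actual, min_models=1, max_models=3))

Note that scores are compared with >= inside the inner loop (so ties prefer the last candidate examined) but with > across iterations, which is what allows the search to stop once adding any model would leave the ensemble score flat.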
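Usage sketch (hypothetical, not part of the patch): run() accepts an argv list, so the command can be exercised directly from Python. The file names below are placeholders; the --models-dir is assumed to come from a train_pan_allele_models_command.py run, so that each model's fit_info records fold_num, num_folds, and train_peptide_hash, and the data CSV is assumed to carry a measurement_source column for the --mass-spec-regex match.

    # Hypothetical invocation -- all paths are placeholders.
    from mhcflurry import select_pan_allele_models_command

    select_pan_allele_models_command.run([
        "--data", "model_selection_data.csv",   # allele, peptide, measurement_value, measurement_source
        "--folds", "training_folds.csv",        # allele, peptide, fold_0 ... fold_N
        "--models-dir", "unselected-models",
        "--out-models-dir", "selected-models",
        "--min-models-per-fold", "2",
        "--max-models-per-fold", "8",
    ])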