From 1541ad78239c93cd933aad5ed679ada815d207c2 Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Fri, 24 Jan 2020 18:39:00 -0500
Subject: [PATCH] add mhcflurry/train_presentation_models_command.py
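
Replace the mhcflurry-multiallelic-refinement entry point with a new
mhcflurry-class1-train-presentation-models command, which trains a
Class1PresentationPredictor on top of existing affinity and cleavage
predictors instead of refining pan-allele models in place. The
--only-format MULTIALLELIC flag moves from model training to the
training-data generation step in GENERATE.sh.

Example invocation, following the updated GENERATE.sh (paths are
illustrative):

    mhcflurry-class1-train-presentation-models \
        --data train_data.csv.bz2 \
        --affinity-predictor models_class1_pan/models.combined \
        --cleavage-predictor-with-flanks models_class1_cleavage/models.selected \
        --cleavage-predictor-without-flanks models_class1_cleavage_variants/models.selected.no_flank \
        --out-models-dir models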
---
 .../models_class1_presentation/GENERATE.sh    |   6 +-
 mhcflurry/multiallelic_refinement_command.py  | 331 ------------------
 .../train_presentation_models_command.py      | 137 ++++++++
 setup.py                                      |   4 +-
 4 files changed, 142 insertions(+), 336 deletions(-)
 delete mode 100644 mhcflurry/multiallelic_refinement_command.py
 create mode 100644 mhcflurry/train_presentation_models_command.py

diff --git a/downloads-generation/models_class1_presentation/GENERATE.sh b/downloads-generation/models_class1_presentation/GENERATE.sh
index 29f27c86..d8998f41 100755
--- a/downloads-generation/models_class1_presentation/GENERATE.sh
+++ b/downloads-generation/models_class1_presentation/GENERATE.sh
@@ -82,6 +82,7 @@ else
         --proteome-peptides "$(mhcflurry-downloads path data_mass_spec_benchmark)/proteome_peptides.all.csv.bz2" \
         --decoys-per-hit 99 \
         --exclude-pmid 31844290 31495665 31154438 \
+        --only-format MULTIALLELIC \
         --out "$(pwd)/train_data.csv"
     bzip2 -f train_data.csv
 fi
@@ -89,10 +90,9 @@ fi
 
 mhcflurry-class1-train-presentation-models \
     --data "$(pwd)/train_data.csv.bz2" \
     --affinity-predictor "$(mhcflurry-downloads path models_class1_pan)/models.combined" \
-    --cleavage-predictor-with-flanking "$(mhcflurry-downloads path models_class1_cleavage)/models.selected" \
-    --cleavage-predictor-without-flanking "$(mhcflurry-downloads path models_class1_cleavage_variants)/models.selected.no_flank" \
+    --cleavage-predictor-with-flanks "$(mhcflurry-downloads path models_class1_cleavage)/models.selected" \
+    --cleavage-predictor-without-flanks "$(mhcflurry-downloads path models_class1_cleavage_variants)/models.selected.no_flank" \
     --out-models-dir "$(pwd)/models" \
-    --only-format MULTIALLELIC \
     --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
     $PARALLELISM_ARGS
diff --git a/mhcflurry/multiallelic_refinement_command.py b/mhcflurry/multiallelic_refinement_command.py
deleted file mode 100644
index 6165c4bc..00000000
--- a/mhcflurry/multiallelic_refinement_command.py
+++ /dev/null
@@ -1,331 +0,0 @@
-"""
-Refine pan-allele predictors using multiallelic mass spec.
-"""
-import argparse
-import os
-import signal
-import sys
-import time
-import traceback
-import hashlib
-import yaml
-import pickle
-
-import numpy
-import pandas
-
-import tqdm  # progress bar
-tqdm.monitor_interval = 0  # see https://github.com/tqdm/tqdm/issues/481
-
-from .class1_affinity_predictor import Class1AffinityPredictor
-from .class1_presentation_neural_network import Class1PresentationNeuralNetwork
-from .class1_presentation_predictor import Class1PresentationPredictor
-from .allele_encoding import MultipleAlleleEncoding
-from .common import configure_logging
-from .local_parallelism import (
-    worker_pool_with_gpu_assignments_from_args,
-    add_local_parallelism_args)
-from .cluster_parallelism import (
-    add_cluster_parallelism_args,
-    cluster_results_from_args)
-
-
-# To avoid pickling large matrices to send to child processes when running in
-# parallel, we use this global variable as a place to store data. Data that is
-# stored here before creating the thread pool will be inherited to the child
-# processes upon fork() call, allowing us to share large data with the workers
-# via shared memory.
-GLOBAL_DATA = {}
-
-
-parser = argparse.ArgumentParser(usage=__doc__)
-parser.add_argument(
-    "--multiallelic-data",
-    metavar="FILE.csv",
-    required=False,
-    help="Multiallelic mass spec data.")
-parser.add_argument(
-    "--monoallelic-data",
-    metavar="FILE.csv",
-    required=False,
-    help="Affinity meaurements and monoallelic mass spec data.")
-parser.add_argument(
-    "--models-dir",
-    metavar="DIR",
-    required=True,
-    help="Directory to read models")
-parser.add_argument(
-    "--hyperparameters",
-    metavar="FILE.json",
-    help="presentation predictor hyperparameters")
-parser.add_argument(
-    "--out-affinity-predictor-dir",
-    metavar="DIR",
-    required=True,
-    help="Directory to write refined models")
-parser.add_argument(
-    "--out-presentation-predictor-dir",
-    metavar="DIR",
-    required=True,
-    help="Directory to write preentation predictor")
-parser.add_argument(
-    "--max-models",
-    type=int,
-    default=None)
-parser.add_argument(
-    "--only-alleles-with-mass-spec",
-    default=False,
-    action="store_true")
-parser.add_argument(
-    "--verbosity",
-    type=int,
-    help="Keras verbosity. Default: %(default)s",
-    default=0)
-
-add_local_parallelism_args(parser)
-add_cluster_parallelism_args(parser)
-
-
-def run(argv=sys.argv[1:]):
-    global GLOBAL_DATA
-
-    # On sigusr1 print stack trace
-    print("To show stack trace, run:\nkill -s USR1 %d" % os.getpid())
-    signal.signal(signal.SIGUSR1, lambda sig, frame: traceback.print_stack())
-
-    args = parser.parse_args(argv)
-
-    hyperparameters = yaml.load(open(args.hyperparameters))
-
-    args.out_affinity_predictor_dir = os.path.abspath(
-        args.out_affinity_predictor_dir)
-    args.out_presentation_predictor_dir = os.path.abspath(
-        args.out_presentation_predictor_dir)
-
-    configure_logging(verbose=args.verbosity > 1)
-
-    multiallelic_df = pandas.read_csv(args.multiallelic_data)
-    print("Loaded multiallelic data: %s" % (str(multiallelic_df.shape)))
-
-    monoallelic_df = pandas.read_csv(args.monoallelic_data)
-    print("Loaded monoallelic data: %s" % (str(monoallelic_df.shape)))
-
-    if args.only_alleles_with_mass_spec:
-        multiallelic_alleles = set()
-        for hla in multiallelic_df.hla.unique():
-            multiallelic_alleles.update(hla.split())
-        print(
-            "Multiallelic alleles (%d)" % len(multiallelic_alleles),
-            multiallelic_alleles)
-        new_monoallelic_df = monoallelic_df.loc[
-            monoallelic_df.allele.isin((multiallelic_alleles))
-        ].copy()
-        print(
-            "Allele selection reduced monoallelic data from",
-            len(monoallelic_df),
-            "to",
-            len(new_monoallelic_df))
-        monoallelic_df = new_monoallelic_df
-
-    input_predictor = Class1AffinityPredictor.load(
-        args.models_dir, optimization_level=0, max_models=args.max_models)
-    print("Loaded: %s" % input_predictor)
-
-    sample_table = multiallelic_df.drop_duplicates(
-        "sample_id").set_index("sample_id").loc[
-        multiallelic_df.sample_id.unique()
-    ]
-    grouped = multiallelic_df.groupby("sample_id").nunique()
-    for col in sample_table.columns:
-        if (grouped[col] > 1).any():
-            del sample_table[col]
-    sample_table["alleles"] = sample_table.hla.str.split()
-
-    fold_cols = [c for c in monoallelic_df if c.startswith("fold_")]
-    num_folds = len(fold_cols)
-    if num_folds <= 1:
-        raise ValueError("Too few folds: ", num_folds)
-
-    def make_train_peptide_hash(sub_df):
-        train_peptide_hash = hashlib.sha1()
-        for peptide in sorted(sub_df.peptide.values):
-            train_peptide_hash.update(peptide.encode())
-        return train_peptide_hash.hexdigest()
-
-    work_items = []
-    for model in input_predictor.class1_pan_allele_models:
-        training_info = model.fit_info[-1]['training_info']
-        fold_num = training_info['fold_num']
-        assert num_folds == training_info['num_folds']
-        fold_col = "fold_%d" % fold_num
-        observed_hash = make_train_peptide_hash(
-            monoallelic_df.loc[monoallelic_df[fold_col] == 1])
-        saved_hash = training_info['train_peptide_hash']
-        #numpy.testing.assert_equal(observed_hash, saved_hash)
-        work_items.append({
-            "work_item_num": len(work_items),
-            "affinity_model": model,
-            "fold_num": fold_num,
-        })
-
-    work_items_dict = dict((item['work_item_num'], item) for item in work_items)
-
-    GLOBAL_DATA["monoallelic_data"] = monoallelic_df
-    GLOBAL_DATA["multiallelic_data"] = multiallelic_df
-    GLOBAL_DATA["multiallelic_sample_table"] = sample_table
-    GLOBAL_DATA["hyperparameters"] = hyperparameters
-    GLOBAL_DATA["allele_to_sequence"] = input_predictor.allele_to_sequence
-
-    out_dirs = [
-        args.out_affinity_predictor_dir,
-        args.out_presentation_predictor_dir
-    ]
-
-    for out in out_dirs:
-        if not os.path.exists(out):
-            print("Attempting to create directory:", out)
-            os.mkdir(out)
-            print("Done.")
-
-    metadata_dfs = {
-        "monoallelic_train_data": monoallelic_df,
-        "multiallelic_train_data": multiallelic_df,
-    }
-
-    affinity_models = []
-    presentation_models = []
-
-    serial_run = not args.cluster_parallelism and args.num_jobs == 0
-    worker_pool = None
-    start = time.time()
-    if serial_run:
-        # Serial run
-        print("Running in serial.")
-        results = (refine_model(**item) for item in work_items)
-    elif args.cluster_parallelism:
-        # Run using separate processes HPC cluster.
-        print("Running on cluster.")
-        results = cluster_results_from_args(
-            args,
-            work_function=refine_model,
-            work_items=work_items,
-            constant_data=GLOBAL_DATA,
-            result_serialization_method="pickle")
-    else:
-        worker_pool = worker_pool_with_gpu_assignments_from_args(args)
-        print("Worker pool", worker_pool)
-        assert worker_pool is not None
-
-        print("Processing %d work items in parallel." % len(work_items))
-        assert not serial_run
-
-        # Parallel run
-        results = worker_pool.imap_unordered(
-            do_refine_model_task,
-            work_items,
-            chunksize=1)
-
-    for result in tqdm.tqdm(results, total=len(work_items)):
-        work_item_num = result['work_item_num']
-        work_item = work_items_dict[work_item_num]
-        affinity_model = work_item['affinity_model']
-        presentation_model = pickle.loads(result['presentation_model'])
-        presentation_model.copy_weights_to_affinity_model(affinity_model)
-        affinity_models.append(affinity_model)
-        presentation_models.append(presentation_model)
-
-    if worker_pool:
-        worker_pool.close()
-        worker_pool.join()
-
-    print("Done model fitting. Writing predictors.")
-
-    result_affinity_predictor = Class1AffinityPredictor(
-        class1_pan_allele_models=affinity_models,
-        allele_to_sequence=input_predictor.allele_to_sequence)
-    result_affinity_predictor.save(args.out_affinity_predictor_dir)
-    print("Wrote", args.out_affinity_predictor_dir)
-
-    result_presentation_predictor = Class1PresentationPredictor(
-        models=presentation_models,
-        allele_to_sequence=input_predictor.allele_to_sequence,
-        metadata_dataframes=metadata_dfs)
-    result_presentation_predictor.save(args.out_presentation_predictor_dir)
-    print("Wrote", args.out_presentation_predictor_dir)
-
-
-def do_refine_model_task(item, constant_data=GLOBAL_DATA):
-    return refine_model(constant_data=constant_data, **item)
-
-
-def refine_model(
-        work_item_num, affinity_model, fold_num, constant_data=GLOBAL_DATA):
-    """
-    Refine a model.
- """ - monoallelic_df = constant_data["monoallelic_data"] - multiallelic_df = constant_data["multiallelic_data"] - sample_table = constant_data["multiallelic_sample_table"] - hyperparameters = constant_data["hyperparameters"] - allele_to_sequence = constant_data["allele_to_sequence"] - - fold_col = "fold_%d" % fold_num - - multiallelic_train_df = multiallelic_df[ - ["sample_id", "peptide", "hit"] - ].rename(columns={"hit": "label"}).copy() - multiallelic_train_df["is_affinity"] = False - multiallelic_train_df["validation_weight"] = 1.0 - - monoallelic_train_df = monoallelic_df[ - ["peptide", "allele", "measurement_inequality", "measurement_value"] - ].copy() - monoallelic_train_df["label"] = monoallelic_train_df["measurement_value"] - del monoallelic_train_df["measurement_value"] - monoallelic_train_df["is_affinity"] = True - - # We force all validation affinities to be from the validation set used - # originally to train the predictor. To ensure proportional sampling between - # the affinity and multiallelic mass spec data, we set their weight to - # as follows. - monoallelic_train_df["validation_weight"] = ( - (monoallelic_df[fold_col] == 0).astype(float) * ( - (monoallelic_df[fold_col] == 1).sum() / - (monoallelic_df[fold_col] == 0).sum())) - - combined_train_df = pandas.concat( - [multiallelic_train_df, monoallelic_train_df], - ignore_index=True, - sort=False) - - allele_encoding = MultipleAlleleEncoding( - experiment_names=multiallelic_train_df.sample_id.values, - experiment_to_allele_list=sample_table.alleles.to_dict(), - allele_to_sequence=allele_to_sequence, - ) - allele_encoding.append_alleles(monoallelic_train_df.allele.values) - allele_encoding = allele_encoding.compact() - - presentation_model = Class1PresentationNeuralNetwork(**hyperparameters) - presentation_model.load_from_class1_neural_network(affinity_model) - presentation_model.fit( - peptides=combined_train_df.peptide.values, - targets=combined_train_df.label.values, - allele_encoding=allele_encoding, - affinities_mask=combined_train_df.is_affinity.values, - inequalities=combined_train_df.measurement_inequality.values, - validation_weights=combined_train_df.validation_weight.values) - - return { - 'work_item_num': work_item_num, - - # We pickle it here so it always gets pickled, even when running in one - # process. This prevents tensorflow errors when using thread-level - # parallelism. - 'presentation_model': pickle.dumps( - presentation_model, pickle.HIGHEST_PROTOCOL), - } - - -if __name__ == '__main__': - run() diff --git a/mhcflurry/train_presentation_models_command.py b/mhcflurry/train_presentation_models_command.py new file mode 100644 index 00000000..d23d0106 --- /dev/null +++ b/mhcflurry/train_presentation_models_command.py @@ -0,0 +1,137 @@ +""" +Train Class1 presentation models. +""" +from __future__ import print_function +import argparse +import os +import signal +import sys +import time +import traceback + +import pandas +import tqdm # progress bar +tqdm.monitor_interval = 0 # see https://github.com/tqdm/tqdm/issues/481 + +from .class1_cleavage_predictor import Class1CleavagePredictor +from .class1_affinity_predictor import Class1AffinityPredictor +from .class1_presentation_predictor import Class1PresentationPredictor +from .common import configure_logging + +parser = argparse.ArgumentParser(usage=__doc__) + +parser.add_argument( + "--data", + metavar="FILE.csv", + help="Training data CSV. 
+parser.add_argument(
+    "--out-models-dir",
+    metavar="DIR",
+    required=True,
+    help="Directory to write models and manifest")
+parser.add_argument(
+    "--affinity-predictor",
+    metavar="DIR",
+    required=True,
+    help="Affinity predictor models dir")
+parser.add_argument(
+    "--cleavage-predictor-with-flanks",
+    metavar="DIR",
+    required=True,
+    help="Cleavage predictor with flanking")
+parser.add_argument(
+    "--cleavage-predictor-without-flanks",
+    metavar="DIR",
+    required=True,
+    help="Cleavage predictor without flanking")
+parser.add_argument(
+    "--verbosity",
+    type=int,
+    help="Default: %(default)s",
+    default=0)
+parser.add_argument(
+    "--debug",
+    action="store_true",
+    default=False,
+    help="Launch python debugger on error")
+parser.add_argument(
+    "--hla-column",
+    default="hla",
+    help="Column in data giving space-separated MHC I alleles")
+parser.add_argument(
+    "--target-column",
+    default="hit",
+    help="Column in data giving hit (1) vs decoy (0)")
+
+def run(argv=sys.argv[1:]):
+    # On sigusr1 print stack trace
+    print("To show stack trace, run:\nkill -s USR1 %d" % os.getpid())
+    signal.signal(signal.SIGUSR1, lambda sig, frame: traceback.print_stack())
+
+    args = parser.parse_args(argv)
+
+    if args.debug:
+        try:
+            return main(args)
+        except Exception as e:
+            print(e)
+            import ipdb  # pylint: disable=import-error
+            ipdb.set_trace()
+            raise
+    else:
+        return main(args)
+
+
+def main(args):
+    print("Arguments:")
+    print(args)
+
+    args.out_models_dir = os.path.abspath(args.out_models_dir)
+    configure_logging(verbose=args.verbosity > 1)
+
+    df = pandas.read_csv(args.data)
+    print("Loaded training data: %s" % (str(df.shape)))
+    df = df.loc[
+        (df.peptide.str.len() >= 8) & (df.peptide.str.len() <= 15)
+    ]
+    print("Subselected to 8-15mers: %s" % (str(df.shape)))
+
+    df["experiment_id"] = df[args.hla_column]
+    experiment_to_alleles = dict((
+        key, key.split()) for key in df.experiment_id.unique())
+
+    if not os.path.exists(args.out_models_dir):
+        print("Attempting to create directory: %s" % args.out_models_dir)
+        os.mkdir(args.out_models_dir)
+        print("Done.")
+
+    affinity_predictor = Class1AffinityPredictor.load(args.affinity_predictor)
+    cleavage_predictor_with_flanks = Class1CleavagePredictor.load(
+        args.cleavage_predictor_with_flanks)
+    cleavage_predictor_without_flanks = Class1CleavagePredictor.load(
+        args.cleavage_predictor_without_flanks)
+
+    predictor = Class1PresentationPredictor(
+        affinity_predictor=affinity_predictor,
+        cleavage_predictor_with_flanks=cleavage_predictor_with_flanks,
+        cleavage_predictor_without_flanks=cleavage_predictor_without_flanks)
+
+    print("Fitting.")
+    start = time.time()
+    predictor.fit(
+        targets=df[args.target_column].values,
+        peptides=df.peptide.values,
+        alleles=experiment_to_alleles,
+        experiment_names=df.experiment_id,
+        n_flanks=df.n_flank.values,
+        c_flanks=df.c_flank.values,
+        verbose=args.verbosity)
+    print("Done fitting in", time.time() - start, "seconds")
+
+    print("Saving")
+    predictor.save(args.out_models_dir)
+    print("Wrote", args.out_models_dir)
+
+
+if __name__ == '__main__':
+    run()
diff --git a/setup.py b/setup.py
index 69449ce4..6db5db77 100644
--- a/setup.py
+++ b/setup.py
@@ -87,8 +87,8 @@ if __name__ == '__main__':
             'mhcflurry.select_cleavage_models_command:run',
             'mhcflurry-calibrate-percentile-ranks = '
             'mhcflurry.calibrate_percentile_ranks_command:run',
-            'mhcflurry-multiallelic-refinement = '
-            'mhcflurry.multiallelic_refinement_command:run',
+            'mhcflurry-class1-train-presentation-models = '
+            'mhcflurry.train_presentation_models_command:run',
             '_mhcflurry-cluster-worker-entry-point = '
             'mhcflurry.cluster_parallelism:worker_entry_point',
         ]
--
GitLab