"""
Train Class1 single allele models.
"""
import argparse
import os
import signal
import sys
import time
import traceback
from functools import partial
from multiprocessing import Pool
from pprint import pprint

import pandas
import tqdm  # progress bar
import yaml

from mhcnames import normalize_allele_name

from .class1_affinity_predictor import Class1AffinityPredictor
from .class1_neural_network import Class1NeuralNetwork
from .common import configure_logging, set_keras_backend


# To avoid pickling large matrices to send to child processes when running in
# parallel, we use this global variable as a place to store data. Data that is
# stored here before creating the thread pool will be inherited to the child
# processes upon fork() call, allowing us to share large data with the workers
# efficiently.
GLOBAL_DATA = dict()
# Command-line interface. Defined at module level so tooling and tests can
# import and inspect it directly.
parser = argparse.ArgumentParser(usage=__doc__)

parser.add_argument(
    "data",
    metavar="FILE.csv",
    help=(
        "Training data CSV. Expected columns: "
        "allele, peptide, measurement_value"))
parser.add_argument(
    "--out-models-dir",
    metavar="DIR",
    help="Directory to write models and manifest")
parser.add_argument(
    "--hyperparameters",
    metavar="FILE.json",
    required=True,
    help="JSON or YAML of hyperparameters")
parser.add_argument(
    "--allele",
    default=None,
    nargs="+",
    help="Alleles to train models for. If not specified, all alleles with "
    "enough measurements will be used.")
parser.add_argument(
    "--min-measurements-per-allele",
    type=int,
    metavar="N",
    default=50,
    help="Train models for alleles with >=N measurements.")
parser.add_argument(
    "--only-quantitative",
    action="store_true",
    default=False,
    help="Use only quantitative training data")
parser.add_argument(
    "--ignore-inequalities",
    action="store_true",
    default=False,
    help="Do not use affinity value inequalities even when present in data")
parser.add_argument(
    "--percent-rank-calibration-num-peptides-per-length",
    type=int,
    metavar="N",
    default=int(1e5),
    help="Number of peptides per length to use to calibrate percent ranks. "
    "Set to 0 to disable percent rank calibration. The resulting models will "
    "not support percent ranks. Default: %(default)s.")
parser.add_argument(
    "--n-models",
    type=int,
    metavar="N",
    help="Ensemble size, i.e. how many models to train for each architecture. "
    "If specified here it overrides any 'n_models' specified in the "
    "hyperparameters.")
parser.add_argument(
    "--max-epochs",
    type=int,
    metavar="N",
    help="Max training epochs. If specified here it overrides any 'max_epochs' "
    "specified in the hyperparameters.")
parser.add_argument(
    "--verbosity",
    type=int,
    help="Keras verbosity. Default: %(default)s",
    default=0)
parser.add_argument(
    "--train-num-jobs",
    default=1,
    type=int,
    metavar="N",
    help="Number of processes to parallelize training over. "
    "Set to 1 for serial run. Set to 0 to use number of cores. Experimental. "
    "Default: %(default)s.")
parser.add_argument(
    "--calibration-num-jobs",
    default=1,
    type=int,
    metavar="N",
    help="Number of processes to parallelize percent rank calibration over. "
    "Set to 1 for serial run. Set to 0 to use number of cores. Experimental. "
    "Default: %(default)s.")
parser.add_argument(
    "--backend",
    choices=("tensorflow-gpu", "tensorflow-cpu"),
    help="Keras backend. If not specified will use system default.")
def run(argv=sys.argv[1:]):
    """
    Entry point: parse arguments, train an allele-specific Class1 predictor
    (serially or across a process pool), then optionally calibrate percent
    ranks and save everything to --out-models-dir.
    """
    global GLOBAL_DATA

    # On SIGUSR1 print stack trace (useful for debugging apparent hangs).
    print("To show stack trace, run:\nkill -s USR1 %d" % os.getpid())
    signal.signal(signal.SIGUSR1, lambda sig, frame: traceback.print_stack())

    args = parser.parse_args(argv)

    if args.backend:
        set_keras_backend(args.backend)

    configure_logging(verbose=args.verbosity > 1)

    # safe_load is sufficient for plain JSON/YAML hyperparameters and avoids
    # arbitrary-code execution on untrusted files; close the handle promptly.
    with open(args.hyperparameters) as fd:
        hyperparameters_lst = yaml.safe_load(fd)
    assert isinstance(hyperparameters_lst, list)
    print("Loaded hyperparameters list: %s" % str(hyperparameters_lst))

    df = pandas.read_csv(args.data)
    print("Loaded training data: %s" % (str(df.shape)))

    # .ix is deprecated; .loc with a boolean mask is the supported spelling.
    df = df.loc[
        (df.peptide.str.len() >= 8) & (df.peptide.str.len() <= 15)
    ]
    print("Subselected to 8-15mers: %s" % (str(df.shape)))

    if args.only_quantitative:
        df = df.loc[
            df.measurement_type == "quantitative"
        ]
        print("Subselected to quantitative: %s" % (str(df.shape)))

    if args.ignore_inequalities and "measurement_inequality" in df.columns:
        print("Dropping measurement_inequality column")
        del df["measurement_inequality"]

    allele_counts = df.allele.value_counts()

    if args.allele:
        alleles = [normalize_allele_name(a) for a in args.allele]
    else:
        alleles = list(allele_counts.loc[
            allele_counts > args.min_measurements_per_allele
        ].index)

    # Allele names in data are assumed to be already normalized.
    df = df.loc[df.allele.isin(alleles)].dropna()

    print("Selected %d alleles: %s" % (len(alleles), ' '.join(alleles)))
    print("Training data: %s" % (str(df.shape)))

    # Stash the training data in a module-level global so forked workers
    # inherit it rather than receiving it via pickling.
    GLOBAL_DATA["train_data"] = df

    predictor = Class1AffinityPredictor()
    if args.train_num_jobs == 1:
        # Serial run.
        print("Running in serial.")
        worker_pool = None
    else:
        # Parallel run. 0 means "use all cores" (Pool(processes=None)).
        worker_pool = Pool(
            processes=(
                args.train_num_jobs
                if args.train_num_jobs else None))
        print("Using worker pool: %s" % str(worker_pool))

    if args.out_models_dir and not os.path.exists(args.out_models_dir):
        print("Attempting to create directory: %s" % args.out_models_dir)
        os.mkdir(args.out_models_dir)
        print("Done.")

    start = time.time()
    work_items = []
    for (h, hyperparameters) in enumerate(hyperparameters_lst):
        # Ensemble size may come from the hyperparameters or the CLI; the
        # CLI flag wins.
        n_models = None
        if 'n_models' in hyperparameters:
            n_models = hyperparameters.pop("n_models")
        if args.n_models:
            n_models = args.n_models
        if not n_models:
            raise ValueError(
                "Specify --n-models or n_models hyperparameter")

        if args.max_epochs:
            hyperparameters['max_epochs'] = args.max_epochs

        for (i, allele) in enumerate(df.allele.unique()):
            for model_group in range(n_models):
                work_dict = {
                    'model_group': model_group,
                    'n_models': n_models,
                    'allele_num': i,
                    'n_alleles': len(alleles),
                    'hyperparameter_set_num': h,
                    'num_hyperparameter_sets': len(hyperparameters_lst),
                    'allele': allele,
                    'data': None,  # subselect from GLOBAL_DATA["train_data"]
                    'hyperparameters': hyperparameters,
                    'verbose': args.verbosity,
                    'predictor': predictor if not worker_pool else None,
                    'save_to': args.out_models_dir if not worker_pool else None,
                }
                work_items.append(work_dict)

    if worker_pool:
        print("Processing %d work items in parallel." % len(work_items))
        predictors = list(
            tqdm.tqdm(
                worker_pool.imap_unordered(
                    train_model_entrypoint, work_items, chunksize=1),
                ascii=True,
                total=len(work_items)))

        print("Merging %d predictors fit in parallel." % (len(predictors)))
        predictor = Class1AffinityPredictor.merge([predictor] + predictors)
        print("Saving merged predictor to: %s" % args.out_models_dir)
        predictor.save(args.out_models_dir)
    else:
        # Run in serial. In this case, every worker is passed the same
        # predictor, which it adds models to, so no merging is required.
        # It also saves as it goes so no saving is required at the end.
        for _ in tqdm.trange(len(work_items)):
            item = work_items.pop(0)  # want to keep freeing up memory
            work_predictor = train_model_entrypoint(item)
            assert work_predictor is predictor
        assert not work_items

    print("*" * 30)
    training_time = time.time() - start
    print("Trained affinity predictor with %d networks in %0.2f min." % (
        len(predictor.neural_networks), training_time / 60.0))
    print("*" * 30)

    if worker_pool:
        worker_pool.close()
        worker_pool.join()
        worker_pool = None

    start = time.time()
    if args.percent_rank_calibration_num_peptides_per_length > 0:
        alleles = list(predictor.supported_alleles)
        first_allele = alleles.pop(0)

        # Calibrate one allele first so the peptide encoding gets computed
        # (and cached) once; remaining alleles reuse it.
        print("Performing percent rank calibration. Calibrating first allele.")
        encoded_peptides = predictor.calibrate_percentile_ranks(
            alleles=[first_allele],
            num_peptides_per_length=(
                args.percent_rank_calibration_num_peptides_per_length))
        assert encoded_peptides.encoding_cache  # must have cached the encoding
        print(
            "Finished calibrating percent ranks for first allele "
            "in %0.2f sec." % (time.time() - start))
        print("Calibrating %d additional alleles." % len(alleles))

        if args.calibration_num_jobs == 1:
            # Serial run.
            print("Running in serial.")
            worker_pool = None
            results = (
                calibrate_percentile_ranks(
                    allele=allele,
                    predictor=predictor,
                    peptides=encoded_peptides)
                for allele in alleles)
        else:
            # Parallel run. Store peptides in a global variable so they are
            # in shared memory after fork, instead of needing to be pickled.
            GLOBAL_DATA["calibration_peptides"] = encoded_peptides
            Class1NeuralNetwork.clear_model_cache()
            worker_pool = Pool(
                processes=(
                    args.calibration_num_jobs
                    if args.calibration_num_jobs else None))
            print("Using worker pool: %s" % str(worker_pool))
            # Workers are given the models directory path; each loads the
            # predictor from disk rather than unpickling it.
            results = worker_pool.imap_unordered(
                partial(
                    calibrate_percentile_ranks,
                    predictor=args.out_models_dir),
                alleles,
                chunksize=1)

        for result in tqdm.tqdm(results, ascii=True, total=len(alleles)):
            predictor.allele_to_percent_rank_transform.update(result)
        print("Done calibrating %d additional alleles." % len(alleles))
        # model_names_to_write=[]: only the manifest / percent-rank data is
        # rewritten; network weights were already saved during training.
        predictor.save(args.out_models_dir, model_names_to_write=[])

    percent_rank_calibration_time = time.time() - start

    if worker_pool:
        worker_pool.close()
        worker_pool.join()
        worker_pool = None

    print("Train time: %0.2f min. Percent rank calibration time: %0.2f min." % (
        training_time / 60.0, percent_rank_calibration_time / 60.0))
    print("Predictor written to: %s" % args.out_models_dir)
def train_model_entrypoint(item):
    # Adapter for Pool.imap_unordered: expand a work-item dict (built in
    # run()) into keyword arguments for train_model.
    return train_model(**item)
        model_group,
        n_models,
        allele_num,
        n_alleles,
        hyperparameter_set_num,
        num_hyperparameter_sets,
        allele,
        verbose,
        predictor,
        save_to):

    if predictor is None:
        predictor = Class1AffinityPredictor()

    if data is None:
        full_data = GLOBAL_DATA["train_data"]
        data = full_data.loc[full_data.allele == allele]

    progress_preamble = (
        "[%4d / %4d alleles] "
        "[%2d / %2d replicates]: %s " % (
            hyperparameter_set_num + 1,
            num_hyperparameter_sets,
            allele_num + 1,
            n_alleles,
            model_group + 1,
            n_models,
    train_data = data.sample(frac=1.0)
Tim O'Donnell's avatar
Tim O'Donnell committed
    (model,) = predictor.fit_allele_specific_predictors(
        architecture_hyperparameters_list=[hyperparameters],
        allele=allele,
        peptides=train_data.peptide.values,
        affinities=train_data.measurement_value.values,
        inequalities=(
            train_data.measurement_inequality.values
            if "measurement_inequality" in train_data.columns else None),
        models_dir_for_save=save_to,
        progress_preamble=progress_preamble,
        verbose=verbose)
Tim O'Donnell's avatar
Tim O'Donnell committed
    if allele_num == 0 and model_group == 0:
        # For the first model for the first allele, print the architecture.
Tim O'Donnell's avatar
Tim O'Donnell committed
        print("*** HYPERPARAMETER SET %d***" %
              (hyperparameter_set_num + 1))
        pprint(hyperparameters)
Tim O'Donnell's avatar
Tim O'Donnell committed
        print("*** ARCHITECTURE FOR HYPERPARAMETER SET %d***" %
              (hyperparameter_set_num + 1))
        model.network(borrow=True).summary()

def calibrate_percentile_ranks(allele, predictor, peptides=None):
    """
    Private helper: fit the percent-rank transform for a single allele.

    Returns a one-entry dict {allele: transform} suitable for merging into
    predictor.allele_to_percent_rank_transform in the parent process.
    """
    global GLOBAL_DATA

    # Start each invocation (possibly a fresh worker process) from a clean
    # keras session to bound memory use.
    Class1NeuralNetwork.clear_model_cache()
    import keras.backend as K
    K.clear_session()

    if peptides is None:
        # Parallel mode: peptides were stashed in the fork-inherited global
        # instead of being pickled into the call.
        peptides = GLOBAL_DATA["calibration_peptides"]
    if isinstance(predictor, str):
        # A models-directory path was given; load the predictor in-worker.
        predictor = Class1AffinityPredictor.load(predictor)

    predictor.calibrate_percentile_ranks(peptides=peptides, alleles=[allele])
    transform = predictor.allele_to_percent_rank_transform[allele]
    return {allele: transform}
# Support direct invocation in addition to the console-script entry point.
if __name__ == '__main__':
    run()