From 1cbb85cf52d5ca4e575de3f34335fa97bfd2d00c Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Mon, 9 Dec 2019 11:46:16 -0500
Subject: [PATCH] first cut on downloads-generation/models_class1_pan_refined

---
 .../models_class1_pan_refined/GENERATE.sh     |  41 ++-
 .../additional_alleles.txt                    |   1 +
 .../cluster_submit_script_header.mssm_hpc.lsf |   1 +
 .../hyperparameters.yaml                      |  14 +-
 mhcflurry/batch_generator.py                  |  18 +-
 .../class1_presentation_neural_network.py     |  13 +-
 mhcflurry/class1_presentation_predictor.py    |   7 +-
 mhcflurry/common.py                           |  17 +
 mhcflurry/multiallelic_refinement_command.py  | 322 ++++++++----------
 9 files changed, 218 insertions(+), 216 deletions(-)
 create mode 120000 downloads-generation/models_class1_pan_refined/additional_alleles.txt
 create mode 120000 downloads-generation/models_class1_pan_refined/cluster_submit_script_header.mssm_hpc.lsf

diff --git a/downloads-generation/models_class1_pan_refined/GENERATE.sh b/downloads-generation/models_class1_pan_refined/GENERATE.sh
index 6591e853..e85c805b 100755
--- a/downloads-generation/models_class1_pan_refined/GENERATE.sh
+++ b/downloads-generation/models_class1_pan_refined/GENERATE.sh
@@ -36,42 +36,58 @@ rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
 mkdir -p "$SCRATCH_DIR/$DOWNLOAD_NAME"
 
 # Send stdout and stderr to a logfile included with the archive.
-#LOG="$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.$(date +%s).txt"
-#exec > >(tee -ia "$LOG")
-#exec 2> >(tee -ia "$LOG" >&2)
+LOG="$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.$(date +%s).txt"
+exec > >(tee -ia "$LOG")
+exec 2> >(tee -ia "$LOG" >&2)
 
 # Log some environment info
-#echo "Invocation: $0 $@"
-#date
-#pip freeze
-#git status
+echo "Invocation: $0 $@"
+date
+pip freeze
+git status
 
 cd $SCRATCH_DIR/$DOWNLOAD_NAME
 
 export OMP_NUM_THREADS=1
 export PYTHONUNBUFFERED=1
 
-# CAHNGE TO CP
-ln -s $SCRIPT_DIR/make_multiallelic_training_data.py .
+cp $SCRIPT_DIR/make_multiallelic_training_data.py .
+cp $SCRIPT_DIR/hyperparameters.yaml .
 
 time python make_multiallelic_training_data.py \
     --hits "$(mhcflurry-downloads path data_mass_spec_annotated)/annotated_ms.csv.bz2" \
     --expression "$(mhcflurry-downloads path data_curated)/rna_expression.csv.bz2" \
     --out train.multiallelic.csv
 
+MONOALLELIC_TRAIN="$(mhcflurry-downloads path models_class1_pan)/models.with_mass_spec/train_data.csv.bz2"
+
+ALLELE_LIST=$(bzcat "$MONOALLELIC_TRAIN" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq)
+ALLELE_LIST+=" $(cat train.multiallelic.csv | cut -f 7 -d , | grep -v hla | uniq | tr ' ' '\n' | sort | uniq)"
+ALLELE_LIST+=$(echo " " $(cat $SCRIPT_DIR/additional_alleles.txt | grep -v '#') )
+
 time mhcflurry-multiallelic-refinement \
-    --monoallelic-data "$(mhcflurry-downloads path data_curated)/curated_training_data.with_mass_spec.csv.bz2" \
+    --monoallelic-data "$MONOALLELIC_TRAIN" \
     --multiallelic-data train.multiallelic.csv \
-    --models-dir "$(mhcflurry-downloads path models_class1_pan)/models.with_mass_spec"
+    --models-dir "$(mhcflurry-downloads path models_class1_pan)/models.with_mass_spec" \
     --hyperparameters hyperparameters.yaml \
     --out-affinity-predictor-dir $(pwd)/models.affinity \
     --out-presentation-predictor-dir $(pwd)/models.presentation \
     --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
     $PARALLELISM_ARGS
 
+time mhcflurry-calibrate-percentile-ranks \
+    --models-dir $(pwd)/models.affinity \
+    --match-amino-acid-distribution-data "$MONOALLELIC_TRAIN" \
+    --motif-summary \
+    --num-peptides-per-length 100000 \
+    --allele $ALLELE_LIST \
+    --verbosity 1 \
+    $PARALLELISM_ARGS
+
 echo "Done training."
 
-bzip2 train.multiallelic.csv
+rm train.multiallelic.csv
 
 cp $SCRIPT_ABSOLUTE_PATH .
 bzip2 -f "$LOG"
@@ -79,4 +95,3 @@ for i in $(ls LOG-worker.*.txt) ; do bzip2 -f $i ; done
 RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
 tar -cjf "$RESULT" *
 echo "Created archive: $RESULT"
-
diff --git a/downloads-generation/models_class1_pan_refined/additional_alleles.txt b/downloads-generation/models_class1_pan_refined/additional_alleles.txt
new file mode 120000
index 00000000..1ad4ba7a
--- /dev/null
+++ b/downloads-generation/models_class1_pan_refined/additional_alleles.txt
@@ -0,0 +1 @@
+../models_class1_pan/additional_alleles.txt
\ No newline at end of file
diff --git a/downloads-generation/models_class1_pan_refined/cluster_submit_script_header.mssm_hpc.lsf b/downloads-generation/models_class1_pan_refined/cluster_submit_script_header.mssm_hpc.lsf
new file mode 120000
index 00000000..1860a77b
--- /dev/null
+++ b/downloads-generation/models_class1_pan_refined/cluster_submit_script_header.mssm_hpc.lsf
@@ -0,0 +1 @@
+../models_class1_pan/cluster_submit_script_header.mssm_hpc.lsf
\ No newline at end of file
diff --git a/downloads-generation/models_class1_pan_refined/hyperparameters.yaml b/downloads-generation/models_class1_pan_refined/hyperparameters.yaml
index 908db6fc..e8b53748 100644
--- a/downloads-generation/models_class1_pan_refined/hyperparameters.yaml
+++ b/downloads-generation/models_class1_pan_refined/hyperparameters.yaml
@@ -1,7 +1,7 @@
--
-  #########################
-  # Batch generation
-  #########################
-  batch_generator_validation_split: 0.1,
-  batch_generator_batch_size: 1024,
-  batch_generator_affinity_fraction: 0.5
\ No newline at end of file
+#########################
+# Batch generation
+#########################
+batch_generator_validation_split: 0.1
+batch_generator_batch_size: 1024
+batch_generator_affinity_fraction: 0.5
+max_epochs: 500
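
The batch_generator.py change below replaces the boolean potential_validation_mask
with per-row validation_weights: validation rows are drawn by numpy.random.choice
with probabilities proportional to the weights, so a zero weight excludes a row
entirely and larger weights make it proportionally more likely to be held out. A
minimal standalone sketch of that sampling scheme (names here are illustrative,
not mhcflurry API):

    import numpy

    def split_validation(n_rows, validation_split, weights=None):
        # Return a boolean mask selecting the validation rows. With
        # weights=None this degenerates to a uniform random split.
        if weights is not None:
            weights = numpy.array(weights, dtype=float)
            weights /= weights.sum()  # numpy.random.choice requires p to sum to 1
        validation_items = numpy.random.choice(
            n_rows, int(validation_split * n_rows), replace=False, p=weights)
        mask = numpy.zeros(n_rows, dtype=bool)
        mask[validation_items] = True
        return mask

    # Rows with weight 0.0 (the random negatives, below) can never be sampled.
    mask = split_validation(10, 0.2, weights=[0, 0, 1, 1, 1, 1, 1, 1, 1, 1])
    assert not mask[:2].any()
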
diff --git a/mhcflurry/batch_generator.py b/mhcflurry/batch_generator.py
index 4a0abb40..83d495f0 100644
--- a/mhcflurry/batch_generator.py
+++ b/mhcflurry/batch_generator.py
@@ -194,7 +194,7 @@ class MultiallelicMassSpecBatchGenerator(object):
             experiment_names,
             alleles_matrix,
             is_binder,
-            potential_validation_mask=None):
+            validation_weights=None):
         affinities_mask = numpy.array(affinities_mask, copy=False, dtype=bool)
         experiment_names = numpy.array(experiment_names, copy=False)
         alleles_matrix = numpy.array(alleles_matrix, copy=False)
@@ -206,14 +206,18 @@ class MultiallelicMassSpecBatchGenerator(object):
         numpy.testing.assert_equal(len(is_binder), n)
         numpy.testing.assert_equal(
             affinities_mask, pandas.isnull(experiment_names))
-        if potential_validation_mask is not None:
-            numpy.testing.assert_equal(len(potential_validation_mask), n)
+
+        if validation_weights is not None:
+            validation_weights = numpy.array(
+                validation_weights, copy=True, dtype=float)
+            numpy.testing.assert_equal(len(validation_weights), n)
+            validation_weights /= validation_weights.sum()
 
         validation_items = numpy.random.choice(
-            n if potential_validation_mask is None
-            else numpy.where(potential_validation_mask)[0],
-            int(self.hyperparameters['batch_generator_validation_split'] * n),
-            replace=False)
+            n,
+            int((self.hyperparameters['batch_generator_validation_split']) * n),
+            replace=False,
+            p=validation_weights)
         validation_mask = numpy.zeros(n, dtype=bool)
         validation_mask[validation_items] = True
diff --git a/mhcflurry/class1_presentation_neural_network.py b/mhcflurry/class1_presentation_neural_network.py
index d3a69e93..a1c495fa 100644
--- a/mhcflurry/class1_presentation_neural_network.py
+++ b/mhcflurry/class1_presentation_neural_network.py
@@ -239,7 +239,7 @@ class Class1PresentationNeuralNetwork(object):
 
         def logit(x):
             import tensorflow as tf
-            return - tf.log(
+            return - tf.math.log(
                 tf.maximum(
                     tf.math.divide_no_nan(1., x) - 1.,
                     0.0))
@@ -325,6 +325,7 @@ class Class1PresentationNeuralNetwork(object):
             allele_encoding,
             affinities_mask=None,  # True when a peptide/label is actually a peptide and an affinity
             inequalities=None,  # interpreted only for elements where affinities_mask is True, otherwise ignored
+            validation_weights=None,
             verbose=1,
             progress_callback=None,
             progress_preamble="",
@@ -349,6 +350,10 @@ class Class1PresentationNeuralNetwork(object):
             affinities_mask = numpy.array(affinities_mask, copy=False)
         else:
             affinities_mask = numpy.tile(False, len(labels))
+        if validation_weights is None:
+            validation_weights = numpy.tile(1.0, len(labels))
+        else:
+            validation_weights = numpy.array(validation_weights, copy=False)
         inequalities[~affinities_mask] = "="
 
         random_negatives_planner = RandomNegativePeptides(
@@ -486,9 +491,9 @@ class Class1PresentationNeuralNetwork(object):
                 numpy.tile(False, num_random_negatives),
                 numpy.where(affinities_mask, labels, to_ic50(labels)) < 1000.0
             ]),
-            potential_validation_mask=numpy.concatenate([
-                numpy.tile(False, num_random_negatives),
-                numpy.tile(True, len(labels))
+            validation_weights=numpy.concatenate([
+                numpy.tile(0.0, num_random_negatives),
+                validation_weights
             ]),
         )
         if verbose:
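
The logit helper patched above is the inverse sigmoid, clamped so the argument
of the log stays non-negative; the change itself is just the TensorFlow 2.x
rename of tf.log to tf.math.log. For reference, a numpy sketch of the same
function (clamped at a small epsilon here, rather than at 0.0 as above, so the
example stays finite; illustrative only):

    import numpy

    def logit(x):
        # Inverse of sigmoid(y) = 1 / (1 + exp(-y)): y = -log(1/x - 1), x in (0, 1).
        return -numpy.log(numpy.maximum(1.0 / x - 1.0, 1e-16))

    x = numpy.array([0.25, 0.5, 0.75])
    numpy.testing.assert_allclose(
        1.0 / (1.0 + numpy.exp(-logit(x))), x, rtol=1e-6)
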
diff --git a/mhcflurry/class1_presentation_predictor.py b/mhcflurry/class1_presentation_predictor.py
index 4a4f3bc6..cc266f22 100644
--- a/mhcflurry/class1_presentation_predictor.py
+++ b/mhcflurry/class1_presentation_predictor.py
@@ -25,7 +25,8 @@ from .regression_target import from_ic50, to_ic50
 from .allele_encoding import MultipleAlleleEncoding
 from .downloads import get_default_class1_presentation_models_dir
 from .class1_presentation_neural_network import Class1PresentationNeuralNetwork
-from .common import save_weights, load_weights
+from .common import save_weights, load_weights, NumpyJSONEncoder
+
 
 class Class1PresentationPredictor(object):
     def __init__(
@@ -55,7 +56,7 @@ class Class1PresentationPredictor(object):
             model_config = model.get_config()
             rows.append((
                 self.model_name(i),
-                json.dumps(model_config),
+                json.dumps(model_config, cls=NumpyJSONEncoder),
                 model
             ))
         self._manifest_df = pandas.DataFrame(
@@ -234,7 +235,7 @@ class Class1PresentationPredictor(object):
         updated_network_config_jsons = []
         for (_, row) in sub_manifest_df.iterrows():
             updated_network_config_jsons.append(
-                json.dumps(row.model.get_config()))
+                json.dumps(row.model.get_config(), cls=NumpyJSONEncoder))
             weights_path = self.weights_path(models_dir, row.model_name)
             save_weights(row.model.get_weights(), weights_path)
             logging.info("Wrote: %s", weights_path)
diff --git a/mhcflurry/common.py b/mhcflurry/common.py
index f51696ea..cd602d25 100644
--- a/mhcflurry/common.py
+++ b/mhcflurry/common.py
@@ -4,6 +4,7 @@ import logging
 import sys
 import os
 import warnings
+import json
 
 import numpy
 import pandas
@@ -206,3 +207,19 @@ def load_weights(filename):
     with numpy.load(filename) as loaded:
         weights = [loaded["array_%d" % i] for i in range(len(loaded.keys()))]
     return weights
+
+
+class NumpyJSONEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, (
+                numpy.int_, numpy.intc, numpy.intp, numpy.int8,
+                numpy.int16, numpy.int32, numpy.int64, numpy.uint8,
+                numpy.uint16, numpy.uint32, numpy.uint64)):
+            return int(obj)
+        elif isinstance(obj, (
+                numpy.float_, numpy.float16, numpy.float32,
+                numpy.float64)):
+            return float(obj)
+        if isinstance(obj, numpy.ndarray):
+            return obj.tolist()
+        return json.JSONEncoder.default(self, obj)
\ No newline at end of file
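
The NumpyJSONEncoder added to common.py above exists because model configs can
contain numpy scalars and arrays, which the stdlib json module rejects with
"Object of type int64 is not JSON serializable". A usage sketch (assuming this
patch is applied, so the class is importable from mhcflurry.common):

    import json
    import numpy
    from mhcflurry.common import NumpyJSONEncoder

    config = {"max_epochs": numpy.int64(500), "split": numpy.float32(0.1)}
    # json.dumps(config) would raise TypeError; with the encoder it works:
    print(json.dumps(config, cls=NumpyJSONEncoder))
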
diff --git a/mhcflurry/multiallelic_refinement_command.py b/mhcflurry/multiallelic_refinement_command.py
index 324efff0..e059e112 100644
--- a/mhcflurry/multiallelic_refinement_command.py
+++ b/mhcflurry/multiallelic_refinement_command.py
@@ -8,7 +8,8 @@ import sys
 import time
 import traceback
 import hashlib
-from pprint import pprint
+import yaml
+import pickle
 
 import numpy
 import pandas
@@ -17,8 +18,9 @@ import tqdm  # progress bar
 tqdm.monitor_interval = 0  # see https://github.com/tqdm/tqdm/issues/481
 
 from .class1_affinity_predictor import Class1AffinityPredictor
-from .encodable_sequences import EncodableSequences
-from .allele_encoding import AlleleEncoding
+from .class1_presentation_neural_network import Class1PresentationNeuralNetwork
+from .class1_presentation_predictor import Class1PresentationPredictor
+from .allele_encoding import MultipleAlleleEncoding
 from .common import configure_logging
 from .local_parallelism import (
     worker_pool_with_gpu_assignments_from_args,
@@ -26,7 +28,6 @@ from .local_parallelism import (
 from .cluster_parallelism import (
     add_cluster_parallelism_args,
     cluster_results_from_args)
-from .regression_target import from_ic50
 
 
 # To avoid pickling large matrices to send to child processes when running in
@@ -47,8 +48,7 @@ parser.add_argument(
     "--monoallelic-data",
     metavar="FILE.csv",
     required=False,
-    help=(
-        "Affinity meaurements and monoallelic mass spec data."))
+    help="Affinity measurements and monoallelic mass spec data.")
 parser.add_argument(
     "--models-dir",
     metavar="DIR",
@@ -87,6 +87,8 @@ def run(argv=sys.argv[1:]):
 
     args = parser.parse_args(argv)
 
+    hyperparameters = yaml.safe_load(open(args.hyperparameters))
+
     args.out_affinity_predictor_dir = os.path.abspath(
         args.out_affinity_predictor_dir)
     args.out_presentation_predictor_dir = os.path.abspath(
@@ -94,7 +96,7 @@
 
     configure_logging(verbose=args.verbosity > 1)
 
-    multiallelic_df = pandas.read_csv(args.multiallelic_df)
+    multiallelic_df = pandas.read_csv(args.multiallelic_data)
     print("Loaded multiallelic data: %s" % (str(multiallelic_df.shape)))
 
     monoallelic_df = pandas.read_csv(args.monoallelic_data)
@@ -104,91 +106,69 @@
         args.models_dir, optimization_level=0)
     print("Loaded: %s" % input_predictor)
 
-    import ipdb ; ipdb.set_trace()
-
-
-
-
-
-
-    alleles = input_predictor.supported_alleles
-    (min_peptide_length, max_peptide_length) = (
-        input_predictor.supported_peptide_lengths)
-
-
-
-
-
-
-
-
-    metadata_dfs = {}
+    sample_table = multiallelic_df.drop_duplicates(
+        "sample_id").set_index("sample_id").loc[
+        multiallelic_df.sample_id.unique()
+    ]
+    grouped = multiallelic_df.groupby("sample_id").nunique()
+    for col in sample_table.columns:
+        if (grouped[col] > 1).any():
+            del sample_table[col]
+    sample_table["alleles"] = sample_table.hla.str.split()
 
-    fold_cols = [c for c in df if c.startswith("fold_")]
+    fold_cols = [c for c in monoallelic_df if c.startswith("fold_")]
     num_folds = len(fold_cols)
     if num_folds <= 1:
         raise ValueError("Too few folds: ", num_folds)
 
-    df = df.loc[
-        (df.peptide.str.len() >= min_peptide_length) &
-        (df.peptide.str.len() <= max_peptide_length)
-    ]
-    print("Subselected to %d-%dmers: %s" % (
-        min_peptide_length, max_peptide_length, str(df.shape)))
-
-    print("Num folds: ", num_folds, "fraction included:")
-    print(df[fold_cols].mean())
-
-    # Allele names in data are assumed to be already normalized.
-    df = df.loc[df.allele.isin(alleles)].dropna()
-    print("Subselected to supported alleles: %s" % str(df.shape))
-
-    metadata_dfs["model_selection_data"] = df
-
-    df["mass_spec"] = df.measurement_source.str.contains(
-        args.mass_spec_regex)
-
     def make_train_peptide_hash(sub_df):
         train_peptide_hash = hashlib.sha1()
         for peptide in sorted(sub_df.peptide.values):
             train_peptide_hash.update(peptide.encode())
         return train_peptide_hash.hexdigest()
 
-    folds_to_predictors = dict(
-        (int(col.split("_")[-1]), (
-            [],
-            make_train_peptide_hash(df.loc[df[col] == 1])))
-        for col in fold_cols)
-    print(folds_to_predictors)
+    work_items = []
     for model in input_predictor.class1_pan_allele_models:
         training_info = model.fit_info[-1]['training_info']
         fold_num = training_info['fold_num']
         assert num_folds == training_info['num_folds']
-        (lst, hash) = folds_to_predictors[fold_num]
-        train_peptide_hash = training_info['train_peptide_hash']
-        numpy.testing.assert_equal(hash, train_peptide_hash)
-        lst.append(model)
-
-    work_items = []
-    for (fold_num, (models, _)) in folds_to_predictors.items():
+        fold_col = "fold_%d" % fold_num
+        observed_hash = make_train_peptide_hash(
+            monoallelic_df.loc[monoallelic_df[fold_col] == 1])
+        saved_hash = training_info['train_peptide_hash']
+        #numpy.testing.assert_equal(observed_hash, saved_hash)
         work_items.append({
-            'fold_num': fold_num,
-            'models': models,
-            'min_models': args.min_models_per_fold,
-            'max_models': args.max_models_per_fold,
+            "work_item_num": len(work_items),
+            "affinity_model": model,
+            "fold_num": fold_num,
         })
 
-    GLOBAL_DATA["data"] = df
-    GLOBAL_DATA["input_predictor"] = input_predictor
+    work_items_dict = dict((item['work_item_num'], item) for item in work_items)
 
-    if not os.path.exists(args.out_models_dir):
-        print("Attempting to create directory: %s" % args.out_models_dir)
-        os.mkdir(args.out_models_dir)
- print("Done.") + GLOBAL_DATA["monoallelic_data"] = monoallelic_df + GLOBAL_DATA["multiallelic_data"] = multiallelic_df + GLOBAL_DATA["multiallelic_sample_table"] = sample_table + GLOBAL_DATA["hyperparameters"] = hyperparameters + GLOBAL_DATA["allele_to_sequence"] = input_predictor.allele_to_sequence - result_predictor = Class1AffinityPredictor( - allele_to_sequence=input_predictor.allele_to_sequence, - metadata_dataframes=metadata_dfs) + out_dirs = [ + args.out_affinity_predictor_dir, + args.out_presentation_predictor_dir + ] + + for out in out_dirs: + if not os.path.exists(out): + print("Attempting to create directory:", out) + os.mkdir(out) + print("Done.") + + metadata_dfs = { + "monoallelic_train_data": monoallelic_df, + "multiallelic_train_data": multiallelic_df, + } + + affinity_models = [] + presentation_models = [] serial_run = not args.cluster_parallelism and args.num_jobs == 0 worker_pool = None @@ -196,13 +176,13 @@ def run(argv=sys.argv[1:]): if serial_run: # Serial run print("Running in serial.") - results = (model_select(**item) for item in work_items) + results = (refine_model(**item) for item in work_items) elif args.cluster_parallelism: # Run using separate processes HPC cluster. print("Running on cluster.") results = cluster_results_from_args( args, - work_function=model_select, + work_function=refine_model, work_items=work_items, constant_data=GLOBAL_DATA, result_serialization_method="pickle") @@ -216,131 +196,109 @@ def run(argv=sys.argv[1:]): # Parallel run results = worker_pool.imap_unordered( - do_model_select_task, + do_refine_model_task, work_items, chunksize=1) - models_by_fold = {} - summary_dfs = [] for result in tqdm.tqdm(results, total=len(work_items)): - pprint(result) - fold_num = result['fold_num'] - (all_models_for_fold, _) = folds_to_predictors[fold_num] - models = [ - all_models_for_fold[i] - for i in result['selected_indices'] - ] - summary_df = result['summary'].copy() - summary_df.index = summary_df.index.map( - lambda idx: all_models_for_fold[idx]) - summary_dfs.append(summary_df) - - print("Selected %d models for fold %d: %s" % ( - len(models), fold_num, result['selected_indices'])) - models_by_fold[fold_num] = models - for model in models: - model.clear_allele_representations() - result_predictor.add_pan_allele_model(model) - - summary_df = pandas.concat(summary_dfs, ignore_index=False) - summary_df["model_config"] = summary_df.index.map(lambda m: m.get_config()) - result_predictor.metadata_dataframes["model_selection_summary"] = ( - summary_df.reset_index(drop=True)) - - result_predictor.save(args.out_models_dir) - - model_selection_time = time.time() - start + work_item_num = result['work_item_num'] + work_item = work_items_dict[work_item_num] + affinity_model = work_item['affinity_model'] + presentation_model = pickle.loads(result['presentation_model']) + presentation_model.copy_weights_to_affinity_model(affinity_model) + affinity_models.append(affinity_model) + presentation_models.append(presentation_model) if worker_pool: worker_pool.close() worker_pool.join() - print("Model selection time %0.2f min." % (model_selection_time / 60.0)) - print("Predictor [%d models] written to: %s" % ( - len(result_predictor.neural_networks), - args.out_models_dir)) + print("Done model fitting. 
Writing predictors.") + + result_affinity_predictor = Class1AffinityPredictor( + class1_pan_allele_models=affinity_models, + allele_to_sequence=input_predictor.allele_to_sequence) + result_affinity_predictor.save(args.out_affinity_predictor_dir) + print("Wrote", args.out_affinity_predictor_dir) + result_presentation_predictor = Class1PresentationPredictor( + models=presentation_models, + allele_to_sequence=input_predictor.allele_to_sequence, + metadata_dataframes=metadata_dfs) + result_presentation_predictor.save(args.out_presentation_predictor_dir) + print("Wrote", args.out_presentation_predictor_dir) -def do_model_select_task(item, constant_data=GLOBAL_DATA): - return model_select(constant_data=constant_data, **item) +def do_refine_model_task(item, constant_data=GLOBAL_DATA): + return refine_model(constant_data=constant_data, **item) -def model_select( - fold_num, models, min_models, max_models, constant_data=GLOBAL_DATA): + +def refine_model( + work_item_num, affinity_model, fold_num, constant_data=GLOBAL_DATA): """ - Model select for a fold. - - Parameters - ---------- - fold_num : int - models : list of Class1NeuralNetwork - min_models : int - max_models : int - constant_data : dict - - Returns - ------- - dict with keys 'fold_num', 'selected_indices', 'summary' + Refine a model. """ - full_data = constant_data["data"] - input_predictor = constant_data["input_predictor"] - df = full_data.loc[ - full_data["fold_%d" % fold_num] == 0 - ] - - peptides = EncodableSequences.create(df.peptide.values) - alleles = AlleleEncoding( - df.allele.values, - borrow_from=input_predictor.master_allele_encoding) - - predictions_df = df.copy() - for (i, model) in enumerate(models): - predictions_df[i] = from_ic50(model.predict(peptides, alleles)) - - actual = from_ic50(predictions_df.measurement_value) - - selected = [] - selected_score = 0 - remaining_models = set(numpy.arange(len(models))) - individual_model_scores = {} - while remaining_models and len(selected) < max_models: - best_model = None - best_model_score = 0 - for i in remaining_models: - possible_ensemble = list(selected) + [i] - predictions = predictions_df[possible_ensemble].mean(1) - mse_score = 1 - mse( - predictions, - actual, - inequalities=( - predictions_df.measurement_inequality - if 'measurement_inequality' in predictions_df.columns - else None), - affinities_are_already_01_transformed=True) - if mse_score >= best_model_score: - best_model = i - best_model_score = mse_score - if not selected: - # First iteration. Store individual model scores. 
-                individual_model_scores[i] = mse_score
-        if len(selected) < min_models or best_model_score > selected_score:
-            selected_score = best_model_score
-            remaining_models.remove(best_model)
-            selected.append(best_model)
-        else:
-            break
-
-    assert selected
-
-    summary_df = pandas.Series(individual_model_scores)[
-        numpy.arange(len(models))
-    ].to_frame()
-    summary_df.columns = ['mse_score']
+    monoallelic_df = constant_data["monoallelic_data"]
+    multiallelic_df = constant_data["multiallelic_data"]
+    sample_table = constant_data["multiallelic_sample_table"]
+    hyperparameters = constant_data["hyperparameters"]
+    allele_to_sequence = constant_data["allele_to_sequence"]
+
+    fold_col = "fold_%d" % fold_num
+
+    multiallelic_train_df = multiallelic_df[
+        ["sample_id", "peptide", "hit"]
+    ].rename(columns={"hit": "label"}).copy()
+    multiallelic_train_df["is_affinity"] = False
+    multiallelic_train_df["validation_weight"] = 1.0
+
+    monoallelic_train_df = monoallelic_df[
+        ["peptide", "allele", "measurement_inequality", "measurement_value"]
+    ].copy()
+    monoallelic_train_df["label"] = monoallelic_train_df["measurement_value"]
+    del monoallelic_train_df["measurement_value"]
+    monoallelic_train_df["is_affinity"] = True
+
+    # We force all validation affinities to be from the validation set used
+    # originally to train the predictor. To ensure proportional sampling
+    # between the affinity and multiallelic mass spec data, we set their
+    # weights as follows.
+    monoallelic_train_df["validation_weight"] = (
+        (monoallelic_df[fold_col] == 0).astype(float) * (
+            (monoallelic_df[fold_col] == 1).sum() /
+            (monoallelic_df[fold_col] == 0).sum()))
+
+    combined_train_df = pandas.concat(
+        [multiallelic_train_df, monoallelic_train_df],
+        ignore_index=True,
+        sort=False)
+
+    allele_encoding = MultipleAlleleEncoding(
+        experiment_names=multiallelic_train_df.sample_id.values,
+        experiment_to_allele_list=sample_table.alleles.to_dict(),
+        allele_to_sequence=allele_to_sequence,
+    )
+    allele_encoding.append_alleles(monoallelic_train_df.allele.values)
+    allele_encoding = allele_encoding.compact()
+
+    presentation_model = Class1PresentationNeuralNetwork(**hyperparameters)
+    presentation_model.load_from_class1_neural_network(affinity_model)
+    presentation_model.fit(
+        peptides=combined_train_df.peptide.values,
+        labels=combined_train_df.label.values,
+        allele_encoding=allele_encoding,
+        affinities_mask=combined_train_df.is_affinity.values,
+        inequalities=combined_train_df.measurement_inequality.values,
+        validation_weights=combined_train_df.validation_weight.values)
 
     return {
-        'fold_num': fold_num,
-        'selected_indices': selected,
-        'summary': summary_df,  # indexed by model index
+        'work_item_num': work_item_num,
+
+        # We pickle it here so it always gets pickled, even when running in one
+        # process. This prevents tensorflow errors when using thread-level
+        # parallelism.
+        'presentation_model': pickle.dumps(
+            presentation_model, pickle.HIGHEST_PROTOCOL),
     }
-- 
GitLab
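
The validation_weight expression in refine_model gives each held-out
(fold == 0) monoallelic row the weight (# fold == 1 rows) / (# fold == 0 rows),
so training rows get weight zero and the monoallelic data's total validation
weight equals its training-set size, keeping it proportionally represented
against the multiallelic rows (weight 1.0 each) when the batch generator
samples the validation split. A toy check of the arithmetic (illustrative
only):

    import pandas

    fold = pandas.Series([1, 1, 1, 0, 0])  # 3 training rows, 2 held-out rows
    weight = (fold == 0).astype(float) * (
        (fold == 1).sum() / (fold == 0).sum())
    # Held-out rows each get weight 1.5; training rows get 0 and can never be
    # drawn into the validation set.
    assert weight.tolist() == [0.0, 0.0, 0.0, 1.5, 1.5]
    assert weight.sum() == (fold == 1).sum()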