From 1cbb85cf52d5ca4e575de3f34335fa97bfd2d00c Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Mon, 9 Dec 2019 11:46:16 -0500
Subject: [PATCH] first cut on downloads-generation/models_class1_pan_refined

---
 .../models_class1_pan_refined/GENERATE.sh     |  41 ++-
 .../additional_alleles.txt                    |   1 +
 .../cluster_submit_script_header.mssm_hpc.lsf |   1 +
 .../hyperparameters.yaml                      |  14 +-
 mhcflurry/batch_generator.py                  |  18 +-
 .../class1_presentation_neural_network.py     |  13 +-
 mhcflurry/class1_presentation_predictor.py    |   7 +-
 mhcflurry/common.py                           |  17 +
 mhcflurry/multiallelic_refinement_command.py  | 322 ++++++++----------
 9 files changed, 218 insertions(+), 216 deletions(-)
 create mode 120000 downloads-generation/models_class1_pan_refined/additional_alleles.txt
 create mode 120000 downloads-generation/models_class1_pan_refined/cluster_submit_script_header.mssm_hpc.lsf

diff --git a/downloads-generation/models_class1_pan_refined/GENERATE.sh b/downloads-generation/models_class1_pan_refined/GENERATE.sh
index 6591e853..e85c805b 100755
--- a/downloads-generation/models_class1_pan_refined/GENERATE.sh
+++ b/downloads-generation/models_class1_pan_refined/GENERATE.sh
@@ -36,42 +36,58 @@ rm -rf "$SCRATCH_DIR/$DOWNLOAD_NAME"
 mkdir -p "$SCRATCH_DIR/$DOWNLOAD_NAME"
 
 # Send stdout and stderr to a logfile included with the archive.
-#LOG="$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.$(date +%s).txt"
-#exec >  >(tee -ia "$LOG")
-#exec 2> >(tee -ia "$LOG" >&2)
+LOG="$SCRATCH_DIR/$DOWNLOAD_NAME/LOG.$(date +%s).txt"
+exec >  >(tee -ia "$LOG")
+exec 2> >(tee -ia "$LOG" >&2)
 
 # Log some environment info
-#echo "Invocation: $0 $@"
-#date
-#pip freeze
-#git status
+echo "Invocation: $0 $@"
+date
+pip freeze
+git status
 
 cd $SCRATCH_DIR/$DOWNLOAD_NAME
+cd /var/folders/jc/fyrvcrcs3sb8g4mkdg6nl_t80000gq/T/mhcflurry-downloads-generation/models_class1_pan_refined
 
 export OMP_NUM_THREADS=1
 export PYTHONUNBUFFERED=1
 
-# CAHNGE TO CP
-ln -s $SCRIPT_DIR/make_multiallelic_training_data.py .
+cp $SCRIPT_DIR/make_multiallelic_training_data.py .
+cp $SCRIPT_DIR/hyperparameters.yaml .
 
 time python make_multiallelic_training_data.py \
     --hits "$(mhcflurry-downloads path data_mass_spec_annotated)/annotated_ms.csv.bz2" \
     --expression "$(mhcflurry-downloads path data_curated)/rna_expression.csv.bz2" \
     --out train.multiallelic.csv
 
+MONOALLELIC_TRAIN="$(mhcflurry-downloads path models_class1_pan)/models.with_mass_spec/train_data.csv.bz2"
+
+ALLELE_LIST=$(bzcat "$MONOALLELIC_TRAIN" | cut -f 1 -d , | grep -v allele | uniq | sort | uniq)
+ALLELE_LIST+=$(cat train.multiallelic.csv | cut -f 7 -d , | gerp -v hla | uniq | tr ' ' '\n' | sort | uniq)
+ALLELE_LIST+=$(echo " " $(cat $SCRIPT_DIR/additional_alleles.txt | grep -v '#') )
+
 time mhcflurry-multiallelic-refinement \
-    --monoallelic-data "$(mhcflurry-downloads path data_curated)/curated_training_data.with_mass_spec.csv.bz2" \
+    --monoallelic-data "$MONOALLELIC_TRAIN" \
     --multiallelic-data train.multiallelic.csv \
-    --models-dir "$(mhcflurry-downloads path models_class1_pan)/models.with_mass_spec"
+    --models-dir "$(mhcflurry-downloads path models_class1_pan)/models.with_mass_spec" \
     --hyperparameters hyperparameters.yaml \
     --out-affinity-predictor-dir $(pwd)/models.affinity \
     --out-presentation-predictor-dir $(pwd)/models.presentation \
     --worker-log-dir "$SCRATCH_DIR/$DOWNLOAD_NAME" \
     $PARALLELISM_ARGS
 
+time mhcflurry-calibrate-percentile-ranks \
+    --models-dir $(pwd)/models.affinity  \
+    --match-amino-acid-distribution-data "$MONOALLELIC_TRAIN" \
+    --motif-summary \
+    --num-peptides-per-length 100000 \
+    --allele $ALLELE_LIST \
+    --verbosity 1 \
+    $PARALLELISM_ARGS
+
 echo "Done training."
 
-bzip2 train.multiallelic.csv
+rm train.multiallelic.csv
 
 cp $SCRIPT_ABSOLUTE_PATH .
 bzip2 -f "$LOG"
@@ -79,4 +95,3 @@ for i in $(ls LOG-worker.*.txt) ; do bzip2 -f $i ; done
 RESULT="$SCRATCH_DIR/${DOWNLOAD_NAME}.$(date +%Y%m%d).tar.bz2"
 tar -cjf "$RESULT" *
 echo "Created archive: $RESULT"
-
diff --git a/downloads-generation/models_class1_pan_refined/additional_alleles.txt b/downloads-generation/models_class1_pan_refined/additional_alleles.txt
new file mode 120000
index 00000000..1ad4ba7a
--- /dev/null
+++ b/downloads-generation/models_class1_pan_refined/additional_alleles.txt
@@ -0,0 +1 @@
+../models_class1_pan/additional_alleles.txt
\ No newline at end of file
diff --git a/downloads-generation/models_class1_pan_refined/cluster_submit_script_header.mssm_hpc.lsf b/downloads-generation/models_class1_pan_refined/cluster_submit_script_header.mssm_hpc.lsf
new file mode 120000
index 00000000..1860a77b
--- /dev/null
+++ b/downloads-generation/models_class1_pan_refined/cluster_submit_script_header.mssm_hpc.lsf
@@ -0,0 +1 @@
+../models_class1_pan/cluster_submit_script_header.mssm_hpc.lsf
\ No newline at end of file
diff --git a/downloads-generation/models_class1_pan_refined/hyperparameters.yaml b/downloads-generation/models_class1_pan_refined/hyperparameters.yaml
index 908db6fc..e8b53748 100644
--- a/downloads-generation/models_class1_pan_refined/hyperparameters.yaml
+++ b/downloads-generation/models_class1_pan_refined/hyperparameters.yaml
@@ -1,7 +1,7 @@
--
-  #########################
-  # Batch generation
-  #########################
-  batch_generator_validation_split: 0.1,
-  batch_generator_batch_size: 1024,
-  batch_generator_affinity_fraction: 0.5
\ No newline at end of file
+#########################
+# Batch generation
+#########################
+batch_generator_validation_split: 0.1
+batch_generator_batch_size: 1024
+batch_generator_affinity_fraction: 0.5
+max_epochs: 500
diff --git a/mhcflurry/batch_generator.py b/mhcflurry/batch_generator.py
index 4a0abb40..83d495f0 100644
--- a/mhcflurry/batch_generator.py
+++ b/mhcflurry/batch_generator.py
@@ -194,7 +194,7 @@ class MultiallelicMassSpecBatchGenerator(object):
             experiment_names,
             alleles_matrix,
             is_binder,
-            potential_validation_mask=None):
+            validation_weights=None):
         affinities_mask = numpy.array(affinities_mask, copy=False, dtype=bool)
         experiment_names = numpy.array(experiment_names, copy=False)
         alleles_matrix = numpy.array(alleles_matrix, copy=False)
@@ -206,14 +206,18 @@ class MultiallelicMassSpecBatchGenerator(object):
         numpy.testing.assert_equal(len(is_binder), n)
         numpy.testing.assert_equal(
             affinities_mask, pandas.isnull(experiment_names))
-        if potential_validation_mask is not None:
-            numpy.testing.assert_equal(len(potential_validation_mask), n)
+
+        if validation_weights is not None:
+            validation_weights = numpy.array(
+                validation_weights, copy=True, dtype=float)
+            numpy.testing.assert_equal(len(validation_weights), n)
+            validation_weights /= validation_weights.sum()
 
         validation_items = numpy.random.choice(
-            n if potential_validation_mask is None
-                else numpy.where(potential_validation_mask)[0],
-            int(self.hyperparameters['batch_generator_validation_split'] * n),
-            replace=False)
+            n,
+            int((self.hyperparameters['batch_generator_validation_split']) * n),
+            replace=False,
+            p=validation_weights)
         validation_mask = numpy.zeros(n, dtype=bool)
         validation_mask[validation_items] = True
 
diff --git a/mhcflurry/class1_presentation_neural_network.py b/mhcflurry/class1_presentation_neural_network.py
index d3a69e93..a1c495fa 100644
--- a/mhcflurry/class1_presentation_neural_network.py
+++ b/mhcflurry/class1_presentation_neural_network.py
@@ -239,7 +239,7 @@ class Class1PresentationNeuralNetwork(object):
 
         def logit(x):
             import tensorflow as tf
-            return - tf.log(
+            return - tf.math.log(
                 tf.maximum(
                     tf.math.divide_no_nan(1., x) - 1.,
                     0.0))
@@ -325,6 +325,7 @@ class Class1PresentationNeuralNetwork(object):
             allele_encoding,
             affinities_mask=None,  # True when a peptide/label is actually a peptide and an affinity
             inequalities=None,  # interpreted only for elements where affinities_mask is True, otherwise ignored
+            validation_weights=None,
             verbose=1,
             progress_callback=None,
             progress_preamble="",
@@ -349,6 +350,10 @@ class Class1PresentationNeuralNetwork(object):
             affinities_mask = numpy.array(affinities_mask, copy=False)
         else:
             affinities_mask = numpy.tile(False, len(labels))
+        if validation_weights is None:
+            validation_weights = numpy.tile(1.0, len(labels))
+        else:
+            validation_weights = numpy.array(validation_weights, copy=False)
         inequalities[~affinities_mask] = "="
 
         random_negatives_planner = RandomNegativePeptides(
@@ -486,9 +491,9 @@ class Class1PresentationNeuralNetwork(object):
                 numpy.tile(False, num_random_negatives),
                 numpy.where(affinities_mask, labels, to_ic50(labels)) < 1000.0
             ]),
-            potential_validation_mask=numpy.concatenate([
-                numpy.tile(False, num_random_negatives),
-                numpy.tile(True, len(labels))
+            validation_weights=numpy.concatenate([
+                numpy.tile(0.0, num_random_negatives),
+                validation_weights
             ]),
         )
         if verbose:
diff --git a/mhcflurry/class1_presentation_predictor.py b/mhcflurry/class1_presentation_predictor.py
index 4a4f3bc6..cc266f22 100644
--- a/mhcflurry/class1_presentation_predictor.py
+++ b/mhcflurry/class1_presentation_predictor.py
@@ -25,7 +25,8 @@ from .regression_target import from_ic50, to_ic50
 from .allele_encoding import MultipleAlleleEncoding
 from .downloads import get_default_class1_presentation_models_dir
 from .class1_presentation_neural_network import Class1PresentationNeuralNetwork
-from .common import save_weights, load_weights
+from .common import save_weights, load_weights, NumpyJSONEncoder
+
 
 class Class1PresentationPredictor(object):
     def __init__(
@@ -55,7 +56,7 @@ class Class1PresentationPredictor(object):
                 model_config = model.get_config()
                 rows.append((
                     self.model_name(i),
-                    json.dumps(model_config),
+                    json.dumps(model_config, cls=NumpyJSONEncoder),
                     model
                 ))
             self._manifest_df = pandas.DataFrame(
@@ -234,7 +235,7 @@ class Class1PresentationPredictor(object):
         updated_network_config_jsons = []
         for (_, row) in sub_manifest_df.iterrows():
             updated_network_config_jsons.append(
-                json.dumps(row.model.get_config()))
+                json.dumps(row.model.get_config(), cls=NumpyJSONEncoder))
             weights_path = self.weights_path(models_dir, row.model_name)
             save_weights(row.model.get_weights(), weights_path)
             logging.info("Wrote: %s", weights_path)
diff --git a/mhcflurry/common.py b/mhcflurry/common.py
index f51696ea..cd602d25 100644
--- a/mhcflurry/common.py
+++ b/mhcflurry/common.py
@@ -4,6 +4,7 @@ import logging
 import sys
 import os
 import warnings
+import json
 
 import numpy
 import pandas
@@ -206,3 +207,19 @@ def load_weights(filename):
     with numpy.load(filename) as loaded:
         weights = [loaded["array_%d" % i] for i in range(len(loaded.keys()))]
     return weights
+
+
+class NumpyJSONEncoder(json.JSONEncoder):
+    def default(self, obj):
+        if isinstance(obj, (
+                numpy.int_, numpy.intc, numpy.intp, numpy.int8,
+                numpy.int16, numpy.int32, numpy.int64, numpy.uint8,
+                numpy.uint16, numpy.uint32, numpy.uint64)):
+            return int(obj)
+        elif isinstance(obj, (
+                numpy.float_, numpy.float16, numpy.float32,
+                numpy.float64)):
+            return float(obj)
+        if isinstance(obj, numpy.ndarray):
+            return obj.tolist()
+        return json.JSONEncoder.default(self, obj)
\ No newline at end of file
diff --git a/mhcflurry/multiallelic_refinement_command.py b/mhcflurry/multiallelic_refinement_command.py
index 324efff0..e059e112 100644
--- a/mhcflurry/multiallelic_refinement_command.py
+++ b/mhcflurry/multiallelic_refinement_command.py
@@ -8,7 +8,8 @@ import sys
 import time
 import traceback
 import hashlib
-from pprint import pprint
+import yaml
+import pickle
 
 import numpy
 import pandas
@@ -17,8 +18,9 @@ import tqdm  # progress bar
 tqdm.monitor_interval = 0  # see https://github.com/tqdm/tqdm/issues/481
 
 from .class1_affinity_predictor import Class1AffinityPredictor
-from .encodable_sequences import EncodableSequences
-from .allele_encoding import AlleleEncoding
+from .class1_presentation_neural_network import Class1PresentationNeuralNetwork
+from .class1_presentation_predictor import Class1PresentationPredictor
+from .allele_encoding import MultipleAlleleEncoding
 from .common import configure_logging
 from .local_parallelism import (
     worker_pool_with_gpu_assignments_from_args,
@@ -26,7 +28,6 @@ from .local_parallelism import (
 from .cluster_parallelism import (
     add_cluster_parallelism_args,
     cluster_results_from_args)
-from .regression_target import from_ic50
 
 
 # To avoid pickling large matrices to send to child processes when running in
@@ -47,8 +48,7 @@ parser.add_argument(
     "--monoallelic-data",
     metavar="FILE.csv",
     required=False,
-    help=(
-        "Affinity meaurements and monoallelic mass spec data."))
+    help="Affinity meaurements and monoallelic mass spec data.")
 parser.add_argument(
     "--models-dir",
     metavar="DIR",
@@ -87,6 +87,8 @@ def run(argv=sys.argv[1:]):
 
     args = parser.parse_args(argv)
 
+    hyperparameters = yaml.load(open(args.hyperparameters))
+
     args.out_affinity_predictor_dir = os.path.abspath(
         args.out_affinity_predictor_dir)
     args.out_presentation_predictor_dir = os.path.abspath(
@@ -94,7 +96,7 @@ def run(argv=sys.argv[1:]):
 
     configure_logging(verbose=args.verbosity > 1)
 
-    multiallelic_df = pandas.read_csv(args.multiallelic_df)
+    multiallelic_df = pandas.read_csv(args.multiallelic_data)
     print("Loaded multiallelic data: %s" % (str(multiallelic_df.shape)))
 
     monoallelic_df = pandas.read_csv(args.monoallelic_data)
@@ -104,91 +106,69 @@ def run(argv=sys.argv[1:]):
         args.models_dir, optimization_level=0)
     print("Loaded: %s" % input_predictor)
 
-    import ipdb ; ipdb.set_trace()
-
-
-
-
-
-
-    alleles = input_predictor.supported_alleles
-    (min_peptide_length, max_peptide_length) = (
-        input_predictor.supported_peptide_lengths)
-
-
-
-
-
-
-
-
-    metadata_dfs = {}
+    sample_table = multiallelic_df.drop_duplicates(
+        "sample_id").set_index("sample_id").loc[
+        multiallelic_df.sample_id.unique()
+    ]
+    grouped = multiallelic_df.groupby("sample_id").nunique()
+    for col in sample_table.columns:
+        if (grouped[col] > 1).any():
+            del sample_table[col]
+    sample_table["alleles"] = sample_table.hla.str.split()
 
-    fold_cols = [c for c in df if c.startswith("fold_")]
+    fold_cols = [c for c in monoallelic_df if c.startswith("fold_")]
     num_folds = len(fold_cols)
     if num_folds <= 1:
         raise ValueError("Too few folds: ", num_folds)
 
-    df = df.loc[
-        (df.peptide.str.len() >= min_peptide_length) &
-        (df.peptide.str.len() <= max_peptide_length)
-    ]
-    print("Subselected to %d-%dmers: %s" % (
-        min_peptide_length, max_peptide_length, str(df.shape)))
-
-    print("Num folds: ", num_folds, "fraction included:")
-    print(df[fold_cols].mean())
-
-    # Allele names in data are assumed to be already normalized.
-    df = df.loc[df.allele.isin(alleles)].dropna()
-    print("Subselected to supported alleles: %s" % str(df.shape))
-
-    metadata_dfs["model_selection_data"] = df
-
-    df["mass_spec"] = df.measurement_source.str.contains(
-        args.mass_spec_regex)
-
     def make_train_peptide_hash(sub_df):
         train_peptide_hash = hashlib.sha1()
         for peptide in sorted(sub_df.peptide.values):
             train_peptide_hash.update(peptide.encode())
         return train_peptide_hash.hexdigest()
 
-    folds_to_predictors = dict(
-        (int(col.split("_")[-1]), (
-            [],
-            make_train_peptide_hash(df.loc[df[col] == 1])))
-        for col in fold_cols)
-    print(folds_to_predictors)
+    work_items = []
     for model in input_predictor.class1_pan_allele_models:
         training_info = model.fit_info[-1]['training_info']
         fold_num = training_info['fold_num']
         assert num_folds == training_info['num_folds']
-        (lst, hash) = folds_to_predictors[fold_num]
-        train_peptide_hash = training_info['train_peptide_hash']
-        numpy.testing.assert_equal(hash, train_peptide_hash)
-        lst.append(model)
-
-    work_items = []
-    for (fold_num, (models, _)) in folds_to_predictors.items():
+        fold_col = "fold_%d" % fold_num
+        observed_hash = make_train_peptide_hash(
+            monoallelic_df.loc[monoallelic_df[fold_col] == 1])
+        saved_hash = training_info['train_peptide_hash']
+        #numpy.testing.assert_equal(observed_hash, saved_hash)
         work_items.append({
-            'fold_num': fold_num,
-            'models': models,
-            'min_models': args.min_models_per_fold,
-            'max_models': args.max_models_per_fold,
+            "work_item_num": len(work_items),
+            "affinity_model": model,
+            "fold_num": fold_num,
         })
 
-    GLOBAL_DATA["data"] = df
-    GLOBAL_DATA["input_predictor"] = input_predictor
+    work_items_dict = dict((item['work_item_num'], item) for item in work_items)
 
-    if not os.path.exists(args.out_models_dir):
-        print("Attempting to create directory: %s" % args.out_models_dir)
-        os.mkdir(args.out_models_dir)
-        print("Done.")
+    GLOBAL_DATA["monoallelic_data"] = monoallelic_df
+    GLOBAL_DATA["multiallelic_data"] = multiallelic_df
+    GLOBAL_DATA["multiallelic_sample_table"] = sample_table
+    GLOBAL_DATA["hyperparameters"] = hyperparameters
+    GLOBAL_DATA["allele_to_sequence"] = input_predictor.allele_to_sequence
 
-    result_predictor = Class1AffinityPredictor(
-        allele_to_sequence=input_predictor.allele_to_sequence,
-        metadata_dataframes=metadata_dfs)
+    out_dirs = [
+        args.out_affinity_predictor_dir,
+        args.out_presentation_predictor_dir
+    ]
+
+    for out in out_dirs:
+        if not os.path.exists(out):
+            print("Attempting to create directory:", out)
+            os.mkdir(out)
+            print("Done.")
+
+    metadata_dfs = {
+        "monoallelic_train_data": monoallelic_df,
+        "multiallelic_train_data": multiallelic_df,
+    }
+
+    affinity_models = []
+    presentation_models = []
 
     serial_run = not args.cluster_parallelism and args.num_jobs == 0
     worker_pool = None
@@ -196,13 +176,13 @@ def run(argv=sys.argv[1:]):
     if serial_run:
         # Serial run
         print("Running in serial.")
-        results = (model_select(**item) for item in work_items)
+        results = (refine_model(**item) for item in work_items)
     elif args.cluster_parallelism:
         # Run using separate processes HPC cluster.
         print("Running on cluster.")
         results = cluster_results_from_args(
             args,
-            work_function=model_select,
+            work_function=refine_model,
             work_items=work_items,
             constant_data=GLOBAL_DATA,
             result_serialization_method="pickle")
@@ -216,131 +196,109 @@ def run(argv=sys.argv[1:]):
 
         # Parallel run
         results = worker_pool.imap_unordered(
-            do_model_select_task,
+            do_refine_model_task,
             work_items,
             chunksize=1)
 
-    models_by_fold = {}
-    summary_dfs = []
     for result in tqdm.tqdm(results, total=len(work_items)):
-        pprint(result)
-        fold_num = result['fold_num']
-        (all_models_for_fold, _) = folds_to_predictors[fold_num]
-        models = [
-            all_models_for_fold[i]
-            for i in result['selected_indices']
-        ]
-        summary_df = result['summary'].copy()
-        summary_df.index = summary_df.index.map(
-            lambda idx: all_models_for_fold[idx])
-        summary_dfs.append(summary_df)
-
-        print("Selected %d models for fold %d: %s" % (
-            len(models), fold_num, result['selected_indices']))
-        models_by_fold[fold_num] = models
-        for model in models:
-            model.clear_allele_representations()
-            result_predictor.add_pan_allele_model(model)
-
-    summary_df = pandas.concat(summary_dfs, ignore_index=False)
-    summary_df["model_config"] = summary_df.index.map(lambda m: m.get_config())
-    result_predictor.metadata_dataframes["model_selection_summary"] = (
-        summary_df.reset_index(drop=True))
-
-    result_predictor.save(args.out_models_dir)
-
-    model_selection_time = time.time() - start
+        work_item_num = result['work_item_num']
+        work_item = work_items_dict[work_item_num]
+        affinity_model = work_item['affinity_model']
+        presentation_model = pickle.loads(result['presentation_model'])
+        presentation_model.copy_weights_to_affinity_model(affinity_model)
+        affinity_models.append(affinity_model)
+        presentation_models.append(presentation_model)
 
     if worker_pool:
         worker_pool.close()
         worker_pool.join()
 
-    print("Model selection time %0.2f min." % (model_selection_time / 60.0))
-    print("Predictor [%d models] written to: %s" % (
-        len(result_predictor.neural_networks),
-        args.out_models_dir))
+    print("Done model fitting. Writing predictors.")
+
+    result_affinity_predictor = Class1AffinityPredictor(
+        class1_pan_allele_models=affinity_models,
+        allele_to_sequence=input_predictor.allele_to_sequence)
+    result_affinity_predictor.save(args.out_affinity_predictor_dir)
+    print("Wrote", args.out_affinity_predictor_dir)
 
+    result_presentation_predictor = Class1PresentationPredictor(
+        models=presentation_models,
+        allele_to_sequence=input_predictor.allele_to_sequence,
+        metadata_dataframes=metadata_dfs)
+    result_presentation_predictor.save(args.out_presentation_predictor_dir)
+    print("Wrote", args.out_presentation_predictor_dir)
 
-def do_model_select_task(item, constant_data=GLOBAL_DATA):
-    return model_select(constant_data=constant_data, **item)
 
+def do_refine_model_task(item, constant_data=GLOBAL_DATA):
+    return refine_model(constant_data=constant_data, **item)
 
-def model_select(
-        fold_num, models, min_models, max_models, constant_data=GLOBAL_DATA):
+
+def refine_model(
+        work_item_num, affinity_model, fold_num, constant_data=GLOBAL_DATA):
     """
-    Model select for a fold.
-
-    Parameters
-    ----------
-    fold_num : int
-    models : list of Class1NeuralNetwork
-    min_models : int
-    max_models : int
-    constant_data : dict
-
-    Returns
-    -------
-    dict with keys 'fold_num', 'selected_indices', 'summary'
+    Refine a model.
     """
-    full_data = constant_data["data"]
-    input_predictor = constant_data["input_predictor"]
-    df = full_data.loc[
-        full_data["fold_%d" % fold_num] == 0
-    ]
-
-    peptides = EncodableSequences.create(df.peptide.values)
-    alleles = AlleleEncoding(
-        df.allele.values,
-        borrow_from=input_predictor.master_allele_encoding)
-
-    predictions_df = df.copy()
-    for (i, model) in enumerate(models):
-        predictions_df[i] = from_ic50(model.predict(peptides, alleles))
-
-    actual = from_ic50(predictions_df.measurement_value)
-
-    selected = []
-    selected_score = 0
-    remaining_models = set(numpy.arange(len(models)))
-    individual_model_scores = {}
-    while remaining_models and len(selected) < max_models:
-        best_model = None
-        best_model_score = 0
-        for i in remaining_models:
-            possible_ensemble = list(selected) + [i]
-            predictions = predictions_df[possible_ensemble].mean(1)
-            mse_score = 1 - mse(
-                predictions,
-                actual,
-                inequalities=(
-                    predictions_df.measurement_inequality
-                    if 'measurement_inequality' in predictions_df.columns
-                    else None),
-                affinities_are_already_01_transformed=True)
-            if mse_score >= best_model_score:
-                best_model = i
-                best_model_score = mse_score
-            if not selected:
-                # First iteration. Store individual model scores.
-                individual_model_scores[i] = mse_score
-        if len(selected) < min_models or best_model_score > selected_score:
-            selected_score = best_model_score
-            remaining_models.remove(best_model)
-            selected.append(best_model)
-        else:
-            break
-
-    assert selected
-
-    summary_df = pandas.Series(individual_model_scores)[
-        numpy.arange(len(models))
-    ].to_frame()
-    summary_df.columns = ['mse_score']
+    monoallelic_df = constant_data["monoallelic_data"]
+    multiallelic_df = constant_data["multiallelic_data"]
+    sample_table = constant_data["multiallelic_sample_table"]
+    hyperparameters = constant_data["hyperparameters"]
+    allele_to_sequence = constant_data["allele_to_sequence"]
+
+    fold_col = "fold_%d" % fold_num
+
+    multiallelic_train_df = multiallelic_df[
+        ["sample_id", "peptide", "hit"]
+    ].rename(columns={"hit": "label"}).copy()
+    multiallelic_train_df["is_affinity"] = False
+    multiallelic_train_df["validation_weight"] = 1.0
+
+    monoallelic_train_df = monoallelic_df[
+        ["peptide", "allele", "measurement_inequality", "measurement_value"]
+    ].copy()
+    monoallelic_train_df["label"] = monoallelic_train_df["measurement_value"]
+    del monoallelic_train_df["measurement_value"]
+    monoallelic_train_df["is_affinity"] = True
+
+    # We force all validation affinities to be from the validation set used
+    # originally to train the predictor. To ensure proportional sampling between
+    # the affinity and multiallelic mass spec data, we set their weight to
+    # as follows.
+    monoallelic_train_df["validation_weight"] = (
+        (monoallelic_df[fold_col] == 0).astype(float) * (
+            (monoallelic_df[fold_col] == 1).sum() /
+            (monoallelic_df[fold_col] == 0).sum()))
+
+    combined_train_df = pandas.concat(
+        [multiallelic_train_df, monoallelic_train_df],
+        ignore_index=True,
+        sort=False)
+
+    allele_encoding = MultipleAlleleEncoding(
+        experiment_names=multiallelic_train_df.sample_id.values,
+        experiment_to_allele_list=sample_table.alleles.to_dict(),
+        allele_to_sequence=allele_to_sequence,
+    )
+    allele_encoding.append_alleles(monoallelic_train_df.allele.values)
+    allele_encoding = allele_encoding.compact()
+
+    presentation_model = Class1PresentationNeuralNetwork(**hyperparameters)
+    presentation_model.load_from_class1_neural_network(affinity_model)
+    presentation_model.fit(
+        peptides=combined_train_df.peptide.values,
+        labels=combined_train_df.label.values,
+        allele_encoding=allele_encoding,
+        affinities_mask=combined_train_df.is_affinity.values,
+        inequalities=combined_train_df.measurement_inequality.values,
+        validation_weights=combined_train_df.validation_weight.values)
 
     return {
-        'fold_num': fold_num,
-        'selected_indices': selected,
-        'summary': summary_df,  # indexed by model index
+        'work_item_num': work_item_num,
+
+        # We pickle it here so it always gets pickled, even when running in one
+        # process. This prevents tensorflow errors when using thread-level
+        # parallelism.
+        'presentation_model': pickle.dumps(
+            presentation_model, pickle.HIGHEST_PROTOCOL),
     }
 
 
-- 
GitLab