From 7a688ba54251e23f8599516223e462a88b64867a Mon Sep 17 00:00:00 2001
From: Tim O'Donnell <timodonnell@gmail.com>
Date: Mon, 8 Jul 2019 17:20:08 -0400
Subject: [PATCH] basic support for pan allele model selection

---
 .../calibrate_percentile_ranks_command.py     |   5 -
 mhcflurry/parallelism.py                      |   8 +-
 mhcflurry/select_pan_allele_models_command.py | 609 +++++------
 mhcflurry/train_pan_allele_models_command.py  |   2 +-
 4 files changed, 176 insertions(+), 448 deletions(-)

diff --git a/mhcflurry/calibrate_percentile_ranks_command.py b/mhcflurry/calibrate_percentile_ranks_command.py
index 4a986e5c..74dee9d4 100644
--- a/mhcflurry/calibrate_percentile_ranks_command.py
+++ b/mhcflurry/calibrate_percentile_ranks_command.py
@@ -7,13 +7,8 @@ import signal
 import sys
 import time
 import traceback
-import random
 from functools import partial
 
-import numpy
-import pandas
-import yaml
-from sklearn.metrics.pairwise import cosine_similarity
 from mhcnames import normalize_allele_name
 import tqdm  # progress bar
 tqdm.monitor_interval = 0  # see https://github.com/tqdm/tqdm/issues/481
diff --git a/mhcflurry/parallelism.py b/mhcflurry/parallelism.py
index 0e652a96..3786b9c6 100644
--- a/mhcflurry/parallelism.py
+++ b/mhcflurry/parallelism.py
@@ -72,14 +72,12 @@ def worker_pool_with_gpu_assignments(
         max_tasks_per_worker=None,
         worker_log_dir=None):
 
-    num_workers = num_jobs if num_jobs else cpu_count()
-
-    if num_workers == 0:
+    if num_jobs == 0:
         if backend:
             set_keras_backend(backend)
         return None
 
-    worker_init_kwargs = [{} for _ in range(num_workers)]
+    worker_init_kwargs = [{} for _ in range(num_jobs)]
     if num_gpus:
         print("Attempting to round-robin assign each worker a GPU.")
         if backend != "tensorflow-default":
@@ -115,7 +113,7 @@ def worker_pool_with_gpu_assignments(
             kwargs["worker_log_dir"] = worker_log_dir
 
     worker_pool = make_worker_pool(
-        processes=num_workers,
+        processes=num_jobs,
         initializer=worker_init,
         initializer_kwargs_per_process=worker_init_kwargs,
         max_tasks_per_worker=max_tasks_per_worker)
diff --git a/mhcflurry/select_pan_allele_models_command.py b/mhcflurry/select_pan_allele_models_command.py
index 510016bf..b7d2e5e3 100644
--- a/mhcflurry/select_pan_allele_models_command.py
+++ b/mhcflurry/select_pan_allele_models_command.py
@@ -8,19 +8,19 @@ import sys
 import time
 import traceback
 import random
+import hashlib
 from pprint import pprint
 
 import numpy
 import pandas
 from scipy.stats import kendalltau, percentileofscore, pearsonr
-from sklearn.metrics import roc_auc_score
-from mhcnames import normalize_allele_name
 import tqdm  # progress bar
 tqdm.monitor_interval = 0  # see https://github.com/tqdm/tqdm/issues/481
 
 from .class1_affinity_predictor import Class1AffinityPredictor
 from .encodable_sequences import EncodableSequences
+from .allele_encoding import AlleleEncoding
 from .common import configure_logging, random_peptides
 from .parallelism import worker_pool_with_gpu_assignments_from_args, add_worker_pool_args
 from .regression_target import from_ic50
@@ -43,6 +43,11 @@ parser.add_argument(
     help=(
         "Model selection data CSV. Expected columns: "
         "allele, peptide, measurement_value"))
+parser.add_argument(
+    "--folds",
+    metavar="FILE.csv",
+    required=False,
+    help="Cross-validation folds CSV: allele, peptide, fold_0 ... fold_N")
 parser.add_argument(
     "--models-dir",
     metavar="DIR",
@@ -54,17 +59,17 @@ parser.add_argument(
     required=True,
     help="Directory to write selected models")
 parser.add_argument(
-    "--min-models",
+    "--min-models-per-fold",
     type=int,
-    default=8,
+    default=2,
     metavar="N",
-    help="Min number of models to select")
+    help="Min number of models to select per fold")
 parser.add_argument(
-    "--max-models",
+    "--max-models-per-fold",
     type=int,
     default=1000,
     metavar="N",
-    help="Max number of models to select")
+    help="Max number of models to select per fold")
 parser.add_argument(
     "--mass-spec-regex",
     metavar="REGEX",
@@ -80,6 +85,31 @@ parser.add_argument(
 
 add_worker_pool_args(parser)
 
 
+def mse(
+        predictions,
+        actual,
+        inequalities=None,
+        affinities_are_already_01_transformed=False):
+    if not affinities_are_already_01_transformed:
+        predictions = from_ic50(predictions)
+        actual = from_ic50(actual)
+
+    deviations = (
+        numpy.array(predictions, copy=False) - numpy.array(actual, copy=False))
+
+    if inequalities is not None:
+        # Must reverse meaning of inequality since we are working with
+        # transformed 0-1 values, which are anti-correlated with the ic50s.
+        # The measurement_inequality column is given in terms of ic50s.
+        inequalities = numpy.array(inequalities, copy=False)
+        deviations[
+            ((inequalities == "<") & (deviations > 0)) | (
+                (inequalities == ">") & (deviations < 0))
+        ] = 0.0
+
+    return (deviations ** 2).mean()
+
+
 def run(argv=sys.argv[1:]):
     global GLOBAL_DATA
 
@@ -104,44 +134,86 @@ def run(argv=sys.argv[1:]):
     df = pandas.read_csv(args.data)
     print("Loaded data: %s" % (str(df.shape)))
 
-    df = df.loc[
-        (df.peptide.str.len() >= min_peptide_length) &
-        (df.peptide.str.len() <= max_peptide_length)
-    ]
-    print("Subselected to %d-%dmers: %s" % (
-        min_peptide_length, max_peptide_length, str(df.shape)))
-
-    import ipdb ; ipdb.set_trace()  # leaving off here
+    if args.folds:
+        folds_df = pandas.read_csv(args.folds)
+        matches = all([
+            len(folds_df) == len(df),
+            (folds_df.peptide == df.peptide).all(),
+            (folds_df.allele == df.allele).all(),
+        ])
+        if not matches:
+            raise ValueError("Training data and fold data do not match")
+        fold_cols = [c for c in folds_df if c.startswith("fold_")]
+        for col in fold_cols:
+            df[col] = folds_df[col]
+
+    fold_cols = [c for c in df if c.startswith("fold_")]
+    num_folds = len(fold_cols)
+    if num_folds <= 1:
+        raise ValueError("Too few folds: %d" % num_folds)
+
+    #df = df.loc[
+    #    (df.peptide.str.len() >= min_peptide_length) &
+    #    (df.peptide.str.len() <= max_peptide_length)
+    #]
+    #print("Subselected to %d-%dmers: %s" % (
+    #    min_peptide_length, max_peptide_length, str(df.shape)))
+
+    print("Num folds: ", num_folds, "fraction included:")
+    print(df[fold_cols].mean())
 
     # Allele names in data are assumed to be already normalized.
-    df = df.loc[df.allele.isin(alleles)].dropna()
-    print("Subselected to supported alleles: %s" % str(df.shape))
-
+    #df = df.loc[df.allele.isin(alleles)].dropna()
+    #print("Subselected to supported alleles: %s" % str(df.shape))
 
-    print("Selected %d alleles: %s" % (len(alleles), ' '.join(alleles)))
+    #print("Selected %d alleles: %s" % (len(alleles), ' '.join(alleles)))
 
     metadata_dfs["model_selection_data"] = df
 
     df["mass_spec"] = df.measurement_source.str.contains(
         args.mass_spec_regex)
-
-
-
-
-    GLOBAL_DATA["args"] = args
+    def make_train_peptide_hash(sub_df):
+        train_peptide_hash = hashlib.sha1()
+        for peptide in sorted(sub_df.peptide.values):
+            train_peptide_hash.update(peptide.encode())
+        return train_peptide_hash.hexdigest()
+
+    folds_to_predictors = dict(
+        (int(col.split("_")[-1]), (
+            [],
+            make_train_peptide_hash(df.loc[df[col] == 1])))
+        for col in fold_cols)
+    print(folds_to_predictors)
+    for model in input_predictor.class1_pan_allele_models:
+        training_info = model.fit_info[-1]['training_info']
+        fold_num = training_info['fold_num']
+        assert num_folds == training_info['num_folds']
+        (lst, expected_hash) = folds_to_predictors[fold_num]
+        train_peptide_hash = training_info['train_peptide_hash']
+        #numpy.testing.assert_equal(expected_hash, train_peptide_hash) #enable later
+        lst.append(model)
+
+    work_items = []
+    for (fold_num, (models, _)) in folds_to_predictors.items():
+        work_items.append({
+            'fold_num': fold_num,
+            'models': models,
+            'min_models': args.min_models_per_fold,
+            'max_models': args.max_models_per_fold,
+        })
+
+    GLOBAL_DATA["data"] = df
     GLOBAL_DATA["input_predictor"] = input_predictor
-    GLOBAL_DATA["unselected_accuracy_scorer"] = unselected_accuracy_scorer
-    GLOBAL_DATA["allele_to_selector"] = allele_to_selector
-    GLOBAL_DATA["allele_to_model_selection_kwargs"] = allele_to_model_selection_kwargs
 
     if not os.path.exists(args.out_models_dir):
         print("Attempting to create directory: %s" % args.out_models_dir)
         os.mkdir(args.out_models_dir)
         print("Done.")
 
-    result_predictor = Class1AffinityPredictor(metadata_dataframes=metadata_dfs)
+    result_predictor = Class1AffinityPredictor(
+        allele_to_sequence=input_predictor.allele_to_sequence,
+        metadata_dataframes=metadata_dfs)
 
     worker_pool = worker_pool_with_gpu_assignments_from_args(args)
 
@@ -150,43 +222,29 @@ def run(argv=sys.argv[1:]):
     if worker_pool is None:
         # Serial run
         print("Running in serial.")
-        results = (
-            model_select(allele) for allele in alleles)
+        results = (do_model_select_task(item) for item in work_items)
     else:
         # Parallel run
-        random.shuffle(alleles)
+        random.shuffle(work_items)
         results = worker_pool.imap_unordered(
-            model_select,
-            alleles,
+            do_model_select_task,
+            work_items,
             chunksize=1)
 
-    unselected_summary = []
-    model_selection_dfs = []
-    for result in tqdm.tqdm(results, total=len(alleles)):
+    models_by_fold = {}
+    for result in tqdm.tqdm(results, total=len(work_items)):
         pprint(result)
+        fold_num = result['fold_num']
+        models = [
+            folds_to_predictors[fold_num][0][i]
+            for i in result['selected_indices']
+        ]
+        print("Selected %d models for fold %d: %s" % (
+            len(models), fold_num, result['selected_indices']))
+        models_by_fold[fold_num] = models
+        for model in models:
+            result_predictor.add_pan_allele_model(model)
 
-        summary_dict = dict(result)
-        summary_dict["retained"] = result["selected"] is not None
-        del summary_dict["selected"]
-
-        unselected_summary.append(summary_dict)
-        if result['selected'] is not None:
-            model_selection_dfs.append(
-                result['selected'].metadata_dataframes['model_selection'])
-            result_predictor.merge_in_place([result['selected']])
-
-    if model_selection_dfs:
-        model_selection_df = pandas.concat(
-            model_selection_dfs, ignore_index=True)
-        model_selection_df["selector"] = model_selection_df.allele.map(
-            allele_to_selector)
-        result_predictor.metadata_dataframes["model_selection"] = (
-            model_selection_df)
-
-    result_predictor.metadata_dataframes["unselected_summary"] = (
-        pandas.DataFrame(unselected_summary))
-
-    print("Done model selecting for %d alleles." % len(alleles))
     result_predictor.save(args.out_models_dir)
 
     model_selection_time = time.time() - start
@@ -199,390 +257,69 @@ def run(argv=sys.argv[1:]):
     print("Predictor written to: %s" % args.out_models_dir)
 
 
-class ScrambledPredictor(object):
-    def __init__(self, predictor):
-        self.predictor = predictor
-        self._predictions = {}
-        self._allele = None
-
-    def predict(self, peptides, allele):
-        if peptides not in self._predictions:
-            self._predictions[peptides] = pandas.Series(
-                self.predictor.predict(peptides=peptides, allele=allele))
-            self._allele = allele
-        assert allele == self._allele
-        return self._predictions[peptides].sample(frac=1.0).values
+def do_model_select_task(item):
+    return model_select(**item)
 
 
-def model_select(allele):
+def model_select(fold_num, models, min_models, max_models):
+    """Greedily select models for one fold using held-out MSE."""
     global GLOBAL_DATA
-    unselected_accuracy_scorer = GLOBAL_DATA["unselected_accuracy_scorer"]
-    selector = GLOBAL_DATA["allele_to_selector"][allele]
-    model_selection_kwargs = GLOBAL_DATA[
-        "allele_to_model_selection_kwargs"
-    ][allele]
-    predictor = GLOBAL_DATA["input_predictor"]
-    args = GLOBAL_DATA["args"]
-    unselected_accuracy_scorer_samples = GLOBAL_DATA["args"].unselected_accuracy_scorer_num_samples
-
-    result_dict = {
-        "allele": allele
-    }
-
-    unselected_score = None
-    unselected_score_percentile = None
-    unselected_score_scrambled_mean = None
-    if unselected_accuracy_scorer:
-        unselected_score_function = (
-            unselected_accuracy_scorer.score_function(allele))
-
-        additional_metadata = {}
-        unselected_score = unselected_score_function(
-            predictor, additional_metadata_out=additional_metadata)
-        scrambled_predictor = ScrambledPredictor(predictor)
-        scrambled_scores = numpy.array([
-            unselected_score_function(
-                scrambled_predictor)
-            for _ in range(unselected_accuracy_scorer_samples)
-        ])
-        unselected_score_scrambled_mean = scrambled_scores.mean()
-        unselected_score_percentile = percentileofscore(
-            scrambled_scores, unselected_score)
-        print(
-            "Unselected score and percentile",
-            allele,
-            unselected_score,
-            unselected_score_percentile,
-            additional_metadata)
-        result_dict.update(
-            dict(("unselected_%s" % key, value)
-                 for (key, value)
-                 in additional_metadata.items()))
-
-    selected = None
-    threshold = args.unselected_accuracy_percentile_threshold
-    if unselected_score_percentile is None or unselected_score_percentile >= threshold:
-        selected = predictor.model_select(
-            score_function=selector.score_function(allele=allele),
-            alleles=[allele],
-            **model_selection_kwargs)
-
-    result_dict["unselected_score_plan"] = (
-        unselected_accuracy_scorer.plan_summary(allele)
-        if unselected_accuracy_scorer else None)
-    result_dict["selector_score_plan"] = selector.plan_summary(allele)
-    result_dict["unselected_accuracy_score_percentile"] = unselected_score_percentile
-    result_dict["unselected_score"] = unselected_score
-    result_dict["unselected_score_scrambled_mean"] = unselected_score_scrambled_mean
-    result_dict["selected"] = selected
-    result_dict["num_models"] = len(selected.neural_networks) if selected else None
-    return result_dict
-
-
-def cache_encoding(predictor, peptides):
-    # Encode the peptides for each neural network, so the encoding
-    # becomes cached.
-    for network in predictor.neural_networks:
-        network.peptides_to_network_input(peptides)
-
-
-class ScoreFunction(object):
-    """
-    Thin wrapper over a score function (Class1AffinityPredictor -> float).
-    Used to keep a summary string associated with the function.
-    """
-    def __init__(self, function, summary=None):
-        self.function = function
-        self.summary = summary if summary else "(n/a)"
-
-    def __call__(self, *args, **kwargs):
-        return self.function(*args, **kwargs)
-
-
-class CombinedModelSelector(object):
-    """
-    Model selector that computes a weighted average over other model selectors.
-    """
-    def __init__(self, model_selectors, weights=None, min_contribution_percent=1.0):
-        if weights is None:
-            weights = numpy.ones(shape=(len(model_selectors),))
-        self.model_selectors = model_selectors
-        self.selector_to_weight = dict(zip(self.model_selectors, weights))
-        self.min_contribution_percent = min_contribution_percent
-
-    def usable_for_allele(self, allele):
-        return any(
-            selector.usable_for_allele(allele)
-            for selector in self.model_selectors)
-
-    def plan_summary(self, allele):
-        return self.score_function(allele, dry_run=True).summary
-
-    def score_function(self, allele, dry_run=False):
-        selector_to_max_weighted_score = {}
-        for selector in self.model_selectors:
-            weight = self.selector_to_weight[selector]
-            if selector.usable_for_allele(allele):
-                max_weighted_score = selector.max_absolute_value(allele) * weight
-            else:
-                max_weighted_score = 0
-            selector_to_max_weighted_score[selector] = max_weighted_score
-        max_total_score = sum(selector_to_max_weighted_score.values())
-
-        # Use only selectors that can contribute >1% to the total score
-        selectors_to_use = [
-            selector
-            for selector in self.model_selectors
-            if (
-                selector_to_max_weighted_score[selector] >
-                max_total_score * self.min_contribution_percent / 100.0)
-        ]
-
-        summary = ", ".join([
-            "%s(|%.3f|)" % (
-                selector.plan_summary(allele),
-                selector_to_max_weighted_score[selector])
-            for selector in selectors_to_use
-        ])
-
-        if dry_run:
-            score = None
-        else:
-            score_functions_and_weights = [
-                (selector.score_function(allele=allele),
-                 self.selector_to_weight[selector])
-                for selector in selectors_to_use
-            ]
-
-            def score(predictor, additional_metadata_out=None):
-                scores = numpy.array([
-                    score_function(
-                        predictor,
-                        additional_metadata_out=additional_metadata_out) * weight
-                    for (score_function, weight) in score_functions_and_weights
-                ])
-                if additional_metadata_out is not None:
-                    additional_metadata_out["combined_score_terms"] = str(
-                        list(scores))
-
-                return scores.sum()
-        return ScoreFunction(score, summary=summary)
-
-
-class ConsensusModelSelector(object):
-    """
-    Model selector that scores sub-ensembles based on their Kendall tau
-    consistency with the full ensemble over a set of random peptides.
- """ - def __init__( - self, - predictor, - num_peptides_per_length=10000, - multiply_score_by_value=10.0): - - (min_length, max_length) = predictor.supported_peptide_lengths - peptides = [] - for length in range(min_length, max_length + 1): - peptides.extend( - random_peptides(num_peptides_per_length, length=length)) - - self.peptides = EncodableSequences.create(peptides) - self.predictor = predictor - self.multiply_score_by_value = multiply_score_by_value - cache_encoding(self.predictor, self.peptides) - - def usable_for_allele(self, allele): - return True - - def max_absolute_value(self, allele): - return self.multiply_score_by_value - - def plan_summary(self, allele): - return "consensus (%d points)" % len(self.peptides) - - def score_function(self, allele): - full_ensemble_predictions = self.predictor.predict( - allele=allele, - peptides=self.peptides) - - def score(predictor, additional_metadata_out=None): - predictions = predictor.predict( - allele=allele, - peptides=self.peptides, - ) - tau = kendalltau(predictions, full_ensemble_predictions).correlation - if additional_metadata_out is not None: - additional_metadata_out["score_consensus_tau"] = tau - return tau * self.multiply_score_by_value - - return ScoreFunction( - score, summary=self.plan_summary(allele)) - - -class MSEModelSelector(object): - """ - Model selector that uses mean-squared error to score models. Inequalities - are supported. - """ - def __init__( - self, - df, - predictor, - min_measurements=1, - multiply_score_by_data_size=True): - - self.df = df - self.predictor = predictor - self.min_measurements = min_measurements - self.multiply_score_by_data_size = multiply_score_by_data_size - - def usable_for_allele(self, allele): - return (self.df.allele == allele).sum() >= self.min_measurements - - def max_absolute_value(self, allele): - if self.multiply_score_by_data_size: - return (self.df.allele == allele).sum() - else: - return 1.0 - - def plan_summary(self, allele): - return self.score_function(allele).summary - - def score_function(self, allele): - sub_df = self.df.loc[self.df.allele == allele].reset_index(drop=True) - peptides = EncodableSequences.create(sub_df.peptide.values) - - def score(predictor, additional_metadata_out=None): - predictions = predictor.predict( - allele=allele, - peptides=peptides, - ) - deviations = from_ic50(predictions) - from_ic50( - sub_df.measurement_value) - - if 'measurement_inequality' in sub_df.columns: - # Must reverse meaning of inequality since we are working with - # transformed 0-1 values, which are anti-correlated with the ic50s. - # The measurement_inequality column is given in terms of ic50s. 
-                deviations.loc[
-                    (
-                        (sub_df.measurement_inequality == "<") & (deviations > 0)) |
-                    ((sub_df.measurement_inequality == ">") & (deviations < 0))
-                ] = 0.0
-
-            score_mse = (1 - (deviations ** 2).mean())
-            if additional_metadata_out is not None:
-                additional_metadata_out["score_MSE"] = 1 - score_mse
-
-                # We additionally include other scores on (=) measurements as
-                # a convenience
-                eq_df = sub_df
-                if 'measurement_inequality' in sub_df.columns:
-                    eq_df = sub_df.loc[
-                        sub_df.measurement_inequality == "="
-                    ]
-                additional_metadata_out["score_pearsonr"] = (
-                    pearsonr(
-                        numpy.log(eq_df.measurement_value.values),
-                        numpy.log(predictions[eq_df.index.values]))[0])
-
-                for threshold in [500, 5000, 15000]:
-                    if (eq_df.measurement_value < threshold).nunique() == 2:
-                        additional_metadata_out["score_AUC@%d" % threshold] = (
-                            roc_auc_score(
-                                (eq_df.measurement_value < threshold).values,
-                                -1 * predictions[eq_df.index.values]))
-
-            return score_mse * (
-                len(sub_df) if self.multiply_score_by_data_size else 1)
-
-        summary = "mse (%d points)" % (len(sub_df))
-        return ScoreFunction(score, summary=summary)
-
-
-class MassSpecModelSelector(object):
-    """
-    Model selector that uses PPV of differentiating decoys from hits from
-    mass-spec experiments.
-    """
-    def __init__(
-            self,
-            df,
-            predictor,
-            decoys_per_length=0,
-            min_measurements=100,
-            multiply_score_by_data_size=True):
-
-        # Index is peptide, columns are alleles
-        hit_matrix = df.groupby(
-            ["peptide", "allele"]).measurement_value.count().unstack().fillna(
-            0).astype(bool)
-
-        if decoys_per_length:
-            (min_length, max_length) = predictor.supported_peptide_lengths
-            decoys = []
-            for length in range(min_length, max_length + 1):
-                decoys.extend(
-                    random_peptides(decoys_per_length, length=length))
-
-            decoy_matrix = pandas.DataFrame(
-                index=decoys, columns=hit_matrix.columns, dtype=bool)
-            decoy_matrix[:] = False
-            full_matrix = pandas.concat([hit_matrix, decoy_matrix])
-        else:
-            full_matrix = hit_matrix
-
-        if len(full_matrix) > 0:
-            full_matrix = full_matrix.sample(frac=1.0).astype(float)
-
-        self.df = full_matrix
-        self.predictor = predictor
-        self.min_measurements = min_measurements
-        self.multiply_score_by_data_size = multiply_score_by_data_size
-
-        self.peptides = EncodableSequences.create(full_matrix.index.values)
-        cache_encoding(self.predictor, self.peptides)
-
-    @staticmethod
-    def ppv(y_true, predictions):
-        df = pandas.DataFrame({"prediction": predictions, "y_true": y_true})
-        return df.sort_values("prediction", ascending=True)[
-            : int(y_true.sum())
-        ].y_true.mean()
-
-    def usable_for_allele(self, allele):
-        return allele in self.df.columns and (
-            self.df[allele].sum() >= self.min_measurements)
+    full_data = GLOBAL_DATA["data"]
+    input_predictor = GLOBAL_DATA["input_predictor"]
+    # Score on this fold's held-out rows (fold_<num> == 0).
+    df = full_data.loc[
+        full_data["fold_%d" % fold_num] == 0
+    ]
 
-    def max_absolute_value(self, allele):
-        if self.multiply_score_by_data_size:
-            return self.df[allele].sum()
+    peptides = EncodableSequences.create(df.peptide.values)
+    alleles = AlleleEncoding(
+        df.allele.values,
+        borrow_from=input_predictor.get_master_allele_encoding())
+
+    predictions_df = df.copy()
+    for (i, model) in enumerate(models):
+        predictions_df[i] = from_ic50(model.predict(peptides, alleles))
+
+    actual = from_ic50(predictions_df.measurement_value)
+
+    selected = []
+    selected_score = 0
+    remaining_models = set(numpy.arange(len(models)))
+    individual_model_scores = {}
+    while remaining_models and len(selected) < max_models:
+        best_model = None
+        best_model_score = 0
+        for i in remaining_models:
+            possible_ensemble = list(selected) + [i]
+            predictions = predictions_df[possible_ensemble].mean(1)
+            mse_score = 1 - mse(
+                predictions,
+                actual,
+                inequalities=(
+                    predictions_df.measurement_inequality
+                    if 'measurement_inequality' in predictions_df.columns
+                    else None),
+                affinities_are_already_01_transformed=True)
+            if mse_score >= best_model_score:
+                best_model = i
+                best_model_score = mse_score
+            if not selected:
+                # First iteration. Store individual model scores.
+                individual_model_scores[i] = mse_score
+        if len(selected) < min_models or best_model_score > selected_score:
+            selected_score = best_model_score
+            remaining_models.remove(best_model)
+            selected.append(best_model)
         else:
-            return 1.0
-
-    def plan_summary(self, allele):
-        return self.score_function(allele).summary
-
-    def score_function(self, allele):
-        total_hits = self.df[allele].sum()
-        total_decoys = (self.df[allele] == 0).sum()
-        multiplier = total_hits if self.multiply_score_by_data_size else 1
-        def score(predictor, additional_metadata_out=None):
-            predictions = predictor.predict(
-                allele=allele,
-                peptides=self.peptides,
-            )
-            ppv = self.ppv(self.df[allele], predictions)
-            if additional_metadata_out is not None:
-                additional_metadata_out["score_mass_spec_PPV"] = ppv
-
-                # We additionally compute AUC score.
-                additional_metadata_out["score_mass_spec_AUC"] = roc_auc_score(
-                    self.df[allele].values, -1 * predictions)
-            return ppv * multiplier
-
-        summary = "mass-spec (%d hits / %d decoys)" % (total_hits, total_decoys)
-        return ScoreFunction(score, summary=summary)
+            break
+
+    assert selected
+    return {
+        'fold_num': fold_num,
+        'selected_indices': selected,
+        'individual_model_scores': pandas.Series(
+            individual_model_scores)[numpy.arange(len(models))],
+    }
 
 
 if __name__ == '__main__':
diff --git a/mhcflurry/train_pan_allele_models_command.py b/mhcflurry/train_pan_allele_models_command.py
index 1abf6712..0bee23e8 100644
--- a/mhcflurry/train_pan_allele_models_command.py
+++ b/mhcflurry/train_pan_allele_models_command.py
@@ -505,7 +505,7 @@ def train_model(
 
     # Save model-specific training info
     train_peptide_hash = hashlib.sha1()
-    for peptide in train_data.peptide.values:
+    for peptide in sorted(train_data.peptide.values):
        train_peptide_hash.update(peptide.encode())
     model.fit_info[-1]["training_info"] = {
         "fold_num": fold_num,
--
GitLab
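The core of the patch is the greedy forward selection in model_select: starting from an empty ensemble, it repeatedly adds the model whose inclusion maximizes 1 - MSE of the ensemble's mean prediction on the fold's held-out data, stopping early when no candidate improves the score (once min_models are chosen) or when max_models is reached. The following self-contained sketch mirrors that logic on plain numpy arrays; the names inequality_mse, greedy_select, and prediction_matrix are invented for illustration and do not appear in the patch.

    import numpy

    def inequality_mse(predictions, actual, inequalities=None):
        # Values are 0-1 transformed affinities (as from from_ic50), so the
        # "<" / ">" labels, which are given in IC50 terms, apply in reverse.
        deviations = numpy.asarray(predictions) - numpy.asarray(actual)
        if inequalities is not None:
            inequalities = numpy.asarray(inequalities)
            deviations[
                ((inequalities == "<") & (deviations > 0)) |
                ((inequalities == ">") & (deviations < 0))
            ] = 0.0
        return (deviations ** 2).mean()

    def greedy_select(prediction_matrix, actual, min_models, max_models):
        # prediction_matrix: (n_points, n_models) of 0-1 transformed predictions.
        selected = []
        selected_score = 0.0
        remaining = set(range(prediction_matrix.shape[1]))
        while remaining and len(selected) < max_models:
            best, best_score = None, 0.0
            for i in remaining:
                ensemble_pred = prediction_matrix[:, selected + [i]].mean(1)
                score = 1 - inequality_mse(ensemble_pred, actual)
                if score >= best_score:
                    best, best_score = i, score
            if len(selected) < min_models or best_score > selected_score:
                selected_score = best_score
                remaining.remove(best)
                selected.append(best)
            else:
                break  # no remaining model improves the ensemble
        return selected

    # Example: pick from 3 models scored on 5 held-out points.
    rng = numpy.random.RandomState(0)
    predictions = rng.uniform(size=(5, 3))
    actual = rng.uniform(size=5)
    print(greedy_select(predictions, actual, min_models=1, max_models=3))

Note that scores are compared with >= inside the inner loop (so ties prefer the last candidate examined) but with > across iterations, which is what allows the search to stop once adding any model would leave the ensemble score flat.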
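Usage sketch (hypothetical, not part of the patch): run() accepts an argv list, so the command can be exercised directly from Python. The file names below are placeholders; the --models-dir is assumed to come from a train_pan_allele_models_command.py run, so that each model's fit_info records fold_num, num_folds, and train_peptide_hash, and the data CSV is assumed to carry a measurement_source column for the --mass-spec-regex match.

    # Hypothetical invocation -- all paths are placeholders.
    from mhcflurry import select_pan_allele_models_command

    select_pan_allele_models_command.run([
        "--data", "model_selection_data.csv",   # allele, peptide, measurement_value, measurement_source
        "--folds", "training_folds.csv",        # allele, peptide, fold_0 ... fold_N
        "--models-dir", "unselected-models",
        "--out-models-dir", "selected-models",
        "--min-models-per-fold", "2",
        "--max-models-per-fold", "8",
    ])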