Initial implementation of antigen presentation prediction infrastructure

dfdbd8a4 · Tim O'Donnell · fd6fdd75 · dfdbd8a4 · dfdbd8a4 · dfdbd8a4
Commit dfdbd8a4 authored 8 years ago by Tim O'Donnell
--- a/mhcflurry/antigen_presentation/README.md
+++ b/mhcflurry/antigen_presentation/README.md
+# Prediction of antigen presention
+
+This submodule contains predictors for naturally presented MHC ligands. These predictors are typically trained on peptides eluted from cell surfaces and identified with mass-spec. The models combine MHC binding affinity with cleavage prediction and the level of expression of transcripts containing the given peptide.
+
+This is a work in progress and not ready for production use.
--- a/mhcflurry/antigen_presentation/__init__.py
+++ b/mhcflurry/antigen_presentation/__init__.py
+from .presentation_model import PresentationModel
+from .percent_rank_transform import PercentRankTransform
+from . import presentation_component_models, decoy_strategies
+
+__all__ = [
+    "PresentationModel",
+    "PercentRankTransform",
+    "presentation_component_models",
+    "decoy_strategies",
+]
--- a/mhcflurry/antigen_presentation/decoy_strategies/__init__.py
+++ b/mhcflurry/antigen_presentation/decoy_strategies/__init__.py
+from .decoy_strategy import DecoyStrategy
+from .same_transcripts_as_hits import SameTranscriptsAsHits
+from .uniform_random import UniformRandom
+
+__all__ = [
+    "DecoyStrategy",
+    "SameTranscriptsAsHits",
+    "UniformRandom",
+]
--- a/mhcflurry/antigen_presentation/decoy_strategies/decoy_strategy.py
+++ b/mhcflurry/antigen_presentation/decoy_strategies/decoy_strategy.py
+import pandas
+
+
+class DecoyStrategy(object):
+    """
+    A mechanism for selecting decoys (non-hit peptides) given hits (
+    peptides detected via mass-spec).
+
+    Subclasses should override either decoys() or decoys_for_experiment().
+    Whichever one is not overriden is implemented using the other.
+    """
+
+    def __init__(self):
+        pass
+
+    def decoys(self, hits_df):
+        """
+        Given a df of hits with columns 'experiment_name' and 'peptide',
+        return a df with the same structure giving decoys.
+
+        Subclasses should override either this or decoys_for_experiment()
+        """
+
+        assert 'experiment_name' in hits_df.columns
+        assert 'peptide' in hits_df.columns
+        assert len(hits_df) > 0
+        grouped = hits_df.groupby("experiment_name")
+        dfs = []
+        for (experiment_name, sub_df) in grouped:
+            decoys = self.decoys_for_experiment(
+                experiment_name,
+                sub_df.peptide.values)
+            df = pandas.DataFrame({
+                'peptide': decoys,
+            })
+            df["experiment_name"] = experiment_name
+            dfs.append(df)
+        return pandas.concat(dfs, ignore_index=True)
+
+    def decoys_for_experiment(self, experiment_name, hit_list):
+        """
+        Return decoys for a single experiment.
+
+        Parameters
+        ------------
+        experiment_name : string
+
+        hit_list : list of string
+            List of hits
+
+        """
+        # prevent infinite recursion:
+        assert self.decoys is not DecoyStrategy.decoys
+
+        hits_df = pandas.DataFrame({'peptide': hit_list})
+        hits_df["experiment_name"] = experiment_name
+        return self.decoys(hits_df)
--- a/mhcflurry/antigen_presentation/decoy_strategies/same_transcripts_as_hits.py
+++ b/mhcflurry/antigen_presentation/decoy_strategies/same_transcripts_as_hits.py
+import numpy
+
+from .decoy_strategy import DecoyStrategy
+
+
+class SameTranscriptsAsHits(DecoyStrategy):
+    """
+    Decoy strategy that selects decoys from the same transcripts the
+    hits come from. The transcript for each hit is taken to be the
+    transcript containing the hit with the the highest expression for
+    the given experiment.
+
+    Parameters
+    ------------
+    experiment_to_expression_group : dict of string -> string
+        Maps experiment names to expression groups.
+
+    peptides_and_transcripts: pandas.DataFrame
+        Must have columns 'peptide' and 'transcript', index unimportant.
+
+    peptide_to_expression_group_to_transcript : pandas.DataFrame
+        Indexed by peptides, columns are expression groups. Values
+        give transcripts to use.
+
+    decoys_per_hit : int
+    """
+    def __init__(
+            self,
+            experiment_to_expression_group,
+            peptides_and_transcripts,
+            peptide_to_expression_group_to_transcript,
+            decoys_per_hit=10):
+        DecoyStrategy.__init__(self)
+        assert decoys_per_hit > 0
+        self.experiment_to_expression_group = experiment_to_expression_group
+        self.peptides_and_transcripts = peptides_and_transcripts
+        self.peptide_to_expression_group_to_transcript = (
+            peptide_to_expression_group_to_transcript)
+        self.decoys_per_hit = decoys_per_hit
+
+    def decoys_for_experiment(self, experiment_name, hit_list):
+        assert len(hit_list) > 0, "No hits for %s" % experiment_name
+        expression_group = self.experiment_to_expression_group[experiment_name]
+        transcripts = self.peptide_to_expression_group_to_transcript.ix[
+            hit_list, expression_group
+        ]
+        assert len(transcripts) > 0, experiment_name
+
+        universe = self.peptides_and_transcripts.ix[
+            self.peptides_and_transcripts.transcript.isin(transcripts) &
+            (~ self.peptides_and_transcripts.peptide.isin(hit_list))
+        ].peptide.values
+        assert len(universe) > 0, experiment_name
+
+        return numpy.random.choice(
+            universe,
+            replace=True,
+            size=self.decoys_per_hit * len(hit_list))
--- a/mhcflurry/antigen_presentation/decoy_strategies/uniform_random.py
+++ b/mhcflurry/antigen_presentation/decoy_strategies/uniform_random.py
+import numpy
+
+from .decoy_strategy import DecoyStrategy
+
+
+class UniformRandom(DecoyStrategy):
+    """
+    Decoy strategy that selects decoys randomly from a provided universe
+    of peptides.
+    """
+    def __init__(self, all_peptides, decoys_per_hit=999):
+        DecoyStrategy.__init__(self)
+        self.all_peptides = set(all_peptides)
+        self.decoys_per_hit = decoys_per_hit
+
+    def decoys_for_experiment(self, experiment_name, hit_list):
+        decoy_pool = self.all_peptides.difference(set(hit_list))
+        return numpy.random.choice(
+            list(decoy_pool),
+            replace=True,
+            size=self.decoys_per_hit * len(hit_list))
--- a/mhcflurry/antigen_presentation/percent_rank_transform.py
+++ b/mhcflurry/antigen_presentation/percent_rank_transform.py
+import numpy
+
+
+class PercentRankTransform(object):
+    """
+    Transform arbitrary values into percent ranks.
+    """
+
+    def __init__(self, n_bins=1e5):
+        self.n_bins = int(n_bins)
+        self.cdf = None
+        self.bin_edges = None
+
+    def fit(self, values):
+        """
+        Fit the transform using the given values, which are used to
+        establish percentiles.
+        """
+        assert self.cdf is None
+        assert self.bin_edges is None
+        assert len(values) > 0
+        (hist, self.bin_edges) = numpy.histogram(values, bins=self.n_bins)
+        self.cdf = numpy.ones(len(hist) + 2) * numpy.nan
+        self.cdf[0] = 0.0
+        self.cdf[-1] = 100.0
+        numpy.cumsum(hist / numpy.sum(hist) * 100.0, out=self.cdf[1:-1])
+        assert not numpy.isnan(self.cdf).any()
+
+    def transform(self, values):
+        """
+        Return percent ranks (range [0, 100]) for the given values.
+        """
+        assert self.cdf is not None
+        assert self.bin_edges is not None
+        indices = numpy.searchsorted(self.bin_edges, values)
+        return self.cdf[indices]
--- a/mhcflurry/antigen_presentation/presentation_component_models/__init__.py
+++ b/mhcflurry/antigen_presentation/presentation_component_models/__init__.py
+from .presentation_component_model import PresentationComponentModel
+from .expression import Expression
+from .mhcflurry_released import MHCflurryReleased
+from .mhcflurry_trained_on_hits import MHCflurryTrainedOnHits
+from .fixed_affinity_predictions import FixedAffinityPredictions
+from .fixed_per_peptide_quantity import FixedPerPeptideQuantity
+from .fixed_per_peptide_and_transcript_quantity import (
+    FixedPerPeptideAndTranscriptQuantity)
+
+__all__ = [
+    "PresentationComponentModel",
+    "Expression",
+    "MHCflurryReleased",
+    "MHCflurryTrainedOnHits",
+    "FixedAffinityPredictions",
+    "FixedPerPeptideQuantity",
+    "FixedPerPeptideAndTranscriptQuantity",
+]
--- a/mhcflurry/antigen_presentation/presentation_component_models/expression.py
+++ b/mhcflurry/antigen_presentation/presentation_component_models/expression.py
+from .presentation_component_model import PresentationComponentModel
+
+from ...common import assert_no_null
+
+
+class Expression(PresentationComponentModel):
+    """
+    Model input for transcript expression.
+
+    Parameters
+    ------------
+
+    experiment_to_expression_group : dict of string -> string
+        Maps experiment names to expression groups.
+
+    expression_values : pandas.DataFrame
+        Columns should be expression groups. Indices should be peptide.
+
+    """
+
+    def __init__(
+            self, experiment_to_expression_group, expression_values, **kwargs):
+        PresentationComponentModel.__init__(self, **kwargs)
+        assert all(
+            group in expression_values.columns
+            for group in experiment_to_expression_group.values())
+
+        assert_no_null(experiment_to_expression_group)
+
+        self.experiment_to_expression_group = experiment_to_expression_group
+        self.expression_values = expression_values
+
+    def column_names(self):
+        return ["expression"]
+
+    def requires_fitting(self):
+        return False
+
+    def predict_for_experiment(self, experiment_name, peptides):
+        expression_group = self.experiment_to_expression_group[experiment_name]
+        return {
+            "expression": (
+                self.expression_values.ix[peptides, expression_group]
+                .values)
+        }
--- a/mhcflurry/antigen_presentation/presentation_component_models/fixed_affinity_predictions.py
+++ b/mhcflurry/antigen_presentation/presentation_component_models/fixed_affinity_predictions.py
+from .presentation_component_model import PresentationComponentModel
+
+from ...common import assert_no_null
+
+
+class FixedAffinityPredictions(PresentationComponentModel):
+    """
+    Parameters
+    ------------
+
+    experiment_to_alleles : dict: string -> string list
+        Normalized allele names for each experiment.
+
+    panel : pandas.Panel
+        Dimensions should be:
+            - "value", "percentile_rank" (IC50 and percent rank)
+            - peptide (string)
+            - allele (string)
+    """
+
+    def __init__(
+            self,
+            experiment_to_alleles,
+            panel,
+            name='precomputed',
+            **kwargs):
+        PresentationComponentModel.__init__(self, **kwargs)
+        self.experiment_to_alleles = experiment_to_alleles
+        for key in panel.items:
+            assert_no_null(panel[key])
+        self.panel = panel
+        self.name = name
+
+    def column_names(self):
+        return [
+            "%s_affinity" % self.name,
+            "%s_percentile_rank" % self.name
+        ]
+
+    def requires_fitting(self):
+        return False
+
+    def predict_min_across_alleles(self, alleles, peptides):
+        return {
+            ("%s_affinity" % self.name): (
+                self.panel
+                .value[alleles]
+                .min(axis=1)
+                .ix[peptides].values),
+            ("%s_percentile_rank" % self.name): (
+                self.panel
+                .percentile_rank[alleles]
+                .min(axis=1)
+                .ix[peptides].values)
+        }
+
+    def predict_for_experiment(self, experiment_name, peptides):
+        alleles = self.experiment_to_alleles[experiment_name]
+        return self.predict_min_across_alleles(alleles, peptides)
--- a/mhcflurry/antigen_presentation/presentation_component_models/fixed_per_peptide_and_transcript_quantity.py
+++ b/mhcflurry/antigen_presentation/presentation_component_models/fixed_per_peptide_and_transcript_quantity.py
+import logging
+
+from .presentation_component_model import PresentationComponentModel
+
+from ...common import assert_no_null
+
+
+class FixedPerPeptideAndTranscriptQuantity(PresentationComponentModel):
+    """
+    Model input for arbitrary fixed (i.e. not fitted) quantities that
+    depend only on the peptide and the transcript it comes from, which
+    is taken to be the most-expressed transcript in the experiment.
+
+    Motivating example: netChop cleavage predictions.
+
+    Parameters
+    ------------
+
+    name : string
+        Name for this final model input. Used in debug messages.
+
+    experiment_to_expression_group : dict of string -> string
+        Maps experiment names to expression groups.
+
+    top_transcripts : pandas.DataFrame
+        Columns should be expression groups. Indices should be peptide. Values
+        should be transcript names.
+
+    df : pandas.DataFrame
+        Must have columns 'peptide' and 'transcript'. Remaining columns are
+        the values emitted by this model input.
+
+    """
+
+    def __init__(
+            self,
+            name,
+            experiment_to_expression_group,
+            top_transcripts,
+            df,
+            **kwargs):
+        PresentationComponentModel.__init__(self, **kwargs)
+        self.name = name
+        self.experiment_to_expression_group = experiment_to_expression_group
+        self.top_transcripts = top_transcripts.copy()
+
+        self.df = df.drop_duplicates(['peptide', 'transcript'])
+
+        # This hack seems to be faster than using a multindex.
+        self.df.index = self.df.peptide.str.cat(self.df.transcript, sep=":")
+        del self.df["peptide"]
+        del self.df["transcript"]
+        assert_no_null(self.df)
+
+        df_set = set(self.df.index)
+        missing = set()
+
+        for expression_group in self.top_transcripts.columns:
+            self.top_transcripts[expression_group] = (
+                self.top_transcripts.index.str.cat(
+                    self.top_transcripts[expression_group],
+                    sep=":"))
+            missing.update(
+                set(self.top_transcripts[expression_group]).difference(df_set))
+        if missing:
+            logging.warn(
+                "%s: missing %d (peptide, transcript) pairs from df: %s" % (
+                    self.name,
+                    len(missing),
+                    sorted(missing)[:1000]))
+
+    def column_names(self):
+        return list(self.df.columns)
+
+    def requires_fitting(self):
+        return False
+
+    def predict_for_experiment(self, experiment_name, peptides):
+        expression_group = self.experiment_to_expression_group[experiment_name]
+        indices = self.top_transcripts.ix[peptides, expression_group]
+        assert len(indices) == len(peptides)
+        sub_df = self.df.ix[indices]
+        assert len(sub_df) == len(peptides)
+        result = {}
+        for col in self.column_names():
+            result_series = sub_df[col]
+            num_nulls = result_series.isnull().sum()
+            if num_nulls > 0:
+                logging.warning("%s: mean-filling for %d nulls" % (
+                    self.name, num_nulls))
+                result_series = result_series.fillna(self.df[col].mean())
+            result[col] = result_series.values
+        return result
--- a/mhcflurry/antigen_presentation/presentation_component_models/fixed_per_peptide_quantity.py
+++ b/mhcflurry/antigen_presentation/presentation_component_models/fixed_per_peptide_quantity.py
+from .presentation_component_model import PresentationComponentModel
+
+from ...common import assert_no_null
+
+
+class FixedPerPeptideQuantity(PresentationComponentModel):
+    """
+    Model input for arbitrary fixed (i.e. not fitted) quantities that
+    depend only on the peptide. Motivating example: Mike's cleavage
+    predictions.
+
+    Parameters
+    ------------
+
+    name : string
+        Name for this final model input. Used in debug messages.
+
+    df : pandas.DataFrame
+        index must be named 'peptide'. The columns of the dataframe are
+        the columns emitted by this final modle input.
+    """
+
+    def __init__(self, name, df, **kwargs):
+        PresentationComponentModel.__init__(self, **kwargs)
+        self.name = name
+        assert df.index.name == "peptide"
+        assert_no_null(df)
+        self.df = df
+
+    def column_names(self):
+        return list(self.df.columns)
+
+    def requires_fitting(self):
+        return False
+
+    def predict(self, peptides_df):
+        sub_df = self.df.ix[peptides_df.peptide]
+        return dict(
+            (col, sub_df[col].values)
+            for col in self.column_names())
--- a/mhcflurry/antigen_presentation/presentation_component_models/mhcflurry_released.py
+++ b/mhcflurry/antigen_presentation/presentation_component_models/mhcflurry_released.py
+import logging
+
+import numpy
+import pandas
+
+from ...common import normalize_allele_name
+from ...predict import predict
+from ..percent_rank_transform import PercentRankTransform
+from .presentation_component_model import PresentationComponentModel
+
+
+class MHCflurryReleased(PresentationComponentModel):
+    """
+    Final model input that uses the standard downloaded MHCflurry models.
+
+    Parameters
+    ------------
+    experiment_to_alleles : dict: string -> string list
+        Normalized allele names for each experiment.
+
+    random_peptides_for_percent_rank : list of string
+        If specified, then percentile rank will be calibrated and emitted
+        using the given peptides.
+    """
+
+    def __init__(
+            self,
+            experiment_to_alleles,
+            random_peptides_for_percent_rank=None,
+            **kwargs):
+        PresentationComponentModel.__init__(self, **kwargs)
+        self.experiment_to_alleles = experiment_to_alleles
+        if random_peptides_for_percent_rank is None:
+            self.percent_rank_transforms = None
+            self.random_peptides_for_percent_rank = None
+        else:
+            self.percent_rank_transforms = {}
+            self.random_peptides_for_percent_rank = numpy.array(
+                random_peptides_for_percent_rank)
+
+    def column_names(self):
+        columns = ['mhcflurry_released_affinity']
+        if self.percent_rank_transforms is not None:
+            columns.append('mhcflurry_released_percentile_rank')
+        return columns
+
+    def requires_fitting(self):
+        return False
+
+    def fit_percentile_rank_if_needed(self, alleles):
+        for allele in alleles:
+            if allele not in self.percent_rank_transforms:
+                logging.info('fitting percent rank for allele: %s' % allele)
+                self.percent_rank_transforms[allele] = PercentRankTransform()
+                self.percent_rank_transforms[allele].fit(
+                    predict(
+                        [allele],
+                        self.random_peptides_for_percent_rank)
+                    .Prediction.values)
+
+    def predict_min_across_alleles(self, alleles, peptides):
+        alleles = [
+            normalize_allele_name(allele)
+            for allele in alleles
+        ]
+        df = predict(alleles, numpy.unique(numpy.array(peptides)))
+        pivoted = df.pivot(index='Peptide', columns='Allele')
+        pivoted.columns = pivoted.columns.droplevel()
+        result = {
+            'mhcflurry_released_affinity': (
+                pivoted.min(axis=1).ix[peptides].values)
+        }
+        if self.percent_rank_transforms is not None:
+            self.fit_percentile_rank_if_needed(alleles)
+            percentile_ranks = pandas.DataFrame(index=pivoted.index)
+            for allele in alleles:
+                percentile_ranks[allele] = (
+                    self.percent_rank_transforms[allele]
+                    .transform(pivoted[allele].values))
+            result['mhcflurry_released_percentile_rank'] = (
+                percentile_ranks.min(axis=1).ix[peptides].values)
+        return result
+
+    def predict_for_experiment(self, experiment_name, peptides):
+        alleles = self.experiment_to_alleles[experiment_name]
+        return self.predict_min_across_alleles(alleles, peptides)
--- a/mhcflurry/antigen_presentation/presentation_component_models/mhcflurry_trained_on_hits.py
+++ b/mhcflurry/antigen_presentation/presentation_component_models/mhcflurry_trained_on_hits.py
+import logging
+from copy import copy
+
+import pandas
+import numpy
+
+from ...dataset import Dataset
+from ...class1_allele_specific import Class1BindingPredictor
+from ...common import normalize_allele_name, dataframe_cryptographic_hash
+
+from .presentation_component_model import PresentationComponentModel
+from ..decoy_strategies import SameTranscriptsAsHits
+from ..percent_rank_transform import PercentRankTransform
+
+
+MHCFLURRY_DEFAULT_HYPERPARAMETERS = dict(
+    embedding_output_dim=8,
+    dropout_probability=0.25)
+
+
+class MHCflurryTrainedOnHits(PresentationComponentModel):
+    """
+    Final model input that is a mhcflurry predictor trained on mass-spec
+    hits and, optionally, affinity measurements (for example from IEDB).
+
+    Parameters
+    ------------
+
+    predictor_name : string
+        used on column name. Example: 'vanilla'
+
+    experiment_to_alleles : dict: string -> string list
+        Normalized allele names for each experiment.
+
+    experiment_to_expression_group : dict of string -> string
+        Maps experiment names to expression groups.
+
+    transcripts : pandas.DataFrame
+        Index is peptide, columns are expression groups, values are
+        which transcript to use for the given peptide.
+        Not required if decoy_strategy specified.
+
+    peptides_and_transcripts : pandas.DataFrame
+        Dataframe with columns 'peptide' and 'transcript'
+        Not required if decoy_strategy specified.
+
+    decoy_strategy : decoy_strategy.DecoyStrategy
+        how to pick decoys. If not specified peptides_and_transcripts and
+        transcripts must be specified.
+
+    fallback_predictor : function: (allele, peptides) -> predictions
+        Used when missing an allele.
+
+    iedb_dataset : mhcflurry.Dataset
+        IEDB data for this allele. If not specified no iedb data is used.
+
+    decoys_per_hit : int
+
+    mhcflurry_hyperparameters : dict
+
+    hit_affinity : float
+        nM affinity to use for hits
+
+    decoy_affinity : float
+        nM affinity to use for decoys
+
+    random_peptides_for_percent_rank : list of string
+        If specified, then percentile rank will be calibrated and emitted
+        using the given peptides.
+    """
+
+    def __init__(
+            self,
+            predictor_name,
+            experiment_to_alleles,
+            experiment_to_expression_group=None,
+            transcripts=None,
+            peptides_and_transcripts=None,
+            decoy_strategy=None,
+            fallback_predictor=None,
+            iedb_dataset=None,
+            decoys_per_hit=10,
+            mhcflurry_hyperparameters=MHCFLURRY_DEFAULT_HYPERPARAMETERS,
+            hit_affinity=100,
+            decoy_affinity=20000,
+            random_peptides_for_percent_rank=None,
+            **kwargs):
+
+        PresentationComponentModel.__init__(self, **kwargs)
+        self.predictor_name = predictor_name
+        self.experiment_to_alleles = experiment_to_alleles
+        self.fallback_predictor = fallback_predictor
+        self.iedb_dataset = iedb_dataset
+        self.mhcflurry_hyperparameters = mhcflurry_hyperparameters
+        self.hit_affinity = hit_affinity
+        self.decoy_affinity = decoy_affinity
+
+        self.allele_to_model = None
+
+        if decoy_strategy is None:
+            assert peptides_and_transcripts is not None
+            assert transcripts is not None
+            self.decoy_strategy = SameTranscriptsAsHits(
+                experiment_to_expression_group=experiment_to_expression_group,
+                peptides_and_transcripts=peptides_and_transcripts,
+                peptide_to_expression_group_to_transcript=transcripts,
+                decoys_per_hit=decoys_per_hit)
+        else:
+            self.decoy_strategy = decoy_strategy
+
+        if random_peptides_for_percent_rank is None:
+            self.percent_rank_transforms = None
+            self.random_peptides_for_percent_rank = None
+        else:
+            self.percent_rank_transforms = {}
+            self.random_peptides_for_percent_rank = numpy.array(
+                random_peptides_for_percent_rank)
+
+    def column_name_affinity(self):
+        return "mhcflurry_%s_affinity" % self.predictor_name
+
+    def column_name_percentile_rank(self):
+        return "mhcflurry_%s_percentile_rank" % self.predictor_name
+
+    def column_names(self):
+        columns = [self.column_name_affinity()]
+        if self.percent_rank_transforms is not None:
+            columns.append(self.column_name_percentile_rank())
+        return columns
+
+    def requires_fitting(self):
+        return True
+
+    def fit_percentile_rank_if_needed(self, alleles):
+        for allele in alleles:
+            if allele not in self.percent_rank_transforms:
+                logging.info('fitting percent rank for allele: %s' % allele)
+                self.percent_rank_transforms[allele] = PercentRankTransform()
+                self.percent_rank_transforms[allele].fit(
+                    self.predict_affinity_for_allele(
+                        allele,
+                        self.random_peptides_for_percent_rank))
+
+    def fit(self, hits_df):
+        assert 'experiment_name' in hits_df.columns
+        assert 'peptide' in hits_df.columns
+
+        grouped = hits_df.groupby("experiment_name")
+        for (experiment_name, sub_df) in grouped:
+            self.fit_to_experiment(experiment_name, sub_df.peptide.values)
+
+        # No longer required after fitting.
+        self.decoy_strategy = None
+        self.iedb_dataset = None
+
+    def fit_to_experiment(self, experiment_name, hit_list):
+        assert len(hit_list) > 0
+        if self.allele_to_model is None:
+            self.allele_to_model = {}
+
+        alleles = self.experiment_to_alleles[experiment_name]
+        if len(alleles) != 1:
+            raise ValueError("Monoallelic data required")
+
+        (allele,) = alleles
+        mhcflurry_allele = normalize_allele_name(allele)
+        assert allele not in self.allele_to_model, \
+            "TODO: Support training on >1 experiments with same allele"
+
+        extra_hits = hit_list = set(hit_list)
+
+        iedb_dataset_df = None
+        if self.iedb_dataset is not None:
+            iedb_dataset_df = (
+                self.iedb_dataset.get_allele(mhcflurry_allele).to_dataframe())
+            extra_hits = hit_list.difference(set(iedb_dataset_df.peptide))
+            print("Using %d / %d ms hits not in iedb in augmented model" % (
+                len(extra_hits),
+                len(hit_list)))
+
+        decoys = self.decoy_strategy.decoys_for_experiment(
+            experiment_name, hit_list)
+
+        df = pandas.DataFrame({"peptide": sorted(set(hit_list).union(decoys))})
+        df["allele"] = mhcflurry_allele
+        df["species"] = "human"
+        df["affinity"] = ((
+            ~df.peptide.isin(hit_list))
+            .astype(float) * (
+                self.decoy_affinity - self.hit_affinity) + self.hit_affinity)
+        df["sample_weight"] = 1.0
+        df["peptide_length"] = 9
+
+        if self.iedb_dataset is not None:
+            df = df.append(iedb_dataset_df, ignore_index=True)
+
+        dataset = Dataset(
+            df.sample(frac=1))  # shuffle dataframe
+        print("Train data: ", dataset)
+        model = Class1BindingPredictor(
+            **self.mhcflurry_hyperparameters)
+        model.fit_dataset(dataset, verbose=True)
+        self.allele_to_model[allele] = model
+
+    def predict_affinity_for_allele(self, allele, peptides):
+        if self.cached_predictions is None:
+            cache_key = None
+            cached_result = None
+        else:
+            cache_key = (
+                allele,
+                dataframe_cryptographic_hash(pandas.Series(peptides)))
+            cached_result = self.cached_predictions.get(cache_key)
+        if cached_result is not None:
+            print("Cache hit in predict_affinity_for_allele: %s %s %s" % (
+                allele, str(self), id(cached_result)))
+            return cached_result
+        else:
+            print("Cache miss in predict_affinity_for_allele: %s %s" % (
+                allele, str(self)))
+
+        if allele in self.allele_to_model:
+            result = self.allele_to_model[allele].predict(peptides)
+        elif self.fallback_predictor:
+            print(
+                "MHCflurry: falling back on %s, "
+                "available alleles: %s" % (
+                    allele, ' '.join(self.allele_to_model)))
+            result = self.fallback_predictor(allele, peptides)
+        else:
+            raise ValueError("No model for allele: %s" % allele)
+
+        if self.cached_predictions is not None:
+            self.cached_predictions[cache_key] = result
+        return result
+
+    def predict_for_experiment(self, experiment_name, peptides):
+        assert self.allele_to_model is not None, "Must fit first"
+
+        alleles = self.experiment_to_alleles[experiment_name]
+        predictions = pandas.DataFrame(index=peptides)
+        for allele in alleles:
+            predictions[allele] = self.predict_affinity_for_allele(
+                allele, peptides)
+
+        result = {
+            self.column_name_affinity(): predictions.min(axis=1).values
+        }
+        if self.percent_rank_transforms is not None:
+            self.fit_percentile_rank_if_needed(alleles)
+            percentile_ranks = pandas.DataFrame(index=peptides)
+            for allele in alleles:
+                percentile_ranks[allele] = (
+                    self.percent_rank_transforms[allele]
+                    .transform(predictions[allele].values))
+            result[self.column_name_percentile_rank()] = (
+                percentile_ranks.min(axis=1).ix[peptides].values)
+        assert all(len(x) == len(peptides) for x in result.values()), (
+            "Result lengths don't match peptide lengths. %s:\n%s" % (
+                ", ".join(
+                    "%s=%d" % (key, value) for (key, value) in result.items()),
+                result))
+        return result
+
+    def get_fit(self):
+        return {
+            'model': 'MHCflurryTrainedOnMassSpec',
+            'allele_to_model': self.allele_to_model,
+        }
+
+    def restore_fit(self, fit_info):
+        fit_info = dict(fit_info)
+        self.allele_to_model = fit_info.pop('allele_to_model')
+
+        model = fit_info.pop('model')
+        assert model == 'MHCflurryTrainedOnMassSpec', model
+        assert not fit_info, "Extra info in fit: %s" % str(fit_info)
+
+    def clone(self):
+        result = copy(self)
+        result.reset_cache()
+        result.allele_to_model = copy(result.allele_to_model)
+        return result
--- a/mhcflurry/antigen_presentation/presentation_component_models/presentation_component_model.py
+++ b/mhcflurry/antigen_presentation/presentation_component_models/presentation_component_model.py
+import weakref
+
+from copy import copy
+
+import numpy
+import pandas
+
+from ...common import (
+    dataframe_cryptographic_hash, assert_no_null, freeze_object)
+
+
+def cache_dict_for_policy(policy):
+    if policy == "weak":
+        return weakref.WeakValueDictionary()
+    elif policy == "strong":
+        return {}
+    elif policy == "none":
+        return None
+    else:
+        raise ValueError("Unsupported cache policy: %s" % policy)
+
+
+class PresentationComponentModel(object):
+    cache_fields = ["cached_fits", "cached_predictions"]
+
+    '''
+    Base class for inputs to a "final model". By "final model" we mean
+    something like a logistic regression model that takes as input expression,
+    mhc binding affinity, cleavage, etc. By "final model input" we mean
+    the predictors for expression, mhc binding affinity, etc.
+    '''
+    def __init__(
+            self, fit_cache_policy="weak", predictions_cache_policy="weak"):
+        self.cached_fits = cache_dict_for_policy(fit_cache_policy)
+        self.cached_predictions = cache_dict_for_policy(
+            predictions_cache_policy)
+
+    def column_names(self):
+        """
+        Names for the values this final model input emits.
+        Some final model inputs emit multiple related quantities, such as
+        "binding affinity" and "binding percentile rank".
+        """
+        raise NotImplementedError(str(self))
+
+    def requires_fitting(self):
+        """
+        Does this model require fitting to mass-spec data?
+
+        For example, the 'expression' componenet models don't need to be
+        fit, but some cleavage predictors and binding predictors can be
+        trained on the ms data.
+        """
+        raise NotImplementedError(str(self))
+
+    def clone_and_fit(self, peptides_df):
+        """
+        Clone the object and fit to given dataset with a weakref cache.
+        """
+        if not self.requires_fitting():
+            return self
+
+        if self.cached_fits is None:
+            key = None
+            result = None
+        else:
+            key = dataframe_cryptographic_hash(
+                peptides_df[["experiment_name", "peptide"]])
+            result = self.cached_fits.get(key)
+        if result is None:
+            print("Cache miss in clone_and_fit: %s" % str(self))
+            result = self.clone()
+            result.fit(peptides_df)
+            if self.cached_fits is not None:
+                self.cached_fits[key] = result
+        else:
+            print("Cache hit in clone_and_fit: %s" % str(self))
+        return result
+
+    def clone_and_restore_fit(self, fit_info):
+        if not self.requires_fitting():
+            assert fit_info is None
+            return self
+
+        if self.cached_fits is None:
+            key = None
+            result = None
+        else:
+            key = freeze_object(fit_info)
+            result = self.cached_fits.get(key)
+        if result is None:
+            print("Cache miss in clone_and_restore_fit: %s" % str(self))
+            result = self.clone()
+            result.restore_fit(fit_info)
+            if self.cached_fits is not None:
+                self.cached_fits[key] = result
+        else:
+            print("Cache hit in clone_and_restore_fit: %s" % str(self))
+        return result
+
+    def __getstate__(self):
+        d = dict(self.__dict__)
+
+        # Don't pickle the cache variables, but remember what type they are.
+        for key in PresentationComponentModel.cache_fields:
+            d[key] = type(d[key])
+        return d
+
+    def __setstate__(self, state):
+        self.__dict__.update(state)
+        for key in PresentationComponentModel.cache_fields:
+            setattr(self, key, state[key]())
+
+    def fit(self, hits_df):
+        """
+        Train the final model input.
+
+        Parameters
+        -----------
+        hits_df : pandas.DataFrame
+            dataframe of hits with columns 'experiment_name' and 'peptide'
+
+        """
+        if self.requires_fitting():
+            raise NotImplementedError(str(self))
+
+    def predict_for_experiment(self, experiment_name, peptides):
+        """
+        A more convenient prediction method to implement.
+
+        Subclasses should override this method or predict().
+
+        Returns
+        ------------
+
+        A dict of column name -> list of predictions for each peptide
+
+        """
+        assert self.predict != PresentationComponentModel.predict, (
+            "Must override predict_for_experiment() or predict()")
+        peptides_df = pandas.DataFrame({
+            'peptide': peptides,
+        })
+        peptides_df["experiment_name"] = experiment_name
+        return self.predict(peptides_df)
+
+    def predict(self, peptides_df):
+        """
+        Subclasses can override either this or predict_for_experiment.
+
+        This is the high-level predict method that users should call.
+
+        This convenience method groups the peptides_df by experiment
+        and calls predict_for_experiment on each experiment.
+        """
+        assert (
+            self.predict_for_experiment !=
+            PresentationComponentModel.predict_for_experiment)
+        assert 'experiment_name' in peptides_df.columns
+        assert 'peptide' in peptides_df.columns
+
+        if self.cached_predictions is None:
+            cache_key = None
+            cached_result = None
+        else:
+            cache_key = dataframe_cryptographic_hash(peptides_df)
+            cached_result = self.cached_predictions.get(cache_key)
+        if cached_result is not None:
+            print("Cache hit in predict: %s" % str(self))
+            return cached_result
+        else:
+            print("Cache miss in predict: %s" % str(self))
+
+        grouped = peptides_df.groupby("experiment_name")
+
+        if len(grouped) == 1:
+            print("%s : using single-experiment predict optimization" % (
+                str(self)))
+            return_value = pandas.DataFrame(
+                self.predict_for_experiment(
+                    str(peptides_df.iloc[0].experiment_name),
+                    peptides_df.peptide.values))
+            assert len(return_value) == len(peptides_df), str(self)
+            assert_no_null(return_value, str(self))
+        else:
+            peptides_df = (
+                peptides_df[["experiment_name", "peptide"]]
+                .reset_index(drop=True))
+            columns = self.column_names()
+            result_df = peptides_df.copy()
+            for col in columns:
+                result_df[col] = numpy.nan
+            for (experiment_name, sub_df) in grouped:
+                assert (
+                    result_df.loc[sub_df.index, "experiment_name"] ==
+                    experiment_name).all()
+
+                unique_peptides = numpy.unique(sub_df.peptide.values)
+
+                result_dict = self.predict_for_experiment(
+                    experiment_name, unique_peptides)
+
+                for col in columns:
+                    assert len(result_dict[col]) == len(unique_peptides), (
+                        "Final model input %s: wrong number of predictions "
+                        "%d (expected %d) for col %s:\n%s\n"
+                        "Input was: experiment: %s, peptides:\n%s" % (
+                            str(self),
+                            len(result_dict[col]),
+                            len(unique_peptides),
+                            col,
+                            result_dict[col],
+                            experiment_name,
+                            unique_peptides))
+                    prediction_series = pandas.Series(
+                        result_dict[col],
+                        index=unique_peptides)
+                    prediction_values = (
+                        prediction_series.ix[sub_df.peptide.values]).values
+                    result_df.loc[
+                        sub_df.index, col
+                    ] = prediction_values
+
+            assert len(result_df) == len(peptides_df), "%s != %s" % (
+                len(result_df),
+                len(peptides_df))
+            return_value = result_df[columns]
+        if self.cached_predictions is not None:
+            self.cached_predictions[cache_key] = return_value
+        return return_value
+
+    def reset_cache(self):
+        for key in PresentationComponentModel.cache_fields:
+            obj_type = type(getattr(self, key))
+            setattr(self, key, obj_type())
+
+    def clone(self):
+        """
+        Copy this object so that the original and copy can be fit
+        independently.
+        """
+        if self.requires_fitting():
+            # shallow copy won't work here, subclass must override.
+            raise NotImplementedError(str(self))
+        result = copy(self)
+
+        # We do not want to share a cache with the clone.
+        result.reset_cache()
+        return result
+
+    def get_fit(self):
+        if self.requires_fitting():
+            raise NotImplementedError(str(self))
+        return None
+
+    def restore_fit(self, fit_info):
+        if self.requires_fitting():
+            raise NotImplementedError(str(self))
+        assert fit_info is None, (str(self), str(fit_info))
--- a/mhcflurry/antigen_presentation/presentation_model.py
+++ b/mhcflurry/antigen_presentation/presentation_model.py
+import collections
+import time
+from copy import copy
+import logging
+
+import pandas
+import numpy
+
+from sklearn.base import clone
+from sklearn.model_selection import StratifiedKFold
+from sklearn.linear_model import LogisticRegression
+
+from ..common import assert_no_null, drop_nulls_and_warn
+
+EVAL_CONTEXT = {
+    'log': numpy.log,
+    'exp': numpy.exp,
+}
+
+
+def build_presentation_models_from_formulas(term_dict, formulas, **kwargs):
+    """
+    Convenience function for creating multiple final models based on
+    shared terms.
+
+    Parameters
+    ------------
+    term_dict : dict of string -> (
+            list of PresentationComponentModel,
+            list of string)
+        Terms are named with arbitrary strings (e.g. "A_ms") and are
+        associated with some presentation component models and some
+        expressions (e.g. ["log(affinity_percentile_rank + .001)"]).
+
+    formulas : list of string
+        A formula is a string containing terms separated by "+". For example:
+        "A_ms + A_cleavage + A_expression".
+
+    **kwargs : dict
+        Passed to PresentationModel constructor
+
+    Returns
+    ------------
+
+    dict of string -> PresentationModel
+
+    The keys of the result dict are formulas, and the values are (untrained)
+    PresentationModel instances.
+    """
+    result = collections.OrderedDict()
+
+    for formula in formulas:
+        term_names = [x.strip() for x in formula.split("+")]
+        inputs = []
+        expressions = []
+        for name in term_names:
+            (term_inputs, term_expressions) = term_dict[name]
+            inputs.extend(term_inputs)
+            expressions.extend(term_expressions)
+        assert len(set(expressions)) == len(expressions)
+        presentation_model = PresentationModel(
+            inputs,
+            expressions,
+            **kwargs)
+        result[formula] = presentation_model
+    return result
+
+
+class PresentationModel(object):
+    """
+    A predictor for whether a peptide is detected via mass-spec. Uses
+    "final model inputs" (e.g. expression, cleavage, mhc affinity) which
+    themselves may need to be fit.
+
+    Parameters
+    ------------
+    component_models : list of PresentationComponentModel
+
+    feature_expressions : list of string
+        Expressions to use to generate features for the final model based
+        on the columns generated by the final model inputs.
+
+        Example: ["log(expression + .01)"]
+
+    decoy_strategy : DecoyStrategy
+        Decoy strategy to use for training the final model. (The final
+        model inputs handle their own decoys.)
+
+    random-state : int
+        Random state to use for picking cross validation folds. We are
+        careful to be deterministic here (i.e. same folds used if the
+        random state is the same) because we want to have cache hits
+        for final model inputs that are being used more than once in
+        multiple final models fit to the same data.
+
+    ensemble_size : int
+        If specified, train an ensemble of each final model input, and use
+        the out-of-bag predictors to generate predictions to fit the final
+        model. If not specified (default), a two-fold fit is used.
+
+    """
+    def __init__(
+            self,
+            component_models,
+            feature_expressions,
+            decoy_strategy,
+            predictor=LogisticRegression(),
+            random_state=0,
+            ensemble_size=None):
+        columns = set()
+        self.component_models_require_fitting = False
+        for component_model in component_models:
+            model_cols = component_model.column_names()
+            assert not columns.intersection(model_cols), model_cols
+            columns.update(model_cols)
+            if component_model.requires_fitting():
+                self.component_models_require_fitting = True
+
+        self.component_models = component_models
+        self.ensemble_size = ensemble_size
+
+        self.feature_expressions = feature_expressions
+        self.decoy_strategy = decoy_strategy
+        self.random_state = random_state
+        self.predictor = predictor
+
+        self.trained_component_models = None
+        self.presentation_models_predictors = None
+        self.fit_experiments = None
+
+    @property
+    def has_been_fit(self):
+        return self.fit_experiments is not None
+
+    def clone(self):
+        return copy(self)
+
+    def reset_cache(self):
+        for model in self.component_models:
+            model.reset_cache()
+        if self.trained_component_models is not None:
+            for models in self.trained_component_models:
+                for model in models:
+                    model.reset_cache()
+
+    def fit(self, hits_df):
+        """
+        Train the final model and its inputs (if necessary).
+
+        Parameters
+        -----------
+        hits_df : pandas.DataFrame
+            dataframe of hits with columns 'experiment_name' and 'peptide'
+        """
+        start = time.time()
+        assert not self.has_been_fit
+        assert 'experiment_name' in hits_df.columns
+        assert 'peptide' in hits_df.columns
+
+        assert self.trained_component_models is None
+        assert self.presentation_models_predictors is None
+
+        hits_df = hits_df.reset_index(drop=True)
+        self.fit_experiments = set(hits_df.experiment_name.unique())
+
+        if self.component_models_require_fitting and not self.ensemble_size:
+            print("Using 2-fold fit.")
+            # Use two fold CV to train model inputs then final models.
+            cv = StratifiedKFold(
+                n_splits=2, shuffle=True, random_state=self.random_state)
+
+            self.trained_component_models = []
+            self.presentation_models_predictors = []
+            for (fold1, fold2) in cv.split(hits_df, hits_df.experiment_name):
+                assert len(fold1) > 0
+                assert len(fold2) > 0
+                model_input_training_hits_df = hits_df.iloc[fold1]
+
+                self.trained_component_models.append([])
+                for sub_model in self.component_models:
+                    sub_model = sub_model.clone_and_fit(
+                        model_input_training_hits_df)
+                    self.trained_component_models[-1].append(sub_model)
+
+                final_predictor = self.fit_final_predictor(
+                    hits_df.iloc[fold2],
+                    self.trained_component_models[-1])
+                self.presentation_models_predictors.append(final_predictor)
+        elif self.component_models_require_fitting:
+            print("Using ensemble fit, ensemble size: %d" % self.ensemble_size)
+            raise NotImplementedError()
+
+            '''
+            hits_in_train = pandas.DataFrame(index=hits_df.index)
+            out_of_sample_predictions = [
+                []
+                for _ in self.component_models
+            ]
+            for i in range(self.ensemble_size):
+                print("Training ensemble %d / %d" % (
+                    i + 1, self.ensemble_size))
+
+                train_mask = numpy.random.randint(2, size=len(hits_df))
+
+                model_input_training_hits_df = hits_df.ix[train_mask]
+                presentation_model_training_hits_df = hits_df.ix[~train_mask]
+
+                hits_and_decoys_df = make_hits_and_decoys_df(
+                    presentation_model_training_hits_df,
+                    self.decoy_strategy)
+
+                self.trained_component_models.append([])
+                out_of_sample_predictions.append([])
+                for sub_model in self.component_models:
+                    sub_model = sub_model.clone_and_fit(
+                        model_input_training_hits_df)
+                    self.trained_component_models[-1].append(sub_model)
+
+                    predictions = sub_model.predict(presentation_model_training_hits_df)
+                    for (col, values) in predictions.items():
+                        presentation_model_training_hits_df[col] = values
+                    out_of_sample_predictions[-1].append()
+
+            hits_and_decoys_df = make_hits_and_decoys_df(
+                hits_df,
+                self.decoy_strategy)
+
+            for sub_model in component_models:
+                predictions = sub_model.predict(hits_and_decoys_df)
+                for (col, values) in predictions.items():
+                    hits_and_decoys_df[col] = values
+
+            (x, y) = self.make_features_and_target(hits_and_decoys_df)
+            print("Training final model predictor on data of shape %s" % (
+                str(x.shape)))
+            final_predictor = clone(self.predictor)
+            final_predictor.fit(x.values, y.values)
+            self.presentation_models_predictors.append(final_predictor)
+            '''
+        else:
+            print("Using single-fold fit.")
+            # Use full data set to train final model.
+            final_predictor = self.fit_final_predictor(
+                hits_df,
+                self.component_models)
+
+            assert not self.presentation_models_predictors
+            self.presentation_models_predictors = [final_predictor]
+            self.trained_component_models = [
+                self.component_models
+            ]
+
+        assert len(self.presentation_models_predictors) == \
+            len(self.trained_component_models)
+
+        print("Fit final model in %0.1f sec." % (time.time() - start))
+
+        # Decoy strategy is no longer required after fitting.
+        self.decoy_strategy = None
+
+    def fit_final_predictor(self, hits_df, component_models):
+        """
+        Private helper method.
+        """
+        hits_and_decoys_df = make_hits_and_decoys_df(
+            hits_df,
+            self.decoy_strategy)
+
+        for sub_model in component_models:
+            predictions = sub_model.predict(hits_and_decoys_df)
+            for (col, values) in predictions.items():
+                hits_and_decoys_df[col] = values
+
+        (x, y) = self.make_features_and_target(hits_and_decoys_df)
+        print("Training final model predictor on data of shape %s" % (
+            str(x.shape)))
+        final_predictor = clone(self.predictor)
+        final_predictor.fit(x.values, y.values)
+
+        return final_predictor
+
+    def evaluate_expressions(self, input_df):
+        result = pandas.DataFrame()
+        for expression in self.feature_expressions:
+            values = eval(expression, EVAL_CONTEXT, input_df)
+            assert len(values) == len(input_df), expression
+            if hasattr(values, 'values'):
+                values = values.values
+            series = pandas.Series(values)
+            assert_no_null(series, expression)
+            result[expression] = series
+        assert len(result) == len(input_df)
+        return result
+
+    def make_features_and_target(self, hits_and_decoys_df):
+        """
+        Private helper method.
+        """
+        assert 'peptide' in hits_and_decoys_df
+        assert 'hit' in hits_and_decoys_df
+
+        df = self.evaluate_expressions(hits_and_decoys_df)
+        df['hit'] = hits_and_decoys_df.hit.values
+        new_df = drop_nulls_and_warn(df, hits_and_decoys_df)
+        y = new_df["hit"]
+        del new_df["hit"]
+        return (new_df, y)
+
+    def predict_to_df(self, peptides_df):
+        """
+        Predict for the given peptides_df, which should have columns
+        'experiment_name' and 'peptide'.
+
+        Returns a dataframe giving the predictions. If this final
+        model's inputs required fitting and therefore the final model
+        has two predictors trained each fold, the resulting dataframe
+        will have predictions for both final model predictors.
+        """
+        assert self.has_been_fit
+        assert 'experiment_name' in peptides_df.columns
+        assert 'peptide' in peptides_df.columns
+        assert len(self.presentation_models_predictors) == \
+            len(self.trained_component_models)
+
+        # peptides_df = peptides_df.reset_index(drop=True)
+        prediction_cols = []
+        presentation_model_predictions = {}
+        zipped = enumerate(
+            zip(
+                self.trained_component_models,
+                self.presentation_models_predictors))
+        for (i, (component_models, presentation_model_predictor)) in zipped:
+            df = pandas.DataFrame()
+            for sub_model in component_models:
+                start_t = time.time()
+                predictions = sub_model.predict(peptides_df)
+                print("Input '%s' generated %d predictions in %0.2f sec." % (
+                    sub_model, len(peptides_df), (time.time() - start_t)))
+                for (col, values) in predictions.items():
+                    values = pandas.Series(values)
+                    assert_no_null(values)
+                    df[col] = values
+
+            x_df = self.evaluate_expressions(df)
+            assert_no_null(x_df)
+
+            """
+            with pandas.option_context('mode.use_inf_as_null', True):
+                null_x = x_df.ix[x_df.isnull().sum(axis=1) > 0]
+                if len(null_x) > 0:
+                    null_x = null_x.copy()
+                    null_x["peptide"] = peptides_df.ix[null_x.index, "peptide"]
+                    null_x["experiment_name"] = peptides_df.ix[
+                        null_x.index, "experiment_name"
+                    ]
+                    raise ValueError(
+                        "Null values in features:\n%s\ninputs:\n%s" % (
+                            str(null_x), str(df.iloc[null_x.index])))
+            """
+
+            prediction_col = "Prediction (Model %d)" % (i + 1)
+            assert prediction_col not in presentation_model_predictions
+            presentation_model_predictions[prediction_col] = (
+                presentation_model_predictor
+                .predict_proba(x_df.values)[:, 1])
+            prediction_cols.append(prediction_col)
+
+        if len(prediction_cols) == 1:
+            presentation_model_predictions["Prediction"] = (
+                presentation_model_predictions[prediction_cols[0]])
+            del presentation_model_predictions[prediction_cols[0]]
+        else:
+            presentation_model_predictions["Prediction"] = numpy.mean(
+                [presentation_model_predictions[col] for col in prediction_cols],
+                axis=0)
+
+        return pandas.DataFrame(presentation_model_predictions)
+
+    def predict(self, peptides_df):
+        """
+        Predict for the given peptides_df, which should have columns
+        'experiment_name' and 'peptide'.
+
+        Returns an array of floats giving the predictions for each
+        row in peptides_df. If the final model was trained in two
+        folds, the predictions from the two final model predictors
+        are averaged.
+        """
+        assert self.has_been_fit
+        df = self.predict_to_df(peptides_df)
+        return df.Prediction.values
+
+    def score_from_peptides_df(
+            self, peptides_df, include_hit_indices=True):
+        """
+        Given a DataFrame with columns 'peptide', 'experiment_name', and
+        'hit', calculate the PPV score. Return a dict of scoring info.
+
+        If include_hit_indices is True (default), then the indices the
+        hits occur in after sorting by prediction score, is also returned.
+        The top predicted peptide will have index 0.
+        """
+        assert self.has_been_fit
+        assert 'peptide' in peptides_df.columns
+        assert 'experiment_name' in peptides_df.columns
+        assert 'hit' in peptides_df.columns
+
+        peptides_df["prediction"] = self.predict(peptides_df)
+        # print(sorted(peptides_df.prediction[peptides_df.hit].values))
+        top_n = float(peptides_df.hit.sum())
+
+        if not include_hit_indices:
+            top = peptides_df.nlargest(top_n, "prediction")
+            result = {
+                'score': top.hit.mean()
+            }
+        else:
+            ranks = peptides_df.prediction.rank(ascending=False)
+            result = {
+                'hit_indices': numpy.sort(ranks[peptides_df.hit > 0].values),
+                'total_peptides': len(peptides_df),
+            }
+            result['score'] = (
+                numpy.sum(result['hit_indices'] <= top_n) / top_n)
+        return result
+
+    def score_from_hits_and_decoy_strategy(self, hits_df, decoy_strategy):
+        """
+        Compute positive predictive value on the given hits_df.
+
+        Parameters
+        -----------
+        hits_df : pandas.DataFrame
+            dataframe of hits with columns 'experiment_name' and 'peptide'
+
+        decoy_strategy : DecoyStrategy
+            Strategy for selecting decoys
+
+        Returns
+        -----------
+
+        dict of scoring info, with keys 'score', 'hit_indices', and
+        'total_peptides'
+        """
+        assert self.has_been_fit
+        peptides_df = make_hits_and_decoys_df(
+            hits_df,
+            decoy_strategy)
+        return self.score_from_peptides_df(peptides_df)
+
+    def get_fit(self):
+        """
+        Return fit (i.e. trained) parameters.
+        """
+        assert self.has_been_fit
+        result = {
+            'trained_component_model_fits': [],
+            'presentation_models_predictors': (
+                self.presentation_models_predictors),
+            'fit_experiments': self.fit_experiments,
+            'feature_expressions': self.feature_expressions,
+        }
+        for models in self.trained_component_models:
+            result['trained_component_model_fits'].append([
+                component_model.get_fit()
+                for component_model in models
+            ])
+        return result
+
+    def restore_fit(self, fit):
+        """
+        Restore fit parameters.
+
+        Parameters
+        ------------
+        fit : object
+            What was returned from a call to get_fit().
+
+        """
+        assert not self.has_been_fit
+        fit = dict(fit)
+        self.presentation_models_predictors = (
+            fit.pop('presentation_models_predictors'))
+        self.fit_experiments = fit.pop('fit_experiments')
+        model_input_fits = fit.pop('trained_component_model_fits')
+        feature_expressions = fit.pop('feature_expressions', [])
+        if feature_expressions != self.feature_expressions:
+            logging.warn(
+                "Feature expressions restored from fit: '%s' do not match "
+                "those of this PresentationModel: '%s'" % (
+                    feature_expressions, self.feature_expressions))
+        assert not fit, "Unhandled data in fit: %s" % fit
+        assert len(model_input_fits) == (
+            2 if self.component_models_require_fitting else 1), (
+            "Wrong length: %s" % model_input_fits)
+
+        self.trained_component_models = []
+        for model_input_fits_for_fold in model_input_fits:
+            self.trained_component_models.append([])
+            for (sub_model, sub_model_fit) in zip(
+                    self.component_models,
+                    model_input_fits_for_fold):
+                sub_model = sub_model.clone_and_restore_fit(sub_model_fit)
+                self.trained_component_models[-1].append(
+                    sub_model)
+
+        assert len(self.trained_component_models) == (
+            2 if self.component_models_require_fitting else 1), (
+            "Wrong length: %s" % self.trained_component_models)
+
+
+def make_hits_and_decoys_df(hits_df, decoy_strategy):
+    """
+    Given some hits (with columns 'experiment_name' and 'peptide'),
+    and a decoy strategy, return a "peptides_df", which has columns
+    'experiment_name', 'peptide', and 'hit.'
+    """
+    hits_df = hits_df.copy()
+    hits_df["hit"] = 1
+
+    decoys_df = decoy_strategy.decoys(hits_df)
+    decoys_df["hit"] = 0
+
+    peptides_df = pandas.concat(
+        [hits_df, decoys_df],
+        ignore_index=True)
+    return peptides_df
--- a/mhcflurry/common.py
+++ b/mhcflurry/common.py
@@ -16,8 +16,14 @@ from __future__ import print_function, division, absolute_import
 from math import exp, log
 import itertools
 from collections import defaultdict
+import logging
+import hashlib
+import time
+import sys
+from os import environ

 import numpy as np
+import pandas


 class UnsupportedAllele(Exception):
@@ -127,3 +133,107 @@ def shuffle_split_list(indices, fraction=0.5):
        left_count = n - 1

    return indices[:left_count], indices[left_count:]
+
+
+def dataframe_cryptographic_hash(df):
+    """
+    Return a cryptographic (i.e. collisions extremely unlikely) hash
+    of a dataframe. Suitible for using as a cache key.
+
+    Parameters
+    -----------
+    df : pandas.DataFrame or pandas.Series
+
+    Returns
+    -----------
+    string
+    """
+    start = time.time()
+    result = hashlib.sha1(df.to_msgpack()).hexdigest()
+    print("Generated dataframe hash in %0.2f sec" % (time.time() - start))
+    return result
+
+
+def freeze_object(o):
+    """
+    Recursively convert nested dicts and lists into frozensets and tuples.
+    """
+    if isinstance(o, dict):
+        return frozenset({k: freeze_object(v) for k, v in o.items()}.items())
+    if isinstance(o, list):
+        return tuple(freeze_object(v) for v in o)
+    return o
+
+
+def configure_logging(verbose=False):
+    level = logging.DEBUG if verbose else logging.INFO
+    logging.basicConfig(
+        format="%(asctime)s.%(msecs)d %(levelname)s %(module)s - %(funcName)s:"
+        " %(message)s",
+        datefmt="%Y-%m-%d %H:%M:%S",
+        stream=sys.stderr,
+        level=level)
+
+
+def describe_nulls(df, related_df_with_same_index_to_describe=None):
+    """
+    Return a string describing the positions of any nan or inf values
+    in a dataframe.
+
+    If related_df_with_same_index_to_describe is specified, it should be
+    a dataframe with the same index as df. Positions corresponding to
+    where df has null values will also be printed from this dataframe.
+    """
+    if isinstance(df, pandas.Series):
+        df = df.to_frame()
+    with pandas.option_context('mode.use_inf_as_null', True):
+        null_counts_by_col = df.isnull().sum(axis=0)
+        null_rows = df.isnull().sum(axis=1) > 0
+        return (
+            "Columns with nulls:\n%s, related rows with nulls:\n%s, "
+            "full df:\n%s" % (
+                null_counts_by_col.index[null_counts_by_col > 0],
+                related_df_with_same_index_to_describe.ix[null_rows]
+                if related_df_with_same_index_to_describe is not None
+                else "(n/a)",
+                str(df.ix[null_rows])))
+
+
+def raise_or_debug(exception):
+    """
+    Raise the exception unless the NEON_DEBUG environment variable is set,
+    in which case drop into ipython debugger (ipdb).
+    """
+    if environ.get("NEON_DEBUG"):
+        import ipdb
+        ipdb.set_trace()
+    raise exception
+
+
+def assert_no_null(df, message=''):
+    """
+    Raise an assertino error if the given DataFrame has any nan or inf values.
+    """
+    # start = time.time()
+    with pandas.option_context('mode.use_inf_as_null', True):
+        if df.count().sum() != df.size:
+            raise_or_debug(
+                AssertionError(
+                    "%s %s" % (message, describe_nulls(df))))
+    # print("Null check completed in %0.2f sec" % (time.time() - start))
+
+
+def drop_nulls_and_warn(df, related_df_with_same_index_to_describe=None):
+    """
+    Return a new DataFrame that is a copy of the given DataFrame where any
+    rows with nulls have been removed, and a warning about them logged.
+    """
+    with pandas.option_context('mode.use_inf_as_null', True):
+        new_df = df.dropna()
+    if df.shape != new_df.shape:
+        logging.warn(
+            "Dropped rows with null or inf: %s -> %s:\n%s" % (
+                df.shape,
+                new_df.shape,
+                describe_nulls(df, related_df_with_same_index_to_describe)))
+    return new_df
--- a/test/test_antigen_presentation.py
+++ b/test/test_antigen_presentation.py
+from nose.tools import eq_, assert_less
+
+import numpy
+from numpy import testing
+import pandas
+from mhcflurry import amino_acid
+from mhcflurry.antigen_presentation import presentation_component_models
+
+
+def make_random_peptides(num, length=9):
+    return [
+        ''.join(peptide_sequence)
+        for peptide_sequence in
+        numpy.random.choice(
+            amino_acid.common_amino_acid_letters, size=(num, length))
+    ]
+
+
+PEPTIDES = make_random_peptides(100, 9)
+
+TRANSCRIPTS = [
+    "transcript-%d" % i
+    for i in range(1, 10)
+]
+
+EXPERIMENT_TO_ALLELES = {
+    'exp1': ['HLA-A*01:01'],
+    'exp2': ['HLA-A*02:01', 'HLA-B*51:01'],
+}
+
+EXPERIMENT_TO_EXPRESSION_GROUP = {
+    'exp1': 'group1',
+    'exp2': 'group2',
+}
+
+EXPERESSION_GROUPS = sorted(set(EXPERIMENT_TO_EXPRESSION_GROUP.values()))
+
+TRANSCIPTS_DF = pandas.DataFrame(index=PEPTIDES, columns=EXPERESSION_GROUPS)
+TRANSCIPTS_DF[:] = numpy.random.choice(TRANSCRIPTS, size=TRANSCIPTS_DF.shape)
+
+PEPTIDES_AND_TRANSCRIPTS_DF = TRANSCIPTS_DF.stack().to_frame().reset_index()
+PEPTIDES_AND_TRANSCRIPTS_DF.columns = ["peptide", "group", "transcript"]
+del PEPTIDES_AND_TRANSCRIPTS_DF["group"]
+
+
+def hit_criterion(experiment_name, peptide):
+    return 'A' in peptide
+
+
+PEPTIDES_DF = pandas.DataFrame({"peptide": PEPTIDES})
+PEPTIDES_DF["experiment_name"] = "exp1"
+PEPTIDES_DF["hit"] = [
+    hit_criterion(row.experiment_name, row.peptide)
+    for _, row in
+    PEPTIDES_DF.iterrows()
+]
+
+HITS_DF = PEPTIDES_DF.ix[PEPTIDES_DF.hit].reset_index().copy()
+del HITS_DF["hit"]
+
+
+def test_mhcflurry_trained_on_hits():
+    model = presentation_component_models.MHCflurryTrainedOnHits(
+        "basic",
+        experiment_to_alleles=EXPERIMENT_TO_ALLELES,
+        experiment_to_expression_group=EXPERIMENT_TO_EXPRESSION_GROUP,
+        transcripts=TRANSCIPTS_DF,
+        peptides_and_transcripts=PEPTIDES_AND_TRANSCRIPTS_DF,
+        random_peptides_for_percent_rank=make_random_peptides(10000, 9),
+    )
+    model.fit(HITS_DF)
+
+    peptides = PEPTIDES_DF.copy()
+    predictions = model.predict(peptides)
+    peptides["affinity"] = predictions["mhcflurry_basic_affinity"]
+    peptides["percent_rank"] = predictions["mhcflurry_basic_percentile_rank"]
+    assert_less(
+        peptides.affinity[peptides.hit].mean(),
+        peptides.affinity[~peptides.hit].mean())
+    assert_less(
+        peptides.percent_rank[peptides.hit].mean(),
+        peptides.percent_rank[~peptides.hit].mean())