Skip to content
Snippets Groups Projects
class1_affinity_predictor.py 57.8 KiB
Newer Older
Tim O'Donnell's avatar
Tim O'Donnell committed
import collections
import hashlib
Tim O'Donnell's avatar
Tim O'Donnell committed
import json
Tim O'Donnell's avatar
Tim O'Donnell committed
import logging
import time
import warnings
Tim O'Donnell's avatar
Tim O'Donnell committed
from os.path import join, exists, abspath
from os import mkdir, environ
Tim O'Donnell's avatar
Tim O'Donnell committed
from socket import gethostname
from getpass import getuser
Tim O'Donnell's avatar
Tim O'Donnell committed
from functools import partial
Tim O'Donnell's avatar
Tim O'Donnell committed
from six import string_types
Tim O'Donnell's avatar
Tim O'Donnell committed

import numpy
Tim O'Donnell's avatar
Tim O'Donnell committed
from numpy.testing import assert_equal
Tim O'Donnell's avatar
Tim O'Donnell committed
import pandas

import mhcnames
Tim O'Donnell's avatar
Tim O'Donnell committed

Tim O'Donnell's avatar
Tim O'Donnell committed
from .class1_neural_network import Class1NeuralNetwork
Tim O'Donnell's avatar
Tim O'Donnell committed
from .common import random_peptides, positional_frequency_matrix
from .downloads import get_default_class1_models_dir
Tim O'Donnell's avatar
Tim O'Donnell committed
from .encodable_sequences import EncodableSequences
from .percent_rank_transform import PercentRankTransform
from .regression_target import to_ic50
Tim O'Donnell's avatar
Tim O'Donnell committed
from .version import __version__
from .ensemble_centrality import CENTRALITY_MEASURES
Tim O'Donnell's avatar
Tim O'Donnell committed
from .allele_encoding import AlleleEncoding
Tim O'Donnell's avatar
Tim O'Donnell committed


Tim O'Donnell's avatar
Tim O'Donnell committed
# Default function for combining predictions across models in an ensemble.
# See ensemble_centrality.py for other options.
DEFAULT_CENTRALITY_MEASURE = "mean"

# Any value > 0 will result in attempting to optimize models after loading.
# Read once, at import time, from the MHCFLURRY_OPTIMIZATION_LEVEL environment
# variable; falls back to 1 when the variable is not set.
OPTIMIZATION_LEVEL = int(environ.get("MHCFLURRY_OPTIMIZATION_LEVEL", 1))
Tim O'Donnell's avatar
Tim O'Donnell committed

Tim O'Donnell's avatar
Tim O'Donnell committed
class Class1AffinityPredictor(object):
    """
    High-level interface for peptide/MHC I binding affinity prediction.

    This class manages low-level `Class1NeuralNetwork` instances, each of which
    wraps a single Keras network. The purpose of `Class1AffinityPredictor` is to
    implement ensembles, handling of multiple alleles, and predictor loading and
    saving. It also provides a place to keep track of metadata like prediction
    histograms for percentile rank calibration.
    """

    def __init__(
            self,
Tim O'Donnell's avatar
Tim O'Donnell committed
            allele_to_allele_specific_models=None,
            class1_pan_allele_models=None,
Tim O'Donnell's avatar
Tim O'Donnell committed
            manifest_df=None,
            allele_to_percent_rank_transform=None,
            metadata_dataframes=None):
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
        Parameters
        ----------
Tim O'Donnell's avatar
Tim O'Donnell committed
        allele_to_allele_specific_models : dict of string -> list of `Class1NeuralNetwork`
Tim O'Donnell's avatar
Tim O'Donnell committed
            Ensemble of single-allele models to use for each allele.

Tim O'Donnell's avatar
Tim O'Donnell committed
        class1_pan_allele_models : list of `Class1NeuralNetwork`
Tim O'Donnell's avatar
Tim O'Donnell committed
            Ensemble of pan-allele models.
Tim O'Donnell's avatar
Tim O'Donnell committed

        allele_to_sequence : dict of string -> string
Tim O'Donnell's avatar
Tim O'Donnell committed
            MHC allele name to fixed-length amino acid sequence (sometimes
            referred to as the pseudosequence). Required only if
            class1_pan_allele_models is specified.
Tim O'Donnell's avatar
Tim O'Donnell committed
        
Tim O'Donnell's avatar
Tim O'Donnell committed
        manifest_df : `pandas.DataFrame`, optional
Tim O'Donnell's avatar
Tim O'Donnell committed
            Must have columns: model_name, allele, config_json, model.
            Only required if you want to update an existing serialization of a
Tim O'Donnell's avatar
Tim O'Donnell committed
            Class1AffinityPredictor. Otherwise this dataframe will be generated
            automatically based on the supplied models.
Tim O'Donnell's avatar
Tim O'Donnell committed
        allele_to_percent_rank_transform : dict of string -> `PercentRankTransform`, optional
            `PercentRankTransform` instances to use for each allele

        metadata_dataframes : dict of string -> pandas.DataFrame, optional
            Optional additional dataframes to write to the models dir when
            save() is called. Useful for tracking provenance.
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
Tim O'Donnell's avatar
Tim O'Donnell committed

Tim O'Donnell's avatar
Tim O'Donnell committed
        if allele_to_allele_specific_models is None:
            allele_to_allele_specific_models = {}
        if class1_pan_allele_models is None:
            class1_pan_allele_models = []

        self.allele_to_sequence = (
            dict(allele_to_sequence)
            if allele_to_sequence is not None else None)  # make a copy

Tim O'Donnell's avatar
Tim O'Donnell committed
        self._master_allele_encoding = None
Tim O'Donnell's avatar
Tim O'Donnell committed
        if class1_pan_allele_models:
Tim O'Donnell's avatar
Tim O'Donnell committed

Tim O'Donnell's avatar
Tim O'Donnell committed
        self.allele_to_allele_specific_models = allele_to_allele_specific_models
        self.class1_pan_allele_models = class1_pan_allele_models
        self._manifest_df = manifest_df
Tim O'Donnell's avatar
Tim O'Donnell committed

        if not allele_to_percent_rank_transform:
            allele_to_percent_rank_transform = {}
        self.allele_to_percent_rank_transform = allele_to_percent_rank_transform
Tim O'Donnell's avatar
Tim O'Donnell committed
        self.metadata_dataframes = (
            dict(metadata_dataframes) if metadata_dataframes else {})
        self.optimization_info = {}
Tim O'Donnell's avatar
Tim O'Donnell committed
        assert isinstance(self.allele_to_allele_specific_models, dict)
        assert isinstance(self.class1_pan_allele_models, list)

    @property
    def manifest_df(self):
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
        A pandas.DataFrame describing the models included in this predictor.

        Based on:
        - self.class1_pan_allele_models
        - self.allele_to_allele_specific_models

        Returns
        -------
        pandas.DataFrame
        """
        if self._manifest_df is None:
Tim O'Donnell's avatar
Tim O'Donnell committed
            rows = []
            for (i, model) in enumerate(self.class1_pan_allele_models):
                rows.append((
                    self.model_name("pan-class1", i),
                    "pan-class1",
Tim O'Donnell's avatar
Tim O'Donnell committed
                    json.dumps(model.get_config()),
Tim O'Donnell's avatar
Tim O'Donnell committed
                    model
                ))
            for (allele, models) in self.allele_to_allele_specific_models.items():
Tim O'Donnell's avatar
Tim O'Donnell committed
                for (i, model) in enumerate(models):
                    rows.append((
                        self.model_name(allele, i),
                        allele,
Tim O'Donnell's avatar
Tim O'Donnell committed
                        json.dumps(model.get_config()),
Tim O'Donnell's avatar
Tim O'Donnell committed
                        model
                    ))
            self._manifest_df = pandas.DataFrame(
Tim O'Donnell's avatar
Tim O'Donnell committed
                rows,
                columns=["model_name", "allele", "config_json", "model"])
        return self._manifest_df

    def clear_cache(self):
        """
        Clear values cached based on the neural networks in this predictor.

        The cached values live in the dict self._cache, which is emptied in
        place.

        Users should call this after mutating any of the following:
            - self.class1_pan_allele_models
            - self.allele_to_allele_specific_models
            - self.allele_to_sequence

        Methods that mutate these instance variables will call this method on
        their own if needed.
        """
        self._cache.clear()

    @property
    def neural_networks(self):
        """
        List of the neural networks in the ensemble.

        Returns
        -------
Tim O'Donnell's avatar
Tim O'Donnell committed
        list of `Class1NeuralNetwork`
        """
        result = []
        for models in self.allele_to_allele_specific_models.values():
            result.extend(models)
        result.extend(self.class1_pan_allele_models)
        return result

    @classmethod
    def merge(cls, predictors):
        """
Tim O'Donnell's avatar
Tim O'Donnell committed
        Merge the ensembles of two or more `Class1AffinityPredictor` instances.

        Note: the resulting merged predictor will NOT have calibrated percentile
Tim O'Donnell's avatar
Tim O'Donnell committed
        ranks. Call `calibrate_percentile_ranks` on it if these are needed.
Tim O'Donnell's avatar
Tim O'Donnell committed
        predictors : sequence of `Class1AffinityPredictor`
Tim O'Donnell's avatar
Tim O'Donnell committed
        `Class1AffinityPredictor` instance

        """
        assert len(predictors) > 0
        if len(predictors) == 1:
            return predictors[0]

        allele_to_allele_specific_models = collections.defaultdict(list)
        class1_pan_allele_models = []
Tim O'Donnell's avatar
Tim O'Donnell committed
        allele_to_sequence = predictors[0].allele_to_sequence

        for predictor in predictors:
            for (allele, networks) in (
                    predictor.allele_to_allele_specific_models.items()):
                allele_to_allele_specific_models[allele].extend(networks)
            class1_pan_allele_models.extend(
                predictor.class1_pan_allele_models)

        return Class1AffinityPredictor(
            allele_to_allele_specific_models=allele_to_allele_specific_models,
            class1_pan_allele_models=class1_pan_allele_models,
    def merge_in_place(self, others):
        """
        Add the models present other predictors into the current predictor.

        Parameters
        ----------
        others : list of Class1AffinityPredictor
            Other predictors to merge into the current predictor.

        Returns
        -------
        list of string : names of newly added models
        """
        new_model_names = []
        original_manifest = self.manifest_df
        new_manifest_rows = []
        for predictor in others:
            for model in predictor.class1_pan_allele_models:
                model_name = self.model_name(
                    "pan-class1",
                    len(self.class1_pan_allele_models))
                row = pandas.Series(collections.OrderedDict([
                    ("model_name", model_name),
                    ("allele", "pan-class1"),
                    ("config_json", json.dumps(model.get_config())),
                    ("model", model),
                ])).to_frame().T
                new_manifest_rows.append(row)
                self.class1_pan_allele_models.append(model)
                new_model_names.append(model_name)

            for allele in predictor.allele_to_allele_specific_models:
                if allele not in self.allele_to_allele_specific_models:
                    self.allele_to_allele_specific_models[allele] = []
                current_models = self.allele_to_allele_specific_models[allele]
                for model in predictor.allele_to_allele_specific_models[allele]:
                    model_name = self.model_name(allele, len(current_models))
                    row = pandas.Series(collections.OrderedDict([
                        ("model_name", model_name),
                        ("allele", allele),
                        ("config_json", json.dumps(model.get_config())),
                        ("model", model),
                    ])).to_frame().T
                    new_manifest_rows.append(row)
                    current_models.append(model)
                    new_model_names.append(model_name)

        self._manifest_df = pandas.concat(
            [original_manifest] + new_manifest_rows,
            ignore_index=True)

        self.check_consistency()
    @property
    def supported_alleles(self):
        """
        Alleles for which predictions can be made.
        
        Returns
        -------
        list of string
        """
        if 'supported_alleles' not in self._cache:
            result = set(self.allele_to_allele_specific_models)
            if self.allele_to_sequence:
                result = result.union(self.allele_to_sequence)
            self._cache["supported_alleles"] = sorted(result)
        return self._cache["supported_alleles"]
    @property
    def supported_peptide_lengths(self):
        """
        (minimum, maximum) lengths of peptides supported by *all models*,
        inclusive.

        Returns
        -------
        (int, int) tuple

        """
        if 'supported_peptide_lengths' not in self._cache:
            length_ranges = set(
                network.supported_peptide_lengths
                for network in self.neural_networks)
            result = (
                max(lower for (lower, upper) in length_ranges),
                min(upper for (lower, upper) in length_ranges))
            self._cache["supported_peptide_lengths"] = result
        return self._cache["supported_peptide_lengths"]
    def check_consistency(self):
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
        Verify that self.manifest_df is consistent with:
        - self.class1_pan_allele_models
        - self.allele_to_allele_specific_models

        Currently only checks for agreement on the total number of models.

        Throws AssertionError if inconsistent.
        """
        num_models = len(self.class1_pan_allele_models) + sum(
            len(v) for v in self.allele_to_allele_specific_models.values())
        assert len(self.manifest_df) == num_models, (
            "Manifest seems out of sync with models: %d vs %d entries: "
            "\n%s\npan-allele: %s\nallele-specific: %s"% (
                len(self.manifest_df),
                num_models,
                str(self.manifest_df),
                str(self.class1_pan_allele_models),
                str(self.allele_to_allele_specific_models)))

    def save(self, models_dir, model_names_to_write=None, write_metadata=True):
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
Tim O'Donnell's avatar
Tim O'Donnell committed
        Serialize the predictor to a directory on disk. If the directory does
        not exist it will be created.
Tim O'Donnell's avatar
Tim O'Donnell committed
        
Tim O'Donnell's avatar
Tim O'Donnell committed
        The serialization format consists of a file called "manifest.csv" with
        the configurations of each Class1NeuralNetwork, along with per-network
        files giving the model weights. If there are pan-allele predictors in
Tim O'Donnell's avatar
Tim O'Donnell committed
        the ensemble, the allele sequences are also stored in the
Tim O'Donnell's avatar
Tim O'Donnell committed
        directory. There is also a small file "index.txt" with basic metadata:
        when the models were trained, by whom, on what host.
Tim O'Donnell's avatar
Tim O'Donnell committed
        
Tim O'Donnell's avatar
Tim O'Donnell committed
        Parameters
        ----------
        models_dir : string
Tim O'Donnell's avatar
Tim O'Donnell committed
            Path to directory. It will be created if it doesn't exist.
Tim O'Donnell's avatar
Tim O'Donnell committed
            
        model_names_to_write : list of string, optional
            Only write the weights for the specified models. Useful for
            incremental updates during training.

        write_metadata : boolean, optional
            Whether to write optional metadata
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
        self.check_consistency()
Tim O'Donnell's avatar
Tim O'Donnell committed

        if model_names_to_write is None:
            # Write all models
Tim O'Donnell's avatar
Tim O'Donnell committed
            model_names_to_write = self.manifest_df.model_name.values
Tim O'Donnell's avatar
Tim O'Donnell committed

Tim O'Donnell's avatar
Tim O'Donnell committed
        if not exists(models_dir):
            mkdir(models_dir)

Tim O'Donnell's avatar
Tim O'Donnell committed
        sub_manifest_df = self.manifest_df.loc[
Tim O'Donnell's avatar
Tim O'Donnell committed
            self.manifest_df.model_name.isin(model_names_to_write)
Tim O'Donnell's avatar
Tim O'Donnell committed
        ].copy()
Tim O'Donnell's avatar
Tim O'Donnell committed

Tim O'Donnell's avatar
Tim O'Donnell committed
        # Network JSON configs may have changed since the models were added,
        # for example due to changes to the allele representation layer.
        # So we update the JSON configs here also.
        updated_network_config_jsons = []
Tim O'Donnell's avatar
Tim O'Donnell committed
        for (_, row) in sub_manifest_df.iterrows():
Tim O'Donnell's avatar
Tim O'Donnell committed
            updated_network_config_jsons.append(
                json.dumps(row.model.get_config()))
Tim O'Donnell's avatar
Tim O'Donnell committed
            weights_path = self.weights_path(models_dir, row.model_name)
Tim O'Donnell's avatar
Tim O'Donnell committed
            Class1AffinityPredictor.save_weights(
                row.model.get_weights(), weights_path)
Tim O'Donnell's avatar
Tim O'Donnell committed
            logging.info("Wrote: %s", weights_path)
Tim O'Donnell's avatar
Tim O'Donnell committed
        sub_manifest_df["config_json"] = updated_network_config_jsons
        self.manifest_df.loc[
            sub_manifest_df.index,
            "config_json"
        ] = updated_network_config_jsons
Tim O'Donnell's avatar
Tim O'Donnell committed

        write_manifest_df = self.manifest_df[[
            c for c in self.manifest_df.columns if c != "model"
        ]]
        manifest_path = join(models_dir, "manifest.csv")
        write_manifest_df.to_csv(manifest_path, index=False)
Tim O'Donnell's avatar
Tim O'Donnell committed
        logging.info("Wrote: %s", manifest_path)
Tim O'Donnell's avatar
Tim O'Donnell committed

        if write_metadata:
            # Write "info.txt"
            info_path = join(models_dir, "info.txt")
            rows = [
                ("trained on", time.asctime()),
                ("package   ", "mhcflurry %s" % __version__),
                ("hostname  ", gethostname()),
                ("user      ", getuser()),
            ]
            pandas.DataFrame(rows).to_csv(
                info_path, sep="\t", header=False, index=False)

            if self.metadata_dataframes:
                for (name, df) in self.metadata_dataframes.items():
                    metadata_df_path = join(models_dir, "%s.csv.bz2" % name)
                    df.to_csv(metadata_df_path, index=False, compression="bz2")
Tim O'Donnell's avatar
Tim O'Donnell committed

        # Save allele sequences
Tim O'Donnell's avatar
Tim O'Donnell committed
            allele_to_sequence_df = pandas.DataFrame(
Tim O'Donnell's avatar
Tim O'Donnell committed
                columns=['allele', 'sequence']
            )
            allele_to_sequence_df.to_csv(
                join(models_dir, "allele_sequences.csv"), index=False)
Tim O'Donnell's avatar
Tim O'Donnell committed
            logging.info("Wrote: %s", join(models_dir, "allele_sequences.csv"))
Tim O'Donnell's avatar
Tim O'Donnell committed

Tim O'Donnell's avatar
Tim O'Donnell committed
        if self.allele_to_percent_rank_transform:
            percent_ranks_df = None
            for (allele, transform) in self.allele_to_percent_rank_transform.items():
                series = transform.to_series()
                if percent_ranks_df is None:
                    percent_ranks_df = pandas.DataFrame(index=series.index)
                assert_equal(series.index.values, percent_ranks_df.index.values)
                percent_ranks_df[allele] = series
            percent_ranks_path = join(models_dir, "percent_ranks.csv")
            percent_ranks_df.to_csv(
                percent_ranks_path,
                index=True,
                index_label="bin")
Tim O'Donnell's avatar
Tim O'Donnell committed
            logging.info("Wrote: %s", percent_ranks_path)
Tim O'Donnell's avatar
Tim O'Donnell committed
    @staticmethod
    def load(models_dir=None, max_models=None, optimization_level=None):
        """
        Reconstruct a predictor from a directory written by `save`.

        Parameters
        ----------
        models_dir : string
            Path to directory. If unspecified the default downloaded models are
            used.

        max_models : int, optional
            Maximum number of `Class1NeuralNetwork` instances to load. Only
            the first `max_models` rows of the manifest are read.

        optimization_level : int
            If >0, model optimization will be attempted. Defaults to value of
            environment variable MHCFLURRY_OPTIMIZATION_LEVEL.

        Returns
        -------
        `Class1AffinityPredictor` instance
        """
        if models_dir is None:
            models_dir = get_default_class1_models_dir()
        if optimization_level is None:
            optimization_level = OPTIMIZATION_LEVEL

        manifest_df = pandas.read_csv(
            join(models_dir, "manifest.csv"), nrows=max_models)

        pan_allele_networks = []
        networks_by_allele = collections.defaultdict(list)
        networks_in_manifest_order = []
        for (_, entry) in manifest_df.iterrows():
            weights_file = abspath(
                Class1AffinityPredictor.weights_path(
                    models_dir, entry.model_name))

            # Weights are not read here; the network pulls them through the
            # loader the first time it is used.
            network = Class1NeuralNetwork.from_config(
                json.loads(entry.config_json),
                weights_loader=partial(
                    Class1AffinityPredictor.load_weights, weights_file))

            if entry.allele == "pan-class1":
                pan_allele_networks.append(network)
            else:
                networks_by_allele[entry.allele].append(network)
            networks_in_manifest_order.append(network)

        manifest_df["model"] = networks_in_manifest_order

        # Allele sequences are optional on disk.
        allele_to_sequence = None
        allele_sequences_path = join(models_dir, "allele_sequences.csv")
        if exists(allele_sequences_path):
            sequences_df = pandas.read_csv(allele_sequences_path, index_col=0)
            allele_to_sequence = sequences_df.iloc[:, 0].to_dict()

        # Percent rank calibration is optional on disk as well.
        allele_to_percent_rank_transform = {}
        percent_ranks_path = join(models_dir, "percent_ranks.csv")
        if exists(percent_ranks_path):
            percent_ranks_df = pandas.read_csv(percent_ranks_path, index_col=0)
            for allele in percent_ranks_df.columns:
                transform = PercentRankTransform.from_series(
                    percent_ranks_df[allele])
                allele_to_percent_rank_transform[allele] = transform

        num_allele_sequences = 0
        if allele_to_sequence:
            num_allele_sequences = len(allele_to_sequence)
        allele_specific_summary = ", ".join(
            "%s (%d)" % (allele, len(networks))
            for (allele, networks) in sorted(networks_by_allele.items()))
        logging.info(
            "Loaded %d class1 pan allele predictors, %d allele sequences, "
            "%d percent rank distributions, and %d allele specific models: %s",
            len(pan_allele_networks),
            num_allele_sequences,
            len(allele_to_percent_rank_transform),
            sum(len(networks) for networks in networks_by_allele.values()),
            allele_specific_summary)

        result = Class1AffinityPredictor(
            allele_to_allele_specific_models=networks_by_allele,
            class1_pan_allele_models=pan_allele_networks,
            allele_to_sequence=allele_to_sequence,
            manifest_df=manifest_df,
            allele_to_percent_rank_transform=allele_to_percent_rank_transform,
        )
        if optimization_level >= 1:
            optimized = result.optimize()
            if optimized:
                outcome = "succeeded"
            else:
                outcome = "not supported for these models"
            logging.info("Model optimization %s", outcome)
        return result

Tim O'Donnell's avatar
Tim O'Donnell committed
    def optimize(self, warn=True):
        """
        EXPERIMENTAL: Optimize the predictor for faster predictions.

        Currently the only optimization implemented is to merge multiple pan-
        allele predictors at the tensorflow level.

        The optimization is performed in-place, mutating the instance.

        Returns
        ----------
        bool
            Whether optimization was performed

        """
        num_class1_pan_allele_models = len(self.class1_pan_allele_models)
        if num_class1_pan_allele_models > 1:
            try:
                self.class1_pan_allele_models = [
                    Class1NeuralNetwork.merge(
                        self.class1_pan_allele_models,
                        merge_method="concatenate")
                ]
            except NotImplementedError as e:
Tim O'Donnell's avatar
Tim O'Donnell committed
                if warn:
                    logging.warning("Optimization failed: %s", str(e))
                return False
            self._manifest_df = None
            self.clear_cache()
            self.optimization_info["pan_models_merged"] = True
            self.optimization_info["num_pan_models_merged"] = (
                num_class1_pan_allele_models)
        else:
            return False
        return True

Tim O'Donnell's avatar
Tim O'Donnell committed
    @staticmethod
    def model_name(allele, num):
        """
        Generate a model name
        
        Parameters
        ----------
        allele : string
        num : int

        Returns
        -------
        string

        """
        random_string = hashlib.sha1(
            str(time.time()).encode()).hexdigest()[:16]
Tim O'Donnell's avatar
Tim O'Donnell committed
        return "%s-%d-%s" % (
            allele.upper().replace("*", "_").replace(":", "_"),
            num,
            random_string)
Tim O'Donnell's avatar
Tim O'Donnell committed

    @staticmethod
    def weights_path(models_dir, model_name):
        """
        Generate the path to the weights file for a model
        
        Parameters
        ----------
        models_dir : string
        model_name : string

        Returns
        -------
        string
        """
Tim O'Donnell's avatar
Tim O'Donnell committed
        return join(models_dir, "weights_%s.npz" % model_name)
Tim O'Donnell's avatar
Tim O'Donnell committed

Tim O'Donnell's avatar
Tim O'Donnell committed
    @property
    def master_allele_encoding(self):
        """
        An AlleleEncoding containing the universe of alleles specified by
        self.allele_to_sequence.

        Returns
        -------
        AlleleEncoding
        """
        if (self._master_allele_encoding is None or
Tim O'Donnell's avatar
Tim O'Donnell committed
                self._master_allele_encoding.allele_to_sequence !=
                self.allele_to_sequence):
Tim O'Donnell's avatar
Tim O'Donnell committed
            self._master_allele_encoding = AlleleEncoding(
                allele_to_sequence=self.allele_to_sequence)
Tim O'Donnell's avatar
Tim O'Donnell committed
        return self._master_allele_encoding
Tim O'Donnell's avatar
Tim O'Donnell committed
    def fit_allele_specific_predictors(
            self,
            n_models,
            architecture_hyperparameters_list,
Tim O'Donnell's avatar
Tim O'Donnell committed
            allele,
            peptides,
            affinities,
            train_rounds=None,
Tim O'Donnell's avatar
Tim O'Donnell committed
            models_dir_for_save=None,
Tim O'Donnell's avatar
Tim O'Donnell committed
            verbose=0,
Tim O'Donnell's avatar
Tim O'Donnell committed
            progress_preamble="",
            progress_print_interval=5.0):
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
        Fit one or more allele specific predictors for a single allele using one
        or more neural network architectures.
Tim O'Donnell's avatar
Tim O'Donnell committed
        
        The new predictors are saved in the Class1AffinityPredictor instance
        and will be used on subsequent calls to `predict`.
        
        Parameters
        ----------
        n_models : int
            Number of neural networks to fit
        
        architecture_hyperparameters_list : list of dict
            List of hyperparameter sets.
Tim O'Donnell's avatar
Tim O'Donnell committed
               
        allele : string
        
Tim O'Donnell's avatar
Tim O'Donnell committed
        peptides : `EncodableSequences` or list of string
Tim O'Donnell's avatar
Tim O'Donnell committed
        
        affinities : list of float
            nM affinities

        inequalities : list of string, each element one of ">", "<", or "="
Tim O'Donnell's avatar
Tim O'Donnell committed
            See `Class1NeuralNetwork.fit` for details.

        train_rounds : sequence of int
            Each training point i will be used on training rounds r for which
            train_rounds[i] > r, r >= 0.
Tim O'Donnell's avatar
Tim O'Donnell committed
        
        models_dir_for_save : string, optional
            If specified, the Class1AffinityPredictor is (incrementally) written
            to the given models dir after each neural network is fit.
        
        verbose : int
            Keras verbosity

        progress_preamble : string
            Optional string of information to include in each progress update

Tim O'Donnell's avatar
Tim O'Donnell committed
        progress_print_interval : float
            How often (in seconds) to print progress. Set to None to disable.

Tim O'Donnell's avatar
Tim O'Donnell committed
        Returns
        -------
Tim O'Donnell's avatar
Tim O'Donnell committed
        list of `Class1NeuralNetwork`
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
Tim O'Donnell's avatar
Tim O'Donnell committed

        allele = mhcnames.normalize_allele_name(allele)
Tim O'Donnell's avatar
Tim O'Donnell committed
        if allele not in self.allele_to_allele_specific_models:
            self.allele_to_allele_specific_models[allele] = []

        encodable_peptides = EncodableSequences.create(peptides)
        peptides_affinities_inequalities_per_round = [
            (encodable_peptides, affinities, inequalities)
        ]

        if train_rounds is not None:
Tim O'Donnell's avatar
Tim O'Donnell committed
            for round in sorted(set(train_rounds)):
                round_mask = train_rounds > round
                if round_mask.any():
                    sub_encodable_peptides = EncodableSequences.create(
                        encodable_peptides.sequences[round_mask])
                    peptides_affinities_inequalities_per_round.append((
                        sub_encodable_peptides,
                        affinities[round_mask],
                        None if inequalities is None else inequalities[round_mask]))
        n_rounds = len(peptides_affinities_inequalities_per_round)

        n_architectures = len(architecture_hyperparameters_list)

        # Adjust progress info to indicate number of models and
        # architectures.
        pieces = []
        if n_models > 1:
            pieces.append("Model {model_num:2d} / {n_models:2d}")
        if n_architectures > 1:
            pieces.append(
                "Architecture {architecture_num:2d} / {n_architectures:2d}")
        if len(peptides_affinities_inequalities_per_round) > 1:
            pieces.append("Round {round:2d} / {n_rounds:2d}")
        pieces.append("{n_peptides:4d} peptides")
        progress_preamble_template = "[ %s ] {user_progress_preamble}" % (
            ", ".join(pieces))

        models = []
        for model_num in range(n_models):
            for (architecture_num, architecture_hyperparameters) in enumerate(
                    architecture_hyperparameters_list):
                model = Class1NeuralNetwork(**architecture_hyperparameters)
                for round_num in range(n_rounds):
                    (round_peptides, round_affinities, round_inequalities) = (
                        peptides_affinities_inequalities_per_round[round_num]
                    )
                    model.fit(
                        round_peptides,
                        round_affinities,
                        inequalities=round_inequalities,
                        verbose=verbose,
                        progress_preamble=progress_preamble_template.format(
                            n_peptides=len(round_peptides),
                            round=round_num,
                            n_rounds=n_rounds,
                            user_progress_preamble=progress_preamble,
                            model_num=model_num + 1,
                            n_models=n_models,
                            architecture_num=architecture_num + 1,
                            n_architectures=n_architectures),
                        progress_print_interval=progress_print_interval)

                model_name = self.model_name(allele, model_num)
                row = pandas.Series(collections.OrderedDict([
                    ("model_name", model_name),
                    ("allele", allele),
                    ("config_json", json.dumps(model.get_config())),
                    ("model", model),
                ])).to_frame().T
                self._manifest_df = pandas.concat(
                    [self.manifest_df, row], ignore_index=True)
                self.allele_to_allele_specific_models[allele].append(model)
                if models_dir_for_save:
                    self.save(
                        models_dir_for_save, model_names_to_write=[model_name])
                models.append(model)
Tim O'Donnell's avatar
Tim O'Donnell committed

    def fit_class1_pan_allele_models(
            self,
            n_models,
            architecture_hyperparameters,
            alleles,
            peptides,
            affinities,
            inequalities,
            models_dir_for_save=None,
            verbose=1,
            progress_preamble="",
            progress_print_interval=5.0):
        """
        Fit one or more pan-allele predictors using a single neural network
        architecture.

        The new predictors are saved in the Class1AffinityPredictor instance
        and will be used on subsequent calls to `predict`.

        Parameters
        ----------
        n_models : int
            Number of neural networks to fit

        architecture_hyperparameters : dict
            Keyword arguments used to construct each `Class1NeuralNetwork`

        alleles : list of string
            Allele names (not sequences) corresponding to each peptide

        peptides : `EncodableSequences` or list of string

        affinities : list of float
            nM affinities

        inequalities : list of string, each element one of ">", "<", or "="
            See Class1NeuralNetwork.fit for details.

        models_dir_for_save : string, optional
            If specified, the Class1AffinityPredictor is (incrementally) written
            to the given models dir after each neural network is fit.

        verbose : int
            Keras verbosity

        progress_preamble : string
            Optional string of information to include in each progress update

        progress_print_interval : float
            How often (in seconds) to print progress. Set to None to disable.

        Returns
        -------
        list of `Class1NeuralNetwork`
            The newly fit models, in the order they were trained.
        """
        alleles = pandas.Series(alleles).map(mhcnames.normalize_allele_name)
        allele_encoding = AlleleEncoding(
            alleles,
            borrow_from=self.master_allele_encoding)

        # Encode the peptides once; the same encoding is reused by every model.
        encodable_peptides = EncodableSequences.create(peptides)
        models = []
        for i in range(n_models):
            logging.info("Training model %d / %d", i + 1, n_models)
            model = Class1NeuralNetwork(**architecture_hyperparameters)
            model.fit(
                encodable_peptides,
                affinities,
                inequalities=inequalities,
                allele_encoding=allele_encoding,
                # Forward the documented verbosity setting, as the
                # allele-specific fitting method does.
                verbose=verbose,
                progress_preamble=progress_preamble,
                progress_print_interval=progress_print_interval)

            model_name = self.model_name("pan-class1", i)
            row = pandas.Series(collections.OrderedDict([
                ("model_name", model_name),
                ("allele", "pan-class1"),
                ("config_json", json.dumps(model.get_config())),
                ("model", model),
            ])).to_frame().T
            self._manifest_df = pandas.concat(
                [self.manifest_df, row], ignore_index=True)
            self.class1_pan_allele_models.append(model)
            # The ensemble changed, so drop cached state, mirroring what
            # add_pan_allele_model does after appending a model.
            self.clear_cache()
            if models_dir_for_save:
                # Incremental save: only the model just fit is written.
                self.save(
                    models_dir_for_save, model_names_to_write=[model_name])
            models.append(model)

        return models
Tim O'Donnell's avatar
Tim O'Donnell committed

Tim O'Donnell's avatar
Tim O'Donnell committed

Tim O'Donnell's avatar
Tim O'Donnell committed
    def add_pan_allele_model(self, model, models_dir_for_save=None):
        """
        Register an already-fit pan-allele model with this predictor.

        A manifest row is appended for the model, the model is added to
        `class1_pan_allele_models`, the cache is cleared and
        `check_consistency` is run. If a directory is given, the new model is
        then written there as an incremental save.

        Parameters
        ----------
        model : Class1NeuralNetwork
        models_dir_for_save : string, optional
            Directory to save resulting ensemble to
        """
        new_name = self.model_name("pan-class1", 1)

        # Column order of the manifest row is fixed by insertion order.
        fields = collections.OrderedDict()
        fields["model_name"] = new_name
        fields["allele"] = "pan-class1"
        fields["config_json"] = json.dumps(model.get_config())
        fields["model"] = model
        manifest_row = pandas.Series(fields).to_frame().transpose()

        updated_manifest = pandas.concat(
            [self.manifest_df, manifest_row], ignore_index=True)
        self._manifest_df = updated_manifest
        self.class1_pan_allele_models.append(model)

        self.clear_cache()
        self.check_consistency()

        if not models_dir_for_save:
            return
        self.save(models_dir_for_save, model_names_to_write=[new_name])

    def percentile_ranks(self, affinities, allele=None, alleles=None, throw=True):
        """
        Convert ic50 affinities to percentile ranks for the given allele(s).

        Either a single `allele` (applied to every affinity) or a sequence of
        `alleles` (one per affinity) must be given. When `allele` is given it
        takes precedence and `alleles` is ignored.

        Parameters
        ----------
        affinities : sequence of float
            nM affinities
        allele : string
        alleles : sequence of string
        throw : boolean
            If True, a ValueError will be raised in the case of unsupported
            alleles. If False, a warning will be logged and NaN will be returned
            for those percentile ranks.

        Returns
        -------
        numpy.array of float
        """
        if allele is None:
            if alleles is None:
                raise ValueError("Specify allele or alleles")

            # One allele per affinity: handle each distinct allele with a
            # recursive single-allele call and scatter the results back into
            # the original row positions.
            table = pandas.DataFrame({"affinity": affinities})
            table["allele"] = alleles
            table["result"] = numpy.nan
            for (group_allele, group) in table.groupby("allele"):
                group_ranks = self.percentile_ranks(
                    group.affinity, allele=group_allele, throw=throw)
                table.loc[group.index, "result"] = group_ranks
            return table["result"].values

        # Single allele: look up its transform and apply it to all affinities.
        try:
            rank_transform = self.allele_to_percent_rank_transform[allele]
            return rank_transform.transform(affinities)
        except KeyError:
            message = "Allele %s has no percentile rank information" % allele
            if throw:
                raise ValueError(message)
            warnings.warn(message)
            # One NaN per requested affinity.
            return numpy.full(len(affinities), numpy.nan)

Tim O'Donnell's avatar
Tim O'Donnell committed
    def predict(
            self,
            peptides,
            alleles=None,
            allele=None,
            throw=True,
            centrality_measure=DEFAULT_CENTRALITY_MEASURE,
            model_kwargs=None):
        """
        Predict nM binding affinities.

        This is a thin wrapper around `predict_to_dataframe` that requests
        neither percentile ranks nor confidence intervals and returns only the
        `prediction` column.

        If multiple predictors are available for an allele, the individual
        model predictions are combined using `centrality_measure`.

        One of 'allele' or 'alleles' must be specified. If 'allele' is specified
        all predictions will be for the given allele. If 'alleles' is specified
        it must be the same length as 'peptides' and give the allele
        corresponding to each peptide.

        Parameters
        ----------
        peptides : `EncodableSequences` or list of string
        alleles : list of string
        allele : string
        throw : boolean
            If True, a ValueError will be raised in the case of unsupported
            alleles or peptide lengths. If False, a warning will be logged and
            the predictions for the unsupported alleles or peptides will be NaN.
        centrality_measure : string or callable
            Measure of central tendency to use to combine predictions in the
            ensemble. Options include: mean, median, robust_mean.
        model_kwargs : dict, optional
            Additional keyword arguments to pass to Class1NeuralNetwork.predict.
            Defaults to no additional arguments.

        Returns
        -------
        numpy.array of predictions
        """
        # Build a fresh dict per call rather than sharing one mutable default
        # object across every invocation and every downstream consumer.
        if model_kwargs is None:
            model_kwargs = {}

        df = self.predict_to_dataframe(
            peptides=peptides,
            alleles=alleles,
            allele=allele,
            throw=throw,
            include_percentile_ranks=False,
            include_confidence_intervals=False,
            centrality_measure=centrality_measure,
            model_kwargs=model_kwargs
        )
        return df.prediction.values

    def predict_to_dataframe(
Tim O'Donnell's avatar
Tim O'Donnell committed
            self,
            peptides,
Tim O'Donnell's avatar
Tim O'Donnell committed
            alleles=None,
            allele=None,
Tim O'Donnell's avatar
Tim O'Donnell committed
            throw=True,
Tim O'Donnell's avatar
Tim O'Donnell committed
            include_individual_model_predictions=False,
            include_percentile_ranks=True,
Tim O'Donnell's avatar
Tim O'Donnell committed
            include_confidence_intervals=True,
Tim O'Donnell's avatar
Tim O'Donnell committed
            centrality_measure=DEFAULT_CENTRALITY_MEASURE,
            model_kwargs={}):
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
        Predict nM binding affinities. Gives more detailed output than `predict`
        method, including 5-95% prediction intervals.
        
        If multiple predictors are available for an allele, the predictions are
        the geometric means of the individual model predictions.
        
        One of 'allele' or 'alleles' must be specified. If 'allele' is specified
        all predictions will be for the given allele. If 'alleles' is specified
        it must be the same length as 'peptides' and give the allele
        corresponding to each peptide. 
        
        Parameters
        ----------
Tim O'Donnell's avatar
Tim O'Donnell committed
        peptides : `EncodableSequences` or list of string
Tim O'Donnell's avatar
Tim O'Donnell committed
        alleles : list of string
        allele : string
Tim O'Donnell's avatar
Tim O'Donnell committed
        throw : boolean
            If True, a ValueError will be raised in the case of unsupported
            alleles or peptide lengths. If False, a warning will be logged and
            the predictions for the unsupported alleles or peptides will be NaN.
Tim O'Donnell's avatar
Tim O'Donnell committed
        include_individual_model_predictions : boolean