Skip to content
Snippets Groups Projects
class1_affinity_predictor.py 35.9 KiB
Newer Older
Tim O'Donnell's avatar
Tim O'Donnell committed
import collections
import hashlib
Tim O'Donnell's avatar
Tim O'Donnell committed
import json
Tim O'Donnell's avatar
Tim O'Donnell committed
import logging
import time
import warnings
from os.path import join, exists
Tim O'Donnell's avatar
Tim O'Donnell committed
from os import mkdir
Tim O'Donnell's avatar
Tim O'Donnell committed
from socket import gethostname
from getpass import getuser
from functools import partial
Tim O'Donnell's avatar
Tim O'Donnell committed

import mhcnames
Tim O'Donnell's avatar
Tim O'Donnell committed
import numpy
import pandas
import tqdm  # progress bars
Tim O'Donnell's avatar
Tim O'Donnell committed
from numpy.testing import assert_equal
from six import string_types
Tim O'Donnell's avatar
Tim O'Donnell committed

Tim O'Donnell's avatar
Tim O'Donnell committed
from .class1_neural_network import Class1NeuralNetwork
from .common import random_peptides
from .downloads import get_path
from .encodable_sequences import EncodableSequences
from .percent_rank_transform import PercentRankTransform
from .regression_target import to_ic50
Tim O'Donnell's avatar
Tim O'Donnell committed
from .version import __version__
Tim O'Donnell's avatar
Tim O'Donnell committed


Tim O'Donnell's avatar
Tim O'Donnell committed
class Class1AffinityPredictor(object):
Tim O'Donnell's avatar
Tim O'Donnell committed
    """
    High-level interface for peptide/MHC I binding affinity prediction.
Tim O'Donnell's avatar
Tim O'Donnell committed

    This class manages low-level `Class1NeuralNetwork` instances, each of which
    wraps a single Keras network. The purpose of `Class1AffinityPredictor` is to
    implement ensembles, handling of multiple alleles, and predictor loading and
    saving.
Tim O'Donnell's avatar
Tim O'Donnell committed
    """
Tim O'Donnell's avatar
Tim O'Donnell committed
    def __init__(
            self,
Tim O'Donnell's avatar
Tim O'Donnell committed
            allele_to_allele_specific_models=None,
            class1_pan_allele_models=None,
Tim O'Donnell's avatar
Tim O'Donnell committed
            allele_to_pseudosequence=None,
Tim O'Donnell's avatar
Tim O'Donnell committed
            manifest_df=None,
            allele_to_percent_rank_transform=None):
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
        Parameters
        ----------
Tim O'Donnell's avatar
Tim O'Donnell committed
        allele_to_allele_specific_models : dict of string -> list of `Class1NeuralNetwork`
Tim O'Donnell's avatar
Tim O'Donnell committed
            Ensemble of single-allele models to use for each allele. 
        
Tim O'Donnell's avatar
Tim O'Donnell committed
        class1_pan_allele_models : list of `Class1NeuralNetwork`
Tim O'Donnell's avatar
Tim O'Donnell committed
            Ensemble of pan-allele models.
        
        allele_to_pseudosequence : dict of string -> string
            Required only if class1_pan_allele_models is specified.
        
Tim O'Donnell's avatar
Tim O'Donnell committed
        manifest_df : `pandas.DataFrame`, optional
Tim O'Donnell's avatar
Tim O'Donnell committed
            Must have columns: model_name, allele, config_json, model.
            Only required if you want to update an existing serialization of a
Tim O'Donnell's avatar
Tim O'Donnell committed
            Class1AffinityPredictor. Otherwise this dataframe will be generated
            automatically based on the supplied models.
Tim O'Donnell's avatar
Tim O'Donnell committed
        allele_to_percent_rank_transform : dict of string -> `PercentRankTransform`, optional
            `PercentRankTransform` instances to use for each allele
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
Tim O'Donnell's avatar
Tim O'Donnell committed

Tim O'Donnell's avatar
Tim O'Donnell committed
        if allele_to_allele_specific_models is None:
            allele_to_allele_specific_models = {}
        if class1_pan_allele_models is None:
            class1_pan_allele_models = []

Tim O'Donnell's avatar
Tim O'Donnell committed
        if class1_pan_allele_models:
            assert allele_to_pseudosequence, "Pseudosequences required"

Tim O'Donnell's avatar
Tim O'Donnell committed
        self.allele_to_allele_specific_models = allele_to_allele_specific_models
        self.class1_pan_allele_models = class1_pan_allele_models
Tim O'Donnell's avatar
Tim O'Donnell committed
        self.allele_to_pseudosequence = allele_to_pseudosequence

        if manifest_df is None:
Tim O'Donnell's avatar
Tim O'Donnell committed
            rows = []
            for (i, model) in enumerate(self.class1_pan_allele_models):
                rows.append((
                    self.model_name("pan-class1", i),
                    "pan-class1",
Tim O'Donnell's avatar
Tim O'Donnell committed
                    json.dumps(model.get_config()),
Tim O'Donnell's avatar
Tim O'Donnell committed
                    model
                ))
            for (allele, models) in self.allele_to_allele_specific_models.items():
                for (i, model) in enumerate(models):
                    rows.append((
                        self.model_name(allele, i),
                        allele,
Tim O'Donnell's avatar
Tim O'Donnell committed
                        json.dumps(model.get_config()),
Tim O'Donnell's avatar
Tim O'Donnell committed
                        model
                    ))
            manifest_df = pandas.DataFrame(
                rows,
                columns=["model_name", "allele", "config_json", "model"])
Tim O'Donnell's avatar
Tim O'Donnell committed
        self.manifest_df = manifest_df

        if not allele_to_percent_rank_transform:
Tim O'Donnell's avatar
Tim O'Donnell committed
            allele_to_percent_rank_transform = {}
        self.allele_to_percent_rank_transform = allele_to_percent_rank_transform

    @property
    def neural_networks(self):
        """
        List of the neural networks in the ensemble.

        Returns
        -------
Tim O'Donnell's avatar
Tim O'Donnell committed
        list of `Class1NeuralNetwork`
        """
        result = []
        for models in self.allele_to_allele_specific_models.values():
            result.extend(models)
        result.extend(self.class1_pan_allele_models)
        return result

    @classmethod
    def merge(cls, predictors):
        """
Tim O'Donnell's avatar
Tim O'Donnell committed
        Merge the ensembles of two or more `Class1AffinityPredictor` instances.

        Note: the resulting merged predictor will NOT have calibrated percentile
Tim O'Donnell's avatar
Tim O'Donnell committed
        ranks. Call `calibrate_percentile_ranks` on it if these are needed.
Tim O'Donnell's avatar
Tim O'Donnell committed
        predictors : sequence of `Class1AffinityPredictor`
Tim O'Donnell's avatar
Tim O'Donnell committed
        `Class1AffinityPredictor` instance

        """
        assert len(predictors) > 0
        if len(predictors) == 1:
            return predictors[0]

        allele_to_allele_specific_models = collections.defaultdict(list)
        class1_pan_allele_models = []
        allele_to_pseudosequence = predictors[0].allele_to_pseudosequence

        for predictor in predictors:
            for (allele, networks) in (
                    predictor.allele_to_allele_specific_models.items()):
                allele_to_allele_specific_models[allele].extend(networks)
            class1_pan_allele_models.extend(
                predictor.class1_pan_allele_models)

        return Class1AffinityPredictor(
            allele_to_allele_specific_models=allele_to_allele_specific_models,
            class1_pan_allele_models=class1_pan_allele_models,
            allele_to_pseudosequence=allele_to_pseudosequence
        )

    @property
    def supported_alleles(self):
        """
        Alleles for which predictions can be made.
        
        Returns
        -------
        list of string
        """
        result = set(self.allele_to_allele_specific_models)
        if self.allele_to_pseudosequence:
            result = result.union(self.allele_to_pseudosequence)
        return sorted(result)

    @property
    def supported_peptide_lengths(self):
        """
        (minimum, maximum) lengths of peptides supported by *all models*,
        inclusive.

        Returns
        -------
        (int, int) tuple

        """
        models = list(self.class1_pan_allele_models)
        for allele_models in self.allele_to_allele_specific_models.values():
            models.extend(allele_models)
        length_ranges = [model.supported_peptide_lengths for model in models]
        return (
            max(lower for (lower, upper) in length_ranges),
            min(upper for (lower, upper) in length_ranges))

Tim O'Donnell's avatar
Tim O'Donnell committed
    def save(self, models_dir, model_names_to_write=None):
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
Tim O'Donnell's avatar
Tim O'Donnell committed
        Serialize the predictor to a directory on disk. If the directory does
        not exist it will be created.
Tim O'Donnell's avatar
Tim O'Donnell committed
        
Tim O'Donnell's avatar
Tim O'Donnell committed
        The serialization format consists of a file called "manifest.csv" with
        the configurations of each Class1NeuralNetwork, along with per-network
        files giving the model weights. If there are pan-allele predictors in
        the ensemble, the allele pseudosequences are also stored in the
Tim O'Donnell's avatar
Tim O'Donnell committed
        directory. There is also a small file "index.txt" with basic metadata:
        when the models were trained, by whom, on what host.
Tim O'Donnell's avatar
Tim O'Donnell committed
        
Tim O'Donnell's avatar
Tim O'Donnell committed
        Parameters
        ----------
        models_dir : string
            Path to directory
            
        model_names_to_write : list of string, optional
            Only write the weights for the specified models. Useful for
            incremental updates during training.
        """
Tim O'Donnell's avatar
Tim O'Donnell committed
        num_models = len(self.class1_pan_allele_models) + sum(
            len(v) for v in self.allele_to_allele_specific_models.values())
        assert len(self.manifest_df) == num_models, (
            "Manifest seems out of sync with models: %d vs %d entries" % (
                len(self.manifest_df), num_models))

        if model_names_to_write is None:
            # Write all models
Tim O'Donnell's avatar
Tim O'Donnell committed
            model_names_to_write = self.manifest_df.model_name.values
Tim O'Donnell's avatar
Tim O'Donnell committed

Tim O'Donnell's avatar
Tim O'Donnell committed
        if not exists(models_dir):
            mkdir(models_dir)

Tim O'Donnell's avatar
Tim O'Donnell committed
        sub_manifest_df = self.manifest_df.ix[
Tim O'Donnell's avatar
Tim O'Donnell committed
            self.manifest_df.model_name.isin(model_names_to_write)
Tim O'Donnell's avatar
Tim O'Donnell committed
        ]

        for (_, row) in sub_manifest_df.iterrows():
Tim O'Donnell's avatar
Tim O'Donnell committed
            weights_path = self.weights_path(models_dir, row.model_name)
Tim O'Donnell's avatar
Tim O'Donnell committed
            Class1AffinityPredictor.save_weights(
                row.model.get_weights(), weights_path)
            logging.info("Wrote: %s" % weights_path)
Tim O'Donnell's avatar
Tim O'Donnell committed

        write_manifest_df = self.manifest_df[[
            c for c in self.manifest_df.columns if c != "model"
        ]]
        manifest_path = join(models_dir, "manifest.csv")
        write_manifest_df.to_csv(manifest_path, index=False)
        logging.info("Wrote: %s" % manifest_path)
Tim O'Donnell's avatar
Tim O'Donnell committed

Tim O'Donnell's avatar
Tim O'Donnell committed
        # Write "info.txt"
        info_path = join(models_dir, "info.txt")
        rows = [
            ("trained on", time.asctime()),
            ("package   ", "mhcflurry %s" % __version__),
            ("hostname  ", gethostname()),
            ("user      ", getuser()),
        ]
        pandas.DataFrame(rows).to_csv(
            info_path, sep="\t", header=False, index=False)

Tim O'Donnell's avatar
Tim O'Donnell committed
        if self.allele_to_percent_rank_transform:
            percent_ranks_df = None
            for (allele, transform) in self.allele_to_percent_rank_transform.items():
                series = transform.to_series()
                if percent_ranks_df is None:
                    percent_ranks_df = pandas.DataFrame(index=series.index)
                assert_equal(series.index.values, percent_ranks_df.index.values)
                percent_ranks_df[allele] = series
            percent_ranks_path = join(models_dir, "percent_ranks.csv")
            percent_ranks_df.to_csv(
                percent_ranks_path,
                index=True,
                index_label="bin")
            logging.info("Wrote: %s" % percent_ranks_path)

Tim O'Donnell's avatar
Tim O'Donnell committed
    @staticmethod
Tim O'Donnell's avatar
Tim O'Donnell committed
    def load(models_dir=None, max_models=None):
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
        Deserialize a predictor from a directory on disk.
        
        Parameters
        ----------
        models_dir : string
            Path to directory
            
        max_models : int, optional
Tim O'Donnell's avatar
Tim O'Donnell committed
            Maximum number of `Class1NeuralNetwork` instances to load
Tim O'Donnell's avatar
Tim O'Donnell committed

        Returns
        -------
Tim O'Donnell's avatar
Tim O'Donnell committed
        `Class1AffinityPredictor` instance
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
Tim O'Donnell's avatar
Tim O'Donnell committed
        if models_dir is None:
            models_dir = get_path("models_class1", "models")

Tim O'Donnell's avatar
Tim O'Donnell committed
        manifest_path = join(models_dir, "manifest.csv")
        manifest_df = pandas.read_csv(manifest_path, nrows=max_models)

        allele_to_allele_specific_models = collections.defaultdict(list)
        class1_pan_allele_models = []
        all_models = []
        for (_, row) in manifest_df.iterrows():
Tim O'Donnell's avatar
Tim O'Donnell committed
            weights_filename = Class1AffinityPredictor.weights_path(
                models_dir, row.model_name)
            weights = Class1AffinityPredictor.load_weights(weights_filename)
            config = json.loads(row.config_json)
            model = Class1NeuralNetwork.from_config(config, weights=weights)
Tim O'Donnell's avatar
Tim O'Donnell committed
            if row.allele == "pan-class1":
                class1_pan_allele_models.append(model)
            else:
                allele_to_allele_specific_models[row.allele].append(model)
            all_models.append(model)

        manifest_df["model"] = all_models

        pseudosequences = None
        if exists(join(models_dir, "pseudosequences.csv")):
            pseudosequences = pandas.read_csv(
                join(models_dir, "pseudosequences.csv"),
                index_col="allele").to_dict()

Tim O'Donnell's avatar
Tim O'Donnell committed
        allele_to_percent_rank_transform = {}
        percent_ranks_path = join(models_dir, "percent_ranks.csv")
        if exists(percent_ranks_path):
            percent_ranks_df = pandas.read_csv(percent_ranks_path, index_col=0)
            for allele in percent_ranks_df.columns:
                allele_to_percent_rank_transform[allele] = (
                    PercentRankTransform.from_series(percent_ranks_df[allele]))

        logging.info(
Tim O'Donnell's avatar
Tim O'Donnell committed
            "Loaded %d class1 pan allele predictors, %d pseudosequences, "
            "%d percent rank distributions, and %d allele specific models: %s" % (
Tim O'Donnell's avatar
Tim O'Donnell committed
                len(class1_pan_allele_models),
                len(pseudosequences) if pseudosequences else 0,
Tim O'Donnell's avatar
Tim O'Donnell committed
                len(allele_to_percent_rank_transform),
Tim O'Donnell's avatar
Tim O'Donnell committed
                sum(len(v) for v in allele_to_allele_specific_models.values()),
                ", ".join(
                    "%s (%d)" % (allele, len(v))
                    for (allele, v)
                    in sorted(allele_to_allele_specific_models.items()))))

Tim O'Donnell's avatar
Tim O'Donnell committed
        result = Class1AffinityPredictor(
Tim O'Donnell's avatar
Tim O'Donnell committed
            allele_to_allele_specific_models=allele_to_allele_specific_models,
            class1_pan_allele_models=class1_pan_allele_models,
            allele_to_pseudosequence=pseudosequences,
Tim O'Donnell's avatar
Tim O'Donnell committed
            manifest_df=manifest_df,
            allele_to_percent_rank_transform=allele_to_percent_rank_transform,
        )
Tim O'Donnell's avatar
Tim O'Donnell committed
        return result

Tim O'Donnell's avatar
Tim O'Donnell committed
    @staticmethod
    def model_name(allele, num):
        """
        Generate a model name
        
        Parameters
        ----------
        allele : string
        num : int

        Returns
        -------
        string

        """
        random_string = hashlib.sha1(
            str(time.time()).encode()).hexdigest()[:16]
        return "%s-%d-%s" % (allele.upper(), num, random_string)

    @staticmethod
    def weights_path(models_dir, model_name):
        """
        Generate the path to the weights file for a model
        
        Parameters
        ----------
        models_dir : string
        model_name : string

        Returns
        -------
        string
        """
Tim O'Donnell's avatar
Tim O'Donnell committed
        return join(models_dir, "weights_%s.npz" % model_name)
Tim O'Donnell's avatar
Tim O'Donnell committed

Tim O'Donnell's avatar
Tim O'Donnell committed
    def fit_allele_specific_predictors(
            self,
            n_models,
            architecture_hyperparameters_list,
Tim O'Donnell's avatar
Tim O'Donnell committed
            allele,
            peptides,
            affinities,
Tim O'Donnell's avatar
Tim O'Donnell committed
            models_dir_for_save=None,
Tim O'Donnell's avatar
Tim O'Donnell committed
            verbose=0,
            progress_preamble=""):
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
        Fit one or more allele specific predictors for a single allele using a
        single neural network architecture.
        
        The new predictors are saved in the Class1AffinityPredictor instance
        and will be used on subsequent calls to `predict`.
        
        Parameters
        ----------
        n_models : int
            Number of neural networks to fit
        
        architecture_hyperparameters_list : list of dict
Tim O'Donnell's avatar
Tim O'Donnell committed
               
        allele : string
        
Tim O'Donnell's avatar
Tim O'Donnell committed
        peptides : `EncodableSequences` or list of string
Tim O'Donnell's avatar
Tim O'Donnell committed
        
        affinities : list of float
            nM affinities

        inequalities : list of string, each element one of ">", "<", or "="
            See Class1NeuralNetwork.fit for details.
Tim O'Donnell's avatar
Tim O'Donnell committed
        
        models_dir_for_save : string, optional
            If specified, the Class1AffinityPredictor is (incrementally) written
            to the given models dir after each neural network is fit.
        
        verbose : int
            Keras verbosity

        progress_preamble : string
            Optional string of information to include in each progress update

Tim O'Donnell's avatar
Tim O'Donnell committed
        Returns
        -------
Tim O'Donnell's avatar
Tim O'Donnell committed
        list of `Class1NeuralNetwork`
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
Tim O'Donnell's avatar
Tim O'Donnell committed

        allele = mhcnames.normalize_allele_name(allele)
Tim O'Donnell's avatar
Tim O'Donnell committed
        if allele not in self.allele_to_allele_specific_models:
            self.allele_to_allele_specific_models[allele] = []

        encodable_peptides = EncodableSequences.create(peptides)

        n_architectures = len(architecture_hyperparameters_list)
        if n_models > 1 or n_architectures > 1:
            # Adjust progress info to indicate number of models and
            # architectures.
            pieces = []
            if n_models > 1:
                pieces.append("Model {model_num:2d} / {n_models:2d}")
            if n_architectures > 1:
                pieces.append(
                    "Architecture {architecture_num:2d} / {n_architectures:2d}"
Tim O'Donnell's avatar
Tim O'Donnell committed
                    " (best so far: {best_num})")
            progress_preamble_template = "[ %s ] {user_progress_preamble}" % (
                ", ".join(pieces))
        else:
            # Just use the user-provided progress message.
            progress_preamble_template = "{user_progress_preamble}"

        models = []
        for model_num in range(n_models):
            shuffle_permutation = numpy.random.permutation(len(affinities))

            best_num = None
            best_loss = None
            best_model = None
            for (architecture_num, architecture_hyperparameters) in enumerate(
                    architecture_hyperparameters_list):
                model = Class1NeuralNetwork(**architecture_hyperparameters)
                model.fit(
                    encodable_peptides,
                    affinities,
                    shuffle_permutation=shuffle_permutation,
                    inequalities=inequalities,
                    verbose=verbose,
                    progress_preamble=progress_preamble_template.format(
                        user_progress_preamble=progress_preamble,
Tim O'Donnell's avatar
Tim O'Donnell committed
                        best_num="n/a" if best_num is None else best_num + 1,
                        model_num=model_num + 1,
Tim O'Donnell's avatar
Tim O'Donnell committed
                        architecture_num=architecture_num + 1,
                        n_architectures=n_architectures))

                if n_architectures > 1:
                    # We require val_loss (i.e. a validation set) if we have
                    # multiple architectures.
                    loss = model.loss_history['val_loss'][-1]
                else:
                    loss = None
                if loss is None or best_loss is None or best_loss > loss:
Tim O'Donnell's avatar
Tim O'Donnell committed
                    best_loss = loss
                    best_num = architecture_num
                    best_model = model
                del model

Tim O'Donnell's avatar
Tim O'Donnell committed
            if n_architectures > 1:
                print("Selected architecture %d." % (best_num + 1))

            model_name = self.model_name(allele, model_num)
Tim O'Donnell's avatar
Tim O'Donnell committed
            row = pandas.Series(collections.OrderedDict([
                ("model_name", model_name),
                ("allele", allele),
                ("config_json", json.dumps(best_model.get_config())),
                ("model", best_model),
Tim O'Donnell's avatar
Tim O'Donnell committed
            ])).to_frame().T
Tim O'Donnell's avatar
Tim O'Donnell committed
            self.manifest_df = pandas.concat(
                [self.manifest_df, row], ignore_index=True)
            self.allele_to_allele_specific_models[allele].append(best_model)
Tim O'Donnell's avatar
Tim O'Donnell committed
            if models_dir_for_save:
Tim O'Donnell's avatar
Tim O'Donnell committed
                self.save(
                    models_dir_for_save, model_names_to_write=[model_name])
Tim O'Donnell's avatar
Tim O'Donnell committed

    def fit_class1_pan_allele_models(
            self,
            n_models,
            architecture_hyperparameters,
            alleles,
            peptides,
            affinities,
            models_dir_for_save=None,
            verbose=1,
            progress_preamble=""):
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
        Fit one or more pan-allele predictors using a single neural network
        architecture.
        
        The new predictors are saved in the Class1AffinityPredictor instance
        and will be used on subsequent calls to `predict`.
        
        Parameters
        ----------
        n_models : int
            Number of neural networks to fit
            
        architecture_hyperparameters : dict
        
        alleles : list of string
            Allele names (not pseudosequences) corresponding to each peptide 
        
Tim O'Donnell's avatar
Tim O'Donnell committed
        peptides : `EncodableSequences` or list of string
Tim O'Donnell's avatar
Tim O'Donnell committed
        
        affinities : list of float
            nM affinities
        
        models_dir_for_save : string, optional
            If specified, the Class1AffinityPredictor is (incrementally) written
            to the given models dir after each neural network is fit.
        
        verbose : int
            Keras verbosity

        progress_preamble : string
            Optional string of information to include in each progress update

Tim O'Donnell's avatar
Tim O'Donnell committed
        Returns
        -------
Tim O'Donnell's avatar
Tim O'Donnell committed
        list of `Class1NeuralNetwork`
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
Tim O'Donnell's avatar
Tim O'Donnell committed

        alleles = pandas.Series(alleles).map(mhcnames.normalize_allele_name)
        allele_pseudosequences = alleles.map(self.allele_to_pseudosequence)

        encodable_peptides = EncodableSequences.create(peptides)
        models = []
        for i in range(n_models):
            logging.info("Training model %d / %d" % (i + 1, n_models))
            model = Class1NeuralNetwork(**architecture_hyperparameters)
            model.fit(
                encodable_peptides,
                affinities,
                allele_pseudosequences=allele_pseudosequences,
                verbose=verbose,
                progress_preamble=progress_preamble)

Tim O'Donnell's avatar
Tim O'Donnell committed
            model_name = self.model_name("pan-class1", i)
Tim O'Donnell's avatar
Tim O'Donnell committed
            self.class1_pan_allele_models.append(model)
Tim O'Donnell's avatar
Tim O'Donnell committed
            row = pandas.Series(collections.OrderedDict([
                ("model_name", model_name),
                ("allele", "pan-class1"),
                ("config_json", json.dumps(model.get_config())),
Tim O'Donnell's avatar
Tim O'Donnell committed
                ("model", model),
Tim O'Donnell's avatar
Tim O'Donnell committed
            ])).to_frame().T
Tim O'Donnell's avatar
Tim O'Donnell committed
            self.manifest_df = pandas.concat(
                [self.manifest_df, row], ignore_index=True)
            if models_dir_for_save:
Tim O'Donnell's avatar
Tim O'Donnell committed
                self.save(
                    models_dir_for_save, model_names_to_write=[model_name])
Tim O'Donnell's avatar
Tim O'Donnell committed

Tim O'Donnell's avatar
Tim O'Donnell committed

    def percentile_ranks(self, affinities, allele=None, alleles=None, throw=True):
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
        Return percentile ranks for the given ic50 affinities and alleles.

Tim O'Donnell's avatar
Tim O'Donnell committed
        The 'allele' and 'alleles' argument are as in the `predict` method.
Tim O'Donnell's avatar
Tim O'Donnell committed
        Specify one of these.

        Parameters
        ----------
        affinities : sequence of float
            nM affinities
        allele : string
        alleles : sequence of string
        throw : boolean
            If True, a ValueError will be raised in the case of unsupported
            alleles. If False, a warning will be logged and NaN will be returned
            for those percentile ranks.
Tim O'Donnell's avatar
Tim O'Donnell committed

        Returns
        -------
        numpy.array of float
        """
        if allele is not None:
            try:
                transform = self.allele_to_percent_rank_transform[allele]
                return transform.transform(affinities)
            except KeyError:
                msg = "Allele %s has no percentile rank information" % allele
                if throw:
                    raise ValueError(msg)
                else:
                    warnings.warn(msg)
                    # Return NaNs
                    return numpy.ones(len(affinities)) * numpy.nan
Tim O'Donnell's avatar
Tim O'Donnell committed

        if alleles is None:
            raise ValueError("Specify allele or alleles")

        df = pandas.DataFrame({"affinity": affinities})
        df["allele"] = alleles
        df["result"] = numpy.nan
        for (allele, sub_df) in df.groupby("allele"):
Tim O'Donnell's avatar
Tim O'Donnell committed
            df.loc[sub_df.index, "result"] = self.percentile_ranks(
                sub_df.affinity, allele=allele, throw=throw)
Tim O'Donnell's avatar
Tim O'Donnell committed
        return df.result.values

Tim O'Donnell's avatar
Tim O'Donnell committed
    def predict(self, peptides, alleles=None, allele=None, throw=True):
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
        Predict nM binding affinities.
        
        If multiple predictors are available for an allele, the predictions are
        the geometric means of the individual model predictions.
        
        One of 'allele' or 'alleles' must be specified. If 'allele' is specified
        all predictions will be for the given allele. If 'alleles' is specified
        it must be the same length as 'peptides' and give the allele
        corresponding to each peptide.
        
        Parameters
        ----------
Tim O'Donnell's avatar
Tim O'Donnell committed
        peptides : `EncodableSequences` or list of string
Tim O'Donnell's avatar
Tim O'Donnell committed
        alleles : list of string
        allele : string
Tim O'Donnell's avatar
Tim O'Donnell committed
        throw : boolean
            If True, a ValueError will be raised in the case of unsupported
            alleles or peptide lengths. If False, a warning will be logged and
            the predictions for the unsupported alleles or peptides will be NaN.
Tim O'Donnell's avatar
Tim O'Donnell committed

        Returns
        -------
        numpy.array of predictions
        """
Tim O'Donnell's avatar
Tim O'Donnell committed
        df = self.predict_to_dataframe(
            peptides=peptides,
            alleles=alleles,
Tim O'Donnell's avatar
Tim O'Donnell committed
            allele=allele,
            throw=throw,
Tim O'Donnell's avatar
Tim O'Donnell committed
            include_percentile_ranks=False,
Tim O'Donnell's avatar
Tim O'Donnell committed
        )
        return df.prediction.values

    def predict_to_dataframe(
Tim O'Donnell's avatar
Tim O'Donnell committed
            self,
            peptides,
Tim O'Donnell's avatar
Tim O'Donnell committed
            alleles=None,
            allele=None,
Tim O'Donnell's avatar
Tim O'Donnell committed
            throw=True,
Tim O'Donnell's avatar
Tim O'Donnell committed
            include_individual_model_predictions=False,
            include_percentile_ranks=True):
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
        Predict nM binding affinities. Gives more detailed output than `predict`
        method, including 5-95% prediction intervals.
        
        If multiple predictors are available for an allele, the predictions are
        the geometric means of the individual model predictions.
        
        One of 'allele' or 'alleles' must be specified. If 'allele' is specified
        all predictions will be for the given allele. If 'alleles' is specified
        it must be the same length as 'peptides' and give the allele
        corresponding to each peptide. 
        
        Parameters
        ----------
Tim O'Donnell's avatar
Tim O'Donnell committed
        peptides : `EncodableSequences` or list of string
Tim O'Donnell's avatar
Tim O'Donnell committed
        alleles : list of string
        allele : string
Tim O'Donnell's avatar
Tim O'Donnell committed
        throw : boolean
            If True, a ValueError will be raised in the case of unsupported
            alleles or peptide lengths. If False, a warning will be logged and
            the predictions for the unsupported alleles or peptides will be NaN.
Tim O'Donnell's avatar
Tim O'Donnell committed
        include_individual_model_predictions : boolean
            If True, the predictions of each individual model are included as
            columns in the result dataframe.
        include_percentile_ranks : boolean, default True
            If True, a "prediction_percentile" column will be included giving the
            percentile ranks. If no percentile rank information is available,
            this will be ignored with a warning.
Tim O'Donnell's avatar
Tim O'Donnell committed

        Returns
        -------
Tim O'Donnell's avatar
Tim O'Donnell committed
        `pandas.DataFrame` of predictions
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
Tim O'Donnell's avatar
Tim O'Donnell committed
        if isinstance(peptides, string_types):
            raise TypeError("peptides must be a list or array, not a string")
        if isinstance(alleles, string_types):
            raise TypeError("alleles must be a list or array, not a string")
        if allele is not None:
            if alleles is not None:
                raise ValueError("Specify exactly one of allele or alleles")
            alleles = [allele] * len(peptides)

        alleles = numpy.array(alleles)
        peptides = EncodableSequences.create(peptides)

Tim O'Donnell's avatar
Tim O'Donnell committed
        df = pandas.DataFrame({
            'peptide': peptides.sequences,
Tim O'Donnell's avatar
Tim O'Donnell committed
            'allele': alleles,
        })
        if len(df) == 0:
            # No predictions.
            logging.warning("Predicting for 0 peptides.")
            empty_result = pandas.DataFrame(
                columns=[
                    'peptide',
                    'allele',
                    'prediction',
                    'prediction_low',
                    'prediction_high'
                ])
            return empty_result

Tim O'Donnell's avatar
Tim O'Donnell committed
        df["normalized_allele"] = df.allele.map(
Tim O'Donnell's avatar
Tim O'Donnell committed
            mhcnames.normalize_allele_name)

        (min_peptide_length, max_peptide_length) = (
            self.supported_peptide_lengths)
        df["supported_peptide_length"] = (
            (df.peptide.str.len() >= min_peptide_length) &
            (df.peptide.str.len() <= max_peptide_length))
        if (~df.supported_peptide_length).any():
            msg = (
                "%d peptides have lengths outside of supported range [%d, %d]: "
                "%s" % (
                    (~df.supported_peptide_length).sum(),
                    min_peptide_length,
                    max_peptide_length,
                    str(df.ix[~df.supported_peptide_length].peptide.unique())))
            logging.warning(msg)
            if throw:
                raise ValueError(msg)

Tim O'Donnell's avatar
Tim O'Donnell committed
        if self.class1_pan_allele_models:
Tim O'Donnell's avatar
Tim O'Donnell committed
            unsupported_alleles = [
                allele for allele in
                df.normalized_allele.unique()
                if allele not in self.allele_to_pseudosequence
            ]
            if unsupported_alleles:
                msg = (
                    "No pseudosequences for allele(s): %s.\n"
Tim O'Donnell's avatar
Tim O'Donnell committed
                    "Supported alleles: %s" % (
                        " ".join(unsupported_alleles),
Tim O'Donnell's avatar
Tim O'Donnell committed
                        " ".join(sorted(self.allele_to_pseudosequence))))
Tim O'Donnell's avatar
Tim O'Donnell committed
                logging.warning(msg)
                if throw:
                    raise ValueError(msg)
            mask = df.supported_peptide_length
            if mask.sum() > 0:
                masked_allele_pseudosequences = (
                    df.ix[mask].normalized_allele.map(
                        self.allele_to_pseudosequence))
                masked_peptides = peptides.sequences[mask]
                for (i, model) in enumerate(self.class1_pan_allele_models):
                    df.loc[mask, "model_pan_%d" % i] = model.predict(
                        masked_peptides,
                        allele_pseudosequences=masked_allele_pseudosequences)
Tim O'Donnell's avatar
Tim O'Donnell committed

        if self.allele_to_allele_specific_models:
Tim O'Donnell's avatar
Tim O'Donnell committed
            query_alleles = df.normalized_allele.unique()
            unsupported_alleles = [
                allele for allele in query_alleles
                if not self.allele_to_allele_specific_models.get(allele)
            ]
            if unsupported_alleles:
                msg = (
                    "No single-allele models for allele(s): %s.\n"
                    "Supported alleles are: %s" % (
Tim O'Donnell's avatar
Tim O'Donnell committed
                        " ".join(unsupported_alleles),
Tim O'Donnell's avatar
Tim O'Donnell committed
                        " ".join(sorted(self.allele_to_allele_specific_models))))
Tim O'Donnell's avatar
Tim O'Donnell committed
                logging.warning(msg)
                if throw:
                    raise ValueError(msg)
            for allele in query_alleles:
                models = self.allele_to_allele_specific_models.get(allele, [])
                mask = (
                    (df.normalized_allele == allele) &
                    df.supported_peptide_length).values
                if mask.all():
                    # Common case optimization
                    for (i, model) in enumerate(models):
                        df["model_single_%d" % i] = model.predict(peptides)
                elif mask.sum() > 0:
                    allele_peptides = EncodableSequences.create(
                        df.ix[mask].peptide.values)
                    for (i, model) in enumerate(models):
                        df.loc[
                            mask, "model_single_%d" % i
                        ] = model.predict(allele_peptides)
Tim O'Donnell's avatar
Tim O'Donnell committed

        # Geometric mean
Tim O'Donnell's avatar
Tim O'Donnell committed
        df_predictions = df[
            [c for c in df.columns if c.startswith("model_")]
Tim O'Donnell's avatar
Tim O'Donnell committed
        ]
Tim O'Donnell's avatar
Tim O'Donnell committed
        logs = numpy.log(df_predictions)
        log_means = logs.mean(1)
Tim O'Donnell's avatar
Tim O'Donnell committed
        df["prediction"] = numpy.exp(log_means)
Tim O'Donnell's avatar
Tim O'Donnell committed
        df["prediction_low"] = numpy.exp(logs.quantile(0.05, axis=1))
        df["prediction_high"] = numpy.exp(logs.quantile(0.95, axis=1))
Tim O'Donnell's avatar
Tim O'Donnell committed

        if include_individual_model_predictions:
Tim O'Donnell's avatar
Tim O'Donnell committed
            columns = sorted(df.columns, key=lambda c: c.startswith('model_'))
        else:
            columns = [
                c for c in df.columns if c not in df_predictions.columns
            ]
        columns.remove("normalized_allele")
        columns.remove("supported_peptide_length")
Tim O'Donnell's avatar
Tim O'Donnell committed
        if include_percentile_ranks:
            if self.allele_to_percent_rank_transform:
                df["prediction_percentile"] = self.percentile_ranks(
                    df.prediction,
                    alleles=df.normalized_allele.values,
                    throw=throw)
                columns.append("prediction_percentile")
Tim O'Donnell's avatar
Tim O'Donnell committed
            else:
                warnings.warn("No percentile rank information available.")
Tim O'Donnell's avatar
Tim O'Donnell committed
    @staticmethod
    def save_weights(weights_list, filename):
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
Tim O'Donnell's avatar
Tim O'Donnell committed
        Save the model weights to the given filename using numpy's ".npz"
        format.
    
Tim O'Donnell's avatar
Tim O'Donnell committed
        Parameters
        ----------
Tim O'Donnell's avatar
Tim O'Donnell committed
        weights_list : list of array
Tim O'Donnell's avatar
Tim O'Donnell committed
        
Tim O'Donnell's avatar
Tim O'Donnell committed
        filename : string
            Should end in ".npz".
    
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
Tim O'Donnell's avatar
Tim O'Donnell committed
        numpy.savez(
            filename,
            **dict((("array_%d" % i), w) for (i, w) in enumerate(weights_list)))
Tim O'Donnell's avatar
Tim O'Donnell committed
    @staticmethod
    def load_weights(filename):
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
Tim O'Donnell's avatar
Tim O'Donnell committed
        Restore model weights from the given filename, which should have been
        created with `save_weights`.
    
Tim O'Donnell's avatar
Tim O'Donnell committed
        Parameters
        ----------
Tim O'Donnell's avatar
Tim O'Donnell committed
        filename : string
            Should end in ".npz".
Tim O'Donnell's avatar
Tim O'Donnell committed

Tim O'Donnell's avatar
Tim O'Donnell committed
        Returns
Tim O'Donnell's avatar
Tim O'Donnell committed
        ----------
        list of array
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
Tim O'Donnell's avatar
Tim O'Donnell committed
        loaded = numpy.load(filename)
        weights = [
            loaded["array_%d" % i]
            for i in range(len(loaded.keys()))
        ]
        loaded.close()
        return weights

    def calibrate_percentile_ranks(
            self,
            peptides=None,
            num_peptides_per_length=int(1e5),
            alleles=None,
            bins=None,
            worker_pool=None):
        """
        Compute the cumulative distribution of ic50 values for a set of alleles
        over a large universe of random peptides, to enable computing quantiles in
        this distribution later.

        Parameters
        ----------
        peptides : sequence of string, optional
            Peptides to use
        num_peptides_per_length : int, optional
            If peptides argument is not specified, then num_peptides_per_length
            peptides are randomly sampled from a uniform distribution for each
            supported length
        alleles : sequence of string, optional
            Alleles to perform calibration for. If not specified all supported
            alleles will be calibrated.
        bins : object
            Anything that can be passed to numpy.histogram's "bins" argument
            can be used here, i.e. either an integer or a sequence giving bin
            edges. This is in ic50 space.
        worker_pool : multiprocessing.Pool, optional
            If specified multiple alleles will be calibrated in parallel
        """
        if bins is None:
            bins = to_ic50(numpy.linspace(1, 0, 1000))

        if alleles is None:
            alleles = self.supported_alleles

        if peptides is None:
            peptides = []
            lengths = range(
                self.supported_peptide_lengths[0],
                self.supported_peptide_lengths[1] + 1)
            for length in lengths:
                peptides.extend(
                    random_peptides(num_peptides_per_length, length))

        encoded_peptides = EncodableSequences.create(peptides)

        if worker_pool and len(alleles) > 1:
            # Run in parallel
Tim O'Donnell's avatar
Tim O'Donnell committed

            # Performance hack.
            self.neural_networks[0].peptides_to_network_input(encoded_peptides)

            do_work = partial(
                _calibrate_percentile_ranks,
                predictor=self,
                peptides=encoded_peptides,
                bins=bins)
            list_of_singleton_alleles = [ [allele] for allele in alleles ]
            results = worker_pool.imap_unordered(
                do_work, list_of_singleton_alleles, chunksize=1)

            # Add progress bar
            results = tqdm.tqdm(results, ascii=True, total=len(alleles))

            # Merge results
            for partial_dict in results:
                self.allele_to_percent_rank_transform.update(partial_dict)
        else:
            # Run in serial
            self.allele_to_percent_rank_transform.update(
                _calibrate_percentile_ranks(
                    alleles=alleles,
                    predictor=self,
                    peptides=encoded_peptides,
                    bins=bins))


def _calibrate_percentile_ranks(alleles, predictor, peptides, bins):
    """
    Private helper function.

    Parameters
    ----------
Tim O'Donnell's avatar
Tim O'Donnell committed
    alleles : list of string
    predictor : Class1AffinityPredictor
    peptides : list of string or EncodableSequences
    bins : object
Tim O'Donnell's avatar
Tim O'Donnell committed
    dict : allele -> percentile rank transform

    """
    result = {}
    for (i, allele) in enumerate(alleles):
        predictions = predictor.predict(peptides, allele=allele)
        transform = PercentRankTransform()
        transform.fit(predictions, bins=bins)
        result[allele] = transform
    return result