"""
Class1 presentation predictor: a logistic regression model combining MHC I
binding affinity (BA) and antigen processing (AP) predictions.
"""
from __future__ import print_function

from os.path import join, exists
from os import mkdir

from socket import gethostname
from getpass import getuser

import time
import collections
import logging

from six import string_types

import numpy
import pandas
import sklearn
import sklearn.linear_model

try:
    import tqdm
except ImportError:
    # Progress bars are optional; fall back to plain iteration when tqdm is
    # unavailable. Fixes a typo: the fallback previously bound "tdqm",
    # leaving the name "tqdm" undefined and raising NameError later.
    tqdm = None

from .version import __version__
from .class1_affinity_predictor import Class1AffinityPredictor
from .class1_processing_predictor import Class1ProcessingPredictor
from .class1_neural_network import DEFAULT_PREDICT_BATCH_SIZE
from .encodable_sequences import EncodableSequences
from .regression_target import from_ic50, to_ic50
from .downloads import get_default_class1_presentation_models_dir

# Maximum number of MHC I alleles allowed per sample (one genotype).
MAX_ALLELES_PER_SAMPLE = 6
PREDICT_BATCH_SIZE = DEFAULT_PREDICT_BATCH_SIZE
PREDICT_CHUNK_SIZE = 100000  # currently used only for cleavage prediction


class Class1PresentationPredictor(object):
    """
    A logistic regression model over predicted binding affinity (BA) and antigen
    processing (AP) score.

    See load() and predict() methods for basic usage.
    """
    # Feature columns (in this order) fed to the logistic regression model.
    model_inputs = ["affinity_score", "processing_score"]

    def __init__(
            self,
            affinity_predictor=None,
            processing_predictor_with_flanks=None,
            processing_predictor_without_flanks=None,
            weights_dataframe=None,
            metadata_dataframes=None):
        """
        Instantiate a presentation predictor from its component models.

        Parameters
        ----------
        affinity_predictor : Class1AffinityPredictor, optional
        processing_predictor_with_flanks : Class1ProcessingPredictor, optional
        processing_predictor_without_flanks : Class1ProcessingPredictor, optional
        weights_dataframe : pandas.DataFrame, optional
            Logistic regression coefficients, one row per model variant
            ("with_flanks" / "without_flanks").
        metadata_dataframes : dict of string -> pandas.DataFrame, optional
            Arbitrary metadata to store (and serialize) with the predictor.
        """
        self.affinity_predictor = affinity_predictor
        self.processing_predictor_with_flanks = processing_predictor_with_flanks
        self.processing_predictor_without_flanks = processing_predictor_without_flanks
        self.weights_dataframe = weights_dataframe
        # Copy into a plain dict; any falsy value means "no metadata".
        self.metadata_dataframes = (
            dict(metadata_dataframes) if metadata_dataframes else {})
        # Cache of fitted sklearn models, keyed by variant name.
        self._models_cache = {}

    @property
    def supported_alleles(self):
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
        List of alleles supported by the underlying Class1AffinityPredictor
        """
        return self.affinity_predictor.supported_alleles
    @property
    def supported_peptide_lengths(self):
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
        (min, max) of supported peptide lengths, inclusive.
        """
        return self.affinity_predictor.supported_peptide_lengths

    def predict_affinity(
            self,
            peptides,
            experiment_names,
            alleles,
            include_affinity_percentile=False,
            verbose=1,
            throw=True):
        """
        Predict binding affinities across samples (experiments).

        For each peptide, the tightest (minimum nM) affinity across the
        alleles of its experiment is reported, along with the allele
        responsible for it.

        Parameters
        ----------
        peptides : list of string
        experiment_names : list of string
            Sample name for each peptide. Names index into `alleles`.
        alleles : dict of string -> list of string
            Alleles (a genotype) for each experiment name.
        include_affinity_percentile : bool
            Whether to include affinity percentile ranks in the result.
        verbose : int
            Set to 0 for quiet mode.
        throw : bool
            Whether to raise (vs. log a warning) on invalid peptides/alleles.

        Returns
        -------
        pandas.DataFrame with columns: peptide, experiment_name, affinity,
        best_allele, and (optionally) affinity_percentile.
        """
        df = pandas.DataFrame({
            "peptide": numpy.array(peptides, copy=False),
            "experiment_name": numpy.array(experiment_names, copy=False),
        })

        # Predict each experiment (sample) in turn, since each may have a
        # different genotype.
        iterator = df.groupby("experiment_name")
        if verbose > 0:
            print("Predicting affinities.")
            if tqdm is not None:
                iterator = tqdm.tqdm(
                    iterator, total=df.experiment_name.nunique())

        for (experiment, sub_df) in iterator:
            predictions_df = pandas.DataFrame(index=sub_df.index)
            experiment_peptides = EncodableSequences.create(sub_df.peptide.values)
            for allele in alleles[experiment]:
                predictions_df[allele] = self.affinity_predictor.predict(
                    peptides=experiment_peptides,
                    allele=allele,
                    model_kwargs={'batch_size': PREDICT_BATCH_SIZE},
                    throw=throw)
            # Tightest (lowest nM) predicted affinity across the sample's
            # alleles, and the allele achieving it. (Restores the "df.loc["
            # opener lost to source corruption.)
            df.loc[
                sub_df.index, "affinity"
            ] = predictions_df.min(1).values
            df.loc[
                sub_df.index, "best_allele"
            ] = predictions_df.idxmin(1).values

            if include_affinity_percentile:
                df.loc[sub_df.index, "affinity_percentile"] = (
                    self.affinity_predictor.percentile_ranks(
                        df.loc[sub_df.index, "affinity"].values,
                        alleles=df.loc[sub_df.index, "best_allele"].values,
                        throw=False))

        # Callers (fit, predict_to_dataframe) consume the returned frame.
        return df

    def predict_processing(
            self, peptides, n_flanks=None, c_flanks=None, verbose=1):
        """
        Predict antigen processing scores, optionally using flanking
        sequences.

        Parameters
        ----------
        peptides : list of string
        n_flanks : list of string, optional
            Upstream sequence of each peptide. Specify both flank lists or
            neither.
        c_flanks : list of string, optional
            Downstream sequence of each peptide.
        verbose : int
            Set to 0 for quiet mode.

        Returns
        -------
        numpy.array of processing scores, one per peptide.
        """
        if (n_flanks is None) != (c_flanks is None):
            raise ValueError("Specify both or neither of n_flanks, c_flanks")

        if n_flanks is None:
            if self.processing_predictor_without_flanks is None:
                raise ValueError("No processing predictor without flanks")
            predictor = self.processing_predictor_without_flanks
            # Without flanks, feed empty flanking sequences to the model.
            n_flanks = [""] * len(peptides)
            c_flanks = n_flanks
        else:
            if self.processing_predictor_with_flanks is None:
                raise ValueError("No processing predictor with flanks")
            predictor = self.processing_predictor_with_flanks

        # Predict in chunks to bound memory use. Use at least one chunk so
        # an empty peptide list doesn't make numpy.array_split raise.
        num_chunks = max(
            1, int(numpy.ceil(float(len(peptides)) / PREDICT_CHUNK_SIZE)))
        peptide_chunks = numpy.array_split(peptides, num_chunks)
        n_flank_chunks = numpy.array_split(n_flanks, num_chunks)
        c_flank_chunks = numpy.array_split(c_flanks, num_chunks)

        iterator = zip(peptide_chunks, n_flank_chunks, c_flank_chunks)
        if verbose > 0:
            print("Predicting processing.")
            if tqdm is not None:
                iterator = tqdm.tqdm(iterator, total=len(peptide_chunks))

        result_chunks = []
        for (peptide_chunk, n_flank_chunk, c_flank_chunk) in iterator:
            result_chunk = predictor.predict(
                peptides=peptide_chunk,
                n_flanks=n_flank_chunk,
                c_flanks=c_flank_chunk,
                batch_size=PREDICT_BATCH_SIZE)
            result_chunks.append(result_chunk)
        return numpy.concatenate(result_chunks)

    def fit(
            self,
            targets,
            peptides,
            experiment_names,
            alleles,
            n_flanks=None,
            c_flanks=None,
            verbose=1):
        """
        Fit the logistic regression model(s) over affinity and processing
        scores.

        One variant is trained per available kind of processing input:
        "without_flanks" whenever that processing predictor exists, and
        additionally "with_flanks" when flanking sequences are provided and
        the with-flanks processing predictor exists. Coefficients are stored
        in `self.weights_dataframe`.

        Parameters
        ----------
        targets : list of int
            Binary labels (1 = presented / hit, 0 = decoy).
        peptides : list of string
        experiment_names : list of string
            Sample name for each peptide.
        alleles : dict of string -> list of string
            Alleles (a genotype) for each experiment name.
        n_flanks : list of string, optional
        c_flanks : list of string, optional
        verbose : int
            Set to 0 for quiet mode.
        """
        df = self.predict_affinity(
            peptides=peptides,
            experiment_names=experiment_names,
            alleles=alleles,
            verbose=verbose)
        df["affinity_score"] = from_ic50(df.affinity)
        df["target"] = numpy.array(targets, copy=False)

        if (n_flanks is None) != (c_flanks is None):
            raise ValueError("Specify both or neither of n_flanks, c_flanks")

        with_flanks_list = []
        if self.processing_predictor_without_flanks is not None:
            with_flanks_list.append(False)

        if n_flanks is not None and self.processing_predictor_with_flanks is not None:
            with_flanks_list.append(True)

        if not with_flanks_list:
            raise RuntimeError("Can't fit any models")

        if self.weights_dataframe is None:
            self.weights_dataframe = pandas.DataFrame()

        for with_flanks in with_flanks_list:
            model_name = 'with_flanks' if with_flanks else "without_flanks"
            if verbose > 0:
                print("Training variant", model_name)

            df["processing_score"] = self.predict_processing(
                peptides=df.peptide.values,
                n_flanks=n_flanks if with_flanks else None,
                c_flanks=c_flanks if with_flanks else None,
                verbose=verbose)

            model = self.get_model()
            if verbose > 0:
                print("Fitting LR model.")
                print(df)

            model.fit(
                X=df[self.model_inputs].values,
                y=df.target.astype(float))

            # Persist coefficients so the model can be rebuilt after
            # save()/load() without pickling the sklearn object.
            self.weights_dataframe.loc[model_name, "intercept"] = model.intercept_
            for (name, value) in zip(self.model_inputs, numpy.squeeze(model.coef_)):
                self.weights_dataframe.loc[model_name, name] = value
            self._models_cache[model_name] = model

    def get_model(self, name=None):
        """
        Return a logistic regression model.

        With `name` None (or unknown), a fresh unfitted model is returned;
        otherwise the named variant is returned from the cache or
        reconstructed from `self.weights_dataframe`.

        Parameters
        ----------
        name : string, optional
            Model variant, e.g. "with_flanks" or "without_flanks".

        Returns
        -------
        sklearn.linear_model.LogisticRegression
        """
        if name is None or name not in self._models_cache:
            model = sklearn.linear_model.LogisticRegression(solver="lbfgs")
            if name is not None:
                # Rehydrate a fitted model directly from stored weights.
                row = self.weights_dataframe.loc[name]
                model.intercept_ = row.intercept
                model.coef_ = numpy.expand_dims(
                    row[self.model_inputs].values, axis=0)
                model.classes_ = numpy.array([0, 1])
        else:
            model = self._models_cache[name]
        return model

    def predict(
            self,
            peptides,
            alleles,
            experiment_names=None,
            n_flanks=None,
            c_flanks=None,
            verbose=1):
        """
        Predict presentation scores across a set of peptides.

        Parameters
        ----------
        peptides : list of string, or EncodableSequences
        alleles : list of string or string -> string dict
            A single genotype (list of alleles) applied to all peptides, or
            a dict of experiment name -> alleles used together with
            `experiment_names`.
        experiment_names : list of string, optional
            Sample name per peptide; required when `alleles` is a dict.
        n_flanks : list of string, optional
        c_flanks : list of string, optional
        verbose : int
            Set to 0 for quiet mode.

        Returns
        -------
        numpy.array of presentation scores, one per peptide.
        """
        # Thin wrapper: delegate to predict_to_dataframe and return just the
        # presentation_score column.
        return self.predict_to_dataframe(
            peptides=peptides,
            alleles=alleles,
            experiment_names=experiment_names,
            n_flanks=n_flanks,
            c_flanks=c_flanks,
            verbose=verbose).presentation_score.values

    def predict_sequences(
            self,
            sequences,
            alleles,
            result="best",
            comparison_quantity="presentation_score",
            filter_value=None,
            peptide_lengths=(8, 9, 10, 11),
            use_flanks=True,
            include_affinity_percentile=False,
            verbose=1,
            throw=True):
        """
        Predict presentation across protein sequences.

        Parameters
        ----------
        sequences : str, list of string, or string -> string dict
            Protein sequences. If a dict is given, the keys are arbitrary (
            e.g. protein names), and the values are the amino acid sequences.
        alleles : str, list of string, list of list of string, or string -> string dict
            MHC I alleles. Can be: (1) a string (a single allele), (2) a list of
            strings (a single genotype), (3) a list of list of strings
            (multiple genotypes, where the total number of genotypes must equal
            the number of sequences), or (4) a dict (in which case the keys must
            match the sequences dict keys).
        result : string
            One of:
             - "best": return the strongest peptide for each sequence
             - "all": return predictions for all peptides
             - "filtered": return predictions stronger where comparison_quantity
               is stronger than filter_value.
        comparison_quantity : string
            One of "presentation_score", "processing_score", or "affinity".
            Quantity to use to rank (if result is "best") or filter (if result
            is "filtered") results.
        filter_value : float
            Threshold value to use, only relevant when result is "filtered".
            If comparison_quantity is "affinity", then all results less than
            (i.e. tighter than) the specified nM affinity are retained. If it's
            "presentation_score" or "processing_score" then results greater than
            the indicated filter_value are retained.
        peptide_lengths : sequence of int
            Peptide lengths to predict for.
        use_flanks : bool
            Whether to include flanking sequences when running the AP predictor
            (for better cleavage prediction).
        include_affinity_percentile : bool
            Whether to include affinity percentile ranks in output.
        verbose : int
            Set to 0 for quiet mode.
        throw : boolean
            Whether to throw exceptions (vs. log warnings) on invalid inputs.

        Returns
        -------
        pandas.DataFrame with columns:
            peptide, n_flank, c_flank, sequence_name, affinity, best_allele,
            processing_score, presentation_score
        """
        # Prefer the with-flanks processing predictor when available.
        processing_predictor = self.processing_predictor_with_flanks
        if not use_flanks or processing_predictor is None:
            processing_predictor = self.processing_predictor_without_flanks

        supported_sequence_lengths = processing_predictor.sequence_lengths
        n_flank_length = supported_sequence_lengths["n_flank"]
        c_flank_length = supported_sequence_lengths["c_flank"]

        sequence_names = []
        n_flanks = [] if use_flanks else None
        c_flanks = [] if use_flanks else None
        peptides = []

        # Normalize sequences to an ordered name -> sequence dict.
        if isinstance(sequences, string_types):
            sequences = [sequences]

        if not isinstance(sequences, dict):
            sequences = collections.OrderedDict(
                ("sequence_%04d" % (i + 1), sequence)
                for (i, sequence) in enumerate(sequences))

        # Normalize alleles to a sequence-name -> genotype dict.
        if isinstance(alleles, string_types):
            alleles = [alleles]

        if not isinstance(alleles, dict):
            if all([isinstance(item, string_types) for item in alleles]):
                # Case (2): one genotype shared by all sequences.
                alleles = dict((name, alleles) for name in sequences.keys())
            elif len(alleles) != len(sequences):
                raise ValueError(
                    "alleles must be (1) a string (a single allele), (2) a list of "
                    "strings (a single genotype), (3) a list of list of strings ("
                    "(multiple genotypes, where the total number of genotypes "
                    "must equal the number of sequences), or (4) a dict (in which "
                    "case the keys must match the sequences dict keys). Here "
                    "it seemed like option (3) was being used, but the length "
                    "of alleles (%d) did not match the length of sequences (%d)."
                    % (len(alleles), len(sequences)))
            else:
                alleles = dict(zip(sequences.keys(), alleles))

        missing = [key for key in sequences if key not in alleles]
        if missing:
            raise ValueError(
                "Sequence names missing from alleles dict: ", missing)

        # Enumerate all peptides of the requested lengths from each sequence,
        # along with their flanking sequences.
        for (name, sequence) in sequences.items():
            if not isinstance(sequence, string_types):
                raise ValueError("Expected string, not %s (%s)" % (
                    sequence, type(sequence)))
            # "+ 1" so the final start position (where the shortest peptide
            # length still fits) is included; without it the last peptide of
            # each sequence was silently dropped.
            for peptide_start in range(
                    len(sequence) - min(peptide_lengths) + 1):
                n_flank_start = max(0, peptide_start - n_flank_length)
                for peptide_length in peptide_lengths:
                    peptide = sequence[
                        peptide_start: peptide_start + peptide_length]
                    if len(peptide) != peptide_length:
                        # Peptide would run off the end of the sequence;
                        # skip rather than emit a truncated duplicate.
                        continue
                    c_flank_end = (
                        peptide_start + peptide_length + c_flank_length)
                    sequence_names.append(name)
                    peptides.append(peptide)
                    if use_flanks:
                        n_flanks.append(
                            sequence[n_flank_start : peptide_start])
                        c_flanks.append(
                            sequence[peptide_start + peptide_length : c_flank_end])

        result_df = self.predict_to_dataframe(
            peptides=peptides,
            alleles=alleles,
            n_flanks=n_flanks,
            c_flanks=c_flanks,
            experiment_names=sequence_names,
            include_affinity_percentile=include_affinity_percentile,
            verbose=verbose,
            throw=throw)

        result_df = result_df.rename(
            columns={"experiment_name": "sequence_name"})

        # Scores rank descending (higher = stronger); affinities ascending
        # (lower nM = tighter binding).
        comparison_is_score = comparison_quantity.endswith("score")

        result_df = result_df.sort_values(
            comparison_quantity,
            ascending=not comparison_is_score)

        if result == "best":
            result_df = result_df.drop_duplicates(
                "sequence_name", keep="first").sort_values("sequence_name")
        elif result == "filtered":
            if comparison_is_score:
                result_df = result_df.loc[
                    result_df[comparison_quantity] >= filter_value
                ]
            else:
                result_df = result_df.loc[
                    result_df[comparison_quantity] <= filter_value
                ]
        elif result == "all":
            pass
        else:
            raise ValueError(
                "Unknown result: %s. Valid choices are: best, filtered, all"
                % result)

        result_df = result_df.reset_index(drop=True)
        result_df = result_df.copy()

        return result_df

    def predict_to_dataframe(
            self,
            peptides,
            alleles,
            experiment_names=None,
            n_flanks=None,
            c_flanks=None,
            include_affinity_percentile=False,
            verbose=1,
            throw=True):
        """
        Predict presentation scores and return a detailed result DataFrame.

        Parameters
        ----------
        peptides : list of string
        alleles : list of string, or string -> list of string dict
            A single genotype (list of up to MAX_ALLELES_PER_SAMPLE alleles)
            applied to all peptides, or a dict mapping experiment names to
            genotypes (requires experiment_names).
        experiment_names : list of string, optional
            Sample name per peptide; required when alleles is a dict.
        n_flanks : list of string, optional
            Specify both flank lists or neither.
        c_flanks : list of string, optional
        include_affinity_percentile : bool
            Whether to include affinity percentile ranks in output.
        verbose : int
            Set to 0 for quiet mode.
        throw : bool
            Whether to raise (vs. log a warning) on invalid inputs.

        Returns
        -------
        pandas.DataFrame with columns: peptide, [n_flank, c_flank,]
        experiment_name, affinity, best_allele, processing_score,
        presentation_score.
        """
        if isinstance(peptides, string_types):
            raise TypeError("peptides must be a list not a string")
        if isinstance(alleles, string_types):
            raise TypeError("alleles must be a list or dict")

        if isinstance(alleles, dict):
            if experiment_names is None:
                raise ValueError(
                    "experiment_names must be supplied when alleles is a dict")
        else:
            if experiment_names is not None:
                raise ValueError(
                    "alleles must be a dict when experiment_names is specified")
            alleles = numpy.array(alleles, copy=False)
            if len(alleles) > MAX_ALLELES_PER_SAMPLE:
                raise ValueError(
                    "When alleles is a list, it must have at most %d elements. "
                    "These alleles are taken to be a genotype for an "
                    "individual, and the strongest prediction across alleles "
                    "will be taken for each peptide. Note that this differs "
                    "from Class1AffinityPredictor.predict(), where alleles "
                    "is expected to be the same length as peptides."
                    % MAX_ALLELES_PER_SAMPLE)

            # Treat the whole peptide list as one sample with one genotype.
            experiment_names = ["experiment1"] * len(peptides)
            alleles = {
                "experiment1": alleles,
            }

        if (n_flanks is None) != (c_flanks is None):
            raise ValueError("Specify both or neither of n_flanks, c_flanks")

        processing_scores = self.predict_processing(
            peptides=peptides,
            n_flanks=n_flanks,
            c_flanks=c_flanks,
            verbose=verbose)

        df = self.predict_affinity(
            peptides=peptides,
            experiment_names=experiment_names,
            alleles=alleles,
            include_affinity_percentile=include_affinity_percentile,
            verbose=verbose,
            throw=throw)
        df["affinity_score"] = from_ic50(df.affinity)
        df["processing_score"] = processing_scores
        if c_flanks is not None:
            df.insert(1, "c_flank", c_flanks)
        if n_flanks is not None:
            df.insert(1, "n_flank", n_flanks)

        # Use the with-flanks LR variant only when flanks were provided.
        model_name = 'with_flanks' if n_flanks is not None else "without_flanks"
        model = self.get_model(model_name)
        df["presentation_score"] = model.predict_proba(
            df[self.model_inputs].values)[:,1]
        # affinity_score is an internal model input; drop it from output.
        del df["affinity_score"]
        return df

    def save(self, models_dir):
        """
        Serialize the predictor to a directory on disk. If the directory does
        not exist it will be created.

        The affinity and processing predictors are saved to subdirectories,
        the logistic regression coefficients to weights.csv, and provenance
        details to info.txt.

        Parameters
        ----------
        models_dir : string
            Path to directory. It will be created if it doesn't exist.
        """
        if self.weights_dataframe is None:
            raise RuntimeError("Can't save before fitting")

        if not exists(models_dir):
            mkdir(models_dir)

        # Save underlying predictors
        self.affinity_predictor.save(join(models_dir, "affinity_predictor"))
        if self.processing_predictor_with_flanks is not None:
            self.processing_predictor_with_flanks.save(
                join(models_dir, "processing_predictor_with_flanks"))
        if self.processing_predictor_without_flanks is not None:
            self.processing_predictor_without_flanks.save(
                join(models_dir, "processing_predictor_without_flanks"))

        # Save model coefficients.
        self.weights_dataframe.to_csv(join(models_dir, "weights.csv"))

        # Write "info.txt" recording when/where/by whom this was trained.
        info_path = join(models_dir, "info.txt")
        rows = [
            ("trained on", time.asctime()),
            ("package   ", "mhcflurry %s" % __version__),
            ("hostname  ", gethostname()),
            ("user      ", getuser()),
        ]
        pandas.DataFrame(rows).to_csv(
            info_path, sep="\t", header=False, index=False)

        if self.metadata_dataframes:
            for (name, df) in self.metadata_dataframes.items():
                metadata_df_path = join(models_dir, "%s.csv.bz2" % name)
                df.to_csv(metadata_df_path, index=False, compression="bz2")

    @classmethod
    def load(cls, models_dir=None, max_models=None):
        """
        Deserialize a predictor from a directory on disk.

        Parameters
        ----------
        models_dir : string
            Path to directory. If unspecified the default downloaded models are
            used.

        max_models : int, optional
            Maximum number of affinity and processing (counted separately)
            models to load

        Returns
        -------
        `Class1PresentationPredictor` instance
        """
        if models_dir is None:
            models_dir = get_default_class1_presentation_models_dir()

        affinity_predictor = Class1AffinityPredictor.load(
            join(models_dir, "affinity_predictor"), max_models=max_models)

        # Processing predictors are optional on disk; warn (don't fail)
        # when one is missing.
        processing_predictor_with_flanks = None
        if exists(join(models_dir, "processing_predictor_with_flanks")):
            processing_predictor_with_flanks = Class1ProcessingPredictor.load(
                join(models_dir, "processing_predictor_with_flanks"),
                max_models=max_models)
        else:
            logging.warning(
                "Presentation predictor is missing processing predictor: %s",
                join(models_dir, "processing_predictor_with_flanks"))

        processing_predictor_without_flanks = None
        if exists(join(models_dir, "processing_predictor_without_flanks")):
            processing_predictor_without_flanks = Class1ProcessingPredictor.load(
                join(models_dir, "processing_predictor_without_flanks"),
                max_models=max_models)
        else:
            logging.warning(
                "Presentation predictor is missing processing predictor: %s",
                join(models_dir, "processing_predictor_without_flanks"))

        # Logistic regression coefficients, indexed by variant name.
        weights_dataframe = pandas.read_csv(
            join(models_dir, "weights.csv"),
            index_col=0)

        result = cls(
            affinity_predictor=affinity_predictor,
            processing_predictor_with_flanks=processing_predictor_with_flanks,
            processing_predictor_without_flanks=processing_predictor_without_flanks,
            weights_dataframe=weights_dataframe)
        return result