class1_ligandome_predictor.py

from __future__ import print_function

import time
import collections
from six import string_types

import numpy
import pandas
import mhcnames
import hashlib

from .hyperparameters import HyperparameterDefaults
from .class1_neural_network import Class1NeuralNetwork, DEFAULT_PREDICT_BATCH_SIZE
from .encodable_sequences import EncodableSequences
from .regression_target import from_ic50, to_ic50
from .random_negative_peptides import RandomNegativePeptides
from .allele_encoding import MultipleAlleleEncoding, AlleleEncoding
from .auxiliary_input import AuxiliaryInputEncoder
from .batch_generator import MultiallelicMassSpecBatchGenerator
from .custom_loss import (
    MSEWithInequalities,
    MultiallelicMassSpecLoss,
    ZeroLoss)


class Class1LigandomePredictor(object):
    def __init__(self, class1_ligandome_neural_networks, allele_to_sequence):
        self.networks = class1_ligandome_neural_networks
        self.allele_to_sequence = allele_to_sequence

    @property
    def max_alleles(self):
        max_alleles = self.networks[0].hyperparameters['max_alleles']
        assert all(
            n.hyperparameters['max_alleles'] == self.max_alleles
            for n in self.networks)
        return max_alleles

    def predict(self, peptides, alleles, batch_size=DEFAULT_PREDICT_BATCH_SIZE):
        return self.predict_to_dataframe(
            peptides=peptides,
            alleles=alleles,
            batch_size=batch_size).score.values

    def predict_to_dataframe(
            self,
            peptides,
            alleles,
            include_details=False,
            batch_size=DEFAULT_PREDICT_BATCH_SIZE):

        if isinstance(peptides, string_types):
            raise TypeError("peptides must be a list or array, not a string")
        if isinstance(alleles, string_types):
            raise TypeError(
                "alleles must be an iterable or MultipleAlleleEncoding")

        peptides = EncodableSequences.create(peptides)

        if not isinstance(alleles, MultipleAlleleEncoding):
            if len(alleles) > self.max_alleles:
                raise ValueError(
                    "When alleles is a list, it must have at most %d elements. "
                    "These alleles are taken to be a genotype for an "
                    "individual, and the strongest prediction across alleles "
                    "will be taken for each peptide. Note that this differs "
                    "from Class1AffinityPredictor.predict(), where alleles "
                    "is expected to be the same length as peptides."
                    % (
                        self.max_alleles))
            alleles = MultipleAlleleEncoding(
                experiment_names=numpy.tile("experiment", len(peptides)),
                experiment_to_allele_list={
                    "experiment": alleles,
                },
                allele_to_sequence=self.allele_to_sequence,
                max_alleles_per_experiment=self.max_alleles)

        score_array = []
        affinity_array = []

        for (i, network) in enumerate(self.networks):
            predictions = network.predict(
                peptides=peptides,
                allele_encoding=alleles,
                batch_size=batch_size)
            score_array.append(predictions.score)
            affinity_array.append(predictions.affinity)

        score_array = numpy.array(score_array)
        affinity_array = numpy.array(affinity_array)

        ensemble_scores = numpy.mean(score_array, axis=0)
        ensemble_affinity = numpy.mean(affinity_array, axis=0)
        top_allele_index = numpy.argmax(ensemble_scores, axis=-1)
        top_score = ensemble_scores[top_allele_index]
        top_affinity = ensemble_affinity[top_allele_index]

        result_df = pandas.DataFrame({"peptide": peptides.sequences})
        result_df["allele"] = alleles.alleles[top_allele_index]
        result_df["score"] = top_score
        result_df["affinity"] = to_ic50(top_affinity)

        if include_details:
            for i in range(self.max_alleles):
                result_df["allele%d" % (i + 1)] = alleles.allele[:, i]
                result_df["allele%d score" % (i + 1)] = ensemble_scores[:, i]
                result_df["allele%d score low" % (i + 1)] = numpy.percentile(
                    score_array[:, :, i], 5.0, axis=0)
                result_df["allele%d score high" % (i + 1)] = numpy.percentile(
                    score_array[:, :, i], 95.0, axis=0)
                result_df["allele%d affinity" % (i + 1)] = to_ic50(
                    ensemble_affinity[:, i])
                result_df["allele%d affinity low" % (i + 1)] = numpy.percentile(
                    affinity_array[:, :, i], 5.0, axis=0)
                result_df["allele%d affinity high" % (i + 1)] = numpy.percentile(
                    affinity_array[:, :, i], 95.0, axis=0)
        return result_df


    # TODO: implement saving and loading