Skip to content
Snippets Groups Projects
class1_ligandome_predictor.py 4.82 KiB
Newer Older
from __future__ import print_function

Tim O'Donnell's avatar
Tim O'Donnell committed
import time
import collections
Tim O'Donnell's avatar
Tim O'Donnell committed
from six import string_types
Tim O'Donnell's avatar
Tim O'Donnell committed

import numpy
Tim O'Donnell's avatar
Tim O'Donnell committed
import pandas
Tim O'Donnell's avatar
Tim O'Donnell committed
import mhcnames
import hashlib

from .hyperparameters import HyperparameterDefaults
Tim O'Donnell's avatar
Tim O'Donnell committed
from .class1_neural_network import Class1NeuralNetwork, DEFAULT_PREDICT_BATCH_SIZE
from .encodable_sequences import EncodableSequences
Tim O'Donnell's avatar
Tim O'Donnell committed
from .regression_target import from_ic50, to_ic50
from .random_negative_peptides import RandomNegativePeptides
Tim O'Donnell's avatar
Tim O'Donnell committed
from .allele_encoding import MultipleAlleleEncoding, AlleleEncoding
from .auxiliary_input import AuxiliaryInputEncoder
Tim O'Donnell's avatar
Tim O'Donnell committed
from .batch_generator import MultiallelicMassSpecBatchGenerator
Tim O'Donnell's avatar
Tim O'Donnell committed
from .custom_loss import (
    MSEWithInequalities,
Tim O'Donnell's avatar
Tim O'Donnell committed
    MultiallelicMassSpecLoss,
    ZeroLoss)
Tim O'Donnell's avatar
Tim O'Donnell committed

class Class1LigandomePredictor(object):
    """
    Ensemble predictor built from several ligandome neural networks.

    Every network is run on the same peptides and allele encoding, the
    per-network outputs are averaged, and for each peptide the allele with
    the highest averaged score is reported.
    """

    def __init__(self, class1_ligandome_neural_networks, allele_to_sequence):
        """
        Parameters
        ----------
        class1_ligandome_neural_networks : list
            Networks making up the ensemble. Each is expected to expose a
            ``hyperparameters`` mapping containing 'max_alleles' and a
            ``predict`` method (both are used by this class).
        allele_to_sequence : dict
            Forwarded unchanged to MultipleAlleleEncoding when a plain
            list of alleles is given to the predict methods.
        """
        self.networks = class1_ligandome_neural_networks
        self.allele_to_sequence = allele_to_sequence

    @property
    def max_alleles(self):
        """
        The 'max_alleles' hyperparameter shared by all networks.

        Taken from the first network; all other networks are asserted to
        agree with it.
        """
        max_alleles = self.networks[0].hyperparameters['max_alleles']
        # Compare against the local value. Referring to self.max_alleles
        # here would re-enter this property and recurse without bound.
        assert all(
            n.hyperparameters['max_alleles'] == max_alleles
            for n in self.networks), (
                "Networks disagree on the max_alleles hyperparameter")
        return max_alleles

    def predict(self, peptides, alleles, batch_size=DEFAULT_PREDICT_BATCH_SIZE):
        """
        Predict the ensemble score of the best allele for each peptide.

        Thin wrapper around `predict_to_dataframe` that returns only the
        values of its "score" column.

        Parameters
        ----------
        peptides : list or array of string (not a single string)
        alleles : iterable of string, or MultipleAlleleEncoding
        batch_size : int
            Forwarded to each network's ``predict``.

        Returns
        -------
        numpy.array with one entry per peptide
        """
        return self.predict_to_dataframe(
            peptides=peptides,
            alleles=alleles,
            batch_size=batch_size).score.values

    def predict_to_dataframe(
            self,
            peptides,
            alleles,
            include_details=False,
            batch_size=DEFAULT_PREDICT_BATCH_SIZE):
        """
        Run the ensemble and summarize the results in a DataFrame.

        Parameters
        ----------
        peptides : list or array of string (not a single string)
            Converted with EncodableSequences.create.
        alleles : iterable of string, or MultipleAlleleEncoding
            When not already a MultipleAlleleEncoding, the alleles are
            treated as the genotype of a single individual (at most
            `max_alleles` entries) that applies to every peptide.
        include_details : bool
            If True, add per-allele-slot columns: the allele, the mean
            score and affinity, and the 5th / 95th percentile across
            networks for each.
        batch_size : int
            Forwarded to each network's ``predict``.

        Returns
        -------
        pandas.DataFrame with columns "peptide", "allele", "score" and
        "affinity" (one row per peptide), plus the detail columns when
        requested. Affinity columns are passed through `to_ic50`.

        Raises
        ------
        TypeError
            If `peptides` or `alleles` is a single string.
        ValueError
            If a list of more than `max_alleles` alleles is given.
        """
        if isinstance(peptides, string_types):
            raise TypeError("peptides must be a list or array, not a string")
        if isinstance(alleles, string_types):
            raise TypeError(
                "alleles must be an iterable or MultipleAlleleEncoding")

        peptides = EncodableSequences.create(peptides)

        # Evaluate the property once; it scans every network on each access.
        max_alleles = self.max_alleles

        if not isinstance(alleles, MultipleAlleleEncoding):
            # Materialize so len() works for any iterable (e.g. a
            # generator or a set), as the error message above promises.
            alleles = list(alleles)
            if len(alleles) > max_alleles:
                raise ValueError(
                    "When alleles is a list, it must have at most %d elements. "
                    "These alleles are taken to be a genotype for an "
                    "individual, and the strongest prediction across alleles "
                    "will be taken for each peptide. Note that this differs "
                    "from Class1AffinityPredictor.predict(), where alleles "
                    "is expected to be the same length as peptides."
                    % (
                        max_alleles))
            alleles = MultipleAlleleEncoding(
                experiment_names=numpy.tile("experiment", len(peptides)),
                experiment_to_allele_list={
                    "experiment": alleles,
                },
                allele_to_sequence=self.allele_to_sequence,
                max_alleles_per_experiment=max_alleles)

        score_array = []
        affinity_array = []

        for network in self.networks:
            predictions = network.predict(
                peptides=peptides,
                allele_encoding=alleles,
                batch_size=batch_size)
            score_array.append(predictions.score)
            affinity_array.append(predictions.affinity)

        # Stacked as (network, peptide, allele slot); the slicing in the
        # details section below relies on this layout.
        score_array = numpy.array(score_array)
        affinity_array = numpy.array(affinity_array)

        ensemble_scores = numpy.mean(score_array, axis=0)
        ensemble_affinity = numpy.mean(affinity_array, axis=0)
        top_allele_index = numpy.argmax(ensemble_scores, axis=-1)

        # Pair each row with its best column. Indexing with
        # top_allele_index alone would select whole rows rather than one
        # element per peptide.
        row_index = numpy.arange(len(top_allele_index))
        top_score = ensemble_scores[row_index, top_allele_index]
        top_affinity = ensemble_affinity[row_index, top_allele_index]

        # NOTE(review): assumes alleles.alleles holds one allele name per
        # (peptide, allele slot) in the same order as the network outputs;
        # the reshape accepts either a flat or a 2-D layout - TODO confirm.
        allele_matrix = numpy.asarray(alleles.alleles).reshape(
            ensemble_scores.shape)

        result_df = pandas.DataFrame({"peptide": peptides.sequences})
        result_df["allele"] = allele_matrix[row_index, top_allele_index]
        result_df["score"] = top_score
        result_df["affinity"] = to_ic50(top_affinity)

        if include_details:
            for i in range(max_alleles):
                prefix = "allele%d" % (i + 1)
                slot_scores = score_array[:, :, i]
                slot_affinities = affinity_array[:, :, i]

                # NOTE(review): this previously read `alleles.allele`,
                # which disagrees with the `alleles.alleles` attribute
                # used for the main "allele" column - confirm.
                result_df[prefix] = allele_matrix[:, i]
                result_df[prefix + " score"] = ensemble_scores[:, i]
                result_df[prefix + " score low"] = numpy.percentile(
                    slot_scores, 5.0, axis=0)
                result_df[prefix + " score high"] = numpy.percentile(
                    slot_scores, 95.0, axis=0)
                result_df[prefix + " affinity"] = to_ic50(
                    ensemble_affinity[:, i])

                # Report the bounds in the same units as the "affinity"
                # column. min/max keeps low <= high whichever direction
                # to_ic50 maps the regression target.
                bound_a = to_ic50(
                    numpy.percentile(slot_affinities, 5.0, axis=0))
                bound_b = to_ic50(
                    numpy.percentile(slot_affinities, 95.0, axis=0))
                result_df[prefix + " affinity low"] = numpy.minimum(
                    bound_a, bound_b)
                result_df[prefix + " affinity high"] = numpy.maximum(
                    bound_a, bound_b)
        return result_df


    # TODO: implement saving and loading