Skip to content
Snippets Groups Projects
class1_affinity_predictor.py 12.4 KiB
Newer Older
Tim O'Donnell's avatar
Tim O'Donnell committed
import collections
import time
import hashlib
Tim O'Donnell's avatar
Tim O'Donnell committed
import json
Tim O'Donnell's avatar
Tim O'Donnell committed
from os.path import join, exists
Tim O'Donnell's avatar
Tim O'Donnell committed
from six import string_types
Tim O'Donnell's avatar
Tim O'Donnell committed

import numpy
import pandas

import mhcnames

from ..encodable_sequences import EncodableSequences
Tim O'Donnell's avatar
Tim O'Donnell committed
from ..downloads import get_path
Tim O'Donnell's avatar
Tim O'Donnell committed

Tim O'Donnell's avatar
Tim O'Donnell committed
from .class1_neural_network import Class1NeuralNetwork
Tim O'Donnell's avatar
Tim O'Donnell committed


Tim O'Donnell's avatar
Tim O'Donnell committed
class LazyLoadingClass1NeuralNetwork(object):
    """
    Holder for a Class1NeuralNetwork that may not have been built yet.

    An instance is created in one of two mutually exclusive ways:

    * with `model`: an already constructed network is stored as-is;
    * with `config` and `weights_filename`: nothing is built up front, and
      the network is created from the config and its weights restored the
      first time the `instance` property is read.
    """

    def __init__(self, model=None, config=None, weights_filename=None):
        """
        Parameters
        ----------
        model : Class1NeuralNetwork, optional
            Existing network. When given, `config` and `weights_filename`
            must both be None.
        config : object, optional
            Value later passed to `Class1NeuralNetwork.from_config`.
            Required when `model` is None.
        weights_filename : string, optional
            Value later passed to the network's `restore_weights`.
            Required when `model` is None.
        """
        if model is not None:
            # Eager mode: the lazy-loading arguments must not be supplied.
            assert config is None
            assert weights_filename is None
        else:
            # Lazy mode: both pieces needed to rebuild the network later.
            assert config is not None
            assert weights_filename is not None

        self.model = model
        self.config = config
        self.weights_filename = weights_filename

    @classmethod
    def wrap(cls, instance):
        """
        Return `instance` as a lazy wrapper.

        Objects that are already wrappers are returned unchanged; a
        Class1NeuralNetwork is wrapped in eager mode.

        Raises
        ------
        TypeError
            If `instance` is neither of the two supported types.
        """
        if isinstance(instance, cls):
            return instance
        if not isinstance(instance, Class1NeuralNetwork):
            raise TypeError("Unsupported type: %s" % instance)
        return cls(model=instance)

    @classmethod
    def wrap_list(cls, lst):
        """
        Apply `wrap` to every element of `lst` and return the new list.
        """
        return list(map(cls.wrap, lst))

    @property
    def instance(self):
        """
        The underlying network, built and cached on first access.
        """
        if self.model is not None:
            return self.model
        self.model = Class1NeuralNetwork.from_config(self.config)
        self.model.restore_weights(self.weights_filename)
        return self.model


Tim O'Donnell's avatar
Tim O'Donnell committed
class Class1AffinityPredictor(object):
    """
    Ensemble of neural networks predicting peptide / MHC class I affinity.

    Two kinds of models are held, each wrapped in a
    LazyLoadingClass1NeuralNetwork:

    * allele-specific models, stored per normalized allele name in
      `allele_to_allele_specific_models`;
    * pan-allele models, stored in `class1_pan_allele_models`, which are
      given a pseudosequence looked up in `allele_to_pseudosequence`.

    `manifest_df` has one row per model with columns `model_name`,
    `allele`, `config_json` and `model`; it drives `save` and `load`.
    """

    def __init__(
            self,
            allele_to_allele_specific_models=None,
            class1_pan_allele_models=None,
            allele_to_pseudosequence=None,
            manifest_df=None):
        """
        Parameters
        ----------
        allele_to_allele_specific_models : dict of string -> list, optional
            Maps allele name to a list of models (Class1NeuralNetwork or
            LazyLoadingClass1NeuralNetwork). Defaults to no models.
        class1_pan_allele_models : list, optional
            Pan-allele models of the same accepted types. Defaults to no
            models.
        allele_to_pseudosequence : mapping, optional
            Lookup from allele name to pseudosequence. Must be non-empty
            when any pan-allele model is given.
        manifest_df : pandas.DataFrame, optional
            Manifest with columns `model_name`, `allele`, `config_json`
            and `model`. An empty manifest is created when omitted.
        """
        if allele_to_allele_specific_models is None:
            allele_to_allele_specific_models = {}
        if class1_pan_allele_models is None:
            class1_pan_allele_models = []

        if class1_pan_allele_models:
            assert allele_to_pseudosequence, "Pseudosequences required"

        self.allele_to_allele_specific_models = dict(
            (k, LazyLoadingClass1NeuralNetwork.wrap_list(v))
            for (k, v) in allele_to_allele_specific_models.items())
        self.class1_pan_allele_models = (
            LazyLoadingClass1NeuralNetwork.wrap_list(class1_pan_allele_models))
        self.allele_to_pseudosequence = allele_to_pseudosequence

        if manifest_df is None:
            manifest_df = pandas.DataFrame()
            manifest_df["model_name"] = []
            manifest_df["allele"] = []
            manifest_df["config_json"] = []
            manifest_df["model"] = []
        self.manifest_df = manifest_df

    def save(self, models_dir, model_names_to_write=None):
        """
        Write model weights and the manifest to `models_dir`.

        The manifest (without its `model` column) is always written in
        full as `manifest.csv`; weights are written only for the selected
        models.

        Parameters
        ----------
        models_dir : string
            Directory to write to.
        model_names_to_write : list of string, optional
            Names of the models whose weights should be written. By
            default the weights of every model in the manifest are
            written.
        """
        num_models = len(self.class1_pan_allele_models) + sum(
            len(v) for v in self.allele_to_allele_specific_models.values())
        assert len(self.manifest_df) == num_models, (
            "Manifest seems out of sync with models: %d vs %d entries" % (
                len(self.manifest_df), num_models))

        if model_names_to_write is None:
            # Write all models
            model_names_to_write = self.manifest_df.model_name.values

        # Boolean-mask selection via .loc (DataFrame.ix no longer exists in
        # current pandas releases).
        sub_manifest_df = self.manifest_df.loc[
            self.manifest_df.model_name.isin(model_names_to_write)
        ]

        for (_, row) in sub_manifest_df.iterrows():
            weights_path = self.weights_path(models_dir, row.model_name)
            row.model.instance.save_weights(weights_path)
            print("Wrote: %s" % weights_path)

        # The "model" column holds live objects and is not serialized.
        write_manifest_df = self.manifest_df[[
            c for c in self.manifest_df.columns if c != "model"
        ]]
        manifest_path = join(models_dir, "manifest.csv")
        write_manifest_df.to_csv(manifest_path, index=False)
        print("Wrote: %s" % manifest_path)

    @staticmethod
    def model_name(allele, num):
        """
        Generate a name of the form ``ALLELE-NUM-SUFFIX`` for a model.

        ALLELE is `allele` upper-cased, NUM is the integer `num`, and
        SUFFIX is the first 16 hex digits of the SHA-1 digest of the
        current time, which makes names from separate calls differ.
        """
        random_string = hashlib.sha1(
            str(time.time()).encode()).hexdigest()[:16]
        return "%s-%d-%s" % (allele.upper(), num, random_string)

    @staticmethod
    def weights_path(models_dir, model_name):
        """
        Return the path of the weights file for `model_name` inside
        `models_dir`.
        """
        return join(
            models_dir,
            "weights_%s.%s" % (
                model_name, Class1NeuralNetwork.weights_filename_extension))

    @staticmethod
    def load(models_dir=None, max_models=None):
        """
        Build a predictor from a directory written by `save`.

        Models are created lazily: only their config and weights path are
        recorded here.

        Parameters
        ----------
        models_dir : string, optional
            Directory containing `manifest.csv` (and optionally
            `pseudosequences.csv`). Defaults to the path returned by
            ``get_path("models_class1", "models")``.
        max_models : int, optional
            Read at most this many rows of the manifest.

        Returns
        -------
        Class1AffinityPredictor
        """
        if models_dir is None:
            models_dir = get_path("models_class1", "models")

        manifest_path = join(models_dir, "manifest.csv")
        manifest_df = pandas.read_csv(manifest_path, nrows=max_models)

        allele_to_allele_specific_models = collections.defaultdict(list)
        class1_pan_allele_models = []
        all_models = []
        for (_, row) in manifest_df.iterrows():
            model = LazyLoadingClass1NeuralNetwork(
                config=json.loads(row.config_json),
                weights_filename=Class1AffinityPredictor.weights_path(
                    models_dir, row.model_name)
            )
            if row.allele == "pan-class1":
                class1_pan_allele_models.append(model)
            else:
                allele_to_allele_specific_models[row.allele].append(model)
            all_models.append(model)

        manifest_df["model"] = all_models

        pseudosequences = None
        pseudosequences_path = join(models_dir, "pseudosequences.csv")
        if exists(pseudosequences_path):
            # Convert the first data column so the result maps
            # allele -> value. DataFrame.to_dict() would instead produce
            # {column: {allele: value}}, which cannot be used to look up
            # alleles.
            # NOTE(review): assumes the file has a single value column
            # next to "allele" -- confirm against the file format.
            pseudosequences = pandas.read_csv(
                pseudosequences_path,
                index_col="allele").iloc[:, 0].to_dict()

        print(
            "Loaded %d class1 pan allele predictors, %d pseudosequences, and "
            "%d allele specific models: %s" % (
                len(class1_pan_allele_models),
                len(pseudosequences) if pseudosequences else 0,
                sum(len(v) for v in allele_to_allele_specific_models.values()),
                ", ".join(
                    "%s (%d)" % (allele, len(v))
                    for (allele, v)
                    in sorted(allele_to_allele_specific_models.items()))))

        result = Class1AffinityPredictor(
            allele_to_allele_specific_models=allele_to_allele_specific_models,
            class1_pan_allele_models=class1_pan_allele_models,
            allele_to_pseudosequence=pseudosequences,
            manifest_df=manifest_df)
        return result

    def _append_manifest_row(self, model_name, allele, model, lazy_model):
        """
        Append one row describing a newly trained model to `manifest_df`.
        """
        row = pandas.Series(collections.OrderedDict([
            ("model_name", model_name),
            ("allele", allele),
            ("config_json", json.dumps(model.get_config())),
            ("model", lazy_model),
        ])).to_frame().T
        self.manifest_df = pandas.concat(
            [self.manifest_df, row], ignore_index=True)

    def fit_allele_specific_predictors(
            self,
            n_models,
            architecture_hyperparameters,
            allele,
            peptides,
            affinities,
            models_dir_for_save=None,
            verbose=1):
        """
        Train `n_models` networks for one allele and add them to the
        predictor.

        Parameters
        ----------
        n_models : int
            Number of networks to train.
        architecture_hyperparameters : dict
            Keyword arguments for the Class1NeuralNetwork constructor.
        allele : string
            Allele name; normalized with mhcnames before use.
        peptides : list of string or EncodableSequences
            Training peptides.
        affinities : list of float
            Training targets, passed through to the network's `fit`.
        models_dir_for_save : string, optional
            If given, each model is saved to this directory as soon as it
            has been trained.
        verbose : int
            Passed through to the network's `fit`.

        Returns
        -------
        list of Class1NeuralNetwork
            The newly trained models.
        """
        allele = mhcnames.normalize_allele_name(allele)
        models = self._fit_predictors(
            n_models=n_models,
            architecture_hyperparameters=architecture_hyperparameters,
            peptides=peptides,
            affinities=affinities,
            allele_pseudosequences=None,
            verbose=verbose)

        if allele not in self.allele_to_allele_specific_models:
            self.allele_to_allele_specific_models[allele] = []

        # `models` is a generator that trains on demand, so the trained
        # models are collected here in order to return them afterwards.
        models_list = []
        for (i, model) in enumerate(models):
            lazy_model = LazyLoadingClass1NeuralNetwork.wrap(model)
            model_name = self.model_name(allele, i)
            models_list.append(model)
            self._append_manifest_row(model_name, allele, model, lazy_model)
            self.allele_to_allele_specific_models[allele].append(lazy_model)
            if models_dir_for_save:
                self.save(
                    models_dir_for_save, model_names_to_write=[model_name])
        return models_list

    def fit_class1_pan_allele_models(
            self,
            n_models,
            architecture_hyperparameters,
            alleles,
            peptides,
            affinities,
            models_dir_for_save=None,
            verbose=1):
        """
        Train `n_models` pan-allele networks and add them to the
        predictor.

        Parameters
        ----------
        n_models : int
            Number of networks to train.
        architecture_hyperparameters : dict
            Keyword arguments for the Class1NeuralNetwork constructor.
        alleles : list of string
            One allele name per peptide; normalized with mhcnames and
            mapped through `allele_to_pseudosequence`.
        peptides : list of string or EncodableSequences
            Training peptides.
        affinities : list of float
            Training targets, passed through to the network's `fit`.
        models_dir_for_save : string, optional
            If given, each model is saved to this directory as soon as it
            has been trained.
        verbose : int
            Passed through to the network's `fit`.

        Returns
        -------
        list of Class1NeuralNetwork
            The newly trained models.
        """
        alleles = pandas.Series(alleles).map(mhcnames.normalize_allele_name)
        allele_pseudosequences = alleles.map(self.allele_to_pseudosequence)

        models = self._fit_predictors(
            n_models=n_models,
            architecture_hyperparameters=architecture_hyperparameters,
            peptides=peptides,
            affinities=affinities,
            allele_pseudosequences=allele_pseudosequences,
            verbose=verbose)

        # `models` is a generator that trains on demand, so the trained
        # models are collected here in order to return them afterwards.
        models_list = []
        for (i, model) in enumerate(models):
            lazy_model = LazyLoadingClass1NeuralNetwork.wrap(model)
            model_name = self.model_name("pan-class1", i)
            models_list.append(model)
            self.class1_pan_allele_models.append(lazy_model)
            self._append_manifest_row(
                model_name, "pan-class1", model, lazy_model)
            if models_dir_for_save:
                self.save(
                    models_dir_for_save, model_names_to_write=[model_name])
        return models_list

    def _fit_predictors(
            self,
            n_models,
            architecture_hyperparameters,
            peptides,
            affinities,
            allele_pseudosequences,
            verbose=1):
        """
        Generator that trains and yields `n_models` networks, one at a
        time. Training of each model happens only when it is requested
        from the generator.
        """
        # Shared across models so the peptides are wrapped only once.
        encodable_peptides = EncodableSequences.create(peptides)
        for i in range(n_models):
            print("Training model %d / %d" % (i + 1, n_models))
            model = Class1NeuralNetwork(**architecture_hyperparameters)
            model.fit(
                encodable_peptides,
                affinities,
                allele_pseudosequences=allele_pseudosequences,
                verbose=verbose)
            yield model

    def predict(self, peptides, alleles=None, allele=None):
        """
        Predict for each peptide and return only the ensemble prediction.

        Takes the same `peptides`, `alleles` and `allele` arguments as
        `predict_to_dataframe`.

        Returns
        -------
        numpy.ndarray
            The `prediction` column of `predict_to_dataframe`.
        """
        df = self.predict_to_dataframe(
            peptides=peptides,
            alleles=alleles,
            allele=allele
        )
        return df.prediction.values

    def predict_to_dataframe(
            self,
            peptides,
            alleles=None,
            allele=None,
            include_individual_model_predictions=False):
        """
        Predict for each peptide and return a table of results.

        Parameters
        ----------
        peptides : list or array of string
        alleles : list or array of string, optional
            One allele per peptide.
        allele : string, optional
            A single allele used for every peptide. May not be combined
            with `alleles`.
        include_individual_model_predictions : boolean
            If True, the per-model columns (`model_pan_<i>`,
            `model_single_<i>`) are kept, placed after the other columns.

        Returns
        -------
        pandas.DataFrame
            Columns `peptide`, `allele`, `prediction` (geometric mean over
            models), `prediction_low` and `prediction_high` (the 5% and
            95% quantiles over models, taken in log space), plus the
            per-model columns when requested.

        Raises
        ------
        TypeError
            If `peptides` or `alleles` is a string.
        ValueError
            If both `allele` and `alleles` are given.
        """
        if isinstance(peptides, string_types):
            raise TypeError("peptides must be a list or array, not a string")
        if isinstance(alleles, string_types):
            raise TypeError("alleles must be a list or array, not a string")
        if allele is not None:
            if alleles is not None:
                raise ValueError("Specify exactly one of allele or alleles")
            alleles = [allele] * len(peptides)

        df = pandas.DataFrame({
            'peptide': peptides,
            'allele': alleles,
        })
        df["normalized_allele"] = df.allele.map(
            mhcnames.normalize_allele_name)

        if self.class1_pan_allele_models:
            allele_pseudosequences = df.normalized_allele.map(
                self.allele_to_pseudosequence)
            encodable_peptides = EncodableSequences.create(
                df.peptide.values)
            for (i, model) in enumerate(self.class1_pan_allele_models):
                df["model_pan_%d" % i] = model.instance.predict(
                    encodable_peptides,
                    allele_pseudosequences=allele_pseudosequences)

        if self.allele_to_allele_specific_models:
            for normalized_allele in df.normalized_allele.unique():
                mask = (df.normalized_allele == normalized_allele).values
                # Boolean-mask selection via .loc (DataFrame.ix no longer
                # exists in current pandas releases).
                allele_peptides = EncodableSequences.create(
                    df.loc[mask].peptide.values)
                models = self.allele_to_allele_specific_models.get(
                    normalized_allele, [])
                for (i, model) in enumerate(models):
                    # Rows for other alleles stay NaN in this column and
                    # are skipped by the mean / quantile below.
                    df.loc[
                        mask, "model_single_%d" % i
                    ] = model.instance.predict(allele_peptides)

        # Geometric mean
        df_predictions = df[
            [c for c in df.columns if c.startswith("model_")]
        ]
        logs = numpy.log(df_predictions)
        log_means = logs.mean(1)
        df["prediction"] = numpy.exp(log_means)
        df["prediction_low"] = numpy.exp(logs.quantile(0.05, axis=1))
        df["prediction_high"] = numpy.exp(logs.quantile(0.95, axis=1))

        del df["normalized_allele"]
        if include_individual_model_predictions:
            # Stable sort on a boolean key: non-model columns first.
            columns = sorted(df.columns, key=lambda c: c.startswith('model_'))
        else:
            columns = [
                c for c in df.columns if c not in df_predictions.columns
            ]
        return df[columns]