# class1_affinity_predictor.py (excerpt)
            df["prediction_low"] = numpy.exp(numpy.percentile(logs, 5.0, axis=1))
            df["prediction_high"] = numpy.exp(numpy.percentile(logs, 95.0, axis=1))

        if include_individual_model_predictions:
            for i in range(num_pan_models):
                df["model_pan_%d" % i] = predictions_array[:, i]

            for i in range(max_single_allele_models):
                df["model_single_%d" % i] = predictions_array[
                    :, num_pan_models + i
                ]
        if include_percentile_ranks:
            if self.allele_to_percent_rank_transform:
                df["prediction_percentile"] = self.percentile_ranks(
                    df.prediction,
                    alleles=df.normalized_allele.values,
                    throw=throw)
            else:
                warnings.warn("No percentile rank information available.")

        del df["supported_peptide_length"]
        del df["normalized_allele"]
    @staticmethod
    def save_weights(weights_list, filename):
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
Tim O'Donnell's avatar
Tim O'Donnell committed
        Save the model weights to the given filename using numpy's ".npz"
        format.
    
Tim O'Donnell's avatar
Tim O'Donnell committed
        Parameters
        ----------
Tim O'Donnell's avatar
Tim O'Donnell committed
        weights_list : list of array
Tim O'Donnell's avatar
Tim O'Donnell committed
        
Tim O'Donnell's avatar
Tim O'Donnell committed
        filename : string
            Should end in ".npz".
    
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
Tim O'Donnell's avatar
Tim O'Donnell committed
        numpy.savez(
            filename,
            **dict((("array_%d" % i), w) for (i, w) in enumerate(weights_list)))
    @staticmethod
    def load_weights(filename):
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
Tim O'Donnell's avatar
Tim O'Donnell committed
        Restore model weights from the given filename, which should have been
        created with `save_weights`.
    
Tim O'Donnell's avatar
Tim O'Donnell committed
        Parameters
        ----------
Tim O'Donnell's avatar
Tim O'Donnell committed
        filename : string
            Should end in ".npz".
Tim O'Donnell's avatar
Tim O'Donnell committed

Tim O'Donnell's avatar
Tim O'Donnell committed
        Returns
Tim O'Donnell's avatar
Tim O'Donnell committed
        ----------
        list of array
Tim O'Donnell's avatar
Tim O'Donnell committed
        """
Tim O'Donnell's avatar
Tim O'Donnell committed
        loaded = numpy.load(filename)
        weights = [
            loaded["array_%d" % i]
            for i in range(len(loaded.keys()))
        ]
        loaded.close()
        return weights

    def calibrate_percentile_ranks(
            self,
            peptides=None,
            num_peptides_per_length=int(1e5),
            alleles=None,
            bins=None):
        """
        Compute the cumulative distribution of ic50 values for a set of alleles
        over a large universe of random peptides, to enable computing quantiles in
        this distribution later.

        The fitted transform for each allele is stored in
        `self.allele_to_percent_rank_transform`, replacing any existing entry
        for that allele.

        Parameters
        ----------
        peptides : sequence of string or EncodableSequences, optional
            Peptides to use
        num_peptides_per_length : int, optional
            If peptides argument is not specified, then num_peptides_per_length
            peptides are randomly sampled from a uniform distribution for each
            supported length
        alleles : sequence of string, optional
            Alleles to perform calibration for. If not specified all supported
            alleles will be calibrated.
        bins : object, optional
            Anything that can be passed to numpy.histogram's "bins" argument
            can be used here, i.e. either an integer or a sequence giving bin
            edges. This is in ic50 space. If not specified,
            to_ic50(numpy.linspace(1, 0, 1000)) is used.

        Returns
        ----------
        EncodableSequences : peptides used for calibration
        """
        if bins is None:
            bins = to_ic50(numpy.linspace(1, 0, 1000))

        if alleles is None:
            alleles = self.supported_alleles

        if peptides is None:
            # No peptides given: sample num_peptides_per_length random
            # peptides for every length in the supported (inclusive) range.
            peptides = []
            lengths = range(
                self.supported_peptide_lengths[0],
                self.supported_peptide_lengths[1] + 1)
            for length in lengths:
                peptides.extend(
                    random_peptides(num_peptides_per_length, length))

        # Encode once and reuse the same object for every allele.
        encoded_peptides = EncodableSequences.create(peptides)

        for allele in alleles:
            predictions = self.predict(encoded_peptides, allele=allele)
            transform = PercentRankTransform()
            transform.fit(predictions, bins=bins)
            self.allele_to_percent_rank_transform[allele] = transform

        return encoded_peptides
    def filter_networks(self, predicate):
        """
        Build a new Class1AffinityPredictor that keeps only those neural
        networks of this predictor for which `predicate` is true.

        Both the allele-specific and the pan-allele networks are filtered.
        An allele whose networks are all rejected stays in the mapping with
        an empty list.

        Parameters
        ----------
        predicate : Class1NeuralNetwork -> boolean
            Called once per network; a true result keeps the network.

        Returns
        -------
        Class1AffinityPredictor
        """
        kept_allele_specific = {
            allele: [network for network in networks if predicate(network)]
            for (allele, networks)
            in self.allele_to_allele_specific_models.items()
        }
        kept_pan_allele = list(
            filter(predicate, self.class1_pan_allele_models))

        return Class1AffinityPredictor(
            allele_to_allele_specific_models=kept_allele_specific,
            class1_pan_allele_models=kept_pan_allele,
            allele_to_fixed_length_sequence=self.allele_to_fixed_length_sequence,
        )

    def model_select(
            self,
            score_function,
            alleles=None,
            min_models=1,
            max_models=10000):
        """
        Perform model selection using a user-specified scoring function.

        Model selection is done using a "step up" variable selection procedure,
        in which models are repeatedly added to an ensemble until the score
        stops improving. Higher scores are treated as better.

        Only allele-specific models are considered. The per-round scores of
        every candidate are kept in the "model_selection" metadata dataframe
        of the returned predictor.

        Parameters
        ----------
        score_function : Class1AffinityPredictor -> float function
            Scoring function

        alleles : list of string, optional
            If not specified, model selection is performed for all alleles.

        min_models : int, optional
            Min models to select per allele

        max_models : int, optional
            Max models to select per allele

        Returns
        -------
        Class1AffinityPredictor : predictor containing the selected models
        """

        if alleles is None:
            alleles = self.supported_alleles

        dfs = []
        allele_to_allele_specific_models = {}
        for allele in alleles:
            df = pandas.DataFrame({
                'model': self.allele_to_allele_specific_models[allele]
            })
            df["model_num"] = df.index
            df["allele"] = allele
            df["selected"] = False

            round_num = 1

            while not df.selected.all() and df.selected.sum() < max_models:
                # NOTE(review): "%2d" pads with a space, so the columns are
                # named e.g. "score_ 1". Left unchanged because these names
                # end up in the returned metadata dataframe.
                score_col = "score_%2d" % round_num
                prev_score_col = "score_%2d" % (round_num - 1)

                # Score each unselected model as an ensemble together with
                # the models picked in earlier rounds.
                existing_selected = list(df[df.selected].model)
                df[score_col] = [
                    numpy.nan if row.selected else
                    score_function(
                        Class1AffinityPredictor(
                            allele_to_allele_specific_models={
                                allele: [row.model] + existing_selected
                            }))
                    for (_, row) in df.iterrows()
                ]

                # Stop once at least min_models are selected and the best
                # score got worse. Round 1 has no previous score column, so
                # the comparison is only made from round 2 on; without the
                # max(..., 1) guard, min_models=0 raised a KeyError here.
                if round_num > max(min_models, 1) and (
                        df[score_col].max() < df[prev_score_col].max()):
                    break

                # In case of a tie, pick a model at random.
                (best_model_index,) = df.loc[
                    (df[score_col] == df[score_col].max())
                ].sample(1).index
                df.loc[best_model_index, "selected"] = True
                round_num += 1

            dfs.append(df)
            print("Selected %d models for allele %s" % (
                df.selected.sum(), allele))
            allele_to_allele_specific_models[allele] = list(
                df.loc[df.selected].model)

        df = pandas.concat(dfs, ignore_index=True)

        new_predictor = Class1AffinityPredictor(
            allele_to_allele_specific_models,
            metadata_dataframes={
                "model_selection": df,
            })
        return new_predictor