                    'prediction_low',
                    'prediction_high'
                ])
            return empty_result

        (min_peptide_length, max_peptide_length) = (
            self.supported_peptide_lengths)

        if (peptides.min_length < min_peptide_length or
                peptides.max_length > max_peptide_length):
            # Only compute this if needed
            all_peptide_lengths_supported = False
            sequence_length = df.peptide.str.len()
            df["supported_peptide_length"] = (
                (sequence_length >= min_peptide_length) &
                (sequence_length <= max_peptide_length))
            if (~df.supported_peptide_length).any():
                msg = (
                    "%d peptides have lengths outside of supported range [%d, %d]: "
                    "%s" % (
                        (~df.supported_peptide_length).sum(),
                        min_peptide_length,
                        max_peptide_length,
                        str(df.loc[~df.supported_peptide_length].peptide.unique())))
                logging.warning(msg)
                if throw:
                    raise ValueError(msg)
        else:
            # Handle common case efficiently.
            df["supported_peptide_length"] = True
            all_peptide_lengths_supported = True
        num_pan_models = (
            len(self.class1_pan_allele_models)
            if not self.optimization_info.get("pan_models_merged", False)
            else self.optimization_info["num_pan_models_merged"])
        max_single_allele_models = max(
            len(self.allele_to_allele_specific_models.get(allele, []))
            for allele in unique_alleles
        )
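        # One row per peptide/allele pair and one column per model: the
        # pan-allele model columns come first, then the allele-specific model
        # columns. Entries that don't apply to a given row remain NaN.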
        predictions_array = numpy.zeros(
            shape=(df.shape[0], num_pan_models + max_single_allele_models),
            dtype="float64")
        predictions_array[:] = numpy.nan

        if self.class1_pan_allele_models:
            master_allele_encoding = self.get_master_allele_encoding()
            unsupported_alleles = [
                allele for allele in
                df.normalized_allele.unique()
                if allele not in self.allele_to_sequence
            ]
            if unsupported_alleles:
                msg = (
                    "No sequences for allele(s): %s.\n"
                    "Supported alleles: %s" % (
                        " ".join(unsupported_alleles),
                        " ".join(sorted(self.allele_to_sequence))))
                logging.warning(msg)
                if throw:
                    raise ValueError(msg)
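            # Restrict pan-allele prediction to rows whose peptide length is
            # supported and whose allele has a known sequence.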
            mask = df.supported_peptide_length & (
                ~df.normalized_allele.isin(unsupported_alleles))

            row_slice = None
            if mask is None or mask.all():
                row_slice = slice(None, None, None)  # all rows
                masked_allele_encoding = AlleleEncoding(
                    df.normalized_allele,
                    borrow_from=master_allele_encoding)
                masked_peptides = peptides
            elif mask.sum() > 0:
                row_slice = mask
                masked_allele_encoding = AlleleEncoding(
                    df.loc[mask].normalized_allele,
                    borrow_from=master_allele_encoding)
                masked_peptides = peptides.sequences[mask]

            if row_slice is not None:
                # The following line is a performance optimization that may be
                # revisited. It restricts the neural network's allele set to
                # only the alleles actually being predicted for, which makes
                # the network much smaller. However, subsequent calls to
                # predict will need to reset these weights, so there is a
                # tradeoff.
                masked_allele_encoding = masked_allele_encoding.compact()

                if self.optimization_info.get("pan_models_merged"):
                    # Multiple pan-allele models have been merged into one
                    # at the tensorflow level.
                    assert len(self.class1_pan_allele_models) == 1
                    predictions = self.class1_pan_allele_models[0].predict(
                        masked_peptides,
                        allele_encoding=masked_allele_encoding,
                        output_index=None,
                        **model_kwargs)
                    predictions_array[row_slice, :num_pan_models] = predictions
                else:
                    for (i, model) in enumerate(self.class1_pan_allele_models):
                        predictions_array[row_slice, i] = model.predict(
                            masked_peptides,
                            allele_encoding=masked_allele_encoding,
                            **model_kwargs)

        if self.allele_to_allele_specific_models:
            unsupported_alleles = [
                allele for allele in unique_alleles
                if not self.allele_to_allele_specific_models.get(allele)
            ]
            if unsupported_alleles:
                msg = (
                    "No single-allele models for allele(s): %s.\n"
                    "Supported alleles are: %s" % (
                        " ".join(unsupported_alleles),
                        " ".join(sorted(self.allele_to_allele_specific_models))))
                logging.warning(msg)
                if throw:
                    raise ValueError(msg)

            for allele in unique_alleles:
                models = self.allele_to_allele_specific_models.get(allele, [])
                if len(unique_alleles) == 1 and all_peptide_lengths_supported:
                    mask = None
                else:
                    mask = (
                        (df.normalized_allele == allele) &
                        df.supported_peptide_length).values
                if mask is None or mask.all():
                    # Common case optimization
                    for (i, model) in enumerate(models):
                        predictions_array[:, num_pan_models + i] = (
                            model.predict(peptides, **model_kwargs))
                elif mask.sum() > 0:
                    peptides_for_allele = EncodableSequences.create(
                        df.loc[mask].peptide.values)
                    for (i, model) in enumerate(models):
                        predictions_array[
                            mask,
                            num_pan_models + i,
                        ] = model.predict(peptides_for_allele, **model_kwargs)

        if callable(centrality_measure):
            centrality_function = centrality_measure
        else:
            centrality_function = CENTRALITY_MEASURES[centrality_measure]

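        # Combine the per-model predictions in log space; with a mean-like
        # centrality measure this amounts to a geometric mean of the
        # individual ic50 predictions.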
        logs = numpy.log(predictions_array)
        log_centers = centrality_function(logs)
        df["prediction"] = numpy.exp(log_centers)
        if include_confidence_intervals:
            df["prediction_low"] = numpy.exp(numpy.nanpercentile(logs, 5.0, axis=1))
            df["prediction_high"] = numpy.exp(numpy.nanpercentile(logs, 95.0, axis=1))
        if include_individual_model_predictions:
            for i in range(num_pan_models):
                df["model_pan_%d" % i] = predictions_array[:, i]

            for i in range(max_single_allele_models):
                df["model_single_%d" % i] = predictions_array[
                    :, num_pan_models + i
                ]
        if include_percentile_ranks:
            if self.allele_to_percent_rank_transform:
                df["prediction_percentile"] = self.percentile_ranks(
                    df.prediction,
                    alleles=df.normalized_allele.values,
                    throw=throw)
            else:
                warnings.warn("No percentile rank information available.")

        del df["supported_peptide_length"]
        del df["normalized_allele"]
        return df

    @staticmethod
    def save_weights(weights_list, filename):
        """
        Save the model weights to the given filename using numpy's ".npz"
        format.

        Parameters
        ----------
        weights_list : list of array

        filename : string
            Should end in ".npz".
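
        Examples
        --------
        A minimal round-trip sketch (the weights list here is illustrative):

        >>> weights = [numpy.zeros((2, 3)), numpy.ones(5)]
        >>> Class1AffinityPredictor.save_weights(weights, "/tmp/weights.npz")
        >>> restored = Class1AffinityPredictor.load_weights("/tmp/weights.npz")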
        """
        numpy.savez(
            filename,
            **dict((("array_%d" % i), w) for (i, w) in enumerate(weights_list)))

    @staticmethod
    def load_weights(filename):
        """
        Restore model weights from the given filename, which should have been
        created with `save_weights`.

        Parameters
        ----------
        filename : string
            Should end in ".npz".

        Returns
        -------
        list of array
        """
        with numpy.load(filename) as loaded:
            weights = [
                loaded["array_%d" % i]
                for i in range(len(loaded.keys()))
            ]
        return weights

    def calibrate_percentile_ranks(
            self,
            peptides=None,
            num_peptides_per_length=int(1e5),
            alleles=None,
            bins=None,
            motif_summary=False,
            summary_top_peptide_fractions=[0.001],
            verbose=False,
            model_kwargs={}):
        """
        Compute the cumulative distribution of ic50 values for a set of alleles
        over a large universe of random peptides, to enable computing quantiles in
        this distribution later.

        Parameters
        ----------
        peptides : sequence of string or EncodableSequences, optional
            Peptides to use
        num_peptides_per_length : int, optional
            If peptides argument is not specified, then num_peptides_per_length
            peptides are randomly sampled from a uniform distribution for each
            supported length
        alleles : sequence of string, optional
            Alleles to perform calibration for. If not specified all supported
            alleles will be calibrated.
        bins : object
            Anything that can be passed to numpy.histogram's "bins" argument
            can be used here, i.e. either an integer or a sequence giving bin
            edges. This is in ic50 space.
        motif_summary : bool
            If True, also compute position frequency matrices and length
            distributions for the top-scoring peptides.
        summary_top_peptide_fractions : list of float
            Only used if motif_summary is True.
        verbose : boolean
            Whether to print status updates to stdout.
        model_kwargs : dict
            Additional keyword arguments to pass to the predict call.

        Returns
        -------
        dict of string -> pandas.DataFrame, or None
            If motif_summary is True, a dict with keys "frequency_matrices"
            and "length_distributions"; otherwise None.
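
        Examples
        --------
        A minimal sketch; the allele names are illustrative:

        >>> predictor.calibrate_percentile_ranks(
        ...     alleles=["HLA-A*02:01", "HLA-B*57:01"],
        ...     num_peptides_per_length=10000)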
        """
        if bins is None:
            bins = to_ic50(numpy.linspace(1, 0, 1000))

        if alleles is None:
            alleles = self.supported_alleles

        if peptides is None:
            peptides = []
            lengths = range(
                self.supported_peptide_lengths[0],
                self.supported_peptide_lengths[1] + 1)
            for length in lengths:
                peptides.extend(
                    random_peptides(num_peptides_per_length, length))

        encoded_peptides = EncodableSequences.create(peptides)

        if motif_summary:
            frequency_matrices = []
            length_distributions = []
        else:
            frequency_matrices = None
            length_distributions = None
        for (i, allele) in enumerate(alleles):
            start = time.time()
            predictions = self.predict(
                encoded_peptides, allele=allele, model_kwargs=model_kwargs)
            if verbose:
                elapsed = time.time() - start
                print(
                    "Generated %d predictions for allele %s in %0.2f sec: "
                    "%0.2f predictions / sec" % (
                        len(encoded_peptides.sequences),
                        allele,
                        elapsed,
                        len(encoded_peptides.sequences) / elapsed))
            transform = PercentRankTransform()
            transform.fit(predictions, bins=bins)
            self.allele_to_percent_rank_transform[allele] = transform
            if frequency_matrices is not None:
                predictions_df = pandas.DataFrame({
                    'peptide': encoded_peptides.sequences,
                    'prediction': predictions
                }).drop_duplicates('peptide').set_index("peptide")
                predictions_df["length"] = predictions_df.index.str.len()
                for (length, sub_df) in predictions_df.groupby("length"):
                    for cutoff_fraction in summary_top_peptide_fractions:
                        selected = sub_df.prediction.nsmallest(
                            max(
                                int(len(sub_df) * cutoff_fraction),
                                1)).index.values
                        matrix = positional_frequency_matrix(selected).reset_index()
                        original_columns = list(matrix.columns)
                        matrix["allele"] = allele
                        matrix["length"] = length
                        matrix["cutoff_fraction"] = cutoff_fraction
                        matrix["cutoff_count"] = len(selected)
                        matrix = matrix[
                            ["allele", "length", "cutoff_fraction", "cutoff_count"]
                            + original_columns
                        ]
                        frequency_matrices.append(matrix)

                # Length distribution
                for cutoff_fraction in summary_top_peptide_fractions:
                    cutoff_count = max(
                        int(len(predictions_df) * cutoff_fraction), 1)
                    length_distribution = predictions_df.prediction.nsmallest(
                        cutoff_count).index.str.len().value_counts()
                    length_distribution.index.name = "length"
                    length_distribution /= length_distribution.sum()
                    length_distribution = length_distribution.to_frame()
                    length_distribution.columns = ["fraction"]
                    length_distribution = length_distribution.reset_index()
                    length_distribution["allele"] = allele
                    length_distribution["cutoff_fraction"] = cutoff_fraction
                    length_distribution["cutoff_count"] = cutoff_count
                    length_distribution = length_distribution[[
                        "allele",
                        "cutoff_fraction",
                        "cutoff_count",
                        "length",
                        "fraction"
                    ]].sort_values(["cutoff_fraction", "length"])
                    length_distributions.append(length_distribution)

        if frequency_matrices is not None:
            frequency_matrices = pandas.concat(
                frequency_matrices, ignore_index=True)

        if length_distributions is not None:
            length_distributions = pandas.concat(
                length_distributions, ignore_index=True)

        if motif_summary:
            return {
                'frequency_matrices': frequency_matrices,
                'length_distributions': length_distributions,
            }

    def filter_networks(self, predicate):
        """
        Return a new Class1AffinityPredictor containing a subset of this
        predictor's neural networks.

        Parameters
        ----------
        predicate : Class1NeuralNetwork -> boolean
            Function specifying which neural networks to include

        Returns
        -------
        Class1AffinityPredictor
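
        Examples
        --------
        A minimal sketch: any predicate over Class1NeuralNetwork instances
        works; here the identity filter keeps every network.

        >>> subset = predictor.filter_networks(lambda network: True)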
        """
        allele_to_allele_specific_models = {}
        for (allele, models) in self.allele_to_allele_specific_models.items():
            allele_to_allele_specific_models[allele] = [
                m for m in models if predicate(m)
            ]
        class1_pan_allele_models = [
            m for m in self.class1_pan_allele_models if predicate(m)
        ]

        return Class1AffinityPredictor(
            allele_to_allele_specific_models=allele_to_allele_specific_models,
            class1_pan_allele_models=class1_pan_allele_models,
            allele_to_sequence=self.allele_to_sequence,
        )

    def model_select(
            self,
            score_function,
            alleles=None,
            min_models=1,
            max_models=10000):
        """
        Perform model selection using a user-specified scoring function.

        Model selection is done using a "step up" variable selection procedure,
        in which models are repeatedly added to an ensemble until the score
        stops improving.

        Parameters
        ----------
        score_function : Class1AffinityPredictor -> float function
            Scoring function

        alleles : list of string, optional
            If not specified, model selection is performed for all alleles.

        min_models : int, optional
            Min models to select per allele

        max_models : int, optional
            Max models to select per allele

        Returns
        -------
        Class1AffinityPredictor : predictor containing the selected models
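
        Examples
        --------
        A sketch with a hypothetical scoring function; ``held_out`` is
        assumed to be a DataFrame with peptide, allele, and affinity columns:

        >>> def score(predictor):
        ...     predicted = predictor.predict(
        ...         held_out.peptide.values, alleles=held_out.allele.values)
        ...     return -numpy.mean(numpy.abs(predicted - held_out.affinity))
        >>> selected = predictor.model_select(score, alleles=["HLA-A*02:01"])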
        """

        if alleles is None:
            alleles = self.supported_alleles

        dfs = []
        allele_to_allele_specific_models = {}
        for allele in alleles:
            df = pandas.DataFrame({
                'model': self.allele_to_allele_specific_models[allele]
            })
            df["model_num"] = df.index
            df["allele"] = allele
            df["selected"] = False

            round_num = 1

            while not df.selected.all() and sum(df.selected) < max_models:
                score_col = "score_%2d" % round_num
                prev_score_col = "score_%2d" % (round_num - 1)

                existing_selected = list(df[df.selected].model)
                # Score each not-yet-selected model as a candidate addition to
                # the current ensemble.
                df[score_col] = [
                    numpy.nan if row.selected else
                    score_function(
                        Class1AffinityPredictor(
                            allele_to_allele_specific_models={
                                allele: [row.model] + existing_selected
                            }))
                    for (_, row) in df.iterrows()
                ]

                if round_num > min_models and (
                        df[score_col].max() < df[prev_score_col].max()):
                    break

                # In case of a tie, pick a model at random.
                (best_model_index,) = df.loc[
                    (df[score_col] == df[score_col].max())
                ].sample(1).index
                df.loc[best_model_index, "selected"] = True
                round_num += 1

            dfs.append(df)
            allele_to_allele_specific_models[allele] = list(
                df.loc[df.selected].model)

        df = pandas.concat(dfs, ignore_index=True)

        new_predictor = Class1AffinityPredictor(
            allele_to_allele_specific_models,
            metadata_dataframes={
                "model_selection": df,
            })
        return new_predictor