class1_affinity_predictor.py

            columns in the result DataFrame.
        include_percentile_ranks : boolean, default True
            If True, a "prediction_percentile" column will be included giving
            the percentile ranks. If no percentile rank info is available,
            this will be ignored with a warning.
        centrality_measure : string or callable
            Measure of central tendency to use to combine predictions in the
            ensemble. Options include: mean, median, robust_mean.
        model_kwargs : dict
            Additional keyword arguments to pass to Class1NeuralNetwork.predict

        Returns
        -------
        `pandas.DataFrame` of predictions
        """
        if isinstance(peptides, string_types):
            raise TypeError("peptides must be a list or array, not a string")
        if isinstance(alleles, string_types):
            raise TypeError("alleles must be a list or array, not a string")
        if allele is None and alleles is None:
            raise ValueError("Must specify 'allele' or 'alleles'.")

        peptides = EncodableSequences.create(peptides)
        df = pandas.DataFrame({
            'peptide': peptides.sequences
        }, copy=False)

        if allele is not None:
            if alleles is not None:
                raise ValueError("Specify exactly one of allele or alleles")
            df["allele"] = allele
            normalized_allele = mhcnames.normalize_allele_name(allele)
            df["normalized_allele"] = normalized_allele
            unique_alleles = [normalized_allele]
        else:
            df["allele"] = numpy.array(alleles)
            df["normalized_allele"] = df.allele.map(
                mhcnames.normalize_allele_name)
            unique_alleles = df.normalized_allele.unique()

        if len(df) == 0:
            # No predictions.
            logging.warning("Predicting for 0 peptides.")
            empty_result = pandas.DataFrame(
                columns=[
                    'peptide',
                    'allele',
                    'prediction',
                    'prediction_low',
                    'prediction_high'
                ])
            return empty_result

        (min_peptide_length, max_peptide_length) = (
            self.supported_peptide_lengths)

        if (peptides.min_length < min_peptide_length or
                peptides.max_length > max_peptide_length):
            # Only compute this if needed
            all_peptide_lengths_supported = False
            sequence_length = df.peptide.str.len()
            df["supported_peptide_length"] = (
                (sequence_length >= min_peptide_length) &
                (sequence_length <= max_peptide_length))
            if (~df.supported_peptide_length).any():
                msg = (
                    "%d peptides have lengths outside of supported range [%d, %d]: "
                    "%s" % (
                        (~df.supported_peptide_length).sum(),
                        min_peptide_length,
                        max_peptide_length,
                        str(df.loc[~df.supported_peptide_length].peptide.unique())))
                logging.warning(msg)
                if throw:
                    raise ValueError(msg)
        else:
            # Handle common case efficiently.
            df["supported_peptide_length"] = True
            all_peptide_lengths_supported = True

        num_pan_models = (
            len(self.class1_pan_allele_models)
            if not self.optimization_info.get("pan_models_merged", False)
            else self.optimization_info["num_pan_models_merged"])
        max_single_allele_models = max(
            len(self.allele_to_allele_specific_models.get(allele, []))
            for allele in unique_alleles
        )
        predictions_array = numpy.zeros(
            shape=(df.shape[0], num_pan_models + max_single_allele_models),
            dtype="float64")
        predictions_array[:] = numpy.nan

        if self.class1_pan_allele_models:
            master_allele_encoding = self.master_allele_encoding
            unsupported_alleles = [
                allele for allele in
                df.normalized_allele.unique()
                if allele not in self.allele_to_sequence
            ]
            if unsupported_alleles:
                truncate_at = 100
                allele_string = " ".join(
                    sorted(self.allele_to_sequence)[:truncate_at])
                if len(self.allele_to_sequence) > truncate_at:
                    allele_string += " + %d more alleles" % (
                        len(self.allele_to_sequence) - truncate_at)
                msg = (
                    "No sequences for allele(s): %s.\n"
                    "Supported alleles: %s" % (
                        " ".join(unsupported_alleles), allele_string))
                logging.warning(msg)
                if throw:
                    raise ValueError(msg)
            mask = df.supported_peptide_length & (
                ~df.normalized_allele.isin(unsupported_alleles))

            row_slice = None
            if mask is None or mask.all():
                row_slice = slice(None, None, None)  # all rows
                masked_allele_encoding = AlleleEncoding(
                    df.normalized_allele,
                    borrow_from=master_allele_encoding)
                masked_peptides = peptides
            elif mask.sum() > 0:
                row_slice = mask
                masked_allele_encoding = AlleleEncoding(
                    df.loc[mask].normalized_allele,
                    borrow_from=master_allele_encoding)
                masked_peptides = peptides.sequences[mask]

            masked_peptides = EncodableSequences.create(masked_peptides)

            if row_slice is not None:
                # The following line is a performance optimization that may be
                # revisited. It causes the neural network to set to include
                # only the alleles actually being predicted for. This makes
                # the network much smaller. However, subsequent calls to
                # predict will need to reset these weights, so there is a
                # tradeoff.
                masked_allele_encoding = masked_allele_encoding.compact()

                if self.optimization_info.get("pan_models_merged"):
                    # Multiple pan-allele models have been merged into one
                    # at the tensorflow level.
                    assert len(self.class1_pan_allele_models) == 1
                    predictions = self.class1_pan_allele_models[0].predict(
                        masked_peptides,
                        allele_encoding=masked_allele_encoding,
                        output_index=None,
                        **model_kwargs)
                    predictions_array[row_slice, :num_pan_models] = predictions
                else:
                    for (i, model) in enumerate(self.class1_pan_allele_models):
                        predictions_array[row_slice, i] = model.predict(
                            masked_peptides,
                            allele_encoding=masked_allele_encoding,
                            **model_kwargs)

        if self.allele_to_allele_specific_models:
            unsupported_alleles = [
                allele for allele in unique_alleles
                if not self.allele_to_allele_specific_models.get(allele)
            ]
            if unsupported_alleles:
                msg = (
                    "No single-allele models for allele(s): %s.\n"
                    "Supported alleles are: %s" % (
                        " ".join(unsupported_alleles),
                        " ".join(sorted(self.allele_to_allele_specific_models))))
                logging.warning(msg)
                if throw:
                    raise ValueError(msg)

            for allele in unique_alleles:
                models = self.allele_to_allele_specific_models.get(allele, [])
                if len(unique_alleles) == 1 and all_peptide_lengths_supported:
                    mask = None
                else:
                    mask = (
                        (df.normalized_allele == allele) &
                        df.supported_peptide_length).values

                row_slice = None
                if mask is None or mask.all():
                    peptides_for_allele = peptides
                    row_slice = slice(None, None, None)
                elif mask.sum() > 0:
                    peptides_for_allele = EncodableSequences.create(
                        df.loc[mask].peptide.values)
                    row_slice = mask

                if row_slice is not None:
                    for (i, model) in enumerate(models):
                        predictions_array[
                            row_slice,
                            num_pan_models + i,
                        ] = model.predict(peptides_for_allele, **model_kwargs)

        if callable(centrality_measure):
            centrality_function = centrality_measure
        else:
            centrality_function = CENTRALITY_MEASURES[centrality_measure]

        logs = numpy.log(predictions_array)
        log_centers = centrality_function(logs)
        df["prediction"] = numpy.exp(log_centers)

        if include_confidence_intervals:
            df["prediction_low"] = numpy.exp(
                numpy.nanpercentile(logs, 5.0, axis=1))
            df["prediction_high"] = numpy.exp(
                numpy.nanpercentile(logs, 95.0, axis=1))

        if include_individual_model_predictions:
            for i in range(num_pan_models):
                df["model_pan_%d" % i] = predictions_array[:, i]

            for i in range(max_single_allele_models):
                df["model_single_%d" % i] = predictions_array[
                    :, num_pan_models + i
                ]

        if include_percentile_ranks:
            if self.allele_to_percent_rank_transform:
                df["prediction_percentile"] = self.percentile_ranks(
                    df.prediction,
                    alleles=df.normalized_allele.values,
                    throw=throw)
            else:
                warnings.warn("No percentile rank information available.")

        del df["supported_peptide_length"]
        del df["normalized_allele"]
        return df

    @staticmethod
    def save_weights(weights_list, filename):
        """
        Save the model weights to the given filename using numpy's ".npz"
        format.
    
        Parameters
        ----------
        weights_list : list of array
        
        filename : string
            Should end in ".npz".
    
        """
        numpy.savez(
            filename,
            **dict((("array_%d" % i), w) for (i, w) in enumerate(weights_list)))

    @staticmethod
    def load_weights(filename):
        """
        Restore model weights from the given filename, which should have been
        created with `save_weights`.
    
        Parameters
        ----------
        filename : string
            Should end in ".npz".

        Returns
        ----------
        list of array
        """
        with numpy.load(filename) as loaded:
            weights = [
                loaded["array_%d" % i]
                for i in range(len(loaded.keys()))
            ]
        return weights

    def calibrate_percentile_ranks(
            self,
            peptides=None,
            num_peptides_per_length=int(1e5),
            alleles=None,
            bins=None,
            motif_summary=False,
            summary_top_peptide_fractions=[0.001],
            verbose=False,
            model_kwargs={}):
        """
        Compute the cumulative distribution of ic50 values for a set of alleles
        over a large universe of random peptides, to enable taking quantiles
        of this distribution later.

        Parameters
        ----------
        peptides : sequence of string or EncodableSequences, optional
            Peptides to use
        num_peptides_per_length : int, optional
            If peptides argument is not specified, then num_peptides_per_length
            peptides are randomly sampled from a uniform distribution for each
            supported length
        alleles : sequence of string, optional
            Alleles to perform calibration for. If not specified all supported
            alleles will be calibrated.
        bins : object
            Anything that can be passed to numpy.histogram's "bins" argument
            can be used here, i.e. either an integer or a sequence giving bin
            edges. This is in ic50 space.
        motif_summary : bool
            If True, the length distribution and per-position amino acid
            frequencies are also calculated for the top x fraction of tightest-
            binding peptides, where each value of x is given in the
            summary_top_peptide_fractions list.
        summary_top_peptide_fractions : list of float
            Only used if motif_summary is True
        verbose : boolean
            Whether to print status updates to stdout
        model_kwargs : dict
            Additional low-level Class1NeuralNetwork.predict() kwargs.

        Returns
        ----------
        dict of string -> pandas.DataFrame

        If motif_summary is True, this will have keys  "frequency_matrices" and
        "length_distributions". Otherwise it will be empty.

        """
        if bins is None:
            bins = to_ic50(numpy.linspace(1, 0, 1000))

        if alleles is None:
            alleles = self.supported_alleles

        if peptides is None:
            peptides = []
            lengths = range(
                self.supported_peptide_lengths[0],
                self.supported_peptide_lengths[1] + 1)
            for length in lengths:
                peptides.extend(
                    random_peptides(num_peptides_per_length, length))

        encoded_peptides = EncodableSequences.create(peptides)

        if motif_summary:
            frequency_matrices = []
            length_distributions = []
        else:
            frequency_matrices = None
            length_distributions = None
        for allele in alleles:
            start = time.time()
            predictions = self.predict(
                encoded_peptides, allele=allele, model_kwargs=model_kwargs)
            if verbose:
                elapsed = time.time() - start
                print(
                    "Generated %d predictions for allele %s in %0.2f sec: "
                    "%0.2f predictions / sec" % (
                        len(encoded_peptides.sequences),
                        allele,
                        elapsed,
                        len(encoded_peptides.sequences) / elapsed))
            transform = PercentRankTransform()
            transform.fit(predictions, bins=bins)
            self.allele_to_percent_rank_transform[allele] = transform

            if frequency_matrices is not None:
                predictions_df = pandas.DataFrame({
                    'peptide': encoded_peptides.sequences,
                    'prediction': predictions
                }).drop_duplicates('peptide').set_index("peptide")
                predictions_df["length"] = predictions_df.index.str.len()
                for (length, sub_df) in predictions_df.groupby("length"):
                    for cutoff_fraction in summary_top_peptide_fractions:
                        selected = sub_df.prediction.nsmallest(
                            max(
                                int(len(sub_df) * cutoff_fraction),
                                1)).index.values
                        matrix = positional_frequency_matrix(selected).reset_index()
                        original_columns = list(matrix.columns)
                        matrix["allele"] = allele
                        matrix["length"] = length
                        matrix["cutoff_fraction"] = cutoff_fraction
                        matrix["cutoff_count"] = len(selected)
                        matrix = matrix[
                            ["allele", "length", "cutoff_fraction", "cutoff_count"]
                            + original_columns
                        ]
                        frequency_matrices.append(matrix)

                # Length distribution
                for cutoff_fraction in summary_top_peptide_fractions:
                    cutoff_count = max(
                        int(len(predictions_df) * cutoff_fraction), 1)
                    length_distribution = predictions_df.prediction.nsmallest(
                        cutoff_count).index.str.len().value_counts()
                    length_distribution.index.name = "length"
                    length_distribution /= length_distribution.sum()
                    length_distribution = length_distribution.to_frame()
                    length_distribution.columns = ["fraction"]
                    length_distribution = length_distribution.reset_index()
                    length_distribution["allele"] = allele
                    length_distribution["cutoff_fraction"] = cutoff_fraction
                    length_distribution["cutoff_count"] = cutoff_count
                    length_distribution = length_distribution[[
                        "allele",
                        "cutoff_fraction",
                        "cutoff_count",
                        "length",
                        "fraction"
                    ]].sort_values(["cutoff_fraction", "length"])
                    length_distributions.append(length_distribution)

        if frequency_matrices is not None:
            frequency_matrices = pandas.concat(
                frequency_matrices, ignore_index=True)

        if length_distributions is not None:
            length_distributions = pandas.concat(
                length_distributions, ignore_index=True)

        if motif_summary:
            return {
                'frequency_matrices': frequency_matrices,
                'length_distributions': length_distributions,
            }
        return {}

    def model_select(
            self,
            score_function,
            alleles=None,
            min_models=1,
            max_models=10000):
        """
        Perform model selection using a user-specified scoring function.

        This works only with allele-specific models, not pan-allele models.

        Model selection is done using a "step up" variable selection procedure,
        in which models are repeatedly added to an ensemble until the score
        stops improving.

        Parameters
        ----------
        score_function : Class1AffinityPredictor -> float function
            Scoring function

        alleles : list of string, optional
            If not specified, model selection is performed for all alleles.

        min_models : int, optional
            Min models to select per allele

        max_models : int, optional
            Max models to select per allele

        Returns
        -------
        Class1AffinityPredictor : predictor containing the selected models
        """

        if alleles is None:
            alleles = self.supported_alleles

        dfs = []
        allele_to_allele_specific_models = {}
        for allele in alleles:
            df = pandas.DataFrame({
                'model': self.allele_to_allele_specific_models[allele]
            })
            df["model_num"] = df.index
            df["allele"] = allele
            df["selected"] = False

            round_num = 1

            while not df.selected.all() and sum(df.selected) < max_models:
                score_col = "score_%2d" % round_num
                prev_score_col = "score_%2d" % (round_num - 1)

                existing_selected = list(df[df.selected].model)
                df[score_col] = [
                    numpy.nan if row.selected else
                    score_function(
                        Class1AffinityPredictor(
                            allele_to_allele_specific_models={
                                allele: [row.model] + existing_selected
                            }
                        )
                    )
                    for (_, row) in df.iterrows()
                ]

                if round_num > min_models and (
                        df[score_col].max() < df[prev_score_col].max()):
                    break

                # In case of a tie, pick a model at random.
                (best_model_index,) = df.loc[
                    (df[score_col] == df[score_col].max())
                ].sample(1).index
                df.loc[best_model_index, "selected"] = True
                round_num += 1

            dfs.append(df)
            allele_to_allele_specific_models[allele] = list(
                df.loc[df.selected].model)

        df = pandas.concat(dfs, ignore_index=True)

        new_predictor = Class1AffinityPredictor(
            allele_to_allele_specific_models,
            metadata_dataframes={
                "model_selection": df,
            })
        return new_predictor