Module statkit.naive_bayes

Naive Bayes classifier with support for feature specific distributions.

See statkit.distributions for a list of supported distributions.

Expand source code
"""
Naive Bayes classifier with support for feature specific distributions.

See `statkit.distributions` for a list of supported distributions.
"""
from typing import Union

from pomegranate.bayes import BayesModel
from pomegranate.distributions import (
    Distribution,
    IndependentComponentsDistribution,
)
from pandas import DataFrame, Series
from numpy import fromiter, isin, ndarray, testing, unique, vectorize, zeros
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.multiclass import unique_labels
from sklearn.utils.validation import check_is_fitted


class _BaseNaiveBayes(ClassifierMixin, BaseEstimator):
    """Pomegranate NaiveBayes extension with distribution kwargs and pseudo counts.

    For every class, one `IndependentComponentsDistribution` is assembled from
    the per-feature distributions. These are combined in a pomegranate
    `BayesModel`, which is subsequently fitted on the training data.
    """

    def __init__(
        self,
        distributions: list,
        pseudo_count: Union[float, dict[Distribution, float]] = 1.0,
        distribution_kwargs: dict[dict] = {},
    ):
        """
        Args:
            distributions: One distribution per feature (column of `X`), or a
                single callable that is used for all features.
            pseudo_count: Pseudo count for all distributions (when a number) or
                distribution specific (by key) pseudo count.
            distribution_kwargs: Initialisation arguments, per distribution (=key), and
                per class (key of value). The innermost value is merged into the
                keyword arguments with which the distribution is constructed, so
                it must be a mapping from argument name to value.
                Example:
                    {Gaussian: {0: {"<argument>": value}, 1: {"<argument>": value}}}
        """
        # NOTE: The `{}` default is shared between instances; this class only
        # reads from it and never mutates it.
        self.distributions = distributions
        self.distribution_kwargs = distribution_kwargs
        self.pseudo_count = pseudo_count

    def _component_kwargs(self, distribution, label) -> dict:
        """Collect the constructor keyword arguments of one feature distribution.

        Args:
            distribution: Distribution (key in `pseudo_count` and
                `distribution_kwargs`) that is about to be constructed.
            label: Class for which the distribution is constructed.

        Raises:
            TypeError: When `pseudo_count` is neither a number nor a `dict`.
        """
        from numbers import Real

        if isinstance(self.pseudo_count, dict):
            # Use distribution specific pseudo count.
            kwargs = {"pseudo_count": self.pseudo_count[distribution]}
        elif isinstance(self.pseudo_count, Real):
            kwargs = {"pseudo_count": self.pseudo_count}
        else:
            # Previously this case left `kwargs` undefined.
            raise TypeError(
                "`pseudo_count` must be a number or a dict, got "
                f"{type(self.pseudo_count).__name__}."
            )

        per_class = (self.distribution_kwargs or {}).get(distribution, {})
        kwargs.update(per_class.get(label, {}))
        return kwargs

    def fit(self, X, y, weights=None):
        """Initialise a BayesModel and fit it on `X` and `y`.

        Args:
            X: NumPy matrix with one column per feature.
            y: Class label per row of `X`.
            weights: Passed on as `weights` to `BayesModel.fit`.

        Returns:
            The fitted estimator (`self`).

        Raises:
            ValueError: When `y` contains a single class other than 0 or 1, or
                when `X` is not a matrix with at least one column.
        """
        self.classes_ = unique_labels(y)

        if len(self.classes_) == 1:
            # A single binary label is tolerated: both classes are modelled.
            if set(self.classes_).issubset([0, 1]):
                self.classes_ = [0, 1]
            else:
                raise ValueError(
                    f"Training data contains single class {self.classes_}."
                )

        # Catch matrices with zero features to prevent segmentation faults.
        if len(X.shape) < 2 or X.shape[1] < 1:
            raise ValueError(
                f"Dimension mismatch: expected matrix, got `X` with shape {X.shape}!"
            )

        assert not isinstance(X, DataFrame), "A numpy matrix is required."

        self.distributions_ = self.distributions
        if callable(self.distributions):
            # A single distribution is shared by all features.
            self.distributions_ = [self.distributions] * X.shape[1]

        # For each class, generate a `IndependentComponentsDistribution`.
        distributions = []
        for c in self.classes_:
            # Each component is initialised using the class specific keyword
            # arguments.
            components = [
                p_i(**self._component_kwargs(p_i, c)) for p_i in self.distributions_
            ]

            icd = IndependentComponentsDistribution(components)
            # IMPORTANT: By default, cython=1 in
            # `IndependentComponentsDistribution` causing it to call
            # `_summarize` from its children instead of `summarize`. Since
            # the former is not implemented, toggle cython to 0 so that
            # `IndependentComponentsDistribution` falls back on `summarize`
            # (which we did implement for, e.g., ZeroInflatedGaussian).
            icd.cython = 0

            distributions.append(icd)

        self.model_ = BayesModel(distributions)
        self.model_.fit(X, y, weights=weights)
        return self

    def predict(self, X):
        """Predict the class of each row of `X` with the fitted model."""
        check_is_fitted(self, "model_")
        return self.model_.predict(X)

    def predict_proba(self, X):
        """Estimate the class probabilities of each row of `X` with the fitted model."""
        check_is_fitted(self, "model_")
        return self.model_.predict_proba(X)


class NaiveBayesClassifier(_BaseNaiveBayes):
    r"""Naive Bayes classifier that supports feature specific distributions.

    $$
    p(y,\vec{x}) =  p(y) \prod_{i=1}^n p(x_i|y).
    $$

    Class labels are encoded as consecutive integers before they are handed to
    the underlying model and decoded again in `predict`. `DataFrame` inputs are
    converted to NumPy arrays and their columns are checked against the columns
    seen during `fit`.
    """

    def _clean(self, X, y=None):
        """Turn `X` into a NumPy array and, when given, encode the labels `y`.

        Returns:
            `X` as array when `y` is `None`, otherwise the tuple of the array
            and the labels mapped onto their integer index.
        """
        features = X.to_numpy() if isinstance(X, DataFrame) else X
        if y is None:
            return features
        return features, self.map_label_(y)

    def __init__(
        self,
        distributions: Union[list, dict, Distribution],
        pseudo_count: Union[float, dict[Distribution, float]] = 1.0,
        distribution_kwargs: dict[dict] = {},
    ):
        """
        Args:
            distributions: Distribution to use across all features, or use a `dict` to
                specify a distribution (=value) per feature (=key).
            pseudo_count:  Pseudo count for all distributions (when float) or per
                distribution (indicated by key in a `dict`).
            distribution_kwargs: Pass specific keyword arguments (=value) for each
                distribution type (=key).

        Example:
            ```
            from numpy import exp
            from pandas import DataFrame
            from sklearn.datasets import make_blobs
            from statkit.distributions import Gaussian, LogNormal
            from statkit.naive_bayes import NaiveBayesClassifier

            X, y = make_blobs(n_features=2, centers=2)
            X = DataFrame({'a': X[:, 0], 'b': exp(X[:, 1])})
            model = NaiveBayesClassifier(distributions={'a': Gaussian, 'b': LogNormal})
            model.fit(X, y)
            ```
        """
        super().__init__(distributions, pseudo_count, distribution_kwargs)

    def _check_schema(self, X):
        """Check that the columns of `X` match the columns seen during `fit`.

        Only `DataFrame` inputs are checked.

        Raises:
            KeyError: On duplicate columns, or when the columns (or their
                order) differ from the training data.
        """
        if not isinstance(X, DataFrame):
            return

        if len(unique(X.columns)) != len(X.columns):
            raise KeyError("Duplicate columns!")

        # Python guarantees order of dicts.
        if list(X.columns) != list(self.column_map_.keys()):
            raise KeyError("Schema skew!")

    def _check_smooth_distributions(self):
        """Verify that all discrete distributions are non-zero."""
        for class_index in range(len(self.classes_)):
            components = self.model_.distributions[class_index].distributions
            for component_index in range(len(components)):
                parameters = components[component_index].parameters
                # When the first element is a dict, it is probably an inflated
                # distribution.
                if not isinstance(parameters[0], dict):
                    continue

                inflated_probabilities = fromiter(parameters[0].values(), dtype=float)
                testing.assert_almost_equal(
                    inflated_probabilities.sum(),
                    1,
                    err_msg=(
                        f"Probabilties of inflated points ({inflated_probabilities})"
                        " are not normalised."
                    ),
                )
                assert all(
                    inflated_probabilities != 0
                ), "Some inflated points have zero probability."

    def fit(self, X, y, weights=None):
        """Estimate feature distributions (per class) and class distribution.

        Args:
            X: `DataFrame` or NumPy matrix with one column per feature.
            y: Class label per row of `X`.
            weights: Passed on to the `fit` method of the parent class.

        Returns:
            The fitted estimator (`self`).
        """
        # Store the classes seen during fit
        labels = unique_labels(y)
        self.classes_ = labels
        self.class_map_ = {k: i for i, k in enumerate(labels)}
        self.class_map_inverse_ = {i: k for i, k in enumerate(labels)}
        self.map_label_ = vectorize(lambda x: self.class_map_[x])
        self.map_label_inverse_ = vectorize(lambda x: self.class_map_inverse_[x])

        # Make a map of the columns.
        if isinstance(X, DataFrame):
            assert len(unique(X.columns)) == len(X.columns), "Duplicate columns!"
            feature_keys = list(X.columns)
        else:
            # Identity map in case of NumPy matrix.
            feature_keys = list(range(X.shape[1]))
        self.column_map_ = {key: i for i, key in enumerate(feature_keys)}

        X_np, y_np = self._clean(X, y)

        # The parent `fit` reads `self.distributions`, so a `dict` is resolved
        # to a list (in column order) for the duration of that call only.
        # Restoring it afterwards keeps the constructor parameter intact, so
        # that the estimator can be refitted on data with another column order.
        configured = self.distributions
        if isinstance(configured, dict):
            self.distributions = [configured[key] for key in feature_keys]
        try:
            super().fit(X_np, y_np, weights)
        finally:
            self.distributions = configured

        # The parent `fit` overwrites `classes_` with the encoded labels;
        # restore the original labels. When the parent expanded a single class
        # to two classes the lengths differ, and its value is kept.
        if len(self.classes_) == len(labels):
            self.classes_ = labels

        self._check_smooth_distributions()

        self.is_fitted_ = True
        return self

    def inspect_distribution(self, column, y=None):
        """Inspect sufficient statistics of given variable.

        Args:
            column: Feature to inspect (a key of the column map made in `fit`).
            y: Class label to inspect; all classes seen during `fit` when `None`.

        Returns:
            The fitted distribution of `column` for class `y`, or, when `y` is
            `None`, a `dict` with the distribution per class label.
        """
        check_is_fitted(self)

        variable_index = self.column_map_[column]
        # Use the keys of the label map (the original labels) instead of
        # `classes_`, so that every label can be looked up in `class_map_`.
        labels = list(self.class_map_) if y is None else [y]

        distributions = {}
        for label in labels:
            class_index = self.class_map_[label]
            per_class = self.model_.distributions[class_index]
            distributions[label] = per_class.distributions[variable_index]

        if y is not None:
            return distributions[y]
        return distributions

    def predict(self, X):
        """Predict labels for samples.

        Returns:
            The decoded labels; a `Series` (indexed like `X`) when `X` is a
            `DataFrame`.
        """
        # Check is fit had been called
        check_is_fitted(self)
        self._check_schema(X)
        encoded = self.model_.predict(self._clean(X))
        labels = self.map_label_inverse_(encoded)
        if isinstance(X, DataFrame):
            return Series(labels, index=X.index, name=X.index.name)
        return labels

    def feature_importance(self, X):
        r"""Compute feature importance for given samples.

        For sample \( i \) with features \( x^{(i)}_1, \dots, x^{(i)}_n \),
        compute importance vector (of size \( n + 1 \) ):
        $$
        \begin{pmatrix}
            \ln p(x^{(i)}_1|y=1) - \ln p(x^{(i)}_1|y=0), \\
             \dots \\
             \ln p(x^{(i)}_n|y=1) - \ln p(x^{(i)}_n|y=0), \\
             \ln p(y=1) - \ln p(y=0)
        \end{pmatrix}
        $$

        Returns:
            One importance vector per row of `X`; a `DataFrame` (with an extra
            "label" column for the class term) when `X` is a `DataFrame`.
        """
        check_is_fitted(self)
        self._check_schema(X)
        X_np = self._clean(X)

        m_rows, n_features = X_np.shape
        n_classes = len(self.classes_)

        assert n_classes == 2, "Importance only defined for binary classes."

        # Log probability per class, per feature (i.e., ln p(x_i|c)) and the
        # class itself ln p(c).
        logp_yx = zeros(shape=[m_rows, n_classes, n_features + 1])

        for y_index in range(n_classes):
            components = self.model_.distributions[y_index].distributions
            for feature_index in range(n_features):
                logp_yx[:, y_index, feature_index] = components[
                    feature_index
                ].log_probability(X_np[:, feature_index])
            # Last element is reserved for class itself: ln[p(c)].
            logp_yx[:, y_index, -1] = self.model_.weights[y_index]

        importance = logp_yx[:, 1] - logp_yx[:, 0]

        if not isinstance(X, DataFrame):
            return importance
        return DataFrame(
            importance, index=X.index, columns=list(X.columns) + ["label"]
        )

    def predict_proba(self, X):
        r"""Estimate probability per sample \( p(y^{(i)}=c|\vec{x}^{(i)}) \), per class \( c \).

        Returns:
            Matrix with one column per class; a `DataFrame` with the class
            labels as columns when `X` is a `DataFrame`.
        """
        # Check is fit had been called
        check_is_fitted(self)
        self._check_schema(X)
        y_pred = self.model_.predict_proba(self._clean(X))
        if isinstance(X, DataFrame):
            # One column per predicted class (not only the first two).
            columns = [self.class_map_inverse_[i] for i in range(y_pred.shape[1])]
            return DataFrame(y_pred, index=X.index, columns=columns)
        return y_pred

    def decision_function(self, X) -> ndarray:
        """The probability of the positive class."""
        y_prob = self.predict_proba(X)
        if isinstance(y_prob, DataFrame):
            y_prob = y_prob.to_numpy()
        return y_prob[:, 1]

    def score(self, X, y):
        """Compute model accuracy."""
        check_is_fitted(self)
        self._check_schema(X)
        X_np, y_np = self._clean(X, y)
        return self.model_.score(X_np, y_np)

Classes

class NaiveBayesClassifier (distributions: Union[list, dict, pomegranate.distributions.distributions.Distribution], pseudo_count: Union[float, dict[pomegranate.distributions.distributions.Distribution, float]] = 1.0, distribution_kwargs: dict[dict] = {})

Naive Bayes classifier that supports feature specific distributions.

$$ p(y,\vec{x}) = p(y) \prod_{i=1}^n p(x_i|y). $$

Args

distributions
Distribution to use across all features, or use a dict to specify a distribution (=value) per feature (=key).
pseudo_count
Pseudo count for all distributions (when float) or per distribution (indicated by key in a dict).
distribution_kwargs
Pass specific keyword arguments (=value) for each distribution type (=key).

Example

from numpy import exp
from pandas import DataFrame
from sklearn.datasets import make_blobs
from statkit.distributions import Gaussian, LogNormal
from statkit.naive_bayes import NaiveBayesClassifier

X, y = make_blobs(n_features=2, centers=2)
X = DataFrame({'a': X[:, 0], 'b': exp(X[:, 1])})
model = NaiveBayesClassifier(distributions={'a': Gaussian, 'b': LogNormal})
model.fit(X, y)
Expand source code
class NaiveBayesClassifier(_BaseNaiveBayes):
    r"""Naive Bayes classifier that supports feature specific distributions.

    $$
    p(y,\vec{x}) =  p(y) \prod_{i=1}^n p(x_i|y).
    $$

    Class labels are encoded as consecutive integers before they are handed to
    the underlying model and decoded again in `predict`. `DataFrame` inputs are
    converted to NumPy arrays and their columns are checked against the columns
    seen during `fit`.
    """

    def _clean(self, X, y=None):
        """Turn `X` into a NumPy array and, when given, encode the labels `y`.

        Returns:
            `X` as array when `y` is `None`, otherwise the tuple of the array
            and the labels mapped onto their integer index.
        """
        features = X.to_numpy() if isinstance(X, DataFrame) else X
        if y is None:
            return features
        return features, self.map_label_(y)

    def __init__(
        self,
        distributions: Union[list, dict, Distribution],
        pseudo_count: Union[float, dict[Distribution, float]] = 1.0,
        distribution_kwargs: dict[dict] = {},
    ):
        """
        Args:
            distributions: Distribution to use across all features, or use a `dict` to
                specify a distribution (=value) per feature (=key).
            pseudo_count:  Pseudo count for all distributions (when float) or per
                distribution (indicated by key in a `dict`).
            distribution_kwargs: Pass specific keyword arguments (=value) for each
                distribution type (=key).

        Example:
            ```
            from numpy import exp
            from pandas import DataFrame
            from sklearn.datasets import make_blobs
            from statkit.distributions import Gaussian, LogNormal
            from statkit.naive_bayes import NaiveBayesClassifier

            X, y = make_blobs(n_features=2, centers=2)
            X = DataFrame({'a': X[:, 0], 'b': exp(X[:, 1])})
            model = NaiveBayesClassifier(distributions={'a': Gaussian, 'b': LogNormal})
            model.fit(X, y)
            ```
        """
        super().__init__(distributions, pseudo_count, distribution_kwargs)

    def _check_schema(self, X):
        """Check that the columns of `X` match the columns seen during `fit`.

        Only `DataFrame` inputs are checked.

        Raises:
            KeyError: On duplicate columns, or when the columns (or their
                order) differ from the training data.
        """
        if not isinstance(X, DataFrame):
            return

        if len(unique(X.columns)) != len(X.columns):
            raise KeyError("Duplicate columns!")

        # Python guarantees order of dicts.
        if list(X.columns) != list(self.column_map_.keys()):
            raise KeyError("Schema skew!")

    def _check_smooth_distributions(self):
        """Verify that all discrete distributions are non-zero."""
        for class_index in range(len(self.classes_)):
            components = self.model_.distributions[class_index].distributions
            for component_index in range(len(components)):
                parameters = components[component_index].parameters
                # When the first element is a dict, it is probably an inflated
                # distribution.
                if not isinstance(parameters[0], dict):
                    continue

                inflated_probabilities = fromiter(parameters[0].values(), dtype=float)
                testing.assert_almost_equal(
                    inflated_probabilities.sum(),
                    1,
                    err_msg=(
                        f"Probabilties of inflated points ({inflated_probabilities})"
                        " are not normalised."
                    ),
                )
                assert all(
                    inflated_probabilities != 0
                ), "Some inflated points have zero probability."

    def fit(self, X, y, weights=None):
        """Estimate feature distributions (per class) and class distribution.

        Args:
            X: `DataFrame` or NumPy matrix with one column per feature.
            y: Class label per row of `X`.
            weights: Passed on to the `fit` method of the parent class.

        Returns:
            The fitted estimator (`self`).
        """
        # Store the classes seen during fit
        labels = unique_labels(y)
        self.classes_ = labels
        self.class_map_ = {k: i for i, k in enumerate(labels)}
        self.class_map_inverse_ = {i: k for i, k in enumerate(labels)}
        self.map_label_ = vectorize(lambda x: self.class_map_[x])
        self.map_label_inverse_ = vectorize(lambda x: self.class_map_inverse_[x])

        # Make a map of the columns.
        if isinstance(X, DataFrame):
            assert len(unique(X.columns)) == len(X.columns), "Duplicate columns!"
            feature_keys = list(X.columns)
        else:
            # Identity map in case of NumPy matrix.
            feature_keys = list(range(X.shape[1]))
        self.column_map_ = {key: i for i, key in enumerate(feature_keys)}

        X_np, y_np = self._clean(X, y)

        # The parent `fit` reads `self.distributions`, so a `dict` is resolved
        # to a list (in column order) for the duration of that call only.
        # Restoring it afterwards keeps the constructor parameter intact, so
        # that the estimator can be refitted on data with another column order.
        configured = self.distributions
        if isinstance(configured, dict):
            self.distributions = [configured[key] for key in feature_keys]
        try:
            super().fit(X_np, y_np, weights)
        finally:
            self.distributions = configured

        # The parent `fit` overwrites `classes_` with the encoded labels;
        # restore the original labels. When the parent expanded a single class
        # to two classes the lengths differ, and its value is kept.
        if len(self.classes_) == len(labels):
            self.classes_ = labels

        self._check_smooth_distributions()

        self.is_fitted_ = True
        return self

    def inspect_distribution(self, column, y=None):
        """Inspect sufficient statistics of given variable.

        Args:
            column: Feature to inspect (a key of the column map made in `fit`).
            y: Class label to inspect; all classes seen during `fit` when `None`.

        Returns:
            The fitted distribution of `column` for class `y`, or, when `y` is
            `None`, a `dict` with the distribution per class label.
        """
        check_is_fitted(self)

        variable_index = self.column_map_[column]
        # Use the keys of the label map (the original labels) instead of
        # `classes_`, so that every label can be looked up in `class_map_`.
        labels = list(self.class_map_) if y is None else [y]

        distributions = {}
        for label in labels:
            class_index = self.class_map_[label]
            per_class = self.model_.distributions[class_index]
            distributions[label] = per_class.distributions[variable_index]

        if y is not None:
            return distributions[y]
        return distributions

    def predict(self, X):
        """Predict labels for samples.

        Returns:
            The decoded labels; a `Series` (indexed like `X`) when `X` is a
            `DataFrame`.
        """
        # Check is fit had been called
        check_is_fitted(self)
        self._check_schema(X)
        encoded = self.model_.predict(self._clean(X))
        labels = self.map_label_inverse_(encoded)
        if isinstance(X, DataFrame):
            return Series(labels, index=X.index, name=X.index.name)
        return labels

    def feature_importance(self, X):
        r"""Compute feature importance for given samples.

        For sample \( i \) with features \( x^{(i)}_1, \dots, x^{(i)}_n \),
        compute importance vector (of size \( n + 1 \) ):
        $$
        \begin{pmatrix}
            \ln p(x^{(i)}_1|y=1) - \ln p(x^{(i)}_1|y=0), \\
             \dots \\
             \ln p(x^{(i)}_n|y=1) - \ln p(x^{(i)}_n|y=0), \\
             \ln p(y=1) - \ln p(y=0)
        \end{pmatrix}
        $$

        Returns:
            One importance vector per row of `X`; a `DataFrame` (with an extra
            "label" column for the class term) when `X` is a `DataFrame`.
        """
        check_is_fitted(self)
        self._check_schema(X)
        X_np = self._clean(X)

        m_rows, n_features = X_np.shape
        n_classes = len(self.classes_)

        assert n_classes == 2, "Importance only defined for binary classes."

        # Log probability per class, per feature (i.e., ln p(x_i|c)) and the
        # class itself ln p(c).
        logp_yx = zeros(shape=[m_rows, n_classes, n_features + 1])

        for y_index in range(n_classes):
            components = self.model_.distributions[y_index].distributions
            for feature_index in range(n_features):
                logp_yx[:, y_index, feature_index] = components[
                    feature_index
                ].log_probability(X_np[:, feature_index])
            # Last element is reserved for class itself: ln[p(c)].
            logp_yx[:, y_index, -1] = self.model_.weights[y_index]

        importance = logp_yx[:, 1] - logp_yx[:, 0]

        if not isinstance(X, DataFrame):
            return importance
        return DataFrame(
            importance, index=X.index, columns=list(X.columns) + ["label"]
        )

    def predict_proba(self, X):
        r"""Estimate probability per sample \( p(y^{(i)}=c|\vec{x}^{(i)}) \), per class \( c \).

        Returns:
            Matrix with one column per class; a `DataFrame` with the class
            labels as columns when `X` is a `DataFrame`.
        """
        # Check is fit had been called
        check_is_fitted(self)
        self._check_schema(X)
        y_pred = self.model_.predict_proba(self._clean(X))
        if isinstance(X, DataFrame):
            # One column per predicted class (not only the first two).
            columns = [self.class_map_inverse_[i] for i in range(y_pred.shape[1])]
            return DataFrame(y_pred, index=X.index, columns=columns)
        return y_pred

    def decision_function(self, X) -> ndarray:
        """The probability of the positive class."""
        y_prob = self.predict_proba(X)
        if isinstance(y_prob, DataFrame):
            y_prob = y_prob.to_numpy()
        return y_prob[:, 1]

    def score(self, X, y):
        """Compute model accuracy."""
        check_is_fitted(self)
        self._check_schema(X)
        X_np, y_np = self._clean(X, y)
        return self.model_.score(X_np, y_np)

Ancestors

  • statkit.naive_bayes._BaseNaiveBayes
  • sklearn.base.ClassifierMixin
  • sklearn.base.BaseEstimator

Methods

def decision_function(self, X) ‑> numpy.ndarray

The probability of the positive class.

Expand source code
def decision_function(self, X) -> ndarray:
    """Return the predicted probability of the positive class.

    The positive class is the second column of `predict_proba`; the result is
    always a NumPy array, also when `predict_proba` yields a `DataFrame`.
    """
    probabilities = self.predict_proba(X)
    matrix = (
        probabilities.to_numpy()
        if isinstance(probabilities, DataFrame)
        else probabilities
    )
    return matrix[:, 1]
def feature_importance(self, X)

Compute feature importance for given samples.

For sample \( i \) with features \( x^{(i)}_1, \dots, x^{(i)}_n \), compute importance vector (of size \( n + 1 \)):
$$
\begin{pmatrix}
    \ln p(x^{(i)}_1|y=1) - \ln p(x^{(i)}_1|y=0), \\
    \dots \\
    \ln p(x^{(i)}_n|y=1) - \ln p(x^{(i)}_n|y=0), \\
    \ln p(y=1) - \ln p(y=0)
\end{pmatrix}
$$

Expand source code
def feature_importance(self, X):
    r"""Compute feature importance for given samples.

    For sample \( i \) with features \( x^{(i)}_1, \dots, x^{(i)}_n \),
    compute importance vector (of size \( n + 1 \) ):
    $$
    \begin{pmatrix}
        \ln p(x^{(i)}_1|y=1) - \ln p(x^{(i)}_1|y=0), \\
         \dots \\
         \ln p(x^{(i)}_n|y=1) - \ln p(x^{(i)}_n|y=0), \\
         \ln p(y=1) - \ln p(y=0)
    \end{pmatrix}
    $$

    Returns:
        One importance vector per row of `X`; a `DataFrame` (with an extra
        "label" column for the class term) when `X` is a `DataFrame`.
    """
    check_is_fitted(self)
    self._check_schema(X)
    samples = self._clean(X)

    n_samples, n_columns = samples.shape
    class_count = len(self.classes_)

    assert class_count == 2, "Importance only defined for binary classes."

    # Per sample and class: ln p(x_i|c) for every feature, followed by one
    # extra slot for the class term.
    log_terms = zeros(shape=[n_samples, class_count, n_columns + 1])

    for class_index in range(class_count):
        components = self.model_.distributions[class_index].distributions
        for column_index in range(n_columns):
            component = components[column_index]
            log_terms[:, class_index, column_index] = component.log_probability(
                samples[:, column_index]
            )
        # Last element is reserved for class itself: ln[p(c)].
        log_terms[:, class_index, -1] = self.model_.weights[class_index]

    # Difference between the second and the first class.
    importance = log_terms[:, 1] - log_terms[:, 0]

    if not isinstance(X, DataFrame):
        return importance
    names = list(X.columns) + ["label"]
    return DataFrame(importance, index=X.index, columns=names)
def fit(self, X, y, weights=None)

Estimate feature distributions (per class) and class distribution.

Expand source code
def fit(self, X, y, weights=None):
    """Estimate feature distributions (per class) and class distribution.

    Args:
        X: `DataFrame` or NumPy matrix with one column per feature.
        y: Class label per row of `X`.
        weights: Passed on to the `fit` method of the parent class.

    Returns:
        The fitted estimator (`self`).
    """
    # Store the classes seen during fit
    labels = unique_labels(y)
    self.classes_ = labels
    self.class_map_ = {k: i for i, k in enumerate(labels)}
    self.class_map_inverse_ = {i: k for i, k in enumerate(labels)}
    self.map_label_ = vectorize(lambda x: self.class_map_[x])
    self.map_label_inverse_ = vectorize(lambda x: self.class_map_inverse_[x])

    # Make a map of the columns.
    if isinstance(X, DataFrame):
        assert len(unique(X.columns)) == len(X.columns), "Duplicate columns!"
        feature_keys = list(X.columns)
    else:
        # Identity map in case of NumPy matrix.
        feature_keys = list(range(X.shape[1]))
    self.column_map_ = {key: i for i, key in enumerate(feature_keys)}

    X_np, y_np = self._clean(X, y)

    # The parent `fit` reads `self.distributions`, so a `dict` is resolved to
    # a list (in column order) for the duration of that call only. Restoring
    # it afterwards keeps the constructor parameter intact, so that the
    # estimator can be refitted on data with another column order.
    configured = self.distributions
    if isinstance(configured, dict):
        self.distributions = [configured[key] for key in feature_keys]
    try:
        super().fit(X_np, y_np, weights)
    finally:
        self.distributions = configured

    # The parent `fit` overwrites `classes_` with the encoded labels; restore
    # the original labels. When the parent expanded a single class to two
    # classes the lengths differ, and its value is kept.
    if len(self.classes_) == len(labels):
        self.classes_ = labels

    self._check_smooth_distributions()

    self.is_fitted_ = True
    return self
def inspect_distribution(self, column, y=None)

Inspect sufficient statistics of given variable.

Expand source code
def inspect_distribution(self, column, y=None):
    """Inspect sufficient statistics of given variable.

    Args:
        column: Feature to inspect (a key of the column map made in `fit`).
        y: Class label to inspect; all classes seen during `fit` when `None`.

    Returns:
        The fitted distribution of `column` for class `y`, or, when `y` is
        `None`, a `dict` with the distribution per class label.
    """
    check_is_fitted(self)

    variable_index = self.column_map_[column]
    # Use the keys of the label map instead of `classes_`, so that every label
    # is guaranteed to be present in `class_map_`.
    labels = list(self.class_map_) if y is None else [y]

    distributions = {}
    for label in labels:
        class_index = self.class_map_[label]
        per_class = self.model_.distributions[class_index]
        distributions[label] = per_class.distributions[variable_index]

    # Only unwrap when a specific class was requested; indexing with
    # `y=None` would fail for a model with a single class.
    if y is not None:
        return distributions[y]
    return distributions
def predict(self, X)

Predict labels for samples.

Expand source code
def predict(self, X):
    """Predict the label of every sample in `X`.

    Returns:
        The decoded labels; a `Series` (indexed like `X`) when `X` is a
        `DataFrame`.
    """
    # Refuse to predict before `fit`, or on a frame with other columns.
    check_is_fitted(self)
    self._check_schema(X)

    encoded = self.model_.predict(self._clean(X))
    # Translate the integer class indices back to the original labels.
    labels = self.map_label_inverse_(encoded)

    if not isinstance(X, DataFrame):
        return labels
    return Series(labels, index=X.index, name=X.index.name)
def predict_proba(self, X)

Estimate probability per sample \( p(y^{(i)}=c|\vec{x}^{(i)}) \), per class \( c \).

Expand source code
def predict_proba(self, X):
    r"""Estimate probability per sample \( p(y^{(i)}=c|\vec{x}^{(i)}) \), per class \( c \).

    Returns:
        Matrix with one column per class; a `DataFrame` with the class labels
        as columns when `X` is a `DataFrame`.
    """
    # Check is fit had been called
    check_is_fitted(self)
    self._check_schema(X)
    y_pred = self.model_.predict_proba(self._clean(X))
    if isinstance(X, DataFrame):
        # Name one column per predicted class instead of assuming exactly two
        # classes.
        columns = [self.class_map_inverse_[i] for i in range(y_pred.shape[1])]
        return DataFrame(y_pred, index=X.index, columns=columns)
    return y_pred
def score(self, X, y)

Compute model accuracy.

Expand source code
def score(self, X, y):
    """Score the fitted model on samples `X` with true labels `y`.

    The inputs are validated and converted in the same way as for `predict`;
    the labels are encoded before they are passed to the underlying model.
    """
    check_is_fitted(self)
    self._check_schema(X)
    features, encoded_labels = self._clean(X, y)
    return self.model_.score(features, encoded_labels)