Module imodels.rule_list.bayesian_rule_list.bayesian_rule_list
Expand source code
import numpy as np
import pandas as pd
import random
from collections import Counter, defaultdict
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.multiclass import check_classification_targets, unique_labels
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from imodels.rule_list.bayesian_rule_list.brl_util import (
default_permsdic, preds_d_t, run_bdl_multichain_serial, merge_chains, get_point_estimate, get_rule_rhs
)
from imodels.rule_list.rule_list import RuleList
from imodels.util.convert import itemsets_to_rules
from imodels.util.extract import extract_fpgrowth
from imodels.util.rule import get_feature_dict, replace_feature_name, Rule
class BayesianRuleListClassifier(BaseEstimator, RuleList, ClassifierMixin):
"""
This is a scikit-learn compatible wrapper for the Bayesian Rule List
classifier developed by Benjamin Letham. It produces a highly
interpretable model (a list of decision rules) by sampling many different
rule lists, trying to optimize for compactness and predictive performance.
Parameters
----------
listlengthprior : int, optional (default=3)
Prior hyperparameter for expected list length (excluding null rule)
listwidthprior : int, optional (default=1)
Prior hyperparameter for expected list width (excluding null rule)
maxcardinality : int, optional (default=2)
Maximum cardinality of an itemset
minsupport : float, optional (default=0.1)
Minimum support (fraction between 0 and 1) of an itemset
disc_strategy : str, optional (default='mdlp')
Strategy used to discretize numeric features before rule mining
disc_kwargs : dict, optional (default={})
Keyword arguments passed to the discretization step
alpha : array_like, shape = [n_classes]
prior hyperparameter for multinomial pseudocounts
n_chains : int, optional (default=3)
Number of MCMC chains for inference
max_iter : int, optional (default=50000)
Maximum number of iterations
class1label: str, optional (default="class 1")
Label or description of what the positive class (with y=1) means
verbose: bool, optional (default=False)
Verbose output
random_state: int
Random seed
"""
def __init__(self,
listlengthprior=3,
listwidthprior=1,
maxcardinality=2,
minsupport=0.1,
disc_strategy='mdlp',
disc_kwargs={},
alpha=np.array([1., 1.]),
n_chains=3,
max_iter=50000,
class1label="class 1",
verbose=False,
random_state=42):
self.listlengthprior = listlengthprior
self.listwidthprior = listwidthprior
self.maxcardinality = maxcardinality
self.minsupport = minsupport
self.disc_strategy = disc_strategy
self.disc_kwargs = disc_kwargs
self.alpha = alpha
self.n_chains = n_chains
self.max_iter = max_iter
self.class1label = class1label
self.verbose = verbose
self._zmin = 1
self.thinning = 1 # The thinning rate
self.burnin = self.max_iter // 2 # the number of samples to drop as burn-in in-simulation
self.discretizer = None
self.d_star = None
self.random_state = random_state
self.seed()
def seed(self):
if self.random_state is not None:
random.seed(self.random_state)
np.random.seed(self.random_state)
def _setlabels(self, X, feature_names=[]):
if len(feature_names) == 0:
if type(X) == pd.DataFrame and ('object' in str(X.columns.dtype) or 'str' in str(X.columns.dtype)):
feature_names = X.columns
else:
feature_names = ["ft" + str(i + 1) for i in range(len(X[0]))]
self.feature_names = feature_names
def fit(self, X, y, feature_names: list = None, undiscretized_features=[], verbose=False):
"""Fit rule lists to data.
Note: Numerical data in `X` is automatically discretized.
To prevent discretization (e.g. to protect columns containing categorical data represented as integers),
pass the list of protected column names in the `fit` method,
e.g. `model.fit(X,y,undiscretized_features=['CAT_COLUMN_NAME'])`
(entries in undiscretized columns will be converted to strings and used as categorical values)
Parameters
----------
X : array-like, shape = [n_samples, n_features]
Training data
y : array_like, shape = [n_samples]
Labels
feature_names : array_like, shape = [n_features], optional (default: [])
String labels for each feature.
If empty and X is a DataFrame, column labels are used.
If empty and X is not a DataFrame, then features are simply enumerated
undiscretized_features : array_like, shape = [n_features], optional (default: [])
String labels for each feature which is NOT to be discretized.
If empty, all numeric features are discretized
verbose : bool
Currently doesn't do anything
Returns
-------
self : returns an instance of self.
"""
self.seed()
if len(set(y)) != 2:
raise Exception("Only binary classification is supported at this time!")
X, y = check_X_y(X, y)
check_classification_targets(y)
self.n_features_in_ = X.shape[1]
self.classes_ = unique_labels(y)
self.feature_dict_ = get_feature_dict(X.shape[1], feature_names)
self.feature_placeholders = np.array(list(self.feature_dict_.keys()))
self.feature_names = np.array(list(self.feature_dict_.values()))
itemsets, self.discretizer = extract_fpgrowth(X, y,
feature_names=self.feature_placeholders,
minsupport=self.minsupport,
maxcardinality=self.maxcardinality,
undiscretized_features=undiscretized_features,
disc_strategy=self.disc_strategy,
disc_kwargs=self.disc_kwargs,
verbose=verbose)
X_df_onehot = self.discretizer.transform(X)
# Now form the data-vs.-lhs set
# X[j] is the set of data points that contain itemset j (that is, satisfy rule j)
for c in X_df_onehot.columns:
X_df_onehot[c] = [c if x == 1 else '' for x in list(X_df_onehot[c])]
X = [{}] * (len(itemsets) + 1)
X[0] = set(range(len(X_df_onehot))) # the default rule satisfies all data
for (j, lhs) in enumerate(itemsets):
X[j + 1] = set([i for (i, xi) in enumerate(X_df_onehot.values) if set(lhs).issubset(xi)])
# now form lhs_len
lhs_len = [0]
for lhs in itemsets:
lhs_len.append(len(lhs))
nruleslen = Counter(lhs_len)
lhs_len = np.array(lhs_len)
itemsets_all = ['null']
itemsets_all.extend(itemsets)
Xtrain, Ytrain, nruleslen, lhs_len, self.itemsets = (
X, np.vstack((1 - np.array(y), y)).T.astype(int), nruleslen, lhs_len, itemsets_all
)
permsdic = defaultdict(default_permsdic) # We will store here the MCMC results
# Do MCMC
res, Rhat = run_bdl_multichain_serial(self.max_iter, self.thinning, self.alpha, self.listlengthprior,
self.listwidthprior, Xtrain, Ytrain, nruleslen, lhs_len,
self.maxcardinality, permsdic, self.burnin, self.n_chains,
[None] * self.n_chains, verbose=self.verbose, seed=self.random_state)
# Merge the chains
permsdic = merge_chains(res)
# The point estimate, BRL-point
self.d_star = get_point_estimate(permsdic, lhs_len, Xtrain, Ytrain, self.alpha, nruleslen, self.maxcardinality,
self.listlengthprior, self.listwidthprior,
verbose=self.verbose) # get the point estimate
if self.d_star:
# Compute the rule consequent
self.theta, self.ci_theta = get_rule_rhs(Xtrain, Ytrain, self.d_star, self.alpha, True)
self.final_itemsets = np.array(self.itemsets, dtype=object)[self.d_star]
rule_strs = itemsets_to_rules(self.final_itemsets)
self.rules_without_feature_names_ = [Rule(r) for r in rule_strs]
self.rules_ = [
replace_feature_name(rule, self.feature_dict_) for rule in self.rules_without_feature_names_
]
self.complexity_ = self._get_complexity()
return self
def _get_complexity(self):
n_rule_terms = sum([len(iset) for iset in self.final_itemsets if type(iset) != str])
return n_rule_terms + 1
# def __repr__(self, decimals=1):
# if self.d_star:
# detect = ""
# if self.class1label != "class 1":
# detect = "for detecting " + self.class1label
# header = "Trained RuleListClassifier " + detect + "\n"
# separator = "".join(["="] * len(header)) + "\n"
# s = ""
# for i, j in enumerate(self.d_star):
# if self.itemsets[j] != 'null':
# condition = "ELSE IF " + (
# " AND ".join([str(self.itemsets[j][k]) for k in range(len(self.itemsets[j]))])) + " THEN"
# else:
# condition = "ELSE"
# s += condition + " probability of " + self.class1label + ": " + str(
# np.round(self.theta[i] * 100, decimals)) + "% (" + str(
# np.round(self.ci_theta[i][0] * 100, decimals)) + "%-" + str(
# np.round(self.ci_theta[i][1] * 100, decimals)) + "%)\n"
# return header + separator + s[5:] + separator[1:]
# else:
# return "(Untrained RuleListClassifier)"
def __repr__(self, decimals=1):
if self.d_star:
detect = ""
if self.class1label != "class 1":
detect = "for detecting " + self.class1label
header = "Trained RuleListClassifier " + detect + "\n"
separator = "".join(["="] * len(header)) + "\n"
s = ""
for i in range(len(self.rules_) + 1):
if i != len(self.rules_):
condition = "ELSE IF " + str(self.rules_[i]) + " THEN"
else:
condition = "ELSE"
s += condition + " probability of " + self.class1label + ": " + str(
np.round(self.theta[i] * 100, decimals)) + "% (" + str(
np.round(self.ci_theta[i][0] * 100, decimals)) + "%-" + str(
np.round(self.ci_theta[i][1] * 100, decimals)) + "%)\n"
return header + separator + s[5:] + separator[1:]
else:
return "(Untrained RuleListClassifier)"
def _to_itemset_indices(self, X_df_onehot):
# X[j] is the set of data points that contain itemset j (that is, satisfy rule j)
for c in X_df_onehot.columns:
X_df_onehot[c] = [c if x == 1 else '' for x in list(X_df_onehot[c])]
X = [set() for j in range(len(self.itemsets))]
X[0] = set(range(X_df_onehot.shape[0])) # the default rule satisfies all data
for (j, lhs) in enumerate(self.itemsets):
if j > 0:
X[j] = set([i for (i, xi) in enumerate(X_df_onehot.values) if set(lhs).issubset(xi)])
return X
def predict_proba(self, X):
"""Compute probabilities of possible outcomes for samples in X.
Parameters
----------
X : array-like, shape = [n_samples, n_features]
Returns
-------
T : array-like, shape = [n_samples, n_classes]
Returns the probability of the sample for each class in
the model. The columns correspond to the classes in sorted
order, as they appear in the attribute `classes_`.
"""
check_is_fitted(self)
X = check_array(X)
if self.discretizer:
D = self.discretizer.transform(X)
else:
D = pd.DataFrame(X, columns=self.feature_names)
N = len(D)
X2 = self._to_itemset_indices(D)
P = preds_d_t(X2, np.zeros((N, 1), dtype=int), self.d_star, self.theta)
return np.vstack((1 - P, P)).T
def predict(self, X, threshold=0.1):
"""Perform classification on samples in X.
Parameters
----------
X : array-like, shape = [n_samples, n_features]
threshold : float, optional (default=0.1)
Decision threshold applied to the predicted probability of the positive class
Returns
-------
y_pred : array, shape = [n_samples]
Class labels for samples in X.
"""
check_is_fitted(self)
X = check_array(X)
# print('predicting!')
# print('preds_proba', self.predict_proba(X)[:, 1])
return 1 * (self.predict_proba(X)[:, 1] >= threshold)
Classes
class BayesianRuleListClassifier (listlengthprior=3, listwidthprior=1, maxcardinality=2, minsupport=0.1, disc_strategy='mdlp', disc_kwargs={}, alpha=array([1., 1.]), n_chains=3, max_iter=50000, class1label='class 1', verbose=False, random_state=42)
This is a scikit-learn compatible wrapper for the Bayesian Rule List classifier developed by Benjamin Letham. It produces a highly interpretable model (a list of decision rules) by sampling many different rule lists, trying to optimize for compactness and predictive performance.
Parameters
listlengthprior : int, optional (default=3)
    Prior hyperparameter for expected list length (excluding null rule)
listwidthprior : int, optional (default=1)
    Prior hyperparameter for expected list width (excluding null rule)
maxcardinality : int, optional (default=2)
    Maximum cardinality of an itemset
minsupport : float, optional (default=0.1)
    Minimum support (fraction between 0 and 1) of an itemset
disc_strategy : str, optional (default='mdlp')
    Strategy used to discretize numeric features before rule mining
disc_kwargs : dict, optional (default={})
    Keyword arguments passed to the discretization step
alpha : array_like, shape = [n_classes]
    Prior hyperparameter for multinomial pseudocounts
n_chains : int, optional (default=3)
    Number of MCMC chains for inference
max_iter : int, optional (default=50000)
    Maximum number of iterations
class1label : str, optional (default="class 1")
    Label or description of what the positive class (with y=1) means
verbose : bool, optional (default=False)
    Verbose output
random_state : int
    Random seed
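A minimal usage sketch with hypothetical toy data (the short max_iter is only to keep the example fast; real settings and outputs will differ):
import numpy as np
from imodels.rule_list.bayesian_rule_list.bayesian_rule_list import BayesianRuleListClassifier

# Hypothetical toy data: 200 samples, 3 numeric features, binary labels driven by ft1 + ft2
np.random.seed(0)
X = np.random.rand(200, 3)
y = (X[:, 0] + X[:, 1] > 1).astype(int)

# A short chain keeps this sketch fast; keep the default max_iter for real problems
model = BayesianRuleListClassifier(max_iter=1000, class1label="high sum", random_state=0)
model.fit(X, y, feature_names=["ft1", "ft2", "ft3"])

print(model)                 # IF / ELSE IF rule list with per-rule probabilities
print(model.predict(X[:5]))  # hard labels for the first five samples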
Ancestors
- sklearn.base.BaseEstimator
- RuleList
- sklearn.base.ClassifierMixin
Methods
def fit(self, X, y, feature_names=None, undiscretized_features=[], verbose=False)
Fit rule lists to data.
Note: Numerical data in `X` is automatically discretized. To prevent discretization (e.g. to protect columns containing categorical data represented as integers), pass the list of protected column names in the `fit` method, e.g. `model.fit(X, y, undiscretized_features=['CAT_COLUMN_NAME'])` (entries in undiscretized columns will be converted to strings and used as categorical values).
Parameters
X : array-like, shape = [n_samples, n_features]
    Training data
y : array_like, shape = [n_samples]
    Labels
feature_names : array_like, shape = [n_features], optional (default: [])
    String labels for each feature. If empty and X is a DataFrame, column labels are used. If empty and X is not a DataFrame, then features are simply enumerated
undiscretized_features : array_like, shape = [n_features], optional (default: [])
    String labels for each feature which is NOT to be discretized. If empty, all numeric features are discretized
verbose : bool
    Currently doesn't do anything
Returns
self : returns an instance of self.
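A sketch of the undiscretized_features option, mirroring the docstring's note above; the DataFrame and column names are hypothetical:
import numpy as np
import pandas as pd

# Hypothetical data: a numeric column plus an integer-coded categorical column
np.random.seed(0)
df = pd.DataFrame({
    "age": np.random.randint(20, 70, size=300),
    "region_code": np.random.randint(0, 3, size=300),  # categories stored as ints
})
y = (df["age"].values > 45).astype(int)

model = BayesianRuleListClassifier(max_iter=1000, random_state=0)
# Protect 'region_code' from discretization; its values are used as categorical levels
model.fit(df.values, y, feature_names=list(df.columns),
          undiscretized_features=["region_code"])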
def predict(self, X, threshold=0.1)
Perform classification on samples in X.
Parameters
X : array-like, shape = [n_samples, n_features]
threshold : float, optional (default=0.1)
    Decision threshold applied to the predicted probability of the positive class
Returns
y_pred : array, shape = [n_samples]
    Class labels for samples in X.
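Note that the default decision threshold in the signature is 0.1, not 0.5. A sketch of overriding it (assumes a fitted `model` and some held-out `X_test`):
probs = model.predict_proba(X_test)[:, 1]               # P(positive class) per sample
preds_default = model.predict(X_test)                    # thresholds P(positive) at 0.1
preds_stricter = model.predict(X_test, threshold=0.5)    # stricter cutoff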
def predict_proba(self, X)
Compute probabilities of possible outcomes for samples in X.
Parameters
X : array-like, shape = [n_samples, n_features]
Returns
T : array-like, shape = [n_samples, n_classes]
    Returns the probability of the sample for each class in the model. The columns correspond to the classes in sorted order, as they appear in the attribute `classes_`.
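A sketch of reading the output (assumes a fitted `model` and some held-out `X_test`); the column order follows `classes_`:
proba = model.predict_proba(X_test)
print(model.classes_)   # e.g. [0 1]; the columns of proba follow this order
print(proba[:3])        # each row is [P(class 0), P(class 1)] and sums to 1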
def seed(self)
Seeds Python's `random` module and NumPy's global random state with `random_state` (does nothing if `random_state` is None).