"""Generates rules using decision trees."""
from datetime import date
import sys
from typing import Callable, Dict, List, Set, Tuple, Union

import numpy as np
from joblib import Parallel, delayed
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

import iguanas.utils as utils
from iguanas.rules import Rules
from iguanas.rule_generation._base_generator import BaseGenerator
from iguanas.utils.types import PandasDataFrame, PandasSeries
from iguanas.utils.typing import PandasSeriesType, PandasDataFrameType


class RuleGeneratorDT(Rules, BaseGenerator):
"""
Generate rules by extracting the highest performing branches from a
tree ensemble model.

    Parameters
----------
opt_func : Callable
A function/method which calculates the desired optimisation
metric (e.g. Fbeta score).
n_total_conditions : int
The maximum number of conditions per generated rule.
tree_ensemble : Union[RandomForestClassifier, ExtraTreesClassifier]
        Instantiated Sklearn tree ensemble classifier object used to
        generate the rules.
precision_threshold : float, optional
        Precision threshold for a tree/branch to be used in rule
        generation. If the overall precision of the tree/branch is less
        than or equal to this value, it is not used to generate rules.
        Note that if `bootstrap=True` is set on the `tree_ensemble`, the
        precision is calculated on the bootstrapped sample used to create
        each tree. Defaults to 0.
num_cores : int, optional
The number of cores to use when iterating through the ensemble to
generate rules. Defaults to 1.
target_feat_corr_types : Union[Dict[str, List[str]], str], optional
Limits the conditions of the rules based on the target-feature
correlation (e.g. if a feature has a positive correlation with
respect to the target, then only greater than operators are used
for conditions that utilise that feature). Can be either a
        dictionary specifying the list of features positively correlated
        with the target (under the key `PositiveCorr`) and the list of
        features negatively correlated with the target (under the key
        `NegativeCorr`), or 'Infer' (where each target-feature
        correlation type is inferred from the data).
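        For example, ``{'PositiveCorr': ['feat_a'], 'NegativeCorr':
        ['feat_b']}`` (with purely illustrative feature names) would
        restrict rules to greater-than conditions on `feat_a` and
        less-than conditions on `feat_b`. Defaults to None.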
verbose : int, optional
Controls the verbosity - the higher, the more messages. >0 : gives
the overall progress of the training of the ensemble model and the
extraction of the rules from the trees; >1 : also shows the
progress of the training of the individual trees in the ensemble
model. Defaults to 0.
rule_name_prefix : str, optional
Prefix to use for each rule name. If None, the standard prefix is
used. Defaults to None.

    Attributes
----------
rule_strings : Dict[str, str]
The generated rules, defined using the standard Iguanas string format
(values) and their names (keys).
rule_descriptions : PandasDataFrameType
A dataframe showing the logic of the rules and their performance
metrics on the given dataset.
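
    Examples
    --------
    A minimal usage sketch (assumes the `FScore` optimisation metric from
    `iguanas.metrics` is available, and that `X` is a Pandas DataFrame of
    features with `y` the binary target):

    >>> from sklearn.ensemble import RandomForestClassifier
    >>> from iguanas.metrics.classification import FScore
    >>> f1 = FScore(beta=1)
    >>> generator = RuleGeneratorDT(
    ...     opt_func=f1.fit,
    ...     n_total_conditions=4,
    ...     tree_ensemble=RandomForestClassifier(n_estimators=10)
    ... )
    >>> X_rules = generator.fit(X=X, y=y)  # doctest: +SKIP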
"""
def __init__(self, opt_func: Callable,
n_total_conditions: int,
tree_ensemble: Union[RandomForestClassifier, ExtraTreesClassifier],
precision_threshold=0,
num_cores=1,
target_feat_corr_types=None,
verbose=0,
rule_name_prefix=None):
self.tree_ensemble = tree_ensemble
self.tree_ensemble.max_depth = n_total_conditions
self.tree_ensemble.random_state = 0
self.opt_func = opt_func
self.precision_threshold = precision_threshold
self.num_cores = num_cores
self.target_feat_corr_types = target_feat_corr_types
self.verbose = verbose
self.tree_ensemble.verbose = verbose
self._rule_name_counter = 0
        self.today = date.today().strftime("%Y%m%d")
self.rule_name_prefix = rule_name_prefix
Rules.__init__(self, rule_strings={}, opt_func=self.opt_func)

    def __repr__(self):
if self.rule_strings:
return f'RuleGeneratorDT object with {len(self.rule_strings)} rules generated'
else:
return f'RuleGeneratorDT(opt_func={self.opt_func}, n_total_conditions={self.tree_ensemble.max_depth}, tree_ensemble={self.tree_ensemble}, precision_threshold={self.precision_threshold}, num_cores={self.num_cores}, target_feat_corr_types={self.target_feat_corr_types})'

    def fit(self, X: PandasDataFrameType, y: PandasSeriesType,
            sample_weight=None) -> PandasDataFrameType:
"""
Generates rules by extracting the highest performing branches in a tree
ensemble model.

        Parameters
----------
X : PandasDataFrameType
The feature set used for training the model.
y : PandasSeriesType
The target column.
sample_weight : PandasSeriesType, optional
Record-wise weights to apply. Defaults to None.

        Returns
-------
PandasDataFrameType
The binary columns of the rules on the fitted dataset.
"""
utils.check_allowed_types(X, 'X', [PandasDataFrame])
utils.check_allowed_types(y, 'y', [PandasSeries])
if sample_weight is not None:
utils.check_allowed_types(
sample_weight, 'sample_weight', [PandasSeries])
if self.target_feat_corr_types == 'Infer':
if self.verbose:
print(
'--- Calculating correlation of features with respect to the target ---')
self.target_feat_corr_types = self._calc_target_ratio_wrt_features(
X=X, y=y
)
if self.verbose:
print('--- Returning column datatypes ---')
columns_int, columns_cat, _ = utils.return_columns_types(X)
if self.verbose:
print('--- Training tree ensemble ---')
trained_tree_ensemble = self._train_ensemble(
X=X, y=y, tree_ensemble=self.tree_ensemble,
sample_weight=sample_weight, verbose=self.verbose)
if self.verbose:
print('--- Extracting rules from tree ensemble ---')
X_rules = self._extract_rules_from_ensemble(
X=X,
y=y,
num_cores=self.num_cores,
tree_ensemble=trained_tree_ensemble,
columns_int=columns_int,
columns_cat=columns_cat,
sample_weight=sample_weight
)
self.rule_strings = self.rule_descriptions['Logic'].to_dict()
return X_rules

    def _extract_rules_from_ensemble(self, X: PandasDataFrameType, y: PandasSeriesType,
tree_ensemble: Union[RandomForestClassifier, ExtraTreesClassifier],
num_cores: int,
sample_weight: PandasSeriesType,
columns_int: List[str],
columns_cat: List[str]) -> PandasDataFrameType:
"""
        Extracts all of the rules from the trained tree ensemble.
"""
decision_trees = utils.return_progress_ready_range(
verbose=self.verbose, range=tree_ensemble.estimators_)
        with Parallel(n_jobs=num_cores) as parallel:
            list_of_rule_string_sets = parallel(
                delayed(self._extract_rules_from_dt)(
                    X.columns.tolist(), decision_tree, columns_int,
                    columns_cat
                )
                for decision_tree in decision_trees
            )
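        # Take the union of the per-tree rule sets to de-duplicate rules
        # generated by more than one tree; sorting keeps the subsequent
        # rule naming deterministic.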
        rule_strings_set = sorted(set().union(*list_of_rule_string_sets))
        self.rule_strings = {
            self._generate_rule_name_dt(): rule_string
            for rule_string in rule_strings_set
        }
if not self.rule_strings:
raise Exception(
'No rules could be generated. Try changing the class parameters.')
X_rules = self.transform(X=X, y=y, sample_weight=sample_weight)
return X_rules

    def _extract_rules_from_dt(self, columns: List[str],
decision_tree: DecisionTreeClassifier,
columns_int: List[str], columns_cat: List[str]) -> Set[str]:
"""
        Skips decision trees whose overall precision is at or below the
        precision threshold; otherwise returns the set of rules extracted
        from the tree.
"""
left, right, features, thresholds, precisions, tree_prec = self._get_dt_attributes(
decision_tree
)
if tree_prec <= self.precision_threshold:
return set()
else:
return self._extract_rules_from_tree(
columns=columns, precision_threshold=self.precision_threshold,
columns_int=columns_int,
columns_cat=columns_cat, left=left, right=right,
features=features, thresholds=thresholds, precisions=precisions
)

    @staticmethod
def _train_ensemble(X: PandasDataFrameType, y: PandasSeriesType,
tree_ensemble: Union[RandomForestClassifier,
ExtraTreesClassifier],
sample_weight: PandasSeriesType,
verbose: int) -> Union[RandomForestClassifier,
ExtraTreesClassifier]:
"""Method for running ML model"""
def _switch_stderr_stdout(verbose: int):
"""Switches stderr and stdout, if verbose > 0"""
if verbose > 0:
sys.stdout, sys.stderr = sys.stderr, sys.stdout
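        # The streams are swapped before fitting and swapped back
        # afterwards; this appears intended to route the ensemble's
        # verbose training output onto the same stream as the other
        # progress messages.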
_switch_stderr_stdout(verbose)
tree_ensemble.fit(X=X, y=y, sample_weight=sample_weight)
_switch_stderr_stdout(verbose)
return tree_ensemble

    @staticmethod
    def _get_dt_attributes(
            decision_tree: DecisionTreeClassifier
    ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray,
               np.ndarray, float]:
"""Returns the attributes associated with a given DT"""
left = decision_tree.tree_.children_left
right = decision_tree.tree_.children_right
thresholds = decision_tree.tree_.threshold
node_splits = decision_tree.tree_.value
features = decision_tree.tree_.feature
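        # `tree_.value` has shape (n_nodes, n_outputs, n_classes); for
        # this binary, single-output classifier, each entry summarises
        # the class distribution of the samples reaching the node. The
        # logic below assumes these are (weighted) sample counts, as in
        # the scikit-learn versions this module was written against.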
node_precs = np.empty(len(node_splits))
tps_l, tps_fps_l = [], []
for i, node_split in enumerate(node_splits):
node_precision = node_split[0][1]/np.sum(node_split[0])
node_pred = np.argmax(node_split[0])
node_precs[i] = node_precision
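            # Leaf nodes are marked by children_left == -1. Leaves that
            # predict the positive class contribute their positive-class
            # count (TPs) and their total count (TPs + FPs) to the
            # tree-level precision.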
if left[i] == -1 and node_pred == 1:
tps_l.append(node_splits[i][0][1])
tps_fps_l.append(sum(node_splits[i][0]))
tps = sum(tps_l)
tps_fps = sum(tps_fps_l)
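        # Overall tree precision = TPs / (TPs + FPs) across all
        # positive-predicting leaves; guard against division by zero when
        # no leaf predicts the positive class.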
tree_prec = 0 if tps_fps == 0 else tps/tps_fps
return left, right, features, thresholds, node_precs, tree_prec