Source code for iguanas.rule_application.rule_applier
"""Applies rules in the standard Iguanas string format."""
import pandas as pd
import numpy as np
import iguanas.utils as utils
from typing import Dict, Union
from iguanas.utils.types import KoalasDataFrame, KoalasSeries, PandasDataFrame,\
PandasSeries
from iguanas.utils.typing import KoalasDataFrameType, KoalasSeriesType,\
PandasDataFrameType, PandasSeriesType
class RuleApplier:
    """
    Applies rules (stored in the standard Iguanas string format) to a dataset.

    Parameters
    ----------
    rule_strings : Dict[str, str]
        Set of rules defined using the standard Iguanas string format
        (values) and their names (keys).
    opt_func : Callable, optional
        A function/method which calculates a custom metric (e.g. Fbeta score)
        for each rule. Defaults to None.

    Attributes
    ----------
    rule_descriptions : PandasDataFrameType
        Contains the logic of the rules and their performance metrics as
        applied to the dataset. Only set by `transform` when `y` and/or
        `opt_func` is provided.
    unapplied_rule_names : list
        Initialised to an empty list in the constructor; none of the methods
        of this class modify it.
    """

    def __init__(self, rule_strings: Dict[str, str], opt_func=None):
        self.opt_func = opt_func
        self.rule_strings = rule_strings
        self.unapplied_rule_names = []

    def transform(self, X: Union[PandasDataFrameType, KoalasDataFrameType], y=None,
                  sample_weight=None) -> Union[PandasDataFrameType, KoalasDataFrameType]:
        """
        Applies the set of rules to a dataset, `X`. If `y` and/or `opt_func`
        is provided, the rule descriptions (performance metrics, logic and
        number of conditions of each rule) are also calculated and stored in
        the `rule_descriptions` attribute.

        Parameters
        ----------
        X : Union[PandasDataFrameType, KoalasDataFrameType]
            The feature set on which the rules should be applied.
        y : Union[PandasSeriesType, KoalasSeriesType], optional
            The target column. Defaults to None.
        sample_weight : Union[PandasSeriesType, KoalasSeriesType], optional
            Record-wise weights to apply. Defaults to None.

        Returns
        -------
        Union[PandasDataFrameType, KoalasDataFrameType]
            The binary columns of the rules. When rule descriptions are
            calculated, this is the frame returned by
            `utils.sort_rule_dfs_by_opt_metric`.

        Raises
        ------
        KeyError
            If a rule references a feature that is not present in `X`.
        IndexError
            If `rule_strings` is empty.
        """
        utils.check_allowed_types(X, 'X', [PandasDataFrame, KoalasDataFrame])
        if y is not None:
            utils.check_allowed_types(y, 'y', [PandasSeries, KoalasSeries])
        if sample_weight is not None:
            utils.check_allowed_types(
                sample_weight, 'sample_weight', [PandasSeries, KoalasSeries])
        X_rules = self._get_X_rules(X)
        # Without a target and without a custom metric there is nothing to
        # describe, so only the rule columns are returned.
        if y is None and self.opt_func is None:
            return X_rules
        rule_strings_list = list(self.rule_strings.values())
        rule_descriptions = utils.return_rule_descriptions_from_X_rules(
            X_rules=X_rules, X_rules_cols=X_rules.columns, y_true=y,
            sample_weight=sample_weight, opt_func=self.opt_func
        )
        rule_descriptions['Logic'] = rule_strings_list
        rule_descriptions['nConditions'] = [
            utils.count_rule_conditions(rule_string)
            for rule_string in rule_strings_list
        ]
        self.rule_descriptions, X_rules = utils.sort_rule_dfs_by_opt_metric(
            rule_descriptions, X_rules)
        return X_rules

    def _get_X_rules(self, X: Union[PandasDataFrameType, KoalasDataFrameType]) -> Union[
            PandasDataFrameType, KoalasDataFrameType]:
        """
        Returns the binary columns of the list of rules applied to the
        dataset `X`.

        Parameters
        ----------
        X : Union[PandasDataFrameType, KoalasDataFrameType]
            The feature set on which the rules should be applied.

        Returns
        -------
        Union[PandasDataFrameType, KoalasDataFrameType]
            One column per rule, named after the rule and indexed like `X`.

        Raises
        ------
        KeyError
            If a rule references a feature that is not present in `X`.
        IndexError
            If `rule_strings` is empty.
        """
        X_rules_list = []
        # NOTE: this must remain a plain loop in the method body (not a
        # comprehension) and the parameter must stay named `X`: the rule
        # strings are evaluated in this scope and refer to `X` by name.
        for rule_name, rule_string in self.rule_strings.items():
            try:
                # NOTE(security): `eval` executes arbitrary Python code, so
                # `rule_strings` must only ever come from a trusted source.
                X_rule = eval(rule_string)
            except KeyError as e:
                # Chain the original lookup failure so that the traceback
                # shows it as the direct cause of this error.
                raise KeyError(
                    f'Feature {e} in rule `{rule_name}` not found in `X`') from e
            if utils.is_type(X_rule, (PandasSeries, KoalasSeries)):
                # Nulls produced by the rule are treated as "not triggered".
                X_rule = X_rule.fillna(False).astype(int)
                X_rule.name = rule_name
            elif isinstance(X_rule, np.ndarray):
                X_rule = X_rule.astype(int)
            X_rules_list.append(X_rule)
        # The type of the first result decides how the columns are combined.
        if isinstance(X_rules_list[0], np.ndarray):
            # Each array is one rule (row), so transpose to get one column
            # per rule, then name the columns after the rules.
            X_rules = pd.DataFrame(np.asarray(X_rules_list)).T
            X_rules.columns = list(self.rule_strings.keys())
        else:
            X_rules = utils.concat(X_rules_list, axis=1, sort=False)
        X_rules.index = X.index
        return X_rules