"""
Generates rules by optimising the thresholds of each feature individually, then
combining them.
"""
import pandas as pd
import numpy as np
import math
from itertools import combinations
from iguanas.correlation_reduction.agglomerative_clustering_reducer import AgglomerativeClusteringReducer
import iguanas.utils as utils
from iguanas.rule_application import RuleApplier
from iguanas.rule_generation._base_generator import BaseGenerator
from iguanas.metrics.pairwise import CosineSimilarity
from iguanas.metrics.classification import FScore
from iguanas.rules import Rules
from iguanas.utils.types import PandasDataFrame, PandasSeries
from iguanas.utils.typing import PandasDataFrameType, PandasSeriesType
from datetime import date
from typing import Callable, List, Tuple, Dict
# Module-level F1 scorer: its `fit` method is the default
# `one_cond_rule_opt_func` used to optimise one condition rules.
f1 = FScore(1)
class RuleGeneratorOpt(BaseGenerator, Rules):
    """
    Generate rules by optimising the thresholds of single features and
    combining these one condition rules with AND conditions to create more
    complex rules.

    Parameters
    ----------
    opt_func : Callable
        A function/method which calculates the desired optimisation metric
        (e.g. Fbeta score). Note that the module will assume higher values
        correspond to better performing rules.
    n_total_conditions : int
        The maximum number of conditions per generated rule.
    num_rules_keep : int
        The top number of rules (by Fbeta score) to keep at the end of each
        stage of rule combination. Reducing this number will improve the
        runtime, but may result in useful rules being removed.
    n_points : int, optional
        Number of points to split a numeric feature's range into when
        generating the numeric one condition rules. A larger number will
        result in better optimised one condition rules, but will take longer
        to calculate. Defaults to 10.
    ratio_window : int, optional
        Factor which determines the optimisation range for numeric features
        (e.g. if a numeric field has range of 1 to 11 and ratio_window = 3,
        the optimisation range for the <= operator will be from 1 to
        (11-1)/3 = 3.33; the optimisation range for the >= operator will be
        from 11-((11-1)/3)=7.67 to 11). A larger number (greater than 1)
        will result in a smaller range being used for optimisation of one
        condition rules; set to 1 if you want to optimise the one condition
        rules across the full range of the numeric feature. Defaults to 2.
    one_cond_rule_opt_func : Callable, optional
        The optimisation function used for one condition rules. Note that
        the module will assume higher values correspond to better performing
        rules. Defaults to the method used for calculating the F1 score.
    remove_corr_rules : bool, optional
        Dictates whether correlated rules should be removed at the end of
        each pairwise combination. Defaults to True.
    target_feat_corr_types : Union[Dict[str, List[str]], str], optional
        Limits the conditions of the rules based on the target-feature
        correlation (e.g. if a feature has a positive correlation with
        respect to the target, then only greater than operators are used for
        conditions that utilise that feature). Can be either a dictionary
        specifying the list of positively correlated features wrt the target
        (under the key `PositiveCorr`) and negatively correlated features
        wrt the target (under the key `NegativeCorr`), or 'Infer' (where
        each target-feature correlation type is inferred from the data).
        Defaults to None.
    verbose : int, optional
        Controls the verbosity - the higher, the more messages. >0 : gives
        the progress of the training of the rules. Defaults to 0.
    rule_name_prefix : str, optional
        Prefix to use for each rule name. If None, the standard prefix is
        used. Defaults to None.

    Attributes
    ----------
    rule_strings : Dict[str, str]
        The generated rules, defined using the standard Iguanas string
        format (values) and their names (keys).
    rule_descriptions : PandasDataFrameType
        A dataframe showing the logic of the rules and their performance
        metrics on the given dataset.
    """
def __init__(self, opt_func: Callable,
n_total_conditions: int, num_rules_keep: int, n_points=10,
ratio_window=2, one_cond_rule_opt_func=f1.fit,
remove_corr_rules=True, target_feat_corr_types=None,
verbose=0, rule_name_prefix=None):
self.opt_func = opt_func
self.n_total_conditions = n_total_conditions
self.num_rules_keep = num_rules_keep
self.n_points = n_points
self.ratio_window = ratio_window
self.one_cond_rule_opt_func = one_cond_rule_opt_func
self.remove_corr_rules = remove_corr_rules
self.target_feat_corr_types = target_feat_corr_types
self.verbose = verbose
self._rule_name_counter = 0
today = date.today()
self.today = today.strftime("%Y%m%d")
self.rule_name_prefix = rule_name_prefix
Rules.__init__(self, rule_strings={}, opt_func=self.opt_func)
def __repr__(self):
if self.rule_strings:
return f'RuleGeneratorOpt object with {len(self.rule_strings)} rules generated'
else:
return f'RuleGeneratorOpt(opt_func={self.opt_func}, n_total_conditions={self.n_total_conditions}, num_rules_keep={self.num_rules_keep}, n_points={self.n_points}, ratio_window={self.ratio_window}, one_cond_rule_opt_func={self.one_cond_rule_opt_func}, remove_corr_rules={self.remove_corr_rules}, target_feat_corr_types={self.target_feat_corr_types})'
[docs] def fit(self, X: PandasDataFrameType, y: PandasSeriesType,
sample_weight=None) -> PandasDataFrameType:
"""
Generate rules by optimising the thresholds of single features and
combining these one condition rules with AND conditions to create more
complex rules.
Parameters
----------
X : PandasDataFrameType
The feature set used for training the model.
y : PandasSeriesType
The target column.
sample_weight : PandasSeriesType, optional
Record-wise weights to apply. Defaults to None.
Returns
-------
PandasDataFrameType
The binary columns of the rules on the fitted dataset.
"""
utils.check_allowed_types(X, 'X', [PandasDataFrame])
utils.check_allowed_types(y, 'y', [PandasSeries])
if sample_weight is not None:
utils.check_allowed_types(
sample_weight, 'sample_weight', [PandasSeries])
if self.target_feat_corr_types == 'Infer':
self.target_feat_corr_types = self._calc_target_ratio_wrt_features(
X=X, y=y
)
rule_descriptions, X_rules = utils.generate_empty_data_structures()
columns_int, columns_cat, columns_float = utils.return_columns_types(
X)
columns_num = [
col for col in columns_int if col not in columns_cat] + columns_float
if columns_num:
if self.verbose > 0:
print(
'--- Generating one condition rules for numeric features ---')
rule_descriptions_num, X_rules_num = self._generate_numeric_one_condition_rules(
X, y, columns_num, columns_int, sample_weight)
rule_descriptions, X_rules = utils.combine_rule_dfs(
rule_descriptions_num, X_rules_num, rule_descriptions, X_rules)
if columns_cat:
if self.verbose > 0:
print(
'--- Generating one condition rules for OHE categorical features ---')
rule_descriptions_cat, X_rules_cat = self._generate_categorical_one_condition_rules(
X, y, columns_cat, sample_weight)
rule_descriptions, X_rules = utils.combine_rule_dfs(
rule_descriptions_cat, X_rules_cat, rule_descriptions, X_rules)
if self.verbose > 0:
print('--- Generating pairwise rules ---')
self.rule_descriptions, X_rules = self._generate_n_order_pairwise_rules(
rule_descriptions, X_rules, y, self.remove_corr_rules, sample_weight)
self.rule_strings = self.rule_descriptions['Logic'].to_dict()
return X_rules
def _generate_numeric_one_condition_rules(self, X: PandasDataFrameType,
y: PandasSeriesType,
columns_num: List[str],
columns_int: List[str],
sample_weight: PandasSeriesType) -> Tuple[PandasDataFrameType, PandasDataFrameType]:
"""
Generates one condition rules for numeric columns by optimising the
threshold of each column based on `opt_func`.
"""
rule_strings = {}
rule_descriptions, X_rules = utils.generate_empty_data_structures()
if self.target_feat_corr_types is not None:
pos_corr_num_feats = [
col for col in columns_num if col in self.target_feat_corr_types['PositiveCorr']]
neg_corr_num_feats = [
col for col in columns_num if col in self.target_feat_corr_types['NegativeCorr']]
cols_and_operators = list(zip(pos_corr_num_feats, ['>='] * len(pos_corr_num_feats))) + list(
zip(neg_corr_num_feats, ['<='] * len(neg_corr_num_feats)))
else:
cols_and_operators = list(
zip(columns_num * 2, [">="] * len(columns_num) + ["<="] * len(columns_num)))
cols_and_operators = utils.return_progress_ready_range(
verbose=self.verbose, range=cols_and_operators)
for column, operator in cols_and_operators:
X_col = X[column].values
if np.std(X_col) == 0:
continue
x_min, x_max = self._set_iteration_range(
X_col=X_col, column=column, operator=operator, n_points=self.n_points,
ratio_window=self.ratio_window, columns_int=columns_int)
x_iter = self._set_iteration_array(
column, columns_int, x_min, x_max, self.n_points)
# Optimise threshold using self.one_cond_rule_beta
opt_metric_iter = self._calculate_opt_metric_across_range(
x_iter=x_iter, operator=operator, X_col=X_col, y=y,
opt_func=self.one_cond_rule_opt_func, sample_weight=sample_weight)
x_max_opt_metric = self._return_x_of_max_opt_metric(
opt_metric_iter, operator, x_iter)
rule_logic = f"(X['{column}']{operator}{x_max_opt_metric})"
rule_name = self._generate_rule_name()
rule_strings[rule_name] = rule_logic
ara = RuleApplier(rule_strings=rule_strings,
opt_func=self.opt_func)
X_rules = ara.transform(X=X, y=y, sample_weight=sample_weight)
rule_descriptions = ara.rule_descriptions
# Remove rules with zero variance and precision == 0
rule_descriptions, X_rules = self._drop_zero_var_and_precision_rules(
rule_descriptions=rule_descriptions, X_rules=X_rules)
return rule_descriptions, X_rules
def _generate_categorical_one_condition_rules(self, X: PandasDataFrameType,
y: PandasSeriesType,
columns_cat: List[str],
sample_weight: PandasDataFrameType) -> Tuple[PandasDataFrameType, PandasDataFrameType]:
"""Generates one condition rules for OHE categorical columns"""
def _gen_rules_from_target_feat_corr_types(X: PandasDataFrameType, y: PandasSeriesType,
columns_cat: List[str],
sample_weight: PandasSeriesType) -> Tuple[PandasDataFrameType, PandasDataFrameType]:
"""
Generates rules using the target-feature correlation types given
in `target_feat_corr_types`.
"""
pos_corr_cat_feats = [
col for col in columns_cat if col in self.target_feat_corr_types['PositiveCorr']]
neg_corr_cat_feats = [
col for col in columns_cat if col in self.target_feat_corr_types['NegativeCorr']]
cols_and_operators = list(zip(pos_corr_cat_feats, ['True'] * len(pos_corr_cat_feats))) + list(
zip(neg_corr_cat_feats, ['False'] * len(neg_corr_cat_feats)))
rule_strings = {self._generate_rule_name(): f"(X['{col}']=={operator})" for col,
operator in cols_and_operators}
# Use opt_func (rather than one_cond_rule_opt_func) as rules (in
# this case) are generated from `target_feat_corr_types`
ara = RuleApplier(
rule_strings=rule_strings, opt_func=self.opt_func
)
X_rules = ara.transform(
X=X, y=y, sample_weight=sample_weight
)
rule_descriptions = ara.rule_descriptions
return rule_descriptions, X_rules
def _gen_rules_best_perf_bool_option(X: PandasDataFrameType, y: PandasSeriesType,
columns_cat: List[str],
sample_weight: PandasSeriesType) -> Tuple[PandasDataFrameType, PandasDataFrameType]:
"""
Generates rules by keeping only the best performing boolean value
per OHE column.
"""
rule_descriptions_list = []
X_rules_list = []
for col in columns_cat:
rule_descriptions_col_list, X_rules_col_list = [], []
for value in ['True', 'False']:
rule_name = self._generate_rule_name()
rule_logic = f"(X['{col}']=={value})"
rule_strings = {rule_name: rule_logic}
ara = RuleApplier(
rule_strings=rule_strings, opt_func=self.one_cond_rule_opt_func)
X_rule = ara.transform(
X=X, y=y, sample_weight=sample_weight)
rule_description = ara.rule_descriptions
if rule_description.iloc[0]['Precision'] == 0:
continue
rule_descriptions_col_list.append(rule_description)
X_rules_col_list.append(X_rule)
rule_descriptions_col = pd.concat(
rule_descriptions_col_list, axis=0)
X_rules_col = pd.concat(X_rules_col_list, axis=1)
# Keep only best performing condition per feature
rule_descriptions_col = rule_descriptions_col.sort_values(
'OptMetric', ascending=False).head(1)
X_rules_col = X_rules_col[rule_descriptions_col.index]
# Re-calculate OptMetric value using opt_func (rather than
# one_cond_rule_opt_func)
rule_descriptions_col['OptMetric'][0] = self.opt_func(
y_true=y, y_preds=X_rules_col.squeeze(), sample_weight=sample_weight)
rule_descriptions_list.append(rule_descriptions_col)
X_rules_list.append(X_rules_col)
rule_descriptions = pd.concat(rule_descriptions_list, axis=0)
X_rules = pd.concat(X_rules_list, axis=1)
return rule_descriptions, X_rules
columns_cat = utils.return_progress_ready_range(
verbose=self.verbose, range=columns_cat)
if self.target_feat_corr_types is not None:
rule_descriptions, X_rules = _gen_rules_from_target_feat_corr_types(
X=X, y=y, columns_cat=columns_cat, sample_weight=sample_weight
)
else:
rule_descriptions, X_rules = _gen_rules_best_perf_bool_option(
X=X, y=y, columns_cat=columns_cat, sample_weight=sample_weight
)
rule_descriptions, X_rules = self._drop_zero_var_and_precision_rules(
rule_descriptions=rule_descriptions, X_rules=X_rules)
return rule_descriptions, X_rules
def _generate_pairwise_rules(self,
X_rules: PandasDataFrameType, y: PandasSeriesType,
rules_combinations: List[Tuple[Tuple[str, str], Tuple[str, str]]],
sample_weight: PandasSeriesType) -> Tuple[PandasDataFrameType, PandasDataFrameType, Dict[str, list]]:
"""Combines binary columns of rules using AND conditions"""
pairwise_info_dict = self._return_pairwise_information(
rules_combinations)
pairwise_logics = list(pairwise_info_dict.keys())
pairwise_info_list = list(pairwise_info_dict.values())
rules_names_1, rules_names_2, pairwise_names = [], [], []
for info_dict in pairwise_info_list:
rules_names_1.append(info_dict['RuleName1'])
rules_names_2.append(info_dict['RuleName2'])
pairwise_names.append(info_dict['PairwiseRuleName'])
X_rules_pairwise_df = self._generate_pairwise_df(
X_rules, rules_names_1, rules_names_2, pairwise_names)
pairwise_descriptions = utils.return_binary_pred_perf_of_set(
y_true=y, y_preds=X_rules_pairwise_df, y_preds_columns=pairwise_names,
sample_weight=sample_weight, opt_func=self.opt_func)
pairwise_descriptions.index.name = 'Rule'
pairwise_descriptions['Logic'] = pairwise_logics
pairwise_descriptions['nConditions'] = pairwise_descriptions['Logic'].apply(
utils.count_rule_conditions)
pairwise_descriptions = pairwise_descriptions.reindex(
['Logic', 'Precision', 'Recall', 'nConditions', 'PercDataFlagged', 'OptMetric'], axis=1)
pairwise_components = dict((info_dict['PairwiseRuleName'], info_dict['PairwiseComponents'])
for _, info_dict in pairwise_info_dict.items())
return pairwise_descriptions, X_rules_pairwise_df, pairwise_components
def _drop_unnecessary_pairwise_rules(self, pairwise_descriptions: PandasDataFrameType,
X_rules_pairwise_df: PandasDataFrameType,
pairwise_to_orig_lookup: Dict[str, Tuple[str, str]],
rule_descriptions: PandasDataFrameType) -> Tuple[PandasDataFrameType, PandasDataFrameType]:
"""
Drops pairwise rules with precision == 0 or that have a precision less
than one of their component rules.
"""
zero_var_rules = self._return_zero_variance_rules(
X_rules=X_rules_pairwise_df)
zero_prec_rules = self._return_zero_precision_rules(
rule_descriptions=pairwise_descriptions)
# Get rules with precision less than either of the individual rules
pw_rules_less_prec = self._return_pairwise_rules_to_drop(
pairwise_descriptions, pairwise_to_orig_lookup, rule_descriptions)
rules_to_drop = list(
set(zero_var_rules + zero_prec_rules + pw_rules_less_prec))
pairwise_descriptions = pairwise_descriptions.drop(
rules_to_drop, axis=0)
X_rules_pairwise_df = X_rules_pairwise_df.drop(
rules_to_drop, axis=1)
return pairwise_descriptions, X_rules_pairwise_df
def _generate_n_order_pairwise_rules(self, rule_descriptions: PandasDataFrameType,
X_rules: PandasDataFrameType, y: PandasSeriesType,
remove_corr_rules: bool,
sample_weight: PandasSeriesType) -> Tuple[PandasDataFrameType, PandasDataFrameType]:
"""
Loops through ruleset (starting with one condition rules) and combines
them pairwise to a given order.
"""
n_loops = int(
math.log(2 ** math.ceil(math.log(self.n_total_conditions, 2)), 2))
loop_range = utils.return_progress_ready_range(
verbose=self.verbose, range=range(1, n_loops + 1))
for n_loop in loop_range:
if remove_corr_rules:
rule_descriptions, X_rules = self._remove_corr_rules(
rule_descriptions=rule_descriptions, X_rules=X_rules)
rules_combinations = self._get_rule_combinations_for_loop(
rule_descriptions, n_loop, self.num_rules_keep)
if len(rules_combinations) == 0:
break
rule_descriptions_pairwise, X_rules_pairwise, pairwise_components = self._generate_pairwise_rules(
X_rules, y, rules_combinations, sample_weight)
rule_descriptions_pairwise, X_rules_pairwise = self._drop_unnecessary_pairwise_rules(
rule_descriptions_pairwise, X_rules_pairwise, pairwise_components, rule_descriptions)
X_rules = pd.concat(
[X_rules, X_rules_pairwise], axis=1)
rule_descriptions = pd.concat(
[rule_descriptions, rule_descriptions_pairwise], axis=0)
rule_descriptions = rule_descriptions[rule_descriptions['nConditions']
<= self.n_total_conditions]
X_rules = X_rules[rule_descriptions.index.tolist()]
rule_descriptions, X_rules = utils.sort_rule_dfs_by_opt_metric(
rule_descriptions, X_rules)
return rule_descriptions, X_rules
def _return_pairwise_information(self,
rules_combinations: List[Tuple[Tuple[str, str], Tuple[str, str]]]) -> Dict[str, dict]:
"""
Returns a dict of the pairwise rule logic and associated
information
"""
def clean_rule_logic(rule_name: str) -> str:
"""Removes duplicate columns in combined rule logic"""
rule_name_list = rule_name.split("&")
rule_name_set = sorted(list(set(rule_name_list)))
rule_name = '&'.join(rule_name_set)
return rule_name
pairwise_info_dict = {}
rule_logics_list = []
# Loop through rule combinations and calculate pairwise logic. Then
# link the component rule names, logic and distinct components to the
# pairwise logic
for (rule_name_1, rule_name_2), (rule_logic_1, rule_logic_2) in rules_combinations:
pairwise_rule_logic = clean_rule_logic(
f'{rule_logic_1}&{rule_logic_2}')
if rule_logics_list.count(pairwise_rule_logic) == 0:
pairwise_rule_name = self._generate_rule_name()
pairwise_info_dict[pairwise_rule_logic] = {
'RuleName1': rule_name_1,
'RuleName2': rule_name_2,
'PairwiseRuleName': pairwise_rule_name,
'PairwiseComponents': [rule_name_1, rule_name_2]
}
rule_logics_list.append(pairwise_rule_logic)
else:
pairwise_info_dict[pairwise_rule_logic]['PairwiseComponents'].extend(
[rule_name_1, rule_name_2])
pairwise_info_dict[pairwise_rule_logic]['PairwiseComponents'] = list(set(
pairwise_info_dict[pairwise_rule_logic]['PairwiseComponents']))
return pairwise_info_dict
def _drop_zero_var_and_precision_rules(self,
rule_descriptions: PandasDataFrameType,
X_rules: PandasDataFrameType) -> Tuple[PandasDataFrameType, PandasDataFrameType]:
"""
Drops zero variance and zero precisions rules from rule_descriptions
and X_rules
"""
zero_var_rules = self._return_zero_variance_rules(X_rules=X_rules)
zero_precision_rules = self._return_zero_precision_rules(
rule_descriptions=rule_descriptions)
rules_to_drop = list(set(zero_var_rules + zero_precision_rules))
rule_descriptions = rule_descriptions.drop(
rules_to_drop, axis=0)
X_rules = X_rules.drop(rules_to_drop, axis=1)
return rule_descriptions, X_rules
def _generate_rule_name(self) -> str:
"""Generates rule name"""
if self.rule_name_prefix is None:
rule_name = f'RGO_Rule_{self.today}_{self._rule_name_counter}'
else:
rule_name = f'{self.rule_name_prefix}_{self._rule_name_counter}'
self._rule_name_counter += 1
return rule_name
@staticmethod
def _remove_corr_rules(rule_descriptions: PandasDataFrameType,
X_rules: PandasDataFrameType) -> Tuple[PandasDataFrameType, PandasDataFrameType]:
"""
Remove correlated rules using the AgglomerativeClusteringReducer class
"""
cs = CosineSimilarity()
rcr = AgglomerativeClusteringReducer(
threshold=0.75, columns_performance=rule_descriptions['OptMetric'],
strategy='bottom_up', similarity_function=cs.fit)
X_rules = rcr.fit_transform(X_rules)
rule_descriptions = rule_descriptions.loc[X_rules.columns]
return rule_descriptions, X_rules
@staticmethod
def _set_iteration_range(X_col: np.array, column: str, operator: str,
n_points: int, ratio_window: int,
columns_int: List[str]) -> Tuple[float, float]:
"""Sets the iteration range for a given column"""
X_col_max = max(X_col)
X_col_min = min(X_col)
if column in columns_int and n_points > abs(X_col_max - X_col_min):
x_min = X_col_min
x_max = X_col_max
elif operator == "<=":
x_min = X_col_min
x_max = x_min + (X_col_max - x_min) / ratio_window
elif operator == ">=":
x_max = X_col_max
x_min = x_max - (x_max - X_col_min) / ratio_window
return (x_min, x_max)
@staticmethod
def _set_iteration_array(column: str, columns_int: List[str], x_min: float,
x_max: float, n_points: int) -> np.array:
"""Returns the iteration array for a given column"""
def _round_to_n_sf(x: float, n_sf: int) -> float:
"""Method for rounding a float to n significant figures"""
if x == 0:
return 0
return round(x, -int(math.floor(math.log10(abs(x)))) + (n_sf - 1))
if column in columns_int:
x_min, x_max = int(x_min), int(x_max)
if abs(x_min - x_max) < n_points:
x_iter = np.array(range(x_min, x_max + 1))
else:
x_iter = np.ceil(np.linspace(x_min, x_max, n_points))
else:
x_iter = np.linspace(x_min, x_max, n_points)
x_iter = np.array([_round_to_n_sf(x, 2) for x in x_iter])
return x_iter
@staticmethod
def _calculate_opt_metric_across_range(x_iter: np.array, operator: str,
X_col: np.array, y: np.array,
opt_func: Callable[[PandasSeriesType, PandasSeriesType, PandasSeriesType], PandasSeriesType],
sample_weight: np.array) -> np.array:
"""
Calculates the optimisation function at each point in the x_iter
range
"""
opt_metric_iter = np.zeros(len(x_iter))
for i, x in enumerate(x_iter):
X_rule = eval(f'X_col {operator} {x}').astype(int)
opt_metric_iter[i] = opt_func(
y_true=y, y_preds=X_rule, sample_weight=sample_weight)
return opt_metric_iter
@staticmethod
def _return_x_of_max_opt_metric(opt_metric_iter: np.array, operator: str,
x_iter: np.array) -> float:
"""Returns the threshold value which maximises the FBeta score"""
max_opt_metric = np.nanmax(opt_metric_iter)
if max_opt_metric == 0:
return None
if operator == "<=":
idx_max_opt_metric = min([i for i, j in enumerate(
opt_metric_iter) if j == max_opt_metric])
elif operator == ">=":
idx_max_opt_metric = max([i for i, j in enumerate(
opt_metric_iter) if j == max_opt_metric])
return x_iter[idx_max_opt_metric]
@staticmethod
def _get_rule_combinations_for_loop(rule_descriptions: PandasDataFrameType,
n_loop: int,
num_rules_keep: int) -> List[Tuple[Tuple[str, str], Tuple[str, str]]]:
"""Generates pairwise combinations of rules"""
# At beginning of each loop, filter list of rules to include only those
# needed for pairwise calculation
rules_n_conditions = rule_descriptions[(
rule_descriptions['nConditions'] == 2 ** (n_loop - 1))]
# Then sort resulting ruleset by OptMetric and take top num_rules_keep
# rules for pairwise calculation
rules_n_conditions = rules_n_conditions.sort_values(
by='OptMetric', ascending=False)
rules_n_conditions = rules_n_conditions.iloc[:num_rules_keep]
# Get the rule names and their logic
rule_names = rules_n_conditions.index.values
rule_logic = rules_n_conditions['Logic'].values
# Calculate distinct combinations of both the rule names and their
# logic
rules_logic_combinations = list(combinations(rule_logic, r=2))
rules_name_combinations = list(combinations(rule_names, r=2))
# Combine these into a list
rules_combinations = list(
zip(rules_name_combinations, rules_logic_combinations))
return rules_combinations
@staticmethod
def _generate_pairwise_df(X_rules: PandasDataFrameType, rules_names_1: List[str],
rules_names_2: List[str],
pairwise_names: List[str]) -> PandasDataFrameType:
"""
Multiplies the component rules together to give the pairwise dataframe
"""
X_rules_pairwise_arr = X_rules[rules_names_1].values * \
X_rules[rules_names_2].values
X_rules_pairwise_df = pd.DataFrame(
X_rules_pairwise_arr, columns=pairwise_names, index=X_rules.index)
return X_rules_pairwise_df
@staticmethod
def _return_pairwise_rules_to_drop(pairwise_descriptions: PandasDataFrameType,
pairwise_to_orig_lookup: Dict[str, List[str]],
rule_descriptions: PandasDataFrameType) -> List[str]:
"""
Drops pairwise rule if its precision is less than or equal to the
precision of one of its component rules
"""
rules_to_drop = []
for idx, row in pairwise_descriptions.iterrows():
orig_rules = pairwise_to_orig_lookup[idx]
max_orig_prec = rule_descriptions.loc[orig_rules, 'Precision'].max(
)
if row['Precision'] <= max_orig_prec:
rules_to_drop.append(idx)
return rules_to_drop
@staticmethod
def _return_zero_variance_rules(X_rules: PandasDataFrameType) -> List[str]:
"""Returns list of zero variance rules"""
X_rules_std = X_rules.to_numpy().std(axis=0)
mask = X_rules_std == 0
zero_var_rules = X_rules.columns[mask].tolist()
return zero_var_rules
@staticmethod
def _return_zero_precision_rules(rule_descriptions: PandasDataFrameType) -> List[str]:
"""Returns list of zero precision rules"""
mask = rule_descriptions['Precision'].to_numpy() == 0
zero_precision_rules = rule_descriptions.index[mask].tolist()
return zero_precision_rules