# Source code for iguanas.rule_selection.greedy_filter
"""Filters rules using a greedy-type methodology"""
import numpy as np
import pandas as pd
import iguanas.utils as utils
import seaborn as sns
import matplotlib.pyplot as plt
from typing import List, Callable, Tuple
import matplotlib.ticker as ticker
import math
from iguanas.utils.typing import PandasDataFrameType, PandasSeriesType
class GreedyFilter:
    """
    Sorts rules by a given metric, calculates the combined performance of the
    top n rules, then filters to the rules which give the best combined
    performance.

    Parameters
    ----------
    opt_func : Callable
        The method/function used to calculate the performance of the top n
        rules (e.g. Fbeta score).
    rule_descriptions : PandasDataFrameType, optional
        The standard performance metrics dataframe associated with the rules
        (if available). If not given, it will be calculated from `X_rules`.
        Defaults to None.
    sorting_col : str, optional
        Specifies the column within `rule_descriptions` to sort the rules by.
        Defaults to 'Precision'.
    verbose : int, optional
        Controls the verbosity - the higher, the more messages. >0 : shows the
        progress of the filtering process. Defaults to 0.

    Attributes
    ----------
    rules_to_keep : List[str]
        List of rules which give the best combined performance. Set by
        `.fit()`.
    rule_descriptions : PandasDataFrameType
        The rule performance dataframe. Sorted by `sorting_col` during
        `.fit()`, then reduced to `rules_to_keep` by `.transform()`.
    top_n_rule_descriptions : PandasDataFrameType
        The combined performance of the top n rules, indexed by n (index name
        'Top n rules'). Set by `.fit()`.
    top_n_rules : dict
        Maps n to the list of names of the top n rules. Set by `.fit()`.
    """

    def __init__(self, opt_func: Callable,
                 rule_descriptions=None, sorting_col='Precision', verbose=0):
        self.rule_descriptions = rule_descriptions
        self.opt_func = opt_func
        self.sorting_col = sorting_col
        self.verbose = verbose

    def fit(self, X_rules: PandasDataFrameType, y: PandasSeriesType,
            sample_weight=None) -> None:
        """
        Sorts rules by a given metric, calculates the combined performance of
        the top n rules, then calculates the rules which give the best combined
        performance.

        Parameters
        ----------
        X_rules : PandasDataFrameType
            The binary columns of the rules applied to a dataset.
        y : PandasSeriesType
            The binary target column.
        sample_weight : PandasSeriesType, optional
            Row-wise weights to apply. Defaults to None.

        Notes
        -----
        `rule_descriptions` is only calculated when it is currently None. If
        `.transform()` has already reduced it, a later call to `.fit()` on the
        same instance only considers the rules that were kept.
        """
        if self.rule_descriptions is None:
            if self.verbose > 0:
                print('--- Calculating individual rule performances ---')
            self.rule_descriptions = utils.return_binary_pred_perf_of_set(
                y_true=y, y_preds=X_rules, y_preds_columns=X_rules.columns,
                sample_weight=sample_weight, opt_func=self.opt_func)
        self._sort_rule_descriptions()
        sorted_rules = self.rule_descriptions.index.tolist()
        self.top_n_rule_descriptions, self.top_n_rules = self._return_performance_top_n(
            sorted_rules=sorted_rules, X_rules=X_rules, y=y,
            sample_weight=sample_weight, opt_func=self.opt_func,
            verbose=self.verbose)
        self.rules_to_keep = self._return_top_rules_by_opt_func(
            self.top_n_rule_descriptions, self.rule_descriptions)

    def transform(self, X_rules: PandasDataFrameType) -> PandasDataFrameType:
        """
        Reduces the rule set by keeping the rules which give the best combined
        performance.

        Also reduces the `rule_descriptions` attribute to the kept rules.

        Parameters
        ----------
        X_rules : PandasDataFrameType
            The binary columns of the rules applied to a dataset.

        Returns
        -------
        PandasDataFrameType
            The binary columns of the rules which give the best combined
            performance.
        """
        X_rules = X_rules[self.rules_to_keep]
        self.rule_descriptions = self.rule_descriptions.loc[self.rules_to_keep]
        return X_rules

    def fit_transform(self, X_rules: PandasDataFrameType, y: PandasSeriesType,
                      sample_weight=None) -> PandasDataFrameType:
        """
        Sorts rules by a given metric, calculates the combined performance of
        the top n rules, then keeps only the rules which give the best combined
        performance.

        Parameters
        ----------
        X_rules : PandasDataFrameType
            The binary columns of the rules applied to a dataset.
        y : PandasSeriesType
            The binary target column.
        sample_weight : PandasSeriesType, optional
            Row-wise weights to apply. Defaults to None.

        Returns
        -------
        PandasDataFrameType
            The binary columns of the rules which give the best combined
            performance.
        """
        self.fit(X_rules=X_rules, y=y, sample_weight=sample_weight)
        X_rules = self.transform(X_rules=X_rules)
        return X_rules

    def plot_top_n_performance_on_train(self,
                                        figsize=(10, 5),
                                        title='`opt_func` performance of the top n rules on the training set') -> None:
        """
        Plot the combined performance of the top n rules (as calculated using
        the `.fit()` method).

        Parameters
        ----------
        figsize : Tuple[int, int], optional
            Defines the size of the plot (length, height). Defaults to (10, 5).
        title : str, optional
            The plot title. Defaults to '`opt_func` performance of the top n
            rules on the training set'

        Returns
        -------
        None
            The plot is displayed; nothing is returned.
        """
        self._plot_performance(
            data=self.top_n_rule_descriptions['OptMetric'].to_frame(),
            title=title,
            figsize=figsize
        )

    def plot_top_n_performance(self, X_rules: PandasDataFrameType,
                               y: PandasSeriesType,
                               sample_weight=None,
                               figsize=(10, 5),
                               verbose=0,
                               title='`opt_func` performance of the top n rules') -> None:
        """
        Plot the combined performance of the top n rules, using the rule
        order established by the `.fit()` method and the provided rule binary
        columns.

        Parameters
        ----------
        X_rules : PandasDataFrameType
            The binary columns of the rules applied to a dataset.
        y : PandasSeriesType
            The binary target column.
        sample_weight : PandasSeriesType, optional
            Row-wise weights to apply. Defaults to None.
        figsize : Tuple[int, int], optional
            Defines the size of the plot (length, height). Defaults to (10, 5).
        verbose : int, optional
            Controls the verbosity - the higher, the more messages. >0 : shows
            the progress of calculating the combined performance of the top n
            rules. Defaults to 0.
        title : str, optional
            The plot title. Defaults to '`opt_func` performance of the top n
            rules'

        Returns
        -------
        None
            The plot is displayed; nothing is returned.
        """
        # The last entry of `top_n_rules` holds every rule, in sorted order.
        sorted_rules = list(self.top_n_rules.values())[-1]
        top_n_rule_descriptions, _ = self._return_performance_top_n(
            sorted_rules=sorted_rules, X_rules=X_rules, y=y,
            sample_weight=sample_weight, opt_func=self.opt_func,
            verbose=verbose
        )
        self._plot_performance(
            data=top_n_rule_descriptions['OptMetric'].to_frame(),
            title=title, figsize=figsize
        )

    def _sort_rule_descriptions(self) -> None:
        """
        Sorts the `rule_descriptions` dataframe by the `sorting_col` parameter.
        Additional sorting is carried out on the `Logic` column if available -
        this is to ensure repeatable results if multiple rules have exactly
        the same value in the `sorting_col` column.

        The sorted result is stored as a new dataframe, so a dataframe passed
        in by the caller at construction time is not reordered in place.
        """
        if 'Logic' not in self.rule_descriptions.columns:
            self.rule_descriptions = self.rule_descriptions.sort_values(
                self.sorting_col, ascending=False)
        else:
            self.rule_descriptions = self.rule_descriptions.sort_values(
                [self.sorting_col, 'Logic'], ascending=[False, False])

    @staticmethod
    def _return_performance_top_n(sorted_rules: list,
                                  X_rules: PandasDataFrameType, y: PandasSeriesType,
                                  sample_weight: PandasSeriesType,
                                  opt_func: Callable,
                                  verbose: int) -> Tuple[PandasDataFrameType, dict]:
        """
        Calculates the combined performance of the top n rules, for every n
        from 1 to the number of rules in `sorted_rules`.

        Returns the dataframe of combined performances (indexed by n) and a
        dictionary mapping n to the names of the top n rules.
        """
        if verbose > 0:
            print('--- Calculating performance of top n rules ---')
        top_n_rule_descriptions_list = []
        top_n_rules = {}
        # Put the rule columns in sorted order, so column n - 1 is the nth rule.
        rule_matrix = X_rules.reindex(sorted_rules, axis=1).values
        rule_range = utils.return_progress_ready_range(
            verbose=verbose, range=range(1, len(sorted_rules) + 1))
        # Keep a running OR of the rules seen so far, rather than re-reducing
        # the first n columns from scratch on every iteration.
        top_n_combined = None
        for n in rule_range:
            nth_rule = rule_matrix[:, n - 1]
            if top_n_combined is None:
                top_n_combined = nth_rule.copy()
            else:
                top_n_combined = np.bitwise_or(top_n_combined, nth_rule)
            top_n_rules[n] = list(sorted_rules[:n])
            top_n_rule_descriptions_list.append(utils.return_binary_pred_perf_of_set(
                y_true=y, y_preds=top_n_combined, y_preds_columns=[n],
                sample_weight=sample_weight, opt_func=opt_func))
        top_n_rule_descriptions = pd.concat(
            top_n_rule_descriptions_list, axis=0)
        top_n_rule_descriptions.index.rename('Top n rules', inplace=True)
        return top_n_rule_descriptions, top_n_rules

    @staticmethod
    def _return_top_rules_by_opt_func(top_n_rule_descriptions: PandasDataFrameType,
                                      rule_descriptions: PandasDataFrameType) -> List[str]:
        """Returns rules which give the top combined performance"""
        # The index label of `top_n_rule_descriptions` is n, so the label of
        # the best row is also the number of (sorted) rules to keep.
        idx_max_perf_func = top_n_rule_descriptions['OptMetric'].idxmax()
        rules_to_keep = rule_descriptions.index[:idx_max_perf_func].tolist()
        return rules_to_keep

    @staticmethod
    def _plot_performance(data: PandasDataFrameType, title: str,
                          figsize: Tuple[int, int]) -> None:
        """Creates and shows a seaborn lineplot of `data`"""
        sns.set_style("whitegrid")
        plt.figure(figsize=figsize)
        ax = sns.lineplot(data=data)
        # Space the x-axis ticks so there are at most roughly ten of them.
        ax_int = math.ceil(data.index.max()/10)
        ax.xaxis.set_major_locator(ticker.MultipleLocator(ax_int))
        ax.xaxis.set_major_formatter(ticker.ScalarFormatter())
        plt.title(title)
        plt.show()