# Source code for iguanas.rule_selection.greedy_filter

"""Filters rules using a greedy-type methodology"""
import numpy as np
import pandas as pd
import iguanas.utils as utils
import seaborn as sns
import matplotlib.pyplot as plt
from typing import List, Callable, Tuple
import matplotlib.ticker as ticker
import math

from iguanas.utils.typing import PandasDataFrameType, PandasSeriesType


class GreedyFilter:
    """
    Sorts rules by a given metric, calculates the combined performance of the
    top n rules, then filters to the rules which give the best combined
    performance.

    Parameters
    ----------
    opt_func : Callable
        The method/function used to calculate the performance of the top n
        rules (e.g. Fbeta score).
    rule_descriptions : PandasDataFrameType, optional
        The standard performance metrics dataframe associated with the rules
        (if available). If not given, it will be calculated from `X_rules`.
        The dataframe passed in is never modified in place. Defaults to None.
    sorting_col : str, optional
        Specifies the column within `rule_descriptions` to sort the rules by.
        Defaults to 'Precision'.
    verbose : int, optional
        Controls the verbosity - the higher, the more messages. >0 : shows
        the progress of the filtering process. Defaults to 0.

    Attributes
    ----------
    rules_to_keep : List[str]
        List of rules which give the best combined performance (set by
        `.fit()`).
    top_n_rule_descriptions : PandasDataFrameType
        The combined performance of the top n rules, indexed by n (set by
        `.fit()`).
    top_n_rules : dict
        Maps n to the list of the top n rule names (set by `.fit()`).
    """

    def __init__(self, opt_func: Callable, rule_descriptions=None,
                 sorting_col='Precision', verbose=0):
        self.rule_descriptions = rule_descriptions
        self.opt_func = opt_func
        self.sorting_col = sorting_col
        self.verbose = verbose

    def fit(self, X_rules: PandasDataFrameType, y: PandasSeriesType,
            sample_weight=None) -> None:
        """
        Sorts rules by a given metric, calculates the combined performance of
        the top n rules, then calculates the rules which give the best
        combined performance.

        Parameters
        ----------
        X_rules : PandasDataFrameType
            The binary columns of the rules applied to a dataset.
        y : PandasSeriesType
            The binary target column.
        sample_weight : PandasSeriesType, optional
            Row-wise weights to apply. Defaults to None.
        """
        if self.rule_descriptions is None:
            if self.verbose > 0:
                print('--- Calculating individual rule performances ---')
            self.rule_descriptions = utils.return_binary_pred_perf_of_set(
                y_true=y, y_preds=X_rules, y_preds_columns=X_rules.columns,
                sample_weight=sample_weight, opt_func=self.opt_func)
        self._sort_rule_descriptions()
        sorted_rules = self.rule_descriptions.index.tolist()
        self.top_n_rule_descriptions, self.top_n_rules = \
            self._return_performance_top_n(
                sorted_rules=sorted_rules, X_rules=X_rules, y=y,
                sample_weight=sample_weight, opt_func=self.opt_func,
                verbose=self.verbose)
        self.rules_to_keep = self._return_top_rules_by_opt_func(
            self.top_n_rule_descriptions, self.rule_descriptions)

    def transform(self, X_rules: PandasDataFrameType) -> PandasDataFrameType:
        """
        Reduces the rule set by keeping the rules which give the best
        combined performance.

        Note that this also filters the `rule_descriptions` attribute down to
        the kept rules.

        Parameters
        ----------
        X_rules : PandasDataFrameType
            The binary columns of the rules applied to a dataset.

        Returns
        -------
        PandasDataFrameType
            The binary columns of the rules which give the best combined
            performance.
        """
        X_rules = X_rules[self.rules_to_keep]
        self.rule_descriptions = self.rule_descriptions.loc[self.rules_to_keep]
        return X_rules

    def fit_transform(self, X_rules: PandasDataFrameType,
                      y: PandasSeriesType,
                      sample_weight=None) -> PandasDataFrameType:
        """
        Sorts rules by a given metric, calculates the combined performance of
        the top n rules, then keeps only the rules which give the best
        combined performance.

        Parameters
        ----------
        X_rules : PandasDataFrameType
            The binary columns of the rules applied to a dataset.
        y : PandasSeriesType
            The binary target column.
        sample_weight : PandasSeriesType, optional
            Row-wise weights to apply. Defaults to None.

        Returns
        -------
        PandasDataFrameType
            The binary columns of the rules which give the best combined
            performance.
        """
        self.fit(X_rules=X_rules, y=y, sample_weight=sample_weight)
        return self.transform(X_rules=X_rules)

    def plot_top_n_performance_on_train(
            self, figsize=(10, 5),
            title='`opt_func` performance of the top n rules on the training set'
    ) -> None:
        """
        Plot the combined performance of the top n rules (as calculated using
        the `.fit()` method).

        Parameters
        ----------
        figsize : Tuple[int, int], optional
            Defines the size of the plot (length, height). Defaults to
            (10, 5).
        title : str, optional
            The plot title. Defaults to '`opt_func` performance of the top n
            rules on the training set'

        Returns
        -------
        None
            The lineplot of the combined performance of the top n rules is
            displayed rather than returned.
        """
        self._plot_performance(
            data=self.top_n_rule_descriptions['OptMetric'].to_frame(),
            title=title,
            figsize=figsize
        )

    def plot_top_n_performance(
            self, X_rules: PandasDataFrameType, y: PandasSeriesType,
            sample_weight=None, figsize=(10, 5), verbose=0,
            title='`opt_func` performance of the top n rules') -> None:
        """
        Plot the combined performance of the top n rules (as calculated using
        the `.fit()` method) using the provided rule binary columns.

        Parameters
        ----------
        X_rules : PandasDataFrameType
            The binary columns of the rules applied to a dataset.
        y : PandasSeriesType
            The binary target column.
        sample_weight : PandasSeriesType, optional
            Row-wise weights to apply. Defaults to None.
        figsize : Tuple[int, int], optional
            Defines the size of the plot (length, height). Defaults to
            (10, 5).
        verbose : int, optional
            Controls the verbosity - the higher, the more messages. >0 :
            shows the progress of calculating the combined performance of
            the top n rules. Defaults to 0.
        title : str, optional
            The plot title. Defaults to '`opt_func` performance of the top n
            rules'

        Returns
        -------
        None
            The lineplot of the combined performance of the top n rules,
            calculated using the provided rule binary columns, is displayed
            rather than returned.
        """
        # The last entry of `top_n_rules` holds every rule in the order
        # established during `.fit()`.
        sorted_rules = list(self.top_n_rules.values())[-1]
        top_n_rule_descriptions, _ = self._return_performance_top_n(
            sorted_rules=sorted_rules,
            X_rules=X_rules,
            y=y,
            sample_weight=sample_weight,
            opt_func=self.opt_func,
            verbose=verbose
        )
        self._plot_performance(
            data=top_n_rule_descriptions['OptMetric'].to_frame(),
            title=title,
            figsize=figsize
        )

    def _sort_rule_descriptions(self) -> None:
        """
        Sorts the `rule_descriptions` dataframe (descending) by the
        `sorting_col` parameter. Additional sorting is carried out on the
        `Logic` column if available - this is to ensure repeatable results if
        multiple rules have exactly the same value in the `sorting_col`
        column.

        A sorted copy is assigned back to `self.rule_descriptions`, so a
        dataframe supplied by the caller is not reordered in place.
        """
        if 'Logic' in self.rule_descriptions.columns:
            sort_by = [self.sorting_col, 'Logic']
            ascending = [False, False]
        else:
            sort_by = self.sorting_col
            ascending = False
        self.rule_descriptions = self.rule_descriptions.sort_values(
            sort_by, ascending=ascending)

    @staticmethod
    def _return_performance_top_n(sorted_rules: list,
                                  X_rules: PandasDataFrameType,
                                  y: PandasSeriesType,
                                  sample_weight: PandasSeriesType,
                                  opt_func: Callable,
                                  verbose: int) -> Tuple[PandasDataFrameType, dict]:
        """
        Calculates the combined (OR-ed) performance of the top n rules, for
        every n from 1 to the number of rules in `sorted_rules`.

        Parameters
        ----------
        sorted_rules : list
            Rule names, already ordered from best to worst.
        X_rules : PandasDataFrameType
            The binary columns of the rules applied to a dataset.
        y : PandasSeriesType
            The binary target column.
        sample_weight : PandasSeriesType
            Row-wise weights to apply (may be None).
        opt_func : Callable
            The method/function used to calculate the performance.
        verbose : int
            >0 : prints a header and shows progress.

        Returns
        -------
        Tuple[PandasDataFrameType, dict]
            The per-n performance dataframe (index named 'Top n rules') and
            a dictionary mapping n to the list of the top n rule names.
        """
        if verbose > 0:
            print('--- Calculating performance of top n rules ---')
        top_n_rule_descriptions_list = []
        top_n_rules = {}
        X_rules = X_rules.reindex(sorted_rules, axis=1)
        rule_names = X_rules.columns.tolist()
        rule_values = X_rules.values
        rule_range = utils.return_progress_ready_range(
            verbose=verbose, range=range(1, len(sorted_rules) + 1))
        # Keep a running OR of the columns seen so far instead of re-reducing
        # the first n columns on every iteration (linear rather than
        # quadratic in the number of rules). A new array is created at each
        # step, so the array handed to the previous call is never modified.
        top_n_combined = None
        for n in rule_range:
            next_rule = rule_values[:, n - 1]
            if top_n_combined is None:
                top_n_combined = next_rule
            else:
                top_n_combined = np.bitwise_or(top_n_combined, next_rule)
            top_n_rules[n] = rule_names[:n]
            top_n_rule_descriptions_list.append(
                utils.return_binary_pred_perf_of_set(
                    y_true=y, y_preds=top_n_combined, y_preds_columns=[n],
                    sample_weight=sample_weight, opt_func=opt_func))
        top_n_rule_descriptions = pd.concat(
            top_n_rule_descriptions_list, axis=0)
        top_n_rule_descriptions.index.rename('Top n rules', inplace=True)
        return top_n_rule_descriptions, top_n_rules

    @staticmethod
    def _return_top_rules_by_opt_func(
            top_n_rule_descriptions: PandasDataFrameType,
            rule_descriptions: PandasDataFrameType) -> List[str]:
        """
        Returns rules which give the top combined performance.

        The index of `top_n_rule_descriptions` is n (the number of top rules
        combined), so the label of the best 'OptMetric' row is the number of
        leading rows of the (sorted) `rule_descriptions` to keep.
        """
        idx_max_perf_func = top_n_rule_descriptions['OptMetric'].idxmax()
        rules_to_keep = rule_descriptions.index[:idx_max_perf_func].tolist()
        return rules_to_keep

    @staticmethod
    def _plot_performance(data: PandasDataFrameType, title: str,
                          figsize: Tuple[int, int]) -> None:
        """Creates and shows a seaborn lineplot of `data`."""
        sns.set_style("whitegrid")
        plt.figure(figsize=figsize)
        ax = sns.lineplot(data=data)
        # Space the major x ticks so that roughly ten of them are drawn.
        ax_int = math.ceil(data.index.max()/10)
        ax.xaxis.set_major_locator(ticker.MultipleLocator(ax_int))
        ax.xaxis.set_major_formatter(ticker.ScalarFormatter())
        plt.title(title)
        plt.show()