Source code for iguanas.rbs.rbs_optimiser

"""Optimises a Rules-Based System (RBS) pipeline."""
from typing import List
from hyperopt import hp, tpe, fmin
import numpy as np
from copy import deepcopy
from iguanas.rbs import RBSPipeline
import iguanas.utils as utils
from iguanas.utils.types import PandasDataFrame, PandasSeries
from iguanas.utils.typing import PandasDataFrameType, PandasSeriesType


class RBSOptimiser(RBSPipeline):
    """
    Optimises the rules within an RBS Pipeline based on an optimisation
    function.

    If the `config` of the given pipeline is empty, then the pipeline
    configuration is optimised from scratch; else, the rules included within
    the existing pipeline configuration are optimised.

    Parameters
    ----------
    pipeline : RBSPipeline
        The RBS Pipeline to optimise. A deep copy is taken, so the object
        passed in is not modified.
    n_iter : int
        The number of iterations that the optimiser should perform.
    algorithm : Callable, optional
        The algorithm leveraged by hyperopt's `fmin` function, which
        optimises the rules. Defaults to tpe.suggest, which corresponds to
        Tree-of-Parzen-Estimator.
    rule_types : Dict[int, List[str]], optional
        The list of rules (values) that are assigned to each decision (keys),
        either 0 or 1. Must be given when the `config` parameter in the
        `pipeline` is empty. Defaults to None.
    verbose : int, optional
        Controls the verbosity - the higher, the more messages. >0 : shows
        the overall progress of the optimisation process. Defaults to 0.
    **kwargs
        Any additional keyword arguments, forwarded unchanged to hyperopt's
        `fmin` function.

    Raises
    ------
    ValueError
        If `config` not provided in `pipeline`, `rule_types` must be given.

    Attributes
    ----------
    config : List[dict]
        The optimised pipeline configuration, where each element aligns to a
        stage in the pipeline. Each element is a dictionary, where the key is
        the decision made at that stage (either 0 or 1) and the value is a
        list of the rules that must trigger to give that decision.
    pipeline_opt_metric : float
        The result of the `opt_func` function when the pipeline is applied.
    conf_matrix : PandasDataFrameType
        The confusion matrix for the applied pipeline. Only generated after
        running `calc_performance`.
    conf_matrix_weighted : PandasDataFrameType
        The confusion matrix for the applied pipeline. Only generated after
        running `calc_performance` and when `sample_weight` is provided.
    pipeline_perf : PandasDataFrameType
        The performance (precision, recall, percentage of data flagged) of
        each decision made by the pipeline. Only generated after running
        `calc_performance`.
    """

    def __init__(self, pipeline: RBSPipeline, n_iter: int,
                 algorithm=tpe.suggest, rule_types=None, verbose=0,
                 **kwargs) -> None:
        # Work on a private copy so that trial configurations written during
        # optimisation never leak into the caller's pipeline.
        self.pipeline = deepcopy(pipeline)
        self.n_iter = n_iter
        self.algorithm = algorithm
        self.rule_types = rule_types
        self.verbose = verbose
        self.kwargs = kwargs
        self.config = self.pipeline.config
        # Truthiness (rather than `!= []`) so that any empty/absent config is
        # treated as "not given" and the `rule_types` check below applies.
        self.config_given = bool(self.pipeline.config)
        if not self.config_given and rule_types is None:
            raise ValueError(
                'If `config` not provided in `pipeline`, `rule_types` must be given.')
        super().__init__(
            config=self.config,
            final_decision=self.pipeline.final_decision,
            opt_func=self.pipeline.opt_func,
        )

    def fit(self, X_rules: PandasDataFrameType, y: PandasSeriesType,
            sample_weight: PandasSeriesType = None) -> None:
        """
        Optimises the pipeline for the given dataset.

        Parameters
        ----------
        X_rules : PandasDataFrameType
            Dataset of each applied rule.
        y : PandasSeriesType
            The target.
        sample_weight : PandasSeriesType, optional
            Record-wise weights to apply. Defaults to None.
        """
        utils.check_allowed_types(X_rules, 'X_rules', [PandasDataFrame])
        utils.check_allowed_types(y, 'y', [PandasSeries])
        if sample_weight is not None:
            utils.check_allowed_types(
                sample_weight, 'sample_weight', [PandasSeries])
        # Get space functions
        space_funcs = self._get_space_funcs(X_rules)
        # Optimise pipeline
        opt_thresholds = self._optimise_pipeline(
            X_rules, y, sample_weight, space_funcs
        )
        # Generate config
        self._generate_config(opt_thresholds)

    def fit_predict(self, X_rules: PandasDataFrameType, y: PandasSeriesType,
                    sample_weight: PandasSeriesType = None
                    ) -> PandasSeriesType:
        """
        Optimises the pipeline for the given dataset and applies the
        pipeline to the dataset.

        Parameters
        ----------
        X_rules : PandasDataFrameType
            Dataset of each applied rule.
        y : PandasSeriesType
            The target.
        sample_weight : PandasSeriesType, optional
            Record-wise weights to apply. Defaults to None.

        Returns
        -------
        PandasSeriesType
            The prediction of the pipeline.
        """
        self.fit(X_rules, y, sample_weight)
        return self.predict(X_rules, y, sample_weight)

    def _get_space_funcs(self, X_rules: PandasDataFrameType) -> dict:
        """
        Returns the hyperopt search space for each rule (column of
        `X_rules`).

        When a config was given, each rule gets a single keep/drop choice.
        Otherwise each rule gets a pair of choices, labelled
        `<rule>%activate` and `<rule>%stage`: whether the rule is used, and
        the position used to order it within the generated pipeline.
        """
        if self.config_given:
            return {
                rule: hp.choice(rule, [0, 1]) for rule in X_rules.columns
            }
        # One candidate position per rule column.
        stages = list(range(X_rules.shape[1]))
        return {
            rule: (
                hp.choice(f'{rule}%activate', [0, 1]),
                hp.choice(f'{rule}%stage', stages),
            )
            for rule in X_rules.columns
        }

    def _optimise_pipeline(self, X_rules: PandasDataFrameType,
                           y: PandasSeriesType,
                           sample_weight: PandasSeriesType,
                           space_funcs: dict) -> dict:
        """
        Runs hyperopt's `fmin` over the search space and returns its result
        (a dictionary keyed by the labels used in `space_funcs`).
        """

        def _evaluate(config: List[dict], active_rules: list) -> float:
            """Applies `config` via the working pipeline and scores it."""
            self.pipeline.config = config
            self.pipeline.predict(X_rules[active_rules], y, sample_weight)
            # `fmin` minimises its objective, so the metric is negated.
            return -self.pipeline.pipeline_opt_metric

        def _objective_update_config(sampled: dict) -> float:
            """Evaluates the optimisation metric for the updated pipeline"""
            active_rules = [
                rule for rule, keep in sampled.items() if keep == 1
            ]
            # `_update_config` returns a fresh config, so `self.config` is
            # left intact between iterations.
            return _evaluate(
                self._update_config(sampled, self.config), active_rules
            )

        def _objective_generate_config(sampled: dict) -> float:
            """Evaluates the optimisation metric for the generated pipeline"""
            active_rules = [
                rule for rule, (keep, _) in sampled.items() if keep == 1
            ]
            return _evaluate(self._create_config(sampled), active_rules)

        if self.config_given:
            _objective = _objective_update_config
        else:
            _objective = _objective_generate_config
        # NOTE(review): a fixed seed keeps runs reproducible. Some hyperopt
        # releases appear to expect a `numpy.random.Generator` for `rstate`
        # rather than a `RandomState` - confirm against the pinned version.
        opt_thresholds = fmin(
            fn=_objective,
            space=space_funcs,
            algo=self.algorithm,
            max_evals=self.n_iter,
            verbose=self.verbose > 0,
            rstate=np.random.RandomState(0),
            **self.kwargs
        )
        return opt_thresholds

    def _generate_config(self, opt_thresholds: dict) -> None:
        """
        Generates the final pipeline config from the optimiser's result and
        stores it in `self.config`.
        """
        if self.config_given:
            self.config = self._update_config(opt_thresholds, self.config)
        else:
            opt_thresholds = self._convert_opt_thr(opt_thresholds)
            self.config = self._create_config(opt_thresholds)

    def _create_config(self, space_funcs: dict) -> List[dict]:
        """
        Creates a pipeline config from per-rule `(keep, position)` pairs.

        Kept rules are ordered by position (ties keep their original order);
        consecutive rules that share a decision are merged into one stage.
        A rule listed in `rule_types[0]` gets decision 0, any other rule
        gets decision 1.
        """
        positions = {
            rule: position
            for rule, (keep, position) in space_funcs.items()
            if keep == 1
        }
        # `sorted` is stable, so equal positions retain insertion order.
        ordered_rules = sorted(positions, key=positions.get)
        # Built once so each membership test below is O(1).
        zero_rules = set(self.rule_types[0])
        config = []
        previous_decision = None
        for rule in ordered_rules:
            decision = 0 if rule in zero_rules else 1
            if config and decision == previous_decision:
                config[-1][decision].append(rule)
            else:
                config.append({decision: [rule]})
            previous_decision = decision
        return config

    @staticmethod
    def _update_config(space_funcs: dict, config: List[dict]) -> List[dict]:
        """
        Returns a copy of `config` without the rules marked 0 in
        `space_funcs`.

        The `config` argument is not modified; stages are kept (possibly
        with an empty rule list) so the number of stages is unchanged.
        """
        dropped = {
            rule for rule, to_keep in space_funcs.items() if to_keep == 0
        }
        return [
            {
                decision: [rule for rule in rules if rule not in dropped]
                for decision, rules in stage.items()
            }
            for stage in config
        ]

    @staticmethod
    def _convert_opt_thr(opt_thresholds: dict) -> dict:
        """
        Converts the output of the optimiser (keys `<rule>%activate` and
        `<rule>%stage`) into the `{rule: [activate, stage]}` format expected
        by `_create_config`.

        Raises
        ------
        ValueError
            If a key does not contain the `%` separator.
        """
        slots = {'activate': 0, 'stage': 1}
        converted = {}
        for rule_label, value in opt_thresholds.items():
            # Split on the LAST '%' so that rule names which themselves
            # contain '%' are preserved intact.
            rule, separator, value_type = rule_label.rpartition('%')
            if not separator:
                raise ValueError(
                    f'Unexpected optimiser label (no `%` found): {rule_label}')
            entry = converted.setdefault(rule, [None, None])
            if value_type in slots:
                entry[slots[value_type]] = value
        return converted