"""Defines a Rules-Based System (RBS) pipeline."""
import pandas as pd
import numpy as np
import iguanas.utils as utils
from iguanas.utils.types import PandasDataFrame, PandasSeries
from iguanas.utils.typing import PandasDataFrameType, PandasSeriesType
from typing import List, Callable
[docs]class RBSPipeline:
"""
A pipeline with each stage giving a decision - either 0 or 1 (corresponding
to the binary target). Each stage is configured with a set of rules which,
if any of them trigger, mark the relevant records with that decision.
Parameters
----------
config : List[dict]
The pipeline configuration, where each element aligns to a stage in
the pipeline. Each element is a dictionary, where the key is the
decision made at that stage (either 0 or 1) and the value is a list
of the rules that must trigger to give that decision.
final_decision : int
The final decision to apply if no rules are triggered. Must be
either 0 or 1.
opt_func : Callable
The optimisation function used to calculate the performance metric
of the pipeline (e.g. F1 score).
Raises
------
ValueError
`config` must be a list.
ValueError
`final_decision` must be either 0 or 1.
Attributes
----------
pipeline_opt_metric : float
The result of the `opt_func` function when the pipeline is applied.
conf_matrix : PandasDataFrameType
The confusion matrix for the applied pipeline. Only generated after
running `calc_performance`.
conf_matrix_weighted : PandasDataFrameType
The confusion matrix for the applied pipeline. Only generated after
running `calc_performance` and when `sample_weight` is provided.
pipeline_perf : PandasDataFrameType
The performance (precision, recall, percentage of data flagged) of each
decision made by the pipeline. Only generated after running
`calc_performance`.
"""
def __init__(self, config: List[dict], final_decision: int,
opt_func: Callable) -> None:
if not isinstance(config, list):
raise ValueError('`config` must be a list')
if final_decision not in [0, 1]:
raise ValueError('`final_decision` must be either 0 or 1')
self.config = config
self.final_decision = final_decision
self.opt_func = opt_func
[docs] def predict(self, X_rules: PandasDataFrameType, y: PandasSeriesType,
sample_weight=None) -> PandasSeriesType:
"""
Applies the pipeline to the given dataset.
Parameters
----------
X_rules : PandasDataFrameType
Dataset of each applied rule.
y : PandasSeriesType
The target.
sample_weight : PandasSeriesType, optional
Record-wise weights to apply. Defaults to None. Defaults to
None.
Returns
-------
PandasSeriesType
The prediction of the pipeline.
"""
utils.check_allowed_types(X_rules, 'X_rules', [PandasDataFrame])
utils.check_allowed_types(y, 'y', [PandasSeries])
if sample_weight is not None:
utils.check_allowed_types(
sample_weight, 'sample_weight', [PandasSeries])
num_rows = len(X_rules)
stage_level_preds = self._get_stage_level_preds(X_rules, self.config)
y_pred = self._get_pipeline_pred(stage_level_preds, num_rows)
self.pipeline_opt_metric = self.opt_func(
y_preds=y_pred, y_true=y, sample_weight=sample_weight
)
return y_pred
@staticmethod
def _get_stage_level_preds(X_rules: PandasDataFrameType,
config: List[dict]) -> PandasDataFrameType:
"""Returns the predictions for each stage in the pipeline"""
rules = X_rules.columns.tolist()
stage_level_preds = []
for i, stage in enumerate(config):
decision = list(stage.keys())[0]
stage_rules = list(stage.values())[0]
stage_rules_present = [
rule for rule in stage_rules if rule in rules]
if not stage_rules_present:
continue
y_pred_stage = (
X_rules[stage_rules_present].sum(1) > 0).astype(int)
if decision == 0:
y_pred_stage = -y_pred_stage
y_pred_stage.name = f'Stage={i}, Decision={decision}'
stage_level_preds.append(y_pred_stage)
if not stage_level_preds:
return None
else:
stage_level_preds = pd.concat(stage_level_preds, axis=1)
return stage_level_preds
def _get_pipeline_pred(self, stage_level_preds: PandasDataFrameType,
num_rows: int) -> PandasSeriesType:
"""Returns the predictions of the pipeline"""
if stage_level_preds is None:
return np.ones(num_rows) * self.final_decision
for stage_idx in range(0, len(stage_level_preds.columns)):
if stage_idx == 0:
y_pred = stage_level_preds.iloc[:, stage_idx]
else:
y_pred = (
(y_pred == 0).astype(int) * stage_level_preds.iloc[:, stage_idx]) + y_pred
y_pred = ((y_pred == 0).astype(int) * self.final_decision) + y_pred
y_pred = (y_pred > 0).astype(int)
return y_pred