"""Contains functions that are shared across Iguanas modules."""
import pandas as pd
import numpy as np
import json
from typing import List, Tuple, Union, Iterable
from tqdm import tqdm
import sys
from iguanas.utils.types import KoalasDataFrame, KoalasSeries, PandasDataFrame, \
PandasSeries
from iguanas.utils.typing import KoalasDataFrameType, KoalasSeriesType, \
PandasDataFrameType, PandasSeriesType, PySparkDataFrameType


def concat(objs: List[Union[PandasDataFrameType, PandasSeriesType, KoalasDataFrameType, KoalasSeriesType]],
           **kwargs) -> Union[PandasDataFrameType, KoalasDataFrameType]:
"""
Concatenates a set of Pandas Series/DataFrames or a set of Koalas
Series/DataFrames.
Parameters
----------
objs : List[Union[PandasDataFrameType, PandasSeriesType, KoalasDataFrameType, KoalasSeriesType]]
        List of Pandas/Koalas Series/DataFrames to concatenate.
Raises
------
    TypeError
`objs` must be a list of either Pandas objects or Koalas objects.
Returns
-------
Union[PandasDataFrameType, KoalasDataFrameType]
The concatenated DataFrame.
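
    Examples
    --------
    A minimal usage sketch (illustrative values; with Pandas inputs the call
    delegates to ``pd.concat``):

    >>> import pandas as pd
    >>> s1 = pd.Series([1, 0, 1], name='A')
    >>> s2 = pd.Series([0, 1, 1], name='B')
    >>> X = concat([s1, s2], axis=1)
    >>> list(X.columns)
    ['A', 'B']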
"""
check_allowed_types(objs[0], 'objs', [
PandasSeries, PandasDataFrame, KoalasSeries, KoalasDataFrame
])
if is_type(objs[0], [PandasSeries, PandasDataFrame]):
return pd.concat(objs, **kwargs)
if is_type(objs[0], [KoalasSeries, KoalasDataFrame]):
import databricks.koalas as ks
with ks.option_context("compute.ops_on_diff_frames", True):
return ks.concat(objs, **kwargs)


def generate_empty_data_structures() -> Tuple[PandasDataFrameType, PandasDataFrameType]:
"""
Creates data structures often used in classes in Iguanas.
Returns
-------
Tuple[PandasDataFrameType, PandasDataFrameType]
Contains the `rule_descriptions` and `X_rules` dataframes.
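
    Examples
    --------
    Illustrative sketch of the returned (empty) structures:

    >>> rule_descriptions, X_rules = generate_empty_data_structures()
    >>> list(rule_descriptions.columns)
    ['Precision', 'Recall', 'nConditions', 'PercDataFlagged', 'OptMetric']
    >>> X_rules.empty
    True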
"""
columns = [
'Rule', 'Precision', 'Recall', 'nConditions', 'PercDataFlagged',
'OptMetric'
]
rule_descriptions = pd.DataFrame(columns=columns)
rule_descriptions.set_index('Rule', inplace=True)
X_rules = pd.DataFrame([])
return rule_descriptions, X_rules


def return_columns_types(X: Union[PandasDataFrameType, KoalasDataFrameType]) -> Tuple[List, List, List]:
"""
    Returns the integer, OHE categorical and float columns for a given dataset.
Parameters
----------
    X : Union[PandasDataFrameType, KoalasDataFrameType]
Dataset.
Returns
-------
    Tuple[List, List, List]
        List of integer columns, list of OHE categorical columns, list of
        float columns.
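
    Examples
    --------
    A minimal sketch with a Pandas dataset (column names and values are
    illustrative):

    >>> import pandas as pd
    >>> X = pd.DataFrame({
    ...     'int_col': [1, 2, 3],
    ...     'float_col': [0.1, 0.2, 0.3],
    ...     'ohe_col': [0, 1, 0]
    ... })
    >>> return_columns_types(X)
    (['int_col', 'ohe_col'], ['ohe_col'], ['float_col'])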
"""
    num_cols = X.shape[1]
    # Columns already cast to the nullable Int64 dtype are integer columns
    int64_cols = list(X.dtypes.index[X.dtypes == 'Int64'])
if len(int64_cols) == num_cols:
int_cols = int64_cols
float_cols = []
elif int64_cols:
X_no_int64 = X.drop(int64_cols, axis=1)
else:
X_no_int64 = X
    # Of the remaining columns, flag those containing only whole numbers as
    # integer columns
    if is_type(X, [PandasDataFrame]) and len(int64_cols) < num_cols:
        int_mask = np.sum(X_no_int64.to_numpy() -
                          X_no_int64.to_numpy().round(), axis=0) == 0
elif is_type(X, [KoalasDataFrame]) and len(int64_cols) < num_cols:
int_mask = ((X_no_int64 - X_no_int64.round()).sum() == 0).to_numpy()
if len(int64_cols) < num_cols:
int_cols = int64_cols + list(X_no_int64.columns[int_mask])
float_cols = list(X_no_int64.columns[~int_mask])
    # Binary integer columns spanning 0 to 1 are assumed to be OHE
    # categorical columns
    if int_cols:
poss_ohe_cols_mask = (X[int_cols].nunique() == 2)
poss_ohe_cols = poss_ohe_cols_mask[poss_ohe_cols_mask].index.tolist()
min_zero_mask = (X[poss_ohe_cols].min() == 0)
max_one_mask = (X[poss_ohe_cols].max() == 1)
ohe_mask = min_zero_mask.to_numpy() * max_one_mask.to_numpy()
ohe_cat_cols = [poss_ohe_cols[i] for i, m in enumerate(ohe_mask) if m]
else:
ohe_cat_cols = []
return int_cols, ohe_cat_cols, float_cols


def sort_rule_dfs_by_opt_metric(rule_descriptions: PandasDataFrameType,
                                X_rules: PandasDataFrameType) -> Tuple[PandasDataFrameType, PandasDataFrameType]:
"""
    Sorts and reindexes `rule_descriptions` and `X_rules` by the `OptMetric`
    column (descending).
Parameters
----------
rule_descriptions : PandasDataFrameType
The standard rule_descriptions dataframe.
X_rules : PandasDataFrameType
The binary columns of the rules.
Returns
-------
Tuple[PandasDataFrameType, PandasDataFrameType]
`rule_descriptions`, `X_rules`
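
    Examples
    --------
    Illustrative sketch with two toy rules:

    >>> import pandas as pd
    >>> rule_descriptions = pd.DataFrame(
    ...     {'OptMetric': [0.2, 0.9]}, index=['RuleA', 'RuleB'])
    >>> X_rules = pd.DataFrame({'RuleA': [1, 0], 'RuleB': [0, 1]})
    >>> rule_descriptions, X_rules = sort_rule_dfs_by_opt_metric(
    ...     rule_descriptions, X_rules)
    >>> list(X_rules.columns)
    ['RuleB', 'RuleA']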
"""
rule_descriptions.sort_values(
by=['OptMetric'], ascending=False, inplace=True)
X_rules = X_rules.reindex(rule_descriptions.index.tolist(), axis=1)
return rule_descriptions, X_rules


def combine_rule_dfs(rule_descriptions_1: PandasDataFrameType,
                     X_rules_1: PandasDataFrameType,
                     rule_descriptions_2: PandasDataFrameType,
                     X_rules_2: PandasDataFrameType) -> Tuple[PandasDataFrameType, PandasDataFrameType]:
"""
    Combines the `rule_descriptions` and `X_rules` objects of two rule sets.
Parameters
----------
rule_descriptions_1 : PandasDataFrameType
The first rule_descriptions.
X_rules_1 : PandasDataFrameType
The first X_rules.
rule_descriptions_2 : PandasDataFrameType
The second rule_descriptions.
X_rules_2 : PandasDataFrameType
The second X_rules.
Returns
-------
Tuple[PandasDataFrameType, PandasDataFrameType]
`rule_descriptions`, `X_rules`
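
    Examples
    --------
    Illustrative sketch combining two single-rule sets:

    >>> import pandas as pd
    >>> rd1 = pd.DataFrame({'OptMetric': [0.5]}, index=['RuleA'])
    >>> xr1 = pd.DataFrame({'RuleA': [1, 0]})
    >>> rd2 = pd.DataFrame({'OptMetric': [0.7]}, index=['RuleB'])
    >>> xr2 = pd.DataFrame({'RuleB': [0, 1]})
    >>> rd, xr = combine_rule_dfs(rd1, xr1, rd2, xr2)
    >>> list(xr.columns)
    ['RuleA', 'RuleB']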
"""
rule_descriptions = pd.concat(
[rule_descriptions_1, rule_descriptions_2], axis=0)
X_rules = pd.concat([X_rules_1, X_rules_2], axis=1)
return rule_descriptions, X_rules


def create_spark_df(X: KoalasDataFrameType, y: KoalasSeriesType,
                    sample_weight=None) -> PySparkDataFrameType:
"""
Creates a Spark DataFrame from the features and target given as Koalas
objects.
Parameters
----------
X : KoalasDataFrameType
The feature set.
y : KoalasSeriesType
The target.
sample_weight : KoalasSeriesType, optional
Row-wise weights to apply. Defaults to None.
Returns
-------
PySparkDataFrameType
The Spark DataFrame.
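
    Examples
    --------
    Illustrative sketch (not executed here - requires a running Spark
    session and ``databricks.koalas``):

    >>> import databricks.koalas as ks  # doctest: +SKIP
    >>> X = ks.DataFrame({'A': [1, 0], 'B': [0, 1]})  # doctest: +SKIP
    >>> y = ks.Series([1, 0])  # doctest: +SKIP
    >>> spark_df = create_spark_df(X=X, y=y)  # doctest: +SKIP
    >>> spark_df.columns  # doctest: +SKIP
    ['A', 'B', 'label_']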
"""
    import databricks.koalas as ks
    # Convert a Koalas Series into a single-column DataFrame before joining
    if X.ndim == 1:
        X = ks.DataFrame(X)
if sample_weight is None:
spark_df = X.join(y.rename('label_')).to_spark()
else:
spark_df = X.join(y.rename('label_')).join(
sample_weight.rename('sample_weight_')).to_spark()
return spark_df


def calc_tps_fps_tns_fns(y_true: Union[PandasSeriesType, np.ndarray, KoalasSeriesType],
                         y_preds: Union[PandasSeriesType, PandasDataFrameType, np.ndarray,
                                        KoalasSeriesType, KoalasDataFrameType],
                         sample_weight=None, tps=False, fps=False,
                         tns=False, fns=False, tps_fps=False, tps_fns=False) -> Tuple[
                             Union[np.ndarray, float],
                             Union[np.ndarray, float],
                             Union[np.ndarray, float],
                             Union[np.ndarray, float],
                             Union[np.ndarray, float],
                             Union[np.ndarray, float]]:
"""
Calculates the True Positives, False Positives, True Negatives, False
Negatives, True Positives + False Positives and True Positives + False
Negatives for a set of binary predictors, given a binary target. The
option to calculate the True Positives + False Positives or True Positives
    + False Negatives in one sum is given, as it's faster to calculate these
    combined totals directly than to calculate the individual metrics
    separately and sum them.
Parameters
----------
y_true : Union[PandasSeriesType, np.ndarray, KoalasSeriesType]
The binary target.
y_preds : Union[PandasSeriesType, PandasDataFrameType, np.ndarray, KoalasSeriesType, KoalasDataFrameType]
The binary predictors.
    sample_weight : Union[np.ndarray, PandasSeriesType, KoalasSeriesType], optional
Row-wise weights to apply. Defaults to None.
tps : bool, optional
If True, the True Positives are calculated. Defaults to False.
fps : bool, optional
If True, the False Positives are calculated. Defaults to False.
tns : bool, optional
If True, the True Negatives are calculated. Defaults to False.
fns : bool, optional
If True, the False Negatives are calculated. Defaults to False.
tps_fps : bool, optional
If True, the True Positives + False Positives are calculated. Defaults
to False.
tps_fns : bool, optional
If True, the True Positives + False Negatives are calculated. Defaults
to False.
Returns
-------
Tuple[Union[np.ndarray, float], Union[np.ndarray, float], Union[np.ndarray, float], Union[np.ndarray, float], Union[np.ndarray, float], Union[np.ndarray, float]]
The True Positives, False Positives, True Negatives, False Negatives,
True Positives + False Positives and True Positives + False Negatives.
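
    Examples
    --------
    A minimal sketch with Numpy inputs (illustrative values):

    >>> import numpy as np
    >>> y_true = np.array([1, 0, 1, 0])
    >>> y_preds = np.array([1, 1, 0, 0])
    >>> tps, fps, tns, fns, _, _ = calc_tps_fps_tns_fns(
    ...     y_true=y_true, y_preds=y_preds, tps=True, fps=True, tns=True,
    ...     fns=True)
    >>> int(tps), int(fps), int(tns), int(fns)
    (1, 1, 1, 1)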
"""
def _calc_tps_fps_tns_fns_numpy(y_true: Union[PandasSeriesType, np.ndarray],
y_preds: Union[PandasSeriesType, PandasDataFrameType,
np.ndarray, KoalasSeriesType,
KoalasDataFrameType],
sample_weight: Union[PandasSeriesType,
np.ndarray],
tps: bool, fps: bool, tns: bool,
fns: bool, tps_fps: bool,
tps_fns: bool) -> Tuple[
Union[np.ndarray, float],
Union[np.ndarray, float],
Union[np.ndarray, float],
Union[np.ndarray, float],
Union[np.ndarray, float],
Union[np.ndarray, float]]:
"""
Calculates the True Positives, False Positives, True Negatives, False
Negatives, True Positives + False Positives and True Positives + False
Negatives for a set of binary predictors, given a binary target, using
Numpy.
"""
        # Convert relevant args to numpy arrays
        if is_type(y_true, [PandasSeries]):
            y_true = y_true.values
        if is_type(y_preds, [PandasSeries, PandasDataFrame]):
            y_preds = y_preds.values
        if sample_weight is not None and is_type(sample_weight, [PandasSeries]):
            sample_weight = sample_weight.values
# Reshape y_true and sample_weight (if given) into same shape as y_preds
if y_preds.shape != y_true.shape:
if sample_weight is not None:
sample_weight_arr = np.tile(
sample_weight, (y_preds.shape[1], 1)).T.astype(int)
y_true_arr = np.tile(y_true, (y_preds.shape[1], 1)).T.astype(int)
else:
y_true_arr = y_true
if sample_weight is not None:
sample_weight_arr = sample_weight
# Calculate TPs, FPs, TNs, FNs, TPs+FPs and TPs+FNs
tps_sum, fps_sum, tns_sum, fns_sum, tps_fps_sum, tps_fns_sum = None, None, None, None, None, None
if sample_weight is not None:
if tps:
tps_sum = (y_preds * y_true_arr * sample_weight_arr).sum(0)
if fps:
fps_sum = (y_preds * (1-y_true_arr) * sample_weight_arr).sum(0)
if tns:
tns_sum = ((1-y_preds) * (1-y_true_arr)
* sample_weight_arr).sum(0)
if fns:
fns_sum = ((1-y_preds) * y_true_arr * sample_weight_arr).sum(0)
if tps_fps:
tps_fps_sum = (y_preds * sample_weight_arr).sum(0)
if tps_fns:
tps_fns_sum = np.array((y_true * sample_weight).sum(0))
else:
if tps:
tps_sum = (y_preds * y_true_arr).sum(0)
if fps:
fps_sum = (y_preds * (1-y_true_arr)).sum(0)
if tns:
tns_sum = ((1-y_preds) * (1-y_true_arr)).sum(0)
if fns:
fns_sum = ((1-y_preds) * y_true_arr).sum(0)
if tps_fps:
tps_fps_sum = (y_preds).sum(0)
if tps_fns:
tps_fns_sum = np.array((y_true).sum(0))
return tps_sum, fps_sum, tns_sum, fns_sum, tps_fps_sum, tps_fns_sum
def _calc_tps_fps_tns_fns_spark(spark_df: PySparkDataFrameType,
features: List[str], tps: bool, fps: bool,
tns: bool, fns: bool, tps_fps: bool,
tps_fns: bool) -> Tuple[
Union[np.ndarray, float],
Union[np.ndarray, float],
Union[np.ndarray, float],
Union[np.ndarray, float],
Union[np.ndarray, float],
Union[np.ndarray, float]]:
"""
Calculates the True Positives, False Positives, True Negatives, False
Negatives, True Positives + False Positives and True Positives + False
Negatives for a set of binary predictors, given a binary target, using
Spark.
"""
from pyspark.sql import functions as F
num_feats = len(features)
funcs = []
# Generate spark functions for calculating metrics
if 'sample_weight_' in spark_df.columns:
if tps:
funcs = funcs + [F.sum(F.col(feat) * F.col('label_')
* F.col('sample_weight_')) for feat in features]
if fps:
funcs = funcs + [F.sum(F.col(feat) * (1-F.col('label_'))
* F.col('sample_weight_')) for feat in features]
if tns:
funcs = funcs + [F.sum((1-F.col(feat)) * (1-F.col('label_'))
* F.col('sample_weight_')) for feat in features]
if fns:
funcs = funcs + [F.sum((1-F.col(feat)) * (F.col('label_'))
* F.col('sample_weight_')) for feat in features]
if tps_fps:
funcs = funcs + \
[F.sum((F.col(feat)) * F.col('sample_weight_'))
for feat in features]
if tps_fns:
funcs = funcs + \
[F.sum((F.col('label_')) * F.col('sample_weight_'))]
else:
if tps:
funcs = funcs + [F.sum(F.col(feat) * F.col('label_'))
for feat in features]
if fps:
funcs = funcs + [F.sum(F.col(feat) * (1-F.col('label_')))
for feat in features]
if tns:
funcs = funcs + \
[F.sum((1-F.col(feat)) * (1-F.col('label_')))
for feat in features]
if fns:
funcs = funcs + \
[F.sum((1-F.col(feat)) * (F.col('label_')))
for feat in features]
if tps_fps:
funcs = funcs + [F.sum(F.col(feat)) for feat in features]
if tps_fns:
funcs = funcs + [F.sum(F.col('label_'))]
# Run functions on spark dataframe
all_results = np.array(spark_df.select(funcs).collect()[0])
split_results = []
k = 0
# Extract each result required
for m in [tps, fps, tns, fns, tps_fps, tps_fns]:
if m:
if num_feats == 1:
split_results.append(all_results[k:k+num_feats][0])
else:
split_results.append(all_results[k:k+num_feats])
k += num_feats
else:
split_results.append(None)
return split_results

    if not any([tps, fps, tns, fns, tps_fps, tps_fns]):
        raise ValueError(
            'One of the parameters `tps`, `fps`, `tns`, `fns`, `tps_fps` or '
            '`tps_fns` must be True')
if is_type(y_true, [KoalasSeries, KoalasDataFrame]) and is_type(y_preds, [KoalasSeries, KoalasDataFrame]):
spark_df = create_spark_df(
X=y_preds, y=y_true, sample_weight=sample_weight)
features = [y_preds.name] if y_preds.ndim == 1 else y_preds.columns
return _calc_tps_fps_tns_fns_spark(
spark_df=spark_df, features=features, tps=tps, fps=fps, tns=tns,
fns=fns, tps_fps=tps_fps, tps_fns=tps_fns)
else:
return _calc_tps_fps_tns_fns_numpy(
y_true=y_true, y_preds=y_preds, sample_weight=sample_weight,
tps=tps, fps=fps, tns=tns, fns=fns, tps_fps=tps_fps,
tps_fns=tps_fns)


def return_binary_pred_perf_of_set(y_true: Union[PandasSeriesType, np.ndarray, KoalasSeriesType],
                                   y_preds: Union[PandasDataFrameType, np.ndarray, KoalasDataFrameType],
                                   y_preds_columns: List[str],
                                   sample_weight=None,
                                   opt_func=None) -> PandasDataFrameType:
"""
Calculates the performance of a set of binary predictors given a target
column.
Parameters
----------
y_true : Union[PandasSeriesType, np.ndarray, KoalasSeriesType]
Binary integer target column.
y_preds : Union[PandasDataFrameType, np.ndarray, KoalasDataFrameType]
Set of binary integer predictors. Can also be a single predictor.
y_preds_columns : List[str]
Column names for the y_preds array.
sample_weight : Union[PandasSeriesType, np.ndarray, KoalasSeriesType], optional
Row-wise sample_weights to apply. Defaults to None.
opt_func : Callable, optional
A function/method which calculates a custom metric (e.g. Fbeta score)
for each column. Defaults to None.
Returns
-------
PandasDataFrameType
Dataframe containing the performance metrics for each binary predictor.
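
    Examples
    --------
    A minimal sketch with Numpy inputs (illustrative values):

    >>> import numpy as np
    >>> y_true = np.array([1, 0, 1, 0])
    >>> y_preds = np.array([[1, 1], [1, 0], [0, 1], [0, 0]])
    >>> results = return_binary_pred_perf_of_set(
    ...     y_true=y_true, y_preds=y_preds, y_preds_columns=['A', 'B'])
    >>> list(results.columns)
    ['Precision', 'Recall', 'PercDataFlagged', 'OptMetric']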
"""
tps_sum, _, _, _, tps_fps_sum, tps_fns_sum = calc_tps_fps_tns_fns(
y_true=y_true, y_preds=y_preds, sample_weight=sample_weight, tps=True,
tps_fps=True, tps_fns=True)
tps_fps_sum = np.where(tps_fps_sum == 0, np.nan, tps_fps_sum)
precisions = np.nan_to_num(np.divide(tps_sum, tps_fps_sum))
tps_fns_sum = np.where(tps_fns_sum == 0, np.nan, tps_fns_sum)
recalls = np.nan_to_num(np.divide(tps_sum, tps_fns_sum))
    perc_data_flagged = y_preds.mean(0)
    # Pandas/Koalas means need converting to a numpy array; scalar and numpy
    # outputs (from 1D or numpy inputs) are left as-is
    if hasattr(perc_data_flagged, 'to_numpy'):
        perc_data_flagged = perc_data_flagged.to_numpy()
# Calculate opt_metric
if opt_func is not None:
opt_metric_results = opt_func(
y_true=y_true, y_preds=y_preds, sample_weight=sample_weight)
else:
opt_metric_results = None
results = pd.DataFrame({
'Precision': precisions,
'Recall': recalls,
'PercDataFlagged': perc_data_flagged,
'OptMetric': opt_metric_results,
}, index=y_preds_columns)
return results


def return_rule_descriptions_from_X_rules(X_rules: Union[PandasDataFrameType,
                                                         KoalasDataFrameType],
                                          X_rules_cols: List[str],
                                          y_true=None,
                                          sample_weight=None,
                                          opt_func=None) -> PandasDataFrameType:
"""
Calculates the performance metrics for the standard `rule_descriptions`
dataframe, given a set of rule binary columns.
Parameters
----------
X_rules : Union[PandasDataFrameType, KoalasDataFrameType]
Set of rule binary columns.
X_rules_cols : List[str]
Columns associated with `X_rules`.
y_true : Union[PandasSeriesType, np.ndarray, KoalasSeriesType], optional
Binary integer target column. Defaults to None.
sample_weight : Union[PandasSeriesType, np.ndarray, KoalasSeriesType], optional
Row-wise sample_weights to apply. Defaults to None.
    opt_func : Callable, optional
        A function/method which calculates a custom metric (e.g. Fbeta score)
        for each rule. Must be given if `y_true` is None. Defaults to None.
Returns
-------
PandasDataFrameType
The performance metrics for the standard `rule_descriptions` dataframe.
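
    Examples
    --------
    A minimal sketch with Pandas inputs (illustrative values):

    >>> import pandas as pd
    >>> X_rules = pd.DataFrame({'Rule1': [1, 0, 1, 0]})
    >>> y = pd.Series([1, 0, 0, 0])
    >>> rule_descriptions = return_rule_descriptions_from_X_rules(
    ...     X_rules=X_rules, X_rules_cols=['Rule1'], y_true=y)
    >>> rule_descriptions.index.name
    'Rule'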
"""
if y_true is not None:
rule_descriptions = return_binary_pred_perf_of_set(
y_true=y_true, y_preds=X_rules, y_preds_columns=X_rules_cols,
sample_weight=sample_weight, opt_func=opt_func
)
rule_descriptions.index.name = 'Rule'
else:
opt_metric_results = opt_func(y_preds=X_rules)
perc_data_flagged = X_rules.mean(0).to_numpy()
rule_descriptions = pd.DataFrame(data={
'PercDataFlagged': perc_data_flagged,
'OptMetric': opt_metric_results,
},
index=X_rules_cols)
rule_descriptions.index.name = 'Rule'
return rule_descriptions


def flatten_stringified_json_column(X_column: PandasSeriesType) -> PandasDataFrameType:
"""
Flattens JSONs contained in a column to their own columns.
Parameters
----------
X_column : PandasSeriesType
Contains the JSONs to be flattened.
Returns
-------
PandasDataFrameType
Contains a column per key-value pair in the JSONs.
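
    Examples
    --------
    Illustrative sketch:

    >>> import pandas as pd
    >>> X_column = pd.Series(['{"a": 1, "b": 2}', '{"a": 3, "b": 4}'])
    >>> flatten_stringified_json_column(X_column)
       a  b
    0  1  2
    1  3  4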
"""
X_column.fillna('{}', inplace=True)
    X_flattened = pd.DataFrame(
        list(X_column.apply(json.loads).values))
X_flattened.set_index(X_column.index.values, inplace=True)
return X_flattened


def count_rule_conditions(rule_string: str) -> int:
"""
Counts the number of conditions in a rule string.
Parameters
----------
rule_string : str
The standard Iguanas string representation of the rule.
Returns
-------
int
Number of conditions in the rule.
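
    Examples
    --------
    Illustrative sketch (the condition format shown is assumed):

    >>> count_rule_conditions("(X['A']>0)&(X['B']<=1)")
    2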
"""
n_conditions = rule_string.count("X['")
return n_conditions


def return_progress_ready_range(verbose: bool,
                                range: Iterable) -> Union[tqdm, Iterable]:
"""
Returns a tqdm object for a given iterable, `range`, if `verbose` is True.
The tqdm object prints the progress of iteration.
Parameters
----------
verbose : bool
Dictates whether the tqdm object should be returned.
range : Iterable
The iterable.
Returns
-------
Union[tqdm, Iterable]
Either the tqdm-version of the iterable, or the original iterable.
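
    Examples
    --------
    Illustrative sketch (with `verbose=False` the iterable is returned
    unchanged):

    >>> for i in return_progress_ready_range(verbose=False, range=[1, 2, 3]):
    ...     print(i)
    1
    2
    3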
"""
if verbose:
return tqdm(range, file=sys.stdout)
else:
return range


def return_conf_matrix(y_true: Union[PandasSeriesType, np.ndarray, KoalasSeriesType],
                       y_pred: Union[PandasSeriesType, np.ndarray, KoalasSeriesType],
                       sample_weight=None) -> PandasDataFrameType:
"""
Creates a confusion matrix from a binary target and binary predictor.
Parameters
----------
y_true : Union[PandasSeriesType, np.ndarray, KoalasSeriesType]
Binary target.
y_pred : Union[PandasSeriesType, np.ndarray, KoalasSeriesType]
Binary predictor.
sample_weight : Union[PandasSeriesType, np.ndarray, KoalasSeriesType], optional
Row-wise weights to apply. Defaults to None.
Returns
-------
PandasDataFrameType
        The confusion matrix (the index shows the predicted class; the
        columns show the actual class).
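
    Examples
    --------
    A minimal sketch with Numpy inputs (illustrative values):

    >>> import numpy as np
    >>> y_true = np.array([1, 0, 1, 0])
    >>> y_pred = np.array([1, 1, 0, 0])
    >>> conf_matrix = return_conf_matrix(y_true=y_true, y_pred=y_pred)
    >>> int(conf_matrix.loc[1, 1])  # True Positives
    1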
"""
tps, fps, tns, fns, _, _ = calc_tps_fps_tns_fns(
y_true=y_true, y_preds=y_pred, sample_weight=sample_weight, tps=True,
fps=True, tns=True, fns=True
)
conf_matrix = pd.DataFrame([
[tps, fps],
[fns, tns]
],
columns=[1, 0], index=[1, 0]
)
return conf_matrix


def check_allowed_types(x: object, x_name: str,
                        allowed_types: List[str]) -> None:
"""
Checks whether the stringified type of `x` is in `allowed_types` - a list
of stringified types. If not, it raises a TypeError.
Parameters
----------
x : object
The object to check the type of.
x_name : str
        The object's name (used when raising the error).
allowed_types : List[str]
The list of allowed types (in string format).
Raises
------
TypeError
If str(type(`x`)) is not in `allowed_types`.
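
    Examples
    --------
    Illustrative sketch using a type string from ``iguanas.utils.types``:

    >>> import pandas as pd
    >>> from iguanas.utils.types import PandasSeries
    >>> check_allowed_types(pd.Series([1]), 'x', [PandasSeries])  # passes
    >>> check_allowed_types([1], 'x', [PandasSeries])
    Traceback (most recent call last):
     ...
    TypeError: `x` must be a pandas.core.series.Series. Current type is list.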
"""
x_type = str(type(x))
if x_type not in allowed_types:
allowed_types_str = ' or '.join(
[allowed_type.split("'")[1] for allowed_type in allowed_types])
x_type_str = x_type.split("'")[1]
raise TypeError(
f'`{x_name}` must be a {allowed_types_str}. Current type is {x_type_str}.')


def is_type(x: object, types: List[str]) -> bool:
"""
Returns whether the stringified type of `x` is in `types` - a list of
stringified types.
Parameters
----------
x : object
The object to check the type of.
types : List[str]
The list of allowed types (in string format) to check against.
Returns
-------
bool
        True if str(type(`x`)) is in `types`, False otherwise.
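
    Examples
    --------
    Illustrative sketch using a type string from ``iguanas.utils.types``:

    >>> import pandas as pd
    >>> from iguanas.utils.types import PandasSeries
    >>> is_type(pd.Series([1]), [PandasSeries])
    True
    >>> is_type([1], [PandasSeries])
    False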
"""
x_type = str(type(x))
return x_type in types