Source code for tubular.comparison
"""module for comparing and conditionally updating provided columns."""
from __future__ import annotations
from typing import TYPE_CHECKING, Any
import narwhals as nw
if TYPE_CHECKING:
import pandas as pd
from beartype import beartype
from typing_extensions import deprecated
from tubular._utils import (
_convert_dataframe_to_narwhals,
_return_narwhals_or_native_dataframe,
block_from_json,
)
from tubular.base import BaseTransformer, register
from tubular.functions.comparison import (
ConditionEnumStr,
apply_when_then_otherwise,
compare_two_columns,
)
from tubular.mixins import DropOriginalMixin
from tubular.types import (
DataFrame,
ListOfStrs,
ListOfTwoStrs,
NumericTypes,
)
[docs]
@register
class WhenThenOtherwiseTransformer(BaseTransformer):
"""Transformer to apply conditional logic across multiple columns.
This transformer evaluates specified columns against a condition and updates
with given values based on the results.
Attributes
----------
polars_compatible : bool
Indicates whether transformer has been converted to polars/pandas agnostic
narwhals framework.
FITS : bool
Indicates whether transform requires fit to be run first.
jsonable : bool
Indicates if transformer supports to/from_json methods.
lazyframe_compatible : bool
Indicates whether transformer works with lazyframes.
Examples
--------
```pycon
>>> import polars as pl
>>> df = pl.DataFrame(
... {
... "a": [1, 2, 3],
... "b": [4, 5, 6],
... "condition_col": [True, False, True],
... "update_col": [10, 20, 30],
... }
... )
>>> transformer = WhenThenOtherwiseTransformer(
... columns=["a", "b"], when_column="condition_col", then_column="update_col"
... )
>>> transformed_df = transformer.transform(df)
>>> print(transformed_df)
shape: (3, 4)
┌─────┬─────┬───────────────┬────────────┐
│ a ┆ b ┆ condition_col ┆ update_col │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ bool ┆ i64 │
╞═════╪═════╪═══════════════╪════════════╡
│ 10 ┆ 10 ┆ true ┆ 10 │
│ 2 ┆ 5 ┆ false ┆ 20 │
│ 30 ┆ 30 ┆ true ┆ 30 │
└─────┴─────┴───────────────┴────────────┘
```
"""
polars_compatible = True
FITS = False
jsonable = True
lazyframe_compatible = True
@beartype
def __init__(
self,
columns: ListOfStrs,
when_column: str,
then_column: str,
**kwargs: bool | None,
) -> None:
"""Initialize the WhenThenOtherwiseTransformer.
Parameters
----------
columns : ListOfStrs
List of columns to be transformed.
when_column : bool
Boolean column used to evaluate conditions.
then_column : ListOfOneStr
Column containing values to update the specified columns
based on the condition.
**kwargs : dict[str, bool]
Additional keyword arguments passed to the BaseTransformer.
"""
super().__init__(columns=columns, **kwargs)
self.when_column = when_column
self.then_column = then_column
self.is_fitted_ = True # Set is_fitted to True as no fitting is required
[docs]
@block_from_json
def to_json(self) -> dict[str, dict[str, Any]]:
"""Serialize the transformer to a JSON-compatible dictionary.
Returns
-------
dict[str, dict[str, Any]]:
JSON representation of the transformer, including init parameters.
Examples
--------
```pycon
>>> from pprint import pprint
>>> transformer = WhenThenOtherwiseTransformer(
... columns=["a", "b"],
... when_column="condition_col",
... then_column="update_col", # noqa: E501
... )
>>> pprint(transformer.to_json(), sort_dicts=True)
{'classname': 'WhenThenOtherwiseTransformer',
'fit': {'is_fitted_': True},
'init': {'columns': ['a', 'b'],
'copy': False,
'return_native': True,
'then_column': 'update_col',
'verbose': False,
'when_column': 'condition_col'},
'tubular_version': ...}
```
"""
json_dict = super().to_json()
json_dict["init"].update(
{
"when_column": self.when_column,
"then_column": self.then_column,
},
)
return json_dict
[docs]
def get_transform_exprs(self) -> list[nw.Expr]:
"""Get transform expressions.
Returns
-------
list[nw.Expr]: transform expressions for class
"""
return apply_when_then_otherwise(
columns=self.columns,
when_column=self.when_column,
then_column=self.then_column,
)
[docs]
@beartype
def transform(
self,
X: DataFrame,
) -> DataFrame:
"""Apply conditional logic to transform specified columns.
Parameters
----------
X : DataFrame
DataFrame containing the columns to be transformed.
Returns
-------
DataFrame
Transformed DataFrame with updated columns based on conditions.
Raises
------
TypeError
If the `when_column` is not of type Boolean or if columns
have mismatched types.
Examples
--------
```pycon
>>> import polars as pl
>>> df = pl.DataFrame(
... {
... "a": [1, 2, 3],
... "b": [4, 5, 6],
... "condition_col": [True, False, True],
... "update_col": [10, 20, 30],
... }
... )
>>> transformer = WhenThenOtherwiseTransformer(
... columns=["a", "b"],
... when_column="condition_col",
... then_column="update_col",
... )
>>> transformed_df = transformer.transform(df)
>>> print(transformed_df)
shape: (3, 4)
┌─────┬─────┬───────────────┬────────────┐
│ a ┆ b ┆ condition_col ┆ update_col │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ bool ┆ i64 │
╞═════╪═════╪═══════════════╪════════════╡
│ 10 ┆ 10 ┆ true ┆ 10 │
│ 2 ┆ 5 ┆ false ┆ 20 │
│ 30 ┆ 30 ┆ true ┆ 30 │
└─────┴─────┴───────────────┴────────────┘
```
"""
X = _convert_dataframe_to_narwhals(X)
X = super().transform(X, return_native_override=False)
schema = X.collect_schema()
if schema[self.when_column] != nw.Boolean:
message = f"The column '{self.when_column}' must be of type Boolean."
raise TypeError(message)
then_column_type = schema[self.then_column]
if any(schema[col] != then_column_type for col in self.columns):
message = (
f"All columns in {self.columns} must be of the same type as "
f"'{self.then_column}'."
)
raise TypeError(message)
transform_exprs = self.get_transform_exprs()
X = X.with_columns(*transform_exprs) if transform_exprs else X
return _return_narwhals_or_native_dataframe(X, self.return_native)
[docs]
@register
class CompareTwoColumnsTransformer(BaseTransformer):
"""Transformer to compare two columns and generate outcomes based on conditions.
This transformer evaluates a condition between two columns and generates an
outcome based on the result.
Attributes
----------
polars_compatible : bool
Indicates whether transformer has been converted to polars/pandas
agnostic narwhals framework.
FITS : bool
Indicates whether transform requires fit to be run first.
jsonable : bool
Indicates if transformer supports to/from_json methods.
lazyframe_compatible : bool
Indicates whether transformer works with lazyframes.
Examples
--------
```pycon
>>> import polars as pl
>>> df = pl.DataFrame({"a": [1, 2, 3], "b": [3, 2, 1]})
>>> transformer = CompareTwoColumnsTransformer(
... columns=["a", "b"],
... condition=">",
... )
>>> transformed_df = transformer.transform(df)
>>> print(transformed_df)
shape: (3, 3)
┌─────┬─────┬───────┐
│ a ┆ b ┆ a>b │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ bool │
╞═════╪═════╪═══════╡
│ 1 ┆ 3 ┆ false │
│ 2 ┆ 2 ┆ false │
│ 3 ┆ 1 ┆ true │
└─────┴─────┴───────┘
```
"""
polars_compatible = True
FITS = False
jsonable = True
lazyframe_compatible = True
@beartype
def __init__(
self,
columns: ListOfTwoStrs,
condition: ConditionEnumStr,
**kwargs: bool | None,
) -> None:
"""Initialize the CompareTwoColumnsTransformer.
Parameters
----------
columns : ListOfTwoStrs
Tuple or list containing the names of the two columns to be compared.
condition : str
Logical condition to evaluate the relationship between the two columns.
**kwargs : dict[str, bool]
Additional keyword arguments passed to the BaseTransformer.
"""
super().__init__(columns=columns, **kwargs)
self.condition = condition
self.is_fitted_ = True # Set is_fitted to True as no fitting is required
[docs]
def to_json(self) -> dict[str, dict[str, Any]]:
"""Serialize the transformer to a JSON-compatible dictionary.
Returns
-------
dict[str, dict[str, Any]]:
JSON representation of the transformer, including init parameters.
Examples
--------
```pycon
>>> from tubular.functions.comparison import ConditionEnum
>>> transformer = CompareTwoColumnsTransformer(
... columns=["a", "b"],
... condition=ConditionEnum.GREATER_THAN.value,
... )
>>> json_dict = transformer.to_json()
>>> from pprint import pprint
>>> pprint(json_dict, sort_dicts=True)
{'classname': 'CompareTwoColumnsTransformer',
'fit': {'is_fitted_': True},
'init': {'columns': ['a', 'b'],
'condition': '>',
'copy': False,
'return_native': True,
'verbose': False},
'tubular_version': ...}
```
"""
json_dict = super().to_json()
json_dict["init"]["condition"] = self.condition
return json_dict
[docs]
def get_transform_exprs(self) -> list[nw.Expr]:
"""Get transform expressions.
Returns
-------
list[nw.Expr]: transform expressions for class
"""
return compare_two_columns(
columns=self.columns,
condition=self.condition,
)
[docs]
@beartype
def transform(self, X: DataFrame) -> DataFrame:
"""Transform two columns based on a condition to generate an outcome.
Parameters
----------
X : DataFrame
DataFrame containing the columns to be transformed.
Returns
-------
DataFrame
Transformed DataFrame with the new outcome column.
Raises
------
TypeError
If the columns are not of a numeric type.
Examples
--------
```pycon
>>> import polars as pl
>>> df = pl.DataFrame({"a": [1, 2, 3], "b": [3, 2, 1]})
>>> transformer = CompareTwoColumnsTransformer(
... columns=["a", "b"],
... condition=">",
... )
>>> transformed_df = transformer.transform(df)
>>> print(transformed_df)
shape: (3, 3)
┌─────┬─────┬───────┐
│ a ┆ b ┆ a>b │
│ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ bool │
╞═════╪═════╪═══════╡
│ 1 ┆ 3 ┆ false │
│ 2 ┆ 2 ┆ false │
│ 3 ┆ 1 ┆ true │
└─────┴─────┴───────┘
```
"""
X = _convert_dataframe_to_narwhals(X)
X = super().transform(X, return_native_override=False)
schema = X.collect_schema()
bad_cols = [col for col in self.columns if schema[col] not in NumericTypes]
if bad_cols:
message = (
"Columns must be of a numeric type, but the following are not: "
f"{bad_cols}"
)
raise TypeError(message)
transform_expr = self.get_transform_exprs()
X = X.with_columns(transform_expr)
return _return_narwhals_or_native_dataframe(X, self.return_native)
# DEPRECATED TRANSFORMERS
[docs]
@deprecated(
"""This transformer has been superseded by CompareTwoColumnsTransformer
and so has been deprecated, and will be removed in a future major release.
""",
)
class EqualityChecker(
DropOriginalMixin,
BaseTransformer,
):
"""Transformer to check if two columns are equal.
Attributes
----------
built_from_json: bool
indicates if transformer was reconstructed from json,
which limits it's supported functionality to .transform
polars_compatible : bool
class attribute, indicates whether transformer has been converted to
polars/pandas agnostic narwhals framework
jsonable: bool
class attribute, indicates if transformer supports to/from_json methods
FITS: bool
class attribute, indicates whether transform requires fit to be run first
lazyframe_compatible: bool
class attribute, indicates whether transformer works with lazyframes
deprecated: bool
indicates if class has been deprecated
"""
polars_compatible = False
lazyframe_compatible = False
FITS = False
jsonable = False
deprecated = True
@beartype
def __init__(
self,
columns: ListOfTwoStrs,
new_column_name: str,
drop_original: bool = False,
**kwargs: bool | None,
) -> None:
"""Initialise class instance.
Parameters
----------
columns: list
List containing names of the two columns to check.
new_column_name: string
string containing the name of the new column.
drop_original: boolean = False
boolean representing dropping the input columns from X after checks.
**kwargs:
Arbitrary keyword arguments passed onto BaseTransformer.init method.
"""
super().__init__(columns=columns, **kwargs)
self.drop_original = drop_original
self.new_column_name = new_column_name
[docs]
def get_feature_names_out(self) -> list[str]:
"""Get list of features modified/created by the transformer.
Returns
-------
list[str]:
list of features modified/created by the transformer
Examples
--------
```pycon
>>> # base classes just return inputs
>>> transformer = EqualityChecker(
... columns=["a", "b"],
... new_column_name="bla",
... )
>>> transformer.get_feature_names_out()
['bla']
```
"""
return [self.new_column_name]
[docs]
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
"""Create a column which indicated equality between given columns.
Parameters
----------
X : pd.DataFrame
Data to apply mappings to.
Returns
-------
X : pd.DataFrame
Transformed input X with additional boolean column.
"""
X = super().transform(X)
X[self.new_column_name] = X[self.columns[0]] == X[self.columns[1]]
# Drop original columns if self.drop_original is True
return DropOriginalMixin.drop_original_column(
X,
self.drop_original,
self.columns,
)