Source code for tubular.capping
"""Contains transformers that apply capping to numeric columns."""
from __future__ import annotations
import copy
import warnings
from typing import Annotated
import narwhals as nw
import numpy as np
from beartype import beartype
from beartype.vale import Is
from tubular._stats import _weighted_quantile_expr
from tubular._utils import (
_collect_frame,
_convert_dataframe_to_narwhals,
_is_null,
_return_narwhals_or_native_dataframe,
)
from tubular.base import block_from_json, register
from tubular.mixins import WeightColumnMixin
from tubular.numeric import BaseNumericTransformer
from tubular.types import DataFrame, LazyFrame, Number, Series
# A capping specification for one column: a two-item list [lower, upper] where
# each item is an int, a float or None. The length/content check is enforced at
# runtime by beartype through the ``Is`` validator.
CappingValues = Annotated[
    list[Number | None],
    Is[
        lambda list_arg: (
            len(list_arg) == 2  # noqa: PLR2004
            and all(
                value is None or isinstance(value, (int, float))
                for value in list_arg
            )
        )
    ],
]
[docs]
@register
class BaseCappingTransformer(BaseNumericTransformer, WeightColumnMixin):
    """Base class for capping transformers, contains functionality shared across capping transformer classes.

    Attributes
    ----------
    capping_values : dict[str, CappingValues] or None
        Capping values to apply to each column, capping_values argument.

    quantiles : dict[str, CappingValues] or None
        Quantiles used to learn capping values from the input data, quantiles argument.

    quantile_capping_values : dict[str, CappingValues]
        Capping values learned from quantiles (if provided) to apply to each column.
        Assigned when fit is run (an empty dict if quantiles is None).

    weights_column : str or None
        weights_column argument.

    _replacement_values : dict[str, CappingValues]
        Replacement values when capping is applied. A copy of capping_values when those
        are supplied at init, otherwise a copy of quantile_capping_values assigned by fit.

    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its supported
        functionality to .transform

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    FITS: bool
        class attribute, indicates whether transform requires fit to be run first

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    """

    polars_compatible = True

    lazyframe_compatible = True

    FITS = True

    jsonable = True

    @beartype
    def __init__(
        self,
        capping_values: dict[str, CappingValues] | None = None,
        quantiles: dict[str, CappingValues] | None = None,
        weights_column: str | None = None,
        **kwargs: bool,
    ) -> None:
        """Initialise class instance.

        Parameters
        ----------
        capping_values : dict[str, CappingValues] or None, default = None
            Dictionary of capping values to apply to each column. The keys in the dict should be the
            column names and each item in the dict should be a list of length 2. Items in the lists
            should be ints or floats or None. The first item in the list is the minimum capping value
            and the second item in the list is the maximum capping value. If None is supplied for
            either value then that capping will not take place for that particular column. Both items
            in the lists cannot be None. Either one of capping_values or quantiles must be supplied.

        quantiles : dict[str, CappingValues] or None, default = None
            Dictionary of quantiles in the range [0, 1] to set capping values at for each column.
            The keys in the dict should be the column names and each item in the dict should be a
            list of length 2. Items in the lists should be ints or floats or None. The first item in the
            list is the lower quantile and the second item is the upper quantile to set the capping
            value from. The fit method calculates the values quantile from the input data X. If None is
            supplied for either value then that capping will not take place for that particular column.
            Both items in the lists cannot be None. Either one of capping_values or quantiles must be
            supplied.

        weights_column : str or None, default = None
            Optional weights column argument that can be used in combination with quantiles. Not used
            if capping_values is supplied. Allows weighted quantiles to be calculated.

        **kwargs
            Arbitrary keyword arguments passed onto BaseTransformer.init method.

        Raises
        ------
            ValueError: if capping values/quantiles passed are invalid, if both or
            neither of capping_values and quantiles are supplied

        Examples
        --------
        ```pycon
        >>> BaseCappingTransformer(
        ...     capping_values={"a": [10, 20], "b": [1, 3]},
        ... )
        BaseCappingTransformer(capping_values={'a': [10, 20], 'b': [1, 3]})

        ```

        """
        # exactly one of capping_values / quantiles has to be provided
        if capping_values is None and quantiles is None:
            msg = f"{self.classname()}: both capping_values and quantiles are None, either supply capping values in the capping_values argument or supply quantiles that can be learnt in the fit method"
            raise ValueError(msg)

        if capping_values is not None and quantiles is not None:
            msg = f"{self.classname()}: both capping_values and quantiles are not None, supply one or the other"
            raise ValueError(msg)

        if capping_values is not None:
            self.check_capping_values_dict(capping_values, "capping_values")

            # columns to transform are the keys of whichever dict was supplied
            super().__init__(columns=list(capping_values.keys()), **kwargs)

        if quantiles is not None:
            self.check_capping_values_dict(quantiles, "quantiles")

            # quantiles additionally have to lie within [0, 1]
            for k, quantile_values in quantiles.items():
                for quantile_value in quantile_values:
                    if (quantile_value is not None) and (
                        quantile_value < 0 or quantile_value > 1
                    ):
                        msg = f"{self.classname()}: quantile values must be in the range [0, 1] but got {quantile_value} for key {k}"
                        raise ValueError(msg)

            super().__init__(columns=list(quantiles.keys()), **kwargs)

        self.quantiles = quantiles
        self.capping_values = capping_values
        self.weights_column = weights_column

        # with fixed capping values the replacements are known up front,
        # for quantiles they are only assigned in fit
        if capping_values is not None:
            self._replacement_values = copy.deepcopy(self.capping_values)

    @beartype
    def check_capping_values_dict(
        self,
        capping_values_dict: dict[str, CappingValues],
        dict_name: str,
    ) -> None:
        """Check passed dictionary.

        Parameters
        ----------
        capping_values_dict: dict[str, CappingValues]
            dict of form {column_name: [lower_cap, upper_cap]}

        dict_name: str
            'capping_values' or 'quantiles', only used in error messages

        Raises
        ------
            ValueError: if any value is NaN or infinite, if lower_cap>=upper_cap,
            or if both values for a key are None

        Examples
        --------
        ```pycon
        >>> transformer = BaseCappingTransformer(
        ...     capping_values={"a": [10, 20], "b": [1, 3]},
        ... )
        >>> transformer.check_capping_values_dict(transformer.capping_values, "capping_values")

        ```

        """
        for k, cap_values in capping_values_dict.items():
            for cap_value in cap_values:
                if (cap_value is not None) and (
                    np.isnan(cap_value) or np.isinf(cap_value)
                ):
                    msg = f"{self.classname()}: item in {dict_name} lists contains numpy NaN or Inf values"
                    raise ValueError(msg)

            # ordering can only be checked when both bounds are present
            if all(cap_value is not None for cap_value in cap_values) and (
                cap_values[0] >= cap_values[1]
            ):
                msg = f"{self.classname()}: lower value is greater than or equal to upper value for key {k}"
                raise ValueError(msg)

            if all(cap_value is None for cap_value in cap_values):
                msg = f"{self.classname()}: both values are None for key {k}"
                raise ValueError(msg)

    def _check_for_failed_fit(self) -> None:
        """Check if fit failed to find needed attrs.

        This is detected by checking self.quantile_capping_values are None where values were expected.

        Raises
        ------
            ValueError: if quantile_capping_values have fit as None where values were expected

        """
        failed_columns = []
        for col in self.quantiles:
            for i, init_value in enumerate(self.quantiles[col]):
                fit_value = self.quantile_capping_values[col][i]
                # a quantile was requested but nothing was learnt for it
                if not _is_null(init_value) and _is_null(fit_value):
                    failed_columns.append(col)
                    # one failure is enough to flag the column
                    break

        if failed_columns:
            msg = f"fit has failed for columns {failed_columns}, it is possible that all rows are invalid - check for null/negative weights, all null columns, or other invalid conditions listed in the docstring"
            raise ValueError(msg)

    @block_from_json
    @beartype
    def fit(
        self, X: DataFrame, y: Series | LazyFrame | None = None
    ) -> BaseCappingTransformer:
        """Learn capping values from input data X.

        Calculates the quantiles to cap at given the quantiles dictionary supplied
        when initialising the transformer. Saves learnt values in the
        quantile_capping_values and _replacement_values attributes. If quantiles
        were not supplied a warning is raised and nothing is learnt.

        Parameters
        ----------
        X : DataFrame
            A dataframe with required columns to be capped.

        y : Series or LazyFrame or None. Defaults to None
            Required for pipeline.

        Returns
        -------
            BaseCappingTransformer: fitted instance of class

        Raises
        ------
            ValueError: if a requested quantile could not be learnt for some column

        Examples
        --------
        ```pycon
        >>> import polars as pl
        >>> transformer = BaseCappingTransformer(
        ...     quantiles={"a": [0.01, 0.99], "b": [0.05, 0.95]},
        ... )
        >>> test_df = pl.DataFrame({"a": [1, 15, 18, 25], "b": [6, 2, 7, 1], "c": [1, 2, 3, 4]})
        >>> test_target = pl.Series(name="target", values=[5, 6, 7, 8])
        >>> transformer.fit(test_df, test_target)
        BaseCappingTransformer(quantiles={'a': [0.01, 0.99], 'b': [0.05, 0.95]})

        ```

        """
        super().fit(X, y)

        X = _convert_dataframe_to_narwhals(X)

        # fall back to a unit weights column when the user supplied none,
        # so the weighted quantile code path can be used in both cases
        weights_column = self.weights_column
        if self.weights_column is None:
            X, weights_column = WeightColumnMixin._create_unit_weights_column(
                X,
                return_native=False,
                verbose=self.verbose,
            )

        WeightColumnMixin.check_weights_column(self, X, weights_column)

        # drop rows with invalid weights before computing quantiles
        valid_weights_filter_expr = WeightColumnMixin.get_valid_weights_filter_expr(
            weights_column, self.verbose
        )
        X = X.filter(valid_weights_filter_expr)

        self.quantile_capping_values = {}

        if self.quantiles is not None:
            for col in self.columns:
                lower_quantile = self.quantiles[col][0]
                upper_quantile = self.quantiles[col][1]

                # only the quantiles actually requested are computed
                quantiles = [
                    quantile for quantile in self.quantiles[col] if quantile is not None
                ]

                results = self.weighted_quantile(
                    X,
                    quantiles,
                    values_column=col,
                    weights_column=weights_column,
                )

                # pad the result back to [lower, upper] form
                if lower_quantile is None:
                    results = [None, results[0]]
                elif upper_quantile is None:
                    results = [results[0], None]

                self.quantile_capping_values[col] = results

            self._replacement_values = copy.deepcopy(self.quantile_capping_values)

            self._check_for_failed_fit()

        else:
            warnings.warn(
                f"{self.classname()}: quantiles not set so no fitting done in CappingTransformer",
                stacklevel=2,
            )

        return self

    @block_from_json
    @beartype
    def weighted_quantile(  # noqa: PLR6301, self is implicitly used by block_from_json
        self,
        X: DataFrame,
        quantiles: list[Number],
        values_column: str,
        weights_column: str,
    ) -> list[Number | None]:
        """Calculate weighted quantiles.

        This method is adapted from the "Completely vectorized numpy solution" answer from user
        Alleo (https://stackoverflow.com/users/498892/alleo) to the following stackoverflow question;
        https://stackoverflow.com/questions/21844024/weighted-percentile-using-numpy. This
        method is also licenced under the CC-BY-SA terms, as the original code sample posted
        to stackoverflow (pre February 1, 2016) was.

        Method is similar to numpy.percentile, but supports weights. Supplied quantiles should be
        in the range [0, 1]. Method calculates cumulative % of weight for each observation,
        then interpolates between these observations to calculate the desired quantiles. Null values
        in the observations (values) and 0 weight observations are filtered out before
        calculating.

        Parameters
        ----------
        X : DataFrame
            Dataframe with relevant columns to calculate quantiles from.

        quantiles : list[Number]
            Weighted quantiles to calculate. Must all be between 0 and 1.

        values_column: str
            name of relevant values column in data

        weights_column: str
            name of relevant weight column in data

        Returns
        -------
        interp_quantiles : list[Number | None]
            List containing computed quantiles, as floats. If no rows remain after
            filtering, a list of None with one entry per requested quantile.

        Examples
        --------
        ```pycon
        >>> import polars as pl
        >>> x = CappingTransformer(capping_values={"a": [2, 10]})
        >>> df = pl.DataFrame({"a": [1, 2, 3], "weight": [1, 1, 1]})
        >>> quantiles_to_compute = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
        >>> computed_quantiles = x.weighted_quantile(
        ...     X=df, values_column="a", weights_column="weight", quantiles=quantiles_to_compute
        ... )
        >>> [round(q, 1) for q in computed_quantiles]
        [1.0, 1.0, 1.0, 1.0, 1.2, 1.5, 1.8, 2.1, 2.4, 2.7, 3.0]

        >>> df = pl.DataFrame({"a": [1, 2, 3], "weight": [0, 1, 0]})
        >>> computed_quantiles = x.weighted_quantile(
        ...     X=df, values_column="a", weights_column="weight", quantiles=quantiles_to_compute
        ... )
        >>> [round(q, 1) for q in computed_quantiles]
        [2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0]

        >>> df = pl.DataFrame({"a": [1, 2, 3], "weight": [1, 1, 0]})
        >>> computed_quantiles = x.weighted_quantile(
        ...     X=df, values_column="a", weights_column="weight", quantiles=quantiles_to_compute
        ... )
        >>> [round(q, 1) for q in computed_quantiles]
        [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.2, 1.4, 1.6, 1.8, 2.0]

        >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5], "weight": [1, 1, 1, 1, 1]})
        >>> computed_quantiles = x.weighted_quantile(
        ...     X=df, values_column="a", weights_column="weight", quantiles=quantiles_to_compute
        ... )
        >>> [round(q, 1) for q in computed_quantiles]
        [1.0, 1.0, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]

        >>> df = pl.DataFrame({"a": [1, 2, 3, 4, 5], "weight": [1, 0, 1, 0, 1]})
        >>> computed_quantiles = x.weighted_quantile(
        ...     X=df, values_column="a", weights_column="weight", quantiles=[0, 0.5, 1.0]
        ... )
        >>> [round(q, 1) for q in computed_quantiles]
        [1.0, 2.0, 5.0]

        ```

        """
        X = _convert_dataframe_to_narwhals(X)

        quantiles = np.array(quantiles)

        # rows with null values or zero weight do not contribute
        not_null_expr = ~(nw.col(values_column).is_null())
        nonzero_weight_expr = ~(nw.col(weights_column) == 0)
        combined_filter = not_null_expr & nonzero_weight_expr

        X_temp = X.sort(by=values_column, descending=False).filter(combined_filter)

        values_expr = nw.col(values_column)
        weighted_quantiles_expr = _weighted_quantile_expr(
            weights_column=weights_column, values_column=values_column
        )

        results_dict = _collect_frame(
            X_temp.select(weighted_quantiles_expr, values_expr)
        ).to_dict()

        # TODO - once narwhals implements interpolate, replace this with nw
        # syntax
        # the weighted quantile positions are read back under the weights column name
        weighted_quantiles = results_dict[weights_column].to_numpy()
        values = results_dict[values_column].to_numpy()

        interp_quantiles = (
            np.interp(quantiles, weighted_quantiles, values)
            # if no rows have passed filters, return None
            if len(values) > 0
            else [None] * len(quantiles)
        )

        # convert numpy scalars to plain python floats, leave None untouched
        return [
            float(value) if value is not None else value for value in interp_quantiles
        ]

    @beartype
    def transform(
        self,
        X: DataFrame,
        return_native_override: bool | None = None,
    ) -> DataFrame:
        """Apply capping to columns in X.

        If cap_value_max is set, any values above cap_value_max will be set to the corresponding
        replacement value. If cap_value_min is set any values below cap_value_min will be set to the
        corresponding replacement value. Only works for numeric columns.

        Parameters
        ----------
        X : DataFrame
            Data to apply capping to.

        return_native_override: Optional[bool]
            Option to override return_native attr in transformer, useful when calling parent
            methods

        Returns
        -------
        X : DataFrame
            Transformed input X with min and max capping applied to the specified columns.

        Examples
        --------
        ```pycon
        >>> import polars as pl
        >>> transformer = BaseCappingTransformer(
        ...     capping_values={"a": [10, 20], "b": [1, 3]},
        ... )
        >>> test_df = pl.DataFrame({"a": [1, 15, 18, 25], "b": [6, 2, 7, 1], "c": [1, 2, 3, 4]})
        >>> transformer.transform(test_df)
        shape: (4, 3)
        ┌─────┬─────┬─────┐
        │ a   ┆ b   ┆ c   │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╡
        │ 10  ┆ 3   ┆ 1   │
        │ 15  ┆ 2   ┆ 2   │
        │ 18  ┆ 3   ┆ 3   │
        │ 20  ┆ 1   ┆ 4   │
        └─────┴─────┴─────┘

        ```

        """
        self.check_is_fitted(["_replacement_values"])

        X = _convert_dataframe_to_narwhals(X)

        # remember the incoming dtypes so they can be restored after capping
        schema = X.collect_schema()

        return_native = self._process_return_native(return_native_override)

        X = super().transform(X, return_native_override=False)

        # thresholds come from fit when quantiles were supplied, otherwise from init
        if self.quantiles:
            self.check_is_fitted(["quantile_capping_values"])
            capping_values_for_transform = self.quantile_capping_values
        else:
            capping_values_for_transform = self.capping_values

        exprs = {}
        for col in self.columns:
            cap_value_min, cap_value_max = (
                capping_values_for_transform[col][0],
                capping_values_for_transform[col][1],
            )
            replacement_min = self._replacement_values[col][0]
            replacement_max = self._replacement_values[col][1]

            # build the expression inside-out: start from the untouched column,
            # wrap with the upper cap (if any), then with the lower cap (if any),
            # so the lower cap is evaluated first when both are present
            col_expr = nw.col(col)

            if cap_value_max is not None:
                col_expr = (
                    nw.when(nw.col(col) > cap_value_max)
                    .then(replacement_max)
                    .otherwise(col_expr)
                )

            if cap_value_min is not None:
                col_expr = (
                    nw.when(nw.col(col) < cap_value_min)
                    .then(replacement_min)
                    .otherwise(col_expr)
                )

            # make sure type is preserved for single row,
            # e.g. mapping single row to int could convert
            # from float to int
            # TODO - look into better ways to achieve this
            exprs[col] = col_expr.cast(
                schema[col],
            ).alias(col)

        X = X.with_columns(**exprs) if exprs else X

        return _return_narwhals_or_native_dataframe(X, return_native)

    def to_json(self) -> dict:
        """Return a JSON-serializable representation of the transformer.

        Returns
        -------
        dict
            Dictionary containing all necessary attributes to recreate the transformer with
            `from_json`. Keys include 'init' (initialization parameters) and 'fit' (fitted values).

        """
        data = super().to_json()

        # columns are derived from the capping_values/quantiles keys in __init__,
        # so they are not passed as a separate init argument
        data["init"].pop("columns", None)

        data["init"].update(
            {
                "capping_values": self.capping_values,
                "quantiles": self.quantiles,
                "weights_column": self.weights_column,
            },
        )

        # transformer only fits for quantiles setting
        if self.quantiles is not None:
            self.check_is_fitted(["quantile_capping_values", "_replacement_values"])
            data["fit"].update(
                {
                    "quantile_capping_values": self.quantile_capping_values,
                    "_replacement_values": self._replacement_values,
                },
            )

        return data
[docs]
@register
class CappingTransformer(BaseCappingTransformer):
    """Transformer to cap numeric values at both or either minimum and maximum values.

    Values above the maximum cap are set to the cap, and values below the minimum
    cap are set to the cap. Only works for numeric columns.

    Attributes
    ----------
    capping_values : dict[str, CappingValues] or None
        Capping values to apply to each column, capping_values argument.

    quantiles : dict[str, CappingValues] or None
        Quantiles used to learn capping values from the input data, quantiles argument.

    quantile_capping_values : dict[str, CappingValues]
        Capping values learned from quantiles (if provided) to apply to each column.
        Assigned when fit is run.

    weights_column : str or None
        weights_column argument.

    _replacement_values : dict[str, CappingValues]
        Replacement values when capping is applied. A copy of whichever capping
        values (supplied or learned from quantiles) are in use.

    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its supported
        functionality to .transform

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    FITS: bool
        class attribute, indicates whether transform requires fit to be run first

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    Example
    -------
    ```pycon
    >>> import polars as pl
    >>> transformer = CappingTransformer(
    ...     capping_values={"a": [10, 20], "b": [1, 3]},
    ... )
    >>> test_df = pl.DataFrame({"a": [1, 15, 18, 25], "b": [6, 2, 7, 1], "c": [1, 2, 3, 4]})
    >>> transformer.transform(test_df)
    shape: (4, 3)
    ┌─────┬─────┬─────┐
    │ a   ┆ b   ┆ c   │
    │ --- ┆ --- ┆ --- │
    │ i64 ┆ i64 ┆ i64 │
    ╞═════╪═════╪═════╡
    │ 10  ┆ 3   ┆ 1   │
    │ 15  ┆ 2   ┆ 2   │
    │ 18  ┆ 3   ┆ 3   │
    │ 20  ┆ 1   ┆ 4   │
    └─────┴─────┴─────┘

    >>> # transformer can also be dumped to json and reinitialised
    >>> json_dump = transformer.to_json()
    >>> json_dump
    {'tubular_version': ..., 'classname': 'CappingTransformer', 'init': {'copy': False, 'verbose': False, 'return_native': True, 'capping_values': {'a': [10, 20], 'b': [1, 3]}, 'quantiles': None, 'weights_column': None}, 'fit': {'is_fitted_': False}}

    >>> CappingTransformer.from_json(json_dump)
    CappingTransformer(capping_values={'a': [10, 20], 'b': [1, 3]})

    ```

    """

    polars_compatible = True

    lazyframe_compatible = True

    FITS = True

    jsonable = True

    @beartype
    def __init__(
        self,
        capping_values: dict[str, CappingValues] | None = None,
        quantiles: dict[str, CappingValues] | None = None,
        weights_column: str | None = None,
        **kwargs: bool,
    ) -> None:
        """Initialise class instance.

        All arguments are forwarded unchanged to BaseCappingTransformer.

        Parameters
        ----------
        capping_values : dict[str, CappingValues] or None, default = None
            Fixed capping values per column, of the form {column: [minimum, maximum]}.
            Each list has length 2 and holds ints, floats or None. A None entry disables
            capping on that side for the column, but both entries cannot be None.
            Exactly one of capping_values or quantiles must be supplied.

        quantiles : dict[str, CappingValues] or None, default = None
            Quantiles in the range [0, 1] per column, of the form
            {column: [lower_quantile, upper_quantile]}. The capping values are learned
            from the data passed to fit. A None entry disables capping on that side for
            the column, but both entries cannot be None. Exactly one of capping_values
            or quantiles must be supplied.

        weights_column : str or None, default = None
            Optional weights column argument that can be used in combination with quantiles. Not used
            if capping_values is supplied. Allows weighted quantiles to be calculated.

        **kwargs
            Arbitrary keyword arguments passed onto BaseTransformer.init method.

        """
        super().__init__(
            capping_values=capping_values,
            quantiles=quantiles,
            weights_column=weights_column,
            **kwargs,
        )

    @block_from_json
    @beartype
    def fit(
        self, X: DataFrame, y: Series | LazyFrame | None = None
    ) -> CappingTransformer:
        """Learn capping values from input data X.

        Delegates to BaseCappingTransformer.fit, which computes the capping values
        from the quantiles supplied at init, then marks the transformer as fitted.

        Parameters
        ----------
        X : DataFrame
            A dataframe with required columns to be capped.

        y : Series or LazyFrame or None. Defaults to None
            Required for pipeline.

        Returns
        -------
            CappingTransformer: fitted instance of class

        Example
        -------
        ```pycon
        >>> import polars as pl
        >>> transformer = CappingTransformer(
        ...     quantiles={"a": [0.01, 0.99], "b": [0.05, 0.95]},
        ... )
        >>> test_df = pl.DataFrame({"a": [1, 15, 18, 25], "b": [6, 2, 7, 1], "c": [1, 2, 3, 4]})
        >>> transformer.fit(test_df)
        CappingTransformer(quantiles={'a': [0.01, 0.99], 'b': [0.05, 0.95]})

        ```

        """
        narwhals_frame = _convert_dataframe_to_narwhals(X)

        super().fit(narwhals_frame, y)

        self.is_fitted_ = True

        return self
[docs]
@register
class OutOfRangeNullTransformer(BaseCappingTransformer):
    """Transformer to set values outside of a range to null.

    This transformer sets the cut off values in the same way as
    the CappingTransformer. So either the user can specify them
    directly in the capping_values argument or they can be calculated
    in the fit method, if the user supplies the quantiles argument.

    Attributes
    ----------
    capping_values : dict[str, CappingValues] or None
        Capping values to apply to each column, capping_values argument.

    quantiles : dict[str, CappingValues] or None
        Quantiles used to learn capping values from the input data, quantiles argument.

    quantile_capping_values : dict[str, CappingValues]
        Capping values learned from quantiles (if provided) to apply to each column.
        Assigned when fit is run.

    weights_column : str or None
        weights_column argument.

    _replacement_values : dict[str, list[bool | None]]
        Replacement values when capping is applied, built by set_replacement_values.
        Holds None (null) for every bound that has a capping value.

    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its supported
        functionality to .transform

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    FITS: bool
        class attribute, indicates whether transform requires fit to be run first

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    Example
    -------
    ```pycon
    >>> import polars as pl
    >>> transformer = OutOfRangeNullTransformer(
    ...     capping_values={"a": [10, 20], "b": [1, 3]},
    ... )
    >>> transformer
    OutOfRangeNullTransformer(capping_values={'a': [10, 20], 'b': [1, 3]})

    >>> # transform method is inherited so also demo that here
    >>> test_df = pl.DataFrame({"a": [1, 15, 18, 25], "b": [6, 2, 7, 1], "c": [1, 2, 3, 4]})
    >>> transformer.transform(test_df)
    shape: (4, 3)
    ┌──────┬──────┬─────┐
    │ a    ┆ b    ┆ c   │
    │ ---  ┆ ---  ┆ --- │
    │ i64  ┆ i64  ┆ i64 │
    ╞══════╪══════╪═════╡
    │ null ┆ null ┆ 1   │
    │ 15   ┆ 2    ┆ 2   │
    │ 18   ┆ null ┆ 3   │
    │ null ┆ 1    ┆ 4   │
    └──────┴──────┴─────┘

    >>> # transformer can also be dumped to json and reinitialised
    >>> json_dump = transformer.to_json()
    >>> json_dump
    {'tubular_version': ..., 'classname': 'OutOfRangeNullTransformer', 'init': {'copy': False, 'verbose': False, 'return_native': True, 'capping_values': {'a': [10, 20], 'b': [1, 3]}, 'quantiles': None, 'weights_column': None}, 'fit': {'is_fitted_': False}}

    >>> OutOfRangeNullTransformer.from_json(json_dump)
    OutOfRangeNullTransformer(capping_values={'a': [10, 20], 'b': [1, 3]})

    ```

    """

    polars_compatible = True

    lazyframe_compatible = True

    FITS = True

    jsonable = True

    @beartype
    def __init__(
        self,
        capping_values: dict[str, CappingValues] | None = None,
        quantiles: dict[str, CappingValues] | None = None,
        weights_column: str | None = None,
        **kwargs: bool,
    ) -> None:
        """Initialise class instance.

        Parameters
        ----------
        capping_values : dict[str, CappingValues] or None, default = None
            Dictionary of capping values to apply to each column. The keys in the dict should be the
            column names and each item in the dict should be a list of length 2. Items in the lists
            should be ints or floats or None. The first item in the list is the minimum capping value
            and the second item in the list is the maximum capping value. If None is supplied for
            either value then that capping will not take place for that particular column. Both items
            in the lists cannot be None. Either one of capping_values or quantiles must be supplied.

        quantiles : dict[str, CappingValues] or None, default = None
            Dictionary of quantiles to set capping values at for each column. The keys in the dict
            should be the column names and each item in the dict should be a list of length 2. Items
            in the lists should be ints or floats or None. The first item in the list is the lower
            quantile and the second item is the upper quantile to set the capping value from. The fit
            method calculates the values quantile from the input data X. If None is supplied for
            either value then that capping will not take place for that particular column. Both items
            in the lists cannot be None. Either one of capping_values or quantiles must be supplied.

        weights_column : str or None, default = None
            Optional weights column argument that can be used in combination with quantiles. Not used
            if capping_values is supplied. Allows weighted quantiles to be calculated.

        **kwargs
            Arbitrary keyword arguments passed onto BaseTransformer.init method.

        """
        super().__init__(
            capping_values=capping_values,
            quantiles=quantiles,
            weights_column=weights_column,
            **kwargs,
        )

        # the parent stored a copy of the capping values as replacements,
        # overwrite them so out of range values become null instead
        if capping_values:
            self._replacement_values = OutOfRangeNullTransformer.set_replacement_values(
                self.capping_values,
            )

    @beartype
    @staticmethod
    def set_replacement_values(
        capping_values: dict[str, list[Number | None]],
    ) -> dict[str, list[bool | None]]:
        """Build the replacement values used to null out of range values.

        Returns a new dict with the same keys as capping_values. Every capping value
        that is not None is mapped to None, so that values beyond that bound are
        replaced with null. Every capping value that is None is mapped to False; the
        inherited transform applies no replacement for a bound whose capping value is
        None, so this entry only acts as a placeholder.

        Parameters
        ----------
        capping_values : dict[str, list[Number | None]]
            dict of form {column_name: [lower_cap, upper_cap]}

        Returns
        -------
            replacement_values: replacement values for OutOfRangeNullTransformer

        Examples
        --------
        ```pycon
        >>> import polars as pl
        >>> capping_values = {"a": [0.1, 0.2], "b": [None, 10]}
        >>> OutOfRangeNullTransformer.set_replacement_values(capping_values)
        {'a': [None, None], 'b': [False, None]}

        ```

        """
        return {
            k: [
                None if replace_value is not None else False
                for replace_value in cap_values_list
            ]
            for k, cap_values_list in capping_values.items()
        }

    @block_from_json
    @beartype
    def fit(
        self, X: DataFrame, y: Series | LazyFrame | None = None
    ) -> OutOfRangeNullTransformer:
        """Learn capping values from input data X.

        Calculates the quantiles to cap at given the quantiles dictionary supplied
        when initialising the transformer. Learnt values are saved in the
        quantile_capping_values attribute by the parent fit, and _replacement_values
        is then reset so that out of range values are replaced with null.

        Parameters
        ----------
        X : DataFrame
            A dataframe with required columns to be capped.

        y : Series or LazyFrame or None. Defaults to None
            Required for pipeline.

        Returns
        -------
            OutOfRangeNullTransformer: fitted instance of class

        Example
        -------
        ```pycon
        >>> import polars as pl
        >>> transformer = OutOfRangeNullTransformer(
        ...     quantiles={"a": [0.01, 0.99], "b": [0.05, 0.95]},
        ... )
        >>> test_df = pl.DataFrame({"a": [1, 15, 18, 25], "b": [6, 2, 7, 1], "c": [1, 2, 3, 4]})
        >>> transformer.fit(test_df)
        OutOfRangeNullTransformer(quantiles={'a': [0.01, 0.99], 'b': [0.05, 0.95]})

        ```

        """
        X = _convert_dataframe_to_narwhals(X)

        # the parent fit learns quantile_capping_values; it only needs to run once
        super().fit(X=X, y=y)

        if self.quantiles:
            # replace the learnt-value replacements set by the parent with nulls
            self._replacement_values = OutOfRangeNullTransformer.set_replacement_values(
                self.quantile_capping_values,
            )

        else:
            warnings.warn(
                f"{self.classname()}: quantiles not set so no fitting done in OutOfRangeNullTransformer",
                stacklevel=2,
            )

        self.is_fitted_ = True

        return self