# Source code for tubular.base
"""Contains transformers that other transformers in the package inherit from.
These transformers contain key checks to be applied in all cases.
"""
from __future__ import annotations
from typing import Any
import narwhals as nw
import pandas as pd
from beartype import beartype
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from typing_extensions import deprecated
from tubular._utils import (
_collect_series,
_convert_dataframe_to_narwhals,
_convert_series_to_narwhals,
_get_version,
_return_narwhals_or_native_dataframe,
block_from_json,
)
from tubular.mixins import DropOriginalMixin
from tubular.types import (
DataFrame,
GenericKwargs,
LazyFrame,
ListOfStrs,
NonEmptyListOfStrs,
Series,
)
# Module-level side effect: switches pandas' copy_on_write mode option on as
# soon as this module is imported.
pd.options.mode.copy_on_write = True
# Transformer class name -> dict of capability flags. Entries are written by
# BaseTransformer.__init_subclass__ each time a (non-base, non-deprecated)
# subclass is defined.
FEATURE_REGISTRY: dict[str, dict[str, bool]] = {}
# Class name -> class object. Entries are written by the `register` decorator.
CLASS_REGISTRY: dict[str, type] = {}
[docs]
def register(cls: type[BaseTransformer]) -> type[BaseTransformer]:
    """Add transformer class to the CLASS_REGISTRY dict.

    Intended for use as a class decorator. The class is stored under its
    ``__name__``, so registering a second class with the same name replaces
    the earlier entry.

    Parameters
    ----------
    cls : type[BaseTransformer]
        Transformer class (not an instance) to register.

    Returns:
    -------
    cls - the same transformer class, unchanged, so the decorated name still
    refers to the original class.

    Example:
    -------
    ```pycon
    >>> @register
    ... class MyTransformer(BaseTransformer):
    ...     pass
    ...
    >>> CLASS_REGISTRY["MyTransformer"]
    <class 'tubular.base.MyTransformer'>

    ```

    """
    CLASS_REGISTRY[cls.__name__] = cls
    return cls
[docs]
@register
class BaseTransformer(BaseEstimator, TransformerMixin):
    """Base transformer class which all other transformers in the package inherit from.

    Provides fit and transform methods (required by sklearn transformers), simple input
    checking and functionality to copy X prior to transform.

    Attributes:
    ----------
    columns : list[str]
        Names of the columns in the input dataframe that the transformer will be
        applied to. A single str passed at init is stored as a one-element list.

    copy : bool
        When True, transform clones an eager (non-lazy) X before running its checks.
        Flagged for deprecation in a future release.

    verbose : bool
        Print statements to show which methods are being run or not.

    built_from_json: bool
        indicates if transformer was reconstructed from json,
        which limits its supported functionality to .transform

    is_fitted_ : bool
        Set to False at init. Its presence is what `transform` checks for via
        `check_is_fitted`.

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to
        polars/pandas agnostic narwhals framework

    return_native: bool, default = True
        Controls whether transformer returns narwhals or native pandas/polars type

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    FITS: bool
        class attribute, indicates whether transform requires fit to be run first

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    Example:
    -------
    ```pycon
    >>> BaseTransformer(
    ...     columns="a",
    ... )
    BaseTransformer(columns=['a'])

    ```

    """

    polars_compatible = True

    lazyframe_compatible = True

    jsonable = True

    FITS = True

    _version = _get_version()

    def __init_subclass__(cls: type[BaseTransformer]) -> None:
        """Logic to be run when a new child class is defined.

        This populates a dictionary, which will help us track which
        transformers in the repo support which functionality.

        Classes that set a truthy `deprecated` attribute, or whose name starts
        with "Base" or "_", are not recorded.
        """
        # NOTE(review): super().__init_subclass__() is not called here, so any
        # subclass hooks defined by the sklearn parents are skipped for child
        # classes - confirm this is intentional before changing it.
        is_deprecated = getattr(cls, "deprecated", False)

        # ignore deprecated transformers and base classes
        if is_deprecated or cls.__name__.startswith(("Base", "_")):
            return

        FEATURE_REGISTRY[cls.__name__] = {
            "polars_compatible": cls.polars_compatible,
            # repo was originally written in pandas, so this is a given
            "pandas_compatible": True,
            "jsonable": cls.jsonable,
            "lazyframe_compatible": cls.lazyframe_compatible,
        }

    def classname(self) -> str:
        """Return the name of the current class when called.

        Returns
        -------
        str: name of class

        """
        return type(self).__name__

    @beartype
    def __init__(
        self,
        columns: ListOfStrs | str,
        copy: bool = False,
        verbose: bool = False,
        return_native: bool = True,
    ) -> None:
        """Init method for class.

        Parameters
        ----------
        columns : list of str or str
            Columns to apply the transformer to. If a str is passed this is put into
            a list.
            Value passed in columns is saved in the columns attribute on the object.

        copy : bool, default = False
            Should X be copied before transforms are applied?
            Flagged for deprecation in a future release.

        verbose : bool, default = False
            Should statements be printed when methods are run?

        return_native: bool, default = True
            Controls whether transformer returns narwhals or native pandas/polars type

        """
        self.verbose = verbose

        if self.verbose:
            print("BaseTransformer.__init__() called")

        # normalise so that self.columns is always a list, even for a lone str
        if isinstance(columns, str):
            self.columns = [columns]
        elif isinstance(columns, list):
            self.columns = columns

        self.copy = copy
        self.return_native = return_native

        # flipped to True only by from_json
        self.built_from_json = False
        self.is_fitted_ = False

    def get_feature_names_out(self) -> list[str]:
        """List features modified/created by the transformer.

        Child classes will need to overload this method if their behaviour is
        more complex than just returning the input columns.

        Returns
        -------
        list[str]:
            list of features modified/created by the transformer. This is the
            `columns` attribute itself, not a copy.

        Examples
        --------
        ```pycon
        >>> transformer = BaseTransformer(
        ...     columns="a",
        ... )
        >>> transformer.get_feature_names_out()
        ['a']

        ```

        """
        return self.columns

    @block_from_json
    def to_json(self) -> dict[str, dict[str, Any]]:
        """Dump transformer to json dict.

        Returns
        -------
        dict[str, dict[str, Any]]:
            jsonified transformer. Nested dict containing levels for attributes
            set at init and fit.

        Raises
        ------
        RuntimeError: if transformer does not have to/from json functionality
        enabled

        Examples
        --------
        ```pycon
        >>> transformer = BaseTransformer(columns=["a", "b"])
        >>> # version will vary for local vs CI, so use ... as generic match
        >>> transformer.to_json()
        {'tubular_version': ..., 'classname': 'BaseTransformer', 'init': {'columns': ['a', 'b'], 'copy': False, 'verbose': False, 'return_native': True}, 'fit': {'is_fitted_': False}}

        ```

        """  # noqa: E501
        if not self.jsonable:
            msg = (
                "This transformer has not yet had to/from json functionality developed"
            )
            raise RuntimeError(
                msg,
            )

        return {
            "tubular_version": self._version,
            "classname": self.classname(),
            "init": {
                "columns": self.columns,
                "copy": self.copy,
                "verbose": self.verbose,
                "return_native": self.return_native,
            },
            "fit": {"is_fitted_": self.is_fitted_},
        }

    @classmethod
    def from_json(cls, json: dict[str, Any]) -> BaseTransformer:
        """Rebuild transformer from json dict, ready for transform.

        The "init" entry is passed to the class constructor as keyword
        arguments, then every key in the "fit" entry is set as an attribute on
        the new instance.

        Parameters
        ----------
        json: dict[str, dict[str, Any]]
            json-ified transformer

        Returns
        -------
        BaseTransformer:
            reconstructed transformer class, ready for transform. Its
            `built_from_json` attribute is set to True.

        Raises
        ------
        RuntimeError: if transformer does not have to/from json
        functionality enabled

        Examples
        --------
        ```pycon
        >>> json_dict = {"init": {"columns": ["a", "b"]}, "fit": {}}
        >>> BaseTransformer.from_json(json=json_dict)
        BaseTransformer(columns=['a', 'b'])

        ```

        """
        if not cls.jsonable:
            msg = (
                "This transformer has not yet had to/from json functionality developed"
            )
            raise RuntimeError(
                msg,
            )

        instance = cls(**json["init"])

        for attr, value in json["fit"].items():
            setattr(instance, attr, value)

        instance.built_from_json = True

        return instance

    @block_from_json
    @beartype
    def fit(self, X: DataFrame, y: Series | LazyFrame | None = None) -> BaseTransformer:
        """Check data before fit.

        Fit calls the columns_check method which will check that the columns
        attribute is set and all values are present in X

        Parameters
        ----------
        X : DataFrame
            Data to fit the transformer on.

        y : None or Series or LazyFrame, default = None
            Optional argument only required for the transformer to work with sklearn
            pipelines.

        Returns
        -------
        BaseTransformer: returns self

        Examples
        --------
        ```pycon
        >>> import polars as pl
        >>> transformer = BaseTransformer(
        ...     columns="a",
        ... )
        >>> df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
        >>> transformer.fit(df)
        BaseTransformer(columns=['a'])

        ```

        """
        if self.verbose:
            print("BaseTransformer.fit() called")

        X = _convert_dataframe_to_narwhals(X)
        # y is converted but not used any further by this base implementation
        y = _convert_series_to_narwhals(y)

        self.columns_check(X)

        # NOTE(review): is_fitted_ is left untouched here - presumably child
        # classes are expected to set it; confirm.
        return self

    @block_from_json
    @beartype
    def _combine_X_y(
        self,
        X: DataFrame,
        y: Series | LazyFrame,
        return_native_override: bool = True,
    ) -> DataFrame:
        """Combine X and y by adding a new column with the values of y to X.

        The new response column will be called `_temporary_response`.

        This method can be used by transformers that need to use the response, y,
        together with the explanatory variables, X, in their `fit` methods.

        Parameters
        ----------
        X : DataFrame
            Data containing explanatory variables.

        y : Series or LazyFrame
            Response variable. When a LazyFrame is passed, its first column is
            used as the response.

        return_native_override: bool, default = True
            option to override return_native attr in transformer, useful when calling
            parent methods

        Returns
        -------
        DataFrame: DataFrame with added column containing y

        Examples
        --------
        ```pycon
        # correct usage
        >>> import polars as pl
        >>> transformer = BaseTransformer(
        ...     columns="a",
        ... )
        >>> X = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
        >>> y = pl.Series(name="a", values=[1, 2])
        >>> transformer._combine_X_y(X, y)
        shape: (2, 3)
        ┌─────┬─────┬─────────────────────┐
        │ a   ┆ b   ┆ _temporary_response │
        │ --- ┆ --- ┆ ---                 │
        │ i64 ┆ i64 ┆ i64                 │
        ╞═════╪═════╪═════════════════════╡
        │ 1   ┆ 3   ┆ 1                   │
        │ 2   ┆ 4   ┆ 2                   │
        └─────┴─────┴─────────────────────┘

        # example error from mismatched X/y
        >>> X = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
        >>> y = pl.Series(name="a", values=[1])
        >>> transformer._combine_X_y(X, y)
        Traceback (most recent call last):
        ...
        narwhals.exceptions.InvalidOperationError: Series _temporary_response, length 1 doesn't match the DataFrame height of 2
        ...

        ```

        """  # noqa: E501
        X = _convert_dataframe_to_narwhals(X)
        y = _convert_series_to_narwhals(y)

        return_native = self._process_return_native(return_native_override)

        if isinstance(X, nw.LazyFrame) and isinstance(y, nw.LazyFrame):
            # Both inputs are lazy: line the rows up through a temporary row
            # index and join, so that neither side has to be collected.
            row_idx = "__row_idx__"
            y_col = y.collect_schema().names()[0]

            # Name the response before joining: if y's column shared a name
            # with a column of X, renaming after the join would hit X's column.
            y_indexed = y.with_row_index(row_idx).select(
                row_idx,
                nw.col(y_col).alias("_temporary_response"),
            )

            X = (
                X.with_row_index(row_idx)
                .join(y_indexed, on=row_idx, how="inner")
                # remove the helper index again now the rows are aligned
                .drop(row_idx)
            )
        else:
            # with_columns needs a Series, so a lazy y paired with a non-lazy X
            # is collected first
            if isinstance(y, nw.LazyFrame):
                y = _collect_series(y)

            X = X.with_columns(_temporary_response=y)

        return _return_narwhals_or_native_dataframe(X, return_native)

    @beartype
    def _process_return_native(self, return_native_override: bool | None) -> bool:
        """Determine whether to override return_native attr.

        Parameters
        ----------
        return_native_override: Optional[bool]
            option to override return_native attr in transformer,
            useful when calling parent methods. None means "no override",
            in which case the return_native attribute is used.

        Returns
        -------
        bool: whether or not to return native type

        Example:
        --------
        ```pycon
        >>> transformer = BaseTransformer(columns="a", return_native=True)
        >>> transformer._process_return_native(return_native_override=False)
        False

        ```

        """
        return (
            return_native_override
            if return_native_override is not None
            else self.return_native
        )

    @beartype
    def transform(
        self,
        X: DataFrame,
        return_native_override: bool | None = None,
    ) -> DataFrame:
        """Check data before child transform.

        Transform calls the columns_check method which will check columns in columns
        attribute are in X.

        Parameters
        ----------
        X : DataFrame
            Data to transform with the transformer.

        return_native_override: Optional[bool]
            option to override return_native attr in transformer,
            useful when calling parent methods

        Returns
        -------
        X : DataFrame
            Input X, copied if specified by user.

        Examples
        --------
        ```pycon
        >>> import polars as pl
        >>> transformer = BaseTransformer(
        ...     columns="a",
        ... )
        >>> df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
        >>> transformer.transform(df)
        shape: (2, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 1   ┆ 3   │
        │ 2   ┆ 4   │
        └─────┴─────┘

        ```

        """
        self.check_is_fitted("is_fitted_")

        return_native = self._process_return_native(return_native_override)

        X = _convert_dataframe_to_narwhals(X)

        if self.copy and not isinstance(X, nw.LazyFrame):
            # to prevent overwriting original dataframe
            # (lazy frames are not cloned)
            X = X.clone()

        self.columns_check(X)

        if self.verbose:
            print("BaseTransformer.transform() called")

        return _return_narwhals_or_native_dataframe(X, return_native)

    def check_is_fitted(self, attribute: str) -> None:
        """Check if particular attributes are on the object.

        This is useful to do before running transform to avoid
        trying to transform data without first running the fit method.

        Wrapper for utils.validation.check_is_fitted function.

        Parameters
        ----------
        attribute : str
            Name of the attribute to check exists on self. Passed straight
            through to sklearn's check_is_fitted.

        Example
        -------
        ```pycon
        >>> transformer = BaseTransformer(
        ...     columns="a",
        ... )
        >>> transformer.check_is_fitted("columns")

        ```

        """
        check_is_fitted(self, attribute)

    @beartype
    def columns_check(self, X: DataFrame) -> None:
        """Check that the columns attribute is set and all values are present in X.

        Parameters
        ----------
        X : DataFrame
            Data to check columns are in.

        Raises
        ------
        ValueError: if columns missing from dataframe

        Examples
        --------
        ```pycon
        >>> import polars as pl
        >>> transformer = BaseTransformer(
        ...     columns="a",
        ... )
        >>> df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
        >>> transformer.columns_check(df)

        ```

        """
        X = _convert_dataframe_to_narwhals(X)

        # column names are read from the schema
        missing_columns = set(self.columns).difference(X.collect_schema().names())

        if missing_columns:
            msg = f"{self.classname()}: variables {missing_columns} not in X"
            raise ValueError(
                msg,
            )
# DEPRECATED TRANSFORMERS
[docs]
@deprecated(
    """This transformer has been deprecated in favour of more specialised transformers.
See the aggregations module for aggregation type functionality formerly covered by
this transformer.
If other functionality was being used from this transformer, then please submit an
issue for it to be redeveloped!
""",
)
class DataFrameMethodTransformer(DropOriginalMixin, BaseTransformer):
    """Transformer that applies a pandas.DataFrame method.

    Transformer assigns the output of the method to a new column or columns.
    It is possible to supply other key word arguments to the transform method,
    which will be passed to the pandas.DataFrame method being called.

    Be aware it is possible to supply incompatible arguments to init that will only be
    identified when transform is run.
    This is because there are many combinations of method, input and output sizes.
    Additionally some methods may only work as expected when called in
    transform with specific key word arguments.

    Attributes
    ----------
    new_column_names : str or list of str
        The name of the column or columns to be assigned to the output of running the
        pandas method in transform.

    pd_method_name : str
        The name of the pandas.DataFrame method to call.

    pd_method_kwargs : dict
        Keyword arguments passed to the pandas.DataFrame method when it is called.

    drop_original : bool
        Whether the original columns are dropped after transforming.

    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its
        supported functionality to .transform

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to
        polars/pandas agnostic narwhals framework

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    FITS: bool
        class attribute, indicates whether transform requires fit to be run first

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    deprecated: bool
        indicates if class has been deprecated

    """

    polars_compatible = False

    FITS = False

    jsonable = False

    lazyframe_compatible = False

    # read by BaseTransformer.__init_subclass__ to keep this class out of
    # FEATURE_REGISTRY
    deprecated = True

    @beartype
    def __init__(
        self,
        new_column_names: list[str] | str,
        pd_method_name: str,
        columns: NonEmptyListOfStrs | str | None,
        pd_method_kwargs: GenericKwargs | None = None,
        drop_original: bool = False,
        **kwargs: bool | None,
    ) -> None:
        """Init method for class.

        Parameters
        ----------
        new_column_names : str or list of str
            The name of the column or columns to be assigned to the output of running
            the pandas method in transform.

        pd_method_name : str
            The name of the pandas.DataFrame method to call.

        columns : None or list or str
            Columns to apply the transformer to.
            If a str is passed this is put into a list. Value passed
            in columns is saved in the columns attribute on the object.
            Note this has no default value so
            the user has to specify the columns when initialising the transformer.
            This is to avoid the case where the user forgets to set columns,
            in which case all columns would be picked up when super transform runs.

        pd_method_kwargs : dict or None, default = None
            A dictionary of keyword arguments to be passed to the pd.DataFrame method
            when it is called. None is stored as an empty dict.

        drop_original : bool, default = False
            Should original columns be dropped?

        **kwargs
            Arbitrary keyword arguments passed onto BaseTransformer.__init__().

        Raises
        ------
        AttributeError: if pd_method_name is not valid pd.DataFrame method

        """
        super().__init__(columns=columns, **kwargs)

        self.new_column_names = new_column_names
        self.pd_method_name = pd_method_name
        # a fresh dict per instance rather than a shared mutable default
        self.pd_method_kwargs = {} if pd_method_kwargs is None else pd_method_kwargs
        self.drop_original = drop_original

        # fail at init if the named attribute cannot be read from a DataFrame;
        # any failure while accessing it is reported as an AttributeError
        try:
            getattr(pd.DataFrame(), pd_method_name)
        except Exception as err:
            msg = f'{self.classname()}: error accessing "{pd_method_name}" method on pd.DataFrame object - pd_method_name should be a pd.DataFrame method'  # noqa: E501
            raise AttributeError(msg) from err

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Transform input data.

        Uses the given pandas.DataFrame method and assign the output
        back to column or columns in X.

        Any keyword arguments set in the pd_method_kwargs attribute are passed onto the
        pandas DataFrame method when calling it.

        Parameters
        ----------
        X : pd.DataFrame
            Data to transform.

        Returns
        -------
        X : pd.DataFrame
            Input X with additional column or columns (self.new_column_names) added.
            These contain the output of running the pandas DataFrame method.

        """
        # everything below uses the pandas API, so always ask the parent for a
        # native frame regardless of the return_native attribute
        X = super().transform(X, return_native_override=True)

        if X.empty:
            # quick fix for empty frames, not spending much
            # time on this as transformer is deprecated.
            # new_column_names may be a str or a list, so normalise to a list
            # here rather than at init.
            new_columns = (
                self.new_column_names
                if isinstance(self.new_column_names, list)
                else [self.new_column_names]
            )

            # hard to know the best dtype to use here given the
            # flexibility of this transformer, which is
            # partially why it was deprecated.
            # Iterate over the names themselves: the new columns do not exist
            # in X yet, so they cannot be looked up there.
            for col in new_columns:
                X[col] = pd.Series(dtype=float)

        else:
            X[self.new_column_names] = getattr(X[self.columns], self.pd_method_name)(
                **self.pd_method_kwargs,
            )

        # Drop original columns if self.drop_original is True
        return DropOriginalMixin.drop_original_column(
            X,
            self.drop_original,
            self.columns,
        )