Source code for tubular.base

"""Contains transformers that other transformers in the package inherit from.

These transformers contain key checks to be applied in all cases.
"""

from __future__ import annotations

from typing import Any

import narwhals as nw
import pandas as pd
from beartype import beartype
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_is_fitted
from typing_extensions import deprecated

from tubular._utils import (
    _collect_series,
    _convert_dataframe_to_narwhals,
    _convert_series_to_narwhals,
    _get_version,
    _return_narwhals_or_native_dataframe,
    block_from_json,
)
from tubular.mixins import DropOriginalMixin
from tubular.types import (
    DataFrame,
    GenericKwargs,
    LazyFrame,
    ListOfStrs,
    NonEmptyListOfStrs,
    Series,
)

# Maps transformer class name -> class object; populated by the `register`
# decorator.
CLASS_REGISTRY: dict[str, type] = {}

# Maps transformer class name -> dict of feature-support flags; populated by
# `BaseTransformer.__init_subclass__` whenever a child class is defined.
FEATURE_REGISTRY: dict[str, dict[str, bool]] = {}

# Switch pandas to copy-on-write mode for the whole process on import.
pd.set_option("mode.copy_on_write", True)


def register(cls: type[BaseTransformer]) -> type[BaseTransformer]:
    """Add transformer class to the registry dict.

    Intended for use as a class decorator. The class is stored in
    `CLASS_REGISTRY` under its `__name__`; an existing entry with the same
    name is overwritten.

    Parameters
    ----------
    cls : type[BaseTransformer]
        Transformer class (not an instance) to register.

    Returns:
    -------
    cls - the same transformer class, unchanged

    Example:
    -------
    ```pycon
    >>> @register
    ... class MyTransformer(BaseTransformer):
    ...     pass
    ...
    >>> CLASS_REGISTRY["MyTransformer"]
    <class 'tubular.base.MyTransformer'>

    ```

    """
    CLASS_REGISTRY[cls.__name__] = cls
    return cls
@register
class BaseTransformer(BaseEstimator, TransformerMixin):
    """Base transformer class which all other transformers in the package inherit from.

    Provides fit and transform methods (required by sklearn transformers), simple
    input checking and functionality to copy X prior to transform.

    Attributes:
    ----------
    columns : list
        List of str values giving which columns of the input DataFrame the
        transformer will be applied to. A single str passed at init is wrapped
        in a list.

    copy : bool
        Should X be copied before transforms are applied? Copy argument no longer
        used and will be deprecated in a future release

    verbose : bool
        Print statements to show which methods are being run or not.

    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its
        supported functionality to .transform

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to
        polars/pandas agnostic narwhals framework

    return_native: bool, default = True
        Controls whether transformer returns narwhals or native pandas/polars type

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    FITS: bool
        class attribute, indicates whether transform requires fit to be run first

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    Example:
    -------
    ```pycon
    >>> BaseTransformer(
    ...     columns="a",
    ... )
    BaseTransformer(columns=['a'])

    ```

    """

    polars_compatible = True

    lazyframe_compatible = True

    jsonable = True

    FITS = True

    _version = _get_version()

    def __init_subclass__(cls: type[BaseTransformer]) -> None:
        """Logic to be run when a new child class is defined.

        This populates a dictionary, which will help us track which transformers
        in the repo support which functionality.

        Classes flagged with a truthy `deprecated` attribute, and classes whose
        name starts with "Base" or "_", are not recorded.
        """
        # NOTE(review): this hook does not call super().__init_subclass__(), so
        # any parent-class hooks are skipped for subclasses - confirm intended.

        # local name deliberately differs from the imported `deprecated`
        # decorator to avoid shadowing it
        is_deprecated = getattr(cls, "deprecated", False)

        # ignore deprecated transformers and base classes
        if is_deprecated or cls.__name__.startswith(("Base", "_")):
            return

        FEATURE_REGISTRY[cls.__name__] = {
            "polars_compatible": cls.polars_compatible,
            # repo was originally written in pandas, so this is a given
            "pandas_compatible": True,
            "jsonable": cls.jsonable,
            "lazyframe_compatible": cls.lazyframe_compatible,
        }

    def classname(self) -> str:
        """Return the name of the current class when called.

        Returns
        -------
            str: name of class

        """
        return type(self).__name__

    @beartype
    def __init__(
        self,
        columns: ListOfStrs | str,
        copy: bool = False,
        verbose: bool = False,
        return_native: bool = True,
    ) -> None:
        """Init method for class.

        Parameters
        ----------
        columns : list of str or str
            Columns to apply the transformer to. If a str is passed this is put
            into a list. Value passed in columns is saved in the columns attribute
            on the object.

        copy : bool, default = False
            Should X be copied before transforms are applied? Copy argument no
            longer used and will be deprecated in a future release

        verbose : bool, default = False
            Should statements be printed when methods are run?

        return_native: bool, default = True
            Controls whether transformer returns narwhals or native pandas/polars
            type

        """
        self.verbose = verbose

        if self.verbose:
            print("BaseTransformer.__init__() called")

        # make sure columns is a single str or list of strs
        if isinstance(columns, str):
            self.columns = [columns]
        elif isinstance(columns, list):
            self.columns = columns

        self.copy = copy
        self.return_native = return_native

        # flipped to True only by from_json
        self.built_from_json = False

        self.is_fitted_ = False

    def get_feature_names_out(self) -> list[str]:
        """List features modified/created by the transformer.

        Child classes will need to overload this method if their behaviour is
        more complex than just returning the input columns.

        Returns
        -------
            list[str]: list of features modified/created by the transformer
            (the `columns` attribute itself, not a copy)

        Examples
        --------
        ```pycon
        >>> transformer = BaseTransformer(
        ...     columns="a",
        ... )

        >>> transformer.get_feature_names_out()
        ['a']

        ```

        """
        return self.columns

    @block_from_json
    def to_json(self) -> dict[str, dict[str, Any]]:
        """Dump transformer to json dict.

        Returns
        -------
            dict[str, dict[str, Any]]:
                jsonified transformer. Nested dict containing levels for
                attributes set at init and fit.

        Raises
        ------
            RuntimeError: if transformer does not have to/from json
            functionality enabled

        Examples
        --------
        ```pycon
        >>> transformer = BaseTransformer(columns=["a", "b"])

        >>> # version will vary for local vs CI, so use ... as generic match
        >>> transformer.to_json()
        {'tubular_version': ..., 'classname': 'BaseTransformer', 'init': {'columns': ['a', 'b'], 'copy': False, 'verbose': False, 'return_native': True}, 'fit': {'is_fitted_': False}}

        ```

        """  # noqa: E501
        if not self.jsonable:
            msg = (
                "This transformer has not yet had to/from json functionality developed"
            )
            raise RuntimeError(
                msg,
            )

        return {
            "tubular_version": self._version,
            "classname": self.classname(),
            "init": {
                "columns": self.columns,
                "copy": self.copy,
                "verbose": self.verbose,
                "return_native": self.return_native,
            },
            "fit": {"is_fitted_": self.is_fitted_},
        }

    @classmethod
    def from_json(cls, json: dict[str, Any]) -> BaseTransformer:
        """Rebuild transformer from json dict, ready for transform.

        The "init" entry is passed as keyword arguments to the class
        constructor, and every key of the "fit" entry is set as an attribute
        on the new instance.

        Parameters
        ----------
        json: dict[str, dict[str, Any]]
            json-ified transformer

        Returns
        -------
            BaseTransformer: reconstructed transformer class, ready for transform

        Raises
        ------
            RuntimeError: if transformer does not have to/from json
            functionality enabled

        Examples
        --------
        ```pycon
        >>> json_dict = {"init": {"columns": ["a", "b"]}, "fit": {}}

        >>> BaseTransformer.from_json(json=json_dict)
        BaseTransformer(columns=['a', 'b'])

        ```

        """
        if not cls.jsonable:
            msg = (
                "This transformer has not yet had to/from json functionality developed"
            )
            raise RuntimeError(
                msg,
            )

        instance = cls(**json["init"])

        for attr, value in json["fit"].items():
            setattr(instance, attr, value)

        instance.built_from_json = True

        return instance

    @block_from_json
    @beartype
    def fit(self, X: DataFrame, y: Series | LazyFrame | None = None) -> BaseTransformer:
        """Check data before fit.

        Fit calls the columns_check method which will check that the columns
        attribute is set and all values are present in X

        Parameters
        ----------
        X : DataFrame
            Data to fit the transformer on.

        y : None or Series or LazyFrame, default = None
            Optional argument only required for the transformer to work with
            sklearn pipelines.

        Returns
        -------
            BaseTransformer: returns self

        Examples
        --------
        ```pycon
        >>> import polars as pl

        >>> transformer = BaseTransformer(
        ...     columns="a",
        ... )

        >>> df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})

        >>> transformer.fit(df)
        BaseTransformer(columns=['a'])

        ```

        """
        if self.verbose:
            print("BaseTransformer.fit() called")

        X = _convert_dataframe_to_narwhals(X)
        # result is not used further here; the call is kept as in the
        # original implementation
        y = _convert_series_to_narwhals(y)

        self.columns_check(X)

        return self

    @block_from_json
    @beartype
    def _combine_X_y(
        self, X: DataFrame, y: Series | LazyFrame, return_native_override: bool = True
    ) -> DataFrame:
        """Combine X and y by adding a new column with the values of y to X.

        The new response column will be called `_temporary_response`.

        This method can be used by transformers that need to use the response, y,
        together with the explanatory variables, X, in their `fit` methods.

        Parameters
        ----------
        X : DataFrame
            Data containing explanatory variables.

        y : Series or LazyFrame
            Response variable. When a LazyFrame is passed, its first column is
            used as the response.

        return_native_override: bool, default = True
            option to override return_native attr in transformer, useful when
            calling parent methods

        Returns
        -------
            DataFrame: DataFrame with added column containing y

        Examples
        --------
        ```pycon
        # correct usage
        >>> import polars as pl

        >>> transformer = BaseTransformer(
        ...     columns="a",
        ... )

        >>> X = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
        >>> y = pl.Series(name="a", values=[1, 2])

        >>> transformer._combine_X_y(X, y)
        shape: (2, 3)
        ┌─────┬─────┬─────────────────────┐
        │ a   ┆ b   ┆ _temporary_response │
        │ --- ┆ --- ┆ ---                 │
        │ i64 ┆ i64 ┆ i64                 │
        ╞═════╪═════╪═════════════════════╡
        │ 1   ┆ 3   ┆ 1                   │
        │ 2   ┆ 4   ┆ 2                   │
        └─────┴─────┴─────────────────────┘

        # example error from mismatched X/y
        >>> X = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
        >>> y = pl.Series(name="a", values=[1])

        >>> transformer._combine_X_y(X, y)
        Traceback (most recent call last):
        ...
        narwhals.exceptions.InvalidOperationError: Series _temporary_response, length 1 doesn't match the DataFrame height of 2
        ...

        ```

        """  # noqa: E501
        X = _convert_dataframe_to_narwhals(X)
        y = _convert_series_to_narwhals(y)

        return_native = self._process_return_native(return_native_override)

        # If both X and y are LazyFrames, use join to maintain lazy evaluation
        if isinstance(X, nw.LazyFrame) and isinstance(y, nw.LazyFrame):
            # read the name from the schema, as columns_check does
            y_col = y.collect_schema().names()[0]

            # Alias the response BEFORE joining: if X already holds a column
            # called y_col, renaming after the join would hit X's column.
            # NOTE(review): some narwhals releases may require `order_by` for
            # LazyFrame.with_row_index - confirm against the pinned version.
            y_indexed = y.select(
                nw.col(y_col).alias("_temporary_response"),
            ).with_row_index("__row_idx__")

            # NOTE(review): an inner join keeps only matching row indices, so
            # X/y of different lengths are truncated rather than raising as in
            # the eager path - confirm this is acceptable.
            X = (
                X.with_row_index("__row_idx__")
                .join(y_indexed, on="__row_idx__", how="inner")
                # narwhals frames expose `drop`, not `exclude`
                .drop("__row_idx__")
            )

        else:
            if isinstance(y, nw.LazyFrame):
                # If y is LazyFrame but X is not, collect y first
                y = _collect_series(y)

            # For eager frames or Series, use with_columns
            X = X.with_columns(_temporary_response=y)

        return _return_narwhals_or_native_dataframe(X, return_native)

    @beartype
    def _process_return_native(self, return_native_override: bool | None) -> bool:
        """Determine whether to override return_native attr.

        Parameters
        ----------
        return_native_override: Optional[bool]
            option to override return_native attr in transformer, useful when
            calling parent methods

        Returns
        -------
            bool: the override when it is not None, otherwise the transformer's
            return_native attr

        Example:
        --------
        ```pycon
        >>> transformer = BaseTransformer(columns="a", return_native=True)

        >>> transformer._process_return_native(return_native_override=False)
        False

        ```

        """
        return (
            return_native_override
            if return_native_override is not None
            else self.return_native
        )

    @beartype
    def transform(
        self,
        X: DataFrame,
        return_native_override: bool | None = None,
    ) -> DataFrame:
        """Check data before child transform.

        Transform calls the columns_check method which will check columns in
        columns attribute are in X.

        Parameters
        ----------
        X : DataFrame
            Data to transform with the transformer.

        return_native_override: Optional[bool]
            option to override return_native attr in transformer, useful when
            calling parent methods

        Returns
        -------
        X : DataFrame
            Input X, copied if specified by user.

        Examples
        --------
        ```pycon
        >>> import polars as pl

        >>> transformer = BaseTransformer(
        ...     columns="a",
        ... )

        >>> df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})

        >>> transformer.transform(df)
        shape: (2, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 1   ┆ 3   │
        │ 2   ┆ 4   │
        └─────┴─────┘

        ```

        """
        self.check_is_fitted("is_fitted_")

        return_native = self._process_return_native(return_native_override)

        X = _convert_dataframe_to_narwhals(X)

        # LazyFrames are never cloned, regardless of the copy setting
        if self.copy and not isinstance(X, nw.LazyFrame):
            # to prevent overwriting original dataframe
            X = X.clone()

        self.columns_check(X)

        if self.verbose:
            print("BaseTransformer.transform() called")

        return _return_narwhals_or_native_dataframe(X, return_native)

    def check_is_fitted(self, attribute: str) -> None:
        """Check if particular attributes are on the object.

        This is useful to do before running transform to avoid trying to
        transform data without first running the fit method.

        Wrapper for utils.validation.check_is_fitted function.

        Parameters
        ----------
        attribute : str
            Name of the attribute to check exists on self.

        Example
        -------
        ```pycon
        >>> transformer = BaseTransformer(
        ...     columns="a",
        ... )

        >>> transformer.check_is_fitted("columns")

        ```

        """
        check_is_fitted(self, attribute)

    @beartype
    def columns_check(self, X: DataFrame) -> None:
        """Check that the columns attribute is set and all values are present in X.

        Parameters
        ----------
        X : DataFrame
            Data to check columns are in.

        Raises
        ------
            ValueError: if columns missing from dataframe

        Examples
        --------
        ```pycon
        >>> import polars as pl

        >>> transformer = BaseTransformer(
        ...     columns="a",
        ... )

        >>> df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})

        >>> transformer.columns_check(df)

        ```

        """
        X = _convert_dataframe_to_narwhals(X)

        # column names are read from the schema
        missing_columns = set(self.columns).difference(X.collect_schema().names())

        if missing_columns:
            msg = f"{self.classname()}: variables {missing_columns} not in X"
            raise ValueError(
                msg,
            )
# DEPRECATED TRANSFORMERS
@deprecated(
    """This transformer has been deprecated in favour of more specialised transformers.
    See the aggregations module for aggregation type functionality formerly covered by this transformer.
    If other functionality was being used from this transformer, then please submit an issue for it to be redeveloped!
    """,
)
class DataFrameMethodTransformer(DropOriginalMixin, BaseTransformer):
    """Transformer that applies a pandas.DataFrame method.

    Transformer assigns the output of the method to a new column or columns. It
    is possible to supply other key word arguments to the transform method,
    which will be passed to the pandas.DataFrame method being called.

    Be aware it is possible to supply incompatible arguments to init that will
    only be identified when transform is run. This is because there are many
    combinations of method, input and output sizes. Additionally some methods
    may only work as expected when called in transform with specific key word
    arguments.

    Attributes
    ----------
    new_column_names : str or list of str
        The name of the column or columns to be assigned to the output of
        running the pandas method in transform.

    pd_method_name : str
        The name of the pandas.DataFrame method to call.

    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its
        supported functionality to .transform

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to
        polars/pandas agnostic narwhals framework

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    FITS: bool
        class attribute, indicates whether transform requires fit to be run first

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    deprecated: bool
        indicates if class has been deprecated

    """

    polars_compatible = False

    FITS = False

    jsonable = False

    lazyframe_compatible = False

    deprecated = True

    @beartype
    def __init__(
        self,
        new_column_names: list[str] | str,
        pd_method_name: str,
        columns: NonEmptyListOfStrs | str | None,
        pd_method_kwargs: GenericKwargs | None = None,
        drop_original: bool = False,
        **kwargs: bool | None,
    ) -> None:
        """Init method for class.

        Parameters
        ----------
        new_column_names : str or list of str
            The name of the column or columns to be assigned to the output of
            running the pandas method in transform.

        pd_method_name : str
            The name of the pandas.DataFrame method to call.

        columns : None or list or str
            Columns to apply the transformer to. If a str is passed this is put
            into a list. Value passed in columns is saved in the columns
            attribute on the object. Note this has no default value so the user
            has to specify the columns when initialising the transformer. This
            is to avoid the likely case where the user forgets to set columns,
            in which case all columns would be picked up when super transform
            runs.

        pd_method_kwargs : dict or None, default = None
            A dictionary of keyword arguments to be passed to the pd.DataFrame
            method when it is called. None is treated as an empty dict.

        drop_original : bool, default = False
            Should original columns be dropped?

        **kwargs
            Arbitrary keyword arguments passed onto BaseTransformer.__init__().

        Raises
        ------
            AttributeError: if pd_method_name is not valid pd.DataFrame method

        """
        super().__init__(columns=columns, **kwargs)

        if pd_method_kwargs is None:
            pd_method_kwargs = {}

        self.new_column_names = new_column_names
        self.pd_method_name = pd_method_name
        self.pd_method_kwargs = pd_method_kwargs
        self.drop_original = drop_original

        # look the name up on an empty frame to check it exists; any failure
        # is re-raised as AttributeError
        try:
            df = pd.DataFrame()
            getattr(df, pd_method_name)

        except Exception as err:
            msg = f'{self.classname()}: error accessing "{pd_method_name}" method on pd.DataFrame object - pd_method_name should be a pd.DataFrame method'  # noqa: E501
            raise AttributeError(msg) from err

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Transform input data.

        Uses the given pandas.DataFrame method and assign the output back to
        column or columns in X.

        Any keyword arguments set in the pd_method_kwargs attribute are passed
        onto the pandas DataFrame method when calling it.

        Parameters
        ----------
        X : pd.DataFrame
            Data to transform.

        Returns
        -------
        X : pd.DataFrame
            Input X with additional column or columns (self.new_column_names)
            added. These contain the output of running the pandas DataFrame
            method.

        """
        X = super().transform(X)

        # quick fix for empty frames, not spending much
        # time on this as transformer is deprecated.
        # the new_column_names attr is a bit messy,
        # sometimes str and sometimes list
        # editing init to make it always a list
        # broke other tests which didn't seem worth fixing
        # so have included handling for both cases here..
        if X.empty:
            # hard to know the best dtype to use here given the
            # flexibility of this transformer, which is
            # partially why it was deprecated
            if isinstance(self.new_column_names, list):
                # iterate the names themselves: indexing X with columns that
                # do not exist yet would raise KeyError
                for col in self.new_column_names:
                    X[col] = pd.Series(dtype=float)
            else:
                X[self.new_column_names] = pd.Series(dtype=float)

        else:
            X[self.new_column_names] = getattr(X[self.columns], self.pd_method_name)(
                **self.pd_method_kwargs,
            )

        # Drop original columns if self.drop_original is True
        return DropOriginalMixin.drop_original_column(
            X,
            self.drop_original,
            self.columns,
        )