Source code for tubular.misc

"""Contains legacy transformers for introducing fixed columns and changing dtypes."""

from __future__ import annotations

from enum import Enum
from typing import Annotated, Any

import narwhals as nw
from beartype import beartype
from beartype.vale import Is

from tubular._utils import (
    _convert_dataframe_to_narwhals,
    _return_narwhals_or_native_dataframe,
    block_from_json,
)
from tubular.base import BaseTransformer, register
from tubular.mixins import DropOriginalMixin
from tubular.types import (
    DataFrame,
    ListOfStrs,
)


@register
class SetValueTransformer(BaseTransformer):
    """Transformer that overwrites column(s) with a single constant value.

    Every row of each configured column is replaced by the same value.

    Attributes
    ----------
    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its
        supported functionality to .transform

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to
        polars/pandas agnostic narwhals framework

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    FITS: bool
        class attribute, indicates whether transform requires fit to be run
        first

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    Examples
    --------
    ```pycon
    >>> SetValueTransformer(columns="a", value=1)
    SetValueTransformer(columns=['a'], value=1)

    ```

    """

    FITS = False

    jsonable = True

    polars_compatible = True

    lazyframe_compatible = True

    @beartype
    def __init__(
        self,
        columns: ListOfStrs | str,
        value: int | float | str | bool | None,
        **kwargs: bool,
    ) -> None:
        """Initialise class instance.

        Parameters
        ----------
        columns: list or str
            Columns whose contents will be overwritten.

        value : int, float, str, bool or None
            Constant written to every row of the given columns.

        **kwargs: bool
            Arbitrary keyword arguments passed onto BaseTransformer.init method.

        """
        # value is stored before the base initialiser runs, as in the
        # original implementation
        self.value = value

        super().__init__(columns=columns, **kwargs)

        # nothing is learnt from data, so the instance is usable immediately
        self.is_fitted_ = True

    @block_from_json
    def to_json(self) -> dict[str, dict[str, Any]]:
        """Dump transformer to json dict.

        Returns
        -------
        dict[str, dict[str, Any]]:
            jsonified transformer. Nested dict containing levels for attributes
            set at init and fit.

        Examples
        --------
        ```pycon
        >>> transformer = SetValueTransformer(columns="a", value=1)
        >>> transformer.to_json()
        {'tubular_version': ..., 'classname': 'SetValueTransformer', 'init': {'columns': ['a'], 'copy': False, 'verbose': False, 'return_native': True, 'value': 1}, 'fit': {'is_fitted_': True}}

        ```

        """  # noqa: E501
        serialised = super().to_json()

        # extend the base "init" section with this class's own argument
        init_section = serialised["init"]
        init_section["value"] = self.value

        return serialised

    @beartype
    def transform(self, X: DataFrame) -> DataFrame:
        """Set columns to value.

        Parameters
        ----------
        X : DataFrame
            Data in which the configured columns are overwritten.

        Returns
        -------
        X : DataFrame
            Transformed input X with columns set to value.

        Examples
        --------
        ```pycon
        >>> import polars as pl
        >>> transformer = SetValueTransformer(columns="a", value=1)
        >>> test_df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
        >>> transformer.transform(test_df)
        shape: (3, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i32 ┆ i64 │
        ╞═════╪═════╡
        │ 1   ┆ 4   │
        │ 1   ┆ 5   │
        │ 1   ┆ 6   │
        └─────┴─────┘

        ```

        """
        X = _convert_dataframe_to_narwhals(X)

        # keep the frame as narwhals until the final return
        X = super().transform(X, return_native_override=False)

        # one literal expression per target column, each aliased to that column
        constant_exprs = [nw.lit(self.value).alias(name) for name in self.columns]
        X = X.with_columns(constant_exprs)

        return _return_narwhals_or_native_dataframe(X, self.return_native)
@register
class RenameColumnsTransformer(BaseTransformer, DropOriginalMixin):
    """Transformer to rename a given set of columns.

    This can be useful for personalising the auto-output names from other
    transformers, or for creating a few different versions of a given column
    to undergo separate paths of logic in a pipeline (as the expression logic
    effectively creates duplicates of the column).

    Only the entries of new_column_names whose keys appear in columns are
    used; any additional entries in the mapping are ignored.

    Attributes
    ----------
    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its
        supported functionality to .transform

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to
        polars/pandas agnostic narwhals framework

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    FITS: bool
        class attribute, indicates whether transform requires fit to be run
        first

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    Examples
    --------
    ```pycon
    >>> from pprint import pprint
    >>> transformer = RenameColumnsTransformer(
    ...     columns="a", new_column_names={"a": "new_a"}
    ... )  # noqa: E501
    >>> transformer
    RenameColumnsTransformer(columns=['a'], new_column_names={'a': 'new_a'})

    >>> # transformer can also be dumped to json and reinitialised
    >>> json_dump = transformer.to_json()
    >>> pprint(json_dump, sort_dicts=True)
    {'classname': 'RenameColumnsTransformer',
     'fit': {'is_fitted_': True},
     'init': {'columns': ['a'],
              'copy': False,
              'drop_original': True,
              'new_column_names': {'a': 'new_a'},
              'return_native': True,
              'verbose': False},
     'tubular_version': ...}

    >>> RenameColumnsTransformer.from_json(json_dump)
    RenameColumnsTransformer(columns=['a'], new_column_names={'a': 'new_a'})

    ```

    """

    polars_compatible = True

    lazyframe_compatible = True

    FITS = False

    jsonable = True

    @beartype
    def __init__(
        self,
        columns: ListOfStrs | str,
        new_column_names: dict[str, str],
        drop_original: bool = True,
        **kwargs: bool,
    ) -> None:
        """Initialise class instance.

        Parameters
        ----------
        columns: list or str
            Columns to rename.

        new_column_names: dict[str, str]
            dictionary mapping provided columns to updated names

        drop_original: bool
            indicates whether to drop original columns.

        **kwargs: bool
            Arbitrary keyword arguments passed onto BaseTransformer.init method.

        Raises
        ------
        ValueError:
            if provided columns are not keys of provided new_column_names

        """
        super().__init__(columns=columns, **kwargs)

        # validated against self.columns, which the base initialiser has set
        if any(column not in new_column_names for column in self.columns):
            msg = f"{self.classname()}: all provided columns must appear as keys in new_column_names"  # noqa: E501
            raise ValueError(msg)

        self.new_column_names = new_column_names
        self.drop_original = drop_original

        self.is_fitted_ = True  # Does not fit

    def get_feature_names_out(self) -> list[str]:
        """List features modified/created by the transformer.

        Only the new names of the configured columns are reported, in the
        order of columns. Entries of new_column_names for other columns are
        never applied by transform, so they are not listed.

        Returns
        -------
        list[str]:
            list of features modified/created by the transformer

        Examples
        --------
        ```pycon
        >>> transformer = RenameColumnsTransformer(
        ...     columns=["a", "b"],
        ...     new_column_names={"a": "new_a", "b": "new_b"},
        ... )
        >>> transformer.get_feature_names_out()
        ['new_a', 'new_b']

        ```

        """
        return [self.new_column_names[column] for column in self.columns]

    @block_from_json
    def to_json(self) -> dict[str, dict[str, Any]]:
        """Dump transformer to json dict.

        Returns
        -------
        dict[str, dict[str, Any]]:
            jsonified transformer. Nested dict containing levels for attributes
            set at init and fit.

        Examples
        --------
        ```pycon
        >>> from pprint import pprint
        >>> transformer = RenameColumnsTransformer(
        ...     columns="a", new_column_names={"a": "new_a"}
        ... )  # noqa: E501
        >>> pprint(transformer.to_json(), sort_dicts=True)
        {'classname': 'RenameColumnsTransformer',
         'fit': {'is_fitted_': True},
         'init': {'columns': ['a'],
                  'copy': False,
                  'drop_original': True,
                  'new_column_names': {'a': 'new_a'},
                  'return_native': True,
                  'verbose': False},
         'tubular_version': ...}

        ```

        """
        json_dict = super().to_json()

        json_dict["init"].update(
            {
                "new_column_names": self.new_column_names,
                "drop_original": self.drop_original,
            }
        )

        return json_dict

    @beartype
    def transform(self, X: DataFrame) -> DataFrame:
        """Create renamed copies of columns, optionally dropping the originals.

        Parameters
        ----------
        X : DataFrame
            Data containing the columns to rename.

        Returns
        -------
        X : DataFrame
            Transformed input X with the renamed columns added. The original
            columns are passed to DropOriginalMixin.drop_original_column
            together with drop_original.

        Raises
        ------
        ValueError:
            if any new name for the configured columns is already present in X

        Examples
        --------
        ```pycon
        >>> import polars as pl
        >>> transformer = RenameColumnsTransformer(
        ...     columns="a", new_column_names={"a": "new_a"}
        ... )  # noqa: E501
        >>> test_df = pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
        >>> transformer.transform(test_df)
        shape: (3, 2)
        ┌─────┬───────┐
        │ b   ┆ new_a │
        │ --- ┆ ---   │
        │ i64 ┆ i64   │
        ╞═════╪═══════╡
        │ 4   ┆ 1     │
        │ 5   ┆ 2     │
        │ 6   ┆ 3     │
        └─────┴───────┘

        ```

        """
        # convert first, matching the other transformers in this module
        X = _convert_dataframe_to_narwhals(X)
        X = super().transform(X, return_native_override=False)

        # only names that will actually be written can clash; mapping entries
        # for columns outside self.columns are never applied
        target_names = {self.new_column_names[c] for c in self.columns}
        new_column_names_already_present = sorted(
            target_names.intersection(X.columns)
        )
        if new_column_names_already_present:
            msg = f"{self.classname()}: The following new_column_names are already present in X, {new_column_names_already_present}"  # noqa: E501
            raise ValueError(msg)

        # aliasing adds a copy under the new name; the original column is
        # handled separately by the drop step below
        X = X.with_columns(
            [nw.col(c).alias(self.new_column_names[c]) for c in self.columns]
        )

        X = DropOriginalMixin.drop_original_column(X, self.drop_original, self.columns)

        return _return_narwhals_or_native_dataframe(X, self.return_native)
class SimpleCastDtypes(str, Enum):
    """Allowed dtypes for ColumnDtypeSetter.

    Each member's value is a dtype name as a string. ColumnDtypeSetter looks
    the chosen name up on the narwhals module (``getattr(nw, dtype)``) to
    obtain the dtype it casts to.

    Members also subclass ``str``, so each one compares equal to its value.
    """

    # floating point
    FLOAT64 = "Float64"
    FLOAT32 = "Float32"
    # signed integers
    INT64 = "Int64"
    INT32 = "Int32"
    INT16 = "Int16"
    INT8 = "Int8"
    # unsigned integers
    UINT64 = "UInt64"
    UINT32 = "UInt32"
    UINT16 = "UInt16"
    UINT8 = "UInt8"
    # non-numeric
    BOOLEAN = "Boolean"
    STRING = "String"
    CATEGORICAL = "Categorical"
# Type hint for a plain string that must equal the value of one of the
# SimpleCastDtypes members (e.g. "Float32"). The validator tests membership
# in the enum's value-to-member lookup table.
SimpleCastDtypesStr = Annotated[
    str,
    Is[lambda s: s in SimpleCastDtypes._value2member_map_],
]
@register
class ColumnDtypeSetter(BaseTransformer):
    """Transformer that casts the given columns of a dataframe to one dtype.

    Attributes
    ----------
    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its
        supported functionality to .transform

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to
        polars/pandas agnostic narwhals framework

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    FITS: bool
        class attribute, indicates whether transform requires fit to be run
        first

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    deprecated: bool
        indicates if class has been deprecated

    """

    FITS = False

    jsonable = True

    deprecated = False

    polars_compatible = True

    lazyframe_compatible = True

    @beartype
    def __init__(
        self,
        columns: str | ListOfStrs,
        dtype: SimpleCastDtypesStr,
        **kwargs: bool,
    ) -> None:
        """Initialise class instance.

        Parameters
        ----------
        columns : Union[str, ListOfStrs]
            Columns to set dtype. Must be set or transform will not run.

        dtype : SimpleCastDtypesStr
            dtype to set column to

        **kwargs: bool
            Arbitrary keyword arguments passed onto BaseTransformer.init method.

        """
        super().__init__(columns=columns, **kwargs)

        self.dtype = dtype

        # nothing is learnt from data, so the instance is usable immediately
        self.is_fitted_ = True

    @block_from_json
    def to_json(self) -> dict[str, dict[str, Any]]:
        """Dump transformer to json dict.

        Returns
        -------
        dict[str, dict[str, Any]]:
            jsonified transformer. Nested dict containing levels for attributes
            set at init and fit.

        Examples
        --------
        ```pycon
        >>> from pprint import pprint
        >>> transformer = ColumnDtypeSetter(columns="a", dtype="Float32")
        >>> pprint(transformer.to_json(), sort_dicts=True)
        {'classname': 'ColumnDtypeSetter',
         'fit': {'is_fitted_': True},
         'init': {'columns': ['a'],
                  'copy': False,
                  'dtype': 'Float32',
                  'return_native': True,
                  'verbose': False},
         'tubular_version': ...}

        ```

        """
        serialised = super().to_json()

        # extend the base "init" section with this class's own argument
        init_section = serialised["init"]
        init_section["dtype"] = self.dtype

        return serialised

    def transform(self, X: DataFrame) -> DataFrame:
        """Cast the configured columns to the configured dtype.

        Parameters
        ----------
        X: DataFrame
            data to transform.

        Returns
        -------
        DataFrame:
            transformed data

        Examples
        --------
        ```pycon
        >>> import polars as pl
        >>> df = pl.DataFrame({"a": [1, 2]})
        >>> transformer = ColumnDtypeSetter(columns="a", dtype="Float32")
        >>> transformer.transform(df)
        shape: (2, 1)
        ┌─────┐
        │ a   │
        │ --- │
        │ f32 │
        ╞═════╡
        │ 1.0 │
        │ 2.0 │
        └─────┘

        ```

        """
        X = _convert_dataframe_to_narwhals(X)

        # name of the native library backing X, read before the base transform
        backend = nw.get_native_namespace(X).__name__

        X = super().transform(X, return_native_override=False)

        pandas_boolean_cast = backend == "pandas" and self.dtype == "Boolean"

        if pandas_boolean_cast:
            # pandas + Boolean: each column is first passed through
            # nw.maybe_convert_dtypes and only then cast.
            # NOTE(review): presumably the direct cast is unreliable for some
            # pandas column dtypes - confirm the exact motivation.
            casts = [
                nw.maybe_convert_dtypes(X[col]).cast(nw.Boolean)
                for col in self.columns
            ]
        else:
            # the dtype name is resolved to the narwhals dtype of the same name
            target_dtype = getattr(nw, self.dtype)
            casts = [nw.col(col).cast(target_dtype) for col in self.columns]

        X = X.with_columns(casts)

        return _return_narwhals_or_native_dataframe(X, self.return_native)