Source code for tubular.numeric

"""Contains transformers that apply numeric functions."""

from __future__ import annotations

from typing import TYPE_CHECKING, Any, ClassVar, Literal

import narwhals as nw
import numpy as np
import pandas as pd
from beartype import beartype
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import (
    MaxAbsScaler,
    MinMaxScaler,
    PolynomialFeatures,
    StandardScaler,
)
from typing_extensions import deprecated

from tubular._utils import (
    _convert_dataframe_to_narwhals,
    _return_narwhals_or_native_dataframe,
    block_from_json,
)
from tubular.base import BaseTransformer, DataFrameMethodTransformer, register
from tubular.mixins import (
    CheckNumericMixin,
    DropOriginalMixin,
)
from tubular.types import (
    DataFrame,
    FloatBetweenZeroOne,
    FloatTypeAnnotated,
    GenericKwargs,
    ListOfMoreThanOneStrings,
    ListOfOneStr,
    ListOfTwoStrs,
    PositiveNumber,
    StrictlyPositiveInt,
)

if TYPE_CHECKING:
    from narwhals.typing import FrameT, IntoSeriesT


@register
class BaseNumericTransformer(BaseTransformer, CheckNumericMixin):
    """Extends BaseTransformer for numeric scenarios.

    On top of the parent fit/transform logic, both methods pass the
    configured columns through CheckNumericMixin.check_numeric_columns, so
    child transformers can build on already-validated data.

    Attributes
    ----------
    columns : List[str]
        List of columns to be operated on

    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its
        supported functionality to .transform

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to
        polars/pandas agnostic narwhals framework

    FITS: bool
        class attribute, indicates whether transform requires fit to be run first

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    Examples
    --------
    ```pycon
    >>> BaseNumericTransformer(
    ...     columns="a",
    ... )
    BaseNumericTransformer(columns=['a'])

    ```

    """

    polars_compatible = True

    lazyframe_compatible = True

    jsonable = False

    FITS = False

    def __init__(self, columns: str | list[str], **kwargs: bool) -> None:
        """Initialise class instance.

        Parameters
        ----------
        columns : str or List[str]
            Column(s) to be operated on.

        **kwargs
            Arbitrary keyword arguments passed onto BaseTransformer.init method.

        """
        super().__init__(columns=columns, **kwargs)

    def fit(
        self,
        X: DataFrame,
        y: nw.Series | None = None,
    ) -> BaseNumericTransformer:
        """Validate data and attributes prior to the child objects fit logic.

        Parameters
        ----------
        X : DataFrame
            A dataframe containing the required columns

        y : Series | None
            Required for pipeline.

        Returns
        -------
        BaseNumericTransformer:
            fitted class instance.

        Examples
        --------
        ```pycon
        >>> import polars as pl
        >>> transformer = BaseNumericTransformer(
        ...     columns="a",
        ... )
        >>> test_df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
        >>> transformer.fit(test_df)
        BaseNumericTransformer(columns=['a'])

        ```

        """
        X = _convert_dataframe_to_narwhals(X)

        super().fit(X, y)

        # only the configured columns are handed to the numeric check
        CheckNumericMixin.check_numeric_columns(self, X.select(self.columns))

        return self

    @beartype
    def transform(
        self,
        X: DataFrame,
        return_native_override: bool | None = None,
    ) -> DataFrame:
        """Validate data and attributes prior to the child objects transform logic.

        Parameters
        ----------
        X : DataFrame
            Data to transform.

        return_native_override: Optional[bool]
            Option to override return_native attr in transformer, useful when
            calling parent methods

        Returns
        -------
        X : DataFrame
            Validated data

        Examples
        --------
        ```pycon
        >>> import polars as pl
        >>> transformer = BaseNumericTransformer(
        ...     columns="a",
        ... )
        >>> test_df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})
        >>> # base class has no effect on data
        >>> transformer.transform(test_df)
        shape: (2, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 1   ┆ 3   │
        │ 2   ┆ 4   │
        └─────┴─────┘

        ```

        """
        X = _convert_dataframe_to_narwhals(X)

        # resolve the caller's preference once, before the parent call
        return_native = self._process_return_native(return_native_override)

        # keep the frame in narwhals form so the numeric check below can use it
        X = super().transform(X, return_native_override=False)

        CheckNumericMixin.check_numeric_columns(self, X.select(self.columns))

        return _return_narwhals_or_native_dataframe(X, return_native)
@register
class OneDKmeansTransformer(BaseNumericTransformer, DropOriginalMixin):
    """Generates a new column based on kmeans algorithm.

    Transformer runs the kmeans algorithm based on given number of clusters and
    then identifies the bins' cuts based on the results (the maximum value seen
    in each cluster). Finally it passes them into a cut function (np.digitize).

    Attributes
    ----------
    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its
        supported functionality to .transform

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to
        polars/pandas agnostic narwhals framework

    FITS: bool
        class attribute, indicates whether transform requires fit to be run first

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    Examples
    --------
    ```pycon
    >>> OneDKmeansTransformer(
    ...     columns="a",
    ...     n_clusters=2,
    ...     new_column_name="new",
    ...     drop_original=False,
    ...     kmeans_kwargs={"random_state": 42},
    ... )
    OneDKmeansTransformer(columns=['a'], kmeans_kwargs={'random_state': 42},
                          n_clusters=2, new_column_name='new')

    ```

    """

    polars_compatible = True

    lazyframe_compatible = False

    jsonable = True

    FITS = True

    @block_from_json
    def to_json(self) -> dict[str, dict[str, Any]]:
        """Serialize the transformer to a JSON-compatible dictionary.

        Returns
        -------
        dict[str, dict[str, Any]]:
            JSON representation of the transformer, including init parameters.

        Examples
        --------
        >>> import polars as pl
        >>> x = OneDKmeansTransformer(
        ...     columns='a',
        ...     n_clusters=2,
        ...     new_column_name="new",
        ...     drop_original=False,
        ...     kmeans_kwargs={"random_state": 42},
        ... )
        >>> test_df=pl.DataFrame({'a': [1,2,3,4], 'b': [5,6,7,8]})
        >>> x.fit(test_df)
        OneDKmeansTransformer(columns=['a'], kmeans_kwargs={'random_state': 42},
                              n_clusters=2, new_column_name='new')
        >>> x.to_json()
        {'tubular_version': ..., 'classname': 'OneDKmeansTransformer', 'init': {'columns': ['a'], 'copy': False, 'verbose': False, 'return_native': True, 'new_column_name': 'new', 'n_init': 'auto', 'n_clusters': 2, 'drop_original': False, 'kmeans_kwargs': {'random_state': 42}}, 'fit': {'is_fitted_': True, 'bins': [3, 4]}}

        """
        # the learnt bins are part of the payload, so fit must have run
        self.check_is_fitted(["bins"])

        json_dict = super().to_json()

        json_dict["init"].update(
            {
                "new_column_name": self.new_column_name,
                "n_init": self.n_init,
                "n_clusters": self.n_clusters,
                "drop_original": self.drop_original,
                "kmeans_kwargs": self.kmeans_kwargs,
            },
        )

        json_dict["fit"]["bins"] = self.bins

        return json_dict

    @beartype
    def __init__(  # noqa: PLR0917, PLR0913
        self,
        columns: str | ListOfOneStr,
        new_column_name: str,
        n_init: str | int = "auto",
        n_clusters: int = 8,
        drop_original: bool = False,
        kmeans_kwargs: dict[str, object] | None = None,
        **kwargs: bool,
    ) -> None:
        """Initialise class instance.

        Parameters
        ----------
        columns : str or list[str]
            Name of the column to discretise.

        new_column_name : str
            Name given to the new discrete column.

        n_clusters : int, default = 8
            The number of clusters to form as well as the number of centroids to generate.

        n_init: "auto" or int, default="auto"
            Number of times the k-means algorithm is run with different centroid seeds.
            The final results is the best output of n_init consecutive runs in terms of inertia.
            Several runs are recommended for sparse high-dimensional problems (see
            `Clustering sparse data with k-means <https://scikit-learn.org/stable/auto_examples/text/plot_document_clustering.html#kmeans-sparse-high-dim>`__).
            When n_init='auto', the number of runs depends on the value of init: 10 if using
            init='random' or init is a callable; 1 if using init='k-means++' or init is an
            array-like. (Init is an arg in kmeans_kwargs. If init is not set then it defaults
            to k-means++ so n_init defaults to 1)

        drop_original : bool, default=False
            Should the original columns to be transformed be dropped after applying the
            OneDKmeanstransformer?

        kmeans_kwargs : dict or None, default = None
            A dictionary of keyword arguments to be passed to the sklearn KMeans method
            when it is called in fit. None is treated as an empty dict.

        **kwargs
            Arbitrary keyword arguments passed onto BaseTransformer.init().

        """
        if kmeans_kwargs is None:
            kmeans_kwargs = {}

        self.n_clusters = n_clusters
        self.new_column_name = new_column_name
        self.n_init = n_init
        self.kmeans_kwargs = kmeans_kwargs
        self.drop_original = drop_original

        # a bare column name is normalised to a one-element list
        if isinstance(columns, str):
            self.columns = [columns]
        else:
            self.columns = columns

        super().__init__(columns=self.columns, **kwargs)

    def get_feature_names_out(self) -> list[str]:
        """List features modified/created by the transformer.

        Returns
        -------
        list[str]:
            list of features modified/created by the transformer

        Examples
        --------
        ```pycon
        >>> transformer = OneDKmeansTransformer(
        ...     columns="a",
        ...     n_clusters=2,
        ...     new_column_name="kmeans_column",
        ...     drop_original=False,
        ...     kmeans_kwargs={"random_state": 42},
        ... )
        >>> transformer.get_feature_names_out()
        ['kmeans_column']

        ```

        """
        return [
            self.new_column_name,
        ]

    @block_from_json
    @nw.narwhalify
    def fit(self, X: FrameT, y: IntoSeriesT | None = None) -> OneDKmeansTransformer:
        """Fit transformer to input data.

        Runs KMeans on the configured column and stores, as ``bins``, the sorted
        maximum value observed in each cluster.

        Parameters
        ----------
        X : pd/pl.DataFrame
            Dataframe with the column to learn cluster boundaries from.

        y : None
            Required for pipeline.

        Returns
        -------
        OneDKmeansTransformer:
            Fitted class instance.

        Raises
        ------
        ValueError:
            if columns in X contain missing values.

        Examples
        --------
        ```pycon
        >>> import polars as pl
        >>> transformer = OneDKmeansTransformer(
        ...     columns="a",
        ...     n_clusters=2,
        ...     new_column_name="new",
        ...     drop_original=False,
        ...     kmeans_kwargs={"random_state": 42},
        ... )
        >>> test_df = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
        >>> transformer.fit(test_df)
        OneDKmeansTransformer(columns=['a'], kmeans_kwargs={'random_state': 42},
                              n_clusters=2, new_column_name='new')

        ```

        """
        super().fit(X, y)

        X = nw.from_native(X)

        column = self.columns[0]

        # Check that X does not contain nulls/NaNs and raise ValueError if so.
        if (
            X.select(nw.col(column).is_null().any()).to_numpy().ravel()[0]
            or X.select(nw.col(column).is_nan().any()).to_numpy().ravel()[0]
        ):
            msg = f"{self.classname()}: X should not contain missing values."
            raise ValueError(msg)

        kmeans = KMeans(
            n_clusters=self.n_clusters,
            n_init=self.n_init,
            **self.kmeans_kwargs,
        )

        native_backend = nw.get_native_namespace(X).__name__

        labels = kmeans.fit_predict(X.select(column).to_numpy())

        # The temporary label column is derived from the clustered column's
        # name, so it can never be equal to it. A fixed name such as "groups"
        # would overwrite the data when the clustered column has that name.
        label_column = f"{column}_kmeans_group"

        # only the clustered column is needed to derive the bins
        labelled = X.select(column).with_columns(
            nw.new_series(
                name=label_column,
                values=np.copy(labels),
                backend=native_backend,
            ),
        )

        # one bin edge per cluster: the largest value assigned to it
        self.bins = (
            labelled.group_by(label_column)
            .agg(
                nw.col(column).max(),
            )
            .sort(column)
            .select(column)
            .to_numpy()
            .ravel()
            .tolist()
        )

        self.is_fitted_ = True

        return self

    @nw.narwhalify
    def transform(self, X: FrameT) -> FrameT:
        """Generate from input pd/pl.DataFrame (X) bins based on Kmeans results and add this column to X.

        Parameters
        ----------
        X : pl/pd.DataFrame
            Data to transform.

        Returns
        -------
        X : pl/pd.DataFrame
            Input X with additional cluster column added.

        Examples
        --------
        ```pycon
        >>> import polars as pl
        >>> transformer = OneDKmeansTransformer(
        ...     columns="a",
        ...     n_clusters=2,
        ...     new_column_name="new",
        ...     drop_original=False,
        ...     kmeans_kwargs={"random_state": 42},
        ... )
        >>> test_df = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
        >>> _ = transformer.fit(test_df)
        >>> transformer.transform(test_df)
        shape: (4, 3)
        ┌─────┬─────┬─────┐
        │ a   ┆ b   ┆ new │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╡
        │ 1   ┆ 5   ┆ 0   │
        │ 2   ┆ 6   ┆ 0   │
        │ 3   ┆ 7   ┆ 0   │
        │ 4   ┆ 8   ┆ 1   │
        └─────┴─────┴─────┘

        ```

        """
        X = super().transform(X)

        X = nw.from_native(X)

        native_backend = nw.get_native_namespace(X).__name__

        # right=True: a value equal to a bin edge falls into that edge's bin
        groups = np.digitize(
            X.select(self.columns[0]).to_numpy().ravel(),
            bins=self.bins,
            right=True,
        )

        X = X.with_columns(
            nw.new_series(
                name=self.new_column_name,
                values=groups,
                backend=native_backend,
            ),
        )

        return DropOriginalMixin.drop_original_column(
            X,
            self.drop_original,
            self.columns[0],
        )
@register
class DifferenceTransformer(BaseNumericTransformer):
    """Transformer that subtracts one column from another.

    The second configured column is subtracted from the first and the result
    is stored in a new column named ``<first>_minus_<second>``.

    Attributes
    ----------
    columns : ListOfTwoStrs
        List of exactly two column names to operate on.
        The second column is subtracted from the first.

    new_column_name : str
        Name of the output column, derived from the two input column names.

    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its
        supported functionality to .transform

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to
        polars/pandas agnostic narwhals framework

    FITS: bool
        class attribute, indicates whether transform requires fit to be run first

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    Examples
    --------
    ```pycon
    >>> transformer = DifferenceTransformer(columns=["a", "b"])
    >>> transformer.columns
    ['a', 'b']

    ```

    """

    polars_compatible = True

    FITS = False

    jsonable = True

    lazyframe_compatible = True

    @beartype
    def __init__(
        self,
        columns: ListOfTwoStrs,
        **kwargs: bool | None,
    ) -> None:
        """Initialize the DifferenceTransformer.

        Parameters
        ----------
        columns : ListOfTwoStrs
            List of exactly two column names to operate on.
            The second column is subtracted from the first.

        kwargs: bool
            arguments for base class, e.g. verbose.

        """
        super().__init__(columns=columns, **kwargs)

        # output name is always derived from the two input names
        minuend_name = columns[0]
        subtrahend_name = columns[1]
        self.new_column_name = f"{minuend_name}_minus_{subtrahend_name}"

        # nothing is learnt from data, so the instance counts as fitted at once
        self.is_fitted_ = True

    @beartype
    def transform(
        self,
        X: DataFrame,
    ) -> DataFrame:
        """Add the difference of the two configured columns as a new column.

        Parameters
        ----------
        X : DataFrame
            DataFrame containing the columns to operate on.

        Returns
        -------
        DataFrame
            Transformed DataFrame with the new column containing the
            subtraction results.

        Examples
        --------
        ```pycon
        >>> import polars as pl
        >>> transformer = DifferenceTransformer(columns=["a", "b"])
        >>> test_df = pl.DataFrame({"a": [100, 200, 300], "b": [80, 150, 200]})
        >>> transformer.transform(test_df)
        shape: (3, 3)
        ┌─────┬─────┬───────────┐
        │ a   ┆ b   ┆ a_minus_b │
        │ --- ┆ --- ┆ ---       │
        │ i64 ┆ i64 ┆ i64       │
        ╞═════╪═════╪═══════════╡
        │ 100 ┆ 80  ┆ 20        │
        │ 200 ┆ 150 ┆ 50        │
        │ 300 ┆ 200 ┆ 100       │
        └─────┴─────┴───────────┘

        ```

        """
        frame = _convert_dataframe_to_narwhals(X)
        frame = super().transform(frame, return_native_override=False)

        minuend = nw.col(self.columns[0])
        subtrahend = nw.col(self.columns[1])
        difference = (minuend - subtrahend).alias(self.new_column_name)

        frame = frame.with_columns(difference)

        return _return_narwhals_or_native_dataframe(frame, self.return_native)

    def get_feature_names_out(self) -> list[str]:
        """Get the names of the output features.

        Returns
        -------
        list[str]
            List containing the name of the new column created by the
            transformation.

        """
        first, second = self.columns[0], self.columns[1]
        return [f"{first}_minus_{second}"]
@register
class RatioTransformer(BaseNumericTransformer):
    """Transformer that divides one column by another.

    The first configured column is divided by the second and the result is
    stored in a new column named ``<first>_divided_by_<second>``. Rows where
    the denominator equals 0 receive a null instead of a quotient.

    Attributes
    ----------
    columns : ListOfTwoStrs
        List of exactly two column names to operate on. The first column is
        the numerator, and the second column is the denominator.

    return_dtype : str
        The dtype of the resulting column, either 'Float32' or 'Float64'.

    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its
        supported functionality to .transform

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to
        polars/pandas agnostic narwhals framework

    FITS: bool
        class attribute, indicates whether transform requires fit to be run first

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    Examples
    --------
    ```pycon
    >>> transformer = RatioTransformer(columns=["a", "b"], return_dtype="Float32")
    >>> transformer.columns
    ['a', 'b']
    >>> transformer.return_dtype
    'Float32'

    ```

    """

    polars_compatible = True

    FITS = False

    jsonable = True

    lazyframe_compatible = True

    @block_from_json
    def to_json(self) -> dict[str, dict[str, Any]]:
        """Serialize the transformer to a JSON-compatible dictionary.

        Returns
        -------
        dict[str, dict[str, Any]]:
            JSON representation of the transformer, including init parameters.

        Examples
        --------
        ```pycon
        >>> ratio_transformer = RatioTransformer(columns=["a", "b"], return_dtype="Float32")
        >>> ratio_transformer.to_json()
        {'tubular_version': ..., 'classname': 'RatioTransformer', 'init': {'columns': ['a', 'b'], 'copy': False, 'verbose': False, 'return_native': True, 'return_dtype': 'Float32'}, 'fit': {'is_fitted_': True}}

        ```

        """
        payload = super().to_json()
        # return_dtype is the only init argument the parent does not know about
        payload["init"]["return_dtype"] = self.return_dtype
        return payload

    @beartype
    def __init__(
        self,
        columns: ListOfTwoStrs,
        return_dtype: FloatTypeAnnotated = "Float32",
        **kwargs: bool | None,
    ) -> None:
        """Initialize the RatioTransformer.

        Parameters
        ----------
        columns : ListOfTwoStrs
            List of exactly two column names to operate on. The first column
            is the numerator, and the second column is the denominator.

        return_dtype : str, default='Float32'
            The dtype of the resulting column, either 'Float32' or 'Float64'.

        kwargs: bool
            arguments for base class, e.g. verbose

        """
        super().__init__(columns=columns, **kwargs)

        self.return_dtype = return_dtype

        # nothing is learnt from data, so the instance counts as fitted at once
        self.is_fitted_ = True

    @beartype
    def transform(
        self,
        X: DataFrame,
    ) -> DataFrame:
        """Add the ratio of the two configured columns as a new column.

        Parameters
        ----------
        X : DataFrame
            DataFrame containing the columns to operate on.

        Returns
        -------
        DataFrame
            Transformed DataFrame with the new column containing the division
            results.

        Examples
        --------
        ```pycon
        >>> import polars as pl
        >>> transformer = RatioTransformer(columns=["a", "b"], return_dtype="Float32")
        >>> test_df = pl.DataFrame({"a": [100, 200, 300], "b": [80, 150, 200]})
        >>> transformer.transform(test_df)
        shape: (3, 3)
        ┌─────┬─────┬────────────────┐
        │ a   ┆ b   ┆ a_divided_by_b │
        │ --- ┆ --- ┆ ---            │
        │ i64 ┆ i64 ┆ f32            │
        ╞═════╪═════╪════════════════╡
        │ 100 ┆ 80  ┆ 1.25           │
        │ 200 ┆ 150 ┆ 1.333333       │
        │ 300 ┆ 200 ┆ 1.5            │
        └─────┴─────┴────────────────┘

        ```

        """
        frame = _convert_dataframe_to_narwhals(X)
        frame = super().transform(frame, return_native_override=False)

        numerator_name, denominator_name = self.columns[0], self.columns[1]

        # a zero denominator yields null rather than a division result
        guarded_quotient = (
            nw.when(nw.col(denominator_name) != 0)
            .then(nw.col(numerator_name) / nw.col(denominator_name))
            .otherwise(None)
        )
        ratio = guarded_quotient.cast(getattr(nw, self.return_dtype))

        output_name = f"{numerator_name}_divided_by_{denominator_name}"
        frame = frame.with_columns(ratio.alias(output_name))

        return _return_narwhals_or_native_dataframe(frame, self.return_native)

    def get_feature_names_out(self) -> list[str]:
        """Get the names of the output features.

        Returns
        -------
        list[str]
            List containing the name of the new column created by the
            transformation.

        """
        numerator_name, denominator_name = self.columns[0], self.columns[1]
        return [f"{numerator_name}_divided_by_{denominator_name}"]
# DEPRECATED TRANSFORMERS
@deprecated(
    """This transformer has not been selected for conversion to polars/narwhals, and so has been deprecated. If it is useful to you, please raise an issue for it to be modernised """,
)
class LogTransformer(BaseNumericTransformer, DropOriginalMixin):
    """Transformer to apply log transformation.

    Transformer has the option to add 1 to the columns to log and drop the
    original columns.

    Attributes
    ----------
    base : None or float/int
        Base of the logarithm; None means natural log.

    add_1 : bool
        Whether 1 is added to the columns before they are logged.

    drop_original : bool
        Whether the original columns are dropped after the transform.

    suffix : str
        The suffix to add onto the end of column names for new columns
        (joined to the column name with an underscore).

    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its
        supported functionality to .transform

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to
        polars/pandas agnostic narwhals framework

    FITS: bool
        class attribute, indicates whether transform requires fit to be run first

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    deprecated: bool
        indicates if class has been deprecated

    """

    polars_compatible = False

    lazyframe_compatible = False

    jsonable = False

    FITS = False

    deprecated = True

    @beartype
    def __init__(
        self,
        columns: str | list[str] | None,
        base: PositiveNumber | None = None,
        add_1: bool = False,
        drop_original: bool = True,
        suffix: str = "log",
        **kwargs: bool,
    ) -> None:
        """Initialise class instance.

        Parameters
        ----------
        columns : None or str or list
            Columns to log transform.

        base : None or float/int
            Base for log transform. If None uses natural log.

        add_1 : bool
            Should a constant of 1 be added to the columns to be transformed
            prior to applying the log transform?

        drop_original : bool
            Should the original columns to be transformed be dropped after
            applying the log transform?

        suffix : str, default = 'log'
            The suffix to add onto the end of column names for new columns.

        kwargs: bool
            Arbitrary keyword arguments passed onto BaseTransformer.init method.

        """
        super().__init__(columns=columns, **kwargs)

        self.base = base
        self.add_1 = add_1
        self.suffix = suffix
        self.drop_original = drop_original

    def get_feature_names_out(self) -> list[str]:
        """List features modified/created by the transformer.

        Returns
        -------
        list[str]:
            list of features modified/created by the transformer

        """
        suffix = self.suffix
        return [f"{name}_{suffix}" for name in self.columns]

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Apply the log transform to the specified columns.

        If the drop_original attribute is True then the original columns are
        dropped. If the add_1 attribute is True then the original columns + 1
        are logged.

        Parameters
        ----------
        X : pd.DataFrame
            The dataframe to be transformed.

        Returns
        -------
        X : pd.DataFrame
            The dataframe with the specified columns logged, optionally
            dropping the original columns if self.drop_original is True.

        Raises
        ------
        ValueError:
            if the columns contain values the log is not defined for
            (<= 0, or <= -1 when add_1 is set).

        """
        X = super().transform(X)

        output_names = self.get_feature_names_out()

        # the two modes differ only in the lower bound and the log function
        if self.add_1:
            lower_bound, log_function = -1, np.log1p
        else:
            lower_bound, log_function = 0, np.log

        if (X[self.columns] <= lower_bound).sum().sum() > 0:
            if self.add_1:
                msg = f"{self.classname()}: values less than or equal to 0 in columns (after adding 1), make greater than 0 before using transform"
            else:
                msg = f"{self.classname()}: values less than or equal to 0 in columns, make greater than 0 before using transform"
            raise ValueError(msg)

        logged = log_function(X[self.columns])

        # change of base: log_b(x) = ln(x) / ln(b)
        if self.base is not None:
            logged = logged / np.log(self.base)

        X[output_names] = logged

        return DropOriginalMixin.drop_original_column(
            X,
            self.drop_original,
            self.columns,
        )
@deprecated(
    """This transformer has not been selected for conversion to polars/narwhals, and so has been deprecated. If it is useful to you, please raise an issue for it to be modernised """,
)
class CutTransformer(BaseNumericTransformer):
    """Class to bin a column into discrete intervals.

    Class simply uses the
    [pd.cut](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.cut.html)
    method on the specified column.

    Attributes
    ----------
    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its
        supported functionality to .transform

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to
        polars/pandas agnostic narwhals framework

    FITS: bool
        class attribute, indicates whether transform requires fit to be run first

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    deprecated: bool
        indicates if class has been deprecated

    """

    polars_compatible = False

    lazyframe_compatible = False

    jsonable = False

    FITS = False

    deprecated = True

    @beartype
    def __init__(
        self,
        column: str,
        new_column_name: str,
        cut_kwargs: GenericKwargs | None = None,
        **kwargs: bool,
    ) -> None:
        """Initialise class instance.

        Parameters
        ----------
        column : str
            Name of the column to discretise.

        new_column_name : str
            Name given to the new discrete column.

        cut_kwargs : dict or None, default = None
            A dictionary of keyword arguments to be passed to the pd.cut
            method when it is called in transform. None is treated as an
            empty dict.

        **kwargs
            Arbitrary keyword arguments passed onto BaseTransformer.init().

        """
        self.cut_kwargs = {} if cut_kwargs is None else cut_kwargs

        self.new_column_name = new_column_name

        # This attribute is not for use in any method, use 'columns' instead.
        # Here only as a fix to allow string representation of transformer.
        self.column = column

        super().__init__(columns=[column], **kwargs)

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Discretise specified column using pd.cut.

        Parameters
        ----------
        X : pd.DataFrame
            Data with column to transform.

        Returns
        -------
        pd.DataFrame:
            Dataframe with binned column

        """
        X = super().transform(X)

        # quick fix for empty frames, not spending much
        # time on this as transformer is deprecated
        if X.empty:
            binned = pd.Series(dtype=float)
        else:
            source_values = X[self.columns[0]].to_numpy()
            binned = pd.cut(source_values, **self.cut_kwargs)

        X[self.new_column_name] = binned

        return X
@deprecated(
    """This transformer has not been selected for conversion to polars/narwhals, and so has been deprecated. If aspects of it have been useful to you, please raise an issue for it to be replaced with more specific transformers """,
)
class TwoColumnOperatorTransformer(
    DataFrameMethodTransformer,
    BaseNumericTransformer,
):
    """Applies a pandas.DataFrame method to two columns (add, sub, mul, div, mod, pow).

    Transformer assigns the output of the method to a new column. The method
    will be applied in the form (column 1)operator(column 2), so order matters
    (if the method does not commute). It is possible to supply other key word
    arguments to the transform method, which will be passed to the
    pandas.DataFrame method being called.

    Attributes
    ----------
    pd_method_name : str
        The name of the pandas.DataFrame method to be called.

    columns : list
        list containing two string items: [column1_name, column2_name]
        The first will be operated upon by the chosen pandas method using
        the second.

    column1_name : str
        The name of the 1st column in the operation.

    column2_name : str
        The name of the 2nd column in the operation.

    new_column_name : str
        The name of the new column that the output is assigned to.

    pd_method_kwargs : dict
        Dictionary of method kwargs to be passed to pandas.DataFrame method.

    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its
        supported functionality to .transform

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to
        polars/pandas agnostic narwhals framework

    FITS: bool
        class attribute, indicates whether transform requires fit to be run first

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    deprecated: bool
        indicates if class has been deprecated

    """

    polars_compatible = False

    lazyframe_compatible = False

    jsonable = False

    FITS = False

    deprecated = True

    @beartype
    def __init__(
        self,
        pd_method_name: str,
        columns: ListOfTwoStrs,
        new_column_name: str,
        pd_method_kwargs: dict[str, object] | None = None,
        **kwargs: bool | None,
    ) -> None:
        """Initialise class instance.

        Parameters
        ----------
        pd_method_name : str
            The name of the pandas.DataFrame method to be called.

        columns: list[str]
            columns to operate on

        new_column_name : str
            The name of the new column that the output is assigned to.

        pd_method_kwargs : dict, default = {'axis':0}
            Dictionary of method kwargs to be passed to pandas.DataFrame
            method. Must contain an entry for axis, set to either 1 or 0.

        **kwargs :
            Arbitrary keyword arguments passed onto BaseTransformer.__init__().

        Raises
        ------
        ValueError:
            if axis=0 or axis=1 missing from pd_method_kwargs

        """
        if pd_method_kwargs is None:
            pd_method_kwargs = {"axis": 0}
        elif "axis" not in pd_method_kwargs:
            msg = f'{self.classname()}: pd_method_kwargs must contain an entry "axis" set to 0 or 1'
            raise ValueError(msg)
        elif pd_method_kwargs["axis"] not in {0, 1}:
            msg = f"{self.classname()}: pd_method_kwargs 'axis' must be 0 or 1"
            raise ValueError(msg)

        self.new_column_name = new_column_name

        # call DataFrameMethodTransformer.__init__
        # This class will inherit all the below attributes from DataFrameMethodTransformer
        super().__init__(
            columns=columns,
            pd_method_name=pd_method_name,
            pd_method_kwargs=pd_method_kwargs,
            new_column_names=new_column_name,
            **kwargs,
        )

        first_column, second_column = columns[0], columns[1]
        self.column1_name = first_column
        self.column2_name = second_column

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Transform input data by applying the chosen method to the two specified columns.

        Parameters
        ----------
        X : pd.DataFrame
            Data to transform.

        Returns
        -------
        pd.DataFrame:
            Input X with an additional column.

        """
        # call appropriate parent transforms; each super(...) call starts the
        # method lookup after the named class in the MRO
        X = super(DataFrameMethodTransformer, self).transform(X)
        X = super(BaseNumericTransformer, self).transform(X)

        # the method is looked up on a one-column DataFrame (double brackets)
        # and applied against the second column as a Series
        left_frame = X[[self.column1_name]]
        bound_method = getattr(left_frame, self.pd_method_name)

        X[self.new_column_name] = bound_method(
            X[self.column2_name],
            **self.pd_method_kwargs,
        )

        return X
@deprecated(
    """This transformer has not been selected for conversion to polars/narwhals, and so has been deprecated. If it is useful to you, please raise an issue for it to be modernised """,
)
class ScalingTransformer(BaseNumericTransformer):
    """Transformer to perform scaling of numeric columns.

    Transformer can apply min max scaling, max absolute scaling or
    standardisation (subtract mean and divide by std). The transformer uses
    the appropriate sklearn.preprocessing scaler.

    Attributes
    ----------
    scaler : MinMaxScaler, MaxAbsScaler or StandardScaler
        The sklearn scaler instance built in init from scaler_type and
        scaler_kwargs.

    scaler_type : str
        Key into scaler_options that was used to pick the scaler class.

    scaler_kwargs : dict
        Keyword arguments the scaler was initialised with.

    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its
        supported functionality to .transform

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to
        polars/pandas agnostic narwhals framework

    FITS: bool
        class attribute, indicates whether transform requires fit to be run first

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    deprecated: bool
        indicates if class has been deprecated

    """

    polars_compatible = False

    lazyframe_compatible = False

    jsonable = False

    FITS = True

    deprecated = True

    # Dictionary mapping scaler types to their corresponding sklearn classes.
    # The values are the classes themselves (instantiated in __init__), hence type[...].
    scaler_options: ClassVar[
        dict[str, type[MinMaxScaler | MaxAbsScaler | StandardScaler]]
    ] = {
        "min_max": MinMaxScaler,
        "max_abs": MaxAbsScaler,
        "standard": StandardScaler,
    }

    def __init__(
        self,
        columns: str | list[str] | None,
        scaler_type: str,
        scaler_kwargs: dict[str, object] | None = None,
        **kwargs: bool,
    ) -> None:
        """Initialise class instance.

        Parameters
        ----------
        columns : str, list or None
            Name of the columns to apply scaling to.

        scaler_type : str
            Type of scaler to use, must be one of 'min_max', 'max_abs' or
            'standard'. The corresponding sklearn.preprocessing scaler used in
            each case is MinMaxScaler, MaxAbsScaler or StandardScaler.

        scaler_kwargs : dict or None, default = None
            A dictionary of keyword arguments to be passed to the scaler
            object when it is initialised. None is treated as an empty dict.

        **kwargs
            Arbitrary keyword arguments passed onto BaseTransformer.init().

        Raises
        ------
        TypeError:
            if scaler_kwargs is not dict with str keys

        ValueError:
            if scaler_type is invalid

        """
        if scaler_kwargs is None:
            scaler_kwargs = {}

        # Validate scaler_kwargs type
        if not isinstance(scaler_kwargs, dict):
            msg = f"{self.classname()}: scaler_kwargs should be a dict but got type {type(scaler_kwargs)}"
            raise TypeError(msg)

        for i, k in enumerate(scaler_kwargs):
            if not isinstance(k, str):
                msg = f"{self.classname()}: unexpected type ({type(k)}) for scaler_kwargs key in position {i}, must be str"
                raise TypeError(msg)

        # Validate scaler_type
        if scaler_type not in self.scaler_options:
            allowed_scaler_values = list(self.scaler_options.keys())
            msg = f"{self.classname()}: scaler_type should be one of; {allowed_scaler_values}"
            raise ValueError(msg)

        # Initialize scaler using the dictionary
        self.scaler = self.scaler_options[scaler_type](**scaler_kwargs)

        # These attributes are not for use in any method.
        # Here only as a fix to allow string representation of transformer.
        self.scaler_kwargs = scaler_kwargs
        self.scaler_type = scaler_type

        super().__init__(columns=columns, **kwargs)

    def fit(self, X: pd.DataFrame, y: pd.Series | None = None) -> ScalingTransformer:
        """Fit scaler to input data.

        Parameters
        ----------
        X : pd.DataFrame
            Dataframe with columns to learn scaling values from.

        y : None
            Required for pipeline.

        Returns
        -------
        ScalingTransformer:
            fitted class instance.

        """
        super().fit(X, y)

        # with no columns configured there is nothing for the scaler to learn
        if self.columns:
            self.scaler.fit(X[self.columns])

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Transform input data X with fitted scaler.

        Parameters
        ----------
        X : pd.DataFrame
            Dataframe containing columns to be scaled.

        Returns
        -------
        X : pd.DataFrame
            Input X with columns scaled.

        """
        X = super().transform(X)

        # quick fix for empty frames, not spending much
        # time on this as transformer is deprecated
        if self.columns:
            if X.empty:
                # the scaler is skipped; columns are replaced by empty float series
                for col in self.columns:
                    X[col] = pd.Series(dtype=float)
            else:
                X[self.columns] = self.scaler.transform(X[self.columns])

        return X
@deprecated(
    """This transformer has not been selected for conversion to polars/narwhals, and so has been deprecated. If it is useful to you, please raise an issue for it to be modernised """,
)
class InteractionTransformer(BaseNumericTransformer):
    """Generates interaction features.

    Transformer generates a new column for all combinations from the selected
    columns up to the maximum degree provided. (For sklearn version 1.0.0 or
    higher, only interactions of a degree higher or equal to the minimum
    degree are computed.) Each interaction column consists of the product of
    the specific combination of columns.

    Ex: with 3 columns provided ["a","b","c"], if max degree is 3, the total
    possible combinations are :
    - of degree 1 : ["a","b","c"]
    - of degree 2 : ["a b","b c","a c"]
    - of degree 3 : ["a b c"].

    Attributes
    ----------
    min_degree : int
        minimum degree of interaction features to be considered

    max_degree : int
        maximum degree of interaction features to be considered

    nb_features_to_interact : int
        number of selected columns from which interactions should be
        computed. (=len(columns))

    nb_combinations : int
        number of new interaction features. -1 until transform has run.

    interaction_colname : list
        names of each new interaction feature. The name of an interaction
        feature is the combination of previous column names joined with a
        whitespace. Interaction feature of ["col1","col2","col3"] would be
        "col1 col2 col3". Empty until transform has run.

    nb_feature_out : int
        number of total columns of transformed dataset, including new
        interaction features. -1 until transform has run.

    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits
        its supported functionality to .transform

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to
        polars/pandas agnostic narwhals framework

    FITS: bool
        class attribute, indicates whether transform requires fit to be run
        first

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json
        methods

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    deprecated: bool
        indicates if class has been deprecated

    """

    polars_compatible = False

    lazyframe_compatible = False

    jsonable = False

    FITS = False

    deprecated = True

    # smallest allowed min_degree: an interaction needs at least two columns
    MIN_DEGREE_VALUE = 2

    @beartype
    def __init__(
        self,
        columns: ListOfMoreThanOneStrings,
        min_degree: int = 2,
        max_degree: int = 2,
        **kwargs: bool,
    ) -> None:
        """Initialise class instance.

        Parameters
        ----------
        columns : list of str
            Columns to build interactions from; more than one column is
            required. Note this has no default value so the user has to
            specify the columns when initialising the transformer.

        min_degree : int
            minimum degree of interaction features to be considered. For
            example if min_degree=3, only interaction columns from at least
            3 columns would be generated. NB- only applies if sklearn
            version is 1.0.0>=

        max_degree : int
            maximum degree of interaction features to be considered. For
            example if max_degree=3, only interaction columns from up to 3
            columns would be generated.

        kwargs:
            arguments for base class, e.g. verbose.

        Raises
        ------
        ValueError: if min_degree is lower than 2

        ValueError: if max_degree is lower than min_degree

        ValueError: if max_degree is greater than len(columns)

        """
        super().__init__(columns=columns, **kwargs)

        if min_degree < self.MIN_DEGREE_VALUE:
            msg = f"{self.classname()}: min_degree must be equal or greater than 2, got {min_degree}"
            raise ValueError(msg)

        if min_degree > max_degree:
            msg = f"{self.classname()}: max_degree must be equal or greater than min_degree"
            raise ValueError(msg)

        if max_degree > len(columns):
            msg = f"{self.classname()}: max_degree must be equal or lower than number of columns"
            raise ValueError(msg)

        self.min_degree = min_degree
        self.max_degree = max_degree

        self.nb_features_to_interact = len(self.columns)

        # placeholders, populated by transform
        self.nb_combinations = -1
        self.interaction_colname = []
        self.nb_feature_out = -1

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Generate interaction features using the "product" pandas.DataFrame method.

        Also updates the nb_combinations, interaction_colname and
        nb_feature_out attributes.

        Parameters
        ----------
        X : pd.DataFrame
            Data to transform.

        Returns
        -------
        X : pd.DataFrame
            Input X with additional column or columns
            (self.interaction_colname) added. Each new column is the
            row-wise product of one combination of the configured columns;
            nulls are not skipped (skipna=False).

        Raises
        ------
        TypeError: if PolynomialFeatures._combinations rejects its arguments
            for a reason other than not supporting min_degree

        """
        X = super().transform(X)

        try:
            combination_indices = PolynomialFeatures._combinations(
                n_features=self.nb_features_to_interact,
                min_degree=self.min_degree,
                max_degree=self.max_degree,
                interaction_only=True,
                include_bias=False,
            )
        except TypeError as err:
            # Fall back to the signature without min_degree. The message is
            # matched by substring because newer Python versions prefix it
            # with the qualified function name.
            if "unexpected keyword argument 'min_degree'" not in str(err):
                raise
            combination_indices = PolynomialFeatures._combinations(
                n_features=self.nb_features_to_interact,
                degree=self.max_degree,
                interaction_only=True,
                include_bias=False,
            )

        # translate positional indices into the configured column names
        combination_columns = [
            [self.columns[col_idx] for col_idx in combination]
            for combination in combination_indices
        ]

        self.nb_combinations = len(combination_columns)
        self.interaction_colname = [" ".join(cols) for cols in combination_columns]

        for new_name, cols in zip(self.interaction_colname, combination_columns):
            X[new_name] = X[cols].product(axis=1, skipna=False)

        # count columns (not rows) of the transformed frame
        self.nb_feature_out = X.shape[1]

        return X
@deprecated(
    """This transformer has not been selected for conversion to polars/narwhals, and so has been deprecated. If it is useful to you, please raise an issue for it to be modernised """,
)
class PCATransformer(BaseNumericTransformer):
    """Generates variables using Principal component analysis (PCA).

    Linear dimensionality reduction using Singular Value Decomposition of the
    data to project it to a lower dimensional space.

    It is based on sklearn class sklearn.decomposition.PCA

    Attributes
    ----------
    pca : PCA class from sklearn.decomposition

    n_components : int, float or 'mle'
        value passed to the PCA object as n_components.

    svd_solver : str
        value passed to the PCA object as svd_solver.

    random_state : int or None
        value passed to the PCA object as random_state.

    pca_column_prefix : str
        prefix of the generated column names.

    n_components_ : int or None
        The estimated number of components, copied from the fitted PCA
        object. None until fit has run.

    feature_names_out: list or None
        list of feature names representing the new dimensions
        (pca_column_prefix followed by the component index). None until fit
        has run.

    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits
        its supported functionality to .transform

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to
        polars/pandas agnostic narwhals framework

    FITS: bool
        class attribute, indicates whether transform requires fit to be run
        first

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json
        methods

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    deprecated: bool
        indicates if class has been deprecated

    """

    polars_compatible = False

    lazyframe_compatible = False

    jsonable = False

    FITS = True

    deprecated = True

    @beartype
    def __init__(
        self,
        columns: str | ListOfMoreThanOneStrings | None,
        n_components: StrictlyPositiveInt | FloatBetweenZeroOne | Literal["mle"] = 2,
        svd_solver: Literal["auto", "full", "arpack", "randomized"] = "auto",
        random_state: int | None = None,
        pca_column_prefix: str = "pca_",
        **kwargs: bool,
    ) -> None:
        """Initialise class instance.

        Parameters
        ----------
        columns : None or list or str
            Columns to apply the transformer to. Value passed in columns is
            handed to the parent class __init__. Note this has no default
            value so the user has to specify the columns when initialising
            the transformer.

        n_components : int, float or 'mle', default=2
            Number of components to keep, passed to sklearn's PCA.

            If ``n_components == 'mle'`` and ``svd_solver == 'full'``,
            Minka's MLE is used to guess the dimension. Use of
            ``n_components == 'mle'`` will interpret ``svd_solver == 'auto'``
            as ``svd_solver == 'full'``.

            If ``0 < n_components < 1`` and ``svd_solver == 'full'``, select
            the number of components such that the amount of variance that
            needs to be explained is greater than the percentage specified
            by n_components.

            If ``svd_solver == 'arpack'``, the number of components must be
            strictly less than the minimum of n_features and n_samples.

        svd_solver : {'auto', 'full', 'arpack', 'randomized'}, default='auto'
            If auto :
                The solver is selected by a default policy based on
                `X.shape` and `n_components`: if the input data is larger
                than 500x500 and the number of components to extract is
                lower than 80% of the smallest dimension of the data, then
                the more efficient 'randomized' method is enabled. Otherwise
                the exact full SVD is computed and optionally truncated
                afterwards.
            If full :
                run exact full SVD calling the standard LAPACK solver via
                `scipy.linalg.svd` and select the components by
                postprocessing
            If arpack :
                run SVD truncated to n_components calling ARPACK solver via
                `scipy.sparse.linalg.svds`. It requires strictly
                0 < n_components < min(X.shape)
            If randomized :
                run randomized SVD by the method of Halko et al.

        random_state : int or None, default=None
            Used when the 'arpack' or 'randomized' solvers are used. Pass an
            int for reproducible results across multiple function calls.

        pca_column_prefix : str, default="pca_"
            prefix added to each of the n components features generated.
            example: if n_components = 3, new columns would be
            'pca_0','pca_1','pca_2'.

        kwargs:
            arguments for base class, e.g. verbose

        Raises
        ------
        ValueError: if n_components is 'mle' and svd_solver is 'arpack'.

        TypeError: if n_components is a float and svd_solver is 'randomized'
            or 'arpack'.

        Arguments that do not match the annotated types are rejected by the
        beartype decorator before this body runs.

        """
        super().__init__(columns=columns, **kwargs)

        self.n_components = n_components
        self.svd_solver = svd_solver
        self.random_state = random_state
        self.pca_column_prefix = pca_column_prefix

        if (svd_solver == "arpack") and (n_components == "mle"):
            msg = f"{self.classname()}: n_components='mle' cannot be a string with svd_solver='arpack'"
            raise ValueError(msg)

        if (svd_solver in {"randomized", "arpack"}) and isinstance(
            n_components,
            float,
        ):
            msg = f"{self.classname()}: n_components {n_components} cannot be a float with svd_solver='{svd_solver}'"
            raise TypeError(msg)

        self.pca = PCA(
            n_components=self.n_components,
            svd_solver=self.svd_solver,
            random_state=self.random_state,
        )

        # populated by fit
        self.feature_names_out = None
        self.n_components_ = None

    def fit(self, X: pd.DataFrame, y: pd.Series | None = None) -> PCATransformer:
        """Fit PCA to input data.

        Parameters
        ----------
        X : pd.DataFrame
            Dataframe with columns to fit the PCA on.

        y : None
            Required for pipeline.

        Returns
        -------
        PCATransformer:
            fitted class instance.

        Raises
        ------
        ValueError: if n_components is not 'mle' and is not in the range
            (0, min(n_samples, n_features)] for the selected columns

        """
        super().fit(X, y)

        X = CheckNumericMixin.check_numeric_columns(self, X)

        if self.n_components != "mle":
            n_samples, n_features = X[self.columns].shape
            max_components = min(n_samples, n_features)

            if not 0 < self.n_components <= max_components:
                msg = f"{self.classname()}: n_components {self.n_components} must be between 1 and min(n_samples {n_samples}, n_features {n_features}) is {max_components} with svd_solver '{self.svd_solver}'"
                raise ValueError(msg)

        self.pca.fit(X[self.columns])

        # the fitted count can differ from the requested n_components
        # (e.g. 'mle' or a float), so names are built from the fitted value
        self.n_components_ = self.pca.n_components_
        self.feature_names_out = [
            self.pca_column_prefix + str(i) for i in range(self.n_components_)
        ]

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Generate PCA features from X and add them to X as new columns.

        Parameters
        ----------
        X : pd.DataFrame
            Data to transform.

        Returns
        -------
        X : pd.DataFrame
            Input X with additional column or columns
            (self.feature_names_out) added. These contain the output of the
            fitted PCA object's transform on the configured columns.

        """
        X = super().transform(X)
        X = CheckNumericMixin.check_numeric_columns(self, X)

        # quick fix for empty frames, not spending much
        # time on this as transformer is deprecated
        if X.empty:
            # the PCA object is not called on an empty frame; empty float
            # columns are added so the output columns are still present
            for col in self.feature_names_out:
                X[col] = pd.Series(dtype=float)
        else:
            X[self.feature_names_out] = self.pca.transform(X[self.columns])

        return X