Source code for tubular.numeric

"""Contains transformers that apply numeric functions."""

from __future__ import annotations

from typing import TYPE_CHECKING, Any, ClassVar, Literal

import narwhals as nw
import numpy as np
import pandas as pd
from beartype import beartype
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import (
    MaxAbsScaler,
    MinMaxScaler,
    PolynomialFeatures,
    StandardScaler,
)
from typing_extensions import deprecated

from tubular._utils import (
    _convert_dataframe_to_narwhals,
    _return_narwhals_or_native_dataframe,
    block_from_json,
)
from tubular.base import BaseTransformer, DataFrameMethodTransformer, register
from tubular.functions.numeric import (
    get_difference_of_two_columns,
    get_ratio_of_two_columns,
)
from tubular.mixins import (
    CheckNumericMixin,
    DropOriginalMixin,
)
from tubular.types import (
    DataFrame,
    FloatBetweenZeroOne,
    FloatTypeAnnotated,
    GenericKwargs,
    ListOfMoreThanOneStrings,
    ListOfOneStr,
    ListOfTwoStrs,
    PositiveNumber,
    StrictlyPositiveInt,
)

if TYPE_CHECKING:
    from narwhals.typing import FrameT, IntoSeriesT



[docs]
@register
class BaseNumericTransformer(BaseTransformer, CheckNumericMixin):
    """Extends BaseTransformer for numeric scenarios.

    Both fit and transform run the parent BaseTransformer logic and then
    check that the configured columns are numeric via CheckNumericMixin.

    Attributes
    ----------
    columns : List[str]
        List of columns to be operated on

    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its supported
        functionality to .transform

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework

    FITS: bool
        class attribute, indicates whether transform requires fit to be run first

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    Examples
    --------
    ```pycon
    >>> BaseNumericTransformer(
    ...     columns="a",
    ... )
    BaseNumericTransformer(columns=['a'])

    ```

    """

    polars_compatible = True

    lazyframe_compatible = True

    jsonable = False

    FITS = False

    def __init__(self, columns: str | list[str], **kwargs: bool) -> None:
        """Initialise class instance.

        All arguments are forwarded unchanged to the parent initialiser; this
        class adds no init-time state of its own.

        Parameters
        ----------
        columns : str or List[str]
            Column(s) to be operated on. The class-level example passes a
            single string and shows it stored as a one-element list.

        **kwargs
            Arbitrary keyword arguments passed onto BaseTransformer.init method.

        """
        super().__init__(columns=columns, **kwargs)


[docs]
    def fit(
        self,
        X: DataFrame,
        y: nw.Series | None = None,
    ) -> BaseNumericTransformer:
        """Run the parent fit checks, then confirm the configured columns are numeric.

        Child classes call this before their own fit logic.

        Parameters
        ----------
        X : DataFrame
            A dataframe containing the required columns

        y : Series | None
            Required for pipeline.

        Returns
        -------
            BaseNumericTransformer:
                fitted class instance.

        Examples
        --------
        ```pycon
        >>> import polars as pl

        >>> transformer = BaseNumericTransformer(
        ...     columns="a",
        ... )

        >>> test_df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})

        >>> transformer.fit(test_df)
        BaseNumericTransformer(columns=['a'])

        ```

        """
        frame = _convert_dataframe_to_narwhals(X)

        super().fit(frame, y)

        # only the configured columns are subject to the numeric check
        configured_subset = frame.select(self.columns)
        CheckNumericMixin.check_numeric_columns(self, configured_subset)

        return self



[docs]
    @beartype
    def transform(
        self,
        X: DataFrame,
        return_native_override: bool | None = None,
    ) -> DataFrame:
        """Run the parent transform checks, then confirm the configured columns are numeric.

        Child classes call this before their own transform logic. The data
        itself is passed through unchanged by this base implementation.

        Parameters
        ----------
        X : DataFrame
            Data to transform.

        return_native_override: Optional[bool]
            Option to override return_native attr in transformer, useful when calling parent
            methods

        Returns
        -------
        X : DataFrame
            Validated data

        Examples
        --------
        ```pycon
        >>> import polars as pl

        >>> transformer = BaseNumericTransformer(
        ...     columns="a",
        ... )

        >>> test_df = pl.DataFrame({"a": [1, 2], "b": [3, 4]})

        >>> # base class has no effect on data
        >>> transformer.transform(test_df)
        shape: (2, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 1   ┆ 3   │
        │ 2   ┆ 4   │
        └─────┴─────┘

        ```

        """
        frame = _convert_dataframe_to_narwhals(X)
        as_native = self._process_return_native(return_native_override)

        # keep working in narwhals until the very end of this method
        frame = super().transform(frame, return_native_override=False)

        configured_subset = frame.select(self.columns)
        CheckNumericMixin.check_numeric_columns(self, configured_subset)

        return _return_narwhals_or_native_dataframe(frame, as_native)





[docs]
@register
class OneDKmeansTransformer(BaseNumericTransformer, DropOriginalMixin):
    """Generates a new column based on kmeans algorithm.

    Transformer runs the kmeans algorithm based on given number of clusters and then identifies the bins' cuts based on the results.
    Finally it passes them into a cut function.

    Attributes
    ----------
    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its supported
        functionality to .transform

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework

    FITS: bool
        class attribute, indicates whether transform requires fit to be run first

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    Examples
    --------
    ```pycon
    >>> OneDKmeansTransformer(
    ...     columns="a",
    ...     n_clusters=2,
    ...     new_column_name="new",
    ...     drop_original=False,
    ...     kmeans_kwargs={"random_state": 42},
    ... )
    OneDKmeansTransformer(columns=['a'], kmeans_kwargs={'random_state': 42},
                          n_clusters=2, new_column_name='new')

    ```

    """

    polars_compatible = True

    lazyframe_compatible = False

    jsonable = True

    FITS = True


[docs]
    @block_from_json
    def to_json(self) -> dict[str, dict[str, Any]]:
        """Serialize the fitted transformer to a JSON-compatible dictionary.

        The parent representation is extended with this class's init
        parameters and the bin edges learnt in fit.

        Returns
        -------
        dict[str, dict[str, Any]]:
            JSON representation of the transformer, including init parameters.

        Examples
        --------
        ```pycon
        >>> import polars as pl
        >>> x = OneDKmeansTransformer(
        ...     columns="a",
        ...     n_clusters=2,
        ...     new_column_name="new",
        ...     drop_original=False,
        ...     kmeans_kwargs={"random_state": 42},
        ... )
        >>> test_df = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
        >>> x.fit(test_df)
        OneDKmeansTransformer(columns=['a'], kmeans_kwargs={'random_state': 42},
                              n_clusters=2, new_column_name='new')
        >>> x.to_json()
        {'tubular_version': ..., 'classname': 'OneDKmeansTransformer', 'init': {'columns': ['a'], 'copy': False, 'verbose': False, 'return_native': True, 'new_column_name': 'new', 'n_init': 'auto', 'n_clusters': 2, 'drop_original': False, 'kmeans_kwargs': {'random_state': 42}}, 'fit': {'is_fitted_': True, 'bins': [3, 4]}}

        ```

        """
        # bins only exist after fit, so refuse to serialise before then
        self.check_is_fitted(["bins"])
        serialised = super().to_json()

        init_section = serialised["init"]
        init_section["new_column_name"] = self.new_column_name
        init_section["n_init"] = self.n_init
        init_section["n_clusters"] = self.n_clusters
        init_section["drop_original"] = self.drop_original
        init_section["kmeans_kwargs"] = self.kmeans_kwargs

        serialised["fit"]["bins"] = self.bins

        return serialised


    @beartype
    def __init__(  # noqa: PLR0917, PLR0913
        self,
        columns: str | ListOfOneStr,
        new_column_name: str,
        n_init: str | int = "auto",
        n_clusters: int = 8,
        drop_original: bool = False,
        kmeans_kwargs: dict[str, object] | None = None,
        **kwargs: bool,
    ) -> None:
        """Initialise class instance.

        Parameters
        ----------
        columns : str or list[str]
            Name of the column to discretise.

        new_column_name : str
            Name given to the new discrete column.

        n_clusters : int, default = 8
            The number of clusters to form as well as the number of centroids to generate.

        n_init: "auto" or int, default="auto"
            Number of times the k-means algorithm is run with different centroid seeds.
            The final results is the best output of n_init consecutive runs in terms of inertia.
            Several runs are recommended for sparse high-dimensional problems (see `Clustering sparse data with k-means <https://scikit-learn.org/stable/auto_examples/text/plot_document_clustering.html#kmeans-sparse-high-dim>`__).

            When n_init='auto', the number of runs depends on the value of init: 10 if using init='random' or init is a callable;
            1 if using init='k-means++' or init is an array-like.(Init is an arg in kmeans_kwargs. If init is not set then it defaults to k-means++ so n_init defaults to 1)

        drop_original : bool, default=False
            Should the original columns to be transformed be dropped after applying the
            OneDKmeanstransformer?

        kmeans_kwargs : dict or None, default = None
            A dictionary of keyword arguments to be passed to the sklearn KMeans method when it is called in fit.
            None is stored as an empty dictionary.

        **kwargs
            Arbitrary keyword arguments passed onto BaseTransformer.init().

        """
        self.n_clusters = n_clusters
        self.new_column_name = new_column_name
        self.n_init = n_init
        # a fresh dict per instance avoids sharing a mutable default
        self.kmeans_kwargs = {} if kmeans_kwargs is None else kmeans_kwargs
        self.drop_original = drop_original

        # a bare column name is wrapped so self.columns is always a list
        self.columns = [columns] if isinstance(columns, str) else columns

        super().__init__(columns=self.columns, **kwargs)


[docs]
    def get_feature_names_out(self) -> list[str]:
        """Return the name of the single column this transformer creates.

        Returns
        -------
        list[str]:
            list of features modified/created by the transformer

        Examples
        --------
        ```pycon
        >>> transformer = OneDKmeansTransformer(
        ...     columns="a",
        ...     n_clusters=2,
        ...     new_column_name="kmeans_column",
        ...     drop_original=False,
        ...     kmeans_kwargs={"random_state": 42},
        ... )

        >>> transformer.get_feature_names_out()
        ['kmeans_column']

        ```

        """
        created_columns = [self.new_column_name]
        return created_columns



[docs]
    @block_from_json
    @nw.narwhalify
    def fit(self, X: FrameT, y: IntoSeriesT | None = None) -> OneDKmeansTransformer:
        """Fit transformer to input data.

        Runs KMeans on the configured column and stores, in ``self.bins``, the
        largest value assigned to each cluster, sorted ascending. These values
        are the bin edges used by transform.

        Parameters
        ----------
        X : pd/pl.DataFrame
            Dataframe containing the column to learn bin edges from.

        y : None
            Required for pipeline.

        Returns
        -------
            OneDKmeansTransformer:
                Fitted class instance.

        Raises
        ------
        ValueError:
            if columns in X contain missing values.

        Examples
        --------
        ```pycon
        >>> import polars as pl

        >>> transformer = OneDKmeansTransformer(
        ...     columns="a",
        ...     n_clusters=2,
        ...     new_column_name="new",
        ...     drop_original=False,
        ...     kmeans_kwargs={"random_state": 42},
        ... )

        >>> test_df = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})

        >>> transformer.fit(test_df)
        OneDKmeansTransformer(columns=['a'], kmeans_kwargs={'random_state': 42},
                              n_clusters=2, new_column_name='new')

        ```

        """
        super().fit(X, y)

        X = nw.from_native(X)
        column = self.columns[0]

        # Check that X does not contain nulls/NaNs and raise ValueError if it does.
        if (
            X.select(nw.col(column).is_null().any()).to_numpy().ravel()[0]
            or X.select(nw.col(column).is_nan().any()).to_numpy().ravel()[0]
        ):
            msg = f"{self.classname()}: X should not contain missing values."
            raise ValueError(msg)

        kmeans = KMeans(
            n_clusters=self.n_clusters,
            n_init=self.n_init,
            **self.kmeans_kwargs,
        )

        # single-column 2D array, as passed to KMeans
        values = X.select(column).to_numpy()
        labels = kmeans.fit_predict(values)

        # Compute the per-cluster maxima directly in numpy rather than by
        # attaching the labels to X under a fixed column name: a temporary
        # "groups" column would silently overwrite a user column of the same
        # name (and break entirely if the fitted column is itself "groups").
        flat_values = values.ravel()
        cluster_maxima = [
            flat_values[labels == label].max() for label in np.unique(labels)
        ]

        self.bins = np.sort(np.asarray(cluster_maxima)).tolist()
        self.is_fitted_ = True
        return self



[docs]
    @nw.narwhalify
    def transform(self, X: FrameT) -> FrameT:
        """Assign each row to a bin using the edges learnt in fit.

        The configured column is digitised against ``self.bins`` (right edge
        inclusive) and the resulting bin indices are added to X as a new column.

        Parameters
        ----------
        X : pl/pd.DataFrame
            Data to transform.

        Returns
        -------
        X : pl/pd.DataFrame
            Input X with additional cluster column added.

        Examples
        --------
        ```pycon
        >>> import polars as pl

        >>> transformer = OneDKmeansTransformer(
        ...     columns="a",
        ...     n_clusters=2,
        ...     new_column_name="new",
        ...     drop_original=False,
        ...     kmeans_kwargs={"random_state": 42},
        ... )

        >>> test_df = pl.DataFrame({"a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})

        >>> _ = transformer.fit(test_df)
        >>> transformer.transform(test_df)
        shape: (4, 3)
        ┌─────┬─────┬─────┐
        │ a   ┆ b   ┆ new │
        │ --- ┆ --- ┆ --- │
        │ i64 ┆ i64 ┆ i64 │
        ╞═════╪═════╪═════╡
        │ 1   ┆ 5   ┆ 0   │
        │ 2   ┆ 6   ┆ 0   │
        │ 3   ┆ 7   ┆ 0   │
        │ 4   ┆ 8   ┆ 1   │
        └─────┴─────┴─────┘

        ```

        """
        validated = nw.from_native(super().transform(X))
        backend_name = nw.get_native_namespace(validated).__name__
        source_column = self.columns[0]

        source_values = validated.select(source_column).to_numpy().ravel()
        bin_indices = np.digitize(source_values, bins=self.bins, right=True)

        cluster_series = nw.new_series(
            name=self.new_column_name,
            values=bin_indices,
            backend=backend_name,
        )
        with_clusters = validated.with_columns(cluster_series)

        return DropOriginalMixin.drop_original_column(
            with_clusters,
            self.drop_original,
            source_column,
        )





[docs]
@register
class DifferenceTransformer(BaseNumericTransformer):
    """Transformer that performs subtraction operation between two columns.

    This transformer allows performing subtraction between two columns in a DataFrame
    and stores the result in a new column.

    Attributes
    ----------
    columns : ListOfTwoStrs
        List of exactly two column names to operate on. The second column is subtracted from the first.

    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its supported
        functionality to .transform

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework

    FITS: bool
        class attribute, indicates whether transform requires fit to be run first

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    Examples
    --------
    ```pycon
    >>> transformer = DifferenceTransformer(columns=["a", "b"])
    >>> transformer.columns
    ['a', 'b']

    ```

    """

    polars_compatible = True

    FITS = False

    jsonable = True

    lazyframe_compatible = True

    @beartype
    def __init__(
        self,
        columns: ListOfTwoStrs,
        **kwargs: bool | None,
    ) -> None:
        """Initialize the DifferenceTransformer.

        Parameters
        ----------
        columns : ListOfTwoStrs
            List of exactly two column names to operate on. The second column is subtracted from the first.
        kwargs: bool
            arguments for base class, e.g. verbose.

        """
        super().__init__(columns=columns, **kwargs)

        # the output column name is always derived from the two input names
        minuend, subtrahend = columns[0], columns[1]
        self.new_column_name = f"{minuend}_minus_{subtrahend}"

        # nothing is learnt from data, so the instance is usable straight away
        self.is_fitted_ = True


[docs]
    def get_transform_exprs(self) -> list[nw.Expr]:
        """Build the expressions applied by transform.

        Delegates to get_difference_of_two_columns with the configured columns.

        Returns
        -------
        list[nw.Expr]: transform expressions for class

        """
        difference_exprs = get_difference_of_two_columns(columns=self.columns)
        return difference_exprs



[docs]
    @beartype
    def transform(
        self,
        X: DataFrame,
    ) -> DataFrame:
        """Add the difference of the two configured columns to the DataFrame.

        Parameters
        ----------
        X : DataFrame
            DataFrame containing the columns to operate on.

        Returns
        -------
        DataFrame
            Transformed DataFrame with the new column containing the subtraction results.


        Examples
        --------
        ```pycon
        >>> import polars as pl
        >>> transformer = DifferenceTransformer(columns=["a", "b"])
        >>> test_df = pl.DataFrame({"a": [100, 200, 300], "b": [80, 150, 200]})
        >>> transformer.transform(test_df)
        shape: (3, 3)
        ┌─────┬─────┬───────────┐
        │ a   ┆ b   ┆ a_minus_b │
        │ --- ┆ --- ┆ ---       │
        │ i64 ┆ i64 ┆ i64       │
        ╞═════╪═════╪═══════════╡
        │ 100 ┆ 80  ┆ 20        │
        │ 200 ┆ 150 ┆ 50        │
        │ 300 ┆ 200 ┆ 100       │
        └─────┴─────┴───────────┘

        ```

        """
        frame = _convert_dataframe_to_narwhals(X)

        # parent validation runs first and is kept in narwhals form
        frame = super().transform(frame, return_native_override=False)

        difference_exprs = self.get_transform_exprs()
        frame = frame.with_columns(difference_exprs)

        return _return_narwhals_or_native_dataframe(frame, self.return_native)



[docs]
    def get_feature_names_out(self) -> list[str]:
        """Return the name of the column created by the transformation.

        Returns
        -------
        list[str]
            List containing the name of the new column created by the transformation.

        """
        minuend, subtrahend = self.columns[0], self.columns[1]
        return [f"{minuend}_minus_{subtrahend}"]





[docs]
@register
class RatioTransformer(BaseNumericTransformer):
    """Transformer that performs division operation between two columns.

    This transformer allows performing division between two columns in a DataFrame
    and stores the result in a new column.

    Attributes
    ----------
    columns : ListOfTwoStrs
        List of exactly two column names to operate on. The first column is the numerator,
        and the second column is the denominator.
    return_dtype : str
        The dtype of the resulting column, either 'Float32' or 'Float64'.

    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its supported
        functionality to .transform

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework

    FITS: bool
        class attribute, indicates whether transform requires fit to be run first

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    Examples
    --------
    ```pycon
    >>> transformer = RatioTransformer(columns=["a", "b"], return_dtype="Float32")
    >>> transformer.columns
    ['a', 'b']
    >>> transformer.return_dtype
    'Float32'

    ```

    """

    polars_compatible = True

    FITS = False

    jsonable = True

    lazyframe_compatible = True


[docs]
    @block_from_json
    def to_json(self) -> dict[str, dict[str, Any]]:
        """Serialize the transformer to a JSON-compatible dictionary.

        The parent representation is extended with the ``return_dtype`` init
        parameter.

        Returns
        -------
        dict[str, dict[str, Any]]:
            JSON representation of the transformer, including init parameters.

        Examples
        --------
        ```pycon
        >>> ratio_transformer = RatioTransformer(columns=["a", "b"], return_dtype="Float32")
        >>> ratio_transformer.to_json()
        {'tubular_version': ..., 'classname': 'RatioTransformer', 'init': {'columns': ['a', 'b'], 'copy': False, 'verbose': False, 'return_native': True, 'return_dtype': 'Float32'}, 'fit': {'is_fitted_': True}}

        ```

        """
        serialised = super().to_json()

        init_section = serialised["init"]
        init_section["return_dtype"] = self.return_dtype

        return serialised


    @beartype
    def __init__(
        self,
        columns: ListOfTwoStrs,
        return_dtype: FloatTypeAnnotated = "Float32",
        **kwargs: bool | None,
    ) -> None:
        """Initialize the RatioTransformer.

        Parameters
        ----------
        columns : ListOfTwoStrs
            List of exactly two column names to operate on. The first column is the numerator,
            and the second column is the denominator.
        return_dtype : str, default='Float32'
            The dtype of the resulting column, either 'Float32' or 'Float64'.
        kwargs: bool
            arguments for base class, e.g. verbose

        """
        super().__init__(columns=columns, **kwargs)

        self.return_dtype = return_dtype
        self.is_fitted_ = True  # nothing is learnt from data (FITS is False), so mark as fitted at init


[docs]
    def get_transform_exprs(self) -> list[nw.Expr]:
        """Build the expressions applied by transform.

        Delegates to get_ratio_of_two_columns with the configured columns and
        return dtype.

        Returns
        -------
        list[nw.Expr]: transform expressions for class

        """
        ratio_exprs = get_ratio_of_two_columns(
            columns=self.columns,
            return_dtype=self.return_dtype,
        )
        return ratio_exprs



[docs]
    @beartype
    def transform(
        self,
        X: DataFrame,
    ) -> DataFrame:
        """Add the ratio of the two configured columns to the DataFrame.

        Parameters
        ----------
        X : DataFrame
            DataFrame containing the columns to operate on.

        Returns
        -------
        DataFrame
            Transformed DataFrame with the new column containing the division results.

        Examples
        --------
        ```pycon
        >>> import polars as pl
        >>> transformer = RatioTransformer(columns=["a", "b"], return_dtype="Float32")
        >>> test_df = pl.DataFrame({"a": [100, 200, 300], "b": [80, 150, 200]})
        >>> transformer.transform(test_df)
        shape: (3, 3)
        ┌─────┬─────┬────────────────┐
        │ a   ┆ b   ┆ a_divided_by_b │
        │ --- ┆ --- ┆ ---            │
        │ i64 ┆ i64 ┆ f32            │
        ╞═════╪═════╪════════════════╡
        │ 100 ┆ 80  ┆ 1.25           │
        │ 200 ┆ 150 ┆ 1.333333       │
        │ 300 ┆ 200 ┆ 1.5            │
        └─────┴─────┴────────────────┘

        ```

        """
        frame = _convert_dataframe_to_narwhals(X)

        # parent validation runs first and is kept in narwhals form
        validated = super().transform(frame, return_native_override=False)

        ratio_exprs = self.get_transform_exprs()
        with_ratio = validated.with_columns(ratio_exprs)

        return _return_narwhals_or_native_dataframe(with_ratio, self.return_native)



[docs]
    def get_feature_names_out(self) -> list[str]:
        """Return the name of the column created by the transformation.

        Returns
        -------
        list[str]
            List containing the name of the new column created by the transformation.

        """
        numerator, denominator = self.columns[0], self.columns[1]
        return [f"{numerator}_divided_by_{denominator}"]




# DEPRECATED TRANSFORMERS

[docs]
@deprecated(
    """This transformer has not been selected for conversion to polars/narwhals,
    and so has been deprecated. If it is useful to you, please raise an issue
    for it to be modernised
    """,
)
class LogTransformer(BaseNumericTransformer, DropOriginalMixin):
    """Transformer to apply log transformation.

    Transformer has the option to add 1 to the columns to log and drop the
    original columns.

    Attributes
    ----------
    base : None or float/int
        Base for the log transform. If None the natural log is used.

    add_1 : bool
        Whether a constant of 1 is added to the columns before the log is taken.

    drop_original : bool
        Whether the original columns are dropped after the log columns are created.

    suffix : str
        The suffix to add onto the end of column names for new columns.

    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its supported
        functionality to .transform

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework

    FITS: bool
        class attribute, indicates whether transform requires fit to be run first

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    deprecated: bool
        indicates if class has been deprecated

    """

    polars_compatible = False

    lazyframe_compatible = False

    jsonable = False

    FITS = False

    deprecated = True

    @beartype
    def __init__(
        self,
        columns: str | list[str] | None,
        base: PositiveNumber | None = None,
        add_1: bool = False,
        drop_original: bool = True,
        suffix: str = "log",
        **kwargs: bool,
    ) -> None:
        """Initialise class instance.

        Parameters
        ----------
        columns : None or str or list
            Columns to log transform.

        base : None or float/int
            Base for log transform. If None uses natural log.

        add_1 : bool, default = False
            Should a constant of 1 be added to the columns to be transformed prior to
            applying the log transform?

        drop_original : bool, default = True
            Should the original columns to be transformed be dropped after applying the
            log transform?

        suffix : str, default = 'log'
            The suffix to add onto the end of column names for new columns. It is
            joined to the column name with an underscore, e.g. 'a' becomes 'a_log'.

        kwargs: bool
            Arbitrary keyword arguments passed onto BaseTransformer.init method.

        """
        super().__init__(columns=columns, **kwargs)

        self.drop_original = drop_original
        self.base = base
        self.add_1 = add_1
        self.suffix = suffix

[docs]
    def get_feature_names_out(self) -> list[str]:
        """Return the names of the log columns created by the transformer.

        Each name is the input column name and the suffix joined by an underscore.

        Returns
        -------
        list[str]:
            list of features modified/created by the transformer

        """
        created_columns = []
        for column in self.columns:
            created_columns.append(f"{column}_{self.suffix}")
        return created_columns



[docs]
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Apply the log transform to the specified columns.

        If the add_1 attribute is True then log(1 + x) is computed, otherwise
        log(x). When a base is set the result is divided by log(base). If the
        drop_original attribute is True the original columns are dropped.

        Parameters
        ----------
        X : pd.DataFrame
            The dataframe to be transformed.

        Returns
        -------
        X : pd.DataFrame
            The dataframe with the specified columns logged, optionally dropping the original
            columns if self.drop_original is True.

        Raises
        ------
        ValueError:
            if provided columns contain values that are less than or equal to 0
            (after adding 1, when add_1 is True).

        """
        X = super().transform(X)

        output_columns = self.get_feature_names_out()

        # with add_1 the logged quantity is x + 1, so the invalid region is x <= -1
        lowest_invalid = -1 if self.add_1 else 0

        if (X[self.columns] <= lowest_invalid).sum().sum() > 0:
            if self.add_1:
                msg = f"{self.classname()}: values less than or equal to 0 in columns (after adding 1), make greater than 0 before using transform"
            else:
                msg = f"{self.classname()}: values less than or equal to 0 in columns, make greater than 0 before using transform"
            raise ValueError(msg)

        log_function = np.log1p if self.add_1 else np.log
        logged = log_function(X[self.columns])

        # change of base: log_b(x) = ln(x) / ln(b)
        if self.base is not None:
            logged = logged / np.log(self.base)

        X[output_columns] = logged

        return DropOriginalMixin.drop_original_column(
            X,
            self.drop_original,
            self.columns,
        )





[docs]
@deprecated(
    """This transformer has not been selected for conversion to polars/narwhals,
    and so has been deprecated. If it is useful to you, please raise an issue
    for it to be modernised
    """,
)
class CutTransformer(BaseNumericTransformer):
    """Class to bin a column into discrete intervals.

    Class simply uses the [pd.cut](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.cut.html)
    method on the specified column.

    Attributes
    ----------
    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its supported
        functionality to .transform

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework

    FITS: bool
        class attribute, indicates whether transform requires fit to be run first

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    deprecated: bool
        indicates if class has been deprecated

    """

    polars_compatible = False

    lazyframe_compatible = False

    jsonable = False

    FITS = False

    deprecated = True

    @beartype
    def __init__(
        self,
        column: str,
        new_column_name: str,
        cut_kwargs: GenericKwargs | None = None,
        **kwargs: bool,
    ) -> None:
        """Initialise class instance.

        Parameters
        ----------
        column : str
            Name of the column to discretise.

        new_column_name : str
            Name given to the new discrete column.

        cut_kwargs : dict or None, default = None
            A dictionary of keyword arguments to be passed to the pd.cut method when it is called in transform.
            None is stored as an empty dictionary.

        **kwargs
            Arbitrary keyword arguments passed onto BaseTransformer.init().

        """
        # a fresh dict per instance avoids sharing a mutable default
        self.cut_kwargs = {} if cut_kwargs is None else cut_kwargs
        self.new_column_name = new_column_name

        # Kept only so the transformer's string representation works;
        # methods should read 'columns' rather than this attribute.
        self.column = column

        super().__init__(columns=[column], **kwargs)


[docs]
    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Discretise specified column using pd.cut.

        An empty frame gets an empty float column instead of being passed to
        pd.cut.

        Parameters
        ----------
        X : pd.DataFrame
            Data with column to transform.

        Returns
        -------
            pd.DataFrame:
                Dataframe with binned column

        """
        X = super().transform(X)

        # quick fix for empty frames, not spending much
        # time on this as transformer is deprecated
        if X.empty:
            X[self.new_column_name] = pd.Series(dtype=float)
            return X

        source_values = X[self.columns[0]].to_numpy()
        X[self.new_column_name] = pd.cut(source_values, **self.cut_kwargs)

        return X





[docs]
@deprecated(
    """This transformer has not been selected for conversion to polars/narwhals,
    and so has been deprecated. If aspects of it have been useful to you, please raise an issue
    for it to be replaced with more specific transformers
    """,
)
class TwoColumnOperatorTransformer(
    DataFrameMethodTransformer,
    BaseNumericTransformer,
):
    """Applies a pandas.DataFrame method to two columns (add, sub, mul, div, mod, pow).

    Transformer assigns the output of the method to a new column. The method is applied
    in the form (column 1)operator(column 2), so order matters if the method does not commute.
    Additional keyword arguments can be supplied via pd_method_kwargs; these are passed to the
    pandas.DataFrame method being called.

    Attributes
    ----------
    pd_method_name : str
        The name of the pandas.DataFrame method to be called.

    columns : list
        list containing two string items: [column1_name, column2_name] The first will be operated upon by the
        chosen pandas method using the second.

    column1_name : str
        The name of the 1st column in the operation (columns[0]).

    column2_name : str
        The name of the 2nd column in the operation (columns[1]).

    new_column_name : str
        The name of the new column that the output is assigned to.

    pd_method_kwargs : dict
        Dictionary of method kwargs to be passed to pandas.DataFrame method.

    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its supported
        functionality to .transform

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework

    FITS: bool
        class attribute, indicates whether transform requires fit to be run first

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    deprecated: bool
        indicates if class has been deprecated

    """

    polars_compatible = False

    lazyframe_compatible = False

    jsonable = False

    FITS = False

    deprecated = True

    @beartype
    def __init__(
        self,
        pd_method_name: str,
        columns: ListOfTwoStrs,
        new_column_name: str,
        pd_method_kwargs: dict[str, object] | None = None,
        **kwargs: bool | None,
    ) -> None:
        """Initialise class instance.

        Parameters
        ----------
        pd_method_name : str
            The name of the pandas.DataFrame method to be called.

        columns: list[str]
            The two columns to operate on, in the order [column1_name, column2_name].

        new_column_name : str
            The name of the new column that the output is assigned to.

        pd_method_kwargs : dict or None, default = None
            Dictionary of method kwargs to be passed to pandas.DataFrame method. If supplied it
            must contain an entry for axis, set to either 1 or 0. None is replaced
            with {'axis': 0}.

        **kwargs :
            Arbitrary keyword arguments passed on to the parent __init__.

        Raises
        ------
        ValueError:
            if pd_method_kwargs is supplied without an "axis" entry, or with an
            "axis" value other than 0 or 1

        """
        if pd_method_kwargs is None:
            pd_method_kwargs = {"axis": 0}
        elif "axis" not in pd_method_kwargs:
            msg = f'{self.classname()}: pd_method_kwargs must contain an entry "axis" set to 0 or 1'
            raise ValueError(msg)
        # Tuple (not set) membership on purpose: an unhashable axis value such as
        # a list must produce the documented ValueError, not "unhashable type".
        elif pd_method_kwargs["axis"] not in (0, 1):
            msg = f"{self.classname()}: pd_method_kwargs 'axis' must be 0 or 1"
            raise ValueError(msg)

        self.new_column_name = new_column_name

        # Delegate to the parent initialiser chain, which receives the method
        # name, columns and method kwargs supplied here.
        super().__init__(
            new_column_names=new_column_name,
            pd_method_name=pd_method_name,
            columns=columns,
            pd_method_kwargs=pd_method_kwargs,
            **kwargs,
        )

        self.column1_name = columns[0]
        self.column2_name = columns[1]

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Transform input data by applying the chosen method to the two specified columns.

        Parameters
        ----------
        X : pd.DataFrame
            Data to transform.

        Returns
        -------
        X : pd.DataFrame
            Input X with an additional column (self.new_column_name).

        """
        # call appropriate parent transforms
        # NOTE(review): both calls deliberately start their lookup *after* the named
        # class in the MRO, so DataFrameMethodTransformer.transform itself is skipped.
        # Presumably the first call runs the numeric checks and the second the base
        # checks - confirm against the parent implementations before changing.
        X = super(DataFrameMethodTransformer, self).transform(X)
        X = super(BaseNumericTransformer, self).transform(X)

        # column 1 is selected as a one-column DataFrame so the DataFrame method
        # (e.g. add/sub/mul) can be applied with column 2 as its argument
        left_operand = X[[self.column1_name]]
        operator_method = getattr(left_operand, self.pd_method_name)

        X[self.new_column_name] = operator_method(
            X[self.column2_name],
            **self.pd_method_kwargs,
        )

        return X





[docs]
@deprecated(
    """This transformer has not been selected for conversion to polars/narwhals,
    and so has been deprecated. If it is useful to you, please raise an issue
    for it to be modernised
    """,
)
class ScalingTransformer(BaseNumericTransformer):
    """Transformer to perform scaling of numeric columns.

    Transformer can apply min max scaling, max absolute scaling or standardisation (subtract mean and divide by std).
    The transformer uses the appropriate sklearn.preprocessing scaler.

    Attributes
    ----------
    scaler : MinMaxScaler, MaxAbsScaler or StandardScaler
        sklearn scaler instance built from scaler_type and scaler_kwargs.

    scaler_type : str
        key of scaler_options that was used to pick the scaler class.

    scaler_kwargs : dict
        keyword arguments the scaler was initialised with.

    scaler_options : dict
        class attribute, maps each allowed scaler_type to its sklearn scaler class.

    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its supported
        functionality to .transform

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework

    FITS: bool
        class attribute, indicates whether transform requires fit to be run first

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    deprecated: bool
        indicates if class has been deprecated

    """

    polars_compatible = False

    lazyframe_compatible = False

    jsonable = False

    FITS = True

    deprecated = True

    # Dictionary mapping scaler types to their corresponding sklearn classes
    # (the values are the classes themselves, instantiated in __init__)
    scaler_options: ClassVar[
        dict[str, type[MinMaxScaler | MaxAbsScaler | StandardScaler]]
    ] = {
        "min_max": MinMaxScaler,
        "max_abs": MaxAbsScaler,
        "standard": StandardScaler,
    }

    def __init__(
        self,
        columns: str | list[str] | None,
        scaler_type: str,
        scaler_kwargs: dict[str, object] | None = None,
        **kwargs: bool,
    ) -> None:
        """Initialise class instance.

        Parameters
        ----------
        columns : str, list or None
            Name of the columns to apply scaling to.

        scaler_type : str
            Type of scaler to use, must be one of 'min_max', 'max_abs' or 'standard'. The corresponding
            sklearn.preprocessing scaler used in each case is MinMaxScaler, MaxAbsScaler or StandardScaler.

        scaler_kwargs : dict or None, default = None
            A dictionary of keyword arguments to be passed to the scaler object when it is initialised.
            None is replaced with an empty dict.

        **kwargs
            Arbitrary keyword arguments passed on to the parent __init__.

        Raises
        ------
        TypeError:
            if scaler_kwargs is not dict with str keys

        ValueError:
            if scaler_type is invalid

        """
        if scaler_kwargs is None:
            scaler_kwargs = {}

        # Validate scaler_kwargs type
        if not isinstance(scaler_kwargs, dict):
            msg = f"{self.classname()}: scaler_kwargs should be a dict but got type {type(scaler_kwargs)}"
            raise TypeError(msg)

        for i, k in enumerate(scaler_kwargs):
            if not isinstance(k, str):
                msg = f"{self.classname()}: unexpected type ({type(k)}) for scaler_kwargs key in position {i}, must be str"
                raise TypeError(msg)

        # Validate scaler_type
        if scaler_type not in self.scaler_options:
            allowed_scaler_values = list(self.scaler_options.keys())
            msg = f"{self.classname()}: scaler_type should be one of; {allowed_scaler_values}"
            raise ValueError(msg)

        # Initialize scaler using the dictionary
        self.scaler = self.scaler_options[scaler_type](**scaler_kwargs)
        # These attributes are not for use in any method
        # Here only as a fix to allow string representation of transformer.
        self.scaler_kwargs = scaler_kwargs
        self.scaler_type = scaler_type

        super().__init__(columns=columns, **kwargs)

    def fit(self, X: pd.DataFrame, y: pd.Series | None = None) -> ScalingTransformer:
        """Fit scaler to input data.

        Parameters
        ----------
        X : pd.DataFrame
            Dataframe with columns to learn scaling values from.

        y : None
            Required for pipeline.

        Returns
        -------
            ScalingTransformer:
                fitted class instance.

        """
        super().fit(X, y)
        # the scaler is only fitted when there is at least one column configured
        if self.columns:
            self.scaler.fit(X[self.columns])
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Transform input data X with fitted scaler.

        Parameters
        ----------
        X : pd.DataFrame
            Dataframe containing columns to be scaled.

        Returns
        -------
        X : pd.DataFrame
            Input X with columns scaled.

        """
        X = super().transform(X)

        # quick fix for empty frames, not spending much
        # time on this as transformer is deprecated
        if self.columns:
            if X.empty:
                # the scaler is not called on empty input; the configured
                # columns are replaced with empty float columns instead
                for col in self.columns:
                    X[col] = pd.Series(dtype=float)

            else:
                X[self.columns] = self.scaler.transform(X[self.columns])

        return X





[docs]
@deprecated(
    """This transformer has not been selected for conversion to polars/narwhals,
    and so has been deprecated. If it is useful to you, please raise an issue
    for it to be modernised
    """,
)
class InteractionTransformer(BaseNumericTransformer):
    """Generates interaction features.

    Transformer generates a new column for all combinations from the selected columns up to the maximum degree
    provided. (For sklearn versions >= 1.0.0, only interactions of a degree higher or equal to the minimum
    degree are computed).
    Each interaction column consists of the product of the specific combination of columns.
    Ex: with 3 columns provided ["a","b","c"], if max degree is 3, the total possible combinations are :
    - of degree 1 : ["a","b","c"]
    - of degree 2 : ["a b","b c","a c"]
    - of degree 3 : ["a b c"].

    Attributes
    ----------
        min_degree : int
            minimum degree of interaction features to be considered

        max_degree : int
            maximum degree of interaction features to be considered

        nb_features_to_interact : int
            number of selected columns from which interactions should be computed. (=len(columns))

        nb_combinations : int
            number of new interaction features (-1 until transform has been run)

        interaction_colname : list
            names of each new interaction feature. The name of an interaction feature is the combinations of previous
            column names joined with a whitespace. Interaction feature of ["col1","col2","col3"] would be "col1 col2 col3".

        nb_feature_out : int
            number of total columns of transformed dataset, including new interaction features
            (-1 until transform has been run)

        built_from_json: bool
            indicates if transformer was reconstructed from json, which limits its supported
            functionality to .transform

        polars_compatible : bool
            class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework

        FITS: bool
            class attribute, indicates whether transform requires fit to be run first

        jsonable: bool
            class attribute, indicates if transformer supports to/from_json methods

        lazyframe_compatible: bool
            class attribute, indicates whether transformer works with lazyframes

        deprecated: bool
            indicates if class has been deprecated

    """

    polars_compatible = False

    lazyframe_compatible = False

    jsonable = False

    FITS = False

    deprecated = True

    MIN_DEGREE_VALUE = 2

    @beartype
    def __init__(
        self,
        columns: ListOfMoreThanOneStrings,
        min_degree: int = 2,
        max_degree: int = 2,
        **kwargs: bool,
    ) -> None:
        """Initialise class instance.

        Parameters
        ----------
        columns : list of str
            Columns to compute interactions from, validated by beartype against
            ListOfMoreThanOneStrings. Value passed in columns is saved in the columns
            attribute on the object. There is deliberately no default value, so the
            user has to specify the columns when initialising the transformer.
        min_degree : int
            minimum degree of interaction features to be considered. For example if min_degree=3, only interaction
            columns from at least 3 columns would be generated. NB- only applies if sklearn version is >= 1.0.0
        max_degree : int
            maximum degree of interaction features to be considered. For example if max_degree=3, only interaction
            columns from up to 3 columns would be generated.
        kwargs:
            arguments for base class, e.g. verbose.

        Raises
        ------
        ValueError:
            if min_degree is lower than 2

        ValueError:
            if max_degree is lower than min_degree

        ValueError:
            if max_degree is greater than len(columns)

        """
        super().__init__(columns=columns, **kwargs)

        if min_degree < self.MIN_DEGREE_VALUE:
            msg = f"{self.classname()}: min_degree must be equal or greater than 2, got {min_degree}"
            raise ValueError(msg)
        self.min_degree = min_degree

        if min_degree > max_degree:
            msg = f"{self.classname()}: max_degree must be equal or greater than min_degree"
            raise ValueError(msg)
        if max_degree > len(columns):
            msg = f"{self.classname()}: max_degree must be equal or lower than number of columns"
            raise ValueError(msg)
        self.max_degree = max_degree

        self.nb_features_to_interact = len(self.columns)
        # placeholders, populated when transform is run
        self.nb_combinations = -1
        self.interaction_colname = []
        self.nb_feature_out = -1

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Generate interaction features using the "product" pandas.DataFrame method.

        Also updates the nb_combinations, interaction_colname and nb_feature_out
        attributes to describe the generated features.

        Parameters
        ----------
        X : pd.DataFrame
            Data to transform.

        Returns
        -------
        X : pd.DataFrame
            Input X with additional column or columns (self.interaction_colname) added. These contain the output of
            running the product pandas DataFrame method on identified combinations.

        Raises
        ------
        TypeError: for invalid PolynomialFeatures._combinations arguments

        """
        X = super().transform(X)

        try:
            interaction_combination_index = PolynomialFeatures._combinations(
                n_features=self.nb_features_to_interact,
                min_degree=self.min_degree,
                max_degree=self.max_degree,
                interaction_only=True,
                include_bias=False,
            )
        except TypeError as err:
            # Fall back to the older signature (single 'degree' argument) only when
            # the failure is specifically the unknown 'min_degree' keyword. A
            # substring match is used because the function-name prefix of this
            # message differs between Python versions.
            if "unexpected keyword argument 'min_degree'" not in str(err):
                raise
            interaction_combination_index = PolynomialFeatures._combinations(
                n_features=self.nb_features_to_interact,
                degree=self.max_degree,
                interaction_only=True,
                include_bias=False,
            )

        # translate each tuple of column positions into the matching column names
        interaction_combination_colname = [
            [self.columns[col_idx] for col_idx in interaction_combination]
            for interaction_combination in interaction_combination_index
        ]
        self.nb_combinations = len(interaction_combination_colname)

        self.interaction_colname = [
            " ".join(interaction_combination)
            for interaction_combination in interaction_combination_colname
        ]

        for new_colname, source_columns in zip(
            self.interaction_colname,
            interaction_combination_colname,
        ):
            # skipna=False so a null in any source column gives a null interaction
            X[new_colname] = X[source_columns].product(axis=1, skipna=False)

        # total column count of the transformed frame (columns, not rows)
        self.nb_feature_out = X.shape[1]

        return X





[docs]
@deprecated(
    """This transformer has not been selected for conversion to polars/narwhals,
    and so has been deprecated. If it is useful to you, please raise an issue
    for it to be modernised
    """,
)
class PCATransformer(BaseNumericTransformer):
    """Generates variables using Principal component analysis (PCA).

    Linear dimensionality reduction using Singular Value Decomposition of the
    data to project it to a lower dimensional space.

    It is based on sklearn class sklearn.decomposition.PCA

    Attributes
    ----------
    pca : PCA class from sklearn.decomposition

    n_components_ : int or None
        The estimated number of components, copied from the fitted sklearn PCA
        object. None until fit has been run. When n_components is set
        to 'mle' or a number between 0 and 1 (with svd_solver == 'full') this
        number is estimated from input data. Otherwise it equals the parameter
        n_components.

    feature_names_out: list or None
        list of feature names representing the new dimensions
        (pca_column_prefix followed by the component index). None until fit has been run.

    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its supported
        functionality to .transform

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework

    FITS: bool
        class attribute, indicates whether transform requires fit to be run first

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    deprecated: bool
        indicates if class has been deprecated

    """

    polars_compatible = False

    lazyframe_compatible = False

    jsonable = False

    FITS = True

    deprecated = True

    @beartype
    def __init__(
        self,
        columns: str | ListOfMoreThanOneStrings | None,
        n_components: StrictlyPositiveInt | FloatBetweenZeroOne | Literal["mle"] = 2,
        svd_solver: Literal["auto", "full", "arpack", "randomized"] = "auto",
        random_state: int | None = None,
        pca_column_prefix: str = "pca_",
        **kwargs: bool,
    ) -> None:
        """Initialise class instance.

        Parameters
        ----------
        columns : None or list or str
            Columns to apply the transformer to. Value passed in columns is forwarded
            to the parent initialiser. Note this has no default value so
            the user has to specify the columns when initialising the transformer.
        n_components : int, float or 'mle', default=2
            Number of components to keep.
            If ``n_components == 'mle'`` and ``svd_solver == 'full'``, Minka's
            MLE is used to guess the dimension. Use of ``n_components == 'mle'``
            will interpret ``svd_solver == 'auto'`` as ``svd_solver == 'full'``.
            If ``0 < n_components < 1`` and ``svd_solver == 'full'``, select the
            number of components such that the amount of variance that needs to be
            explained is greater than the percentage specified by n_components.
            If ``svd_solver == 'arpack'``, the number of components must be
            strictly less than the minimum of n_features and n_samples.
        svd_solver : {'auto', 'full', 'arpack', 'randomized'}, default='auto'
            If auto :
                The solver is selected by a default policy based on `X.shape` and
                `n_components`: if the input data is larger than 500x500 and the
                number of components to extract is lower than 80% of the smallest
                dimension of the data, then the more efficient 'randomized'
                method is enabled. Otherwise the exact full SVD is computed and
                optionally truncated afterwards.
            If full :
                run exact full SVD calling the standard LAPACK solver via
                `scipy.linalg.svd` and select the components by postprocessing
            If arpack :
                run SVD truncated to n_components calling ARPACK solver via
                `scipy.sparse.linalg.svds`. It requires strictly
                0 < n_components < min(X.shape)
            If randomized :
                run randomized SVD by the method of Halko et al.
            .. sklearn versionadded:: 0.18.0

        random_state : int or None, default=None
            Used when the 'arpack' or 'randomized' solvers are used. Pass an int
            for reproducible results across multiple function calls.
            .. sklearn versionadded:: 0.18.0
        pca_column_prefix : str, default="pca_"
            prefix added to each of the n components features generated.
            example: if n_components = 3, new columns would be 'pca_0','pca_1','pca_2'.
        kwargs:
            arguments for base class, e.g. verbose

        Raises
        ------
        ValueError:
            if n_components is 'mle' and svd_solver is 'arpack'.

        TypeError:
            if n_components is a float and svd_solver is 'randomized' or 'arpack'.

        Notes
        -----
        Argument types and ranges are validated by beartype against the
        annotations in the signature before this body runs.

        """
        super().__init__(columns=columns, **kwargs)

        self.n_components = n_components

        self.svd_solver = svd_solver

        self.random_state = random_state

        if (svd_solver == "arpack") and (n_components == "mle"):
            msg = f"{self.classname()}: n_components='mle' cannot be a string with svd_solver='arpack'"
            raise ValueError(msg)
        # isinstance (rather than an exact type comparison) so float subclasses
        # such as numpy floats are rejected here too
        if (svd_solver in {"randomized", "arpack"}) and isinstance(
            n_components,
            float,
        ):
            msg = f"{self.classname()}: n_components {n_components} cannot be a float with svd_solver='{svd_solver}'"
            raise TypeError(msg)

        self.pca_column_prefix = pca_column_prefix

        self.pca = PCA(
            n_components=self.n_components,
            svd_solver=self.svd_solver,
            random_state=self.random_state,
        )

        # populated by fit
        self.feature_names_out = None
        self.n_components_ = None

    def fit(self, X: pd.DataFrame, y: pd.Series | None = None) -> PCATransformer:
        """Fit PCA to input data.

        Parameters
        ----------
        X : pd.DataFrame
            Dataframe with columns to learn the principal components from.

        y : None
            Required for pipeline.

        Returns
        -------
            PCATransformer:
                fitted class instance.

        Raises
        ------
        ValueError:
            if n_components is invalid for data

        """
        super().fit(X, y)

        X = CheckNumericMixin.check_numeric_columns(self, X)

        if self.n_components != "mle":
            n_samples, n_features = X[self.columns].shape
            max_components = min(n_samples, n_features)
            # numeric n_components must be positive and no larger than the
            # smaller dimension of the data being fitted
            if not 0 < self.n_components <= max_components:
                msg = f"{self.classname()}: n_components {self.n_components} must be between 1 and min(n_samples {n_samples}, n_features {n_features}) is {max_components} with svd_solver '{self.svd_solver}'"
                raise ValueError(msg)

        self.pca.fit(X[self.columns])
        self.n_components_ = self.pca.n_components_
        self.feature_names_out = [
            self.pca_column_prefix + str(i) for i in range(self.n_components_)
        ]

        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Generate PCA features from the input pandas DataFrame (X) and add them to X as new columns.

        Parameters
        ----------
        X : pd.DataFrame
            Data to transform.

        Returns
        -------
        X : pd.DataFrame
            Input X with additional column or columns (self.feature_names_out) added. These contain the
            output of the fitted sklearn PCA object's transform on self.columns.

        """
        X = super().transform(X)
        X = CheckNumericMixin.check_numeric_columns(self, X)

        # quick fix for empty frames, not spending much
        # time on this as transformer is deprecated
        if X.empty:
            # PCA is not called on empty input; empty float columns are added instead
            for col in self.feature_names_out:
                X[col] = pd.Series(dtype=float)

        else:
            X[self.feature_names_out] = self.pca.transform(X[self.columns])

        return X