Source code for tubular.strings

"""Contains transformers that apply string functions."""

from __future__ import annotations

from typing import Any

import narwhals as nw
import pandas as pd
from beartype import beartype
from typing_extensions import deprecated

from tubular._utils import (
    _convert_dataframe_to_narwhals,
    _return_narwhals_or_native_dataframe,
    block_from_json,
)
from tubular.base import BaseTransformer, register
from tubular.functions.strings import (
    convert_string_columns_to_lowercase,
    extract_string_components,
    indicate_if_string_columns_contain_reference,
    remove_characters_from_string_columns,
)
from tubular.types import (
    DataFrame,
    GenericKwargs,
    ListOfOneStr,
    ListOfStrs,
    StrictlyPositiveInt,
)


@register
class LowerCaseTransformer(BaseTransformer):
    """Transformer that converts the text in the given columns to lower case.

    Attributes
    ----------
    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits
        its supported functionality to .transform

    polars_compatible: bool
        class attribute, indicates whether transformer has been converted to
        the polars/pandas agnostic narwhals framework

    return_native: bool, default = True
        Controls whether transformer returns narwhals or native pandas/polars
        type

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    FITS: bool
        class attribute, indicates whether transform requires fit to be run
        first

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    Examples
    --------
    ```pycon
    >>> from pprint import pprint
    >>> transformer = LowerCaseTransformer(
    ...     columns=["a"],
    ... )
    >>> transformer
    LowerCaseTransformer(columns=['a'])
    >>> json_dump = transformer.to_json()
    >>> pprint(json_dump)
    {'classname': 'LowerCaseTransformer',
     'fit': {'is_fitted_': False},
     'init': {'columns': ['a'],
              'copy': False,
              'return_native': True,
              'verbose': False},
     'tubular_version': ...}
    >>> LowerCaseTransformer.from_json(json_dump)
    LowerCaseTransformer(columns=['a'])

    ```

    """

    polars_compatible = True

    lazyframe_compatible = True

    jsonable = True

    FITS = False

    @beartype
    def __init__(
        self,
        columns: str | ListOfStrs,
        **kwargs: bool | None,
    ) -> None:
        """Set up the transformer.

        Parameters
        ----------
        columns: Union[str, ListOfStrings]
            columns whose values are to be lowercased.

        **kwargs
            Arbitrary keyword arguments forwarded to BaseTransformer.__init__.

        """
        super().__init__(columns=columns, **kwargs)

    def get_transform_exprs(self) -> list[nw.Expr]:
        """Build the expressions applied by transform.

        Returns
        -------
        list[nw.Expr]:
            output of convert_string_columns_to_lowercase for self.columns

        """
        lowercase_exprs = convert_string_columns_to_lowercase(columns=self.columns)
        return lowercase_exprs

    def transform(self, X: DataFrame) -> DataFrame:
        """Lowercase the text held in the configured columns.

        Parameters
        ----------
        X : DataFrame
            Data containing the columns to lowercase.

        Returns
        -------
        X : DataFrame
            Input X with the text in the configured columns lowercased.

        Examples
        --------
        ```pycon
        >>> import polars as pl
        >>> test_df = pl.DataFrame({"a": ["HeLlO", None, " HI"]})
        >>> transformer = LowerCaseTransformer(columns="a")
        >>> transformer.transform(test_df)
        shape: (3, 1)
        ┌───────┐
        │ a     │
        │ ---   │
        │ str   │
        ╞═══════╡
        │ hello │
        │ null  │
        │  hi   │
        └───────┘

        ```

        """
        frame = _convert_dataframe_to_narwhals(X)

        exprs = self.get_transform_exprs()
        # nothing to apply when no expressions were produced
        if exprs:
            frame = frame.with_columns(*exprs)

        return _return_narwhals_or_native_dataframe(frame, self.return_native)
@register
class ExtractStringComponentsTransformer(BaseTransformer):
    r"""Transformer that splits string columns on a character and keeps the components.

    Attributes
    ----------
    by: str
        character to split on

    return_n_components: int
        number of components to return

    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits
        its supported functionality to .transform

    polars_compatible: bool
        class attribute, indicates whether transformer has been converted to
        the polars/pandas agnostic narwhals framework

    return_native: bool, default = True
        Controls whether transformer returns narwhals or native pandas/polars
        type

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    FITS: bool
        class attribute, indicates whether transform requires fit to be run
        first

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    Examples
    --------
    ```pycon
    >>> from pprint import pprint
    >>> transformer = ExtractStringComponentsTransformer(
    ...     columns=["a"], by="@", return_n_components=2
    ... )
    >>> transformer
    ExtractStringComponentsTransformer(by='@', columns=['a'], return_n_components=2)
    >>> json_dump = transformer.to_json()
    >>> pprint(json_dump)
    {'classname': 'ExtractStringComponentsTransformer',
     'fit': {'is_fitted_': False},
     'init': {'by': '@',
              'columns': ['a'],
              'copy': False,
              'return_n_components': 2,
              'return_native': True,
              'verbose': False},
     'tubular_version': ...}
    >>> ExtractStringComponentsTransformer.from_json(json_dump)
    ExtractStringComponentsTransformer(by='@', columns=['a'], return_n_components=2)

    ```

    """

    polars_compatible = True

    lazyframe_compatible = True

    jsonable = True

    FITS = False

    @beartype
    def __init__(
        self,
        columns: str | ListOfStrs,
        by: str,
        return_n_components: StrictlyPositiveInt,
        **kwargs: bool | None,
    ) -> None:
        """Set up the transformer.

        Parameters
        ----------
        columns: Union[str, ListOfStrings]
            columns holding the string values to split into components.

        by: str
            character to split strings by

        return_n_components: StrictlyPositiveInt
            number of components to return

        **kwargs
            Arbitrary keyword arguments forwarded to BaseTransformer.__init__.

        """
        super().__init__(columns=columns, **kwargs)

        self.by = by
        self.return_n_components = return_n_components

    def get_feature_names_out(self) -> list[str]:
        """List features modified/created by the transformer.

        Names are ordered column by column, and by component index within
        each column.

        Returns
        -------
        list[str]:
            list of features modified/created by the transformer

        Examples
        --------
        ```pycon
        >>> transformer = ExtractStringComponentsTransformer(
        ...     columns=["a"], by="@", return_n_components=2
        ... )
        >>> transformer.get_feature_names_out()
        ['a_split_by_@_entry_0', 'a_split_by_@_entry_1']

        ```

        """
        feature_names: list[str] = []
        for col in self.columns:
            stem = f"{col}_split_by_{self.by}_entry_"
            feature_names.extend(
                f"{stem}{i}" for i in range(self.return_n_components)
            )
        return feature_names

    @block_from_json
    def to_json(self) -> dict[str, dict[str, Any]]:
        """Dump transformer to json dict.

        Extends the parent's json dict with the 'by' and
        'return_n_components' init arguments.

        Returns
        -------
        dict[str, dict[str, Any]]:
            jsonified transformer. Nested dict containing levels for
            attributes set at init and fit.

        Examples
        --------
        ```pycon
        >>> from pprint import pprint
        >>> transformer = ExtractStringComponentsTransformer(
        ...     columns=["a"], by="@", return_n_components=2
        ... )
        >>> pprint(transformer.to_json())
        {'classname': 'ExtractStringComponentsTransformer',
         'fit': {'is_fitted_': False},
         'init': {'by': '@',
                  'columns': ['a'],
                  'copy': False,
                  'return_n_components': 2,
                  'return_native': True,
                  'verbose': False},
         'tubular_version': ...}

        ```

        """
        json_dict = super().to_json()
        json_dict["init"].update(
            {
                "by": self.by,
                "return_n_components": self.return_n_components,
            },
        )
        return json_dict

    def get_transform_exprs(self) -> list[nw.Expr]:
        """Build the expressions applied by transform.

        Returns
        -------
        list[nw.Expr]:
            output of extract_string_components for this transformer's
            columns, by and return_n_components

        """
        component_exprs = extract_string_components(
            columns=self.columns,
            by=self.by,
            return_n_components=self.return_n_components,
        )
        return component_exprs

    def transform(self, X: DataFrame) -> DataFrame:
        r"""Split the configured string columns and add the extracted components.

        Parameters
        ----------
        X : DataFrame
            Data containing the columns to extract components from.

        Returns
        -------
        X : DataFrame
            Input X with the string components extracted from the columns.

        Examples
        --------
        ```pycon
        >>> import polars as pl
        >>> test_df = pl.DataFrame({"a": ["greg@gmail.com", "bob@apple.net"]})
        >>> transformer = ExtractStringComponentsTransformer(
        ...     columns=["a"], by="@", return_n_components=2
        ... )
        >>> transformer.transform(test_df)
        shape: (2, 3)
        ┌────────────────┬──────────────────────┬──────────────────────┐
        │ a              ┆ a_split_by_@_entry_0 ┆ a_split_by_@_entry_1 │
        │ ---            ┆ ---                  ┆ ---                  │
        │ str            ┆ str                  ┆ str                  │
        ╞════════════════╪══════════════════════╪══════════════════════╡
        │ greg@gmail.com ┆ greg                 ┆ gmail.com            │
        │ bob@apple.net  ┆ bob                  ┆ apple.net            │
        └────────────────┴──────────────────────┴──────────────────────┘

        ```

        """
        frame = _convert_dataframe_to_narwhals(X)

        exprs = self.get_transform_exprs()
        # nothing to apply when no expressions were produced
        if exprs:
            frame = frame.with_columns(*exprs)

        return _return_narwhals_or_native_dataframe(frame, self.return_native)
@register
class RemoveCharactersTransformer(BaseTransformer):
    r"""Transformer that removes the given characters from text columns.

    Attributes
    ----------
    characters: list[str]
        list of characters to remove from text columns.

    characters_formatted: str
        characters attr joined together and wrapped in square brackets, to
        form a regex string.

    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits
        its supported functionality to .transform

    polars_compatible: bool
        class attribute, indicates whether transformer has been converted to
        the polars/pandas agnostic narwhals framework

    return_native: bool, default = True
        Controls whether transformer returns narwhals or native pandas/polars
        type

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    FITS: bool
        class attribute, indicates whether transform requires fit to be run
        first

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    Examples
    --------
    ```pycon
    >>> from pprint import pprint
    >>> transformer = RemoveCharactersTransformer(columns=["a"], characters=["\\d"])
    >>> transformer
    RemoveCharactersTransformer(characters=['\\d'], columns=['a'])
    >>> json_dump = transformer.to_json()
    >>> pprint(json_dump)
    {'classname': 'RemoveCharactersTransformer',
     'fit': {'is_fitted_': False},
     'init': {'characters': ['\\d'],
              'columns': ['a'],
              'copy': False,
              'return_native': True,
              'verbose': False},
     'tubular_version': ...}
    >>> RemoveCharactersTransformer.from_json(json_dump)
    RemoveCharactersTransformer(characters=['\\d'], columns=['a'])

    ```

    """

    polars_compatible = True

    lazyframe_compatible = True

    jsonable = True

    FITS = False

    @beartype
    def __init__(
        self,
        columns: str | ListOfStrs,
        characters: list[str],
        **kwargs: bool | None,
    ) -> None:
        """Set up the transformer.

        Parameters
        ----------
        columns: Union[str, ListOfStrings]
            columns to remove characters from.

        characters: list[str]
            characters to remove from the specified columns.

        **kwargs
            Arbitrary keyword arguments forwarded to BaseTransformer.__init__.

        """
        super().__init__(columns=columns, **kwargs)

        self.characters = characters
        # wrap the joined characters in [] so they form one character class
        joined_characters = "".join(self.characters)
        self.characters_formatted = f"[{joined_characters}]"

    @block_from_json
    def to_json(self) -> dict[str, dict[str, Any]]:
        """Dump transformer to json dict.

        Extends the parent's json dict with the 'characters' init argument.

        Returns
        -------
        dict[str, dict[str, Any]]:
            jsonified transformer. Nested dict containing levels for
            attributes set at init and fit.

        Examples
        --------
        ```pycon
        >>> from pprint import pprint
        >>> transformer = RemoveCharactersTransformer(columns=["a", "b"], characters=["a"])
        >>> pprint(transformer.to_json())
        {'classname': 'RemoveCharactersTransformer',
         'fit': {'is_fitted_': False},
         'init': {'characters': ['a'],
                  'columns': ['a', 'b'],
                  'copy': False,
                  'return_native': True,
                  'verbose': False},
         'tubular_version': ...}

        ```

        """
        json_dict = super().to_json()
        json_dict["init"].update({"characters": self.characters})
        return json_dict

    def get_transform_exprs(self) -> list[nw.Expr]:
        """Build the expressions applied by transform.

        Returns
        -------
        list[nw.Expr]:
            output of remove_characters_from_string_columns for
            self.columns and self.characters_formatted

        """
        removal_exprs = remove_characters_from_string_columns(
            columns=self.columns,
            characters_formatted=self.characters_formatted,
        )
        return removal_exprs

    def transform(self, X: DataFrame) -> DataFrame:
        r"""Strip the unwanted characters from the configured columns.

        Parameters
        ----------
        X : DataFrame
            Data containing the columns to strip.

        Returns
        -------
        X : DataFrame
            Input X with the characters stripped from the configured columns.

        Examples
        --------
        ```pycon
        >>> import polars as pl
        >>> test_df = pl.DataFrame({"a": [" 8hi!", None, "9999hello "]})
        >>> transformer = RemoveCharactersTransformer(columns=["a"], characters=[r"\W", r"\s"])
        >>> transformer.transform(test_df)
        shape: (3, 1)
        ┌───────────┐
        │ a         │
        │ ---       │
        │ str       │
        ╞═══════════╡
        │ 8hi       │
        │ null      │
        │ 9999hello │
        └───────────┘

        ```

        """
        frame = _convert_dataframe_to_narwhals(X)

        exprs = self.get_transform_exprs()
        # nothing to apply when no expressions were produced
        if exprs:
            frame = frame.with_columns(*exprs)

        return _return_narwhals_or_native_dataframe(frame, self.return_native)
@register
class StringContainsTransformer(BaseTransformer):
    r"""Transformer that flags whether the given columns contain a reference value.

    Attributes
    ----------
    reference: str
        column or value to compare against, e.g. look for values of
        reference='a' in columns ['b', 'c'].

    reference_as_column: bool
        indicates whether reference represents a column (or value).
        Note, reference_as_column=True is not supported for pandas backend.

    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits
        its supported functionality to .transform

    polars_compatible: bool
        class attribute, indicates whether transformer has been converted to
        the polars/pandas agnostic narwhals framework

    return_native: bool, default = True
        Controls whether transformer returns narwhals or native pandas/polars
        type

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    FITS: bool
        class attribute, indicates whether transform requires fit to be run
        first

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    Examples
    --------
    ```pycon
    >>> from pprint import pprint
    >>> transformer = StringContainsTransformer(
    ...     columns=["a"], reference="b", reference_as_column=True
    ... )
    >>> transformer
    StringContainsTransformer(columns=['a'], reference='b',
                              reference_as_column=True)
    >>> json_dump = transformer.to_json()
    >>> pprint(json_dump)
    {'classname': 'StringContainsTransformer',
     'fit': {'is_fitted_': False},
     'init': {'columns': ['a'],
              'copy': False,
              'reference': 'b',
              'reference_as_column': True,
              'return_native': True,
              'verbose': False},
     'tubular_version': ...}
    >>> StringContainsTransformer.from_json(json_dump)
    StringContainsTransformer(columns=['a'], reference='b',
                              reference_as_column=True)

    ```

    """

    polars_compatible = True

    lazyframe_compatible = True

    jsonable = True

    FITS = False

    @beartype
    def __init__(
        self,
        columns: str | ListOfStrs,
        reference: str,
        reference_as_column: bool = False,
        **kwargs: bool | None,
    ) -> None:
        """Set up the transformer.

        Parameters
        ----------
        columns: Union[str, ListOfStrings]
            columns to search for the reference.

        reference: str
            reference value to search for

        reference_as_column: bool
            whether to treat reference as a column or a literal

        **kwargs
            Arbitrary keyword arguments forwarded to BaseTransformer.__init__.

        """
        super().__init__(columns=columns, **kwargs)

        self.reference = reference
        self.reference_as_column = reference_as_column

    def get_feature_names_out(self) -> list[str]:
        """List features modified/created by the transformer.

        Returns
        -------
        list[str]:
            list of features modified/created by the transformer

        Examples
        --------
        ```pycon
        >>> transformer = StringContainsTransformer(columns=["a", "b"], reference="c")
        >>> transformer.get_feature_names_out()
        ['a_contains_c', 'b_contains_c']

        ```

        """
        feature_names: list[str] = []
        for col in self.columns:
            feature_names.append(f"{col}_contains_{self.reference}")
        return feature_names

    @block_from_json
    def to_json(self) -> dict[str, dict[str, Any]]:
        """Dump transformer to json dict.

        Extends the parent's json dict with the 'reference' and
        'reference_as_column' init arguments.

        Returns
        -------
        dict[str, dict[str, Any]]:
            jsonified transformer. Nested dict containing levels for
            attributes set at init and fit.

        Examples
        --------
        ```pycon
        >>> from pprint import pprint
        >>> transformer = StringContainsTransformer(
        ...     columns=["a"], reference="b", reference_as_column=True
        ... )
        >>> pprint(transformer.to_json())
        {'classname': 'StringContainsTransformer',
         'fit': {'is_fitted_': False},
         'init': {'columns': ['a'],
                  'copy': False,
                  'reference': 'b',
                  'reference_as_column': True,
                  'return_native': True,
                  'verbose': False},
         'tubular_version': ...}

        ```

        """
        json_dict = super().to_json()
        json_dict["init"].update(
            {
                "reference": self.reference,
                "reference_as_column": self.reference_as_column,
            },
        )
        return json_dict

    def get_transform_exprs(self) -> list[nw.Expr]:
        """Build the expressions applied by transform.

        Returns
        -------
        list[nw.Expr]:
            output of indicate_if_string_columns_contain_reference for this
            transformer's columns, reference and reference_as_column

        """
        contains_exprs = indicate_if_string_columns_contain_reference(
            columns=self.columns,
            reference=self.reference,
            reference_as_column=self.reference_as_column,
        )
        return contains_exprs

    def transform(self, X: DataFrame) -> DataFrame:
        r"""Indicate if the configured columns contain the reference values.

        Parameters
        ----------
        X : DataFrame
            Data containing the columns to search.

        Returns
        -------
        X : DataFrame
            Input X with the contains-indicator columns added.

        Raises
        ------
        TypeError:
            if called on pandas df when reference_as_column=True

        Examples
        --------
        ```pycon
        >>> import polars as pl
        >>> test_df = pl.DataFrame(
        ...     {"a": ["cat", "dog", None, "mouse"], "b": ["cat", "rat", None, "mouse"]}
        ... )
        >>> transformer = StringContainsTransformer(
        ...     columns=["a"], reference="b", reference_as_column=True
        ... )
        >>> transformer.transform(test_df)
        shape: (4, 3)
        ┌───────┬───────┬──────────────┐
        │ a     ┆ b     ┆ a_contains_b │
        │ ---   ┆ ---   ┆ ---          │
        │ str   ┆ str   ┆ bool         │
        ╞═══════╪═══════╪══════════════╡
        │ cat   ┆ cat   ┆ true         │
        │ dog   ┆ rat   ┆ false        │
        │ null  ┆ null  ┆ null         │
        │ mouse ┆ mouse ┆ true         │
        └───────┴───────┴──────────────┘

        ```

        """
        frame = _convert_dataframe_to_narwhals(X)

        # comparing against another column is rejected for pandas input
        native_namespace = nw.get_native_namespace(frame)
        is_pandas = native_namespace.__name__ == "pandas"
        if self.reference_as_column and is_pandas:
            msg = f"{self.classname()}: reference_as_column=True is only supported for polars backend"
            raise TypeError(msg)

        exprs = self.get_transform_exprs()
        # nothing to apply when no expressions were produced
        if exprs:
            frame = frame.with_columns(*exprs)

        return _return_narwhals_or_native_dataframe(frame, self.return_native)
# DEPRECATED TRANSFORMERS
@deprecated(
    """This transformer has not been selected for conversion to polars/narwhals, and so has been deprecated. If aspects of it have been useful to you, please raise an issue for it to be replaced with more specific transformers """,
)
class SeriesStrMethodTransformer(BaseTransformer):
    """Transformer that applies a pandas.Series.str method.

    Transformer assigns the output of the method to a new column. It is
    possible to supply other key word arguments to the transform method,
    which will be passed to the pandas.Series.str method being called.

    Be aware it is possible to supply incompatible arguments to init that
    will only be identified when transform is run. This is because there are
    many combinations of method, input and output sizes. Additionally some
    methods may only work as expected when called in transform with specific
    key word arguments.

    Attributes
    ----------
    new_column_name : str
        The name of the column or columns to be assigned to the output of
        running the pd.Series.str in transform.

    pd_method_name : str
        The name of the pd.Series.str method to call.

    pd_method_kwargs : dict
        Keyword arguments passed to the pd.Series.str method in transform.

    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits
        its supported functionality to .transform

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to
        the polars/pandas agnostic narwhals framework

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    FITS: bool
        class attribute, indicates whether transform requires fit to be run
        first

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    deprecated: bool
        indicates if class has been deprecated

    """

    polars_compatible = False

    lazyframe_compatible = False

    jsonable = False

    deprecated = True

    @beartype
    def __init__(
        self,
        new_column_name: str,
        pd_method_name: str,
        columns: ListOfOneStr,
        copy: bool = False,
        pd_method_kwargs: GenericKwargs | None = None,
        **kwargs: bool | None,
    ) -> None:
        """Initialise class.

        Parameters
        ----------
        new_column_name : str
            The name of the column to be assigned to the output of running
            the pd.Series.str in transform.

        pd_method_name : str
            The name of the pandas.Series.str method to call e.g. 'split' or
            'replace'

        columns : list
            Name of column to apply the transformer to. This needs to be
            passed as a list of length 1. Value passed in columns is saved in
            the columns attribute of the object. Note this has no default
            value so the user has to specify the column when initialising the
            transformer. This is to avoid all columns being picked up when
            super transform runs if the user forgets an input.

        pd_method_kwargs : dict, default = {}
            A dictionary of keyword arguments to be passed to the
            pd.Series.str method when it is called.

        copy: bool
            Perform transform on copy of df?

        **kwargs
            Arbitrary keyword arguments passed onto BaseTransformer.__init__().

        Raises
        ------
        AttributeError:
            if pd_method_name is not a pd.Series.str method

        """
        super().__init__(columns=columns, copy=copy, **kwargs)

        # None stands in for "no kwargs" so no mutable default is shared
        if pd_method_kwargs is None:
            pd_method_kwargs = {}

        self.new_column_name = new_column_name
        self.pd_method_name = pd_method_name
        self.pd_method_kwargs = pd_method_kwargs

        # probe the .str accessor of a throwaway string Series so an unknown
        # method name is reported at init rather than at transform time
        str_accessor = pd.Series(["a"]).str
        try:
            getattr(str_accessor, pd_method_name)
        # only a failed lookup should be relabelled; anything else propagates
        except AttributeError as err:
            msg = f'{self.classname()}: error accessing "str.{pd_method_name}" method on pd.Series object - pd_method_name should be a pd.Series.str method'
            raise AttributeError(msg) from err

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Apply given pandas.Series.str method to given column.

        X is first passed through BaseTransformer.transform. Any keyword
        arguments set in the pd_method_kwargs attribute are passed onto the
        pd.Series.str method when calling it.

        Parameters
        ----------
        X : pd.DataFrame
            Data to transform.

        Returns
        -------
        X : pd.DataFrame
            Input X with additional column (self.new_column_name) added.
            This contains the output of running the pd.Series.str method.

        """
        X = super().transform(X)

        # columns is a list of length one, so its single entry is the target
        str_method = getattr(X[self.columns[0]].str, self.pd_method_name)
        X[self.new_column_name] = str_method(**self.pd_method_kwargs)

        return X
@deprecated(
    """This transformer has not been selected for conversion to polars/narwhals, and so has been deprecated. If it is useful to you, please raise an issue for it to be modernised """,
)
class StringConcatenator(BaseTransformer):
    """Transformer that joins the values of several columns into one string column.

    The columns may be of mixed datatypes; values are cast to str before
    being joined.

    Parameters
    ----------
    columns : str or list of str
        Columns to concatenate.

    new_column_name : str, default = "new_column"
        New column name

    separator : str, default = " "
        Separator for the new string value

    Attributes
    ----------
    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits
        its supported functionality to .transform

    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to
        the polars/pandas agnostic narwhals framework

    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods

    FITS: bool
        class attribute, indicates whether transform requires fit to be run
        first

    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    deprecated: bool
        indicates if class has been deprecated

    """

    polars_compatible = False

    lazyframe_compatible = False

    jsonable = False

    deprecated = True

    @beartype
    def __init__(
        self,
        columns: str | ListOfStrs,
        new_column_name: str = "new_column",
        separator: str = " ",
        **kwargs: bool,
    ) -> None:
        """Set up the transformer.

        Parameters
        ----------
        columns : str or list of str
            Columns to concatenate.

        new_column_name : str, default = "new_column"
            New column name

        separator : str, default = " "
            Separator for the new string value

        **kwargs:
            arguments for base class

        """
        super().__init__(columns=columns, **kwargs)

        self.separator = separator
        self.new_column_name = new_column_name

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Join the values of the configured columns into a new string column.

        Parameters
        ----------
        X : df
            Data to concatenate values on.

        Returns
        -------
        X : df
            Returns a dataframe with concatenated values.

        """
        X = super().transform(X)

        if not X.empty:
            # cast every value to str, then join each row with the separator
            as_text = X[self.columns].astype(str)
            X[self.new_column_name] = as_text.apply(self.separator.join, axis=1)
            return X

        # quick fix for empty frames, not spending much
        # time on this as transformer is deprecated
        X[self.new_column_name] = pd.Series(dtype=str)
        return X