Source code for tubular.strings
"""Contains transformers that apply string functions."""
from __future__ import annotations
from typing import Any
import narwhals as nw
import pandas as pd
from beartype import beartype
from typing_extensions import deprecated
from tubular._utils import (
_convert_dataframe_to_narwhals,
_return_narwhals_or_native_dataframe,
block_from_json,
)
from tubular.base import BaseTransformer, register
from tubular.functions.strings import (
convert_string_columns_to_lowercase,
extract_string_components,
indicate_if_string_columns_contain_reference,
remove_characters_from_string_columns,
)
from tubular.types import (
DataFrame,
GenericKwargs,
ListOfOneStr,
ListOfStrs,
StrictlyPositiveInt,
)
[docs]
@register
class LowerCaseTransformer(BaseTransformer):
    """Lowercase the text held in the selected string columns.

    Attributes
    ----------
    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its
        supported functionality to .transform
    polars_compatible: bool
        class attribute, indicates whether transformer has been converted to the
        polars/pandas agnostic narwhals framework
    return_native: bool, default = True
        controls whether transformer returns narwhals or native pandas/polars type
    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods
    FITS: bool
        class attribute, indicates whether transform requires fit to be run first
    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    Examples
    --------
    ```pycon
    >>> from pprint import pprint
    >>> transformer = LowerCaseTransformer(
    ...     columns=["a"],
    ... )
    >>> transformer
    LowerCaseTransformer(columns=['a'])
    >>> json_dump = transformer.to_json()
    >>> pprint(json_dump)
    {'classname': 'LowerCaseTransformer',
     'fit': {'is_fitted_': False},
     'init': {'columns': ['a'],
              'copy': False,
              'return_native': True,
              'verbose': False},
     'tubular_version': ...}
    >>> LowerCaseTransformer.from_json(json_dump)
    LowerCaseTransformer(columns=['a'])

    ```

    """

    polars_compatible = True
    lazyframe_compatible = True
    jsonable = True
    FITS = False

    @beartype
    def __init__(
        self,
        columns: str | ListOfStrs,
        **kwargs: bool | None,
    ) -> None:
        """Set up the transformer.

        Parameters
        ----------
        columns: Union[str, ListOfStrings]
            name(s) of the columns whose values should be lowercased.

        **kwargs
            forwarded unchanged to BaseTransformer.__init__.

        """
        super().__init__(columns=columns, **kwargs)

    def get_transform_exprs(self) -> list[nw.Expr]:
        """Build the narwhals expressions that perform the lowercasing.

        Returns
        -------
        list[nw.Expr]:
            result of convert_string_columns_to_lowercase for self.columns

        """
        lowercase_exprs = convert_string_columns_to_lowercase(columns=self.columns)
        return lowercase_exprs

    def transform(self, X: DataFrame) -> DataFrame:
        """Lowercase the text in the configured columns of X.

        Parameters
        ----------
        X : DataFrame
            Data containing the columns to lowercase.

        Returns
        -------
        X : DataFrame
            Input data with the configured columns lowercased; narwhals or
            native type depending on self.return_native.

        Examples
        --------
        ```pycon
        >>> import polars as pl
        >>> test_df = pl.DataFrame({"a": ["HeLlO", None, " HI"]})
        >>> transformer = LowerCaseTransformer(columns="a")
        >>> transformer.transform(test_df)
        shape: (3, 1)
        ┌───────┐
        │ a     │
        │ ---   │
        │ str   │
        ╞═══════╡
        │ hello │
        │ null  │
        │  hi   │
        └───────┘

        ```

        """
        frame = _convert_dataframe_to_narwhals(X)

        exprs = self.get_transform_exprs()
        # with no expressions to apply, the frame is passed through untouched
        if exprs:
            frame = frame.with_columns(*exprs)

        return _return_narwhals_or_native_dataframe(frame, self.return_native)
[docs]
@register
class ExtractStringComponentsTransformer(BaseTransformer):
    r"""Split string columns on a separator and expose the leading components.

    Attributes
    ----------
    by: str
        separator that the strings are split on
    return_n_components: int
        how many components are returned per column
    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its
        supported functionality to .transform
    polars_compatible: bool
        class attribute, indicates whether transformer has been converted to the
        polars/pandas agnostic narwhals framework
    return_native: bool, default = True
        controls whether transformer returns narwhals or native pandas/polars type
    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods
    FITS: bool
        class attribute, indicates whether transform requires fit to be run first
    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    Examples
    --------
    ```pycon
    >>> from pprint import pprint
    >>> transformer = ExtractStringComponentsTransformer(
    ...     columns=["a"], by="@", return_n_components=2
    ... )
    >>> transformer
    ExtractStringComponentsTransformer(by='@', columns=['a'], return_n_components=2)
    >>> json_dump = transformer.to_json()
    >>> pprint(json_dump)
    {'classname': 'ExtractStringComponentsTransformer',
     'fit': {'is_fitted_': False},
     'init': {'by': '@',
              'columns': ['a'],
              'copy': False,
              'return_n_components': 2,
              'return_native': True,
              'verbose': False},
     'tubular_version': ...}
    >>> ExtractStringComponentsTransformer.from_json(json_dump)
    ExtractStringComponentsTransformer(by='@', columns=['a'], return_n_components=2)

    ```

    """

    polars_compatible = True
    lazyframe_compatible = True
    jsonable = True
    FITS = False

    @beartype
    def __init__(
        self,
        columns: str | ListOfStrs,
        by: str,
        return_n_components: StrictlyPositiveInt,
        **kwargs: bool | None,
    ) -> None:
        """Set up the transformer.

        Parameters
        ----------
        columns: Union[str, ListOfStrings]
            name(s) of the string columns to split into components.

        by: str
            separator to split the strings on.

        return_n_components: StrictlyPositiveInt
            number of components to return for each column.

        **kwargs
            forwarded unchanged to BaseTransformer.__init__.

        """
        super().__init__(columns=columns, **kwargs)

        self.return_n_components = return_n_components
        self.by = by

    def get_feature_names_out(self) -> list[str]:
        """List the columns this transformer creates.

        Names are ordered column by column, then by component index.

        Returns
        -------
        list[str]:
            one "<col>_split_by_<by>_entry_<i>" name per column and component

        Examples
        --------
        ```pycon
        >>> transformer = ExtractStringComponentsTransformer(
        ...     columns=["a"], by="@", return_n_components=2
        ... )
        >>> transformer.get_feature_names_out()
        ['a_split_by_@_entry_0', 'a_split_by_@_entry_1']

        ```

        """
        feature_names: list[str] = []
        for column in self.columns:
            feature_names.extend(
                f"{column}_split_by_{self.by}_entry_{index}"
                for index in range(self.return_n_components)
            )
        return feature_names

    @block_from_json
    def to_json(self) -> dict[str, dict[str, Any]]:
        """Serialise the transformer to a json-style dict.

        Extends the dict produced by the parent class with this class's own
        init arguments (by, return_n_components).

        Returns
        -------
        dict[str, dict[str, Any]]:
            nested dict with levels for attributes set at init and fit.

        Examples
        --------
        ```pycon
        >>> from pprint import pprint
        >>> transformer = ExtractStringComponentsTransformer(
        ...     columns=["a"], by="@", return_n_components=2
        ... )
        >>> pprint(transformer.to_json())
        {'classname': 'ExtractStringComponentsTransformer',
         'fit': {'is_fitted_': False},
         'init': {'by': '@',
                  'columns': ['a'],
                  'copy': False,
                  'return_n_components': 2,
                  'return_native': True,
                  'verbose': False},
         'tubular_version': ...}

        ```

        """
        dumped = super().to_json()
        dumped["init"].update(
            by=self.by,
            return_n_components=self.return_n_components,
        )
        return dumped

    def get_transform_exprs(self) -> list[nw.Expr]:
        """Build the narwhals expressions that extract the components.

        Returns
        -------
        list[nw.Expr]:
            result of extract_string_components for the configured columns,
            separator and component count

        """
        component_exprs = extract_string_components(
            columns=self.columns,
            by=self.by,
            return_n_components=self.return_n_components,
        )
        return component_exprs

    def transform(self, X: DataFrame) -> DataFrame:
        r"""Add the extracted string components to X.

        Parameters
        ----------
        X : DataFrame
            Data containing the columns to extract components from.

        Returns
        -------
        X : DataFrame
            Input data with the extracted component columns added; narwhals or
            native type depending on self.return_native.

        Examples
        --------
        ```pycon
        >>> import polars as pl
        >>> test_df = pl.DataFrame({"a": ["greg@gmail.com", "bob@apple.net"]})
        >>> transformer = ExtractStringComponentsTransformer(
        ...     columns=["a"], by="@", return_n_components=2
        ... )
        >>> transformer.transform(test_df)
        shape: (2, 3)
        ┌────────────────┬──────────────────────┬──────────────────────┐
        │ a              ┆ a_split_by_@_entry_0 ┆ a_split_by_@_entry_1 │
        │ ---            ┆ ---                  ┆ ---                  │
        │ str            ┆ str                  ┆ str                  │
        ╞════════════════╪══════════════════════╪══════════════════════╡
        │ greg@gmail.com ┆ greg                 ┆ gmail.com            │
        │ bob@apple.net  ┆ bob                  ┆ apple.net            │
        └────────────────┴──────────────────────┴──────────────────────┘

        ```

        """
        frame = _convert_dataframe_to_narwhals(X)

        exprs = self.get_transform_exprs()
        # with no expressions to apply, the frame is passed through untouched
        if exprs:
            frame = frame.with_columns(*exprs)

        return _return_narwhals_or_native_dataframe(frame, self.return_native)
[docs]
@register
class RemoveCharactersTransformer(BaseTransformer):
    r"""Remove unwanted characters from text columns.

    The supplied characters are concatenated, without escaping, inside a regex
    character class (``[...]``), so regex shorthands such as ``\d`` or ``\s``
    can be used as entries.

    Attributes
    ----------
    characters: list[str]
        characters (or regex shorthands) to remove from the text columns.
    characters_formatted: str
        characters attr joined and wrapped in square brackets to form a regex
        character class.
    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its
        supported functionality to .transform
    polars_compatible: bool
        class attribute, indicates whether transformer has been converted to the
        polars/pandas agnostic narwhals framework
    return_native: bool, default = True
        controls whether transformer returns narwhals or native pandas/polars type
    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods
    FITS: bool
        class attribute, indicates whether transform requires fit to be run first
    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    Examples
    --------
    ```pycon
    >>> from pprint import pprint
    >>> transformer = RemoveCharactersTransformer(columns=["a"], characters=["\\d"])
    >>> transformer
    RemoveCharactersTransformer(characters=['\\d'], columns=['a'])
    >>> json_dump = transformer.to_json()
    >>> pprint(json_dump)
    {'classname': 'RemoveCharactersTransformer',
     'fit': {'is_fitted_': False},
     'init': {'characters': ['\\d'],
              'columns': ['a'],
              'copy': False,
              'return_native': True,
              'verbose': False},
     'tubular_version': ...}
    >>> RemoveCharactersTransformer.from_json(json_dump)
    RemoveCharactersTransformer(characters=['\\d'], columns=['a'])

    ```

    """

    polars_compatible = True
    lazyframe_compatible = True
    jsonable = True
    FITS = False

    @beartype
    def __init__(
        self,
        columns: str | ListOfStrs,
        characters: list[str],
        **kwargs: bool | None,
    ) -> None:
        """Set up the transformer.

        Parameters
        ----------
        columns: Union[str, ListOfStrings]
            name(s) of the columns to remove characters from.

        characters: list[str]
            characters to remove from the specified columns.

        **kwargs
            forwarded unchanged to BaseTransformer.__init__.

        """
        super().__init__(columns=columns, **kwargs)

        self.characters = characters

        # entries are joined as-is (no escaping) into one regex character class
        joined_characters = "".join(self.characters)
        self.characters_formatted = f"[{joined_characters}]"

    @block_from_json
    def to_json(self) -> dict[str, dict[str, Any]]:
        """Serialise the transformer to a json-style dict.

        Extends the dict produced by the parent class with the characters init
        argument.

        Returns
        -------
        dict[str, dict[str, Any]]:
            nested dict with levels for attributes set at init and fit.

        Examples
        --------
        ```pycon
        >>> from pprint import pprint
        >>> transformer = RemoveCharactersTransformer(columns=["a", "b"], characters=["a"])
        >>> pprint(transformer.to_json())
        {'classname': 'RemoveCharactersTransformer',
         'fit': {'is_fitted_': False},
         'init': {'characters': ['a'],
                  'columns': ['a', 'b'],
                  'copy': False,
                  'return_native': True,
                  'verbose': False},
         'tubular_version': ...}

        ```

        """
        dumped = super().to_json()
        dumped["init"].update(characters=self.characters)
        return dumped

    def get_transform_exprs(self) -> list[nw.Expr]:
        """Build the narwhals expressions that remove the characters.

        Returns
        -------
        list[nw.Expr]:
            result of remove_characters_from_string_columns for the configured
            columns and formatted character class

        """
        removal_exprs = remove_characters_from_string_columns(
            columns=self.columns,
            characters_formatted=self.characters_formatted,
        )
        return removal_exprs

    def transform(self, X: DataFrame) -> DataFrame:
        r"""Remove the configured characters from the configured columns of X.

        Parameters
        ----------
        X : DataFrame
            Data containing the columns to clean.

        Returns
        -------
        X : DataFrame
            Input data with the characters removed from the configured columns;
            narwhals or native type depending on self.return_native.

        Examples
        --------
        ```pycon
        >>> import polars as pl
        >>> test_df = pl.DataFrame({"a": [" 8hi!", None, "9999hello "]})
        >>> transformer = RemoveCharactersTransformer(columns=["a"], characters=["\W", "\s"])
        >>> transformer.transform(test_df)
        shape: (3, 1)
        ┌───────────┐
        │ a         │
        │ ---       │
        │ str       │
        ╞═══════════╡
        │ 8hi       │
        │ null      │
        │ 9999hello │
        └───────────┘

        ```

        """
        frame = _convert_dataframe_to_narwhals(X)

        exprs = self.get_transform_exprs()
        # with no expressions to apply, the frame is passed through untouched
        if exprs:
            frame = frame.with_columns(*exprs)

        return _return_narwhals_or_native_dataframe(frame, self.return_native)
[docs]
@register
class StringContainsTransformer(BaseTransformer):
    r"""Transformer class to indicate if given columns contain reference values.

    Attributes
    ----------
    reference: str
        column or value to compare against, e.g.
        look for values of reference='a' in columns ['b', 'c'].
    reference_as_column: bool
        indicates whether reference represents a column (or value).
        Note, reference_as_column=True is not supported for pandas backend.
    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its
        supported functionality to .transform
    polars_compatible: bool
        class attribute, indicates whether transformer has been converted to the
        polars/pandas agnostic narwhals framework
    return_native: bool, default = True
        controls whether transformer returns narwhals or native pandas/polars type
    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods
    FITS: bool
        class attribute, indicates whether transform requires fit to be run first
    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes

    Examples
    --------
    ```pycon
    >>> from pprint import pprint
    >>> transformer = StringContainsTransformer(
    ...     columns=["a"], reference="b", reference_as_column=True
    ... )
    >>> transformer
    StringContainsTransformer(columns=['a'], reference='b',
                              reference_as_column=True)
    >>> json_dump = transformer.to_json()
    >>> pprint(json_dump)
    {'classname': 'StringContainsTransformer',
     'fit': {'is_fitted_': False},
     'init': {'columns': ['a'],
              'copy': False,
              'reference': 'b',
              'reference_as_column': True,
              'return_native': True,
              'verbose': False},
     'tubular_version': ...}
    >>> StringContainsTransformer.from_json(json_dump)
    StringContainsTransformer(columns=['a'], reference='b',
                              reference_as_column=True)

    ```

    """

    polars_compatible = True
    lazyframe_compatible = True
    jsonable = True
    FITS = False

    @beartype
    def __init__(
        self,
        columns: str | ListOfStrs,
        reference: str,
        reference_as_column: bool = False,
        **kwargs: bool | None,
    ) -> None:
        """Initialise class instance.

        Parameters
        ----------
        columns: Union[str, ListOfStrings]
            columns to search for the reference in.

        reference: str
            reference value (or column name) to search for

        reference_as_column: bool
            whether to treat reference as a column or a literal

        **kwargs
            Arbitrary keyword arguments passed onto BaseTransformer.__init__.

        """
        super().__init__(columns=columns, **kwargs)

        self.reference = reference
        self.reference_as_column = reference_as_column

    def get_feature_names_out(self) -> list[str]:
        """List features modified/created by the transformer.

        Returns
        -------
        list[str]:
            one "<col>_contains_<reference>" name per configured column

        Examples
        --------
        ```pycon
        >>> transformer = StringContainsTransformer(columns=["a", "b"], reference="c")
        >>> transformer.get_feature_names_out()
        ['a_contains_c', 'b_contains_c']

        ```

        """
        return [f"{col}_contains_{self.reference}" for col in self.columns]

    @block_from_json
    def to_json(self) -> dict[str, dict[str, Any]]:
        """Dump transformer to json dict.

        Extends the dict produced by the parent class with this class's own
        init arguments (reference, reference_as_column).

        Returns
        -------
        dict[str, dict[str, Any]]:
            jsonified transformer. Nested dict containing levels for attributes
            set at init and fit.

        Examples
        --------
        ```pycon
        >>> from pprint import pprint
        >>> transformer = StringContainsTransformer(
        ...     columns=["a"], reference="b", reference_as_column=True
        ... )
        >>> pprint(transformer.to_json())
        {'classname': 'StringContainsTransformer',
         'fit': {'is_fitted_': False},
         'init': {'columns': ['a'],
                  'copy': False,
                  'reference': 'b',
                  'reference_as_column': True,
                  'return_native': True,
                  'verbose': False},
         'tubular_version': ...}

        ```

        """
        json_dict = super().to_json()

        json_dict["init"]["reference"] = self.reference
        json_dict["init"]["reference_as_column"] = self.reference_as_column

        return json_dict

    def get_transform_exprs(self) -> list[nw.Expr]:
        """Get transform expressions.

        Returns
        -------
        list[nw.Expr]:
            result of indicate_if_string_columns_contain_reference for the
            configured columns and reference

        """
        return indicate_if_string_columns_contain_reference(
            columns=self.columns,
            reference=self.reference,
            reference_as_column=self.reference_as_column,
        )

    def transform(self, X: DataFrame) -> DataFrame:
        r"""Indicate if provided columns contain reference values.

        Parameters
        ----------
        X : DataFrame
            Data containing the columns to search (and the reference column,
            when reference_as_column=True).

        Returns
        -------
        X : DataFrame
            Input X with the indicator columns added (see example below);
            narwhals or native type depending on self.return_native.

        Raises
        ------
        TypeError: if called on pandas df when reference_as_column=True

        Examples
        --------
        ```pycon
        >>> import polars as pl
        >>> test_df = pl.DataFrame(
        ...     {"a": ["cat", "dog", None, "mouse"], "b": ["cat", "rat", None, "mouse"]}
        ... )
        >>> transformer = StringContainsTransformer(
        ...     columns=["a"], reference="b", reference_as_column=True
        ... )
        >>> transformer.transform(test_df)
        shape: (4, 3)
        ┌───────┬───────┬──────────────┐
        │ a     ┆ b     ┆ a_contains_b │
        │ ---   ┆ ---   ┆ ---          │
        │ str   ┆ str   ┆ bool         │
        ╞═══════╪═══════╪══════════════╡
        │ cat   ┆ cat   ┆ true         │
        │ dog   ┆ rat   ┆ false        │
        │ null  ┆ null  ┆ null         │
        │ mouse ┆ mouse ┆ true         │
        └───────┴───────┴──────────────┘

        ```

        """
        X = _convert_dataframe_to_narwhals(X)

        # column-vs-column comparison is rejected up front for pandas input
        backend = nw.get_native_namespace(X).__name__
        if backend == "pandas" and self.reference_as_column:
            msg = f"{self.classname()}: reference_as_column=True is only supported for polars backend"
            raise TypeError(msg)

        transform_exprs = self.get_transform_exprs()
        X = X.with_columns(*transform_exprs) if transform_exprs else X

        return _return_narwhals_or_native_dataframe(X, self.return_native)
# DEPRECATED TRANSFORMERS
[docs]
@deprecated(
    """This transformer has not been selected for conversion to polars/narwhals,
and so has been deprecated. If aspects of it have been useful to you, please raise an issue
for it to be replaced with more specific transformers
""",
)
class SeriesStrMethodTransformer(BaseTransformer):
    """Transformer that applies a pandas.Series.str method.

    Transformer assigns the output of the method to a new column. It is possible to
    supply other key word arguments to the transform method, which will be passed to the
    pandas.Series.str method being called.

    Be aware it is possible to supply incompatible arguments to init that will only be
    identified when transform is run. This is because there are many combinations of method, input
    and output sizes. Additionally some methods may only work as expected when called in
    transform with specific key word arguments.

    Attributes
    ----------
    new_column_name : str
        The name of the column or columns to be assigned to the output of running the
        pd.Series.str in transform.
    pd_method_name : str
        The name of the pd.Series.str method to call.
    pd_method_kwargs : dict
        Keyword arguments passed to the pd.Series.str method in transform
        (empty dict when none were supplied at init).
    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its
        supported functionality to .transform
    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework
    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods
    FITS: bool
        class attribute, indicates whether transform requires fit to be run first
    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes
    deprecated: bool
        indicates if class has been deprecated

    """

    polars_compatible = False
    lazyframe_compatible = False
    jsonable = False
    deprecated = True

    @beartype
    def __init__(
        self,
        new_column_name: str,
        pd_method_name: str,
        columns: ListOfOneStr,
        copy: bool = False,
        pd_method_kwargs: GenericKwargs | None = None,
        **kwargs: bool | None,
    ) -> None:
        """Initialise class.

        Parameters
        ----------
        new_column_name : str
            The name of the column to be assigned to the output of running the pd.Series.str in transform.

        pd_method_name : str
            The name of the pandas.Series.str method to call e.g. 'split' or 'replace'

        columns : list
            Name of column to apply the transformer to. This needs to be passed as a list of length 1. Value passed
            in columns is saved in the columns attribute of the object. Note this has no default value so
            the user has to specify the column when initialising the transformer. This is to avoid all columns
            being picked up when super transform runs if the user forgets an input.

        copy: bool
            Perform transform on copy of df? Passed onto BaseTransformer.__init__().

        pd_method_kwargs : dict or None, default = None
            A dictionary of keyword arguments to be passed to the pd.Series.str method when it is called.
            None is treated as an empty dict.

        **kwargs
            Arbitrary keyword arguments passed onto BaseTransformer.__init__().

        Raises
        ------
        AttributeError: if pd_method_name is not a pd.Series.str method

        """
        super().__init__(columns=columns, copy=copy, **kwargs)

        if pd_method_kwargs is None:
            pd_method_kwargs = {}

        self.new_column_name = new_column_name
        self.pd_method_name = pd_method_name
        self.pd_method_kwargs = pd_method_kwargs

        # probe a throwaway string Series so a bad method name fails here rather
        # than at transform time
        str_accessor = pd.Series(["a"]).str
        try:
            getattr(str_accessor, pd_method_name)
        except AttributeError as err:
            msg = f'{self.classname()}: error accessing "str.{pd_method_name}" method on pd.Series object - pd_method_name should be a pd.Series.str method'
            raise AttributeError(msg) from err

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Apply given pandas.Series.str method to given column.

        Any keyword arguments set in the pd_method_kwargs attribute are passed onto the pd.Series.str method
        when calling it.

        Parameters
        ----------
        X : pd.DataFrame
            Data to transform.

        Returns
        -------
        X : pd.DataFrame
            Input X with additional column (self.new_column_name) added. These contain the output of
            running the pd.Series.str method.

        """
        X = super().transform(X)

        # columns is a list of length one, so only its first entry is used
        str_method = getattr(X[self.columns[0]].str, self.pd_method_name)
        X[self.new_column_name] = str_method(**self.pd_method_kwargs)

        return X
[docs]
@deprecated(
    """This transformer has not been selected for conversion to polars/narwhals,
and so has been deprecated. If it is useful to you, please raise an issue
for it to be modernised
""",
)
class StringConcatenator(BaseTransformer):
    """Transformer to combine data from specified columns, of mixed datatypes, into a new column containing one string.

    Attributes
    ----------
    new_column_name : str
        Name of the column that receives the concatenated string.
    separator : str
        Separator placed between the values being joined.
    built_from_json: bool
        indicates if transformer was reconstructed from json, which limits its
        supported functionality to .transform
    polars_compatible : bool
        class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework
    jsonable: bool
        class attribute, indicates if transformer supports to/from_json methods
    FITS: bool
        class attribute, indicates whether transform requires fit to be run first
    lazyframe_compatible: bool
        class attribute, indicates whether transformer works with lazyframes
    deprecated: bool
        indicates if class has been deprecated

    """

    polars_compatible = False
    lazyframe_compatible = False
    jsonable = False
    deprecated = True

    @beartype
    def __init__(
        self,
        columns: str | ListOfStrs,
        new_column_name: str = "new_column",
        separator: str = " ",
        # widened from `bool` to match the kwargs annotation used by every
        # other transformer in this module
        **kwargs: bool | None,
    ) -> None:
        """Initialise class.

        Parameters
        ----------
        columns : str or list of str
            Columns to concatenate.

        new_column_name : str, default = "new_column"
            New column name

        separator : str, default = " "
            Separator for the new string value

        **kwargs:
            arguments for base class

        """
        super().__init__(columns=columns, **kwargs)

        self.new_column_name = new_column_name
        self.separator = separator

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Combine data from specified columns, of mixed datatypes, into a new column containing one string.

        Parameters
        ----------
        X : df
            Data to concatenate values on.

        Returns
        -------
        X : df
            Returns a dataframe with concatenated values.

        """
        X = super().transform(X)

        # quick fix for empty frames, not spending much
        # time on this as transformer is deprecated
        if X.empty:
            X[self.new_column_name] = pd.Series(dtype=str)
        else:
            # cast every selected column to str, then join the values row by row
            X[self.new_column_name] = (
                X[self.columns].astype(str).apply(self.separator.join, axis=1)
            )

        return X