Source code for tubular.imputers

"""Contains transformers that deal with imputation of missing values."""

from __future__ import annotations

import warnings
from typing import Any

import narwhals as nw
import polars as pl
from beartype import beartype
from typing_extensions import deprecated

from tubular._stats import (
    _get_mean_calculation_expressions,
    _get_median_calculation_expression,
)
from tubular._utils import (
    _collect_frame,
    _convert_dataframe_to_narwhals,
    _convert_series_to_narwhals,
    _is_null,
    _return_narwhals_or_native_dataframe,
    block_from_json,
)
from tubular.base import BaseTransformer, register
from tubular.mixins import WeightColumnMixin
from tubular.types import DataFrame, LazyFrame, ListOfStrs, NumericTypes, Series

pl.enable_string_cache()


[docs] @register class BaseImputer(BaseTransformer): """Contains transform method that will use fill nulls with values from self.impute_values_. Other imputers in this module should inherit from this class. Attributes ---------- built_from_json: bool indicates if transformer was reconstructed from json, which limits it's supported functionality to .transform polars_compatible : bool class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework return_native: bool, default = True Controls whether transformer returns narwhals or native pandas/polars type jsonable: bool class attribute, indicates if transformer supports to/from_json methods FITS: bool class attribute, indicates whether transform requires fit to be run first lazyframe_compatible: bool class attribute, indicates whether transformer works with lazyframes Examples -------- ```pycon >>> BaseImputer(columns=["a", "b"]) BaseImputer(columns=['a', 'b']) ``` """ polars_compatible = True lazyframe_compatible = True # this class is not by itself jsonable, as needs attrs # which are set in the child classes jsonable = False FITS = False
[docs] @block_from_json def to_json(self) -> dict[str, dict[str, Any]]: """Dump transformer to json dict. Returns ------- dict[str, dict[str, Any]]: jsonified transformer. Nested dict containing levels for attributes set at init and fit. Raises ------ RuntimeError: if class is not jsonable Examples -------- ```pycon >>> arbitrary_imputer = ArbitraryImputer(columns=["a", "b"], impute_value=1) >>> # version will vary for local vs CI, so use ... as generic match >>> arbitrary_imputer.to_json() {'tubular_version': ..., 'classname': 'ArbitraryImputer', 'init': {'columns': ['a', 'b'], 'copy': False, 'verbose': False, 'return_native': True, 'impute_value': 1}, 'fit': {'is_fitted_': True, 'impute_values_': {'a': 1, 'b': 1}}} >>> mean_imputer = MeanImputer(columns=["a", "b"]) >>> test_df = pl.DataFrame({"a": [1, None], "b": [None, 2]}) >>> _ = mean_imputer.fit(test_df) >>> mean_imputer.to_json() {'tubular_version': ..., 'classname': 'MeanImputer', 'init': {'columns': ['a', 'b'], 'copy': False, 'verbose': False, 'return_native': True, 'weights_column': None}, 'fit': {'is_fitted_': True, 'impute_values_': {'a': 1.0, 'b': 2.0}}} ``` """ if not self.jsonable: msg = ( "This transformer has not yet had to/from json functionality developed" ) raise RuntimeError( msg, ) self.check_is_fitted("impute_values_") json_dict = super().to_json() # slightly awkward here as API not fully shared # across classes if isinstance( self, ( MeanImputer, MedianImputer, ModeImputer, ), ): json_dict["init"]["weights_column"] = self.weights_column elif isinstance(self, ArbitraryImputer): json_dict["init"]["impute_value"] = self.impute_value json_dict["fit"]["impute_values_"] = self.impute_values_ return json_dict
def _generate_imputation_expressions(self, expr: nw.Expr, col: str) -> nw.Expr: """Update input expressions to include imputation. Parameters ---------- expr : nw.Expr initial expression col: str column being imputed Returns ------- nw.Expr: updated expression, with imputation """ return ( expr.fill_null(value=self.impute_values_[col]) if (self.impute_values_[col] is not None) else expr ) def _check_for_failed_fit(self) -> None: """Check if fit failed to find needed attrs (if impute_values_ are None). Raises ------ ValueError: if impute_values_ have come out as None """ failed_columns = [ col for col in self.columns if _is_null(self.impute_values_[col]) ] if failed_columns: msg = f"fit has failed for columns {failed_columns}, it is possible that all rows are invalid - check for null/negative weights, all null columns, or other invalid conditions listed in the docstring" raise ValueError(msg)
[docs] @beartype def transform( self, X: DataFrame, return_native_override: bool | None = None, ) -> DataFrame: """Impute missing values with values calculated from fit method. Parameters ---------- X : DataFrame Data to impute. return_native_override: Optional[bool] option to override return_native attr in transformer, useful when calling parent methods Returns ------- X : DataFrame Transformed input X with nulls imputed with the median value for the specified columns. Examples -------- ```pycon >>> import polars as pl >>> imputer = BaseImputer(columns=["a", "b"]) >>> imputer.impute_values_ = {"a": 2, "b": 3.5} >>> test_df = pl.DataFrame({"a": [1, None, 2], "b": [3, None, 4]}) >>> imputer.transform(test_df) shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ │ --- ┆ --- │ │ i64 ┆ f64 │ ╞═════╪═════╡ │ 1 ┆ 3.0 │ │ 2 ┆ 3.5 │ │ 2 ┆ 4.0 │ └─────┴─────┘ ``` """ self.check_is_fitted("impute_values_") return_native = self._process_return_native(return_native_override) X = _convert_dataframe_to_narwhals(X) X = super().transform(X, return_native_override=False) transform_expressions = { col: self._generate_imputation_expressions(nw.col(col), col) for col in self.columns } X = X.with_columns(**transform_expressions) if transform_expressions else X return _return_narwhals_or_native_dataframe(X, return_native)
class _NumberImputer(BaseImputer): """Private subclass to handle arbitrary number imputation. Attributes ---------- impute_value : int | float Value to impute nulls with. built_from_json: bool indicates if transformer was reconstructed from json, which limits it's supported functionality to .transform polars_compatible : bool class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework return_native: bool, default = True Controls whether transformer returns narwhals or native pandas/polars type jsonable: bool class attribute, indicates if transformer supports to/from_json methods FITS: bool class attribute, indicates whether transform requires fit to be run first lazyframe_compatible: bool class attribute, indicates whether transformer works with lazyframes """ polars_compatible = True lazyframe_compatible = True jsonable = True FITS = False @beartype def __init__( self, impute_value: int | float, columns: str | list[str], **kwargs: bool | None, ) -> None: """Initialise class instance. Parameters ---------- impute_value : int or float Value to impute nulls with. columns : str or list of strs Columns to impute **kwargs Arbitrary keyword arguments passed onto BaseTransformer.init method. """ super().__init__(columns=columns, **kwargs) self.impute_values_ = {} self.impute_value = impute_value for c in self.columns: self.impute_values_[c] = self.impute_value self.is_fitted_ = True # Does not fit @beartype def transform(self, X: DataFrame) -> DataFrame: """Impute missing values with the supplied impute_value. Parameters ---------- X : DataFrame Data containing columns to impute. Returns ------- X : DataFrame Transformed input X with nulls imputed with the specified impute_value, for the specified columns. Raises ------ TypeError: if provided data is incompatible with provided impute value Examples -------- ```pycon >>> import polars as pl >>> test_df = pl.DataFrame({"a": [1, None, 2], "b": [3, None, 4]}) >>> imputer = _NumberImputer(columns=["a", "b"], impute_value=5) >>> imputer.transform(test_df) shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ │ --- ┆ --- │ │ i64 ┆ i64 │ ╞═════╪═════╡ │ 1 ┆ 3 │ │ 5 ┆ 5 │ │ 2 ┆ 4 │ └─────┴─────┘ ``` """ X = _convert_dataframe_to_narwhals(X) schema = X.collect_schema() bad_types = [ schema[col] for col in self.columns if schema[col] not in {*NumericTypes, nw.Unknown} ] if bad_types: msg = f""" ArbitraryImputer: transformer can only handle Float/Int/UInt/Unknown type columns but got columns with types {bad_types} """ raise TypeError( msg, ) X = BaseTransformer.transform(self, X, return_native_override=False) # next handle imputing transform_expressions = { col: self._generate_imputation_expressions( nw.col(col), col, ) for col in self.columns } X = X.with_columns(**transform_expressions) if transform_expressions else X return _return_narwhals_or_native_dataframe(X, self.return_native) class _StringImputer(BaseImputer): """Private subclass to handle arbitrary string imputation. Attributes ---------- impute_value : string Value to impute nulls with. built_from_json: bool indicates if transformer was reconstructed from json, which limits it's supported functionality to .transform polars_compatible : bool class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework return_native: bool, default = True Controls whether transformer returns narwhals or native pandas/polars type jsonable: bool class attribute, indicates if transformer supports to/from_json methods FITS: bool class attribute, indicates whether transform requires fit to be run first lazyframe_compatible: bool class attribute, indicates whether transformer works with lazyframes """ polars_compatible = True lazyframe_compatible = True jsonable = True FITS = False @beartype def __init__( self, impute_value: str, columns: str | list[str], **kwargs: bool | None, ) -> None: """Initialise class instance. Parameters ---------- impute_value : str Value to impute nulls with. columns : str or list of strs Columns to impute **kwargs Arbitrary keyword arguments passed onto BaseTransformer.init method. """ super().__init__(columns=columns, **kwargs) self.impute_values_ = {} self.impute_value = impute_value for c in self.columns: self.impute_values_[c] = self.impute_value self.is_fitted_ = True # Does not fit def cat_to_enum_expr(self, expr: nw.Expr, categories: list[str]) -> nw.Expr: """Update expression to include handling of category types. Parameters ---------- expr : nw.Expr initial expression categories: list[str] list of categories in field initially Returns ------- nw.Expr: updated expression, with category type handling """ return expr.cast(nw.Enum({*categories, self.impute_value})) @beartype def transform(self, X: DataFrame) -> DataFrame: """Impute missing values with the supplied impute_value. Parameters ---------- X : DataFrame Data containing columns to impute. Returns ------- X : DataFrame Transformed input X with nulls imputed with the specified impute_value, for the specified columns. Raises ------ TypeError: if provided data is incompatible with provided impute value Examples -------- ```pycon >>> import polars as pl >>> test_df = pl.DataFrame({"a": ["cat", None, "dog"]}) >>> imputer = _StringImputer(columns=["a"], impute_value="missing") >>> imputer.transform(test_df) shape: (3, 1) ┌─────────┐ │ a │ │ --- │ │ str │ ╞═════════╡ │ cat │ │ missing │ │ dog │ └─────────┘ ``` """ X = _convert_dataframe_to_narwhals(X) native_namespace = nw.get_native_namespace(X).__name__ X = BaseTransformer.transform(self, X, return_native_override=False) schema = X.collect_schema() bad_types = [ schema[col] for col in self.columns if schema[col] not in {nw.String, nw.Categorical, nw.Enum, nw.Unknown} ] if bad_types: msg = f""" ArbitraryImputer: transformer can only handle String/Categorical/Enum/Unknown type columns but got columns with types {bad_types} """ raise TypeError( msg, ) transform_expressions = {} for col in self.columns: # have to handle categorical vars for pandas upfront if native_namespace == "pandas": transform_expressions[col] = ( self.cat_to_enum_expr( nw.col(col), categories=X.get_column(col).cat.get_categories().to_list(), ) if ((schema[col] == nw.Categorical) or (schema[col] == nw.Enum)) else nw.col(col) ) else: transform_expressions[col] = nw.col(col) # next handle imputing transform_expressions[col] = self._generate_imputation_expressions( transform_expressions[col], col, ) X = X.with_columns(**transform_expressions) if transform_expressions else X return _return_narwhals_or_native_dataframe(X, self.return_native) class _BooleanImputer(BaseImputer): """Private subclass to handle arbitrary boolean imputation. Attributes ---------- impute_value : bool Value to impute nulls with. built_from_json: bool indicates if transformer was reconstructed from json, which limits it's supported functionality to .transform polars_compatible : bool class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework return_native: bool, default = True Controls whether transformer returns narwhals or native pandas/polars type jsonable: bool class attribute, indicates if transformer supports to/from_json methods FITS: bool class attribute, indicates whether transform requires fit to be run first lazyframe_compatible: bool class attribute, indicates whether transformer works with lazyframes """ polars_compatible = True lazyframe_compatible = True jsonable = True FITS = False @beartype def __init__( self, impute_value: bool, columns: str | list[str], **kwargs: bool | None, ) -> None: """Initialise class instance. Parameters ---------- impute_value : bool Value to impute nulls with. columns : str or list of strs Columns to impute **kwargs Arbitrary keyword arguments passed onto BaseTransformer.init method. """ super().__init__(columns=columns, **kwargs) self.impute_values_ = {} self.impute_value = impute_value for c in self.columns: self.impute_values_[c] = self.impute_value self.is_fitted_ = True # Does not fit @beartype def transform(self, X: DataFrame) -> DataFrame: """Impute missing values with the supplied impute_value. Parameters ---------- X : DataFrame Data containing columns to impute. Returns ------- X : DataFrame Transformed input X with nulls imputed with the specified impute_value, for the specified columns. Raises ------ TypeError: if provided data is incompatible with provided impute value Examples -------- ```pycon >>> import polars as pl >>> test_df = pl.DataFrame({"a": [True, None, False]}) >>> imputer = _BooleanImputer(columns=["a"], impute_value=True) >>> imputer.transform(test_df) shape: (3, 1) ┌───────┐ │ a │ │ --- │ │ bool │ ╞═══════╡ │ true │ │ true │ │ false │ └───────┘ ``` """ X = _convert_dataframe_to_narwhals(X) schema = X.collect_schema() allowed_types = [nw.Boolean, nw.Unknown] allowed_types_str = "Boolean/Unknown" if nw.get_native_namespace(X).__name__ == "pandas": allowed_types += [nw.Object] allowed_types_str += "/Object" bad_types = [ schema[col] for col in self.columns if schema[col] not in allowed_types ] if bad_types: msg = f""" ArbitraryImputer: transformer can only handle {allowed_types_str} type columns but got columns with types {bad_types} """ raise TypeError( msg, ) X = BaseTransformer.transform(self, X, return_native_override=False) # next handle imputing transform_expressions = { col: self._generate_imputation_expressions( nw.col(col), col, ).cast(nw.Boolean) for col in self.columns } X = X.with_columns(**transform_expressions) if transform_expressions else X return _return_narwhals_or_native_dataframe(X, self.return_native)
[docs] @register class ArbitraryImputer(BaseImputer): """Transformer to impute null values with an arbitrary pre-defined value. Attributes ---------- impute_value : int or float or str or bool Value to impute nulls with. built_from_json: bool indicates if transformer was reconstructed from json, which limits it's supported functionality to .transform polars_compatible : bool class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework return_native: bool, default = True Controls whether transformer returns narwhals or native pandas/polars type jsonable: bool class attribute, indicates if transformer supports to/from_json methods FITS: bool class attribute, indicates whether transform requires fit to be run first lazyframe_compatible: bool class attribute, indicates whether transformer works with lazyframes Examples -------- ```pycon >>> arbitrary_imputer = ArbitraryImputer(columns=["a", "b"], impute_value=5) >>> arbitrary_imputer ArbitraryImputer(columns=['a', 'b'], impute_value=5) >>> # transformer can also be dumped to json and reinitialised >>> json_dump = arbitrary_imputer.to_json() >>> json_dump {'tubular_version': ..., 'classname': 'ArbitraryImputer', 'init': {'columns': ['a', 'b'], 'copy': False, 'verbose': False, 'return_native': True, 'impute_value': 5}, 'fit': {'is_fitted_': True, 'impute_values_': {'a': 5, 'b': 5}}} >>> ArbitraryImputer.from_json(json_dump) ArbitraryImputer(columns=['a', 'b'], impute_value=5) ``` """ polars_compatible = True lazyframe_compatible = True jsonable = True FITS = False @beartype def __init__( self, impute_value: int | float | str | bool, columns: str | list[str], **kwargs: bool | None, ) -> None: """Initialise class instance. Parameters ---------- impute_value : int or float or str or bool Value to impute nulls with. columns : str or list of strs Columns to impute **kwargs Arbitrary keyword arguments passed onto BaseTransformer.init method. """ super().__init__(columns=columns, **kwargs) self.impute_values_ = {} self.impute_value = impute_value for c in self.columns: self.impute_values_[c] = self.impute_value self.is_fitted_ = True # Does not fit
[docs] @beartype def transform(self, X: DataFrame) -> DataFrame: """Impute missing values with the supplied impute_value. Parameters ---------- X : DataFrame Data containing columns to impute. Returns ------- X : DataFrame Transformed input X with nulls imputed with the specified impute_value, for the specified columns. Example: -------- ```pycon >>> import polars as pl >>> test_df = pl.DataFrame({"a": [1, None, 2], "b": [3, None, 4]}) >>> imputer = ArbitraryImputer(columns=["a", "b"], impute_value=5) >>> imputer.transform(test_df) shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ │ --- ┆ --- │ │ i64 ┆ i64 │ ╞═════╪═════╡ │ 1 ┆ 3 │ │ 5 ┆ 5 │ │ 2 ┆ 4 │ └─────┴─────┘ ``` """ self.check_is_fitted("is_fitted_") X = _convert_dataframe_to_narwhals(X) if isinstance(self.impute_value, (int, float)) and not isinstance( self.impute_value, bool ): imp = _NumberImputer( columns=self.columns, impute_value=self.impute_value, return_native=self.return_native, ) elif isinstance(self.impute_value, str): imp = _StringImputer( columns=self.columns, impute_value=self.impute_value, return_native=self.return_native, ) else: imp = _BooleanImputer( columns=self.columns, impute_value=self.impute_value, return_native=self.return_native, ) return imp.transform(X)
[docs] @register class MedianImputer(BaseImputer, WeightColumnMixin): """Transformer to impute missing values with the median of the supplied columns. Attributes ---------- impute_values_ : dict Created during fit method. Dictionary of float / int (median) values of columns in the columns attribute. Keys of impute_values_ give the column names. built_from_json: bool indicates if transformer was reconstructed from json, which limits it's supported functionality to .transform polars_compatible : bool class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework return_native: bool, default = True Controls whether transformer returns narwhals or native pandas/polars type jsonable: bool class attribute, indicates if transformer supports to/from_json methods FITS: bool class attribute, indicates whether transform requires fit to be run first lazyframe_compatible: bool class attribute, indicates whether transformer works with lazyframes Examples -------- ```pycon >>> median_imputer = MedianImputer( ... columns=["a", "b"], ... ) >>> median_imputer MedianImputer(columns=['a', 'b']) >>> # once fit, transformer can also be dumped to json and reinitialised >>> test_df = pl.DataFrame({"a": [0, None], "b": [None, 1]}) >>> _ = median_imputer.fit(test_df) >>> json_dump = median_imputer.to_json() >>> json_dump {'tubular_version': ..., 'classname': 'MedianImputer', 'init': {'columns': ['a', 'b'], 'copy': False, 'verbose': False, 'return_native': True, 'weights_column': None}, 'fit': {'is_fitted_': True, 'impute_values_': {'a': 0.0, 'b': 1.0}}} >>> MedianImputer.from_json(json_dump) MedianImputer(columns=['a', 'b']) ``` """ polars_compatible = True lazyframe_compatible = True jsonable = True FITS = True @beartype def __init__( self, columns: str | list[str], weights_column: str | None = None, **kwargs: bool, ) -> None: """Initialise class instance. Parameters ---------- columns : None or str or list, default = None Columns to impute, if the default of None is supplied all columns in X are used when the transform method is called. weights_column: None or str, default=None Column containing weights **kwargs Arbitrary keyword arguments passed onto BaseTransformer.init method. """ super().__init__(columns=columns, **kwargs) self.weights_column = weights_column
[docs] @block_from_json @beartype def fit(self, X: DataFrame, y: Series | LazyFrame | None = None) -> MedianImputer: """Calculate median values to impute with from X. Parameters ---------- X : DataFrame Data to "learn" the median values from. y : Series or LazyFrame or None, default = None Not required. Returns ------- MedianImputer: fitted class instance. Examples -------- ```pycon >>> import polars as pl >>> test_df = pl.DataFrame({"a": [1, None, 2], "b": [3, None, 4]}) >>> imputer = MedianImputer(columns=["a", "b"]) >>> imputer = imputer.fit(test_df) >>> imputer.transform(test_df) shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ │ --- ┆ --- │ │ f64 ┆ f64 │ ╞═════╪═════╡ │ 1.0 ┆ 3.0 │ │ 1.5 ┆ 3.5 │ │ 2.0 ┆ 4.0 │ └─────┴─────┘ ``` """ X = _convert_dataframe_to_narwhals(X) y = _convert_series_to_narwhals(y) super().fit(X, y) self.impute_values_ = {} # as median depends on data ordering, it is less amenable to writing in # pure expression form, so implementation here is still # slightly pandas-like # also, the weighted median approach is genuinely different to the unweighted # approach, so have left as two separate logic flows if self.weights_column is not None: WeightColumnMixin.check_weights_column(self, X, self.weights_column) valid_weights_filter_expr = WeightColumnMixin.get_valid_weights_filter_expr( self.weights_column, self.verbose ) X_temp = X.filter(valid_weights_filter_expr) for c in self.columns: col_not_null_expr = ~nw.col(c).is_null() X_c = X_temp.filter(col_not_null_expr) median_expr = _get_median_calculation_expression( values_column=c, weights_column=self.weights_column, ) # impute value is weighted median self.impute_values_[c] = _collect_frame(X_c.select(median_expr)).item( 0, 0 ) else: median_exprs = { c: _get_median_calculation_expression( values_column=c, weights_column=None ) for c in self.columns } results_dict = ( _collect_frame(X) .select( **median_exprs, ) .to_dict(as_series=False) ) self.impute_values_.update( {col: value[0] for col, value in results_dict.items()}, ) self._check_for_failed_fit() self.is_fitted_ = True return self
[docs] @register class MeanImputer(WeightColumnMixin, BaseImputer): """Transformer to impute missing values with the mean of the supplied columns. Attributes ---------- impute_values_ : dict Created during fit method. Dictionary of float / int (mean) values of columns in the columns attribute. Keys of impute_values_ give the column names. built_from_json: bool indicates if transformer was reconstructed from json, which limits it's supported functionality to .transform polars_compatible : bool class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework return_native: bool, default = True Controls whether transformer returns narwhals or native pandas/polars type jsonable: bool class attribute, indicates if transformer supports to/from_json methods FITS: bool class attribute, indicates whether transform requires fit to be run first lazyframe_compatible: bool class attribute, indicates whether transformer works with lazyframes Examples -------- ```pycon >>> mean_imputer = MeanImputer( ... columns=["a", "b"], ... ) >>> mean_imputer MeanImputer(columns=['a', 'b']) >>> # once fit, transformer can also be dumped to json and reinitialised >>> test_df = pl.DataFrame({"a": [0, None], "b": [None, 1]}) >>> _ = mean_imputer.fit(test_df) >>> json_dump = mean_imputer.to_json() >>> json_dump {'tubular_version': ..., 'classname': 'MeanImputer', 'init': {'columns': ['a', 'b'], 'copy': False, 'verbose': False, 'return_native': True, 'weights_column': None}, 'fit': {'is_fitted_': True, 'impute_values_': {'a': 0.0, 'b': 1.0}}} >>> MeanImputer.from_json(json_dump) MeanImputer(columns=['a', 'b']) ``` """ polars_compatible = True lazyframe_compatible = True jsonable = True FITS = True @beartype def __init__( self, columns: str | list[str], weights_column: str | None = None, **kwargs: bool, ) -> None: """Initialise class instance. Parameters ---------- columns : None or str or list, default = None Columns to impute, if the default of None is supplied all columns in X are used when the transform method is called. weights_column : None or str, default = None Column containing weights. **kwargs Arbitrary keyword arguments passed onto BaseTransformer.init method. """ super().__init__(columns=columns, **kwargs) self.weights_column = weights_column
[docs] @block_from_json @beartype def fit(self, X: DataFrame, y: Series | LazyFrame | None = None) -> MeanImputer: """Calculate mean values to impute with from X. Parameters ---------- X : DataFrame Data to "learn" the mean values from. y : Series or LazyFrame or None, default = None Not required. Returns ------- MeanImputer: fitted class instance. Examples -------- ```pycon >>> import polars as pl >>> test_df = pl.DataFrame({"a": [1, None, 2], "b": [3, None, 4]}) >>> imputer = MeanImputer(columns=["a", "b"]) >>> imputer = imputer.fit(test_df) >>> imputer.transform(test_df) shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ │ --- ┆ --- │ │ f64 ┆ f64 │ ╞═════╪═════╡ │ 1.0 ┆ 3.0 │ │ 1.5 ┆ 3.5 │ │ 2.0 ┆ 4.0 │ └─────┴─────┘ ``` """ X = _convert_dataframe_to_narwhals(X) y = _convert_series_to_narwhals(y) super().fit(X, y) self.impute_values_ = {} if not self.columns: return self weights_column = self.weights_column if self.weights_column is None: X, weights_column = WeightColumnMixin._create_unit_weights_column( X, return_native=False, verbose=self.verbose, ) WeightColumnMixin.check_weights_column(self, X, weights_column) valid_weights_filter_expr = WeightColumnMixin.get_valid_weights_filter_expr( weights_column, self.verbose ) X = X.filter(valid_weights_filter_expr) weighted_mean_exprs = _get_mean_calculation_expressions( self.columns, weights_column, ) results_dict = _collect_frame(X.select(**weighted_mean_exprs)).to_dict( as_series=False ) # results looks like {key: [value]} so extract value from list self.impute_values_.update( {col: value[0] for col, value in results_dict.items()}, ) self._check_for_failed_fit() self.is_fitted_ = True return self
[docs] @register class ModeImputer(BaseImputer, WeightColumnMixin): """Transformer to impute missing values with the mode of the supplied columns. If mode is NaN, a warning will be raised. Attributes ---------- impute_values_ : dict Created during fit method. Dictionary of float / int (mode) values of columns in the columns attribute. Keys of impute_values_ give the column names. built_from_json: bool indicates if transformer was reconstructed from json, which limits it's supported functionality to .transform polars_compatible : bool class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework return_native: bool, default = True Controls whether transformer returns narwhals or native pandas/polars type jsonable: bool class attribute, indicates if transformer supports to/from_json methods FITS: bool class attribute, indicates whether transform requires fit to be run first lazyframe_compatible: bool class attribute, indicates whether transformer works with lazyframes Examples -------- ```pycon >>> mode_imputer = ModeImputer( ... columns=["a", "b"], ... ) >>> mode_imputer ModeImputer(columns=['a', 'b']) >>> # once fit, transformer can also be dumped to json and reinitialised >>> test_df = pl.DataFrame({"a": [0, None], "b": [None, 1]}) >>> _ = mode_imputer.fit(test_df) >>> json_dump = mode_imputer.to_json() >>> json_dump {'tubular_version': ..., 'classname': 'ModeImputer', 'init': {'columns': ['a', 'b'], 'copy': False, 'verbose': False, 'return_native': True, 'weights_column': None}, 'fit': {'is_fitted_': True, 'impute_values_': {'a': 0, 'b': 1}}} >>> ModeImputer.from_json(json_dump) ModeImputer(columns=['a', 'b']) ``` """ polars_compatible = True lazyframe_compatible = True jsonable = True FITS = True @beartype def __init__( self, columns: str | list[str], weights_column: str | None = None, **kwargs: bool, ) -> None: """Initialise class instance. Parameters ---------- columns : None or str or list, default = None Columns to impute, if the default of None is supplied all columns in X are used when the transform method is called. weights_column : str Name of weights columns to use if mode should be in terms of sum of weights not count of rows. **kwargs Arbitrary keyword arguments passed onto BaseTransformer.init method. """ super().__init__(columns=columns, **kwargs) self.weights_column = weights_column
[docs] @block_from_json @beartype def fit(self, X: DataFrame, y: Series | LazyFrame | None = None) -> ModeImputer: """Calculate mode values to impute with from X. In the event of a tie, the highest modal value will be returned. Parameters ---------- X : DataFrame Data to "learn" the mode values from. y : Series or LazyFrame or None, default = None Not required. Returns ------- ModeImputer: fitted class instance Examples -------- ```pycon >>> import polars as pl >>> test_df = pl.DataFrame({"a": [1, None, 2], "b": [3, None, 4]}) >>> imputer = ModeImputer(columns=["a", "b"]) >>> imputer = imputer.fit(test_df) >>> imputer.transform(test_df) shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ │ --- ┆ --- │ │ i64 ┆ i64 │ ╞═════╪═════╡ │ 1 ┆ 3 │ │ 2 ┆ 4 │ │ 2 ┆ 4 │ └─────┴─────┘ ``` """ X = _convert_dataframe_to_narwhals(X) y = _convert_series_to_narwhals(y) super().fit(X, y) self.impute_values_ = {} weights_column = self.weights_column if self.weights_column is None: X, weights_column = WeightColumnMixin._create_unit_weights_column( X, return_native=False, verbose=self.verbose, ) WeightColumnMixin.check_weights_column(self, X, weights_column) valid_weights_filter_expr = WeightColumnMixin.get_valid_weights_filter_expr( weights_column, self.verbose ) X = X.filter(valid_weights_filter_expr) self.impute_values_ = {} for c in self.columns: group = ( X.filter(~nw.col(c).is_null()) .group_by(c) .agg(nw.col(weights_column).sum().alias(f"{c}_total_weight")) .filter( nw.col(f"{c}_total_weight") == nw.col(f"{c}_total_weight").max() ) ) results_dict = _collect_frame(group).to_dict(as_series=True) mode_values = results_dict[c].sort(descending=True).to_list() n_mode_vals = len(mode_values) if n_mode_vals == 0: mode_value = None elif n_mode_vals == 1: mode_value = mode_values[0] elif n_mode_vals > 1: warnings.warn( f"ModeImputer: The Mode of column {c} is tied, will sort in descending order and return first candidate", stacklevel=2, ) mode_value = mode_values[0] self.impute_values_[c] = mode_value self._check_for_failed_fit() self.is_fitted_ = True return self
[docs] @register class NullIndicator(BaseTransformer): """Class to create a binary indicator column for null values. Attributes ---------- built_from_json: bool indicates if transformer was reconstructed from json, which limits it's supported functionality to .transform polars_compatible : bool class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework return_native: bool, default = True Controls whether transformer returns narwhals or native pandas/polars type jsonable: bool class attribute, indicates if transformer supports to/from_json methods FITS: bool class attribute, indicates whether transform requires fit to be run first lazyframe_compatible: bool class attribute, indicates whether transformer works with lazyframes Examples -------- ```pycon >>> null_indicator = NullIndicator( ... columns=["a", "b"], ... ) >>> null_indicator NullIndicator(columns=['a', 'b']) >>> # transformer can also be dumped to json and reinitialised >>> json_dump = null_indicator.to_json() >>> json_dump {'tubular_version': ..., 'classname': 'NullIndicator', 'init': {'columns': ['a', 'b'], 'copy': False, 'verbose': False, 'return_native': True}, 'fit': {'is_fitted_': True}} >>> NullIndicator.from_json(json_dump) NullIndicator(columns=['a', 'b']) ``` """ polars_compatible = True lazyframe_compatible = True FITS = False jsonable = True @beartype def __init__( self, columns: ListOfStrs | str, **kwargs: bool | None, ) -> None: """Initialise class instance. Parameters ---------- columns : None or str or list, default = None Columns to produce indicator columns for, if the default of None is supplied all columns in X are used when the transform method is called. kwargs: bool arguments for base class, e.g. verbose. """ super().__init__(columns=columns, **kwargs) self.is_fitted_ = True # does not fit
[docs] @beartype def transform(self, X: DataFrame) -> DataFrame: """Create new columns indicating the position of null values for each variable in self.columns. Parameters ---------- X : DataFrame Data to add indicators to. Returns ------- DataFrame: dataframe with null indicator columns added Examples -------- --------, ```pycon >>> import polars as pl >>> test_df = pl.DataFrame({"a": [1, None, 2], "b": [3, None, 4]}) >>> imputer = NullIndicator(columns=["a", "b"]) >>> imputer.transform(test_df) shape: (3, 4) ┌──────┬──────┬─────────┬─────────┐ │ a ┆ b ┆ a_nulls ┆ b_nulls │ │ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ i64 ┆ bool ┆ bool │ ╞══════╪══════╪═════════╪═════════╡ │ 1 ┆ 3 ┆ false ┆ false │ │ null ┆ null ┆ true ┆ true │ │ 2 ┆ 4 ┆ false ┆ false │ └──────┴──────┴─────────┴─────────┘ ``` """ X = super().transform(X, return_native_override=False) X = _convert_dataframe_to_narwhals(X) X = X.with_columns( (nw.col(c).is_null()).alias(f"{c}_nulls") for c in self.columns ) return X if not self.return_native else X.to_native()
# DEPRECATED TRANSFORMERS
[docs] @deprecated( """This transformer has not been selected for conversion to polars/narwhals, and so has been deprecated. If it is useful to you, please raise an issue for it to be modernised """, ) class NearestMeanResponseImputer(BaseImputer): """Impute nulls with the value where the average target is most similar to that for the nulls. Attributes ---------- built_from_json: bool indicates if transformer was reconstructed from json, which limits it's supported functionality to .transform polars_compatible : bool class attribute, indicates whether transformer has been converted to polars/pandas agnostic narwhals framework return_native: bool, default = True Controls whether transformer returns narwhals or native pandas/polars type jsonable: bool class attribute, indicates if transformer supports to/from_json methods FITS: bool class attribute, indicates whether transform requires fit to be run first lazyframe_compatible: bool class attribute, indicates whether transformer works with lazyframes deprecated: bool indicates if class has been deprecated """ polars_compatible = True lazyframe_compatible = False jsonable = False FITS = True deprecated = True def __init__( self, columns: str | list[str] | None = None, **kwargs: dict[str, bool], ) -> None: """Initialise class instance. Parameters ---------- columns : None or str or list, default = None Columns to impute, if the default of None is supplied all columns in X are used when the transform method is called. If the column does not contain nulls at fit, a warning will be issues and this transformer will have no effect on that column. kwargs: bool arguments for base class, e.g. verbose """ super().__init__(columns=columns, **kwargs)
[docs] @beartype def fit(self, X: DataFrame, y: Series) -> NearestMeanResponseImputer: """Calculate mean values to impute with. Parameters ---------- X : FrameT Data to fit the transformer on. y : nw.Series Response column used to determine the value to impute with. The average response for each level of every column is calculated. The level which has the closest average response to the average response of the unknown levels is selected as the imputation value. Returns ------- NearestMeanResponseImputer: fitted class instance Raises ------ ValueError: provided y contains nulls """ X = _convert_dataframe_to_narwhals(X) y = _convert_series_to_narwhals(y) super().fit(X, y) if (n_nulls := y.is_null().sum()) > 0: msg = f"{self.classname()}: y has {n_nulls} null values" raise ValueError(msg) self.impute_values_ = {} X_y = nw.from_native(self._combine_X_y(X, y)) response_column = "_temporary_response" for c in self.columns: c_nulls = X.select(nw.col(c).is_null())[c] if c_nulls.sum() == 0: msg = f"{self.classname()}: Column {c} has no missing values, this transformer will have no effect for this column." warnings.warn(msg, stacklevel=2) self.impute_values_[c] = None else: mean_response_by_levels = ( X_y.filter(~c_nulls).group_by(c).agg(nw.col(response_column).mean()) ) mean_response_nulls = X_y.filter(c_nulls)[response_column].mean() mean_response_by_levels = mean_response_by_levels.with_columns( (nw.col(response_column) - mean_response_nulls) .abs() .alias("abs_diff_response"), ) # take first value having the minimum difference in terms of average response self.impute_values_[c] = mean_response_by_levels.filter( mean_response_by_levels["abs_diff_response"] == mean_response_by_levels["abs_diff_response"].min(), )[c].item(index=0) return self