diff --git a/feature_engine/_docstrings/init_parameters/all_trasnformers.py b/feature_engine/_docstrings/init_parameters/all_trasnformers.py index 5c699d3de..510180463 100644 --- a/feature_engine/_docstrings/init_parameters/all_trasnformers.py +++ b/feature_engine/_docstrings/init_parameters/all_trasnformers.py @@ -22,3 +22,10 @@ contain missing values. If `'ignore'`, missing data will be ignored when learning parameters or performing the transformation. """.rstrip() + +_group_by_docstring = """group_by: str, int, or list of str or int, default=None + Name(s) of the column(s) that identify the groups in the dataframe, for + example one identifier per time series. If not None, the new features are + computed separately within each group, so values from one group are never + used to create the features of another group. + """.rstrip() diff --git a/feature_engine/selection/drop_psi_features.py b/feature_engine/selection/drop_psi_features.py index e425f674e..3e87adbdb 100644 --- a/feature_engine/selection/drop_psi_features.py +++ b/feature_engine/selection/drop_psi_features.py @@ -1,5 +1,5 @@ import datetime -from typing import List, Union +from typing import Dict, List, Union import numpy as np import pandas as pd @@ -475,7 +475,7 @@ def fit(self, X: pd.DataFrame, y: pd.Series = None): threshold_cat = self.threshold # Compute the PSI by looping over the features - self.psi_values_ = {} + self.psi_values_: Dict = {} self.features_to_drop_ = [] # Compute PSI for numerical features diff --git a/feature_engine/timeseries/forecasting/base_forecast_transformers.py b/feature_engine/timeseries/forecasting/base_forecast_transformers.py index d6e9fa30f..aee003833 100644 --- a/feature_engine/timeseries/forecasting/base_forecast_transformers.py +++ b/feature_engine/timeseries/forecasting/base_forecast_transformers.py @@ -14,6 +14,7 @@ ) from feature_engine._docstrings.init_parameters.all_trasnformers import ( _drop_original_docstring, + _group_by_docstring, _missing_values_docstring, ) from feature_engine._docstrings.methods 
import _fit_not_learn_docstring @@ -37,6 +38,7 @@ feature_names_in_=_feature_names_in_docstring, fit=_fit_not_learn_docstring, n_features_in_=_n_features_in_docstring, + group_by=_group_by_docstring, ) class BaseForecastTransformer(BaseEstimator, TransformerMixin, GetFeatureNamesOutMixin): """ @@ -51,6 +53,8 @@ class BaseForecastTransformer(BaseEstimator, TransformerMixin, GetFeatureNamesOu {drop_original} + {group_by} + Attributes ---------- {feature_names_in_} @@ -64,6 +68,7 @@ def __init__( variables: Union[None, int, str, List[Union[str, int]]] = None, missing_values: str = "raise", drop_original: bool = False, + group_by: Union[None, int, str, List[Union[str, int]]] = None, ) -> None: if missing_values not in ["raise", "ignore"]: @@ -81,6 +86,7 @@ def __init__( self.variables = _check_variables_input_value(variables) self.missing_values = missing_values self.drop_original = drop_original + self.group_by = group_by def _check_index(self, X: pd.DataFrame): """ diff --git a/feature_engine/timeseries/forecasting/expanding_window_features.py b/feature_engine/timeseries/forecasting/expanding_window_features.py index 6a2e5037c..3061f10fc 100644 --- a/feature_engine/timeseries/forecasting/expanding_window_features.py +++ b/feature_engine/timeseries/forecasting/expanding_window_features.py @@ -3,7 +3,7 @@ from __future__ import annotations -from typing import List +from typing import List, Union import pandas as pd @@ -13,6 +13,7 @@ ) from feature_engine._docstrings.init_parameters.all_trasnformers import ( _drop_original_docstring, + _group_by_docstring, _missing_values_docstring, _variables_numerical_docstring, ) @@ -34,6 +35,7 @@ n_features_in_=_n_features_in_docstring, fit=_fit_not_learn_docstring, fit_transform=_fit_transform_docstring, + group_by=_group_by_docstring, ) class ExpandingWindowFeatures(BaseForecastTransformer): """ @@ -93,6 +95,8 @@ class ExpandingWindowFeatures(BaseForecastTransformer): {drop_original} + {group_by} + Attributes ---------- 
variables_: @@ -151,6 +155,7 @@ def __init__( sort_index: bool = True, missing_values: str = "raise", drop_original: bool = False, + group_by: Union[None, int, str, List[Union[str, int]]] = None, ) -> None: if not isinstance(functions, (str, list)) or not all( @@ -168,7 +173,7 @@ def __init__( f"periods must be a non-negative integer. Got {periods} instead." ) - super().__init__(variables, missing_values, drop_original) + super().__init__(variables, missing_values, drop_original, group_by) self.min_periods = min_periods self.functions = functions @@ -193,12 +198,21 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: # Common dataframe checks and setting up. X = self._check_transform_input_and_state(X) - tmp = ( - X[self.variables_] - .expanding(min_periods=self.min_periods) - .agg(self.functions) - .shift(periods=self.periods, freq=self.freq) - ) + if self.group_by is not None: + original_index = X.index + tmp = X.groupby(self.group_by, group_keys=False).apply( + self._agg_expanding_window_features, + include_groups=False, + ) + # Rows may come back group by group: align on labels, not position. + tmp = tmp.reindex(original_index) + else: + tmp = ( + X[self.variables_] + .expanding(min_periods=self.min_periods) + .agg(self.functions) + .shift(periods=self.periods, freq=self.freq) + ) tmp.columns = self._get_new_features_name() @@ -224,3 +238,26 @@ def _get_new_features_name(self) -> List: ] return feature_names + + def _agg_expanding_window_features( + self, + grouped_df: pd.DataFrame, + ) -> Union[pd.Series, pd.DataFrame]: + """Compute the expanding window features of a single group. + + Parameters + ---------- + grouped_df : pd.DataFrame + Rows of one group, as handed over by `groupby().apply()`. + + Returns + ------- + Union[pd.Series, pd.DataFrame] + Aggregated expanding window values, shifted by `periods` / `freq`. + """ + return ( + grouped_df[self.variables_] + .expanding(min_periods=self.min_periods) + .agg(self.functions) + .shift(periods=self.periods, freq=self.freq) + ) diff --git 
a/feature_engine/timeseries/forecasting/lag_features.py b/feature_engine/timeseries/forecasting/lag_features.py index 19822ea5f..65c2c3d38 100644 --- a/feature_engine/timeseries/forecasting/lag_features.py +++ b/feature_engine/timeseries/forecasting/lag_features.py @@ -11,6 +11,7 @@ ) from feature_engine._docstrings.init_parameters.all_trasnformers import ( _drop_original_docstring, + _group_by_docstring, _missing_values_docstring, _variables_numerical_docstring, ) @@ -32,6 +33,7 @@ n_features_in_=_n_features_in_docstring, fit=_fit_not_learn_docstring, fit_transform=_fit_transform_docstring, + group_by=_group_by_docstring, ) class LagFeatures(BaseForecastTransformer): """ @@ -74,6 +76,8 @@ class LagFeatures(BaseForecastTransformer): {drop_original} + {group_by} + Attributes ---------- variables_: @@ -127,6 +131,7 @@ def __init__( sort_index: bool = True, missing_values: str = "raise", drop_original: bool = False, + group_by: Union[None, int, str, List[Union[str, int]]] = None, ) -> None: if not ( @@ -151,7 +156,7 @@ def __init__( "sort_index takes values True and False." f"Got {sort_index} instead." 
) - super().__init__(variables, missing_values, drop_original) + super().__init__(variables, missing_values, drop_original, group_by) self.periods = periods self.freq = freq @@ -180,35 +185,57 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: if isinstance(self.freq, list): df_ls = [] for fr in self.freq: - tmp = X[self.variables_].shift( - freq=fr, - axis=0, - ) + if self.group_by is not None: + tmp = self._agg_freq_lags( + grouped_df=X.groupby(self.group_by), + freq=fr, + ) + else: + tmp = X[self.variables_].shift( + freq=fr, + axis=0, + ) df_ls.append(tmp) tmp = pd.concat(df_ls, axis=1) else: - tmp = X[self.variables_].shift( - freq=self.freq, - axis=0, - ) + if self.group_by is not None: + tmp = self._agg_freq_lags( + grouped_df=X.groupby(self.group_by), + freq=self.freq, + ) + else: + tmp = X[self.variables_].shift( + freq=self.freq, + axis=0, + ) else: if isinstance(self.periods, list): df_ls = [] for pr in self.periods: - tmp = X[self.variables_].shift( - periods=pr, - axis=0, - ) + if self.group_by is not None: + tmp = X.groupby(self.group_by)[self.variables_].shift( + periods=pr, + ) + else: + tmp = X[self.variables_].shift( + periods=pr, + axis=0, + ) df_ls.append(tmp) tmp = pd.concat(df_ls, axis=1) else: - tmp = X[self.variables_].shift( - periods=self.periods, - axis=0, - ) + if self.group_by is not None: + tmp = X.groupby(self.group_by)[self.variables_].shift( + periods=self.periods, + ) + else: + tmp = X[self.variables_].shift( + periods=self.periods, + axis=0, + ) tmp.columns = self._get_new_features_name() @@ -243,3 +270,30 @@ def _get_new_features_name(self) -> List: ] return feature_names + + def _agg_freq_lags( + self, + grouped_df: pd.core.groupby.generic.DataFrameGroupBy, + freq: Union[str, List[str]], + ) -> Union[pd.Series, pd.DataFrame]: + """Lag the variables of every group by `freq`, one group at a time. + + Parameters + ---------- + grouped_df : pd.core.groupby.generic.DataFrameGroupBy + Input dataframe grouped by the `group_by` column(s). + freq : Union[str, List[str]] + Offset to use from the tseries module or time rule. 
+ + Returns + ------- + Union[pd.Series, pd.DataFrame] + lag feature or dataframe of lag features + """ + tmp_data = [] + for _, group in grouped_df: + original_idx = group.index + tmp = group[self.variables_].shift(freq=freq).reindex(original_idx) + tmp_data.append(tmp) + tmp = pd.concat(tmp_data).sort_index() + return tmp diff --git a/feature_engine/timeseries/forecasting/window_features.py b/feature_engine/timeseries/forecasting/window_features.py index 3cb89ccfa..0e7d316cb 100644 --- a/feature_engine/timeseries/forecasting/window_features.py +++ b/feature_engine/timeseries/forecasting/window_features.py @@ -8,6 +8,7 @@ ) from feature_engine._docstrings.init_parameters.all_trasnformers import ( _drop_original_docstring, + _group_by_docstring, _missing_values_docstring, _variables_numerical_docstring, ) @@ -29,6 +30,7 @@ n_features_in_=_n_features_in_docstring, fit=_fit_not_learn_docstring, fit_transform=_fit_transform_docstring, + group_by=_group_by_docstring, ) class WindowFeatures(BaseForecastTransformer): """ @@ -98,6 +100,8 @@ class WindowFeatures(BaseForecastTransformer): {drop_original} + {group_by} + Attributes ---------- variables_: @@ -156,6 +160,7 @@ def __init__( sort_index: bool = True, missing_values: str = "raise", drop_original: bool = False, + group_by: Union[None, int, str, List[Union[str, int]]] = None, ) -> None: if isinstance(window, list) and len(window) != len(set(window)): @@ -176,7 +181,7 @@ def __init__( f"periods must be a positive integer. Got {periods} instead." 
) - super().__init__(variables, missing_values, drop_original) + super().__init__(variables, missing_values, drop_original, group_by) self.window = window self.min_periods = min_periods @@ -205,22 +210,42 @@ def transform(self, X: pd.DataFrame) -> pd.DataFrame: if isinstance(self.window, list): df_ls = [] for win in self.window: + if self.group_by is not None: + original_index = X.index + tmp = X.groupby(self.group_by, group_keys=False).apply( + self._agg_window_features, + win=win, + include_groups=False, + ) + # Rows may come back group by group: align on labels. + tmp = tmp.reindex(original_index) + else: + tmp = ( + X[self.variables_] + .rolling(window=win) + .agg(self.functions) + .shift(periods=self.periods, freq=self.freq) + ) + df_ls.append(tmp) + tmp = pd.concat(df_ls, axis=1) + + else: + if self.group_by is not None: + original_index = X.index + tmp = X.groupby(self.group_by, group_keys=False).apply( + self._agg_window_features, + win=self.window, + include_groups=False, + ) + # Rows may come back group by group: align on labels, not position. + tmp = tmp.reindex(original_index) + else: tmp = ( X[self.variables_] - .rolling(window=win) + .rolling(window=self.window) .agg(self.functions) .shift(periods=self.periods, freq=self.freq) ) - df_ls.append(tmp) - tmp = pd.concat(df_ls, axis=1) - - else: - tmp = ( - X[self.variables_] - .rolling(window=self.window) - .agg(self.functions) - .shift(periods=self.periods, freq=self.freq) - ) tmp.columns = self._get_new_features_name() @@ -254,3 +279,30 @@ def _get_new_features_name(self) -> List: ] return feature_names + + def _agg_window_features( + self, + grouped_df: pd.DataFrame, + win: Union[str, int, Callable, List[int], List[str]], + ) -> Union[pd.Series, pd.DataFrame]: + """Compute the rolling window features of a single group. + + Parameters + ---------- + grouped_df : pd.DataFrame + Rows of one group, as handed over by `groupby().apply()`. + + win : Union[str, int, Callable, List[int], List[str]] + Size of the moving window + + Returns + ------- + Union[pd.Series, pd.DataFrame] + 
returned window features + """ + return ( + grouped_df[self.variables_] + .rolling(window=win) + .agg(self.functions) + .shift(periods=self.periods, freq=self.freq) + ) diff --git a/tests/test_time_series/test_forecasting/test_expanding_window_features.py b/tests/test_time_series/test_forecasting/test_expanding_window_features.py index cb33ea8e1..01e9806ba 100644 --- a/tests/test_time_series/test_forecasting/test_expanding_window_features.py +++ b/tests/test_time_series/test_forecasting/test_expanding_window_features.py @@ -1,4 +1,5 @@ import numpy as np +import pandas as pd import pytest from pandas.testing import assert_frame_equal @@ -428,3 +429,137 @@ def test_expanding_window_raises_when_periods_negative(): ValueError, match="periods must be a non-negative integer. Got -1 instead." ): ExpandingWindowFeatures(periods=-1) + + +def test_correct_groupby_expanding_window_when_using_periods(df_time): + date_time = [ + pd.Timestamp("2020-05-15 12:00:00"), + pd.Timestamp("2020-05-15 12:15:00"), + pd.Timestamp("2020-05-15 12:30:00"), + pd.Timestamp("2020-05-15 12:45:00"), + pd.Timestamp("2020-05-15 13:00:00"), + pd.Timestamp("2020-05-15 13:15:00"), + pd.Timestamp("2020-05-15 13:30:00"), + pd.Timestamp("2020-05-15 13:45:00"), + pd.Timestamp("2020-05-15 14:00:00"), + pd.Timestamp("2020-05-15 14:15:00"), + pd.Timestamp("2020-05-15 14:30:00"), + pd.Timestamp("2020-05-15 14:45:00"), + pd.Timestamp("2020-05-15 15:00:00"), + pd.Timestamp("2020-05-15 15:15:00"), + pd.Timestamp("2020-05-15 15:30:00"), + ] + expected_results = { + "ambient_temp": [ + 31.31, + 31.51, + 32.15, + 32.39, + 32.62, + 32.5, + 32.52, + 32.68, + 33.76, + 34.13, + 34.08, + 33.7, + 33.89, + 34.04, + 34.4, + ], + "module_temp": [ + 49.18, + 49.84, + 52.35, + 50.63, + 49.61, + 47.01, + 46.67, + 47.52, + 49.8, + 55.03, + 54.52, + 47.62, + 46.03, + 44.29, + 46.74, + ], + "irradiation": [ + 0.51, + 0.79, + 0.65, + 0.76, + 0.42, + 0.49, + 0.57, + 0.56, + 0.74, + 0.89, + 0.47, + 0.54, + 0.4, + 0.45, + 0.57, + ], 
+ "color": [ + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "green", + "green", + "green", + "green", + "green", + ], + "ambient_temp_expanding_mean": [ + np.nan, + 31.31, + 31.41, + 31.656666666666666, + 31.84, + 31.996, + 32.08, + 32.142857142857146, + 32.21, + 32.382222222222225, + np.nan, + 34.08, + 33.89, + 33.89, + 33.9275, + ], + "irradiation_expanding_mean": [ + np.nan, + 0.51, + 0.65, + 0.65, + 0.6775, + 0.626, + 0.6033333333333334, + 0.5985714285714285, + 0.59375, + 0.61, + np.nan, + 0.47, + 0.505, + 0.47000000000000003, + 0.465, + ], + } + expected_results_df = pd.DataFrame( + data=expected_results, + index=date_time, + ) + # When setting group_by to color + transformer = ExpandingWindowFeatures( + variables=["ambient_temp", "irradiation"], group_by="color" + ) + df_tr = transformer.fit_transform(df_time) + assert df_tr.equals(expected_results_df) diff --git a/tests/test_time_series/test_forecasting/test_lag_features.py b/tests/test_time_series/test_forecasting/test_lag_features.py index 8ea349778..f55ff168c 100644 --- a/tests/test_time_series/test_forecasting/test_lag_features.py +++ b/tests/test_time_series/test_forecasting/test_lag_features.py @@ -233,3 +233,609 @@ def test_sort_index(df_time): A = Xs[transformer.variables_].iloc[0:4].values B = X_tr[transformer._get_new_features_name()].iloc[1:5].values assert (A == B).all() + + +def test_correct_groupby_lag_when_using_periods(df_time): + date_time = [ + pd.Timestamp("2020-05-15 12:00:00"), + pd.Timestamp("2020-05-15 12:15:00"), + pd.Timestamp("2020-05-15 12:30:00"), + pd.Timestamp("2020-05-15 12:45:00"), + pd.Timestamp("2020-05-15 13:00:00"), + pd.Timestamp("2020-05-15 13:15:00"), + pd.Timestamp("2020-05-15 13:30:00"), + pd.Timestamp("2020-05-15 13:45:00"), + pd.Timestamp("2020-05-15 14:00:00"), + pd.Timestamp("2020-05-15 14:15:00"), + pd.Timestamp("2020-05-15 14:30:00"), + pd.Timestamp("2020-05-15 14:45:00"), + pd.Timestamp("2020-05-15 
15:00:00"), + pd.Timestamp("2020-05-15 15:15:00"), + pd.Timestamp("2020-05-15 15:30:00"), + ] + expected_results = { + "ambient_temp": [ + 31.31, + 31.51, + 32.15, + 32.39, + 32.62, + 32.5, + 32.52, + 32.68, + 33.76, + 34.13, + 34.08, + 33.7, + 33.89, + 34.04, + 34.4, + ], + "module_temp": [ + 49.18, + 49.84, + 52.35, + 50.63, + 49.61, + 47.01, + 46.67, + 47.52, + 49.8, + 55.03, + 54.52, + 47.62, + 46.03, + 44.29, + 46.74, + ], + "irradiation": [ + 0.51, + 0.79, + 0.65, + 0.76, + 0.42, + 0.49, + 0.57, + 0.56, + 0.74, + 0.89, + 0.47, + 0.54, + 0.4, + 0.45, + 0.57, + ], + "color": [ + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "green", + "green", + "green", + "green", + "green", + ], + "ambient_temp_lag_3": [ + np.nan, + np.nan, + np.nan, + 31.31, + 31.51, + 32.15, + 32.39, + 32.62, + 32.5, + 32.52, + np.nan, + np.nan, + np.nan, + 34.08, + 33.7, + ], + "irradiation_lag_3": [ + np.nan, + np.nan, + np.nan, + 0.51, + 0.79, + 0.65, + 0.76, + 0.42, + 0.49, + 0.57, + np.nan, + np.nan, + np.nan, + 0.47, + 0.54, + ], + } + expected_results_df = pd.DataFrame( + data=expected_results, + index=date_time, + ) + # When setting group_by to color + transformer = LagFeatures( + variables=["ambient_temp", "irradiation"], periods=3, group_by="color" + ) + df_tr = transformer.fit_transform(df_time) + assert df_tr.equals(expected_results_df) + + +def test_multiple_periods_with_groupby(df_time): + date_time = [ + pd.Timestamp("2020-05-15 12:00:00"), + pd.Timestamp("2020-05-15 12:15:00"), + pd.Timestamp("2020-05-15 12:30:00"), + pd.Timestamp("2020-05-15 12:45:00"), + pd.Timestamp("2020-05-15 13:00:00"), + pd.Timestamp("2020-05-15 13:15:00"), + pd.Timestamp("2020-05-15 13:30:00"), + pd.Timestamp("2020-05-15 13:45:00"), + pd.Timestamp("2020-05-15 14:00:00"), + pd.Timestamp("2020-05-15 14:15:00"), + pd.Timestamp("2020-05-15 14:30:00"), + pd.Timestamp("2020-05-15 14:45:00"), + pd.Timestamp("2020-05-15 15:00:00"), + 
pd.Timestamp("2020-05-15 15:15:00"), + pd.Timestamp("2020-05-15 15:30:00"), + ] + expected_results = { + "ambient_temp": [ + 31.31, + 31.51, + 32.15, + 32.39, + 32.62, + 32.5, + 32.52, + 32.68, + 33.76, + 34.13, + 34.08, + 33.7, + 33.89, + 34.04, + 34.4, + ], + "module_temp": [ + 49.18, + 49.84, + 52.35, + 50.63, + 49.61, + 47.01, + 46.67, + 47.52, + 49.8, + 55.03, + 54.52, + 47.62, + 46.03, + 44.29, + 46.74, + ], + "irradiation": [ + 0.51, + 0.79, + 0.65, + 0.76, + 0.42, + 0.49, + 0.57, + 0.56, + 0.74, + 0.89, + 0.47, + 0.54, + 0.4, + 0.45, + 0.57, + ], + "color": [ + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "green", + "green", + "green", + "green", + "green", + ], + "ambient_temp_lag_2": [ + np.nan, + np.nan, + 31.31, + 31.51, + 32.15, + 32.39, + 32.62, + 32.5, + 32.52, + 32.68, + np.nan, + np.nan, + 34.08, + 33.7, + 33.89, + ], + "irradiation_lag_2": [ + np.nan, + np.nan, + 0.51, + 0.79, + 0.65, + 0.76, + 0.42, + 0.49, + 0.57, + 0.56, + np.nan, + np.nan, + 0.47, + 0.54, + 0.4, + ], + "ambient_temp_lag_3": [ + np.nan, + np.nan, + np.nan, + 31.31, + 31.51, + 32.15, + 32.39, + 32.62, + 32.5, + 32.52, + np.nan, + np.nan, + np.nan, + 34.08, + 33.7, + ], + "irradiation_lag_3": [ + np.nan, + np.nan, + np.nan, + 0.51, + 0.79, + 0.65, + 0.76, + 0.42, + 0.49, + 0.57, + np.nan, + np.nan, + np.nan, + 0.47, + 0.54, + ], + } + expected_results_df = pd.DataFrame( + data=expected_results, + index=date_time, + ) + # When setting group_by to color + transformer = LagFeatures( + variables=["ambient_temp", "irradiation"], periods=[2, 3], group_by="color" + ) + df_tr = transformer.fit_transform(df_time) + assert df_tr.equals(expected_results_df) + + +def test_correct_groupby_lag_when_using_freq(df_time): + date_time = [ + pd.Timestamp("2020-05-15 12:00:00"), + pd.Timestamp("2020-05-15 12:15:00"), + pd.Timestamp("2020-05-15 12:30:00"), + pd.Timestamp("2020-05-15 12:45:00"), + pd.Timestamp("2020-05-15 13:00:00"), + 
pd.Timestamp("2020-05-15 13:15:00"), + pd.Timestamp("2020-05-15 13:30:00"), + pd.Timestamp("2020-05-15 13:45:00"), + pd.Timestamp("2020-05-15 14:00:00"), + pd.Timestamp("2020-05-15 14:15:00"), + pd.Timestamp("2020-05-15 14:30:00"), + pd.Timestamp("2020-05-15 14:45:00"), + pd.Timestamp("2020-05-15 15:00:00"), + pd.Timestamp("2020-05-15 15:15:00"), + pd.Timestamp("2020-05-15 15:30:00"), + ] + expected_results = { + "ambient_temp": [ + 31.31, + 31.51, + 32.15, + 32.39, + 32.62, + 32.5, + 32.52, + 32.68, + 33.76, + 34.13, + 34.08, + 33.7, + 33.89, + 34.04, + 34.4, + ], + "module_temp": [ + 49.18, + 49.84, + 52.35, + 50.63, + 49.61, + 47.01, + 46.67, + 47.52, + 49.8, + 55.03, + 54.52, + 47.62, + 46.03, + 44.29, + 46.74, + ], + "irradiation": [ + 0.51, + 0.79, + 0.65, + 0.76, + 0.42, + 0.49, + 0.57, + 0.56, + 0.74, + 0.89, + 0.47, + 0.54, + 0.4, + 0.45, + 0.57, + ], + "color": [ + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "green", + "green", + "green", + "green", + "green", + ], + "irradiation_lag_15min": [ + np.nan, + 0.51, + 0.79, + 0.65, + 0.76, + 0.42, + 0.49, + 0.57, + 0.56, + 0.74, + np.nan, + 0.47, + 0.54, + 0.4, + 0.45, + ], + "ambient_temp_lag_15min": [ + np.nan, + 31.31, + 31.51, + 32.15, + 32.39, + 32.62, + 32.5, + 32.52, + 32.68, + 33.76, + np.nan, + 34.08, + 33.7, + 33.89, + 34.04, + ], + } + expected_results_df = pd.DataFrame( + data=expected_results, + index=date_time, + ) + # When setting group_by to color + transformer = LagFeatures( + variables=["irradiation", "ambient_temp"], freq="15min", group_by="color" + ) + df_tr = transformer.fit_transform(df_time) + assert df_tr.equals(expected_results_df) + + +def test_multiple_freq_with_groupby(df_time): + date_time = [ + pd.Timestamp("2020-05-15 12:00:00"), + pd.Timestamp("2020-05-15 12:15:00"), + pd.Timestamp("2020-05-15 12:30:00"), + pd.Timestamp("2020-05-15 12:45:00"), + pd.Timestamp("2020-05-15 13:00:00"), + pd.Timestamp("2020-05-15 13:15:00"), + 
pd.Timestamp("2020-05-15 13:30:00"), + pd.Timestamp("2020-05-15 13:45:00"), + pd.Timestamp("2020-05-15 14:00:00"), + pd.Timestamp("2020-05-15 14:15:00"), + pd.Timestamp("2020-05-15 14:30:00"), + pd.Timestamp("2020-05-15 14:45:00"), + pd.Timestamp("2020-05-15 15:00:00"), + pd.Timestamp("2020-05-15 15:15:00"), + pd.Timestamp("2020-05-15 15:30:00"), + ] + expected_results = { + "ambient_temp": [ + 31.31, + 31.51, + 32.15, + 32.39, + 32.62, + 32.5, + 32.52, + 32.68, + 33.76, + 34.13, + 34.08, + 33.7, + 33.89, + 34.04, + 34.4, + ], + "module_temp": [ + 49.18, + 49.84, + 52.35, + 50.63, + 49.61, + 47.01, + 46.67, + 47.52, + 49.8, + 55.03, + 54.52, + 47.62, + 46.03, + 44.29, + 46.74, + ], + "irradiation": [ + 0.51, + 0.79, + 0.65, + 0.76, + 0.42, + 0.49, + 0.57, + 0.56, + 0.74, + 0.89, + 0.47, + 0.54, + 0.4, + 0.45, + 0.57, + ], + "color": [ + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "green", + "green", + "green", + "green", + "green", + ], + "irradiation_lag_15min": [ + np.nan, + 0.51, + 0.79, + 0.65, + 0.76, + 0.42, + 0.49, + 0.57, + 0.56, + 0.74, + np.nan, + 0.47, + 0.54, + 0.4, + 0.45, + ], + "ambient_temp_lag_15min": [ + np.nan, + 31.31, + 31.51, + 32.15, + 32.39, + 32.62, + 32.5, + 32.52, + 32.68, + 33.76, + np.nan, + 34.08, + 33.7, + 33.89, + 34.04, + ], + "irradiation_lag_30min": [ + np.nan, + np.nan, + 0.51, + 0.79, + 0.65, + 0.76, + 0.42, + 0.49, + 0.57, + 0.56, + np.nan, + np.nan, + 0.47, + 0.54, + 0.4, + ], + "ambient_temp_lag_30min": [ + np.nan, + np.nan, + 31.31, + 31.51, + 32.15, + 32.39, + 32.62, + 32.5, + 32.52, + 32.68, + np.nan, + np.nan, + 34.08, + 33.7, + 33.89, + ], + } + expected_results_df = pd.DataFrame( + data=expected_results, + index=date_time, + ) + # When setting group_by to color + transformer = LagFeatures( + variables=["irradiation", "ambient_temp"], + freq=["15min", "30min"], + group_by="color", + ) + df_tr = transformer.fit_transform(df_time) + assert 
df_tr.equals(expected_results_df) diff --git a/tests/test_time_series/test_forecasting/test_window_features.py b/tests/test_time_series/test_forecasting/test_window_features.py index a03259b7e..40fdbc6fa 100644 --- a/tests/test_time_series/test_forecasting/test_window_features.py +++ b/tests/test_time_series/test_forecasting/test_window_features.py @@ -454,3 +454,305 @@ def test_sort_index(df_time): assert_frame_equal( df_tr[transformer.variables_], Xs[transformer.variables_].sort_index() ) + + +def test_correct_groupby_window_when_using_periods(df_time): + date_time = [ + pd.Timestamp("2020-05-15 12:00:00"), + pd.Timestamp("2020-05-15 12:15:00"), + pd.Timestamp("2020-05-15 12:30:00"), + pd.Timestamp("2020-05-15 12:45:00"), + pd.Timestamp("2020-05-15 13:00:00"), + pd.Timestamp("2020-05-15 13:15:00"), + pd.Timestamp("2020-05-15 13:30:00"), + pd.Timestamp("2020-05-15 13:45:00"), + pd.Timestamp("2020-05-15 14:00:00"), + pd.Timestamp("2020-05-15 14:15:00"), + pd.Timestamp("2020-05-15 14:30:00"), + pd.Timestamp("2020-05-15 14:45:00"), + pd.Timestamp("2020-05-15 15:00:00"), + pd.Timestamp("2020-05-15 15:15:00"), + pd.Timestamp("2020-05-15 15:30:00"), + ] + expected_results = { + "ambient_temp": [ + 31.31, + 31.51, + 32.15, + 32.39, + 32.62, + 32.5, + 32.52, + 32.68, + 33.76, + 34.13, + 34.08, + 33.7, + 33.89, + 34.04, + 34.4, + ], + "module_temp": [ + 49.18, + 49.84, + 52.35, + 50.63, + 49.61, + 47.01, + 46.67, + 47.52, + 49.8, + 55.03, + 54.52, + 47.62, + 46.03, + 44.29, + 46.74, + ], + "irradiation": [ + 0.51, + 0.79, + 0.65, + 0.76, + 0.42, + 0.49, + 0.57, + 0.56, + 0.74, + 0.89, + 0.47, + 0.54, + 0.4, + 0.45, + 0.57, + ], + "color": [ + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "green", + "green", + "green", + "green", + "green", + ], + "ambient_temp_window_3_mean": [ + np.nan, + np.nan, + np.nan, + 31.656666666666666, + 32.016666666666666, + 32.38666666666666, + 32.50333333333333, + 32.54666666666667, + 
32.56666666666667, + 32.98666666666667, + np.nan, + np.nan, + np.nan, + 33.89, + 33.876666666666665, + ], + "irradiation_window_3_mean": [ + np.nan, + np.nan, + np.nan, + 0.65, + 0.7333333333333334, + 0.61, + 0.5566666666666668, + 0.49333333333333335, + 0.54, + 0.6233333333333334, + np.nan, + np.nan, + np.nan, + 0.47000000000000003, + 0.4633333333333334, + ], + } + expected_results_df = pd.DataFrame( + data=expected_results, + index=date_time, + ) + # When setting group_by to color + transformer = WindowFeatures( + variables=["ambient_temp", "irradiation"], window=3, group_by="color" + ) + df_tr = transformer.fit_transform(df_time) + assert df_tr.equals(expected_results_df) + + +def test_multiple_windows_with_groupby(df_time): + date_time = [ + pd.Timestamp("2020-05-15 12:00:00"), + pd.Timestamp("2020-05-15 12:15:00"), + pd.Timestamp("2020-05-15 12:30:00"), + pd.Timestamp("2020-05-15 12:45:00"), + pd.Timestamp("2020-05-15 13:00:00"), + pd.Timestamp("2020-05-15 13:15:00"), + pd.Timestamp("2020-05-15 13:30:00"), + pd.Timestamp("2020-05-15 13:45:00"), + pd.Timestamp("2020-05-15 14:00:00"), + pd.Timestamp("2020-05-15 14:15:00"), + pd.Timestamp("2020-05-15 14:30:00"), + pd.Timestamp("2020-05-15 14:45:00"), + pd.Timestamp("2020-05-15 15:00:00"), + pd.Timestamp("2020-05-15 15:15:00"), + pd.Timestamp("2020-05-15 15:30:00"), + ] + expected_results = { + "ambient_temp": [ + 31.31, + 31.51, + 32.15, + 32.39, + 32.62, + 32.5, + 32.52, + 32.68, + 33.76, + 34.13, + 34.08, + 33.7, + 33.89, + 34.04, + 34.4, + ], + "module_temp": [ + 49.18, + 49.84, + 52.35, + 50.63, + 49.61, + 47.01, + 46.67, + 47.52, + 49.8, + 55.03, + 54.52, + 47.62, + 46.03, + 44.29, + 46.74, + ], + "irradiation": [ + 0.51, + 0.79, + 0.65, + 0.76, + 0.42, + 0.49, + 0.57, + 0.56, + 0.74, + 0.89, + 0.47, + 0.54, + 0.4, + 0.45, + 0.57, + ], + "color": [ + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "blue", + "green", + "green", + "green", + "green", + "green", + ], + 
"ambient_temp_window_2_mean": [ + np.nan, + np.nan, + 31.41, + 31.83, + 32.269999999999996, + 32.505, + 32.56, + 32.510000000000005, + 32.60000000000001, + 33.22, + np.nan, + np.nan, + 33.89, + 33.795, + 33.965, + ], + "irradiation_window_2_mean": [ + np.nan, + np.nan, + 0.65, + 0.72, + 0.7050000000000001, + 0.59, + 0.45499999999999996, + 0.53, + 0.5650000000000001, + 0.6500000000000001, + np.nan, + np.nan, + 0.505, + 0.47000000000000003, + 0.42500000000000004, + ], + "ambient_temp_window_3_mean": [ + np.nan, + np.nan, + np.nan, + 31.656666666666666, + 32.016666666666666, + 32.38666666666666, + 32.50333333333333, + 32.54666666666667, + 32.56666666666667, + 32.98666666666667, + np.nan, + np.nan, + np.nan, + 33.89, + 33.876666666666665, + ], + "irradiation_window_3_mean": [ + np.nan, + np.nan, + np.nan, + 0.65, + 0.7333333333333334, + 0.61, + 0.5566666666666668, + 0.49333333333333335, + 0.54, + 0.6233333333333334, + np.nan, + np.nan, + np.nan, + 0.47000000000000003, + 0.4633333333333334, + ], + } + expected_results_df = pd.DataFrame( + data=expected_results, + index=date_time, + ) + # When setting group_by to color + transformer = WindowFeatures( + variables=["ambient_temp", "irradiation"], window=[2, 3], group_by="color" + ) + df_tr = transformer.fit_transform(df_time) + assert df_tr.equals(expected_results_df)