From e13428b04001e8b5f90dd95eac8cf1dc623bbcbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Correia?= Date: Mon, 27 May 2024 03:33:45 +0100 Subject: [PATCH 01/20] ENH: Enabled skipna argument on groupby reduction ops (#15675) Added a skipna argument to the groupby reduction ops: sum, prod, min, max, mean, median, var, std and sem Added relevant tests Updated whatsnew to reflect changes Co-authored-by: Tiago Firmino --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/_libs/groupby.pyx | 103 ++++++++++++----- pandas/core/_numba/executor.py | 15 ++- pandas/core/_numba/kernels/mean_.py | 6 +- pandas/core/_numba/kernels/min_max_.py | 4 + pandas/core/_numba/kernels/sum_.py | 62 ++++++----- pandas/core/_numba/kernels/var_.py | 66 ++++++----- pandas/core/groupby/groupby.py | 70 ++++++++++-- pandas/tests/groupby/test_api.py | 12 +- pandas/tests/groupby/test_numba.py | 55 +++++++++ pandas/tests/groupby/test_reductions.py | 142 ++++++++++++++++++++++++ 11 files changed, 428 insertions(+), 109 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 6a6abcf2d48fe..a2174866e4b0f 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -39,13 +39,13 @@ Other enhancements - Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`) - :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`) - :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`) +- :meth:`.DataFrameGroupBy.sum`, :meth:`.DataFrameGroupBy.prod`, :meth:`.DataFrameGroupBy.min`, :meth:`.DataFrameGroupBy.max`, :meth:`.DataFrameGroupBy.mean`, :meth:`.DataFrameGroupBy.median`, :meth:`.DataFrameGroupBy.sem`, :meth:`.DataFrameGroupBy.std` and :meth:`.DataFrameGroupBy.var` now accept a ``skipna`` argument (:issue:`15675`) - :meth:`DataFrame.corrwith` now accepts ``min_periods`` as optional arguments, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`) - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`) - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`) - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`) - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`) - Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`) -- .. --------------------------------------------------------------------------- .. 
_whatsnew_300.notable_bug_fixes: diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 15f8727c38f8d..d25f1e1b5f3d1 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -104,7 +104,8 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept n cdef float64_t median_linear( float64_t* a, int n, - bint is_datetimelike=False + bint is_datetimelike=False, + bint skipna=True ) noexcept nogil: cdef: int i, j, na_count = 0 @@ -118,10 +119,14 @@ cdef float64_t median_linear( if is_datetimelike: for i in range(n): if a[i] == NPY_NAT: + if not skipna: + return NaN na_count += 1 else: for i in range(n): if a[i] != a[i]: + if not skipna: + return NaN na_count += 1 if na_count: @@ -186,6 +191,7 @@ def group_median_float64( const uint8_t[:, :] mask=None, uint8_t[:, ::1] result_mask=None, bint is_datetimelike=False, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 @@ -244,7 +250,7 @@ def group_median_float64( ptr += _counts[0] for j in range(ngroups): size = _counts[j + 1] - out[j, i] = median_linear(ptr, size, is_datetimelike) + out[j, i] = median_linear(ptr, size, is_datetimelike, skipna) ptr += size @@ -694,6 +700,7 @@ def group_sum( uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=0, bint is_datetimelike=False, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 using Kahan summation @@ -733,32 +740,39 @@ def group_sum( else: isna_entry = _treat_as_na(val, is_datetimelike) - if not isna_entry: - nobs[lab, j] += 1 + if isna_entry: + if skipna: + continue + else: + sumx[lab, j] = val + compensation[lab, j] = 0 + break - if sum_t is object: - # NB: this does not use 'compensation' like the non-object - # track does. - if nobs[lab, j] == 1: - # i.e. we haven't added anything yet; avoid TypeError - # if e.g. val is a str and sumx[lab, j] is 0 - t = val - else: - t = sumx[lab, j] + val - sumx[lab, j] = t + nobs[lab, j] += 1 + if sum_t is object: + # NB: this does not use 'compensation' like the non-object + # track does. + if nobs[lab, j] == 1: + # i.e. we haven't added anything yet; avoid TypeError + # if e.g. val is a str and sumx[lab, j] is 0 + t = val else: - y = val - compensation[lab, j] - t = sumx[lab, j] + y - compensation[lab, j] = t - sumx[lab, j] - y - if compensation[lab, j] != compensation[lab, j]: - # GH#53606 - # If val is +/- infinity compensation is NaN - # which would lead to results being NaN instead - # of +/- infinity. We cannot use util.is_nan - # because of no gil - compensation[lab, j] = 0 - sumx[lab, j] = t + t = sumx[lab, j] + val + sumx[lab, j] = t + + else: + y = val - compensation[lab, j] + t = sumx[lab, j] + y + compensation[lab, j] = t - sumx[lab, j] - y + if compensation[lab, j] != compensation[lab, j]: + # GH#53606 + # If val is +/- infinity compensation is NaN + # which would lead to results being NaN instead + # of +/- infinity. 
We cannot use util.is_nan + # because of no gil + compensation[lab, j] = 0 + sumx[lab, j] = t _check_below_mincount( out, uses_mask, result_mask, ncounts, K, nobs, min_count, sumx @@ -775,6 +789,7 @@ def group_prod( const uint8_t[:, ::1] mask, uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=0, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 @@ -813,6 +828,10 @@ def group_prod( if not isna_entry: nobs[lab, j] += 1 prodx[lab, j] *= val + elif not skipna: + prodx[lab, j] = val + nobs[lab, j] = 0 + break _check_below_mincount( out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx @@ -832,6 +851,7 @@ def group_var( const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None, bint is_datetimelike=False, + bint skipna=True, str name="var", ) -> None: cdef: @@ -877,7 +897,12 @@ def group_var( else: isna_entry = _treat_as_na(val, is_datetimelike) - if not isna_entry: + if not skipna and isna_entry: + out[lab, j] = val + nobs[lab, j] = 0 + break + + elif not isna_entry: nobs[lab, j] += 1 oldmean = mean[lab, j] mean[lab, j] += (val - oldmean) / nobs[lab, j] @@ -998,6 +1023,7 @@ def group_mean( const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, + bint skipna=True, const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None, ) -> None: @@ -1021,6 +1047,8 @@ def group_mean( Only used in sum and prod. Always -1. is_datetimelike : bool True if `values` contains datetime-like entries. + skipna : bool, default True + Exclude NA/null values when computing the result. mask : ndarray[bool, ndim=2], optional Mask of the input values. result_mask : ndarray[bool, ndim=2], optional @@ -1078,7 +1106,12 @@ def group_mean( else: isna_entry = _treat_as_na(val, is_datetimelike) - if not isna_entry: + if not skipna and isna_entry: + sumx[lab, j] = nan_val + nobs[lab, j] = 0 + break + + elif not isna_entry: nobs[lab, j] += 1 y = val - compensation[lab, j] t = sumx[lab, j] + y @@ -1096,12 +1129,10 @@ def group_mean( for j in range(K): count = nobs[i, j] if nobs[i, j] == 0: - if uses_mask: result_mask[i, j] = True else: out[i, j] = nan_val - else: out[i, j] = sumx[i, j] / count @@ -1660,6 +1691,7 @@ cdef group_min_max( Py_ssize_t min_count=-1, bint is_datetimelike=False, bint compute_max=True, + bint skipna=True, const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None, ): @@ -1683,6 +1715,8 @@ cdef group_min_max( True if `values` contains datetime-like entries. compute_max : bint, default True True to compute group-wise max, False to compute min + skipna : bool, default True + Exclude NA/null values when computing the result. 
mask : ndarray[bool, ndim=2], optional If not None, indices represent missing values, otherwise the mask will not be used @@ -1729,7 +1763,12 @@ cdef group_min_max( else: isna_entry = _treat_as_na(val, is_datetimelike) - if not isna_entry: + if not skipna and isna_entry: + group_min_or_max[lab, j] = val + nobs[lab, j] = 0 + break + + elif not isna_entry: nobs[lab, j] += 1 if compute_max: if val > group_min_or_max[lab, j]: @@ -1866,6 +1905,7 @@ def group_max( const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, + bint skipna=True, const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None, ) -> None: @@ -1880,6 +1920,7 @@ def group_max( compute_max=True, mask=mask, result_mask=result_mask, + skipna=skipna, ) @@ -1892,6 +1933,7 @@ def group_min( const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, + bint skipna=True, const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None, ) -> None: @@ -1906,6 +1948,7 @@ def group_min( compute_max=False, mask=mask, result_mask=result_mask, + skipna=skipna, ) diff --git a/pandas/core/_numba/executor.py b/pandas/core/_numba/executor.py index 0a26acb7df60a..032534ae22463 100644 --- a/pandas/core/_numba/executor.py +++ b/pandas/core/_numba/executor.py @@ -69,13 +69,20 @@ def column_looper( labels: np.ndarray, ngroups: int, min_periods: int, + skipna: bool = True, *args, ): result = np.empty((values.shape[0], ngroups), dtype=result_dtype) na_positions = {} for i in numba.prange(values.shape[0]): output, na_pos = func( - values[i], result_dtype, labels, ngroups, min_periods, *args + values[i], + result_dtype, + labels, + ngroups, + min_periods, + *args, + skipna, ) result[i] = output if len(na_pos) > 0: @@ -162,6 +169,7 @@ def generate_shared_aggregator( nopython: bool, nogil: bool, parallel: bool, + skipna: bool = True, ): """ Generate a Numba function that loops over the columns 2D object and applies @@ -190,7 +198,6 @@ def generate_shared_aggregator( ------- Numba function """ - # A wrapper around the looper function, # to dispatch based on dtype since numba is unable to do that in nopython mode @@ -214,11 +221,11 @@ def looper_wrapper( # Need to unpack kwargs since numba only supports *args if is_grouped_kernel: result, na_positions = column_looper( - values, labels, ngroups, min_periods, *kwargs.values() + values, labels, ngroups, min_periods, skipna, *kwargs.values() ) else: result, na_positions = column_looper( - values, start, end, min_periods, *kwargs.values() + values, start, end, min_periods, skipna, *kwargs.values() ) if result.dtype.kind == "i": # Look if na_positions is not empty diff --git a/pandas/core/_numba/kernels/mean_.py b/pandas/core/_numba/kernels/mean_.py index cc10bd003af7e..c1c07f057f0f2 100644 --- a/pandas/core/_numba/kernels/mean_.py +++ b/pandas/core/_numba/kernels/mean_.py @@ -169,9 +169,10 @@ def grouped_mean( labels: npt.NDArray[np.intp], ngroups: int, min_periods: int, + skipna: bool = True, ) -> tuple[np.ndarray, list[int]]: output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum( - values, result_dtype, labels, ngroups + values, result_dtype, labels, ngroups, skipna ) # Post-processing, replace sums that don't satisfy min_periods @@ -187,7 +188,8 @@ def grouped_mean( result = sum_x else: result = np.nan - result /= nobs + if nobs != 0: + result /= nobs output[lab] = result # na_position is empty list since float64 can already hold nans diff --git a/pandas/core/_numba/kernels/min_max_.py b/pandas/core/_numba/kernels/min_max_.py index 
59d36732ebae6..9190685d6dd22 100644 --- a/pandas/core/_numba/kernels/min_max_.py +++ b/pandas/core/_numba/kernels/min_max_.py @@ -88,6 +88,7 @@ def grouped_min_max( ngroups: int, min_periods: int, is_max: bool, + skipna: bool = True, ) -> tuple[np.ndarray, list[int]]: N = len(labels) nobs = np.zeros(ngroups, dtype=np.int64) @@ -102,6 +103,9 @@ def grouped_min_max( if values.dtype.kind == "i" or not np.isnan(val): nobs[lab] += 1 + elif not skipna and np.isnan(val): + output[lab] = np.nan + continue else: # NaN value cannot be a min/max value continue diff --git a/pandas/core/_numba/kernels/sum_.py b/pandas/core/_numba/kernels/sum_.py index 76f4e22b43c4b..f98749ee4cb4a 100644 --- a/pandas/core/_numba/kernels/sum_.py +++ b/pandas/core/_numba/kernels/sum_.py @@ -165,6 +165,7 @@ def grouped_kahan_sum( result_dtype: np.dtype, labels: npt.NDArray[np.intp], ngroups: int, + skipna: bool = True, ) -> tuple[ np.ndarray, npt.NDArray[np.int64], np.ndarray, npt.NDArray[np.int64], np.ndarray ]: @@ -183,32 +184,38 @@ def grouped_kahan_sum( if lab < 0: continue - sum_x = output[lab] - nobs = nobs_arr[lab] - compensation_add = comp_arr[lab] - num_consecutive_same_value = consecutive_counts[lab] - prev_value = prev_vals[lab] + if not skipna and np.isnan(val): + output[lab] = val + consecutive_counts[lab] = 0 + + else: + sum_x = output[lab] + nobs = nobs_arr[lab] + compensation_add = comp_arr[lab] + num_consecutive_same_value = consecutive_counts[lab] + prev_value = prev_vals[lab] + + ( + nobs, + sum_x, + compensation_add, + num_consecutive_same_value, + prev_value, + ) = add_sum( + val, + nobs, + sum_x, + compensation_add, + num_consecutive_same_value, + prev_value, + ) + + output[lab] = sum_x + consecutive_counts[lab] = num_consecutive_same_value + prev_vals[lab] = prev_value + comp_arr[lab] = compensation_add + nobs_arr[lab] = nobs - ( - nobs, - sum_x, - compensation_add, - num_consecutive_same_value, - prev_value, - ) = add_sum( - val, - nobs, - sum_x, - compensation_add, - num_consecutive_same_value, - prev_value, - ) - - output[lab] = sum_x - consecutive_counts[lab] = num_consecutive_same_value - prev_vals[lab] = prev_value - comp_arr[lab] = compensation_add - nobs_arr[lab] = nobs return output, nobs_arr, comp_arr, consecutive_counts, prev_vals @@ -219,11 +226,12 @@ def grouped_sum( labels: npt.NDArray[np.intp], ngroups: int, min_periods: int, + skipna: bool = True, ) -> tuple[np.ndarray, list[int]]: na_pos = [] output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum( - values, result_dtype, labels, ngroups + values, result_dtype, labels, ngroups, skipna ) # Post-processing, replace sums that don't satisfy min_periods @@ -232,7 +240,7 @@ def grouped_sum( num_consecutive_same_value = consecutive_counts[lab] prev_value = prev_vals[lab] sum_x = output[lab] - if nobs >= min_periods: + if not np.isnan(sum_x) and nobs >= min_periods: if num_consecutive_same_value >= nobs: result = prev_value * nobs else: diff --git a/pandas/core/_numba/kernels/var_.py b/pandas/core/_numba/kernels/var_.py index 69aec4d6522c4..9e9c7c7e20c1c 100644 --- a/pandas/core/_numba/kernels/var_.py +++ b/pandas/core/_numba/kernels/var_.py @@ -176,6 +176,7 @@ def grouped_var( ngroups: int, min_periods: int, ddof: int = 1, + skipna: bool = True, ) -> tuple[np.ndarray, list[int]]: N = len(labels) @@ -193,36 +194,41 @@ def grouped_var( if lab < 0: continue - mean_x = means[lab] - ssqdm_x = output[lab] - nobs = nobs_arr[lab] - compensation_add = comp_arr[lab] - num_consecutive_same_value = consecutive_counts[lab] - 
prev_value = prev_vals[lab] - - ( - nobs, - mean_x, - ssqdm_x, - compensation_add, - num_consecutive_same_value, - prev_value, - ) = add_var( - val, - nobs, - mean_x, - ssqdm_x, - compensation_add, - num_consecutive_same_value, - prev_value, - ) - - output[lab] = ssqdm_x - means[lab] = mean_x - consecutive_counts[lab] = num_consecutive_same_value - prev_vals[lab] = prev_value - comp_arr[lab] = compensation_add - nobs_arr[lab] = nobs + if not skipna and np.isnan(val): + output[lab] = val + nobs_arr[lab] = 0 + + else: + mean_x = means[lab] + ssqdm_x = output[lab] + nobs = nobs_arr[lab] + compensation_add = comp_arr[lab] + num_consecutive_same_value = consecutive_counts[lab] + prev_value = prev_vals[lab] + + ( + nobs, + mean_x, + ssqdm_x, + compensation_add, + num_consecutive_same_value, + prev_value, + ) = add_var( + val, + nobs, + mean_x, + ssqdm_x, + compensation_add, + num_consecutive_same_value, + prev_value, + ) + + output[lab] = ssqdm_x + means[lab] = mean_x + consecutive_counts[lab] = num_consecutive_same_value + prev_vals[lab] = prev_value + comp_arr[lab] = compensation_add + nobs_arr[lab] = nobs # Post-processing, replace vars that don't satisfy min_periods for lab in range(ngroups): diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 1b58317c08736..34613fc82b204 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -1390,6 +1390,7 @@ def _numba_agg_general( func: Callable, dtype_mapping: dict[np.dtype, Any], engine_kwargs: dict[str, bool] | None, + skipna: bool = True, **aggregator_kwargs, ): """ @@ -1408,6 +1409,7 @@ def _numba_agg_general( func, dtype_mapping, True, # is_grouped_kernel + skipna=skipna, **get_jit_arguments(engine_kwargs), ) # Pass group ids to kernel directly if it can handle it @@ -1752,6 +1754,7 @@ def _agg_general( numeric_only: bool = False, min_count: int = -1, *, + skipna: bool = True, alias: str, npfunc: Callable | None = None, **kwargs, @@ -1759,6 +1762,7 @@ def _agg_general( result = self._cython_agg_general( how=alias, alt=npfunc, + skipna=skipna, numeric_only=numeric_only, min_count=min_count, **kwargs, @@ -2226,6 +2230,7 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: def mean( self, numeric_only: bool = False, + skipna: bool = True, engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, ): @@ -2241,6 +2246,9 @@ def mean( numeric_only no longer accepts ``None`` and defaults to ``False``. + skipna : bool, default True + Exclude NA/null values when computing the result. + engine : str, default None * ``'cython'`` : Runs the operation through C-extensions from cython. * ``'numba'`` : Runs the operation through JIT compiled code from numba. @@ -2307,17 +2315,22 @@ def mean( executor.float_dtype_mapping, engine_kwargs, min_periods=0, + skipna=skipna, ) else: result = self._cython_agg_general( "mean", - alt=lambda x: Series(x, copy=False).mean(numeric_only=numeric_only), + alt=lambda x: Series(x, copy=False).mean( + numeric_only=numeric_only, + skipna=skipna, + ), numeric_only=numeric_only, + skipna=skipna, ) return result.__finalize__(self.obj, method="groupby") @final - def median(self, numeric_only: bool = False) -> NDFrameT: + def median(self, numeric_only: bool = False, skipna: bool = True) -> NDFrameT: """ Compute median of groups, excluding missing values. @@ -2332,6 +2345,9 @@ def median(self, numeric_only: bool = False) -> NDFrameT: numeric_only no longer accepts ``None`` and defaults to False. 
+ skipna : bool, default True + Exclude NA/null values when computing the result. + Returns ------- Series or DataFrame @@ -2398,8 +2414,11 @@ def median(self, numeric_only: bool = False) -> NDFrameT: """ result = self._cython_agg_general( "median", - alt=lambda x: Series(x, copy=False).median(numeric_only=numeric_only), + alt=lambda x: Series(x, copy=False).median( + numeric_only=numeric_only, skipna=skipna + ), numeric_only=numeric_only, + skipna=skipna, ) return result.__finalize__(self.obj, method="groupby") @@ -2412,6 +2431,7 @@ def std( engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, numeric_only: bool = False, + skipna: bool = True, ): """ Compute standard deviation of groups, excluding missing values. @@ -2449,6 +2469,9 @@ def std( numeric_only now defaults to ``False``. + skipna : bool, default True + Exclude NA/null values when computing the result. + Returns ------- Series or DataFrame @@ -2503,14 +2526,16 @@ def std( engine_kwargs, min_periods=0, ddof=ddof, + skipna=skipna, ) ) else: return self._cython_agg_general( "std", - alt=lambda x: Series(x, copy=False).std(ddof=ddof), + alt=lambda x: Series(x, copy=False).std(ddof=ddof, skipna=skipna), numeric_only=numeric_only, ddof=ddof, + skipna=skipna, ) @final @@ -2520,6 +2545,7 @@ def var( self, ddof: int = 1, engine: Literal["cython", "numba"] | None = None, + skipna: bool = True, engine_kwargs: dict[str, bool] | None = None, numeric_only: bool = False, ): @@ -2559,6 +2585,9 @@ def var( numeric_only now defaults to ``False``. + skipna : bool, default True + Exclude NA/null values when computing the result. + Returns ------- Series or DataFrame @@ -2612,13 +2641,15 @@ def var( engine_kwargs, min_periods=0, ddof=ddof, + skipna=skipna, ) else: return self._cython_agg_general( "var", - alt=lambda x: Series(x, copy=False).var(ddof=ddof), + alt=lambda x: Series(x, copy=False).var(ddof=ddof, skipna=skipna), numeric_only=numeric_only, ddof=ddof, + skipna=skipna, ) @final @@ -2748,7 +2779,9 @@ def _value_counts( return result.__finalize__(self.obj, method="value_counts") @final - def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: + def sem( + self, ddof: int = 1, numeric_only: bool = False, skipna: bool = True + ) -> NDFrameT: """ Compute standard error of the mean of groups, excluding missing values. @@ -2768,6 +2801,9 @@ def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: numeric_only now defaults to ``False``. + skipna : bool, default True + Exclude NA/null values when computing the result. 
+ Returns ------- Series or DataFrame @@ -2837,9 +2873,10 @@ def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: ) return self._cython_agg_general( "sem", - alt=lambda x: Series(x, copy=False).sem(ddof=ddof), + alt=lambda x: Series(x, copy=False).sem(ddof=ddof, skipna=skipna), numeric_only=numeric_only, ddof=ddof, + skipna=skipna, ) @final @@ -2985,6 +3022,7 @@ def sum( self, numeric_only: bool = False, min_count: int = 0, + skipna: bool = True, engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, ): @@ -2996,6 +3034,7 @@ def sum( executor.default_dtype_mapping, engine_kwargs, min_periods=min_count, + skipna=skipna, ) else: # If we are grouping on categoricals we want unobserved categories to @@ -3005,6 +3044,7 @@ def sum( result = self._agg_general( numeric_only=numeric_only, min_count=min_count, + skipna=skipna, alias="sum", npfunc=np.sum, ) @@ -3052,9 +3092,15 @@ def sum( 2 30 72""" ), ) - def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT: + def prod( + self, numeric_only: bool = False, min_count: int = 0, skipna: bool = True + ) -> NDFrameT: return self._agg_general( - numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod + numeric_only=numeric_only, + min_count=min_count, + alias="prod", + npfunc=np.prod, + skipna=skipna, ) @final @@ -3104,6 +3150,7 @@ def min( self, numeric_only: bool = False, min_count: int = -1, + skipna: bool = True, engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, ): @@ -3116,6 +3163,7 @@ def min( engine_kwargs, min_periods=min_count, is_max=False, + skipna=skipna, ) else: return self._agg_general( @@ -3123,6 +3171,7 @@ def min( min_count=min_count, alias="min", npfunc=np.min, + skipna=skipna, ) @final @@ -3172,6 +3221,7 @@ def max( self, numeric_only: bool = False, min_count: int = -1, + skipna: bool = True, engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, ): @@ -3184,6 +3234,7 @@ def max( engine_kwargs, min_periods=min_count, is_max=True, + skipna=skipna, ) else: return self._agg_general( @@ -3191,6 +3242,7 @@ def max( min_count=min_count, alias="max", npfunc=np.max, + skipna=skipna, ) @final diff --git a/pandas/tests/groupby/test_api.py b/pandas/tests/groupby/test_api.py index 33b39bad4ab81..635396a1227f2 100644 --- a/pandas/tests/groupby/test_api.py +++ b/pandas/tests/groupby/test_api.py @@ -173,13 +173,13 @@ def test_frame_consistency(groupby_func): elif groupby_func in ("nunique",): exclude_expected = {"axis"} elif groupby_func in ("max", "min"): - exclude_expected = {"axis", "kwargs", "skipna"} + exclude_expected = {"axis", "kwargs"} exclude_result = {"min_count", "engine", "engine_kwargs"} elif groupby_func in ("mean", "std", "sum", "var"): - exclude_expected = {"axis", "kwargs", "skipna"} + exclude_expected = {"axis", "kwargs"} exclude_result = {"engine", "engine_kwargs"} elif groupby_func in ("median", "prod", "sem"): - exclude_expected = {"axis", "kwargs", "skipna"} + exclude_expected = {"axis", "kwargs"} elif groupby_func in ("bfill", "ffill"): exclude_expected = {"inplace", "axis", "limit_area"} elif groupby_func in ("cummax", "cummin"): @@ -231,13 +231,13 @@ def test_series_consistency(request, groupby_func): if groupby_func in ("any", "all"): exclude_expected = {"kwargs", "bool_only", "axis"} elif groupby_func in ("max", "min"): - exclude_expected = {"axis", "kwargs", "skipna"} + exclude_expected = {"axis", "kwargs"} exclude_result = {"min_count", 
"engine", "engine_kwargs"} elif groupby_func in ("mean", "std", "sum", "var"): - exclude_expected = {"axis", "kwargs", "skipna"} + exclude_expected = {"axis", "kwargs"} exclude_result = {"engine", "engine_kwargs"} elif groupby_func in ("median", "prod", "sem"): - exclude_expected = {"axis", "kwargs", "skipna"} + exclude_expected = {"axis", "kwargs"} elif groupby_func in ("bfill", "ffill"): exclude_expected = {"inplace", "axis", "limit_area"} elif groupby_func in ("cummax", "cummin"): diff --git a/pandas/tests/groupby/test_numba.py b/pandas/tests/groupby/test_numba.py index 3e32031e51138..1699946b6cc1f 100644 --- a/pandas/tests/groupby/test_numba.py +++ b/pandas/tests/groupby/test_numba.py @@ -1,3 +1,4 @@ +import numpy as np import pytest from pandas import ( @@ -71,3 +72,57 @@ def test_no_engine_doesnt_raise(self): res = gb.agg({"b": "first"}) expected = gb.agg({"b": "first"}) tm.assert_frame_equal(res, expected) + + @pytest.mark.parametrize( + "numba_method", ["sum", "min", "max", "std", "var", "mean"] + ) + def test_skipna_numba(self, numba_method): + # GH15675 + df = DataFrame( + { + "l": ["A", "A", "A", "B", "B", "B"], + "int": [-1, 1, -1, 1, 1, np.nan], + "float": [-1.0, 1.2, -1.1, 1.5, 1.0, np.nan], + } + ) + + result_numba = getattr(df.groupby("l").int, numba_method)( + skipna=False, engine="numba" + ) + expected = df.groupby("l").int.apply( + lambda x: getattr(x, numba_method)(skipna=False) + ) + tm.assert_series_equal(result_numba, expected, check_exact=False) + + result_numba = getattr(df.groupby("l").float, numba_method)( + skipna=False, engine="numba" + ) + expected = df.groupby("l").float.apply( + lambda x: getattr(x, numba_method)(skipna=False) + ) + tm.assert_series_equal(result_numba, expected, check_exact=False) + + @pytest.mark.parametrize( + "numba_method", ["sum", "min", "max", "std", "var", "mean"] + ) + def test_skipna_consistency_numba(self, numba_method): + # GH15675 + df = DataFrame( + { + "l": ["A", "A", "A", "B", "B", "B"], + "int": [-1, 1, -1, 1, 1, np.nan], + "float": [-1.0, 1.2, -1.1, 1.5, 1.0, np.nan], + } + ) + + result_with_arg = getattr(df.groupby("l").int, numba_method)( + skipna=True, engine="numba" + ) + result_default = getattr(df.groupby("l").int, numba_method)(engine="numba") + tm.assert_series_equal(result_with_arg, result_default, check_exact=False) + + result_with_arg = getattr(df.groupby("l").float, numba_method)( + skipna=True, engine="numba" + ) + result_default = getattr(df.groupby("l").float, numba_method)(engine="numba") + tm.assert_series_equal(result_with_arg, result_default, check_exact=False) diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index edc94b2beeec1..a25170aa4c771 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -1047,6 +1047,120 @@ def scipy_sem(*args, **kwargs): return sem(*args, ddof=1, **kwargs) +@pytest.mark.parametrize( + "reduction_method", + ["sum", "min", "max", "mean", "median", "prod", "sem", "std", "var"], +) +def test_skipna_reduction_ops_cython(reduction_method): + # GH15675 + # Testing the skipna parameter against possible datatypes + df = DataFrame( + { + "l": ["A", "A", "A", "A", "B", "B", "B", "B"], + "int": [-1, 1, -1, 2, 1, 1, 1, np.nan], + "float": [-1.0, 1.2, -1.1, 1.5, -1.1, 1.5, np.nan, 1.0], + "t": [ + Timestamp("2024-01-01"), + Timestamp("2024-01-02"), + Timestamp("2024-01-03"), + Timestamp("2024-01-04"), + Timestamp("2024-01-06"), + Timestamp("2024-01-07"), + Timestamp("2024-01-08"), + pd.NaT, + ], + 
"td": [ + pd.Timedelta(days=1), + pd.Timedelta(days=2), + pd.Timedelta(days=3), + pd.Timedelta(days=4), + pd.Timedelta(days=6), + pd.Timedelta(days=7), + pd.Timedelta(days=8), + pd.NaT, + ], + } + ) + + result_cython = getattr(df.groupby("l").int, reduction_method)(skipna=False) + expected = df.groupby("l").int.apply( + lambda x: getattr(x, reduction_method)(skipna=False) + ) + tm.assert_series_equal(result_cython, expected, check_exact=False) + + result_cython = getattr(df.groupby("l").float, reduction_method)(skipna=False) + expected = df.groupby("l").float.apply( + lambda x: getattr(x, reduction_method)(skipna=False) + ) + tm.assert_series_equal(result_cython, expected, check_exact=False) + + if reduction_method in ["min", "max", "mean", "median", "std"]: + result_ts = getattr(df.groupby("l").t, reduction_method)(skipna=False) + expected_ts = df.groupby("l").t.apply( + lambda x: getattr(x, reduction_method)(skipna=False) + ) + tm.assert_series_equal(result_ts, expected_ts, check_exact=False) + + result_td = getattr(df.groupby("l").td, reduction_method)(skipna=False) + expected_td = df.groupby("l").td.apply( + lambda x: getattr(x, reduction_method)(skipna=False) + ) + tm.assert_series_equal(result_td, expected_td, check_exact=False) + + +@pytest.mark.parametrize( + "reduction_method", + ["sum", "min", "max", "mean", "median", "prod", "sem", "std", "var"], +) +def test_skipna_reduction_ops_consistency(reduction_method): + # GH15675 + # Testing if provinding skipna=True maintains the default functionality + df = DataFrame( + { + "l": ["A", "A", "A", "A", "B", "B", "B", "B"], + "int": [-1, 1, -1, 2, 1, 1, 1, np.nan], + "float": [-1.0, 1.2, -1.1, 1.5, -1.1, 1.5, np.nan, 1.0], + "t": [ + Timestamp("2024-01-01"), + Timestamp("2024-01-02"), + Timestamp("2024-01-03"), + Timestamp("2024-01-04"), + Timestamp("2024-01-05"), + Timestamp("2024-01-06"), + pd.NaT, + Timestamp("2024-01-07"), + ], + "td": [ + pd.Timedelta(days=1), + pd.Timedelta(days=2), + pd.Timedelta(days=3), + pd.Timedelta(days=4), + pd.Timedelta(days=5), + pd.Timedelta(days=6), + pd.NaT, + pd.Timedelta(days=7), + ], + } + ) + + result_with_arg = getattr(df.groupby("l").int, reduction_method)(skipna=True) + result_default = getattr(df.groupby("l").int, reduction_method)() + tm.assert_series_equal(result_with_arg, result_default, check_exact=False) + + result_with_arg = getattr(df.groupby("l").float, reduction_method)(skipna=True) + result_default = getattr(df.groupby("l").float, reduction_method)() + tm.assert_series_equal(result_with_arg, result_default, check_exact=False) + + if reduction_method in ["min", "max", "mean", "median", "std"]: + result_ts_with_arg = getattr(df.groupby("l").t, reduction_method)(skipna=True) + result_ts_default = getattr(df.groupby("l").t, reduction_method)() + tm.assert_series_equal(result_ts_with_arg, result_ts_default, check_exact=False) + + result_td_with_arg = getattr(df.groupby("l").td, reduction_method)(skipna=True) + result_td_default = getattr(df.groupby("l").td, reduction_method)() + tm.assert_series_equal(result_td_with_arg, result_td_default, check_exact=False) + + @pytest.mark.parametrize( "op,targop", [ @@ -1192,3 +1306,31 @@ def test_groupby_std_datetimelike(): exp_ser = Series([td1 * 2, td1, td1, td1, td4], index=np.arange(5)) expected = DataFrame({"A": exp_ser, "B": exp_ser, "C": exp_ser}) tm.assert_frame_equal(result, expected) + + +def test_skipna_string_sum(): + # GH15675 + df = DataFrame( + { + "l": ["A", "A", "A", "B", "B", "B"], + "v": ["foo", "bar", "baz", "foo", pd.NA, "foo"], + } 
+ ) + + result_cython = df.groupby("l").v.sum(skipna=False) + expected = df.groupby("l").v.apply(lambda x: x.sum(skipna=False)) + tm.assert_series_equal(result_cython, expected, check_exact=False) + + +def test_skipna_string_sum_consistency(): + # GH15675 + df = DataFrame( + { + "l": ["A", "A", "A", "B", "B", "B"], + "v": ["foo", "bar", "baz", "foo", pd.NA, "foo"], + } + ) + + result_cython = df.groupby("l").v.sum(skipna=True) + expected = df.groupby("l").v.sum() + tm.assert_series_equal(result_cython, expected, check_exact=False) From 8099d8493c62daa3c444ce32e6b6f8b619371c76 Mon Sep 17 00:00:00 2001 From: Tiago Firmino Date: Mon, 27 May 2024 18:21:52 +0100 Subject: [PATCH 02/20] FIX: fixed pipeline issues related to docs and window tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: André Correia --- pandas/_libs/groupby.pyi | 7 +++++++ pandas/core/_numba/kernels/mean_.py | 5 +++-- pandas/core/_numba/kernels/min_max_.py | 3 ++- pandas/core/_numba/kernels/sum_.py | 3 ++- pandas/core/_numba/kernels/var_.py | 5 +++-- pandas/core/groupby/groupby.py | 12 +++++++++++- pandas/core/resample.py | 7 ++++--- 7 files changed, 32 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 53f5f73624232..3d75bbf953137 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -13,6 +13,7 @@ def group_median_float64( mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., is_datetimelike: bool = ..., # bint + skipna: bool = ..., # bint ) -> None: ... def group_cumprod( out: np.ndarray, # float64_t[:, ::1] @@ -66,6 +67,7 @@ def group_sum( result_mask: np.ndarray | None = ..., min_count: int = ..., is_datetimelike: bool = ..., + skipna: bool = ..., # bint ) -> None: ... def group_prod( out: np.ndarray, # int64float_t[:, ::1] @@ -75,6 +77,7 @@ def group_prod( mask: np.ndarray | None, result_mask: np.ndarray | None = ..., min_count: int = ..., + skipna: bool = ..., # bint ) -> None: ... def group_var( out: np.ndarray, # floating[:, ::1] @@ -86,6 +89,7 @@ def group_var( mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., is_datetimelike: bool = ..., + skipna: bool = ..., # bint name: str = ..., ) -> None: ... def group_skew( @@ -104,6 +108,7 @@ def group_mean( labels: np.ndarray, # const intp_t[:] min_count: int = ..., # Py_ssize_t is_datetimelike: bool = ..., # bint + skipna: bool = ..., # bint mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., ) -> None: ... @@ -170,6 +175,7 @@ def group_max( labels: np.ndarray, # const int64_t[:] min_count: int = ..., is_datetimelike: bool = ..., + skipna: bool = ..., # bint mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., ) -> None: ... @@ -180,6 +186,7 @@ def group_min( labels: np.ndarray, # const int64_t[:] min_count: int = ..., is_datetimelike: bool = ..., + skipna: bool = ..., # bint mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., ) -> None: ... 
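For reference, a minimal sketch of the semantics the kernels and stubs above implement, using a hypothetical frame that is not part of the patch: with skipna=False, any group containing a missing value propagates NaN (or NaT) to that group's result, mirroring the existing Series reductions.

    import numpy as np
    import pandas as pd

    # hypothetical data, for illustration only
    df = pd.DataFrame({"key": ["a", "a", "b", "b"], "val": [1.0, np.nan, 2.0, 3.0]})
    df.groupby("key")["val"].sum()              # a -> 1.0, b -> 5.0  (NaN skipped, the default)
    df.groupby("key")["val"].sum(skipna=False)  # a -> NaN, b -> 5.0  (NaN propagates)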
diff --git a/pandas/core/_numba/kernels/mean_.py b/pandas/core/_numba/kernels/mean_.py index c1c07f057f0f2..1273de6e7cda7 100644 --- a/pandas/core/_numba/kernels/mean_.py +++ b/pandas/core/_numba/kernels/mean_.py @@ -71,6 +71,7 @@ def sliding_mean( start: np.ndarray, end: np.ndarray, min_periods: int, + skipna: bool = True, ) -> tuple[np.ndarray, list[int]]: N = len(start) nobs = 0 @@ -182,7 +183,7 @@ def grouped_mean( prev_value = prev_vals[lab] sum_x = output[lab] if nobs >= min_periods: - if num_consecutive_same_value >= nobs: + if (num_consecutive_same_value >= nobs): result = prev_value * nobs else: result = sum_x @@ -196,4 +197,4 @@ def grouped_mean( # Do list comprehension, since numba cannot figure out that na_pos is # empty list of ints on its own na_pos = [0 for i in range(0)] - return output, na_pos + return output, na_pos \ No newline at end of file diff --git a/pandas/core/_numba/kernels/min_max_.py b/pandas/core/_numba/kernels/min_max_.py index 9190685d6dd22..497bf3d4fdfa7 100644 --- a/pandas/core/_numba/kernels/min_max_.py +++ b/pandas/core/_numba/kernels/min_max_.py @@ -25,6 +25,7 @@ def sliding_min_max( start: np.ndarray, end: np.ndarray, min_periods: int, + skipna: bool, is_max: bool, ) -> tuple[np.ndarray, list[int]]: N = len(start) @@ -127,4 +128,4 @@ def grouped_min_max( if count < min_periods: na_pos.append(lab) - return output, na_pos + return output, na_pos \ No newline at end of file diff --git a/pandas/core/_numba/kernels/sum_.py b/pandas/core/_numba/kernels/sum_.py index f98749ee4cb4a..9a61d2f5e9676 100644 --- a/pandas/core/_numba/kernels/sum_.py +++ b/pandas/core/_numba/kernels/sum_.py @@ -69,6 +69,7 @@ def sliding_sum( start: np.ndarray, end: np.ndarray, min_periods: int, + skipna: bool=True, ) -> tuple[np.ndarray, list[int]]: dtype = values.dtype @@ -250,4 +251,4 @@ def grouped_sum( na_pos.append(lab) output[lab] = result - return output, na_pos + return output, na_pos \ No newline at end of file diff --git a/pandas/core/_numba/kernels/var_.py b/pandas/core/_numba/kernels/var_.py index 9e9c7c7e20c1c..81a70f42bd274 100644 --- a/pandas/core/_numba/kernels/var_.py +++ b/pandas/core/_numba/kernels/var_.py @@ -78,6 +78,7 @@ def sliding_var( start: np.ndarray, end: np.ndarray, min_periods: int, + skipna: bool = True, ddof: int = 1, ) -> tuple[np.ndarray, list[int]]: N = len(start) @@ -197,7 +198,7 @@ def grouped_var( if not skipna and np.isnan(val): output[lab] = val nobs_arr[lab] = 0 - + else: mean_x = means[lab] ssqdm_x = output[lab] @@ -249,4 +250,4 @@ def grouped_var( # Do list comprehension, since numba cannot figure out that na_pos is # empty list of ints on its own na_pos = [0 for i in range(0)] - return output, na_pos + return output, na_pos \ No newline at end of file diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 34613fc82b204..a7b7d9eb8fd7e 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -181,6 +181,9 @@ class providing the base-class of operations. The required number of valid values to perform the operation. If fewer than ``min_count`` non-NA values are present the result will be NA. +skipna : bool, default {sn} + Exclude NA/null values when computing the result. + Returns ------- Series or DataFrame @@ -207,6 +210,9 @@ class providing the base-class of operations. The required number of valid values to perform the operation. If fewer than ``min_count`` non-NA values are present the result will be NA. +skipna : bool, default {sn} + Exclude NA/null values when computing the result. 
+ engine : str, default None {e} * ``'cython'`` : Runs rolling apply through C-extensions from cython. * ``'numba'`` : Runs rolling apply through JIT compiled code from numba. @@ -2545,9 +2551,9 @@ def var( self, ddof: int = 1, engine: Literal["cython", "numba"] | None = None, - skipna: bool = True, engine_kwargs: dict[str, bool] | None = None, numeric_only: bool = False, + skipna: bool = True, ): """ Compute variance of groups, excluding missing values. @@ -2981,6 +2987,7 @@ def size(self) -> DataFrame | Series: fname="sum", no=False, mc=0, + sn=True, e=None, ek=None, example=dedent( @@ -3057,6 +3064,7 @@ def sum( fname="prod", no=False, mc=0, + sn=True, example=dedent( """\ For SeriesGroupBy: @@ -3109,6 +3117,7 @@ def prod( fname="min", no=False, mc=-1, + sn=True, e=None, ek=None, example=dedent( @@ -3180,6 +3189,7 @@ def min( fname="max", no=False, mc=-1, + sn=True, e=None, ek=None, example=dedent( diff --git a/pandas/core/resample.py b/pandas/core/resample.py index ccbe25fdae841..1bdae5eedd49d 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1194,8 +1194,8 @@ def last( @final @doc(GroupBy.median) - def median(self, numeric_only: bool = False): - return self._downsample("median", numeric_only=numeric_only) + def median(self, numeric_only: bool = False, skipna: bool = True,): + return self._downsample("median", numeric_only=numeric_only, skipna=skipna) @final def mean( @@ -1353,8 +1353,9 @@ def sem( self, ddof: int = 1, numeric_only: bool = False, + skipna: bool = True, ): - return self._downsample("sem", ddof=ddof, numeric_only=numeric_only) + return self._downsample("sem", ddof=ddof, numeric_only=numeric_only, skipna=skipna) @final @doc(GroupBy.ohlc) From 8f61fda34b9fa7ece51e4d26ba32b5c74e900526 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Correia?= Date: Mon, 27 May 2024 18:34:04 +0100 Subject: [PATCH 03/20] Fix: pre-commit --- pandas/_libs/groupby.pyi | 14 +++++++------- pandas/core/_numba/kernels/mean_.py | 4 ++-- pandas/core/_numba/kernels/min_max_.py | 2 +- pandas/core/_numba/kernels/sum_.py | 4 ++-- pandas/core/_numba/kernels/var_.py | 4 ++-- pandas/core/resample.py | 10 ++++++++-- 6 files changed, 22 insertions(+), 16 deletions(-) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 3d75bbf953137..222e527344cec 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -13,7 +13,7 @@ def group_median_float64( mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., is_datetimelike: bool = ..., # bint - skipna: bool = ..., # bint + skipna: bool = ..., # bint ) -> None: ... def group_cumprod( out: np.ndarray, # float64_t[:, ::1] @@ -67,7 +67,7 @@ def group_sum( result_mask: np.ndarray | None = ..., min_count: int = ..., is_datetimelike: bool = ..., - skipna: bool = ..., # bint + skipna: bool = ..., # bint ) -> None: ... def group_prod( out: np.ndarray, # int64float_t[:, ::1] @@ -77,7 +77,7 @@ def group_prod( mask: np.ndarray | None, result_mask: np.ndarray | None = ..., min_count: int = ..., - skipna: bool = ..., # bint + skipna: bool = ..., # bint ) -> None: ... def group_var( out: np.ndarray, # floating[:, ::1] @@ -89,7 +89,7 @@ def group_var( mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., is_datetimelike: bool = ..., - skipna: bool = ..., # bint + skipna: bool = ..., # bint name: str = ..., ) -> None: ... 
def group_skew( @@ -108,7 +108,7 @@ def group_mean( labels: np.ndarray, # const intp_t[:] min_count: int = ..., # Py_ssize_t is_datetimelike: bool = ..., # bint - skipna: bool = ..., # bint + skipna: bool = ..., # bint mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., ) -> None: ... @@ -175,7 +175,7 @@ def group_max( labels: np.ndarray, # const int64_t[:] min_count: int = ..., is_datetimelike: bool = ..., - skipna: bool = ..., # bint + skipna: bool = ..., # bint mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., ) -> None: ... @@ -186,7 +186,7 @@ def group_min( labels: np.ndarray, # const int64_t[:] min_count: int = ..., is_datetimelike: bool = ..., - skipna: bool = ..., # bint + skipna: bool = ..., # bint mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., ) -> None: ... diff --git a/pandas/core/_numba/kernels/mean_.py b/pandas/core/_numba/kernels/mean_.py index 1273de6e7cda7..0ede465acd7ec 100644 --- a/pandas/core/_numba/kernels/mean_.py +++ b/pandas/core/_numba/kernels/mean_.py @@ -183,7 +183,7 @@ def grouped_mean( prev_value = prev_vals[lab] sum_x = output[lab] if nobs >= min_periods: - if (num_consecutive_same_value >= nobs): + if num_consecutive_same_value >= nobs: result = prev_value * nobs else: result = sum_x @@ -197,4 +197,4 @@ def grouped_mean( # Do list comprehension, since numba cannot figure out that na_pos is # empty list of ints on its own na_pos = [0 for i in range(0)] - return output, na_pos \ No newline at end of file + return output, na_pos diff --git a/pandas/core/_numba/kernels/min_max_.py b/pandas/core/_numba/kernels/min_max_.py index 497bf3d4fdfa7..1f9ad9f58f120 100644 --- a/pandas/core/_numba/kernels/min_max_.py +++ b/pandas/core/_numba/kernels/min_max_.py @@ -128,4 +128,4 @@ def grouped_min_max( if count < min_periods: na_pos.append(lab) - return output, na_pos \ No newline at end of file + return output, na_pos diff --git a/pandas/core/_numba/kernels/sum_.py b/pandas/core/_numba/kernels/sum_.py index 9a61d2f5e9676..49a2d4ceb6487 100644 --- a/pandas/core/_numba/kernels/sum_.py +++ b/pandas/core/_numba/kernels/sum_.py @@ -69,7 +69,7 @@ def sliding_sum( start: np.ndarray, end: np.ndarray, min_periods: int, - skipna: bool=True, + skipna: bool = True, ) -> tuple[np.ndarray, list[int]]: dtype = values.dtype @@ -251,4 +251,4 @@ def grouped_sum( na_pos.append(lab) output[lab] = result - return output, na_pos \ No newline at end of file + return output, na_pos diff --git a/pandas/core/_numba/kernels/var_.py b/pandas/core/_numba/kernels/var_.py index 81a70f42bd274..d53803029e11b 100644 --- a/pandas/core/_numba/kernels/var_.py +++ b/pandas/core/_numba/kernels/var_.py @@ -198,7 +198,7 @@ def grouped_var( if not skipna and np.isnan(val): output[lab] = val nobs_arr[lab] = 0 - + else: mean_x = means[lab] ssqdm_x = output[lab] @@ -250,4 +250,4 @@ def grouped_var( # Do list comprehension, since numba cannot figure out that na_pos is # empty list of ints on its own na_pos = [0 for i in range(0)] - return output, na_pos \ No newline at end of file + return output, na_pos diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 1bdae5eedd49d..1ccdaf46d701a 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1194,7 +1194,11 @@ def last( @final @doc(GroupBy.median) - def median(self, numeric_only: bool = False, skipna: bool = True,): + def median( + self, + numeric_only: bool = False, + skipna: bool = True, + ): return self._downsample("median", numeric_only=numeric_only, skipna=skipna) @final @@ 
-1355,7 +1359,9 @@ def sem( numeric_only: bool = False, skipna: bool = True, ): - return self._downsample("sem", ddof=ddof, numeric_only=numeric_only, skipna=skipna) + return self._downsample( + "sem", ddof=ddof, numeric_only=numeric_only, skipna=skipna + ) @final @doc(GroupBy.ohlc) From 251869680cc816f330ba6846f4dcbb9997fd0c77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Correia?= Date: Sun, 2 Jun 2024 20:39:04 +0100 Subject: [PATCH 04/20] Reworked suggestions Co-authored-by: Tiago Firmino --- pandas/_libs/groupby.pyx | 10 +- pandas/core/_numba/executor.py | 2 +- pandas/core/_numba/kernels/min_max_.py | 2 +- pandas/core/_numba/kernels/var_.py | 2 +- pandas/tests/groupby/test_numba.py | 34 ------ pandas/tests/groupby/test_reductions.py | 142 +++++------------------- 6 files changed, 38 insertions(+), 154 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index d25f1e1b5f3d1..1f3aba82fdff4 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -746,7 +746,7 @@ def group_sum( else: sumx[lab, j] = val compensation[lab, j] = 0 - break + continue nobs[lab, j] += 1 @@ -831,7 +831,7 @@ def group_prod( elif not skipna: prodx[lab, j] = val nobs[lab, j] = 0 - break + continue _check_below_mincount( out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx @@ -900,7 +900,7 @@ def group_var( if not skipna and isna_entry: out[lab, j] = val nobs[lab, j] = 0 - break + continue elif not isna_entry: nobs[lab, j] += 1 @@ -1109,7 +1109,7 @@ def group_mean( if not skipna and isna_entry: sumx[lab, j] = nan_val nobs[lab, j] = 0 - break + continue elif not isna_entry: nobs[lab, j] += 1 @@ -1766,7 +1766,7 @@ cdef group_min_max( if not skipna and isna_entry: group_min_or_max[lab, j] = val nobs[lab, j] = 0 - break + continue elif not isna_entry: nobs[lab, j] += 1 diff --git a/pandas/core/_numba/executor.py b/pandas/core/_numba/executor.py index 032534ae22463..46f63e9ee4fda 100644 --- a/pandas/core/_numba/executor.py +++ b/pandas/core/_numba/executor.py @@ -81,8 +81,8 @@ def column_looper( labels, ngroups, min_periods, - *args, skipna, + *args, ) result[i] = output if len(na_pos) > 0: diff --git a/pandas/core/_numba/kernels/min_max_.py b/pandas/core/_numba/kernels/min_max_.py index 1f9ad9f58f120..fb54d0ee9525a 100644 --- a/pandas/core/_numba/kernels/min_max_.py +++ b/pandas/core/_numba/kernels/min_max_.py @@ -88,8 +88,8 @@ def grouped_min_max( labels: npt.NDArray[np.intp], ngroups: int, min_periods: int, + skipna: bool, is_max: bool, - skipna: bool = True, ) -> tuple[np.ndarray, list[int]]: N = len(labels) nobs = np.zeros(ngroups, dtype=np.int64) diff --git a/pandas/core/_numba/kernels/var_.py b/pandas/core/_numba/kernels/var_.py index d53803029e11b..e4ebe26f75ed5 100644 --- a/pandas/core/_numba/kernels/var_.py +++ b/pandas/core/_numba/kernels/var_.py @@ -176,8 +176,8 @@ def grouped_var( labels: npt.NDArray[np.intp], ngroups: int, min_periods: int, - ddof: int = 1, skipna: bool = True, + ddof: int = 1, ) -> tuple[np.ndarray, list[int]]: N = len(labels) diff --git a/pandas/tests/groupby/test_numba.py b/pandas/tests/groupby/test_numba.py index 1699946b6cc1f..bf00436962cac 100644 --- a/pandas/tests/groupby/test_numba.py +++ b/pandas/tests/groupby/test_numba.py @@ -81,19 +81,10 @@ def test_skipna_numba(self, numba_method): df = DataFrame( { "l": ["A", "A", "A", "B", "B", "B"], - "int": [-1, 1, -1, 1, 1, np.nan], "float": [-1.0, 1.2, -1.1, 1.5, 1.0, np.nan], } ) - result_numba = getattr(df.groupby("l").int, numba_method)( - skipna=False, engine="numba" - ) - 
expected = df.groupby("l").int.apply( - lambda x: getattr(x, numba_method)(skipna=False) - ) - tm.assert_series_equal(result_numba, expected, check_exact=False) - result_numba = getattr(df.groupby("l").float, numba_method)( skipna=False, engine="numba" ) @@ -101,28 +92,3 @@ def test_skipna_numba(self, numba_method): lambda x: getattr(x, numba_method)(skipna=False) ) tm.assert_series_equal(result_numba, expected, check_exact=False) - - @pytest.mark.parametrize( - "numba_method", ["sum", "min", "max", "std", "var", "mean"] - ) - def test_skipna_consistency_numba(self, numba_method): - # GH15675 - df = DataFrame( - { - "l": ["A", "A", "A", "B", "B", "B"], - "int": [-1, 1, -1, 1, 1, np.nan], - "float": [-1.0, 1.2, -1.1, 1.5, 1.0, np.nan], - } - ) - - result_with_arg = getattr(df.groupby("l").int, numba_method)( - skipna=True, engine="numba" - ) - result_default = getattr(df.groupby("l").int, numba_method)(engine="numba") - tm.assert_series_equal(result_with_arg, result_default, check_exact=False) - - result_with_arg = getattr(df.groupby("l").float, numba_method)( - skipna=True, engine="numba" - ) - result_default = getattr(df.groupby("l").float, numba_method)(engine="numba") - tm.assert_series_equal(result_with_arg, result_default, check_exact=False) diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index a25170aa4c771..7501843ae8fca 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -1048,17 +1048,12 @@ def scipy_sem(*args, **kwargs): @pytest.mark.parametrize( - "reduction_method", - ["sum", "min", "max", "mean", "median", "prod", "sem", "std", "var"], -) -def test_skipna_reduction_ops_cython(reduction_method): - # GH15675 - # Testing the skipna parameter against possible datatypes - df = DataFrame( + "data", + [ { "l": ["A", "A", "A", "A", "B", "B", "B", "B"], - "int": [-1, 1, -1, 2, 1, 1, 1, np.nan], - "float": [-1.0, 1.2, -1.1, 1.5, -1.1, 1.5, np.nan, 1.0], + "f": [-1.0, 1.2, -1.1, 1.5, -1.1, 1.5, np.nan, 1.0], + "s": ["foo", "bar", "baz", "foo", "foo", "foo", pd.NA, "foo"], "t": [ Timestamp("2024-01-01"), Timestamp("2024-01-02"), @@ -1080,85 +1075,36 @@ def test_skipna_reduction_ops_cython(reduction_method): pd.NaT, ], } - ) - - result_cython = getattr(df.groupby("l").int, reduction_method)(skipna=False) - expected = df.groupby("l").int.apply( - lambda x: getattr(x, reduction_method)(skipna=False) - ) - tm.assert_series_equal(result_cython, expected, check_exact=False) - - result_cython = getattr(df.groupby("l").float, reduction_method)(skipna=False) - expected = df.groupby("l").float.apply( - lambda x: getattr(x, reduction_method)(skipna=False) - ) - tm.assert_series_equal(result_cython, expected, check_exact=False) - - if reduction_method in ["min", "max", "mean", "median", "std"]: - result_ts = getattr(df.groupby("l").t, reduction_method)(skipna=False) - expected_ts = df.groupby("l").t.apply( - lambda x: getattr(x, reduction_method)(skipna=False) - ) - tm.assert_series_equal(result_ts, expected_ts, check_exact=False) - - result_td = getattr(df.groupby("l").td, reduction_method)(skipna=False) - expected_td = df.groupby("l").td.apply( - lambda x: getattr(x, reduction_method)(skipna=False) - ) - tm.assert_series_equal(result_td, expected_td, check_exact=False) - - + ], +) @pytest.mark.parametrize( - "reduction_method", - ["sum", "min", "max", "mean", "median", "prod", "sem", "std", "var"], + "reduction_method,columns", + [ + ("sum", ["f", "s"]), + ("min", ["f", "t", "td"]), + ("max", ["f", "t", 
"td"]), + ("mean", ["f", "t", "td"]), + ("median", ["f", "t", "td"]), + ("prod", ["f"]), + ("sem", ["f"]), + ("std", ["f", "t", "td"]), + ("var", ["f"]), + ("any", ["f"]), + ("all", ["f"]), + ("skew", ["f"]), + ], ) -def test_skipna_reduction_ops_consistency(reduction_method): +def test_skipna_reduction_ops_cython(reduction_method, columns, data): # GH15675 - # Testing if provinding skipna=True maintains the default functionality - df = DataFrame( - { - "l": ["A", "A", "A", "A", "B", "B", "B", "B"], - "int": [-1, 1, -1, 2, 1, 1, 1, np.nan], - "float": [-1.0, 1.2, -1.1, 1.5, -1.1, 1.5, np.nan, 1.0], - "t": [ - Timestamp("2024-01-01"), - Timestamp("2024-01-02"), - Timestamp("2024-01-03"), - Timestamp("2024-01-04"), - Timestamp("2024-01-05"), - Timestamp("2024-01-06"), - pd.NaT, - Timestamp("2024-01-07"), - ], - "td": [ - pd.Timedelta(days=1), - pd.Timedelta(days=2), - pd.Timedelta(days=3), - pd.Timedelta(days=4), - pd.Timedelta(days=5), - pd.Timedelta(days=6), - pd.NaT, - pd.Timedelta(days=7), - ], - } - ) - - result_with_arg = getattr(df.groupby("l").int, reduction_method)(skipna=True) - result_default = getattr(df.groupby("l").int, reduction_method)() - tm.assert_series_equal(result_with_arg, result_default, check_exact=False) - - result_with_arg = getattr(df.groupby("l").float, reduction_method)(skipna=True) - result_default = getattr(df.groupby("l").float, reduction_method)() - tm.assert_series_equal(result_with_arg, result_default, check_exact=False) - - if reduction_method in ["min", "max", "mean", "median", "std"]: - result_ts_with_arg = getattr(df.groupby("l").t, reduction_method)(skipna=True) - result_ts_default = getattr(df.groupby("l").t, reduction_method)() - tm.assert_series_equal(result_ts_with_arg, result_ts_default, check_exact=False) + # Testing the skipna parameter against possible datatypes + df = DataFrame(data) - result_td_with_arg = getattr(df.groupby("l").td, reduction_method)(skipna=True) - result_td_default = getattr(df.groupby("l").td, reduction_method)() - tm.assert_series_equal(result_td_with_arg, result_td_default, check_exact=False) + for column in columns: + result_cython = getattr(df.groupby("l")[column], reduction_method)(skipna=False) + expected = df.groupby("l")[column].apply( + lambda x: getattr(x, reduction_method)(skipna=False) + ) + tm.assert_series_equal(result_cython, expected, check_exact=False) @pytest.mark.parametrize( @@ -1306,31 +1252,3 @@ def test_groupby_std_datetimelike(): exp_ser = Series([td1 * 2, td1, td1, td1, td4], index=np.arange(5)) expected = DataFrame({"A": exp_ser, "B": exp_ser, "C": exp_ser}) tm.assert_frame_equal(result, expected) - - -def test_skipna_string_sum(): - # GH15675 - df = DataFrame( - { - "l": ["A", "A", "A", "B", "B", "B"], - "v": ["foo", "bar", "baz", "foo", pd.NA, "foo"], - } - ) - - result_cython = df.groupby("l").v.sum(skipna=False) - expected = df.groupby("l").v.apply(lambda x: x.sum(skipna=False)) - tm.assert_series_equal(result_cython, expected, check_exact=False) - - -def test_skipna_string_sum_consistency(): - # GH15675 - df = DataFrame( - { - "l": ["A", "A", "A", "B", "B", "B"], - "v": ["foo", "bar", "baz", "foo", pd.NA, "foo"], - } - ) - - result_cython = df.groupby("l").v.sum(skipna=True) - expected = df.groupby("l").v.sum() - tm.assert_series_equal(result_cython, expected, check_exact=False) From 5cd994c73034231be188e41df20fe42ddec9f375 Mon Sep 17 00:00:00 2001 From: Tiago Firmino Date: Sun, 2 Jun 2024 20:41:38 +0100 Subject: [PATCH 05/20] Reworked documentation MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: André Correia --- pandas/core/resample.py | 129 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 129 insertions(+) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 1ccdaf46d701a..e688b800a4030 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1201,6 +1201,97 @@ def median( ): return self._downsample("median", numeric_only=numeric_only, skipna=skipna) + @final + @doc(GroupBy.mean) + def mean( + self, + numeric_only: bool = False, + skipna: bool = True, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + return self._downsample( + "mean", + numeric_only=numeric_only, + skipna=skipna, + engine=engine, + engine_kwargs=engine_kwargs, + ) + + @final + @doc(GroupBy.min) + def min( + self, + numeric_only: bool = False, + min_count: int = -1, + skipna: bool = True, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + return self._downsample( + "min", + numeric_only=numeric_only, + min_count=min_count, + skipna=skipna, + engine=engine, + engine_kwargs=engine_kwargs, + ) + + @final + @doc(GroupBy.max) + def max( + self, + numeric_only: bool = False, + min_count: int = -1, + skipna: bool = True, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + return self._downsample( + "max", + numeric_only=numeric_only, + min_count=min_count, + skipna=skipna, + engine=engine, + engine_kwargs=engine_kwargs, + ) + + @final + @doc(GroupBy.sum) + def sum( + self, + numeric_only: bool = False, + min_count: int = 0, + skipna: bool = True, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + ): + return self._downsample( + "sum", + numeric_only=numeric_only, + min_count=min_count, + skipna=skipna, + engine=engine, + engine_kwargs=engine_kwargs, + ) + + @final + @doc(GroupBy.prod) + def prod(self, numeric_only: bool = False, min_count: int = 0, skipna: bool = True): + return self._downsample( + "prod", numeric_only=numeric_only, min_count=min_count, skipna=skipna + ) + + @final + @doc(GroupBy.any) + def any(self, skipna: bool = True): + return self._downsample("any", skipna=skipna) + + @final + @doc(GroupBy.all) + def all(self, skipna: bool = True): + return self._downsample("all", skipna=skipna) + @final def mean( self, @@ -1351,6 +1442,44 @@ def var( """ return self._downsample("var", ddof=ddof, numeric_only=numeric_only) + @final + @doc(GroupBy.std) + def std( + self, + ddof: int = 1, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + numeric_only: bool = False, + skipna: bool = True, + ): + return self._downsample( + "std", + ddof=ddof, + engine=engine, + engine_kwargs=engine_kwargs, + numeric_only=numeric_only, + skipna=skipna, + ) + + @final + @doc(GroupBy.var) + def var( + self, + ddof: int = 1, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, + numeric_only: bool = False, + skipna: bool = True, + ): + return self._downsample( + "var", + ddof=ddof, + engine=engine, + engine_kwargs=engine_kwargs, + numeric_only=numeric_only, + skipna=skipna, + ) + @final @doc(GroupBy.sem) def sem( From 8ae0caf1dd69adb7ccb19149cbe65e2d93a24052 Mon Sep 17 00:00:00 2001 From: Tiago Firmino Date: Sun, 2 Jun 2024 21:01:29 +0100 Subject: [PATCH 06/20] FIX: resample redefinition MIME-Version: 1.0 Content-Type: 
text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: André Correia --- pandas/core/resample.py | 195 +++++++++++++++------------------------- 1 file changed, 70 insertions(+), 125 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index e688b800a4030..ddd3e6f319c26 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1005,10 +1005,14 @@ def asfreq(self, fill_value=None): return self._upsample("asfreq", fill_value=fill_value) @final + @doc(GroupBy.sum) def sum( self, numeric_only: bool = False, min_count: int = 0, + skipna: bool = True, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, ): """ Compute sum of group values. @@ -1050,14 +1054,18 @@ def sum( 2023-02-01 7 Freq: MS, dtype: int64 """ - return self._downsample("sum", numeric_only=numeric_only, min_count=min_count) + return self._downsample( + "sum", + numeric_only=numeric_only, + min_count=min_count, + skipna=skipna, + engine=engine, + engine_kwargs=engine_kwargs, + ) @final - def prod( - self, - numeric_only: bool = False, - min_count: int = 0, - ): + @doc(GroupBy.prod) + def prod(self, numeric_only: bool = False, min_count: int = 0, skipna: bool = True): """ Compute prod of group values. @@ -1098,13 +1106,19 @@ def prod( 2023-02-01 12 Freq: MS, dtype: int64 """ - return self._downsample("prod", numeric_only=numeric_only, min_count=min_count) + return self._downsample( + "prod", numeric_only=numeric_only, min_count=min_count, skipna=skipna + ) @final + @doc(GroupBy.min) def min( self, numeric_only: bool = False, - min_count: int = 0, + min_count: int = -1, + skipna: bool = True, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, ): """ Compute min value of group. @@ -1132,13 +1146,24 @@ def min( 2023-02-01 3 Freq: MS, dtype: int64 """ - return self._downsample("min", numeric_only=numeric_only, min_count=min_count) + return self._downsample( + "min", + numeric_only=numeric_only, + min_count=min_count, + skipna=skipna, + engine=engine, + engine_kwargs=engine_kwargs, + ) @final + @doc(GroupBy.max) def max( self, numeric_only: bool = False, - min_count: int = 0, + min_count: int = -1, + skipna: bool = True, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, ): """ Compute max value of group. 
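Every signature consolidated in this commit follows the same shape: widen the keyword list, then forward everything to ``_downsample``. Not part of the patch, just a minimal sketch of the end-user behavior the new ``skipna`` keyword is meant to expose once the series applies (values and frequency invented for illustration):

    import numpy as np
    import pandas as pd

    idx = pd.date_range("2023-01-01", periods=6, freq="D")
    ser = pd.Series([1.0, 2.0, np.nan, 4.0, 5.0, 6.0], index=idx)

    # Unchanged default: NA values are dropped inside each bin.
    ser.resample("3D").sum()                # 3.0, then 15.0

    # New keyword: a single NaN poisons its whole bin.
    ser.resample("3D").sum(skipna=False)    # NaN, then 15.0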
@@ -1166,7 +1191,14 @@ def max( 2023-02-01 4 Freq: MS, dtype: int64 """ - return self._downsample("max", numeric_only=numeric_only, min_count=min_count) + return self._downsample( + "max", + numeric_only=numeric_only, + min_count=min_count, + skipna=skipna, + engine=engine, + engine_kwargs=engine_kwargs, + ) @final @doc(GroupBy.first) @@ -1201,87 +1233,6 @@ def median( ): return self._downsample("median", numeric_only=numeric_only, skipna=skipna) - @final - @doc(GroupBy.mean) - def mean( - self, - numeric_only: bool = False, - skipna: bool = True, - engine: Literal["cython", "numba"] | None = None, - engine_kwargs: dict[str, bool] | None = None, - ): - return self._downsample( - "mean", - numeric_only=numeric_only, - skipna=skipna, - engine=engine, - engine_kwargs=engine_kwargs, - ) - - @final - @doc(GroupBy.min) - def min( - self, - numeric_only: bool = False, - min_count: int = -1, - skipna: bool = True, - engine: Literal["cython", "numba"] | None = None, - engine_kwargs: dict[str, bool] | None = None, - ): - return self._downsample( - "min", - numeric_only=numeric_only, - min_count=min_count, - skipna=skipna, - engine=engine, - engine_kwargs=engine_kwargs, - ) - - @final - @doc(GroupBy.max) - def max( - self, - numeric_only: bool = False, - min_count: int = -1, - skipna: bool = True, - engine: Literal["cython", "numba"] | None = None, - engine_kwargs: dict[str, bool] | None = None, - ): - return self._downsample( - "max", - numeric_only=numeric_only, - min_count=min_count, - skipna=skipna, - engine=engine, - engine_kwargs=engine_kwargs, - ) - - @final - @doc(GroupBy.sum) - def sum( - self, - numeric_only: bool = False, - min_count: int = 0, - skipna: bool = True, - engine: Literal["cython", "numba"] | None = None, - engine_kwargs: dict[str, bool] | None = None, - ): - return self._downsample( - "sum", - numeric_only=numeric_only, - min_count=min_count, - skipna=skipna, - engine=engine, - engine_kwargs=engine_kwargs, - ) - - @final - @doc(GroupBy.prod) - def prod(self, numeric_only: bool = False, min_count: int = 0, skipna: bool = True): - return self._downsample( - "prod", numeric_only=numeric_only, min_count=min_count, skipna=skipna - ) - @final @doc(GroupBy.any) def any(self, skipna: bool = True): @@ -1293,9 +1244,13 @@ def all(self, skipna: bool = True): return self._downsample("all", skipna=skipna) @final + @doc(GroupBy.mean) def mean( self, numeric_only: bool = False, + skipna: bool = True, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, ): """ Compute mean of groups, excluding missing values. @@ -1334,13 +1289,23 @@ def mean( 2023-02-01 3.5 Freq: MS, dtype: float64 """ - return self._downsample("mean", numeric_only=numeric_only) + return self._downsample( + "mean", + numeric_only=numeric_only, + skipna=skipna, + engine=engine, + engine_kwargs=engine_kwargs, + ) @final + @doc(GroupBy.std) def std( self, ddof: int = 1, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, numeric_only: bool = False, + skipna: bool = True, ): """ Compute standard deviation of groups, excluding missing values. 
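Each override above is a thin wrapper; the only working part is ``_downsample``. A schematic model of that dispatch, assuming nothing beyond what the hunks show (``_ResamplerSketch`` is illustrative, not the real class):

    class _ResamplerSketch:
        def __init__(self, grouped):
            self._grouped = grouped  # a GroupBy over the resample bins

        def _downsample(self, how, **kwargs):
            # One funnel for every reduction: supporting skipna here is pure
            # keyword plumbing; the aggregation logic lives in groupby.
            return getattr(self._grouped, how)(**kwargs)

        def std(self, ddof=1, numeric_only=False, skipna=True):
            return self._downsample(
                "std", ddof=ddof, numeric_only=numeric_only, skipna=skipna
            )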
@@ -1384,13 +1349,24 @@ def std( 2023-02-01 2.645751 Freq: MS, dtype: float64 """ - return self._downsample("std", ddof=ddof, numeric_only=numeric_only) + return self._downsample( + "std", + ddof=ddof, + engine=engine, + engine_kwargs=engine_kwargs, + numeric_only=numeric_only, + skipna=skipna, + ) @final + @doc(GroupBy.var) def var( self, ddof: int = 1, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, numeric_only: bool = False, + skipna: bool = True, ): """ Compute variance of groups, excluding missing values. @@ -1440,37 +1416,6 @@ def var( 2023-02-01 4.666667 Freq: MS, dtype: float64 """ - return self._downsample("var", ddof=ddof, numeric_only=numeric_only) - - @final - @doc(GroupBy.std) - def std( - self, - ddof: int = 1, - engine: Literal["cython", "numba"] | None = None, - engine_kwargs: dict[str, bool] | None = None, - numeric_only: bool = False, - skipna: bool = True, - ): - return self._downsample( - "std", - ddof=ddof, - engine=engine, - engine_kwargs=engine_kwargs, - numeric_only=numeric_only, - skipna=skipna, - ) - - @final - @doc(GroupBy.var) - def var( - self, - ddof: int = 1, - engine: Literal["cython", "numba"] | None = None, - engine_kwargs: dict[str, bool] | None = None, - numeric_only: bool = False, - skipna: bool = True, - ): return self._downsample( "var", ddof=ddof, From 5e3a9650f0e109408a829bbf92e2d9d2d409ecab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Correia?= Date: Sun, 2 Jun 2024 23:15:43 +0100 Subject: [PATCH 07/20] FIX: Small tweaks in docs Co-authored-by: Tiago Firmino --- pandas/core/resample.py | 99 ++++++++++++++----- pandas/tests/groupby/test_reductions.py | 2 +- 2 files changed, 83 insertions(+), 18 deletions(-) diff --git a/pandas/core/resample.py b/pandas/core/resample.py index ddd3e6f319c26..edf23bc0f55a2 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1005,7 +1005,6 @@ def asfreq(self, fill_value=None): return self._upsample("asfreq", fill_value=fill_value) @final - @doc(GroupBy.sum) def sum( self, numeric_only: bool = False, @@ -1030,6 +1029,24 @@ def sum( The required number of valid values to perform the operation. If fewer than ``min_count`` non-NA values are present the result will be NA. + skipna : bool, default True + Exclude NA/null values when computing the result. + + engine : str, default None + * ``'cython'`` : Runs the operation through C-extensions from cython. + * ``'numba'`` : Runs the operation through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` + or globally setting ``compute.use_numba`` + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be + applied to both the ``func`` and the ``apply`` groupby aggregation. + Returns ------- Series or DataFrame @@ -1064,7 +1081,6 @@ ) @final - @doc(GroupBy.prod) def prod(self, numeric_only: bool = False, min_count: int = 0, skipna: bool = True): """ Compute prod of group values. @@ -1082,6 +1098,9 @@ def prod(self, numeric_only: bool = False, min_count: int = 0, skipna: bool = Tr The required number of valid values to perform the operation.
If fewer than ``min_count`` non-NA values are present the result will be NA. + skipna : bool, default True + Exclude NA/null values when computing the result. + Returns ------- Series or DataFrame @@ -1111,7 +1130,6 @@ def prod(self, numeric_only: bool = False, min_count: int = 0, skipna: bool = Tr ) @final - @doc(GroupBy.min) def min( self, numeric_only: bool = False, @@ -1156,7 +1174,6 @@ def min( ) @final - @doc(GroupBy.max) def max( self, numeric_only: bool = False, @@ -1234,17 +1251,6 @@ def median( return self._downsample("median", numeric_only=numeric_only, skipna=skipna) @final - @doc(GroupBy.any) - def any(self, skipna: bool = True): - return self._downsample("any", skipna=skipna) - - @final - @doc(GroupBy.all) - def all(self, skipna: bool = True): - return self._downsample("all", skipna=skipna) - - @final - @doc(GroupBy.mean) def mean( self, numeric_only: bool = False, @@ -1264,6 +1270,26 @@ def mean( numeric_only now defaults to ``False``. + skipna : bool, default True + Exclude NA/null values when computing the result. + + engine : str, default None + * ``'cython'`` : Runs the operation through C-extensions from cython. + * ``'numba'`` : Runs the operation through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + .. versionadded:: 1.4.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` + + .. versionadded:: 1.4.0 + Returns ------- DataFrame or Series @@ -1298,7 +1324,6 @@ def mean( ) @final - @doc(GroupBy.std) def std( self, ddof: int = 1, @@ -1314,6 +1339,24 @@ def std( ---------- ddof : int, default 1 Degrees of freedom. + + engine : str, default None + * ``'cython'`` : Runs the operation through C-extensions from cython. + * ``'numba'`` : Runs the operation through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + .. versionadded:: 1.4.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` + + .. versionadded:: 1.4.0 + numeric_only : bool, default False Include only `float`, `int` or `boolean` data. @@ -1323,6 +1366,9 @@ def std( numeric_only now defaults to ``False``. + skipna : bool, default True + Exclude NA/null values when computing the result. + Returns ------- DataFrame or Series @@ -1359,7 +1405,6 @@ def std( ) @final - @doc(GroupBy.var) def var( self, ddof: int = 1, @@ -1376,6 +1421,23 @@ def var( ddof : int, default 1 Degrees of freedom. + engine : str, default None + * ``'cython'`` : Runs the operation through C-extensions from cython. + * ``'numba'`` : Runs the operation through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + .. 
versionadded:: 1.4.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` + + .. versionadded:: 1.4.0 + numeric_only : bool, default False Include only `float`, `int` or `boolean` data. @@ -1385,6 +1447,9 @@ def var( numeric_only now defaults to ``False``. + skipna : bool, default True + Exclude NA/null values when computing the result. + Returns ------- DataFrame or Series diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 7501843ae8fca..b9200c627cf67 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -1087,7 +1087,7 @@ def scipy_sem(*args, **kwargs): ("median", ["f", "t", "td"]), ("prod", ["f"]), ("sem", ["f"]), - ("std", ["f", "t", "td"]), + ("std", ["f"]), ("var", ["f"]), ("any", ["f"]), ("all", ["f"]), From c69207623f16a4d7dcbf9a8fd44c7f3624cf296d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Correia?= Date: Sun, 9 Jun 2024 19:10:27 +0100 Subject: [PATCH 08/20] Refactored test parameterization Co-authored-by: Tiago Firmino --- pandas/tests/groupby/test_reductions.py | 133 +++++++++++++++++------- 1 file changed, 96 insertions(+), 37 deletions(-) diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index b9200c627cf67..12a9b6025c524 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -1048,63 +1048,122 @@ def scipy_sem(*args, **kwargs): @pytest.mark.parametrize( - "data", + "reduction_method, values", [ - { - "l": ["A", "A", "A", "A", "B", "B", "B", "B"], - "f": [-1.0, 1.2, -1.1, 1.5, -1.1, 1.5, np.nan, 1.0], - "s": ["foo", "bar", "baz", "foo", "foo", "foo", pd.NA, "foo"], - "t": [ + ("sum", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), + ("sum", ["foo", "bar", "baz", "foo", pd.NA, "foo"]), + ("min", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), + ( + "min", + [ Timestamp("2024-01-01"), Timestamp("2024-01-02"), Timestamp("2024-01-03"), - Timestamp("2024-01-04"), - Timestamp("2024-01-06"), Timestamp("2024-01-07"), Timestamp("2024-01-08"), pd.NaT, ], - "td": [ + ), + ( + "min", + [ pd.Timedelta(days=1), pd.Timedelta(days=2), pd.Timedelta(days=3), - pd.Timedelta(days=4), - pd.Timedelta(days=6), pd.Timedelta(days=7), pd.Timedelta(days=8), pd.NaT, ], - } - ], -) -@pytest.mark.parametrize( - "reduction_method,columns", - [ - ("sum", ["f", "s"]), - ("min", ["f", "t", "td"]), - ("max", ["f", "t", "td"]), - ("mean", ["f", "t", "td"]), - ("median", ["f", "t", "td"]), - ("prod", ["f"]), - ("sem", ["f"]), - ("std", ["f"]), - ("var", ["f"]), - ("any", ["f"]), - ("all", ["f"]), - ("skew", ["f"]), + ), + ("max", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), + ( + "max", + [ + Timestamp("2024-01-01"), + Timestamp("2024-01-02"), + Timestamp("2024-01-03"), + Timestamp("2024-01-07"), + Timestamp("2024-01-08"), + pd.NaT, + ], + ), + ( + "max", + [ + pd.Timedelta(days=1), + pd.Timedelta(days=2), + pd.Timedelta(days=3), + pd.Timedelta(days=7), + pd.Timedelta(days=8), + pd.NaT, + ], + ), + ("mean", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), + ( + "mean", + [ + Timestamp("2024-01-01"), + Timestamp("2024-01-02"), + Timestamp("2024-01-03"), + Timestamp("2024-01-07"), + Timestamp("2024-01-08"), + pd.NaT, + ], + 
), + ( + "mean", + [ + pd.Timedelta(days=1), + pd.Timedelta(days=2), + pd.Timedelta(days=3), + pd.Timedelta(days=7), + pd.Timedelta(days=8), + pd.NaT, + ], + ), + ("median", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), + ( + "median", + [ + Timestamp("2024-01-01"), + Timestamp("2024-01-02"), + Timestamp("2024-01-03"), + Timestamp("2024-01-07"), + Timestamp("2024-01-08"), + pd.NaT, + ], + ), + ( + "median", + [ + pd.Timedelta(days=1), + pd.Timedelta(days=2), + pd.Timedelta(days=3), + pd.Timedelta(days=7), + pd.Timedelta(days=8), + pd.NaT, + ], + ), + ("prod", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), + ("sem", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), + ("std", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), + ("var", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), + ("any", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), + ("all", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), + ("skew", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), ], ) -def test_skipna_reduction_ops_cython(reduction_method, columns, data): +def test_skipna_reduction_ops_cython(reduction_method, values): # GH15675 # Testing the skipna parameter against possible datatypes - df = DataFrame(data) + df = DataFrame({"key": [1, 1, 1, 2, 2, 2], "values": values}) + gb = df.groupby("key") - for column in columns: - result_cython = getattr(df.groupby("l")[column], reduction_method)(skipna=False) - expected = df.groupby("l")[column].apply( - lambda x: getattr(x, reduction_method)(skipna=False) - ) - tm.assert_series_equal(result_cython, expected, check_exact=False) + result_cython = getattr(gb, reduction_method)(skipna=False) + expected = gb.apply( + lambda x: getattr(x, reduction_method)(skipna=False), include_groups=False + ) + tm.assert_frame_equal(result_cython, expected, check_exact=False) @pytest.mark.parametrize( From e87e0303954d61d47ba4990827f9937e772ae8a5 Mon Sep 17 00:00:00 2001 From: Tiago Firmino Date: Tue, 11 Jun 2024 02:10:39 +0100 Subject: [PATCH 09/20] Added tests for EAs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: André Correia --- pandas/_libs/groupby.pyx | 5 +- pandas/core/arrays/arrow/array.py | 4 +- pandas/core/groupby/ops.py | 9 +- pandas/tests/groupby/test_reductions.py | 113 +++++++++++++++++++++++- 4 files changed, 126 insertions(+), 5 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 1f3aba82fdff4..337ce9270547c 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -735,6 +735,9 @@ def group_sum( for j in range(K): val = values[i, j] + if _treat_as_na(sumx[lab, j], is_datetimelike): + continue + if uses_mask: isna_entry = mask[i, j] else: @@ -1107,7 +1110,7 @@ def group_mean( isna_entry = _treat_as_na(val, is_datetimelike) if not skipna and isna_entry: - sumx[lab, j] = nan_val + sumx[lab, j] = val nobs[lab, j] = 0 continue diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 3d55513ab914c..75d801eeed065 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2215,7 +2215,9 @@ def _replace_with_mask( def _to_masked(self): pa_dtype = self._pa_array.type - if pa.types.is_floating(pa_dtype) or pa.types.is_integer(pa_dtype): + if pa.types.is_floating(pa_dtype): + na_value = np.nan + elif pa.types.is_integer(pa_dtype): na_value = 1 elif pa.types.is_boolean(pa_dtype): na_value = True diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 4f40c4f4283f0..d683cddc9c357 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -392,6 +392,13 @@ def 
_call_cython_op( values[mask] = True values = values.astype(bool, copy=False).view(np.int8) is_numeric = True + elif ( + self.how in ["median", "sem", "std", "var"] + and "skipna" in kwargs + and not kwargs["skipna"] + ): + # if skipna=False we don't want to use masks created for Nullable dtypes + mask = None values = values.T if mask is not None: @@ -1257,4 +1264,4 @@ def _get_splitter( # i.e. DataFrame klass = FrameSplitter - return klass(data, labels, ngroups, sort_idx=sort_idx, sorted_ids=sorted_ids) + return klass(data, labels, ngroups, sort_idx=sort_idx, sorted_ids=sorted_ids) \ No newline at end of file diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 12a9b6025c524..3cc26345188b0 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -3,6 +3,7 @@ from string import ascii_lowercase import numpy as np +import pyarrow as pa import pytest from pandas._libs.tslibs import iNaT @@ -1052,7 +1053,31 @@ def scipy_sem(*args, **kwargs): [ ("sum", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), ("sum", ["foo", "bar", "baz", "foo", pd.NA, "foo"]), + ( + "sum", + Series(pd.array([-1.0, 1.2, -1.1, 1.5, np.nan, 1.0], dtype="Float64")), + ), + ( + "sum", + Series( + pd.array( + [1.0, 2.0, 3.0, np.nan, 4.0, 5.0], dtype=pd.ArrowDtype(pa.float64()) + ) + ), + ), ("min", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), + ( + "min", + Series(pd.array([-1.0, 1.2, -1.1, 1.5, np.nan, 1.0], dtype="Float64")), + ), + ( + "min", + Series( + pd.array( + [1.0, 2.0, 3.0, np.nan, 4.0, 5.0], dtype=pd.ArrowDtype(pa.float64()) + ) + ), + ), ( "min", [ @@ -1076,6 +1101,18 @@ def scipy_sem(*args, **kwargs): ], ), ("max", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), + ( + "max", + Series(pd.array([-1.0, 1.2, -1.1, 1.5, np.nan, 1.0], dtype="Float64")), + ), + ( + "max", + Series( + pd.array( + [1.0, 2.0, 3.0, np.nan, 4.0, 5.0], dtype=pd.ArrowDtype(pa.float64()) + ) + ), + ), ( "max", [ @@ -1099,6 +1136,18 @@ def scipy_sem(*args, **kwargs): ], ), ("mean", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), + ( + "mean", + Series(pd.array([-1.0, 1.2, -1.1, 1.5, np.nan, 1.0], dtype="Float64")), + ), + ( + "mean", + Series( + pd.array( + [1.0, 2.0, 3.0, np.nan, 4.0, 5.0], dtype=pd.ArrowDtype(pa.float64()) + ) + ), + ), ( "mean", [ @@ -1122,6 +1171,18 @@ def scipy_sem(*args, **kwargs): ], ), ("median", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), + ( + "median", + Series(pd.array([-1.0, 1.2, -1.1, 1.5, np.nan, 1.0], dtype="Float64")), + ), + ( + "median", + Series( + pd.array( + [1.0, 2.0, 3.0, np.nan, 4.0, 5.0], dtype=pd.ArrowDtype(pa.float64()) + ) + ), + ), ( "median", [ @@ -1145,9 +1206,57 @@ def scipy_sem(*args, **kwargs): ], ), ("prod", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), + ( + "prod", + Series(pd.array([-1.0, 1.2, -1.1, 1.5, np.nan, 1.0], dtype="Float64")), + ), + ( + "prod", + Series( + pd.array( + [1.0, 2.0, 3.0, np.nan, 4.0, 5.0], dtype=pd.ArrowDtype(pa.float64()) + ) + ), + ), ("sem", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), + ( + "sem", + Series(pd.array([-1.0, 1.2, -1.1, 1.5, np.nan, 1.0], dtype="Float64")), + ), + ( + "sem", + Series( + pd.array( + [1.0, 2.0, 3.0, np.nan, 4.0, 5.0], dtype=pd.ArrowDtype(pa.float64()) + ) + ), + ), ("std", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), + ( + "std", + Series(pd.array([-1.0, 1.2, -1.1, 1.5, np.nan, 1.0], dtype="Float64")), + ), + ( + "std", + Series( + pd.array( + [1.0, 2.0, 3.0, np.nan, 4.0, 5.0], dtype=pd.ArrowDtype(pa.float64()) + ) + ), + ), ("var", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), + ( + "var", + Series(pd.array([-1.0, 
1.2, -1.1, 1.5, np.nan, 1.0], dtype="Float64")), + ), + ( + "var", + Series( + pd.array( + [1.0, 2.0, 3.0, np.nan, 4.0, 5.0], dtype=pd.ArrowDtype(pa.float64()) + ) + ), + ), ("any", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), ("all", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), ("skew", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), @@ -1163,7 +1272,7 @@ def test_skipna_reduction_ops_cython(reduction_method, values): expected = gb.apply( lambda x: getattr(x, reduction_method)(skipna=False), include_groups=False ) - tm.assert_frame_equal(result_cython, expected, check_exact=False) + tm.assert_frame_equal(result_cython, expected, check_exact=False, check_dtype=False) @pytest.mark.parametrize( @@ -1310,4 +1419,4 @@ def test_groupby_std_datetimelike(): td4 = pd.Timedelta("2886 days 00:42:34.664668096") exp_ser = Series([td1 * 2, td1, td1, td1, td4], index=np.arange(5)) expected = DataFrame({"A": exp_ser, "B": exp_ser, "C": exp_ser}) - tm.assert_frame_equal(result, expected) + tm.assert_frame_equal(result, expected) \ No newline at end of file From 4f11daba1d173efa6c27c2e22c343871e4b1fffd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Correia?= Date: Tue, 11 Jun 2024 02:48:56 +0100 Subject: [PATCH 10/20] Removed Arrow support --- pandas/core/arrays/arrow/array.py | 10 +++- pandas/tests/groupby/test_reductions.py | 75 +-------------- 2 files changed, 8 insertions(+), 77 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 75d801eeed065..66efa99a1faf4 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2215,9 +2215,7 @@ def _replace_with_mask( def _to_masked(self): pa_dtype = self._pa_array.type - if pa.types.is_floating(pa_dtype): - na_value = np.nan - elif pa.types.is_integer(pa_dtype): + if pa.types.is_floating(pa_dtype) or pa.types.is_integer(pa_dtype): na_value = 1 @@ -2239,6 +2237,12 @@ def _groupby_op( ids: npt.NDArray[np.intp], **kwargs, ): + if how in ["sum", "prod", "mean", "median", "var", "sem", "std", "min", "max"]: + if "skipna" in kwargs and not kwargs["skipna"]: + raise NotImplementedError( + f"method '{how}' with skipna=False not implemented for Arrow dtypes" + ) + if isinstance(self.dtype, StringDtype): return super()._groupby_op( how=how, diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 3cc26345188b0..2cf6a3eaa94e1 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -3,7 +3,6 @@ from string import ascii_lowercase import numpy as np -import pyarrow as pa import pytest from pandas._libs.tslibs import iNaT @@ -1056,27 +1055,11 @@ def scipy_sem(*args, **kwargs): "sum", Series(pd.array([-1.0, 1.2, -1.1, 1.5, np.nan, 1.0], dtype="Float64")), ), - ( - "sum", - Series( - pd.array( - [1.0, 2.0, 3.0, np.nan, 4.0, 5.0], dtype=pd.ArrowDtype(pa.float64()) - ) - ), - ), ("min", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), ( "min", Series(pd.array([-1.0, 1.2, -1.1, 1.5, np.nan, 1.0], dtype="Float64")), ), - ( - "min", - Series( - pd.array( - [1.0, 2.0, 3.0, np.nan, 4.0, 5.0], dtype=pd.ArrowDtype(pa.float64()) - ) - ), - ), ( "min", [ @@ -1105,14 +1088,6 @@ def scipy_sem(*args, **kwargs): ], ), ("max", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), ( "max", Series(pd.array([-1.0, 1.2, -1.1, 1.5, np.nan, 1.0], dtype="Float64")), ), - ( - "max", - Series( - pd.array( - [1.0, 2.0, 3.0, np.nan, 4.0, 5.0], dtype=pd.ArrowDtype(pa.float64()) - ) - ), - ), ( "max", [ @@ -1136,14 +1115,6 @@ def scipy_sem(*args, **kwargs): ], ), ("mean", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), ( "mean",
Series(pd.array([-1.0, 1.2, -1.1, 1.5, np.nan, 1.0], dtype="Float64")), ), - ( - "mean", - Series( - pd.array( - [1.0, 2.0, 3.0, np.nan, 4.0, 5.0], dtype=pd.ArrowDtype(pa.float64()) - ) - ), - ), ( "mean", [ @@ -1175,14 +1142,6 @@ def scipy_sem(*args, **kwargs): "median", Series(pd.array([-1.0, 1.2, -1.1, 1.5, np.nan, 1.0], dtype="Float64")), ), - ( - "median", - Series( - pd.array( - [1.0, 2.0, 3.0, np.nan, 4.0, 5.0], dtype=pd.ArrowDtype(pa.float64()) - ) - ), - ), ( "median", [ @@ -1210,53 +1169,21 @@ def scipy_sem(*args, **kwargs): "prod", Series(pd.array([-1.0, 1.2, -1.1, 1.5, np.nan, 1.0], dtype="Float64")), ), - ( - "prod", - Series( - pd.array( - [1.0, 2.0, 3.0, np.nan, 4.0, 5.0], dtype=pd.ArrowDtype(pa.float64()) - ) - ), - ), ("sem", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), ( "sem", Series(pd.array([-1.0, 1.2, -1.1, 1.5, np.nan, 1.0], dtype="Float64")), ), - ( - "sem", - Series( - pd.array( - [1.0, 2.0, 3.0, np.nan, 4.0, 5.0], dtype=pd.ArrowDtype(pa.float64()) - ) - ), - ), ("std", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), ( "std", Series(pd.array([-1.0, 1.2, -1.1, 1.5, np.nan, 1.0], dtype="Float64")), ), - ( - "std", - Series( - pd.array( - [1.0, 2.0, 3.0, np.nan, 4.0, 5.0], dtype=pd.ArrowDtype(pa.float64()) - ) - ), - ), ("var", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), ( "var", Series(pd.array([-1.0, 1.2, -1.1, 1.5, np.nan, 1.0], dtype="Float64")), ), - ( - "var", - Series( - pd.array( - [1.0, 2.0, 3.0, np.nan, 4.0, 5.0], dtype=pd.ArrowDtype(pa.float64()) - ) - ), - ), ("any", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), ("all", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), ("skew", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), @@ -1419,4 +1346,4 @@ def test_groupby_std_datetimelike(): td4 = pd.Timedelta("2886 days 00:42:34.664668096") exp_ser = Series([td1 * 2, td1, td1, td1, td4], index=np.arange(5)) expected = DataFrame({"A": exp_ser, "B": exp_ser, "C": exp_ser}) - tm.assert_frame_equal(result, expected) \ No newline at end of file + tm.assert_frame_equal(result, expected) From edbb3318f7b4e0164e0a6675726ff6b4545753e4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 11 Jun 2024 02:14:36 +0000 Subject: [PATCH 11/20] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pandas/core/groupby/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index d683cddc9c357..db815d42a10ce 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -1264,4 +1264,4 @@ def _get_splitter( # i.e. DataFrame klass = FrameSplitter - return klass(data, labels, ngroups, sort_idx=sort_idx, sorted_ids=sorted_ids) \ No newline at end of file + return klass(data, labels, ngroups, sort_idx=sort_idx, sorted_ids=sorted_ids) From 3d719b8eb7184d9af703ffea729cf4c1542295b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Correia?= Date: Tue, 11 Jun 2024 03:14:47 +0100 Subject: [PATCH 12/20] pre-commit fix --- pandas/core/groupby/ops.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index d683cddc9c357..db815d42a10ce 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -1264,4 +1264,4 @@ def _get_splitter( # i.e. 
DataFrame klass = FrameSplitter - return klass(data, labels, ngroups, sort_idx=sort_idx, sorted_ids=sorted_ids) \ No newline at end of file + return klass(data, labels, ngroups, sort_idx=sort_idx, sorted_ids=sorted_ids) From 2856c6dd1a6451423bdcaea6ed2e92a325f84168 Mon Sep 17 00:00:00 2001 From: Tiago Firmino Date: Fri, 14 Jun 2024 23:32:17 +0100 Subject: [PATCH 13/20] WIP EAs support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: André Correia --- pandas/_libs/groupby.pyx | 46 +++++++++++++++++++++++---- pandas/core/arrays/arrow/array.py | 36 ++++++++------------- pandas/core/groupby/ops.py | 7 ---- pandas/tests/extension/base/reduce.py | 25 +++++++++++++++ 4 files changed, 78 insertions(+), 36 deletions(-) diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 337ce9270547c..39da04141b661 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -62,7 +62,12 @@ cdef enum InterpolationEnumType: INTERPOLATION_MIDPOINT -cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept nogil: +cdef float64_t median_linear_mask( + float64_t* a, + int n, + uint8_t* mask, + bint skipna=True +) noexcept nogil: cdef: int i, j, na_count = 0 float64_t* tmp @@ -74,6 +79,8 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept n # count NAs for i in range(n): if mask[i]: + if not skipna: + return NaN na_count += 1 if na_count: @@ -235,7 +242,7 @@ def group_median_float64( for j in range(ngroups): size = _counts[j + 1] - result = median_linear_mask(ptr, size, ptr_mask) + result = median_linear_mask(ptr, size, ptr_mask, skipna) out[j, i] = result if result != result: @@ -739,6 +746,8 @@ def group_sum( continue if uses_mask: + if result_mask[lab, j]: + continue isna_entry = mask[i, j] else: isna_entry = _treat_as_na(val, is_datetimelike) @@ -747,7 +756,10 @@ def group_sum( if skipna: continue else: - sumx[lab, j] = val + if uses_mask: + result_mask[lab, j] = True + else: + sumx[lab, j] = val compensation[lab, j] = 0 continue @@ -824,6 +836,8 @@ def group_prod( val = values[i, j] if uses_mask: + if result_mask[lab, j]: + continue isna_entry = mask[i, j] else: isna_entry = _treat_as_na(val, False) @@ -832,7 +846,10 @@ def group_prod( nobs[lab, j] += 1 prodx[lab, j] *= val elif not skipna: - prodx[lab, j] = val + if uses_mask: + result_mask[lab, j] = True + else: + prodx[lab, j] = val nobs[lab, j] = 0 continue @@ -891,6 +908,8 @@ def group_var( val = values[i, j] if uses_mask: + if result_mask[lab, j]: + continue isna_entry = mask[i, j] elif is_datetimelike: # With group_var, we cannot just use _treat_as_na bc @@ -901,7 +920,10 @@ def group_var( isna_entry = _treat_as_na(val, is_datetimelike) if not skipna and isna_entry: - out[lab, j] = val + if uses_mask: + result_mask[lab, j] = True + else: + out[lab, j] = val nobs[lab, j] = 0 continue @@ -1100,6 +1122,8 @@ def group_mean( val = values[i, j] if uses_mask: + if result_mask[lab, j]: + continue isna_entry = mask[i, j] elif is_datetimelike: # With group_mean, we cannot just use _treat_as_na bc @@ -1110,7 +1134,10 @@ def group_mean( isna_entry = _treat_as_na(val, is_datetimelike) if not skipna and isna_entry: - sumx[lab, j] = val + if uses_mask: + result_mask[lab, j] = True + else: + sumx[lab, j] = val nobs[lab, j] = 0 continue @@ -1762,12 +1789,17 @@ cdef group_min_max( val = values[i, j] if uses_mask: + if result_mask[lab, j]: + continue isna_entry = mask[i, j] else: isna_entry = _treat_as_na(val, is_datetimelike) if not skipna 
and isna_entry: - group_min_or_max[lab, j] = val + if uses_mask: + result_mask[lab, j] = True + else: + group_min_or_max[lab, j] = val nobs[lab, j] = 0 continue diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 66efa99a1faf4..38d968f35d958 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2237,12 +2237,6 @@ def _groupby_op( ids: npt.NDArray[np.intp], **kwargs, ): - if how in ["sum", "prod", "mean", "median", "var", "sem", "std", "min", "max"]: - if "skipna" in kwargs and not kwargs["skipna"]: - raise NotImplementedError( - f"method '{how}' with skipna=False not implemented for Arrow dtypes" - ) - if isinstance(self.dtype, StringDtype): return super()._groupby_op( how=how, @@ -2302,16 +2302,15 @@ def _str_contains( def _str_startswith(self, pat: str | tuple[str, ...], na=None) -> Self: if isinstance(pat, str): result = pc.starts_with(self._pa_array, pattern=pat) + elif len(pat) == 0: + # For empty tuple, pd.StringDtype() returns null for missing values + # and false for valid values. + result = pc.if_else(pc.is_null(self._pa_array), None, False) else: - if len(pat) == 0: - # For empty tuple, pd.StringDtype() returns null for missing values - # and false for valid values.
- result = pc.if_else(pc.is_null(self._pa_array), None, False) - else: - result = pc.ends_with(self._pa_array, pattern=pat[0]) + result = pc.ends_with(self._pa_array, pattern=pat[0]) - for p in pat[1:]: - result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) + for p in pat[1:]: + result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) if not isna(na): result = result.fill_null(na) return type(self)(result) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index db815d42a10ce..4f40c4f4283f0 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -392,13 +392,6 @@ def _call_cython_op( values[mask] = True values = values.astype(bool, copy=False).view(np.int8) is_numeric = True - elif ( - self.how in ["median", "sem", "std", "var"] - and "skipna" in kwargs - and not kwargs["skipna"] - ): - # if skipna=False we don't want to use masks created for Nullable dtypes - mask = None values = values.T if mask is not None: diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index c3a6daee2dd54..6026a0d0c3141 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -77,6 +77,19 @@ def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool): tm.assert_extension_array_equal(result1, expected) + def check_reduce_groupby(self, ser: pd.Series, op_name: str, skipna: bool): + # Check that groupby reduction behaves correctly + df = pd.DataFrame({"a": ser, "key": [1, 2] * (len(ser) // 2)}) + grp = df.groupby("key")["a"] + res_op = getattr(grp, op_name) + + expected = grp.apply( + lambda x: getattr(x.astype("float64"), op_name)(skipna=skipna) + ) + + result = res_op(skipna=skipna) + tm.assert_series_equal(result, expected) + @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna): op_name = all_boolean_reductions @@ -129,3 +142,15 @@ def test_reduce_frame(self, data, all_numeric_reductions, skipna): pytest.skip(f"Reduction {op_name} not supported for this dtype") self.check_reduce_frame(ser, op_name, skipna) + + @pytest.mark.parametrize("skipna", [True, False]) + def test_reduce_groupby_numeric(self, data, all_numeric_reductions, skipna): + op_name = all_numeric_reductions + ser = pd.Series(data) + if not is_numeric_dtype(ser.dtype): + pytest.skip(f"{ser.dtype} is not numeric dtype") + + if not self._supports_reduction(ser, op_name): + pytest.skip(f"Reduction {op_name} not supported for this dtype") + + self.check_reduce_groupby(ser, op_name, skipna) From c20017773fee700e9f313c6959896c4fbb4b4b63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Correia?= Date: Thu, 4 Jul 2024 14:15:24 +0100 Subject: [PATCH 14/20] Extension Array Support Tests Co-authored-by: Tiago Firmino --- pandas/tests/extension/base/reduce.py | 29 +++++++++++++------ .../tests/extension/decimal/test_decimal.py | 3 ++ pandas/tests/extension/test_arrow.py | 10 +++++++ pandas/tests/extension/test_numpy.py | 6 ++++ pandas/tests/extension/test_sparse.py | 3 ++ 5 files changed, 42 insertions(+), 9 deletions(-) diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index 6026a0d0c3141..fd9e2f61ba0f9 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -13,6 +13,9 @@ class BaseReduceTests: make sense for numeric/boolean operations. 
""" + def _supports_reduction_groupby(self, ser: pd.Series, op_name: str) -> bool: + return self._supports_reduction(ser, op_name) + def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: # Specify if we expect this reduction to succeed. return False @@ -78,17 +81,21 @@ def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool): tm.assert_extension_array_equal(result1, expected) def check_reduce_groupby(self, ser: pd.Series, op_name: str, skipna: bool): - # Check that groupby reduction behaves correctly df = pd.DataFrame({"a": ser, "key": [1, 2] * (len(ser) // 2)}) - grp = df.groupby("key")["a"] - res_op = getattr(grp, op_name) + grp = df.groupby("key") + res1 = getattr(grp, op_name) + result = res1(skipna=skipna) - expected = grp.apply( - lambda x: getattr(x.astype("float64"), op_name)(skipna=skipna) - ) + if not skipna and ser.isna().any() and op_name != "skew": + expected = pd.DataFrame( + {"a": [pd.NA, pd.NA]}, index=pd.Index([1, 2], name="key") + ) + else: + expected = grp.apply( + lambda x: getattr(x, op_name)(skipna=skipna), include_groups=False + ) - result = res_op(skipna=skipna) - tm.assert_series_equal(result, expected) + tm.assert_almost_equal(result, expected, check_dtype=False, atol=1e-6) @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna): @@ -147,10 +154,14 @@ def test_reduce_frame(self, data, all_numeric_reductions, skipna): def test_reduce_groupby_numeric(self, data, all_numeric_reductions, skipna): op_name = all_numeric_reductions ser = pd.Series(data) + if not is_numeric_dtype(ser.dtype): pytest.skip(f"{ser.dtype} is not numeric dtype") - if not self._supports_reduction(ser, op_name): + if op_name in ["count", "kurt", "sem"]: + pytest.skip(f"{op_name} not an array method") + + if not self._supports_reduction_groupby(ser, op_name): pytest.skip(f"Reduction {op_name} not supported for this dtype") self.check_reduce_groupby(ser, op_name, skipna) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 6f18761f77138..fa5a52620229e 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -74,6 +74,9 @@ def _get_expected_exception( def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: return True + def _supports_reduction_groupby(self, ser: pd.Series, op_name: str) -> bool: + return op_name not in ["std", "median", "skew"] + def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): if op_name == "count": return super().check_reduce(ser, op_name, skipna) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 7d31fe6085c3a..f9566e4148a2a 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -463,6 +463,16 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna, reques self.check_accumulate(ser, op_name, skipna) + def _supports_reduction_groupby(self, ser: pd.Series, op_name: str) -> bool: + dtype = ser.dtype + pa_dtype = dtype.pyarrow_dtype + if op_name == "skew": + return False + if pa.types.is_decimal(pa_dtype): + # Skip decimal types + return False + return self._supports_reduction(ser, op_name) + def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: dtype = ser.dtype # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py 
index 79cfb736941d6..e45a291344131 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -304,6 +304,12 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): self.frame_scalar_exc = frame_scalar_exc super().test_arith_frame_with_scalar(data, all_arithmetic_operators) + def _supports_reduction_groupby(self, ser: pd.Series, op_name: str) -> bool: + if op_name in ["std", "skew"] and ser.dtype == "float64": + return False + else: + return self._supports_reduction(ser, op_name) + def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: if ser.dtype.kind == "O": return op_name in ["sum", "min", "max", "any", "all"] diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 56c023d99bb1c..2e9906abd20d2 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -97,6 +97,9 @@ def data_for_compare(request): class TestSparseArray(base.ExtensionTests): + def _supports_reduction_groupby(self, ser: pd.Series, op_name: str) -> bool: + return False + def _supports_reduction(self, obj, op_name: str) -> bool: return True From 262ca972142d57415b5be1e3f0ace39f4fbdcebd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Correia?= Date: Thu, 4 Jul 2024 15:39:25 +0100 Subject: [PATCH 15/20] WIP: Fixing Tests Co-authored-by: Tiago Firmino --- pandas/tests/extension/test_arrow.py | 4 ++- pandas/tests/groupby/test_reductions.py | 36 ------------------------- 2 files changed, 3 insertions(+), 37 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index f871cde653da7..ea959509d8bca 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -465,7 +465,9 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna, reques def _supports_reduction_groupby(self, ser: pd.Series, op_name: str) -> bool: dtype = ser.dtype - pa_dtype = dtype.pyarrow_dtype + # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has + # no attribute "pyarrow_dtype" + pa_dtype = dtype.pyarrow_dtype # type: ignore[union-attr] if op_name == "skew": return False if pa.types.is_decimal(pa_dtype): diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 072504b170828..44b8aba53a678 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -1052,15 +1052,7 @@ def scipy_sem(*args, **kwargs): [ ("sum", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), ("sum", ["foo", "bar", "baz", "foo", pd.NA, "foo"]), - ( - "sum", - Series(pd.array([-1.0, 1.2, -1.1, 1.5, np.nan, 1.0], dtype="Float64")), - ), ("min", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), - ( - "min", - Series(pd.array([-1.0, 1.2, -1.1, 1.5, np.nan, 1.0], dtype="Float64")), - ), ( "min", [ @@ -1084,10 +1076,6 @@ def scipy_sem(*args, **kwargs): ], ), ("max", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), - ( - "max", - Series(pd.array([-1.0, 1.2, -1.1, 1.5, np.nan, 1.0], dtype="Float64")), - ), ( "max", [ @@ -1111,10 +1099,6 @@ def scipy_sem(*args, **kwargs): ], ), ("mean", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), - ( - "mean", - Series(pd.array([-1.0, 1.2, -1.1, 1.5, np.nan, 1.0], dtype="Float64")), - ), ( "mean", [ @@ -1138,10 +1122,6 @@ def scipy_sem(*args, **kwargs): ], ), ("median", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), - ( - "median", - Series(pd.array([-1.0, 1.2, -1.1, 1.5, np.nan, 1.0], dtype="Float64")), - ), ( "median", [ @@ -1165,25 +1145,9 @@ def scipy_sem(*args, 
**kwargs): ], ), ("prod", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), - ( - "prod", - Series(pd.array([-1.0, 1.2, -1.1, 1.5, np.nan, 1.0], dtype="Float64")), - ), ("sem", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), - ( - "sem", - Series(pd.array([-1.0, 1.2, -1.1, 1.5, np.nan, 1.0], dtype="Float64")), - ), ("std", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), - ( - "std", - Series(pd.array([-1.0, 1.2, -1.1, 1.5, np.nan, 1.0], dtype="Float64")), - ), ("var", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), - ( - "var", - Series(pd.array([-1.0, 1.2, -1.1, 1.5, np.nan, 1.0], dtype="Float64")), - ), ("any", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), ("all", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), ("skew", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), From 91bb3c3d5fa0fba1555ee2a440232b5008288da2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Correia?= Date: Thu, 4 Jul 2024 16:27:54 +0100 Subject: [PATCH 16/20] WIP: 32bit fix --- pandas/tests/extension/base/reduce.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index fd9e2f61ba0f9..af653bc74a604 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -92,7 +92,7 @@ def check_reduce_groupby(self, ser: pd.Series, op_name: str, skipna: bool): ) else: expected = grp.apply( - lambda x: getattr(x, op_name)(skipna=skipna), include_groups=False + lambda x: getattr(x.astype(ser.dtype), op_name)(skipna=skipna), include_groups=False ) tm.assert_almost_equal(result, expected, check_dtype=False, atol=1e-6) From 7ee07d1cba5f3ace9b8787dcb95befd8a32f0746 Mon Sep 17 00:00:00 2001 From: Tiago Firmino Date: Sat, 3 Aug 2024 01:05:31 +0100 Subject: [PATCH 17/20] WIP: overflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: André Correia --- pandas/tests/extension/test_masked.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index 69ce42203d510..8e8c1d70d074e 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -276,6 +276,10 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): # https://github.com/pandas-dev/pandas/issues/30958 cmp_dtype = "int64" + if (op_name == "prod" and skipna and data.dtype.itemsize < 8 + and np.intp().itemsize < 8): + pytest.xfail(reason=f"{op_name} with itemsize + {data.dtype.itemsize} overflows") if ser.dtype.kind == "f": # Item "dtype[Any]" of "Union[dtype[Any], ExtensionDtype]" has # no attribute "numpy_dtype" From 5a004cf0680b0cfae1d2361f95d47fe86f1c44af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Correia?= Date: Sun, 4 Aug 2024 20:56:40 +0100 Subject: [PATCH 18/20] Fix tests 32bit --- pandas/tests/extension/base/reduce.py | 5 +++-- pandas/tests/extension/test_masked.py | 31 +++++++++++++++++++++++---- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index 14e7d529fbd14..1403047b22014 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -91,7 +91,8 @@ def check_reduce_groupby(self, ser: pd.Series, op_name: str, skipna: bool): ) else: expected = grp.apply( - lambda x: getattr(x.astype(ser.dtype), op_name)(skipna=skipna), include_groups=False + lambda x: getattr(x.astype(ser.dtype), op_name)(skipna=skipna), + include_groups=False, ) tm.assert_almost_equal(result, expected, check_dtype=False, 
atol=1e-6) @@ -152,7 +153,7 @@ def test_reduce_groupby_numeric(self, data, all_numeric_reductions, skipna): op_name = all_numeric_reductions ser = pd.Series(data) - if not is_numeric_dtype(ser.dtype): + if not pd.core.dtypes.common.is_numeric_dtype(ser.dtype): pytest.skip(f"{ser.dtype} is not numeric dtype") if op_name in ["count", "kurt", "sem"]: diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index 2083bab11cbb5..bc5a31be2b99f 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -276,10 +276,6 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): # https://github.com/pandas-dev/pandas/issues/30958 cmp_dtype = "int64" - if (op_name == "prod" and skipna and data.dtype.itemsize < 8 - and np.intp().itemsize < 8): - pytest.xfail(reason=f"{op_name} with itemsize - {data.dtype.itemsize} overflows") if ser.dtype.kind == "f": # Item "dtype[Any]" of "Union[dtype[Any], ExtensionDtype]" has # no attribute "numpy_dtype" @@ -302,6 +298,33 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): expected = pd.NA tm.assert_almost_equal(result, expected) + def check_reduce_groupby(self, ser: pd.Series, op_name: str, skipna: bool): + df = pd.DataFrame({"a": ser, "key": [1, 2] * (len(ser) // 2)}) + grp = df.groupby("key") + res1 = getattr(grp, op_name) + result = res1(skipna=skipna) + if ( + op_name == "prod" + and skipna + and ser.dtype.itemsize < 8 + and np.intp().itemsize < 8 + ): + pytest.xfail( + reason=f"{op_name} with itemsize {ser.dtype.itemsize} overflows" + ) + + if not skipna and ser.isna().any() and op_name != "skew": + expected = pd.DataFrame( + {"a": [pd.NA, pd.NA]}, index=pd.Index([1, 2], name="key") + ) + else: + expected = grp.apply( + lambda x: getattr(x.astype(ser.dtype), op_name)(skipna=skipna), + include_groups=False, + ) + + tm.assert_almost_equal(result, expected, check_dtype=False, atol=1e-6) + def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): if is_float_dtype(arr.dtype): cmp_dtype = arr.dtype.name From 0ef070c1a2dffbba76bcef3269954d345aea2789 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Correia?= Date: Sun, 4 Aug 2024 21:45:21 +0100 Subject: [PATCH 19/20] small tweaks --- pandas/tests/extension/base/reduce.py | 4 +++- pandas/tests/extension/test_masked.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index 1403047b22014..e2bda9a221261 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -2,6 +2,8 @@ import pytest +from pandas.core.dtypes.common import is_numeric_dtype + import pandas as pd import pandas._testing as tm @@ -153,7 +155,7 @@ def test_reduce_groupby_numeric(self, data, all_numeric_reductions, skipna): op_name = all_numeric_reductions ser = pd.Series(data) - if not pd.core.dtypes.common.is_numeric_dtype(ser.dtype): + if not is_numeric_dtype(ser.dtype): pytest.skip(f"{ser.dtype} is not numeric dtype") if op_name in ["count", "kurt", "sem"]: diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index bc5a31be2b99f..81ea476faeede 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -306,11 +306,11 @@ def check_reduce_groupby(self, ser: pd.Series, op_name: str, skipna: bool): if ( op_name == "prod" and skipna - and ser.dtype.itemsize < 8 + and data.dtype.itemsize < 8 and np.intp().itemsize < 8 ): pytest.xfail( - reason=f"{op_name} with itemsize {ser.dtype.itemsize} overflows" + reason=f"{op_name} with itemsize {data.dtype.itemsize} overflows" )
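For the masked dtypes exercised here, the expectation ``check_reduce_groupby`` encodes is that with ``skipna=False`` any group containing ``pd.NA`` reduces to ``pd.NA``. Roughly, as a hand-written illustration rather than test code:

    import pandas as pd

    df = pd.DataFrame(
        {"a": pd.array([1, pd.NA, 3, 4], dtype="Int64"), "key": [1, 2, 1, 2]}
    )
    # Group 1 holds [1, 3] -> 4; group 2 holds [<NA>, 4] -> <NA>.
    df.groupby("key")["a"].sum(skipna=False)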
From d02b3082b89b2d1097d495adbe46c7a29996ac48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Correia?= Date: Sun, 4 Aug 2024 22:28:01 +0100 Subject: [PATCH 20/20] simpler test skipping approach --- pandas/tests/extension/test_masked.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/extension/test_masked.py b/pandas/tests/extension/test_masked.py index 81ea476faeede..8a41d78b0d28e 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -306,11 +306,9 @@ def check_reduce_groupby(self, ser: pd.Series, op_name: str, skipna: bool): if ( op_name == "prod" and skipna - and data.dtype.itemsize < 8 - and np.intp().itemsize < 8 ): pytest.xfail( - reason=f"{op_name} with itemsize {data.dtype.itemsize} overflows" + reason=f"{op_name} overflows" ) if not skipna and ser.isna().any() and op_name != "skew":
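The overflow this skip works around is a platform-width issue rather than a groupby one: NumPy accumulates integer products at the default platform integer width, so 32-bit builds wrap long before 64-bit ones do. A quick illustration (output is platform-dependent, which is exactly why the test cannot assert on it):

    import numpy as np

    vals = np.array([10_000, 10_000, 10_000], dtype=np.int32)
    print(vals.prod())                # 10**12 where the default int is 64-bit,
                                      # a silently wrapped value where it is 32-bit
    print(vals.prod(dtype=np.int64))  # exact product everywhere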