diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index e3c4e69db7cbd..20e85ca5f34bb 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -43,6 +43,7 @@ Other enhancements - :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`) - :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`) - :func:`DataFrame.to_excel` argument ``merge_cells`` now accepts a value of ``"columns"`` to only merge :class:`MultiIndex` column header header cells (:issue:`35384`) +- :meth:`.DataFrameGroupBy.sum`, :meth:`.DataFrameGroupBy.prod`, :meth:`.DataFrameGroupBy.min`, :meth:`.DataFrameGroupBy.max`, :meth:`.DataFrameGroupBy.mean`, :meth:`.DataFrameGroupBy.median`, :meth:`.DataFrameGroupBy.sem`, :meth:`.DataFrameGroupBy.std` and :meth:`.DataFrameGroupBy.var` now accept a ``skipna`` argument (:issue:`15675`) - :meth:`DataFrame.corrwith` now accepts ``min_periods`` as optional arguments, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`) - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`) - :meth:`DataFrame.ewm` now allows ``adjust=False`` when ``times`` is provided (:issue:`54328`) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 53f5f73624232..222e527344cec 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -13,6 +13,7 @@ def group_median_float64( mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., is_datetimelike: bool = ..., # bint + skipna: bool = ..., # bint ) -> None: ... def group_cumprod( out: np.ndarray, # float64_t[:, ::1] @@ -66,6 +67,7 @@ def group_sum( result_mask: np.ndarray | None = ..., min_count: int = ..., is_datetimelike: bool = ..., + skipna: bool = ..., # bint ) -> None: ... def group_prod( out: np.ndarray, # int64float_t[:, ::1] @@ -75,6 +77,7 @@ def group_prod( mask: np.ndarray | None, result_mask: np.ndarray | None = ..., min_count: int = ..., + skipna: bool = ..., # bint ) -> None: ... def group_var( out: np.ndarray, # floating[:, ::1] @@ -86,6 +89,7 @@ def group_var( mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., is_datetimelike: bool = ..., + skipna: bool = ..., # bint name: str = ..., ) -> None: ... def group_skew( @@ -104,6 +108,7 @@ def group_mean( labels: np.ndarray, # const intp_t[:] min_count: int = ..., # Py_ssize_t is_datetimelike: bool = ..., # bint + skipna: bool = ..., # bint mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., ) -> None: ... @@ -170,6 +175,7 @@ def group_max( labels: np.ndarray, # const int64_t[:] min_count: int = ..., is_datetimelike: bool = ..., + skipna: bool = ..., # bint mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., ) -> None: ... @@ -180,6 +186,7 @@ def group_min( labels: np.ndarray, # const int64_t[:] min_count: int = ..., is_datetimelike: bool = ..., + skipna: bool = ..., # bint mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., ) -> None: ...
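Before the implementation diffs, a minimal sketch of the behavior the whatsnew entry above describes (the frame, column, and group labels are invented for illustration, and the ``skipna`` keyword assumes a build that includes this patch): with ``skipna=False``, any NA inside a group propagates to that group's result, mirroring the corresponding ``Series`` reduction.

```python
import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "key": ["a", "a", "b", "b"],
        "val": [1.0, np.nan, 2.0, 3.0],
    }
)

# Default: NA values are skipped within each group.
df.groupby("key")["val"].sum()
# key
# a    1.0
# b    5.0

# With skipna=False, a group containing NA yields NA for that group,
# consistent with Series.sum(skipna=False).
df.groupby("key")["val"].sum(skipna=False)
# key
# a    NaN
# b    5.0
```

The same propagation rule applies to ``prod``, ``min``, ``max``, ``mean``, ``median``, ``sem``, ``std`` and ``var``: the Cython kernels below short-circuit a group as soon as an NA is observed when ``skipna=False``.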
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index d7e485f74e58b..7b69a3188bb34 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -62,7 +62,12 @@ cdef enum InterpolationEnumType: INTERPOLATION_MIDPOINT -cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept nogil: +cdef float64_t median_linear_mask( + float64_t* a, + int n, + uint8_t* mask, + bint skipna=True +) noexcept nogil: cdef: int i, j, na_count = 0 float64_t* tmp @@ -74,6 +79,8 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept n # count NAs for i in range(n): if mask[i]: + if not skipna: + return NaN na_count += 1 if na_count: @@ -104,7 +111,8 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept n cdef float64_t median_linear( float64_t* a, int n, - bint is_datetimelike=False + bint is_datetimelike=False, + bint skipna=True ) noexcept nogil: cdef: int i, j, na_count = 0 @@ -118,10 +126,14 @@ cdef float64_t median_linear( if is_datetimelike: for i in range(n): if a[i] == NPY_NAT: + if not skipna: + return NaN na_count += 1 else: for i in range(n): if a[i] != a[i]: + if not skipna: + return NaN na_count += 1 if na_count: @@ -186,6 +198,7 @@ def group_median_float64( const uint8_t[:, :] mask=None, uint8_t[:, ::1] result_mask=None, bint is_datetimelike=False, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 @@ -229,7 +242,7 @@ def group_median_float64( for j in range(ngroups): size = _counts[j + 1] - result = median_linear_mask(ptr, size, ptr_mask) + result = median_linear_mask(ptr, size, ptr_mask, skipna) out[j, i] = result if result != result: @@ -244,7 +257,7 @@ def group_median_float64( ptr += _counts[0] for j in range(ngroups): size = _counts[j + 1] - out[j, i] = median_linear(ptr, size, is_datetimelike) + out[j, i] = median_linear(ptr, size, is_datetimelike, skipna) ptr += size @@ -700,6 +713,7 @@ def group_sum( uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=0, bint is_datetimelike=False, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 using Kahan summation @@ -734,37 +748,52 @@ def group_sum( for j in range(K): val = values[i, j] + if _treat_as_na(sumx[lab, j], is_datetimelike): + continue + if uses_mask: + if result_mask[lab, j]: + continue isna_entry = mask[i, j] else: isna_entry = _treat_as_na(val, is_datetimelike) - if not isna_entry: - nobs[lab, j] += 1 - - if sum_t is object: - # NB: this does not use 'compensation' like the non-object - # track does. - if nobs[lab, j] == 1: - # i.e. we haven't added anything yet; avoid TypeError - # if e.g. val is a str and sumx[lab, j] is 0 - t = val + if isna_entry: + if skipna: + continue + else: + if uses_mask: + result_mask[lab, j] = True else: - t = sumx[lab, j] + val - sumx[lab, j] = t + sumx[lab, j] = val + compensation[lab, j] = 0 + continue + nobs[lab, j] += 1 + + if sum_t is object: + # NB: this does not use 'compensation' like the non-object + # track does. + if nobs[lab, j] == 1: + # i.e. we haven't added anything yet; avoid TypeError + # if e.g. val is a str and sumx[lab, j] is 0 + t = val else: - y = val - compensation[lab, j] - t = sumx[lab, j] + y - compensation[lab, j] = t - sumx[lab, j] - y - if compensation[lab, j] != compensation[lab, j]: - # GH#53606 - # If val is +/- infinity compensation is NaN - # which would lead to results being NaN instead - # of +/- infinity. 
We cannot use util.is_nan - # because of no gil - compensation[lab, j] = 0 - sumx[lab, j] = t + t = sumx[lab, j] + val + sumx[lab, j] = t + + else: + y = val - compensation[lab, j] + t = sumx[lab, j] + y + compensation[lab, j] = t - sumx[lab, j] - y + if compensation[lab, j] != compensation[lab, j]: + # GH#53606 + # If val is +/- infinity compensation is NaN + # which would lead to results being NaN instead + # of +/- infinity. We cannot use util.is_nan + # because of no gil + compensation[lab, j] = 0 + sumx[lab, j] = t _check_below_mincount( out, uses_mask, result_mask, ncounts, K, nobs, min_count, sumx @@ -781,6 +810,7 @@ def group_prod( const uint8_t[:, ::1] mask, uint8_t[:, ::1] result_mask=None, Py_ssize_t min_count=0, + bint skipna=True, ) -> None: """ Only aggregates on axis=0 @@ -812,6 +842,8 @@ def group_prod( val = values[i, j] if uses_mask: + if result_mask[lab, j]: + continue isna_entry = mask[i, j] else: isna_entry = _treat_as_na(val, False) @@ -819,6 +851,13 @@ def group_prod( if not isna_entry: nobs[lab, j] += 1 prodx[lab, j] *= val + elif not skipna: + if uses_mask: + result_mask[lab, j] = True + else: + prodx[lab, j] = val + nobs[lab, j] = 0 + continue _check_below_mincount( out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx @@ -838,6 +877,7 @@ def group_var( const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None, bint is_datetimelike=False, + bint skipna=True, str name="var", ) -> None: cdef: @@ -874,6 +914,8 @@ def group_var( val = values[i, j] if uses_mask: + if result_mask[lab, j]: + continue isna_entry = mask[i, j] elif is_datetimelike: # With group_var, we cannot just use _treat_as_na bc @@ -883,7 +925,15 @@ def group_var( else: isna_entry = _treat_as_na(val, is_datetimelike) - if not isna_entry: + if not skipna and isna_entry: + if uses_mask: + result_mask[lab, j] = True + else: + out[lab, j] = val + nobs[lab, j] = 0 + continue + + elif not isna_entry: nobs[lab, j] += 1 oldmean = mean[lab, j] mean[lab, j] += (val - oldmean) / nobs[lab, j] @@ -1004,6 +1054,7 @@ def group_mean( const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, + bint skipna=True, const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None, ) -> None: @@ -1027,6 +1078,8 @@ def group_mean( Only used in sum and prod. Always -1. is_datetimelike : bool True if `values` contains datetime-like entries. + skipna : bool, default True + Exclude NA/null values when computing the result. mask : ndarray[bool, ndim=2], optional Mask of the input values. 
result_mask : ndarray[bool, ndim=2], optional @@ -1075,6 +1128,8 @@ def group_mean( val = values[i, j] if uses_mask: + if result_mask[lab, j]: + continue isna_entry = mask[i, j] elif is_datetimelike: # With group_mean, we cannot just use _treat_as_na bc @@ -1084,7 +1139,15 @@ def group_mean( else: isna_entry = _treat_as_na(val, is_datetimelike) - if not isna_entry: + if not skipna and isna_entry: + if uses_mask: + result_mask[lab, j] = True + else: + sumx[lab, j] = val + nobs[lab, j] = 0 + continue + + elif not isna_entry: nobs[lab, j] += 1 y = val - compensation[lab, j] t = sumx[lab, j] + y @@ -1102,12 +1165,10 @@ def group_mean( for j in range(K): count = nobs[i, j] if nobs[i, j] == 0: - if uses_mask: result_mask[i, j] = True else: out[i, j] = nan_val - else: out[i, j] = sumx[i, j] / count @@ -1666,6 +1727,7 @@ cdef group_min_max( Py_ssize_t min_count=-1, bint is_datetimelike=False, bint compute_max=True, + bint skipna=True, const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None, ): @@ -1689,6 +1751,8 @@ cdef group_min_max( True if `values` contains datetime-like entries. compute_max : bint, default True True to compute group-wise max, False to compute min + skipna : bool, default True + Exclude NA/null values when computing the result. mask : ndarray[bool, ndim=2], optional If not None, indices represent missing values, otherwise the mask will not be used @@ -1731,11 +1795,21 @@ cdef group_min_max( val = values[i, j] if uses_mask: + if result_mask[lab, j]: + continue isna_entry = mask[i, j] else: isna_entry = _treat_as_na(val, is_datetimelike) - if not isna_entry: + if not skipna and isna_entry: + if uses_mask: + result_mask[lab, j] = True + else: + group_min_or_max[lab, j] = val + nobs[lab, j] = 0 + continue + + elif not isna_entry: nobs[lab, j] += 1 if compute_max: if val > group_min_or_max[lab, j]: @@ -1872,6 +1946,7 @@ def group_max( const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, + bint skipna=True, const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None, ) -> None: @@ -1886,6 +1961,7 @@ def group_max( compute_max=True, mask=mask, result_mask=result_mask, + skipna=skipna, ) @@ -1898,6 +1974,7 @@ def group_min( const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, + bint skipna=True, const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None, ) -> None: @@ -1912,6 +1989,7 @@ def group_min( compute_max=False, mask=mask, result_mask=result_mask, + skipna=skipna, ) diff --git a/pandas/core/_numba/executor.py b/pandas/core/_numba/executor.py index 3f3ebe8dbe023..b7cb4b53d2c2b 100644 --- a/pandas/core/_numba/executor.py +++ b/pandas/core/_numba/executor.py @@ -71,13 +71,20 @@ def column_looper( labels: np.ndarray, ngroups: int, min_periods: int, + skipna: bool = True, *args, ): result = np.empty((values.shape[0], ngroups), dtype=result_dtype) na_positions = {} for i in numba.prange(values.shape[0]): output, na_pos = func( - values[i], result_dtype, labels, ngroups, min_periods, *args + values[i], + result_dtype, + labels, + ngroups, + min_periods, + skipna, + *args, ) result[i] = output if len(na_pos) > 0: @@ -164,6 +171,7 @@ def generate_shared_aggregator( nopython: bool, nogil: bool, parallel: bool, + skipna: bool = True, ): """ Generate a Numba function that loops over the columns 2D object and applies @@ -192,7 +200,6 @@ def generate_shared_aggregator( ------- Numba function """ - # A wrapper around the looper function, # to dispatch based on dtype since numba is unable to do that in nopython mode @@ 
-216,11 +223,11 @@ def looper_wrapper( # Need to unpack kwargs since numba only supports *args if is_grouped_kernel: result, na_positions = column_looper( - values, labels, ngroups, min_periods, *kwargs.values() + values, labels, ngroups, min_periods, skipna, *kwargs.values() ) else: result, na_positions = column_looper( - values, start, end, min_periods, *kwargs.values() + values, start, end, min_periods, skipna, *kwargs.values() ) if result.dtype.kind == "i": # Look if na_positions is not empty diff --git a/pandas/core/_numba/kernels/mean_.py b/pandas/core/_numba/kernels/mean_.py index cc10bd003af7e..0ede465acd7ec 100644 --- a/pandas/core/_numba/kernels/mean_.py +++ b/pandas/core/_numba/kernels/mean_.py @@ -71,6 +71,7 @@ def sliding_mean( start: np.ndarray, end: np.ndarray, min_periods: int, + skipna: bool = True, ) -> tuple[np.ndarray, list[int]]: N = len(start) nobs = 0 @@ -169,9 +170,10 @@ def grouped_mean( labels: npt.NDArray[np.intp], ngroups: int, min_periods: int, + skipna: bool = True, ) -> tuple[np.ndarray, list[int]]: output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum( - values, result_dtype, labels, ngroups + values, result_dtype, labels, ngroups, skipna ) # Post-processing, replace sums that don't satisfy min_periods @@ -187,7 +189,8 @@ def grouped_mean( result = sum_x else: result = np.nan - result /= nobs + if nobs != 0: + result /= nobs output[lab] = result # na_position is empty list since float64 can already hold nans diff --git a/pandas/core/_numba/kernels/min_max_.py b/pandas/core/_numba/kernels/min_max_.py index 59d36732ebae6..fb54d0ee9525a 100644 --- a/pandas/core/_numba/kernels/min_max_.py +++ b/pandas/core/_numba/kernels/min_max_.py @@ -25,6 +25,7 @@ def sliding_min_max( start: np.ndarray, end: np.ndarray, min_periods: int, + skipna: bool, is_max: bool, ) -> tuple[np.ndarray, list[int]]: N = len(start) @@ -87,6 +88,7 @@ def grouped_min_max( labels: npt.NDArray[np.intp], ngroups: int, min_periods: int, + skipna: bool, is_max: bool, ) -> tuple[np.ndarray, list[int]]: N = len(labels) @@ -102,6 +104,9 @@ def grouped_min_max( if values.dtype.kind == "i" or not np.isnan(val): nobs[lab] += 1 + elif not skipna and np.isnan(val): + output[lab] = np.nan + continue else: # NaN value cannot be a min/max value continue diff --git a/pandas/core/_numba/kernels/sum_.py b/pandas/core/_numba/kernels/sum_.py index 76f4e22b43c4b..49a2d4ceb6487 100644 --- a/pandas/core/_numba/kernels/sum_.py +++ b/pandas/core/_numba/kernels/sum_.py @@ -69,6 +69,7 @@ def sliding_sum( start: np.ndarray, end: np.ndarray, min_periods: int, + skipna: bool = True, ) -> tuple[np.ndarray, list[int]]: dtype = values.dtype @@ -165,6 +166,7 @@ def grouped_kahan_sum( result_dtype: np.dtype, labels: npt.NDArray[np.intp], ngroups: int, + skipna: bool = True, ) -> tuple[ np.ndarray, npt.NDArray[np.int64], np.ndarray, npt.NDArray[np.int64], np.ndarray ]: @@ -183,32 +185,38 @@ def grouped_kahan_sum( if lab < 0: continue - sum_x = output[lab] - nobs = nobs_arr[lab] - compensation_add = comp_arr[lab] - num_consecutive_same_value = consecutive_counts[lab] - prev_value = prev_vals[lab] + if not skipna and np.isnan(val): + output[lab] = val + consecutive_counts[lab] = 0 + + else: + sum_x = output[lab] + nobs = nobs_arr[lab] + compensation_add = comp_arr[lab] + num_consecutive_same_value = consecutive_counts[lab] + prev_value = prev_vals[lab] + + ( + nobs, + sum_x, + compensation_add, + num_consecutive_same_value, + prev_value, + ) = add_sum( + val, + nobs, + sum_x, + compensation_add, + 
num_consecutive_same_value, + prev_value, + ) + + output[lab] = sum_x + consecutive_counts[lab] = num_consecutive_same_value + prev_vals[lab] = prev_value + comp_arr[lab] = compensation_add + nobs_arr[lab] = nobs - ( - nobs, - sum_x, - compensation_add, - num_consecutive_same_value, - prev_value, - ) = add_sum( - val, - nobs, - sum_x, - compensation_add, - num_consecutive_same_value, - prev_value, - ) - - output[lab] = sum_x - consecutive_counts[lab] = num_consecutive_same_value - prev_vals[lab] = prev_value - comp_arr[lab] = compensation_add - nobs_arr[lab] = nobs return output, nobs_arr, comp_arr, consecutive_counts, prev_vals @@ -219,11 +227,12 @@ def grouped_sum( labels: npt.NDArray[np.intp], ngroups: int, min_periods: int, + skipna: bool = True, ) -> tuple[np.ndarray, list[int]]: na_pos = [] output, nobs_arr, comp_arr, consecutive_counts, prev_vals = grouped_kahan_sum( - values, result_dtype, labels, ngroups + values, result_dtype, labels, ngroups, skipna ) # Post-processing, replace sums that don't satisfy min_periods @@ -232,7 +241,7 @@ def grouped_sum( num_consecutive_same_value = consecutive_counts[lab] prev_value = prev_vals[lab] sum_x = output[lab] - if nobs >= min_periods: + if not np.isnan(sum_x) and nobs >= min_periods: if num_consecutive_same_value >= nobs: result = prev_value * nobs else: diff --git a/pandas/core/_numba/kernels/var_.py b/pandas/core/_numba/kernels/var_.py index 69aec4d6522c4..e4ebe26f75ed5 100644 --- a/pandas/core/_numba/kernels/var_.py +++ b/pandas/core/_numba/kernels/var_.py @@ -78,6 +78,7 @@ def sliding_var( start: np.ndarray, end: np.ndarray, min_periods: int, + skipna: bool = True, ddof: int = 1, ) -> tuple[np.ndarray, list[int]]: N = len(start) @@ -175,6 +176,7 @@ def grouped_var( labels: npt.NDArray[np.intp], ngroups: int, min_periods: int, + skipna: bool = True, ddof: int = 1, ) -> tuple[np.ndarray, list[int]]: N = len(labels) @@ -193,36 +195,41 @@ def grouped_var( if lab < 0: continue - mean_x = means[lab] - ssqdm_x = output[lab] - nobs = nobs_arr[lab] - compensation_add = comp_arr[lab] - num_consecutive_same_value = consecutive_counts[lab] - prev_value = prev_vals[lab] - - ( - nobs, - mean_x, - ssqdm_x, - compensation_add, - num_consecutive_same_value, - prev_value, - ) = add_var( - val, - nobs, - mean_x, - ssqdm_x, - compensation_add, - num_consecutive_same_value, - prev_value, - ) - - output[lab] = ssqdm_x - means[lab] = mean_x - consecutive_counts[lab] = num_consecutive_same_value - prev_vals[lab] = prev_value - comp_arr[lab] = compensation_add - nobs_arr[lab] = nobs + if not skipna and np.isnan(val): + output[lab] = val + nobs_arr[lab] = 0 + + else: + mean_x = means[lab] + ssqdm_x = output[lab] + nobs = nobs_arr[lab] + compensation_add = comp_arr[lab] + num_consecutive_same_value = consecutive_counts[lab] + prev_value = prev_vals[lab] + + ( + nobs, + mean_x, + ssqdm_x, + compensation_add, + num_consecutive_same_value, + prev_value, + ) = add_var( + val, + nobs, + mean_x, + ssqdm_x, + compensation_add, + num_consecutive_same_value, + prev_value, + ) + + output[lab] = ssqdm_x + means[lab] = mean_x + consecutive_counts[lab] = num_consecutive_same_value + prev_vals[lab] = prev_value + comp_arr[lab] = compensation_add + nobs_arr[lab] = nobs # Post-processing, replace vars that don't satisfy min_periods for lab in range(ngroups): diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index a17056b51a014..82839c2055be1 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -2299,16 
+2299,15 @@ def _str_contains( def _str_startswith(self, pat: str | tuple[str, ...], na=None) -> Self: if isinstance(pat, str): result = pc.starts_with(self._pa_array, pattern=pat) + elif len(pat) == 0: + # For empty tuple, pd.StringDtype() returns null for missing values + # and false for valid values. + result = pc.if_else(pc.is_null(self._pa_array), None, False) else: - if len(pat) == 0: - # For empty tuple, pd.StringDtype() returns null for missing values - # and false for valid values. - result = pc.if_else(pc.is_null(self._pa_array), None, False) - else: - result = pc.starts_with(self._pa_array, pattern=pat[0]) + result = pc.starts_with(self._pa_array, pattern=pat[0]) - for p in pat[1:]: - result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) + for p in pat[1:]: + result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p)) if not isna(na): result = result.fill_null(na) return type(self)(result) @@ -2316,16 +2315,15 @@ def _str_startswith(self, pat: str | tuple[str, ...], na=None) -> Self: def _str_endswith(self, pat: str | tuple[str, ...], na=None) -> Self: if isinstance(pat, str): result = pc.ends_with(self._pa_array, pattern=pat) + elif len(pat) == 0: + # For empty tuple, pd.StringDtype() returns null for missing values + # and false for valid values. + result = pc.if_else(pc.is_null(self._pa_array), None, False) else: - if len(pat) == 0: - # For empty tuple, pd.StringDtype() returns null for missing values - # and false for valid values. - result = pc.if_else(pc.is_null(self._pa_array), None, False) - else: - result = pc.ends_with(self._pa_array, pattern=pat[0]) + result = pc.ends_with(self._pa_array, pattern=pat[0]) - for p in pat[1:]: - result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) + for p in pat[1:]: + result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p)) if not isna(na): result = result.fill_null(na) return type(self)(result) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c07bc56377151..85a98156cba24 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -180,6 +180,9 @@ class providing the base-class of operations. The required number of valid values to perform the operation. If fewer than ``min_count`` non-NA values are present the result will be NA. +skipna : bool, default {sn} + Exclude NA/null values when computing the result. + Returns ------- Series or DataFrame @@ -206,6 +209,9 @@ class providing the base-class of operations. The required number of valid values to perform the operation. If fewer than ``min_count`` non-NA values are present the result will be NA. +skipna : bool, default {sn} + Exclude NA/null values when computing the result. + engine : str, default None {e} * ``'cython'`` : Runs rolling apply through C-extensions from cython. * ``'numba'`` : Runs rolling apply through JIT compiled code from numba. 
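The ``skipna : bool, default {sn}`` lines added above live in shared docstring templates; each aggregation fills the placeholders through pandas' ``@doc`` decorator (the ``sn=True`` keywords added to ``sum``, ``prod``, ``min`` and ``max`` later in this patch). A minimal sketch of that substitution, using a simplified stand-in rather than the real ``pandas.util._decorators.doc``:

```python
from textwrap import dedent

# A simplified stand-in for pandas' @doc decorator, shown only to illustrate
# how placeholders such as {sn} and {mc} in the shared templates are filled
# per method, so each aggregation advertises its own defaults without
# duplicating the parameter descriptions.
_agg_template = dedent(
    """\
    Compute {fname} of group values.

    Parameters
    ----------
    min_count : int, default {mc}
        The required number of valid values to perform the operation.
    skipna : bool, default {sn}
        Exclude NA/null values when computing the result.
    """
)


def doc(template: str, **kwargs):
    def decorator(func):
        # Substitute the per-method values into the shared template.
        func.__doc__ = template.format(**kwargs)
        return func

    return decorator


@doc(_agg_template, fname="sum", mc=0, sn=True)
def groupby_sum_stub():
    ...


print(groupby_sum_stub.__doc__)
```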
@@ -1398,6 +1404,7 @@ def _numba_agg_general( func: Callable, dtype_mapping: dict[np.dtype, Any], engine_kwargs: dict[str, bool] | None, + skipna: bool = True, **aggregator_kwargs, ): """ @@ -1416,6 +1423,7 @@ def _numba_agg_general( func, dtype_mapping, True, # is_grouped_kernel + skipna=skipna, **get_jit_arguments(engine_kwargs), ) # Pass group ids to kernel directly if it can handle it @@ -1760,6 +1768,7 @@ def _agg_general( numeric_only: bool = False, min_count: int = -1, *, + skipna: bool = True, alias: str, npfunc: Callable | None = None, **kwargs, @@ -1767,6 +1776,7 @@ def _agg_general( result = self._cython_agg_general( how=alias, alt=npfunc, + skipna=skipna, numeric_only=numeric_only, min_count=min_count, **kwargs, @@ -2236,6 +2246,7 @@ def hfunc(bvalues: ArrayLike) -> ArrayLike: def mean( self, numeric_only: bool = False, + skipna: bool = True, engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, ): @@ -2251,6 +2262,9 @@ def mean( numeric_only no longer accepts ``None`` and defaults to ``False``. + skipna : bool, default True + Exclude NA/null values when computing the result. + engine : str, default None * ``'cython'`` : Runs the operation through C-extensions from cython. * ``'numba'`` : Runs the operation through JIT compiled code from numba. @@ -2317,17 +2331,22 @@ def mean( executor.float_dtype_mapping, engine_kwargs, min_periods=0, + skipna=skipna, ) else: result = self._cython_agg_general( "mean", - alt=lambda x: Series(x, copy=False).mean(numeric_only=numeric_only), + alt=lambda x: Series(x, copy=False).mean( + numeric_only=numeric_only, + skipna=skipna, + ), numeric_only=numeric_only, + skipna=skipna, ) return result.__finalize__(self.obj, method="groupby") @final - def median(self, numeric_only: bool = False) -> NDFrameT: + def median(self, numeric_only: bool = False, skipna: bool = True) -> NDFrameT: """ Compute median of groups, excluding missing values. @@ -2342,6 +2361,9 @@ def median(self, numeric_only: bool = False) -> NDFrameT: numeric_only no longer accepts ``None`` and defaults to False. + skipna : bool, default True + Exclude NA/null values when computing the result. + Returns ------- Series or DataFrame @@ -2414,8 +2436,11 @@ def median(self, numeric_only: bool = False) -> NDFrameT: """ result = self._cython_agg_general( "median", - alt=lambda x: Series(x, copy=False).median(numeric_only=numeric_only), + alt=lambda x: Series(x, copy=False).median( + numeric_only=numeric_only, skipna=skipna + ), numeric_only=numeric_only, + skipna=skipna, ) return result.__finalize__(self.obj, method="groupby") @@ -2428,6 +2453,7 @@ def std( engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, numeric_only: bool = False, + skipna: bool = True, ): """ Compute standard deviation of groups, excluding missing values. @@ -2465,6 +2491,9 @@ def std( numeric_only now defaults to ``False``. + skipna : bool, default True + Exclude NA/null values when computing the result. 
+ Returns ------- Series or DataFrame @@ -2519,14 +2548,16 @@ def std( engine_kwargs, min_periods=0, ddof=ddof, + skipna=skipna, ) ) else: return self._cython_agg_general( "std", - alt=lambda x: Series(x, copy=False).std(ddof=ddof), + alt=lambda x: Series(x, copy=False).std(ddof=ddof, skipna=skipna), numeric_only=numeric_only, ddof=ddof, + skipna=skipna, ) @final @@ -2538,6 +2569,7 @@ def var( engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, numeric_only: bool = False, + skipna: bool = True, ): """ Compute variance of groups, excluding missing values. @@ -2575,6 +2607,9 @@ def var( numeric_only now defaults to ``False``. + skipna : bool, default True + Exclude NA/null values when computing the result. + Returns ------- Series or DataFrame @@ -2628,13 +2663,15 @@ def var( engine_kwargs, min_periods=0, ddof=ddof, + skipna=skipna, ) else: return self._cython_agg_general( "var", - alt=lambda x: Series(x, copy=False).var(ddof=ddof), + alt=lambda x: Series(x, copy=False).var(ddof=ddof, skipna=skipna), numeric_only=numeric_only, ddof=ddof, + skipna=skipna, ) @final @@ -2764,7 +2801,9 @@ def _value_counts( return result.__finalize__(self.obj, method="value_counts") @final - def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: + def sem( + self, ddof: int = 1, numeric_only: bool = False, skipna: bool = True + ) -> NDFrameT: """ Compute standard error of the mean of groups, excluding missing values. @@ -2784,6 +2823,9 @@ def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: numeric_only now defaults to ``False``. + skipna : bool, default True + Exclude NA/null values when computing the result. + Returns ------- Series or DataFrame @@ -2853,9 +2895,10 @@ def sem(self, ddof: int = 1, numeric_only: bool = False) -> NDFrameT: ) return self._cython_agg_general( "sem", - alt=lambda x: Series(x, copy=False).sem(ddof=ddof), + alt=lambda x: Series(x, copy=False).sem(ddof=ddof, skipna=skipna), numeric_only=numeric_only, ddof=ddof, + skipna=skipna, ) @final @@ -2960,6 +3003,7 @@ def size(self) -> DataFrame | Series: fname="sum", no=False, mc=0, + sn=True, e=None, ek=None, example=dedent( @@ -3001,6 +3045,7 @@ def sum( self, numeric_only: bool = False, min_count: int = 0, + skipna: bool = True, engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, ): @@ -3012,6 +3057,7 @@ def sum( executor.default_dtype_mapping, engine_kwargs, min_periods=min_count, + skipna=skipna, ) else: # If we are grouping on categoricals we want unobserved categories to @@ -3021,6 +3067,7 @@ def sum( result = self._agg_general( numeric_only=numeric_only, min_count=min_count, + skipna=skipna, alias="sum", npfunc=np.sum, ) @@ -3033,6 +3080,7 @@ def sum( fname="prod", no=False, mc=0, + sn=True, example=dedent( """\ For SeriesGroupBy: @@ -3068,9 +3116,15 @@ def sum( 2 30 72""" ), ) - def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT: + def prod( + self, numeric_only: bool = False, min_count: int = 0, skipna: bool = True + ) -> NDFrameT: return self._agg_general( - numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod + numeric_only=numeric_only, + min_count=min_count, + alias="prod", + npfunc=np.prod, + skipna=skipna, ) @final @@ -3079,6 +3133,7 @@ def prod(self, numeric_only: bool = False, min_count: int = 0) -> NDFrameT: fname="min", no=False, mc=-1, + sn=True, e=None, ek=None, example=dedent( @@ -3120,6 +3175,7 @@ def min( self, numeric_only: bool = False, min_count: int = 
-1, + skipna: bool = True, engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, ): @@ -3132,6 +3188,7 @@ def min( engine_kwargs, min_periods=min_count, is_max=False, + skipna=skipna, ) else: return self._agg_general( @@ -3139,6 +3196,7 @@ def min( min_count=min_count, alias="min", npfunc=np.min, + skipna=skipna, ) @final @@ -3147,6 +3205,7 @@ def min( fname="max", no=False, mc=-1, + sn=True, e=None, ek=None, example=dedent( @@ -3188,6 +3247,7 @@ def max( self, numeric_only: bool = False, min_count: int = -1, + skipna: bool = True, engine: Literal["cython", "numba"] | None = None, engine_kwargs: dict[str, bool] | None = None, ): @@ -3200,6 +3260,7 @@ def max( engine_kwargs, min_periods=min_count, is_max=True, + skipna=skipna, ) else: return self._agg_general( @@ -3207,6 +3268,7 @@ def max( min_count=min_count, alias="max", npfunc=np.max, + skipna=skipna, ) @final diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 8ee71ea2293e6..ee3996cd4d4b3 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -1011,6 +1011,9 @@ def sum( self, numeric_only: bool = False, min_count: int = 0, + skipna: bool = True, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, ): """ Compute sum of group values. @@ -1028,6 +1031,24 @@ def sum( The required number of valid values to perform the operation. If fewer than ``min_count`` non-NA values are present the result will be NA. + skipna : bool, default True + Exclude NA/null values when computing the result. + + engine : str, default None + * ``'cython'`` : Runs rolling apply through C-extensions from cython. + * ``'numba'`` : Runs rolling apply through JIT compiled code from numba. + Only available when ``raw`` is set to ``True``. + * ``None`` : Defaults to ``'cython'`` + or globally setting ``compute.use_numba`` + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be + applied to both the ``func`` and the ``apply`` groupby aggregation. + Returns ------- Series or DataFrame @@ -1052,14 +1073,17 @@ def sum( 2023-02-01 7 Freq: MS, dtype: int64 """ - return self._downsample("sum", numeric_only=numeric_only, min_count=min_count) + return self._downsample( + "sum", + numeric_only=numeric_only, + min_count=min_count, + skipna=skipna, + engine=engine, + engine_kwargs=engine_kwargs, + ) @final - def prod( - self, - numeric_only: bool = False, - min_count: int = 0, - ): + def prod(self, numeric_only: bool = False, min_count: int = 0, skipna: bool = True): """ Compute prod of group values. @@ -1076,6 +1100,9 @@ def prod( The required number of valid values to perform the operation. If fewer than ``min_count`` non-NA values are present the result will be NA. + skipna : bool, default True + Exclude NA/null values when computing the result. 
+ Returns ------- Series or DataFrame @@ -1100,13 +1127,18 @@ def prod( 2023-02-01 12 Freq: MS, dtype: int64 """ - return self._downsample("prod", numeric_only=numeric_only, min_count=min_count) + return self._downsample( + "prod", numeric_only=numeric_only, min_count=min_count, skipna=skipna + ) @final def min( self, numeric_only: bool = False, - min_count: int = 0, + min_count: int = -1, + skipna: bool = True, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, ): """ Compute min value of group. @@ -1134,13 +1166,23 @@ def min( 2023-02-01 3 Freq: MS, dtype: int64 """ - return self._downsample("min", numeric_only=numeric_only, min_count=min_count) + return self._downsample( + "min", + numeric_only=numeric_only, + min_count=min_count, + skipna=skipna, + engine=engine, + engine_kwargs=engine_kwargs, + ) @final def max( self, numeric_only: bool = False, - min_count: int = 0, + min_count: int = -1, + skipna: bool = True, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, ): """ Compute max value of group. @@ -1168,7 +1210,14 @@ def max( 2023-02-01 4 Freq: MS, dtype: int64 """ - return self._downsample("max", numeric_only=numeric_only, min_count=min_count) + return self._downsample( + "max", + numeric_only=numeric_only, + min_count=min_count, + skipna=skipna, + engine=engine, + engine_kwargs=engine_kwargs, + ) @final @doc(GroupBy.first) @@ -1196,13 +1245,20 @@ def last( @final @doc(GroupBy.median) - def median(self, numeric_only: bool = False): - return self._downsample("median", numeric_only=numeric_only) + def median( + self, + numeric_only: bool = False, + skipna: bool = True, + ): + return self._downsample("median", numeric_only=numeric_only, skipna=skipna) @final def mean( self, numeric_only: bool = False, + skipna: bool = True, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, ): """ Compute mean of groups, excluding missing values. @@ -1216,6 +1272,26 @@ def mean( numeric_only now defaults to ``False``. + skipna : bool, default True + Exclude NA/null values when computing the result. + + engine : str, default None + * ``'cython'`` : Runs the operation through C-extensions from cython. + * ``'numba'`` : Runs the operation through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + .. versionadded:: 1.4.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` + + .. versionadded:: 1.4.0 + Returns ------- DataFrame or Series @@ -1241,13 +1317,22 @@ def mean( 2023-02-01 3.5 Freq: MS, dtype: float64 """ - return self._downsample("mean", numeric_only=numeric_only) + return self._downsample( + "mean", + numeric_only=numeric_only, + skipna=skipna, + engine=engine, + engine_kwargs=engine_kwargs, + ) @final def std( self, ddof: int = 1, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, numeric_only: bool = False, + skipna: bool = True, ): """ Compute standard deviation of groups, excluding missing values. @@ -1256,6 +1341,24 @@ def std( ---------- ddof : int, default 1 Degrees of freedom. 
+ + engine : str, default None + * ``'cython'`` : Runs the operation through C-extensions from cython. + * ``'numba'`` : Runs the operation through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + .. versionadded:: 1.4.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` + + .. versionadded:: 1.4.0 + numeric_only : bool, default False Include only `float`, `int` or `boolean` data. @@ -1265,6 +1368,9 @@ def std( numeric_only now defaults to ``False``. + skipna : bool, default True + Exclude NA/null values when computing the result. + Returns ------- DataFrame or Series @@ -1291,13 +1397,23 @@ def std( 2023-02-01 2.645751 Freq: MS, dtype: float64 """ - return self._downsample("std", ddof=ddof, numeric_only=numeric_only) + return self._downsample( + "std", + ddof=ddof, + engine=engine, + engine_kwargs=engine_kwargs, + numeric_only=numeric_only, + skipna=skipna, + ) @final def var( self, ddof: int = 1, + engine: Literal["cython", "numba"] | None = None, + engine_kwargs: dict[str, bool] | None = None, numeric_only: bool = False, + skipna: bool = True, ): """ Compute variance of groups, excluding missing values. @@ -1307,6 +1423,23 @@ def var( ddof : int, default 1 Degrees of freedom. + engine : str, default None + * ``'cython'`` : Runs the operation through C-extensions from cython. + * ``'numba'`` : Runs the operation through JIT compiled code from numba. + * ``None`` : Defaults to ``'cython'`` or globally setting + ``compute.use_numba`` + + .. versionadded:: 1.4.0 + + engine_kwargs : dict, default None + * For ``'cython'`` engine, there are no accepted ``engine_kwargs`` + * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil`` + and ``parallel`` dictionary keys. The values must either be ``True`` or + ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is + ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` + + .. versionadded:: 1.4.0 + numeric_only : bool, default False Include only `float`, `int` or `boolean` data. @@ -1316,6 +1449,9 @@ def var( numeric_only now defaults to ``False``. + skipna : bool, default True + Exclude NA/null values when computing the result. 
+ Returns ------- DataFrame or Series @@ -1347,7 +1483,14 @@ def var( 2023-02-01 4.666667 Freq: MS, dtype: float64 """ - return self._downsample("var", ddof=ddof, numeric_only=numeric_only) + return self._downsample( + "var", + ddof=ddof, + engine=engine, + engine_kwargs=engine_kwargs, + numeric_only=numeric_only, + skipna=skipna, + ) @final @doc(GroupBy.sem) @@ -1355,8 +1498,11 @@ def sem( self, ddof: int = 1, numeric_only: bool = False, + skipna: bool = True, ): - return self._downsample("sem", ddof=ddof, numeric_only=numeric_only) + return self._downsample( + "sem", ddof=ddof, numeric_only=numeric_only, skipna=skipna + ) @final @doc(GroupBy.ohlc) diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py index 4b3431d938f96..e2bda9a221261 100644 --- a/pandas/tests/extension/base/reduce.py +++ b/pandas/tests/extension/base/reduce.py @@ -2,6 +2,8 @@ import pytest +from pandas.core.dtypes.common import is_numeric_dtype + import pandas as pd import pandas._testing as tm @@ -12,6 +14,9 @@ class BaseReduceTests: make sense for numeric/boolean operations. """ + def _supports_reduction_groupby(self, ser: pd.Series, op_name: str) -> bool: + return self._supports_reduction(ser, op_name) + def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: # Specify if we expect this reduction to succeed. return False @@ -76,6 +81,24 @@ def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool): tm.assert_extension_array_equal(result1, expected) + def check_reduce_groupby(self, ser: pd.Series, op_name: str, skipna: bool): + df = pd.DataFrame({"a": ser, "key": [1, 2] * (len(ser) // 2)}) + grp = df.groupby("key") + res1 = getattr(grp, op_name) + result = res1(skipna=skipna) + + if not skipna and ser.isna().any() and op_name != "skew": + expected = pd.DataFrame( + {"a": [pd.NA, pd.NA]}, index=pd.Index([1, 2], name="key") + ) + else: + expected = grp.apply( + lambda x: getattr(x.astype(ser.dtype), op_name)(skipna=skipna), + include_groups=False, + ) + + tm.assert_almost_equal(result, expected, check_dtype=False, atol=1e-6) + @pytest.mark.parametrize("skipna", [True, False]) def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna): op_name = all_boolean_reductions @@ -126,3 +149,19 @@ def test_reduce_frame(self, data, all_numeric_reductions, skipna): pytest.skip(f"Reduction {op_name} not supported for this dtype") self.check_reduce_frame(ser, op_name, skipna) + + @pytest.mark.parametrize("skipna", [True, False]) + def test_reduce_groupby_numeric(self, data, all_numeric_reductions, skipna): + op_name = all_numeric_reductions + ser = pd.Series(data) + + if not is_numeric_dtype(ser.dtype): + pytest.skip(f"{ser.dtype} is not numeric dtype") + + if op_name in ["count", "kurt", "sem"]: + pytest.skip(f"{op_name} not an array method") + + if not self._supports_reduction_groupby(ser, op_name): + pytest.skip(f"Reduction {op_name} not supported for this dtype") + + self.check_reduce_groupby(ser, op_name, skipna) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 070feb1fec4b9..d06b440a9dd67 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -76,6 +76,9 @@ def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: return False return True + def _supports_reduction_groupby(self, ser: pd.Series, op_name: str) -> bool: + return op_name not in ["std", "median", "skew"] + def check_reduce(self, ser: pd.Series, op_name: 
str, skipna: bool): if op_name == "count": return super().check_reduce(ser, op_name, skipna) diff --git a/pandas/tests/extension/test_arrow.py index dbf353d87178f..23a688f0d39ce 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -468,6 +468,18 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna, reques self.check_accumulate(ser, op_name, skipna) + def _supports_reduction_groupby(self, ser: pd.Series, op_name: str) -> bool: + dtype = ser.dtype + # error: Item "dtype[Any]" of "dtype[Any] | ExtensionDtype" has + # no attribute "pyarrow_dtype" + pa_dtype = dtype.pyarrow_dtype # type: ignore[union-attr] + if op_name == "skew": + return False + if pa.types.is_decimal(pa_dtype): + # Skip decimal types + return False + return self._supports_reduction(ser, op_name) + def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: if op_name in ["kurt", "skew"]: return False diff --git a/pandas/tests/extension/test_masked.py index 3b9079d06e231..8a41d78b0d28e 100644 --- a/pandas/tests/extension/test_masked.py +++ b/pandas/tests/extension/test_masked.py @@ -298,6 +298,27 @@ def check_reduce(self, ser: pd.Series, op_name: str, skipna: bool): expected = pd.NA tm.assert_almost_equal(result, expected) + def check_reduce_groupby(self, ser: pd.Series, op_name: str, skipna: bool): + df = pd.DataFrame({"a": ser, "key": [1, 2] * (len(ser) // 2)}) + grp = df.groupby("key") + if op_name == "prod" and skipna: + # A bare pytest.mark.xfail(...) is a no-op; skip before computing. + pytest.skip(f"{op_name} overflows") + res1 = getattr(grp, op_name) + result = res1(skipna=skipna) + + if not skipna and ser.isna().any() and op_name != "skew": + expected = pd.DataFrame( + {"a": [pd.NA, pd.NA]}, index=pd.Index([1, 2], name="key") + ) + else: + expected = grp.apply( + lambda x: getattr(x.astype(ser.dtype), op_name)(skipna=skipna), + include_groups=False, + ) + + tm.assert_almost_equal(result, expected, check_dtype=False, atol=1e-6) + def _get_expected_reduction_dtype(self, arr, op_name: str, skipna: bool): if is_float_dtype(arr.dtype): cmp_dtype = arr.dtype.name diff --git a/pandas/tests/extension/test_numpy.py index 79cfb736941d6..e45a291344131 100644 --- a/pandas/tests/extension/test_numpy.py +++ b/pandas/tests/extension/test_numpy.py @@ -304,6 +304,12 @@ def test_arith_frame_with_scalar(self, data, all_arithmetic_operators, request): self.frame_scalar_exc = frame_scalar_exc super().test_arith_frame_with_scalar(data, all_arithmetic_operators) + def _supports_reduction_groupby(self, ser: pd.Series, op_name: str) -> bool: + if op_name in ["std", "skew"] and ser.dtype == "float64": + return False + else: + return self._supports_reduction(ser, op_name) + def _supports_reduction(self, ser: pd.Series, op_name: str) -> bool: if ser.dtype.kind == "O": return op_name in ["sum", "min", "max", "any", "all"] diff --git a/pandas/tests/extension/test_sparse.py index 56c023d99bb1c..2e9906abd20d2 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -97,6 +97,9 @@ def data_for_compare(request): class TestSparseArray(base.ExtensionTests): + def _supports_reduction_groupby(self, ser: pd.Series, op_name: str) -> bool: + return False + def _supports_reduction(self, obj, op_name: str) -> bool: return True diff --git a/pandas/tests/groupby/test_api.py index 
33b39bad4ab81..635396a1227f2 100644 --- a/pandas/tests/groupby/test_api.py +++ b/pandas/tests/groupby/test_api.py @@ -173,13 +173,13 @@ def test_frame_consistency(groupby_func): elif groupby_func in ("nunique",): exclude_expected = {"axis"} elif groupby_func in ("max", "min"): - exclude_expected = {"axis", "kwargs", "skipna"} + exclude_expected = {"axis", "kwargs"} exclude_result = {"min_count", "engine", "engine_kwargs"} elif groupby_func in ("mean", "std", "sum", "var"): - exclude_expected = {"axis", "kwargs", "skipna"} + exclude_expected = {"axis", "kwargs"} exclude_result = {"engine", "engine_kwargs"} elif groupby_func in ("median", "prod", "sem"): - exclude_expected = {"axis", "kwargs", "skipna"} + exclude_expected = {"axis", "kwargs"} elif groupby_func in ("bfill", "ffill"): exclude_expected = {"inplace", "axis", "limit_area"} elif groupby_func in ("cummax", "cummin"): @@ -231,13 +231,13 @@ def test_series_consistency(request, groupby_func): if groupby_func in ("any", "all"): exclude_expected = {"kwargs", "bool_only", "axis"} elif groupby_func in ("max", "min"): - exclude_expected = {"axis", "kwargs", "skipna"} + exclude_expected = {"axis", "kwargs"} exclude_result = {"min_count", "engine", "engine_kwargs"} elif groupby_func in ("mean", "std", "sum", "var"): - exclude_expected = {"axis", "kwargs", "skipna"} + exclude_expected = {"axis", "kwargs"} exclude_result = {"engine", "engine_kwargs"} elif groupby_func in ("median", "prod", "sem"): - exclude_expected = {"axis", "kwargs", "skipna"} + exclude_expected = {"axis", "kwargs"} elif groupby_func in ("bfill", "ffill"): exclude_expected = {"inplace", "axis", "limit_area"} elif groupby_func in ("cummax", "cummin"): diff --git a/pandas/tests/groupby/test_numba.py b/pandas/tests/groupby/test_numba.py index 3e32031e51138..bf00436962cac 100644 --- a/pandas/tests/groupby/test_numba.py +++ b/pandas/tests/groupby/test_numba.py @@ -1,3 +1,4 @@ +import numpy as np import pytest from pandas import ( @@ -71,3 +72,23 @@ def test_no_engine_doesnt_raise(self): res = gb.agg({"b": "first"}) expected = gb.agg({"b": "first"}) tm.assert_frame_equal(res, expected) + + @pytest.mark.parametrize( + "numba_method", ["sum", "min", "max", "std", "var", "mean"] + ) + def test_skipna_numba(self, numba_method): + # GH15675 + df = DataFrame( + { + "l": ["A", "A", "A", "B", "B", "B"], + "float": [-1.0, 1.2, -1.1, 1.5, 1.0, np.nan], + } + ) + + result_numba = getattr(df.groupby("l").float, numba_method)( + skipna=False, engine="numba" + ) + expected = df.groupby("l").float.apply( + lambda x: getattr(x, numba_method)(skipna=False) + ) + tm.assert_series_equal(result_numba, expected, check_exact=False) diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index 8a421654cdf9b..75956767ec1be 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -1050,6 +1050,125 @@ def scipy_sem(*args, **kwargs): return sem(*args, ddof=1, **kwargs) +@pytest.mark.parametrize( + "reduction_method, values", + [ + ("sum", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), + ("sum", ["foo", "bar", "baz", "foo", pd.NA, "foo"]), + ("min", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), + ( + "min", + [ + Timestamp("2024-01-01"), + Timestamp("2024-01-02"), + Timestamp("2024-01-03"), + Timestamp("2024-01-07"), + Timestamp("2024-01-08"), + pd.NaT, + ], + ), + ( + "min", + [ + pd.Timedelta(days=1), + pd.Timedelta(days=2), + pd.Timedelta(days=3), + pd.Timedelta(days=7), + pd.Timedelta(days=8), + pd.NaT, + ], + ), + ("max", [-1.0, 1.2, 
-1.1, 1.5, np.nan, 1.0]), + ( + "max", + [ + Timestamp("2024-01-01"), + Timestamp("2024-01-02"), + Timestamp("2024-01-03"), + Timestamp("2024-01-07"), + Timestamp("2024-01-08"), + pd.NaT, + ], + ), + ( + "max", + [ + pd.Timedelta(days=1), + pd.Timedelta(days=2), + pd.Timedelta(days=3), + pd.Timedelta(days=7), + pd.Timedelta(days=8), + pd.NaT, + ], + ), + ("mean", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), + ( + "mean", + [ + Timestamp("2024-01-01"), + Timestamp("2024-01-02"), + Timestamp("2024-01-03"), + Timestamp("2024-01-07"), + Timestamp("2024-01-08"), + pd.NaT, + ], + ), + ( + "mean", + [ + pd.Timedelta(days=1), + pd.Timedelta(days=2), + pd.Timedelta(days=3), + pd.Timedelta(days=7), + pd.Timedelta(days=8), + pd.NaT, + ], + ), + ("median", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), + ( + "median", + [ + Timestamp("2024-01-01"), + Timestamp("2024-01-02"), + Timestamp("2024-01-03"), + Timestamp("2024-01-07"), + Timestamp("2024-01-08"), + pd.NaT, + ], + ), + ( + "median", + [ + pd.Timedelta(days=1), + pd.Timedelta(days=2), + pd.Timedelta(days=3), + pd.Timedelta(days=7), + pd.Timedelta(days=8), + pd.NaT, + ], + ), + ("prod", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), + ("sem", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), + ("std", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), + ("var", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), + ("any", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), + ("all", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), + ("skew", [-1.0, 1.2, -1.1, 1.5, np.nan, 1.0]), + ], +) +def test_skipna_reduction_ops_cython(reduction_method, values): + # GH15675 + # Testing the skipna parameter against possible datatypes + df = DataFrame({"key": [1, 1, 1, 2, 2, 2], "values": values}) + gb = df.groupby("key") + + result_cython = getattr(gb, reduction_method)(skipna=False) + expected = gb.apply( + lambda x: getattr(x, reduction_method)(skipna=False), include_groups=False + ) + tm.assert_frame_equal(result_cython, expected, check_exact=False, check_dtype=False) + + @pytest.mark.parametrize( "op,targop", [