Skip to content

ENH: Enabled skipna argument on groupby reduction ops #15675 #58844

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
e13428b
ENH: Enabled skipna argument on groupby reduction ops (#15675)
andremcorreia May 27, 2024
8099d84
FIX: fixed pipeline issues related to docs and window tests
tiago-firmino May 27, 2024
8f61fda
Fix: pre-commit
andremcorreia May 27, 2024
2518696
Reworked suggestions
andremcorreia Jun 2, 2024
5cd994c
Reworked documentation
tiago-firmino Jun 2, 2024
8ae0caf
FIX: resample redefinition
tiago-firmino Jun 2, 2024
5e3a965
FIX: Small tweaks in docs
andremcorreia Jun 2, 2024
c692076
Refactored test parameterization
andremcorreia Jun 9, 2024
e87e030
Added tests for EAs
tiago-firmino Jun 11, 2024
4f11dab
Removed Arrow support
andremcorreia Jun 11, 2024
edbb331
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jun 11, 2024
3d719b8
pre-commit fix
andremcorreia Jun 11, 2024
c2ceb57
Merge branch 'add_skipna_on_groupby_ops_pr' of github.com:andremcorre…
andremcorreia Jun 11, 2024
2856c6d
WIP EAs support
tiago-firmino Jun 14, 2024
c200177
Extension Array Support Tests
andremcorreia Jul 4, 2024
66e0ee4
Merge branch 'main' into add_skipna_on_groupby_ops_pr
andremcorreia Jul 4, 2024
262ca97
WIP: Fixing Tests
andremcorreia Jul 4, 2024
91bb3c3
WIP: 32bit fix
andremcorreia Jul 4, 2024
7ee07d1
WIP: overflow
tiago-firmino Aug 3, 2024
bae5217
Merge branch 'main' into add_skipna_on_groupby_ops_pr
andremcorreia Aug 4, 2024
5a004cf
Fix tests 32bit
andremcorreia Aug 4, 2024
0ef070c
small tweaks
andremcorreia Aug 4, 2024
d02b308
simpler test skipping approach
andremcorreia Aug 4, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,13 @@ Other enhancements
- Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`)
- :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`)
- :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`)
- :meth:`.DataFrameGroupBy.sum`, :meth:`.DataFrameGroupBy.prod`, :meth:`.DataFrameGroupBy.min`, :meth:`.DataFrameGroupBy.max`, :meth:`.DataFrameGroupBy.mean`, :meth:`.DataFrameGroupBy.median`, :meth:`.DataFrameGroupBy.sem`, :meth:`.DataFrameGroupBy.std` and :meth:`.DataFrameGroupBy.var` now accept a ``skipna`` argument (:issue:`15675`)
- :meth:`DataFrame.corrwith` now accepts ``min_periods`` as optional arguments, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`)
- :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`)
- :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`)
- :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`)
- :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
- Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`)
-

.. ---------------------------------------------------------------------------
.. _whatsnew_300.notable_bug_fixes:
Expand Down
7 changes: 7 additions & 0 deletions pandas/_libs/groupby.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ def group_median_float64(
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
is_datetimelike: bool = ..., # bint
skipna: bool = ..., # bint
) -> None: ...
def group_cumprod(
out: np.ndarray, # float64_t[:, ::1]
Expand Down Expand Up @@ -66,6 +67,7 @@ def group_sum(
result_mask: np.ndarray | None = ...,
min_count: int = ...,
is_datetimelike: bool = ...,
skipna: bool = ..., # bint
) -> None: ...
def group_prod(
out: np.ndarray, # int64float_t[:, ::1]
Expand All @@ -75,6 +77,7 @@ def group_prod(
mask: np.ndarray | None,
result_mask: np.ndarray | None = ...,
min_count: int = ...,
skipna: bool = ..., # bint
) -> None: ...
def group_var(
out: np.ndarray, # floating[:, ::1]
Expand All @@ -86,6 +89,7 @@ def group_var(
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
is_datetimelike: bool = ...,
skipna: bool = ..., # bint
name: str = ...,
) -> None: ...
def group_skew(
Expand All @@ -104,6 +108,7 @@ def group_mean(
labels: np.ndarray, # const intp_t[:]
min_count: int = ..., # Py_ssize_t
is_datetimelike: bool = ..., # bint
skipna: bool = ..., # bint
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
) -> None: ...
Expand Down Expand Up @@ -170,6 +175,7 @@ def group_max(
labels: np.ndarray, # const int64_t[:]
min_count: int = ...,
is_datetimelike: bool = ...,
skipna: bool = ..., # bint
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
) -> None: ...
Expand All @@ -180,6 +186,7 @@ def group_min(
labels: np.ndarray, # const int64_t[:]
min_count: int = ...,
is_datetimelike: bool = ...,
skipna: bool = ..., # bint
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
) -> None: ...
Expand Down
106 changes: 76 additions & 30 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,8 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept n
cdef float64_t median_linear(
float64_t* a,
int n,
bint is_datetimelike=False
bint is_datetimelike=False,
bint skipna=True
) noexcept nogil:
cdef:
int i, j, na_count = 0
Expand All @@ -118,10 +119,14 @@ cdef float64_t median_linear(
if is_datetimelike:
for i in range(n):
if a[i] == NPY_NAT:
if not skipna:
return NaN
na_count += 1
else:
for i in range(n):
if a[i] != a[i]:
if not skipna:
return NaN
na_count += 1

if na_count:
Expand Down Expand Up @@ -186,6 +191,7 @@ def group_median_float64(
const uint8_t[:, :] mask=None,
uint8_t[:, ::1] result_mask=None,
bint is_datetimelike=False,
bint skipna=True,
) -> None:
"""
Only aggregates on axis=0
Expand Down Expand Up @@ -244,7 +250,7 @@ def group_median_float64(
ptr += _counts[0]
for j in range(ngroups):
size = _counts[j + 1]
out[j, i] = median_linear(ptr, size, is_datetimelike)
out[j, i] = median_linear(ptr, size, is_datetimelike, skipna)
ptr += size


Expand Down Expand Up @@ -694,6 +700,7 @@ def group_sum(
uint8_t[:, ::1] result_mask=None,
Py_ssize_t min_count=0,
bint is_datetimelike=False,
bint skipna=True,
) -> None:
"""
Only aggregates on axis=0 using Kahan summation
Expand Down Expand Up @@ -728,37 +735,47 @@ def group_sum(
for j in range(K):
val = values[i, j]

if _treat_as_na(sumx[lab, j], is_datetimelike):
continue

if uses_mask:
isna_entry = mask[i, j]
else:
isna_entry = _treat_as_na(val, is_datetimelike)

if not isna_entry:
nobs[lab, j] += 1
if isna_entry:
if skipna:
continue
else:
sumx[lab, j] = val
compensation[lab, j] = 0
continue

if sum_t is object:
# NB: this does not use 'compensation' like the non-object
# track does.
if nobs[lab, j] == 1:
# i.e. we haven't added anything yet; avoid TypeError
# if e.g. val is a str and sumx[lab, j] is 0
t = val
else:
t = sumx[lab, j] + val
sumx[lab, j] = t
nobs[lab, j] += 1

if sum_t is object:
# NB: this does not use 'compensation' like the non-object
# track does.
if nobs[lab, j] == 1:
# i.e. we haven't added anything yet; avoid TypeError
# if e.g. val is a str and sumx[lab, j] is 0
t = val
else:
y = val - compensation[lab, j]
t = sumx[lab, j] + y
compensation[lab, j] = t - sumx[lab, j] - y
if compensation[lab, j] != compensation[lab, j]:
# GH#53606
# If val is +/- infinity compensation is NaN
# which would lead to results being NaN instead
# of +/- infinity. We cannot use util.is_nan
# because of no gil
compensation[lab, j] = 0
sumx[lab, j] = t
t = sumx[lab, j] + val
sumx[lab, j] = t

else:
y = val - compensation[lab, j]
t = sumx[lab, j] + y
compensation[lab, j] = t - sumx[lab, j] - y
if compensation[lab, j] != compensation[lab, j]:
# GH#53606
# If val is +/- infinity compensation is NaN
# which would lead to results being NaN instead
# of +/- infinity. We cannot use util.is_nan
# because of no gil
compensation[lab, j] = 0
sumx[lab, j] = t

_check_below_mincount(
out, uses_mask, result_mask, ncounts, K, nobs, min_count, sumx
Expand All @@ -775,6 +792,7 @@ def group_prod(
const uint8_t[:, ::1] mask,
uint8_t[:, ::1] result_mask=None,
Py_ssize_t min_count=0,
bint skipna=True,
) -> None:
"""
Only aggregates on axis=0
Expand Down Expand Up @@ -813,6 +831,10 @@ def group_prod(
if not isna_entry:
nobs[lab, j] += 1
prodx[lab, j] *= val
elif not skipna:
prodx[lab, j] = val
nobs[lab, j] = 0
continue

_check_below_mincount(
out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx
Expand All @@ -832,6 +854,7 @@ def group_var(
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
bint is_datetimelike=False,
bint skipna=True,
str name="var",
) -> None:
cdef:
Expand Down Expand Up @@ -877,7 +900,12 @@ def group_var(
else:
isna_entry = _treat_as_na(val, is_datetimelike)

if not isna_entry:
if not skipna and isna_entry:
out[lab, j] = val
nobs[lab, j] = 0
continue

elif not isna_entry:
nobs[lab, j] += 1
oldmean = mean[lab, j]
mean[lab, j] += (val - oldmean) / nobs[lab, j]
Expand Down Expand Up @@ -998,6 +1026,7 @@ def group_mean(
const intp_t[::1] labels,
Py_ssize_t min_count=-1,
bint is_datetimelike=False,
bint skipna=True,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
) -> None:
Expand All @@ -1021,6 +1050,8 @@ def group_mean(
Only used in sum and prod. Always -1.
is_datetimelike : bool
True if `values` contains datetime-like entries.
skipna : bool, default True
Exclude NA/null values when computing the result.
mask : ndarray[bool, ndim=2], optional
Mask of the input values.
result_mask : ndarray[bool, ndim=2], optional
Expand Down Expand Up @@ -1078,7 +1109,12 @@ def group_mean(
else:
isna_entry = _treat_as_na(val, is_datetimelike)

if not isna_entry:
if not skipna and isna_entry:
sumx[lab, j] = val
nobs[lab, j] = 0
continue

elif not isna_entry:
nobs[lab, j] += 1
y = val - compensation[lab, j]
t = sumx[lab, j] + y
Expand All @@ -1096,12 +1132,10 @@ def group_mean(
for j in range(K):
count = nobs[i, j]
if nobs[i, j] == 0:

if uses_mask:
result_mask[i, j] = True
else:
out[i, j] = nan_val

else:
out[i, j] = sumx[i, j] / count

Expand Down Expand Up @@ -1660,6 +1694,7 @@ cdef group_min_max(
Py_ssize_t min_count=-1,
bint is_datetimelike=False,
bint compute_max=True,
bint skipna=True,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
):
Expand All @@ -1683,6 +1718,8 @@ cdef group_min_max(
True if `values` contains datetime-like entries.
compute_max : bint, default True
True to compute group-wise max, False to compute min
skipna : bool, default True
Exclude NA/null values when computing the result.
mask : ndarray[bool, ndim=2], optional
If not None, indices represent missing values,
otherwise the mask will not be used
Expand Down Expand Up @@ -1729,7 +1766,12 @@ cdef group_min_max(
else:
isna_entry = _treat_as_na(val, is_datetimelike)

if not isna_entry:
if not skipna and isna_entry:
group_min_or_max[lab, j] = val
nobs[lab, j] = 0
continue

elif not isna_entry:
nobs[lab, j] += 1
if compute_max:
if val > group_min_or_max[lab, j]:
Expand Down Expand Up @@ -1866,6 +1908,7 @@ def group_max(
const intp_t[::1] labels,
Py_ssize_t min_count=-1,
bint is_datetimelike=False,
bint skipna=True,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
) -> None:
Expand All @@ -1880,6 +1923,7 @@ def group_max(
compute_max=True,
mask=mask,
result_mask=result_mask,
skipna=skipna,
)


Expand All @@ -1892,6 +1936,7 @@ def group_min(
const intp_t[::1] labels,
Py_ssize_t min_count=-1,
bint is_datetimelike=False,
bint skipna=True,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
) -> None:
Expand All @@ -1906,6 +1951,7 @@ def group_min(
compute_max=False,
mask=mask,
result_mask=result_mask,
skipna=skipna,
)


Expand Down
15 changes: 11 additions & 4 deletions pandas/core/_numba/executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,13 +69,20 @@ def column_looper(
labels: np.ndarray,
ngroups: int,
min_periods: int,
skipna: bool = True,
*args,
):
result = np.empty((values.shape[0], ngroups), dtype=result_dtype)
na_positions = {}
for i in numba.prange(values.shape[0]):
output, na_pos = func(
values[i], result_dtype, labels, ngroups, min_periods, *args
values[i],
result_dtype,
labels,
ngroups,
min_periods,
skipna,
*args,
)
result[i] = output
if len(na_pos) > 0:
Expand Down Expand Up @@ -162,6 +169,7 @@ def generate_shared_aggregator(
nopython: bool,
nogil: bool,
parallel: bool,
skipna: bool = True,
):
"""
Generate a Numba function that loops over the columns 2D object and applies
Expand Down Expand Up @@ -190,7 +198,6 @@ def generate_shared_aggregator(
-------
Numba function
"""

# A wrapper around the looper function,
# to dispatch based on dtype since numba is unable to do that in nopython mode

Expand All @@ -214,11 +221,11 @@ def looper_wrapper(
# Need to unpack kwargs since numba only supports *args
if is_grouped_kernel:
result, na_positions = column_looper(
values, labels, ngroups, min_periods, *kwargs.values()
values, labels, ngroups, min_periods, skipna, *kwargs.values()
)
else:
result, na_positions = column_looper(
values, start, end, min_periods, *kwargs.values()
values, start, end, min_periods, skipna, *kwargs.values()
)
if result.dtype.kind == "i":
# Look if na_positions is not empty
Expand Down
Loading