Skip to content

Commit 95701d4

Browse files
committed
Support reduction-specific kwargs in finalize
E.g. ddof for var, std
1 parent: ec44bb5 · commit: 95701d4

File tree

4 files changed

+73
-26
lines changed

4 files changed

+73
-26
lines changed

dask_groupby/aggregations.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,7 @@ def nansum_of_squares(group_idx, array, size=None, fill_value=None):
147147
# TODO: fix this for complex numbers
148148
def _var_finalize(sumsq, sum_, count, ddof=0):
149149
result = (sumsq - (sum_ ** 2 / count)) / (count - ddof)
150-
result[(count - ddof) <= 0] = np.nan
150+
result[count <= ddof] = np.nan
151151
return result
152152

153153

dask_groupby/core.py

Lines changed: 45 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -413,6 +413,7 @@ def chunk_reduce(
413413
reindex: bool = False,
414414
isbin: bool = False,
415415
backend: str = "numpy",
416+
kwargs=None,
416417
) -> IntermediateDict:
417418
"""
418419
Wrapper for numpy_groupies aggregate that supports nD ``array`` and
@@ -458,6 +459,9 @@ def chunk_reduce(
458459
if not isinstance(fill_value, Sequence):
459460
fill_value = (fill_value,)
460461

462+
if kwargs is None:
463+
kwargs = ({},) * len(func)
464+
461465
# when axis is a tuple
462466
# collapse and move reduction dimensions to the end
463467
if isinstance(axis, Sequence) and len(axis) < by.ndim:
@@ -503,7 +507,7 @@ def chunk_reduce(
503507
final_array_shape += results["groups"].shape
504508
final_groups_shape += results["groups"].shape
505509

506-
for reduction, fv in zip(func, fill_value):
510+
for reduction, fv, kw in zip(func, fill_value, kwargs):
507511
if empty:
508512
result = np.full(shape=final_array_shape, fill_value=fv)
509513
else:
@@ -516,6 +520,7 @@ def chunk_reduce(
516520
size=size,
517521
# important when reducing with "offset" groups
518522
fill_value=fv,
523+
**kw,
519524
)
520525
else:
521526
result = _get_aggregate(backend)(
@@ -527,6 +532,7 @@ def chunk_reduce(
527532
# important when reducing with "offset" groups
528533
fill_value=fv,
529534
dtype=np.intp if reduction == "nanlen" else dtype,
535+
**kw,
530536
)
531537
if np.any(~mask):
532538
# remove NaN group label which should be last
@@ -573,6 +579,7 @@ def _finalize_results(
573579
expected_groups: Union[Sequence, np.ndarray, None],
574580
fill_value: Any,
575581
min_count: Optional[int] = None,
582+
finalize_kwargs: Optional[Mapping] = None,
576583
):
577584
"""Finalize results by
578585
1. Squeezing out dummy dimensions
@@ -595,10 +602,11 @@ def _finalize_results(
595602
if fill_value is not None:
596603
counts = squeezed["intermediates"][-1]
597604
squeezed["intermediates"] = squeezed["intermediates"][:-1]
598-
599605
if min_count is None:
600606
min_count = 1
601-
result[agg.name] = agg.finalize(*squeezed["intermediates"])
607+
if finalize_kwargs is None:
608+
finalize_kwargs = {}
609+
result[agg.name] = agg.finalize(*squeezed["intermediates"], **finalize_kwargs)
602610
result[agg.name] = np.where(counts >= min_count, result[agg.name], fill_value)
603611

604612
# Final reindexing has to be here to be lazy
@@ -621,10 +629,13 @@ def _npg_aggregate(
621629
fill_value: Any = None,
622630
min_count: Optional[int] = None,
623631
backend: str = "numpy",
632+
finalize_kwargs: Optional[Mapping] = None,
624633
) -> FinalResultsDict:
625634
"""Final aggregation step of tree reduction"""
626635
results = _npg_combine(x_chunk, agg, axis, keepdims, group_ndim, backend)
627-
return _finalize_results(results, agg, axis, expected_groups, fill_value, min_count)
636+
return _finalize_results(
637+
results, agg, axis, expected_groups, fill_value, min_count, finalize_kwargs
638+
)
628639

629640

630641
def _npg_combine(
@@ -782,6 +793,7 @@ def groupby_agg(
782793
min_count: Optional[int] = None,
783794
isbin: bool = False,
784795
backend: str = "numpy",
796+
finalize_kwargs: Optional[Mapping] = None,
785797
) -> Tuple["DaskArray", Union[np.ndarray, "DaskArray"]]:
786798

787799
import dask.array
@@ -851,6 +863,14 @@ def groupby_agg(
851863
group_chunks = (len(expected_groups),) if expected_groups is not None else (np.nan,)
852864
expected_agg = expected_groups
853865

866+
agg_kwargs = dict(
867+
group_ndim=by.ndim,
868+
fill_value=fill_value,
869+
min_count=min_count,
870+
backend=backend,
871+
finalize_kwargs=finalize_kwargs,
872+
)
873+
854874
if method == "mapreduce":
855875
# reduced is really a dict mapping reduction name to array
856876
# and "groups" to an array of group labels
@@ -862,10 +882,7 @@ def groupby_agg(
862882
_npg_aggregate,
863883
agg=agg,
864884
expected_groups=expected_agg,
865-
group_ndim=by.ndim,
866-
fill_value=fill_value,
867-
min_count=min_count,
868-
backend=backend,
885+
**agg_kwargs,
869886
),
870887
combine=partial(_npg_combine, agg=agg, group_ndim=by.ndim, backend=backend),
871888
name=f"{name}-reduce",
@@ -892,10 +909,7 @@ def groupby_agg(
892909
_npg_aggregate,
893910
agg=agg,
894911
expected_groups=None,
895-
group_ndim=by.ndim,
896-
fill_value=fill_value,
897-
min_count=min_count,
898-
backend=backend,
912+
**agg_kwargs,
899913
axis=axis,
900914
keepdims=True,
901915
),
@@ -982,6 +996,7 @@ def groupby_reduce(
982996
split_out: int = 1,
983997
method: str = "mapreduce",
984998
backend: str = "numpy",
999+
finalize_kwargs: Optional[Mapping] = None,
9851000
) -> Tuple["DaskArray", Union[np.ndarray, "DaskArray"]]:
9861001
"""
9871002
GroupBy reductions using tree reductions for dask.array
@@ -1026,6 +1041,8 @@ def groupby_reduce(
10261041
chunking ``array`` for this method by first rechunking using ``rechunk_for_cohorts``.
10271042
backend: {"numpy", "numba"}, optional
10281043
Backend for numpy_groupies. numpy by default.
1044+
finalize_kwargs: Mapping, optional
1045+
Kwargs passed to finalize the reduction such as ddof for var, std.
10291046
10301047
Returns
10311048
-------
@@ -1112,18 +1129,25 @@ def groupby_reduce(
11121129
reduction.finalize = None
11131130
# xarray's count is npg's nanlen
11141131
func = reduction.name if reduction.name != "count" else "nanlen"
1115-
if min_count is not None:
1132+
if finalize_kwargs is None:
1133+
finalize_kwargs = {}
1134+
if isinstance(finalize_kwargs, Mapping):
1135+
finalize_kwargs = (finalize_kwargs,)
1136+
append_nanlen = min_count is not None or reduction.name in ["nanvar", "nanstd"]
1137+
if append_nanlen:
11161138
func = (func, "nanlen")
1139+
finalize_kwargs = finalize_kwargs + ({},)
11171140

11181141
results = chunk_reduce(
11191142
array,
11201143
by,
11211144
func=func,
11221145
axis=axis,
11231146
expected_groups=expected_groups if isbin else None,
1124-
fill_value=(fill_value, 0) if min_count is not None else fill_value,
1147+
fill_value=(fill_value, 0) if append_nanlen else fill_value,
11251148
dtype=reduction.dtype,
11261149
isbin=isbin,
1150+
kwargs=finalize_kwargs,
11271151
) # type: ignore
11281152

11291153
if reduction.name in ["argmin", "argmax", "nanargmax", "nanargmin"]:
@@ -1133,6 +1157,12 @@ def groupby_reduce(
11331157
results["intermediates"][0] = np.unravel_index(
11341158
results["intermediates"][0], array.shape
11351159
)[-1]
1160+
elif reduction.name in ["nanvar", "nanstd"]:
1161+
# Fix npg bug where all-NaN rows are 0 instead of NaN
1162+
value, counts = results["intermediates"]
1163+
mask = counts <= 0
1164+
value[mask] = np.nan
1165+
results["intermediates"] = (value,)
11361166

11371167
if isbin:
11381168
expected_groups = np.arange(len(expected_groups) - 1)
@@ -1167,6 +1197,7 @@ def groupby_reduce(
11671197
min_count=min_count,
11681198
isbin=isbin,
11691199
backend=backend,
1200+
finalize_kwargs=finalize_kwargs,
11701201
)
11711202
if method == "cohorts":
11721203
assert len(axis) == 1

dask_groupby/xarray.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ def xarray_reduce(
6262
keep_attrs: bool = True,
6363
skipna: bool = True,
6464
min_count: Optional[int] = None,
65+
**finalize_kwargs,
6566
):
6667
"""GroupBy reduce operations on xarray objects using numpy-groupies
6768
@@ -116,6 +117,8 @@ def xarray_reduce(
116117
min_count: int, optional
117118
NaN out when number of non-NaN values in aggregation is < min_count
118119
Only applies to nansum, nanprod.
120+
finalize_kwargs: dict, optional
121+
kwargs passed to the finalize function, like ddof for var, std.
119122
120123
Raises
121124
------
@@ -291,6 +294,7 @@ def wrapper(*args, **kwargs):
291294
# from "by" so we need the isbin part of the condition
292295
"expected_groups": expected_groups[0] if len(by) == 1 and isbin[0] else None,
293296
"isbin": isbin[0] if len(by) == 1 else False,
297+
"finalize_kwargs": finalize_kwargs,
294298
},
295299
)
296300

tests/test_core.py

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,8 @@ def test_groupby_reduce(
114114
pytest.param("nanargmin", marks=(pytest.mark.xfail,)),
115115
"any",
116116
"all",
117+
pytest.param("median", marks=(pytest.mark.skip,)),
118+
pytest.param("nanmedian", marks=(pytest.mark.skip,)),
117119
),
118120
)
119121
def test_groupby_reduce_all(size, func, backend):
@@ -128,23 +130,33 @@ def test_groupby_reduce_all(size, func, backend):
128130
if func in ["any", "all"]:
129131
array = array > 0.5
130132

131-
with np.errstate(invalid="ignore", divide="ignore"):
132-
expected = getattr(np, func)(array, axis=-1)
133-
expected = np.expand_dims(expected, -1)
133+
finalize_kwargs = tuple({})
134+
if "var" in func or "std" in func:
135+
finalize_kwargs = finalize_kwargs + ({"ddof": 1}, {"ddof": 0})
134136

135-
actual, _ = groupby_reduce(array, by, func=func, backend=backend)
136-
if "arg" in func:
137-
assert actual.dtype.kind == "i"
138-
assert_equal(actual, expected)
137+
for kwargs in finalize_kwargs:
138+
with np.errstate(invalid="ignore", divide="ignore"):
139+
expected = getattr(np, func)(array, axis=-1, **kwargs)
140+
expected = np.expand_dims(expected, -1)
139141

140-
for method in ["mapreduce", "cohorts"]:
141-
actual, _ = groupby_reduce(
142-
da.from_array(array, chunks=3), by, func=func, method=method, backend=backend
143-
)
142+
actual, _ = groupby_reduce(array, by, func=func, backend=backend, finalize_kwargs=kwargs)
144143
if "arg" in func:
145144
assert actual.dtype.kind == "i"
146145
assert_equal(actual, expected)
147146

147+
for method in ["mapreduce", "cohorts"]:
148+
actual, _ = groupby_reduce(
149+
da.from_array(array, chunks=3),
150+
by,
151+
func=func,
152+
method=method,
153+
backend=backend,
154+
finalize_kwargs=kwargs,
155+
)
156+
if "arg" in func:
157+
assert actual.dtype.kind == "i"
158+
assert_equal(actual, expected)
159+
148160

149161
@pytest.mark.parametrize("size", ((12,), (12, 5)))
150162
@pytest.mark.parametrize("func", ("argmax", "nanargmax", "argmin", "nanargmin"))

0 commit comments

Comments (0)