WIP: support skipna=False in groupby reductions for extension arrays (EAs)

tiago-firmino · tiago-firmino · commit 558bc25b1d3e · 2024-06-14T23:32:17.000+01:00
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -62,7 +62,12 @@ cdef enum InterpolationEnumType:
     INTERPOLATION_MIDPOINT
 
 
-cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept nogil:
+cdef float64_t median_linear_mask(
+    float64_t* a,
+    int n,
+    uint8_t* mask,
+    bint skipna=True
+) noexcept nogil:
     cdef:
         int i, j, na_count = 0
         float64_t* tmp
@@ -74,6 +79,8 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept n
     # count NAs
     for i in range(n):
         if mask[i]:
+            if not skipna:
+                return NaN
             na_count += 1
 
     if na_count:
@@ -235,7 +242,7 @@ def group_median_float64(
 
                 for j in range(ngroups):
                     size = _counts[j + 1]
-                    result = median_linear_mask(ptr, size, ptr_mask)
+                    result = median_linear_mask(ptr, size, ptr_mask, skipna)
                     out[j, i] = result
 
                     if result != result:
@@ -739,6 +746,8 @@ def group_sum(
                     continue
 
                 if uses_mask:
+                    if result_mask[lab, j]:
+                        continue
                     isna_entry = mask[i, j]
                 else:
                     isna_entry = _treat_as_na(val, is_datetimelike)
@@ -747,7 +756,10 @@ def group_sum(
                     if skipna:
                         continue
                     else:
-                        sumx[lab, j] = val
+                        if uses_mask:
+                            result_mask[lab, j] = True
+                        else:
+                            sumx[lab, j] = val
                         compensation[lab, j] = 0
                         continue
 
@@ -824,6 +836,8 @@ def group_prod(
                 val = values[i, j]
 
                 if uses_mask:
+                    if result_mask[lab, j]:
+                        continue
                     isna_entry = mask[i, j]
                 else:
                     isna_entry = _treat_as_na(val, False)
@@ -832,7 +846,10 @@ def group_prod(
                     nobs[lab, j] += 1
                     prodx[lab, j] *= val
                 elif not skipna:
-                    prodx[lab, j] = val
+                    if uses_mask:
+                        result_mask[lab, j] = True
+                    else:
+                        prodx[lab, j] = val
                     nobs[lab, j] = 0
                     continue
 
@@ -891,6 +908,8 @@ def group_var(
                 val = values[i, j]
 
                 if uses_mask:
+                    if result_mask[lab, j]:
+                        continue
                     isna_entry = mask[i, j]
                 elif is_datetimelike:
                     # With group_var, we cannot just use _treat_as_na bc
@@ -901,7 +920,10 @@ def group_var(
                     isna_entry = _treat_as_na(val, is_datetimelike)
 
                 if not skipna and isna_entry:
-                    out[lab, j] = val
+                    if uses_mask:
+                        result_mask[lab, j] = True
+                    else:
+                        out[lab, j] = val
                     nobs[lab, j] = 0
                     continue
 
@@ -1100,6 +1122,8 @@ def group_mean(
                 val = values[i, j]
 
                 if uses_mask:
+                    if result_mask[lab, j]:
+                        continue
                     isna_entry = mask[i, j]
                 elif is_datetimelike:
                     # With group_mean, we cannot just use _treat_as_na bc
@@ -1110,7 +1134,10 @@ def group_mean(
                     isna_entry = _treat_as_na(val, is_datetimelike)
 
                 if not skipna and isna_entry:
-                    sumx[lab, j] = val
+                    if uses_mask:
+                        result_mask[lab, j] = True
+                    else:
+                        sumx[lab, j] = val
                     nobs[lab, j] = 0
                     continue
 
@@ -1762,12 +1789,17 @@ cdef group_min_max(
                 val = values[i, j]
 
                 if uses_mask:
+                    if result_mask[lab, j]:
+                        continue
                     isna_entry = mask[i, j]
                 else:
                     isna_entry = _treat_as_na(val, is_datetimelike)
 
                 if not skipna and isna_entry:
-                    group_min_or_max[lab, j] = val
+                    if uses_mask:
+                        result_mask[lab, j] = True
+                    else:
+                        group_min_or_max[lab, j] = val
                     nobs[lab, j] = 0
                     continue
 
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -2237,12 +2237,6 @@ def _groupby_op(
         ids: npt.NDArray[np.intp],
         **kwargs,
     ):
-        if how in ["sum", "prod", "mean", "median", "var", "sem", "std", "nim", "max"]:
-            if "skipna" in kwargs and not kwargs["skipna"]:
-                raise NotImplementedError(
-                    f"method '{how}' with skipna=False not implemented for Arrow dtypes"
-                )
-
         if isinstance(self.dtype, StringDtype):
             return super()._groupby_op(
                 how=how,
@@ -2308,33 +2302,31 @@ def _str_contains(
     def _str_startswith(self, pat: str | tuple[str, ...], na=None) -> Self:
         if isinstance(pat, str):
             result = pc.starts_with(self._pa_array, pattern=pat)
+        elif len(pat) == 0:
+            # For empty tuple, pd.StringDtype() returns null for missing values
+            # and false for valid values.
+            result = pc.if_else(pc.is_null(self._pa_array), None, False)
         else:
-            if len(pat) == 0:
-                # For empty tuple, pd.StringDtype() returns null for missing values
-                # and false for valid values.
-                result = pc.if_else(pc.is_null(self._pa_array), None, False)
-            else:
-                result = pc.starts_with(self._pa_array, pattern=pat[0])
+            result = pc.starts_with(self._pa_array, pattern=pat[0])
 
-                for p in pat[1:]:
-                    result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p))
+            for p in pat[1:]:
+                result = pc.or_(result, pc.starts_with(self._pa_array, pattern=p))
         if not isna(na):
             result = result.fill_null(na)
         return type(self)(result)
 
     def _str_endswith(self, pat: str | tuple[str, ...], na=None) -> Self:
         if isinstance(pat, str):
             result = pc.ends_with(self._pa_array, pattern=pat)
+        elif len(pat) == 0:
+            # For empty tuple, pd.StringDtype() returns null for missing values
+            # and false for valid values.
+            result = pc.if_else(pc.is_null(self._pa_array), None, False)
         else:
-            if len(pat) == 0:
-                # For empty tuple, pd.StringDtype() returns null for missing values
-                # and false for valid values.
-                result = pc.if_else(pc.is_null(self._pa_array), None, False)
-            else:
-                result = pc.ends_with(self._pa_array, pattern=pat[0])
+            result = pc.ends_with(self._pa_array, pattern=pat[0])
 
-                for p in pat[1:]:
-                    result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p))
+            for p in pat[1:]:
+                result = pc.or_(result, pc.ends_with(self._pa_array, pattern=p))
         if not isna(na):
             result = result.fill_null(na)
         return type(self)(result)
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
@@ -392,13 +392,6 @@ def _call_cython_op(
                         values[mask] = True
             values = values.astype(bool, copy=False).view(np.int8)
             is_numeric = True
-        elif (
-            self.how in ["median", "sem", "std", "var"]
-            and "skipna" in kwargs
-            and not kwargs["skipna"]
-        ):
-            # if skipna=False we don't want to use masks created for Nullable dtypes
-            mask = None
 
         values = values.T
         if mask is not None:
diff --git a/pandas/tests/extension/base/reduce.py b/pandas/tests/extension/base/reduce.py
@@ -77,6 +77,19 @@ def check_reduce_frame(self, ser: pd.Series, op_name: str, skipna: bool):
 
         tm.assert_extension_array_equal(result1, expected)
 
+    def check_reduce_groupby(self, ser: pd.Series, op_name: str, skipna: bool):
+        # Check that groupby reduction behaves correctly
+        df = pd.DataFrame({"a": ser, "key": [1, 2] * (len(ser) // 2)})
+        grp = df.groupby("key")["a"]
+        res_op = getattr(grp, op_name)
+
+        expected = grp.apply(
+            lambda x: getattr(x.astype("float64"), op_name)(skipna=skipna)
+        )
+
+        result = res_op(skipna=skipna)
+        tm.assert_series_equal(result, expected)
+
     @pytest.mark.parametrize("skipna", [True, False])
     def test_reduce_series_boolean(self, data, all_boolean_reductions, skipna):
         op_name = all_boolean_reductions
@@ -129,3 +142,15 @@ def test_reduce_frame(self, data, all_numeric_reductions, skipna):
             pytest.skip(f"Reduction {op_name} not supported for this dtype")
 
         self.check_reduce_frame(ser, op_name, skipna)
+
+    @pytest.mark.parametrize("skipna", [True, False])
+    def test_reduce_groupby_numeric(self, data, all_numeric_reductions, skipna):
+        op_name = all_numeric_reductions
+        ser = pd.Series(data)
+        if not is_numeric_dtype(ser.dtype):
+            pytest.skip(f"{ser.dtype} is not numeric dtype")
+
+        if not self._supports_reduction(ser, op_name):
+            pytest.skip(f"Reduction {op_name} not supported for this dtype")
+
+        self.check_reduce_groupby(ser, op_name, skipna)