Reworked suggestions

andremcorreia · tiago-firmino · andremcorreia · commit 251869680cc8 · 2024-06-02T20:39:04.000+01:00
Co-authored-by: Tiago Firmino <tiago.esteves.firmino@tecnico.ulisboa.pt>
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -746,7 +746,7 @@ def group_sum(
                     else:
                         sumx[lab, j] = val
                         compensation[lab, j] = 0
-                        break
+                        continue
 
                 nobs[lab, j] += 1
 
@@ -831,7 +831,7 @@ def group_prod(
                 elif not skipna:
                     prodx[lab, j] = val
                     nobs[lab, j] = 0
-                    break
+                    continue
 
     _check_below_mincount(
         out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx
@@ -900,7 +900,7 @@ def group_var(
                 if not skipna and isna_entry:
                     out[lab, j] = val
                     nobs[lab, j] = 0
-                    break
+                    continue
 
                 elif not isna_entry:
                     nobs[lab, j] += 1
@@ -1109,7 +1109,7 @@ def group_mean(
                 if not skipna and isna_entry:
                     sumx[lab, j] = nan_val
                     nobs[lab, j] = 0
-                    break
+                    continue
 
                 elif not isna_entry:
                     nobs[lab, j] += 1
@@ -1766,7 +1766,7 @@ cdef group_min_max(
                 if not skipna and isna_entry:
                     group_min_or_max[lab, j] = val
                     nobs[lab, j] = 0
-                    break
+                    continue
 
                 elif not isna_entry:
                     nobs[lab, j] += 1
diff --git a/pandas/core/_numba/executor.py b/pandas/core/_numba/executor.py
@@ -81,8 +81,8 @@ def column_looper(
                     labels,
                     ngroups,
                     min_periods,
-                    *args,
                     skipna,
+                    *args,
                 )
                 result[i] = output
                 if len(na_pos) > 0:
diff --git a/pandas/core/_numba/kernels/min_max_.py b/pandas/core/_numba/kernels/min_max_.py
@@ -88,8 +88,8 @@ def grouped_min_max(
     labels: npt.NDArray[np.intp],
     ngroups: int,
     min_periods: int,
+    skipna: bool,
     is_max: bool,
-    skipna: bool = True,
 ) -> tuple[np.ndarray, list[int]]:
     N = len(labels)
     nobs = np.zeros(ngroups, dtype=np.int64)
diff --git a/pandas/core/_numba/kernels/var_.py b/pandas/core/_numba/kernels/var_.py
@@ -176,8 +176,8 @@ def grouped_var(
     labels: npt.NDArray[np.intp],
     ngroups: int,
     min_periods: int,
-    ddof: int = 1,
     skipna: bool = True,
+    ddof: int = 1,
 ) -> tuple[np.ndarray, list[int]]:
     N = len(labels)
 
diff --git a/pandas/tests/groupby/test_numba.py b/pandas/tests/groupby/test_numba.py
@@ -81,48 +81,14 @@ def test_skipna_numba(self, numba_method):
         df = DataFrame(
             {
                 "l": ["A", "A", "A", "B", "B", "B"],
-                "int": [-1, 1, -1, 1, 1, np.nan],
                 "float": [-1.0, 1.2, -1.1, 1.5, 1.0, np.nan],
             }
         )
 
-        result_numba = getattr(df.groupby("l").int, numba_method)(
-            skipna=False, engine="numba"
-        )
-        expected = df.groupby("l").int.apply(
-            lambda x: getattr(x, numba_method)(skipna=False)
-        )
-        tm.assert_series_equal(result_numba, expected, check_exact=False)
-
         result_numba = getattr(df.groupby("l").float, numba_method)(
             skipna=False, engine="numba"
         )
         expected = df.groupby("l").float.apply(
             lambda x: getattr(x, numba_method)(skipna=False)
         )
         tm.assert_series_equal(result_numba, expected, check_exact=False)
-
-    @pytest.mark.parametrize(
-        "numba_method", ["sum", "min", "max", "std", "var", "mean"]
-    )
-    def test_skipna_consistency_numba(self, numba_method):
-        # GH15675
-        df = DataFrame(
-            {
-                "l": ["A", "A", "A", "B", "B", "B"],
-                "int": [-1, 1, -1, 1, 1, np.nan],
-                "float": [-1.0, 1.2, -1.1, 1.5, 1.0, np.nan],
-            }
-        )
-
-        result_with_arg = getattr(df.groupby("l").int, numba_method)(
-            skipna=True, engine="numba"
-        )
-        result_default = getattr(df.groupby("l").int, numba_method)(engine="numba")
-        tm.assert_series_equal(result_with_arg, result_default, check_exact=False)
-
-        result_with_arg = getattr(df.groupby("l").float, numba_method)(
-            skipna=True, engine="numba"
-        )
-        result_default = getattr(df.groupby("l").float, numba_method)(engine="numba")
-        tm.assert_series_equal(result_with_arg, result_default, check_exact=False)
diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py
@@ -1048,17 +1048,12 @@ def scipy_sem(*args, **kwargs):
 
 
 @pytest.mark.parametrize(
-    "reduction_method",
-    ["sum", "min", "max", "mean", "median", "prod", "sem", "std", "var"],
-)
-def test_skipna_reduction_ops_cython(reduction_method):
-    # GH15675
-    # Testing the skipna parameter against possible datatypes
-    df = DataFrame(
+    "data",
+    [
         {
             "l": ["A", "A", "A", "A", "B", "B", "B", "B"],
-            "int": [-1, 1, -1, 2, 1, 1, 1, np.nan],
-            "float": [-1.0, 1.2, -1.1, 1.5, -1.1, 1.5, np.nan, 1.0],
+            "f": [-1.0, 1.2, -1.1, 1.5, -1.1, 1.5, np.nan, 1.0],
+            "s": ["foo", "bar", "baz", "foo", "foo", "foo", pd.NA, "foo"],
             "t": [
                 Timestamp("2024-01-01"),
                 Timestamp("2024-01-02"),
@@ -1080,85 +1075,36 @@ def test_skipna_reduction_ops_cython(reduction_method):
                 pd.NaT,
             ],
         }
-    )
-
-    result_cython = getattr(df.groupby("l").int, reduction_method)(skipna=False)
-    expected = df.groupby("l").int.apply(
-        lambda x: getattr(x, reduction_method)(skipna=False)
-    )
-    tm.assert_series_equal(result_cython, expected, check_exact=False)
-
-    result_cython = getattr(df.groupby("l").float, reduction_method)(skipna=False)
-    expected = df.groupby("l").float.apply(
-        lambda x: getattr(x, reduction_method)(skipna=False)
-    )
-    tm.assert_series_equal(result_cython, expected, check_exact=False)
-
-    if reduction_method in ["min", "max", "mean", "median", "std"]:
-        result_ts = getattr(df.groupby("l").t, reduction_method)(skipna=False)
-        expected_ts = df.groupby("l").t.apply(
-            lambda x: getattr(x, reduction_method)(skipna=False)
-        )
-        tm.assert_series_equal(result_ts, expected_ts, check_exact=False)
-
-        result_td = getattr(df.groupby("l").td, reduction_method)(skipna=False)
-        expected_td = df.groupby("l").td.apply(
-            lambda x: getattr(x, reduction_method)(skipna=False)
-        )
-        tm.assert_series_equal(result_td, expected_td, check_exact=False)
-
-
+    ],
+)
 @pytest.mark.parametrize(
-    "reduction_method",
-    ["sum", "min", "max", "mean", "median", "prod", "sem", "std", "var"],
+    "reduction_method,columns",
+    [
+        ("sum", ["f", "s"]),
+        ("min", ["f", "t", "td"]),
+        ("max", ["f", "t", "td"]),
+        ("mean", ["f", "t", "td"]),
+        ("median", ["f", "t", "td"]),
+        ("prod", ["f"]),
+        ("sem", ["f"]),
+        ("std", ["f", "t", "td"]),
+        ("var", ["f"]),
+        ("any", ["f"]),
+        ("all", ["f"]),
+        ("skew", ["f"]),
+    ],
 )
-def test_skipna_reduction_ops_consistency(reduction_method):
+def test_skipna_reduction_ops_cython(reduction_method, columns, data):
     # GH15675
-    # Testing if provinding skipna=True maintains the default functionality
-    df = DataFrame(
-        {
-            "l": ["A", "A", "A", "A", "B", "B", "B", "B"],
-            "int": [-1, 1, -1, 2, 1, 1, 1, np.nan],
-            "float": [-1.0, 1.2, -1.1, 1.5, -1.1, 1.5, np.nan, 1.0],
-            "t": [
-                Timestamp("2024-01-01"),
-                Timestamp("2024-01-02"),
-                Timestamp("2024-01-03"),
-                Timestamp("2024-01-04"),
-                Timestamp("2024-01-05"),
-                Timestamp("2024-01-06"),
-                pd.NaT,
-                Timestamp("2024-01-07"),
-            ],
-            "td": [
-                pd.Timedelta(days=1),
-                pd.Timedelta(days=2),
-                pd.Timedelta(days=3),
-                pd.Timedelta(days=4),
-                pd.Timedelta(days=5),
-                pd.Timedelta(days=6),
-                pd.NaT,
-                pd.Timedelta(days=7),
-            ],
-        }
-    )
-
-    result_with_arg = getattr(df.groupby("l").int, reduction_method)(skipna=True)
-    result_default = getattr(df.groupby("l").int, reduction_method)()
-    tm.assert_series_equal(result_with_arg, result_default, check_exact=False)
-
-    result_with_arg = getattr(df.groupby("l").float, reduction_method)(skipna=True)
-    result_default = getattr(df.groupby("l").float, reduction_method)()
-    tm.assert_series_equal(result_with_arg, result_default, check_exact=False)
-
-    if reduction_method in ["min", "max", "mean", "median", "std"]:
-        result_ts_with_arg = getattr(df.groupby("l").t, reduction_method)(skipna=True)
-        result_ts_default = getattr(df.groupby("l").t, reduction_method)()
-        tm.assert_series_equal(result_ts_with_arg, result_ts_default, check_exact=False)
+    # Testing the skipna parameter against possible datatypes
+    df = DataFrame(data)
 
-        result_td_with_arg = getattr(df.groupby("l").td, reduction_method)(skipna=True)
-        result_td_default = getattr(df.groupby("l").td, reduction_method)()
-        tm.assert_series_equal(result_td_with_arg, result_td_default, check_exact=False)
+    for column in columns:
+        result_cython = getattr(df.groupby("l")[column], reduction_method)(skipna=False)
+        expected = df.groupby("l")[column].apply(
+            lambda x: getattr(x, reduction_method)(skipna=False)
+        )
+        tm.assert_series_equal(result_cython, expected, check_exact=False)
 
 
 @pytest.mark.parametrize(
@@ -1306,31 +1252,3 @@ def test_groupby_std_datetimelike():
     exp_ser = Series([td1 * 2, td1, td1, td1, td4], index=np.arange(5))
     expected = DataFrame({"A": exp_ser, "B": exp_ser, "C": exp_ser})
     tm.assert_frame_equal(result, expected)
-
-
-def test_skipna_string_sum():
-    # GH15675
-    df = DataFrame(
-        {
-            "l": ["A", "A", "A", "B", "B", "B"],
-            "v": ["foo", "bar", "baz", "foo", pd.NA, "foo"],
-        }
-    )
-
-    result_cython = df.groupby("l").v.sum(skipna=False)
-    expected = df.groupby("l").v.apply(lambda x: x.sum(skipna=False))
-    tm.assert_series_equal(result_cython, expected, check_exact=False)
-
-
-def test_skipna_string_sum_consistency():
-    # GH15675
-    df = DataFrame(
-        {
-            "l": ["A", "A", "A", "B", "B", "B"],
-            "v": ["foo", "bar", "baz", "foo", pd.NA, "foo"],
-        }
-    )
-
-    result_cython = df.groupby("l").v.sum(skipna=True)
-    expected = df.groupby("l").v.sum()
-    tm.assert_series_equal(result_cython, expected, check_exact=False)