
Commit 3fad33f

POC: consistent NaN treatment for pyarrow dtypes

1 parent 35b0d1d

7 files changed: +81 −19 lines changed
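
For context, the inconsistency this POC targets can be sketched in a few lines (illustration only, not part of the diff): pyarrow's from_pandas=True coerces float NaN to null, while the default constructor keeps NaN as an ordinary float value.

    import numpy as np
    import pyarrow as pa

    values = np.array([1.0, np.nan])
    pa.array(values, from_pandas=True).null_count  # 1 -- NaN coerced to null
    pa.array(values).null_count                    # 0 -- NaN kept as a value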

pandas/_libs/parsers.pyx

Lines changed: 1 addition & 1 deletion
@@ -1456,7 +1456,7 @@ def _maybe_upcast(
         if isinstance(arr, IntegerArray) and arr.isna().all():
             # use null instead of int64 in pyarrow
             arr = arr.to_numpy(na_value=None)
-            arr = ArrowExtensionArray(pa.array(arr, from_pandas=True))
+            arr = ArrowExtensionArray(pa.array(arr))
 
     return arr
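
The from_pandas flag is redundant in this hunk: the all-NA IntegerArray is materialized with na_value=None, and None maps to null either way. A minimal sketch of the equivalence (illustration only):

    import numpy as np
    import pyarrow as pa

    arr = np.array([None, None], dtype=object)
    pa.array(arr).null_count                    # 2
    pa.array(arr, from_pandas=True).null_count  # 2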

pandas/core/arrays/arrow/array.py

Lines changed: 41 additions & 13 deletions
@@ -16,6 +16,7 @@
 import numpy as np
 
 from pandas._libs import lib
+from pandas._libs.missing import NA
 from pandas._libs.tslibs import (
     Timedelta,
     Timestamp,
@@ -360,7 +361,7 @@ def _from_sequence_of_strings(
             # duration to string casting behavior
             mask = isna(scalars)
             if not isinstance(strings, (pa.Array, pa.ChunkedArray)):
-                strings = pa.array(strings, type=pa.string(), from_pandas=True)
+                strings = pa.array(strings, type=pa.string())
             strings = pc.if_else(mask, None, strings)
             try:
                 scalars = strings.cast(pa.int64())
@@ -381,7 +382,7 @@ def _from_sequence_of_strings(
             if isinstance(strings, (pa.Array, pa.ChunkedArray)):
                 scalars = strings
             else:
-                scalars = pa.array(strings, type=pa.string(), from_pandas=True)
+                scalars = pa.array(strings, type=pa.string())
             scalars = pc.if_else(pc.equal(scalars, "1.0"), "1", scalars)
             scalars = pc.if_else(pc.equal(scalars, "0.0"), "0", scalars)
             scalars = scalars.cast(pa.bool_())
@@ -393,6 +394,13 @@ def _from_sequence_of_strings(
             from pandas.core.tools.numeric import to_numeric
 
             scalars = to_numeric(strings, errors="raise")
+            if not pa.types.is_decimal(pa_type):
+                # TODO: figure out why doing this cast breaks with decimal dtype
+                #  in test_from_sequence_of_strings_pa_array
+                mask = strings.is_null()
+                scalars = pa.array(scalars, mask=np.array(mask), type=pa_type)
+                # TODO: could we just do strings.cast(pa_type)?
+
         else:
             raise NotImplementedError(
                 f"Converting strings to {pa_type} is not implemented."
@@ -435,7 +443,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
         """
         if isinstance(value, pa.Scalar):
             pa_scalar = value
-        elif isna(value):
+        elif isna(value) and not lib.is_float(value):
             pa_scalar = pa.scalar(None, type=pa_type)
         else:
             # Workaround https://github.com/apache/arrow/issues/37291
@@ -452,7 +460,7 @@ def _box_pa_scalar(cls, value, pa_type: pa.DataType | None = None) -> pa.Scalar:
                 value = value.as_unit(pa_type.unit)
                 value = value._value
 
-            pa_scalar = pa.scalar(value, type=pa_type, from_pandas=True)
+            pa_scalar = pa.scalar(value, type=pa_type)
 
         if pa_type is not None and pa_scalar.type != pa_type:
             pa_scalar = pa_scalar.cast(pa_type)
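
With the added "and not lib.is_float(value)" guard, a float NaN now falls through to pa.scalar and is stored as a valid (non-null) value; only non-float NA scalars keep mapping to null. The pyarrow behavior this relies on:

    import pyarrow as pa

    pa.scalar(float("nan"), type=pa.float64()).is_valid  # True -- a real value
    pa.scalar(None, type=pa.float64()).is_valid          # False -- null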
@@ -484,6 +492,13 @@ def _box_pa_array(
             if copy:
                 value = value.copy()
             pa_array = value.__arrow_array__()
+
+        elif hasattr(value, "__arrow_array__"):
+            # e.g. StringArray
+            if copy:
+                value = value.copy()
+            pa_array = value.__arrow_array__()
+
         else:
             if (
                 isinstance(value, np.ndarray)
@@ -510,19 +525,32 @@ def _box_pa_array(
                 value = to_timedelta(value, unit=pa_type.unit).as_unit(pa_type.unit)
                 value = value.to_numpy()
 
+            mask = None
+            if getattr(value, "dtype", None) is None or value.dtype.kind not in "mfM":
+                # similar to isna(value) but exclude NaN
+                # TODO: cythonize!
+                mask = np.array([x is NA or x is None for x in value], dtype=bool)
+
+            from_pandas = False
+            if pa.types.is_integer(pa_type):
+                # If user specifically asks to cast a numpy float array with NaNs
+                # to pyarrow integer, we'll treat those NaNs as NA
+                from_pandas = True
             try:
-                pa_array = pa.array(value, type=pa_type, from_pandas=True)
+                pa_array = pa.array(
+                    value, type=pa_type, mask=mask, from_pandas=from_pandas
+                )
             except (pa.ArrowInvalid, pa.ArrowTypeError):
                 # GH50430: let pyarrow infer type, then cast
-                pa_array = pa.array(value, from_pandas=True)
+                pa_array = pa.array(value, mask=mask, from_pandas=from_pandas)
 
             if pa_type is None and pa.types.is_duration(pa_array.type):
                 # Workaround https://github.com/apache/arrow/issues/37291
                 from pandas.core.tools.timedeltas import to_timedelta
 
                 value = to_timedelta(value)
                 value = value.to_numpy()
-                pa_array = pa.array(value, type=pa_type, from_pandas=True)
+                pa_array = pa.array(value, type=pa_type)
 
             if pa.types.is_duration(pa_array.type) and pa_array.null_count > 0:
                 # GH52843: upstream bug for duration types when originally
11691197
if not len(values):
11701198
return np.zeros(len(self), dtype=bool)
11711199

1172-
result = pc.is_in(self._pa_array, value_set=pa.array(values, from_pandas=True))
1200+
result = pc.is_in(self._pa_array, value_set=pa.array(values))
11731201
# pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
11741202
# to False
11751203
return np.array(result, dtype=np.bool_)
@@ -2002,7 +2030,7 @@ def __setitem__(self, key, value) -> None:
20022030
raise ValueError("Length of indexer and values mismatch")
20032031
chunks = [
20042032
*self._pa_array[:key].chunks,
2005-
pa.array([value], type=self._pa_array.type, from_pandas=True),
2033+
pa.array([value], type=self._pa_array.type),
20062034
*self._pa_array[key + 1 :].chunks,
20072035
]
20082036
data = pa.chunked_array(chunks).combine_chunks()
@@ -2056,7 +2084,7 @@ def _rank_calc(
20562084
pa_type = pa.float64()
20572085
else:
20582086
pa_type = pa.uint64()
2059-
result = pa.array(ranked, type=pa_type, from_pandas=True)
2087+
result = pa.array(ranked, type=pa_type)
20602088
return result
20612089

20622090
data = self._pa_array.combine_chunks()
@@ -2308,7 +2336,7 @@ def _to_numpy_and_type(value) -> tuple[np.ndarray, pa.DataType | None]:
23082336
right, right_type = _to_numpy_and_type(right)
23092337
pa_type = left_type or right_type
23102338
result = np.where(cond, left, right)
2311-
return pa.array(result, type=pa_type, from_pandas=True)
2339+
return pa.array(result, type=pa_type)
23122340

23132341
@classmethod
23142342
def _replace_with_mask(
@@ -2351,7 +2379,7 @@ def _replace_with_mask(
23512379
replacements = replacements.as_py()
23522380
result = np.array(values, dtype=object)
23532381
result[mask] = replacements
2354-
return pa.array(result, type=values.type, from_pandas=True)
2382+
return pa.array(result, type=values.type)
23552383

23562384
# ------------------------------------------------------------------
23572385
# GroupBy Methods
@@ -2430,7 +2458,7 @@ def _groupby_op(
24302458
return type(self)(pa_result)
24312459
else:
24322460
# DatetimeArray, TimedeltaArray
2433-
pa_result = pa.array(result, from_pandas=True)
2461+
pa_result = pa.array(result)
24342462
return type(self)(pa_result)
24352463

24362464
def _apply_elementwise(self, func: Callable) -> list[list[Any]]:

pandas/core/arrays/string_.py

Lines changed: 7 additions & 1 deletion
@@ -474,6 +474,12 @@ def _str_map_str_or_object(
         if self.dtype.storage == "pyarrow":
             import pyarrow as pa
 
+            # TODO: shouldn't this already be caught by the passed mask?
+            #  it isn't in test_extract_expand_capture_groups_index
+            # mask = mask | np.array(
+            #     [x is libmissing.NA for x in result], dtype=bool
+            # )
+
             result = pa.array(
                 result, mask=mask, type=pa.large_string(), from_pandas=True
             )
@@ -726,7 +732,7 @@ def __arrow_array__(self, type=None):
 
         values = self._ndarray.copy()
         values[self.isna()] = None
-        return pa.array(values, type=type, from_pandas=True)
+        return pa.array(values, type=type)
 
     def _values_for_factorize(self) -> tuple[np.ndarray, libmissing.NAType | float]:  # type: ignore[override]
         arr = self._ndarray
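
Dropping from_pandas is safe in __arrow_array__ because missing entries have already been replaced with None on the copied ndarray, and None converts to null under either setting. A short sketch (illustration only):

    import numpy as np
    import pyarrow as pa

    values = np.array(["a", None], dtype=object)
    pa.array(values, type=pa.string()).null_count  # 1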

pandas/core/generic.py

Lines changed: 18 additions & 1 deletion
@@ -9874,7 +9874,7 @@ def where(
     def where(
         self,
         cond,
-        other=np.nan,
+        other=lib.no_default,
         *,
         inplace: bool = False,
         axis: Axis | None = None,
@@ -10032,6 +10032,23 @@ def where(
                 stacklevel=2,
             )
 
+        if other is lib.no_default:
+            if self.ndim == 1:
+                if isinstance(self.dtype, ExtensionDtype):
+                    other = self.dtype.na_value
+                else:
+                    other = np.nan
+            else:
+                if self._mgr.nblocks == 1 and isinstance(
+                    self._mgr.blocks[0].values.dtype, ExtensionDtype
+                ):
+                    # FIXME: checking this is kludgy!
+                    other = self._mgr.blocks[0].values.dtype.na_value
+                else:
+                    # FIXME: the same problem we had with Series will now
+                    #  show up column-by-column!
+                    other = np.nan
+
         other = common.apply_if_callable(other, self)
         return self._where(cond, other, inplace=inplace, axis=axis, level=level)
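
Under this POC (hypothetical behavior, not released pandas), calling where() without an explicit "other" now fills masked-out positions with the dtype's na_value instead of unconditionally with np.nan:

    import pandas as pd

    ser = pd.Series([1.0, 2.0], dtype="float64[pyarrow]")
    # the masked-out position becomes pd.NA (the dtype's na_value), not NaN
    ser.where([True, False])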

pandas/tests/extension/test_arrow.py

Lines changed: 1 addition & 1 deletion
@@ -731,7 +731,7 @@ def test_EA_types(self, engine, data, dtype_backend, request):
                 pytest.mark.xfail(reason="CSV parsers don't correctly handle binary")
             )
         df = pd.DataFrame({"with_dtype": pd.Series(data, dtype=str(data.dtype))})
-        csv_output = df.to_csv(index=False, na_rep=np.nan)
+        csv_output = df.to_csv(index=False, na_rep=np.nan)  # should be NA?
         if pa.types.is_binary(pa_dtype):
             csv_output = BytesIO(csv_output)
         else:

pandas/tests/groupby/test_reductions.py

Lines changed: 4 additions & 2 deletions
@@ -381,8 +381,10 @@ def test_first_last_skipna(any_real_nullable_dtype, sort, skipna, how):
     df = DataFrame(
         {
             "a": [2, 1, 1, 2, 3, 3],
-            "b": [na_value, 3.0, na_value, 4.0, np.nan, np.nan],
-            "c": [na_value, 3.0, na_value, 4.0, np.nan, np.nan],
+            # TODO: test that has mixed na_value and NaN either working for
+            #  float or raising for int?
+            "b": [na_value, 3.0, na_value, 4.0, na_value, na_value],
+            "c": [na_value, 3.0, na_value, 4.0, na_value, na_value],
         },
         dtype=any_real_nullable_dtype,
     )

pandas/tests/series/methods/test_rank.py

Lines changed: 9 additions & 0 deletions
@@ -276,6 +276,13 @@ def test_rank_tie_methods(self, ser, results, dtype, using_infer_string):
 
         ser = ser if dtype is None else ser.astype(dtype)
         result = ser.rank(method=method)
+        if dtype == "float64[pyarrow]":
+            # the NaNs are not treated as NA
+            exp = exp.copy()
+            if method == "average":
+                exp[np.isnan(ser)] = 9.5
+            elif method == "dense":
+                exp[np.isnan(ser)] = 6
         tm.assert_series_equal(result, Series(exp, dtype=expected_dtype(dtype, method)))
 
     @pytest.mark.parametrize("na_option", ["top", "bottom", "keep"])
@@ -321,6 +328,8 @@ def test_rank_tie_methods_on_infs_nans(
             order = [ranks[1], ranks[0], ranks[2]]
         elif na_option == "bottom":
             order = [ranks[0], ranks[2], ranks[1]]
+        elif dtype == "float64[pyarrow]":
+            order = [ranks[0], [NA] * chunk, ranks[1]]
         else:
             order = [ranks[0], [np.nan] * chunk, ranks[1]]
         expected = order if ascending else order[::-1]
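
These test adjustments encode the POC's semantics (hypothetical, not released pandas): a NaN stored in a float64[pyarrow] array is a real value, so it participates in ranking (Arrow sorts NaN above every other float) rather than propagating as <NA>.

    import numpy as np
    import pandas as pd

    ser = pd.Series([1.0, np.nan, 2.0], dtype="float64[pyarrow]")
    ser.rank()  # under the POC: [1.0, 3.0, 2.0], with no <NA> produced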
