From c6c597234d43c2a3915504847a2e271a102ff258 Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 3 Jul 2025 15:47:02 -0700 Subject: [PATCH 1/5] BUG: Decimal(NaN) incorrectly allowed in ArrowEA constructor with timestamp type --- doc/source/whatsnew/v3.0.0.rst | 2 ++ pandas/core/arrays/arrow/array.py | 6 ++++++ pandas/tests/extension/test_arrow.py | 14 ++++++++++++++ pandas/tests/io/test_sql.py | 6 ++++++ 4 files changed, 28 insertions(+) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 4154942f92907..9104169e10e88 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -707,6 +707,8 @@ Datetimelike - Bug in :meth:`to_datetime` with ``format="ISO8601"`` and ``utc=True`` where naive timestamps incorrectly inherited timezone offset from previous timestamps in a series. (:issue:`61389`) - Bug in :meth:`to_datetime` wrongly converts when ``arg`` is a ``np.datetime64`` object with unit of ``ps``. (:issue:`60341`) - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`) +- Bug in constructing arrays with :class:`ArrowDtype` with ``timestamp`` type incorrectly allowing ``Decimal("NaN")`` (:issue:`??`) +- Timedelta ^^^^^^^^^ diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index b4e60819b033f..88e7365b1283d 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -500,6 +500,12 @@ def _box_pa_array( value = to_timedelta(value, unit=pa_type.unit).as_unit(pa_type.unit) value = value.to_numpy() + if pa_type is not None and pa.types.is_timestamp(pa_type): + # Use to_datetime to handle NaNs, disallow Decimal("NaN") + from pandas import to_datetime + + value = to_datetime(value).as_unit(pa_type.unit) + try: pa_array = pa.array(value, type=pa_type, from_pandas=True) except (pa.ArrowInvalid, pa.ArrowTypeError): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 8db837b176fe9..c2ab8fc1faffb 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3548,3 +3548,17 @@ def test_arrow_json_type(): dtype = ArrowDtype(pa.json_(pa.string())) result = dtype.type assert result == str + + +def test_timestamp_dtype_disallows_decimal(): + # constructing with pyarrow timestamp dtype should disallow Decimal NaN, + # just like pd.to_datetime + vals = [pd.Timestamp("2016-01-02 03:04:05"), Decimal("NaN")] + + msg = " is not convertible to datetime" + with pytest.raises(TypeError, match=msg): + # Check that the non-pyarrow version raises as expected + pd.to_datetime(vals) + + with pytest.raises(TypeError, match=msg): + pd.array(vals, dtype=ArrowDtype(pa.timestamp("us"))) diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 4a6a5635eb68c..6e80c0bdb41c5 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -8,6 +8,7 @@ time, timedelta, ) +from decimal import Decimal from io import StringIO from pathlib import Path import sqlite3 @@ -1038,6 +1039,11 @@ def test_dataframe_to_sql_arrow_dtypes(conn, request): def test_dataframe_to_sql_arrow_dtypes_missing(conn, request, nulls_fixture): # GH 52046 pytest.importorskip("pyarrow") + if isinstance(nulls_fixture, Decimal): + pytest.skip( + reason="Decimal('NaN') not supported in constructor for timestamp dtype" + ) + df = DataFrame( { "datetime": pd.array( From 2b96bdb3d260b786c328aadef51d0f77f6cd5b3c Mon Sep 17 00:00:00 2001 From: Brock Date: Thu, 3 Jul 2025 15:49:04 -0700 Subject: [PATCH 2/5] GH ref --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/tests/extension/test_arrow.py | 4 ++-- pandas/tests/io/test_sql.py | 1 + 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 9104169e10e88..c9f66cb6e4f22 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -706,8 +706,8 @@ Datetimelike - Bug in :meth:`to_datetime` reports incorrect index in case of any failure scenario. (:issue:`58298`) - Bug in :meth:`to_datetime` with ``format="ISO8601"`` and ``utc=True`` where naive timestamps incorrectly inherited timezone offset from previous timestamps in a series. (:issue:`61389`) - Bug in :meth:`to_datetime` wrongly converts when ``arg`` is a ``np.datetime64`` object with unit of ``ps``. (:issue:`60341`) +- Bug in constructing arrays with :class:`ArrowDtype` with ``timestamp`` type incorrectly allowing ``Decimal("NaN")`` (:issue:`61773`) - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`) -- Bug in constructing arrays with :class:`ArrowDtype` with ``timestamp`` type incorrectly allowing ``Decimal("NaN")`` (:issue:`??`) - Timedelta diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index c2ab8fc1faffb..055148f5805a8 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3551,8 +3551,8 @@ def test_arrow_json_type(): def test_timestamp_dtype_disallows_decimal(): - # constructing with pyarrow timestamp dtype should disallow Decimal NaN, - # just like pd.to_datetime + # GH#61773 constructing with pyarrow timestamp dtype should disallow + # Decimal NaN, just like pd.to_datetime vals = [pd.Timestamp("2016-01-02 03:04:05"), Decimal("NaN")] msg = " is not convertible to datetime" diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 6e80c0bdb41c5..6f4c1602a5e64 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -1041,6 +1041,7 @@ def test_dataframe_to_sql_arrow_dtypes_missing(conn, request, nulls_fixture): pytest.importorskip("pyarrow") if isinstance(nulls_fixture, Decimal): pytest.skip( + # GH#61773 reason="Decimal('NaN') not supported in constructor for timestamp dtype" ) From a8208c89f1e4e458defc0a592b0b53b1b2a099e4 Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 4 Jul 2025 08:21:07 -0700 Subject: [PATCH 3/5] BUG: ArrowEA constructor with timestamp type --- doc/source/whatsnew/v3.0.0.rst | 2 +- pandas/core/arrays/arrow/array.py | 28 +++++++++++++++++++++++++--- pandas/tests/extension/test_arrow.py | 20 +++++++++++++++++--- 3 files changed, 43 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index c9f66cb6e4f22..10fb9503ffb3d 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -707,8 +707,8 @@ Datetimelike - Bug in :meth:`to_datetime` with ``format="ISO8601"`` and ``utc=True`` where naive timestamps incorrectly inherited timezone offset from previous timestamps in a series. (:issue:`61389`) - Bug in :meth:`to_datetime` wrongly converts when ``arg`` is a ``np.datetime64`` object with unit of ``ps``. (:issue:`60341`) - Bug in constructing arrays with :class:`ArrowDtype` with ``timestamp`` type incorrectly allowing ``Decimal("NaN")`` (:issue:`61773`) +- Bug in constructing arrays with a timezone-aware :class:`ArrowDtype` from timezone-naive datetime objects incorrectly treating those as UTC times instead of wall times like :class:`DatetimeTZDtype` (:issue:`61775`) - Bug in setting scalar values with mismatched resolution into arrays with non-nanosecond ``datetime64``, ``timedelta64`` or :class:`DatetimeTZDtype` incorrectly truncating those scalars (:issue:`56410`) -- Timedelta ^^^^^^^^^ diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 88e7365b1283d..e7bb580e8731a 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -63,6 +63,7 @@ from pandas.core.arrays.masked import BaseMaskedArray from pandas.core.arrays.string_ import StringDtype import pandas.core.common as com +from pandas.core.construction import extract_array from pandas.core.indexers import ( check_array_indexer, unpack_tuple_and_ellipses, @@ -501,10 +502,31 @@ def _box_pa_array( value = value.to_numpy() if pa_type is not None and pa.types.is_timestamp(pa_type): - # Use to_datetime to handle NaNs, disallow Decimal("NaN") - from pandas import to_datetime + # Use DatetimeArray to exclude Decimal(NaN) (GH#61774) and + # ensure constructor treats tznaive the same as non-pyarrow + # dtypes (GH#61775) + from pandas.core.arrays.datetimes import ( + DatetimeArray, + tz_to_dtype, + ) - value = to_datetime(value).as_unit(pa_type.unit) + pass_dtype = tz_to_dtype(tz=pa_type.tz, unit=pa_type.unit) + value = extract_array(value, extract_numpy=True) + if isinstance(value, DatetimeArray): + dta = value + else: + dta = DatetimeArray._from_sequence( + value, copy=copy, dtype=pass_dtype + ) + mask = dta.isna() + value_i8 = dta.view("i8") + if not value_i8.flags["WRITEABLE"]: + # e.g. test_setitem_frame_2d_values + value_i8 = value_i8.copy() + dta = DatetimeArray._from_sequence(value_i8, dtype=dta.dtype) + value_i8[mask] = 0 # GH#61776 avoid __sub__ overflow + pa_array = pa.array(dta._ndarray, type=pa_type, mask=mask) + return pa_array try: pa_array = pa.array(value, type=pa_type, from_pandas=True) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 055148f5805a8..7e7cd8fb13456 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2686,6 +2686,7 @@ def test_dt_tz_localize_unsupported_tz_options(): ser.dt.tz_localize("UTC", nonexistent="NaT") +@pytest.mark.xfail(reason="Converts to UTC before localizing GH#61780") def test_dt_tz_localize_none(): ser = pd.Series( [datetime(year=2023, month=1, day=2, hour=3), None], @@ -2693,7 +2694,7 @@ def test_dt_tz_localize_none(): ) result = ser.dt.tz_localize(None) expected = pd.Series( - [datetime(year=2023, month=1, day=2, hour=3), None], + [ser[0].tz_localize(None), None], dtype=ArrowDtype(pa.timestamp("ns")), ) tm.assert_series_equal(result, expected) @@ -2753,7 +2754,7 @@ def test_dt_tz_convert_none(): ) result = ser.dt.tz_convert(None) expected = pd.Series( - [datetime(year=2023, month=1, day=2, hour=3), None], + [ser[0].tz_convert(None), None], dtype=ArrowDtype(pa.timestamp("ns")), ) tm.assert_series_equal(result, expected) @@ -2767,7 +2768,7 @@ def test_dt_tz_convert(unit): ) result = ser.dt.tz_convert("US/Eastern") expected = pd.Series( - [datetime(year=2023, month=1, day=2, hour=3), None], + [ser[0].tz_convert("US/Eastern"), None], dtype=ArrowDtype(pa.timestamp(unit, "US/Eastern")), ) tm.assert_series_equal(result, expected) @@ -3562,3 +3563,16 @@ def test_timestamp_dtype_disallows_decimal(): with pytest.raises(TypeError, match=msg): pd.array(vals, dtype=ArrowDtype(pa.timestamp("us"))) + + +def test_timestamp_dtype_matches_to_datetime(): + # GH#61775 + dtype1 = "datetime64[ns, US/Eastern]" + dtype2 = "timestamp[ns, US/Eastern][pyarrow]" + + ts = pd.Timestamp("2025-07-03 18:10") + + result = pd.Series([ts], dtype=dtype2) + expected = pd.Series([ts], dtype=dtype1).convert_dtypes(dtype_backend="pyarrow") + + tm.assert_series_equal(result, expected) From 66e391c8615cc1e2d5c0e677f8c06a26054db76b Mon Sep 17 00:00:00 2001 From: Brock Date: Fri, 4 Jul 2025 13:49:25 -0700 Subject: [PATCH 4/5] mypy fixup --- pandas/core/arrays/arrow/array.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e7bb580e8731a..be21e86265992 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -520,7 +520,7 @@ def _box_pa_array( ) mask = dta.isna() value_i8 = dta.view("i8") - if not value_i8.flags["WRITEABLE"]: + if not np.asarray(value_i8).flags["WRITEABLE"]: # e.g. test_setitem_frame_2d_values value_i8 = value_i8.copy() dta = DatetimeArray._from_sequence(value_i8, dtype=dta.dtype) From aeae12c3be11d190bec2d50a00426a198ce82a34 Mon Sep 17 00:00:00 2001 From: Brock Date: Mon, 7 Jul 2025 07:58:01 -0700 Subject: [PATCH 5/5] mypy fixup --- pandas/core/arrays/arrow/array.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index be21e86265992..3dc03d9cbf3a2 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -518,14 +518,14 @@ def _box_pa_array( dta = DatetimeArray._from_sequence( value, copy=copy, dtype=pass_dtype ) - mask = dta.isna() - value_i8 = dta.view("i8") - if not np.asarray(value_i8).flags["WRITEABLE"]: + dta_mask = dta.isna() + value_i8 = cast("npt.NDArray", dta.view("i8")) + if not value_i8.flags["WRITEABLE"]: # e.g. test_setitem_frame_2d_values value_i8 = value_i8.copy() dta = DatetimeArray._from_sequence(value_i8, dtype=dta.dtype) - value_i8[mask] = 0 # GH#61776 avoid __sub__ overflow - pa_array = pa.array(dta._ndarray, type=pa_type, mask=mask) + value_i8[dta_mask] = 0 # GH#61776 avoid __sub__ overflow + pa_array = pa.array(dta._ndarray, type=pa_type, mask=dta_mask) return pa_array try: