Skip to content

Commit d08d5e6

Browse files
tadeja and rok authored
GH-48470: [Python] Construct UuidArray from list of UuidScalars (#48746)
### Rationale for this change

Fixes #48470. The fix also applies to all extension types, not just UUID.

### What changes are included in this PR?

An extension scalar is unwrapped to its storage scalar when building arrays.

### Are these changes tested?

Yes, new `test_array_from_extension_scalars` covers builtin (uuid, bool8, json_, opaque) and custom types across all storage types (int, float, bool, string, binary, large string/binary, decimal, fixed-size binary, struct, timestamp, duration, date).

### Are there any user-facing changes?

Users can now run an example such as the one below and get the output shown, instead of an `ArrowInvalid` error. This works for any extension type, not just UUID.

```python
import pyarrow as pa

pa.array([pa.scalar(b'1'*16, type=pa.uuid())], type=pa.uuid())
```

```
<pyarrow.lib.UuidArray object at 0x128186970>
[
  31313131313131313131313131313131
]
```

* GitHub Issue: #48470

Lead-authored-by: Tadeja Kadunc <tadeja.kadunc@gmail.com>
Co-authored-by: tadeja <tadeja@users.noreply.github.com>
Co-authored-by: Rok Mihevc <rok@mihevc.org>
Signed-off-by: Rok Mihevc <rok@mihevc.org>
1 parent 2cb1f2b commit d08d5e6

File tree

2 files changed

+74
-7
lines changed

2 files changed

+74
-7
lines changed

python/pyarrow/src/arrow/python/python_to_arrow.cc

Lines changed: 19 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -584,6 +584,14 @@ class PyConverter : public Converter<PyObject*, PyConversionOptions> {
584584
}
585585
};
586586

587+
// Helper function to unwrap extension scalar to its storage scalar
588+
const Scalar& GetStorageScalar(const Scalar& scalar) {
589+
if (scalar.type->id() == Type::EXTENSION) {
590+
return *checked_cast<const ExtensionScalar&>(scalar).value;
591+
}
592+
return scalar;
593+
}
594+
587595
template <typename T, typename Enable = void>
588596
class PyPrimitiveConverter;
589597

@@ -663,7 +671,8 @@ class PyPrimitiveConverter<
663671
} else if (arrow::py::is_scalar(value)) {
664672
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
665673
arrow::py::unwrap_scalar(value));
666-
ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*scalar));
674+
ARROW_RETURN_NOT_OK(
675+
this->primitive_builder_->AppendScalar(GetStorageScalar(*scalar)));
667676
} else {
668677
ARROW_ASSIGN_OR_RAISE(
669678
auto converted, PyValue::Convert(this->primitive_type_, this->options_, value));
@@ -684,7 +693,8 @@ class PyPrimitiveConverter<
684693
} else if (arrow::py::is_scalar(value)) {
685694
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
686695
arrow::py::unwrap_scalar(value));
687-
ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*scalar));
696+
ARROW_RETURN_NOT_OK(
697+
this->primitive_builder_->AppendScalar(GetStorageScalar(*scalar)));
688698
} else {
689699
ARROW_ASSIGN_OR_RAISE(
690700
auto converted, PyValue::Convert(this->primitive_type_, this->options_, value));
@@ -710,7 +720,8 @@ class PyPrimitiveConverter<T, enable_if_t<std::is_same<T, FixedSizeBinaryType>::
710720
} else if (arrow::py::is_scalar(value)) {
711721
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
712722
arrow::py::unwrap_scalar(value));
713-
ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*scalar));
723+
ARROW_RETURN_NOT_OK(
724+
this->primitive_builder_->AppendScalar(GetStorageScalar(*scalar)));
714725
} else {
715726
ARROW_RETURN_NOT_OK(
716727
PyValue::Convert(this->primitive_type_, this->options_, value, view_));
@@ -747,7 +758,8 @@ class PyPrimitiveConverter<
747758
} else if (arrow::py::is_scalar(value)) {
748759
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
749760
arrow::py::unwrap_scalar(value));
750-
ARROW_RETURN_NOT_OK(this->primitive_builder_->AppendScalar(*scalar));
761+
ARROW_RETURN_NOT_OK(
762+
this->primitive_builder_->AppendScalar(GetStorageScalar(*scalar)));
751763
} else {
752764
ARROW_RETURN_NOT_OK(
753765
PyValue::Convert(this->primitive_type_, this->options_, value, view_));
@@ -791,7 +803,7 @@ class PyDictionaryConverter<U, enable_if_has_c_type<U>>
791803
} else if (arrow::py::is_scalar(value)) {
792804
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
793805
arrow::py::unwrap_scalar(value));
794-
return this->value_builder_->AppendScalar(*scalar, 1);
806+
return this->value_builder_->AppendScalar(GetStorageScalar(*scalar), 1);
795807
} else {
796808
ARROW_ASSIGN_OR_RAISE(auto converted,
797809
PyValue::Convert(this->value_type_, this->options_, value));
@@ -810,7 +822,7 @@ class PyDictionaryConverter<U, enable_if_has_string_view<U>>
810822
} else if (arrow::py::is_scalar(value)) {
811823
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
812824
arrow::py::unwrap_scalar(value));
813-
return this->value_builder_->AppendScalar(*scalar, 1);
825+
return this->value_builder_->AppendScalar(GetStorageScalar(*scalar), 1);
814826
} else {
815827
ARROW_RETURN_NOT_OK(
816828
PyValue::Convert(this->value_type_, this->options_, value, view_));
@@ -983,7 +995,7 @@ class PyStructConverter : public StructConverter<PyConverter, PyConverterTrait>
983995
} else if (arrow::py::is_scalar(value)) {
984996
ARROW_ASSIGN_OR_RAISE(std::shared_ptr<Scalar> scalar,
985997
arrow::py::unwrap_scalar(value));
986-
return this->struct_builder_->AppendScalar(*scalar);
998+
return this->struct_builder_->AppendScalar(GetStorageScalar(*scalar));
987999
}
9881000
switch (input_kind_) {
9891001
case InputKind::DICT:

python/pyarrow/tests/test_extension_type.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
# under the License.
1717

1818
import contextlib
19+
import datetime
1920
import os
2021
import shutil
2122
import subprocess
@@ -1486,6 +1487,60 @@ def bytes(self):
14861487
pa.scalar(bad)
14871488

14881489

1490+
def test_array_from_extension_scalars():
    """pa.array() built from extension scalars matches one built from raw values."""

    def check_round_trip(ext_type, values):
        # Wrap each raw value in an extension scalar, rebuild the array from
        # those scalars, and compare with the array built from the raw values.
        wrapped = [pa.scalar(item, type=ext_type) for item in values]
        rebuilt = pa.array(wrapped, type=ext_type)
        assert rebuilt.equals(pa.array(values, type=ext_type))

    # One case per C++ converter: FixedSizeBinary, Binary/String
    builtin = [
        (pa.uuid(), [b"0123456789abcdef"]),
        (pa.opaque(pa.binary(), "t", "v"), [b"x", b"y"]),
    ]
    for ext_type, values in builtin:
        check_round_trip(ext_type, values)

    # One case per C++ converter: Numeric, Timestamp/Duration, Struct
    custom = [
        (IntegerType(), [100, 200]),
        (AnnotatedType(pa.timestamp("us"), "ts"),
         [datetime.datetime(2023, 1, 1)]),
        (MyStructType(), [{"left": 1, "right": 2}]),
    ]
    for ext_type, values in custom:
        with registered_extension_type(ext_type):
            check_round_trip(ext_type, values)

    # Null handling
    uuid_type = pa.uuid()
    with_null = pa.array(
        [pa.scalar(b"0123456789abcdef", type=uuid_type),
         pa.scalar(None, type=uuid_type)],
        type=uuid_type,
    )
    assert with_null[0].is_valid
    assert not with_null[1].is_valid

    # ExtensionScalar.from_storage path
    storage_scalars = [
        pa.ExtensionScalar.from_storage(uuid_type, b"0123456789abcdef"),
        pa.ExtensionScalar.from_storage(uuid_type, None),
    ]
    from_storage = pa.array(storage_scalars, type=uuid_type)
    assert from_storage.equals(
        pa.array([b"0123456789abcdef", None], type=uuid_type))

    # Type inference without explicit type
    sample = uuid4()
    inferred = pa.array([pa.scalar(sample, type=pa.uuid()), None])
    assert inferred.type == pa.uuid()
    assert inferred[0].as_py() == sample
    assert not inferred[1].is_valid

    # Mixed extension scalars and raw Python objects
    first, second = uuid4(), uuid4()
    mixed = pa.array(
        [pa.scalar(first, type=pa.uuid()), second], type=pa.uuid())
    assert mixed.equals(pa.array([first, second], type=pa.uuid()))
1543+
14891544
def test_tensor_type():
14901545
tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 3])
14911546
assert tensor_type.extension_name == "arrow.fixed_shape_tensor"

0 commit comments

Comments
 (0)