Skip to content

Commit fe526ea

Browse files
authored
refactor: decouple extension ABI from arrow-rs with Arrow C Data Interfaces (#6337)
Remove arrow-rs dependency from daft-ext-abi by introducing owned Arrow C Data Interface types (ArrowSchema, ArrowArray, ArrowArrayStream) that are layout compatible with the C ABI. This allows extensions to use any Arrow implementation (arrow-rs, arrow2, etc.) without coupling to a specific version. ## Changes Made - Add ArrowSchema, ArrowArray, ArrowArrayStream C types to daft-ext-abi - Add ArrowData (schema + array pair) for safe FFI data transfer - Move FFI conversion helpers into daft-ext-abi/src/ffi/arrow.rs - Update DaftScalarFunction trait to use ABI types instead of arrow-rs - Update DaftSession trait to use ABI types - Remove arrow-rs dependency from daft-ext-abi Cargo.toml - Add arrow.rs helpers in daft-ext-core for arrow-rs <-> ABI conversions - Update hello example to use new ABI types ## Related Issues - Follow-up to #6301
1 parent b2f98ab commit fe526ea

21 files changed

Lines changed: 1745 additions & 759 deletions

File tree

Cargo.lock

Lines changed: 740 additions & 451 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,9 @@ members = [
224224
"src/daft-cli",
225225
"src/daft-text"
226226
]
227+
exclude = [
228+
"examples/hello"
229+
]
227230

228231
[workspace.dependencies]
229232
arrow = "57.1.0"
@@ -267,9 +270,11 @@ daft-catalog = {path = "src/daft-catalog"}
267270
daft-context = {path = "src/daft-context"}
268271
daft-core = {path = "src/daft-core"}
269272
daft-dsl = {path = "src/daft-dsl"}
270-
daft-ext-abi = {path = "src/daft-ext-abi", default-features = false}
271-
daft-ext-core = {path = "src/daft-ext-core", default-features = false}
272-
daft-ext-internal = {path = "src/daft-ext-internal", default-features = false}
273+
daft-ext-abi = {path = "src/daft-ext-abi"}
274+
daft-ext-core = {path = "src/daft-ext-core"}
275+
daft-ext-internal = {path = "src/daft-ext-internal"}
276+
daft-ext-macros = {path = "src/daft-ext-macros"}
277+
daft-ext = {path = "src/daft-ext"}
273278
daft-file = {path = "src/daft-file"}
274279
daft-functions = {path = "src/daft-functions"}
275280
daft-functions-binary = {path = "src/daft-functions-binary"}

docs/extensions/index.md

Lines changed: 73 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,11 @@
77
> Please see the [prompt](#prompt) if you want help generating an extension.
88
99
This document is a guide for authoring Daft native extensions in Rust.
10-
Daft supports native Rust extensions by leveraging a stable C ABI and Arrow FFI. Today we support authoring native
10+
Daft supports native Rust extensions by leveraging a stable C ABI based on the
11+
[Arrow C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html).
12+
Extensions are **not coupled** to any particular Arrow library version. The ABI boundary uses
13+
plain C structs (`ArrowSchema`, `ArrowArray`) so your extension can use any arrow-rs version
14+
(or even a different Arrow implementation entirely). Today we support authoring native
1115
scalar functions, but are actively working on additional native extension features.
1216

1317
## Example
@@ -93,16 +97,18 @@ crate-type = ["cdylib"]
9397

9498
[dependencies]
9599
daft-ext = <version>
96-
arrow-array = "57.1.0"
97-
arrow-schema = "57.1.0"
100+
daft-ext-abi = { version = <version>, features = ["arrow-58"] }
101+
arrow = { version = "58", features = ["ffi"] }
98102
```
99103

100-
!!! tip "Arrow types"
104+
!!! tip "Arrow version freedom"
101105

102-
Use `arrow-array` builders and downcasting directly for working with data.
103-
The `daft-ext` prelude re-exports common types like `ArrayRef` and `Field`.
104-
Import `arrow_array::Array` for the `len()` and `is_null()` methods, and
105-
`arrow_array::cast::AsArray` for downcasting (e.g., `as_string`).
106+
The `daft-ext` ABI uses C Data Interface types — your extension is **not** pinned to
107+
Daft's arrow-rs version. Enable a feature flag on `daft-ext-abi` matching your arrow-rs
108+
version (`arrow-56`, `arrow-57`, or `arrow-58`) to get safe `.into()` conversions
109+
between arrow-rs FFI types and the ABI types. For unsupported versions, use the
110+
`from_owned`/`into_owned`/`from_raw`/`as_raw` escape hatches on `ArrowArray`
111+
and `ArrowSchema`.
106112

107113
Then update the pyproject to use `setuptools-rust` as the build system.
108114

@@ -164,14 +170,15 @@ cat src/lib.rs
164170
```
165171

166172
```rust
167-
use std::ffi::CStr;
168-
use std::sync::Arc;
173+
use std::{ffi::CStr, sync::Arc};
169174

170-
use arrow_array::{Array, ArrayRef};
171-
use arrow_array::builder::StringBuilder;
172-
use arrow_array::cast::AsArray;
173-
use arrow_schema::{DataType, Field};
175+
use arrow::{
176+
array::{Array, builder::StringBuilder, cast::AsArray},
177+
datatypes::{DataType, Field, Schema},
178+
ffi::FFI_ArrowSchema,
179+
};
174180
use daft_ext::prelude::*;
181+
use daft_ext_abi::{ArrowData, ArrowSchema};
175182

176183
// ── Module ──────────────────────────────────────────────────────────
177184

@@ -181,7 +188,6 @@ use daft_ext::prelude::*;
181188
struct HelloExtension;
182189

183190
impl DaftExtension for HelloExtension {
184-
185191
/// This is the extension install hook for defining functions in the session.
186192
/// Called once when the extension is loaded into a session. Register each function here.
187193
fn install(session: &mut dyn DaftSession) {
@@ -202,25 +208,42 @@ impl DaftScalarFunction for Greet {
202208
}
203209

204210
/// Type checking.
205-
/// Given the input `Field` schemas, validate types and return the output `Field`.
206-
fn return_field(&self, args: &[Field]) -> DaftResult<Field> {
211+
/// Receives input fields as C Data Interface `ArrowSchema` types.
212+
/// Use `.as_raw()` / `.into()` to convert between arrow-rs and ABI types.
213+
fn return_field(&self, args: &[ArrowSchema]) -> DaftResult<ArrowSchema> {
207214
if args.len() != 1 {
208-
return Err(DaftError::TypeError(
209-
format!("greet: expected 1 argument, got {}", args.len()),
210-
));
215+
return Err(DaftError::TypeError(format!(
216+
"greet: expected 1 argument, got {}",
217+
args.len()
218+
)));
211219
}
212-
if *args[0].data_type() != DataType::Utf8 && *args[0].data_type() != DataType::LargeUtf8 {
213-
return Err(DaftError::TypeError(
214-
format!("greet: expected string argument, got {:?}", args[0].data_type()),
215-
));
220+
let ffi_schema: &FFI_ArrowSchema = unsafe { args[0].as_raw() };
221+
let field = Field::try_from(ffi_schema)
222+
.map_err(|e| DaftError::TypeError(e.to_string()))?;
223+
let dt = field.data_type();
224+
if *dt != DataType::Utf8 && *dt != DataType::LargeUtf8 {
225+
return Err(DaftError::TypeError(format!(
226+
"greet: expected string argument, got {:?}",
227+
dt
228+
)));
216229
}
217-
Ok(Field::new("greet", DataType::Utf8, true))
230+
let out_schema = Schema::new(vec![Field::new("greet", DataType::Utf8, true)]);
231+
let ffi = FFI_ArrowSchema::try_from(&out_schema)
232+
.map_err(|e| DaftError::TypeError(e.to_string()))?;
233+
Ok(ffi.into())
218234
}
219235

220-
/// Evaluation. Receives Arrow arrays, returns an Arrow array. Operates on entire columns at once.
236+
/// Evaluation. Receives columns as C Data Interface `ArrowData` types.
237+
/// Use `.into()` to convert to/from arrow-rs FFI types.
221238
/// All data flows through Arrow arrays — no per-row Python overhead.
222-
fn call(&self, args: &[ArrayRef]) -> DaftResult<ArrayRef> {
223-
let names = args[0].as_string::<i64>();
239+
fn call(&self, args: &[ArrowData]) -> DaftResult<ArrowData> {
240+
let data = unsafe { ArrowData::take_arg(args, 0) };
241+
let ffi_array: arrow::ffi::FFI_ArrowArray = data.array.into();
242+
let ffi_schema: arrow::ffi::FFI_ArrowSchema = data.schema.into();
243+
let arrow_data = unsafe { arrow::ffi::from_ffi(ffi_array, &ffi_schema) }
244+
.map_err(|e| DaftError::RuntimeError(e.to_string()))?;
245+
let input = arrow::array::make_array(arrow_data);
246+
let names = input.as_string::<i64>();
224247
let mut builder = StringBuilder::with_capacity(names.len(), names.len() * 16);
225248
for i in 0..names.len() {
226249
if names.is_null(i) {
@@ -229,11 +252,25 @@ impl DaftScalarFunction for Greet {
229252
builder.append_value(format!("Hello, {}!", names.value(i)));
230253
}
231254
}
232-
Ok(Arc::new(builder.finish()))
255+
let output = builder.finish();
256+
let (out_arr, out_sch) = arrow::ffi::to_ffi(&output.to_data())
257+
.map_err(|e| DaftError::RuntimeError(e.to_string()))?;
258+
Ok(ArrowData {
259+
array: out_arr.into(),
260+
schema: out_sch.into(),
261+
})
233262
}
234263
}
235264
```
236265

266+
!!! tip "ABI pattern"
267+
268+
The `DaftScalarFunction` trait uses C Data Interface types (`ArrowSchema`, `ArrowData`)
269+
at the ABI boundary. Enable a `daft-ext-abi` feature flag (`arrow-56`, `arrow-57`, or
270+
`arrow-58`) matching your arrow-rs version to get `.into()` conversions. Use `.as_raw()`
271+
for zero-copy borrows. This decoupling means your extension is not tied to Daft's
272+
arrow-rs version.
273+
237274
!!! tip "String types"
238275

239276
Daft uses `LargeUtf8` (i64 offsets) for strings internally. When downcasting string arrays,
@@ -380,17 +417,18 @@ Follow the Daft extension authoring guide at docs/extensions/index.md. Here is a
380417

381418
## Rust conventions
382419

383-
- Use `daft_ext::prelude::*` for all imports.
384-
- Import `arrow_array::Array` for `len()`/`is_null()` and `arrow_array::cast::AsArray` for downcasting.
420+
- Use `daft_ext::prelude::*` for all imports (provides `ArrowSchema`, `ArrowData`, errors, traits).
421+
- Add `daft-ext-abi` with a feature flag matching your arrow version (`arrow-56`, `arrow-57`, or `arrow-58`) for `.into()` conversions.
422+
- Import `arrow::array::Array` for `len()`/`is_null()` and `arrow::array::cast::AsArray` for downcasting.
385423
- Daft uses `LargeUtf8` (i64 offsets) for strings — downcast with `as_string::<i64>()`, never `i32`.
386424
- Apply `#[daft_extension]` to a struct implementing `DaftExtension`.
387425
- Register each function in `install()` via `session.define_function(Arc::new(MyFn))`.
388426
- Each function is a struct implementing `DaftScalarFunction` with:
389427
- `name(&self) -> &CStr` — use `c"<extension_name>_<fn_name>"` prefix to avoid collisions.
390-
- `return_field(&self, args: &[Field]) -> DaftResult<Field>` — validate arg count and types,
391-
return `Err(DaftError::TypeError(...))` for violations.
392-
- `call(&self, args: &[ArrayRef]) -> DaftResult<ArrayRef>` — compute over Arrow arrays,
393-
propagate nulls, return `Err(DaftError::RuntimeError(...))` for failures.
428+
- `return_field(&self, args: &[ArrowSchema]) -> DaftResult<ArrowSchema>` — use `.as_raw()` to
429+
borrow as arrow-rs `FFI_ArrowSchema` for type checking, then `.into()` to return output.
430+
- `call(&self, args: &[ArrowData]) -> DaftResult<ArrowData>` — use `ArrowData::take_arg` then
431+
`.into()` to convert to arrow-rs FFI types, compute, then `.into()` to return the result.
394432

395433
## Python conventions
396434

examples/hello/Cargo.toml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,5 @@ name = "hello"
1010
crate-type = ["cdylib"]
1111

1212
[dependencies]
13-
daft-ext = {path = "../../src/daft-ext"}
14-
arrow-array = {version = "57.1.0", features = ["chrono-tz"]}
15-
arrow-schema = "57.1.0"
13+
daft-ext = {path = "../../src/daft-ext", features = ["arrow-58"]}
14+
arrow = {version = "58", features = ["ffi"]}

examples/hello/src/lib.rs

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
use std::{ffi::CStr, sync::Arc};
22

3-
use arrow_array::{Array, ArrayRef, builder::StringBuilder, cast::AsArray};
4-
use arrow_schema::{DataType, Field};
3+
use arrow::{
4+
array::{Array, builder::StringBuilder, cast::AsArray},
5+
datatypes::{DataType, Field},
6+
};
57
use daft_ext::prelude::*;
68

79
// ── Module ──────────────────────────────────────────────────────────
@@ -24,24 +26,35 @@ impl DaftScalarFunction for Greet {
2426
c"greet"
2527
}
2628

27-
fn return_field(&self, args: &[Field]) -> DaftResult<Field> {
29+
fn return_field(&self, args: &[ArrowSchema]) -> DaftResult<ArrowSchema> {
2830
if args.len() != 1 {
2931
return Err(DaftError::TypeError(format!(
3032
"greet: expected 1 argument, got {}",
3133
args.len()
3234
)));
3335
}
34-
if *args[0].data_type() != DataType::Utf8 && *args[0].data_type() != DataType::LargeUtf8 {
36+
let field = Field::try_from(&args[0])?;
37+
let dt = field.data_type();
38+
if *dt != DataType::Utf8 && *dt != DataType::LargeUtf8 {
3539
return Err(DaftError::TypeError(format!(
3640
"greet: expected string argument, got {:?}",
37-
args[0].data_type()
41+
dt
3842
)));
3943
}
40-
Ok(Field::new("greet", DataType::Utf8, true))
44+
Ok(ArrowSchema::try_from(&Field::new(
45+
"greet",
46+
DataType::Utf8,
47+
true,
48+
))?)
4149
}
4250

43-
fn call(&self, args: &[ArrayRef]) -> DaftResult<ArrayRef> {
44-
let names = args[0].as_string::<i64>();
51+
fn call(&self, args: Vec<ArrowData>) -> DaftResult<ArrowData> {
52+
let data = args.into_iter().next().unwrap();
53+
let ffi_array: arrow::ffi::FFI_ArrowArray = data.array.into();
54+
let ffi_schema: arrow::ffi::FFI_ArrowSchema = data.schema.into();
55+
let arrow_data = unsafe { arrow::ffi::from_ffi(ffi_array, &ffi_schema) }?;
56+
let input = arrow::array::make_array(arrow_data);
57+
let names = input.as_string::<i64>();
4558
let mut builder = StringBuilder::with_capacity(names.len(), names.len() * 16);
4659
for i in 0..names.len() {
4760
if names.is_null(i) {
@@ -50,6 +63,11 @@ impl DaftScalarFunction for Greet {
5063
builder.append_value(format!("Hello, {}!", names.value(i)));
5164
}
5265
}
53-
Ok(Arc::new(builder.finish()))
66+
let output = builder.finish();
67+
let (out_arr, out_sch) = arrow::ffi::to_ffi(&output.to_data())?;
68+
Ok(ArrowData {
69+
array: out_arr.into(),
70+
schema: out_sch.into(),
71+
})
5472
}
5573
}

src/daft-ext-abi/Cargo.toml

Lines changed: 26 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,31 @@
1-
[dependencies]
2-
arrow = {workspace = true, features = ["ffi"]}
3-
arrow-array = {workspace = true}
4-
5-
[lints]
6-
workspace = true
7-
81
[package]
92
description = "Daft extension C ABI contract"
103
edition = {workspace = true}
114
name = "daft-ext-abi"
125
version = {workspace = true}
6+
7+
[package.metadata.cargo-machete]
8+
ignored = [
9+
"arrow-data-56",
10+
"arrow-data-57",
11+
"arrow-data-58",
12+
"arrow-schema-56",
13+
"arrow-schema-57",
14+
"arrow-schema-58"
15+
]
16+
17+
[features]
18+
arrow-56 = ["dep:arrow-schema-56", "dep:arrow-data-56"]
19+
arrow-57 = ["dep:arrow-schema-57", "dep:arrow-data-57"]
20+
arrow-58 = ["dep:arrow-schema-58", "dep:arrow-data-58"]
21+
22+
[dependencies]
23+
arrow-schema-56 = {package = "arrow-schema", version = "56", features = ["ffi"], optional = true}
24+
arrow-data-56 = {package = "arrow-data", version = "56", features = ["ffi"], optional = true}
25+
arrow-schema-57 = {package = "arrow-schema", version = "57", features = ["ffi"], optional = true}
26+
arrow-data-57 = {package = "arrow-data", version = "57", features = ["ffi"], optional = true}
27+
arrow-schema-58 = {package = "arrow-schema", version = "58", features = ["ffi"], optional = true}
28+
arrow-data-58 = {package = "arrow-data", version = "58", features = ["ffi"], optional = true}
29+
30+
[lints]
31+
workspace = true

0 commit comments

Comments
 (0)