Skip to content

Commit 38c3856

Browse files
committed
feat: decouple extension ABI from arrow-rs with Arrow C Data Interface types
Remove arrow-rs dependency from daft-ext-abi by introducing owned Arrow C Data Interface types (ArrowSchema, ArrowArray, ArrowArrayStream) that are layout-compatible with the C ABI. This allows extensions to use any Arrow implementation (arrow-rs, arrow2, etc.) without coupling to a specific version. Key changes: - Add ArrowSchema, ArrowArray, ArrowArrayStream C types to daft-ext-abi - Add ArrowData (schema + array pair) for safe FFI data transfer - Move FFI conversion helpers into daft-ext-abi/src/ffi/arrow.rs - Update DaftScalarFunction trait to use ABI types instead of arrow-rs - Update DaftSession trait to use ABI types - Remove arrow-rs dependency from daft-ext-abi Cargo.toml - Add arrow.rs helpers in daft-ext-core for arrow-rs <-> ABI conversions - Update hello example to use new ABI types
1 parent dd7d4de commit 38c3856

File tree

20 files changed

+1027
-286
lines changed

20 files changed

+1027
-286
lines changed

Cargo.lock

Lines changed: 0 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,9 @@ members = [
230230
"src/parquet2",
231231
"src/daft-cli"
232232
]
233+
exclude = [
234+
"examples/hello"
235+
]
233236

234237
[workspace.dependencies]
235238
arrow = "57.1.0"
@@ -273,9 +276,11 @@ daft-catalog = {path = "src/daft-catalog"}
273276
daft-context = {path = "src/daft-context"}
274277
daft-core = {path = "src/daft-core"}
275278
daft-dsl = {path = "src/daft-dsl"}
276-
daft-ext-abi = {path = "src/daft-ext-abi", default-features = false}
277-
daft-ext-core = {path = "src/daft-ext-core", default-features = false}
278-
daft-ext-internal = {path = "src/daft-ext-internal", default-features = false}
279+
daft-ext-abi = {path = "src/daft-ext-abi"}
280+
daft-ext-core = {path = "src/daft-ext-core"}
281+
daft-ext-internal = {path = "src/daft-ext-internal"}
282+
daft-ext-macros = {path = "src/daft-ext-macros"}
283+
daft-ext = {path = "src/daft-ext"}
279284
daft-file = {path = "src/daft-file"}
280285
daft-functions = {path = "src/daft-functions"}
281286
daft-functions-binary = {path = "src/daft-functions-binary"}

docs/extensions/index.md

Lines changed: 56 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,11 @@
77
> Please see the [prompt](#prompt) if you want help generating an extension.
88
99
This document is a guide for authoring Daft native extensions in Rust.
10-
Daft supports native Rust extensions by leveraging a stable C ABI and Arrow FFI. Today we support authoring native
10+
Daft supports native Rust extensions by leveraging a stable C ABI based on the
11+
[Arrow C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html).
12+
Extensions are **not coupled** to any particular Arrow library version. The ABI boundary uses
13+
plain C structs (`ArrowSchema`, `ArrowArray`) so your extension can use any arrow-rs version
14+
(or even a different Arrow implementation entirely). Today we support authoring native
1115
scalar functions, but are actively working on additional native extension features.
1216

1317
## Example
@@ -93,16 +97,16 @@ crate-type = ["cdylib"]
9397

9498
[dependencies]
9599
daft-ext = <version>
96-
arrow-array = "57.1.0"
97-
arrow-schema = "57.1.0"
100+
arrow = { version = "58", features = ["ffi"] }
98101
```
99102

100-
!!! tip "Arrow types"
103+
!!! tip "Arrow version freedom"
101104

102-
Use `arrow-array` builders and downcasting directly for working with data.
103-
The `daft-ext` prelude re-exports common types like `ArrayRef` and `Field`.
104-
Import `arrow_array::Array` for the `len()` and `is_null()` methods, and
105-
`arrow_array::cast::AsArray` for downcasting (e.g., `as_string`).
105+
The `daft-ext` ABI uses C Data Interface types — your extension is **not** pinned to
106+
Daft's arrow-rs version. You can use any compatible `arrow-array` / `arrow-schema`
107+
version. The `define_arrow_helpers!()` macro generates `import_array`, `export_array`,
108+
`import_field`, `export_field`, `import_schema`, and `export_schema` helpers for
109+
converting between arrow-rs types and the C ABI types.
106110

107111
Then update the pyproject to use `setuptools-rust` as the build system.
108112

@@ -164,15 +168,17 @@ cat src/lib.rs
164168
```
165169

166170
```rust
167-
use std::ffi::CStr;
168-
use std::sync::Arc;
171+
use std::{ffi::CStr, sync::Arc};
169172

170-
use arrow_array::{Array, ArrayRef};
171-
use arrow_array::builder::StringBuilder;
172-
use arrow_array::cast::AsArray;
173-
use arrow_schema::{DataType, Field};
173+
use arrow::{
174+
array::{Array, builder::StringBuilder, cast::AsArray},
175+
datatypes::{DataType, Field},
176+
};
174177
use daft_ext::prelude::*;
175178

179+
// Generate import_array, export_array, import_field, export_field helpers.
180+
daft_ext::define_arrow_helpers!();
181+
176182
// ── Module ──────────────────────────────────────────────────────────
177183

178184
// #[daft_extension] generates the `daft_module_magic` C symbol that Daft's runtime looks for
@@ -181,7 +187,6 @@ use daft_ext::prelude::*;
181187
struct HelloExtension;
182188

183189
impl DaftExtension for HelloExtension {
184-
185190
/// This is the extension install hook for defining functions in the session.
186191
/// Called once when the extension is loaded into a session. Register each function here.
187192
fn install(session: &mut dyn DaftSession) {
@@ -202,25 +207,33 @@ impl DaftScalarFunction for Greet {
202207
}
203208

204209
/// Type checking.
205-
/// Given the input `Field` schemas, validate types and return the output `Field`.
206-
fn return_field(&self, args: &[Field]) -> DaftResult<Field> {
210+
/// Receives input fields as C Data Interface `ArrowSchema` types.
211+
/// Use `import_field` to convert to arrow-rs types for validation,
212+
/// then `export_field` to return the output field.
213+
fn return_field(&self, args: &[ArrowSchema]) -> DaftResult<ArrowSchema> {
207214
if args.len() != 1 {
208-
return Err(DaftError::TypeError(
209-
format!("greet: expected 1 argument, got {}", args.len()),
210-
));
215+
return Err(DaftError::TypeError(format!(
216+
"greet: expected 1 argument, got {}",
217+
args.len()
218+
)));
211219
}
212-
if *args[0].data_type() != DataType::Utf8 && *args[0].data_type() != DataType::LargeUtf8 {
213-
return Err(DaftError::TypeError(
214-
format!("greet: expected string argument, got {:?}", args[0].data_type()),
215-
));
220+
let field = import_field(&args[0])?;
221+
let dt = field.data_type();
222+
if *dt != DataType::Utf8 && *dt != DataType::LargeUtf8 {
223+
return Err(DaftError::TypeError(format!(
224+
"greet: expected string argument, got {:?}",
225+
dt
226+
)));
216227
}
217-
Ok(Field::new("greet", DataType::Utf8, true))
228+
Ok(export_field(&Field::new("greet", DataType::Utf8, true))?)
218229
}
219230

220-
/// Evaluation. Receives Arrow arrays, returns an Arrow array. Operates on entire columns at once.
231+
/// Evaluation. Receives columns as C Data Interface `ArrowData` types.
232+
/// Use `import_array` / `export_array` to convert to/from arrow-rs arrays.
221233
/// All data flows through Arrow arrays — no per-row Python overhead.
222-
fn call(&self, args: &[ArrayRef]) -> DaftResult<ArrayRef> {
223-
let names = args[0].as_string::<i64>();
234+
fn call(&self, args: &[ArrowData]) -> DaftResult<ArrowData> {
235+
let input = import_array(unsafe { ArrowData::take_arg(args, 0) })?;
236+
let names = input.as_string::<i64>();
224237
let mut builder = StringBuilder::with_capacity(names.len(), names.len() * 16);
225238
for i in 0..names.len() {
226239
if names.is_null(i) {
@@ -229,11 +242,18 @@ impl DaftScalarFunction for Greet {
229242
builder.append_value(format!("Hello, {}!", names.value(i)));
230243
}
231244
}
232-
Ok(Arc::new(builder.finish()))
245+
Ok(export_array(&builder.finish())?)
233246
}
234247
}
235248
```
236249

250+
!!! tip "ABI pattern"
251+
252+
The `DaftScalarFunction` trait uses C Data Interface types (`ArrowSchema`, `ArrowData`)
253+
at the ABI boundary. Use the `import_*` / `export_*` helpers generated by `define_arrow_helpers!()` to convert
254+
to and from arrow-rs types inside your function bodies. This decoupling means your
255+
extension is not tied to Daft's arrow-rs version.
256+
237257
!!! tip "String types"
238258

239259
Daft uses `LargeUtf8` (i64 offsets) for strings internally. When downcasting string arrays,
@@ -380,17 +400,18 @@ Follow the Daft extension authoring guide at docs/extensions/index.md. Here is a
380400

381401
## Rust conventions
382402

383-
- Use `daft_ext::prelude::*` for all imports.
384-
- Import `arrow_array::Array` for `len()`/`is_null()` and `arrow_array::cast::AsArray` for downcasting.
403+
- Use `daft_ext::prelude::*` for all imports (provides `ArrowSchema`, `ArrowData`, errors, traits).
404+
- Call `daft_ext::define_arrow_helpers!()` to generate `import_array`, `export_array`, `import_field`, `export_field` helpers.
405+
- Import `arrow::array::Array` for `len()`/`is_null()` and `arrow::array::cast::AsArray` for downcasting.
385406
- Daft uses `LargeUtf8` (i64 offsets) for strings — downcast with `as_string::<i64>()`, never `i32`.
386407
- Apply `#[daft_extension]` to a struct implementing `DaftExtension`.
387408
- Register each function in `install()` via `session.define_function(Arc::new(MyFn))`.
388409
- Each function is a struct implementing `DaftScalarFunction` with:
389410
- `name(&self) -> &CStr` — use `c"<extension_name>_<fn_name>"` prefix to avoid collisions.
390-
- `return_field(&self, args: &[Field]) -> DaftResult<Field>`validate arg count and types,
391-
return `Err(DaftError::TypeError(...))` for violations.
392-
- `call(&self, args: &[ArrayRef]) -> DaftResult<ArrayRef>`compute over Arrow arrays,
393-
propagate nulls, return `Err(DaftError::RuntimeError(...))` for failures.
411+
- `return_field(&self, args: &[ArrowSchema]) -> DaftResult<ArrowSchema>` — use `import_field` to
412+
convert inputs, validate types, then `export_field` to return the output field.
413+
- `call(&self, args: &[ArrowData]) -> DaftResult<ArrowData>` — use `ArrowData::take_arg` + `import_array`
414+
to convert inputs to arrow-rs arrays, compute, then `export_array` to return the result.
394415

395416
## Python conventions
396417

examples/hello/Cargo.toml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,5 +11,4 @@ crate-type = ["cdylib"]
1111

1212
[dependencies]
1313
daft-ext = {path = "../../src/daft-ext"}
14-
arrow-array = {version = "57.1.0", features = ["chrono-tz"]}
15-
arrow-schema = "57.1.0"
14+
arrow = {version = "58", features = ["ffi"]}

examples/hello/src/lib.rs

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,13 @@
11
use std::{ffi::CStr, sync::Arc};
22

3-
use arrow_array::{Array, ArrayRef, builder::StringBuilder, cast::AsArray};
4-
use arrow_schema::{DataType, Field};
3+
use arrow::{
4+
array::{Array, builder::StringBuilder, cast::AsArray},
5+
datatypes::{DataType, Field},
6+
};
57
use daft_ext::prelude::*;
68

9+
daft_ext::define_arrow_helpers!();
10+
711
// ── Module ──────────────────────────────────────────────────────────
812

913
#[daft_extension]
@@ -24,24 +28,27 @@ impl DaftScalarFunction for Greet {
2428
c"greet"
2529
}
2630

27-
fn return_field(&self, args: &[Field]) -> DaftResult<Field> {
31+
fn return_field(&self, args: &[ArrowSchema]) -> DaftResult<ArrowSchema> {
2832
if args.len() != 1 {
2933
return Err(DaftError::TypeError(format!(
3034
"greet: expected 1 argument, got {}",
3135
args.len()
3236
)));
3337
}
34-
if *args[0].data_type() != DataType::Utf8 && *args[0].data_type() != DataType::LargeUtf8 {
38+
let field = import_field(&args[0])?;
39+
let dt = field.data_type();
40+
if *dt != DataType::Utf8 && *dt != DataType::LargeUtf8 {
3541
return Err(DaftError::TypeError(format!(
3642
"greet: expected string argument, got {:?}",
37-
args[0].data_type()
43+
dt
3844
)));
3945
}
40-
Ok(Field::new("greet", DataType::Utf8, true))
46+
Ok(export_field(&Field::new("greet", DataType::Utf8, true))?)
4147
}
4248

43-
fn call(&self, args: &[ArrayRef]) -> DaftResult<ArrayRef> {
44-
let names = args[0].as_string::<i64>();
49+
fn call(&self, args: &[ArrowData]) -> DaftResult<ArrowData> {
50+
let input = import_array(unsafe { ArrowData::take_arg(args, 0) })?;
51+
let names = input.as_string::<i64>();
4552
let mut builder = StringBuilder::with_capacity(names.len(), names.len() * 16);
4653
for i in 0..names.len() {
4754
if names.is_null(i) {
@@ -50,6 +57,6 @@ impl DaftScalarFunction for Greet {
5057
builder.append_value(format!("Hello, {}!", names.value(i)));
5158
}
5259
}
53-
Ok(Arc::new(builder.finish()))
60+
Ok(export_array(&builder.finish())?)
5461
}
5562
}

src/daft-ext-abi/Cargo.toml

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,10 @@
1-
[dependencies]
2-
arrow = {workspace = true, features = ["ffi"]}
3-
arrow-array = {workspace = true}
4-
5-
[lints]
6-
workspace = true
7-
81
[package]
92
description = "Daft extension C ABI contract"
103
edition = {workspace = true}
114
name = "daft-ext-abi"
125
version = {workspace = true}
6+
7+
[dependencies]
8+
9+
[lints]
10+
workspace = true

0 commit comments

Comments
 (0)