Skip to content

Commit fb5ef0c

Browse files
feat(io): support numeric range expansion in glob patterns (Eventual-Inc#6127)
## Changes Made

Add support for bash-style numeric range syntax `{start..end}` in glob patterns. This allows users to easily select files by numeric sequences:

    df = daft.read_parquet("s3://bucket/{0..99}.parquet")

The implementation expands numeric ranges to alternation patterns that are compatible with the globset crate, e.g. `{0..3}` becomes `{0,1,2,3}`.

Features:
- Basic ranges: `{0..10}`
- Leading zeros preserved: `{00..05}` -> `00,01,02,03,04,05`
- Reverse ranges: `{10..0}`
- Negative numbers: `{-5..5}`
- Multiple ranges in path: `{0..1}/{0..2}.csv`
- Mixed with wildcards: `{0..9}_*.parquet`

Safety:
- Maximum range size of 10,000 elements to prevent memory issues
- Clear error message when limit exceeded

## Related Issues

Closes Eventual-Inc#2708
1 parent 9caf8ed commit fb5ef0c

File tree

4 files changed

+507
-0
lines changed

4 files changed

+507
-0
lines changed

src/daft-io/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ pub mod multipart;
1111
mod object_io;
1212
mod object_store_glob;
1313
mod opendal_source;
14+
mod range_expansion;
1415
mod retry;
1516
pub mod s3_like;
1617
mod stats;

src/daft-io/src/object_store_glob.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ use tokio::sync::mpsc::Sender;
1212

1313
use crate::{
1414
object_io::{FileMetadata, FileType, ObjectSource},
15+
range_expansion::expand_numeric_ranges,
1516
stats::IOStatsRef,
1617
};
1718

@@ -391,6 +392,10 @@ pub async fn glob(
391392
limit: Option<usize>,
392393
io_stats: Option<IOStatsRef>,
393394
) -> super::Result<BoxStream<'static, super::Result<FileMetadata>>> {
395+
// Expand numeric ranges like {0..10} to {0,1,2,...,10} before processing
396+
let expanded_glob = expand_numeric_ranges(glob)?;
397+
let glob = &*expanded_glob;
398+
394399
// If no special characters, we fall back to ls behavior
395400
let full_fragment = GlobFragment::new(glob);
396401
if !full_fragment.has_special_character() {

src/daft-io/src/range_expansion.rs

Lines changed: 287 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,287 @@
1+
//! Numeric range expansion for glob patterns.
2+
//!
3+
//! This module provides functionality to expand numeric range patterns like `{0..10}`
4+
//! into alternation patterns like `{0,1,2,3,4,5,6,7,8,9,10}` that are compatible with
5+
//! the `globset` crate.
6+
//!
7+
//! # Examples
8+
//!
9+
//! ```ignore
10+
//! use daft_io::expand_numeric_ranges;
11+
//!
12+
//! // Simple range
13+
//! let result = expand_numeric_ranges("s3://bucket/{0..3}.parquet")?;
14+
//! assert_eq!(result, "s3://bucket/{0,1,2,3}.parquet");
15+
//!
16+
//! // With leading zeros
17+
//! let result = expand_numeric_ranges("s3://bucket/{00..03}.parquet")?;
18+
//! assert_eq!(result, "s3://bucket/{00,01,02,03}.parquet");
19+
//!
20+
//! // Reverse range
21+
//! let result = expand_numeric_ranges("s3://bucket/{3..0}.parquet")?;
22+
//! assert_eq!(result, "s3://bucket/{3,2,1,0}.parquet");
23+
//! ```
24+
25+
use std::{borrow::Cow, sync::LazyLock};
26+
27+
use regex::Regex;
28+
29+
/// Maximum number of elements allowed in a single range expansion.
30+
/// This prevents memory issues from patterns like `{0..1000000}`.
31+
const MAX_RANGE_SIZE: usize = 10_000;
32+
33+
/// Regex pattern to match numeric range syntax: {start..end}
34+
/// Supports optional leading zeros and negative numbers.
35+
/// Examples: {0..10}, {00..99}, {-5..5}
36+
static NUMERIC_RANGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
37+
// Match {start..end} where start and end are integers (possibly with leading zeros or negative)
38+
// We need to be careful not to match alternation syntax like {a,b,c}
39+
Regex::new(r"\{(-?\d+)\.\.(-?\d+)\}").expect("Invalid regex pattern for numeric range")
40+
});
41+
42+
/// Expands all numeric range patterns in a glob string.
43+
///
44+
/// Converts patterns like `{0..10}` to `{0,1,2,3,4,5,6,7,8,9,10}`.
45+
///
46+
/// # Arguments
47+
///
48+
/// * `glob` - The glob pattern string that may contain numeric ranges
49+
///
50+
/// # Returns
51+
///
52+
/// * `Ok(Cow::Borrowed(glob))` - If no range patterns found (zero allocation)
53+
/// * `Ok(Cow::Owned(String))` - The expanded glob pattern
54+
/// * `Err(Error)` - If a range is too large (exceeds MAX_RANGE_SIZE)
55+
///
56+
/// # Examples
57+
///
58+
/// Basic usage:
59+
/// ```ignore
60+
/// let expanded = expand_numeric_ranges("data/{0..2}.csv")?;
61+
/// assert_eq!(expanded, "data/{0,1,2}.csv");
62+
/// ```
63+
pub fn expand_numeric_ranges(glob: &str) -> super::Result<Cow<'_, str>> {
64+
// If no numeric range pattern found, return as-is (zero allocation)
65+
if !NUMERIC_RANGE_PATTERN.is_match(glob) {
66+
return Ok(Cow::Borrowed(glob));
67+
}
68+
69+
// We need to expand ranges one at a time because each expansion might
70+
// change the string length and positions
71+
let mut result = glob.to_string();
72+
73+
// Keep expanding until no more ranges are found
74+
// This handles multiple ranges in a single glob pattern
75+
while let Some(captures) = NUMERIC_RANGE_PATTERN.captures(&result) {
76+
let full_match = captures.get(0).unwrap();
77+
let start_str = captures.get(1).unwrap().as_str();
78+
let end_str = captures.get(2).unwrap().as_str();
79+
80+
let expansion = expand_single_range(start_str, end_str)?;
81+
82+
// Replace the matched range with the expansion
83+
result = format!(
84+
"{}{{{}}}{}",
85+
&result[..full_match.start()],
86+
expansion,
87+
&result[full_match.end()..]
88+
);
89+
}
90+
91+
Ok(Cow::Owned(result))
92+
}
93+
94+
/// Expands a single numeric range into a comma-separated list.
95+
///
96+
/// # Arguments
97+
///
98+
/// * `start_str` - The start value as a string (preserves leading zeros)
99+
/// * `end_str` - The end value as a string (preserves leading zeros)
100+
///
101+
/// # Returns
102+
///
103+
/// A comma-separated string of all values in the range.
104+
fn expand_single_range(start_str: &str, end_str: &str) -> super::Result<String> {
105+
let start: i64 = start_str
106+
.parse()
107+
.map_err(|_| super::Error::InvalidArgument {
108+
msg: format!("Invalid range start value: {}", start_str),
109+
})?;
110+
111+
let end: i64 = end_str.parse().map_err(|_| super::Error::InvalidArgument {
112+
msg: format!("Invalid range end value: {}", end_str),
113+
})?;
114+
115+
// Calculate the range size using i128 to avoid overflow
116+
// when start and end are at opposite extremes of i64 range
117+
let range_size = ((end as i128) - (start as i128)).unsigned_abs() as usize + 1;
118+
if range_size > MAX_RANGE_SIZE {
119+
return Err(super::Error::InvalidArgument {
120+
msg: format!(
121+
"Numeric range {{{}..{}}} would expand to {} elements, which exceeds the maximum of {}. \
122+
Consider using a smaller range or Python list comprehension instead.",
123+
start_str, end_str, range_size, MAX_RANGE_SIZE
124+
),
125+
});
126+
}
127+
128+
// Determine the width for zero-padding based on the longer of start or end
129+
// Only apply padding if either value has leading zeros
130+
let start_has_leading_zero = start_str.starts_with('0') && start_str.len() > 1 && start >= 0;
131+
let end_has_leading_zero = end_str.starts_with('0') && end_str.len() > 1 && end >= 0;
132+
let use_padding = start_has_leading_zero || end_has_leading_zero;
133+
134+
let width = if use_padding {
135+
// Use the maximum width between start and end strings
136+
// For negative numbers, we don't count the minus sign for padding purposes
137+
let start_width = if start < 0 {
138+
start_str.len() - 1
139+
} else {
140+
start_str.len()
141+
};
142+
let end_width = if end < 0 {
143+
end_str.len() - 1
144+
} else {
145+
end_str.len()
146+
};
147+
start_width.max(end_width)
148+
} else {
149+
0
150+
};
151+
152+
// Generate the range values
153+
let values: Vec<String> = if start <= end {
154+
// Ascending range
155+
(start..=end)
156+
.map(|n| format_number(n, width, use_padding))
157+
.collect()
158+
} else {
159+
// Descending range
160+
(end..=start)
161+
.rev()
162+
.map(|n| format_number(n, width, use_padding))
163+
.collect()
164+
};
165+
166+
Ok(values.join(","))
167+
}
168+
169+
/// Renders `n` as decimal text, left-padding with zeros when requested.
///
/// Padding to `width` characters is applied only when `use_padding` is set and `n` is
/// non-negative; negative values and unpadded values use the plain decimal form.
fn format_number(n: i64, width: usize, use_padding: bool) -> String {
    match (use_padding, n.is_negative()) {
        (true, false) => format!("{:0>width$}", n, width = width),
        _ => n.to_string(),
    }
}
177+
178+
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `input` expands successfully to exactly `expected`.
    fn assert_expands(input: &str, expected: &str) {
        let result = expand_numeric_ranges(input).unwrap();
        assert_eq!(result, expected);
    }

    #[test]
    fn test_no_expansion_needed() {
        assert_expands("s3://bucket/*.parquet", "s3://bucket/*.parquet");
    }

    #[test]
    fn test_simple_range() {
        assert_expands("s3://bucket/{0..3}.parquet", "s3://bucket/{0,1,2,3}.parquet");
    }

    #[test]
    fn test_leading_zeros() {
        assert_expands(
            "s3://bucket/{00..03}.parquet",
            "s3://bucket/{00,01,02,03}.parquet",
        );
    }

    #[test]
    fn test_leading_zeros_larger() {
        assert_expands(
            "s3://bucket/{000..005}.parquet",
            "s3://bucket/{000,001,002,003,004,005}.parquet",
        );
    }

    #[test]
    fn test_reverse_range() {
        assert_expands("s3://bucket/{3..0}.parquet", "s3://bucket/{3,2,1,0}.parquet");
    }

    #[test]
    fn test_negative_range() {
        assert_expands(
            "s3://bucket/{-2..1}.parquet",
            "s3://bucket/{-2,-1,0,1}.parquet",
        );
    }

    #[test]
    fn test_single_value_range() {
        assert_expands("s3://bucket/{5..5}.parquet", "s3://bucket/{5}.parquet");
    }

    #[test]
    fn test_multiple_ranges() {
        assert_expands(
            "s3://bucket/{0..1}/{0..2}.parquet",
            "s3://bucket/{0,1}/{0,1,2}.parquet",
        );
    }

    #[test]
    fn test_mixed_with_alternation() {
        // The `{a,b}` alternation must come through untouched.
        assert_expands(
            "s3://bucket/{0..1}_{a,b}.parquet",
            "s3://bucket/{0,1}_{a,b}.parquet",
        );
    }

    #[test]
    fn test_range_in_middle_of_path() {
        assert_expands(
            "s3://bucket/data_{0..2}_suffix.parquet",
            "s3://bucket/data_{0,1,2}_suffix.parquet",
        );
    }

    #[test]
    fn test_local_path() {
        assert_expands("/local/path/{0..2}.csv", "/local/path/{0,1,2}.csv");
    }

    #[test]
    fn test_http_url() {
        assert_expands(
            "https://example.com/{0..2}.json",
            "https://example.com/{0,1,2}.json",
        );
    }

    #[test]
    fn test_range_too_large() {
        // `unwrap_err` fails the test if the oversized range was accepted.
        let err = expand_numeric_ranges("s3://bucket/{0..100000}.parquet").unwrap_err();
        assert!(err.to_string().contains("exceeds the maximum"));
    }

    #[test]
    fn test_no_change_for_alternation() {
        // A glob containing only alternation syntax passes through unchanged.
        assert_expands(
            "s3://bucket/{foo,bar,baz}.parquet",
            "s3://bucket/{foo,bar,baz}.parquet",
        );
    }

    #[test]
    fn test_asymmetric_padding() {
        // Only the start bound has a leading zero; both bounds are two characters
        // wide, so every value is padded to width two.
        assert_expands(
            "s3://bucket/{08..12}.parquet",
            "s3://bucket/{08,09,10,11,12}.parquet",
        );
    }

    #[test]
    fn test_escaped_range_not_expanded() {
        // With both braces escaped, the end bound is followed by `\` rather than `}`,
        // so the range regex does not match and the glob is returned as-is.
        // Note: an input that escapes only the opening brace (`\{0..3}`) would still
        // match and be expanded.
        assert_expands(
            r"s3://bucket/\{0..3\}.parquet",
            r"s3://bucket/\{0..3\}.parquet",
        );
    }
}

0 commit comments

Comments
 (0)