|
| 1 | +//! Numeric range expansion for glob patterns. |
| 2 | +//! |
| 3 | +//! This module provides functionality to expand numeric range patterns like `{0..10}` |
| 4 | +//! into alternation patterns like `{0,1,2,3,4,5,6,7,8,9,10}` that are compatible with |
| 5 | +//! the `globset` crate. |
| 6 | +//! |
| 7 | +//! # Examples |
| 8 | +//! |
| 9 | +//! ```ignore |
| 10 | +//! use daft_io::expand_numeric_ranges; |
| 11 | +//! |
| 12 | +//! // Simple range |
| 13 | +//! let result = expand_numeric_ranges("s3://bucket/{0..3}.parquet")?; |
| 14 | +//! assert_eq!(result, "s3://bucket/{0,1,2,3}.parquet"); |
| 15 | +//! |
| 16 | +//! // With leading zeros |
| 17 | +//! let result = expand_numeric_ranges("s3://bucket/{00..03}.parquet")?; |
| 18 | +//! assert_eq!(result, "s3://bucket/{00,01,02,03}.parquet"); |
| 19 | +//! |
| 20 | +//! // Reverse range |
| 21 | +//! let result = expand_numeric_ranges("s3://bucket/{3..0}.parquet")?; |
| 22 | +//! assert_eq!(result, "s3://bucket/{3,2,1,0}.parquet"); |
| 23 | +//! ``` |
| 24 | +
|
| 25 | +use std::{borrow::Cow, sync::LazyLock}; |
| 26 | + |
| 27 | +use regex::Regex; |
| 28 | + |
| 29 | +/// Maximum number of elements allowed in a single range expansion. |
| 30 | +/// This prevents memory issues from patterns like `{0..1000000}`. |
| 31 | +const MAX_RANGE_SIZE: usize = 10_000; |
| 32 | + |
| 33 | +/// Regex pattern to match numeric range syntax: {start..end} |
| 34 | +/// Supports optional leading zeros and negative numbers. |
| 35 | +/// Examples: {0..10}, {00..99}, {-5..5} |
| 36 | +static NUMERIC_RANGE_PATTERN: LazyLock<Regex> = LazyLock::new(|| { |
| 37 | + // Match {start..end} where start and end are integers (possibly with leading zeros or negative) |
| 38 | + // We need to be careful not to match alternation syntax like {a,b,c} |
| 39 | + Regex::new(r"\{(-?\d+)\.\.(-?\d+)\}").expect("Invalid regex pattern for numeric range") |
| 40 | +}); |
| 41 | + |
| 42 | +/// Expands all numeric range patterns in a glob string. |
| 43 | +/// |
| 44 | +/// Converts patterns like `{0..10}` to `{0,1,2,3,4,5,6,7,8,9,10}`. |
| 45 | +/// |
| 46 | +/// # Arguments |
| 47 | +/// |
| 48 | +/// * `glob` - The glob pattern string that may contain numeric ranges |
| 49 | +/// |
| 50 | +/// # Returns |
| 51 | +/// |
| 52 | +/// * `Ok(Cow::Borrowed(glob))` - If no range patterns found (zero allocation) |
| 53 | +/// * `Ok(Cow::Owned(String))` - The expanded glob pattern |
| 54 | +/// * `Err(Error)` - If a range is too large (exceeds MAX_RANGE_SIZE) |
| 55 | +/// |
| 56 | +/// # Examples |
| 57 | +/// |
| 58 | +/// Basic usage: |
| 59 | +/// ```ignore |
| 60 | +/// let expanded = expand_numeric_ranges("data/{0..2}.csv")?; |
| 61 | +/// assert_eq!(expanded, "data/{0,1,2}.csv"); |
| 62 | +/// ``` |
| 63 | +pub fn expand_numeric_ranges(glob: &str) -> super::Result<Cow<'_, str>> { |
| 64 | + // If no numeric range pattern found, return as-is (zero allocation) |
| 65 | + if !NUMERIC_RANGE_PATTERN.is_match(glob) { |
| 66 | + return Ok(Cow::Borrowed(glob)); |
| 67 | + } |
| 68 | + |
| 69 | + // We need to expand ranges one at a time because each expansion might |
| 70 | + // change the string length and positions |
| 71 | + let mut result = glob.to_string(); |
| 72 | + |
| 73 | + // Keep expanding until no more ranges are found |
| 74 | + // This handles multiple ranges in a single glob pattern |
| 75 | + while let Some(captures) = NUMERIC_RANGE_PATTERN.captures(&result) { |
| 76 | + let full_match = captures.get(0).unwrap(); |
| 77 | + let start_str = captures.get(1).unwrap().as_str(); |
| 78 | + let end_str = captures.get(2).unwrap().as_str(); |
| 79 | + |
| 80 | + let expansion = expand_single_range(start_str, end_str)?; |
| 81 | + |
| 82 | + // Replace the matched range with the expansion |
| 83 | + result = format!( |
| 84 | + "{}{{{}}}{}", |
| 85 | + &result[..full_match.start()], |
| 86 | + expansion, |
| 87 | + &result[full_match.end()..] |
| 88 | + ); |
| 89 | + } |
| 90 | + |
| 91 | + Ok(Cow::Owned(result)) |
| 92 | +} |
| 93 | + |
| 94 | +/// Expands a single numeric range into a comma-separated list. |
| 95 | +/// |
| 96 | +/// # Arguments |
| 97 | +/// |
| 98 | +/// * `start_str` - The start value as a string (preserves leading zeros) |
| 99 | +/// * `end_str` - The end value as a string (preserves leading zeros) |
| 100 | +/// |
| 101 | +/// # Returns |
| 102 | +/// |
| 103 | +/// A comma-separated string of all values in the range. |
| 104 | +fn expand_single_range(start_str: &str, end_str: &str) -> super::Result<String> { |
| 105 | + let start: i64 = start_str |
| 106 | + .parse() |
| 107 | + .map_err(|_| super::Error::InvalidArgument { |
| 108 | + msg: format!("Invalid range start value: {}", start_str), |
| 109 | + })?; |
| 110 | + |
| 111 | + let end: i64 = end_str.parse().map_err(|_| super::Error::InvalidArgument { |
| 112 | + msg: format!("Invalid range end value: {}", end_str), |
| 113 | + })?; |
| 114 | + |
| 115 | + // Calculate the range size using i128 to avoid overflow |
| 116 | + // when start and end are at opposite extremes of i64 range |
| 117 | + let range_size = ((end as i128) - (start as i128)).unsigned_abs() as usize + 1; |
| 118 | + if range_size > MAX_RANGE_SIZE { |
| 119 | + return Err(super::Error::InvalidArgument { |
| 120 | + msg: format!( |
| 121 | + "Numeric range {{{}..{}}} would expand to {} elements, which exceeds the maximum of {}. \ |
| 122 | + Consider using a smaller range or Python list comprehension instead.", |
| 123 | + start_str, end_str, range_size, MAX_RANGE_SIZE |
| 124 | + ), |
| 125 | + }); |
| 126 | + } |
| 127 | + |
| 128 | + // Determine the width for zero-padding based on the longer of start or end |
| 129 | + // Only apply padding if either value has leading zeros |
| 130 | + let start_has_leading_zero = start_str.starts_with('0') && start_str.len() > 1 && start >= 0; |
| 131 | + let end_has_leading_zero = end_str.starts_with('0') && end_str.len() > 1 && end >= 0; |
| 132 | + let use_padding = start_has_leading_zero || end_has_leading_zero; |
| 133 | + |
| 134 | + let width = if use_padding { |
| 135 | + // Use the maximum width between start and end strings |
| 136 | + // For negative numbers, we don't count the minus sign for padding purposes |
| 137 | + let start_width = if start < 0 { |
| 138 | + start_str.len() - 1 |
| 139 | + } else { |
| 140 | + start_str.len() |
| 141 | + }; |
| 142 | + let end_width = if end < 0 { |
| 143 | + end_str.len() - 1 |
| 144 | + } else { |
| 145 | + end_str.len() |
| 146 | + }; |
| 147 | + start_width.max(end_width) |
| 148 | + } else { |
| 149 | + 0 |
| 150 | + }; |
| 151 | + |
| 152 | + // Generate the range values |
| 153 | + let values: Vec<String> = if start <= end { |
| 154 | + // Ascending range |
| 155 | + (start..=end) |
| 156 | + .map(|n| format_number(n, width, use_padding)) |
| 157 | + .collect() |
| 158 | + } else { |
| 159 | + // Descending range |
| 160 | + (end..=start) |
| 161 | + .rev() |
| 162 | + .map(|n| format_number(n, width, use_padding)) |
| 163 | + .collect() |
| 164 | + }; |
| 165 | + |
| 166 | + Ok(values.join(",")) |
| 167 | +} |
| 168 | + |
/// Renders `n` as a decimal string, optionally left-padded with zeros.
///
/// Padding to `width` characters happens only when `use_padding` is set and
/// `n` is non-negative; in every other case the plain decimal form is returned.
/// Values already at least `width` characters long are returned unpadded.
fn format_number(n: i64, width: usize, use_padding: bool) -> String {
    match (use_padding, n >= 0) {
        (true, true) => format!("{n:0>width$}"),
        _ => n.to_string(),
    }
}
| 177 | + |
#[cfg(test)]
mod tests {
    use super::*;

    /// Expands `glob` and returns the result as an owned string, panicking if
    /// the expansion reports an error.
    fn expand(glob: &str) -> String {
        expand_numeric_ranges(glob).unwrap().into_owned()
    }

    #[test]
    fn test_no_expansion_needed() {
        assert_eq!(expand("s3://bucket/*.parquet"), "s3://bucket/*.parquet");
    }

    #[test]
    fn test_simple_range() {
        assert_eq!(
            expand("s3://bucket/{0..3}.parquet"),
            "s3://bucket/{0,1,2,3}.parquet"
        );
    }

    #[test]
    fn test_leading_zeros() {
        assert_eq!(
            expand("s3://bucket/{00..03}.parquet"),
            "s3://bucket/{00,01,02,03}.parquet"
        );
    }

    #[test]
    fn test_leading_zeros_larger() {
        assert_eq!(
            expand("s3://bucket/{000..005}.parquet"),
            "s3://bucket/{000,001,002,003,004,005}.parquet"
        );
    }

    #[test]
    fn test_reverse_range() {
        assert_eq!(
            expand("s3://bucket/{3..0}.parquet"),
            "s3://bucket/{3,2,1,0}.parquet"
        );
    }

    #[test]
    fn test_negative_range() {
        assert_eq!(
            expand("s3://bucket/{-2..1}.parquet"),
            "s3://bucket/{-2,-1,0,1}.parquet"
        );
    }

    #[test]
    fn test_single_value_range() {
        assert_eq!(
            expand("s3://bucket/{5..5}.parquet"),
            "s3://bucket/{5}.parquet"
        );
    }

    #[test]
    fn test_multiple_ranges() {
        assert_eq!(
            expand("s3://bucket/{0..1}/{0..2}.parquet"),
            "s3://bucket/{0,1}/{0,1,2}.parquet"
        );
    }

    #[test]
    fn test_mixed_with_alternation() {
        // Only the numeric range is rewritten; the `{a,b}` alternation is left alone.
        assert_eq!(
            expand("s3://bucket/{0..1}_{a,b}.parquet"),
            "s3://bucket/{0,1}_{a,b}.parquet"
        );
    }

    #[test]
    fn test_range_in_middle_of_path() {
        assert_eq!(
            expand("s3://bucket/data_{0..2}_suffix.parquet"),
            "s3://bucket/data_{0,1,2}_suffix.parquet"
        );
    }

    #[test]
    fn test_local_path() {
        assert_eq!(expand("/local/path/{0..2}.csv"), "/local/path/{0,1,2}.csv");
    }

    #[test]
    fn test_http_url() {
        assert_eq!(
            expand("https://example.com/{0..2}.json"),
            "https://example.com/{0,1,2}.json"
        );
    }

    #[test]
    fn test_range_too_large() {
        let err = expand_numeric_ranges("s3://bucket/{0..100000}.parquet")
            .expect_err("a range above the size limit must be rejected");
        assert!(err.to_string().contains("exceeds the maximum"));
    }

    #[test]
    fn test_no_change_for_alternation() {
        // A glob containing nothing but alternation syntax passes through unchanged.
        assert_eq!(
            expand("s3://bucket/{foo,bar,baz}.parquet"),
            "s3://bucket/{foo,bar,baz}.parquet"
        );
    }

    #[test]
    fn test_asymmetric_padding() {
        // Only the start bound carries a leading zero; every value is still
        // padded to the width of the wider bound.
        assert_eq!(
            expand("s3://bucket/{08..12}.parquet"),
            "s3://bucket/{08,09,10,11,12}.parquet"
        );
    }

    #[test]
    fn test_escaped_range_not_expanded() {
        // Escaped braces must not be expanded. The range pattern requires the
        // end value to be followed directly by `}`; here a backslash sits in
        // between, so nothing matches.
        // NOTE: an input that escapes only the opening brace (`\{0..3}`) would
        // still be expanded; this test covers the fully escaped form only.
        assert_eq!(
            expand(r"s3://bucket/\{0..3\}.parquet"),
            r"s3://bucket/\{0..3\}.parquet"
        );
    }
}
0 commit comments