@@ -26,13 +26,13 @@ use std::{
     time::{Instant, SystemTime, UNIX_EPOCH},
 };
 
-use arrow_array::{Array, Float64Array, Int64Array, NullArray, StringArray};
+use arrow_array::{Array, Date32Array, Float64Array, Int64Array, NullArray, StringArray};
 use arrow_array::{BooleanArray, RecordBatch, TimestampMillisecondArray};
 use arrow_schema::{Field, Fields, Schema};
 use chrono::{DateTime, NaiveDateTime, Timelike, Utc};
 use datafusion::{datasource::MemTable, prelude::SessionContext};
 use derive_more::{Deref, DerefMut};
-use futures::stream::{FuturesUnordered, StreamExt};
+use futures_util::StreamExt;
 use itertools::Itertools;
 use parquet::{
     arrow::ArrowWriter,
@@ -75,6 +75,8 @@ use super::{
     LogStream, ARROW_FILE_EXTENSION,
 };
 
+const MAX_CONCURRENT_FIELD_STATS: usize = 10;
+
 #[derive(Serialize, Debug)]
 struct DistinctStat {
     distinct_value: String,
@@ -517,7 +519,7 @@ impl Stream {
         // if yes, then merge them and save
 
         if let Some(mut schema) = schema {
-            if !&self.stream_name.contains(INTERNAL_STREAM_NAME) {
+            if self.get_stream_type() != StreamType::Internal {
                 if let Err(err) = self.calculate_field_stats(rbs, schema.clone().into()).await {
                     warn!(
                         "Error calculating field stats for stream {}: {}",
@@ -1067,19 +1069,25 @@ impl Stream {
         ctx: &SessionContext,
         schema: &Arc<Schema>,
     ) -> Vec<FieldStat> {
-        let field_futures = schema.fields().iter().map(|field| {
+        // Collect field names into an owned Vec<String> to avoid lifetime issues
+        let field_names: Vec<String> = schema
+            .fields()
+            .iter()
+            .map(|field| field.name().clone())
+            .collect();
+
+        let field_futures = field_names.into_iter().map(|field_name| {
             let ctx = ctx.clone();
             let stream_name = self.stream_name.clone();
-            let field_name = field.name().clone();
             async move { Self::calculate_single_field_stats(ctx, stream_name, field_name).await }
         });
 
-        FuturesUnordered::from_iter(field_futures)
+        futures::stream::iter(field_futures)
+            .buffer_unordered(MAX_CONCURRENT_FIELD_STATS)
             .filter_map(|x| async { x })
             .collect::<Vec<_>>()
             .await
     }
-
     async fn calculate_single_field_stats(
         ctx: SessionContext,
         stream_name: String,
@@ -1117,11 +1125,12 @@ impl Stream {
     async fn query_single_i64(ctx: &SessionContext, sql: &str) -> Option<i64> {
         let df = ctx.sql(sql).await.ok()?;
        let batches = df.collect().await.ok()?;
-        let array = batches
-            .first()?
-            .column(0)
-            .as_any()
-            .downcast_ref::<arrow_array::Int64Array>()?;
+        let batch = batches.first()?;
+        if batch.num_rows() == 0 {
+            return None;
+        }
+        let array = batch.column(0).as_any().downcast_ref::<Int64Array>()?;
+
         Some(array.value(0))
     }
 
@@ -1140,11 +1149,17 @@ impl Stream {
             DateTime::from_timestamp_millis(timestamp)
                 .map(|dt| dt.to_string())
                 .unwrap_or_else(|| "INVALID_TIMESTAMP".to_string())
+        } else if let Some(arr) = array.as_any().downcast_ref::<Date32Array>() {
+            return arr.value(idx).to_string();
         } else if let Some(arr) = array.as_any().downcast_ref::<BooleanArray>() {
             arr.value(idx).to_string()
         } else if array.as_any().downcast_ref::<NullArray>().is_some() {
             "NULL".to_string()
         } else {
+            warn!(
+                "Unsupported array type for statistics: {:?}",
+                array.data_type()
+            );
             "UNSUPPORTED".to_string()
         }
     }
@@ -1155,7 +1170,8 @@ impl Stream {
         field_name: &str,
     ) -> Vec<DistinctStat> {
         let sql = format!(
-            "select count(*) as distinct_count, \"{field_name}\" from \"{stream_name}\" where \"{field_name}\" is not null group by \"{field_name}\" order by distinct_count desc limit 50"
+            "select count(*) as distinct_count, \"{field_name}\" from \"{stream_name}\" where \"{field_name}\" is not null group by \"{field_name}\" order by distinct_count desc limit {}",
+            PARSEABLE.options.max_field_statistics
         );
         let mut distinct_stats = Vec::new();
        if let Ok(df) = ctx.sql(&sql).await {
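
Note on the concurrency change above: the old code pushed one future per field into an unbounded FuturesUnordered, while the new code drives the same futures through futures::stream::iter(..).buffer_unordered(MAX_CONCURRENT_FIELD_STATS), so at most ten per-field statistics queries are in flight at any time. Below is a minimal standalone sketch of that pattern, not code from this repository; the field names, the stat_for_field stand-in for calculate_single_field_stats, and the use of futures::executor::block_on are illustrative assumptions.

use futures::stream::{self, StreamExt};

const MAX_CONCURRENT_FIELD_STATS: usize = 10; // same cap as the constant added above

// Hypothetical stand-in for calculate_single_field_stats: returns None when a
// field yields no statistics, mirroring the Option-returning query helpers.
async fn stat_for_field(field_name: String) -> Option<String> {
    Some(format!("stats for {field_name}"))
}

async fn collect_field_stats(field_names: Vec<String>) -> Vec<String> {
    stream::iter(field_names.into_iter().map(stat_for_field))
        // At most MAX_CONCURRENT_FIELD_STATS futures are polled concurrently;
        // results arrive in completion order, not input order.
        .buffer_unordered(MAX_CONCURRENT_FIELD_STATS)
        // Drop fields whose statistics computation returned None.
        .filter_map(|x| async { x })
        .collect::<Vec<_>>()
        .await
}

fn main() {
    let fields = vec!["status".to_string(), "latency_ms".to_string()];
    let stats = futures::executor::block_on(collect_field_stats(fields));
    println!("{stats:?}");
}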