@@ -519,6 +519,7 @@ impl Stream {
519
519
// if yes, then merge them and save
520
520
521
521
if let Some ( mut schema) = schema {
522
+ // calculate field stats for all user defined streams
522
523
if self . get_stream_type ( ) != StreamType :: Internal {
523
524
if let Err ( err) = self . calculate_field_stats ( rbs, schema. clone ( ) . into ( ) ) . await {
524
525
warn ! (
@@ -731,6 +732,7 @@ impl Stream {
731
732
let mut writer = ArrowWriter :: try_new ( & mut part_file, schema. clone ( ) , Some ( props. clone ( ) ) ) ?;
732
733
for ref record in record_reader. merged_iter ( schema. clone ( ) , time_partition. cloned ( ) ) {
733
734
writer. write ( record) ?;
735
+ // Collect record batches for finding statistics later
734
736
record_batches. push ( record. clone ( ) ) ;
735
737
}
736
738
writer. close ( ) ?;
@@ -1023,6 +1025,9 @@ impl Stream {
1023
1025
Ok ( ( ) )
1024
1026
}
1025
1027
1028
+ /// Calculates field statistics for the stream and pushes them to the internal stats dataset.
1029
+ /// This function creates a new internal stream for stats if it doesn't exist.
1030
+ /// It collects statistics for each field in the stream
1026
1031
async fn calculate_field_stats (
1027
1032
& self ,
1028
1033
record_batches : Vec < RecordBatch > ,
@@ -1064,6 +1069,9 @@ impl Stream {
1064
1069
Ok ( ( ) )
1065
1070
}
1066
1071
1072
+ /// Collects statistics for all fields in the stream.
1073
+ /// Returns a vector of `FieldStat` for each field with non-zero count.
1074
+ /// Uses `buffer_unordered` to run up to `MAX_CONCURRENT_FIELD_STATS` queries concurrently.
1067
1075
async fn collect_all_field_stats (
1068
1076
& self ,
1069
1077
ctx : & SessionContext ,
@@ -1084,10 +1092,13 @@ impl Stream {
1084
1092
1085
1093
futures:: stream:: iter ( field_futures)
1086
1094
. buffer_unordered ( MAX_CONCURRENT_FIELD_STATS )
1087
- . filter_map ( |x| async { x } )
1095
+ . filter_map ( std :: future :: ready )
1088
1096
. collect :: < Vec < _ > > ( )
1089
1097
. await
1090
1098
}
1099
+
1100
+ /// Calculates statistics for a single field in the stream.
1101
+ /// Returns `None` if the count query returns 0.
1091
1102
async fn calculate_single_field_stats (
1092
1103
ctx : SessionContext ,
1093
1104
stream_name : String ,
@@ -1122,6 +1133,9 @@ impl Stream {
1122
1133
} )
1123
1134
}
1124
1135
1136
+ /// Queries a single integer value from the DataFusion context.
1137
+ /// Returns `None` if the query fails or returns no rows.
1138
+ /// This is used for fetching record count for a field and distinct count.
1125
1139
async fn query_single_i64 ( ctx : & SessionContext , sql : & str ) -> Option < i64 > {
1126
1140
let df = ctx. sql ( sql) . await . ok ( ) ?;
1127
1141
let batches = df. collect ( ) . await . ok ( ) ?;
@@ -1134,6 +1148,8 @@ impl Stream {
1134
1148
Some ( array. value ( 0 ) )
1135
1149
}
1136
1150
1151
+ /// Helper function to format an Arrow value at a given index into a string.
1152
+ /// Handles null values and different data types like String, Int64, Float64, Timestamp, Date32, and Boolean.
1137
1153
fn format_arrow_value ( array : & dyn Array , idx : usize ) -> String {
1138
1154
if array. is_null ( idx) {
1139
1155
return "NULL" . to_string ( ) ;
@@ -1164,6 +1180,9 @@ impl Stream {
1164
1180
}
1165
1181
}
1166
1182
1183
+ /// This function is used to fetch distinct values and their counts for a field in the stream.
1184
+ /// Returns a vector of `DistinctStat` containing distinct values and their counts.
1185
+ /// The query groups by the field and orders by the count in descending order, limiting the results to `PARSEABLE.options.max_field_statistics`.
1167
1186
async fn query_distinct_stats (
1168
1187
ctx : & SessionContext ,
1169
1188
stream_name : & str ,
0 commit comments