
Commit 9c2d9f9

feat: add stats for each field
Read the record batches from arrow files in the staging directory, run DataFusion queries to fetch the count, distinct count, and per-distinct-value count for all fields in the dataset, and store the results in the <dataset>_pmeta dataset.

The UI calls the SQL query below to fetch the stats from this dataset:

```
SELECT field_name, field_count, distinct_count, distinct_value, distinct_value_count
FROM (
    SELECT field_stats_field_name as field_name,
           field_stats_distinct_stats_distinct_value as distinct_value,
           SUM(field_stats_count) as field_count,
           field_stats_distinct_count as distinct_count,
           SUM(field_stats_distinct_stats_count) as distinct_value_count,
           ROW_NUMBER() OVER (
               PARTITION BY field_stats_field_name
               ORDER BY SUM(field_stats_count) DESC
           ) as rn
    FROM <dataset>_pmeta
    WHERE field_stats_field_name = 'status_code'
      AND field_stats_distinct_stats_distinct_value IS NOT NULL
    GROUP BY field_stats_field_name,
             field_stats_distinct_stats_distinct_value,
             field_stats_distinct_count
) ranked
WHERE rn <= 5
ORDER BY field_name, distinct_value_count DESC;
```
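For context, the per-field statistics written to <dataset>_pmeta are themselves produced by DataFusion queries over the in-memory record batches. A sketch of their shape, using a hypothetical `status_code` field on a hypothetical dataset `app_logs` (the actual SQL is built dynamically in `calculate_single_field_stats` and `query_distinct_stats` in this diff):

```
-- total non-null count for the field
select count("status_code") as count from "app_logs" where "status_code" is not null;

-- number of distinct values for the field
select COUNT(DISTINCT "status_code") as distinct_count from "app_logs";

-- per-distinct-value counts, capped at 50 values per field
select count(*) as distinct_count, "status_code" from "app_logs"
where "status_code" is not null
group by "status_code"
order by distinct_count desc
limit 50;
```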
1 parent caa9604 commit 9c2d9f9

File tree

1 file changed

+221 -27 lines changed

src/parseable/streams.rs

Lines changed: 221 additions & 27 deletions
@@ -27,10 +27,13 @@ use std::{
     time::{Instant, SystemTime, UNIX_EPOCH},
 };
 
-use arrow_array::RecordBatch;
+use arrow_array::{Array, Float64Array, Int64Array, NullArray, StringArray};
+use arrow_array::{BooleanArray, RecordBatch, TimestampMillisecondArray};
 use arrow_schema::{Field, Fields, Schema};
-use chrono::{NaiveDateTime, Timelike, Utc};
+use chrono::{DateTime, NaiveDateTime, Timelike, Utc};
+use datafusion::{datasource::MemTable, prelude::SessionContext};
 use derive_more::{Deref, DerefMut};
+use futures::stream::{FuturesUnordered, StreamExt};
 use itertools::Itertools;
 use parquet::{
     arrow::ArrowWriter,
@@ -41,6 +44,7 @@ use parquet::{
 };
 use rand::distributions::DistString;
 use relative_path::RelativePathBuf;
+use serde::Serialize;
 use tokio::task::JoinSet;
 use tracing::{error, info, trace, warn};
 
@@ -50,9 +54,14 @@ use crate::{
         format::{LogSource, LogSourceEntry},
         DEFAULT_TIMESTAMP_KEY,
     },
+    handlers::http::{
+        cluster::INTERNAL_STREAM_NAME, ingest::PostError,
+        modal::utils::ingest_utils::flatten_and_push_logs,
+    },
     metadata::{LogStreamMetadata, SchemaVersion},
     metrics,
     option::Mode,
+    parseable::PARSEABLE,
     storage::{object_storage::to_bytes, retention::Retention, StreamType},
     utils::time::{Minute, TimeRange},
     LOCK_EXPECT, OBJECT_STORE_DATA_GRANULARITY,
@@ -67,6 +76,26 @@ use super::{
     LogStream, ARROW_FILE_EXTENSION,
 };
 
+#[derive(Serialize, Debug)]
+struct DistinctStat {
+    distinct_value: String,
+    count: i64,
+}
+
+#[derive(Serialize, Debug)]
+struct FieldStat {
+    field_name: String,
+    count: i64,
+    distinct_count: i64,
+    distinct_stats: Vec<DistinctStat>,
+}
+
+#[derive(Serialize, Debug)]
+struct DatasetStats {
+    dataset_name: String,
+    field_stats: Vec<FieldStat>,
+}
+
 /// Returns the filename for parquet if provided arrows file path is valid as per our expectation
 fn arrow_path_to_parquet(path: &Path, random_string: &str) -> Option<PathBuf> {
     let filename = path.file_stem()?.to_str()?;
@@ -114,7 +143,7 @@ impl Stream {
         let data_path = options.local_stream_data_path(&stream_name);
 
         Arc::new(Self {
-            stream_name,
+            stream_name: stream_name.clone(),
             metadata: RwLock::new(metadata),
             data_path,
             options,
@@ -306,7 +335,7 @@ impl Stream {
     }
 
     /// Converts arrow files in staging into parquet files, does so only for past minutes when run with `!shutdown_signal`
-    pub fn prepare_parquet(&self, shutdown_signal: bool) -> Result<(), StagingError> {
+    pub async fn prepare_parquet(&self, shutdown_signal: bool) -> Result<(), StagingError> {
         info!(
             "Starting arrow_conversion job for stream- {}",
             self.stream_name
@@ -317,18 +346,23 @@ impl Stream {
 
         // read arrow files on disk
        // convert them to parquet
-        let schema = self
-            .convert_disk_files_to_parquet(
-                time_partition.as_ref(),
-                custom_partition.as_ref(),
-                shutdown_signal,
-            )
-            .inspect_err(|err| warn!("Error while converting arrow to parquet- {err:?}"))?;
-
+        let (schema, rbs) = self.convert_disk_files_to_parquet(
+            time_partition.as_ref(),
+            custom_partition.as_ref(),
+            shutdown_signal,
+        )?;
         // check if there is already a schema file in staging pertaining to this stream
         // if yes, then merge them and save
 
         if let Some(mut schema) = schema {
+            if !&self.stream_name.contains(INTERNAL_STREAM_NAME) {
+                if let Err(err) = self.calculate_field_stats(rbs, schema.clone().into()).await {
+                    warn!(
+                        "Error calculating field stats for stream {}: {}",
+                        self.stream_name, err
+                    );
+                }
+            }
             let static_schema_flag = self.get_static_schema_flag();
             if !static_schema_flag {
                 // schema is dynamic, read from staging and merge if present
@@ -429,7 +463,7 @@ impl Stream {
         time_partition: Option<&String>,
         custom_partition: Option<&String>,
         shutdown_signal: bool,
-    ) -> Result<Option<Schema>, StagingError> {
+    ) -> Result<(Option<Schema>, Vec<RecordBatch>), StagingError> {
         let mut schemas = Vec::new();
 
         let now = SystemTime::now();
@@ -464,8 +498,7 @@ impl Stream {
         metrics::STORAGE_SIZE
             .with_label_values(&["staging", &self.stream_name, "arrows"])
             .set(total_arrow_files_size as i64);
-
-        // warn!("staging files-\n{staging_files:?}\n");
+        let mut record_batches = Vec::new();
         for (parquet_path, arrow_files) in staging_files {
             let record_reader = MergedReverseRecordReader::try_new(&arrow_files);
             if record_reader.readers.is_empty() {
@@ -486,6 +519,7 @@ impl Stream {
             let mut writer = ArrowWriter::try_new(&mut part_file, schema.clone(), Some(props))?;
             for ref record in record_reader.merged_iter(schema, time_partition.cloned()) {
                 writer.write(record)?;
+                record_batches.push(record.clone());
             }
             writer.close()?;
 
@@ -525,10 +559,10 @@ impl Stream {
         }
 
         if schemas.is_empty() {
-            return Ok(None);
+            return Ok((None, record_batches));
         }
 
-        Ok(Some(Schema::try_merge(schemas).unwrap()))
+        Ok((Some(Schema::try_merge(schemas)?), record_batches))
     }
 
     pub fn updated_schema(&self, current_schema: Schema) -> Schema {
@@ -725,7 +759,7 @@ impl Stream {
     }
 
     /// First flushes arrows onto disk and then converts the arrow into parquet
-    pub fn flush_and_convert(&self, shutdown_signal: bool) -> Result<(), StagingError> {
+    pub async fn flush_and_convert(&self, shutdown_signal: bool) -> Result<(), StagingError> {
         let start_flush = Instant::now();
         self.flush(shutdown_signal);
         trace!(
@@ -735,7 +769,8 @@ impl Stream {
         );
 
         let start_convert = Instant::now();
-        self.prepare_parquet(shutdown_signal)?;
+
+        self.prepare_parquet(shutdown_signal).await?;
         trace!(
             "Converting arrows to parquet on stream ({}) took: {}s",
             self.stream_name,
@@ -744,6 +779,165 @@ impl Stream {
 
         Ok(())
     }
+
+    async fn calculate_field_stats(
+        &self,
+        record_batches: Vec<RecordBatch>,
+        schema: Arc<Schema>,
+    ) -> Result<(), PostError> {
+        let dataset_meta = format!("{}_{INTERNAL_STREAM_NAME}", &self.stream_name);
+        let log_source_entry = LogSourceEntry::new(LogSource::Json, HashSet::new());
+        PARSEABLE
+            .create_stream_if_not_exists(
+                &dataset_meta,
+                StreamType::Internal,
+                vec![log_source_entry],
+            )
+            .await?;
+        let mem_table = MemTable::try_new(schema.clone(), vec![record_batches])
+            .map_err(|e| PostError::Invalid(e.into()))?;
+        let ctx = SessionContext::new();
+        ctx.register_table(&self.stream_name, Arc::new(mem_table))
+            .map_err(|e| PostError::Invalid(e.into()))?;
+
+        let field_stats = self.collect_all_field_stats(&ctx, &schema).await;
+
+        let stats = DatasetStats {
+            dataset_name: self.stream_name.clone(),
+            field_stats,
+        };
+        if stats.field_stats.is_empty() {
+            return Ok(());
+        }
+        let stats_value = serde_json::to_value(&stats).map_err(|e| PostError::Invalid(e.into()))?;
+
+        flatten_and_push_logs(
+            stats_value,
+            &dataset_meta,
+            &LogSource::Json,
+            &HashMap::new(),
+        )
+        .await?;
+        Ok(())
+    }
+
+    async fn collect_all_field_stats(
+        &self,
+        ctx: &SessionContext,
+        schema: &Arc<Schema>,
+    ) -> Vec<FieldStat> {
+        let field_futures = schema.fields().iter().map(|field| {
+            let ctx = ctx.clone();
+            let stream_name = self.stream_name.clone();
+            let field_name = field.name().clone();
+            async move { Self::calculate_single_field_stats(ctx, stream_name, field_name).await }
+        });
+
+        FuturesUnordered::from_iter(field_futures)
+            .filter_map(|x| async { x })
+            .collect::<Vec<_>>()
+            .await
+    }
+
+    async fn calculate_single_field_stats(
+        ctx: SessionContext,
+        stream_name: String,
+        field_name: String,
+    ) -> Option<FieldStat> {
+        let count = Self::query_single_i64(
+            &ctx,
+            &format!(
+                "select count(\"{field_name}\") as count from \"{stream_name}\" where \"{field_name}\" is not null"
+            ),
+        )
+        .await?;
+        if count == 0 {
+            return None;
+        }
+
+        let distinct_count = Self::query_single_i64(
+            &ctx,
+            &format!(
+                "select COUNT(DISTINCT \"{field_name}\") as distinct_count from \"{stream_name}\""
+            ),
+        )
+        .await?;
+
+        let distinct_stats = Self::query_distinct_stats(&ctx, &stream_name, &field_name).await;
+
+        Some(FieldStat {
+            field_name,
+            count,
+            distinct_count,
+            distinct_stats,
+        })
+    }
+
+    async fn query_single_i64(ctx: &SessionContext, sql: &str) -> Option<i64> {
+        let df = ctx.sql(sql).await.ok()?;
+        let batches = df.collect().await.ok()?;
+        let array = batches
+            .first()?
+            .column(0)
+            .as_any()
+            .downcast_ref::<arrow_array::Int64Array>()?;
+        Some(array.value(0))
+    }
+
+    fn format_arrow_value(array: &dyn Array, idx: usize) -> String {
+        if array.is_null(idx) {
+            return "NULL".to_string();
+        }
+        if let Some(arr) = array.as_any().downcast_ref::<StringArray>() {
+            arr.value(idx).to_string()
+        } else if let Some(arr) = array.as_any().downcast_ref::<Int64Array>() {
+            arr.value(idx).to_string()
+        } else if let Some(arr) = array.as_any().downcast_ref::<Float64Array>() {
+            arr.value(idx).to_string()
+        } else if let Some(arr) = array.as_any().downcast_ref::<TimestampMillisecondArray>() {
+            let timestamp = arr.value(idx);
+            DateTime::from_timestamp_millis(timestamp)
+                .map(|dt| dt.to_string())
+                .unwrap_or_else(|| "INVALID_TIMESTAMP".to_string())
+        } else if let Some(arr) = array.as_any().downcast_ref::<BooleanArray>() {
+            arr.value(idx).to_string()
+        } else if array.as_any().downcast_ref::<NullArray>().is_some() {
+            "NULL".to_string()
+        } else {
+            "UNSUPPORTED".to_string()
+        }
+    }
+
+    async fn query_distinct_stats(
+        ctx: &SessionContext,
+        stream_name: &str,
+        field_name: &str,
+    ) -> Vec<DistinctStat> {
+        let sql = format!(
+            "select count(*) as distinct_count, \"{field_name}\" from \"{stream_name}\" where \"{field_name}\" is not null group by \"{field_name}\" order by distinct_count desc limit 50"
+        );
+        let mut distinct_stats = Vec::new();
+        if let Ok(df) = ctx.sql(&sql).await {
+            if let Ok(batches) = df.collect().await {
+                for rb in batches {
+                    let counts = rb
+                        .column(0)
+                        .as_any()
+                        .downcast_ref::<Int64Array>()
+                        .expect("Counts should be Int64Array");
+                    let values = rb.column(1).as_ref();
+                    for i in 0..rb.num_rows() {
+                        let value = Self::format_arrow_value(values, i);
+                        distinct_stats.push(DistinctStat {
+                            distinct_value: value,
+                            count: counts.value(i),
+                        });
+                    }
+                }
+            }
+        }
+        distinct_stats
+    }
 }
 
 #[derive(Deref, DerefMut, Default)]
@@ -829,7 +1023,7 @@ impl Streams {
             .map(Arc::clone)
             .collect();
         for stream in streams {
-            joinset.spawn(async move { stream.flush_and_convert(shutdown_signal) });
+            joinset.spawn(async move { stream.flush_and_convert(shutdown_signal).await });
         }
     }
 }
@@ -1019,7 +1213,7 @@ mod tests {
             None,
         )
         .convert_disk_files_to_parquet(None, None, false)?;
-        assert!(result.is_none());
+        assert!(result.0.is_none());
         // Verify metrics were set to 0
         let staging_files = metrics::STAGING_FILES.with_label_values(&[&stream]).get();
         assert_eq!(staging_files, 0);
@@ -1100,8 +1294,8 @@ mod tests {
            .convert_disk_files_to_parquet(None, None, true)
            .unwrap();
 
-        assert!(result.is_some());
-        let result_schema = result.unwrap();
+        assert!(result.0.is_some());
+        let result_schema = result.0.unwrap();
         assert_eq!(result_schema.fields().len(), 3);
 
         // Verify parquet files were created and the arrow files deleted
@@ -1149,8 +1343,8 @@ mod tests {
            .convert_disk_files_to_parquet(None, None, true)
            .unwrap();
 
-        assert!(result.is_some());
-        let result_schema = result.unwrap();
+        assert!(result.0.is_some());
+        let result_schema = result.0.unwrap();
         assert_eq!(result_schema.fields().len(), 3);
 
        // Verify parquet files were created and the arrow files deleted
@@ -1203,8 +1397,8 @@ mod tests {
            .convert_disk_files_to_parquet(None, None, false)
            .unwrap();
 
-        assert!(result.is_some());
-        let result_schema = result.unwrap();
+        assert!(result.0.is_some());
+        let result_schema = result.0.unwrap();
         assert_eq!(result_schema.fields().len(), 3);
 
         // Verify parquet files were created and the arrow file left