Skip to content

Commit 31e742a

Browse files
jmsteurlvbirgelen
authored and committed
Adding snapshot selection support for audit event and CG problems
1 parent e4be154 commit 31e742a

File tree

5 files changed

+247
-17
lines changed

5 files changed

+247
-17
lines changed

security/security-design/shared-assets/oci-security-health-check-forensics/README.md

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,21 @@
1-
# SHOW_OCI CSV Query Tool
1+
# OCI Security Health Check Forensics Tool
22

3-
The SHOW_OCI Query Tool is designed to load and analyze data from Oracle Cloud Infrastructure (OCI) environments using SQL. This tool enables users to import CSV files containing OCI resource information (e.g., compute instances, users, compartments) and perform SQL queries on the data.
3+
Last updated: 11 June 2025
4+
5+
The OCI Security Health Check Forensics Tool (the tool) is designed to load and analyze data from Oracle Cloud Infrastructure (OCI) environments. This tool enables users to import CSV files containing OCI resource information (e.g., compute instances, users, compartments) and perform SQL queries on the data. This data is used to investigate configuration issues etc.
6+
7+
The tool can also digest audit events and cloud guard problems. These resources can be loaded with different snapshots from a certain date with a number of days prior to that date.
8+
9+
This data can be used to investigate anomalies.
410

511
## Features
612
- Automatic OCI data fetching using showoci integration
713
- **Audit events** and **Cloud Guard problems** fetching with parallel processing
814
- Advanced filtering capabilities for age-based and compartment analysis
9-
- - Load CSV files with OCI data from multiple tenancies
15+
- Interactive tenancy selection from combined OCI configuration files
16+
- Load CSV files with OCI data from multiple tenancies
1017
- Execute SQL queries on the loaded data using DuckDB backend. Stay tuned for autonomous DB support.
1118
- Support for `SHOW TABLES` and `DESCRIBE table_name` commands
12-
- Interactive tenancy selection from combined OCI configuration files
1319
- Command history and help system
1420
- Batch query execution from YAML files
1521

@@ -281,4 +287,12 @@ The tool supports parallel fetching for large datasets:
281287
- Date-based filtering with flexible column support
282288
- Compartment hierarchy analysis and visualization
283289
- Support for complex nested data structures
284-
- Chainable filter operations on query results
290+
- Chainable filter operations on query results
291+
292+
# License
293+
294+
Copyright (c) 2025 Oracle and/or its affiliates.
295+
296+
Licensed under the Universal Permissive License (UPL), Version 1.0.
297+
298+
See [LICENSE](https://github.com/oracle-devrel/technology-engineering/blob/main/LICENSE) for more details.

security/security-design/shared-assets/oci-security-health-check-forensics/classes/commands/control_commands.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,21 @@
1313
from classes.query_selector import QuerySelector
1414
from classes.output_formatter import OutputFormatter
1515
from classes.commands.filter_commands import AgeFilterCommand, CompartmentFilterCommand
16+
import json
17+
import pandas as pd
18+
import os
1619

1720
class SetQueriesCommand(Command):
1821
"""
1922
Usage: set queries [<directory>]
2023
Launches an interactive YAML-file picker and loads the selected queries.
24+
If the YAML file contains a snapshot_type, prompts for snapshot file selection.
2125
"""
2226
description = """Loads queries from a YAML file for batch execution.
2327
Usage: set queries [directory]
2428
- If directory is not specified, uses default query directory
2529
- Opens an interactive file picker to select the YAML file
30+
- If YAML contains snapshot_type, prompts to select a snapshot file
2631
- Loads selected queries into the execution queue"""
2732

2833
def execute(self, args: str):
@@ -35,9 +40,101 @@ def execute(self, args: str):
3540
return
3641

3742
qs = QuerySelector(yaml_path)
43+
44+
# Check if snapshot file is needed
45+
if qs.snapshot_type:
46+
print(f"\nThis query file requires {qs.snapshot_type} snapshot data.")
47+
48+
# Get current snapshot directory
49+
snapshot_dir = self.ctx.query_executor.current_snapshot_dir
50+
if not snapshot_dir:
51+
print("Error: No active tenancy snapshot. Use 'set tenancy' first.")
52+
return
53+
54+
# Let user select snapshot file
55+
snapshot_file = qs.select_snapshot_file(snapshot_dir)
56+
if not snapshot_file:
57+
print("No snapshot file selected. Query loading cancelled.")
58+
return
59+
60+
# Load the snapshot file into DuckDB
61+
table_name = self._load_snapshot_to_duckdb(snapshot_file, qs.snapshot_type)
62+
if table_name:
63+
qs.set_snapshot_table(table_name)
64+
print(f"✓ Loaded snapshot data into table: {table_name}")
65+
else:
66+
print("Failed to load snapshot data. Query loading cancelled.")
67+
return
68+
69+
# Select queries (with possible snapshot substitution)
3870
qs.select_queries()
3971
self.ctx.query_selector = qs
4072
print(f"Loaded queries from '{yaml_path}' into queue.")
73+
74+
if qs.snapshot_type:
75+
print(f"Queries will use snapshot table: {qs.snapshot_table}")
76+
77+
def _load_snapshot_to_duckdb(self, json_file, snapshot_type):
    """Load a JSON snapshot file into DuckDB and return the created table name.

    Args:
        json_file: Path to the snapshot JSON file (audit events or Cloud Guard
            problems) produced by the fetch commands.
        snapshot_type: Either "audit" or "cloudguard"; any other value gets a
            generic table name derived from the filename.

    Returns:
        The DuckDB table name on success, or None when the file holds no data
        or loading fails.
    """
    try:
        # Derive a stable, SQL-safe table name from the snapshot filename.
        # NOTE(review): '-' is now mapped to '_' for both snapshot types; the
        # audit branch previously stripped dashes entirely, which was
        # inconsistent with the cloudguard branch.
        filename = os.path.basename(json_file)
        if snapshot_type == "audit":
            suffix = filename.replace('audit_events_', '').replace('.json', '').replace('-', '_')
            table_name = f"audit_events_{suffix}"
        elif snapshot_type == "cloudguard":
            suffix = filename.replace('cloudguard_problems_', '').replace('.json', '').replace('-', '_')
            table_name = f"cloudguard_problems_{suffix}"
        else:
            table_name = filename.replace('.json', '').replace('-', '_')

        print(f"Loading {filename} into table {table_name}...")

        with open(json_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if not data:
            print("Warning: JSON file contains no data")
            return None

        # Reuse an existing table so re-selecting the same snapshot is cheap.
        existing_tables = self.ctx.query_executor.show_tables()
        if table_name in existing_tables:
            print(f"Table '{table_name}' already exists, using existing table.")
            return table_name

        # Flatten each nested JSON record into a flat column dict.
        flattened = []
        for item in data:
            flat_item = {}
            self._flatten_dict(item, flat_item)
            flattened.append(flat_item)

        df = pd.DataFrame(flattened)

        # Register the DataFrame under a temporary name, materialize it as a
        # real table, then drop the registration so the registered view does
        # not shadow (or collide with) the catalog table of the same name.
        temp_view = f"_tmp_{table_name}"
        self.ctx.query_executor.conn.register(temp_view, df)
        self.ctx.query_executor.conn.execute(f"CREATE TABLE {table_name} AS SELECT * FROM {temp_view}")
        self.ctx.query_executor.conn.unregister(temp_view)
        print(f"Created table '{table_name}' with {len(df)} rows and {len(df.columns)} columns")

        return table_name

    except Exception as e:
        print(f"Error loading snapshot into DuckDB: {e}")
        return None
125+
126+
def _flatten_dict(self, d, flat_dict, prefix=''):
127+
"""Recursively flatten nested dictionaries and handle lists"""
128+
for k, v in d.items():
129+
key = f"{prefix}{k}" if prefix else k
130+
key = key.replace(' ', '_').replace('-', '_').replace('.', '_')
131+
132+
if isinstance(v, dict):
133+
self._flatten_dict(v, flat_dict, f"{key}_")
134+
elif isinstance(v, list):
135+
flat_dict[key] = json.dumps(v) if v else None
136+
else:
137+
flat_dict[key] = v
41138

42139
class SetTenancyCommand(Command):
43140
"""

security/security-design/shared-assets/oci-security-health-check-forensics/classes/query_selector.py

Lines changed: 73 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,16 @@
1111
import yaml
1212
import questionary
1313
import queue
14+
import os
15+
import glob
1416

1517
class QuerySelector:
1618
def __init__(self, yaml_file=None):
1719
"""Initialize QuerySelector with an optional YAML file path and a FIFO queue."""
1820
self.yaml_file = yaml_file
1921
self.query_queue = queue.Queue() # Always initialize an empty FIFO queue
22+
self.snapshot_type = None
23+
self.snapshot_table = None
2024

2125
if yaml_file:
2226
self.queries = self.load_queries()
@@ -25,15 +29,74 @@ def __init__(self, yaml_file=None):
2529
self.queries = [] # Empty query list if no file is provided
2630

2731
def load_queries(self):
    """Read the YAML query file and return its query list.

    Side effect: sets self.snapshot_type from the file's optional
    "snapshot_type" key (None when the key is absent), which later triggers
    interactive snapshot-file selection.

    Returns:
        The list under the "queries" key, or [] when the file cannot be
        opened or parsed.
    """
    try:
        with open(self.yaml_file, "r") as handle:
            content = yaml.safe_load(handle)
        # Remember whether these queries need a snapshot table substituted in.
        self.snapshot_type = content.get("snapshot_type", None)
        return content.get("queries", [])
    except Exception as exc:
        # Best-effort loader: report the problem and fall back to no queries.
        print(f"Error loading YAML file: {exc}")
        return []
3642

43+
def select_snapshot_file(self, snapshot_dir):
    """Interactively pick a snapshot JSON file matching self.snapshot_type.

    Args:
        snapshot_dir: Directory containing the active tenancy's snapshot files.

    Returns:
        The selected file path, or None when no snapshot_type is set, the
        type is unknown, no files match, or the user cancels the prompt.
    """
    if not self.snapshot_type:
        return None

    # Determine file pattern based on snapshot type.
    if self.snapshot_type == "audit":
        pattern = os.path.join(snapshot_dir, "audit_events_*_*.json")
    elif self.snapshot_type == "cloudguard":
        pattern = os.path.join(snapshot_dir, "cloudguard_problems_*_*.json")
    else:
        print(f"Unknown snapshot type: {self.snapshot_type}")
        return None

    # Find matching files.
    files = glob.glob(pattern)
    if not files:
        print(f"No {self.snapshot_type} snapshot files found in {snapshot_dir}")
        return None

    # Newest snapshots first; label each choice with its filename and size.
    # (Fixes the choice label, which previously did not show the filename.)
    file_choices = []
    for file_path in sorted(files, key=os.path.getmtime, reverse=True):
        filename = os.path.basename(file_path)
        file_size = self._format_file_size(os.stat(file_path).st_size)
        file_choices.append({
            'name': f"{filename} ({file_size})",
            'value': file_path
        })

    # questionary returns the chosen entry's 'value' (the full path),
    # or None when the user cancels.
    return questionary.select(
        f"Select a {self.snapshot_type} snapshot file for queries:",
        choices=file_choices
    ).ask()
84+
85+
def _format_file_size(self, size_bytes):
86+
"""Format file size in human readable format."""
87+
if size_bytes == 0:
88+
return "0 B"
89+
size_names = ["B", "KB", "MB", "GB"]
90+
import math
91+
i = int(math.floor(math.log(size_bytes, 1024)))
92+
p = math.pow(1024, i)
93+
s = round(size_bytes / p, 1)
94+
return f"{s} {size_names[i]}"
95+
96+
def set_snapshot_table(self, table_name):
    """Remember which DuckDB table should replace the {snapshot_data} placeholder in queries."""
    self.snapshot_table = table_name
99+
37100
def select_queries(self):
38101
"""Displays a list of query descriptions, allowing multiple selections, and pushes each item separately onto FIFO queue."""
39102
if not self.queries:
@@ -52,9 +115,15 @@ def select_queries(self):
52115
for query in self.queries:
53116
if query["description"] == choice:
54117
self.query_queue.put(("Description", query["description"]))
55-
self.query_queue.put(("SQL", query["sql"]))
118+
119+
# Substitute snapshot_table in SQL if needed
120+
sql = query["sql"]
121+
if self.snapshot_table and "{snapshot_data}" in sql:
122+
sql = sql.replace("{snapshot_data}", self.snapshot_table)
123+
124+
self.query_queue.put(("SQL", sql))
56125
if query.get("filter") != None:
57-
self.query_queue.put(("Filter", query.get("filter", "None"))) # Return filter as-is
126+
self.query_queue.put(("Filter", query.get("filter", "None")))
58127
break # Stop after adding matching query
59128

60129
def dequeue_item(self):
@@ -63,6 +132,4 @@ def dequeue_item(self):
63132
return self.query_queue.get()
64133
else:
65134
print("Queue is empty.")
66-
return None
67-
68-
135+
return None
Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,26 @@
1-
# Replace the table names for audit logs and cloudguard events
1+
snapshot_type: audit
2+
23
queries:
34
- description: "[FORENSIC]: Fetch distinct set of event types from the fetched audit logs window."
4-
sql: "SELECT DISTINCT event_type, source, data_event_name, data_compartment_name, data_identity_principal_name FROM audit_events_15042025_10"
5+
sql: "SELECT DISTINCT event_type, source, data_event_name, data_compartment_name, data_identity_principal_name FROM {snapshot_data}"
56

67
- description: "[FORENSIC] Get all the event_types etc and order them by principal_name for IdentityControlPlane"
7-
sql: "SELECT data_identity_principal_name, data_identity_ip_address, event_type, source, data_compartment_name, data_event_name FROM audit_events_15042025_10 where source = 'IdentityControlPlane' GROUP BY data_identity_principal_name, data_identity_ip_address, event_type, source, data_compartment_name, data_event_name ORDER BY data_identity_principal_name"
8+
sql: "SELECT data_identity_principal_name, data_identity_ip_address, event_type, source, data_compartment_name, data_event_name FROM {snapshot_data} where source = 'IdentityControlPlane' GROUP BY data_identity_principal_name, data_identity_ip_address, event_type, source, data_compartment_name, data_event_name ORDER BY data_identity_principal_name"
89

910
- description: "[FORENSIC] Get all the event_types etc and order them by principal_name for ConsoleSignIn"
10-
sql: "SELECT data_identity_principal_name, data_identity_ip_address, event_type, source, data_compartment_name, data_event_name FROM audit_events_15042025_10 where source = 'IdentitySignOn' GROUP BY data_identity_principal_name, data_identity_ip_address, event_type, source, data_compartment_name, data_event_name ORDER BY data_identity_principal_name"
11+
sql: "SELECT data_identity_principal_name, data_identity_ip_address, event_type, source, data_compartment_name, data_event_name FROM {snapshot_data} where source = 'IdentitySignOn' GROUP BY data_identity_principal_name, data_identity_ip_address, event_type, source, data_compartment_name, data_event_name ORDER BY data_identity_principal_name"
12+
13+
- description: "[FORENSIC] Find all administrative actions in the last period"
14+
sql: "SELECT event_time, data_event_name, data_identity_principal_name, data_resource_name FROM {snapshot_data} WHERE data_event_name LIKE '%Admin%' OR data_event_name LIKE '%Create%' OR data_event_name LIKE '%Delete%' OR data_event_name LIKE '%Update%' ORDER BY event_time DESC"
15+
16+
- description: "[FORENSIC] Show all unique users who performed actions"
17+
sql: "SELECT DISTINCT data_identity_principal_name, COUNT(*) as action_count FROM {snapshot_data} GROUP BY data_identity_principal_name ORDER BY action_count DESC"
18+
19+
- description: "[FORENSIC] Find all failed authentication attempts"
20+
sql: "SELECT event_time, data_identity_principal_name, data_event_name, data_response_response_time FROM {snapshot_data} WHERE data_event_name LIKE '%Failed%' OR data_response_status != 'SUCCEEDED' ORDER BY event_time DESC"
21+
22+
- description: "[FORENSIC] Show resource deletions"
23+
sql: "SELECT event_time, data_user_name, data_resource_name, data_event_name FROM {snapshot_data} WHERE data_event_name LIKE '%Delete%' ORDER BY event_time DESC"
24+
25+
- description: "[FORENSIC] Find policy changes"
26+
sql: "SELECT event_time, data_user_name, data_resource_name, data_event_name FROM {snapshot_data} WHERE event_type = 'Policy' OR event_type LIKE '%Policy%' ORDER BY event_time DESC"
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,40 @@
1-
# Replace the table names for audit logs and cloudguard events
1+
# queries/FORENSIC_CloudGuard.yaml
2+
snapshot_type: cloudguard
3+
24
queries:
35
- description: "[FORENSIC] Get all the CG problems sorted by resource_name"
4-
sql: "select resource_name, detector_rule_id, risk_level, labels, time_first_detected, time_last_detected, lifecycle_state, lifecycle_detail, detector_id from cloudguard_problems_10052025_12 ORDER BY resource_name"
6+
sql: >
7+
SELECT resource_name, detector_rule_id, risk_level, labels, time_first_detected, time_last_detected, lifecycle_state, lifecycle_detail, detector_id
8+
FROM {snapshot_data}
9+
ORDER BY resource_name
10+
11+
- description: "[FORENSIC] Show all high-risk Cloud Guard problems"
12+
sql: >
13+
SELECT resource_name, detector_rule_id, risk_level, labels, time_first_detected, time_last_detected, lifecycle_state
14+
FROM {snapshot_data}
15+
WHERE risk_level = 'HIGH'
16+
ORDER BY time_last_detected DESC
17+
18+
- description: "[FORENSIC] Find problems by detector type"
19+
sql: >
20+
SELECT detector_id, COUNT(*) as problem_count
21+
FROM {snapshot_data}
22+
GROUP BY detector_id
23+
ORDER BY problem_count DESC
24+
25+
- description: "[FORENSIC] Show active problems (not resolved)"
26+
sql: >
27+
SELECT resource_name, detector_rule_id, risk_level, lifecycle_state, lifecycle_detail
28+
FROM {snapshot_data}
29+
WHERE lifecycle_state != 'RESOLVED'
30+
ORDER BY time_last_detected DESC
31+
32+
- description: "[FORENSIC] Find problems in specific compartments"
33+
sql: >
34+
SELECT ic.name as compartment_name, ic.path as compartment_path, COUNT(*) as problem_count
35+
FROM {snapshot_data} cp
36+
LEFT JOIN identity_compartments ic
37+
ON cp.compartment_id = ic.id
38+
GROUP BY cp.compartment_id, ic.name, ic.path
39+
ORDER BY problem_count DESC
40+

0 commit comments

Comments
 (0)