huggingface
diff --git a/‎.gradio/certificate.pem
Lines changed: 31 additions & 0 deletions b/‎.gradio/certificate.pem
Lines changed: 31 additions & 0 deletions
diff --git a/‎dashboard_test/chroma_traceback_db/chroma.sqlite3
70.4 MB b/‎dashboard_test/chroma_traceback_db/chroma.sqlite3
70.4 MB
diff --git a/‎dashboard_test/chroma_traceback_db/dc18c944-8dba-4640-9e25-da9910a45a1c/data_level0.bin
8.08 MB b/‎dashboard_test/chroma_traceback_db/dc18c944-8dba-4640-9e25-da9910a45a1c/data_level0.bin
8.08 MB
diff --git a/‎dashboard_test/chroma_traceback_db/dc18c944-8dba-4640-9e25-da9910a45a1c/header.bin
100 Bytes b/‎dashboard_test/chroma_traceback_db/dc18c944-8dba-4640-9e25-da9910a45a1c/header.bin
100 Bytes
diff --git a/‎dashboard_test/chroma_traceback_db/dc18c944-8dba-4640-9e25-da9910a45a1c/index_metadata.pickle
47.7 KB b/‎dashboard_test/chroma_traceback_db/dc18c944-8dba-4640-9e25-da9910a45a1c/index_metadata.pickle
47.7 KB
diff --git a/‎dashboard_test/chroma_traceback_db/dc18c944-8dba-4640-9e25-da9910a45a1c/length.bin
7.81 KB b/‎dashboard_test/chroma_traceback_db/dc18c944-8dba-4640-9e25-da9910a45a1c/length.bin
7.81 KB
diff --git a/‎dashboard_test/chroma_traceback_db/dc18c944-8dba-4640-9e25-da9910a45a1c/link_lists.bin
16.6 KB b/‎dashboard_test/chroma_traceback_db/dc18c944-8dba-4640-9e25-da9910a45a1c/link_lists.bin
16.6 KB
diff --git a/‎dashboard_test/clustering.py
Lines changed: 218 additions & 0 deletions b/‎dashboard_test/clustering.py
Lines changed: 218 additions & 0 deletions
@@ -0,0 +1,31 @@
+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----
@@ -0,0 +1,218 @@
+import chromadb
+from chromadb.config import Settings
+import pandas as pd
+from sklearn.cluster import AgglomerativeClustering
+from dashboard import create_benchmark_status_df
+from sentence_transformers import SentenceTransformer
+import gradio as gr
+import plotly.express as px
+from tqdm import tqdm
+import pickle
+import os
+from typing import Dict, List, Tuple, Optional
+import logging
+import numpy as np
+
+# Configure the root logger at INFO level (an import-time side effect) and
+# create the module-level logger used by the functions in this module.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+def get_embeddings(texts: List[str]) -> np.ndarray:
+    """Encode ``texts`` with the BAAI/bge-large-zh-v1.5 sentence transformer.
+
+    A new ``SentenceTransformer`` is constructed on every call, and the
+    inputs are encoded with ``normalize_embeddings=True``.
+
+    Args:
+        texts: Strings to embed.
+
+    Returns:
+        The result of ``SentenceTransformer.encode`` for ``texts``.
+    """
+    logger.info("Generating embeddings...")
+    encoder = SentenceTransformer('BAAI/bge-large-zh-v1.5')
+    vectors = encoder.encode(texts, normalize_embeddings=True)
+    return vectors
+
+def load_cached_data() -> Tuple[Dict, chromadb.Collection]:
+    """Open the traceback collection and read the pickle cache from disk.
+
+    The ChromaDB client is opened on ``./chroma_traceback_db`` with telemetry
+    disabled, and the ``traceback_clusters`` collection is fetched or created
+    with cosine as its HNSW space.
+
+    Returns:
+        ``(cache, collection)``. ``cache`` is the object unpickled from
+        ``traceback_cache.pkl`` when that file exists; otherwise it is a
+        fresh dict whose ``'processed_tracebacks'`` entry is an empty set.
+    """
+    cache_path = "traceback_cache.pkl"
+
+    # The collection is opened before the cache file is touched.
+    client = chromadb.PersistentClient(
+        path="./chroma_traceback_db",
+        settings=Settings(anonymized_telemetry=False)
+    )
+    traceback_collection = client.get_or_create_collection(
+        name="traceback_clusters",
+        metadata={"hnsw:space": "cosine"}
+    )
+
+    # No cache file yet: start with an empty set of processed tracebacks.
+    if not os.path.exists(cache_path):
+        return {'processed_tracebacks': set()}, traceback_collection
+
+    with open(cache_path, 'rb') as handle:
+        stored_cache = pickle.load(handle)
+    return stored_cache, traceback_collection
+
+def save_cached_data(cache: Dict):
+    """Pickle ``cache`` into ``traceback_cache.pkl``, replacing any old file.
+
+    Args:
+        cache: The cache object to persist.
+    """
+    with open("traceback_cache.pkl", 'wb') as handle:
+        handle.write(pickle.dumps(cache))
+
+def _reset_traceback_store() -> None:
+    """Drop the ChromaDB collection and the pickle cache so a run starts empty."""
+    # Use the same Settings as load_cached_data(): chromadb rejects a second
+    # client on the same path when its settings differ from the first one.
+    chroma_client = chromadb.PersistentClient(
+        path="./chroma_traceback_db",
+        settings=Settings(anonymized_telemetry=False)
+    )
+    try:
+        chroma_client.delete_collection("traceback_clusters")
+    except Exception as e:
+        # delete_collection raises when the collection does not exist yet
+        # (e.g. on the very first run); the exception type differs between
+        # chromadb releases, so it is caught broadly and logged.
+        logger.info(f"Collection not deleted (it may not exist yet): {str(e)}")
+
+    # Delete cache file if it exists
+    if os.path.exists("traceback_cache.pkl"):
+        os.remove("traceback_cache.pkl")
+        logger.info("Deleted cache file")
+
+
+def _summarize_clusters(tracebacks, labels) -> pd.DataFrame:
+    """Return one row per cluster: label, first member traceback, member count."""
+    members = pd.DataFrame({
+        'Traceback': tracebacks,
+        'Cluster': labels
+    })
+    # Named aggregation: both output columns are derived from 'Traceback'.
+    # (A dict-style agg keyed on 'Count' fails because no such input column
+    # exists.)
+    return members.groupby('Cluster').agg(
+        Traceback=('Traceback', lambda x: x.iloc[0]),
+        Count=('Traceback', 'size')
+    ).reset_index()
+
+
+def create_traceback_clusters(pool: bool = True, refresh: bool = True,
+                              hardware_configs=None,
+                              distance_threshold: float = 0.5) -> Optional[pd.DataFrame]:
+    """Create and maintain clusters of tracebacks.
+
+    Args:
+        pool: When true, fetch the benchmark status table, embed every
+            traceback not yet recorded in the cache, and add it to the
+            ChromaDB collection.
+        refresh: When true, delete the collection and the cache file first.
+            This forces ``pool`` to true so the store is repopulated.
+        hardware_configs: Passed unchanged to ``create_benchmark_status_df``.
+            When omitted, a module-level ``hardware_configs`` is used if one
+            is defined.
+        distance_threshold: Linkage distance threshold handed to
+            ``AgglomerativeClustering`` (ward linkage).
+
+    Returns:
+        A DataFrame with columns ``Cluster``, ``Traceback`` (the first member
+        of each cluster) and ``Count`` (cluster size). ``None`` is returned
+        when no hardware configuration is available, when adding documents
+        to the collection fails, or when the collection holds no embeddings.
+
+    Raises:
+        ValueError: If every stored embedding contains NaN values.
+    """
+    if refresh:
+        # An emptied store has to be repopulated.
+        pool = True
+
+    # Resolve the benchmark configuration before anything is deleted, so a
+    # missing configuration cannot wipe existing data for nothing.
+    if pool and hardware_configs is None:
+        hardware_configs = globals().get('hardware_configs')
+        if hardware_configs is None:
+            logger.error(
+                "No hardware_configs available: pass hardware_configs= "
+                "or define it at module level"
+            )
+            return None
+
+    if refresh:
+        # Delete all data from ChromaDB collection and cache
+        logger.info("Refreshing all data...")
+        _reset_traceback_store()
+
+    # Load cached data
+    cache, collection = load_cached_data()
+
+    if pool:
+        # Get benchmark data
+        df_benchmark_status = create_benchmark_status_df(hardware_configs)
+
+        # Keep tracebacks that are neither missing nor empty
+        traceback_column = df_benchmark_status['Traceback']
+        has_traceback = traceback_column.notna() & (traceback_column != '')
+        current_tracebacks = set(traceback_column[has_traceback].tolist())
+
+        # Find new tracebacks
+        new_tracebacks = current_tracebacks - cache['processed_tracebacks']
+
+        if new_tracebacks:
+            logger.info(f"Found {len(new_tracebacks)} new tracebacks to process")
+
+            # Generate embeddings for new tracebacks
+            new_tracebacks_list = list(new_tracebacks)
+            new_embeddings = get_embeddings(new_tracebacks_list)
+
+            # Ids continue from the number of tracebacks already processed
+            start_id = len(cache['processed_tracebacks'])
+            new_ids = [str(i) for i in range(start_id, start_id + len(new_tracebacks_list))]
+
+            # Log the data being added
+            logger.info(f"Adding {len(new_tracebacks_list)} documents with embeddings shape {new_embeddings.shape}")
+
+            try:
+                collection.add(
+                    documents=new_tracebacks_list,
+                    embeddings=new_embeddings.tolist(),
+                    ids=new_ids
+                )
+                logger.info("Successfully added documents to collection")
+            except Exception as e:
+                logger.error(f"Error adding documents to collection: {str(e)}")
+                return None
+
+            # Update cache only after the documents were stored
+            cache['processed_tracebacks'].update(new_tracebacks)
+            save_cached_data(cache)
+
+    # Get all embeddings from ChromaDB
+    all_results = collection.get(include=['embeddings', 'documents'])
+    logger.info(f"Retrieved {len(all_results['ids'])} documents from collection")
+
+    # Test explicitly for None/empty: the embeddings may be a NumPy array,
+    # whose truth value is ambiguous.
+    stored_embeddings = all_results['embeddings']
+    if stored_embeddings is None or len(stored_embeddings) == 0:
+        logger.error("No embeddings found in collection")
+        return None
+
+    all_embeddings = np.array(stored_embeddings, dtype=np.float32)
+    all_tracebacks = all_results['documents']
+
+    # Check for NaN values and handle them
+    if np.isnan(all_embeddings).any():
+        logger.warning("Found NaN values in embeddings, removing corresponding entries")
+        valid_mask = ~np.isnan(all_embeddings).any(axis=1)
+        all_embeddings = all_embeddings[valid_mask]
+        all_tracebacks = [t for t, keep in zip(all_tracebacks, valid_mask) if keep]
+
+    if len(all_embeddings) == 0:
+        raise ValueError("No valid embeddings found after filtering NaN values")
+
+    # Perform hierarchical clustering
+    logger.info("Performing clustering...")
+    if len(all_embeddings) < 2:
+        # AgglomerativeClustering needs at least two samples; a lone
+        # traceback forms its own cluster.
+        clusters = np.zeros(len(all_embeddings), dtype=int)
+    else:
+        clustering = AgglomerativeClustering(
+            n_clusters=None,
+            distance_threshold=distance_threshold,
+            linkage='ward'
+        )
+        clusters = clustering.fit_predict(all_embeddings)
+
+    # Group by cluster and get representative traceback
+    return _summarize_clusters(all_tracebacks, clusters)
+
+
+def create_cluster_pie_chart(cluster_representatives):
+    """Build a Plotly Express pie chart of the cluster summary.
+
+    Slice names come from the ``Cluster`` column and slice values from the
+    ``Count`` column of ``cluster_representatives``.
+
+    Returns:
+        The figure returned by ``px.pie``.
+    """
+    pie_options = {
+        'values': 'Count',
+        'names': 'Cluster',
+        'title': 'Distribution of Error Clusters',
+    }
+    return px.pie(cluster_representatives, **pie_options)
+
+def create_cluster_bar_chart(cluster_representatives):
+    """Build a Plotly Express bar chart of the cluster summary.
+
+    The ``Cluster`` column is plotted on the x axis and the ``Count`` column
+    on the y axis.
+
+    Returns:
+        The figure returned by ``px.bar``.
+    """
+    bar_options = {
+        'x': 'Cluster',
+        'y': 'Count',
+        'title': 'Error Cluster Sizes',
+    }
+    return px.bar(cluster_representatives, **bar_options)
+
+def display_cluster_details(cluster_representatives):
+    """Render the cluster summary as a Markdown report.
+
+    Each row of ``cluster_representatives`` becomes one section showing its
+    ``Cluster`` label, its ``Count`` and its ``Traceback`` inside a fenced
+    block. Tracebacks longer than 200 characters are cut to the first 200
+    and suffixed with ``...``.
+
+    Returns:
+        The Markdown text; only the heading when there are no rows.
+    """
+    sections = ["# Detailed Cluster Analysis\n\n"]
+    for _, entry in cluster_representatives.iterrows():
+        traceback_text = entry['Traceback']
+        if len(traceback_text) > 200:
+            snippet = traceback_text[:200] + "..."
+        else:
+            snippet = traceback_text
+        sections.append(f"## Cluster {entry['Cluster']}\n")
+        sections.append(f"**Count:** {entry['Count']} occurrences\n\n")
+        sections.append("**Representative Traceback:**\n```\n")
+        sections.append(snippet)
+        sections.append("\n```\n\n---\n\n")
+    return "".join(sections)
+
+def main():
+    """Cluster the tracebacks and serve the result as a Gradio dashboard.
+
+    The page shows a pie chart and a bar chart of cluster sizes side by side,
+    followed by the Markdown cluster report. Nothing is launched when
+    clustering returns ``None``. Any exception is logged, including its stack
+    trace, and is not re-raised.
+    """
+    try:
+        # Run clustering
+        logger.info("Starting traceback clustering analysis...")
+        cluster_results = create_traceback_clusters()
+
+        if cluster_results is None:
+            return
+
+        # Create Gradio interface
+        with gr.Blocks() as demo:
+            gr.Markdown("# Traceback Clustering Analysis")
+
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown("## Distribution of Error Clusters")
+                    gr.Plot(create_cluster_pie_chart(cluster_results))
+
+                with gr.Column():
+                    gr.Markdown("## Error Cluster Sizes")
+                    gr.Plot(create_cluster_bar_chart(cluster_results))
+
+            with gr.Row():
+                gr.Markdown(display_cluster_details(cluster_results))
+
+        # Launch the interface
+        demo.launch()
+
+    except Exception as e:
+        # logger.exception records the stack trace as well as the message,
+        # which the plain error log dropped.
+        logger.exception(f"Error running clustering: {str(e)}")
+
+# Run the clustering dashboard only when this file is executed as a script.
+if __name__ == "__main__":
+    main()