
Commit 5cefcd3

add dashboard interface

1 parent 8e00573 commit 5cefcd3

File tree

9 files changed: +461, -0 lines changed

.gradio/certificate.pem

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
-----BEGIN CERTIFICATE-----
MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
-----END CERTIFICATE-----
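
The PEM above decodes to the ISRG Root X1 CA certificate (the subject strings are visible in the base64), apparently written by Gradio under .gradio/ when a share link is created. For a quick programmatic look at a certificate like this, a minimal sketch, assuming the third-party cryptography package is installed (nothing in this commit actually depends on it):

from cryptography import x509

# Load the PEM committed above and print its subject and expiry.
with open(".gradio/certificate.pem", "rb") as f:
    cert = x509.load_pem_x509_certificate(f.read())

print(cert.subject.rfc4514_string())  # expected: CN=ISRG Root X1,O=Internet Security Research Group,C=US
print(cert.not_valid_after)           # validity end date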
Binary file not shown.
Binary file not shown.

dashboard_test/clustering.py

Lines changed: 218 additions & 0 deletions
@@ -0,0 +1,218 @@
import chromadb
from chromadb.config import Settings
import pandas as pd
from sklearn.cluster import AgglomerativeClustering
# hardware_configs is assumed to be defined in the local dashboard module,
# since create_traceback_clusters() references it below; the original file
# used the name without importing it anywhere.
from dashboard import create_benchmark_status_df, hardware_configs
from sentence_transformers import SentenceTransformer
import gradio as gr
import plotly.express as px
import pickle
import os
from typing import Dict, List, Tuple, Optional
import logging
import numpy as np

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def get_embeddings(texts: List[str]) -> np.ndarray:
    """Generate normalized embeddings for the given texts using the BAAI/bge-large-zh-v1.5 model"""
    logger.info("Generating embeddings...")
    model = SentenceTransformer('BAAI/bge-large-zh-v1.5')
    return model.encode(texts, normalize_embeddings=True)

def load_cached_data() -> Tuple[Dict, chromadb.Collection]:
    """Load the pickle cache and the persistent ChromaDB collection"""
    cache_file = "traceback_cache.pkl"

    # Initialize ChromaDB with specific settings
    chroma_client = chromadb.PersistentClient(
        path="./chroma_traceback_db",
        settings=Settings(anonymized_telemetry=False)
    )
    collection = chroma_client.get_or_create_collection(
        name="traceback_clusters",
        metadata={"hnsw:space": "cosine"}
    )

    if os.path.exists(cache_file):
        with open(cache_file, 'rb') as f:
            cache = pickle.load(f)
    else:
        cache = {'processed_tracebacks': set()}

    return cache, collection

def save_cached_data(cache: Dict):
    """Save the cache to disk"""
    with open("traceback_cache.pkl", 'wb') as f:
        pickle.dump(cache, f)

def create_traceback_clusters(pool: bool = True, refresh: bool = True) -> Optional[pd.DataFrame]:
    """Create and maintain clusters of tracebacks"""

    if refresh:
        # Delete all data from the ChromaDB collection and the cache
        logger.info("Refreshing all data...")
        chroma_client = chromadb.PersistentClient(path="./chroma_traceback_db")
        try:
            chroma_client.delete_collection("traceback_clusters")
        except Exception:
            # Nothing to delete on a fresh database
            logger.info("No existing collection to delete")

        # Delete the cache file if it exists
        if os.path.exists("traceback_cache.pkl"):
            os.remove("traceback_cache.pkl")
            logger.info("Deleted cache file")

        pool = True

    # Load cached data
    cache, collection = load_cached_data()

    if pool:
        # Get benchmark data
        df_benchmark_status = create_benchmark_status_df(hardware_configs)

        # Extract tracebacks that are not empty
        current_tracebacks = set(
            df_benchmark_status[df_benchmark_status['Traceback'] != '']['Traceback'].tolist()
        )

        # Find tracebacks that have not been processed yet
        new_tracebacks = current_tracebacks - cache['processed_tracebacks']

        if new_tracebacks:
            logger.info(f"Found {len(new_tracebacks)} new tracebacks to process")

            # Generate embeddings for the new tracebacks
            new_tracebacks_list = list(new_tracebacks)
            new_embeddings = get_embeddings(new_tracebacks_list)

            # Assign sequential string ids continuing from the cached count
            start_id = len(cache['processed_tracebacks'])
            new_ids = [str(i) for i in range(start_id, start_id + len(new_tracebacks_list))]

            # Log the data being added
            logger.info(f"Adding {len(new_tracebacks_list)} documents with embeddings shape {new_embeddings.shape}")

            try:
                collection.add(
                    documents=new_tracebacks_list,
                    embeddings=new_embeddings.tolist(),
                    ids=new_ids
                )
                logger.info("Successfully added documents to collection")
            except Exception as e:
                logger.error(f"Error adding documents to collection: {e}")
                return None

            # Update cache
            cache['processed_tracebacks'].update(new_tracebacks)
            save_cached_data(cache)

    # Get all embeddings from ChromaDB
    all_results = collection.get(include=['embeddings', 'documents'])
    logger.info(f"Retrieved {len(all_results['ids'])} documents from collection")

    # Recent ChromaDB versions return embeddings as a NumPy array, whose truth
    # value is ambiguous, so test explicitly for None or empty
    if all_results['embeddings'] is None or len(all_results['embeddings']) == 0:
        logger.error("No embeddings found in collection")
        return None

    all_embeddings = np.array(all_results['embeddings'], dtype=np.float32)
    all_tracebacks = all_results['documents']

    # Check for NaN values and drop the corresponding entries
    if np.isnan(all_embeddings).any():
        logger.warning("Found NaN values in embeddings, removing corresponding entries")
        valid_mask = ~np.isnan(all_embeddings).any(axis=1)
        all_embeddings = all_embeddings[valid_mask]
        all_tracebacks = [t for i, t in enumerate(all_tracebacks) if valid_mask[i]]

    if len(all_embeddings) == 0:
        raise ValueError("No valid embeddings found after filtering NaN values")

    # Perform hierarchical clustering; with n_clusters=None the dendrogram is
    # cut at distance_threshold, so the number of clusters is data-driven
    logger.info("Performing clustering...")
    clustering = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=0.5,
        linkage='ward'
    )

    clusters = clustering.fit_predict(all_embeddings)

    # Create cluster summary
    cluster_summary = pd.DataFrame({
        'Traceback': all_tracebacks,
        'Cluster': clusters
    })

    # Group by cluster: keep the first traceback as the representative and
    # count the members of each cluster
    cluster_representatives = cluster_summary.groupby('Cluster').agg(
        Traceback=('Traceback', 'first'),
        Count=('Traceback', 'size')
    ).reset_index()

    return cluster_representatives

def create_cluster_pie_chart(cluster_representatives):
    """Pie chart of the share of errors falling into each cluster"""
    fig = px.pie(
        cluster_representatives,
        values='Count',
        names='Cluster',
        title='Distribution of Error Clusters'
    )
    return fig

def create_cluster_bar_chart(cluster_representatives):
    """Bar chart of absolute cluster sizes"""
    fig = px.bar(
        cluster_representatives,
        x='Cluster',
        y='Count',
        title='Error Cluster Sizes'
    )
    return fig

def display_cluster_details(cluster_representatives):
    """Build a markdown report with a truncated representative traceback per cluster"""
    markdown_text = "# Detailed Cluster Analysis\n\n"
    for _, row in cluster_representatives.iterrows():
        markdown_text += f"## Cluster {row['Cluster']}\n"
        markdown_text += f"**Count:** {row['Count']} occurrences\n\n"
        markdown_text += "**Representative Traceback:**\n```\n"
        markdown_text += (row['Traceback'][:200] + "..." if len(row['Traceback']) > 200 else row['Traceback'])
        markdown_text += "\n```\n\n---\n\n"
    return markdown_text

def main():
    try:
        # Run clustering
        logger.info("Starting traceback clustering analysis...")
        cluster_results = create_traceback_clusters()

        if cluster_results is None:
            return

        # Create the Gradio interface
        with gr.Blocks() as demo:
            gr.Markdown("# Traceback Clustering Analysis")

            with gr.Row():
                with gr.Column():
                    gr.Markdown("## Distribution of Error Clusters")
                    gr.Plot(create_cluster_pie_chart(cluster_results))

                with gr.Column():
                    gr.Markdown("## Error Cluster Sizes")
                    gr.Plot(create_cluster_bar_chart(cluster_results))

            with gr.Row():
                gr.Markdown(display_cluster_details(cluster_results))

        # Launch the interface
        demo.launch()

    except Exception as e:
        logger.error(f"Error running clustering: {e}")

if __name__ == "__main__":
    main()
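
Note that clustering.py imports create_benchmark_status_df and hardware_configs from a dashboard module whose relevant parts are not visible in this diff. To exercise the script standalone, a stub along the following lines would do; the column set, sample rows, and hardware_configs value are hypothetical, and only the 'Traceback' column is actually consumed by create_traceback_clusters():

# dashboard.py -- hypothetical stand-in for the real module, which
# presumably builds its status DataFrame from live benchmark results.
import pandas as pd

# Placeholder; the real module defines the actual hardware configurations.
hardware_configs = {}

def create_benchmark_status_df(hardware_configs):
    """Return a DataFrame with a 'Traceback' column (empty string = no error)."""
    return pd.DataFrame({
        "Benchmark": ["model-a", "model-b", "model-c"],
        "Traceback": [
            "RuntimeError: CUDA out of memory",
            "RuntimeError: CUDA out of memory",
            "",  # passing runs carry an empty traceback and are filtered out
        ],
    })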
