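"""Gradio dashboard for monitoring llm-perf benchmark runs.

Checks which per-configuration dataset repos exist on the Hugging Face Hub,
parses their benchmark.json reports, and displays status and failure
statistics across several tabs.
"""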
import json
from glob import glob

import gradio as gr
import pandas as pd
from huggingface_hub import repo_exists, snapshot_download
from huggingface_hub.errors import RepositoryNotFoundError
from optimum_benchmark import Benchmark

from llm_perf.common.hardware_config import load_hardware_configs
from llm_perf.update_llm_perf_leaderboard import patch_json

PERF_REPO_ID = "optimum-benchmark/llm-perf-{backend}-{hardware}-{subset}-{machine}"
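# Each placeholder is filled from llm_perf/hardware.yaml, yielding a dataset
# repo id like "optimum-benchmark/llm-perf-pytorch-cuda-unquantized-1xA10"
# (illustrative values, not taken from the actual config).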

def create_status_df():
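    """Return a DataFrame with one row per (backend, hardware, subset, machine)
    combination, marking whether its dataset repo exists on the Hub."""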
    hardware_configs = load_hardware_configs("llm_perf/hardware.yaml")

    rows = []
    for hardware_config in hardware_configs:
        for subset in hardware_config.subsets:
            for backend in hardware_config.backends:
                repo_id = PERF_REPO_ID.format(
                    subset=subset,
                    machine=hardware_config.machine,
                    backend=backend,
                    hardware=hardware_config.hardware,
                )

                # A missing dataset repo means no results were ever pushed
                # for this configuration.
                exists = repo_exists(repo_id, repo_type="dataset")
                status = "✅" if exists else "⛔️"

                rows.append({
                    "Backend": backend,
                    "Hardware": hardware_config.hardware,
                    "Subset": subset,
                    "Machine": hardware_config.machine,
                    "Status": status,
                })

    df = pd.DataFrame(rows)
    return df

def create_benchmark_status_df():
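    """Return a DataFrame with one row per benchmark run found in each existing
    per-configuration dataset repo, including model, experiment, and traceback."""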
    hardware_configs = load_hardware_configs("llm_perf/hardware.yaml")

    rows = []
    for hardware_config in hardware_configs:
        for subset in hardware_config.subsets:
            for backend in hardware_config.backends:
                repo_id = PERF_REPO_ID.format(
                    subset=subset,
                    machine=hardware_config.machine,
                    backend=backend,
                    hardware=hardware_config.hardware,
                )

                try:
                    # Only fetch the benchmark.json reports, not the full repo
                    snapshot = snapshot_download(
                        repo_type="dataset",
                        repo_id=repo_id,
                        allow_patterns=["**/benchmark.json"],
                    )
                except RepositoryNotFoundError:
                    print(f"Repository {repo_id} not found")
                    continue

                for file in glob(f"{snapshot}/**/benchmark.json", recursive=True):
                    # Normalize legacy report files before parsing
                    patch_json(file)

                    benchmark = Benchmark.from_json(file)
                    df = benchmark.to_dataframe()

                    for _, row in df.iterrows():
                        # A non-empty traceback marks a failed run
                        traceback = row["report.traceback"] if "report.traceback" in row else ""
                        rows.append({
                            "Backend": backend,
                            "Hardware": hardware_config.hardware,
                            "Subset": subset,
                            "Machine": hardware_config.machine,
                            "Status": "✅" if traceback == "" else "⛔️",
                            "Model": row["config.backend.model"],
                            "Experiment": row["config.name"],
                            "Traceback": traceback,
                            "Full Data": json.dumps(row.to_dict()),
                        })

    df = pd.DataFrame(rows)
    return df

def create_status_table():
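    """Render the repo-existence status table as a Gradio DataFrame."""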
    df = create_status_df()
    return gr.DataFrame(
        value=df,
        headers=["Backend", "Hardware", "Subset", "Machine", "Status"],
        row_count=(len(df), "fixed"),
        col_count=(5, "fixed"),
        wrap=True,
    )

def create_benchmark_table(df_benchmark_status):
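    """Render the per-run benchmark status table as a Gradio DataFrame."""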
    return gr.DataFrame(
        value=df_benchmark_status,
        headers=["Backend", "Hardware", "Subset", "Machine", "Status", "Model", "Experiment", "Traceback", "Full Data"],
        row_count=(len(df_benchmark_status), "fixed"),
        col_count=(9, "fixed"),
        column_widths=[100, 100, 100, 100, 100, 200, 100, 100, 100],
    )

def compute_machine_stats(df_benchmark_status):
    """
    Compute statistics about failed benchmarks per machine.

    Args:
        df_benchmark_status (pd.DataFrame): DataFrame containing benchmark status information
    Returns:
        gr.DataFrame: Gradio DataFrame with machine failure statistics
    """
    # Aggregate total and failed runs per machine
    stats_by_machine = df_benchmark_status.groupby(["Machine"]).agg(
        Total_Benchmarks=("Status", "count"),
        Failed_Benchmarks=("Status", lambda x: (x == "⛔️").sum()),
    ).reset_index()

    stats_by_machine["Success_Rate"] = (
        (stats_by_machine["Total_Benchmarks"] - stats_by_machine["Failed_Benchmarks"])
        / stats_by_machine["Total_Benchmarks"] * 100
    ).round(2)
    stats_by_machine["Success_Rate"] = stats_by_machine["Success_Rate"].astype(str) + "%"

    machine_stats = gr.DataFrame(
        value=stats_by_machine,
        headers=["Machine", "Total_Benchmarks", "Failed_Benchmarks", "Success_Rate"],
        row_count=(len(stats_by_machine), "fixed"),
        col_count=(4, "fixed"),
        wrap=True,
    )

    return machine_stats

def compute_config_stats(df_benchmark_status):
    """
    Compute statistics about failed benchmarks per configuration.

    Args:
        df_benchmark_status (pd.DataFrame): DataFrame containing benchmark status information
    Returns:
        gr.DataFrame: Gradio DataFrame with configuration failure statistics
    """
    # Aggregate total and failed runs per (backend, hardware, subset, machine)
    stats_by_config = df_benchmark_status.groupby(["Backend", "Hardware", "Subset", "Machine"]).agg(
        Total_Benchmarks=("Status", "count"),
        Failed_Benchmarks=("Status", lambda x: (x == "⛔️").sum()),
    ).reset_index()

    stats_by_config["Success_Rate"] = (
        (stats_by_config["Total_Benchmarks"] - stats_by_config["Failed_Benchmarks"])
        / stats_by_config["Total_Benchmarks"] * 100
    ).round(2)
    stats_by_config["Success_Rate"] = stats_by_config["Success_Rate"].astype(str) + "%"

    config_stats = gr.DataFrame(
        value=stats_by_config,
        headers=["Backend", "Hardware", "Subset", "Machine", "Total_Benchmarks", "Failed_Benchmarks", "Success_Rate"],
        row_count=(len(stats_by_config), "fixed"),
        col_count=(7, "fixed"),
        wrap=True,
    )

    return config_stats

def main():
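    """Build the dashboard UI and launch the Gradio app."""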
    df_benchmark_status = create_benchmark_status_df()

    with gr.Blocks() as demo:
        with gr.Tab("Hardware status"):
            gr.Markdown("# LLM Performance Dashboard")
            gr.Markdown("Status of benchmark results across different configurations")
            create_status_table()
        with gr.Tab("Benchmark status"):
            gr.Markdown("# Benchmark Results Status")
            gr.Markdown("Status of individual benchmark runs with model and experiment details")
            create_benchmark_table(df_benchmark_status)
        with gr.Tab("Stats"):
            gr.Markdown("# Stats")
            gr.Markdown("## Stats by Machine")
            gr.Markdown("Overall statistics per machine")
            compute_machine_stats(df_benchmark_status)
            gr.Markdown("## Stats by Configuration")
            gr.Markdown("Detailed statistics for each configuration")
            compute_config_stats(df_benchmark_status)
        with gr.Tab("Trends"):
            gr.Markdown("# Trends")
            gr.Markdown("Trends in benchmark results")
            gr.Markdown("TODO")

    demo.launch()


if __name__ == "__main__":
    main()