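"""Gradio dashboard for monitoring llm-perf benchmark runs.

Checks which per-configuration dataset repos exist on the Hugging Face Hub,
parses their benchmark.json reports, and displays status and failure
statistics across several tabs.
"""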
import json
from glob import glob

import gradio as gr
import pandas as pd
from huggingface_hub import repo_exists, snapshot_download
from huggingface_hub.errors import RepositoryNotFoundError
from optimum_benchmark import Benchmark

from llm_perf.common.hardware_config import load_hardware_configs
from llm_perf.update_llm_perf_leaderboard import patch_json

PERF_REPO_ID = "optimum-benchmark/llm-perf-{backend}-{hardware}-{subset}-{machine}"
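# Each placeholder is filled from llm_perf/hardware.yaml, yielding a dataset
# repo id like "optimum-benchmark/llm-perf-pytorch-cuda-unquantized-1xA10"
# (illustrative values, not taken from the actual config).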

def create_status_df():
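    """Return a DataFrame with one row per (backend, hardware, subset, machine)
    combination, marking whether its dataset repo exists on the Hub."""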
    hardware_configs = load_hardware_configs("llm_perf/hardware.yaml")

    rows = []
    for hardware_config in hardware_configs:
        for subset in hardware_config.subsets:
            for backend in hardware_config.backends:
                repo_id = PERF_REPO_ID.format(
                    subset=subset,
                    machine=hardware_config.machine,
                    backend=backend,
                    hardware=hardware_config.hardware,
                )

                # A missing dataset repo means no results were ever pushed
                # for this configuration.
                exists = repo_exists(repo_id, repo_type="dataset")
                status = "✅" if exists else "⛔️"

                rows.append({
                    "Backend": backend,
                    "Hardware": hardware_config.hardware,
                    "Subset": subset,
                    "Machine": hardware_config.machine,
                    "Status": status,
                })

    df = pd.DataFrame(rows)
    return df

def create_benchmark_status_df():
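    """Return a DataFrame with one row per benchmark run found in each existing
    per-configuration dataset repo, including model, experiment, and traceback."""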
    hardware_configs = load_hardware_configs("llm_perf/hardware.yaml")

    rows = []
    for hardware_config in hardware_configs:
        for subset in hardware_config.subsets:
            for backend in hardware_config.backends:
                repo_id = PERF_REPO_ID.format(
                    subset=subset,
                    machine=hardware_config.machine,
                    backend=backend,
                    hardware=hardware_config.hardware,
                )

                try:
                    # Only fetch the benchmark.json reports, not the full repo
                    snapshot = snapshot_download(
                        repo_type="dataset",
                        repo_id=repo_id,
                        allow_patterns=["**/benchmark.json"],
                    )
                except RepositoryNotFoundError:
                    print(f"Repository {repo_id} not found")
                    continue

                for file in glob(f"{snapshot}/**/benchmark.json", recursive=True):
                    # Normalize legacy report files before parsing
                    patch_json(file)

                    benchmark = Benchmark.from_json(file)
                    df = benchmark.to_dataframe()

                    for _, row in df.iterrows():
                        # A non-empty traceback marks a failed run
                        traceback = row["report.traceback"] if "report.traceback" in row else ""
                        rows.append({
                            "Backend": backend,
                            "Hardware": hardware_config.hardware,
                            "Subset": subset,
                            "Machine": hardware_config.machine,
                            "Status": "✅" if traceback == "" else "⛔️",
                            "Model": row["config.backend.model"],
                            "Experiment": row["config.name"],
                            "Traceback": traceback,
                            "Full Data": json.dumps(row.to_dict()),
                        })

    df = pd.DataFrame(rows)
    return df

def create_status_table():
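    """Render the repo-existence status table as a Gradio DataFrame."""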
    df = create_status_df()
    return gr.DataFrame(
        value=df,
        headers=["Backend", "Hardware", "Subset", "Machine", "Status"],
        row_count=(len(df), "fixed"),
        col_count=(5, "fixed"),
        wrap=True,
    )

def create_benchmark_table(df_benchmark_status):
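    """Render the per-run benchmark status table as a Gradio DataFrame."""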
    return gr.DataFrame(
        value=df_benchmark_status,
        headers=["Backend", "Hardware", "Subset", "Machine", "Status", "Model", "Experiment", "Traceback", "Full Data"],
        row_count=(len(df_benchmark_status), "fixed"),
        col_count=(9, "fixed"),
        column_widths=[100, 100, 100, 100, 100, 200, 100, 100, 100],
    )

def compute_machine_stats(df_benchmark_status):
    """
    Compute statistics about failed benchmarks per machine.

    Args:
        df_benchmark_status (pd.DataFrame): DataFrame containing benchmark status information
    Returns:
        gr.DataFrame: Gradio DataFrame with machine failure statistics
    """
    # Aggregate total and failed runs per machine
    stats_by_machine = df_benchmark_status.groupby(["Machine"]).agg(
        Total_Benchmarks=("Status", "count"),
        Failed_Benchmarks=("Status", lambda x: (x == "⛔️").sum()),
    ).reset_index()

    stats_by_machine["Success_Rate"] = (
        (stats_by_machine["Total_Benchmarks"] - stats_by_machine["Failed_Benchmarks"])
        / stats_by_machine["Total_Benchmarks"] * 100
    ).round(2)
    stats_by_machine["Success_Rate"] = stats_by_machine["Success_Rate"].astype(str) + "%"

    machine_stats = gr.DataFrame(
        value=stats_by_machine,
        headers=["Machine", "Total_Benchmarks", "Failed_Benchmarks", "Success_Rate"],
        row_count=(len(stats_by_machine), "fixed"),
        col_count=(4, "fixed"),
        wrap=True,
    )

    return machine_stats

def compute_config_stats(df_benchmark_status):
    """
    Compute statistics about failed benchmarks per configuration.

    Args:
        df_benchmark_status (pd.DataFrame): DataFrame containing benchmark status information
    Returns:
        gr.DataFrame: Gradio DataFrame with configuration failure statistics
    """
    # Aggregate total and failed runs per (backend, hardware, subset, machine)
    stats_by_config = df_benchmark_status.groupby(["Backend", "Hardware", "Subset", "Machine"]).agg(
        Total_Benchmarks=("Status", "count"),
        Failed_Benchmarks=("Status", lambda x: (x == "⛔️").sum()),
    ).reset_index()

    stats_by_config["Success_Rate"] = (
        (stats_by_config["Total_Benchmarks"] - stats_by_config["Failed_Benchmarks"])
        / stats_by_config["Total_Benchmarks"] * 100
    ).round(2)
    stats_by_config["Success_Rate"] = stats_by_config["Success_Rate"].astype(str) + "%"

    config_stats = gr.DataFrame(
        value=stats_by_config,
        headers=["Backend", "Hardware", "Subset", "Machine", "Total_Benchmarks", "Failed_Benchmarks", "Success_Rate"],
        row_count=(len(stats_by_config), "fixed"),
        col_count=(7, "fixed"),
        wrap=True,
    )

    return config_stats

def main():
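    """Build the dashboard UI and launch the Gradio app."""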
    df_benchmark_status = create_benchmark_status_df()

    with gr.Blocks() as demo:
        with gr.Tab("Hardware status"):
            gr.Markdown("# LLM Performance Dashboard")
            gr.Markdown("Status of benchmark results across different configurations")
            create_status_table()
        with gr.Tab("Benchmark status"):
            gr.Markdown("# Benchmark Results Status")
            gr.Markdown("Status of individual benchmark runs with model and experiment details")
            create_benchmark_table(df_benchmark_status)
        with gr.Tab("Stats"):
            gr.Markdown("# Stats")
            gr.Markdown("## Stats by Machine")
            gr.Markdown("Overall statistics per machine")
            compute_machine_stats(df_benchmark_status)
            gr.Markdown("## Stats by Configuration")
            gr.Markdown("Detailed statistics for each configuration")
            compute_config_stats(df_benchmark_status)
        with gr.Tab("Trends"):
            gr.Markdown("# Trends")
            gr.Markdown("Trends in benchmark results")
            gr.Markdown("TODO")

    demo.launch()


if __name__ == "__main__":
    main()