diff --git a/.github/workflows/benchmark_cpu_onnxruntime.yaml b/.github/workflows/benchmark_cpu_onnxruntime.yaml
index 6d40a5c..999e975 100644
--- a/.github/workflows/benchmark_cpu_onnxruntime.yaml
+++ b/.github/workflows/benchmark_cpu_onnxruntime.yaml
@@ -2,8 +2,14 @@ name: Benchmark CPU Onnxruntime
on:
workflow_dispatch:
+ inputs:
+ rerun_already_conducted_benchmarks:
+ description: 'Rerun benchmarks that were already conducted'
+ required: true
+ type: boolean
+ default: false
schedule:
- - cron: "0 12 * * 3"
+ - cron: "0 12 * * *"
pull_request:
concurrency:
@@ -16,15 +22,18 @@ env:
jobs:
run_benchmarks:
if: >-
- (github.event_name == 'push' && contains(github.event.head_commit.message, 'cpu_onnxruntime')) ||
- (github.event_name == 'push' && contains(github.event.head_commit.message, 'all_benchmarks')) ||
+ (github.event_name == 'push' && (
+ contains(github.event.head_commit.message, 'cpu_onnxruntime') ||
+ contains(github.event.head_commit.message, 'all_benchmarks')
+ )) ||
(github.event_name == 'push' && github.ref == 'refs/heads/main') ||
github.event_name == 'workflow_dispatch' ||
(github.event_name == 'pull_request' && (
contains(github.event.pull_request.labels.*.name, 'leaderboard') ||
contains(github.event.pull_request.labels.*.name, 'cpu') ||
contains(github.event.pull_request.labels.*.name, 'onnxruntime') ||
- contains(github.event.pull_request.labels.*.name, 'cpu_onnxruntime')
+ contains(github.event.pull_request.labels.*.name, 'cpu_onnxruntime') ||
+ contains(github.event.pull_request.labels.*.name, 'all_benchmarks')
))
strategy:
@@ -48,6 +57,7 @@ jobs:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
DISABLE_WARNINGS: 1
BENCHMARK_TOP_N: 3
+ RERUN_ALREADY_CONDUCTED_BENCHMARKS: ${{ github.event.inputs.rerun_already_conducted_benchmarks }}
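+          # Empty for scheduled, push, and PR runs; the runner treats anything but "true" as false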
with:
image: ${{ env.IMAGE }}
options: |
@@ -60,6 +70,7 @@ jobs:
--env HF_HUB_ENABLE_HF_TRANSFER=1
--env DISABLE_WARNINGS
--env BENCHMARK_TOP_N
+ --env RERUN_ALREADY_CONDUCTED_BENCHMARKS
--volume ${{ github.workspace }}:/workspace
--workdir /workspace
run: |
diff --git a/.github/workflows/benchmark_cpu_openvino.yaml b/.github/workflows/benchmark_cpu_openvino.yaml
index 9e4c70f..1dab78b 100644
--- a/.github/workflows/benchmark_cpu_openvino.yaml
+++ b/.github/workflows/benchmark_cpu_openvino.yaml
@@ -2,8 +2,14 @@ name: Benchmark CPU OpenVINO
on:
workflow_dispatch:
+ inputs:
+ rerun_already_conducted_benchmarks:
+ description: 'Rerun benchmarks that were already conducted'
+ required: true
+ type: boolean
+ default: false
schedule:
- - cron: "0 0 * * *"
+ - cron: "0 6 * * *"
pull_request:
concurrency:
@@ -16,15 +22,18 @@ env:
jobs:
run_benchmarks:
if: >-
- (github.event_name == 'push' && contains(github.event.head_commit.message, 'cpu_openvino')) ||
- (github.event_name == 'push' && contains(github.event.head_commit.message, 'all_benchmarks')) ||
+ (github.event_name == 'push' && (
+ contains(github.event.head_commit.message, 'cpu_openvino') ||
+ contains(github.event.head_commit.message, 'all_benchmarks')
+ )) ||
(github.event_name == 'push' && github.ref == 'refs/heads/main') ||
github.event_name == 'workflow_dispatch' ||
(github.event_name == 'pull_request' && (
contains(github.event.pull_request.labels.*.name, 'leaderboard') ||
contains(github.event.pull_request.labels.*.name, 'cpu') ||
contains(github.event.pull_request.labels.*.name, 'openvino') ||
- contains(github.event.pull_request.labels.*.name, 'cpu_openvino')
+ contains(github.event.pull_request.labels.*.name, 'cpu_openvino') ||
+ contains(github.event.pull_request.labels.*.name, 'all_benchmarks')
))
strategy:
@@ -48,6 +57,7 @@ jobs:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
DISABLE_WARNINGS: 1
BENCHMARK_TOP_N: 50
+ RERUN_ALREADY_CONDUCTED_BENCHMARKS: ${{ github.event.inputs.rerun_already_conducted_benchmarks }}
with:
image: ${{ env.IMAGE }}
options: |
@@ -60,6 +70,7 @@ jobs:
--env HF_HUB_ENABLE_HF_TRANSFER=1
--env DISABLE_WARNINGS
--env BENCHMARK_TOP_N
+ --env RERUN_ALREADY_CONDUCTED_BENCHMARKS
--volume ${{ github.workspace }}:/workspace
--workdir /workspace
run: |
diff --git a/.github/workflows/benchmark_cpu_pytorch.yaml b/.github/workflows/benchmark_cpu_pytorch.yaml
index d0ca50a..a287606 100644
--- a/.github/workflows/benchmark_cpu_pytorch.yaml
+++ b/.github/workflows/benchmark_cpu_pytorch.yaml
@@ -2,6 +2,12 @@ name: Benchmark CPU PyTorch
on:
workflow_dispatch:
+ inputs:
+ rerun_already_conducted_benchmarks:
+ description: 'Rerun benchmarks that were already conducted'
+ required: true
+ type: boolean
+ default: false
schedule:
- cron: "0 0 * * *"
pull_request:
@@ -16,15 +22,18 @@ env:
jobs:
run_benchmarks:
if: >-
- (github.event_name == 'push' && contains(github.event.head_commit.message, 'cpu_pytorch')) ||
- (github.event_name == 'push' && contains(github.event.head_commit.message, 'all_benchmarks')) ||
+ (github.event_name == 'push' && (
+ contains(github.event.head_commit.message, 'cpu_pytorch') ||
+ contains(github.event.head_commit.message, 'all_benchmarks')
+ )) ||
(github.event_name == 'push' && github.ref == 'refs/heads/main') ||
github.event_name == 'workflow_dispatch' ||
(github.event_name == 'pull_request' && (
contains(github.event.pull_request.labels.*.name, 'leaderboard') ||
contains(github.event.pull_request.labels.*.name, 'cpu') ||
contains(github.event.pull_request.labels.*.name, 'pytorch') ||
- contains(github.event.pull_request.labels.*.name, 'cpu_pytorch')
+ contains(github.event.pull_request.labels.*.name, 'cpu_pytorch') ||
+ contains(github.event.pull_request.labels.*.name, 'all_benchmarks')
))
strategy:
@@ -47,6 +56,7 @@ jobs:
MACHINE: ${{ matrix.machine.name }}
HF_TOKEN: ${{ secrets.HF_TOKEN }}
BENCHMARK_TOP_N: 50
+ RERUN_ALREADY_CONDUCTED_BENCHMARKS: ${{ github.event.inputs.rerun_already_conducted_benchmarks }}
with:
image: ${{ env.IMAGE }}
options: |
@@ -58,6 +68,7 @@ jobs:
--env MKL_THREADING_LAYER=GNU
--env HF_HUB_ENABLE_HF_TRANSFER=1
--env BENCHMARK_TOP_N
+ --env RERUN_ALREADY_CONDUCTED_BENCHMARKS
--volume ${{ github.workspace }}:/workspace
--workdir /workspace
run: |
diff --git a/.github/workflows/benchmark_cuda_pytorch.yaml b/.github/workflows/benchmark_cuda_pytorch.yaml
index d87204b..d04f8d0 100644
--- a/.github/workflows/benchmark_cuda_pytorch.yaml
+++ b/.github/workflows/benchmark_cuda_pytorch.yaml
@@ -2,8 +2,14 @@ name: Benchmark CUDA PyTorch
on:
workflow_dispatch:
+ inputs:
+ rerun_already_conducted_benchmarks:
+ description: 'Rerun benchmarks that were already conducted'
+ required: true
+ type: boolean
+ default: false
schedule:
- - cron: "0 3 * * 0"
+ - cron: "0 0 * * *"
pull_request:
concurrency:
@@ -16,15 +22,18 @@ env:
jobs:
run_benchmarks:
if: >-
- (github.event_name == 'push' && contains(github.event.head_commit.message, 'cuda_pytorch')) ||
- (github.event_name == 'push' && contains(github.event.head_commit.message, 'all_benchmarks')) ||
+ (github.event_name == 'push' && (
+ contains(github.event.head_commit.message, 'cuda_pytorch') ||
+ contains(github.event.head_commit.message, 'all_benchmarks')
+ )) ||
(github.event_name == 'push' && github.ref == 'refs/heads/main') ||
github.event_name == 'workflow_dispatch' ||
(github.event_name == 'pull_request' && (
contains(github.event.pull_request.labels.*.name, 'leaderboard') ||
contains(github.event.pull_request.labels.*.name, 'cuda') ||
contains(github.event.pull_request.labels.*.name, 'pytorch') ||
- contains(github.event.pull_request.labels.*.name, 'cuda_pytorch')
+ contains(github.event.pull_request.labels.*.name, 'cuda_pytorch') ||
+ contains(github.event.pull_request.labels.*.name, 'all_benchmarks')
))
strategy:
@@ -51,6 +60,7 @@ jobs:
MACHINE: ${{ matrix.machine.name }}
HF_TOKEN: ${{ secrets.HF_TOKEN }}
BENCHMARK_TOP_N: 50
+ RERUN_ALREADY_CONDUCTED_BENCHMARKS: ${{ github.event.inputs.rerun_already_conducted_benchmarks }}
with:
image: ${{ env.IMAGE }}
options: |
@@ -63,6 +73,7 @@ jobs:
--env MKL_THREADING_LAYER=GNU
--env HF_HUB_ENABLE_HF_TRANSFER=1
--env BENCHMARK_TOP_N
+ --env RERUN_ALREADY_CONDUCTED_BENCHMARKS
--volume ${{ github.workspace }}:/workspace
--workdir /workspace
run: |
diff --git a/.gitignore b/.gitignore
index 3fcc481..6800897 100644
--- a/.gitignore
+++ b/.gitignore
@@ -191,3 +191,4 @@ optimum-benchmark/
*.egg-info/
data/
+load_model_codecarbon.json
\ No newline at end of file
diff --git a/Makefile b/Makefile
index dc0ad56..f9ed5f1 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,11 @@
+# Load environment variables
+ifneq (,$(wildcard .env))
+ include .env
+ export
+endif
+
# Style and Quality checks
-.PHONY: style quality install install-dev run_cpu_container run_cuda_container run_rocm_container cpu-pytorch-container cpu-openvino-container collector-container
+.PHONY: style quality run-optimum-benchmark-cpu-container run-optimum-benchmark-cuda-container run-optimum-benchmark-rocm-container run-llm-perf-benchmark-cpu-pytorch run-llm-perf-benchmark-cpu-openvino run-llm-perf-benchmark-cuda-pytorch run-llm-perf-benchmark-collector help
quality:
ruff check .
@@ -9,31 +15,46 @@ style:
ruff format .
ruff check --fix .
-install:
- pip install .
-
-install-dev:
- DEBUG=1 uv pip install -e .
-
# Running optimum-benchmark containers
-run_cpu_container:
+run-optimum-benchmark-cpu-container:
docker run -it --rm --pid host --volume .:/llm-perf-backend --workdir /llm-perf-backend ghcr.io/huggingface/optimum-benchmark:latest-cpu
-run_cuda_container:
+run-optimum-benchmark-cuda-container:
docker run -it --rm --pid host --gpus all --shm-size 64G --volume .:/llm-perf-backend --workdir /llm-perf-backend ghcr.io/huggingface/optimum-benchmark:latest-cuda
-run_rocm_container:
+run-optimum-benchmark-rocm-container:
docker run -it --rm --shm-size 64G --device /dev/kfd --device /dev/dri --volume .:/llm-perf-backend --workdir /llm-perf-backend ghcr.io/huggingface/optimum-benchmark:latest-rocm
-# Running llm-perf backend containers
-cpu-pytorch-container:
- docker build -t cpu-pytorch -f docker/cpu-pytorch/Dockerfile .
- docker run -it --rm --pid host cpu-pytorch
-
-cpu-openvino-container:
- docker build -t cpu-openvino -f docker/cpu-openvino/Dockerfile .
- docker run -it --rm --pid host cpu-openvino
+# Running llm-perf-leaderboard benchmarks
+run-llm-perf-benchmark-cpu-pytorch:
+ docker build -t llm-perf-backend-cpu-pytorch -f docker/cpu-pytorch/Dockerfile .
+ docker run -it --rm --pid host llm-perf-backend-cpu-pytorch
+
+run-llm-perf-benchmark-cpu-openvino:
+ docker build -t llm-perf-backend-cpu-openvino -f docker/cpu-openvino/Dockerfile .
+ docker run -it --rm --pid host llm-perf-backend-cpu-openvino
+
+run-llm-perf-benchmark-cuda-pytorch:
+ docker build -t llm-perf-backend-cuda-pytorch -f docker/gpu-cuda/Dockerfile .
+ docker run -it --rm --pid host --gpus all --shm-size 64G --volume .:/llm-perf-backend --workdir /llm-perf-backend llm-perf-backend-cuda-pytorch
+
+run-llm-perf-benchmark-collector:
+ docker build -t llm-perf-backend-collector -f docker/collector/Dockerfile .
+ docker run -it --rm --pid host llm-perf-backend-collector
+
+help:
+ @echo "Commands:"
+ @echo " style - Format code and fix style issues"
+ @echo " quality - Run style checks without fixing"
+ @echo ""
+ @echo "Optimum Benchmark Containers:"
+ @echo " run-optimum-benchmark-cpu-container - Run CPU container"
+ @echo " run-optimum-benchmark-cuda-container - Run CUDA container"
+ @echo " run-optimum-benchmark-rocm-container - Run ROCm container"
+ @echo ""
+ @echo "LLM Performance Backend Containers:"
+ @echo " run-llm-perf-benchmark-cpu-pytorch - Run the llm-perf-leaderboard Benchmark CPU PyTorch"
+ @echo " run-llm-perf-benchmark-cpu-openvino - Run the llm-perf-leaderboard Benchmark CPU OpenVINO"
+ @echo " run-llm-perf-benchmark-cuda-pytorch - Run the llm-perf-leaderboard Benchmark CUDA PyTorch"
+ @echo " run-llm-perf-benchmark-collector - Run the llm-perf-leaderboard Collector container"
-collector-container:
- docker build -t collector -f docker/collector/Dockerfile .
- docker run -it --rm --pid host collector
diff --git a/README.md b/README.md
index 7c69305..d25bd97 100644
--- a/README.md
+++ b/README.md
@@ -23,8 +23,8 @@ LLM-perf Backend is designed to:
## Installation 🛠️
-1. Clone the repository:
-```bash
+1. Clone the repository:
+```bash
git clone https://github.com/huggingface/llm-perf-backend
cd llm-perf-backend
```
@@ -53,7 +52,6 @@ llm-perf run-benchmark --hardware cpu --backend pytorch
```
### Configuration Options
-
View all the options with
```bash
llm-perf run-benchmark --help
@@ -62,6 +60,18 @@ llm-perf run-benchmark --help
- `--hardware`: Target hardware platform (cpu, cuda)
- `--backend`: Backend framework to use (pytorch, onnxruntime, etc.)
+### (Optional) Running Benchmarks via Docker
+
+You can run the benchmarks using the following make commands:
+
+```bash
+# CPU Benchmarks
+make run-llm-perf-benchmark-cpu-pytorch # Run PyTorch CPU benchmark
+make run-llm-perf-benchmark-cpu-openvino # Run OpenVINO CPU benchmark
+
+# GPU Benchmarks
+make run-llm-perf-benchmark-cuda-pytorch # Run PyTorch CUDA benchmark
+```
## Benchmark Dataset 📊
Results are published to the official dataset:
@@ -75,4 +85,5 @@ All benchmarks follow these standardized settings:
- Memory tracking:
- Maximum allocated memory
- Maximum reserved memory
- - Maximum used memory (via PyNVML for GPU)
\ No newline at end of file
+ - Maximum used memory (via PyNVML for GPU)
+
diff --git a/dashboard/main.py b/dashboard/main.py
new file mode 100644
index 0000000..df3fd88
--- /dev/null
+++ b/dashboard/main.py
@@ -0,0 +1,4 @@
+# -> need to view the individual runs to get details
+# -> get stats about the latest runs for all the hardware.yml
+# -> get stats on the latest github actions
+# -> get the stats on the top 50 models
diff --git a/dashboard/requirements.txt b/dashboard/requirements.txt
new file mode 100644
index 0000000..e69de29
diff --git a/docker/cpu-onnxruntime/Dockerfile b/docker/cpu-onnxruntime/Dockerfile
new file mode 100644
index 0000000..03ec92d
--- /dev/null
+++ b/docker/cpu-onnxruntime/Dockerfile
@@ -0,0 +1,11 @@
+FROM ghcr.io/huggingface/optimum-benchmark:latest-cpu
+
+WORKDIR /workspace
+
+COPY setup.py .
+
+RUN pip install -e .[onnxruntime]
+
+COPY . .
+
+ENTRYPOINT ["llm-perf", "run-benchmark", "--hardware", "cpu", "--backend", "onnxruntime"]
diff --git a/docker/cpu-openvino/Dockerfile b/docker/cpu-openvino/Dockerfile
index 2f88e1e..4446353 100644
--- a/docker/cpu-openvino/Dockerfile
+++ b/docker/cpu-openvino/Dockerfile
@@ -3,10 +3,9 @@ FROM ghcr.io/huggingface/optimum-benchmark:latest-cpu
WORKDIR /workspace
COPY setup.py .
-# COPY pyproject.toml .
RUN pip install -e .[openvino]
COPY . .
-CMD ["llm-perf", "run-benchmark", "--hardware", "cpu", "--backend", "openvino"]
+ENTRYPOINT ["llm-perf", "run-benchmark", "--hardware", "cpu", "--backend", "openvino"]
diff --git a/docker/cpu-pytorch/Dockerfile b/docker/cpu-pytorch/Dockerfile
index f6e3cc7..76052d0 100644
--- a/docker/cpu-pytorch/Dockerfile
+++ b/docker/cpu-pytorch/Dockerfile
@@ -2,8 +2,10 @@ FROM ghcr.io/huggingface/optimum-benchmark:latest-cpu
WORKDIR /workspace
-COPY . .
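+# Copy setup.py alone first so the dependency install layer stays cached when only source files change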
+COPY setup.py .
RUN pip install -e .
-CMD ["llm-perf", "run-benchmark", "--hardware", "cpu", "--backend", "pytorch"]
+COPY . .
+
+ENTRYPOINT ["llm-perf", "run-benchmark", "--hardware", "cpu", "--backend", "pytorch"]
diff --git a/docker/gpu-cuda/Dockerfile b/docker/gpu-cuda/Dockerfile
new file mode 100644
index 0000000..3e503b4
--- /dev/null
+++ b/docker/gpu-cuda/Dockerfile
@@ -0,0 +1,12 @@
+FROM ghcr.io/huggingface/optimum-benchmark:latest-cuda
+
+WORKDIR /workspace
+
+COPY setup.py .
+
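+# flash-attn needs torch available at build time, hence --no-build-isolation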
+RUN pip install -e .[cuda] \
+ && pip install flash-attn --no-build-isolation
+
+COPY . .
+
+ENTRYPOINT ["llm-perf", "run-benchmark", "--hardware", "cuda", "--backend", "pytorch"]
diff --git a/llm_perf/benchmark_runners/cpu/update_llm_perf_cpu_onnxruntime.py b/llm_perf/benchmark_runners/cpu/update_llm_perf_cpu_onnxruntime.py
index 2d8861e..eac63b5 100644
--- a/llm_perf/benchmark_runners/cpu/update_llm_perf_cpu_onnxruntime.py
+++ b/llm_perf/benchmark_runners/cpu/update_llm_perf_cpu_onnxruntime.py
@@ -19,9 +19,9 @@ def __init__(self):
super().__init__(backend="onnxruntime", device="cpu")
self.attention_configs = self._get_attention_configs()
- assert (
- self.subset is not None
- ), "SUBSET environment variable must be set for benchmarking"
+ assert self.subset is not None, (
+ "SUBSET environment variable must be set for benchmarking"
+ )
self.weights_configs = self._get_weights_configs(self.subset)
def get_list_of_benchmarks_to_run(self) -> List[Dict[str, Any]]:
@@ -47,9 +47,9 @@ def get_benchmark_config(self, model: str, **kwargs) -> BenchmarkConfig:
weights_config = kwargs["weights_config"]
attn_implementation = kwargs["attn_implementation"]
- assert (
- weights_config in self.weights_configs
- ), f"your config does not contain {weights_config}, adjust your _get_weights_configs to fix this issue"
+ assert weights_config in self.weights_configs, (
+ f"your config does not contain {weights_config}, adjust your _get_weights_configs to fix this issue"
+ )
torch_dtype = self.weights_configs[weights_config]["torch_dtype"]
quant_config = self.weights_configs[weights_config]["quant_config"]
diff --git a/llm_perf/benchmark_runners/cpu/update_llm_perf_cpu_openvino.py b/llm_perf/benchmark_runners/cpu/update_llm_perf_cpu_openvino.py
index 0a1f771..0d7e3f8 100644
--- a/llm_perf/benchmark_runners/cpu/update_llm_perf_cpu_openvino.py
+++ b/llm_perf/benchmark_runners/cpu/update_llm_perf_cpu_openvino.py
@@ -19,9 +19,9 @@ def __init__(self):
super().__init__(backend="openvino", device="cpu")
self.attention_configs = self._get_attention_configs()
- assert (
- self.subset is not None
- ), "SUBSET environment variable must be set for benchmarking"
+ assert self.subset is not None, (
+ "SUBSET environment variable must be set for benchmarking"
+ )
self.weights_configs = self._get_weights_configs(self.subset)
def get_list_of_benchmarks_to_run(self) -> List[Dict[str, Any]]:
@@ -47,9 +47,9 @@ def get_benchmark_config(self, model: str, **kwargs) -> BenchmarkConfig:
weights_config = kwargs["weights_config"]
attn_implementation = kwargs["attn_implementation"]
- assert (
- weights_config in self.weights_configs
- ), f"your config does not contain {weights_config}, adjust your _get_weights_configs to fix this issue"
+ assert weights_config in self.weights_configs, (
+ f"your config does not contain {weights_config}, adjust your _get_weights_configs to fix this issue"
+ )
quant_config = self.weights_configs[weights_config]["quant_config"]
diff --git a/llm_perf/benchmark_runners/cpu/update_llm_perf_cpu_pytorch.py b/llm_perf/benchmark_runners/cpu/update_llm_perf_cpu_pytorch.py
index c14cf6b..1c4440f 100644
--- a/llm_perf/benchmark_runners/cpu/update_llm_perf_cpu_pytorch.py
+++ b/llm_perf/benchmark_runners/cpu/update_llm_perf_cpu_pytorch.py
@@ -19,9 +19,9 @@ def __init__(self):
super().__init__(backend="pytorch", device="cpu")
self.attention_configs = self._get_attention_configs()
- assert (
- self.subset is not None
- ), "SUBSET environment variable must be set for benchmarking"
+ assert self.subset is not None, (
+ "SUBSET environment variable must be set for benchmarking"
+ )
self.weights_configs = self._get_weights_configs(self.subset)
def get_list_of_benchmarks_to_run(self) -> List[Dict[str, Any]]:
@@ -47,9 +47,9 @@ def get_benchmark_config(self, model: str, **kwargs) -> BenchmarkConfig:
weights_config = kwargs["weights_config"]
attn_implementation = kwargs["attn_implementation"]
- assert (
- weights_config in self.weights_configs
- ), f"your config does not contain {weights_config}, adjust your _get_weights_configs to fix this issue"
+ assert weights_config in self.weights_configs, (
+ f"your config does not contain {weights_config}, adjust your _get_weights_configs to fix this issue"
+ )
torch_dtype = self.weights_configs[weights_config]["torch_dtype"]
quant_scheme = self.weights_configs[weights_config]["quant_scheme"]
diff --git a/llm_perf/benchmark_runners/cuda/update_llm_perf_cuda_pytorch.py b/llm_perf/benchmark_runners/cuda/update_llm_perf_cuda_pytorch.py
index fef89c9..b2f717b 100644
--- a/llm_perf/benchmark_runners/cuda/update_llm_perf_cuda_pytorch.py
+++ b/llm_perf/benchmark_runners/cuda/update_llm_perf_cuda_pytorch.py
@@ -19,9 +19,9 @@ def __init__(self):
super().__init__(backend="pytorch", device="cuda")
self.attention_configs = self._get_attention_configs()
- assert (
- self.subset is not None
- ), "SUBSET environment variable must be set for benchmarking"
+ assert self.subset is not None, (
+ "SUBSET environment variable must be set for benchmarking"
+ )
self.weights_configs = self._get_weights_configs(self.subset)
def get_list_of_benchmarks_to_run(self) -> List[Dict[str, Any]]:
@@ -55,9 +55,9 @@ def get_benchmark_config(self, model: str, **kwargs) -> BenchmarkConfig:
weights_config = kwargs["weights_config"]
attn_implementation = kwargs["attn_implementation"]
- assert (
- weights_config in self.weights_configs
- ), f"your config does contains the {weights_config}, adjust your _get_weights_configs to fix this issue"
+ assert weights_config in self.weights_configs, (
+ f"your config does contains the {weights_config}, adjust your _get_weights_configs to fix this issue"
+ )
torch_dtype = self.weights_configs[weights_config]["torch_dtype"]
quant_scheme = self.weights_configs[weights_config]["quant_scheme"]
@@ -206,7 +206,7 @@ def _get_weights_configs(self, subset) -> Dict[str, Dict[str, Any]]:
raise ValueError(f"Unknown subset: {subset}")
def _get_attention_configs(self) -> List[str]:
- return ["eager", "sdpa", "flash_attention_2"]
+ return ["eager", "sdpa"]
if __name__ == "__main__":
diff --git a/llm_perf/cli.py b/llm_perf/cli.py
index 6993744..ec42640 100644
--- a/llm_perf/cli.py
+++ b/llm_perf/cli.py
@@ -20,9 +20,13 @@
from llm_perf.update_llm_perf_leaderboard import update_llm_perf_leaderboard
+from loguru import logger
+
if os.environ.get("DISABLE_WARNINGS", "0") == "1":
warnings.filterwarnings("ignore")
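+# Presumably flags CI mode for downstream tooling; note this is set unconditionally, even for local runs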
+os.environ["CI"] = "GITHUB_ACTIONS"
+
app = typer.Typer()
@@ -46,9 +50,9 @@ def run_benchmark(
):
env_vars = load_dotenv()
if env_vars:
- print("Environment variables loaded successfully")
+ logger.info("Environment variables loaded successfully")
else:
- print("No environment variables loaded")
+ logger.info("No environment variables loaded")
if hardware == Hardware.CPU:
if backend == Backend.ONNXRUNTIME:
@@ -61,7 +65,7 @@ def run_benchmark(
if backend == Backend.PYTORCH:
runner = CUDAPyTorchBenchmarkRunner()
else:
- typer.echo(f"CUDA is not supported for {backend} backend")
+ logger.error(f"CUDA is not supported for {backend} backend")
raise typer.Exit(code=1)
runner.run_benchmarks()
@@ -72,5 +76,19 @@ def update_leaderboard():
update_llm_perf_leaderboard()
+@app.command()
+def launch_dashboard(
+ port: int = typer.Option(7860, help="Port to run the dashboard on"),
+ share: bool = typer.Option(False, help="Whether to create a public URL"),
+):
+ """Launch the LLM Performance Dashboard."""
+ from llm_perf.dashboard_app import DashboardApp
+
+ logger.info(f"Starting dashboard on port {port}")
+
+ dashboard = DashboardApp()
+ dashboard.launch(port=port, share=share)
+
+
if __name__ == "__main__":
app()
diff --git a/llm_perf/common/benchmark_runner.py b/llm_perf/common/benchmark_runner.py
index e6ecac8..33de8ca 100644
--- a/llm_perf/common/benchmark_runner.py
+++ b/llm_perf/common/benchmark_runner.py
@@ -1,17 +1,22 @@
import os
+import sys
import traceback
from abc import ABC, abstractmethod
-from logging import getLogger
from typing import Any, Dict, List, Optional
+import subprocess
+import time
+import uuid
+from datetime import datetime
+from loguru import logger
from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport
-from optimum_benchmark.logging_utils import setup_logging
from llm_perf.common.utils import (
CANONICAL_PRETRAINED_OPEN_LLM_LIST,
- OPEN_LLM_LIST,
- PRETRAINED_OPEN_LLM_LIST,
)
+from llm_perf.common.memory_utils import log_memory_usage
+from llm_perf.common.dashboard import BenchmarkRunDetails
+from llm_perf.common.dashboard_manager import DashboardManager
class LLMPerfBenchmarkManager(ABC):
@@ -26,7 +31,7 @@ def __init__(
self.device = device
self.subset = subset or os.getenv("SUBSET", None)
self.machine = machine or os.getenv("MACHINE", None)
- self.logger = getLogger("llm-perf-backend")
+ self.dashboard_manager = DashboardManager()
if self.machine is None and self.subset is None:
self.push_repo_id = (
@@ -34,6 +39,7 @@ def __init__(
)
self.canonical_pretrained_open_llm_list = ["gpt2"]
self.subset = "unquantized"
+ self.machine = "debug" # Set a default machine name for debug mode
elif self.machine is not None and self.subset is not None:
self.push_repo_id = f"optimum-benchmark/llm-perf-{self.backend}-{self.device}-{self.subset}-{self.machine}"
else:
@@ -41,12 +47,8 @@ def __init__(
"Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging"
)
- self.logger.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}")
- self.logger.info(
- f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}"
- )
- self.logger.info(
- f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}"
+ logger.info(
+ f"Starting benchmark runner with backend: {self.backend}, device: {self.device}, subset: {self.subset}, machine: {self.machine}"
)
@abstractmethod
@@ -73,22 +75,188 @@ def get_list_of_benchmarks_to_run(self) -> List[Dict[str, Any]]:
"This method should be implemented in the child class"
)
- def run_benchmarks(self):
- os.environ["LOG_TO_FILE"] = "0"
- os.environ["LOG_LEVEL"] = "INFO"
- setup_logging(level="INFO", prefix="MAIN-PROCESS")
+ def run_single_benchmark_in_subprocess(
+ self, model: str, run_id: str, run_start_time: str, **kwargs
+ ) -> bool:
+ """Run a single benchmark in a separate process"""
+ try:
+ # Create the Python script to run in subprocess
+ script = f"""
+import sys
+import os
+from {self.__class__.__module__} import {self.__class__.__name__}
+from loguru import logger
+import traceback
+
+try:
+ runner = {self.__class__.__name__}()
+
+ runner.run_benchmark(model="{model}", **{kwargs})
+ sys.exit(0)
+except Exception:
+ logger.error("Error in subprocess:" + "\\n" + traceback.format_exc())
+ sys.exit(1)
+"""
+
+ # Run the subprocess with timeout
+ result = subprocess.run(
+ [sys.executable, "-c", script],
+ text=True,
+ env={
+ **os.environ,
+ "PYTHONUNBUFFERED": "1",
+ "LOG_TO_FILE": "0", # Disable file logging for optimum-benchmark
+ "BENCHMARK_RUN_ID": run_id,
+ "BENCHMARK_START_TIME": run_start_time,
+ },
+ timeout=3600, # 1 hour timeout
+ )
+
+ return result.returncode == 0
+
+ except subprocess.TimeoutExpired:
+ logger.error(f"Benchmark timed out for model {model}")
+ return False
+ except Exception:
+ logger.error(
+ "Failed to run benchmark process:" + "\n" + traceback.format_exc()
+ )
+ return False
+ def run_benchmarks(self):
+ """Run all benchmarks sequentially with process isolation"""
benchmarks_to_run = self.get_list_of_benchmarks_to_run()
- self.logger.info(
+ logger.info(
f"Running a total of {len(benchmarks_to_run)} benchmarks, "
f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models"
)
- for benchmark_name in benchmarks_to_run:
- assert "model" in benchmark_name, "each benchmark should have a model"
+ logger.info(
+ f"Models that are being benchmarked: {CANONICAL_PRETRAINED_OPEN_LLM_LIST}"
+ )
+
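+ # Populated from the workflow_dispatch input in the benchmark workflows; defaults to false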
+ rerun_already_conducted_benchmarks = (
+ os.getenv("RERUN_ALREADY_CONDUCTED_BENCHMARKS", "false") == "true"
+ )
+
+ total_benchmarks = len(benchmarks_to_run)
+ completed_benchmarks = 0
+ failed_benchmarks = 0
+ skipped_benchmarks = 0
+ failed_models = []
+ start_time = time.time()
+
+ # Generate run ID and start time for this benchmark session
+ run_id = str(uuid.uuid4())
+ run_start_time = datetime.now().isoformat()
+
+ for benchmark_config in benchmarks_to_run:
+ try:
+ # Log memory before benchmark
+ logger.info("Memory usage before benchmark:")
+ log_memory_usage("before")
+
+ model = benchmark_config.pop("model") # Remove model from kwargs
+ benchmark_name = self.get_benchmark_name(model, **benchmark_config)
+ subfolder = f"{benchmark_name}/{model.replace('/', '--')}"
+
+ if not rerun_already_conducted_benchmarks:
+ if self.is_benchmark_conducted(self.push_repo_id, subfolder):
+ logger.info(
+ f"Skipping already conducted benchmark: {benchmark_name}"
+ )
+ benchmark_config["model"] = model # Restore model key
+ completed_benchmarks += 1
+ skipped_benchmarks += 1
+ success_rate = (
+ (
+ (completed_benchmarks - failed_benchmarks)
+ / completed_benchmarks
+ )
+ * 100
+ if completed_benchmarks > 0
+ else 100
+ )
+ logger.info(
+ f"\nProgress: {completed_benchmarks}/{total_benchmarks} benchmarks completed ({(completed_benchmarks / total_benchmarks) * 100:.1f}%) - Current success rate: {success_rate:.1f}%\n"
+ )
+ continue
+
+ logger.info(
+ f"Starting benchmark for model {model} with config: {benchmark_config}"
+ )
+
+ # Run the benchmark in a separate process
+ success = self.run_single_benchmark_in_subprocess(
+ model=model,
+ run_id=run_id,
+ run_start_time=run_start_time,
+ **benchmark_config,
+ )
+
+ if not success:
+ logger.error(f"Benchmark failed for model {model}")
+ failed_benchmarks += 1
+ failed_models.append(model)
- self.run_benchmark(**benchmark_name)
+ completed_benchmarks += 1
+ success_rate = (
+ ((completed_benchmarks - failed_benchmarks) / completed_benchmarks)
+ * 100
+ if completed_benchmarks > 0
+ else 100
+ )
+ logger.info(
+ f"\nProgress: {completed_benchmarks}/{total_benchmarks} benchmarks completed ({(completed_benchmarks / total_benchmarks) * 100:.1f}%) - Current success rate: {success_rate:.1f}%\n"
+ )
+
+ # Log memory after benchmark
+ logger.info("Memory usage after benchmark:")
+ log_memory_usage("after")
+
+ except Exception as e:
+ logger.error(f"Failed to run benchmark for {model}: {str(e)}")
+ logger.error(traceback.format_exc())
+ failed_benchmarks += 1
+ failed_models.append(model)
+ finally:
+ # Restore model key in case the config is reused
+ benchmark_config["model"] = model
+
+ # Calculate execution time
+ total_time = time.time() - start_time
+ hours = int(total_time // 3600)
+ minutes = int((total_time % 3600) // 60)
+ seconds = int(total_time % 60)
+
+ # Print summary
+ logger.info("\n" + "=" * 50)
+ logger.info("BENCHMARK EXECUTION SUMMARY")
+ logger.info("=" * 50)
+ logger.info(f"Total execution time: {hours}h {minutes}m {seconds}s")
+ logger.info(f"Total benchmarks: {total_benchmarks}")
+ logger.info(
+ f"Successfully completed: {completed_benchmarks - failed_benchmarks}"
+ )
+ logger.info(f"Failed: {failed_benchmarks}")
+ logger.info(f"Skipped (already conducted): {skipped_benchmarks}")
+ logger.info(
+ f"Success rate: {((completed_benchmarks - failed_benchmarks) / total_benchmarks) * 100:.1f}%"
+ )
+
+ if failed_models:
+ logger.info("\nFailed models:")
+ for model in failed_models:
+ logger.info(f" - {model}")
+
+ logger.info("\nConfiguration:")
+ logger.info(f" Backend: {self.backend}")
+ logger.info(f" Device: {self.device}")
+ logger.info(f" Subset: {self.subset}")
+ logger.info(f" Machine: {self.machine}")
+ logger.info(f" Rerun already conducted: {rerun_already_conducted_benchmarks}")
+ logger.info("=" * 50 + "\n")
def is_benchmark_conducted(self, push_repo_id, subfolder):
try:
@@ -114,18 +282,6 @@ def run_benchmark(self, **kwargs):
benchmark_name = self.get_benchmark_name(model, **kwargs)
subfolder = f"{benchmark_name}/{model.replace('/', '--')}"
- if not self.is_benchmark_supported(**kwargs):
- self.logger.info(
- f"Skipping benchmark {benchmark_name} with model {model} since it is not supported"
- )
- return
-
- if self.is_benchmark_conducted(self.push_repo_id, subfolder):
- self.logger.info(
- f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted"
- )
- return
-
benchmark_config = self.get_benchmark_config(model, **kwargs)
benchmark_config.push_to_hub(
repo_id=self.push_repo_id, subfolder=subfolder, private=True
@@ -141,8 +297,23 @@ def get_benchmark_config(self, model: str, **kwargs) -> BenchmarkConfig:
def execute_and_log_benchmark(
self, benchmark_config: BenchmarkConfig, subfolder: str
):
+ # Get run_id and run_start_time from environment variables
+ run_id = os.environ.get("BENCHMARK_RUN_ID")
+ run_start_time = os.environ.get("BENCHMARK_START_TIME")
+
+ if not run_id or not run_start_time:
+ # Fallback to generating new ones if not provided
+ run_id = str(uuid.uuid4())
+ run_start_time = datetime.now().isoformat()
+
+ success = False
+ error_traceback = ""
+
try:
- self.logger.info(
+ logger.info("Memory usage before execution:")
+ log_memory_usage("before")
+
+ logger.info(
f"Running benchmark {benchmark_config.name} with model {benchmark_config.backend.model}"
)
benchmark_report = Benchmark.launch(benchmark_config)
@@ -153,13 +324,17 @@ def execute_and_log_benchmark(
benchmark.push_to_hub(
repo_id=self.push_repo_id, subfolder=subfolder, private=True
)
+
+ logger.info("Memory usage after execution:")
+ log_memory_usage("after")
+
+ success = True
+
except Exception as e:
- self.logger.error(
- f"Benchmark {benchmark_config.name} failed with model {benchmark_config.backend.model}, error:\n{e}"
- )
- benchmark_report = BenchmarkReport.from_dict(
- {"traceback": traceback.format_exc()}
- )
+ error_msg = f"Benchmark {benchmark_config.name} failed with model {benchmark_config.backend.model}, error:\n{e}"
+ logger.error(error_msg)
+ error_traceback = traceback.format_exc()
+ benchmark_report = BenchmarkReport.from_dict({"traceback": error_traceback})
benchmark_report.push_to_hub(
repo_id=self.push_repo_id, subfolder=subfolder, private=True
)
@@ -167,3 +342,26 @@ def execute_and_log_benchmark(
benchmark.push_to_hub(
repo_id=self.push_repo_id, subfolder=subfolder, private=True
)
+
+ finally:
+ # At this point self.machine and self.subset should be strings
+ # If they're not, use default values
+ machine = self.machine if self.machine is not None else "unknown"
+ subset = self.subset if self.subset is not None else "unknown"
+
+ # Create and upload run details
+ run_details = BenchmarkRunDetails(
+ machine=machine,
+ hardware=self.device,
+ subsets=subset,
+ backends=self.backend,
+ model=benchmark_config.backend.model,
+ success=success,
+ traceback=error_traceback,
+ last_updated=datetime.now().isoformat(),
+ run_id=run_id,
+ run_start_time=run_start_time,
+ )
+
+ # Upload to dashboard
+ self.dashboard_manager.upload_run_details(run_details)
diff --git a/llm_perf/common/dashboard.py b/llm_perf/common/dashboard.py
new file mode 100644
index 0000000..cba4345
--- /dev/null
+++ b/llm_perf/common/dashboard.py
@@ -0,0 +1,15 @@
+from dataclasses import dataclass
+
+
+@dataclass
+class BenchmarkRunDetails:
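+ """A single benchmark run and its outcome, as stored in the dashboard dataset."""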
+ machine: str
+ hardware: str
+ subsets: str
+ backends: str
+ model: str
+ success: bool
+ traceback: str
+ last_updated: str
+ run_id: str
+ run_start_time: str
diff --git a/llm_perf/common/dashboard_manager.py b/llm_perf/common/dashboard_manager.py
new file mode 100644
index 0000000..88d2469
--- /dev/null
+++ b/llm_perf/common/dashboard_manager.py
@@ -0,0 +1,232 @@
+import pandas as pd
+from datasets import Dataset, load_dataset
+from huggingface_hub import create_repo, HfApi
+from loguru import logger
+from typing import List, Optional
+import time
+
+from llm_perf.common.dashboard import BenchmarkRunDetails
+
+DASHBOARD_REPO_ID = "optimum-benchmark/llm-perf-dashboard"
+MAX_RETRIES = 3
+RETRY_DELAY = 2 # seconds
+
+
+class DashboardManager:
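+ """Appends benchmark run records to the dashboard dataset on the Hub, using a
+ read-append-push cycle with an optimistic commit check and a small retry loop."""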
+ def __init__(self):
+ # Ensure the dataset repository exists
+ create_repo(repo_id=DASHBOARD_REPO_ID, repo_type="dataset", exist_ok=True)
+ self._current_commit = None
+ self._api = HfApi()
+ self._is_first_upload = False
+
+ def _get_current_commit(self) -> Optional[str]:
+ """Get the current commit hash of the main branch."""
+ try:
+ repo_info = self._api.repo_info(
+ repo_id=DASHBOARD_REPO_ID, repo_type="dataset"
+ )
+ return repo_info.sha
+ except Exception as e:
+ logger.error(f"Failed to get current commit: {str(e)}")
+ return None
+
+ def _load_existing_dataset(self) -> Optional[Dataset]:
+ """Load the existing dataset from the hub."""
+ try:
+ dataset = load_dataset(DASHBOARD_REPO_ID, split="train")
+ if isinstance(dataset, Dataset):
+ self._current_commit = self._get_current_commit()
+ return dataset
+ else:
+ logger.error("Loaded dataset is not of type Dataset")
+ return None
+ except Exception as e:
+ if "doesn't contain any data files" in str(e):
+ logger.info("No existing dataset found, this will be the first upload")
+ self._is_first_upload = True
+ self._current_commit = self._get_current_commit()
+ return None
+ logger.error(f"Failed to load existing dataset: {str(e)}")
+ return None
+
+ def _verify_commit(self) -> bool:
+ """Verify that the current commit hasn't changed."""
+ if self._is_first_upload:
+ # For first upload, we don't need to verify commit
+ return True
+
+ current = self._get_current_commit()
+ if current != self._current_commit:
+ logger.error("Dataset has been updated since last read. Aborting upload.")
+ return False
+ return True
+
+ def _convert_to_dict(self, run_details: BenchmarkRunDetails) -> dict:
+ """Convert BenchmarkRunDetails to a dictionary format suitable for the dataset."""
+ return {
+ "machine": run_details.machine,
+ "hardware": run_details.hardware,
+ "subsets": run_details.subsets,
+ "backends": run_details.backends,
+ "model": run_details.model,
+ "success": run_details.success,
+ "traceback": run_details.traceback,
+ "last_updated": run_details.last_updated,
+ "run_id": run_details.run_id,
+ "run_start_time": run_details.run_start_time,
+ }
+
+ def upload_run_details(self, run_details: BenchmarkRunDetails):
+ """Upload a single benchmark run details to the dashboard dataset."""
+ for attempt in range(MAX_RETRIES):
+ try:
+ # Reset first upload flag on each attempt
+ self._is_first_upload = False
+
+ # Load existing dataset
+ existing_dataset = self._load_existing_dataset()
+ if existing_dataset is None and not self._is_first_upload:
+ # Failed to load for reasons other than being first upload
+ if attempt < MAX_RETRIES - 1:
+ time.sleep(RETRY_DELAY)
+ continue
+ else:
+ logger.error(
+ "Max retries reached. Failed to upload run details."
+ )
+ return
+
+ # Get existing data or empty list for first upload
+ existing_data = existing_dataset.to_list() if existing_dataset else []
+
+ # Convert the new run details to a dictionary
+ new_run = self._convert_to_dict(run_details)
+
+ # Combine existing data with new run
+ combined_data = existing_data + [new_run]
+
+ # Create new dataset
+ dataset = Dataset.from_list(combined_data)
+
+ # Verify commit hasn't changed (skipped for first upload)
+ if not self._verify_commit():
+ if attempt < MAX_RETRIES - 1:
+ time.sleep(RETRY_DELAY)
+ continue
+ else:
+ logger.error(
+ "Max retries reached. Failed to upload run details."
+ )
+ return
+
+ # Push to hub
+ dataset.push_to_hub(repo_id=DASHBOARD_REPO_ID, split="train")
+ logger.info(
+ f"Successfully uploaded run details for {run_details.run_id} to dashboard"
+ )
+ break
+
+ except Exception as e:
+ logger.error(
+ f"Failed to upload run details (attempt {attempt + 1}/{MAX_RETRIES}): {str(e)}"
+ )
+ if attempt < MAX_RETRIES - 1:
+ time.sleep(RETRY_DELAY)
+ continue
+ break
+
+ def upload_multiple_run_details(self, run_details_list: List[BenchmarkRunDetails]):
+ """Upload multiple benchmark run details to the dashboard dataset."""
+ for attempt in range(MAX_RETRIES):
+ try:
+ # Load existing dataset
+ existing_dataset = self._load_existing_dataset()
+ if existing_dataset is None:
+ existing_data = []
+ else:
+ existing_data = existing_dataset.to_list()
+
+ # Convert all new run details to dictionaries
+ new_runs = [self._convert_to_dict(rd) for rd in run_details_list]
+
+ # Combine existing data with new runs
+ combined_data = existing_data + new_runs
+
+ # Create new dataset
+ dataset = Dataset.from_list(combined_data)
+
+ # Verify commit hasn't changed
+ if not self._verify_commit():
+ if attempt < MAX_RETRIES - 1:
+ time.sleep(RETRY_DELAY)
+ continue
+ else:
+ logger.error(
+ "Max retries reached. Failed to upload run details."
+ )
+ return
+
+ # Push to hub
+ dataset.push_to_hub(repo_id=DASHBOARD_REPO_ID, split="train")
+ logger.info(
+ f"Successfully uploaded {len(run_details_list)} run details to dashboard"
+ )
+ break
+
+ except Exception as e:
+ logger.error(
+ f"Failed to upload run details (attempt {attempt + 1}/{MAX_RETRIES}): {str(e)}"
+ )
+ if attempt < MAX_RETRIES - 1:
+ time.sleep(RETRY_DELAY)
+ continue
+ break
+
+ def get_latest_runs(
+ self,
+ machine: Optional[str] = None,
+ hardware: Optional[str] = None,
+ model: Optional[str] = None,
+ limit: int = 100,
+ ) -> pd.DataFrame:
+ """
+ Retrieve the latest benchmark runs from the dashboard dataset.
+
+ Args:
+ machine: Filter by machine name
+ hardware: Filter by hardware type
+ model: Filter by model name
+ limit: Maximum number of runs to return
+
+ Returns:
+ DataFrame containing the latest runs
+ """
+ try:
+ # Load the dataset
+ dataset = load_dataset(DASHBOARD_REPO_ID, split="train")
+ if not isinstance(dataset, Dataset):
+ logger.error("Failed to load dataset: not a Dataset instance")
+ return pd.DataFrame()
+
+ # Convert to pandas DataFrame using dictionary
+ data_dict = {col: dataset[col] for col in dataset.column_names}
+ df = pd.DataFrame(data_dict)
+
+ # Apply filters
+ if machine:
+ df = df[df["machine"] == machine]
+ if hardware:
+ df = df[df["hardware"] == hardware]
+ if model:
+ df = df[df["model"] == model]
+
+ # Sort by last_updated and take the most recent runs
+ df["last_updated"] = pd.to_datetime(df["last_updated"])
+ df = df.sort_values("last_updated", ascending=False).head(limit)
+
+ return df
+
+ except Exception as e:
+ logger.error(f"Failed to retrieve latest runs: {str(e)}")
+ return pd.DataFrame()
diff --git a/llm_perf/common/get_top_model_from_hub.py b/llm_perf/common/get_top_model_from_hub.py
index 79feb64..ba337f0 100644
--- a/llm_perf/common/get_top_model_from_hub.py
+++ b/llm_perf/common/get_top_model_from_hub.py
@@ -5,6 +5,7 @@
import requests
from datasets import Dataset
+from loguru import logger
def get_top_text_generation_models(
@@ -42,7 +43,7 @@ def get_top_text_generation_models(
def save_to_json(data: List[Dict], filename: str):
with open(filename, "w", encoding="utf-8") as f:
json.dump(data, f, indent=2, ensure_ascii=False)
- print(f"Data saved to {filename}")
+ logger.info(f"Data saved to {filename}")
def compute_org_downloads(models: List[Dict]) -> Dict[str, int]:
@@ -55,7 +56,7 @@ def compute_org_downloads(models: List[Dict]) -> Dict[str, int]:
def upload_to_hf_dataset(data: List[Dict], dataset_name: str):
dataset = Dataset.from_list(data)
dataset.push_to_hub(dataset_name)
- print(f"Data uploaded to Hugging Face dataset: {dataset_name}")
+ logger.info(f"Data uploaded to Hugging Face dataset: {dataset_name}")
def main():
@@ -64,16 +65,16 @@ def main():
if huggingface_token:
os.environ["HUGGINGFACE_HUB_TOKEN"] = huggingface_token
else:
- print(
+ logger.warning(
"Warning: HUGGINGFACE_TOKEN not found in environment variables. Running without authentication."
)
n = 100
top_models = get_top_text_generation_models(n)
- print(f"\nTop {n} text generation models on Hugging Face Hub:")
+ logger.info(f"\nTop {n} text generation models on Hugging Face Hub:")
for i, model in enumerate(top_models, 1):
- print(
+ logger.info(
f"{i}. {model['organization']}/{model['model_name']}: {model['downloads']:,} downloads"
)
@@ -82,11 +83,11 @@ def main():
upload_to_hf_dataset(top_models, dataset_name)
# Display top 10 organizations by downloads
- print("\nTop 10 organizations by total downloads:")
+ logger.info("\nTop 10 organizations by total downloads:")
org_downloads = compute_org_downloads(top_models)
sorted_orgs = sorted(org_downloads.items(), key=lambda x: x[1], reverse=True)[:10]
for i, (org, downloads) in enumerate(sorted_orgs, 1):
- print(f"{i}. {org}: {downloads:,} downloads")
+ logger.info(f"{i}. {org}: {downloads:,} downloads")
if __name__ == "__main__":
diff --git a/llm_perf/common/memory_utils.py b/llm_perf/common/memory_utils.py
new file mode 100644
index 0000000..a9f9f90
--- /dev/null
+++ b/llm_perf/common/memory_utils.py
@@ -0,0 +1,195 @@
+import os
+import psutil
+import gc
+from typing import Dict, Optional
+from loguru import logger
+
+try:
+ import torch
+
+ TORCH_AVAILABLE = True
+except ImportError:
+ TORCH_AVAILABLE = False
+
+# Memory thresholds in MB
+MEMORY_THRESHOLDS = {
+ "cpu_rss": 8192, # 8GB
+ "cpu_percent": 90, # 90%
+ "gpu_allocated": 8192, # 8GB
+}
+
+
+class MemoryTracker:
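+ """Tracks CPU and GPU memory around benchmark runs, warning when usage crosses
+ MEMORY_THRESHOLDS or keeps growing across consecutive benchmarks."""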
+ def __init__(self):
+ self.initial_memory: Dict = {}
+ self.peak_memory: Dict = {"cpu_rss": 0, "cpu_percent": 0, "gpu_allocated": 0}
+ self.consecutive_increases = 0
+ self.last_memory: Optional[Dict] = None
+ self.before_memory: Optional[Dict] = None # Store memory state before benchmark
+
+ def get_gpu_memory_info(self):
+ """Get GPU memory usage if CUDA is available"""
+ if not TORCH_AVAILABLE or not torch.cuda.is_available():
+ return None
+
+ try:
+ gpu_memory = []
+ for i in range(torch.cuda.device_count()):
+ allocated = torch.cuda.memory_allocated(i) / (1024 * 1024) # MB
+ reserved = torch.cuda.memory_reserved(i) / (1024 * 1024) # MB
+ gpu_memory.append(
+ {"device": i, "allocated": allocated, "reserved": reserved}
+ )
+ return gpu_memory
+ except Exception as e:
+ logger.warning(f"Failed to get GPU memory info: {e}")
+ return None
+
+ def get_cpu_memory_info(self):
+ """Get CPU memory usage"""
+ try:
+ process = psutil.Process(os.getpid())
+ memory_info = process.memory_info()
+ return {
+ "rss": memory_info.rss / (1024 * 1024), # MB
+ "vms": memory_info.vms / (1024 * 1024), # MB
+ "percent": process.memory_percent(),
+ }
+ except Exception as e:
+ logger.warning(f"Failed to get CPU memory info: {e}")
+ return None
+
+ def check_thresholds(self, cpu_info: Optional[Dict], gpu_info: Optional[list]):
+ """Check if memory usage exceeds thresholds"""
+ if cpu_info:
+ if cpu_info["rss"] > MEMORY_THRESHOLDS["cpu_rss"]:
+ logger.warning(
+ f"CPU RSS memory ({cpu_info['rss']:.0f}MB) exceeds threshold ({MEMORY_THRESHOLDS['cpu_rss']}MB)"
+ )
+ if cpu_info["percent"] > MEMORY_THRESHOLDS["cpu_percent"]:
+ logger.warning(
+ f"CPU usage ({cpu_info['percent']:.1f}%) exceeds threshold ({MEMORY_THRESHOLDS['cpu_percent']}%)"
+ )
+
+ if gpu_info:
+ for device in gpu_info:
+ if device["allocated"] > MEMORY_THRESHOLDS["gpu_allocated"]:
+ logger.warning(
+ f"GPU {device['device']} allocated memory ({device['allocated']:.0f}MB) "
+ f"exceeds threshold ({MEMORY_THRESHOLDS['gpu_allocated']}MB)"
+ )
+
+ def check_persistent_growth(
+ self, cpu_info: Optional[Dict], gpu_info: Optional[list]
+ ):
+ """Monitor for persistent memory growth"""
+ if not cpu_info:
+ return
+
+ current_memory = {
+ "cpu_rss": cpu_info["rss"],
+ "cpu_percent": cpu_info["percent"],
+ "gpu_allocated": gpu_info[0]["allocated"] if gpu_info else 0,
+ }
+
+ # Update peak memory
+ for key in self.peak_memory:
+ self.peak_memory[key] = max(self.peak_memory[key], current_memory[key])
+
+ # Check for persistent growth
+ if self.last_memory:
+ # Only compare metrics that were non-zero last time (e.g. GPU memory is always
+ # 0 on CPU-only hosts and would otherwise prevent growth detection entirely)
+ is_increasing = all(
+ current_memory[key]
+ > self.last_memory[key] * 1.05 # 5% increase threshold
+ for key in current_memory
+ if self.last_memory[key] > 0
+ )
+
+ if is_increasing:
+ self.consecutive_increases += 1
+ if (
+ self.consecutive_increases >= 3
+ ): # Alert after 3 consecutive increases
+ logger.warning(
+ "Detected persistent memory growth over last 3 benchmarks:\n"
+ f"Initial: CPU RSS={self.initial_memory.get('cpu_rss', 0):.0f}MB\n"
+ f"Current: CPU RSS={current_memory['cpu_rss']:.0f}MB\n"
+ f"Peak: CPU RSS={self.peak_memory['cpu_rss']:.0f}MB"
+ )
+ else:
+ self.consecutive_increases = 0
+
+ # Store current memory for next comparison
+ self.last_memory = current_memory
+
+ # Store initial memory on first run
+ if not self.initial_memory:
+ self.initial_memory = current_memory
+
+ def log_memory_usage(self, phase: str = "current"):
+ """Log current memory usage for both CPU and GPU"""
+ # Force garbage collection
+ gc.collect()
+ if TORCH_AVAILABLE and torch.cuda.is_available():
+ torch.cuda.empty_cache()
+
+ # Get memory info
+ cpu_info = self.get_cpu_memory_info()
+ gpu_info = self.get_gpu_memory_info()
+
+ # Check thresholds and persistent growth
+ self.check_thresholds(cpu_info, gpu_info)
+ self.check_persistent_growth(cpu_info, gpu_info)
+
+ # Store before memory state
+ if phase == "before":
+ self.before_memory = {"cpu": cpu_info, "gpu": gpu_info}
+ prefix = "Before benchmark -"
+ elif phase == "after" and self.before_memory:
+ prefix = "After benchmark -"
+ else:
+ prefix = "Current -"
+
+ # Log CPU memory
+ if cpu_info:
+ cpu_msg = f"{prefix} CPU Memory - RSS: {cpu_info['rss']:.2f}MB, VMS: {cpu_info['vms']:.2f}MB, Percent: {cpu_info['percent']:.1f}%"
+
+ # Add delta if we're in after phase
+ if phase == "after" and self.before_memory and self.before_memory["cpu"]:
+ before_cpu = self.before_memory["cpu"]
+ cpu_msg += f" (Δ RSS: {cpu_info['rss'] - before_cpu['rss']:+.2f}MB, Δ VMS: {cpu_info['vms'] - before_cpu['vms']:+.2f}MB, Δ %: {cpu_info['percent'] - before_cpu['percent']:+.1f})"
+
+ logger.info(cpu_msg)
+
+ # Log GPU memory if available
+ if gpu_info:
+ for device in gpu_info:
+ gpu_msg = f"{prefix} GPU {device['device']} Memory - Allocated: {device['allocated']:.2f}MB, Reserved: {device['reserved']:.2f}MB"
+
+ # Add delta if we're in after phase
+ if (
+ phase == "after"
+ and self.before_memory
+ and self.before_memory["gpu"]
+ ):
+ before_gpu = next(
+ (
+ g
+ for g in self.before_memory["gpu"]
+ if g["device"] == device["device"]
+ ),
+ None,
+ )
+ if before_gpu:
+ gpu_msg += f" (Δ Allocated: {device['allocated'] - before_gpu['allocated']:+.2f}MB, Δ Reserved: {device['reserved'] - before_gpu['reserved']:+.2f}MB)"
+
+ logger.info(gpu_msg)
+
+
+# Create a global memory tracker instance
+memory_tracker = MemoryTracker()
+
+
+# Function to use in other modules
+def log_memory_usage(phase: str = "current"):
+ """Global function to log memory usage"""
+ memory_tracker.log_memory_usage(phase)
diff --git a/llm_perf/common/utils.py b/llm_perf/common/utils.py
index bf67fe9..4f90947 100644
--- a/llm_perf/common/utils.py
+++ b/llm_perf/common/utils.py
@@ -1,6 +1,7 @@
import pandas as pd
from llm_perf.common.dependency import get_benchmark_top_n, is_debug_mode
+from loguru import logger
INPUT_SHAPES = {"batch_size": 1, "sequence_length": 256}
GENERATE_KWARGS = {"max_new_tokens": 64, "min_new_tokens": 64}
@@ -47,7 +48,7 @@ def get_top_llm_list(n: int = 10) -> list[str]:
return top_models
except Exception as e:
- print(f"Error fetching top LLM list: {e}")
+ logger.error(f"Error fetching top LLM list: {e}")
return []
@@ -55,6 +56,3 @@ def get_top_llm_list(n: int = 10) -> list[str]:
CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["bigscience/bloomz-560m"]
else:
CANONICAL_PRETRAINED_OPEN_LLM_LIST = get_top_llm_list(n=get_benchmark_top_n())
- print(
- f"Benchamrking the following {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models: {CANONICAL_PRETRAINED_OPEN_LLM_LIST}"
- )
diff --git a/llm_perf/dashboard_app.py b/llm_perf/dashboard_app.py
new file mode 100644
index 0000000..eba6808
--- /dev/null
+++ b/llm_perf/dashboard_app.py
@@ -0,0 +1,216 @@
+import gradio as gr
+import pandas as pd
+from datetime import datetime, timedelta
+import plotly.express as px
+from plotly.graph_objs._figure import Figure
+from loguru import logger
+from typing import Optional, Tuple, List
+
+from llm_perf.common.dashboard_manager import DashboardManager
+
+
+def create_status_plot(df: pd.DataFrame) -> Optional[Figure]:
+ """Create a status plot showing success/failure over time."""
+ if df.empty:
+ return None
+
+ # Ensure last_updated is datetime
+ df["last_updated"] = pd.to_datetime(df["last_updated"])
+ df["success_str"] = df["success"].map({True: "Success", False: "Failure"})
+
+ # Create hover text with more details
+ df["hover_text"] = df.apply(
+ lambda row: f"Model: {row['model']}
"
+ + f"Hardware: {row['hardware']}
"
+ + f"Machine: {row['machine']}
"
+ + f"Status: {row['success_str']}
"
+ + f"Time: {row['last_updated'].strftime('%Y-%m-%d %H:%M:%S')}",
+ axis=1,
+ )
+
+ fig = px.scatter(
+ df,
+ x="last_updated",
+ y="model",
+ color="success_str",
+ title="Benchmark Status Over Time",
+ labels={"last_updated": "Time", "model": "Model", "success_str": "Status"},
+ hover_data=["hover_text"],
+ height=600,
+ ) # Make plot taller to accommodate more models
+
+ # Update layout for better readability
+ fig.update_layout(
+ xaxis_title="Time",
+ yaxis_title="Model",
+ showlegend=True,
+ legend_title="Status",
+ hovermode="closest",
+ )
+
+ return fig
+
+
+def create_hardware_stats(df: pd.DataFrame) -> Optional[Figure]:
+ """Create statistics about hardware usage."""
+ if df.empty:
+ return None
+
+ stats = (
+ df.groupby(["hardware", "machine"])["success"]
+ .agg(["count", "mean"])
+ .reset_index()
+ )
+ # Calculate success rate as percentage
+ stats["success_rate"] = (stats["mean"] * 100).round(2)
+ # Drop the mean column since we've converted it to success_rate
+ stats = stats.drop("mean", axis=1)
+ stats = stats.rename(columns={"count": "total_runs"})
+
+ fig = px.bar(
+ stats,
+ x="hardware",
+ y="total_runs",
+ color="success_rate",
+ title="Hardware Usage and Success Rate",
+ labels={
+ "hardware": "Hardware Type",
+ "total_runs": "Total Runs",
+ "success_rate": "Success Rate (%)",
+ },
+ )
+ return fig
+
+
+class DashboardApp:
+ def __init__(self):
+ self.dashboard_manager = DashboardManager()
+
+ def refresh_data(
+ self,
+ time_range: str,
+ machine: str = "All",
+ hardware: str = "All",
+ model: str = "All",
+ ) -> Tuple[Optional[Figure], Optional[Figure], Optional[List[List[str]]]]:
+ """
+ Refresh dashboard data based on filters.
+
+ Args:
+ time_range: Time range to filter (e.g., '1d', '7d', '30d', 'all')
+ machine: Machine name filter
+ hardware: Hardware type filter
+ model: Model name filter
+
+ Returns:
+ Tuple of (status plot, hardware stats plot, data table)
+ """
+ try:
+ # Get the data
+ df = self.dashboard_manager.get_latest_runs(
+ machine=machine if machine != "All" else None,
+ hardware=hardware if hardware != "All" else None,
+ model=model if model != "All" else None,
+ )
+
+ if df.empty:
+ return None, None, None
+
+ # Apply time range filter
+ if time_range != "all":
+ days = int(time_range[:-1])
+ cutoff = datetime.now() - timedelta(days=days)
+ df = df[df["last_updated"] >= cutoff]
+
+ # Create visualizations
+ status_plot = create_status_plot(df)
+ hardware_plot = create_hardware_stats(df)
+
+ # Prepare table data
+ table_df = df[
+ ["model", "hardware", "machine", "success", "last_updated"]
+ ].copy()
+ table_df["last_updated"] = table_df["last_updated"].dt.strftime(
+ "%Y-%m-%d %H:%M:%S"
+ )
+ table_data: List[List[str]] = [
+ [str(val) for val in row] for row in table_df.values.tolist()
+ ]
+
+ return status_plot, hardware_plot, table_data
+
+ except Exception as e:
+ logger.error(f"Error refreshing dashboard data: {str(e)}")
+ return None, None, None
+
+ def launch(self, port: int = 7860, share: bool = False):
+ """Launch the Gradio interface.
+
+ Args:
+ port: Port to run the dashboard on
+ share: Whether to create a public URL
+ """
+ with gr.Blocks(title="LLM Performance Dashboard") as interface:
+ gr.Markdown("# 🚀 LLM Performance Dashboard")
+ gr.Markdown(
+ "Monitor the status and performance of LLM benchmarks across different hardware configurations."
+ )
+
+ with gr.Row():
+ time_range = gr.Dropdown(
+ choices=["1d", "7d", "30d", "all"], value="7d", label="Time Range"
+ )
+ machine = gr.Dropdown(
+ choices=["All"], # Will be populated dynamically
+ value="All",
+ label="Machine",
+ )
+ hardware = gr.Dropdown(
+ choices=["All"], # Will be populated dynamically
+ value="All",
+ label="Hardware",
+ )
+ model = gr.Dropdown(
+ choices=["All"], # Will be populated dynamically
+ value="All",
+ label="Model",
+ )
+ refresh_btn = gr.Button("🔄 Refresh")
+
+ with gr.Row():
+ status_plot = gr.Plot(label="Benchmark Status")
+ hardware_plot = gr.Plot(label="Hardware Statistics")
+
+ with gr.Row():
+ results_table = gr.Dataframe(
+ headers=["Model", "Hardware", "Machine", "Success", "Last Updated"],
+ label="Recent Benchmark Results",
+ )
+
+ # Update function
+ def update_dashboard(
+ time_range: str, machine: str, hardware: str, model: str
+ ):
+ return self.refresh_data(time_range, machine, hardware, model)
+
+ # Register update function
+ refresh_btn.click(
+ fn=update_dashboard,
+ inputs=[time_range, machine, hardware, model],
+ outputs=[status_plot, hardware_plot, results_table],
+ )
+
+ # Auto-refresh on load
+ interface.load(
+ fn=update_dashboard,
+ inputs=[time_range, machine, hardware, model],
+ outputs=[status_plot, hardware_plot, results_table],
+ )
+
+ # Launch the interface with specified parameters
+ interface.launch(server_port=port, share=share)
+
+
+if __name__ == "__main__":
+ app = DashboardApp()
+ app.launch()
diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py
index 0dd368b..90e8552 100644
--- a/llm_perf/update_llm_perf_leaderboard.py
+++ b/llm_perf/update_llm_perf_leaderboard.py
@@ -8,9 +8,9 @@
import json
from llm_perf.common.hardware_config import load_hardware_configs
-from huggingface_hub.utils import disable_progress_bars
+from loguru import logger
-disable_progress_bars()
+os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
REPO_TYPE = "dataset"
MAIN_REPO_ID = "optimum-benchmark/llm-perf-leaderboard"
@@ -93,7 +93,7 @@ def gather_benchmarks(subset: str, machine: str, backend: str, hardware: str):
path_in_repo=perf_df,
path_or_fileobj=perf_df,
)
- print(f"Uploaded {perf_df} to {MAIN_REPO_ID}")
+ logger.info(f"Uploaded {perf_df} to {MAIN_REPO_ID}")
# def check_if_url_exists(url: str):
@@ -101,7 +101,7 @@ def gather_benchmarks(subset: str, machine: str, backend: str, hardware: str):
# Check if a URL exists
# """
# repo_exists
-# print(f"response: {response}")
+# logger.info(f"response: {response}")
# return response.status_code == 200
@@ -123,31 +123,33 @@ def update_perf_dfs():
hardware_config.hardware,
)
except Exception:
- print("Dataset not found for:")
- print(f" • Backend: {backend}")
- print(f" • Subset: {subset}")
- print(f" • Machine: {hardware_config.machine}")
- print(f" • Hardware Type: {hardware_config.hardware}")
+ logger.error("Dataset not found for:")
+ logger.error(f" • Backend: {backend}")
+ logger.error(f" • Subset: {subset}")
+ logger.error(f" • Machine: {hardware_config.machine}")
+ logger.error(f" • Hardware Type: {hardware_config.hardware}")
url = f"{PERF_REPO_ID.format(subset=subset, machine=hardware_config.machine, backend=backend, hardware=hardware_config.hardware)}"
does_exist = repo_exists(url, repo_type="dataset")
if does_exist:
- print(f"Dataset exists: {url} but could not be processed")
+ logger.error(
+ f"Dataset exists: {url} but could not be processed"
+ )
def update_llm_df():
"""
Scrape the open-llm-leaderboard and update the leaderboard dataframe
"""
-
+
scrapping_script = """
git clone https://github.com/Weyaxi/scrape-open-llm-leaderboard.git
pip install -r scrape-open-llm-leaderboard/requirements.txt -q
python scrape-open-llm-leaderboard/main.py
rm -rf scrape-open-llm-leaderboard
"""
-
+
subprocess.run(scrapping_script, shell=True)
create_repo(repo_id=MAIN_REPO_ID, repo_type=REPO_TYPE, exist_ok=True, private=False)
upload_file(
diff --git a/setup.py b/setup.py
index 7f6e115..ec61ad5 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,10 @@
"huggingface_hub[hf_transfer]",
"datasets>=2.14.6",
"beautifulsoup4",
+ "loguru",
"optimum-benchmark @ git+https://github.com/huggingface/optimum-benchmark.git",
+ "psutil",
+ "torch<2.6.0",
]
# Optional dependencies
@@ -28,11 +31,12 @@
"optimum-benchmark[openvino] @ git+https://github.com/huggingface/optimum-benchmark.git"
],
"cuda": [
- "flash-attn",
- "auto-gptq",
- "bitsandbytes",
- "autoawq",
- "torchao",
+ "optimum-benchmark[bitsandbytes,autoawq,auto-gptq] @ git+https://github.com/huggingface/optimum-benchmark.git",
+ ],
+ "dashboard": [
+ "gradio",
+ "pandas",
+ "plotly",
],
}