diff --git a/.github/workflows/benchmark_cpu_onnxruntime.yaml b/.github/workflows/benchmark_cpu_onnxruntime.yaml index 6d40a5c..999e975 100644 --- a/.github/workflows/benchmark_cpu_onnxruntime.yaml +++ b/.github/workflows/benchmark_cpu_onnxruntime.yaml @@ -2,8 +2,14 @@ name: Benchmark CPU Onnxruntime on: workflow_dispatch: + inputs: + rerun_already_conducted_benchmarks: + description: 'Rerun benchmarks that were already conducted' + required: true + type: boolean + default: false schedule: - - cron: "0 12 * * 3" + - cron: "0 12 * * *" pull_request: concurrency: @@ -16,15 +22,18 @@ env: jobs: run_benchmarks: if: >- - (github.event_name == 'push' && contains(github.event.head_commit.message, 'cpu_onnxruntime')) || - (github.event_name == 'push' && contains(github.event.head_commit.message, 'all_benchmarks')) || + (github.event_name == 'push' && ( + contains(github.event.head_commit.message, 'cpu_onnxruntime') || + contains(github.event.head_commit.message, 'all_benchmarks') + )) || (github.event_name == 'push' && github.ref == 'refs/heads/main') || github.event_name == 'workflow_dispatch' || (github.event_name == 'pull_request' && ( contains(github.event.pull_request.labels.*.name, 'leaderboard') || contains(github.event.pull_request.labels.*.name, 'cpu') || contains(github.event.pull_request.labels.*.name, 'onnxruntime') || - contains(github.event.pull_request.labels.*.name, 'cpu_onnxruntime') + contains(github.event.pull_request.labels.*.name, 'cpu_onnxruntime') || + contains(github.event.pull_request.labels.*.name, 'all_benchmarks') )) strategy: @@ -48,6 +57,7 @@ jobs: HF_TOKEN: ${{ secrets.HF_TOKEN }} DISABLE_WARNINGS: 1 BENCHMARK_TOP_N: 3 + RERUN_ALREADY_CONDUCTED_BENCHMARKS: ${{ github.event.inputs.rerun_already_conducted_benchmarks }} with: image: ${{ env.IMAGE }} options: | @@ -60,6 +70,7 @@ jobs: --env HF_HUB_ENABLE_HF_TRANSFER=1 --env DISABLE_WARNINGS --env BENCHMARK_TOP_N + --env RERUN_ALREADY_CONDUCTED_BENCHMARKS --volume ${{ github.workspace }}:/workspace --workdir /workspace run: | diff --git a/.github/workflows/benchmark_cpu_openvino.yaml b/.github/workflows/benchmark_cpu_openvino.yaml index 9e4c70f..1dab78b 100644 --- a/.github/workflows/benchmark_cpu_openvino.yaml +++ b/.github/workflows/benchmark_cpu_openvino.yaml @@ -2,8 +2,14 @@ name: Benchmark CPU OpenVINO on: workflow_dispatch: + inputs: + rerun_already_conducted_benchmarks: + description: 'Rerun benchmarks that were already conducted' + required: true + type: boolean + default: false schedule: - - cron: "0 0 * * *" + - cron: "0 6 * * *" pull_request: concurrency: @@ -16,15 +22,18 @@ env: jobs: run_benchmarks: if: >- - (github.event_name == 'push' && contains(github.event.head_commit.message, 'cpu_openvino')) || - (github.event_name == 'push' && contains(github.event.head_commit.message, 'all_benchmarks')) || + (github.event_name == 'push' && ( + contains(github.event.head_commit.message, 'cpu_openvino') || + contains(github.event.head_commit.message, 'all_benchmarks') + )) || (github.event_name == 'push' && github.ref == 'refs/heads/main') || github.event_name == 'workflow_dispatch' || (github.event_name == 'pull_request' && ( contains(github.event.pull_request.labels.*.name, 'leaderboard') || contains(github.event.pull_request.labels.*.name, 'cpu') || contains(github.event.pull_request.labels.*.name, 'openvino') || - contains(github.event.pull_request.labels.*.name, 'cpu_openvino') + contains(github.event.pull_request.labels.*.name, 'cpu_openvino') || + contains(github.event.pull_request.labels.*.name, 
'all_benchmarks') )) strategy: @@ -48,6 +57,7 @@ jobs: HF_TOKEN: ${{ secrets.HF_TOKEN }} DISABLE_WARNINGS: 1 BENCHMARK_TOP_N: 50 + RERUN_ALREADY_CONDUCTED_BENCHMARKS: ${{ github.event.inputs.rerun_already_conducted_benchmarks }} with: image: ${{ env.IMAGE }} options: | @@ -60,6 +70,7 @@ jobs: --env HF_HUB_ENABLE_HF_TRANSFER=1 --env DISABLE_WARNINGS --env BENCHMARK_TOP_N + --env RERUN_ALREADY_CONDUCTED_BENCHMARKS --volume ${{ github.workspace }}:/workspace --workdir /workspace run: | diff --git a/.github/workflows/benchmark_cpu_pytorch.yaml b/.github/workflows/benchmark_cpu_pytorch.yaml index d0ca50a..a287606 100644 --- a/.github/workflows/benchmark_cpu_pytorch.yaml +++ b/.github/workflows/benchmark_cpu_pytorch.yaml @@ -2,6 +2,12 @@ name: Benchmark CPU PyTorch on: workflow_dispatch: + inputs: + rerun_already_conducted_benchmarks: + description: 'Rerun benchmarks that were already conducted' + required: true + type: boolean + default: false schedule: - cron: "0 0 * * *" pull_request: @@ -16,15 +22,18 @@ env: jobs: run_benchmarks: if: >- - (github.event_name == 'push' && contains(github.event.head_commit.message, 'cpu_pytorch')) || - (github.event_name == 'push' && contains(github.event.head_commit.message, 'all_benchmarks')) || + (github.event_name == 'push' && ( + contains(github.event.head_commit.message, 'cpu_pytorch') || + contains(github.event.head_commit.message, 'all_benchmarks') + )) || (github.event_name == 'push' && github.ref == 'refs/heads/main') || github.event_name == 'workflow_dispatch' || (github.event_name == 'pull_request' && ( contains(github.event.pull_request.labels.*.name, 'leaderboard') || contains(github.event.pull_request.labels.*.name, 'cpu') || contains(github.event.pull_request.labels.*.name, 'pytorch') || - contains(github.event.pull_request.labels.*.name, 'cpu_pytorch') + contains(github.event.pull_request.labels.*.name, 'cpu_pytorch') || + contains(github.event.pull_request.labels.*.name, 'all_benchmarks') )) strategy: @@ -47,6 +56,7 @@ jobs: MACHINE: ${{ matrix.machine.name }} HF_TOKEN: ${{ secrets.HF_TOKEN }} BENCHMARK_TOP_N: 50 + RERUN_ALREADY_CONDUCTED_BENCHMARKS: ${{ github.event.inputs.rerun_already_conducted_benchmarks }} with: image: ${{ env.IMAGE }} options: | @@ -58,6 +68,7 @@ jobs: --env MKL_THREADING_LAYER=GNU --env HF_HUB_ENABLE_HF_TRANSFER=1 --env BENCHMARK_TOP_N + --env RERUN_ALREADY_CONDUCTED_BENCHMARKS --volume ${{ github.workspace }}:/workspace --workdir /workspace run: | diff --git a/.github/workflows/benchmark_cuda_pytorch.yaml b/.github/workflows/benchmark_cuda_pytorch.yaml index d87204b..d04f8d0 100644 --- a/.github/workflows/benchmark_cuda_pytorch.yaml +++ b/.github/workflows/benchmark_cuda_pytorch.yaml @@ -2,8 +2,14 @@ name: Benchmark CUDA PyTorch on: workflow_dispatch: + inputs: + rerun_already_conducted_benchmarks: + description: 'Rerun benchmarks that were already conducted' + required: true + type: boolean + default: false schedule: - - cron: "0 3 * * 0" + - cron: "0 0 * * *" pull_request: concurrency: @@ -16,15 +22,18 @@ env: jobs: run_benchmarks: if: >- - (github.event_name == 'push' && contains(github.event.head_commit.message, 'cuda_pytorch')) || - (github.event_name == 'push' && contains(github.event.head_commit.message, 'all_benchmarks')) || + (github.event_name == 'push' && ( + contains(github.event.head_commit.message, 'cuda_pytorch') || + contains(github.event.head_commit.message, 'all_benchmarks') + )) || (github.event_name == 'push' && github.ref == 'refs/heads/main') || github.event_name == 'workflow_dispatch' || 
(github.event_name == 'pull_request' && ( contains(github.event.pull_request.labels.*.name, 'leaderboard') || contains(github.event.pull_request.labels.*.name, 'cuda') || contains(github.event.pull_request.labels.*.name, 'pytorch') || - contains(github.event.pull_request.labels.*.name, 'cuda_pytorch') + contains(github.event.pull_request.labels.*.name, 'cuda_pytorch') || + contains(github.event.pull_request.labels.*.name, 'all_benchmarks') )) strategy: @@ -51,6 +60,7 @@ jobs: MACHINE: ${{ matrix.machine.name }} HF_TOKEN: ${{ secrets.HF_TOKEN }} BENCHMARK_TOP_N: 50 + RERUN_ALREADY_CONDUCTED_BENCHMARKS: ${{ github.event.inputs.rerun_already_conducted_benchmarks }} with: image: ${{ env.IMAGE }} options: | @@ -63,6 +73,7 @@ jobs: --env MKL_THREADING_LAYER=GNU --env HF_HUB_ENABLE_HF_TRANSFER=1 --env BENCHMARK_TOP_N + --env RERUN_ALREADY_CONDUCTED_BENCHMARKS --volume ${{ github.workspace }}:/workspace --workdir /workspace run: | diff --git a/.gitignore b/.gitignore index 3fcc481..6800897 100644 --- a/.gitignore +++ b/.gitignore @@ -191,3 +191,4 @@ optimum-benchmark/ *.egg-info/ data/ +load_model_codecarbon.json \ No newline at end of file diff --git a/Makefile b/Makefile index dc0ad56..f9ed5f1 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,11 @@ +# Load environment variables +ifneq (,$(wildcard .env)) + include .env + export +endif + # Style and Quality checks -.PHONY: style quality install install-dev run_cpu_container run_cuda_container run_rocm_container cpu-pytorch-container cpu-openvino-container collector-container +.PHONY: style quality run_cpu_container run_cuda_container run_rocm_container cpu-pytorch-container cpu-openvino-container collector-container help quality: ruff check . @@ -9,31 +15,46 @@ style: ruff format . ruff check --fix . -install: - pip install . - -install-dev: - DEBUG=1 uv pip install -e . - # Running optimum-benchmark containers -run_cpu_container: +run-optimum-benchmark-cpu-container: docker run -it --rm --pid host --volume .:/llm-perf-backend --workdir /llm-perf-backend ghcr.io/huggingface/optimum-benchmark:latest-cpu -run_cuda_container: +run-optimum-benchmark-cuda-container: docker run -it --rm --pid host --gpus all --shm-size 64G --volume .:/llm-perf-backend --workdir /llm-perf-backend ghcr.io/huggingface/optimum-benchmark:latest-cuda -run_rocm_container: +run-optimum-benchmark-rocm-container: docker run -it --rm --shm-size 64G --device /dev/kfd --device /dev/dri --volume .:/llm-perf-backend --workdir /llm-perf-backend ghcr.io/huggingface/optimum-benchmark:latest-rocm -# Running llm-perf backend containers -cpu-pytorch-container: - docker build -t cpu-pytorch -f docker/cpu-pytorch/Dockerfile . - docker run -it --rm --pid host cpu-pytorch - -cpu-openvino-container: - docker build -t cpu-openvino -f docker/cpu-openvino/Dockerfile . - docker run -it --rm --pid host cpu-openvino +# Running llm-perf-leaderboard benchmarks +run-llm-perf-benchmark-cpu-pytorch: + docker build -t llm-perf-backend-cpu-pytorch -f docker/cpu-pytorch/Dockerfile . + docker run -it --rm --pid host llm-perf-backend-cpu-pytorch + +run-llm-perf-benchmark-cpu-openvino: + docker build -t llm-perf-backend-cpu-openvino -f docker/cpu-openvino/Dockerfile . + docker run -it --rm --pid host llm-perf-backend-cpu-openvino + +run-llm-perf-benchmark-cuda-pytorch: + docker build -t llm-perf-backend-cuda-pytorch -f docker/gpu-cuda/Dockerfile . 
+ docker run -it --rm --pid host --gpus all --shm-size 64G --volume .:/llm-perf-backend --workdir /llm-perf-backend llm-perf-backend-cuda-pytorch + +run-llm-perf-benchmark-collector: + docker build -t llm-perf-backend-collector -f docker/collector/Dockerfile . + docker run -it --rm --pid host llm-perf-backend-collector + +help: + @echo "Commands:" + @echo " style - Format code and fix style issues" + @echo " quality - Run style checks without fixing" + @echo "" + @echo "Optimum Benchmark Containers:" + @echo " run-optimum-benchmark-cpu-container - Run CPU container" + @echo " run-optimum-benchmark-cuda-container - Run CUDA container" + @echo " run-optimum-benchmark-rocm-container - Run ROCm container" + @echo "" + @echo "LLM Performance Backend Containers:" + @echo " run-llm-perf-benchmark-cpu-pytorch - Run the llm-perf-leaderboard Benchmark CPU PyTorch" + @echo " run-llm-perf-benchmark-cpu-openvino - Run the llm-perf-leaderboard Benchmark CPU OpenVINO" + @echo " run-llm-perf-benchmark-cuda-pytorch - Run the llm-perf-leaderboard Benchmark CUDA PyTorch" + @echo " run-llm-perf-benchmark-collector - Run the llm-perf-leaderboard Collector container" -collector-container: - docker build -t collector -f docker/collector/Dockerfile . - docker run -it --rm --pid host collector diff --git a/README.md b/README.md index 7c69305..d25bd97 100644 --- a/README.md +++ b/README.md @@ -23,8 +23,7 @@ LLM-perf Backend is designed to: ## Installation 🛠️ -1. Clone the repository: -```bash +1. Clone the repository: +```bash git clone https://github.com/huggingface/llm-perf-backend cd llm-perf-backend ``` @@ -53,7 +52,6 @@ llm-perf run-benchmark --hardware cpu --backend pytorch ``` ### Configuration Options - View all the options with ```bash llm-perf run-benchmark --help @@ -62,6 +60,18 @@ llm-perf run-benchmark --help - `--hardware`: Target hardware platform (cpu, cuda) - `--backend`: Backend framework to use (pytorch, onnxruntime, etc.) +### (Optional) Running Benchmarks via Docker + +You can run the benchmarks using the following make commands: + +```bash +# CPU Benchmarks +make run-llm-perf-benchmark-cpu-pytorch # Run PyTorch CPU benchmark +make run-llm-perf-benchmark-cpu-openvino # Run OpenVINO CPU benchmark + +# GPU Benchmarks +make run-llm-perf-benchmark-cuda-pytorch # Run PyTorch CUDA benchmark +``` + ## Benchmark Dataset 📊 Results are published to the official dataset: @@ -75,4 +85,5 @@ All benchmarks follow these standardized settings: - Memory tracking: - Maximum allocated memory - Maximum reserved memory - - Maximum used memory (via PyNVML for GPU) \ No newline at end of file + - Maximum used memory (via PyNVML for GPU) + diff --git a/dashboard/main.py b/dashboard/main.py new file mode 100644 index 0000000..df3fd88 --- /dev/null +++ b/dashboard/main.py @@ -0,0 +1,4 @@ +# -> need to view on the individual runs to get details +# -> get stats about the latest runs for all the hardware.yml +# -> get stats on the latest github actions +# -> get the stats on the top 50 models diff --git a/dashboard/requirements.txt b/dashboard/requirements.txt new file mode 100644 index 0000000..e69de29 diff --git a/docker/cpu-onnxruntime/Dockerfile b/docker/cpu-onnxruntime/Dockerfile new file mode 100644 index 0000000..03ec92d --- /dev/null +++ b/docker/cpu-onnxruntime/Dockerfile @@ -0,0 +1,11 @@ +FROM ghcr.io/huggingface/optimum-benchmark:latest-cpu + +WORKDIR /workspace + +COPY setup.py . + +RUN pip install -e .[onnxruntime] + +COPY . .
+ +ENTRYPOINT ["llm-perf", "run-benchmark", "--hardware", "cpu", "--backend", "onnxruntime"] diff --git a/docker/cpu-openvino/Dockerfile b/docker/cpu-openvino/Dockerfile index 2f88e1e..4446353 100644 --- a/docker/cpu-openvino/Dockerfile +++ b/docker/cpu-openvino/Dockerfile @@ -3,10 +3,9 @@ FROM ghcr.io/huggingface/optimum-benchmark:latest-cpu WORKDIR /workspace COPY setup.py . -# COPY pyproject.toml . RUN pip install -e .[openvino] COPY . . -CMD ["llm-perf", "run-benchmark", "--hardware", "cpu", "--backend", "openvino"] +ENTRYPOINT ["llm-perf", "run-benchmark", "--hardware", "cpu", "--backend", "openvino"] diff --git a/docker/cpu-pytorch/Dockerfile b/docker/cpu-pytorch/Dockerfile index f6e3cc7..76052d0 100644 --- a/docker/cpu-pytorch/Dockerfile +++ b/docker/cpu-pytorch/Dockerfile @@ -2,8 +2,10 @@ FROM ghcr.io/huggingface/optimum-benchmark:latest-cpu WORKDIR /workspace -COPY . . +COPY setup.py . RUN pip install -e . -CMD ["llm-perf", "run-benchmark", "--hardware", "cpu", "--backend", "pytorch"] +COPY . . + +ENTRYPOINT ["llm-perf", "run-benchmark", "--hardware", "cpu", "--backend", "pytorch"] diff --git a/docker/gpu-cuda/Dockerfile b/docker/gpu-cuda/Dockerfile new file mode 100644 index 0000000..3e503b4 --- /dev/null +++ b/docker/gpu-cuda/Dockerfile @@ -0,0 +1,12 @@ +FROM ghcr.io/huggingface/optimum-benchmark:latest-cuda + +WORKDIR /workspace + +COPY setup.py . + +RUN pip install -e .[cuda] \ + && pip install flash-attn --no-build-isolation + +COPY . . + +ENTRYPOINT ["llm-perf", "run-benchmark", "--hardware", "cuda", "--backend", "pytorch"] diff --git a/llm_perf/benchmark_runners/cpu/update_llm_perf_cpu_onnxruntime.py b/llm_perf/benchmark_runners/cpu/update_llm_perf_cpu_onnxruntime.py index 2d8861e..eac63b5 100644 --- a/llm_perf/benchmark_runners/cpu/update_llm_perf_cpu_onnxruntime.py +++ b/llm_perf/benchmark_runners/cpu/update_llm_perf_cpu_onnxruntime.py @@ -19,9 +19,9 @@ def __init__(self): super().__init__(backend="onnxruntime", device="cpu") self.attention_configs = self._get_attention_configs() - assert ( - self.subset is not None - ), "SUBSET environment variable must be set for benchmarking" + assert self.subset is not None, ( + "SUBSET environment variable must be set for benchmarking" + ) self.weights_configs = self._get_weights_configs(self.subset) def get_list_of_benchmarks_to_run(self) -> List[Dict[str, Any]]: @@ -47,9 +47,9 @@ def get_benchmark_config(self, model: str, **kwargs) -> BenchmarkConfig: weights_config = kwargs["weights_config"] attn_implementation = kwargs["attn_implementation"] - assert ( - weights_config in self.weights_configs - ), f"your config does not contain {weights_config}, adjust your _get_weights_configs to fix this issue" + assert weights_config in self.weights_configs, ( + f"your config does not contain {weights_config}, adjust your _get_weights_configs to fix this issue" + ) torch_dtype = self.weights_configs[weights_config]["torch_dtype"] quant_config = self.weights_configs[weights_config]["quant_config"] diff --git a/llm_perf/benchmark_runners/cpu/update_llm_perf_cpu_openvino.py b/llm_perf/benchmark_runners/cpu/update_llm_perf_cpu_openvino.py index 0a1f771..0d7e3f8 100644 --- a/llm_perf/benchmark_runners/cpu/update_llm_perf_cpu_openvino.py +++ b/llm_perf/benchmark_runners/cpu/update_llm_perf_cpu_openvino.py @@ -19,9 +19,9 @@ def __init__(self): super().__init__(backend="openvino", device="cpu") self.attention_configs = self._get_attention_configs() - assert ( - self.subset is not None - ), "SUBSET environment variable must be set for benchmarking" + 
assert self.subset is not None, ( + "SUBSET environment variable must be set for benchmarking" + ) self.weights_configs = self._get_weights_configs(self.subset) def get_list_of_benchmarks_to_run(self) -> List[Dict[str, Any]]: @@ -47,9 +47,9 @@ def get_benchmark_config(self, model: str, **kwargs) -> BenchmarkConfig: weights_config = kwargs["weights_config"] attn_implementation = kwargs["attn_implementation"] - assert ( - weights_config in self.weights_configs - ), f"your config does not contain {weights_config}, adjust your _get_weights_configs to fix this issue" + assert weights_config in self.weights_configs, ( + f"your config does not contain {weights_config}, adjust your _get_weights_configs to fix this issue" + ) quant_config = self.weights_configs[weights_config]["quant_config"] diff --git a/llm_perf/benchmark_runners/cpu/update_llm_perf_cpu_pytorch.py b/llm_perf/benchmark_runners/cpu/update_llm_perf_cpu_pytorch.py index c14cf6b..1c4440f 100644 --- a/llm_perf/benchmark_runners/cpu/update_llm_perf_cpu_pytorch.py +++ b/llm_perf/benchmark_runners/cpu/update_llm_perf_cpu_pytorch.py @@ -19,9 +19,9 @@ def __init__(self): super().__init__(backend="pytorch", device="cpu") self.attention_configs = self._get_attention_configs() - assert ( - self.subset is not None - ), "SUBSET environment variable must be set for benchmarking" + assert self.subset is not None, ( + "SUBSET environment variable must be set for benchmarking" + ) self.weights_configs = self._get_weights_configs(self.subset) def get_list_of_benchmarks_to_run(self) -> List[Dict[str, Any]]: @@ -47,9 +47,9 @@ def get_benchmark_config(self, model: str, **kwargs) -> BenchmarkConfig: weights_config = kwargs["weights_config"] attn_implementation = kwargs["attn_implementation"] - assert ( - weights_config in self.weights_configs - ), f"your config does not contain {weights_config}, adjust your _get_weights_configs to fix this issue" + assert weights_config in self.weights_configs, ( + f"your config does not contain {weights_config}, adjust your _get_weights_configs to fix this issue" + ) torch_dtype = self.weights_configs[weights_config]["torch_dtype"] quant_scheme = self.weights_configs[weights_config]["quant_scheme"] diff --git a/llm_perf/benchmark_runners/cuda/update_llm_perf_cuda_pytorch.py b/llm_perf/benchmark_runners/cuda/update_llm_perf_cuda_pytorch.py index fef89c9..b2f717b 100644 --- a/llm_perf/benchmark_runners/cuda/update_llm_perf_cuda_pytorch.py +++ b/llm_perf/benchmark_runners/cuda/update_llm_perf_cuda_pytorch.py @@ -19,9 +19,9 @@ def __init__(self): super().__init__(backend="pytorch", device="cuda") self.attention_configs = self._get_attention_configs() - assert ( - self.subset is not None - ), "SUBSET environment variable must be set for benchmarking" + assert self.subset is not None, ( + "SUBSET environment variable must be set for benchmarking" + ) self.weights_configs = self._get_weights_configs(self.subset) def get_list_of_benchmarks_to_run(self) -> List[Dict[str, Any]]: @@ -55,9 +55,9 @@ def get_benchmark_config(self, model: str, **kwargs) -> BenchmarkConfig: weights_config = kwargs["weights_config"] attn_implementation = kwargs["attn_implementation"] - assert ( - weights_config in self.weights_configs - ), f"your config does contains the {weights_config}, adjust your _get_weights_configs to fix this issue" + assert weights_config in self.weights_configs, ( + f"your config does not contain {weights_config}, adjust your _get_weights_configs to fix this issue" + ) torch_dtype =
self.weights_configs[weights_config]["torch_dtype"] quant_scheme = self.weights_configs[weights_config]["quant_scheme"] @@ -206,7 +206,7 @@ def _get_weights_configs(self, subset) -> Dict[str, Dict[str, Any]]: raise ValueError(f"Unknown subset: {subset}") def _get_attention_configs(self) -> List[str]: - return ["eager", "sdpa", "flash_attention_2"] + return ["eager", "sdpa"] if __name__ == "__main__": diff --git a/llm_perf/cli.py b/llm_perf/cli.py index 6993744..ec42640 100644 --- a/llm_perf/cli.py +++ b/llm_perf/cli.py @@ -20,9 +20,13 @@ from llm_perf.update_llm_perf_leaderboard import update_llm_perf_leaderboard +from loguru import logger + if os.environ.get("DISABLE_WARNINGS", "0") == "1": warnings.filterwarnings("ignore") +os.environ["CI"] = "GITHUB_ACTIONS" + app = typer.Typer() @@ -46,9 +50,9 @@ def run_benchmark( ): env_vars = load_dotenv() if env_vars: - print("Environment variables loaded successfully") + logger.info("Environment variables loaded successfully") else: - print("No environment variables loaded") + logger.info("No environment variables loaded") if hardware == Hardware.CPU: if backend == Backend.ONNXRUNTIME: @@ -61,7 +65,7 @@ if backend == Backend.PYTORCH: runner = CUDAPyTorchBenchmarkRunner() else: - typer.echo(f"CUDA is not supported for {backend} backend") + logger.error(f"CUDA is not supported for {backend} backend") raise typer.Exit(code=1) runner.run_benchmarks() @@ -72,5 +76,19 @@ def update_leaderboard(): update_llm_perf_leaderboard() +@app.command() +def launch_dashboard( + port: int = typer.Option(7860, help="Port to run the dashboard on"), + share: bool = typer.Option(False, help="Whether to create a public URL"), +): + """Launch the LLM Performance Dashboard.""" + from llm_perf.dashboard_app import DashboardApp + + logger.info(f"Starting dashboard on port {port}") + + dashboard = DashboardApp() + dashboard.launch(port=port, share=share) + + if __name__ == "__main__": app() diff --git a/llm_perf/common/benchmark_runner.py b/llm_perf/common/benchmark_runner.py index e6ecac8..33de8ca 100644 --- a/llm_perf/common/benchmark_runner.py +++ b/llm_perf/common/benchmark_runner.py @@ -1,17 +1,22 @@ import os +import sys import traceback from abc import ABC, abstractmethod -from logging import getLogger from typing import Any, Dict, List, Optional +import subprocess +import time +import uuid +from datetime import datetime +from loguru import logger from optimum_benchmark import Benchmark, BenchmarkConfig, BenchmarkReport -from optimum_benchmark.logging_utils import setup_logging from llm_perf.common.utils import ( CANONICAL_PRETRAINED_OPEN_LLM_LIST, - OPEN_LLM_LIST, - PRETRAINED_OPEN_LLM_LIST, ) +from llm_perf.common.memory_utils import log_memory_usage +from llm_perf.common.dashboard import BenchmarkRunDetails +from llm_perf.common.dashboard_manager import DashboardManager class LLMPerfBenchmarkManager(ABC): @@ -26,7 +31,7 @@ def __init__( self.device = device self.subset = subset or os.getenv("SUBSET", None) self.machine = machine or os.getenv("MACHINE", None) - self.logger = getLogger("llm-perf-backend") + self.dashboard_manager = DashboardManager() if self.machine is None and self.subset is None: self.push_repo_id = ( ) self.canonical_pretrained_open_llm_list = ["gpt2"] self.subset = "unquantized" + self.machine = "debug" # Set a default machine name for debug mode elif self.machine is not None and self.subset is not None: self.push_repo_id = f"optimum-benchmark/llm-perf-{self.backend}-{self.device}-{self.subset}-{self.machine}" else: @@ -41,12
+47,8 @@ def __init__( "Either both MACHINE and SUBSET should be set for benchmarking or neither for debugging" ) - self.logger.info(f"len(OPEN_LLM_LIST): {len(OPEN_LLM_LIST)}") - self.logger.info( - f"len(PRETRAINED_OPEN_LLM_LIST): {len(PRETRAINED_OPEN_LLM_LIST)}" - ) - self.logger.info( - f"len(CANONICAL_PRETRAINED_OPEN_LLM_LIST): {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)}" + logger.info( + f"Starting benchmark runner with backend: {self.backend}, device: {self.device}, subset: {self.subset}, machine: {self.machine}" ) @abstractmethod @@ -73,22 +75,188 @@ def get_list_of_benchmarks_to_run(self) -> List[Dict[str, Any]]: "This method should be implemented in the child class" ) - def run_benchmarks(self): - os.environ["LOG_TO_FILE"] = "0" - os.environ["LOG_LEVEL"] = "INFO" - setup_logging(level="INFO", prefix="MAIN-PROCESS") + def run_single_benchmark_in_subprocess( + self, model: str, run_id: str, run_start_time: str, **kwargs + ) -> bool: + """Run a single benchmark in a separate process""" + try: + # Create the Python script to run in subprocess + script = f""" +import sys +import os +from {self.__class__.__module__} import {self.__class__.__name__} +from loguru import logger +import traceback + +try: + runner = {self.__class__.__name__}() + + runner.run_benchmark(model="{model}", **{kwargs}) + sys.exit(0) +except Exception: + logger.error("Error in subprocess:" + "\\n" + traceback.format_exc()) + sys.exit(1) +""" + + # Run the subprocess with timeout + result = subprocess.run( + [sys.executable, "-c", script], + text=True, + env={ + **os.environ, + "PYTHONUNBUFFERED": "1", + "LOG_TO_FILE": "0", # Disable file logging for optimum-benchmark + "BENCHMARK_RUN_ID": run_id, + "BENCHMARK_START_TIME": run_start_time, + }, + timeout=3600, # 1 hour timeout + ) + + return result.returncode == 0 + + except subprocess.TimeoutExpired: + logger.error(f"Benchmark timed out for model {model}") + return False + except Exception: + logger.error( + "Failed to run benchmark process:" + "\n" + traceback.format_exc() + ) + return False + def run_benchmarks(self): + """Run all benchmarks sequentially with process isolation""" benchmarks_to_run = self.get_list_of_benchmarks_to_run() - self.logger.info( + logger.info( f"Running a total of {len(benchmarks_to_run)} benchmarks, " f"with {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models" ) - for benchmark_name in benchmarks_to_run: - assert "model" in benchmark_name, "each benchmark should have a model" + logger.info( + f"Models that are being benchmarked: {CANONICAL_PRETRAINED_OPEN_LLM_LIST}" + ) + + rerun_already_conducted_benchmarks = ( + os.getenv("RERUN_ALREADY_CONDUCTED_BENCHMARKS", "false") == "true" + ) + + total_benchmarks = len(benchmarks_to_run) + completed_benchmarks = 0 + failed_benchmarks = 0 + skipped_benchmarks = 0 + failed_models = [] + start_time = time.time() + + # Generate run ID and start time for this benchmark session + run_id = str(uuid.uuid4()) + run_start_time = datetime.now().isoformat() + + for benchmark_config in benchmarks_to_run: + try: + # Log memory before benchmark + logger.info("Memory usage before benchmark:") + log_memory_usage("before") + + model = benchmark_config.pop("model") # Remove model from kwargs + benchmark_name = self.get_benchmark_name(model, **benchmark_config) + subfolder = f"{benchmark_name}/{model.replace('/', '--')}" + + if not rerun_already_conducted_benchmarks: + if self.is_benchmark_conducted(self.push_repo_id, subfolder): + logger.info( + f"Skipping already conducted benchmark: {benchmark_name}" + ) + 
benchmark_config["model"] = model # Restore model key + completed_benchmarks += 1 + skipped_benchmarks += 1 + success_rate = ( + ( + (completed_benchmarks - failed_benchmarks) + / completed_benchmarks + ) + * 100 + if completed_benchmarks > 0 + else 100 + ) + logger.info( + f"\nProgress: {completed_benchmarks}/{total_benchmarks} benchmarks completed ({(completed_benchmarks / total_benchmarks) * 100:.1f}%) - Current success rate: {success_rate:.1f}%\n" + ) + continue + + logger.info( + f"Starting benchmark for model {model} with config: {benchmark_config}" + ) + + # Run the benchmark in a separate process + success = self.run_single_benchmark_in_subprocess( + model=model, + run_id=run_id, + run_start_time=run_start_time, + **benchmark_config, + ) + + if not success: + logger.error(f"Benchmark failed for model {model}") + failed_benchmarks += 1 + failed_models.append(model) - self.run_benchmark(**benchmark_name) + completed_benchmarks += 1 + success_rate = ( + ((completed_benchmarks - failed_benchmarks) / completed_benchmarks) + * 100 + if completed_benchmarks > 0 + else 100 + ) + logger.info( + f"\nProgress: {completed_benchmarks}/{total_benchmarks} benchmarks completed ({(completed_benchmarks / total_benchmarks) * 100:.1f}%) - Current success rate: {success_rate:.1f}%\n" + ) + + # Log memory after benchmark + logger.info("Memory usage after benchmark:") + log_memory_usage("after") + + except Exception as e: + logger.error(f"Failed to run benchmark for {model}: {str(e)}") + logger.error(traceback.format_exc()) + failed_benchmarks += 1 + failed_models.append(model) + finally: + # Restore model key in case the config is reused + benchmark_config["model"] = model + + # Calculate execution time + total_time = time.time() - start_time + hours = int(total_time // 3600) + minutes = int((total_time % 3600) // 60) + seconds = int(total_time % 60) + + # Print summary + logger.info("\n" + "=" * 50) + logger.info("BENCHMARK EXECUTION SUMMARY") + logger.info("=" * 50) + logger.info(f"Total execution time: {hours}h {minutes}m {seconds}s") + logger.info(f"Total benchmarks: {total_benchmarks}") + logger.info( + f"Successfully completed: {completed_benchmarks - failed_benchmarks}" + ) + logger.info(f"Failed: {failed_benchmarks}") + logger.info(f"Skipped (already conducted): {skipped_benchmarks}") + logger.info( + f"Success rate: {((completed_benchmarks - failed_benchmarks) / total_benchmarks) * 100:.1f}%" + ) + + if failed_models: + logger.info("\nFailed models:") + for model in failed_models: + logger.info(f" - {model}") + + logger.info("\nConfiguration:") + logger.info(f" Backend: {self.backend}") + logger.info(f" Device: {self.device}") + logger.info(f" Subset: {self.subset}") + logger.info(f" Machine: {self.machine}") + logger.info(f" Rerun already conducted: {rerun_already_conducted_benchmarks}") + logger.info("=" * 50 + "\n") def is_benchmark_conducted(self, push_repo_id, subfolder): try: @@ -114,18 +282,6 @@ def run_benchmark(self, **kwargs): benchmark_name = self.get_benchmark_name(model, **kwargs) subfolder = f"{benchmark_name}/{model.replace('/', '--')}" - if not self.is_benchmark_supported(**kwargs): - self.logger.info( - f"Skipping benchmark {benchmark_name} with model {model} since it is not supported" - ) - return - - if self.is_benchmark_conducted(self.push_repo_id, subfolder): - self.logger.info( - f"Skipping benchmark {benchmark_name} with model {model} since it was already conducted" - ) - return - benchmark_config = self.get_benchmark_config(model, **kwargs) benchmark_config.push_to_hub( 
repo_id=self.push_repo_id, subfolder=subfolder, private=True @@ -141,8 +297,23 @@ def get_benchmark_config(self, model: str, **kwargs) -> BenchmarkConfig: def execute_and_log_benchmark( self, benchmark_config: BenchmarkConfig, subfolder: str ): + # Get run_id and run_start_time from environment variables + run_id = os.environ.get("BENCHMARK_RUN_ID") + run_start_time = os.environ.get("BENCHMARK_START_TIME") + + if not run_id or not run_start_time: + # Fallback to generating new ones if not provided + run_id = str(uuid.uuid4()) + run_start_time = datetime.now().isoformat() + + success = False + error_traceback = "" + try: - self.logger.info( + logger.info("Memory usage before execution:") + log_memory_usage("before") + + logger.info( f"Running benchmark {benchmark_config.name} with model {benchmark_config.backend.model}" ) benchmark_report = Benchmark.launch(benchmark_config) @@ -153,13 +324,17 @@ def execute_and_log_benchmark( benchmark.push_to_hub( repo_id=self.push_repo_id, subfolder=subfolder, private=True ) + + logger.info("Memory usage after execution:") + log_memory_usage("after") + + success = True + except Exception as e: - self.logger.error( - f"Benchmark {benchmark_config.name} failed with model {benchmark_config.backend.model}, error:\n{e}" - ) - benchmark_report = BenchmarkReport.from_dict( - {"traceback": traceback.format_exc()} - ) + error_msg = f"Benchmark {benchmark_config.name} failed with model {benchmark_config.backend.model}, error:\n{e}" + logger.error(error_msg) + error_traceback = traceback.format_exc() + benchmark_report = BenchmarkReport.from_dict({"traceback": error_traceback}) benchmark_report.push_to_hub( repo_id=self.push_repo_id, subfolder=subfolder, private=True ) @@ -167,3 +342,26 @@ def execute_and_log_benchmark( benchmark.push_to_hub( repo_id=self.push_repo_id, subfolder=subfolder, private=True ) + + finally: + # At this point self.machine and self.subset should be strings + # If they're not, use default values + machine = self.machine if self.machine is not None else "unknown" + subset = self.subset if self.subset is not None else "unknown" + + # Create and upload run details + run_details = BenchmarkRunDetails( + machine=machine, + hardware=self.device, + subsets=subset, + backends=self.backend, + model=benchmark_config.backend.model, + success=success, + traceback=error_traceback, + last_updated=datetime.now().isoformat(), + run_id=run_id, + run_start_time=run_start_time, + ) + + # Upload to dashboard + self.dashboard_manager.upload_run_details(run_details) diff --git a/llm_perf/common/dashboard.py b/llm_perf/common/dashboard.py new file mode 100644 index 0000000..cba4345 --- /dev/null +++ b/llm_perf/common/dashboard.py @@ -0,0 +1,15 @@ +from dataclasses import dataclass + + +@dataclass +class BenchmarkRunDetails: + machine: str + hardware: str + subsets: str + backends: str + model: str + success: bool + traceback: str + last_updated: str + run_id: str + run_start_time: str diff --git a/llm_perf/common/dashboard_manager.py b/llm_perf/common/dashboard_manager.py new file mode 100644 index 0000000..88d2469 --- /dev/null +++ b/llm_perf/common/dashboard_manager.py @@ -0,0 +1,232 @@ +import pandas as pd +from datasets import Dataset, load_dataset +from huggingface_hub import create_repo, HfApi +from loguru import logger +from typing import List, Optional +import time + +from llm_perf.common.dashboard import BenchmarkRunDetails + +DASHBOARD_REPO_ID = "optimum-benchmark/llm-perf-dashboard" +MAX_RETRIES = 3 +RETRY_DELAY = 2 # seconds + + +class 
DashboardManager: + def __init__(self): + # Ensure the dataset repository exists + create_repo(repo_id=DASHBOARD_REPO_ID, repo_type="dataset", exist_ok=True) + self._current_commit = None + self._api = HfApi() + self._is_first_upload = False + + def _get_current_commit(self) -> Optional[str]: + """Get the current commit hash of the main branch.""" + try: + repo_info = self._api.repo_info( + repo_id=DASHBOARD_REPO_ID, repo_type="dataset" + ) + return repo_info.sha + except Exception as e: + logger.error(f"Failed to get current commit: {str(e)}") + return None + + def _load_existing_dataset(self) -> Optional[Dataset]: + """Load the existing dataset from the hub.""" + try: + dataset = load_dataset(DASHBOARD_REPO_ID, split="train") + if isinstance(dataset, Dataset): + self._current_commit = self._get_current_commit() + return dataset + else: + logger.error("Loaded dataset is not of type Dataset") + return None + except Exception as e: + if "doesn't contain any data files" in str(e): + logger.info("No existing dataset found, this will be the first upload") + self._is_first_upload = True + self._current_commit = self._get_current_commit() + return None + logger.error(f"Failed to load existing dataset: {str(e)}") + return None + + def _verify_commit(self) -> bool: + """Verify that the current commit hasn't changed.""" + if self._is_first_upload: + # For first upload, we don't need to verify commit + return True + + current = self._get_current_commit() + if current != self._current_commit: + logger.error("Dataset has been updated since last read. Aborting upload.") + return False + return True + + def _convert_to_dict(self, run_details: BenchmarkRunDetails) -> dict: + """Convert BenchmarkRunDetails to a dictionary format suitable for the dataset.""" + return { + "machine": run_details.machine, + "hardware": run_details.hardware, + "subsets": run_details.subsets, + "backends": run_details.backends, + "model": run_details.model, + "success": run_details.success, + "traceback": run_details.traceback, + "last_updated": run_details.last_updated, + "run_id": run_details.run_id, + "run_start_time": run_details.run_start_time, + } + + def upload_run_details(self, run_details: BenchmarkRunDetails): + """Upload a single benchmark run details to the dashboard dataset.""" + for attempt in range(MAX_RETRIES): + try: + # Reset first upload flag on each attempt + self._is_first_upload = False + + # Load existing dataset + existing_dataset = self._load_existing_dataset() + if existing_dataset is None and not self._is_first_upload: + # Failed to load for reasons other than being first upload + if attempt < MAX_RETRIES - 1: + time.sleep(RETRY_DELAY) + continue + else: + logger.error( + "Max retries reached. Failed to upload run details." + ) + return + + # Get existing data or empty list for first upload + existing_data = existing_dataset.to_list() if existing_dataset else [] + + # Convert the new run details to a dictionary + new_run = self._convert_to_dict(run_details) + + # Combine existing data with new run + combined_data = existing_data + [new_run] + + # Create new dataset + dataset = Dataset.from_list(combined_data) + + # Verify commit hasn't changed (skipped for first upload) + if not self._verify_commit(): + if attempt < MAX_RETRIES - 1: + time.sleep(RETRY_DELAY) + continue + else: + logger.error( + "Max retries reached. Failed to upload run details." 
+ ) + return + + # Push to hub + dataset.push_to_hub(repo_id=DASHBOARD_REPO_ID, split="train") + logger.info( + f"Successfully uploaded run details for {run_details.run_id} to dashboard" + ) + break + + except Exception as e: + logger.error( + f"Failed to upload run details (attempt {attempt + 1}/{MAX_RETRIES}): {str(e)}" + ) + if attempt < MAX_RETRIES - 1: + time.sleep(RETRY_DELAY) + continue + break + + def upload_multiple_run_details(self, run_details_list: List[BenchmarkRunDetails]): + """Upload multiple benchmark run details to the dashboard dataset.""" + for attempt in range(MAX_RETRIES): + try: + # Load existing dataset + existing_dataset = self._load_existing_dataset() + if existing_dataset is None: + existing_data = [] + else: + existing_data = existing_dataset.to_list() + + # Convert all new run details to dictionaries + new_runs = [self._convert_to_dict(rd) for rd in run_details_list] + + # Combine existing data with new runs + combined_data = existing_data + new_runs + + # Create new dataset + dataset = Dataset.from_list(combined_data) + + # Verify commit hasn't changed + if not self._verify_commit(): + if attempt < MAX_RETRIES - 1: + time.sleep(RETRY_DELAY) + continue + else: + logger.error( + "Max retries reached. Failed to upload run details." + ) + return + + # Push to hub + dataset.push_to_hub(repo_id=DASHBOARD_REPO_ID, split="train") + logger.info( + f"Successfully uploaded {len(run_details_list)} run details to dashboard" + ) + break + + except Exception as e: + logger.error( + f"Failed to upload run details (attempt {attempt + 1}/{MAX_RETRIES}): {str(e)}" + ) + if attempt < MAX_RETRIES - 1: + time.sleep(RETRY_DELAY) + continue + break + + def get_latest_runs( + self, + machine: Optional[str] = None, + hardware: Optional[str] = None, + model: Optional[str] = None, + limit: int = 100, + ) -> pd.DataFrame: + """ + Retrieve the latest benchmark runs from the dashboard dataset. 
+ + Args: + machine: Filter by machine name + hardware: Filter by hardware type + model: Filter by model name + limit: Maximum number of runs to return + + Returns: + DataFrame containing the latest runs + """ + try: + # Load the dataset + dataset = load_dataset(DASHBOARD_REPO_ID, split="train") + if not isinstance(dataset, Dataset): + logger.error("Failed to load dataset: not a Dataset instance") + return pd.DataFrame() + + # Convert to pandas DataFrame using dictionary + data_dict = {col: dataset[col] for col in dataset.column_names} + df = pd.DataFrame(data_dict) + + # Apply filters + if machine: + df = df[df["machine"] == machine] + if hardware: + df = df[df["hardware"] == hardware] + if model: + df = df[df["model"] == model] + + # Sort by last_updated and take the most recent runs + df["last_updated"] = pd.to_datetime(df["last_updated"]) + df = df.sort_values("last_updated", ascending=False).head(limit) + + return df + + except Exception as e: + logger.error(f"Failed to retrieve latest runs: {str(e)}") + return pd.DataFrame() diff --git a/llm_perf/common/get_top_model_from_hub.py b/llm_perf/common/get_top_model_from_hub.py index 79feb64..ba337f0 100644 --- a/llm_perf/common/get_top_model_from_hub.py +++ b/llm_perf/common/get_top_model_from_hub.py @@ -5,6 +5,7 @@ import requests from datasets import Dataset +from loguru import logger def get_top_text_generation_models( @@ -42,7 +43,7 @@ def get_top_text_generation_models( def save_to_json(data: List[Dict], filename: str): with open(filename, "w", encoding="utf-8") as f: json.dump(data, f, indent=2, ensure_ascii=False) - print(f"Data saved to {filename}") + logger.info(f"Data saved to {filename}") def compute_org_downloads(models: List[Dict]) -> Dict[str, int]: @@ -55,7 +56,7 @@ def compute_org_downloads(models: List[Dict]) -> Dict[str, int]: def upload_to_hf_dataset(data: List[Dict], dataset_name: str): dataset = Dataset.from_list(data) dataset.push_to_hub(dataset_name) - print(f"Data uploaded to Hugging Face dataset: {dataset_name}") + logger.info(f"Data uploaded to Hugging Face dataset: {dataset_name}") def main(): @@ -64,16 +65,16 @@ def main(): if huggingface_token: os.environ["HUGGINGFACE_HUB_TOKEN"] = huggingface_token else: - print( + logger.warning( "Warning: HUGGINGFACE_TOKEN not found in environment variables. Running without authentication." ) n = 100 top_models = get_top_text_generation_models(n) - print(f"\nTop {n} text generation models on Hugging Face Hub:") + logger.info(f"\nTop {n} text generation models on Hugging Face Hub:") for i, model in enumerate(top_models, 1): - print( + logger.info( f"{i}. {model['organization']}/{model['model_name']}: {model['downloads']:,} downloads" ) @@ -82,11 +83,11 @@ def main(): upload_to_hf_dataset(top_models, dataset_name) # Display top 10 organizations by downloads - print("\nTop 10 organizations by total downloads:") + logger.info("\nTop 10 organizations by total downloads:") org_downloads = compute_org_downloads(top_models) sorted_orgs = sorted(org_downloads.items(), key=lambda x: x[1], reverse=True)[:10] for i, (org, downloads) in enumerate(sorted_orgs, 1): - print(f"{i}. {org}: {downloads:,} downloads") + logger.info(f"{i}. 
{org}: {downloads:,} downloads") if __name__ == "__main__": diff --git a/llm_perf/common/memory_utils.py b/llm_perf/common/memory_utils.py new file mode 100644 index 0000000..a9f9f90 --- /dev/null +++ b/llm_perf/common/memory_utils.py @@ -0,0 +1,195 @@ +import os +import psutil +import gc +from typing import Dict, Optional +from loguru import logger + +try: + import torch + + TORCH_AVAILABLE = True +except ImportError: + TORCH_AVAILABLE = False + +# Memory thresholds in MB +MEMORY_THRESHOLDS = { + "cpu_rss": 8192, # 8GB + "cpu_percent": 90, # 90% + "gpu_allocated": 8192, # 8GB +} + + +class MemoryTracker: + def __init__(self): + self.initial_memory: Dict = {} + self.peak_memory: Dict = {"cpu_rss": 0, "cpu_percent": 0, "gpu_allocated": 0} + self.consecutive_increases = 0 + self.last_memory: Optional[Dict] = None + self.before_memory: Optional[Dict] = None # Store memory state before benchmark + + def get_gpu_memory_info(self): + """Get GPU memory usage if CUDA is available""" + if not TORCH_AVAILABLE or not torch.cuda.is_available(): + return None + + try: + gpu_memory = [] + for i in range(torch.cuda.device_count()): + allocated = torch.cuda.memory_allocated(i) / (1024 * 1024) # MB + reserved = torch.cuda.memory_reserved(i) / (1024 * 1024) # MB + gpu_memory.append( + {"device": i, "allocated": allocated, "reserved": reserved} + ) + return gpu_memory + except Exception as e: + logger.warning(f"Failed to get GPU memory info: {e}") + return None + + def get_cpu_memory_info(self): + """Get CPU memory usage""" + try: + process = psutil.Process(os.getpid()) + memory_info = process.memory_info() + return { + "rss": memory_info.rss / (1024 * 1024), # MB + "vms": memory_info.vms / (1024 * 1024), # MB + "percent": process.memory_percent(), + } + except Exception as e: + logger.warning(f"Failed to get CPU memory info: {e}") + return None + + def check_thresholds(self, cpu_info: Optional[Dict], gpu_info: Optional[list]): + """Check if memory usage exceeds thresholds""" + if cpu_info: + if cpu_info["rss"] > MEMORY_THRESHOLDS["cpu_rss"]: + logger.warning( + f"CPU RSS memory ({cpu_info['rss']:.0f}MB) exceeds threshold ({MEMORY_THRESHOLDS['cpu_rss']}MB)" + ) + if cpu_info["percent"] > MEMORY_THRESHOLDS["cpu_percent"]: + logger.warning( + f"CPU usage ({cpu_info['percent']:.1f}%) exceeds threshold ({MEMORY_THRESHOLDS['cpu_percent']}%)" + ) + + if gpu_info: + for device in gpu_info: + if device["allocated"] > MEMORY_THRESHOLDS["gpu_allocated"]: + logger.warning( + f"GPU {device['device']} allocated memory ({device['allocated']:.0f}MB) " + f"exceeds threshold ({MEMORY_THRESHOLDS['gpu_allocated']}MB)" + ) + + def check_persistent_growth( + self, cpu_info: Optional[Dict], gpu_info: Optional[list] + ): + """Monitor for persistent memory growth""" + if not cpu_info: + return + + current_memory = { + "cpu_rss": cpu_info["rss"], + "cpu_percent": cpu_info["percent"], + "gpu_allocated": gpu_info[0]["allocated"] if gpu_info else 0, + } + + # Update peak memory + for key in self.peak_memory: + self.peak_memory[key] = max(self.peak_memory[key], current_memory[key]) + + # Check for persistent growth + if self.last_memory: + is_increasing = all( + current_memory[key] + > self.last_memory[key] * 1.05 # 5% increase threshold + for key in current_memory + ) + + if is_increasing: + self.consecutive_increases += 1 + if ( + self.consecutive_increases >= 3 + ): # Alert after 3 consecutive increases + logger.warning( + "Detected persistent memory growth over last 3 benchmarks:\n" + f"Initial: CPU 
RSS={self.initial_memory.get('cpu_rss', 0):.0f}MB\n" + f"Current: CPU RSS={current_memory['cpu_rss']:.0f}MB\n" + f"Peak: CPU RSS={self.peak_memory['cpu_rss']:.0f}MB" + ) + else: + self.consecutive_increases = 0 + + # Store current memory for next comparison + self.last_memory = current_memory + + # Store initial memory on first run + if not self.initial_memory: + self.initial_memory = current_memory + + def log_memory_usage(self, phase: str = "current"): + """Log current memory usage for both CPU and GPU""" + # Force garbage collection + gc.collect() + if TORCH_AVAILABLE and torch.cuda.is_available(): + torch.cuda.empty_cache() + + # Get memory info + cpu_info = self.get_cpu_memory_info() + gpu_info = self.get_gpu_memory_info() + + # Check thresholds and persistent growth + self.check_thresholds(cpu_info, gpu_info) + self.check_persistent_growth(cpu_info, gpu_info) + + # Store before memory state + if phase == "before": + self.before_memory = {"cpu": cpu_info, "gpu": gpu_info} + prefix = "Before benchmark -" + elif phase == "after" and self.before_memory: + prefix = "After benchmark -" + else: + prefix = "Current -" + + # Log CPU memory + if cpu_info: + cpu_msg = f"{prefix} CPU Memory - RSS: {cpu_info['rss']:.2f}MB, VMS: {cpu_info['vms']:.2f}MB, Percent: {cpu_info['percent']:.1f}%" + + # Add delta if we're in after phase + if phase == "after" and self.before_memory and self.before_memory["cpu"]: + before_cpu = self.before_memory["cpu"] + cpu_msg += f" (Δ RSS: {cpu_info['rss'] - before_cpu['rss']:+.2f}MB, Δ VMS: {cpu_info['vms'] - before_cpu['vms']:+.2f}MB, Δ %: {cpu_info['percent'] - before_cpu['percent']:+.1f})" + + logger.info(cpu_msg) + + # Log GPU memory if available + if gpu_info: + for device in gpu_info: + gpu_msg = f"{prefix} GPU {device['device']} Memory - Allocated: {device['allocated']:.2f}MB, Reserved: {device['reserved']:.2f}MB" + + # Add delta if we're in after phase + if ( + phase == "after" + and self.before_memory + and self.before_memory["gpu"] + ): + before_gpu = next( + ( + g + for g in self.before_memory["gpu"] + if g["device"] == device["device"] + ), + None, + ) + if before_gpu: + gpu_msg += f" (Δ Allocated: {device['allocated'] - before_gpu['allocated']:+.2f}MB, Δ Reserved: {device['reserved'] - before_gpu['reserved']:+.2f}MB)" + + logger.info(gpu_msg) + + +# Create a global memory tracker instance +memory_tracker = MemoryTracker() + + +# Function to use in other modules +def log_memory_usage(phase: str = "current"): + """Global function to log memory usage""" + memory_tracker.log_memory_usage(phase) diff --git a/llm_perf/common/utils.py b/llm_perf/common/utils.py index bf67fe9..4f90947 100644 --- a/llm_perf/common/utils.py +++ b/llm_perf/common/utils.py @@ -1,6 +1,7 @@ import pandas as pd from llm_perf.common.dependency import get_benchmark_top_n, is_debug_mode +from loguru import logger INPUT_SHAPES = {"batch_size": 1, "sequence_length": 256} GENERATE_KWARGS = {"max_new_tokens": 64, "min_new_tokens": 64} @@ -47,7 +48,7 @@ def get_top_llm_list(n: int = 10) -> list[str]: return top_models except Exception as e: - print(f"Error fetching top LLM list: {e}") + logger.error(f"Error fetching top LLM list: {e}") return [] @@ -55,6 +56,3 @@ def get_top_llm_list(n: int = 10) -> list[str]: CANONICAL_PRETRAINED_OPEN_LLM_LIST = ["bigscience/bloomz-560m"] else: CANONICAL_PRETRAINED_OPEN_LLM_LIST = get_top_llm_list(n=get_benchmark_top_n()) - print( - f"Benchamrking the following {len(CANONICAL_PRETRAINED_OPEN_LLM_LIST)} models: {CANONICAL_PRETRAINED_OPEN_LLM_LIST}" - ) diff 
--git a/llm_perf/dashboard_app.py b/llm_perf/dashboard_app.py new file mode 100644 index 0000000..eba6808 --- /dev/null +++ b/llm_perf/dashboard_app.py @@ -0,0 +1,216 @@ +import gradio as gr +import pandas as pd +from datetime import datetime, timedelta +import plotly.express as px +from plotly.graph_objs._figure import Figure +from loguru import logger +from typing import Optional, Tuple, List + +from llm_perf.common.dashboard_manager import DashboardManager + + +def create_status_plot(df: pd.DataFrame) -> Optional[Figure]: + """Create a status plot showing success/failure over time.""" + if df.empty: + return None + + # Ensure last_updated is datetime + df["last_updated"] = pd.to_datetime(df["last_updated"]) + df["success_str"] = df["success"].map({True: "Success", False: "Failure"}) + + # Create hover text with more details + df["hover_text"] = df.apply( + lambda row: f"Model: {row['model']}<br>" + f"Hardware: {row['hardware']}<br>" + f"Machine: {row['machine']}<br>" + f"Status: {row['success_str']}<br>
" + + f"Time: {row['last_updated'].strftime('%Y-%m-%d %H:%M:%S')}", + axis=1, + ) + + fig = px.scatter( + df, + x="last_updated", + y="model", + color="success_str", + title="Benchmark Status Over Time", + labels={"last_updated": "Time", "model": "Model", "success_str": "Status"}, + hover_data=["hover_text"], + height=600, + ) # Make plot taller to accommodate more models + + # Update layout for better readability + fig.update_layout( + xaxis_title="Time", + yaxis_title="Model", + showlegend=True, + legend_title="Status", + hovermode="closest", + ) + + return fig + + +def create_hardware_stats(df: pd.DataFrame) -> Optional[Figure]: + """Create statistics about hardware usage.""" + if df.empty: + return None + + stats = ( + df.groupby(["hardware", "machine"])["success"] + .agg(["count", "mean"]) + .reset_index() + ) + # Calculate success rate as percentage + stats["success_rate"] = (stats["mean"] * 100).round(2) + # Drop the mean column since we've converted it to success_rate + stats = stats.drop("mean", axis=1) + stats = stats.rename(columns={"count": "total_runs"}) + + fig = px.bar( + stats, + x="hardware", + y="total_runs", + color="success_rate", + title="Hardware Usage and Success Rate", + labels={ + "hardware": "Hardware Type", + "total_runs": "Total Runs", + "success_rate": "Success Rate (%)", + }, + ) + return fig + + +class DashboardApp: + def __init__(self): + self.dashboard_manager = DashboardManager() + + def refresh_data( + self, + time_range: str, + machine: str = "All", + hardware: str = "All", + model: str = "All", + ) -> Tuple[Optional[Figure], Optional[Figure], Optional[List[List[str]]]]: + """ + Refresh dashboard data based on filters. + + Args: + time_range: Time range to filter (e.g., '1d', '7d', '30d', 'all') + machine: Machine name filter + hardware: Hardware type filter + model: Model name filter + + Returns: + Tuple of (status plot, hardware stats plot, data table) + """ + try: + # Get the data + df = self.dashboard_manager.get_latest_runs( + machine=machine if machine != "All" else None, + hardware=hardware if hardware != "All" else None, + model=model if model != "All" else None, + ) + + if df.empty: + return None, None, None + + # Apply time range filter + if time_range != "all": + days = int(time_range[:-1]) + cutoff = datetime.now() - timedelta(days=days) + df = df[df["last_updated"] >= cutoff] + + # Create visualizations + status_plot = create_status_plot(df) + hardware_plot = create_hardware_stats(df) + + # Prepare table data + table_df = df[ + ["model", "hardware", "machine", "success", "last_updated"] + ].copy() + table_df["last_updated"] = table_df["last_updated"].dt.strftime( + "%Y-%m-%d %H:%M:%S" + ) + table_data: List[List[str]] = [ + [str(val) for val in row] for row in table_df.values.tolist() + ] + + return status_plot, hardware_plot, table_data + + except Exception as e: + logger.error(f"Error refreshing dashboard data: {str(e)}") + return None, None, None + + def launch(self, port: int = 7860, share: bool = False): + """Launch the Gradio interface. + + Args: + port: Port to run the dashboard on + share: Whether to create a public URL + """ + with gr.Blocks(title="LLM Performance Dashboard") as interface: + gr.Markdown("# 🚀 LLM Performance Dashboard") + gr.Markdown( + "Monitor the status and performance of LLM benchmarks across different hardware configurations." 
+ ) + + with gr.Row(): + time_range = gr.Dropdown( + choices=["1d", "7d", "30d", "all"], value="7d", label="Time Range" + ) + machine = gr.Dropdown( + choices=["All"], # Will be populated dynamically + value="All", + label="Machine", + ) + hardware = gr.Dropdown( + choices=["All"], # Will be populated dynamically + value="All", + label="Hardware", + ) + model = gr.Dropdown( + choices=["All"], # Will be populated dynamically + value="All", + label="Model", + ) + refresh_btn = gr.Button("🔄 Refresh") + + with gr.Row(): + status_plot = gr.Plot(label="Benchmark Status") + hardware_plot = gr.Plot(label="Hardware Statistics") + + with gr.Row(): + results_table = gr.Dataframe( + headers=["Model", "Hardware", "Machine", "Success", "Last Updated"], + label="Recent Benchmark Results", + ) + + # Update function + def update_dashboard( + time_range: str, machine: str, hardware: str, model: str + ): + return self.refresh_data(time_range, machine, hardware, model) + + # Register update function + refresh_btn.click( + fn=update_dashboard, + inputs=[time_range, machine, hardware, model], + outputs=[status_plot, hardware_plot, results_table], + ) + + # Auto-refresh on load + interface.load( + fn=update_dashboard, + inputs=[time_range, machine, hardware, model], + outputs=[status_plot, hardware_plot, results_table], + ) + + # Launch the interface with specified parameters + interface.launch(server_port=port, share=share) + + +if __name__ == "__main__": + app = DashboardApp() + app.launch() diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py index 0dd368b..90e8552 100644 --- a/llm_perf/update_llm_perf_leaderboard.py +++ b/llm_perf/update_llm_perf_leaderboard.py @@ -8,9 +8,9 @@ import json from llm_perf.common.hardware_config import load_hardware_configs -from huggingface_hub.utils import disable_progress_bars +from loguru import logger -disable_progress_bars() +os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1" REPO_TYPE = "dataset" MAIN_REPO_ID = "optimum-benchmark/llm-perf-leaderboard" @@ -93,7 +93,7 @@ def gather_benchmarks(subset: str, machine: str, backend: str, hardware: str): path_in_repo=perf_df, path_or_fileobj=perf_df, ) - print(f"Uploaded {perf_df} to {MAIN_REPO_ID}") + logger.info(f"Uploaded {perf_df} to {MAIN_REPO_ID}") # def check_if_url_exists(url: str): @@ -101,7 +101,7 @@ def gather_benchmarks(subset: str, machine: str, backend: str, hardware: str): # Check if a URL exists # """ # repo_exists -# print(f"response: {response}") +# logger.info(f"response: {response}") # return response.status_code == 200 @@ -123,31 +123,33 @@ def update_perf_dfs(): hardware_config.hardware, ) except Exception: - print("Dataset not found for:") - print(f" • Backend: {backend}") - print(f" • Subset: {subset}") - print(f" • Machine: {hardware_config.machine}") - print(f" • Hardware Type: {hardware_config.hardware}") + logger.error("Dataset not found for:") + logger.error(f" • Backend: {backend}") + logger.error(f" • Subset: {subset}") + logger.error(f" • Machine: {hardware_config.machine}") + logger.error(f" • Hardware Type: {hardware_config.hardware}") url = f"{PERF_REPO_ID.format(subset=subset, machine=hardware_config.machine, backend=backend, hardware=hardware_config.hardware)}" does_exist = repo_exists(url, repo_type="dataset") if does_exist: - print(f"Dataset exists: {url} but could not be processed") + logger.error( + f"Dataset exists: {url} but could not be processed" + ) def update_llm_df(): """ Scrape the open-llm-leaderboard and update the leaderboard dataframe """ 
- + scrapping_script = """ git clone https://github.com/Weyaxi/scrape-open-llm-leaderboard.git pip install -r scrape-open-llm-leaderboard/requirements.txt -q python scrape-open-llm-leaderboard/main.py rm -rf scrape-open-llm-leaderboard """ - + subprocess.run(scrapping_script, shell=True) create_repo(repo_id=MAIN_REPO_ID, repo_type=REPO_TYPE, exist_ok=True, private=False) upload_file( diff --git a/setup.py b/setup.py index 7f6e115..ec61ad5 100644 --- a/setup.py +++ b/setup.py @@ -14,7 +14,10 @@ "huggingface_hub[hf_transfer]", "datasets>=2.14.6", "beautifulsoup4", + "loguru", "optimum-benchmark @ git+https://github.com/huggingface/optimum-benchmark.git", + "psutil", + "torch<2.6.0", ] # Optional dependencies @@ -28,11 +31,12 @@ "optimum-benchmark[openvino] @ git+https://github.com/huggingface/optimum-benchmark.git" ], "cuda": [ - "flash-attn", - "auto-gptq", - "bitsandbytes", - "autoawq", - "torchao", + "optimum-benchmark[bitsandbytes,autoawq,auto-gptq] @ git+https://github.com/huggingface/optimum-benchmark.git", + ], + "dashboard": [ + "gradio", + "pandas", + "plotly", ], }