diff --git a/.github/workflows/benchmark_cpu_onnxruntime.yaml b/.github/workflows/benchmark_cpu_onnxruntime.yaml
index 32c1ed5..56be345 100644
--- a/.github/workflows/benchmark_cpu_onnxruntime.yaml
+++ b/.github/workflows/benchmark_cpu_onnxruntime.yaml
@@ -3,10 +3,7 @@ name: Benchmark CPU Onnxruntime
 on:
   workflow_dispatch:
   schedule:
-    - cron: "0 0 * * *"
-  push:
-    branches:
-      - '*'
+    - cron: "0 12 * * *"
   pull_request:
 
 concurrency:
diff --git a/.github/workflows/benchmark_cpu_openvino.yaml b/.github/workflows/benchmark_cpu_openvino.yaml
index afafb96..9e4c70f 100644
--- a/.github/workflows/benchmark_cpu_openvino.yaml
+++ b/.github/workflows/benchmark_cpu_openvino.yaml
@@ -4,9 +4,6 @@ on:
   workflow_dispatch:
   schedule:
     - cron: "0 0 * * *"
-  push:
-    branches:
-      - '*'
   pull_request:
 
 concurrency:
diff --git a/.github/workflows/benchmark_cpu_pytorch.yaml b/.github/workflows/benchmark_cpu_pytorch.yaml
index b2aa7e0..d0ca50a 100644
--- a/.github/workflows/benchmark_cpu_pytorch.yaml
+++ b/.github/workflows/benchmark_cpu_pytorch.yaml
@@ -4,9 +4,6 @@ on:
   workflow_dispatch:
   schedule:
     - cron: "0 0 * * *"
-  push:
-    branches:
-      - '*'
   pull_request:
 
 concurrency:
diff --git a/.github/workflows/benchmark_cuda_pytorch.yaml b/.github/workflows/benchmark_cuda_pytorch.yaml
index 624adef..028b84c 100644
--- a/.github/workflows/benchmark_cuda_pytorch.yaml
+++ b/.github/workflows/benchmark_cuda_pytorch.yaml
@@ -3,10 +3,7 @@ name: Benchmark CUDA PyTorch
 on:
   workflow_dispatch:
   schedule:
-    - cron: "0 0 * * *"
-  push:
-    branches:
-      - '*'
+    - cron: "0 3 * * *"
   pull_request:
 
 concurrency:
@@ -33,7 +30,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        subset: [unquantized, bnb, awq, gptq]
+        subset: [torchao]
 
         machine:
           [
diff --git a/.github/workflows/update_llm_perf_leaderboard.yaml b/.github/workflows/update_llm_perf_leaderboard.yaml
index e11e717..b9c43e7 100644
--- a/.github/workflows/update_llm_perf_leaderboard.yaml
+++ b/.github/workflows/update_llm_perf_leaderboard.yaml
@@ -3,7 +3,7 @@ name: Update LLM Perf Leaderboard
 on:
   workflow_dispatch:
   schedule:
-    - cron: "0 */6 * * *"
+    - cron: "0 0 * * *"
   push:
     branches:
       - main
diff --git a/.gitignore b/.gitignore
index e6b5c39..04db50f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -187,4 +187,6 @@
 outputs/
 wip/
 *.csv
-optimum-benchmark/
\ No newline at end of file
+optimum-benchmark/
+
+*.egg-info/
\ No newline at end of file
diff --git a/Makefile b/Makefile
index aa80fe1..dc0ad56 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,5 @@
 # Style and Quality checks
-.PHONY: style quality
+.PHONY: style quality install install-dev run_cpu_container run_cuda_container run_rocm_container cpu-pytorch-container cpu-openvino-container collector-container
 
 quality:
 	ruff check .
@@ -9,17 +9,13 @@ style:
 	ruff format .
 	ruff check --fix .
 
-.PHONY: install
-
 install:
 	pip install .
 
 install-dev:
 	DEBUG=1 uv pip install -e .
 
-# Running containers
-.PHONY: run_cpu_container run_cuda_container run_rocm_container
-
+# Running optimum-benchmark containers
 run_cpu_container:
 	docker run -it --rm --pid host --volume .:/llm-perf-backend --workdir /llm-perf-backend ghcr.io/huggingface/optimum-benchmark:latest-cpu
 
@@ -29,15 +25,15 @@ run_cuda_container:
 	docker run -it --rm --shm-size 64G --device /dev/kfd --device /dev/dri --volume .:/llm-perf-backend --workdir /llm-perf-backend ghcr.io/huggingface/optimum-benchmark:latest-rocm
 
+# Running llm-perf backend containers
 cpu-pytorch-container:
 	docker build -t cpu-pytorch -f docker/cpu-pytorch/Dockerfile .
-	# docker run -it --rm --pid host cpu-pytorch /bin/bash
 	docker run -it --rm --pid host cpu-pytorch
 
-collector-container:
-	docker build -t collector -f docker/collector/Dockerfile .
-	docker run -it --rm --pid host collector
-
 cpu-openvino-container:
 	docker build -t cpu-openvino -f docker/cpu-openvino/Dockerfile .
 	docker run -it --rm --pid host cpu-openvino
+
+collector-container:
+	docker build -t collector -f docker/collector/Dockerfile .
+	docker run -it --rm --pid host collector
diff --git a/README.md b/README.md
index 2fa09f0..7c69305 100644
--- a/README.md
+++ b/README.md
@@ -1,15 +1,78 @@
-# llm-perf-backend
-The backend of [the LLM-perf leaderboard](https://huggingface.co/spaces/optimum/llm-perf-leaderboard)
+# LLM-perf Backend 🏋️
 
-## Why
-this runs all the benchmarks to get results for the leaderboard
+The official backend system powering the [LLM-perf Leaderboard](https://huggingface.co/spaces/optimum/llm-perf-leaderboard). This repository contains the infrastructure and tools needed to run standardized benchmarks for Large Language Models (LLMs) across different hardware configurations and optimization backends.
 
-## How to install
-git clone
-pip install -e .[openvino]
+## About 📝
 
-## How to use the cli
-llm-perf run-benchmark --hardware cpu --backend openvino
+LLM-perf Backend is designed to:
+- Run automated benchmarks for the LLM-perf leaderboard
+- Ensure consistent and reproducible performance measurements
+- Support multiple hardware configurations and optimization backends
+- Generate standardized performance metrics for latency, throughput, memory usage, and energy consumption
+
+## Key Features 🔑
+
+- Standardized benchmarking pipeline using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark)
+- Support for multiple hardware configurations (CPU, GPU)
+- Multiple backend implementations (PyTorch, Onnxruntime, etc.)
+- Automated metric collection:
+  - Latency and throughput measurements
+  - Memory usage tracking
+  - Energy consumption monitoring
+  - Quality metrics integration with Open LLM Leaderboard
+
+## Installation 🛠️
+
+1. Clone the repository:
+```bash
+git clone https://github.com/huggingface/llm-perf-backend
+cd llm-perf-backend
+```
+
+2. Create a Python virtual environment:
+```bash
+python -m venv .venv
+source .venv/bin/activate
+```
+
+3. Install the package with the required dependencies:
+```bash
+pip install -e "."
+# or
+pip install -e ".[all]" # to install optional dependencies like Onnxruntime
+```
+
+## Usage 📋
+
+### Command Line Interface
+
+Run benchmarks using the CLI tool:
+
+```bash
+llm-perf run-benchmark --hardware cpu --backend pytorch
+```
+
+### Configuration Options
+
+View all available options with:
+```bash
+llm-perf run-benchmark --help
+```
+
+- `--hardware`: Target hardware platform (cpu, cuda)
+- `--backend`: Backend framework to use (pytorch, onnxruntime, etc.)
+
+## Benchmark Dataset 📊
+
+Results are published to the official dataset:
+[optimum-benchmark/llm-perf-leaderboard](https://huggingface.co/datasets/optimum-benchmark/llm-perf-leaderboard)
+
+## Benchmark Specifications 📑
 
-https://huggingface.co/datasets/optimum-benchmark/llm-perf-leaderboard
\ No newline at end of file
+All benchmarks follow these standardized settings:
+- Single GPU usage to avoid communication-dependent results
+- Energy monitoring via CodeCarbon
+- Memory tracking:
+  - Maximum allocated memory
+  - Maximum reserved memory
+  - Maximum used memory (via PyNVML for GPU)
\ No newline at end of file
diff --git a/llm_perf/benchmark_runners/cuda/update_llm_perf_cuda_pytorch.py b/llm_perf/benchmark_runners/cuda/update_llm_perf_cuda_pytorch.py
index 2beab28..fef89c9 100644
--- a/llm_perf/benchmark_runners/cuda/update_llm_perf_cuda_pytorch.py
+++ b/llm_perf/benchmark_runners/cuda/update_llm_perf_cuda_pytorch.py
@@ -191,6 +191,17 @@ def _get_weights_configs(self, subset) -> Dict[str, Dict[str, Any]]:
                 },
             },
         }
+        elif subset == "torchao":
+            return {
+                "torchao-int4wo-128": {
+                    "torch_dtype": "bfloat16",
+                    "quant_scheme": "torchao",
+                    "quant_config": {
+                        "quant_type": "int4_weight_only",
+                        "group_size": 128,
+                    },
+                },
+            }
         else:
             raise ValueError(f"Unknown subset: {subset}")
 
diff --git a/llm_perf/hardware.yaml b/llm_perf/hardware.yaml
index c3cd754..319801a 100644
--- a/llm_perf/hardware.yaml
+++ b/llm_perf/hardware.yaml
@@ -5,6 +5,7 @@
     - awq
     - bnb
     - gptq
+    - torchao
   backends:
     - pytorch
 
@@ -15,6 +16,7 @@
     - awq
     - bnb
     - gptq
+    - torchao
   backends:
     - pytorch
 
@@ -25,6 +27,7 @@
     - awq
     - bnb
     - gptq
+    - torchao
   backends:
     - pytorch
 
diff --git a/llm_perf/update_llm_perf_leaderboard.py b/llm_perf/update_llm_perf_leaderboard.py
index 91d8592..1e9adf9 100644
--- a/llm_perf/update_llm_perf_leaderboard.py
+++ b/llm_perf/update_llm_perf_leaderboard.py
@@ -4,7 +4,6 @@
 import pandas as pd
 from huggingface_hub import create_repo, snapshot_download, upload_file, repo_exists
 from optimum_benchmark import Benchmark
-import requests
 import json
 
 from llm_perf.common.hardware_config import load_hardware_configs
@@ -19,6 +18,7 @@
 PERF_DF = "perf-df-{backend}-{hardware}-{subset}-{machine}.csv"
 LLM_DF = "llm-df.csv"
 
+
 def patch_json(file):
     """
     Patch a JSON file by adding a 'stdev_' key with the same value as 'stdev' for all occurrences,
@@ -37,7 +37,7 @@
     """
     with open(file, "r") as f:
         data = json.load(f)
-    
+
     def add_stdev_(obj):
         if isinstance(obj, dict):
             new_items = []
@@ -53,10 +53,11 @@
                 add_stdev_(item)
 
     add_stdev_(data)
-    
+
     with open(file, "w") as f:
         json.dump(data, f, indent=4)
 
+
 def gather_benchmarks(subset: str, machine: str, backend: str, hardware: str):
     """
     Gather the benchmarks for a given machine
@@ -99,7 +100,6 @@
 #     return response.status_code == 200
 
 
-
 def update_perf_dfs():
     """
     Update the performance dataframes for all machines
@@ -116,19 +116,18 @@
                         backend,
                         hardware_config.hardware,
                     )
-                except Exception as e:
+                except Exception:
                     print("Dataset not found for:")
                     print(f"  • Backend: {backend}")
                     print(f"  • Subset: {subset}")
                     print(f"  • Machine: {hardware_config.machine}")
                     print(f"  • Hardware Type: {hardware_config.hardware}")
                     url = f"{PERF_REPO_ID.format(subset=subset, machine=hardware_config.machine, backend=backend, hardware=hardware_config.hardware)}"
-                    
+
                     does_exist = repo_exists(url, repo_type="dataset")
 
                     if does_exist:
                         print(f"Dataset exists: {url} but could not be processed")
 
 
-
 scrapping_script = """
diff --git a/optimum-benchmark b/optimum-benchmark
deleted file mode 160000
index de1e792..0000000
--- a/optimum-benchmark
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit de1e7921fd787594add32a6ebbe5fae2578ac06b
diff --git a/pyproject.toml.bak b/pyproject.toml.bak
deleted file mode 100644
index 2634309..0000000
--- a/pyproject.toml.bak
+++ /dev/null
@@ -1,29 +0,0 @@
-[project]
-name = "llm-perf-backend"
-version = "0.1.0"
-description = "Backend for https://huggingface.co/spaces/optimum/llm-perf-leaderboard"
-authors = [
-    { name = "baptiste", email = "baptiste.colle@huggingface.co" }
-]
-readme = "README.md"
-requires-python = ">=3.8"
-
-[project.scripts]
-llm-perf = "src.cli:app"
-
-[tool.hatch.metadata]
-allow-direct-references = true
-
-[tool.hatch.build.targets.wheel]
-packages = ["src"]
-
-[tool.ruff]
-line-length = 120
-lint.ignore = ["C901", "E501"]
-lint.select = ["C", "E", "F", "I", "W", "I001"]
-
-[tool.ruff.format]
-line-ending = "auto"
-quote-style = "double"
-indent-style = "space"
-skip-magic-trailing-comma = false
\ No newline at end of file
diff --git a/setup.py b/setup.py
index b8a397b..7f6e115 100644
--- a/setup.py
+++ b/setup.py
@@ -32,6 +32,7 @@
         "auto-gptq",
         "bitsandbytes",
         "autoawq",
+        "torchao",
     ],
 }
 
diff --git a/test.py b/test.py
deleted file mode 100644
index e69de29..0000000
diff --git a/test.sh b/test.sh
deleted file mode 100644
index 89c47e3..0000000
--- a/test.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-docker run -it --rm ghcr.io/huggingface/optimum-benchmark:latest-cpu
-
-
-docker run -it --rm --pid host --volume "$(pwd)":/optimum-benchmark --workdir /optimum-benchmark ghcr.io/huggingface/optimum-benchmark:latest-cpu
\ No newline at end of file