add amd to leaderboard #5

Closed · wants to merge 7 commits
5 changes: 3 additions & 2 deletions .github/workflows/update_llm_perf_cpu_onnxruntime.yaml
@@ -8,6 +8,9 @@ on:
    branches:
      - main
  pull_request:
    paths:
      - 'src/benchmark_runners/cpu/update_llm_perf_cpu_onnxruntime.py'
      - 'src/common/**'

concurrency:
  cancel-in-progress: true
@@ -55,5 +58,3 @@ jobs:
          pip install -e git+https://github.com/huggingface/optimum-benchmark.git#egg=optimum-benchmark[onnxruntime]
          pip install -e .
          python src/benchmark_runners/cpu/update_llm_perf_cpu_onnxruntime.py


6 changes: 3 additions & 3 deletions .github/workflows/update_llm_perf_cpu_openvino.yaml
@@ -4,10 +4,10 @@ on:
  workflow_dispatch:
  schedule:
    - cron: "0 0 * * *"
  push:
    branches:
      - main
  pull_request:
    paths:
      - 'src/benchmark_runners/cpu/update_llm_perf_cpu_openvino.py'
      - 'src/common/**'

concurrency:
  cancel-in-progress: true
6 changes: 3 additions & 3 deletions .github/workflows/update_llm_perf_cpu_pytorch.yaml
@@ -4,10 +4,10 @@ on:
  workflow_dispatch:
  schedule:
    - cron: "0 0 * * *"
  push:
    branches:
      - main
  pull_request:
    paths:
      - 'src/benchmark_runners/cpu/update_llm_perf_cpu_pytorch.py'
      - 'src/common/**'

concurrency:
  cancel-in-progress: true
6 changes: 3 additions & 3 deletions .github/workflows/update_llm_perf_cuda_pytorch.yaml
@@ -4,10 +4,10 @@ on:
  workflow_dispatch:
  schedule:
    - cron: "0 0 * * *"
  push:
    branches:
      - main
  pull_request:
    paths:
      - 'src/benchmark_runners/cuda/update_llm_perf_cuda_pytorch.py'
      - 'src/common/**'

concurrency:
  cancel-in-progress: true
8 changes: 5 additions & 3 deletions .github/workflows/update_llm_perf_leaderboard.yaml
@@ -4,10 +4,12 @@ on:
  workflow_dispatch:
  schedule:
    - cron: "0 */6 * * *"
  push:
    branches:
      - main
  pull_request:
    paths:
      - 'src/update_llm_perf_leaderboard.py'
      - 'src/hardware.yaml'
      - 'src/common/**'


concurrency:
  cancel-in-progress: true
63 changes: 63 additions & 0 deletions .github/workflows/update_llm_perf_rocm_pytorch.yaml
@@ -0,0 +1,63 @@
name: Update LLM Perf Benchmarks - ROCm PyTorch

on:
  workflow_dispatch:
  schedule:
    - cron: "0 0 * * *"
  pull_request:
    paths:
      - 'src/benchmark_runners/rocm/update_llm_perf_rocm_pytorch.py'
      - 'src/common/**'

concurrency:
  cancel-in-progress: true
  group: ${{ github.workflow }}-${{ github.ref }}

env:
  IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-rocm

jobs:
  run_benchmarks:
    strategy:
      fail-fast: false
      matrix:
        subset: [unquantized]
        # subset: [unquantized, bnb, awq, gptq]

        machine:
          [
            { name: 1xA10, runs-on: { group: "aws-g5-4xlarge-plus" } },
            # { name: 1xT4, runs-on: { group: "aws-g4dn-2xlarge" } },
          ]

    runs-on: ${{ matrix.machine.runs-on }}

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Run benchmarks
        uses: addnab/docker-run-action@v3
        env:
          SUBSET: ${{ matrix.subset }}
          MACHINE: ${{ matrix.machine.name }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        with:
          image: ${{ env.IMAGE }}
          options: |
            --rm
            --gpus all
            --shm-size 64G
            --env SUBSET
            --env MACHINE
            --env HF_TOKEN
            --env MKL_THREADING_LAYER=GNU
            --env HF_HUB_ENABLE_HF_TRANSFER=1
            --volume ${{ github.workspace }}:/workspace
            --workdir /workspace
          run: |
            pip install packaging && pip install flash-attn einops scipy auto-gptq optimum bitsandbytes autoawq codecarbon
            pip install -U transformers huggingface_hub[hf_transfer]
            pip install git+https://github.com/huggingface/optimum-benchmark.git
            pip install -e .
            python src/benchmark_runners/rocm/update_llm_perf_rocm_pytorch.py
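The workflow forwards SUBSET, MACHINE and HF_TOKEN into the benchmark container, and the runner reads them back from the environment (the SUBSET assertion in the new runner below relies on this). As a rough, hypothetical pre-flight check, not part of this PR and assuming only that these variables are consumed via os.environ, the contract could be verified like so:

```python
import os

# Hypothetical pre-flight check mirroring the variables the workflow injects
# into the container; the actual runner only asserts that SUBSET is set.
REQUIRED = ("SUBSET", "MACHINE", "HF_TOKEN")

missing = [name for name in REQUIRED if not os.environ.get(name)]
if missing:
    raise RuntimeError(f"missing environment variables: {', '.join(missing)}")

print(f"benchmarking subset={os.environ['SUBSET']} on machine={os.environ['MACHINE']}")
```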
5 changes: 5 additions & 0 deletions .gitignore
@@ -185,3 +185,8 @@ external_repos/
outputs/
.env
wip/

requirements.lock
requirements-dev.lock

pyrsmi
5 changes: 5 additions & 0 deletions README.md
@@ -1,5 +1,10 @@
# llm-perf-backend
The backend of [the LLM-perf leaderboard](https://huggingface.co/spaces/optimum/llm-perf-leaderboard)

# How to install
`rye sync`



## Why
This runs all the benchmarks that produce the results for the leaderboard.
1 change: 1 addition & 0 deletions pyproject.toml
@@ -7,6 +7,7 @@ authors = [
]
dependencies = [
"ruff>=0.6.8",
"optimum-benchmark @ git+https://github.com/huggingface/optimum-benchmark",
]
readme = "README.md"
requires-python = ">= 3.8"
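With optimum-benchmark now pinned from git in pyproject.toml, a quick smoke test after `rye sync` can confirm that the dependency and the config classes used by the new runner resolve. This is a hypothetical check, not part of the PR:

```python
# Hypothetical post-install smoke test: the imports below are the ones the new
# ROCm runner relies on, so they should resolve after `rye sync`.
import optimum_benchmark
from optimum_benchmark import PyTorchConfig
from optimum_benchmark.benchmark.config import BenchmarkConfig

print("optimum-benchmark loaded from", optimum_benchmark.__file__)
print("config classes:", PyTorchConfig.__name__, BenchmarkConfig.__name__)
```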
193 changes: 193 additions & 0 deletions src/benchmark_runners/rocm/update_llm_perf_rocm_pytorch.py
@@ -0,0 +1,193 @@
from itertools import product
from typing import Any, Dict, List

from optimum_benchmark import PyTorchConfig
from optimum_benchmark.benchmark.config import BenchmarkConfig
from optimum_benchmark.launchers.process.config import ProcessConfig
from optimum_benchmark.scenarios.inference.config import InferenceConfig

from src.common.benchmark_runner import LLMPerfBenchmarkManager
from src.common.utils import (
    CANONICAL_PRETRAINED_OPEN_LLM_LIST,
    GENERATE_KWARGS,
    INPUT_SHAPES,
)


class ROCmPyTorchBenchmarkRunner(LLMPerfBenchmarkManager):
    def __init__(self):
        super().__init__(backend="pytorch", device="rocm")

        self.attention_configs = self._get_attention_configs()
        assert self.subset is not None, "SUBSET environment variable must be set for benchmarking"
        self.weights_configs = self._get_weights_configs(self.subset)

    def get_list_of_benchmarks_to_run(self) -> List[Dict[str, Any]]:
        return [
            {
                "model": model,
                "attn_implementation": attn_impl,
                "weights_config": weights_cfg,
            }
            for model, attn_impl, weights_cfg in product(
                CANONICAL_PRETRAINED_OPEN_LLM_LIST,
                self.attention_configs,
                self.weights_configs.keys(),
            )
        ]

    def get_benchmark_name(self, model: str, **kwargs) -> str:
        weights_config = kwargs["weights_config"]
        attn_implementation = kwargs["attn_implementation"]
        return f"{model}-{weights_config}-{attn_implementation}-{self.backend}"

    def is_benchmark_supported(self, **kwargs) -> bool:
        # flash-attention kernels only support half precision, so skip float32 runs
        if kwargs["attn_implementation"] == "flash_attention_2" and kwargs["weights_config"] == "float32":
            return False
        return True

    def get_benchmark_config(self, model: str, **kwargs) -> BenchmarkConfig:
        weights_config = kwargs["weights_config"]
        attn_implementation = kwargs["attn_implementation"]

        assert (
            weights_config in self.weights_configs
        ), f"weights config '{weights_config}' is not defined; adjust _get_weights_configs to fix this issue"

        torch_dtype = self.weights_configs[weights_config]["torch_dtype"]
        quant_scheme = self.weights_configs[weights_config]["quant_scheme"]
        quant_config = self.weights_configs[weights_config]["quant_config"]

        launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="kill")
        scenario_config = InferenceConfig(
            memory=True,
            latency=True,
            duration=10,
            iterations=10,
            warmup_runs=10,
            input_shapes=INPUT_SHAPES,
            generate_kwargs=GENERATE_KWARGS,
        )
        backend_config = PyTorchConfig(
            model=model,
            no_weights=True,
            library="transformers",
            task="text-generation",
            torch_dtype=torch_dtype,
            quantization_scheme=quant_scheme,
            quantization_config=quant_config,
            attn_implementation=attn_implementation,
            model_kwargs={"trust_remote_code": True},
        )

        return BenchmarkConfig(
            name=f"{weights_config}-{attn_implementation}",
            scenario=scenario_config,
            launcher=launcher_config,
            backend=backend_config,
        )

    def _get_weights_configs(self, subset) -> Dict[str, Dict[str, Any]]:
        if subset == "unquantized":
            return {
                "float32": {
                    "torch_dtype": "float32",
                    "quant_scheme": None,
                    "quant_config": {},
                },
                "float16": {
                    "torch_dtype": "float16",
                    "quant_scheme": None,
                    "quant_config": {},
                },
                "bfloat16": {
                    "torch_dtype": "bfloat16",
                    "quant_scheme": None,
                    "quant_config": {},
                },
            }
        elif subset == "bnb":
            return {
                "4bit-bnb": {
                    "torch_dtype": "float16",
                    "quant_scheme": "bnb",
                    "quant_config": {"load_in_4bit": True},
                },
                "8bit-bnb": {
                    "torch_dtype": "float16",
                    "quant_scheme": "bnb",
                    "quant_config": {"load_in_8bit": True},
                },
            }
        elif subset == "gptq":
            return {
                "4bit-gptq-exllama-v1": {
                    "torch_dtype": "float16",
                    "quant_scheme": "gptq",
                    "quant_config": {
                        "bits": 4,
                        "use_exllama": True,
                        "version": 1,
                        "model_seqlen": 256,
                    },
                },
                "4bit-gptq-exllama-v2": {
                    "torch_dtype": "float16",
                    "quant_scheme": "gptq",
                    "quant_config": {
                        "bits": 4,
                        "use_exllama": True,
                        "version": 2,
                        "model_seqlen": 256,
                    },
                },
            }
        elif subset == "awq":
            return {
                "4bit-awq-gemm": {
                    "torch_dtype": "float16",
                    "quant_scheme": "awq",
                    "quant_config": {"bits": 4, "version": "gemm"},
                },
                "4bit-awq-gemv": {
                    "torch_dtype": "float16",
                    "quant_scheme": "awq",
                    "quant_config": {"bits": 4, "version": "gemv"},
                },
                "4bit-awq-exllama-v1": {
                    "torch_dtype": "float16",
                    "quant_scheme": "awq",
                    "quant_config": {
                        "bits": 4,
                        "version": "exllama",
                        "exllama_config": {
                            "version": 1,
                            "max_input_len": 64,
                            "max_batch_size": 1,
                        },
                    },
                },
                "4bit-awq-exllama-v2": {
                    "torch_dtype": "float16",
                    "quant_scheme": "awq",
                    "quant_config": {
                        "bits": 4,
                        "version": "exllama",
                        "exllama_config": {
                            "version": 2,
                            "max_input_len": 64,
                            "max_batch_size": 1,
                        },
                    },
                },
            }
        else:
            raise ValueError(f"Unknown subset: {subset}")

    def _get_attention_configs(self) -> List[str]:
        return ["eager", "sdpa", "flash_attention_2"]


if __name__ == "__main__":
    runner = ROCmPyTorchBenchmarkRunner()
    runner.run_benchmarks()
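For a sense of how many configurations the runner schedules, the cross product built in get_list_of_benchmarks_to_run combined with the is_benchmark_supported filter can be reproduced in isolation. The sketch below assumes the "unquantized" subset and a placeholder model list (the real runner iterates over CANONICAL_PRETRAINED_OPEN_LLM_LIST):

```python
from itertools import product

# Stand-alone sketch of the scheduling logic above: expand models x attention
# implementations x weights configs, then drop the one combination that
# is_benchmark_supported rejects (flash_attention_2 with float32).
models = ["example-org/example-7b"]  # placeholder model id
attention_configs = ["eager", "sdpa", "flash_attention_2"]
weights_configs = ["float32", "float16", "bfloat16"]  # the "unquantized" subset

def supported(attn_implementation: str, weights_config: str) -> bool:
    return not (attn_implementation == "flash_attention_2" and weights_config == "float32")

benchmarks = [
    {"model": m, "attn_implementation": a, "weights_config": w}
    for m, a, w in product(models, attention_configs, weights_configs)
    if supported(a, w)
]
print(f"{len(benchmarks)} of {len(models) * len(attention_configs) * len(weights_configs)} combinations will run")
```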