diff --git a/.github/workflows/update_llm_perf_cpu_onnxruntime.yaml b/.github/workflows/update_llm_perf_cpu_onnxruntime.yaml
index 4216161..8cf8e67 100644
--- a/.github/workflows/update_llm_perf_cpu_onnxruntime.yaml
+++ b/.github/workflows/update_llm_perf_cpu_onnxruntime.yaml
@@ -8,6 +8,9 @@ on:
     branches:
       - main
   pull_request:
+    paths:
+      - 'src/benchmark_runners/cpu/update_llm_perf_cpu_onnxruntime.py'
+      - 'src/common/**'
 
 concurrency:
   cancel-in-progress: true
@@ -55,5 +58,3 @@ jobs:
           pip install -e git+https://github.com/huggingface/optimum-benchmark.git#egg=optimum-benchmark[onnxruntime]
           pip install -e .
           python src/benchmark_runners/cpu/update_llm_perf_cpu_onnxruntime.py
-
-
diff --git a/.github/workflows/update_llm_perf_cpu_openvino.yaml b/.github/workflows/update_llm_perf_cpu_openvino.yaml
index d1b5377..42dccfa 100644
--- a/.github/workflows/update_llm_perf_cpu_openvino.yaml
+++ b/.github/workflows/update_llm_perf_cpu_openvino.yaml
@@ -4,10 +4,10 @@ on:
   workflow_dispatch:
   schedule:
     - cron: "0 0 * * *"
-  push:
-    branches:
-      - main
   pull_request:
+    paths:
+      - 'src/benchmark_runners/cpu/update_llm_perf_cpu_openvino.py'
+      - 'src/common/**'
 
 concurrency:
   cancel-in-progress: true
diff --git a/.github/workflows/update_llm_perf_cpu_pytorch.yaml b/.github/workflows/update_llm_perf_cpu_pytorch.yaml
index e79d5b2..cd82d4d 100644
--- a/.github/workflows/update_llm_perf_cpu_pytorch.yaml
+++ b/.github/workflows/update_llm_perf_cpu_pytorch.yaml
@@ -4,10 +4,10 @@ on:
   workflow_dispatch:
   schedule:
     - cron: "0 0 * * *"
-  push:
-    branches:
-      - main
   pull_request:
+    paths:
+      - 'src/benchmark_runners/cpu/update_llm_perf_cpu_pytorch.py'
+      - 'src/common/**'
 
 concurrency:
   cancel-in-progress: true
diff --git a/.github/workflows/update_llm_perf_cuda_pytorch.yaml b/.github/workflows/update_llm_perf_cuda_pytorch.yaml
index b133d48..2dc6ef1 100644
--- a/.github/workflows/update_llm_perf_cuda_pytorch.yaml
+++ b/.github/workflows/update_llm_perf_cuda_pytorch.yaml
@@ -4,10 +4,10 @@ on:
   workflow_dispatch:
   schedule:
     - cron: "0 0 * * *"
-  push:
-    branches:
-      - main
   pull_request:
+    paths:
+      - 'src/benchmark_runners/cuda/update_llm_perf_cuda_pytorch.py'
+      - 'src/common/**'
 
 concurrency:
   cancel-in-progress: true
diff --git a/.github/workflows/update_llm_perf_leaderboard.yaml b/.github/workflows/update_llm_perf_leaderboard.yaml
index 2be9c34..afcec02 100644
--- a/.github/workflows/update_llm_perf_leaderboard.yaml
+++ b/.github/workflows/update_llm_perf_leaderboard.yaml
@@ -4,10 +4,12 @@ on:
   workflow_dispatch:
   schedule:
     - cron: "0 */6 * * *"
-  push:
-    branches:
-      - main
   pull_request:
+    paths:
+      - 'src/update_llm_perf_leaderboard.py'
+      - 'src/hardware.yaml'
+      - 'src/common/**'
+
 concurrency:
   cancel-in-progress: true
diff --git a/.github/workflows/update_llm_perf_rocm_pytorch.yaml b/.github/workflows/update_llm_perf_rocm_pytorch.yaml
new file mode 100644
index 0000000..88357c8
--- /dev/null
+++ b/.github/workflows/update_llm_perf_rocm_pytorch.yaml
@@ -0,0 +1,63 @@
+name: Update LLM Perf Benchmarks - ROCm PyTorch
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: "0 0 * * *"
+  pull_request:
+    paths:
+      - 'src/benchmark_runners/rocm/update_llm_perf_rocm_pytorch.py'
+      - 'src/common/**'
+
+concurrency:
+  cancel-in-progress: true
+  group: ${{ github.workflow }}-${{ github.ref }}
+
+env:
+  IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-rocm
+
+jobs:
+  run_benchmarks:
+    strategy:
+      fail-fast: false
+      matrix:
+        subset: [unquantized]
+        # subset: [unquantized, bnb, awq, gptq]
+
+        machine:
+          [
+            { name: 1xA10, runs-on: { group: "aws-g5-4xlarge-plus" } },
+            # { name: 1xT4, runs-on: { group: "aws-g4dn-2xlarge" } },
+          ]
+
+    runs-on: ${{ matrix.machine.runs-on }}
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Run benchmarks
+        uses: addnab/docker-run-action@v3
+        env:
+          SUBSET: ${{ matrix.subset }}
+          MACHINE: ${{ matrix.machine.name }}
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        with:
+          image: ${{ env.IMAGE }}
+          options: |
+            --rm
+            --gpus all
+            --shm-size 64G
+            --env SUBSET
+            --env MACHINE
+            --env HF_TOKEN
+            --env MKL_THREADING_LAYER=GNU
+            --env HF_HUB_ENABLE_HF_TRANSFER=1
+            --volume ${{ github.workspace }}:/workspace
+            --workdir /workspace
+          run: |
+            pip install packaging && pip install flash-attn einops scipy auto-gptq optimum bitsandbytes autoawq codecarbon
+            pip install -U transformers huggingface_hub[hf_transfer]
+            pip install git+https://github.com/huggingface/optimum-benchmark.git
+            pip install -e .
+            python src/benchmark_runners/rocm/update_llm_perf_rocm_pytorch.py
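For context (not part of the patch): the workflow above injects `SUBSET`, `MACHINE`, and `HF_TOKEN` into the benchmark container as plain environment variables, and the runner script asserts that `SUBSET` is set. A minimal sketch of the environment handling this implies on the Python side; the variable names come from the workflow, everything else is illustrative:

```python
import os

# Exported by the workflow via `--env SUBSET --env MACHINE --env HF_TOKEN`.
subset = os.environ.get("SUBSET")    # e.g. "unquantized", "bnb", "gptq", "awq"
machine = os.environ.get("MACHINE")  # e.g. "1xA10"; labels results per runner

if subset is None:
    # Mirrors the assertion in ROCmPyTorchBenchmarkRunner.__init__ below.
    raise RuntimeError("SUBSET environment variable must be set for benchmarking")
```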
"aws-g5-4xlarge-plus" } }, + # { name: 1xT4, runs-on: { group: "aws-g4dn-2xlarge" } }, + ] + + runs-on: ${{ matrix.machine.runs-on }} + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Run benchmarks + uses: addnab/docker-run-action@v3 + env: + SUBSET: ${{ matrix.subset }} + MACHINE: ${{ matrix.machine.name }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} + with: + image: ${{ env.IMAGE }} + options: | + --rm + --gpus all + --shm-size 64G + --env SUBSET + --env MACHINE + --env HF_TOKEN + --env MKL_THREADING_LAYER=GNU + --env HF_HUB_ENABLE_HF_TRANSFER=1 + --volume ${{ github.workspace }}:/workspace + --workdir /workspace + run: | + pip install packaging && pip install flash-attn einops scipy auto-gptq optimum bitsandbytes autoawq codecarbon + pip install -U transformers huggingface_hub[hf_transfer] + pip install git+https://github.com/huggingface/optimum-benchmark.git + pip install -e . + python src/benchmark_runners/rocm/update_llm_perf_rocm_pytorch.py diff --git a/.gitignore b/.gitignore index 0325277..6211b82 100644 --- a/.gitignore +++ b/.gitignore @@ -185,3 +185,8 @@ external_repos/ outputs/ .env wip/ + +requirements.lock +requirements-dev.lock + +pyrsmi \ No newline at end of file diff --git a/README.md b/README.md index 2d6b1e1..648517e 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,10 @@ # llm-perf-backend The backend of [the LLM-perf leaderboard](https://huggingface.co/spaces/optimum/llm-perf-leaderboard) +# How to install +`rye sync` + + + ## Why this runs all the benchmarks to get results for the leaderboard \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index a46b676..9596c9b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,6 +7,7 @@ authors = [ ] dependencies = [ "ruff>=0.6.8", + "optimum-benchmark @ git+https://github.com/huggingface/optimum-benchmark", ] readme = "README.md" requires-python = ">= 3.8" diff --git a/src/benchmark_runners/rocm/update_llm_perf_rocm_pytorch.py b/src/benchmark_runners/rocm/update_llm_perf_rocm_pytorch.py new file mode 100644 index 0000000..3da802f --- /dev/null +++ b/src/benchmark_runners/rocm/update_llm_perf_rocm_pytorch.py @@ -0,0 +1,193 @@ +from itertools import product +from typing import Any, Dict, List + +from optimum_benchmark import PyTorchConfig +from optimum_benchmark.benchmark.config import BenchmarkConfig +from optimum_benchmark.launchers.process.config import ProcessConfig +from optimum_benchmark.scenarios.inference.config import InferenceConfig + +from src.common.benchmark_runner import LLMPerfBenchmarkManager +from src.common.utils import ( + CANONICAL_PRETRAINED_OPEN_LLM_LIST, + GENERATE_KWARGS, + INPUT_SHAPES, +) + + +class ROCmPyTorchBenchmarkRunner(LLMPerfBenchmarkManager): + def __init__(self): + super().__init__(backend="pytorch", device="rocm") + + self.attention_configs = self._get_attention_configs() + assert self.subset is not None, "SUBSET environment variable must be set for benchmarking" + self.weights_configs = self._get_weights_configs(self.subset) + + def get_list_of_benchmarks_to_run(self) -> List[Dict[str, Any]]: + return [ + { + "model": model, + "attn_implementation": attn_impl, + "weights_config": weights_cfg, + } + for model, attn_impl, weights_cfg in product( + CANONICAL_PRETRAINED_OPEN_LLM_LIST, + self.attention_configs, + self.weights_configs.keys(), + ) + ] + + def get_benchmark_name(self, model: str, **kwargs) -> str: + weights_config = kwargs["weights_config"] + attn_implementation = kwargs["attn_implementation"] + return 
f"{model}-{weights_config}-{attn_implementation}-{self.backend}" + + def is_benchmark_supported(self, **kwargs) -> bool: + if kwargs["attn_implementation"] == "flash_attention_2" and kwargs["weights_config"] == "float32": + return False + return True + + def get_benchmark_config(self, model: str, **kwargs) -> BenchmarkConfig: + weights_config = kwargs["weights_config"] + attn_implementation = kwargs["attn_implementation"] + + assert ( + weights_config in self.weights_configs + ), f"your config does contains the {weights_config}, adjust your _get_weights_configs to fix this issue" + + torch_dtype = self.weights_configs[weights_config]["torch_dtype"] + quant_scheme = self.weights_configs[weights_config]["quant_scheme"] + quant_config = self.weights_configs[weights_config]["quant_config"] + + launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="kill") + scenario_config = InferenceConfig( + memory=True, + latency=True, + duration=10, + iterations=10, + warmup_runs=10, + input_shapes=INPUT_SHAPES, + generate_kwargs=GENERATE_KWARGS, + ) + backend_config = PyTorchConfig( + model=model, + no_weights=True, + library="transformers", + task="text-generation", + torch_dtype=torch_dtype, + quantization_scheme=quant_scheme, + quantization_config=quant_config, + attn_implementation=attn_implementation, + model_kwargs={"trust_remote_code": True}, + ) + + return BenchmarkConfig( + name=f"{weights_config}-{attn_implementation}", + scenario=scenario_config, + launcher=launcher_config, + backend=backend_config, + ) + + def _get_weights_configs(self, subset) -> Dict[str, Dict[str, Any]]: + if subset == "unquantized": + return { + "float32": { + "torch_dtype": "float32", + "quant_scheme": None, + "quant_config": {}, + }, + "float16": { + "torch_dtype": "float16", + "quant_scheme": None, + "quant_config": {}, + }, + "bfloat16": { + "torch_dtype": "bfloat16", + "quant_scheme": None, + "quant_config": {}, + }, + } + elif subset == "bnb": + return { + "4bit-bnb": { + "torch_dtype": "float16", + "quant_scheme": "bnb", + "quant_config": {"load_in_4bit": True}, + }, + "8bit-bnb": { + "torch_dtype": "float16", + "quant_scheme": "bnb", + "quant_config": {"load_in_8bit": True}, + }, + } + elif subset == "gptq": + return { + "4bit-gptq-exllama-v1": { + "torch_dtype": "float16", + "quant_scheme": "gptq", + "quant_config": { + "bits": 4, + "use_exllama ": True, + "version": 1, + "model_seqlen": 256, + }, + }, + "4bit-gptq-exllama-v2": { + "torch_dtype": "float16", + "quant_scheme": "gptq", + "quant_config": { + "bits": 4, + "use_exllama ": True, + "version": 2, + "model_seqlen": 256, + }, + }, + } + elif subset == "awq": + return { + "4bit-awq-gemm": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": {"bits": 4, "version": "gemm"}, + }, + "4bit-awq-gemv": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": {"bits": 4, "version": "gemv"}, + }, + "4bit-awq-exllama-v1": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": { + "bits": 4, + "version": "exllama", + "exllama_config": { + "version": 1, + "max_input_len": 64, + "max_batch_size": 1, + }, + }, + }, + "4bit-awq-exllama-v2": { + "torch_dtype": "float16", + "quant_scheme": "awq", + "quant_config": { + "bits": 4, + "version": "exllama", + "exllama_config": { + "version": 2, + "max_input_len": 64, + "max_batch_size": 1, + }, + }, + }, + } + else: + raise ValueError(f"Unknown subset: {subset}") + + def _get_attention_configs(self) -> List[str]: + return ["eager", "sdpa", 
"flash_attention_2"] + + +if __name__ == "__main__": + runner = ROCmPyTorchBenchmarkRunner() + runner.run_benchmarks()