add amd to leaderboard #5

Closed · wants to merge 7 commits
5 changes: 3 additions & 2 deletions .github/workflows/update_llm_perf_cpu_onnxruntime.yaml
@@ -8,6 +8,9 @@ on:
    branches:
      - main
  pull_request:
    paths:
      - 'src/benchmark_runners/cpu/update_llm_perf_cpu_onnxruntime.py'
      - 'src/common/**'

concurrency:
  cancel-in-progress: true
@@ -55,5 +58,3 @@ jobs:
          pip install -e git+https://github.com/huggingface/optimum-benchmark.git#egg=optimum-benchmark[onnxruntime]
          pip install -e .
          python src/benchmark_runners/cpu/update_llm_perf_cpu_onnxruntime.py


6 changes: 3 additions & 3 deletions .github/workflows/update_llm_perf_cpu_openvino.yaml
@@ -4,10 +4,10 @@ on:
  workflow_dispatch:
  schedule:
    - cron: "0 0 * * *"
  push:
    branches:
      - main
  pull_request:
    paths:
      - 'src/benchmark_runners/cpu/update_llm_perf_cpu_openvino.py'
      - 'src/common/**'

concurrency:
  cancel-in-progress: true
6 changes: 3 additions & 3 deletions .github/workflows/update_llm_perf_cpu_pytorch.yaml
@@ -4,10 +4,10 @@ on:
  workflow_dispatch:
  schedule:
    - cron: "0 0 * * *"
  push:
    branches:
      - main
  pull_request:
    paths:
      - 'src/benchmark_runners/cpu/update_llm_perf_cpu_pytorch.py'
      - 'src/common/**'

concurrency:
  cancel-in-progress: true
6 changes: 3 additions & 3 deletions .github/workflows/update_llm_perf_cuda_pytorch.yaml
@@ -4,10 +4,10 @@ on:
  workflow_dispatch:
  schedule:
    - cron: "0 0 * * *"
  push:
    branches:
      - main
  pull_request:
    paths:
      - 'src/benchmark_runners/cuda/update_llm_perf_cuda_pytorch.py'
      - 'src/common/**'

concurrency:
  cancel-in-progress: true
8 changes: 5 additions & 3 deletions .github/workflows/update_llm_perf_leaderboard.yaml
@@ -4,10 +4,12 @@ on:
  workflow_dispatch:
  schedule:
    - cron: "0 */6 * * *"
  push:
    branches:
      - main
  pull_request:
    paths:
      - 'src/update_llm_perf_leaderboard.py'
      - 'src/hardware.yaml'
      - 'src/common/**'


concurrency:
  cancel-in-progress: true
63 changes: 63 additions & 0 deletions .github/workflows/update_llm_perf_rocm_pytorch.yaml
@@ -0,0 +1,63 @@
name: Update LLM Perf Benchmarks - ROCm PyTorch

on:
  workflow_dispatch:
  schedule:
    - cron: "0 0 * * *"
  pull_request:
    paths:
      - 'src/benchmark_runners/rocm/update_llm_perf_rocm_pytorch.py'
      - 'src/common/**'

concurrency:
  cancel-in-progress: true
  group: ${{ github.workflow }}-${{ github.ref }}

env:
  IMAGE: ghcr.io/huggingface/optimum-benchmark:latest-rocm

jobs:
  run_benchmarks:
    strategy:
      fail-fast: false
      matrix:
        subset: [unquantized]
        # subset: [unquantized, bnb, awq, gptq]

        machine:
          [
            { name: 1xA10, runs-on: { group: "aws-g5-4xlarge-plus" } },
            # { name: 1xT4, runs-on: { group: "aws-g4dn-2xlarge" } },
          ]

    runs-on: ${{ matrix.machine.runs-on }}

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Run benchmarks
        uses: addnab/docker-run-action@v3
        env:
          SUBSET: ${{ matrix.subset }}
          MACHINE: ${{ matrix.machine.name }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        with:
          image: ${{ env.IMAGE }}
          options: |
            --rm
            --gpus all
            --shm-size 64G
            --env SUBSET
            --env MACHINE
            --env HF_TOKEN
            --env MKL_THREADING_LAYER=GNU
            --env HF_HUB_ENABLE_HF_TRANSFER=1
            --volume ${{ github.workspace }}:/workspace
            --workdir /workspace
          run: |
            pip install packaging && pip install flash-attn einops scipy auto-gptq optimum bitsandbytes autoawq codecarbon
            pip install -U transformers huggingface_hub[hf_transfer]
            pip install git+https://github.com/huggingface/optimum-benchmark.git
            pip install -e .
            python src/benchmark_runners/rocm/update_llm_perf_rocm_pytorch.py
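The workflow forwards SUBSET, MACHINE and HF_TOKEN into the benchmark container, and the runner reads them back from the environment (the SUBSET assertion in the new runner below relies on this). As a rough, hypothetical pre-flight check, not part of this PR and assuming only that these variables are consumed via os.environ, the contract could be verified like so:

```python
import os

# Hypothetical pre-flight check mirroring the variables the workflow injects
# into the container; the actual runner only asserts that SUBSET is set.
REQUIRED = ("SUBSET", "MACHINE", "HF_TOKEN")

missing = [name for name in REQUIRED if not os.environ.get(name)]
if missing:
    raise RuntimeError(f"missing environment variables: {', '.join(missing)}")

print(f"benchmarking subset={os.environ['SUBSET']} on machine={os.environ['MACHINE']}")
```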
5 changes: 5 additions & 0 deletions .gitignore
@@ -185,3 +185,8 @@ external_repos/
outputs/
.env
wip/

requirements.lock
requirements-dev.lock

pyrsmi
5 changes: 5 additions & 0 deletions README.md
@@ -1,5 +1,10 @@
# llm-perf-backend
The backend of [the LLM-perf leaderboard](https://huggingface.co/spaces/optimum/llm-perf-leaderboard)

# How to install
`rye sync`



## Why
This runs all the benchmarks that produce the results for the leaderboard.
1 change: 1 addition & 0 deletions pyproject.toml
@@ -7,6 +7,7 @@ authors = [
]
dependencies = [
"ruff>=0.6.8",
"optimum-benchmark @ git+https://github.com/huggingface/optimum-benchmark",
]
readme = "README.md"
requires-python = ">= 3.8"
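With optimum-benchmark now pinned from git in pyproject.toml, a quick smoke test after `rye sync` can confirm that the dependency and the config classes used by the new runner resolve. This is a hypothetical check, not part of the PR:

```python
# Hypothetical post-install smoke test: the imports below are the ones the new
# ROCm runner relies on, so they should resolve after `rye sync`.
import optimum_benchmark
from optimum_benchmark import PyTorchConfig
from optimum_benchmark.benchmark.config import BenchmarkConfig

print("optimum-benchmark loaded from", optimum_benchmark.__file__)
print("config classes:", PyTorchConfig.__name__, BenchmarkConfig.__name__)
```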
193 changes: 193 additions & 0 deletions src/benchmark_runners/rocm/update_llm_perf_rocm_pytorch.py
@@ -0,0 +1,193 @@
from itertools import product
from typing import Any, Dict, List

from optimum_benchmark import PyTorchConfig
from optimum_benchmark.benchmark.config import BenchmarkConfig
from optimum_benchmark.launchers.process.config import ProcessConfig
from optimum_benchmark.scenarios.inference.config import InferenceConfig

from src.common.benchmark_runner import LLMPerfBenchmarkManager
from src.common.utils import (
    CANONICAL_PRETRAINED_OPEN_LLM_LIST,
    GENERATE_KWARGS,
    INPUT_SHAPES,
)


class ROCmPyTorchBenchmarkRunner(LLMPerfBenchmarkManager):
    def __init__(self):
        super().__init__(backend="pytorch", device="rocm")

        self.attention_configs = self._get_attention_configs()
        assert self.subset is not None, "SUBSET environment variable must be set for benchmarking"
        self.weights_configs = self._get_weights_configs(self.subset)

    def get_list_of_benchmarks_to_run(self) -> List[Dict[str, Any]]:
        return [
            {
                "model": model,
                "attn_implementation": attn_impl,
                "weights_config": weights_cfg,
            }
            for model, attn_impl, weights_cfg in product(
                CANONICAL_PRETRAINED_OPEN_LLM_LIST,
                self.attention_configs,
                self.weights_configs.keys(),
            )
        ]

    def get_benchmark_name(self, model: str, **kwargs) -> str:
        weights_config = kwargs["weights_config"]
        attn_implementation = kwargs["attn_implementation"]
        return f"{model}-{weights_config}-{attn_implementation}-{self.backend}"

    def is_benchmark_supported(self, **kwargs) -> bool:
        # flash-attention kernels only support half precision, so skip float32 runs
        if kwargs["attn_implementation"] == "flash_attention_2" and kwargs["weights_config"] == "float32":
            return False
        return True

    def get_benchmark_config(self, model: str, **kwargs) -> BenchmarkConfig:
        weights_config = kwargs["weights_config"]
        attn_implementation = kwargs["attn_implementation"]

        assert (
            weights_config in self.weights_configs
        ), f"weights config '{weights_config}' is not defined; adjust _get_weights_configs to fix this issue"

        torch_dtype = self.weights_configs[weights_config]["torch_dtype"]
        quant_scheme = self.weights_configs[weights_config]["quant_scheme"]
        quant_config = self.weights_configs[weights_config]["quant_config"]

        launcher_config = ProcessConfig(device_isolation=True, device_isolation_action="kill")
        scenario_config = InferenceConfig(
            memory=True,
            latency=True,
            duration=10,
            iterations=10,
            warmup_runs=10,
            input_shapes=INPUT_SHAPES,
            generate_kwargs=GENERATE_KWARGS,
        )
        backend_config = PyTorchConfig(
            model=model,
            no_weights=True,
            library="transformers",
            task="text-generation",
            torch_dtype=torch_dtype,
            quantization_scheme=quant_scheme,
            quantization_config=quant_config,
            attn_implementation=attn_implementation,
            model_kwargs={"trust_remote_code": True},
        )

        return BenchmarkConfig(
            name=f"{weights_config}-{attn_implementation}",
            scenario=scenario_config,
            launcher=launcher_config,
            backend=backend_config,
        )

    def _get_weights_configs(self, subset) -> Dict[str, Dict[str, Any]]:
        if subset == "unquantized":
            return {
                "float32": {
                    "torch_dtype": "float32",
                    "quant_scheme": None,
                    "quant_config": {},
                },
                "float16": {
                    "torch_dtype": "float16",
                    "quant_scheme": None,
                    "quant_config": {},
                },
                "bfloat16": {
                    "torch_dtype": "bfloat16",
                    "quant_scheme": None,
                    "quant_config": {},
                },
            }
        elif subset == "bnb":
            return {
                "4bit-bnb": {
                    "torch_dtype": "float16",
                    "quant_scheme": "bnb",
                    "quant_config": {"load_in_4bit": True},
                },
                "8bit-bnb": {
                    "torch_dtype": "float16",
                    "quant_scheme": "bnb",
                    "quant_config": {"load_in_8bit": True},
                },
            }
        elif subset == "gptq":
            return {
                "4bit-gptq-exllama-v1": {
                    "torch_dtype": "float16",
                    "quant_scheme": "gptq",
                    "quant_config": {
                        "bits": 4,
                        "use_exllama": True,
                        "version": 1,
                        "model_seqlen": 256,
                    },
                },
                "4bit-gptq-exllama-v2": {
                    "torch_dtype": "float16",
                    "quant_scheme": "gptq",
                    "quant_config": {
                        "bits": 4,
                        "use_exllama": True,
                        "version": 2,
                        "model_seqlen": 256,
                    },
                },
            }
        elif subset == "awq":
            return {
                "4bit-awq-gemm": {
                    "torch_dtype": "float16",
                    "quant_scheme": "awq",
                    "quant_config": {"bits": 4, "version": "gemm"},
                },
                "4bit-awq-gemv": {
                    "torch_dtype": "float16",
                    "quant_scheme": "awq",
                    "quant_config": {"bits": 4, "version": "gemv"},
                },
                "4bit-awq-exllama-v1": {
                    "torch_dtype": "float16",
                    "quant_scheme": "awq",
                    "quant_config": {
                        "bits": 4,
                        "version": "exllama",
                        "exllama_config": {
                            "version": 1,
                            "max_input_len": 64,
                            "max_batch_size": 1,
                        },
                    },
                },
                "4bit-awq-exllama-v2": {
                    "torch_dtype": "float16",
                    "quant_scheme": "awq",
                    "quant_config": {
                        "bits": 4,
                        "version": "exllama",
                        "exllama_config": {
                            "version": 2,
                            "max_input_len": 64,
                            "max_batch_size": 1,
                        },
                    },
                },
            }
        else:
            raise ValueError(f"Unknown subset: {subset}")

    def _get_attention_configs(self) -> List[str]:
        return ["eager", "sdpa", "flash_attention_2"]


if __name__ == "__main__":
    runner = ROCmPyTorchBenchmarkRunner()
    runner.run_benchmarks()
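For a sense of how many configurations the runner schedules, the cross product built in get_list_of_benchmarks_to_run combined with the is_benchmark_supported filter can be reproduced in isolation. The sketch below assumes the "unquantized" subset and a placeholder model list (the real runner iterates over CANONICAL_PRETRAINED_OPEN_LLM_LIST):

```python
from itertools import product

# Stand-alone sketch of the scheduling logic above: expand models x attention
# implementations x weights configs, then drop the one combination that
# is_benchmark_supported rejects (flash_attention_2 with float32).
models = ["example-org/example-7b"]  # placeholder model id
attention_configs = ["eager", "sdpa", "flash_attention_2"]
weights_configs = ["float32", "float16", "bfloat16"]  # the "unquantized" subset

def supported(attn_implementation: str, weights_config: str) -> bool:
    return not (attn_implementation == "flash_attention_2" and weights_config == "float32")

benchmarks = [
    {"model": m, "attn_implementation": a, "weights_config": w}
    for m, a, w in product(models, attention_configs, weights_configs)
    if supported(a, w)
]
print(f"{len(benchmarks)} of {len(models) * len(attention_configs) * len(weights_configs)} combinations will run")
```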