
Commit fa884a5

Merge pull request #8 from huggingface/make-backend-public
Make backend public
2 parents 9aabcf9 + 86dcef2 commit fa884a5

16 files changed: +108 −79 lines

.github/workflows/benchmark_cpu_onnxruntime.yaml

Lines changed: 1 addition & 4 deletions

@@ -3,10 +3,7 @@ name: Benchmark CPU Onnxruntime
 on:
   workflow_dispatch:
   schedule:
-    - cron: "0 0 * * *"
-  push:
-    branches:
-      - '*'
+    - cron: "0 12 * * *"
   pull_request:
 
 concurrency:
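The schedule change above moves the nightly trigger from midnight to noon UTC (and drops the push trigger). As a rough illustration only (this helper is not part of the repo), a standard 5-field cron expression can be split into named fields to make such schedule diffs easier to read:

```python
# Hypothetical helper (not in this repo): split a standard 5-field cron
# expression into named fields for side-by-side comparison.
FIELDS = ["minute", "hour", "day_of_month", "month", "day_of_week"]

def parse_cron(expr: str) -> dict:
    values = expr.split()
    if len(values) != 5:
        raise ValueError("expected a standard 5-field cron expression")
    return dict(zip(FIELDS, values))

old = parse_cron("0 0 * * *")   # removed schedule: daily at 00:00 UTC
new = parse_cron("0 12 * * *")  # new schedule: daily at 12:00 UTC
print(old["hour"], new["hour"])
```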

.github/workflows/benchmark_cpu_openvino.yaml

Lines changed: 0 additions & 3 deletions

@@ -4,9 +4,6 @@ on:
   workflow_dispatch:
   schedule:
     - cron: "0 0 * * *"
-  push:
-    branches:
-      - '*'
   pull_request:
 
 concurrency:

.github/workflows/benchmark_cpu_pytorch.yaml

Lines changed: 0 additions & 3 deletions

@@ -4,9 +4,6 @@ on:
   workflow_dispatch:
   schedule:
     - cron: "0 0 * * *"
-  push:
-    branches:
-      - '*'
   pull_request:
 
 concurrency:

.github/workflows/benchmark_cuda_pytorch.yaml

Lines changed: 2 additions & 5 deletions

@@ -3,10 +3,7 @@ name: Benchmark CUDA PyTorch
 on:
   workflow_dispatch:
   schedule:
-    - cron: "0 0 * * *"
-  push:
-    branches:
-      - '*'
+    - cron: "0 3 * * *"
   pull_request:
 
 concurrency:
@@ -33,7 +30,7 @@ jobs:
     strategy:
       fail-fast: false
      matrix:
-        subset: [unquantized, bnb, awq, gptq]
+        subset: [torchao]
 
        machine:
          [

.github/workflows/update_llm_perf_leaderboard.yaml

Lines changed: 1 addition & 1 deletion

@@ -3,7 +3,7 @@ name: Update LLM Perf Leaderboard
 on:
   workflow_dispatch:
   schedule:
-    - cron: "0 */6 * * *"
+    - cron: "0 0 * * *"
 push:
   branches:
     - main

.gitignore

Lines changed: 3 additions & 1 deletion

@@ -187,4 +187,6 @@ outputs/
 wip/
 
 *.csv
-optimum-benchmark/
+optimum-benchmark/
+
+*.egg-info/

Makefile

Lines changed: 7 additions & 11 deletions

@@ -1,5 +1,5 @@
 # Style and Quality checks
-.PHONY: style quality
+.PHONY: style quality install install-dev run_cpu_container run_cuda_container run_rocm_container cpu-pytorch-container cpu-openvino-container collector-container
 
 quality:
 	ruff check .
@@ -9,17 +9,13 @@ style:
 	ruff format .
 	ruff check --fix .
 
-.PHONY: install
-
 install:
 	pip install .
 
 install-dev:
 	DEBUG=1 uv pip install -e .
 
-# Running containers
-.PHONY: run_cpu_container run_cuda_container run_rocm_container
-
+# Running optimum-benchmark containers
 run_cpu_container:
 	docker run -it --rm --pid host --volume .:/llm-perf-backend --workdir /llm-perf-backend ghcr.io/huggingface/optimum-benchmark:latest-cpu
 
@@ -29,15 +25,15 @@ run_cuda_container:
 run_rocm_container:
 	docker run -it --rm --shm-size 64G --device /dev/kfd --device /dev/dri --volume .:/llm-perf-backend --workdir /llm-perf-backend ghcr.io/huggingface/optimum-benchmark:latest-rocm
 
+# Running llm-perf backend containers
 cpu-pytorch-container:
 	docker build -t cpu-pytorch -f docker/cpu-pytorch/Dockerfile .
-	# docker run -it --rm --pid host cpu-pytorch /bin/bash
 	docker run -it --rm --pid host cpu-pytorch
 
-collector-container:
-	docker build -t collector -f docker/collector/Dockerfile .
-	docker run -it --rm --pid host collector
-
 cpu-openvino-container:
 	docker build -t cpu-openvino -f docker/cpu-openvino/Dockerfile .
 	docker run -it --rm --pid host cpu-openvino
+
+collector-container:
+	docker build -t collector -f docker/collector/Dockerfile .
+	docker run -it --rm --pid host collector

README.md

Lines changed: 73 additions & 10 deletions

@@ -1,15 +1,78 @@
-# llm-perf-backend
-The backend of [the LLM-perf leaderboard](https://huggingface.co/spaces/optimum/llm-perf-leaderboard)
+# LLM-perf Backend 🏋️
 
-## Why
-this runs all the benchmarks to get results for the leaderboard
+The official backend system powering the [LLM-perf Leaderboard](https://huggingface.co/spaces/optimum/llm-perf-leaderboard). This repository contains the infrastructure and tools needed to run standardized benchmarks for Large Language Models (LLMs) across different hardware configurations and optimization backends.
 
-## How to install
-git clone
-pip install -e .[openvino]
+## About 📝
 
-## How to use the cli
-llm-perf run-benchmark --hardware cpu --backend openvino
+LLM-perf Backend is designed to:
+- Run automated benchmarks for the LLM-perf leaderboard
+- Ensure consistent and reproducible performance measurements
+- Support multiple hardware configurations and optimization backends
+- Generate standardized performance metrics for latency, throughput, memory usage, and energy consumption
+
+## Key Features 🔑
+
+- Standardized benchmarking pipeline using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark)
+- Support for multiple hardware configurations (CPU, GPU)
+- Multiple backend implementations (PyTorch, Onnxruntime, etc.)
+- Automated metric collection:
+  - Latency and throughput measurements
+  - Memory usage tracking
+  - Energy consumption monitoring
+  - Quality metrics integration with Open LLM Leaderboard
+
+## Installation 🛠️
+
+1. Clone the repository:
+```bash
+git clone https://github.com/huggingface/llm-perf-backend
+cd llm-perf-backend
+```
+
+2. Create a Python environment:
+```bash
+python -m venv .venv
+source .venv/bin/activate
+```
+
+3. Install the package with required dependencies:
+```bash
+pip install -e "."
+# or
+pip install -e ".[all]"  # to install optional dependencies like Onnxruntime
+```
+
+## Usage 📋
+
+### Command Line Interface
+
+Run benchmarks using the CLI tool:
+
+```bash
 llm-perf run-benchmark --hardware cpu --backend pytorch
+```
+
+### Configuration Options
+
+View all the options with:
+```bash
+llm-perf run-benchmark --help
+```
+
+- `--hardware`: Target hardware platform (cpu, cuda)
+- `--backend`: Backend framework to use (pytorch, onnxruntime, etc.)
+
+## Benchmark Dataset 📊
+
+Results are published to the official dataset:
+[optimum-benchmark/llm-perf-leaderboard](https://huggingface.co/datasets/optimum-benchmark/llm-perf-leaderboard)
+
+## Benchmark Specifications 📑
 
-https://huggingface.co/datasets/optimum-benchmark/llm-perf-leaderboard
+All benchmarks follow these standardized settings:
+- Single GPU usage to avoid communication-dependent results
+- Energy monitoring via CodeCarbon
+- Memory tracking:
+  - Maximum allocated memory
+  - Maximum reserved memory
+  - Maximum used memory (via PyNVML for GPU)
llm_perf/benchmark_runners/cuda/update_llm_perf_cuda_pytorch.py

Lines changed: 11 additions & 0 deletions

@@ -191,6 +191,17 @@ def _get_weights_configs(self, subset) -> Dict[str, Dict[str, Any]]:
                 },
             },
         }
+        elif subset == "torchao":
+            return {
+                "torchao-int4wo-128": {
+                    "torch_dtype": "bfloat16",
+                    "quant_scheme": "torchao",
+                    "quant_config": {
+                        "quant_type": "int4_weight_only",
+                        "group_size": 128,
+                    },
+                },
+            }
         else:
             raise ValueError(f"Unknown subset: {subset}")
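Taken on its own, the new branch maps the `torchao` subset to a single named weight configuration. A minimal standalone sketch of that dispatch (simplified from `_get_weights_configs` above; the other subsets are elided here):

```python
from typing import Any, Dict

def get_weights_configs(subset: str) -> Dict[str, Dict[str, Any]]:
    # Simplified sketch: only the torchao branch added in this commit is
    # shown; the unquantized/bnb/awq/gptq branches are elided.
    if subset == "torchao":
        return {
            "torchao-int4wo-128": {
                "torch_dtype": "bfloat16",
                "quant_scheme": "torchao",
                "quant_config": {
                    "quant_type": "int4_weight_only",
                    "group_size": 128,
                },
            },
        }
    raise ValueError(f"Unknown subset: {subset}")
```

The config name encodes the scheme: int4 weight-only quantization with a group size of 128.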

llm_perf/hardware.yaml

Lines changed: 3 additions & 0 deletions

@@ -5,6 +5,7 @@
   - awq
   - bnb
   - gptq
+  - torchao
   backends:
   - pytorch
 
@@ -15,6 +16,7 @@
   - awq
   - bnb
   - gptq
+  - torchao
   backends:
   - pytorch
 
@@ -25,6 +27,7 @@
   - awq
   - bnb
   - gptq
+  - torchao
   backends:
   - pytorch
llm_perf/update_llm_perf_leaderboard.py

Lines changed: 6 additions & 7 deletions

@@ -4,7 +4,6 @@
 import pandas as pd
 from huggingface_hub import create_repo, snapshot_download, upload_file, repo_exists
 from optimum_benchmark import Benchmark
-import requests
 import json
 
 from llm_perf.common.hardware_config import load_hardware_configs
@@ -19,6 +18,7 @@
 PERF_DF = "perf-df-{backend}-{hardware}-{subset}-{machine}.csv"
 LLM_DF = "llm-df.csv"
 
+
 def patch_json(file):
     """
     Patch a JSON file by adding a 'stdev_' key with the same value as 'stdev' for all occurrences,
@@ -37,7 +37,7 @@ def patch_json(file):
     """
     with open(file, "r") as f:
         data = json.load(f)
-    
+
     def add_stdev_(obj):
         if isinstance(obj, dict):
             new_items = []
@@ -53,10 +53,11 @@ def add_stdev_(obj):
                 add_stdev_(item)
 
     add_stdev_(data)
-    
+
     with open(file, "w") as f:
         json.dump(data, f, indent=4)
 
+
 def gather_benchmarks(subset: str, machine: str, backend: str, hardware: str):
     """
     Gather the benchmarks for a given machine
@@ -99,7 +100,6 @@ def gather_benchmarks(subset: str, machine: str, backend: str, hardware: str):
     # return response.status_code == 200
 
 
-
 def update_perf_dfs():
     """
     Update the performance dataframes for all machines
@@ -116,19 +116,18 @@ def update_perf_dfs():
                 backend,
                 hardware_config.hardware,
             )
-        except Exception as e:
+        except Exception:
             print("Dataset not found for:")
             print(f"  • Backend: {backend}")
             print(f"  • Subset: {subset}")
             print(f"  • Machine: {hardware_config.machine}")
             print(f"  • Hardware Type: {hardware_config.hardware}")
             url = f"{PERF_REPO_ID.format(subset=subset, machine=hardware_config.machine, backend=backend, hardware=hardware_config.hardware)}"
-            
+
             does_exist = repo_exists(url, repo_type="dataset")
 
             if does_exist:
                 print(f"Dataset exists: {url} but could not be processed")
-
 
 
 scrapping_script = """
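Per its docstring, `patch_json` mirrors every `stdev` key as `stdev_` throughout a nested JSON document. A minimal self-contained sketch of that traversal (my own simplified version; the repo's `add_stdev_` helper may differ in its exact bookkeeping):

```python
import json

def add_stdev_(obj):
    # Recursively walk dicts and lists, mirroring every "stdev" key as
    # "stdev_" with the same value (sketch of patch_json's helper).
    if isinstance(obj, dict):
        for key in list(obj.keys()):  # snapshot keys before mutating
            if key == "stdev" and "stdev_" not in obj:
                obj["stdev_"] = obj[key]
            add_stdev_(obj[key])
    elif isinstance(obj, list):
        for item in obj:
            add_stdev_(item)

data = {"latency": {"stdev": 0.1, "values": [{"stdev": 0.2}]}}
add_stdev_(data)
print(json.dumps(data))
```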

optimum-benchmark

Lines changed: 0 additions & 1 deletion
This file was deleted.

pyproject.toml.bak

Lines changed: 0 additions & 29 deletions
This file was deleted.

setup.py

Lines changed: 1 addition & 0 deletions

@@ -32,6 +32,7 @@
         "auto-gptq",
         "bitsandbytes",
         "autoawq",
+        "torchao",
     ],
 }
test.py

Whitespace-only changes.

test.sh

Lines changed: 0 additions & 4 deletions
This file was deleted.
