diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index ff915e046946..1fb02b86fe96 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -13,15 +13,15 @@ env: MKL_NUM_THREADS: 8 jobs: - torch_pipelines_cuda_benchmark_tests: + torch_models_cuda_benchmark_tests: env: SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL_BENCHMARK }} - name: Torch Core Pipelines CUDA Benchmarking Tests + name: Torch Core Models CUDA Benchmarking Tests strategy: fail-fast: false max-parallel: 1 runs-on: - group: aws-g6-4xlarge-plus + group: aws-g6e-xlarge-plus container: image: diffusers/diffusers-pytorch-compile-cuda options: --shm-size "16gb" --ipc host --gpus 0 @@ -37,8 +37,9 @@ jobs: run: | python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] - python -m uv pip install pandas peft - python -m uv pip uninstall transformers && python -m uv pip install transformers==4.48.0 + python -m uv pip install pandas peft torchprofile + # Temporary. + pip install --force-reinstall https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl - name: Environment run: | python utils/print_env.py @@ -47,8 +48,8 @@ jobs: HF_TOKEN: ${{ secrets.DIFFUSERS_BOT_TOKEN }} BASE_PATH: benchmark_outputs run: | - export TOTAL_GPU_MEMORY=$(python -c "import torch; print(torch.cuda.get_device_properties(0).total_memory / (1024**3))") - cd benchmarks && mkdir ${BASE_PATH} && python run_all.py && python push_results.py + cd benchmarks && python run_all.py && python push_results.py + mkdir ${BASE_PATH} && mv *.csv ${BASE_PATH} - name: Test suite reports artifacts if: ${{ always() }} diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/benchmarks/base_classes.py b/benchmarks/base_classes.py deleted file mode 100644 index 45bf65c93c93..000000000000 --- a/benchmarks/base_classes.py +++ /dev/null @@ -1,346 +0,0 @@ -import os -import sys - -import torch - -from diffusers import ( - AutoPipelineForImage2Image, - AutoPipelineForInpainting, - AutoPipelineForText2Image, - ControlNetModel, - LCMScheduler, - StableDiffusionAdapterPipeline, - StableDiffusionControlNetPipeline, - StableDiffusionXLAdapterPipeline, - StableDiffusionXLControlNetPipeline, - T2IAdapter, - WuerstchenCombinedPipeline, -) -from diffusers.utils import load_image - - -sys.path.append(".") - -from utils import ( # noqa: E402 - BASE_PATH, - PROMPT, - BenchmarkInfo, - benchmark_fn, - bytes_to_giga_bytes, - flush, - generate_csv_dict, - write_to_csv, -) - - -RESOLUTION_MAPPING = { - "Lykon/DreamShaper": (512, 512), - "lllyasviel/sd-controlnet-canny": (512, 512), - "diffusers/controlnet-canny-sdxl-1.0": (1024, 1024), - "TencentARC/t2iadapter_canny_sd14v1": (512, 512), - "TencentARC/t2i-adapter-canny-sdxl-1.0": (1024, 1024), - "stabilityai/stable-diffusion-2-1": (768, 768), - "stabilityai/stable-diffusion-xl-base-1.0": (1024, 1024), - "stabilityai/stable-diffusion-xl-refiner-1.0": (1024, 1024), - "stabilityai/sdxl-turbo": (512, 512), -} - - -class BaseBenchmak: - pipeline_class = None - - def __init__(self, args): - super().__init__() - - def run_inference(self, args): - raise NotImplementedError - - def benchmark(self, args): - raise NotImplementedError - - def get_result_filepath(self, args): - pipeline_class_name = str(self.pipe.__class__.__name__) - name = ( - args.ckpt.replace("/", "_") - + "_" - + 
pipeline_class_name - + f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv" - ) - filepath = os.path.join(BASE_PATH, name) - return filepath - - -class TextToImageBenchmark(BaseBenchmak): - pipeline_class = AutoPipelineForText2Image - - def __init__(self, args): - pipe = self.pipeline_class.from_pretrained(args.ckpt, torch_dtype=torch.float16) - pipe = pipe.to("cuda") - - if args.run_compile: - if not isinstance(pipe, WuerstchenCombinedPipeline): - pipe.unet.to(memory_format=torch.channels_last) - print("Run torch compile") - pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) - - if hasattr(pipe, "movq") and getattr(pipe, "movq", None) is not None: - pipe.movq.to(memory_format=torch.channels_last) - pipe.movq = torch.compile(pipe.movq, mode="reduce-overhead", fullgraph=True) - else: - print("Run torch compile") - pipe.decoder = torch.compile(pipe.decoder, mode="reduce-overhead", fullgraph=True) - pipe.vqgan = torch.compile(pipe.vqgan, mode="reduce-overhead", fullgraph=True) - - pipe.set_progress_bar_config(disable=True) - self.pipe = pipe - - def run_inference(self, pipe, args): - _ = pipe( - prompt=PROMPT, - num_inference_steps=args.num_inference_steps, - num_images_per_prompt=args.batch_size, - ) - - def benchmark(self, args): - flush() - - print(f"[INFO] {self.pipe.__class__.__name__}: Running benchmark with: {vars(args)}\n") - - time = benchmark_fn(self.run_inference, self.pipe, args) # in seconds. - memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. - benchmark_info = BenchmarkInfo(time=time, memory=memory) - - pipeline_class_name = str(self.pipe.__class__.__name__) - flush() - csv_dict = generate_csv_dict( - pipeline_cls=pipeline_class_name, ckpt=args.ckpt, args=args, benchmark_info=benchmark_info - ) - filepath = self.get_result_filepath(args) - write_to_csv(filepath, csv_dict) - print(f"Logs written to: {filepath}") - flush() - - -class TurboTextToImageBenchmark(TextToImageBenchmark): - def __init__(self, args): - super().__init__(args) - - def run_inference(self, pipe, args): - _ = pipe( - prompt=PROMPT, - num_inference_steps=args.num_inference_steps, - num_images_per_prompt=args.batch_size, - guidance_scale=0.0, - ) - - -class LCMLoRATextToImageBenchmark(TextToImageBenchmark): - lora_id = "latent-consistency/lcm-lora-sdxl" - - def __init__(self, args): - super().__init__(args) - self.pipe.load_lora_weights(self.lora_id) - self.pipe.fuse_lora() - self.pipe.unload_lora_weights() - self.pipe.scheduler = LCMScheduler.from_config(self.pipe.scheduler.config) - - def get_result_filepath(self, args): - pipeline_class_name = str(self.pipe.__class__.__name__) - name = ( - self.lora_id.replace("/", "_") - + "_" - + pipeline_class_name - + f"-bs@{args.batch_size}-steps@{args.num_inference_steps}-mco@{args.model_cpu_offload}-compile@{args.run_compile}.csv" - ) - filepath = os.path.join(BASE_PATH, name) - return filepath - - def run_inference(self, pipe, args): - _ = pipe( - prompt=PROMPT, - num_inference_steps=args.num_inference_steps, - num_images_per_prompt=args.batch_size, - guidance_scale=1.0, - ) - - def benchmark(self, args): - flush() - - print(f"[INFO] {self.pipe.__class__.__name__}: Running benchmark with: {vars(args)}\n") - - time = benchmark_fn(self.run_inference, self.pipe, args) # in seconds. - memory = bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) # in GBs. 
- benchmark_info = BenchmarkInfo(time=time, memory=memory) - - pipeline_class_name = str(self.pipe.__class__.__name__) - flush() - csv_dict = generate_csv_dict( - pipeline_cls=pipeline_class_name, ckpt=self.lora_id, args=args, benchmark_info=benchmark_info - ) - filepath = self.get_result_filepath(args) - write_to_csv(filepath, csv_dict) - print(f"Logs written to: {filepath}") - flush() - - -class ImageToImageBenchmark(TextToImageBenchmark): - pipeline_class = AutoPipelineForImage2Image - url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/1665_Girl_with_a_Pearl_Earring.jpg" - image = load_image(url).convert("RGB") - - def __init__(self, args): - super().__init__(args) - self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) - - def run_inference(self, pipe, args): - _ = pipe( - prompt=PROMPT, - image=self.image, - num_inference_steps=args.num_inference_steps, - num_images_per_prompt=args.batch_size, - ) - - -class TurboImageToImageBenchmark(ImageToImageBenchmark): - def __init__(self, args): - super().__init__(args) - - def run_inference(self, pipe, args): - _ = pipe( - prompt=PROMPT, - image=self.image, - num_inference_steps=args.num_inference_steps, - num_images_per_prompt=args.batch_size, - guidance_scale=0.0, - strength=0.5, - ) - - -class InpaintingBenchmark(ImageToImageBenchmark): - pipeline_class = AutoPipelineForInpainting - mask_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/overture-creations-5sI6fQgYIuo_mask.png" - mask = load_image(mask_url).convert("RGB") - - def __init__(self, args): - super().__init__(args) - self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) - self.mask = self.mask.resize(RESOLUTION_MAPPING[args.ckpt]) - - def run_inference(self, pipe, args): - _ = pipe( - prompt=PROMPT, - image=self.image, - mask_image=self.mask, - num_inference_steps=args.num_inference_steps, - num_images_per_prompt=args.batch_size, - ) - - -class IPAdapterTextToImageBenchmark(TextToImageBenchmark): - url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_neg_embed.png" - image = load_image(url) - - def __init__(self, args): - pipe = self.pipeline_class.from_pretrained(args.ckpt, torch_dtype=torch.float16).to("cuda") - pipe.load_ip_adapter( - args.ip_adapter_id[0], - subfolder="models" if "sdxl" not in args.ip_adapter_id[1] else "sdxl_models", - weight_name=args.ip_adapter_id[1], - ) - - if args.run_compile: - pipe.unet.to(memory_format=torch.channels_last) - print("Run torch compile") - pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) - - pipe.set_progress_bar_config(disable=True) - self.pipe = pipe - - def run_inference(self, pipe, args): - _ = pipe( - prompt=PROMPT, - ip_adapter_image=self.image, - num_inference_steps=args.num_inference_steps, - num_images_per_prompt=args.batch_size, - ) - - -class ControlNetBenchmark(TextToImageBenchmark): - pipeline_class = StableDiffusionControlNetPipeline - aux_network_class = ControlNetModel - root_ckpt = "Lykon/DreamShaper" - - url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_image_condition.png" - image = load_image(url).convert("RGB") - - def __init__(self, args): - aux_network = self.aux_network_class.from_pretrained(args.ckpt, torch_dtype=torch.float16) - pipe = self.pipeline_class.from_pretrained(self.root_ckpt, controlnet=aux_network, torch_dtype=torch.float16) - pipe = pipe.to("cuda") - - pipe.set_progress_bar_config(disable=True) - 
self.pipe = pipe - - if args.run_compile: - pipe.unet.to(memory_format=torch.channels_last) - pipe.controlnet.to(memory_format=torch.channels_last) - - print("Run torch compile") - pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) - pipe.controlnet = torch.compile(pipe.controlnet, mode="reduce-overhead", fullgraph=True) - - self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) - - def run_inference(self, pipe, args): - _ = pipe( - prompt=PROMPT, - image=self.image, - num_inference_steps=args.num_inference_steps, - num_images_per_prompt=args.batch_size, - ) - - -class ControlNetSDXLBenchmark(ControlNetBenchmark): - pipeline_class = StableDiffusionXLControlNetPipeline - root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0" - - def __init__(self, args): - super().__init__(args) - - -class T2IAdapterBenchmark(ControlNetBenchmark): - pipeline_class = StableDiffusionAdapterPipeline - aux_network_class = T2IAdapter - root_ckpt = "Lykon/DreamShaper" - - url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_for_adapter.png" - image = load_image(url).convert("L") - - def __init__(self, args): - aux_network = self.aux_network_class.from_pretrained(args.ckpt, torch_dtype=torch.float16) - pipe = self.pipeline_class.from_pretrained(self.root_ckpt, adapter=aux_network, torch_dtype=torch.float16) - pipe = pipe.to("cuda") - - pipe.set_progress_bar_config(disable=True) - self.pipe = pipe - - if args.run_compile: - pipe.unet.to(memory_format=torch.channels_last) - pipe.adapter.to(memory_format=torch.channels_last) - - print("Run torch compile") - pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True) - pipe.adapter = torch.compile(pipe.adapter, mode="reduce-overhead", fullgraph=True) - - self.image = self.image.resize(RESOLUTION_MAPPING[args.ckpt]) - - -class T2IAdapterSDXLBenchmark(T2IAdapterBenchmark): - pipeline_class = StableDiffusionXLAdapterPipeline - root_ckpt = "stabilityai/stable-diffusion-xl-base-1.0" - - url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/benchmarking/canny_for_adapter_sdxl.png" - image = load_image(url) - - def __init__(self, args): - super().__init__(args) diff --git a/benchmarks/benchmark_controlnet.py b/benchmarks/benchmark_controlnet.py deleted file mode 100644 index 9217004461dc..000000000000 --- a/benchmarks/benchmark_controlnet.py +++ /dev/null @@ -1,26 +0,0 @@ -import argparse -import sys - - -sys.path.append(".") -from base_classes import ControlNetBenchmark, ControlNetSDXLBenchmark # noqa: E402 - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--ckpt", - type=str, - default="lllyasviel/sd-controlnet-canny", - choices=["lllyasviel/sd-controlnet-canny", "diffusers/controlnet-canny-sdxl-1.0"], - ) - parser.add_argument("--batch_size", type=int, default=1) - parser.add_argument("--num_inference_steps", type=int, default=50) - parser.add_argument("--model_cpu_offload", action="store_true") - parser.add_argument("--run_compile", action="store_true") - args = parser.parse_args() - - benchmark_pipe = ( - ControlNetBenchmark(args) if args.ckpt == "lllyasviel/sd-controlnet-canny" else ControlNetSDXLBenchmark(args) - ) - benchmark_pipe.benchmark(args) diff --git a/benchmarks/benchmark_ip_adapters.py b/benchmarks/benchmark_ip_adapters.py deleted file mode 100644 index 9a31a21fc60d..000000000000 --- a/benchmarks/benchmark_ip_adapters.py +++ /dev/null @@ -1,33 +0,0 @@ -import argparse -import sys - - -sys.path.append(".") 
-from base_classes import IPAdapterTextToImageBenchmark # noqa: E402 - - -IP_ADAPTER_CKPTS = { - # because original SD v1.5 has been taken down. - "Lykon/DreamShaper": ("h94/IP-Adapter", "ip-adapter_sd15.bin"), - "stabilityai/stable-diffusion-xl-base-1.0": ("h94/IP-Adapter", "ip-adapter_sdxl.bin"), -} - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--ckpt", - type=str, - default="rstabilityai/stable-diffusion-xl-base-1.0", - choices=list(IP_ADAPTER_CKPTS.keys()), - ) - parser.add_argument("--batch_size", type=int, default=1) - parser.add_argument("--num_inference_steps", type=int, default=50) - parser.add_argument("--model_cpu_offload", action="store_true") - parser.add_argument("--run_compile", action="store_true") - args = parser.parse_args() - - args.ip_adapter_id = IP_ADAPTER_CKPTS[args.ckpt] - benchmark_pipe = IPAdapterTextToImageBenchmark(args) - args.ckpt = f"{args.ckpt} (IP-Adapter)" - benchmark_pipe.benchmark(args) diff --git a/benchmarks/benchmark_sd_img.py b/benchmarks/benchmark_sd_img.py deleted file mode 100644 index 772befe8795f..000000000000 --- a/benchmarks/benchmark_sd_img.py +++ /dev/null @@ -1,29 +0,0 @@ -import argparse -import sys - - -sys.path.append(".") -from base_classes import ImageToImageBenchmark, TurboImageToImageBenchmark # noqa: E402 - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--ckpt", - type=str, - default="Lykon/DreamShaper", - choices=[ - "Lykon/DreamShaper", - "stabilityai/stable-diffusion-2-1", - "stabilityai/stable-diffusion-xl-refiner-1.0", - "stabilityai/sdxl-turbo", - ], - ) - parser.add_argument("--batch_size", type=int, default=1) - parser.add_argument("--num_inference_steps", type=int, default=50) - parser.add_argument("--model_cpu_offload", action="store_true") - parser.add_argument("--run_compile", action="store_true") - args = parser.parse_args() - - benchmark_pipe = ImageToImageBenchmark(args) if "turbo" not in args.ckpt else TurboImageToImageBenchmark(args) - benchmark_pipe.benchmark(args) diff --git a/benchmarks/benchmark_sd_inpainting.py b/benchmarks/benchmark_sd_inpainting.py deleted file mode 100644 index 143adcb0d87c..000000000000 --- a/benchmarks/benchmark_sd_inpainting.py +++ /dev/null @@ -1,28 +0,0 @@ -import argparse -import sys - - -sys.path.append(".") -from base_classes import InpaintingBenchmark # noqa: E402 - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--ckpt", - type=str, - default="Lykon/DreamShaper", - choices=[ - "Lykon/DreamShaper", - "stabilityai/stable-diffusion-2-1", - "stabilityai/stable-diffusion-xl-base-1.0", - ], - ) - parser.add_argument("--batch_size", type=int, default=1) - parser.add_argument("--num_inference_steps", type=int, default=50) - parser.add_argument("--model_cpu_offload", action="store_true") - parser.add_argument("--run_compile", action="store_true") - args = parser.parse_args() - - benchmark_pipe = InpaintingBenchmark(args) - benchmark_pipe.benchmark(args) diff --git a/benchmarks/benchmark_t2i_adapter.py b/benchmarks/benchmark_t2i_adapter.py deleted file mode 100644 index 44b04b470ea6..000000000000 --- a/benchmarks/benchmark_t2i_adapter.py +++ /dev/null @@ -1,28 +0,0 @@ -import argparse -import sys - - -sys.path.append(".") -from base_classes import T2IAdapterBenchmark, T2IAdapterSDXLBenchmark # noqa: E402 - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--ckpt", - type=str, - 
default="TencentARC/t2iadapter_canny_sd14v1", - choices=["TencentARC/t2iadapter_canny_sd14v1", "TencentARC/t2i-adapter-canny-sdxl-1.0"], - ) - parser.add_argument("--batch_size", type=int, default=1) - parser.add_argument("--num_inference_steps", type=int, default=50) - parser.add_argument("--model_cpu_offload", action="store_true") - parser.add_argument("--run_compile", action="store_true") - args = parser.parse_args() - - benchmark_pipe = ( - T2IAdapterBenchmark(args) - if args.ckpt == "TencentARC/t2iadapter_canny_sd14v1" - else T2IAdapterSDXLBenchmark(args) - ) - benchmark_pipe.benchmark(args) diff --git a/benchmarks/benchmark_t2i_lcm_lora.py b/benchmarks/benchmark_t2i_lcm_lora.py deleted file mode 100644 index 957e0a463e28..000000000000 --- a/benchmarks/benchmark_t2i_lcm_lora.py +++ /dev/null @@ -1,23 +0,0 @@ -import argparse -import sys - - -sys.path.append(".") -from base_classes import LCMLoRATextToImageBenchmark # noqa: E402 - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--ckpt", - type=str, - default="stabilityai/stable-diffusion-xl-base-1.0", - ) - parser.add_argument("--batch_size", type=int, default=1) - parser.add_argument("--num_inference_steps", type=int, default=4) - parser.add_argument("--model_cpu_offload", action="store_true") - parser.add_argument("--run_compile", action="store_true") - args = parser.parse_args() - - benchmark_pipe = LCMLoRATextToImageBenchmark(args) - benchmark_pipe.benchmark(args) diff --git a/benchmarks/benchmark_text_to_image.py b/benchmarks/benchmark_text_to_image.py deleted file mode 100644 index ddc7fb2676a5..000000000000 --- a/benchmarks/benchmark_text_to_image.py +++ /dev/null @@ -1,40 +0,0 @@ -import argparse -import sys - - -sys.path.append(".") -from base_classes import TextToImageBenchmark, TurboTextToImageBenchmark # noqa: E402 - - -ALL_T2I_CKPTS = [ - "Lykon/DreamShaper", - "segmind/SSD-1B", - "stabilityai/stable-diffusion-xl-base-1.0", - "kandinsky-community/kandinsky-2-2-decoder", - "warp-ai/wuerstchen", - "stabilityai/sdxl-turbo", -] - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--ckpt", - type=str, - default="Lykon/DreamShaper", - choices=ALL_T2I_CKPTS, - ) - parser.add_argument("--batch_size", type=int, default=1) - parser.add_argument("--num_inference_steps", type=int, default=50) - parser.add_argument("--model_cpu_offload", action="store_true") - parser.add_argument("--run_compile", action="store_true") - args = parser.parse_args() - - benchmark_cls = None - if "turbo" in args.ckpt: - benchmark_cls = TurboTextToImageBenchmark - else: - benchmark_cls = TextToImageBenchmark - - benchmark_pipe = benchmark_cls(args) - benchmark_pipe.benchmark(args) diff --git a/benchmarks/benchmarking_flux.py b/benchmarks/benchmarking_flux.py new file mode 100644 index 000000000000..18a2680052ea --- /dev/null +++ b/benchmarks/benchmarking_flux.py @@ -0,0 +1,98 @@ +from functools import partial + +import torch +from benchmarking_utils import BenchmarkMixin, BenchmarkScenario, model_init_fn + +from diffusers import BitsAndBytesConfig, FluxTransformer2DModel +from diffusers.utils.testing_utils import torch_device + + +CKPT_ID = "black-forest-labs/FLUX.1-dev" +RESULT_FILENAME = "flux.csv" + + +def get_input_dict(**device_dtype_kwargs): + # resolution: 1024x1024 + # maximum sequence length 512 + hidden_states = torch.randn(1, 4096, 64, **device_dtype_kwargs) + encoder_hidden_states = torch.randn(1, 512, 4096, **device_dtype_kwargs) + pooled_prompt_embeds = 
torch.randn(1, 768, **device_dtype_kwargs) + image_ids = torch.ones(512, 3, **device_dtype_kwargs) + text_ids = torch.ones(4096, 3, **device_dtype_kwargs) + timestep = torch.tensor([1.0], **device_dtype_kwargs) + guidance = torch.tensor([1.0], **device_dtype_kwargs) + + return { + "hidden_states": hidden_states, + "encoder_hidden_states": encoder_hidden_states, + "img_ids": image_ids, + "txt_ids": text_ids, + "pooled_projections": pooled_prompt_embeds, + "timestep": timestep, + "guidance": guidance, + } + + +if __name__ == "__main__": + scenarios = [ + BenchmarkScenario( + name=f"{CKPT_ID}-bf16", + model_cls=FluxTransformer2DModel, + model_init_kwargs={ + "pretrained_model_name_or_path": CKPT_ID, + "torch_dtype": torch.bfloat16, + "subfolder": "transformer", + }, + get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16), + model_init_fn=model_init_fn, + compile_kwargs={"fullgraph": True}, + ), + BenchmarkScenario( + name=f"{CKPT_ID}-bnb-nf4", + model_cls=FluxTransformer2DModel, + model_init_kwargs={ + "pretrained_model_name_or_path": CKPT_ID, + "torch_dtype": torch.bfloat16, + "subfolder": "transformer", + "quantization_config": BitsAndBytesConfig( + load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type="nf4" + ), + }, + get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16), + model_init_fn=model_init_fn, + ), + BenchmarkScenario( + name=f"{CKPT_ID}-layerwise-upcasting", + model_cls=FluxTransformer2DModel, + model_init_kwargs={ + "pretrained_model_name_or_path": CKPT_ID, + "torch_dtype": torch.bfloat16, + "subfolder": "transformer", + }, + get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16), + model_init_fn=partial(model_init_fn, layerwise_upcasting=True), + ), + BenchmarkScenario( + name=f"{CKPT_ID}-group-offload-leaf", + model_cls=FluxTransformer2DModel, + model_init_kwargs={ + "pretrained_model_name_or_path": CKPT_ID, + "torch_dtype": torch.bfloat16, + "subfolder": "transformer", + }, + get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16), + model_init_fn=partial( + model_init_fn, + group_offload_kwargs={ + "onload_device": torch_device, + "offload_device": torch.device("cpu"), + "offload_type": "leaf_level", + "use_stream": True, + "non_blocking": True, + }, + ), + ), + ] + + runner = BenchmarkMixin() + runner.run_bencmarks_and_collate(scenarios, filename=RESULT_FILENAME) diff --git a/benchmarks/benchmarking_ltx.py b/benchmarks/benchmarking_ltx.py new file mode 100644 index 000000000000..3d698fd0bd57 --- /dev/null +++ b/benchmarks/benchmarking_ltx.py @@ -0,0 +1,80 @@ +from functools import partial + +import torch +from benchmarking_utils import BenchmarkMixin, BenchmarkScenario, model_init_fn + +from diffusers import LTXVideoTransformer3DModel +from diffusers.utils.testing_utils import torch_device + + +CKPT_ID = "Lightricks/LTX-Video-0.9.7-dev" +RESULT_FILENAME = "ltx.csv" + + +def get_input_dict(**device_dtype_kwargs): + # 512x704 (161 frames) + # `max_sequence_length`: 256 + hidden_states = torch.randn(1, 7392, 128, **device_dtype_kwargs) + encoder_hidden_states = torch.randn(1, 256, 4096, **device_dtype_kwargs) + encoder_attention_mask = torch.ones(1, 256, **device_dtype_kwargs) + timestep = torch.tensor([1.0], **device_dtype_kwargs) + video_coords = torch.randn(1, 3, 7392, **device_dtype_kwargs) + + return { + "hidden_states": hidden_states, + "encoder_hidden_states": encoder_hidden_states, + "encoder_attention_mask": 
encoder_attention_mask, + "timestep": timestep, + "video_coords": video_coords, + } + + +if __name__ == "__main__": + scenarios = [ + BenchmarkScenario( + name=f"{CKPT_ID}-bf16", + model_cls=LTXVideoTransformer3DModel, + model_init_kwargs={ + "pretrained_model_name_or_path": CKPT_ID, + "torch_dtype": torch.bfloat16, + "subfolder": "transformer", + }, + get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16), + model_init_fn=model_init_fn, + compile_kwargs={"fullgraph": True}, + ), + BenchmarkScenario( + name=f"{CKPT_ID}-layerwise-upcasting", + model_cls=LTXVideoTransformer3DModel, + model_init_kwargs={ + "pretrained_model_name_or_path": CKPT_ID, + "torch_dtype": torch.bfloat16, + "subfolder": "transformer", + }, + get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16), + model_init_fn=partial(model_init_fn, layerwise_upcasting=True), + ), + BenchmarkScenario( + name=f"{CKPT_ID}-group-offload-leaf", + model_cls=LTXVideoTransformer3DModel, + model_init_kwargs={ + "pretrained_model_name_or_path": CKPT_ID, + "torch_dtype": torch.bfloat16, + "subfolder": "transformer", + }, + get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16), + model_init_fn=partial( + model_init_fn, + group_offload_kwargs={ + "onload_device": torch_device, + "offload_device": torch.device("cpu"), + "offload_type": "leaf_level", + "use_stream": True, + "non_blocking": True, + }, + ), + ), + ] + + runner = BenchmarkMixin() + runner.run_bencmarks_and_collate(scenarios, filename=RESULT_FILENAME) diff --git a/benchmarks/benchmarking_sdxl.py b/benchmarks/benchmarking_sdxl.py new file mode 100644 index 000000000000..ded62784f290 --- /dev/null +++ b/benchmarks/benchmarking_sdxl.py @@ -0,0 +1,82 @@ +from functools import partial + +import torch +from benchmarking_utils import BenchmarkMixin, BenchmarkScenario, model_init_fn + +from diffusers import UNet2DConditionModel +from diffusers.utils.testing_utils import torch_device + + +CKPT_ID = "stabilityai/stable-diffusion-xl-base-1.0" +RESULT_FILENAME = "sdxl.csv" + + +def get_input_dict(**device_dtype_kwargs): + # height: 1024 + # width: 1024 + # max_sequence_length: 77 + hidden_states = torch.randn(1, 4, 128, 128, **device_dtype_kwargs) + encoder_hidden_states = torch.randn(1, 77, 2048, **device_dtype_kwargs) + timestep = torch.tensor([1.0], **device_dtype_kwargs) + added_cond_kwargs = { + "text_embeds": torch.randn(1, 1280, **device_dtype_kwargs), + "time_ids": torch.ones(1, 6, **device_dtype_kwargs), + } + + return { + "sample": hidden_states, + "encoder_hidden_states": encoder_hidden_states, + "timestep": timestep, + "added_cond_kwargs": added_cond_kwargs, + } + + +if __name__ == "__main__": + scenarios = [ + BenchmarkScenario( + name=f"{CKPT_ID}-bf16", + model_cls=UNet2DConditionModel, + model_init_kwargs={ + "pretrained_model_name_or_path": CKPT_ID, + "torch_dtype": torch.bfloat16, + "subfolder": "unet", + }, + get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16), + model_init_fn=model_init_fn, + compile_kwargs={"fullgraph": True}, + ), + BenchmarkScenario( + name=f"{CKPT_ID}-layerwise-upcasting", + model_cls=UNet2DConditionModel, + model_init_kwargs={ + "pretrained_model_name_or_path": CKPT_ID, + "torch_dtype": torch.bfloat16, + "subfolder": "unet", + }, + get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16), + model_init_fn=partial(model_init_fn, layerwise_upcasting=True), + ), + BenchmarkScenario( + 
name=f"{CKPT_ID}-group-offload-leaf", + model_cls=UNet2DConditionModel, + model_init_kwargs={ + "pretrained_model_name_or_path": CKPT_ID, + "torch_dtype": torch.bfloat16, + "subfolder": "unet", + }, + get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16), + model_init_fn=partial( + model_init_fn, + group_offload_kwargs={ + "onload_device": torch_device, + "offload_device": torch.device("cpu"), + "offload_type": "leaf_level", + "use_stream": True, + "non_blocking": True, + }, + ), + ), + ] + + runner = BenchmarkMixin() + runner.run_bencmarks_and_collate(scenarios, filename=RESULT_FILENAME) diff --git a/benchmarks/benchmarking_utils.py b/benchmarks/benchmarking_utils.py new file mode 100644 index 000000000000..c4a3a976309e --- /dev/null +++ b/benchmarks/benchmarking_utils.py @@ -0,0 +1,204 @@ +import gc +import inspect +from contextlib import nullcontext +from dataclasses import dataclass +from typing import Any, Callable, Dict, Optional, Union + +import pandas as pd +import torch +import torch.utils.benchmark as benchmark + +from diffusers.models.modeling_utils import ModelMixin +from diffusers.utils.testing_utils import require_torch_gpu, torch_device + + +def benchmark_fn(f, *args, **kwargs): + t0 = benchmark.Timer( + stmt="f(*args, **kwargs)", + globals={"args": args, "kwargs": kwargs, "f": f}, + num_threads=1, + ) + return float(f"{(t0.blocked_autorange().mean):.3f}") + + +def flush(): + gc.collect() + torch.cuda.empty_cache() + torch.cuda.reset_max_memory_allocated() + torch.cuda.reset_peak_memory_stats() + + +# Adapted from https://github.com/lucasb-eyer/cnn_vit_benchmarks/blob/15b665ff758e8062131353076153905cae00a71f/main.py +def calculate_flops(model, input_dict): + try: + from torchprofile import profile_macs + except ModuleNotFoundError: + raise + + # This is a hacky way to convert the kwargs to args as `profile_macs` cries about kwargs. + sig = inspect.signature(model.forward) + param_names = [ + p.name + for p in sig.parameters.values() + if p.kind + in ( + inspect.Parameter.POSITIONAL_ONLY, + inspect.Parameter.POSITIONAL_OR_KEYWORD, + ) + and p.name != "self" + ] + bound = sig.bind_partial(**input_dict) + bound.apply_defaults() + args = tuple(bound.arguments[name] for name in param_names) + + model.eval() + with torch.no_grad(): + macs = profile_macs(model, args) + flops = 2 * macs # 1 MAC operation = 2 FLOPs (1 multiplication + 1 addition) + return flops + + +def calculate_params(model): + return sum(p.numel() for p in model.parameters()) + + +# Users can define their own in case this doesn't suffice. For most cases, +# it should be sufficient. 
+def model_init_fn(model_cls, group_offload_kwargs=None, layerwise_upcasting=False, **init_kwargs): + model = model_cls.from_pretrained(**init_kwargs).eval() + if group_offload_kwargs and isinstance(group_offload_kwargs, dict): + model.enable_group_offload(**group_offload_kwargs) + else: + model.to(torch_device) + if layerwise_upcasting: + model.enable_layerwise_casting( + storage_dtype=torch.float8_e4m3fn, compute_dtype=init_kwargs.get("torch_dtype", torch.bfloat16) + ) + return model + + +@dataclass +class BenchmarkScenario: + name: str + model_cls: ModelMixin + model_init_kwargs: Dict[str, Any] + model_init_fn: Callable + get_model_input_dict: Callable + compile_kwargs: Optional[Dict[str, Any]] = None + + +@require_torch_gpu +class BenchmarkMixin: + def pre_benchmark(self): + flush() + torch.compiler.reset() + + def post_benchmark(self, model): + model.cpu() + flush() + torch.compiler.reset() + + @torch.no_grad() + def run_benchmark(self, scenario: BenchmarkScenario): + # 0) Basic stats + print(f"Running scenario: {scenario.name}.") + model = model_init_fn(scenario.model_cls, **scenario.model_init_kwargs) + num_params = round(calculate_params(model) / 1e6, 2) + flops = round(calculate_flops(model, input_dict=scenario.get_model_input_dict()) / 1e6, 2) + model.cpu() + del model + self.pre_benchmark() + + # 1) plain stats + results = {} + plain = None + try: + plain = self._run_phase( + model_cls=scenario.model_cls, + init_fn=scenario.model_init_fn, + init_kwargs=scenario.model_init_kwargs, + get_input_fn=scenario.get_model_input_dict, + compile_kwargs=None, + ) + except Exception as e: + print(f"Benchmark could not be run with the following error\n: {e}") + return results + + # 2) compiled stats (if any) + compiled = {"time": None, "memory": None} + if scenario.compile_kwargs: + try: + compiled = self._run_phase( + model_cls=scenario.model_cls, + init_fn=scenario.model_init_fn, + init_kwargs=scenario.model_init_kwargs, + get_input_fn=scenario.get_model_input_dict, + compile_kwargs=scenario.compile_kwargs, + ) + except Exception as e: + print(f"Compilation benchmark could not be run with the following error\n: {e}") + if plain is None: + return results + + # 3) merge + result = { + "scenario": scenario.name, + "model_cls": scenario.model_cls.__name__, + "num_params_M": num_params, + "flops_M": flops, + "time_plain_s": plain["time"], + "mem_plain_GB": plain["memory"], + "time_compile_s": compiled["time"], + "mem_compile_GB": compiled["memory"], + } + if scenario.compile_kwargs: + result["fullgraph"] = scenario.compile_kwargs.get("fullgraph", False) + result["mode"] = scenario.compile_kwargs.get("mode", "default") + else: + result["fullgraph"], result["mode"] = None, None + return result + + def run_bencmarks_and_collate(self, scenarios: Union[BenchmarkScenario, list[BenchmarkScenario]], filename: str): + if not isinstance(scenarios, list): + scenarios = [scenarios] + records = [] + for s in scenarios: + try: + records.append(self.run_benchmark(s)) + except Exception as e: + print(f"Running scenario ({s.name}) led to error:\n{e}") + df = pd.DataFrame.from_records([r for r in records if r]) + df.to_csv(filename, index=False) + print(f"Results serialized to {filename=}.") + + def _run_phase( + self, + *, + model_cls: ModelMixin, + init_fn: Callable, + init_kwargs: Dict[str, Any], + get_input_fn: Callable, + compile_kwargs: Optional[Dict[str, Any]], + ) -> Dict[str, float]: + # setup + self.pre_benchmark() + + # init & (optional) compile + model = init_fn(model_cls, **init_kwargs) + if 
compile_kwargs: + model.compile(**compile_kwargs) + + # build inputs + inp = get_input_fn() + + # measure + run_ctx = torch._inductor.utils.fresh_inductor_cache() if compile_kwargs else nullcontext() + with run_ctx: + time_s = benchmark_fn(lambda m, d: m(**d), model, inp) + mem_gb = torch.cuda.max_memory_allocated() / (1024**3) + mem_gb = round(mem_gb, 2) + + # teardown + self.post_benchmark(model) + del model + return {"time": time_s, "memory": mem_gb} diff --git a/benchmarks/benchmarking_wan.py b/benchmarks/benchmarking_wan.py new file mode 100644 index 000000000000..64e81fdb6b09 --- /dev/null +++ b/benchmarks/benchmarking_wan.py @@ -0,0 +1,74 @@ +from functools import partial + +import torch +from benchmarking_utils import BenchmarkMixin, BenchmarkScenario, model_init_fn + +from diffusers import WanTransformer3DModel +from diffusers.utils.testing_utils import torch_device + + +CKPT_ID = "Wan-AI/Wan2.1-T2V-14B-Diffusers" +RESULT_FILENAME = "wan.csv" + + +def get_input_dict(**device_dtype_kwargs): + # height: 480 + # width: 832 + # num_frames: 81 + # max_sequence_length: 512 + hidden_states = torch.randn(1, 16, 21, 60, 104, **device_dtype_kwargs) + encoder_hidden_states = torch.randn(1, 512, 4096, **device_dtype_kwargs) + timestep = torch.tensor([1.0], **device_dtype_kwargs) + + return {"hidden_states": hidden_states, "encoder_hidden_states": encoder_hidden_states, "timestep": timestep} + + +if __name__ == "__main__": + scenarios = [ + BenchmarkScenario( + name=f"{CKPT_ID}-bf16", + model_cls=WanTransformer3DModel, + model_init_kwargs={ + "pretrained_model_name_or_path": CKPT_ID, + "torch_dtype": torch.bfloat16, + "subfolder": "transformer", + }, + get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16), + model_init_fn=model_init_fn, + compile_kwargs={"fullgraph": True}, + ), + BenchmarkScenario( + name=f"{CKPT_ID}-layerwise-upcasting", + model_cls=WanTransformer3DModel, + model_init_kwargs={ + "pretrained_model_name_or_path": CKPT_ID, + "torch_dtype": torch.bfloat16, + "subfolder": "transformer", + }, + get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16), + model_init_fn=partial(model_init_fn, layerwise_upcasting=True), + ), + BenchmarkScenario( + name=f"{CKPT_ID}-group-offload-leaf", + model_cls=WanTransformer3DModel, + model_init_kwargs={ + "pretrained_model_name_or_path": CKPT_ID, + "torch_dtype": torch.bfloat16, + "subfolder": "transformer", + }, + get_model_input_dict=partial(get_input_dict, device=torch_device, dtype=torch.bfloat16), + model_init_fn=partial( + model_init_fn, + group_offload_kwargs={ + "onload_device": torch_device, + "offload_device": torch.device("cpu"), + "offload_type": "leaf_level", + "use_stream": True, + "non_blocking": True, + }, + ), + ), + ] + + runner = BenchmarkMixin() + runner.run_bencmarks_and_collate(scenarios, filename=RESULT_FILENAME) diff --git a/benchmarks/push_results.py b/benchmarks/push_results.py index 71cd60f32c0f..30da0c053863 100644 --- a/benchmarks/push_results.py +++ b/benchmarks/push_results.py @@ -1,19 +1,17 @@ -import glob -import sys - import pandas as pd from huggingface_hub import hf_hub_download, upload_file from huggingface_hub.utils import EntryNotFoundError -sys.path.append(".") -from utils import BASE_PATH, FINAL_CSV_FILE, GITHUB_SHA, REPO_ID, collate_csv # noqa: E402 +REPO_ID = "diffusers/benchmarks" def has_previous_benchmark() -> str: + from run_all import FINAL_CSV_FILENAME + csv_path = None try: - csv_path = hf_hub_download(repo_id=REPO_ID, 
repo_type="dataset", filename=FINAL_CSV_FILE) + csv_path = hf_hub_download(repo_id=REPO_ID, repo_type="dataset", filename=FINAL_CSV_FILENAME) except EntryNotFoundError: csv_path = None return csv_path @@ -26,43 +24,40 @@ def filter_float(value): def push_to_hf_dataset(): - all_csvs = sorted(glob.glob(f"{BASE_PATH}/*.csv")) - collate_csv(all_csvs, FINAL_CSV_FILE) + from run_all import FINAL_CSV_FILENAME, GITHUB_SHA - # If there's an existing benchmark file, we should report the changes. csv_path = has_previous_benchmark() if csv_path is not None: - current_results = pd.read_csv(FINAL_CSV_FILE) + current_results = pd.read_csv(FINAL_CSV_FILENAME) previous_results = pd.read_csv(csv_path) numeric_columns = current_results.select_dtypes(include=["float64", "int64"]).columns - numeric_columns = [ - c for c in numeric_columns if c not in ["batch_size", "num_inference_steps", "actual_gpu_memory (gbs)"] - ] for column in numeric_columns: - previous_results[column] = previous_results[column].map(lambda x: filter_float(x)) + # get previous values as floats, aligned to current index + prev_vals = previous_results[column].map(filter_float).reindex(current_results.index) - # Calculate the percentage change - current_results[column] = current_results[column].astype(float) - previous_results[column] = previous_results[column].astype(float) - percent_change = ((current_results[column] - previous_results[column]) / previous_results[column]) * 100 + # get current values as floats + curr_vals = current_results[column].astype(float) - # Format the values with '+' or '-' sign and append to original values - current_results[column] = current_results[column].map(str) + percent_change.map( - lambda x: f" ({'+' if x > 0 else ''}{x:.2f}%)" + # stringify the current values + curr_str = curr_vals.map(str) + + # build an appendage only when prev exists and differs + append_str = prev_vals.where(prev_vals.notnull() & (prev_vals != curr_vals), other=pd.NA).map( + lambda x: f" ({x})" if pd.notnull(x) else "" ) - # There might be newly added rows. So, filter out the NaNs. - current_results[column] = current_results[column].map(lambda x: x.replace(" (nan%)", "")) - # Overwrite the current result file. 
- current_results.to_csv(FINAL_CSV_FILE, index=False) + # combine + current_results[column] = curr_str + append_str + + current_results.to_csv(FINAL_CSV_FILENAME, index=False) commit_message = f"upload from sha: {GITHUB_SHA}" if GITHUB_SHA is not None else "upload benchmark results" upload_file( repo_id=REPO_ID, - path_in_repo=FINAL_CSV_FILE, - path_or_fileobj=FINAL_CSV_FILE, + path_in_repo=FINAL_CSV_FILENAME, + path_or_fileobj=FINAL_CSV_FILENAME, repo_type="dataset", commit_message=commit_message, ) diff --git a/benchmarks/run_all.py b/benchmarks/run_all.py index c9932cc71c38..278683bdc254 100644 --- a/benchmarks/run_all.py +++ b/benchmarks/run_all.py @@ -1,14 +1,13 @@ import glob +import os import subprocess -import sys -from typing import List +import pandas as pd -sys.path.append(".") -from benchmark_text_to_image import ALL_T2I_CKPTS # noqa: E402 - -PATTERN = "benchmark_*.py" +PATTERN = "benchmarking_*.py" +FINAL_CSV_FILENAME = "collated_results.csv" +GITHUB_SHA = os.getenv("GITHUB_SHA", None) class SubprocessCallException(Exception): @@ -16,7 +15,7 @@ class SubprocessCallException(Exception): # Taken from `test_examples_utils.py` -def run_command(command: List[str], return_stdout=False): +def run_command(command: list[str], return_stdout=False): """ Runs `command` with `subprocess.check_output` and will potentially return the `stdout`. Will also properly capture if an error occurred while running `command` @@ -33,69 +32,28 @@ def run_command(command: List[str], return_stdout=False): ) from e -def main(): - python_files = glob.glob(PATTERN) +def run_scripts(): + python_files = sorted(glob.glob(PATTERN)) for file in python_files: - print(f"****** Running file: {file} ******") - - # Run with canonical settings. - if file != "benchmark_text_to_image.py" and file != "benchmark_ip_adapters.py": + if file != "benchmarking_utils.py": + print(f"****** Running file: {file} ******") command = f"python {file}" - run_command(command.split()) - - command += " --run_compile" - run_command(command.split()) - - # Run variants. 
- for file in python_files: - # See: https://github.com/pytorch/pytorch/issues/129637 - if file == "benchmark_ip_adapters.py": - continue - - if file == "benchmark_text_to_image.py": - for ckpt in ALL_T2I_CKPTS: - command = f"python {file} --ckpt {ckpt}" - - if "turbo" in ckpt: - command += " --num_inference_steps 1" - + try: run_command(command.split()) + except SubprocessCallException as e: + print(f"Error running {file}: {e}") + continue - command += " --run_compile" - run_command(command.split()) - - elif file == "benchmark_sd_img.py": - for ckpt in ["stabilityai/stable-diffusion-xl-refiner-1.0", "stabilityai/sdxl-turbo"]: - command = f"python {file} --ckpt {ckpt}" - - if ckpt == "stabilityai/sdxl-turbo": - command += " --num_inference_steps 2" - - run_command(command.split()) - command += " --run_compile" - run_command(command.split()) - - elif file in ["benchmark_sd_inpainting.py", "benchmark_ip_adapters.py"]: - sdxl_ckpt = "stabilityai/stable-diffusion-xl-base-1.0" - command = f"python {file} --ckpt {sdxl_ckpt}" - run_command(command.split()) - - command += " --run_compile" - run_command(command.split()) - - elif file in ["benchmark_controlnet.py", "benchmark_t2i_adapter.py"]: - sdxl_ckpt = ( - "diffusers/controlnet-canny-sdxl-1.0" - if "controlnet" in file - else "TencentARC/t2i-adapter-canny-sdxl-1.0" - ) - command = f"python {file} --ckpt {sdxl_ckpt}" - run_command(command.split()) - command += " --run_compile" - run_command(command.split()) +def merge_csvs(): + all_csvs = glob.glob("*.csv") + final_df = pd.concat([pd.read_csv(f) for f in all_csvs]).reset_index(drop=True) + if GITHUB_SHA: + final_df["github_sha"] = GITHUB_SHA + final_df.to_csv(FINAL_CSV_FILENAME) if __name__ == "__main__": - main() + run_scripts() + merge_csvs() diff --git a/benchmarks/utils.py b/benchmarks/utils.py deleted file mode 100644 index 5fce920ac6c3..000000000000 --- a/benchmarks/utils.py +++ /dev/null @@ -1,98 +0,0 @@ -import argparse -import csv -import gc -import os -from dataclasses import dataclass -from typing import Dict, List, Union - -import torch -import torch.utils.benchmark as benchmark - - -GITHUB_SHA = os.getenv("GITHUB_SHA", None) -BENCHMARK_FIELDS = [ - "pipeline_cls", - "ckpt_id", - "batch_size", - "num_inference_steps", - "model_cpu_offload", - "run_compile", - "time (secs)", - "memory (gbs)", - "actual_gpu_memory (gbs)", - "github_sha", -] - -PROMPT = "ghibli style, a fantasy landscape with castles" -BASE_PATH = os.getenv("BASE_PATH", ".") -TOTAL_GPU_MEMORY = float(os.getenv("TOTAL_GPU_MEMORY", torch.cuda.get_device_properties(0).total_memory / (1024**3))) - -REPO_ID = "diffusers/benchmarks" -FINAL_CSV_FILE = "collated_results.csv" - - -@dataclass -class BenchmarkInfo: - time: float - memory: float - - -def flush(): - """Wipes off memory.""" - gc.collect() - torch.cuda.empty_cache() - torch.cuda.reset_max_memory_allocated() - torch.cuda.reset_peak_memory_stats() - - -def bytes_to_giga_bytes(bytes): - return f"{(bytes / 1024 / 1024 / 1024):.3f}" - - -def benchmark_fn(f, *args, **kwargs): - t0 = benchmark.Timer( - stmt="f(*args, **kwargs)", - globals={"args": args, "kwargs": kwargs, "f": f}, - num_threads=torch.get_num_threads(), - ) - return f"{(t0.blocked_autorange().mean):.3f}" - - -def generate_csv_dict( - pipeline_cls: str, ckpt: str, args: argparse.Namespace, benchmark_info: BenchmarkInfo -) -> Dict[str, Union[str, bool, float]]: - """Packs benchmarking data into a dictionary for latter serialization.""" - data_dict = { - "pipeline_cls": pipeline_cls, - "ckpt_id": ckpt, - 
"batch_size": args.batch_size, - "num_inference_steps": args.num_inference_steps, - "model_cpu_offload": args.model_cpu_offload, - "run_compile": args.run_compile, - "time (secs)": benchmark_info.time, - "memory (gbs)": benchmark_info.memory, - "actual_gpu_memory (gbs)": f"{(TOTAL_GPU_MEMORY):.3f}", - "github_sha": GITHUB_SHA, - } - return data_dict - - -def write_to_csv(file_name: str, data_dict: Dict[str, Union[str, bool, float]]): - """Serializes a dictionary into a CSV file.""" - with open(file_name, mode="w", newline="") as csvfile: - writer = csv.DictWriter(csvfile, fieldnames=BENCHMARK_FIELDS) - writer.writeheader() - writer.writerow(data_dict) - - -def collate_csv(input_files: List[str], output_file: str): - """Collates multiple identically structured CSVs into a single CSV file.""" - with open(output_file, mode="w", newline="") as outfile: - writer = csv.DictWriter(outfile, fieldnames=BENCHMARK_FIELDS) - writer.writeheader() - - for file in input_files: - with open(file, mode="r") as infile: - reader = csv.DictReader(infile) - for row in reader: - writer.writerow(row)