From 84b226d47deee9f4bbf87c7e0849e7bddee561d5 Mon Sep 17 00:00:00 2001 From: Ross Brunton Date: Fri, 30 May 2025 11:17:07 +0100 Subject: [PATCH 1/3] [UR] Support 2D and 3D enqueue kernel launches in offload A (rather trivial) test has also been added since Offload currently has issues with arguments. --- .../source/adapters/offload/enqueue.cpp | 27 +++++++++++-------- .../conformance/device_code/CMakeLists.txt | 1 + .../test/conformance/device_code/no_args.cpp | 17 ++++++++++++ .../enqueue/urEnqueueKernelLaunch.cpp | 19 +++++++++++++ 4 files changed, 53 insertions(+), 11 deletions(-) create mode 100644 unified-runtime/test/conformance/device_code/no_args.cpp diff --git a/unified-runtime/source/adapters/offload/enqueue.cpp b/unified-runtime/source/adapters/offload/enqueue.cpp index cd9a138dc3b9b..c7a88eda6fb87 100644 --- a/unified-runtime/source/adapters/offload/enqueue.cpp +++ b/unified-runtime/source/adapters/offload/enqueue.cpp @@ -28,22 +28,27 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( // (void)pGlobalWorkOffset; - (void)pLocalWorkSize; - if (workDim == 1) { - std::cerr - << "UR Offload adapter only supports 1d kernel launches at the moment"; - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + if (!pLocalWorkSize) { + // TODO: This is not optimal, but it is legal + static size_t DefaultWorkSize[3] = {1, 1, 1}; + pLocalWorkSize = DefaultWorkSize; + } + + if (pLocalWorkSize[0] > pGlobalWorkSize[0] || + pLocalWorkSize[1] > pGlobalWorkSize[1] || + pLocalWorkSize[2] > pGlobalWorkSize[2]) { + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; } ol_kernel_launch_size_args_t LaunchArgs; LaunchArgs.Dimensions = workDim; - LaunchArgs.NumGroupsX = pGlobalWorkSize[0]; - LaunchArgs.NumGroupsY = 1; - LaunchArgs.NumGroupsZ = 1; - LaunchArgs.GroupSizeX = 1; - LaunchArgs.GroupSizeY = 1; - LaunchArgs.GroupSizeZ = 1; + LaunchArgs.NumGroupsX = pGlobalWorkSize[0] / pLocalWorkSize[0]; + LaunchArgs.NumGroupsY = pGlobalWorkSize[1] / pLocalWorkSize[1]; + LaunchArgs.NumGroupsZ = pGlobalWorkSize[2] / pLocalWorkSize[2]; + LaunchArgs.GroupSizeX = pLocalWorkSize[0]; + LaunchArgs.GroupSizeY = pLocalWorkSize[1]; + LaunchArgs.GroupSizeZ = pLocalWorkSize[2]; LaunchArgs.DynSharedMemory = 0; ol_event_handle_t EventOut; diff --git a/unified-runtime/test/conformance/device_code/CMakeLists.txt b/unified-runtime/test/conformance/device_code/CMakeLists.txt index a5401fdf52898..1bc8ce784ca0e 100644 --- a/unified-runtime/test/conformance/device_code/CMakeLists.txt +++ b/unified-runtime/test/conformance/device_code/CMakeLists.txt @@ -163,6 +163,7 @@ add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/standard_types.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/subgroup.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/linker_error.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/saxpy_usm_local_mem.cpp) +add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/no_args.cpp) set(KERNEL_HEADER ${UR_CONFORMANCE_DEVICE_BINARIES_DIR}/kernel_entry_points.h) add_custom_command(OUTPUT ${KERNEL_HEADER} diff --git a/unified-runtime/test/conformance/device_code/no_args.cpp b/unified-runtime/test/conformance/device_code/no_args.cpp new file mode 100644 index 0000000000000..a0ba54158760b --- /dev/null +++ b/unified-runtime/test/conformance/device_code/no_args.cpp @@ -0,0 +1,17 @@ +// Copyright (C) 2025 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +int main() { + sycl::queue sycl_queue; + sycl_queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::range<3>{128, 128, 128}, + [](sycl::item<3> itemId) { itemId.get_id(0); }); + }); + return 0; +} diff --git a/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunch.cpp b/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunch.cpp index f2a3b29416764..e7bf64a96dd8a 100644 --- a/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunch.cpp +++ b/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunch.cpp @@ -9,6 +9,18 @@ #include #include +struct urEnqueueKernelLaunchNoArgs3DTest : uur::urKernelExecutionTest { + void SetUp() override { + program_name = "no_args"; + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp()); + } + + size_t global_size[3] = {32, 16, 8}; + size_t global_offset[3] = {0, 0, 0}; + size_t n_dimensions = 3; +}; +UUR_INSTANTIATE_DEVICE_TEST_SUITE(urEnqueueKernelLaunchNoArgs3DTest); + struct urEnqueueKernelLaunchTest : uur::urKernelExecutionTest { void SetUp() override { program_name = "fill"; @@ -67,6 +79,13 @@ struct urEnqueueKernelLaunchKernelStandardTest : uur::urKernelExecutionTest { }; UUR_INSTANTIATE_DEVICE_TEST_SUITE(urEnqueueKernelLaunchKernelStandardTest); +TEST_P(urEnqueueKernelLaunchNoArgs3DTest, Success) { + ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, + global_offset, global_size, nullptr, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); +} + TEST_P(urEnqueueKernelLaunchTest, Success) { ur_mem_handle_t buffer = nullptr; AddBuffer1DArg(sizeof(val) * global_size, &buffer); From 520cbcfa00cc1d18de0a02d55841e14f53811009 Mon Sep 17 00:00:00 2001 From: Ross Brunton Date: Fri, 30 May 2025 13:13:55 +0100 Subject: [PATCH 2/3] Fix oob access --- .../source/adapters/offload/enqueue.cpp | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/unified-runtime/source/adapters/offload/enqueue.cpp b/unified-runtime/source/adapters/offload/enqueue.cpp index c7a88eda6fb87..61f1925fcd58a 100644 --- a/unified-runtime/source/adapters/offload/enqueue.cpp +++ b/unified-runtime/source/adapters/offload/enqueue.cpp @@ -29,26 +29,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( (void)pGlobalWorkOffset; - if (!pLocalWorkSize) { - // TODO: This is not optimal, but it is legal - static size_t DefaultWorkSize[3] = {1, 1, 1}; - pLocalWorkSize = DefaultWorkSize; + // TODO: We default to 1, 1, 1 here. In future if pLocalWorkSize is not + // specified, we should pick the "best" one + size_t WorkSize[3] = {1, 1, 1}; + if (pLocalWorkSize) { + for (uint32_t I = 0; I < workDim; I++) { + WorkSize[I] = pLocalWorkSize[I]; + } } - if (pLocalWorkSize[0] > pGlobalWorkSize[0] || - pLocalWorkSize[1] > pGlobalWorkSize[1] || - pLocalWorkSize[2] > pGlobalWorkSize[2]) { + if (WorkSize[0] > pGlobalWorkSize[0] || WorkSize[1] > pGlobalWorkSize[1] || + WorkSize[2] > pGlobalWorkSize[2]) { return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; } ol_kernel_launch_size_args_t LaunchArgs; LaunchArgs.Dimensions = workDim; - LaunchArgs.NumGroupsX = pGlobalWorkSize[0] / pLocalWorkSize[0]; - LaunchArgs.NumGroupsY = pGlobalWorkSize[1] / pLocalWorkSize[1]; - LaunchArgs.NumGroupsZ = pGlobalWorkSize[2] / pLocalWorkSize[2]; - LaunchArgs.GroupSizeX = pLocalWorkSize[0]; - LaunchArgs.GroupSizeY = pLocalWorkSize[1]; - LaunchArgs.GroupSizeZ = pLocalWorkSize[2]; + LaunchArgs.NumGroupsX = pGlobalWorkSize[0] / WorkSize[0]; + LaunchArgs.NumGroupsY = pGlobalWorkSize[1] / WorkSize[1]; + LaunchArgs.NumGroupsZ = pGlobalWorkSize[2] / WorkSize[2]; + LaunchArgs.GroupSizeX = WorkSize[0]; + LaunchArgs.GroupSizeY = WorkSize[1]; + LaunchArgs.GroupSizeZ = WorkSize[2]; LaunchArgs.DynSharedMemory = 0; ol_event_handle_t EventOut; From c2881cdf32ad411402750eb211da908505fae95a Mon Sep 17 00:00:00 2001 From: Ross Brunton Date: Fri, 30 May 2025 13:17:28 +0100 Subject: [PATCH 3/3] Fix other oob access --- .../source/adapters/offload/enqueue.cpp | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/unified-runtime/source/adapters/offload/enqueue.cpp b/unified-runtime/source/adapters/offload/enqueue.cpp index 61f1925fcd58a..9fc50c22f96da 100644 --- a/unified-runtime/source/adapters/offload/enqueue.cpp +++ b/unified-runtime/source/adapters/offload/enqueue.cpp @@ -29,28 +29,33 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( (void)pGlobalWorkOffset; + size_t GlobalSize[3] = {1, 1, 1}; + for (uint32_t I = 0; I < workDim; I++) { + GlobalSize[I] = pGlobalWorkSize[I]; + } + // TODO: We default to 1, 1, 1 here. In future if pLocalWorkSize is not // specified, we should pick the "best" one - size_t WorkSize[3] = {1, 1, 1}; + size_t GroupSize[3] = {1, 1, 1}; if (pLocalWorkSize) { for (uint32_t I = 0; I < workDim; I++) { - WorkSize[I] = pLocalWorkSize[I]; + GroupSize[I] = pLocalWorkSize[I]; } } - if (WorkSize[0] > pGlobalWorkSize[0] || WorkSize[1] > pGlobalWorkSize[1] || - WorkSize[2] > pGlobalWorkSize[2]) { + if (GroupSize[0] > GlobalSize[0] || GroupSize[1] > GlobalSize[1] || + GroupSize[2] > GlobalSize[2]) { return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; } ol_kernel_launch_size_args_t LaunchArgs; LaunchArgs.Dimensions = workDim; - LaunchArgs.NumGroupsX = pGlobalWorkSize[0] / WorkSize[0]; - LaunchArgs.NumGroupsY = pGlobalWorkSize[1] / WorkSize[1]; - LaunchArgs.NumGroupsZ = pGlobalWorkSize[2] / WorkSize[2]; - LaunchArgs.GroupSizeX = WorkSize[0]; - LaunchArgs.GroupSizeY = WorkSize[1]; - LaunchArgs.GroupSizeZ = WorkSize[2]; + LaunchArgs.NumGroupsX = GlobalSize[0] / GroupSize[0]; + LaunchArgs.NumGroupsY = GlobalSize[1] / GroupSize[1]; + LaunchArgs.NumGroupsZ = GlobalSize[2] / GroupSize[2]; + LaunchArgs.GroupSizeX = GroupSize[0]; + LaunchArgs.GroupSizeY = GroupSize[1]; + LaunchArgs.GroupSizeZ = GroupSize[2]; LaunchArgs.DynSharedMemory = 0; ol_event_handle_t EventOut;