diff --git a/unified-runtime/source/adapters/offload/enqueue.cpp b/unified-runtime/source/adapters/offload/enqueue.cpp index cd9a138dc3b9b..9fc50c22f96da 100644 --- a/unified-runtime/source/adapters/offload/enqueue.cpp +++ b/unified-runtime/source/adapters/offload/enqueue.cpp @@ -28,22 +28,34 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( // (void)pGlobalWorkOffset; - (void)pLocalWorkSize; - if (workDim == 1) { - std::cerr - << "UR Offload adapter only supports 1d kernel launches at the moment"; - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + size_t GlobalSize[3] = {1, 1, 1}; + for (uint32_t I = 0; I < workDim; I++) { + GlobalSize[I] = pGlobalWorkSize[I]; + } + + // TODO: We default to 1, 1, 1 here. In future if pLocalWorkSize is not + // specified, we should pick the "best" one + size_t GroupSize[3] = {1, 1, 1}; + if (pLocalWorkSize) { + for (uint32_t I = 0; I < workDim; I++) { + GroupSize[I] = pLocalWorkSize[I]; + } + } + + if (GroupSize[0] > GlobalSize[0] || GroupSize[1] > GlobalSize[1] || + GroupSize[2] > GlobalSize[2]) { + return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE; } ol_kernel_launch_size_args_t LaunchArgs; LaunchArgs.Dimensions = workDim; - LaunchArgs.NumGroupsX = pGlobalWorkSize[0]; - LaunchArgs.NumGroupsY = 1; - LaunchArgs.NumGroupsZ = 1; - LaunchArgs.GroupSizeX = 1; - LaunchArgs.GroupSizeY = 1; - LaunchArgs.GroupSizeZ = 1; + LaunchArgs.NumGroupsX = GlobalSize[0] / GroupSize[0]; + LaunchArgs.NumGroupsY = GlobalSize[1] / GroupSize[1]; + LaunchArgs.NumGroupsZ = GlobalSize[2] / GroupSize[2]; + LaunchArgs.GroupSizeX = GroupSize[0]; + LaunchArgs.GroupSizeY = GroupSize[1]; + LaunchArgs.GroupSizeZ = GroupSize[2]; LaunchArgs.DynSharedMemory = 0; ol_event_handle_t EventOut; diff --git a/unified-runtime/test/conformance/device_code/CMakeLists.txt b/unified-runtime/test/conformance/device_code/CMakeLists.txt index a5401fdf52898..1bc8ce784ca0e 100644 --- a/unified-runtime/test/conformance/device_code/CMakeLists.txt +++ b/unified-runtime/test/conformance/device_code/CMakeLists.txt @@ -163,6 +163,7 @@ add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/standard_types.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/subgroup.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/linker_error.cpp) add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/saxpy_usm_local_mem.cpp) +add_device_binary(${CMAKE_CURRENT_SOURCE_DIR}/no_args.cpp) set(KERNEL_HEADER ${UR_CONFORMANCE_DEVICE_BINARIES_DIR}/kernel_entry_points.h) add_custom_command(OUTPUT ${KERNEL_HEADER} diff --git a/unified-runtime/test/conformance/device_code/no_args.cpp b/unified-runtime/test/conformance/device_code/no_args.cpp new file mode 100644 index 0000000000000..a0ba54158760b --- /dev/null +++ b/unified-runtime/test/conformance/device_code/no_args.cpp @@ -0,0 +1,17 @@ +// Copyright (C) 2025 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +int main() { + sycl::queue sycl_queue; + sycl_queue.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::range<3>{128, 128, 128}, + [](sycl::item<3> itemId) { itemId.get_id(0); }); + }); + return 0; +} diff --git a/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunch.cpp b/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunch.cpp index f2a3b29416764..e7bf64a96dd8a 100644 --- a/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunch.cpp +++ b/unified-runtime/test/conformance/enqueue/urEnqueueKernelLaunch.cpp @@ -9,6 +9,18 @@ #include #include +struct urEnqueueKernelLaunchNoArgs3DTest : uur::urKernelExecutionTest { + void SetUp() override { + program_name = "no_args"; + UUR_RETURN_ON_FATAL_FAILURE(urKernelExecutionTest::SetUp()); + } + + size_t global_size[3] = {32, 16, 8}; + size_t global_offset[3] = {0, 0, 0}; + size_t n_dimensions = 3; +}; +UUR_INSTANTIATE_DEVICE_TEST_SUITE(urEnqueueKernelLaunchNoArgs3DTest); + struct urEnqueueKernelLaunchTest : uur::urKernelExecutionTest { void SetUp() override { program_name = "fill"; @@ -67,6 +79,13 @@ struct urEnqueueKernelLaunchKernelStandardTest : uur::urKernelExecutionTest { }; UUR_INSTANTIATE_DEVICE_TEST_SUITE(urEnqueueKernelLaunchKernelStandardTest); +TEST_P(urEnqueueKernelLaunchNoArgs3DTest, Success) { + ASSERT_SUCCESS(urEnqueueKernelLaunch(queue, kernel, n_dimensions, + global_offset, global_size, nullptr, 0, + nullptr, nullptr)); + ASSERT_SUCCESS(urQueueFinish(queue)); +} + TEST_P(urEnqueueKernelLaunchTest, Success) { ur_mem_handle_t buffer = nullptr; AddBuffer1DArg(sizeof(val) * global_size, &buffer);