Skip to content

[UR] Consolidate kernel launch entry points in UR. #18385

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
67f4acc
[UR] Consolidate kernel launch entry points in UR.
aarongreig May 9, 2025
ade16f1
Propagate change to sycl.
aarongreig May 9, 2025
2f36d98
Fix script formatting and missed old enum usage.
aarongreig May 9, 2025
f3e085d
Fix accidental revert of changes to l0 KernelLaunch.
aarongreig May 9, 2025
5cfdf0c
Merge branch 'sycl' into aaron/consolidateEnqueueKernelAPIs
aarongreig May 9, 2025
9a526ca
Fix more uses of old enums
aarongreig May 9, 2025
f967cd9
Merge branch 'sycl' into aaron/consolidateEnqueueKernelAPIs
aarongreig May 9, 2025
e318856
Remove accidentally re-added test for deprecated error code.
aarongreig May 9, 2025
719b311
Fix e2e and add cts tests.
aarongreig May 12, 2025
1a0cba9
Merge branch 'sycl' into aaron/consolidateEnqueueKernelAPIs
aarongreig May 12, 2025
dd91869
Sort out device properties.
aarongreig May 13, 2025
1986667
Merge branch 'sycl' into aaron/consolidateEnqueueKernelAPIs
aarongreig May 13, 2025
f01b6bc
Fix conflicts
aarongreig May 13, 2025
266fc07
Fix validation
aarongreig May 13, 2025
57ef0cb
Use a bitfield to report launch properties support.
aarongreig May 14, 2025
98a5c6b
Merge branch 'sycl' into aaron/consolidateEnqueueKernelAPIs
aarongreig May 14, 2025
9d99467
Fix spec.
aarongreig May 14, 2025
e217c9b
Shorten kernel launch properties flags name.
aarongreig May 20, 2025
3c5ad3b
Merge branch 'sycl' into aaron/consolidateEnqueueKernelAPIs
aarongreig May 20, 2025
84b6fe5
Rename KERNEL_LAUNCH_PROPERTIES_SUPPORT -> KERNEL_LAUNCH_CAPABILITIES
aarongreig May 21, 2025
8be7c08
Merge branch 'sycl' into aaron/consolidateEnqueueKernelAPIs
aarongreig May 21, 2025
cbfb4c7
Fix reference to old query name.
aarongreig May 21, 2025
9f9a3fe
Merge branch 'sycl' into aaron/consolidateEnqueueKernelAPIs
aarongreig May 21, 2025
9b6ef16
Merge branch 'sycl' into aaron/consolidateEnqueueKernelAPIs
aarongreig May 22, 2025
6a91f0e
Merge branch 'sycl' into aaron/consolidateEnqueueKernelAPIs
aarongreig May 22, 2025
9706a25
Add missing return to CL.
aarongreig May 22, 2025
e8504a9
Merge branch 'sycl' into aaron/consolidateEnqueueKernelAPIs
aarongreig May 26, 2025
1d56f41
Merge branch 'sycl' into aaron/consolidateEnqueueKernelAPIs
aarongreig May 28, 2025
7c775c8
Add return to validate launchPropList == nullptr.
aarongreig May 28, 2025
5103d40
Change unsupported property return to UNSUPPORTED_FEATURE
aarongreig May 28, 2025
0f96bda
Merge branch 'sycl' into aaron/consolidateEnqueueKernelAPIs
aarongreig May 29, 2025
e0d95f8
Fix test using old signature
aarongreig May 29, 2025
3fded70
Merge branch 'sycl' into aaron/consolidateEnqueueKernelAPIs
aarongreig May 30, 2025
795949b
Merge branch 'sycl' into aaron/consolidateEnqueueKernelAPIs
aarongreig May 30, 2025
54bfab5
Merge branch 'sycl' into aaron/consolidateEnqueueKernelAPIs
aarongreig Jun 2, 2025
38547d7
Merge branch 'sycl' into aaron/consolidateEnqueueKernelAPIs
aarongreig Jun 2, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions sycl/include/sycl/info/device_traits.def
Original file line number Diff line number Diff line change
Expand Up @@ -235,8 +235,7 @@ __SYCL_PARAM_TRAITS_SPEC(device, ext_oneapi_max_work_groups_2d, id<2>, __SYCL_TR
__SYCL_PARAM_TRAITS_SPEC(device, ext_oneapi_max_work_groups_3d, id<3>,
UR_DEVICE_INFO_MAX_WORK_GROUPS_3D)
__SYCL_PARAM_TRAITS_SPEC(device, ext_oneapi_max_global_work_groups, size_t, __SYCL_TRAIT_HANDLED_IN_RT)
__SYCL_PARAM_TRAITS_SPEC(device, ext_oneapi_cuda_cluster_group, bool,
UR_DEVICE_INFO_CLUSTER_LAUNCH_SUPPORT_EXP)
__SYCL_PARAM_TRAITS_SPEC(device, ext_oneapi_cuda_cluster_group, bool, __SYCL_TRAIT_HANDLED_IN_RT)

#ifdef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC_NEEDS_UNDEF
#undef __SYCL_PARAM_TRAITS_TEMPLATE_SPEC
Expand Down
9 changes: 4 additions & 5 deletions sycl/source/detail/device_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -886,11 +886,10 @@ class device_impl : public std::enable_shared_from_this<device_impl> {
}

CASE(info::device::ext_oneapi_cuda_cluster_group) {
if (getBackend() != backend::ext_oneapi_cuda)
return false;

return get_info_impl_nocheck<UR_DEVICE_INFO_CLUSTER_LAUNCH_SUPPORT_EXP>()
.value_or(0) != 0;
auto SupportFlags =
get_info_impl<UR_DEVICE_INFO_KERNEL_LAUNCH_CAPABILITIES>();
return static_cast<bool>(
SupportFlags & UR_KERNEL_LAUNCH_PROPERTIES_FLAG_CLUSTER_DIMENSION);
}

// ext_codeplay_device_traits.def
Expand Down
2 changes: 1 addition & 1 deletion sycl/source/detail/kernel_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -371,7 +371,7 @@ kernel_impl::queryMaxNumWorkGroups(queue Queue,

uint32_t GroupCount{0};
if (auto Result = Adapter->call_nocheck<
UrApiKind::urKernelSuggestMaxCooperativeGroupCountExp>(
UrApiKind::urKernelSuggestMaxCooperativeGroupCount>(
Handle, DeviceHandleRef, Dimensions, WG, DynamicLocalMemorySize,
&GroupCount);
Result != UR_RESULT_ERROR_UNSUPPORTED_FEATURE &&
Expand Down
60 changes: 19 additions & 41 deletions sycl/source/detail/scheduler/commands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2474,65 +2474,43 @@ static ur_result_t SetKernelParamsAndLaunch(
if (EnforcedLocalSize)
LocalSize = RequiredWGSize;
}

const bool HasOffset = NDRDesc.GlobalOffset[0] != 0 ||
NDRDesc.GlobalOffset[1] != 0 ||
NDRDesc.GlobalOffset[2] != 0;

std::vector<ur_exp_launch_property_t> property_list;
std::vector<ur_kernel_launch_property_t> property_list;

if (KernelUsesClusterLaunch) {
ur_exp_launch_property_value_t launch_property_value_cluster_range;
ur_kernel_launch_property_value_t launch_property_value_cluster_range;
launch_property_value_cluster_range.clusterDim[0] =
NDRDesc.ClusterDimensions[0];
launch_property_value_cluster_range.clusterDim[1] =
NDRDesc.ClusterDimensions[1];
launch_property_value_cluster_range.clusterDim[2] =
NDRDesc.ClusterDimensions[2];

property_list.push_back({UR_EXP_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION,
property_list.push_back({UR_KERNEL_LAUNCH_PROPERTY_ID_CLUSTER_DIMENSION,
launch_property_value_cluster_range});

if (IsCooperative) {
ur_exp_launch_property_value_t launch_property_value_cooperative;
launch_property_value_cooperative.cooperative = 1;
property_list.push_back({UR_EXP_LAUNCH_PROPERTY_ID_COOPERATIVE,
launch_property_value_cooperative});
}
}
if (IsCooperative) {
ur_kernel_launch_property_value_t launch_property_value_cooperative;
launch_property_value_cooperative.cooperative = 1;
property_list.push_back({UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE,
launch_property_value_cooperative});
}
// If there is no implicit arg, let the driver handle it via a property
if (WorkGroupMemorySize && !ImplicitLocalArg.has_value()) {
property_list.push_back(
{UR_EXP_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY, {{WorkGroupMemorySize}}});
}
if (!property_list.empty()) {
ur_event_handle_t UREvent = nullptr;
ur_result_t Error =
Adapter->call_nocheck<UrApiKind::urEnqueueKernelLaunchCustomExp>(
Queue.getHandleRef(), Kernel, NDRDesc.Dims,
HasOffset ? &NDRDesc.GlobalOffset[0] : nullptr,
&NDRDesc.GlobalSize[0], LocalSize, property_list.size(),
property_list.data(), RawEvents.size(),
RawEvents.empty() ? nullptr : &RawEvents[0],
OutEventImpl ? &UREvent : nullptr);
if ((Error == UR_RESULT_SUCCESS) && OutEventImpl) {
OutEventImpl->setHandle(UREvent);
}
return Error;
property_list.push_back({UR_KERNEL_LAUNCH_PROPERTY_ID_WORK_GROUP_MEMORY,
{{WorkGroupMemorySize}}});
}
ur_event_handle_t UREvent = nullptr;
ur_result_t Error =
[&](auto... Args) {
if (IsCooperative) {
return Adapter
->call_nocheck<UrApiKind::urEnqueueCooperativeKernelLaunchExp>(
Args...);
}
return Adapter->call_nocheck<UrApiKind::urEnqueueKernelLaunch>(Args...);
}(Queue.getHandleRef(), Kernel, NDRDesc.Dims,
HasOffset ? &NDRDesc.GlobalOffset[0] : nullptr, &NDRDesc.GlobalSize[0],
LocalSize, RawEvents.size(),
RawEvents.empty() ? nullptr : &RawEvents[0],
OutEventImpl ? &UREvent : nullptr);
ur_result_t Error = Adapter->call_nocheck<UrApiKind::urEnqueueKernelLaunch>(
Queue.getHandleRef(), Kernel, NDRDesc.Dims,
HasOffset ? &NDRDesc.GlobalOffset[0] : nullptr, &NDRDesc.GlobalSize[0],
LocalSize, property_list.size(),
property_list.empty() ? nullptr : property_list.data(), RawEvents.size(),
RawEvents.empty() ? nullptr : &RawEvents[0],
OutEventImpl ? &UREvent : nullptr);
if (Error == UR_RESULT_SUCCESS && OutEventImpl) {
OutEventImpl->setHandle(UREvent);
}
Expand Down
2 changes: 1 addition & 1 deletion sycl/source/detail/ur_device_info_ret_types.inc
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,6 @@ MAP(UR_DEVICE_INFO_BINDLESS_SAMPLED_IMAGE_FETCH_3D_SUPPORT_EXP, ur_bool_t)
MAP(UR_DEVICE_INFO_BINDLESS_SAMPLE_1D_USM_SUPPORT_EXP, ur_bool_t)
MAP(UR_DEVICE_INFO_BINDLESS_SAMPLE_2D_USM_SUPPORT_EXP, ur_bool_t)
MAP(UR_DEVICE_INFO_BINDLESS_UNIQUE_ADDRESSING_PER_DIM_SUPPORT_EXP, ur_bool_t)
MAP(UR_DEVICE_INFO_CLUSTER_LAUNCH_SUPPORT_EXP, ur_bool_t)
MAP(UR_DEVICE_INFO_CUBEMAP_SEAMLESS_FILTERING_SUPPORT_EXP, ur_bool_t)
MAP(UR_DEVICE_INFO_CUBEMAP_SUPPORT_EXP, ur_bool_t)
MAP(UR_DEVICE_INFO_EXTERNAL_MEMORY_IMPORT_SUPPORT_EXP, ur_bool_t)
Expand All @@ -188,4 +187,5 @@ MAP(UR_DEVICE_INFO_MIPMAP_LEVEL_REFERENCE_SUPPORT_EXP, ur_bool_t)
MAP(UR_DEVICE_INFO_MIPMAP_MAX_ANISOTROPY_EXP, uint32_t)
MAP(UR_DEVICE_INFO_MIPMAP_SUPPORT_EXP, ur_bool_t)
MAP(UR_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT_EXP, ur_bool_t)
MAP(UR_DEVICE_INFO_KERNEL_LAUNCH_CAPABILITIES, ur_kernel_launch_properties_flags_t)
// clang-format on
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// Checks whether or not event Dependencies are honored by
// urEnqueueKernelLaunchCustomExp
// urEnqueueKernelLaunch with cluster dimensions
// REQUIRES: target-nvidia, aspect-ext_oneapi_cuda_cluster_group
// RUN: %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_90 -o %t.out
// RUN: %{run} %t.out
Expand Down
9 changes: 4 additions & 5 deletions sycl/unittests/helpers/UrMock.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -393,10 +393,9 @@ inline ur_result_t mock_urEventGetInfo(void *pParams) {
}
}

inline ur_result_t
mock_urKernelSuggestMaxCooperativeGroupCountExp(void *pParams) {
inline ur_result_t mock_urKernelSuggestMaxCooperativeGroupCount(void *pParams) {
auto params = reinterpret_cast<
ur_kernel_suggest_max_cooperative_group_count_exp_params_t *>(pParams);
ur_kernel_suggest_max_cooperative_group_count_params_t *>(pParams);
**params->ppGroupCountRet = 1;
return UR_RESULT_SUCCESS;
}
Expand Down Expand Up @@ -571,8 +570,8 @@ template <sycl::backend Backend = backend::opencl> class UrMock {
ADD_DEFAULT_OVERRIDE(urProgramGetInfo, mock_urProgramGetInfo)
ADD_DEFAULT_OVERRIDE(urKernelGetGroupInfo, mock_urKernelGetGroupInfo)
ADD_DEFAULT_OVERRIDE(urEventGetInfo, mock_urEventGetInfo)
ADD_DEFAULT_OVERRIDE(urKernelSuggestMaxCooperativeGroupCountExp,
mock_urKernelSuggestMaxCooperativeGroupCountExp)
ADD_DEFAULT_OVERRIDE(urKernelSuggestMaxCooperativeGroupCount,
mock_urKernelSuggestMaxCooperativeGroupCount)
ADD_DEFAULT_OVERRIDE(urDeviceSelectBinary, mock_urDeviceSelectBinary)
ADD_DEFAULT_OVERRIDE(urPlatformGetBackendOption,
mock_urPlatformGetBackendOption)
Expand Down
Loading