diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 565fb4b430046..08fc995782d76 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -1606,12 +1606,12 @@ def SYCLType: InheritableAttr { let Subjects = SubjectList<[CXXRecord, Enum], ErrorDiag>; let LangOpts = [SYCLIsDevice, SilentlyIgnoreSYCLIsHost]; let Args = [EnumArgument<"Type", "SYCLType", /*is_string=*/true, - ["accessor", "local_accessor", "work_group_memory", + ["accessor", "local_accessor", "work_group_memory", "dynamic_work_group_memory", "specialization_id", "kernel_handler", "buffer_location", "no_alias", "accessor_property_list", "group", "private_memory", "aspect", "annotated_ptr", "annotated_arg", "stream", "sampler", "host_pipe", "multi_ptr"], - ["accessor", "local_accessor", "work_group_memory", + ["accessor", "local_accessor", "work_group_memory", "dynamic_work_group_memory", "specialization_id", "kernel_handler", "buffer_location", "no_alias", "accessor_property_list", "group", "private_memory", "aspect", "annotated_ptr", "annotated_arg", diff --git a/clang/include/clang/Sema/SemaSYCL.h b/clang/include/clang/Sema/SemaSYCL.h index a360014f92c24..28a9c1859638a 100644 --- a/clang/include/clang/Sema/SemaSYCL.h +++ b/clang/include/clang/Sema/SemaSYCL.h @@ -63,7 +63,8 @@ class SYCLIntegrationHeader { kind_specialization_constants_buffer, kind_stream, kind_work_group_memory, - kind_last = kind_work_group_memory + kind_dynamic_work_group_memory, + kind_last = kind_dynamic_work_group_memory }; public: diff --git a/clang/lib/Sema/SemaSYCL.cpp b/clang/lib/Sema/SemaSYCL.cpp index 57bb67c880c2f..861ac6241e308 100644 --- a/clang/lib/Sema/SemaSYCL.cpp +++ b/clang/lib/Sema/SemaSYCL.cpp @@ -2090,7 +2090,9 @@ class SyclKernelFieldChecker : public SyclKernelFieldHandler { } bool handleSyclSpecialType(ParmVarDecl *PD, QualType ParamTy) final { - if (!SemaSYCL::isSyclType(ParamTy, SYCLTypeAttr::work_group_memory)) { + if (!SemaSYCL::isSyclType(ParamTy, SYCLTypeAttr::work_group_memory) && + !SemaSYCL::isSyclType(ParamTy, + SYCLTypeAttr::dynamic_work_group_memory)) { Diag.Report(PD->getLocation(), diag::err_bad_kernel_param_type) << ParamTy; IsInvalid = true; @@ -2246,7 +2248,8 @@ class SyclKernelUnionChecker : public SyclKernelFieldHandler { } bool handleSyclSpecialType(ParmVarDecl *PD, QualType ParamTy) final { - if (!SemaSYCL::isSyclType(ParamTy, SYCLTypeAttr::work_group_memory)) + if (!SemaSYCL::isSyclType(ParamTy, SYCLTypeAttr::work_group_memory) && + !SemaSYCL::isSyclType(ParamTy, SYCLTypeAttr::dynamic_work_group_memory)) unsupportedFreeFunctionParamType(); // TODO return true; } @@ -3032,7 +3035,9 @@ class SyclKernelDeclCreator : public SyclKernelFieldHandler { } bool handleSyclSpecialType(ParmVarDecl *PD, QualType ParamTy) final { - if (SemaSYCL::isSyclType(ParamTy, SYCLTypeAttr::work_group_memory)) { + if (SemaSYCL::isSyclType(ParamTy, SYCLTypeAttr::work_group_memory) || + SemaSYCL::isSyclType(ParamTy, + SYCLTypeAttr::dynamic_work_group_memory)) { const auto *RecordDecl = ParamTy->getAsCXXRecordDecl(); assert(RecordDecl && "The type must be a RecordDecl"); CXXMethodDecl *InitMethod = getMethodByName(RecordDecl, InitMethodName); @@ -4544,7 +4549,9 @@ class FreeFunctionKernelBodyCreator : public SyclKernelFieldHandler { // TODO: Revisit this approach once https://github.com/intel/llvm/issues/16061 // is closed. bool handleSyclSpecialType(ParmVarDecl *PD, QualType ParamTy) final { - if (SemaSYCL::isSyclType(ParamTy, SYCLTypeAttr::work_group_memory)) { + if (SemaSYCL::isSyclType(ParamTy, SYCLTypeAttr::work_group_memory) || + SemaSYCL::isSyclType(ParamTy, + SYCLTypeAttr::dynamic_work_group_memory)) { const auto *RecordDecl = ParamTy->getAsCXXRecordDecl(); AccessSpecifier DefaultConstructorAccess; auto DefaultConstructor = @@ -4823,6 +4830,10 @@ class SyclKernelIntHeaderCreator : public SyclKernelFieldHandler { } else if (SemaSYCL::isSyclType(FieldTy, SYCLTypeAttr::work_group_memory)) { addParam(FieldTy, SYCLIntegrationHeader::kind_work_group_memory, offsetOf(RD, BC.getType()->getAsCXXRecordDecl())); + } else if (SemaSYCL::isSyclType(FieldTy, + SYCLTypeAttr::dynamic_work_group_memory)) { + addParam(FieldTy, SYCLIntegrationHeader::kind_dynamic_work_group_memory, + offsetOf(RD, BC.getType()->getAsCXXRecordDecl())); } return true; } @@ -4846,6 +4857,10 @@ class SyclKernelIntHeaderCreator : public SyclKernelFieldHandler { } else if (SemaSYCL::isSyclType(FieldTy, SYCLTypeAttr::work_group_memory)) { addParam(FieldTy, SYCLIntegrationHeader::kind_work_group_memory, offsetOf(FD, FieldTy)); + } else if (SemaSYCL::isSyclType(FieldTy, + SYCLTypeAttr::dynamic_work_group_memory)) { + addParam(FieldTy, SYCLIntegrationHeader::kind_dynamic_work_group_memory, + offsetOf(FD, FieldTy)); } else if (SemaSYCL::isSyclType(FieldTy, SYCLTypeAttr::sampler) || SemaSYCL::isSyclType(FieldTy, SYCLTypeAttr::annotated_ptr) || SemaSYCL::isSyclType(FieldTy, SYCLTypeAttr::annotated_arg)) { @@ -4870,6 +4885,10 @@ class SyclKernelIntHeaderCreator : public SyclKernelFieldHandler { bool handleSyclSpecialType(ParmVarDecl *PD, QualType ParamTy) final { if (SemaSYCL::isSyclType(ParamTy, SYCLTypeAttr::work_group_memory)) addParam(PD, ParamTy, SYCLIntegrationHeader::kind_work_group_memory); + else if (SemaSYCL::isSyclType(ParamTy, + SYCLTypeAttr::dynamic_work_group_memory)) + addParam(PD, ParamTy, + SYCLIntegrationHeader::kind_dynamic_work_group_memory); else unsupportedFreeFunctionParamType(); // TODO return true; @@ -5993,6 +6012,7 @@ static const char *paramKind2Str(KernelParamKind K) { CASE(specialization_constants_buffer); CASE(pointer); CASE(work_group_memory); + CASE(dynamic_work_group_memory); } return ""; diff --git a/clang/test/CodeGenSYCL/Inputs/sycl.hpp b/clang/test/CodeGenSYCL/Inputs/sycl.hpp index 9fbf260b0c350..74b5f68edc34c 100644 --- a/clang/test/CodeGenSYCL/Inputs/sycl.hpp +++ b/clang/test/CodeGenSYCL/Inputs/sycl.hpp @@ -667,6 +667,19 @@ __SYCL_TYPE(work_group_memory) work_group_memory { __attribute((opencl_local)) DataT *Ptr; }; +template +class __attribute__((sycl_special_class)) +__SYCL_TYPE(dynamic_work_group_memory) dynamic_work_group_memory { +public: + dynamic_work_group_memory() = default; + + void __init(__attribute((opencl_local)) DataT *Ptr) { this->LocalMem.__init(Ptr); } + work_group_memory get() const { return LocalMem; } + +private: + work_group_memory LocalMem; +}; + template class buffer { diff --git a/clang/test/CodeGenSYCL/dynamic_work_group_memory.cpp b/clang/test/CodeGenSYCL/dynamic_work_group_memory.cpp new file mode 100644 index 0000000000000..4e2602b5cbaa7 --- /dev/null +++ b/clang/test/CodeGenSYCL/dynamic_work_group_memory.cpp @@ -0,0 +1,39 @@ +// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -disable-llvm-passes -emit-llvm %s -o %t.ll +// RUN: FileCheck < %t.ll %s --check-prefix CHECK-IR +// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -fsycl-int-header=%t.h %s +// RUN: FileCheck < %t.h %s --check-prefix CHECK-INT-HEADER +// +// Tests for dynamic_work_group_memory kernel parameter using the dummy implementation in Inputs/sycl.hpp. +// The first two RUN commands verify that the init call is generated with the correct arguments in LLVM IR +// and the second two RUN commands verify the contents of the integration header produced by the frontend. +// +// CHECK-IR: define dso_local spir_kernel void @ +// CHECK-IR-SAME: ptr addrspace(3) noundef align 4 [[PTR:%[a-zA-Z0-9_]+]] +// +// CHECK-IR: [[PTR]].addr = alloca ptr addrspace(3), align 8 +// CHECK-IR: [[PTR]].addr.ascast = addrspacecast ptr [[PTR]].addr to ptr addrspace(4) +// CHECK-IR: store ptr addrspace(3) [[PTR]], ptr addrspace(4) [[PTR]].addr.ascast, align 8 +// CHECK-IR: [[PTR_LOAD:%[a-zA-Z0-9_]+]] = load ptr addrspace(3), ptr addrspace(4) [[PTR]].addr.ascast, align 8 +// +// CHECK-IR: call spir_func void @{{.*}}__init{{.*}}(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) %{{[a-zA-Z0-9_]+}}, ptr addrspace(3) noundef [[PTR_LOAD]]) +// +// CHECK-INT-HEADER: const kernel_param_desc_t kernel_signatures[] = { +// CHECK-INT-HEADER-NEXT: //--- _ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_EUlNS0_4itemILi1EEEE_ +// CHECK-INT-HEADER-NEXT: { kernel_param_kind_t::kind_dynamic_work_group_memory, {{[4,8]}}, 0 }, +// CHECK-INT-HEADER-EMPTY: +// CHECK-INT-HEADER-NEXT: { kernel_param_kind_t::kind_invalid, -987654321, -987654321 }, +// CHECK-INT-HEADER-NEXT: }; + +#include "Inputs/sycl.hpp" + +int main() { + sycl::queue Q; + sycl::dynamic_work_group_memory dynMem; + Q.submit([&](sycl::handler &CGH) { + sycl::range<1> ndr; + CGH.parallel_for(ndr, [=](sycl::item<1> it) { + auto localMem = dynMem.get(); + int *ptr = &localMem; }); + }); + return 0; +} diff --git a/clang/test/CodeGenSYCL/free_function_int_header.cpp b/clang/test/CodeGenSYCL/free_function_int_header.cpp index 6a196dedc2fc2..9e5fdd9fd495f 100644 --- a/clang/test/CodeGenSYCL/free_function_int_header.cpp +++ b/clang/test/CodeGenSYCL/free_function_int_header.cpp @@ -2,7 +2,8 @@ // RUN: FileCheck -input-file=%t.h %s // // This test checks integration header contents for free functions with scalar, -// pointer, non-decomposed struct parameters and work group memory parameters. +// pointer, non-decomposed struct parameters, work group memory parameters and +// dynamic work group memory parameters. #include "mock_properties.hpp" #include "sycl.hpp" @@ -101,6 +102,11 @@ __attribute__((sycl_device)) void ff_8(sycl::work_group_memory) { } +__attribute__((sycl_device)) +[[__sycl_detail__::add_ir_attributes_function("sycl-nd-range-kernel", 0)]] +void ff_9(sycl::dynamic_work_group_memory) { +} + // CHECK: const char* const kernel_names[] = { // CHECK-NEXT: {{.*}}__sycl_kernel_ff_2Piii @@ -112,6 +118,7 @@ void ff_8(sycl::work_group_memory) { // CHECK-NEXT: {{.*}}__sycl_kernel_ff_6I3Agg7DerivedEvT_T0_i // CHECK-NEXT: {{.*}}__sycl_kernel_ff_7ILi3EEv16KArgWithPtrArrayIXT_EE // CHECK-NEXT: {{.*}}__sycl_kernel_ff_8N4sycl3_V117work_group_memoryIiEE +// CHECK-NEXT: {{.*}}__sycl_kernel_ff_9N4sycl3_V125dynamic_work_group_memoryIiEE // CHECK-NEXT: "" // CHECK-NEXT: }; @@ -158,6 +165,9 @@ void ff_8(sycl::work_group_memory) { // CHECK: //--- _Z18__sycl_kernel_ff_8N4sycl3_V117work_group_memoryIiEE // CHECK-NEXT: { kernel_param_kind_t::kind_work_group_memory, 8, 0 }, +// CHECK: //--- _Z18__sycl_kernel_ff_9N4sycl3_V125dynamic_work_group_memoryIiEE +// CHECK-NEXT: { kernel_param_kind_t::kind_dynamic_work_group_memory, 8, 0 }, + // CHECK: { kernel_param_kind_t::kind_invalid, -987654321, -987654321 }, // CHECK-NEXT: }; @@ -324,6 +334,26 @@ void ff_8(sycl::work_group_memory) { // CHECK-NEXT: }; // CHECK-NEXT: } +// CHECK: // Definition of _Z18__sycl_kernel_ff_9N4sycl3_V125dynamic_work_group_memoryIiEE as a free function kernel +// +// CHECK: Forward declarations of kernel and its argument types: +// CHECK: template class dynamic_work_group_memory; + +// CHECK: void ff_9(sycl::dynamic_work_group_memory); +// CHECK-NEXT: static constexpr auto __sycl_shim10() { +// CHECK-NEXT: return (void (*)(class sycl::dynamic_work_group_memory))ff_9; +// CHECK-NEXT: } +// CHECK-NEXT: namespace sycl { +// CHECK-NEXT: template <> +// CHECK-NEXT: struct ext::oneapi::experimental::is_kernel<__sycl_shim10()> { +// CHECK-NEXT: static constexpr bool value = true; +// CHECK-NEXT: }; +// CHECK-NEXT: template <> +// CHECK-NEXT: struct ext::oneapi::experimental::is_single_task_kernel<__sycl_shim10()> { +// CHECK-NEXT: static constexpr bool value = true; +// CHECK-NEXT: }; +// CHECK-NEXT: } + // CHECK: #include // CHECK: Definition of kernel_id of _Z18__sycl_kernel_ff_2Piii @@ -397,3 +427,11 @@ void ff_8(sycl::work_group_memory) { // CHECK-NEXT: return sycl::detail::get_kernel_id_impl(std::string_view{"_Z18__sycl_kernel_ff_8N4sycl3_V117work_group_memoryIiEE"}); // CHECK-NEXT: } // CHECK-NEXT: } +// +// CHECK: // Definition of kernel_id of _Z18__sycl_kernel_ff_9N4sycl3_V125dynamic_work_group_memoryIiEE +// CHECK-NEXT: namespace sycl { +// CHECK-NEXT: template <> +// CHECK-NEXT: kernel_id ext::oneapi::experimental::get_kernel_id<__sycl_shim10()>() { +// CHECK-NEXT: return sycl::detail::get_kernel_id_impl(std::string_view{"_Z18__sycl_kernel_ff_9N4sycl3_V125dynamic_work_group_memoryIiEE"}); +// CHECK-NEXT: } +// CHECK-NEXT: } diff --git a/clang/test/CodeGenSYCL/free_function_kernel_params.cpp b/clang/test/CodeGenSYCL/free_function_kernel_params.cpp index 2e78116824ad2..ef54faa79e614 100644 --- a/clang/test/CodeGenSYCL/free_function_kernel_params.cpp +++ b/clang/test/CodeGenSYCL/free_function_kernel_params.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -internal-isystem %S/Inputs -fsycl-is-device -triple spir64 \ // RUN: -emit-llvm %s -o - | FileCheck %s // This test checks parameter IR generation for free functions with parameters -// of non-decomposed struct type and work group memory type. +// of non-decomposed struct type, work group memory type and dynamic work group memory type. #include "sycl.hpp" @@ -71,3 +71,16 @@ void ff_7(sycl::work_group_memory mem) { // CHECK-NEXT: [[REGISTER:%[a-zA-Z0-9_]+]] = load ptr addrspace(3), ptr addrspace(4) %__arg_Ptr.addr.ascast, align 8 // CHECK-NEXT: call spir_func void @{{.*}}work_group_memory{{.*}}__init{{.*}}(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) %mem.ascast, ptr addrspace(3) noundef [[REGISTER]]) +__attribute__((sycl_device)) +[[__sycl_detail__::add_ir_attributes_function("sycl-nd-range-kernel", 0)]] +void ff_7(sycl::dynamic_work_group_memory DynMem) { +} + +// CHECK: define dso_local spir_kernel void @{{.*}}__sycl_kernel_ff_7{{.*}}(ptr addrspace(3) noundef align 4 %__arg_Ptr) +// CHECK: %__arg_Ptr.addr = alloca ptr addrspace(3), align 8 +// CHECK-NEXT: %DynMem = alloca %"class.sycl::_V1::dynamic_work_group_memory", align 8 +// CHECK: %__arg_Ptr.addr.ascast = addrspacecast ptr %__arg_Ptr.addr to ptr addrspace(4) +// CHECK-NEXT: %DynMem.ascast = addrspacecast ptr %DynMem to ptr addrspace(4) +// CHECK: store ptr addrspace(3) %__arg_Ptr, ptr addrspace(4) %__arg_Ptr.addr.ascast, align 8 +// CHECK-NEXT: [[REGISTER:%[a-zA-Z0-9_]+]] = load ptr addrspace(3), ptr addrspace(4) %__arg_Ptr.addr.ascast, align 8 +// CHECK-NEXT: call spir_func void @{{.*}}dynamic_work_group_memory{{.*}}__init{{.*}}(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) %DynMem.ascast, ptr addrspace(3) noundef [[REGISTER]]) diff --git a/clang/test/SemaSYCL/Inputs/sycl.hpp b/clang/test/SemaSYCL/Inputs/sycl.hpp index 5df1550ed2dcb..f1521af2b34f2 100644 --- a/clang/test/SemaSYCL/Inputs/sycl.hpp +++ b/clang/test/SemaSYCL/Inputs/sycl.hpp @@ -452,11 +452,9 @@ class __SYCL_TYPE(multi_ptr) multi_ptr { template class __attribute__((sycl_special_class)) __SYCL_TYPE(work_group_memory) work_group_memory { - -// Default constructor for objects later initialized with __init member. - work_group_memory() = default; - public: + // Default constructor for objects later initialized with __init member. + work_group_memory() = default; work_group_memory(handler &CGH) {} void __init(__attribute((opencl_local)) DataT *Ptr) { this->Ptr = Ptr; } @@ -465,6 +463,19 @@ __SYCL_TYPE(work_group_memory) work_group_memory { __attribute((opencl_local)) DataT *Ptr; }; +template +class __attribute__((sycl_special_class)) +__SYCL_TYPE(dynamic_work_group_memory) dynamic_work_group_memory { +public: + dynamic_work_group_memory() = default; + + void __init(__attribute((opencl_local)) DataT *Ptr) { this->LocalMem.__init(Ptr); } + work_group_memory get() const { return LocalMem; } + +private: + work_group_memory LocalMem; +}; + namespace ext { namespace oneapi { namespace experimental { diff --git a/clang/test/SemaSYCL/Inputs/sycl/detail/kernel_desc.hpp b/clang/test/SemaSYCL/Inputs/sycl/detail/kernel_desc.hpp index 820013ed0eff7..413743731a6e5 100644 --- a/clang/test/SemaSYCL/Inputs/sycl/detail/kernel_desc.hpp +++ b/clang/test/SemaSYCL/Inputs/sycl/detail/kernel_desc.hpp @@ -19,6 +19,7 @@ namespace detail { kind_specialization_constants_buffer = 4, kind_stream = 5, kind_work_group_memory = 6, + kind_dynamic_work_group_memory = 7, kind_invalid = 0xf, // not a valid kernel kind }; diff --git a/clang/test/SemaSYCL/free_function_kernel_params.cpp b/clang/test/SemaSYCL/free_function_kernel_params.cpp index da229145a34ad..0b930064cbca9 100644 --- a/clang/test/SemaSYCL/free_function_kernel_params.cpp +++ b/clang/test/SemaSYCL/free_function_kernel_params.cpp @@ -1,7 +1,8 @@ // RUN: %clang_cc1 -internal-isystem %S/Inputs -fsycl-is-device -ast-dump \ // RUN: %s -o - | FileCheck %s // This test checks parameter rewriting for free functions with parameters -// of type scalar, pointer, non-decomposed struct and work group memory. +// of type scalar, pointer, non-decomposed struct, work group memory and +// dynamic work group memory. #include "sycl.hpp" @@ -191,3 +192,23 @@ void ff_7(sycl::work_group_memory mem) { // CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(sycl::work_group_memory)' // CHECK-NEXT: DeclRefExpr {{.*}} 'void (sycl::work_group_memory)' lvalue Function {{.*}} 'ff_7' 'void (sycl::work_group_memory)' // CHECK-NEXT: DeclRefExpr {{.*}} 'sycl::work_group_memory' Var {{.*}} 'mem' 'sycl::work_group_memory' + +__attribute__((sycl_device)) +[[__sycl_detail__::add_ir_attributes_function("sycl-nd-range-kernel", 0)]] +void ff_8(sycl::dynamic_work_group_memory DynMem) { +} +// CHECK: FunctionDecl {{.*}}__sycl_kernel{{.*}}'void (__local int *)' +// CHECK-NEXT: ParmVarDecl {{.*}} used __arg_Ptr '__local int *' +// CHECK-NEXT: CompoundStmt +// CHECK-NEXT: DeclStmt +// CHECK-NEXT: VarDecl {{.*}} used DynMem 'sycl::dynamic_work_group_memory' callinit +// CHECK-NEXT: CXXConstructExpr {{.*}} 'sycl::dynamic_work_group_memory' 'void () noexcept' +// CHECK-NEXT: CXXMemberCallExpr {{.*}} 'void' +// CHECK-NEXT: MemberExpr {{.*}} 'void (__local int *)' lvalue .__init +// CHECK-NEXT: DeclRefExpr {{.*}} 'sycl::dynamic_work_group_memory' Var {{.*}} 'DynMem' 'sycl::dynamic_work_group_memory' +// CHECK-NEXT: ImplicitCastExpr {{.*}} '__local int *' +// CHECK-NEXT: DeclRefExpr {{.*}} '__local int *' lvalue ParmVar {{.*}} '__arg_Ptr' '__local int *' +// CHECK-NEXT: CallExpr {{.*}} 'void' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'void (*)(sycl::dynamic_work_group_memory)' +// CHECK-NEXT: DeclRefExpr {{.*}} 'void (sycl::dynamic_work_group_memory)' lvalue Function {{.*}} 'ff_8' 'void (sycl::dynamic_work_group_memory)' +// CHECK-NEXT: DeclRefExpr {{.*}} 'sycl::dynamic_work_group_memory' Var {{.*}} 'DynMem' 'sycl::dynamic_work_group_memory' diff --git a/sycl-jit/common/include/Kernel.h b/sycl-jit/common/include/Kernel.h index 25885c775a96d..44dc934914a48 100644 --- a/sycl-jit/common/include/Kernel.h +++ b/sycl-jit/common/include/Kernel.h @@ -60,6 +60,7 @@ enum class ParameterKind : uint32_t { SpecConstBuffer = 4, Stream = 5, WorkGroupMemory = 6, + DynamicWorkGroupMemory = 7, Invalid = 0xF, }; diff --git a/sycl/include/sycl/detail/kernel_desc.hpp b/sycl/include/sycl/detail/kernel_desc.hpp index d4fd31fc889f5..d782f97463167 100644 --- a/sycl/include/sycl/detail/kernel_desc.hpp +++ b/sycl/include/sycl/detail/kernel_desc.hpp @@ -59,6 +59,7 @@ enum class kernel_param_kind_t { kind_specialization_constants_buffer = 4, kind_stream = 5, kind_work_group_memory = 6, + kind_dynamic_work_group_memory = 7, kind_invalid = 0xf, // not a valid kernel kind }; diff --git a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp index 59f69fa63e839..5f257b0fb65f1 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/graph.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/graph.hpp @@ -17,9 +17,11 @@ #ifdef __INTEL_PREVIEW_BREAKING_CHANGES #include #endif -#include // for device +#include // for device #include // for graph properties classes -#include // for range, nd_range +#include // for dynamic_work_group_memory +#include // for empty_properties_t +#include // for range, nd_range #include // for is_property, is_property_of #include // for property_list @@ -47,6 +49,7 @@ enum class graph_state { // Forward declare ext::oneapi::experimental classes template class command_graph; class raw_kernel_arg; +template class work_group_memory; namespace detail { // List of sycl features and extensions which are not supported by graphs. Used @@ -501,6 +504,10 @@ class command_graph namespace detail { class __SYCL_EXPORT dynamic_parameter_base { public: + dynamic_parameter_base() = default; + dynamic_parameter_base( + sycl::ext::oneapi::experimental::command_graph + Graph); dynamic_parameter_base( sycl::ext::oneapi::experimental::command_graph Graph, @@ -525,14 +532,98 @@ class __SYCL_EXPORT dynamic_parameter_base { void updateValue(const raw_kernel_arg *NewRawValue, size_t Size); void updateAccessor(const sycl::detail::AccessorBaseHost *Acc); + + void updateWorkGroupMem(size_t BufferSize); + std::shared_ptr impl; template friend const decltype(Obj::impl) & sycl::detail::getSyclObjImpl(const Obj &SyclObject); }; + +class dynamic_work_group_memory_base +#ifndef __SYCL_DEVICE_ONLY__ + : public dynamic_parameter_base +#endif +{ +public: + dynamic_work_group_memory_base() = default; + dynamic_work_group_memory_base( + [[maybe_unused]] experimental::command_graph + Graph, + [[maybe_unused]] size_t Size) +#ifndef __SYCL_DEVICE_ONLY__ + : dynamic_parameter_base(Graph), BufferSize(Size) +#endif + { + } + +private: +#ifdef __SYCL_DEVICE_ONLY__ + [[maybe_unused]] unsigned char Padding[sizeof(dynamic_parameter_base)]; +#endif + size_t BufferSize{}; + friend class sycl::handler; +}; } // namespace detail +template +class __SYCL_SPECIAL_CLASS +__SYCL_TYPE(dynamic_work_group_memory) dynamic_work_group_memory + : public detail::dynamic_work_group_memory_base { +public: + // Check that DataT is an unbounded array type. + static_assert(std::is_array_v && std::extent_v == 0); + static_assert(std::is_same_v); + + // Frontend requires special types to have a default constructor in order to + // have a uniform way of initializing an object of special type to then call + // the __init method on it. This is purely an implementation detail and not + // part of the spec. + // TODO: Revisit this once https://github.com/intel/llvm/issues/16061 is + // closed. + dynamic_work_group_memory() = default; + + /// Constructs a new dynamic_work_group_memory object. + /// @param Graph The graph associated with this object. + /// @param Num Number of elements in the unbounded array DataT. + dynamic_work_group_memory( + experimental::command_graph Graph, size_t Num) + : detail::dynamic_work_group_memory_base( + Graph, Num * sizeof(std::remove_extent_t)) {} + + work_group_memory get() const { +#ifndef __SYCL_DEVICE_ONLY__ + throw sycl::exception(sycl::make_error_code(errc::invalid), + "Error: dynamic_work_group_memory::get() can be only " + "called on the device!"); +#endif + return WorkGroupMem; + } + + /// Updates on the host this dynamic_work_group_memory and all registered + /// nodes with a new buffer size. + /// @param Num The new number of elements in the unbounded array. + void update([[maybe_unused]] size_t Num) { +#ifndef __SYCL_DEVICE_ONLY__ + detail::dynamic_parameter_base::updateWorkGroupMem( + Num * sizeof(std::remove_extent_t)); +#endif + } + +private: + work_group_memory WorkGroupMem; + +#ifdef __SYCL_DEVICE_ONLY__ + using value_type = std::remove_all_extents_t; + using decoratedPtr = typename sycl::detail::DecoratedType< + value_type, access::address_space::local_space>::type *; + + void __init(decoratedPtr Ptr) { this->WorkGroupMem.__init(Ptr); } +#endif +}; + template class dynamic_parameter : public detail::dynamic_parameter_base { static constexpr bool IsAccessor = @@ -607,4 +698,14 @@ struct hash> { return std::hash()(ID); } }; + +template +struct hash> { + size_t operator()( + const sycl::ext::oneapi::experimental::dynamic_work_group_memory + &DynWorkGroupMem) const { + auto ID = sycl::detail::getSyclObjImpl(DynWorkGroupMem)->getID(); + return std::hash()(ID); + } +}; } // namespace std diff --git a/sycl/include/sycl/ext/oneapi/experimental/work_group_memory.hpp b/sycl/include/sycl/ext/oneapi/experimental/work_group_memory.hpp index c03bdef7efceb..ebee7791b9841 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/work_group_memory.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/work_group_memory.hpp @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -115,6 +116,9 @@ class __SYCL_SPECIAL_CLASS __SYCL_TYPE(work_group_memory) work_group_memory friend class sycl::handler; // needed in order for handler class to be aware // of the private inheritance with // work_group_memory_impl as base class + + template friend class dynamic_work_group_memory; + decoratedPtr ptr = nullptr; }; } // namespace ext::oneapi::experimental diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 2f7bbdfebc1f1..ac26234b547da 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -150,6 +150,7 @@ class pipe; namespace ext ::oneapi ::experimental { template class work_group_memory; +template class dynamic_work_group_memory; struct image_descriptor; __SYCL_EXPORT void async_free(sycl::handler &h, void *ptr); __SYCL_EXPORT void *async_malloc(sycl::handler &h, sycl::usm::alloc kind, @@ -160,6 +161,8 @@ __SYCL_EXPORT void *async_malloc_from_pool(sycl::handler &h, size_t size, namespace ext::oneapi::experimental::detail { class graph_impl; +class dynamic_parameter_base; +class dynamic_work_group_memory_base; } // namespace ext::oneapi::experimental::detail namespace detail { @@ -680,6 +683,12 @@ class __SYCL_EXPORT handler { registerDynamicParameter(DynamicParam, ArgIndex); } + // setArgHelper for graph dynamic_work_group_memory + void + setArgHelper(int ArgIndex, + ext::oneapi::experimental::detail::dynamic_work_group_memory_base + &DynWorkGroupBase); + // setArgHelper for the raw_kernel_arg extension type. void setArgHelper(int ArgIndex, sycl::ext::oneapi::experimental::raw_kernel_arg &&Arg) { @@ -1879,6 +1888,18 @@ class __SYCL_EXPORT handler { setArgHelper(argIndex, dynamicParam); } + // set_arg for graph dynamic_work_group_memory + template + void set_arg( + int argIndex, + ext::oneapi::experimental::dynamic_work_group_memory + &dynWorkGroupMem) { + ext::oneapi::experimental::detail::dynamic_work_group_memory_base + &dynWorkGroupBase = dynWorkGroupMem; + setArgHelper(argIndex, dynWorkGroupBase); + } + // set_arg for the raw_kernel_arg extension type. void set_arg(int argIndex, ext::oneapi::experimental::raw_kernel_arg &&Arg) { setArgHelper(argIndex, std::move(Arg)); @@ -3771,7 +3792,8 @@ class __SYCL_EXPORT handler { "A local accessor must not be used in a SYCL kernel function " "that is invoked via single_task or via the simple form of " "parallel_for that takes a range parameter."); - if (Kind == detail::kernel_param_kind_t::kind_work_group_memory) + if (Kind == detail::kernel_param_kind_t::kind_work_group_memory || + Kind == detail::kernel_param_kind_t::kind_dynamic_work_group_memory) throw sycl::exception( make_error_code(errc::kernel_argument), "A work group memory object must not be used in a SYCL kernel " diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp index 1bb649e2b9459..35f3b461bc01b 100644 --- a/sycl/source/detail/graph_impl.cpp +++ b/sycl/source/detail/graph_impl.cpp @@ -1994,6 +1994,11 @@ void executable_command_graph::update(const std::vector &Nodes) { impl->update(NodeImpls); } +dynamic_parameter_base::dynamic_parameter_base( + command_graph Graph) + : impl(std::make_shared( + sycl::detail::getSyclObjImpl(Graph))) {} + dynamic_parameter_base::dynamic_parameter_base( command_graph Graph, size_t ParamSize, const void *Data) @@ -2014,6 +2019,10 @@ void dynamic_parameter_base::updateAccessor( impl->updateAccessor(Acc); } +void dynamic_parameter_base::updateWorkGroupMem(size_t BufferSize) { + impl->updateWorkGroupMem(BufferSize); +} + void dynamic_parameter_impl::updateValue(const raw_kernel_arg *NewRawValue, size_t Size) { // Number of bytes is taken from member of raw_kernel_arg object rather @@ -2069,6 +2078,39 @@ void dynamic_parameter_impl::updateAccessor( sizeof(sycl::detail::AccessorBaseHost)); } +void dynamic_parameter_impl::updateWorkGroupMem(size_t BufferSize) { + for (auto &[NodeWeak, ArgIndex] : MNodes) { + auto NodeShared = NodeWeak.lock(); + if (NodeShared) { + dynamic_parameter_impl::updateCGWorkGroupMem(NodeShared->MCommandGroup, + ArgIndex, BufferSize); + } + } + + for (auto &DynCGInfo : MDynCGs) { + auto DynCG = DynCGInfo.DynCG.lock(); + if (DynCG) { + auto &CG = DynCG->MCommandGroups[DynCGInfo.CGIndex]; + dynamic_parameter_impl::updateCGWorkGroupMem(CG, DynCGInfo.ArgIndex, + BufferSize); + } + } +} + +void dynamic_parameter_impl::updateCGWorkGroupMem( + std::shared_ptr CG, int ArgIndex, size_t BufferSize) { + + auto &Args = static_cast(CG.get())->MArgs; + for (auto &Arg : Args) { + if (Arg.MIndex != ArgIndex) { + continue; + } + assert(Arg.MType == sycl::detail::kernel_param_kind_t::kind_std_layout); + Arg.MSize = BufferSize; + break; + } +} + void dynamic_parameter_impl::updateCGArgValue( std::shared_ptr CG, int ArgIndex, const void *NewValue, size_t Size) { diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp index 6045cc9ededac..39c90ea0abd23 100644 --- a/sycl/source/detail/graph_impl.hpp +++ b/sycl/source/detail/graph_impl.hpp @@ -1479,6 +1479,9 @@ class exec_graph_impl { class dynamic_parameter_impl { public: + dynamic_parameter_impl(std::shared_ptr GraphImpl) + : MGraph(GraphImpl) {} + dynamic_parameter_impl(std::shared_ptr GraphImpl, size_t ParamSize, const void *Data) : MGraph(GraphImpl), MValueStorage(ParamSize), @@ -1546,6 +1549,22 @@ class dynamic_parameter_impl { /// @param Acc The new accessor value void updateAccessor(const sycl::detail::AccessorBaseHost *Acc); + /// Update the internal value of this dynamic parameter as well as the value + /// of this parameter in all registered nodes and dynamic CGs. Should only be + /// called for dynamic_work_group_memory arguments parameter. + /// @param BufferSize The total size in bytes of the new work_group_memory + /// array + void updateWorkGroupMem(size_t BufferSize); + + /// Static helper function for updating command-group + /// dynamic_work_group_memory arguments. + /// @param CG The command-group to update the argument information for. + /// @param ArgIndex The argument index to update. + /// @param BufferSize The total size in bytes of the new work_group_memory + /// array + static void updateCGWorkGroupMem(std::shared_ptr CG, + int ArgIndex, size_t BufferSize); + /// Static helper function for updating command-group value arguments. /// @param CG The command-group to update the argument information for. /// @param ArgIndex The argument index to update. @@ -1569,7 +1588,7 @@ class dynamic_parameter_impl { // Dynamic command-groups which will be updated std::vector MDynCGs; - std::shared_ptr MGraph; + std::weak_ptr MGraph; std::vector MValueStorage; private: diff --git a/sycl/source/detail/jit_compiler.cpp b/sycl/source/detail/jit_compiler.cpp index e24e49cf68463..3b9441455f295 100644 --- a/sycl/source/detail/jit_compiler.cpp +++ b/sycl/source/detail/jit_compiler.cpp @@ -180,6 +180,8 @@ translateArgType(kernel_param_kind_t Kind) { return PK::Stream; case kind::kind_work_group_memory: return PK::WorkGroupMemory; + case kind::kind_dynamic_work_group_memory: + return PK::DynamicWorkGroupMemory; case kind::kind_invalid: return PK::Invalid; } diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 93b78628c13b1..80166ee6342cb 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2319,6 +2319,8 @@ void SetArgBasedOnType( const ContextImplPtr &ContextImpl, detail::ArgDesc &Arg, size_t NextTrueIndex) { switch (Arg.MType) { + case kernel_param_kind_t::kind_dynamic_work_group_memory: + break; case kernel_param_kind_t::kind_work_group_memory: break; case kernel_param_kind_t::kind_stream: diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 5103e55246693..89905f6625632 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -33,6 +33,7 @@ #include #include +#include "sycl/ext/oneapi/experimental/graph.hpp" #include #include #include @@ -1009,6 +1010,21 @@ void handler::processArg(void *Ptr, const detail::kernel_param_kind_t &Kind, } break; } + case kernel_param_kind_t::kind_dynamic_work_group_memory: { + + auto *DynBase = static_cast< + ext::oneapi::experimental::detail::dynamic_parameter_base *>(Ptr); + + auto *DynWorkGroupBase = static_cast< + ext::oneapi::experimental::detail::dynamic_work_group_memory_base *>( + Ptr); + + registerDynamicParameter(*DynBase, Index + IndexShift); + + addArg(kernel_param_kind_t::kind_std_layout, nullptr, + DynWorkGroupBase->BufferSize, Index + IndexShift); + break; + } case kernel_param_kind_t::kind_work_group_memory: { addArg(kernel_param_kind_t::kind_std_layout, nullptr, static_cast(Ptr)->buffer_size, @@ -1039,6 +1055,19 @@ void handler::setArgHelper(int ArgIndex, detail::work_group_memory_impl &Arg) { impl->MWorkGroupMemoryObjects.back().get(), 0, ArgIndex); } +void handler::setArgHelper( + int ArgIndex, + ext::oneapi::experimental::detail::dynamic_work_group_memory_base + &DynWorkGroupBase) { + + addArg(detail::kernel_param_kind_t::kind_dynamic_work_group_memory, + &DynWorkGroupBase, 0, ArgIndex); + + // Register the dynamic parameter with the handler for later association + // with the node being added + registerDynamicParameter(DynWorkGroupBase, ArgIndex); +} + // The argument can take up more space to store additional information about // MAccessRange, MMemoryRange, and MOffset added with addArgsForGlobalAccessor. // We use the worst-case estimate because the lifetime of the vector is short. @@ -2069,7 +2098,7 @@ void handler::registerDynamicParameter( } auto Paraimpl = detail::getSyclObjImpl(DynamicParamBase); - if (Paraimpl->MGraph != this->impl->MGraph) { + if (Paraimpl->MGraph.lock() != this->impl->MGraph) { throw sycl::exception( make_error_code(errc::invalid), "Cannot use a Dynamic Parameter with a node associated with a graph " diff --git a/sycl/test-e2e/Graph/Update/FreeFunctionKernels/dyn_work_group_memory_basic.cpp b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/dyn_work_group_memory_basic.cpp new file mode 100644 index 0000000000000..4cedb5a47c7f2 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/dyn_work_group_memory_basic.cpp @@ -0,0 +1,63 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// XFAIL: cuda +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/16004 + +// Tests updating dynamic_work_group_memory with a new size. + +#include "../../graph_common.hpp" +#include "free_function_kernels.hpp" + +int main() { + constexpr int LocalSize{32}; + nd_range<1> NDRange{Size, LocalSize}; + + queue Queue{}; + context Ctxt{Queue.get_context()}; + + int *PtrA = malloc_device(Size, Queue); + std::vector HostDataA(Size); + + exp_ext::command_graph Graph{Ctxt, Queue.get_device()}; + exp_ext::dynamic_work_group_memory DynLocalMem{Graph, LocalSize}; + + Queue.memset(PtrA, 0, Size * sizeof(int)).wait(); + + kernel_bundle Bundle = get_kernel_bundle(Ctxt); + kernel_id KernelID = exp_ext::get_kernel_id(); + kernel Kernel = Bundle.get_kernel(KernelID); + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.set_arg(0, DynLocalMem); + cgh.set_arg(1, PtrA); + cgh.parallel_for(NDRange, Kernel); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(HostDataA[i] == LocalSize * LocalSize); + } + + constexpr int NewLocalSize{64}; + DynLocalMem.update(NewLocalSize); + KernelNode.update_nd_range(nd_range<1>{range{Size}, range{NewLocalSize}}); + ExecGraph.update(KernelNode); + + Queue.memset(PtrA, 0, Size * sizeof(int)).wait(); + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(HostDataA[i] == NewLocalSize * NewLocalSize); + } + + free(PtrA, Queue); + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/FreeFunctionKernels/free_function_kernels.hpp b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/free_function_kernels.hpp index d319d819abdad..3f595568876ae 100644 --- a/sycl/test-e2e/Graph/Update/FreeFunctionKernels/free_function_kernels.hpp +++ b/sycl/test-e2e/Graph/Update/FreeFunctionKernels/free_function_kernels.hpp @@ -4,6 +4,7 @@ #include "sycl/ext/oneapi/kernel_properties/properties.hpp" #include "sycl/kernel_bundle.hpp" #include +#include SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::single_task_kernel)) void ff_0(int *Ptr) { @@ -55,3 +56,17 @@ void ff_6(int *Ptr, int ScalarValue) { Ptr[i] = ScalarValue; } } +SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((exp_ext::nd_range_kernel<1>)) +void ff_7(exp_ext::dynamic_work_group_memory DynLocalMem, int *PtrA) { + const auto Item = sycl::ext::oneapi::this_work_item::get_nd_item<1>(); + size_t GlobalID = Item.get_global_id(); + auto LocalRange = Item.get_local_range(0); + auto LocalMem = DynLocalMem.get(); + + LocalMem[Item.get_local_id()] = LocalRange; + group_barrier(Item.get_group()); + + for (size_t i{0}; i < LocalRange; ++i) { + PtrA[GlobalID] += LocalMem[i]; + } +} diff --git a/sycl/test-e2e/Graph/Update/dyn_cgf_with_dyn_work_group_mem.cpp b/sycl/test-e2e/Graph/Update/dyn_cgf_with_dyn_work_group_mem.cpp new file mode 100644 index 0000000000000..1d5d57b98af78 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_cgf_with_dyn_work_group_mem.cpp @@ -0,0 +1,92 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// Tests using a dynamic command-group object with dynamic_work_group_memory. + +#include "../graph_common.hpp" +#include + +int main() { + queue Queue{}; + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + constexpr int LocalSizeA{16}; + constexpr int LocalSizeB{64}; + + int *PtrA = malloc_device(Size, Queue); + int *PtrB = malloc_device(Size, Queue); + + std::vector HostDataA(Size); + std::vector HostDataB(Size); + + exp_ext::dynamic_work_group_memory DynLocalMem(Graph, LocalSizeA); + + nd_range<1> NDrangeA{Size, LocalSizeA}; + auto CGFA = [&](handler &CGH) { + CGH.parallel_for(NDrangeA, [=](nd_item<1> Item) { + size_t GlobalID = Item.get_global_id(); + auto LocalRange = Item.get_local_range(0); + auto LocalMem = DynLocalMem.get(); + + LocalMem[Item.get_local_id()] = LocalRange; + group_barrier(Item.get_group()); + + for (size_t i{0}; i < LocalRange; ++i) { + PtrA[GlobalID] += LocalMem[i]; + } + }); + }; + + nd_range<1> NDrangeB{Size, LocalSizeB}; + auto CGFB = [&](handler &CGH) { + CGH.parallel_for(NDrangeB, [=](nd_item<1> Item) { + size_t GlobalID = Item.get_global_id(); + auto LocalRange = Item.get_local_range(0); + auto LocalMem = DynLocalMem.get(); + + LocalMem[Item.get_local_id()] = LocalRange; + group_barrier(Item.get_group()); + + for (size_t i{0}; i < LocalRange; ++i) { + PtrB[GlobalID] += LocalMem[i]; + } + }); + }; + + auto DynamicCG = exp_ext::dynamic_command_group(Graph, {CGFA, CGFB}); + auto DynamicCGNode = Graph.add(DynamicCG); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + auto ExecuteGraphAndVerifyResults = [&](bool A, bool B, const int LocalSize) { + Queue.memset(PtrA, 0, Size * sizeof(int)); + Queue.memset(PtrB, 0, Size * sizeof(int)); + Queue.wait(); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), Size); + Queue.copy(PtrB, HostDataB.data(), Size); + Queue.wait(); + + for (size_t i = 0; i < Size; i++) { + assert(HostDataA[i] == (A ? LocalSize * LocalSize : 0)); + assert(HostDataB[i] == (B ? LocalSize * LocalSize : 0)); + } + }; + ExecuteGraphAndVerifyResults(true, false, LocalSizeA); + + DynamicCG.set_active_index(1); + DynLocalMem.update(LocalSizeB); + ExecGraph.update(DynamicCGNode); + ExecuteGraphAndVerifyResults(false, true, LocalSizeB); + + sycl::free(PtrA, Queue); + sycl::free(PtrB, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_work_group_memory_basic.cpp b/sycl/test-e2e/Graph/Update/dyn_work_group_memory_basic.cpp new file mode 100644 index 0000000000000..a8e52dff397db --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_work_group_memory_basic.cpp @@ -0,0 +1,71 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// Tests updating dynamic_work_group_memory with a new size. + +#include "../graph_common.hpp" +#include + +int main() { + queue Queue{}; + constexpr int LocalSize{16}; + + using T = int; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(Size, Queue); + std::vector HostDataA(Size); + + exp_ext::dynamic_work_group_memory DynLocalMem{Graph, LocalSize}; + + Queue.memset(PtrA, 0, Size * sizeof(T)).wait(); + + nd_range<1> NDRange{range{Size}, range{LocalSize}}; + + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.parallel_for(NDRange, [=](nd_item<1> Item) { + size_t GlobalID = Item.get_global_id(); + auto LocalRange = Item.get_local_range(0); + auto LocalMem = DynLocalMem.get(); + + LocalMem[Item.get_local_id()] = LocalRange; + group_barrier(Item.get_group()); + + for (size_t i{0}; i < LocalRange; ++i) { + PtrA[GlobalID] += LocalMem[i]; + } + }); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(HostDataA[i] == LocalSize * LocalSize); + } + + Queue.memset(PtrA, 0, Size * sizeof(T)).wait(); + + constexpr int NewLocalSize{32}; + + DynLocalMem.update(NewLocalSize); + KernelNode.update_nd_range(nd_range<1>{range{Size}, range{NewLocalSize}}); + ExecGraph.update(KernelNode); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(HostDataA[i] == NewLocalSize * NewLocalSize); + } + + free(PtrA, Queue); + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_work_group_memory_multiple.cpp b/sycl/test-e2e/Graph/Update/dyn_work_group_memory_multiple.cpp new file mode 100644 index 0000000000000..8d478a961385a --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_work_group_memory_multiple.cpp @@ -0,0 +1,77 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// Tests using more than one dynamic_work_group_memory object in the same node. + +#include "../graph_common.hpp" +#include + +int main() { + queue Queue{}; + + constexpr int LocalSize{16}; + + using T = int; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(Size, Queue); + std::vector HostDataA(Size); + + exp_ext::dynamic_work_group_memory DynLocalMemA{Graph, LocalSize}; + exp_ext::dynamic_work_group_memory DynLocalMemB{Graph, LocalSize}; + + Queue.memset(PtrA, 0, Size * sizeof(T)).wait(); + + nd_range<1> NDRange{range{Size}, range{LocalSize}}; + + auto KernelNode = Graph.add([&](handler &cgh) { + cgh.parallel_for(NDRange, [=](nd_item<1> Item) { + size_t GlobalID = Item.get_global_id(); + auto LocalRange = Item.get_local_range(0); + + auto LocalMemA = DynLocalMemA.get(); + auto LocalMemB = DynLocalMemB.get(); + + LocalMemA[Item.get_local_id()] = LocalRange; + LocalMemB[Item.get_local_id()] = LocalRange; + group_barrier(Item.get_group()); + + for (size_t i{0}; i < LocalRange; ++i) { + PtrA[GlobalID] += (T)(LocalMemA[i] + LocalMemB[i]); + } + }); + }); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(HostDataA[i] == LocalSize * LocalSize * 2); + } + + Queue.memset(PtrA, 0, Size * sizeof(T)).wait(); + + constexpr int NewLocalSize{32}; + + DynLocalMemA.update(NewLocalSize); + DynLocalMemB.update(NewLocalSize); + KernelNode.update_nd_range(nd_range<1>{range{Size}, range{NewLocalSize}}); + ExecGraph.update(KernelNode); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), Size).wait(); + for (size_t i = 0; i < Size; i++) { + assert(HostDataA[i] == NewLocalSize * NewLocalSize * 2); + } + + free(PtrA, Queue); + return 0; +} diff --git a/sycl/test-e2e/Graph/Update/dyn_work_group_memory_multiple_nodes.cpp b/sycl/test-e2e/Graph/Update/dyn_work_group_memory_multiple_nodes.cpp new file mode 100644 index 0000000000000..b5955113e23e7 --- /dev/null +++ b/sycl/test-e2e/Graph/Update/dyn_work_group_memory_multiple_nodes.cpp @@ -0,0 +1,130 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// Extra run to check for immediate-command-list in Level Zero +// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// Tests using a dynamic_work_group_memory with multiple nodes. + +#include "../graph_common.hpp" +#include + +int main() { + queue Queue{}; + + constexpr int LocalSize{16}; + + using T = int; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + + int *PtrA = malloc_device(Size * Size, Queue); + std::vector HostDataA(Size * Size); + + exp_ext::dynamic_work_group_memory DynLocalMemA{Graph, + LocalSize}; + exp_ext::dynamic_work_group_memory DynLocalMemC{Graph, LocalSize}; + + Queue.memset(PtrA, 0, Size * Size * sizeof(T)).wait(); + + nd_range<2> NDRange2D{range<2>{Size, Size}, range<2>{LocalSize, LocalSize}}; + + auto KernelNodeA = Graph.add([&](handler &cgh) { + cgh.parallel_for(NDRange2D, [=](nd_item<2> Item) { + size_t GlobalID = Item.get_global_linear_id(); + auto LocalRange = Item.get_local_range(0); + const auto i = Item.get_local_id()[0]; + const auto j = Item.get_local_id()[1]; + + auto LocalMemA = DynLocalMemA.get(); + + LocalMemA[i][j] = LocalRange; + group_barrier(Item.get_group()); + + for (size_t k{0}; k < LocalRange; ++k) { + for (size_t z{0}; z < LocalRange; ++z) { + PtrA[GlobalID] += (T)(LocalMemA[k][z]); + } + } + }); + }); + + auto KernelNodeB = Graph.add( + [&](handler &cgh) { + cgh.parallel_for(NDRange2D, [=](nd_item<2> Item) { + size_t GlobalID = Item.get_global_linear_id(); + auto LocalRange = Item.get_local_range(0); + const auto i = Item.get_local_id()[0]; + const auto j = Item.get_local_id()[1]; + + auto LocalMemA = DynLocalMemA.get(); + + LocalMemA[i][j] = LocalRange; + group_barrier(Item.get_group()); + + // Substracting what was added in NodeA gives 0. + for (size_t k{0}; k < LocalRange; ++k) { + for (size_t z{0}; z < LocalRange; ++z) { + PtrA[GlobalID] -= (T)(LocalMemA[k][z]); + } + } + }); + }, + exp_ext::property::node::depends_on{KernelNodeA}); + + nd_range<1> NDRange{Size * Size, LocalSize}; + auto KernelNodeC = Graph.add( + [&](handler &cgh) { + cgh.parallel_for(NDRange, [=](nd_item<1> Item) { + size_t GlobalID = Item.get_global_id(); + auto LocalRange = Item.get_local_range(0); + + auto LocalMemC = DynLocalMemC.get(); + + LocalMemC[Item.get_local_id()] = LocalRange; + group_barrier(Item.get_group()); + + for (size_t i{0}; i < LocalRange; ++i) { + PtrA[GlobalID] += (T)(LocalMemC[i]); + } + }); + }, + exp_ext::property::node::depends_on{KernelNodeB}); + + auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{}); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), Size * Size).wait(); + for (size_t i = 0; i < Size * Size; i++) { + assert(HostDataA[i] == LocalSize * LocalSize); + } + + Queue.memset(PtrA, 0, Size * Size * sizeof(T)).wait(); + + constexpr int NewLocalSize{32}; + + DynLocalMemA.update(NewLocalSize); + DynLocalMemC.update(NewLocalSize); + + KernelNodeA.update_nd_range( + nd_range<2>{range<2>{Size, Size}, range<2>{NewLocalSize, NewLocalSize}}); + KernelNodeB.update_nd_range( + nd_range<2>{range<2>{Size, Size}, range<2>{NewLocalSize, NewLocalSize}}); + KernelNodeC.update_nd_range(nd_range<1>{Size * Size, NewLocalSize}); + + ExecGraph.update(KernelNodeA); + ExecGraph.update(KernelNodeB); + ExecGraph.update(KernelNodeC); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + + Queue.copy(PtrA, HostDataA.data(), Size * Size).wait(); + for (size_t i = 0; i < Size * Size; i++) { + assert(HostDataA[i] == NewLocalSize * NewLocalSize); + } + + free(PtrA, Queue); + return 0; +} diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index d617ead2079d0..bbbe31d702d51 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3077,7 +3077,10 @@ _ZN4sycl3_V13ext6oneapi12experimental6detail17build_from_sourceERNS0_13kernel_bu _ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_base11updateValueEPKNS3_14raw_kernel_argEm _ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_base11updateValueEPKvm _ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_base14updateAccessorEPKNS0_6detail16AccessorBaseHostE +_ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_base18updateWorkGroupMemEm +_ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_baseC1ENS3_13command_graphILNS3_11graph_stateE0EEE _ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_baseC1ENS3_13command_graphILNS3_11graph_stateE0EEEmPKv +_ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_baseC2ENS3_13command_graphILNS3_11graph_stateE0EEE _ZN4sycl3_V13ext6oneapi12experimental6detail22dynamic_parameter_baseC2ENS3_13command_graphILNS3_11graph_stateE0EEEmPKv _ZN4sycl3_V13ext6oneapi12experimental6detail24executable_command_graph12finalizeImplEv _ZN4sycl3_V13ext6oneapi12experimental6detail24executable_command_graph6updateERKNS3_13command_graphILNS3_11graph_stateE0EEE @@ -3506,6 +3509,7 @@ _ZN4sycl3_V17handler11saveCodeLocENS0_6detail13code_locationE _ZN4sycl3_V17handler11saveCodeLocENS0_6detail13code_locationEb _ZN4sycl3_V17handler11storeRawArgEPKvm _ZN4sycl3_V17handler12addReductionERKSt10shared_ptrIKvE +_ZN4sycl3_V17handler12setArgHelperEiRNS0_3ext6oneapi12experimental6detail30dynamic_work_group_memory_baseE _ZN4sycl3_V17handler12setArgHelperEiRNS0_6detail22work_group_memory_implE _ZN4sycl3_V17handler13getKernelNameEv _ZN4sycl3_V17handler14addAccessorReqESt10shared_ptrINS0_6detail16AccessorImplHostEE diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index cb6bebb417c85..c1706c78ae08c 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -505,10 +505,14 @@ ??1exception@_V1@sycl@@UEAA@XZ ??1exception_list@_V1@sycl@@QEAA@XZ ??1executable_command_graph@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@XZ +??0dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@V?$command_graph@$0A@@23456@@Z +?updateWorkGroupMem@dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@IEAAX_K@Z ??1filter_selector@ONEAPI@_V1@sycl@@UEAA@XZ ??1filter_selector@oneapi@ext@_V1@sycl@@UEAA@XZ ??1fusion_wrapper@experimental@codeplay@ext@_V1@sycl@@QEAA@XZ ??1gpu_selector@_V1@sycl@@UEAA@XZ +??0dynamic_parameter_base@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@XZ +?setArgHelper@handler@_V1@sycl@@AEAAXHAEAVdynamic_work_group_memory_base@detail@experimental@oneapi@ext@23@@Z ??1handler@_V1@sycl@@AEAA@XZ ??1image_mem@experimental@oneapi@ext@_V1@sycl@@QEAA@XZ ??1image_mem_impl@detail@experimental@oneapi@ext@_V1@sycl@@QEAA@XZ diff --git a/sycl/test/include_deps/sycl_detail_core.hpp.cpp b/sycl/test/include_deps/sycl_detail_core.hpp.cpp index f0573a71d825d..b76826f8c318f 100644 --- a/sycl/test/include_deps/sycl_detail_core.hpp.cpp +++ b/sycl/test/include_deps/sycl_detail_core.hpp.cpp @@ -118,6 +118,7 @@ // CHECK-NEXT: ext/oneapi/experimental/detail/properties/graph_properties.hpp // CHECK-NEXT: ext/oneapi/experimental/detail/properties/graph_properties.def // CHECK-NEXT: ext/oneapi/experimental/detail/properties/node_properties.def +// CHECK-NEXT: ext/oneapi/experimental/work_group_memory.hpp // CHECK-NEXT: handler.hpp // CHECK-NEXT: detail/cl.h // CHECK-NEXT: CL/cl.h diff --git a/sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp b/sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp index 198c6a09fcf1e..453e1beb72adf 100644 --- a/sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp +++ b/sycl/unittests/Extensions/CommandGraph/CommandGraph.cpp @@ -674,3 +674,18 @@ TEST_F(CommandGraphTest, TransitiveRecordingShortcuts) { ASSERT_EQ(Q3.ext_oneapi_get_state(), ext::oneapi::experimental::queue_state::executing); } + +// Tests that dynamic_work_group_memory.get() will throw on the host side. +TEST_F(CommandGraphTest, DynamicWorkGroupMemoryGet) { + device Dev; + context Ctx{{Dev}}; + queue Queue{Ctx, Dev}; + constexpr int LocalSize{32}; + + ext::oneapi::experimental::command_graph Graph{Queue.get_context(), + Queue.get_device()}; + + ext::oneapi::experimental::dynamic_work_group_memory DynLocalMem{ + Graph, LocalSize}; + ASSERT_ANY_THROW(DynLocalMem.get()); +}