Skip to content

Commit 4e36825

Browse files
authored
[SYCL] Record aspect names when computing device requirements (#13974)
After #13486, aspect name information is visible in `sycl-post-link` without the use of `!sycl_aspects`, so this PR updates `sycl-post-link` to use the aspect names that are now available within the `!sycl_used_aspects` metadata instead of `!sycl_aspects`. Aditionally, this PR also adds E2E related to optional kernel features for AOT enabled by these changes
1 parent 6d591f1 commit 4e36825

File tree

6 files changed

+143
-136
lines changed

6 files changed

+143
-136
lines changed

llvm/include/llvm/SYCLLowerIR/SYCLDeviceRequirements.h

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,19 @@ class PropertyValue;
3030
}
3131

3232
struct SYCLDeviceRequirements {
33-
std::set<uint32_t> Aspects;
33+
struct AspectNameValuePair {
34+
llvm::SmallString<64> Name;
35+
uint32_t Value;
36+
AspectNameValuePair(StringRef Name, uint32_t Value)
37+
: Name(Name), Value(Value) {}
38+
bool operator<(const AspectNameValuePair &rhs) const {
39+
return Value < rhs.Value;
40+
}
41+
bool operator==(const AspectNameValuePair &rhs) const {
42+
return Value == rhs.Value;
43+
}
44+
};
45+
std::set<AspectNameValuePair> Aspects;
3446
std::set<uint32_t> FixedTarget;
3547
std::optional<llvm::SmallVector<uint64_t, 3>> ReqdWorkGroupSize;
3648
std::optional<uint32_t> WorkGroupNumDim;

llvm/lib/SYCLLowerIR/SYCLDeviceRequirements.cpp

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -43,19 +43,20 @@ llvm::computeDeviceRequirements(const module_split::ModuleDesc &MD) {
4343
// Process all functions in the module
4444
for (const Function &F : MD.getModule()) {
4545
if (auto *MDN = F.getMetadata("sycl_used_aspects")) {
46-
for (auto &MDOp : MDN->operands()) {
47-
int64_t Val;
48-
if (auto Pair = dyn_cast<MDNode>(MDOp)) {
46+
for (size_t I = 0, E = MDN->getNumOperands(); I < E; ++I) {
47+
StringRef AspectName = "";
48+
int64_t AspectValue;
49+
if (auto Pair = dyn_cast<MDNode>(MDN->getOperand(I))) {
4950
assert(Pair->getNumOperands() == 2);
50-
Val = mdconst::extract<ConstantInt>(Pair->getOperand(1))
51-
->getZExtValue();
51+
AspectName = ExtractStringFromMDNodeOperand(Pair, 0);
52+
AspectValue = ExtractSignedIntegerFromMDNodeOperand(Pair, 1);
5253
} else {
53-
Val = mdconst::extract<ConstantInt>(MDOp)->getZExtValue();
54+
AspectValue = ExtractSignedIntegerFromMDNodeOperand(MDN, I);
5455
}
5556
// Don't put internal aspects (with negative integer value) into the
5657
// requirements, they are used only for device image splitting.
57-
if (Val >= 0)
58-
Reqs.Aspects.insert(Val);
58+
if (AspectValue >= 0)
59+
Reqs.Aspects.insert({AspectName, uint32_t(AspectValue)});
5960
}
6061
}
6162

@@ -133,8 +134,11 @@ std::map<StringRef, util::PropertyValue> SYCLDeviceRequirements::asMap() const {
133134
// For all properties except for "aspects", we'll only add the
134135
// value to the map if the corresponding value from
135136
// SYCLDeviceRequirements has a value/is non-empty.
136-
Requirements["aspects"] =
137-
std::vector<uint32_t>(Aspects.begin(), Aspects.end());
137+
std::vector<uint32_t> AspectValues;
138+
AspectValues.reserve(Aspects.size());
139+
for (auto Aspect : Aspects)
140+
AspectValues.push_back(Aspect.Value);
141+
Requirements["aspects"] = std::move(AspectValues);
138142

139143
if (!FixedTarget.empty())
140144
Requirements["fixed_target"] =

llvm/test/tools/sycl-post-link/multiple-filtered-outputs.ll

Lines changed: 11 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -65,136 +65,56 @@
6565
target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
6666
target triple = "spir64-unknown-unknown"
6767

68-
; Function Attrs: mustprogress norecurse nounwind
69-
define weak_odr dso_local spir_kernel void @double_kernel(ptr addrspace(1) noundef align 8 %_arg_out) local_unnamed_addr #0 !srcloc !65 !kernel_arg_buffer_location !66 !sycl_used_aspects !67 !sycl_fixed_targets !68 !sycl_kernel_omit_args !69 {
68+
define spir_kernel void @double_kernel(ptr addrspace(1) noundef align 8 %_arg_out) #0 !sycl_used_aspects !67 {
7069
entry:
71-
%0 = load double, ptr addrspace(1) %_arg_out, align 8, !tbaa !70
70+
%0 = load double, ptr addrspace(1) %_arg_out, align 8
7271
%mul.i = fmul double %0, 2.000000e-01
73-
store double %mul.i, ptr addrspace(1) %_arg_out, align 8, !tbaa !70
72+
store double %mul.i, ptr addrspace(1) %_arg_out, align 8
7473
ret void
7574
}
7675

77-
; Function Attrs: mustprogress norecurse nounwind
78-
define weak_odr dso_local spir_kernel void @float_kernel(ptr addrspace(1) noundef align 4 %_arg_out) local_unnamed_addr #0 !srcloc !74 !kernel_arg_buffer_location !66 !sycl_fixed_targets !68 !sycl_kernel_omit_args !69 {
76+
define spir_kernel void @float_kernel(ptr addrspace(1) noundef align 4 %_arg_out) #0 {
7977
entry:
80-
%0 = load float, ptr addrspace(1) %_arg_out, align 4, !tbaa !75
78+
%0 = load float, ptr addrspace(1) %_arg_out, align 4
8179
%mul.i = fmul float %0, 0x3FC99999A0000000
82-
store float %mul.i, ptr addrspace(1) %_arg_out, align 4, !tbaa !75
80+
store float %mul.i, ptr addrspace(1) %_arg_out, align 4
8381
ret void
8482
}
8583

86-
; Function Attrs: mustprogress norecurse nounwind
87-
define weak_odr dso_local spir_kernel void @reqd_sub_group_size_kernel_8() local_unnamed_addr #0 !srcloc !77 !kernel_arg_buffer_location !68 !intel_reqd_sub_group_size !78 !sycl_fixed_targets !68 !sycl_kernel_omit_args !68 {
84+
define spir_kernel void @reqd_sub_group_size_kernel_8() #0 !intel_reqd_sub_group_size !78 {
8885
entry:
8986
ret void
9087
}
9188

92-
; Function Attrs: mustprogress norecurse nounwind
93-
define weak_odr dso_local spir_kernel void @reqd_sub_group_size_kernel_16() local_unnamed_addr #0 !srcloc !77 !kernel_arg_buffer_location !68 !intel_reqd_sub_group_size !79 !sycl_fixed_targets !68 !sycl_kernel_omit_args !68 {
89+
define spir_kernel void @reqd_sub_group_size_kernel_16() #0 !intel_reqd_sub_group_size !79 {
9490
entry:
9591
ret void
9692
}
9793

98-
; Function Attrs: mustprogress norecurse nounwind
99-
define weak_odr dso_local spir_kernel void @reqd_sub_group_size_kernel_32() local_unnamed_addr #0 !srcloc !77 !kernel_arg_buffer_location !68 !intel_reqd_sub_group_size !80 !sycl_fixed_targets !68 !sycl_kernel_omit_args !68 {
94+
define spir_kernel void @reqd_sub_group_size_kernel_32() #0 !intel_reqd_sub_group_size !80 {
10095
entry:
10196
ret void
10297
}
10398

104-
; Function Attrs: mustprogress norecurse nounwind
105-
define weak_odr dso_local spir_kernel void @reqd_sub_group_size_kernel_64() local_unnamed_addr #0 !srcloc !77 !kernel_arg_buffer_location !68 !intel_reqd_sub_group_size !81 !sycl_fixed_targets !68 !sycl_kernel_omit_args !68 {
99+
define spir_kernel void @reqd_sub_group_size_kernel_64() #0 !intel_reqd_sub_group_size !81 {
106100
entry:
107101
ret void
108102
}
109103

110-
declare dso_local spir_func i32 @_Z18__spirv_ocl_printfPU3AS2Kcz(ptr addrspace(2), ...)
111-
112104
attributes #0 = { mustprogress norecurse nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "sycl-module-id"="double.cpp" "sycl-optlevel"="3" "uniform-work-group-size"="true" }
113105

114106
!llvm.module.flags = !{!0, !1}
115107
!opencl.spir.version = !{!2}
116108
!spirv.Source = !{!3}
117-
!sycl_aspects = !{!4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63}
118109
!llvm.ident = !{!64}
119110

120111
!0 = !{i32 1, !"wchar_size", i32 4}
121112
!1 = !{i32 7, !"frame-pointer", i32 2}
122113
!2 = !{i32 1, i32 2}
123114
!3 = !{i32 4, i32 100000}
124-
!4 = !{!"cpu", i32 1}
125-
!5 = !{!"gpu", i32 2}
126-
!6 = !{!"accelerator", i32 3}
127-
!7 = !{!"custom", i32 4}
128-
!8 = !{!"fp16", i32 5}
129115
!9 = !{!"fp64", i32 6}
130-
!10 = !{!"image", i32 9}
131-
!11 = !{!"online_compiler", i32 10}
132-
!12 = !{!"online_linker", i32 11}
133-
!13 = !{!"queue_profiling", i32 12}
134-
!14 = !{!"usm_device_allocations", i32 13}
135-
!15 = !{!"usm_host_allocations", i32 14}
136-
!16 = !{!"usm_shared_allocations", i32 15}
137-
!17 = !{!"usm_system_allocations", i32 17}
138-
!18 = !{!"ext_intel_pci_address", i32 18}
139-
!19 = !{!"ext_intel_gpu_eu_count", i32 19}
140-
!20 = !{!"ext_intel_gpu_eu_simd_width", i32 20}
141-
!21 = !{!"ext_intel_gpu_slices", i32 21}
142-
!22 = !{!"ext_intel_gpu_subslices_per_slice", i32 22}
143-
!23 = !{!"ext_intel_gpu_eu_count_per_subslice", i32 23}
144-
!24 = !{!"ext_intel_max_mem_bandwidth", i32 24}
145-
!25 = !{!"ext_intel_mem_channel", i32 25}
146-
!26 = !{!"usm_atomic_host_allocations", i32 26}
147-
!27 = !{!"usm_atomic_shared_allocations", i32 27}
148-
!28 = !{!"atomic64", i32 28}
149-
!29 = !{!"ext_intel_device_info_uuid", i32 29}
150-
!30 = !{!"ext_oneapi_srgb", i32 30}
151-
!31 = !{!"ext_oneapi_native_assert", i32 31}
152-
!32 = !{!"host_debuggable", i32 32}
153-
!33 = !{!"ext_intel_gpu_hw_threads_per_eu", i32 33}
154-
!34 = !{!"ext_oneapi_cuda_async_barrier", i32 34}
155-
!35 = !{!"ext_oneapi_bfloat16_math_functions", i32 35}
156-
!36 = !{!"ext_intel_free_memory", i32 36}
157-
!37 = !{!"ext_intel_device_id", i32 37}
158-
!38 = !{!"ext_intel_memory_clock_rate", i32 38}
159-
!39 = !{!"ext_intel_memory_bus_width", i32 39}
160-
!40 = !{!"emulated", i32 40}
161-
!41 = !{!"ext_intel_legacy_image", i32 41}
162-
!42 = !{!"ext_oneapi_bindless_images", i32 42}
163-
!43 = !{!"ext_oneapi_bindless_images_shared_usm", i32 43}
164-
!44 = !{!"ext_oneapi_bindless_images_1d_usm", i32 44}
165-
!45 = !{!"ext_oneapi_bindless_images_2d_usm", i32 45}
166-
!46 = !{!"ext_oneapi_interop_memory_import", i32 46}
167-
!47 = !{!"ext_oneapi_interop_memory_export", i32 47}
168-
!48 = !{!"ext_oneapi_interop_semaphore_import", i32 48}
169-
!49 = !{!"ext_oneapi_interop_semaphore_export", i32 49}
170-
!50 = !{!"ext_oneapi_mipmap", i32 50}
171-
!51 = !{!"ext_oneapi_mipmap_anisotropy", i32 51}
172-
!52 = !{!"ext_oneapi_mipmap_level_reference", i32 52}
173-
!53 = !{!"ext_intel_esimd", i32 53}
174-
!54 = !{!"ext_oneapi_ballot_group", i32 54}
175-
!55 = !{!"ext_oneapi_fixed_size_group", i32 55}
176-
!56 = !{!"ext_oneapi_opportunistic_group", i32 56}
177-
!57 = !{!"ext_oneapi_tangle_group", i32 57}
178-
!58 = !{!"ext_intel_matrix", i32 58}
179-
!59 = !{!"int64_base_atomics", i32 7}
180-
!60 = !{!"int64_extended_atomics", i32 8}
181-
!61 = !{!"usm_system_allocator", i32 17}
182-
!62 = !{!"usm_restricted_shared_allocations", i32 16}
183-
!63 = !{!"host", i32 0}
184116
!64 = !{!"clang version 19.0.0git (/ws/llvm/clang a7f3a637bdd6299831f903bbed9e8d069fea5c86)"}
185-
!65 = !{i32 233}
186-
!66 = !{i32 -1}
187-
!67 = !{i32 6}
188-
!68 = !{}
189-
!69 = !{i1 false}
190-
!70 = !{!71, !71, i64 0}
191-
!71 = !{!"double", !72, i64 0}
192-
!72 = !{!"omnipotent char", !73, i64 0}
193-
!73 = !{!"Simple C++ TBAA"}
194-
!74 = !{i32 364}
195-
!75 = !{!76, !76, i64 0}
196-
!76 = !{!"float", !72, i64 0}
197-
!77 = !{i32 529}
117+
!67 = !{!9}
198118
!78 = !{i32 8}
199119
!79 = !{i32 16}
200120
!80 = !{i32 32}

llvm/tools/sycl-post-link/sycl-post-link.cpp

Lines changed: 5 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1014,41 +1014,12 @@ bool isTargetCompatibleWithModule(const std::optional<std::string> &Target,
10141014
DeviceConfigFile::TargetTable[*Target];
10151015
const SYCLDeviceRequirements &ModuleReqs =
10161016
IrMD.getOrComputeDeviceRequirements();
1017-
// The device config file data stores the target's supported
1018-
// aspects as a vector of the strings, so we need to translate
1019-
// the values to a common format.
1020-
const NamedMDNode *Node = IrMD.getModule().getNamedMetadata("sycl_aspects");
1021-
if (Node) {
1022-
SmallMapVector<StringRef, int, 32> AspectNameToValue;
1023-
for (const MDNode *N : Node->operands()) {
1024-
assert(N->getNumOperands() == 2 &&
1025-
"Each operand of sycl_aspects must be a pair.");
1026-
1027-
// The aspect's name is the first operand.
1028-
const auto *AspectName = cast<MDString>(N->getOperand(0));
1029-
1030-
// The aspect's integral value is the second operand.
1031-
const auto *AspectCAM = cast<ConstantAsMetadata>(N->getOperand(1));
1032-
const Constant *AspectC = AspectCAM->getValue();
1033-
1034-
AspectNameToValue[AspectName->getString()] =
1035-
cast<ConstantInt>(AspectC)->getSExtValue();
1036-
}
1037-
1038-
// Make the set of aspects values the target supports.
1039-
SmallSet<int64_t, 32> TargetAspectValueSet;
1040-
for (const auto &Aspect : TargetInfo.aspects) {
1041-
auto It = AspectNameToValue.find(Aspect);
1042-
assert(It != AspectNameToValue.end() && "Aspect value mapping unknown!");
1043-
TargetAspectValueSet.insert(It->second);
1044-
}
10451017

1046-
// Now check to see if all the requirements of the input module
1047-
// are compatbile with the target.
1048-
for (const auto &Aspect : ModuleReqs.Aspects) {
1049-
if (!TargetAspectValueSet.contains(Aspect))
1050-
return false;
1051-
}
1018+
// Check to see if all the requirements of the input module
1019+
// are compatbile with the target.
1020+
for (const auto &Aspect : ModuleReqs.Aspects) {
1021+
if (!is_contained(TargetInfo.aspects, Aspect.Name))
1022+
return false;
10521023
}
10531024

10541025
// Check if module sub group size is compatible with the target.

sycl/test-e2e/AOT/double.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
// This test ensures that a program that has a kernel
2+
// using fp64 can be compiled AOT.
3+
4+
// REQUIRES: ocloc
5+
// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_tgllp -o %t.tgllp.out %s
6+
// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_pvc -o %t.pvc.out %s
7+
// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_cfl -o %t.cfl.out %s
8+
9+
#include <sycl/detail/core.hpp>
10+
11+
using namespace sycl;
12+
13+
int main() {
14+
queue q;
15+
if (q.get_device().has(aspect::fp64)) {
16+
double d = 2.5;
17+
{
18+
buffer<double, 1> buf(&d, 1);
19+
q.submit([&](handler &cgh) {
20+
accessor acc{buf, cgh};
21+
cgh.single_task([=] { acc[0] *= 2; });
22+
});
23+
}
24+
std::cout << d << "\n";
25+
}
26+
}

sycl/test-e2e/AOT/reqd-sg-size.cpp

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
// This test ensures that a program that has a kernel
2+
// using various required sub-group sizes can be compiled AOT.
3+
4+
// REQUIRES: ocloc
5+
// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_tgllp -o %t.tgllp.out %s
6+
// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_pvc -o %t.pvc.out %s
7+
// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_cfl -o %t.cfl.out %s
8+
9+
#include <cstdio>
10+
#include <iostream>
11+
12+
#include <sycl/detail/core.hpp>
13+
14+
using namespace sycl;
15+
16+
template <int N> class kernel_name;
17+
18+
template <size_t... Ns> struct SubgroupDispatcher {
19+
std::vector<std::pair<size_t, size_t>> fails;
20+
SubgroupDispatcher(queue &q) : q(q) {}
21+
22+
void operator()(const std::vector<size_t> &v) {
23+
for (auto i : v)
24+
(*this)(i);
25+
}
26+
27+
void operator()(size_t n) { (dispatch<Ns>(n), ...); }
28+
29+
private:
30+
queue &q;
31+
32+
template <size_t size> void dispatch(size_t n) {
33+
if (n == size) {
34+
size_t res = 0;
35+
{
36+
buffer<size_t, 1> buf(&res, 1);
37+
q.submit([&](handler &cgh) {
38+
accessor acc{buf, cgh};
39+
cgh.parallel_for<kernel_name<size>>(
40+
nd_range<1>(1, 1),
41+
[=](auto item) [[intel::reqd_sub_group_size(size)]] {
42+
acc[0] = item.get_sub_group().get_max_local_range()[0];
43+
});
44+
});
45+
}
46+
if (res != size)
47+
fails.push_back({res, size});
48+
}
49+
}
50+
};
51+
52+
int main() {
53+
queue q;
54+
auto ctx = q.get_context();
55+
auto dev = q.get_device();
56+
auto sizes = dev.get_info<sycl::info::device::sub_group_sizes>();
57+
std::cout << " sub-group sizes supported by the device: " << sizes[0];
58+
for (int i = 1; i < sizes.size(); ++i) {
59+
std::cout << ", " << sizes[i];
60+
}
61+
std::cout << '\n';
62+
63+
using dispatcher_t = SubgroupDispatcher<4, 8, 16, 32, 64, 128>;
64+
dispatcher_t dispatcher(q);
65+
dispatcher(sizes);
66+
if (dispatcher.fails.size() > 0) {
67+
for (auto [actual, expected] : dispatcher.fails) {
68+
std::cout << "actual: " << actual << "\n"
69+
<< "expected: " << expected << "\n";
70+
}
71+
} else {
72+
std::cout << "pass\n";
73+
}
74+
}

0 commit comments

Comments
 (0)