Skip to content

Commit 85a3861

Browse files
authored
[SYCL][UR][OpenCL] allow passing events from multiple contexts to urEventWait (#18711)
SYCL allows creating cross-context dependencies. It automatically handles cases where contexts are from different adapters. However, events from a single adapter are always passed directly to urEventWait (even if they are from different contexts). This causes problems with opencl CPU (it returns CL_INVALID_CONTEXT from clWaitForEvents). In general, OpenCL spec does not guarantee that passing events from different contexts works for any function. The same problem exists for all enqueue functions but in practice SYCL does not mix events in the waitList - it will always use urEventWait for synchronization.
1 parent b12d280 commit 85a3861

File tree

3 files changed

+234
-31
lines changed

3 files changed

+234
-31
lines changed
Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
// RUN: %{build} -o %t.out
2+
// RUN: %{run} %t.out
3+
4+
#include <sycl/detail/core.hpp>
5+
#include <sycl/usm.hpp>
6+
7+
#include <iostream>
8+
#include <vector>
9+
10+
std::vector<sycl::event> submit_dependencies(sycl::queue q1, sycl::queue q2,
11+
int *mem1, int *mem2) {
12+
int delay_ops = 1024 * 1024;
13+
auto delay = [=] {
14+
volatile int value = delay_ops;
15+
while (--value)
16+
;
17+
};
18+
19+
auto ev1 =
20+
q1.parallel_for(sycl::range<1>(1024), [=]([[maybe_unused]] auto u) {
21+
delay();
22+
mem1[u.get_id()] = 1;
23+
});
24+
auto ev2 =
25+
q2.parallel_for(sycl::range<1>(1024), [=]([[maybe_unused]] auto u) {
26+
delay();
27+
mem2[u.get_id()] = 2;
28+
});
29+
30+
return {ev1, ev2};
31+
}
32+
33+
void test_host_task() {
34+
sycl::context c1{};
35+
sycl::context c2{};
36+
37+
sycl::queue q1(c1, sycl::default_selector_v);
38+
sycl::queue q2(c2, sycl::default_selector_v);
39+
40+
auto mem1 = sycl::malloc_host<int>(1024, q1);
41+
auto mem2 = sycl::malloc_host<int>(1024, q2);
42+
43+
auto events = submit_dependencies(q1, q2, mem1, mem2);
44+
45+
q2.submit([&](sycl::handler &cgh) {
46+
cgh.depends_on(events[0]);
47+
cgh.depends_on(events[1]);
48+
cgh.host_task([=]() {
49+
for (int i = 0; i < 1024; i++) {
50+
assert(mem1[i] == 1);
51+
assert(mem2[i] == 2);
52+
}
53+
});
54+
});
55+
56+
q2.wait();
57+
58+
sycl::free(mem1, c1);
59+
sycl::free(mem2, c2);
60+
}
61+
62+
void test_kernel() {
63+
sycl::context c1{};
64+
sycl::context c2{};
65+
66+
sycl::queue q1(c1, sycl::default_selector_v);
67+
sycl::queue q2(c2, sycl::default_selector_v);
68+
69+
auto mem1 = sycl::malloc_device<int>(1024, q1);
70+
auto mem2 = sycl::malloc_device<int>(1024, q2);
71+
72+
auto events = submit_dependencies(q1, q2, mem1, mem2);
73+
74+
q1.submit([&](sycl::handler &cgh) {
75+
cgh.depends_on(events[0]);
76+
cgh.depends_on(events[1]);
77+
cgh.parallel_for(sycl::range<1>(1024),
78+
[=](auto item) { assert(mem1[item.get_id()] == 1); });
79+
});
80+
81+
q2.submit([&](sycl::handler &cgh) {
82+
cgh.depends_on(events[0]);
83+
cgh.depends_on(events[1]);
84+
cgh.parallel_for(sycl::range<1>(1024),
85+
[=](auto item) { assert(mem2[item.get_id()] == 2); });
86+
});
87+
88+
q1.wait();
89+
q2.wait();
90+
91+
sycl::free(mem1, c1);
92+
sycl::free(mem2, c2);
93+
}
94+
95+
int main() {
96+
test_host_task();
97+
test_kernel();
98+
99+
return 0;
100+
}

unified-runtime/source/adapters/opencl/event.cpp

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -149,12 +149,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRetain(ur_event_handle_t hEvent) {
149149

150150
UR_APIEXPORT ur_result_t UR_APICALL
151151
urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) {
152-
std::vector<cl_event> CLEvents(numEvents);
152+
ur_context_handle_t hContext = phEventWaitList[0]->Context;
153+
std::vector<cl_event> CLEvents;
154+
CLEvents.reserve(numEvents);
155+
156+
// clWaitForEvents can only be called on events from the same context.
157+
// If the events are from different contexts, we need to wait for each
158+
// set of events separately.
153159
for (uint32_t i = 0; i < numEvents; i++) {
154-
CLEvents[i] = phEventWaitList[i]->CLEvent;
160+
if (phEventWaitList[i]->Context != hContext) {
161+
CL_RETURN_ON_FAILURE(clWaitForEvents(CLEvents.size(), CLEvents.data()));
162+
CLEvents.clear();
163+
}
164+
165+
CLEvents.push_back(phEventWaitList[i]->CLEvent);
166+
hContext = phEventWaitList[i]->Context;
167+
}
168+
if (CLEvents.size()) {
169+
CL_RETURN_ON_FAILURE(clWaitForEvents(CLEvents.size(), CLEvents.data()));
155170
}
156-
cl_int RetErr = clWaitForEvents(numEvents, CLEvents.data());
157-
CL_RETURN_ON_FAILURE(RetErr);
158171
return UR_RESULT_SUCCESS;
159172
}
160173

unified-runtime/test/conformance/event/urEventWait.cpp

Lines changed: 117 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -7,56 +7,78 @@
77
#include <uur/fixtures.h>
88
#include <uur/known_failure.h>
99

10-
struct urEventWaitTest : uur::urQueueTest {
10+
struct urEventWaitTest : uur::urDeviceTest {
1111
void SetUp() override {
12-
UUR_RETURN_ON_FATAL_FAILURE(urQueueTest::SetUp());
13-
ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_ONLY, size,
14-
nullptr, &src_buffer));
15-
ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_WRITE_ONLY, size,
16-
nullptr, &dst_buffer));
17-
input.assign(count, 42);
18-
ASSERT_SUCCESS(urEnqueueMemBufferWrite(queue, src_buffer, false, 0, size,
19-
input.data(), 0, nullptr, &event));
20-
ASSERT_SUCCESS(urEventWait(1, &event));
12+
UUR_RETURN_ON_FATAL_FAILURE(urDeviceTest::SetUp());
13+
14+
for (size_t i = 0; i < maxNumContexts; ++i) {
15+
ur_context_handle_t context = nullptr;
16+
ASSERT_SUCCESS(urContextCreate(1, &device, nullptr, &context));
17+
ASSERT_NE(context, nullptr);
18+
contexts.push_back(context);
19+
20+
ur_queue_handle_t queue = nullptr;
21+
ASSERT_SUCCESS(urQueueCreate(context, device, 0, &queue));
22+
ASSERT_NE(queue, nullptr);
23+
queues.push_back(queue);
24+
25+
src_buffer.emplace_back();
26+
dst_buffer.emplace_back();
27+
28+
ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, size,
29+
nullptr, &src_buffer[i]));
30+
ASSERT_SUCCESS(urMemBufferCreate(context, UR_MEM_FLAG_READ_WRITE, size,
31+
nullptr, &dst_buffer[i]));
32+
input.emplace_back();
33+
input[i].assign(count, uint32_t(99 + i));
34+
ASSERT_SUCCESS(urEnqueueMemBufferWrite(queue, src_buffer[i], true, 0,
35+
size, input[i].data(), 0, nullptr,
36+
nullptr));
37+
}
2138
}
2239

2340
void TearDown() override {
24-
if (src_buffer) {
25-
EXPECT_SUCCESS(urMemRelease(src_buffer));
41+
for (size_t i = 0; i < src_buffer.size(); ++i) {
42+
EXPECT_SUCCESS(urMemRelease(src_buffer[i]));
43+
EXPECT_SUCCESS(urMemRelease(dst_buffer[i]));
2644
}
27-
if (dst_buffer) {
28-
EXPECT_SUCCESS(urMemRelease(dst_buffer));
45+
for (size_t i = 0; i < queues.size(); ++i) {
46+
EXPECT_SUCCESS(urQueueRelease(queues[i]));
2947
}
30-
if (event) {
31-
EXPECT_SUCCESS(urEventRelease(event));
48+
for (size_t i = 0; i < contexts.size(); ++i) {
49+
EXPECT_SUCCESS(urContextRelease(contexts[i]));
3250
}
33-
urQueueTest::TearDown();
51+
UUR_RETURN_ON_FATAL_FAILURE(urDeviceTest::TearDown());
3452
}
3553

54+
const size_t maxNumContexts = 5;
55+
std::vector<ur_context_handle_t> contexts;
56+
std::vector<ur_queue_handle_t> queues;
57+
std::vector<ur_mem_handle_t> src_buffer;
58+
std::vector<ur_mem_handle_t> dst_buffer;
3659
const size_t count = 1024;
3760
const size_t size = sizeof(uint32_t) * count;
38-
ur_mem_handle_t src_buffer = nullptr;
39-
ur_mem_handle_t dst_buffer = nullptr;
40-
ur_event_handle_t event = nullptr;
41-
std::vector<uint32_t> input;
61+
std::vector<std::vector<uint32_t>> input;
4262
};
63+
4364
UUR_INSTANTIATE_DEVICE_TEST_SUITE(urEventWaitTest);
4465

4566
TEST_P(urEventWaitTest, Success) {
4667
UUR_KNOWN_FAILURE_ON(uur::NativeCPU{});
4768

4869
ur_event_handle_t event1 = nullptr;
49-
ASSERT_SUCCESS(urEnqueueMemBufferCopy(queue, src_buffer, dst_buffer, 0, 0,
50-
size, 0, nullptr, &event1));
70+
ASSERT_SUCCESS(urEnqueueMemBufferCopy(queues[0], src_buffer[0], dst_buffer[0],
71+
0, 0, size, 0, nullptr, &event1));
5172
std::vector<uint32_t> output(count, 1);
5273
ur_event_handle_t event2 = nullptr;
53-
ASSERT_SUCCESS(urEnqueueMemBufferRead(queue, dst_buffer, false, 0, size,
54-
output.data(), 0, nullptr, &event2));
74+
ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[0], dst_buffer[0], false, 0,
75+
size, output.data(), 0, nullptr,
76+
&event2));
5577
std::vector<ur_event_handle_t> events{event1, event2};
56-
EXPECT_SUCCESS(urQueueFlush(queue));
78+
EXPECT_SUCCESS(urQueueFlush(queues[0]));
5779
ASSERT_SUCCESS(
5880
urEventWait(static_cast<uint32_t>(events.size()), events.data()));
59-
ASSERT_EQ(input, output);
81+
ASSERT_EQ(input[0], output);
6082

6183
EXPECT_SUCCESS(urEventRelease(event1));
6284
EXPECT_SUCCESS(urEventRelease(event2));
@@ -75,3 +97,71 @@ TEST_P(urEventWaitNegativeTest, InvalidNullPointerEventList) {
7597
ASSERT_EQ_RESULT(UR_RESULT_ERROR_INVALID_NULL_POINTER,
7698
urEventWait(1, nullptr));
7799
}
100+
101+
TEST_P(urEventWaitTest, WaitWithMultipleContexts) {
102+
UUR_KNOWN_FAILURE_ON(uur::NativeCPU{});
103+
104+
for (size_t i = 0; i < maxNumContexts; i++) {
105+
ASSERT_SUCCESS(urEnqueueMemBufferCopy(queues[i], src_buffer[i],
106+
dst_buffer[i], 0, 0, size, 0, nullptr,
107+
nullptr));
108+
}
109+
110+
std::vector<ur_event_handle_t> events;
111+
std::vector<std::vector<uint32_t>> output;
112+
for (size_t i = 0; i < maxNumContexts; i++) {
113+
output.emplace_back(count, 1);
114+
events.emplace_back();
115+
ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[i], dst_buffer[i], false, 0,
116+
size, output[i].data(), 0, nullptr,
117+
&events.back()));
118+
}
119+
120+
ASSERT_SUCCESS(
121+
urEventWait(static_cast<uint32_t>(events.size()), events.data()));
122+
123+
for (size_t i = 0; i < maxNumContexts; i++) {
124+
ASSERT_EQ(input[i], output[i]);
125+
}
126+
127+
for (auto &event : events) {
128+
EXPECT_SUCCESS(urEventRelease(event));
129+
}
130+
}
131+
132+
TEST_P(urEventWaitTest, WithCrossContextDependencies) {
133+
// OpenCL: https://github.com/intel/llvm/issues/18765
134+
UUR_KNOWN_FAILURE_ON(uur::NativeCPU{}, uur::OpenCL{});
135+
136+
std::vector<uint32_t> output(count, 1);
137+
138+
std::vector<ur_event_handle_t> events;
139+
for (size_t i = 0; i < maxNumContexts - 1; i++) {
140+
auto waitEvent = events.size() ? &events.back() : nullptr;
141+
ur_event_handle_t event = nullptr;
142+
ASSERT_SUCCESS(
143+
urEnqueueMemBufferCopy(queues[i], src_buffer[i], src_buffer[i + 1], 0,
144+
0, size, waitEvent ? 1 : 0, waitEvent, &event));
145+
events.push_back(event);
146+
}
147+
148+
ur_event_handle_t event1 = nullptr;
149+
ASSERT_SUCCESS(urEnqueueMemBufferCopy(queues.back(), src_buffer.back(),
150+
dst_buffer.back(), 0, 0, size, 1,
151+
&events.back(), &event1));
152+
153+
ur_event_handle_t event2 = nullptr;
154+
ASSERT_SUCCESS(urEnqueueMemBufferRead(queues.back(), dst_buffer.back(), false,
155+
0, size, output.data(), 0, nullptr,
156+
&event2));
157+
158+
events.push_back(event1);
159+
events.push_back(event2);
160+
161+
ASSERT_SUCCESS(
162+
urEventWait(static_cast<uint32_t>(events.size()), events.data()));
163+
ASSERT_EQ(input.front(), output);
164+
165+
EXPECT_SUCCESS(urEventRelease(event1));
166+
EXPECT_SUCCESS(urEventRelease(event2));
167+
}

0 commit comments

Comments
 (0)