// For conditions of distribution and use, see copyright notice in nabla.h
// I've moved out a tiny part of this example into a shared header for reuse, please open and read it.
#include "../common/MonoDeviceApplication.hpp"

using namespace nbl;
using namespace core;
using namespace system;
using namespace asset;
using namespace video;

// this time instead of defining our own `int main()` we derive from `nbl::system::IApplicationFramework` to play "nice" with all platforms
class HelloComputeApp final : public nbl::examples::MonoDeviceApplication
{
using base_t = examples::MonoDeviceApplication;
public:
// Generally speaking, because certain platforms delay initialization from main object construction, you should just forward and not do anything in the ctor
using base_t::base_t;

// we stuff all our work here because it's a "single shot" app
bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
{
// Remember to call the base class initialization!
if (!base_t::onAppInitialized(std::move(system)))
return false;

// TODO: redo completely the rest of the sample

constexpr uint32_t WorkgroupSize = 256;
constexpr uint32_t WorkgroupCount = 2048;
// A word about `nbl::asset::IAsset`s: whenever you see an `nbl::asset::ICPUSomething` you can be sure an `nbl::video::IGPUSomething` exists, and they both inherit from `nbl::asset::ISomething`.
// The convention is that an `ICPU` object represents a potentially Mutable (and in the past, Serializable) recipe for creating an `IGPU` object, and later examples will show automated systems for doing that.
// The Assets always form a Directed Acyclic Graph and our type system enforces that property at compile time (i.e. an `IBuffer` cannot reference an `IImageView` even indirectly).
// Another reason for the 1:1 pairing of types is that one can use a CPU-to-GPU associative cache (asset manager has a default one) and use the pointers to the CPU objects as UUIDs.
// The ICPUShader is just a mutable container for source code (can be high level like HLSL needing compilation to SPIR-V or SPIR-V itself) held in an `nbl::asset::ICPUBuffer`.
// They can be created: from buffers of code, by compilation from some other source code, or loaded from files (next example will do that).
// Normally we'd use the ISystem and the IAssetManager to load shaders flexibly from (virtual) files for ease of development (syntax highlighting and Intellisense),
// but I want to show the full process of assembling a shader from raw source code at least once.
// A simple shader that writes out the Global Invocation Index to the position it corresponds to in the buffer
// Note the injection of a define from C++ to keep the workgroup size in sync.
// P.S. We don't have an entry point name compiler option because we expect that future compilers should support multiple entry points, so for now there must be a single entry point called "main".
// P.P.S. Yes we know workgroup sizes can come from specialization constants, however DXC has a problem with that https://github.com/microsoft/DirectXShaderCompiler/issues/3092
// theoretically a blob of SPIR-V can contain multiple named entry points and one has to be chosen, in practice most compilers only support outputting one
// the simplest example would have used push constants and BDA, but RenderDoc's debugging of that sucks, so I'll demonstrate "classical" binding of buffers with descriptors
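// Purely to make the comments above concrete, here's a hedged sketch of what such a shader's HLSL source could look like when embedded as a raw string;
// the binding slot, buffer element type and variable name are illustrative assumptions, not lifted from this sample.
constexpr const char* HLSLSourceSketch = R"===(
[[vk::binding(0,0)]] RWStructuredBuffer<uint> buff;

[numthreads(WORKGROUP_SIZE,1,1)]
void main(uint3 ID : SV_DispatchThreadID)
{
	buff[ID.x] = ID.x;
}
)===";
// note that `WORKGROUP_SIZE` is deliberately left undefined in the source above, the shader compiler options would inject it as a preprocessor define so it always matches the C++ `WorkgroupSize` constant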
// A `nbl::video::DeviceMemoryAllocator` is an interface for anything that can dish out free memory ranges to bind to and back a `nbl::video::IGPUBuffer` or a `nbl::video::IGPUImage`
// The Logical Device itself implements the interface and behaves as the most simple allocator, it will create a new `nbl::video::IDeviceMemoryAllocation` every single time.
// We will cover allocators and suballocation in a later example.
// Always default the creation parameters, there's a lot of extra stuff for DirectX/CUDA interop and slotting into external engines you don't usually care about.
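// To ground the comments above, here's a hedged sketch of creating the output buffer and letting the Logical Device act as the allocator;
// names like `SCreationParams`, `getMemoryReqs`, `allocate` and `isValid` are assumptions about the API shape for illustration, not calls copied from this sample.
IGPUBuffer::SCreationParams params = {};
params.size = sizeof(uint32_t)*WorkgroupSize*WorkgroupCount;
params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
smart_refctd_ptr<IGPUBuffer> outputBuff = m_device->createBuffer(std::move(params));
// ask for the memory requirements, restrict them to host-visible types so the CPU can read the result back,
// then let the device dish out (and dedicate) a fresh allocation to back the buffer
auto reqs = outputBuff->getMemoryReqs();
reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits();
auto allocation = m_device->allocate(reqs,outputBuff.get());
if (!allocation.isValid()) // any equivalent failure check works here
{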
logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n");
return false;
}
// Note that we performed a Dedicated Allocation above, so there's no need to bind the memory anymore (since the allocator knows the dedication, it can already bind).
// This is a carryover from having an OpenGL backend, where you couldn't have a memory allocation separate from the resource, so all allocations had to be "dedicated".
// In Vulkan dedicated allocations are the most performant and still make sense as long as you won't blow the 4096 allocation limit on Windows.
// You should always use dedicated allocations for images used for swapchains, framebuffer attachments (esp transient), as well as objects used in CUDA/DirectX interop.
// This is a cool utility you can use instead of counting up how much of each descriptor type you need in order to allocate N_i descriptor sets with layout L_i from a single pool
smart_refctd_ptr<nbl::video::IDescriptorPool> pool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE,&dsLayout.get(),&dsLayout.get()+1);
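// A hedged sketch of carving a single Set out of that pool and pointing binding 0 at the output buffer;
// `createDescriptorSet` and the write/info structs mirror their Vulkan counterparts here, but treat the exact member names as illustrative assumptions.
smart_refctd_ptr<IGPUDescriptorSet> ds = pool->createDescriptorSet(smart_refctd_ptr(dsLayout));
{
	IGPUDescriptorSet::SDescriptorInfo info = {};
	info.desc = outputBuff; // the descriptor also keeps the buffer alive for as long as the Set lives
	info.info.buffer.offset = 0;
	info.info.buffer.size = outputBuff->getSize();
	IGPUDescriptorSet::SWriteDescriptorSet write = {};
	write.dstSet = ds.get();
	write.binding = 0;
	write.arrayElement = 0;
	write.count = 1;
	write.info = &info;
	m_device->updateDescriptorSets(1,&write,0,nullptr);
}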
// note how the pool will go out of scope, but thanks to the backreferences each object holds to its parents/dependencies it will be kept alive for as long as any of the Sets it allocated
auto ptr = m_device->mapMemory(memoryRange,IDeviceMemoryAllocation::EMCAF_READ);
if (!ptr)
{
logFail("Failed to map the Device Memory!\n");
return false;
}
// queues are inherent parts of the device, ergo not refcounted (you refcount the device instead)
IGPUQueue* const queue = getComputeQueue();
// Our commandbuffers are cool because they refcount the resources used by each command you record into them, so you can rely on a commandbuffer to keep them alive.
// By default we have no semaphores to wait on before we can start our workload
IGPUQueue::SSubmitInfo submitInfo = {};
// The IGPUCommandBuffer is the only object whose usage does not get automagically tracked internally; you're responsible for holding onto it as long as the GPU needs it.
// This is why our commandbuffer, even though it's transient, lives in a scope equal to or above the place where we wait for the submission to be signalled as complete.
submitInfo.commandBufferCount = 1;
submitInfo.commandBuffers = &cmdbuf.get();
// We have a cool integration with RenderDoc that allows you to start and end captures programmatically.
// This is super useful for debugging multi-queue workloads and by default RenderDoc delimits captures only by Swapchain presents.
queue->startCapture();
queue->submit(1u,&submitInfo,done.get());
queue->endCapture();
}
// As the name implies this function will not progress until the fence signals or repeated waiting returns an error.
m_device->blockForFences(1,&done.get());
// You don't need to do this, but it's put here to demonstrate that it's safe to drop a commandbuffer after the GPU is done with it (try moving it above and see if you BSOD or just get a validation error).
cmdbuf = nullptr;
// if the mapping is not coherent the range needs to be invalidated to pull in new data for the CPU's caches
if (!allocation.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
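{
	// a hedged sketch: pull the GPU's writes into the CPU's view of the memory here, assuming the same `memoryRange`
	// handed to `mapMemory` above is what the device's analogue of `vkInvalidateMappedMemoryRanges` expects
	m_device->invalidateMappedMemoryRanges(1,&memoryRange);
}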
// a simple test to check we got the right thing back
auto buffData = reinterpret_cast<const uint32_t*>(ptr);
for (auto i=0; i<WorkgroupSize*WorkgroupCount; i++)
if (buffData[i]!=i)
{
logFail("DWORD at position %d doesn't match!\n",i);
return false;
}
m_device->unmapMemory(allocation.memory.get());
return true;
}
// Platforms like WASM expect the main entry point to periodically return control, hence if you want a cross-platform app you have to let the framework deal with your "game loop"
void workLoopBody() override {}

// Whether to keep invoking the above. In this example, because it's headless GPU compute, we do all the work in the app initialization.
bool keepRunning() override { return false; }
};