
Commit 6a86ee4

Start some work on how to select a device
1 parent 15ff1fd commit 6a86ee4

5 files changed: +641 −0 lines changed
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
include(common RESULT_VARIABLE RES)
if(NOT RES)
	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
endif()

nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
{
	"enableParallelBuild": true,
	"threadsPerBuildProcess": 2,
	"isExecuted": false,
	"scriptPath": "",
	"cmake": {
		"configurations": [ "Release", "Debug", "RelWithDebInfo" ],
		"buildModes": [],
		"requiredOptions": []
	},
	"profiles": [
		{
			"backend": "vulkan", // should be none
			"platform": "windows",
			"buildModes": [],
			"runConfiguration": "Release", // we also need to run in Debug and RWDI because it's a foundational example
			"gpuArchitectures": []
		}
	],
	"dependencies": [],
	"data": [
		{
			"dependencies": [],
			"command": [""],
			"outputs": []
		}
	]
}
Lines changed: 273 additions & 0 deletions
@@ -0,0 +1,273 @@
// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O.
// This file is part of the "Nabla Engine".
// For conditions of distribution and use, see copyright notice in nabla.h


// I've moved out a tiny part of this example into a shared header for reuse, please open and read it.
#include "../common/MonoDeviceApplication.hpp"

using namespace nbl;
using namespace core;
using namespace system;
using namespace asset;
using namespace video;


// this time, instead of defining our own `int main()`, we derive from `nbl::system::IApplicationFramework` to play "nice" with all platforms
class HelloComputeApp final : public nbl::examples::MonoDeviceApplication
{
	using base_t = examples::MonoDeviceApplication;

public:
	// Generally speaking, because certain platforms delay initialization from main object construction, you should just forward and not do anything in the ctor
	using base_t::base_t;

	// we stuff all our work here because it's a "single shot" app
	bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
	{
		// Remember to call the base class initialization!
		if (!base_t::onAppInitialized(std::move(system)))
			return false;

		// TODO: redo completely the rest of the sample

		constexpr uint32_t WorkgroupSize = 256;
		constexpr uint32_t WorkgroupCount = 2048;
		// A word about `nbl::asset::IAsset`s: whenever you see an `nbl::asset::ICPUSomething` you can be sure an `nbl::video::IGPUSomething` exists, and they both inherit from `nbl::asset::ISomething`.
		// The convention is that an `ICPU` object represents a potentially Mutable (and in the past, Serializable) recipe for creating an `IGPU` object, and later examples will show automated systems for doing that.
		// The Assets always form a Directed Acyclic Graph and our type system enforces that property at compile time (i.e. an `IBuffer` cannot reference an `IImageView` even indirectly).
		// Another reason for the 1:1 pairing of types is that one can use a CPU-to-GPU associative cache (the asset manager has a default one) and use the pointers to the CPU objects as UUIDs.
		// The ICPUShader is just a mutable container for source code (which can be high level like HLSL needing compilation to SPIR-V, or SPIR-V itself) held in an `nbl::asset::ICPUBuffer`.
		// They can be created: from buffers of code, by compilation from some other source code, or loaded from files (the next example will do that).
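		// (Purely as an illustration of that last point, and not Nabla's actual asset-manager cache: such an associative cache could be as simple as a
		// hypothetical `std::unordered_map<const asset::IAsset*,smart_refctd_ptr<core::IReferenceCounted>>`, where looking up the ICPU pointer either
		// returns the previously converted IGPU object or kicks off a fresh conversion whose result then gets inserted.)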
		smart_refctd_ptr<nbl::asset::ICPUShader> cpuShader;
		{
			// Normally we'd use the ISystem and the IAssetManager to load shaders flexibly from (virtual) files for ease of development (syntax highlighting and Intellisense),
			// but I want to show the full process of assembling a shader from raw source code at least once.
			smart_refctd_ptr<nbl::asset::IShaderCompiler> compiler = make_smart_refctd_ptr<nbl::asset::CHLSLCompiler>(smart_refctd_ptr(m_system));

			// A simple shader that writes out the Global Invocation Index to the position it corresponds to in the buffer
			// Note the injection of a define from C++ to keep the workgroup size in sync.
			// P.S. We don't have an entry point name compiler option because we expect that future compilers should support multiple entry points, so for now there must be a single entry point called "main".
			// P.P.S. Yes, we know workgroup sizes can come from specialization constants, however DXC has a problem with that https://github.com/microsoft/DirectXShaderCompiler/issues/3092
			const string source = "#define WORKGROUP_SIZE "+std::to_string(WorkgroupSize)+R"===(
				[[vk::binding(0,0)]] RWStructuredBuffer<uint32_t> buff;

				[numthreads(WORKGROUP_SIZE,1,1)]
				void main(uint32_t3 ID : SV_DispatchThreadID)
				{
					buff[ID.x] = ID.x;
				}
			)===";

			CHLSLCompiler::SOptions options = {};
			options.stage = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE;
			// we want as much debug info as possible
			options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_LINE_BIT;
			// this lets you source-level debug/step through shaders in RenderDoc
			if (m_device->getPhysicalDevice()->getLimits().shaderNonSemanticInfo)
				options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_NON_SEMANTIC_BIT;
			// if you don't set the logger and the source identifier you'll get no meaningful errors
			options.preprocessorOptions.sourceIdentifier = "embedded.comp.hlsl";
			options.preprocessorOptions.logger = m_logger.get();
			cpuShader = compiler->compileToSPIRV(source.c_str(), options);

			if (!cpuShader)
			{
				logFail("Failed to compile the following HLSL Shader:\n%s\n",source.c_str());
				return false;
			}
		}

		// Note how each ILogicalDevice method takes a smart-pointer r-value, so that the GPU objects refcount their dependencies
		smart_refctd_ptr<nbl::video::IGPUShader> shader = m_device->createShader(std::move(cpuShader));
		if (!shader)
		{
			logFail("Failed to create a GPU Shader, seems the Driver doesn't like the SPIR-V we're feeding it!\n");
			return false;
		}

		// we'll cover the specialization constant API in another example
		const nbl::asset::ISpecializedShader::SInfo info(nullptr,nullptr,"main");
		// theoretically a blob of SPIR-V can contain multiple named entry points and one has to be chosen; in practice most compilers only support outputting one
		smart_refctd_ptr<nbl::video::IGPUSpecializedShader> specShader = m_device->createSpecializedShader(shader.get(),info);

		// the simplest example would have used push constants and BDA, but RenderDoc's debugging of that sucks, so I'll demonstrate "classical" binding of buffers with descriptors
		nbl::video::IGPUDescriptorSetLayout::SBinding bindings[1] = {
			{
				.binding=0,
				.type=nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER,
				.createFlags=IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, // now is not the time for descriptor indexing
				.stageFlags=IGPUShader::ESS_COMPUTE,
				.count=1,
				.samplers=nullptr // irrelevant for a buffer
			}
		};
		smart_refctd_ptr<IGPUDescriptorSetLayout> dsLayout = m_device->createDescriptorSetLayout(bindings,bindings+1);
		if (!dsLayout)
		{
			logFail("Failed to create a Descriptor Layout!\n");
			return false;
		}

		// Nabla actually has facilities for SPIR-V Reflection and "guessing" pipeline layouts for a given SPIR-V, which we'll cover in a different example
		smart_refctd_ptr<nbl::video::IGPUPipelineLayout> pplnLayout = m_device->createPipelineLayout(nullptr,nullptr,smart_refctd_ptr(dsLayout));
		if (!pplnLayout)
		{
			logFail("Failed to create a Pipeline Layout!\n");
			return false;
		}

		// we use strong typing on the pipelines, since there's no reason to polymorphically switch between different pipelines
		smart_refctd_ptr<nbl::video::IGPUComputePipeline> pipeline = m_device->createComputePipeline(nullptr,smart_refctd_ptr(pplnLayout),std::move(specShader));

		// Our Descriptor Sets track (refcount) the resources written into them, so you can pretty much drop and forget whatever you write into them.
		// A later Descriptor Indexing example will test that this tracking is also correct for Update-After-Bind Descriptor Set bindings.
		smart_refctd_ptr<nbl::video::IGPUDescriptorSet> ds;

		// A `nbl::video::DeviceMemoryAllocator` is an interface for implementing anything that can dish out free memory ranges to bind to and back a `nbl::video::IGPUBuffer` or a `nbl::video::IGPUImage`.
		// The Logical Device itself implements the interface and behaves as the simplest allocator: it will create a new `nbl::video::IDeviceMemoryAllocation` every single time.
		// We will cover allocators and suballocation in a later example.
		nbl::video::IDeviceMemoryAllocator::SMemoryOffset allocation = {};
		{
			constexpr size_t BufferSize = sizeof(uint32_t)*WorkgroupSize*WorkgroupCount;
133+
134+
// Always default the creation parameters, there's a lot of extra stuff for DirectX/CUDA interop and slotting into external engines you don't usually care about.
135+
nbl::video::IGPUBuffer::SCreationParams params = {};
136+
params.size = BufferSize;
137+
// While the usages on `ICPUBuffers` are mere hints to our automated CPU-to-GPU conversion systems which need to be patched up anyway,
138+
// the usages on an `IGPUBuffer` are crucial to specify correctly.
139+
params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
140+
smart_refctd_ptr<IGPUBuffer> outputBuff = m_device->createBuffer(std::move(params));
141+
if (!outputBuff)
142+
{
143+
logFail("Failed to create a GPU Buffer of size %d!\n",params.size);
144+
return false;
145+
}
146+
147+
// Naming objects is cool because not only errors (such as Vulkan Validation Layers) will show their names, but RenderDoc captures too.
148+
outputBuff->setObjectDebugName("My Output Buffer");
149+
150+
// We don't want to bother explaining best staging buffer practices just yet, so we will create a buffer over
151+
// a memory type thats Host Visible (can be mapped and give the CPU a direct pointer to read from)
152+
nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = outputBuff->getMemoryReqs();
153+
// you can simply constrain the memory requirements by AND-ing the type bits of the host visible memory types
154+
reqs.memoryTypeBits &= m_device->getPhysicalDevice()->getHostVisibleMemoryTypeBits();
155+
156+
// There are actually two `allocate` overloads, one which allocates memory if you already know the type you want.
157+
// And this one which is a utility which tries to allocate from every type that matches your requirements in some order of preference.
158+
// The other of preference (iteration over compatible types) can be controlled by the method's template parameter,
159+
// the default is from lowest index to highest, but skipping over incompatible types.
160+
allocation = m_device->allocate(reqs,outputBuff.get(),nbl::video::IDeviceMemoryAllocation::EMAF_NONE);
161+
if (!allocation.isValid())
162+
{
163+
logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n");
164+
return false;
165+
}
166+
// Note that we performed a Dedicated Allocation above, so there's no need to bind the memory anymore (since the allocator knows the dedication, it can already bind).
167+
// This is a carryover from having an OpenGL backend, where you couldn't have a memory allocation separate from the resource, so all allocations had to be "dedicated".
168+
// In Vulkan dedicated allocations are the most performant and still make sense as long as you won't blow the 4096 allocation limit on windows.
169+
// You should always use dedicated allocations for images used for swapchains, framebuffer attachments (esp transient), as well as objects used in CUDA/DirectX interop.
170+
assert(outputBuff->getBoundMemory()==allocation.memory.get());
171+
172+
// This is a cool utility you can use instead of counting up how much of each descriptor type you need to N_i allocate descriptor sets with layout L_i from a single pool
173+
smart_refctd_ptr<nbl::video::IDescriptorPool> pool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE,&dsLayout.get(),&dsLayout.get()+1);
174+
175+
// note how the pool will go out of scope but thanks for backreferences in each object to its parent/dependency it will be kept alive for as long as all the Sets it allocated
176+
ds = pool->createDescriptorSet(std::move(dsLayout));
177+
// we still use Vulkan 1.0 descriptor update style, could move to Update Templates but Descriptor Buffer ubiquity seems just around the corner
178+
{
179+
IGPUDescriptorSet::SDescriptorInfo info[1];
180+
info[0].desc = smart_refctd_ptr(outputBuff); // bad API, too late to change, should just take raw-pointers since not consumed
181+
info[0].info.buffer = {.offset=0,.size=BufferSize};
182+
IGPUDescriptorSet::SWriteDescriptorSet writes[1] = {
183+
{.dstSet=ds.get(),.binding=0,.arrayElement=0,.count=1,.descriptorType=IDescriptor::E_TYPE::ET_STORAGE_BUFFER,.info=info}
184+
};
185+
m_device->updateDescriptorSets(1u,writes,0u,nullptr);
186+
}
187+
}
188+
189+
// To be able to read the contents of the buffer we need to map its memory
190+
// P.S. Nabla mandates Persistent Memory Mappings on all backends (but not coherent memory types)
191+
const IDeviceMemoryAllocation::MappedMemoryRange memoryRange(allocation.memory.get(),0ull,allocation.memory->getAllocationSize());
192+
auto ptr = m_device->mapMemory(memoryRange,IDeviceMemoryAllocation::EMCAF_READ);
193+
if (!ptr)
194+
{
195+
logFail("Failed to map the Device Memory!\n");
196+
return false;
197+
}
198+
199+
// queues are inherent parts of the device, ergo not refcounted (you refcount the device instead)
200+
IGPUQueue* const queue = getComputeQueue();
201+
202+
// Our commandbuffers are cool because they refcount the resources used by each command you record into them, so you can rely a commandbuffer on keeping them alive.
203+
smart_refctd_ptr<nbl::video::IGPUCommandBuffer> cmdbuf;
204+
{
205+
smart_refctd_ptr<nbl::video::IGPUCommandPool> cmdpool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::ECF_TRANSIENT_BIT);
206+
if (!m_device->createCommandBuffers(cmdpool.get(),IGPUCommandBuffer::EL_PRIMARY,1u,&cmdbuf))
207+
{
208+
logFail("Failed to create Command Buffers!\n");
209+
return false;
210+
}
211+
}
212+
213+
cmdbuf->begin(IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT);
214+
// If you enable the `debugUtils` API Connection feature on a supported backend as we've done, you'll get these pretty debug sections in RenderDoc
215+
cmdbuf->beginDebugMarker("My Compute Dispatch",core::vectorSIMDf(0,1,0,1));
216+
// you want to bind the pipeline first to avoid accidental unbind of descriptor sets due to compatibility matching
217+
cmdbuf->bindComputePipeline(pipeline.get());
218+
cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE,pplnLayout.get(),0,1,&ds.get());
219+
cmdbuf->dispatch(WorkgroupCount,1,1);
220+
cmdbuf->endDebugMarker();
221+
// Normally you'd want to perform a memory barrier when using the output of a compute shader or renderpass,
222+
// however waiting on a timeline semaphore (or fence) on the Host makes all Device writes visible.
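		// (For illustration only: if you did want that barrier, in raw Vulkan terms it would be a compute-to-host memory dependency recorded before ending the commandbuffer, roughly
		//     VkMemoryBarrier barrier = {VK_STRUCTURE_TYPE_MEMORY_BARRIER,nullptr,VK_ACCESS_SHADER_WRITE_BIT,VK_ACCESS_HOST_READ_BIT};
		//     vkCmdPipelineBarrier(vkCmdBuf,VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,VK_PIPELINE_STAGE_HOST_BIT,0,1,&barrier,0,nullptr,0,nullptr);
		// the Nabla equivalent would go through the commandbuffer's own pipeline-barrier recording call, which this sample deliberately skips.)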
		cmdbuf->end();

		// TODO: Redo with timeline semaphores
		smart_refctd_ptr<IGPUFence> done = m_device->createFence(IGPUFence::ECF_UNSIGNALED);
		{
			// Default: we have no semaphores to wait on before we can start our workload
			IGPUQueue::SSubmitInfo submitInfo = {};
			// The IGPUCommandBuffer is the only object whose usage does not get automagically tracked internally; you're responsible for holding onto it for as long as the GPU needs it.
			// This is why our commandbuffer, even though it's transient, lives in a scope equal to or above the place where we wait for the submission to be signalled as complete.
			submitInfo.commandBufferCount = 1;
			submitInfo.commandBuffers = &cmdbuf.get();

			// We have a cool integration with RenderDoc that allows you to start and end captures programmatically.
			// This is super useful for debugging multi-queue workloads, because by default RenderDoc delimits captures only by Swapchain presents.
			queue->startCapture();
			queue->submit(1u,&submitInfo,done.get());
			queue->endCapture();
		}
		// As the name implies, this function will not progress until the fence signals or repeated waiting returns an error.
		m_device->blockForFences(1,&done.get());

		// You don't need to do this, but we put it here to demonstrate that it's safe to drop a commandbuffer after the GPU is done with it (try moving it above and see if you BSOD or just get a validation error).
		cmdbuf = nullptr;

		// if the mapping is not coherent the range needs to be invalidated to pull in new data for the CPU's caches
		if (!allocation.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
			m_device->invalidateMappedMemoryRanges(1,&memoryRange);

		// a simple test to check we got the right thing back
		auto buffData = reinterpret_cast<const uint32_t*>(ptr);
		for (auto i=0u; i<WorkgroupSize*WorkgroupCount; i++)
			if (buffData[i]!=i)
			{
				logFail("DWORD at position %d doesn't match!\n",i);
				return false;
			}
		m_device->unmapMemory(allocation.memory.get());

		return true;
	}

	// Platforms like WASM expect the main entry point to periodically return control, hence if you want a cross-platform app you have to let the framework deal with your "game loop"
	void workLoopBody() override {}

	// Whether to keep invoking the above. In this example, because it's headless GPU compute, we do all the work in the app initialization.
	bool keepRunning() override {return false;}
};


NBL_MAIN_FUNC(HelloComputeApp)
Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
import org.DevshGraphicsProgramming.Agent
import org.DevshGraphicsProgramming.BuilderInfo
import org.DevshGraphicsProgramming.IBuilder

class CSystemTestBuilder extends IBuilder
{
	public CSystemTestBuilder(Agent _agent, _info)
	{
		super(_agent, _info)
	}

	@Override
	public boolean prepare(Map axisMapping)
	{
		return true
	}

	@Override
	public boolean build(Map axisMapping)
	{
		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")

		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
		def nameOfConfig = getNameOfConfig(config)

		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")

		return true
	}

	@Override
	public boolean test(Map axisMapping)
	{
		return true
	}

	@Override
	public boolean install(Map axisMapping)
	{
		return true
	}
}

def create(Agent _agent, _info)
{
	return new CSystemTestBuilder(_agent, _info)
}

return this
