
Commit 6a86ee4

Start some work on how to select a device
1 parent 15ff1fd commit 6a86ee4

5 files changed: +641 −0 lines changed
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
include(common RESULT_VARIABLE RES)
if(NOT RES)
	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
endif()

nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
{
	"enableParallelBuild": true,
	"threadsPerBuildProcess": 2,
	"isExecuted": false,
	"scriptPath": "",
	"cmake": {
		"configurations": [ "Release", "Debug", "RelWithDebInfo" ],
		"buildModes": [],
		"requiredOptions": []
	},
	"profiles": [
		{
			"backend": "vulkan", // should be none
			"platform": "windows",
			"buildModes": [],
			"runConfiguration": "Release", // we also need to run in Debug and RWDI because it's a foundational example
			"gpuArchitectures": []
		}
	],
	"dependencies": [],
	"data": [
		{
			"dependencies": [],
			"command": [""],
			"outputs": []
		}
	]
}
Lines changed: 273 additions & 0 deletions
@@ -0,0 +1,273 @@
// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O.
// This file is part of the "Nabla Engine".
// For conditions of distribution and use, see copyright notice in nabla.h


// I've moved out a tiny part of this example into a shared header for reuse, please open and read it.
#include "../common/MonoDeviceApplication.hpp"

using namespace nbl;
using namespace core;
using namespace system;
using namespace asset;
using namespace video;


// this time, instead of defining our own `int main()`, we derive from `nbl::system::IApplicationFramework` to play "nice" with all platforms
class HelloComputeApp final : public nbl::examples::MonoDeviceApplication
{
	using base_t = examples::MonoDeviceApplication;

public:
	// Generally speaking, because certain platforms delay initialization from main object construction, you should just forward and not do anything in the ctor
	using base_t::base_t;

	// we stuff all our work here because it's a "single shot" app
	bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
	{
		// Remember to call the base class initialization!
		if (!base_t::onAppInitialized(std::move(system)))
			return false;

		// TODO: redo completely the rest of the sample

		constexpr uint32_t WorkgroupSize = 256;
		constexpr uint32_t WorkgroupCount = 2048;
		// A word about `nbl::asset::IAsset`s: whenever you see an `nbl::asset::ICPUSomething` you can be sure an `nbl::video::IGPUSomething` exists, and they both inherit from `nbl::asset::ISomething`.
		// The convention is that an `ICPU` object represents a potentially Mutable (and in the past, Serializable) recipe for creating an `IGPU` object, and later examples will show automated systems for doing that.
		// The Assets always form a Directed Acyclic Graph and our type system enforces that property at compile time (i.e. an `IBuffer` cannot reference an `IImageView` even indirectly).
		// Another reason for the 1:1 pairing of types is that one can use a CPU-to-GPU associative cache (the asset manager has a default one) and use the pointers to the CPU objects as UUIDs.
		// The ICPUShader is just a mutable container for source code (which can be high level like HLSL needing compilation to SPIR-V, or SPIR-V itself) held in an `nbl::asset::ICPUBuffer`.
		// They can be created: from buffers of code, by compilation from some other source code, or loaded from files (the next example will do that).
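		// (Purely as an illustration of that last point, and not Nabla's actual asset-manager cache: such an associative cache could be as simple as a
		// hypothetical `std::unordered_map<const asset::IAsset*,smart_refctd_ptr<core::IReferenceCounted>>`, where looking up the ICPU pointer either
		// returns the previously converted IGPU object or kicks off a fresh conversion whose result then gets inserted.)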
		smart_refctd_ptr<nbl::asset::ICPUShader> cpuShader;
		{
			// Normally we'd use the ISystem and the IAssetManager to load shaders flexibly from (virtual) files for ease of development (syntax highlighting and Intellisense),
			// but I want to show the full process of assembling a shader from raw source code at least once.
			smart_refctd_ptr<nbl::asset::IShaderCompiler> compiler = make_smart_refctd_ptr<nbl::asset::CHLSLCompiler>(smart_refctd_ptr(m_system));

			// A simple shader that writes out the Global Invocation Index to the position it corresponds to in the buffer
			// Note the injection of a define from C++ to keep the workgroup size in sync.
			// P.S. We don't have an entry point name compiler option because we expect that future compilers should support multiple entry points, so for now there must be a single entry point called "main".
			// P.P.S. Yes, we know workgroup sizes can come from specialization constants, however DXC has a problem with that https://github.com/microsoft/DirectXShaderCompiler/issues/3092
			const string source = "#define WORKGROUP_SIZE "+std::to_string(WorkgroupSize)+R"===(
				[[vk::binding(0,0)]] RWStructuredBuffer<uint32_t> buff;

				[numthreads(WORKGROUP_SIZE,1,1)]
				void main(uint32_t3 ID : SV_DispatchThreadID)
				{
					buff[ID.x] = ID.x;
				}
			)===";

			CHLSLCompiler::SOptions options = {};
			options.stage = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE;
			// we want as much debug info as possible
			options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_LINE_BIT;
			// this lets you source-level debug/step through shaders in RenderDoc
			if (m_device->getPhysicalDevice()->getLimits().shaderNonSemanticInfo)
				options.debugInfoFlags |= IShaderCompiler::E_DEBUG_INFO_FLAGS::EDIF_NON_SEMANTIC_BIT;
			// if you don't set the logger and the source identifier you'll get no meaningful errors
			options.preprocessorOptions.sourceIdentifier = "embedded.comp.hlsl";
			options.preprocessorOptions.logger = m_logger.get();
			cpuShader = compiler->compileToSPIRV(source.c_str(), options);

			if (!cpuShader)
			{
				logFail("Failed to compile the following HLSL Shader:\n%s\n",source.c_str());
				return false;
			}
		}

		// Note how each ILogicalDevice method takes a smart-pointer r-value, so that the GPU objects refcount their dependencies
		smart_refctd_ptr<nbl::video::IGPUShader> shader = m_device->createShader(std::move(cpuShader));
		if (!shader)
		{
			logFail("Failed to create a GPU Shader, seems the Driver doesn't like the SPIR-V we're feeding it!\n");
			return false;
		}

		// we'll cover the specialization constant API in another example
		const nbl::asset::ISpecializedShader::SInfo info(nullptr,nullptr,"main");
		// theoretically a blob of SPIR-V can contain multiple named entry points and one has to be chosen; in practice most compilers only support outputting one
		smart_refctd_ptr<nbl::video::IGPUSpecializedShader> specShader = m_device->createSpecializedShader(shader.get(),info);

		// the simplest example would have used push constants and BDA, but RenderDoc's debugging of that sucks, so I'll demonstrate "classical" binding of buffers with descriptors
		nbl::video::IGPUDescriptorSetLayout::SBinding bindings[1] = {
			{
				.binding=0,
				.type=nbl::asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER,
				.createFlags=IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, // now is not the time for descriptor indexing
				.stageFlags=IGPUShader::ESS_COMPUTE,
				.count=1,
				.samplers=nullptr // irrelevant for a buffer
			}
		};
		smart_refctd_ptr<IGPUDescriptorSetLayout> dsLayout = m_device->createDescriptorSetLayout(bindings,bindings+1);
		if (!dsLayout)
		{
			logFail("Failed to create a Descriptor Layout!\n");
			return false;
		}

		// Nabla actually has facilities for SPIR-V Reflection and "guessing" pipeline layouts for a given SPIR-V, which we'll cover in a different example
		smart_refctd_ptr<nbl::video::IGPUPipelineLayout> pplnLayout = m_device->createPipelineLayout(nullptr,nullptr,smart_refctd_ptr(dsLayout));
		if (!pplnLayout)
		{
			logFail("Failed to create a Pipeline Layout!\n");
			return false;
		}

		// we use strong typing on the pipelines, since there's no reason to polymorphically switch between different pipelines
		smart_refctd_ptr<nbl::video::IGPUComputePipeline> pipeline = m_device->createComputePipeline(nullptr,smart_refctd_ptr(pplnLayout),std::move(specShader));

		// Our Descriptor Sets track (refcount) the resources written into them, so you can pretty much drop and forget whatever you write into them.
		// A later Descriptor Indexing example will test that this tracking is also correct for Update-After-Bind Descriptor Set bindings.
		smart_refctd_ptr<nbl::video::IGPUDescriptorSet> ds;

		// A `nbl::video::DeviceMemoryAllocator` is an interface for implementing anything that can dish out free memory ranges to bind to and back a `nbl::video::IGPUBuffer` or a `nbl::video::IGPUImage`.
		// The Logical Device itself implements the interface and behaves as the simplest allocator: it will create a new `nbl::video::IDeviceMemoryAllocation` every single time.
		// We will cover allocators and suballocation in a later example.
		nbl::video::IDeviceMemoryAllocator::SMemoryOffset allocation = {};
		{
			constexpr size_t BufferSize = sizeof(uint32_t)*WorkgroupSize*WorkgroupCount;
133+
134+
// Always default the creation parameters, there's a lot of extra stuff for DirectX/CUDA interop and slotting into external engines you don't usually care about.
135+
nbl::video::IGPUBuffer::SCreationParams params = {};
136+
params.size = BufferSize;
137+
// While the usages on `ICPUBuffers` are mere hints to our automated CPU-to-GPU conversion systems which need to be patched up anyway,
138+
// the usages on an `IGPUBuffer` are crucial to specify correctly.
139+
params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT;
140+
smart_refctd_ptr<IGPUBuffer> outputBuff = m_device->createBuffer(std::move(params));
141+
if (!outputBuff)
142+
{
143+
logFail("Failed to create a GPU Buffer of size %d!\n",params.size);
144+
return false;
145+
}
146+
147+
// Naming objects is cool because not only errors (such as Vulkan Validation Layers) will show their names, but RenderDoc captures too.
148+
outputBuff->setObjectDebugName("My Output Buffer");
149+
150+
// We don't want to bother explaining best staging buffer practices just yet, so we will create a buffer over
151+
// a memory type thats Host Visible (can be mapped and give the CPU a direct pointer to read from)
152+
nbl::video::IDeviceMemoryBacked::SDeviceMemoryRequirements reqs = outputBuff->getMemoryReqs();
153+
// you can simply constrain the memory requirements by AND-ing the type bits of the host visible memory types
154+
reqs.memoryTypeBits &= m_device->getPhysicalDevice()->getHostVisibleMemoryTypeBits();
155+
156+
// There are actually two `allocate` overloads, one which allocates memory if you already know the type you want.
157+
// And this one which is a utility which tries to allocate from every type that matches your requirements in some order of preference.
158+
// The other of preference (iteration over compatible types) can be controlled by the method's template parameter,
159+
// the default is from lowest index to highest, but skipping over incompatible types.
160+
allocation = m_device->allocate(reqs,outputBuff.get(),nbl::video::IDeviceMemoryAllocation::EMAF_NONE);
161+
if (!allocation.isValid())
162+
{
163+
logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n");
164+
return false;
165+
}
166+
// Note that we performed a Dedicated Allocation above, so there's no need to bind the memory anymore (since the allocator knows the dedication, it can already bind).
167+
// This is a carryover from having an OpenGL backend, where you couldn't have a memory allocation separate from the resource, so all allocations had to be "dedicated".
168+
// In Vulkan dedicated allocations are the most performant and still make sense as long as you won't blow the 4096 allocation limit on windows.
169+
// You should always use dedicated allocations for images used for swapchains, framebuffer attachments (esp transient), as well as objects used in CUDA/DirectX interop.
170+
assert(outputBuff->getBoundMemory()==allocation.memory.get());
171+
172+
// This is a cool utility you can use instead of counting up how much of each descriptor type you need to N_i allocate descriptor sets with layout L_i from a single pool
173+
smart_refctd_ptr<nbl::video::IDescriptorPool> pool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE,&dsLayout.get(),&dsLayout.get()+1);
174+
175+
// note how the pool will go out of scope but thanks for backreferences in each object to its parent/dependency it will be kept alive for as long as all the Sets it allocated
176+
ds = pool->createDescriptorSet(std::move(dsLayout));
177+
// we still use Vulkan 1.0 descriptor update style, could move to Update Templates but Descriptor Buffer ubiquity seems just around the corner
178+
{
179+
IGPUDescriptorSet::SDescriptorInfo info[1];
180+
info[0].desc = smart_refctd_ptr(outputBuff); // bad API, too late to change, should just take raw-pointers since not consumed
181+
info[0].info.buffer = {.offset=0,.size=BufferSize};
182+
IGPUDescriptorSet::SWriteDescriptorSet writes[1] = {
183+
{.dstSet=ds.get(),.binding=0,.arrayElement=0,.count=1,.descriptorType=IDescriptor::E_TYPE::ET_STORAGE_BUFFER,.info=info}
184+
};
185+
m_device->updateDescriptorSets(1u,writes,0u,nullptr);
186+
}
187+
}
188+
189+
// To be able to read the contents of the buffer we need to map its memory
190+
// P.S. Nabla mandates Persistent Memory Mappings on all backends (but not coherent memory types)
191+
const IDeviceMemoryAllocation::MappedMemoryRange memoryRange(allocation.memory.get(),0ull,allocation.memory->getAllocationSize());
192+
auto ptr = m_device->mapMemory(memoryRange,IDeviceMemoryAllocation::EMCAF_READ);
193+
if (!ptr)
194+
{
195+
logFail("Failed to map the Device Memory!\n");
196+
return false;
197+
}
198+
199+
// queues are inherent parts of the device, ergo not refcounted (you refcount the device instead)
200+
IGPUQueue* const queue = getComputeQueue();
201+
202+
// Our commandbuffers are cool because they refcount the resources used by each command you record into them, so you can rely a commandbuffer on keeping them alive.
203+
smart_refctd_ptr<nbl::video::IGPUCommandBuffer> cmdbuf;
204+
{
205+
smart_refctd_ptr<nbl::video::IGPUCommandPool> cmdpool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::ECF_TRANSIENT_BIT);
206+
if (!m_device->createCommandBuffers(cmdpool.get(),IGPUCommandBuffer::EL_PRIMARY,1u,&cmdbuf))
207+
{
208+
logFail("Failed to create Command Buffers!\n");
209+
return false;
210+
}
211+
}
212+
213+
cmdbuf->begin(IGPUCommandBuffer::EU_ONE_TIME_SUBMIT_BIT);
214+
// If you enable the `debugUtils` API Connection feature on a supported backend as we've done, you'll get these pretty debug sections in RenderDoc
215+
cmdbuf->beginDebugMarker("My Compute Dispatch",core::vectorSIMDf(0,1,0,1));
216+
// you want to bind the pipeline first to avoid accidental unbind of descriptor sets due to compatibility matching
217+
cmdbuf->bindComputePipeline(pipeline.get());
218+
cmdbuf->bindDescriptorSets(nbl::asset::EPBP_COMPUTE,pplnLayout.get(),0,1,&ds.get());
219+
cmdbuf->dispatch(WorkgroupCount,1,1);
220+
cmdbuf->endDebugMarker();
221+
// Normally you'd want to perform a memory barrier when using the output of a compute shader or renderpass,
222+
// however waiting on a timeline semaphore (or fence) on the Host makes all Device writes visible.
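		// (For illustration only: if you did want that barrier, in raw Vulkan terms it would be a compute-to-host memory dependency recorded before ending the commandbuffer, roughly
		//     VkMemoryBarrier barrier = {VK_STRUCTURE_TYPE_MEMORY_BARRIER,nullptr,VK_ACCESS_SHADER_WRITE_BIT,VK_ACCESS_HOST_READ_BIT};
		//     vkCmdPipelineBarrier(vkCmdBuf,VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,VK_PIPELINE_STAGE_HOST_BIT,0,1,&barrier,0,nullptr,0,nullptr);
		// the Nabla equivalent would go through the commandbuffer's own pipeline-barrier recording call, which this sample deliberately skips.)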
		cmdbuf->end();

		// TODO: Redo with timeline semaphores
		smart_refctd_ptr<IGPUFence> done = m_device->createFence(IGPUFence::ECF_UNSIGNALED);
		{
			// Default: we have no semaphores to wait on before we can start our workload
			IGPUQueue::SSubmitInfo submitInfo = {};
			// The IGPUCommandBuffer is the only object whose usage does not get automagically tracked internally; you're responsible for holding onto it for as long as the GPU needs it.
			// This is why our commandbuffer, even though it's transient, lives in a scope equal to or above the place where we wait for the submission to be signalled as complete.
			submitInfo.commandBufferCount = 1;
			submitInfo.commandBuffers = &cmdbuf.get();

			// We have a cool integration with RenderDoc that allows you to start and end captures programmatically.
			// This is super useful for debugging multi-queue workloads, because by default RenderDoc delimits captures only by Swapchain presents.
			queue->startCapture();
			queue->submit(1u,&submitInfo,done.get());
			queue->endCapture();
		}
		// As the name implies, this function will not progress until the fence signals or repeated waiting returns an error.
		m_device->blockForFences(1,&done.get());

		// You don't need to do this, but we put it here to demonstrate that it's safe to drop a commandbuffer after the GPU is done with it (try moving it above and see if you BSOD or just get a validation error).
		cmdbuf = nullptr;

		// if the mapping is not coherent the range needs to be invalidated to pull in new data for the CPU's caches
		if (!allocation.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
			m_device->invalidateMappedMemoryRanges(1,&memoryRange);

		// a simple test to check we got the right thing back
		auto buffData = reinterpret_cast<const uint32_t*>(ptr);
		for (auto i=0u; i<WorkgroupSize*WorkgroupCount; i++)
			if (buffData[i]!=i)
			{
				logFail("DWORD at position %d doesn't match!\n",i);
				return false;
			}
		m_device->unmapMemory(allocation.memory.get());

		return true;
	}

	// Platforms like WASM expect the main entry point to periodically return control, hence if you want a cross-platform app you have to let the framework deal with your "game loop"
	void workLoopBody() override {}

	// Whether to keep invoking the above. In this example, because it's headless GPU compute, we do all the work in the app initialization.
	bool keepRunning() override {return false;}
};


NBL_MAIN_FUNC(HelloComputeApp)
Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
import org.DevshGraphicsProgramming.Agent
import org.DevshGraphicsProgramming.BuilderInfo
import org.DevshGraphicsProgramming.IBuilder

class CSystemTestBuilder extends IBuilder
{
	public CSystemTestBuilder(Agent _agent, _info)
	{
		super(_agent, _info)
	}

	@Override
	public boolean prepare(Map axisMapping)
	{
		return true
	}

	@Override
	public boolean build(Map axisMapping)
	{
		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")

		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
		def nameOfConfig = getNameOfConfig(config)

		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")

		return true
	}

	@Override
	public boolean test(Map axisMapping)
	{
		return true
	}

	@Override
	public boolean install(Map axisMapping)
	{
		return true
	}
}

def create(Agent _agent, _info)
{
	return new CSystemTestBuilder(_agent, _info)
}

return this
