Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sub-allocated descriptor sets #95

Draft
wants to merge 13 commits into
base: master
Choose a base branch
from
33 changes: 20 additions & 13 deletions 08_HelloSwapchain/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@ class WindowedApplication : public virtual BasicMultiQueueApplication
public:
using base_t::base_t;

// We inherit from an application that tries to find Graphics and Compute queues
// because applications with presentable images often want to perform Graphics family operations
virtual bool isComputeOnly() const {return false;}

deprilula28 marked this conversation as resolved.
Show resolved Hide resolved
virtual video::IAPIConnection::SFeatures getAPIFeaturesToEnable() override
{
auto retval = base_t::getAPIFeaturesToEnable();
Expand All @@ -26,22 +30,23 @@ class WindowedApplication : public virtual BasicMultiQueueApplication
}

// New function, we need to know about surfaces to create ahead of time
virtual core::vector<const video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const = 0;
virtual core::vector<video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const = 0;

virtual core::set<video::IPhysicalDevice*> filterDevices(const core::SRange<video::IPhysicalDevice* const>& physicalDevices) const
// We have a very simple heuristic, the device must be able to render to all windows!
// (want to make something more complex? you're on your own!)
virtual void filterDevices(core::set<video::IPhysicalDevice*>& physicalDevices) const
{
const auto firstFilter = base_t::filterDevices(physicalDevices);
base_t::filterDevices(physicalDevices);

video::SPhysicalDeviceFilter deviceFilter = {};

const auto surfaces = getSurfaces();
deviceFilter.requiredSurfaceCompatibilities = surfaces.data();
deviceFilter.requiredSurfaceCompatibilitiesCount = surfaces.size();
auto surfaces = getSurfaces();
deviceFilter.requiredSurfaceCompatibilities = {surfaces};

return deviceFilter(physicalDevices);
}

virtual bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
virtual bool onAppInitialized(core::smart_refctd_ptr<system::ISystem>&& system) override
{
// Remember to call the base class initialization!
if (!base_t::onAppInitialized(std::move(system)))
Expand All @@ -52,6 +57,7 @@ class WindowedApplication : public virtual BasicMultiQueueApplication
#else
#error "Unimplemented!"
#endif
return true;
}

core::smart_refctd_ptr<ui::IWindowManager> m_winMgr;
Expand Down Expand Up @@ -87,7 +93,7 @@ class SingleNonResizableWindowApplication : public virtual WindowedApplication
public:
using base_t::base_t;

virtual bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
virtual bool onAppInitialized(core::smart_refctd_ptr<system::ISystem>&& system) override
{
// Remember to call the base class initialization!
if (!base_t::onAppInitialized(std::move(system)))
Expand All @@ -98,7 +104,7 @@ class SingleNonResizableWindowApplication : public virtual WindowedApplication
return true;
}

virtual core::vector<const video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const
virtual core::vector<video::SPhysicalDeviceFilter::SurfaceCompatibility> getSurfaces() const
{
return {{m_surface.get()/*,EQF_NONE*/}};
}
Expand All @@ -112,15 +118,15 @@ class SingleNonResizableWindowApplication : public virtual WindowedApplication
}

protected:
virtual IWindow::SCreationParams getWindowCreationParams() const
virtual ui::IWindow::SCreationParams getWindowCreationParams() const
{
IWindow::SCreationParams params = {};
params.callback = make_smart_refctd_ptr<IWindowClosedCallback>();
ui::IWindow::SCreationParams params = {};
params.callback = core::make_smart_refctd_ptr<IWindowClosedCallback>();
params.width = 640;
params.height = 480;
params.x = 32;
params.y = 32;
params.flags = IWindow::ECF_NONE;
params.flags = ui::IWindow::ECF_NONE;
params.windowCaption = "SingleNonResizableWindowApplication";
return params;
}
Expand All @@ -130,6 +136,7 @@ class SingleNonResizableWindowApplication : public virtual WindowedApplication
};
}

#include "nbl/video/CVulkanSwapchain.h"

using namespace nbl;
using namespace core;
Expand Down
24 changes: 24 additions & 0 deletions 67_SubAllocatedDescriptorSet/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Build script for this example: creates the executable and, optionally, embeds its resources.

# Pull in the shared `common` module; RES holds the outcome of the include and is checked right below.
include(common RESULT_VARIABLE RES)
if(NOT RES)
message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
endif()

# Create the executable target through the project helper.
# NOTE(review): the four empty arguments are presumably optional extra inputs of the helper - confirm against its definition.
nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")

# Only when builtin-resource embedding is enabled: gather everything under `app_resources` and link it into the executable.
if(NBL_EMBED_BUILTIN_RESOURCES)
# Name of the helper target that carries the embedded resource data.
set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
set(RESOURCE_DIR "app_resources")

# Absolute paths: where the resources are looked up and where the generated sources/headers go.
get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)

# Recursively list every file in the resource directory, as paths relative to that directory,
# and register each one in the RESOURCES_TO_EMBED list (passed to the macros by name).
file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
endforeach()

# Generate the builtin-resource target; "nbl::this_example::builtin" is presumably the C++ namespace of the generated accessors - TODO confirm.
ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")

# Make the example executable use the generated resource target.
LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
endif()
Comment on lines +1 to +24

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd make this example 1x or 2x, using 2x for basic utility/extension tests now

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

number 27 is up for grabs

28 changes: 28 additions & 0 deletions 67_SubAllocatedDescriptorSet/config.json.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"enableParallelBuild": true,
"threadsPerBuildProcess" : 2,
"isExecuted": false,
"scriptPath": "",
"cmake": {
"configurations": [ "Release", "Debug", "RelWithDebInfo" ],
"buildModes": [],
"requiredOptions": []
},
"profiles": [
{
"backend": "vulkan", // should be none
"platform": "windows",
"buildModes": [],
"runConfiguration": "Release", // we also need to run in Debug and RWDI because this is a foundational example
"gpuArchitectures": []
}
],
"dependencies": [],
"data": [
{
"dependencies": [],
"command": [""],
"outputs": []
}
]
}
183 changes: 183 additions & 0 deletions 67_SubAllocatedDescriptorSet/main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O.
// This file is part of the "Nabla Engine".
// For conditions of distribution and use, see copyright notice in nabla.h


#include "nbl/video/surface/CSurfaceVulkan.h"
deprilula28 marked this conversation as resolved.
Show resolved Hide resolved
#include "nbl/video/alloc/SubAllocatedDescriptorSet.h"

#include "../common/BasicMultiQueueApplication.hpp"
#include "../common/MonoAssetManagerAndBuiltinResourceApplication.hpp"

using namespace nbl;
using namespace core;
using namespace system;
using namespace ui;
using namespace asset;
using namespace video;

#include "nbl/builtin/hlsl/bit.hlsl"
deprilula28 marked this conversation as resolved.
Show resolved Hide resolved

// In this application we exercise the SubAllocatedDescriptorSet utility: we allocate descriptors from a binding, free half of them and allocate again
class SubAllocatedDescriptorSetApp final : public examples::MonoDeviceApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication
{
using device_base_t = examples::MonoDeviceApplication;
using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication;

// The pool cache is just a formalized way of round-robining command pools and resetting + reusing them after their most recent submit signals finished.
// Its a little more ergonomic to use if you don't have a 1:1 mapping between frames and pools.
smart_refctd_ptr<nbl::video::ICommandPoolCache> m_poolCache;

smart_refctd_ptr<nbl::video::SubAllocatedDescriptorSet> m_subAllocDescriptorSet;

// This example really lets the advantages of a timeline semaphore shine through!
deprilula28 marked this conversation as resolved.
Show resolved Hide resolved
smart_refctd_ptr<ISemaphore> m_timeline;
uint64_t m_iteration = 0;
constexpr static inline uint64_t MaxIterations = 200;

constexpr static inline uint32_t MaxDescriptorSetAllocationAlignment = 64u*1024u; // if you need larger alignments then you're not right in the head
constexpr static inline uint32_t MinDescriptorSetAllocationSize = 1u;
deprilula28 marked this conversation as resolved.
Show resolved Hide resolved

public:
// Multiple inheritance rules out simply forwarding a base constructor, so the four path
// arguments are handed to `system::IApplicationFramework` explicitly.
SubAllocatedDescriptorSetApp(
	const path& _localInputCWD,
	const path& _localOutputCWD,
	const path& _sharedInputCWD,
	const path& _sharedOutputCWD
) : system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD)
{
}

// We stuff all our one-off work here because it's a "single shot" app.
//
// Steps:
//  1. initialize both base applications,
//  2. create the command pool cache and the timeline semaphore used by `workLoopBody`,
//  3. build a descriptor set layout and a `SubAllocatedDescriptorSet` on top of it,
//  4. exercise the sub-allocator on binding 0: allocate a batch of descriptors, free every other one, allocate a second batch.
//
// Returns false when a base class fails to initialize or one of the objects created here is null.
bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
{
	using nbl::video::IGPUDescriptorSetLayout;
	using binding_flags_t = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS;
	using address_allocator_t = core::PoolAddressAllocator<uint32_t>;

	// Remember to call the base class initialization!
	// The first base gets its own reference: handing `std::move(system)` to both calls would let the
	// first callee move from the pointer and leave the second one with an empty one.
	if (!device_base_t::onAppInitialized(smart_refctd_ptr<ISystem>(system)))
		return false;
	if (!asset_base_t::onAppInitialized(std::move(system)))
		return false;

	// We'll allow subsequent iterations to overlap each other on the GPU, the limiting factor is
	// the number of commandpools we can use simultaneously.
	constexpr auto MaxConcurrency = 64;

	// We don't throw the Command Pools away, we reset and reuse them instead, so they are not created with the transient flag
	m_poolCache = ICommandPoolCache::create(core::smart_refctd_ptr(m_device),getComputeQueue()->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::NONE,MaxConcurrency);
	if (!m_poolCache)
	{
		m_logger->log("Failed to create the command pool cache!", system::ILogger::ELL_ERROR);
		return false;
	}

	// In contrast to fences, we just need one semaphore to rule all dispatches
	m_timeline = m_device->createSemaphore(m_iteration);
	if (!m_timeline)
	{
		m_logger->log("Failed to create the timeline semaphore!", system::ILogger::ELL_ERROR);
		return false;
	}

	// Descriptor set sub allocator
	constexpr uint32_t BindingCount = 12u;
	constexpr uint32_t DescriptorsPerBinding = 16000u;

	video::IGPUDescriptorSetLayout::SBinding bindings[BindingCount];
	for (uint32_t i = 0u; i < BindingCount; i++)
	{
		auto& entry = bindings[i];
		entry.binding = i;
		entry.count = DescriptorsPerBinding;
		entry.createFlags = core::bitflag(binding_flags_t::ECF_UPDATE_AFTER_BIND_BIT)
			| binding_flags_t::ECF_UPDATE_UNUSED_WHILE_PENDING_BIT
			| binding_flags_t::ECF_PARTIALLY_BOUND_BIT;
		// Even bindings are storage images, odd bindings are storage buffers.
		// TODO (from review): try all descriptor types to make the testing complete; enable and test
		// acceleration structures when available (can be done as a separate PR).
		entry.type = (i % 2u == 0u) ? asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE : asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER;
		entry.stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE;
	}

	auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings);
	if (!descriptorSetLayout)
	{
		m_logger->log("Failed to create the descriptor set layout!", system::ILogger::ELL_ERROR);
		return false;
	}

	// TODO: I don't think these are needed for sub allocated descriptor sets (alignment isn't needed, and min size is 1)
	// NOTE(review): this object stays a local and `m_subAllocDescriptorSet` is never assigned - confirm whether the member should own it.
	auto subAllocatedDescriptorSet = core::make_smart_refctd_ptr<nbl::video::SubAllocatedDescriptorSet>(
		descriptorSetLayout.get(), MaxDescriptorSetAllocationAlignment, MinDescriptorSetAllocationSize
	);

	constexpr uint32_t TestedBinding = 0u;
	constexpr size_t BatchSize = 128u;

	// Fills `addresses` from the tested binding, logs every returned address and (in debug builds) asserts none is invalid.
	auto allocateAndReport = [&](std::vector<uint32_t>& addresses) -> void
	{
		subAllocatedDescriptorSet->multi_allocate(TestedBinding, addresses.size(), addresses.data());
		for (uint32_t i = 0; i < addresses.size(); i++)
		{
			m_logger->log("allocation[%d]: %d", system::ILogger::ELL_INFO, i, addresses[i]);
			assert(addresses[i] != address_allocator_t::invalid_address);
		}
	};

	// First batch
	std::vector<uint32_t> allocation(BatchSize, address_allocator_t::invalid_address);
	allocateAndReport(allocation);

	// Give back every other descriptor of the first batch
	{
		std::vector<uint32_t> freed;
		freed.reserve((allocation.size() + 1u) / 2u);
		for (size_t i = 0; i < allocation.size(); i += 2)
			freed.push_back(allocation[i]);
		subAllocatedDescriptorSet->multi_deallocate(TestedBinding, freed.size(), freed.data());
	}
	m_logger->log("freed half the descriptors", system::ILogger::ELL_INFO);

	// Second batch, allocated after half of the first one was freed
	std::vector<uint32_t> allocation2(BatchSize, address_allocator_t::invalid_address);
	allocateAndReport(allocation2);

	return true;
}

// This app has an actual work loop (maybe just for the sake of a future WASM port, so we don't time out
// a Browser Tab with an unresponsive script): keep going until `m_iteration` reaches `MaxIterations`.
bool keepRunning() override
{
	return m_iteration < MaxIterations;
}

// Finally the first actual work-loop
void workLoopBody() override
{
IQueue* const queue = getComputeQueue();

// Obtain our command pool once one gets recycled
uint32_t poolIx;
do
{
poolIx = m_poolCache->acquirePool();
} while (poolIx==ICommandPoolCache::invalid_index);

smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
{
m_poolCache->getPool(poolIx)->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1},core::smart_refctd_ptr(m_logger));
// lets record, its still a one time submit because we have to re-record with different push constants each time
cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);

// COMMAND RECORDING

auto result = cmdbuf->end();
assert(result);
}


const auto savedIterNum = m_iteration++;
{
const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo =
{
.cmdbuf = cmdbuf.get()
};
const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo =
{
.semaphore = m_timeline.get(),
.value = m_iteration,
.stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
};
// Generally speaking we don't need to wait on any semaphore because in this example every dispatch gets its own clean piece of memory to use
// from the point of view of the GPU. Implicit domain operations between Host and Device happen upon a submit and a semaphore/fence signal operation,
// this ensures we can touch the input and get accurate values from the output memory using the CPU before and after respectively, each submit becoming PENDING.
// If we actually cared about this submit seeing the memory accesses of a previous dispatch we could add a semaphore wait
const IQueue::SSubmitInfo submitInfo = {
.waitSemaphores = {},
.commandBuffers = {&cmdbufInfo,1},
.signalSemaphores = {&signalInfo,1}
};

queue->startCapture();
auto statusCode = queue->submit({ &submitInfo,1 });
queue->endCapture();
assert(statusCode == IQueue::RESULT::SUCCESS);
}
}
};

NBL_MAIN_FUNC(SubAllocatedDescriptorSetApp)
50 changes: 50 additions & 0 deletions 67_SubAllocatedDescriptorSet/pipeline.groovy
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import org.DevshGraphicsProgramming.Agent
import org.DevshGraphicsProgramming.BuilderInfo
import org.DevshGraphicsProgramming.IBuilder

/**
 * Builder for this example: `build` runs the CMake build of the example target through the agent,
 * every other stage (`prepare`, `test`, `install`) does nothing and reports success.
 */
class CStreamingAndBufferDeviceAddressBuilder extends IBuilder
{
	CStreamingAndBufferDeviceAddressBuilder(Agent _agent, _info)
	{
		super(_agent, _info)
	}

	/** Nothing to prepare. */
	@Override
	boolean prepare(Map axisMapping)
	{
		true
	}

	/**
	 * Builds the example target for the configuration and build type found in `axisMapping`
	 * (keys "CONFIGURATION" and "BUILD_TYPE"). Always returns true.
	 */
	@Override
	boolean build(Map axisMapping)
	{
		final IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
		final IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")

		final nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
		final nameOfConfig = getNameOfConfig(config)

		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")

		true
	}

	/** No tests are run. */
	@Override
	boolean test(Map axisMapping)
	{
		true
	}

	/** Nothing is installed. */
	@Override
	boolean install(Map axisMapping)
	{
		true
	}
}

// Script-level factory: hands back a builder instance for the given agent and info.
def create(Agent _agent, _info)
{
	new CStreamingAndBufferDeviceAddressBuilder(_agent, _info)
}

return this
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -65,5 +65,6 @@ if(NBL_BUILD_EXAMPLES)
#add_subdirectory(61_UI EXCLUDE_FROM_ALL)
add_subdirectory(62_CAD EXCLUDE_FROM_ALL)
add_subdirectory(62_SchusslerTest EXCLUDE_FROM_ALL)
add_subdirectory(67_SubAllocatedDescriptorSet EXCLUDE_FROM_ALL)
add_subdirectory(0_ImportanceSamplingEnvMaps EXCLUDE_FROM_ALL) #TODO: integrate back into 42
endif()
Loading