Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sub-allocated descriptor sets #95

Draft
wants to merge 13 commits into
base: master
Choose a base branch
from
24 changes: 24 additions & 0 deletions 67_SubAllocatedDescriptorSet/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
include(common RESULT_VARIABLE RES)
if(NOT RES)
message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
endif()

nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")

if(NBL_EMBED_BUILTIN_RESOURCES)
set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
set(RESOURCE_DIR "app_resources")

get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)

file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
endforeach()

ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")

LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
endif()
Comment on lines +1 to +24

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd make this example 1x or 2x, using 2x for basic utility/extension tests now

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

number 27 is up for grabs

28 changes: 28 additions & 0 deletions 67_SubAllocatedDescriptorSet/config.json.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"enableParallelBuild": true,
"threadsPerBuildProcess" : 2,
"isExecuted": false,
"scriptPath": "",
"cmake": {
"configurations": [ "Release", "Debug", "RelWithDebInfo" ],
"buildModes": [],
"requiredOptions": []
},
"profiles": [
{
"backend": "vulkan", // should be none
"platform": "windows",
"buildModes": [],
"runConfiguration": "Release", // we also need to run in Debug nad RWDI because foundational example
"gpuArchitectures": []
}
],
"dependencies": [],
"data": [
{
"dependencies": [],
"command": [""],
"outputs": []
}
]
}
290 changes: 290 additions & 0 deletions 67_SubAllocatedDescriptorSet/main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,290 @@
// Copyright (C) 2018-2023 - DevSH Graphics Programming Sp. z O.O.
// This file is part of the "Nabla Engine".
// For conditions of distribution and use, see copyright notice in nabla.h


#include "nbl/video/alloc/SubAllocatedDescriptorSet.h"

#include "../common/BasicMultiQueueApplication.hpp"
#include "../common/MonoAssetManagerAndBuiltinResourceApplication.hpp"

#include "nbl/builtin/hlsl/random/xoroshiro.hlsl"

using namespace nbl;
using namespace core;
using namespace system;
using namespace ui;
using namespace asset;
using namespace video;

class SubAllocatedDescriptorSetApp final : public examples::MonoDeviceApplication, public examples::MonoAssetManagerAndBuiltinResourceApplication
{
using device_base_t = examples::MonoDeviceApplication;
using asset_base_t = examples::MonoAssetManagerAndBuiltinResourceApplication;

smart_refctd_ptr<nbl::video::ICommandPoolCache> m_poolCache;
smart_refctd_ptr<nbl::video::SubAllocatedDescriptorSet> m_subAllocDescriptorSet;

smart_refctd_ptr<ISemaphore> m_timeline;
uint64_t m_iteration = 0;
constexpr static inline uint64_t MaxIterations = 200;
constexpr static inline uint64_t MaxDescriptors = 512;
constexpr static inline uint64_t MaxAllocPerFrame = 10;
constexpr static uint32_t AllocatedBinding = 0;
smart_refctd_ptr<IGPUImageView> m_descriptorImages[MaxDescriptors];
smart_refctd_ptr<IGPUBuffer> m_descriptorBuffers[MaxDescriptors];
Comment on lines +34 to +35

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

declaring them like this might keep them alive via shared ownership, an it might hide lifetime tracking issues.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

alternative is to keep a container<pair<binding,offset>> of everything you haven't freed (stil occcupant) where the container can be a vector or a set

then you draw a random number of elements from that to multi_deallocate


public:
SubAllocatedDescriptorSetApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
system::IApplicationFramework(_localInputCWD,_localOutputCWD,_sharedInputCWD,_sharedOutputCWD) {}

bool writeDescriptors(uint32_t count, uint32_t* valueIndices, uint32_t* allocationIndex)
{
auto createImageDescriptor = [&](uint32_t width, uint32_t height)
{
auto image = m_device->createImage(nbl::video::IGPUImage::SCreationParams {
{
.type = nbl::video::IGPUImage::E_TYPE::ET_2D,
.samples = nbl::video::IGPUImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT,
.format = nbl::asset::E_FORMAT::EF_R8G8B8A8_UNORM,
.extent = { width, height, 1 },
.mipLevels = 1,
.arrayLayers = 1,
.usage = nbl::video::IGPUImage::E_USAGE_FLAGS::EUF_STORAGE_BIT
| nbl::video::IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT
| nbl::video::IGPUImage::E_USAGE_FLAGS::EUF_TRANSFER_SRC_BIT,
}, {}, nbl::video::IGPUImage::TILING::LINEAR,
});

auto reqs = image->getMemoryReqs();
reqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
m_device->allocate(reqs, image.get());

auto imageView = m_device->createImageView(nbl::video::IGPUImageView::SCreationParams {
.image = image,
.viewType = nbl::video::IGPUImageView::E_TYPE::ET_2D,
.format = nbl::asset::E_FORMAT::EF_R8G8B8A8_UNORM,
// .subresourceRange = { nbl::video::IGPUImage::E_ASPECT_FLAGS::EAF_COLOR_BIT, 0, 1, 0, 1 },
});

return imageView;
};

auto createBufferDescriptor = [&](uint32_t size)
{
nbl::video::IGPUBuffer::SCreationParams params;
{
params.size = size;
params.usage = nbl::video::IGPUBuffer::E_USAGE_FLAGS::EUF_STORAGE_BUFFER_BIT
| nbl::video::IGPUBuffer::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT
| nbl::video::IGPUBuffer::E_USAGE_FLAGS::EUF_TRANSFER_SRC_BIT;
}
auto buffer = m_device->createBuffer(std::move(params));

auto reqs = buffer->getMemoryReqs();
reqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
m_device->allocate(reqs, buffer.get());

return buffer;
};


std::vector<video::IGPUDescriptorSet::SWriteDescriptorSet> descriptorWrites;
descriptorWrites.reserve(count);
std::vector<video::IGPUDescriptorSet::SDescriptorInfo> descriptorInfos;
{
for (uint32_t i = 0; i < count; i++)
{
auto index = valueIndices[i];
m_logger->log("writeDescriptors[%d]: allocation[%d]: %d", system::ILogger::ELL_INFO, i, index, allocationIndex[i]);
if (allocationIndex[i] == core::PoolAddressAllocator<uint32_t>::invalid_address)
return logFail("value at %d wasn't allocated", i);

auto allocationIdx = allocationIndex[i];

video::IGPUDescriptorSet::SDescriptorInfo descriptorInfo;

// Storage image
{
m_descriptorImages[index] = createImageDescriptor(256, 256);
descriptorInfo.desc = core::smart_refctd_ptr<IGPUImageView>(m_descriptorImages[index]);
descriptorInfo.info.image.imageLayout = asset::IImage::LAYOUT::GENERAL;
}
// Storage buffer
//{
// m_descriptorBuffers[index] = createBufferDescriptor(1024);
// descriptorInfo.desc = core::smart_refctd_ptr<IGPUBuffer>(m_descriptorBuffers[index]);
// descriptorInfo.info.buffer.offset = 0u;
// descriptorInfo.info.buffer.size = 1024u;
//}

descriptorInfos.push_back(descriptorInfo);
}
for (uint32_t i = 0; i < count; i++)
{
auto index = valueIndices[i];
auto allocationIdx = allocationIndex[i];

video::IGPUDescriptorSet::SWriteDescriptorSet write;
write.dstSet = m_subAllocDescriptorSet->getDescriptorSet();
write.binding = AllocatedBinding;
write.arrayElement = index;
write.count = 1u;
write.info = &descriptorInfos[i];
descriptorWrites.push_back(write);
}
}

m_device->updateDescriptorSets(descriptorWrites, {});
}

bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
{
using nbl::video::IGPUDescriptorSetLayout;

if (!device_base_t::onAppInitialized(std::move(system)))
return false;
if (!asset_base_t::onAppInitialized(std::move(system)))
return false;


constexpr auto MaxConcurrency = 64;

m_poolCache = ICommandPoolCache::create(core::smart_refctd_ptr(m_device),getComputeQueue()->getFamilyIndex(),IGPUCommandPool::CREATE_FLAGS::NONE,MaxConcurrency);

m_timeline = m_device->createSemaphore(m_iteration);

// Descriptor set sub allocator

video::IGPUDescriptorSetLayout::SBinding bindings[12];
{
for (uint32_t i = 0; i < 12; i++)
{
bindings[i].binding = i;
bindings[i].count = MaxDescriptors;
bindings[i].createFlags = core::bitflag(IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT)
| IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_UPDATE_UNUSED_WHILE_PENDING_BIT
| IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_PARTIALLY_BOUND_BIT;
if (i % 2 == 0) bindings[i].type = asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE;
else if (i % 2 == 1) bindings[i].type = asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER;
Comment on lines +168 to +169

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you might want to try all descriptor types just to make the testing complete

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

enable acceleration structure if available, and even test that if its there

(can be done as separate PR, no clue if the AS refactor works without stupid typos yet)

bindings[i].stageFlags = asset::IShader::E_SHADER_STAGE::ESS_COMPUTE;
}
}

std::span<video::IGPUDescriptorSetLayout::SBinding> bindingsSpan(bindings);

auto descriptorSetLayout = m_device->createDescriptorSetLayout(bindings);

video::IDescriptorPool::SCreateInfo poolParams = {};
{
poolParams.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_STORAGE_IMAGE)] = 512 * 6;
poolParams.maxDescriptorCount[static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_STORAGE_BUFFER)] = 512 * 6;
poolParams.maxSets = 1;
poolParams.flags = core::bitflag(video::IDescriptorPool::E_CREATE_FLAGS::ECF_UPDATE_AFTER_BIND_BIT);
}

auto descriptorPool = m_device->createDescriptorPool(std::move(poolParams));
auto descriptorSet = descriptorPool->createDescriptorSet(core::smart_refctd_ptr(descriptorSetLayout));


// TODO: I don't think these are needed for sub allocated descriptor sets (alignment isn't needed, and min size is 1)
auto subAllocatedDescriptorSet = core::make_smart_refctd_ptr<nbl::video::SubAllocatedDescriptorSet>(core::smart_refctd_ptr(descriptorSet), core::smart_refctd_ptr(m_device));
//std::vector<uint32_t> allocation(MaxDescriptors, core::PoolAddressAllocator<uint32_t>::invalid_address);

//std::vector<uint32_t> indices;
//indices.reserve(MaxDescriptors);
//for (uint32_t i = 0; i < MaxDescriptors; i++)
// indices.push_back(i);

//auto allocNum = subAllocatedDescriptorSet->multi_allocate(AllocatedBinding, allocation.size(), allocation.data());
//assert(allocNum == 0);
m_subAllocDescriptorSet = std::move(subAllocatedDescriptorSet);

//bool response = writeDescriptors(allocation.size(), indices.data(), allocation.data());
//if (!response) return false;

return true;
}

bool keepRunning() override { return m_iteration<MaxIterations; }

void workLoopBody() override
{
IQueue* const queue = getComputeQueue();

// Similar idea to example 05 (streaming buffers)
// We will be allocating and freeing stuff, latched on previous frame's timeline semaphore
auto rng = nbl::hlsl::Xoroshiro64StarStar::construct({ m_iteration ^ 0xdeadbeefu,std::hash<string>()(_NBL_APP_NAME_) });
const auto elementCount = rng() % MaxAllocPerFrame;
m_logger->log("elementCount: %d", system::ILogger::ELL_INFO, elementCount);

std::vector<SubAllocatedDescriptorSet::value_type> values(elementCount, SubAllocatedDescriptorSet::invalid_value);

{
std::chrono::steady_clock::time_point waitTill(std::chrono::years(45));
m_subAllocDescriptorSet->multi_allocate(waitTill, AllocatedBinding, elementCount, values.data());

std::vector<SubAllocatedDescriptorSet::value_type> indices;
indices.reserve(elementCount);
for (uint32_t i = 0; i < elementCount; i++)
indices.push_back(i);

bool response = writeDescriptors(elementCount, indices.data(), values.data());
assert(response);
Comment on lines +228 to +233

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd rather actually create the resources here and rely on the set to keep them alive

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this function does create them

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this function does create them

yes I know, what I mean I'd want you to create them there, because right now you have the m_descriptor.. keeping them alive

}

uint32_t poolIx;
do
{
poolIx = m_poolCache->acquirePool();
} while (poolIx==ICommandPoolCache::invalid_index);

smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
{
m_poolCache->getPool(poolIx)->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY,{&cmdbuf,1},core::smart_refctd_ptr(m_logger));
cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);

// COMMAND RECORDING
// Here we would hipothetically use the descriptors created above
Comment on lines +247 to +248
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

an idea, actually bind the descriptor set, launch an empty compute dispatch of a single workgroup that doesn't do anything

might find some bugs in IGPUCommandBuffer and friends


auto result = cmdbuf->end();
assert(result);
}


const auto savedIterNum = m_iteration++;
{
const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo =
{
.cmdbuf = cmdbuf.get()
};
const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo =
{
.semaphore = m_timeline.get(),
.value = m_iteration,
.stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
};
const IQueue::SSubmitInfo submitInfo = {
.waitSemaphores = {},
.commandBuffers = {&cmdbufInfo,1},
.signalSemaphores = {&signalInfo,1}
};

queue->startCapture();
auto statusCode = queue->submit({ &submitInfo,1 });
queue->endCapture();
assert(statusCode == IQueue::RESULT::SUCCESS);
}

const ISemaphore::SWaitInfo futureWait = {m_timeline.get(),m_iteration};
m_poolCache->releasePool(futureWait,poolIx);
m_subAllocDescriptorSet->multi_deallocate(AllocatedBinding, elementCount, values.data(), futureWait);

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

don't deallocate everything you allocated every frame

}

bool onAppTerminated() override
{
return device_base_t::onAppTerminated();
}
};

NBL_MAIN_FUNC(SubAllocatedDescriptorSetApp)
50 changes: 50 additions & 0 deletions 67_SubAllocatedDescriptorSet/pipeline.groovy
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import org.DevshGraphicsProgramming.Agent
import org.DevshGraphicsProgramming.BuilderInfo
import org.DevshGraphicsProgramming.IBuilder

class CSubAllocatedDescriptorSetBuilder extends IBuilder
{
public CSubAllocatedDescriptorSetBuilder(Agent _agent, _info)
{
super(_agent, _info)
}

@Override
public boolean prepare(Map axisMapping)
{
return true
}

@Override
public boolean build(Map axisMapping)
{
IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")

def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
def nameOfConfig = getNameOfConfig(config)

agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")

return true
}

@Override
public boolean test(Map axisMapping)
{
return true
}

@Override
public boolean install(Map axisMapping)
{
return true
}
}

def create(Agent _agent, _info)
{
return new CSubAllocatedDescriptorSetBuilder(_agent, _info)
}

return this
Loading