Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Example 14 ComputeScan fix for new NBL API and use HLSL #113

Open
wants to merge 27 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
02286fa
Migrate example 14 ComputeScan from old to new api
kpentaris Feb 11, 2024
8fe0d41
Migrate example 14_ComputeScan to new APIs
kpentaris Feb 12, 2024
7d4c25d
Merge branch 'master' of github.com:kpentaris/Nabla-Examples-and-Tests
kpentaris Mar 31, 2024
2020ee0
Merge branch 'master' of github.com:kpentaris/Nabla-Examples-and-Tests
kpentaris Apr 10, 2024
269a724
Merge branch 'master' of github.com:kpentaris/Nabla-Examples-and-Tests
kpentaris Apr 21, 2024
5607405
Fix compute scan example code to compile and work
kpentaris Apr 21, 2024
40ce35e
Revert "Fix compute scan example code to compile and work"
kpentaris Apr 21, 2024
46d2e3f
Fix compute scan example code to compile and work
kpentaris Apr 21, 2024
65e9d94
Merge branch 'Devsh-Graphics-Programming:master' into global_scan
kpentaris Apr 21, 2024
dc1bb97
Merge branch 'Devsh-Graphics-Programming:master' into master
kpentaris Apr 27, 2024
b1ab7ad
Merge master to branch
kpentaris Apr 27, 2024
7fc7bc5
Merge branch 'global_scan' of github.com:kpentaris/Nabla-Examples-and…
kpentaris Apr 27, 2024
1b5c1f0
Change example to wait for filled buffer creation using semaphore
kpentaris May 4, 2024
574d6f5
Merge branch 'Devsh-Graphics-Programming:master' into global_scan
kpentaris May 4, 2024
b40929d
Merge branch 'master' into global_scan
kpentaris May 4, 2024
a218be7
Merge branch 'global_scan' of github.com:kpentaris/Nabla-Examples-and…
kpentaris May 4, 2024
4be3378
Merge branch 'master' into global_scan
kpentaris May 10, 2024
9e2eee2
Merge branch 'master' into global_scan
kpentaris May 19, 2024
1e897e6
Merge branch 'master' into global_scan
kpentaris May 25, 2024
fe6e64f
Merge branch 'master' into global_scan
kpentaris Jun 2, 2024
dbb7e73
Rename ComputeScan to ComputeReduce
kpentaris Jun 9, 2024
7d93bd1
Change example 14 to Global Reduce instead of Global Scan
kpentaris Jun 17, 2024
75cbb6e
Merge branch 'master' into global_scan
kpentaris Jul 6, 2024
26e5133
Fix results fetching for test success assertion
kpentaris Jul 7, 2024
ac76f37
Add example 15 for global compute scan
kpentaris Jul 7, 2024
989ecd0
Merge branch 'master' into global_scan
kpentaris Aug 11, 2024
9d0ac6a
Merge branch 'master' into global_scan
kpentaris Sep 7, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions 14_ComputeReduce/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Pull in the shared `common` CMake module; RES receives the full path of the
# included file, or NOTFOUND when the module could not be located.
include(common RESULT_VARIABLE RES)
# Abort configuration early with a hint about where the module is expected to live.
if(NOT RES)
message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
endif()

# Register this example as an executable project; the first four arguments are left empty.
# NOTE(review): the meaning of each positional argument is defined by
# nbl_create_executable_project, which is not visible here — confirm before changing them.
nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
28 changes: 28 additions & 0 deletions 14_ComputeReduce/config.json.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"enableParallelBuild": true,
"threadsPerBuildProcess" : 2,
"isExecuted": false,
"scriptPath": "",
"cmake": {
"configurations": [ "Release", "Debug", "RelWithDebInfo" ],
"buildModes": [],
"requiredOptions": [ "NBL_BUILD_CEGUI" ]
},
"profiles": [
{
"backend": "vulkan",
"platform": "windows",
"buildModes": [],
"runConfiguration": "Release",
"gpuArchitectures": []
}
],
"dependencies": [],
"data": [
{
"dependencies": [],
"command": [""],
"outputs": []
}
]
}
281 changes: 281 additions & 0 deletions 14_ComputeReduce/main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,281 @@
#include "nbl/application_templates/BasicMultiQueueApplication.hpp"
#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"

#include <chrono>
#include <random>

using namespace nbl;
using namespace core;
using namespace asset;
using namespace system;
using namespace video;

/// Example application that runs the default reducer utility over a sub-range of a randomly
/// filled GPU buffer and compares the first downloaded element against a CPU `std::reduce`
/// of the same sub-range.
///
/// The whole test is carried out inside onAppInitialized(); the work loop is empty and
/// keepRunning() immediately returns false. The outcome is stored in `operationSuccess`
/// and reported from onAppTerminated().
class ComputeScanApp final : public application_templates::BasicMultiQueueApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
{
    using device_base_t = application_templates::BasicMultiQueueApplication;
    using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;

public:
    /// Forwards the four working-directory paths to the application framework base.
    ComputeScanApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
        system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}

    /// Initializes both base applications, uploads random input, runs the GPU reduce,
    /// runs the CPU reference and compares the two.
    ///
    /// @param system System object handed to the base classes.
    /// @return false when initialization, a submission, a semaphore wait or the result
    ///         mapping fails; true otherwise (a result mismatch is reported through
    ///         `operationSuccess`, not through the return value).
    bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
    {
        // The first base gets a copy: moving here would hand a moved-from pointer to the second base.
        if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
            return false;
        if (!asset_base_t::onAppInitialized(std::move(system)))
            return false;

        computeQueue = getComputeQueue();

        // Create (an almost) 128 KiB input buffer: 128 << 10 bytes of uint32_t, minus 23 elements
        constexpr auto in_size = 128u << 10u;
        constexpr auto in_count = in_size / sizeof(uint32_t) - 23u;

        // in_count is a size_t; cast so the argument matches the format specifier.
        m_logger->log("Input element count: %u", ILogger::ELL_PERFORMANCE, static_cast<uint32_t>(in_count));

        inputData = new uint32_t[in_count];
        {
            std::random_device random_device;
            std::mt19937 generator(random_device());
            std::uniform_int_distribution<uint32_t> distribution(0u, ~0u);
            for (auto i = 0u; i < in_count; i++)
                inputData[i] = distribution(generator) % 100000;
        }

        // Sub-range [begin, end) of the input that is actually reduced; both byte offsets
        // must respect the device's minimum SSBO alignment.
        // NOTE(review): a reviewer questioned the purpose of these hard-coded offsets.
        auto minSSBOAlign = m_physicalDevice->getLimits().minSSBOAlignment;
        constexpr auto begin = in_count / 4 + 118;
        assert(((begin * sizeof(uint32_t)) & (minSSBOAlign - 1u)) == 0u);
        constexpr auto end = in_count * 3 / 4 - 78;
        assert(((end * sizeof(uint32_t)) & (minSSBOAlign - 1u)) == 0u);
        constexpr auto elementCount = end - begin;

        // Set Semaphores to control GPU synchronization
        // One semaphore is reused for all steps: 1 = upload done, 2 = reduce done, 3 = download done.
        core::smart_refctd_ptr<ISemaphore> semaphore = m_device->createSemaphore(0);
        IQueue::SSubmitInfo::SSemaphoreInfo semInfo[1] = { {
            .semaphore = semaphore.get(),
            .value = 1,
            .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
        } };

        smart_refctd_ptr<IGPUBuffer> gpuinputDataBuffer;
        {
            IGPUBuffer::SCreationParams inputDataBufferCreationParams = {};
            inputDataBufferCreationParams.size = sizeof(uint32_t) * in_count; // TODO Declare the element data type in the shader?
            // NOTE(review): a reviewer suggested adding the address usage flag if BDA is to be used.
            inputDataBufferCreationParams.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_TRANSFER_SRC_BIT | IGPUBuffer::EUF_TRANSFER_DST_BIT;

            auto temp = m_utils->createFilledDeviceLocalBufferOnDedMem(
                SIntendedSubmitInfo{ .queue = getTransferUpQueue() },
                std::move(inputDataBufferCreationParams),
                inputData,
                { semInfo, 1 }
            );

            const ISemaphore::SWaitInfo semWaitInfo[] = { {
                .semaphore = semaphore.get(),
                .value = 1
            } };
            if (m_device->blockForSemaphores(semWaitInfo) != ISemaphore::WAIT_RESULT::SUCCESS) {
                m_logger->log("Blocking for operation semaphore failed during input data buffer creation", ILogger::ELL_ERROR);
                return false;
            }
            gpuinputDataBuffer = *temp.get();
        }
        SBufferRange<IGPUBuffer> in_gpu_range = { begin * sizeof(uint32_t), elementCount * sizeof(uint32_t), gpuinputDataBuffer };

        auto reducer = m_utils->getDefaultReducer();

        // NOTE(review): a reviewer noted this should be a constructor of the parameters rather
        // than a call on the reducer object, since it only performs basic arithmetic.
        CArithmeticOps::DefaultPushConstants reduce_push_constants;
        CArithmeticOps::DispatchInfo reduce_dispatch_info;
        reducer->buildParameters(elementCount, reduce_push_constants, reduce_dispatch_info);

        IGPUBuffer::SCreationParams params = { reduce_push_constants.scanParams.getScratchSize(), bitflag(IGPUBuffer::EUF_STORAGE_BUFFER_BIT) | IGPUBuffer::EUF_TRANSFER_DST_BIT };
        // Remember the size now: `params` is moved into createBuffer() on the next line.
        const auto scratchSize = params.size;
        SBufferRange<IGPUBuffer> scratch_gpu_range = { 0u, scratchSize, m_device->createBuffer(std::move(params)) };
        {
            auto memReqs = scratch_gpu_range.buffer->getMemoryReqs();
            memReqs.memoryTypeBits &= m_physicalDevice->getDeviceLocalMemoryTypeBits();
            auto scratchMem = m_device->allocate(memReqs, scratch_gpu_range.buffer.get());
        }

        auto reduce_pipeline = reducer->getDefaultPipeline(CArithmeticOps::EDT_UINT, CArithmeticOps::EO_ADD, scratchSize); // TODO: Update to test all operations
        auto dsLayout = reducer->getDefaultDescriptorSetLayout();
        auto dsPool = m_device->createDescriptorPoolForDSLayouts(IDescriptorPool::ECF_NONE, { &dsLayout, 1 });
        auto ds = dsPool->createDescriptorSet(core::smart_refctd_ptr<IGPUDescriptorSetLayout>(dsLayout));
        reducer->updateDescriptorSet(m_device.get(), ds.get(), in_gpu_range, scratch_gpu_range);

        {
            smart_refctd_ptr<nbl::video::IGPUCommandPool> cmdpool = m_device->createCommandPool(computeQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
            if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &cmdbuf,1 }))
            {
                logFail("Failed to create Command Buffers!\n");
                return false;
            }
        }

        // Record: clear the scratch buffer, then dispatch the reduce.
        cmdbuf->begin(IGPUCommandBuffer::USAGE::SIMULTANEOUS_USE_BIT); // (REVIEW): not sure about this
        cmdbuf->fillBuffer(scratch_gpu_range, 0u);
        cmdbuf->bindComputePipeline(reduce_pipeline);
        auto pipeline_layout = reduce_pipeline->getLayout();
        cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, pipeline_layout, 0u, 1u, &ds.get());
        reducer->dispatchHelper(cmdbuf.get(), pipeline_layout, reduce_push_constants, reduce_dispatch_info, 0u, nullptr, 0u, nullptr);
        cmdbuf->end();

        {
            semInfo[0].value = 2;
            semInfo[0].stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT;
            const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { {
                .cmdbuf = cmdbuf.get()
            } };

            const IQueue::SSubmitInfo infos[1] = { {
                .commandBuffers = commandBuffers,
                .signalSemaphores = semInfo
            } };

            computeQueue->startCapture();
            const bool submitted = computeQueue->submit(infos) == IQueue::RESULT::SUCCESS;
            computeQueue->endCapture();
            if (!submitted) {
                // Bail out: nothing will ever signal value 2, so waiting for it below would hang.
                m_logger->log("Submission failure", system::ILogger::ELL_ERROR);
                return false;
            }
        }

        // TODO: Update to support all operations
        // cpu counterpart
        auto cpu_begin = inputData + begin;
        m_logger->log("CPU reduce begin", system::ILogger::ELL_PERFORMANCE);

        auto start = std::chrono::high_resolution_clock::now();
        auto result = std::reduce(cpu_begin, inputData + end, 0u);
        auto stop = std::chrono::high_resolution_clock::now();

        // count() is a 64-bit integer; print it with a matching specifier.
        m_logger->log("CPU reduce end. Time taken: %lld us", system::ILogger::ELL_PERFORMANCE, static_cast<long long>(std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count()));

        // wait for the gpu impl to complete
        const ISemaphore::SWaitInfo cmdbufDonePending[] = {{
            .semaphore = semaphore.get(),
            .value = 2
        }};
        if (m_device->blockForSemaphores(cmdbufDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) {
            m_logger->log("Blocking for operation semaphore failed", ILogger::ELL_ERROR);
            return false;
        }

        {
            // NOTE(review): a reviewer asked to use the downstreaming buffer with a consumption
            // callback instead (see example 05 for a piecewise download), because a large
            // HOST_VISIBLE and DEVICE_LOCAL buffer is a problem when ReBAR is not configured.
            IGPUBuffer::SCreationParams downloadParams = {};
            downloadParams.size = in_gpu_range.size;
            downloadParams.usage = IGPUBuffer::EUF_TRANSFER_DST_BIT;
            // (REVIEW): Check if this new download_buffer is needed or if we can directly read from the gpu_input buffer
            auto downloaded_buffer = m_device->createBuffer(std::move(downloadParams));
            auto memReqs = downloaded_buffer->getMemoryReqs();
            memReqs.memoryTypeBits &= m_physicalDevice->getDownStreamingMemoryTypeBits();
            auto queriesMem = m_device->allocate(memReqs, downloaded_buffer.get());

            {
                // (REVIEW): Maybe we can just reset the cmdbuf we already have?
                // Named distinctly so it does not shadow the `cmdbuf` member.
                core::smart_refctd_ptr<IGPUCommandBuffer> downloadCmdbuf;
                {
                    auto cmdPool = m_device->createCommandPool(computeQueue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::NONE);
                    if (!cmdPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &downloadCmdbuf , 1}, core::smart_refctd_ptr(m_logger))) {
                        m_logger->log("Failed to create the download Command Buffer", ILogger::ELL_ERROR);
                        return false;
                    }
                }
                downloadCmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); // TODO: Reset Frame's CommandPool
                IGPUCommandBuffer::SBufferCopy region;
                region.srcOffset = in_gpu_range.offset;
                region.dstOffset = 0u;
                region.size = in_gpu_range.size;
                downloadCmdbuf->copyBuffer(in_gpu_range.buffer.get(), downloaded_buffer.get(), 1u, &region);
                downloadCmdbuf->end();

                {
                    const IQueue::SSubmitInfo::SCommandBufferInfo commandBuffers[1] = { {
                        .cmdbuf = downloadCmdbuf.get()
                    } };

                    semInfo[0].value = 3;
                    const IQueue::SSubmitInfo infos[1] = { {
                        .commandBuffers = commandBuffers,
                        .signalSemaphores = semInfo
                    } };

                    if (computeQueue->submit(infos) != IQueue::RESULT::SUCCESS) {
                        // Same reasoning as above: value 3 would never be signalled.
                        m_logger->log("Download submission failure", system::ILogger::ELL_ERROR);
                        return false;
                    }

                    const ISemaphore::SWaitInfo downloadDonePending[] = { {
                        .semaphore = semaphore.get(),
                        .value = 3
                    } };
                    if (m_device->blockForSemaphores(downloadDonePending) != ISemaphore::WAIT_RESULT::SUCCESS) {
                        m_logger->log("Blocking for download semaphore failed", ILogger::ELL_ERROR);
                        return false;
                    }
                }
            }

            // NOTE(review): if the chosen memory type is not host-coherent the mapped range
            // presumably has to be invalidated before reading — confirm.
            auto mem = const_cast<video::IDeviceMemoryAllocation*>(downloaded_buffer->getBoundMemory().memory);
            mem->map({ .offset = 0u, .length = in_gpu_range.size }, video::IDeviceMemoryAllocation::EMCAF_READ);

            auto gpu_begin = reinterpret_cast<uint32_t*>(mem->getMappedPointer());
            if (!gpu_begin) {
                m_logger->log("Mapping the downloaded buffer failed", ILogger::ELL_ERROR);
                return false;
            }

            // Only report success when the first downloaded element matches the CPU sum.
            operationSuccess = gpu_begin[0] == result;
            if (operationSuccess)
                m_logger->log("Result Comparison Test Passed", system::ILogger::ELL_PERFORMANCE);
            else
            {
                m_logger->log("Result Comparison Test Failed: GPU %u != CPU %u", system::ILogger::ELL_ERROR, gpu_begin[0], result);
                _NBL_DEBUG_BREAK_IF(true);
            }
        }

        // onAppTerminated() deletes inputData as well; null it here to avoid a double free.
        delete[] inputData;
        inputData = nullptr;

        return true;
    }

    //virtual video::SPhysicalDeviceFeatures getRequiredDeviceFeatures() const override
    //{
    //	video::SPhysicalDeviceFeatures retval = {};

    //	retval.bufferDeviceAddress = true;
    //	retval.subgroupBroadcastDynamicId = true;
    //	retval.shaderSubgroupExtendedTypes = true;
    //	// TODO: actually need to implement this and set it on the pipelines
    //	retval.computeFullSubgroups = true;
    //	retval.subgroupSizeControl = true;

    //	return retval;
    //}

    /// Logs the test outcome and releases the input data if an early return left it allocated.
    virtual bool onAppTerminated() override
    {
        m_logger->log("==========Result==========", ILogger::ELL_INFO);
        m_logger->log("Operation Success: %s", ILogger::ELL_INFO, operationSuccess ?"true":"false");
        delete[] inputData;
        inputData = nullptr;
        return true;
    }

    // the unit test is carried out on init
    void workLoopBody() override {}

    bool keepRunning() override { return false; }

private:
    /// Logs a pass or fail line tagged with the given workgroup size.
    void logTestOutcome(bool passed, uint32_t workgroupSize)
    {
        if (passed)
            m_logger->log("Passed test #%u", ILogger::ELL_INFO, workgroupSize);
        else
        {
            m_logger->log("Failed test #%u", ILogger::ELL_ERROR, workgroupSize);
        }
    }

    IQueue* computeQueue;
    // Owned with new[]/delete[]; always reset to nullptr after deletion.
    uint32_t* inputData = nullptr;
    smart_refctd_ptr<IGPUDescriptorSet> descriptorSet;
    smart_refctd_ptr<IGPUPipelineLayout> pipelineLayout;
    smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
    smart_refctd_ptr<ICPUBuffer> resultsBuffer;

    bool operationSuccess = false;
};

NBL_MAIN_FUNC(ComputeScanApp)
50 changes: 50 additions & 0 deletions 14_ComputeReduce/pipeline.groovy
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import org.DevshGraphicsProgramming.Agent
import org.DevshGraphicsProgramming.BuilderInfo
import org.DevshGraphicsProgramming.IBuilder

/**
 * Builder for the ComputeReduce example.
 *
 * Only build() performs work (it runs `cmake --build` for the target through the agent);
 * prepare(), test() and install() are no-ops that report success.
 */
class CComputeReduceBuilder extends IBuilder
{
    public CComputeReduceBuilder(Agent _agent, _info)
    {
        super(_agent, _info)
    }

    /** Nothing to prepare for this example. */
    @Override
    public boolean prepare(Map axisMapping)
    {
        true
    }

    /**
     * Builds the example's target for the configuration and build type found in the axis mapping.
     *
     * @param axisMapping map read for the "CONFIGURATION" and "BUILD_TYPE" keys
     * @return always true
     */
    @Override
    public boolean build(Map axisMapping)
    {
        IBuilder.CONFIGURATION requestedConfig = axisMapping.get("CONFIGURATION")
        IBuilder.BUILD_TYPE requestedBuildType = axisMapping.get("BUILD_TYPE")

        def nameOfBuildDirectory = getNameOfBuildDirectory(requestedBuildType)
        def nameOfConfig = getNameOfConfig(requestedConfig)

        def buildCommand = "cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v"
        agent.execute(buildCommand)

        true
    }

    /** No tests are run for this example. */
    @Override
    public boolean test(Map axisMapping)
    {
        true
    }

    /** Nothing is installed for this example. */
    @Override
    public boolean install(Map axisMapping)
    {
        true
    }
}

// Factory entry point: wraps the given agent and build info in a CComputeReduceBuilder.
def create(Agent _agent, _info)
{
    new CComputeReduceBuilder(_agent, _info)
}

return this
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ if(NBL_BUILD_EXAMPLES)
add_subdirectory(07_StagingAndMultipleQueues EXCLUDE_FROM_ALL)
# showcase the set-up of a swapchain and picking of a matching device
add_subdirectory(08_HelloSwapchain EXCLUDE_FROM_ALL)
# global reduce (the example directory is 14_ComputeReduce, not 14_ComputeScan)
add_subdirectory(14_ComputeReduce EXCLUDE_FROM_ALL)
# showcase the use of a depth buffer and rudimentary camera
add_subdirectory(09_DepthBufferAndCamera EXCLUDE_FROM_ALL)
# demonstrate the counting sort utility
Expand Down