From 5b6b09e2df8faf88d10dbe790bc1b0f9aea2826e Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Mon, 28 Oct 2024 15:44:59 +0100
Subject: [PATCH 01/14] sketch out pipeline caching

---
 include/nbl/video/utilities/CComputeBlit.h | 37 +++++-----
 src/nbl/video/utilities/CComputeBlit.cpp   | 86 +++++++++++++++++++---
 2 files changed, 95 insertions(+), 28 deletions(-)
diff --git a/include/nbl/video/utilities/CComputeBlit.h b/include/nbl/video/utilities/CComputeBlit.h
index dc4c6f3c5..779dc2ddc 100644
--- a/include/nbl/video/utilities/CComputeBlit.h
+++ b/include/nbl/video/utilities/CComputeBlit.h
@@ -41,7 +41,21 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted
 		}
 
 		// ctor
-		inline CComputeBlit(core::smart_refctd_ptr<ILogicalDevice>&& logicalDevice) : m_device(std::move(logicalDevice)) {}
+		CComputeBlit(
+			core::smart_refctd_ptr<ILogicalDevice>&& logicalDevice,
+			core::smart_refctd_ptr<asset::IShaderCompiler::CCache>&& cache=nullptr,
+			core::smart_refctd_ptr<system::ILogger>&& logger=nullptr
+		);
+
+		// if you set the values too small, we'll correct them ourselves anyway
+		struct STask
+		{
+			uint32_t workgroupSizeLog2 : 4 = 0; // base-2 logarithm of the workgroup size, pipelines are generated with `1<<workgroupSizeLog2` invocations
+			// the TRUE output format, not the storage view format you might manually encode into
+			hlsl::format::TexelBlockFormat outputFormat : 8 = hlsl::format::TexelBlockFormat::TBF_UNKNOWN;
+			uint32_t sharedMemoryPerInvocation : 6 = 0; // in bytes, raised to at least twice the storage of a single pixel when pipelines get created
+			uint32_t unused : 14 = 0; // padding, the bitfield widths add up to 32 bits
+		};
 		
 		//! Returns the original format if supports STORAGE_IMAGE otherwise returns a format in its compat class which supports STORAGE_IMAGE.
 		inline asset::E_FORMAT getOutputViewFormat(const asset::E_FORMAT format)
@@ -66,22 +80,7 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted
 					return compatFormat;
 			}
 		}
-/*
-		struct STask
-		{
-			hlsl::vector<uint8_t,3> preloadWindow; 
-			asset::E_FORMAT inFormat;
-			asset::E_FORMAT outFormat;
-			// default no coverage adjustment
-			uint8_t alphaBinCountLog2 : 4 = 0;
-		};
-		inline void initializeTaskDefault(STask& task) const
-		{
-			auto physDev = m_device->getPhysicalDevice();
-			const auto formatTrait = hlsl::format::getTraits(static_cast<hlsl::format::TexelBlockFormat>(task.outFormat));
-			task.alphaBinCountLog2 = hlsl::max(,task.alphaBinCountLog2);
-		}
-*/
+
 #if 0
 		// @param `alphaBinCount` is only required to size the histogram present in the default nbl_glsl_blit_AlphaStatistics_t in default_compute_common.comp
 		core::smart_refctd_ptr<video::IGPUShader> createAlphaTestSpecializedShader(const asset::IImage::E_TYPE inImageType, const uint32_t alphaBinCount = asset::IBlitUtilities::DefaultAlphaBinCount);
@@ -666,7 +665,11 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted
 			EBT_COUNT
 		};
 
+		void createAndCachePipelines(CAssetConverter* converter, core::smart_refctd_ptr<IGPUComputePipeline>* pipelines, const std::span<const STask> tasks);
+
 		core::smart_refctd_ptr<ILogicalDevice> m_device;
+		system::logger_opt_smart_ptr m_logger;
+		core::smart_refctd_ptr<asset::IShaderCompiler::CCache> m_shaderCache;
 
 		//! This calculates the inclusive upper bound on the preload region i.e. it will be reachable for some cases. For the rest it will be bigger
 		//! by a pixel in each dimension.
diff --git a/src/nbl/video/utilities/CComputeBlit.cpp b/src/nbl/video/utilities/CComputeBlit.cpp
index 012f8a230..5856f42fc 100644
--- a/src/nbl/video/utilities/CComputeBlit.cpp
+++ b/src/nbl/video/utilities/CComputeBlit.cpp
@@ -5,6 +5,79 @@ using namespace nbl::system;
 using namespace nbl::asset;
 using namespace nbl::video;
 
+
+CComputeBlit::CComputeBlit(smart_refctd_ptr<ILogicalDevice>&& logicalDevice, smart_refctd_ptr<IShaderCompiler::CCache>&& cache, smart_refctd_ptr<ILogger>&& logger) : m_device(std::move(logicalDevice)), m_logger(nullptr)
+{
+	if (logger) // an explicitly provided logger always takes precedence
+		m_logger = std::move(logger);
+	else if (auto debugCb=m_device->getPhysicalDevice()->getDebugCallback(); debugCb && debugCb->getLogger()) // null-check the callback before asking it for a fallback logger
+		m_logger = smart_refctd_ptr<system::ILogger>(debugCb->getLogger());
+	
+	if (cache) // reuse the caller's shader cache, otherwise start from an empty one
+		m_shaderCache = std::move(cache);
+	else
+		m_shaderCache = make_smart_refctd_ptr<IShaderCompiler::CCache>();
+}
+
+void CComputeBlit::createAndCachePipelines(CAssetConverter* converter, smart_refctd_ptr<IGPUComputePipeline>* pipelines, const std::span<const STask> tasks)
+{
+	core::vector<smart_refctd_ptr<ICPUComputePipeline>> cpuPplns;
+	cpuPplns.reserve(tasks.size()*2); // two pipelines (blit + optional coverage normalization) get emplaced per task
+
+	const auto& limits = m_device->getPhysicalDevice()->getLimits();
+	for (auto task : tasks)
+	{
+		// adjust task default values
+		{
+			if ((0x1u<<task.workgroupSizeLog2)<limits.maxSubgroupSize) // compare actual sizes, the task only stores the exponent
+				task.workgroupSizeLog2 = static_cast<uint32_t>(std::log2(core::roundDownToPoT(limits.maxComputeWorkGroupInvocations))); // store the exponent (exact for a PoT), a raw size would get truncated by the 4-bit field
+			bool useFloat16 = false;
+			uint16_t channels = 4;
+			using namespace hlsl::format;
+			if (task.outputFormat!=TexelBlockFormat::TBF_UNKNOWN)
+			{
+				channels = getTraits(task.outputFormat).Channels;
+				const auto precisionAt1 = getFormatPrecision(static_cast<E_FORMAT>(task.outputFormat),3,1.f);
+				const auto precisionAt0 = getFormatPrecision(static_cast<E_FORMAT>(task.outputFormat),3,0.f);
+				if (limits.workgroupMemoryExplicitLayout16BitAccess && limits.shaderFloat16 && precisionAt1>=std::exp2f(-11.f) && precisionAt0>=std::numeric_limits<hlsl::float16_t>::min())
+					useFloat16 = true;
+			}
+			// the absolute minimum needed to store a single pixel
+			const auto singlePixelStorage = channels*(useFloat16 ? sizeof(hlsl::float16_t):sizeof(hlsl::float32_t));
+			// also slightly more memory is needed
+			task.sharedMemoryPerInvocation = core::max(singlePixelStorage*2,task.sharedMemoryPerInvocation);
+		}
+		// create blit pipeline
+		cpuPplns.emplace_back(nullptr);
+		// create optional coverage normalization pipeline
+		cpuPplns.emplace_back(nullptr);
+	}
+
+	CAssetConverter::SInputs inputs = {};
+	inputs.readCache = converter;
+	inputs.logger = m_logger.getRaw();
+	std::get<CAssetConverter::SInputs::asset_span_t<ICPUComputePipeline>>(inputs.assets) = {&cpuPplns.data()->get(),cpuPplns.size()};
+	inputs.readShaderCache = m_shaderCache.get();
+	inputs.writeShaderCache = m_shaderCache.get();
+	// no pipeline cache, because we only make the same pipeline once, ever
+	auto reserveResults = converter->reserve(inputs);
+	assert(reserveResults.getRequiredQueueFlags().value==IQueue::FAMILY_FLAGS::NONE);
+	// copy over the results
+	{
+		auto rIt = reserveResults.getGPUObjects<ICPUComputePipeline>().data();
+		// TODO: redo
+		for (size_t i=0; i<tasks.size(); i++)
+			*(pipelines++) =  (rIt++)->value;
+	}
+
+	// this just inserts the pipelines into the cache
+	{
+		CAssetConverter::SConvertParams params = {};
+		auto convertResults = reserveResults.convert(params);
+		assert(!convertResults.blocking());
+	}
+}
+
 #if 0
 core::smart_refctd_ptr<video::IGPUShader> CComputeBlit::createAlphaTestSpecializedShader(const asset::IImage::E_TYPE imageType, const uint32_t alphaBinCount)
 {
@@ -39,21 +112,14 @@ core::smart_refctd_ptr<video::IGPUShader> CComputeBlit::createAlphaTestSpecializ
 		   "}\n";
 
 	auto cpuShader = core::make_smart_refctd_ptr<asset::ICPUShader>(shaderSourceStream.str().c_str(), IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, IGPUShader::E_CONTENT_TYPE::ECT_HLSL, "CComputeBlitGLSLGLSL::createAlphaTestSpecializedShader");
-
-	return  m_device->createShader(std::move(cpuShader.get()));
 }
 
-core::smart_refctd_ptr<video::IGPUShader> CComputeBlit::createNormalizationSpecializedShader(const asset::IImage::E_TYPE imageType, const asset::E_FORMAT outFormat,
-	const uint32_t alphaBinCount)
+core::smart_refctd_ptr<video::IGPUShader> CComputeBlit::createNormalizationSpecializedShader(const asset::IImage::E_TYPE imageType, const uint32_t alphaBinCount)
 {
 	const auto workgroupDims = getDefaultWorkgroupDims(imageType);
 	const auto paddedAlphaBinCount = getPaddedAlphaBinCount(workgroupDims, alphaBinCount);
 	const uint32_t blitDimCount = static_cast<uint32_t>(imageType) + 1;
 
-	const auto castedFormat = getOutImageViewFormat(outFormat);
-	assert(outFormat == castedFormat);
-	const char* formatQualifier = asset::CHLSLCompiler::getStorageImageFormatQualifier(castedFormat);
-
 	std::ostringstream shaderSourceStream;
 
 	shaderSourceStream
@@ -67,7 +133,7 @@ core::smart_refctd_ptr<video::IGPUShader> CComputeBlit::createNormalizationSpeci
 		   "[[vk::binding(0, 0)]]\n"
 		   "nbl::hlsl::blit::impl::dim_to_image_properties<ceval_params_t::BlitDimCount>::combined_sampler_t inCS;\n"
 
-		   "[[vk::image_format(\"" << formatQualifier << "\")]]\n"
+		   "[[vk::image_format(\"unknown\")]]\n"
 		   "[[vk::binding(1, 0)]]\n"
 		   "nbl::hlsl::blit::impl::dim_to_image_properties<ceval_params_t::BlitDimCount>::image_t outImg;\n"
 
@@ -90,7 +156,5 @@ core::smart_refctd_ptr<video::IGPUShader> CComputeBlit::createNormalizationSpeci
 		   "}\n";
 
 	auto cpuShader = core::make_smart_refctd_ptr<asset::ICPUShader>(shaderSourceStream.str().c_str(), IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, IGPUShader::E_CONTENT_TYPE::ECT_HLSL, "CComputeBlitGLSL::createNormalizationSpecializedShader");
-
-	return m_device->createShader(std::move(cpuShader.get()));
 }
 #endif
\ No newline at end of file

From 492a0ad804b65e656c8a7f29fbd5b16d7d58dc92 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Tue, 29 Oct 2024 15:05:55 +0100
Subject: [PATCH 02/14] ladies and gentlemen we have C++20 concepts in HLSL !

---
 include/nbl/builtin/hlsl/concepts.hlsl | 42 ++++++++++++++++++++++++--
 1 file changed, 39 insertions(+), 3 deletions(-)

diff --git a/include/nbl/builtin/hlsl/concepts.hlsl b/include/nbl/builtin/hlsl/concepts.hlsl
index b252b3437..91dc76970 100644
--- a/include/nbl/builtin/hlsl/concepts.hlsl
+++ b/include/nbl/builtin/hlsl/concepts.hlsl
@@ -1,23 +1,40 @@
-// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O.
+// Copyright (C) 2023-2024 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
-
 #ifndef _NBL_BUILTIN_HLSL_CONCEPTS_INCLUDED_
 #define _NBL_BUILTIN_HLSL_CONCEPTS_INCLUDED_
 
+
 #include <nbl/builtin/hlsl/cpp_compat/vector.hlsl>
 #include <nbl/builtin/hlsl/cpp_compat/matrix.hlsl>
 #include <nbl/builtin/hlsl/type_traits.hlsl>
 
 
-#if (__cplusplus >= 202002L && __cpp_concepts)
+#ifndef __HLSL_VERSION
 
+// TODO: old stuff, see how much we can remove
 #define NBL_CONCEPT_TYPE_PARAMS(...) template <__VA_ARGS__>
 #define NBL_CONCEPT_SIGNATURE(NAME, ...) concept NAME = requires(__VA_ARGS__)
 #define NBL_CONCEPT_BODY(...) { __VA_ARGS__ };
 #define NBL_CONCEPT_ASSIGN(NAME, ...) concept NAME = __VA_ARGS__;
 #define NBL_REQUIRES(...) requires __VA_ARGS__ 
 
+// for struct definitions, use instead of closing `>` on the primary template parameter list
+#define NBL_PRIMARY_REQUIRES(...) > requires (__VA_ARGS__)
+
+// to put right before the closing `>` of the primary template definition, otherwise `NBL_PARTIAL_REQ_TOP`/`NBL_PARTIAL_REQ_BOT` won't work on specializations
+#define NBL_STRUCT_CONSTRAINABLE
+// NOTE: C++20 requires and C++11 enable_if have to be in different places! IT'S OF UTMOST IMPORTANCE THAT YOUR REQUIRE CLAUSES ARE IDENTICAL FOR BOTH MACROS
+// put just after the closing `>` on the partial template specialization `template` declaration e.g. `template<typename U, typename V, typename T> NBL_PARTIAL_REQ_TOP(SomeCond<U>)`
+#define NBL_PARTIAL_REQ_TOP(...) requires (__VA_ARGS__)
+// put just before closing `>` on the partial template specialization Type args, e.g. `MyStruct<U,V,T NBL_PARTIAL_REQ_BOT(SomeCond<U>)>`
+#define NBL_PARTIAL_REQ_BOT(...)
+
+// condition
+#define NBL_FUNC_REQUIRES_BEGIN(...) requires (__VA_ARGS__)
+// return value
+#define NBL_FUNC_REQUIRES_END(...) __VA_ARGS__
+
 #include <concepts>
 
 namespace nbl
@@ -77,12 +94,31 @@ concept matricial = is_matrix<T>::value;
 
 #else
 
+// TODO: old stuff, see how much we can remove
 // No C++20 support. Do nothing.
 #define NBL_CONCEPT_TYPE_PARAMS(...)
 #define NBL_CONCEPT_SIGNATURE(NAME, ...) 
 #define NBL_CONCEPT_BODY(...)
 #define NBL_REQUIRES(...)
 
+
+// for struct definitions, use instead of closing `>` on the primary template parameter list
+#define NBL_PRIMARY_REQUIRES(...) ,typename __requires=::nbl::hlsl::enable_if_t<(__VA_ARGS__),void> > 
+
+// to put right before the closing `>` of the primary template definition, otherwise `NBL_PARTIAL_REQ_TOP`/`NBL_PARTIAL_REQ_BOT` won't work on specializations
+#define NBL_STRUCT_CONSTRAINABLE ,typename __requires=void
+// NOTE: C++20 requires and C++11 enable_if have to be in different places! IT'S OF UTMOST IMPORTANCE THAT YOUR REQUIRE CLAUSES ARE IDENTICAL FOR BOTH MACROS
+// put just after the closing `>` on the partial template specialization `template` declaration e.g. `template<typename U, typename V, typename T> NBL_PARTIAL_REQ_TOP(SomeCond<U>)`
+#define NBL_PARTIAL_REQ_TOP(...)
+// put just before closing `>` on the partial template specialization Type args, e.g. `MyStruct<U,V,T NBL_PARTIAL_REQ_BOT(SomeCond<U>)>` (HLSL has no `std::`, so this uses the nbl `enable_if_t` like the other macros in this branch)
+#define NBL_PARTIAL_REQ_BOT(...) ,::nbl::hlsl::enable_if_t<(__VA_ARGS__),void> 
+
+// condition, use right after the closing `>` of a function template
+#define NBL_FUNC_REQUIRES_BEGIN(...) ::nbl::hlsl::enable_if_t<(__VA_ARGS__),
+// return value, use `END(T)` instead of the return value type declaration
+#define NBL_FUNC_REQUIRES_END(...) __VA_ARGS__> 
+
 #endif
 
+
 #endif
\ No newline at end of file

From fb2f7c6cc9e380c761a8d52acd8099a37ef97f21 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Tue, 29 Oct 2024 15:30:31 +0100
Subject: [PATCH 03/14] forgot to amend the commit

---
 include/nbl/builtin/hlsl/concepts.hlsl | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/include/nbl/builtin/hlsl/concepts.hlsl b/include/nbl/builtin/hlsl/concepts.hlsl
index 91dc76970..033709e72 100644
--- a/include/nbl/builtin/hlsl/concepts.hlsl
+++ b/include/nbl/builtin/hlsl/concepts.hlsl
@@ -19,6 +19,10 @@
 #define NBL_CONCEPT_ASSIGN(NAME, ...) concept NAME = __VA_ARGS__;
 #define NBL_REQUIRES(...) requires __VA_ARGS__ 
 
+
+// to define a concept using `concept Name = SomeConstexprBoolCondition<T>;`
+#define NBL_BOOL_CONCEPT concept
+
 // for struct definitions, use instead of closing `>` on the primary template parameter list
 #define NBL_PRIMARY_REQUIRES(...) > requires (__VA_ARGS__)
 
@@ -102,6 +106,9 @@ concept matricial = is_matrix<T>::value;
 #define NBL_REQUIRES(...)
 
 
+// to define a concept using `concept Name = SomeConstexprBoolCondition<T>;`
+#define NBL_BOOL_CONCEPT NBL_CONSTEXPR_STATIC_INLINE bool
+
 // for struct definitions, use instead of closing `>` on the primary template parameter list
 #define NBL_PRIMARY_REQUIRES(...) ,typename __requires=::nbl::hlsl::enable_if_t<(__VA_ARGS__),void> > 
 

From 86e9a664a21c575eafa8406aba6e5d74c06a7585 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Tue, 29 Oct 2024 15:34:25 +0100
Subject: [PATCH 04/14] start reworking the descriptor declarations

---
 include/nbl/builtin/hlsl/binding_info.hlsl    |  24 ++++
 include/nbl/builtin/hlsl/blit/common.hlsl     | 107 +++++++--------
 .../builtin/hlsl/blit/default_blit.comp.hlsl  |  57 ++++++++
 .../hlsl/blit/default_normalize.comp.hlsl     |  18 +++
 include/nbl/builtin/hlsl/blit/temp.hlsl       |  40 ------
 include/nbl/video/utilities/CComputeBlit.h    |  80 -----------
 src/nbl/builtin/CMakeLists.txt                |   5 +-
 src/nbl/video/utilities/CComputeBlit.cpp      | 129 +++++++++++++++++-
 8 files changed, 279 insertions(+), 181 deletions(-)
 create mode 100644 include/nbl/builtin/hlsl/binding_info.hlsl
 create mode 100644 include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl
 create mode 100644 include/nbl/builtin/hlsl/blit/default_normalize.comp.hlsl
 delete mode 100644 include/nbl/builtin/hlsl/blit/temp.hlsl

diff --git a/include/nbl/builtin/hlsl/binding_info.hlsl b/include/nbl/builtin/hlsl/binding_info.hlsl
new file mode 100644
index 000000000..8702a32c3
--- /dev/null
+++ b/include/nbl/builtin/hlsl/binding_info.hlsl
@@ -0,0 +1,24 @@
+// Copyright (C) 2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_BUILTIN_HLSL_BINDING_INFO_INCLUDED_
+#define _NBL_BUILTIN_HLSL_BINDING_INFO_INCLUDED_
+
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
+
+template<uint32_t set, uint32_t ix, uint32_t count=1>
+struct ConstevalBindingInfo // compile-time description of where a shader resource is bound
+{
+	NBL_CONSTEXPR_STATIC_INLINE uint32_t Set = set; // descriptor set number, used as the second argument of `[[vk::binding]]`
+	NBL_CONSTEXPR_STATIC_INLINE uint32_t Index = ix; // binding number within the set, used as the first argument of `[[vk::binding]]`
+	NBL_CONSTEXPR_STATIC_INLINE uint32_t Count = count; // array size of the resource declared at this binding
+};
+
+}
+}
+#endif
diff --git a/include/nbl/builtin/hlsl/blit/common.hlsl b/include/nbl/builtin/hlsl/blit/common.hlsl
index 07bb3f942..6295e6870 100644
--- a/include/nbl/builtin/hlsl/blit/common.hlsl
+++ b/include/nbl/builtin/hlsl/blit/common.hlsl
@@ -4,84 +4,77 @@
 #ifndef _NBL_BUILTIN_HLSL_BLIT_COMMON_INCLUDED_
 #define _NBL_BUILTIN_HLSL_BLIT_COMMON_INCLUDED_
 
-#include <nbl/builtin/hlsl/cpp_compat.hlsl>
+#include <nbl/builtin/hlsl/binding_info.hlsl>
 
 namespace nbl
 {
 namespace hlsl
 {
-namespace blit
+namespace glsl
 {
-namespace impl
+uint32_t3 gl_WorkGroupSize() // returns a 3-component size: X comes from ConstevalParameters, Y and Z are 1
 {
+	return uint32_t3(ConstevalParameters::WorkGroupSize,1,1);
+}
+}
+}
+}
 
-template <uint32_t Dimension>
-struct dim_to_image_properties { };
+using namespace nbl::hlsl;
+
+[[vk::binding(ConstevalParameters::kernel_weight_binding_t::Index,ConstevalParameters::kernel_weight_binding_t::Set)]]
+Buffer<float32_t4> kernelWeights[ConstevalParameters::kernel_weight_binding_t::Count];
+[[vk::binding(ConstevalParameters::input_sampler_binding_t::Index,ConstevalParameters::input_sampler_binding_t::Set)]]
+SamplerState inSamp[ConstevalParameters::input_sampler_binding_t::Count];
+// aliased
+[[vk::binding(ConstevalParameters::input_image_binding_t::Index,ConstevalParameters::input_image_binding_t::Set)]]
+Texture1DArray<float4> inAs1DArray[ConstevalParameters::input_image_binding_t::Count];
+[[vk::binding(ConstevalParameters::input_image_binding_t::Index,ConstevalParameters::input_image_binding_t::Set)]]
+Texture2DArray<float4> inAs2DArray[ConstevalParameters::input_image_binding_t::Count];
+[[vk::binding(ConstevalParameters::input_image_binding_t::Index,ConstevalParameters::input_image_binding_t::Set)]]
+Texture3D<float4> inAs3D[ConstevalParameters::input_image_binding_t::Count];
+// aliased
+[[vk::binding(ConstevalParameters::output_binding_t::Index,ConstevalParameters::output_binding_t::Set)]] [[vk::image_format("unknown")]]
+RWTexture1DArray<float4> outAs1DArray[ConstevalParameters::output_binding_t::Count];
+[[vk::binding(ConstevalParameters::output_binding_t::Index,ConstevalParameters::output_binding_t::Set)]] [[vk::image_format("unknown")]]
+RWTexture2DArray<float4> outAs2DArray[ConstevalParameters::output_binding_t::Count];
+[[vk::binding(ConstevalParameters::output_binding_t::Index,ConstevalParameters::output_binding_t::Set)]] [[vk::image_format("unknown")]]
+RWTexture3D<float4> outAs3D[ConstevalParameters::output_binding_t::Count];
 
-template <>
-struct dim_to_image_properties<1>
-{
-	using combined_sampler_t = Texture1DArray<float4>;
-	using image_t = RWTexture1DArray<float4>;
 
-	template <typename T>
-	static vector<T, 2> getIndexCoord(vector<T, 3> coords, uint32_t layer)
+groupshared uint32_t sMem[ConstevalParameters::SharedMemoryDWORDs];
+/*
+struct HistogramAccessor
+{
+	void atomicAdd(uint32_t wgID, uint32_t bucket, uint32_t v)
 	{
-		return vector<T, 2>(coords.x, layer);
+		InterlockedAdd(statsBuff[wgID * (ConstevalParameters::AlphaBinCount + 1) + bucket], v);
 	}
 };
-
-template <>
-struct dim_to_image_properties<2>
+struct SharedAccessor
 {
-	using combined_sampler_t = Texture2DArray<float4>;
-	using image_t = RWTexture2DArray<float4>;
-
-	template <typename T>
-	static vector<T,3> getIndexCoord(vector<T, 3> coords, uint32_t layer)
+	float32_t get(float32_t idx)
 	{
-		return vector<T, 3>(coords.xy, layer);
+		return sMem[idx];
+	}
+	void set(float32_t idx, float32_t val)
+	{
+		sMem[idx] = val;
 	}
 };
-
-template <>
-struct dim_to_image_properties<3>
+struct InCSAccessor
 {
-	using combined_sampler_t = Texture3D<float4>;
-	using image_t = RWTexture3D<float4>;
-
-	template <typename T>
-	static vector<T, 3> getIndexCoord(vector<T, 3> coords, uint32_t layer)
+	float32_t4 get(float32_t3 c, uint32_t l)
 	{
-		return vector<T,3>(coords);
+		return inCS.SampleLevel(inSamp, blit::impl::dim_to_image_properties<ConstevalParameters::BlitDimCount>::getIndexCoord<float32_t>(c, l), 0);
 	}
 };
-
-}
-
-
-template<
-	uint32_t _WorkGroupSizeX,
-	uint32_t _WorkGroupSizeY,
-	uint32_t _WorkGroupSizeZ,
-	uint32_t _SMemFloatsPerChannel,
-	uint32_t _BlitOutChannelCount,
-	uint32_t _BlitDimCount,
-	uint32_t _AlphaBinCount>
-struct consteval_parameters_t
+struct OutImgAccessor
 {
-	NBL_CONSTEXPR_STATIC_INLINE uint32_t SMemFloatsPerChannel = _SMemFloatsPerChannel;
-	NBL_CONSTEXPR_STATIC_INLINE uint32_t BlitOutChannelCount = _BlitOutChannelCount;
-	NBL_CONSTEXPR_STATIC_INLINE uint32_t BlitDimCount = _BlitDimCount;
-	NBL_CONSTEXPR_STATIC_INLINE uint32_t AlphaBinCount = _AlphaBinCount;
-	NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkGroupSizeX = _WorkGroupSizeX;
-	NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkGroupSizeY = _WorkGroupSizeY;
-	NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkGroupSizeZ = _WorkGroupSizeZ;
-	NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkGroupSize = WorkGroupSizeX * WorkGroupSizeY * WorkGroupSizeZ;
+	void set(int32_t3 c, uint32_t l, float32_t4 v)
+	{
+		outImg[blit::impl::dim_to_image_properties<ConstevalParameters::BlitDimCount>::getIndexCoord<int32_t>(c, l)] = v;
+	}
 };
-
-}
-}
-}
-
+*/
 #endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl b/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl
new file mode 100644
index 000000000..ad2749904
--- /dev/null
+++ b/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl
@@ -0,0 +1,57 @@
+// Copyright (C) 2023-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+//#include "nbl/builtin/hlsl/blit/common.hlsl"
+//#include "nbl/builtin/hlsl/blit/parameters.hlsl"
+//#include "nbl/builtin/hlsl/blit/compute_blit.hlsl"
+
+
+groupshared uint32_t sMem[ConstevalParameters::SharedMemoryDWORDs];
+/*
+struct HistogramAccessor
+{
+	void atomicAdd(uint32_t wgID, uint32_t bucket, uint32_t v)
+	{
+		InterlockedAdd(statsBuff[wgID * (ConstevalParameters::AlphaBinCount + 1) + bucket], v);
+	}
+};
+struct KernelWeightsAccessor
+{
+	float32_t4 get(uint32_t idx)
+	{
+		return kernelWeights[idx];
+	}
+};
+struct InCSAccessor
+{
+	float32_t4 get(float32_t3 c, uint32_t l)
+	{
+		return inCS.SampleLevel(inSamp, blit::impl::dim_to_image_properties<ConstevalParameters::BlitDimCount>::getIndexCoord<float32_t>(c, l), 0);
+	}
+};
+struct OutImgAccessor
+{
+	void set(int32_t3 c, uint32_t l, float32_t4 v)
+	{
+		outImg[blit::impl::dim_to_image_properties<ConstevalParameters::BlitDimCount>::getIndexCoord<int32_t>(c, l)] = v;
+	}
+};
+*/
+
+using namespace nbl::hlsl::blit;
+
+// TODO: push constants
+
+[numthreads(ConstevalParameters::WorkGroupSize,1,1)]
+void main()
+{
+/*
+	blit::compute_blit_t<ConstevalParameters> blit = blit::compute_blit_t<ConstevalParameters>::create(params);
+    InCSAccessor inCSA;
+	OutImgAccessor outImgA;
+	KernelWeightsAccessor kwA;
+	HistogramAccessor hA;
+	SharedAccessor sA;
+	blit.execute(inCSA, outImgA, kwA, hA, sA, workGroupID, localInvocationIndex);
+*/
+}
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/blit/default_normalize.comp.hlsl b/include/nbl/builtin/hlsl/blit/default_normalize.comp.hlsl
new file mode 100644
index 000000000..589f370c0
--- /dev/null
+++ b/include/nbl/builtin/hlsl/blit/default_normalize.comp.hlsl
@@ -0,0 +1,18 @@
+// Copyright (C) 2023-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#include "nbl/builtin/hlsl/blit/common.hlsl"
+
+
+
+//#include "nbl/builtin/hlsl/blit/parameters.hlsl"
+//#include "nbl/builtin/hlsl/blit/compute_blit.hlsl"
+
+using namespace nbl::hlsl::blit;
+
+// TODO: push constants
+
+[numthreads(ConstevalParameters::WorkGroupSize,1,1)]
+void main()
+{
+}
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/blit/temp.hlsl b/include/nbl/builtin/hlsl/blit/temp.hlsl
deleted file mode 100644
index 4f8ced390..000000000
--- a/include/nbl/builtin/hlsl/blit/temp.hlsl
+++ /dev/null
@@ -1,40 +0,0 @@
-// TODO: Delete this file!
-// This file is temporary file that defines all of the dependencies on PR #519
-// and should be deleted as soon as that's merged.
-#ifndef _NBL_BUILTIN_HLSL_BLIT_TEMP_INCLUDED_
-#define _NBL_BUILTIN_HLSL_BLIT_TEMP_INCLUDED_
-
-
-namespace nbl
-{
-namespace hlsl
-{
-
-namespace workgroup
-{
-    // This is slow naive scan but it doesn't matter as this file is going to
-    // be nuked. The interface is different than the one suggested in PR #519
-    // because right now there's no easy hack-free way to access
-    // gl_localInvocationID globally.
-    template<uint32_t WorkGroupSize, typename T, typename Binop, typename SharedAccessor>
-    T inclusive_scan(T value, NBL_REF_ARG(SharedAccessor) sharedAccessor, uint32_t localInvocationIndex)
-    {
-        Binop binop;
-        for (uint32_t i = 0; i < firstbithigh(WorkGroupSize); ++i)
-        {
-            sharedAccessor.main.set(localInvocationIndex, value);
-            GroupMemoryBarrierWithGroupSync();
-            if (localInvocationIndex >= (1u << i))
-            {
-                value = binop(sharedAccessor.main.get(localInvocationIndex - (1u << i)), value);
-            }
-            GroupMemoryBarrierWithGroupSync();
-        }
-        return value;
-    }
-}
-
-}
-}
-
-#endif
\ No newline at end of file
diff --git a/include/nbl/video/utilities/CComputeBlit.h b/include/nbl/video/utilities/CComputeBlit.h
index 779dc2ddc..eae3f4bf0 100644
--- a/include/nbl/video/utilities/CComputeBlit.h
+++ b/include/nbl/video/utilities/CComputeBlit.h
@@ -130,86 +130,6 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted
 			return m_normalizationPipelines[key];
 		}
 
-		template <typename BlitUtilities>
-		core::smart_refctd_ptr<video::IGPUShader> createBlitSpecializedShader(
-			const asset::E_FORMAT									outFormat,
-			const asset::IImage::E_TYPE								imageType,
-			const core::vectorSIMDu32& inExtent,
-			const core::vectorSIMDu32& outExtent,
-			const asset::IBlitUtilities::E_ALPHA_SEMANTIC			alphaSemantic,
-			const typename BlitUtilities::convolution_kernels_t&	kernels,
-			const uint32_t											workgroupSize = 0,
-			const uint32_t											alphaBinCount = asset::IBlitUtilities::DefaultAlphaBinCount)
-		{
-			if (workgroupSize==0)
-				workgroupSize = m_device->getPhysicalDevice()->getLimits().maxWorkgroupSize;
-
-			const auto workgroupDims = getDefaultWorkgroupDims(imageType);
-			const auto paddedAlphaBinCount = getPaddedAlphaBinCount(workgroupDims, alphaBinCount);
-
-			const uint32_t outChannelCount = asset::getFormatChannelCount(outFormat);
-			const uint32_t smemFloatCount = m_availableSharedMemory / (sizeof(float) * outChannelCount);
-			const uint32_t blitDimCount = static_cast<uint32_t>(imageType) + 1;
-
-			const auto castedFormat = getOutImageViewFormat(outFormat);
-			assert(outFormat == castedFormat);
-			const char* formatQualifier = asset::CHLSLCompiler::getStorageImageFormatQualifier(castedFormat);
-
-			std::ostringstream shaderSourceStream;
-			shaderSourceStream
-				<< "#include \"nbl/builtin/hlsl/blit/common.hlsl\"\n"
-				   "#include \"nbl/builtin/hlsl/blit/parameters.hlsl\"\n"
-				   "#include \"nbl/builtin/hlsl/blit/compute_blit.hlsl\"\n";
-
-			shaderSourceStream
-				<< "typedef nbl::hlsl::blit::consteval_parameters_t<" << workgroupSize << ", 1, 1, " << smemFloatCount << ", "
-				<< outChannelCount << ", " << blitDimCount << ", " << paddedAlphaBinCount << "> ceval_params_t;\n";
-
-			shaderSourceStream
-				<< "[[vk::combinedImageSampler]] [[vk::binding(0, 0)]]\n"
-				   "nbl::hlsl::blit::impl::dim_to_image_properties<ceval_params_t::BlitDimCount>::combined_sampler_t inCS;\n"
-				   "[[vk::combinedImageSampler]] [[vk::binding(0, 0)]]\n"
-			       "SamplerState inSamp;\n"
-
-				   "[[vk::image_format(\""<< formatQualifier << "\")]]\n"
-				   "[[vk::binding(1, 0)]]\n"
-				   "nbl::hlsl::blit::impl::dim_to_image_properties<ceval_params_t::BlitDimCount>::image_t outImg;\n"
-
-				   "[[vk::binding(0, 1)]] Buffer<float32_t4> kernelWeights;\n"
-			       "[[vk::push_constant]] nbl::hlsl::blit::parameters_t params;"
-				   "groupshared float32_t sMem[" << m_availableSharedMemory / sizeof(float) << "];\n";
-				
-			if (alphaSemantic == asset::IBlitUtilities::EAS_REFERENCE_OR_COVERAGE)
-			{
-				shaderSourceStream
-					<< "[[vk::binding(2 , 0)]] RWStructuredBuffer<uint32_t> statsBuff;\n"
-					   "struct HistogramAccessor { void atomicAdd(uint32_t wgID, uint32_t bucket, uint32_t v) { InterlockedAdd(statsBuff[wgID * (ceval_params_t::AlphaBinCount + 1) + bucket], v); } };\n";
-			}
-			else
-			{
-				shaderSourceStream << "struct HistogramAccessor { void atomicAdd(uint32_t wgID, uint32_t bucket, uint32_t v) { } };\n";
-			}
-
-			shaderSourceStream
-				<< "struct KernelWeightsAccessor { float32_t4 get(float32_t idx) { return kernelWeights[idx]; } };\n"
-				   "struct SharedAccessor { float32_t get(float32_t idx) { return sMem[idx]; } void set(float32_t idx, float32_t val) { sMem[idx] = val; } };\n"
-				   "struct InCSAccessor { float32_t4 get(float32_t3 c, uint32_t l) { return inCS.SampleLevel(inSamp, nbl::hlsl::blit::impl::dim_to_image_properties<ceval_params_t::BlitDimCount>::getIndexCoord<float32_t>(c, l), 0); } };\n"
-				   "struct OutImgAccessor { void set(int32_t3 c, uint32_t l, float32_t4 v) { outImg[nbl::hlsl::blit::impl::dim_to_image_properties<ceval_params_t::BlitDimCount>::getIndexCoord<int32_t>(c, l)] = v; } };\n"
-
-				   "[numthreads(ceval_params_t::WorkGroupSize, 1, 1)]\n"
-				   "void main(uint32_t3 workGroupID : SV_GroupID, uint32_t localInvocationIndex : SV_GroupIndex)\n"
-				   "{\n"
-				   "	nbl::hlsl::blit::compute_blit_t<ceval_params_t> blit = nbl::hlsl::blit::compute_blit_t<ceval_params_t>::create(params);\n"
-				   "    InCSAccessor inCSA; OutImgAccessor outImgA; KernelWeightsAccessor kwA; HistogramAccessor hA; SharedAccessor sA;\n"
-				   "	blit.execute(inCSA, outImgA, kwA, hA, sA, workGroupID, localInvocationIndex);\n"
-				   "}\n";
-
-			auto cpuShader = core::make_smart_refctd_ptr<asset::ICPUShader>(shaderSourceStream.str().c_str(), IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, IGPUShader::E_SHADER_STAGE::E_CONTENT_TYPE::ECT_HLSL, "CComputeBlit::createBlitSpecializedShader");
-			auto gpuShader = m_device->createShader(std::move(cpuShader.get()));
-
-			return gpuShader;
-		}
-
 		template <typename BlitUtilities>
 		core::smart_refctd_ptr<video::IGPUComputePipeline> getBlitPipeline(
 			const asset::E_FORMAT									outFormat,
diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt
index 4705ca442..b3ec566be 100644
--- a/src/nbl/builtin/CMakeLists.txt
+++ b/src/nbl/builtin/CMakeLists.txt
@@ -263,7 +263,8 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/member_test_macros.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/blit/alpha_test.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/blit/compute_blit.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/blit/common.hlsl")
-LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/blit/temp.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/blit/default_blit.comp.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/blit/default_normalize.comp.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/blit/normalization.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/blit/parameters.hlsl")
 #device capability
@@ -337,5 +338,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/memory.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/memory_accessor.hlsl")
 #enums
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/enums.hlsl")
+#
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/binding_info.hlsl")
 
 ADD_CUSTOM_BUILTIN_RESOURCES(nblBuiltinResourceData NBL_RESOURCES_TO_EMBED "${NBL_ROOT_PATH}/include" "nbl/builtin" "nbl::builtin" "${NBL_ROOT_PATH_BINARY}/include" "${NBL_ROOT_PATH_BINARY}/src" "STATIC" "INTERNAL")
diff --git a/src/nbl/video/utilities/CComputeBlit.cpp b/src/nbl/video/utilities/CComputeBlit.cpp
index 5856f42fc..1ceb1ee41 100644
--- a/src/nbl/video/utilities/CComputeBlit.cpp
+++ b/src/nbl/video/utilities/CComputeBlit.cpp
@@ -1,4 +1,5 @@
 #include "nbl/video/utilities/CComputeBlit.h"
+#include "nbl/builtin/hlsl/binding_info.hlsl"
 
 using namespace nbl::core;
 using namespace nbl::system;
@@ -47,10 +48,55 @@ void CComputeBlit::createAndCachePipelines(CAssetConverter* converter, smart_ref
 			// also slightly more memory is needed
 			task.sharedMemoryPerInvocation = core::max(singlePixelStorage*2,task.sharedMemoryPerInvocation);
 		}
+		const auto common = [&]()->std::string
+		{
+			// TODO: introduce a common type between ImGUI and Blit for the descriptor infos
+			auto serializeBindingInfo = [](const hlsl::SBindingInfo& info={})->std::string
+			{
+				return "ConstevalBindingInfo<"+std::to_string(info.Set)+","+std::to_string(info.Set)+","+std::to_string(info.Count)+">";
+			};
+
+			std::ostringstream tmp;
+			tmp << R"===(
+#include "nbl/builtin/hlsl/binding_info.hlsl"
+
+
+using namespace nbl::hlsl;
+
+
+struct ConstevalParameters
+{
+	NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkGroupSize = )===" << (0x1u<<task.workgroupSizeLog2) << R"===(;
+    using kernel_weight_binding_t = )===" << serializeBindingInfo() << R"===(;
+    using input_sampler_binding_t = )===" << serializeBindingInfo() << R"===(;
+    using input_image_binding_t = )===" << serializeBindingInfo() << R"===(;
+    using output_binding_t = )===" << serializeBindingInfo() << R"===(;
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedMemoryDWORDs = )===" << task.sharedMemoryPerInvocation/sizeof(uint32_t) << R"===(;
+};
+)===";
+			return tmp.str();
+		}();
+		auto createPipeline = [&limits,&common](const char* mainPath)->smart_refctd_ptr<ICPUComputePipeline>
+		{
+			auto shader = make_smart_refctd_ptr<ICPUShader>(
+				(common+"\n#include \""+mainPath+"\"\n").c_str(),
+				IShader::E_SHADER_STAGE::ESS_COMPUTE,
+				IShader::E_CONTENT_TYPE::ECT_HLSL,
+				mainPath
+			);
+
+			ICPUComputePipeline::SCreationParams params = {};
+			params.layout = nullptr; // TODO
+			params.shader.entryPoint = "main";
+			params.shader.shader = shader.get();
+			params.shader.requiredSubgroupSize = static_cast<IShader::SSpecInfoBase::SUBGROUP_SIZE>(hlsl::findMSB(limits.maxSubgroupSize));
+			// needed for the prefix and reductions to work
+			params.shader.requireFullSubgroups = true;
+			return ICPUComputePipeline::create(params);
+		};
 		// create blit pipeline
-		cpuPplns.emplace_back(nullptr);
-		// create optional coverage normalization pipeline
-		cpuPplns.emplace_back(nullptr);
+		cpuPplns.emplace_back(createPipeline("default_blit.comp.hlsl"));
+		cpuPplns.emplace_back(createPipeline("default_normalize.comp.hlsl"));
 	}
 
 	CAssetConverter::SInputs inputs = {};
@@ -79,6 +125,83 @@ void CComputeBlit::createAndCachePipelines(CAssetConverter* converter, smart_ref
 }
 
 #if 0
+
+template <typename BlitUtilities>
+core::smart_refctd_ptr<video::IGPUShader> createBlitSpecializedShader(
+	const asset::IImage::E_TYPE								imageType,
+	const core::vectorSIMDu32& inExtent,
+	const core::vectorSIMDu32& outExtent,
+	const asset::IBlitUtilities::E_ALPHA_SEMANTIC			alphaSemantic,
+	const typename BlitUtilities::convolution_kernels_t&	kernels,
+	const uint32_t											workgroupSize = 0,
+	const uint32_t											alphaBinCount = asset::IBlitUtilities::DefaultAlphaBinCount)
+{
+	if (workgroupSize==0)
+		workgroupSize = m_device->getPhysicalDevice()->getLimits().maxWorkgroupSize;
+
+	const auto workgroupDims = getDefaultWorkgroupDims(imageType);
+	const auto paddedAlphaBinCount = getPaddedAlphaBinCount(workgroupDims, alphaBinCount);
+
+	const uint32_t outChannelCount = asset::getFormatChannelCount(outFormat);
+	const uint32_t smemFloatCount = m_availableSharedMemory / (sizeof(float) * outChannelCount);
+	const uint32_t blitDimCount = static_cast<uint32_t>(imageType) + 1;
+
+
+	std::ostringstream shaderSourceStream;
+	shaderSourceStream
+		<< "#include \"nbl/builtin/hlsl/blit/common.hlsl\"\n"
+			"#include \"nbl/builtin/hlsl/blit/parameters.hlsl\"\n"
+			"#include \"nbl/builtin/hlsl/blit/compute_blit.hlsl\"\n";
+
+	shaderSourceStream
+		<< "typedef nbl::hlsl::blit::consteval_parameters_t<" << workgroupSize << ", 1, 1, " << smemFloatCount << ", "
+		<< outChannelCount << ", " << blitDimCount << ", " << paddedAlphaBinCount << "> ceval_params_t;\n";
+
+	shaderSourceStream
+		<< "[[vk::combinedImageSampler]] [[vk::binding(0, 0)]]\n"
+			"nbl::hlsl::blit::impl::dim_to_image_properties<ceval_params_t::BlitDimCount>::combined_sampler_t inCS;\n"
+			"[[vk::combinedImageSampler]] [[vk::binding(0, 0)]]\n"
+			"SamplerState inSamp;\n"
+
+			"[[vk::image_format(\""<< formatQualifier << "\")]]\n"
+			"[[vk::binding(1, 0)]]\n"
+			"nbl::hlsl::blit::impl::dim_to_image_properties<ceval_params_t::BlitDimCount>::image_t outImg;\n"
+
+			"[[vk::binding(0, 1)]] Buffer<float32_t4> kernelWeights;\n"
+			"[[vk::push_constant]] nbl::hlsl::blit::parameters_t params;"
+			"groupshared float32_t sMem[" << m_availableSharedMemory / sizeof(float) << "];\n";
+				
+	if (alphaSemantic == asset::IBlitUtilities::EAS_REFERENCE_OR_COVERAGE)
+	{
+		shaderSourceStream
+			<< "[[vk::binding(2 , 0)]] RWStructuredBuffer<uint32_t> statsBuff;\n"
+				"struct HistogramAccessor { void atomicAdd(uint32_t wgID, uint32_t bucket, uint32_t v) { InterlockedAdd(statsBuff[wgID * (ceval_params_t::AlphaBinCount + 1) + bucket], v); } };\n";
+	}
+	else
+	{
+		shaderSourceStream << "struct HistogramAccessor { void atomicAdd(uint32_t wgID, uint32_t bucket, uint32_t v) { } };\n";
+	}
+
+	shaderSourceStream
+		<< "struct KernelWeightsAccessor { float32_t4 get(float32_t idx) { return kernelWeights[idx]; } };\n"
+			"struct SharedAccessor { float32_t get(float32_t idx) { return sMem[idx]; } void set(float32_t idx, float32_t val) { sMem[idx] = val; } };\n"
+			"struct InCSAccessor { float32_t4 get(float32_t3 c, uint32_t l) { return inCS.SampleLevel(inSamp, nbl::hlsl::blit::impl::dim_to_image_properties<ceval_params_t::BlitDimCount>::getIndexCoord<float32_t>(c, l), 0); } };\n"
+			"struct OutImgAccessor { void set(int32_t3 c, uint32_t l, float32_t4 v) { outImg[nbl::hlsl::blit::impl::dim_to_image_properties<ceval_params_t::BlitDimCount>::getIndexCoord<int32_t>(c, l)] = v; } };\n"
+
+			"[numthreads(ceval_params_t::WorkGroupSize, 1, 1)]\n"
+			"void main(uint32_t3 workGroupID : SV_GroupID, uint32_t localInvocationIndex : SV_GroupIndex)\n"
+			"{\n"
+			"	nbl::hlsl::blit::compute_blit_t<ceval_params_t> blit = nbl::hlsl::blit::compute_blit_t<ceval_params_t>::create(params);\n"
+			"    InCSAccessor inCSA; OutImgAccessor outImgA; KernelWeightsAccessor kwA; HistogramAccessor hA; SharedAccessor sA;\n"
+			"	blit.execute(inCSA, outImgA, kwA, hA, sA, workGroupID, localInvocationIndex);\n"
+			"}\n";
+
+	auto cpuShader = core::make_smart_refctd_ptr<asset::ICPUShader>(shaderSourceStream.str().c_str(), IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, IGPUShader::E_SHADER_STAGE::E_CONTENT_TYPE::ECT_HLSL, "CComputeBlit::createBlitSpecializedShader");
+	auto gpuShader = m_device->createShader(std::move(cpuShader.get()));
+
+	return gpuShader;
+}
+
 core::smart_refctd_ptr<video::IGPUShader> CComputeBlit::createAlphaTestSpecializedShader(const asset::IImage::E_TYPE imageType, const uint32_t alphaBinCount)
 {
 	const auto workgroupDims = getDefaultWorkgroupDims(imageType);

From c6ba21128a27279b585f1966aa46fb169051810f Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Wed, 30 Oct 2024 12:46:26 +0100
Subject: [PATCH 05/14] improve the function template constraints a lot

---
 include/nbl/builtin/hlsl/concepts.hlsl | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/include/nbl/builtin/hlsl/concepts.hlsl b/include/nbl/builtin/hlsl/concepts.hlsl
index 033709e72..fea6dacc9 100644
--- a/include/nbl/builtin/hlsl/concepts.hlsl
+++ b/include/nbl/builtin/hlsl/concepts.hlsl
@@ -34,10 +34,8 @@
 // put just before closing `>` on the partial template specialization Type args, e.g. `MyStruct<U,V,T NBL_PARTIAL_REQ_BOT(SomeCond<U>)>
 #define NBL_PARTIAL_REQ_BOT(...)
 
-// condition
-#define NBL_FUNC_REQUIRES_BEGIN(...) requires (__VA_ARGS__)
-// return value
-#define NBL_FUNC_REQUIRES_END(...) __VA_ARGS__
+// condition, use instead of the closing `>` of a function template
+#define NBL_FUNC_REQUIRES(...) > requires (__VA_ARGS__)
 
 #include <concepts>
 
@@ -120,10 +118,8 @@ concept matricial = is_matrix<T>::value;
 // put just before closing `>` on the partial template specialization Type args, e.g. `MyStruct<U,V,T NBL_PARTIAL_REQ_BOT(SomeCond<U>)>
 #define NBL_PARTIAL_REQ_BOT(...) ,std::enable_if_t<(__VA_ARGS__),void> 
 
-// condition, use right after the closing `>` of a function template
-#define NBL_FUNC_REQUIRES_BEGIN(...) ::nbl::hlsl::enable_if_t<(__VA_ARGS__),
-// return value, use `END(T)` instead of the return value type declaration
-#define NBL_FUNC_REQUIRES_END(...) __VA_ARGS__> 
+// condition, use instead of the closing `>` of a function template
+#define NBL_FUNC_REQUIRES(...) ,std::enable_if_t<(__VA_ARGS__),bool> = true>
 
 #endif
 

From a4a9fc9dda20bbedfad320897c3d5e8a795db734 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Mon, 4 Nov 2024 12:13:35 +0100
Subject: [PATCH 06/14] add concept macros!

---
 include/nbl/builtin/hlsl/concepts.hlsl    | 108 ++++++++++++++++------
 include/nbl/builtin/hlsl/type_traits.hlsl |   3 +
 2 files changed, 84 insertions(+), 27 deletions(-)

diff --git a/include/nbl/builtin/hlsl/concepts.hlsl b/include/nbl/builtin/hlsl/concepts.hlsl
index fea6dacc9..bf16d3d1c 100644
--- a/include/nbl/builtin/hlsl/concepts.hlsl
+++ b/include/nbl/builtin/hlsl/concepts.hlsl
@@ -10,14 +10,31 @@
 #include <nbl/builtin/hlsl/type_traits.hlsl>
 
 
-#ifndef __HLSL_VERSION
-
-// TODO: old stuff, see how much we can remove
-#define NBL_CONCEPT_TYPE_PARAMS(...) template <__VA_ARGS__>
-#define NBL_CONCEPT_SIGNATURE(NAME, ...) concept NAME = requires(__VA_ARGS__)
-#define NBL_CONCEPT_BODY(...) { __VA_ARGS__ };
-#define NBL_CONCEPT_ASSIGN(NAME, ...) concept NAME = __VA_ARGS__;
-#define NBL_REQUIRES(...) requires __VA_ARGS__ 
+namespace nbl
+{
+namespace hlsl
+{
+namespace concepts
+{
+// common implementation juice
+#include <boost/preprocessor/seq/elem.hpp>
+#define NBL_IMPL_CONCEPT_FULL_TPLT(z, n, unused) BOOST_PP_SEQ_ELEM(n,NBL_CONCEPT_TPLT_PRM_KINDS) BOOST_PP_SEQ_ELEM(n,NBL_CONCEPT_TPLT_PRM_NAMES)
+#include <boost/preprocessor/repetition/enum.hpp>
+#define NBL_CONCEPT_FULL_TPLT() BOOST_PP_ENUM(BOOST_PP_SEQ_SIZE(NBL_CONCEPT_TPLT_PRM_NAMES),NBL_IMPL_CONCEPT_FULL_TPLT,DUMMY)
+#include <boost/preprocessor/seq/enum.hpp>
+#define NBL_CONCEPT_TPLT_PARAMS() BOOST_PP_SEQ_ENUM(NBL_CONCEPT_TPLT_PRM_NAMES)
+#include <boost/preprocessor/tuple/elem.hpp>
+#include <boost/preprocessor/control/expr_if.hpp>
+#include <boost/preprocessor/seq/for_each_i.hpp>
+//
+#define NBL_CONCEPT_REQ_TYPE 0
+#define NBL_CONCEPT_REQ_EXPR 1
+//
+#define NBL_CONCEPT_REQ_EXPR_RET_TYPE 2
+
+
+//! Now diverge: native `concept`/`requires` when the compiler advertises them, SFINAE emulation otherwise
+#ifdef __cpp_concepts
 
 
 // to define a concept using `concept Name = SomeContexprBoolCondition<T>;`
@@ -37,14 +54,30 @@
 // condition, use instead of the closing `>` of a function template
 #define NBL_FUNC_REQUIRES(...) > requires (__VA_ARGS__)
 
-#include <concepts>
 
-namespace nbl
-{
-namespace hlsl
-{
-namespace concepts
+//
+#define NBL_CONCEPT_PARAM_T(ID,...) ID
+//
+#define NBL_IMPL_IMPL_CONCEPT_BEGIN(A,...) __VA_ARGS__ A
+#define NBL_IMPL_CONCEPT_BEGIN(z,n,data) NBL_IMPL_IMPL_CONCEPT_BEGIN NBL_CONCEPT_PARAM_##n
+// TODO: are empty local parameter lists valid? a.k.a. just a `()`
+#define NBL_CONCEPT_BEGIN(LOCAL_PARAM_COUNT) template<NBL_CONCEPT_FULL_TPLT()> \
+concept NBL_CONCEPT_NAME = requires BOOST_PP_EXPR_IF(LOCAL_PARAM_COUNT,(BOOST_PP_ENUM(LOCAL_PARAM_COUNT,NBL_IMPL_CONCEPT_BEGIN,DUMMY))) \
 {
+//
+#define NBL_IMPL_CONCEPT_REQ_TYPE(...) typename __VA_ARGS__;
+#define NBL_IMPL_CONCEPT_REQ_EXPR(...) __VA_ARGS__;
+#define NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE(E,C,...) {E}; C<decltype E,__VA_ARGS__ >;
+//
+#define NBL_IMPL_CONCEPT (NBL_IMPL_CONCEPT_REQ_TYPE,NBL_IMPL_CONCEPT_REQ_EXPR,NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE)
+//
+#define NBL_IMPL_CONCEPT_END_DEF(r,unused,i,e) NBL_EVAL(BOOST_PP_TUPLE_ELEM(BOOST_PP_SEQ_HEAD(e),NBL_IMPL_CONCEPT) BOOST_PP_SEQ_TAIL(e))
+//
+#define NBL_CONCEPT_END(SEQ) BOOST_PP_SEQ_FOR_EACH_I(NBL_IMPL_CONCEPT_END_DEF, DUMMY, SEQ) \
+}
+
+
+#include <concepts>
 
 // Alias some of the std concepts in nbl. As this is C++20 only, we don't need to use
 // the macros here.
@@ -90,22 +123,11 @@ concept vectorial = is_vector<T>::value;
 template <typename T>
 concept matricial = is_matrix<T>::value;
 
-}
-}
-}
-
 #else
 
-// TODO: old stuff, see how much we can remove
-// No C++20 support. Do nothing.
-#define NBL_CONCEPT_TYPE_PARAMS(...)
-#define NBL_CONCEPT_SIGNATURE(NAME, ...) 
-#define NBL_CONCEPT_BODY(...)
-#define NBL_REQUIRES(...)
-
 
 // to define a concept using `concept Name = SomeContexprBoolCondition<T>;`
-#define NBL_BOOL_CONCEPT NBL_CONSTEXPR_STATIC_INLINE bool
+#define NBL_BOOL_CONCEPT NBL_CONSTEXPR bool
 
 // for struct definitions, use instead of closing `>` on the primary template parameter list
 #define NBL_PRIMARY_REQUIRES(...) ,typename __requires=::nbl::hlsl::enable_if_t<(__VA_ARGS__),void> > 
@@ -121,7 +143,39 @@ concept matricial = is_matrix<T>::value;
 // condition, use instead of the closing `>` of a function template
 #define NBL_FUNC_REQUIRES(...) ,std::enable_if_t<(__VA_ARGS__),bool> = true>
 
-#endif
 
+//
+#define NBL_CONCEPT_BEGIN(LOCAL_PARAM_COUNT) namespace BOOST_PP_CAT(__concept__,NBL_CONCEPT_NAME) \
+{
+//
+#define NBL_CONCEPT_PARAM_T(ID,...) ::nbl::hlsl::impl::declval<__VA_ARGS__ >()
+//
+#define NBL_IMPL_CONCEPT_REQ_TYPE(...) ::nbl::hlsl::make_void_t<typename __VA_ARGS__ >
+#define NBL_IMPL_CONCEPT_REQ_EXPR(...) ::nbl::hlsl::make_void_t<decltype(__VA_ARGS__)>
+#define NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE(E,C,...) C<decltype E ,__VA_ARGS__  >
+//
+#define NBL_IMPL_CONCEPT_SFINAE (typename=void,typename=void,bool=true)
+#define NBL_IMPL_CONCEPT_SFINAE_SPEC (NBL_IMPL_CONCEPT_REQ_TYPE,NBL_IMPL_CONCEPT_REQ_EXPR,NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE)
+//
+#define NBL_IMPL_CONCEPT_END_DEF(r,unused,i,e) template<NBL_CONCEPT_FULL_TPLT(), BOOST_PP_TUPLE_ELEM(BOOST_PP_SEQ_HEAD(e),NBL_IMPL_CONCEPT_SFINAE)> \
+struct BOOST_PP_CAT(__requirement,i) : ::nbl::hlsl::false_type {}; \
+template<NBL_CONCEPT_FULL_TPLT()> \
+struct BOOST_PP_CAT(__requirement,i)<NBL_CONCEPT_TPLT_PARAMS(), \
+NBL_EVAL(BOOST_PP_TUPLE_ELEM(BOOST_PP_SEQ_HEAD(e),NBL_IMPL_CONCEPT_SFINAE_SPEC) BOOST_PP_SEQ_TAIL(e)) \
+ > : ::nbl::hlsl::true_type {};
+//
+#define NBL_IMPL_CONCEPT_END_GET(r,unused,i,e) BOOST_PP_EXPR_IF(i,&&) BOOST_PP_CAT(__concept__,NBL_CONCEPT_NAME)::BOOST_PP_CAT(__requirement,i)<NBL_CONCEPT_TPLT_PARAMS()>::value
+//
+#define NBL_CONCEPT_END(SEQ) BOOST_PP_SEQ_FOR_EACH_I(NBL_IMPL_CONCEPT_END_DEF, DUMMY, SEQ) \
+} \
+template<NBL_CONCEPT_FULL_TPLT()> \
+NBL_CONSTEXPR bool NBL_CONCEPT_NAME = BOOST_PP_SEQ_FOR_EACH_I(NBL_IMPL_CONCEPT_END_GET, DUMMY, SEQ)
+
+// TODO: counterparts of all the other concepts
+
+#endif
+}
+}
+}
 
 #endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/type_traits.hlsl b/include/nbl/builtin/hlsl/type_traits.hlsl
index d7b1102af..68cfc6476 100644
--- a/include/nbl/builtin/hlsl/type_traits.hlsl
+++ b/include/nbl/builtin/hlsl/type_traits.hlsl
@@ -574,6 +574,9 @@ using enable_if = std::enable_if<B, T>;
 template<class T>
 using alignment_of = std::alignment_of<T>;
 
+template<typename T>
+using make_void_t = typename make_void<T>::type;
+
 template<class T> using remove_const = std::remove_const<T>;
 template<class T> using remove_volatile = std::remove_volatile<T>;
 template<class T> using remove_cv = std::remove_cv<T>;

From 0b2e2f15e90ee8ff93a22eac6b1470f5b9176375 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Mon, 4 Nov 2024 15:07:31 +0100
Subject: [PATCH 07/14] fix up type traits

---
 include/nbl/builtin/hlsl/type_traits.hlsl | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/include/nbl/builtin/hlsl/type_traits.hlsl b/include/nbl/builtin/hlsl/type_traits.hlsl
index 68cfc6476..3a4e0eda7 100644
--- a/include/nbl/builtin/hlsl/type_traits.hlsl
+++ b/include/nbl/builtin/hlsl/type_traits.hlsl
@@ -156,10 +156,9 @@ namespace nbl
 {
 namespace hlsl
 {
-
+//
 namespace impl
 {
-    
 template<template<class> class Trait, class T>
 struct base_type_forwarder : Trait<T> {};
 
@@ -168,11 +167,14 @@ struct base_type_forwarder<Trait,vector<T,N> > : Trait<T> {};
 
 template<template<class> class Trait, class T, uint16_t N, uint16_t M>
 struct base_type_forwarder<Trait,matrix<T,N,M> > : Trait<T> {};
-
 }
 
-#ifdef __HLSL_VERSION // HLSL
+//
+template<class>
+struct make_void { using type = void; };
+
 
+#ifdef __HLSL_VERSION // HLSL
 
 #define decltype(expr) __decltype(expr)
 
@@ -391,9 +393,6 @@ struct enable_if<true, T> : type_identity<T> {};
 template<class T>
 struct alignment_of;
 
-template<class>
-struct make_void { using type = void; };
-
 // reference stuff needed for semantics 
 
 // not for "human consumption"
@@ -574,9 +573,6 @@ using enable_if = std::enable_if<B, T>;
 template<class T>
 using alignment_of = std::alignment_of<T>;
 
-template<typename T>
-using make_void_t = typename make_void<T>::type;
-
 template<class T> using remove_const = std::remove_const<T>;
 template<class T> using remove_volatile = std::remove_volatile<T>;
 template<class T> using remove_cv = std::remove_cv<T>;
@@ -617,6 +613,9 @@ template<class T>
 NBL_CONSTEXPR uint32_t alignment_of_v = alignment_of<T>::value;
 
 // Overlapping definitions
+template<typename T>
+using make_void_t = typename make_void<T>::type;
+
 template<bool C, typename T, T A, T B>
 struct conditional_value
 {

From 4275c233ffdd6542a159ac8506037f8083e275d8 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Mon, 4 Nov 2024 15:09:05 +0100
Subject: [PATCH 08/14] add a general binding info

---
 include/nbl/asset/IPipelineLayout.h        | 46 ++++++++++++++++++++--
 include/nbl/builtin/hlsl/binding_info.hlsl |  9 +++++
 include/nbl/ext/ImGui/ImGui.h              |  1 +
 include/nbl/video/utilities/CComputeBlit.h | 32 +++++++++++----
 src/nbl/ext/ImGui/ImGui.cpp                |  1 +
 src/nbl/video/utilities/CComputeBlit.cpp   | 12 ++----
 6 files changed, 82 insertions(+), 19 deletions(-)

diff --git a/include/nbl/asset/IPipelineLayout.h b/include/nbl/asset/IPipelineLayout.h
index 7628d0b48..fdbc97bbf 100644
--- a/include/nbl/asset/IPipelineLayout.h
+++ b/include/nbl/asset/IPipelineLayout.h
@@ -4,12 +4,14 @@
 #ifndef _NBL_ASSET_I_PIPELINE_LAYOUT_H_INCLUDED_
 #define _NBL_ASSET_I_PIPELINE_LAYOUT_H_INCLUDED_
 
+#include "nbl/macros.h"
+#include "nbl/core/declarations.h"
 
 #include <algorithm>
 #include <array>
 
-#include "nbl/macros.h"
-#include "nbl/core/declarations.h"
+#include "nbl/asset/IDescriptorSetLayout.h"
+#include "nbl/builtin/hlsl/binding_info.hlsl"
 
 
 namespace nbl::asset
@@ -21,7 +23,7 @@ namespace nbl::asset
     however they serve as a fast path with regard to data upload from the
     CPU and data access from the GPU. 
     
-    Note that IrrlichtBaW limits push constant size to 128 bytes.
+    Note that Nabla limits push constant size to 128 bytes.
 
     Push Constants are an alternative to an UBO where it performs really poorly,
     mostly very small and very frequent updates. Examples of which are:
@@ -140,6 +142,44 @@ class IPipelineLayout
             return static_cast<int32_t>(i)-1;
         }
 
+        // Utility for compiling shaders against a known layout (as opposed to creating layouts from shaders): returns the HLSL `::nbl::hlsl::ConstevalBindingInfo<set,binding,count>` type name for `info`, or an `#error` directive string when the set or binding cannot be found
+        using desc_type_bitset_t = std::bitset<static_cast<size_t>(IDescriptor::E_TYPE::ET_COUNT)>;
+        // TODO: add constraints for stage and creation flags, or just return the storage index & redirect?
+        core::string getBindingInfoForHLSL(const hlsl::SBindingInfo& info, const desc_type_bitset_t allowedTypes=desc_type_bitset_t().set()) const
+        {
+            if (info.set>=DESCRIPTOR_SET_COUNT)
+                return "#error \"::nbl::hlsl::SBindingInfo::set out of range!\"";
+            const auto* layout = m_descSetLayouts[info.set];
+            if (!layout)
+                return "#error \"::nbl::hlsl::SBindingInfo::set layout is nullptr!\"";
+            // search the redirect of every descriptor type enabled in `allowedTypes` for `info.binding`, first hit wins; the immutable sampler redirect is the fallback when ET_SAMPLER is allowed
+            using redirect_t = IDescriptorSetLayoutBase::CBindingRedirect;
+            using storage_range_index_t = redirect_t::storage_range_index_t;
+            const redirect_t* redirect;
+            storage_range_index_t found;
+            {
+                const redirect_t::binding_number_t binding(info.binding);
+                for (auto t=0u; t<static_cast<size_t>(IDescriptor::E_TYPE::ET_COUNT); t++)
+                if (allowedTypes.test(t))
+                {
+                    redirect = &layout->getDescriptorRedirect(static_cast<IDescriptor::E_TYPE>(t));
+                    found = redirect->findBindingStorageIndex(binding);
+                    if (found)
+                        break;
+                }
+                if (!found && allowedTypes.test(static_cast<size_t>(IDescriptor::E_TYPE::ET_SAMPLER)))
+                {
+                    redirect = &layout->getImmutableSamplerRedirect();
+                    found = redirect->findBindingStorageIndex(binding);
+                }
+                if (!found)
+                    return "#error \"Could not find `::nbl::hlsl::SBindingInfo::binding` in `::nbl::hlsl::SBindingInfo::set`'s layout!\"";
+            }
+            const auto count = redirect->getCount(found);
+            assert(count); // this layout should have never passed validation
+            return "::nbl::hlsl::ConstevalBindingInfo<"+std::to_string(info.set)+","+std::to_string(info.binding)+","+std::to_string(count)+">";
+        }
+
     protected:
         IPipelineLayout(
             const std::span<const asset::SPushConstantRange> _pcRanges,
diff --git a/include/nbl/builtin/hlsl/binding_info.hlsl b/include/nbl/builtin/hlsl/binding_info.hlsl
index 8702a32c3..e03766516 100644
--- a/include/nbl/builtin/hlsl/binding_info.hlsl
+++ b/include/nbl/builtin/hlsl/binding_info.hlsl
@@ -19,6 +19,15 @@ struct ConstevalBindingInfo
 	NBL_CONSTEXPR_STATIC_INLINE uint32_t Count = count;
 };
 
+// used for descriptor set layout lookups
+struct SBindingInfo
+{
+	//! binding index for a given resource
+	uint32_t binding : 29;
+	//! descriptor set index for a resource
+	uint32_t set : 3;
+};
+
 }
 }
 #endif
diff --git a/include/nbl/ext/ImGui/ImGui.h b/include/nbl/ext/ImGui/ImGui.h
index 58787b9d5..244195c01 100644
--- a/include/nbl/ext/ImGui/ImGui.h
+++ b/include/nbl/ext/ImGui/ImGui.h
@@ -24,6 +24,7 @@ class UI final : public core::IReferenceCounted
 		struct SResourceParameters
 		{
 				//! for a given pipeline layout we need to know what is intended for UI resources
+				// TODO: introduce a common type between ImGUI and Blit for the descriptor infos "binding_info.hlsl"
 				struct SBindingInfo
 				{
 					//! descriptor set index for a resource
diff --git a/include/nbl/video/utilities/CComputeBlit.h b/include/nbl/video/utilities/CComputeBlit.h
index eae3f4bf0..69b8d8ba2 100644
--- a/include/nbl/video/utilities/CComputeBlit.h
+++ b/include/nbl/video/utilities/CComputeBlit.h
@@ -47,16 +47,34 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted
 			core::smart_refctd_ptr<system::ILogger>&& logger=nullptr
 		);
 
-		// if you set the balues too small, we'll correct them ourselves anyway
-		struct STask
+		// create your pipelines
+		struct SPipelines
 		{
+			core::smart_refctd_ptr<IGPUComputePipeline> blit;
+			core::smart_refctd_ptr<IGPUComputePipeline> coverage;
+		};
+		struct SPipelinesCreateInfo
+		{
+			// required
+			CAssetConverter* converter;
+			// in theory we _could_ accept either pipeline layout type (or just the base) and make the CPU one back from the GPU
+			const asset::ICPUPipelineLayout* layout;
+			// must be Uniform Texel Buffer descriptor type
+			hlsl::SBindingInfo kernelWeights;
+			// must be Sampled Image descriptor type
+			hlsl::SBindingInfo inputs;
+			// must be Sampler descriptor type
+			hlsl::SBindingInfo samplers;
+			// must be Storage Image descriptor type
+			hlsl::SBindingInfo outputs;
+			//! If you set the values too small, we'll correct them ourselves anyway
+			// needs to be at least as big as the maximum subgroup size 
 			uint32_t workgroupSizeLog2 : 4 = 0;
-			// the TRUE output format, not the storage view format you might manually encode into
-			hlsl::format::TexelBlockFormat outputFormat : 8 = hlsl::format::TexelBlockFormat::TBF_UNKNOWN;
+			//
 			uint32_t sharedMemoryPerInvocation : 6 = 0;
-			uint32_t unused : 14 = 0;
 		};
-		
+		SPipelines createAndCachePipelines(const SPipelinesCreateInfo& info);
+
 		//! Returns the original format if supports STORAGE_IMAGE otherwise returns a format in its compat class which supports STORAGE_IMAGE.
 		inline asset::E_FORMAT getOutputViewFormat(const asset::E_FORMAT format)
 		{
@@ -585,8 +603,6 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted
 			EBT_COUNT
 		};
 
-		void createAndCachePipelines(CAssetConverter* converter, core::smart_refctd_ptr<IGPUComputePipeline>* pipelines, const std::span<const STask> tasks);
-
 		core::smart_refctd_ptr<ILogicalDevice> m_device;
 		system::logger_opt_smart_ptr m_logger;
 		core::smart_refctd_ptr<asset::IShaderCompiler::CCache> m_shaderCache;
diff --git a/src/nbl/ext/ImGui/ImGui.cpp b/src/nbl/ext/ImGui/ImGui.cpp
index 91b9e4115..9e9f9f2e5 100644
--- a/src/nbl/ext/ImGui/ImGui.cpp
+++ b/src/nbl/ext/ImGui/ImGui.cpp
@@ -221,6 +221,7 @@ core::smart_refctd_ptr<video::IGPUGraphicsPipeline> UI::createPipeline(SCreation
 
 					std::stringstream stream;
 
+					// TODO: Use the `ConstevalBindingInfo`
 					stream << "// -> this code has been autogenerated with Nabla ImGUI extension\n"
 						<< "#define NBL_TEXTURES_BINDING_IX " << creationParams.resources.texturesInfo.bindingIx << "\n"
 						<< "#define NBL_SAMPLER_STATES_BINDING_IX " << creationParams.resources.samplersInfo.bindingIx << "\n"
diff --git a/src/nbl/video/utilities/CComputeBlit.cpp b/src/nbl/video/utilities/CComputeBlit.cpp
index 1ceb1ee41..1dd123952 100644
--- a/src/nbl/video/utilities/CComputeBlit.cpp
+++ b/src/nbl/video/utilities/CComputeBlit.cpp
@@ -20,8 +20,9 @@ CComputeBlit::CComputeBlit(smart_refctd_ptr<ILogicalDevice>&& logicalDevice, sma
 		m_shaderCache = make_smart_refctd_ptr<IShaderCompiler::CCache>();
 }
 
-void CComputeBlit::createAndCachePipelines(CAssetConverter* converter, smart_refctd_ptr<IGPUComputePipeline>* pipelines, const std::span<const STask> tasks)
+auto CComputeBlit::createAndCachePipelines(const SPipelinesCreateInfo& info) -> SPipelines
 {
+	SPipelines retval;
 	core::vector<smart_refctd_ptr<ICPUComputePipeline>> cpuPplns;
 	cpuPplns.reserve(tasks.size());
 
@@ -50,12 +51,6 @@ void CComputeBlit::createAndCachePipelines(CAssetConverter* converter, smart_ref
 		}
 		const auto common = [&]()->std::string
 		{
-			// TODO: introduce a common type between ImGUI and Blit for the descriptor infos
-			auto serializeBindingInfo = [](const hlsl::SBindingInfo& info={})->std::string
-			{
-				return "ConstevalBindingInfo<"+std::to_string(info.Set)+","+std::to_string(info.Set)+","+std::to_string(info.Count)+">";
-			};
-
 			std::ostringstream tmp;
 			tmp << R"===(
 #include "nbl/builtin/hlsl/binding_info.hlsl"
@@ -67,7 +62,7 @@ using namespace nbl::hlsl;
 struct ConstevalParameters
 {
 	NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkGroupSize = )===" << (0x1u<<task.workgroupSizeLog2) << R"===(;
-    using kernel_weight_binding_t = )===" << serializeBindingInfo() << R"===(;
+    using kernel_weight_binding_t = )===" << layout->getBindingInfoForHLSL() << R"===(;
     using input_sampler_binding_t = )===" << serializeBindingInfo() << R"===(;
     using input_image_binding_t = )===" << serializeBindingInfo() << R"===(;
     using output_binding_t = )===" << serializeBindingInfo() << R"===(;
@@ -122,6 +117,7 @@ struct ConstevalParameters
 		auto convertResults = reserveResults.convert(params);
 		assert(!convertResults.blocking());
 	}
+	return retval;
 }
 
 #if 0

From bb757d8c29015b2c3ef02c399c9da07067254172 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Tue, 5 Nov 2024 07:33:47 +0100
Subject: [PATCH 09/14] make concepts work

P.S. also make the HLSL `decltype` macro forward perfectly
---
 include/nbl/builtin/hlsl/blit/parameters.hlsl | 32 ++++++++++++++++
 include/nbl/builtin/hlsl/concepts.hlsl        | 13 +++----
 .../nbl/builtin/hlsl/member_test_macros.hlsl  | 13 +++----
 include/nbl/builtin/hlsl/type_traits.hlsl     |  2 +-
 include/nbl/builtin/hlsl/utility.hlsl         | 38 +++++++++++++++++++
 src/nbl/builtin/CMakeLists.txt                |  1 +
 6 files changed, 83 insertions(+), 16 deletions(-)
 create mode 100644 include/nbl/builtin/hlsl/utility.hlsl

diff --git a/include/nbl/builtin/hlsl/blit/parameters.hlsl b/include/nbl/builtin/hlsl/blit/parameters.hlsl
index d280cc523..3992fcd68 100644
--- a/include/nbl/builtin/hlsl/blit/parameters.hlsl
+++ b/include/nbl/builtin/hlsl/blit/parameters.hlsl
@@ -44,6 +44,38 @@ struct parameters_t
 	}
 };
 
+struct parameters2_t
+{
+	float32_t3 fScale;
+	float32_t3 negativeSupportMinusHalf;
+	float32_t referenceAlpha;
+	uint32_t kernelWeightsOffsetY;
+	uint32_t kernelWeightsOffsetZ;
+	uint32_t inPixelCount;
+	uint32_t outPixelCount;
+
+	uint16_t3 inputDims;
+	uint16_t3 outputDims;
+	uint16_t3 windowDims;
+	uint16_t3 phaseCount;
+	uint16_t3 preloadRegion;
+	uint16_t3 iterationRegionXPrefixProducts;
+	uint16_t3 iterationRegionYPrefixProducts;
+	uint16_t3 iterationRegionZPrefixProducts;
+
+	//! Offset into the shared memory array which tells us from where the second buffer of shared memory begins
+	//! Given by max(memory_for_preload_region, memory_for_result_of_y_pass)
+	uint16_t secondScratchOffset;
+	uint16_t outputTexelsPerWGZ;
+
+	uint32_t3 getOutputTexelsPerWG()
+	{
+		//! `outputTexelsPerWG.xy` just happens to be in the first components of `iterationRegionXPrefixProducts` and `iterationRegionYPrefixProducts` -- this is
+		//! the result of how we choose to iterate, i.e. if, in the future, we decide to iterate differently, this needs to change.
+		return uint32_t3(iterationRegionXPrefixProducts.x, iterationRegionYPrefixProducts.x, outputTexelsPerWGZ);
+	}
+};
+
 
 }
 }
diff --git a/include/nbl/builtin/hlsl/concepts.hlsl b/include/nbl/builtin/hlsl/concepts.hlsl
index bf16d3d1c..0aa1af7b5 100644
--- a/include/nbl/builtin/hlsl/concepts.hlsl
+++ b/include/nbl/builtin/hlsl/concepts.hlsl
@@ -7,7 +7,7 @@
 
 #include <nbl/builtin/hlsl/cpp_compat/vector.hlsl>
 #include <nbl/builtin/hlsl/cpp_compat/matrix.hlsl>
-#include <nbl/builtin/hlsl/type_traits.hlsl>
+#include <nbl/builtin/hlsl/utility.hlsl>
 
 
 namespace nbl
@@ -148,20 +148,19 @@ concept matricial = is_matrix<T>::value;
 #define NBL_CONCEPT_BEGIN(LOCAL_PARAM_COUNT) namespace BOOST_PP_CAT(__concept__,NBL_CONCEPT_NAME) \
 {
 //
-#define NBL_CONCEPT_PARAM_T(ID,...) ::nbl::hlsl::impl::declval<__VA_ARGS__ >()
+#define NBL_CONCEPT_PARAM_T(ID,...) ::nbl::hlsl::experimental::declval<__VA_ARGS__ >()
 //
 #define NBL_IMPL_CONCEPT_REQ_TYPE(...) ::nbl::hlsl::make_void_t<typename __VA_ARGS__ >
 #define NBL_IMPL_CONCEPT_REQ_EXPR(...) ::nbl::hlsl::make_void_t<decltype(__VA_ARGS__)>
-#define NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE(E,C,...) C<decltype E ,__VA_ARGS__  >
+#define NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE(E,C,...) ::nbl::hlsl::enable_if_t<C<decltype E ,__VA_ARGS__  > >
 //
-#define NBL_IMPL_CONCEPT_SFINAE (typename=void,typename=void,bool=true)
-#define NBL_IMPL_CONCEPT_SFINAE_SPEC (NBL_IMPL_CONCEPT_REQ_TYPE,NBL_IMPL_CONCEPT_REQ_EXPR,NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE)
+#define NBL_IMPL_CONCEPT_SFINAE (NBL_IMPL_CONCEPT_REQ_TYPE,NBL_IMPL_CONCEPT_REQ_EXPR,NBL_IMPL_CONCEPT_REQ_EXPR_RET_TYPE)
 //
-#define NBL_IMPL_CONCEPT_END_DEF(r,unused,i,e) template<NBL_CONCEPT_FULL_TPLT(), BOOST_PP_TUPLE_ELEM(BOOST_PP_SEQ_HEAD(e),NBL_IMPL_CONCEPT_SFINAE)> \
+#define NBL_IMPL_CONCEPT_END_DEF(r,unused,i,e) template<NBL_CONCEPT_FULL_TPLT(), typename=void> \
 struct BOOST_PP_CAT(__requirement,i) : ::nbl::hlsl::false_type {}; \
 template<NBL_CONCEPT_FULL_TPLT()> \
 struct BOOST_PP_CAT(__requirement,i)<NBL_CONCEPT_TPLT_PARAMS(), \
-NBL_EVAL(BOOST_PP_TUPLE_ELEM(BOOST_PP_SEQ_HEAD(e),NBL_IMPL_CONCEPT_SFINAE_SPEC) BOOST_PP_SEQ_TAIL(e)) \
+NBL_EVAL(BOOST_PP_TUPLE_ELEM(BOOST_PP_SEQ_HEAD(e),NBL_IMPL_CONCEPT_SFINAE) BOOST_PP_SEQ_TAIL(e)) \
  > : ::nbl::hlsl::true_type {};
 //
 #define NBL_IMPL_CONCEPT_END_GET(r,unused,i,e) BOOST_PP_EXPR_IF(i,&&) BOOST_PP_CAT(__concept__,NBL_CONCEPT_NAME)::BOOST_PP_CAT(__requirement,i)<NBL_CONCEPT_TPLT_PARAMS()>::value
diff --git a/include/nbl/builtin/hlsl/member_test_macros.hlsl b/include/nbl/builtin/hlsl/member_test_macros.hlsl
index f103d6d83..7579fb0fa 100644
--- a/include/nbl/builtin/hlsl/member_test_macros.hlsl
+++ b/include/nbl/builtin/hlsl/member_test_macros.hlsl
@@ -4,7 +4,7 @@
 #ifndef _NBL_BUILTIN_HLSL_MEMBER_TEST_MACROS_INCLUDED_
 #define _NBL_BUILTIN_HLSL_MEMBER_TEST_MACROS_INCLUDED_
 
-#include <nbl/builtin/hlsl/type_traits.hlsl>
+#include <nbl/builtin/hlsl/utility.hlsl>
 #include <boost/preprocessor.hpp>
 
 #ifdef __HLSL_VERSION
@@ -24,9 +24,6 @@ enum e_member_presence
     is_const   = 1<<2,
 };
 
-template<class T>
-T declval(){}
-
 template<bool=false>
 struct if_2_else_1 : integral_constant<uint32_t,1> {};
 template<>
@@ -53,7 +50,7 @@ struct is_static_member_##a<T,typename enable_if<!is_same<decltype(T::a),void>::
 template<class T, class=void> \
 struct is_member_##a: false_type { using type = void; }; \
 template<class T> \
-struct is_member_##a<T,typename enable_if<!is_same<decltype(declval<T>().a),void>::value,void>::type> : true_type { using type = decltype(declval<T>().a); }; \
+struct is_member_##a<T,typename enable_if<!is_same<decltype(experimental::declval<T>().a),void>::value,void>::type> : true_type { using type = decltype(experimental::declval<T>().a); }; \
 } \
 template<class T> \
 struct has_member_##a {  NBL_CONSTEXPR_STATIC_INLINE e_member_presence value = (e_member_presence)(impl::is_member_##a<T>::value + 2*impl::is_static_member_##a<T>::value + 4*is_const<typename impl::is_member_##a<T>::type>::value); }; \
@@ -72,7 +69,7 @@ NBL_GENERATE_MEMBER_TESTER(w)
 #define NBL_TYPE_DECLARE(z, n, x) BOOST_PP_COMMA_IF(x) typename Arg##n
 #define NBL_TYPE_DECLARE_DEFAULT(z, n, x) BOOST_PP_COMMA_IF(x) typename Arg##n=void
 #define NBL_TYPE_FWD(z, n, x) BOOST_PP_COMMA_IF(x) Arg##n
-#define NBL_DECLVAL_DECLARE(z, n, x) impl::declval<Arg##n>() BOOST_PP_COMMA_IF(BOOST_PP_NOT_EQUAL(BOOST_PP_INC(n), x))
+#define NBL_DECLVAL_DECLARE(z, n, x) experimental::declval<Arg##n>() BOOST_PP_COMMA_IF(BOOST_PP_NOT_EQUAL(BOOST_PP_INC(n), x))
 
 #define GENERATE_STATIC_METHOD_TESTER_SPEC(z, n, x) \
 template<class T BOOST_PP_REPEAT(n, NBL_TYPE_DECLARE, n)> \
@@ -89,9 +86,9 @@ BOOST_PP_REPEAT(n, GENERATE_STATIC_METHOD_TESTER_SPEC, x)
 
 #define GENERATE_METHOD_TESTER_SPEC(z, n, x) \
 template<class T BOOST_PP_REPEAT(n, NBL_TYPE_DECLARE, n)> \
-struct has_method_##x<T BOOST_PP_REPEAT(n, NBL_TYPE_FWD, n), typename make_void<decltype(impl::declval<T>().x(BOOST_PP_REPEAT(n, NBL_DECLVAL_DECLARE, n)))>::type> : impl::if_2_else_1<impl::has_static_method_##x<T BOOST_PP_REPEAT(n, NBL_TYPE_FWD, n)>::value> \
+struct has_method_##x<T BOOST_PP_REPEAT(n, NBL_TYPE_FWD, n), typename make_void<decltype(experimental::declval<T>().x(BOOST_PP_REPEAT(n, NBL_DECLVAL_DECLARE, n)))>::type> : impl::if_2_else_1<impl::has_static_method_##x<T BOOST_PP_REPEAT(n, NBL_TYPE_FWD, n)>::value> \
 { \
-    using return_type = decltype(impl::declval<T>().x(BOOST_PP_REPEAT(n, NBL_DECLVAL_DECLARE, n))); \
+    using return_type = decltype(experimental::declval<T>().x(BOOST_PP_REPEAT(n, NBL_DECLVAL_DECLARE, n))); \
     NBL_CONSTEXPR_STATIC_INLINE uint arg_count = n; \
 }; 
 
diff --git a/include/nbl/builtin/hlsl/type_traits.hlsl b/include/nbl/builtin/hlsl/type_traits.hlsl
index 3a4e0eda7..1481d087f 100644
--- a/include/nbl/builtin/hlsl/type_traits.hlsl
+++ b/include/nbl/builtin/hlsl/type_traits.hlsl
@@ -176,7 +176,7 @@ struct make_void { using type = void; };
 
 #ifdef __HLSL_VERSION // HLSL
 
-#define decltype(expr) __decltype(expr)
+#define decltype(...) __decltype(__VA_ARGS__)
 
 template<class T>
 struct type_identity 
diff --git a/include/nbl/builtin/hlsl/utility.hlsl b/include/nbl/builtin/hlsl/utility.hlsl
new file mode 100644
index 000000000..487d4a7d7
--- /dev/null
+++ b/include/nbl/builtin/hlsl/utility.hlsl
@@ -0,0 +1,38 @@
+// Copyright (C) 2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+#ifndef _NBL_BUILTIN_HLSL_UTILITY_INCLUDED_
+#define _NBL_BUILTIN_HLSL_UTILITY_INCLUDED_
+
+
+#include <nbl/builtin/hlsl/type_traits.hlsl>
+
+
+// for now we only implement declval
+namespace nbl
+{
+namespace hlsl
+{
+#ifndef __HLSL_VERSION
+
+// C++ counterpart of `std::declval`: meant for unevaluated contexts only, instantiating the body trips the assert below
+template<class T> std::add_rvalue_reference_t<T> declval() noexcept
+{
+	static_assert(!std::is_same_v<T,T>,"Actually calling declval is ill-formed."); // type-dependent `false`, a literal `false` is rejected at definition time before CWG2518
+}
+
+#else
+
+namespace experimental
+{
+
+template<class T>
+T declval() {}
+
+}
+
+#endif
+}
+}
+
+#endif
diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt
index b3ec566be..53ab53497 100644
--- a/src/nbl/builtin/CMakeLists.txt
+++ b/src/nbl/builtin/CMakeLists.txt
@@ -256,6 +256,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/concepts.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/functional.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/limits.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/type_traits.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/utility.hlsl")
 #metaprogramming
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/mpl.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/member_test_macros.hlsl")

From 067e8a385750177bf91f92aa0a1c832ba0d1c6b6 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Tue, 5 Nov 2024 16:46:08 +0100
Subject: [PATCH 10/14] make pipeline layouts const and improve the
 `getBindingInfoForHLSL`

---
 include/nbl/asset/ICPUComputePipeline.h  |  6 ++---
 include/nbl/asset/ICPUGraphicsPipeline.h |  4 +--
 include/nbl/asset/ICPUPipeline.h         |  6 ++---
 include/nbl/asset/IPipeline.h            |  6 ++---
 include/nbl/asset/IPipelineLayout.h      | 32 ++++++++++++++++--------
 5 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/include/nbl/asset/ICPUComputePipeline.h b/include/nbl/asset/ICPUComputePipeline.h
index f3af332c6..14b027715 100644
--- a/include/nbl/asset/ICPUComputePipeline.h
+++ b/include/nbl/asset/ICPUComputePipeline.h
@@ -25,7 +25,7 @@ class ICPUComputePipeline : public ICPUPipeline<IPipeline<ICPUPipelineLayout>,1>
         {
             if (!params.layout)
                 return nullptr;
-            auto retval = new ICPUComputePipeline(core::smart_refctd_ptr<ICPUPipelineLayout>(params.layout));
+            auto retval = new ICPUComputePipeline(core::smart_refctd_ptr<const ICPUPipelineLayout>(params.layout));
             if (!retval->setSpecInfo(params.shader))
             {
                 retval->drop();
@@ -48,7 +48,7 @@ class ICPUComputePipeline : public ICPUPipeline<IPipeline<ICPUPipelineLayout>,1>
         using base_t::base_t;
         virtual ~ICPUComputePipeline() = default;
 
-        base_t* clone_impl(core::smart_refctd_ptr<ICPUPipelineLayout>&& layout) const override
+        base_t* clone_impl(core::smart_refctd_ptr<const ICPUPipelineLayout>&& layout) const override 
         {
             return new ICPUComputePipeline(std::move(layout));
         }
@@ -57,7 +57,7 @@ class ICPUComputePipeline : public ICPUPipeline<IPipeline<ICPUPipelineLayout>,1>
         {
             if (ix!=0)
                 return m_stages[0].shader.get();
-            return m_layout.get();
+            return const_cast<ICPUPipelineLayout*>(m_layout.get());
         }
 
         inline int8_t stageToIndex(const ICPUShader::E_SHADER_STAGE stage) const override
diff --git a/include/nbl/asset/ICPUGraphicsPipeline.h b/include/nbl/asset/ICPUGraphicsPipeline.h
index 8b922c5a4..e319b2750 100644
--- a/include/nbl/asset/ICPUGraphicsPipeline.h
+++ b/include/nbl/asset/ICPUGraphicsPipeline.h
@@ -65,7 +65,7 @@ class ICPUGraphicsPipeline final : public ICPUPipeline<IGraphicsPipeline<ICPUPip
 		using base_t::base_t;
         ~ICPUGraphicsPipeline() = default;
 
-		base_t* clone_impl(core::smart_refctd_ptr<ICPUPipelineLayout>&& layout) const override
+		base_t* clone_impl(core::smart_refctd_ptr<const ICPUPipelineLayout>&& layout) const override
 		{
 			std::array<ICPUShader::SSpecInfo,GRAPHICS_SHADER_STAGE_COUNT> _shaders;
 			for (auto i=0; i<GRAPHICS_SHADER_STAGE_COUNT; i++)
@@ -80,7 +80,7 @@ class ICPUGraphicsPipeline final : public ICPUPipeline<IGraphicsPipeline<ICPUPip
 		inline IAsset* getDependant_impl(const size_t ix) override
 		{
 			if (ix==0)
-				return m_layout.get();
+				return const_cast<ICPUPipelineLayout*>(m_layout.get());
 			if (ix==1)
 				return m_renderpass.get();
 			size_t stageCount = 0;
diff --git a/include/nbl/asset/ICPUPipeline.h b/include/nbl/asset/ICPUPipeline.h
index 7a0f0c5bf..5c43df017 100644
--- a/include/nbl/asset/ICPUPipeline.h
+++ b/include/nbl/asset/ICPUPipeline.h
@@ -51,11 +51,11 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase
         ICPUPipelineLayout* getLayout() 
         {
             assert(isMutable());
-            return PipelineNonAssetBase::m_layout.get(); 
+            return const_cast<ICPUPipelineLayout*>(PipelineNonAssetBase::m_layout.get());
         }
         const ICPUPipelineLayout* getLayout() const { return PipelineNonAssetBase::m_layout.get(); }
 
-        inline void setLayout(core::smart_refctd_ptr<ICPUPipelineLayout>&& _layout)
+        inline void setLayout(core::smart_refctd_ptr<const ICPUPipelineLayout>&& _layout)
         {
             assert(isMutable());
             PipelineNonAssetBase::m_layout = std::move(_layout);
@@ -117,7 +117,7 @@ class ICPUPipeline : public IAsset, public PipelineNonAssetBase
         using PipelineNonAssetBase::PipelineNonAssetBase;
         virtual ~ICPUPipeline() = default;
 
-        virtual this_t* clone_impl(core::smart_refctd_ptr<ICPUPipelineLayout>&& layout) const = 0;
+        virtual this_t* clone_impl(core::smart_refctd_ptr<const ICPUPipelineLayout>&& layout) const = 0;
         virtual int8_t stageToIndex(const ICPUShader::E_SHADER_STAGE stage) const = 0;
 
         struct ShaderStage {
diff --git a/include/nbl/asset/IPipeline.h b/include/nbl/asset/IPipeline.h
index 6af7b50bf..40623876f 100644
--- a/include/nbl/asset/IPipeline.h
+++ b/include/nbl/asset/IPipeline.h
@@ -35,7 +35,7 @@ class IPipeline
 		struct SCreationParams
 		{
 			public:
-				PipelineLayout* layout = nullptr;
+				const PipelineLayout* layout = nullptr;
 
 			protected:
 				// This is not public to make sure that different pipelines only get the enums they support
@@ -107,9 +107,9 @@ class IPipeline
 		inline const PipelineLayout* getLayout() const {return m_layout.get();}
 
 	protected:
-		inline IPipeline(core::smart_refctd_ptr<PipelineLayout>&& _layout) : m_layout(std::move(_layout)) {}
+		inline IPipeline(core::smart_refctd_ptr<const PipelineLayout>&& _layout) : m_layout(std::move(_layout)) {}
 
-		core::smart_refctd_ptr<PipelineLayout> m_layout;
+		core::smart_refctd_ptr<const PipelineLayout> m_layout;
 };
 
 }
diff --git a/include/nbl/asset/IPipelineLayout.h b/include/nbl/asset/IPipelineLayout.h
index fdbc97bbf..7cc980290 100644
--- a/include/nbl/asset/IPipelineLayout.h
+++ b/include/nbl/asset/IPipelineLayout.h
@@ -143,41 +143,51 @@ class IPipelineLayout
         }
 
         // utility function, if you compile shaders for specific layouts, not create layouts given shaders
-        using desc_type_bitset_t = std::bitset<static_cast<size_t>(IDescriptor::E_TYPE::ET_COUNT)>;
+        struct SBindingKey // lookup key consumed by `getBindingInfoForHLSL`
+        {
+            using type_bitset_t = std::bitset<static_cast<size_t>(IDescriptor::E_TYPE::ET_COUNT)>; // one bit per `IDescriptor::E_TYPE`
+
+            hlsl::SBindingInfo binding = {}; // descriptor set index and binding number to look up
+            core::bitflag<IShader::E_SHADER_STAGE> requiredStages = IShader::E_SHADER_STAGE::ESS_UNKNOWN; // compared against the stage flags of the binding that was found
+            // every one of the `ET_COUNT` type bits is set by default; could have just initialized with `~type_bitset_t()` in C++23
+            type_bitset_t allowedTypes = type_bitset_t((0x1u<<static_cast<size_t>(IDescriptor::E_TYPE::ET_COUNT))-1);
+        };
         // TODO: add constraints for stage and creation flags, or just return the storage index & redirect?
-        core::string getBindingInfoForHLSL(const hlsl::SBindingInfo& info, const desc_type_bitset_t allowedTypes=desc_type_bitset_t().set()) const
+        core::string getBindingInfoForHLSL(const SBindingKey& key) const
         {
-            if (info.set>=DESCRIPTOR_SET_COUNT)
-                return "#error \"::nbl::hlsl::SBindingInfo::set out of range!\"";
-            const auto* layout = m_descSetLayouts[info.set];
+            if (key.binding.set>=DESCRIPTOR_SET_COUNT)
+                return "#error \"IPipelineLayout::SBindingKey::binding::set out of range!\"";
+            const auto* layout = m_descSetLayouts[key.binding.set].get();
             if (!layout)
-                return "#error \"::nbl::hlsl::SBindingInfo::set layout is nullptr!\"";
+                return "#error \"IPipelineLayout::SBindingKey::binding::set layout is nullptr!\"";
             //
             using redirect_t = IDescriptorSetLayoutBase::CBindingRedirect;
             using storage_range_index_t = redirect_t::storage_range_index_t;
             const redirect_t* redirect;
             storage_range_index_t found;
             {
-                const redirect_t::binding_number_t binding(info.binding);
+                const redirect_t::binding_number_t binding(key.binding.binding);
                 for (auto t=0u; t<static_cast<size_t>(IDescriptor::E_TYPE::ET_COUNT); t++)
-                if (allowedTypes.test(t))
+                if (key.allowedTypes.test(t))
                 {
                     redirect = &layout->getDescriptorRedirect(static_cast<IDescriptor::E_TYPE>(t));
                     found = redirect->findBindingStorageIndex(binding);
                     if (found)
                         break;
                 }
-                if (!found && allowedTypes.test(static_cast<size_t>(IDescriptor::E_TYPE::ET_SAMPLER)))
+                if (!found && key.allowedTypes.test(static_cast<size_t>(IDescriptor::E_TYPE::ET_SAMPLER)))
                 {
                     redirect = &layout->getImmutableSamplerRedirect();
                     found = redirect->findBindingStorageIndex(binding);
                 }
                 if (!found)
-                    return "#error \"Could not find `::nbl::hlsl::SBindingInfo::binding` in `::nbl::hlsl::SBindingInfo::set`'s layout!\"";
+                    return "#error \"Could not find `IPipelineLayout::SBindingKey::binding::binding` in `IPipelineLayout::SBindingKey::binding::set`'s layout!\"";
             }
+            if (!redirect->getStageFlags(found).hasFlags(key.requiredStages)) // fail only when at least one required stage is missing from the binding
+                return "#error \"Binding found in the layout doesn't have all the `IPipelineLayout::SBindingKey::requiredStages` flags!\"";
             const auto count = redirect->getCount(found);
             assert(count); // this layout should have never passed validation
-            return "::nbl::hlsl::ConstevalBindingInfo<"+std::to_string(info.set)+","+std::to_string(info.binding)+","+std::to_string(count)+">";
+            return "::nbl::hlsl::ConstevalBindingInfo<"+std::to_string(key.binding.set)+","+std::to_string(key.binding.binding)+","+std::to_string(count)+">";
         }
 
     protected:

From 395ac581fd5c28efaf8bb4c3a7123ea6b194d6e0 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Tue, 5 Nov 2024 16:46:58 +0100
Subject: [PATCH 11/14] start using the asset converter to make Blit shaders

---
 examples_tests                             |   2 +-
 include/nbl/asset/IGraphicsPipeline.h      |   2 +-
 include/nbl/video/utilities/CComputeBlit.h | 293 +++------------------
 src/nbl/builtin/CMakeLists.txt             |  18 --
 src/nbl/video/utilities/CComputeBlit.cpp   | 204 +++++++++-----
 5 files changed, 188 insertions(+), 331 deletions(-)

diff --git a/examples_tests b/examples_tests
index f6492b0de..e95c56290 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit f6492b0de975754f960a2761aaacf3a1a3354100
+Subproject commit e95c56290e7f31f3f2a2b6e07ccafd7feb2e686e
diff --git a/include/nbl/asset/IGraphicsPipeline.h b/include/nbl/asset/IGraphicsPipeline.h
index 5f6365525..62861fdc9 100644
--- a/include/nbl/asset/IGraphicsPipeline.h
+++ b/include/nbl/asset/IGraphicsPipeline.h
@@ -155,7 +155,7 @@ class IGraphicsPipeline : public IPipeline<PipelineLayoutType>, public IGraphics
 
     protected:
         explicit IGraphicsPipeline(const SCreationParams& _params) :
-            IPipeline<PipelineLayoutType>(core::smart_refctd_ptr<PipelineLayoutType>(_params.layout)),
+            IPipeline<PipelineLayoutType>(core::smart_refctd_ptr<const PipelineLayoutType>(_params.layout)),
             m_params(_params.cached), m_renderpass(core::smart_refctd_ptr<renderpass_t>(_params.renderpass)) {}
 
         SCachedCreationParams m_params;
diff --git a/include/nbl/video/utilities/CComputeBlit.h b/include/nbl/video/utilities/CComputeBlit.h
index 69b8d8ba2..4180ac420 100644
--- a/include/nbl/video/utilities/CComputeBlit.h
+++ b/include/nbl/video/utilities/CComputeBlit.h
@@ -8,9 +8,16 @@
 namespace nbl::video
 {
 
-class NBL_API2 CComputeBlit : public core::IReferenceCounted
+class CComputeBlit : public core::IReferenceCounted
 {
 	public:
+		constexpr static inline asset::SPushConstantRange DefaultPushConstantRange = {
+			.stageFlags = IGPUShader::E_SHADER_STAGE::ESS_COMPUTE,
+			.offset = 0ull,
+			.size = sizeof(hlsl::blit::parameters2_t)
+		};
+		constexpr static inline std::span<const asset::SPushConstantRange> DefaultPushConstantRanges = {&DefaultPushConstantRange,1};
+
 		// Coverage adjustment needs alpha to be stored in HDR with high precision
 		static inline asset::E_FORMAT getCoverageAdjustmentIntermediateFormat(const asset::E_FORMAT format)
 		{
@@ -41,7 +48,7 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted
 		}
 
 		// ctor
-		CComputeBlit(
+		NBL_API2 CComputeBlit(
 			core::smart_refctd_ptr<ILogicalDevice>&& logicalDevice,
 			core::smart_refctd_ptr<asset::IShaderCompiler::CCache>&& cache=nullptr,
 			core::smart_refctd_ptr<system::ILogger>&& logger=nullptr
@@ -52,6 +59,7 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted
 		{
 			core::smart_refctd_ptr<IGPUComputePipeline> blit;
 			core::smart_refctd_ptr<IGPUComputePipeline> coverage;
+			uint16_t workgroupSize;
 		};
 		struct SPipelinesCreateInfo
 		{
@@ -67,13 +75,13 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted
 			hlsl::SBindingInfo samplers;
 			// must be Storage Image descriptor type
 			hlsl::SBindingInfo outputs;
-			//! If you set the balues too small, we'll correct them ourselves anyway
+			//! If you set the balues too small, we'll correct them ourselves anyway, default values of 0 means we guess and provide our defaults
 			// needs to be at least as big as the maximum subgroup size 
-			uint32_t workgroupSizeLog2 : 4 = 0;
-			//
-			uint32_t sharedMemoryPerInvocation : 6 = 0;
+			uint16_t workgroupSizeLog2 : 4 = 0;
+			// in bytes, needs to be at least enough to store two full input pixels per invocation
+			uint16_t sharedMemoryPerInvocation : 6 = 0;
 		};
-		SPipelines createAndCachePipelines(const SPipelinesCreateInfo& info);
+		NBL_API2 SPipelines createAndCachePipelines(const SPipelinesCreateInfo& info);
 
 		//! Returns the original format if supports STORAGE_IMAGE otherwise returns a format in its compat class which supports STORAGE_IMAGE.
 		inline asset::E_FORMAT getOutputViewFormat(const asset::E_FORMAT format)
@@ -99,101 +107,38 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted
 			}
 		}
 
-#if 0
-		// @param `alphaBinCount` is only required to size the histogram present in the default nbl_glsl_blit_AlphaStatistics_t in default_compute_common.comp
-		core::smart_refctd_ptr<video::IGPUShader> createAlphaTestSpecializedShader(const asset::IImage::E_TYPE inImageType, const uint32_t alphaBinCount = asset::IBlitUtilities::DefaultAlphaBinCount);
-
-		core::smart_refctd_ptr<video::IGPUComputePipeline> getAlphaTestPipeline(const uint32_t alphaBinCount, const asset::IImage::E_TYPE imageType)
-		{
-			const auto workgroupDims = getDefaultWorkgroupDims(imageType);
-			const auto paddedAlphaBinCount = getPaddedAlphaBinCount(workgroupDims, alphaBinCount);
-
-			assert(paddedAlphaBinCount >= asset::IBlitUtilities::MinAlphaBinCount);
-			const auto pipelineIndex = (paddedAlphaBinCount / asset::IBlitUtilities::MinAlphaBinCount) - 1;
-
-			if (m_alphaTestPipelines[pipelineIndex][imageType])
-				return m_alphaTestPipelines[pipelineIndex][imageType];
-
-			auto specShader = createAlphaTestSpecializedShader(imageType, paddedAlphaBinCount);
-			IGPUComputePipeline::SCreationParams creationParams;
-			creationParams.shader.shader = specShader.get();
-			creationParams.shader.entryPoint = "main";
-			creationParams.layout = m_blitPipelineLayout[EBT_COVERAGE_ADJUSTMENT].get();
-			assert(m_device->createComputePipelines(nullptr, { &creationParams, &creationParams + 1 }, &m_alphaTestPipelines[pipelineIndex][imageType]));
-
-			return m_alphaTestPipelines[pipelineIndex][imageType];
-		}
-
-		// @param `outFormat` dictates encoding.
-		core::smart_refctd_ptr<video::IGPUShader> createNormalizationSpecializedShader(const asset::IImage::E_TYPE inImageType, const asset::E_FORMAT outFormat,
-			const uint32_t alphaBinCount = asset::IBlitUtilities::DefaultAlphaBinCount);
-
-		core::smart_refctd_ptr<video::IGPUComputePipeline> getNormalizationPipeline(const asset::IImage::E_TYPE imageType, const asset::E_FORMAT outFormat,
-			const uint32_t alphaBinCount = asset::IBlitUtilities::DefaultAlphaBinCount)
+		// Use the return values of `getOutputViewFormat` and `getCoverageAdjustmentIntermediateFormat` for this
+		static inline uint32_t getAlphaBinCount(const uint16_t workgroupSize, const asset::E_FORMAT intermediateAlpha, const uint32_t layersToBlit)
 		{
-			const auto workgroupDims = getDefaultWorkgroupDims(imageType);
-			const uint32_t paddedAlphaBinCount = getPaddedAlphaBinCount(workgroupDims, alphaBinCount);
-			const SNormalizationCacheKey key = { imageType, paddedAlphaBinCount, outFormat };
-
-			if (m_normalizationPipelines.find(key) == m_normalizationPipelines.end())
+			uint16_t baseBucketCount;
+			using format_t = nbl::asset::E_FORMAT;
+			switch (intermediateAlpha)
 			{
-				auto specShader = createNormalizationSpecializedShader(imageType, outFormat, paddedAlphaBinCount);
-				IGPUComputePipeline::SCreationParams creationParams;
-				creationParams.shader.shader = specShader.get();
-				creationParams.shader.entryPoint = "main";
-				creationParams.layout = m_blitPipelineLayout[EBT_COVERAGE_ADJUSTMENT].get();
-				assert(m_device->createComputePipelines(nullptr, { &creationParams, &creationParams + 1 }, &m_normalizationPipelines[key]));
+				case format_t::EF_R8_UNORM: [[fallthrough]];
+				case format_t::EF_R8_SNORM:
+					baseBucketCount = 256;
+					break;
+				case format_t::EF_R16_SFLOAT:
+					baseBucketCount = 512;
+					break;
+				case format_t::EF_R16_UNORM: [[fallthrough]];
+				case format_t::EF_R16_SNORM: // no `[[fallthrough]]` here, the attribute may only directly precede another case/default label
+					baseBucketCount = 1024;
+					break;
+				case format_t::EF_R32_SFLOAT:
+					baseBucketCount = 2048;
+					break;
+				default: // any other intermediate format yields zero bins
+					return 0;
 			}
-
-			return m_normalizationPipelines[key];
+			// the absolute minimum needed to store a single pixel of a worst case format (precise, all 4 channels)
+			constexpr auto singlePixelStorage = 4*sizeof(hlsl::float32_t);
+			constexpr auto ratio = singlePixelStorage/sizeof(uint16_t);
+			const auto paddedAlphaBinCount = core::min(core::roundUp(baseBucketCount,workgroupSize),workgroupSize*ratio);
+			return paddedAlphaBinCount*layersToBlit;
 		}
 
-		template <typename BlitUtilities>
-		core::smart_refctd_ptr<video::IGPUComputePipeline> getBlitPipeline(
-			const asset::E_FORMAT									outFormat,
-			const asset::IImage::E_TYPE								imageType,
-			const core::vectorSIMDu32& inExtent,
-			const core::vectorSIMDu32& outExtent,
-			const asset::IBlitUtilities::E_ALPHA_SEMANTIC			alphaSemantic,
-			const typename BlitUtilities::convolution_kernels_t& kernels,
-			const uint32_t											workgroupSize = 256,
-			const uint32_t											alphaBinCount = asset::IBlitUtilities::DefaultAlphaBinCount)
-		{
-			const auto paddedAlphaBinCount = getPaddedAlphaBinCount(core::vectorSIMDu32(workgroupSize, 1, 1, 1), alphaBinCount);
-
-			const SBlitCacheKey key =
-			{
-				.wgSize = workgroupSize,
-				.imageType = imageType,
-				.alphaBinCount = paddedAlphaBinCount,
-				.outFormat = outFormat,
-				.smemSize = m_availableSharedMemory,
-				.coverageAdjustment = (alphaSemantic == asset::IBlitUtilities::EAS_REFERENCE_OR_COVERAGE)
-			};
-
-			if (m_blitPipelines.find(key) == m_blitPipelines.end())
-			{
-				const auto blitType = (alphaSemantic == asset::IBlitUtilities::EAS_REFERENCE_OR_COVERAGE) ? EBT_COVERAGE_ADJUSTMENT : EBT_REGULAR;
-
-				auto specShader = createBlitSpecializedShader<BlitUtilities>(
-					outFormat,
-					imageType,
-					inExtent,
-					outExtent,
-					alphaSemantic,
-					kernels,
-					workgroupSize,
-					paddedAlphaBinCount);
-
-				IGPUComputePipeline::SCreationParams creationParams;
-				creationParams.shader.shader = specShader.get();
-				creationParams.shader.entryPoint = "main";
-				creationParams.layout = m_blitPipelineLayout[blitType].get();
-				m_device->createComputePipelines(nullptr, { &creationParams, &creationParams + 1 }, &m_blitPipelines[key]);
-			}
-
-			return m_blitPipelines[key];
-		}
+#if 0
 
 		//! Returns the number of output texels produced by one workgroup, deciding factor is `m_availableSharedMemory`.
 		//! @param outImageFormat is the format of output (of the blit step) image.
@@ -368,152 +313,10 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted
 				outDispatchInfo.wgCount[2] = workgroupCount[2];
 		}
 
-		static inline core::vectorSIMDu32 getDefaultWorkgroupDims(const asset::IImage::E_TYPE imageType)
-		{
-			switch (imageType)
-			{
-				case asset::IImage::ET_1D:
-					return core::vectorSIMDu32(256, 1, 1, 1);
-				case asset::IImage::ET_2D:
-					return core::vectorSIMDu32(16, 16, 1, 1);
-				case asset::IImage::ET_3D:
-					return core::vectorSIMDu32(8, 8, 4, 1);
-				default:
-					return core::vectorSIMDu32(1, 1, 1, 1);
-			}
-		}
-
-		static inline size_t getCoverageAdjustmentScratchSize(const asset::IBlitUtilities::E_ALPHA_SEMANTIC alphaSemantic, const asset::IImage::E_TYPE imageType, const uint32_t alphaBinCount, const uint32_t layersToBlit)
-		{
-			if (alphaSemantic != asset::IBlitUtilities::EAS_REFERENCE_OR_COVERAGE)
-				return 0;
-
-			const auto workgroupDims = getDefaultWorkgroupDims(imageType);
-			const auto paddedAlphaBinCount = getPaddedAlphaBinCount(workgroupDims, alphaBinCount);
-			const auto requiredSize = (sizeof(uint32_t) + paddedAlphaBinCount * sizeof(uint32_t)) * layersToBlit;
-			return requiredSize;
-		}
-
-		bool updateDescriptorSet(
-			video::IGPUDescriptorSet* blitDS,
-			video::IGPUDescriptorSet* kernelWeightsDS,
-			core::smart_refctd_ptr<video::IGPUImageView>			inImageView,
-			core::smart_refctd_ptr<video::IGPUImageView>			outImageView,
-			core::smart_refctd_ptr<video::IGPUBuffer>				coverageAdjustmentScratchBuffer,
-			core::smart_refctd_ptr<video::IGPUBufferView>			kernelWeightsUTB,
-			const asset::ISampler::E_TEXTURE_CLAMP					wrapU = asset::ISampler::ETC_CLAMP_TO_EDGE,
-			const asset::ISampler::E_TEXTURE_CLAMP					wrapV = asset::ISampler::ETC_CLAMP_TO_EDGE,
-			const asset::ISampler::E_TEXTURE_CLAMP					wrapW = asset::ISampler::ETC_CLAMP_TO_EDGE,
-			const asset::ISampler::E_TEXTURE_BORDER_COLOR			borderColor = asset::ISampler::ETBC_FLOAT_OPAQUE_BLACK)
-		{
-			constexpr auto MAX_DESCRIPTOR_COUNT = 3;
-
-			auto updateDS = [this, coverageAdjustmentScratchBuffer](video::IGPUDescriptorSet* ds, video::IGPUDescriptorSet::SDescriptorInfo* infos) -> bool
-			{
-				const auto bindingCount = ds->getLayout()->getTotalBindingCount();
-				if ((bindingCount == 3) && !coverageAdjustmentScratchBuffer)
-					return false;
-
-				video::IGPUDescriptorSet::SWriteDescriptorSet writes[MAX_DESCRIPTOR_COUNT] = {};
-
-				uint32_t infoIdx = 0;
-				uint32_t writeCount = 0;
-				for (uint32_t t = 0; t < static_cast<uint32_t>(asset::IDescriptor::E_TYPE::ET_COUNT); ++t)
-				{
-					const auto type = static_cast<asset::IDescriptor::E_TYPE>(t);
-					const auto& redirect = ds->getLayout()->getDescriptorRedirect(type);
-					const auto declaredBindingCount = redirect.getBindingCount();
-
-					for (uint32_t i = 0; i < declaredBindingCount; ++i)
-					{
-						auto& write = writes[writeCount++];
-						write.dstSet = ds;
-						write.binding = redirect.getBinding(IGPUDescriptorSetLayout::CBindingRedirect::storage_range_index_t{ i }).data;
-						write.arrayElement = 0u;
-						write.count = redirect.getCount(IGPUDescriptorSetLayout::CBindingRedirect::storage_range_index_t{ i });
-						write.info = &infos[infoIdx];
-
-						infoIdx += write.count;
-					}
-				}
-				assert(writeCount == bindingCount);
-				m_device->updateDescriptorSets(writeCount, writes, 0u, nullptr);
-
-				return true;
-			};
-
-			if (blitDS)
-			{
-				if (!inImageView || !outImageView)
-					return false;
-
-				video::IGPUDescriptorSet::SDescriptorInfo infos[MAX_DESCRIPTOR_COUNT] = {};
-
-				if (!samplers[wrapU][wrapV][wrapW][borderColor])
-				{
-					video::IGPUSampler::SParams params = {};
-					params.TextureWrapU = wrapU;
-					params.TextureWrapV = wrapV;
-					params.TextureWrapW = wrapW;
-					params.BorderColor = borderColor;
-					params.MinFilter = asset::ISampler::ETF_NEAREST;
-					params.MaxFilter = asset::ISampler::ETF_NEAREST;
-					params.MipmapMode = asset::ISampler::ESMM_NEAREST;
-					params.AnisotropicFilter = 0u;
-					params.CompareEnable = 0u;
-					params.CompareFunc = asset::ISampler::ECO_ALWAYS;
-
-					samplers[wrapU][wrapV][wrapW][borderColor] = m_device->createSampler(params);
-					if (!samplers[wrapU][wrapV][wrapW][borderColor])
-						return false;
-				}
-
-				infos[0].desc = inImageView;
-				infos[0].info.image.imageLayout = asset::IImage::LAYOUT::READ_ONLY_OPTIMAL;
-				infos[0].info.combinedImageSampler.sampler = samplers[wrapU][wrapV][wrapW][borderColor];
-
-				infos[1].desc = outImageView;
-				infos[1].info.image.imageLayout = asset::IImage::LAYOUT::GENERAL;
-				infos[1].info.combinedImageSampler.sampler = nullptr;
-
-				if (coverageAdjustmentScratchBuffer)
-				{
-					infos[2].desc = coverageAdjustmentScratchBuffer;
-					infos[2].info.buffer.offset = 0;
-					infos[2].info.buffer.size = coverageAdjustmentScratchBuffer->getSize();
-				}
-
-				if (!updateDS(blitDS, infos))
-					return false;
-			}
-
-			if (kernelWeightsDS)
-			{
-				video::IGPUDescriptorSet::SDescriptorInfo info = {};
-				info.desc = kernelWeightsUTB;
-				info.info.buffer.offset = 0ull;
-				info.info.buffer.size = kernelWeightsUTB->getUnderlyingBuffer()->getSize();
-
-				if (!updateDS(kernelWeightsDS, &info))
-					return false;
-			}
-
-			return true;
-		}
-
 		//! User is responsible for the memory barriers between previous writes and the first
 		//! dispatch on the input image, and future reads of output image and the last dispatch.
 		template <typename BlitUtilities>
 		inline void blit(
-			video::IGPUCommandBuffer* cmdbuf,
-			const asset::IBlitUtilities::E_ALPHA_SEMANTIC			alphaSemantic,
-			video::IGPUDescriptorSet* alphaTestDS,
-			video::IGPUComputePipeline* alphaTestPipeline,
-			video::IGPUDescriptorSet* blitDS,
-			video::IGPUDescriptorSet* blitWeightsDS,
-			video::IGPUComputePipeline* blitPipeline,
-			video::IGPUDescriptorSet* normalizationDS,
-			video::IGPUComputePipeline* normalizationPipeline,
 			const core::vectorSIMDu32& inImageExtent,
 			const asset::IImage::E_TYPE								inImageType,
 			const asset::E_FORMAT									inImageFormat,
@@ -627,7 +430,7 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted
 		}
 
 		//! Query shared memory size for a given `outputTexelsPerWG`.
-		size_t getRequiredSharedMemorySize(
+		inline size_t getRequiredSharedMemorySize(
 			const core::vectorSIMDu32& outputTexelsPerWG,
 			const core::vectorSIMDu32& outExtent,
 			const asset::IImage::E_TYPE imageType,
@@ -641,16 +444,6 @@ class NBL_API2 CComputeBlit : public core::IReferenceCounted
 			const size_t requiredSmem = (core::max(preloadRegion.x * preloadRegion.y * preloadRegion.z, outputTexelsPerWG.x * outputTexelsPerWG.y * preloadRegion.z) + outputTexelsPerWG.x * preloadRegion.y * preloadRegion.z) * channelCount * sizeof(float);
 			return requiredSmem;
 		};
-
-		static inline uint32_t getPaddedAlphaBinCount(const core::vectorSIMDu32& workgroupDims, const uint32_t oldAlphaBinCount)
-		{
-			// For the normalization shader, it should be that:
-			//	alphaBinCount = k*workGroupSize, k is integer, k >= 1, 
-			assert(workgroupDims.x != 0 && workgroupDims.y != 0 && workgroupDims.z != 0);
-			const auto wgSize = workgroupDims.x * workgroupDims.y * workgroupDims.z;
-			const auto paddedAlphaBinCount = core::roundUp(oldAlphaBinCount, wgSize);
-			return paddedAlphaBinCount;
-		}
 };
 
 }
diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt
index 53ab53497..4dbd039b5 100644
--- a/src/nbl/builtin/CMakeLists.txt
+++ b/src/nbl/builtin/CMakeLists.txt
@@ -207,24 +207,6 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/ext/DepthPyramidGenerator/com
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/ext/DepthPyramidGenerator/push_constants_struct_common.h")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/ext/DepthPyramidGenerator/depth_pyramid_generator_impl.glsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/ext/DepthPyramidGenerator/virtual_work_group.glsl")
-# blit
-LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/formats_encode.glsl")
-LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/parameters.glsl")
-LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/multi_dimensional_array_addressing.glsl")
-LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/default_compute_common.comp")
-
-LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/default_compute_blit.comp")
-LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/blit/blit.glsl")
-LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/blit/descriptors.glsl")
-
-LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/default_compute_alpha_test.comp")
-LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/alpha_test/alpha_test.glsl")
-LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/alpha_test/descriptors.glsl")
-
-LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/default_compute_normalization.comp")
-LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/normalization/normalization.glsl")
-LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/normalization/descriptors.glsl")
-LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/blit/normalization/shared_normalization.glsl")
 
 # HLSL
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/macros.h")
diff --git a/src/nbl/video/utilities/CComputeBlit.cpp b/src/nbl/video/utilities/CComputeBlit.cpp
index 1dd123952..2ad565600 100644
--- a/src/nbl/video/utilities/CComputeBlit.cpp
+++ b/src/nbl/video/utilities/CComputeBlit.cpp
@@ -23,36 +23,25 @@ CComputeBlit::CComputeBlit(smart_refctd_ptr<ILogicalDevice>&& logicalDevice, sma
 auto CComputeBlit::createAndCachePipelines(const SPipelinesCreateInfo& info) -> SPipelines
 {
 	SPipelines retval;
-	core::vector<smart_refctd_ptr<ICPUComputePipeline>> cpuPplns;
-	cpuPplns.reserve(tasks.size());
+
+	std::array<smart_refctd_ptr<ICPUComputePipeline>,2> cpuPplns;
 
 	const auto& limits = m_device->getPhysicalDevice()->getLimits();
-	for (auto task : tasks)
+	retval.workgroupSize = 0x1u<<info.workgroupSizeLog2;
+	if (retval.workgroupSize <limits.maxSubgroupSize)
+		retval.workgroupSize = core::roundDownToPoT(limits.maxComputeWorkGroupInvocations);
+	// the absolute minimum needed to store a single pixel of a worst case format (precise, all 4 channels)
+	constexpr auto singlePixelStorage = 4*sizeof(hlsl::float32_t);
+	// also slightly more memory is needed to even have a skirt of any size
+	const auto sharedMemoryPerInvocation = core::max(singlePixelStorage*2,info.sharedMemoryPerInvocation);
+
+	const auto* layout = info.layout;
+
+	// HLSL prelude prepended to both shader sources: declares the `ConstevalParameters` struct
+	const auto common = [&]()->std::string
 	{
-		// adjust task default values
-		{
-			if (task.workgroupSizeLog2<limits.maxSubgroupSize)
-				task.workgroupSizeLog2 = core::roundDownToPoT(limits.maxComputeWorkGroupInvocations);
-			bool useFloat16 = false;
-			uint16_t channels = 4;
-			using namespace hlsl::format;
-			if (task.outputFormat!=TexelBlockFormat::TBF_UNKNOWN)
-			{
-				channels = getTraits(task.outputFormat).Channels;
-				const auto precisionAt1 = getFormatPrecision(static_cast<E_FORMAT>(task.outputFormat),3,1.f);
-				const auto precisionAt0 = getFormatPrecision(static_cast<E_FORMAT>(task.outputFormat),3,0.f);
-				if (limits.workgroupMemoryExplicitLayout16BitAccess && limits.shaderFloat16 && precisionAt1>=std::exp2f(-11.f) && precisionAt0>=std::numeric_limits<hlsl::float16_t>::min())
-					useFloat16 = true;
-			}
-			// the absolute minimum needed to store a single pixel
-			const auto singlePixelStorage = channels*(useFloat16 ? sizeof(hlsl::float16_t):sizeof(hlsl::float32_t));
-			// also slightly more memory is needed
-			task.sharedMemoryPerInvocation = core::max(singlePixelStorage*2,task.sharedMemoryPerInvocation);
-		}
-		const auto common = [&]()->std::string
-		{
-			std::ostringstream tmp;
-			tmp << R"===(
+		std::ostringstream tmp;
+		tmp << R"===(
 #include "nbl/builtin/hlsl/binding_info.hlsl"
 
 
@@ -61,54 +50,58 @@ using namespace nbl::hlsl;
 
 struct ConstevalParameters
 {
-	NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkGroupSize = )===" << (0x1u<<task.workgroupSizeLog2) << R"===(;
-    using kernel_weight_binding_t = )===" << layout->getBindingInfoForHLSL() << R"===(;
-    using input_sampler_binding_t = )===" << serializeBindingInfo() << R"===(;
-    using input_image_binding_t = )===" << serializeBindingInfo() << R"===(;
-    using output_binding_t = )===" << serializeBindingInfo() << R"===(;
-    NBL_CONSTEXPR_STATIC_INLINE uint32_t uint32_t SharedMemoryDWORDs = )===" << task.sharedMemoryPerInvocation/sizeof(uint32_t) << R"===(;
+NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkGroupSize = )===" << retval.workgroupSize << R"===(;
+using kernel_weight_binding_t = )===" << layout->getBindingInfoForHLSL({.binding=info.kernelWeights,.requiredStages=IShader::E_SHADER_STAGE::ESS_COMPUTE}) << R"===(;
+using input_sampler_binding_t = )===" << layout->getBindingInfoForHLSL({.binding=info.samplers,.requiredStages=IShader::E_SHADER_STAGE::ESS_COMPUTE}) << R"===(;
+using input_image_binding_t = )===" << layout->getBindingInfoForHLSL({.binding=info.inputs,.requiredStages=IShader::E_SHADER_STAGE::ESS_COMPUTE}) << R"===(;
+using output_binding_t = )===" << layout->getBindingInfoForHLSL({.binding=info.outputs,.requiredStages=IShader::E_SHADER_STAGE::ESS_COMPUTE}) << R"===(;
+NBL_CONSTEXPR_STATIC_INLINE uint32_t uint32_t SharedMemoryDWORDs = )===" << (sharedMemoryPerInvocation* retval.workgroupSize)/sizeof(uint32_t) << R"===(;
 };
 )===";
-			return tmp.str();
-		}();
-		auto createPipeline = [&limits,&common](const char* mainPath)->smart_refctd_ptr<ICPUComputePipeline>
+		return tmp.str();
+	}();
+	auto createPipeline = [&limits,layout,&common](const char* mainPath)->smart_refctd_ptr<ICPUComputePipeline>
+	{
+		auto shader = make_smart_refctd_ptr<ICPUShader>(
+			(common+"\n#include \""+mainPath+"\"\n").c_str(),
+			IShader::E_SHADER_STAGE::ESS_COMPUTE,
+			IShader::E_CONTENT_TYPE::ECT_HLSL,
+			mainPath
+		);
+		// make sure there's a hash so asset converter doesn't fail
 		{
-			auto shader = make_smart_refctd_ptr<ICPUShader>(
-				(common+"\n#include \""+mainPath+"\"\n").c_str(),
-				IShader::E_SHADER_STAGE::ESS_COMPUTE,
-				IShader::E_CONTENT_TYPE::ECT_HLSL,
-				mainPath
-			);
-
-			ICPUComputePipeline::SCreationParams params = {};
-			params.layout = nullptr; // TODO
-			params.shader.entryPoint = "main";
-			params.shader.shader = shader.get();
-			params.shader.requiredSubgroupSize = static_cast<IShader::SSpecInfoBase::SUBGROUP_SIZE>(hlsl::findMSB(limits.maxSubgroupSize));
-			// needed for the prefix and reductions to work
-			params.shader.requireFullSubgroups = true;
-			return ICPUComputePipeline::create(params);
-		};
-		// create blit pipeline
-		cpuPplns.emplace_back(createPipeline("default_blit.comp.hlsl"));
-		cpuPplns.emplace_back(createPipeline("default_normalize.comp.hlsl"));
-	}
+			auto source = const_cast<ICPUBuffer*>(shader->getContent());
+			source->setContentHash(source->computeContentHash());
+		}
+
+		ICPUComputePipeline::SCreationParams params = {};
+		params.layout = layout;
+		params.shader.entryPoint = "main";
+		params.shader.shader = shader.get();
+		params.shader.requiredSubgroupSize = static_cast<IShader::SSpecInfoBase::SUBGROUP_SIZE>(hlsl::findMSB(limits.maxSubgroupSize));
+		// needed for the prefix and reductions to work
+		params.shader.requireFullSubgroups = true;
+		return ICPUComputePipeline::create(params);
+	};
+	// create blit pipeline
+	cpuPplns[0] = createPipeline("nbl/builtin/hlsl/blit/default_blit.comp.hlsl");
+	cpuPplns[1] = createPipeline("nbl/builtin/hlsl/blit/default_normalize.comp.hlsl");
 
 	CAssetConverter::SInputs inputs = {};
-	inputs.readCache = converter;
+	inputs.readCache = info.converter;
 	inputs.logger = m_logger.getRaw();
 	std::get<CAssetConverter::SInputs::asset_span_t<ICPUComputePipeline>>(inputs.assets) = {&cpuPplns.data()->get(),cpuPplns.size()};
 	inputs.readShaderCache = m_shaderCache.get();
 	inputs.writeShaderCache = m_shaderCache.get();
 	// no pipeline cache, because we only make the same pipeline once, ever
-	auto reserveResults = converter->reserve(inputs);
+	auto reserveResults = info.converter->reserve(inputs);
 	assert(reserveResults.getRequiredQueueFlags().value==IQueue::FAMILY_FLAGS::NONE);
+
 	// copy over the results
 	{
 		auto rIt = reserveResults.getGPUObjects<ICPUComputePipeline>().data();
-		// TODO: redo
-		for (size_t i=0; i<tasks.size(); i++)
-			*(pipelines++) =  (rIt++)->value;
+		retval.blit = (rIt++)->value;
+		retval.coverage = (rIt++)->value;
 	}
 
 	// this just inserts the pipelines into the cache
@@ -198,6 +191,53 @@ core::smart_refctd_ptr<video::IGPUShader> createBlitSpecializedShader(
 	return gpuShader;
 }
 
+// Returns the blit pipeline for the given parameters, building and caching it on a cache miss.
+template <typename BlitUtilities>
+core::smart_refctd_ptr<video::IGPUComputePipeline> getBlitPipeline(
+	const asset::E_FORMAT									outFormat,
+	const asset::IImage::E_TYPE								imageType,
+	const core::vectorSIMDu32& inExtent,
+	const core::vectorSIMDu32& outExtent,
+	const asset::IBlitUtilities::E_ALPHA_SEMANTIC			alphaSemantic,
+	const typename BlitUtilities::convolution_kernels_t& kernels,
+	const uint32_t											workgroupSize = 256,
+	const uint32_t											alphaBinCount = asset::IBlitUtilities::DefaultAlphaBinCount)
+{
+	const bool adjustCoverage = alphaSemantic == asset::IBlitUtilities::EAS_REFERENCE_OR_COVERAGE;
+	// the bin count gets padded against a 1D workgroup of `workgroupSize` invocations
+	const core::vectorSIMDu32 workgroupDims(workgroupSize, 1, 1, 1);
+	const auto binCount = getPaddedAlphaBinCount(workgroupDims, alphaBinCount);
+
+	const SBlitCacheKey cacheKey =
+	{
+		.wgSize = workgroupSize,
+		.imageType = imageType,
+		.alphaBinCount = binCount,
+		.outFormat = outFormat,
+		.smemSize = m_availableSharedMemory,
+		.coverageAdjustment = adjustCoverage
+	};
+
+	// cache hit: nothing to build
+	if (const auto cached = m_blitPipelines.find(cacheKey); cached != m_blitPipelines.end())
+		return cached->second;
+
+	auto shader = createBlitSpecializedShader<BlitUtilities>(
+		outFormat, imageType, inExtent, outExtent,
+		alphaSemantic, kernels, workgroupSize, binCount);
+
+	IGPUComputePipeline::SCreationParams params;
+	params.shader.shader = shader.get();
+	params.shader.entryPoint = "main";
+	// the pipeline layout is picked by blit type (coverage adjustment vs regular)
+	params.layout = m_blitPipelineLayout[adjustCoverage ? EBT_COVERAGE_ADJUSTMENT : EBT_REGULAR].get();
+
+	// NOTE(review): the creation result is not checked; on failure the slot presumably stays null - confirm
+	auto& slot = m_blitPipelines[cacheKey];
+	m_device->createComputePipelines(nullptr, { &params, &params + 1 }, &slot);
+	return slot;
+}
+
 core::smart_refctd_ptr<video::IGPUShader> CComputeBlit::createAlphaTestSpecializedShader(const asset::IImage::E_TYPE imageType, const uint32_t alphaBinCount)
 {
 	const auto workgroupDims = getDefaultWorkgroupDims(imageType);
@@ -233,6 +273,28 @@ core::smart_refctd_ptr<video::IGPUShader> CComputeBlit::createAlphaTestSpecializ
 	auto cpuShader = core::make_smart_refctd_ptr<asset::ICPUShader>(shaderSourceStream.str().c_str(), IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, IGPUShader::E_CONTENT_TYPE::ECT_HLSL, "CComputeBlitGLSLGLSL::createAlphaTestSpecializedShader");
 }
 
+// Returns the cached alpha-test pipeline for (padded bin count, image type), creating it on first use.
+core::smart_refctd_ptr<video::IGPUComputePipeline> getAlphaTestPipeline(const uint32_t alphaBinCount, const asset::IImage::E_TYPE imageType)
+{
+	const auto paddedAlphaBinCount = getPaddedAlphaBinCount(getDefaultWorkgroupDims(imageType), alphaBinCount);
+	assert(paddedAlphaBinCount >= asset::IBlitUtilities::MinAlphaBinCount);
+	const auto pipelineIndex = (paddedAlphaBinCount / asset::IBlitUtilities::MinAlphaBinCount) - 1;
+	auto& cached = m_alphaTestPipelines[pipelineIndex][imageType];
+	if (cached)
+		return cached;
+
+	auto specShader = createAlphaTestSpecializedShader(imageType, paddedAlphaBinCount);
+	IGPUComputePipeline::SCreationParams creationParams;
+	creationParams.shader.shader = specShader.get();
+	creationParams.shader.entryPoint = "main";
+	creationParams.layout = m_blitPipelineLayout[EBT_COVERAGE_ADJUSTMENT].get();
+	// the call must not live inside `assert`: its argument is not evaluated when NDEBUG is defined
+	[[maybe_unused]] const bool created = m_device->createComputePipelines(nullptr, { &creationParams, &creationParams + 1 }, &cached);
+	assert(created);
+	return cached;
+}
+
+// @param `outFormat` dictates encoding.
 core::smart_refctd_ptr<video::IGPUShader> CComputeBlit::createNormalizationSpecializedShader(const asset::IImage::E_TYPE imageType, const uint32_t alphaBinCount)
 {
 	const auto workgroupDims = getDefaultWorkgroupDims(imageType);
@@ -276,4 +338,24 @@ core::smart_refctd_ptr<video::IGPUShader> CComputeBlit::createNormalizationSpeci
 
 	auto cpuShader = core::make_smart_refctd_ptr<asset::ICPUShader>(shaderSourceStream.str().c_str(), IGPUShader::E_SHADER_STAGE::ESS_COMPUTE, IGPUShader::E_CONTENT_TYPE::ECT_HLSL, "CComputeBlitGLSL::createNormalizationSpecializedShader");
 }
+
+// Returns the cached normalization pipeline for (image type, padded bin count, output format), creating it on first use.
+core::smart_refctd_ptr<video::IGPUComputePipeline> getNormalizationPipeline(const asset::IImage::E_TYPE imageType, const asset::E_FORMAT outFormat,
+	const uint32_t alphaBinCount = asset::IBlitUtilities::DefaultAlphaBinCount)
+{
+	const uint32_t paddedAlphaBinCount = getPaddedAlphaBinCount(getDefaultWorkgroupDims(imageType), alphaBinCount);
+	const SNormalizationCacheKey key = { imageType, paddedAlphaBinCount, outFormat };
+	if (m_normalizationPipelines.find(key) == m_normalizationPipelines.end())
+	{
+		// NOTE(review): three arguments are passed, the definition visible in this file takes two - confirm the overload
+		auto specShader = createNormalizationSpecializedShader(imageType, outFormat, paddedAlphaBinCount);
+		IGPUComputePipeline::SCreationParams creationParams;
+		creationParams.shader.shader = specShader.get();
+		creationParams.shader.entryPoint = "main";
+		creationParams.layout = m_blitPipelineLayout[EBT_COVERAGE_ADJUSTMENT].get();
+		[[maybe_unused]] const bool created = m_device->createComputePipelines(nullptr, { &creationParams, &creationParams + 1 }, &m_normalizationPipelines[key]);
+		assert(created); // call kept outside `assert`, whose argument is not evaluated when NDEBUG is defined
+	}
+	return m_normalizationPipelines[key];
+}
 #endif
\ No newline at end of file

From e1a87e757995a1d531d5517c9a23e6d07b9dbafe Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Tue, 5 Nov 2024 17:16:24 +0100
Subject: [PATCH 12/14] stupid typos are the bane of my existence

---
 examples_tests                                            | 2 +-
 include/nbl/asset/IPipelineLayout.h                       | 2 +-
 include/nbl/builtin/hlsl/blit/common.hlsl                 | 2 +-
 include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl      | 7 +++----
 include/nbl/builtin/hlsl/blit/default_normalize.comp.hlsl | 6 ++----
 src/nbl/video/utilities/CComputeBlit.cpp                  | 2 +-
 6 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/examples_tests b/examples_tests
index e95c56290..e77ed5d46 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit e95c56290e7f31f3f2a2b6e07ccafd7feb2e686e
+Subproject commit e77ed5d468f929ac5e7f1909f728895c923eb2c4
diff --git a/include/nbl/asset/IPipelineLayout.h b/include/nbl/asset/IPipelineLayout.h
index 7cc980290..0eaba46f7 100644
--- a/include/nbl/asset/IPipelineLayout.h
+++ b/include/nbl/asset/IPipelineLayout.h
@@ -183,7 +183,7 @@ class IPipelineLayout
                 if (!found)
                     return "#error \"Could not find `IPipelineLayout::SBindingKey::binding::binding` in `IPipelineLayout::SBindingKey::binding::set`'s layout!\"";
             }
-            if (redirect->getStageFlags(found).hasFlags(key.requiredStages))
+            if (!redirect->getStageFlags(found).hasFlags(key.requiredStages))
                 return "#error \"Binding found in the layout doesn't have all the `IPipelineLayout::SBindingKey::binding::requiredStages` flags!\"";
             const auto count = redirect->getCount(found);
             assert(count); // this layout should have never passed validation
diff --git a/include/nbl/builtin/hlsl/blit/common.hlsl b/include/nbl/builtin/hlsl/blit/common.hlsl
index 6295e6870..93ed57931 100644
--- a/include/nbl/builtin/hlsl/blit/common.hlsl
+++ b/include/nbl/builtin/hlsl/blit/common.hlsl
@@ -12,7 +12,7 @@ namespace hlsl
 {
 namespace glsl
 {
-uint32_t gl_WorkGroupSize()
+uint32_t3 gl_WorkGroupSize()
 {
 	return uint32_t3(ConstevalParameters::WorkGroupSize,1,1);
 }
diff --git a/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl b/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl
index ad2749904..c9184d016 100644
--- a/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl
+++ b/include/nbl/builtin/hlsl/blit/default_blit.comp.hlsl
@@ -1,12 +1,11 @@
 // Copyright (C) 2023-2024 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
-//#include "nbl/builtin/hlsl/blit/common.hlsl"
-//#include "nbl/builtin/hlsl/blit/parameters.hlsl"
-//#include "nbl/builtin/hlsl/blit/compute_blit.hlsl"
+#include "nbl/builtin/hlsl/blit/parameters.hlsl"
 
+#include "nbl/builtin/hlsl/blit/common.hlsl"
+//#include "nbl/builtin/hlsl/blit/compute_blit.hlsl"
 
-groupshared uint32_t sMem[ConstevalParameters::SharedMemoryDWORDs];
 /*
 struct HistogramAccessor
 {
diff --git a/include/nbl/builtin/hlsl/blit/default_normalize.comp.hlsl b/include/nbl/builtin/hlsl/blit/default_normalize.comp.hlsl
index 589f370c0..8e2f4beb2 100644
--- a/include/nbl/builtin/hlsl/blit/default_normalize.comp.hlsl
+++ b/include/nbl/builtin/hlsl/blit/default_normalize.comp.hlsl
@@ -1,11 +1,9 @@
 // Copyright (C) 2023-2024 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
-#include "nbl/builtin/hlsl/blit/common.hlsl"
-
+#include "nbl/builtin/hlsl/blit/parameters.hlsl"
 
-
-//#include "nbl/builtin/hlsl/blit/parameters.hlsl"
+#include "nbl/builtin/hlsl/blit/common.hlsl"
 //#include "nbl/builtin/hlsl/blit/compute_blit.hlsl"
 
 using namespace nbl::hlsl::blit;
diff --git a/src/nbl/video/utilities/CComputeBlit.cpp b/src/nbl/video/utilities/CComputeBlit.cpp
index 2ad565600..c3ceb6667 100644
--- a/src/nbl/video/utilities/CComputeBlit.cpp
+++ b/src/nbl/video/utilities/CComputeBlit.cpp
@@ -55,7 +55,7 @@ using kernel_weight_binding_t = )===" << layout->getBindingInfoForHLSL({.binding
 using input_sampler_binding_t = )===" << layout->getBindingInfoForHLSL({.binding=info.samplers,.requiredStages=IShader::E_SHADER_STAGE::ESS_COMPUTE}) << R"===(;
 using input_image_binding_t = )===" << layout->getBindingInfoForHLSL({.binding=info.inputs,.requiredStages=IShader::E_SHADER_STAGE::ESS_COMPUTE}) << R"===(;
 using output_binding_t = )===" << layout->getBindingInfoForHLSL({.binding=info.outputs,.requiredStages=IShader::E_SHADER_STAGE::ESS_COMPUTE}) << R"===(;
-NBL_CONSTEXPR_STATIC_INLINE uint32_t uint32_t SharedMemoryDWORDs = )===" << (sharedMemoryPerInvocation* retval.workgroupSize)/sizeof(uint32_t) << R"===(;
+NBL_CONSTEXPR_STATIC_INLINE uint32_t SharedMemoryDWORDs = )===" << (sharedMemoryPerInvocation* retval.workgroupSize)/sizeof(uint32_t) << R"===(;
 };
 )===";
 		return tmp.str();

From c700f67ccc1dc13f7bb4127ace5c6449987ce03e Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Tue, 5 Nov 2024 19:13:48 +0100
Subject: [PATCH 13/14] Updated DXC

---
 3rdparty/dxc/dxc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc
index 7acfe6f4f..5adc27f9e 160000
--- a/3rdparty/dxc/dxc
+++ b/3rdparty/dxc/dxc
@@ -1 +1 @@
-Subproject commit 7acfe6f4fc724265db8026256fad18afeb282b97
+Subproject commit 5adc27f9e42de7681d65a98873048af661b9b367

From 0c0b9ab86beec5595e09ae594a58e529a3f4cbb7 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Tue, 5 Nov 2024 19:22:27 +0100
Subject: [PATCH 14/14] change last place's usage of `impl::declval` to
 `experimental::declval`

---
 include/nbl/builtin/hlsl/sort/counting.hlsl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/nbl/builtin/hlsl/sort/counting.hlsl b/include/nbl/builtin/hlsl/sort/counting.hlsl
index 12da2e9d1..1cd916ccc 100644
--- a/include/nbl/builtin/hlsl/sort/counting.hlsl
+++ b/include/nbl/builtin/hlsl/sort/counting.hlsl
@@ -22,7 +22,7 @@ template<
     typename ValueAccessor,
     typename HistogramAccessor,
     typename SharedAccessor,
-    typename key_t = decltype(impl::declval<KeyAccessor>().get(0)),
+    typename key_t = decltype(experimental::declval<KeyAccessor>().get(0)),
     bool robust=false
 >
 struct counting