diff --git a/64_FFT/CMakeLists.txt b/64_FFT/CMakeLists.txt
new file mode 100644
index 00000000..a434ff32
--- /dev/null
+++ b/64_FFT/CMakeLists.txt
@@ -0,0 +1,24 @@
+include(common RESULT_VARIABLE RES)
+if(NOT RES)
+	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
+endif()
+
+nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
+
+if(NBL_EMBED_BUILTIN_RESOURCES)
+	set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
+	set(RESOURCE_DIR "app_resources")
+
+	get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
+
+	file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
+	foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
+		LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
+	endforeach()
+
+	ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
+
+	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
+endif()
\ No newline at end of file
diff --git a/64_FFT/app_resources/common.hlsl b/64_FFT/app_resources/common.hlsl
new file mode 100644
index 00000000..c6fadb8e
--- /dev/null
+++ b/64_FFT/app_resources/common.hlsl
@@ -0,0 +1,14 @@
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+
+using scalar_t = nbl::hlsl::float32_t;
+
+struct PushConstantData
+{
+	uint64_t inputAddress;
+	uint64_t outputAddress;
+	uint32_t dataElementCount;
+};
+
+NBL_CONSTEXPR uint32_t WorkgroupSize = 64;
+NBL_CONSTEXPR uint32_t ElementsPerThread = 8;
+NBL_CONSTEXPR uint32_t complexElementCount = WorkgroupSize * ElementsPerThread;
\ No newline at end of file
diff --git a/64_FFT/app_resources/shader.comp.hlsl b/64_FFT/app_resources/shader.comp.hlsl
new file mode 100644
index 00000000..968fcb30
--- /dev/null
+++ b/64_FFT/app_resources/shader.comp.hlsl
@@ -0,0 +1,62 @@
+#include "common.hlsl"
+#include "nbl/builtin/hlsl/workgroup/fft.hlsl"
+#include "nbl/builtin/hlsl/bda/legacy_bda_accessor.hlsl"
+
+[[vk::push_constant]] PushConstantData pushConstants;
+
+using namespace nbl::hlsl;
+
+// careful: change size according to Scalar type
+groupshared uint32_t sharedmem[ workgroup::fft::SharedMemoryDWORDs<scalar_t, WorkgroupSize> ];
+
+// Users MUST define this method for FFT to work
+uint32_t3 glsl::gl_WorkGroupSize() { return uint32_t3(WorkgroupSize, 1, 1); }
+
+struct SharedMemoryAccessor
+{
+	void set(uint32_t idx, uint32_t value)
+	{
+		sharedmem[idx] = value;
+	}
+
+	void get(uint32_t idx, NBL_REF_ARG(uint32_t) value)
+	{
+		value = sharedmem[idx];
+	}
+
+	void workgroupExecutionAndMemoryBarrier()
+	{
+		glsl::barrier();
+	}
+
+};
+
+struct Accessor : DoubleLegacyBdaAccessor< complex_t<scalar_t> >
+{
+	static Accessor create(const uint64_t inputAddress, const uint64_t outputAddress)
+	{
+		Accessor accessor;
+		accessor.inputAddress = inputAddress;
+		accessor.outputAddress = outputAddress;
+		return accessor;
+	}
+
+	void memoryBarrier()
+	{
+		// only one workgroup is touching any memory it wishes to trade
+		spirv::memoryBarrier(spv::ScopeWorkgroup, spv::MemorySemanticsAcquireReleaseMask | spv::MemorySemanticsUniformMemoryMask);
+	}
+};
+
+[numthreads(WorkgroupSize,1,1)]
+void main(uint32_t3 ID : SV_DispatchThreadID)
+{
+	Accessor accessor = Accessor::create(pushConstants.inputAddress, pushConstants.outputAddress);
+	SharedMemoryAccessor sharedmemAccessor;
+
+	// FFT
+
+	workgroup::FFT<ElementsPerThread, false, scalar_t>::template __call(accessor, sharedmemAccessor);
+	accessor.memoryBarrier(); // only memory visibility is needed between the forward and inverse passes, the FFT does its own execution barriers
+	workgroup::FFT<ElementsPerThread, true, scalar_t>::template __call(accessor, sharedmemAccessor);
+}
\ No newline at end of file
diff --git a/64_FFT/config.json.template b/64_FFT/config.json.template
new file mode 100644
index 00000000..717d05d5
--- /dev/null
+++ b/64_FFT/config.json.template
@@ -0,0 +1,28 @@
+{
+	"enableParallelBuild": true,
+	"threadsPerBuildProcess" : 2,
+	"isExecuted": false,
+	"scriptPath": "",
+	"cmake": {
+		"configurations": [ "Release", "Debug", "RelWithDebInfo" ],
+		"buildModes": [],
+		"requiredOptions": []
+	},
+	"profiles": [
+		{
+			"backend": "vulkan", // should be none
+			"platform": "windows",
+			"buildModes": [],
+			"runConfiguration": "Release", // we also need to run in Debug and RWDI because it's a foundational example
+			"gpuArchitectures": []
+		}
+	],
+	"dependencies": [],
+	"data": [
+		{
+			"dependencies": [],
+			"command": [""],
+			"outputs": []
+		}
+	]
+}
\ No newline at end of file
diff --git a/64_FFT/main.cpp b/64_FFT/main.cpp
new file mode 100644
index 00000000..3b9b53c9
--- /dev/null
+++ b/64_FFT/main.cpp
@@ -0,0 +1,329 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+
+// I've moved out a tiny part of this example into a shared header for reuse, please open and read it.
+#include "nbl/application_templates/MonoDeviceApplication.hpp"
+#include "nbl/application_templates/MonoAssetManagerAndBuiltinResourceApplication.hpp"
+
+
+using namespace nbl;
+using namespace core;
+using namespace system;
+using namespace asset;
+using namespace video;
+
+
+#include "app_resources/common.hlsl"
+#include "nbl/builtin/hlsl/bit.hlsl"
+#include "nbl/builtin/hlsl/random/xoroshiro.hlsl"
+
+
+// Simple showcase of how to run an FFT on a 1D array
+class FFT_Test final : public application_templates::MonoDeviceApplication, public application_templates::MonoAssetManagerAndBuiltinResourceApplication
+{
+	using device_base_t = application_templates::MonoDeviceApplication;
+	using asset_base_t = application_templates::MonoAssetManagerAndBuiltinResourceApplication;
+
+	smart_refctd_ptr<IGPUComputePipeline> m_pipeline;
+
+	smart_refctd_ptr<nbl::video::IUtilities> m_utils;
+
+	nbl::video::StreamingTransientDataBufferMT<>* m_upStreamingBuffer;
+	StreamingTransientDataBufferMT<>* m_downStreamingBuffer;
+	smart_refctd_ptr<IGPUBuffer> m_deviceLocalBuffer;
+
+	// These are Buffer Device Addresses
+	uint64_t m_upStreamingBufferAddress;
+	uint64_t m_downStreamingBufferAddress;
+	uint64_t m_deviceLocalBufferAddress;
+
+	// You can ask the `nbl::core::GeneralpurposeAddressAllocator` used internally by the Streaming Buffers to give out offsets aligned to a certain multiple (not only a Power of Two!)
+	uint32_t m_alignment;
+
+	// This example really lets the advantages of a timeline semaphore shine through!
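+	// A rough sketch of the timeline idiom used below, with hypothetical counter values (the real code keeps the counter in `semaphoreValue`):
+	//   m_timeline = m_device->createSemaphore(0); // counter starts at 0
+	//   queue->submit(...);                        // the submit signals the counter to 1 once all its commands finish
+	//   wait/latch on { m_timeline, 1 };           // host-side waits and latched frees key off that same value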
+	smart_refctd_ptr<ISemaphore> m_timeline;
+	uint64_t semaphoreValue = 0;
+
+public:
+	// Yay thanks to multiple inheritance we cannot forward ctors anymore
+	FFT_Test(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
+		system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
+
+	// we stuff all our work here because it's a "single shot" app
+	bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+	{
+		// Remember to call the base class initialization!
+		if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
+			return false;
+		if (!asset_base_t::onAppInitialized(std::move(system)))
+			return false;
+
+		// this time we load a shader directly from a file
+		smart_refctd_ptr<IGPUShader> shader;
+		{
+			IAssetLoader::SAssetLoadParams lp = {};
+			lp.logger = m_logger.get();
+			lp.workingDirectory = ""; // virtual root
+			auto assetBundle = m_assetMgr->getAsset("app_resources/shader.comp.hlsl", lp);
+			const auto assets = assetBundle.getContents();
+			if (assets.empty())
+				return logFail("Could not load shader!");
+
+			// Cast down the asset to its proper type
+			auto source = IAsset::castDown<ICPUShader>(assets[0]);
+			// The down-cast should not fail!
+			assert(source);
+
+			// Compile directly to IGPUShader
+			shader = m_device->createShader(source.get());
+			if (!shader)
+				return logFail("Creation of a GPU Shader from CPU Shader source failed!");
+		}
+
+		// Create massive upload/download buffers
+		constexpr uint32_t DownstreamBufferSize = sizeof(scalar_t) << 23;
+		constexpr uint32_t UpstreamBufferSize = sizeof(scalar_t) << 23;
+
+		m_utils = make_smart_refctd_ptr<nbl::video::IUtilities>(smart_refctd_ptr(m_device), smart_refctd_ptr(m_logger), DownstreamBufferSize, UpstreamBufferSize);
+		if (!m_utils)
+			return logFail("Failed to create Utilities!");
+		m_upStreamingBuffer = m_utils->getDefaultUpStreamingBuffer();
+		m_downStreamingBuffer = m_utils->getDefaultDownStreamingBuffer();
+		m_upStreamingBufferAddress = m_upStreamingBuffer->getBuffer()->getDeviceAddress();
+		m_downStreamingBufferAddress = m_downStreamingBuffer->getBuffer()->getDeviceAddress();
+
+		// Create device-local buffer
+		{
+			const uint32_t scalarElementCount = 2 * complexElementCount;
+			IGPUBuffer::SCreationParams deviceLocalBufferParams = {};
+
+			IQueue* const queue = getComputeQueue();
+			uint32_t queueFamilyIndex = queue->getFamilyIndex();
+
+			deviceLocalBufferParams.queueFamilyIndexCount = 1;
+			deviceLocalBufferParams.queueFamilyIndices = &queueFamilyIndex;
+			deviceLocalBufferParams.size = sizeof(scalar_t) * scalarElementCount;
+			deviceLocalBufferParams.usage = nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_SRC_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_TRANSFER_DST_BIT | nbl::asset::IBuffer::E_USAGE_FLAGS::EUF_SHADER_DEVICE_ADDRESS_BIT;
+
+			m_deviceLocalBuffer = m_device->createBuffer(std::move(deviceLocalBufferParams));
+			auto mreqs = m_deviceLocalBuffer->getMemoryReqs();
+			mreqs.memoryTypeBits &= m_device->getPhysicalDevice()->getDeviceLocalMemoryTypeBits();
+			auto gpubufMem = m_device->allocate(mreqs, m_deviceLocalBuffer.get(), IDeviceMemoryAllocation::E_MEMORY_ALLOCATE_FLAGS::EMAF_DEVICE_ADDRESS_BIT);
+
+			m_deviceLocalBufferAddress = m_deviceLocalBuffer->getDeviceAddress();
+		}
+
+		const nbl::asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, .offset = 0, .size = sizeof(PushConstantData) };
+
+		{
+			auto layout = m_device->createPipelineLayout({ &pcRange, 1 });
+			IGPUComputePipeline::SCreationParams params = {};
+			params.layout = layout.get();
+			params.shader.shader = shader.get();
+			params.shader.requiredSubgroupSize = static_cast<IGPUShader::SSpecInfo::SUBGROUP_SIZE>(hlsl::findMSB(m_physicalDevice->getLimits().maxSubgroupSize));
+			params.shader.requireFullSubgroups = true;
+			if (!m_device->createComputePipelines(nullptr, { &params, 1 }, &m_pipeline))
+				return logFail("Failed to create compute pipeline!\n");
+		}
+
+		const auto& deviceLimits = m_device->getPhysicalDevice()->getLimits();
+		// The ranges of non-coherent mapped memory you flush or invalidate need to be aligned. You'll often see a value of 64 reported by devices
+		// which just happens to coincide with a CPU cache line size. So we ask our streaming buffers during allocation to give us properly aligned offsets.
+		// Sidenote: For SSBOs, UBOs, BufferViews, Vertex Buffer Bindings, Acceleration Structure BDAs, Shader Binding Tables, Descriptor Buffers, etc.
+		// there is also a requirement to bind buffers at offsets which have a certain alignment. Memory binding to Buffers and Images also has those.
+		// We'll align to the max of the coherent atom size even if the memory is coherent,
+		// and we also need to take into account that BDA shader loads need to be aligned to the type being loaded.
+		m_alignment = core::max(deviceLimits.nonCoherentAtomSize, alignof(float));
+
+		// Semaphore used here to know the FFT is done before download
+		m_timeline = m_device->createSemaphore(semaphoreValue);
+
+		IQueue* const queue = getComputeQueue();
+
+		// Note that I'm using the same struct with methods that have identical code which compiles as both C++ and HLSL
+		auto rng = nbl::hlsl::Xoroshiro64StarStar::construct({ semaphoreValue ^ 0xdeadbeefu, std::hash<string>()(_NBL_APP_NAME_) });
+
+		const uint32_t scalarElementCount = 2 * complexElementCount;
+		const uint32_t inputSize = sizeof(scalar_t) * scalarElementCount;
+
+		// Just need a single suballocation in this example
+		const uint32_t AllocationCount = 1;
+
+		// It comes with a certain drawback in that you need to remember to initialize your "yet unallocated" offsets to the Invalid value;
+		// this is to allow a set of allocations to fail, and let you re-try after doing something to free up space, without repacking args.
+		auto inputOffset = m_upStreamingBuffer->invalid_value;
+
+		// We always just wait till an allocation becomes possible (during allocation previous "latched" frees get their latch conditions polled)
+		// Freeing of Streaming Buffer Allocations can and should be deferred until an associated polled event signals done (more on that later).
+		std::chrono::steady_clock::time_point waitTill(std::chrono::years(45));
+		// note that the API takes a time-point, not a duration, because there are multiple waits and preemptions possible, so the durations wouldn't add up properly
+		m_upStreamingBuffer->multi_allocate(waitTill, AllocationCount, &inputOffset, &inputSize, &m_alignment);
+
+		// Generate our data in-place on the allocated staging buffer. Packing is interleaved in this example!
+		{
+			auto* const inputPtr = reinterpret_cast<scalar_t*>(reinterpret_cast<uint8_t*>(m_upStreamingBuffer->getBufferPointer()) + inputOffset);
+			std::cout << "Begin array CPU\n";
+			for (auto j = 0u; j < complexElementCount; j++)
+			{
+				// Random array:
+				//scalar_t x = rng() / scalar_t(nbl::hlsl::numeric_limits<uint32_t>::max), y = rng() / scalar_t(nbl::hlsl::numeric_limits<uint32_t>::max);
+
+				// FFT( (1,0), (0,0), (0,0), ... ) = (1,0), (1,0), (1,0), ...
+				scalar_t x = j > 0 ? 0.f : 1.f;
+				scalar_t y = 0;
+
+				// FFT( (c,0), (c,0), (c,0), ... ) = (Nc,0), (0,0), (0,0), ...
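+				// Worked example with our sizes, assuming the two identities above and no normalization:
+				// N = complexElementCount = WorkgroupSize * ElementsPerThread = 64 * 8 = 512, so the constant
+				// input (1,0) repeated N times would print as (512, 0) followed by 511 zeros, while the delta
+				// input used above should print (1, 0) in every bin.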
+
+				/*
+				scalar_t x = 1.f;
+				scalar_t y = 0.f;
+				*/
+
+				inputPtr[2 * j] = x;
+				inputPtr[2 * j + 1] = y;
+				std::cout << "(" << x << ", " << y << "), ";
+			}
+			std::cout << "\nEnd array CPU\n";
+			// Always remember to flush!
+			if (m_upStreamingBuffer->needsManualFlushOrInvalidate())
+			{
+				const auto bound = m_upStreamingBuffer->getBuffer()->getBoundMemory();
+				const ILogicalDevice::MappedMemoryRange range(bound.memory, bound.offset + inputOffset, inputSize);
+				m_device->flushMappedMemoryRanges(1, &range);
+			}
+		}
+
+		// finally allocate our output range
+		const uint32_t outputSize = inputSize;
+
+		auto outputOffset = m_downStreamingBuffer->invalid_value;
+		m_downStreamingBuffer->multi_allocate(waitTill, AllocationCount, &outputOffset, &outputSize, &m_alignment);
+
+		smart_refctd_ptr<IGPUCommandBuffer> cmdbuf;
+		{
+			smart_refctd_ptr<nbl::video::IGPUCommandPool> cmdpool = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::TRANSIENT_BIT);
+			if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, { &cmdbuf, 1 }, core::smart_refctd_ptr(m_logger)))
+				return logFail("Failed to create Command Buffers!\n");
+			cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+			cmdbuf->bindComputePipeline(m_pipeline.get());
+			// This is the new fun part, pushing constants
+			const PushConstantData pc = {
+				.inputAddress = m_deviceLocalBufferAddress,
+				.outputAddress = m_deviceLocalBufferAddress,
+				.dataElementCount = scalarElementCount
+			};
+			// Upload: copy the staged input into the device-local buffer, remembering our staging suballocation's offset
+			IGPUCommandBuffer::SBufferCopy copyInfo = {};
+			copyInfo.srcOffset = inputOffset;
+			copyInfo.dstOffset = 0;
+			copyInfo.size = m_deviceLocalBuffer->getSize();
+			cmdbuf->copyBuffer(m_upStreamingBuffer->getBuffer(), m_deviceLocalBuffer.get(), 1, &copyInfo);
+
+			// Pipeline barrier: wait for the upload copy to be done before the FFT shader reads the buffer
+			IGPUCommandBuffer::SPipelineBarrierDependencyInfo uploadBarrierInfo = {};
+
+			decltype(uploadBarrierInfo)::buffer_barrier_t uploadBarrier = {};
+			uploadBarrierInfo.bufBarriers = { &uploadBarrier, 1u };
+
+			uploadBarrier.range.buffer = m_deviceLocalBuffer;
+
+			uploadBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT;
+			uploadBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS;
+			uploadBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT;
+			uploadBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS;
+
+			cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS(0), uploadBarrierInfo);
+
+			cmdbuf->pushConstants(m_pipeline->getLayout(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc);
+			// Remember we do a single workgroup per 1D array in these parts
+			cmdbuf->dispatch(1, 1, 1);
+
+			// Pipeline barrier: wait for FFT shader to be done before copying to downstream buffer
+			IGPUCommandBuffer::SPipelineBarrierDependencyInfo pipelineBarrierInfo = {};
+
+			decltype(pipelineBarrierInfo)::buffer_barrier_t barrier = {};
+			pipelineBarrierInfo.bufBarriers = { &barrier, 1u };
+
+			barrier.range.buffer = m_deviceLocalBuffer;
+
+			barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT;
+			barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS;
+			barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT;
+			barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::MEMORY_READ_BITS;
+
+			cmdbuf->pipelineBarrier(asset::E_DEPENDENCY_FLAGS(0), pipelineBarrierInfo);
+
+			// Download: copy the FFT output back out to the staging buffer, at our output suballocation's offset
+			copyInfo.srcOffset = 0;
+			copyInfo.dstOffset = outputOffset;
+			cmdbuf->copyBuffer(m_deviceLocalBuffer.get(), m_downStreamingBuffer->getBuffer(), 1, &copyInfo);
+			cmdbuf->end();
+		}
+
+		semaphoreValue++;
+		{
+			const IQueue::SSubmitInfo::SCommandBufferInfo cmdbufInfo =
+			{
+				.cmdbuf = cmdbuf.get()
+			};
+			const IQueue::SSubmitInfo::SSemaphoreInfo signalInfo =
+			{
+				.semaphore = m_timeline.get(),
+				.value = semaphoreValue,
+				// signal only after the copy to the downstream buffer, the last command we recorded, is done
+				.stageMask = asset::PIPELINE_STAGE_FLAGS::COPY_BIT
+			};
+
+			const IQueue::SSubmitInfo submitInfo = {
+				.waitSemaphores = {},
+				.commandBuffers = { &cmdbufInfo, 1 },
+				.signalSemaphores = { &signalInfo, 1 }
+			};
+
+			queue->startCapture();
+			queue->submit({ &submitInfo, 1 });
+			queue->endCapture();
+		}
+
+		// We let all latches know what semaphore and counter value has to be passed for the functors to execute
+		const ISemaphore::SWaitInfo futureWait = { m_timeline.get(), semaphoreValue };
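+
+		// The deferred-free pattern used twice below, sketched (same signatures as the actual calls that follow):
+		//   multi_deallocate(AllocationCount, &offset, &size, futureWait);                // plain deferred free
+		//   multi_deallocate(AllocationCount, &offset, &size, futureWait, &latchedEvent); // also runs a latched callback
+		// Neither the offsets nor the latched object get released until m_timeline reaches semaphoreValue.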
+		// As promised, we can defer an upstreaming buffer deallocation until a fence is signalled
+		// You can also attach an additional, optional IReferenceCounted-derived object to hold onto until deallocation.
+		m_upStreamingBuffer->multi_deallocate(AllocationCount, &inputOffset, &inputSize, futureWait);
+
+		// Now a new and even more advanced usage of the latched events: we make our own refcounted object with a custom destructor, and latch that like we did the commandbuffer.
+		// Instead of making our own and duplicating logic, we'll use one from IUtilities meant for down-staging memory.
+		// It's nice because it will also remember to invalidate our memory mapping if it's not coherent.
+		auto latchedConsumer = make_smart_refctd_ptr<IUtilities::CDownstreamingDataConsumer>(
+			IDeviceMemoryAllocation::MemoryRange(outputOffset, outputSize),
+			// Note the use of capture by-value [=] and not by-reference [&] because this lambda will be called asynchronously whenever the event signals
+			[=](const size_t dstOffset, const void* bufSrc, const size_t size)->void
+			{
+				// The unused variable is used for letting the consumer know the subsection of the output we've managed to download
+				// But here we're sure we can get the whole thing in one go because we allocated the whole range ourselves.
+				assert(dstOffset == 0 && size == outputSize);
+
+				std::cout << "Begin array GPU\n";
+				const scalar_t* const data = reinterpret_cast<const scalar_t*>(bufSrc);
+				for (auto i = 0u; i < complexElementCount; i++) {
+					std::cout << "(" << data[2 * i] << ", " << data[2 * i + 1] << "), ";
+				}
+				std::cout << "\nEnd array GPU\n";
+			},
+			// It's also necessary to hold onto the commandbuffer, even though we take care to not reset the parent pool, because if it
+			// hits its destructor, our automated reference counting will drop all references to objects used in the recorded commands.
+			// It could also be latched in the upstreaming deallocate, because it's the same fence.
+			std::move(cmdbuf), m_downStreamingBuffer
+		);
+		// We put a function we want to execute
+		m_downStreamingBuffer->multi_deallocate(AllocationCount, &outputOffset, &outputSize, futureWait, &latchedConsumer.get());
+
+		return true;
+	}
+
+	// One-shot App
+	bool keepRunning() override { return false; }
+
+	// One-shot App
+	void workLoopBody() override {}
+
+	// Cleanup
+	bool onAppTerminated() override
+	{
+		// Need to make sure that there are no events outstanding if we want all lambdas to eventually execute before `onAppTerminated`
+		// (the destructors of the Command Pool Cache and Streaming buffers will still wait for all lambda events to drain)
+		while (m_downStreamingBuffer->cull_frees()) {}
+		return device_base_t::onAppTerminated();
+	}
+};
+
+
+NBL_MAIN_FUNC(FFT_Test)
\ No newline at end of file
diff --git a/64_FFT/pipeline.groovy b/64_FFT/pipeline.groovy
new file mode 100644
index 00000000..1a7b043a
--- /dev/null
+++ b/64_FFT/pipeline.groovy
@@ -0,0 +1,50 @@
+import org.DevshGraphicsProgramming.Agent
+import org.DevshGraphicsProgramming.BuilderInfo
+import org.DevshGraphicsProgramming.IBuilder
+
+class CFFTBuilder extends IBuilder
+{
+	public CFFTBuilder(Agent _agent, _info)
+	{
+		super(_agent, _info)
+	}
+
+	@Override
+	public boolean prepare(Map axisMapping)
+	{
+		return true
+	}
+
+	@Override
+	public boolean build(Map axisMapping)
+	{
+		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
+		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")
+
+		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
+		def nameOfConfig = getNameOfConfig(config)
+
+		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")
+
+		return true
+	}
+
+	@Override
+	public boolean test(Map axisMapping)
+	{
+		return true
+	}
+
+	@Override
+	public boolean install(Map axisMapping)
+	{
+		return true
+	}
+}
+
+def create(Agent _agent, _info)
+{
+	return new CFFTBuilder(_agent, _info)
+}
+
+return this
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9bc4ffc2..0d485d3e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -64,5 +64,6 @@ if(NBL_BUILD_EXAMPLES)
 	add_subdirectory(61_UI EXCLUDE_FROM_ALL)
 	add_subdirectory(62_CAD EXCLUDE_FROM_ALL)
 	add_subdirectory(62_SchusslerTest EXCLUDE_FROM_ALL)
+	add_subdirectory(64_FFT EXCLUDE_FROM_ALL)
 	add_subdirectory(0_ImportanceSamplingEnvMaps EXCLUDE_FROM_ALL) #TODO: integrate back into 42
 endif()