From 6635ba9cad53f599e4643aa90e7dfa9e800d144c Mon Sep 17 00:00:00 2001
From: CrabeExtra <abbasgaroosi7@gmail.com>
Date: Tue, 23 Dec 2025 18:10:26 +0330
Subject: [PATCH 1/7] Add 73_ImageUploadBenchmark example

---
 73_ImageUploadBenchmark/CMakeLists.txt       |   6 +
 73_ImageUploadBenchmark/config.json.template |  28 ++
 73_ImageUploadBenchmark/main.cpp             | 392 +++++++++++++++++++
 73_ImageUploadBenchmark/pipeline.groovy      |  50 +++
 CMakeLists.txt                               |   1 +
 5 files changed, 477 insertions(+)
 create mode 100644 73_ImageUploadBenchmark/CMakeLists.txt
 create mode 100644 73_ImageUploadBenchmark/config.json.template
 create mode 100644 73_ImageUploadBenchmark/main.cpp
 create mode 100644 73_ImageUploadBenchmark/pipeline.groovy
diff --git a/73_ImageUploadBenchmark/CMakeLists.txt b/73_ImageUploadBenchmark/CMakeLists.txt
new file mode 100644
index 000000000..2f9218f93
--- /dev/null
+++ b/73_ImageUploadBenchmark/CMakeLists.txt
@@ -0,0 +1,6 @@
+include(common RESULT_VARIABLE RES)
+if(NOT RES)
+	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
+endif()
+
+nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
\ No newline at end of file
diff --git a/73_ImageUploadBenchmark/config.json.template b/73_ImageUploadBenchmark/config.json.template
new file mode 100644
index 000000000..12215d0bb
--- /dev/null
+++ b/73_ImageUploadBenchmark/config.json.template
@@ -0,0 +1,28 @@
+{
+  "enableParallelBuild": true,
+  "threadsPerBuildProcess" : 2,
+  "isExecuted": false,
+  "scriptPath": "",
+  "cmake": {
+    "configurations": [ "Release", "Debug", "RelWithDebInfo" ],
+    "buildModes": [],
+    "requiredOptions": []
+  }, 
+  "profiles": [
+    {
+      "backend": "vulkan", // should be none
+      "platform": "windows",
+      "buildModes": [],
+      "runConfiguration": "Release", // we also need to run in Debug and RWDI because this is a foundational example
+      "gpuArchitectures": []
+    }
+  ],
+  "dependencies": [],
+  "data": [
+    {
+      "dependencies": [],
+      "command": [""],
+      "outputs": []
+    }
+  ]
+}
diff --git a/73_ImageUploadBenchmark/main.cpp b/73_ImageUploadBenchmark/main.cpp
new file mode 100644
index 000000000..a22647750
--- /dev/null
+++ b/73_ImageUploadBenchmark/main.cpp
@@ -0,0 +1,392 @@
+#include "nbl/examples/examples.hpp"
+#include "nbl/this_example/builtin/build/spirv/keys.hpp"
+
+using namespace nbl;
+using namespace nbl::core;
+using namespace nbl::hlsl;
+using namespace nbl::system;
+using namespace nbl::asset;
+using namespace nbl::ui;
+using namespace nbl::video;
+using namespace nbl::examples;
+
+#include "app_resources/common.hlsl"
+#include "nbl/builtin/hlsl/bit.hlsl"
+
+class CountingSortApp final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication
+{
+		using device_base_t = application_templates::MonoDeviceApplication;
+		using asset_base_t = BuiltinResourcesApplication;
+
+	public:
+		// Yay thanks to multiple inheritance we cannot forward ctors anymore
+		CountingSortApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
+			system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
+
+		// we stuff all our work here because its a "single shot" app
+		bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+		{
+			// Remember to call the base class initialization!
+			if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
+				return false;
+			if (!asset_base_t::onAppInitialized(std::move(system)))
+				return false;
+
+			auto limits = m_physicalDevice->getLimits();
+			constexpr std::array<uint32_t, 3u> AllowedMaxComputeSharedMemorySizes = {
+				16384, 32768, 65536
+			};
+
+			auto upperBoundSharedMemSize = std::upper_bound(AllowedMaxComputeSharedMemorySizes.begin(), AllowedMaxComputeSharedMemorySizes.end(), limits.maxComputeSharedMemorySize);
+			// devices which support less than 16KB of max compute shared memory size are not supported
+			if (upperBoundSharedMemSize == AllowedMaxComputeSharedMemorySizes.begin())
+			{
+				m_logger->log("maxComputeSharedMemorySize is too low (%u)", ILogger::E_LOG_LEVEL::ELL_ERROR, limits.maxComputeSharedMemorySize);
+				exit(0);
+			}
+
+			limits.maxComputeSharedMemorySize = *(upperBoundSharedMemSize - 1);
+
+			const uint32_t WorkgroupSize = limits.maxComputeWorkGroupInvocations;
+			const uint32_t MaxBucketCount = (limits.maxComputeSharedMemorySize / sizeof(uint32_t)) / 2;
+			constexpr uint32_t element_count = 100000;
+			const uint32_t bucket_count = std::min((uint32_t)3000, MaxBucketCount);
+			const uint32_t elements_per_thread = ceil((float)ceil((float)element_count / limits.computeUnits) / WorkgroupSize);
+
+			auto loadPrecompiledShader = [&]<core::StringLiteral ShaderKey>() -> smart_refctd_ptr<IShader>
+			{
+				// this time we load a shader directly from a file
+				IAssetLoader::SAssetLoadParams lp = {};
+				lp.logger = m_logger.get();
+				lp.workingDirectory = "app_resources"; // virtual root
+				auto key = nbl::this_example::builtin::build::get_spirv_key<ShaderKey>(limits, m_physicalDevice->getFeatures());
+				auto assetBundle = m_assetMgr->getAsset(key.data(), lp);
+				const auto assets = assetBundle.getContents();
+				if (assets.empty())
+				{
+					logFail("Could not load shader!");
+					return nullptr;
+				}
+
+				auto shader = IAsset::castDown<IShader>(assets[0]);
+				// The down-cast should not fail!
+				assert(shader);
+			
+				// There's two ways of doing stuff like this:
+				// 1. this - modifying the asset after load
+				// 2. creating a short shader source file that includes the asset you would have wanted to load
+				// 
+				//auto overrideSource = CHLSLCompiler::createOverridenCopy(
+				//	source.get(), "#define WorkgroupSize %d\n#define BucketCount %d\n",
+				//	WorkgroupSize, bucket_count
+				//);
+
+				// this time we skip the use of the asset converter since the IShader->IGPUShader path is quick and simple
+				return shader;
+			};
+			auto prefixSumShader = loadPrecompiledShader.operator()<"prefix_sum_shader">(); // "app_resources/prefix_sum_shader.comp.hlsl"
+			auto scatterShader = loadPrecompiledShader.operator()<"scatter_shader">(); // "app_resources/scatter_shader.comp.hlsl"
+
+			// People love Reflection but I prefer Shader Sources instead!
+			const nbl::asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,.offset = 0,.size = sizeof(CountingPushData) };
+
+			// This time we'll have no Descriptor Sets or Layouts because our workload has a widely varying size
+			// and using traditional SSBO bindings would force us to update the Descriptor Set every frame.
+			// I even started writing this sample with the use of Dynamic SSBOs, however the length of the buffer range is not dynamic
+			// only the offset. This means that we'd have to write the "worst case" length into the descriptor set binding.
+			// Then this has a knock-on effect that we couldn't allocate closer to the end of the streaming buffer than the "worst case" size.
+			smart_refctd_ptr<IGPUPipelineLayout> layout;
+			smart_refctd_ptr<IGPUComputePipeline> prefixSumPipeline;
+			smart_refctd_ptr<IGPUComputePipeline> scatterPipeline;
+			{
+				layout = m_device->createPipelineLayout({ &pcRange,1 });
+				IGPUComputePipeline::SCreationParams params = {};
+				params.layout = layout.get();
+				params.shader.shader = prefixSumShader.get();
+				params.shader.entryPoint = "main";
+				params.shader.entries = nullptr;
+				params.shader.requiredSubgroupSize = static_cast<IPipelineBase::SUBGROUP_SIZE>(5);
+				params.cached.requireFullSubgroups = true;
+				if (!m_device->createComputePipelines(nullptr, { &params,1 }, &prefixSumPipeline))
+					return logFail("Failed to create compute pipeline!\n");
+				params.shader.shader = scatterShader.get();
+				if (!m_device->createComputePipelines(nullptr, { &params,1 }, &scatterPipeline))
+					return logFail("Failed to create compute pipeline!\n");
+			}
+
+			// Allocate memory
+			nbl::video::IDeviceMemoryAllocator::SAllocation allocation[5] = {};
+			smart_refctd_ptr<IGPUBuffer> buffers[5];
+			//smart_refctd_ptr<nbl::video::IGPUDescriptorSet> ds;
+			{
+				auto build_buffer = [this](
+					smart_refctd_ptr<ILogicalDevice> m_device,
+					nbl::video::IDeviceMemoryAllocator::SAllocation *allocation,
+					smart_refctd_ptr<IGPUBuffer>& buffer,
+					size_t buffer_size,
+					const char *label
+				) -> void {
+					IGPUBuffer::SCreationParams params;
+					params.size = buffer_size;
+					params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
+					buffer = m_device->createBuffer(std::move(params));
+					if (!buffer)
+						logFail("Failed to create GPU buffer of size %d!\n", buffer_size);
+
+					buffer->setObjectDebugName(label);
+
+					auto reqs = buffer->getMemoryReqs();
+					reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits();
+
+					*allocation = m_device->allocate(reqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
+					if (!allocation->isValid())
+						logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n");
+
+					assert(allocation->memory.get() == buffer->getBoundMemory().memory);
+				};
+
+				build_buffer(m_device,	allocation,		buffers[0], sizeof(uint32_t) * element_count,	"Input Key Buffer");
+				build_buffer(m_device,	allocation + 1,	buffers[1], sizeof(uint32_t) * element_count,	"Input Value Buffer");
+				build_buffer(m_device,	allocation + 2, buffers[2], sizeof(uint32_t) * bucket_count,	"Scratch Buffer");
+				build_buffer(m_device,	allocation + 3,	buffers[3], sizeof(uint32_t) * element_count,	"Output Key Buffer");
+				build_buffer(m_device,	allocation + 4, buffers[4], sizeof(uint32_t) * element_count,	"Output Value Buffer");
+			}
+			uint64_t buffer_device_address[] = {
+				buffers[0]->getDeviceAddress(),
+				buffers[1]->getDeviceAddress(),
+				buffers[2]->getDeviceAddress(),
+				buffers[3]->getDeviceAddress(),
+				buffers[4]->getDeviceAddress()
+			};
+
+			void* mapped_memory[] = {
+				allocation[0].memory->map({0ull,allocation[0].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ),
+				allocation[1].memory->map({0ull,allocation[1].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ),
+				allocation[2].memory->map({0ull,allocation[2].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ),
+				allocation[3].memory->map({0ull,allocation[3].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ),
+				allocation[4].memory->map({0ull,allocation[3].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ),
+			};
+			if (!mapped_memory[0] || !mapped_memory[1] || !mapped_memory[2] || !mapped_memory[3] || !mapped_memory[4])
+				return logFail("Failed to map the Device Memory!\n");
+
+			// Generate random data
+			constexpr uint32_t minimum = 0;
+			const uint32_t range = bucket_count;
+			unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
+			std::mt19937 g(seed);
+
+			auto bufferData = new uint32_t[2][element_count];
+			for (uint32_t i = 0; i < element_count; i++) {
+				bufferData[0][i] = minimum + g() % range;
+			}
+
+			memcpy(mapped_memory[0], bufferData[0], sizeof(uint32_t) * element_count);
+
+			for (uint32_t i = 0; i < element_count; i++) {
+				bufferData[1][i] = g() % std::numeric_limits<uint32_t>::max();
+			}
+
+			memcpy(mapped_memory[1], bufferData[1], sizeof(uint32_t) * element_count);
+
+			std::string outBuffer;
+			for (auto i = 0; i < element_count; i++) {
+				outBuffer.append("{");
+				outBuffer.append(std::to_string(bufferData[0][i]));
+				outBuffer.append(", ");
+				outBuffer.append(std::to_string(bufferData[1][i]));
+				outBuffer.append("} ");
+			}
+			outBuffer.append("\n");
+			outBuffer.append("Count: ");
+			outBuffer.append(std::to_string(element_count));
+			outBuffer.append("\n");
+			m_logger->log("Your input array is: \n" + outBuffer, ILogger::ELL_PERFORMANCE);
+
+			auto pc = CountingPushData {
+				.inputKeyAddress = buffer_device_address[0],
+				.inputValueAddress = buffer_device_address[1],
+				.histogramAddress = buffer_device_address[2],
+				.outputKeyAddress = buffer_device_address[3],
+				.outputValueAddress = buffer_device_address[4],
+				.dataElementCount = element_count,
+				.elementsPerWT = elements_per_thread,
+				.minimum = minimum,
+				.maximum = minimum + bucket_count - 1,
+			};
+
+			smart_refctd_ptr<nbl::video::IGPUCommandBuffer> cmdBuf;
+			{
+				smart_refctd_ptr<nbl::video::IGPUCommandPool> cmdpool = m_device->createCommandPool(getComputeQueue()->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+				if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdBuf))
+					return logFail("Failed to create Command Buffers!\n");
+			}
+
+			// Create the Semaphore for prefix sum
+			constexpr uint64_t started_value = 0;
+			uint64_t timeline = started_value;
+			smart_refctd_ptr<ISemaphore> progress = m_device->createSemaphore(started_value);
+
+			cmdBuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+			cmdBuf->beginDebugMarker("Prefix Sum Dispatch", core::vectorSIMDf(0, 1, 0, 1));
+			cmdBuf->bindComputePipeline(prefixSumPipeline.get());
+			cmdBuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc);
+			cmdBuf->dispatch(ceil((float)element_count / (elements_per_thread * WorkgroupSize)), 1, 1);
+			cmdBuf->endDebugMarker();
+			cmdBuf->end();
+
+			{
+				auto queue = getComputeQueue();
+
+				IQueue::SSubmitInfo submit_infos[1];
+				IQueue::SSubmitInfo::SCommandBufferInfo cmdBufs[] = {
+					{
+						.cmdbuf = cmdBuf.get()
+					}
+				};
+				submit_infos[0].commandBuffers = cmdBufs;
+				IQueue::SSubmitInfo::SSemaphoreInfo signals[] = {
+					{
+						.semaphore = progress.get(),
+						.value = ++timeline,
+						.stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
+					}
+				};
+				submit_infos[0].signalSemaphores = signals;
+
+				m_api->startCapture();
+				queue->submit(submit_infos);
+				m_api->endCapture();
+			}
+
+			const ISemaphore::SWaitInfo wait_infos[] = { {
+					.semaphore = progress.get(),
+					.value = timeline
+				} };
+			m_device->blockForSemaphores(wait_infos);
+
+			// Create the Semaphore for Scatter
+			uint64_t timeline2 = started_value;
+			smart_refctd_ptr<ISemaphore> progress2 = m_device->createSemaphore(started_value);
+
+			cmdBuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+			cmdBuf->beginDebugMarker("Scatter Dispatch", core::vectorSIMDf(0, 1, 0, 1));
+			cmdBuf->bindComputePipeline(scatterPipeline.get());
+			cmdBuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc);
+			cmdBuf->dispatch(ceil((float)element_count / (elements_per_thread * WorkgroupSize)), 1, 1);
+			cmdBuf->endDebugMarker();
+			cmdBuf->end();
+
+			{
+				auto queue = getComputeQueue();
+
+				IQueue::SSubmitInfo submit_infos[1];
+				IQueue::SSubmitInfo::SCommandBufferInfo cmdBufs[] = {
+					{
+						.cmdbuf = cmdBuf.get()
+					}
+				};
+				submit_infos[0].commandBuffers = cmdBufs;
+				IQueue::SSubmitInfo::SSemaphoreInfo waits[] = {
+					{
+						.semaphore = progress.get(),
+						.value = timeline,
+						.stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
+					}
+				};
+				submit_infos[0].waitSemaphores = waits;
+				IQueue::SSubmitInfo::SSemaphoreInfo signals[] = {
+					{
+						.semaphore = progress2.get(),
+						.value = ++timeline2,
+						.stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
+					}
+				};
+				submit_infos[0].signalSemaphores = signals;
+
+				m_api->startCapture();
+				queue->submit(submit_infos);
+				m_api->endCapture();
+			}
+
+			const ISemaphore::SWaitInfo wait_infos2[] = {{
+					.semaphore = progress2.get(),
+					.value = timeline2
+				} };
+			m_device->blockForSemaphores(wait_infos2);
+
+			const ILogicalDevice::MappedMemoryRange memory_range[] = {
+				ILogicalDevice::MappedMemoryRange(allocation[0].memory.get(), 0ull, allocation[0].memory->getAllocationSize()),
+				ILogicalDevice::MappedMemoryRange(allocation[1].memory.get(), 0ull, allocation[1].memory->getAllocationSize()),
+				ILogicalDevice::MappedMemoryRange(allocation[2].memory.get(), 0ull, allocation[2].memory->getAllocationSize()),
+				ILogicalDevice::MappedMemoryRange(allocation[3].memory.get(), 0ull, allocation[3].memory->getAllocationSize()),
+				ILogicalDevice::MappedMemoryRange(allocation[4].memory.get(), 0ull, allocation[4].memory->getAllocationSize())
+			};
+
+			if (!allocation[0].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+				m_device->invalidateMappedMemoryRanges(1, &memory_range[0]);
+			if (!allocation[1].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+				m_device->invalidateMappedMemoryRanges(1, &memory_range[1]);
+			if (!allocation[2].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+				m_device->invalidateMappedMemoryRanges(1, &memory_range[2]);
+			if (!allocation[3].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+				m_device->invalidateMappedMemoryRanges(1, &memory_range[3]);
+			if (!allocation[4].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+				m_device->invalidateMappedMemoryRanges(1, &memory_range[4]);
+
+			const uint32_t* buffData[] = {
+				reinterpret_cast<const uint32_t*>(allocation[2].memory->getMappedPointer()),
+				reinterpret_cast<const uint32_t*>(allocation[3].memory->getMappedPointer()),
+				reinterpret_cast<const uint32_t*>(allocation[4].memory->getMappedPointer())
+			};
+
+			assert(allocation[2].offset == 0); // simpler than writing out all the pointer arithmetic
+			assert(allocation[3].offset == 0); // simpler than writing out all the pointer arithmetic
+			assert(allocation[4].offset == 0); // simpler than writing out all the pointer arithmetic
+
+			outBuffer.clear();
+			for (auto i = 0; i < bucket_count; i++) {
+				outBuffer.append(std::to_string(buffData[0][i]));
+				outBuffer.append(" ");
+			}
+			outBuffer.append("\n");
+
+			m_logger->log("Scratch buffer is: \n" + outBuffer, ILogger::ELL_PERFORMANCE);
+
+			outBuffer.clear();
+			for (auto i = 0; i < element_count; i++) {
+				outBuffer.append("{");
+				outBuffer.append(std::to_string(buffData[1][i]));
+				outBuffer.append(", ");
+				outBuffer.append(std::to_string(buffData[2][i]));
+				outBuffer.append("} ");
+			}
+			outBuffer.append("\n");
+			outBuffer.append("Count: ");
+			outBuffer.append(std::to_string(element_count));
+			outBuffer.append("\n");
+			m_logger->log("Your output array is: \n" + outBuffer, ILogger::ELL_PERFORMANCE);
+
+			allocation[0].memory->unmap();
+			allocation[1].memory->unmap();
+			allocation[2].memory->unmap();
+			allocation[3].memory->unmap();
+			allocation[4].memory->unmap();
+
+			m_device->waitIdle();
+
+			delete[] bufferData;
+
+			return true;
+		}
+
+		// Ok this time we'll actually have a work loop (maybe just for the sake of future WASM so we don't timeout a Browser Tab with an unresponsive script)
+		bool keepRunning() override { return false; }
+
+		// Finally the first actual work-loop
+		void workLoopBody() override {}
+
+		bool onAppTerminated() override { return true; }
+};
+
+
+NBL_MAIN_FUNC(CountingSortApp)
\ No newline at end of file
diff --git a/73_ImageUploadBenchmark/pipeline.groovy b/73_ImageUploadBenchmark/pipeline.groovy
new file mode 100644
index 000000000..1249f10b5
--- /dev/null
+++ b/73_ImageUploadBenchmark/pipeline.groovy
@@ -0,0 +1,50 @@
+import org.DevshGraphicsProgramming.Agent
+import org.DevshGraphicsProgramming.BuilderInfo
+import org.DevshGraphicsProgramming.IBuilder
+
+class CImageUploadBenchmark extends IBuilder
+{
+	public CImageUploadBenchmark(Agent _agent, _info)
+	{
+		super(_agent, _info)
+	}
+	
+	@Override
+	public boolean prepare(Map axisMapping)
+	{
+		return true
+	}
+	
+	@Override
+  	public boolean build(Map axisMapping)
+	{
+		IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION")
+		IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE")
+		
+		def nameOfBuildDirectory = getNameOfBuildDirectory(buildType)
+		def nameOfConfig = getNameOfConfig(config)
+		
+		agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v")
+		
+		return true
+	}
+	
+	@Override
+  	public boolean test(Map axisMapping)
+	{
+		return true
+	}
+	
+	@Override
+	public boolean install(Map axisMapping)
+	{
+		return true
+	}
+}
+
+def create(Agent _agent, _info) // returns the builder class declared in this script
+{
+	return new CImageUploadBenchmark(_agent, _info)
+}
+
+return this
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cbe482aa4..2d4ed7408 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -89,6 +89,7 @@ if(NBL_BUILD_EXAMPLES)
   	add_subdirectory(70_FLIPFluids)
 	add_subdirectory(71_RayTracingPipeline)
 	add_subdirectory(72_CooperativeBinarySearch)
+	add_subdirectory(73_ImageUploadBenchmark)
 
 	# add new examples *before* NBL_GET_ALL_TARGETS invocation, it gathers recursively all targets created so far in this subdirectory
 	NBL_GET_ALL_TARGETS(TARGETS)

From 951e2fdd218abc307f9890d69f7a9be38d28f95a Mon Sep 17 00:00:00 2001
From: CrabeExtra <abbasgaroosi7@gmail.com>
Date: Wed, 24 Dec 2025 18:38:09 +0330
Subject: [PATCH 2/7] Simple benchmark HOST_VISIBLE vs HOST_VISIBLE &
 DEVICE_LOCAL

---
 73_ImageUploadBenchmark/main.cpp | 694 ++++++++++++++++---------------
 1 file changed, 357 insertions(+), 337 deletions(-)

diff --git a/73_ImageUploadBenchmark/main.cpp b/73_ImageUploadBenchmark/main.cpp
index a22647750..68815681d 100644
--- a/73_ImageUploadBenchmark/main.cpp
+++ b/73_ImageUploadBenchmark/main.cpp
@@ -1,392 +1,412 @@
 #include "nbl/examples/examples.hpp"
-#include "nbl/this_example/builtin/build/spirv/keys.hpp"
+#include <chrono>
 
 using namespace nbl;
 using namespace nbl::core;
-using namespace nbl::hlsl;
 using namespace nbl::system;
 using namespace nbl::asset;
-using namespace nbl::ui;
 using namespace nbl::video;
 using namespace nbl::examples;
 
-#include "app_resources/common.hlsl"
-#include "nbl/builtin/hlsl/bit.hlsl"
-
-class CountingSortApp final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication
+class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication
 {
-		using device_base_t = application_templates::MonoDeviceApplication;
-		using asset_base_t = BuiltinResourcesApplication;
+	using device_base_t = application_templates::MonoDeviceApplication;
+	using asset_base_t = BuiltinResourcesApplication;
+
+public:
+	ImageUploadBenchmarkApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
+		system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
+
+	bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+	{
+		if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
+			return false;
+		if (!asset_base_t::onAppInitialized(std::move(system)))
+			return false;
+
+		constexpr uint32_t TILE_SIZE = 128;
+		constexpr uint32_t TILE_BYTES_PER_PIXEL = 4;
+		constexpr uint32_t TILE_SIZE_BYTES = TILE_SIZE * TILE_SIZE * TILE_BYTES_PER_PIXEL;
+		constexpr uint32_t STAGING_BUFFER_SIZE = 64 * 1024 * 1024;
+		constexpr uint32_t TILES_PER_FRAME = STAGING_BUFFER_SIZE / TILE_SIZE_BYTES;
+		constexpr uint32_t FRAMES_IN_FLIGHT = 4;
+		constexpr uint32_t TOTAL_FRAMES = 1000;
+
+		m_logger->log("GPU Memory Transfer Benchmark", ILogger::ELL_INFO);
+		m_logger->log("Tile size: %ux%u (%u KB)", ILogger::ELL_INFO, TILE_SIZE, TILE_SIZE, TILE_SIZE_BYTES / 1024);
+		m_logger->log("Staging buffer: %u MB", ILogger::ELL_INFO, STAGING_BUFFER_SIZE / (1024 * 1024));
+		m_logger->log("Tiles per frame: %u", ILogger::ELL_INFO, TILES_PER_FRAME);
+		m_logger->log("Frames in flight: %u", ILogger::ELL_INFO, FRAMES_IN_FLIGHT);
+
+		uint32_t hostVisibleBits = m_physicalDevice->getHostVisibleMemoryTypeBits();
+		uint32_t deviceLocalBits = m_physicalDevice->getDeviceLocalMemoryTypeBits();
+		uint32_t hostVisibleOnlyBits = hostVisibleBits & ~deviceLocalBits;
+		uint32_t hostVisibleDeviceLocalBits = hostVisibleBits & deviceLocalBits;
+
+		if (!hostVisibleOnlyBits)
+		{
+			m_logger->log("HOST_VISIBLE memory types not found!", ILogger::ELL_ERROR);
+			return false;
+		}
 
-	public:
-		// Yay thanks to multiple inheritance we cannot forward ctors anymore
-		CountingSortApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) :
-			system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {}
+		if (!deviceLocalBits)
+		{
+			m_logger->log("DEVICE_LOCAL memory types not found!", ILogger::ELL_ERROR);
+			return false;
+		}
 
-		// we stuff all our work here because its a "single shot" app
-		bool onAppInitialized(smart_refctd_ptr<ISystem>&& system) override
+		IQueue* queue = getQueue(IQueue::FAMILY_FLAGS::GRAPHICS_BIT);
+		smart_refctd_ptr<IGPUImage> destinationImage;
 		{
-			// Remember to call the base class initialization!
-			if (!device_base_t::onAppInitialized(smart_refctd_ptr(system)))
-				return false;
-			if (!asset_base_t::onAppInitialized(std::move(system)))
-				return false;
+			IGPUImage::SCreationParams imgParams{};
+			imgParams.type = IImage::E_TYPE::ET_2D;
+			imgParams.extent.width = TILE_SIZE * 32;
+			imgParams.extent.height = TILE_SIZE * 32;
+			imgParams.extent.depth = 1u;
+			imgParams.format = asset::E_FORMAT::EF_R8G8B8A8_UNORM;
+			imgParams.mipLevels = 1u;
+			imgParams.flags = IImage::ECF_NONE;
+			imgParams.arrayLayers = 1u;
+			imgParams.samples = IImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT;
+			imgParams.tiling = video::IGPUImage::TILING::OPTIMAL;
+			imgParams.usage = asset::IImage::EUF_TRANSFER_DST_BIT;
+			imgParams.preinitialized = false;
+
+			destinationImage = m_device->createImage(std::move(imgParams));
+			if (!destinationImage)
+				return logFail("Failed to create destination image!\n");
+
+			destinationImage->setObjectDebugName("Destination Image");
+
+			auto reqs = destinationImage->getMemoryReqs();
+			reqs.memoryTypeBits &= deviceLocalBits;
+
+			auto allocation = m_device->allocate(reqs, destinationImage.get(), IDeviceMemoryAllocation::EMAF_NONE);
+			if (!allocation.isValid())
+				return logFail("Failed to allocate DEVICE_LOCAL memory for destination image!\n");
+		}
 
-			auto limits = m_physicalDevice->getLimits();
-			constexpr std::array<uint32_t, 3u> AllowedMaxComputeSharedMemorySizes = {
-				16384, 32768, 65536
-			};
+		m_logger->log("\nTesting Strategy 1: System RAM", ILogger::ELL_INFO);
+
+		double throughputSystemRAM = 0.0;
+		{
+			smart_refctd_ptr<IGPUBuffer> stagingBuffer;
+			IDeviceMemoryAllocator::SAllocation stagingAlloc;
+			void* mappedPtr = nullptr;
 
-			auto upperBoundSharedMemSize = std::upper_bound(AllowedMaxComputeSharedMemorySizes.begin(), AllowedMaxComputeSharedMemorySizes.end(), limits.maxComputeSharedMemorySize);
-			// devices which support less than 16KB of max compute shared memory size are not supported
-			if (upperBoundSharedMemSize == AllowedMaxComputeSharedMemorySizes.begin())
+			if (!createStagingBuffer(STAGING_BUFFER_SIZE, hostVisibleOnlyBits,
+				"Staging Buffer - System RAM", stagingBuffer, stagingAlloc, mappedPtr))
 			{
-				m_logger->log("maxComputeSharedMemorySize is too low (%u)", ILogger::E_LOG_LEVEL::ELL_ERROR, limits.maxComputeSharedMemorySize);
-				exit(0);
+				return false;
 			}
 
-			limits.maxComputeSharedMemorySize = *(upperBoundSharedMemSize - 1);
+			throughputSystemRAM = runBenchmark(
+				"System RAM",
+				stagingBuffer.get(),
+				mappedPtr,
+				destinationImage.get(),
+				TILE_SIZE,
+				TILE_SIZE_BYTES,
+				TILES_PER_FRAME,
+				FRAMES_IN_FLIGHT,
+				TOTAL_FRAMES,
+				queue
+			);
+
+			stagingAlloc.memory->unmap();
+		}
 
-			const uint32_t WorkgroupSize = limits.maxComputeWorkGroupInvocations;
-			const uint32_t MaxBucketCount = (limits.maxComputeSharedMemorySize / sizeof(uint32_t)) / 2;
-			constexpr uint32_t element_count = 100000;
-			const uint32_t bucket_count = std::min((uint32_t)3000, MaxBucketCount);
-			const uint32_t elements_per_thread = ceil((float)ceil((float)element_count / limits.computeUnits) / WorkgroupSize);
+		m_logger->log("System RAM throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, throughputSystemRAM);
 
-			auto loadPrecompiledShader = [&]<core::StringLiteral ShaderKey>() -> smart_refctd_ptr<IShader>
-			{
-				// this time we load a shader directly from a file
-				IAssetLoader::SAssetLoadParams lp = {};
-				lp.logger = m_logger.get();
-				lp.workingDirectory = "app_resources"; // virtual root
-				auto key = nbl::this_example::builtin::build::get_spirv_key<ShaderKey>(limits, m_physicalDevice->getFeatures());
-				auto assetBundle = m_assetMgr->getAsset(key.data(), lp);
-				const auto assets = assetBundle.getContents();
-				if (assets.empty())
-				{
-					logFail("Could not load shader!");
-					return nullptr;
-				}
+		m_device->waitIdle();
 
-				auto shader = IAsset::castDown<IShader>(assets[0]);
-				// The down-cast should not fail!
-				assert(shader);
-			
-				// There's two ways of doing stuff like this:
-				// 1. this - modifying the asset after load
-				// 2. creating a short shader source file that includes the asset you would have wanted to load
-				// 
-				//auto overrideSource = CHLSLCompiler::createOverridenCopy(
-				//	source.get(), "#define WorkgroupSize %d\n#define BucketCount %d\n",
-				//	WorkgroupSize, bucket_count
-				//);
-
-				// this time we skip the use of the asset converter since the IShader->IGPUShader path is quick and simple
-				return shader;
-			};
-			auto prefixSumShader = loadPrecompiledShader.operator()<"prefix_sum_shader">(); // "app_resources/prefix_sum_shader.comp.hlsl"
-			auto scatterShader = loadPrecompiledShader.operator()<"scatter_shader">(); // "app_resources/scatter_shader.comp.hlsl"
-
-			// People love Reflection but I prefer Shader Sources instead!
-			const nbl::asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,.offset = 0,.size = sizeof(CountingPushData) };
-
-			// This time we'll have no Descriptor Sets or Layouts because our workload has a widely varying size
-			// and using traditional SSBO bindings would force us to update the Descriptor Set every frame.
-			// I even started writing this sample with the use of Dynamic SSBOs, however the length of the buffer range is not dynamic
-			// only the offset. This means that we'd have to write the "worst case" length into the descriptor set binding.
-			// Then this has a knock-on effect that we couldn't allocate closer to the end of the streaming buffer than the "worst case" size.
-			smart_refctd_ptr<IGPUPipelineLayout> layout;
-			smart_refctd_ptr<IGPUComputePipeline> prefixSumPipeline;
-			smart_refctd_ptr<IGPUComputePipeline> scatterPipeline;
-			{
-				layout = m_device->createPipelineLayout({ &pcRange,1 });
-				IGPUComputePipeline::SCreationParams params = {};
-				params.layout = layout.get();
-				params.shader.shader = prefixSumShader.get();
-				params.shader.entryPoint = "main";
-				params.shader.entries = nullptr;
-				params.shader.requiredSubgroupSize = static_cast<IPipelineBase::SUBGROUP_SIZE>(5);
-				params.cached.requireFullSubgroups = true;
-				if (!m_device->createComputePipelines(nullptr, { &params,1 }, &prefixSumPipeline))
-					return logFail("Failed to create compute pipeline!\n");
-				params.shader.shader = scatterShader.get();
-				if (!m_device->createComputePipelines(nullptr, { &params,1 }, &scatterPipeline))
-					return logFail("Failed to create compute pipeline!\n");
-			}
+		if (hostVisibleDeviceLocalBits)
+		{
+			m_logger->log("\nTesting Strategy 2: VRAM (ReBAR)", ILogger::ELL_INFO);
 
-			// Allocate memory
-			nbl::video::IDeviceMemoryAllocator::SAllocation allocation[5] = {};
-			smart_refctd_ptr<IGPUBuffer> buffers[5];
-			//smart_refctd_ptr<nbl::video::IGPUDescriptorSet> ds;
+			double throughputVRAM = 0.0;
 			{
-				auto build_buffer = [this](
-					smart_refctd_ptr<ILogicalDevice> m_device,
-					nbl::video::IDeviceMemoryAllocator::SAllocation *allocation,
-					smart_refctd_ptr<IGPUBuffer>& buffer,
-					size_t buffer_size,
-					const char *label
-				) -> void {
-					IGPUBuffer::SCreationParams params;
-					params.size = buffer_size;
-					params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
-					buffer = m_device->createBuffer(std::move(params));
-					if (!buffer)
-						logFail("Failed to create GPU buffer of size %d!\n", buffer_size);
-
-					buffer->setObjectDebugName(label);
-
-					auto reqs = buffer->getMemoryReqs();
-					reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits();
-
-					*allocation = m_device->allocate(reqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
-					if (!allocation->isValid())
-						logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n");
-
-					assert(allocation->memory.get() == buffer->getBoundMemory().memory);
-				};
+				smart_refctd_ptr<IGPUBuffer> stagingBuffer;
+				IDeviceMemoryAllocator::SAllocation stagingAlloc;
+				void* mappedPtr = nullptr;
 
-				build_buffer(m_device,	allocation,		buffers[0], sizeof(uint32_t) * element_count,	"Input Key Buffer");
-				build_buffer(m_device,	allocation + 1,	buffers[1], sizeof(uint32_t) * element_count,	"Input Value Buffer");
-				build_buffer(m_device,	allocation + 2, buffers[2], sizeof(uint32_t) * bucket_count,	"Scratch Buffer");
-				build_buffer(m_device,	allocation + 3,	buffers[3], sizeof(uint32_t) * element_count,	"Output Key Buffer");
-				build_buffer(m_device,	allocation + 4, buffers[4], sizeof(uint32_t) * element_count,	"Output Value Buffer");
-			}
-			uint64_t buffer_device_address[] = {
-				buffers[0]->getDeviceAddress(),
-				buffers[1]->getDeviceAddress(),
-				buffers[2]->getDeviceAddress(),
-				buffers[3]->getDeviceAddress(),
-				buffers[4]->getDeviceAddress()
-			};
+				if (!createStagingBuffer(STAGING_BUFFER_SIZE, hostVisibleDeviceLocalBits,
+					"Staging Buffer - VRAM (ReBAR)", stagingBuffer, stagingAlloc, mappedPtr))
+				{
+					return false;
+				}
 
-			void* mapped_memory[] = {
-				allocation[0].memory->map({0ull,allocation[0].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ),
-				allocation[1].memory->map({0ull,allocation[1].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ),
-				allocation[2].memory->map({0ull,allocation[2].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ),
-				allocation[3].memory->map({0ull,allocation[3].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ),
-				allocation[4].memory->map({0ull,allocation[3].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ),
-			};
-			if (!mapped_memory[0] || !mapped_memory[1] || !mapped_memory[2] || !mapped_memory[3] || !mapped_memory[4])
-				return logFail("Failed to map the Device Memory!\n");
-
-			// Generate random data
-			constexpr uint32_t minimum = 0;
-			const uint32_t range = bucket_count;
-			unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
-			std::mt19937 g(seed);
-
-			auto bufferData = new uint32_t[2][element_count];
-			for (uint32_t i = 0; i < element_count; i++) {
-				bufferData[0][i] = minimum + g() % range;
+				throughputVRAM = runBenchmark(
+					"VRAM (ReBAR)",
+					stagingBuffer.get(),
+					mappedPtr,
+					destinationImage.get(),
+					TILE_SIZE,
+					TILE_SIZE_BYTES,
+					TILES_PER_FRAME,
+					FRAMES_IN_FLIGHT,
+					TOTAL_FRAMES,
+					queue
+				);
+
+				stagingAlloc.memory->unmap();
 			}
 
-			memcpy(mapped_memory[0], bufferData[0], sizeof(uint32_t) * element_count);
-
-			for (uint32_t i = 0; i < element_count; i++) {
-				bufferData[1][i] = g() % std::numeric_limits<uint32_t>::max();
-			}
+			m_logger->log("VRAM throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, throughputVRAM);
 
-			memcpy(mapped_memory[1], bufferData[1], sizeof(uint32_t) * element_count);
+			double speedup = throughputVRAM / throughputSystemRAM;
+			m_logger->log("\nVRAM is %.2fx faster than System RAM", ILogger::ELL_PERFORMANCE, speedup);
+		}
 
-			std::string outBuffer;
-			for (auto i = 0; i < element_count; i++) {
-				outBuffer.append("{");
-				outBuffer.append(std::to_string(bufferData[0][i]));
-				outBuffer.append(", ");
-				outBuffer.append(std::to_string(bufferData[1][i]));
-				outBuffer.append("} ");
-			}
-			outBuffer.append("\n");
-			outBuffer.append("Count: ");
-			outBuffer.append(std::to_string(element_count));
-			outBuffer.append("\n");
-			m_logger->log("Your input array is: \n" + outBuffer, ILogger::ELL_PERFORMANCE);
-
-			auto pc = CountingPushData {
-				.inputKeyAddress = buffer_device_address[0],
-				.inputValueAddress = buffer_device_address[1],
-				.histogramAddress = buffer_device_address[2],
-				.outputKeyAddress = buffer_device_address[3],
-				.outputValueAddress = buffer_device_address[4],
-				.dataElementCount = element_count,
-				.elementsPerWT = elements_per_thread,
-				.minimum = minimum,
-				.maximum = minimum + bucket_count - 1,
-			};
+		return true;
+	}
+
+	bool keepRunning() override { return false; }
+	void workLoopBody() override {}
+	bool onAppTerminated() override { return true; }
+
+protected:
+	core::vector<queue_req_t> getQueueRequirements() const override
+	{
+		using flags_t = IQueue::FAMILY_FLAGS;
+		return { {
+			.requiredFlags = flags_t::GRAPHICS_BIT,
+			.disallowedFlags = flags_t::NONE,
+			.queueCount = 1,
+			.maxImageTransferGranularity = {1, 1, 1}
+		} };
+	}
+
+private:
+	void transitionImageLayout(
+		IGPUCommandBuffer* cmdBuf,
+		IGPUImage* image,
+		IImage::LAYOUT oldLayout,
+		IImage::LAYOUT newLayout)
+	{
+		IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> barrier = {};
+		barrier.oldLayout = oldLayout;
+		barrier.newLayout = newLayout;
+		barrier.image = image;
+		barrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
+		barrier.subresourceRange.baseMipLevel = 0;
+		barrier.subresourceRange.levelCount = 1;
+		barrier.subresourceRange.baseArrayLayer = 0;
+		barrier.subresourceRange.layerCount = 1;
+		barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::NONE;
+		barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT;
+		barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS;
+		barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS;
+		cmdBuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&barrier, 1} });
+	}
+
+	void generateTileCopyRegions(
+		IImage::SBufferCopy* outRegions,
+		uint32_t tilesPerFrame,
+		uint32_t tileSize,
+		uint32_t tileSizeBytes,
+		uint32_t imageWidth)
+	{
+		uint32_t tilesPerRow = imageWidth / tileSize;
+		for (size_t i = 0; i < tilesPerFrame; i++)
+		{
+			uint32_t tileX = (i % tilesPerRow) * tileSize;
+			uint32_t tileY = (i / tilesPerRow) * tileSize;
+
+			outRegions[i].bufferOffset = i * tileSizeBytes;
+			outRegions[i].bufferRowLength = tileSize;
+			outRegions[i].bufferImageHeight = tileSize;
+			outRegions[i].imageOffset = { tileX, tileY, 0 };
+			outRegions[i].imageExtent = { tileSize, tileSize, 1 };
+			outRegions[i].imageSubresource.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
+			outRegions[i].imageSubresource.mipLevel = 0;
+			outRegions[i].imageSubresource.baseArrayLayer = 0;
+			outRegions[i].imageSubresource.layerCount = 1;
+		}
+	}
 
-			smart_refctd_ptr<nbl::video::IGPUCommandBuffer> cmdBuf;
-			{
-				smart_refctd_ptr<nbl::video::IGPUCommandPool> cmdpool = m_device->createCommandPool(getComputeQueue()->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
-				if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdBuf))
-					return logFail("Failed to create Command Buffers!\n");
-			}
+	void generateRandomTileData(void* mappedPtr, uint32_t sizeBytes)
+	{
+		uint32_t* data = (uint32_t*)mappedPtr;
+		unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
+		std::mt19937 g(seed);
+		const uint32_t valueCount = sizeBytes / sizeof(uint32_t);
 
-			// Create the Semaphore for prefix sum
-			constexpr uint64_t started_value = 0;
-			uint64_t timeline = started_value;
-			smart_refctd_ptr<ISemaphore> progress = m_device->createSemaphore(started_value);
+		auto bufferData = new uint32_t[valueCount];
 
-			cmdBuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-			cmdBuf->beginDebugMarker("Prefix Sum Dispatch", core::vectorSIMDf(0, 1, 0, 1));
-			cmdBuf->bindComputePipeline(prefixSumPipeline.get());
-			cmdBuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc);
-			cmdBuf->dispatch(ceil((float)element_count / (elements_per_thread * WorkgroupSize)), 1, 1);
-			cmdBuf->endDebugMarker();
-			cmdBuf->end();
+		for (uint32_t i = 0; i < valueCount; i++)
+		{
+			bufferData[i] = g();
+		}
+		memcpy(mappedPtr, bufferData, sizeBytes);
+		delete[] bufferData;
+	}
+
+	double runBenchmark(
+		const char* strategyName,
+		IGPUBuffer* stagingBuffer,
+		void* mappedPtr,
+		IGPUImage* destinationImage,
+		uint32_t tileSize,
+		uint32_t tileSizeBytes,
+		uint32_t tilesPerFrame,
+		uint32_t framesInFlight,
+		uint32_t totalFrames,
+		IQueue* queue)
+	{
+		smart_refctd_ptr<ISemaphore> timelineSemaphore = m_device->createSemaphore(0);
+
+		auto commandPools = new smart_refctd_ptr<IGPUCommandPool>[framesInFlight];
+		for (uint32_t i = 0; i < framesInFlight; i++)
+		{
+			commandPools[i] = m_device->createCommandPool(
+				queue->getFamilyIndex(),
+				IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT
+			);
+		}
 
-			{
-				auto queue = getComputeQueue();
+		auto commandBuffers = new smart_refctd_ptr<IGPUCommandBuffer>[framesInFlight];
+		for (uint32_t i = 0; i < framesInFlight; i++)
+		{
+			commandPools[i]->createCommandBuffers(
+				IGPUCommandPool::BUFFER_LEVEL::PRIMARY,
+				1,
+				&commandBuffers[i]
+			);
+		}
 
-				IQueue::SSubmitInfo submit_infos[1];
-				IQueue::SSubmitInfo::SCommandBufferInfo cmdBufs[] = {
-					{
-						.cmdbuf = cmdBuf.get()
-					}
-				};
-				submit_infos[0].commandBuffers = cmdBufs;
-				IQueue::SSubmitInfo::SSemaphoreInfo signals[] = {
-					{
-						.semaphore = progress.get(),
-						.value = ++timeline,
-						.stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
-					}
-				};
-				submit_infos[0].signalSemaphores = signals;
+		uint64_t timelineValue = 0;
 
-				m_api->startCapture();
-				queue->submit(submit_infos);
-				m_api->endCapture();
-			}
+		commandBuffers[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+		transitionImageLayout(
+			commandBuffers[0].get(),
+			destinationImage,
+			IImage::LAYOUT::UNDEFINED,
+			IImage::LAYOUT::TRANSFER_DST_OPTIMAL
+		);
+		commandBuffers[0]->end();
 
-			const ISemaphore::SWaitInfo wait_infos[] = { {
-					.semaphore = progress.get(),
-					.value = timeline
-				} };
-			m_device->blockForSemaphores(wait_infos);
+		IQueue::SSubmitInfo submitInfo = {};
+		IQueue::SSubmitInfo::SCommandBufferInfo cmdBufInfo = { .cmdbuf = commandBuffers[0].get() };
+		submitInfo.commandBuffers = { &cmdBufInfo, 1 };
 
-			// Create the Semaphore for Scatter
-			uint64_t timeline2 = started_value;
-			smart_refctd_ptr<ISemaphore> progress2 = m_device->createSemaphore(started_value);
+		IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = {
+			.semaphore = timelineSemaphore.get(),
+			.value = ++timelineValue,
+			.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
+		};
+		submitInfo.signalSemaphores = { &signalInfo, 1 };
 
-			cmdBuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-			cmdBuf->beginDebugMarker("Scatter Dispatch", core::vectorSIMDf(0, 1, 0, 1));
-			cmdBuf->bindComputePipeline(scatterPipeline.get());
-			cmdBuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc);
-			cmdBuf->dispatch(ceil((float)element_count / (elements_per_thread * WorkgroupSize)), 1, 1);
-			cmdBuf->endDebugMarker();
-			cmdBuf->end();
+		queue->submit({ &submitInfo, 1 });
 
-			{
-				auto queue = getComputeQueue();
+		ISemaphore::SWaitInfo waitInfo = {
+			.semaphore = timelineSemaphore.get(),
+			.value = timelineValue
+		};
+		m_device->blockForSemaphores({ &waitInfo, 1 });
 
-				IQueue::SSubmitInfo submit_infos[1];
-				IQueue::SSubmitInfo::SCommandBufferInfo cmdBufs[] = {
-					{
-						.cmdbuf = cmdBuf.get()
-					}
-				};
-				submit_infos[0].commandBuffers = cmdBufs;
-				IQueue::SSubmitInfo::SSemaphoreInfo waits[] = {
-					{
-						.semaphore = progress.get(),
-						.value = timeline,
-						.stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
-					}
-				};
-				submit_infos[0].waitSemaphores = waits;
-				IQueue::SSubmitInfo::SSemaphoreInfo signals[] = {
-					{
-						.semaphore = progress2.get(),
-						.value = ++timeline2,
-						.stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
-					}
-				};
-				submit_infos[0].signalSemaphores = signals;
+		auto regions = new IImage::SBufferCopy[tilesPerFrame];
 
-				m_api->startCapture();
-				queue->submit(submit_infos);
-				m_api->endCapture();
-			}
+		generateRandomTileData(mappedPtr, tilesPerFrame * tileSizeBytes);
 
-			const ISemaphore::SWaitInfo wait_infos2[] = {{
-					.semaphore = progress2.get(),
-					.value = timeline2
-				} };
-			m_device->blockForSemaphores(wait_infos2);
-
-			const ILogicalDevice::MappedMemoryRange memory_range[] = {
-				ILogicalDevice::MappedMemoryRange(allocation[0].memory.get(), 0ull, allocation[0].memory->getAllocationSize()),
-				ILogicalDevice::MappedMemoryRange(allocation[1].memory.get(), 0ull, allocation[1].memory->getAllocationSize()),
-				ILogicalDevice::MappedMemoryRange(allocation[2].memory.get(), 0ull, allocation[2].memory->getAllocationSize()),
-				ILogicalDevice::MappedMemoryRange(allocation[3].memory.get(), 0ull, allocation[3].memory->getAllocationSize()),
-				ILogicalDevice::MappedMemoryRange(allocation[4].memory.get(), 0ull, allocation[4].memory->getAllocationSize())
-			};
+		uint32_t imageWidth = destinationImage->getCreationParameters().extent.width;
+		generateTileCopyRegions(regions, tilesPerFrame, tileSize, tileSizeBytes, imageWidth);
 
-			if (!allocation[0].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
-				m_device->invalidateMappedMemoryRanges(1, &memory_range[0]);
-			if (!allocation[1].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
-				m_device->invalidateMappedMemoryRanges(1, &memory_range[1]);
-			if (!allocation[2].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
-				m_device->invalidateMappedMemoryRanges(1, &memory_range[2]);
-			if (!allocation[3].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
-				m_device->invalidateMappedMemoryRanges(1, &memory_range[3]);
-			if (!allocation[4].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
-				m_device->invalidateMappedMemoryRanges(1, &memory_range[4]);
-
-			const uint32_t* buffData[] = {
-				reinterpret_cast<const uint32_t*>(allocation[2].memory->getMappedPointer()),
-				reinterpret_cast<const uint32_t*>(allocation[3].memory->getMappedPointer()),
-				reinterpret_cast<const uint32_t*>(allocation[4].memory->getMappedPointer())
-			};
+		auto startTime = std::chrono::high_resolution_clock::now();
 
-			assert(allocation[2].offset == 0); // simpler than writing out all the pointer arithmetic
-			assert(allocation[3].offset == 0); // simpler than writing out all the pointer arithmetic
-			assert(allocation[4].offset == 0); // simpler than writing out all the pointer arithmetic
+		for (uint32_t frame = 0; frame < totalFrames; frame++)
+		{
+			uint32_t cmdBufIndex = frame % framesInFlight;
 
-			outBuffer.clear();
-			for (auto i = 0; i < bucket_count; i++) {
-				outBuffer.append(std::to_string(buffData[0][i]));
-				outBuffer.append(" ");
-			}
-			outBuffer.append("\n");
+			commandBuffers[cmdBufIndex]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
 
-			m_logger->log("Scratch buffer is: \n" + outBuffer, ILogger::ELL_PERFORMANCE);
+			commandBuffers[cmdBufIndex]->copyBufferToImage(
+				stagingBuffer,
+				destinationImage,
+				IImage::LAYOUT::TRANSFER_DST_OPTIMAL,
+				tilesPerFrame,
+				regions
+			);
 
-			outBuffer.clear();
-			for (auto i = 0; i < element_count; i++) {
-				outBuffer.append("{");
-				outBuffer.append(std::to_string(buffData[1][i]));
-				outBuffer.append(", ");
-				outBuffer.append(std::to_string(buffData[2][i]));
-				outBuffer.append("} ");
-			}
-			outBuffer.append("\n");
-			outBuffer.append("Count: ");
-			outBuffer.append(std::to_string(element_count));
-			outBuffer.append("\n");
-			m_logger->log("Your output array is: \n" + outBuffer, ILogger::ELL_PERFORMANCE);
+			commandBuffers[cmdBufIndex]->end();
 
-			allocation[0].memory->unmap();
-			allocation[1].memory->unmap();
-			allocation[2].memory->unmap();
-			allocation[3].memory->unmap();
-			allocation[4].memory->unmap();
+			// Create submit info for THIS frame
+			IQueue::SSubmitInfo frameSubmitInfo = {};
+			IQueue::SSubmitInfo::SCommandBufferInfo frameCmdBufInfo = {.cmdbuf = commandBuffers[cmdBufIndex].get()};
+			frameSubmitInfo.commandBuffers = {&frameCmdBufInfo, 1};
 
-			m_device->waitIdle();
+			IQueue::SSubmitInfo::SSemaphoreInfo frameSignalInfo = {
+				.semaphore = timelineSemaphore.get(),
+				.value = ++timelineValue,
+				.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
+			};
+			frameSubmitInfo.signalSemaphores = {&frameSignalInfo, 1};
 
-			delete[] bufferData;
+			// Submit to GPU
+			queue->submit({&frameSubmitInfo, 1});
 
-			return true;
+			// Wait for old frames 
+			if (frame >= framesInFlight)
+			{
+				ISemaphore::SWaitInfo frameWaitInfo = {
+					.semaphore = timelineSemaphore.get(),
+					.value = timelineValue - framesInFlight
+				};
+				m_device->blockForSemaphores({&frameWaitInfo, 1});
+			}
 		}
 
-		// Ok this time we'll actually have a work loop (maybe just for the sake of future WASM so we don't timeout a Browser Tab with an unresponsive script)
-		bool keepRunning() override { return false; }
-
-		// Finally the first actual work-loop
-		void workLoopBody() override {}
-
-		bool onAppTerminated() override { return true; }
+		// Wait for all remaining frames to complete
+		ISemaphore::SWaitInfo finalWait = {
+			.semaphore = timelineSemaphore.get(),
+			.value = timelineValue
+		};
+		m_device->blockForSemaphores({&finalWait, 1});
+
+		auto endTime = std::chrono::high_resolution_clock::now();
+
+		delete[] regions;
+		delete[] commandPools;
+		delete[] commandBuffers;
+
+		// Calculate throughput
+		double elapsedSeconds = std::chrono::duration<double>(endTime - startTime).count();
+		uint64_t totalBytes = (uint64_t)totalFrames * tilesPerFrame * tileSizeBytes;
+		double throughputGBps = (totalBytes / (1024.0 * 1024.0 * 1024.0)) / elapsedSeconds;
+
+		return throughputGBps;
+	}
+
+	bool createStagingBuffer(
+		uint32_t bufferSize,
+		uint32_t memoryTypeBits,
+		const char* debugName,
+		smart_refctd_ptr<IGPUBuffer>& outBuffer,
+		IDeviceMemoryAllocator::SAllocation& outAllocation,
+		void*& outMappedPtr)
+	{
+		IGPUBuffer::SCreationParams params;
+		params.size = bufferSize;
+		params.usage = IGPUBuffer::EUF_TRANSFER_SRC_BIT;
+		outBuffer = m_device->createBuffer(std::move(params));
+		if (!outBuffer)
+			return logFail("Failed to create GPU buffer of size %d!\n", bufferSize);
+
+		outBuffer->setObjectDebugName(debugName);
+
+		auto reqs = outBuffer->getMemoryReqs();
+		reqs.memoryTypeBits &= memoryTypeBits;
+
+		outAllocation = m_device->allocate(reqs, outBuffer.get(), IDeviceMemoryAllocation::EMAF_NONE);
+		if (!outAllocation.isValid())
+			return logFail("Failed to allocate Device Memory!\n");
+
+		outMappedPtr = outAllocation.memory->map({0ull, outAllocation.memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ);
+		if (!outMappedPtr)
+			return logFail("Failed to map Device Memory!\n");
+
+		return true;
+	}
 };
 
-
-NBL_MAIN_FUNC(CountingSortApp)
\ No newline at end of file
+NBL_MAIN_FUNC(ImageUploadBenchmarkApp)

From 141295bee833de2fb97bc1ef1e7e8bc8980a643c Mon Sep 17 00:00:00 2001
From: CrabeExtra <abbasgaroosi7@gmail.com>
Date: Wed, 24 Dec 2025 21:09:51 +0330
Subject: [PATCH 3/7] Measurement was weird, added some detail and also fix a
 bug related to FIF

---
 73_ImageUploadBenchmark/main.cpp | 123 +++++++++++++++++++------------
 1 file changed, 77 insertions(+), 46 deletions(-)

diff --git a/73_ImageUploadBenchmark/main.cpp b/73_ImageUploadBenchmark/main.cpp
index 68815681d..eceb0f9ea 100644
--- a/73_ImageUploadBenchmark/main.cpp
+++ b/73_ImageUploadBenchmark/main.cpp
@@ -28,8 +28,8 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 		constexpr uint32_t TILE_BYTES_PER_PIXEL = 4;
 		constexpr uint32_t TILE_SIZE_BYTES = TILE_SIZE * TILE_SIZE * TILE_BYTES_PER_PIXEL;
 		constexpr uint32_t STAGING_BUFFER_SIZE = 64 * 1024 * 1024;
-		constexpr uint32_t TILES_PER_FRAME = STAGING_BUFFER_SIZE / TILE_SIZE_BYTES;
 		constexpr uint32_t FRAMES_IN_FLIGHT = 4;
+		constexpr uint32_t TILES_PER_FRAME = STAGING_BUFFER_SIZE / (TILE_SIZE_BYTES * FRAMES_IN_FLIGHT);
 		constexpr uint32_t TOTAL_FRAMES = 1000;
 
 		m_logger->log("GPU Memory Transfer Benchmark", ILogger::ELL_INFO);
@@ -40,12 +40,20 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 
 		uint32_t hostVisibleBits = m_physicalDevice->getHostVisibleMemoryTypeBits();
 		uint32_t deviceLocalBits = m_physicalDevice->getDeviceLocalMemoryTypeBits();
-		uint32_t hostVisibleOnlyBits = hostVisibleBits & ~deviceLocalBits;
+		uint32_t hostCachedBits = m_physicalDevice->getMemoryTypeBitsFromMemoryTypeFlags(IDeviceMemoryAllocation::EMPF_HOST_CACHED_BIT);
+
+		uint32_t hostVisibleOnlyBits = hostVisibleBits & ~deviceLocalBits & ~hostCachedBits;
+
 		uint32_t hostVisibleDeviceLocalBits = hostVisibleBits & deviceLocalBits;
 
+		m_logger->log("Memory type bits - HostVisible: 0x%X, DeviceLocal: 0x%X, HostCached: 0x%X",
+			ILogger::ELL_INFO, hostVisibleBits, deviceLocalBits, hostCachedBits);
+		m_logger->log("System RAM (non-cached): 0x%X, VRAM: 0x%X",
+			ILogger::ELL_INFO, hostVisibleOnlyBits, hostVisibleDeviceLocalBits);
+
 		if (!hostVisibleOnlyBits)
 		{
-			m_logger->log("HOST_VISIBLE memory types not found!", ILogger::ELL_ERROR);
+			m_logger->log("HOST_VISIBLE non-cached memory types not found!", ILogger::ELL_ERROR);
 			return false;
 		}
 
@@ -122,7 +130,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 
 		if (hostVisibleDeviceLocalBits)
 		{
-			m_logger->log("\nTesting Strategy 2: VRAM (ReBAR)", ILogger::ELL_INFO);
+			m_logger->log("\nTesting Strategy 2: VRAM", ILogger::ELL_INFO);
 
 			double throughputVRAM = 0.0;
 			{
@@ -131,13 +139,13 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 				void* mappedPtr = nullptr;
 
 				if (!createStagingBuffer(STAGING_BUFFER_SIZE, hostVisibleDeviceLocalBits,
-					"Staging Buffer - VRAM (ReBAR)", stagingBuffer, stagingAlloc, mappedPtr))
+					"Staging Buffer - VRAM", stagingBuffer, stagingAlloc, mappedPtr))
 				{
 					return false;
 				}
 
 				throughputVRAM = runBenchmark(
-					"VRAM (ReBAR)",
+					"VRAM",
 					stagingBuffer.get(),
 					mappedPtr,
 					destinationImage.get(),
@@ -205,7 +213,8 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 		uint32_t tilesPerFrame,
 		uint32_t tileSize,
 		uint32_t tileSizeBytes,
-		uint32_t imageWidth)
+		uint32_t imageWidth,
+		uint32_t bufferBaseOffset)
 	{
 		uint32_t tilesPerRow = imageWidth / tileSize;
 		for (size_t i = 0; i < tilesPerFrame; i++)
@@ -213,7 +222,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 			uint32_t tileX = (i % tilesPerRow) * tileSize;
 			uint32_t tileY = (i / tilesPerRow) * tileSize;
 
-			outRegions[i].bufferOffset = i * tileSizeBytes;
+			outRegions[i].bufferOffset = bufferBaseOffset + (i * tileSizeBytes);
 			outRegions[i].bufferRowLength = tileSize;
 			outRegions[i].bufferImageHeight = tileSize;
 			outRegions[i].imageOffset = { tileX, tileY, 0 };
@@ -225,23 +234,6 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 		}
 	}
 
-	void generateRandomTileData(void* mappedPtr, uint32_t sizeBytes)
-	{
-		uint32_t* data = (uint32_t*)mappedPtr;
-		unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
-		std::mt19937 g(seed);
-		const uint32_t valueCount = sizeBytes / sizeof(uint32_t);
-
-		auto bufferData = new uint32_t[valueCount];
-
-		for (uint32_t i = 0; i < valueCount; i++)
-		{
-			bufferData[i] = g();
-		}
-		memcpy(mappedPtr, bufferData, sizeBytes);
-		delete[] bufferData;
-	}
-
 	double runBenchmark(
 		const char* strategyName,
 		IGPUBuffer* stagingBuffer,
@@ -305,12 +297,31 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 		};
 		m_device->blockForSemaphores({ &waitInfo, 1 });
 
-		auto regions = new IImage::SBufferCopy[tilesPerFrame];
+		uint32_t imageWidth = destinationImage->getCreationParameters().extent.width;
+		uint32_t partitionSize = tilesPerFrame * tileSizeBytes;
 
-		generateRandomTileData(mappedPtr, tilesPerFrame * tileSizeBytes);
+		// CPU source buffer with random data (generated once, reused each frame)
+		auto cpuSourceData = new uint8_t[partitionSize];
+		{
+			unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
+			std::mt19937 g(seed);
+			uint32_t* data = reinterpret_cast<uint32_t*>(cpuSourceData);
+			for (uint32_t i = 0; i < partitionSize / sizeof(uint32_t); i++)
+				data[i] = g();
+		}
 
-		uint32_t imageWidth = destinationImage->getCreationParameters().extent.width;
-		generateTileCopyRegions(regions, tilesPerFrame, tileSize, tileSizeBytes, imageWidth);
+		auto regionsPerFrame = new IImage::SBufferCopy*[framesInFlight];
+		for (uint32_t i = 0; i < framesInFlight; i++)
+		{
+			regionsPerFrame[i] = new IImage::SBufferCopy[tilesPerFrame];
+			uint32_t bufferOffset = i * partitionSize;
+			generateTileCopyRegions(regionsPerFrame[i], tilesPerFrame, tileSize, tileSizeBytes, imageWidth, bufferOffset);
+		}
+
+		double totalWaitTime = 0.0;
+		double totalMemcpyTime = 0.0;
+		double totalRecordTime = 0.0;
+		double totalSubmitTime = 0.0;
 
 		auto startTime = std::chrono::high_resolution_clock::now();
 
@@ -318,19 +329,35 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 		{
 			uint32_t cmdBufIndex = frame % framesInFlight;
 
-			commandBuffers[cmdBufIndex]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+			auto t1 = std::chrono::high_resolution_clock::now();
+			if (frame >= framesInFlight)
+			{
+				ISemaphore::SWaitInfo frameWaitInfo = {
+					.semaphore = timelineSemaphore.get(),
+					.value = timelineValue - framesInFlight + 1
+				};
+				m_device->blockForSemaphores({&frameWaitInfo, 1});
+			}
+			auto t2 = std::chrono::high_resolution_clock::now();
+
+			commandPools[cmdBufIndex]->reset();
+
+			uint32_t bufferOffset = cmdBufIndex * partitionSize;
+			void* targetPtr = static_cast<uint8_t*>(mappedPtr) + bufferOffset;
+			memcpy(targetPtr, cpuSourceData, partitionSize);
+			auto t3 = std::chrono::high_resolution_clock::now();
 
+			commandBuffers[cmdBufIndex]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
 			commandBuffers[cmdBufIndex]->copyBufferToImage(
 				stagingBuffer,
 				destinationImage,
 				IImage::LAYOUT::TRANSFER_DST_OPTIMAL,
 				tilesPerFrame,
-				regions
+				regionsPerFrame[cmdBufIndex]
 			);
-
 			commandBuffers[cmdBufIndex]->end();
+			auto t4 = std::chrono::high_resolution_clock::now();
 
-			// Create submit info for THIS frame
 			IQueue::SSubmitInfo frameSubmitInfo = {};
 			IQueue::SSubmitInfo::SCommandBufferInfo frameCmdBufInfo = {.cmdbuf = commandBuffers[cmdBufIndex].get()};
 			frameSubmitInfo.commandBuffers = {&frameCmdBufInfo, 1};
@@ -342,18 +369,13 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 			};
 			frameSubmitInfo.signalSemaphores = {&frameSignalInfo, 1};
 
-			// Submit to GPU
 			queue->submit({&frameSubmitInfo, 1});
+			auto t5 = std::chrono::high_resolution_clock::now();
 
-			// Wait for old frames 
-			if (frame >= framesInFlight)
-			{
-				ISemaphore::SWaitInfo frameWaitInfo = {
-					.semaphore = timelineSemaphore.get(),
-					.value = timelineValue - framesInFlight
-				};
-				m_device->blockForSemaphores({&frameWaitInfo, 1});
-			}
+			totalWaitTime += std::chrono::duration<double>(t2 - t1).count();
+			totalMemcpyTime += std::chrono::duration<double>(t3 - t2).count();
+			totalRecordTime += std::chrono::duration<double>(t4 - t3).count();
+			totalSubmitTime += std::chrono::duration<double>(t5 - t4).count();
 		}
 
 		// Wait for all remaining frames to complete
@@ -365,15 +387,24 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 
 		auto endTime = std::chrono::high_resolution_clock::now();
 
-		delete[] regions;
+		delete[] cpuSourceData;
+		for (uint32_t i = 0; i < framesInFlight; i++)
+			delete[] regionsPerFrame[i];
+		delete[] regionsPerFrame;
 		delete[] commandPools;
 		delete[] commandBuffers;
 
-		// Calculate throughput
 		double elapsedSeconds = std::chrono::duration<double>(endTime - startTime).count();
 		uint64_t totalBytes = (uint64_t)totalFrames * tilesPerFrame * tileSizeBytes;
 		double throughputGBps = (totalBytes / (1024.0 * 1024.0 * 1024.0)) / elapsedSeconds;
 
+		m_logger->log("  Timing breakdown for %s:", ILogger::ELL_INFO, strategyName);
+		m_logger->log("    Wait time:   %.3f s (%.1f%%)", ILogger::ELL_INFO, totalWaitTime, 100.0 * totalWaitTime / elapsedSeconds);
+		m_logger->log("    Memcpy time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalMemcpyTime, 100.0 * totalMemcpyTime / elapsedSeconds);
+		m_logger->log("    Record time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalRecordTime, 100.0 * totalRecordTime / elapsedSeconds);
+		m_logger->log("    Submit time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalSubmitTime, 100.0 * totalSubmitTime / elapsedSeconds);
+		m_logger->log("    Memcpy speed: %.2f GB/s", ILogger::ELL_INFO, (totalBytes / (1024.0 * 1024.0 * 1024.0)) / totalMemcpyTime);
+
 		return throughputGBps;
 	}
 
@@ -401,7 +432,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 		if (!outAllocation.isValid())
 			return logFail("Failed to allocate Device Memory!\n");
 
-		outMappedPtr = outAllocation.memory->map({0ull, outAllocation.memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ);
+		outMappedPtr = outAllocation.memory->map({0ull, outAllocation.memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_WRITE);
 		if (!outMappedPtr)
 			return logFail("Failed to map Device Memory!\n");
 

From 874814af7c8dd08c264afbdebef1e0719561dffe Mon Sep 17 00:00:00 2001
From: CrabeExtra <abbasgaroosi7@gmail.com>
Date: Wed, 31 Dec 2025 16:29:19 +0330
Subject: [PATCH 4/7] Resolved PR comments + adding timestamp query

---
 73_ImageUploadBenchmark/main.cpp | 159 +++++++++++++++++++++----------
 1 file changed, 110 insertions(+), 49 deletions(-)

diff --git a/73_ImageUploadBenchmark/main.cpp b/73_ImageUploadBenchmark/main.cpp
index eceb0f9ea..f8124c9ab 100644
--- a/73_ImageUploadBenchmark/main.cpp
+++ b/73_ImageUploadBenchmark/main.cpp
@@ -1,5 +1,6 @@
 #include "nbl/examples/examples.hpp"
 #include <chrono>
+#include <thread>
 
 using namespace nbl;
 using namespace nbl::core;
@@ -68,8 +69,9 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 		{
 			IGPUImage::SCreationParams imgParams{};
 			imgParams.type = IImage::E_TYPE::ET_2D;
-			imgParams.extent.width = TILE_SIZE * 32;
-			imgParams.extent.height = TILE_SIZE * 32;
+			uint32_t tilePerRow = (uint32_t)std::sqrt(TILES_PER_FRAME);
+			imgParams.extent.width = TILE_SIZE * tilePerRow;
+			imgParams.extent.height = TILE_SIZE * tilePerRow;
 			imgParams.extent.depth = 1u;
 			imgParams.format = asset::E_FORMAT::EF_R8G8B8A8_UNORM;
 			imgParams.mipLevels = 1u;
@@ -111,6 +113,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 			throughputSystemRAM = runBenchmark(
 				"System RAM",
 				stagingBuffer.get(),
+				stagingAlloc,
 				mappedPtr,
 				destinationImage.get(),
 				TILE_SIZE,
@@ -147,6 +150,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 				throughputVRAM = runBenchmark(
 					"VRAM",
 					stagingBuffer.get(),
+					stagingAlloc,
 					mappedPtr,
 					destinationImage.get(),
 					TILE_SIZE,
@@ -166,6 +170,9 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 			m_logger->log("\nVRAM is %.2fx faster than System RAM", ILogger::ELL_PERFORMANCE, speedup);
 		}
 
+		m_logger->log("\nWaiting 5 seconds before exit...", ILogger::ELL_INFO);
+		std::this_thread::sleep_for(std::chrono::seconds(5));
+
 		return true;
 	}
 
@@ -186,28 +193,6 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 	}
 
 private:
-	void transitionImageLayout(
-		IGPUCommandBuffer* cmdBuf,
-		IGPUImage* image,
-		IImage::LAYOUT oldLayout,
-		IImage::LAYOUT newLayout)
-	{
-		IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> barrier = {};
-		barrier.oldLayout = oldLayout;
-		barrier.newLayout = newLayout;
-		barrier.image = image;
-		barrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
-		barrier.subresourceRange.baseMipLevel = 0;
-		barrier.subresourceRange.levelCount = 1;
-		barrier.subresourceRange.baseArrayLayer = 0;
-		barrier.subresourceRange.layerCount = 1;
-		barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::NONE;
-		barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT;
-		barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS;
-		barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS;
-		cmdBuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&barrier, 1} });
-	}
-
 	void generateTileCopyRegions(
 		IImage::SBufferCopy* outRegions,
 		uint32_t tilesPerFrame,
@@ -237,6 +222,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 	double runBenchmark(
 		const char* strategyName,
 		IGPUBuffer* stagingBuffer,
+		IDeviceMemoryAllocator::SAllocation& stagingAlloc,
 		void* mappedPtr,
 		IGPUImage* destinationImage,
 		uint32_t tileSize,
@@ -248,7 +234,16 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 	{
 		smart_refctd_ptr<ISemaphore> timelineSemaphore = m_device->createSemaphore(0);
 
-		auto commandPools = new smart_refctd_ptr<IGPUCommandPool>[framesInFlight];
+		smart_refctd_ptr<IQueryPool> queryPool;
+		{
+			IQueryPool::SCreationParams queryPoolParams = {};
+			queryPoolParams.queryType = IQueryPool::TYPE::TIMESTAMP;
+			queryPoolParams.queryCount = framesInFlight * 2;  
+			queryPoolParams.pipelineStatisticsFlags = IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE;
+			queryPool = m_device->createQueryPool(queryPoolParams);
+		}
+		
+		std::vector<smart_refctd_ptr<IGPUCommandPool>> commandPools(framesInFlight);
 		for (uint32_t i = 0; i < framesInFlight; i++)
 		{
 			commandPools[i] = m_device->createCommandPool(
@@ -256,8 +251,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 				IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT
 			);
 		}
-
-		auto commandBuffers = new smart_refctd_ptr<IGPUCommandBuffer>[framesInFlight];
+		std::vector<smart_refctd_ptr<IGPUCommandBuffer>> commandBuffers(framesInFlight);
 		for (uint32_t i = 0; i < framesInFlight; i++)
 		{
 			commandPools[i]->createCommandBuffers(
@@ -270,12 +264,22 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 		uint64_t timelineValue = 0;
 
 		commandBuffers[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-		transitionImageLayout(
-			commandBuffers[0].get(),
-			destinationImage,
-			IImage::LAYOUT::UNDEFINED,
-			IImage::LAYOUT::TRANSFER_DST_OPTIMAL
-		);
+		{
+			IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> initBarrier = {};
+			initBarrier.oldLayout = IImage::LAYOUT::UNDEFINED;
+			initBarrier.newLayout = IImage::LAYOUT::GENERAL;
+			initBarrier.image = destinationImage;
+			initBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
+			initBarrier.subresourceRange.baseMipLevel = 0;
+			initBarrier.subresourceRange.levelCount = 1;
+			initBarrier.subresourceRange.baseArrayLayer = 0;
+			initBarrier.subresourceRange.layerCount = 1;
+			initBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::NONE;
+			initBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT;
+			initBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::NONE;
+			initBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT;
+			commandBuffers[0]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, {.imgBarriers = {&initBarrier, 1}});
+		}
 		commandBuffers[0]->end();
 
 		IQueue::SSubmitInfo submitInfo = {};
@@ -300,22 +304,20 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 		uint32_t imageWidth = destinationImage->getCreationParameters().extent.width;
 		uint32_t partitionSize = tilesPerFrame * tileSizeBytes;
 
-		// CPU source buffer with random data (generated once, reused each frame)
-		auto cpuSourceData = new uint8_t[partitionSize];
+		std::vector<uint8_t> cpuSourceData(partitionSize);
 		{
 			unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
 			std::mt19937 g(seed);
-			uint32_t* data = reinterpret_cast<uint32_t*>(cpuSourceData);
+			uint32_t* data = reinterpret_cast<uint32_t*>(cpuSourceData.data());
 			for (uint32_t i = 0; i < partitionSize / sizeof(uint32_t); i++)
 				data[i] = g();
 		}
-
-		auto regionsPerFrame = new IImage::SBufferCopy*[framesInFlight];
+		std::vector<std::vector<IImage::SBufferCopy>> regionsPerFrame(framesInFlight);
 		for (uint32_t i = 0; i < framesInFlight; i++)
 		{
-			regionsPerFrame[i] = new IImage::SBufferCopy[tilesPerFrame];
+			regionsPerFrame[i].resize(tilesPerFrame);
 			uint32_t bufferOffset = i * partitionSize;
-			generateTileCopyRegions(regionsPerFrame[i], tilesPerFrame, tileSize, tileSizeBytes, imageWidth, bufferOffset);
+			generateTileCopyRegions(regionsPerFrame[i].data(), tilesPerFrame, tileSize, tileSizeBytes, imageWidth, bufferOffset);
 		}
 
 		double totalWaitTime = 0.0;
@@ -344,17 +346,63 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 
 			uint32_t bufferOffset = cmdBufIndex * partitionSize;
 			void* targetPtr = static_cast<uint8_t*>(mappedPtr) + bufferOffset;
-			memcpy(targetPtr, cpuSourceData, partitionSize);
+			memcpy(targetPtr, cpuSourceData.data(), partitionSize);
+
+			if (!stagingAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+			{
+				ILogicalDevice::MappedMemoryRange range(stagingAlloc.memory.get(), bufferOffset, partitionSize);
+				m_device->flushMappedMemoryRanges(1, &range);
+			}
+
 			auto t3 = std::chrono::high_resolution_clock::now();
 
 			commandBuffers[cmdBufIndex]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+
+			uint32_t queryStartIndex = cmdBufIndex * 2;
+			commandBuffers[cmdBufIndex]->resetQueryPool(queryPool.get(), queryStartIndex, 2);
+
+			IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> barrier = {};
+			barrier.oldLayout = IImage::LAYOUT::GENERAL;
+			barrier.newLayout = IImage::LAYOUT::GENERAL;
+			barrier.image = destinationImage;
+			barrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
+			barrier.subresourceRange.baseMipLevel = 0;
+			barrier.subresourceRange.levelCount = 1;
+			barrier.subresourceRange.baseArrayLayer = 0;
+			barrier.subresourceRange.layerCount = 1;
+			barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT;
+			barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT;
+			barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS;
+			barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS;
+			commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, {.imgBarriers = {&barrier, 1}});
+
+			commandBuffers[cmdBufIndex]->writeTimestamp(PIPELINE_STAGE_FLAGS::COPY_BIT, queryPool.get(), queryStartIndex + 0);
+
 			commandBuffers[cmdBufIndex]->copyBufferToImage(
 				stagingBuffer,
 				destinationImage,
-				IImage::LAYOUT::TRANSFER_DST_OPTIMAL,
+				IImage::LAYOUT::GENERAL,
 				tilesPerFrame,
-				regionsPerFrame[cmdBufIndex]
+				regionsPerFrame[cmdBufIndex].data()
 			);
+
+			IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> afterBarrier = {};
+			afterBarrier.oldLayout = IImage::LAYOUT::GENERAL;
+			afterBarrier.newLayout = IImage::LAYOUT::GENERAL;
+			afterBarrier.image = destinationImage;
+			afterBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
+			afterBarrier.subresourceRange.baseMipLevel = 0;
+			afterBarrier.subresourceRange.levelCount = 1;
+			afterBarrier.subresourceRange.baseArrayLayer = 0;
+			afterBarrier.subresourceRange.layerCount = 1;
+			afterBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT;
+			afterBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS;
+			afterBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS;
+			afterBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT;
+			commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, {.imgBarriers = {&afterBarrier, 1}});
+
+			commandBuffers[cmdBufIndex]->writeTimestamp(PIPELINE_STAGE_FLAGS::COPY_BIT, queryPool.get(), queryStartIndex + 1);
+
 			commandBuffers[cmdBufIndex]->end();
 			auto t4 = std::chrono::high_resolution_clock::now();
 
@@ -387,17 +435,30 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 
 		auto endTime = std::chrono::high_resolution_clock::now();
 
-		delete[] cpuSourceData;
-		for (uint32_t i = 0; i < framesInFlight; i++)
-			delete[] regionsPerFrame[i];
-		delete[] regionsPerFrame;
-		delete[] commandPools;
-		delete[] commandBuffers;
+		std::vector<uint64_t> timestamps(framesInFlight * 2);
+		const core::bitflag flags = core::bitflag(IQueryPool::RESULTS_FLAGS::_64_BIT) | core::bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT);
+		m_device->getQueryPoolResults(queryPool.get(), 0, framesInFlight * 2, timestamps.data(), sizeof(uint64_t), flags);
+		uint64_t totalGpuTicks = 0;
+		for (uint32_t i = 0; i < framesInFlight; i++) {
+			uint64_t startTick = timestamps[i * 2 + 0];
+			uint64_t endTick = timestamps[i * 2 + 1];
+			totalGpuTicks += (endTick - startTick);
+		}
+		float timestampPeriod = m_physicalDevice->getLimits().timestampPeriodInNanoSeconds;
+		double sampledGpuTimeSeconds = (totalGpuTicks * timestampPeriod) / 1e9;
+
+		double avgGpuTimePerFrame = sampledGpuTimeSeconds / framesInFlight;
+		double totalGpuTimeSeconds = avgGpuTimePerFrame * totalFrames;
+
 
 		double elapsedSeconds = std::chrono::duration<double>(endTime - startTime).count();
 		uint64_t totalBytes = (uint64_t)totalFrames * tilesPerFrame * tileSizeBytes;
+
 		double throughputGBps = (totalBytes / (1024.0 * 1024.0 * 1024.0)) / elapsedSeconds;
 
+		m_logger->log("    GPU time: %.3f s", ILogger::ELL_INFO, totalGpuTimeSeconds);
+		m_logger->log("    GPU throughput: %.2f GB/s", ILogger::ELL_INFO, throughputGBps);
+
 		m_logger->log("  Timing breakdown for %s:", ILogger::ELL_INFO, strategyName);
 		m_logger->log("    Wait time:   %.3f s (%.1f%%)", ILogger::ELL_INFO, totalWaitTime, 100.0 * totalWaitTime / elapsedSeconds);
 		m_logger->log("    Memcpy time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalMemcpyTime, 100.0 * totalMemcpyTime / elapsedSeconds);

From ddb7bfc6ae5889aea89db756b461a0beeb763d0f Mon Sep 17 00:00:00 2001
From: CrabeExtra <abbasgaroosi7@gmail.com>
Date: Thu, 1 Jan 2026 17:01:15 +0330
Subject: [PATCH 5/7] Adding more logs to release build

---
 73_ImageUploadBenchmark/main.cpp | 36 ++++++++++++++++----------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/73_ImageUploadBenchmark/main.cpp b/73_ImageUploadBenchmark/main.cpp
index f8124c9ab..ff38b1555 100644
--- a/73_ImageUploadBenchmark/main.cpp
+++ b/73_ImageUploadBenchmark/main.cpp
@@ -33,11 +33,11 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 		constexpr uint32_t TILES_PER_FRAME = STAGING_BUFFER_SIZE / (TILE_SIZE_BYTES * FRAMES_IN_FLIGHT);
 		constexpr uint32_t TOTAL_FRAMES = 1000;
 
-		m_logger->log("GPU Memory Transfer Benchmark", ILogger::ELL_INFO);
-		m_logger->log("Tile size: %ux%u (%u KB)", ILogger::ELL_INFO, TILE_SIZE, TILE_SIZE, TILE_SIZE_BYTES / 1024);
-		m_logger->log("Staging buffer: %u MB", ILogger::ELL_INFO, STAGING_BUFFER_SIZE / (1024 * 1024));
-		m_logger->log("Tiles per frame: %u", ILogger::ELL_INFO, TILES_PER_FRAME);
-		m_logger->log("Frames in flight: %u", ILogger::ELL_INFO, FRAMES_IN_FLIGHT);
+		m_logger->log("GPU Memory Transfer Benchmark", ILogger::ELL_PERFORMANCE);
+		m_logger->log("Tile size: %ux%u (%u KB)", ILogger::ELL_PERFORMANCE, TILE_SIZE, TILE_SIZE, TILE_SIZE_BYTES / 1024);
+		m_logger->log("Staging buffer: %u MB", ILogger::ELL_PERFORMANCE, STAGING_BUFFER_SIZE / (1024 * 1024));
+		m_logger->log("Tiles per frame: %u", ILogger::ELL_PERFORMANCE, TILES_PER_FRAME);
+		m_logger->log("Frames in flight: %u", ILogger::ELL_PERFORMANCE, FRAMES_IN_FLIGHT);
 
 		uint32_t hostVisibleBits = m_physicalDevice->getHostVisibleMemoryTypeBits();
 		uint32_t deviceLocalBits = m_physicalDevice->getDeviceLocalMemoryTypeBits();
@@ -48,9 +48,9 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 		uint32_t hostVisibleDeviceLocalBits = hostVisibleBits & deviceLocalBits;
 
 		m_logger->log("Memory type bits - HostVisible: 0x%X, DeviceLocal: 0x%X, HostCached: 0x%X",
-			ILogger::ELL_INFO, hostVisibleBits, deviceLocalBits, hostCachedBits);
+			ILogger::ELL_PERFORMANCE, hostVisibleBits, deviceLocalBits, hostCachedBits);
 		m_logger->log("System RAM (non-cached): 0x%X, VRAM: 0x%X",
-			ILogger::ELL_INFO, hostVisibleOnlyBits, hostVisibleDeviceLocalBits);
+			ILogger::ELL_PERFORMANCE, hostVisibleOnlyBits, hostVisibleDeviceLocalBits);
 
 		if (!hostVisibleOnlyBits)
 		{
@@ -96,7 +96,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 				return logFail("Failed to allocate DEVICE_LOCAL memory for destination image!\n");
 		}
 
-		m_logger->log("\nTesting Strategy 1: System RAM", ILogger::ELL_INFO);
+		m_logger->log("\nStrategy 1: System RAM", ILogger::ELL_PERFORMANCE);
 
 		double throughputSystemRAM = 0.0;
 		{
@@ -133,7 +133,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 
 		if (hostVisibleDeviceLocalBits)
 		{
-			m_logger->log("\nTesting Strategy 2: VRAM", ILogger::ELL_INFO);
+			m_logger->log("\nStrategy 2: VRAM", ILogger::ELL_PERFORMANCE);
 
 			double throughputVRAM = 0.0;
 			{
@@ -170,7 +170,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 			m_logger->log("\nVRAM is %.2fx faster than System RAM", ILogger::ELL_PERFORMANCE, speedup);
 		}
 
-		m_logger->log("\nWaiting 5 seconds before exit...", ILogger::ELL_INFO);
+		m_logger->log("\nWaiting 5 seconds before exit...", ILogger::ELL_PERFORMANCE);
 		std::this_thread::sleep_for(std::chrono::seconds(5));
 
 		return true;
@@ -456,15 +456,15 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 
 		double throughputGBps = (totalBytes / (1024.0 * 1024.0 * 1024.0)) / elapsedSeconds;
 
-		m_logger->log("    GPU time: %.3f s", ILogger::ELL_INFO, totalGpuTimeSeconds);
-		m_logger->log("    GPU throughput: %.2f GB/s", ILogger::ELL_INFO, throughputGBps);
+		m_logger->log("    GPU time: %.3f s", ILogger::ELL_PERFORMANCE, totalGpuTimeSeconds);
+		m_logger->log("    GPU throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, throughputGBps);
 
-		m_logger->log("  Timing breakdown for %s:", ILogger::ELL_INFO, strategyName);
-		m_logger->log("    Wait time:   %.3f s (%.1f%%)", ILogger::ELL_INFO, totalWaitTime, 100.0 * totalWaitTime / elapsedSeconds);
-		m_logger->log("    Memcpy time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalMemcpyTime, 100.0 * totalMemcpyTime / elapsedSeconds);
-		m_logger->log("    Record time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalRecordTime, 100.0 * totalRecordTime / elapsedSeconds);
-		m_logger->log("    Submit time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalSubmitTime, 100.0 * totalSubmitTime / elapsedSeconds);
-		m_logger->log("    Memcpy speed: %.2f GB/s", ILogger::ELL_INFO, (totalBytes / (1024.0 * 1024.0 * 1024.0)) / totalMemcpyTime);
+		m_logger->log("  Timing breakdown for %s:", ILogger::ELL_PERFORMANCE, strategyName);
+		m_logger->log("    Wait time:   %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalWaitTime, 100.0 * totalWaitTime / elapsedSeconds);
+		m_logger->log("    Memcpy time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalMemcpyTime, 100.0 * totalMemcpyTime / elapsedSeconds);
+		m_logger->log("    Record time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalRecordTime, 100.0 * totalRecordTime / elapsedSeconds);
+		m_logger->log("    Submit time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalSubmitTime, 100.0 * totalSubmitTime / elapsedSeconds);
+		m_logger->log("    Memcpy speed: %.2f GB/s", ILogger::ELL_PERFORMANCE, (totalBytes / (1024.0 * 1024.0 * 1024.0)) / totalMemcpyTime);
 
 		return throughputGBps;
 	}

From f1fc8d50a520023dd72ac995175dfe60a64b997e Mon Sep 17 00:00:00 2001
From: CrabeExtra <abbasgaroosi7@gmail.com>
Date: Fri, 30 Jan 2026 17:19:14 +0330
Subject: [PATCH 6/7] Added image to image copy

---
 73_ImageUploadBenchmark/main.cpp | 458 ++++++++++++++++++++++++++++++-
 1 file changed, 443 insertions(+), 15 deletions(-)

diff --git a/73_ImageUploadBenchmark/main.cpp b/73_ImageUploadBenchmark/main.cpp
index ff38b1555..1fff59202 100644
--- a/73_ImageUploadBenchmark/main.cpp
+++ b/73_ImageUploadBenchmark/main.cpp
@@ -43,7 +43,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 		uint32_t deviceLocalBits = m_physicalDevice->getDeviceLocalMemoryTypeBits();
 		uint32_t hostCachedBits = m_physicalDevice->getMemoryTypeBitsFromMemoryTypeFlags(IDeviceMemoryAllocation::EMPF_HOST_CACHED_BIT);
 
-		uint32_t hostVisibleOnlyBits = hostVisibleBits & ~deviceLocalBits & ~hostCachedBits;
+		uint32_t hostVisibleOnlyBits = hostVisibleBits & ~deviceLocalBits;
 
 		uint32_t hostVisibleDeviceLocalBits = hostVisibleBits & deviceLocalBits;
 
@@ -170,6 +170,146 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 			m_logger->log("\nVRAM is %.2fx faster than System RAM", ILogger::ELL_PERFORMANCE, speedup);
 		}
 
+		m_device->waitIdle();
+
+
+		m_logger->log("\nStrategy 3: Image-to-Image Staging (OPTIMAL)", ILogger::ELL_PERFORMANCE);
+		{
+			std::vector<smart_refctd_ptr<IGPUImage>> stagingImages(FRAMES_IN_FLIGHT);
+			for (uint32_t i = 0; i < FRAMES_IN_FLIGHT; i++)
+			{
+				IGPUImage::SCreationParams imgParams{};
+				imgParams.type = IImage::E_TYPE::ET_2D;
+				imgParams.extent.width = TILE_SIZE;
+				imgParams.extent.height = TILE_SIZE;
+				imgParams.extent.depth = 1u;
+				imgParams.format = asset::E_FORMAT::EF_R8G8B8A8_UNORM;
+				imgParams.mipLevels = 1u;
+				imgParams.flags = IImage::ECF_NONE;
+				imgParams.arrayLayers = 1u;
+				imgParams.samples = IImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT;
+				imgParams.tiling = video::IGPUImage::TILING::OPTIMAL;
+				imgParams.usage = asset::IImage::EUF_TRANSFER_SRC_BIT;
+				imgParams.preinitialized = false;
+				stagingImages[i] = m_device->createImage(std::move(imgParams));
+			}
+
+			std::vector<size_t> imageMemoryOffsets(FRAMES_IN_FLIGHT);
+			size_t currentOffset = 0;
+			uint32_t combinedMemoryTypeBits = 0xFFFFFFFF;
+			uint32_t maxAlignmentLog2 = 0;
+			for (size_t i = 0; i < FRAMES_IN_FLIGHT; i++)
+			{
+				auto memReqs = stagingImages[i]->getMemoryReqs();
+				size_t alignment = 1u << memReqs.alignmentLog2;
+				size_t alignedOffset = (currentOffset + alignment - 1) & ~(alignment - 1);
+				imageMemoryOffsets[i] = alignedOffset;
+				currentOffset = alignedOffset + memReqs.size;
+				combinedMemoryTypeBits &= memReqs.memoryTypeBits;
+				if (memReqs.alignmentLog2 > maxAlignmentLog2)
+					maxAlignmentLog2 = memReqs.alignmentLog2;
+			}
+
+			size_t totalMemorySize = currentOffset;
+
+
+			uint32_t compatibleBits = combinedMemoryTypeBits & hostVisibleDeviceLocalBits;
+			if (!compatibleBits)
+				compatibleBits = combinedMemoryTypeBits & hostVisibleOnlyBits;  
+
+			if (!compatibleBits)
+			{
+				m_logger->log("OPTIMAL images don't support HOST_VISIBLE on this GPU!",
+					ILogger::ELL_ERROR);
+				return false;
+			}
+
+			IDeviceMemoryBacked::SDeviceMemoryRequirements memReqs = {};
+			memReqs.size = totalMemorySize;
+			memReqs.memoryTypeBits = compatibleBits;
+			memReqs.alignmentLog2 = maxAlignmentLog2;
+
+			auto memoryAllocation = m_device->allocate(memReqs,nullptr,IDeviceMemoryAllocation::EMAF_NONE);
+			if (!memoryAllocation.isValid())
+			{
+				m_logger->log("Failed to allocate HOST_VISIBLE memory for staging images!", ILogger::ELL_ERROR);
+			}
+
+			for (uint32_t i = 0; i < FRAMES_IN_FLIGHT; i++)
+			{
+				ILogicalDevice::SBindImageMemoryInfo info{};
+				info.image = stagingImages[i].get();
+				info.binding.memory = memoryAllocation.memory.get();
+				info.binding.offset = imageMemoryOffsets[i];
+				if (!m_device->bindImageMemory({&info,1}))
+				{
+					m_logger->log("Failed to bind staging image %u to memory!", ILogger::ELL_ERROR, i);
+				}
+			}
+
+			void* mappedPtr = memoryAllocation.memory->map({ 0ull,memoryAllocation.memory->getAllocationSize() }, IDeviceMemoryAllocation::EMCAF_WRITE);
+			if (!mappedPtr)
+			{
+				m_logger->log("Failed to map staging image memory!", ILogger::ELL_ERROR);
+			}
+
+			smart_refctd_ptr<IGPUCommandPool> transitionCmdPool = m_device->createCommandPool(
+				queue->getFamilyIndex(),
+				IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT
+			);
+
+			smart_refctd_ptr<IGPUCommandBuffer> transitionCmdBuf;
+			transitionCmdPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &transitionCmdBuf);
+
+			transitionCmdBuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+
+		
+			for (uint32_t i = 0; i < FRAMES_IN_FLIGHT; i++)
+			{
+				IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> barrier = {};
+				barrier.oldLayout = IImage::LAYOUT::UNDEFINED;
+				barrier.newLayout = IImage::LAYOUT::GENERAL;
+				barrier.image = stagingImages[i].get();
+				barrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
+				barrier.subresourceRange.baseMipLevel = 0;
+				barrier.subresourceRange.levelCount = 1;
+				barrier.subresourceRange.baseArrayLayer = 0;
+				barrier.subresourceRange.layerCount = 1;
+				barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::NONE;
+				barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS;
+				barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::NONE;
+				barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::HOST_BIT;
+
+				transitionCmdBuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&barrier, 1} });
+			}
+
+			transitionCmdBuf->end();
+
+			IQueue::SSubmitInfo submitInfo = {};
+			IQueue::SSubmitInfo::SCommandBufferInfo cmdBufInfo = { .cmdbuf = transitionCmdBuf.get() };
+			submitInfo.commandBuffers = { &cmdBufInfo, 1 };
+
+			queue->submit({ &submitInfo, 1 });
+			m_device->waitIdle();
+			double throughputImageStaging = runBenchmarkImageStaging(
+				"Image-to-Image",
+				stagingImages,              
+				imageMemoryOffsets,         
+				memoryAllocation.memory.get(),
+				mappedPtr,                  
+				destinationImage.get(),     
+				TILE_SIZE,
+				TILE_SIZE_BYTES,
+				TILES_PER_FRAME,
+				FRAMES_IN_FLIGHT,
+				TOTAL_FRAMES,
+				queue
+			);
+
+			m_logger->log("Image-to-Image staging throughput: %.2f GB/s",
+				ILogger::ELL_PERFORMANCE, throughputImageStaging);
+		}
+
 		m_logger->log("\nWaiting 5 seconds before exit...", ILogger::ELL_PERFORMANCE);
 		std::this_thread::sleep_for(std::chrono::seconds(5));
 
@@ -238,11 +378,11 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 		{
 			IQueryPool::SCreationParams queryPoolParams = {};
 			queryPoolParams.queryType = IQueryPool::TYPE::TIMESTAMP;
-			queryPoolParams.queryCount = framesInFlight * 2;  
+			queryPoolParams.queryCount = framesInFlight * 2;
 			queryPoolParams.pipelineStatisticsFlags = IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE;
 			queryPool = m_device->createQueryPool(queryPoolParams);
 		}
-		
+
 		std::vector<smart_refctd_ptr<IGPUCommandPool>> commandPools(framesInFlight);
 		for (uint32_t i = 0; i < framesInFlight; i++)
 		{
@@ -278,7 +418,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 			initBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT;
 			initBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::NONE;
 			initBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT;
-			commandBuffers[0]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, {.imgBarriers = {&initBarrier, 1}});
+			commandBuffers[0]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&initBarrier, 1} });
 		}
 		commandBuffers[0]->end();
 
@@ -338,7 +478,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 					.semaphore = timelineSemaphore.get(),
 					.value = timelineValue - framesInFlight + 1
 				};
-				m_device->blockForSemaphores({&frameWaitInfo, 1});
+				m_device->blockForSemaphores({ &frameWaitInfo, 1 });
 			}
 			auto t2 = std::chrono::high_resolution_clock::now();
 
@@ -374,7 +514,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 			barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT;
 			barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS;
 			barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS;
-			commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, {.imgBarriers = {&barrier, 1}});
+			commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&barrier, 1} });
 
 			commandBuffers[cmdBufIndex]->writeTimestamp(PIPELINE_STAGE_FLAGS::COPY_BIT, queryPool.get(), queryStartIndex + 0);
 
@@ -399,7 +539,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 			afterBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS;
 			afterBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS;
 			afterBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT;
-			commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, {.imgBarriers = {&afterBarrier, 1}});
+			commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&afterBarrier, 1} });
 
 			commandBuffers[cmdBufIndex]->writeTimestamp(PIPELINE_STAGE_FLAGS::COPY_BIT, queryPool.get(), queryStartIndex + 1);
 
@@ -407,17 +547,17 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 			auto t4 = std::chrono::high_resolution_clock::now();
 
 			IQueue::SSubmitInfo frameSubmitInfo = {};
-			IQueue::SSubmitInfo::SCommandBufferInfo frameCmdBufInfo = {.cmdbuf = commandBuffers[cmdBufIndex].get()};
-			frameSubmitInfo.commandBuffers = {&frameCmdBufInfo, 1};
+			IQueue::SSubmitInfo::SCommandBufferInfo frameCmdBufInfo = { .cmdbuf = commandBuffers[cmdBufIndex].get() };
+			frameSubmitInfo.commandBuffers = { &frameCmdBufInfo, 1 };
 
 			IQueue::SSubmitInfo::SSemaphoreInfo frameSignalInfo = {
 				.semaphore = timelineSemaphore.get(),
 				.value = ++timelineValue,
 				.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
 			};
-			frameSubmitInfo.signalSemaphores = {&frameSignalInfo, 1};
+			frameSubmitInfo.signalSemaphores = { &frameSignalInfo, 1 };
 
-			queue->submit({&frameSubmitInfo, 1});
+			queue->submit({ &frameSubmitInfo, 1 });
 			auto t5 = std::chrono::high_resolution_clock::now();
 
 			totalWaitTime += std::chrono::duration<double>(t2 - t1).count();
@@ -426,12 +566,11 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 			totalSubmitTime += std::chrono::duration<double>(t5 - t4).count();
 		}
 
-		// Wait for all remaining frames to complete
 		ISemaphore::SWaitInfo finalWait = {
 			.semaphore = timelineSemaphore.get(),
 			.value = timelineValue
 		};
-		m_device->blockForSemaphores({&finalWait, 1});
+		m_device->blockForSemaphores({ &finalWait, 1 });
 
 		auto endTime = std::chrono::high_resolution_clock::now();
 
@@ -469,6 +608,295 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 		return throughputGBps;
 	}
 
+	// Benchmarks the staging-image upload path: every frame memcpy's one test tile into a staging image and records a copyImage into destinationImage. Returns GB/s over wall-clock time, or 0.0 when an argument is unusable or a device object cannot be created.
+	double runBenchmarkImageStaging(
+		const char* strategyName,  // label printed in the timing breakdown
+		const std::vector<smart_refctd_ptr<IGPUImage>>& stagingImages,  // copy sources, indexed by frame % framesInFlight
+		const std::vector<size_t>& imageMemoryOffsets,  // byte offset of each staging image; added to mappedPtr and used for the flush range
+		IDeviceMemoryAllocation* stagingMemory,  // allocation the offsets refer to; queried for HOST_COHERENT
+		void* mappedPtr,  // host mapping that imageMemoryOffsets are added to
+		IGPUImage* destinationImage,  // copy target; transitioned UNDEFINED -> GENERAL before the loop
+		uint32_t tileSize,  // tile edge length in texels
+		uint32_t tileSizeBytes,  // bytes memcpy'd per tile; must hold tileSize*tileSize 4-byte texels
+		uint32_t tilesPerFrame,  // only scales the reported byte count
+		uint32_t framesInFlight,  // number of command pools/buffers and timestamp pairs cycled through
+		uint32_t totalFrames,  // number of uploads to record and submit
+		IQueue* queue)  // queue used for every submit
+	{
+		// Every early-out below logs the reason and reports 0 GB/s.
+		const auto fail = [this](const char* msg) -> double { logFail(msg); return 0.0; };
+
+		// Reject arguments that would cause a modulo/division by zero or an out-of-bounds access further down.
+		if (!queue || !destinationImage || !stagingMemory || !mappedPtr || framesInFlight == 0u || tileSize == 0u ||
+			stagingImages.size() < framesInFlight || imageMemoryOffsets.size() < framesInFlight ||
+			uint64_t(tileSizeBytes) < uint64_t(tileSize) * tileSize * 4ull)
+			return fail("runBenchmarkImageStaging: invalid arguments!\n");
+
+		smart_refctd_ptr<ISemaphore> timelineSemaphore = m_device->createSemaphore(0);
+		smart_refctd_ptr<IQueryPool> queryPool;
+		{
+			IQueryPool::SCreationParams queryPoolParams = {};
+			queryPoolParams.queryType = IQueryPool::TYPE::TIMESTAMP;
+			queryPoolParams.queryCount = framesInFlight * 2; // one start/end pair per frame in flight
+			queryPoolParams.pipelineStatisticsFlags = IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE;
+			queryPool = m_device->createQueryPool(queryPoolParams);
+		}
+		if (!timelineSemaphore || !queryPool)
+			return fail("runBenchmarkImageStaging: failed to create the timeline semaphore or the query pool!\n");
+
+		std::vector<smart_refctd_ptr<IGPUCommandPool>> commandPools(framesInFlight);
+		std::vector<smart_refctd_ptr<IGPUCommandBuffer>> commandBuffers(framesInFlight);
+		for (uint32_t i = 0; i < framesInFlight; i++) // one resettable pool and one primary command buffer per frame in flight
+		{
+			commandPools[i] = m_device->createCommandPool(queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT);
+			if (commandPools[i])
+				commandPools[i]->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &commandBuffers[i]);
+			if (!commandBuffers[i])
+				return fail("runBenchmarkImageStaging: failed to create a command pool or command buffer!\n");
+		}
+
+		uint64_t timelineValue = 0;
+
+		commandBuffers[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+		{
+			IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> initBarrier = {};
+			initBarrier.oldLayout = IImage::LAYOUT::UNDEFINED;
+			initBarrier.newLayout = IImage::LAYOUT::GENERAL;
+			initBarrier.image = destinationImage;
+			initBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
+			initBarrier.subresourceRange.baseMipLevel = 0;
+			initBarrier.subresourceRange.levelCount = 1;
+			initBarrier.subresourceRange.baseArrayLayer = 0;
+			initBarrier.subresourceRange.layerCount = 1;
+			initBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::NONE;
+			initBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT;
+			initBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::NONE;
+			initBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT;
+			commandBuffers[0]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&initBarrier, 1} });
+		}
+		commandBuffers[0]->end();
+
+		IQueue::SSubmitInfo submitInfo = {};
+		IQueue::SSubmitInfo::SCommandBufferInfo cmdBufInfo = { .cmdbuf = commandBuffers[0].get() };
+		submitInfo.commandBuffers = { &cmdBufInfo, 1 };
+
+		IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = {
+			.semaphore = timelineSemaphore.get(),
+			.value = ++timelineValue,
+			.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
+		};
+		submitInfo.signalSemaphores = { &signalInfo, 1 };
+
+		queue->submit({ &submitInfo, 1 });
+
+		ISemaphore::SWaitInfo waitInfo = {
+			.semaphore = timelineSemaphore.get(),
+			.value = timelineValue
+		};
+		m_device->blockForSemaphores({ &waitInfo, 1 }); // layout transition is finished before the timed loop starts
+		uint32_t imageWidth = destinationImage->getCreationParameters().extent.width;
+		std::vector<uint8_t> testPatternData(tileSizeBytes);
+		for (uint32_t y = 0; y < tileSize; y++)
+		{
+			for (uint32_t x = 0; x < tileSize; x++)
+			{
+				uint32_t idx = (y * tileSize + x) * 4; // 4 bytes per texel; in range thanks to the argument check above
+				testPatternData[idx + 0] = (x * 2) & 0xFF;
+				testPatternData[idx + 1] = (y * 2) & 0xFF;
+				testPatternData[idx + 2] = 128;
+				testPatternData[idx + 3] = 255;
+			}
+		}
+
+		uint32_t tilesPerRow = imageWidth / tileSize;
+		if (tilesPerRow == 0u) // destination narrower than one tile: `frame % tilesPerRow` below would divide by zero
+			return fail("runBenchmarkImageStaging: destination image is narrower than one tile!\n");
+
+		double totalWaitTime = 0.0;
+		double totalMemcpyTime = 0.0;
+		double totalImageCreateTime = 0.0;
+		double totalRecordTime = 0.0;
+		double totalSubmitTime = 0.0;
+
+		auto startTime = std::chrono::high_resolution_clock::now();
+
+		for (uint32_t frame = 0; frame < totalFrames; frame++)
+		{
+			uint32_t cmdBufIndex = frame % framesInFlight;
+
+			auto t1 = std::chrono::high_resolution_clock::now();
+			if (frame >= framesInFlight)
+			{
+				ISemaphore::SWaitInfo frameWaitInfo = {
+					.semaphore = timelineSemaphore.get(),
+					.value = timelineValue - framesInFlight + 1 // value signalled by the submit that last used this slot
+				};
+				m_device->blockForSemaphores({&frameWaitInfo, 1});
+			}
+			auto t2 = std::chrono::high_resolution_clock::now();
+
+			commandPools[cmdBufIndex]->reset(); // counted as part of the "Memcpy time" interval (t2..t3)
+
+			IGPUImage* stagingImage = stagingImages[cmdBufIndex].get();
+			size_t memoryOffset = imageMemoryOffsets[cmdBufIndex];
+
+			void* targetPtr = static_cast<uint8_t*>(mappedPtr) + memoryOffset;
+			memcpy(targetPtr, testPatternData.data(), tileSizeBytes);
+
+			// Flush if not HOST_COHERENT
+			if (!stagingMemory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+			{
+				ILogicalDevice::MappedMemoryRange range(stagingMemory, memoryOffset, tileSizeBytes);
+				m_device->flushMappedMemoryRanges(1, &range);
+			}
+
+			auto t3 = std::chrono::high_resolution_clock::now();
+			// NOTE(review): nothing runs between t3 and t4, so the logged "Image create time" is always ~0.
+			auto t4 = std::chrono::high_resolution_clock::now();
+
+			commandBuffers[cmdBufIndex]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+
+			uint32_t queryStartIndex = cmdBufIndex * 2;
+			commandBuffers[cmdBufIndex]->resetQueryPool(queryPool.get(), queryStartIndex, 2);
+
+			IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> stagingBarrier = {};
+			stagingBarrier.oldLayout = IImage::LAYOUT::GENERAL;
+			stagingBarrier.newLayout = IImage::LAYOUT::GENERAL;
+			stagingBarrier.image = stagingImage;
+			stagingBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
+			stagingBarrier.subresourceRange.baseMipLevel = 0;
+			stagingBarrier.subresourceRange.levelCount = 1;
+			stagingBarrier.subresourceRange.baseArrayLayer = 0;
+			stagingBarrier.subresourceRange.layerCount = 1;
+			stagingBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::HOST_WRITE_BIT;
+			stagingBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_READ_BIT;
+			stagingBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::HOST_BIT;
+			stagingBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT;
+			commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&stagingBarrier, 1} });
+
+			IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> dstBarrier = {};
+			dstBarrier.oldLayout = IImage::LAYOUT::GENERAL;
+			dstBarrier.newLayout = IImage::LAYOUT::GENERAL;
+			dstBarrier.image = destinationImage;
+			dstBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
+			dstBarrier.subresourceRange.baseMipLevel = 0;
+			dstBarrier.subresourceRange.levelCount = 1;
+			dstBarrier.subresourceRange.baseArrayLayer = 0;
+			dstBarrier.subresourceRange.layerCount = 1;
+			dstBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT;
+			dstBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT;
+			dstBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS;
+			dstBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS;
+			commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, {.imgBarriers = {&dstBarrier, 1}});
+
+			commandBuffers[cmdBufIndex]->writeTimestamp(PIPELINE_STAGE_FLAGS::COPY_BIT, queryPool.get(), queryStartIndex + 0);
+
+			uint32_t tileIndex = frame % tilesPerRow; // always < tilesPerRow, so tileY below is always 0: only the top row of tiles is written
+			uint32_t tileX = (tileIndex % tilesPerRow) * tileSize;
+			uint32_t tileY = (tileIndex / tilesPerRow) * tileSize;
+
+			IImage::SImageCopy copyRegion = {};
+			copyRegion.srcSubresource.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
+			copyRegion.srcSubresource.mipLevel = 0;
+			copyRegion.srcSubresource.baseArrayLayer = 0;
+			copyRegion.srcSubresource.layerCount = 1;
+			copyRegion.srcOffset = { 0, 0, 0 };
+			copyRegion.dstSubresource = copyRegion.srcSubresource;
+			copyRegion.dstOffset = { tileX, tileY, 0 };
+			copyRegion.extent = { tileSize, tileSize, 1 };
+
+			commandBuffers[cmdBufIndex]->copyImage(
+				stagingImage,
+				IImage::LAYOUT::GENERAL,
+				destinationImage,
+				IImage::LAYOUT::GENERAL,
+				1,
+				&copyRegion
+			);
+
+			IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> afterBarrier = {};
+			afterBarrier.oldLayout = IImage::LAYOUT::GENERAL;
+			afterBarrier.newLayout = IImage::LAYOUT::GENERAL;
+			afterBarrier.image = destinationImage;
+			afterBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
+			afterBarrier.subresourceRange.baseMipLevel = 0;
+			afterBarrier.subresourceRange.levelCount = 1;
+			afterBarrier.subresourceRange.baseArrayLayer = 0;
+			afterBarrier.subresourceRange.layerCount = 1;
+			afterBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT;
+			afterBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS;
+			afterBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS;
+			afterBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT;
+			commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, {.imgBarriers = {&afterBarrier, 1}});
+
+			commandBuffers[cmdBufIndex]->writeTimestamp(PIPELINE_STAGE_FLAGS::COPY_BIT, queryPool.get(), queryStartIndex + 1);
+
+			commandBuffers[cmdBufIndex]->end();
+			auto t5 = std::chrono::high_resolution_clock::now();
+
+			IQueue::SSubmitInfo frameSubmitInfo = {};
+			IQueue::SSubmitInfo::SCommandBufferInfo frameCmdBufInfo = {.cmdbuf = commandBuffers[cmdBufIndex].get()};
+			frameSubmitInfo.commandBuffers = {&frameCmdBufInfo, 1};
+
+			IQueue::SSubmitInfo::SSemaphoreInfo frameSignalInfo = {
+				.semaphore = timelineSemaphore.get(),
+				.value = ++timelineValue,
+				.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS
+			};
+			frameSubmitInfo.signalSemaphores = {&frameSignalInfo, 1};
+
+			queue->submit({&frameSubmitInfo, 1});
+			auto t6 = std::chrono::high_resolution_clock::now();
+
+			totalWaitTime += std::chrono::duration<double>(t2 - t1).count();
+			totalMemcpyTime += std::chrono::duration<double>(t3 - t2).count();
+			totalImageCreateTime += std::chrono::duration<double>(t4 - t3).count();
+			totalRecordTime += std::chrono::duration<double>(t5 - t4).count();
+			totalSubmitTime += std::chrono::duration<double>(t6 - t5).count();
+		}
+
+		ISemaphore::SWaitInfo finalWait = {
+			.semaphore = timelineSemaphore.get(),
+			.value = timelineValue
+		};
+		m_device->blockForSemaphores({&finalWait, 1});
+
+		auto endTime = std::chrono::high_resolution_clock::now();
+
+		// Only the first min(framesInFlight, totalFrames) query pairs were ever reset and written; waiting on the others may never return.
+		const uint32_t sampledFrames = totalFrames < framesInFlight ? totalFrames : framesInFlight;
+		std::vector<uint64_t> timestamps(framesInFlight * 2);
+		const core::bitflag flags = core::bitflag(IQueryPool::RESULTS_FLAGS::_64_BIT) | core::bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT);
+		if (sampledFrames > 0u)
+			m_device->getQueryPoolResults(queryPool.get(), 0, sampledFrames * 2, timestamps.data(), sizeof(uint64_t), flags);
+		uint64_t totalGpuTicks = 0;
+		for (uint32_t i = 0; i < sampledFrames; i++)
+			totalGpuTicks += timestamps[i * 2 + 1] - timestamps[i * 2 + 0]; // end tick - start tick of slot i
+		float timestampPeriod = m_physicalDevice->getLimits().timestampPeriodInNanoSeconds;
+		double sampledGpuTimeSeconds = (double(totalGpuTicks) * timestampPeriod) / 1e9; // multiply in double: a float cannot hold a 64-bit tick count exactly
+		double avgGpuTimePerFrame = sampledFrames > 0u ? sampledGpuTimeSeconds / sampledFrames : 0.0;
+		double totalGpuTimeSeconds = avgGpuTimePerFrame * totalFrames; // extrapolated from the last frame recorded in each slot
+
+		double elapsedSeconds = std::chrono::duration<double>(endTime - startTime).count();
+		// NOTE(review): the loop above copies exactly one tile per frame, yet the byte count is scaled by tilesPerFrame - confirm callers pass 1.
+		uint64_t totalBytes = (uint64_t)totalFrames * tilesPerFrame * tileSizeBytes;
+
+		double throughputGBps = (totalBytes / (1024.0 * 1024.0 * 1024.0)) / elapsedSeconds;
+
+		m_logger->log("    copyImage time: %.3f s", ILogger::ELL_PERFORMANCE, totalGpuTimeSeconds);
+		m_logger->log("    Total throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, throughputGBps);
+
+		m_logger->log("  Timing breakdown for %s:", ILogger::ELL_PERFORMANCE, strategyName);
+		m_logger->log("    Wait time:         %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalWaitTime, 100.0 * totalWaitTime / elapsedSeconds);
+		m_logger->log("    Memcpy time:       %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalMemcpyTime, 100.0 * totalMemcpyTime / elapsedSeconds);
+		m_logger->log("    Image create time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalImageCreateTime, 100.0 * totalImageCreateTime / elapsedSeconds);
+		m_logger->log("    Record time:       %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalRecordTime, 100.0 * totalRecordTime / elapsedSeconds);
+		m_logger->log("    Submit time:       %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalSubmitTime, 100.0 * totalSubmitTime / elapsedSeconds);
+		m_logger->log("    Memcpy speed:      %.2f GB/s", ILogger::ELL_PERFORMANCE, (totalBytes / (1024.0 * 1024.0 * 1024.0)) / totalMemcpyTime);
+
+		return throughputGBps;
+	}
+
 	bool createStagingBuffer(
 		uint32_t bufferSize,
 		uint32_t memoryTypeBits,
@@ -493,7 +921,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 		if (!outAllocation.isValid())
 			return logFail("Failed to allocate Device Memory!\n");
 
-		outMappedPtr = outAllocation.memory->map({0ull, outAllocation.memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_WRITE);
+		outMappedPtr = outAllocation.memory->map({ 0ull, outAllocation.memory->getAllocationSize() }, IDeviceMemoryAllocation::EMCAF_WRITE);
 		if (!outMappedPtr)
 			return logFail("Failed to map Device Memory!\n");
 
@@ -501,4 +929,4 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 	}
 };
 
-NBL_MAIN_FUNC(ImageUploadBenchmarkApp)
+NBL_MAIN_FUNC(ImageUploadBenchmarkApp)
\ No newline at end of file

From 7abe408b3fba6ae0ea896c1c462c51a7a483e506 Mon Sep 17 00:00:00 2001
From: CrabeExtra <abbasgaroosi7@gmail.com>
Date: Thu, 26 Feb 2026 10:35:34 +0330
Subject: [PATCH 7/7] compute shader added

---
 73_ImageUploadBenchmark/CMakeLists.txt        |   60 +-
 .../app_resources/common.hlsl                 |    8 +
 .../app_resources/tile_upload.comp.hlsl       |  260 ++++
 73_ImageUploadBenchmark/main.cpp              | 1203 ++++++++++++++---
 4 files changed, 1334 insertions(+), 197 deletions(-)
 create mode 100644 73_ImageUploadBenchmark/app_resources/common.hlsl
 create mode 100644 73_ImageUploadBenchmark/app_resources/tile_upload.comp.hlsl

diff --git a/73_ImageUploadBenchmark/CMakeLists.txt b/73_ImageUploadBenchmark/CMakeLists.txt
index 2f9218f93..da95550e7 100644
--- a/73_ImageUploadBenchmark/CMakeLists.txt
+++ b/73_ImageUploadBenchmark/CMakeLists.txt
@@ -3,4 +3,62 @@ if(NOT RES)
 	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
 endif()
 
-nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
\ No newline at end of file
+nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")
+
+if(NBL_EMBED_BUILTIN_RESOURCES) # when set, every file under app_resources is handed to the builtin-resource macros below
+	set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
+	set(RESOURCE_DIR "app_resources")
+
+	get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
+	get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
+	# Collect every file under app_resources, with paths relative to that directory.
+	file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
+	foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
+		LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
+	endforeach()
+
+	ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
+
+	LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
+endif()
+
+set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen") # passed as BINARY_DIR to the NSC rules and as BIND to the resource archive below
+set(DEPENDS
+	app_resources/common.hlsl
+	app_resources/tile_upload.comp.hlsl
+)
+target_sources(${EXECUTABLE_NAME} PRIVATE ${DEPENDS}) # list the shader sources on the executable target...
+set_source_files_properties(${DEPENDS} PROPERTIES HEADER_FILE_ONLY ON) # ...but mark them header-only so CMake does not try to compile them
+
+set(SM 6_8) # shader-model suffix used in the -T lib_${SM} option below
+set(JSON [=[
+[
+	{
+		"INPUT": "app_resources/tile_upload.comp.hlsl",
+		"KEY": "snakeStore"
+	}
+]
+]=])
+string(CONFIGURE "${JSON}" JSON) # expand any variable references inside the bracket-quoted JSON text
+# NOTE(review): the meaning of the NBL_* macro arguments below is defined in the Nabla CMake modules, not here - confirm there.
+NBL_CREATE_NSC_COMPILE_RULES(
+	TARGET ${EXECUTABLE_NAME}SPIRV
+	LINK_TO ${EXECUTABLE_NAME}
+	DEPENDS ${DEPENDS}
+	BINARY_DIR ${OUTPUT_DIRECTORY}
+	MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT
+	COMMON_OPTIONS -I "${CMAKE_CURRENT_SOURCE_DIR}" -T lib_${SM}
+	OUTPUT_VAR KEYS
+	INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp
+	NAMESPACE nbl::this_example::builtin::build
+	INPUTS ${JSON}
+)
+
+NBL_CREATE_RESOURCE_ARCHIVE(
+	NAMESPACE nbl::this_example::builtin::build
+	TARGET ${EXECUTABLE_NAME}_builtinsBuild
+	LINK_TO ${EXECUTABLE_NAME}
+	BIND ${OUTPUT_DIRECTORY}
+	BUILTINS ${KEYS}
+)
diff --git a/73_ImageUploadBenchmark/app_resources/common.hlsl b/73_ImageUploadBenchmark/app_resources/common.hlsl
new file mode 100644
index 000000000..f86f60fb7
--- /dev/null
+++ b/73_ImageUploadBenchmark/app_resources/common.hlsl
@@ -0,0 +1,8 @@
+struct PushConstantData // push-constant block read as `pc` by every entry point in tile_upload.comp.hlsl
+{
+    uint64_t deviceBufferAddress; // base address for vk::RawBufferLoad/Store; texel i sits at byte offset i*4
+    uint32_t2 dstOffset;          // added to the texel position by the non-batched entry points
+    uint32_t srcWidth;            // tile width in texels; used to turn a linear index into (x, y) and for bounds checks
+    uint32_t srcHeight;           // tile height in texels; used for bounds checks
+    uint32_t tilesPerRow;         // passed to batchedTileInfo by the Batched* entry points
+};
diff --git a/73_ImageUploadBenchmark/app_resources/tile_upload.comp.hlsl b/73_ImageUploadBenchmark/app_resources/tile_upload.comp.hlsl
new file mode 100644
index 000000000..bfec6b9d8
--- /dev/null
+++ b/73_ImageUploadBenchmark/app_resources/tile_upload.comp.hlsl
@@ -0,0 +1,260 @@
+#include "common.hlsl"
+
+[[vk::binding(0,0)]] RWTexture2D<float32_t4> dstImage;
+[[vk::push_constant]] PushConstantData pc;
+
+using namespace nbl::hlsl;
+
+static const uint32_t TILE_WIDTH = 16u;
+static const uint32_t TILE_HEIGHT = 8u; 
+
+// Row-major upload: thread gIdx reads the packed RGBA8 texel at byte offset gIdx*4 and writes it to dstImage at pc.dstOffset + (gIdx % srcWidth, gIdx / srcWidth).
+[numthreads(128, 1, 1)]
+[shader("compute")]
+void linearStore(uint32_t3 ID : SV_DispatchThreadID)
+{
+    const uint32_t gIdx = ID.x;
+    const uint32_t2 pixelPos = uint32_t2(gIdx % pc.srcWidth, gIdx / pc.srcWidth);
+    // x < srcWidth already holds because of the modulo, so only threads past the last row need culling.
+    if (pixelPos.y >= pc.srcHeight)
+        return;
+
+    // Widen before scaling so the byte offset cannot wrap at 2^32 for very large uploads.
+    const uint32_t packed = vk::RawBufferLoad<uint32_t>(pc.deviceBufferAddress + uint64_t(gIdx) * 4u);
+    const float32_t4 rgba = float32_t4(
+        float32_t((packed >>  0u) & 0xFFu) / 255.0f,
+        float32_t((packed >>  8u) & 0xFFu) / 255.0f,
+        float32_t((packed >> 16u) & 0xFFu) / 255.0f,
+        float32_t((packed >> 24u) & 0xFFu) / 255.0f
+    );
+    dstImage[pc.dstOffset + pixelPos] = rgba;
+}
+
+// Row-major read-back: thread gIdx reads dstImage at pc.dstOffset + (gIdx % srcWidth, gIdx / srcWidth) and stores it as packed RGBA8 at byte offset gIdx*4.
+[numthreads(128, 1, 1)]
+[shader("compute")]
+void linearLoad(uint32_t3 ID : SV_DispatchThreadID)
+{
+    const uint32_t gIdx = ID.x;
+    const uint32_t2 pixelPos = uint32_t2(gIdx % pc.srcWidth, gIdx / pc.srcWidth);
+    if (pixelPos.y >= pc.srcHeight) // x < srcWidth is guaranteed by the modulo above
+        return;
+
+    const float32_t4 color = dstImage[pc.dstOffset + pixelPos];
+    const uint32_t r = uint32_t(color.r * 255.0f + 0.5f); // +0.5 rounds to nearest
+    const uint32_t g = uint32_t(color.g * 255.0f + 0.5f);
+    const uint32_t b = uint32_t(color.b * 255.0f + 0.5f);
+    const uint32_t a = uint32_t(color.a * 255.0f + 0.5f);
+    const uint32_t packed = (r << 0u) | (g << 8u) | (b << 16u) | (a << 24u);
+    // Widen before scaling so the byte offset cannot wrap at 2^32 for very large uploads.
+    vk::RawBufferStore<uint32_t>(pc.deviceBufferAddress + uint64_t(gIdx) * 4u, packed);
+}
+
+
+// Converts a linear texel index into a position inside a srcWidth-wide tile that is walked in
+// TILE_WIDTH x TILE_HEIGHT sub-tiles: sub-tiles go row by row, odd sub-tile rows run right-to-left,
+// and texels inside a sub-tile are row-major.
+uint32_t2 snakePixelPos(uint32_t gIdx, uint32_t srcWidth)
+{
+    const uint32_t texelsPerSubTile = TILE_WIDTH * TILE_HEIGHT;
+    const uint32_t subTilesPerRow = srcWidth / TILE_WIDTH;
+
+    // Which sub-tile, and which texel inside it.
+    const uint32_t subTile = gIdx / texelsPerSubTile;
+    const uint32_t within = gIdx % texelsPerSubTile;
+
+    const uint32_t row = subTile / subTilesPerRow;
+    const uint32_t forwardCol = subTile % subTilesPerRow;
+    // Odd rows are mirrored horizontally.
+    const uint32_t col = (row & 1u) != 0u ? (subTilesPerRow - 1u - forwardCol) : forwardCol;
+
+    uint32_t2 pos;
+    pos.x = col * TILE_WIDTH + (within % TILE_WIDTH);
+    pos.y = row * TILE_HEIGHT + (within / TILE_WIDTH);
+    return pos;
+}
+
+// Snake-order upload: thread gIdx reads the packed RGBA8 texel at byte offset gIdx*4 and writes it at pc.dstOffset + snakePixelPos(gIdx, srcWidth).
+[numthreads(128, 1, 1)]
+[shader("compute")]
+void SnakeOrderStore(uint32_t3 ID : SV_DispatchThreadID)
+{
+    const uint32_t gIdx = ID.x;
+    const uint32_t2 pixelPos = snakePixelPos(gIdx, pc.srcWidth);
+    if (pixelPos.x >= pc.srcWidth || pixelPos.y >= pc.srcHeight)
+        return;
+
+    // Widen before scaling so the byte offset cannot wrap at 2^32 for very large uploads.
+    const uint32_t packed = vk::RawBufferLoad<uint32_t>(pc.deviceBufferAddress + uint64_t(gIdx) * 4u);
+    const float32_t4 rgba = float32_t4(
+        float32_t((packed >>  0u) & 0xFFu) / 255.0f,
+        float32_t((packed >>  8u) & 0xFFu) / 255.0f,
+        float32_t((packed >> 16u) & 0xFFu) / 255.0f,
+        float32_t((packed >> 24u) & 0xFFu) / 255.0f
+    );
+
+    dstImage[pc.dstOffset + pixelPos] = rgba;
+}
+
+// Snake-order read-back: thread gIdx reads dstImage at pc.dstOffset + snakePixelPos(gIdx, srcWidth) and stores it as packed RGBA8 at byte offset gIdx*4.
+[numthreads(128, 1, 1)]
+[shader("compute")]
+void SnakeOrderLoad(uint32_t3 ID : SV_DispatchThreadID)
+{
+    const uint32_t gIdx = ID.x;
+    const uint32_t2 pixelPos = snakePixelPos(gIdx, pc.srcWidth);
+    if (pixelPos.x >= pc.srcWidth || pixelPos.y >= pc.srcHeight)
+        return;
+
+    const float32_t4 color = dstImage[pc.dstOffset + pixelPos];
+    const uint32_t r = uint32_t(color.r * 255.0f + 0.5f); // +0.5 rounds to nearest
+    const uint32_t g = uint32_t(color.g * 255.0f + 0.5f);
+    const uint32_t b = uint32_t(color.b * 255.0f + 0.5f);
+    const uint32_t a = uint32_t(color.a * 255.0f + 0.5f);
+    const uint32_t packed = (r << 0u) | (g << 8u) | (b << 16u) | (a << 24u);
+
+    // Widen before scaling so the byte offset cannot wrap at 2^32 for very large uploads.
+    vk::RawBufferStore<uint32_t>(pc.deviceBufferAddress + uint64_t(gIdx) * 4u, packed);
+}
+    
+// Keeps the even-indexed bits of x and packs them together into the low 16 bits.
+uint32_t mortonCompact1By1(uint32_t x)
+{
+    uint32_t bits = x & 0x55555555u;
+    bits = (bits ^ (bits >> 1u)) & 0x33333333u;
+    bits = (bits ^ (bits >> 2u)) & 0x0f0f0f0fu;
+    bits = (bits ^ (bits >> 4u)) & 0x00ff00ffu;
+    return (bits ^ (bits >> 8u)) & 0x0000ffffu;
+}
+
+// Splits a Morton code into coordinates: the even bits form x, the odd bits form y.
+uint32_t2 mortonDecode(uint32_t code)
+{
+    const uint32_t x = mortonCompact1By1(code);
+    const uint32_t y = mortonCompact1By1(code >> 1u);
+    return uint32_t2(x, y);
+}
+    
+// Splits a global texel index into the tile it belongs to (top-left corner returned in tileBase) and the texel's index inside that tile.
+void batchedTileInfo(uint32_t gIdx, uint32_t tileW, uint32_t tileH, uint32_t tilesPerRow,
+    out uint32_t2 tileBase, out uint32_t localIdx)
+{
+    const uint32_t texelsPerTile = tileW * tileH;
+    const uint32_t tile = gIdx / texelsPerTile;
+    // Tiles are laid out row-major, tilesPerRow to a row.
+    tileBase = uint32_t2(tile % tilesPerRow, tile / tilesPerRow) * uint32_t2(tileW, tileH);
+    localIdx = gIdx % texelsPerTile;
+}
+
+// Expands a packed RGBA8 value (R in the lowest byte, A in the highest) into four floats, each byte divided by 255.
+float32_t4 unpackRGBA(uint32_t packed)
+{
+    const uint32_t r = (packed >>  0u) & 0xFFu;
+    const uint32_t g = (packed >>  8u) & 0xFFu;
+    const uint32_t b = (packed >> 16u) & 0xFFu;
+    const uint32_t a = (packed >> 24u) & 0xFFu;
+    return float32_t4(float32_t(r), float32_t(g), float32_t(b), float32_t(a)) / 255.0f;
+}
+
+// Batched row-major upload: batchedTileInfo picks the tile for texel gIdx, then the texel is placed row by row inside that tile.
+// NOTE(review): no bounds check and pc.dstOffset is not applied here - presumably the dispatch covers a whole number of tiles; confirm against the host code.
+[numthreads(128, 1, 1)]
+[shader("compute")]
+void BatchedLinearStore(uint32_t3 ID : SV_DispatchThreadID)
+{
+    const uint32_t gIdx = ID.x;
+    uint32_t2 tileBase;
+    uint32_t localIdx;
+    batchedTileInfo(gIdx, pc.srcWidth, pc.srcHeight, pc.tilesPerRow, tileBase, localIdx);
+    const uint32_t2 pixelPos = tileBase + uint32_t2(localIdx % pc.srcWidth, localIdx / pc.srcWidth);
+    // Widen before scaling so the byte offset cannot wrap at 2^32 for very large uploads.
+    const uint32_t packed = vk::RawBufferLoad<uint32_t>(pc.deviceBufferAddress + uint64_t(gIdx) * 4u);
+    dstImage[pixelPos] = unpackRGBA(packed);
+}
+
+// Batched snake-order upload: batchedTileInfo picks the tile for texel gIdx,
+// then the texel is placed inside that tile in the same order SnakeOrderStore uses.
+// NOTE(review): no bounds check and pc.dstOffset is not applied here - presumably the dispatch covers a whole number of tiles; confirm against the host code.
+[numthreads(128, 1, 1)]
+[shader("compute")]
+void BatchedSnakeStore(uint32_t3 ID : SV_DispatchThreadID)
+{
+    const uint32_t gIdx = ID.x;
+
+    // Which tile this texel belongs to, and its index inside that tile.
+    uint32_t2 tileBase;
+    uint32_t localIdx;
+    batchedTileInfo(gIdx, pc.srcWidth, pc.srcHeight, pc.tilesPerRow, tileBase, localIdx);
+
+    // Inside the tile the layout is exactly what snakePixelPos decodes (TILE_WIDTH x TILE_HEIGHT
+    // sub-tiles, odd sub-tile rows reversed), so call it instead of repeating the arithmetic.
+    const uint32_t2 pixelPos = tileBase + snakePixelPos(localIdx, pc.srcWidth);
+
+    // Widen before scaling so the byte offset cannot wrap at 2^32 for very large uploads.
+    const uint64_t srcAddress = pc.deviceBufferAddress + uint64_t(gIdx) * 4u;
+    const uint32_t packed = vk::RawBufferLoad<uint32_t>(srcAddress);
+
+    dstImage[pixelPos] = unpackRGBA(packed);
+}
+
+// Batched Morton-order upload: batchedTileInfo picks the tile for texel gIdx, then the in-tile index is decoded as a Morton code.
+[numthreads(128, 1, 1)]
+[shader("compute")]
+void BatchedMortonStore(uint32_t3 ID : SV_DispatchThreadID)
+{
+    const uint32_t gIdx = ID.x;
+    uint32_t2 tileBase;
+    uint32_t localIdx;
+    batchedTileInfo(gIdx, pc.srcWidth, pc.srcHeight, pc.tilesPerRow, tileBase, localIdx);
+
+    const uint32_t2 localPos = mortonDecode(localIdx);
+    if (localPos.x >= pc.srcWidth || localPos.y >= pc.srcHeight) // decoded position falls outside the tile
+        return;
+
+    // Widen before scaling so the byte offset cannot wrap at 2^32 for very large uploads.
+    const uint32_t packed = vk::RawBufferLoad<uint32_t>(pc.deviceBufferAddress + uint64_t(gIdx) * 4u);
+    dstImage[tileBase + localPos] = unpackRGBA(packed);
+}
+
+// Morton-order upload: thread gIdx reads the packed RGBA8 texel at byte offset gIdx*4
+// and writes it to dstImage at pc.dstOffset + mortonDecode(gIdx).
+// Threads whose decoded position lies outside srcWidth x srcHeight do nothing.
+[numthreads(128, 1, 1)]
+[shader("compute")]
+void MortonOrderStore(uint32_t3 ID : SV_DispatchThreadID)
+{
+    const uint32_t gIdx = ID.x;
+    const uint32_t2 pixelPos = mortonDecode(gIdx);
+
+    if (pixelPos.x >= pc.srcWidth || pixelPos.y >= pc.srcHeight)
+        return;
+
+    // Widen before scaling so the byte offset cannot wrap at 2^32 for very large uploads.
+    const uint64_t srcAddress = pc.deviceBufferAddress + uint64_t(gIdx) * 4u;
+    const uint32_t packed = vk::RawBufferLoad<uint32_t>(srcAddress);
+
+    // Same RGBA8 decode the Batched* entry points use.
+    const float32_t4 rgba = unpackRGBA(packed);
+    dstImage[pc.dstOffset + pixelPos] = rgba;
+}
+
+// Morton-order read-back: thread gIdx reads dstImage at pc.dstOffset + mortonDecode(gIdx) and stores it as packed RGBA8 at byte offset gIdx*4.
+[numthreads(128, 1, 1)]
+[shader("compute")]
+void MortonOrderLoad(uint32_t3 ID : SV_DispatchThreadID)
+{
+    const uint32_t gIdx = ID.x;
+    const uint32_t2 pixelPos = mortonDecode(gIdx);
+    if (pixelPos.x >= pc.srcWidth || pixelPos.y >= pc.srcHeight)
+        return;
+
+    const float32_t4 color = dstImage[pc.dstOffset + pixelPos];
+    const uint32_t r = uint32_t(color.r * 255.0f + 0.5f); // +0.5 rounds to nearest
+    const uint32_t g = uint32_t(color.g * 255.0f + 0.5f);
+    const uint32_t b = uint32_t(color.b * 255.0f + 0.5f);
+    const uint32_t a = uint32_t(color.a * 255.0f + 0.5f);
+    const uint32_t packed = (r << 0u) | (g << 8u) | (b << 16u) | (a << 24u);
+
+    // Widen before scaling so the byte offset cannot wrap at 2^32 for very large uploads.
+    vk::RawBufferStore<uint32_t>(pc.deviceBufferAddress + uint64_t(gIdx) * 4u, packed);
+}
\ No newline at end of file
diff --git a/73_ImageUploadBenchmark/main.cpp b/73_ImageUploadBenchmark/main.cpp
index 1fff59202..094e3c2f7 100644
--- a/73_ImageUploadBenchmark/main.cpp
+++ b/73_ImageUploadBenchmark/main.cpp
@@ -1,4 +1,5 @@
 #include "nbl/examples/examples.hpp"
+#include "nbl/this_example/builtin/build/spirv/keys.hpp"
 #include <chrono>
 #include <thread>
 
@@ -25,9 +26,6 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 		if (!asset_base_t::onAppInitialized(std::move(system)))
 			return false;
 
-		constexpr uint32_t TILE_SIZE = 128;
-		constexpr uint32_t TILE_BYTES_PER_PIXEL = 4;
-		constexpr uint32_t TILE_SIZE_BYTES = TILE_SIZE * TILE_SIZE * TILE_BYTES_PER_PIXEL;
 		constexpr uint32_t STAGING_BUFFER_SIZE = 64 * 1024 * 1024;
 		constexpr uint32_t FRAMES_IN_FLIGHT = 4;
 		constexpr uint32_t TILES_PER_FRAME = STAGING_BUFFER_SIZE / (TILE_SIZE_BYTES * FRAMES_IN_FLIGHT);
@@ -47,7 +45,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 
 		uint32_t hostVisibleDeviceLocalBits = hostVisibleBits & deviceLocalBits;
 
-		m_logger->log("Memory type bits - HostVisible: 0x%X, DeviceLocal: 0x%X, HostCached: 0x%X",
+		m_logger->log("Memory type bits HostVisible: 0x%X, DeviceLocal: 0x%X, HostCached: 0x%X",
 			ILogger::ELL_PERFORMANCE, hostVisibleBits, deviceLocalBits, hostCachedBits);
 		m_logger->log("System RAM (non-cached): 0x%X, VRAM: 0x%X",
 			ILogger::ELL_PERFORMANCE, hostVisibleOnlyBits, hostVisibleDeviceLocalBits);
@@ -64,8 +62,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 			return false;
 		}
 
-		IQueue* queue = getQueue(IQueue::FAMILY_FLAGS::GRAPHICS_BIT);
-		smart_refctd_ptr<IGPUImage> destinationImage;
+		m_queue = getQueue(IQueue::FAMILY_FLAGS::GRAPHICS_BIT);
 		{
 			IGPUImage::SCreationParams imgParams{};
 			imgParams.type = IImage::E_TYPE::ET_2D;
@@ -79,246 +76,607 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 			imgParams.arrayLayers = 1u;
 			imgParams.samples = IImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT;
 			imgParams.tiling = video::IGPUImage::TILING::OPTIMAL;
-			imgParams.usage = asset::IImage::EUF_TRANSFER_DST_BIT;
+			imgParams.usage = asset::IImage::EUF_TRANSFER_DST_BIT | asset::IImage::EUF_STORAGE_BIT;
 			imgParams.preinitialized = false;
 
-			destinationImage = m_device->createImage(std::move(imgParams));
-			if (!destinationImage)
+			m_destinationImage = m_device->createImage(std::move(imgParams));
+			if (!m_destinationImage)
 				return logFail("Failed to create destination image!\n");
 
-			destinationImage->setObjectDebugName("Destination Image");
+			m_destinationImage->setObjectDebugName("Destination Image");
 
-			auto reqs = destinationImage->getMemoryReqs();
+			auto reqs = m_destinationImage->getMemoryReqs();
 			reqs.memoryTypeBits &= deviceLocalBits;
 
-			auto allocation = m_device->allocate(reqs, destinationImage.get(), IDeviceMemoryAllocation::EMAF_NONE);
+			auto allocation = m_device->allocate(reqs, m_destinationImage.get(), IDeviceMemoryAllocation::EMAF_NONE);
 			if (!allocation.isValid())
 				return logFail("Failed to allocate DEVICE_LOCAL memory for destination image!\n");
 		}
 
-		m_logger->log("\nStrategy 1: System RAM", ILogger::ELL_PERFORMANCE);
-
-		double throughputSystemRAM = 0.0;
+		// Loads the precompiled SPIR-V shader registered under ShaderKey through the asset manager; logs and returns nullptr when the bundle is empty.
+		auto loadPrecompiledShader = [&]<core::StringLiteral ShaderKey>()->smart_refctd_ptr<IShader>
 		{
-			smart_refctd_ptr<IGPUBuffer> stagingBuffer;
-			IDeviceMemoryAllocator::SAllocation stagingAlloc;
-			void* mappedPtr = nullptr;
+			IAssetLoader::SAssetLoadParams lp = {};
+			lp.logger = m_logger.get();
+			lp.workingDirectory = "app_resources";
+			// The lookup key is derived from the physical device's limits and features.
+			auto key = nbl::this_example::builtin::build::get_spirv_key<ShaderKey>(m_physicalDevice->getLimits(), m_physicalDevice->getFeatures());
+			m_logger->log("Loading shader with key: %s", ILogger::ELL_INFO, key.data());
 
-			if (!createStagingBuffer(STAGING_BUFFER_SIZE, hostVisibleOnlyBits,
-				"Staging Buffer - System RAM", stagingBuffer, stagingAlloc, mappedPtr))
+			auto assetBundle = m_assetMgr->getAsset(key.data(), lp);
+			const auto assets = assetBundle.getContents();
+			if (assets.empty()) // nothing was loaded for this key
 			{
-				return false;
+				m_logger->log("Asset bundle is empty for key: %s", ILogger::ELL_ERROR, key.data());
+				return smart_refctd_ptr<IShader>(nullptr);
 			}
 
-			throughputSystemRAM = runBenchmark(
-				"System RAM",
-				stagingBuffer.get(),
-				stagingAlloc,
-				mappedPtr,
-				destinationImage.get(),
-				TILE_SIZE,
-				TILE_SIZE_BYTES,
-				TILES_PER_FRAME,
-				FRAMES_IN_FLIGHT,
-				TOTAL_FRAMES,
-				queue
-			);
+			m_logger->log("Asset count: %u, asset type: %u", ILogger::ELL_INFO, static_cast<uint32_t>(assets.size()), static_cast<uint32_t>(assets[0]->getAssetType())); // %u needs unsigned int, so size_t is narrowed explicitly
 
-			stagingAlloc.memory->unmap();
-		}
-
-		m_logger->log("System RAM throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, throughputSystemRAM);
+			auto shader = IAsset::castDown<IShader>(assets[0]); // NOTE(review): presumably null if the asset is not an IShader; the visible caller null-checks the result
+			return shader;
+		};
 
-		m_device->waitIdle();
 
-		if (hostVisibleDeviceLocalBits)
+		//Setup compute shader resources
+		m_logger->log("\n=== Setting up Compute Shaders (Linear + Snake + Morton) ===", ILogger::ELL_PERFORMANCE);
 		{
-			m_logger->log("\nStrategy 2: VRAM", ILogger::ELL_PERFORMANCE);
+			auto shaderLib = loadPrecompiledShader.operator()<"snakeStore">();
+			if (!shaderLib)
+				return logFail("Failed to load shader library!\n");
+
+			IGPUDescriptorSetLayout::SBinding dsBinding = {
+				.binding = 0,
+				.type = IDescriptor::E_TYPE::ET_STORAGE_IMAGE,
+				.createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE,
+				.stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,
+				.count = 1
+			};
+			auto dsLayout = m_device->createDescriptorSetLayout({&dsBinding, 1});
+			if (!dsLayout)
+				return logFail("Failed to create descriptor set layout!\n");
+
+			asset::SPushConstantRange pcRange = {
+				.stageFlags = hlsl::ShaderStage::ESS_COMPUTE,
+				.offset = 0,
+				.size = sizeof(SPushConstantData)
+			};
 
-			double throughputVRAM = 0.0;
-			{
-				smart_refctd_ptr<IGPUBuffer> stagingBuffer;
-				IDeviceMemoryAllocator::SAllocation stagingAlloc;
-				void* mappedPtr = nullptr;
+			m_pipelineLayout = m_device->createPipelineLayout({&pcRange, 1}, smart_refctd_ptr(dsLayout));
+			if (!m_pipelineLayout)
+				return logFail("Failed to create pipeline layout!\n");
 
-				if (!createStagingBuffer(STAGING_BUFFER_SIZE, hostVisibleDeviceLocalBits,
-					"Staging Buffer - VRAM", stagingBuffer, stagingAlloc, mappedPtr))
-				{
-					return false;
-				}
+			IGPUComputePipeline::SCreationParams storeParams = {};
+			storeParams.layout = m_pipelineLayout.get();
+			storeParams.shader.shader = shaderLib.get();
+			storeParams.shader.entryPoint = "linearStore";
 
-				throughputVRAM = runBenchmark(
-					"VRAM",
-					stagingBuffer.get(),
-					stagingAlloc,
-					mappedPtr,
-					destinationImage.get(),
-					TILE_SIZE,
-					TILE_SIZE_BYTES,
-					TILES_PER_FRAME,
-					FRAMES_IN_FLIGHT,
-					TOTAL_FRAMES,
-					queue
-				);
-
-				stagingAlloc.memory->unmap();
-			}
+			if (!m_device->createComputePipelines(nullptr, {&storeParams, 1}, &m_storePipeline))
+				return logFail("Failed to create linearStore pipeline!\n");
 
-			m_logger->log("VRAM throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, throughputVRAM);
+			IGPUComputePipeline::SCreationParams loadParams = {};
+			loadParams.layout = m_pipelineLayout.get();
+			loadParams.shader.shader = shaderLib.get();
+			loadParams.shader.entryPoint = "linearLoad";
 
-			double speedup = throughputVRAM / throughputSystemRAM;
-			m_logger->log("\nVRAM is %.2fx faster than System RAM", ILogger::ELL_PERFORMANCE, speedup);
-		}
+			if (!m_device->createComputePipelines(nullptr, {&loadParams, 1}, &m_loadPipeline))
+				return logFail("Failed to create linearLoad pipeline!\n");
 
-		m_device->waitIdle();
+			IGPUComputePipeline::SCreationParams snakeStoreParams = {};
+			snakeStoreParams.layout = m_pipelineLayout.get();
+			snakeStoreParams.shader.shader = shaderLib.get();
+			snakeStoreParams.shader.entryPoint = "SnakeOrderStore";
 
+			if (!m_device->createComputePipelines(nullptr, {&snakeStoreParams, 1}, &m_snakeStorePipeline))
+				return logFail("Failed to create SnakeOrderStore pipeline!\n");
 
-		m_logger->log("\nStrategy 3: Image-to-Image Staging (OPTIMAL)", ILogger::ELL_PERFORMANCE);
-		{
-			std::vector<smart_refctd_ptr<IGPUImage>> stagingImages(FRAMES_IN_FLIGHT);
-			for (uint32_t i = 0; i < FRAMES_IN_FLIGHT; i++)
-			{
-				IGPUImage::SCreationParams imgParams{};
-				imgParams.type = IImage::E_TYPE::ET_2D;
-				imgParams.extent.width = TILE_SIZE;
-				imgParams.extent.height = TILE_SIZE;
-				imgParams.extent.depth = 1u;
-				imgParams.format = asset::E_FORMAT::EF_R8G8B8A8_UNORM;
-				imgParams.mipLevels = 1u;
-				imgParams.flags = IImage::ECF_NONE;
-				imgParams.arrayLayers = 1u;
-				imgParams.samples = IImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT;
-				imgParams.tiling = video::IGPUImage::TILING::OPTIMAL;
-				imgParams.usage = asset::IImage::EUF_TRANSFER_SRC_BIT;
-				imgParams.preinitialized = false;
-				stagingImages[i] = m_device->createImage(std::move(imgParams));
-			}
+			IGPUComputePipeline::SCreationParams snakeLoadParams = {};
+			snakeLoadParams.layout = m_pipelineLayout.get();
+			snakeLoadParams.shader.shader = shaderLib.get();
+			snakeLoadParams.shader.entryPoint = "SnakeOrderLoad";
 
-			std::vector<size_t> imageMemoryOffsets(FRAMES_IN_FLIGHT);
-			size_t currentOffset = 0;
-			uint32_t combinedMemoryTypeBits = 0xFFFFFFFF;
-			uint32_t maxAlignmentLog2 = 0;
-			for (size_t i = 0; i < FRAMES_IN_FLIGHT; i++)
-			{
-				auto memReqs = stagingImages[i]->getMemoryReqs();
-				size_t alignment = 1u << memReqs.alignmentLog2;
-				size_t alignedOffset = (currentOffset + alignment - 1) & ~(alignment - 1);
-				imageMemoryOffsets[i] = alignedOffset;
-				currentOffset = alignedOffset + memReqs.size;
-				combinedMemoryTypeBits &= memReqs.memoryTypeBits;
-				if (memReqs.alignmentLog2 > maxAlignmentLog2)
-					maxAlignmentLog2 = memReqs.alignmentLog2;
-			}
+			if (!m_device->createComputePipelines(nullptr, {&snakeLoadParams, 1}, &m_snakeLoadPipeline))
+				return logFail("Failed to create SnakeOrderLoad pipeline!\n");
+
+			IGPUComputePipeline::SCreationParams mortonStoreParams = {};
+			mortonStoreParams.layout = m_pipelineLayout.get();
+			mortonStoreParams.shader.shader = shaderLib.get();
+			mortonStoreParams.shader.entryPoint = "MortonOrderStore";
 
-			size_t totalMemorySize = currentOffset;
+			if (!m_device->createComputePipelines(nullptr, {&mortonStoreParams, 1}, &m_mortonStorePipeline))
+				return logFail("Failed to create MortonOrderStore pipeline!\n");
 
+			IGPUComputePipeline::SCreationParams mortonLoadParams = {};
+			mortonLoadParams.layout = m_pipelineLayout.get();
+			mortonLoadParams.shader.shader = shaderLib.get();
+			mortonLoadParams.shader.entryPoint = "MortonOrderLoad";
 
-			uint32_t compatibleBits = combinedMemoryTypeBits & hostVisibleDeviceLocalBits;
-			if (!compatibleBits)
-				compatibleBits = combinedMemoryTypeBits & hostVisibleOnlyBits;  
+			if (!m_device->createComputePipelines(nullptr, {&mortonLoadParams, 1}, &m_mortonLoadPipeline))
+				return logFail("Failed to create MortonOrderLoad pipeline!\n");
 
-			if (!compatibleBits)
+			auto createBatchedPipeline = [&](const char* entryPoint, smart_refctd_ptr<IGPUComputePipeline>& outPipeline) -> bool // creates one compute pipeline for `entryPoint` into outPipeline; true on success
 			{
-				m_logger->log("OPTIMAL images don't support HOST_VISIBLE on this GPU!",
-					ILogger::ELL_ERROR);
+				IGPUComputePipeline::SCreationParams params = {};
+				params.layout = m_pipelineLayout.get(); // shared pipeline layout (one storage-image set + push constants)
+				params.shader.shader = shaderLib.get(); // every entry point comes from the single loaded shader library
+				params.shader.entryPoint = entryPoint;
+				if (!m_device->createComputePipelines(nullptr, {&params, 1}, &outPipeline))
+					return logFail("Failed to create %s pipeline!\n", entryPoint); // NOTE(review): presumably logFail logs and yields false - confirm
+				return true;
+			};
+
+			if (!createBatchedPipeline("BatchedLinearStore", m_batchedLinearPipeline)) return false;
+			if (!createBatchedPipeline("BatchedSnakeStore", m_batchedSnakePipeline)) return false;
+			if (!createBatchedPipeline("BatchedMortonStore", m_batchedMortonPipeline)) return false;
+
+			auto imageView = m_device->createImageView({
+				.flags = IGPUImageView::ECF_NONE,
+				.subUsages = IGPUImage::EUF_STORAGE_BIT,
+				.image = smart_refctd_ptr(m_destinationImage),
+				.viewType = IGPUImageView::E_TYPE::ET_2D,
+				.format = asset::E_FORMAT::EF_R8G8B8A8_UNORM
+			});
+			if (!imageView)
+				return logFail("Failed to create image view!\n");
+
+			uint32_t setCount = 1;
+			auto dsPool = m_device->createDescriptorPoolForDSLayouts(
+				IDescriptorPool::ECF_NONE, {&dsLayout.get(), 1}, &setCount);
+			m_ds = dsPool->createDescriptorSet(smart_refctd_ptr(dsLayout));
+
+			IGPUDescriptorSet::SDescriptorInfo imgInfo = {};
+			imgInfo.desc = imageView;
+			imgInfo.info.image.imageLayout = IGPUImage::LAYOUT::GENERAL;
+
+			IGPUDescriptorSet::SWriteDescriptorSet dsWrite = {
+				.dstSet = m_ds.get(),
+				.binding = 0,
+				.arrayElement = 0,
+				.count = 1,
+				.info = &imgInfo
+			};
+			m_device->updateDescriptorSets({&dsWrite, 1}, {});
+
+			if (!createStagingBuffer(TILE_SIZE_BYTES, hostVisibleOnlyBits,
+				"Verify Staging Buffer", m_stagingBuffer, m_stagingAlloc, m_stagingMappedPtr))
 				return false;
-			}
 
-			IDeviceMemoryBacked::SDeviceMemoryRequirements memReqs = {};
-			memReqs.size = totalMemorySize;
-			memReqs.memoryTypeBits = compatibleBits;
-			memReqs.alignmentLog2 = maxAlignmentLog2;
+			if (!createStagingBuffer(TILE_SIZE_BYTES, hostVisibleOnlyBits,
+				"Verify Readback Buffer", m_readbackBuffer, m_readbackAlloc, m_readbackMappedPtr))
+				return false;
 
-			auto memoryAllocation = m_device->allocate(memReqs,nullptr,IDeviceMemoryAllocation::EMAF_NONE);
-			if (!memoryAllocation.isValid())
-			{
-				m_logger->log("Failed to allocate HOST_VISIBLE memory for staging images!", ILogger::ELL_ERROR);
-			}
+			if (!createStagingBuffer(TILE_SIZE_BYTES, hostVisibleOnlyBits,
+				"Snake Readback Buffer", m_snakeReadbackBuffer, m_snakeReadbackAlloc, m_snakeReadbackMappedPtr))
+				return false;
+
+			if (!createStagingBuffer(TILE_SIZE_BYTES, hostVisibleOnlyBits,
+				"Morton Readback Buffer", m_mortonReadbackBuffer, m_mortonReadbackAlloc, m_mortonReadbackMappedPtr))
+				return false;
 
-			for (uint32_t i = 0; i < FRAMES_IN_FLIGHT; i++)
 			{
-				ILogicalDevice::SBindImageMemoryInfo info{};
-				info.image = stagingImages[i].get();
-				info.binding.memory = memoryAllocation.memory.get();
-				info.binding.offset = imageMemoryOffsets[i];
-				if (!m_device->bindImageMemory({&info,1}))
+				uint32_t* pixels = static_cast<uint32_t*>(m_stagingMappedPtr);
+				uint32_t totalPixels = TILE_SIZE * TILE_SIZE;
+				for (uint32_t i = 0; i < totalPixels; i++)
 				{
-					m_logger->log("Failed to bind staging image %u to memory!", ILogger::ELL_ERROR, i);
+					uint8_t val = static_cast<uint8_t>(i & 0xFF);
+					pixels[i] = val | (val << 8u) | (val << 16u) | (val << 24u);
 				}
-			}
 
-			void* mappedPtr = memoryAllocation.memory->map({ 0ull,memoryAllocation.memory->getAllocationSize() }, IDeviceMemoryAllocation::EMCAF_WRITE);
-			if (!mappedPtr)
-			{
-				m_logger->log("Failed to map staging image memory!", ILogger::ELL_ERROR);
+				if (!m_stagingAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+				{
+					ILogicalDevice::MappedMemoryRange range(m_stagingAlloc.memory.get(), 0, TILE_SIZE_BYTES);
+					m_device->flushMappedMemoryRanges(1, &range);
+				}
 			}
 
-			smart_refctd_ptr<IGPUCommandPool> transitionCmdPool = m_device->createCommandPool(
-				queue->getFamilyIndex(),
+			m_cmdPool = m_device->createCommandPool(
+				m_queue->getFamilyIndex(),
 				IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT
 			);
+			m_cmdPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &m_cmdbuf);
+			m_sem = m_device->createSemaphore(0);
 
-			smart_refctd_ptr<IGPUCommandBuffer> transitionCmdBuf;
-			transitionCmdPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &transitionCmdBuf);
-
-			transitionCmdBuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
-
-		
-			for (uint32_t i = 0; i < FRAMES_IN_FLIGHT; i++)
+			m_cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
 			{
-				IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> barrier = {};
-				barrier.oldLayout = IImage::LAYOUT::UNDEFINED;
-				barrier.newLayout = IImage::LAYOUT::GENERAL;
-				barrier.image = stagingImages[i].get();
-				barrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
-				barrier.subresourceRange.baseMipLevel = 0;
-				barrier.subresourceRange.levelCount = 1;
-				barrier.subresourceRange.baseArrayLayer = 0;
-				barrier.subresourceRange.layerCount = 1;
-				barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::NONE;
-				barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS;
-				barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::NONE;
-				barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::HOST_BIT;
-
-				transitionCmdBuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&barrier, 1} });
+				IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> initBarrier = {};
+				initBarrier.oldLayout = IImage::LAYOUT::UNDEFINED;
+				initBarrier.newLayout = IImage::LAYOUT::GENERAL;
+				initBarrier.image = m_destinationImage.get();
+				initBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
+				initBarrier.subresourceRange.baseMipLevel = 0;
+				initBarrier.subresourceRange.levelCount = 1;
+				initBarrier.subresourceRange.baseArrayLayer = 0;
+				initBarrier.subresourceRange.layerCount = 1;
+				initBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::NONE;
+				initBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS;
+				initBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::NONE;
+				initBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT;
+				m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&initBarrier, 1} });
 			}
-
-			transitionCmdBuf->end();
+			m_cmdbuf->end();
 
 			IQueue::SSubmitInfo submitInfo = {};
-			IQueue::SSubmitInfo::SCommandBufferInfo cmdBufInfo = { .cmdbuf = transitionCmdBuf.get() };
+			IQueue::SSubmitInfo::SCommandBufferInfo cmdBufInfo = { .cmdbuf = m_cmdbuf.get() };
 			submitInfo.commandBuffers = { &cmdBufInfo, 1 };
 
-			queue->submit({ &submitInfo, 1 });
-			m_device->waitIdle();
-			double throughputImageStaging = runBenchmarkImageStaging(
-				"Image-to-Image",
-				stagingImages,              
-				imageMemoryOffsets,         
-				memoryAllocation.memory.get(),
-				mappedPtr,                  
-				destinationImage.get(),     
-				TILE_SIZE,
-				TILE_SIZE_BYTES,
-				TILES_PER_FRAME,
-				FRAMES_IN_FLIGHT,
-				TOTAL_FRAMES,
-				queue
-			);
+			IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = {
+				.semaphore = m_sem.get(),
+				.value = 1,
+				.stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
+			};
+			submitInfo.signalSemaphores = { &signalInfo, 1 };
 
-			m_logger->log("Image-to-Image staging throughput: %.2f GB/s",
-				ILogger::ELL_PERFORMANCE, throughputImageStaging);
+			m_queue->submit({ &submitInfo, 1 });
+
+			ISemaphore::SWaitInfo waitInfo = { .semaphore = m_sem.get(), .value = 1 };
+			m_device->blockForSemaphores({ &waitInfo, 1 });
 		}
 
-		m_logger->log("\nWaiting 5 seconds before exit...", ILogger::ELL_PERFORMANCE);
-		std::this_thread::sleep_for(std::chrono::seconds(5));
+		m_logger->log("Setup complete. Running verification loop (%u frames)", ILogger::ELL_PERFORMANCE, VERIFICATION_LOOP_COUNT);
 
 		return true;
 	}
 
-	bool keepRunning() override { return false; }
-	void workLoopBody() override {}
-	bool onAppTerminated() override { return true; }
+	bool keepRunning() override { return m_frameIndex < VERIFICATION_LOOP_COUNT; } // true until workLoopBody has advanced m_frameIndex to VERIFICATION_LOOP_COUNT
+
+	void workLoopBody() override // One verification frame: store/load round-trip through the image for the linear, snake and Morton pipelines, then a CPU compare of each readback.
+	{
+		m_cmdPool->reset();
+
+		// Zero the three readback buffers, flushing each one whose memory is not host-coherent (presumably so stale data from an earlier frame cannot pass the compare).
+		memset(m_readbackMappedPtr, 0, TILE_SIZE_BYTES);
+		if (!m_readbackAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+		{
+			ILogicalDevice::MappedMemoryRange range(m_readbackAlloc.memory.get(), 0, TILE_SIZE_BYTES);
+			m_device->flushMappedMemoryRanges(1, &range);
+		}
+		memset(m_snakeReadbackMappedPtr, 0, TILE_SIZE_BYTES);
+		if (!m_snakeReadbackAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+		{
+			ILogicalDevice::MappedMemoryRange range(m_snakeReadbackAlloc.memory.get(), 0, TILE_SIZE_BYTES);
+			m_device->flushMappedMemoryRanges(1, &range);
+		}
+		memset(m_mortonReadbackMappedPtr, 0, TILE_SIZE_BYTES);
+		if (!m_mortonReadbackAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+		{
+			ILogicalDevice::MappedMemoryRange range(m_mortonReadbackAlloc.memory.get(), 0, TILE_SIZE_BYTES);
+			m_device->flushMappedMemoryRanges(1, &range);
+		}
+		// Record the whole frame into the single command buffer.
+		m_cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+		// Image stays in GENERAL; this barrier orders earlier compute writes before this frame's first store.
+		{
+			IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> barrier = {};
+			barrier.oldLayout = IImage::LAYOUT::GENERAL;
+			barrier.newLayout = IImage::LAYOUT::GENERAL;
+			barrier.image = m_destinationImage.get();
+			barrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
+			barrier.subresourceRange.baseMipLevel = 0;
+			barrier.subresourceRange.levelCount = 1;
+			barrier.subresourceRange.baseArrayLayer = 0;
+			barrier.subresourceRange.layerCount = 1;
+			barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS;
+			barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS;
+			barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT;
+			barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT;
+			m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&barrier, 1} });
+		}
+		// LINEAR VERIFICATION: store dispatch, with push constants carrying the staging buffer's device address.
+		m_cmdbuf->bindComputePipeline(m_storePipeline.get());
+		const IGPUDescriptorSet* sets[] = { m_ds.get() };
+		m_cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pipelineLayout.get(), 0, 1, sets);
+
+		SPushConstantData storePc = {
+			.deviceBufferAddress = m_stagingBuffer->getDeviceAddress(),
+			.dstOffsetX = 0,
+			.dstOffsetY = 0,
+			.srcWidth = TILE_SIZE,
+			.srcHeight = TILE_SIZE
+		};
+		m_cmdbuf->pushConstants(m_pipelineLayout.get(), hlsl::ShaderStage::ESS_COMPUTE, 0, sizeof(SPushConstantData), &storePc);
+		m_cmdbuf->dispatch(TILE_SIZE * TILE_SIZE / 128u, 1u, 1u); // NOTE(review): assumes the shader's workgroup covers 128 pixels - confirm against the HLSL
+		// Make the store's image writes visible to the load dispatch that follows.
+		{
+			IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> midBarrier = {};
+			midBarrier.oldLayout = IImage::LAYOUT::GENERAL;
+			midBarrier.newLayout = IImage::LAYOUT::GENERAL;
+			midBarrier.image = m_destinationImage.get();
+			midBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
+			midBarrier.subresourceRange.baseMipLevel = 0;
+			midBarrier.subresourceRange.levelCount = 1;
+			midBarrier.subresourceRange.baseArrayLayer = 0;
+			midBarrier.subresourceRange.layerCount = 1;
+			midBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS;
+			midBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS;
+			midBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT;
+			midBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT;
+			m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&midBarrier, 1} });
+		}
+		// Linear load dispatch: push constants now carry the linear readback buffer's device address.
+		m_cmdbuf->bindComputePipeline(m_loadPipeline.get());
+		m_cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pipelineLayout.get(), 0, 1, sets);
+
+		SPushConstantData loadPc = {
+			.deviceBufferAddress = m_readbackBuffer->getDeviceAddress(),
+			.dstOffsetX = 0,
+			.dstOffsetY = 0,
+			.srcWidth = TILE_SIZE,
+			.srcHeight = TILE_SIZE
+		};
+		m_cmdbuf->pushConstants(m_pipelineLayout.get(), hlsl::ShaderStage::ESS_COMPUTE, 0, sizeof(SPushConstantData), &loadPc);
+		m_cmdbuf->dispatch(TILE_SIZE * TILE_SIZE / 128u, 1u, 1u);
+		// Compute-shader writes -> host reads, for the readback buffer the CPU inspects after the wait.
+		{
+			asset::SMemoryBarrier memBarrier = {
+				.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+				.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
+				.dstStageMask = PIPELINE_STAGE_FLAGS::HOST_BIT,
+				.dstAccessMask = ACCESS_FLAGS::HOST_READ_BIT
+			};
+			m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} });
+		}
+
+		// SNAKE VERIFICATION: same store/load round-trip with m_snakeStorePipeline / m_snakeLoadPipeline; this barrier orders the previous image reads before the new writes.
+		{
+			IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> snakePreBarrier = {};
+			snakePreBarrier.oldLayout = IImage::LAYOUT::GENERAL;
+			snakePreBarrier.newLayout = IImage::LAYOUT::GENERAL;
+			snakePreBarrier.image = m_destinationImage.get();
+			snakePreBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
+			snakePreBarrier.subresourceRange.baseMipLevel = 0;
+			snakePreBarrier.subresourceRange.levelCount = 1;
+			snakePreBarrier.subresourceRange.baseArrayLayer = 0;
+			snakePreBarrier.subresourceRange.layerCount = 1;
+			snakePreBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::SHADER_READ_BITS;
+			snakePreBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS;
+			snakePreBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT;
+			snakePreBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT;
+			m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&snakePreBarrier, 1} });
+		}
+		// Snake store reuses storePc, i.e. the same staging buffer as the linear store.
+		m_cmdbuf->bindComputePipeline(m_snakeStorePipeline.get());
+		m_cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pipelineLayout.get(), 0, 1, sets);
+		m_cmdbuf->pushConstants(m_pipelineLayout.get(), hlsl::ShaderStage::ESS_COMPUTE, 0, sizeof(SPushConstantData), &storePc);
+		m_cmdbuf->dispatch(TILE_SIZE * TILE_SIZE / 128u, 1u, 1u);
+		// Snake store writes -> snake load reads.
+		{
+			IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> snakeMidBarrier = {};
+			snakeMidBarrier.oldLayout = IImage::LAYOUT::GENERAL;
+			snakeMidBarrier.newLayout = IImage::LAYOUT::GENERAL;
+			snakeMidBarrier.image = m_destinationImage.get();
+			snakeMidBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
+			snakeMidBarrier.subresourceRange.baseMipLevel = 0;
+			snakeMidBarrier.subresourceRange.levelCount = 1;
+			snakeMidBarrier.subresourceRange.baseArrayLayer = 0;
+			snakeMidBarrier.subresourceRange.layerCount = 1;
+			snakeMidBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS;
+			snakeMidBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS;
+			snakeMidBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT;
+			snakeMidBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT;
+			m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&snakeMidBarrier, 1} });
+		}
+
+		m_cmdbuf->bindComputePipeline(m_snakeLoadPipeline.get());
+		m_cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pipelineLayout.get(), 0, 1, sets);
+
+		SPushConstantData snakeLoadPc = {
+			.deviceBufferAddress = m_snakeReadbackBuffer->getDeviceAddress(),
+			.dstOffsetX = 0,
+			.dstOffsetY = 0,
+			.srcWidth = TILE_SIZE,
+			.srcHeight = TILE_SIZE
+		};
+		m_cmdbuf->pushConstants(m_pipelineLayout.get(), hlsl::ShaderStage::ESS_COMPUTE, 0, sizeof(SPushConstantData), &snakeLoadPc);
+		m_cmdbuf->dispatch(TILE_SIZE * TILE_SIZE / 128u, 1u, 1u);
+		// Compute-shader writes -> host reads (snake readback).
+		{
+			asset::SMemoryBarrier memBarrier = {
+				.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+				.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
+				.dstStageMask = PIPELINE_STAGE_FLAGS::HOST_BIT,
+				.dstAccessMask = ACCESS_FLAGS::HOST_READ_BIT
+			};
+			m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} });
+		}
+
+		// MORTON VERIFICATION: same store/load round-trip with m_mortonStorePipeline / m_mortonLoadPipeline.
+		// This barrier orders the snake load's image reads before the Morton store's writes.
+		{
+			IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> mortonPreBarrier = {};
+			mortonPreBarrier.oldLayout = IImage::LAYOUT::GENERAL;
+			mortonPreBarrier.newLayout = IImage::LAYOUT::GENERAL;
+			mortonPreBarrier.image = m_destinationImage.get();
+			mortonPreBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
+			mortonPreBarrier.subresourceRange.baseMipLevel = 0;
+			mortonPreBarrier.subresourceRange.levelCount = 1;
+			mortonPreBarrier.subresourceRange.baseArrayLayer = 0;
+			mortonPreBarrier.subresourceRange.layerCount = 1;
+			mortonPreBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::SHADER_READ_BITS;
+			mortonPreBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS;
+			mortonPreBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT;
+			mortonPreBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT;
+			m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&mortonPreBarrier, 1} });
+		}
+		// Morton store also reuses storePc (same staging buffer).
+		m_cmdbuf->bindComputePipeline(m_mortonStorePipeline.get());
+		m_cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pipelineLayout.get(), 0, 1, sets);
+		m_cmdbuf->pushConstants(m_pipelineLayout.get(), hlsl::ShaderStage::ESS_COMPUTE, 0, sizeof(SPushConstantData), &storePc);
+		m_cmdbuf->dispatch(TILE_SIZE * TILE_SIZE / 128u, 1u, 1u);
+		// Morton store writes -> Morton load reads.
+		{
+			IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> mortonMidBarrier = {};
+			mortonMidBarrier.oldLayout = IImage::LAYOUT::GENERAL;
+			mortonMidBarrier.newLayout = IImage::LAYOUT::GENERAL;
+			mortonMidBarrier.image = m_destinationImage.get();
+			mortonMidBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
+			mortonMidBarrier.subresourceRange.baseMipLevel = 0;
+			mortonMidBarrier.subresourceRange.levelCount = 1;
+			mortonMidBarrier.subresourceRange.baseArrayLayer = 0;
+			mortonMidBarrier.subresourceRange.layerCount = 1;
+			mortonMidBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS;
+			mortonMidBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS;
+			mortonMidBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT;
+			mortonMidBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT;
+			m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&mortonMidBarrier, 1} });
+		}
+
+		m_cmdbuf->bindComputePipeline(m_mortonLoadPipeline.get());
+		m_cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pipelineLayout.get(), 0, 1, sets);
+
+		SPushConstantData mortonLoadPc = {
+			.deviceBufferAddress = m_mortonReadbackBuffer->getDeviceAddress(),
+			.dstOffsetX = 0,
+			.dstOffsetY = 0,
+			.srcWidth = TILE_SIZE,
+			.srcHeight = TILE_SIZE
+		};
+		m_cmdbuf->pushConstants(m_pipelineLayout.get(), hlsl::ShaderStage::ESS_COMPUTE, 0, sizeof(SPushConstantData), &mortonLoadPc);
+		m_cmdbuf->dispatch(TILE_SIZE * TILE_SIZE / 128u, 1u, 1u);
+		// Compute-shader writes -> host reads (Morton readback).
+		{
+			asset::SMemoryBarrier memBarrier = {
+				.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+				.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS,
+				.dstStageMask = PIPELINE_STAGE_FLAGS::HOST_BIT,
+				.dstAccessMask = ACCESS_FLAGS::HOST_READ_BIT
+			};
+			m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} });
+		}
+
+		m_cmdbuf->end();
+
+		// Submit the frame and block the CPU until the semaphore reaches this frame's value.
+		uint64_t semValue = m_frameIndex + 2; // +2 because value 1 was used in init
+		IQueue::SSubmitInfo submitInfo = {};
+		IQueue::SSubmitInfo::SCommandBufferInfo cmdBufInfo = { .cmdbuf = m_cmdbuf.get() };
+		submitInfo.commandBuffers = { &cmdBufInfo, 1 };
+
+		IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = {
+			.semaphore = m_sem.get(),
+			.value = semValue,
+			.stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
+		};
+		submitInfo.signalSemaphores = { &signalInfo, 1 };
+
+		// Capture (RenderDoc, per the original note) is started and ended around the first frame's submit only.
+		if (m_frameIndex == 0)
+			m_api->startCapture();
+
+		m_queue->submit({ &submitInfo, 1 });
+
+		if (m_frameIndex == 0)
+			m_api->endCapture();
+
+		ISemaphore::SWaitInfo waitInfo = { .semaphore = m_sem.get(), .value = semValue };
+		m_device->blockForSemaphores({ &waitInfo, 1 });
+		// Readback memory that is not host-coherent is invalidated before the CPU reads it.
+		if (!m_readbackAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+		{
+			ILogicalDevice::MappedMemoryRange range(m_readbackAlloc.memory.get(), 0, TILE_SIZE_BYTES);
+			m_device->invalidateMappedMemoryRanges(1, &range);
+		}
+		if (!m_snakeReadbackAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+		{
+			ILogicalDevice::MappedMemoryRange range(m_snakeReadbackAlloc.memory.get(), 0, TILE_SIZE_BYTES);
+			m_device->invalidateMappedMemoryRanges(1, &range);
+		}
+		if (!m_mortonReadbackAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+		{
+			ILogicalDevice::MappedMemoryRange range(m_mortonReadbackAlloc.memory.get(), 0, TILE_SIZE_BYTES);
+			m_device->invalidateMappedMemoryRanges(1, &range);
+		}
+		// CPU compare (linear): readback pixel i must equal staging pixel i; count matches and remember the first mismatch.
+		const uint32_t* srcPixels = static_cast<const uint32_t*>(m_stagingMappedPtr);
+		const uint32_t* dstPixels = static_cast<const uint32_t*>(m_readbackMappedPtr);
+		uint32_t totalPixels = TILE_SIZE * TILE_SIZE;
+		uint32_t matchCount = 0;
+		uint32_t firstMismatchIdx = ~0u;
+
+		for (uint32_t i = 0; i < totalPixels; i++)
+		{
+			if (srcPixels[i] == dstPixels[i])
+				matchCount++;
+			else if (firstMismatchIdx == ~0u)
+				firstMismatchIdx = i;
+		}
+
+		if (matchCount == totalPixels)
+		{
+			if (m_frameIndex == 0) // PASS is logged for the first frame only; failures are logged every frame
+				m_logger->log("Frame %u: Linear PASS - All %u pixels match.", ILogger::ELL_PERFORMANCE, m_frameIndex, totalPixels);
+		}
+		else
+		{
+			m_logger->log("Frame %u: Linear FAIL %u / %u pixels matched. First mismatch at pixel %u: expected 0x%08X, got 0x%08X",
+				ILogger::ELL_ERROR, m_frameIndex, matchCount, totalPixels, firstMismatchIdx, srcPixels[firstMismatchIdx], dstPixels[firstMismatchIdx]);
+		}
+		// CPU compare (snake): same check against the snake readback buffer.
+		const uint32_t* snakeDstPixels = static_cast<const uint32_t*>(m_snakeReadbackMappedPtr);
+		uint32_t snakeMatchCount = 0;
+		uint32_t snakeFirstMismatchIdx = ~0u;
+
+		for (uint32_t i = 0; i < totalPixels; i++)
+		{
+			if (srcPixels[i] == snakeDstPixels[i])
+				snakeMatchCount++;
+			else if (snakeFirstMismatchIdx == ~0u)
+				snakeFirstMismatchIdx = i;
+		}
+
+		if (snakeMatchCount == totalPixels)
+		{
+			if (m_frameIndex == 0)
+				m_logger->log("Frame %u: Snake PASS All %u pixels match.", ILogger::ELL_PERFORMANCE, m_frameIndex, totalPixels);
+		}
+		else
+		{
+			m_logger->log("Frame %u: Snake FAIL %u / %u pixels matched. First mismatch at pixel %u: expected 0x%08X, got 0x%08X",
+				ILogger::ELL_ERROR, m_frameIndex, snakeMatchCount, totalPixels, snakeFirstMismatchIdx, srcPixels[snakeFirstMismatchIdx], snakeDstPixels[snakeFirstMismatchIdx]);
+		}
+		// CPU compare (Morton): same check against the Morton readback buffer.
+		const uint32_t* mortonDstPixels = static_cast<const uint32_t*>(m_mortonReadbackMappedPtr);
+		uint32_t mortonMatchCount = 0;
+		uint32_t mortonFirstMismatchIdx = ~0u;
+
+		for (uint32_t i = 0; i < totalPixels; i++)
+		{
+			if (srcPixels[i] == mortonDstPixels[i])
+				mortonMatchCount++;
+			else if (mortonFirstMismatchIdx == ~0u)
+				mortonFirstMismatchIdx = i;
+		}
+
+		if (mortonMatchCount == totalPixels)
+		{
+			if (m_frameIndex == 0)
+				m_logger->log("Frame %u: Morton PASS All %u pixels match.", ILogger::ELL_PERFORMANCE, m_frameIndex, totalPixels);
+		}
+		else
+		{
+			m_logger->log("Frame %u: Morton FAIL %u / %u pixels matched. First mismatch at pixel %u: expected 0x%08X, got 0x%08X",
+				ILogger::ELL_ERROR, m_frameIndex, mortonMatchCount, totalPixels, mortonFirstMismatchIdx, srcPixels[mortonFirstMismatchIdx], mortonDstPixels[mortonFirstMismatchIdx]);
+		}
+
+		m_frameIndex++;
+	}
+
+	bool onAppTerminated() override // Runs the benchmarks, pauses for 5 seconds, unmaps the verification buffers, and always returns true.
+	{
+		runAllBenchmarks();
+		// Pause after logging (presumably so the results stay readable before the process exits).
+		m_logger->log("\nResults above. Waiting 5 seconds before exit...", ILogger::ELL_PERFORMANCE);
+		std::this_thread::sleep_for(std::chrono::seconds(5));
+		// Unmap each staging/readback allocation whose memory object exists.
+		if (m_stagingAlloc.memory)
+			m_stagingAlloc.memory->unmap();
+		if (m_readbackAlloc.memory)
+			m_readbackAlloc.memory->unmap();
+		if (m_snakeReadbackAlloc.memory)
+			m_snakeReadbackAlloc.memory->unmap();
+		if (m_mortonReadbackAlloc.memory)
+			m_mortonReadbackAlloc.memory->unmap();
+		return true;
+	}
 
 protected:
 	core::vector<queue_req_t> getQueueRequirements() const override
@@ -333,6 +691,183 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 	}
 
 private:
+	static constexpr uint32_t TILE_SIZE = 128; // edge length of one square tile, in pixels
+	static constexpr uint32_t TILE_BYTES_PER_PIXEL = 4; // bytes of tile data per pixel
+	static constexpr uint32_t TILE_SIZE_BYTES = TILE_SIZE * TILE_SIZE * TILE_BYTES_PER_PIXEL; // bytes in one full tile (128 * 128 * 4 = 64 KiB)
+	static constexpr uint32_t VERIFICATION_LOOP_COUNT = 300; // NOTE(review): presumably the number of verification passes; its use is outside this excerpt - confirm
+
+	struct SPushConstantData // push constants sent to the compute stage at offset 0 by runBenchmarkCompute; NOTE(review): layout presumably mirrors the shader-side struct - confirm
+	{
+		uint64_t deviceBufferAddress; // runBenchmarkCompute passes the staging buffer's device address plus the frame's partition offset
+		uint32_t dstOffsetX; // runBenchmarkCompute passes 0
+		uint32_t dstOffsetY; // runBenchmarkCompute passes 0
+		uint32_t srcWidth; // runBenchmarkCompute passes the tile size
+		uint32_t srcHeight; // runBenchmarkCompute passes the tile size
+		uint32_t tilesPerRow; // runBenchmarkCompute passes image width / tile size
+	};
+
+	IQueue* m_queue = nullptr; // queue handed to runBenchmark / runBenchmarkCompute for all submissions
+	smart_refctd_ptr<IGPUImage> m_destinationImage; // upload target passed to every benchmark run
+	smart_refctd_ptr<IGPUComputePipeline> m_storePipeline;
+	smart_refctd_ptr<IGPUComputePipeline> m_loadPipeline;
+	smart_refctd_ptr<IGPUComputePipeline> m_snakeStorePipeline;
+	smart_refctd_ptr<IGPUComputePipeline> m_snakeLoadPipeline;
+	smart_refctd_ptr<IGPUComputePipeline> m_mortonStorePipeline;
+	smart_refctd_ptr<IGPUComputePipeline> m_mortonLoadPipeline;
+	smart_refctd_ptr<IGPUComputePipeline> m_batchedLinearPipeline; // pipeline of the "Linear Compute" benchmarks
+	smart_refctd_ptr<IGPUComputePipeline> m_batchedSnakePipeline; // pipeline of the "Snake Compute" benchmarks
+	smart_refctd_ptr<IGPUComputePipeline> m_batchedMortonPipeline; // pipeline of the "Morton Compute" benchmarks
+	smart_refctd_ptr<IGPUPipelineLayout> m_pipelineLayout; // layout passed to all batched compute benchmarks
+	smart_refctd_ptr<IGPUDescriptorSet> m_ds; // bound as descriptor set 0 by runBenchmarkCompute
+	smart_refctd_ptr<IGPUBuffer> m_stagingBuffer;
+	smart_refctd_ptr<IGPUBuffer> m_readbackBuffer;
+	smart_refctd_ptr<IGPUBuffer> m_snakeReadbackBuffer;
+	smart_refctd_ptr<IGPUBuffer> m_mortonReadbackBuffer;
+	IDeviceMemoryAllocator::SAllocation m_stagingAlloc; // this and the three allocations below are unmapped in onAppTerminated
+	IDeviceMemoryAllocator::SAllocation m_readbackAlloc;
+	IDeviceMemoryAllocator::SAllocation m_snakeReadbackAlloc;
+	IDeviceMemoryAllocator::SAllocation m_mortonReadbackAlloc;
+	void* m_stagingMappedPtr = nullptr;
+	void* m_readbackMappedPtr = nullptr;
+	void* m_snakeReadbackMappedPtr = nullptr;
+	void* m_mortonReadbackMappedPtr = nullptr; // read as uint32_t pixels and compared against the source pixels during verification
+	smart_refctd_ptr<IGPUCommandPool> m_cmdPool;
+	smart_refctd_ptr<IGPUCommandBuffer> m_cmdbuf;
+	smart_refctd_ptr<ISemaphore> m_sem;
+	uint32_t m_frameIndex = 0; // incremented after each verification pass; PASS lines are logged only while it is 0
+
+	// Runs the CopyBufferToImage benchmark and the batched linear/snake/Morton compute benchmarks
+	// on a staging buffer in host-visible non-device-local memory and, when available, in
+	// host-visible device-local memory, then logs one summary table with all collected results.
+	void runAllBenchmarks()
+	{
+		constexpr uint32_t STAGING_BUFFER_SIZE = 64 * 1024 * 1024;
+		constexpr uint32_t FRAMES_IN_FLIGHT = 4;
+		constexpr uint32_t TILES_PER_FRAME = STAGING_BUFFER_SIZE / (TILE_SIZE_BYTES * FRAMES_IN_FLIGHT); // each in-flight frame gets an equal share of the staging buffer
+		constexpr uint32_t TOTAL_FRAMES = 1000;
+
+		uint32_t hostVisibleBits = m_physicalDevice->getHostVisibleMemoryTypeBits();
+		uint32_t deviceLocalBits = m_physicalDevice->getDeviceLocalMemoryTypeBits();
+		uint32_t hostVisibleOnlyBits = hostVisibleBits & ~deviceLocalBits;
+		uint32_t hostVisibleDeviceLocalBits = hostVisibleBits & deviceLocalBits;
+
+		m_logger->log("\n=== RUNNING BENCHMARKS ===", ILogger::ELL_PERFORMANCE);
+
+		struct BenchmarkResult
+		{
+			const char* name;
+			double wallGBps;
+			double gpuGBps;
+			double memcpyGBps;
+		};
+		std::vector<BenchmarkResult> results;
+
+		// SysRAM benchmarks (skipped like BAR/VRAM below when the type mask is empty: allocation could only fail)
+		if (hostVisibleOnlyBits)
+		{
+			smart_refctd_ptr<IGPUBuffer> benchStagingBuffer;
+			IDeviceMemoryAllocator::SAllocation benchStagingAlloc;
+			void* benchMappedPtr = nullptr;
+
+			if (createStagingBuffer(STAGING_BUFFER_SIZE, hostVisibleOnlyBits,
+				"Benchmark Staging (SysRAM)", benchStagingBuffer, benchStagingAlloc, benchMappedPtr))
+			{
+				m_logger->log("\n--- CopyBufferToImage (SysRAM) ---", ILogger::ELL_PERFORMANCE);
+				auto rCopy = runBenchmark("CopyBufferToImage (SysRAM)",
+					benchStagingBuffer.get(), benchStagingAlloc, benchMappedPtr,
+					m_destinationImage.get(), TILE_SIZE, TILE_SIZE_BYTES,
+					TILES_PER_FRAME, FRAMES_IN_FLIGHT, TOTAL_FRAMES, m_queue);
+				results.push_back({"CopyBufferToImage (SysRAM)", rCopy.wallGBps, rCopy.gpuGBps, rCopy.memcpyGBps});
+
+				m_logger->log("\n--- Linear Compute (SysRAM) ---", ILogger::ELL_PERFORMANCE);
+				auto rLinear = runBenchmarkCompute("Linear Compute (SysRAM)",
+					benchStagingBuffer.get(), benchStagingAlloc, benchMappedPtr,
+					m_destinationImage.get(), m_batchedLinearPipeline.get(), m_pipelineLayout.get(), m_ds.get(),
+					TILE_SIZE, TILE_SIZE_BYTES,
+					TILES_PER_FRAME, FRAMES_IN_FLIGHT, TOTAL_FRAMES, m_queue);
+				results.push_back({"Linear Compute (SysRAM)", rLinear.wallGBps, rLinear.gpuGBps, rLinear.memcpyGBps});
+
+				m_logger->log("\n--- Snake Compute (SysRAM) ---", ILogger::ELL_PERFORMANCE);
+				auto rSnake = runBenchmarkCompute("Snake Compute (SysRAM)",
+					benchStagingBuffer.get(), benchStagingAlloc, benchMappedPtr,
+					m_destinationImage.get(), m_batchedSnakePipeline.get(), m_pipelineLayout.get(), m_ds.get(),
+					TILE_SIZE, TILE_SIZE_BYTES,
+					TILES_PER_FRAME, FRAMES_IN_FLIGHT, TOTAL_FRAMES, m_queue);
+				results.push_back({"Snake Compute (SysRAM)", rSnake.wallGBps, rSnake.gpuGBps, rSnake.memcpyGBps});
+
+				m_logger->log("\n--- Morton Compute (SysRAM) ---", ILogger::ELL_PERFORMANCE);
+				auto rMorton = runBenchmarkCompute("Morton Compute (SysRAM)",
+					benchStagingBuffer.get(), benchStagingAlloc, benchMappedPtr,
+					m_destinationImage.get(), m_batchedMortonPipeline.get(), m_pipelineLayout.get(), m_ds.get(),
+					TILE_SIZE, TILE_SIZE_BYTES,
+					TILES_PER_FRAME, FRAMES_IN_FLIGHT, TOTAL_FRAMES, m_queue);
+				results.push_back({"Morton Compute (SysRAM)", rMorton.wallGBps, rMorton.gpuGBps, rMorton.memcpyGBps});
+
+				benchStagingAlloc.memory->unmap();
+			}
+		}
+
+		//BAR/VRAM benchmarks (if available)
+		if (hostVisibleDeviceLocalBits)
+		{
+			smart_refctd_ptr<IGPUBuffer> benchStagingBuffer;
+			IDeviceMemoryAllocator::SAllocation benchStagingAlloc;
+			void* benchMappedPtr = nullptr;
+
+			if (createStagingBuffer(STAGING_BUFFER_SIZE, hostVisibleDeviceLocalBits,
+				"Benchmark Staging (BAR/VRAM)", benchStagingBuffer, benchStagingAlloc, benchMappedPtr))
+			{
+				m_logger->log("\n--- CopyBufferToImage (BAR/VRAM) ---", ILogger::ELL_PERFORMANCE);
+				auto rCopy = runBenchmark("CopyBufferToImage (BAR/VRAM)",
+					benchStagingBuffer.get(), benchStagingAlloc, benchMappedPtr,
+					m_destinationImage.get(), TILE_SIZE, TILE_SIZE_BYTES,
+					TILES_PER_FRAME, FRAMES_IN_FLIGHT, TOTAL_FRAMES, m_queue);
+				results.push_back({"CopyBufferToImage (BAR/VRAM)", rCopy.wallGBps, rCopy.gpuGBps, rCopy.memcpyGBps});
+
+				m_logger->log("\n--- Linear Compute (BAR/VRAM) ---", ILogger::ELL_PERFORMANCE);
+				auto rLinear = runBenchmarkCompute("Linear Compute (BAR/VRAM)",
+					benchStagingBuffer.get(), benchStagingAlloc, benchMappedPtr,
+					m_destinationImage.get(), m_batchedLinearPipeline.get(), m_pipelineLayout.get(), m_ds.get(),
+					TILE_SIZE, TILE_SIZE_BYTES,
+					TILES_PER_FRAME, FRAMES_IN_FLIGHT, TOTAL_FRAMES, m_queue);
+				results.push_back({"Linear Compute (BAR/VRAM)", rLinear.wallGBps, rLinear.gpuGBps, rLinear.memcpyGBps});
+
+				m_logger->log("\n--- Snake Compute (BAR/VRAM) ---", ILogger::ELL_PERFORMANCE);
+				auto rSnake = runBenchmarkCompute("Snake Compute (BAR/VRAM)",
+					benchStagingBuffer.get(), benchStagingAlloc, benchMappedPtr,
+					m_destinationImage.get(), m_batchedSnakePipeline.get(), m_pipelineLayout.get(), m_ds.get(),
+					TILE_SIZE, TILE_SIZE_BYTES,
+					TILES_PER_FRAME, FRAMES_IN_FLIGHT, TOTAL_FRAMES, m_queue);
+				results.push_back({"Snake Compute (BAR/VRAM)", rSnake.wallGBps, rSnake.gpuGBps, rSnake.memcpyGBps});
+
+				m_logger->log("\n--- Morton Compute (BAR/VRAM) ---", ILogger::ELL_PERFORMANCE);
+				auto rMorton = runBenchmarkCompute("Morton Compute (BAR/VRAM)",
+					benchStagingBuffer.get(), benchStagingAlloc, benchMappedPtr,
+					m_destinationImage.get(), m_batchedMortonPipeline.get(), m_pipelineLayout.get(), m_ds.get(),
+					TILE_SIZE, TILE_SIZE_BYTES,
+					TILES_PER_FRAME, FRAMES_IN_FLIGHT, TOTAL_FRAMES, m_queue);
+				results.push_back({"Morton Compute (BAR/VRAM)", rMorton.wallGBps, rMorton.gpuGBps, rMorton.memcpyGBps});
+
+				benchStagingAlloc.memory->unmap();
+			}
+		}
+
+		//Summary table
+		m_logger->log("\n=== BENCHMARK RESULTS ===", ILogger::ELL_PERFORMANCE);
+		m_logger->log("%-36s | Wall GB/s | GPU GB/s | Memcpy GB/s", ILogger::ELL_PERFORMANCE, "Strategy");
+		m_logger->log("-------------------------------------+-----------+----------+------------", ILogger::ELL_PERFORMANCE);
+		for (const auto& r : results)
+			m_logger->log("%-36s | %9.2f | %8.2f | %10.2f", ILogger::ELL_PERFORMANCE, r.name, r.wallGBps, r.gpuGBps, r.memcpyGBps);
+		m_logger->log("=====================================+===========+==========+============", ILogger::ELL_PERFORMANCE);
+	}
+
+	struct BenchResult // throughput figures returned by runBenchmark / runBenchmarkCompute (bytes / 1024^3 per second)
+	{
+		double wallGBps; // uploaded data over wall time measured from loop start to just after the last submit
+		double gpuGBps; // uploaded data over GPU time extrapolated from the sampled timestamp queries
+		double memcpyGBps; // uploaded data over the accumulated time of the loop's memcpy section
+	};
+
 	void generateTileCopyRegions(
 		IImage::SBufferCopy* outRegions,
 		uint32_t tilesPerFrame,
@@ -359,7 +894,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 		}
 	}
 
-	double runBenchmark(
+	BenchResult runBenchmark(
 		const char* strategyName,
 		IGPUBuffer* stagingBuffer,
 		IDeviceMemoryAllocator::SAllocation& stagingAlloc,
@@ -566,14 +1101,16 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 			totalSubmitTime += std::chrono::duration<double>(t5 - t4).count();
 		}
 
+		// End marker is after last submit, NOT after GPU finishes.
+		auto endTime = std::chrono::high_resolution_clock::now();
+
 		ISemaphore::SWaitInfo finalWait = {
 			.semaphore = timelineSemaphore.get(),
 			.value = timelineValue
 		};
 		m_device->blockForSemaphores({ &finalWait, 1 });
 
-		auto endTime = std::chrono::high_resolution_clock::now();
-
+		// Read timestamps from the last completed flight of command buffers
 		std::vector<uint64_t> timestamps(framesInFlight * 2);
 		const core::bitflag flags = core::bitflag(IQueryPool::RESULTS_FLAGS::_64_BIT) | core::bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT);
 		m_device->getQueryPoolResults(queryPool.get(), 0, framesInFlight * 2, timestamps.data(), sizeof(uint64_t), flags);
@@ -586,26 +1123,31 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 		float timestampPeriod = m_physicalDevice->getLimits().timestampPeriodInNanoSeconds;
 		double sampledGpuTimeSeconds = (totalGpuTicks * timestampPeriod) / 1e9;
 
+		// GPU timestamps only represent the last framesInFlight frames (earlier ones were overwritten)
 		double avgGpuTimePerFrame = sampledGpuTimeSeconds / framesInFlight;
 		double totalGpuTimeSeconds = avgGpuTimePerFrame * totalFrames;
 
 
 		double elapsedSeconds = std::chrono::duration<double>(endTime - startTime).count();
 		uint64_t totalBytes = (uint64_t)totalFrames * tilesPerFrame * tileSizeBytes;
+		double totalGB = totalBytes / (1024.0 * 1024.0 * 1024.0);
 
-		double throughputGBps = (totalBytes / (1024.0 * 1024.0 * 1024.0)) / elapsedSeconds;
+		double wallThroughputGBps = totalGB / elapsedSeconds;
+		double gpuThroughputGBps = totalGB / totalGpuTimeSeconds;
 
-		m_logger->log("    GPU time: %.3f s", ILogger::ELL_PERFORMANCE, totalGpuTimeSeconds);
-		m_logger->log("    GPU throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, throughputGBps);
+		m_logger->log("    GPU time (extrapolated): %.3f s", ILogger::ELL_PERFORMANCE, totalGpuTimeSeconds);
+		m_logger->log("    CPU submit throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, wallThroughputGBps);
+		m_logger->log("    GPU only throughput:   %.2f GB/s", ILogger::ELL_PERFORMANCE, gpuThroughputGBps);
 
 		m_logger->log("  Timing breakdown for %s:", ILogger::ELL_PERFORMANCE, strategyName);
 		m_logger->log("    Wait time:   %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalWaitTime, 100.0 * totalWaitTime / elapsedSeconds);
 		m_logger->log("    Memcpy time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalMemcpyTime, 100.0 * totalMemcpyTime / elapsedSeconds);
 		m_logger->log("    Record time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalRecordTime, 100.0 * totalRecordTime / elapsedSeconds);
 		m_logger->log("    Submit time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalSubmitTime, 100.0 * totalSubmitTime / elapsedSeconds);
-		m_logger->log("    Memcpy speed: %.2f GB/s", ILogger::ELL_PERFORMANCE, (totalBytes / (1024.0 * 1024.0 * 1024.0)) / totalMemcpyTime);
+		double memcpyGBps = totalGB / totalMemcpyTime;
+		m_logger->log("    Memcpy speed: %.2f GB/s", ILogger::ELL_PERFORMANCE, memcpyGBps);
 
-		return throughputGBps;
+		return { wallThroughputGBps, gpuThroughputGBps, memcpyGBps };
 	}
 
 
@@ -897,6 +1439,275 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 		return throughputGBps;
 	}
 
+	// Benchmarks image upload through a compute pipeline: for each of totalFrames frames it copies
+	// tilesPerFrame tiles into one of framesInFlight staging partitions and submits a single dispatch.
+	// Returns wall-clock (up to the last submit), extrapolated GPU-only and memcpy throughput in GB/s.
+	BenchResult runBenchmarkCompute(
+		const char* strategyName,
+		IGPUBuffer* stagingBuffer,
+		IDeviceMemoryAllocator::SAllocation& stagingAlloc,
+		void* mappedPtr,
+		IGPUImage* destinationImage,
+		IGPUComputePipeline* pipeline,
+		IGPUPipelineLayout* pipelineLayout,
+		IGPUDescriptorSet* ds,
+		uint32_t tileSize,
+		uint32_t tileSizeBytes,
+		uint32_t tilesPerFrame,
+		uint32_t framesInFlight,
+		uint32_t totalFrames,
+		IQueue* queue)
+	{
+		smart_refctd_ptr<ISemaphore> timelineSemaphore = m_device->createSemaphore(0);
+
+		smart_refctd_ptr<IQueryPool> queryPool;
+		{
+			IQueryPool::SCreationParams queryPoolParams = {};
+			queryPoolParams.queryType = IQueryPool::TYPE::TIMESTAMP;
+			queryPoolParams.queryCount = framesInFlight * 2;
+			queryPoolParams.pipelineStatisticsFlags = IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE;
+			queryPool = m_device->createQueryPool(queryPoolParams);
+		}
+
+		std::vector<smart_refctd_ptr<IGPUCommandPool>> commandPools(framesInFlight);
+		std::vector<smart_refctd_ptr<IGPUCommandBuffer>> commandBuffers(framesInFlight);
+		for (uint32_t i = 0; i < framesInFlight; i++)
+		{
+			commandPools[i] = m_device->createCommandPool(
+				queue->getFamilyIndex(),
+				IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT
+			);
+			commandPools[i]->createCommandBuffers(
+				IGPUCommandPool::BUFFER_LEVEL::PRIMARY,
+				1,
+				&commandBuffers[i]
+			);
+		}
+
+		uint64_t timelineValue = 0;
+
+		commandBuffers[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); // one-off submit: move the image from UNDEFINED to GENERAL
+		{
+			IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> initBarrier = {};
+			initBarrier.oldLayout = IImage::LAYOUT::UNDEFINED;
+			initBarrier.newLayout = IImage::LAYOUT::GENERAL;
+			initBarrier.image = destinationImage;
+			initBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
+			initBarrier.subresourceRange.baseMipLevel = 0;
+			initBarrier.subresourceRange.levelCount = 1;
+			initBarrier.subresourceRange.baseArrayLayer = 0;
+			initBarrier.subresourceRange.layerCount = 1;
+			initBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::NONE;
+			initBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS;
+			initBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::NONE;
+			initBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT;
+			commandBuffers[0]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&initBarrier, 1} });
+		}
+		commandBuffers[0]->end();
+
+		IQueue::SSubmitInfo submitInfo = {};
+		IQueue::SSubmitInfo::SCommandBufferInfo cmdBufInfo = { .cmdbuf = commandBuffers[0].get() };
+		submitInfo.commandBuffers = { &cmdBufInfo, 1 };
+
+		IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = {
+			.semaphore = timelineSemaphore.get(),
+			.value = ++timelineValue,
+			.stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
+		};
+		submitInfo.signalSemaphores = { &signalInfo, 1 };
+
+		queue->submit({ &submitInfo, 1 });
+
+		ISemaphore::SWaitInfo waitInfo = {
+			.semaphore = timelineSemaphore.get(),
+			.value = timelineValue
+		};
+		m_device->blockForSemaphores({ &waitInfo, 1 }); // the layout transition must finish before commandBuffers[0] is reused below
+
+		uint32_t imageWidth = destinationImage->getCreationParameters().extent.width;
+		uint32_t tilesPerRow = imageWidth / tileSize;
+		uint32_t partitionSize = tilesPerFrame * tileSizeBytes; // bytes of staging memory owned by one in-flight frame
+
+		std::vector<uint8_t> cpuSourceData(partitionSize);
+		{
+			unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
+			std::mt19937 g(seed);
+			uint32_t* data = reinterpret_cast<uint32_t*>(cpuSourceData.data());
+			for (uint32_t i = 0; i < partitionSize / sizeof(uint32_t); i++)
+				data[i] = g();
+		}
+
+		double totalWaitTime = 0.0;
+		double totalMemcpyTime = 0.0;
+		double totalRecordTime = 0.0;
+		double totalSubmitTime = 0.0;
+
+		auto startTime = std::chrono::high_resolution_clock::now();
+
+		for (uint32_t frame = 0; frame < totalFrames; frame++)
+		{
+			uint32_t cmdBufIndex = frame % framesInFlight;
+
+			auto t1 = std::chrono::high_resolution_clock::now();
+			if (frame >= framesInFlight)
+			{
+				ISemaphore::SWaitInfo frameWaitInfo = {
+					.semaphore = timelineSemaphore.get(),
+					.value = timelineValue - framesInFlight + 1 // value signalled by the previous submit that used this slot
+				};
+				m_device->blockForSemaphores({ &frameWaitInfo, 1 });
+			}
+			auto t2 = std::chrono::high_resolution_clock::now();
+
+			commandPools[cmdBufIndex]->reset();
+
+			uint32_t bufferOffset = cmdBufIndex * partitionSize;
+			void* targetPtr = static_cast<uint8_t*>(mappedPtr) + bufferOffset;
+			memcpy(targetPtr, cpuSourceData.data(), partitionSize);
+
+			if (!stagingAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT))
+			{
+				ILogicalDevice::MappedMemoryRange range(stagingAlloc.memory.get(), bufferOffset, partitionSize);
+				m_device->flushMappedMemoryRanges(1, &range);
+			}
+
+			auto t3 = std::chrono::high_resolution_clock::now();
+
+			commandBuffers[cmdBufIndex]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT);
+
+			uint32_t queryStartIndex = cmdBufIndex * 2; // each slot owns two timestamp queries: start and end
+			commandBuffers[cmdBufIndex]->resetQueryPool(queryPool.get(), queryStartIndex, 2);
+
+			asset::SMemoryBarrier memBarrier = {
+				.srcStageMask = PIPELINE_STAGE_FLAGS::HOST_BIT,
+				.srcAccessMask = ACCESS_FLAGS::HOST_WRITE_BIT,
+				.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT,
+				.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS
+			};
+
+			IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> dstBarrier = {};
+			dstBarrier.oldLayout = IImage::LAYOUT::GENERAL;
+			dstBarrier.newLayout = IImage::LAYOUT::GENERAL;
+			dstBarrier.image = destinationImage;
+			dstBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
+			dstBarrier.subresourceRange.baseMipLevel = 0;
+			dstBarrier.subresourceRange.levelCount = 1;
+			dstBarrier.subresourceRange.baseArrayLayer = 0;
+			dstBarrier.subresourceRange.layerCount = 1;
+			dstBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS;
+			dstBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS;
+			dstBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT;
+			dstBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT;
+			commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, {
+				.memBarriers = {&memBarrier, 1},
+				.imgBarriers = {&dstBarrier, 1}
+			});
+
+			commandBuffers[cmdBufIndex]->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, queryPool.get(), queryStartIndex + 0);
+
+			commandBuffers[cmdBufIndex]->bindComputePipeline(pipeline);
+			const IGPUDescriptorSet* sets[] = { ds };
+			commandBuffers[cmdBufIndex]->bindDescriptorSets(asset::EPBP_COMPUTE, pipelineLayout, 0, 1, sets);
+
+			// Single dispatch covering all tiles at once
+			SPushConstantData pc = {
+				.deviceBufferAddress = stagingBuffer->getDeviceAddress() + bufferOffset,
+				.dstOffsetX = 0,
+				.dstOffsetY = 0,
+				.srcWidth = tileSize,
+				.srcHeight = tileSize,
+				.tilesPerRow = tilesPerRow
+			};
+			commandBuffers[cmdBufIndex]->pushConstants(pipelineLayout, hlsl::ShaderStage::ESS_COMPUTE, 0, sizeof(SPushConstantData), &pc);
+			commandBuffers[cmdBufIndex]->dispatch(tilesPerFrame * tileSize * tileSize / 128u, 1u, 1u); // NOTE(review): 128 presumably equals the shader's workgroup size - confirm
+
+			IGPUCommandBuffer::SImageMemoryBarrier<IGPUCommandBuffer::SOwnershipTransferBarrier> afterBarrier = {};
+			afterBarrier.oldLayout = IImage::LAYOUT::GENERAL;
+			afterBarrier.newLayout = IImage::LAYOUT::GENERAL;
+			afterBarrier.image = destinationImage;
+			afterBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT;
+			afterBarrier.subresourceRange.baseMipLevel = 0;
+			afterBarrier.subresourceRange.levelCount = 1;
+			afterBarrier.subresourceRange.baseArrayLayer = 0;
+			afterBarrier.subresourceRange.layerCount = 1;
+			afterBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS;
+			afterBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS;
+			afterBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT;
+			afterBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT;
+			commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&afterBarrier, 1} });
+
+			commandBuffers[cmdBufIndex]->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, queryPool.get(), queryStartIndex + 1);
+
+			commandBuffers[cmdBufIndex]->end();
+			auto t4 = std::chrono::high_resolution_clock::now();
+
+			IQueue::SSubmitInfo frameSubmitInfo = {};
+			IQueue::SSubmitInfo::SCommandBufferInfo frameCmdBufInfo = { .cmdbuf = commandBuffers[cmdBufIndex].get() };
+			frameSubmitInfo.commandBuffers = { &frameCmdBufInfo, 1 };
+
+			IQueue::SSubmitInfo::SSemaphoreInfo frameSignalInfo = {
+				.semaphore = timelineSemaphore.get(),
+				.value = ++timelineValue,
+				.stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT
+			};
+			frameSubmitInfo.signalSemaphores = { &frameSignalInfo, 1 };
+
+			queue->submit({ &frameSubmitInfo, 1 });
+			auto t5 = std::chrono::high_resolution_clock::now();
+
+			totalWaitTime += std::chrono::duration<double>(t2 - t1).count();
+			totalMemcpyTime += std::chrono::duration<double>(t3 - t2).count();
+			totalRecordTime += std::chrono::duration<double>(t4 - t3).count();
+			totalSubmitTime += std::chrono::duration<double>(t5 - t4).count();
+		}
+
+		// End marker is after last submit, NOT after GPU finishes.
+		auto endTime = std::chrono::high_resolution_clock::now();
+
+		ISemaphore::SWaitInfo finalWait = {
+			.semaphore = timelineSemaphore.get(),
+			.value = timelineValue
+		};
+		m_device->blockForSemaphores({ &finalWait, 1 });
+
+		// Read timestamps of the last flight of command buffers. Only slots of frames that were actually
+		// submitted are read: with WAIT_BIT, asking for a never-written query might never return.
+		const uint32_t sampledFrames = (totalFrames < framesInFlight) ? totalFrames : framesInFlight;
+		std::vector<uint64_t> timestamps(framesInFlight * 2);
+		const core::bitflag flags = core::bitflag(IQueryPool::RESULTS_FLAGS::_64_BIT) | core::bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT);
+		if (sampledFrames)
+			m_device->getQueryPoolResults(queryPool.get(), 0, sampledFrames * 2, timestamps.data(), sizeof(uint64_t), flags);
+		uint64_t totalGpuTicks = 0;
+		for (uint32_t i = 0; i < sampledFrames; i++)
+			totalGpuTicks += timestamps[i * 2 + 1] - timestamps[i * 2 + 0];
+		float timestampPeriod = m_physicalDevice->getLimits().timestampPeriodInNanoSeconds;
+		double sampledGpuTimeSeconds = (static_cast<double>(totalGpuTicks) * timestampPeriod) / 1e9; // multiply in double, not float
+
+		double avgGpuTimePerFrame = sampledFrames ? sampledGpuTimeSeconds / sampledFrames : 0.0;
+		double totalGpuTimeSeconds = avgGpuTimePerFrame * totalFrames;
+
+		double elapsedSeconds = std::chrono::duration<double>(endTime - startTime).count();
+		uint64_t totalBytes = (uint64_t)totalFrames * tilesPerFrame * tileSizeBytes;
+		double totalGB = totalBytes / (1024.0 * 1024.0 * 1024.0);
+
+		double wallThroughputGBps = totalGB / elapsedSeconds;
+		double gpuThroughputGBps = totalGB / totalGpuTimeSeconds;
+
+		m_logger->log("    GPU time (extrapolated): %.3f s", ILogger::ELL_PERFORMANCE, totalGpuTimeSeconds);
+		m_logger->log("    CPU submit throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, wallThroughputGBps);
+		m_logger->log("    GPU only throughput:   %.2f GB/s", ILogger::ELL_PERFORMANCE, gpuThroughputGBps);
+
+		m_logger->log("  Timing breakdown for %s:", ILogger::ELL_PERFORMANCE, strategyName);
+		m_logger->log("    Wait time:   %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalWaitTime, 100.0 * totalWaitTime / elapsedSeconds);
+		m_logger->log("    Memcpy time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalMemcpyTime, 100.0 * totalMemcpyTime / elapsedSeconds);
+		m_logger->log("    Record time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalRecordTime, 100.0 * totalRecordTime / elapsedSeconds);
+		m_logger->log("    Submit time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalSubmitTime, 100.0 * totalSubmitTime / elapsedSeconds);
+		double memcpyGBps = totalGB / totalMemcpyTime;
+		m_logger->log("    Memcpy speed: %.2f GB/s", ILogger::ELL_PERFORMANCE, memcpyGBps);
+
+		return { wallThroughputGBps, gpuThroughputGBps, memcpyGBps };
+	}
+
 	bool createStagingBuffer(
 		uint32_t bufferSize,
 		uint32_t memoryTypeBits,
@@ -907,7 +1718,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 	{
 		IGPUBuffer::SCreationParams params;
 		params.size = bufferSize;
-		params.usage = IGPUBuffer::EUF_TRANSFER_SRC_BIT;
+		params.usage = IGPUBuffer::EUF_TRANSFER_SRC_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT;
 		outBuffer = m_device->createBuffer(std::move(params));
 		if (!outBuffer)
 			return logFail("Failed to create GPU buffer of size %d!\n", bufferSize);
@@ -917,7 +1728,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp
 		auto reqs = outBuffer->getMemoryReqs();
 		reqs.memoryTypeBits &= memoryTypeBits;
 
-		outAllocation = m_device->allocate(reqs, outBuffer.get(), IDeviceMemoryAllocation::EMAF_NONE);
+		outAllocation = m_device->allocate(reqs, outBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT);
 		if (!outAllocation.isValid())
 			return logFail("Failed to allocate Device Memory!\n");