From 6635ba9cad53f599e4643aa90e7dfa9e800d144c Mon Sep 17 00:00:00 2001 From: CrabeExtra Date: Tue, 23 Dec 2025 18:10:26 +0330 Subject: [PATCH 1/7] Add 73_ImageUploadBenchmark example --- 73_ImageUploadBenchmark/CMakeLists.txt | 6 + 73_ImageUploadBenchmark/config.json.template | 28 ++ 73_ImageUploadBenchmark/main.cpp | 392 +++++++++++++++++++ 73_ImageUploadBenchmark/pipeline.groovy | 50 +++ CMakeLists.txt | 1 + 5 files changed, 477 insertions(+) create mode 100644 73_ImageUploadBenchmark/CMakeLists.txt create mode 100644 73_ImageUploadBenchmark/config.json.template create mode 100644 73_ImageUploadBenchmark/main.cpp create mode 100644 73_ImageUploadBenchmark/pipeline.groovy diff --git a/73_ImageUploadBenchmark/CMakeLists.txt b/73_ImageUploadBenchmark/CMakeLists.txt new file mode 100644 index 000000000..2f9218f93 --- /dev/null +++ b/73_ImageUploadBenchmark/CMakeLists.txt @@ -0,0 +1,6 @@ +include(common RESULT_VARIABLE RES) +if(NOT RES) + message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") +endif() + +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") \ No newline at end of file diff --git a/73_ImageUploadBenchmark/config.json.template b/73_ImageUploadBenchmark/config.json.template new file mode 100644 index 000000000..12215d0bb --- /dev/null +++ b/73_ImageUploadBenchmark/config.json.template @@ -0,0 +1,28 @@ +{ + "enableParallelBuild": true, + "threadsPerBuildProcess" : 2, + "isExecuted": false, + "scriptPath": "", + "cmake": { + "configurations": [ "Release", "Debug", "RelWithDebInfo" ], + "buildModes": [], + "requiredOptions": [] + }, + "profiles": [ + { + "backend": "vulkan", // should be none + "platform": "windows", + "buildModes": [], + "runConfiguration": "Release", // we also need to run in Debug nad RWDI because foundational example + "gpuArchitectures": [] + } + ], + "dependencies": [], + "data": [ + { + "dependencies": [], + "command": [""], + "outputs": [] + } + ] +} diff --git 
a/73_ImageUploadBenchmark/main.cpp b/73_ImageUploadBenchmark/main.cpp new file mode 100644 index 000000000..a22647750 --- /dev/null +++ b/73_ImageUploadBenchmark/main.cpp @@ -0,0 +1,392 @@ +#include "nbl/examples/examples.hpp" +#include "nbl/this_example/builtin/build/spirv/keys.hpp" + +using namespace nbl; +using namespace nbl::core; +using namespace nbl::hlsl; +using namespace nbl::system; +using namespace nbl::asset; +using namespace nbl::ui; +using namespace nbl::video; +using namespace nbl::examples; + +#include "app_resources/common.hlsl" +#include "nbl/builtin/hlsl/bit.hlsl" + +class CountingSortApp final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication +{ + using device_base_t = application_templates::MonoDeviceApplication; + using asset_base_t = BuiltinResourcesApplication; + + public: + // Yay thanks to multiple inheritance we cannot forward ctors anymore + CountingSortApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + + // we stuff all our work here because its a "single shot" app + bool onAppInitialized(smart_refctd_ptr&& system) override + { + // Remember to call the base class initialization! 
+ if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + if (!asset_base_t::onAppInitialized(std::move(system))) + return false; + + auto limits = m_physicalDevice->getLimits(); + constexpr std::array AllowedMaxComputeSharedMemorySizes = { + 16384, 32768, 65536 + }; + + auto upperBoundSharedMemSize = std::upper_bound(AllowedMaxComputeSharedMemorySizes.begin(), AllowedMaxComputeSharedMemorySizes.end(), limits.maxComputeSharedMemorySize); + // devices which support less than 16KB of max compute shared memory size are not supported + if (upperBoundSharedMemSize == AllowedMaxComputeSharedMemorySizes.begin()) + { + m_logger->log("maxComputeSharedMemorySize is too low (%u)", ILogger::E_LOG_LEVEL::ELL_ERROR, limits.maxComputeSharedMemorySize); + exit(0); + } + + limits.maxComputeSharedMemorySize = *(upperBoundSharedMemSize - 1); + + const uint32_t WorkgroupSize = limits.maxComputeWorkGroupInvocations; + const uint32_t MaxBucketCount = (limits.maxComputeSharedMemorySize / sizeof(uint32_t)) / 2; + constexpr uint32_t element_count = 100000; + const uint32_t bucket_count = std::min((uint32_t)3000, MaxBucketCount); + const uint32_t elements_per_thread = ceil((float)ceil((float)element_count / limits.computeUnits) / WorkgroupSize); + + auto loadPrecompiledShader = [&]() -> smart_refctd_ptr + { + // this time we load a shader directly from a file + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + lp.workingDirectory = "app_resources"; // virtual root + auto key = nbl::this_example::builtin::build::get_spirv_key(limits, m_physicalDevice->getFeatures()); + auto assetBundle = m_assetMgr->getAsset(key.data(), lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) + { + logFail("Could not load shader!"); + return nullptr; + } + + auto shader = IAsset::castDown(assets[0]); + // The down-cast should not fail! + assert(shader); + + // There's two ways of doing stuff like this: + // 1. 
this - modifying the asset after load + // 2. creating a short shader source file that includes the asset you would have wanted to load + // + //auto overrideSource = CHLSLCompiler::createOverridenCopy( + // source.get(), "#define WorkgroupSize %d\n#define BucketCount %d\n", + // WorkgroupSize, bucket_count + //); + + // this time we skip the use of the asset converter since the IShader->IGPUShader path is quick and simple + return shader; + }; + auto prefixSumShader = loadPrecompiledShader.operator()<"prefix_sum_shader">(); // "app_resources/prefix_sum_shader.comp.hlsl" + auto scatterShader = loadPrecompiledShader.operator()<"scatter_shader">(); // "app_resources/scatter_shader.comp.hlsl" + + // People love Reflection but I prefer Shader Sources instead! + const nbl::asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,.offset = 0,.size = sizeof(CountingPushData) }; + + // This time we'll have no Descriptor Sets or Layouts because our workload has a widely varying size + // and using traditional SSBO bindings would force us to update the Descriptor Set every frame. + // I even started writing this sample with the use of Dynamic SSBOs, however the length of the buffer range is not dynamic + // only the offset. This means that we'd have to write the "worst case" length into the descriptor set binding. + // Then this has a knock-on effect that we couldn't allocate closer to the end of the streaming buffer than the "worst case" size. 
+ smart_refctd_ptr layout; + smart_refctd_ptr prefixSumPipeline; + smart_refctd_ptr scatterPipeline; + { + layout = m_device->createPipelineLayout({ &pcRange,1 }); + IGPUComputePipeline::SCreationParams params = {}; + params.layout = layout.get(); + params.shader.shader = prefixSumShader.get(); + params.shader.entryPoint = "main"; + params.shader.entries = nullptr; + params.shader.requiredSubgroupSize = static_cast(5); + params.cached.requireFullSubgroups = true; + if (!m_device->createComputePipelines(nullptr, { ¶ms,1 }, &prefixSumPipeline)) + return logFail("Failed to create compute pipeline!\n"); + params.shader.shader = scatterShader.get(); + if (!m_device->createComputePipelines(nullptr, { ¶ms,1 }, &scatterPipeline)) + return logFail("Failed to create compute pipeline!\n"); + } + + // Allocate memory + nbl::video::IDeviceMemoryAllocator::SAllocation allocation[5] = {}; + smart_refctd_ptr buffers[5]; + //smart_refctd_ptr ds; + { + auto build_buffer = [this]( + smart_refctd_ptr m_device, + nbl::video::IDeviceMemoryAllocator::SAllocation *allocation, + smart_refctd_ptr& buffer, + size_t buffer_size, + const char *label + ) -> void { + IGPUBuffer::SCreationParams params; + params.size = buffer_size; + params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; + buffer = m_device->createBuffer(std::move(params)); + if (!buffer) + logFail("Failed to create GPU buffer of size %d!\n", buffer_size); + + buffer->setObjectDebugName(label); + + auto reqs = buffer->getMemoryReqs(); + reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits(); + + *allocation = m_device->allocate(reqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); + if (!allocation->isValid()) + logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n"); + + assert(allocation->memory.get() == buffer->getBoundMemory().memory); + }; + + build_buffer(m_device, allocation, buffers[0], sizeof(uint32_t) * element_count, "Input 
Key Buffer"); + build_buffer(m_device, allocation + 1, buffers[1], sizeof(uint32_t) * element_count, "Input Value Buffer"); + build_buffer(m_device, allocation + 2, buffers[2], sizeof(uint32_t) * bucket_count, "Scratch Buffer"); + build_buffer(m_device, allocation + 3, buffers[3], sizeof(uint32_t) * element_count, "Output Key Buffer"); + build_buffer(m_device, allocation + 4, buffers[4], sizeof(uint32_t) * element_count, "Output Value Buffer"); + } + uint64_t buffer_device_address[] = { + buffers[0]->getDeviceAddress(), + buffers[1]->getDeviceAddress(), + buffers[2]->getDeviceAddress(), + buffers[3]->getDeviceAddress(), + buffers[4]->getDeviceAddress() + }; + + void* mapped_memory[] = { + allocation[0].memory->map({0ull,allocation[0].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ), + allocation[1].memory->map({0ull,allocation[1].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ), + allocation[2].memory->map({0ull,allocation[2].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ), + allocation[3].memory->map({0ull,allocation[3].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ), + allocation[4].memory->map({0ull,allocation[3].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ), + }; + if (!mapped_memory[0] || !mapped_memory[1] || !mapped_memory[2] || !mapped_memory[3] || !mapped_memory[4]) + return logFail("Failed to map the Device Memory!\n"); + + // Generate random data + constexpr uint32_t minimum = 0; + const uint32_t range = bucket_count; + unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); + std::mt19937 g(seed); + + auto bufferData = new uint32_t[2][element_count]; + for (uint32_t i = 0; i < element_count; i++) { + bufferData[0][i] = minimum + g() % range; + } + + memcpy(mapped_memory[0], bufferData[0], sizeof(uint32_t) * element_count); + + for (uint32_t i = 0; i < element_count; i++) { + bufferData[1][i] = g() % std::numeric_limits::max(); + } + + 
memcpy(mapped_memory[1], bufferData[1], sizeof(uint32_t) * element_count); + + std::string outBuffer; + for (auto i = 0; i < element_count; i++) { + outBuffer.append("{"); + outBuffer.append(std::to_string(bufferData[0][i])); + outBuffer.append(", "); + outBuffer.append(std::to_string(bufferData[1][i])); + outBuffer.append("} "); + } + outBuffer.append("\n"); + outBuffer.append("Count: "); + outBuffer.append(std::to_string(element_count)); + outBuffer.append("\n"); + m_logger->log("Your input array is: \n" + outBuffer, ILogger::ELL_PERFORMANCE); + + auto pc = CountingPushData { + .inputKeyAddress = buffer_device_address[0], + .inputValueAddress = buffer_device_address[1], + .histogramAddress = buffer_device_address[2], + .outputKeyAddress = buffer_device_address[3], + .outputValueAddress = buffer_device_address[4], + .dataElementCount = element_count, + .elementsPerWT = elements_per_thread, + .minimum = minimum, + .maximum = minimum + bucket_count - 1, + }; + + smart_refctd_ptr cmdBuf; + { + smart_refctd_ptr cmdpool = m_device->createCommandPool(getComputeQueue()->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); + if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdBuf)) + return logFail("Failed to create Command Buffers!\n"); + } + + // Create the Semaphore for prefix sum + constexpr uint64_t started_value = 0; + uint64_t timeline = started_value; + smart_refctd_ptr progress = m_device->createSemaphore(started_value); + + cmdBuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + cmdBuf->beginDebugMarker("Prefix Sum Dispatch", core::vectorSIMDf(0, 1, 0, 1)); + cmdBuf->bindComputePipeline(prefixSumPipeline.get()); + cmdBuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); + cmdBuf->dispatch(ceil((float)element_count / (elements_per_thread * WorkgroupSize)), 1, 1); + cmdBuf->endDebugMarker(); + cmdBuf->end(); + + { + auto queue = getComputeQueue(); + + 
IQueue::SSubmitInfo submit_infos[1]; + IQueue::SSubmitInfo::SCommandBufferInfo cmdBufs[] = { + { + .cmdbuf = cmdBuf.get() + } + }; + submit_infos[0].commandBuffers = cmdBufs; + IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { + { + .semaphore = progress.get(), + .value = ++timeline, + .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT + } + }; + submit_infos[0].signalSemaphores = signals; + + m_api->startCapture(); + queue->submit(submit_infos); + m_api->endCapture(); + } + + const ISemaphore::SWaitInfo wait_infos[] = { { + .semaphore = progress.get(), + .value = timeline + } }; + m_device->blockForSemaphores(wait_infos); + + // Create the Semaphore for Scatter + uint64_t timeline2 = started_value; + smart_refctd_ptr progress2 = m_device->createSemaphore(started_value); + + cmdBuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + cmdBuf->beginDebugMarker("Scatter Dispatch", core::vectorSIMDf(0, 1, 0, 1)); + cmdBuf->bindComputePipeline(scatterPipeline.get()); + cmdBuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); + cmdBuf->dispatch(ceil((float)element_count / (elements_per_thread * WorkgroupSize)), 1, 1); + cmdBuf->endDebugMarker(); + cmdBuf->end(); + + { + auto queue = getComputeQueue(); + + IQueue::SSubmitInfo submit_infos[1]; + IQueue::SSubmitInfo::SCommandBufferInfo cmdBufs[] = { + { + .cmdbuf = cmdBuf.get() + } + }; + submit_infos[0].commandBuffers = cmdBufs; + IQueue::SSubmitInfo::SSemaphoreInfo waits[] = { + { + .semaphore = progress.get(), + .value = timeline, + .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT + } + }; + submit_infos[0].waitSemaphores = waits; + IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { + { + .semaphore = progress2.get(), + .value = ++timeline2, + .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT + } + }; + submit_infos[0].signalSemaphores = signals; + + m_api->startCapture(); + queue->submit(submit_infos); + m_api->endCapture(); + } + + const 
ISemaphore::SWaitInfo wait_infos2[] = {{ + .semaphore = progress2.get(), + .value = timeline2 + } }; + m_device->blockForSemaphores(wait_infos2); + + const ILogicalDevice::MappedMemoryRange memory_range[] = { + ILogicalDevice::MappedMemoryRange(allocation[0].memory.get(), 0ull, allocation[0].memory->getAllocationSize()), + ILogicalDevice::MappedMemoryRange(allocation[1].memory.get(), 0ull, allocation[1].memory->getAllocationSize()), + ILogicalDevice::MappedMemoryRange(allocation[2].memory.get(), 0ull, allocation[2].memory->getAllocationSize()), + ILogicalDevice::MappedMemoryRange(allocation[3].memory.get(), 0ull, allocation[3].memory->getAllocationSize()), + ILogicalDevice::MappedMemoryRange(allocation[4].memory.get(), 0ull, allocation[4].memory->getAllocationSize()) + }; + + if (!allocation[0].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + m_device->invalidateMappedMemoryRanges(1, &memory_range[0]); + if (!allocation[1].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + m_device->invalidateMappedMemoryRanges(1, &memory_range[1]); + if (!allocation[2].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + m_device->invalidateMappedMemoryRanges(1, &memory_range[2]); + if (!allocation[3].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + m_device->invalidateMappedMemoryRanges(1, &memory_range[3]); + if (!allocation[4].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + m_device->invalidateMappedMemoryRanges(1, &memory_range[4]); + + const uint32_t* buffData[] = { + reinterpret_cast(allocation[2].memory->getMappedPointer()), + reinterpret_cast(allocation[3].memory->getMappedPointer()), + reinterpret_cast(allocation[4].memory->getMappedPointer()) + }; + + assert(allocation[2].offset == 0); // simpler than writing out all the pointer arithmetic + 
assert(allocation[3].offset == 0); // simpler than writing out all the pointer arithmetic + assert(allocation[4].offset == 0); // simpler than writing out all the pointer arithmetic + + outBuffer.clear(); + for (auto i = 0; i < bucket_count; i++) { + outBuffer.append(std::to_string(buffData[0][i])); + outBuffer.append(" "); + } + outBuffer.append("\n"); + + m_logger->log("Scratch buffer is: \n" + outBuffer, ILogger::ELL_PERFORMANCE); + + outBuffer.clear(); + for (auto i = 0; i < element_count; i++) { + outBuffer.append("{"); + outBuffer.append(std::to_string(buffData[1][i])); + outBuffer.append(", "); + outBuffer.append(std::to_string(buffData[2][i])); + outBuffer.append("} "); + } + outBuffer.append("\n"); + outBuffer.append("Count: "); + outBuffer.append(std::to_string(element_count)); + outBuffer.append("\n"); + m_logger->log("Your output array is: \n" + outBuffer, ILogger::ELL_PERFORMANCE); + + allocation[0].memory->unmap(); + allocation[1].memory->unmap(); + allocation[2].memory->unmap(); + allocation[3].memory->unmap(); + allocation[4].memory->unmap(); + + m_device->waitIdle(); + + delete[] bufferData; + + return true; + } + + // Ok this time we'll actually have a work loop (maybe just for the sake of future WASM so we don't timeout a Browser Tab with an unresponsive script) + bool keepRunning() override { return false; } + + // Finally the first actual work-loop + void workLoopBody() override {} + + bool onAppTerminated() override { return true; } +}; + + +NBL_MAIN_FUNC(CountingSortApp) \ No newline at end of file diff --git a/73_ImageUploadBenchmark/pipeline.groovy b/73_ImageUploadBenchmark/pipeline.groovy new file mode 100644 index 000000000..1249f10b5 --- /dev/null +++ b/73_ImageUploadBenchmark/pipeline.groovy @@ -0,0 +1,50 @@ +import org.DevshGraphicsProgramming.Agent +import org.DevshGraphicsProgramming.BuilderInfo +import org.DevshGraphicsProgramming.IBuilder + +class CImageUploadBenchmark extends IBuilder +{ + public CImageUploadBenchmark(Agent 
_agent, _info) + { + super(_agent, _info) + } + + @Override + public boolean prepare(Map axisMapping) + { + return true + } + + @Override + public boolean build(Map axisMapping) + { + IBuilder.CONFIGURATION config = axisMapping.get("CONFIGURATION") + IBuilder.BUILD_TYPE buildType = axisMapping.get("BUILD_TYPE") + + def nameOfBuildDirectory = getNameOfBuildDirectory(buildType) + def nameOfConfig = getNameOfConfig(config) + + agent.execute("cmake --build ${info.rootProjectPath}/${nameOfBuildDirectory}/${info.targetProjectPathRelativeToRoot} --target ${info.targetBaseName} --config ${nameOfConfig} -j12 -v") + + return true + } + + @Override + public boolean test(Map axisMapping) + { + return true + } + + @Override + public boolean install(Map axisMapping) + { + return true + } +} + +def create(Agent _agent, _info) +{ + return new CStreamingAndBufferDeviceAddressBuilder(_agent, _info) +} + +return this diff --git a/CMakeLists.txt b/CMakeLists.txt index cbe482aa4..2d4ed7408 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -89,6 +89,7 @@ if(NBL_BUILD_EXAMPLES) add_subdirectory(70_FLIPFluids) add_subdirectory(71_RayTracingPipeline) add_subdirectory(72_CooperativeBinarySearch) + add_subdirectory(73_ImageUploadBenchmark) # add new examples *before* NBL_GET_ALL_TARGETS invocation, it gathers recursively all targets created so far in this subdirectory NBL_GET_ALL_TARGETS(TARGETS) From 951e2fdd218abc307f9890d69f7a9be38d28f95a Mon Sep 17 00:00:00 2001 From: CrabeExtra Date: Wed, 24 Dec 2025 18:38:09 +0330 Subject: [PATCH 2/7] Simple benchmark HOST_VISIBLE vs HOST_VISIBLE & DEVICE_LOCAL --- 73_ImageUploadBenchmark/main.cpp | 694 ++++++++++++++++--------------- 1 file changed, 357 insertions(+), 337 deletions(-) diff --git a/73_ImageUploadBenchmark/main.cpp b/73_ImageUploadBenchmark/main.cpp index a22647750..68815681d 100644 --- a/73_ImageUploadBenchmark/main.cpp +++ b/73_ImageUploadBenchmark/main.cpp @@ -1,392 +1,412 @@ #include "nbl/examples/examples.hpp" -#include 
"nbl/this_example/builtin/build/spirv/keys.hpp" +#include using namespace nbl; using namespace nbl::core; -using namespace nbl::hlsl; using namespace nbl::system; using namespace nbl::asset; -using namespace nbl::ui; using namespace nbl::video; using namespace nbl::examples; -#include "app_resources/common.hlsl" -#include "nbl/builtin/hlsl/bit.hlsl" - -class CountingSortApp final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication +class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceApplication, public BuiltinResourcesApplication { - using device_base_t = application_templates::MonoDeviceApplication; - using asset_base_t = BuiltinResourcesApplication; + using device_base_t = application_templates::MonoDeviceApplication; + using asset_base_t = BuiltinResourcesApplication; + +public: + ImageUploadBenchmarkApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : + system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + + bool onAppInitialized(smart_refctd_ptr&& system) override + { + if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) + return false; + if (!asset_base_t::onAppInitialized(std::move(system))) + return false; + + constexpr uint32_t TILE_SIZE = 128; + constexpr uint32_t TILE_BYTES_PER_PIXEL = 4; + constexpr uint32_t TILE_SIZE_BYTES = TILE_SIZE * TILE_SIZE * TILE_BYTES_PER_PIXEL; + constexpr uint32_t STAGING_BUFFER_SIZE = 64 * 1024 * 1024; + constexpr uint32_t TILES_PER_FRAME = STAGING_BUFFER_SIZE / TILE_SIZE_BYTES; + constexpr uint32_t FRAMES_IN_FLIGHT = 4; + constexpr uint32_t TOTAL_FRAMES = 1000; + + m_logger->log("GPU Memory Transfer Benchmark", ILogger::ELL_INFO); + m_logger->log("Tile size: %ux%u (%u KB)", ILogger::ELL_INFO, TILE_SIZE, TILE_SIZE, TILE_SIZE_BYTES / 1024); + m_logger->log("Staging buffer: %u MB", ILogger::ELL_INFO, STAGING_BUFFER_SIZE / (1024 * 
1024)); + m_logger->log("Tiles per frame: %u", ILogger::ELL_INFO, TILES_PER_FRAME); + m_logger->log("Frames in flight: %u", ILogger::ELL_INFO, FRAMES_IN_FLIGHT); + + uint32_t hostVisibleBits = m_physicalDevice->getHostVisibleMemoryTypeBits(); + uint32_t deviceLocalBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(); + uint32_t hostVisibleOnlyBits = hostVisibleBits & ~deviceLocalBits; + uint32_t hostVisibleDeviceLocalBits = hostVisibleBits & deviceLocalBits; + + if (!hostVisibleOnlyBits) + { + m_logger->log("HOST_VISIBLE memory types not found!", ILogger::ELL_ERROR); + return false; + } - public: - // Yay thanks to multiple inheritance we cannot forward ctors anymore - CountingSortApp(const path& _localInputCWD, const path& _localOutputCWD, const path& _sharedInputCWD, const path& _sharedOutputCWD) : - system::IApplicationFramework(_localInputCWD, _localOutputCWD, _sharedInputCWD, _sharedOutputCWD) {} + if (!deviceLocalBits) + { + m_logger->log("DEVICE_LOCAL memory types not found!", ILogger::ELL_ERROR); + return false; + } - // we stuff all our work here because its a "single shot" app - bool onAppInitialized(smart_refctd_ptr&& system) override + IQueue* queue = getQueue(IQueue::FAMILY_FLAGS::GRAPHICS_BIT); + smart_refctd_ptr destinationImage; { - // Remember to call the base class initialization! 
- if (!device_base_t::onAppInitialized(smart_refctd_ptr(system))) - return false; - if (!asset_base_t::onAppInitialized(std::move(system))) - return false; + IGPUImage::SCreationParams imgParams{}; + imgParams.type = IImage::E_TYPE::ET_2D; + imgParams.extent.width = TILE_SIZE * 32; + imgParams.extent.height = TILE_SIZE * 32; + imgParams.extent.depth = 1u; + imgParams.format = asset::E_FORMAT::EF_R8G8B8A8_UNORM; + imgParams.mipLevels = 1u; + imgParams.flags = IImage::ECF_NONE; + imgParams.arrayLayers = 1u; + imgParams.samples = IImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT; + imgParams.tiling = video::IGPUImage::TILING::OPTIMAL; + imgParams.usage = asset::IImage::EUF_TRANSFER_DST_BIT; + imgParams.preinitialized = false; + + destinationImage = m_device->createImage(std::move(imgParams)); + if (!destinationImage) + return logFail("Failed to create destination image!\n"); + + destinationImage->setObjectDebugName("Destination Image"); + + auto reqs = destinationImage->getMemoryReqs(); + reqs.memoryTypeBits &= deviceLocalBits; + + auto allocation = m_device->allocate(reqs, destinationImage.get(), IDeviceMemoryAllocation::EMAF_NONE); + if (!allocation.isValid()) + return logFail("Failed to allocate DEVICE_LOCAL memory for destination image!\n"); + } - auto limits = m_physicalDevice->getLimits(); - constexpr std::array AllowedMaxComputeSharedMemorySizes = { - 16384, 32768, 65536 - }; + m_logger->log("\nTesting Strategy 1: System RAM", ILogger::ELL_INFO); + + double throughputSystemRAM = 0.0; + { + smart_refctd_ptr stagingBuffer; + IDeviceMemoryAllocator::SAllocation stagingAlloc; + void* mappedPtr = nullptr; - auto upperBoundSharedMemSize = std::upper_bound(AllowedMaxComputeSharedMemorySizes.begin(), AllowedMaxComputeSharedMemorySizes.end(), limits.maxComputeSharedMemorySize); - // devices which support less than 16KB of max compute shared memory size are not supported - if (upperBoundSharedMemSize == AllowedMaxComputeSharedMemorySizes.begin()) + if 
(!createStagingBuffer(STAGING_BUFFER_SIZE, hostVisibleOnlyBits, + "Staging Buffer - System RAM", stagingBuffer, stagingAlloc, mappedPtr)) { - m_logger->log("maxComputeSharedMemorySize is too low (%u)", ILogger::E_LOG_LEVEL::ELL_ERROR, limits.maxComputeSharedMemorySize); - exit(0); + return false; } - limits.maxComputeSharedMemorySize = *(upperBoundSharedMemSize - 1); + throughputSystemRAM = runBenchmark( + "System RAM", + stagingBuffer.get(), + mappedPtr, + destinationImage.get(), + TILE_SIZE, + TILE_SIZE_BYTES, + TILES_PER_FRAME, + FRAMES_IN_FLIGHT, + TOTAL_FRAMES, + queue + ); + + stagingAlloc.memory->unmap(); + } - const uint32_t WorkgroupSize = limits.maxComputeWorkGroupInvocations; - const uint32_t MaxBucketCount = (limits.maxComputeSharedMemorySize / sizeof(uint32_t)) / 2; - constexpr uint32_t element_count = 100000; - const uint32_t bucket_count = std::min((uint32_t)3000, MaxBucketCount); - const uint32_t elements_per_thread = ceil((float)ceil((float)element_count / limits.computeUnits) / WorkgroupSize); + m_logger->log("System RAM throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, throughputSystemRAM); - auto loadPrecompiledShader = [&]() -> smart_refctd_ptr - { - // this time we load a shader directly from a file - IAssetLoader::SAssetLoadParams lp = {}; - lp.logger = m_logger.get(); - lp.workingDirectory = "app_resources"; // virtual root - auto key = nbl::this_example::builtin::build::get_spirv_key(limits, m_physicalDevice->getFeatures()); - auto assetBundle = m_assetMgr->getAsset(key.data(), lp); - const auto assets = assetBundle.getContents(); - if (assets.empty()) - { - logFail("Could not load shader!"); - return nullptr; - } + m_device->waitIdle(); - auto shader = IAsset::castDown(assets[0]); - // The down-cast should not fail! - assert(shader); - - // There's two ways of doing stuff like this: - // 1. this - modifying the asset after load - // 2. 
creating a short shader source file that includes the asset you would have wanted to load - // - //auto overrideSource = CHLSLCompiler::createOverridenCopy( - // source.get(), "#define WorkgroupSize %d\n#define BucketCount %d\n", - // WorkgroupSize, bucket_count - //); - - // this time we skip the use of the asset converter since the IShader->IGPUShader path is quick and simple - return shader; - }; - auto prefixSumShader = loadPrecompiledShader.operator()<"prefix_sum_shader">(); // "app_resources/prefix_sum_shader.comp.hlsl" - auto scatterShader = loadPrecompiledShader.operator()<"scatter_shader">(); // "app_resources/scatter_shader.comp.hlsl" - - // People love Reflection but I prefer Shader Sources instead! - const nbl::asset::SPushConstantRange pcRange = { .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE,.offset = 0,.size = sizeof(CountingPushData) }; - - // This time we'll have no Descriptor Sets or Layouts because our workload has a widely varying size - // and using traditional SSBO bindings would force us to update the Descriptor Set every frame. - // I even started writing this sample with the use of Dynamic SSBOs, however the length of the buffer range is not dynamic - // only the offset. This means that we'd have to write the "worst case" length into the descriptor set binding. - // Then this has a knock-on effect that we couldn't allocate closer to the end of the streaming buffer than the "worst case" size. 
- smart_refctd_ptr layout; - smart_refctd_ptr prefixSumPipeline; - smart_refctd_ptr scatterPipeline; - { - layout = m_device->createPipelineLayout({ &pcRange,1 }); - IGPUComputePipeline::SCreationParams params = {}; - params.layout = layout.get(); - params.shader.shader = prefixSumShader.get(); - params.shader.entryPoint = "main"; - params.shader.entries = nullptr; - params.shader.requiredSubgroupSize = static_cast(5); - params.cached.requireFullSubgroups = true; - if (!m_device->createComputePipelines(nullptr, { ¶ms,1 }, &prefixSumPipeline)) - return logFail("Failed to create compute pipeline!\n"); - params.shader.shader = scatterShader.get(); - if (!m_device->createComputePipelines(nullptr, { ¶ms,1 }, &scatterPipeline)) - return logFail("Failed to create compute pipeline!\n"); - } + if (hostVisibleDeviceLocalBits) + { + m_logger->log("\nTesting Strategy 2: VRAM (ReBAR)", ILogger::ELL_INFO); - // Allocate memory - nbl::video::IDeviceMemoryAllocator::SAllocation allocation[5] = {}; - smart_refctd_ptr buffers[5]; - //smart_refctd_ptr ds; + double throughputVRAM = 0.0; { - auto build_buffer = [this]( - smart_refctd_ptr m_device, - nbl::video::IDeviceMemoryAllocator::SAllocation *allocation, - smart_refctd_ptr& buffer, - size_t buffer_size, - const char *label - ) -> void { - IGPUBuffer::SCreationParams params; - params.size = buffer_size; - params.usage = IGPUBuffer::EUF_STORAGE_BUFFER_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; - buffer = m_device->createBuffer(std::move(params)); - if (!buffer) - logFail("Failed to create GPU buffer of size %d!\n", buffer_size); - - buffer->setObjectDebugName(label); - - auto reqs = buffer->getMemoryReqs(); - reqs.memoryTypeBits &= m_physicalDevice->getHostVisibleMemoryTypeBits(); - - *allocation = m_device->allocate(reqs, buffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); - if (!allocation->isValid()) - logFail("Failed to allocate Device Memory compatible with our GPU Buffer!\n"); - - 
assert(allocation->memory.get() == buffer->getBoundMemory().memory); - }; + smart_refctd_ptr stagingBuffer; + IDeviceMemoryAllocator::SAllocation stagingAlloc; + void* mappedPtr = nullptr; - build_buffer(m_device, allocation, buffers[0], sizeof(uint32_t) * element_count, "Input Key Buffer"); - build_buffer(m_device, allocation + 1, buffers[1], sizeof(uint32_t) * element_count, "Input Value Buffer"); - build_buffer(m_device, allocation + 2, buffers[2], sizeof(uint32_t) * bucket_count, "Scratch Buffer"); - build_buffer(m_device, allocation + 3, buffers[3], sizeof(uint32_t) * element_count, "Output Key Buffer"); - build_buffer(m_device, allocation + 4, buffers[4], sizeof(uint32_t) * element_count, "Output Value Buffer"); - } - uint64_t buffer_device_address[] = { - buffers[0]->getDeviceAddress(), - buffers[1]->getDeviceAddress(), - buffers[2]->getDeviceAddress(), - buffers[3]->getDeviceAddress(), - buffers[4]->getDeviceAddress() - }; + if (!createStagingBuffer(STAGING_BUFFER_SIZE, hostVisibleDeviceLocalBits, + "Staging Buffer - VRAM (ReBAR)", stagingBuffer, stagingAlloc, mappedPtr)) + { + return false; + } - void* mapped_memory[] = { - allocation[0].memory->map({0ull,allocation[0].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ), - allocation[1].memory->map({0ull,allocation[1].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ), - allocation[2].memory->map({0ull,allocation[2].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ), - allocation[3].memory->map({0ull,allocation[3].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ), - allocation[4].memory->map({0ull,allocation[3].memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ), - }; - if (!mapped_memory[0] || !mapped_memory[1] || !mapped_memory[2] || !mapped_memory[3] || !mapped_memory[4]) - return logFail("Failed to map the Device Memory!\n"); - - // Generate random data - constexpr uint32_t minimum = 0; - const uint32_t range = bucket_count; 
- unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); - std::mt19937 g(seed); - - auto bufferData = new uint32_t[2][element_count]; - for (uint32_t i = 0; i < element_count; i++) { - bufferData[0][i] = minimum + g() % range; + throughputVRAM = runBenchmark( + "VRAM (ReBAR)", + stagingBuffer.get(), + mappedPtr, + destinationImage.get(), + TILE_SIZE, + TILE_SIZE_BYTES, + TILES_PER_FRAME, + FRAMES_IN_FLIGHT, + TOTAL_FRAMES, + queue + ); + + stagingAlloc.memory->unmap(); } - memcpy(mapped_memory[0], bufferData[0], sizeof(uint32_t) * element_count); - - for (uint32_t i = 0; i < element_count; i++) { - bufferData[1][i] = g() % std::numeric_limits::max(); - } + m_logger->log("VRAM throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, throughputVRAM); - memcpy(mapped_memory[1], bufferData[1], sizeof(uint32_t) * element_count); + double speedup = throughputVRAM / throughputSystemRAM; + m_logger->log("\nVRAM is %.2fx faster than System RAM", ILogger::ELL_PERFORMANCE, speedup); + } - std::string outBuffer; - for (auto i = 0; i < element_count; i++) { - outBuffer.append("{"); - outBuffer.append(std::to_string(bufferData[0][i])); - outBuffer.append(", "); - outBuffer.append(std::to_string(bufferData[1][i])); - outBuffer.append("} "); - } - outBuffer.append("\n"); - outBuffer.append("Count: "); - outBuffer.append(std::to_string(element_count)); - outBuffer.append("\n"); - m_logger->log("Your input array is: \n" + outBuffer, ILogger::ELL_PERFORMANCE); - - auto pc = CountingPushData { - .inputKeyAddress = buffer_device_address[0], - .inputValueAddress = buffer_device_address[1], - .histogramAddress = buffer_device_address[2], - .outputKeyAddress = buffer_device_address[3], - .outputValueAddress = buffer_device_address[4], - .dataElementCount = element_count, - .elementsPerWT = elements_per_thread, - .minimum = minimum, - .maximum = minimum + bucket_count - 1, - }; + return true; + } + + bool keepRunning() override { return false; } + void workLoopBody() 
override {} + bool onAppTerminated() override { return true; } + +protected: + core::vector getQueueRequirements() const override + { + using flags_t = IQueue::FAMILY_FLAGS; + return { { + .requiredFlags = flags_t::GRAPHICS_BIT, + .disallowedFlags = flags_t::NONE, + .queueCount = 1, + .maxImageTransferGranularity = {1, 1, 1} + } }; + } + +private: + void transitionImageLayout( + IGPUCommandBuffer* cmdBuf, + IGPUImage* image, + IImage::LAYOUT oldLayout, + IImage::LAYOUT newLayout) + { + IGPUCommandBuffer::SImageMemoryBarrier barrier = {}; + barrier.oldLayout = oldLayout; + barrier.newLayout = newLayout; + barrier.image = image; + barrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + barrier.subresourceRange.baseMipLevel = 0; + barrier.subresourceRange.levelCount = 1; + barrier.subresourceRange.baseArrayLayer = 0; + barrier.subresourceRange.layerCount = 1; + barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::NONE; + barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; + barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS; + barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; + cmdBuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&barrier, 1} }); + } + + void generateTileCopyRegions( + IImage::SBufferCopy* outRegions, + uint32_t tilesPerFrame, + uint32_t tileSize, + uint32_t tileSizeBytes, + uint32_t imageWidth) + { + uint32_t tilesPerRow = imageWidth / tileSize; + for (size_t i = 0; i < tilesPerFrame; i++) + { + uint32_t tileX = (i % tilesPerRow) * tileSize; + uint32_t tileY = (i / tilesPerRow) * tileSize; + + outRegions[i].bufferOffset = i * tileSizeBytes; + outRegions[i].bufferRowLength = tileSize; + outRegions[i].bufferImageHeight = tileSize; + outRegions[i].imageOffset = { tileX, tileY, 0 }; + outRegions[i].imageExtent = { tileSize, tileSize, 1 }; + outRegions[i].imageSubresource.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + 
outRegions[i].imageSubresource.mipLevel = 0; + outRegions[i].imageSubresource.baseArrayLayer = 0; + outRegions[i].imageSubresource.layerCount = 1; + } + } - smart_refctd_ptr cmdBuf; - { - smart_refctd_ptr cmdpool = m_device->createCommandPool(getComputeQueue()->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT); - if (!cmdpool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1u, &cmdBuf)) - return logFail("Failed to create Command Buffers!\n"); - } + void generateRandomTileData(void* mappedPtr, uint32_t sizeBytes) + { + uint32_t* data = (uint32_t*)mappedPtr; + unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); + std::mt19937 g(seed); + const uint32_t valueCount = sizeBytes / sizeof(uint32_t); - // Create the Semaphore for prefix sum - constexpr uint64_t started_value = 0; - uint64_t timeline = started_value; - smart_refctd_ptr progress = m_device->createSemaphore(started_value); + auto bufferData = new uint32_t[valueCount]; - cmdBuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - cmdBuf->beginDebugMarker("Prefix Sum Dispatch", core::vectorSIMDf(0, 1, 0, 1)); - cmdBuf->bindComputePipeline(prefixSumPipeline.get()); - cmdBuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); - cmdBuf->dispatch(ceil((float)element_count / (elements_per_thread * WorkgroupSize)), 1, 1); - cmdBuf->endDebugMarker(); - cmdBuf->end(); + for (uint32_t i = 0; i < valueCount; i++) + { + bufferData[i] = g(); + } + memcpy(mappedPtr, bufferData, sizeBytes); + delete[] bufferData; + } + + double runBenchmark( + const char* strategyName, + IGPUBuffer* stagingBuffer, + void* mappedPtr, + IGPUImage* destinationImage, + uint32_t tileSize, + uint32_t tileSizeBytes, + uint32_t tilesPerFrame, + uint32_t framesInFlight, + uint32_t totalFrames, + IQueue* queue) + { + smart_refctd_ptr timelineSemaphore = m_device->createSemaphore(0); + + auto commandPools = new smart_refctd_ptr[framesInFlight]; + for 
(uint32_t i = 0; i < framesInFlight; i++) + { + commandPools[i] = m_device->createCommandPool( + queue->getFamilyIndex(), + IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT + ); + } - { - auto queue = getComputeQueue(); + auto commandBuffers = new smart_refctd_ptr[framesInFlight]; + for (uint32_t i = 0; i < framesInFlight; i++) + { + commandPools[i]->createCommandBuffers( + IGPUCommandPool::BUFFER_LEVEL::PRIMARY, + 1, + &commandBuffers[i] + ); + } - IQueue::SSubmitInfo submit_infos[1]; - IQueue::SSubmitInfo::SCommandBufferInfo cmdBufs[] = { - { - .cmdbuf = cmdBuf.get() - } - }; - submit_infos[0].commandBuffers = cmdBufs; - IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { - { - .semaphore = progress.get(), - .value = ++timeline, - .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT - } - }; - submit_infos[0].signalSemaphores = signals; + uint64_t timelineValue = 0; - m_api->startCapture(); - queue->submit(submit_infos); - m_api->endCapture(); - } + commandBuffers[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + transitionImageLayout( + commandBuffers[0].get(), + destinationImage, + IImage::LAYOUT::UNDEFINED, + IImage::LAYOUT::TRANSFER_DST_OPTIMAL + ); + commandBuffers[0]->end(); - const ISemaphore::SWaitInfo wait_infos[] = { { - .semaphore = progress.get(), - .value = timeline - } }; - m_device->blockForSemaphores(wait_infos); + IQueue::SSubmitInfo submitInfo = {}; + IQueue::SSubmitInfo::SCommandBufferInfo cmdBufInfo = { .cmdbuf = commandBuffers[0].get() }; + submitInfo.commandBuffers = { &cmdBufInfo, 1 }; - // Create the Semaphore for Scatter - uint64_t timeline2 = started_value; - smart_refctd_ptr progress2 = m_device->createSemaphore(started_value); + IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { + .semaphore = timelineSemaphore.get(), + .value = ++timelineValue, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS + }; + submitInfo.signalSemaphores = { &signalInfo, 1 }; - 
cmdBuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - cmdBuf->beginDebugMarker("Scatter Dispatch", core::vectorSIMDf(0, 1, 0, 1)); - cmdBuf->bindComputePipeline(scatterPipeline.get()); - cmdBuf->pushConstants(layout.get(), IShader::E_SHADER_STAGE::ESS_COMPUTE, 0u, sizeof(pc), &pc); - cmdBuf->dispatch(ceil((float)element_count / (elements_per_thread * WorkgroupSize)), 1, 1); - cmdBuf->endDebugMarker(); - cmdBuf->end(); + queue->submit({ &submitInfo, 1 }); - { - auto queue = getComputeQueue(); + ISemaphore::SWaitInfo waitInfo = { + .semaphore = timelineSemaphore.get(), + .value = timelineValue + }; + m_device->blockForSemaphores({ &waitInfo, 1 }); - IQueue::SSubmitInfo submit_infos[1]; - IQueue::SSubmitInfo::SCommandBufferInfo cmdBufs[] = { - { - .cmdbuf = cmdBuf.get() - } - }; - submit_infos[0].commandBuffers = cmdBufs; - IQueue::SSubmitInfo::SSemaphoreInfo waits[] = { - { - .semaphore = progress.get(), - .value = timeline, - .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT - } - }; - submit_infos[0].waitSemaphores = waits; - IQueue::SSubmitInfo::SSemaphoreInfo signals[] = { - { - .semaphore = progress2.get(), - .value = ++timeline2, - .stageMask = asset::PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT - } - }; - submit_infos[0].signalSemaphores = signals; + auto regions = new IImage::SBufferCopy[tilesPerFrame]; - m_api->startCapture(); - queue->submit(submit_infos); - m_api->endCapture(); - } + generateRandomTileData(mappedPtr, tilesPerFrame * tileSizeBytes); - const ISemaphore::SWaitInfo wait_infos2[] = {{ - .semaphore = progress2.get(), - .value = timeline2 - } }; - m_device->blockForSemaphores(wait_infos2); - - const ILogicalDevice::MappedMemoryRange memory_range[] = { - ILogicalDevice::MappedMemoryRange(allocation[0].memory.get(), 0ull, allocation[0].memory->getAllocationSize()), - ILogicalDevice::MappedMemoryRange(allocation[1].memory.get(), 0ull, allocation[1].memory->getAllocationSize()), - 
ILogicalDevice::MappedMemoryRange(allocation[2].memory.get(), 0ull, allocation[2].memory->getAllocationSize()), - ILogicalDevice::MappedMemoryRange(allocation[3].memory.get(), 0ull, allocation[3].memory->getAllocationSize()), - ILogicalDevice::MappedMemoryRange(allocation[4].memory.get(), 0ull, allocation[4].memory->getAllocationSize()) - }; + uint32_t imageWidth = destinationImage->getCreationParameters().extent.width; + generateTileCopyRegions(regions, tilesPerFrame, tileSize, tileSizeBytes, imageWidth); - if (!allocation[0].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) - m_device->invalidateMappedMemoryRanges(1, &memory_range[0]); - if (!allocation[1].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) - m_device->invalidateMappedMemoryRanges(1, &memory_range[1]); - if (!allocation[2].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) - m_device->invalidateMappedMemoryRanges(1, &memory_range[2]); - if (!allocation[3].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) - m_device->invalidateMappedMemoryRanges(1, &memory_range[3]); - if (!allocation[4].memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) - m_device->invalidateMappedMemoryRanges(1, &memory_range[4]); - - const uint32_t* buffData[] = { - reinterpret_cast(allocation[2].memory->getMappedPointer()), - reinterpret_cast(allocation[3].memory->getMappedPointer()), - reinterpret_cast(allocation[4].memory->getMappedPointer()) - }; + auto startTime = std::chrono::high_resolution_clock::now(); - assert(allocation[2].offset == 0); // simpler than writing out all the pointer arithmetic - assert(allocation[3].offset == 0); // simpler than writing out all the pointer arithmetic - assert(allocation[4].offset == 0); // simpler than writing out all the pointer arithmetic + for (uint32_t frame = 0; frame < 
totalFrames; frame++) + { + uint32_t cmdBufIndex = frame % framesInFlight; - outBuffer.clear(); - for (auto i = 0; i < bucket_count; i++) { - outBuffer.append(std::to_string(buffData[0][i])); - outBuffer.append(" "); - } - outBuffer.append("\n"); + commandBuffers[cmdBufIndex]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - m_logger->log("Scratch buffer is: \n" + outBuffer, ILogger::ELL_PERFORMANCE); + commandBuffers[cmdBufIndex]->copyBufferToImage( + stagingBuffer, + destinationImage, + IImage::LAYOUT::TRANSFER_DST_OPTIMAL, + tilesPerFrame, + regions + ); - outBuffer.clear(); - for (auto i = 0; i < element_count; i++) { - outBuffer.append("{"); - outBuffer.append(std::to_string(buffData[1][i])); - outBuffer.append(", "); - outBuffer.append(std::to_string(buffData[2][i])); - outBuffer.append("} "); - } - outBuffer.append("\n"); - outBuffer.append("Count: "); - outBuffer.append(std::to_string(element_count)); - outBuffer.append("\n"); - m_logger->log("Your output array is: \n" + outBuffer, ILogger::ELL_PERFORMANCE); + commandBuffers[cmdBufIndex]->end(); - allocation[0].memory->unmap(); - allocation[1].memory->unmap(); - allocation[2].memory->unmap(); - allocation[3].memory->unmap(); - allocation[4].memory->unmap(); + // Create submit info for THIS frame + IQueue::SSubmitInfo frameSubmitInfo = {}; + IQueue::SSubmitInfo::SCommandBufferInfo frameCmdBufInfo = {.cmdbuf = commandBuffers[cmdBufIndex].get()}; + frameSubmitInfo.commandBuffers = {&frameCmdBufInfo, 1}; - m_device->waitIdle(); + IQueue::SSubmitInfo::SSemaphoreInfo frameSignalInfo = { + .semaphore = timelineSemaphore.get(), + .value = ++timelineValue, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS + }; + frameSubmitInfo.signalSemaphores = {&frameSignalInfo, 1}; - delete[] bufferData; + // Submit to GPU + queue->submit({&frameSubmitInfo, 1}); - return true; + // Wait for old frames + if (frame >= framesInFlight) + { + ISemaphore::SWaitInfo frameWaitInfo = { + .semaphore = timelineSemaphore.get(), 
+ .value = timelineValue - framesInFlight + }; + m_device->blockForSemaphores({&frameWaitInfo, 1}); + } } - // Ok this time we'll actually have a work loop (maybe just for the sake of future WASM so we don't timeout a Browser Tab with an unresponsive script) - bool keepRunning() override { return false; } - - // Finally the first actual work-loop - void workLoopBody() override {} - - bool onAppTerminated() override { return true; } + // Wait for all remaining frames to complete + ISemaphore::SWaitInfo finalWait = { + .semaphore = timelineSemaphore.get(), + .value = timelineValue + }; + m_device->blockForSemaphores({&finalWait, 1}); + + auto endTime = std::chrono::high_resolution_clock::now(); + + delete[] regions; + delete[] commandPools; + delete[] commandBuffers; + + // Calculate throughput + double elapsedSeconds = std::chrono::duration(endTime - startTime).count(); + uint64_t totalBytes = (uint64_t)totalFrames * tilesPerFrame * tileSizeBytes; + double throughputGBps = (totalBytes / (1024.0 * 1024.0 * 1024.0)) / elapsedSeconds; + + return throughputGBps; + } + + bool createStagingBuffer( + uint32_t bufferSize, + uint32_t memoryTypeBits, + const char* debugName, + smart_refctd_ptr& outBuffer, + IDeviceMemoryAllocator::SAllocation& outAllocation, + void*& outMappedPtr) + { + IGPUBuffer::SCreationParams params; + params.size = bufferSize; + params.usage = IGPUBuffer::EUF_TRANSFER_SRC_BIT; + outBuffer = m_device->createBuffer(std::move(params)); + if (!outBuffer) + return logFail("Failed to create GPU buffer of size %d!\n", bufferSize); + + outBuffer->setObjectDebugName(debugName); + + auto reqs = outBuffer->getMemoryReqs(); + reqs.memoryTypeBits &= memoryTypeBits; + + outAllocation = m_device->allocate(reqs, outBuffer.get(), IDeviceMemoryAllocation::EMAF_NONE); + if (!outAllocation.isValid()) + return logFail("Failed to allocate Device Memory!\n"); + + outMappedPtr = outAllocation.memory->map({0ull, outAllocation.memory->getAllocationSize()}, 
IDeviceMemoryAllocation::EMCAF_READ); + if (!outMappedPtr) + return logFail("Failed to map Device Memory!\n"); + + return true; + } }; - -NBL_MAIN_FUNC(CountingSortApp) \ No newline at end of file +NBL_MAIN_FUNC(ImageUploadBenchmarkApp) From 141295bee833de2fb97bc1ef1e7e8bc8980a643c Mon Sep 17 00:00:00 2001 From: CrabeExtra Date: Wed, 24 Dec 2025 21:09:51 +0330 Subject: [PATCH 3/7] Measurment was wierd, added some detail and also fix a bug related to FIF --- 73_ImageUploadBenchmark/main.cpp | 123 +++++++++++++++++++------------ 1 file changed, 77 insertions(+), 46 deletions(-) diff --git a/73_ImageUploadBenchmark/main.cpp b/73_ImageUploadBenchmark/main.cpp index 68815681d..eceb0f9ea 100644 --- a/73_ImageUploadBenchmark/main.cpp +++ b/73_ImageUploadBenchmark/main.cpp @@ -28,8 +28,8 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp constexpr uint32_t TILE_BYTES_PER_PIXEL = 4; constexpr uint32_t TILE_SIZE_BYTES = TILE_SIZE * TILE_SIZE * TILE_BYTES_PER_PIXEL; constexpr uint32_t STAGING_BUFFER_SIZE = 64 * 1024 * 1024; - constexpr uint32_t TILES_PER_FRAME = STAGING_BUFFER_SIZE / TILE_SIZE_BYTES; constexpr uint32_t FRAMES_IN_FLIGHT = 4; + constexpr uint32_t TILES_PER_FRAME = STAGING_BUFFER_SIZE / (TILE_SIZE_BYTES * FRAMES_IN_FLIGHT); constexpr uint32_t TOTAL_FRAMES = 1000; m_logger->log("GPU Memory Transfer Benchmark", ILogger::ELL_INFO); @@ -40,12 +40,20 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp uint32_t hostVisibleBits = m_physicalDevice->getHostVisibleMemoryTypeBits(); uint32_t deviceLocalBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(); - uint32_t hostVisibleOnlyBits = hostVisibleBits & ~deviceLocalBits; + uint32_t hostCachedBits = m_physicalDevice->getMemoryTypeBitsFromMemoryTypeFlags(IDeviceMemoryAllocation::EMPF_HOST_CACHED_BIT); + + uint32_t hostVisibleOnlyBits = hostVisibleBits & ~deviceLocalBits & ~hostCachedBits; + uint32_t hostVisibleDeviceLocalBits = hostVisibleBits & 
deviceLocalBits; + m_logger->log("Memory type bits - HostVisible: 0x%X, DeviceLocal: 0x%X, HostCached: 0x%X", + ILogger::ELL_INFO, hostVisibleBits, deviceLocalBits, hostCachedBits); + m_logger->log("System RAM (non-cached): 0x%X, VRAM: 0x%X", + ILogger::ELL_INFO, hostVisibleOnlyBits, hostVisibleDeviceLocalBits); + if (!hostVisibleOnlyBits) { - m_logger->log("HOST_VISIBLE memory types not found!", ILogger::ELL_ERROR); + m_logger->log("HOST_VISIBLE non-cached memory types not found!", ILogger::ELL_ERROR); return false; } @@ -122,7 +130,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp if (hostVisibleDeviceLocalBits) { - m_logger->log("\nTesting Strategy 2: VRAM (ReBAR)", ILogger::ELL_INFO); + m_logger->log("\nTesting Strategy 2: VRAM", ILogger::ELL_INFO); double throughputVRAM = 0.0; { @@ -131,13 +139,13 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp void* mappedPtr = nullptr; if (!createStagingBuffer(STAGING_BUFFER_SIZE, hostVisibleDeviceLocalBits, - "Staging Buffer - VRAM (ReBAR)", stagingBuffer, stagingAlloc, mappedPtr)) + "Staging Buffer - VRAM", stagingBuffer, stagingAlloc, mappedPtr)) { return false; } throughputVRAM = runBenchmark( - "VRAM (ReBAR)", + "VRAM", stagingBuffer.get(), mappedPtr, destinationImage.get(), @@ -205,7 +213,8 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp uint32_t tilesPerFrame, uint32_t tileSize, uint32_t tileSizeBytes, - uint32_t imageWidth) + uint32_t imageWidth, + uint32_t bufferBaseOffset) { uint32_t tilesPerRow = imageWidth / tileSize; for (size_t i = 0; i < tilesPerFrame; i++) @@ -213,7 +222,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp uint32_t tileX = (i % tilesPerRow) * tileSize; uint32_t tileY = (i / tilesPerRow) * tileSize; - outRegions[i].bufferOffset = i * tileSizeBytes; + outRegions[i].bufferOffset = bufferBaseOffset + (i * tileSizeBytes); outRegions[i].bufferRowLength = 
tileSize; outRegions[i].bufferImageHeight = tileSize; outRegions[i].imageOffset = { tileX, tileY, 0 }; @@ -225,23 +234,6 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp } } - void generateRandomTileData(void* mappedPtr, uint32_t sizeBytes) - { - uint32_t* data = (uint32_t*)mappedPtr; - unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); - std::mt19937 g(seed); - const uint32_t valueCount = sizeBytes / sizeof(uint32_t); - - auto bufferData = new uint32_t[valueCount]; - - for (uint32_t i = 0; i < valueCount; i++) - { - bufferData[i] = g(); - } - memcpy(mappedPtr, bufferData, sizeBytes); - delete[] bufferData; - } - double runBenchmark( const char* strategyName, IGPUBuffer* stagingBuffer, @@ -305,12 +297,31 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp }; m_device->blockForSemaphores({ &waitInfo, 1 }); - auto regions = new IImage::SBufferCopy[tilesPerFrame]; + uint32_t imageWidth = destinationImage->getCreationParameters().extent.width; + uint32_t partitionSize = tilesPerFrame * tileSizeBytes; - generateRandomTileData(mappedPtr, tilesPerFrame * tileSizeBytes); + // CPU source buffer with random data (generated once, reused each frame) + auto cpuSourceData = new uint8_t[partitionSize]; + { + unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); + std::mt19937 g(seed); + uint32_t* data = reinterpret_cast(cpuSourceData); + for (uint32_t i = 0; i < partitionSize / sizeof(uint32_t); i++) + data[i] = g(); + } - uint32_t imageWidth = destinationImage->getCreationParameters().extent.width; - generateTileCopyRegions(regions, tilesPerFrame, tileSize, tileSizeBytes, imageWidth); + auto regionsPerFrame = new IImage::SBufferCopy*[framesInFlight]; + for (uint32_t i = 0; i < framesInFlight; i++) + { + regionsPerFrame[i] = new IImage::SBufferCopy[tilesPerFrame]; + uint32_t bufferOffset = i * partitionSize; + generateTileCopyRegions(regionsPerFrame[i], 
tilesPerFrame, tileSize, tileSizeBytes, imageWidth, bufferOffset); + } + + double totalWaitTime = 0.0; + double totalMemcpyTime = 0.0; + double totalRecordTime = 0.0; + double totalSubmitTime = 0.0; auto startTime = std::chrono::high_resolution_clock::now(); @@ -318,19 +329,35 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp { uint32_t cmdBufIndex = frame % framesInFlight; - commandBuffers[cmdBufIndex]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + auto t1 = std::chrono::high_resolution_clock::now(); + if (frame >= framesInFlight) + { + ISemaphore::SWaitInfo frameWaitInfo = { + .semaphore = timelineSemaphore.get(), + .value = timelineValue - framesInFlight + 1 + }; + m_device->blockForSemaphores({&frameWaitInfo, 1}); + } + auto t2 = std::chrono::high_resolution_clock::now(); + + commandPools[cmdBufIndex]->reset(); + + uint32_t bufferOffset = cmdBufIndex * partitionSize; + void* targetPtr = static_cast(mappedPtr) + bufferOffset; + memcpy(targetPtr, cpuSourceData, partitionSize); + auto t3 = std::chrono::high_resolution_clock::now(); + commandBuffers[cmdBufIndex]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); commandBuffers[cmdBufIndex]->copyBufferToImage( stagingBuffer, destinationImage, IImage::LAYOUT::TRANSFER_DST_OPTIMAL, tilesPerFrame, - regions + regionsPerFrame[cmdBufIndex] ); - commandBuffers[cmdBufIndex]->end(); + auto t4 = std::chrono::high_resolution_clock::now(); - // Create submit info for THIS frame IQueue::SSubmitInfo frameSubmitInfo = {}; IQueue::SSubmitInfo::SCommandBufferInfo frameCmdBufInfo = {.cmdbuf = commandBuffers[cmdBufIndex].get()}; frameSubmitInfo.commandBuffers = {&frameCmdBufInfo, 1}; @@ -342,18 +369,13 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp }; frameSubmitInfo.signalSemaphores = {&frameSignalInfo, 1}; - // Submit to GPU queue->submit({&frameSubmitInfo, 1}); + auto t5 = std::chrono::high_resolution_clock::now(); - // Wait for old frames - if 
(frame >= framesInFlight) - { - ISemaphore::SWaitInfo frameWaitInfo = { - .semaphore = timelineSemaphore.get(), - .value = timelineValue - framesInFlight - }; - m_device->blockForSemaphores({&frameWaitInfo, 1}); - } + totalWaitTime += std::chrono::duration(t2 - t1).count(); + totalMemcpyTime += std::chrono::duration(t3 - t2).count(); + totalRecordTime += std::chrono::duration(t4 - t3).count(); + totalSubmitTime += std::chrono::duration(t5 - t4).count(); } // Wait for all remaining frames to complete @@ -365,15 +387,24 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp auto endTime = std::chrono::high_resolution_clock::now(); - delete[] regions; + delete[] cpuSourceData; + for (uint32_t i = 0; i < framesInFlight; i++) + delete[] regionsPerFrame[i]; + delete[] regionsPerFrame; delete[] commandPools; delete[] commandBuffers; - // Calculate throughput double elapsedSeconds = std::chrono::duration(endTime - startTime).count(); uint64_t totalBytes = (uint64_t)totalFrames * tilesPerFrame * tileSizeBytes; double throughputGBps = (totalBytes / (1024.0 * 1024.0 * 1024.0)) / elapsedSeconds; + m_logger->log(" Timing breakdown for %s:", ILogger::ELL_INFO, strategyName); + m_logger->log(" Wait time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalWaitTime, 100.0 * totalWaitTime / elapsedSeconds); + m_logger->log(" Memcpy time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalMemcpyTime, 100.0 * totalMemcpyTime / elapsedSeconds); + m_logger->log(" Record time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalRecordTime, 100.0 * totalRecordTime / elapsedSeconds); + m_logger->log(" Submit time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalSubmitTime, 100.0 * totalSubmitTime / elapsedSeconds); + m_logger->log(" Memcpy speed: %.2f GB/s", ILogger::ELL_INFO, (totalBytes / (1024.0 * 1024.0 * 1024.0)) / totalMemcpyTime); + return throughputGBps; } @@ -401,7 +432,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp if (!outAllocation.isValid()) 
return logFail("Failed to allocate Device Memory!\n"); - outMappedPtr = outAllocation.memory->map({0ull, outAllocation.memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_READ); + outMappedPtr = outAllocation.memory->map({0ull, outAllocation.memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_WRITE); if (!outMappedPtr) return logFail("Failed to map Device Memory!\n"); From 874814af7c8dd08c264afbdebef1e0719561dffe Mon Sep 17 00:00:00 2001 From: CrabeExtra Date: Wed, 31 Dec 2025 16:29:19 +0330 Subject: [PATCH 4/7] Resolved PR comments + adding timestamp query --- 73_ImageUploadBenchmark/main.cpp | 159 +++++++++++++++++++++---------- 1 file changed, 110 insertions(+), 49 deletions(-) diff --git a/73_ImageUploadBenchmark/main.cpp b/73_ImageUploadBenchmark/main.cpp index eceb0f9ea..f8124c9ab 100644 --- a/73_ImageUploadBenchmark/main.cpp +++ b/73_ImageUploadBenchmark/main.cpp @@ -1,5 +1,6 @@ #include "nbl/examples/examples.hpp" #include +#include using namespace nbl; using namespace nbl::core; @@ -68,8 +69,9 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp { IGPUImage::SCreationParams imgParams{}; imgParams.type = IImage::E_TYPE::ET_2D; - imgParams.extent.width = TILE_SIZE * 32; - imgParams.extent.height = TILE_SIZE * 32; + uint32_t tilePerRow = (uint32_t)std::sqrt(TILES_PER_FRAME); + imgParams.extent.width = TILE_SIZE * tilePerRow; + imgParams.extent.height = TILE_SIZE * tilePerRow; imgParams.extent.depth = 1u; imgParams.format = asset::E_FORMAT::EF_R8G8B8A8_UNORM; imgParams.mipLevels = 1u; @@ -111,6 +113,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp throughputSystemRAM = runBenchmark( "System RAM", stagingBuffer.get(), + stagingAlloc, mappedPtr, destinationImage.get(), TILE_SIZE, @@ -147,6 +150,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp throughputVRAM = runBenchmark( "VRAM", stagingBuffer.get(), + stagingAlloc, mappedPtr, 
destinationImage.get(), TILE_SIZE, @@ -166,6 +170,9 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp m_logger->log("\nVRAM is %.2fx faster than System RAM", ILogger::ELL_PERFORMANCE, speedup); } + m_logger->log("\nWaiting 5 seconds before exit...", ILogger::ELL_INFO); + std::this_thread::sleep_for(std::chrono::seconds(5)); + return true; } @@ -186,28 +193,6 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp } private: - void transitionImageLayout( - IGPUCommandBuffer* cmdBuf, - IGPUImage* image, - IImage::LAYOUT oldLayout, - IImage::LAYOUT newLayout) - { - IGPUCommandBuffer::SImageMemoryBarrier barrier = {}; - barrier.oldLayout = oldLayout; - barrier.newLayout = newLayout; - barrier.image = image; - barrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; - barrier.subresourceRange.baseMipLevel = 0; - barrier.subresourceRange.levelCount = 1; - barrier.subresourceRange.baseArrayLayer = 0; - barrier.subresourceRange.layerCount = 1; - barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::NONE; - barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; - barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_COMMANDS_BITS; - barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; - cmdBuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&barrier, 1} }); - } - void generateTileCopyRegions( IImage::SBufferCopy* outRegions, uint32_t tilesPerFrame, @@ -237,6 +222,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp double runBenchmark( const char* strategyName, IGPUBuffer* stagingBuffer, + IDeviceMemoryAllocator::SAllocation& stagingAlloc, void* mappedPtr, IGPUImage* destinationImage, uint32_t tileSize, @@ -248,7 +234,16 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp { smart_refctd_ptr timelineSemaphore = m_device->createSemaphore(0); - auto commandPools = new 
smart_refctd_ptr[framesInFlight]; + smart_refctd_ptr queryPool; + { + IQueryPool::SCreationParams queryPoolParams = {}; + queryPoolParams.queryType = IQueryPool::TYPE::TIMESTAMP; + queryPoolParams.queryCount = framesInFlight * 2; + queryPoolParams.pipelineStatisticsFlags = IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE; + queryPool = m_device->createQueryPool(queryPoolParams); + } + + std::vector> commandPools(framesInFlight); for (uint32_t i = 0; i < framesInFlight; i++) { commandPools[i] = m_device->createCommandPool( @@ -256,8 +251,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT ); } - - auto commandBuffers = new smart_refctd_ptr[framesInFlight]; + std::vector> commandBuffers(framesInFlight); for (uint32_t i = 0; i < framesInFlight; i++) { commandPools[i]->createCommandBuffers( @@ -270,12 +264,22 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp uint64_t timelineValue = 0; commandBuffers[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - transitionImageLayout( - commandBuffers[0].get(), - destinationImage, - IImage::LAYOUT::UNDEFINED, - IImage::LAYOUT::TRANSFER_DST_OPTIMAL - ); + { + IGPUCommandBuffer::SImageMemoryBarrier initBarrier = {}; + initBarrier.oldLayout = IImage::LAYOUT::UNDEFINED; + initBarrier.newLayout = IImage::LAYOUT::GENERAL; + initBarrier.image = destinationImage; + initBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + initBarrier.subresourceRange.baseMipLevel = 0; + initBarrier.subresourceRange.levelCount = 1; + initBarrier.subresourceRange.baseArrayLayer = 0; + initBarrier.subresourceRange.layerCount = 1; + initBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::NONE; + initBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; + initBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::NONE; + initBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT; + 
commandBuffers[0]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, {.imgBarriers = {&initBarrier, 1}}); + } commandBuffers[0]->end(); IQueue::SSubmitInfo submitInfo = {}; @@ -300,22 +304,20 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp uint32_t imageWidth = destinationImage->getCreationParameters().extent.width; uint32_t partitionSize = tilesPerFrame * tileSizeBytes; - // CPU source buffer with random data (generated once, reused each frame) - auto cpuSourceData = new uint8_t[partitionSize]; + std::vector cpuSourceData(partitionSize); { unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); std::mt19937 g(seed); - uint32_t* data = reinterpret_cast(cpuSourceData); + uint32_t* data = reinterpret_cast(cpuSourceData.data()); for (uint32_t i = 0; i < partitionSize / sizeof(uint32_t); i++) data[i] = g(); } - - auto regionsPerFrame = new IImage::SBufferCopy*[framesInFlight]; + std::vector> regionsPerFrame(framesInFlight); for (uint32_t i = 0; i < framesInFlight; i++) { - regionsPerFrame[i] = new IImage::SBufferCopy[tilesPerFrame]; + regionsPerFrame[i].resize(tilesPerFrame); uint32_t bufferOffset = i * partitionSize; - generateTileCopyRegions(regionsPerFrame[i], tilesPerFrame, tileSize, tileSizeBytes, imageWidth, bufferOffset); + generateTileCopyRegions(regionsPerFrame[i].data(), tilesPerFrame, tileSize, tileSizeBytes, imageWidth, bufferOffset); } double totalWaitTime = 0.0; @@ -344,17 +346,63 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp uint32_t bufferOffset = cmdBufIndex * partitionSize; void* targetPtr = static_cast(mappedPtr) + bufferOffset; - memcpy(targetPtr, cpuSourceData, partitionSize); + memcpy(targetPtr, cpuSourceData.data(), partitionSize); + + if (!stagingAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + ILogicalDevice::MappedMemoryRange range(stagingAlloc.memory.get(), bufferOffset, partitionSize); + 
m_device->flushMappedMemoryRanges(1, &range); + } + auto t3 = std::chrono::high_resolution_clock::now(); commandBuffers[cmdBufIndex]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + + uint32_t queryStartIndex = cmdBufIndex * 2; + commandBuffers[cmdBufIndex]->resetQueryPool(queryPool.get(), queryStartIndex, 2); + + IGPUCommandBuffer::SImageMemoryBarrier barrier = {}; + barrier.oldLayout = IImage::LAYOUT::GENERAL; + barrier.newLayout = IImage::LAYOUT::GENERAL; + barrier.image = destinationImage; + barrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + barrier.subresourceRange.baseMipLevel = 0; + barrier.subresourceRange.levelCount = 1; + barrier.subresourceRange.baseArrayLayer = 0; + barrier.subresourceRange.layerCount = 1; + barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; + barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; + barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; + barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; + commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, {.imgBarriers = {&barrier, 1}}); + + commandBuffers[cmdBufIndex]->writeTimestamp(PIPELINE_STAGE_FLAGS::COPY_BIT, queryPool.get(), queryStartIndex + 0); + commandBuffers[cmdBufIndex]->copyBufferToImage( stagingBuffer, destinationImage, - IImage::LAYOUT::TRANSFER_DST_OPTIMAL, + IImage::LAYOUT::GENERAL, tilesPerFrame, - regionsPerFrame[cmdBufIndex] + regionsPerFrame[cmdBufIndex].data() ); + + IGPUCommandBuffer::SImageMemoryBarrier afterBarrier = {}; + afterBarrier.oldLayout = IImage::LAYOUT::GENERAL; + afterBarrier.newLayout = IImage::LAYOUT::GENERAL; + afterBarrier.image = destinationImage; + afterBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + afterBarrier.subresourceRange.baseMipLevel = 0; + afterBarrier.subresourceRange.levelCount = 1; + afterBarrier.subresourceRange.baseArrayLayer = 0; + 
afterBarrier.subresourceRange.layerCount = 1; + afterBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; + afterBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS; + afterBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; + afterBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT; + commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, {.imgBarriers = {&afterBarrier, 1}}); + + commandBuffers[cmdBufIndex]->writeTimestamp(PIPELINE_STAGE_FLAGS::COPY_BIT, queryPool.get(), queryStartIndex + 1); + commandBuffers[cmdBufIndex]->end(); auto t4 = std::chrono::high_resolution_clock::now(); @@ -387,17 +435,30 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp auto endTime = std::chrono::high_resolution_clock::now(); - delete[] cpuSourceData; - for (uint32_t i = 0; i < framesInFlight; i++) - delete[] regionsPerFrame[i]; - delete[] regionsPerFrame; - delete[] commandPools; - delete[] commandBuffers; + std::vector timestamps(framesInFlight * 2); + const core::bitflag flags = core::bitflag(IQueryPool::RESULTS_FLAGS::_64_BIT) | core::bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT); + m_device->getQueryPoolResults(queryPool.get(), 0, framesInFlight * 2, timestamps.data(), sizeof(uint64_t), flags); + uint64_t totalGpuTicks = 0; + for (uint32_t i = 0; i < framesInFlight; i++) { + uint64_t startTick = timestamps[i * 2 + 0]; + uint64_t endTick = timestamps[i * 2 + 1]; + totalGpuTicks += (endTick - startTick); + } + float timestampPeriod = m_physicalDevice->getLimits().timestampPeriodInNanoSeconds; + double sampledGpuTimeSeconds = (totalGpuTicks * timestampPeriod) / 1e9; + + double avgGpuTimePerFrame = sampledGpuTimeSeconds / framesInFlight; + double totalGpuTimeSeconds = avgGpuTimePerFrame * totalFrames; + double elapsedSeconds = std::chrono::duration(endTime - startTime).count(); uint64_t totalBytes = (uint64_t)totalFrames * tilesPerFrame * 
tileSizeBytes; + double throughputGBps = (totalBytes / (1024.0 * 1024.0 * 1024.0)) / elapsedSeconds; + m_logger->log(" GPU time: %.3f s", ILogger::ELL_INFO, totalGpuTimeSeconds); + m_logger->log(" GPU throughput: %.2f GB/s", ILogger::ELL_INFO, throughputGBps); + m_logger->log(" Timing breakdown for %s:", ILogger::ELL_INFO, strategyName); m_logger->log(" Wait time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalWaitTime, 100.0 * totalWaitTime / elapsedSeconds); m_logger->log(" Memcpy time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalMemcpyTime, 100.0 * totalMemcpyTime / elapsedSeconds); From ddb7bfc6ae5889aea89db756b461a0beeb763d0f Mon Sep 17 00:00:00 2001 From: CrabeExtra Date: Thu, 1 Jan 2026 17:01:15 +0330 Subject: [PATCH 5/7] Adding more logs to release build --- 73_ImageUploadBenchmark/main.cpp | 36 ++++++++++++++++---------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/73_ImageUploadBenchmark/main.cpp b/73_ImageUploadBenchmark/main.cpp index f8124c9ab..ff38b1555 100644 --- a/73_ImageUploadBenchmark/main.cpp +++ b/73_ImageUploadBenchmark/main.cpp @@ -33,11 +33,11 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp constexpr uint32_t TILES_PER_FRAME = STAGING_BUFFER_SIZE / (TILE_SIZE_BYTES * FRAMES_IN_FLIGHT); constexpr uint32_t TOTAL_FRAMES = 1000; - m_logger->log("GPU Memory Transfer Benchmark", ILogger::ELL_INFO); - m_logger->log("Tile size: %ux%u (%u KB)", ILogger::ELL_INFO, TILE_SIZE, TILE_SIZE, TILE_SIZE_BYTES / 1024); - m_logger->log("Staging buffer: %u MB", ILogger::ELL_INFO, STAGING_BUFFER_SIZE / (1024 * 1024)); - m_logger->log("Tiles per frame: %u", ILogger::ELL_INFO, TILES_PER_FRAME); - m_logger->log("Frames in flight: %u", ILogger::ELL_INFO, FRAMES_IN_FLIGHT); + m_logger->log("GPU Memory Transfer Benchmark", ILogger::ELL_PERFORMANCE); + m_logger->log("Tile size: %ux%u (%u KB)", ILogger::ELL_PERFORMANCE, TILE_SIZE, TILE_SIZE, TILE_SIZE_BYTES / 1024); + m_logger->log("Staging buffer: %u MB", 
ILogger::ELL_PERFORMANCE, STAGING_BUFFER_SIZE / (1024 * 1024)); + m_logger->log("Tiles per frame: %u", ILogger::ELL_PERFORMANCE, TILES_PER_FRAME); + m_logger->log("Frames in flight: %u", ILogger::ELL_PERFORMANCE, FRAMES_IN_FLIGHT); uint32_t hostVisibleBits = m_physicalDevice->getHostVisibleMemoryTypeBits(); uint32_t deviceLocalBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(); @@ -48,9 +48,9 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp uint32_t hostVisibleDeviceLocalBits = hostVisibleBits & deviceLocalBits; m_logger->log("Memory type bits - HostVisible: 0x%X, DeviceLocal: 0x%X, HostCached: 0x%X", - ILogger::ELL_INFO, hostVisibleBits, deviceLocalBits, hostCachedBits); + ILogger::ELL_PERFORMANCE, hostVisibleBits, deviceLocalBits, hostCachedBits); m_logger->log("System RAM (non-cached): 0x%X, VRAM: 0x%X", - ILogger::ELL_INFO, hostVisibleOnlyBits, hostVisibleDeviceLocalBits); + ILogger::ELL_PERFORMANCE, hostVisibleOnlyBits, hostVisibleDeviceLocalBits); if (!hostVisibleOnlyBits) { @@ -96,7 +96,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp return logFail("Failed to allocate DEVICE_LOCAL memory for destination image!\n"); } - m_logger->log("\nTesting Strategy 1: System RAM", ILogger::ELL_INFO); + m_logger->log("\nStrategy 1: System RAM", ILogger::ELL_PERFORMANCE); double throughputSystemRAM = 0.0; { @@ -133,7 +133,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp if (hostVisibleDeviceLocalBits) { - m_logger->log("\nTesting Strategy 2: VRAM", ILogger::ELL_INFO); + m_logger->log("\nStrategy 2: VRAM", ILogger::ELL_PERFORMANCE); double throughputVRAM = 0.0; { @@ -170,7 +170,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp m_logger->log("\nVRAM is %.2fx faster than System RAM", ILogger::ELL_PERFORMANCE, speedup); } - m_logger->log("\nWaiting 5 seconds before exit...", ILogger::ELL_INFO); + m_logger->log("\nWaiting 5 
seconds before exit...", ILogger::ELL_PERFORMANCE); std::this_thread::sleep_for(std::chrono::seconds(5)); return true; @@ -456,15 +456,15 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp double throughputGBps = (totalBytes / (1024.0 * 1024.0 * 1024.0)) / elapsedSeconds; - m_logger->log(" GPU time: %.3f s", ILogger::ELL_INFO, totalGpuTimeSeconds); - m_logger->log(" GPU throughput: %.2f GB/s", ILogger::ELL_INFO, throughputGBps); + m_logger->log(" GPU time: %.3f s", ILogger::ELL_PERFORMANCE, totalGpuTimeSeconds); + m_logger->log(" GPU throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, throughputGBps); - m_logger->log(" Timing breakdown for %s:", ILogger::ELL_INFO, strategyName); - m_logger->log(" Wait time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalWaitTime, 100.0 * totalWaitTime / elapsedSeconds); - m_logger->log(" Memcpy time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalMemcpyTime, 100.0 * totalMemcpyTime / elapsedSeconds); - m_logger->log(" Record time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalRecordTime, 100.0 * totalRecordTime / elapsedSeconds); - m_logger->log(" Submit time: %.3f s (%.1f%%)", ILogger::ELL_INFO, totalSubmitTime, 100.0 * totalSubmitTime / elapsedSeconds); - m_logger->log(" Memcpy speed: %.2f GB/s", ILogger::ELL_INFO, (totalBytes / (1024.0 * 1024.0 * 1024.0)) / totalMemcpyTime); + m_logger->log(" Timing breakdown for %s:", ILogger::ELL_PERFORMANCE, strategyName); + m_logger->log(" Wait time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalWaitTime, 100.0 * totalWaitTime / elapsedSeconds); + m_logger->log(" Memcpy time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalMemcpyTime, 100.0 * totalMemcpyTime / elapsedSeconds); + m_logger->log(" Record time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalRecordTime, 100.0 * totalRecordTime / elapsedSeconds); + m_logger->log(" Submit time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalSubmitTime, 100.0 * totalSubmitTime / elapsedSeconds); + m_logger->log(" Memcpy 
speed: %.2f GB/s", ILogger::ELL_PERFORMANCE, (totalBytes / (1024.0 * 1024.0 * 1024.0)) / totalMemcpyTime); return throughputGBps; } From f1fc8d50a520023dd72ac995175dfe60a64b997e Mon Sep 17 00:00:00 2001 From: CrabeExtra Date: Fri, 30 Jan 2026 17:19:14 +0330 Subject: [PATCH 6/7] Added image to image copy --- 73_ImageUploadBenchmark/main.cpp | 458 ++++++++++++++++++++++++++++++- 1 file changed, 443 insertions(+), 15 deletions(-) diff --git a/73_ImageUploadBenchmark/main.cpp b/73_ImageUploadBenchmark/main.cpp index ff38b1555..1fff59202 100644 --- a/73_ImageUploadBenchmark/main.cpp +++ b/73_ImageUploadBenchmark/main.cpp @@ -43,7 +43,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp uint32_t deviceLocalBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(); uint32_t hostCachedBits = m_physicalDevice->getMemoryTypeBitsFromMemoryTypeFlags(IDeviceMemoryAllocation::EMPF_HOST_CACHED_BIT); - uint32_t hostVisibleOnlyBits = hostVisibleBits & ~deviceLocalBits & ~hostCachedBits; + uint32_t hostVisibleOnlyBits = hostVisibleBits & ~deviceLocalBits; uint32_t hostVisibleDeviceLocalBits = hostVisibleBits & deviceLocalBits; @@ -170,6 +170,146 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp m_logger->log("\nVRAM is %.2fx faster than System RAM", ILogger::ELL_PERFORMANCE, speedup); } + m_device->waitIdle(); + + + m_logger->log("\nStrategy 3: Image-to-Image Staging (OPTIMAL)", ILogger::ELL_PERFORMANCE); + { + std::vector> stagingImages(FRAMES_IN_FLIGHT); + for (uint32_t i = 0; i < FRAMES_IN_FLIGHT; i++) + { + IGPUImage::SCreationParams imgParams{}; + imgParams.type = IImage::E_TYPE::ET_2D; + imgParams.extent.width = TILE_SIZE; + imgParams.extent.height = TILE_SIZE; + imgParams.extent.depth = 1u; + imgParams.format = asset::E_FORMAT::EF_R8G8B8A8_UNORM; + imgParams.mipLevels = 1u; + imgParams.flags = IImage::ECF_NONE; + imgParams.arrayLayers = 1u; + imgParams.samples = IImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT; + 
imgParams.tiling = video::IGPUImage::TILING::OPTIMAL; + imgParams.usage = asset::IImage::EUF_TRANSFER_SRC_BIT; + imgParams.preinitialized = false; + stagingImages[i] = m_device->createImage(std::move(imgParams)); + } + + std::vector imageMemoryOffsets(FRAMES_IN_FLIGHT); + size_t currentOffset = 0; + uint32_t combinedMemoryTypeBits = 0xFFFFFFFF; + uint32_t maxAlignmentLog2 = 0; + for (size_t i = 0; i < FRAMES_IN_FLIGHT; i++) + { + auto memReqs = stagingImages[i]->getMemoryReqs(); + size_t alignment = 1u << memReqs.alignmentLog2; + size_t alignedOffset = (currentOffset + alignment - 1) & ~(alignment - 1); + imageMemoryOffsets[i] = alignedOffset; + currentOffset = alignedOffset + memReqs.size; + combinedMemoryTypeBits &= memReqs.memoryTypeBits; + if (memReqs.alignmentLog2 > maxAlignmentLog2) + maxAlignmentLog2 = memReqs.alignmentLog2; + } + + size_t totalMemorySize = currentOffset; + + + uint32_t compatibleBits = combinedMemoryTypeBits & hostVisibleDeviceLocalBits; + if (!compatibleBits) + compatibleBits = combinedMemoryTypeBits & hostVisibleOnlyBits; + + if (!compatibleBits) + { + m_logger->log("OPTIMAL images don't support HOST_VISIBLE on this GPU!", + ILogger::ELL_ERROR); + return false; + } + + IDeviceMemoryBacked::SDeviceMemoryRequirements memReqs = {}; + memReqs.size = totalMemorySize; + memReqs.memoryTypeBits = compatibleBits; + memReqs.alignmentLog2 = maxAlignmentLog2; + + auto memoryAllocation = m_device->allocate(memReqs,nullptr,IDeviceMemoryAllocation::EMAF_NONE); + if (!memoryAllocation.isValid()) + { + m_logger->log("Failed to allocate HOST_VISIBLE memory for staging images!", ILogger::ELL_ERROR); + } + + for (uint32_t i = 0; i < FRAMES_IN_FLIGHT; i++) + { + ILogicalDevice::SBindImageMemoryInfo info{}; + info.image = stagingImages[i].get(); + info.binding.memory = memoryAllocation.memory.get(); + info.binding.offset = imageMemoryOffsets[i]; + if (!m_device->bindImageMemory({&info,1})) + { + m_logger->log("Failed to bind staging image %u to memory!", 
ILogger::ELL_ERROR, i); + } + } + + void* mappedPtr = memoryAllocation.memory->map({ 0ull,memoryAllocation.memory->getAllocationSize() }, IDeviceMemoryAllocation::EMCAF_WRITE); + if (!mappedPtr) + { + m_logger->log("Failed to map staging image memory!", ILogger::ELL_ERROR); + } + + smart_refctd_ptr transitionCmdPool = m_device->createCommandPool( + queue->getFamilyIndex(), + IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT + ); + + smart_refctd_ptr transitionCmdBuf; + transitionCmdPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &transitionCmdBuf); + + transitionCmdBuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + + + for (uint32_t i = 0; i < FRAMES_IN_FLIGHT; i++) + { + IGPUCommandBuffer::SImageMemoryBarrier barrier = {}; + barrier.oldLayout = IImage::LAYOUT::UNDEFINED; + barrier.newLayout = IImage::LAYOUT::GENERAL; + barrier.image = stagingImages[i].get(); + barrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + barrier.subresourceRange.baseMipLevel = 0; + barrier.subresourceRange.levelCount = 1; + barrier.subresourceRange.baseArrayLayer = 0; + barrier.subresourceRange.layerCount = 1; + barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::NONE; + barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS; + barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::NONE; + barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::HOST_BIT; + + transitionCmdBuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&barrier, 1} }); + } + + transitionCmdBuf->end(); + + IQueue::SSubmitInfo submitInfo = {}; + IQueue::SSubmitInfo::SCommandBufferInfo cmdBufInfo = { .cmdbuf = transitionCmdBuf.get() }; + submitInfo.commandBuffers = { &cmdBufInfo, 1 }; + + queue->submit({ &submitInfo, 1 }); + m_device->waitIdle(); + double throughputImageStaging = runBenchmarkImageStaging( + "Image-to-Image", + stagingImages, + imageMemoryOffsets, + memoryAllocation.memory.get(), + mappedPtr, + 
destinationImage.get(), + TILE_SIZE, + TILE_SIZE_BYTES, + TILES_PER_FRAME, + FRAMES_IN_FLIGHT, + TOTAL_FRAMES, + queue + ); + + m_logger->log("Image-to-Image staging throughput: %.2f GB/s", + ILogger::ELL_PERFORMANCE, throughputImageStaging); + } + m_logger->log("\nWaiting 5 seconds before exit...", ILogger::ELL_PERFORMANCE); std::this_thread::sleep_for(std::chrono::seconds(5)); @@ -238,11 +378,11 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp { IQueryPool::SCreationParams queryPoolParams = {}; queryPoolParams.queryType = IQueryPool::TYPE::TIMESTAMP; - queryPoolParams.queryCount = framesInFlight * 2; + queryPoolParams.queryCount = framesInFlight * 2; queryPoolParams.pipelineStatisticsFlags = IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE; queryPool = m_device->createQueryPool(queryPoolParams); } - + std::vector> commandPools(framesInFlight); for (uint32_t i = 0; i < framesInFlight; i++) { @@ -278,7 +418,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp initBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; initBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::NONE; initBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT; - commandBuffers[0]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, {.imgBarriers = {&initBarrier, 1}}); + commandBuffers[0]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&initBarrier, 1} }); } commandBuffers[0]->end(); @@ -338,7 +478,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp .semaphore = timelineSemaphore.get(), .value = timelineValue - framesInFlight + 1 }; - m_device->blockForSemaphores({&frameWaitInfo, 1}); + m_device->blockForSemaphores({ &frameWaitInfo, 1 }); } auto t2 = std::chrono::high_resolution_clock::now(); @@ -374,7 +514,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp barrier.barrier.dep.dstAccessMask = 
ACCESS_FLAGS::TRANSFER_WRITE_BIT; barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; - commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, {.imgBarriers = {&barrier, 1}}); + commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&barrier, 1} }); commandBuffers[cmdBufIndex]->writeTimestamp(PIPELINE_STAGE_FLAGS::COPY_BIT, queryPool.get(), queryStartIndex + 0); @@ -399,7 +539,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp afterBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS; afterBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; afterBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT; - commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, {.imgBarriers = {&afterBarrier, 1}}); + commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&afterBarrier, 1} }); commandBuffers[cmdBufIndex]->writeTimestamp(PIPELINE_STAGE_FLAGS::COPY_BIT, queryPool.get(), queryStartIndex + 1); @@ -407,17 +547,17 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp auto t4 = std::chrono::high_resolution_clock::now(); IQueue::SSubmitInfo frameSubmitInfo = {}; - IQueue::SSubmitInfo::SCommandBufferInfo frameCmdBufInfo = {.cmdbuf = commandBuffers[cmdBufIndex].get()}; - frameSubmitInfo.commandBuffers = {&frameCmdBufInfo, 1}; + IQueue::SSubmitInfo::SCommandBufferInfo frameCmdBufInfo = { .cmdbuf = commandBuffers[cmdBufIndex].get() }; + frameSubmitInfo.commandBuffers = { &frameCmdBufInfo, 1 }; IQueue::SSubmitInfo::SSemaphoreInfo frameSignalInfo = { .semaphore = timelineSemaphore.get(), .value = ++timelineValue, .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS }; - frameSubmitInfo.signalSemaphores = {&frameSignalInfo, 1}; + 
frameSubmitInfo.signalSemaphores = { &frameSignalInfo, 1 }; - queue->submit({&frameSubmitInfo, 1}); + queue->submit({ &frameSubmitInfo, 1 }); auto t5 = std::chrono::high_resolution_clock::now(); totalWaitTime += std::chrono::duration(t2 - t1).count(); @@ -426,12 +566,11 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp totalSubmitTime += std::chrono::duration(t5 - t4).count(); } - // Wait for all remaining frames to complete ISemaphore::SWaitInfo finalWait = { .semaphore = timelineSemaphore.get(), .value = timelineValue }; - m_device->blockForSemaphores({&finalWait, 1}); + m_device->blockForSemaphores({ &finalWait, 1 }); auto endTime = std::chrono::high_resolution_clock::now(); @@ -469,6 +608,295 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp return throughputGBps; } + + double runBenchmarkImageStaging( + const char* strategyName, + const std::vector>& stagingImages, + const std::vector& imageMemoryOffsets, + IDeviceMemoryAllocation* stagingMemory, + void* mappedPtr, + IGPUImage* destinationImage, + uint32_t tileSize, + uint32_t tileSizeBytes, + uint32_t tilesPerFrame, + uint32_t framesInFlight, + uint32_t totalFrames, + IQueue* queue) + { + smart_refctd_ptr timelineSemaphore = m_device->createSemaphore(0); + + smart_refctd_ptr queryPool; + { + IQueryPool::SCreationParams queryPoolParams = {}; + queryPoolParams.queryType = IQueryPool::TYPE::TIMESTAMP; + queryPoolParams.queryCount = framesInFlight * 2; + queryPoolParams.pipelineStatisticsFlags = IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE; + queryPool = m_device->createQueryPool(queryPoolParams); + } + + std::vector> commandPools(framesInFlight); + for (uint32_t i = 0; i < framesInFlight; i++) + { + commandPools[i] = m_device->createCommandPool( + queue->getFamilyIndex(), + IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT + ); + } + std::vector> commandBuffers(framesInFlight); + for (uint32_t i = 0; i < framesInFlight; i++) + { + 
commandPools[i]->createCommandBuffers( + IGPUCommandPool::BUFFER_LEVEL::PRIMARY, + 1, + &commandBuffers[i] + ); + } + + uint64_t timelineValue = 0; + + commandBuffers[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + { + IGPUCommandBuffer::SImageMemoryBarrier initBarrier = {}; + initBarrier.oldLayout = IImage::LAYOUT::UNDEFINED; + initBarrier.newLayout = IImage::LAYOUT::GENERAL; + initBarrier.image = destinationImage; + initBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + initBarrier.subresourceRange.baseMipLevel = 0; + initBarrier.subresourceRange.levelCount = 1; + initBarrier.subresourceRange.baseArrayLayer = 0; + initBarrier.subresourceRange.layerCount = 1; + initBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::NONE; + initBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; + initBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::NONE; + initBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT; + commandBuffers[0]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&initBarrier, 1} }); + } + commandBuffers[0]->end(); + + IQueue::SSubmitInfo submitInfo = {}; + IQueue::SSubmitInfo::SCommandBufferInfo cmdBufInfo = { .cmdbuf = commandBuffers[0].get() }; + submitInfo.commandBuffers = { &cmdBufInfo, 1 }; + + IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { + .semaphore = timelineSemaphore.get(), + .value = ++timelineValue, + .stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS + }; + submitInfo.signalSemaphores = { &signalInfo, 1 }; + + queue->submit({ &submitInfo, 1 }); + + ISemaphore::SWaitInfo waitInfo = { + .semaphore = timelineSemaphore.get(), + .value = timelineValue + }; + m_device->blockForSemaphores({ &waitInfo, 1 }); + uint32_t imageWidth = destinationImage->getCreationParameters().extent.width; + std::vector testPatternData(tileSizeBytes); + for (uint32_t y = 0; y < tileSize; y++) + { + for (uint32_t x = 0; x < tileSize; x++) + { + uint32_t idx = (y * tileSize + x) 
* 4; + testPatternData[idx + 0] = (x * 2) & 0xFF; + testPatternData[idx + 1] = (y * 2) & 0xFF; + testPatternData[idx + 2] = 128; + testPatternData[idx + 3] = 255; + } + } + + uint32_t tilesPerRow = imageWidth / tileSize; + + double totalWaitTime = 0.0; + double totalMemcpyTime = 0.0; + double totalImageCreateTime = 0.0; + double totalRecordTime = 0.0; + double totalSubmitTime = 0.0; + + auto startTime = std::chrono::high_resolution_clock::now(); + + for (uint32_t frame = 0; frame < totalFrames; frame++) + { + uint32_t cmdBufIndex = frame % framesInFlight; + + auto t1 = std::chrono::high_resolution_clock::now(); + if (frame >= framesInFlight) + { + ISemaphore::SWaitInfo frameWaitInfo = { + .semaphore = timelineSemaphore.get(), + .value = timelineValue - framesInFlight + 1 + }; + m_device->blockForSemaphores({&frameWaitInfo, 1}); + } + auto t2 = std::chrono::high_resolution_clock::now(); + + commandPools[cmdBufIndex]->reset(); + + IGPUImage* stagingImage = stagingImages[cmdBufIndex].get(); + size_t memoryOffset = imageMemoryOffsets[cmdBufIndex]; + + void* targetPtr = static_cast(mappedPtr) + memoryOffset; + memcpy(targetPtr, testPatternData.data(), tileSizeBytes); + + // Flush if not HOST_COHERENT + if (!stagingMemory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + ILogicalDevice::MappedMemoryRange range(stagingMemory, memoryOffset, tileSizeBytes); + m_device->flushMappedMemoryRanges(1, &range); + } + + + auto t3 = std::chrono::high_resolution_clock::now(); + + + + auto t4 = std::chrono::high_resolution_clock::now(); + + commandBuffers[cmdBufIndex]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + + uint32_t queryStartIndex = cmdBufIndex * 2; + commandBuffers[cmdBufIndex]->resetQueryPool(queryPool.get(), queryStartIndex, 2); + + IGPUCommandBuffer::SImageMemoryBarrier stagingBarrier = {}; + stagingBarrier.oldLayout = IImage::LAYOUT::GENERAL; + stagingBarrier.newLayout = IImage::LAYOUT::GENERAL; + stagingBarrier.image = 
stagingImage; + stagingBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + stagingBarrier.subresourceRange.baseMipLevel = 0; + stagingBarrier.subresourceRange.levelCount = 1; + stagingBarrier.subresourceRange.baseArrayLayer = 0; + stagingBarrier.subresourceRange.layerCount = 1; + stagingBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::HOST_WRITE_BIT; + stagingBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_READ_BIT; + stagingBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::HOST_BIT; + stagingBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COPY_BIT; + commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&stagingBarrier, 1} }); + + IGPUCommandBuffer::SImageMemoryBarrier dstBarrier = {}; + dstBarrier.oldLayout = IImage::LAYOUT::GENERAL; + dstBarrier.newLayout = IImage::LAYOUT::GENERAL; + dstBarrier.image = destinationImage; + dstBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + dstBarrier.subresourceRange.baseMipLevel = 0; + dstBarrier.subresourceRange.levelCount = 1; + dstBarrier.subresourceRange.baseArrayLayer = 0; + dstBarrier.subresourceRange.layerCount = 1; + dstBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; + dstBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; + dstBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; + dstBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; + commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, {.imgBarriers = {&dstBarrier, 1}}); + + commandBuffers[cmdBufIndex]->writeTimestamp(PIPELINE_STAGE_FLAGS::COPY_BIT, queryPool.get(), queryStartIndex + 0); + + uint32_t tileIndex = frame % tilesPerRow; + uint32_t tileX = (tileIndex % tilesPerRow) * tileSize; + uint32_t tileY = (tileIndex / tilesPerRow) * tileSize; + + IImage::SImageCopy copyRegion = {}; + copyRegion.srcSubresource.aspectMask = 
IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + copyRegion.srcSubresource.mipLevel = 0; + copyRegion.srcSubresource.baseArrayLayer = 0; + copyRegion.srcSubresource.layerCount = 1; + copyRegion.srcOffset = { 0, 0, 0 }; + copyRegion.dstSubresource = copyRegion.srcSubresource; + copyRegion.dstOffset = { tileX, tileY, 0 }; + copyRegion.extent = { tileSize, tileSize, 1 }; + + commandBuffers[cmdBufIndex]->copyImage( + stagingImage, + IImage::LAYOUT::GENERAL, + destinationImage, + IImage::LAYOUT::GENERAL, + 1, + ©Region + ); + + IGPUCommandBuffer::SImageMemoryBarrier afterBarrier = {}; + afterBarrier.oldLayout = IImage::LAYOUT::GENERAL; + afterBarrier.newLayout = IImage::LAYOUT::GENERAL; + afterBarrier.image = destinationImage; + afterBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + afterBarrier.subresourceRange.baseMipLevel = 0; + afterBarrier.subresourceRange.levelCount = 1; + afterBarrier.subresourceRange.baseArrayLayer = 0; + afterBarrier.subresourceRange.layerCount = 1; + afterBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::TRANSFER_WRITE_BIT; + afterBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS; + afterBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS; + afterBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT; + commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, {.imgBarriers = {&afterBarrier, 1}}); + + commandBuffers[cmdBufIndex]->writeTimestamp(PIPELINE_STAGE_FLAGS::COPY_BIT, queryPool.get(), queryStartIndex + 1); + + commandBuffers[cmdBufIndex]->end(); + auto t5 = std::chrono::high_resolution_clock::now(); + + IQueue::SSubmitInfo frameSubmitInfo = {}; + IQueue::SSubmitInfo::SCommandBufferInfo frameCmdBufInfo = {.cmdbuf = commandBuffers[cmdBufIndex].get()}; + frameSubmitInfo.commandBuffers = {&frameCmdBufInfo, 1}; + + IQueue::SSubmitInfo::SSemaphoreInfo frameSignalInfo = { + .semaphore = timelineSemaphore.get(), + .value = ++timelineValue, + 
.stageMask = PIPELINE_STAGE_FLAGS::ALL_TRANSFER_BITS + }; + frameSubmitInfo.signalSemaphores = {&frameSignalInfo, 1}; + + queue->submit({&frameSubmitInfo, 1}); + auto t6 = std::chrono::high_resolution_clock::now(); + + + + totalWaitTime += std::chrono::duration(t2 - t1).count(); + totalMemcpyTime += std::chrono::duration(t3 - t2).count(); + totalImageCreateTime += std::chrono::duration(t4 - t3).count(); + totalRecordTime += std::chrono::duration(t5 - t4).count(); + totalSubmitTime += std::chrono::duration(t6 - t5).count(); + } + + ISemaphore::SWaitInfo finalWait = { + .semaphore = timelineSemaphore.get(), + .value = timelineValue + }; + m_device->blockForSemaphores({&finalWait, 1}); + + auto endTime = std::chrono::high_resolution_clock::now(); + + std::vector timestamps(framesInFlight * 2); + const core::bitflag flags = core::bitflag(IQueryPool::RESULTS_FLAGS::_64_BIT) | core::bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT); + m_device->getQueryPoolResults(queryPool.get(), 0, framesInFlight * 2, timestamps.data(), sizeof(uint64_t), flags); + uint64_t totalGpuTicks = 0; + for (uint32_t i = 0; i < framesInFlight; i++) { + uint64_t startTick = timestamps[i * 2 + 0]; + uint64_t endTick = timestamps[i * 2 + 1]; + totalGpuTicks += (endTick - startTick); + } + float timestampPeriod = m_physicalDevice->getLimits().timestampPeriodInNanoSeconds; + double sampledGpuTimeSeconds = (totalGpuTicks * timestampPeriod) / 1e9; + + double avgGpuTimePerFrame = sampledGpuTimeSeconds / framesInFlight; + double totalGpuTimeSeconds = avgGpuTimePerFrame * totalFrames; + + double elapsedSeconds = std::chrono::duration(endTime - startTime).count(); + uint64_t totalBytes = (uint64_t)totalFrames * tilesPerFrame * tileSizeBytes; + + double throughputGBps = (totalBytes / (1024.0 * 1024.0 * 1024.0)) / elapsedSeconds; + + m_logger->log(" copyImage time: %.3f s", ILogger::ELL_PERFORMANCE, totalGpuTimeSeconds); + m_logger->log(" Total throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, throughputGBps); + 
+ m_logger->log(" Timing breakdown for %s:", ILogger::ELL_PERFORMANCE, strategyName); + m_logger->log(" Wait time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalWaitTime, 100.0 * totalWaitTime / elapsedSeconds); + m_logger->log(" Memcpy time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalMemcpyTime, 100.0 * totalMemcpyTime / elapsedSeconds); + m_logger->log(" Image create time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalImageCreateTime, 100.0 * totalImageCreateTime / elapsedSeconds); + m_logger->log(" Record time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalRecordTime, 100.0 * totalRecordTime / elapsedSeconds); + m_logger->log(" Submit time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalSubmitTime, 100.0 * totalSubmitTime / elapsedSeconds); + m_logger->log(" Memcpy speed: %.2f GB/s", ILogger::ELL_PERFORMANCE, (totalBytes / (1024.0 * 1024.0 * 1024.0)) / totalMemcpyTime); + + return throughputGBps; + } + bool createStagingBuffer( uint32_t bufferSize, uint32_t memoryTypeBits, @@ -493,7 +921,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp if (!outAllocation.isValid()) return logFail("Failed to allocate Device Memory!\n"); - outMappedPtr = outAllocation.memory->map({0ull, outAllocation.memory->getAllocationSize()}, IDeviceMemoryAllocation::EMCAF_WRITE); + outMappedPtr = outAllocation.memory->map({ 0ull, outAllocation.memory->getAllocationSize() }, IDeviceMemoryAllocation::EMCAF_WRITE); if (!outMappedPtr) return logFail("Failed to map Device Memory!\n"); @@ -501,4 +929,4 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp } }; -NBL_MAIN_FUNC(ImageUploadBenchmarkApp) +NBL_MAIN_FUNC(ImageUploadBenchmarkApp) \ No newline at end of file From 7abe408b3fba6ae0ea896c1c462c51a7a483e506 Mon Sep 17 00:00:00 2001 From: CrabeExtra Date: Thu, 26 Feb 2026 10:35:34 +0330 Subject: [PATCH 7/7] compute shader added --- 73_ImageUploadBenchmark/CMakeLists.txt | 60 +- .../app_resources/common.hlsl | 8 + 
.../app_resources/tile_upload.comp.hlsl | 260 ++++ 73_ImageUploadBenchmark/main.cpp | 1203 ++++++++++++++--- 4 files changed, 1334 insertions(+), 197 deletions(-) create mode 100644 73_ImageUploadBenchmark/app_resources/common.hlsl create mode 100644 73_ImageUploadBenchmark/app_resources/tile_upload.comp.hlsl diff --git a/73_ImageUploadBenchmark/CMakeLists.txt b/73_ImageUploadBenchmark/CMakeLists.txt index 2f9218f93..da95550e7 100644 --- a/73_ImageUploadBenchmark/CMakeLists.txt +++ b/73_ImageUploadBenchmark/CMakeLists.txt @@ -3,4 +3,62 @@ if(NOT RES) message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") endif() -nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") \ No newline at end of file +nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") + +if(NBL_EMBED_BUILTIN_RESOURCES) + set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) + set(RESOURCE_DIR "app_resources") + + get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) + get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) + + file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") + foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) + LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") + endforeach() + + ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") + + LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) +endif() + +set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen") +set(DEPENDS + app_resources/common.hlsl + app_resources/tile_upload.comp.hlsl +) 
+target_sources(${EXECUTABLE_NAME} PRIVATE ${DEPENDS}) +set_source_files_properties(${DEPENDS} PROPERTIES HEADER_FILE_ONLY ON) + +set(SM 6_8) +set(JSON [=[ +[ + { + "INPUT": "app_resources/tile_upload.comp.hlsl", + "KEY": "snakeStore" + } +] +]=]) +string(CONFIGURE "${JSON}" JSON) + +NBL_CREATE_NSC_COMPILE_RULES( + TARGET ${EXECUTABLE_NAME}SPIRV + LINK_TO ${EXECUTABLE_NAME} + DEPENDS ${DEPENDS} + BINARY_DIR ${OUTPUT_DIRECTORY} + MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT + COMMON_OPTIONS -I "${CMAKE_CURRENT_SOURCE_DIR}" -T lib_${SM} + OUTPUT_VAR KEYS + INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp + NAMESPACE nbl::this_example::builtin::build + INPUTS ${JSON} +) + +NBL_CREATE_RESOURCE_ARCHIVE( + NAMESPACE nbl::this_example::builtin::build + TARGET ${EXECUTABLE_NAME}_builtinsBuild + LINK_TO ${EXECUTABLE_NAME} + BIND ${OUTPUT_DIRECTORY} + BUILTINS ${KEYS} +) diff --git a/73_ImageUploadBenchmark/app_resources/common.hlsl b/73_ImageUploadBenchmark/app_resources/common.hlsl new file mode 100644 index 000000000..f86f60fb7 --- /dev/null +++ b/73_ImageUploadBenchmark/app_resources/common.hlsl @@ -0,0 +1,8 @@ +struct PushConstantData +{ + uint64_t deviceBufferAddress; + uint32_t2 dstOffset; + uint32_t srcWidth; + uint32_t srcHeight; + uint32_t tilesPerRow; +}; diff --git a/73_ImageUploadBenchmark/app_resources/tile_upload.comp.hlsl b/73_ImageUploadBenchmark/app_resources/tile_upload.comp.hlsl new file mode 100644 index 000000000..bfec6b9d8 --- /dev/null +++ b/73_ImageUploadBenchmark/app_resources/tile_upload.comp.hlsl @@ -0,0 +1,260 @@ +#include "common.hlsl" + +[[vk::binding(0,0)]] RWTexture2D<float32_t4> dstImage; /* storage image at set 0, binding 0; element type matches the float32_t4 values the kernels store to and load from it */ +[[vk::push_constant]] PushConstantData pc; + +using namespace nbl::hlsl; + +static const uint32_t TILE_WIDTH = 16u; +static const uint32_t TILE_HEIGHT = 8u; + +[numthreads(128, 1, 1)] +[shader("compute")] +void linearStore(uint32_t3 ID : SV_DispatchThreadID) +{ + uint32_t gIdx = ID.x; + uint32_t2 pixelPos = uint32_t2(gIdx % pc.srcWidth, gIdx / 
pc.srcWidth); + + if (pixelPos.x >= pc.srcWidth || pixelPos.y >= pc.srcHeight) + return; + + uint32_t packed = vk::RawBufferLoad(pc.deviceBufferAddress + gIdx * 4u); + + float32_t4 rgba = float32_t4( + float32_t((packed >> 0u) & 0xFFu) / 255.0f, + float32_t((packed >> 8u) & 0xFFu) / 255.0f, + float32_t((packed >> 16u) & 0xFFu) / 255.0f, + float32_t((packed >> 24u) & 0xFFu) / 255.0f + ); + + dstImage[pc.dstOffset + pixelPos] = rgba; +} + +[numthreads(128, 1, 1)] +[shader("compute")] +void linearLoad(uint32_t3 ID : SV_DispatchThreadID) +{ + uint32_t gIdx = ID.x; + uint32_t2 pixelPos = uint32_t2(gIdx % pc.srcWidth, gIdx / pc.srcWidth); + + if (pixelPos.x >= pc.srcWidth || pixelPos.y >= pc.srcHeight) + return; + + float32_t4 color = dstImage[pc.dstOffset + pixelPos]; + + uint32_t r = uint32_t(color.r * 255.0f + 0.5f); + uint32_t g = uint32_t(color.g * 255.0f + 0.5f); + uint32_t b = uint32_t(color.b * 255.0f + 0.5f); + uint32_t a = uint32_t(color.a * 255.0f + 0.5f); + uint32_t packed = (r << 0u) | (g << 8u) | (b << 16u) | (a << 24u); + vk::RawBufferStore(pc.deviceBufferAddress + gIdx * 4u, packed); +} + + +uint32_t2 snakePixelPos(uint32_t gIdx, uint32_t srcWidth) +{ + static const uint32_t PIXELS_PER_TILE = TILE_WIDTH * TILE_HEIGHT; + uint32_t tilesPerRow = srcWidth / TILE_WIDTH; + + uint32_t tileIdx = gIdx / PIXELS_PER_TILE; + uint32_t localIdx = gIdx % PIXELS_PER_TILE; + + uint32_t tileRow = tileIdx / tilesPerRow; + uint32_t tileCol = tileIdx % tilesPerRow; + // Odd rows: reverse X direction + if (tileRow & 1u) + tileCol = tilesPerRow - 1u - tileCol; + + uint32_t localX = localIdx % TILE_WIDTH; + uint32_t localY = localIdx / TILE_WIDTH; + + return uint32_t2( + tileCol * TILE_WIDTH + localX, + tileRow * TILE_HEIGHT + localY + ); +} + +[numthreads(128, 1, 1)] +[shader("compute")] +void SnakeOrderStore(uint32_t3 ID : SV_DispatchThreadID) +{ + uint32_t gIdx = ID.x; + uint32_t2 pixelPos = snakePixelPos(gIdx, pc.srcWidth); + + if (pixelPos.x >= pc.srcWidth || pixelPos.y >= 
pc.srcHeight) + return; + + uint32_t packed = vk::RawBufferLoad(pc.deviceBufferAddress + gIdx * 4u); + + float32_t4 rgba = float32_t4( + float32_t((packed >> 0u) & 0xFFu) / 255.0f, + float32_t((packed >> 8u) & 0xFFu) / 255.0f, + float32_t((packed >> 16u) & 0xFFu) / 255.0f, + float32_t((packed >> 24u) & 0xFFu) / 255.0f + ); + + dstImage[pc.dstOffset + pixelPos] = rgba; +} + +[numthreads(128, 1, 1)] +[shader("compute")] +void SnakeOrderLoad(uint32_t3 ID : SV_DispatchThreadID) +{ + uint32_t gIdx = ID.x; + uint32_t2 pixelPos = snakePixelPos(gIdx, pc.srcWidth); + + if (pixelPos.x >= pc.srcWidth || pixelPos.y >= pc.srcHeight) + return; + + float32_t4 color = dstImage[pc.dstOffset + pixelPos]; + + uint32_t r = uint32_t(color.r * 255.0f + 0.5f); + uint32_t g = uint32_t(color.g * 255.0f + 0.5f); + uint32_t b = uint32_t(color.b * 255.0f + 0.5f); + uint32_t a = uint32_t(color.a * 255.0f + 0.5f); + uint32_t packed = (r << 0u) | (g << 8u) | (b << 16u) | (a << 24u); + + vk::RawBufferStore(pc.deviceBufferAddress + gIdx * 4u, packed); +} + +uint32_t mortonCompact1By1(uint32_t x) +{ + x &= 0x55555555u; + x = (x ^ (x >> 1u)) & 0x33333333u; + x = (x ^ (x >> 2u)) & 0x0f0f0f0fu; + x = (x ^ (x >> 4u)) & 0x00ff00ffu; + x = (x ^ (x >> 8u)) & 0x0000ffffu; + return x; +} + +uint32_t2 mortonDecode(uint32_t code) +{ + return uint32_t2( + mortonCompact1By1(code), + mortonCompact1By1(code >> 1u) + ); +} + +void batchedTileInfo(uint32_t gIdx, uint32_t tileW, uint32_t tileH, uint32_t tilesPerRow, + out uint32_t2 tileBase, out uint32_t localIdx) +{ + uint32_t pixelsPerTile = tileW * tileH; + uint32_t tileIdx = gIdx / pixelsPerTile; + localIdx = gIdx % pixelsPerTile; + uint32_t tileCol = tileIdx % tilesPerRow; + uint32_t tileRow = tileIdx / tilesPerRow; + tileBase = uint32_t2(tileCol * tileW, tileRow * tileH); +} + +float32_t4 unpackRGBA(uint32_t packed) +{ + return float32_t4( + float32_t((packed >> 0u) & 0xFFu) / 255.0f, + float32_t((packed >> 8u) & 0xFFu) / 255.0f, + float32_t((packed >> 16u) & 
0xFFu) / 255.0f, + float32_t((packed >> 24u) & 0xFFu) / 255.0f + ); +} + +[numthreads(128, 1, 1)] +[shader("compute")] +void BatchedLinearStore(uint32_t3 ID : SV_DispatchThreadID) +{ + uint32_t gIdx = ID.x; + uint32_t2 tileBase; + uint32_t localIdx; + batchedTileInfo(gIdx, pc.srcWidth, pc.srcHeight, pc.tilesPerRow, tileBase, localIdx); + + uint32_t2 localPos = uint32_t2(localIdx % pc.srcWidth, localIdx / pc.srcWidth); + uint32_t2 pixelPos = tileBase + localPos; + + uint32_t packed = vk::RawBufferLoad(pc.deviceBufferAddress + gIdx * 4u); + dstImage[pixelPos] = unpackRGBA(packed); +} + +[numthreads(128, 1, 1)] +[shader("compute")] +void BatchedSnakeStore(uint32_t3 ID : SV_DispatchThreadID) +{ + uint32_t gIdx = ID.x; + uint32_t2 tileBase; + uint32_t localIdx; + batchedTileInfo(gIdx, pc.srcWidth, pc.srcHeight, pc.tilesPerRow, tileBase, localIdx); + + // Snake within tile row-major with zigzag on odd tile rows + uint32_t localTilesPerRow = pc.srcWidth / TILE_WIDTH; + uint32_t subTileIdx = localIdx / (TILE_WIDTH * TILE_HEIGHT); + uint32_t subLocalIdx = localIdx % (TILE_WIDTH * TILE_HEIGHT); + uint32_t subRow = subTileIdx / localTilesPerRow; + uint32_t subCol = subTileIdx % localTilesPerRow; + if (subRow & 1u) + subCol = localTilesPerRow - 1u - subCol; + uint32_t localX = subCol * TILE_WIDTH + (subLocalIdx % TILE_WIDTH); + uint32_t localY = subRow * TILE_HEIGHT + (subLocalIdx / TILE_WIDTH); + uint32_t2 pixelPos = tileBase + uint32_t2(localX, localY); + + uint32_t packed = vk::RawBufferLoad(pc.deviceBufferAddress + gIdx * 4u); + dstImage[pixelPos] = unpackRGBA(packed); +} + +[numthreads(128, 1, 1)] +[shader("compute")] +void BatchedMortonStore(uint32_t3 ID : SV_DispatchThreadID) +{ + uint32_t gIdx = ID.x; + uint32_t2 tileBase; + uint32_t localIdx; + batchedTileInfo(gIdx, pc.srcWidth, pc.srcHeight, pc.tilesPerRow, tileBase, localIdx); + + uint32_t2 localPos = mortonDecode(localIdx); + uint32_t2 pixelPos = tileBase + localPos; + + if (localPos.x >= pc.srcWidth || localPos.y 
>= pc.srcHeight) + return; + + uint32_t packed = vk::RawBufferLoad(pc.deviceBufferAddress + gIdx * 4u); + dstImage[pixelPos] = unpackRGBA(packed); +} + +[numthreads(128, 1, 1)] +[shader("compute")] +void MortonOrderStore(uint32_t3 ID : SV_DispatchThreadID) +{ + uint32_t gIdx = ID.x; + uint32_t2 pixelPos = mortonDecode(gIdx); + + if (pixelPos.x >= pc.srcWidth || pixelPos.y >= pc.srcHeight) + return; + + uint32_t packed = vk::RawBufferLoad(pc.deviceBufferAddress + gIdx * 4u); + + float32_t4 rgba = float32_t4( + float32_t((packed >> 0u) & 0xFFu) / 255.0f, + float32_t((packed >> 8u) & 0xFFu) / 255.0f, + float32_t((packed >> 16u) & 0xFFu) / 255.0f, + float32_t((packed >> 24u) & 0xFFu) / 255.0f + ); + + dstImage[pc.dstOffset + pixelPos] = rgba; +} + +[numthreads(128, 1, 1)] +[shader("compute")] +void MortonOrderLoad(uint32_t3 ID : SV_DispatchThreadID) +{ + uint32_t gIdx = ID.x; + uint32_t2 pixelPos = mortonDecode(gIdx); + + if (pixelPos.x >= pc.srcWidth || pixelPos.y >= pc.srcHeight) + return; + + float32_t4 color = dstImage[pc.dstOffset + pixelPos]; + + uint32_t r = uint32_t(color.r * 255.0f + 0.5f); + uint32_t g = uint32_t(color.g * 255.0f + 0.5f); + uint32_t b = uint32_t(color.b * 255.0f + 0.5f); + uint32_t a = uint32_t(color.a * 255.0f + 0.5f); + uint32_t packed = (r << 0u) | (g << 8u) | (b << 16u) | (a << 24u); + + vk::RawBufferStore(pc.deviceBufferAddress + gIdx * 4u, packed); +} \ No newline at end of file diff --git a/73_ImageUploadBenchmark/main.cpp b/73_ImageUploadBenchmark/main.cpp index 1fff59202..094e3c2f7 100644 --- a/73_ImageUploadBenchmark/main.cpp +++ b/73_ImageUploadBenchmark/main.cpp @@ -1,4 +1,5 @@ #include "nbl/examples/examples.hpp" +#include "nbl/this_example/builtin/build/spirv/keys.hpp" #include #include @@ -25,9 +26,6 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp if (!asset_base_t::onAppInitialized(std::move(system))) return false; - constexpr uint32_t TILE_SIZE = 128; - constexpr uint32_t 
TILE_BYTES_PER_PIXEL = 4; - constexpr uint32_t TILE_SIZE_BYTES = TILE_SIZE * TILE_SIZE * TILE_BYTES_PER_PIXEL; constexpr uint32_t STAGING_BUFFER_SIZE = 64 * 1024 * 1024; constexpr uint32_t FRAMES_IN_FLIGHT = 4; constexpr uint32_t TILES_PER_FRAME = STAGING_BUFFER_SIZE / (TILE_SIZE_BYTES * FRAMES_IN_FLIGHT); @@ -47,7 +45,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp uint32_t hostVisibleDeviceLocalBits = hostVisibleBits & deviceLocalBits; - m_logger->log("Memory type bits - HostVisible: 0x%X, DeviceLocal: 0x%X, HostCached: 0x%X", + m_logger->log("Memory type bits HostVisible: 0x%X, DeviceLocal: 0x%X, HostCached: 0x%X", ILogger::ELL_PERFORMANCE, hostVisibleBits, deviceLocalBits, hostCachedBits); m_logger->log("System RAM (non-cached): 0x%X, VRAM: 0x%X", ILogger::ELL_PERFORMANCE, hostVisibleOnlyBits, hostVisibleDeviceLocalBits); @@ -64,8 +62,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp return false; } - IQueue* queue = getQueue(IQueue::FAMILY_FLAGS::GRAPHICS_BIT); - smart_refctd_ptr destinationImage; + m_queue = getQueue(IQueue::FAMILY_FLAGS::GRAPHICS_BIT); { IGPUImage::SCreationParams imgParams{}; imgParams.type = IImage::E_TYPE::ET_2D; @@ -79,246 +76,607 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp imgParams.arrayLayers = 1u; imgParams.samples = IImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT; imgParams.tiling = video::IGPUImage::TILING::OPTIMAL; - imgParams.usage = asset::IImage::EUF_TRANSFER_DST_BIT; + imgParams.usage = asset::IImage::EUF_TRANSFER_DST_BIT | asset::IImage::EUF_STORAGE_BIT; imgParams.preinitialized = false; - destinationImage = m_device->createImage(std::move(imgParams)); - if (!destinationImage) + m_destinationImage = m_device->createImage(std::move(imgParams)); + if (!m_destinationImage) return logFail("Failed to create destination image!\n"); - destinationImage->setObjectDebugName("Destination Image"); + 
m_destinationImage->setObjectDebugName("Destination Image"); - auto reqs = destinationImage->getMemoryReqs(); + auto reqs = m_destinationImage->getMemoryReqs(); reqs.memoryTypeBits &= deviceLocalBits; - auto allocation = m_device->allocate(reqs, destinationImage.get(), IDeviceMemoryAllocation::EMAF_NONE); + auto allocation = m_device->allocate(reqs, m_destinationImage.get(), IDeviceMemoryAllocation::EMAF_NONE); if (!allocation.isValid()) return logFail("Failed to allocate DEVICE_LOCAL memory for destination image!\n"); } - m_logger->log("\nStrategy 1: System RAM", ILogger::ELL_PERFORMANCE); - - double throughputSystemRAM = 0.0; + //compute shader + auto loadPrecompiledShader = [&]()->smart_refctd_ptr /* NOTE(review): the lambda's template parameter list and the smart_refctd_ptr element type look stripped by extraction here - confirm against the upstream file */ { - smart_refctd_ptr stagingBuffer; - IDeviceMemoryAllocator::SAllocation stagingAlloc; - void* mappedPtr = nullptr; + IAssetLoader::SAssetLoadParams lp = {}; + lp.logger = m_logger.get(); + lp.workingDirectory = "app_resources"; + + auto key = nbl::this_example::builtin::build::get_spirv_key(m_physicalDevice->getLimits(), m_physicalDevice->getFeatures()); + m_logger->log("Loading shader with key: %s", ILogger::ELL_INFO, key.data()); - if (!createStagingBuffer(STAGING_BUFFER_SIZE, hostVisibleOnlyBits, - "Staging Buffer - System RAM", stagingBuffer, stagingAlloc, mappedPtr)) + auto assetBundle = m_assetMgr->getAsset(key.data(), lp); + const auto assets = assetBundle.getContents(); + if (assets.empty()) { - return false; + m_logger->log("Asset bundle is empty for key: %s", ILogger::ELL_ERROR, key.data()); + return smart_refctd_ptr(nullptr); } - throughputSystemRAM = runBenchmark( - "System RAM", - stagingBuffer.get(), - stagingAlloc, - mappedPtr, - destinationImage.get(), - TILE_SIZE, - TILE_SIZE_BYTES, - TILES_PER_FRAME, - FRAMES_IN_FLIGHT, - TOTAL_FRAMES, - queue - ); + m_logger->log("Asset count: %u, asset type: %u", ILogger::ELL_INFO, static_cast<uint32_t>(assets.size()) /* size() is not a 32-bit unsigned; narrow it so the argument matches the %u conversion */, (uint32_t)assets[0]->getAssetType()); - stagingAlloc.memory->unmap(); - } - - m_logger->log("System RAM throughput: 
%.2f GB/s", ILogger::ELL_PERFORMANCE, throughputSystemRAM); + auto shader = IAsset::castDown(assets[0]); + return shader; + }; - m_device->waitIdle(); - if (hostVisibleDeviceLocalBits) + //Setup compute shader resources + m_logger->log("\n=== Setting up Compute Shaders (Linear + Snake + Morton) ===", ILogger::ELL_PERFORMANCE); { - m_logger->log("\nStrategy 2: VRAM", ILogger::ELL_PERFORMANCE); + auto shaderLib = loadPrecompiledShader.operator()<"snakeStore">(); + if (!shaderLib) + return logFail("Failed to load shader library!\n"); + + IGPUDescriptorSetLayout::SBinding dsBinding = { + .binding = 0, + .type = IDescriptor::E_TYPE::ET_STORAGE_IMAGE, + .createFlags = IGPUDescriptorSetLayout::SBinding::E_CREATE_FLAGS::ECF_NONE, + .stageFlags = IShader::E_SHADER_STAGE::ESS_COMPUTE, + .count = 1 + }; + auto dsLayout = m_device->createDescriptorSetLayout({&dsBinding, 1}); + if (!dsLayout) + return logFail("Failed to create descriptor set layout!\n"); + + asset::SPushConstantRange pcRange = { + .stageFlags = hlsl::ShaderStage::ESS_COMPUTE, + .offset = 0, + .size = sizeof(SPushConstantData) + }; - double throughputVRAM = 0.0; - { - smart_refctd_ptr stagingBuffer; - IDeviceMemoryAllocator::SAllocation stagingAlloc; - void* mappedPtr = nullptr; + m_pipelineLayout = m_device->createPipelineLayout({&pcRange, 1}, smart_refctd_ptr(dsLayout)); + if (!m_pipelineLayout) + return logFail("Failed to create pipeline layout!\n"); - if (!createStagingBuffer(STAGING_BUFFER_SIZE, hostVisibleDeviceLocalBits, - "Staging Buffer - VRAM", stagingBuffer, stagingAlloc, mappedPtr)) - { - return false; - } + IGPUComputePipeline::SCreationParams storeParams = {}; + storeParams.layout = m_pipelineLayout.get(); + storeParams.shader.shader = shaderLib.get(); + storeParams.shader.entryPoint = "linearStore"; - throughputVRAM = runBenchmark( - "VRAM", - stagingBuffer.get(), - stagingAlloc, - mappedPtr, - destinationImage.get(), - TILE_SIZE, - TILE_SIZE_BYTES, - TILES_PER_FRAME, - FRAMES_IN_FLIGHT, - 
TOTAL_FRAMES, - queue - ); - - stagingAlloc.memory->unmap(); - } + if (!m_device->createComputePipelines(nullptr, {&storeParams, 1}, &m_storePipeline)) + return logFail("Failed to create linearStore pipeline!\n"); - m_logger->log("VRAM throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, throughputVRAM); + IGPUComputePipeline::SCreationParams loadParams = {}; + loadParams.layout = m_pipelineLayout.get(); + loadParams.shader.shader = shaderLib.get(); + loadParams.shader.entryPoint = "linearLoad"; - double speedup = throughputVRAM / throughputSystemRAM; - m_logger->log("\nVRAM is %.2fx faster than System RAM", ILogger::ELL_PERFORMANCE, speedup); - } + if (!m_device->createComputePipelines(nullptr, {&loadParams, 1}, &m_loadPipeline)) + return logFail("Failed to create linearLoad pipeline!\n"); - m_device->waitIdle(); + IGPUComputePipeline::SCreationParams snakeStoreParams = {}; + snakeStoreParams.layout = m_pipelineLayout.get(); + snakeStoreParams.shader.shader = shaderLib.get(); + snakeStoreParams.shader.entryPoint = "SnakeOrderStore"; + if (!m_device->createComputePipelines(nullptr, {&snakeStoreParams, 1}, &m_snakeStorePipeline)) + return logFail("Failed to create SnakeOrderStore pipeline!\n"); - m_logger->log("\nStrategy 3: Image-to-Image Staging (OPTIMAL)", ILogger::ELL_PERFORMANCE); - { - std::vector> stagingImages(FRAMES_IN_FLIGHT); - for (uint32_t i = 0; i < FRAMES_IN_FLIGHT; i++) - { - IGPUImage::SCreationParams imgParams{}; - imgParams.type = IImage::E_TYPE::ET_2D; - imgParams.extent.width = TILE_SIZE; - imgParams.extent.height = TILE_SIZE; - imgParams.extent.depth = 1u; - imgParams.format = asset::E_FORMAT::EF_R8G8B8A8_UNORM; - imgParams.mipLevels = 1u; - imgParams.flags = IImage::ECF_NONE; - imgParams.arrayLayers = 1u; - imgParams.samples = IImage::E_SAMPLE_COUNT_FLAGS::ESCF_1_BIT; - imgParams.tiling = video::IGPUImage::TILING::OPTIMAL; - imgParams.usage = asset::IImage::EUF_TRANSFER_SRC_BIT; - imgParams.preinitialized = false; - stagingImages[i] = 
m_device->createImage(std::move(imgParams)); - } + IGPUComputePipeline::SCreationParams snakeLoadParams = {}; + snakeLoadParams.layout = m_pipelineLayout.get(); + snakeLoadParams.shader.shader = shaderLib.get(); + snakeLoadParams.shader.entryPoint = "SnakeOrderLoad"; - std::vector imageMemoryOffsets(FRAMES_IN_FLIGHT); - size_t currentOffset = 0; - uint32_t combinedMemoryTypeBits = 0xFFFFFFFF; - uint32_t maxAlignmentLog2 = 0; - for (size_t i = 0; i < FRAMES_IN_FLIGHT; i++) - { - auto memReqs = stagingImages[i]->getMemoryReqs(); - size_t alignment = 1u << memReqs.alignmentLog2; - size_t alignedOffset = (currentOffset + alignment - 1) & ~(alignment - 1); - imageMemoryOffsets[i] = alignedOffset; - currentOffset = alignedOffset + memReqs.size; - combinedMemoryTypeBits &= memReqs.memoryTypeBits; - if (memReqs.alignmentLog2 > maxAlignmentLog2) - maxAlignmentLog2 = memReqs.alignmentLog2; - } + if (!m_device->createComputePipelines(nullptr, {&snakeLoadParams, 1}, &m_snakeLoadPipeline)) + return logFail("Failed to create SnakeOrderLoad pipeline!\n"); + + IGPUComputePipeline::SCreationParams mortonStoreParams = {}; + mortonStoreParams.layout = m_pipelineLayout.get(); + mortonStoreParams.shader.shader = shaderLib.get(); + mortonStoreParams.shader.entryPoint = "MortonOrderStore"; - size_t totalMemorySize = currentOffset; + if (!m_device->createComputePipelines(nullptr, {&mortonStoreParams, 1}, &m_mortonStorePipeline)) + return logFail("Failed to create MortonOrderStore pipeline!\n"); + IGPUComputePipeline::SCreationParams mortonLoadParams = {}; + mortonLoadParams.layout = m_pipelineLayout.get(); + mortonLoadParams.shader.shader = shaderLib.get(); + mortonLoadParams.shader.entryPoint = "MortonOrderLoad"; - uint32_t compatibleBits = combinedMemoryTypeBits & hostVisibleDeviceLocalBits; - if (!compatibleBits) - compatibleBits = combinedMemoryTypeBits & hostVisibleOnlyBits; + if (!m_device->createComputePipelines(nullptr, {&mortonLoadParams, 1}, &m_mortonLoadPipeline)) + return 
logFail("Failed to create MortonOrderLoad pipeline!\n"); - if (!compatibleBits) + auto createBatchedPipeline = [&](const char* entryPoint, smart_refctd_ptr& outPipeline) -> bool { - m_logger->log("OPTIMAL images don't support HOST_VISIBLE on this GPU!", - ILogger::ELL_ERROR); + IGPUComputePipeline::SCreationParams params = {}; + params.layout = m_pipelineLayout.get(); + params.shader.shader = shaderLib.get(); + params.shader.entryPoint = entryPoint; + if (!m_device->createComputePipelines(nullptr, {¶ms, 1}, &outPipeline)) + return logFail("Failed to create %s pipeline!\n", entryPoint); + return true; + }; + + if (!createBatchedPipeline("BatchedLinearStore", m_batchedLinearPipeline)) return false; + if (!createBatchedPipeline("BatchedSnakeStore", m_batchedSnakePipeline)) return false; + if (!createBatchedPipeline("BatchedMortonStore", m_batchedMortonPipeline)) return false; + + auto imageView = m_device->createImageView({ + .flags = IGPUImageView::ECF_NONE, + .subUsages = IGPUImage::EUF_STORAGE_BIT, + .image = smart_refctd_ptr(m_destinationImage), + .viewType = IGPUImageView::E_TYPE::ET_2D, + .format = asset::E_FORMAT::EF_R8G8B8A8_UNORM + }); + if (!imageView) + return logFail("Failed to create image view!\n"); + + uint32_t setCount = 1; + auto dsPool = m_device->createDescriptorPoolForDSLayouts( + IDescriptorPool::ECF_NONE, {&dsLayout.get(), 1}, &setCount); + m_ds = dsPool->createDescriptorSet(smart_refctd_ptr(dsLayout)); + + IGPUDescriptorSet::SDescriptorInfo imgInfo = {}; + imgInfo.desc = imageView; + imgInfo.info.image.imageLayout = IGPUImage::LAYOUT::GENERAL; + + IGPUDescriptorSet::SWriteDescriptorSet dsWrite = { + .dstSet = m_ds.get(), + .binding = 0, + .arrayElement = 0, + .count = 1, + .info = &imgInfo + }; + m_device->updateDescriptorSets({&dsWrite, 1}, {}); + + if (!createStagingBuffer(TILE_SIZE_BYTES, hostVisibleOnlyBits, + "Verify Staging Buffer", m_stagingBuffer, m_stagingAlloc, m_stagingMappedPtr)) return false; - } - 
IDeviceMemoryBacked::SDeviceMemoryRequirements memReqs = {}; - memReqs.size = totalMemorySize; - memReqs.memoryTypeBits = compatibleBits; - memReqs.alignmentLog2 = maxAlignmentLog2; + if (!createStagingBuffer(TILE_SIZE_BYTES, hostVisibleOnlyBits, + "Verify Readback Buffer", m_readbackBuffer, m_readbackAlloc, m_readbackMappedPtr)) + return false; - auto memoryAllocation = m_device->allocate(memReqs,nullptr,IDeviceMemoryAllocation::EMAF_NONE); - if (!memoryAllocation.isValid()) - { - m_logger->log("Failed to allocate HOST_VISIBLE memory for staging images!", ILogger::ELL_ERROR); - } + if (!createStagingBuffer(TILE_SIZE_BYTES, hostVisibleOnlyBits, + "Snake Readback Buffer", m_snakeReadbackBuffer, m_snakeReadbackAlloc, m_snakeReadbackMappedPtr)) + return false; + + if (!createStagingBuffer(TILE_SIZE_BYTES, hostVisibleOnlyBits, + "Morton Readback Buffer", m_mortonReadbackBuffer, m_mortonReadbackAlloc, m_mortonReadbackMappedPtr)) + return false; - for (uint32_t i = 0; i < FRAMES_IN_FLIGHT; i++) { - ILogicalDevice::SBindImageMemoryInfo info{}; - info.image = stagingImages[i].get(); - info.binding.memory = memoryAllocation.memory.get(); - info.binding.offset = imageMemoryOffsets[i]; - if (!m_device->bindImageMemory({&info,1})) + uint32_t* pixels = static_cast<uint32_t*>(m_stagingMappedPtr); /* view the mapped staging memory as packed 32-bit pixels */ + uint32_t totalPixels = TILE_SIZE * TILE_SIZE; + for (uint32_t i = 0; i < totalPixels; i++) { - m_logger->log("Failed to bind staging image %u to memory!", ILogger::ELL_ERROR, i); + uint8_t val = static_cast<uint8_t>(i & 0xFF); /* low byte of the pixel index, replicated into all four 8-bit channels on the next line */ + pixels[i] = val | (val << 8u) | (val << 16u) | (val << 24u); } - } - void* mappedPtr = memoryAllocation.memory->map({ 0ull,memoryAllocation.memory->getAllocationSize() }, IDeviceMemoryAllocation::EMCAF_WRITE); - if (!mappedPtr) - { - m_logger->log("Failed to map staging image memory!", ILogger::ELL_ERROR); + if (!m_stagingAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + ILogicalDevice::MappedMemoryRange 
range(m_stagingAlloc.memory.get(), 0, TILE_SIZE_BYTES); + m_device->flushMappedMemoryRanges(1, &range); + } } - smart_refctd_ptr transitionCmdPool = m_device->createCommandPool( - queue->getFamilyIndex(), + m_cmdPool = m_device->createCommandPool( + m_queue->getFamilyIndex(), IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT ); + m_cmdPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &m_cmdbuf); + m_sem = m_device->createSemaphore(0); - smart_refctd_ptr transitionCmdBuf; - transitionCmdPool->createCommandBuffers(IGPUCommandPool::BUFFER_LEVEL::PRIMARY, 1, &transitionCmdBuf); - - transitionCmdBuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); - - - for (uint32_t i = 0; i < FRAMES_IN_FLIGHT; i++) + m_cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); { - IGPUCommandBuffer::SImageMemoryBarrier barrier = {}; - barrier.oldLayout = IImage::LAYOUT::UNDEFINED; - barrier.newLayout = IImage::LAYOUT::GENERAL; - barrier.image = stagingImages[i].get(); - barrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; - barrier.subresourceRange.baseMipLevel = 0; - barrier.subresourceRange.levelCount = 1; - barrier.subresourceRange.baseArrayLayer = 0; - barrier.subresourceRange.layerCount = 1; - barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::NONE; - barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::MEMORY_WRITE_BITS; - barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::NONE; - barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::HOST_BIT; - - transitionCmdBuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&barrier, 1} }); + IGPUCommandBuffer::SImageMemoryBarrier initBarrier = {}; + initBarrier.oldLayout = IImage::LAYOUT::UNDEFINED; + initBarrier.newLayout = IImage::LAYOUT::GENERAL; + initBarrier.image = m_destinationImage.get(); + initBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + initBarrier.subresourceRange.baseMipLevel = 0; + 
initBarrier.subresourceRange.levelCount = 1; + initBarrier.subresourceRange.baseArrayLayer = 0; + initBarrier.subresourceRange.layerCount = 1; + initBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::NONE; + initBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS; + initBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::NONE; + initBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&initBarrier, 1} }); } - - transitionCmdBuf->end(); + m_cmdbuf->end(); IQueue::SSubmitInfo submitInfo = {}; - IQueue::SSubmitInfo::SCommandBufferInfo cmdBufInfo = { .cmdbuf = transitionCmdBuf.get() }; + IQueue::SSubmitInfo::SCommandBufferInfo cmdBufInfo = { .cmdbuf = m_cmdbuf.get() }; submitInfo.commandBuffers = { &cmdBufInfo, 1 }; - queue->submit({ &submitInfo, 1 }); - m_device->waitIdle(); - double throughputImageStaging = runBenchmarkImageStaging( - "Image-to-Image", - stagingImages, - imageMemoryOffsets, - memoryAllocation.memory.get(), - mappedPtr, - destinationImage.get(), - TILE_SIZE, - TILE_SIZE_BYTES, - TILES_PER_FRAME, - FRAMES_IN_FLIGHT, - TOTAL_FRAMES, - queue - ); + IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { + .semaphore = m_sem.get(), + .value = 1, + .stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT + }; + submitInfo.signalSemaphores = { &signalInfo, 1 }; - m_logger->log("Image-to-Image staging throughput: %.2f GB/s", - ILogger::ELL_PERFORMANCE, throughputImageStaging); + m_queue->submit({ &submitInfo, 1 }); + + ISemaphore::SWaitInfo waitInfo = { .semaphore = m_sem.get(), .value = 1 }; + m_device->blockForSemaphores({ &waitInfo, 1 }); } - m_logger->log("\nWaiting 5 seconds before exit...", ILogger::ELL_PERFORMANCE); - std::this_thread::sleep_for(std::chrono::seconds(5)); + m_logger->log("Setup complete. 
Running verification loop (%u frames)", ILogger::ELL_PERFORMANCE, VERIFICATION_LOOP_COUNT); return true; } - bool keepRunning() override { return false; } - void workLoopBody() override {} - bool onAppTerminated() override { return true; } + bool keepRunning() override { return m_frameIndex < VERIFICATION_LOOP_COUNT; } + + void workLoopBody() override + { + m_cmdPool->reset(); + + //Clear readback buffers to zero + memset(m_readbackMappedPtr, 0, TILE_SIZE_BYTES); + if (!m_readbackAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + ILogicalDevice::MappedMemoryRange range(m_readbackAlloc.memory.get(), 0, TILE_SIZE_BYTES); + m_device->flushMappedMemoryRanges(1, &range); + } + memset(m_snakeReadbackMappedPtr, 0, TILE_SIZE_BYTES); + if (!m_snakeReadbackAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + ILogicalDevice::MappedMemoryRange range(m_snakeReadbackAlloc.memory.get(), 0, TILE_SIZE_BYTES); + m_device->flushMappedMemoryRanges(1, &range); + } + memset(m_mortonReadbackMappedPtr, 0, TILE_SIZE_BYTES); + if (!m_mortonReadbackAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + ILogicalDevice::MappedMemoryRange range(m_mortonReadbackAlloc.memory.get(), 0, TILE_SIZE_BYTES); + m_device->flushMappedMemoryRanges(1, &range); + } + + m_cmdbuf->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + + { + IGPUCommandBuffer::SImageMemoryBarrier barrier = {}; + barrier.oldLayout = IImage::LAYOUT::GENERAL; + barrier.newLayout = IImage::LAYOUT::GENERAL; + barrier.image = m_destinationImage.get(); + barrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + barrier.subresourceRange.baseMipLevel = 0; + barrier.subresourceRange.levelCount = 1; + barrier.subresourceRange.baseArrayLayer = 0; + barrier.subresourceRange.layerCount = 1; + barrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS; + 
barrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS; + barrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + barrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&barrier, 1} }); + } + + m_cmdbuf->bindComputePipeline(m_storePipeline.get()); + const IGPUDescriptorSet* sets[] = { m_ds.get() }; + m_cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pipelineLayout.get(), 0, 1, sets); + + SPushConstantData storePc = { + .deviceBufferAddress = m_stagingBuffer->getDeviceAddress(), + .dstOffsetX = 0, + .dstOffsetY = 0, + .srcWidth = TILE_SIZE, + .srcHeight = TILE_SIZE + }; + m_cmdbuf->pushConstants(m_pipelineLayout.get(), hlsl::ShaderStage::ESS_COMPUTE, 0, sizeof(SPushConstantData), &storePc); + m_cmdbuf->dispatch(TILE_SIZE * TILE_SIZE / 128u, 1u, 1u); + + { + IGPUCommandBuffer::SImageMemoryBarrier midBarrier = {}; + midBarrier.oldLayout = IImage::LAYOUT::GENERAL; + midBarrier.newLayout = IImage::LAYOUT::GENERAL; + midBarrier.image = m_destinationImage.get(); + midBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + midBarrier.subresourceRange.baseMipLevel = 0; + midBarrier.subresourceRange.levelCount = 1; + midBarrier.subresourceRange.baseArrayLayer = 0; + midBarrier.subresourceRange.layerCount = 1; + midBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS; + midBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS; + midBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + midBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&midBarrier, 1} }); + } + + m_cmdbuf->bindComputePipeline(m_loadPipeline.get()); + m_cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pipelineLayout.get(), 0, 1, sets); + + SPushConstantData loadPc = { + .deviceBufferAddress = 
m_readbackBuffer->getDeviceAddress(), + .dstOffsetX = 0, + .dstOffsetY = 0, + .srcWidth = TILE_SIZE, + .srcHeight = TILE_SIZE + }; + m_cmdbuf->pushConstants(m_pipelineLayout.get(), hlsl::ShaderStage::ESS_COMPUTE, 0, sizeof(SPushConstantData), &loadPc); + m_cmdbuf->dispatch(TILE_SIZE * TILE_SIZE / 128u, 1u, 1u); + + { + asset::SMemoryBarrier memBarrier = { + .srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, + .dstStageMask = PIPELINE_STAGE_FLAGS::HOST_BIT, + .dstAccessMask = ACCESS_FLAGS::HOST_READ_BIT + }; + m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} }); + } + + //SNAKE VERIFICATION + { + IGPUCommandBuffer::SImageMemoryBarrier snakePreBarrier = {}; + snakePreBarrier.oldLayout = IImage::LAYOUT::GENERAL; + snakePreBarrier.newLayout = IImage::LAYOUT::GENERAL; + snakePreBarrier.image = m_destinationImage.get(); + snakePreBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + snakePreBarrier.subresourceRange.baseMipLevel = 0; + snakePreBarrier.subresourceRange.levelCount = 1; + snakePreBarrier.subresourceRange.baseArrayLayer = 0; + snakePreBarrier.subresourceRange.layerCount = 1; + snakePreBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::SHADER_READ_BITS; + snakePreBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS; + snakePreBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + snakePreBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&snakePreBarrier, 1} }); + } + + m_cmdbuf->bindComputePipeline(m_snakeStorePipeline.get()); + m_cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pipelineLayout.get(), 0, 1, sets); + m_cmdbuf->pushConstants(m_pipelineLayout.get(), hlsl::ShaderStage::ESS_COMPUTE, 0, sizeof(SPushConstantData), &storePc); + m_cmdbuf->dispatch(TILE_SIZE * TILE_SIZE / 128u, 1u, 1u); + + { 
+ IGPUCommandBuffer::SImageMemoryBarrier snakeMidBarrier = {}; + snakeMidBarrier.oldLayout = IImage::LAYOUT::GENERAL; + snakeMidBarrier.newLayout = IImage::LAYOUT::GENERAL; + snakeMidBarrier.image = m_destinationImage.get(); + snakeMidBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + snakeMidBarrier.subresourceRange.baseMipLevel = 0; + snakeMidBarrier.subresourceRange.levelCount = 1; + snakeMidBarrier.subresourceRange.baseArrayLayer = 0; + snakeMidBarrier.subresourceRange.layerCount = 1; + snakeMidBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS; + snakeMidBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS; + snakeMidBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + snakeMidBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&snakeMidBarrier, 1} }); + } + + m_cmdbuf->bindComputePipeline(m_snakeLoadPipeline.get()); + m_cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pipelineLayout.get(), 0, 1, sets); + + SPushConstantData snakeLoadPc = { + .deviceBufferAddress = m_snakeReadbackBuffer->getDeviceAddress(), + .dstOffsetX = 0, + .dstOffsetY = 0, + .srcWidth = TILE_SIZE, + .srcHeight = TILE_SIZE + }; + m_cmdbuf->pushConstants(m_pipelineLayout.get(), hlsl::ShaderStage::ESS_COMPUTE, 0, sizeof(SPushConstantData), &snakeLoadPc); + m_cmdbuf->dispatch(TILE_SIZE * TILE_SIZE / 128u, 1u, 1u); + + { + asset::SMemoryBarrier memBarrier = { + .srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, + .dstStageMask = PIPELINE_STAGE_FLAGS::HOST_BIT, + .dstAccessMask = ACCESS_FLAGS::HOST_READ_BIT + }; + m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} }); + } + + //MORTON VERIFICATION + + { + IGPUCommandBuffer::SImageMemoryBarrier mortonPreBarrier = {}; + mortonPreBarrier.oldLayout = 
IImage::LAYOUT::GENERAL; + mortonPreBarrier.newLayout = IImage::LAYOUT::GENERAL; + mortonPreBarrier.image = m_destinationImage.get(); + mortonPreBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + mortonPreBarrier.subresourceRange.baseMipLevel = 0; + mortonPreBarrier.subresourceRange.levelCount = 1; + mortonPreBarrier.subresourceRange.baseArrayLayer = 0; + mortonPreBarrier.subresourceRange.layerCount = 1; + mortonPreBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::SHADER_READ_BITS; + mortonPreBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS; + mortonPreBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + mortonPreBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&mortonPreBarrier, 1} }); + } + + m_cmdbuf->bindComputePipeline(m_mortonStorePipeline.get()); + m_cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pipelineLayout.get(), 0, 1, sets); + m_cmdbuf->pushConstants(m_pipelineLayout.get(), hlsl::ShaderStage::ESS_COMPUTE, 0, sizeof(SPushConstantData), &storePc); + m_cmdbuf->dispatch(TILE_SIZE * TILE_SIZE / 128u, 1u, 1u); + + { + IGPUCommandBuffer::SImageMemoryBarrier mortonMidBarrier = {}; + mortonMidBarrier.oldLayout = IImage::LAYOUT::GENERAL; + mortonMidBarrier.newLayout = IImage::LAYOUT::GENERAL; + mortonMidBarrier.image = m_destinationImage.get(); + mortonMidBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + mortonMidBarrier.subresourceRange.baseMipLevel = 0; + mortonMidBarrier.subresourceRange.levelCount = 1; + mortonMidBarrier.subresourceRange.baseArrayLayer = 0; + mortonMidBarrier.subresourceRange.layerCount = 1; + mortonMidBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS; + mortonMidBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS; + mortonMidBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + 
mortonMidBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&mortonMidBarrier, 1} }); + } + + m_cmdbuf->bindComputePipeline(m_mortonLoadPipeline.get()); + m_cmdbuf->bindDescriptorSets(asset::EPBP_COMPUTE, m_pipelineLayout.get(), 0, 1, sets); + + SPushConstantData mortonLoadPc = { + .deviceBufferAddress = m_mortonReadbackBuffer->getDeviceAddress(), + .dstOffsetX = 0, + .dstOffsetY = 0, + .srcWidth = TILE_SIZE, + .srcHeight = TILE_SIZE + }; + m_cmdbuf->pushConstants(m_pipelineLayout.get(), hlsl::ShaderStage::ESS_COMPUTE, 0, sizeof(SPushConstantData), &mortonLoadPc); + m_cmdbuf->dispatch(TILE_SIZE * TILE_SIZE / 128u, 1u, 1u); + + { + asset::SMemoryBarrier memBarrier = { + .srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS, + .dstStageMask = PIPELINE_STAGE_FLAGS::HOST_BIT, + .dstAccessMask = ACCESS_FLAGS::HOST_READ_BIT + }; + m_cmdbuf->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .memBarriers = {&memBarrier, 1} }); + } + + m_cmdbuf->end(); + + // Submit and wait + uint64_t semValue = m_frameIndex + 2; // +2 because value 1 was used in init + IQueue::SSubmitInfo submitInfo = {}; + IQueue::SSubmitInfo::SCommandBufferInfo cmdBufInfo = { .cmdbuf = m_cmdbuf.get() }; + submitInfo.commandBuffers = { &cmdBufInfo, 1 }; + + IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { + .semaphore = m_sem.get(), + .value = semValue, + .stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT + }; + submitInfo.signalSemaphores = { &signalInfo, 1 }; + + //RenderDoc capture on first frame + if (m_frameIndex == 0) + m_api->startCapture(); + + m_queue->submit({ &submitInfo, 1 }); + + if (m_frameIndex == 0) + m_api->endCapture(); + + ISemaphore::SWaitInfo waitInfo = { .semaphore = m_sem.get(), .value = semValue }; + m_device->blockForSemaphores({ &waitInfo, 1 }); + + if 
(!m_readbackAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + ILogicalDevice::MappedMemoryRange range(m_readbackAlloc.memory.get(), 0, TILE_SIZE_BYTES); + m_device->invalidateMappedMemoryRanges(1, &range); + } + if (!m_snakeReadbackAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + ILogicalDevice::MappedMemoryRange range(m_snakeReadbackAlloc.memory.get(), 0, TILE_SIZE_BYTES); + m_device->invalidateMappedMemoryRanges(1, &range); + } + if (!m_mortonReadbackAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + ILogicalDevice::MappedMemoryRange range(m_mortonReadbackAlloc.memory.get(), 0, TILE_SIZE_BYTES); + m_device->invalidateMappedMemoryRanges(1, &range); + } + + const uint32_t* srcPixels = static_cast(m_stagingMappedPtr); + const uint32_t* dstPixels = static_cast(m_readbackMappedPtr); + uint32_t totalPixels = TILE_SIZE * TILE_SIZE; + uint32_t matchCount = 0; + uint32_t firstMismatchIdx = ~0u; + + for (uint32_t i = 0; i < totalPixels; i++) + { + if (srcPixels[i] == dstPixels[i]) + matchCount++; + else if (firstMismatchIdx == ~0u) + firstMismatchIdx = i; + } + + if (matchCount == totalPixels) + { + if (m_frameIndex == 0) + m_logger->log("Frame %u: Linear PASS - All %u pixels match.", ILogger::ELL_PERFORMANCE, m_frameIndex, totalPixels); + } + else + { + m_logger->log("Frame %u: Linear FAIL %u / %u pixels matched. 
First mismatch at pixel %u: expected 0x%08X, got 0x%08X", + ILogger::ELL_ERROR, m_frameIndex, matchCount, totalPixels, firstMismatchIdx, srcPixels[firstMismatchIdx], dstPixels[firstMismatchIdx]); + } + + const uint32_t* snakeDstPixels = static_cast(m_snakeReadbackMappedPtr); + uint32_t snakeMatchCount = 0; + uint32_t snakeFirstMismatchIdx = ~0u; + + for (uint32_t i = 0; i < totalPixels; i++) + { + if (srcPixels[i] == snakeDstPixels[i]) + snakeMatchCount++; + else if (snakeFirstMismatchIdx == ~0u) + snakeFirstMismatchIdx = i; + } + + if (snakeMatchCount == totalPixels) + { + if (m_frameIndex == 0) + m_logger->log("Frame %u: Snake PASS All %u pixels match.", ILogger::ELL_PERFORMANCE, m_frameIndex, totalPixels); + } + else + { + m_logger->log("Frame %u: Snake FAIL %u / %u pixels matched. First mismatch at pixel %u: expected 0x%08X, got 0x%08X", + ILogger::ELL_ERROR, m_frameIndex, snakeMatchCount, totalPixels, snakeFirstMismatchIdx, srcPixels[snakeFirstMismatchIdx], snakeDstPixels[snakeFirstMismatchIdx]); + } + + const uint32_t* mortonDstPixels = static_cast(m_mortonReadbackMappedPtr); + uint32_t mortonMatchCount = 0; + uint32_t mortonFirstMismatchIdx = ~0u; + + for (uint32_t i = 0; i < totalPixels; i++) + { + if (srcPixels[i] == mortonDstPixels[i]) + mortonMatchCount++; + else if (mortonFirstMismatchIdx == ~0u) + mortonFirstMismatchIdx = i; + } + + if (mortonMatchCount == totalPixels) + { + if (m_frameIndex == 0) + m_logger->log("Frame %u: Morton PASS All %u pixels match.", ILogger::ELL_PERFORMANCE, m_frameIndex, totalPixels); + } + else + { + m_logger->log("Frame %u: Morton FAIL %u / %u pixels matched. First mismatch at pixel %u: expected 0x%08X, got 0x%08X", + ILogger::ELL_ERROR, m_frameIndex, mortonMatchCount, totalPixels, mortonFirstMismatchIdx, srcPixels[mortonFirstMismatchIdx], mortonDstPixels[mortonFirstMismatchIdx]); + } + + m_frameIndex++; + } + + bool onAppTerminated() override + { + runAllBenchmarks(); + + m_logger->log("\nResults above. 
Waiting 5 seconds before exit...", ILogger::ELL_PERFORMANCE); + std::this_thread::sleep_for(std::chrono::seconds(5)); + + if (m_stagingAlloc.memory) + m_stagingAlloc.memory->unmap(); + if (m_readbackAlloc.memory) + m_readbackAlloc.memory->unmap(); + if (m_snakeReadbackAlloc.memory) + m_snakeReadbackAlloc.memory->unmap(); + if (m_mortonReadbackAlloc.memory) + m_mortonReadbackAlloc.memory->unmap(); + return true; + } protected: core::vector getQueueRequirements() const override @@ -333,6 +691,183 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp } private: + static constexpr uint32_t TILE_SIZE = 128; + static constexpr uint32_t TILE_BYTES_PER_PIXEL = 4; + static constexpr uint32_t TILE_SIZE_BYTES = TILE_SIZE * TILE_SIZE * TILE_BYTES_PER_PIXEL; + static constexpr uint32_t VERIFICATION_LOOP_COUNT = 300; + + struct SPushConstantData + { + uint64_t deviceBufferAddress; + uint32_t dstOffsetX; + uint32_t dstOffsetY; + uint32_t srcWidth; + uint32_t srcHeight; + uint32_t tilesPerRow; + }; + + IQueue* m_queue = nullptr; + smart_refctd_ptr m_destinationImage; + smart_refctd_ptr m_storePipeline; + smart_refctd_ptr m_loadPipeline; + smart_refctd_ptr m_snakeStorePipeline; + smart_refctd_ptr m_snakeLoadPipeline; + smart_refctd_ptr m_mortonStorePipeline; + smart_refctd_ptr m_mortonLoadPipeline; + smart_refctd_ptr m_batchedLinearPipeline; + smart_refctd_ptr m_batchedSnakePipeline; + smart_refctd_ptr m_batchedMortonPipeline; + smart_refctd_ptr m_pipelineLayout; + smart_refctd_ptr m_ds; + smart_refctd_ptr m_stagingBuffer; + smart_refctd_ptr m_readbackBuffer; + smart_refctd_ptr m_snakeReadbackBuffer; + smart_refctd_ptr m_mortonReadbackBuffer; + IDeviceMemoryAllocator::SAllocation m_stagingAlloc; + IDeviceMemoryAllocator::SAllocation m_readbackAlloc; + IDeviceMemoryAllocator::SAllocation m_snakeReadbackAlloc; + IDeviceMemoryAllocator::SAllocation m_mortonReadbackAlloc; + void* m_stagingMappedPtr = nullptr; + void* m_readbackMappedPtr = nullptr; + void* 
m_snakeReadbackMappedPtr = nullptr; + void* m_mortonReadbackMappedPtr = nullptr; + smart_refctd_ptr m_cmdPool; + smart_refctd_ptr m_cmdbuf; + smart_refctd_ptr m_sem; + uint32_t m_frameIndex = 0; + + void runAllBenchmarks() + { + constexpr uint32_t STAGING_BUFFER_SIZE = 64 * 1024 * 1024; + constexpr uint32_t FRAMES_IN_FLIGHT = 4; + constexpr uint32_t TILES_PER_FRAME = STAGING_BUFFER_SIZE / (TILE_SIZE_BYTES * FRAMES_IN_FLIGHT); + constexpr uint32_t TOTAL_FRAMES = 1000; + + uint32_t hostVisibleBits = m_physicalDevice->getHostVisibleMemoryTypeBits(); + uint32_t deviceLocalBits = m_physicalDevice->getDeviceLocalMemoryTypeBits(); + uint32_t hostVisibleOnlyBits = hostVisibleBits & ~deviceLocalBits; + uint32_t hostVisibleDeviceLocalBits = hostVisibleBits & deviceLocalBits; + + m_logger->log("\n=== RUNNING BENCHMARKS ===", ILogger::ELL_PERFORMANCE); + + struct BenchmarkResult + { + const char* name; + double wallGBps; + double gpuGBps; + double memcpyGBps; + }; + std::vector results; + + //SysRAM benchmarks + { + smart_refctd_ptr benchStagingBuffer; + IDeviceMemoryAllocator::SAllocation benchStagingAlloc; + void* benchMappedPtr = nullptr; + uint32_t benchBufSize = STAGING_BUFFER_SIZE; + + if (createStagingBuffer(benchBufSize, hostVisibleOnlyBits, + "Benchmark Staging (SysRAM)", benchStagingBuffer, benchStagingAlloc, benchMappedPtr)) + { + m_logger->log("\n--- CopyBufferToImage (SysRAM) ---", ILogger::ELL_PERFORMANCE); + auto rCopy = runBenchmark("CopyBufferToImage (SysRAM)", + benchStagingBuffer.get(), benchStagingAlloc, benchMappedPtr, + m_destinationImage.get(), TILE_SIZE, TILE_SIZE_BYTES, + TILES_PER_FRAME, FRAMES_IN_FLIGHT, TOTAL_FRAMES, m_queue); + results.push_back({"CopyBufferToImage (SysRAM)", rCopy.wallGBps, rCopy.gpuGBps, rCopy.memcpyGBps}); + + m_logger->log("\n--- Linear Compute (SysRAM) ---", ILogger::ELL_PERFORMANCE); + auto rLinear = runBenchmarkCompute("Linear Compute (SysRAM)", + benchStagingBuffer.get(), benchStagingAlloc, benchMappedPtr, + 
m_destinationImage.get(), m_batchedLinearPipeline.get(), m_pipelineLayout.get(), m_ds.get(), + TILE_SIZE, TILE_SIZE_BYTES, + TILES_PER_FRAME, FRAMES_IN_FLIGHT, TOTAL_FRAMES, m_queue); + results.push_back({"Linear Compute (SysRAM)", rLinear.wallGBps, rLinear.gpuGBps, rLinear.memcpyGBps}); + + m_logger->log("\n--- Snake Compute (SysRAM) ---", ILogger::ELL_PERFORMANCE); + auto rSnake = runBenchmarkCompute("Snake Compute (SysRAM)", + benchStagingBuffer.get(), benchStagingAlloc, benchMappedPtr, + m_destinationImage.get(), m_batchedSnakePipeline.get(), m_pipelineLayout.get(), m_ds.get(), + TILE_SIZE, TILE_SIZE_BYTES, + TILES_PER_FRAME, FRAMES_IN_FLIGHT, TOTAL_FRAMES, m_queue); + results.push_back({"Snake Compute (SysRAM)", rSnake.wallGBps, rSnake.gpuGBps, rSnake.memcpyGBps}); + + m_logger->log("\n--- Morton Compute (SysRAM) ---", ILogger::ELL_PERFORMANCE); + auto rMorton = runBenchmarkCompute("Morton Compute (SysRAM)", + benchStagingBuffer.get(), benchStagingAlloc, benchMappedPtr, + m_destinationImage.get(), m_batchedMortonPipeline.get(), m_pipelineLayout.get(), m_ds.get(), + TILE_SIZE, TILE_SIZE_BYTES, + TILES_PER_FRAME, FRAMES_IN_FLIGHT, TOTAL_FRAMES, m_queue); + results.push_back({"Morton Compute (SysRAM)", rMorton.wallGBps, rMorton.gpuGBps, rMorton.memcpyGBps}); + + benchStagingAlloc.memory->unmap(); + } + } + + //BAR/VRAM benchmarks (if available) + if (hostVisibleDeviceLocalBits) + { + smart_refctd_ptr benchStagingBuffer; + IDeviceMemoryAllocator::SAllocation benchStagingAlloc; + void* benchMappedPtr = nullptr; + uint32_t benchBufSize = STAGING_BUFFER_SIZE; + + if (createStagingBuffer(benchBufSize, hostVisibleDeviceLocalBits, + "Benchmark Staging (BAR/VRAM)", benchStagingBuffer, benchStagingAlloc, benchMappedPtr)) + { + m_logger->log("\n--- CopyBufferToImage (BAR/VRAM) ---", ILogger::ELL_PERFORMANCE); + auto rCopy = runBenchmark("CopyBufferToImage (BAR/VRAM)", + benchStagingBuffer.get(), benchStagingAlloc, benchMappedPtr, + m_destinationImage.get(), TILE_SIZE, 
TILE_SIZE_BYTES, + TILES_PER_FRAME, FRAMES_IN_FLIGHT, TOTAL_FRAMES, m_queue); + results.push_back({"CopyBufferToImage (BAR/VRAM)", rCopy.wallGBps, rCopy.gpuGBps, rCopy.memcpyGBps}); + + m_logger->log("\n--- Linear Compute (BAR/VRAM) ---", ILogger::ELL_PERFORMANCE); + auto rLinear = runBenchmarkCompute("Linear Compute (BAR/VRAM)", + benchStagingBuffer.get(), benchStagingAlloc, benchMappedPtr, + m_destinationImage.get(), m_batchedLinearPipeline.get(), m_pipelineLayout.get(), m_ds.get(), + TILE_SIZE, TILE_SIZE_BYTES, + TILES_PER_FRAME, FRAMES_IN_FLIGHT, TOTAL_FRAMES, m_queue); + results.push_back({"Linear Compute (BAR/VRAM)", rLinear.wallGBps, rLinear.gpuGBps, rLinear.memcpyGBps}); + + m_logger->log("\n--- Snake Compute (BAR/VRAM) ---", ILogger::ELL_PERFORMANCE); + auto rSnake = runBenchmarkCompute("Snake Compute (BAR/VRAM)", + benchStagingBuffer.get(), benchStagingAlloc, benchMappedPtr, + m_destinationImage.get(), m_batchedSnakePipeline.get(), m_pipelineLayout.get(), m_ds.get(), + TILE_SIZE, TILE_SIZE_BYTES, + TILES_PER_FRAME, FRAMES_IN_FLIGHT, TOTAL_FRAMES, m_queue); + results.push_back({"Snake Compute (BAR/VRAM)", rSnake.wallGBps, rSnake.gpuGBps, rSnake.memcpyGBps}); + + m_logger->log("\n--- Morton Compute (BAR/VRAM) ---", ILogger::ELL_PERFORMANCE); + auto rMorton = runBenchmarkCompute("Morton Compute (BAR/VRAM)", + benchStagingBuffer.get(), benchStagingAlloc, benchMappedPtr, + m_destinationImage.get(), m_batchedMortonPipeline.get(), m_pipelineLayout.get(), m_ds.get(), + TILE_SIZE, TILE_SIZE_BYTES, + TILES_PER_FRAME, FRAMES_IN_FLIGHT, TOTAL_FRAMES, m_queue); + results.push_back({"Morton Compute (BAR/VRAM)", rMorton.wallGBps, rMorton.gpuGBps, rMorton.memcpyGBps}); + + benchStagingAlloc.memory->unmap(); + } + } + + //Summary table + m_logger->log("\n=== BENCHMARK RESULTS ===", ILogger::ELL_PERFORMANCE); + m_logger->log("%-36s | Wall GB/s | GPU GB/s | Memcpy GB/s", ILogger::ELL_PERFORMANCE, "Strategy"); + 
m_logger->log("-------------------------------------+-----------+----------+------------", ILogger::ELL_PERFORMANCE); + for (const auto& r : results) + { + m_logger->log("%-36s | %9.2f | %8.2f | %10.2f", ILogger::ELL_PERFORMANCE, r.name, r.wallGBps, r.gpuGBps, r.memcpyGBps); + } + m_logger->log("=====================================+===========+==========+============", ILogger::ELL_PERFORMANCE); + } + + struct BenchResult + { + double wallGBps; + double gpuGBps; + double memcpyGBps; + }; + void generateTileCopyRegions( IImage::SBufferCopy* outRegions, uint32_t tilesPerFrame, @@ -359,7 +894,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp } } - double runBenchmark( + BenchResult runBenchmark( const char* strategyName, IGPUBuffer* stagingBuffer, IDeviceMemoryAllocator::SAllocation& stagingAlloc, @@ -566,14 +1101,16 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp totalSubmitTime += std::chrono::duration(t5 - t4).count(); } + // End marker is after last submit, NOT after GPU finishes. 
+ auto endTime = std::chrono::high_resolution_clock::now(); + ISemaphore::SWaitInfo finalWait = { .semaphore = timelineSemaphore.get(), .value = timelineValue }; m_device->blockForSemaphores({ &finalWait, 1 }); - auto endTime = std::chrono::high_resolution_clock::now(); - + // Read timestamps from the last completed flight of command buffers std::vector timestamps(framesInFlight * 2); const core::bitflag flags = core::bitflag(IQueryPool::RESULTS_FLAGS::_64_BIT) | core::bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT); m_device->getQueryPoolResults(queryPool.get(), 0, framesInFlight * 2, timestamps.data(), sizeof(uint64_t), flags); @@ -586,26 +1123,31 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp float timestampPeriod = m_physicalDevice->getLimits().timestampPeriodInNanoSeconds; double sampledGpuTimeSeconds = (totalGpuTicks * timestampPeriod) / 1e9; + // GPU timestamps only represent the last framesInFlight frames (earlier ones were overwritten) double avgGpuTimePerFrame = sampledGpuTimeSeconds / framesInFlight; double totalGpuTimeSeconds = avgGpuTimePerFrame * totalFrames; double elapsedSeconds = std::chrono::duration(endTime - startTime).count(); uint64_t totalBytes = (uint64_t)totalFrames * tilesPerFrame * tileSizeBytes; + double totalGB = totalBytes / (1024.0 * 1024.0 * 1024.0); - double throughputGBps = (totalBytes / (1024.0 * 1024.0 * 1024.0)) / elapsedSeconds; + double wallThroughputGBps = totalGB / elapsedSeconds; + double gpuThroughputGBps = totalGB / totalGpuTimeSeconds; - m_logger->log(" GPU time: %.3f s", ILogger::ELL_PERFORMANCE, totalGpuTimeSeconds); - m_logger->log(" GPU throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, throughputGBps); + m_logger->log(" GPU time (extrapolated): %.3f s", ILogger::ELL_PERFORMANCE, totalGpuTimeSeconds); + m_logger->log(" CPU submit throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, wallThroughputGBps); + m_logger->log(" GPU only throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, 
gpuThroughputGBps); m_logger->log(" Timing breakdown for %s:", ILogger::ELL_PERFORMANCE, strategyName); m_logger->log(" Wait time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalWaitTime, 100.0 * totalWaitTime / elapsedSeconds); m_logger->log(" Memcpy time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalMemcpyTime, 100.0 * totalMemcpyTime / elapsedSeconds); m_logger->log(" Record time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalRecordTime, 100.0 * totalRecordTime / elapsedSeconds); m_logger->log(" Submit time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalSubmitTime, 100.0 * totalSubmitTime / elapsedSeconds); - m_logger->log(" Memcpy speed: %.2f GB/s", ILogger::ELL_PERFORMANCE, (totalBytes / (1024.0 * 1024.0 * 1024.0)) / totalMemcpyTime); + double memcpyGBps = totalGB / totalMemcpyTime; + m_logger->log(" Memcpy speed: %.2f GB/s", ILogger::ELL_PERFORMANCE, memcpyGBps); - return throughputGBps; + return { wallThroughputGBps, gpuThroughputGBps, memcpyGBps }; } @@ -897,6 +1439,275 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp return throughputGBps; } + BenchResult runBenchmarkCompute( + const char* strategyName, + IGPUBuffer* stagingBuffer, + IDeviceMemoryAllocator::SAllocation& stagingAlloc, + void* mappedPtr, + IGPUImage* destinationImage, + IGPUComputePipeline* pipeline, + IGPUPipelineLayout* pipelineLayout, + IGPUDescriptorSet* ds, + uint32_t tileSize, + uint32_t tileSizeBytes, + uint32_t tilesPerFrame, + uint32_t framesInFlight, + uint32_t totalFrames, + IQueue* queue) + { + smart_refctd_ptr timelineSemaphore = m_device->createSemaphore(0); + + smart_refctd_ptr queryPool; + { + IQueryPool::SCreationParams queryPoolParams = {}; + queryPoolParams.queryType = IQueryPool::TYPE::TIMESTAMP; + queryPoolParams.queryCount = framesInFlight * 2; + queryPoolParams.pipelineStatisticsFlags = IQueryPool::PIPELINE_STATISTICS_FLAGS::NONE; + queryPool = m_device->createQueryPool(queryPoolParams); + } + + std::vector> 
commandPools(framesInFlight); + for (uint32_t i = 0; i < framesInFlight; i++) + { + commandPools[i] = m_device->createCommandPool( + queue->getFamilyIndex(), + IGPUCommandPool::CREATE_FLAGS::RESET_COMMAND_BUFFER_BIT + ); + } + std::vector> commandBuffers(framesInFlight); + for (uint32_t i = 0; i < framesInFlight; i++) + { + commandPools[i]->createCommandBuffers( + IGPUCommandPool::BUFFER_LEVEL::PRIMARY, + 1, + &commandBuffers[i] + ); + } + + uint64_t timelineValue = 0; + + commandBuffers[0]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + { + IGPUCommandBuffer::SImageMemoryBarrier initBarrier = {}; + initBarrier.oldLayout = IImage::LAYOUT::UNDEFINED; + initBarrier.newLayout = IImage::LAYOUT::GENERAL; + initBarrier.image = destinationImage; + initBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + initBarrier.subresourceRange.baseMipLevel = 0; + initBarrier.subresourceRange.levelCount = 1; + initBarrier.subresourceRange.baseArrayLayer = 0; + initBarrier.subresourceRange.layerCount = 1; + initBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::NONE; + initBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS; + initBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::NONE; + initBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + commandBuffers[0]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&initBarrier, 1} }); + } + commandBuffers[0]->end(); + + IQueue::SSubmitInfo submitInfo = {}; + IQueue::SSubmitInfo::SCommandBufferInfo cmdBufInfo = { .cmdbuf = commandBuffers[0].get() }; + submitInfo.commandBuffers = { &cmdBufInfo, 1 }; + + IQueue::SSubmitInfo::SSemaphoreInfo signalInfo = { + .semaphore = timelineSemaphore.get(), + .value = ++timelineValue, + .stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT + }; + submitInfo.signalSemaphores = { &signalInfo, 1 }; + + queue->submit({ &submitInfo, 1 }); + + ISemaphore::SWaitInfo waitInfo = { + .semaphore = 
timelineSemaphore.get(), + .value = timelineValue + }; + m_device->blockForSemaphores({ &waitInfo, 1 }); + + uint32_t imageWidth = destinationImage->getCreationParameters().extent.width; + uint32_t tilesPerRow = imageWidth / tileSize; + uint32_t partitionSize = tilesPerFrame * tileSizeBytes; + + std::vector cpuSourceData(partitionSize); + { + unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); + std::mt19937 g(seed); + uint32_t* data = reinterpret_cast(cpuSourceData.data()); + for (uint32_t i = 0; i < partitionSize / sizeof(uint32_t); i++) + data[i] = g(); + } + + double totalWaitTime = 0.0; + double totalMemcpyTime = 0.0; + double totalRecordTime = 0.0; + double totalSubmitTime = 0.0; + + auto startTime = std::chrono::high_resolution_clock::now(); + + for (uint32_t frame = 0; frame < totalFrames; frame++) + { + uint32_t cmdBufIndex = frame % framesInFlight; + + auto t1 = std::chrono::high_resolution_clock::now(); + if (frame >= framesInFlight) + { + ISemaphore::SWaitInfo frameWaitInfo = { + .semaphore = timelineSemaphore.get(), + .value = timelineValue - framesInFlight + 1 + }; + m_device->blockForSemaphores({ &frameWaitInfo, 1 }); + } + auto t2 = std::chrono::high_resolution_clock::now(); + + commandPools[cmdBufIndex]->reset(); + + uint32_t bufferOffset = cmdBufIndex * partitionSize; + void* targetPtr = static_cast(mappedPtr) + bufferOffset; + memcpy(targetPtr, cpuSourceData.data(), partitionSize); + + if (!stagingAlloc.memory->getMemoryPropertyFlags().hasFlags(IDeviceMemoryAllocation::EMPF_HOST_COHERENT_BIT)) + { + ILogicalDevice::MappedMemoryRange range(stagingAlloc.memory.get(), bufferOffset, partitionSize); + m_device->flushMappedMemoryRanges(1, &range); + } + + auto t3 = std::chrono::high_resolution_clock::now(); + + commandBuffers[cmdBufIndex]->begin(IGPUCommandBuffer::USAGE::ONE_TIME_SUBMIT_BIT); + + uint32_t queryStartIndex = cmdBufIndex * 2; + commandBuffers[cmdBufIndex]->resetQueryPool(queryPool.get(), queryStartIndex, 2); + + 
asset::SMemoryBarrier memBarrier = { + .srcStageMask = PIPELINE_STAGE_FLAGS::HOST_BIT, + .srcAccessMask = ACCESS_FLAGS::HOST_WRITE_BIT, + .dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, + .dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS + }; + + IGPUCommandBuffer::SImageMemoryBarrier dstBarrier = {}; + dstBarrier.oldLayout = IImage::LAYOUT::GENERAL; + dstBarrier.newLayout = IImage::LAYOUT::GENERAL; + dstBarrier.image = destinationImage; + dstBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + dstBarrier.subresourceRange.baseMipLevel = 0; + dstBarrier.subresourceRange.levelCount = 1; + dstBarrier.subresourceRange.baseArrayLayer = 0; + dstBarrier.subresourceRange.layerCount = 1; + dstBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS; + dstBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS; + dstBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + dstBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { + .memBarriers = {&memBarrier, 1}, + .imgBarriers = {&dstBarrier, 1} + }); + + commandBuffers[cmdBufIndex]->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, queryPool.get(), queryStartIndex + 0); + + commandBuffers[cmdBufIndex]->bindComputePipeline(pipeline); + const IGPUDescriptorSet* sets[] = { ds }; + commandBuffers[cmdBufIndex]->bindDescriptorSets(asset::EPBP_COMPUTE, pipelineLayout, 0, 1, sets); + + // Single dispatch covering all tiles at once + SPushConstantData pc = { + .deviceBufferAddress = stagingBuffer->getDeviceAddress() + bufferOffset, + .dstOffsetX = 0, + .dstOffsetY = 0, + .srcWidth = tileSize, + .srcHeight = tileSize, + .tilesPerRow = tilesPerRow + }; + commandBuffers[cmdBufIndex]->pushConstants(pipelineLayout, hlsl::ShaderStage::ESS_COMPUTE, 0, sizeof(SPushConstantData), &pc); + commandBuffers[cmdBufIndex]->dispatch(tilesPerFrame * tileSize * 
tileSize / 128u, 1u, 1u); + + IGPUCommandBuffer::SImageMemoryBarrier afterBarrier = {}; + afterBarrier.oldLayout = IImage::LAYOUT::GENERAL; + afterBarrier.newLayout = IImage::LAYOUT::GENERAL; + afterBarrier.image = destinationImage; + afterBarrier.subresourceRange.aspectMask = IImage::E_ASPECT_FLAGS::EAF_COLOR_BIT; + afterBarrier.subresourceRange.baseMipLevel = 0; + afterBarrier.subresourceRange.levelCount = 1; + afterBarrier.subresourceRange.baseArrayLayer = 0; + afterBarrier.subresourceRange.layerCount = 1; + afterBarrier.barrier.dep.srcAccessMask = ACCESS_FLAGS::SHADER_WRITE_BITS; + afterBarrier.barrier.dep.dstAccessMask = ACCESS_FLAGS::SHADER_READ_BITS; + afterBarrier.barrier.dep.srcStageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT; + afterBarrier.barrier.dep.dstStageMask = PIPELINE_STAGE_FLAGS::FRAGMENT_SHADER_BIT; + commandBuffers[cmdBufIndex]->pipelineBarrier(E_DEPENDENCY_FLAGS::EDF_NONE, { .imgBarriers = {&afterBarrier, 1} }); + + commandBuffers[cmdBufIndex]->writeTimestamp(PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT, queryPool.get(), queryStartIndex + 1); + + commandBuffers[cmdBufIndex]->end(); + auto t4 = std::chrono::high_resolution_clock::now(); + + IQueue::SSubmitInfo frameSubmitInfo = {}; + IQueue::SSubmitInfo::SCommandBufferInfo frameCmdBufInfo = { .cmdbuf = commandBuffers[cmdBufIndex].get() }; + frameSubmitInfo.commandBuffers = { &frameCmdBufInfo, 1 }; + + IQueue::SSubmitInfo::SSemaphoreInfo frameSignalInfo = { + .semaphore = timelineSemaphore.get(), + .value = ++timelineValue, + .stageMask = PIPELINE_STAGE_FLAGS::COMPUTE_SHADER_BIT + }; + frameSubmitInfo.signalSemaphores = { &frameSignalInfo, 1 }; + + queue->submit({ &frameSubmitInfo, 1 }); + auto t5 = std::chrono::high_resolution_clock::now(); + + totalWaitTime += std::chrono::duration(t2 - t1).count(); + totalMemcpyTime += std::chrono::duration(t3 - t2).count(); + totalRecordTime += std::chrono::duration(t4 - t3).count(); + totalSubmitTime += std::chrono::duration(t5 - t4).count(); + } + + // End 
marker is after last submit, NOT after GPU finishes. + auto endTime = std::chrono::high_resolution_clock::now(); + + ISemaphore::SWaitInfo finalWait = { + .semaphore = timelineSemaphore.get(), + .value = timelineValue + }; + m_device->blockForSemaphores({ &finalWait, 1 }); + + // Read timestamps from the last completed flight of command buffers + std::vector timestamps(framesInFlight * 2); + const core::bitflag flags = core::bitflag(IQueryPool::RESULTS_FLAGS::_64_BIT) | core::bitflag(IQueryPool::RESULTS_FLAGS::WAIT_BIT); + m_device->getQueryPoolResults(queryPool.get(), 0, framesInFlight * 2, timestamps.data(), sizeof(uint64_t), flags); + uint64_t totalGpuTicks = 0; + for (uint32_t i = 0; i < framesInFlight; i++) { + uint64_t startTick = timestamps[i * 2 + 0]; + uint64_t endTick = timestamps[i * 2 + 1]; + totalGpuTicks += (endTick - startTick); + } + float timestampPeriod = m_physicalDevice->getLimits().timestampPeriodInNanoSeconds; + double sampledGpuTimeSeconds = (totalGpuTicks * timestampPeriod) / 1e9; + + double avgGpuTimePerFrame = sampledGpuTimeSeconds / framesInFlight; + double totalGpuTimeSeconds = avgGpuTimePerFrame * totalFrames; + + double elapsedSeconds = std::chrono::duration(endTime - startTime).count(); + uint64_t totalBytes = (uint64_t)totalFrames * tilesPerFrame * tileSizeBytes; + double totalGB = totalBytes / (1024.0 * 1024.0 * 1024.0); + + double wallThroughputGBps = totalGB / elapsedSeconds; + double gpuThroughputGBps = totalGB / totalGpuTimeSeconds; + + m_logger->log(" GPU time (extrapolated): %.3f s", ILogger::ELL_PERFORMANCE, totalGpuTimeSeconds); + m_logger->log(" CPU submit throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, wallThroughputGBps); + m_logger->log(" GPU only throughput: %.2f GB/s", ILogger::ELL_PERFORMANCE, gpuThroughputGBps); + + m_logger->log(" Timing breakdown for %s:", ILogger::ELL_PERFORMANCE, strategyName); + m_logger->log(" Wait time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalWaitTime, 100.0 * totalWaitTime / 
elapsedSeconds); + m_logger->log(" Memcpy time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalMemcpyTime, 100.0 * totalMemcpyTime / elapsedSeconds); + m_logger->log(" Record time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalRecordTime, 100.0 * totalRecordTime / elapsedSeconds); + m_logger->log(" Submit time: %.3f s (%.1f%%)", ILogger::ELL_PERFORMANCE, totalSubmitTime, 100.0 * totalSubmitTime / elapsedSeconds); + double memcpyGBps = totalGB / totalMemcpyTime; + m_logger->log(" Memcpy speed: %.2f GB/s", ILogger::ELL_PERFORMANCE, memcpyGBps); + + return { wallThroughputGBps, gpuThroughputGBps, memcpyGBps }; + } + bool createStagingBuffer( uint32_t bufferSize, uint32_t memoryTypeBits, @@ -907,7 +1718,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp { IGPUBuffer::SCreationParams params; params.size = bufferSize; - params.usage = IGPUBuffer::EUF_TRANSFER_SRC_BIT; + params.usage = IGPUBuffer::EUF_TRANSFER_SRC_BIT | IGPUBuffer::EUF_SHADER_DEVICE_ADDRESS_BIT; outBuffer = m_device->createBuffer(std::move(params)); if (!outBuffer) return logFail("Failed to create GPU buffer of size %d!\n", bufferSize); @@ -917,7 +1728,7 @@ class ImageUploadBenchmarkApp final : public application_templates::MonoDeviceAp auto reqs = outBuffer->getMemoryReqs(); reqs.memoryTypeBits &= memoryTypeBits; - outAllocation = m_device->allocate(reqs, outBuffer.get(), IDeviceMemoryAllocation::EMAF_NONE); + outAllocation = m_device->allocate(reqs, outBuffer.get(), IDeviceMemoryAllocation::EMAF_DEVICE_ADDRESS_BIT); if (!outAllocation.isValid()) return logFail("Failed to allocate Device Memory!\n");