diff --git a/CMakeLists.txt b/CMakeLists.txt index 50a16a32..b2ef28f5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,7 +54,10 @@ set (VULKAN_SDK_MIN_MINOR_VERSION 4) set (VULKAN_SDK_MIN_PATCH_VERSION 321) FIND_VULKAN_SDK(${VULKAN_SDK_MIN_MAJOR_VERSION} ${VULKAN_SDK_MIN_MINOR_VERSION} ${VULKAN_SDK_MIN_PATCH_VERSION}) -include(FindShaderc) +option(USE_ENCODER_SHADERC "Enable shaderc GPU compute filters for encoder (e.g. YUV conversion). Only affects the encoder build; the decoder always uses shaderc." ON) +if(BUILD_DECODER OR USE_ENCODER_SHADERC) + include(FindShaderc) +endif() ############ VULKAN_FFMPEG_LIB_PATH ###################################### if (DEFINED ENV{VULKAN_FFMPEG_LIB_DIR_PATH}) diff --git a/cmake/LinuxSettings.cmake b/cmake/LinuxSettings.cmake index a90e96ee..f9f3c727 100644 --- a/cmake/LinuxSettings.cmake +++ b/cmake/LinuxSettings.cmake @@ -84,7 +84,7 @@ endif() # Compiler flags for GCC/Clang if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_C_COMPILER_ID MATCHES "Clang") - set(COMMON_COMPILE_FLAGS "-Wall -Wextra -Wundef -Wno-unused-parameter -Wno-missing-field-initializers -Wshadow") + set(COMMON_COMPILE_FLAGS "-Wall -Wextra -Wundef -Wno-unused-parameter -Wno-missing-field-initializers -Wshadow -Wcast-qual") set(COMMON_COMPILE_FLAGS "${COMMON_COMPILE_FLAGS} -fno-strict-aliasing -fno-builtin-memcmp") # Warning about implicit fallthrough in switch blocks diff --git a/common/include/VkVideoCore/VulkanVideoCapabilities.h b/common/include/VkVideoCore/VulkanVideoCapabilities.h index 8e0caf4f..3c8f572f 100644 --- a/common/include/VkVideoCore/VulkanVideoCapabilities.h +++ b/common/include/VkVideoCore/VulkanVideoCapabilities.h @@ -360,7 +360,7 @@ class VulkanVideoCapabilities } } - formatCount = std::min(supportedFormatCount, formatCount); + formatCount = std::min(supportedFormatCount, formatCount); for (uint32_t i = 0; i < formatCount; i++) { formats[i] = pSupportedFormats[i].format; diff --git a/common/include/mio/mio.hpp b/common/include/mio/mio.hpp index 
5cd55ea8..3c3e1adb 100644 --- a/common/include/mio/mio.hpp +++ b/common/include/mio/mio.hpp @@ -786,13 +786,13 @@ namespace win { /** Returns the 4 upper bytes of an 8-byte integer. */ inline DWORD int64_high(int64_t n) noexcept { - return n >> 32; + return (DWORD)(n >> 32); } /** Returns the 4 lower bytes of an 8-byte integer. */ inline DWORD int64_low(int64_t n) noexcept { - return n & 0xffffffff; + return (DWORD)(n & 0xffffffff); } inline std::wstring s_2_ws(const std::string& s) @@ -887,7 +887,7 @@ inline size_t query_file_size(file_handle_type handle, std::error_code& error) error = detail::last_error(); return 0; } - return static_cast(file_size.QuadPart); + return static_cast(file_size.QuadPart); #else // POSIX struct stat sbuf; if(::fstat(handle, &sbuf) == -1) @@ -933,7 +933,7 @@ inline mmap_context memory_map(const file_handle_type file_handle, const int64_t mode == access_mode::read ? FILE_MAP_READ : FILE_MAP_WRITE, win::int64_high(aligned_offset), win::int64_low(aligned_offset), - length_to_map)); + (size_t)length_to_map)); if(mapping_start == nullptr) { // Close file handle if mapping it failed. diff --git a/common/include/nvidia_utils/vulkan/ycbcr_utils.h b/common/include/nvidia_utils/vulkan/ycbcr_utils.h index 7713c1e7..46f3ed78 100644 --- a/common/include/nvidia_utils/vulkan/ycbcr_utils.h +++ b/common/include/nvidia_utils/vulkan/ycbcr_utils.h @@ -103,6 +103,24 @@ typedef struct YcbcrPlanesLayoutInfo { uint8_t reserved; // reserved for structure alignment. 
} YcbcrPlanesLayoutInfo; +static inline uint32_t GetBitsPerChannel(const YcbcrPlanesLayoutInfo& pYcbcrPlanesLayoutInfo) +{ + switch (pYcbcrPlanesLayoutInfo.bpp) { + case YCBCRA_8BPP: + return 8; + case YCBCRA_10BPP: + return 10; + case YCBCRA_12BPP: + return 12; + case YCBCRA_14BPP: + return 14; + case YCBCRA_16BPP: + return 16; + default: + return 8; + } +} + static inline size_t YcbcrAlign(size_t toAlign, size_t alignment) { return ((toAlign + (alignment - 1)) & ~(alignment -1)); diff --git a/common/libs/VkCodecUtils/FrameProcessor.h b/common/libs/VkCodecUtils/FrameProcessor.h index 8a94f6ab..097a3fa6 100644 --- a/common/libs/VkCodecUtils/FrameProcessor.h +++ b/common/libs/VkCodecUtils/FrameProcessor.h @@ -106,7 +106,7 @@ class FrameProcessor : public VkVideoRefCountBase { FrameProcessor(bool verbose = false) : m_frameCount(0) , m_profileFramesCount(0) - , m_displayTimePeriodMilliseconds(1000) + , m_displayTimePeriodMilliseconds(100) , start_time (std::chrono::steady_clock::now()) , m_verbose(verbose) { diff --git a/common/libs/VkCodecUtils/Helpers.h b/common/libs/VkCodecUtils/Helpers.h index 333548e0..b74e71a3 100644 --- a/common/libs/VkCodecUtils/Helpers.h +++ b/common/libs/VkCodecUtils/Helpers.h @@ -320,7 +320,7 @@ inline VkResult WaitAndGetStatus(const VkInterfaceFunctions* vkIf, VkDevice devi } template -inline VkBaseInStructure* ChainNextVkStruct(NodeType& node, ChainedNodeType& nextChainedNode) { +inline void ChainNextVkStruct(NodeType& node, ChainedNodeType& nextChainedNode) { // make sure the node is of type VkBaseInStructure static_assert(offsetof(NodeType, sType) == offsetof(VkBaseInStructure, sType), "NodeType does not have sType at the same offset as VkBaseInStructure"); @@ -341,16 +341,16 @@ inline VkBaseInStructure* ChainNextVkStruct(NodeType& node, ChainedNodeType& nex "ChainedNodeType must be a standard-layout type"); assert(node.sType > 0); - VkBaseInStructure* pNode = (VkBaseInStructure*)&node; - while (pNode->pNext != nullptr) { - pNode = 
(VkBaseInStructure*)pNode->pNext; - } - pNode->pNext = (VkBaseInStructure*)&nextChainedNode; - // make sure the nextChainedNode is of type VkBaseInStructure - assert(nextChainedNode.sType > 0); - assert(nextChainedNode.pNext == nullptr); - return (VkBaseInStructure*)nextChainedNode.pNext; - } + VkBaseInStructure* pNode = (VkBaseInStructure*)(&node); + VkBaseInStructure* pNextNode = (VkBaseInStructure*)(&nextChainedNode); + + // The incoming object may not have anything chained. + assert(pNextNode->pNext == nullptr); + + // Inserts the incoming object at the beginning of the list. + pNextNode->pNext = pNode->pNext; + pNode->pNext = pNextNode; +} class DeviceUuidUtils { diff --git a/common/libs/VkCodecUtils/VkThreadPool.h b/common/libs/VkCodecUtils/VkThreadPool.h index 44d31bd1..b9d5a508 100644 --- a/common/libs/VkCodecUtils/VkThreadPool.h +++ b/common/libs/VkCodecUtils/VkThreadPool.h @@ -65,8 +65,11 @@ class VkThreadPool std::future res = task->get_future(); { std::unique_lock lock(queue_mutex); - if(stop) + if(stop) { +#ifdef __cpp_exceptions throw std::runtime_error("enqueue on stopped ThreadPool"); +#endif + } tasks.emplace([task](){ (*task)(); }); } diff --git a/common/libs/VkCodecUtils/VkVideoFrameToFile.cpp b/common/libs/VkCodecUtils/VkVideoFrameToFile.cpp index cb71ccb6..6285d2aa 100644 --- a/common/libs/VkCodecUtils/VkVideoFrameToFile.cpp +++ b/common/libs/VkCodecUtils/VkVideoFrameToFile.cpp @@ -23,80 +23,7 @@ #include "VulkanDecodedFrame.h" #include "Helpers.h" #include "VkVideoFrameOutput.h" - -// CRC32 lookup table -static unsigned long Crc32Table[256] = { - 0x00000000,0x77073096,0xee0e612c,0x990951ba, - 0x076dc419,0x706af48f,0xe963a535,0x9e6495a3, - 0x0edb8832,0x79dcb8a4,0xe0d5e91e,0x97d2d988, - 0x09b64c2b,0x7eb17cbd,0xe7b82d07,0x90bf1d91, - 0x1db71064,0x6ab020f2,0xf3b97148,0x84be41de, - 0x1adad47d,0x6ddde4eb,0xf4d4b551,0x83d385c7, - 0x136c9856,0x646ba8c0,0xfd62f97a,0x8a65c9ec, - 0x14015c4f,0x63066cd9,0xfa0f3d63,0x8d080df5, - 
0x3b6e20c8,0x4c69105e,0xd56041e4,0xa2677172, - 0x3c03e4d1,0x4b04d447,0xd20d85fd,0xa50ab56b, - 0x35b5a8fa,0x42b2986c,0xdbbbc9d6,0xacbcf940, - 0x32d86ce3,0x45df5c75,0xdcd60dcf,0xabd13d59, - 0x26d930ac,0x51de003a,0xc8d75180,0xbfd06116, - 0x21b4f4b5,0x56b3c423,0xcfba9599,0xb8bda50f, - 0x2802b89e,0x5f058808,0xc60cd9b2,0xb10be924, - 0x2f6f7c87,0x58684c11,0xc1611dab,0xb6662d3d, - 0x76dc4190,0x01db7106,0x98d220bc,0xefd5102a, - 0x71b18589,0x06b6b51f,0x9fbfe4a5,0xe8b8d433, - 0x7807c9a2,0x0f00f934,0x9609a88e,0xe10e9818, - 0x7f6a0dbb,0x086d3d2d,0x91646c97,0xe6635c01, - 0x6b6b51f4,0x1c6c6162,0x856530d8,0xf262004e, - 0x6c0695ed,0x1b01a57b,0x8208f4c1,0xf50fc457, - 0x65b0d9c6,0x12b7e950,0x8bbeb8ea,0xfcb9887c, - 0x62dd1ddf,0x15da2d49,0x8cd37cf3,0xfbd44c65, - 0x4db26158,0x3ab551ce,0xa3bc0074,0xd4bb30e2, - 0x4adfa541,0x3dd895d7,0xa4d1c46d,0xd3d6f4fb, - 0x4369e96a,0x346ed9fc,0xad678846,0xda60b8d0, - 0x44042d73,0x33031de5,0xaa0a4c5f,0xdd0d7cc9, - 0x5005713c,0x270241aa,0xbe0b1010,0xc90c2086, - 0x5768b525,0x206f85b3,0xb966d409,0xce61e49f, - 0x5edef90e,0x29d9c998,0xb0d09822,0xc7d7a8b4, - 0x59b33d17,0x2eb40d81,0xb7bd5c3b,0xc0ba6cad, - 0xedb88320,0x9abfb3b6,0x03b6e20c,0x74b1d29a, - 0xead54739,0x9dd277af,0x04db2615,0x73dc1683, - 0xe3630b12,0x94643b84,0x0d6d6a3e,0x7a6a5aa8, - 0xe40ecf0b,0x9309ff9d,0x0a00ae27,0x7d079eb1, - 0xf00f9344,0x8708a3d2,0x1e01f268,0x6906c2fe, - 0xf762575d,0x806567cb,0x196c3671,0x6e6b06e7, - 0xfed41b76,0x89d32be0,0x10da7a5a,0x67dd4acc, - 0xf9b9df6f,0x8ebeeff9,0x17b7be43,0x60b08ed5, - 0xd6d6a3e8,0xa1d1937e,0x38d8c2c4,0x4fdff252, - 0xd1bb67f1,0xa6bc5767,0x3fb506dd,0x48b2364b, - 0xd80d2bda,0xaf0a1b4c,0x36034af6,0x41047a60, - 0xdf60efc3,0xa867df55,0x316e8eef,0x4669be79, - 0xcb61b38c,0xbc66831a,0x256fd2a0,0x5268e236, - 0xcc0c7795,0xbb0b4703,0x220216b9,0x5505262f, - 0xc5ba3bbe,0xb2bd0b28,0x2bb45a92,0x5cb36a04, - 0xc2d7ffa7,0xb5d0cf31,0x2cd99e8b,0x5bdeae1d, - 0x9b64c2b0,0xec63f226,0x756aa39c,0x026d930a, - 0x9c0906a9,0xeb0e363f,0x72076785,0x05005713, - 
0x95bf4a82,0xe2b87a14,0x7bb12bae,0x0cb61b38, - 0x92d28e9b,0xe5d5be0d,0x7cdcefb7,0x0bdbdf21, - 0x86d3d2d4,0xf1d4e242,0x68ddb3f8,0x1fda836e, - 0x81be16cd,0xf6b9265b,0x6fb077e1,0x18b74777, - 0x88085ae6,0xff0f6a70,0x66063bca,0x11010b5c, - 0x8f659eff,0xf862ae69,0x616bffd3,0x166ccf45, - 0xa00ae278,0xd70dd2ee,0x4e048354,0x3903b3c2, - 0xa7672661,0xd06016f7,0x4969474d,0x3e6e77db, - 0xaed16a4a,0xd9d65adc,0x40df0b66,0x37d83bf0, - 0xa9bcae53,0xdebb9ec5,0x47b2cf7f,0x30b5ffe9, - 0xbdbdf21c,0xcabac28a,0x53b39330,0x24b4a3a6, - 0xbad03605,0xcdd70693,0x54de5729,0x23d967bf, - 0xb3667a2e,0xc4614ab8,0x5d681b02,0x2a6f2b94, - 0xb40bbe37,0xc30c8ea1,0x5a05df1b,0x2d02ef8d -}; - -static void getCRC(uint32_t *checksum, const uint8_t *inputBytes, size_t length, unsigned long crcTable[]) { - for (size_t i = 0; i < length; i += 1) { - *checksum = crcTable[inputBytes[i] ^ (*checksum & 0xff)] ^ (*checksum >> 8); - } -} +#include "crcgenerator.h" // Rotate right for 16-bit unsigned integers. // Used to normalize MSB-aligned high bit-depth samples (10-bit, 12-bit) to LSB-aligned. 
@@ -240,7 +167,7 @@ class VkVideoFrameToFileImpl : public VkVideoFrameOutput { } if (m_outputcrcPerFrame && m_crcOutputFile) { - fprintf(m_crcOutputFile, "CRC Frame[%" PRId64 "]:", pFrame->displayOrder); + fprintf(m_crcOutputFile, "CRC Frame[%lld]:", (long long)pFrame->displayOrder); for (size_t i = 0; i < m_crcInitValue.size(); i += 1) { uint32_t frameCrc = m_crcInitValue[i]; getCRC(&frameCrc, pOutputBuffer, usedBufferSize, Crc32Table); @@ -265,15 +192,44 @@ class VkVideoFrameToFileImpl : public VkVideoFrameOutput { } } - FILE* AttachFile(const char* fileName) { + bool hasExtension(const char* fileName, const char* extension) { + size_t fileLen = std::strlen(fileName); + size_t extLen = std::strlen(extension); + + if (fileLen < extLen) { + return false; + } + + return std::strcmp(fileName + fileLen - extLen, extension) == 0; + } + + FILE* AttachFile(const char* fileName, bool y4mFormat) { if (m_outputFile) { fclose(m_outputFile); m_outputFile = nullptr; } + std::string fileNameWithModExt; + // Check if the file does not have a y4m extension, + // but y4m format is requested. + if (y4mFormat && !hasExtension(fileName, ".y4m")) { + std::cout << std::endl << "y4m output format is requested, "; + std::cout << "but the output file's (" << fileName << ") extension isn't .y4m!" + << std::endl; + fileNameWithModExt = fileName + std::string(".y4m"); + fileName = fileNameWithModExt.c_str(); + } else if ((y4mFormat == false) && !hasExtension(fileName, ".yuv")) { + std::cout << std::endl << "Raw yuv output format is requested, "; + std::cout << "but the output file's (" << fileName << ") extension isn't .yuv!" 
+ << std::endl; + fileNameWithModExt = fileName + std::string(".yuv"); + fileName = fileNameWithModExt.c_str(); + } + if (fileName != nullptr) { m_outputFile = fopen(fileName, "wb"); if (m_outputFile) { + std::cout << "Output file name is: " << fileName << std::endl; return m_outputFile; } } @@ -386,6 +342,7 @@ class VkVideoFrameToFileImpl : public VkVideoFrameOutput { VkDeviceSize maxSize = 0; const uint8_t* readImagePtr = srcImageDeviceMemory->GetReadOnlyDataPtr(imageOffset, maxSize); assert(readImagePtr != nullptr); + assert(maxSize <= SIZE_MAX); // Ensure we don't lose data in conversion int32_t secondaryPlaneWidth = frameWidth; int32_t secondaryPlaneHeight = frameHeight; @@ -461,14 +418,18 @@ class VkVideoFrameToFileImpl : public VkVideoFrameOutput { // Copy the luma plane const uint32_t numCompatiblePlanes = 1; for (uint32_t plane = 0; plane < numCompatiblePlanes; plane++) { - const uint8_t* pSrc = readImagePtr + layouts[plane].offset; - uint8_t* pDst = pOutBuffer + yuvPlaneLayouts[plane].offset; + const uint8_t* pSrc = readImagePtr + static_cast(layouts[plane].offset); + uint8_t* pDst = pOutBuffer + static_cast(yuvPlaneLayouts[plane].offset); if (is8Bit) { - CopyPlaneData(pSrc, pDst, layouts[plane].rowPitch, yuvPlaneLayouts[plane].rowPitch, + assert(layouts[plane].rowPitch <= SIZE_MAX); + assert(yuvPlaneLayouts[plane].rowPitch <= SIZE_MAX); + CopyPlaneData(pSrc, pDst, static_cast(layouts[plane].rowPitch), static_cast(yuvPlaneLayouts[plane].rowPitch), frameWidth, imageHeight); } else { - CopyPlaneData(pSrc, pDst, layouts[plane].rowPitch, yuvPlaneLayouts[plane].rowPitch, + assert(layouts[plane].rowPitch <= SIZE_MAX); + assert(yuvPlaneLayouts[plane].rowPitch <= SIZE_MAX); + CopyPlaneData(pSrc, pDst, static_cast(layouts[plane].rowPitch), static_cast(yuvPlaneLayouts[plane].rowPitch), frameWidth, imageHeight, 1, bitShift); } } @@ -488,10 +449,14 @@ class VkVideoFrameToFileImpl : public VkVideoFrameOutput { } if (is8Bit) { - CopyPlaneData(pSrc, pDst, 
layouts[srcPlane].rowPitch, yuvPlaneLayouts[plane].rowPitch, + assert(layouts[srcPlane].rowPitch <= SIZE_MAX); + assert(yuvPlaneLayouts[plane].rowPitch <= SIZE_MAX); + CopyPlaneData(pSrc, pDst, static_cast(layouts[srcPlane].rowPitch), static_cast(yuvPlaneLayouts[plane].rowPitch), planeWidth, 1, 2); } else { - CopyPlaneData(pSrc, pDst, layouts[srcPlane].rowPitch, yuvPlaneLayouts[plane].rowPitch, + assert(layouts[srcPlane].rowPitch <= SIZE_MAX); + assert(yuvPlaneLayouts[plane].rowPitch <= SIZE_MAX); + CopyPlaneData(pSrc, pDst, static_cast(layouts[srcPlane].rowPitch), static_cast(yuvPlaneLayouts[plane].rowPitch), planeWidth, 1, 2, bitShift); } pDst += yuvPlaneLayouts[plane].rowPitch; @@ -499,10 +464,10 @@ class VkVideoFrameToFileImpl : public VkVideoFrameOutput { } // Calculate total buffer size - outputBufferSize = yuvPlaneLayouts[0].rowPitch * imageHeight; + outputBufferSize = static_cast(yuvPlaneLayouts[0].rowPitch * imageHeight); if (mpInfo->planesLayout.numberOfExtraPlanes >= 1) { - outputBufferSize += yuvPlaneLayouts[1].rowPitch * secondaryPlaneHeight; - outputBufferSize += yuvPlaneLayouts[2].rowPitch * secondaryPlaneHeight; + outputBufferSize += static_cast(yuvPlaneLayouts[1].rowPitch * secondaryPlaneHeight); + outputBufferSize += static_cast(yuvPlaneLayouts[2].rowPitch * secondaryPlaneHeight); } return outputBufferSize; @@ -516,6 +481,7 @@ class VkVideoFrameToFileImpl : public VkVideoFrameOutput { } VkDeviceSize imageMemorySize = imageResource->GetImageDeviceMemorySize(); + assert(imageMemorySize <= SIZE_MAX); // Ensure we don't lose data in conversion if ((m_pLinearMemory == nullptr) || (imageMemorySize > m_allocationSize)) { if (m_outputFile) { @@ -527,7 +493,7 @@ class VkVideoFrameToFileImpl : public VkVideoFrameOutput { m_pLinearMemory = nullptr; } - m_allocationSize = (size_t)(imageMemorySize); + m_allocationSize = static_cast(imageMemorySize); m_pLinearMemory = new uint8_t[m_allocationSize]; if (m_pLinearMemory == nullptr) { return nullptr; @@ -568,7 
+534,7 @@ VkResult VkVideoFrameOutput::Create(const char* fileName, return VK_ERROR_OUT_OF_HOST_MEMORY; } - FILE* outFile = newFrameToFile->AttachFile(fileName); + FILE* outFile = newFrameToFile->AttachFile(fileName, outputy4m); if ((fileName != nullptr) && (outFile == nullptr)) { delete newFrameToFile; return VK_ERROR_INITIALIZATION_FAILED; diff --git a/common/libs/VkCodecUtils/VulkanFilterYuvCompute.cpp b/common/libs/VkCodecUtils/VulkanFilterYuvCompute.cpp index dd67b2b5..dcea5b41 100644 --- a/common/libs/VkCodecUtils/VulkanFilterYuvCompute.cpp +++ b/common/libs/VkCodecUtils/VulkanFilterYuvCompute.cpp @@ -17,7 +17,7 @@ #include "VulkanFilterYuvCompute.h" #include "nvidia_utils/vulkan/ycbcrvkinfo.h" -static bool dumpShaders = false; +static bool dumpShaders = true; VkResult VulkanFilterYuvCompute::Create(const VulkanDeviceContext* vkDevCtx, uint32_t queueFamilyIndex, @@ -26,6 +26,8 @@ VkResult VulkanFilterYuvCompute::Create(const VulkanDeviceContext* vkDevCtx, uint32_t maxNumFrames, VkFormat inputFormat, VkFormat outputFormat, + bool inputEnableMsbToLsbShift, + bool outputEnableLsbToMsbShift, const VkSamplerYcbcrConversionCreateInfo* pYcbcrConversionCreateInfo, const YcbcrPrimariesConstants* pYcbcrPrimariesConstants, const VkSamplerCreateInfo* pSamplerCreateInfo, @@ -39,6 +41,8 @@ VkResult VulkanFilterYuvCompute::Create(const VulkanDeviceContext* vkDevCtx, maxNumFrames, inputFormat, outputFormat, + inputEnableMsbToLsbShift, + outputEnableLsbToMsbShift, pYcbcrPrimariesConstants)); if (!yCbCrVulkanFilter) { @@ -116,34 +120,58 @@ VkResult VulkanFilterYuvCompute::Init(const VkSamplerYcbcrConversionCreateInfo* VkResult VulkanFilterYuvCompute::InitDescriptorSetLayout(uint32_t maxNumFrames) { + VkSampler ccSampler = m_samplerYcbcrConversion.GetSampler(); - assert(ccSampler != VK_NULL_HANDLE); - VkDescriptorType type = (ccSampler != VK_NULL_HANDLE) ? 
VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER : VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; + VkDescriptorType type = (ccSampler != VK_NULL_HANDLE) ? VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER : + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; const VkSampler* pImmutableSamplers = (ccSampler != VK_NULL_HANDLE) ? &ccSampler : nullptr; - const std::vector setLayoutBindings{ - // binding, descriptorType, descriptorCount, stageFlags, pImmutableSamplers; + std::vector setLayoutBindings; + + // Input bindings (either images or buffers) + if (m_inputIsBuffer) { + // Binding 0: Input buffer (read-only) for single buffer case + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); + // Binding 1: Input buffer (read-only) Y plane + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); + // Binding 2: Input buffer (read-only) Cb or CbCr plane + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 2, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); + // Binding 3: Input buffer (read-only) Cr plane + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 3, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); + } else { // Binding 0: Input image (read-only) RGBA or RGBA YCbCr sampler sampled - VkDescriptorSetLayoutBinding{ 0, type, 1, VK_SHADER_STAGE_COMPUTE_BIT, pImmutableSamplers}, + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 0, type, 1, VK_SHADER_STAGE_COMPUTE_BIT, pImmutableSamplers}); // Binding 1: Input image (read-only) Y plane of YCbCr Image - VkDescriptorSetLayoutBinding{ 1, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}, + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 1, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); // Binding 2: Input image (read-only) Cb or CbCr plane - 
VkDescriptorSetLayoutBinding{ 2, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}, + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 2, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); // Binding 3: Input image (read-only) Cr plane - VkDescriptorSetLayoutBinding{ 3, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}, + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 3, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); + } + // Output bindings (either images or buffers) + if (m_outputIsBuffer) { + // Binding 4: Output buffer (write) for single buffer case + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 4, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); + // Binding 5: Output buffer (write) Y plane + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 5, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); + // Binding 6: Output buffer (write) CbCr plane of 2-plane or Cb of 3-plane + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 6, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); + // Binding 7: Output buffer (write) Cr plane of 3-plane + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 7, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); + } else { // Binding 4: Output image (write) RGBA or YCbCr single-plane image - VkDescriptorSetLayoutBinding{ 4, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}, + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 4, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); // Binding 5: Output image (write) Y plane of YCbCr Image - VkDescriptorSetLayoutBinding{ 5, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}, + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 5, 
VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); // Binding 6: Output image (write) CbCr plane of 2-plane or Cb of 3-plane YCbCr Image - VkDescriptorSetLayoutBinding{ 6, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}, + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 6, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); // Binding 7: Output image (write) Cr plane of 3-pane YCbCr Image - VkDescriptorSetLayoutBinding{ 7, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}, + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 7, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); + } - // Binding 8: uniform buffer for input parameters. - VkDescriptorSetLayoutBinding{ 8, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}, - }; + // Binding 8: uniform buffer for input parameters. + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 8, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); VkPushConstantRange pushConstantRange = {}; pushConstantRange.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; // Stage the push constant is for @@ -175,20 +203,74 @@ static YcbcrBtStandard GetYcbcrPrimariesConstantsId(VkSamplerYcbcrModelConversio return YcbcrBtStandardUnknown; } +// Generate a unified push constants declaration for shaders +/** + * @brief Generates GLSL code for push constants declaration used in compute shaders + * + * This function creates a standard push constants block with fields for: + * - Source and destination image layers + * - Input and output dimensions + * - Buffer offsets and pitches for Y, Cb, and Cr planes + * + * @param shaderStr Output stringstream where the GLSL code will be written + */ +static void GenPushConstantsDecl(std::stringstream& shaderStr) { + shaderStr << "layout(push_constant) uniform PushConstants {\n" + << " uint srcLayer; // src image layer to 
use\n" + << " uint dstLayer; // dst image layer to use\n" + << " uint inputWidth; // input image or buffer width\n" + << " uint inputHeight; // input image or buffer height\n" + << " uint outputWidth; // output image or buffer width\n" + << " uint outputHeight; // output image or buffer height\n" + << " uint inYOffset; // input buffer Y plane offset\n" + << " uint inCbOffset; // input buffer Cb plane offset\n" + << " uint inCrOffset; // input buffer Cr plane offset\n" + << " uint inYPitch; // input buffer Y plane pitch\n" + << " uint inCbPitch; // input buffer Cb plane pitch\n" + << " uint inCrPitch; // input buffer Cr plane pitch\n" + << " uint outYOffset; // output buffer Y plane offset\n" + << " uint outCbOffset; // output buffer Cb plane offset\n" + << " uint outCrOffset; // output buffer Cr plane offset\n" + << " uint outYPitch; // output buffer Y plane pitch\n" + << " uint outCbPitch; // output buffer Cb plane pitch\n" + << " uint outCrPitch; // output buffer Cr plane pitch\n" + << "} pushConstants;\n"; +} + +// Updated header function with unified push constants +/** + * @brief Generates the shader header with version declaration and push constants + * + * Creates the beginning of a GLSL compute shader with: + * - GLSL version declaration (#version 450) + * - Push constants structure + * - Local work group size (16x16) + * + * @param shaderStr Output stringstream where the GLSL code will be written + */ static void GenHeaderAndPushConst(std::stringstream& shaderStr) { - shaderStr << "#version 450\n" - "layout(push_constant) uniform PushConstants {\n" - " uint srcImageLayer; // Source image layer index\n" - " uint dstImageLayer; // Destination image layer index\n" - " ivec2 inputSize; // Original input image size (width, height)\n" - " ivec2 outputSize; // Output image size (width, height, with padding)\n" - "} pushConstants;\n" - "\n" - "layout (local_size_x = 16, local_size_y = 16) in;\n" - "\n"; + shaderStr << "#version 450\n"; + 
GenPushConstantsDecl(shaderStr); + shaderStr << "\n" + << "layout (local_size_x = 16, local_size_y = 16) in;\n" + << "\n"; } +/** + * @brief Generates GLSL code for image binding layout declarations + * + * Creates the binding declaration for an image resource in the shader. + * + * @param shaderStr Output stringstream where the GLSL code will be written + * @param imageName Base name for the image variable + * @param imageSubName Suffix name for the image variable (e.g., "Y", "CbCr") + * @param imageFormat Format string for the image (e.g., "rgba8") + * @param isInput Whether this is an input (readonly) or output (writeonly) image + * @param binding Binding point in the descriptor set + * @param set Descriptor set number + * @param imageArray Whether the image should be declared as image2DArray instead of image2D + */ static void GenImageIoBindingLayout(std::stringstream& shaderStr, const char *imageName, const char *imageSubName, @@ -206,22 +288,249 @@ static void GenImageIoBindingLayout(std::stringstream& shaderStr, } +/** + * @brief Generates GLSL code for handling global invocation position and bounds checking + * + * Creates code to: + * - Get the current pixel position from gl_GlobalInvocationID + * - Check if the position is within output image bounds + * - Return early if out of bounds to prevent invalid memory access + * + * @param shaderStr Output stringstream where the GLSL code will be written + */ static void GenHandleImagePosition(std::stringstream& shaderStr) { shaderStr << " ivec2 pos = ivec2(gl_GlobalInvocationID.xy);\n" " // Check for out-of-bounds writes\n" - " if ((pos.x >= pushConstants.outputSize.x) || (pos.y >= pushConstants.outputSize.y)) {\n" + " if ((pos.x >= pushConstants.outputWidth) || (pos.y >= pushConstants.outputHeight)) {\n" + " return;\n" + " }\n" + "\n"; +} + +/** + * @brief Generates GLSL code for buffer binding layout declarations + * + * Creates the binding declaration for a buffer resource in the shader. 
+ * + * @param shaderStr Output stringstream where the GLSL code will be written + * @param bufferName Base name for the buffer variable + * @param bufferSubName Suffix name for the buffer variable (e.g., "Y", "CbCr") + * @param bufferDataType Data type of buffer elements (e.g., "uint8_t", "uint16_t") + * @param bufferType Vulkan descriptor type (Storage buffer, uniform texel buffer, etc.) + * @param isInput Whether this is an input (readonly) or output (writeonly) buffer + * @param binding Binding point in the descriptor set + * @param set Descriptor set number + */ +static void GenBufferIoBindingLayout(std::stringstream& shaderStr, + const char *bufferName, + const char *bufferSubName, + const char *bufferDataType, + VkDescriptorType bufferType, + bool isInput, + uint32_t binding, + uint32_t set) { + + const char* readonlyModifier = isInput ? " readonly" : ""; + const char* writeonlyModifier = isInput ? "" : " writeonly"; + + switch (bufferType) { + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + shaderStr << "layout (set = " << set << ", binding = " << binding << ") uniform" + << " samplerBuffer " + << bufferName << bufferSubName + << ";\n"; + break; + + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + shaderStr << "layout (set = " << set << ", binding = " << binding << ") uniform" + << readonlyModifier << writeonlyModifier + << " imageBuffer " + << bufferName << bufferSubName + << ";\n"; + break; + + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + shaderStr << "layout (set = " << set << ", binding = " << binding << ") buffer" + << readonlyModifier << writeonlyModifier + << " " << bufferName << bufferSubName << "Buffer" + << " {\n" + << " " << bufferDataType << "[] data;\n" + << "} " << bufferName << bufferSubName << ";\n"; + break; + + default: + // Unsupported buffer type + break; + } +} + +/** + * @brief Generates GLSL code for determining if a position has chroma information + * + * Creates a condition that checks 
if the current pixel position contains + * chroma information based on the subsampling ratios. For example, in 4:2:0 + * subsampling, only pixels at even x and y coordinates have chroma samples. + * + * @param shaderStr Output stringstream where the GLSL code will be written + * @param chromaHorzRatio Horizontal subsampling ratio (1 for 4:4:4, 2 for 4:2:2/4:2:0) + * @param chromaVertRatio Vertical subsampling ratio (1 for 4:4:4/4:2:2, 2 for 4:2:0) + * @param useCondition Whether to output as a full if-condition (true) or just the condition expression (false) + * @param pixelPosName Name of the pixel position variable in the shader (default: "srcPos") + * @param setProcessChromaBool Name of the boolean variable to set (default: "processChromaBool") + */ +static void GenHandleChromaPosition(std::stringstream& shaderStr, + uint32_t chromaHorzRatio, + uint32_t chromaVertRatio, + bool useCondition = true, + const char* pixelPosName = "srcPos", + const char* setProcessChromaBool = "processChromaBool") +{ + // Skip this for 4:4:4 since all pixels have chroma + if (chromaHorzRatio <= 1 && chromaVertRatio <= 1) { + if (useCondition) { + // For 4:4:4, no subsampling check needed - process all pixels + shaderStr << " bool " << setProcessChromaBool << " = true;\n"; + } else { + shaderStr << "true"; + } + return; + } + + // Build condition for chroma sampling + std::stringstream condition; + if (chromaHorzRatio > 1) + condition << "(" << pixelPosName << ".x % " << chromaHorzRatio << " == 0)"; + + if (chromaHorzRatio > 1 && chromaVertRatio > 1) + condition << " && "; + + if (chromaVertRatio > 1) + condition << "(" << pixelPosName << ".y % " << chromaVertRatio << " == 0)"; + + if (useCondition) { + shaderStr << " bool " << setProcessChromaBool << " = " << condition.str() << ";\n"; + } else { + shaderStr << condition.str(); + } +} + +/** + * @brief Generates GLSL code for calculating subsampled chroma positions + * + * Creates code to compute the chroma position from a pixel 
position + * based on the subsampling ratios. For example, in 4:2:0 subsampling, + * the chroma position is calculated by dividing both x and y by 2. + * + * @param shaderStr Output stringstream where the GLSL code will be written + * @param chromaHorzRatio Horizontal subsampling ratio (1 for 4:4:4, 2 for 4:2:2/4:2:0) + * @param chromaVertRatio Vertical subsampling ratio (1 for 4:4:4/4:2:2, 2 for 4:2:0) + * @param srcPosName Name of the source position variable (default: "srcPos") + * @param dstPosName Name of the destination position variable (default: "chromaSrcPos") + * @param indent Number of spaces to indent the output code (default: 8) + * @param generateIfBlock Whether to generate an if-block or just assignment statements (default: false) + */ +static void GenCalculateChromaPosition(std::stringstream& shaderStr, + uint32_t chromaHorzRatio, + uint32_t chromaVertRatio, + const char* srcPosName = "srcPos", + const char* dstPosName = "chromaSrcPos", + int indent = 8, + bool generateIfBlock = false) +{ + std::string indentStr(indent, ' '); + + // For 4:4:4, no subsampling needed + if (chromaHorzRatio <= 1 && chromaVertRatio <= 1) { + shaderStr << indentStr << "// No subsampling for 4:4:4 format, use original position\n"; + if (generateIfBlock) { + shaderStr << indentStr << "// " << dstPosName << " already equals " << srcPosName << "\n"; + } else { + shaderStr << indentStr << dstPosName << " = " << srcPosName << ";\n"; + } + return; + } + + shaderStr << indentStr << "// Calculate subsampled positions based on format's subsampling\n"; + + if (generateIfBlock) { + // Generate an if-block for conditional calculation + shaderStr << indentStr << dstPosName << " = " << srcPosName << ";\n"; + shaderStr << indentStr << "if (processChroma) {\n"; + + if (chromaHorzRatio > 1) { + shaderStr << indentStr << " " << dstPosName << ".x = " << srcPosName << ".x / " << chromaHorzRatio << ";\n"; + } + + if (chromaVertRatio > 1) { + shaderStr << indentStr << " " << dstPosName << ".y = 
" << srcPosName << ".y / " << chromaVertRatio << ";\n"; + } + + shaderStr << indentStr << "}\n"; + } else { + // Generate direct assignment statements + shaderStr << indentStr << dstPosName << " = ivec2("; + + if (chromaHorzRatio > 1) + shaderStr << srcPosName << ".x / " << chromaHorzRatio; + else + shaderStr << srcPosName << ".x"; + + shaderStr << ", "; + + if (chromaVertRatio > 1) + shaderStr << srcPosName << ".y / " << chromaVertRatio; + else + shaderStr << srcPosName << ".y"; + + shaderStr << ");\n"; + } +} + +/** + * @brief Generates GLSL code for handling buffer position calculations with chroma subsampling + * + * Creates code to: + * - Get the current pixel position from gl_GlobalInvocationID + * - Check if the position is within output bounds + * - Calculate appropriate buffer indices based on subsampling ratios + * + * @param shaderStr Output stringstream where the GLSL code will be written + * @param chromaHorzRatio Horizontal subsampling ratio (default: 2 for 4:2:0/4:2:2) + * @param chromaVertRatio Vertical subsampling ratio (default: 2 for 4:2:0) + */ +static void GenHandleBufferPosition(std::stringstream& shaderStr, int chromaHorzRatio = 2, int chromaVertRatio = 2) +{ + shaderStr << + " ivec2 pos = ivec2(gl_GlobalInvocationID.xy);\n" + " // Check for out-of-bounds writes\n" + " if ((pos.x >= pushConstants.outputWidth) || (pos.y >= pushConstants.outputHeight)) {\n" " return;\n" " }\n" + " \n" + " // Calculate buffer indices based on position and strides\n" + " uint yIndex = pushConstants.inYOffset + pos.y * pushConstants.inYPitch + pos.x;\n" + " uint cbIndex = pushConstants.inCbOffset + (pos.y / " << chromaVertRatio << ") * pushConstants.inCbPitch + (pos.x / " << chromaHorzRatio << ");\n" + " uint crIndex = pushConstants.inCrOffset + (pos.y / " << chromaVertRatio << ") * pushConstants.inCrPitch + (pos.x / " << chromaHorzRatio << ");\n" "\n"; } +/** + * @brief Generates GLSL code for handling source position with optional replication + * + * Creates 
code to calculate source position, with optional boundary handling + * by replicating edge pixels when coordinates exceed input dimensions. + * + * @param shaderStr Output stringstream where the GLSL code will be written + * @param enableReplicate Whether to enable edge replication (clamp to edge) + */ static void GenHandleSourcePositionWithReplicate(std::stringstream& shaderStr, bool enableReplicate) { if (enableReplicate) { shaderStr << - " ivec2 srcPos = min(pos, pushConstants.inputSize );\n" + " ivec2 srcPos = min(pos, ivec2(pushConstants.inputWidth, pushConstants.inputHeight));\n" "\n"; } else { shaderStr << @@ -230,15 +539,622 @@ static void GenHandleSourcePositionWithReplicate(std::stringstream& shaderStr, b } } -void VulkanFilterYuvCompute::ShaderGeneratePlaneDescriptors(std::stringstream& shaderStr, - VkImageAspectFlags& imageAspects, - const char *imageName, - VkFormat imageFormat, - bool isInput, - uint32_t startBinding, - uint32_t set, - bool imageArray) +/** + * @brief Generates GLSL function for fetching Y samples from a buffer + * + * Creates a helper function that reads Y samples from a buffer and + * normalizes values to 0.0-1.0 range, handling different bit depths. 
+ * + * @param shaderStr Output stringstream where the GLSL code will be written + * @param isHighBitDepth Whether the Y data is high bit depth (>8 bits) + * @param bitDepth The bit depth of Y samples (8, 10, 12, or 16) + */ +static void GenFetchYFromBufferFunc(std::stringstream& shaderStr, + bool isHighBitDepth, uint32_t bitDepth) { + shaderStr << "// Function to fetch Y component from buffer\n" + << "float fetchYFromBuffer(uint index) {\n"; + + if (isHighBitDepth) { + shaderStr << " uint16_t rawValue = inputBufferY.data[index];\n" + << " return extractHighBitDepth(rawValue);\n"; + } else { + shaderStr << " uint8_t byteValue = inputBufferY.data[index];\n" + << " return float(byteValue) / 255.0;\n"; + } + + shaderStr << "}\n\n"; +} + +/** + * @brief Generates GLSL functions for fetching Cb and Cr samples from buffers + * + * Creates helper functions to read Cb and Cr chroma samples from buffers and + * normalize values to 0.0-1.0 range, handling different bit depths. + * + * @param shaderStr Output stringstream where the GLSL code will be written + * @param isHighBitDepth Whether the chroma data is high bit depth (>8 bits) + * @param bitDepth The bit depth of chroma samples (8, 10, 12, or 16) + */ +static void GenFetchCbCrFromBufferFunc(std::stringstream& shaderStr, + bool isHighBitDepth, uint32_t bitDepth) { + // Cb fetch function + shaderStr << "// Function to fetch Cb component from buffer\n" + << "float fetchCbFromBuffer(uint index) {\n"; + + if (isHighBitDepth) { + shaderStr << " uint16_t rawValue = inputBufferCb.data[index];\n" + << " return extractHighBitDepth(rawValue);\n"; + } else { + shaderStr << " uint8_t byteValue = inputBufferCb.data[index];\n" + << " return float(byteValue) / 255.0;\n"; + } + + shaderStr << "}\n\n"; + + // Cr fetch function + shaderStr << "// Function to fetch Cr component from buffer\n" + << "float fetchCrFromBuffer(uint index) {\n"; + + if (isHighBitDepth) { + shaderStr << " uint16_t rawValue = inputBufferCr.data[index];\n" + << " 
return extractHighBitDepth(rawValue);\n"; + } else { + shaderStr << " uint8_t byteValue = inputBufferCr.data[index];\n" + << " return float(byteValue) / 255.0;\n"; + } + + shaderStr << "}\n\n"; +} + +/** + * @brief Generates GLSL function for extracting and normalizing high bit-depth values + * + * Creates a helper function to extract and normalize values from high bit-depth + * formats (10, 12, or 16 bits), handling MSB or LSB aligned data. + * + * @param shaderStr Output stringstream where the GLSL code will be written + * @param isMSB Whether the high bits are MSB-aligned (true) or LSB-aligned (false) + * @param bitDepth The bit depth of the samples (10, 12, or 16) + */ +static void GenExtractHighBitDepthFunc(std::stringstream& shaderStr, + bool isMSB, uint32_t bitDepth) +{ + shaderStr << "// Helper function to extract and normalize high bit-depth values\n"; + + if (isMSB) { + // For MSB-aligned data + shaderStr << "float extractHighBitDepth(uint value) {\n" + << " // For MSB-aligned " << bitDepth << "-bit data, shift right to extract the bits\n" + << " uint extractedValue = value >> (16u - " << bitDepth << "u);\n" + << " // Normalize to 0.0-1.0 range\n" + << " return float(extractedValue) / " << ((1 << bitDepth) - 1) << ".0;\n" + << "}\n\n"; + } else { + // For LSB-aligned data + shaderStr << "float extractHighBitDepth(uint value) {\n" + << " // For LSB-aligned " << bitDepth << "-bit data, mask to extract the bits\n" + << " uint extractedValue = value & " << ((1 << bitDepth) - 1) << "u;\n" + << " // Normalize to 0.0-1.0 range\n" + << " return float(extractedValue) / " << ((1 << bitDepth) - 1) << ".0;\n" + << "}\n\n"; + } +} + +/** + * @brief Generates GLSL code for applying MSB-to-LSB bit shifting for high bit-depth content + * + * Creates code to convert MSB-aligned high bit-depth content to normalized values: + * - For images (floating point): Divide by the appropriate factor + * - For buffers (integer): Perform right bit shift operations + * + * @param 
shaderStr Output stringstream where the GLSL code will be written + * @param isInputBuffer Whether the input is a buffer (true) or image (false) + * @param inputBitDepth The bit depth of the input data (8, 10, 12, or 16) + * @param imageAspects Image aspect flags indicating which planes are being processed + */ +static void GenApplyMsbToLsbShift(std::stringstream& shaderStr, + bool isInputBuffer, + uint32_t inputBitDepth, + VkImageAspectFlags imageAspects) +{ + // Only apply for high bit-depth formats (10/12-bit) + if ((inputBitDepth != 10) && (inputBitDepth != 12)) { + return; + } + + // Calculate shift amount based on bit depth + uint32_t shiftAmount = 16 - inputBitDepth; + float shiftFactor = static_cast(1 << shiftAmount); + + shaderStr << "\n // MSB-to-LSB shift for high bit-depth " + << (isInputBuffer ? "buffer" : "image") << " data\n"; + + if (isInputBuffer) { + // For buffers, we use actual bit shifting operations on integer values + shaderStr << " // For high bit-depth data in buffers, we need to shift right by " + << shiftAmount << " bits to convert from MSB-aligned to actual values\n" + << " // This is a right shift operation for integer values\n"; + + // Build a condition mask based on which components are being read + std::string maskCondition = ""; + bool needsOr = false; + + if (imageAspects & VK_IMAGE_ASPECT_PLANE_0_BIT) { + maskCondition += "YCbCrRawOut.x > 0.0"; + needsOr = true; + } + + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + if (needsOr) maskCondition += " || "; + maskCondition += "YCbCrRawOut.y > 0.0"; + needsOr = true; + } + + if (imageAspects & VK_IMAGE_ASPECT_PLANE_2_BIT) { + if (needsOr) maskCondition += " || "; + maskCondition += "YCbCrRawOut.z > 0.0"; + } + + // Only apply shift if there are values to shift + if (!maskCondition.empty()) { + shaderStr << " if (" << maskCondition << ") {\n" + << " // Convert from uint values to normalized float (for buffer inputs)\n"; + + if (inputBitDepth == 10) { + shaderStr << " // For 10-bit: 
Convert 10-bit values [0-1023] to normalized [0-1]\n" + << " const float normFactor = 1.0 / 1023.0;\n"; + } else { // 12-bit + shaderStr << " // For 12-bit: Convert 12-bit values [0-4095] to normalized [0-1]\n" + << " const float normFactor = 1.0 / 4095.0;\n"; + } + + // Apply right shift with bit mask to extract the actual bit values + // For 10-bit: (value >> 6) & 0x3FF = value / 64 (rounded down) + // For 12-bit: (value >> 4) & 0xFFF = value / 16 (rounded down) + shaderStr << " // Apply right shift to convert from MSB-aligned to actual bit values\n"; + + // Apply component-specific shifting based on which aspects are being read + if (imageAspects & VK_IMAGE_ASPECT_PLANE_0_BIT) { + shaderStr << " YCbCrRawOut.x = floor(YCbCrRawOut.x / " << shiftFactor + << ".0) * normFactor;\n"; + } + + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + shaderStr << " YCbCrRawOut.y = floor(YCbCrRawOut.y / " << shiftFactor + << ".0) * normFactor;\n"; + } + + if (imageAspects & VK_IMAGE_ASPECT_PLANE_2_BIT) { + shaderStr << " YCbCrRawOut.z = floor(YCbCrRawOut.z / " << shiftFactor + << ".0) * normFactor;\n"; + } + + shaderStr << " }\n"; + } + } else { + // For images, we're already working with normalized values, so we divide by shiftFactor + shaderStr << " // For high bit-depth data in images that are MSB-aligned,\n" + << " // we need to divide by " << shiftFactor << " to get the proper normalized values\n"; + + // Build a shift mask based on which components are being read + std::string shiftMask = "vec3("; + shiftMask += (imageAspects & VK_IMAGE_ASPECT_PLANE_0_BIT) ? "1.0, " : "0.0, "; + shiftMask += (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) ? "1.0, " : "0.0, "; + shiftMask += (imageAspects & VK_IMAGE_ASPECT_PLANE_2_BIT) ? 
"1.0" : "0.0"; + shiftMask += ")"; + + // Calculate reciprocal of shift factor (for multiplication instead of division) + float shiftFactorRecip = 1.0f / shiftFactor; + + // Only apply shift to the components that were actually read + shaderStr << " // Apply multiplication by reciprocal instead of division (more efficient)\n" + << " const float shiftFactorRecip = " << std::fixed << std::setprecision(8) << shiftFactorRecip << "f;\n" + << " YCbCrRawOut = YCbCrRawOut * shiftFactorRecip * " << shiftMask << " + \n" + << " YCbCrRawOut * (vec3(1.0) - " << shiftMask << ");\n"; + } +} + +/** + * @brief Generates GLSL function for reading YCbCr data from either buffer or image sources + * + * Creates a function that reads YCbCr data from the appropriate source (buffer or image) + * based on the input format configuration. Handles different bit depths and plane layouts. + * + * @param shaderStr Output stringstream where the GLSL code will be written + * @param isInputBuffer Whether the input is a buffer (true) or image (false) + * @param inputBitDepth The bit depth of the input data (8, 10, 12, or 16) + * @param isInputTwoPlane Whether the input has two planes (e.g., NV12) or three planes + */ +static void GenReadYCbCrBuffer(std::stringstream& shaderStr, + bool isInputBuffer, + uint32_t inputBitDepth, + bool isInputTwoPlane, + bool enableMsbToLsbShift = false, + VkImageAspectFlags imageAspects = VK_IMAGE_ASPECT_PLANE_0_BIT | + VK_IMAGE_ASPECT_PLANE_1_BIT | + VK_IMAGE_ASPECT_PLANE_2_BIT, + const char* useProcessChromaBool = "processChroma") +{ + // Generate function to read from either buffer or image + shaderStr << + "// Function to read YCbCr data from input source (buffer or image)\n" + "vec3 readYCbCrFromSource(ivec2 pos, ivec2 chromaPos, uint srcLayer, bool processChroma) {\n" + " // Initialize to YCbCr black values (for limited range)\n"; + + // Set appropriate black values based on bit depth + if (inputBitDepth == 8) { + shaderStr << " vec3 YCbCrRawOut = 
vec3(16.0/255.0, 128.0/255.0, 128.0/255.0);\n\n"; + } else if (inputBitDepth == 10) { + shaderStr << " vec3 YCbCrRawOut = vec3(64.0/1023.0, 512.0/1023.0, 512.0/1023.0);\n\n"; + } else if (inputBitDepth == 12) { + shaderStr << " vec3 YCbCrRawOut = vec3(256.0/4095.0, 2048.0/4095.0, 2048.0/4095.0);\n\n"; + } else if (inputBitDepth == 16) { + shaderStr << " vec3 YCbCrRawOut = vec3(4096.0/65535.0, 32768.0/65535.0, 32768.0/65535.0);\n\n"; + } else { + // Default fallback + shaderStr << " vec3 YCbCrRawOut = vec3(16.0/255.0, 128.0/255.0, 128.0/255.0);\n\n"; + } + + if (isInputBuffer) { + // Reading from buffer + shaderStr << " // Reading from buffer source\n"; + + // Read Y component if PLANE_0_BIT is set + if (imageAspects & VK_IMAGE_ASPECT_PLANE_0_BIT) { + shaderStr << + " // Calculate buffer index for Y plane\n" + " uint yIndex = pushConstants.inYOffset + pos.y * pushConstants.inYPitch + pos.x;\n" + " YCbCrRawOut.x = fetchYFromBuffer(yIndex);\n\n"; + } + + // Read Cb/Cr components based on plane format and aspect flags + if ((imageAspects & (VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT)) != 0) { + // Add conditional check for chroma processing + shaderStr << " // Process chroma data conditionally\n" + << " if (processChroma) {\n"; + + if (isInputTwoPlane) { + // Two-plane input buffer format with interleaved CbCr + shaderStr << " // Read interleaved CbCr data from 2-plane input buffer\n" + << " uint cbcrIndex = pushConstants.inCbOffset + chromaPos.y * pushConstants.inCbPitch + chromaPos.x * 2;\n"; + + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + shaderStr << " YCbCrRawOut.y = fetchCbFromBuffer(cbcrIndex);\n" + << " YCbCrRawOut.z = fetchCrFromBuffer(cbcrIndex + 1);\n"; + } + } else { + // Three-plane input buffer format with separate Cb and Cr planes + shaderStr << " // Read separate Cb and Cr from 3-plane input buffer\n"; + + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + shaderStr << " uint cbIndex = pushConstants.inCbOffset + chromaPos.y * 
pushConstants.inCbPitch + chromaPos.x;\n" + << " YCbCrRawOut.y = fetchCbFromBuffer(cbIndex);\n"; + } + + if (imageAspects & VK_IMAGE_ASPECT_PLANE_2_BIT) { + shaderStr << " uint crIndex = pushConstants.inCrOffset + chromaPos.y * pushConstants.inCrPitch + chromaPos.x;\n" + << " YCbCrRawOut.z = fetchCrFromBuffer(crIndex);\n"; + } + } + + // Close the conditional block + shaderStr << " }\n"; + } + } else { + // Reading from image + shaderStr << " // Reading from image source\n"; + + // Read Y component if PLANE_0_BIT is set + if (imageAspects & VK_IMAGE_ASPECT_PLANE_0_BIT) { + shaderStr << " // Read Y value from Y plane\n" + << " YCbCrRawOut.x = imageLoad(inputImageY, ivec3(pos, srcLayer)).r;\n\n"; + } + + // Read Cb/Cr components based on plane format and aspect flags + if ((imageAspects & (VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT)) != 0) { + // Add conditional check for chroma processing + shaderStr << " // Process chroma data conditionally\n" + << " if (processChroma) {\n"; + + if (isInputTwoPlane) { + // Two-plane input image format with interleaved CbCr + shaderStr << " // Read interleaved CbCr data from 2-plane input image\n"; + + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + // For two-plane formats (NV12, etc.), both Cb and Cr are in the second plane + shaderStr << " YCbCrRawOut.yz = imageLoad(inputImageCbCr, ivec3(chromaPos, srcLayer)).rg;\n"; + } + } else { + // Three-plane input image format with separate Cb and Cr planes + shaderStr << " // Read separate Cb and Cr from 3-plane input image\n"; + + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + shaderStr << " YCbCrRawOut.y = imageLoad(inputImageCb, ivec3(chromaPos, srcLayer)).r; // Cb\n"; + } + + if (imageAspects & VK_IMAGE_ASPECT_PLANE_2_BIT) { + shaderStr << " YCbCrRawOut.z = imageLoad(inputImageCr, ivec3(chromaPos, srcLayer)).r; // Cr\n"; + } + } + + // Close the conditional block + shaderStr << " }\n"; + } + } + + // Apply MSB-to-LSB shift if enabled + if (enableMsbToLsbShift) 
{ + GenApplyMsbToLsbShift(shaderStr, isInputBuffer, inputBitDepth, imageAspects); + } + + // Return the raw YCbCr values + shaderStr << + "\n return YCbCrRawOut;\n" + "}\n\n"; +} + +/** + * @brief Generates GLSL function for applying LSB-to-MSB bit shifting for high bit-depth content + * + * Creates code to convert normalized values to MSB-aligned high bit-depth content by + * applying the appropriate bit shift. This function only handles the shift calculation, + * not the actual I/O operations. + * + * @param shaderStr Output stringstream where the GLSL code will be written + * @param isOutputBuffer Whether the output is a buffer (true) or image (false) + * @param outputBitDepth The bit depth of the output data (8, 10, 12, or 16) + */ +static void GenApplyLsbToMsbShift(std::stringstream& shaderStr, + bool isOutputBuffer, + uint32_t outputBitDepth) +{ + // Only apply for high bit-depth formats (10/12-bit) + if ((outputBitDepth != 10) && (outputBitDepth != 12)) { + // For 8-bit or 16-bit, no shift is needed - just use the input values directly + shaderStr << " // No bit-depth shift needed for " << outputBitDepth << "-bit format\n\n"; + return; + } + + // Calculate shift amount based on bit depth + uint32_t shiftAmount = 16 - outputBitDepth; + float shiftFactor = static_cast(1 << shiftAmount); + + shaderStr << " // Apply LSB-to-MSB shift for high bit-depth " + << (isOutputBuffer ? 
"buffer" : "image") << " data\n"; + + if (isOutputBuffer) { + // For buffers, we'll return unshifted values because the packing functions + // handle the bit shifting during the actual write operation + shaderStr << " // For buffer output, shift will be applied during packing\n\n"; + } else { + // For images, we need to multiply by shift factor to align bits properly + // Calculate multiplication factor + shaderStr << " // For image output with " << outputBitDepth << "-bit, multiply by " << shiftFactor + << " to shift into the MSB\n" + << " const float shiftFactorMultiplier = " << shiftFactor << ";\n" + << " YCbCrRawIn = YCbCrRawIn * shiftFactorMultiplier;\n\n"; + } +} + +/** + * @brief Generates GLSL function for writing YCbCr data to either buffer or image destinations + * + * Creates a function that writes YCbCr data to the appropriate destination (buffer or image) + * based on the output format configuration. Handles different bit depths and plane layouts. + * + * @param shaderStr Output stringstream where the GLSL code will be written + * @param isOutputBuffer Whether the output is a buffer (true) or image (false) + * @param outputBitDepth The bit depth of the output data (8, 10, 12, or 16) + * @param isOutputTwoPlane Whether the output format has two planes (e.g., NV12) or three planes + */ +static void GenWriteYCbCrBuffer(std::stringstream& shaderStr, + bool isOutputBuffer, + uint32_t outputBitDepth, + bool isOutputTwoPlane, + bool enableLsbToMsbShift = false, + VkImageAspectFlags imageAspects = VK_IMAGE_ASPECT_PLANE_0_BIT | + VK_IMAGE_ASPECT_PLANE_1_BIT | + VK_IMAGE_ASPECT_PLANE_2_BIT, + const char* useProcessChromaBool = "processChroma") +{ + // Generate function to write to either buffer or image + shaderStr << + "// Function to write YCbCr data to output destination (buffer or image)\n" + "void writeYCbCrToDestination(vec3 YCbCrRawIn, ivec2 pos, ivec2 chromaPos, uint dstLayer, bool processChroma) {\n"; + + // Apply LSB-to-MSB shift if enabled - just 
transforms the values, doesn't do I/O + if (enableLsbToMsbShift) { + GenApplyLsbToMsbShift(shaderStr, isOutputBuffer, outputBitDepth); + } + + if (isOutputBuffer) { + // Writing to buffer + shaderStr << + " // Writing to buffer destination\n"; + + // Write Y component if PLANE_0_BIT is set + if (imageAspects & VK_IMAGE_ASPECT_PLANE_0_BIT) { + shaderStr << + " // Calculate buffer index for Y plane\n" + " uint outYIndex = pushConstants.outYOffset + pos.y * pushConstants.outYPitch + pos.x;\n\n"; + + // Handle normal Y component based on bit depth + if (outputBitDepth > 8) { + // For high bit-depth formats + switch (outputBitDepth) { + case 10: + shaderStr << " outputBufferY.data[outYIndex] = pack10BitTo16Bit(YCbCrRawIn.x);\n\n"; + break; + case 12: + shaderStr << " outputBufferY.data[outYIndex] = pack12BitTo16Bit(YCbCrRawIn.x);\n\n"; + break; + case 16: + default: + // For 16-bit, direct value + shaderStr << " outputBufferY.data[outYIndex] = uint16_t(clamp(YCbCrRawIn.x, 0.0, 65535.0));\n\n"; + break; + } + } else { + // For 8-bit formats + shaderStr << " outputBufferY.data[outYIndex] = uint8_t(clamp(YCbCrRawIn.x, 0.0, 255.0));\n\n"; + } + } + + // Write Cb/Cr components based on plane format and aspect flags + if ((imageAspects & (VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT)) != 0) { + shaderStr << " // Process chroma data conditionally\n" + << " if (processChroma) {\n"; + + if (isOutputTwoPlane) { + // Two-plane output buffer format with interleaved CbCr + shaderStr << " // Write interleaved CbCr to 2-plane output buffer\n" + << " uint outCbCrIndex = pushConstants.outCbOffset + chromaPos.y * pushConstants.outCbPitch + chromaPos.x * 2;\n"; + + // Normal CbCr processing + if (outputBitDepth > 8) { + // For high bit-depth formats with interleaved data + switch (outputBitDepth) { + case 10: + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + shaderStr << " outputBufferCbCr.data[outCbCrIndex] = pack10BitTo16Bit(YCbCrRawIn.y);\n" + << " 
outputBufferCbCr.data[outCbCrIndex + 1] = pack10BitTo16Bit(YCbCrRawIn.z);\n"; + } + break; + case 12: + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + shaderStr << " outputBufferCbCr.data[outCbCrIndex] = pack12BitTo16Bit(YCbCrRawIn.y);\n" + << " outputBufferCbCr.data[outCbCrIndex + 1] = pack12BitTo16Bit(YCbCrRawIn.z);\n"; + } + break; + case 16: + default: + // For 16-bit, direct values + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + shaderStr << " outputBufferCbCr.data[outCbCrIndex] = uint16_t(clamp(YCbCrRawIn.y, 0.0, 65535.0));\n" + << " outputBufferCbCr.data[outCbCrIndex + 1] = uint16_t(clamp(YCbCrRawIn.z, 0.0, 65535.0));\n"; + } + break; + } + } else { + // For 8-bit formats + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + shaderStr << " outputBufferCbCr.data[outCbCrIndex] = uint8_t(clamp(YCbCrRawIn.y, 0.0, 255.0));\n" + << " outputBufferCbCr.data[outCbCrIndex + 1] = uint8_t(clamp(YCbCrRawIn.z, 0.0, 255.0));\n"; + } + } + } else { + // Three-plane output buffer format with separate Cb and Cr planes + shaderStr << " // Write separate Cb and Cr to 3-plane output buffer\n"; + + // Calculate indices for separate planes + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + shaderStr << " uint outCbIndex = pushConstants.outCbOffset + chromaPos.y * pushConstants.outCbPitch + chromaPos.x;\n"; + } + + if (imageAspects & VK_IMAGE_ASPECT_PLANE_2_BIT) { + shaderStr << " uint outCrIndex = pushConstants.outCrOffset + chromaPos.y * pushConstants.outCrPitch + chromaPos.x;\n"; + } + + if (outputBitDepth > 8) { + // For high bit-depth formats + switch (outputBitDepth) { + case 10: + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + shaderStr << " outputBufferCb.data[outCbIndex] = pack10BitTo16Bit(YCbCrRawIn.y);\n"; + } + if (imageAspects & VK_IMAGE_ASPECT_PLANE_2_BIT) { + shaderStr << " outputBufferCr.data[outCrIndex] = pack10BitTo16Bit(YCbCrRawIn.z);\n"; + } + break; + case 12: + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + shaderStr << " 
outputBufferCb.data[outCbIndex] = pack12BitTo16Bit(YCbCrRawIn.y);\n"; + } + if (imageAspects & VK_IMAGE_ASPECT_PLANE_2_BIT) { + shaderStr << " outputBufferCr.data[outCrIndex] = pack12BitTo16Bit(YCbCrRawIn.z);\n"; + } + break; + case 16: + default: + // For 16-bit, direct values + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + shaderStr << " outputBufferCb.data[outCbIndex] = uint16_t(clamp(YCbCrRawIn.y, 0.0, 65535.0));\n"; + } + if (imageAspects & VK_IMAGE_ASPECT_PLANE_2_BIT) { + shaderStr << " outputBufferCr.data[outCrIndex] = uint16_t(clamp(YCbCrRawIn.z, 0.0, 65535.0));\n"; + } + break; + } + } else { + // For 8-bit formats + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + shaderStr << " outputBufferCb.data[outCbIndex] = uint8_t(clamp(YCbCrRawIn.y, 0.0, 255.0));\n"; + } + if (imageAspects & VK_IMAGE_ASPECT_PLANE_2_BIT) { + shaderStr << " outputBufferCr.data[outCrIndex] = uint8_t(clamp(YCbCrRawIn.z, 0.0, 255.0));\n"; + } + } + } + + shaderStr << " }\n"; // Close conditional chroma processing + } + } else { + // Writing to image + shaderStr << " // Writing to image destination\n"; + + // Write Y component if PLANE_0_BIT is set + if (imageAspects & VK_IMAGE_ASPECT_PLANE_0_BIT) { + shaderStr << " // Write Y component to Y plane\n" + << " imageStore(outputImageY, ivec3(pos, dstLayer), vec4(YCbCrRawIn.x, 0, 0, 1));\n\n"; + } + + // Write Cb/Cr components if their aspect flags are set + if ((imageAspects & (VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT)) != 0) { + // Add conditional check for chroma processing + shaderStr << " // Process chroma data conditionally\n" + << " if (processChroma) {\n"; + + if (isOutputTwoPlane) { + // Two-plane output image format with interleaved CbCr + if ((imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) != 0) { + // Both Cb and Cr are needed + shaderStr << " // Write interleaved CbCr to 2-plane output image\n" + << " imageStore(outputImageCbCr, ivec3(chromaPos, dstLayer), " + << "vec4(YCbCrRawIn.y, YCbCrRawIn.z, 0, 
1));\n"; + } + } else { + // Three-plane output image format with separate Cb and Cr planes + shaderStr << " // Write separate Cb and Cr to 3-plane output image\n"; + + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + shaderStr << " imageStore(outputImageCb, ivec3(chromaPos, dstLayer), vec4(YCbCrRawIn.y, 0, 0, 1));\n"; + } + + if (imageAspects & VK_IMAGE_ASPECT_PLANE_2_BIT) { + shaderStr << " imageStore(outputImageCr, ivec3(chromaPos, dstLayer), vec4(YCbCrRawIn.z, 0, 0, 1));\n"; + } + } + + // Close the conditional block + shaderStr << " }\n"; + } + } + + // End the function + shaderStr << "}\n\n"; +} + +uint32_t VulkanFilterYuvCompute::ShaderGenerateImagePlaneDescriptors(std::stringstream& shaderStr, + VkImageAspectFlags& imageAspects, + const char *imageName, + VkFormat imageFormat, + bool isInput, + uint32_t startBinding, + uint32_t set, + bool imageArray) +{ + shaderStr << " // The " << (isInput ? "input" : "output") << " image binding\n"; // Image binding goes in this pattern: // offset 0: RGBA image // offset 1: multi-planar image plane Y @@ -267,7 +1183,8 @@ void VulkanFilterYuvCompute::ShaderGeneratePlaneDescriptors(std::stringstream& s } else if (inputMpInfo->planesLayout.numberOfExtraPlanes == 2) { - imageAspects = VK_IMAGE_ASPECT_PLANE_0_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT; + imageAspects = VK_IMAGE_ASPECT_PLANE_0_BIT | VK_IMAGE_ASPECT_PLANE_1_BIT | + VK_IMAGE_ASPECT_PLANE_2_BIT; GenImageIoBindingLayout(shaderStr, imageName, "Cb", vkFormatLookUp(inputMpInfo->vkPlaneFormat[1])->name, @@ -290,10 +1207,631 @@ void VulkanFilterYuvCompute::ShaderGeneratePlaneDescriptors(std::stringstream& s GenImageIoBindingLayout(shaderStr, imageName, "RGB", vkFormatLookUp(imageFormat)->name, isInput, - startBinding, + startBinding++, set, imageArray); } + + return startBinding; +} + +uint32_t VulkanFilterYuvCompute::ShaderGenerateBufferPlaneDescriptors(std::stringstream& shaderStr, + VkImageAspectFlags& imageAspects, + const char *bufferName, + VkFormat bufferFormat, + 
bool isInput, + uint32_t startBinding, + uint32_t set, + VkDescriptorType bufferType) +{ + // Buffer binding follows the same pattern as image binding: + // offset 0: Single RGBA buffer with all data + // offset 1: Y plane buffer + // offset 2: 2-planar CbCr buffer or 3-planar Cb buffer + // offset 3: 3-planar Cr buffer + const VkMpFormatInfo* inputMpInfo = YcbcrVkFormatInfo(bufferFormat); + + // Determine element size based on format + const char* elementType = "uint8_t"; // Default to 8-bit + + shaderStr << " // The " << (isInput ? "input" : "output") << " buffer binding\n"; + // Check format for higher bit depths (16-bit formats) + const VkFormatDesc* formatInfo = vkFormatLookUp(bufferFormat); + if (formatInfo && formatInfo->name) { + if (strstr(formatInfo->name, "16") != nullptr || + strstr(formatInfo->name, "R16") != nullptr || + strstr(formatInfo->name, "10") != nullptr || + strstr(formatInfo->name, "12") != nullptr) { + elementType = "uint16_t"; // Use 16-bit for 10/12/16-bit formats + } + } + + if (inputMpInfo) { + // For multi-planar formats, define separate buffers for each plane + + // Y plane buffer (plane 0) + GenBufferIoBindingLayout(shaderStr, bufferName, "Y", + elementType, + bufferType, + isInput, + ++startBinding, + set); + + if (inputMpInfo->planesLayout.numberOfExtraPlanes == 1) { + // 2-plane format (NV12, NV21, etc.) + imageAspects = VK_IMAGE_ASPECT_PLANE_0_BIT | VK_IMAGE_ASPECT_PLANE_1_BIT; + + GenBufferIoBindingLayout(shaderStr, bufferName, "CbCr", + elementType, + bufferType, + isInput, + ++startBinding, + set); + + } else if (inputMpInfo->planesLayout.numberOfExtraPlanes == 2) { + // 3-plane format (YUV 4:2:0, 4:2:2, 4:4:4, etc.) 
+ imageAspects = VK_IMAGE_ASPECT_PLANE_0_BIT | VK_IMAGE_ASPECT_PLANE_1_BIT | + VK_IMAGE_ASPECT_PLANE_2_BIT; + + GenBufferIoBindingLayout(shaderStr, bufferName, "Cb", + elementType, + bufferType, + isInput, + ++startBinding, + set); + + GenBufferIoBindingLayout(shaderStr, bufferName, "Cr", + elementType, + bufferType, + isInput, + ++startBinding, + set); + } + } else { + // For single-plane formats (like RGBA) + imageAspects = VK_IMAGE_ASPECT_COLOR_BIT; + + GenBufferIoBindingLayout(shaderStr, bufferName, "RGB", + elementType, + bufferType, + isInput, + startBinding++, + set); + } + + return startBinding; +} + + +uint32_t VulkanFilterYuvCompute::ShaderGeneratePlaneDescriptors(std::stringstream& shaderStr, + bool isInput, + uint32_t startBinding, + uint32_t set, + bool imageArray, + VkDescriptorType bufferType) +{ + + if ((isInput && m_inputIsBuffer) || (!isInput && m_outputIsBuffer)) { + + return ShaderGenerateBufferPlaneDescriptors(shaderStr, + isInput ? m_inputImageAspects : m_outputImageAspects, + isInput ? "inputBuffer" : "outputBuffer", + isInput ? m_inputFormat : m_outputFormat, + isInput, // isInput + startBinding, // startBinding + set, // set + bufferType); + } else { + + return ShaderGenerateImagePlaneDescriptors(shaderStr, + isInput ? m_inputImageAspects : m_outputImageAspects, + isInput ? "inputImage" : "outputImage", + isInput ? m_inputFormat : m_outputFormat, + isInput, // isInput + startBinding, // startBinding + set, // set + imageArray // imageArray + ); + } +} + +/** + * @brief Generates GLSL functions for YCbCr normalization with different bit depths + * + * Creates helper functions to normalize YCbCr values, handling different bit depths, + * and applying proper range adjustments (limited/full range). + * + * Process steps: + * 1. Calculate normalization parameters based on bit depth and range + * 2. Generate Y normalization function (scaling + offset) + * 3. Generate CbCr shifting functions (centering around zero) + * 4. 
/**
 * @brief Generates GLSL functions for YCbCr normalization with different bit depths
 *
 * Writes helper functions that map raw YCbCr code values of the given bit depth
 * and range (limited/full) to normalized values.
 *
 * Emitted GLSL, in order:
 *  1. A "precision highp" preamble and normalizeY()            - always.
 *  2. shiftCbCr() (vec2 and vec3 overloads), normalizeCbCr()   - only when hasChroma.
 *  3. extract<N>BitFrom16Bit(), extract<N>BitNormalized()      - only for 10/12-bit.
 *  4. normalize<N>BitYUV()                                     - only for 10/12-bit
 *     AND hasChroma. It calls normalizeCbCr(), so emitting it for a luma-only
 *     shader would reference a function that was never generated.
 *
 * @param shaderStr      Output stringstream where the GLSL code will be written
 * @param bitDepth       The bit depth of the YCbCr data (8, 10, 12, or 16); any other
 *                       value uses the 8-bit limited-range table when isLimitedRange
 *                       is true
 * @param isLimitedRange Whether values are limited range (true) or full range (false)
 * @param hasChroma      Whether to include the chroma (CbCr) functions
 */
static void GenYCbCrNormalizationFuncs(std::stringstream& shaderStr,
                                       uint32_t bitDepth = 8,
                                       bool isLimitedRange = true,
                                       bool hasChroma = true)
{
    // STEP 1: Calculate normalization parameters based on bit depth and range
    // ===========================================================================

    // Largest code value at this bit depth (double keeps the later divisions exact enough).
    const double maxValue = (1ULL << bitDepth) - 1.0;

    // Full-range defaults; overridden below for limited (TV/video) range.
    double yBlack = 0.0;
    double yWhite = maxValue;
    double cZero = 0.0;
    double cScale = maxValue;

    if (isLimitedRange) {
        switch (bitDepth) {
        case 10:
            // 10-bit limited range: Y[64,940], C[64,960]
            yBlack = 64.0;
            yWhite = 940.0;
            cZero = 64.0;
            cScale = 896.0; // 960 - 64
            break;
        case 12:
            // 12-bit limited range: Y[256,3760], C[256,3840]
            yBlack = 256.0;
            yWhite = 3760.0;
            cZero = 256.0;
            cScale = 3584.0; // 3840 - 256
            break;
        case 16:
            // 16-bit limited range: scale 8-bit values by 2^8
            yBlack = 16.0 * 256.0;
            yWhite = 235.0 * 256.0;
            cZero = 16.0 * 256.0;
            cScale = 224.0 * 256.0;
            break;
        case 8:
        default:
            // 8-bit limited range: Y[16,235], C[16,240]
            yBlack = 16.0;
            yWhite = 235.0;
            cZero = 16.0;
            cScale = 224.0;
            break;
        }
    }

    const double yRange = yWhite - yBlack;
    const double yFactor = 1.0 / yRange;
    const double yOffset = -yBlack * yFactor;
    const double cFactor = 1.0 / cScale;

    // Formats a constant as fixed-point with 16 decimals for use as a GLSL literal.
    const auto toFixed = [](double value) {
        std::ostringstream ss;
        ss.precision(16);
        ss << std::fixed << value;
        return ss.str();
    };

    const char* const rangeName = isLimitedRange ? "limited range" : "full range";

    // STEP 2: Generate Y normalization function
    // ===========================================================================
    shaderStr << "\n"
              << "// Specify high precision for all floating point calculations\n"
              << "precision highp float;\n"
              << "precision highp int;\n"
              << "\n"
              << "// STEP 1: Normalize Y component for " << bitDepth << "-bit "
              << rangeName << " content\n"
              << "highp float normalizeY(highp float Y) {\n";

    if (isLimitedRange) {
        // Limited range needs black level adjustment and scaling
        shaderStr << " // Step 1.1: Map from [" << yBlack << ", " << yWhite << "] to [0.0, 1.0]\n"
                  << " // Formula: normalizedY = (Y - yBlack) / yRange = Y * yFactor + yOffset\n"
                  << " return Y * " << toFixed(yFactor) << " + " << toFixed(yOffset) << ";\n";
    } else {
        // Full range just needs scaling
        shaderStr << " // Step 1.1: Map from [0, " << maxValue << "] to [0.0, 1.0]\n"
                  << " // Formula: normalizedY = Y / maxValue\n"
                  << " return Y / " << maxValue << ";\n";
    }
    shaderStr << "}\n\n";

    if (hasChroma) {
        // STEP 3: Generate CbCr shifting functions
        // ===========================================================================

        // vec2 overload (common for 2-plane formats)
        shaderStr << "// STEP 2: Shift CbCr components from centered range to [-0.5, 0.5] range\n"
                  << "highp vec2 shiftCbCr(highp vec2 CbCr) {\n"
                  << " // Step 2.1: Shift from [0.0, 1.0] to [-0.5, 0.5]\n"
                  << " return CbCr - 0.5;\n"
                  << "}\n\n";

        // vec3 overload (full YCbCr triplet)
        shaderStr << "// Step 2 (alternative): Shift YCbCr components, leaving Y alone but centering CbCr\n"
                  << "highp vec3 shiftCbCr(highp vec3 ycbcr) {\n"
                  << " // Step 2.1: Shift only Cb and Cr from [0.0, 1.0] to [-0.5, 0.5]\n"
                  << " const highp vec3 shift = vec3(0.0, -0.5, -0.5);\n"
                  << " return ycbcr + shift;\n"
                  << "}\n\n";

        // STEP 4: Generate CbCr normalization function
        // ===========================================================================
        shaderStr << "// STEP 3: Normalize CbCr components for " << bitDepth << "-bit "
                  << rangeName << " content\n"
                  << "highp vec2 normalizeCbCr(highp vec2 CbCr) {\n";

        if (isLimitedRange) {
            // Limited range needs zero level adjustment and scaling
            shaderStr << " // Step 3.1: Map from [" << cZero << ", " << (cZero + cScale) << "] to [0.0, 1.0]\n"
                      << " // Formula: normalizedCbCr = (CbCr - cZero) / cScale\n"
                      << " return (CbCr - " << toFixed(cZero) << ") * " << toFixed(cFactor) << ";\n";
        } else {
            // Full range just needs scaling
            shaderStr << " // Step 3.1: Map from [0, " << maxValue << "] to [0.0, 1.0]\n"
                      << " // Formula: normalizedCbCr = CbCr / maxValue\n"
                      << " return CbCr / " << maxValue << ";\n";
        }
        shaderStr << "}\n\n";
    }

    // STEP 5: Generate bit-depth specific helper functions for 10/12-bit formats
    // ===========================================================================
    // The 10- and 12-bit helpers differ only in the bit count, the number of
    // padding bits and the maximum code value, so they share one emitter.
    const auto emitHighBitDepthHelpers = [&shaderStr, hasChroma](const char* bits,
                                                                 const char* padBits,
                                                                 const char* maxCode) {
        shaderStr << "// STEP 4: Special " << bits << "-bit format handling functions\n"
                  << "// " << bits << "-bit packing formats often store values in uint16 or uint32 with specific bit layouts\n"
                  << "\n"
                  << "// Extract " << bits << "-bit value from 16-bit storage (common for P0" << bits << ", P2" << bits << ", etc.)\n"
                  << "highp float extract" << bits << "BitFrom16Bit(highp uint value) {\n"
                  << " // Most " << bits << "-bit formats store the value in the most significant " << bits << " bits\n"
                  << " highp uint raw" << bits << "bit = value >> " << padBits << "; // Shift right to remove " << padBits << " padding bits\n"
                  << " return float(raw" << bits << "bit);\n"
                  << "}\n\n"

                  << "// Extract " << bits << "-bit value from 16-bit storage as normalized float\n"
                  << "highp float extract" << bits << "BitNormalized(highp uint value) {\n"
                  << " highp uint raw" << bits << "bit = value >> " << padBits << "; // Shift right to remove " << padBits << " padding bits\n"
                  << " return float(raw" << bits << "bit) / " << maxCode << "; // Normalize to [0,1]\n"
                  << "}\n\n";

        if (!hasChroma) {
            // The triplet helper below calls normalizeCbCr(), which only exists
            // when chroma functions were generated.
            return;
        }

        shaderStr << "// Normalize packed " << bits << "-bit YUV directly\n"
                  << "highp vec3 normalize" << bits << "BitYUV(highp uvec3 packedYuv) {\n"
                  << " // Extract " << bits << "-bit components\n"
                  << " highp float y = extract" << bits << "BitFrom16Bit(packedYuv.x);\n"
                  << " highp float cb = extract" << bits << "BitFrom16Bit(packedYuv.y);\n"
                  << " highp float cr = extract" << bits << "BitFrom16Bit(packedYuv.z);\n"
                  << " // Normalize components\n"
                  << " y = normalizeY(y);\n"
                  << " highp vec2 cbcr = normalizeCbCr(vec2(cb, cr));\n"
                  << " return vec3(y, cbcr);\n"
                  << "}\n\n";
    };

    if (bitDepth == 10) {
        emitHighBitDepthHelpers("10", "6", "1023.0");
    } else if (bitDepth == 12) {
        emitHighBitDepthHelpers("12", "4", "4095.0");
    }
}
/**
 * @brief Generates GLSL functions for YCbCr denormalization with different bit depths
 *
 * Writes helper functions that map normalized Y ([0,1]) and shifted CbCr
 * ([-0.5,0.5]) values back to raw code values of the target bit depth and
 * range. This is the inverse operation of GenYCbCrNormalizationFuncs.
 *
 * Emitted GLSL, in order:
 *  1. A "precision highp" preamble and denormalizeY()                    - always.
 *  2. unshiftCbCr(), denormalizeCbCr(), unshiftAndDenormalizeCbCr(),
 *     denormalizeYCbCr()                                                - only when hasChroma.
 *  3. pack<N>BitTo16Bit(), packNormalizedTo<N>Bit()                      - only for 10/12-bit.
 *  4. packYUVTo<N>Bit()                                                  - only for 10/12-bit
 *     AND hasChroma. It calls denormalizeYCbCr(), so emitting it for a
 *     luma-only shader would reference a function that was never generated.
 *
 * @param shaderStr      Output stringstream where the GLSL code will be written
 * @param bitDepth       The target bit depth for the YCbCr data (8, 10, 12, or 16); any
 *                       other value uses the 8-bit limited-range table when
 *                       isLimitedRange is true
 * @param isLimitedRange Whether target values are limited range (true) or full range (false)
 * @param hasChroma      Whether to include the chroma (CbCr) functions
 */
static void GenYCbCrDeNormalizationFuncs(std::stringstream& shaderStr,
                                         uint32_t bitDepth = 8,
                                         bool isLimitedRange = true,
                                         bool hasChroma = true)
{
    // STEP 1: Calculate denormalization parameters based on bit depth and range
    // ===========================================================================

    // Largest code value at this bit depth.
    const double maxValue = (1ULL << bitDepth) - 1.0;

    // Full-range defaults; overridden below for limited (TV/video) range.
    double yBlack = 0.0;
    double yWhite = maxValue;
    double cZero = 0.0;
    double cScale = maxValue;

    if (isLimitedRange) {
        switch (bitDepth) {
        case 10:
            // 10-bit limited range: Y[64,940], C[64,960]
            yBlack = 64.0;
            yWhite = 940.0;
            cZero = 64.0;
            cScale = 896.0; // 960 - 64
            break;
        case 12:
            // 12-bit limited range: Y[256,3760], C[256,3840]
            yBlack = 256.0;
            yWhite = 3760.0;
            cZero = 256.0;
            cScale = 3584.0; // 3840 - 256
            break;
        case 16:
            // 16-bit limited range: scale 8-bit values by 2^8
            yBlack = 16.0 * 256.0;
            yWhite = 235.0 * 256.0;
            cZero = 16.0 * 256.0;
            cScale = 224.0 * 256.0;
            break;
        case 8:
        default:
            // 8-bit limited range: Y[16,235], C[16,240]
            yBlack = 16.0;
            yWhite = 235.0;
            cZero = 16.0;
            cScale = 224.0;
            break;
        }
    }

    const double yRange = yWhite - yBlack;

    // Formats a constant as fixed-point with 16 decimals for use as a GLSL literal.
    const auto toFixed = [](double value) {
        std::ostringstream ss;
        ss.precision(16);
        ss << std::fixed << value;
        return ss.str();
    };

    const char* const rangeName = isLimitedRange ? "limited range" : "full range";

    // STEP 2: Generate Y denormalization function
    // ===========================================================================
    shaderStr << "\n"
              << "// Specify high precision for all floating point calculations\n"
              << "precision highp float;\n"
              << "precision highp int;\n"
              << "\n"
              << "// STEP 1: Denormalize Y component from [0.0, 1.0] back to " << bitDepth << "-bit "
              << rangeName << " content\n"
              << "highp float denormalizeY(highp float normalizedY) {\n";

    if (isLimitedRange) {
        // Limited range needs scaling and black level adjustment
        shaderStr << " // Step 1.1: Map from [0.0, 1.0] back to [" << yBlack << ", " << yWhite << "]\n"
                  << " // Formula: Y = normalizedY * yRange + yBlack\n"
                  << " return normalizedY * " << toFixed(yRange) << " + " << toFixed(yBlack) << ";\n";
    } else {
        // Full range just needs scaling
        shaderStr << " // Step 1.1: Map from [0.0, 1.0] back to [0, " << maxValue << "]\n"
                  << " // Formula: Y = normalizedY * maxValue\n"
                  << " return normalizedY * " << maxValue << ";\n";
    }
    shaderStr << "}\n\n";

    if (hasChroma) {
        // STEP 3: Generate CbCr unshifting function
        // ===========================================================================
        shaderStr << "// STEP 2: Unshift CbCr components from [-0.5, 0.5] range back to centered range [0.0, 1.0]\n"
                  << "highp vec2 unshiftCbCr(highp vec2 shiftedCbCr) {\n"
                  << " // Step 2.1: Shift from [-0.5, 0.5] back to [0.0, 1.0]\n"
                  << " return shiftedCbCr + 0.5;\n"
                  << "}\n\n";

        // STEP 4: Generate CbCr denormalization function
        // ===========================================================================
        shaderStr << "// STEP 3: Denormalize CbCr components from [0.0, 1.0] back to " << bitDepth << "-bit "
                  << rangeName << " content\n"
                  << "highp vec2 denormalizeCbCr(highp vec2 normalizedCbCr) {\n";

        if (isLimitedRange) {
            // Limited range needs scaling and zero level adjustment
            shaderStr << " // Step 3.1: Map from [0.0, 1.0] back to [" << cZero << ", " << (cZero + cScale) << "]\n"
                      << " // Formula: CbCr = normalizedCbCr * cScale + cZero\n"
                      << " return normalizedCbCr * " << toFixed(cScale) << " + " << toFixed(cZero) << ";\n";
        } else {
            // Full range just needs scaling
            shaderStr << " // Step 3.1: Map from [0.0, 1.0] back to [0, " << maxValue << "]\n"
                      << " // Formula: CbCr = normalizedCbCr * maxValue\n"
                      << " return normalizedCbCr * " << maxValue << ";\n";
        }
        shaderStr << "}\n\n";

        // STEP 5: Generate combined convenience functions
        // ===========================================================================

        // Combined unshift and denormalize
        shaderStr << "// STEP 4: Combined function: unshift and denormalize CbCr in one step\n"
                  << "highp vec2 unshiftAndDenormalizeCbCr(highp vec2 shiftedCbCr) {\n"
                  << " // Step 4.1: First unshift from [-0.5, 0.5] to [0.0, 1.0], then denormalize\n"
                  << " return denormalizeCbCr(unshiftCbCr(shiftedCbCr));\n"
                  << "}\n\n";

        // Full YCbCr denormalization
        shaderStr << "// STEP 5: Combined function to denormalize full YCbCr triplet\n"
                  << "highp vec3 denormalizeYCbCr(highp vec3 normalizedYCbCr) {\n"
                  << " // Step 5.1: Denormalize Y component\n"
                  << " highp float y = denormalizeY(normalizedYCbCr.x);\n"
                  << " // Step 5.2: Unshift and denormalize Cb and Cr components\n"
                  << " highp vec2 cbcr = denormalizeCbCr(vec2(normalizedYCbCr.y + 0.5, normalizedYCbCr.z + 0.5));\n"
                  << " // Step 5.3: Combine the components into a single vector\n"
                  << " return vec3(y, cbcr);\n"
                  << "}\n\n";
    }

    // STEP 6: Generate bit-depth specific packing helpers for 10/12-bit formats
    // ===========================================================================
    // The 10- and 12-bit helpers differ only in the bit count, the number of
    // padding bits and the maximum code value, so they share one emitter.
    const auto emitPackingHelpers = [&shaderStr, hasChroma](const char* bits,
                                                            const char* padBits,
                                                            const char* maxCode) {
        shaderStr << "// STEP 6: Special " << bits << "-bit format packing functions\n"
                  << "// Pack " << bits << "-bit values into 16-bit storage (common for P0" << bits << ", P2" << bits << ", etc.)\n"
                  << "\n"
                  << "// Pack " << bits << "-bit value into 16-bit storage (MSB aligned with padding)\n"
                  << "highp uint pack" << bits << "BitTo16Bit(highp float value) {\n"
                  << " // Clamp the input value to the valid range for " << bits << "-bit\n"
                  << " highp uint raw" << bits << "bit = uint(clamp(value, 0.0, " << maxCode << "));\n"
                  << " // Shift left by " << padBits << " bits to store in MSB format (standard for P0" << bits << ", etc.)\n"
                  << " return raw" << bits << "bit << " << padBits << ";\n"
                  << "}\n\n"

                  << "// Pack normalized [0,1] value into " << bits << "-bit MSB aligned format\n"
                  << "highp uint packNormalizedTo" << bits << "Bit(highp float normalizedValue) {\n"
                  << " // Scale to " << bits << "-bit range and pack\n"
                  << " highp uint raw" << bits << "bit = uint(clamp(normalizedValue * " << maxCode << ", 0.0, " << maxCode << "));\n"
                  << " return raw" << bits << "bit << " << padBits << ";\n"
                  << "}\n\n";

        if (!hasChroma) {
            // The triplet helper below calls denormalizeYCbCr(), which only
            // exists when chroma functions were generated.
            return;
        }

        shaderStr << "// Pack denormalized YUV to " << bits << "-bit values\n"
                  << "highp uvec3 packYUVTo" << bits << "Bit(highp vec3 yuv) {\n"
                  << " // Denormalize components first\n"
                  << " highp vec3 denormYuv = denormalizeYCbCr(yuv);\n"
                  << " // Pack each component into 16-bit storage (MSB aligned)\n"
                  << " return uvec3(\n"
                  << " pack" << bits << "BitTo16Bit(denormYuv.x), // Y\n"
                  << " pack" << bits << "BitTo16Bit(denormYuv.y), // Cb\n"
                  << " pack" << bits << "BitTo16Bit(denormYuv.z) // Cr\n"
                  << " );\n"
                  << "}\n\n";
    };

    if (bitDepth == 10) {
        emitPackingHelpers("10", "6", "1023.0");
    } else if (bitDepth == 12) {
        emitPackingHelpers("12", "4", "4095.0");
    }
}
/**
 * @brief Generates the GLSL convertYCbCrFormat() helper.
 *
 * The emitted function normalizes a raw YCbCr triplet with normalizeY() /
 * normalizeCbCr() and immediately denormalizes it with denormalizeY() /
 * denormalizeCbCr(). Those four GLSL functions are not emitted here; the
 * generated text only calls them.
 *
 * The bit-depth and range parameters do not affect the generated text.
 *
 * @param shaderStr            Output stringstream where the GLSL code will be written
 * @param inputBitDepth        The bit depth of input YCbCr data (unused here)
 * @param outputBitDepth       The bit depth of output YCbCr data (unused here)
 * @param isInputLimitedRange  Whether the input uses limited range (unused here)
 * @param isOutputLimitedRange Whether the output uses limited range (unused here)
 */
static void GenConvertYCbCrFormat(std::stringstream& shaderStr,
                                  uint32_t inputBitDepth = 8,
                                  uint32_t outputBitDepth = 8,
                                  bool isInputLimitedRange = true,
                                  bool isOutputLimitedRange = true)
{
    // The generated text is independent of these arguments.
    (void)inputBitDepth;
    (void)outputBitDepth;
    (void)isInputLimitedRange;
    (void)isOutputLimitedRange;

    // One entry per chunk of GLSL, written to the stream in order.
    static const char* const kGlslChunks[] = {
        "// Function to handle YCbCr format conversion with proper normalization\n",
        "vec3 convertYCbCrFormat(vec3 YCbCrRawIn) {\n",
        " // Step 1: Normalize input YCbCr values to [0-1] range\n",
        " float normalizedY = normalizeY(YCbCrRawIn.x);\n",
        " vec2 normalizedCbCr = normalizeCbCr(vec2(YCbCrRawIn.y, YCbCrRawIn.z));\n\n",
        " // Step 2: Denormalize to output bit depth and range\n",
        " float y = denormalizeY(normalizedY);\n",
        " vec2 cbcr = denormalizeCbCr(normalizedCbCr);\n\n",
        " // Return the converted values\n",
        " return vec3(y, cbcr.x, cbcr.y);\n",
        "}\n\n",
    };

    for (const char* chunk : kGlslChunks) {
        shaderStr << chunk;
    }
}
Generate IO bindings // Input image - shaderStr << " // The input YCbCr image binding\n"; + shaderStr << " // The input YCbCr input binding\n"; + // Input Descriptors ShaderGeneratePlaneDescriptors(shaderStr, - m_inputImageAspects, - "inputImage", - m_inputFormat, true, // isInput 0, // startBinding 0, // set - true // imageArray - ); - - // Output image - shaderStr << " // The output RGBA image binding\n"; - ShaderGeneratePlaneDescriptors(shaderStr, - m_outputImageAspects, - "outputImage", - m_outputFormat, - false, // isInput - 4, // startBinding - 0, // set - true // imageArray - ); - - shaderStr << "\n" - " // TODO: normalize only narrow\n" - "float normalizeY(float Y) {\n" - " // return (Y - (16.0 / 255.0)) * (255.0 / (235.0 - 16.0));\n" - " return (Y - 0.0627451) * 1.164383562;\n" - "}\n" - "\n" - "vec2 shiftCbCr(vec2 CbCr) {\n" - " return CbCr - 0.5;\n" - "}\n" - "\n" - "vec3 shiftCbCr(vec3 ycbcr) {\n" - " const vec3 shiftCbCr = vec3(0.0, -0.5, -0.5);\n" - " return ycbcr + shiftCbCr;\n" - "}\n" - "\n" - " // TODO: normalize only narrow\n" - "vec2 normalizeCbCr(vec2 CbCr) {\n" - " // return (CbCr - (16.0 / 255.0)) / ((240.0 - 16.0) / 255.0);\n" - " return (CbCr - 0.0627451) * 1.138392857;\n" - "}\n" - "\n"; - - const VkSamplerYcbcrConversionCreateInfo& samplerYcbcrConversionCreateInfo = m_samplerYcbcrConversion.GetSamplerYcbcrConversionCreateInfo(); - const VkMpFormatInfo * mpInfo = YcbcrVkFormatInfo(samplerYcbcrConversionCreateInfo.format); + true, + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); + + shaderStr << " // The output RGBA image binding\n"; + // Output Descriptors + ShaderGeneratePlaneDescriptors(shaderStr, + false, // isInput + 4, // startBinding + 0, // set + true, // imageArray + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); + + // Get format information to determine bit depth + const VkSamplerYcbcrConversionCreateInfo& samplerYcbcrConversionCreateInfo = + m_samplerYcbcrConversion.GetSamplerYcbcrConversionCreateInfo(); + const VkMpFormatInfo* mpInfo = 
YcbcrVkFormatInfo(samplerYcbcrConversionCreateInfo.format); + + // Determine bit depth from the format + uint32_t bitDepth = mpInfo ? GetBitsPerChannel(mpInfo->planesLayout) : 8; + + // Determine if we're using limited or full range + bool isLimitedRange = (samplerYcbcrConversionCreateInfo.ycbcrRange == VK_SAMPLER_YCBCR_RANGE_ITU_NARROW); + + // 3. Generate helper functions for YCbCr normalization with proper bit depth handling + GenYCbCrNormalizationFuncs(shaderStr, bitDepth, isLimitedRange, true); + + // 4. Generate YCbCr to RGB conversion function const unsigned int bpp = (8 + mpInfo->planesLayout.bpp * 2); const YcbcrBtStandard btStandard = GetYcbcrPrimariesConstantsId(samplerYcbcrConversionCreateInfo.ycbcrModel); @@ -367,7 +1894,6 @@ size_t VulkanFilterYuvCompute::InitYCBCR2RGBA(std::string& computeShader) rangeConstants.cbMax, rangeConstants.crMax); - shaderStr << "vec3 convertYCbCrToRgb(vec3 yuv) {\n" " vec3 rgb;\n"; @@ -377,7 +1903,7 @@ size_t VulkanFilterYuvCompute::InitYCBCR2RGBA(std::string& computeShader) "}\n" "\n"; - + // 5. Generate color range normalization function YcbcrNormalizeColorRange yCbCrNormalizeColorRange(bpp, (samplerYcbcrConversionCreateInfo.ycbcrModel == VK_SAMPLER_YCBCR_MODEL_CONVERSION_RGB_IDENTITY) ? YCBCR_COLOR_RANGE_NATURAL : (YCBCR_COLOR_RANGE)samplerYcbcrConversionCreateInfo.ycbcrRange); @@ -390,21 +1916,51 @@ size_t VulkanFilterYuvCompute::InitYCBCR2RGBA(std::string& computeShader) "}\n" "\n"; + // 6. Generate function to fetch YCbCr components from images + shaderStr << + "vec3 fetchYCbCrFromImage(ivec3 pos) {\n" + " // Fetch from the texture.\n" + " float Y = imageLoad(inputImageY, pos).r;\n" + " // For subsampled formats, divide by 2\n" + " vec2 CbCr = imageLoad(inputImageCbCr, ivec3(pos.xy/2, pos.z)).rg;\n" + " return vec3(Y, CbCr);\n" + "}\n" + "\n"; + + // 7. 
Generate function to write RGBA to output image + shaderStr << + "void writeRgbaToImage(vec4 rgba, ivec3 pos) {\n" + " imageStore(outputImageRGB, pos, rgba);\n" + "}\n" + "\n"; + + // 8. Main function shaderStr << "void main()\n" "{\n"; + + // 9. Handle position calculation GenHandleImagePosition(shaderStr); + + // 10. Calculate source position with replication if enabled GenHandleSourcePositionWithReplicate(shaderStr, m_enableRowAndColumnReplication); + + // 11. YCbCr to RGB conversion shaderStr << - " // Fetch from the texture.\n" - " float Y = imageLoad(inputImageY, ivec3(srcPos, pushConstants.srcImageLayer)).r;\n" - " // TODO: it is /2 only for sub-sampled formats\n" - " vec2 CbCr = imageLoad(inputImageCbCr, ivec3(srcPos/2, pushConstants.srcImageLayer)).rg;\n" + " // Calculate position with layer\n" + " ivec3 srcPos3D = ivec3(srcPos, pushConstants.srcLayer);\n" + " ivec3 dstPos3D = ivec3(pos, pushConstants.dstLayer);\n" + "\n" + " // Fetch YCbCr components\n" + " vec3 ycbcr = fetchYCbCrFromImage(srcPos3D);\n" + "\n" + " // Process: normalize, shift, and convert to RGB\n" + " ycbcr = shiftCbCr(normalizeYCbCr(ycbcr));\n" + " vec3 rgb = convertYCbCrToRgb(ycbcr);\n" "\n" - " vec3 ycbcr = shiftCbCr(normalizeYCbCr(vec3(Y, CbCr)));\n" - " vec4 rgba = vec4(convertYCbCrToRgb(ycbcr),1.0);\n" - " // Store it back.\n" - " imageStore(outputImageRGB, ivec3(pos, pushConstants.dstImageLayer), rgba);\n" + " // Write final RGBA result\n" + " vec4 rgba = vec4(rgb, 1.0);\n" + " writeRgbaToImage(rgba, dstPos3D);\n" "}\n"; computeShader = shaderStr.str(); @@ -429,51 +1985,171 @@ size_t VulkanFilterYuvCompute::InitYCBCRCOPY(std::string& computeShader) // 3-planar: Cb (R) binding = 6 // 3-planar: Cr (R) binding = 7 + // Get format information to determine bit depths + const VkMpFormatInfo* inputMpInfo = YcbcrVkFormatInfo(m_inputFormat); + const VkMpFormatInfo* outputMpInfo = YcbcrVkFormatInfo(m_outputFormat); + + // Determine bit depth from the formats + const uint32_t inputBitDepth = 
inputMpInfo ? GetBitsPerChannel(inputMpInfo->planesLayout) : 8; + const uint32_t outputBitDepth = outputMpInfo ? GetBitsPerChannel(outputMpInfo->planesLayout) : 8; + + // Determine if we're using limited or full range for input and output + // Default to limited range as it's more common for YCbCr content + const VkSamplerYcbcrConversionCreateInfo& samplerYcbcrConversionCreateInfo = + m_samplerYcbcrConversion.GetSamplerYcbcrConversionCreateInfo(); + const bool isInputLimitedRange = (samplerYcbcrConversionCreateInfo.ycbcrRange == VK_SAMPLER_YCBCR_RANGE_ITU_NARROW); + const bool isOutputLimitedRange = isInputLimitedRange; // Usually same as input, but could be configurable + + // Check if input or output are buffers + const bool isInputBuffer = m_inputIsBuffer; + const bool isOutputBuffer = m_outputIsBuffer; + + // Check if we need to do any bit depth conversion + const bool needsBitDepthConversion = (inputBitDepth != outputBitDepth); + + // Check if we need to do any range conversion + const bool needsRangeConversion = (isInputLimitedRange != isOutputLimitedRange); + std::stringstream shaderStr; + + // 1. Generate header and push constants GenHeaderAndPushConst(shaderStr); - // Input image - shaderStr << " // The input image binding\n"; + + // 2. 
Generate IO bindings + // Input Descriptors ShaderGeneratePlaneDescriptors(shaderStr, - m_inputImageAspects, - "inputImage", - m_inputFormat, true, // isInput 0, // startBinding 0, // set - true // imageArray - ); + true, + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); - // Output image - shaderStr << " // The output image binding\n"; + // Output Descriptors ShaderGeneratePlaneDescriptors(shaderStr, - m_outputImageAspects, - "outputImage", - m_outputFormat, false, // isInput 4, // startBinding 0, // set - true // imageArray - ); + true, // imageArray + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); + shaderStr << "\n\n"; + // Determine input and output plane configurations + const bool hasInputChroma = (m_inputImageAspects & (VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT)) != 0; + const bool hasOutputChroma = (m_outputImageAspects & (VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT)) != 0; + + // Determine if input is two-plane (e.g., NV12) or three-plane (e.g., I420) + const bool isInputTwoPlane = (m_inputImageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) && + !(m_inputImageAspects & VK_IMAGE_ASPECT_PLANE_2_BIT); + + // Determine if output is two-plane (e.g., NV12) or three-plane (e.g., I420) + const bool isOutputTwoPlane = (m_outputImageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) && + !(m_outputImageAspects & VK_IMAGE_ASPECT_PLANE_2_BIT); + + // 3. Add any bit depth handling functions needed + if (isInputBuffer && inputBitDepth > 8) { + bool isMSB = true; // Default to MSB-aligned (most common case) + GenExtractHighBitDepthFunc(shaderStr, isMSB, inputBitDepth); + } + + // 4. Add buffer read/write functions if needed + if (isInputBuffer) { + // Add fetch functions for Y and CbCr from buffer + GenFetchYFromBufferFunc(shaderStr, inputBitDepth > 8, inputBitDepth); + GenFetchCbCrFromBufferFunc(shaderStr, inputBitDepth > 8, inputBitDepth); + } + + // 5. 
Add YCbCr normalization and denormalization functions for bit depth conversion + if (needsBitDepthConversion || needsRangeConversion) { + // Generate normalization functions for input format + GenYCbCrNormalizationFuncs(shaderStr, inputBitDepth, isInputLimitedRange, hasInputChroma); + + // Generate denormalization functions for output format + GenYCbCrDeNormalizationFuncs(shaderStr, outputBitDepth, isOutputLimitedRange, hasOutputChroma); + } + + // 6. Generate the read function for YCbCr data + GenReadYCbCrBuffer(shaderStr, isInputBuffer, inputBitDepth, isInputTwoPlane, m_inputEnableMsbToLsbShift, m_inputImageAspects); + + // 7. Generate the write function for YCbCr data + GenWriteYCbCrBuffer(shaderStr, isOutputBuffer, outputBitDepth, isOutputTwoPlane, m_outputEnableLsbToMsbShift, m_outputImageAspects); + + // 8. Helper function for combined normalization and denormalization + if (needsBitDepthConversion || needsRangeConversion) { + GenConvertYCbCrFormat(shaderStr, inputBitDepth, outputBitDepth, isInputLimitedRange, isOutputLimitedRange); + } + + // 9. Main function shaderStr << "void main()\n" "{\n"; - GenHandleImagePosition(shaderStr); + + // 10. Handle position calculation + if (isInputBuffer || isOutputBuffer) { + // Use buffer position calculation + GenHandleBufferPosition(shaderStr); + } else { + // Use image position calculation + GenHandleImagePosition(shaderStr); + } + + // 11. Calculate source position with replication if enabled GenHandleSourcePositionWithReplicate(shaderStr, m_enableRowAndColumnReplication); + + // 12. Handle YCbCr processing + + // For inputs with chroma, we need to handle subsampling + // Get subsampling ratios for input format + const uint32_t chromaHorzRatio = (inputMpInfo != nullptr) ? (1 << inputMpInfo->planesLayout.secondaryPlaneSubsampledX) : 1; + const uint32_t chromaVertRatio = (inputMpInfo != nullptr) ? 
(1 << inputMpInfo->planesLayout.secondaryPlaneSubsampledY) : 1; + + // Generate condition for chroma processing based on actual subsampling shaderStr << - " // Read Y value from source Y plane and write it to destination Y plane\n" - " float Y = imageLoad(inputImageY, ivec3(srcPos, pushConstants.srcImageLayer)).r;\n" - " imageStore(outputImageY, ivec3(pos, pushConstants.dstImageLayer), vec4(Y, 0, 0, 1));\n" - "\n" - " // Do the same for the CbCr plane, but remember about the 4:2:0 subsampling\n" - " if (srcPos % 2 == ivec2(0, 0)) {\n" - " srcPos /= 2;\n" - " pos /= 2;\n" - " vec2 CbCr = imageLoad(inputImageCbCr, ivec3(srcPos, pushConstants.srcImageLayer)).rg;\n" - " imageStore(outputImageCbCr, ivec3(pos, pushConstants.dstImageLayer), vec4(CbCr, 0, 1));\n" - " }\n" - "}\n"; + " // Handle proper subsampling based on format (" << + (chromaHorzRatio == 2 ? (chromaVertRatio == 2 ? "4:2:0" : "4:2:2") : "4:4:4") << ")\n"; + + // Generate the chroma position condition with a boolean variable + GenHandleChromaPosition(shaderStr, chromaHorzRatio, chromaVertRatio, true, "pos", "processChroma"); + + // Initialize chroma positions with default values + shaderStr << " // Initialize chroma positions\n" + << " ivec2 chromaSrcPos = srcPos;\n" + << " ivec2 chromaPos = pos;\n\n" + << " // Check if we need to process chroma\n" + << " if (processChroma) {\n"; + + // Generate chroma position calculations for source position + GenCalculateChromaPosition(shaderStr, chromaHorzRatio, chromaVertRatio, "srcPos", "chromaSrcPos", 8); + + // Generate chroma position calculations for destination position + GenCalculateChromaPosition(shaderStr, chromaHorzRatio, chromaVertRatio, "pos", "chromaPos", 8); + + shaderStr << " }\n"; + + // Read YCbCr data using the helper function + shaderStr << "\n" + << " // Read YCbCr data from source\n" + << " vec3 YCbCrRawIn = readYCbCrFromSource(srcPos, chromaSrcPos, pushConstants.srcLayer, processChroma);\n\n"; + + // Process the data based on whether we need 
conversion + if (needsBitDepthConversion || needsRangeConversion) { + shaderStr << + " // Need format conversion - normalize and denormalize\n" + " vec3 YCbCrRawOut = convertYCbCrFormat(YCbCrRawIn);\n\n"; + } else { + shaderStr << + " // No format conversion needed - direct copy\n" + " vec3 YCbCrRawOut = YCbCrRawIn;\n\n"; + } + + // Write the processed data using the helper function + shaderStr << + " // Write processed data to destination\n" + " writeYCbCrToDestination(YCbCrRawOut, pos, chromaPos, pushConstants.dstLayer, processChroma);\n" + "\n\n"; + + // Close the main function + shaderStr << "}\n"; computeShader = shaderStr.str(); if (dumpShaders) @@ -495,37 +2171,849 @@ size_t VulkanFilterYuvCompute::InitYCBCRCLEAR(std::string& computeShader) // Create compute pipeline std::stringstream shaderStr; + + // 1. Generate header and push constants GenHeaderAndPushConst(shaderStr); - // Output image - shaderStr << " // The output image binding\n"; + // 2. Generate output image bindings + shaderStr << " // The output descriptors binding\n"; + // Output Descriptors ShaderGeneratePlaneDescriptors(shaderStr, - m_outputImageAspects, - "outputImage", - m_outputFormat, false, // isInput 4, // startBinding 0, // set - true // imageArray - ); + true, // imageArray + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); shaderStr << "\n\n"; + // Get format information to determine subsampling ratios + const VkMpFormatInfo* outputMpInfo = YcbcrVkFormatInfo(m_outputFormat); + // Get subsampling ratios for output format + const uint32_t chromaHorzRatio = (outputMpInfo != nullptr) ? (1 << outputMpInfo->planesLayout.secondaryPlaneSubsampledX) : 1; + const uint32_t chromaVertRatio = (outputMpInfo != nullptr) ? (1 << outputMpInfo->planesLayout.secondaryPlaneSubsampledY) : 1; + + + // 3. Main function shaderStr << "void main()\n" "{\n"; + + // 4. Handle position calculation GenHandleImagePosition(shaderStr); + + // 5. 
Clear operations for Y plane shaderStr << - " imageStore(outputImageY, ivec3(pos, pushConstants.dstImageLayer), vec4(0.5, 0, 0, 1));\n" - "\n" - " // Do the same for the CbCr plane, but remember about the 4:2:0 subsampling\n" - " if (pos % 2 == ivec2(0, 0)) {\n" - " pos /= 2;\n" - " imageStore(outputImageCbCr, ivec3(pos, pushConstants.dstImageLayer), vec4(0.5, 0.5, 0.0, 1.0));\n" - " }\n" - "}\n"; + " // Clear Y plane with 50% intensity\n" + " imageStore(outputImageY, ivec3(pos, pushConstants.dstLayer), vec4(0.5, 0, 0, 1));\n" + "\n"; + + // Handle CbCr plane clearing based on format's subsampling + shaderStr << + " // Clear CbCr plane with " << + (chromaHorzRatio == 2 ? (chromaVertRatio == 2 ? "4:2:0" : "4:2:2") : "4:4:4") << + " subsampling\n"; + + // Generate a boolean to track whether this position needs chroma clearing + GenHandleChromaPosition(shaderStr, chromaHorzRatio, chromaVertRatio, true, "pos", "shouldClearChroma"); + + // Handle position for chroma planes + shaderStr << " ivec2 chromaPos = pos;\n"; + shaderStr << " if (shouldClearChroma) {\n"; + + // Calculate chroma position if necessary + GenCalculateChromaPosition(shaderStr, chromaHorzRatio, chromaVertRatio, "pos", "chromaPos", 8); + + // For 2-plane format, output CbCr together + if (m_outputImageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + shaderStr << " // Clear CbCr plane with 50% intensity (middle range)\n" + << " imageStore(outputImageCbCr, ivec3(chromaPos, pushConstants.dstLayer), vec4(0.5, 0.5, 0.0, 1.0));\n"; + } + + // For 3-plane format, handle Cb and Cr separately + if (m_outputImageAspects & VK_IMAGE_ASPECT_PLANE_2_BIT) { + shaderStr << " // Clear separate Cb and Cr planes with 50% intensity (middle range)\n" + << " imageStore(outputImageCb, ivec3(chromaPos, pushConstants.dstLayer), vec4(0.5, 0.0, 0.0, 1.0));\n" + << " imageStore(outputImageCr, ivec3(chromaPos, pushConstants.dstLayer), vec4(0.5, 0.0, 0.0, 1.0));\n"; + } + + shaderStr << " }\n" + << "}\n"; computeShader = shaderStr.str(); 
if (dumpShaders) std::cout << "\nCompute Shader:\n" << computeShader; return computeShader.size(); } + +uint32_t VulkanFilterYuvCompute::GetPlaneIndex(VkImageAspectFlagBits planeAspect) { + + // Returns index 0 for VK_IMAGE_ASPECT_COLOR_BIT and VK_IMAGE_ASPECT_PLANE_0_BIT + // Returns index 1 for VK_IMAGE_ASPECT_PLANE_1_BIT + // Returns index 2 for VK_IMAGE_ASPECT_PLANE_2_BIT + + // First, verify it's a plane aspect bit + assert(planeAspect & validAspects); + + if (planeAspect & VK_IMAGE_ASPECT_COLOR_BIT) { + return 0; + } + + // Alternatively, without intrinsics: + return (planeAspect & VK_IMAGE_ASPECT_PLANE_0_BIT) ? 0 : + (planeAspect & VK_IMAGE_ASPECT_PLANE_1_BIT) ? 1 : 2; +} + +uint32_t VulkanFilterYuvCompute::UpdateBufferDescriptorSets( + const VkBuffer* vkBuffers, + uint32_t numVkBuffers, + const VkSubresourceLayout* vkBufferSubresourceLayout, + uint32_t numPlanes, + VkImageAspectFlags validImageAspects, + uint32_t& descrIndex, + uint32_t& baseBinding, + VkDescriptorType descriptorType, // Ex: VK_DESCRIPTOR_TYPE_STORAGE_BUFFER + VkDescriptorBufferInfo bufferDescriptors[maxNumComputeDescr], + std::array& writeDescriptorSets, + const uint32_t maxDescriptors) +{ + + validImageAspects &= validAspects; + uint32_t curImageAspect = 0; + uint32_t bufferIndex = 0; + while(validImageAspects) { + + if (validImageAspects & (VK_IMAGE_ASPECT_COLOR_BIT << curImageAspect) ) { + + uint32_t planeNum = GetPlaneIndex((VkImageAspectFlagBits)(VK_IMAGE_ASPECT_COLOR_BIT << curImageAspect)); + uint32_t dstBinding = baseBinding; + if (curImageAspect > 0) { + // the first plane is 1, second plane is 2, the 3rd is 3 + dstBinding += (1 + planeNum); + } + + writeDescriptorSets[descrIndex].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + writeDescriptorSets[descrIndex].dstSet = VK_NULL_HANDLE; + writeDescriptorSets[descrIndex].dstBinding = dstBinding; + writeDescriptorSets[descrIndex].descriptorCount = 1; + writeDescriptorSets[descrIndex].descriptorType = descriptorType; + + 
bufferDescriptors[descrIndex].buffer = vkBuffers[bufferIndex]; + bufferDescriptors[descrIndex].offset = vkBufferSubresourceLayout[planeNum].offset; + bufferDescriptors[descrIndex].range = vkBufferSubresourceLayout[planeNum].arrayPitch; + writeDescriptorSets[descrIndex].pBufferInfo = &bufferDescriptors[descrIndex]; + descrIndex++; + validImageAspects &= ~(VK_IMAGE_ASPECT_COLOR_BIT << curImageAspect); + bufferIndex = std::min(numVkBuffers - 1, bufferIndex + 1); + } + + curImageAspect++; + } + assert(descrIndex <= maxDescriptors); + return descrIndex; +} + +uint32_t VulkanFilterYuvCompute::UpdateImageDescriptorSets( + const VkImageResourceView* imageView, + VkImageAspectFlags validImageAspects, + VkSampler convSampler, + VkImageLayout imageLayout, + uint32_t& descrIndex, + uint32_t& baseBinding, + VkDescriptorType descriptorType, // Ex: VK_DESCRIPTOR_TYPE_STORAGE_IMAGE + VkDescriptorImageInfo imageDescriptors[maxNumComputeDescr], + std::array& writeDescriptorSets, + const uint32_t maxDescriptors) +{ + + validImageAspects &= validAspects; + uint32_t curImageAspect = 0; + [[maybe_unused]] const uint32_t numPlanes = imageView->GetNumberOfPlanes(); + while(validImageAspects) { + + if (validImageAspects & (VK_IMAGE_ASPECT_COLOR_BIT << curImageAspect) ) { + + VkSampler ccSampler = (curImageAspect == 0) ? convSampler : VK_NULL_HANDLE; + uint32_t planeNum = GetPlaneIndex((VkImageAspectFlagBits)(VK_IMAGE_ASPECT_COLOR_BIT << curImageAspect)); + assert(planeNum < numPlanes); + uint32_t dstBinding = baseBinding; + if (curImageAspect > 0) { + // the first plane is 1, second plane is 2, the 3rd is 3 + dstBinding += (1 + planeNum); + } + + writeDescriptorSets[descrIndex].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + writeDescriptorSets[descrIndex].dstSet = VK_NULL_HANDLE; + writeDescriptorSets[descrIndex].dstBinding = dstBinding; + writeDescriptorSets[descrIndex].descriptorCount = 1; + writeDescriptorSets[descrIndex].descriptorType = (ccSampler != VK_NULL_HANDLE) ? 
+ VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER : + descriptorType; + imageDescriptors[descrIndex].sampler = ccSampler; + imageDescriptors[descrIndex].imageView = (curImageAspect == 0) ? + imageView->GetImageView() : + imageView->GetPlaneImageView(planeNum); + assert(imageDescriptors[descrIndex].imageView); + imageDescriptors[descrIndex].imageLayout = imageLayout; + writeDescriptorSets[descrIndex].pImageInfo = &imageDescriptors[descrIndex]; // Y (0) plane + descrIndex++; + validImageAspects &= ~(VK_IMAGE_ASPECT_COLOR_BIT << curImageAspect); + } + + curImageAspect++; + } + assert(descrIndex <= maxDescriptors); + return descrIndex; +} + +// Image input -> Image output +VkResult VulkanFilterYuvCompute::RecordCommandBuffer(VkCommandBuffer cmdBuf, + const VkImageResourceView* inImageView, + const VkVideoPictureResourceInfoKHR * inImageResourceInfo, + const VkImageResourceView* outImageView, + const VkVideoPictureResourceInfoKHR * outImageResourceInfo, + uint32_t bufferIdx) +{ + + assert(cmdBuf != VK_NULL_HANDLE); + + m_vkDevCtx->CmdBindPipeline(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, m_computePipeline.getPipeline()); + + VkDescriptorSetLayoutCreateFlags layoutMode = m_descriptorSetLayout.GetDescriptorSetLayoutInfo().GetDescriptorLayoutMode(); + + switch (layoutMode) { + case VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR: + case VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT: + { + + VkDescriptorImageInfo imageDescriptors[maxNumComputeDescr]{}; + std::array writeDescriptorSets{}; + + // Images + uint32_t set = 0; + uint32_t descrIndex = 0; + uint32_t dstBinding = 0; + + // IN 0: RGBA color converted by an YCbCr sample + // IN 1: y plane - G -> R8 + // IN 2: Cb or Cr or CbCr plane - BR -> R8B8 + // IN 3: Cr or Cb plane - R -> R8 + UpdateImageDescriptorSets(inImageView, + m_inputImageAspects, + m_samplerYcbcrConversion.GetSampler(), + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, + descrIndex, + dstBinding, + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + 
imageDescriptors, + writeDescriptorSets, + maxNumComputeDescr / 2 /* max descriptors */); + + dstBinding = 4; + // OUT 4: Out RGBA or single planar YCbCr image + // OUT 5: y plane - G -> R8 + // OUT 6: Cb or Cr or CbCr plane - BR -> R8B8 + // OUT 7: Cr or Cb plane - R -> R8 + UpdateImageDescriptorSets(outImageView, + m_outputImageAspects, + VK_NULL_HANDLE, + VK_IMAGE_LAYOUT_GENERAL, + descrIndex, + dstBinding, + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + imageDescriptors, + writeDescriptorSets, + maxNumComputeDescr /* max descriptors */); + + assert(descrIndex <= maxNumComputeDescr); + assert(descrIndex >= 2); + + if (layoutMode == VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR) { + m_vkDevCtx->CmdPushDescriptorSetKHR(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, + m_descriptorSetLayout.GetPipelineLayout(), + set, descrIndex, writeDescriptorSets.data()); + } else { + + VkDeviceOrHostAddressConstKHR imageDescriptorBufferDeviceAddress = + m_descriptorSetLayout.UpdateDescriptorBuffer(bufferIdx, + set, + descrIndex, + writeDescriptorSets.data()); + + + // Descriptor buffer bindings + // Set 0 = Image + VkDescriptorBufferBindingInfoEXT bindingInfo{}; + bindingInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_BUFFER_BINDING_INFO_EXT; + bindingInfo.pNext = nullptr; + bindingInfo.address = imageDescriptorBufferDeviceAddress.deviceAddress; + bindingInfo.usage = VK_BUFFER_USAGE_SAMPLER_DESCRIPTOR_BUFFER_BIT_EXT | + VK_BUFFER_USAGE_RESOURCE_DESCRIPTOR_BUFFER_BIT_EXT; + m_vkDevCtx->CmdBindDescriptorBuffersEXT(cmdBuf, 1, &bindingInfo); + + // Image (set 0) + uint32_t bufferIndexImage = 0; + VkDeviceSize bufferOffset = 0; + m_vkDevCtx->CmdSetDescriptorBufferOffsetsEXT(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, + m_descriptorSetLayout.GetPipelineLayout(), + set, 1, &bufferIndexImage, &bufferOffset); + } + } + break; + + default: + m_vkDevCtx->CmdBindDescriptorSets(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, + m_descriptorSetLayout.GetPipelineLayout(), + 0, 1, 
m_descriptorSetLayout.GetDescriptorSet(), 0, 0); + } + + struct ivec2 { + uint32_t width; + uint32_t height; + + ivec2() : width(0), height(0) {} + ivec2(int32_t width_, int32_t height_) : width(width_), height(height_) {} + }; + + struct ImagePushConstants { + uint32_t srcLayer; + uint32_t dstLayer; + ivec2 inputSize; + ivec2 outputSize; + uint32_t yOffset; // Y plane offset + uint32_t cbOffset; // Cb plane offset + uint32_t crOffset; // Cr plane offset + uint32_t yPitch; // Y plane pitch + uint32_t cbPitch; // Cb plane pitch + uint32_t crPitch; // Cr plane pitch + }; + + ImagePushConstants pushConstants = { + inImageResourceInfo->baseArrayLayer, // Set the source layer index + outImageResourceInfo->baseArrayLayer, // Set the destination layer index + ivec2(inImageResourceInfo->codedExtent.width, inImageResourceInfo->codedExtent.height), + ivec2(outImageResourceInfo->codedExtent.width, outImageResourceInfo->codedExtent.height), + 0, // yOffset - not used for image input + 0, // cbOffset - not used for image input + 0, // crOffset - not used for image input + 0, // yPitch - not used for image input + 0, // cbPitch - not used for image input + 0 // crPitch - not used for image input + }; + + m_vkDevCtx->CmdPushConstants(cmdBuf, + m_descriptorSetLayout.GetPipelineLayout(), + VK_SHADER_STAGE_COMPUTE_BIT, + 0, + sizeof(ImagePushConstants), + &pushConstants); + + const uint32_t workgroupWidth = (pushConstants.outputSize.width + (m_workgroupSizeX - 1)) / m_workgroupSizeX; + const uint32_t workgroupHeight = (pushConstants.outputSize.height + (m_workgroupSizeY - 1)) / m_workgroupSizeY; + m_vkDevCtx->CmdDispatch(cmdBuf, workgroupWidth, workgroupHeight, 1); + + return VK_SUCCESS; +} + +// Buffer input -> Image output +VkResult VulkanFilterYuvCompute::RecordCommandBuffer(VkCommandBuffer cmdBuf, + const VkBuffer* inBuffers, + uint32_t numInBuffers, + const VkFormat* inBufferFormats, + const VkSubresourceLayout* inBufferSubresourceLayouts, + uint32_t inBufferNumPlanes, + const 
VkImageResourceView* outImageView, + const VkVideoPictureResourceInfoKHR* outImageResourceInfo, + const VkBufferImageCopy* pBufferImageCopy, + uint32_t bufferIdx) +{ + assert(cmdBuf != VK_NULL_HANDLE); + assert(m_inputIsBuffer == true); + assert(m_outputIsBuffer == false); + + m_vkDevCtx->CmdBindPipeline(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, m_computePipeline.getPipeline()); + + VkDescriptorSetLayoutCreateFlags layoutMode = m_descriptorSetLayout.GetDescriptorSetLayoutInfo().GetDescriptorLayoutMode(); + + switch (layoutMode) { + case VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR: + case VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT: + { + VkDescriptorImageInfo imageDescriptors[maxNumComputeDescr / 2]{}; + VkDescriptorBufferInfo bufferDescriptors[maxNumComputeDescr / 2]{}; + std::array writeDescriptorSets{}; + + uint32_t set = 0; + uint32_t descrIndex = 0; + uint32_t dstBinding = 0; + + // Buffer input handling + // IN 0: Single buffer YCbCr + // IN 1: Y plane buffer + // IN 2: Cb, Cr or CbCr plane buffer + // IN 3: Cr plane buffer + UpdateBufferDescriptorSets(inBuffers, numInBuffers, + inBufferSubresourceLayouts, inBufferNumPlanes, + m_inputImageAspects, + descrIndex, dstBinding, + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + bufferDescriptors, + writeDescriptorSets, + maxNumComputeDescr / 2); + + + // Image output + dstBinding = 4; + // OUT 4: Out RGBA or single planar YCbCr image + // OUT 5: y plane - G -> R8 + // OUT 6: Cb or Cr or CbCr plane - BR -> R8B8 + // OUT 7: Cr or Cb plane - R -> R8 + UpdateImageDescriptorSets(outImageView, + m_outputImageAspects, + VK_NULL_HANDLE, + VK_IMAGE_LAYOUT_GENERAL, + descrIndex, + dstBinding, + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + imageDescriptors, + writeDescriptorSets, + maxNumComputeDescr /* max descriptors */); + + assert(descrIndex <= maxNumComputeDescr); + assert(descrIndex >= 2); + + if (layoutMode == VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR) { + m_vkDevCtx->CmdPushDescriptorSetKHR(cmdBuf, 
VK_PIPELINE_BIND_POINT_COMPUTE, + m_descriptorSetLayout.GetPipelineLayout(), + set, descrIndex, writeDescriptorSets.data()); + } else { + VkDeviceOrHostAddressConstKHR descriptorBufferDeviceAddress = + m_descriptorSetLayout.UpdateDescriptorBuffer(bufferIdx, + set, + descrIndex, + writeDescriptorSets.data()); + + + // Descriptor buffer bindings + VkDescriptorBufferBindingInfoEXT bindingInfo{}; + bindingInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_BUFFER_BINDING_INFO_EXT; + bindingInfo.pNext = nullptr; + bindingInfo.address = descriptorBufferDeviceAddress.deviceAddress; + bindingInfo.usage = VK_BUFFER_USAGE_SAMPLER_DESCRIPTOR_BUFFER_BIT_EXT | + VK_BUFFER_USAGE_RESOURCE_DESCRIPTOR_BUFFER_BIT_EXT; + m_vkDevCtx->CmdBindDescriptorBuffersEXT(cmdBuf, 1, &bindingInfo); + + uint32_t bufferIndexImage = 0; + VkDeviceSize bufferOffset = 0; + m_vkDevCtx->CmdSetDescriptorBufferOffsetsEXT(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, + m_descriptorSetLayout.GetPipelineLayout(), + set, 1, &bufferIndexImage, &bufferOffset); + } + } + break; + + default: + m_vkDevCtx->CmdBindDescriptorSets(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, + m_descriptorSetLayout.GetPipelineLayout(), + 0, 1, m_descriptorSetLayout.GetDescriptorSet(), 0, 0); + } + + struct ivec2 { + uint32_t width; + uint32_t height; + + ivec2() : width(0), height(0) {} + ivec2(int32_t width_, int32_t height_) : width(width_), height(height_) {} + }; + + struct BufferToImagePushConstants { + uint32_t srcLayer; + uint32_t dstLayer; + ivec2 inputSize; + ivec2 outputSize; + uint32_t yOffset; // Y plane offset + uint32_t cbOffset; // Cb plane offset + uint32_t crOffset; // Cr plane offset + uint32_t yPitch; // Y plane pitch + uint32_t cbPitch; // Cb plane pitch + uint32_t crPitch; // Cr plane pitch + }; + + uint32_t width, height; + uint32_t rowPitch; + + assert(pBufferImageCopy); + width = pBufferImageCopy->bufferRowLength > 0 ? 
+ pBufferImageCopy->bufferRowLength : + pBufferImageCopy->imageExtent.width; + height = pBufferImageCopy->bufferImageHeight > 0 ? + pBufferImageCopy->bufferImageHeight : + pBufferImageCopy->imageExtent.height; + rowPitch = width; + + VkExtent3D outputExtent = outImageView->GetImageResource()->GetImageCreateInfo().extent; + + VkDeviceSize planeSize = width * height; + VkDeviceSize yOffset = pBufferImageCopy ? pBufferImageCopy->bufferOffset : 0; + VkDeviceSize cbOffset = yOffset + planeSize; + VkDeviceSize crOffset = cbOffset + (planeSize / 4); + + BufferToImagePushConstants pushConstants = { + pBufferImageCopy->imageSubresource.baseArrayLayer, + outImageResourceInfo->baseArrayLayer, + ivec2(width, height), + ivec2(outputExtent.width, outputExtent.height), + static_cast(yOffset), + static_cast(cbOffset), + static_cast(crOffset), + rowPitch, + rowPitch / 2, // For 4:2:0 format + rowPitch / 2 // For 4:2:0 format + }; + + m_vkDevCtx->CmdPushConstants(cmdBuf, + m_descriptorSetLayout.GetPipelineLayout(), + VK_SHADER_STAGE_COMPUTE_BIT, + 0, + sizeof(BufferToImagePushConstants), + &pushConstants); + + const uint32_t workgroupWidth = (pushConstants.outputSize.width + (m_workgroupSizeX - 1)) / m_workgroupSizeX; + const uint32_t workgroupHeight = (pushConstants.outputSize.height + (m_workgroupSizeY - 1)) / m_workgroupSizeY; + m_vkDevCtx->CmdDispatch(cmdBuf, workgroupWidth, workgroupHeight, 1); + + return VK_SUCCESS; +} + +// Image input -> Buffer output +VkResult VulkanFilterYuvCompute::RecordCommandBuffer(VkCommandBuffer cmdBuf, + const VkImageResourceView* inImageView, + const VkVideoPictureResourceInfoKHR* inImageResourceInfo, + const VkBuffer* outBuffers, // with size numOutBuffers + uint32_t numOutBuffers, + const VkFormat* outBufferFormats, // with size outBufferNumPlanes + const VkSubresourceLayout* outBufferSubresourceLayouts, // with size outBufferNumPlanes + uint32_t outBufferNumPlanes, + const VkBufferImageCopy* pBufferImageCopy, + uint32_t bufferIdx) +{ + 
assert(cmdBuf != VK_NULL_HANDLE); + assert(m_inputIsBuffer == false); + assert(m_outputIsBuffer == true); + + m_vkDevCtx->CmdBindPipeline(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, m_computePipeline.getPipeline()); + + VkDescriptorSetLayoutCreateFlags layoutMode = m_descriptorSetLayout.GetDescriptorSetLayoutInfo().GetDescriptorLayoutMode(); + + switch (layoutMode) { + case VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR: + case VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT: + { + VkDescriptorImageInfo imageDescriptors[maxNumComputeDescr / 2]{}; + VkDescriptorBufferInfo bufferDescriptors[maxNumComputeDescr / 2]{}; + std::array writeDescriptorSets{}; + + uint32_t set = 0; + uint32_t descrIndex = 0; + uint32_t dstBinding = 0; + + // IN 0: RGBA color converted by an YCbCr sample + // IN 1: y plane - G -> R8 + // IN 2: Cb or Cr or CbCr plane - BR -> R8B8 + // IN 3: Cr or Cb plane - R -> R8 + UpdateImageDescriptorSets(inImageView, + m_inputImageAspects, + m_samplerYcbcrConversion.GetSampler(), + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, + descrIndex, + dstBinding, + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + imageDescriptors, + writeDescriptorSets, + maxNumComputeDescr / 2 /* max descriptors */); + + // Output buffer handling + dstBinding = 4; + // OUT 0: Single buffer YCbCr + // OUT 1: Y plane buffer + // OUT 2: Cb, Cr or CbCr plane buffer + // OUT 3: Cr or Cb plane buffer + UpdateBufferDescriptorSets(outBuffers, numOutBuffers, + outBufferSubresourceLayouts, outBufferNumPlanes, + m_inputImageAspects, + descrIndex, dstBinding, + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + bufferDescriptors, + writeDescriptorSets, + maxNumComputeDescr); + + assert(descrIndex <= maxNumComputeDescr); + assert(descrIndex >= 2); + + if (layoutMode == VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR) { + m_vkDevCtx->CmdPushDescriptorSetKHR(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, + m_descriptorSetLayout.GetPipelineLayout(), + set, descrIndex, writeDescriptorSets.data()); + } else { + 
VkDeviceOrHostAddressConstKHR descriptorBufferDeviceAddress = + m_descriptorSetLayout.UpdateDescriptorBuffer(bufferIdx, + set, + descrIndex, + writeDescriptorSets.data()); + + // Descriptor buffer bindings + VkDescriptorBufferBindingInfoEXT bindingInfo{}; + bindingInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_BUFFER_BINDING_INFO_EXT; + bindingInfo.pNext = nullptr; + bindingInfo.address = descriptorBufferDeviceAddress.deviceAddress; + bindingInfo.usage = VK_BUFFER_USAGE_SAMPLER_DESCRIPTOR_BUFFER_BIT_EXT | + VK_BUFFER_USAGE_RESOURCE_DESCRIPTOR_BUFFER_BIT_EXT; + m_vkDevCtx->CmdBindDescriptorBuffersEXT(cmdBuf, 1, &bindingInfo); + + uint32_t bufferIndexImage = 0; + VkDeviceSize bufferOffset = 0; + m_vkDevCtx->CmdSetDescriptorBufferOffsetsEXT(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, + m_descriptorSetLayout.GetPipelineLayout(), + set, 1, &bufferIndexImage, &bufferOffset); + } + } + break; + + default: + m_vkDevCtx->CmdBindDescriptorSets(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, + m_descriptorSetLayout.GetPipelineLayout(), + 0, 1, m_descriptorSetLayout.GetDescriptorSet(), 0, 0); + } + + struct ivec2 { + uint32_t width; + uint32_t height; + + ivec2() : width(0), height(0) {} + ivec2(int32_t width_, int32_t height_) : width(width_), height(height_) {} + }; + + struct ImageToBufferPushConstants { + uint32_t srcLayer; + uint32_t dstLayer; + ivec2 inputSize; + ivec2 outputSize; + uint32_t yOffset; // Y plane offset + uint32_t cbOffset; // Cb plane offset + uint32_t crOffset; // Cr plane offset + uint32_t yPitch; // Y plane pitch + uint32_t cbPitch; // Cb plane pitch + uint32_t crPitch; // Cr plane pitch + }; + + uint32_t width, height; + uint32_t rowPitch; + VkExtent3D inputExtent = inImageView->GetImageResource()->GetImageCreateInfo().extent; + + if (pBufferImageCopy) { + width = pBufferImageCopy->bufferRowLength > 0 ? + pBufferImageCopy->bufferRowLength : + pBufferImageCopy->imageExtent.width; + height = pBufferImageCopy->bufferImageHeight > 0 ? 
+ pBufferImageCopy->bufferImageHeight : + pBufferImageCopy->imageExtent.height; + rowPitch = width; + } else { + width = inputExtent.width; + height = inputExtent.height; + rowPitch = width; + } + + VkDeviceSize planeSize = width * height; + VkDeviceSize yOffset = pBufferImageCopy ? pBufferImageCopy->bufferOffset : 0; + VkDeviceSize cbOffset = yOffset + planeSize; + VkDeviceSize crOffset = cbOffset + (planeSize / 4); + + ImageToBufferPushConstants pushConstants = { + inImageResourceInfo->baseArrayLayer, + 0, // Destination layer (buffer has no layers) + ivec2(inputExtent.width, inputExtent.height), + ivec2(width, height), + static_cast(yOffset), + static_cast(cbOffset), + static_cast(crOffset), + rowPitch, + rowPitch / 2, // For 4:2:0 format + rowPitch / 2 // For 4:2:0 format + }; + + m_vkDevCtx->CmdPushConstants(cmdBuf, + m_descriptorSetLayout.GetPipelineLayout(), + VK_SHADER_STAGE_COMPUTE_BIT, + 0, + sizeof(ImageToBufferPushConstants), + &pushConstants); + + const uint32_t workgroupWidth = (pushConstants.outputSize.width + (m_workgroupSizeX - 1)) / m_workgroupSizeX; + const uint32_t workgroupHeight = (pushConstants.outputSize.height + (m_workgroupSizeY - 1)) / m_workgroupSizeY; + m_vkDevCtx->CmdDispatch(cmdBuf, workgroupWidth, workgroupHeight, 1); + + return VK_SUCCESS; +} + +// Buffer input -> Buffer output (all buffer case) +VkResult VulkanFilterYuvCompute::RecordCommandBuffer(VkCommandBuffer cmdBuf, + const VkBuffer* inBuffers, + uint32_t numInBuffers, + const VkFormat* inBufferFormats, // with size inBufferNumPlanes + const VkSubresourceLayout* inBufferSubresourceLayouts, + uint32_t numInPlanes, + const VkExtent3D& inBufferExtent, + const VkBuffer* outBuffers, + uint32_t numOutBuffers, + const VkFormat* outBufferFormats, + const VkSubresourceLayout* outBufferSubresourceLayouts, + uint32_t numOutPlanes, + const VkExtent3D& outBufferExtent, + uint32_t bufferIdx) +{ + assert(cmdBuf != VK_NULL_HANDLE); + assert(m_inputIsBuffer == true); + assert(m_outputIsBuffer 
== true); + + m_vkDevCtx->CmdBindPipeline(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, m_computePipeline.getPipeline()); + + VkDescriptorSetLayoutCreateFlags layoutMode = m_descriptorSetLayout.GetDescriptorSetLayoutInfo().GetDescriptorLayoutMode(); + + switch (layoutMode) { + case VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR: + case VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT: + { + VkDescriptorBufferInfo bufferDescriptors[maxNumComputeDescr]{}; + std::array writeDescriptorSets{}; + + uint32_t set = 0; + uint32_t descrIndex = 0; + uint32_t dstBinding = 0; + + // Input buffer handling + // IN 0: Single buffer YCbCr + // IN 1: Y plane buffer + // IN 2: Cb, Cr or CbCr plane buffer + // IN 3: Cr plane buffer + UpdateBufferDescriptorSets(inBuffers, numInBuffers, + inBufferSubresourceLayouts, numInPlanes, + m_inputImageAspects, + descrIndex, dstBinding, + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + bufferDescriptors, + writeDescriptorSets, + maxNumComputeDescr / 2); + + // Output buffer handling + dstBinding = 4; + // OUT 0: Single buffer YCbCr + // OUT 1: Y plane buffer + // OUT 2: Cb, Cr or CbCr plane buffer + // OUT 3: Cr or Cb plane buffer + UpdateBufferDescriptorSets(outBuffers, numOutBuffers, + outBufferSubresourceLayouts, numOutPlanes, + m_inputImageAspects, + descrIndex, dstBinding, + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + bufferDescriptors, + writeDescriptorSets, + maxNumComputeDescr); + + assert(descrIndex <= maxNumComputeDescr); + assert(descrIndex >= 2); + + if (layoutMode == VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR) { + m_vkDevCtx->CmdPushDescriptorSetKHR(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, + m_descriptorSetLayout.GetPipelineLayout(), + set, descrIndex, writeDescriptorSets.data()); + } else { + VkDeviceOrHostAddressConstKHR descriptorBufferDeviceAddress = + m_descriptorSetLayout.UpdateDescriptorBuffer(bufferIdx, + set, + descrIndex, + writeDescriptorSets.data()); + + // Descriptor buffer bindings + 
VkDescriptorBufferBindingInfoEXT bindingInfo{}; + bindingInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_BUFFER_BINDING_INFO_EXT; + bindingInfo.pNext = nullptr; + bindingInfo.address = descriptorBufferDeviceAddress.deviceAddress; + bindingInfo.usage = VK_BUFFER_USAGE_SAMPLER_DESCRIPTOR_BUFFER_BIT_EXT | + VK_BUFFER_USAGE_RESOURCE_DESCRIPTOR_BUFFER_BIT_EXT; + m_vkDevCtx->CmdBindDescriptorBuffersEXT(cmdBuf, 1, &bindingInfo); + + uint32_t bufferIndexImage = 0; + VkDeviceSize bufferOffset = 0; + m_vkDevCtx->CmdSetDescriptorBufferOffsetsEXT(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, + m_descriptorSetLayout.GetPipelineLayout(), + set, 1, &bufferIndexImage, &bufferOffset); + } + } + break; + + default: + m_vkDevCtx->CmdBindDescriptorSets(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, + m_descriptorSetLayout.GetPipelineLayout(), + 0, 1, m_descriptorSetLayout.GetDescriptorSet(), 0, 0); + } + + struct ivec2 { + uint32_t width; + uint32_t height; + + ivec2() : width(0), height(0) {} + ivec2(int32_t width_, int32_t height_) : width(width_), height(height_) {} + }; + + struct BufferToBufferPushConstants { + uint32_t srcLayer; // src image layer to use + uint32_t dstLayer; // dst image layer to use + ivec2 inputSize; // input image or buffer extent + ivec2 outputSize; // output image or buffer extent + uint32_t inYOffset; // input buffer Y plane offset + uint32_t inCbOffset; // input buffer Cb plane offset + uint32_t inCrOffset; // input buffer Cr plane offset + uint32_t inYPitch; // input buffer Y plane pitch + uint32_t inCbPitch; // input buffer Cb plane pitch + uint32_t inCrPitch; // input buffer Cr plane pitch + uint32_t outYOffset; // output buffer Y plane offset + uint32_t outCbOffset; // output buffer Cb plane offset + uint32_t outCrOffset; // output buffer Cr plane offset + uint32_t outYPitch; // output buffer Y plane pitch + uint32_t outCbPitch; // output buffer Cb plane pitch + uint32_t outCrPitch; // output buffer Cr plane pitch + }; + + // Calculate buffer parameters + uint32_t 
rowPitch = inBufferExtent.width; + VkDeviceSize planeSize = inBufferExtent.width * inBufferExtent.height; + VkDeviceSize yOffset = 0; + VkDeviceSize cbOffset = planeSize; + VkDeviceSize crOffset = cbOffset + (planeSize / 4); + + BufferToBufferPushConstants pushConstants = { + 0, // Source layer (buffer has no layers) + 0, // Destination layer (buffer has no layers) + ivec2(inBufferExtent.width, inBufferExtent.height), + ivec2(outBufferExtent.width, outBufferExtent.height), + static_cast<uint32_t>(yOffset), + static_cast<uint32_t>(cbOffset), + static_cast<uint32_t>(crOffset), + rowPitch, + rowPitch / 2, // For 4:2:0 format + rowPitch / 2 // For 4:2:0 format + }; + + m_vkDevCtx->CmdPushConstants(cmdBuf, + m_descriptorSetLayout.GetPipelineLayout(), + VK_SHADER_STAGE_COMPUTE_BIT, + 0, + sizeof(BufferToBufferPushConstants), + &pushConstants); + + const uint32_t workgroupWidth = (pushConstants.outputSize.width + (m_workgroupSizeX - 1)) / m_workgroupSizeX; + const uint32_t workgroupHeight = (pushConstants.outputSize.height + (m_workgroupSizeY - 1)) / m_workgroupSizeY; + m_vkDevCtx->CmdDispatch(cmdBuf, workgroupWidth, workgroupHeight, 1); + + return VK_SUCCESS; +} diff --git a/common/libs/VkCodecUtils/VulkanFilterYuvCompute.h b/common/libs/VkCodecUtils/VulkanFilterYuvCompute.h index ef8db51a..ab9a8845 100644 --- a/common/libs/VkCodecUtils/VulkanFilterYuvCompute.h +++ b/common/libs/VkCodecUtils/VulkanFilterYuvCompute.h @@ -32,6 +32,15 @@ class VulkanFilterYuvCompute : public VulkanFilter public: enum FilterType { YCBCRCOPY, YCBCRCLEAR, YCBCR2RGBA, RGBA2YCBCR }; + static constexpr uint32_t maxNumComputeDescr = 8; + + static constexpr VkImageAspectFlags validPlaneAspects = VK_IMAGE_ASPECT_PLANE_0_BIT | + VK_IMAGE_ASPECT_PLANE_1_BIT | + VK_IMAGE_ASPECT_PLANE_2_BIT; + + static constexpr VkImageAspectFlags validAspects = VK_IMAGE_ASPECT_COLOR_BIT | validPlaneAspects; + + static uint32_t GetPlaneIndex(VkImageAspectFlagBits planeAspect); static VkResult Create(const VulkanDeviceContext* vkDevCtx, uint32_t 
queueFamilyIndex, @@ -40,6 +49,8 @@ class VulkanFilterYuvCompute : public VulkanFilter uint32_t maxNumFrames, VkFormat inputFormat, VkFormat outputFormat, + bool inputEnableMsbToLsbShift, + bool outputEnableLsbToMsbShift, const VkSamplerYcbcrConversionCreateInfo* pYcbcrConversionCreateInfo, const YcbcrPrimariesConstants* pYcbcrPrimariesConstants, const VkSamplerCreateInfo* pSamplerCreateInfo, @@ -52,6 +63,8 @@ class VulkanFilterYuvCompute : public VulkanFilter uint32_t maxNumFrames, VkFormat inputFormat, VkFormat outputFormat, + bool inputEnableMsbToLsbShift, + bool outputEnableLsbToMsbShift, const YcbcrPrimariesConstants* pYcbcrPrimariesConstants) : VulkanFilter(vkDevCtx, queueFamilyIndex, queueIndex) , m_filterType(filterType) @@ -71,7 +84,11 @@ class VulkanFilterYuvCompute : public VulkanFilter VK_IMAGE_ASPECT_PLANE_0_BIT | VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT) + , m_inputEnableMsbToLsbShift(inputEnableMsbToLsbShift) + , m_outputEnableLsbToMsbShift(outputEnableLsbToMsbShift) , m_enableRowAndColumnReplication(true) + , m_inputIsBuffer(false) + , m_outputIsBuffer(false) { // FIXME: m_ycbcrPrimariesConstants is currently unused but is kept for future use. 
(void)m_ycbcrPrimariesConstants; @@ -116,263 +133,205 @@ class VulkanFilterYuvCompute : public VulkanFilter assert(m_vkDevCtx != nullptr); } + uint32_t UpdateBufferDescriptorSets(const VkBuffer* vkBuffers, + uint32_t numVkBuffers, + const VkSubresourceLayout* vkBufferSubresourceLayout, + uint32_t numPlanes, + VkImageAspectFlags validImageAspects, + uint32_t& descrIndex, + uint32_t& baseBinding, + VkDescriptorType descriptorType, // Ex: VK_DESCRIPTOR_TYPE_STORAGE_BUFFER + VkDescriptorBufferInfo bufferDescriptors[maxNumComputeDescr], + std::array& writeDescriptorSets, + const uint32_t maxDescriptors = maxNumComputeDescr); + + uint32_t UpdateImageDescriptorSets(const VkImageResourceView* inputImageView, + VkImageAspectFlags validImageAspects, + VkSampler convSampler, + VkImageLayout imageLayout, + uint32_t& descrIndex, + uint32_t& baseBinding, + VkDescriptorType descriptorType, // Ex: VK_DESCRIPTOR_TYPE_STORAGE_IMAGE + VkDescriptorImageInfo imageDescriptors[maxNumComputeDescr], + std::array& writeDescriptorSets, + const uint32_t maxDescriptors = maxNumComputeDescr); + + // Image input -> Image output virtual VkResult RecordCommandBuffer(VkCommandBuffer cmdBuf, const VkImageResourceView* inputImageView, const VkVideoPictureResourceInfoKHR * inputImageResourceInfo, const VkImageResourceView* outputImageView, const VkVideoPictureResourceInfoKHR * outputImageResourceInfo, - uint32_t bufferIdx) - { - - assert(cmdBuf != VK_NULL_HANDLE); - - m_vkDevCtx->CmdBindPipeline(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, m_computePipeline.getPipeline()); - - VkDescriptorSetLayoutCreateFlags layoutMode = m_descriptorSetLayout.GetDescriptorSetLayoutInfo().GetDescriptorLayoutMode(); - - switch (layoutMode) { - case VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR: - case VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT: - { - - const uint32_t maxNumComputeDescr = 8; - VkDescriptorImageInfo imageDescriptors[8]{}; - std::array writeDescriptorSets{}; - - // Images - uint32_t 
set = 0; - uint32_t descrIndex = 0; - uint32_t dstBinding = 0; - // RGBA color converted by an YCbCr sample - if (m_inputImageAspects & VK_IMAGE_ASPECT_COLOR_BIT) { - writeDescriptorSets[descrIndex].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - writeDescriptorSets[descrIndex].dstSet = VK_NULL_HANDLE; - writeDescriptorSets[descrIndex].dstBinding = dstBinding; - writeDescriptorSets[descrIndex].descriptorCount = 1; - writeDescriptorSets[descrIndex].descriptorType = (m_samplerYcbcrConversion.GetSampler() != VK_NULL_HANDLE) ? - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER : - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; - - imageDescriptors[descrIndex].sampler = m_samplerYcbcrConversion.GetSampler(); - imageDescriptors[descrIndex].imageView = inputImageView->GetImageView(); - assert(imageDescriptors[descrIndex].imageView); - imageDescriptors[descrIndex].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; - writeDescriptorSets[descrIndex].pImageInfo = &imageDescriptors[descrIndex]; // RGBA or Sampled YCbCr - descrIndex++; - } - dstBinding++; - - uint32_t planeNum = 0; - // y plane - G -> R8 - if ((m_inputImageAspects & (VK_IMAGE_ASPECT_PLANE_0_BIT << planeNum)) && - (planeNum < inputImageView->GetNumberOfPlanes())) { - writeDescriptorSets[descrIndex].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - writeDescriptorSets[descrIndex].dstSet = VK_NULL_HANDLE; - writeDescriptorSets[descrIndex].dstBinding = dstBinding; - writeDescriptorSets[descrIndex].descriptorCount = 1; - writeDescriptorSets[descrIndex].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; - imageDescriptors[descrIndex].sampler = VK_NULL_HANDLE; - imageDescriptors[descrIndex].imageView = inputImageView->GetPlaneImageView(planeNum++); - assert(imageDescriptors[descrIndex].imageView); - imageDescriptors[descrIndex].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; - writeDescriptorSets[descrIndex].pImageInfo = &imageDescriptors[descrIndex]; // Y (0) plane - descrIndex++; - } - dstBinding++; - - // CbCr plane - 
BR -> R8B8 - if ((m_inputImageAspects & (VK_IMAGE_ASPECT_PLANE_0_BIT << planeNum)) && - (planeNum < inputImageView->GetNumberOfPlanes())) { - writeDescriptorSets[descrIndex].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - writeDescriptorSets[descrIndex].dstSet = VK_NULL_HANDLE; - writeDescriptorSets[descrIndex].dstBinding = dstBinding; - writeDescriptorSets[descrIndex].descriptorCount = 1; - writeDescriptorSets[descrIndex].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; - imageDescriptors[descrIndex].sampler = VK_NULL_HANDLE; - imageDescriptors[descrIndex].imageView = inputImageView->GetPlaneImageView(planeNum++); - assert(imageDescriptors[descrIndex].imageView); - imageDescriptors[descrIndex].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; - writeDescriptorSets[descrIndex].pImageInfo = &imageDescriptors[descrIndex]; // CbCr (1) plane - descrIndex++; - } - dstBinding++; - - // Cr plane - R -> R8 - if ((m_inputImageAspects & (VK_IMAGE_ASPECT_PLANE_0_BIT << planeNum)) && - (planeNum < inputImageView->GetNumberOfPlanes())) { - writeDescriptorSets[descrIndex].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - writeDescriptorSets[descrIndex].dstSet = VK_NULL_HANDLE; - writeDescriptorSets[descrIndex].dstBinding = dstBinding; - writeDescriptorSets[descrIndex].descriptorCount = 1; - writeDescriptorSets[descrIndex].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; - imageDescriptors[descrIndex].sampler = VK_NULL_HANDLE; - imageDescriptors[descrIndex].imageView = inputImageView->GetPlaneImageView(planeNum++); - assert(imageDescriptors[descrIndex].imageView); - imageDescriptors[descrIndex].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; - writeDescriptorSets[descrIndex].pImageInfo = &imageDescriptors[descrIndex]; // CbCr (1) plane - descrIndex++; - } - dstBinding++; - - // Out RGBA or single planar YCbCr image - if (m_outputImageAspects & VK_IMAGE_ASPECT_COLOR_BIT) { - writeDescriptorSets[descrIndex].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - 
writeDescriptorSets[descrIndex].dstSet = VK_NULL_HANDLE; - writeDescriptorSets[descrIndex].dstBinding = dstBinding; - writeDescriptorSets[descrIndex].descriptorCount = 1; - writeDescriptorSets[descrIndex].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; - imageDescriptors[descrIndex].sampler = VK_NULL_HANDLE; - imageDescriptors[descrIndex].imageView = outputImageView->GetImageView(); - imageDescriptors[descrIndex].imageLayout = VK_IMAGE_LAYOUT_GENERAL; - writeDescriptorSets[descrIndex].pImageInfo = &imageDescriptors[descrIndex]; - descrIndex++; - } - dstBinding++; - - planeNum = 0; - // y plane out - G -> R8 - if ((m_outputImageAspects & (VK_IMAGE_ASPECT_PLANE_0_BIT << planeNum)) && - (planeNum < outputImageView->GetNumberOfPlanes())) { - writeDescriptorSets[descrIndex].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - writeDescriptorSets[descrIndex].dstSet = VK_NULL_HANDLE; - writeDescriptorSets[descrIndex].dstBinding = dstBinding; - writeDescriptorSets[descrIndex].descriptorCount = 1; - writeDescriptorSets[descrIndex].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; - imageDescriptors[descrIndex].sampler = VK_NULL_HANDLE; - imageDescriptors[descrIndex].imageView = outputImageView->GetPlaneImageView(planeNum++); - assert(imageDescriptors[descrIndex].imageView); - imageDescriptors[descrIndex].imageLayout = VK_IMAGE_LAYOUT_GENERAL; - writeDescriptorSets[descrIndex].pImageInfo = &imageDescriptors[descrIndex]; - descrIndex++; - } - dstBinding++; - - // CbCr plane out - BR -> R8B8 - if ((m_outputImageAspects & (VK_IMAGE_ASPECT_PLANE_0_BIT << planeNum)) && - (planeNum < outputImageView->GetNumberOfPlanes())) { - writeDescriptorSets[descrIndex].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - writeDescriptorSets[descrIndex].dstSet = VK_NULL_HANDLE; - writeDescriptorSets[descrIndex].dstBinding = dstBinding; - writeDescriptorSets[descrIndex].descriptorCount = 1; - writeDescriptorSets[descrIndex].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; - 
imageDescriptors[descrIndex].sampler = VK_NULL_HANDLE; - imageDescriptors[descrIndex].imageView = outputImageView->GetPlaneImageView(planeNum++); - assert(imageDescriptors[descrIndex].imageView); - imageDescriptors[descrIndex].imageLayout = VK_IMAGE_LAYOUT_GENERAL; - writeDescriptorSets[descrIndex].pImageInfo = &imageDescriptors[descrIndex]; - descrIndex++; - } - dstBinding++; - - // Cr plane out - R -> R8 - if ((m_outputImageAspects & (VK_IMAGE_ASPECT_PLANE_0_BIT << planeNum)) && - (planeNum < outputImageView->GetNumberOfPlanes())) { - writeDescriptorSets[descrIndex].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - writeDescriptorSets[descrIndex].dstSet = VK_NULL_HANDLE; - writeDescriptorSets[descrIndex].dstBinding = dstBinding; - writeDescriptorSets[descrIndex].descriptorCount = 1; - writeDescriptorSets[descrIndex].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; - imageDescriptors[descrIndex].sampler = VK_NULL_HANDLE; - imageDescriptors[descrIndex].imageView = outputImageView->GetPlaneImageView(planeNum++); - assert(imageDescriptors[descrIndex].imageView); - imageDescriptors[descrIndex].imageLayout = VK_IMAGE_LAYOUT_GENERAL; - writeDescriptorSets[descrIndex].pImageInfo = &imageDescriptors[descrIndex]; - descrIndex++; - } - dstBinding++; - - assert(descrIndex <= maxNumComputeDescr); - assert(descrIndex >= 2); - - if (layoutMode == VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR) { - m_vkDevCtx->CmdPushDescriptorSetKHR(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, - m_descriptorSetLayout.GetPipelineLayout(), - set, descrIndex, writeDescriptorSets.data()); - } else { - - VkDeviceOrHostAddressConstKHR imageDescriptorBufferDeviceAddress = - m_descriptorSetLayout.UpdateDescriptorBuffer(bufferIdx, - set, - descrIndex, - writeDescriptorSets.data()); - - - // Descriptor buffer bindings - // Set 0 = Image - VkDescriptorBufferBindingInfoEXT bindingInfo{}; - bindingInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_BUFFER_BINDING_INFO_EXT; - bindingInfo.pNext = nullptr; - 
bindingInfo.address = imageDescriptorBufferDeviceAddress.deviceAddress; - bindingInfo.usage = VK_BUFFER_USAGE_SAMPLER_DESCRIPTOR_BUFFER_BIT_EXT | - VK_BUFFER_USAGE_RESOURCE_DESCRIPTOR_BUFFER_BIT_EXT; - m_vkDevCtx->CmdBindDescriptorBuffersEXT(cmdBuf, 1, &bindingInfo); - - // Image (set 0) - uint32_t bufferIndexImage = 0; - VkDeviceSize bufferOffset = 0; - m_vkDevCtx->CmdSetDescriptorBufferOffsetsEXT(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, - m_descriptorSetLayout.GetPipelineLayout(), - set, 1, &bufferIndexImage, &bufferOffset); - } - } - break; - - default: - m_vkDevCtx->CmdBindDescriptorSets(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, - m_descriptorSetLayout.GetPipelineLayout(), - 0, 1, m_descriptorSetLayout.GetDescriptorSet(), 0, 0); - } - - struct ivec2 { - uint32_t width; - uint32_t height; - - ivec2() : width(0), height(0) {} - ivec2(int32_t width_, int32_t height_) : width(width_), height(height_) {} - }; - - struct PushConstants { - uint32_t srcLayer; - uint32_t dstLayer; - ivec2 inputSize; // Original input image size (width, height) - ivec2 outputSize; // Output image size (width, height, with padding) - }; - - PushConstants pushConstants = { - inputImageResourceInfo->baseArrayLayer, // Set the source layer index - outputImageResourceInfo->baseArrayLayer, // Set the destination layer index - ivec2(inputImageResourceInfo->codedExtent.width, inputImageResourceInfo->codedExtent.height), - ivec2(outputImageResourceInfo->codedExtent.width, outputImageResourceInfo->codedExtent.height) - }; - - m_vkDevCtx->CmdPushConstants(cmdBuf, - m_descriptorSetLayout.GetPipelineLayout(), - VK_SHADER_STAGE_COMPUTE_BIT, - 0, // offset - sizeof(PushConstants), - &pushConstants); - - const uint32_t workgroupWidth = (pushConstants.outputSize.width + (m_workgroupSizeX - 1)) / m_workgroupSizeX; - const uint32_t workgroupHeight = (pushConstants.outputSize.height + (m_workgroupSizeY - 1)) / m_workgroupSizeY; - m_vkDevCtx->CmdDispatch(cmdBuf, workgroupWidth, workgroupHeight, 1); - - return 
VK_SUCCESS; - } + uint32_t bufferIdx) override; + // Buffer input -> Image output + VkResult RecordCommandBuffer(VkCommandBuffer cmdBuf, + const VkBuffer* inBuffers, // with size numInBuffers + uint32_t numInBuffers, + const VkFormat* inBufferFormats, // with size inBufferNumPlanes + const VkSubresourceLayout* inBufferSubresourceLayouts, // with size inBufferNumPlanes + uint32_t inBufferNumPlanes, + const VkImageResourceView* outImageView, + const VkVideoPictureResourceInfoKHR* outImageResourceInfo, + const VkBufferImageCopy* pBufferImageCopy, + uint32_t bufferIdx); + + // Image input -> Buffer output + VkResult RecordCommandBuffer(VkCommandBuffer cmdBuf, + const VkImageResourceView* inImageView, + const VkVideoPictureResourceInfoKHR* inImageResourceInfo, + const VkBuffer* outBuffers, // with size numOutBuffers + uint32_t numOutBuffers, + const VkFormat* outBufferFormats, // with size outBufferNumPlanes + const VkSubresourceLayout* outBufferSubresourceLayouts, // with size outBufferNumPlanes + uint32_t outBufferNumPlanes, + const VkBufferImageCopy* pBufferImageCopy, + uint32_t bufferIdx); + + // Buffer input -> Buffer output + VkResult RecordCommandBuffer(VkCommandBuffer cmdBuf, + const VkBuffer* inBuffers, // with size numInBuffers + uint32_t numInBuffers, + const VkFormat* inBufferFormats, // with size inBufferNumPlanes + const VkSubresourceLayout* inBufferSubresourceLayouts, // with size inBufferNumPlanes + uint32_t inBufferNumPlanes, + const VkExtent3D& inBufferExtent, + const VkBuffer* outBuffers, // with size numOutBuffers + uint32_t numOutBuffers, + const VkFormat* outBufferFormats, // with size outBufferNumPlanes + const VkSubresourceLayout* outBufferSubresourceLayouts, // with size outBufferNumPlanes + uint32_t outBufferNumPlanes, + const VkExtent3D& outBufferExtent, + uint32_t bufferIdx); private: VkResult InitDescriptorSetLayout(uint32_t maxNumFrames); - void ShaderGeneratePlaneDescriptors(std::stringstream& computeShader, - VkImageAspectFlags& 
imageAspects, - const char *imageName, - VkFormat imageFormat, - bool isInput, - uint32_t startBinding = 0, - uint32_t set = 0, - bool imageArray = true); + + /** + * @brief Generates GLSL image descriptor bindings for shader input/output + * + * Creates appropriate GLSL image binding declarations based on the input/output format. + * Handles different YUV formats like single-plane (RGBA), 2-plane (NV12/NV21), and 3-plane (I420, etc). + * + * @param computeShader Output stringstream for shader code + * @param imageAspects Output parameter to store the image aspect flags used + * @param imageName Base image variable name + * @param imageFormat Vulkan format of the image + * @param isInput Whether this is an input or output resource + * @param startBinding Starting binding number in the descriptor set + * @param set Descriptor set number + * @param imageArray Whether to use image2DArray or image2D + * @return The next available binding number after all descriptors are created + */ + uint32_t ShaderGenerateImagePlaneDescriptors(std::stringstream& computeShader, + VkImageAspectFlags& imageAspects, + const char *imageName, + VkFormat imageFormat, + bool isInput, + uint32_t startBinding = 0, + uint32_t set = 0, + bool imageArray = true); + + /** + * @brief Generates GLSL buffer descriptor bindings for shader input/output + * + * Creates appropriate GLSL buffer binding declarations based on the input/output format. + * Handles different YUV buffer layouts matching single-plane, 2-plane, or 3-plane formats. 
+ * + * @param shaderStr Output stringstream for shader code + * @param imageAspects Output parameter to store the image aspect flags used + * @param bufferName Base buffer variable name + * @param bufferFormat Vulkan format of the buffer data + * @param isInput Whether this is an input or output resource + * @param startBinding Starting binding number in the descriptor set + * @param set Descriptor set number + * @param bufferType The Vulkan descriptor type to use for the buffer + * @return The next available binding number after all descriptors are created + */ + uint32_t ShaderGenerateBufferPlaneDescriptors(std::stringstream& shaderStr, + VkImageAspectFlags& imageAspects, + const char *bufferName, + VkFormat bufferFormat, + bool isInput, + uint32_t startBinding = 0, + uint32_t set = 0, + VkDescriptorType bufferType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); + + /** + * @brief Unified descriptor generation for either buffer or image resources + * + * Delegates to either ShaderGenerateImagePlaneDescriptors or ShaderGenerateBufferPlaneDescriptors + * based on the resource type (image or buffer) needed for input/output. + * + * @param shaderStr Output stringstream for shader code + * @param isInput Whether this is an input or output resource + * @param startBinding Starting binding number in the descriptor set + * @param set Descriptor set number + * @param imageArray Whether to use image2DArray or image2D (for image resources) + * @param bufferType The Vulkan descriptor type to use for buffer resources + * @return The next available binding number after all descriptors are created + */ + uint32_t ShaderGeneratePlaneDescriptors(std::stringstream& shaderStr, + bool isInput, + uint32_t startBinding, + uint32_t set, + bool imageArray, + VkDescriptorType bufferType); + + /** + * @brief Initializes GLSL shader for YCbCr copy operation + * + * Generates a compute shader that copies YCbCr data from input to output + * without any color space conversion, preserving the format. 
+ * + * @param computeShader Output string for the complete GLSL shader code + * @return Size of the generated shader code in bytes + */ size_t InitYCBCRCOPY(std::string& computeShader); + + /** + * @brief Initializes GLSL shader for YCbCr clear operation + * + * Generates a compute shader that clears/fills YCbCr data in the output + * resource with constant values. + * + * @param computeShader Output string for the complete GLSL shader code + * @return Size of the generated shader code in bytes + */ size_t InitYCBCRCLEAR(std::string& computeShader); + + /** + * @brief Initializes GLSL shader for YCbCr to RGBA conversion + * + * Generates a compute shader that converts YCbCr input to RGBA output + * using the appropriate color space conversion matrix. + * + * @param computeShader Output string for the complete GLSL shader code + * @return Size of the generated shader code in bytes + */ size_t InitYCBCR2RGBA(std::string& computeShader); + /** + * @brief Initializes GLSL shader for RGBA to YCbCr conversion + * + * Generates a compute shader that converts RGBA input to YCbCr output + * using the appropriate color space conversion matrix. + * + * @param computeShader Output string for the complete GLSL shader code + * @return Size of the generated shader code in bytes + */ + size_t InitRGBA2YCBCR(std::string& computeShader); + + /** + * @brief Initializes GLSL shader for YUV to NV12 conversion using buffer input + * + * Generates a compute shader that converts YUV input from buffer to NV12 output, + * handling different YUV formats (I420, I422, I444) with appropriate chroma subsampling. 
+ * + * @param computeShader Output string for the complete GLSL shader code + * @return Size of the generated shader code in bytes + */ + size_t InitYUV2NV12FromBuffer(std::string& computeShader); + private: const FilterType m_filterType; VkFormat m_inputFormat; @@ -386,8 +345,32 @@ class VulkanFilterYuvCompute : public VulkanFilter VulkanComputePipeline m_computePipeline; VkImageAspectFlags m_inputImageAspects; VkImageAspectFlags m_outputImageAspects; + uint32_t m_inputEnableMsbToLsbShift : 1; + uint32_t m_outputEnableLsbToMsbShift : 1; uint32_t m_enableRowAndColumnReplication : 1; - + uint32_t m_inputIsBuffer : 1; + uint32_t m_outputIsBuffer : 1; + + struct PushConstants { + uint32_t srcLayer; // src image layer to use + uint32_t dstLayer; // dst image layer to use + uint32_t inputWidth; // input image or buffer width + uint32_t inputHeight; // input image or buffer height + uint32_t outputWidth; // output image or buffer width + uint32_t outputHeight; // output image or buffer height + uint32_t inYOffset; // input buffer Y plane offset + uint32_t inCbOffset; // input buffer Cb plane offset + uint32_t inCrOffset; // input buffer Cr plane offset + uint32_t inYPitch; // input buffer Y plane pitch + uint32_t inCbPitch; // input buffer Cb plane pitch + uint32_t inCrPitch; // input buffer Cr plane pitch + uint32_t outYOffset; // output buffer Y plane offset + uint32_t outCbOffset; // output buffer Cb plane offset + uint32_t outCrOffset; // output buffer Cr plane offset + uint32_t outYPitch; // output buffer Y plane pitch + uint32_t outCbPitch; // output buffer Cb plane pitch + uint32_t outCrPitch; // output buffer Cr plane pitch + }; }; #endif /* _VULKANFILTERYUVCOMPUTE_H_ */ diff --git a/common/libs/VkCodecUtils/VulkanShaderCompiler.cpp b/common/libs/VkCodecUtils/VulkanShaderCompiler.cpp index 20fc073e..89215a8b 100644 --- a/common/libs/VkCodecUtils/VulkanShaderCompiler.cpp +++ b/common/libs/VkCodecUtils/VulkanShaderCompiler.cpp @@ -103,13 +103,19 @@ VkShaderModule 
VulkanShaderCompiler::BuildShaderFromFile(const char *fileName, VkShaderStageFlagBits type, const VulkanDeviceContext* vkDevCtx) { +#ifdef seekg // read file from the path std::ifstream is(fileName, std::ios::binary | std::ios::in | std::ios::ate); if (is.is_open()) { - - size_t size = is.tellg(); - is.seekg(0, std::ios::beg); + is.seekg (0, is.end); + std::streamoff fileSize = is.tellg(); + if (fileSize < 0 || static_cast<uint64_t>(fileSize) > std::numeric_limits<size_t>::max()) { + std::cerr << "File size is too large or invalid" << std::endl; + return VK_NULL_HANDLE; + } + size_t size = static_cast<size_t>(fileSize); + is.seekg(0, is.beg); char* shaderCode = new char[size]; is.read(shaderCode, size); is.close(); @@ -122,6 +128,7 @@ VkShaderModule VulkanShaderCompiler::BuildShaderFromFile(const char *fileName, return shaderModule; } +#endif return VK_NULL_HANDLE; } diff --git a/common/libs/VkCodecUtils/VulkanVideoProcessor.cpp b/common/libs/VkCodecUtils/VulkanVideoProcessor.cpp index ebe00067..d6e1fd18 100644 --- a/common/libs/VkCodecUtils/VulkanVideoProcessor.cpp +++ b/common/libs/VkCodecUtils/VulkanVideoProcessor.cpp @@ -115,11 +115,13 @@ VkResult VulkanVideoProcessor::Initialize(const VulkanDeviceContext* vkDevCtx, return result; } - VkVideoCoreProfile videoProfile(m_videoStreamDemuxer->GetVideoCodec(), - m_videoStreamDemuxer->GetChromaSubsampling(), - m_videoStreamDemuxer->GetLumaBitDepth(), - m_videoStreamDemuxer->GetChromaBitDepth(), - m_videoStreamDemuxer->GetProfileIdc()); + VkVideoCoreProfile videoProfile ({ + m_videoStreamDemuxer->GetVideoCodec(), + m_videoStreamDemuxer->GetChromaSubsampling(), + m_videoStreamDemuxer->GetLumaBitDepth(), + m_videoStreamDemuxer->GetChromaBitDepth(), + m_videoStreamDemuxer->GetProfileIdc() + }); if (!VulkanVideoCapabilities::IsCodecTypeSupported(vkDevCtx, vkDevCtx->GetVideoDecodeQueueFamilyIdx(), @@ -194,12 +196,11 @@ VkResult VulkanVideoProcessor::Create(const DecoderConfig& settings, const Vulka VkVideoProfileInfoKHR 
VulkanVideoProcessor::GetVkProfile() const { - - VkVideoProfileInfoKHR videoProfile({VK_STRUCTURE_TYPE_VIDEO_PROFILE_INFO_KHR, NULL, + VkVideoProfileInfoKHR videoProfile {VK_STRUCTURE_TYPE_VIDEO_PROFILE_INFO_KHR, NULL, m_videoStreamDemuxer->GetVideoCodec(), m_videoStreamDemuxer->GetChromaSubsampling(), m_videoStreamDemuxer->GetLumaBitDepth(), - m_videoStreamDemuxer->GetChromaBitDepth()}); + m_videoStreamDemuxer->GetChromaBitDepth()}; return videoProfile; } @@ -229,10 +230,10 @@ VkFormat VulkanVideoProcessor::GetFrameImageFormat() const VkExtent3D VulkanVideoProcessor::GetVideoExtent() const { - VkExtent3D extent ({ (uint32_t)m_videoStreamDemuxer->GetWidth(), - (uint32_t)m_videoStreamDemuxer->GetHeight(), - (uint32_t)1 - }); + VkExtent3D extent { (uint32_t)m_videoStreamDemuxer->GetWidth(), + (uint32_t)m_videoStreamDemuxer->GetHeight(), + (uint32_t)1 + }; return extent; } diff --git a/common/libs/VkCodecUtils/VulkanVideoProcessor.h b/common/libs/VkCodecUtils/VulkanVideoProcessor.h index cbdca1f1..0eb08e9c 100644 --- a/common/libs/VkCodecUtils/VulkanVideoProcessor.h +++ b/common/libs/VkCodecUtils/VulkanVideoProcessor.h @@ -23,6 +23,10 @@ #include "VkCodecUtils/VkVideoQueue.h" #include "VkVideoFrameOutput.h" +// Forward declarations +class VulkanDeviceContext; +struct VkMpFormatInfo; + class VulkanVideoProcessor : public VkVideoQueue { public: diff --git a/common/libs/VkShell/Shell.h b/common/libs/VkShell/Shell.h index c9c6c233..b91223b0 100644 --- a/common/libs/VkShell/Shell.h +++ b/common/libs/VkShell/Shell.h @@ -66,7 +66,10 @@ class Shell : public VkWsiDisplay, public VkVideoRefCountBase { if ((res != VK_SUCCESS) && (res != VK_SUBOPTIMAL_KHR)) { std::stringstream ss; ss << "VkResult " << res << " returned"; +#ifdef __cpp_exceptions throw std::runtime_error(ss.str()); +#endif // __cpp_exceptions + } return res; diff --git a/vk_video_decoder/demos/vk-video-dec/CMakeLists.txt b/vk_video_decoder/demos/vk-video-dec/CMakeLists.txt index 30e3e4cd..5ebba8a3 100644 --- 
a/vk_video_decoder/demos/vk-video-dec/CMakeLists.txt +++ b/vk_video_decoder/demos/vk-video-dec/CMakeLists.txt @@ -50,6 +50,7 @@ set(sources ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/VulkanCommandBufferPool.cpp ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/VulkanCommandBufferPool.h ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/VkVideoFrameToFile.cpp + ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/crcgenerator.cpp ${VK_VIDEO_DECODER_LIBS_SOURCE_ROOT}/VkDecoderUtils/VideoStreamDemuxer.cpp ${VK_VIDEO_DECODER_LIBS_SOURCE_ROOT}/VkDecoderUtils/VideoStreamDemuxer.h ${VK_VIDEO_DECODER_LIBS_SOURCE_ROOT}/VkDecoderUtils/ElementaryStream.cpp diff --git a/vk_video_decoder/libs/NvVideoParser/src/VulkanAV1Decoder.cpp b/vk_video_decoder/libs/NvVideoParser/src/VulkanAV1Decoder.cpp index bc65f33f..c401eec1 100644 --- a/vk_video_decoder/libs/NvVideoParser/src/VulkanAV1Decoder.cpp +++ b/vk_video_decoder/libs/NvVideoParser/src/VulkanAV1Decoder.cpp @@ -1132,7 +1132,7 @@ static uint32_t tile_log2(int blk_size, int target) return k; } -uint32_t FloorLog2(uint32_t x) +static uint32_t FloorLog2(uint32_t x) { int s = 0; @@ -2289,7 +2289,11 @@ bool VulkanAV1Decoder::ParseObuTileGroup(const AV1ObuHeader& hdr) consumedBytes += tile_size_bytes_minus_1 + 1; m_PicData.tileOffsets[m_PicData.khr_info.tileCount] = (uint32_t)m_nalu.start_offset + (uint32_t)consumedBytes; - tileSize = tile_size_minus_1 + 1; + // Add bounds checking and safe conversion + if (tile_size_minus_1 > (SIZE_MAX - 1)) { + return false; // Tile size too large + } + tileSize = (size_t)(tile_size_minus_1 + 1); consumedBytes += (uint32_t)tileSize; skip_bits((uint32_t)(tileSize * 8)); @@ -2302,7 +2306,7 @@ bool VulkanAV1Decoder::ParseObuTileGroup(const AV1ObuHeader& hdr) return (tg_end == num_tiles - 1); } -bool IsObuInCurrentOperatingPoint(int current_operating_point, AV1ObuHeader *hdr) { +static bool IsObuInCurrentOperatingPoint(int current_operating_point, AV1ObuHeader *hdr) { if (current_operating_point == 0) 
return true; if (((current_operating_point >> hdr->temporal_id) & 0x1) && ((current_operating_point >> (hdr->spatial_id + 8)) & 0x1)) { diff --git a/vk_video_decoder/libs/NvVideoParser/src/VulkanAV1GlobalMotionDec.cpp b/vk_video_decoder/libs/NvVideoParser/src/VulkanAV1GlobalMotionDec.cpp index e5a35316..37691fe5 100644 --- a/vk_video_decoder/libs/NvVideoParser/src/VulkanAV1GlobalMotionDec.cpp +++ b/vk_video_decoder/libs/NvVideoParser/src/VulkanAV1GlobalMotionDec.cpp @@ -82,7 +82,7 @@ #define WARP_PARAM_REDUCE_BITS 6 #define WARPEDMODEL_PREC_BITS 16 -int get_msb(unsigned int n) +static int get_msb(unsigned int n) { int log = 0; unsigned int value = n; diff --git a/vk_video_decoder/libs/VkVideoDecoder/VkVideoDecoder.cpp b/vk_video_decoder/libs/VkVideoDecoder/VkVideoDecoder.cpp index 2b9f6b66..c855386a 100644 --- a/vk_video_decoder/libs/VkVideoDecoder/VkVideoDecoder.cpp +++ b/vk_video_decoder/libs/VkVideoDecoder/VkVideoDecoder.cpp @@ -334,16 +334,18 @@ int32_t VkVideoDecoder::StartVideoSequence(VkParserDetectedVideoFormat* pVideoFo if (needNewFilter) { result = VulkanFilterYuvCompute::Create(m_vkDevCtx, - m_vkDevCtx->GetComputeQueueFamilyIdx(), - 0, - m_filterType, - numDecodeSurfaces + 1, - inputFormat, - outputFormat, - &ycbcrConversionCreateInfo, - &ycbcrPrimariesConstants, - &samplerInfo, - m_yuvFilter); + m_vkDevCtx->GetComputeQueueFamilyIdx(), + 0, + m_filterType, + numDecodeSurfaces + 1, + inputFormat, + outputFormat, + false, // inputEnableMsbToLsbShift + false, // outputEnableLsbToMsbShift + &ycbcrConversionCreateInfo, + &ycbcrPrimariesConstants, + &samplerInfo, + m_yuvFilter); } if (result == VK_SUCCESS) { diff --git a/vk_video_decoder/src/vulkan_video_decoder.cpp b/vk_video_decoder/src/vulkan_video_decoder.cpp index 1d0e0541..f98f3f82 100644 --- a/vk_video_decoder/src/vulkan_video_decoder.cpp +++ b/vk_video_decoder/src/vulkan_video_decoder.cpp @@ -66,10 +66,11 @@ class VulkanVideoDecoderImpl : public VulkanVideoDecoder { virtual VkExtent3D GetVideoExtent() 
const { - VkExtent3D extent ({ (uint32_t)m_vulkanVideoProcessor->GetWidth(), - (uint32_t)m_vulkanVideoProcessor->GetHeight(), - (uint32_t)1 - }); + VkExtent3D extent { + (uint32_t)m_vulkanVideoProcessor->GetWidth(), + (uint32_t)m_vulkanVideoProcessor->GetHeight(), + 1 + }; return extent; } diff --git a/vk_video_decoder/test/vulkan-video-dec/CMakeLists.txt b/vk_video_decoder/test/vulkan-video-dec/CMakeLists.txt index 7f10d58f..084a6676 100644 --- a/vk_video_decoder/test/vulkan-video-dec/CMakeLists.txt +++ b/vk_video_decoder/test/vulkan-video-dec/CMakeLists.txt @@ -30,6 +30,7 @@ set(VULKAN_VIDEO_DEC_SOURCES ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/VulkanSamplerYcbcrConversion.h ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/nvVkFormats.cpp ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/VkVideoFrameToFile.cpp + ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/crcgenerator.cpp ${VK_VIDEO_DECODER_LIBS_SOURCE_ROOT}/VkDecoderUtils/VideoStreamDemuxer.cpp ${VK_VIDEO_DECODER_LIBS_SOURCE_ROOT}/VkDecoderUtils/VideoStreamDemuxer.h ${VK_VIDEO_DECODER_LIBS_SOURCE_ROOT}/VkDecoderUtils/ElementaryStream.cpp diff --git a/vk_video_decoder/test/vulkan-video-simple-dec/CMakeLists.txt b/vk_video_decoder/test/vulkan-video-simple-dec/CMakeLists.txt index 30cf00be..d533f95e 100644 --- a/vk_video_decoder/test/vulkan-video-simple-dec/CMakeLists.txt +++ b/vk_video_decoder/test/vulkan-video-simple-dec/CMakeLists.txt @@ -1,6 +1,7 @@ set(VULKAN_VIDEO_SIMPLE_DEC_SOURCES Main.cpp ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/nvVkFormats.cpp + ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/crcgenerator.cpp ${VK_VIDEO_DECODER_LIBS_SOURCE_ROOT}/VkDecoderUtils/VideoStreamDemuxer.cpp ${VK_VIDEO_DECODER_LIBS_SOURCE_ROOT}/VkDecoderUtils/VideoStreamDemuxer.h ${VK_VIDEO_DECODER_LIBS_SOURCE_ROOT}/VkDecoderUtils/ElementaryStream.cpp diff --git a/vk_video_encoder/demos/vk-video-enc/CMakeLists.txt b/vk_video_encoder/demos/vk-video-enc/CMakeLists.txt index d3bba268..b043412b 100644 --- 
a/vk_video_encoder/demos/vk-video-enc/CMakeLists.txt +++ b/vk_video_encoder/demos/vk-video-enc/CMakeLists.txt @@ -80,6 +80,7 @@ set(sources ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/nvVkFormats.cpp ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/VulkanBistreamBufferImpl.h ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/VulkanBistreamBufferImpl.cpp + ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/crcgenerator.cpp ${VK_VIDEO_DECODER_LIBS_SOURCE_ROOT}/VulkanVideoFrameBuffer/VulkanVideoFrameBuffer.h ${VK_VIDEO_DECODER_LIBS_SOURCE_ROOT}/VulkanVideoFrameBuffer/VulkanVideoFrameBuffer.cpp ) @@ -118,6 +119,10 @@ if(TARGET vulkan) list(APPEND definitions PRIVATE -DUNINSTALLED_LOADER="$") endif() +if(USE_ENCODER_SHADERC) + list(APPEND definitions PRIVATE -DSHADERC_SUPPORT) +endif() + if(WIN32) list(APPEND definitions PRIVATE -DVK_USE_PLATFORM_WIN32_KHR) list(APPEND definitions PRIVATE -DWIN32_LEAN_AND_MEAN) diff --git a/vk_video_encoder/demos/vk-video-enc/Main.cpp b/vk_video_encoder/demos/vk-video-enc/Main.cpp index 31d24b2d..bb849f72 100644 --- a/vk_video_encoder/demos/vk-video-enc/Main.cpp +++ b/vk_video_encoder/demos/vk-video-enc/Main.cpp @@ -21,7 +21,7 @@ #include "VkCodecUtils/VulkanEncoderFrameProcessor.h" #include "VkShell/Shell.h" -int main(int argc, char** argv) +int main(int argc, const char* argv[]) { VkSharedBaseObj encoderConfig; if (VK_SUCCESS != EncoderConfig::CreateCodecConfig(argc, argv, encoderConfig)) { diff --git a/vk_video_encoder/include/vulkan_video_encoder.h b/vk_video_encoder/include/vulkan_video_encoder.h index e757f238..f170fd4a 100644 --- a/vk_video_encoder/include/vulkan_video_encoder.h +++ b/vk_video_encoder/include/vulkan_video_encoder.h @@ -43,7 +43,7 @@ class VulkanVideoEncoder : public virtual VkVideoRefCountBase { public: virtual VkResult Initialize(VkVideoCodecOperationFlagBitsKHR videoCodecOperation, - int argc, char** argv) = 0; + int argc, const char** argv) = 0; virtual int64_t GetNumberOfFrames() = 0; virtual VkResult 
EncodeNextFrame(int64_t& frameNumEncoded) = 0; virtual VkResult GetBitstream() = 0; @@ -52,7 +52,7 @@ class VulkanVideoEncoder : public virtual VkVideoRefCountBase { extern "C" VK_VIDEO_ENCODER_EXPORT VkResult CreateVulkanVideoEncoder(VkVideoCodecOperationFlagBitsKHR videoCodecOperation, - int argc, char** argv, + int argc, const char** argv, VkSharedBaseObj& vulkanVideoEncoder); #endif /* _VULKAN_VIDEO_ENCODER_H_ */ diff --git a/vk_video_encoder/libs/CMakeLists.txt b/vk_video_encoder/libs/CMakeLists.txt index 5cca8809..66685d33 100644 --- a/vk_video_encoder/libs/CMakeLists.txt +++ b/vk_video_encoder/libs/CMakeLists.txt @@ -88,7 +88,7 @@ set(LIBVKVIDEOENCODER_DEFINITIONS PRIVATE VK_VIDEO_ENCODER_IMPLEMENTATION PUBLIC VK_VIDEO_ENCODER_SHAREDLIB) -if(USE_SHADERC) +if(USE_ENCODER_SHADERC) list(APPEND LIBVKVIDEOENCODER_SRC ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/VulkanShaderCompiler.cpp ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/VulkanShaderCompiler.h @@ -108,7 +108,7 @@ include_directories(BEFORE ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}) set(LIBVKVIDEOENCODER_DEPENDENCIES GenerateDispatchTables ${VULKAN_VIDEO_PARSER_LIB}) add_library(${VULKAN_VIDEO_ENCODER_LIB} SHARED ${LIBVKVIDEOENCODER_SRC}) -if(USE_SHADERC) +if(USE_ENCODER_SHADERC) # Link the libraries target_link_libraries(${VULKAN_VIDEO_ENCODER_LIB} PUBLIC ${SHADERC_SHARED_LIBRARY}) # Ensure the library depends on the generation of these files @@ -137,7 +137,7 @@ if(WIN32) endif() add_library(${VULKAN_VIDEO_ENCODER_STATIC_LIB} STATIC ${LIBVKVIDEOENCODER_SRC}) -if(USE_SHADERC) +if(USE_ENCODER_SHADERC) # Link the libraries target_link_libraries(${VULKAN_VIDEO_ENCODER_STATIC_LIB} PUBLIC ${SHADERC_SHARED_LIBRARY}) endif() diff --git a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfig.cpp b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfig.cpp index 53d7cec3..fdfe92de 100644 --- a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfig.cpp +++ b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfig.cpp @@ 
-19,7 +19,7 @@ #include "VkVideoEncoder/VkEncoderConfigH265.h" #include "VkVideoEncoder/VkEncoderConfigAV1.h" -void printHelp(VkVideoCodecOperationFlagBitsKHR codec) +static void printHelp(VkVideoCodecOperationFlagBitsKHR codec) { fprintf(stderr, "Version: " VKVS_VERSION_STRING "\n"\ @@ -156,10 +156,10 @@ void printHelp(VkVideoCodecOperationFlagBitsKHR codec) } } -int EncoderConfig::ParseArguments(int argc, char *argv[]) +int EncoderConfig::ParseArguments(int argc, const char *argv[]) { int argcount = 0; - std::vector arglist; + std::vector arglist; std::vector args(argv, argv + argc); uint32_t frameCount = 0; @@ -572,7 +572,7 @@ int EncoderConfig::ParseArguments(int argc, char *argv[]) gopStructure.SetIntraRefreshSkippedStartIndex(intraRefreshSkippedStartIndex); } else { argcount++; - arglist.push_back((char*)args[i].c_str()); + arglist.push_back(args[i].c_str()); } } @@ -703,7 +703,7 @@ int EncoderConfig::ParseArguments(int argc, char *argv[]) return DoParseArguments(argcount, arglist.data()); } -VkResult EncoderConfig::CreateCodecConfig(int argc, char *argv[], +VkResult EncoderConfig::CreateCodecConfig(int argc, const char *argv[], VkSharedBaseObj& encoderConfig) { diff --git a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfig.h b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfig.h index 94adb438..896c1636 100644 --- a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfig.h +++ b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfig.h @@ -913,13 +913,13 @@ struct EncoderConfig : public VkVideoRefCountBase { } // Factory Function - static VkResult CreateCodecConfig(int argc, char *argv[], VkSharedBaseObj& encoderConfig); + static VkResult CreateCodecConfig(int argc, const char *argv[], VkSharedBaseObj& encoderConfig); void InitVideoProfile(); - int ParseArguments(int argc, char *argv[]); + int ParseArguments(int argc, const char *argv[]); - virtual int DoParseArguments(int argc, char *argv[]) { + virtual int DoParseArguments(int argc, const char *argv[]) { if 
(argc > 0) { std::cout << "Invalid paramters: "; for (int i = 0; i < argc; i++) { diff --git a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigAV1.cpp b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigAV1.cpp index aeab421d..c3ba67c1 100644 --- a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigAV1.cpp +++ b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigAV1.cpp @@ -26,7 +26,7 @@ } \ } -int EncoderConfigAV1::DoParseArguments(int argc, char* argv[]) +int EncoderConfigAV1::DoParseArguments(int argc, const char* argv[]) { // No validation of command line options. So, all options must be valid and // values with in the limits of vulkan and av1 specification diff --git a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigAV1.h b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigAV1.h index 0838e2c8..622977d6 100644 --- a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigAV1.h +++ b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigAV1.h @@ -88,7 +88,7 @@ struct EncoderConfigAV1 : public EncoderConfig { } virtual ~EncoderConfigAV1() {} - virtual int DoParseArguments(int argc, char* argv[]) override; + virtual int DoParseArguments(int argc, const char* argv[]) override; virtual VkResult InitializeParameters() override { diff --git a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH264.cpp b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH264.cpp index 68829578..e9c94bed 100644 --- a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH264.cpp +++ b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH264.cpp @@ -17,7 +17,7 @@ #include "VkVideoEncoder/VkEncoderConfigH264.h" #include "VkVideoEncoder/VkVideoEncoderH264.h" -int EncoderConfigH264::DoParseArguments(int argc, char* argv[]) +int EncoderConfigH264::DoParseArguments(int argc, const char* argv[]) { std::vector args(argv, argv + argc); diff --git a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH264.h b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH264.h index 
fb1c0611..6d8865a5 100644 --- a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH264.h +++ b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH264.h @@ -156,7 +156,7 @@ struct EncoderConfigH264 : public EncoderConfig { const LevelLimits* levelLimits; size_t levelLimitsSize; - virtual int DoParseArguments(int argc, char* argv[]) override; + virtual int DoParseArguments(int argc, const char* argv[]) override; StdVideoH264LevelIdc DetermineLevel(uint8_t dpbSize, uint32_t bitrate, diff --git a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH265.cpp b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH265.cpp index b4a03ce1..33bcc53e 100644 --- a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH265.cpp +++ b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH265.cpp @@ -68,7 +68,7 @@ uint32_t EncoderConfigH265::GetCpbVclFactor() return baseFactor + depthFactor; } -int EncoderConfigH265::DoParseArguments(int argc, char* argv[]) +int EncoderConfigH265::DoParseArguments(int argc, const char* argv[]) { std::vector args(argv, argv + argc); diff --git a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH265.h b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH265.h index ebc5ca38..774bf1a9 100644 --- a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH265.h +++ b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH265.h @@ -135,7 +135,7 @@ struct EncoderConfigH265 : public EncoderConfig { return this; } - virtual int DoParseArguments(int argc, char* argv[]) override; + virtual int DoParseArguments(int argc, const char* argv[]) override; uint32_t GetCtbAlignedPicSizeInSamples(uint32_t& picWidthInCtbsY, uint32_t& picHeightInCtbsY, bool minCtbsY = false); diff --git a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderDpbH264.h b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderDpbH264.h index a54bed3c..c828c3c7 100644 --- a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderDpbH264.h +++ b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderDpbH264.h @@ -139,7 +139,7 
@@ class VkEncDpbH264 const StdVideoEncodeH264PictureInfo *GetCurrentDpbEntry(void) { assert((m_currDpbIdx < m_max_dpb_size) || (m_currDpbIdx == MAX_DPB_SLOTS)); - return &m_DPB[m_currDpbIdx].picInfo; + return &m_DPB[(int)m_currDpbIdx].picInfo; } uint32_t GetUpdatedFrameNumAndPicOrderCnt(int32_t& PicOrderCnt) diff --git a/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.cpp b/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.cpp index 8649df07..84d96deb 100644 --- a/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.cpp +++ b/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.cpp @@ -164,96 +164,100 @@ VkResult VkVideoEncoder::LoadNextFrame(VkSharedBaseObj& const uint8_t* pInputFrameData = m_encoderConfig->inputFileHandler.GetMappedPtr(m_encoderConfig->input.fullImageSize, encodeFrameInfo->frameInputOrderNum); + // NOTE: Get image layout const VkSubresourceLayout* dstSubresourceLayout = dstImageResource->GetSubresourceLayout(); - int yCbCrConvResult = 0; - if (m_encoderConfig->input.bpp == 8) { - - if (m_encoderConfig->encodeChromaSubsampling == VK_VIDEO_CHROMA_SUBSAMPLING_444_BIT_KHR) { - // Load current 8-bit frame from file and convert to 2-plane YUV444 - yCbCrConvResult = YCbCrConvUtilsCpu::I444ToP444( - pInputFrameData + m_encoderConfig->input.planeLayouts[0].offset, // src_y - (int)m_encoderConfig->input.planeLayouts[0].rowPitch, // src_stride_y - pInputFrameData + m_encoderConfig->input.planeLayouts[1].offset, // src_u - (int)m_encoderConfig->input.planeLayouts[1].rowPitch, // src_stride_u - pInputFrameData + m_encoderConfig->input.planeLayouts[2].offset, // src_v - (int)m_encoderConfig->input.planeLayouts[2].rowPitch, // src_stride_v - writeImagePtr + dstSubresourceLayout[0].offset, // dst_y - (int)dstSubresourceLayout[0].rowPitch, // dst_stride_y - writeImagePtr + dstSubresourceLayout[1].offset, // dst_uv - (int)dstSubresourceLayout[1].rowPitch, // dst_stride_uv - std::min(m_encoderConfig->encodeWidth, m_encoderConfig->input.width), // width - 
std::min(m_encoderConfig->encodeHeight, m_encoderConfig->input.height)); // height - } else { - // Load current 8-bit frame from file and convert to NV12 - yCbCrConvResult = YCbCrConvUtilsCpu::I420ToNV12( - pInputFrameData + m_encoderConfig->input.planeLayouts[0].offset, // src_y, - (int)m_encoderConfig->input.planeLayouts[0].rowPitch, // src_stride_y, - pInputFrameData + m_encoderConfig->input.planeLayouts[1].offset, // src_u, - (int)m_encoderConfig->input.planeLayouts[1].rowPitch, // src_stride_u, - pInputFrameData + m_encoderConfig->input.planeLayouts[2].offset, // src_v, - (int)m_encoderConfig->input.planeLayouts[2].rowPitch, // src_stride_v, - writeImagePtr + dstSubresourceLayout[0].offset, // dst_y, - (int)dstSubresourceLayout[0].rowPitch, // dst_stride_y, - writeImagePtr + dstSubresourceLayout[1].offset, // dst_uv, - (int)dstSubresourceLayout[1].rowPitch, // dst_stride_uv, - std::min(m_encoderConfig->encodeWidth, m_encoderConfig->input.width), // width - std::min(m_encoderConfig->encodeHeight, m_encoderConfig->input.height)); // height - } - - } else if (m_encoderConfig->input.bpp == 10) { // 10-bit - actually 16-bit only for now. 
+ const uint32_t width = std::min(m_encoderConfig->encodeWidth, m_encoderConfig->input.width); + const uint32_t height = std::min(m_encoderConfig->encodeHeight, m_encoderConfig->input.height); + + if (m_inputComputeFilter != nullptr) { + // Compute filter available: direct plane copy, GPU filter handles conversion + CopyYCbCrPlanesDirectCPU( + pInputFrameData, // Source buffer + m_encoderConfig->input.planeLayouts, // Source layouts + writeImagePtr, // Destination buffer + dstSubresourceLayout, // Destination layouts + width, height, + m_encoderConfig->input.numPlanes, // Number of planes + m_encoderConfig->input.vkFormat); // Format for subsampling detection + } else { + // No compute filter: CPU conversion from 3-plane to 2-plane format + int yCbCrConvResult = 0; + if (m_encoderConfig->input.bpp == 8) { + if (m_encoderConfig->encodeChromaSubsampling == VK_VIDEO_CHROMA_SUBSAMPLING_444_BIT_KHR) { + yCbCrConvResult = YCbCrConvUtilsCpu::I444ToP444( + pInputFrameData + m_encoderConfig->input.planeLayouts[0].offset, + (int)m_encoderConfig->input.planeLayouts[0].rowPitch, + pInputFrameData + m_encoderConfig->input.planeLayouts[1].offset, + (int)m_encoderConfig->input.planeLayouts[1].rowPitch, + pInputFrameData + m_encoderConfig->input.planeLayouts[2].offset, + (int)m_encoderConfig->input.planeLayouts[2].rowPitch, + writeImagePtr + dstSubresourceLayout[0].offset, + (int)dstSubresourceLayout[0].rowPitch, + writeImagePtr + dstSubresourceLayout[1].offset, + (int)dstSubresourceLayout[1].rowPitch, + width, height); + } else { + yCbCrConvResult = YCbCrConvUtilsCpu::I420ToNV12( + pInputFrameData + m_encoderConfig->input.planeLayouts[0].offset, + (int)m_encoderConfig->input.planeLayouts[0].rowPitch, + pInputFrameData + m_encoderConfig->input.planeLayouts[1].offset, + (int)m_encoderConfig->input.planeLayouts[1].rowPitch, + pInputFrameData + m_encoderConfig->input.planeLayouts[2].offset, + (int)m_encoderConfig->input.planeLayouts[2].rowPitch, + writeImagePtr + 
dstSubresourceLayout[0].offset, + (int)dstSubresourceLayout[0].rowPitch, + writeImagePtr + dstSubresourceLayout[1].offset, + (int)dstSubresourceLayout[1].rowPitch, + width, height); + } + } else if (m_encoderConfig->input.bpp == 10 || m_encoderConfig->input.bpp == 12) { + int shiftBits = 0; + if (m_encoderConfig->input.msbShift >= 0) { + shiftBits = m_encoderConfig->input.msbShift; + } else { + shiftBits = 16 - m_encoderConfig->input.bpp; + } - int shiftBits = 0; - if (m_encoderConfig->input.msbShift >= 0) { - shiftBits = m_encoderConfig->input.msbShift; + if (m_encoderConfig->encodeChromaSubsampling == VK_VIDEO_CHROMA_SUBSAMPLING_444_BIT_KHR) { + yCbCrConvResult = YCbCrConvUtilsCpu::I444ToP444( + (const uint16_t*)(pInputFrameData + m_encoderConfig->input.planeLayouts[0].offset), + (int)m_encoderConfig->input.planeLayouts[0].rowPitch, + (const uint16_t*)(pInputFrameData + m_encoderConfig->input.planeLayouts[1].offset), + (int)m_encoderConfig->input.planeLayouts[1].rowPitch, + (const uint16_t*)(pInputFrameData + m_encoderConfig->input.planeLayouts[2].offset), + (int)m_encoderConfig->input.planeLayouts[2].rowPitch, + (uint16_t*)(writeImagePtr + dstSubresourceLayout[0].offset), + (int)dstSubresourceLayout[0].rowPitch, + (uint16_t*)(writeImagePtr + dstSubresourceLayout[1].offset), + (int)dstSubresourceLayout[1].rowPitch, + width, height, shiftBits); + } else { + yCbCrConvResult = YCbCrConvUtilsCpu::I420ToNV12( + (const uint16_t*)(pInputFrameData + m_encoderConfig->input.planeLayouts[0].offset), + (int)m_encoderConfig->input.planeLayouts[0].rowPitch, + (const uint16_t*)(pInputFrameData + m_encoderConfig->input.planeLayouts[1].offset), + (int)m_encoderConfig->input.planeLayouts[1].rowPitch, + (const uint16_t*)(pInputFrameData + m_encoderConfig->input.planeLayouts[2].offset), + (int)m_encoderConfig->input.planeLayouts[2].rowPitch, + (uint16_t*)(writeImagePtr + dstSubresourceLayout[0].offset), + (int)dstSubresourceLayout[0].rowPitch, + (uint16_t*)(writeImagePtr + 
dstSubresourceLayout[1].offset), + (int)dstSubresourceLayout[1].rowPitch, + width, height, shiftBits); + } } else { - shiftBits = 16 - m_encoderConfig->input.bpp; + assert(!"Requested bit-depth is not supported!"); + return VK_ERROR_INITIALIZATION_FAILED; } - if (m_encoderConfig->encodeChromaSubsampling == VK_VIDEO_CHROMA_SUBSAMPLING_444_BIT_KHR) { - // Load current 10-bit frame from file and convert to 2-plane YUV444 - yCbCrConvResult = YCbCrConvUtilsCpu::I444ToP444( - (const uint16_t*)(pInputFrameData + m_encoderConfig->input.planeLayouts[0].offset), // src_y - (int)m_encoderConfig->input.planeLayouts[0].rowPitch, // src_stride_y - (const uint16_t*)(pInputFrameData + m_encoderConfig->input.planeLayouts[1].offset), // src_u - (int)m_encoderConfig->input.planeLayouts[1].rowPitch, // src_stride_u - (const uint16_t*)(pInputFrameData + m_encoderConfig->input.planeLayouts[2].offset), // src_v - (int)m_encoderConfig->input.planeLayouts[2].rowPitch, // src_stride_v - (uint16_t*)(writeImagePtr + dstSubresourceLayout[0].offset), // dst_y - (int)dstSubresourceLayout[0].rowPitch, // dst_stride_y - (uint16_t*)(writeImagePtr + dstSubresourceLayout[1].offset), // dst_uv - (int)dstSubresourceLayout[1].rowPitch, // dst_stride_uv - std::min(m_encoderConfig->encodeWidth, m_encoderConfig->input.width), // width - std::min(m_encoderConfig->encodeHeight, m_encoderConfig->input.height), // height - shiftBits); - } else { - // Load current 10-bit frame from file and convert to P010/P016 - yCbCrConvResult = YCbCrConvUtilsCpu::I420ToNV12( - (const uint16_t*)(pInputFrameData + m_encoderConfig->input.planeLayouts[0].offset), // src_y, - (int)m_encoderConfig->input.planeLayouts[0].rowPitch, // src_stride_y, - (const uint16_t*)(pInputFrameData + m_encoderConfig->input.planeLayouts[1].offset), // src_u, - (int)m_encoderConfig->input.planeLayouts[1].rowPitch, // src_stride_u, - (const uint16_t*)(pInputFrameData + m_encoderConfig->input.planeLayouts[2].offset), // src_v, - 
(int)m_encoderConfig->input.planeLayouts[2].rowPitch, // src_stride_v, - (uint16_t*)(writeImagePtr + dstSubresourceLayout[0].offset), // dst_y, - (int)dstSubresourceLayout[0].rowPitch, // dst_stride_y, - (uint16_t*)(writeImagePtr + dstSubresourceLayout[1].offset), // dst_uv, - (int)dstSubresourceLayout[1].rowPitch, // dst_stride_uv, - std::min(m_encoderConfig->encodeWidth, m_encoderConfig->input.width), // width - std::min(m_encoderConfig->encodeHeight, m_encoderConfig->input.height), // height - shiftBits); + if (yCbCrConvResult != 0) { + return VK_ERROR_INITIALIZATION_FAILED; } - - } else { - assert(!"Requested bit-depth is not supported!"); - } - - if (yCbCrConvResult == 0) { - // On success, stage the input frame for the encoder video input - return StageInputFrame(encodeFrameInfo); } - return VK_ERROR_INITIALIZATION_FAILED; + // Now stage the input frame for the encoder video input + return StageInputFrame(encodeFrameInfo); } VkResult VkVideoEncoder::StageInputFrameQpMap(VkSharedBaseObj& encodeFrameInfo, @@ -479,6 +483,123 @@ VkResult VkVideoEncoder::SubmitStagedQpMap(VkSharedBaseObjplanesLayout) : 8; // Default to 8-bit + const uint32_t bytesPerPixel = (bitDepth > 8) ? 2 : 1; + + // Determine chroma subsampling ratios + const uint32_t chromaHorzRatio = (formatInfo != nullptr) ? (1 << formatInfo->planesLayout.secondaryPlaneSubsampledX) : 1; + const uint32_t chromaVertRatio = (formatInfo != nullptr) ? 
(1 << formatInfo->planesLayout.secondaryPlaneSubsampledY) : 1; + + // Log the format subsampling for debugging + if (m_encoderConfig->verbose) { + const char* subsamplingDesc = "4:4:4"; + if (chromaHorzRatio == 2 && chromaVertRatio == 2) { + subsamplingDesc = "4:2:0"; + } else if (chromaHorzRatio == 2 && chromaVertRatio == 1) { + subsamplingDesc = "4:2:2"; + } + printf("YCbCr copy with %s subsampling (chromaHorzRatio=%d, chromaVertRatio=%d), %d-bit\n", + subsamplingDesc, chromaHorzRatio, chromaVertRatio, bitDepth); + } + + // Handle all planes + for (uint32_t plane = 0; plane < numPlanes; plane++) { + // Source and destination plane pointers + const uint8_t* srcPlane = pInputFrameData + inputPlaneLayouts[plane].offset; + uint8_t* dstPlane = writeImagePtr + dstSubresourceLayout[plane].offset; + + // Get plane dimensions - adjust for chroma planes + uint32_t planeWidth = width; + uint32_t planeHeight = height; + + // Adjust dimensions for chroma planes based on format subsampling + if (plane > 0) { + if (chromaHorzRatio > 1) { + planeWidth = (width + chromaHorzRatio - 1) / chromaHorzRatio; + } + if (chromaVertRatio > 1) { + planeHeight = (height + chromaVertRatio - 1) / chromaVertRatio; + } + } + + // Source and destination strides + assert(inputPlaneLayouts[plane].rowPitch <= SIZE_MAX); + assert(dstSubresourceLayout[plane].rowPitch <= SIZE_MAX); + const size_t srcStride = (size_t)inputPlaneLayouts[plane].rowPitch; + const size_t dstStride = (size_t)dstSubresourceLayout[plane].rowPitch; + + // Line width in bytes + const size_t lineBytes = planeWidth * bytesPerPixel; + + // Get the starting pointers for this plane + const uint8_t* srcRow = srcPlane; + uint8_t* dstRow = dstPlane; + + if (false && (bitDepth > 8)) { + + const int shiftBits = 16 - bitDepth; + + // Copy each line, incrementing pointers by stride amounts + for (uint32_t y = 0; y < planeHeight; y++) { + + // Get the starting pointers for this row + const uint16_t* srcRow16 = (const uint16_t*)srcRow; + 
uint16_t* dstRow16 = (uint16_t*)dstRow; + + for (uint32_t i = 0; i < planeWidth; i++) { + *dstRow16++ = (*srcRow16++ << shiftBits); + } + + // Advance to the next line using pointer arithmetic + srcRow += srcStride; + dstRow += dstStride; + } + + } else { + + // Copy each line, incrementing pointers by stride amounts + for (uint32_t y = 0; y < planeHeight; y++) { + // Copy the current line + memcpy(dstRow, srcRow, lineBytes); + + // Advance to the next line using pointer arithmetic + srcRow += srcStride; + dstRow += dstStride; + } + } + } +} VkResult VkVideoEncoder::SubmitStagedInputFrame(VkSharedBaseObj& encodeFrameInfo) { @@ -943,6 +1064,7 @@ VkResult VkVideoEncoder::InitEncoder(VkSharedBaseObj& encoderConf VK_IMAGE_USAGE_TRANSFER_DST_BIT); const VkImageUsageFlags dpbImageUsage = VK_IMAGE_USAGE_VIDEO_ENCODE_DPB_BIT_KHR; + // NOTE: Create linearInputImage result = VulkanVideoImagePool::Create(m_vkDevCtx, m_linearInputImagePool); if(result != VK_SUCCESS) { fprintf(stderr, "\nInitEncoder Error: Failed to create linearInputImagePool.\n"); @@ -954,9 +1076,21 @@ VkResult VkVideoEncoder::InitEncoder(VkSharedBaseObj& encoderConf std::max(m_maxCodedExtent.height, encoderConfig->input.height) }; + // When compute filter is available, the linear image stores raw input format + // and the filter handles conversion. Without it, the linear image must match + // the encode source format since CopyLinearToOptimalImage does no conversion. + const VkFormat linearImageFormat = +#ifdef SHADERC_SUPPORT + encoderConfig->enablePreprocessComputeFilter + ? 
encoderConfig->input.vkFormat + : m_imageInFormat; +#else + m_imageInFormat; +#endif + result = m_linearInputImagePool->Configure( m_vkDevCtx, encoderConfig->numInputImages, - m_imageInFormat, + linearImageFormat, linearInputImageExtent, ( VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT | @@ -1217,8 +1351,10 @@ VkResult VkVideoEncoder::InitEncoder(VkSharedBaseObj& encoderConf 0, // queueIndex encoderConfig->filterType, encoderConfig->numInputImages, - m_imageInFormat, // in filter format (can be RGB) + encoderConfig->input.vkFormat, // in filter format (can be RGB) m_imageInFormat, // out filter - same as input for now. + false, // inputEnableMsbToLsbShift + (encoderConfig->input.msbShift > 0), &ycbcrConversionCreateInfo, &ycbcrPrimariesConstants, &samplerInfo, @@ -1405,7 +1541,9 @@ VkImageLayout VkVideoEncoder::TransitionImageLayout(VkCommandBuffer cmdBuf, imageBarrier.srcStageMask = VK_PIPELINE_STAGE_2_VIDEO_ENCODE_BIT_KHR; imageBarrier.dstStageMask = VK_PIPELINE_STAGE_2_VIDEO_ENCODE_BIT_KHR; } else { +#ifdef __cpp_exceptions throw std::invalid_argument("unsupported layout transition!"); +#endif } const VkDependencyInfoKHR dependencyInfo = { @@ -1449,8 +1587,9 @@ VkResult VkVideoEncoder::CopyLinearToOptimalImage(VkCommandBuffer& commandBuffer // Bind memory for the image. const VkMpFormatInfo* mpInfo = YcbcrVkFormatInfo(format); - // Currently formats that have more than 2 output planes are not supported. 444 formats have a shared CbCr planes in all current tests - assert((mpInfo->vkPlaneFormat[2] == VK_FORMAT_UNDEFINED) && (mpInfo->vkPlaneFormat[3] == VK_FORMAT_UNDEFINED)); + // Determine number of planes: 1 (base) + numberOfExtraPlanes + const uint32_t numPlanes = 1 + mpInfo->planesLayout.numberOfExtraPlanes; + assert(numPlanes >= 1 && numPlanes <= 3); // Copy src buffer to image. 
VkImageCopy copyRegion[3]{}; @@ -1485,9 +1624,21 @@ VkResult VkVideoEncoder::CopyLinearToOptimalImage(VkCommandBuffer& commandBuffer copyRegion[1].dstSubresource.baseArrayLayer = dstCopyArrayLayer; copyRegion[1].dstSubresource.layerCount = 1; + if (numPlanes > 2) { + copyRegion[2].extent = copyRegion[1].extent; + copyRegion[2].srcSubresource.aspectMask = VK_IMAGE_ASPECT_PLANE_2_BIT; + copyRegion[2].srcSubresource.mipLevel = 0; + copyRegion[2].srcSubresource.baseArrayLayer = srcCopyArrayLayer; + copyRegion[2].srcSubresource.layerCount = 1; + copyRegion[2].dstSubresource.aspectMask = VK_IMAGE_ASPECT_PLANE_2_BIT; + copyRegion[2].dstSubresource.mipLevel = 0; + copyRegion[2].dstSubresource.baseArrayLayer = dstCopyArrayLayer; + copyRegion[2].dstSubresource.layerCount = 1; + } + m_vkDevCtx->CmdCopyImage(commandBuffer, srcImageResource->GetImage(), srcImageLayout, dstImageResource->GetImage(), dstImageLayout, - (uint32_t)2, copyRegion); + numPlanes, copyRegion); { VkMemoryBarrier memoryBarrier = {VK_STRUCTURE_TYPE_MEMORY_BARRIER}; @@ -1610,12 +1761,9 @@ VkResult VkVideoEncoder::HandleCtrlCmd(VkSharedBaseObj& encodeFrameInfo->qualityLevelInfo.sType = VK_STRUCTURE_TYPE_VIDEO_ENCODE_QUALITY_LEVEL_INFO_KHR; encodeFrameInfo->qualityLevelInfo.qualityLevel = encodeFrameInfo->qualityLevel; if (pNext != nullptr) { - if (encodeFrameInfo->rateControlInfo.pNext == nullptr) { - encodeFrameInfo->rateControlInfo.pNext = pNext; - } else { - ((VkBaseInStructure*)(encodeFrameInfo->rateControlInfo.pNext))->pNext = pNext; - } + vk::ChainNextVkStruct(encodeFrameInfo->rateControlInfo, *pNext); } + pNext = (VkBaseInStructure*)&encodeFrameInfo->qualityLevelInfo; } @@ -1638,12 +1786,9 @@ VkResult VkVideoEncoder::HandleCtrlCmd(VkSharedBaseObj& m_beginRateControlInfo = encodeFrameInfo->rateControlInfo; if (pNext != nullptr) { - if (encodeFrameInfo->rateControlInfo.pNext == nullptr) { - encodeFrameInfo->rateControlInfo.pNext = pNext; - } else { - 
((VkBaseInStructure*)(encodeFrameInfo->rateControlInfo.pNext))->pNext = pNext; - } + vk::ChainNextVkStruct(encodeFrameInfo->rateControlInfo, *pNext); } + pNext = (VkBaseInStructure*)&encodeFrameInfo->rateControlInfo; } @@ -1723,7 +1868,8 @@ VkResult VkVideoEncoder::RecordVideoCodingCmd(VkSharedBaseObjCmdControlVideoCodingKHR(cmdBuf, &renderControlInfo); m_beginRateControlInfo = *(VkVideoEncodeRateControlInfoKHR*)encodeFrameInfo->pControlCmdChain; - ((VkBaseInStructure*)(m_beginRateControlInfo.pNext))->pNext = NULL; + // Do not walk the chain, otherwise we end up creating a loop here. + m_beginRateControlInfo.pNext = (VkBaseInStructure*)(&encodeFrameInfo->pControlCmdChain); } if (m_videoMaintenance1FeaturesSupported) @@ -1735,10 +1881,12 @@ VkResult VkVideoEncoder::RecordVideoCodingCmd(VkSharedBaseObjencodeInfo; - while (pStruct->pNext) pStruct = (VkBaseInStructure*)pStruct->pNext; - pStruct->pNext = (VkBaseInStructure*)&videoInlineQueryInfoKHR; + vk::ChainNextVkStruct(*pStruct, videoInlineQueryInfoKHR); vkDevCtx->CmdEncodeVideoKHR(cmdBuf, &encodeFrameInfo->encodeInfo); + + // Remove the stack pointer from the chain, causes a use after free otherwise in GetEncodeFrameInfoH264 + encodeFrameInfo->encodeInfo.pNext = videoInlineQueryInfoKHR.pNext; } else { diff --git a/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.h b/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.h index 61c2ec84..dacc2929 100644 --- a/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.h +++ b/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.h @@ -58,7 +58,7 @@ class VkVideoEncoder : public VkVideoRefCountBase { { VkStructureType GetType() { return (encodeInfo.pNext == nullptr) ? 
- VK_STRUCTURE_TYPE_VIDEO_ENCODE_INFO_KHR : ((VkBaseInStructure*)encodeInfo.pNext)->sType; + VK_STRUCTURE_TYPE_VIDEO_ENCODE_INFO_KHR : reinterpret_cast(encodeInfo.pNext)->sType; } VkVideoEncodeFrameInfo(const void* pNext = nullptr) @@ -559,6 +559,29 @@ class VkVideoEncoder : public VkVideoRefCountBase { const uint8_t* setPlaneOffset(const uint8_t* pFrameData, size_t bufferSize, size_t ¤tReadOffset); + /** + * @brief Copies YCbCr planes directly from input buffer to output buffer when formats are the same + * + * @param pInputFrameData Source buffer containing YCbCr planes + * @param inputPlaneLayouts Array of source buffer plane layouts (offset, pitch, etc.) + * @param writeImagePtr Destination buffer for the YCbCr planes + * @param dstSubresourceLayout Array of destination buffer plane layouts + * @param width Width of the image in pixels + * @param height Height of the image in pixels + * @param numPlanes Number of planes in the format (1, 2, or 3) + * @param format The VkFormat of the image for proper subsampling and bit depth detection + * @return none + */ + void CopyYCbCrPlanesDirectCPU( + const uint8_t* pInputFrameData, + const VkSubresourceLayout* inputPlaneLayouts, + uint8_t* writeImagePtr, + const VkSubresourceLayout* dstSubresourceLayout, + uint32_t width, + uint32_t height, + uint32_t numPlanes, + VkFormat format); + bool WaitForThreadsToComplete(); protected: diff --git a/vk_video_encoder/libs/VkVideoEncoder/VkVideoGopStructure.h b/vk_video_encoder/libs/VkVideoEncoder/VkVideoGopStructure.h index d3b1ab0a..2ab76bcd 100644 --- a/vk_video_encoder/libs/VkVideoEncoder/VkVideoGopStructure.h +++ b/vk_video_encoder/libs/VkVideoEncoder/VkVideoGopStructure.h @@ -25,6 +25,7 @@ #include #include #include +#include // for std::min static const uint32_t MAX_GOP_SIZE = 64; @@ -207,15 +208,15 @@ class VkVideoGopStructure { uint32_t periodDelta = INT32_MAX; // the delta of this frame to the next closed GOP reference. 
-1 if it is not a B-frame if (framesLeft <= consecutiveBFrameCount) { // Handle last frames sequence - periodDelta = std::min(periodDelta, framesLeft); + periodDelta = std::min(periodDelta, framesLeft); } if (m_idrPeriod > 0) { // Is the IDR period valid - periodDelta = std::min(periodDelta, GetPeriodDelta(gopState, m_idrPeriod)); + periodDelta = std::min(periodDelta, GetPeriodDelta(gopState, m_idrPeriod)); } if (m_closedGop) { // A closed GOP is required. - periodDelta = std::min(periodDelta, GetPeriodDelta(gopState, m_gopFrameCount)); + periodDelta = std::min(periodDelta, GetPeriodDelta(gopState, m_gopFrameCount)); } uint32_t refDelta = INT32_MAX; // the delta of this frame from the last reference. -1 if it is not a B-frame diff --git a/vk_video_encoder/src/vulkan_video_encoder.cpp b/vk_video_encoder/src/vulkan_video_encoder.cpp index 61c3637d..ae44f7ce 100644 --- a/vk_video_encoder/src/vulkan_video_encoder.cpp +++ b/vk_video_encoder/src/vulkan_video_encoder.cpp @@ -23,7 +23,7 @@ class VulkanVideoEncoderImpl : public VulkanVideoEncoder { public: virtual VkResult Initialize(VkVideoCodecOperationFlagBitsKHR videoCodecOperation, - int argc, char** argv); + int argc, const char** argv); virtual int64_t GetNumberOfFrames() { return m_encoderConfig->numFrames; @@ -81,7 +81,7 @@ class VulkanVideoEncoderImpl : public VulkanVideoEncoder { }; VkResult VulkanVideoEncoderImpl::Initialize(VkVideoCodecOperationFlagBitsKHR videoCodecOperation, - int argc, char** argv) + int argc, const char** argv) { VkResult result = EncoderConfig::CreateCodecConfig(argc, argv, m_encoderConfig); if (VK_SUCCESS != result) { @@ -235,7 +235,7 @@ VkResult VulkanVideoEncoderImpl::EncodeNextFrame(int64_t& frameNumEncoded) VK_VIDEO_ENCODER_EXPORT VkResult CreateVulkanVideoEncoder(VkVideoCodecOperationFlagBitsKHR videoCodecOperation, - int argc, char** argv, + int argc, const char** argv, VkSharedBaseObj& vulkanVideoEncoder) { switch((uint32_t)videoCodecOperation) diff --git 
a/vk_video_encoder/test/vulkan-video-enc/Main.cpp b/vk_video_encoder/test/vulkan-video-enc/Main.cpp index 58c5cb49..09f55420 100644 --- a/vk_video_encoder/test/vulkan-video-enc/Main.cpp +++ b/vk_video_encoder/test/vulkan-video-enc/Main.cpp @@ -18,7 +18,7 @@ #include "vulkan_video_encoder.h" #include "VkVSCommon.h" -int main(int argc, char** argv) +int main(int argc, const char** argv) { std::cout << "Enter encoder test" << std::endl; VkSharedBaseObj vulkanVideoEncoder;