From ef5a10ec7f38ca522130ce64762ff9cf2230829e Mon Sep 17 00:00:00 2001 From: Tony Zlatinski Date: Thu, 27 Mar 2025 18:35:37 -0500 Subject: [PATCH 01/14] common: Modify the output filename extension based on the type --- .../libs/VkCodecUtils/VkVideoFrameToFile.cpp | 33 +++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/common/libs/VkCodecUtils/VkVideoFrameToFile.cpp b/common/libs/VkCodecUtils/VkVideoFrameToFile.cpp index cb71ccb6..846a0890 100644 --- a/common/libs/VkCodecUtils/VkVideoFrameToFile.cpp +++ b/common/libs/VkCodecUtils/VkVideoFrameToFile.cpp @@ -265,15 +265,44 @@ class VkVideoFrameToFileImpl : public VkVideoFrameOutput { } } - FILE* AttachFile(const char* fileName) { + bool hasExtension(const char* fileName, const char* extension) { + size_t fileLen = std::strlen(fileName); + size_t extLen = std::strlen(extension); + + if (fileLen < extLen) { + return false; + } + + return std::strcmp(fileName + fileLen - extLen, extension) == 0; + } + + FILE* AttachFile(const char* fileName, bool y4mFormat) { if (m_outputFile) { fclose(m_outputFile); m_outputFile = nullptr; } + std::string fileNameWithModExt; + // Check if the file does not have a y4m extension, + // but y4m format is requested. + if (y4mFormat && !hasExtension(fileName, ".y4m")) { + std::cout << std::endl << "y4m output format is requested, "; + std::cout << "but the output file's (" << fileName << ") extension isn't .y4m!" + << std::endl; + fileNameWithModExt = fileName + std::string(".y4m"); + fileName = fileNameWithModExt.c_str(); + } else if (!hasExtension(fileName, ".yuv")) { + std::cout << std::endl << "Raw yuv output format is requested, "; + std::cout << "but the output file's (" << fileName << ") extension isn't .yuv!" 
+ << std::endl; + fileNameWithModExt = fileName + std::string(".yuv"); + fileName = fileNameWithModExt.c_str(); + } + if (fileName != nullptr) { m_outputFile = fopen(fileName, "wb"); if (m_outputFile) { + std::cout << "Output file name is: " << fileName << std::endl; return m_outputFile; } } @@ -568,7 +597,7 @@ VkResult VkVideoFrameOutput::Create(const char* fileName, return VK_ERROR_OUT_OF_HOST_MEMORY; } - FILE* outFile = newFrameToFile->AttachFile(fileName); + FILE* outFile = newFrameToFile->AttachFile(fileName, outputy4m); if ((fileName != nullptr) && (outFile == nullptr)) { delete newFrameToFile; return VK_ERROR_INITIALIZATION_FAILED; From 892cb87b2cccf08eebff83ed775e4a5639ca4d99 Mon Sep 17 00:00:00 2001 From: Tony Zlatinski Date: Mon, 12 May 2025 08:15:31 -0500 Subject: [PATCH 02/14] encode: remove the CPU input conversion function Compute filter: Add support for buffer as input Compute filter: Add support for in/out plane mismatch Compute filter: Add 10/12-bit shift support Compute filter: Add support for 4:4:4, 4:2:2 and 4:2:0 --- .../include/nvidia_utils/vulkan/ycbcr_utils.h | 18 + common/libs/VkCodecUtils/FrameProcessor.h | 2 +- .../VkCodecUtils/VulkanFilterYuvCompute.cpp | 2756 ++++++++++++++++- .../VkCodecUtils/VulkanFilterYuvCompute.h | 475 ++- .../libs/VkVideoDecoder/VkVideoDecoder.cpp | 22 +- .../libs/VkVideoEncoder/VkVideoEncoder.cpp | 224 +- .../libs/VkVideoEncoder/VkVideoEncoder.h | 23 + 7 files changed, 3039 insertions(+), 481 deletions(-) diff --git a/common/include/nvidia_utils/vulkan/ycbcr_utils.h b/common/include/nvidia_utils/vulkan/ycbcr_utils.h index 7713c1e7..46f3ed78 100644 --- a/common/include/nvidia_utils/vulkan/ycbcr_utils.h +++ b/common/include/nvidia_utils/vulkan/ycbcr_utils.h @@ -103,6 +103,24 @@ typedef struct YcbcrPlanesLayoutInfo { uint8_t reserved; // reserved for structure alignment.
} YcbcrPlanesLayoutInfo; +static inline uint32_t GetBitsPerChannel(const YcbcrPlanesLayoutInfo& pYcbcrPlanesLayoutInfo) +{ + switch (pYcbcrPlanesLayoutInfo.bpp) { + case YCBCRA_8BPP: + return 8; + case YCBCRA_10BPP: + return 10; + case YCBCRA_12BPP: + return 12; + case YCBCRA_14BPP: + return 14; + case YCBCRA_16BPP: + return 16; + default: + return 8; + } +} + static inline size_t YcbcrAlign(size_t toAlign, size_t alignment) { return ((toAlign + (alignment - 1)) & ~(alignment -1)); diff --git a/common/libs/VkCodecUtils/FrameProcessor.h b/common/libs/VkCodecUtils/FrameProcessor.h index 8a94f6ab..097a3fa6 100644 --- a/common/libs/VkCodecUtils/FrameProcessor.h +++ b/common/libs/VkCodecUtils/FrameProcessor.h @@ -106,7 +106,7 @@ class FrameProcessor : public VkVideoRefCountBase { FrameProcessor(bool verbose = false) : m_frameCount(0) , m_profileFramesCount(0) - , m_displayTimePeriodMilliseconds(1000) + , m_displayTimePeriodMilliseconds(100) , start_time (std::chrono::steady_clock::now()) , m_verbose(verbose) { diff --git a/common/libs/VkCodecUtils/VulkanFilterYuvCompute.cpp b/common/libs/VkCodecUtils/VulkanFilterYuvCompute.cpp index dd67b2b5..906cc229 100644 --- a/common/libs/VkCodecUtils/VulkanFilterYuvCompute.cpp +++ b/common/libs/VkCodecUtils/VulkanFilterYuvCompute.cpp @@ -17,7 +17,7 @@ #include "VulkanFilterYuvCompute.h" #include "nvidia_utils/vulkan/ycbcrvkinfo.h" -static bool dumpShaders = false; +static bool dumpShaders = true; VkResult VulkanFilterYuvCompute::Create(const VulkanDeviceContext* vkDevCtx, uint32_t queueFamilyIndex, @@ -26,6 +26,8 @@ VkResult VulkanFilterYuvCompute::Create(const VulkanDeviceContext* vkDevCtx, uint32_t maxNumFrames, VkFormat inputFormat, VkFormat outputFormat, + bool inputEnableMsbToLsbShift, + bool outputEnableLsbToMsbShift, const VkSamplerYcbcrConversionCreateInfo* pYcbcrConversionCreateInfo, const YcbcrPrimariesConstants* pYcbcrPrimariesConstants, const VkSamplerCreateInfo* pSamplerCreateInfo, @@ -39,6 +41,8 @@ VkResult 
VulkanFilterYuvCompute::Create(const VulkanDeviceContext* vkDevCtx, maxNumFrames, inputFormat, outputFormat, + inputEnableMsbToLsbShift, + outputEnableLsbToMsbShift, pYcbcrPrimariesConstants)); if (!yCbCrVulkanFilter) { @@ -116,34 +120,58 @@ VkResult VulkanFilterYuvCompute::Init(const VkSamplerYcbcrConversionCreateInfo* VkResult VulkanFilterYuvCompute::InitDescriptorSetLayout(uint32_t maxNumFrames) { + VkSampler ccSampler = m_samplerYcbcrConversion.GetSampler(); - assert(ccSampler != VK_NULL_HANDLE); - VkDescriptorType type = (ccSampler != VK_NULL_HANDLE) ? VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER : VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; + VkDescriptorType type = (ccSampler != VK_NULL_HANDLE) ? VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER : + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; const VkSampler* pImmutableSamplers = (ccSampler != VK_NULL_HANDLE) ? &ccSampler : nullptr; - const std::vector setLayoutBindings{ - // binding, descriptorType, descriptorCount, stageFlags, pImmutableSamplers; + std::vector setLayoutBindings; + + // Input bindings (either images or buffers) + if (m_inputIsBuffer) { + // Binding 0: Input buffer (read-only) for single buffer case + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 0, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); + // Binding 1: Input buffer (read-only) Y plane + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 1, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); + // Binding 2: Input buffer (read-only) Cb or CbCr plane + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 2, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); + // Binding 3: Input buffer (read-only) Cr plane + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 3, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); + } else { // Binding 0: Input image (read-only) RGBA or RGBA YCbCr sampler sampled - VkDescriptorSetLayoutBinding{ 
0, type, 1, VK_SHADER_STAGE_COMPUTE_BIT, pImmutableSamplers}, + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 0, type, 1, VK_SHADER_STAGE_COMPUTE_BIT, pImmutableSamplers}); // Binding 1: Input image (read-only) Y plane of YCbCr Image - VkDescriptorSetLayoutBinding{ 1, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}, + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 1, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); // Binding 2: Input image (read-only) Cb or CbCr plane - VkDescriptorSetLayoutBinding{ 2, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}, + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 2, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); // Binding 3: Input image (read-only) Cr plane - VkDescriptorSetLayoutBinding{ 3, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}, + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 3, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); + } + // Output bindings (either images or buffers) + if (m_outputIsBuffer) { + // Binding 4: Output buffer (write) for single buffer case + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 4, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); + // Binding 5: Output buffer (write) Y plane + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 5, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); + // Binding 6: Output buffer (write) CbCr plane of 2-plane or Cb of 3-plane + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 6, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); + // Binding 7: Output buffer (write) Cr plane of 3-plane + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 7, VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); + } else { // 
Binding 4: Output image (write) RGBA or YCbCr single-plane image - VkDescriptorSetLayoutBinding{ 4, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}, + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 4, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); // Binding 5: Output image (write) Y plane of YCbCr Image - VkDescriptorSetLayoutBinding{ 5, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}, + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 5, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); // Binding 6: Output image (write) CbCr plane of 2-plane or Cb of 3-plane YCbCr Image - VkDescriptorSetLayoutBinding{ 6, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}, + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 6, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); // Binding 7: Output image (write) Cr plane of 3-pane YCbCr Image - VkDescriptorSetLayoutBinding{ 7, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}, + setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 7, VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); + } - // Binding 8: uniform buffer for input parameters. - VkDescriptorSetLayoutBinding{ 8, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}, - }; + // Binding 8: uniform buffer for input parameters. 
+ setLayoutBindings.push_back(VkDescriptorSetLayoutBinding{ 8, VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, 1, VK_SHADER_STAGE_COMPUTE_BIT, nullptr}); VkPushConstantRange pushConstantRange = {}; pushConstantRange.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; // Stage the push constant is for @@ -175,20 +203,74 @@ static YcbcrBtStandard GetYcbcrPrimariesConstantsId(VkSamplerYcbcrModelConversio return YcbcrBtStandardUnknown; } +// Generate a unified push constants declaration for shaders +/** + * @brief Generates GLSL code for push constants declaration used in compute shaders + * + * This function creates a standard push constants block with fields for: + * - Source and destination image layers + * - Input and output dimensions + * - Buffer offsets and pitches for Y, Cb, and Cr planes + * + * @param shaderStr Output stringstream where the GLSL code will be written + */ +void GenPushConstantsDecl(std::stringstream& shaderStr) { + shaderStr << "layout(push_constant) uniform PushConstants {\n" + << " uint srcLayer; // src image layer to use\n" + << " uint dstLayer; // dst image layer to use\n" + << " uint inputWidth; // input image or buffer width\n" + << " uint inputHeight; // input image or buffer height\n" + << " uint outputWidth; // output image or buffer width\n" + << " uint outputHeight; // output image or buffer height\n" + << " uint inYOffset; // input buffer Y plane offset\n" + << " uint inCbOffset; // input buffer Cb plane offset\n" + << " uint inCrOffset; // input buffer Cr plane offset\n" + << " uint inYPitch; // input buffer Y plane pitch\n" + << " uint inCbPitch; // input buffer Cb plane pitch\n" + << " uint inCrPitch; // input buffer Cr plane pitch\n" + << " uint outYOffset; // output buffer Y plane offset\n" + << " uint outCbOffset; // output buffer Cb plane offset\n" + << " uint outCrOffset; // output buffer Cr plane offset\n" + << " uint outYPitch; // output buffer Y plane pitch\n" + << " uint outCbPitch; // output buffer Cb plane pitch\n" + << " uint outCrPitch; 
// output buffer Cr plane pitch\n" + << "} pushConstants;\n"; +} + +// Updated header function with unified push constants +/** + * @brief Generates the shader header with version declaration and push constants + * + * Creates the beginning of a GLSL compute shader with: + * - GLSL version declaration (#version 450) + * - Push constants structure + * - Local work group size (16x16) + * + * @param shaderStr Output stringstream where the GLSL code will be written + */ static void GenHeaderAndPushConst(std::stringstream& shaderStr) { - shaderStr << "#version 450\n" - "layout(push_constant) uniform PushConstants {\n" - " uint srcImageLayer; // Source image layer index\n" - " uint dstImageLayer; // Destination image layer index\n" - " ivec2 inputSize; // Original input image size (width, height)\n" - " ivec2 outputSize; // Output image size (width, height, with padding)\n" - "} pushConstants;\n" - "\n" - "layout (local_size_x = 16, local_size_y = 16) in;\n" - "\n"; + shaderStr << "#version 450\n"; + GenPushConstantsDecl(shaderStr); + shaderStr << "\n" + << "layout (local_size_x = 16, local_size_y = 16) in;\n" + << "\n"; } +/** + * @brief Generates GLSL code for image binding layout declarations + * + * Creates the binding declaration for an image resource in the shader. 
+ * + * @param shaderStr Output stringstream where the GLSL code will be written + * @param imageName Base name for the image variable + * @param imageSubName Suffix name for the image variable (e.g., "Y", "CbCr") + * @param imageFormat Format string for the image (e.g., "rgba8") + * @param isInput Whether this is an input (readonly) or output (writeonly) image + * @param binding Binding point in the descriptor set + * @param set Descriptor set number + * @param imageArray Whether the image should be declared as image2DArray instead of image2D + */ static void GenImageIoBindingLayout(std::stringstream& shaderStr, const char *imageName, const char *imageSubName, @@ -206,22 +288,249 @@ static void GenImageIoBindingLayout(std::stringstream& shaderStr, } +/** + * @brief Generates GLSL code for handling global invocation position and bounds checking + * + * Creates code to: + * - Get the current pixel position from gl_GlobalInvocationID + * - Check if the position is within output image bounds + * - Return early if out of bounds to prevent invalid memory access + * + * @param shaderStr Output stringstream where the GLSL code will be written + */ static void GenHandleImagePosition(std::stringstream& shaderStr) { shaderStr << " ivec2 pos = ivec2(gl_GlobalInvocationID.xy);\n" " // Check for out-of-bounds writes\n" - " if ((pos.x >= pushConstants.outputSize.x) || (pos.y >= pushConstants.outputSize.y)) {\n" + " if ((pos.x >= pushConstants.outputWidth) || (pos.y >= pushConstants.outputHeight)) {\n" + " return;\n" + " }\n" + "\n"; +} + +/** + * @brief Generates GLSL code for buffer binding layout declarations + * + * Creates the binding declaration for a buffer resource in the shader. 
+ * + * @param shaderStr Output stringstream where the GLSL code will be written + * @param bufferName Base name for the buffer variable + * @param bufferSubName Suffix name for the buffer variable (e.g., "Y", "CbCr") + * @param bufferDataType Data type of buffer elements (e.g., "uint8_t", "uint16_t") + * @param bufferType Vulkan descriptor type (Storage buffer, uniform texel buffer, etc.) + * @param isInput Whether this is an input (readonly) or output (writeonly) buffer + * @param binding Binding point in the descriptor set + * @param set Descriptor set number + */ +static void GenBufferIoBindingLayout(std::stringstream& shaderStr, + const char *bufferName, + const char *bufferSubName, + const char *bufferDataType, + VkDescriptorType bufferType, + bool isInput, + uint32_t binding, + uint32_t set) { + + const char* readonlyModifier = isInput ? " readonly" : ""; + const char* writeonlyModifier = isInput ? "" : " writeonly"; + + switch (bufferType) { + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: + shaderStr << "layout (set = " << set << ", binding = " << binding << ") uniform" + << " samplerBuffer " + << bufferName << bufferSubName + << ";\n"; + break; + + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: + shaderStr << "layout (set = " << set << ", binding = " << binding << ") uniform" + << readonlyModifier << writeonlyModifier + << " imageBuffer " + << bufferName << bufferSubName + << ";\n"; + break; + + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC: + shaderStr << "layout (set = " << set << ", binding = " << binding << ") buffer" + << readonlyModifier << writeonlyModifier + << " " << bufferName << bufferSubName << "Buffer" + << " {\n" + << " " << bufferDataType << "[] data;\n" + << "} " << bufferName << bufferSubName << ";\n"; + break; + + default: + // Unsupported buffer type + break; + } +} + +/** + * @brief Generates GLSL code for determining if a position has chroma information + * + * Creates a condition that checks 
if the current pixel position contains + * chroma information based on the subsampling ratios. For example, in 4:2:0 + * subsampling, only pixels at even x and y coordinates have chroma samples. + * + * @param shaderStr Output stringstream where the GLSL code will be written + * @param chromaHorzRatio Horizontal subsampling ratio (1 for 4:4:4, 2 for 4:2:2/4:2:0) + * @param chromaVertRatio Vertical subsampling ratio (1 for 4:4:4/4:2:2, 2 for 4:2:0) + * @param useCondition Whether to output as a full if-condition (true) or just the condition expression (false) + * @param pixelPosName Name of the pixel position variable in the shader (default: "srcPos") + * @param setProcessChromaBool Name of the boolean variable to set (default: "processChromaBool") + */ +static void GenHandleChromaPosition(std::stringstream& shaderStr, + uint32_t chromaHorzRatio, + uint32_t chromaVertRatio, + bool useCondition = true, + const char* pixelPosName = "srcPos", + const char* setProcessChromaBool = "processChromaBool") +{ + // Skip this for 4:4:4 since all pixels have chroma + if (chromaHorzRatio <= 1 && chromaVertRatio <= 1) { + if (useCondition) { + // For 4:4:4, no subsampling check needed - process all pixels + shaderStr << " bool " << setProcessChromaBool << " = true;\n"; + } else { + shaderStr << "true"; + } + return; + } + + // Build condition for chroma sampling + std::stringstream condition; + if (chromaHorzRatio > 1) + condition << "(" << pixelPosName << ".x % " << chromaHorzRatio << " == 0)"; + + if (chromaHorzRatio > 1 && chromaVertRatio > 1) + condition << " && "; + + if (chromaVertRatio > 1) + condition << "(" << pixelPosName << ".y % " << chromaVertRatio << " == 0)"; + + if (useCondition) { + shaderStr << " bool " << setProcessChromaBool << " = " << condition.str() << ";\n"; + } else { + shaderStr << condition.str(); + } +} + +/** + * @brief Generates GLSL code for calculating subsampled chroma positions + * + * Creates code to compute the chroma position from a pixel 
position + * based on the subsampling ratios. For example, in 4:2:0 subsampling, + * the chroma position is calculated by dividing both x and y by 2. + * + * @param shaderStr Output stringstream where the GLSL code will be written + * @param chromaHorzRatio Horizontal subsampling ratio (1 for 4:4:4, 2 for 4:2:2/4:2:0) + * @param chromaVertRatio Vertical subsampling ratio (1 for 4:4:4/4:2:2, 2 for 4:2:0) + * @param srcPosName Name of the source position variable (default: "srcPos") + * @param dstPosName Name of the destination position variable (default: "chromaSrcPos") + * @param indent Number of spaces to indent the output code (default: 8) + * @param generateIfBlock Whether to generate an if-block or just assignment statements (default: false) + */ +static void GenCalculateChromaPosition(std::stringstream& shaderStr, + uint32_t chromaHorzRatio, + uint32_t chromaVertRatio, + const char* srcPosName = "srcPos", + const char* dstPosName = "chromaSrcPos", + int indent = 8, + bool generateIfBlock = false) +{ + std::string indentStr(indent, ' '); + + // For 4:4:4, no subsampling needed + if (chromaHorzRatio <= 1 && chromaVertRatio <= 1) { + shaderStr << indentStr << "// No subsampling for 4:4:4 format, use original position\n"; + if (generateIfBlock) { + shaderStr << indentStr << "// " << dstPosName << " already equals " << srcPosName << "\n"; + } else { + shaderStr << indentStr << dstPosName << " = " << srcPosName << ";\n"; + } + return; + } + + shaderStr << indentStr << "// Calculate subsampled positions based on format's subsampling\n"; + + if (generateIfBlock) { + // Generate an if-block for conditional calculation + shaderStr << indentStr << dstPosName << " = " << srcPosName << ";\n"; + shaderStr << indentStr << "if (processChroma) {\n"; + + if (chromaHorzRatio > 1) { + shaderStr << indentStr << " " << dstPosName << ".x = " << srcPosName << ".x / " << chromaHorzRatio << ";\n"; + } + + if (chromaVertRatio > 1) { + shaderStr << indentStr << " " << dstPosName << ".y = 
" << srcPosName << ".y / " << chromaVertRatio << ";\n"; + } + + shaderStr << indentStr << "}\n"; + } else { + // Generate direct assignment statements + shaderStr << indentStr << dstPosName << " = ivec2("; + + if (chromaHorzRatio > 1) + shaderStr << srcPosName << ".x / " << chromaHorzRatio; + else + shaderStr << srcPosName << ".x"; + + shaderStr << ", "; + + if (chromaVertRatio > 1) + shaderStr << srcPosName << ".y / " << chromaVertRatio; + else + shaderStr << srcPosName << ".y"; + + shaderStr << ");\n"; + } +} + +/** + * @brief Generates GLSL code for handling buffer position calculations with chroma subsampling + * + * Creates code to: + * - Get the current pixel position from gl_GlobalInvocationID + * - Check if the position is within output bounds + * - Calculate appropriate buffer indices based on subsampling ratios + * + * @param shaderStr Output stringstream where the GLSL code will be written + * @param chromaHorzRatio Horizontal subsampling ratio (default: 2 for 4:2:0/4:2:2) + * @param chromaVertRatio Vertical subsampling ratio (default: 2 for 4:2:0) + */ +static void GenHandleBufferPosition(std::stringstream& shaderStr, int chromaHorzRatio = 2, int chromaVertRatio = 2) +{ + shaderStr << + " ivec2 pos = ivec2(gl_GlobalInvocationID.xy);\n" + " // Check for out-of-bounds writes\n" + " if ((pos.x >= pushConstants.outputWidth) || (pos.y >= pushConstants.outputHeight)) {\n" " return;\n" " }\n" + " \n" + " // Calculate buffer indices based on position and strides\n" + " uint yIndex = pushConstants.inYOffset + pos.y * pushConstants.inYPitch + pos.x;\n" + " uint cbIndex = pushConstants.inCbOffset + (pos.y / " << chromaVertRatio << ") * pushConstants.inCbPitch + (pos.x / " << chromaHorzRatio << ");\n" + " uint crIndex = pushConstants.inCrOffset + (pos.y / " << chromaVertRatio << ") * pushConstants.inCrPitch + (pos.x / " << chromaHorzRatio << ");\n" "\n"; } +/** + * @brief Generates GLSL code for handling source position with optional replication + * + * Creates 
code to calculate source position, with optional boundary handling + * by replicating edge pixels when coordinates exceed input dimensions. + * + * @param shaderStr Output stringstream where the GLSL code will be written + * @param enableReplicate Whether to enable edge replication (clamp to edge) + */ static void GenHandleSourcePositionWithReplicate(std::stringstream& shaderStr, bool enableReplicate) { if (enableReplicate) { shaderStr << - " ivec2 srcPos = min(pos, pushConstants.inputSize );\n" + " ivec2 srcPos = min(pos, ivec2(pushConstants.inputWidth, pushConstants.inputHeight));\n" "\n"; } else { shaderStr << @@ -230,15 +539,622 @@ static void GenHandleSourcePositionWithReplicate(std::stringstream& shaderStr, b } } -void VulkanFilterYuvCompute::ShaderGeneratePlaneDescriptors(std::stringstream& shaderStr, - VkImageAspectFlags& imageAspects, - const char *imageName, - VkFormat imageFormat, - bool isInput, - uint32_t startBinding, - uint32_t set, - bool imageArray) +/** + * @brief Generates GLSL function for fetching Y samples from a buffer + * + * Creates a helper function that reads Y samples from a buffer and + * normalizes values to 0.0-1.0 range, handling different bit depths. 
+ * + * @param shaderStr Output stringstream where the GLSL code will be written + * @param isHighBitDepth Whether the Y data is high bit depth (>8 bits) + * @param bitDepth The bit depth of Y samples (8, 10, 12, or 16) + */ +static void GenFetchYFromBufferFunc(std::stringstream& shaderStr, + bool isHighBitDepth, uint32_t bitDepth) { + shaderStr << "// Function to fetch Y component from buffer\n" + << "float fetchYFromBuffer(uint index) {\n"; + + if (isHighBitDepth) { + shaderStr << " uint16_t rawValue = inputBufferY.data[index];\n" + << " return extractHighBitDepth(rawValue);\n"; + } else { + shaderStr << " uint8_t byteValue = inputBufferY.data[index];\n" + << " return float(byteValue) / 255.0;\n"; + } + + shaderStr << "}\n\n"; +} + +/** + * @brief Generates GLSL functions for fetching Cb and Cr samples from buffers + * + * Creates helper functions to read Cb and Cr chroma samples from buffers and + * normalize values to 0.0-1.0 range, handling different bit depths. + * + * @param shaderStr Output stringstream where the GLSL code will be written + * @param isHighBitDepth Whether the chroma data is high bit depth (>8 bits) + * @param bitDepth The bit depth of chroma samples (8, 10, 12, or 16) + */ +static void GenFetchCbCrFromBufferFunc(std::stringstream& shaderStr, + bool isHighBitDepth, uint32_t bitDepth) { + // Cb fetch function + shaderStr << "// Function to fetch Cb component from buffer\n" + << "float fetchCbFromBuffer(uint index) {\n"; + + if (isHighBitDepth) { + shaderStr << " uint16_t rawValue = inputBufferCb.data[index];\n" + << " return extractHighBitDepth(rawValue);\n"; + } else { + shaderStr << " uint8_t byteValue = inputBufferCb.data[index];\n" + << " return float(byteValue) / 255.0;\n"; + } + + shaderStr << "}\n\n"; + + // Cr fetch function + shaderStr << "// Function to fetch Cr component from buffer\n" + << "float fetchCrFromBuffer(uint index) {\n"; + + if (isHighBitDepth) { + shaderStr << " uint16_t rawValue = inputBufferCr.data[index];\n" + << " 
return extractHighBitDepth(rawValue);\n"; + } else { + shaderStr << " uint8_t byteValue = inputBufferCr.data[index];\n" + << " return float(byteValue) / 255.0;\n"; + } + + shaderStr << "}\n\n"; +} + +/** + * @brief Generates GLSL function for extracting and normalizing high bit-depth values + * + * Creates a helper function to extract and normalize values from high bit-depth + * formats (10, 12, or 16 bits), handling MSB or LSB aligned data. + * + * @param shaderStr Output stringstream where the GLSL code will be written + * @param isMSB Whether the high bits are MSB-aligned (true) or LSB-aligned (false) + * @param bitDepth The bit depth of the samples (10, 12, or 16) + */ +static void GenExtractHighBitDepthFunc(std::stringstream& shaderStr, + bool isMSB, uint32_t bitDepth) +{ + shaderStr << "// Helper function to extract and normalize high bit-depth values\n"; + + if (isMSB) { + // For MSB-aligned data + shaderStr << "float extractHighBitDepth(uint value) {\n" + << " // For MSB-aligned " << bitDepth << "-bit data, shift right to extract the bits\n" + << " uint extractedValue = value >> (16u - " << bitDepth << "u);\n" + << " // Normalize to 0.0-1.0 range\n" + << " return float(extractedValue) / " << ((1 << bitDepth) - 1) << ".0;\n" + << "}\n\n"; + } else { + // For LSB-aligned data + shaderStr << "float extractHighBitDepth(uint value) {\n" + << " // For LSB-aligned " << bitDepth << "-bit data, mask to extract the bits\n" + << " uint extractedValue = value & " << ((1 << bitDepth) - 1) << "u;\n" + << " // Normalize to 0.0-1.0 range\n" + << " return float(extractedValue) / " << ((1 << bitDepth) - 1) << ".0;\n" + << "}\n\n"; + } +} + +/** + * @brief Generates GLSL code for applying MSB-to-LSB bit shifting for high bit-depth content + * + * Creates code to convert MSB-aligned high bit-depth content to normalized values: + * - For images (floating point): Divide by the appropriate factor + * - For buffers (integer): Perform right bit shift operations + * + * @param 
shaderStr Output stringstream where the GLSL code will be written + * @param isInputBuffer Whether the input is a buffer (true) or image (false) + * @param inputBitDepth The bit depth of the input data (8, 10, 12, or 16) + * @param imageAspects Image aspect flags indicating which planes are being processed + */ +static void GenApplyMsbToLsbShift(std::stringstream& shaderStr, + bool isInputBuffer, + uint32_t inputBitDepth, + VkImageAspectFlags imageAspects) +{ + // Only apply for high bit-depth formats (10/12-bit) + if ((inputBitDepth != 10) && (inputBitDepth != 12)) { + return; + } + + // Calculate shift amount based on bit depth + uint32_t shiftAmount = 16 - inputBitDepth; + float shiftFactor = static_cast(1 << shiftAmount); + + shaderStr << "\n // MSB-to-LSB shift for high bit-depth " + << (isInputBuffer ? "buffer" : "image") << " data\n"; + + if (isInputBuffer) { + // For buffers, we use actual bit shifting operations on integer values + shaderStr << " // For high bit-depth data in buffers, we need to shift right by " + << shiftAmount << " bits to convert from MSB-aligned to actual values\n" + << " // This is a right shift operation for integer values\n"; + + // Build a condition mask based on which components are being read + std::string maskCondition = ""; + bool needsOr = false; + + if (imageAspects & VK_IMAGE_ASPECT_PLANE_0_BIT) { + maskCondition += "YCbCrRawOut.x > 0.0"; + needsOr = true; + } + + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + if (needsOr) maskCondition += " || "; + maskCondition += "YCbCrRawOut.y > 0.0"; + needsOr = true; + } + + if (imageAspects & VK_IMAGE_ASPECT_PLANE_2_BIT) { + if (needsOr) maskCondition += " || "; + maskCondition += "YCbCrRawOut.z > 0.0"; + } + + // Only apply shift if there are values to shift + if (!maskCondition.empty()) { + shaderStr << " if (" << maskCondition << ") {\n" + << " // Convert from uint values to normalized float (for buffer inputs)\n"; + + if (inputBitDepth == 10) { + shaderStr << " // For 10-bit: 
Convert 10-bit values [0-1023] to normalized [0-1]\n" + << " const float normFactor = 1.0 / 1023.0;\n"; + } else { // 12-bit + shaderStr << " // For 12-bit: Convert 12-bit values [0-4095] to normalized [0-1]\n" + << " const float normFactor = 1.0 / 4095.0;\n"; + } + + // Apply right shift with bit mask to extract the actual bit values + // For 10-bit: (value >> 6) & 0x3FF = value / 64 (rounded down) + // For 12-bit: (value >> 4) & 0xFFF = value / 16 (rounded down) + shaderStr << " // Apply right shift to convert from MSB-aligned to actual bit values\n"; + + // Apply component-specific shifting based on which aspects are being read + if (imageAspects & VK_IMAGE_ASPECT_PLANE_0_BIT) { + shaderStr << " YCbCrRawOut.x = floor(YCbCrRawOut.x / " << shiftFactor + << ".0) * normFactor;\n"; + } + + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + shaderStr << " YCbCrRawOut.y = floor(YCbCrRawOut.y / " << shiftFactor + << ".0) * normFactor;\n"; + } + + if (imageAspects & VK_IMAGE_ASPECT_PLANE_2_BIT) { + shaderStr << " YCbCrRawOut.z = floor(YCbCrRawOut.z / " << shiftFactor + << ".0) * normFactor;\n"; + } + + shaderStr << " }\n"; + } + } else { + // For images, we're already working with normalized values, so we divide by shiftFactor + shaderStr << " // For high bit-depth data in images that are MSB-aligned,\n" + << " // we need to divide by " << shiftFactor << " to get the proper normalized values\n"; + + // Build a shift mask based on which components are being read + std::string shiftMask = "vec3("; + shiftMask += (imageAspects & VK_IMAGE_ASPECT_PLANE_0_BIT) ? "1.0, " : "0.0, "; + shiftMask += (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) ? "1.0, " : "0.0, "; + shiftMask += (imageAspects & VK_IMAGE_ASPECT_PLANE_2_BIT) ? 
"1.0" : "0.0"; + shiftMask += ")"; + + // Calculate reciprocal of shift factor (for multiplication instead of division) + float shiftFactorRecip = 1.0f / shiftFactor; + + // Only apply shift to the components that were actually read + shaderStr << " // Apply multiplication by reciprocal instead of division (more efficient)\n" + << " const float shiftFactorRecip = " << std::fixed << std::setprecision(8) << shiftFactorRecip << "f;\n" + << " YCbCrRawOut = YCbCrRawOut * shiftFactorRecip * " << shiftMask << " + \n" + << " YCbCrRawOut * (vec3(1.0) - " << shiftMask << ");\n"; + } +} + +/** + * @brief Generates GLSL function for reading YCbCr data from either buffer or image sources + * + * Creates a function that reads YCbCr data from the appropriate source (buffer or image) + * based on the input format configuration. Handles different bit depths and plane layouts. + * + * @param shaderStr Output stringstream where the GLSL code will be written + * @param isInputBuffer Whether the input is a buffer (true) or image (false) + * @param inputBitDepth The bit depth of the input data (8, 10, 12, or 16) + * @param isInputTwoPlane Whether the input has two planes (e.g., NV12) or three planes + */ +static void GenReadYCbCrBuffer(std::stringstream& shaderStr, + bool isInputBuffer, + uint32_t inputBitDepth, + bool isInputTwoPlane, + bool enableMsbToLsbShift = false, + VkImageAspectFlags imageAspects = VK_IMAGE_ASPECT_PLANE_0_BIT | + VK_IMAGE_ASPECT_PLANE_1_BIT | + VK_IMAGE_ASPECT_PLANE_2_BIT, + const char* useProcessChromaBool = "processChroma") +{ + // Generate function to read from either buffer or image + shaderStr << + "// Function to read YCbCr data from input source (buffer or image)\n" + "vec3 readYCbCrFromSource(ivec2 pos, ivec2 chromaPos, uint srcLayer, bool processChroma) {\n" + " // Initialize to YCbCr black values (for limited range)\n"; + + // Set appropriate black values based on bit depth + if (inputBitDepth == 8) { + shaderStr << " vec3 YCbCrRawOut = 
vec3(16.0/255.0, 128.0/255.0, 128.0/255.0);\n\n"; + } else if (inputBitDepth == 10) { + shaderStr << " vec3 YCbCrRawOut = vec3(64.0/1023.0, 512.0/1023.0, 512.0/1023.0);\n\n"; + } else if (inputBitDepth == 12) { + shaderStr << " vec3 YCbCrRawOut = vec3(256.0/4095.0, 2048.0/4095.0, 2048.0/4095.0);\n\n"; + } else if (inputBitDepth == 16) { + shaderStr << " vec3 YCbCrRawOut = vec3(4096.0/65535.0, 32768.0/65535.0, 32768.0/65535.0);\n\n"; + } else { + // Default fallback + shaderStr << " vec3 YCbCrRawOut = vec3(16.0/255.0, 128.0/255.0, 128.0/255.0);\n\n"; + } + + if (isInputBuffer) { + // Reading from buffer + shaderStr << " // Reading from buffer source\n"; + + // Read Y component if PLANE_0_BIT is set + if (imageAspects & VK_IMAGE_ASPECT_PLANE_0_BIT) { + shaderStr << + " // Calculate buffer index for Y plane\n" + " uint yIndex = pushConstants.inYOffset + pos.y * pushConstants.inYPitch + pos.x;\n" + " YCbCrRawOut.x = fetchYFromBuffer(yIndex);\n\n"; + } + + // Read Cb/Cr components based on plane format and aspect flags + if ((imageAspects & (VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT)) != 0) { + // Add conditional check for chroma processing + shaderStr << " // Process chroma data conditionally\n" + << " if (processChroma) {\n"; + + if (isInputTwoPlane) { + // Two-plane input buffer format with interleaved CbCr + shaderStr << " // Read interleaved CbCr data from 2-plane input buffer\n" + << " uint cbcrIndex = pushConstants.inCbOffset + chromaPos.y * pushConstants.inCbPitch + chromaPos.x * 2;\n"; + + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + shaderStr << " YCbCrRawOut.y = fetchCbFromBuffer(cbcrIndex);\n" + << " YCbCrRawOut.z = fetchCrFromBuffer(cbcrIndex + 1);\n"; + } + } else { + // Three-plane input buffer format with separate Cb and Cr planes + shaderStr << " // Read separate Cb and Cr from 3-plane input buffer\n"; + + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + shaderStr << " uint cbIndex = pushConstants.inCbOffset + chromaPos.y * 
pushConstants.inCbPitch + chromaPos.x;\n" + << " YCbCrRawOut.y = fetchCbFromBuffer(cbIndex);\n"; + } + + if (imageAspects & VK_IMAGE_ASPECT_PLANE_2_BIT) { + shaderStr << " uint crIndex = pushConstants.inCrOffset + chromaPos.y * pushConstants.inCrPitch + chromaPos.x;\n" + << " YCbCrRawOut.z = fetchCrFromBuffer(crIndex);\n"; + } + } + + // Close the conditional block + shaderStr << " }\n"; + } + } else { + // Reading from image + shaderStr << " // Reading from image source\n"; + + // Read Y component if PLANE_0_BIT is set + if (imageAspects & VK_IMAGE_ASPECT_PLANE_0_BIT) { + shaderStr << " // Read Y value from Y plane\n" + << " YCbCrRawOut.x = imageLoad(inputImageY, ivec3(pos, srcLayer)).r;\n\n"; + } + + // Read Cb/Cr components based on plane format and aspect flags + if ((imageAspects & (VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT)) != 0) { + // Add conditional check for chroma processing + shaderStr << " // Process chroma data conditionally\n" + << " if (processChroma) {\n"; + + if (isInputTwoPlane) { + // Two-plane input image format with interleaved CbCr + shaderStr << " // Read interleaved CbCr data from 2-plane input image\n"; + + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + // For two-plane formats (NV12, etc.), both Cb and Cr are in the second plane + shaderStr << " YCbCrRawOut.yz = imageLoad(inputImageCbCr, ivec3(chromaPos, srcLayer)).rg;\n"; + } + } else { + // Three-plane input image format with separate Cb and Cr planes + shaderStr << " // Read separate Cb and Cr from 3-plane input image\n"; + + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + shaderStr << " YCbCrRawOut.y = imageLoad(inputImageCb, ivec3(chromaPos, srcLayer)).r; // Cb\n"; + } + + if (imageAspects & VK_IMAGE_ASPECT_PLANE_2_BIT) { + shaderStr << " YCbCrRawOut.z = imageLoad(inputImageCr, ivec3(chromaPos, srcLayer)).r; // Cr\n"; + } + } + + // Close the conditional block + shaderStr << " }\n"; + } + } + + // Apply MSB-to-LSB shift if enabled + if (enableMsbToLsbShift) 
{
+        GenApplyMsbToLsbShift(shaderStr, isInputBuffer, inputBitDepth, imageAspects);
+    }
+
+    // Return the raw YCbCr values
+    shaderStr <<
+        "\n    return YCbCrRawOut;\n"
+        "}\n\n";
+}
+
+/**
+ * @brief Generates GLSL function for applying LSB-to-MSB bit shifting for high bit-depth content
+ *
+ * Creates code to convert normalized values to MSB-aligned high bit-depth content by
+ * applying the appropriate bit shift. This function only handles the shift calculation,
+ * not the actual I/O operations.
+ *
+ * @param shaderStr Output stringstream where the GLSL code will be written
+ * @param isOutputBuffer Whether the output is a buffer (true) or image (false)
+ * @param outputBitDepth The bit depth of the output data (8, 10, 12, or 16)
+ */
+static void GenApplyLsbToMsbShift(std::stringstream& shaderStr,
+                                  bool isOutputBuffer,
+                                  uint32_t outputBitDepth)
+{
+    // Only apply for high bit-depth formats (10/12-bit)
+    if ((outputBitDepth != 10) && (outputBitDepth != 12)) {
+        // For 8-bit or 16-bit, no shift is needed - just use the input values directly
+        shaderStr << "    // No bit-depth shift needed for " << outputBitDepth << "-bit format\n\n";
+        return;
+    }
+
+    // Calculate shift amount based on bit depth
+    uint32_t shiftAmount = 16 - outputBitDepth;
+    float shiftFactor = static_cast<float>(1 << shiftAmount);
+
+    shaderStr << "    // Apply LSB-to-MSB shift for high bit-depth "
+              << (isOutputBuffer ? 
"buffer" : "image") << " data\n"; + + if (isOutputBuffer) { + // For buffers, we'll return unshifted values because the packing functions + // handle the bit shifting during the actual write operation + shaderStr << " // For buffer output, shift will be applied during packing\n\n"; + } else { + // For images, we need to multiply by shift factor to align bits properly + // Calculate multiplication factor + shaderStr << " // For image output with " << outputBitDepth << "-bit, multiply by " << shiftFactor + << " to shift into the MSB\n" + << " const float shiftFactorMultiplier = " << shiftFactor << ";\n" + << " YCbCrRawIn = YCbCrRawIn * shiftFactorMultiplier;\n\n"; + } +} + +/** + * @brief Generates GLSL function for writing YCbCr data to either buffer or image destinations + * + * Creates a function that writes YCbCr data to the appropriate destination (buffer or image) + * based on the output format configuration. Handles different bit depths and plane layouts. + * + * @param shaderStr Output stringstream where the GLSL code will be written + * @param isOutputBuffer Whether the output is a buffer (true) or image (false) + * @param outputBitDepth The bit depth of the output data (8, 10, 12, or 16) + * @param isOutputTwoPlane Whether the output format has two planes (e.g., NV12) or three planes + */ +static void GenWriteYCbCrBuffer(std::stringstream& shaderStr, + bool isOutputBuffer, + uint32_t outputBitDepth, + bool isOutputTwoPlane, + bool enableLsbToMsbShift = false, + VkImageAspectFlags imageAspects = VK_IMAGE_ASPECT_PLANE_0_BIT | + VK_IMAGE_ASPECT_PLANE_1_BIT | + VK_IMAGE_ASPECT_PLANE_2_BIT, + const char* useProcessChromaBool = "processChroma") +{ + // Generate function to write to either buffer or image + shaderStr << + "// Function to write YCbCr data to output destination (buffer or image)\n" + "void writeYCbCrToDestination(vec3 YCbCrRawIn, ivec2 pos, ivec2 chromaPos, uint dstLayer, bool processChroma) {\n"; + + // Apply LSB-to-MSB shift if enabled - just 
transforms the values, doesn't do I/O + if (enableLsbToMsbShift) { + GenApplyLsbToMsbShift(shaderStr, isOutputBuffer, outputBitDepth); + } + + if (isOutputBuffer) { + // Writing to buffer + shaderStr << + " // Writing to buffer destination\n"; + + // Write Y component if PLANE_0_BIT is set + if (imageAspects & VK_IMAGE_ASPECT_PLANE_0_BIT) { + shaderStr << + " // Calculate buffer index for Y plane\n" + " uint outYIndex = pushConstants.outYOffset + pos.y * pushConstants.outYPitch + pos.x;\n\n"; + + // Handle normal Y component based on bit depth + if (outputBitDepth > 8) { + // For high bit-depth formats + switch (outputBitDepth) { + case 10: + shaderStr << " outputBufferY.data[outYIndex] = pack10BitTo16Bit(YCbCrRawIn.x);\n\n"; + break; + case 12: + shaderStr << " outputBufferY.data[outYIndex] = pack12BitTo16Bit(YCbCrRawIn.x);\n\n"; + break; + case 16: + default: + // For 16-bit, direct value + shaderStr << " outputBufferY.data[outYIndex] = uint16_t(clamp(YCbCrRawIn.x, 0.0, 65535.0));\n\n"; + break; + } + } else { + // For 8-bit formats + shaderStr << " outputBufferY.data[outYIndex] = uint8_t(clamp(YCbCrRawIn.x, 0.0, 255.0));\n\n"; + } + } + + // Write Cb/Cr components based on plane format and aspect flags + if ((imageAspects & (VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT)) != 0) { + shaderStr << " // Process chroma data conditionally\n" + << " if (processChroma) {\n"; + + if (isOutputTwoPlane) { + // Two-plane output buffer format with interleaved CbCr + shaderStr << " // Write interleaved CbCr to 2-plane output buffer\n" + << " uint outCbCrIndex = pushConstants.outCbOffset + chromaPos.y * pushConstants.outCbPitch + chromaPos.x * 2;\n"; + + // Normal CbCr processing + if (outputBitDepth > 8) { + // For high bit-depth formats with interleaved data + switch (outputBitDepth) { + case 10: + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + shaderStr << " outputBufferCbCr.data[outCbCrIndex] = pack10BitTo16Bit(YCbCrRawIn.y);\n" + << " 
outputBufferCbCr.data[outCbCrIndex + 1] = pack10BitTo16Bit(YCbCrRawIn.z);\n"; + } + break; + case 12: + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + shaderStr << " outputBufferCbCr.data[outCbCrIndex] = pack12BitTo16Bit(YCbCrRawIn.y);\n" + << " outputBufferCbCr.data[outCbCrIndex + 1] = pack12BitTo16Bit(YCbCrRawIn.z);\n"; + } + break; + case 16: + default: + // For 16-bit, direct values + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + shaderStr << " outputBufferCbCr.data[outCbCrIndex] = uint16_t(clamp(YCbCrRawIn.y, 0.0, 65535.0));\n" + << " outputBufferCbCr.data[outCbCrIndex + 1] = uint16_t(clamp(YCbCrRawIn.z, 0.0, 65535.0));\n"; + } + break; + } + } else { + // For 8-bit formats + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + shaderStr << " outputBufferCbCr.data[outCbCrIndex] = uint8_t(clamp(YCbCrRawIn.y, 0.0, 255.0));\n" + << " outputBufferCbCr.data[outCbCrIndex + 1] = uint8_t(clamp(YCbCrRawIn.z, 0.0, 255.0));\n"; + } + } + } else { + // Three-plane output buffer format with separate Cb and Cr planes + shaderStr << " // Write separate Cb and Cr to 3-plane output buffer\n"; + + // Calculate indices for separate planes + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + shaderStr << " uint outCbIndex = pushConstants.outCbOffset + chromaPos.y * pushConstants.outCbPitch + chromaPos.x;\n"; + } + + if (imageAspects & VK_IMAGE_ASPECT_PLANE_2_BIT) { + shaderStr << " uint outCrIndex = pushConstants.outCrOffset + chromaPos.y * pushConstants.outCrPitch + chromaPos.x;\n"; + } + + if (outputBitDepth > 8) { + // For high bit-depth formats + switch (outputBitDepth) { + case 10: + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + shaderStr << " outputBufferCb.data[outCbIndex] = pack10BitTo16Bit(YCbCrRawIn.y);\n"; + } + if (imageAspects & VK_IMAGE_ASPECT_PLANE_2_BIT) { + shaderStr << " outputBufferCr.data[outCrIndex] = pack10BitTo16Bit(YCbCrRawIn.z);\n"; + } + break; + case 12: + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + shaderStr << " 
outputBufferCb.data[outCbIndex] = pack12BitTo16Bit(YCbCrRawIn.y);\n"; + } + if (imageAspects & VK_IMAGE_ASPECT_PLANE_2_BIT) { + shaderStr << " outputBufferCr.data[outCrIndex] = pack12BitTo16Bit(YCbCrRawIn.z);\n"; + } + break; + case 16: + default: + // For 16-bit, direct values + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + shaderStr << " outputBufferCb.data[outCbIndex] = uint16_t(clamp(YCbCrRawIn.y, 0.0, 65535.0));\n"; + } + if (imageAspects & VK_IMAGE_ASPECT_PLANE_2_BIT) { + shaderStr << " outputBufferCr.data[outCrIndex] = uint16_t(clamp(YCbCrRawIn.z, 0.0, 65535.0));\n"; + } + break; + } + } else { + // For 8-bit formats + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + shaderStr << " outputBufferCb.data[outCbIndex] = uint8_t(clamp(YCbCrRawIn.y, 0.0, 255.0));\n"; + } + if (imageAspects & VK_IMAGE_ASPECT_PLANE_2_BIT) { + shaderStr << " outputBufferCr.data[outCrIndex] = uint8_t(clamp(YCbCrRawIn.z, 0.0, 255.0));\n"; + } + } + } + + shaderStr << " }\n"; // Close conditional chroma processing + } + } else { + // Writing to image + shaderStr << " // Writing to image destination\n"; + + // Write Y component if PLANE_0_BIT is set + if (imageAspects & VK_IMAGE_ASPECT_PLANE_0_BIT) { + shaderStr << " // Write Y component to Y plane\n" + << " imageStore(outputImageY, ivec3(pos, dstLayer), vec4(YCbCrRawIn.x, 0, 0, 1));\n\n"; + } + + // Write Cb/Cr components if their aspect flags are set + if ((imageAspects & (VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT)) != 0) { + // Add conditional check for chroma processing + shaderStr << " // Process chroma data conditionally\n" + << " if (processChroma) {\n"; + + if (isOutputTwoPlane) { + // Two-plane output image format with interleaved CbCr + if ((imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) != 0) { + // Both Cb and Cr are needed + shaderStr << " // Write interleaved CbCr to 2-plane output image\n" + << " imageStore(outputImageCbCr, ivec3(chromaPos, dstLayer), " + << "vec4(YCbCrRawIn.y, YCbCrRawIn.z, 0, 
1));\n"; + } + } else { + // Three-plane output image format with separate Cb and Cr planes + shaderStr << " // Write separate Cb and Cr to 3-plane output image\n"; + + if (imageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + shaderStr << " imageStore(outputImageCb, ivec3(chromaPos, dstLayer), vec4(YCbCrRawIn.y, 0, 0, 1));\n"; + } + + if (imageAspects & VK_IMAGE_ASPECT_PLANE_2_BIT) { + shaderStr << " imageStore(outputImageCr, ivec3(chromaPos, dstLayer), vec4(YCbCrRawIn.z, 0, 0, 1));\n"; + } + } + + // Close the conditional block + shaderStr << " }\n"; + } + } + + // End the function + shaderStr << "}\n\n"; +} + +uint32_t VulkanFilterYuvCompute::ShaderGenerateImagePlaneDescriptors(std::stringstream& shaderStr, + VkImageAspectFlags& imageAspects, + const char *imageName, + VkFormat imageFormat, + bool isInput, + uint32_t startBinding, + uint32_t set, + bool imageArray) +{ + shaderStr << " // The " << (isInput ? "input" : "output") << " image binding\n"; // Image binding goes in this pattern: // offset 0: RGBA image // offset 1: multi-planar image plane Y @@ -267,7 +1183,8 @@ void VulkanFilterYuvCompute::ShaderGeneratePlaneDescriptors(std::stringstream& s } else if (inputMpInfo->planesLayout.numberOfExtraPlanes == 2) { - imageAspects = VK_IMAGE_ASPECT_PLANE_0_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT; + imageAspects = VK_IMAGE_ASPECT_PLANE_0_BIT | VK_IMAGE_ASPECT_PLANE_1_BIT | + VK_IMAGE_ASPECT_PLANE_2_BIT; GenImageIoBindingLayout(shaderStr, imageName, "Cb", vkFormatLookUp(inputMpInfo->vkPlaneFormat[1])->name, @@ -290,10 +1207,631 @@ void VulkanFilterYuvCompute::ShaderGeneratePlaneDescriptors(std::stringstream& s GenImageIoBindingLayout(shaderStr, imageName, "RGB", vkFormatLookUp(imageFormat)->name, isInput, - startBinding, + startBinding++, set, imageArray); } + + return startBinding; +} + +uint32_t VulkanFilterYuvCompute::ShaderGenerateBufferPlaneDescriptors(std::stringstream& shaderStr, + VkImageAspectFlags& imageAspects, + const char *bufferName, + VkFormat bufferFormat, + 
bool isInput, + uint32_t startBinding, + uint32_t set, + VkDescriptorType bufferType) +{ + // Buffer binding follows the same pattern as image binding: + // offset 0: Single RGBA buffer with all data + // offset 1: Y plane buffer + // offset 2: 2-planar CbCr buffer or 3-planar Cb buffer + // offset 3: 3-planar Cr buffer + const VkMpFormatInfo* inputMpInfo = YcbcrVkFormatInfo(bufferFormat); + + // Determine element size based on format + const char* elementType = "uint8_t"; // Default to 8-bit + + shaderStr << " // The " << (isInput ? "input" : "output") << " buffer binding\n"; + // Check format for higher bit depths (16-bit formats) + const VkFormatDesc* formatInfo = vkFormatLookUp(bufferFormat); + if (formatInfo && formatInfo->name) { + if (strstr(formatInfo->name, "16") != nullptr || + strstr(formatInfo->name, "R16") != nullptr || + strstr(formatInfo->name, "10") != nullptr || + strstr(formatInfo->name, "12") != nullptr) { + elementType = "uint16_t"; // Use 16-bit for 10/12/16-bit formats + } + } + + if (inputMpInfo) { + // For multi-planar formats, define separate buffers for each plane + + // Y plane buffer (plane 0) + GenBufferIoBindingLayout(shaderStr, bufferName, "Y", + elementType, + bufferType, + isInput, + ++startBinding, + set); + + if (inputMpInfo->planesLayout.numberOfExtraPlanes == 1) { + // 2-plane format (NV12, NV21, etc.) + imageAspects = VK_IMAGE_ASPECT_PLANE_0_BIT | VK_IMAGE_ASPECT_PLANE_1_BIT; + + GenBufferIoBindingLayout(shaderStr, bufferName, "CbCr", + elementType, + bufferType, + isInput, + ++startBinding, + set); + + } else if (inputMpInfo->planesLayout.numberOfExtraPlanes == 2) { + // 3-plane format (YUV 4:2:0, 4:2:2, 4:4:4, etc.) 
+ imageAspects = VK_IMAGE_ASPECT_PLANE_0_BIT | VK_IMAGE_ASPECT_PLANE_1_BIT | + VK_IMAGE_ASPECT_PLANE_2_BIT; + + GenBufferIoBindingLayout(shaderStr, bufferName, "Cb", + elementType, + bufferType, + isInput, + ++startBinding, + set); + + GenBufferIoBindingLayout(shaderStr, bufferName, "Cr", + elementType, + bufferType, + isInput, + ++startBinding, + set); + } + } else { + // For single-plane formats (like RGBA) + imageAspects = VK_IMAGE_ASPECT_COLOR_BIT; + + GenBufferIoBindingLayout(shaderStr, bufferName, "RGB", + elementType, + bufferType, + isInput, + startBinding++, + set); + } + + return startBinding; +} + + +uint32_t VulkanFilterYuvCompute::ShaderGeneratePlaneDescriptors(std::stringstream& shaderStr, + bool isInput, + uint32_t startBinding, + uint32_t set, + bool imageArray, + VkDescriptorType bufferType) +{ + + if ((isInput && m_inputIsBuffer) || (!isInput && m_outputIsBuffer)) { + + return ShaderGenerateBufferPlaneDescriptors(shaderStr, + isInput ? m_inputImageAspects : m_outputImageAspects, + isInput ? "inputBuffer" : "outputBuffer", + isInput ? m_inputFormat : m_outputFormat, + isInput, // isInput + startBinding, // startBinding + set, // set + bufferType); + } else { + + return ShaderGenerateImagePlaneDescriptors(shaderStr, + isInput ? m_inputImageAspects : m_outputImageAspects, + isInput ? "inputImage" : "outputImage", + isInput ? m_inputFormat : m_outputFormat, + isInput, // isInput + startBinding, // startBinding + set, // set + imageArray // imageArray + ); + } +} + +/** + * @brief Generates GLSL functions for YCbCr normalization with different bit depths + * + * Creates helper functions to normalize YCbCr values, handling different bit depths, + * and applying proper range adjustments (limited/full range). + * + * Process steps: + * 1. Calculate normalization parameters based on bit depth and range + * 2. Generate Y normalization function (scaling + offset) + * 3. Generate CbCr shifting functions (centering around zero) + * 4. 
Generate CbCr normalization functions (scaling + offset) + * 5. Generate bit-depth specific helpers for 10/12-bit formats + * + * @param shaderStr Output stringstream where the GLSL code will be written + * @param bitDepth The bit depth of the YCbCr data (8, 10, 12, or 16) + * @param isLimitedRange Whether values are limited range (true) or full range (false) + * @param hasChroma Whether to include chroma normalization functions + */ +static void GenYCbCrNormalizationFuncs(std::stringstream& shaderStr, + uint32_t bitDepth = 8, + bool isLimitedRange = true, + bool hasChroma = true) +{ + // STEP 1: Calculate normalization parameters based on bit depth and range + // =========================================================================== + + // Use double precision for calculations to maintain precision + double maxValue = (1ULL << bitDepth) - 1.0; // Max value for the given bit depth + + // Limited range values for different bit depths + double yBlack, yWhite, cZero, cScale; + + if (isLimitedRange) { + // Step 1.1: Calculate limited range (aka TV/Video range) values + // Use standard-compliant values for different bit depths + switch (bitDepth) { + case 10: + // 10-bit limited range: Y[64,940], C[64,960] + yBlack = 64.0; + yWhite = 940.0; + cZero = 64.0; + cScale = 896.0; // 960 - 64 + break; + case 12: + // 12-bit limited range: Y[256,3760], C[256,3840] + yBlack = 256.0; + yWhite = 3760.0; + cZero = 256.0; + cScale = 3584.0; // 3840 - 256 + break; + case 16: + // 16-bit limited range: scale 8-bit values by 2^8 + yBlack = 16.0 * 256.0; + yWhite = 235.0 * 256.0; + cZero = 16.0 * 256.0; + cScale = 224.0 * 256.0; + break; + case 8: + default: + // 8-bit limited range: Y[16,235], C[16,240] + yBlack = 16.0; + yWhite = 235.0; + cZero = 16.0; + cScale = 224.0; + break; + } + } else { + // Step 1.2: Calculate full range values (same for all bit depths, just scaled) + yBlack = 0.0; + yWhite = maxValue; + cZero = 0.0; + cScale = maxValue; + } + + // Step 1.3: Calculate 
normalization factors with double precision + double yRange = yWhite - yBlack; + double yFactor = 1.0 / yRange; + double yOffset = -yBlack * yFactor; + double cFactor = 1.0 / cScale; + + // Format values with high precision for GLSL + std::stringstream ss; + ss.precision(16); // Use high precision for constants + + // STEP 2: Generate Y normalization function + // =========================================================================== + shaderStr << "\n" + << "// Specify high precision for all floating point calculations\n" + << "precision highp float;\n" + << "precision highp int;\n" + << "\n" + << "// STEP 1: Normalize Y component for " << bitDepth << "-bit " + << (isLimitedRange ? "limited range" : "full range") << " content\n" + << "highp float normalizeY(highp float Y) {\n"; + + if (isLimitedRange) { + // Step 2.1: Limited range needs black level adjustment and scaling + // Format with high precision + ss.str(""); + ss << std::fixed << yFactor; + std::string yFactorStr = ss.str(); + + ss.str(""); + ss << std::fixed << yOffset; + std::string yOffsetStr = ss.str(); + + shaderStr << " // Step 1.1: Map from [" << yBlack << ", " << yWhite << "] to [0.0, 1.0]\n" + << " // Formula: normalizedY = (Y - yBlack) / yRange = Y * yFactor + yOffset\n" + << " return Y * " << yFactorStr << " + " << yOffsetStr << ";\n"; + } else { + // Step 2.2: Full range just needs scaling + shaderStr << " // Step 1.1: Map from [0, " << maxValue << "] to [0.0, 1.0]\n" + << " // Formula: normalizedY = Y / maxValue\n" + << " return Y / " << maxValue << ";\n"; + } + shaderStr << "}\n\n"; + + if (hasChroma) { + // STEP 3: Generate CbCr shifting functions + // =========================================================================== + + // Step 3.1: Generate CbCr shifting function for vec2 (common for 2-plane formats) + shaderStr << "// STEP 2: Shift CbCr components from centered range to [-0.5, 0.5] range\n" + << "highp vec2 shiftCbCr(highp vec2 CbCr) {\n" + << " // Step 2.1: Shift from 
[0.0, 1.0] to [-0.5, 0.5]\n" + << " return CbCr - 0.5;\n" + << "}\n\n"; + + // Step 3.2: Generate CbCr shifting function for vec3 (for full YCbCr triplet) + shaderStr << "// Step 2 (alternative): Shift YCbCr components, leaving Y alone but centering CbCr\n" + << "highp vec3 shiftCbCr(highp vec3 ycbcr) {\n" + << " // Step 2.1: Shift only Cb and Cr from [0.0, 1.0] to [-0.5, 0.5]\n" + << " const highp vec3 shift = vec3(0.0, -0.5, -0.5);\n" + << " return ycbcr + shift;\n" + << "}\n\n"; + + // STEP 4: Generate CbCr normalization function + // =========================================================================== + shaderStr << "// STEP 3: Normalize CbCr components for " << bitDepth << "-bit " + << (isLimitedRange ? "limited range" : "full range") << " content\n" + << "highp vec2 normalizeCbCr(highp vec2 CbCr) {\n"; + + if (isLimitedRange) { + // Step 4.1: Limited range needs zero level adjustment and scaling + // Format with high precision + ss.str(""); + ss << std::fixed << cZero; + std::string cZeroStr = ss.str(); + + ss.str(""); + ss << std::fixed << cFactor; + std::string cFactorStr = ss.str(); + + shaderStr << " // Step 3.1: Map from [" << cZero << ", " << (cZero + cScale) << "] to [0.0, 1.0]\n" + << " // Formula: normalizedCbCr = (CbCr - cZero) / cScale\n" + << " return (CbCr - " << cZeroStr << ") * " << cFactorStr << ";\n"; + } else { + // Step 4.2: Full range just needs scaling + shaderStr << " // Step 3.1: Map from [0, " << maxValue << "] to [0.0, 1.0]\n" + << " // Formula: normalizedCbCr = CbCr / maxValue\n" + << " return CbCr / " << maxValue << ";\n"; + } + shaderStr << "}\n\n"; + } + + // STEP 5: Generate bit-depth specific helper functions for 10/12-bit formats + // =========================================================================== + if (bitDepth == 10) { + shaderStr << "// STEP 4: Special 10-bit format handling functions\n" + << "// 10-bit packing formats often store values in uint16 or uint32 with specific bit layouts\n" + << "\n" + << "// 
Extract 10-bit value from 16-bit storage (common for P010, P210, etc.)\n" + << "highp float extract10BitFrom16Bit(highp uint value) {\n" + << " // Most 10-bit formats store the value in the most significant 10 bits\n" + << " highp uint raw10bit = value >> 6; // Shift right to remove 6 padding bits\n" + << " return float(raw10bit);\n" + << "}\n\n" + + << "// Extract 10-bit value from 16-bit storage as normalized float\n" + << "highp float extract10BitNormalized(highp uint value) {\n" + << " highp uint raw10bit = value >> 6; // Shift right to remove 6 padding bits\n" + << " return float(raw10bit) / 1023.0; // Normalize to [0,1]\n" + << "}\n\n" + + << "// Normalize packed 10-bit YUV directly\n" + << "highp vec3 normalize10BitYUV(highp uvec3 packedYuv) {\n" + << " // Extract 10-bit components\n" + << " highp float y = extract10BitFrom16Bit(packedYuv.x);\n" + << " highp float cb = extract10BitFrom16Bit(packedYuv.y);\n" + << " highp float cr = extract10BitFrom16Bit(packedYuv.z);\n" + << " // Normalize components\n" + << " y = normalizeY(y);\n" + << " highp vec2 cbcr = normalizeCbCr(vec2(cb, cr));\n" + << " return vec3(y, cbcr);\n" + << "}\n\n"; + } else if (bitDepth == 12) { + shaderStr << "// STEP 4: Special 12-bit format handling functions\n" + << "// 12-bit packing formats often store values in uint16 or uint32 with specific bit layouts\n" + << "\n" + << "// Extract 12-bit value from 16-bit storage (common for P012, P212, etc.)\n" + << "highp float extract12BitFrom16Bit(highp uint value) {\n" + << " // Most 12-bit formats store the value in the most significant 12 bits\n" + << " highp uint raw12bit = value >> 4; // Shift right to remove 4 padding bits\n" + << " return float(raw12bit);\n" + << "}\n\n" + + << "// Extract 12-bit value from 16-bit storage as normalized float\n" + << "highp float extract12BitNormalized(highp uint value) {\n" + << " highp uint raw12bit = value >> 4; // Shift right to remove 4 padding bits\n" + << " return float(raw12bit) / 4095.0; // 
Normalize to [0,1]\n" + << "}\n\n" + + << "// Normalize packed 12-bit YUV directly\n" + << "highp vec3 normalize12BitYUV(highp uvec3 packedYuv) {\n" + << " // Extract 12-bit components\n" + << " highp float y = extract12BitFrom16Bit(packedYuv.x);\n" + << " highp float cb = extract12BitFrom16Bit(packedYuv.y);\n" + << " highp float cr = extract12BitFrom16Bit(packedYuv.z);\n" + << " // Normalize components\n" + << " y = normalizeY(y);\n" + << " highp vec2 cbcr = normalizeCbCr(vec2(cb, cr));\n" + << " return vec3(y, cbcr);\n" + << "}\n\n"; + } +} + +/** + * @brief Generates GLSL functions for YCbCr denormalization with different bit depths + * + * Creates helper functions to denormalize YCbCr values from normalized [0-1] for Y and + * [-0.5,0.5] for CbCr back to the appropriate bit depth and range (limited or full). + * This is the inverse operation of GenYCbCrNormalizationFuncs. + * + * Process steps: + * 1. Calculate denormalization parameters based on bit depth and range + * 2. Generate Y denormalization function (inverse scaling + offset) + * 3. Generate CbCr unshifting functions (recentering to [0,1]) + * 4. Generate CbCr denormalization functions (inverse scaling + offset) + * 5. Generate combined convenience functions + * 6. 
Generate bit-depth specific packing helpers for 10/12-bit formats + * + * @param shaderStr Output stringstream where the GLSL code will be written + * @param bitDepth The target bit depth for the YCbCr data (8, 10, 12, or 16) + * @param isLimitedRange Whether target values are limited range (true) or full range (false) + * @param hasChroma Whether to include chroma denormalization functions + */ +static void GenYCbCrDeNormalizationFuncs(std::stringstream& shaderStr, + uint32_t bitDepth = 8, + bool isLimitedRange = true, + bool hasChroma = true) +{ + // STEP 1: Calculate denormalization parameters based on bit depth and range + // =========================================================================== + + // Use double precision for calculations to maintain precision + double maxValue = (1ULL << bitDepth) - 1.0; // Max value for the given bit depth + + // Limited range values for different bit depths + double yBlack, yWhite, cZero, cScale; + + if (isLimitedRange) { + // Step 1.1: Calculate limited range (aka TV/Video range) values + // Use standard-compliant values for different bit depths + switch (bitDepth) { + case 10: + // 10-bit limited range: Y[64,940], C[64,960] + yBlack = 64.0; + yWhite = 940.0; + cZero = 64.0; + cScale = 896.0; // 960 - 64 + break; + case 12: + // 12-bit limited range: Y[256,3760], C[256,3840] + yBlack = 256.0; + yWhite = 3760.0; + cZero = 256.0; + cScale = 3584.0; // 3840 - 256 + break; + case 16: + // 16-bit limited range: scale 8-bit values by 2^8 + yBlack = 16.0 * 256.0; + yWhite = 235.0 * 256.0; + cZero = 16.0 * 256.0; + cScale = 224.0 * 256.0; + break; + case 8: + default: + // 8-bit limited range: Y[16,235], C[16,240] + yBlack = 16.0; + yWhite = 235.0; + cZero = 16.0; + cScale = 224.0; + break; + } + } else { + // Step 1.2: Calculate full range values (same for all bit depths, just scaled) + yBlack = 0.0; + yWhite = maxValue; + cZero = 0.0; + cScale = maxValue; + } + + // Step 1.3: Calculate denormalization factors (inverse of 
normalization) + double yRange = yWhite - yBlack; + + // Format values with high precision for GLSL + std::stringstream ss; + ss.precision(16); // Use high precision for constants + + // STEP 2: Generate Y denormalization function + // =========================================================================== + shaderStr << "\n" + << "// Specify high precision for all floating point calculations\n" + << "precision highp float;\n" + << "precision highp int;\n" + << "\n" + << "// STEP 1: Denormalize Y component from [0.0, 1.0] back to " << bitDepth << "-bit " + << (isLimitedRange ? "limited range" : "full range") << " content\n" + << "highp float denormalizeY(highp float normalizedY) {\n"; + + if (isLimitedRange) { + // Step 2.1: Limited range needs scaling and black level adjustment + // Format with high precision + ss.str(""); + ss << std::fixed << yRange; + std::string yRangeStr = ss.str(); + + ss.str(""); + ss << std::fixed << yBlack; + std::string yBlackStr = ss.str(); + + shaderStr << " // Step 1.1: Map from [0.0, 1.0] back to [" << yBlack << ", " << yWhite << "]\n" + << " // Formula: Y = normalizedY * yRange + yBlack\n" + << " return normalizedY * " << yRangeStr << " + " << yBlackStr << ";\n"; + } else { + // Step 2.2: Full range just needs scaling + shaderStr << " // Step 1.1: Map from [0.0, 1.0] back to [0, " << maxValue << "]\n" + << " // Formula: Y = normalizedY * maxValue\n" + << " return normalizedY * " << maxValue << ";\n"; + } + shaderStr << "}\n\n"; + + if (hasChroma) { + // STEP 3: Generate CbCr unshifting function + // =========================================================================== + shaderStr << "// STEP 2: Unshift CbCr components from [-0.5, 0.5] range back to centered range [0.0, 1.0]\n" + << "highp vec2 unshiftCbCr(highp vec2 shiftedCbCr) {\n" + << " // Step 2.1: Shift from [-0.5, 0.5] back to [0.0, 1.0]\n" + << " return shiftedCbCr + 0.5;\n" + << "}\n\n"; + + // STEP 4: Generate CbCr denormalization function + // 
=========================================================================== + shaderStr << "// STEP 3: Denormalize CbCr components from [0.0, 1.0] back to " << bitDepth << "-bit " + << (isLimitedRange ? "limited range" : "full range") << " content\n" + << "highp vec2 denormalizeCbCr(highp vec2 normalizedCbCr) {\n"; + + if (isLimitedRange) { + // Step 4.1: Limited range needs scaling and zero level adjustment + // Format with high precision + ss.str(""); + ss << std::fixed << cScale; + std::string cScaleStr = ss.str(); + + ss.str(""); + ss << std::fixed << cZero; + std::string cZeroStr = ss.str(); + + shaderStr << " // Step 3.1: Map from [0.0, 1.0] back to [" << cZero << ", " << (cZero + cScale) << "]\n" + << " // Formula: CbCr = normalizedCbCr * cScale + cZero\n" + << " return normalizedCbCr * " << cScaleStr << " + " << cZeroStr << ";\n"; + } else { + // Step 4.2: Full range just needs scaling + shaderStr << " // Step 3.1: Map from [0.0, 1.0] back to [0, " << maxValue << "]\n" + << " // Formula: CbCr = normalizedCbCr * maxValue\n" + << " return normalizedCbCr * " << maxValue << ";\n"; + } + shaderStr << "}\n\n"; + + // STEP 5: Generate combined convenience functions + // =========================================================================== + + // Step 5.1: Combined unshift and denormalize + shaderStr << "// STEP 4: Combined function: unshift and denormalize CbCr in one step\n" + << "highp vec2 unshiftAndDenormalizeCbCr(highp vec2 shiftedCbCr) {\n" + << " // Step 4.1: First unshift from [-0.5, 0.5] to [0.0, 1.0], then denormalize\n" + << " return denormalizeCbCr(unshiftCbCr(shiftedCbCr));\n" + << "}\n\n"; + + // Step 5.2: Full YCbCr denormalization + shaderStr << "// STEP 5: Combined function to denormalize full YCbCr triplet\n" + << "highp vec3 denormalizeYCbCr(highp vec3 normalizedYCbCr) {\n" + << " // Step 5.1: Denormalize Y component\n" + << " highp float y = denormalizeY(normalizedYCbCr.x);\n" + << " // Step 5.2: Unshift and denormalize Cb and Cr 
components\n" + << " highp vec2 cbcr = denormalizeCbCr(vec2(normalizedYCbCr.y + 0.5, normalizedYCbCr.z + 0.5));\n" + << " // Step 5.3: Combine the components into a single vector\n" + << " return vec3(y, cbcr);\n" + << "}\n\n"; + } + + // STEP 6: Generate bit-depth specific packing helpers for 10/12-bit formats + // =========================================================================== + if (bitDepth == 10) { + shaderStr << "// STEP 6: Special 10-bit format packing functions\n" + << "// Pack 10-bit values into 16-bit storage (common for P010, P210, etc.)\n" + << "\n" + << "// Pack 10-bit value into 16-bit storage (MSB aligned with padding)\n" + << "highp uint pack10BitTo16Bit(highp float value) {\n" + << " // Clamp the input value to the valid range for 10-bit\n" + << " highp uint raw10bit = uint(clamp(value, 0.0, 1023.0));\n" + << " // Shift left by 6 bits to store in MSB format (standard for P010, etc.)\n" + << " return raw10bit << 6;\n" + << "}\n\n" + + << "// Pack normalized [0,1] value into 10-bit MSB aligned format\n" + << "highp uint packNormalizedTo10Bit(highp float normalizedValue) {\n" + << " // Scale to 10-bit range and pack\n" + << " highp uint raw10bit = uint(clamp(normalizedValue * 1023.0, 0.0, 1023.0));\n" + << " return raw10bit << 6;\n" + << "}\n\n" + + << "// Pack denormalized YUV to 10-bit values\n" + << "highp uvec3 packYUVTo10Bit(highp vec3 yuv) {\n" + << " // Denormalize components first\n" + << " highp vec3 denormYuv = denormalizeYCbCr(yuv);\n" + << " // Pack each component into 16-bit storage (MSB aligned)\n" + << " return uvec3(\n" + << " pack10BitTo16Bit(denormYuv.x), // Y\n" + << " pack10BitTo16Bit(denormYuv.y), // Cb\n" + << " pack10BitTo16Bit(denormYuv.z) // Cr\n" + << " );\n" + << "}\n\n"; + } else if (bitDepth == 12) { + shaderStr << "// STEP 6: Special 12-bit format packing functions\n" + << "// Pack 12-bit values into 16-bit storage (common for P012, P212, etc.)\n" + << "\n" + << "// Pack 12-bit value into 16-bit storage (MSB 
aligned with padding)\n" + << "highp uint pack12BitTo16Bit(highp float value) {\n" + << " // Clamp the input value to the valid range for 12-bit\n" + << " highp uint raw12bit = uint(clamp(value, 0.0, 4095.0));\n" + << " // Shift left by 4 bits to store in MSB format (standard for P012, etc.)\n" + << " return raw12bit << 4;\n" + << "}\n\n" + + << "// Pack normalized [0,1] value into 12-bit MSB aligned format\n" + << "highp uint packNormalizedTo12Bit(highp float normalizedValue) {\n" + << " // Scale to 12-bit range and pack\n" + << " highp uint raw12bit = uint(clamp(normalizedValue * 4095.0, 0.0, 4095.0));\n" + << " return raw12bit << 4;\n" + << "}\n\n" + + << "// Pack denormalized YUV to 12-bit values\n" + << "highp uvec3 packYUVTo12Bit(highp vec3 yuv) {\n" + << " // Denormalize components first\n" + << " highp vec3 denormYuv = denormalizeYCbCr(yuv);\n" + << " // Pack each component into 16-bit storage (MSB aligned)\n" + << " return uvec3(\n" + << " pack12BitTo16Bit(denormYuv.x), // Y\n" + << " pack12BitTo16Bit(denormYuv.y), // Cb\n" + << " pack12BitTo16Bit(denormYuv.z) // Cr\n" + << " );\n" + << "}\n\n"; + } +} + +/** + * @brief Generates GLSL function for YCbCr format conversion with normalization and denormalization + * + * Creates a helper function for converting between different YCbCr formats + * that normalizes input values, then denormalizes to the target format. + * This handles both bit-depth and range conversions. 
+ * + * @param shaderStr Output stringstream where the GLSL code will be written + * @param inputBitDepth The bit depth of input YCbCr data (8, 10, 12, or 16 bits) + * @param outputBitDepth The bit depth of output YCbCr data (8, 10, 12, or 16 bits) + * @param isInputLimitedRange Whether the input uses limited range (true) or full range (false) + * @param isOutputLimitedRange Whether the output uses limited range (true) or full range (false) + */ +static void GenConvertYCbCrFormat(std::stringstream& shaderStr, + uint32_t inputBitDepth = 8, + uint32_t outputBitDepth = 8, + bool isInputLimitedRange = true, + bool isOutputLimitedRange = true) +{ + shaderStr << + "// Function to handle YCbCr format conversion with proper normalization\n" + "vec3 convertYCbCrFormat(vec3 YCbCrRawIn) {\n" + " // Step 1: Normalize input YCbCr values to [0-1] range\n" + " float normalizedY = normalizeY(YCbCrRawIn.x);\n" + " vec2 normalizedCbCr = normalizeCbCr(vec2(YCbCrRawIn.y, YCbCrRawIn.z));\n\n" + " // Step 2: Denormalize to output bit depth and range\n" + " float y = denormalizeY(normalizedY);\n" + " vec2 cbcr = denormalizeCbCr(normalizedCbCr);\n\n" + " // Return the converted values\n" + " return vec3(y, cbcr.x, cbcr.y);\n" + "}\n\n"; } size_t VulkanFilterYuvCompute::InitYCBCR2RGBA(std::string& computeShader) @@ -307,56 +1845,45 @@ size_t VulkanFilterYuvCompute::InitYCBCR2RGBA(std::string& computeShader) // Create compute pipeline std::stringstream shaderStr; + + // 1. Generate header and push constants GenHeaderAndPushConst(shaderStr); + + // 2. 
Generate IO bindings // Input image - shaderStr << " // The input YCbCr image binding\n"; + shaderStr << " // The input YCbCr input binding\n"; + // Input Descriptors ShaderGeneratePlaneDescriptors(shaderStr, - m_inputImageAspects, - "inputImage", - m_inputFormat, true, // isInput 0, // startBinding 0, // set - true // imageArray - ); - - // Output image - shaderStr << " // The output RGBA image binding\n"; - ShaderGeneratePlaneDescriptors(shaderStr, - m_outputImageAspects, - "outputImage", - m_outputFormat, - false, // isInput - 4, // startBinding - 0, // set - true // imageArray - ); - - shaderStr << "\n" - " // TODO: normalize only narrow\n" - "float normalizeY(float Y) {\n" - " // return (Y - (16.0 / 255.0)) * (255.0 / (235.0 - 16.0));\n" - " return (Y - 0.0627451) * 1.164383562;\n" - "}\n" - "\n" - "vec2 shiftCbCr(vec2 CbCr) {\n" - " return CbCr - 0.5;\n" - "}\n" - "\n" - "vec3 shiftCbCr(vec3 ycbcr) {\n" - " const vec3 shiftCbCr = vec3(0.0, -0.5, -0.5);\n" - " return ycbcr + shiftCbCr;\n" - "}\n" - "\n" - " // TODO: normalize only narrow\n" - "vec2 normalizeCbCr(vec2 CbCr) {\n" - " // return (CbCr - (16.0 / 255.0)) / ((240.0 - 16.0) / 255.0);\n" - " return (CbCr - 0.0627451) * 1.138392857;\n" - "}\n" - "\n"; - - const VkSamplerYcbcrConversionCreateInfo& samplerYcbcrConversionCreateInfo = m_samplerYcbcrConversion.GetSamplerYcbcrConversionCreateInfo(); - const VkMpFormatInfo * mpInfo = YcbcrVkFormatInfo(samplerYcbcrConversionCreateInfo.format); + true, + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); + + shaderStr << " // The output RGBA image binding\n"; + // Output Descriptors + ShaderGeneratePlaneDescriptors(shaderStr, + false, // isInput + 4, // startBinding + 0, // set + true, // imageArray + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); + + // Get format information to determine bit depth + const VkSamplerYcbcrConversionCreateInfo& samplerYcbcrConversionCreateInfo = + m_samplerYcbcrConversion.GetSamplerYcbcrConversionCreateInfo(); + const VkMpFormatInfo* mpInfo = 
YcbcrVkFormatInfo(samplerYcbcrConversionCreateInfo.format); + + // Determine bit depth from the format + uint32_t bitDepth = mpInfo ? GetBitsPerChannel(mpInfo->planesLayout) : 8; + + // Determine if we're using limited or full range + bool isLimitedRange = (samplerYcbcrConversionCreateInfo.ycbcrRange == VK_SAMPLER_YCBCR_RANGE_ITU_NARROW); + + // 3. Generate helper functions for YCbCr normalization with proper bit depth handling + GenYCbCrNormalizationFuncs(shaderStr, bitDepth, isLimitedRange, true); + + // 4. Generate YCbCr to RGB conversion function const unsigned int bpp = (8 + mpInfo->planesLayout.bpp * 2); const YcbcrBtStandard btStandard = GetYcbcrPrimariesConstantsId(samplerYcbcrConversionCreateInfo.ycbcrModel); @@ -367,7 +1894,6 @@ size_t VulkanFilterYuvCompute::InitYCBCR2RGBA(std::string& computeShader) rangeConstants.cbMax, rangeConstants.crMax); - shaderStr << "vec3 convertYCbCrToRgb(vec3 yuv) {\n" " vec3 rgb;\n"; @@ -377,7 +1903,7 @@ size_t VulkanFilterYuvCompute::InitYCBCR2RGBA(std::string& computeShader) "}\n" "\n"; - + // 5. Generate color range normalization function YcbcrNormalizeColorRange yCbCrNormalizeColorRange(bpp, (samplerYcbcrConversionCreateInfo.ycbcrModel == VK_SAMPLER_YCBCR_MODEL_CONVERSION_RGB_IDENTITY) ? YCBCR_COLOR_RANGE_NATURAL : (YCBCR_COLOR_RANGE)samplerYcbcrConversionCreateInfo.ycbcrRange); @@ -390,21 +1916,51 @@ size_t VulkanFilterYuvCompute::InitYCBCR2RGBA(std::string& computeShader) "}\n" "\n"; + // 6. Generate function to fetch YCbCr components from images + shaderStr << + "vec3 fetchYCbCrFromImage(ivec3 pos) {\n" + " // Fetch from the texture.\n" + " float Y = imageLoad(inputImageY, pos).r;\n" + " // For subsampled formats, divide by 2\n" + " vec2 CbCr = imageLoad(inputImageCbCr, ivec3(pos.xy/2, pos.z)).rg;\n" + " return vec3(Y, CbCr);\n" + "}\n" + "\n"; + + // 7. 
Generate function to write RGBA to output image + shaderStr << + "void writeRgbaToImage(vec4 rgba, ivec3 pos) {\n" + " imageStore(outputImageRGB, pos, rgba);\n" + "}\n" + "\n"; + + // 8. Main function shaderStr << "void main()\n" "{\n"; + + // 9. Handle position calculation GenHandleImagePosition(shaderStr); + + // 10. Calculate source position with replication if enabled GenHandleSourcePositionWithReplicate(shaderStr, m_enableRowAndColumnReplication); + + // 11. YCbCr to RGB conversion shaderStr << - " // Fetch from the texture.\n" - " float Y = imageLoad(inputImageY, ivec3(srcPos, pushConstants.srcImageLayer)).r;\n" - " // TODO: it is /2 only for sub-sampled formats\n" - " vec2 CbCr = imageLoad(inputImageCbCr, ivec3(srcPos/2, pushConstants.srcImageLayer)).rg;\n" + " // Calculate position with layer\n" + " ivec3 srcPos3D = ivec3(srcPos, pushConstants.srcLayer);\n" + " ivec3 dstPos3D = ivec3(pos, pushConstants.dstLayer);\n" + "\n" + " // Fetch YCbCr components\n" + " vec3 ycbcr = fetchYCbCrFromImage(srcPos3D);\n" + "\n" + " // Process: normalize, shift, and convert to RGB\n" + " ycbcr = shiftCbCr(normalizeYCbCr(ycbcr));\n" + " vec3 rgb = convertYCbCrToRgb(ycbcr);\n" "\n" - " vec3 ycbcr = shiftCbCr(normalizeYCbCr(vec3(Y, CbCr)));\n" - " vec4 rgba = vec4(convertYCbCrToRgb(ycbcr),1.0);\n" - " // Store it back.\n" - " imageStore(outputImageRGB, ivec3(pos, pushConstants.dstImageLayer), rgba);\n" + " // Write final RGBA result\n" + " vec4 rgba = vec4(rgb, 1.0);\n" + " writeRgbaToImage(rgba, dstPos3D);\n" "}\n"; computeShader = shaderStr.str(); @@ -429,51 +1985,171 @@ size_t VulkanFilterYuvCompute::InitYCBCRCOPY(std::string& computeShader) // 3-planar: Cb (R) binding = 6 // 3-planar: Cr (R) binding = 7 + // Get format information to determine bit depths + const VkMpFormatInfo* inputMpInfo = YcbcrVkFormatInfo(m_inputFormat); + const VkMpFormatInfo* outputMpInfo = YcbcrVkFormatInfo(m_outputFormat); + + // Determine bit depth from the formats + const uint32_t inputBitDepth = 
inputMpInfo ? GetBitsPerChannel(inputMpInfo->planesLayout) : 8; + const uint32_t outputBitDepth = outputMpInfo ? GetBitsPerChannel(outputMpInfo->planesLayout) : 8; + + // Determine if we're using limited or full range for input and output + // Default to limited range as it's more common for YCbCr content + const VkSamplerYcbcrConversionCreateInfo& samplerYcbcrConversionCreateInfo = + m_samplerYcbcrConversion.GetSamplerYcbcrConversionCreateInfo(); + const bool isInputLimitedRange = (samplerYcbcrConversionCreateInfo.ycbcrRange == VK_SAMPLER_YCBCR_RANGE_ITU_NARROW); + const bool isOutputLimitedRange = isInputLimitedRange; // Usually same as input, but could be configurable + + // Check if input or output are buffers + const bool isInputBuffer = m_inputIsBuffer; + const bool isOutputBuffer = m_outputIsBuffer; + + // Check if we need to do any bit depth conversion + const bool needsBitDepthConversion = (inputBitDepth != outputBitDepth); + + // Check if we need to do any range conversion + const bool needsRangeConversion = (isInputLimitedRange != isOutputLimitedRange); + std::stringstream shaderStr; + + // 1. Generate header and push constants GenHeaderAndPushConst(shaderStr); - // Input image - shaderStr << " // The input image binding\n"; + + // 2. 
Generate IO bindings + // Input Descriptors ShaderGeneratePlaneDescriptors(shaderStr, - m_inputImageAspects, - "inputImage", - m_inputFormat, true, // isInput 0, // startBinding 0, // set - true // imageArray - ); + true, + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); - // Output image - shaderStr << " // The output image binding\n"; + // Output Descriptors ShaderGeneratePlaneDescriptors(shaderStr, - m_outputImageAspects, - "outputImage", - m_outputFormat, false, // isInput 4, // startBinding 0, // set - true // imageArray - ); + true, // imageArray + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); + shaderStr << "\n\n"; + // Determine input and output plane configurations + const bool hasInputChroma = (m_inputImageAspects & (VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT)) != 0; + const bool hasOutputChroma = (m_outputImageAspects & (VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT)) != 0; + + // Determine if input is two-plane (e.g., NV12) or three-plane (e.g., I420) + const bool isInputTwoPlane = (m_inputImageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) && + !(m_inputImageAspects & VK_IMAGE_ASPECT_PLANE_2_BIT); + + // Determine if output is two-plane (e.g., NV12) or three-plane (e.g., I420) + const bool isOutputTwoPlane = (m_outputImageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) && + !(m_outputImageAspects & VK_IMAGE_ASPECT_PLANE_2_BIT); + + // 3. Add any bit depth handling functions needed + if (isInputBuffer && inputBitDepth > 8) { + bool isMSB = true; // Default to MSB-aligned (most common case) + GenExtractHighBitDepthFunc(shaderStr, isMSB, inputBitDepth); + } + + // 4. Add buffer read/write functions if needed + if (isInputBuffer) { + // Add fetch functions for Y and CbCr from buffer + GenFetchYFromBufferFunc(shaderStr, inputBitDepth > 8, inputBitDepth); + GenFetchCbCrFromBufferFunc(shaderStr, inputBitDepth > 8, inputBitDepth); + } + + // 5. 
Add YCbCr normalization and denormalization functions for bit depth conversion + if (needsBitDepthConversion || needsRangeConversion) { + // Generate normalization functions for input format + GenYCbCrNormalizationFuncs(shaderStr, inputBitDepth, isInputLimitedRange, hasInputChroma); + + // Generate denormalization functions for output format + GenYCbCrDeNormalizationFuncs(shaderStr, outputBitDepth, isOutputLimitedRange, hasOutputChroma); + } + + // 6. Generate the read function for YCbCr data + GenReadYCbCrBuffer(shaderStr, isInputBuffer, inputBitDepth, isInputTwoPlane, m_inputEnableMsbToLsbShift, m_inputImageAspects); + + // 7. Generate the write function for YCbCr data + GenWriteYCbCrBuffer(shaderStr, isOutputBuffer, outputBitDepth, isOutputTwoPlane, m_outputEnableLsbToMsbShift, m_outputImageAspects); + + // 8. Helper function for combined normalization and denormalization + if (needsBitDepthConversion || needsRangeConversion) { + GenConvertYCbCrFormat(shaderStr, inputBitDepth, outputBitDepth, isInputLimitedRange, isOutputLimitedRange); + } + + // 9. Main function shaderStr << "void main()\n" "{\n"; - GenHandleImagePosition(shaderStr); + + // 10. Handle position calculation + if (isInputBuffer || isOutputBuffer) { + // Use buffer position calculation + GenHandleBufferPosition(shaderStr); + } else { + // Use image position calculation + GenHandleImagePosition(shaderStr); + } + + // 11. Calculate source position with replication if enabled GenHandleSourcePositionWithReplicate(shaderStr, m_enableRowAndColumnReplication); + + // 12. Handle YCbCr processing + + // For inputs with chroma, we need to handle subsampling + // Get subsampling ratios for input format + const uint32_t chromaHorzRatio = (inputMpInfo != nullptr) ? (1 << inputMpInfo->planesLayout.secondaryPlaneSubsampledX) : 1; + const uint32_t chromaVertRatio = (inputMpInfo != nullptr) ? 
(1 << inputMpInfo->planesLayout.secondaryPlaneSubsampledY) : 1; + + // Generate condition for chroma processing based on actual subsampling shaderStr << - " // Read Y value from source Y plane and write it to destination Y plane\n" - " float Y = imageLoad(inputImageY, ivec3(srcPos, pushConstants.srcImageLayer)).r;\n" - " imageStore(outputImageY, ivec3(pos, pushConstants.dstImageLayer), vec4(Y, 0, 0, 1));\n" - "\n" - " // Do the same for the CbCr plane, but remember about the 4:2:0 subsampling\n" - " if (srcPos % 2 == ivec2(0, 0)) {\n" - " srcPos /= 2;\n" - " pos /= 2;\n" - " vec2 CbCr = imageLoad(inputImageCbCr, ivec3(srcPos, pushConstants.srcImageLayer)).rg;\n" - " imageStore(outputImageCbCr, ivec3(pos, pushConstants.dstImageLayer), vec4(CbCr, 0, 1));\n" - " }\n" - "}\n"; + " // Handle proper subsampling based on format (" << + (chromaHorzRatio == 2 ? (chromaVertRatio == 2 ? "4:2:0" : "4:2:2") : "4:4:4") << ")\n"; + + // Generate the chroma position condition with a boolean variable + GenHandleChromaPosition(shaderStr, chromaHorzRatio, chromaVertRatio, true, "pos", "processChroma"); + + // Initialize chroma positions with default values + shaderStr << " // Initialize chroma positions\n" + << " ivec2 chromaSrcPos = srcPos;\n" + << " ivec2 chromaPos = pos;\n\n" + << " // Check if we need to process chroma\n" + << " if (processChroma) {\n"; + + // Generate chroma position calculations for source position + GenCalculateChromaPosition(shaderStr, chromaHorzRatio, chromaVertRatio, "srcPos", "chromaSrcPos", 8); + + // Generate chroma position calculations for destination position + GenCalculateChromaPosition(shaderStr, chromaHorzRatio, chromaVertRatio, "pos", "chromaPos", 8); + + shaderStr << " }\n"; + + // Read YCbCr data using the helper function + shaderStr << "\n" + << " // Read YCbCr data from source\n" + << " vec3 YCbCrRawIn = readYCbCrFromSource(srcPos, chromaSrcPos, pushConstants.srcLayer, processChroma);\n\n"; + + // Process the data based on whether we need 
conversion + if (needsBitDepthConversion || needsRangeConversion) { + shaderStr << + " // Need format conversion - normalize and denormalize\n" + " vec3 YCbCrRawOut = convertYCbCrFormat(YCbCrRawIn);\n\n"; + } else { + shaderStr << + " // No format conversion needed - direct copy\n" + " vec3 YCbCrRawOut = YCbCrRawIn;\n\n"; + } + + // Write the processed data using the helper function + shaderStr << + " // Write processed data to destination\n" + " writeYCbCrToDestination(YCbCrRawOut, pos, chromaPos, pushConstants.dstLayer, processChroma);\n" + "\n\n"; + + // Close the main function + shaderStr << "}\n"; computeShader = shaderStr.str(); if (dumpShaders) @@ -495,37 +2171,849 @@ size_t VulkanFilterYuvCompute::InitYCBCRCLEAR(std::string& computeShader) // Create compute pipeline std::stringstream shaderStr; + + // 1. Generate header and push constants GenHeaderAndPushConst(shaderStr); - // Output image - shaderStr << " // The output image binding\n"; + // 2. Generate output image bindings + shaderStr << " // The output descriptors binding\n"; + // Output Descriptors ShaderGeneratePlaneDescriptors(shaderStr, - m_outputImageAspects, - "outputImage", - m_outputFormat, false, // isInput 4, // startBinding 0, // set - true // imageArray - ); + true, // imageArray + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); shaderStr << "\n\n"; + // Get format information to determine subsampling ratios + const VkMpFormatInfo* outputMpInfo = YcbcrVkFormatInfo(m_outputFormat); + // Get subsampling ratios for output format + const uint32_t chromaHorzRatio = (outputMpInfo != nullptr) ? (1 << outputMpInfo->planesLayout.secondaryPlaneSubsampledX) : 1; + const uint32_t chromaVertRatio = (outputMpInfo != nullptr) ? (1 << outputMpInfo->planesLayout.secondaryPlaneSubsampledY) : 1; + + + // 3. Main function shaderStr << "void main()\n" "{\n"; + + // 4. Handle position calculation GenHandleImagePosition(shaderStr); + + // 5. 
Clear operations for Y plane shaderStr << - " imageStore(outputImageY, ivec3(pos, pushConstants.dstImageLayer), vec4(0.5, 0, 0, 1));\n" - "\n" - " // Do the same for the CbCr plane, but remember about the 4:2:0 subsampling\n" - " if (pos % 2 == ivec2(0, 0)) {\n" - " pos /= 2;\n" - " imageStore(outputImageCbCr, ivec3(pos, pushConstants.dstImageLayer), vec4(0.5, 0.5, 0.0, 1.0));\n" - " }\n" - "}\n"; + " // Clear Y plane with 50% intensity\n" + " imageStore(outputImageY, ivec3(pos, pushConstants.dstLayer), vec4(0.5, 0, 0, 1));\n" + "\n"; + + // Handle CbCr plane clearing based on format's subsampling + shaderStr << + " // Clear CbCr plane with " << + (chromaHorzRatio == 2 ? (chromaVertRatio == 2 ? "4:2:0" : "4:2:2") : "4:4:4") << + " subsampling\n"; + + // Generate a boolean to track whether this position needs chroma clearing + GenHandleChromaPosition(shaderStr, chromaHorzRatio, chromaVertRatio, true, "pos", "shouldClearChroma"); + + // Handle position for chroma planes + shaderStr << " ivec2 chromaPos = pos;\n"; + shaderStr << " if (shouldClearChroma) {\n"; + + // Calculate chroma position if necessary + GenCalculateChromaPosition(shaderStr, chromaHorzRatio, chromaVertRatio, "pos", "chromaPos", 8); + + // For 2-plane format, output CbCr together + if (m_outputImageAspects & VK_IMAGE_ASPECT_PLANE_1_BIT) { + shaderStr << " // Clear CbCr plane with 50% intensity (middle range)\n" + << " imageStore(outputImageCbCr, ivec3(chromaPos, pushConstants.dstLayer), vec4(0.5, 0.5, 0.0, 1.0));\n"; + } + + // For 3-plane format, handle Cb and Cr separately + if (m_outputImageAspects & VK_IMAGE_ASPECT_PLANE_2_BIT) { + shaderStr << " // Clear separate Cb and Cr planes with 50% intensity (middle range)\n" + << " imageStore(outputImageCb, ivec3(chromaPos, pushConstants.dstLayer), vec4(0.5, 0.0, 0.0, 1.0));\n" + << " imageStore(outputImageCr, ivec3(chromaPos, pushConstants.dstLayer), vec4(0.5, 0.0, 0.0, 1.0));\n"; + } + + shaderStr << " }\n" + << "}\n"; computeShader = shaderStr.str(); 
if (dumpShaders) std::cout << "\nCompute Shader:\n" << computeShader; return computeShader.size(); } + +uint32_t VulkanFilterYuvCompute::GetPlaneIndex(VkImageAspectFlagBits planeAspect) { + + // Returns index 0 for VK_IMAGE_ASPECT_COLOR_BIT and VK_IMAGE_ASPECT_PLANE_0_BIT + // Returns index 1 for VK_IMAGE_ASPECT_PLANE_1_BIT + // Returns index 2 for VK_IMAGE_ASPECT_PLANE_2_BIT + + // First, verify it's a plane aspect bit + assert(planeAspect & validAspects); + + if (planeAspect & VK_IMAGE_ASPECT_COLOR_BIT) { + return 0; + } + + // Alternatively, without intrinsics: + return (planeAspect & VK_IMAGE_ASPECT_PLANE_0_BIT) ? 0 : + (planeAspect & VK_IMAGE_ASPECT_PLANE_1_BIT) ? 1 : 2; +} + +uint32_t VulkanFilterYuvCompute::UpdateBufferDescriptorSets( + const VkBuffer* vkBuffers, + uint32_t numVkBuffers, + const VkSubresourceLayout* vkBufferSubresourceLayout, + uint32_t numPlanes, + VkImageAspectFlags validImageAspects, + uint32_t& descrIndex, + uint32_t& baseBinding, + VkDescriptorType descriptorType, // Ex: VK_DESCRIPTOR_TYPE_STORAGE_BUFFER + VkDescriptorBufferInfo bufferDescriptors[maxNumComputeDescr], + std::array& writeDescriptorSets, + const uint32_t maxDescriptors) +{ + + validImageAspects &= validAspects; + uint32_t curImageAspect = 0; + uint32_t bufferIndex = 0; + while(validImageAspects) { + + if (validImageAspects & (VK_IMAGE_ASPECT_COLOR_BIT << curImageAspect) ) { + + uint32_t planeNum = GetPlaneIndex((VkImageAspectFlagBits)(VK_IMAGE_ASPECT_COLOR_BIT << curImageAspect)); + uint32_t dstBinding = baseBinding; + if (curImageAspect > 0) { + // the first plane is 1, second plane is 2, the 3rd is 3 + dstBinding += (1 + planeNum); + } + + writeDescriptorSets[descrIndex].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + writeDescriptorSets[descrIndex].dstSet = VK_NULL_HANDLE; + writeDescriptorSets[descrIndex].dstBinding = dstBinding; + writeDescriptorSets[descrIndex].descriptorCount = 1; + writeDescriptorSets[descrIndex].descriptorType = descriptorType; + + 
bufferDescriptors[descrIndex].buffer = vkBuffers[bufferIndex]; + bufferDescriptors[descrIndex].offset = vkBufferSubresourceLayout[planeNum].offset; + bufferDescriptors[descrIndex].range = vkBufferSubresourceLayout[planeNum].arrayPitch; + writeDescriptorSets[descrIndex].pBufferInfo = &bufferDescriptors[descrIndex]; + descrIndex++; + validImageAspects &= ~(VK_IMAGE_ASPECT_COLOR_BIT << curImageAspect); + bufferIndex = std::min(numVkBuffers - 1, bufferIndex + 1); + } + + curImageAspect++; + } + assert(descrIndex <= maxDescriptors); + return descrIndex; +} + +uint32_t VulkanFilterYuvCompute::UpdateImageDescriptorSets( + const VkImageResourceView* imageView, + VkImageAspectFlags validImageAspects, + VkSampler convSampler, + VkImageLayout imageLayout, + uint32_t& descrIndex, + uint32_t& baseBinding, + VkDescriptorType descriptorType, // Ex: VK_DESCRIPTOR_TYPE_STORAGE_IMAGE + VkDescriptorImageInfo imageDescriptors[maxNumComputeDescr], + std::array& writeDescriptorSets, + const uint32_t maxDescriptors) +{ + + validImageAspects &= validAspects; + uint32_t curImageAspect = 0; + const uint32_t numPlanes = imageView->GetNumberOfPlanes(); + while(validImageAspects) { + + if (validImageAspects & (VK_IMAGE_ASPECT_COLOR_BIT << curImageAspect) ) { + + VkSampler ccSampler = (curImageAspect == 0) ? convSampler : VK_NULL_HANDLE; + uint32_t planeNum = GetPlaneIndex((VkImageAspectFlagBits)(VK_IMAGE_ASPECT_COLOR_BIT << curImageAspect)); + assert(planeNum < numPlanes); + uint32_t dstBinding = baseBinding; + if (curImageAspect > 0) { + // the first plane is 1, second plane is 2, the 3rd is 3 + dstBinding += (1 + planeNum); + } + + writeDescriptorSets[descrIndex].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; + writeDescriptorSets[descrIndex].dstSet = VK_NULL_HANDLE; + writeDescriptorSets[descrIndex].dstBinding = dstBinding; + writeDescriptorSets[descrIndex].descriptorCount = 1; + writeDescriptorSets[descrIndex].descriptorType = (ccSampler != VK_NULL_HANDLE) ? 
+ VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER : + descriptorType; + imageDescriptors[descrIndex].sampler = ccSampler; + imageDescriptors[descrIndex].imageView = (curImageAspect == 0) ? + imageView->GetImageView() : + imageView->GetPlaneImageView(planeNum); + assert(imageDescriptors[descrIndex].imageView); + imageDescriptors[descrIndex].imageLayout = imageLayout; + writeDescriptorSets[descrIndex].pImageInfo = &imageDescriptors[descrIndex]; // Y (0) plane + descrIndex++; + validImageAspects &= ~(VK_IMAGE_ASPECT_COLOR_BIT << curImageAspect); + } + + curImageAspect++; + } + assert(descrIndex <= maxDescriptors); + return descrIndex; +} + +// Image input -> Image output +VkResult VulkanFilterYuvCompute::RecordCommandBuffer(VkCommandBuffer cmdBuf, + const VkImageResourceView* inImageView, + const VkVideoPictureResourceInfoKHR * inImageResourceInfo, + const VkImageResourceView* outImageView, + const VkVideoPictureResourceInfoKHR * outImageResourceInfo, + uint32_t bufferIdx) +{ + + assert(cmdBuf != VK_NULL_HANDLE); + + m_vkDevCtx->CmdBindPipeline(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, m_computePipeline.getPipeline()); + + VkDescriptorSetLayoutCreateFlags layoutMode = m_descriptorSetLayout.GetDescriptorSetLayoutInfo().GetDescriptorLayoutMode(); + + switch (layoutMode) { + case VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR: + case VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT: + { + + VkDescriptorImageInfo imageDescriptors[maxNumComputeDescr]{}; + std::array writeDescriptorSets{}; + + // Images + uint32_t set = 0; + uint32_t descrIndex = 0; + uint32_t dstBinding = 0; + + // IN 0: RGBA color converted by an YCbCr sample + // IN 1: y plane - G -> R8 + // IN 2: Cb or Cr or CbCr plane - BR -> R8B8 + // IN 3: Cr or Cb plane - R -> R8 + UpdateImageDescriptorSets(inImageView, + m_inputImageAspects, + m_samplerYcbcrConversion.GetSampler(), + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL, + descrIndex, + dstBinding, + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + 
imageDescriptors, + writeDescriptorSets, + maxNumComputeDescr / 2 /* max descriptors */); + + dstBinding = 4; + // OUT 4: Out RGBA or single planar YCbCr image + // OUT 5: y plane - G -> R8 + // OUT 6: Cb or Cr or CbCr plane - BR -> R8B8 + // OUT 7: Cr or Cb plane - R -> R8 + UpdateImageDescriptorSets(outImageView, + m_outputImageAspects, + VK_NULL_HANDLE, + VK_IMAGE_LAYOUT_GENERAL, + descrIndex, + dstBinding, + VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, + imageDescriptors, + writeDescriptorSets, + maxNumComputeDescr /* max descriptors */); + + assert(descrIndex <= maxNumComputeDescr); + assert(descrIndex >= 2); + + if (layoutMode == VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR) { + m_vkDevCtx->CmdPushDescriptorSetKHR(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, + m_descriptorSetLayout.GetPipelineLayout(), + set, descrIndex, writeDescriptorSets.data()); + } else { + + VkDeviceOrHostAddressConstKHR imageDescriptorBufferDeviceAddress = + m_descriptorSetLayout.UpdateDescriptorBuffer(bufferIdx, + set, + descrIndex, + writeDescriptorSets.data()); + + + // Descriptor buffer bindings + // Set 0 = Image + VkDescriptorBufferBindingInfoEXT bindingInfo{}; + bindingInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_BUFFER_BINDING_INFO_EXT; + bindingInfo.pNext = nullptr; + bindingInfo.address = imageDescriptorBufferDeviceAddress.deviceAddress; + bindingInfo.usage = VK_BUFFER_USAGE_SAMPLER_DESCRIPTOR_BUFFER_BIT_EXT | + VK_BUFFER_USAGE_RESOURCE_DESCRIPTOR_BUFFER_BIT_EXT; + m_vkDevCtx->CmdBindDescriptorBuffersEXT(cmdBuf, 1, &bindingInfo); + + // Image (set 0) + uint32_t bufferIndexImage = 0; + VkDeviceSize bufferOffset = 0; + m_vkDevCtx->CmdSetDescriptorBufferOffsetsEXT(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, + m_descriptorSetLayout.GetPipelineLayout(), + set, 1, &bufferIndexImage, &bufferOffset); + } + } + break; + + default: + m_vkDevCtx->CmdBindDescriptorSets(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, + m_descriptorSetLayout.GetPipelineLayout(), + 0, 1, 
m_descriptorSetLayout.GetDescriptorSet(), 0, 0); + } + + struct ivec2 { + uint32_t width; + uint32_t height; + + ivec2() : width(0), height(0) {} + ivec2(int32_t width_, int32_t height_) : width(width_), height(height_) {} + }; + + struct PushConstants { + uint32_t srcLayer; + uint32_t dstLayer; + ivec2 inputSize; + ivec2 outputSize; + uint32_t yOffset; // Y plane offset + uint32_t cbOffset; // Cb plane offset + uint32_t crOffset; // Cr plane offset + uint32_t yPitch; // Y plane pitch + uint32_t cbPitch; // Cb plane pitch + uint32_t crPitch; // Cr plane pitch + }; + + PushConstants pushConstants = { + inImageResourceInfo->baseArrayLayer, // Set the source layer index + outImageResourceInfo->baseArrayLayer, // Set the destination layer index + ivec2(inImageResourceInfo->codedExtent.width, inImageResourceInfo->codedExtent.height), + ivec2(outImageResourceInfo->codedExtent.width, outImageResourceInfo->codedExtent.height), + 0, // yOffset - not used for image input + 0, // cbOffset - not used for image input + 0, // crOffset - not used for image input + 0, // yPitch - not used for image input + 0, // cbPitch - not used for image input + 0 // crPitch - not used for image input + }; + + m_vkDevCtx->CmdPushConstants(cmdBuf, + m_descriptorSetLayout.GetPipelineLayout(), + VK_SHADER_STAGE_COMPUTE_BIT, + 0, + sizeof(PushConstants), + &pushConstants); + + const uint32_t workgroupWidth = (pushConstants.outputSize.width + (m_workgroupSizeX - 1)) / m_workgroupSizeX; + const uint32_t workgroupHeight = (pushConstants.outputSize.height + (m_workgroupSizeY - 1)) / m_workgroupSizeY; + m_vkDevCtx->CmdDispatch(cmdBuf, workgroupWidth, workgroupHeight, 1); + + return VK_SUCCESS; +} + +// Buffer input -> Image output +VkResult VulkanFilterYuvCompute::RecordCommandBuffer(VkCommandBuffer cmdBuf, + const VkBuffer* inBuffers, + uint32_t numInBuffers, + const VkFormat* inBufferFormats, + const VkSubresourceLayout* inBufferSubresourceLayouts, + uint32_t inBufferNumPlanes, + const 
VkImageResourceView* outImageView,
+                                                     const VkVideoPictureResourceInfoKHR* outImageResourceInfo,
+                                                     const VkBufferImageCopy* pBufferImageCopy,
+                                                     uint32_t bufferIdx)
+{
+    assert(cmdBuf != VK_NULL_HANDLE);
+    assert(m_inputIsBuffer == true);
+    assert(m_outputIsBuffer == false);
+
+    m_vkDevCtx->CmdBindPipeline(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, m_computePipeline.getPipeline());
+
+    VkDescriptorSetLayoutCreateFlags layoutMode = m_descriptorSetLayout.GetDescriptorSetLayoutInfo().GetDescriptorLayoutMode();
+
+    switch (layoutMode) {
+    case VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR:
+    case VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT:
+    {
+        VkDescriptorImageInfo imageDescriptors[maxNumComputeDescr / 2]{};
+        VkDescriptorBufferInfo bufferDescriptors[maxNumComputeDescr / 2]{};
+        std::array<VkWriteDescriptorSet, maxNumComputeDescr> writeDescriptorSets{};
+
+        uint32_t set = 0;
+        uint32_t descrIndex = 0;
+        uint32_t dstBinding = 0;
+
+        // Buffer input handling
+        // IN 0: Single buffer YCbCr
+        // IN 1: Y plane buffer
+        // IN 2: Cb, Cr or CbCr plane buffer
+        // IN 3: Cr plane buffer
+        UpdateBufferDescriptorSets(inBuffers, numInBuffers,
+                                   inBufferSubresourceLayouts, inBufferNumPlanes,
+                                   m_inputImageAspects,
+                                   descrIndex, dstBinding,
+                                   VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+                                   bufferDescriptors,
+                                   writeDescriptorSets,
+                                   maxNumComputeDescr / 2);
+
+
+        // Image output
+        dstBinding = 4;
+        // OUT 4: Out RGBA or single planar YCbCr image
+        // OUT 5: y plane - G -> R8
+        // OUT 6: Cb or Cr or CbCr plane - BR -> R8B8
+        // OUT 7: Cr or Cb plane - R -> R8
+        UpdateImageDescriptorSets(outImageView,
+                                  m_outputImageAspects,
+                                  VK_NULL_HANDLE,
+                                  VK_IMAGE_LAYOUT_GENERAL,
+                                  descrIndex,
+                                  dstBinding,
+                                  VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+                                  imageDescriptors,
+                                  writeDescriptorSets,
+                                  maxNumComputeDescr /* max descriptors */);
+
+        assert(descrIndex <= maxNumComputeDescr);
+        assert(descrIndex >= 2);
+
+        if (layoutMode == VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR) {
+            m_vkDevCtx->CmdPushDescriptorSetKHR(cmdBuf, 
VK_PIPELINE_BIND_POINT_COMPUTE, + m_descriptorSetLayout.GetPipelineLayout(), + set, descrIndex, writeDescriptorSets.data()); + } else { + VkDeviceOrHostAddressConstKHR descriptorBufferDeviceAddress = + m_descriptorSetLayout.UpdateDescriptorBuffer(bufferIdx, + set, + descrIndex, + writeDescriptorSets.data()); + + + // Descriptor buffer bindings + VkDescriptorBufferBindingInfoEXT bindingInfo{}; + bindingInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_BUFFER_BINDING_INFO_EXT; + bindingInfo.pNext = nullptr; + bindingInfo.address = descriptorBufferDeviceAddress.deviceAddress; + bindingInfo.usage = VK_BUFFER_USAGE_SAMPLER_DESCRIPTOR_BUFFER_BIT_EXT | + VK_BUFFER_USAGE_RESOURCE_DESCRIPTOR_BUFFER_BIT_EXT; + m_vkDevCtx->CmdBindDescriptorBuffersEXT(cmdBuf, 1, &bindingInfo); + + uint32_t bufferIndexImage = 0; + VkDeviceSize bufferOffset = 0; + m_vkDevCtx->CmdSetDescriptorBufferOffsetsEXT(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, + m_descriptorSetLayout.GetPipelineLayout(), + set, 1, &bufferIndexImage, &bufferOffset); + } + } + break; + + default: + m_vkDevCtx->CmdBindDescriptorSets(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, + m_descriptorSetLayout.GetPipelineLayout(), + 0, 1, m_descriptorSetLayout.GetDescriptorSet(), 0, 0); + } + + struct ivec2 { + uint32_t width; + uint32_t height; + + ivec2() : width(0), height(0) {} + ivec2(int32_t width_, int32_t height_) : width(width_), height(height_) {} + }; + + struct PushConstants { + uint32_t srcLayer; + uint32_t dstLayer; + ivec2 inputSize; + ivec2 outputSize; + uint32_t yOffset; // Y plane offset + uint32_t cbOffset; // Cb plane offset + uint32_t crOffset; // Cr plane offset + uint32_t yPitch; // Y plane pitch + uint32_t cbPitch; // Cb plane pitch + uint32_t crPitch; // Cr plane pitch + }; + + uint32_t width, height; + uint32_t rowPitch; + + assert(pBufferImageCopy); + width = pBufferImageCopy->bufferRowLength > 0 ? 
+                pBufferImageCopy->bufferRowLength :
+                pBufferImageCopy->imageExtent.width;
+    height = pBufferImageCopy->bufferImageHeight > 0 ?
+                 pBufferImageCopy->bufferImageHeight :
+                 pBufferImageCopy->imageExtent.height;
+    rowPitch = width;
+
+    VkExtent3D outputExtent = outImageView->GetImageResource()->GetImageCreateInfo().extent;
+
+    VkDeviceSize planeSize = width * height;
+    VkDeviceSize yOffset = pBufferImageCopy ? pBufferImageCopy->bufferOffset : 0;
+    VkDeviceSize cbOffset = yOffset + planeSize;
+    VkDeviceSize crOffset = cbOffset + (planeSize / 4);
+
+    PushConstants pushConstants = {
+        pBufferImageCopy->imageSubresource.baseArrayLayer,
+        outImageResourceInfo->baseArrayLayer,
+        ivec2(width, height),
+        ivec2(outputExtent.width, outputExtent.height),
+        static_cast<uint32_t>(yOffset),
+        static_cast<uint32_t>(cbOffset),
+        static_cast<uint32_t>(crOffset),
+        rowPitch,
+        rowPitch / 2, // For 4:2:0 format
+        rowPitch / 2  // For 4:2:0 format
+    };
+
+    m_vkDevCtx->CmdPushConstants(cmdBuf,
+                                 m_descriptorSetLayout.GetPipelineLayout(),
+                                 VK_SHADER_STAGE_COMPUTE_BIT,
+                                 0,
+                                 sizeof(PushConstants),
+                                 &pushConstants);
+
+    const uint32_t workgroupWidth = (pushConstants.outputSize.width + (m_workgroupSizeX - 1)) / m_workgroupSizeX;
+    const uint32_t workgroupHeight = (pushConstants.outputSize.height + (m_workgroupSizeY - 1)) / m_workgroupSizeY;
+    m_vkDevCtx->CmdDispatch(cmdBuf, workgroupWidth, workgroupHeight, 1);
+
+    return VK_SUCCESS;
+}
+
+// Image input -> Buffer output
+VkResult VulkanFilterYuvCompute::RecordCommandBuffer(VkCommandBuffer cmdBuf,
+                                                     const VkImageResourceView* inImageView,
+                                                     const VkVideoPictureResourceInfoKHR* inImageResourceInfo,
+                                                     const VkBuffer* outBuffers, // with size numOutBuffers
+                                                     uint32_t numOutBuffers,
+                                                     const VkFormat* outBufferFormats, // with size outBufferNumPlanes
+                                                     const VkSubresourceLayout* outBufferSubresourceLayouts, // with size outBufferNumPlanes
+                                                     uint32_t outBufferNumPlanes,
+                                                     const VkBufferImageCopy* pBufferImageCopy,
+                                                     uint32_t bufferIdx)
+{
+    assert(cmdBuf != VK_NULL_HANDLE);
+    
assert(m_inputIsBuffer == false);
+    assert(m_outputIsBuffer == true);
+
+    m_vkDevCtx->CmdBindPipeline(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, m_computePipeline.getPipeline());
+
+    VkDescriptorSetLayoutCreateFlags layoutMode = m_descriptorSetLayout.GetDescriptorSetLayoutInfo().GetDescriptorLayoutMode();
+
+    switch (layoutMode) {
+    case VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR:
+    case VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT:
+    {
+        VkDescriptorImageInfo imageDescriptors[maxNumComputeDescr / 2]{};
+        VkDescriptorBufferInfo bufferDescriptors[maxNumComputeDescr / 2]{};
+        std::array<VkWriteDescriptorSet, maxNumComputeDescr> writeDescriptorSets{};
+
+        uint32_t set = 0;
+        uint32_t descrIndex = 0;
+        uint32_t dstBinding = 0;
+
+        // IN 0: RGBA color converted by an YCbCr sample
+        // IN 1: y plane - G -> R8
+        // IN 2: Cb or Cr or CbCr plane - BR -> R8B8
+        // IN 3: Cr or Cb plane - R -> R8
+        UpdateImageDescriptorSets(inImageView,
+                                  m_inputImageAspects,
+                                  m_samplerYcbcrConversion.GetSampler(),
+                                  VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL,
+                                  descrIndex,
+                                  dstBinding,
+                                  VK_DESCRIPTOR_TYPE_STORAGE_IMAGE,
+                                  imageDescriptors,
+                                  writeDescriptorSets,
+                                  maxNumComputeDescr / 2 /* max descriptors */);
+
+        // Output buffer handling
+        dstBinding = 4;
+        // OUT 4: Single buffer YCbCr
+        // OUT 5: Y plane buffer
+        // OUT 6: Cb, Cr or CbCr plane buffer
+        // OUT 7: Cr or Cb plane buffer
+        UpdateBufferDescriptorSets(outBuffers, numOutBuffers,
+                                   outBufferSubresourceLayouts, outBufferNumPlanes,
+                                   m_outputImageAspects,
+                                   descrIndex, dstBinding,
+                                   VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+                                   bufferDescriptors,
+                                   writeDescriptorSets,
+                                   maxNumComputeDescr);
+
+        assert(descrIndex <= maxNumComputeDescr);
+        assert(descrIndex >= 2);
+
+        if (layoutMode == VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR) {
+            m_vkDevCtx->CmdPushDescriptorSetKHR(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE,
+                                                m_descriptorSetLayout.GetPipelineLayout(),
+                                                set, descrIndex, writeDescriptorSets.data());
+        } else {
+            VkDeviceOrHostAddressConstKHR 
descriptorBufferDeviceAddress = + m_descriptorSetLayout.UpdateDescriptorBuffer(bufferIdx, + set, + descrIndex, + writeDescriptorSets.data()); + + // Descriptor buffer bindings + VkDescriptorBufferBindingInfoEXT bindingInfo{}; + bindingInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_BUFFER_BINDING_INFO_EXT; + bindingInfo.pNext = nullptr; + bindingInfo.address = descriptorBufferDeviceAddress.deviceAddress; + bindingInfo.usage = VK_BUFFER_USAGE_SAMPLER_DESCRIPTOR_BUFFER_BIT_EXT | + VK_BUFFER_USAGE_RESOURCE_DESCRIPTOR_BUFFER_BIT_EXT; + m_vkDevCtx->CmdBindDescriptorBuffersEXT(cmdBuf, 1, &bindingInfo); + + uint32_t bufferIndexImage = 0; + VkDeviceSize bufferOffset = 0; + m_vkDevCtx->CmdSetDescriptorBufferOffsetsEXT(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, + m_descriptorSetLayout.GetPipelineLayout(), + set, 1, &bufferIndexImage, &bufferOffset); + } + } + break; + + default: + m_vkDevCtx->CmdBindDescriptorSets(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, + m_descriptorSetLayout.GetPipelineLayout(), + 0, 1, m_descriptorSetLayout.GetDescriptorSet(), 0, 0); + } + + struct ivec2 { + uint32_t width; + uint32_t height; + + ivec2() : width(0), height(0) {} + ivec2(int32_t width_, int32_t height_) : width(width_), height(height_) {} + }; + + struct PushConstants { + uint32_t srcLayer; + uint32_t dstLayer; + ivec2 inputSize; + ivec2 outputSize; + uint32_t yOffset; // Y plane offset + uint32_t cbOffset; // Cb plane offset + uint32_t crOffset; // Cr plane offset + uint32_t yPitch; // Y plane pitch + uint32_t cbPitch; // Cb plane pitch + uint32_t crPitch; // Cr plane pitch + }; + + uint32_t width, height; + uint32_t rowPitch; + VkExtent3D inputExtent = inImageView->GetImageResource()->GetImageCreateInfo().extent; + + if (pBufferImageCopy) { + width = pBufferImageCopy->bufferRowLength > 0 ? + pBufferImageCopy->bufferRowLength : + pBufferImageCopy->imageExtent.width; + height = pBufferImageCopy->bufferImageHeight > 0 ? 
+                 pBufferImageCopy->bufferImageHeight :
+                 pBufferImageCopy->imageExtent.height;
+        rowPitch = width;
+    } else {
+        width = inputExtent.width;
+        height = inputExtent.height;
+        rowPitch = width;
+    }
+
+    VkDeviceSize planeSize = width * height;
+    VkDeviceSize yOffset = pBufferImageCopy ? pBufferImageCopy->bufferOffset : 0;
+    VkDeviceSize cbOffset = yOffset + planeSize;
+    VkDeviceSize crOffset = cbOffset + (planeSize / 4);
+
+    PushConstants pushConstants = {
+        inImageResourceInfo->baseArrayLayer,
+        0, // Destination layer (buffer has no layers)
+        ivec2(inputExtent.width, inputExtent.height),
+        ivec2(width, height),
+        static_cast<uint32_t>(yOffset),
+        static_cast<uint32_t>(cbOffset),
+        static_cast<uint32_t>(crOffset),
+        rowPitch,
+        rowPitch / 2, // For 4:2:0 format
+        rowPitch / 2  // For 4:2:0 format
+    };
+
+    m_vkDevCtx->CmdPushConstants(cmdBuf,
+                                 m_descriptorSetLayout.GetPipelineLayout(),
+                                 VK_SHADER_STAGE_COMPUTE_BIT,
+                                 0,
+                                 sizeof(PushConstants),
+                                 &pushConstants);
+
+    const uint32_t workgroupWidth = (pushConstants.outputSize.width + (m_workgroupSizeX - 1)) / m_workgroupSizeX;
+    const uint32_t workgroupHeight = (pushConstants.outputSize.height + (m_workgroupSizeY - 1)) / m_workgroupSizeY;
+    m_vkDevCtx->CmdDispatch(cmdBuf, workgroupWidth, workgroupHeight, 1);
+
+    return VK_SUCCESS;
+}
+
+// Buffer input -> Buffer output (all buffer case)
+VkResult VulkanFilterYuvCompute::RecordCommandBuffer(VkCommandBuffer cmdBuf,
+                                                     const VkBuffer* inBuffers,
+                                                     uint32_t numInBuffers,
+                                                     const VkFormat* inBufferFormats, // with size inBufferNumPlanes
+                                                     const VkSubresourceLayout* inBufferSubresourceLayouts,
+                                                     uint32_t numInPlanes,
+                                                     const VkExtent3D& inBufferExtent,
+                                                     const VkBuffer* outBuffers,
+                                                     uint32_t numOutBuffers,
+                                                     const VkFormat* outBufferFormats,
+                                                     const VkSubresourceLayout* outBufferSubresourceLayouts,
+                                                     uint32_t numOutPlanes,
+                                                     const VkExtent3D& outBufferExtent,
+                                                     uint32_t bufferIdx)
+{
+    assert(cmdBuf != VK_NULL_HANDLE);
+    assert(m_inputIsBuffer == true);
+    assert(m_outputIsBuffer == true);
+
+    
m_vkDevCtx->CmdBindPipeline(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, m_computePipeline.getPipeline());
+
+    VkDescriptorSetLayoutCreateFlags layoutMode = m_descriptorSetLayout.GetDescriptorSetLayoutInfo().GetDescriptorLayoutMode();
+
+    switch (layoutMode) {
+    case VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR:
+    case VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT:
+    {
+        VkDescriptorBufferInfo bufferDescriptors[maxNumComputeDescr]{};
+        std::array<VkWriteDescriptorSet, maxNumComputeDescr> writeDescriptorSets{};
+
+        uint32_t set = 0;
+        uint32_t descrIndex = 0;
+        uint32_t dstBinding = 0;
+
+        // Input buffer handling
+        // IN 0: Single buffer YCbCr
+        // IN 1: Y plane buffer
+        // IN 2: Cb, Cr or CbCr plane buffer
+        // IN 3: Cr plane buffer
+        UpdateBufferDescriptorSets(inBuffers, numInBuffers,
+                                   inBufferSubresourceLayouts, numInPlanes,
+                                   m_inputImageAspects,
+                                   descrIndex, dstBinding,
+                                   VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+                                   bufferDescriptors,
+                                   writeDescriptorSets,
+                                   maxNumComputeDescr / 2);
+
+        // Output buffer handling
+        dstBinding = 4;
+        // OUT 4: Single buffer YCbCr
+        // OUT 5: Y plane buffer
+        // OUT 6: Cb, Cr or CbCr plane buffer
+        // OUT 7: Cr or Cb plane buffer
+        UpdateBufferDescriptorSets(outBuffers, numOutBuffers,
+                                   outBufferSubresourceLayouts, numOutPlanes,
+                                   m_outputImageAspects,
+                                   descrIndex, dstBinding,
+                                   VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
+                                   bufferDescriptors,
+                                   writeDescriptorSets,
+                                   maxNumComputeDescr);
+
+        assert(descrIndex <= maxNumComputeDescr);
+        assert(descrIndex >= 2);
+
+        if (layoutMode == VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR) {
+            m_vkDevCtx->CmdPushDescriptorSetKHR(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE,
+                                                m_descriptorSetLayout.GetPipelineLayout(),
+                                                set, descrIndex, writeDescriptorSets.data());
+        } else {
+            VkDeviceOrHostAddressConstKHR descriptorBufferDeviceAddress =
+                m_descriptorSetLayout.UpdateDescriptorBuffer(bufferIdx,
+                                                             set,
+                                                             descrIndex,
+                                                             writeDescriptorSets.data());
+
+            // Descriptor buffer bindings
+            VkDescriptorBufferBindingInfoEXT 
bindingInfo{}; + bindingInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_BUFFER_BINDING_INFO_EXT; + bindingInfo.pNext = nullptr; + bindingInfo.address = descriptorBufferDeviceAddress.deviceAddress; + bindingInfo.usage = VK_BUFFER_USAGE_SAMPLER_DESCRIPTOR_BUFFER_BIT_EXT | + VK_BUFFER_USAGE_RESOURCE_DESCRIPTOR_BUFFER_BIT_EXT; + m_vkDevCtx->CmdBindDescriptorBuffersEXT(cmdBuf, 1, &bindingInfo); + + uint32_t bufferIndexImage = 0; + VkDeviceSize bufferOffset = 0; + m_vkDevCtx->CmdSetDescriptorBufferOffsetsEXT(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, + m_descriptorSetLayout.GetPipelineLayout(), + set, 1, &bufferIndexImage, &bufferOffset); + } + } + break; + + default: + m_vkDevCtx->CmdBindDescriptorSets(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, + m_descriptorSetLayout.GetPipelineLayout(), + 0, 1, m_descriptorSetLayout.GetDescriptorSet(), 0, 0); + } + + struct ivec2 { + uint32_t width; + uint32_t height; + + ivec2() : width(0), height(0) {} + ivec2(int32_t width_, int32_t height_) : width(width_), height(height_) {} + }; + + struct PushConstants { + uint32_t srcLayer; // src image layer to use + uint32_t dstLayer; // dst image layer to use + ivec2 inputSize; // input image or buffer extent + ivec2 outputSize; // output image or buffer extent + uint32_t inYOffset; // input buffer Y plane offset + uint32_t inCbOffset; // input buffer Cb plane offset + uint32_t inCrOffset; // input buffer Cr plane offset + uint32_t inYPitch; // input buffer Y plane pitch + uint32_t inCbPitch; // input buffer Cb plane pitch + uint32_t inCrPitch; // input buffer Cr plane pitch + uint32_t outYOffset; // output buffer Y plane offset + uint32_t outCbOffset; // output buffer Cb plane offset + uint32_t outCrOffset; // output buffer Cr plane offset + uint32_t outYPitch; // output buffer Y plane pitch + uint32_t outCbPitch; // output buffer Cb plane pitch + uint32_t outCrPitch; // output buffer Cr plane pitch + }; + + // Calculate buffer parameters + uint32_t rowPitch = inBufferExtent.width; + VkDeviceSize 
planeSize = inBufferExtent.width * inBufferExtent.height; + VkDeviceSize yOffset = 0; + VkDeviceSize cbOffset = planeSize; + VkDeviceSize crOffset = cbOffset + (planeSize / 4); + + PushConstants pushConstants = { + 0, // Source layer (buffer has no layers) + 0, // Destination layer (buffer has no layers) + ivec2(inBufferExtent.width, inBufferExtent.height), + ivec2(outBufferExtent.width, outBufferExtent.height), + static_cast(yOffset), + static_cast(cbOffset), + static_cast(crOffset), + rowPitch, + rowPitch / 2, // For 4:2:0 format + rowPitch / 2 // For 4:2:0 format + }; + + m_vkDevCtx->CmdPushConstants(cmdBuf, + m_descriptorSetLayout.GetPipelineLayout(), + VK_SHADER_STAGE_COMPUTE_BIT, + 0, + sizeof(PushConstants), + &pushConstants); + + const uint32_t workgroupWidth = (pushConstants.outputSize.width + (m_workgroupSizeX - 1)) / m_workgroupSizeX; + const uint32_t workgroupHeight = (pushConstants.outputSize.height + (m_workgroupSizeY - 1)) / m_workgroupSizeY; + m_vkDevCtx->CmdDispatch(cmdBuf, workgroupWidth, workgroupHeight, 1); + + return VK_SUCCESS; +} diff --git a/common/libs/VkCodecUtils/VulkanFilterYuvCompute.h b/common/libs/VkCodecUtils/VulkanFilterYuvCompute.h index ef8db51a..ab9a8845 100644 --- a/common/libs/VkCodecUtils/VulkanFilterYuvCompute.h +++ b/common/libs/VkCodecUtils/VulkanFilterYuvCompute.h @@ -32,6 +32,15 @@ class VulkanFilterYuvCompute : public VulkanFilter public: enum FilterType { YCBCRCOPY, YCBCRCLEAR, YCBCR2RGBA, RGBA2YCBCR }; + static constexpr uint32_t maxNumComputeDescr = 8; + + static constexpr VkImageAspectFlags validPlaneAspects = VK_IMAGE_ASPECT_PLANE_0_BIT | + VK_IMAGE_ASPECT_PLANE_1_BIT | + VK_IMAGE_ASPECT_PLANE_2_BIT; + + static constexpr VkImageAspectFlags validAspects = VK_IMAGE_ASPECT_COLOR_BIT | validPlaneAspects; + + static uint32_t GetPlaneIndex(VkImageAspectFlagBits planeAspect); static VkResult Create(const VulkanDeviceContext* vkDevCtx, uint32_t queueFamilyIndex, @@ -40,6 +49,8 @@ class VulkanFilterYuvCompute : public 
VulkanFilter uint32_t maxNumFrames, VkFormat inputFormat, VkFormat outputFormat, + bool inputEnableMsbToLsbShift, + bool outputEnableLsbToMsbShift, const VkSamplerYcbcrConversionCreateInfo* pYcbcrConversionCreateInfo, const YcbcrPrimariesConstants* pYcbcrPrimariesConstants, const VkSamplerCreateInfo* pSamplerCreateInfo, @@ -52,6 +63,8 @@ class VulkanFilterYuvCompute : public VulkanFilter uint32_t maxNumFrames, VkFormat inputFormat, VkFormat outputFormat, + bool inputEnableMsbToLsbShift, + bool outputEnableLsbToMsbShift, const YcbcrPrimariesConstants* pYcbcrPrimariesConstants) : VulkanFilter(vkDevCtx, queueFamilyIndex, queueIndex) , m_filterType(filterType) @@ -71,7 +84,11 @@ class VulkanFilterYuvCompute : public VulkanFilter VK_IMAGE_ASPECT_PLANE_0_BIT | VK_IMAGE_ASPECT_PLANE_1_BIT | VK_IMAGE_ASPECT_PLANE_2_BIT) + , m_inputEnableMsbToLsbShift(inputEnableMsbToLsbShift) + , m_outputEnableLsbToMsbShift(outputEnableLsbToMsbShift) , m_enableRowAndColumnReplication(true) + , m_inputIsBuffer(false) + , m_outputIsBuffer(false) { // FIXME: m_ycbcrPrimariesConstants is currently unused but is kept for future use. 
(void)m_ycbcrPrimariesConstants; @@ -116,263 +133,205 @@ class VulkanFilterYuvCompute : public VulkanFilter assert(m_vkDevCtx != nullptr); } + uint32_t UpdateBufferDescriptorSets(const VkBuffer* vkBuffers, + uint32_t numVkBuffers, + const VkSubresourceLayout* vkBufferSubresourceLayout, + uint32_t numPlanes, + VkImageAspectFlags validImageAspects, + uint32_t& descrIndex, + uint32_t& baseBinding, + VkDescriptorType descriptorType, // Ex: VK_DESCRIPTOR_TYPE_STORAGE_BUFFER + VkDescriptorBufferInfo bufferDescriptors[maxNumComputeDescr], + std::array& writeDescriptorSets, + const uint32_t maxDescriptors = maxNumComputeDescr); + + uint32_t UpdateImageDescriptorSets(const VkImageResourceView* inputImageView, + VkImageAspectFlags validImageAspects, + VkSampler convSampler, + VkImageLayout imageLayout, + uint32_t& descrIndex, + uint32_t& baseBinding, + VkDescriptorType descriptorType, // Ex: VK_DESCRIPTOR_TYPE_STORAGE_IMAGE + VkDescriptorImageInfo imageDescriptors[maxNumComputeDescr], + std::array& writeDescriptorSets, + const uint32_t maxDescriptors = maxNumComputeDescr); + + // Image input -> Image output virtual VkResult RecordCommandBuffer(VkCommandBuffer cmdBuf, const VkImageResourceView* inputImageView, const VkVideoPictureResourceInfoKHR * inputImageResourceInfo, const VkImageResourceView* outputImageView, const VkVideoPictureResourceInfoKHR * outputImageResourceInfo, - uint32_t bufferIdx) - { - - assert(cmdBuf != VK_NULL_HANDLE); - - m_vkDevCtx->CmdBindPipeline(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, m_computePipeline.getPipeline()); - - VkDescriptorSetLayoutCreateFlags layoutMode = m_descriptorSetLayout.GetDescriptorSetLayoutInfo().GetDescriptorLayoutMode(); - - switch (layoutMode) { - case VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR: - case VK_DESCRIPTOR_SET_LAYOUT_CREATE_DESCRIPTOR_BUFFER_BIT_EXT: - { - - const uint32_t maxNumComputeDescr = 8; - VkDescriptorImageInfo imageDescriptors[8]{}; - std::array writeDescriptorSets{}; - - // Images - uint32_t 
set = 0; - uint32_t descrIndex = 0; - uint32_t dstBinding = 0; - // RGBA color converted by an YCbCr sample - if (m_inputImageAspects & VK_IMAGE_ASPECT_COLOR_BIT) { - writeDescriptorSets[descrIndex].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - writeDescriptorSets[descrIndex].dstSet = VK_NULL_HANDLE; - writeDescriptorSets[descrIndex].dstBinding = dstBinding; - writeDescriptorSets[descrIndex].descriptorCount = 1; - writeDescriptorSets[descrIndex].descriptorType = (m_samplerYcbcrConversion.GetSampler() != VK_NULL_HANDLE) ? - VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER : - VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; - - imageDescriptors[descrIndex].sampler = m_samplerYcbcrConversion.GetSampler(); - imageDescriptors[descrIndex].imageView = inputImageView->GetImageView(); - assert(imageDescriptors[descrIndex].imageView); - imageDescriptors[descrIndex].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; - writeDescriptorSets[descrIndex].pImageInfo = &imageDescriptors[descrIndex]; // RGBA or Sampled YCbCr - descrIndex++; - } - dstBinding++; - - uint32_t planeNum = 0; - // y plane - G -> R8 - if ((m_inputImageAspects & (VK_IMAGE_ASPECT_PLANE_0_BIT << planeNum)) && - (planeNum < inputImageView->GetNumberOfPlanes())) { - writeDescriptorSets[descrIndex].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - writeDescriptorSets[descrIndex].dstSet = VK_NULL_HANDLE; - writeDescriptorSets[descrIndex].dstBinding = dstBinding; - writeDescriptorSets[descrIndex].descriptorCount = 1; - writeDescriptorSets[descrIndex].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; - imageDescriptors[descrIndex].sampler = VK_NULL_HANDLE; - imageDescriptors[descrIndex].imageView = inputImageView->GetPlaneImageView(planeNum++); - assert(imageDescriptors[descrIndex].imageView); - imageDescriptors[descrIndex].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; - writeDescriptorSets[descrIndex].pImageInfo = &imageDescriptors[descrIndex]; // Y (0) plane - descrIndex++; - } - dstBinding++; - - // CbCr plane - 
BR -> R8B8 - if ((m_inputImageAspects & (VK_IMAGE_ASPECT_PLANE_0_BIT << planeNum)) && - (planeNum < inputImageView->GetNumberOfPlanes())) { - writeDescriptorSets[descrIndex].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - writeDescriptorSets[descrIndex].dstSet = VK_NULL_HANDLE; - writeDescriptorSets[descrIndex].dstBinding = dstBinding; - writeDescriptorSets[descrIndex].descriptorCount = 1; - writeDescriptorSets[descrIndex].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; - imageDescriptors[descrIndex].sampler = VK_NULL_HANDLE; - imageDescriptors[descrIndex].imageView = inputImageView->GetPlaneImageView(planeNum++); - assert(imageDescriptors[descrIndex].imageView); - imageDescriptors[descrIndex].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; - writeDescriptorSets[descrIndex].pImageInfo = &imageDescriptors[descrIndex]; // CbCr (1) plane - descrIndex++; - } - dstBinding++; - - // Cr plane - R -> R8 - if ((m_inputImageAspects & (VK_IMAGE_ASPECT_PLANE_0_BIT << planeNum)) && - (planeNum < inputImageView->GetNumberOfPlanes())) { - writeDescriptorSets[descrIndex].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - writeDescriptorSets[descrIndex].dstSet = VK_NULL_HANDLE; - writeDescriptorSets[descrIndex].dstBinding = dstBinding; - writeDescriptorSets[descrIndex].descriptorCount = 1; - writeDescriptorSets[descrIndex].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; - imageDescriptors[descrIndex].sampler = VK_NULL_HANDLE; - imageDescriptors[descrIndex].imageView = inputImageView->GetPlaneImageView(planeNum++); - assert(imageDescriptors[descrIndex].imageView); - imageDescriptors[descrIndex].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; - writeDescriptorSets[descrIndex].pImageInfo = &imageDescriptors[descrIndex]; // CbCr (1) plane - descrIndex++; - } - dstBinding++; - - // Out RGBA or single planar YCbCr image - if (m_outputImageAspects & VK_IMAGE_ASPECT_COLOR_BIT) { - writeDescriptorSets[descrIndex].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - 
writeDescriptorSets[descrIndex].dstSet = VK_NULL_HANDLE; - writeDescriptorSets[descrIndex].dstBinding = dstBinding; - writeDescriptorSets[descrIndex].descriptorCount = 1; - writeDescriptorSets[descrIndex].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; - imageDescriptors[descrIndex].sampler = VK_NULL_HANDLE; - imageDescriptors[descrIndex].imageView = outputImageView->GetImageView(); - imageDescriptors[descrIndex].imageLayout = VK_IMAGE_LAYOUT_GENERAL; - writeDescriptorSets[descrIndex].pImageInfo = &imageDescriptors[descrIndex]; - descrIndex++; - } - dstBinding++; - - planeNum = 0; - // y plane out - G -> R8 - if ((m_outputImageAspects & (VK_IMAGE_ASPECT_PLANE_0_BIT << planeNum)) && - (planeNum < outputImageView->GetNumberOfPlanes())) { - writeDescriptorSets[descrIndex].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - writeDescriptorSets[descrIndex].dstSet = VK_NULL_HANDLE; - writeDescriptorSets[descrIndex].dstBinding = dstBinding; - writeDescriptorSets[descrIndex].descriptorCount = 1; - writeDescriptorSets[descrIndex].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; - imageDescriptors[descrIndex].sampler = VK_NULL_HANDLE; - imageDescriptors[descrIndex].imageView = outputImageView->GetPlaneImageView(planeNum++); - assert(imageDescriptors[descrIndex].imageView); - imageDescriptors[descrIndex].imageLayout = VK_IMAGE_LAYOUT_GENERAL; - writeDescriptorSets[descrIndex].pImageInfo = &imageDescriptors[descrIndex]; - descrIndex++; - } - dstBinding++; - - // CbCr plane out - BR -> R8B8 - if ((m_outputImageAspects & (VK_IMAGE_ASPECT_PLANE_0_BIT << planeNum)) && - (planeNum < outputImageView->GetNumberOfPlanes())) { - writeDescriptorSets[descrIndex].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - writeDescriptorSets[descrIndex].dstSet = VK_NULL_HANDLE; - writeDescriptorSets[descrIndex].dstBinding = dstBinding; - writeDescriptorSets[descrIndex].descriptorCount = 1; - writeDescriptorSets[descrIndex].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; - 
imageDescriptors[descrIndex].sampler = VK_NULL_HANDLE; - imageDescriptors[descrIndex].imageView = outputImageView->GetPlaneImageView(planeNum++); - assert(imageDescriptors[descrIndex].imageView); - imageDescriptors[descrIndex].imageLayout = VK_IMAGE_LAYOUT_GENERAL; - writeDescriptorSets[descrIndex].pImageInfo = &imageDescriptors[descrIndex]; - descrIndex++; - } - dstBinding++; - - // Cr plane out - R -> R8 - if ((m_outputImageAspects & (VK_IMAGE_ASPECT_PLANE_0_BIT << planeNum)) && - (planeNum < outputImageView->GetNumberOfPlanes())) { - writeDescriptorSets[descrIndex].sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET; - writeDescriptorSets[descrIndex].dstSet = VK_NULL_HANDLE; - writeDescriptorSets[descrIndex].dstBinding = dstBinding; - writeDescriptorSets[descrIndex].descriptorCount = 1; - writeDescriptorSets[descrIndex].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; - imageDescriptors[descrIndex].sampler = VK_NULL_HANDLE; - imageDescriptors[descrIndex].imageView = outputImageView->GetPlaneImageView(planeNum++); - assert(imageDescriptors[descrIndex].imageView); - imageDescriptors[descrIndex].imageLayout = VK_IMAGE_LAYOUT_GENERAL; - writeDescriptorSets[descrIndex].pImageInfo = &imageDescriptors[descrIndex]; - descrIndex++; - } - dstBinding++; - - assert(descrIndex <= maxNumComputeDescr); - assert(descrIndex >= 2); - - if (layoutMode == VK_DESCRIPTOR_SET_LAYOUT_CREATE_PUSH_DESCRIPTOR_BIT_KHR) { - m_vkDevCtx->CmdPushDescriptorSetKHR(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, - m_descriptorSetLayout.GetPipelineLayout(), - set, descrIndex, writeDescriptorSets.data()); - } else { - - VkDeviceOrHostAddressConstKHR imageDescriptorBufferDeviceAddress = - m_descriptorSetLayout.UpdateDescriptorBuffer(bufferIdx, - set, - descrIndex, - writeDescriptorSets.data()); - - - // Descriptor buffer bindings - // Set 0 = Image - VkDescriptorBufferBindingInfoEXT bindingInfo{}; - bindingInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_BUFFER_BINDING_INFO_EXT; - bindingInfo.pNext = nullptr; - 
bindingInfo.address = imageDescriptorBufferDeviceAddress.deviceAddress; - bindingInfo.usage = VK_BUFFER_USAGE_SAMPLER_DESCRIPTOR_BUFFER_BIT_EXT | - VK_BUFFER_USAGE_RESOURCE_DESCRIPTOR_BUFFER_BIT_EXT; - m_vkDevCtx->CmdBindDescriptorBuffersEXT(cmdBuf, 1, &bindingInfo); - - // Image (set 0) - uint32_t bufferIndexImage = 0; - VkDeviceSize bufferOffset = 0; - m_vkDevCtx->CmdSetDescriptorBufferOffsetsEXT(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, - m_descriptorSetLayout.GetPipelineLayout(), - set, 1, &bufferIndexImage, &bufferOffset); - } - } - break; - - default: - m_vkDevCtx->CmdBindDescriptorSets(cmdBuf, VK_PIPELINE_BIND_POINT_COMPUTE, - m_descriptorSetLayout.GetPipelineLayout(), - 0, 1, m_descriptorSetLayout.GetDescriptorSet(), 0, 0); - } - - struct ivec2 { - uint32_t width; - uint32_t height; - - ivec2() : width(0), height(0) {} - ivec2(int32_t width_, int32_t height_) : width(width_), height(height_) {} - }; - - struct PushConstants { - uint32_t srcLayer; - uint32_t dstLayer; - ivec2 inputSize; // Original input image size (width, height) - ivec2 outputSize; // Output image size (width, height, with padding) - }; - - PushConstants pushConstants = { - inputImageResourceInfo->baseArrayLayer, // Set the source layer index - outputImageResourceInfo->baseArrayLayer, // Set the destination layer index - ivec2(inputImageResourceInfo->codedExtent.width, inputImageResourceInfo->codedExtent.height), - ivec2(outputImageResourceInfo->codedExtent.width, outputImageResourceInfo->codedExtent.height) - }; - - m_vkDevCtx->CmdPushConstants(cmdBuf, - m_descriptorSetLayout.GetPipelineLayout(), - VK_SHADER_STAGE_COMPUTE_BIT, - 0, // offset - sizeof(PushConstants), - &pushConstants); - - const uint32_t workgroupWidth = (pushConstants.outputSize.width + (m_workgroupSizeX - 1)) / m_workgroupSizeX; - const uint32_t workgroupHeight = (pushConstants.outputSize.height + (m_workgroupSizeY - 1)) / m_workgroupSizeY; - m_vkDevCtx->CmdDispatch(cmdBuf, workgroupWidth, workgroupHeight, 1); - - return 
VK_SUCCESS; - } + uint32_t bufferIdx) override; + // Buffer input -> Image output + VkResult RecordCommandBuffer(VkCommandBuffer cmdBuf, + const VkBuffer* inBuffers, // with size numInBuffers + uint32_t numInBuffers, + const VkFormat* inBufferFormats, // with size inBufferNumPlanes + const VkSubresourceLayout* inBufferSubresourceLayouts, // with size inBufferNumPlanes + uint32_t inBufferNumPlanes, + const VkImageResourceView* outImageView, + const VkVideoPictureResourceInfoKHR* outImageResourceInfo, + const VkBufferImageCopy* pBufferImageCopy, + uint32_t bufferIdx); + + // Image input -> Buffer output + VkResult RecordCommandBuffer(VkCommandBuffer cmdBuf, + const VkImageResourceView* inImageView, + const VkVideoPictureResourceInfoKHR* inImageResourceInfo, + const VkBuffer* outBuffers, // with size numOutBuffers + uint32_t numOutBuffers, + const VkFormat* inBufferFormats, // with size outBufferNumPlanes + const VkSubresourceLayout* outBufferSubresourceLayouts, // with size outBufferNumPlanes + uint32_t outBufferNumPlanes, + const VkBufferImageCopy* pBufferImageCopy, + uint32_t bufferIdx); + + // Buffer input -> Buffer output + VkResult RecordCommandBuffer(VkCommandBuffer cmdBuf, + const VkBuffer* inBuffers, // with size numInBuffers + uint32_t numInBuffers, + const VkFormat* inBufferFormats, // with size inBufferNumPlanes + const VkSubresourceLayout* inBufferSubresourceLayouts, // with size inBufferNumPlanes + uint32_t inBufferNumPlanes, + const VkExtent3D& inBufferExtent, + const VkBuffer* outBuffers, // with size numOutBuffers + uint32_t numOutBuffers, + const VkFormat* outBufferFormats, // with size outBufferNumPlanes + const VkSubresourceLayout* outBufferSubresourceLayouts, // with size outBufferNumPlanes + uint32_t outBufferNumPlanes, + const VkExtent3D& outBufferExtent, + uint32_t bufferIdx); private: VkResult InitDescriptorSetLayout(uint32_t maxNumFrames); - void ShaderGeneratePlaneDescriptors(std::stringstream& computeShader, - VkImageAspectFlags& 
imageAspects, - const char *imageName, - VkFormat imageFormat, - bool isInput, - uint32_t startBinding = 0, - uint32_t set = 0, - bool imageArray = true); + + /** + * @brief Generates GLSL image descriptor bindings for shader input/output + * + * Creates appropriate GLSL image binding declarations based on the input/output format. + * Handles different YUV formats like single-plane (RGBA), 2-plane (NV12/NV21), and 3-plane (I420, etc). + * + * @param computeShader Output stringstream for shader code + * @param imageAspects Output parameter to store the image aspect flags used + * @param imageName Base image variable name + * @param imageFormat Vulkan format of the image + * @param isInput Whether this is an input or output resource + * @param startBinding Starting binding number in the descriptor set + * @param set Descriptor set number + * @param imageArray Whether to use image2DArray or image2D + * @return The next available binding number after all descriptors are created + */ + uint32_t ShaderGenerateImagePlaneDescriptors(std::stringstream& computeShader, + VkImageAspectFlags& imageAspects, + const char *imageName, + VkFormat imageFormat, + bool isInput, + uint32_t startBinding = 0, + uint32_t set = 0, + bool imageArray = true); + + /** + * @brief Generates GLSL buffer descriptor bindings for shader input/output + * + * Creates appropriate GLSL buffer binding declarations based on the input/output format. + * Handles different YUV buffer layouts matching single-plane, 2-plane, or 3-plane formats. 
+ * + * @param shaderStr Output stringstream for shader code + * @param imageAspects Output parameter to store the image aspect flags used + * @param bufferName Base buffer variable name + * @param bufferFormat Vulkan format of the buffer data + * @param isInput Whether this is an input or output resource + * @param startBinding Starting binding number in the descriptor set + * @param set Descriptor set number + * @param bufferType The Vulkan descriptor type to use for the buffer + * @return The next available binding number after all descriptors are created + */ + uint32_t ShaderGenerateBufferPlaneDescriptors(std::stringstream& shaderStr, + VkImageAspectFlags& imageAspects, + const char *bufferName, + VkFormat bufferFormat, + bool isInput, + uint32_t startBinding = 0, + uint32_t set = 0, + VkDescriptorType bufferType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER); + + /** + * @brief Unified descriptor generation for either buffer or image resources + * + * Delegates to either ShaderGenerateImagePlaneDescriptors or ShaderGenerateBufferPlaneDescriptors + * based on the resource type (image or buffer) needed for input/output. + * + * @param shaderStr Output stringstream for shader code + * @param isInput Whether this is an input or output resource + * @param startBinding Starting binding number in the descriptor set + * @param set Descriptor set number + * @param imageArray Whether to use image2DArray or image2D (for image resources) + * @param bufferType The Vulkan descriptor type to use for buffer resources + * @return The next available binding number after all descriptors are created + */ + uint32_t ShaderGeneratePlaneDescriptors(std::stringstream& shaderStr, + bool isInput, + uint32_t startBinding, + uint32_t set, + bool imageArray, + VkDescriptorType bufferType); + + /** + * @brief Initializes GLSL shader for YCbCr copy operation + * + * Generates a compute shader that copies YCbCr data from input to output + * without any color space conversion, preserving the format. 
+ * + * @param computeShader Output string for the complete GLSL shader code + * @return Size of the generated shader code in bytes + */ size_t InitYCBCRCOPY(std::string& computeShader); + + /** + * @brief Initializes GLSL shader for YCbCr clear operation + * + * Generates a compute shader that clears/fills YCbCr data in the output + * resource with constant values. + * + * @param computeShader Output string for the complete GLSL shader code + * @return Size of the generated shader code in bytes + */ size_t InitYCBCRCLEAR(std::string& computeShader); + + /** + * @brief Initializes GLSL shader for YCbCr to RGBA conversion + * + * Generates a compute shader that converts YCbCr input to RGBA output + * using the appropriate color space conversion matrix. + * + * @param computeShader Output string for the complete GLSL shader code + * @return Size of the generated shader code in bytes + */ size_t InitYCBCR2RGBA(std::string& computeShader); + /** + * @brief Initializes GLSL shader for RGBA to YCbCr conversion + * + * Generates a compute shader that converts RGBA input to YCbCr output + * using the appropriate color space conversion matrix. + * + * @param computeShader Output string for the complete GLSL shader code + * @return Size of the generated shader code in bytes + */ + size_t InitRGBA2YCBCR(std::string& computeShader); + + /** + * @brief Initializes GLSL shader for YUV to NV12 conversion using buffer input + * + * Generates a compute shader that converts YUV input from buffer to NV12 output, + * handling different YUV formats (I420, I422, I444) with appropriate chroma subsampling. 
+ * + * @param computeShader Output string for the complete GLSL shader code + * @return Size of the generated shader code in bytes + */ + size_t InitYUV2NV12FromBuffer(std::string& computeShader); + private: const FilterType m_filterType; VkFormat m_inputFormat; @@ -386,8 +345,32 @@ class VulkanFilterYuvCompute : public VulkanFilter VulkanComputePipeline m_computePipeline; VkImageAspectFlags m_inputImageAspects; VkImageAspectFlags m_outputImageAspects; + uint32_t m_inputEnableMsbToLsbShift : 1; + uint32_t m_outputEnableLsbToMsbShift : 1; uint32_t m_enableRowAndColumnReplication : 1; - + uint32_t m_inputIsBuffer : 1; + uint32_t m_outputIsBuffer : 1; + + struct PushConstants { + uint32_t srcLayer; // src image layer to use + uint32_t dstLayer; // dst image layer to use + uint32_t inputWidth; // input image or buffer width + uint32_t inputHeight; // input image or buffer height + uint32_t outputWidth; // output image or buffer width + uint32_t outputHeight; // output image or buffer height + uint32_t inYOffset; // input buffer Y plane offset + uint32_t inCbOffset; // input buffer Cb plane offset + uint32_t inCrOffset; // input buffer Cr plane offset + uint32_t inYPitch; // input buffer Y plane pitch + uint32_t inCbPitch; // input buffer Cb plane pitch + uint32_t inCrPitch; // input buffer Cr plane pitch + uint32_t outYOffset; // output buffer Y plane offset + uint32_t outCbOffset; // output buffer Cb plane offset + uint32_t outCrOffset; // output buffer Cr plane offset + uint32_t outYPitch; // output buffer Y plane pitch + uint32_t outCbPitch; // output buffer Cb plane pitch + uint32_t outCrPitch; // output buffer Cr plane pitch + }; }; #endif /* _VULKANFILTERYUVCOMPUTE_H_ */ diff --git a/vk_video_decoder/libs/VkVideoDecoder/VkVideoDecoder.cpp b/vk_video_decoder/libs/VkVideoDecoder/VkVideoDecoder.cpp index 2b9f6b66..c855386a 100644 --- a/vk_video_decoder/libs/VkVideoDecoder/VkVideoDecoder.cpp +++ b/vk_video_decoder/libs/VkVideoDecoder/VkVideoDecoder.cpp @@ -334,16 
+334,18 @@ int32_t VkVideoDecoder::StartVideoSequence(VkParserDetectedVideoFormat* pVideoFo if (needNewFilter) { result = VulkanFilterYuvCompute::Create(m_vkDevCtx, - m_vkDevCtx->GetComputeQueueFamilyIdx(), - 0, - m_filterType, - numDecodeSurfaces + 1, - inputFormat, - outputFormat, - &ycbcrConversionCreateInfo, - &ycbcrPrimariesConstants, - &samplerInfo, - m_yuvFilter); + m_vkDevCtx->GetComputeQueueFamilyIdx(), + 0, + m_filterType, + numDecodeSurfaces + 1, + inputFormat, + outputFormat, + false, // inputEnableMsbToLsbShift + false, // outputEnableLsbToMsbShift + &ycbcrConversionCreateInfo, + &ycbcrPrimariesConstants, + &samplerInfo, + m_yuvFilter); } if (result == VK_SUCCESS) { diff --git a/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.cpp b/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.cpp index 8649df07..84e83dfa 100644 --- a/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.cpp +++ b/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.cpp @@ -164,96 +164,22 @@ VkResult VkVideoEncoder::LoadNextFrame(VkSharedBaseObj& const uint8_t* pInputFrameData = m_encoderConfig->inputFileHandler.GetMappedPtr(m_encoderConfig->input.fullImageSize, encodeFrameInfo->frameInputOrderNum); + // NOTE: Get image layout const VkSubresourceLayout* dstSubresourceLayout = dstImageResource->GetSubresourceLayout(); - int yCbCrConvResult = 0; - if (m_encoderConfig->input.bpp == 8) { - - if (m_encoderConfig->encodeChromaSubsampling == VK_VIDEO_CHROMA_SUBSAMPLING_444_BIT_KHR) { - // Load current 8-bit frame from file and convert to 2-plane YUV444 - yCbCrConvResult = YCbCrConvUtilsCpu::I444ToP444( - pInputFrameData + m_encoderConfig->input.planeLayouts[0].offset, // src_y - (int)m_encoderConfig->input.planeLayouts[0].rowPitch, // src_stride_y - pInputFrameData + m_encoderConfig->input.planeLayouts[1].offset, // src_u - (int)m_encoderConfig->input.planeLayouts[1].rowPitch, // src_stride_u - pInputFrameData + m_encoderConfig->input.planeLayouts[2].offset, // src_v - 
(int)m_encoderConfig->input.planeLayouts[2].rowPitch, // src_stride_v - writeImagePtr + dstSubresourceLayout[0].offset, // dst_y - (int)dstSubresourceLayout[0].rowPitch, // dst_stride_y - writeImagePtr + dstSubresourceLayout[1].offset, // dst_uv - (int)dstSubresourceLayout[1].rowPitch, // dst_stride_uv - std::min(m_encoderConfig->encodeWidth, m_encoderConfig->input.width), // width - std::min(m_encoderConfig->encodeHeight, m_encoderConfig->input.height)); // height - } else { - // Load current 8-bit frame from file and convert to NV12 - yCbCrConvResult = YCbCrConvUtilsCpu::I420ToNV12( - pInputFrameData + m_encoderConfig->input.planeLayouts[0].offset, // src_y, - (int)m_encoderConfig->input.planeLayouts[0].rowPitch, // src_stride_y, - pInputFrameData + m_encoderConfig->input.planeLayouts[1].offset, // src_u, - (int)m_encoderConfig->input.planeLayouts[1].rowPitch, // src_stride_u, - pInputFrameData + m_encoderConfig->input.planeLayouts[2].offset, // src_v, - (int)m_encoderConfig->input.planeLayouts[2].rowPitch, // src_stride_v, - writeImagePtr + dstSubresourceLayout[0].offset, // dst_y, - (int)dstSubresourceLayout[0].rowPitch, // dst_stride_y, - writeImagePtr + dstSubresourceLayout[1].offset, // dst_uv, - (int)dstSubresourceLayout[1].rowPitch, // dst_stride_uv, - std::min(m_encoderConfig->encodeWidth, m_encoderConfig->input.width), // width - std::min(m_encoderConfig->encodeHeight, m_encoderConfig->input.height)); // height - } - - } else if (m_encoderConfig->input.bpp == 10) { // 10-bit - actually 16-bit only for now. 
- - int shiftBits = 0; - if (m_encoderConfig->input.msbShift >= 0) { - shiftBits = m_encoderConfig->input.msbShift; - } else { - shiftBits = 16 - m_encoderConfig->input.bpp; - } - - if (m_encoderConfig->encodeChromaSubsampling == VK_VIDEO_CHROMA_SUBSAMPLING_444_BIT_KHR) { - // Load current 10-bit frame from file and convert to 2-plane YUV444 - yCbCrConvResult = YCbCrConvUtilsCpu::I444ToP444( - (const uint16_t*)(pInputFrameData + m_encoderConfig->input.planeLayouts[0].offset), // src_y - (int)m_encoderConfig->input.planeLayouts[0].rowPitch, // src_stride_y - (const uint16_t*)(pInputFrameData + m_encoderConfig->input.planeLayouts[1].offset), // src_u - (int)m_encoderConfig->input.planeLayouts[1].rowPitch, // src_stride_u - (const uint16_t*)(pInputFrameData + m_encoderConfig->input.planeLayouts[2].offset), // src_v - (int)m_encoderConfig->input.planeLayouts[2].rowPitch, // src_stride_v - (uint16_t*)(writeImagePtr + dstSubresourceLayout[0].offset), // dst_y - (int)dstSubresourceLayout[0].rowPitch, // dst_stride_y - (uint16_t*)(writeImagePtr + dstSubresourceLayout[1].offset), // dst_uv - (int)dstSubresourceLayout[1].rowPitch, // dst_stride_uv - std::min(m_encoderConfig->encodeWidth, m_encoderConfig->input.width), // width - std::min(m_encoderConfig->encodeHeight, m_encoderConfig->input.height), // height - shiftBits); - } else { - // Load current 10-bit frame from file and convert to P010/P016 - yCbCrConvResult = YCbCrConvUtilsCpu::I420ToNV12( - (const uint16_t*)(pInputFrameData + m_encoderConfig->input.planeLayouts[0].offset), // src_y, - (int)m_encoderConfig->input.planeLayouts[0].rowPitch, // src_stride_y, - (const uint16_t*)(pInputFrameData + m_encoderConfig->input.planeLayouts[1].offset), // src_u, - (int)m_encoderConfig->input.planeLayouts[1].rowPitch, // src_stride_u, - (const uint16_t*)(pInputFrameData + m_encoderConfig->input.planeLayouts[2].offset), // src_v, - (int)m_encoderConfig->input.planeLayouts[2].rowPitch, // src_stride_v, - (uint16_t*)(writeImagePtr + 
dstSubresourceLayout[0].offset), // dst_y, - (int)dstSubresourceLayout[0].rowPitch, // dst_stride_y, - (uint16_t*)(writeImagePtr + dstSubresourceLayout[1].offset), // dst_uv, - (int)dstSubresourceLayout[1].rowPitch, // dst_stride_uv, - std::min(m_encoderConfig->encodeWidth, m_encoderConfig->input.width), // width - std::min(m_encoderConfig->encodeHeight, m_encoderConfig->input.height), // height - shiftBits); - } - - } else { - assert(!"Requested bit-depth is not supported!"); - } - - if (yCbCrConvResult == 0) { - // On success, stage the input frame for the encoder video input - return StageInputFrame(encodeFrameInfo); - } - - return VK_ERROR_INITIALIZATION_FAILED; + // Direct plane copy - no color space conversion needed + CopyYCbCrPlanesDirectCPU( + pInputFrameData, // Source buffer + m_encoderConfig->input.planeLayouts, // Source layouts + writeImagePtr, // Destination buffer + dstSubresourceLayout, // Destination layouts + std::min(m_encoderConfig->encodeWidth, m_encoderConfig->input.width), // Width + std::min(m_encoderConfig->encodeHeight, m_encoderConfig->input.height), // Height + m_encoderConfig->input.numPlanes, // Number of planes + m_encoderConfig->input.vkFormat); // Format for subsampling detection + + // Now stage the input frame for the encoder video input + return StageInputFrame(encodeFrameInfo); } VkResult VkVideoEncoder::StageInputFrameQpMap(VkSharedBaseObj& encodeFrameInfo, @@ -479,6 +405,121 @@ VkResult VkVideoEncoder::SubmitStagedQpMap(VkSharedBaseObjplanesLayout) : 8; // Default to 8-bit + const uint32_t bytesPerPixel = (bitDepth > 8) ? 2 : 1; + + // Determine chroma subsampling ratios + const uint32_t chromaHorzRatio = (formatInfo != nullptr) ? (1 << formatInfo->planesLayout.secondaryPlaneSubsampledX) : 1; + const uint32_t chromaVertRatio = (formatInfo != nullptr) ? 
(1 << formatInfo->planesLayout.secondaryPlaneSubsampledY) : 1; + + // Log the format subsampling for debugging + if (m_encoderConfig->verbose) { + const char* subsamplingDesc = "4:4:4"; + if (chromaHorzRatio == 2 && chromaVertRatio == 2) { + subsamplingDesc = "4:2:0"; + } else if (chromaHorzRatio == 2 && chromaVertRatio == 1) { + subsamplingDesc = "4:2:2"; + } + printf("YCbCr copy with %s subsampling (chromaHorzRatio=%d, chromaVertRatio=%d), %d-bit\n", + subsamplingDesc, chromaHorzRatio, chromaVertRatio, bitDepth); + } + + // Handle all planes + for (uint32_t plane = 0; plane < numPlanes; plane++) { + // Source and destination plane pointers + const uint8_t* srcPlane = pInputFrameData + inputPlaneLayouts[plane].offset; + uint8_t* dstPlane = writeImagePtr + dstSubresourceLayout[plane].offset; + + // Get plane dimensions - adjust for chroma planes + uint32_t planeWidth = width; + uint32_t planeHeight = height; + + // Adjust dimensions for chroma planes based on format subsampling + if (plane > 0) { + if (chromaHorzRatio > 1) { + planeWidth = (width + chromaHorzRatio - 1) / chromaHorzRatio; + } + if (chromaVertRatio > 1) { + planeHeight = (height + chromaVertRatio - 1) / chromaVertRatio; + } + } + + // Source and destination strides + const size_t srcStride = inputPlaneLayouts[plane].rowPitch; + const size_t dstStride = dstSubresourceLayout[plane].rowPitch; + + // Line width in bytes + const size_t lineBytes = planeWidth * bytesPerPixel; + + // Get the starting pointers for this plane + const uint8_t* srcRow = srcPlane; + uint8_t* dstRow = dstPlane; + + if (false && (bitDepth > 8)) { + + const int shiftBits = 16 - bitDepth; + + // Copy each line, incrementing pointers by stride amounts + for (uint32_t y = 0; y < planeHeight; y++) { + + // Get the starting pointers for this row + const uint16_t* srcRow16 = (const uint16_t*)srcRow; + uint16_t* dstRow16 = (uint16_t*)dstRow; + + for (uint32_t i = 0; i < planeWidth; i++) { + *dstRow16++ = (*srcRow16++ << shiftBits); + } + 
+ // Advance to the next line using pointer arithmetic + srcRow += srcStride; + dstRow += dstStride; + } + + } else { + + // Copy each line, incrementing pointers by stride amounts + for (uint32_t y = 0; y < planeHeight; y++) { + // Copy the current line + memcpy(dstRow, srcRow, lineBytes); + + // Advance to the next line using pointer arithmetic + srcRow += srcStride; + dstRow += dstStride; + } + } + } +} VkResult VkVideoEncoder::SubmitStagedInputFrame(VkSharedBaseObj& encodeFrameInfo) { @@ -943,6 +984,7 @@ VkResult VkVideoEncoder::InitEncoder(VkSharedBaseObj& encoderConf VK_IMAGE_USAGE_TRANSFER_DST_BIT); const VkImageUsageFlags dpbImageUsage = VK_IMAGE_USAGE_VIDEO_ENCODE_DPB_BIT_KHR; + // NOTE: Create linearInputImage result = VulkanVideoImagePool::Create(m_vkDevCtx, m_linearInputImagePool); if(result != VK_SUCCESS) { fprintf(stderr, "\nInitEncoder Error: Failed to create linearInputImagePool.\n"); @@ -956,7 +998,7 @@ VkResult VkVideoEncoder::InitEncoder(VkSharedBaseObj& encoderConf result = m_linearInputImagePool->Configure( m_vkDevCtx, encoderConfig->numInputImages, - m_imageInFormat, + encoderConfig->input.vkFormat, linearInputImageExtent, ( VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT | @@ -1217,8 +1259,10 @@ VkResult VkVideoEncoder::InitEncoder(VkSharedBaseObj& encoderConf 0, // queueIndex encoderConfig->filterType, encoderConfig->numInputImages, - m_imageInFormat, // in filter format (can be RGB) + encoderConfig->input.vkFormat, // in filter format (can be RGB) m_imageInFormat, // out filter - same as input for now. 
+ false, // inputEnableMsbToLsbShift + (encoderConfig->input.msbShift > 0), &ycbcrConversionCreateInfo, &ycbcrPrimariesConstants, &samplerInfo, diff --git a/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.h b/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.h index 61c2ec84..c939bda6 100644 --- a/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.h +++ b/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.h @@ -559,6 +559,29 @@ class VkVideoEncoder : public VkVideoRefCountBase { const uint8_t* setPlaneOffset(const uint8_t* pFrameData, size_t bufferSize, size_t ¤tReadOffset); + /** + * @brief Copies YCbCr planes directly from input buffer to output buffer when formats are the same + * + * @param pInputFrameData Source buffer containing YCbCr planes + * @param inputPlaneLayouts Array of source buffer plane layouts (offset, pitch, etc.) + * @param writeImagePtr Destination buffer for the YCbCr planes + * @param dstSubresourceLayout Array of destination buffer plane layouts + * @param width Width of the image in pixels + * @param height Height of the image in pixels + * @param numPlanes Number of planes in the format (1, 2, or 3) + * @param format The VkFormat of the image for proper subsampling and bit depth detection + * @return none + */ + void CopyYCbCrPlanesDirectCPU( + const uint8_t* pInputFrameData, + const VkSubresourceLayout* inputPlaneLayouts, + uint8_t* writeImagePtr, + const VkSubresourceLayout* dstSubresourceLayout, + uint32_t width, + uint32_t height, + uint32_t numPlanes, + VkFormat format); + bool WaitForThreadsToComplete(); protected: From f5659698c1b251242d6eb9cdcae7d9f67777eaf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Cerveau?= Date: Tue, 3 Jun 2025 16:38:13 +0200 Subject: [PATCH 03/14] encoder: allow to build without shaderc dep 2 Fixup of 62138ad --- vk_video_encoder/demos/vk-video-enc/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/vk_video_encoder/demos/vk-video-enc/CMakeLists.txt 
b/vk_video_encoder/demos/vk-video-enc/CMakeLists.txt index d3bba268..7fe88cc3 100644 --- a/vk_video_encoder/demos/vk-video-enc/CMakeLists.txt +++ b/vk_video_encoder/demos/vk-video-enc/CMakeLists.txt @@ -118,6 +118,10 @@ if(TARGET vulkan) list(APPEND definitions PRIVATE -DUNINSTALLED_LOADER="$") endif() +if(USE_SHADERC) + list(APPEND definitions PRIVATE -DSHADERC_SUPPORT) +endif() + if(WIN32) list(APPEND definitions PRIVATE -DVK_USE_PLATFORM_WIN32_KHR) list(APPEND definitions PRIVATE -DWIN32_LEAN_AND_MEAN) From ee5acf0da5e6c1a96e23a8e60a07e3a2eb2e77e9 Mon Sep 17 00:00:00 2001 From: "Vassili Nikolaev (NVIDIA)" Date: Mon, 9 Jun 2025 08:59:16 -0500 Subject: [PATCH 04/14] common: Code compilation fixes --- .../VkVideoCore/VulkanVideoCapabilities.h | 2 +- common/include/mio/mio.hpp | 8 ++--- common/libs/VkCodecUtils/VkThreadPool.h | 5 ++- .../libs/VkCodecUtils/VkVideoFrameToFile.cpp | 36 ++++++++++++------- .../VkCodecUtils/VulkanFilterYuvCompute.cpp | 2 +- .../VkCodecUtils/VulkanShaderCompiler.cpp | 13 +++++-- .../VkCodecUtils/VulkanVideoProcessor.cpp | 25 ++++++------- .../libs/VkCodecUtils/VulkanVideoProcessor.h | 4 +++ common/libs/VkShell/Shell.h | 3 ++ .../NvVideoParser/src/VulkanAV1Decoder.cpp | 10 ++++-- .../src/VulkanAV1GlobalMotionDec.cpp | 2 +- vk_video_decoder/src/vulkan_video_decoder.cpp | 9 ++--- vk_video_encoder/demos/vk-video-enc/Main.cpp | 2 +- .../include/vulkan_video_encoder.h | 4 +-- .../libs/VkVideoEncoder/VkEncoderConfig.cpp | 10 +++--- .../libs/VkVideoEncoder/VkEncoderConfig.h | 6 ++-- .../VkVideoEncoder/VkEncoderConfigAV1.cpp | 2 +- .../libs/VkVideoEncoder/VkEncoderConfigAV1.h | 2 +- .../libs/VkVideoEncoder/VkEncoderDpbH264.h | 2 +- .../libs/VkVideoEncoder/VkVideoEncoder.cpp | 8 +++-- .../libs/VkVideoEncoder/VkVideoGopStructure.h | 7 ++-- vk_video_encoder/src/vulkan_video_encoder.cpp | 6 ++-- .../test/vulkan-video-enc/Main.cpp | 2 +- 23 files changed, 104 insertions(+), 66 deletions(-) diff --git 
a/common/include/VkVideoCore/VulkanVideoCapabilities.h b/common/include/VkVideoCore/VulkanVideoCapabilities.h index 8e0caf4f..3c8f572f 100644 --- a/common/include/VkVideoCore/VulkanVideoCapabilities.h +++ b/common/include/VkVideoCore/VulkanVideoCapabilities.h @@ -360,7 +360,7 @@ class VulkanVideoCapabilities } } - formatCount = std::min(supportedFormatCount, formatCount); + formatCount = std::min(supportedFormatCount, formatCount); for (uint32_t i = 0; i < formatCount; i++) { formats[i] = pSupportedFormats[i].format; diff --git a/common/include/mio/mio.hpp b/common/include/mio/mio.hpp index 5cd55ea8..3c3e1adb 100644 --- a/common/include/mio/mio.hpp +++ b/common/include/mio/mio.hpp @@ -786,13 +786,13 @@ namespace win { /** Returns the 4 upper bytes of an 8-byte integer. */ inline DWORD int64_high(int64_t n) noexcept { - return n >> 32; + return (DWORD)(n >> 32); } /** Returns the 4 lower bytes of an 8-byte integer. */ inline DWORD int64_low(int64_t n) noexcept { - return n & 0xffffffff; + return (DWORD)(n & 0xffffffff); } inline std::wstring s_2_ws(const std::string& s) @@ -887,7 +887,7 @@ inline size_t query_file_size(file_handle_type handle, std::error_code& error) error = detail::last_error(); return 0; } - return static_cast(file_size.QuadPart); + return static_cast(file_size.QuadPart); #else // POSIX struct stat sbuf; if(::fstat(handle, &sbuf) == -1) @@ -933,7 +933,7 @@ inline mmap_context memory_map(const file_handle_type file_handle, const int64_t mode == access_mode::read ? FILE_MAP_READ : FILE_MAP_WRITE, win::int64_high(aligned_offset), win::int64_low(aligned_offset), - length_to_map)); + (size_t)length_to_map)); if(mapping_start == nullptr) { // Close file handle if mapping it failed. 
diff --git a/common/libs/VkCodecUtils/VkThreadPool.h b/common/libs/VkCodecUtils/VkThreadPool.h index 44d31bd1..b9d5a508 100644 --- a/common/libs/VkCodecUtils/VkThreadPool.h +++ b/common/libs/VkCodecUtils/VkThreadPool.h @@ -65,8 +65,11 @@ class VkThreadPool std::future res = task->get_future(); { std::unique_lock lock(queue_mutex); - if(stop) + if(stop) { +#ifdef __cpp_exceptions throw std::runtime_error("enqueue on stopped ThreadPool"); +#endif + } tasks.emplace([task](){ (*task)(); }); } diff --git a/common/libs/VkCodecUtils/VkVideoFrameToFile.cpp b/common/libs/VkCodecUtils/VkVideoFrameToFile.cpp index 846a0890..c5ca2d0e 100644 --- a/common/libs/VkCodecUtils/VkVideoFrameToFile.cpp +++ b/common/libs/VkCodecUtils/VkVideoFrameToFile.cpp @@ -240,7 +240,7 @@ class VkVideoFrameToFileImpl : public VkVideoFrameOutput { } if (m_outputcrcPerFrame && m_crcOutputFile) { - fprintf(m_crcOutputFile, "CRC Frame[%" PRId64 "]:", pFrame->displayOrder); + fprintf(m_crcOutputFile, "CRC Frame[%lld]:", (long long)pFrame->displayOrder); for (size_t i = 0; i < m_crcInitValue.size(); i += 1) { uint32_t frameCrc = m_crcInitValue[i]; getCRC(&frameCrc, pOutputBuffer, usedBufferSize, Crc32Table); @@ -415,6 +415,7 @@ class VkVideoFrameToFileImpl : public VkVideoFrameOutput { VkDeviceSize maxSize = 0; const uint8_t* readImagePtr = srcImageDeviceMemory->GetReadOnlyDataPtr(imageOffset, maxSize); assert(readImagePtr != nullptr); + assert(maxSize <= SIZE_MAX); // Ensure we don't lose data in conversion int32_t secondaryPlaneWidth = frameWidth; int32_t secondaryPlaneHeight = frameHeight; @@ -490,15 +491,19 @@ class VkVideoFrameToFileImpl : public VkVideoFrameOutput { // Copy the luma plane const uint32_t numCompatiblePlanes = 1; for (uint32_t plane = 0; plane < numCompatiblePlanes; plane++) { - const uint8_t* pSrc = readImagePtr + layouts[plane].offset; - uint8_t* pDst = pOutBuffer + yuvPlaneLayouts[plane].offset; + const uint8_t* pSrc = readImagePtr + static_cast(layouts[plane].offset); + uint8_t* 
pDst = pOutBuffer + static_cast(yuvPlaneLayouts[plane].offset); if (is8Bit) { - CopyPlaneData(pSrc, pDst, layouts[plane].rowPitch, yuvPlaneLayouts[plane].rowPitch, + assert(layouts[plane].rowPitch <= SIZE_MAX); + assert(yuvPlaneLayouts[plane].rowPitch <= SIZE_MAX); + CopyPlaneData(pSrc, pDst, static_cast(layouts[plane].rowPitch), static_cast(yuvPlaneLayouts[plane].rowPitch), frameWidth, imageHeight); } else { - CopyPlaneData(pSrc, pDst, layouts[plane].rowPitch, yuvPlaneLayouts[plane].rowPitch, - frameWidth, imageHeight, 1, bitShift); + assert(layouts[plane].rowPitch <= SIZE_MAX); + assert(yuvPlaneLayouts[plane].rowPitch <= SIZE_MAX); + CopyPlaneData(pSrc, pDst, static_cast(layouts[plane].rowPitch), static_cast(yuvPlaneLayouts[plane].rowPitch), + frameWidth, imageHeight); } } @@ -517,21 +522,25 @@ class VkVideoFrameToFileImpl : public VkVideoFrameOutput { } if (is8Bit) { - CopyPlaneData(pSrc, pDst, layouts[srcPlane].rowPitch, yuvPlaneLayouts[plane].rowPitch, + assert(layouts[srcPlane].rowPitch <= SIZE_MAX); + assert(yuvPlaneLayouts[plane].rowPitch <= SIZE_MAX); + CopyPlaneData(pSrc, pDst, static_cast(layouts[srcPlane].rowPitch), static_cast(yuvPlaneLayouts[plane].rowPitch), planeWidth, 1, 2); } else { - CopyPlaneData(pSrc, pDst, layouts[srcPlane].rowPitch, yuvPlaneLayouts[plane].rowPitch, - planeWidth, 1, 2, bitShift); + assert(layouts[srcPlane].rowPitch <= SIZE_MAX); + assert(yuvPlaneLayouts[plane].rowPitch <= SIZE_MAX); + CopyPlaneData(pSrc, pDst, static_cast(layouts[srcPlane].rowPitch), static_cast(yuvPlaneLayouts[plane].rowPitch), + planeWidth, 1, 2); } pDst += yuvPlaneLayouts[plane].rowPitch; } } // Calculate total buffer size - outputBufferSize = yuvPlaneLayouts[0].rowPitch * imageHeight; + outputBufferSize = static_cast(yuvPlaneLayouts[0].rowPitch * imageHeight); if (mpInfo->planesLayout.numberOfExtraPlanes >= 1) { - outputBufferSize += yuvPlaneLayouts[1].rowPitch * secondaryPlaneHeight; - outputBufferSize += yuvPlaneLayouts[2].rowPitch * 
secondaryPlaneHeight; + outputBufferSize += static_cast(yuvPlaneLayouts[1].rowPitch * secondaryPlaneHeight); + outputBufferSize += static_cast(yuvPlaneLayouts[2].rowPitch * secondaryPlaneHeight); } return outputBufferSize; @@ -545,6 +554,7 @@ class VkVideoFrameToFileImpl : public VkVideoFrameOutput { } VkDeviceSize imageMemorySize = imageResource->GetImageDeviceMemorySize(); + assert(imageMemorySize <= SIZE_MAX); // Ensure we don't lose data in conversion if ((m_pLinearMemory == nullptr) || (imageMemorySize > m_allocationSize)) { if (m_outputFile) { @@ -556,7 +566,7 @@ class VkVideoFrameToFileImpl : public VkVideoFrameOutput { m_pLinearMemory = nullptr; } - m_allocationSize = (size_t)(imageMemorySize); + m_allocationSize = static_cast(imageMemorySize); m_pLinearMemory = new uint8_t[m_allocationSize]; if (m_pLinearMemory == nullptr) { return nullptr; diff --git a/common/libs/VkCodecUtils/VulkanFilterYuvCompute.cpp b/common/libs/VkCodecUtils/VulkanFilterYuvCompute.cpp index 906cc229..597f5d7c 100644 --- a/common/libs/VkCodecUtils/VulkanFilterYuvCompute.cpp +++ b/common/libs/VkCodecUtils/VulkanFilterYuvCompute.cpp @@ -214,7 +214,7 @@ static YcbcrBtStandard GetYcbcrPrimariesConstantsId(VkSamplerYcbcrModelConversio * * @param shaderStr Output stringstream where the GLSL code will be written */ -void GenPushConstantsDecl(std::stringstream& shaderStr) { +static void GenPushConstantsDecl(std::stringstream& shaderStr) { shaderStr << "layout(push_constant) uniform PushConstants {\n" << " uint srcLayer; // src image layer to use\n" << " uint dstLayer; // dst image layer to use\n" diff --git a/common/libs/VkCodecUtils/VulkanShaderCompiler.cpp b/common/libs/VkCodecUtils/VulkanShaderCompiler.cpp index 20fc073e..89215a8b 100644 --- a/common/libs/VkCodecUtils/VulkanShaderCompiler.cpp +++ b/common/libs/VkCodecUtils/VulkanShaderCompiler.cpp @@ -103,13 +103,19 @@ VkShaderModule VulkanShaderCompiler::BuildShaderFromFile(const char *fileName, VkShaderStageFlagBits type, const 
VulkanDeviceContext* vkDevCtx) { +#ifdef seekg // read file from the path std::ifstream is(fileName, std::ios::binary | std::ios::in | std::ios::ate); if (is.is_open()) { - - size_t size = is.tellg(); - is.seekg(0, std::ios::beg); + is.seekg (0, is.end); + std::streamoff fileSize = is.tellg(); + if (fileSize < 0 || static_cast(fileSize) > std::numeric_limits::max()) { + std::cerr << "File size is too large or invalid" << std::endl; + return VK_NULL_HANDLE; + } + size_t size = static_cast(fileSize); + is.seekg(0, is.beg); char* shaderCode = new char[size]; is.read(shaderCode, size); is.close(); @@ -122,6 +128,7 @@ VkShaderModule VulkanShaderCompiler::BuildShaderFromFile(const char *fileName, return shaderModule; } +#endif return VK_NULL_HANDLE; } diff --git a/common/libs/VkCodecUtils/VulkanVideoProcessor.cpp b/common/libs/VkCodecUtils/VulkanVideoProcessor.cpp index ebe00067..d6e1fd18 100644 --- a/common/libs/VkCodecUtils/VulkanVideoProcessor.cpp +++ b/common/libs/VkCodecUtils/VulkanVideoProcessor.cpp @@ -115,11 +115,13 @@ VkResult VulkanVideoProcessor::Initialize(const VulkanDeviceContext* vkDevCtx, return result; } - VkVideoCoreProfile videoProfile(m_videoStreamDemuxer->GetVideoCodec(), - m_videoStreamDemuxer->GetChromaSubsampling(), - m_videoStreamDemuxer->GetLumaBitDepth(), - m_videoStreamDemuxer->GetChromaBitDepth(), - m_videoStreamDemuxer->GetProfileIdc()); + VkVideoCoreProfile videoProfile ({ + m_videoStreamDemuxer->GetVideoCodec(), + m_videoStreamDemuxer->GetChromaSubsampling(), + m_videoStreamDemuxer->GetLumaBitDepth(), + m_videoStreamDemuxer->GetChromaBitDepth(), + m_videoStreamDemuxer->GetProfileIdc() + }); if (!VulkanVideoCapabilities::IsCodecTypeSupported(vkDevCtx, vkDevCtx->GetVideoDecodeQueueFamilyIdx(), @@ -194,12 +196,11 @@ VkResult VulkanVideoProcessor::Create(const DecoderConfig& settings, const Vulka VkVideoProfileInfoKHR VulkanVideoProcessor::GetVkProfile() const { - - VkVideoProfileInfoKHR videoProfile({VK_STRUCTURE_TYPE_VIDEO_PROFILE_INFO_KHR, 
NULL, + VkVideoProfileInfoKHR videoProfile {VK_STRUCTURE_TYPE_VIDEO_PROFILE_INFO_KHR, NULL, m_videoStreamDemuxer->GetVideoCodec(), m_videoStreamDemuxer->GetChromaSubsampling(), m_videoStreamDemuxer->GetLumaBitDepth(), - m_videoStreamDemuxer->GetChromaBitDepth()}); + m_videoStreamDemuxer->GetChromaBitDepth()}; return videoProfile; } @@ -229,10 +230,10 @@ VkFormat VulkanVideoProcessor::GetFrameImageFormat() const VkExtent3D VulkanVideoProcessor::GetVideoExtent() const { - VkExtent3D extent ({ (uint32_t)m_videoStreamDemuxer->GetWidth(), - (uint32_t)m_videoStreamDemuxer->GetHeight(), - (uint32_t)1 - }); + VkExtent3D extent { (uint32_t)m_videoStreamDemuxer->GetWidth(), + (uint32_t)m_videoStreamDemuxer->GetHeight(), + (uint32_t)1 + }; return extent; } diff --git a/common/libs/VkCodecUtils/VulkanVideoProcessor.h b/common/libs/VkCodecUtils/VulkanVideoProcessor.h index cbdca1f1..0eb08e9c 100644 --- a/common/libs/VkCodecUtils/VulkanVideoProcessor.h +++ b/common/libs/VkCodecUtils/VulkanVideoProcessor.h @@ -23,6 +23,10 @@ #include "VkCodecUtils/VkVideoQueue.h" #include "VkVideoFrameOutput.h" +// Forward declarations +class VulkanDeviceContext; +struct VkMpFormatInfo; + class VulkanVideoProcessor : public VkVideoQueue { public: diff --git a/common/libs/VkShell/Shell.h b/common/libs/VkShell/Shell.h index c9c6c233..b91223b0 100644 --- a/common/libs/VkShell/Shell.h +++ b/common/libs/VkShell/Shell.h @@ -66,7 +66,10 @@ class Shell : public VkWsiDisplay, public VkVideoRefCountBase { if ((res != VK_SUCCESS) && (res != VK_SUBOPTIMAL_KHR)) { std::stringstream ss; ss << "VkResult " << res << " returned"; +#ifdef __cpp_exceptions throw std::runtime_error(ss.str()); +#endif // __cpp_exceptions + } return res; diff --git a/vk_video_decoder/libs/NvVideoParser/src/VulkanAV1Decoder.cpp b/vk_video_decoder/libs/NvVideoParser/src/VulkanAV1Decoder.cpp index bc65f33f..c401eec1 100644 --- a/vk_video_decoder/libs/NvVideoParser/src/VulkanAV1Decoder.cpp +++ 
b/vk_video_decoder/libs/NvVideoParser/src/VulkanAV1Decoder.cpp @@ -1132,7 +1132,7 @@ static uint32_t tile_log2(int blk_size, int target) return k; } -uint32_t FloorLog2(uint32_t x) +static uint32_t FloorLog2(uint32_t x) { int s = 0; @@ -2289,7 +2289,11 @@ bool VulkanAV1Decoder::ParseObuTileGroup(const AV1ObuHeader& hdr) consumedBytes += tile_size_bytes_minus_1 + 1; m_PicData.tileOffsets[m_PicData.khr_info.tileCount] = (uint32_t)m_nalu.start_offset + (uint32_t)consumedBytes; - tileSize = tile_size_minus_1 + 1; + // Add bounds checking and safe conversion + if (tile_size_minus_1 > (SIZE_MAX - 1)) { + return false; // Tile size too large + } + tileSize = (size_t)(tile_size_minus_1 + 1); consumedBytes += (uint32_t)tileSize; skip_bits((uint32_t)(tileSize * 8)); @@ -2302,7 +2306,7 @@ bool VulkanAV1Decoder::ParseObuTileGroup(const AV1ObuHeader& hdr) return (tg_end == num_tiles - 1); } -bool IsObuInCurrentOperatingPoint(int current_operating_point, AV1ObuHeader *hdr) { +static bool IsObuInCurrentOperatingPoint(int current_operating_point, AV1ObuHeader *hdr) { if (current_operating_point == 0) return true; if (((current_operating_point >> hdr->temporal_id) & 0x1) && ((current_operating_point >> (hdr->spatial_id + 8)) & 0x1)) { diff --git a/vk_video_decoder/libs/NvVideoParser/src/VulkanAV1GlobalMotionDec.cpp b/vk_video_decoder/libs/NvVideoParser/src/VulkanAV1GlobalMotionDec.cpp index e5a35316..37691fe5 100644 --- a/vk_video_decoder/libs/NvVideoParser/src/VulkanAV1GlobalMotionDec.cpp +++ b/vk_video_decoder/libs/NvVideoParser/src/VulkanAV1GlobalMotionDec.cpp @@ -82,7 +82,7 @@ #define WARP_PARAM_REDUCE_BITS 6 #define WARPEDMODEL_PREC_BITS 16 -int get_msb(unsigned int n) +static int get_msb(unsigned int n) { int log = 0; unsigned int value = n; diff --git a/vk_video_decoder/src/vulkan_video_decoder.cpp b/vk_video_decoder/src/vulkan_video_decoder.cpp index 1d0e0541..f98f3f82 100644 --- a/vk_video_decoder/src/vulkan_video_decoder.cpp +++ 
b/vk_video_decoder/src/vulkan_video_decoder.cpp @@ -66,10 +66,11 @@ class VulkanVideoDecoderImpl : public VulkanVideoDecoder { virtual VkExtent3D GetVideoExtent() const { - VkExtent3D extent ({ (uint32_t)m_vulkanVideoProcessor->GetWidth(), - (uint32_t)m_vulkanVideoProcessor->GetHeight(), - (uint32_t)1 - }); + VkExtent3D extent { + (uint32_t)m_vulkanVideoProcessor->GetWidth(), + (uint32_t)m_vulkanVideoProcessor->GetHeight(), + 1 + }; return extent; } diff --git a/vk_video_encoder/demos/vk-video-enc/Main.cpp b/vk_video_encoder/demos/vk-video-enc/Main.cpp index 31d24b2d..bb849f72 100644 --- a/vk_video_encoder/demos/vk-video-enc/Main.cpp +++ b/vk_video_encoder/demos/vk-video-enc/Main.cpp @@ -21,7 +21,7 @@ #include "VkCodecUtils/VulkanEncoderFrameProcessor.h" #include "VkShell/Shell.h" -int main(int argc, char** argv) +int main(int argc, const char* argv[]) { VkSharedBaseObj encoderConfig; if (VK_SUCCESS != EncoderConfig::CreateCodecConfig(argc, argv, encoderConfig)) { diff --git a/vk_video_encoder/include/vulkan_video_encoder.h b/vk_video_encoder/include/vulkan_video_encoder.h index e757f238..f170fd4a 100644 --- a/vk_video_encoder/include/vulkan_video_encoder.h +++ b/vk_video_encoder/include/vulkan_video_encoder.h @@ -43,7 +43,7 @@ class VulkanVideoEncoder : public virtual VkVideoRefCountBase { public: virtual VkResult Initialize(VkVideoCodecOperationFlagBitsKHR videoCodecOperation, - int argc, char** argv) = 0; + int argc, const char** argv) = 0; virtual int64_t GetNumberOfFrames() = 0; virtual VkResult EncodeNextFrame(int64_t& frameNumEncoded) = 0; virtual VkResult GetBitstream() = 0; @@ -52,7 +52,7 @@ class VulkanVideoEncoder : public virtual VkVideoRefCountBase { extern "C" VK_VIDEO_ENCODER_EXPORT VkResult CreateVulkanVideoEncoder(VkVideoCodecOperationFlagBitsKHR videoCodecOperation, - int argc, char** argv, + int argc, const char** argv, VkSharedBaseObj& vulkanVideoEncoder); #endif /* _VULKAN_VIDEO_ENCODER_H_ */ diff --git 
a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfig.cpp b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfig.cpp index 53d7cec3..fdfe92de 100644 --- a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfig.cpp +++ b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfig.cpp @@ -19,7 +19,7 @@ #include "VkVideoEncoder/VkEncoderConfigH265.h" #include "VkVideoEncoder/VkEncoderConfigAV1.h" -void printHelp(VkVideoCodecOperationFlagBitsKHR codec) +static void printHelp(VkVideoCodecOperationFlagBitsKHR codec) { fprintf(stderr, "Version: " VKVS_VERSION_STRING "\n"\ @@ -156,10 +156,10 @@ void printHelp(VkVideoCodecOperationFlagBitsKHR codec) } } -int EncoderConfig::ParseArguments(int argc, char *argv[]) +int EncoderConfig::ParseArguments(int argc, const char *argv[]) { int argcount = 0; - std::vector arglist; + std::vector arglist; std::vector args(argv, argv + argc); uint32_t frameCount = 0; @@ -572,7 +572,7 @@ int EncoderConfig::ParseArguments(int argc, char *argv[]) gopStructure.SetIntraRefreshSkippedStartIndex(intraRefreshSkippedStartIndex); } else { argcount++; - arglist.push_back((char*)args[i].c_str()); + arglist.push_back(args[i].c_str()); } } @@ -703,7 +703,7 @@ int EncoderConfig::ParseArguments(int argc, char *argv[]) return DoParseArguments(argcount, arglist.data()); } -VkResult EncoderConfig::CreateCodecConfig(int argc, char *argv[], +VkResult EncoderConfig::CreateCodecConfig(int argc, const char *argv[], VkSharedBaseObj& encoderConfig) { diff --git a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfig.h b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfig.h index 94adb438..896c1636 100644 --- a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfig.h +++ b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfig.h @@ -913,13 +913,13 @@ struct EncoderConfig : public VkVideoRefCountBase { } // Factory Function - static VkResult CreateCodecConfig(int argc, char *argv[], VkSharedBaseObj& encoderConfig); + static VkResult CreateCodecConfig(int argc, const char *argv[], 
VkSharedBaseObj& encoderConfig); void InitVideoProfile(); - int ParseArguments(int argc, char *argv[]); + int ParseArguments(int argc, const char *argv[]); - virtual int DoParseArguments(int argc, char *argv[]) { + virtual int DoParseArguments(int argc, const char *argv[]) { if (argc > 0) { std::cout << "Invalid paramters: "; for (int i = 0; i < argc; i++) { diff --git a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigAV1.cpp b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigAV1.cpp index aeab421d..c3ba67c1 100644 --- a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigAV1.cpp +++ b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigAV1.cpp @@ -26,7 +26,7 @@ } \ } -int EncoderConfigAV1::DoParseArguments(int argc, char* argv[]) +int EncoderConfigAV1::DoParseArguments(int argc, const char* argv[]) { // No validation of command line options. So, all options must be valid and // values with in the limits of vulkan and av1 specification diff --git a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigAV1.h b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigAV1.h index 0838e2c8..622977d6 100644 --- a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigAV1.h +++ b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigAV1.h @@ -88,7 +88,7 @@ struct EncoderConfigAV1 : public EncoderConfig { } virtual ~EncoderConfigAV1() {} - virtual int DoParseArguments(int argc, char* argv[]) override; + virtual int DoParseArguments(int argc, const char* argv[]) override; virtual VkResult InitializeParameters() override { diff --git a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderDpbH264.h b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderDpbH264.h index a54bed3c..c828c3c7 100644 --- a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderDpbH264.h +++ b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderDpbH264.h @@ -139,7 +139,7 @@ class VkEncDpbH264 const StdVideoEncodeH264PictureInfo *GetCurrentDpbEntry(void) { assert((m_currDpbIdx < m_max_dpb_size) || (m_currDpbIdx == MAX_DPB_SLOTS)); - 
return &m_DPB[m_currDpbIdx].picInfo; + return &m_DPB[(int)m_currDpbIdx].picInfo; } uint32_t GetUpdatedFrameNumAndPicOrderCnt(int32_t& PicOrderCnt) diff --git a/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.cpp b/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.cpp index 84e83dfa..5468511c 100644 --- a/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.cpp +++ b/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.cpp @@ -476,8 +476,10 @@ void VkVideoEncoder::CopyYCbCrPlanesDirectCPU( } // Source and destination strides - const size_t srcStride = inputPlaneLayouts[plane].rowPitch; - const size_t dstStride = dstSubresourceLayout[plane].rowPitch; + assert(inputPlaneLayouts[plane].rowPitch <= SIZE_MAX); + assert(dstSubresourceLayout[plane].rowPitch <= SIZE_MAX); + const size_t srcStride = (size_t)inputPlaneLayouts[plane].rowPitch; + const size_t dstStride = (size_t)dstSubresourceLayout[plane].rowPitch; // Line width in bytes const size_t lineBytes = planeWidth * bytesPerPixel; @@ -1449,7 +1451,9 @@ VkImageLayout VkVideoEncoder::TransitionImageLayout(VkCommandBuffer cmdBuf, imageBarrier.srcStageMask = VK_PIPELINE_STAGE_2_VIDEO_ENCODE_BIT_KHR; imageBarrier.dstStageMask = VK_PIPELINE_STAGE_2_VIDEO_ENCODE_BIT_KHR; } else { +#ifdef __cpp_exceptions throw std::invalid_argument("unsupported layout transition!"); +#endif } const VkDependencyInfoKHR dependencyInfo = { diff --git a/vk_video_encoder/libs/VkVideoEncoder/VkVideoGopStructure.h b/vk_video_encoder/libs/VkVideoEncoder/VkVideoGopStructure.h index d3b1ab0a..2ab76bcd 100644 --- a/vk_video_encoder/libs/VkVideoEncoder/VkVideoGopStructure.h +++ b/vk_video_encoder/libs/VkVideoEncoder/VkVideoGopStructure.h @@ -25,6 +25,7 @@ #include #include #include +#include // for std::min static const uint32_t MAX_GOP_SIZE = 64; @@ -207,15 +208,15 @@ class VkVideoGopStructure { uint32_t periodDelta = INT32_MAX; // the delta of this frame to the next closed GOP reference. 
-1 if it is not a B-frame if (framesLeft <= consecutiveBFrameCount) { // Handle last frames sequence - periodDelta = std::min(periodDelta, framesLeft); + periodDelta = std::min(periodDelta, framesLeft); } if (m_idrPeriod > 0) { // Is the IDR period valid - periodDelta = std::min(periodDelta, GetPeriodDelta(gopState, m_idrPeriod)); + periodDelta = std::min(periodDelta, GetPeriodDelta(gopState, m_idrPeriod)); } if (m_closedGop) { // A closed GOP is required. - periodDelta = std::min(periodDelta, GetPeriodDelta(gopState, m_gopFrameCount)); + periodDelta = std::min(periodDelta, GetPeriodDelta(gopState, m_gopFrameCount)); } uint32_t refDelta = INT32_MAX; // the delta of this frame from the last reference. -1 if it is not a B-frame diff --git a/vk_video_encoder/src/vulkan_video_encoder.cpp b/vk_video_encoder/src/vulkan_video_encoder.cpp index 61c3637d..ae44f7ce 100644 --- a/vk_video_encoder/src/vulkan_video_encoder.cpp +++ b/vk_video_encoder/src/vulkan_video_encoder.cpp @@ -23,7 +23,7 @@ class VulkanVideoEncoderImpl : public VulkanVideoEncoder { public: virtual VkResult Initialize(VkVideoCodecOperationFlagBitsKHR videoCodecOperation, - int argc, char** argv); + int argc, const char** argv); virtual int64_t GetNumberOfFrames() { return m_encoderConfig->numFrames; @@ -81,7 +81,7 @@ class VulkanVideoEncoderImpl : public VulkanVideoEncoder { }; VkResult VulkanVideoEncoderImpl::Initialize(VkVideoCodecOperationFlagBitsKHR videoCodecOperation, - int argc, char** argv) + int argc, const char** argv) { VkResult result = EncoderConfig::CreateCodecConfig(argc, argv, m_encoderConfig); if (VK_SUCCESS != result) { @@ -235,7 +235,7 @@ VkResult VulkanVideoEncoderImpl::EncodeNextFrame(int64_t& frameNumEncoded) VK_VIDEO_ENCODER_EXPORT VkResult CreateVulkanVideoEncoder(VkVideoCodecOperationFlagBitsKHR videoCodecOperation, - int argc, char** argv, + int argc, const char** argv, VkSharedBaseObj& vulkanVideoEncoder) { switch((uint32_t)videoCodecOperation) diff --git 
a/vk_video_encoder/test/vulkan-video-enc/Main.cpp b/vk_video_encoder/test/vulkan-video-enc/Main.cpp index 58c5cb49..09f55420 100644 --- a/vk_video_encoder/test/vulkan-video-enc/Main.cpp +++ b/vk_video_encoder/test/vulkan-video-enc/Main.cpp @@ -18,7 +18,7 @@ #include "vulkan_video_encoder.h" #include "VkVSCommon.h" -int main(int argc, char** argv) +int main(int argc, const char** argv) { std::cout << "Enter encoder test" << std::endl; VkSharedBaseObj vulkanVideoEncoder; From 804b94d3c8ea7240faa6aac143d9b42c8ad38c33 Mon Sep 17 00:00:00 2001 From: "Vassili Nikolaev (NVIDIA)" Date: Mon, 9 Jun 2025 09:13:35 -0500 Subject: [PATCH 05/14] common: Use the CRC generator instead of embedding it --- .../libs/VkCodecUtils/VkVideoFrameToFile.cpp | 75 +------------------ .../demos/vk-video-dec/CMakeLists.txt | 1 + .../test/vulkan-video-dec/CMakeLists.txt | 1 + .../vulkan-video-simple-dec/CMakeLists.txt | 1 + .../demos/vk-video-enc/CMakeLists.txt | 1 + 5 files changed, 5 insertions(+), 74 deletions(-) diff --git a/common/libs/VkCodecUtils/VkVideoFrameToFile.cpp b/common/libs/VkCodecUtils/VkVideoFrameToFile.cpp index c5ca2d0e..0336a7f5 100644 --- a/common/libs/VkCodecUtils/VkVideoFrameToFile.cpp +++ b/common/libs/VkCodecUtils/VkVideoFrameToFile.cpp @@ -23,80 +23,7 @@ #include "VulkanDecodedFrame.h" #include "Helpers.h" #include "VkVideoFrameOutput.h" - -// CRC32 lookup table -static unsigned long Crc32Table[256] = { - 0x00000000,0x77073096,0xee0e612c,0x990951ba, - 0x076dc419,0x706af48f,0xe963a535,0x9e6495a3, - 0x0edb8832,0x79dcb8a4,0xe0d5e91e,0x97d2d988, - 0x09b64c2b,0x7eb17cbd,0xe7b82d07,0x90bf1d91, - 0x1db71064,0x6ab020f2,0xf3b97148,0x84be41de, - 0x1adad47d,0x6ddde4eb,0xf4d4b551,0x83d385c7, - 0x136c9856,0x646ba8c0,0xfd62f97a,0x8a65c9ec, - 0x14015c4f,0x63066cd9,0xfa0f3d63,0x8d080df5, - 0x3b6e20c8,0x4c69105e,0xd56041e4,0xa2677172, - 0x3c03e4d1,0x4b04d447,0xd20d85fd,0xa50ab56b, - 0x35b5a8fa,0x42b2986c,0xdbbbc9d6,0xacbcf940, - 0x32d86ce3,0x45df5c75,0xdcd60dcf,0xabd13d59, - 
0x26d930ac,0x51de003a,0xc8d75180,0xbfd06116, - 0x21b4f4b5,0x56b3c423,0xcfba9599,0xb8bda50f, - 0x2802b89e,0x5f058808,0xc60cd9b2,0xb10be924, - 0x2f6f7c87,0x58684c11,0xc1611dab,0xb6662d3d, - 0x76dc4190,0x01db7106,0x98d220bc,0xefd5102a, - 0x71b18589,0x06b6b51f,0x9fbfe4a5,0xe8b8d433, - 0x7807c9a2,0x0f00f934,0x9609a88e,0xe10e9818, - 0x7f6a0dbb,0x086d3d2d,0x91646c97,0xe6635c01, - 0x6b6b51f4,0x1c6c6162,0x856530d8,0xf262004e, - 0x6c0695ed,0x1b01a57b,0x8208f4c1,0xf50fc457, - 0x65b0d9c6,0x12b7e950,0x8bbeb8ea,0xfcb9887c, - 0x62dd1ddf,0x15da2d49,0x8cd37cf3,0xfbd44c65, - 0x4db26158,0x3ab551ce,0xa3bc0074,0xd4bb30e2, - 0x4adfa541,0x3dd895d7,0xa4d1c46d,0xd3d6f4fb, - 0x4369e96a,0x346ed9fc,0xad678846,0xda60b8d0, - 0x44042d73,0x33031de5,0xaa0a4c5f,0xdd0d7cc9, - 0x5005713c,0x270241aa,0xbe0b1010,0xc90c2086, - 0x5768b525,0x206f85b3,0xb966d409,0xce61e49f, - 0x5edef90e,0x29d9c998,0xb0d09822,0xc7d7a8b4, - 0x59b33d17,0x2eb40d81,0xb7bd5c3b,0xc0ba6cad, - 0xedb88320,0x9abfb3b6,0x03b6e20c,0x74b1d29a, - 0xead54739,0x9dd277af,0x04db2615,0x73dc1683, - 0xe3630b12,0x94643b84,0x0d6d6a3e,0x7a6a5aa8, - 0xe40ecf0b,0x9309ff9d,0x0a00ae27,0x7d079eb1, - 0xf00f9344,0x8708a3d2,0x1e01f268,0x6906c2fe, - 0xf762575d,0x806567cb,0x196c3671,0x6e6b06e7, - 0xfed41b76,0x89d32be0,0x10da7a5a,0x67dd4acc, - 0xf9b9df6f,0x8ebeeff9,0x17b7be43,0x60b08ed5, - 0xd6d6a3e8,0xa1d1937e,0x38d8c2c4,0x4fdff252, - 0xd1bb67f1,0xa6bc5767,0x3fb506dd,0x48b2364b, - 0xd80d2bda,0xaf0a1b4c,0x36034af6,0x41047a60, - 0xdf60efc3,0xa867df55,0x316e8eef,0x4669be79, - 0xcb61b38c,0xbc66831a,0x256fd2a0,0x5268e236, - 0xcc0c7795,0xbb0b4703,0x220216b9,0x5505262f, - 0xc5ba3bbe,0xb2bd0b28,0x2bb45a92,0x5cb36a04, - 0xc2d7ffa7,0xb5d0cf31,0x2cd99e8b,0x5bdeae1d, - 0x9b64c2b0,0xec63f226,0x756aa39c,0x026d930a, - 0x9c0906a9,0xeb0e363f,0x72076785,0x05005713, - 0x95bf4a82,0xe2b87a14,0x7bb12bae,0x0cb61b38, - 0x92d28e9b,0xe5d5be0d,0x7cdcefb7,0x0bdbdf21, - 0x86d3d2d4,0xf1d4e242,0x68ddb3f8,0x1fda836e, - 0x81be16cd,0xf6b9265b,0x6fb077e1,0x18b74777, - 
0x88085ae6,0xff0f6a70,0x66063bca,0x11010b5c, - 0x8f659eff,0xf862ae69,0x616bffd3,0x166ccf45, - 0xa00ae278,0xd70dd2ee,0x4e048354,0x3903b3c2, - 0xa7672661,0xd06016f7,0x4969474d,0x3e6e77db, - 0xaed16a4a,0xd9d65adc,0x40df0b66,0x37d83bf0, - 0xa9bcae53,0xdebb9ec5,0x47b2cf7f,0x30b5ffe9, - 0xbdbdf21c,0xcabac28a,0x53b39330,0x24b4a3a6, - 0xbad03605,0xcdd70693,0x54de5729,0x23d967bf, - 0xb3667a2e,0xc4614ab8,0x5d681b02,0x2a6f2b94, - 0xb40bbe37,0xc30c8ea1,0x5a05df1b,0x2d02ef8d -}; - -static void getCRC(uint32_t *checksum, const uint8_t *inputBytes, size_t length, unsigned long crcTable[]) { - for (size_t i = 0; i < length; i += 1) { - *checksum = crcTable[inputBytes[i] ^ (*checksum & 0xff)] ^ (*checksum >> 8); - } -} +#include "crcgenerator.h" // Rotate right for 16-bit unsigned integers. // Used to normalize MSB-aligned high bit-depth samples (10-bit, 12-bit) to LSB-aligned. diff --git a/vk_video_decoder/demos/vk-video-dec/CMakeLists.txt b/vk_video_decoder/demos/vk-video-dec/CMakeLists.txt index 30e3e4cd..5ebba8a3 100644 --- a/vk_video_decoder/demos/vk-video-dec/CMakeLists.txt +++ b/vk_video_decoder/demos/vk-video-dec/CMakeLists.txt @@ -50,6 +50,7 @@ set(sources ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/VulkanCommandBufferPool.cpp ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/VulkanCommandBufferPool.h ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/VkVideoFrameToFile.cpp + ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/crcgenerator.cpp ${VK_VIDEO_DECODER_LIBS_SOURCE_ROOT}/VkDecoderUtils/VideoStreamDemuxer.cpp ${VK_VIDEO_DECODER_LIBS_SOURCE_ROOT}/VkDecoderUtils/VideoStreamDemuxer.h ${VK_VIDEO_DECODER_LIBS_SOURCE_ROOT}/VkDecoderUtils/ElementaryStream.cpp diff --git a/vk_video_decoder/test/vulkan-video-dec/CMakeLists.txt b/vk_video_decoder/test/vulkan-video-dec/CMakeLists.txt index 7f10d58f..084a6676 100644 --- a/vk_video_decoder/test/vulkan-video-dec/CMakeLists.txt +++ b/vk_video_decoder/test/vulkan-video-dec/CMakeLists.txt @@ -30,6 +30,7 @@ 
set(VULKAN_VIDEO_DEC_SOURCES ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/VulkanSamplerYcbcrConversion.h ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/nvVkFormats.cpp ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/VkVideoFrameToFile.cpp + ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/crcgenerator.cpp ${VK_VIDEO_DECODER_LIBS_SOURCE_ROOT}/VkDecoderUtils/VideoStreamDemuxer.cpp ${VK_VIDEO_DECODER_LIBS_SOURCE_ROOT}/VkDecoderUtils/VideoStreamDemuxer.h ${VK_VIDEO_DECODER_LIBS_SOURCE_ROOT}/VkDecoderUtils/ElementaryStream.cpp diff --git a/vk_video_decoder/test/vulkan-video-simple-dec/CMakeLists.txt b/vk_video_decoder/test/vulkan-video-simple-dec/CMakeLists.txt index 30cf00be..d533f95e 100644 --- a/vk_video_decoder/test/vulkan-video-simple-dec/CMakeLists.txt +++ b/vk_video_decoder/test/vulkan-video-simple-dec/CMakeLists.txt @@ -1,6 +1,7 @@ set(VULKAN_VIDEO_SIMPLE_DEC_SOURCES Main.cpp ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/nvVkFormats.cpp + ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/crcgenerator.cpp ${VK_VIDEO_DECODER_LIBS_SOURCE_ROOT}/VkDecoderUtils/VideoStreamDemuxer.cpp ${VK_VIDEO_DECODER_LIBS_SOURCE_ROOT}/VkDecoderUtils/VideoStreamDemuxer.h ${VK_VIDEO_DECODER_LIBS_SOURCE_ROOT}/VkDecoderUtils/ElementaryStream.cpp diff --git a/vk_video_encoder/demos/vk-video-enc/CMakeLists.txt b/vk_video_encoder/demos/vk-video-enc/CMakeLists.txt index 7fe88cc3..33dfbc3e 100644 --- a/vk_video_encoder/demos/vk-video-enc/CMakeLists.txt +++ b/vk_video_encoder/demos/vk-video-enc/CMakeLists.txt @@ -80,6 +80,7 @@ set(sources ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/nvVkFormats.cpp ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/VulkanBistreamBufferImpl.h ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/VulkanBistreamBufferImpl.cpp + ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/crcgenerator.cpp ${VK_VIDEO_DECODER_LIBS_SOURCE_ROOT}/VulkanVideoFrameBuffer/VulkanVideoFrameBuffer.h 
${VK_VIDEO_DECODER_LIBS_SOURCE_ROOT}/VulkanVideoFrameBuffer/VulkanVideoFrameBuffer.cpp ) From cfb93a786600c3a29acc140ae8cba5f52fbd5fa4 Mon Sep 17 00:00:00 2001 From: "Vassili Nikolaev (NVIDIA)" Date: Mon, 9 Jun 2025 09:15:59 -0500 Subject: [PATCH 06/14] encoder: Deal with the Vulkan chained structures --- common/libs/VkCodecUtils/Helpers.h | 22 +++++++++--------- .../libs/VkVideoEncoder/VkVideoEncoder.cpp | 23 ++++++++----------- .../libs/VkVideoEncoder/VkVideoEncoder.h | 2 +- 3 files changed, 22 insertions(+), 25 deletions(-) diff --git a/common/libs/VkCodecUtils/Helpers.h b/common/libs/VkCodecUtils/Helpers.h index 333548e0..b74e71a3 100644 --- a/common/libs/VkCodecUtils/Helpers.h +++ b/common/libs/VkCodecUtils/Helpers.h @@ -320,7 +320,7 @@ inline VkResult WaitAndGetStatus(const VkInterfaceFunctions* vkIf, VkDevice devi } template -inline VkBaseInStructure* ChainNextVkStruct(NodeType& node, ChainedNodeType& nextChainedNode) { +inline void ChainNextVkStruct(NodeType& node, ChainedNodeType& nextChainedNode) { // make sure the node is of type VkBaseInStructure static_assert(offsetof(NodeType, sType) == offsetof(VkBaseInStructure, sType), "NodeType does not have sType at the same offset as VkBaseInStructure"); @@ -341,16 +341,16 @@ inline VkBaseInStructure* ChainNextVkStruct(NodeType& node, ChainedNodeType& nex "ChainedNodeType must be a standard-layout type"); assert(node.sType > 0); - VkBaseInStructure* pNode = (VkBaseInStructure*)&node; - while (pNode->pNext != nullptr) { - pNode = (VkBaseInStructure*)pNode->pNext; - } - pNode->pNext = (VkBaseInStructure*)&nextChainedNode; - // make sure the nextChainedNode is of type VkBaseInStructure - assert(nextChainedNode.sType > 0); - assert(nextChainedNode.pNext == nullptr); - return (VkBaseInStructure*)nextChainedNode.pNext; - } + VkBaseInStructure* pNode = (VkBaseInStructure*)(&node); + VkBaseInStructure* pNextNode = (VkBaseInStructure*)(&nextChainedNode); + + // The incoming object may not have anything chained. 
+ assert(pNextNode->pNext == nullptr); + + // Inserts the incoming object at the beginning of the list. + pNextNode->pNext = pNode->pNext; + pNode->pNext = pNextNode; +} class DeviceUuidUtils { diff --git a/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.cpp b/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.cpp index 5468511c..536ad489 100644 --- a/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.cpp +++ b/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.cpp @@ -1658,12 +1658,9 @@ VkResult VkVideoEncoder::HandleCtrlCmd(VkSharedBaseObj& encodeFrameInfo->qualityLevelInfo.sType = VK_STRUCTURE_TYPE_VIDEO_ENCODE_QUALITY_LEVEL_INFO_KHR; encodeFrameInfo->qualityLevelInfo.qualityLevel = encodeFrameInfo->qualityLevel; if (pNext != nullptr) { - if (encodeFrameInfo->rateControlInfo.pNext == nullptr) { - encodeFrameInfo->rateControlInfo.pNext = pNext; - } else { - ((VkBaseInStructure*)(encodeFrameInfo->rateControlInfo.pNext))->pNext = pNext; - } + vk::ChainNextVkStruct(encodeFrameInfo->rateControlInfo, *pNext); } + pNext = (VkBaseInStructure*)&encodeFrameInfo->qualityLevelInfo; } @@ -1686,12 +1683,9 @@ VkResult VkVideoEncoder::HandleCtrlCmd(VkSharedBaseObj& m_beginRateControlInfo = encodeFrameInfo->rateControlInfo; if (pNext != nullptr) { - if (encodeFrameInfo->rateControlInfo.pNext == nullptr) { - encodeFrameInfo->rateControlInfo.pNext = pNext; - } else { - ((VkBaseInStructure*)(encodeFrameInfo->rateControlInfo.pNext))->pNext = pNext; - } + vk::ChainNextVkStruct(encodeFrameInfo->rateControlInfo, *pNext); } + pNext = (VkBaseInStructure*)&encodeFrameInfo->rateControlInfo; } @@ -1771,7 +1765,8 @@ VkResult VkVideoEncoder::RecordVideoCodingCmd(VkSharedBaseObjCmdControlVideoCodingKHR(cmdBuf, &renderControlInfo); m_beginRateControlInfo = *(VkVideoEncodeRateControlInfoKHR*)encodeFrameInfo->pControlCmdChain; - ((VkBaseInStructure*)(m_beginRateControlInfo.pNext))->pNext = NULL; + // Do not walk the chain, otherwise we end up creating a loop here. 
+ m_beginRateControlInfo.pNext = (VkBaseInStructure*)(&encodeFrameInfo->pControlCmdChain); } if (m_videoMaintenance1FeaturesSupported) @@ -1783,10 +1778,12 @@ VkResult VkVideoEncoder::RecordVideoCodingCmd(VkSharedBaseObjencodeInfo; - while (pStruct->pNext) pStruct = (VkBaseInStructure*)pStruct->pNext; - pStruct->pNext = (VkBaseInStructure*)&videoInlineQueryInfoKHR; + vk::ChainNextVkStruct(*pStruct, videoInlineQueryInfoKHR); vkDevCtx->CmdEncodeVideoKHR(cmdBuf, &encodeFrameInfo->encodeInfo); + + // Remove the stack pointer from the chain, causes a use after free otherwise in GetEncodeFrameInfoH264 + encodeFrameInfo->encodeInfo.pNext = videoInlineQueryInfoKHR.pNext; } else { diff --git a/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.h b/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.h index c939bda6..dacc2929 100644 --- a/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.h +++ b/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.h @@ -58,7 +58,7 @@ class VkVideoEncoder : public VkVideoRefCountBase { { VkStructureType GetType() { return (encodeInfo.pNext == nullptr) ? 
- VK_STRUCTURE_TYPE_VIDEO_ENCODE_INFO_KHR : ((VkBaseInStructure*)encodeInfo.pNext)->sType; + VK_STRUCTURE_TYPE_VIDEO_ENCODE_INFO_KHR : reinterpret_cast(encodeInfo.pNext)->sType; } VkVideoEncodeFrameInfo(const void* pNext = nullptr) From 990ca866e9f62b31ebddcd8e26e2b122c22e3d4e Mon Sep 17 00:00:00 2001 From: "Vassili Nikolaev (NVIDIA)" Date: Fri, 23 May 2025 08:50:47 -0700 Subject: [PATCH 07/14] common: Fix frame to file from adding .yuv to filenames with .y4m already Signed-off-by: Vassili Nikolaev (NVIDIA) --- common/libs/VkCodecUtils/VkVideoFrameToFile.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/libs/VkCodecUtils/VkVideoFrameToFile.cpp b/common/libs/VkCodecUtils/VkVideoFrameToFile.cpp index 0336a7f5..1ca1e2f6 100644 --- a/common/libs/VkCodecUtils/VkVideoFrameToFile.cpp +++ b/common/libs/VkCodecUtils/VkVideoFrameToFile.cpp @@ -218,7 +218,7 @@ class VkVideoFrameToFileImpl : public VkVideoFrameOutput { << std::endl; fileNameWithModExt = fileName + std::string(".y4m"); fileName = fileNameWithModExt.c_str(); - } else if (!hasExtension(fileName, ".yuv")) { + } else if ((y4mFormat == false) && !hasExtension(fileName, ".yuv")) { std::cout << std::endl << "Raw yuv output format is requested, "; std::cout << "but the output file's (" << fileName << ") extension isn't .yuv!" 
<< std::endl; From 0b14224410e58f1ae807ecc3dd3292b59a397713 Mon Sep 17 00:00:00 2001 From: "Vassili Nikolaev (NVIDIA)" Date: Tue, 17 Jun 2025 15:57:24 -0700 Subject: [PATCH 08/14] cmake: Add cast-qual to the cmake settings Signed-off-by: Vassili Nikolaev (NVIDIA) --- cmake/LinuxSettings.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/LinuxSettings.cmake b/cmake/LinuxSettings.cmake index a90e96ee..f9f3c727 100644 --- a/cmake/LinuxSettings.cmake +++ b/cmake/LinuxSettings.cmake @@ -84,7 +84,7 @@ endif() # Compiler flags for GCC/Clang if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_C_COMPILER_ID MATCHES "Clang") - set(COMMON_COMPILE_FLAGS "-Wall -Wextra -Wundef -Wno-unused-parameter -Wno-missing-field-initializers -Wshadow") + set(COMMON_COMPILE_FLAGS "-Wall -Wextra -Wundef -Wno-unused-parameter -Wno-missing-field-initializers -Wshadow -Wcast-qual") set(COMMON_COMPILE_FLAGS "${COMMON_COMPILE_FLAGS} -fno-strict-aliasing -fno-builtin-memcmp") # Warning about implicit fallthrough in switch blocks From ec05fb5469779f1ae1f76ec45fc1c095aebe68c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Cerveau?= Date: Mon, 15 Dec 2025 17:34:56 +0100 Subject: [PATCH 09/14] EncoderConfig: fix override error to use const DoParseArguments changed with: a934d3b common: Code compilation fixes --- vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH264.cpp | 2 +- vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH264.h | 2 +- vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH265.cpp | 2 +- vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH265.h | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH264.cpp b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH264.cpp index 68829578..e9c94bed 100644 --- a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH264.cpp +++ b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH264.cpp @@ -17,7 +17,7 @@ #include 
"VkVideoEncoder/VkEncoderConfigH264.h" #include "VkVideoEncoder/VkVideoEncoderH264.h" -int EncoderConfigH264::DoParseArguments(int argc, char* argv[]) +int EncoderConfigH264::DoParseArguments(int argc, const char* argv[]) { std::vector args(argv, argv + argc); diff --git a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH264.h b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH264.h index fb1c0611..6d8865a5 100644 --- a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH264.h +++ b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH264.h @@ -156,7 +156,7 @@ struct EncoderConfigH264 : public EncoderConfig { const LevelLimits* levelLimits; size_t levelLimitsSize; - virtual int DoParseArguments(int argc, char* argv[]) override; + virtual int DoParseArguments(int argc, const char* argv[]) override; StdVideoH264LevelIdc DetermineLevel(uint8_t dpbSize, uint32_t bitrate, diff --git a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH265.cpp b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH265.cpp index b4a03ce1..33bcc53e 100644 --- a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH265.cpp +++ b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH265.cpp @@ -68,7 +68,7 @@ uint32_t EncoderConfigH265::GetCpbVclFactor() return baseFactor + depthFactor; } -int EncoderConfigH265::DoParseArguments(int argc, char* argv[]) +int EncoderConfigH265::DoParseArguments(int argc, const char* argv[]) { std::vector args(argv, argv + argc); diff --git a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH265.h b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH265.h index ebc5ca38..774bf1a9 100644 --- a/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH265.h +++ b/vk_video_encoder/libs/VkVideoEncoder/VkEncoderConfigH265.h @@ -135,7 +135,7 @@ struct EncoderConfigH265 : public EncoderConfig { return this; } - virtual int DoParseArguments(int argc, char* argv[]) override; + virtual int DoParseArguments(int argc, const char* argv[]) override; uint32_t 
GetCtbAlignedPicSizeInSamples(uint32_t& picWidthInCtbsY, uint32_t& picHeightInCtbsY, bool minCtbsY = false); From fb17b8e25612c0f33fd0a8dc31d0791cd81d66c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Cerveau?= Date: Mon, 15 Dec 2025 17:36:50 +0100 Subject: [PATCH 10/14] VulkanFilterYuvCompute: fix shadowed struct declarations Use explicit struct names to avoid local PushConstants struct definitions from shadowing the class member PushConstants. --- .../VkCodecUtils/VulkanFilterYuvCompute.cpp | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/common/libs/VkCodecUtils/VulkanFilterYuvCompute.cpp b/common/libs/VkCodecUtils/VulkanFilterYuvCompute.cpp index 597f5d7c..64582775 100644 --- a/common/libs/VkCodecUtils/VulkanFilterYuvCompute.cpp +++ b/common/libs/VkCodecUtils/VulkanFilterYuvCompute.cpp @@ -2474,7 +2474,7 @@ VkResult VulkanFilterYuvCompute::RecordCommandBuffer(VkCommandBuffer cmdBuf, ivec2(int32_t width_, int32_t height_) : width(width_), height(height_) {} }; - struct PushConstants { + struct ImagePushConstants { uint32_t srcLayer; uint32_t dstLayer; ivec2 inputSize; @@ -2487,7 +2487,7 @@ VkResult VulkanFilterYuvCompute::RecordCommandBuffer(VkCommandBuffer cmdBuf, uint32_t crPitch; // Cr plane pitch }; - PushConstants pushConstants = { + ImagePushConstants pushConstants = { inImageResourceInfo->baseArrayLayer, // Set the source layer index outImageResourceInfo->baseArrayLayer, // Set the destination layer index ivec2(inImageResourceInfo->codedExtent.width, inImageResourceInfo->codedExtent.height), @@ -2504,7 +2504,7 @@ VkResult VulkanFilterYuvCompute::RecordCommandBuffer(VkCommandBuffer cmdBuf, m_descriptorSetLayout.GetPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, 0, - sizeof(PushConstants), + sizeof(ImagePushConstants), &pushConstants); const uint32_t workgroupWidth = (pushConstants.outputSize.width + (m_workgroupSizeX - 1)) / m_workgroupSizeX; @@ -2625,7 +2625,7 @@ VkResult 
VulkanFilterYuvCompute::RecordCommandBuffer(VkCommandBuffer cmdBuf, ivec2(int32_t width_, int32_t height_) : width(width_), height(height_) {} }; - struct PushConstants { + struct BufferToImagePushConstants { uint32_t srcLayer; uint32_t dstLayer; ivec2 inputSize; @@ -2657,7 +2657,7 @@ VkResult VulkanFilterYuvCompute::RecordCommandBuffer(VkCommandBuffer cmdBuf, VkDeviceSize cbOffset = yOffset + planeSize; VkDeviceSize crOffset = cbOffset + (planeSize / 4); - PushConstants pushConstants = { + BufferToImagePushConstants pushConstants = { pBufferImageCopy->imageSubresource.baseArrayLayer, outImageResourceInfo->baseArrayLayer, ivec2(width, height), @@ -2674,7 +2674,7 @@ VkResult VulkanFilterYuvCompute::RecordCommandBuffer(VkCommandBuffer cmdBuf, m_descriptorSetLayout.GetPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, 0, - sizeof(PushConstants), + sizeof(BufferToImagePushConstants), &pushConstants); const uint32_t workgroupWidth = (pushConstants.outputSize.width + (m_workgroupSizeX - 1)) / m_workgroupSizeX; @@ -2792,7 +2792,7 @@ VkResult VulkanFilterYuvCompute::RecordCommandBuffer(VkCommandBuffer cmdBuf, ivec2(int32_t width_, int32_t height_) : width(width_), height(height_) {} }; - struct PushConstants { + struct ImageToBufferPushConstants { uint32_t srcLayer; uint32_t dstLayer; ivec2 inputSize; @@ -2828,7 +2828,7 @@ VkResult VulkanFilterYuvCompute::RecordCommandBuffer(VkCommandBuffer cmdBuf, VkDeviceSize cbOffset = yOffset + planeSize; VkDeviceSize crOffset = cbOffset + (planeSize / 4); - PushConstants pushConstants = { + ImageToBufferPushConstants pushConstants = { inImageResourceInfo->baseArrayLayer, 0, // Destination layer (buffer has no layers) ivec2(inputExtent.width, inputExtent.height), @@ -2845,7 +2845,7 @@ VkResult VulkanFilterYuvCompute::RecordCommandBuffer(VkCommandBuffer cmdBuf, m_descriptorSetLayout.GetPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, 0, - sizeof(PushConstants), + sizeof(ImageToBufferPushConstants), &pushConstants); const uint32_t 
workgroupWidth = (pushConstants.outputSize.width + (m_workgroupSizeX - 1)) / m_workgroupSizeX; @@ -2965,7 +2965,7 @@ VkResult VulkanFilterYuvCompute::RecordCommandBuffer(VkCommandBuffer cmdBuf, ivec2(int32_t width_, int32_t height_) : width(width_), height(height_) {} }; - struct PushConstants { + struct BufferToBufferPushConstants { uint32_t srcLayer; // src image layer to use uint32_t dstLayer; // dst image layer to use ivec2 inputSize; // input image or buffer extent @@ -2991,7 +2991,7 @@ VkResult VulkanFilterYuvCompute::RecordCommandBuffer(VkCommandBuffer cmdBuf, VkDeviceSize cbOffset = planeSize; VkDeviceSize crOffset = cbOffset + (planeSize / 4); - PushConstants pushConstants = { + BufferToBufferPushConstants pushConstants = { 0, // Source layer (buffer has no layers) 0, // Destination layer (buffer has no layers) ivec2(inBufferExtent.width, inBufferExtent.height), @@ -3008,7 +3008,7 @@ VkResult VulkanFilterYuvCompute::RecordCommandBuffer(VkCommandBuffer cmdBuf, m_descriptorSetLayout.GetPipelineLayout(), VK_SHADER_STAGE_COMPUTE_BIT, 0, - sizeof(PushConstants), + sizeof(BufferToBufferPushConstants), &pushConstants); const uint32_t workgroupWidth = (pushConstants.outputSize.width + (m_workgroupSizeX - 1)) / m_workgroupSizeX; From 8bcd876498bee2ff0a104670350fbfbd93d6fd57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Cerveau?= Date: Tue, 16 Dec 2025 17:10:01 +0100 Subject: [PATCH 11/14] VulkanFilterYuvCompute: fix unused variable numPlanes numPlanes will be used only in Debug as assert are disabled in release. 
This is a fixup of "encode: remove the CPU input conversion function" --- common/libs/VkCodecUtils/VulkanFilterYuvCompute.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/libs/VkCodecUtils/VulkanFilterYuvCompute.cpp b/common/libs/VkCodecUtils/VulkanFilterYuvCompute.cpp index 64582775..dcea5b41 100644 --- a/common/libs/VkCodecUtils/VulkanFilterYuvCompute.cpp +++ b/common/libs/VkCodecUtils/VulkanFilterYuvCompute.cpp @@ -2327,7 +2327,7 @@ uint32_t VulkanFilterYuvCompute::UpdateImageDescriptorSets( validImageAspects &= validAspects; uint32_t curImageAspect = 0; - const uint32_t numPlanes = imageView->GetNumberOfPlanes(); + [[maybe_unused]] const uint32_t numPlanes = imageView->GetNumberOfPlanes(); while(validImageAspects) { if (validImageAspects & (VK_IMAGE_ASPECT_COLOR_BIT << curImageAspect) ) { From 18d19ae736c18746ae66471c7fc767c2785a8fd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Cerveau?= Date: Fri, 20 Feb 2026 16:04:14 +0100 Subject: [PATCH 12/14] common: Code compilation fixes(khr fixes) This fix allows to pass 10 bits decode tests --- common/libs/VkCodecUtils/VkVideoFrameToFile.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/libs/VkCodecUtils/VkVideoFrameToFile.cpp b/common/libs/VkCodecUtils/VkVideoFrameToFile.cpp index 1ca1e2f6..6285d2aa 100644 --- a/common/libs/VkCodecUtils/VkVideoFrameToFile.cpp +++ b/common/libs/VkCodecUtils/VkVideoFrameToFile.cpp @@ -430,7 +430,7 @@ class VkVideoFrameToFileImpl : public VkVideoFrameOutput { assert(layouts[plane].rowPitch <= SIZE_MAX); assert(yuvPlaneLayouts[plane].rowPitch <= SIZE_MAX); CopyPlaneData(pSrc, pDst, static_cast(layouts[plane].rowPitch), static_cast(yuvPlaneLayouts[plane].rowPitch), - frameWidth, imageHeight); + frameWidth, imageHeight, 1, bitShift); } } @@ -457,7 +457,7 @@ class VkVideoFrameToFileImpl : public VkVideoFrameOutput { assert(layouts[srcPlane].rowPitch <= SIZE_MAX); assert(yuvPlaneLayouts[plane].rowPitch <= SIZE_MAX); 
CopyPlaneData(pSrc, pDst, static_cast(layouts[srcPlane].rowPitch), static_cast(yuvPlaneLayouts[plane].rowPitch), - planeWidth, 1, 2); + planeWidth, 1, 2, bitShift); } pDst += yuvPlaneLayouts[plane].rowPitch; } From c8ac37ae244b298c1748680e920e04b6a8d5aaed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Cerveau?= Date: Thu, 19 Feb 2026 17:53:19 +0100 Subject: [PATCH 13/14] encoder: restore CPU fallback and support 3-plane image copy When SHADERC is not enabled, the GPU compute filter is compiled out and m_inputComputeFilter stays nullptr. Restore the CPU I420-to-NV12 and I444-to-P444 conversion path from YCbCrConvUtilsCpu as a fallback, and configure the linear staging image pool with m_imageInFormat (hardware 2-plane format) instead of the raw input format. Additionally, extend CopyLinearToOptimalImage to handle 3-plane formats dynamically instead of asserting that only 2-plane formats are supported. --- .../libs/VkVideoEncoder/VkVideoEncoder.cpp | 131 ++++++++++++++++-- 1 file changed, 117 insertions(+), 14 deletions(-) diff --git a/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.cpp b/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.cpp index 536ad489..84d96deb 100644 --- a/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.cpp +++ b/vk_video_encoder/libs/VkVideoEncoder/VkVideoEncoder.cpp @@ -167,16 +167,94 @@ VkResult VkVideoEncoder::LoadNextFrame(VkSharedBaseObj& // NOTE: Get image layout const VkSubresourceLayout* dstSubresourceLayout = dstImageResource->GetSubresourceLayout(); - // Direct plane copy - no color space conversion needed - CopyYCbCrPlanesDirectCPU( - pInputFrameData, // Source buffer - m_encoderConfig->input.planeLayouts, // Source layouts - writeImagePtr, // Destination buffer - dstSubresourceLayout, // Destination layouts - std::min(m_encoderConfig->encodeWidth, m_encoderConfig->input.width), // Width - std::min(m_encoderConfig->encodeHeight, m_encoderConfig->input.height), // Height - m_encoderConfig->input.numPlanes, // 
Number of planes - m_encoderConfig->input.vkFormat); // Format for subsampling detection + const uint32_t width = std::min(m_encoderConfig->encodeWidth, m_encoderConfig->input.width); + const uint32_t height = std::min(m_encoderConfig->encodeHeight, m_encoderConfig->input.height); + + if (m_inputComputeFilter != nullptr) { + // Compute filter available: direct plane copy, GPU filter handles conversion + CopyYCbCrPlanesDirectCPU( + pInputFrameData, // Source buffer + m_encoderConfig->input.planeLayouts, // Source layouts + writeImagePtr, // Destination buffer + dstSubresourceLayout, // Destination layouts + width, height, + m_encoderConfig->input.numPlanes, // Number of planes + m_encoderConfig->input.vkFormat); // Format for subsampling detection + } else { + // No compute filter: CPU conversion from 3-plane to 2-plane format + int yCbCrConvResult = 0; + if (m_encoderConfig->input.bpp == 8) { + if (m_encoderConfig->encodeChromaSubsampling == VK_VIDEO_CHROMA_SUBSAMPLING_444_BIT_KHR) { + yCbCrConvResult = YCbCrConvUtilsCpu::I444ToP444( + pInputFrameData + m_encoderConfig->input.planeLayouts[0].offset, + (int)m_encoderConfig->input.planeLayouts[0].rowPitch, + pInputFrameData + m_encoderConfig->input.planeLayouts[1].offset, + (int)m_encoderConfig->input.planeLayouts[1].rowPitch, + pInputFrameData + m_encoderConfig->input.planeLayouts[2].offset, + (int)m_encoderConfig->input.planeLayouts[2].rowPitch, + writeImagePtr + dstSubresourceLayout[0].offset, + (int)dstSubresourceLayout[0].rowPitch, + writeImagePtr + dstSubresourceLayout[1].offset, + (int)dstSubresourceLayout[1].rowPitch, + width, height); + } else { + yCbCrConvResult = YCbCrConvUtilsCpu::I420ToNV12( + pInputFrameData + m_encoderConfig->input.planeLayouts[0].offset, + (int)m_encoderConfig->input.planeLayouts[0].rowPitch, + pInputFrameData + m_encoderConfig->input.planeLayouts[1].offset, + (int)m_encoderConfig->input.planeLayouts[1].rowPitch, + pInputFrameData + m_encoderConfig->input.planeLayouts[2].offset, + 
(int)m_encoderConfig->input.planeLayouts[2].rowPitch, + writeImagePtr + dstSubresourceLayout[0].offset, + (int)dstSubresourceLayout[0].rowPitch, + writeImagePtr + dstSubresourceLayout[1].offset, + (int)dstSubresourceLayout[1].rowPitch, + width, height); + } + } else if (m_encoderConfig->input.bpp == 10 || m_encoderConfig->input.bpp == 12) { + int shiftBits = 0; + if (m_encoderConfig->input.msbShift >= 0) { + shiftBits = m_encoderConfig->input.msbShift; + } else { + shiftBits = 16 - m_encoderConfig->input.bpp; + } + + if (m_encoderConfig->encodeChromaSubsampling == VK_VIDEO_CHROMA_SUBSAMPLING_444_BIT_KHR) { + yCbCrConvResult = YCbCrConvUtilsCpu::I444ToP444( + (const uint16_t*)(pInputFrameData + m_encoderConfig->input.planeLayouts[0].offset), + (int)m_encoderConfig->input.planeLayouts[0].rowPitch, + (const uint16_t*)(pInputFrameData + m_encoderConfig->input.planeLayouts[1].offset), + (int)m_encoderConfig->input.planeLayouts[1].rowPitch, + (const uint16_t*)(pInputFrameData + m_encoderConfig->input.planeLayouts[2].offset), + (int)m_encoderConfig->input.planeLayouts[2].rowPitch, + (uint16_t*)(writeImagePtr + dstSubresourceLayout[0].offset), + (int)dstSubresourceLayout[0].rowPitch, + (uint16_t*)(writeImagePtr + dstSubresourceLayout[1].offset), + (int)dstSubresourceLayout[1].rowPitch, + width, height, shiftBits); + } else { + yCbCrConvResult = YCbCrConvUtilsCpu::I420ToNV12( + (const uint16_t*)(pInputFrameData + m_encoderConfig->input.planeLayouts[0].offset), + (int)m_encoderConfig->input.planeLayouts[0].rowPitch, + (const uint16_t*)(pInputFrameData + m_encoderConfig->input.planeLayouts[1].offset), + (int)m_encoderConfig->input.planeLayouts[1].rowPitch, + (const uint16_t*)(pInputFrameData + m_encoderConfig->input.planeLayouts[2].offset), + (int)m_encoderConfig->input.planeLayouts[2].rowPitch, + (uint16_t*)(writeImagePtr + dstSubresourceLayout[0].offset), + (int)dstSubresourceLayout[0].rowPitch, + (uint16_t*)(writeImagePtr + dstSubresourceLayout[1].offset), + 
(int)dstSubresourceLayout[1].rowPitch, + width, height, shiftBits); + } + } else { + assert(!"Requested bit-depth is not supported!"); + return VK_ERROR_INITIALIZATION_FAILED; + } + + if (yCbCrConvResult != 0) { + return VK_ERROR_INITIALIZATION_FAILED; + } + } // Now stage the input frame for the encoder video input return StageInputFrame(encodeFrameInfo); @@ -998,9 +1076,21 @@ VkResult VkVideoEncoder::InitEncoder(VkSharedBaseObj& encoderConf std::max(m_maxCodedExtent.height, encoderConfig->input.height) }; + // When compute filter is available, the linear image stores raw input format + // and the filter handles conversion. Without it, the linear image must match + // the encode source format since CopyLinearToOptimalImage does no conversion. + const VkFormat linearImageFormat = +#ifdef SHADERC_SUPPORT + encoderConfig->enablePreprocessComputeFilter + ? encoderConfig->input.vkFormat + : m_imageInFormat; +#else + m_imageInFormat; +#endif + result = m_linearInputImagePool->Configure( m_vkDevCtx, encoderConfig->numInputImages, - encoderConfig->input.vkFormat, + linearImageFormat, linearInputImageExtent, ( VK_IMAGE_USAGE_SAMPLED_BIT | VK_IMAGE_USAGE_STORAGE_BIT | @@ -1497,8 +1587,9 @@ VkResult VkVideoEncoder::CopyLinearToOptimalImage(VkCommandBuffer& commandBuffer // Bind memory for the image. const VkMpFormatInfo* mpInfo = YcbcrVkFormatInfo(format); - // Currently formats that have more than 2 output planes are not supported. 444 formats have a shared CbCr planes in all current tests - assert((mpInfo->vkPlaneFormat[2] == VK_FORMAT_UNDEFINED) && (mpInfo->vkPlaneFormat[3] == VK_FORMAT_UNDEFINED)); + // Determine number of planes: 1 (base) + numberOfExtraPlanes + const uint32_t numPlanes = 1 + mpInfo->planesLayout.numberOfExtraPlanes; + assert(numPlanes >= 1 && numPlanes <= 3); // Copy src buffer to image. 
VkImageCopy copyRegion[3]{}; @@ -1533,9 +1624,21 @@ VkResult VkVideoEncoder::CopyLinearToOptimalImage(VkCommandBuffer& commandBuffer copyRegion[1].dstSubresource.baseArrayLayer = dstCopyArrayLayer; copyRegion[1].dstSubresource.layerCount = 1; + if (numPlanes > 2) { + copyRegion[2].extent = copyRegion[1].extent; + copyRegion[2].srcSubresource.aspectMask = VK_IMAGE_ASPECT_PLANE_2_BIT; + copyRegion[2].srcSubresource.mipLevel = 0; + copyRegion[2].srcSubresource.baseArrayLayer = srcCopyArrayLayer; + copyRegion[2].srcSubresource.layerCount = 1; + copyRegion[2].dstSubresource.aspectMask = VK_IMAGE_ASPECT_PLANE_2_BIT; + copyRegion[2].dstSubresource.mipLevel = 0; + copyRegion[2].dstSubresource.baseArrayLayer = dstCopyArrayLayer; + copyRegion[2].dstSubresource.layerCount = 1; + } + m_vkDevCtx->CmdCopyImage(commandBuffer, srcImageResource->GetImage(), srcImageLayout, dstImageResource->GetImage(), dstImageLayout, - (uint32_t)2, copyRegion); + numPlanes, copyRegion); { VkMemoryBarrier memoryBarrier = {VK_STRUCTURE_TYPE_MEMORY_BARRIER}; From 3a6646720697c7b2d8e7fdbfccaeba76aca1d2a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Cerveau?= Date: Fri, 20 Feb 2026 16:09:47 +0100 Subject: [PATCH 14/14] CMake: enable USE_ENCODER_SHADERC by default and skip search when OFF USE_ENCODER_SHADERC was never declared as an option, so it defaulted to OFF unless explicitly passed on the command line. Declare it as an option with ON as the default so the GPU compute filter path is always built. When USE_ENCODER_SHADERC=OFF and encoder only, skip FindShaderc.cmake entirely to avoid unnecessary dependency searches. 
--- CMakeLists.txt | 5 ++++- vk_video_encoder/demos/vk-video-enc/CMakeLists.txt | 2 +- vk_video_encoder/libs/CMakeLists.txt | 6 +++--- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 50a16a32..b2ef28f5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,7 +54,10 @@ set (VULKAN_SDK_MIN_MINOR_VERSION 4) set (VULKAN_SDK_MIN_PATCH_VERSION 321) FIND_VULKAN_SDK(${VULKAN_SDK_MIN_MAJOR_VERSION} ${VULKAN_SDK_MIN_MINOR_VERSION} ${VULKAN_SDK_MIN_PATCH_VERSION}) -include(FindShaderc) +option(USE_ENCODER_SHADERC "Enable shaderc GPU compute filters for encoder (e.g. YUV conversion). Only affects the encoder build; the decoder always uses shaderc." ON) +if(BUILD_DECODER OR USE_ENCODER_SHADERC) + include(FindShaderc) +endif() ############ VULKAN_FFMPEG_LIB_PATH ###################################### if (DEFINED ENV{VULKAN_FFMPEG_LIB_DIR_PATH}) diff --git a/vk_video_encoder/demos/vk-video-enc/CMakeLists.txt b/vk_video_encoder/demos/vk-video-enc/CMakeLists.txt index 33dfbc3e..b043412b 100644 --- a/vk_video_encoder/demos/vk-video-enc/CMakeLists.txt +++ b/vk_video_encoder/demos/vk-video-enc/CMakeLists.txt @@ -119,7 +119,7 @@ if(TARGET vulkan) list(APPEND definitions PRIVATE -DUNINSTALLED_LOADER="$") endif() -if(USE_SHADERC) +if(USE_ENCODER_SHADERC) list(APPEND definitions PRIVATE -DSHADERC_SUPPORT) endif() diff --git a/vk_video_encoder/libs/CMakeLists.txt b/vk_video_encoder/libs/CMakeLists.txt index 5cca8809..66685d33 100644 --- a/vk_video_encoder/libs/CMakeLists.txt +++ b/vk_video_encoder/libs/CMakeLists.txt @@ -88,7 +88,7 @@ set(LIBVKVIDEOENCODER_DEFINITIONS PRIVATE VK_VIDEO_ENCODER_IMPLEMENTATION PUBLIC VK_VIDEO_ENCODER_SHAREDLIB) -if(USE_SHADERC) +if(USE_ENCODER_SHADERC) list(APPEND LIBVKVIDEOENCODER_SRC ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/VulkanShaderCompiler.cpp ${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}/VkCodecUtils/VulkanShaderCompiler.h @@ -108,7 +108,7 @@ include_directories(BEFORE 
${VK_VIDEO_COMMON_LIBS_SOURCE_ROOT}) set(LIBVKVIDEOENCODER_DEPENDENCIES GenerateDispatchTables ${VULKAN_VIDEO_PARSER_LIB}) add_library(${VULKAN_VIDEO_ENCODER_LIB} SHARED ${LIBVKVIDEOENCODER_SRC}) -if(USE_SHADERC) +if(USE_ENCODER_SHADERC) # Link the libraries target_link_libraries(${VULKAN_VIDEO_ENCODER_LIB} PUBLIC ${SHADERC_SHARED_LIBRARY}) # Ensure the library depends on the generation of these files @@ -137,7 +137,7 @@ if(WIN32) endif() add_library(${VULKAN_VIDEO_ENCODER_STATIC_LIB} STATIC ${LIBVKVIDEOENCODER_SRC}) -if(USE_SHADERC) +if(USE_ENCODER_SHADERC) # Link the libraries target_link_libraries(${VULKAN_VIDEO_ENCODER_STATIC_LIB} PUBLIC ${SHADERC_SHARED_LIBRARY}) endif()