From 096e09d83f113767ec3f85bb03a1f7d24c612e64 Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Sat, 20 Jul 2024 00:33:56 +0530 Subject: [PATCH 01/56] Add luma_meter and tonemapper --- .../nbl/builtin/hlsl/luma_meter/luma_meter.hlsl | 16 ++++++++++++++++ .../nbl/builtin/hlsl/tonemapper/operators.hlsl | 16 ++++++++++++++++ src/nbl/builtin/CMakeLists.txt | 4 ++++ 3 files changed, 36 insertions(+) create mode 100644 include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl create mode 100644 include/nbl/builtin/hlsl/tonemapper/operators.hlsl diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl new file mode 100644 index 0000000000..4e18655852 --- /dev/null +++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl @@ -0,0 +1,16 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_ +#define _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_ + +namespace nbl +{ +namespace hls +{ + +} +} + +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl new file mode 100644 index 0000000000..5ebb5b2ffa --- /dev/null +++ b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl @@ -0,0 +1,16 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_INCLUDED_ +#define _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_INCLUDED_ + +namespace nbl +{ +namespace hls +{ + +} +} + +#endif \ No newline at end of file diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index 8f797b9454..9dd9ddfd42 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -34,6 +34,10 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/barycentric/utils.glsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/__ref.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/__ptr.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/bda_accessor.hlsl") +# luma metering +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/luma_meter.hlsl") +# tonemapper +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tonemapper/operators.hlsl") # bump mapping LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bump_mapping/fragment.glsl") # TODO: rename to `frag.glsl` LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bump_mapping/utils.glsl") From 4fd700fe69709ec127f7f42ec09b4f7f4ce0260c Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Sat, 20 Jul 2024 00:34:17 +0530 Subject: [PATCH 02/56] Update submodule pointer --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index c6d5ee3498..87d4794dcc 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit c6d5ee349859ce0b5229bc62a2372fa1d4b6b17c +Subproject commit 87d4794dcc5de8264528292c4a30b5284979754a From 52e7ab24dedb16f6c94855d6f0037e7ea77fba81 Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Thu, 1 Aug 2024 21:20:52 +0530 Subject: [PATCH 03/56] Convert morton.h to hlsl --- include/nbl/asset/utils/IMeshPacker.h | 2 +- include/nbl/asset/utils/IVirtualTexture.h | 3 +- include/nbl/builtin/hlsl/math/morton.hlsl | 283 ++++++++++++++++++++++ src/nbl/builtin/CMakeLists.txt | 2 + 4 files changed, 288 insertions(+), 2 deletions(-) create mode 100644 include/nbl/builtin/hlsl/math/morton.hlsl diff --git a/include/nbl/asset/utils/IMeshPacker.h b/include/nbl/asset/utils/IMeshPacker.h index 3f09062b18..355d792782 100644 --- a/include/nbl/asset/utils/IMeshPacker.h +++ b/include/nbl/asset/utils/IMeshPacker.h @@ -6,7 +6,7 @@ #define __NBL_ASSET_I_MESH_PACKER_H_INCLUDED__ #include "nbl/asset/utils/IMeshManipulator.h" -#include "nbl/core/math/morton.h" +#include "nbl/builtin/hlsl/math/morton.hlsl" namespace nbl { diff --git a/include/nbl/asset/utils/IVirtualTexture.h b/include/nbl/asset/utils/IVirtualTexture.h index ec26f56103..64ea49cbe7 100644 --- a/include/nbl/asset/utils/IVirtualTexture.h +++ b/include/nbl/asset/utils/IVirtualTexture.h @@ -7,7 +7,6 @@ #include -#include "nbl/core/math/morton.h" #include "nbl/core/memory/memory.h" #include "nbl/core/alloc/GeneralpurposeAddressAllocator.h" #include "nbl/core/alloc/PoolAddressAllocator.h" @@ -19,6 +18,8 @@ #include "nbl/asset/filters/CPaddedCopyImageFilter.h" #include "nbl/asset/filters/CFillImageFilter.h" +#include "nbl/builtin/hlsl/math/morton.hlsl" + namespace nbl::asset { diff --git a/include/nbl/builtin/hlsl/math/morton.hlsl b/include/nbl/builtin/hlsl/math/morton.hlsl new file mode 100644 index 0000000000..64b0b66cb7 --- /dev/null +++ b/include/nbl/builtin/hlsl/math/morton.hlsl @@ -0,0 +1,283 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_BUILTIN_HLSL_MORTON_INCLUDED_ +#define _NBL_BUILTIN_HLSL_MORTON_INCLUDED_ + +#ifdef __HLSL_VERSION +#include "nbl/builtin/hlsl/cpp_compat.hlsl" +#else +#include +#endif + +namespace nbl +{ +namespace core +{ + +namespace impl +{ + +#ifdef __HLSL_VERSION +template +T morton2d_mask(uint16_t _n) const +{ + const static uint64_t mask[5] = + { + 0x5555555555555555ull, + 0x3333333333333333ull, + 0x0F0F0F0F0F0F0F0Full, + 0x00FF00FF00FF00FFull, + 0x0000FFFF0000FFFFull + }; + return static_cast(mask[_n]); +} + +template +T morton3d_mask(uint16_t _n) const +{ + const static uint64_t mask[5] = + { + 0x1249249249249249ull, + 0x10C30C30C30C30C3ull, + 0x010F00F00F00F00Full, + 0x001F0000FF0000FFull, + 0x001F00000000FFFFull + }; + return static_cast(mask[_n]); +} +template +T morton4d_mask(uint16_t _n) const +{ + const static uint64_t mask[4] = + { + 0x1111111111111111ull, + 0x0303030303030303ull, + 0x000F000F000F000Full, + 0x000000FF000000FFull + }; + return static_cast(mask[_n]); +} + +template +inline T morton2d_decode(T x) +{ + x = x & morton2d_mask(0); + x = (x | (x >> 1)) & morton2d_mask(1); + x = (x | (x >> 2)) & morton2d_mask(2); + if (bitDepth > 8u) + { + x = (x | (x >> 4)) & morton2d_mask(3); + } + if (bitDepth > 16u) + { + x = (x | (x >> 8)) & morton2d_mask(4); + } + if (bitDepth > 32u) + { + x = (x | (x >> 16)); + } + return x; +} + +//! Puts bits on even positions filling gaps with 0s +template +inline T separate_bits_2d(T x) +{ + if (bitDepth > 32u) + { + x = (x | (x << 16)) & morton2d_mask(4); + } + if (bitDepth > 16u) + { + x = (x | (x << 8)) & morton2d_mask(3); + } + if (bitDepth > 8u) + { + x = (x | (x << 4)) & morton2d_mask(2); + } + x = (x | (x << 2)) & morton2d_mask(1); + x = (x | (x << 1)) & morton2d_mask(0); + + return x; +} +template +inline T separate_bits_3d(T x) +{ + if (bitDepth > 32u) + { + x = (x | (x << 32)) & morton3d_mask(4); + } + if (bitDepth > 16u) + { + x = (x | (x << 16)) & morton3d_mask(3); + } + if (bitDepth > 8u) + { + x = (x | (x << 8)) & morton3d_mask(2); + } + x = (x | (x << 4)) & morton3d_mask(1); + x = (x | (x << 2)) & morton3d_mask(0); + + return x; +} +template +inline T separate_bits_4d(T x) +{ + if (bitDepth > 32u) + { + x = (x | (x << 24)) & morton4d_mask(3); + } + if (bitDepth > 16u) + { + x = (x | (x << 12)) & morton4d_mask(2); + } + if (bitDepth > 8u) + { + x = (x | (x << 6)) & morton4d_mask(1); + } + x = (x | (x << 3)) & morton4d_mask(0); + + return x; +} +#else +template +constexpr T morton2d_mask(uint8_t _n) +{ + constexpr uint64_t mask[5] = + { + 0x5555555555555555ull, + 0x3333333333333333ull, + 0x0F0F0F0F0F0F0F0Full, + 0x00FF00FF00FF00FFull, + 0x0000FFFF0000FFFFull + }; + return static_cast(mask[_n]); +} +template +constexpr T morton3d_mask(uint8_t _n) +{ + constexpr uint64_t mask[5] = + { + 0x1249249249249249ull, + 0x10C30C30C30C30C3ull, + 0x010F00F00F00F00Full, + 0x001F0000FF0000FFull, + 0x001F00000000FFFFull + }; + return static_cast(mask[_n]); +} +template +constexpr T morton4d_mask(uint8_t _n) +{ + constexpr uint64_t mask[4] = + { + 0x1111111111111111ull, + 0x0303030303030303ull, + 0x000F000F000F000Full, + 0x000000FF000000FFull + }; + return static_cast(mask[_n]); +} + +template +inline T morton2d_decode(T x) +{ + x = x & morton2d_mask(0); + x = (x | (x >> 1)) & morton2d_mask(1); + x = (x | (x >> 2)) & morton2d_mask(2); + if constexpr (bitDepth > 8u) + { + x = (x | (x >> 4)) & morton2d_mask(3); + } + if constexpr (bitDepth > 16u) + { + x = (x | (x >> 8)) & morton2d_mask(4); + } + if constexpr (bitDepth > 32u) + { + x = (x | (x >> 16)); + } + return x; +} + +//! Puts bits on even positions filling gaps with 0s +template +inline T separate_bits_2d(T x) +{ + if constexpr (bitDepth > 32u) + { + x = (x | (x << 16)) & morton2d_mask(4); + } + if constexpr (bitDepth > 16u) + { + x = (x | (x << 8)) & morton2d_mask(3); + } + if constexpr (bitDepth > 8u) + { + x = (x | (x << 4)) & morton2d_mask(2); + } + x = (x | (x << 2)) & morton2d_mask(1); + x = (x | (x << 1)) & morton2d_mask(0); + + return x; +} +template +inline T separate_bits_3d(T x) +{ + if constexpr (bitDepth > 32u) + { + x = (x | (x << 32)) & morton3d_mask(4); + } + if constexpr (bitDepth > 16u) + { + x = (x | (x << 16)) & morton3d_mask(3); + } + if constexpr (bitDepth > 8u) + { + x = (x | (x << 8)) & morton3d_mask(2); + } + x = (x | (x << 4)) & morton3d_mask(1); + x = (x | (x << 2)) & morton3d_mask(0); + + return x; +} +template +inline T separate_bits_4d(T x) +{ + if constexpr (bitDepth > 32u) + { + x = (x | (x << 24)) & morton4d_mask(3); + } + if constexpr (bitDepth > 16u) + { + x = (x | (x << 12)) & morton4d_mask(2); + } + if constexpr (bitDepth > 8u) + { + x = (x | (x << 6)) & morton4d_mask(1); + } + x = (x | (x << 3)) & morton4d_mask(0); + + return x; +} +#endif +} + +template +T morton2d_decode_x(T _morton) { return impl::morton2d_decode(_morton); } +template +T morton2d_decode_y(T _morton) { return impl::morton2d_decode(_morton >> 1); } + +template +T morton2d_encode(T x, T y) { return impl::separate_bits_2d(x) | (impl::separate_bits_2d(y) << 1); } +template +T morton3d_encode(T x, T y, T z) { return impl::separate_bits_3d(x) | (impl::separate_bits_3d(y) << 1) | (impl::separate_bits_3d(z) << 2); } +template +T morton4d_encode(T x, T y, T z, T w) { return impl::separate_bits_4d(x) | (impl::separate_bits_4d(y) << 1) | (impl::separate_bits_4d(z) << 2) | (impl::separate_bits_4d(w) << 3); } + +} +} + +#endif \ No newline at end of file diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index 8a7775c7a5..df61293d4a 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -281,6 +281,8 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/equations/quartic.hlsl") #extra math LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/quadrature/gauss_legendre/impl.hlsl") +#morton +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/morton.hlsl") #acceleration structures LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/acceleration_structures.hlsl") #colorspace From 1cc26bdcd583bbbc354c8c5e951f06e6cb1d3f28 Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Fri, 2 Aug 2024 19:00:47 +0530 Subject: [PATCH 04/56] Fix HLSL morton code --- include/nbl/builtin/hlsl/math/morton.hlsl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/nbl/builtin/hlsl/math/morton.hlsl b/include/nbl/builtin/hlsl/math/morton.hlsl index 64b0b66cb7..4150af637a 100644 --- a/include/nbl/builtin/hlsl/math/morton.hlsl +++ b/include/nbl/builtin/hlsl/math/morton.hlsl @@ -21,7 +21,7 @@ namespace impl #ifdef __HLSL_VERSION template -T morton2d_mask(uint16_t _n) const +T morton2d_mask(uint16_t _n) { const static uint64_t mask[5] = { @@ -31,11 +31,11 @@ T morton2d_mask(uint16_t _n) const 0x00FF00FF00FF00FFull, 0x0000FFFF0000FFFFull }; - return static_cast(mask[_n]); + return mask[_n]; } template -T morton3d_mask(uint16_t _n) const +T morton3d_mask(uint16_t _n) { const static uint64_t mask[5] = { @@ -45,10 +45,10 @@ T morton3d_mask(uint16_t _n) const 0x001F0000FF0000FFull, 0x001F00000000FFFFull }; - return static_cast(mask[_n]); + return mask[_n]; } template -T morton4d_mask(uint16_t _n) const +T morton4d_mask(uint16_t _n) { const static uint64_t mask[4] = { @@ -57,7 +57,7 @@ T morton4d_mask(uint16_t _n) const 0x000F000F000F000Full, 0x000000FF000000FFull }; - return static_cast(mask[_n]); + return mask[_n]; } template From 6922d0c41b509a125be89d86627ba206d565b053 Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Mon, 5 Aug 2024 19:02:04 +0530 Subject: [PATCH 05/56] Create geom_luma_meter and computeLuma --- .../builtin/hlsl/luma_meter/luma_meter.hlsl | 47 ++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl index 4e18655852..d2c33602c8 100644 --- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl @@ -5,11 +5,56 @@ #ifndef _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_ #define _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_ +#include "nbl/builtin/hlsl/cpp_compat.hlsl" +#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" +#include "nbl/builtin/hlsl/type_traits.hlsl" +#include "nbl/builtin/hlsl/math/morton.hlsl" +#include "nbl/builtin/hlsl/colorspace/EOTF.hlsl" +#include "nbl/builtin/hlsl/colorspace/encodeCIEXYZ.hlsl" + namespace nbl { -namespace hls +namespace hlsl +{ +namespace luma_meter +{ + +struct LumaMeteringWindow { + float32_t2 meteringWindowScale; + float32_t2 meteringWindowOffset; +}; + +template +struct geom_luma_meter { + using this_t = geom_luma_meter; + + static this_t create(NBL_REF_ARG(LumaMeteringWindow) window) + { + this_t retval; + retval.window = window; + return retval; + } + float32_t computeLuma(NBL_REF_ARG(TexAccessor) tex, uint32_t2 sampleCount, uint32_t2 sampleIndex, float32_t2 viewportSize) + { + float32_t2 stride = window.meteringWindowScale / (sampleCount + float32_t2(1.0f, 1.0f)); + float32_t2 samplePos = stride * sampleIndex; + float32_t2 uvPos = (samplePos + float32_t2(0.5f, 0.5f)) / viewportSize; + float32_t3 color = colorspace::eotf::sRGB(tex.get(uvPos)); + float32_t luma = dot(colorspace::sRGBtoXYZ[1], color); + + const float32_t minLuma = 1.0 / 4096.0; + const float32_t maxLuma = 32768.0; + + luma = clamp(luma, minLuma, maxLuma); + + return log2(luma / minLuma) / log2(maxLuma / minLuma); + } + + LumaMeteringWindow window; +}; +} } } From 603a92f87a5831dc491ff4e4b53e99f5af9a57ce Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Wed, 7 Aug 2024 19:22:52 +0530 Subject: [PATCH 06/56] Add gatherLuma method --- include/nbl/asset/utils/IVirtualTexture.h | 4 +- .../builtin/hlsl/luma_meter/luma_meter.hlsl | 54 +++++++++++++++++-- include/nbl/builtin/hlsl/math/morton.hlsl | 2 +- 3 files changed, 54 insertions(+), 6 deletions(-) diff --git a/include/nbl/asset/utils/IVirtualTexture.h b/include/nbl/asset/utils/IVirtualTexture.h index 64ea49cbe7..b715c40cfc 100644 --- a/include/nbl/asset/utils/IVirtualTexture.h +++ b/include/nbl/asset/utils/IVirtualTexture.h @@ -922,7 +922,7 @@ class IVirtualTexture : public core::IReferenceCounted, public IVirtualTextureBa storage->incrTileCounter(neededPhysPages); return offsetToTextureData( - page_tab_offset_t(core::morton2d_decode_x(addr), core::morton2d_decode_y(addr), pgtLayer), + page_tab_offset_t(hlsl::morton2d_decode_x(addr), hlsl::morton2d_decode_y(addr), pgtLayer), extent, _subres.levelCount, _wrapu, @@ -934,7 +934,7 @@ class IVirtualTexture : public core::IReferenceCounted, public IVirtualTextureBa { uint32_t sz = computeSquareSz(_addr.origsize_x, _addr.origsize_y); sz *= sz; - const uint32_t addr = core::morton2d_encode(_addr.pgTab_x, _addr.pgTab_y); + const uint32_t addr = hlsl::morton2d_encode(_addr.pgTab_x, _addr.pgTab_y); core::address_allocator_traits::multi_free_addr(m_pageTableLayerAllocators[_addr.pgTab_layer], 1u, &addr, &sz); diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl index d2c33602c8..7ed9604c4f 100644 --- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl @@ -7,6 +7,9 @@ #include "nbl/builtin/hlsl/cpp_compat.hlsl" #include "nbl/builtin/hlsl/glsl_compat/core.hlsl" +#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" +#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" #include "nbl/builtin/hlsl/type_traits.hlsl" #include "nbl/builtin/hlsl/math/morton.hlsl" #include "nbl/builtin/hlsl/colorspace/EOTF.hlsl" @@ -25,9 +28,9 @@ struct LumaMeteringWindow float32_t2 meteringWindowOffset; }; -template +template struct geom_luma_meter { - using this_t = geom_luma_meter; + using this_t = geom_luma_meter; static this_t create(NBL_REF_ARG(LumaMeteringWindow) window) { @@ -36,7 +39,18 @@ struct geom_luma_meter { return retval; } - float32_t computeLuma(NBL_REF_ARG(TexAccessor) tex, uint32_t2 sampleCount, uint32_t2 sampleIndex, float32_t2 viewportSize) + float32_t reduction(float32_t value, NBL_REF_ARG(SharedAccessor) sdata) + { + return workgroup::reduction < plus < float32_t >, GroupSize >:: + template __call (value, sdata); + } + + float32_t computeLuma( + NBL_REF_ARG(TexAccessor) tex, + uint32_t2 sampleCount, + uint32_t2 sampleIndex, + float32_t2 viewportSize + ) { float32_t2 stride = window.meteringWindowScale / (sampleCount + float32_t2(1.0f, 1.0f)); float32_t2 samplePos = stride * sampleIndex; @@ -52,6 +66,40 @@ struct geom_luma_meter { return log2(luma / minLuma) / log2(maxLuma / minLuma); } + void gatherLuma( + NBL_REF_ARG(ValueAccessor) val, + NBL_REF_ARG(TexAccessor) tex, + NBL_REF_ARG(SharedAccessor) sdata, + uint32_t2 sampleCount, + float32_t2 viewportSize + ) { + uint32_t2 coord = { + morton2d_decode_x(glsl::gl_LocalInvocationIndex()), + morton2d_decode_y(glsl::gl_LocalInvocationIndex()) + }; + uint32_t tid = workgroup::SubgroupContiguousIndex(); + + uint32_t2 sampleIndex = coord * GroupSize + float32_t2(glsl::gl_SubgroupID() + 1, glsl::gl_SubgroupInvocationID() + 1); + float32_t luma = 0.0f; + + if (sampleIndex.x <= sampleCount.x && sampleIndex.y <= sampleCount.y) { + luma = computeLuma(tex, sampleCount, sampleIndex, viewportSize); + float32_t lumaSum = reduction(luma, sdata); + + sdata.workgroupExecutionAndMemoryBarrier(); + + if (tid == GroupSize - 1) { + uint32_t3 workGroupCount = glsl::gl_NumWorkGroups(); + uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); + uint32_t lumaSumBitPattern = uint32_t(clamp(lumaSum, 0.f, float((1 << fixedPointBitsLeft) - 1))); + uint32_t3 workgroupSize = glsl::gl_WorkGroupSize(); + uint32_t workgroupIndex = dot(uint32_t3(workgroupSize.y * workgroupSize.z, workgroupSize.z, 1), glsl::gl_WorkGroupID()); + + val.atomicAdd(workgroupIndex & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern); + } + } + } + LumaMeteringWindow window; }; } diff --git a/include/nbl/builtin/hlsl/math/morton.hlsl b/include/nbl/builtin/hlsl/math/morton.hlsl index 4150af637a..1f35016cb6 100644 --- a/include/nbl/builtin/hlsl/math/morton.hlsl +++ b/include/nbl/builtin/hlsl/math/morton.hlsl @@ -13,7 +13,7 @@ namespace nbl { -namespace core +namespace hlsl { namespace impl From 810a6ac1cc2ff6662dca36edd0413288b4f1b1ea Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Thu, 8 Aug 2024 16:29:20 +0530 Subject: [PATCH 07/56] Add getGatheredLuma() --- .../builtin/hlsl/luma_meter/luma_meter.hlsl | 25 ++++++++++++++----- 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl index 7ed9604c4f..21bd813439 100644 --- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl @@ -32,10 +32,12 @@ template; - static this_t create(NBL_REF_ARG(LumaMeteringWindow) window) + static this_t create(NBL_REF_ARG(LumaMeteringWindow) window, float32_t lumaMinimum, float32_t lumaMaximum) { this_t retval; retval.window = window; + retval.minLuma = lumaMinimum; + retval.maxLuma = lumaMaximum; return retval; } @@ -58,9 +60,6 @@ struct geom_luma_meter { float32_t3 color = colorspace::eotf::sRGB(tex.get(uvPos)); float32_t luma = dot(colorspace::sRGBtoXYZ[1], color); - const float32_t minLuma = 1.0 / 4096.0; - const float32_t maxLuma = 32768.0; - luma = clamp(luma, minLuma, maxLuma); return log2(luma / minLuma) / log2(maxLuma / minLuma); @@ -72,7 +71,8 @@ struct geom_luma_meter { NBL_REF_ARG(SharedAccessor) sdata, uint32_t2 sampleCount, float32_t2 viewportSize - ) { + ) + { uint32_t2 coord = { morton2d_decode_x(glsl::gl_LocalInvocationIndex()), morton2d_decode_y(glsl::gl_LocalInvocationIndex()) @@ -91,7 +91,9 @@ struct geom_luma_meter { if (tid == GroupSize - 1) { uint32_t3 workGroupCount = glsl::gl_NumWorkGroups(); uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); - uint32_t lumaSumBitPattern = uint32_t(clamp(lumaSum, 0.f, float((1 << fixedPointBitsLeft) - 1))); + + uint32_t lumaSumBitPattern = uint32_t(clamp((lumaSum - log2(minLuma)) * (log2(maxLuma) - log2(minLuma)), 0.f, float32_t((1 << fixedPointBitsLeft) - 1))); + uint32_t3 workgroupSize = glsl::gl_WorkGroupSize(); uint32_t workgroupIndex = dot(uint32_t3(workgroupSize.y * workgroupSize.z, workgroupSize.z, 1), glsl::gl_WorkGroupID()); @@ -100,7 +102,18 @@ struct geom_luma_meter { } } + float32_t getGatheredLuma( + NBL_REF_ARG(ValueAccessor) val, + uint32_t2 sampleCount + ) + { + uint32_t lumaSumBitPattern = val.get(glsl::gl_SubgroupInvocationID()); + float32_t lumaSumValue = float32_t(lumaSumBitPattern) / (log2(maxLuma) - log2(minLuma)) + log2(minLuma); + return glsl::subgroupAdd(lumaSumValue) / (sampleCount.x * sampleCount.y); + } + LumaMeteringWindow window; + float32_t minLuma, maxLuma; }; } } From 69a73c1d90a0702894ecead0de1455d459d8b2ca Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Thu, 8 Aug 2024 16:59:59 +0530 Subject: [PATCH 08/56] Add reinhard and aces hlsl operators --- .../builtin/hlsl/tonemapper/operators.hlsl | 59 ++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl index 5ebb5b2ffa..cc5728e9ff 100644 --- a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl +++ b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl @@ -5,10 +5,67 @@ #ifndef _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_INCLUDED_ #define _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_INCLUDED_ +#include "nbl/builtin/hlsl/cpp_compat.hlsl" + namespace nbl { -namespace hls +namespace hlsl +{ + +struct ReinhardParams +{ + float32_t keyAndManualLinearExposure; + float32_t rcpWhite2; +}; + +struct ACESParams +{ + float32_t gamma; // 1.0 + float32_t exposure; // actualExposure+midGrayLog2 +}; + + +float32_t3 reinhard(ReinhardParams params, float32_t3 rawCIEXYZcolor) +{ + float32_t exposureFactors = params.keyAndManualLinearExposure; + float32_t exposedLuma = rawCIEXYZcolor.y * exposureFactors; + float32_t colorMultiplier = (exposureFactors * (1.0 + exposedLuma * params.rcpWhite2) / (1.0 + exposedLuma)); + return rawCIEXYZcolor * colorMultiplier; +} + +float32_t3 aces(ACESParams params, float32_t3 rawCIEXYZcolor) { + float32_t3 tonemapped = rawCIEXYZcolor; + if (tonemapped.y > 1.175494351e-38) + tonemapped *= exp2(log2(tonemapped.y) * (params.gamma - 1.0) + (params.exposure) * params.gamma); + + // XYZ => RRT_SAT + // this seems to be a matrix for some hybrid colorspace, coefficients are somewhere inbetween BT2020 and ACEScc(t) + const float32_t3x3 XYZ_RRT_Input = float32_t3x3( + float32_t3(1.594168310, -0.262608051, -0.231993079), + float32_t3(-0.6332771780, 1.5840380200, 0.0164147373), + float32_t3(0.00892840419, 0.03648501260, 0.87711471300) + ); + + // this is obviously fitted to some particular simulated sensor/film and display + float32_t3 v = mul(XYZ_RRT_Input, tonemapped); + float32_t3 a = v * (v + float32_t3(0.0245786)) - float32_t3(0.000090537); + float32_t3 b = v * (v * float32_t(0.983729) + float32_t3(0.4329510)) + float32_t3(0.238081); + v = a / b; + + // ODT_SAT => XYZ + // this seems to be a matrix for some hybrid colorspace, coefficients are similar to AdobeRGB,BT2020 and ACEScc(t) + const float32_t3x3 ODT_XYZ_Output = float32_t3x3( + float32_t3(0.624798000, 0.164064825, 0.161605373), + float32_t3(0.268048108, 0.674283803, 0.057667464), + float32_t3(0.0157514643, 0.0526682511, 1.0204007600) + ); + return mul(ODT_XYZ_Output, v); +} + +// ideas for more operators https://web.archive.org/web/20191226154550/http://cs.columbia.edu/CAVE/software/softlib/dorf.php +// or get proper ACES RRT and ODTs +// https://partnerhelp.netflixstudios.com/hc/en-us/articles/360000622487-I-m-using-ACES-Which-Output-Transform-should-I-use- } } From 4c70cf5bb919abab9c82e36320de45be88fe02ee Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Tue, 13 Aug 2024 21:47:49 +0530 Subject: [PATCH 09/56] cast mask values to correct type --- include/nbl/builtin/hlsl/math/morton.hlsl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/nbl/builtin/hlsl/math/morton.hlsl b/include/nbl/builtin/hlsl/math/morton.hlsl index 1f35016cb6..1cd2105dc5 100644 --- a/include/nbl/builtin/hlsl/math/morton.hlsl +++ b/include/nbl/builtin/hlsl/math/morton.hlsl @@ -31,7 +31,7 @@ T morton2d_mask(uint16_t _n) 0x00FF00FF00FF00FFull, 0x0000FFFF0000FFFFull }; - return mask[_n]; + return (T)mask[_n]; } template @@ -45,7 +45,7 @@ T morton3d_mask(uint16_t _n) 0x001F0000FF0000FFull, 0x001F00000000FFFFull }; - return mask[_n]; + return (T)mask[_n]; } template T morton4d_mask(uint16_t _n) @@ -57,7 +57,7 @@ T morton4d_mask(uint16_t _n) 0x000F000F000F000Full, 0x000000FF000000FFull }; - return mask[_n]; + return (T)mask[_n]; } template From d9d6dd8c19a1c896ea03dce1182791bfb2e1834b Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Fri, 16 Aug 2024 16:35:19 +0530 Subject: [PATCH 10/56] Add create methods to tonemapper params --- .../builtin/hlsl/tonemapper/operators.hlsl | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl index cc5728e9ff..daff652bbd 100644 --- a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl +++ b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl @@ -11,15 +11,34 @@ namespace nbl { namespace hlsl { +namespace tonemapper +{ struct ReinhardParams { + using this_t = ReinhardParams; + static this_t create(float EV, float key = 0.18f, float WhitePointRelToEV = 16.f) + { + this_t retval; + retval.keyAndManualLinearExposure = key * exp2(EV); + retval.rcpWhite2 = 1.f / (WhitePointRelToEV * WhitePointRelToEV); + return retval; + } + float32_t keyAndManualLinearExposure; float32_t rcpWhite2; }; struct ACESParams { + using this_t = ACESParams; + static this_t create(float EV, float key = 0.18f, float Contrast = 1.f) { + this_t retval; + retval.gamma = Contrast; + retval.exposure = EV + log2(key * 0.77321666f); + return retval; + } + float32_t gamma; // 1.0 float32_t exposure; // actualExposure+midGrayLog2 }; @@ -49,8 +68,8 @@ float32_t3 aces(ACESParams params, float32_t3 rawCIEXYZcolor) // this is obviously fitted to some particular simulated sensor/film and display float32_t3 v = mul(XYZ_RRT_Input, tonemapped); - float32_t3 a = v * (v + float32_t3(0.0245786)) - float32_t3(0.000090537); - float32_t3 b = v * (v * float32_t(0.983729) + float32_t3(0.4329510)) + float32_t3(0.238081); + float32_t3 a = v * (v + float32_t3(0.0245786, 0.0245786, 0.0245786)) - float32_t3(0.000090537, 0.000090537, 0.000090537); + float32_t3 b = v * (v * float32_t3(0.983729, 0.983729, 0.983729) + float32_t3(0.4329510, 0.4329510, 0.4329510)) + float32_t3(0.238081, 0.238081, 0.238081); v = a / b; // ODT_SAT => XYZ @@ -67,6 +86,7 @@ float32_t3 aces(ACESParams params, float32_t3 rawCIEXYZcolor) // or get proper ACES RRT and ODTs // https://partnerhelp.netflixstudios.com/hc/en-us/articles/360000622487-I-m-using-ACES-Which-Output-Transform-should-I-use- +} } } From 305f7e7430077c72a9bbf0b814ed5a6bd9e691a6 Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Fri, 16 Aug 2024 16:35:49 +0530 Subject: [PATCH 11/56] Remove getGatheredLuma from luma_meter --- include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl index 21bd813439..94b898670b 100644 --- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl @@ -13,6 +13,7 @@ #include "nbl/builtin/hlsl/type_traits.hlsl" #include "nbl/builtin/hlsl/math/morton.hlsl" #include "nbl/builtin/hlsl/colorspace/EOTF.hlsl" +#include "nbl/builtin/hlsl/colorspace/OETF.hlsl" #include "nbl/builtin/hlsl/colorspace/encodeCIEXYZ.hlsl" namespace nbl @@ -57,7 +58,7 @@ struct geom_luma_meter { float32_t2 stride = window.meteringWindowScale / (sampleCount + float32_t2(1.0f, 1.0f)); float32_t2 samplePos = stride * sampleIndex; float32_t2 uvPos = (samplePos + float32_t2(0.5f, 0.5f)) / viewportSize; - float32_t3 color = colorspace::eotf::sRGB(tex.get(uvPos)); + float32_t3 color = colorspace::oetf::sRGB(tex.get(uvPos)); float32_t luma = dot(colorspace::sRGBtoXYZ[1], color); luma = clamp(luma, minLuma, maxLuma); @@ -102,16 +103,6 @@ struct geom_luma_meter { } } - float32_t getGatheredLuma( - NBL_REF_ARG(ValueAccessor) val, - uint32_t2 sampleCount - ) - { - uint32_t lumaSumBitPattern = val.get(glsl::gl_SubgroupInvocationID()); - float32_t lumaSumValue = float32_t(lumaSumBitPattern) / (log2(maxLuma) - log2(minLuma)) + log2(minLuma); - return glsl::subgroupAdd(lumaSumValue) / (sampleCount.x * sampleCount.y); - } - LumaMeteringWindow window; float32_t minLuma, maxLuma; }; From 3f4f6e93163e5c0c1a67f88b8906a07916ddbe84 Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Tue, 20 Aug 2024 18:28:48 +0530 Subject: [PATCH 12/56] Separate LumaMeteringWindow into a common header --- .../nbl/builtin/hlsl/luma_meter/common.hlsl | 27 +++++++++++++++++++ .../builtin/hlsl/luma_meter/luma_meter.hlsl | 12 +++------ src/nbl/builtin/CMakeLists.txt | 1 + 3 files changed, 31 insertions(+), 9 deletions(-) create mode 100644 include/nbl/builtin/hlsl/luma_meter/common.hlsl diff --git a/include/nbl/builtin/hlsl/luma_meter/common.hlsl b/include/nbl/builtin/hlsl/luma_meter/common.hlsl new file mode 100644 index 0000000000..210039390e --- /dev/null +++ b/include/nbl/builtin/hlsl/luma_meter/common.hlsl @@ -0,0 +1,27 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_BUILTIN_HLSL_LUMA_METER_COMMON_INCLUDED_ +#define _NBL_BUILTIN_HLSL_LUMA_METER_COMMON_INCLUDED_ + +#include "nbl/builtin/hlsl/cpp_compat.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace luma_meter +{ + +struct MeteringWindow +{ + float32_t2 meteringWindowScale; + float32_t2 meteringWindowOffset; +}; + +} +} +} + +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl index 94b898670b..e865d61c0d 100644 --- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl @@ -5,7 +5,6 @@ #ifndef _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_ #define _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_ -#include "nbl/builtin/hlsl/cpp_compat.hlsl" #include "nbl/builtin/hlsl/glsl_compat/core.hlsl" #include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" #include "nbl/builtin/hlsl/workgroup/basic.hlsl" @@ -15,6 +14,7 @@ #include "nbl/builtin/hlsl/colorspace/EOTF.hlsl" #include "nbl/builtin/hlsl/colorspace/OETF.hlsl" #include "nbl/builtin/hlsl/colorspace/encodeCIEXYZ.hlsl" +#include "nbl/builtin/hlsl/luma_meter/common.hlsl" namespace nbl { @@ -23,17 +23,11 @@ namespace hlsl namespace luma_meter { -struct LumaMeteringWindow -{ - float32_t2 meteringWindowScale; - float32_t2 meteringWindowOffset; -}; - template struct geom_luma_meter { using this_t = geom_luma_meter; - static this_t create(NBL_REF_ARG(LumaMeteringWindow) window, float32_t lumaMinimum, float32_t lumaMaximum) + static this_t create(NBL_REF_ARG(MeteringWindow) window, float32_t lumaMinimum, float32_t lumaMaximum) { this_t retval; retval.window = window; @@ -103,7 +97,7 @@ struct geom_luma_meter { } } - LumaMeteringWindow window; + MeteringWindow window; float32_t minLuma, maxLuma; }; } diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index df61293d4a..b4346c428e 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -35,6 +35,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/__ref.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/__ptr.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/bda_accessor.hlsl") # luma metering +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/common.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/luma_meter.hlsl") # tonemapper LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tonemapper/operators.hlsl") From 515512a9dc5287dd68acce86205c53b5b219ba54 Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Tue, 20 Aug 2024 18:32:27 +0530 Subject: [PATCH 13/56] Simplify luma_meter naming --- include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl index e865d61c0d..fb07acb8f4 100644 --- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl @@ -24,8 +24,8 @@ namespace luma_meter { template -struct geom_luma_meter { - using this_t = geom_luma_meter; +struct geom_meter { + using this_t = geom_meter; static this_t create(NBL_REF_ARG(MeteringWindow) window, float32_t lumaMinimum, float32_t lumaMaximum) { From 1919e53ed6ecb319f7892005d0faad86706288a2 Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Tue, 20 Aug 2024 19:06:03 +0530 Subject: [PATCH 14/56] Simplify morton code --- include/nbl/builtin/hlsl/math/morton.hlsl | 135 +--------------------- 1 file changed, 6 insertions(+), 129 deletions(-) diff --git a/include/nbl/builtin/hlsl/math/morton.hlsl b/include/nbl/builtin/hlsl/math/morton.hlsl index 1cd2105dc5..c0769fc88b 100644 --- a/include/nbl/builtin/hlsl/math/morton.hlsl +++ b/include/nbl/builtin/hlsl/math/morton.hlsl @@ -19,9 +19,8 @@ namespace hlsl namespace impl { -#ifdef __HLSL_VERSION template -T morton2d_mask(uint16_t _n) +NBL_CONSTEXPR_FUNC T morton2d_mask(uint16_t _n) { const static uint64_t mask[5] = { @@ -31,11 +30,11 @@ T morton2d_mask(uint16_t _n) 0x00FF00FF00FF00FFull, 0x0000FFFF0000FFFFull }; - return (T)mask[_n]; + return nbl::hlsl::_static_cast(mask[_n]); } template -T morton3d_mask(uint16_t _n) +NBL_CONSTEXPR_FUNC T morton3d_mask(uint16_t _n) { const static uint64_t mask[5] = { @@ -45,10 +44,10 @@ T morton3d_mask(uint16_t _n) 0x001F0000FF0000FFull, 0x001F00000000FFFFull }; - return (T)mask[_n]; + return nbl::hlsl::_static_cast(mask[_n]); } template -T morton4d_mask(uint16_t _n) +NBL_CONSTEXPR_FUNC T morton4d_mask(uint16_t _n) { const static uint64_t mask[4] = { @@ -57,7 +56,7 @@ T morton4d_mask(uint16_t _n) 0x000F000F000F000Full, 0x000000FF000000FFull }; - return (T)mask[_n]; + return nbl::hlsl::_static_cast(mask[_n]); } template @@ -141,128 +140,6 @@ inline T separate_bits_4d(T x) return x; } -#else -template -constexpr T morton2d_mask(uint8_t _n) -{ - constexpr uint64_t mask[5] = - { - 0x5555555555555555ull, - 0x3333333333333333ull, - 0x0F0F0F0F0F0F0F0Full, - 0x00FF00FF00FF00FFull, - 0x0000FFFF0000FFFFull - }; - return static_cast(mask[_n]); -} -template -constexpr T morton3d_mask(uint8_t _n) -{ - constexpr uint64_t mask[5] = - { - 0x1249249249249249ull, - 0x10C30C30C30C30C3ull, - 0x010F00F00F00F00Full, - 0x001F0000FF0000FFull, - 0x001F00000000FFFFull - }; - return static_cast(mask[_n]); -} -template -constexpr T morton4d_mask(uint8_t _n) -{ - constexpr uint64_t mask[4] = - { - 0x1111111111111111ull, - 0x0303030303030303ull, - 0x000F000F000F000Full, - 0x000000FF000000FFull - }; - return static_cast(mask[_n]); -} - -template -inline T morton2d_decode(T x) -{ - x = x & morton2d_mask(0); - x = (x | (x >> 1)) & morton2d_mask(1); - x = (x | (x >> 2)) & morton2d_mask(2); - if constexpr (bitDepth > 8u) - { - x = (x | (x >> 4)) & morton2d_mask(3); - } - if constexpr (bitDepth > 16u) - { - x = (x | (x >> 8)) & morton2d_mask(4); - } - if constexpr (bitDepth > 32u) - { - x = (x | (x >> 16)); - } - return x; -} - -//! Puts bits on even positions filling gaps with 0s -template -inline T separate_bits_2d(T x) -{ - if constexpr (bitDepth > 32u) - { - x = (x | (x << 16)) & morton2d_mask(4); - } - if constexpr (bitDepth > 16u) - { - x = (x | (x << 8)) & morton2d_mask(3); - } - if constexpr (bitDepth > 8u) - { - x = (x | (x << 4)) & morton2d_mask(2); - } - x = (x | (x << 2)) & morton2d_mask(1); - x = (x | (x << 1)) & morton2d_mask(0); - - return x; -} -template -inline T separate_bits_3d(T x) -{ - if constexpr (bitDepth > 32u) - { - x = (x | (x << 32)) & morton3d_mask(4); - } - if constexpr (bitDepth > 16u) - { - x = (x | (x << 16)) & morton3d_mask(3); - } - if constexpr (bitDepth > 8u) - { - x = (x | (x << 8)) & morton3d_mask(2); - } - x = (x | (x << 4)) & morton3d_mask(1); - x = (x | (x << 2)) & morton3d_mask(0); - - return x; -} -template -inline T separate_bits_4d(T x) -{ - if constexpr (bitDepth > 32u) - { - x = (x | (x << 24)) & morton4d_mask(3); - } - if constexpr (bitDepth > 16u) - { - x = (x | (x << 12)) & morton4d_mask(2); - } - if constexpr (bitDepth > 8u) - { - x = (x | (x << 6)) & morton4d_mask(1); - } - x = (x | (x << 3)) & morton4d_mask(0); - - return x; -} -#endif } template From 4c582382e8adca012b959577367138a8f1a92dfd Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Tue, 20 Aug 2024 19:09:24 +0530 Subject: [PATCH 15/56] Add missing comment --- include/nbl/builtin/hlsl/tonemapper/operators.hlsl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl index daff652bbd..1481fd92b2 100644 --- a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl +++ b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl @@ -35,7 +35,8 @@ struct ACESParams static this_t create(float EV, float key = 0.18f, float Contrast = 1.f) { this_t retval; retval.gamma = Contrast; - retval.exposure = EV + log2(key * 0.77321666f); + const float reinhardMatchCorrection = 0.77321666f; // middle grays get exposed to different values between tonemappers given the same key + retval.exposure = EV + log2(key * reinhardMatchCorrection); return retval; } From 3c3f8b84025dfddb3464d4bc9ed5ca76f651b07c Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Tue, 20 Aug 2024 20:09:02 +0530 Subject: [PATCH 16/56] Refactor tonemapping operators --- .../builtin/hlsl/tonemapper/operators.hlsl | 106 +++++++++--------- 1 file changed, 56 insertions(+), 50 deletions(-) diff --git a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl index 1481fd92b2..854f78e302 100644 --- a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl +++ b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl @@ -6,6 +6,7 @@ #define _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_INCLUDED_ #include "nbl/builtin/hlsl/cpp_compat.hlsl" +#include "nbl/builtin/hlsl/type_traits.hlsl" namespace nbl { @@ -14,10 +15,13 @@ namespace hlsl namespace tonemapper { -struct ReinhardParams +template +struct Reinhard { - using this_t = ReinhardParams; - static this_t create(float EV, float key = 0.18f, float WhitePointRelToEV = 16.f) + using float_t = enable_if_t::value, T>; + using float_t3 = typename conditional, float32_t3, float16_t3>::type; + using this_t = Reinhard; + static this_t create(float_t EV, float_t key = 0.18f, float_t WhitePointRelToEV = 16.f) { this_t retval; retval.keyAndManualLinearExposure = key * exp2(EV); @@ -25,63 +29,65 @@ struct ReinhardParams return retval; } - float32_t keyAndManualLinearExposure; - float32_t rcpWhite2; + float_t3 operator()(float_t3 rawCIEXYZcolor) { + float_t exposureFactors = keyAndManualLinearExposure; + float_t exposedLuma = rawCIEXYZcolor.y * exposureFactors; + float_t colorMultiplier = (exposureFactors * (1.0 + exposedLuma * rcpWhite2) / (1.0 + exposedLuma)); + return rawCIEXYZcolor * colorMultiplier; + } + + float_t3 keyAndManualLinearExposure; + float_t3 rcpWhite2; }; -struct ACESParams +template +struct ACES { - using this_t = ACESParams; - static this_t create(float EV, float key = 0.18f, float Contrast = 1.f) { + using float_t = enable_if_t::value, T>; + using float_t3 = typename conditional, float32_t3, float16_t3>::type; + using float_t3x3 = typename conditional, float32_t3x3, float16_t3x3>::type; + + using this_t = ACES; + static this_t create(float_t EV, float_t key = 0.18f, float_t Contrast = 1.f) { this_t retval; retval.gamma = Contrast; - const float reinhardMatchCorrection = 0.77321666f; // middle grays get exposed to different values between tonemappers given the same key + const float_t reinhardMatchCorrection = 0.77321666f; // middle grays get exposed to different values between tonemappers given the same key retval.exposure = EV + log2(key * reinhardMatchCorrection); return retval; } - float32_t gamma; // 1.0 - float32_t exposure; // actualExposure+midGrayLog2 -}; - - -float32_t3 reinhard(ReinhardParams params, float32_t3 rawCIEXYZcolor) -{ - float32_t exposureFactors = params.keyAndManualLinearExposure; - float32_t exposedLuma = rawCIEXYZcolor.y * exposureFactors; - float32_t colorMultiplier = (exposureFactors * (1.0 + exposedLuma * params.rcpWhite2) / (1.0 + exposedLuma)); - return rawCIEXYZcolor * colorMultiplier; -} + float_t3 operator()(float_t3 rawCIEXYZcolor) { + float_t3 tonemapped = rawCIEXYZcolor; + if (tonemapped.y > 1.175494351e-38) + tonemapped *= exp2(log2(tonemapped.y) * (gamma - 1.0) + (exposure) * gamma); + + // XYZ => RRT_SAT + // this seems to be a matrix for some hybrid colorspace, coefficients are somewhere inbetween BT2020 and ACEScc(t) + const float_t3x3 XYZ_RRT_Input = float_t3x3( + float_t3(1.594168310, -0.262608051, -0.231993079), + float_t3(-0.6332771780, 1.5840380200, 0.0164147373), + float_t3(0.00892840419, 0.03648501260, 0.87711471300) + ); + + // this is obviously fitted to some particular simulated sensor/film and display + float_t3 v = mul(XYZ_RRT_Input, tonemapped); + float_t3 a = v * (v + float_t3(0.0245786, 0.0245786, 0.0245786)) - float_t3(0.000090537, 0.000090537, 0.000090537); + float_t3 b = v * (v * float_t3(0.983729, 0.983729, 0.983729) + float_t3(0.4329510, 0.4329510, 0.4329510)) + float_t3(0.238081, 0.238081, 0.238081); + v = a / b; + + // ODT_SAT => XYZ + // this seems to be a matrix for some hybrid colorspace, coefficients are similar to AdobeRGB,BT2020 and ACEScc(t) + const float_t3x3 ODT_XYZ_Output = float_t3x3( + float_t3(0.624798000, 0.164064825, 0.161605373), + float_t3(0.268048108, 0.674283803, 0.057667464), + float_t3(0.0157514643, 0.0526682511, 1.0204007600) + ); + return mul(ODT_XYZ_Output, v); + } -float32_t3 aces(ACESParams params, float32_t3 rawCIEXYZcolor) -{ - float32_t3 tonemapped = rawCIEXYZcolor; - if (tonemapped.y > 1.175494351e-38) - tonemapped *= exp2(log2(tonemapped.y) * (params.gamma - 1.0) + (params.exposure) * params.gamma); - - // XYZ => RRT_SAT - // this seems to be a matrix for some hybrid colorspace, coefficients are somewhere inbetween BT2020 and ACEScc(t) - const float32_t3x3 XYZ_RRT_Input = float32_t3x3( - float32_t3(1.594168310, -0.262608051, -0.231993079), - float32_t3(-0.6332771780, 1.5840380200, 0.0164147373), - float32_t3(0.00892840419, 0.03648501260, 0.87711471300) - ); - - // this is obviously fitted to some particular simulated sensor/film and display - float32_t3 v = mul(XYZ_RRT_Input, tonemapped); - float32_t3 a = v * (v + float32_t3(0.0245786, 0.0245786, 0.0245786)) - float32_t3(0.000090537, 0.000090537, 0.000090537); - float32_t3 b = v * (v * float32_t3(0.983729, 0.983729, 0.983729) + float32_t3(0.4329510, 0.4329510, 0.4329510)) + float32_t3(0.238081, 0.238081, 0.238081); - v = a / b; - - // ODT_SAT => XYZ - // this seems to be a matrix for some hybrid colorspace, coefficients are similar to AdobeRGB,BT2020 and ACEScc(t) - const float32_t3x3 ODT_XYZ_Output = float32_t3x3( - float32_t3(0.624798000, 0.164064825, 0.161605373), - float32_t3(0.268048108, 0.674283803, 0.057667464), - float32_t3(0.0157514643, 0.0526682511, 1.0204007600) - ); - return mul(ODT_XYZ_Output, v); -} + float_t gamma; // 1.0 + float_t exposure; // actualExposure+midGrayLog2 +}; // ideas for more operators https://web.archive.org/web/20191226154550/http://cs.columbia.edu/CAVE/software/softlib/dorf.php // or get proper ACES RRT and ODTs From b0e07505a374d3e81e18e9e71c39152e4599051c Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Tue, 20 Aug 2024 20:17:38 +0530 Subject: [PATCH 17/56] Small fixes --- include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl index fb07acb8f4..af128b0f98 100644 --- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl @@ -27,7 +27,7 @@ template; - static this_t create(NBL_REF_ARG(MeteringWindow) window, float32_t lumaMinimum, float32_t lumaMaximum) + static this_t create(NBL_CONST_REF_ARG(MeteringWindow) window, float32_t lumaMinimum, float32_t lumaMaximum) { this_t retval; retval.window = window; @@ -68,11 +68,12 @@ struct geom_meter { float32_t2 viewportSize ) { + + uint32_t tid = workgroup::SubgroupContiguousIndex(); uint32_t2 coord = { - morton2d_decode_x(glsl::gl_LocalInvocationIndex()), - morton2d_decode_y(glsl::gl_LocalInvocationIndex()) + morton2d_decode_x(tid), + morton2d_decode_y(tid) }; - uint32_t tid = workgroup::SubgroupContiguousIndex(); uint32_t2 sampleIndex = coord * GroupSize + float32_t2(glsl::gl_SubgroupID() + 1, glsl::gl_SubgroupInvocationID() + 1); float32_t luma = 0.0f; @@ -81,8 +82,6 @@ struct geom_meter { luma = computeLuma(tex, sampleCount, sampleIndex, viewportSize); float32_t lumaSum = reduction(luma, sdata); - sdata.workgroupExecutionAndMemoryBarrier(); - if (tid == GroupSize - 1) { uint32_t3 workGroupCount = glsl::gl_NumWorkGroups(); uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); From e8e46c9d042e76adb3bfd449982fcff70986cfba Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Wed, 21 Aug 2024 16:20:32 +0530 Subject: [PATCH 18/56] Use promote to simplify code --- include/nbl/builtin/hlsl/tonemapper/operators.hlsl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl index 854f78e302..e5e6a9a97c 100644 --- a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl +++ b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl @@ -71,8 +71,8 @@ struct ACES // this is obviously fitted to some particular simulated sensor/film and display float_t3 v = mul(XYZ_RRT_Input, tonemapped); - float_t3 a = v * (v + float_t3(0.0245786, 0.0245786, 0.0245786)) - float_t3(0.000090537, 0.000090537, 0.000090537); - float_t3 b = v * (v * float_t3(0.983729, 0.983729, 0.983729) + float_t3(0.4329510, 0.4329510, 0.4329510)) + float_t3(0.238081, 0.238081, 0.238081); + float_t3 a = v * (v + promote(0.0245786)) - promote(0.000090537); + float_t3 b = v * (v * promote(0.983729) + promote(0.4329510)) + promote(0.238081); v = a / b; // ODT_SAT => XYZ From ee5affe6f20f25e1c7eb2675e07fe340be9204fb Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Wed, 21 Aug 2024 17:07:34 +0530 Subject: [PATCH 19/56] Add static create to MeteringWindow --- include/nbl/builtin/hlsl/luma_meter/common.hlsl | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/nbl/builtin/hlsl/luma_meter/common.hlsl b/include/nbl/builtin/hlsl/luma_meter/common.hlsl index 210039390e..55d1713619 100644 --- a/include/nbl/builtin/hlsl/luma_meter/common.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/common.hlsl @@ -16,8 +16,16 @@ namespace luma_meter struct MeteringWindow { + using this_t = MeteringWindow; float32_t2 meteringWindowScale; float32_t2 meteringWindowOffset; + + static this_t create(float32_t2 scale, float32_t2 offset) { + this_t retval; + retval.meteringWindowScale = scale; + retval.meteringWindowOffset = offset; + return retval; + } }; } From 56389f45a6f5689889d232fb051a15b0001e43f7 Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Wed, 21 Aug 2024 18:31:28 +0530 Subject: [PATCH 20/56] Infer sample count from viewportSize --- .../builtin/hlsl/luma_meter/luma_meter.hlsl | 49 +++++++------------ .../builtin/hlsl/tonemapper/operators.hlsl | 4 +- 2 files changed, 21 insertions(+), 32 deletions(-) diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl index af128b0f98..23deac8bbe 100644 --- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl @@ -27,12 +27,10 @@ template; - static this_t create(NBL_CONST_REF_ARG(MeteringWindow) window, float32_t lumaMinimum, float32_t lumaMaximum) + static this_t create(float32_t2 lumaMinMax) { this_t retval; - retval.window = window; - retval.minLuma = lumaMinimum; - retval.maxLuma = lumaMaximum; + retval.lumaMinMax = lumaMinMax; return retval; } @@ -43,61 +41,52 @@ struct geom_meter { } float32_t computeLuma( + NBL_CONST_REF_ARG(MeteringWindow) window, NBL_REF_ARG(TexAccessor) tex, - uint32_t2 sampleCount, - uint32_t2 sampleIndex, - float32_t2 viewportSize + float32_t2 shiftedCoord ) { - float32_t2 stride = window.meteringWindowScale / (sampleCount + float32_t2(1.0f, 1.0f)); - float32_t2 samplePos = stride * sampleIndex; - float32_t2 uvPos = (samplePos + float32_t2(0.5f, 0.5f)) / viewportSize; + float32_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset; float32_t3 color = colorspace::oetf::sRGB(tex.get(uvPos)); float32_t luma = dot(colorspace::sRGBtoXYZ[1], color); - luma = clamp(luma, minLuma, maxLuma); + luma = clamp(luma, lumaMinMax.x, lumaMinMax.y); - return log2(luma / minLuma) / log2(maxLuma / minLuma); + return log2(luma / lumaMinMax.x) / log2(lumaMinMax.y / lumaMinMax.x); } void gatherLuma( + NBL_CONST_REF_ARG(MeteringWindow) window, NBL_REF_ARG(ValueAccessor) val, NBL_REF_ARG(TexAccessor) tex, NBL_REF_ARG(SharedAccessor) sdata, - uint32_t2 sampleCount, - float32_t2 viewportSize + float32_t2 tileOffset ) { - uint32_t tid = workgroup::SubgroupContiguousIndex(); uint32_t2 coord = { morton2d_decode_x(tid), morton2d_decode_y(tid) }; - uint32_t2 sampleIndex = coord * GroupSize + float32_t2(glsl::gl_SubgroupID() + 1, glsl::gl_SubgroupInvocationID() + 1); float32_t luma = 0.0f; + luma = computeLuma(window, tex, tileOffset + (float32_t2)(coord)); + float32_t lumaSum = reduction(luma, sdata); - if (sampleIndex.x <= sampleCount.x && sampleIndex.y <= sampleCount.y) { - luma = computeLuma(tex, sampleCount, sampleIndex, viewportSize); - float32_t lumaSum = reduction(luma, sdata); - - if (tid == GroupSize - 1) { - uint32_t3 workGroupCount = glsl::gl_NumWorkGroups(); - uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); + if (tid == GroupSize - 1) { + uint32_t3 workGroupCount = glsl::gl_NumWorkGroups(); + uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); - uint32_t lumaSumBitPattern = uint32_t(clamp((lumaSum - log2(minLuma)) * (log2(maxLuma) - log2(minLuma)), 0.f, float32_t((1 << fixedPointBitsLeft) - 1))); + uint32_t lumaSumBitPattern = uint32_t(clamp((lumaSum - log2(lumaMinMax.x)) * (log2(lumaMinMax.y) - log2(lumaMinMax.x)), 0.f, float32_t((1 << fixedPointBitsLeft) - 1))); - uint32_t3 workgroupSize = glsl::gl_WorkGroupSize(); - uint32_t workgroupIndex = dot(uint32_t3(workgroupSize.y * workgroupSize.z, workgroupSize.z, 1), glsl::gl_WorkGroupID()); + uint32_t3 workgroupSize = glsl::gl_WorkGroupSize(); + uint32_t workgroupIndex = dot(uint32_t3(workgroupSize.y * workgroupSize.z, workgroupSize.z, 1), glsl::gl_WorkGroupID()); - val.atomicAdd(workgroupIndex & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern); - } + val.atomicAdd(workgroupIndex & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern); } } - MeteringWindow window; - float32_t minLuma, maxLuma; + float32_t2 lumaMinMax; }; } } diff --git a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl index e5e6a9a97c..824e31d68a 100644 --- a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl +++ b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl @@ -36,8 +36,8 @@ struct Reinhard return rawCIEXYZcolor * colorMultiplier; } - float_t3 keyAndManualLinearExposure; - float_t3 rcpWhite2; + float_t keyAndManualLinearExposure; + float_t rcpWhite2; }; template From 23771d1610b50e2af60b2f4661d11c06e50d854f Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Thu, 22 Aug 2024 23:02:11 +0530 Subject: [PATCH 21/56] Rename gatherLuma, add toXYZ method and templatize the float type --- .../builtin/hlsl/luma_meter/luma_meter.hlsl | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl index 23deac8bbe..b0b19b3a82 100644 --- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl @@ -11,9 +11,6 @@ #include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" #include "nbl/builtin/hlsl/type_traits.hlsl" #include "nbl/builtin/hlsl/math/morton.hlsl" -#include "nbl/builtin/hlsl/colorspace/EOTF.hlsl" -#include "nbl/builtin/hlsl/colorspace/OETF.hlsl" -#include "nbl/builtin/hlsl/colorspace/encodeCIEXYZ.hlsl" #include "nbl/builtin/hlsl/luma_meter/common.hlsl" namespace nbl @@ -25,42 +22,45 @@ namespace luma_meter template struct geom_meter { + using float_t = typename SharedAccessor::type; + using float_t2 = typename conditional, float32_t2, float16_t2>::type; + using float_t3 = typename conditional, float32_t3, float16_t3>::type; using this_t = geom_meter; - static this_t create(float32_t2 lumaMinMax) + static this_t create(float_t2 lumaMinMax) { this_t retval; retval.lumaMinMax = lumaMinMax; return retval; } - float32_t reduction(float32_t value, NBL_REF_ARG(SharedAccessor) sdata) + float_t reduction(float_t value, NBL_REF_ARG(SharedAccessor) sdata) { - return workgroup::reduction < plus < float32_t >, GroupSize >:: + return workgroup::reduction < plus < float_t >, GroupSize >:: template __call (value, sdata); } - float32_t computeLuma( + float_t computeLumaLog2( NBL_CONST_REF_ARG(MeteringWindow) window, NBL_REF_ARG(TexAccessor) tex, - float32_t2 shiftedCoord + float_t2 shiftedCoord ) { - float32_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset; - float32_t3 color = colorspace::oetf::sRGB(tex.get(uvPos)); - float32_t luma = dot(colorspace::sRGBtoXYZ[1], color); + float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset; + float_t3 color = tex.get(uvPos); + float_t luma = TexAccessor::toXYZ(color); luma = clamp(luma, lumaMinMax.x, lumaMinMax.y); - return log2(luma / lumaMinMax.x) / log2(lumaMinMax.y / lumaMinMax.x); + return max(log2(luma), log2(lumaMinMax.x)); } - void gatherLuma( + void sampleLuma( NBL_CONST_REF_ARG(MeteringWindow) window, NBL_REF_ARG(ValueAccessor) val, NBL_REF_ARG(TexAccessor) tex, NBL_REF_ARG(SharedAccessor) sdata, - float32_t2 tileOffset + float_t2 tileOffset ) { uint32_t tid = workgroup::SubgroupContiguousIndex(); @@ -69,9 +69,9 @@ struct geom_meter { morton2d_decode_y(tid) }; - float32_t luma = 0.0f; - luma = computeLuma(window, tex, tileOffset + (float32_t2)(coord)); - float32_t lumaSum = reduction(luma, sdata); + float_t luma = 0.0f; + luma = computeLumaLog2(window, tex, tileOffset + (float32_t2)(coord)); + float_t lumaSum = reduction(luma, sdata); if (tid == GroupSize - 1) { uint32_t3 workGroupCount = glsl::gl_NumWorkGroups(); @@ -86,7 +86,7 @@ struct geom_meter { } } - float32_t2 lumaMinMax; + float_t2 lumaMinMax; }; } } From ac390393cca2c89237532b57f12d95cc5584f0be Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Tue, 27 Aug 2024 00:41:14 +0530 Subject: [PATCH 22/56] Add uploadFloat, downloadFloat and gatherLuma --- .../builtin/hlsl/luma_meter/luma_meter.hlsl | 63 ++++++++++++++++--- 1 file changed, 56 insertions(+), 7 deletions(-) diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl index b0b19b3a82..c39b2e3ab6 100644 --- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl @@ -7,6 +7,7 @@ #include "nbl/builtin/hlsl/glsl_compat/core.hlsl" #include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" +#include "nbl/builtin/hlsl/glsl_compat/subgroup_arithmetic.hlsl" #include "nbl/builtin/hlsl/workgroup/basic.hlsl" #include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" #include "nbl/builtin/hlsl/type_traits.hlsl" @@ -27,10 +28,11 @@ struct geom_meter { using float_t3 = typename conditional, float32_t3, float16_t3>::type; using this_t = geom_meter; - static this_t create(float_t2 lumaMinMax) + static this_t create(float_t2 lumaMinMax, float_t sampleCount) { this_t retval; retval.lumaMinMax = lumaMinMax; + retval.sampleCount = sampleCount; return retval; } @@ -55,6 +57,34 @@ struct geom_meter { return max(log2(luma), log2(lumaMinMax.x)); } + void uploadFloat( + NBL_REF_ARG(ValueAccessor) val_accessor, + uint32_t index, + float_t val, + float_t minLog2, + float_t rangeLog2 + ) + { + uint32_t3 workGroupCount = glsl::gl_NumWorkGroups(); + uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); + + uint32_t lumaSumBitPattern = uint32_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1))); + + val_accessor.atomicAdd(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern); + } + + float_t downloadFloat( + NBL_REF_ARG(ValueAccessor) val_accessor, + uint32_t index, + float_t minLog2, + float_t rangeLog2 + ) + { + float_t luma = (float_t)val.get(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1)); + luma = luma / rangeLog2 + minLog2; + return luma; + } + void sampleLuma( NBL_CONST_REF_ARG(MeteringWindow) window, NBL_REF_ARG(ValueAccessor) val, @@ -74,18 +104,37 @@ struct geom_meter { float_t lumaSum = reduction(luma, sdata); if (tid == GroupSize - 1) { - uint32_t3 workGroupCount = glsl::gl_NumWorkGroups(); - uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); - - uint32_t lumaSumBitPattern = uint32_t(clamp((lumaSum - log2(lumaMinMax.x)) * (log2(lumaMinMax.y) - log2(lumaMinMax.x)), 0.f, float32_t((1 << fixedPointBitsLeft) - 1))); - uint32_t3 workgroupSize = glsl::gl_WorkGroupSize(); uint32_t workgroupIndex = dot(uint32_t3(workgroupSize.y * workgroupSize.z, workgroupSize.z, 1), glsl::gl_WorkGroupID()); - val.atomicAdd(workgroupIndex & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern); + uploadFloat( + val, + workgroupIndex, + lumaSum, + log2(lumaMinMax.x), + log2(lumaMinMax.y / lumaMinMax.x) + ); } } + void gatherLuma( + NBL_REF_ARG(ValueAccessor) val + ) + { + uint32_t tid = workgroup::SubgroupContiguousIndex(); + float_t lumaSum = glsl::subgroupAdd( + downloadFloat( + val, + tid, + log2(lumaMinMax.x), + log2(lumaMinMax.y / lumaMinMax.x) + ) + ); + + uploadFloat(val, 0, lumaSum, log2(lumaMinMax.x), log2(lumaMinMax.y / lumaMinMax.x)); + } + + float_t sampleCount; float_t2 lumaMinMax; }; } From 49a80499c4ee3c7b09ce20e1f7a995d63cc7a73d Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Tue, 27 Aug 2024 19:37:11 +0530 Subject: [PATCH 23/56] Normalize tileOffset and coord to uv before computing Luma --- .../nbl/builtin/hlsl/luma_meter/luma_meter.hlsl | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl index c39b2e3ab6..6804c1d631 100644 --- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl @@ -50,7 +50,7 @@ struct geom_meter { { float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset; float_t3 color = tex.get(uvPos); - float_t luma = TexAccessor::toXYZ(color); + float_t luma = (float_t)TexAccessor::toXYZ(color); luma = clamp(luma, lumaMinMax.x, lumaMinMax.y); @@ -80,7 +80,7 @@ struct geom_meter { float_t rangeLog2 ) { - float_t luma = (float_t)val.get(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1)); + float_t luma = (float_t)val_accessor.get(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1)); luma = luma / rangeLog2 + minLog2; return luma; } @@ -90,7 +90,8 @@ struct geom_meter { NBL_REF_ARG(ValueAccessor) val, NBL_REF_ARG(TexAccessor) tex, NBL_REF_ARG(SharedAccessor) sdata, - float_t2 tileOffset + float_t2 tileOffset, + float_t2 viewportSize ) { uint32_t tid = workgroup::SubgroupContiguousIndex(); @@ -100,7 +101,8 @@ struct geom_meter { }; float_t luma = 0.0f; - luma = computeLumaLog2(window, tex, tileOffset + (float32_t2)(coord)); + float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize; + luma = computeLumaLog2(window, tex, shiftedCoord); float_t lumaSum = reduction(luma, sdata); if (tid == GroupSize - 1) { @@ -117,7 +119,7 @@ struct geom_meter { } } - void gatherLuma( + float_t gatherLuma( NBL_REF_ARG(ValueAccessor) val ) { @@ -131,7 +133,7 @@ struct geom_meter { ) ); - uploadFloat(val, 0, lumaSum, log2(lumaMinMax.x), log2(lumaMinMax.y / lumaMinMax.x)); + return lumaSum; } float_t sampleCount; From 8a10ae2e12f36d48f39ff3350920d800da1cc47e Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Sun, 29 Sep 2024 18:16:56 +0100 Subject: [PATCH 24/56] Simplify return statement --- include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl index 6804c1d631..266d6e6a2a 100644 --- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl @@ -81,8 +81,7 @@ struct geom_meter { ) { float_t luma = (float_t)val_accessor.get(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1)); - luma = luma / rangeLog2 + minLog2; - return luma; + return luma / rangeLog2 + minLog2; } void sampleLuma( From 6b01b6ddd4e687684e6e7a5f8073f7e556ad6967 Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Wed, 11 Dec 2024 00:26:02 +0000 Subject: [PATCH 25/56] Update submodule pointers --- 3rdparty/dxc/dxc | 2 +- 3rdparty/libexpat | 2 +- 3rdparty/nbl_spirv_cross | 2 +- 3rdparty/openexr | 2 +- 3rdparty/volk | 2 +- examples_tests | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc index a08b6cbeb1..29a5e1258e 160000 --- a/3rdparty/dxc/dxc +++ b/3rdparty/dxc/dxc @@ -1 +1 @@ -Subproject commit a08b6cbeb1038d14d0586d10a8cfa507b2fda8eb +Subproject commit 29a5e1258e2f01dd15ef1f58e24a02337c96c8f7 diff --git a/3rdparty/libexpat b/3rdparty/libexpat index e2004f9195..39e487da35 160000 --- a/3rdparty/libexpat +++ b/3rdparty/libexpat @@ -1 +1 @@ -Subproject commit e2004f9195700bb8248c8c954578f14fda58be27 +Subproject commit 39e487da353b20bb3a724311d179ba0fddffc65b diff --git a/3rdparty/nbl_spirv_cross b/3rdparty/nbl_spirv_cross index f4accc2a4b..b52e6a55ca 160000 --- a/3rdparty/nbl_spirv_cross +++ b/3rdparty/nbl_spirv_cross @@ -1 +1 @@ -Subproject commit f4accc2a4b478c42038c920aa0e43a8aab7d135c +Subproject commit b52e6a55ca2d9805a18dccfc45c7a2e692c1d8e1 diff --git a/3rdparty/openexr b/3rdparty/openexr index fca936a964..824ed557b3 160000 --- a/3rdparty/openexr +++ b/3rdparty/openexr @@ -1 +1 @@ -Subproject commit fca936a964da5983daecdbed7cd249934701b41a +Subproject commit 824ed557b3c59288a685356c708e5806b1122fe1 diff --git a/3rdparty/volk b/3rdparty/volk index b6be5ba0af..efb96f9031 160000 --- a/3rdparty/volk +++ b/3rdparty/volk @@ -1 +1 @@ -Subproject commit b6be5ba0af5567974cc8a0261471573418f0f34f +Subproject commit efb96f90317e1c902d6b45ae95d14e67779a2241 diff --git a/examples_tests b/examples_tests index 8b6675b3ba..36633f5c2c 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 8b6675b3ba9fe1ca00f2c6573a4888abb8477da7 +Subproject commit 36633f5c2cae3e8e870a837c86e71f3a50061a3e From f95f1c1e7eb5fe5c930b1c0badba345f4e27033e Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Wed, 11 Dec 2024 00:54:41 +0000 Subject: [PATCH 26/56] Update submodule pointer --- 3rdparty/imgui | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/imgui b/3rdparty/imgui index e489e40a85..a29e9dba30 160000 --- a/3rdparty/imgui +++ b/3rdparty/imgui @@ -1 +1 @@ -Subproject commit e489e40a853426767de9ce0637bc0c9ceb431c1e +Subproject commit a29e9dba3012eca9f80bdc4c39ca61a1df8e7175 From 1a5827379821023273130a547b8ba50141cd85a9 Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Fri, 13 Dec 2024 04:34:45 +0000 Subject: [PATCH 27/56] Update submodule pointer --- 3rdparty/Vulkan-Headers | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/Vulkan-Headers b/3rdparty/Vulkan-Headers index 2c823b7f27..31aa7f634b 160000 --- a/3rdparty/Vulkan-Headers +++ b/3rdparty/Vulkan-Headers @@ -1 +1 @@ -Subproject commit 2c823b7f27590ec0a489f7fbe14b154e13fa5cfb +Subproject commit 31aa7f634b052d87ede4664053e85f3f4d1d50d3 From b6e1f57110c4e34715bd6c15223a1db9224c47ff Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Fri, 13 Dec 2024 04:46:17 +0000 Subject: [PATCH 28/56] Update submodule pointer --- 3rdparty/volk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/3rdparty/volk b/3rdparty/volk index efb96f9031..b6be5ba0af 160000 --- a/3rdparty/volk +++ b/3rdparty/volk @@ -1 +1 @@ -Subproject commit efb96f90317e1c902d6b45ae95d14e67779a2241 +Subproject commit b6be5ba0af5567974cc8a0261471573418f0f34f From 5239c29945cd2f609d13f40c66af3dcc4bd2f6a2 Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Tue, 14 Jan 2025 00:42:26 +0000 Subject: [PATCH 29/56] Update submodule pointer --- 3rdparty/Vulkan-Headers | 2 +- 3rdparty/dxc/dxc | 2 +- 3rdparty/imgui | 2 +- 3rdparty/libexpat | 2 +- 3rdparty/nbl_spirv_cross | 2 +- 3rdparty/openexr | 2 +- 3rdparty/parallel-hashmap | 2 +- examples_tests | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/3rdparty/Vulkan-Headers b/3rdparty/Vulkan-Headers index 31aa7f634b..2c823b7f27 160000 --- a/3rdparty/Vulkan-Headers +++ b/3rdparty/Vulkan-Headers @@ -1 +1 @@ -Subproject commit 31aa7f634b052d87ede4664053e85f3f4d1d50d3 +Subproject commit 2c823b7f27590ec0a489f7fbe14b154e13fa5cfb diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc index 5adc27f9e4..a08b6cbeb1 160000 --- a/3rdparty/dxc/dxc +++ b/3rdparty/dxc/dxc @@ -1 +1 @@ -Subproject commit 5adc27f9e42de7681d65a98873048af661b9b367 +Subproject commit a08b6cbeb1038d14d0586d10a8cfa507b2fda8eb diff --git a/3rdparty/imgui b/3rdparty/imgui index a29e9dba30..e489e40a85 160000 --- a/3rdparty/imgui +++ b/3rdparty/imgui @@ -1 +1 @@ -Subproject commit a29e9dba3012eca9f80bdc4c39ca61a1df8e7175 +Subproject commit e489e40a853426767de9ce0637bc0c9ceb431c1e diff --git a/3rdparty/libexpat b/3rdparty/libexpat index 39e487da35..e2004f9195 160000 --- a/3rdparty/libexpat +++ b/3rdparty/libexpat @@ -1 +1 @@ -Subproject commit 39e487da353b20bb3a724311d179ba0fddffc65b +Subproject commit e2004f9195700bb8248c8c954578f14fda58be27 diff --git a/3rdparty/nbl_spirv_cross b/3rdparty/nbl_spirv_cross index b52e6a55ca..f4accc2a4b 160000 --- a/3rdparty/nbl_spirv_cross +++ b/3rdparty/nbl_spirv_cross @@ -1 +1 @@ -Subproject commit b52e6a55ca2d9805a18dccfc45c7a2e692c1d8e1 +Subproject commit f4accc2a4b478c42038c920aa0e43a8aab7d135c diff --git a/3rdparty/openexr b/3rdparty/openexr index c8a74d9ac9..fca936a964 160000 --- a/3rdparty/openexr +++ b/3rdparty/openexr @@ -1 +1 @@ -Subproject commit c8a74d9ac97dd579a47a7913f361a87349c0fffd +Subproject commit fca936a964da5983daecdbed7cd249934701b41a diff --git a/3rdparty/parallel-hashmap b/3rdparty/parallel-hashmap index 7684faf186..fd7b8fb87d 160000 --- a/3rdparty/parallel-hashmap +++ b/3rdparty/parallel-hashmap @@ -1 +1 @@ -Subproject commit 7684faf186806e2c88554a78188c18185b21f127 +Subproject commit fd7b8fb87d74cc990591c3443b2ef21e9e137500 diff --git a/examples_tests b/examples_tests index 36633f5c2c..f79caed8b5 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 36633f5c2cae3e8e870a837c86e71f3a50061a3e +Subproject commit f79caed8b54499c1a4e848672dec38ce85d9a184 From 06c915e42162869f11ae951b7a081c722505d4e8 Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 21 Jan 2025 16:11:27 +0100 Subject: [PATCH 30/56] stop rolling back my modules! --- 3rdparty/Vulkan-Headers | 2 +- 3rdparty/imgui | 2 +- 3rdparty/imguizmo | 2 +- 3rdparty/openexr | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/3rdparty/Vulkan-Headers b/3rdparty/Vulkan-Headers index 2c823b7f27..31aa7f634b 160000 --- a/3rdparty/Vulkan-Headers +++ b/3rdparty/Vulkan-Headers @@ -1 +1 @@ -Subproject commit 2c823b7f27590ec0a489f7fbe14b154e13fa5cfb +Subproject commit 31aa7f634b052d87ede4664053e85f3f4d1d50d3 diff --git a/3rdparty/imgui b/3rdparty/imgui index e489e40a85..a29e9dba30 160000 --- a/3rdparty/imgui +++ b/3rdparty/imgui @@ -1 +1 @@ -Subproject commit e489e40a853426767de9ce0637bc0c9ceb431c1e +Subproject commit a29e9dba3012eca9f80bdc4c39ca61a1df8e7175 diff --git a/3rdparty/imguizmo b/3rdparty/imguizmo index 6f4b2197ef..b10e91756d 160000 --- a/3rdparty/imguizmo +++ b/3rdparty/imguizmo @@ -1 +1 @@ -Subproject commit 6f4b2197efd715d16b19775b00f36c6c6f5aacb6 +Subproject commit b10e91756d32395f5c1fefd417899b657ed7cb88 diff --git a/3rdparty/openexr b/3rdparty/openexr index fca936a964..c8a74d9ac9 160000 --- a/3rdparty/openexr +++ b/3rdparty/openexr @@ -1 +1 @@ -Subproject commit fca936a964da5983daecdbed7cd249934701b41a +Subproject commit c8a74d9ac97dd579a47a7913f361a87349c0fffd From 90d20c44783c9f3837f554ae8a05beb1ecd9f956 Mon Sep 17 00:00:00 2001 From: devsh Date: Tue, 21 Jan 2025 16:49:29 +0100 Subject: [PATCH 31/56] point submodule at head --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index f79caed8b5..9e26a74aa1 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit f79caed8b54499c1a4e848672dec38ce85d9a184 +Subproject commit 9e26a74aa1bcbe5e26ee14a79d4f2ef9e2701e0d From 4edd38c002531e3bbf55a8f0649af187223a1077 Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Thu, 13 Mar 2025 11:57:14 +0000 Subject: [PATCH 32/56] Add capabilities for atomic ops --- include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl index 2ecb08cdb2..973a313e9c 100644 --- a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl +++ b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl @@ -61,37 +61,45 @@ pointer_t copyObject([[vk::ext_reference]] T v); // Here's the thing with atomics, it's not only the data type that dictates whether you can do an atomic or not. // It's the storage class that has the most effect (shared vs storage vs image) and we can't check that easily template // integers operate on 2s complement so same op for signed and unsigned +[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]] [[vk::ext_instruction(spv::OpAtomicIAdd)]] enable_if_t || is_same_v, T> atomicIAdd([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value); template // DXC Workaround +[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]] [[vk::ext_instruction(spv::OpAtomicIAdd)]] enable_if_t && (is_same_v || is_same_v), T> atomicIAdd(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value); template // integers operate on 2s complement so same op for signed and unsigned +[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]] [[vk::ext_capability(spv::CapabilityInt64Atomics)]] [[vk::ext_instruction(spv::OpAtomicIAdd)]] enable_if_t || is_same_v, T> atomicIAdd([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value); template // DXC Workaround +[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]] [[vk::ext_capability(spv::CapabilityInt64Atomics)]] [[vk::ext_instruction(spv::OpAtomicIAdd)]] enable_if_t && (is_same_v || is_same_v), T> atomicIAdd(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value); template // integers operate on 2s complement so same op for signed and unsigned +[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]] [[vk::ext_instruction(spv::OpAtomicISub)]] enable_if_t || is_same_v, T> atomicISub([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value); template // DXC Workaround +[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]] [[vk::ext_instruction(spv::OpAtomicISub)]] enable_if_t && (is_same_v || is_same_v), T> atomicISub(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value); template // integers operate on 2s complement so same op for signed and unsigned +[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]] [[vk::ext_capability(spv::CapabilityInt64Atomics)]] [[vk::ext_instruction(spv::OpAtomicISub)]] enable_if_t || is_same_v, T> atomicISub([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value); template // DXC Workaround +[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]] [[vk::ext_capability(spv::CapabilityInt64Atomics)]] [[vk::ext_instruction(spv::OpAtomicISub)]] enable_if_t && (is_same_v || is_same_v), T> atomicISub(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value); From f1e3e9866682fc79fa830d4a1c888674e24f58f7 Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Thu, 13 Mar 2025 11:58:01 +0000 Subject: [PATCH 33/56] Fix luma_meter --- include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl index 266d6e6a2a..9808b9e26d 100644 --- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl @@ -105,8 +105,8 @@ struct geom_meter { float_t lumaSum = reduction(luma, sdata); if (tid == GroupSize - 1) { - uint32_t3 workgroupSize = glsl::gl_WorkGroupSize(); - uint32_t workgroupIndex = dot(uint32_t3(workgroupSize.y * workgroupSize.z, workgroupSize.z, 1), glsl::gl_WorkGroupID()); + uint32_t3 workgroupCount = glsl::gl_NumWorkGroups(); + uint32_t workgroupIndex = (workgroupCount.x * workgroupCount.y * workgroupCount.z) / 64; uploadFloat( val, @@ -122,8 +122,8 @@ struct geom_meter { NBL_REF_ARG(ValueAccessor) val ) { - uint32_t tid = workgroup::SubgroupContiguousIndex(); - float_t lumaSum = glsl::subgroupAdd( + uint32_t tid = glsl::gl_SubgroupInvocationID(); + float_t luma = glsl::subgroupAdd( downloadFloat( val, tid, @@ -132,7 +132,10 @@ struct geom_meter { ) ); - return lumaSum; + uint32_t3 workGroupCount = glsl::gl_NumWorkGroups(); + uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); + + return (luma / (1 << fixedPointBitsLeft)) / sampleCount; } float_t sampleCount; From f1b7d170718d1ba0d48eef0b69af842be0463bea Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Sun, 16 Mar 2025 11:07:47 +0000 Subject: [PATCH 34/56] Add median_luma_meter --- .../builtin/hlsl/luma_meter/luma_meter.hlsl | 145 ++++++++++++++++++ 1 file changed, 145 insertions(+) diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl index 9808b9e26d..c17a64c437 100644 --- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl @@ -141,6 +141,151 @@ struct geom_meter { float_t sampleCount; float_t2 lumaMinMax; }; + +template +struct median_meter { + using int_t = typename SharedAccessor::type; + using float_t = float32_t; + using float_t2 = typename conditional, float32_t2, float16_t2>::type; + using float_t3 = typename conditional, float32_t3, float16_t3>::type; + using this_t = median_meter; + + static this_t create(float_t2 lumaMinMax, float_t sampleCount) { + this_t retval; + retval.lumaMinMax = lumaMinMax; + retval.sampleCount = sampleCount; + return retval; + } + + int_t inclusive_scan(float_t value, NBL_REF_ARG(SharedAccessor) sdata) { + return workgroup::inclusive_scan < plus < int_t >, GroupSize >:: + template __call (value, sdata); + } + + float_t computeLuma( + NBL_CONST_REF_ARG(MeteringWindow) window, + NBL_REF_ARG(TexAccessor) tex, + float_t2 shiftedCoord + ) { + float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset; + float_t3 color = tex.get(uvPos); + float_t luma = (float_t)TexAccessor::toXYZ(color); + + return clamp(luma, lumaMinMax.x, lumaMinMax.y); + } + + int_t float2Int( + float_t val, + float_t minLog2, + float_t rangeLog2 + ) { + uint32_t3 workGroupCount = glsl::gl_NumWorkGroups(); + uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); + + return int_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1))); + } + + float_t int2Float( + int_t val, + float_t minLog2, + float_t rangeLog2 + ) { + return val / rangeLog2 + minLog2; + } + + void sampleLuma( + NBL_CONST_REF_ARG(MeteringWindow) window, + NBL_REF_ARG(HistogramAccessor) histo, + NBL_REF_ARG(TexAccessor) tex, + NBL_REF_ARG(SharedAccessor) sdata, + float_t2 tileOffset, + float_t2 viewportSize + ) { + uint32_t tid = workgroup::SubgroupContiguousIndex(); + + for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) { + sdata.set(vid, 0); + } + + sdata.workgroupExecutionAndMemoryBarrier(); + + uint32_t2 coord = { + morton2d_decode_x(tid), + morton2d_decode_y(tid) + }; + + float_t luma = 0.0f; + float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize; + luma = computeLuma(window, tex, shiftedCoord); + + float_t binSize = (lumaMinMax.y - lumaMinMax.x) / BinCount; + uint32_t binIndex = (uint32_t)((luma - lumaMinMax.x) / binSize); + + sdata.atomicAdd(binIndex, float2Int(luma, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)); + + sdata.workgroupExecutionAndMemoryBarrier(); + + float_t histogram_value; + sdata.get(tid, histogram_value); + + sdata.workgroupExecutionAndMemoryBarrier(); + + float_t sum = inclusive_scan(histogram_value, sdata); + histo.atomicAdd(tid, float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)); + + const bool is_last_wg_invocation = tid == (GroupSize - 1); + const static uint32_t RoundedBinCount = 1 + (BinCount - 1) / GroupSize; + + for (int i = 1; i < RoundedBinCount; i++) { + uint32_t keyBucketStart = GroupSize * i; + uint32_t vid = tid + keyBucketStart; + + // no if statement about the last iteration needed + if (is_last_wg_invocation) { + float_t beforeSum; + sdata.get(keyBucketStart, beforeSum); + sdata.set(keyBucketStart, beforeSum + sum); + } + + // propagate last block tail to next block head and protect against subsequent scans stepping on each other's toes + sdata.workgroupExecutionAndMemoryBarrier(); + + // no aliasing anymore + float_t atVid; + sdata.get(vid, atVid); + sum = inclusive_scan(atVid, sdata); + if (vid < BinCount) { + histo.atomicAdd(vid, float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)); + } + } + } + + float_t gatherLuma( + NBL_REF_ARG(HistogramAccessor) histo, + NBL_REF_ARG(SharedAccessor) sdata + ) { + uint32_t tid = workgroup::SubgroupContiguousIndex(); + + for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) { + sdata.set( + vid, + histo.get(vid & (BinCount - 1)) + ); + } + + sdata.workgroupExecutionAndMemoryBarrier(); + + uint32_t percentile40, percentile60; + sdata.get(BinCount * 0.4, percentile40); + sdata.get(BinCount * 0.6, percentile60); + + return (int2Float(percentile40, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x) + int2Float(percentile60, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)) / 2; + } + + float_t sampleCount; + float_t2 lumaMinMax; +}; + } } } From 83ac633896008509ea16f8d896e4048f98eb888d Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Sun, 16 Mar 2025 11:49:58 +0000 Subject: [PATCH 35/56] Update submodule pointer --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 06dad8c118..498ffd21a0 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 06dad8c118027d6ebc8ee04e19340ba643079a63 +Subproject commit 498ffd21a06b9e9c74d20b37860421d17fe7cf49 From 2b5e502d23c14b8cba96cb8a7ff7a4b6d4d5b4e3 Mon Sep 17 00:00:00 2001 From: Nipun Garg <24457793+nipunG314@users.noreply.github.com> Date: Mon, 17 Mar 2025 16:11:48 +0000 Subject: [PATCH 36/56] Make changes to luma_meter --- .../builtin/hlsl/luma_meter/luma_meter.hlsl | 48 ++++++++----------- .../builtin/hlsl/tonemapper/operators.hlsl | 20 +++++--- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl index c17a64c437..20af804603 100644 --- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl @@ -36,13 +36,13 @@ struct geom_meter { return retval; } - float_t reduction(float_t value, NBL_REF_ARG(SharedAccessor) sdata) + float_t __reduction(float_t value, NBL_REF_ARG(SharedAccessor) sdata) { return workgroup::reduction < plus < float_t >, GroupSize >:: template __call (value, sdata); } - float_t computeLumaLog2( + float_t __computeLumaLog2( NBL_CONST_REF_ARG(MeteringWindow) window, NBL_REF_ARG(TexAccessor) tex, float_t2 shiftedCoord @@ -54,26 +54,26 @@ struct geom_meter { luma = clamp(luma, lumaMinMax.x, lumaMinMax.y); - return max(log2(luma), log2(lumaMinMax.x)); + return log2(luma); } - void uploadFloat( + void __uploadFloat( NBL_REF_ARG(ValueAccessor) val_accessor, - uint32_t index, float_t val, float_t minLog2, float_t rangeLog2 ) { uint32_t3 workGroupCount = glsl::gl_NumWorkGroups(); + uint32_t workgroupIndex = (workGroupCount.x * workGroupCount.y * workGroupCount.z) / 64; uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); uint32_t lumaSumBitPattern = uint32_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1))); - val_accessor.atomicAdd(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern); + val_accessor.atomicAdd(workgroupIndex & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern); } - float_t downloadFloat( + float_t __downloadFloat( NBL_REF_ARG(ValueAccessor) val_accessor, uint32_t index, float_t minLog2, @@ -101,17 +101,13 @@ struct geom_meter { float_t luma = 0.0f; float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize; - luma = computeLumaLog2(window, tex, shiftedCoord); - float_t lumaSum = reduction(luma, sdata); - - if (tid == GroupSize - 1) { - uint32_t3 workgroupCount = glsl::gl_NumWorkGroups(); - uint32_t workgroupIndex = (workgroupCount.x * workgroupCount.y * workgroupCount.z) / 64; + float_t lumaLog2 = __computeLumaLog2(window, tex, shiftedCoord); + float_t lumaLog2Sum = __reduction(lumaLog2, sdata); - uploadFloat( + if (tid == 0) { + __uploadFloat( val, - workgroupIndex, - lumaSum, + lumaLog2Sum, log2(lumaMinMax.x), log2(lumaMinMax.y / lumaMinMax.x) ); @@ -124,7 +120,7 @@ struct geom_meter { { uint32_t tid = glsl::gl_SubgroupInvocationID(); float_t luma = glsl::subgroupAdd( - downloadFloat( + __downloadFloat( val, tid, log2(lumaMinMax.x), @@ -150,19 +146,18 @@ struct median_meter { using float_t3 = typename conditional, float32_t3, float16_t3>::type; using this_t = median_meter; - static this_t create(float_t2 lumaMinMax, float_t sampleCount) { + static this_t create(float_t2 lumaMinMax) { this_t retval; retval.lumaMinMax = lumaMinMax; - retval.sampleCount = sampleCount; return retval; } - int_t inclusive_scan(float_t value, NBL_REF_ARG(SharedAccessor) sdata) { + int_t __inclusive_scan(float_t value, NBL_REF_ARG(SharedAccessor) sdata) { return workgroup::inclusive_scan < plus < int_t >, GroupSize >:: template __call (value, sdata); } - float_t computeLuma( + float_t __computeLuma( NBL_CONST_REF_ARG(MeteringWindow) window, NBL_REF_ARG(TexAccessor) tex, float_t2 shiftedCoord @@ -174,7 +169,7 @@ struct median_meter { return clamp(luma, lumaMinMax.x, lumaMinMax.y); } - int_t float2Int( + int_t __float2Int( float_t val, float_t minLog2, float_t rangeLog2 @@ -185,7 +180,7 @@ struct median_meter { return int_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1))); } - float_t int2Float( + float_t __int2Float( int_t val, float_t minLog2, float_t rangeLog2 @@ -216,7 +211,7 @@ struct median_meter { float_t luma = 0.0f; float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize; - luma = computeLuma(window, tex, shiftedCoord); + luma = __computeLuma(window, tex, shiftedCoord); float_t binSize = (lumaMinMax.y - lumaMinMax.x) / BinCount; uint32_t binIndex = (uint32_t)((luma - lumaMinMax.x) / binSize); @@ -255,7 +250,7 @@ struct median_meter { sdata.get(vid, atVid); sum = inclusive_scan(atVid, sdata); if (vid < BinCount) { - histo.atomicAdd(vid, float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)); + histo.atomicAdd(vid, __float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)); } } } @@ -279,10 +274,9 @@ struct median_meter { sdata.get(BinCount * 0.4, percentile40); sdata.get(BinCount * 0.6, percentile60); - return (int2Float(percentile40, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x) + int2Float(percentile60, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)) / 2; + return (__int2Float(percentile40, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x) + __int2Float(percentile60, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)) / 2; } - float_t sampleCount; float_t2 lumaMinMax; }; diff --git a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl index 824e31d68a..46d241c76c 100644 --- a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl +++ b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl @@ -19,20 +19,25 @@ template struct Reinhard { using float_t = enable_if_t::value, T>; - using float_t3 = typename conditional, float32_t3, float16_t3>::type; + using float_t3 = vector; using this_t = Reinhard; + static this_t create(float_t EV, float_t key = 0.18f, float_t WhitePointRelToEV = 16.f) { this_t retval; + + const float_t unit = 1.0; retval.keyAndManualLinearExposure = key * exp2(EV); - retval.rcpWhite2 = 1.f / (WhitePointRelToEV * WhitePointRelToEV); + retval.rcpWhite2 = unit / (WhitePointRelToEV * WhitePointRelToEV); + return retval; } float_t3 operator()(float_t3 rawCIEXYZcolor) { + const float_t unit = 1.0; float_t exposureFactors = keyAndManualLinearExposure; float_t exposedLuma = rawCIEXYZcolor.y * exposureFactors; - float_t colorMultiplier = (exposureFactors * (1.0 + exposedLuma * rcpWhite2) / (1.0 + exposedLuma)); + float_t colorMultiplier = (exposureFactors * (unit + exposedLuma * rcpWhite2) / (unit + exposedLuma)); return rawCIEXYZcolor * colorMultiplier; } @@ -44,8 +49,8 @@ template struct ACES { using float_t = enable_if_t::value, T>; - using float_t3 = typename conditional, float32_t3, float16_t3>::type; - using float_t3x3 = typename conditional, float32_t3x3, float16_t3x3>::type; + using float_t3 = vector; + using float_t3x3 = matrix; using this_t = ACES; static this_t create(float_t EV, float_t key = 0.18f, float_t Contrast = 1.f) { @@ -57,9 +62,10 @@ struct ACES } float_t3 operator()(float_t3 rawCIEXYZcolor) { + const float_t unit = 1.0; float_t3 tonemapped = rawCIEXYZcolor; - if (tonemapped.y > 1.175494351e-38) - tonemapped *= exp2(log2(tonemapped.y) * (gamma - 1.0) + (exposure) * gamma); + if (tonemapped.y > bit_cast(numeric_limits::min)) + tonemapped *= exp2(log2(tonemapped.y) * (gamma - unit) + (exposure) * gamma); // XYZ => RRT_SAT // this seems to be a matrix for some hybrid colorspace, coefficients are somewhere inbetween BT2020 and ACEScc(t) From c1524a9ecd0ddf480f4bbee3df1988c20ee54324 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 12 Jan 2026 10:54:14 +0700 Subject: [PATCH 37/56] refactor morton usage --- include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl index 20af804603..1bca324d13 100644 --- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl @@ -94,10 +94,7 @@ struct geom_meter { ) { uint32_t tid = workgroup::SubgroupContiguousIndex(); - uint32_t2 coord = { - morton2d_decode_x(tid), - morton2d_decode_y(tid) - }; + uint32_t2 coord = math::Morton::decode2d(tid); float_t luma = 0.0f; float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize; @@ -204,10 +201,7 @@ struct median_meter { sdata.workgroupExecutionAndMemoryBarrier(); - uint32_t2 coord = { - morton2d_decode_x(tid), - morton2d_decode_y(tid) - }; + uint32_t2 coord = math::Morton::decode2d(tid); float_t luma = 0.0f; float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize; From d8a2b81c9c830db5175d77180980f93aeb460541 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 12 Jan 2026 12:08:21 +0700 Subject: [PATCH 38/56] split out luma_meter and tonemapper operators into their own separate files --- .../builtin/hlsl/luma_meter/geom_mean.hlsl | 143 ++++++++++++++++++ .../{luma_meter.hlsl => histogram.hlsl} | 138 ++--------------- .../{operators.hlsl => operators/aces.hlsl} | 41 +---- .../hlsl/tonemapper/operators/reinhard.hlsl | 54 +++++++ src/nbl/builtin/CMakeLists.txt | 6 +- 5 files changed, 224 insertions(+), 158 deletions(-) create mode 100644 include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl rename include/nbl/builtin/hlsl/luma_meter/{luma_meter.hlsl => histogram.hlsl} (59%) rename include/nbl/builtin/hlsl/tonemapper/{operators.hlsl => operators/aces.hlsl} (71%) create mode 100644 include/nbl/builtin/hlsl/tonemapper/operators/reinhard.hlsl diff --git a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl new file mode 100644 index 0000000000..7c85f786fa --- /dev/null +++ b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl @@ -0,0 +1,143 @@ +// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_ +#define _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_ + +#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" +#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" +#include "nbl/builtin/hlsl/glsl_compat/subgroup_arithmetic.hlsl" +#include "nbl/builtin/hlsl/workgroup/basic.hlsl" +#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" +#include "nbl/builtin/hlsl/type_traits.hlsl" +#include "nbl/builtin/hlsl/math/morton.hlsl" +#include "nbl/builtin/hlsl/luma_meter/common.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace luma_meter +{ + +template +struct geom_meter +{ + using float_t = typename SharedAccessor::type; + using float_t2 = typename conditional, float32_t2, float16_t2>::type; + using float_t3 = typename conditional, float32_t3, float16_t3>::type; + using this_t = geom_meter; + + static this_t create(float_t2 lumaMinMax, float_t sampleCount) + { + this_t retval; + retval.lumaMinMax = lumaMinMax; + retval.sampleCount = sampleCount; + return retval; + } + + float_t __reduction(float_t value, NBL_REF_ARG(SharedAccessor) sdata) + { + return workgroup::reduction < plus < float_t >, GroupSize >:: + template __call (value, sdata); + } + + float_t __computeLumaLog2( + NBL_CONST_REF_ARG(MeteringWindow) window, + NBL_REF_ARG(TexAccessor) tex, + float_t2 shiftedCoord + ) + { + float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset; + float_t3 color = tex.get(uvPos); + float_t luma = (float_t)TexAccessor::toXYZ(color); + + luma = clamp(luma, lumaMinMax.x, lumaMinMax.y); + + return log2(luma); + } + + void __uploadFloat( + NBL_REF_ARG(ValueAccessor) val_accessor, + float_t val, + float_t minLog2, + float_t rangeLog2 + ) + { + uint32_t3 workGroupCount = glsl::gl_NumWorkGroups(); + uint32_t workgroupIndex = (workGroupCount.x * workGroupCount.y * workGroupCount.z) / 64; + uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); + + uint32_t lumaSumBitPattern = uint32_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1))); + + val_accessor.atomicAdd(workgroupIndex & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern); + } + + float_t __downloadFloat( + NBL_REF_ARG(ValueAccessor) val_accessor, + uint32_t index, + float_t minLog2, + float_t rangeLog2 + ) + { + float_t luma = (float_t)val_accessor.get(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1)); + return luma / rangeLog2 + minLog2; + } + + void sampleLuma( + NBL_CONST_REF_ARG(MeteringWindow) window, + NBL_REF_ARG(ValueAccessor) val, + NBL_REF_ARG(TexAccessor) tex, + NBL_REF_ARG(SharedAccessor) sdata, + float_t2 tileOffset, + float_t2 viewportSize + ) + { + uint32_t tid = workgroup::SubgroupContiguousIndex(); + uint32_t2 coord = math::Morton::decode2d(tid); + + float_t luma = 0.0f; + float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize; + float_t lumaLog2 = __computeLumaLog2(window, tex, shiftedCoord); + float_t lumaLog2Sum = __reduction(lumaLog2, sdata); + + if (tid == 0) { + __uploadFloat( + val, + lumaLog2Sum, + log2(lumaMinMax.x), + log2(lumaMinMax.y / lumaMinMax.x) + ); + } + } + + float_t gatherLuma( + NBL_REF_ARG(ValueAccessor) val + ) + { + uint32_t tid = glsl::gl_SubgroupInvocationID(); + float_t luma = glsl::subgroupAdd( + __downloadFloat( + val, + tid, + log2(lumaMinMax.x), + log2(lumaMinMax.y / lumaMinMax.x) + ) + ); + + uint32_t3 workGroupCount = glsl::gl_NumWorkGroups(); + uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); + + return (luma / (1 << fixedPointBitsLeft)) / sampleCount; + } + + float_t sampleCount; + float_t2 lumaMinMax; +}; + +} +} +} + +#endif \ No newline at end of file diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl similarity index 59% rename from include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl rename to include/nbl/builtin/hlsl/luma_meter/histogram.hlsl index 1bca324d13..51c27c8e9e 100644 --- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl @@ -21,135 +21,24 @@ namespace hlsl namespace luma_meter { -template -struct geom_meter { - using float_t = typename SharedAccessor::type; - using float_t2 = typename conditional, float32_t2, float16_t2>::type; - using float_t3 = typename conditional, float32_t3, float16_t3>::type; - using this_t = geom_meter; - - static this_t create(float_t2 lumaMinMax, float_t sampleCount) - { - this_t retval; - retval.lumaMinMax = lumaMinMax; - retval.sampleCount = sampleCount; - return retval; - } - - float_t __reduction(float_t value, NBL_REF_ARG(SharedAccessor) sdata) - { - return workgroup::reduction < plus < float_t >, GroupSize >:: - template __call (value, sdata); - } - - float_t __computeLumaLog2( - NBL_CONST_REF_ARG(MeteringWindow) window, - NBL_REF_ARG(TexAccessor) tex, - float_t2 shiftedCoord - ) - { - float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset; - float_t3 color = tex.get(uvPos); - float_t luma = (float_t)TexAccessor::toXYZ(color); - - luma = clamp(luma, lumaMinMax.x, lumaMinMax.y); - - return log2(luma); - } - - void __uploadFloat( - NBL_REF_ARG(ValueAccessor) val_accessor, - float_t val, - float_t minLog2, - float_t rangeLog2 - ) - { - uint32_t3 workGroupCount = glsl::gl_NumWorkGroups(); - uint32_t workgroupIndex = (workGroupCount.x * workGroupCount.y * workGroupCount.z) / 64; - uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); - - uint32_t lumaSumBitPattern = uint32_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1))); - - val_accessor.atomicAdd(workgroupIndex & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern); - } - - float_t __downloadFloat( - NBL_REF_ARG(ValueAccessor) val_accessor, - uint32_t index, - float_t minLog2, - float_t rangeLog2 - ) - { - float_t luma = (float_t)val_accessor.get(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1)); - return luma / rangeLog2 + minLog2; - } - - void sampleLuma( - NBL_CONST_REF_ARG(MeteringWindow) window, - NBL_REF_ARG(ValueAccessor) val, - NBL_REF_ARG(TexAccessor) tex, - NBL_REF_ARG(SharedAccessor) sdata, - float_t2 tileOffset, - float_t2 viewportSize - ) - { - uint32_t tid = workgroup::SubgroupContiguousIndex(); - uint32_t2 coord = math::Morton::decode2d(tid); - - float_t luma = 0.0f; - float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize; - float_t lumaLog2 = __computeLumaLog2(window, tex, shiftedCoord); - float_t lumaLog2Sum = __reduction(lumaLog2, sdata); - - if (tid == 0) { - __uploadFloat( - val, - lumaLog2Sum, - log2(lumaMinMax.x), - log2(lumaMinMax.y / lumaMinMax.x) - ); - } - } - - float_t gatherLuma( - NBL_REF_ARG(ValueAccessor) val - ) - { - uint32_t tid = glsl::gl_SubgroupInvocationID(); - float_t luma = glsl::subgroupAdd( - __downloadFloat( - val, - tid, - log2(lumaMinMax.x), - log2(lumaMinMax.y / lumaMinMax.x) - ) - ); - - uint32_t3 workGroupCount = glsl::gl_NumWorkGroups(); - uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); - - return (luma / (1 << fixedPointBitsLeft)) / sampleCount; - } - - float_t sampleCount; - float_t2 lumaMinMax; -}; - template -struct median_meter { +struct median_meter +{ using int_t = typename SharedAccessor::type; using float_t = float32_t; using float_t2 = typename conditional, float32_t2, float16_t2>::type; using float_t3 = typename conditional, float32_t3, float16_t3>::type; using this_t = median_meter; - static this_t create(float_t2 lumaMinMax) { + static this_t create(float_t2 lumaMinMax) + { this_t retval; retval.lumaMinMax = lumaMinMax; return retval; } - int_t __inclusive_scan(float_t value, NBL_REF_ARG(SharedAccessor) sdata) { + int_t __inclusive_scan(float_t value, NBL_REF_ARG(SharedAccessor) sdata) + { return workgroup::inclusive_scan < plus < int_t >, GroupSize >:: template __call (value, sdata); } @@ -158,7 +47,8 @@ struct median_meter { NBL_CONST_REF_ARG(MeteringWindow) window, NBL_REF_ARG(TexAccessor) tex, float_t2 shiftedCoord - ) { + ) + { float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset; float_t3 color = tex.get(uvPos); float_t luma = (float_t)TexAccessor::toXYZ(color); @@ -170,7 +60,8 @@ struct median_meter { float_t val, float_t minLog2, float_t rangeLog2 - ) { + ) + { uint32_t3 workGroupCount = glsl::gl_NumWorkGroups(); uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); @@ -181,7 +72,8 @@ struct median_meter { int_t val, float_t minLog2, float_t rangeLog2 - ) { + ) + { return val / rangeLog2 + minLog2; } @@ -192,7 +84,8 @@ struct median_meter { NBL_REF_ARG(SharedAccessor) sdata, float_t2 tileOffset, float_t2 viewportSize - ) { + ) + { uint32_t tid = workgroup::SubgroupContiguousIndex(); for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) { @@ -252,7 +145,8 @@ struct median_meter { float_t gatherLuma( NBL_REF_ARG(HistogramAccessor) histo, NBL_REF_ARG(SharedAccessor) sdata - ) { + ) + { uint32_t tid = workgroup::SubgroupContiguousIndex(); for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) { diff --git a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators/aces.hlsl similarity index 71% rename from include/nbl/builtin/hlsl/tonemapper/operators.hlsl rename to include/nbl/builtin/hlsl/tonemapper/operators/aces.hlsl index 46d241c76c..b2e0e4b053 100644 --- a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl +++ b/include/nbl/builtin/hlsl/tonemapper/operators/aces.hlsl @@ -1,4 +1,4 @@ -// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O. +// Copyright (C) 2018-2026 - DevSH Graphics Programming Sp. z O.O. // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h @@ -7,6 +7,7 @@ #include "nbl/builtin/hlsl/cpp_compat.hlsl" #include "nbl/builtin/hlsl/type_traits.hlsl" +#include "nbl/builtin/hlsl/concepts/core.hlsl" namespace nbl { @@ -15,37 +16,7 @@ namespace hlsl namespace tonemapper { -template -struct Reinhard -{ - using float_t = enable_if_t::value, T>; - using float_t3 = vector; - using this_t = Reinhard; - - static this_t create(float_t EV, float_t key = 0.18f, float_t WhitePointRelToEV = 16.f) - { - this_t retval; - - const float_t unit = 1.0; - retval.keyAndManualLinearExposure = key * exp2(EV); - retval.rcpWhite2 = unit / (WhitePointRelToEV * WhitePointRelToEV); - - return retval; - } - - float_t3 operator()(float_t3 rawCIEXYZcolor) { - const float_t unit = 1.0; - float_t exposureFactors = keyAndManualLinearExposure; - float_t exposedLuma = rawCIEXYZcolor.y * exposureFactors; - float_t colorMultiplier = (exposureFactors * (unit + exposedLuma * rcpWhite2) / (unit + exposedLuma)); - return rawCIEXYZcolor * colorMultiplier; - } - - float_t keyAndManualLinearExposure; - float_t rcpWhite2; -}; - -template +template) struct ACES { using float_t = enable_if_t::value, T>; @@ -53,7 +24,8 @@ struct ACES using float_t3x3 = matrix; using this_t = ACES; - static this_t create(float_t EV, float_t key = 0.18f, float_t Contrast = 1.f) { + static this_t create(float_t EV, float_t key = 0.18f, float_t Contrast = 1.f) + { this_t retval; retval.gamma = Contrast; const float_t reinhardMatchCorrection = 0.77321666f; // middle grays get exposed to different values between tonemappers given the same key @@ -61,7 +33,8 @@ struct ACES return retval; } - float_t3 operator()(float_t3 rawCIEXYZcolor) { + float_t3 operator()(float_t3 rawCIEXYZcolor) + { const float_t unit = 1.0; float_t3 tonemapped = rawCIEXYZcolor; if (tonemapped.y > bit_cast(numeric_limits::min)) diff --git a/include/nbl/builtin/hlsl/tonemapper/operators/reinhard.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators/reinhard.hlsl new file mode 100644 index 0000000000..de73959f86 --- /dev/null +++ b/include/nbl/builtin/hlsl/tonemapper/operators/reinhard.hlsl @@ -0,0 +1,54 @@ +// Copyright (C) 2018-2026 - DevSH Graphics Programming Sp. z O.O. +// This file is part of the "Nabla Engine". +// For conditions of distribution and use, see copyright notice in nabla.h + +#ifndef _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_INCLUDED_ +#define _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_INCLUDED_ + +#include "nbl/builtin/hlsl/cpp_compat.hlsl" +#include "nbl/builtin/hlsl/type_traits.hlsl" +#include "nbl/builtin/hlsl/concepts/core.hlsl" + +namespace nbl +{ +namespace hlsl +{ +namespace tonemapper +{ + +template) +struct Reinhard +{ + using float_t = enable_if_t::value, T>; + using float_t3 = vector; + using this_t = Reinhard; + + static this_t create(float_t EV, float_t key = 0.18f, float_t WhitePointRelToEV = 16.f) + { + this_t retval; + + const float_t unit = 1.0; + retval.keyAndManualLinearExposure = key * exp2(EV); + retval.rcpWhite2 = unit / (WhitePointRelToEV * WhitePointRelToEV); + + return retval; + } + + float_t3 operator()(float_t3 rawCIEXYZcolor) + { + const float_t unit = 1.0; + float_t exposureFactors = keyAndManualLinearExposure; + float_t exposedLuma = rawCIEXYZcolor.y * exposureFactors; + float_t colorMultiplier = (exposureFactors * (unit + exposedLuma * rcpWhite2) / (unit + exposedLuma)); + return rawCIEXYZcolor * colorMultiplier; + } + + float_t keyAndManualLinearExposure; + float_t rcpWhite2; +}; + +} +} +} + +#endif \ No newline at end of file diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt index d5d293a564..00c5f021d5 100644 --- a/src/nbl/builtin/CMakeLists.txt +++ b/src/nbl/builtin/CMakeLists.txt @@ -29,9 +29,11 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/struct_declare.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/bda_accessor.hlsl") # luma metering LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/common.hlsl") -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/luma_meter.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/geom_mean.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/histogram.hlsl") # tonemapper -LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tonemapper/operators.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tonemapper/operators/reinhard.hlsl") +LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tonemapper/operators/aces.hlsl") LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/legacy_bda_accessor.hlsl") # bump mapping LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bump_mapping/fragment.glsl") # TODO: rename to `frag.glsl` From 918c6a4e04d492019e797080e88b4a3cfc910886 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 12 Jan 2026 14:49:34 +0700 Subject: [PATCH 39/56] remove obsolete morton code --- include/nbl/builtin/hlsl/math/morton.hlsl | 140 ---------------------- 1 file changed, 140 deletions(-) diff --git a/include/nbl/builtin/hlsl/math/morton.hlsl b/include/nbl/builtin/hlsl/math/morton.hlsl index 203eca80b3..7af5aadb8b 100644 --- a/include/nbl/builtin/hlsl/math/morton.hlsl +++ b/include/nbl/builtin/hlsl/math/morton.hlsl @@ -13,146 +13,6 @@ namespace hlsl namespace math { -// TODO: this is is the old stuff before merging morton pr (I think), I don't know if it's been replaced -namespace impl -{ - -template -NBL_CONSTEXPR_FUNC T morton2d_mask(uint16_t _n) -{ - const static uint64_t mask[5] = - { - 0x5555555555555555ull, - 0x3333333333333333ull, - 0x0F0F0F0F0F0F0F0Full, - 0x00FF00FF00FF00FFull, - 0x0000FFFF0000FFFFull - }; - return nbl::hlsl::_static_cast(mask[_n]); -} - -template -NBL_CONSTEXPR_FUNC T morton3d_mask(uint16_t _n) -{ - const static uint64_t mask[5] = - { - 0x1249249249249249ull, - 0x10C30C30C30C30C3ull, - 0x010F00F00F00F00Full, - 0x001F0000FF0000FFull, - 0x001F00000000FFFFull - }; - return nbl::hlsl::_static_cast(mask[_n]); -} -template -NBL_CONSTEXPR_FUNC T morton4d_mask(uint16_t _n) -{ - const static uint64_t mask[4] = - { - 0x1111111111111111ull, - 0x0303030303030303ull, - 0x000F000F000F000Full, - 0x000000FF000000FFull - }; - return nbl::hlsl::_static_cast(mask[_n]); -} - -template -inline T morton2d_decode(T x) -{ - x = x & morton2d_mask(0); - x = (x | (x >> 1)) & morton2d_mask(1); - x = (x | (x >> 2)) & morton2d_mask(2); - if (bitDepth > 8u) - { - x = (x | (x >> 4)) & morton2d_mask(3); - } - if (bitDepth > 16u) - { - x = (x | (x >> 8)) & morton2d_mask(4); - } - if (bitDepth > 32u) - { - x = (x | (x >> 16)); - } - return x; -} - -//! Puts bits on even positions filling gaps with 0s -template -inline T separate_bits_2d(T x) -{ - if (bitDepth > 32u) - { - x = (x | (x << 16)) & morton2d_mask(4); - } - if (bitDepth > 16u) - { - x = (x | (x << 8)) & morton2d_mask(3); - } - if (bitDepth > 8u) - { - x = (x | (x << 4)) & morton2d_mask(2); - } - x = (x | (x << 2)) & morton2d_mask(1); - x = (x | (x << 1)) & morton2d_mask(0); - - return x; -} -template -inline T separate_bits_3d(T x) -{ - if (bitDepth > 32u) - { - x = (x | (x << 32)) & morton3d_mask(4); - } - if (bitDepth > 16u) - { - x = (x | (x << 16)) & morton3d_mask(3); - } - if (bitDepth > 8u) - { - x = (x | (x << 8)) & morton3d_mask(2); - } - x = (x | (x << 4)) & morton3d_mask(1); - x = (x | (x << 2)) & morton3d_mask(0); - - return x; -} -template -inline T separate_bits_4d(T x) -{ - if (bitDepth > 32u) - { - x = (x | (x << 24)) & morton4d_mask(3); - } - if (bitDepth > 16u) - { - x = (x | (x << 12)) & morton4d_mask(2); - } - if (bitDepth > 8u) - { - x = (x | (x << 6)) & morton4d_mask(1); - } - x = (x | (x << 3)) & morton4d_mask(0); - - return x; -} -} - -template -T morton2d_decode_x(T _morton) { return impl::morton2d_decode(_morton); } -template -T morton2d_decode_y(T _morton) { return impl::morton2d_decode(_morton >> 1); } - -template -T morton2d_encode(T x, T y) { return impl::separate_bits_2d(x) | (impl::separate_bits_2d(y) << 1); } -template -T morton3d_encode(T x, T y, T z) { return impl::separate_bits_3d(x) | (impl::separate_bits_3d(y) << 1) | (impl::separate_bits_3d(z) << 2); } -template -T morton4d_encode(T x, T y, T z, T w) { return impl::separate_bits_4d(x) | (impl::separate_bits_4d(y) << 1) | (impl::separate_bits_4d(z) << 2) | (impl::separate_bits_4d(w) << 3); } -// TODO: end of old stuff - namespace impl { From 3de93db0beea29318486082669da61e2794b6170 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 30 Jan 2026 15:09:27 +0700 Subject: [PATCH 40/56] use new morton class --- include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl | 6 ++++-- include/nbl/builtin/hlsl/luma_meter/histogram.hlsl | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl index 7c85f786fa..971017993c 100644 --- a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl @@ -11,7 +11,7 @@ #include "nbl/builtin/hlsl/workgroup/basic.hlsl" #include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" #include "nbl/builtin/hlsl/type_traits.hlsl" -#include "nbl/builtin/hlsl/math/morton.hlsl" +#include "nbl/builtin/hlsl/morton.hlsl" #include "nbl/builtin/hlsl/luma_meter/common.hlsl" namespace nbl @@ -95,7 +95,9 @@ struct geom_meter ) { uint32_t tid = workgroup::SubgroupContiguousIndex(); - uint32_t2 coord = math::Morton::decode2d(tid); + morton::code mc; + mc.value = tid; + uint32_t2 coord = _static_cast(mc); float_t luma = 0.0f; float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize; diff --git a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl index 51c27c8e9e..71a9ca2e3b 100644 --- a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl @@ -11,7 +11,7 @@ #include "nbl/builtin/hlsl/workgroup/basic.hlsl" #include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" #include "nbl/builtin/hlsl/type_traits.hlsl" -#include "nbl/builtin/hlsl/math/morton.hlsl" +#include "nbl/builtin/hlsl/morton.hlsl" #include "nbl/builtin/hlsl/luma_meter/common.hlsl" namespace nbl @@ -94,7 +94,9 @@ struct median_meter sdata.workgroupExecutionAndMemoryBarrier(); - uint32_t2 coord = math::Morton::decode2d(tid); + morton::code mc; + mc.value = tid; + uint32_t2 coord = _static_cast(mc); float_t luma = 0.0f; float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize; From 87fca818209beb0621b89b82503ee39a26be0443 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 30 Jan 2026 17:14:10 +0700 Subject: [PATCH 41/56] removed optimizations that are broken, make it work like glsl version --- .../builtin/hlsl/luma_meter/geom_mean.hlsl | 145 ++++++++++++++++-- .../hlsl/tonemapper/operators/reinhard.hlsl | 2 +- 2 files changed, 136 insertions(+), 11 deletions(-) diff --git a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl index 971017993c..ab0c27c340 100644 --- a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl @@ -69,9 +69,11 @@ struct geom_meter uint32_t workgroupIndex = (workGroupCount.x * workGroupCount.y * workGroupCount.z) / 64; uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); - uint32_t lumaSumBitPattern = uint32_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1))); + val /= 32.0 * 32.0; + // uint32_t lumaSumBitPattern = uint32_t(((val - minLog2) / rangeLog2) * 4096.0 + 0.5); // 32*32 subgroups + uint32_t lumaSumBitPattern = uint32_t(val * 4096.0 + 0.5); // 32*32 subgroups - val_accessor.atomicAdd(workgroupIndex & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern); + val_accessor.atomicAdd(0u, lumaSumBitPattern); } float_t __downloadFloat( @@ -81,8 +83,8 @@ struct geom_meter float_t rangeLog2 ) { - float_t luma = (float_t)val_accessor.get(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1)); - return luma / rangeLog2 + minLog2; + float_t luma = (float_t)val_accessor.get(0u); + return (luma / float_t(4096 * 60 * 34)) * rangeLog2 + minLog2; } void sampleLuma( @@ -100,8 +102,10 @@ struct geom_meter uint32_t2 coord = _static_cast(mc); float_t luma = 0.0f; - float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize; + // float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize; + float_t2 shiftedCoord = float_t2(glsl::gl_GlobalInvocationID().xy) / viewportSize; float_t lumaLog2 = __computeLumaLog2(window, tex, shiftedCoord); + lumaLog2 = (lumaLog2 - log2(lumaMinMax.x)) / log2(lumaMinMax.y / lumaMinMax.x); float_t lumaLog2Sum = __reduction(lumaLog2, sdata); if (tid == 0) { @@ -119,25 +123,146 @@ struct geom_meter ) { uint32_t tid = glsl::gl_SubgroupInvocationID(); - float_t luma = glsl::subgroupAdd( - __downloadFloat( + // float_t luma = glsl::subgroupAdd( + // __downloadFloat( + // val, + // tid, + // log2(lumaMinMax.x), + // log2(lumaMinMax.y / lumaMinMax.x) + // ) + // ); + float_t luma = __downloadFloat( val, tid, log2(lumaMinMax.x), log2(lumaMinMax.y / lumaMinMax.x) - ) - ); + ); uint32_t3 workGroupCount = glsl::gl_NumWorkGroups(); uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); - return (luma / (1 << fixedPointBitsLeft)) / sampleCount; + return luma;// / sampleCount; } float_t sampleCount; float_t2 lumaMinMax; }; +// template +// struct geom_meter +// { +// using float_t = typename SharedAccessor::type; +// using float_t2 = typename conditional, float32_t2, float16_t2>::type; +// using float_t3 = typename conditional, float32_t3, float16_t3>::type; +// using this_t = geom_meter; + +// static this_t create(float_t2 lumaMinMax, float_t sampleCount) +// { +// this_t retval; +// retval.lumaMinMax = lumaMinMax; +// retval.sampleCount = sampleCount; +// return retval; +// } + +// float_t __reduction(float_t value, NBL_REF_ARG(SharedAccessor) sdata) +// { +// return workgroup::reduction < plus < float_t >, GroupSize >:: +// template __call (value, sdata); +// } + +// float_t __computeLumaLog2( +// NBL_CONST_REF_ARG(MeteringWindow) window, +// NBL_REF_ARG(TexAccessor) tex, +// float_t2 shiftedCoord +// ) +// { +// float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset; +// float_t3 color = tex.get(uvPos); +// float_t luma = (float_t)TexAccessor::toXYZ(color); + +// luma = clamp(luma, lumaMinMax.x, lumaMinMax.y); + +// return log2(luma); +// } + +// void __uploadFloat( +// NBL_REF_ARG(ValueAccessor) val_accessor, +// float_t val, +// float_t minLog2, +// float_t rangeLog2 +// ) +// { +// uint32_t3 workGroupCount = glsl::gl_NumWorkGroups(); +// uint32_t workgroupIndex = (workGroupCount.x * workGroupCount.y * workGroupCount.z) / 64; +// uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); + +// uint32_t lumaSumBitPattern = uint32_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1))); + +// val_accessor.atomicAdd(workgroupIndex & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern); +// } + +// float_t __downloadFloat( +// NBL_REF_ARG(ValueAccessor) val_accessor, +// uint32_t index, +// float_t minLog2, +// float_t rangeLog2 +// ) +// { +// float_t luma = (float_t)val_accessor.get(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1)); +// return luma / rangeLog2 + minLog2; +// } + +// void sampleLuma( +// NBL_CONST_REF_ARG(MeteringWindow) window, +// NBL_REF_ARG(ValueAccessor) val, +// NBL_REF_ARG(TexAccessor) tex, +// NBL_REF_ARG(SharedAccessor) sdata, +// float_t2 tileOffset, +// float_t2 viewportSize +// ) +// { +// uint32_t tid = workgroup::SubgroupContiguousIndex(); +// uint32_t2 coord = math::Morton::decode2d(tid); + +// float_t luma = 0.0f; +// float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize; +// float_t lumaLog2 = __computeLumaLog2(window, tex, shiftedCoord); +// float_t lumaLog2Sum = __reduction(lumaLog2, sdata); + +// if (tid == 0) { +// __uploadFloat( +// val, +// lumaLog2Sum, +// log2(lumaMinMax.x), +// log2(lumaMinMax.y / lumaMinMax.x) +// ); +// } +// } + +// float_t gatherLuma( +// NBL_REF_ARG(ValueAccessor) val +// ) +// { +// uint32_t tid = glsl::gl_SubgroupInvocationID(); +// float_t luma = glsl::subgroupAdd( +// __downloadFloat( +// val, +// tid, +// log2(lumaMinMax.x), +// log2(lumaMinMax.y / lumaMinMax.x) +// ) +// ); + +// uint32_t3 workGroupCount = glsl::gl_NumWorkGroups(); +// uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); + +// return (luma / (1 << fixedPointBitsLeft)) / sampleCount; +// } + +// float_t sampleCount; +// float_t2 lumaMinMax; +// }; + } } } diff --git a/include/nbl/builtin/hlsl/tonemapper/operators/reinhard.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators/reinhard.hlsl index de73959f86..da48fbf66d 100644 --- a/include/nbl/builtin/hlsl/tonemapper/operators/reinhard.hlsl +++ b/include/nbl/builtin/hlsl/tonemapper/operators/reinhard.hlsl @@ -28,7 +28,7 @@ struct Reinhard this_t retval; const float_t unit = 1.0; - retval.keyAndManualLinearExposure = key * exp2(EV); + retval.keyAndManualLinearExposure = key * exp2(-EV); retval.rcpWhite2 = unit / (WhitePointRelToEV * WhitePointRelToEV); return retval; From af209328ffb0020a6ab0a968aecd52583875fb6c Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 2 Feb 2026 14:16:00 +0700 Subject: [PATCH 42/56] fix minor bugs so it compiles --- include/nbl/builtin/hlsl/luma_meter/histogram.hlsl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl index 71a9ca2e3b..52b711a923 100644 --- a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl @@ -105,7 +105,7 @@ struct median_meter float_t binSize = (lumaMinMax.y - lumaMinMax.x) / BinCount; uint32_t binIndex = (uint32_t)((luma - lumaMinMax.x) / binSize); - sdata.atomicAdd(binIndex, float2Int(luma, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)); + sdata.atomicAdd(binIndex, __float2Int(luma, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)); sdata.workgroupExecutionAndMemoryBarrier(); @@ -114,8 +114,8 @@ struct median_meter sdata.workgroupExecutionAndMemoryBarrier(); - float_t sum = inclusive_scan(histogram_value, sdata); - histo.atomicAdd(tid, float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)); + float_t sum = __inclusive_scan(histogram_value, sdata); + histo.atomicAdd(tid, __float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)); const bool is_last_wg_invocation = tid == (GroupSize - 1); const static uint32_t RoundedBinCount = 1 + (BinCount - 1) / GroupSize; @@ -137,7 +137,7 @@ struct median_meter // no aliasing anymore float_t atVid; sdata.get(vid, atVid); - sum = inclusive_scan(atVid, sdata); + sum = __inclusive_scan(atVid, sdata); if (vid < BinCount) { histo.atomicAdd(vid, __float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)); } From dc7e751b663391afaf00d2c18d77111fe689bb10 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 3 Feb 2026 14:51:39 +0700 Subject: [PATCH 43/56] histogram autoexposure working --- .../builtin/hlsl/luma_meter/geom_mean.hlsl | 4 +- .../builtin/hlsl/luma_meter/histogram.hlsl | 264 ++++++++++++++---- 2 files changed, 204 insertions(+), 64 deletions(-) diff --git a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl index ab0c27c340..25e67ec35b 100644 --- a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl @@ -101,9 +101,7 @@ struct geom_meter mc.value = tid; uint32_t2 coord = _static_cast(mc); - float_t luma = 0.0f; - // float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize; - float_t2 shiftedCoord = float_t2(glsl::gl_GlobalInvocationID().xy) / viewportSize; + float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize; float_t lumaLog2 = __computeLumaLog2(window, tex, shiftedCoord); lumaLog2 = (lumaLog2 - log2(lumaMinMax.x)) / log2(lumaMinMax.y / lumaMinMax.x); float_t lumaLog2Sum = __reduction(lumaLog2, sdata); diff --git a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl index 52b711a923..58fd085cd2 100644 --- a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl @@ -56,27 +56,6 @@ struct median_meter return clamp(luma, lumaMinMax.x, lumaMinMax.y); } - int_t __float2Int( - float_t val, - float_t minLog2, - float_t rangeLog2 - ) - { - uint32_t3 workGroupCount = glsl::gl_NumWorkGroups(); - uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); - - return int_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1))); - } - - float_t __int2Float( - int_t val, - float_t minLog2, - float_t rangeLog2 - ) - { - return val / rangeLog2 + minLog2; - } - void sampleLuma( NBL_CONST_REF_ARG(MeteringWindow) window, NBL_REF_ARG(HistogramAccessor) histo, @@ -98,50 +77,22 @@ struct median_meter mc.value = tid; uint32_t2 coord = _static_cast(mc); - float_t luma = 0.0f; float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize; - luma = __computeLuma(window, tex, shiftedCoord); - - float_t binSize = (lumaMinMax.y - lumaMinMax.x) / BinCount; - uint32_t binIndex = (uint32_t)((luma - lumaMinMax.x) / binSize); + float_t luma = __computeLuma(window, tex, shiftedCoord); - sdata.atomicAdd(binIndex, __float2Int(luma, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)); + float_t scaledLogLuma = log2(luma / lumaMinMax.x) / log2(lumaMinMax.y / lumaMinMax.x); + uint32_t binIndex = int_t(scaledLogLuma * float_t(BinCount-1u) + 0.5); + sdata.atomicAdd(binIndex, 1u); sdata.workgroupExecutionAndMemoryBarrier(); - float_t histogram_value; + int_t histogram_value; sdata.get(tid, histogram_value); sdata.workgroupExecutionAndMemoryBarrier(); - float_t sum = __inclusive_scan(histogram_value, sdata); - histo.atomicAdd(tid, __float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)); - - const bool is_last_wg_invocation = tid == (GroupSize - 1); - const static uint32_t RoundedBinCount = 1 + (BinCount - 1) / GroupSize; - - for (int i = 1; i < RoundedBinCount; i++) { - uint32_t keyBucketStart = GroupSize * i; - uint32_t vid = tid + keyBucketStart; - - // no if statement about the last iteration needed - if (is_last_wg_invocation) { - float_t beforeSum; - sdata.get(keyBucketStart, beforeSum); - sdata.set(keyBucketStart, beforeSum + sum); - } - - // propagate last block tail to next block head and protect against subsequent scans stepping on each other's toes - sdata.workgroupExecutionAndMemoryBarrier(); - - // no aliasing anymore - float_t atVid; - sdata.get(vid, atVid); - sum = __inclusive_scan(atVid, sdata); - if (vid < BinCount) { - histo.atomicAdd(vid, __float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)); - } - } + int_t sum = __inclusive_scan(histogram_value, sdata); + histo.atomicAdd(tid, sum); } float_t gatherLuma( @@ -154,22 +105,213 @@ struct median_meter for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) { sdata.set( vid, - histo.get(vid & (BinCount - 1)) + histo.get(vid) ); } sdata.workgroupExecutionAndMemoryBarrier(); - uint32_t percentile40, percentile60; - sdata.get(BinCount * 0.4, percentile40); - sdata.get(BinCount * 0.6, percentile60); + // TODO: choose percentile in push constant + int_t lower, upper; + if (tid == 0) + { + uint32_t percentile40 = uint32_t(BinCount * 0.4); + // lower bound + uint32_t lo = 0u; + uint32_t hi = BinCount; + int_t v; + while (lo < hi) + { + uint32_t mid = lo + (hi - lo) / 2; + sdata.get(mid, v); + if (percentile40 <= v) + hi = mid; + else + lo = mid + 1; + } + + lower = lo; + } + if (tid == 1) + { + uint32_t percentile60 = uint32_t(BinCount * 0.6); + // upper bound + uint32_t lo = 0u; + uint32_t hi = BinCount; + int_t v; + while (lo < hi) + { + uint32_t mid = lo + (hi - lo) / 2; + sdata.get(mid, v); + if (percentile60 >= v) + lo = mid + 1; + else + hi = mid; + } + + upper = lo; + } - return (__int2Float(percentile40, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x) + __int2Float(percentile60, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)) / 2; + sdata.workgroupExecutionAndMemoryBarrier(); + + lower = workgroup::Broadcast(lower, sdata, 0); + upper = workgroup::Broadcast(upper, sdata, 1); + + return ((float_t(lower) + float_t(upper)) * 0.5 / float_t(BinCount-1u)) * log2(lumaMinMax.y/lumaMinMax.x) + log2(lumaMinMax.x); } float_t2 lumaMinMax; }; +// template +// struct median_meter +// { +// using int_t = typename SharedAccessor::type; +// using float_t = float32_t; +// using float_t2 = typename conditional, float32_t2, float16_t2>::type; +// using float_t3 = typename conditional, float32_t3, float16_t3>::type; +// using this_t = median_meter; + +// static this_t create(float_t2 lumaMinMax) +// { +// this_t retval; +// retval.lumaMinMax = lumaMinMax; +// return retval; +// } + +// int_t __inclusive_scan(float_t value, NBL_REF_ARG(SharedAccessor) sdata) +// { +// return workgroup::inclusive_scan < plus < int_t >, GroupSize >:: +// template __call (value, sdata); +// } + +// float_t __computeLuma( +// NBL_CONST_REF_ARG(MeteringWindow) window, +// NBL_REF_ARG(TexAccessor) tex, +// float_t2 shiftedCoord +// ) +// { +// float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset; +// float_t3 color = tex.get(uvPos); +// float_t luma = (float_t)TexAccessor::toXYZ(color); + +// return clamp(luma, lumaMinMax.x, lumaMinMax.y); +// } + +// int_t __float2Int( +// float_t val, +// float_t minLog2, +// float_t rangeLog2 +// ) +// { +// uint32_t3 workGroupCount = glsl::gl_NumWorkGroups(); +// uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); + +// return int_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1))); +// } + +// float_t __int2Float( +// int_t val, +// float_t minLog2, +// float_t rangeLog2 +// ) +// { +// return val / rangeLog2 + minLog2; +// } + +// void sampleLuma( +// NBL_CONST_REF_ARG(MeteringWindow) window, +// NBL_REF_ARG(HistogramAccessor) histo, +// NBL_REF_ARG(TexAccessor) tex, +// NBL_REF_ARG(SharedAccessor) sdata, +// float_t2 tileOffset, +// float_t2 viewportSize +// ) +// { +// uint32_t tid = workgroup::SubgroupContiguousIndex(); + +// for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) { +// sdata.set(vid, 0); +// } + +// sdata.workgroupExecutionAndMemoryBarrier(); + +// morton::code mc; +// mc.value = tid; +// uint32_t2 coord = _static_cast(mc); + +// float_t luma = 0.0f; +// float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize; +// luma = __computeLuma(window, tex, shiftedCoord); + +// float_t binSize = (lumaMinMax.y - lumaMinMax.x) / BinCount; +// uint32_t binIndex = (uint32_t)((luma - lumaMinMax.x) / binSize); + +// sdata.atomicAdd(binIndex, __float2Int(luma, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)); + +// sdata.workgroupExecutionAndMemoryBarrier(); + +// float_t histogram_value; +// sdata.get(tid, histogram_value); + +// sdata.workgroupExecutionAndMemoryBarrier(); + +// float_t sum = __inclusive_scan(histogram_value, sdata); +// histo.atomicAdd(tid, __float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)); + +// const bool is_last_wg_invocation = tid == (GroupSize - 1); +// const static uint32_t RoundedBinCount = 1 + (BinCount - 1) / GroupSize; + +// for (int i = 1; i < RoundedBinCount; i++) { +// uint32_t keyBucketStart = GroupSize * i; +// uint32_t vid = tid + keyBucketStart; + +// // no if statement about the last iteration needed +// if (is_last_wg_invocation) { +// float_t beforeSum; +// sdata.get(keyBucketStart, beforeSum); +// sdata.set(keyBucketStart, beforeSum + sum); +// } + +// // propagate last block tail to next block head and protect against subsequent scans stepping on each other's toes +// sdata.workgroupExecutionAndMemoryBarrier(); + +// // no aliasing anymore +// float_t atVid; +// sdata.get(vid, atVid); +// sum = __inclusive_scan(atVid, sdata); +// if (vid < BinCount) { +// histo.atomicAdd(vid, __float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)); +// } +// } +// } + +// float_t gatherLuma( +// NBL_REF_ARG(HistogramAccessor) histo, +// NBL_REF_ARG(SharedAccessor) sdata +// ) +// { +// uint32_t tid = workgroup::SubgroupContiguousIndex(); + +// for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) { +// sdata.set( +// vid, +// histo.get(vid & (BinCount - 1)) +// ); +// } + +// sdata.workgroupExecutionAndMemoryBarrier(); + +// uint32_t percentile40, percentile60; +// sdata.get(BinCount * 0.4, percentile40); +// sdata.get(BinCount * 0.6, percentile60); + +// return (__int2Float(percentile40, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x) + __int2Float(percentile60, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)) / 2; +// } + +// float_t2 lumaMinMax; +// }; + } } } From f57886d20370a46b19d9ec8ae88309748fea537c Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 3 Feb 2026 16:17:06 +0700 Subject: [PATCH 44/56] more values passed in through push constants at create --- .../builtin/hlsl/luma_meter/geom_mean.hlsl | 42 +++++++++---------- .../builtin/hlsl/luma_meter/histogram.hlsl | 31 +++++++------- 2 files changed, 35 insertions(+), 38 deletions(-) diff --git a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl index 25e67ec35b..e6be2e3a60 100644 --- a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl @@ -29,11 +29,15 @@ struct geom_meter using float_t3 = typename conditional, float32_t3, float16_t3>::type; using this_t = geom_meter; - static this_t create(float_t2 lumaMinMax, float_t sampleCount) + NBL_CONSTEXPR_STATIC_INLINE uint32_t MaxWorkgroupIncrement = 0x1000u; + + static this_t create(float_t lumaMin, float_t lumaMax, float_t sampleCount, float_t rcpFirstPassWGCount) { this_t retval; - retval.lumaMinMax = lumaMinMax; + retval.lumaMin = lumaMin; + retval.lumaMax = lumaMax; retval.sampleCount = sampleCount; + retval.rcpFirstPassWGCount = rcpFirstPassWGCount; return retval; } @@ -53,7 +57,7 @@ struct geom_meter float_t3 color = tex.get(uvPos); float_t luma = (float_t)TexAccessor::toXYZ(color); - luma = clamp(luma, lumaMinMax.x, lumaMinMax.y); + luma = clamp(luma, lumaMin, lumaMax); return log2(luma); } @@ -65,15 +69,8 @@ struct geom_meter float_t rangeLog2 ) { - uint32_t3 workGroupCount = glsl::gl_NumWorkGroups(); - uint32_t workgroupIndex = (workGroupCount.x * workGroupCount.y * workGroupCount.z) / 64; - uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); - - val /= 32.0 * 32.0; - // uint32_t lumaSumBitPattern = uint32_t(((val - minLog2) / rangeLog2) * 4096.0 + 0.5); // 32*32 subgroups - uint32_t lumaSumBitPattern = uint32_t(val * 4096.0 + 0.5); // 32*32 subgroups - - val_accessor.atomicAdd(0u, lumaSumBitPattern); + uint32_t lumaVal = uint32_t((val / (32.0 * 32.0)) * float_t(MaxWorkgroupIncrement) + 0.5); // 32*32 subgroups + val_accessor.atomicAdd(0u, lumaVal); } float_t __downloadFloat( @@ -83,8 +80,8 @@ struct geom_meter float_t rangeLog2 ) { - float_t luma = (float_t)val_accessor.get(0u); - return (luma / float_t(4096 * 60 * 34)) * rangeLog2 + minLog2; + float_t luma = float_t(val_accessor.get(0u)); + return luma / float_t(MaxWorkgroupIncrement) * rcpFirstPassWGCount * rangeLog2 + minLog2; } void sampleLuma( @@ -103,15 +100,15 @@ struct geom_meter float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize; float_t lumaLog2 = __computeLumaLog2(window, tex, shiftedCoord); - lumaLog2 = (lumaLog2 - log2(lumaMinMax.x)) / log2(lumaMinMax.y / lumaMinMax.x); + lumaLog2 = (lumaLog2 - log2(lumaMin)) / log2(lumaMax / lumaMin); float_t lumaLog2Sum = __reduction(lumaLog2, sdata); if (tid == 0) { __uploadFloat( val, lumaLog2Sum, - log2(lumaMinMax.x), - log2(lumaMinMax.y / lumaMinMax.x) + log2(lumaMin), + log2(lumaMax / lumaMin) ); } } @@ -132,18 +129,17 @@ struct geom_meter float_t luma = __downloadFloat( val, tid, - log2(lumaMinMax.x), - log2(lumaMinMax.y / lumaMinMax.x) + log2(lumaMin), + log2(lumaMax / lumaMin) ); - uint32_t3 workGroupCount = glsl::gl_NumWorkGroups(); - uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); - return luma;// / sampleCount; } + float_t lumaMin; + float_t lumaMax; float_t sampleCount; - float_t2 lumaMinMax; + float_t rcpFirstPassWGCount; }; // template diff --git a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl index 58fd085cd2..2025f28f8b 100644 --- a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl @@ -30,10 +30,13 @@ struct median_meter using float_t3 = typename conditional, float32_t3, float16_t3>::type; using this_t = median_meter; - static this_t create(float_t2 lumaMinMax) + static this_t create(float_t lumaMin, float_t lumaMax, float_t lowerBoundPercentile, float_t upperBoundPercentile) { this_t retval; - retval.lumaMinMax = lumaMinMax; + retval.lumaMin = lumaMin; + retval.lumaMax = lumaMax; + retval.lowerBoundPercentile = lowerBoundPercentile; + retval.upperBoundPercentile = upperBoundPercentile; return retval; } @@ -53,7 +56,7 @@ struct median_meter float_t3 color = tex.get(uvPos); float_t luma = (float_t)TexAccessor::toXYZ(color); - return clamp(luma, lumaMinMax.x, lumaMinMax.y); + return clamp(luma, lumaMin, lumaMax); } void sampleLuma( @@ -80,7 +83,7 @@ struct median_meter float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize; float_t luma = __computeLuma(window, tex, shiftedCoord); - float_t scaledLogLuma = log2(luma / lumaMinMax.x) / log2(lumaMinMax.y / lumaMinMax.x); + float_t scaledLogLuma = log2(luma / lumaMin) / log2(lumaMax / lumaMin); uint32_t binIndex = int_t(scaledLogLuma * float_t(BinCount-1u) + 0.5); sdata.atomicAdd(binIndex, 1u); @@ -108,15 +111,12 @@ struct median_meter histo.get(vid) ); } - sdata.workgroupExecutionAndMemoryBarrier(); - // TODO: choose percentile in push constant int_t lower, upper; if (tid == 0) { - uint32_t percentile40 = uint32_t(BinCount * 0.4); - // lower bound + const uint32_t lowerPercentile = uint32_t(BinCount * lowerBoundPercentile); uint32_t lo = 0u; uint32_t hi = BinCount; int_t v; @@ -124,7 +124,7 @@ struct median_meter { uint32_t mid = lo + (hi - lo) / 2; sdata.get(mid, v); - if (percentile40 <= v) + if (lowerPercentile <= v) hi = mid; else lo = mid + 1; @@ -134,8 +134,7 @@ struct median_meter } if (tid == 1) { - uint32_t percentile60 = uint32_t(BinCount * 0.6); - // upper bound + const uint32_t upperPercentile = uint32_t(BinCount * upperBoundPercentile); uint32_t lo = 0u; uint32_t hi = BinCount; int_t v; @@ -143,7 +142,7 @@ struct median_meter { uint32_t mid = lo + (hi - lo) / 2; sdata.get(mid, v); - if (percentile60 >= v) + if (upperPercentile >= v) lo = mid + 1; else hi = mid; @@ -151,16 +150,18 @@ struct median_meter upper = lo; } - sdata.workgroupExecutionAndMemoryBarrier(); lower = workgroup::Broadcast(lower, sdata, 0); upper = workgroup::Broadcast(upper, sdata, 1); - return ((float_t(lower) + float_t(upper)) * 0.5 / float_t(BinCount-1u)) * log2(lumaMinMax.y/lumaMinMax.x) + log2(lumaMinMax.x); + return ((float_t(lower) + float_t(upper)) * 0.5 / float_t(BinCount-1u)) * log2(lumaMax/lumaMin) + log2(lumaMin); } - float_t2 lumaMinMax; + float_t lumaMin; + float_t lumaMax; + float_t lowerBoundPercentile; + float_t upperBoundPercentile; }; // template From 8466a9db6ee2d060abccb5710461f3f2d97ecf5a Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 3 Feb 2026 17:01:23 +0700 Subject: [PATCH 45/56] make template names clearer, mean stores to a subgroup size buffer instead of one value --- .../builtin/hlsl/luma_meter/geom_mean.hlsl | 28 ++++++++----------- .../builtin/hlsl/luma_meter/histogram.hlsl | 10 +++---- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl index e6be2e3a60..7ab959fd5d 100644 --- a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl @@ -21,13 +21,13 @@ namespace hlsl namespace luma_meter { -template +template struct geom_meter { using float_t = typename SharedAccessor::type; using float_t2 = typename conditional, float32_t2, float16_t2>::type; using float_t3 = typename conditional, float32_t3, float16_t3>::type; - using this_t = geom_meter; + using this_t = geom_meter; NBL_CONSTEXPR_STATIC_INLINE uint32_t MaxWorkgroupIncrement = 0x1000u; @@ -43,7 +43,7 @@ struct geom_meter float_t __reduction(float_t value, NBL_REF_ARG(SharedAccessor) sdata) { - return workgroup::reduction < plus < float_t >, GroupSize >:: + return workgroup::reduction < plus < float_t >, WorkgroupSize >:: template __call (value, sdata); } @@ -69,8 +69,11 @@ struct geom_meter float_t rangeLog2 ) { - uint32_t lumaVal = uint32_t((val / (32.0 * 32.0)) * float_t(MaxWorkgroupIncrement) + 0.5); // 32*32 subgroups - val_accessor.atomicAdd(0u, lumaVal); + const uint32_t3 workGroupCount = glsl::gl_NumWorkGroups(); + const uint32_t3 workgroupID = glsl::gl_WorkGroupID(); + const uint32_t index = (workgroupID.y * workGroupCount.x + workgroupID.x) & (SubgroupSize - 1u); + uint32_t lumaVal = uint32_t(val / float_t(WorkgroupSize) * float_t(MaxWorkgroupIncrement) + 0.5); + val_accessor.atomicAdd(index, lumaVal); } float_t __downloadFloat( @@ -80,8 +83,9 @@ struct geom_meter float_t rangeLog2 ) { - float_t luma = float_t(val_accessor.get(0u)); - return luma / float_t(MaxWorkgroupIncrement) * rcpFirstPassWGCount * rangeLog2 + minLog2; + uint32_t lumaVal = val_accessor.get(index); + lumaVal = glsl::subgroupAdd(lumaVal); + return float_t(lumaVal) / float_t(MaxWorkgroupIncrement) * rcpFirstPassWGCount * rangeLog2 + minLog2; } void sampleLuma( @@ -118,14 +122,6 @@ struct geom_meter ) { uint32_t tid = glsl::gl_SubgroupInvocationID(); - // float_t luma = glsl::subgroupAdd( - // __downloadFloat( - // val, - // tid, - // log2(lumaMinMax.x), - // log2(lumaMinMax.y / lumaMinMax.x) - // ) - // ); float_t luma = __downloadFloat( val, tid, @@ -133,7 +129,7 @@ struct geom_meter log2(lumaMax / lumaMin) ); - return luma;// / sampleCount; + return luma; } float_t lumaMin; diff --git a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl index 2025f28f8b..58aea923f0 100644 --- a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl @@ -21,14 +21,14 @@ namespace hlsl namespace luma_meter { -template +template struct median_meter { using int_t = typename SharedAccessor::type; using float_t = float32_t; using float_t2 = typename conditional, float32_t2, float16_t2>::type; using float_t3 = typename conditional, float32_t3, float16_t3>::type; - using this_t = median_meter; + using this_t = median_meter; static this_t create(float_t lumaMin, float_t lumaMax, float_t lowerBoundPercentile, float_t upperBoundPercentile) { @@ -42,7 +42,7 @@ struct median_meter int_t __inclusive_scan(float_t value, NBL_REF_ARG(SharedAccessor) sdata) { - return workgroup::inclusive_scan < plus < int_t >, GroupSize >:: + return workgroup::inclusive_scan < plus < int_t >, WorkgroupSize >:: template __call (value, sdata); } @@ -70,7 +70,7 @@ struct median_meter { uint32_t tid = workgroup::SubgroupContiguousIndex(); - for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) { + for (uint32_t vid = tid; vid < BinCount; vid += WorkgroupSize) { sdata.set(vid, 0); } @@ -105,7 +105,7 @@ struct median_meter { uint32_t tid = workgroup::SubgroupContiguousIndex(); - for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) { + for (uint32_t vid = tid; vid < BinCount; vid += WorkgroupSize) { sdata.set( vid, histo.get(vid) From 3a891b7329b5be80d6f7bc68fac912a84ed8fded Mon Sep 17 00:00:00 2001 From: keptsecret Date: Wed, 4 Feb 2026 16:42:28 +0700 Subject: [PATCH 46/56] fixes to aces tonemap --- include/nbl/builtin/hlsl/tonemapper/operators/aces.hlsl | 6 +++--- include/nbl/builtin/hlsl/tonemapper/operators/reinhard.hlsl | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/include/nbl/builtin/hlsl/tonemapper/operators/aces.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators/aces.hlsl index b2e0e4b053..5384c7dc84 100644 --- a/include/nbl/builtin/hlsl/tonemapper/operators/aces.hlsl +++ b/include/nbl/builtin/hlsl/tonemapper/operators/aces.hlsl @@ -2,8 +2,8 @@ // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_INCLUDED_ -#define _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_INCLUDED_ +#ifndef _NBL_BUILTIN_HLSL_TONE_MAPPER_ACES_INCLUDED_ +#define _NBL_BUILTIN_HLSL_TONE_MAPPER_ACES_INCLUDED_ #include "nbl/builtin/hlsl/cpp_compat.hlsl" #include "nbl/builtin/hlsl/type_traits.hlsl" @@ -29,7 +29,7 @@ struct ACES this_t retval; retval.gamma = Contrast; const float_t reinhardMatchCorrection = 0.77321666f; // middle grays get exposed to different values between tonemappers given the same key - retval.exposure = EV + log2(key * reinhardMatchCorrection); + retval.exposure = -EV + log2(key * reinhardMatchCorrection); return retval; } diff --git a/include/nbl/builtin/hlsl/tonemapper/operators/reinhard.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators/reinhard.hlsl index da48fbf66d..b442093b6e 100644 --- a/include/nbl/builtin/hlsl/tonemapper/operators/reinhard.hlsl +++ b/include/nbl/builtin/hlsl/tonemapper/operators/reinhard.hlsl @@ -2,8 +2,8 @@ // This file is part of the "Nabla Engine". // For conditions of distribution and use, see copyright notice in nabla.h -#ifndef _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_INCLUDED_ -#define _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_INCLUDED_ +#ifndef _NBL_BUILTIN_HLSL_TONE_MAPPER_REINHARD_INCLUDED_ +#define _NBL_BUILTIN_HLSL_TONE_MAPPER_REINHARD_INCLUDED_ #include "nbl/builtin/hlsl/cpp_compat.hlsl" #include "nbl/builtin/hlsl/type_traits.hlsl" From c8a8f9ff9b9fc5bfb7e4d058da8bda6c898c74cf Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 5 Feb 2026 17:00:46 +0700 Subject: [PATCH 47/56] use workgroup2 for average metering --- .../builtin/hlsl/luma_meter/geom_mean.hlsl | 157 ++++-------------- .../builtin/hlsl/luma_meter/histogram.hlsl | 149 ----------------- 2 files changed, 34 insertions(+), 272 deletions(-) diff --git a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl index 7ab959fd5d..9ba05e2088 100644 --- a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl @@ -8,8 +8,8 @@ #include "nbl/builtin/hlsl/glsl_compat/core.hlsl" #include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" #include "nbl/builtin/hlsl/glsl_compat/subgroup_arithmetic.hlsl" -#include "nbl/builtin/hlsl/workgroup/basic.hlsl" -#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" +#include "nbl/builtin/hlsl/workgroup2/basic.hlsl" +#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl" #include "nbl/builtin/hlsl/type_traits.hlsl" #include "nbl/builtin/hlsl/morton.hlsl" #include "nbl/builtin/hlsl/luma_meter/common.hlsl" @@ -21,13 +21,34 @@ namespace hlsl namespace luma_meter { -template +namespace impl +{ +template +struct data_proxy +{ + template + void get(const IndexType idx, NBL_REF_ARG(AccessType) value) + { + value = data[idx]; + } + + T data; +}; +} + +template struct geom_meter { using float_t = typename SharedAccessor::type; using float_t2 = typename conditional, float32_t2, float16_t2>::type; using float_t3 = typename conditional, float32_t3, float16_t3>::type; - using this_t = geom_meter; + + using proxy_data_t = vector; + using proxy_t = impl::data_proxy; + + NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkgroupSize = WorkgroupConfig::WorkgroupSize; + NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = WorkgroupConfig::SubgroupSize; + using this_t = geom_meter; NBL_CONSTEXPR_STATIC_INLINE uint32_t MaxWorkgroupIncrement = 0x1000u; @@ -41,10 +62,12 @@ struct geom_meter return retval; } - float_t __reduction(float_t value, NBL_REF_ARG(SharedAccessor) sdata) + float_t __reduction(NBL_REF_ARG(proxy_t) data, NBL_REF_ARG(SharedAccessor) sdata) { - return workgroup::reduction < plus < float_t >, WorkgroupSize >:: - template __call (value, sdata); + // return workgroup::reduction < plus < float_t >, WorkgroupSize >:: + // template __call (value, sdata); + return workgroup2::reduction< WorkgroupConfig, plus, device_capabilities >:: + template __call (data, sdata); } float_t __computeLumaLog2( @@ -105,7 +128,10 @@ struct geom_meter float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize; float_t lumaLog2 = __computeLumaLog2(window, tex, shiftedCoord); lumaLog2 = (lumaLog2 - log2(lumaMin)) / log2(lumaMax / lumaMin); - float_t lumaLog2Sum = __reduction(lumaLog2, sdata); + + proxy_t data; + data.data[0] = lumaLog2; + float_t lumaLog2Sum = __reduction(data, sdata); if (tid == 0) { __uploadFloat( @@ -138,121 +164,6 @@ struct geom_meter float_t rcpFirstPassWGCount; }; -// template -// struct geom_meter -// { -// using float_t = typename SharedAccessor::type; -// using float_t2 = typename conditional, float32_t2, float16_t2>::type; -// using float_t3 = typename conditional, float32_t3, float16_t3>::type; -// using this_t = geom_meter; - -// static this_t create(float_t2 lumaMinMax, float_t sampleCount) -// { -// this_t retval; -// retval.lumaMinMax = lumaMinMax; -// retval.sampleCount = sampleCount; -// return retval; -// } - -// float_t __reduction(float_t value, NBL_REF_ARG(SharedAccessor) sdata) -// { -// return workgroup::reduction < plus < float_t >, GroupSize >:: -// template __call (value, sdata); -// } - -// float_t __computeLumaLog2( -// NBL_CONST_REF_ARG(MeteringWindow) window, -// NBL_REF_ARG(TexAccessor) tex, -// float_t2 shiftedCoord -// ) -// { -// float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset; -// float_t3 color = tex.get(uvPos); -// float_t luma = (float_t)TexAccessor::toXYZ(color); - -// luma = clamp(luma, lumaMinMax.x, lumaMinMax.y); - -// return log2(luma); -// } - -// void __uploadFloat( -// NBL_REF_ARG(ValueAccessor) val_accessor, -// float_t val, -// float_t minLog2, -// float_t rangeLog2 -// ) -// { -// uint32_t3 workGroupCount = glsl::gl_NumWorkGroups(); -// uint32_t workgroupIndex = (workGroupCount.x * workGroupCount.y * workGroupCount.z) / 64; -// uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); - -// uint32_t lumaSumBitPattern = uint32_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1))); - -// val_accessor.atomicAdd(workgroupIndex & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern); -// } - -// float_t __downloadFloat( -// NBL_REF_ARG(ValueAccessor) val_accessor, -// uint32_t index, -// float_t minLog2, -// float_t rangeLog2 -// ) -// { -// float_t luma = (float_t)val_accessor.get(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1)); -// return luma / rangeLog2 + minLog2; -// } - -// void sampleLuma( -// NBL_CONST_REF_ARG(MeteringWindow) window, -// NBL_REF_ARG(ValueAccessor) val, -// NBL_REF_ARG(TexAccessor) tex, -// NBL_REF_ARG(SharedAccessor) sdata, -// float_t2 tileOffset, -// float_t2 viewportSize -// ) -// { -// uint32_t tid = workgroup::SubgroupContiguousIndex(); -// uint32_t2 coord = math::Morton::decode2d(tid); - -// float_t luma = 0.0f; -// float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize; -// float_t lumaLog2 = __computeLumaLog2(window, tex, shiftedCoord); -// float_t lumaLog2Sum = __reduction(lumaLog2, sdata); - -// if (tid == 0) { -// __uploadFloat( -// val, -// lumaLog2Sum, -// log2(lumaMinMax.x), -// log2(lumaMinMax.y / lumaMinMax.x) -// ); -// } -// } - -// float_t gatherLuma( -// NBL_REF_ARG(ValueAccessor) val -// ) -// { -// uint32_t tid = glsl::gl_SubgroupInvocationID(); -// float_t luma = glsl::subgroupAdd( -// __downloadFloat( -// val, -// tid, -// log2(lumaMinMax.x), -// log2(lumaMinMax.y / lumaMinMax.x) -// ) -// ); - -// uint32_t3 workGroupCount = glsl::gl_NumWorkGroups(); -// uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); - -// return (luma / (1 << fixedPointBitsLeft)) / sampleCount; -// } - -// float_t sampleCount; -// float_t2 lumaMinMax; -// }; - } } } diff --git a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl index 58aea923f0..40a4db59a7 100644 --- a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl @@ -164,155 +164,6 @@ struct median_meter float_t upperBoundPercentile; }; -// template -// struct median_meter -// { -// using int_t = typename SharedAccessor::type; -// using float_t = float32_t; -// using float_t2 = typename conditional, float32_t2, float16_t2>::type; -// using float_t3 = typename conditional, float32_t3, float16_t3>::type; -// using this_t = median_meter; - -// static this_t create(float_t2 lumaMinMax) -// { -// this_t retval; -// retval.lumaMinMax = lumaMinMax; -// return retval; -// } - -// int_t __inclusive_scan(float_t value, NBL_REF_ARG(SharedAccessor) sdata) -// { -// return workgroup::inclusive_scan < plus < int_t >, GroupSize >:: -// template __call (value, sdata); -// } - -// float_t __computeLuma( -// NBL_CONST_REF_ARG(MeteringWindow) window, -// NBL_REF_ARG(TexAccessor) tex, -// float_t2 shiftedCoord -// ) -// { -// float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset; -// float_t3 color = tex.get(uvPos); -// float_t luma = (float_t)TexAccessor::toXYZ(color); - -// return clamp(luma, lumaMinMax.x, lumaMinMax.y); -// } - -// int_t __float2Int( -// float_t val, -// float_t minLog2, -// float_t rangeLog2 -// ) -// { -// uint32_t3 workGroupCount = glsl::gl_NumWorkGroups(); -// uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); - -// return int_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1))); -// } - -// float_t __int2Float( -// int_t val, -// float_t minLog2, -// float_t rangeLog2 -// ) -// { -// return val / rangeLog2 + minLog2; -// } - -// void sampleLuma( -// NBL_CONST_REF_ARG(MeteringWindow) window, -// NBL_REF_ARG(HistogramAccessor) histo, -// NBL_REF_ARG(TexAccessor) tex, -// NBL_REF_ARG(SharedAccessor) sdata, -// float_t2 tileOffset, -// float_t2 viewportSize -// ) -// { -// uint32_t tid = workgroup::SubgroupContiguousIndex(); - -// for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) { -// sdata.set(vid, 0); -// } - -// sdata.workgroupExecutionAndMemoryBarrier(); - -// morton::code mc; -// mc.value = tid; -// uint32_t2 coord = _static_cast(mc); - -// float_t luma = 0.0f; -// float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize; -// luma = __computeLuma(window, tex, shiftedCoord); - -// float_t binSize = (lumaMinMax.y - lumaMinMax.x) / BinCount; -// uint32_t binIndex = (uint32_t)((luma - lumaMinMax.x) / binSize); - -// sdata.atomicAdd(binIndex, __float2Int(luma, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)); - -// sdata.workgroupExecutionAndMemoryBarrier(); - -// float_t histogram_value; -// sdata.get(tid, histogram_value); - -// sdata.workgroupExecutionAndMemoryBarrier(); - -// float_t sum = __inclusive_scan(histogram_value, sdata); -// histo.atomicAdd(tid, __float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)); - -// const bool is_last_wg_invocation = tid == (GroupSize - 1); -// const static uint32_t RoundedBinCount = 1 + (BinCount - 1) / GroupSize; - -// for (int i = 1; i < RoundedBinCount; i++) { -// uint32_t keyBucketStart = GroupSize * i; -// uint32_t vid = tid + keyBucketStart; - -// // no if statement about the last iteration needed -// if (is_last_wg_invocation) { -// float_t beforeSum; -// sdata.get(keyBucketStart, beforeSum); -// sdata.set(keyBucketStart, beforeSum + sum); -// } - -// // propagate last block tail to next block head and protect against subsequent scans stepping on each other's toes -// sdata.workgroupExecutionAndMemoryBarrier(); - -// // no aliasing anymore -// float_t atVid; -// sdata.get(vid, atVid); -// sum = __inclusive_scan(atVid, sdata); -// if (vid < BinCount) { -// histo.atomicAdd(vid, __float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)); -// } -// } -// } - -// float_t gatherLuma( -// NBL_REF_ARG(HistogramAccessor) histo, -// NBL_REF_ARG(SharedAccessor) sdata -// ) -// { -// uint32_t tid = workgroup::SubgroupContiguousIndex(); - -// for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) { -// sdata.set( -// vid, -// histo.get(vid & (BinCount - 1)) -// ); -// } - -// sdata.workgroupExecutionAndMemoryBarrier(); - -// uint32_t percentile40, percentile60; -// sdata.get(BinCount * 0.4, percentile40); -// sdata.get(BinCount * 0.6, percentile60); - -// return (__int2Float(percentile40, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x) + __int2Float(percentile60, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)) / 2; -// } - -// float_t2 lumaMinMax; -// }; - } } } From 7598f2f51e3fad81f7f290b926031314dcec91ca Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 6 Feb 2026 12:08:33 +0700 Subject: [PATCH 48/56] use workgroup2 with histogram metering --- .../builtin/hlsl/luma_meter/geom_mean.hlsl | 10 ++- .../builtin/hlsl/luma_meter/histogram.hlsl | 70 +++++++++++++------ 2 files changed, 52 insertions(+), 28 deletions(-) diff --git a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl index 9ba05e2088..e8a8b7b15c 100644 --- a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl @@ -29,7 +29,7 @@ struct data_proxy template void get(const IndexType idx, NBL_REF_ARG(AccessType) value) { - value = data[idx]; + value = data; } T data; @@ -43,7 +43,7 @@ struct geom_meter using float_t2 = typename conditional, float32_t2, float16_t2>::type; using float_t3 = typename conditional, float32_t3, float16_t3>::type; - using proxy_data_t = vector; + using proxy_data_t = float_t; using proxy_t = impl::data_proxy; NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkgroupSize = WorkgroupConfig::WorkgroupSize; @@ -64,10 +64,8 @@ struct geom_meter float_t __reduction(NBL_REF_ARG(proxy_t) data, NBL_REF_ARG(SharedAccessor) sdata) { - // return workgroup::reduction < plus < float_t >, WorkgroupSize >:: - // template __call (value, sdata); return workgroup2::reduction< WorkgroupConfig, plus, device_capabilities >:: - template __call (data, sdata); + template __call(data, sdata); } float_t __computeLumaLog2( @@ -130,7 +128,7 @@ struct geom_meter lumaLog2 = (lumaLog2 - log2(lumaMin)) / log2(lumaMax / lumaMin); proxy_t data; - data.data[0] = lumaLog2; + data.data = lumaLog2; float_t lumaLog2Sum = __reduction(data, sdata); if (tid == 0) { diff --git a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl index 40a4db59a7..61f662cb06 100644 --- a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl @@ -8,8 +8,8 @@ #include "nbl/builtin/hlsl/glsl_compat/core.hlsl" #include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl" #include "nbl/builtin/hlsl/glsl_compat/subgroup_arithmetic.hlsl" -#include "nbl/builtin/hlsl/workgroup/basic.hlsl" -#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl" +#include "nbl/builtin/hlsl/workgroup2/basic.hlsl" +#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl" #include "nbl/builtin/hlsl/type_traits.hlsl" #include "nbl/builtin/hlsl/morton.hlsl" #include "nbl/builtin/hlsl/luma_meter/common.hlsl" @@ -21,14 +21,41 @@ namespace hlsl namespace luma_meter { -template +namespace impl +{ +template +struct data_proxy +{ + template + void get(const IndexType idx, NBL_REF_ARG(AccessType) value) + { + value = data; + } + + template + void set(const IndexType ix, const AccessType value) + { + data = value; + } + + T data; +}; +} + +template struct median_meter { using int_t = typename SharedAccessor::type; using float_t = float32_t; using float_t2 = typename conditional, float32_t2, float16_t2>::type; using float_t3 = typename conditional, float32_t3, float16_t3>::type; - using this_t = median_meter; + + NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkgroupSize = WorkgroupConfig::WorkgroupSize; + NBL_CONSTEXPR_STATIC_INLINE uint32_t ScanItemsPerInvoc = WorkgroupConfig::ItemsPerInvocation_0; + using proxy_data_t = vector; + using proxy_t = impl::data_proxy; + + using this_t = median_meter; static this_t create(float_t lumaMin, float_t lumaMax, float_t lowerBoundPercentile, float_t upperBoundPercentile) { @@ -40,10 +67,12 @@ struct median_meter return retval; } - int_t __inclusive_scan(float_t value, NBL_REF_ARG(SharedAccessor) sdata) + void __inclusive_scan(NBL_REF_ARG(proxy_t) data, NBL_REF_ARG(SharedAccessor) sdata) { - return workgroup::inclusive_scan < plus < int_t >, WorkgroupSize >:: - template __call (value, sdata); + // return workgroup::inclusive_scan < plus < int_t >, WorkgroupSize >:: + // template __call (value, sdata); + workgroup2::inclusive_scan< WorkgroupConfig, plus, device_capabilities >:: + template __call(data, sdata); } float_t __computeLuma( @@ -70,9 +99,8 @@ struct median_meter { uint32_t tid = workgroup::SubgroupContiguousIndex(); - for (uint32_t vid = tid; vid < BinCount; vid += WorkgroupSize) { - sdata.set(vid, 0); - } + for (uint32_t vid = tid; vid < BinCount; vid += WorkgroupSize) + sdata.template set(vid, 0u); sdata.workgroupExecutionAndMemoryBarrier(); @@ -89,13 +117,15 @@ struct median_meter sdata.workgroupExecutionAndMemoryBarrier(); - int_t histogram_value; - sdata.get(tid, histogram_value); + proxy_t histogram_data; + NBL_UNROLL for (uint32_t i = 0; i < ScanItemsPerInvoc; i++) + sdata.template get(tid * ScanItemsPerInvoc + i, histogram_data.data[i]); sdata.workgroupExecutionAndMemoryBarrier(); - int_t sum = __inclusive_scan(histogram_value, sdata); - histo.atomicAdd(tid, sum); + __inclusive_scan(histogram_data, sdata); + NBL_UNROLL for (uint32_t i = 0; i < ScanItemsPerInvoc; i++) + histo.atomicAdd(tid * ScanItemsPerInvoc + i, histogram_data.data[i]); } float_t gatherLuma( @@ -105,12 +135,8 @@ struct median_meter { uint32_t tid = workgroup::SubgroupContiguousIndex(); - for (uint32_t vid = tid; vid < BinCount; vid += WorkgroupSize) { - sdata.set( - vid, - histo.get(vid) - ); - } + for (uint32_t vid = tid; vid < BinCount; vid += WorkgroupSize) + sdata.template set(vid, histo.get(vid)); sdata.workgroupExecutionAndMemoryBarrier(); int_t lower, upper; @@ -123,7 +149,7 @@ struct median_meter while (lo < hi) { uint32_t mid = lo + (hi - lo) / 2; - sdata.get(mid, v); + sdata.template get(mid, v); if (lowerPercentile <= v) hi = mid; else @@ -141,7 +167,7 @@ struct median_meter while (lo < hi) { uint32_t mid = lo + (hi - lo) / 2; - sdata.get(mid, v); + sdata.template get(mid, v); if (upperPercentile >= v) lo = mid + 1; else From c7442832ba98b36eca30be4f1ebf569d04319564 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 6 Feb 2026 15:02:18 +0700 Subject: [PATCH 49/56] removed commented out code --- include/nbl/builtin/hlsl/luma_meter/histogram.hlsl | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl index 61f662cb06..fcfc06ed5c 100644 --- a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl @@ -69,8 +69,6 @@ struct median_meter void __inclusive_scan(NBL_REF_ARG(proxy_t) data, NBL_REF_ARG(SharedAccessor) sdata) { - // return workgroup::inclusive_scan < plus < int_t >, WorkgroupSize >:: - // template __call (value, sdata); workgroup2::inclusive_scan< WorkgroupConfig, plus, device_capabilities >:: template __call(data, sdata); } From 79fad2f7d16ba75347c94a8b4327042b4d786547 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Fri, 6 Feb 2026 15:11:09 +0700 Subject: [PATCH 50/56] latest example --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 5e27920875..77ec3d54bb 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 5e279208751e882805a62f12e1ba86f6389e4954 +Subproject commit 77ec3d54bba6bd54236b14fc9f1a105f5ca562ff From fe2b5bcd16e84b1e7ded41475ba4a6e8ab953819 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 9 Feb 2026 15:04:54 +0700 Subject: [PATCH 51/56] fixes converting thread to image uv --- include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl | 5 ++--- include/nbl/builtin/hlsl/luma_meter/histogram.hlsl | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl index e8a8b7b15c..40139d2863 100644 --- a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl @@ -114,8 +114,7 @@ struct geom_meter NBL_REF_ARG(ValueAccessor) val, NBL_REF_ARG(TexAccessor) tex, NBL_REF_ARG(SharedAccessor) sdata, - float_t2 tileOffset, - float_t2 viewportSize + float_t2 tileOffset ) { uint32_t tid = workgroup::SubgroupContiguousIndex(); @@ -123,7 +122,7 @@ struct geom_meter mc.value = tid; uint32_t2 coord = _static_cast(mc); - float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize; + float_t2 shiftedCoord = tileOffset + float32_t2(coord); float_t lumaLog2 = __computeLumaLog2(window, tex, shiftedCoord); lumaLog2 = (lumaLog2 - log2(lumaMin)) / log2(lumaMax / lumaMin); diff --git a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl index fcfc06ed5c..0e71b46925 100644 --- a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl @@ -91,8 +91,7 @@ struct median_meter NBL_REF_ARG(HistogramAccessor) histo, NBL_REF_ARG(TexAccessor) tex, NBL_REF_ARG(SharedAccessor) sdata, - float_t2 tileOffset, - float_t2 viewportSize + float_t2 tileOffset ) { uint32_t tid = workgroup::SubgroupContiguousIndex(); @@ -106,7 +105,7 @@ struct median_meter mc.value = tid; uint32_t2 coord = _static_cast(mc); - float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize; + float_t2 shiftedCoord = tileOffset + float32_t2(coord); float_t luma = __computeLuma(window, tex, shiftedCoord); float_t scaledLogLuma = log2(luma / lumaMin) / log2(lumaMax / lumaMin); From da066716735e81905623f81fb7effb98ed1b0dca Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 9 Feb 2026 15:07:15 +0700 Subject: [PATCH 52/56] latest example --- examples_tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 77ec3d54bb..d104945cb3 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 77ec3d54bba6bd54236b14fc9f1a105f5ca562ff +Subproject commit d104945cb3c41e89c20dc60135faecbd0778ed83 From ec58514620a2d0de5055d7ec5048e08caea765d1 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Mon, 9 Feb 2026 17:00:20 +0700 Subject: [PATCH 53/56] fix histogram metering percentiles by using sample count instead --- examples_tests | 2 +- include/nbl/builtin/hlsl/luma_meter/histogram.hlsl | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/examples_tests b/examples_tests index d104945cb3..07ad5db796 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit d104945cb3c41e89c20dc60135faecbd0778ed83 +Subproject commit 07ad5db7968fbab38a3fed4b93e97d17504f1b83 diff --git a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl index 0e71b46925..e86068ca87 100644 --- a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl @@ -139,7 +139,6 @@ struct median_meter int_t lower, upper; if (tid == 0) { - const uint32_t lowerPercentile = uint32_t(BinCount * lowerBoundPercentile); uint32_t lo = 0u; uint32_t hi = BinCount; int_t v; @@ -147,7 +146,7 @@ struct median_meter { uint32_t mid = lo + (hi - lo) / 2; sdata.template get(mid, v); - if (lowerPercentile <= v) + if (lowerBoundPercentile <= v) hi = mid; else lo = mid + 1; @@ -157,7 +156,6 @@ struct median_meter } if (tid == 1) { - const uint32_t upperPercentile = uint32_t(BinCount * upperBoundPercentile); uint32_t lo = 0u; uint32_t hi = BinCount; int_t v; @@ -165,7 +163,7 @@ struct median_meter { uint32_t mid = lo + (hi - lo) / 2; sdata.template get(mid, v); - if (upperPercentile >= v) + if (upperBoundPercentile >= v) lo = mid + 1; else hi = mid; @@ -183,8 +181,8 @@ struct median_meter float_t lumaMin; float_t lumaMax; - float_t lowerBoundPercentile; - float_t upperBoundPercentile; + int_t lowerBoundPercentile; + int_t upperBoundPercentile; }; } From 93582da0a8ad93719ce32ca5ab92e27d6fe9dfc0 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Tue, 10 Feb 2026 15:39:23 +0700 Subject: [PATCH 54/56] removed sample count from average metering --- include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl index 40139d2863..feeb260a3f 100644 --- a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl @@ -52,12 +52,11 @@ struct geom_meter NBL_CONSTEXPR_STATIC_INLINE uint32_t MaxWorkgroupIncrement = 0x1000u; - static this_t create(float_t lumaMin, float_t lumaMax, float_t sampleCount, float_t rcpFirstPassWGCount) + static this_t create(float_t lumaMin, float_t lumaMax, float_t rcpFirstPassWGCount) { this_t retval; retval.lumaMin = lumaMin; retval.lumaMax = lumaMax; - retval.sampleCount = sampleCount; retval.rcpFirstPassWGCount = rcpFirstPassWGCount; return retval; } @@ -157,7 +156,6 @@ struct geom_meter float_t lumaMin; float_t lumaMax; - float_t sampleCount; float_t rcpFirstPassWGCount; }; From 86fba58bcf128ed4ec8147352af56c4cef3d5ac7 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 12 Feb 2026 14:53:40 +0700 Subject: [PATCH 55/56] luma meter push constants in common --- examples_tests | 2 +- .../nbl/builtin/hlsl/luma_meter/common.hlsl | 26 +++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/examples_tests b/examples_tests index 07ad5db796..9482f15f67 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 07ad5db7968fbab38a3fed4b93e97d17504f1b83 +Subproject commit 9482f15f6730176bc5910d89d9b91d8ac0ccaa13 diff --git a/include/nbl/builtin/hlsl/luma_meter/common.hlsl b/include/nbl/builtin/hlsl/luma_meter/common.hlsl index 55d1713619..e469e1103d 100644 --- a/include/nbl/builtin/hlsl/luma_meter/common.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/common.hlsl @@ -28,6 +28,32 @@ struct MeteringWindow } }; +struct GeomMeanParameters +{ + float32_t rcpFirstPassWGCount; +}; + +struct HistogramParameters +{ + uint32_t lowerBoundPercentile; + uint32_t upperBoundPercentile; +}; + +struct PushConstants +{ + MeteringWindow window; + float32_t lumaMin; + float32_t lumaMax; + uint32_t2 viewportSize; + float32_t2 exposureAdaptationFactors; + uint64_t pLumaMeterBuf; + uint64_t pLastFrameEVBuf; + uint64_t pCurrFrameEVBuf; + + GeomMeanParameters meanParams; + HistogramParameters histoParams; +}; + } } } From 7c4b40160bd1940acfa30ab52be6f7ffc32948f2 Mon Sep 17 00:00:00 2001 From: keptsecret Date: Thu, 12 Feb 2026 15:48:14 +0700 Subject: [PATCH 56/56] precompute log2 values, minor changes to functions --- examples_tests | 2 +- .../builtin/hlsl/luma_meter/geom_mean.hlsl | 44 +++++++++---------- .../builtin/hlsl/luma_meter/histogram.hlsl | 27 +++++++----- 3 files changed, 37 insertions(+), 36 deletions(-) diff --git a/examples_tests b/examples_tests index 9482f15f67..ca0e50e78e 160000 --- a/examples_tests +++ b/examples_tests @@ -1 +1 @@ -Subproject commit 9482f15f6730176bc5910d89d9b91d8ac0ccaa13 +Subproject commit ca0e50e78e58e9c0472c779af5e37dcf96a88f12 diff --git a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl index feeb260a3f..40d706351a 100644 --- a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl @@ -57,6 +57,8 @@ struct geom_meter this_t retval; retval.lumaMin = lumaMin; retval.lumaMax = lumaMax; + retval.log2LumaMin = log2(lumaMin); + retval.log2LumaRange = log2(lumaMax) - retval.log2LumaMin; retval.rcpFirstPassWGCount = rcpFirstPassWGCount; return retval; } @@ -70,12 +72,12 @@ struct geom_meter float_t __computeLumaLog2( NBL_CONST_REF_ARG(MeteringWindow) window, NBL_REF_ARG(TexAccessor) tex, - float_t2 shiftedCoord + const float_t2 shiftedCoord ) { - float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset; - float_t3 color = tex.get(uvPos); - float_t luma = (float_t)TexAccessor::toXYZ(color); + const float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset; + const float_t3 color = tex.get(uvPos); + float_t luma = TexAccessor::toXYZ(color); luma = clamp(luma, lumaMin, lumaMax); @@ -84,36 +86,31 @@ struct geom_meter void __uploadFloat( NBL_REF_ARG(ValueAccessor) val_accessor, - float_t val, - float_t minLog2, - float_t rangeLog2 + float_t val ) { const uint32_t3 workGroupCount = glsl::gl_NumWorkGroups(); const uint32_t3 workgroupID = glsl::gl_WorkGroupID(); const uint32_t index = (workgroupID.y * workGroupCount.x + workgroupID.x) & (SubgroupSize - 1u); - uint32_t lumaVal = uint32_t(val / float_t(WorkgroupSize) * float_t(MaxWorkgroupIncrement) + 0.5); + const uint32_t lumaVal = uint32_t(val / float_t(WorkgroupSize) * float_t(MaxWorkgroupIncrement) + 0.5); val_accessor.atomicAdd(index, lumaVal); } float_t __downloadFloat( NBL_REF_ARG(ValueAccessor) val_accessor, - uint32_t index, - float_t minLog2, - float_t rangeLog2 + uint32_t index ) { uint32_t lumaVal = val_accessor.get(index); lumaVal = glsl::subgroupAdd(lumaVal); - return float_t(lumaVal) / float_t(MaxWorkgroupIncrement) * rcpFirstPassWGCount * rangeLog2 + minLog2; + return float_t(lumaVal) / float_t(MaxWorkgroupIncrement) * rcpFirstPassWGCount * log2LumaRange + log2LumaMin; } void sampleLuma( NBL_CONST_REF_ARG(MeteringWindow) window, NBL_REF_ARG(ValueAccessor) val, NBL_REF_ARG(TexAccessor) tex, - NBL_REF_ARG(SharedAccessor) sdata, - float_t2 tileOffset + NBL_REF_ARG(SharedAccessor) sdata ) { uint32_t tid = workgroup::SubgroupContiguousIndex(); @@ -121,9 +118,10 @@ struct geom_meter mc.value = tid; uint32_t2 coord = _static_cast(mc); - float_t2 shiftedCoord = tileOffset + float32_t2(coord); + const float_t2 tileOffset = float32_t2((glsl::gl_WorkGroupID() * SubgroupSize).xy); + const float_t2 shiftedCoord = tileOffset + float32_t2(coord); float_t lumaLog2 = __computeLumaLog2(window, tex, shiftedCoord); - lumaLog2 = (lumaLog2 - log2(lumaMin)) / log2(lumaMax / lumaMin); + lumaLog2 = (lumaLog2 - log2LumaMin) / log2LumaRange; proxy_t data; data.data = lumaLog2; @@ -132,9 +130,7 @@ struct geom_meter if (tid == 0) { __uploadFloat( val, - lumaLog2Sum, - log2(lumaMin), - log2(lumaMax / lumaMin) + lumaLog2Sum ); } } @@ -143,12 +139,10 @@ struct geom_meter NBL_REF_ARG(ValueAccessor) val ) { - uint32_t tid = glsl::gl_SubgroupInvocationID(); - float_t luma = __downloadFloat( + const uint32_t tid = glsl::gl_SubgroupInvocationID(); + const float_t luma = __downloadFloat( val, - tid, - log2(lumaMin), - log2(lumaMax / lumaMin) + tid ); return luma; @@ -156,6 +150,8 @@ struct geom_meter float_t lumaMin; float_t lumaMax; + float_t log2LumaMin; + float_t log2LumaRange; float_t rcpFirstPassWGCount; }; diff --git a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl index e86068ca87..eb9672b15e 100644 --- a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl +++ b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl @@ -51,6 +51,7 @@ struct median_meter using float_t3 = typename conditional, float32_t3, float16_t3>::type; NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkgroupSize = WorkgroupConfig::WorkgroupSize; + NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = WorkgroupConfig::SubgroupSize; NBL_CONSTEXPR_STATIC_INLINE uint32_t ScanItemsPerInvoc = WorkgroupConfig::ItemsPerInvocation_0; using proxy_data_t = vector; using proxy_t = impl::data_proxy; @@ -62,6 +63,8 @@ struct median_meter this_t retval; retval.lumaMin = lumaMin; retval.lumaMax = lumaMax; + retval.log2LumaMin = log2(lumaMin); + retval.log2LumaRange = log2(lumaMax) - retval.log2LumaMin; retval.lowerBoundPercentile = lowerBoundPercentile; retval.upperBoundPercentile = upperBoundPercentile; return retval; @@ -76,12 +79,12 @@ struct median_meter float_t __computeLuma( NBL_CONST_REF_ARG(MeteringWindow) window, NBL_REF_ARG(TexAccessor) tex, - float_t2 shiftedCoord + const float_t2 shiftedCoord ) { - float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset; - float_t3 color = tex.get(uvPos); - float_t luma = (float_t)TexAccessor::toXYZ(color); + const float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset; + const float_t3 color = tex.get(uvPos); + const float_t luma = TexAccessor::toXYZ(color); return clamp(luma, lumaMin, lumaMax); } @@ -90,8 +93,7 @@ struct median_meter NBL_CONST_REF_ARG(MeteringWindow) window, NBL_REF_ARG(HistogramAccessor) histo, NBL_REF_ARG(TexAccessor) tex, - NBL_REF_ARG(SharedAccessor) sdata, - float_t2 tileOffset + NBL_REF_ARG(SharedAccessor) sdata ) { uint32_t tid = workgroup::SubgroupContiguousIndex(); @@ -105,11 +107,12 @@ struct median_meter mc.value = tid; uint32_t2 coord = _static_cast(mc); - float_t2 shiftedCoord = tileOffset + float32_t2(coord); - float_t luma = __computeLuma(window, tex, shiftedCoord); + const float_t2 tileOffset = float32_t2((glsl::gl_WorkGroupID() * SubgroupSize).xy); + const float_t2 shiftedCoord = tileOffset + float32_t2(coord); + const float_t luma = __computeLuma(window, tex, shiftedCoord); - float_t scaledLogLuma = log2(luma / lumaMin) / log2(lumaMax / lumaMin); - uint32_t binIndex = int_t(scaledLogLuma * float_t(BinCount-1u) + 0.5); + const float_t scaledLogLuma = (log2(luma) - log2LumaMin) / log2LumaRange; + const uint32_t binIndex = int_t(scaledLogLuma * float_t(BinCount-1u) + 0.5); sdata.atomicAdd(binIndex, 1u); sdata.workgroupExecutionAndMemoryBarrier(); @@ -176,11 +179,13 @@ struct median_meter lower = workgroup::Broadcast(lower, sdata, 0); upper = workgroup::Broadcast(upper, sdata, 1); - return ((float_t(lower) + float_t(upper)) * 0.5 / float_t(BinCount-1u)) * log2(lumaMax/lumaMin) + log2(lumaMin); + return ((float_t(lower) + float_t(upper)) * 0.5 / float_t(BinCount-1u)) * log2LumaRange + log2LumaMin; } float_t lumaMin; float_t lumaMax; + float_t log2LumaMin; + float_t log2LumaRange; int_t lowerBoundPercentile; int_t upperBoundPercentile; };