From 096e09d83f113767ec3f85bb03a1f7d24c612e64 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Sat, 20 Jul 2024 00:33:56 +0530
Subject: [PATCH 01/56] Add luma_meter and tonemapper

---
 .../nbl/builtin/hlsl/luma_meter/luma_meter.hlsl  | 16 ++++++++++++++++
 .../nbl/builtin/hlsl/tonemapper/operators.hlsl   | 16 ++++++++++++++++
 src/nbl/builtin/CMakeLists.txt                   |  4 ++++
 3 files changed, 36 insertions(+)
 create mode 100644 include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
 create mode 100644 include/nbl/builtin/hlsl/tonemapper/operators.hlsl

diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
new file mode 100644
index 0000000000..4e18655852
--- /dev/null
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -0,0 +1,16 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_
+#define _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_
+
+namespace nbl
+{
+namespace hls
+{
+
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
new file mode 100644
index 0000000000..5ebb5b2ffa
--- /dev/null
+++ b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
@@ -0,0 +1,16 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_INCLUDED_
+#define _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_INCLUDED_
+
+namespace nbl
+{
+namespace hls
+{
+
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt
index 8f797b9454..9dd9ddfd42 100644
--- a/src/nbl/builtin/CMakeLists.txt
+++ b/src/nbl/builtin/CMakeLists.txt
@@ -34,6 +34,10 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/barycentric/utils.glsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/__ref.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/__ptr.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/bda_accessor.hlsl")
+# luma metering
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/luma_meter.hlsl")
+# tonemapper
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tonemapper/operators.hlsl")
 # bump mapping
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bump_mapping/fragment.glsl") # TODO: rename to `frag.glsl`
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bump_mapping/utils.glsl")

From 4fd700fe69709ec127f7f42ec09b4f7f4ce0260c Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Sat, 20 Jul 2024 00:34:17 +0530
Subject: [PATCH 02/56] Update submodule pointer

---
 examples_tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples_tests b/examples_tests
index c6d5ee3498..87d4794dcc 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit c6d5ee349859ce0b5229bc62a2372fa1d4b6b17c
+Subproject commit 87d4794dcc5de8264528292c4a30b5284979754a

From 52e7ab24dedb16f6c94855d6f0037e7ea77fba81 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Thu, 1 Aug 2024 21:20:52 +0530
Subject: [PATCH 03/56] Convert morton.h to hlsl

---
 include/nbl/asset/utils/IMeshPacker.h     |   2 +-
 include/nbl/asset/utils/IVirtualTexture.h |   3 +-
 include/nbl/builtin/hlsl/math/morton.hlsl | 283 ++++++++++++++++++++++
 src/nbl/builtin/CMakeLists.txt            |   2 +
 4 files changed, 288 insertions(+), 2 deletions(-)
 create mode 100644 include/nbl/builtin/hlsl/math/morton.hlsl

diff --git a/include/nbl/asset/utils/IMeshPacker.h b/include/nbl/asset/utils/IMeshPacker.h
index 3f09062b18..355d792782 100644
--- a/include/nbl/asset/utils/IMeshPacker.h
+++ b/include/nbl/asset/utils/IMeshPacker.h
@@ -6,7 +6,7 @@
 #define __NBL_ASSET_I_MESH_PACKER_H_INCLUDED__
 
 #include "nbl/asset/utils/IMeshManipulator.h"
-#include "nbl/core/math/morton.h"
+#include "nbl/builtin/hlsl/math/morton.hlsl"
 
 namespace nbl
 {
diff --git a/include/nbl/asset/utils/IVirtualTexture.h b/include/nbl/asset/utils/IVirtualTexture.h
index ec26f56103..64ea49cbe7 100644
--- a/include/nbl/asset/utils/IVirtualTexture.h
+++ b/include/nbl/asset/utils/IVirtualTexture.h
@@ -7,7 +7,6 @@
 
 #include <functional>
 
-#include "nbl/core/math/morton.h"
 #include "nbl/core/memory/memory.h"
 #include "nbl/core/alloc/GeneralpurposeAddressAllocator.h"
 #include "nbl/core/alloc/PoolAddressAllocator.h"
@@ -19,6 +18,8 @@
 #include "nbl/asset/filters/CPaddedCopyImageFilter.h"
 #include "nbl/asset/filters/CFillImageFilter.h"
 
+#include "nbl/builtin/hlsl/math/morton.hlsl"
+
 namespace nbl::asset
 {
 
diff --git a/include/nbl/builtin/hlsl/math/morton.hlsl b/include/nbl/builtin/hlsl/math/morton.hlsl
new file mode 100644
index 0000000000..64b0b66cb7
--- /dev/null
+++ b/include/nbl/builtin/hlsl/math/morton.hlsl
@@ -0,0 +1,283 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_BUILTIN_HLSL_MORTON_INCLUDED_
+#define _NBL_BUILTIN_HLSL_MORTON_INCLUDED_
+
+#ifdef __HLSL_VERSION
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+#else
+#include <cstdint>
+#endif
+
+namespace nbl
+{
+namespace core
+{
+
+namespace impl
+{
+
+#ifdef __HLSL_VERSION
+template <typename T>
+T morton2d_mask(uint16_t _n) const
+{
+    const static uint64_t mask[5] =
+    {
+        0x5555555555555555ull,
+        0x3333333333333333ull,
+        0x0F0F0F0F0F0F0F0Full,
+        0x00FF00FF00FF00FFull,
+        0x0000FFFF0000FFFFull
+    };
+    return static_cast<T>(mask[_n]);
+}
+
+template <typename T>
+T morton3d_mask(uint16_t _n) const
+{
+    const static uint64_t mask[5] =
+    {
+        0x1249249249249249ull, // [0] final layout: one payload bit every 3rd position (bits 0,3,...,60)
+        0x10C30C30C30C30C3ull, // [1] 2-bit groups spaced 6 apart, plus bit 60
+        0x100F00F00F00F00Full, // [2] 4-bit groups spaced 12 apart, plus bit 60 (was 0x010F..., which kept bit 56 and dropped payload bit 20)
+        0x001F0000FF0000FFull,
+        0x001F00000000FFFFull
+    };
+    return static_cast<T>(mask[_n]);
+}
+template <typename T>
+T morton4d_mask(uint16_t _n) const
+{
+    const static uint64_t mask[4] =
+    {
+        0x1111111111111111ull,
+        0x0303030303030303ull,
+        0x000F000F000F000Full,
+        0x000000FF000000FFull
+    };
+    return static_cast<T>(mask[_n]);
+}
+
+template <typename T, uint32_t bitDepth>
+inline T morton2d_decode(T x) // compacts the bits found at even positions of x into a contiguous low-order value
+{
+    x = x & morton2d_mask<T>(0); // drop the odd-position bits (the other coordinate)
+    x = (x | (x >> 1)) & morton2d_mask<T>(1);
+    x = (x | (x >> 2)) & morton2d_mask<T>(2);
+    if (bitDepth > 8u)
+    {
+        x = (x | (x >> 4)) & morton2d_mask<T>(3);
+    }
+    if (bitDepth > 16u)
+    {
+        x = (x | (x >> 8)) & morton2d_mask<T>(4);
+    }
+    if (bitDepth > 32u)
+    {
+        x = (x | (x >> 16)) & (T)0x00000000FFFFFFFFull; // mask the last fold too, otherwise bits 32..47 of the previous step stay in the result
+    }
+    return x;
+}
+
+//! Puts bits on even positions filling gaps with 0s
+template <typename T, uint32_t bitDepth>
+inline T separate_bits_2d(T x)
+{
+    if (bitDepth > 32u)
+    {
+        x = (x | (x << 16)) & morton2d_mask<T>(4);
+    }
+    if (bitDepth > 16u)
+    {
+        x = (x | (x << 8)) & morton2d_mask<T>(3);
+    }
+    if (bitDepth > 8u)
+    {
+        x = (x | (x << 4)) & morton2d_mask<T>(2);
+    }
+    x = (x | (x << 2)) & morton2d_mask<T>(1);
+    x = (x | (x << 1)) & morton2d_mask<T>(0);
+
+    return x;
+}
+template <typename T, uint32_t bitDepth>
+inline T separate_bits_3d(T x)
+{
+    if (bitDepth > 32u)
+    {
+        x = (x | (x << 32)) & morton3d_mask<T>(4);
+    }
+    if (bitDepth > 16u)
+    {
+        x = (x | (x << 16)) & morton3d_mask<T>(3);
+    }
+    if (bitDepth > 8u)
+    {
+        x = (x | (x << 8)) & morton3d_mask<T>(2);
+    }
+    x = (x | (x << 4)) & morton3d_mask<T>(1);
+    x = (x | (x << 2)) & morton3d_mask<T>(0);
+
+    return x;
+}
+template <typename T, uint32_t bitDepth>
+inline T separate_bits_4d(T x)
+{
+    if (bitDepth > 32u)
+    {
+        x = (x | (x << 24)) & morton4d_mask<T>(3);
+    }
+    if (bitDepth > 16u)
+    {
+        x = (x | (x << 12)) & morton4d_mask<T>(2);
+    }
+    if (bitDepth > 8u)
+    {
+        x = (x | (x << 6)) & morton4d_mask<T>(1);
+    }
+    x = (x | (x << 3)) & morton4d_mask<T>(0);
+
+    return x;
+}
+#else
+template <typename T>
+constexpr T morton2d_mask(uint8_t _n)
+{
+    constexpr uint64_t mask[5] =
+    {
+        0x5555555555555555ull,
+        0x3333333333333333ull,
+        0x0F0F0F0F0F0F0F0Full,
+        0x00FF00FF00FF00FFull,
+        0x0000FFFF0000FFFFull
+    };
+    return static_cast<T>(mask[_n]);
+}
+template <typename T>
+constexpr T morton3d_mask(uint8_t _n)
+{
+    constexpr uint64_t mask[5] =
+    {
+        0x1249249249249249ull, // [0] final layout: one payload bit every 3rd position (bits 0,3,...,60)
+        0x10C30C30C30C30C3ull, // [1] 2-bit groups spaced 6 apart, plus bit 60
+        0x100F00F00F00F00Full, // [2] 4-bit groups spaced 12 apart, plus bit 60 (was 0x010F..., which kept bit 56 and dropped payload bit 20)
+        0x001F0000FF0000FFull,
+        0x001F00000000FFFFull
+    };
+    return static_cast<T>(mask[_n]);
+}
+template <typename T>
+constexpr T morton4d_mask(uint8_t _n)
+{
+    constexpr uint64_t mask[4] =
+    {
+        0x1111111111111111ull,
+        0x0303030303030303ull,
+        0x000F000F000F000Full,
+        0x000000FF000000FFull
+    };
+    return static_cast<T>(mask[_n]);
+}
+
+template <typename T, uint32_t bitDepth>
+inline T morton2d_decode(T x) // compacts the bits found at even positions of x into a contiguous low-order value
+{
+    x = x & morton2d_mask<T>(0); // drop the odd-position bits (the other coordinate)
+    x = (x | (x >> 1)) & morton2d_mask<T>(1);
+    x = (x | (x >> 2)) & morton2d_mask<T>(2);
+    if constexpr (bitDepth > 8u)
+    {
+        x = (x | (x >> 4)) & morton2d_mask<T>(3);
+    }
+    if constexpr (bitDepth > 16u)
+    {
+        x = (x | (x >> 8)) & morton2d_mask<T>(4);
+    }
+    if constexpr (bitDepth > 32u)
+    {
+        x = (x | (x >> 16)) & static_cast<T>(0x00000000FFFFFFFFull); // mask the last fold too, otherwise bits 32..47 of the previous step stay in the result
+    }
+    return x;
+}
+
+//! Puts bits on even positions filling gaps with 0s
+template <typename T, uint32_t bitDepth>
+inline T separate_bits_2d(T x)
+{
+    if constexpr (bitDepth > 32u)
+    {
+        x = (x | (x << 16)) & morton2d_mask<T>(4);
+    }
+    if constexpr (bitDepth > 16u)
+    {
+        x = (x | (x << 8)) & morton2d_mask<T>(3);
+    }
+    if constexpr (bitDepth > 8u)
+    {
+        x = (x | (x << 4)) & morton2d_mask<T>(2);
+    }
+    x = (x | (x << 2)) & morton2d_mask<T>(1);
+    x = (x | (x << 1)) & morton2d_mask<T>(0);
+
+    return x;
+}
+template <typename T, uint32_t bitDepth>
+inline T separate_bits_3d(T x)
+{
+    if constexpr (bitDepth > 32u)
+    {
+        x = (x | (x << 32)) & morton3d_mask<T>(4);
+    }
+    if constexpr (bitDepth > 16u)
+    {
+        x = (x | (x << 16)) & morton3d_mask<T>(3);
+    }
+    if constexpr (bitDepth > 8u)
+    {
+        x = (x | (x << 8)) & morton3d_mask<T>(2);
+    }
+    x = (x | (x << 4)) & morton3d_mask<T>(1);
+    x = (x | (x << 2)) & morton3d_mask<T>(0);
+
+    return x;
+}
+template <typename T, uint32_t bitDepth>
+inline T separate_bits_4d(T x)
+{
+    if constexpr (bitDepth > 32u)
+    {
+        x = (x | (x << 24)) & morton4d_mask<T>(3);
+    }
+    if constexpr (bitDepth > 16u)
+    {
+        x = (x | (x << 12)) & morton4d_mask<T>(2);
+    }
+    if constexpr (bitDepth > 8u)
+    {
+        x = (x | (x << 6)) & morton4d_mask<T>(1);
+    }
+    x = (x | (x << 3)) & morton4d_mask<T>(0);
+
+    return x;
+}
+#endif
+}
+
+template<typename T, uint32_t bitDepth = sizeof(T) * 8u>
+T morton2d_decode_x(T _morton) { return impl::morton2d_decode<T, bitDepth>(_morton); }
+template<typename T, uint32_t bitDepth = sizeof(T) * 8u>
+T morton2d_decode_y(T _morton) { return impl::morton2d_decode<T, bitDepth>(_morton >> 1); }
+
+template<typename T, uint32_t bitDepth = sizeof(T) * 8u>
+T morton2d_encode(T x, T y) { return impl::separate_bits_2d<T, bitDepth>(x) | (impl::separate_bits_2d<T, bitDepth>(y) << 1); }
+template<typename T, uint32_t bitDepth = sizeof(T) * 8u>
+T morton3d_encode(T x, T y, T z) { return impl::separate_bits_3d<T, bitDepth>(x) | (impl::separate_bits_3d<T, bitDepth>(y) << 1) | (impl::separate_bits_3d<T, bitDepth>(z) << 2); }
+template<typename T, uint32_t bitDepth = sizeof(T) * 8u>
+T morton4d_encode(T x, T y, T z, T w) { return impl::separate_bits_4d<T, bitDepth>(x) | (impl::separate_bits_4d<T, bitDepth>(y) << 1) | (impl::separate_bits_4d<T, bitDepth>(z) << 2) | (impl::separate_bits_4d<T, bitDepth>(w) << 3); }
+
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt
index 8a7775c7a5..df61293d4a 100644
--- a/src/nbl/builtin/CMakeLists.txt
+++ b/src/nbl/builtin/CMakeLists.txt
@@ -281,6 +281,8 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/equations/quartic.hlsl")
 #extra math
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/quadrature/gauss_legendre/gauss_legendre.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/quadrature/gauss_legendre/impl.hlsl")
+#morton
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/math/morton.hlsl")
 #acceleration structures
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/acceleration_structures.hlsl")
 #colorspace

From 1cc26bdcd583bbbc354c8c5e951f06e6cb1d3f28 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Fri, 2 Aug 2024 19:00:47 +0530
Subject: [PATCH 04/56] Fix HLSL morton code

---
 include/nbl/builtin/hlsl/math/morton.hlsl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/nbl/builtin/hlsl/math/morton.hlsl b/include/nbl/builtin/hlsl/math/morton.hlsl
index 64b0b66cb7..4150af637a 100644
--- a/include/nbl/builtin/hlsl/math/morton.hlsl
+++ b/include/nbl/builtin/hlsl/math/morton.hlsl
@@ -21,7 +21,7 @@ namespace impl
 
 #ifdef __HLSL_VERSION
 template <typename T>
-T morton2d_mask(uint16_t _n) const
+T morton2d_mask(uint16_t _n)
 {
     const static uint64_t mask[5] =
     {
@@ -31,11 +31,11 @@ T morton2d_mask(uint16_t _n) const
         0x00FF00FF00FF00FFull,
         0x0000FFFF0000FFFFull
     };
-    return static_cast<T>(mask[_n]);
+    return mask[_n];
 }
 
 template <typename T>
-T morton3d_mask(uint16_t _n) const
+T morton3d_mask(uint16_t _n)
 {
     const static uint64_t mask[5] =
     {
@@ -45,10 +45,10 @@ T morton3d_mask(uint16_t _n) const
         0x001F0000FF0000FFull,
         0x001F00000000FFFFull
     };
-    return static_cast<T>(mask[_n]);
+    return mask[_n];
 }
 template <typename T>
-T morton4d_mask(uint16_t _n) const
+T morton4d_mask(uint16_t _n)
 {
     const static uint64_t mask[4] =
     {
@@ -57,7 +57,7 @@ T morton4d_mask(uint16_t _n) const
         0x000F000F000F000Full,
         0x000000FF000000FFull
     };
-    return static_cast<T>(mask[_n]);
+    return mask[_n];
 }
 
 template <typename T, uint32_t bitDepth>

From 6922d0c41b509a125be89d86627ba206d565b053 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Mon, 5 Aug 2024 19:02:04 +0530
Subject: [PATCH 05/56] Create geom_luma_meter and computeLuma

---
 .../builtin/hlsl/luma_meter/luma_meter.hlsl   | 47 ++++++++++++++++++-
 1 file changed, 46 insertions(+), 1 deletion(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
index 4e18655852..d2c33602c8 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -5,11 +5,56 @@
 #ifndef _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_
 #define _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_
 
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
+#include "nbl/builtin/hlsl/type_traits.hlsl"
+#include "nbl/builtin/hlsl/math/morton.hlsl"
+#include "nbl/builtin/hlsl/colorspace/EOTF.hlsl"
+#include "nbl/builtin/hlsl/colorspace/encodeCIEXYZ.hlsl"
+
 namespace nbl
 {
-namespace hls
+namespace hlsl
+{
+namespace luma_meter
+{
+
+struct LumaMeteringWindow
 {
+	float32_t2 meteringWindowScale;
+	float32_t2 meteringWindowOffset;
+};
+
+template<uint32_t SubgroupSize, uint32_t SubgroupCount, typename SharedAccessor, typename TexAccessor>
+struct geom_luma_meter {
+    using this_t = geom_luma_meter<SubgroupSize, SubgroupCount, SharedAccessor, TexAccessor>;
+
+    static this_t create(NBL_REF_ARG(LumaMeteringWindow) window)
+    {
+        this_t retval;
+        retval.window = window;
+        return retval;
+    }
 
+    float32_t computeLuma(NBL_REF_ARG(TexAccessor) tex, uint32_t2 sampleCount, uint32_t2 sampleIndex, float32_t2 viewportSize)
+    {
+        float32_t2 stride = window.meteringWindowScale / (sampleCount + float32_t2(1.0f, 1.0f));
+        float32_t2 samplePos = stride * sampleIndex;
+        float32_t2 uvPos = (samplePos + float32_t2(0.5f, 0.5f)) / viewportSize;
+        float32_t3 color = colorspace::eotf::sRGB(tex.get(uvPos));
+        float32_t luma = dot(colorspace::sRGBtoXYZ[1], color);
+
+        const float32_t minLuma = 1.0 / 4096.0;
+        const float32_t maxLuma = 32768.0;
+
+        luma = clamp(luma, minLuma, maxLuma);
+
+        return log2(luma / minLuma) / log2(maxLuma / minLuma);
+    }
+
+    LumaMeteringWindow window;
+};
+}
 }
 }
 

From 603a92f87a5831dc491ff4e4b53e99f5af9a57ce Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Wed, 7 Aug 2024 19:22:52 +0530
Subject: [PATCH 06/56] Add gatherLuma method

---
 include/nbl/asset/utils/IVirtualTexture.h     |  4 +-
 .../builtin/hlsl/luma_meter/luma_meter.hlsl   | 54 +++++++++++++++++--
 include/nbl/builtin/hlsl/math/morton.hlsl     |  2 +-
 3 files changed, 54 insertions(+), 6 deletions(-)

diff --git a/include/nbl/asset/utils/IVirtualTexture.h b/include/nbl/asset/utils/IVirtualTexture.h
index 64ea49cbe7..b715c40cfc 100644
--- a/include/nbl/asset/utils/IVirtualTexture.h
+++ b/include/nbl/asset/utils/IVirtualTexture.h
@@ -922,7 +922,7 @@ class IVirtualTexture : public core::IReferenceCounted, public IVirtualTextureBa
         storage->incrTileCounter(neededPhysPages);
 
         return offsetToTextureData(
-            page_tab_offset_t(core::morton2d_decode_x(addr), core::morton2d_decode_y(addr), pgtLayer),
+            page_tab_offset_t(hlsl::morton2d_decode_x(addr), hlsl::morton2d_decode_y(addr), pgtLayer),
             extent,
             _subres.levelCount,
             _wrapu,
@@ -934,7 +934,7 @@ class IVirtualTexture : public core::IReferenceCounted, public IVirtualTextureBa
     {
         uint32_t sz = computeSquareSz(_addr.origsize_x, _addr.origsize_y);
         sz *= sz;
-        const uint32_t addr = core::morton2d_encode(_addr.pgTab_x, _addr.pgTab_y);
+        const uint32_t addr = hlsl::morton2d_encode(_addr.pgTab_x, _addr.pgTab_y);
 
         core::address_allocator_traits<pg_tab_addr_alctr_t>::multi_free_addr(m_pageTableLayerAllocators[_addr.pgTab_layer], 1u, &addr, &sz);
 
diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
index d2c33602c8..7ed9604c4f 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -7,6 +7,9 @@
 
 #include "nbl/builtin/hlsl/cpp_compat.hlsl"
 #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
+#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
+#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl"
 #include "nbl/builtin/hlsl/type_traits.hlsl"
 #include "nbl/builtin/hlsl/math/morton.hlsl"
 #include "nbl/builtin/hlsl/colorspace/EOTF.hlsl"
@@ -25,9 +28,9 @@ struct LumaMeteringWindow
 	float32_t2 meteringWindowOffset;
 };
 
-template<uint32_t SubgroupSize, uint32_t SubgroupCount, typename SharedAccessor, typename TexAccessor>
+template<uint32_t GroupSize, typename ValueAccessor, typename SharedAccessor, typename TexAccessor>
 struct geom_luma_meter {
-    using this_t = geom_luma_meter<SubgroupSize, SubgroupCount, SharedAccessor, TexAccessor>;
+    using this_t = geom_luma_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>;
 
     static this_t create(NBL_REF_ARG(LumaMeteringWindow) window)
     {
@@ -36,7 +39,18 @@ struct geom_luma_meter {
         return retval;
     }
 
-    float32_t computeLuma(NBL_REF_ARG(TexAccessor) tex, uint32_t2 sampleCount, uint32_t2 sampleIndex, float32_t2 viewportSize)
+    float32_t reduction(float32_t value, NBL_REF_ARG(SharedAccessor) sdata)
+    {
+        return workgroup::reduction < plus < float32_t >, GroupSize >::
+            template __call <SharedAccessor>(value, sdata);
+    }
+
+    float32_t computeLuma(
+        NBL_REF_ARG(TexAccessor) tex,
+        uint32_t2 sampleCount,
+        uint32_t2 sampleIndex,
+        float32_t2 viewportSize
+    )
     {
         float32_t2 stride = window.meteringWindowScale / (sampleCount + float32_t2(1.0f, 1.0f));
         float32_t2 samplePos = stride * sampleIndex;
@@ -52,6 +66,40 @@ struct geom_luma_meter {
         return log2(luma / minLuma) / log2(maxLuma / minLuma);
     }
 
+    void gatherLuma( // sums this workgroup's per-invocation luma and atomically adds the result into `val`
+        NBL_REF_ARG(ValueAccessor) val, // destination of the atomicAdd at the end
+        NBL_REF_ARG(TexAccessor) tex, // forwarded to computeLuma for the texel fetch
+        NBL_REF_ARG(SharedAccessor) sdata,
+        uint32_t2 sampleCount,
+        float32_t2 viewportSize
+    ) {
+        uint32_t2 coord = {
+            morton2d_decode_x(glsl::gl_LocalInvocationIndex()),
+            morton2d_decode_y(glsl::gl_LocalInvocationIndex())
+        };
+        uint32_t tid = workgroup::SubgroupContiguousIndex();
+
+        uint32_t2 sampleIndex = coord * GroupSize + float32_t2(glsl::gl_SubgroupID() + 1, glsl::gl_SubgroupInvocationID() + 1);
+        float32_t luma = 0.0f; // invocations outside the sample window contribute 0 to the sum
+        // only the texture fetch is conditional: the reduction and barrier below must be reached by every invocation
+        if (sampleIndex.x <= sampleCount.x && sampleIndex.y <= sampleCount.y)
+            luma = computeLuma(tex, sampleCount, sampleIndex, viewportSize);
+        { // plain scope (not a branch): keeps the workgroup-wide operations in uniform control flow
+            float32_t lumaSum = reduction(luma, sdata);
+            sdata.workgroupExecutionAndMemoryBarrier();
+
+            if (tid == GroupSize - 1) {
+                uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
+                uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
+                uint32_t lumaSumBitPattern = uint32_t(clamp(lumaSum, 0.f, float((1 << fixedPointBitsLeft) - 1)));
+                uint32_t3 workgroupSize = glsl::gl_WorkGroupSize();
+                uint32_t workgroupIndex = dot(uint32_t3(workgroupSize.y * workgroupSize.z, workgroupSize.z, 1), glsl::gl_WorkGroupID());
+
+                val.atomicAdd(workgroupIndex & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern); // one add per workgroup, slot chosen from the low bits of workgroupIndex
+            } // end of single-invocation write
+        }
+    }
+
     LumaMeteringWindow window;
 };
 }
diff --git a/include/nbl/builtin/hlsl/math/morton.hlsl b/include/nbl/builtin/hlsl/math/morton.hlsl
index 4150af637a..1f35016cb6 100644
--- a/include/nbl/builtin/hlsl/math/morton.hlsl
+++ b/include/nbl/builtin/hlsl/math/morton.hlsl
@@ -13,7 +13,7 @@
 
 namespace nbl
 {
-namespace core
+namespace hlsl
 {
 
 namespace impl

From 810a6ac1cc2ff6662dca36edd0413288b4f1b1ea Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Thu, 8 Aug 2024 16:29:20 +0530
Subject: [PATCH 07/56] Add getGatheredLuma()

---
 .../builtin/hlsl/luma_meter/luma_meter.hlsl   | 25 ++++++++++++++-----
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
index 7ed9604c4f..21bd813439 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -32,10 +32,12 @@ template<uint32_t GroupSize, typename ValueAccessor, typename SharedAccessor, ty
 struct geom_luma_meter {
     using this_t = geom_luma_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>;
 
-    static this_t create(NBL_REF_ARG(LumaMeteringWindow) window)
+    static this_t create(NBL_REF_ARG(LumaMeteringWindow) window, float32_t lumaMinimum, float32_t lumaMaximum)
     {
         this_t retval;
         retval.window = window;
+        retval.minLuma = lumaMinimum;
+        retval.maxLuma = lumaMaximum;
         return retval;
     }
 
@@ -58,9 +60,6 @@ struct geom_luma_meter {
         float32_t3 color = colorspace::eotf::sRGB(tex.get(uvPos));
         float32_t luma = dot(colorspace::sRGBtoXYZ[1], color);
 
-        const float32_t minLuma = 1.0 / 4096.0;
-        const float32_t maxLuma = 32768.0;
-
         luma = clamp(luma, minLuma, maxLuma);
 
         return log2(luma / minLuma) / log2(maxLuma / minLuma);
@@ -72,7 +71,8 @@ struct geom_luma_meter {
         NBL_REF_ARG(SharedAccessor) sdata,
         uint32_t2 sampleCount,
         float32_t2 viewportSize
-    ) {
+    )
+    {
         uint32_t2 coord = {
             morton2d_decode_x(glsl::gl_LocalInvocationIndex()),
             morton2d_decode_y(glsl::gl_LocalInvocationIndex())
@@ -91,7 +91,9 @@ struct geom_luma_meter {
             if (tid == GroupSize - 1) {
                 uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
                 uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
-                uint32_t lumaSumBitPattern = uint32_t(clamp(lumaSum, 0.f, float((1 << fixedPointBitsLeft) - 1)));
+
+                uint32_t lumaSumBitPattern = uint32_t(clamp((lumaSum - log2(minLuma)) * (log2(maxLuma) - log2(minLuma)), 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
+
                 uint32_t3 workgroupSize = glsl::gl_WorkGroupSize();
                 uint32_t workgroupIndex = dot(uint32_t3(workgroupSize.y * workgroupSize.z, workgroupSize.z, 1), glsl::gl_WorkGroupID());
 
@@ -100,7 +102,18 @@ struct geom_luma_meter {
         }
     }
 
+    float32_t getGatheredLuma(
+        NBL_REF_ARG(ValueAccessor) val,
+        uint32_t2 sampleCount
+    )
+    {
+        uint32_t lumaSumBitPattern = val.get(glsl::gl_SubgroupInvocationID());
+        float32_t lumaSumValue = float32_t(lumaSumBitPattern) / (log2(maxLuma) - log2(minLuma)) + log2(minLuma);
+        return glsl::subgroupAdd(lumaSumValue) / (sampleCount.x * sampleCount.y);
+    }
+
     LumaMeteringWindow window;
+    float32_t minLuma, maxLuma;
 };
 }
 }

From 69a73c1d90a0702894ecead0de1455d459d8b2ca Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Thu, 8 Aug 2024 16:59:59 +0530
Subject: [PATCH 08/56] Add reinhard and aces hlsl operators

---
 .../builtin/hlsl/tonemapper/operators.hlsl    | 59 ++++++++++++++++++-
 1 file changed, 58 insertions(+), 1 deletion(-)

diff --git a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
index 5ebb5b2ffa..cc5728e9ff 100644
--- a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
+++ b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
@@ -5,10 +5,67 @@
 #ifndef _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_INCLUDED_
 #define _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_INCLUDED_
 
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+
 namespace nbl
 {
-namespace hls
+namespace hlsl
+{
+
+struct ReinhardParams
+{
+	float32_t keyAndManualLinearExposure;
+	float32_t rcpWhite2;
+};
+
+struct ACESParams
+{
+	float32_t gamma; // 1.0
+	float32_t exposure; // actualExposure+midGrayLog2
+};
+
+
+inline float32_t3 reinhard(ReinhardParams params, float32_t3 rawCIEXYZcolor) // inline: header-defined free function, avoids duplicate definitions when included from several C++ translation units
+{
+	float32_t exposureFactors = params.keyAndManualLinearExposure;
+	float32_t exposedLuma = rawCIEXYZcolor.y * exposureFactors; // Y channel scaled by the exposure factor
+	float32_t colorMultiplier = (exposureFactors * (1.0 + exposedLuma * params.rcpWhite2) / (1.0 + exposedLuma));
+	return rawCIEXYZcolor * colorMultiplier; // the same scalar is applied to all three channels
+}
+
+inline float32_t3 aces(ACESParams params, float32_t3 rawCIEXYZcolor) // inline: header-defined free function, avoids duplicate definitions when included from several C++ translation units
 {
+	float32_t3 tonemapped = rawCIEXYZcolor;
+	if (tonemapped.y > 1.175494351e-38) // skip the log2 below for zero / sub-threshold luma
+		tonemapped *= exp2(log2(tonemapped.y) * (params.gamma - 1.0) + (params.exposure) * params.gamma);
+
+	// XYZ => RRT_SAT
+	// this seems to be a matrix for some hybrid colorspace, coefficients are somewhere in between BT2020 and ACEScc(t)
+	const float32_t3x3 XYZ_RRT_Input = float32_t3x3(
+		float32_t3(1.594168310, -0.262608051, -0.231993079),
+		float32_t3(-0.6332771780, 1.5840380200, 0.0164147373),
+		float32_t3(0.00892840419, 0.03648501260, 0.87711471300)
+	);
+
+	// this is obviously fitted to some particular simulated sensor/film and display
+	float32_t3 v = mul(XYZ_RRT_Input, tonemapped);
+	float32_t3 a = v * (v + float32_t3(0.0245786)) - float32_t3(0.000090537);
+	float32_t3 b = v * (v * float32_t(0.983729) + float32_t3(0.4329510)) + float32_t3(0.238081);
+	v = a / b;
+
+	// ODT_SAT => XYZ
+	// this seems to be a matrix for some hybrid colorspace, coefficients are similar to AdobeRGB,BT2020 and ACEScc(t)
+	const float32_t3x3 ODT_XYZ_Output = float32_t3x3(
+		float32_t3(0.624798000, 0.164064825, 0.161605373),
+		float32_t3(0.268048108, 0.674283803, 0.057667464),
+		float32_t3(0.0157514643, 0.0526682511, 1.0204007600)
+	);
+	return mul(ODT_XYZ_Output, v);
+}
+
+// ideas for more operators https://web.archive.org/web/20191226154550/http://cs.columbia.edu/CAVE/software/softlib/dorf.php
+// or get proper ACES RRT and ODTs
+// https://partnerhelp.netflixstudios.com/hc/en-us/articles/360000622487-I-m-using-ACES-Which-Output-Transform-should-I-use-
 
 }
 }

From 4c70cf5bb919abab9c82e36320de45be88fe02ee Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 13 Aug 2024 21:47:49 +0530
Subject: [PATCH 09/56] cast mask values to correct type

---
 include/nbl/builtin/hlsl/math/morton.hlsl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/nbl/builtin/hlsl/math/morton.hlsl b/include/nbl/builtin/hlsl/math/morton.hlsl
index 1f35016cb6..1cd2105dc5 100644
--- a/include/nbl/builtin/hlsl/math/morton.hlsl
+++ b/include/nbl/builtin/hlsl/math/morton.hlsl
@@ -31,7 +31,7 @@ T morton2d_mask(uint16_t _n)
         0x00FF00FF00FF00FFull,
         0x0000FFFF0000FFFFull
     };
-    return mask[_n];
+    return (T)mask[_n];
 }
 
 template <typename T>
@@ -45,7 +45,7 @@ T morton3d_mask(uint16_t _n)
         0x001F0000FF0000FFull,
         0x001F00000000FFFFull
     };
-    return mask[_n];
+    return (T)mask[_n];
 }
 template <typename T>
 T morton4d_mask(uint16_t _n)
@@ -57,7 +57,7 @@ T morton4d_mask(uint16_t _n)
         0x000F000F000F000Full,
         0x000000FF000000FFull
     };
-    return mask[_n];
+    return (T)mask[_n];
 }
 
 template <typename T, uint32_t bitDepth>

From d9d6dd8c19a1c896ea03dce1182791bfb2e1834b Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Fri, 16 Aug 2024 16:35:19 +0530
Subject: [PATCH 10/56] Add create methods to tonemapper params

---
 .../builtin/hlsl/tonemapper/operators.hlsl    | 24 +++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
index cc5728e9ff..daff652bbd 100644
--- a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
+++ b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
@@ -11,15 +11,34 @@ namespace nbl
 {
 namespace hlsl
 {
+namespace tonemapper
+{
 
 struct ReinhardParams
 {
+	using this_t = ReinhardParams;
+	static this_t create(float EV, float key = 0.18f, float WhitePointRelToEV = 16.f)
+	{
+		this_t retval;
+		retval.keyAndManualLinearExposure = key * exp2(EV);
+		retval.rcpWhite2 = 1.f / (WhitePointRelToEV * WhitePointRelToEV);
+		return retval;
+	}
+
 	float32_t keyAndManualLinearExposure;
 	float32_t rcpWhite2;
 };
 
 struct ACESParams
 {
+	using this_t = ACESParams;
+	static this_t create(float EV, float key = 0.18f, float Contrast = 1.f) {
+		this_t retval;
+		retval.gamma = Contrast;
+		retval.exposure = EV + log2(key * 0.77321666f);
+		return retval;
+	}
+
 	float32_t gamma; // 1.0
 	float32_t exposure; // actualExposure+midGrayLog2
 };
@@ -49,8 +68,8 @@ float32_t3 aces(ACESParams params, float32_t3 rawCIEXYZcolor)
 
 	// this is obviously fitted to some particular simulated sensor/film and display
 	float32_t3 v = mul(XYZ_RRT_Input, tonemapped);
-	float32_t3 a = v * (v + float32_t3(0.0245786)) - float32_t3(0.000090537);
-	float32_t3 b = v * (v * float32_t(0.983729) + float32_t3(0.4329510)) + float32_t3(0.238081);
+	float32_t3 a = v * (v + float32_t3(0.0245786, 0.0245786, 0.0245786)) - float32_t3(0.000090537, 0.000090537, 0.000090537);
+	float32_t3 b = v * (v * float32_t3(0.983729, 0.983729, 0.983729) + float32_t3(0.4329510, 0.4329510, 0.4329510)) + float32_t3(0.238081, 0.238081, 0.238081);
 	v = a / b;
 
 	// ODT_SAT => XYZ
@@ -67,6 +86,7 @@ float32_t3 aces(ACESParams params, float32_t3 rawCIEXYZcolor)
 // or get proper ACES RRT and ODTs
 // https://partnerhelp.netflixstudios.com/hc/en-us/articles/360000622487-I-m-using-ACES-Which-Output-Transform-should-I-use-
 
+}
 }
 }
 

From 305f7e7430077c72a9bbf0b814ed5a6bd9e691a6 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Fri, 16 Aug 2024 16:35:49 +0530
Subject: [PATCH 11/56] Remove getGatheredLuma from luma_meter

---
 include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
index 21bd813439..94b898670b 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -13,6 +13,7 @@
 #include "nbl/builtin/hlsl/type_traits.hlsl"
 #include "nbl/builtin/hlsl/math/morton.hlsl"
 #include "nbl/builtin/hlsl/colorspace/EOTF.hlsl"
+#include "nbl/builtin/hlsl/colorspace/OETF.hlsl"
 #include "nbl/builtin/hlsl/colorspace/encodeCIEXYZ.hlsl"
 
 namespace nbl
@@ -57,7 +58,7 @@ struct geom_luma_meter {
         float32_t2 stride = window.meteringWindowScale / (sampleCount + float32_t2(1.0f, 1.0f));
         float32_t2 samplePos = stride * sampleIndex;
         float32_t2 uvPos = (samplePos + float32_t2(0.5f, 0.5f)) / viewportSize;
-        float32_t3 color = colorspace::eotf::sRGB(tex.get(uvPos));
+        float32_t3 color = colorspace::oetf::sRGB(tex.get(uvPos));
         float32_t luma = dot(colorspace::sRGBtoXYZ[1], color);
 
         luma = clamp(luma, minLuma, maxLuma);
@@ -102,16 +103,6 @@ struct geom_luma_meter {
         }
     }
 
-    float32_t getGatheredLuma(
-        NBL_REF_ARG(ValueAccessor) val,
-        uint32_t2 sampleCount
-    )
-    {
-        uint32_t lumaSumBitPattern = val.get(glsl::gl_SubgroupInvocationID());
-        float32_t lumaSumValue = float32_t(lumaSumBitPattern) / (log2(maxLuma) - log2(minLuma)) + log2(minLuma);
-        return glsl::subgroupAdd(lumaSumValue) / (sampleCount.x * sampleCount.y);
-    }
-
     LumaMeteringWindow window;
     float32_t minLuma, maxLuma;
 };

From 3f4f6e93163e5c0c1a67f88b8906a07916ddbe84 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 20 Aug 2024 18:28:48 +0530
Subject: [PATCH 12/56] Separate LumaMeteringWindow into a common header

---
 .../nbl/builtin/hlsl/luma_meter/common.hlsl   | 27 +++++++++++++++++++
 .../builtin/hlsl/luma_meter/luma_meter.hlsl   | 12 +++------
 src/nbl/builtin/CMakeLists.txt                |  1 +
 3 files changed, 31 insertions(+), 9 deletions(-)
 create mode 100644 include/nbl/builtin/hlsl/luma_meter/common.hlsl

diff --git a/include/nbl/builtin/hlsl/luma_meter/common.hlsl b/include/nbl/builtin/hlsl/luma_meter/common.hlsl
new file mode 100644
index 0000000000..210039390e
--- /dev/null
+++ b/include/nbl/builtin/hlsl/luma_meter/common.hlsl
@@ -0,0 +1,27 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_BUILTIN_HLSL_LUMA_METER_COMMON_INCLUDED_
+#define _NBL_BUILTIN_HLSL_LUMA_METER_COMMON_INCLUDED_
+
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace luma_meter
+{
+
+struct MeteringWindow
+{
+	float32_t2 meteringWindowScale;
+	float32_t2 meteringWindowOffset;
+};
+
+}
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
index 94b898670b..e865d61c0d 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -5,7 +5,6 @@
 #ifndef _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_
 #define _NBL_BUILTIN_HLSL_LUMA_METER_INCLUDED_
 
-#include "nbl/builtin/hlsl/cpp_compat.hlsl"
 #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
 #include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
 #include "nbl/builtin/hlsl/workgroup/basic.hlsl"
@@ -15,6 +14,7 @@
 #include "nbl/builtin/hlsl/colorspace/EOTF.hlsl"
 #include "nbl/builtin/hlsl/colorspace/OETF.hlsl"
 #include "nbl/builtin/hlsl/colorspace/encodeCIEXYZ.hlsl"
+#include "nbl/builtin/hlsl/luma_meter/common.hlsl"
 
 namespace nbl
 {
@@ -23,17 +23,11 @@ namespace hlsl
 namespace luma_meter
 {
 
-struct LumaMeteringWindow
-{
-	float32_t2 meteringWindowScale;
-	float32_t2 meteringWindowOffset;
-};
-
 template<uint32_t GroupSize, typename ValueAccessor, typename SharedAccessor, typename TexAccessor>
 struct geom_luma_meter {
     using this_t = geom_luma_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>;
 
-    static this_t create(NBL_REF_ARG(LumaMeteringWindow) window, float32_t lumaMinimum, float32_t lumaMaximum)
+    static this_t create(NBL_REF_ARG(MeteringWindow) window, float32_t lumaMinimum, float32_t lumaMaximum)
     {
         this_t retval;
         retval.window = window;
@@ -103,7 +97,7 @@ struct geom_luma_meter {
         }
     }
 
-    LumaMeteringWindow window;
+    MeteringWindow window;
     float32_t minLuma, maxLuma;
 };
 }
diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt
index df61293d4a..b4346c428e 100644
--- a/src/nbl/builtin/CMakeLists.txt
+++ b/src/nbl/builtin/CMakeLists.txt
@@ -35,6 +35,7 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/__ref.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/__ptr.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/bda_accessor.hlsl")
 # luma metering
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/common.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/luma_meter.hlsl")
 # tonemapper
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tonemapper/operators.hlsl")

From 515512a9dc5287dd68acce86205c53b5b219ba54 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 20 Aug 2024 18:32:27 +0530
Subject: [PATCH 13/56] Simplify luma_meter naming

---
 include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
index e865d61c0d..fb07acb8f4 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -24,8 +24,8 @@ namespace luma_meter
 {
 
 template<uint32_t GroupSize, typename ValueAccessor, typename SharedAccessor, typename TexAccessor>
-struct geom_luma_meter {
-    using this_t = geom_luma_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>;
+struct geom_meter {
+    using this_t = geom_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>;
 
     static this_t create(NBL_REF_ARG(MeteringWindow) window, float32_t lumaMinimum, float32_t lumaMaximum)
     {

From 1919e53ed6ecb319f7892005d0faad86706288a2 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 20 Aug 2024 19:06:03 +0530
Subject: [PATCH 14/56] Simplify morton code

---
 include/nbl/builtin/hlsl/math/morton.hlsl | 135 +---------------------
 1 file changed, 6 insertions(+), 129 deletions(-)

diff --git a/include/nbl/builtin/hlsl/math/morton.hlsl b/include/nbl/builtin/hlsl/math/morton.hlsl
index 1cd2105dc5..c0769fc88b 100644
--- a/include/nbl/builtin/hlsl/math/morton.hlsl
+++ b/include/nbl/builtin/hlsl/math/morton.hlsl
@@ -19,9 +19,8 @@ namespace hlsl
 namespace impl
 {
 
-#ifdef __HLSL_VERSION
 template <typename T>
-T morton2d_mask(uint16_t _n)
+NBL_CONSTEXPR_FUNC T morton2d_mask(uint16_t _n)
 {
     const static uint64_t mask[5] =
     {
@@ -31,11 +30,11 @@ T morton2d_mask(uint16_t _n)
         0x00FF00FF00FF00FFull,
         0x0000FFFF0000FFFFull
     };
-    return (T)mask[_n];
+    return nbl::hlsl::_static_cast<T>(mask[_n]);
 }
 
 template <typename T>
-T morton3d_mask(uint16_t _n)
+NBL_CONSTEXPR_FUNC T morton3d_mask(uint16_t _n)
 {
     const static uint64_t mask[5] =
     {
@@ -45,10 +44,10 @@ T morton3d_mask(uint16_t _n)
         0x001F0000FF0000FFull,
         0x001F00000000FFFFull
     };
-    return (T)mask[_n];
+    return nbl::hlsl::_static_cast<T>(mask[_n]);
 }
 template <typename T>
-T morton4d_mask(uint16_t _n)
+NBL_CONSTEXPR_FUNC T morton4d_mask(uint16_t _n)
 {
     const static uint64_t mask[4] =
     {
@@ -57,7 +56,7 @@ T morton4d_mask(uint16_t _n)
         0x000F000F000F000Full,
         0x000000FF000000FFull
     };
-    return (T)mask[_n];
+    return nbl::hlsl::_static_cast<T>(mask[_n]);
 }
 
 template <typename T, uint32_t bitDepth>
@@ -141,128 +140,6 @@ inline T separate_bits_4d(T x)
 
     return x;
 }
-#else
-template <typename T>
-constexpr T morton2d_mask(uint8_t _n)
-{
-    constexpr uint64_t mask[5] =
-    {
-        0x5555555555555555ull,
-        0x3333333333333333ull,
-        0x0F0F0F0F0F0F0F0Full,
-        0x00FF00FF00FF00FFull,
-        0x0000FFFF0000FFFFull
-    };
-    return static_cast<T>(mask[_n]);
-}
-template <typename T>
-constexpr T morton3d_mask(uint8_t _n)
-{
-    constexpr uint64_t mask[5] =
-    {
-        0x1249249249249249ull,
-        0x10C30C30C30C30C3ull,
-        0x010F00F00F00F00Full,
-        0x001F0000FF0000FFull,
-        0x001F00000000FFFFull
-    };
-    return static_cast<T>(mask[_n]);
-}
-template <typename T>
-constexpr T morton4d_mask(uint8_t _n)
-{
-    constexpr uint64_t mask[4] =
-    {
-        0x1111111111111111ull,
-        0x0303030303030303ull,
-        0x000F000F000F000Full,
-        0x000000FF000000FFull
-    };
-    return static_cast<T>(mask[_n]);
-}
-
-template <typename T, uint32_t bitDepth>
-inline T morton2d_decode(T x)
-{
-    x = x & morton2d_mask<T>(0);
-    x = (x | (x >> 1)) & morton2d_mask<T>(1);
-    x = (x | (x >> 2)) & morton2d_mask<T>(2);
-    if constexpr (bitDepth > 8u)
-    {
-        x = (x | (x >> 4)) & morton2d_mask<T>(3);
-    }
-    if constexpr (bitDepth > 16u)
-    {
-        x = (x | (x >> 8)) & morton2d_mask<T>(4);
-    }
-    if constexpr (bitDepth > 32u)
-    {
-        x = (x | (x >> 16));
-    }
-    return x;
-}
-
-//! Puts bits on even positions filling gaps with 0s
-template <typename T, uint32_t bitDepth>
-inline T separate_bits_2d(T x)
-{
-    if constexpr (bitDepth > 32u)
-    {
-        x = (x | (x << 16)) & morton2d_mask<T>(4);
-    }
-    if constexpr (bitDepth > 16u)
-    {
-        x = (x | (x << 8)) & morton2d_mask<T>(3);
-    }
-    if constexpr (bitDepth > 8u)
-    {
-        x = (x | (x << 4)) & morton2d_mask<T>(2);
-    }
-    x = (x | (x << 2)) & morton2d_mask<T>(1);
-    x = (x | (x << 1)) & morton2d_mask<T>(0);
-
-    return x;
-}
-template <typename T, uint32_t bitDepth>
-inline T separate_bits_3d(T x)
-{
-    if constexpr (bitDepth > 32u)
-    {
-        x = (x | (x << 32)) & morton3d_mask<T>(4);
-    }
-    if constexpr (bitDepth > 16u)
-    {
-        x = (x | (x << 16)) & morton3d_mask<T>(3);
-    }
-    if constexpr (bitDepth > 8u)
-    {
-        x = (x | (x << 8)) & morton3d_mask<T>(2);
-    }
-    x = (x | (x << 4)) & morton3d_mask<T>(1);
-    x = (x | (x << 2)) & morton3d_mask<T>(0);
-
-    return x;
-}
-template <typename T, uint32_t bitDepth>
-inline T separate_bits_4d(T x)
-{
-    if constexpr (bitDepth > 32u)
-    {
-        x = (x | (x << 24)) & morton4d_mask<T>(3);
-    }
-    if constexpr (bitDepth > 16u)
-    {
-        x = (x | (x << 12)) & morton4d_mask<T>(2);
-    }
-    if constexpr (bitDepth > 8u)
-    {
-        x = (x | (x << 6)) & morton4d_mask<T>(1);
-    }
-    x = (x | (x << 3)) & morton4d_mask<T>(0);
-
-    return x;
-}
-#endif
 }
 
 template<typename T, uint32_t bitDepth = sizeof(T) * 8u>

From 4c582382e8adca012b959577367138a8f1a92dfd Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 20 Aug 2024 19:09:24 +0530
Subject: [PATCH 15/56] Add missing comment

---
 include/nbl/builtin/hlsl/tonemapper/operators.hlsl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
index daff652bbd..1481fd92b2 100644
--- a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
+++ b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
@@ -35,7 +35,8 @@ struct ACESParams
 	static this_t create(float EV, float key = 0.18f, float Contrast = 1.f) {
 		this_t retval;
 		retval.gamma = Contrast;
-		retval.exposure = EV + log2(key * 0.77321666f);
+		const float reinhardMatchCorrection = 0.77321666f; // middle grays get exposed to different values between tonemappers given the same key
+		retval.exposure = EV + log2(key * reinhardMatchCorrection);
 		return retval;
 	}
 

From 3c3f8b84025dfddb3464d4bc9ed5ca76f651b07c Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 20 Aug 2024 20:09:02 +0530
Subject: [PATCH 16/56] Refactor tonemapping operators

---
 .../builtin/hlsl/tonemapper/operators.hlsl    | 106 +++++++++---------
 1 file changed, 56 insertions(+), 50 deletions(-)

diff --git a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
index 1481fd92b2..854f78e302 100644
--- a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
+++ b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
@@ -6,6 +6,7 @@
 #define _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_INCLUDED_
 
 #include "nbl/builtin/hlsl/cpp_compat.hlsl"
+#include "nbl/builtin/hlsl/type_traits.hlsl"
 
 namespace nbl
 {
@@ -14,10 +15,13 @@ namespace hlsl
 namespace tonemapper
 {
 
-struct ReinhardParams
+template<typename T = float32_t>
+struct Reinhard
 {
-	using this_t = ReinhardParams;
-	static this_t create(float EV, float key = 0.18f, float WhitePointRelToEV = 16.f)
+	using float_t = enable_if_t<is_floating_point<T>::value, T>;
+	using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
+	using this_t = Reinhard<float_t>;
+	static this_t create(float_t EV, float_t key = 0.18f, float_t WhitePointRelToEV = 16.f)
 	{
 		this_t retval;
 		retval.keyAndManualLinearExposure = key * exp2(EV);
@@ -25,63 +29,65 @@ struct ReinhardParams
 		return retval;
 	}
 
-	float32_t keyAndManualLinearExposure;
-	float32_t rcpWhite2;
+	float_t3 operator()(float_t3 rawCIEXYZcolor) {
+		float_t exposureFactors = keyAndManualLinearExposure;
+		float_t exposedLuma = rawCIEXYZcolor.y * exposureFactors;
+		float_t colorMultiplier = (exposureFactors * (1.0 + exposedLuma * rcpWhite2) / (1.0 + exposedLuma));
+		return rawCIEXYZcolor * colorMultiplier;
+	}
+
+	float_t3 keyAndManualLinearExposure;
+	float_t3 rcpWhite2;
 };
 
-struct ACESParams
+template<typename T = float32_t>
+struct ACES
 {
-	using this_t = ACESParams;
-	static this_t create(float EV, float key = 0.18f, float Contrast = 1.f) {
+	using float_t = enable_if_t<is_floating_point<T>::value, T>;
+	using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
+	using float_t3x3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3x3, float16_t3x3>::type;
+
+	using this_t = ACES<T>;
+	static this_t create(float_t EV, float_t key = 0.18f, float_t Contrast = 1.f) {
 		this_t retval;
 		retval.gamma = Contrast;
-		const float reinhardMatchCorrection = 0.77321666f; // middle grays get exposed to different values between tonemappers given the same key
+		const float_t reinhardMatchCorrection = 0.77321666f; // middle grays get exposed to different values between tonemappers given the same key
 		retval.exposure = EV + log2(key * reinhardMatchCorrection);
 		return retval;
 	}
 
-	float32_t gamma; // 1.0
-	float32_t exposure; // actualExposure+midGrayLog2
-};
-
-
-float32_t3 reinhard(ReinhardParams params, float32_t3 rawCIEXYZcolor)
-{
-	float32_t exposureFactors = params.keyAndManualLinearExposure;
-	float32_t exposedLuma = rawCIEXYZcolor.y * exposureFactors;
-	float32_t colorMultiplier = (exposureFactors * (1.0 + exposedLuma * params.rcpWhite2) / (1.0 + exposedLuma));
-	return rawCIEXYZcolor * colorMultiplier;
-}
+	float_t3 operator()(float_t3 rawCIEXYZcolor) {
+		float_t3 tonemapped = rawCIEXYZcolor;
+		if (tonemapped.y > 1.175494351e-38)
+			tonemapped *= exp2(log2(tonemapped.y) * (gamma - 1.0) + (exposure) * gamma);
+
+		// XYZ => RRT_SAT
+		// this seems to be a matrix for some hybrid colorspace, coefficients are somewhere inbetween BT2020 and ACEScc(t)
+		const float_t3x3 XYZ_RRT_Input = float_t3x3(
+			float_t3(1.594168310, -0.262608051, -0.231993079),
+			float_t3(-0.6332771780, 1.5840380200, 0.0164147373),
+			float_t3(0.00892840419, 0.03648501260, 0.87711471300)
+		);
+
+		// this is obviously fitted to some particular simulated sensor/film and display
+		float_t3 v = mul(XYZ_RRT_Input, tonemapped);
+		float_t3 a = v * (v + float_t3(0.0245786, 0.0245786, 0.0245786)) - float_t3(0.000090537, 0.000090537, 0.000090537);
+		float_t3 b = v * (v * float_t3(0.983729, 0.983729, 0.983729) + float_t3(0.4329510, 0.4329510, 0.4329510)) + float_t3(0.238081, 0.238081, 0.238081);
+		v = a / b;
+
+		// ODT_SAT => XYZ
+		// this seems to be a matrix for some hybrid colorspace, coefficients are similar to AdobeRGB,BT2020 and ACEScc(t)
+		const float_t3x3 ODT_XYZ_Output = float_t3x3(
+			float_t3(0.624798000, 0.164064825, 0.161605373),
+			float_t3(0.268048108, 0.674283803, 0.057667464),
+			float_t3(0.0157514643, 0.0526682511, 1.0204007600)
+		);
+		return mul(ODT_XYZ_Output, v);
+	}
 
-float32_t3 aces(ACESParams params, float32_t3 rawCIEXYZcolor)
-{
-	float32_t3 tonemapped = rawCIEXYZcolor;
-	if (tonemapped.y > 1.175494351e-38)
-		tonemapped *= exp2(log2(tonemapped.y) * (params.gamma - 1.0) + (params.exposure) * params.gamma);
-
-	// XYZ => RRT_SAT
-	// this seems to be a matrix for some hybrid colorspace, coefficients are somewhere inbetween BT2020 and ACEScc(t)
-	const float32_t3x3 XYZ_RRT_Input = float32_t3x3(
-		float32_t3(1.594168310, -0.262608051, -0.231993079),
-		float32_t3(-0.6332771780, 1.5840380200, 0.0164147373),
-		float32_t3(0.00892840419, 0.03648501260, 0.87711471300)
-	);
-
-	// this is obviously fitted to some particular simulated sensor/film and display
-	float32_t3 v = mul(XYZ_RRT_Input, tonemapped);
-	float32_t3 a = v * (v + float32_t3(0.0245786, 0.0245786, 0.0245786)) - float32_t3(0.000090537, 0.000090537, 0.000090537);
-	float32_t3 b = v * (v * float32_t3(0.983729, 0.983729, 0.983729) + float32_t3(0.4329510, 0.4329510, 0.4329510)) + float32_t3(0.238081, 0.238081, 0.238081);
-	v = a / b;
-
-	// ODT_SAT => XYZ
-	// this seems to be a matrix for some hybrid colorspace, coefficients are similar to AdobeRGB,BT2020 and ACEScc(t)
-	const float32_t3x3 ODT_XYZ_Output = float32_t3x3(
-		float32_t3(0.624798000, 0.164064825, 0.161605373),
-		float32_t3(0.268048108, 0.674283803, 0.057667464),
-		float32_t3(0.0157514643, 0.0526682511, 1.0204007600)
-	);
-	return mul(ODT_XYZ_Output, v);
-}
+	float_t gamma; // 1.0
+	float_t exposure; // actualExposure+midGrayLog2
+};
 
 // ideas for more operators https://web.archive.org/web/20191226154550/http://cs.columbia.edu/CAVE/software/softlib/dorf.php
 // or get proper ACES RRT and ODTs

From b0e07505a374d3e81e18e9e71c39152e4599051c Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 20 Aug 2024 20:17:38 +0530
Subject: [PATCH 17/56] Small fixes

---
 include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
index fb07acb8f4..af128b0f98 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -27,7 +27,7 @@ template<uint32_t GroupSize, typename ValueAccessor, typename SharedAccessor, ty
 struct geom_meter {
     using this_t = geom_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>;
 
-    static this_t create(NBL_REF_ARG(MeteringWindow) window, float32_t lumaMinimum, float32_t lumaMaximum)
+    static this_t create(NBL_CONST_REF_ARG(MeteringWindow) window, float32_t lumaMinimum, float32_t lumaMaximum)
     {
         this_t retval;
         retval.window = window;
@@ -68,11 +68,12 @@ struct geom_meter {
         float32_t2 viewportSize
     )
     {
+
+        uint32_t tid = workgroup::SubgroupContiguousIndex();
         uint32_t2 coord = {
-            morton2d_decode_x(glsl::gl_LocalInvocationIndex()),
-            morton2d_decode_y(glsl::gl_LocalInvocationIndex())
+            morton2d_decode_x(tid),
+            morton2d_decode_y(tid)
         };
-        uint32_t tid = workgroup::SubgroupContiguousIndex();
 
         uint32_t2 sampleIndex = coord * GroupSize + float32_t2(glsl::gl_SubgroupID() + 1, glsl::gl_SubgroupInvocationID() + 1);
         float32_t luma = 0.0f;
@@ -81,8 +82,6 @@ struct geom_meter {
             luma = computeLuma(tex, sampleCount, sampleIndex, viewportSize);
             float32_t lumaSum = reduction(luma, sdata);
 
-            sdata.workgroupExecutionAndMemoryBarrier();
-
             if (tid == GroupSize - 1) {
                 uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
                 uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();

From e8e46c9d042e76adb3bfd449982fcff70986cfba Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Wed, 21 Aug 2024 16:20:32 +0530
Subject: [PATCH 18/56] Use promote to simplify code

---
 include/nbl/builtin/hlsl/tonemapper/operators.hlsl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
index 854f78e302..e5e6a9a97c 100644
--- a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
+++ b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
@@ -71,8 +71,8 @@ struct ACES
 
 		// this is obviously fitted to some particular simulated sensor/film and display
 		float_t3 v = mul(XYZ_RRT_Input, tonemapped);
-		float_t3 a = v * (v + float_t3(0.0245786, 0.0245786, 0.0245786)) - float_t3(0.000090537, 0.000090537, 0.000090537);
-		float_t3 b = v * (v * float_t3(0.983729, 0.983729, 0.983729) + float_t3(0.4329510, 0.4329510, 0.4329510)) + float_t3(0.238081, 0.238081, 0.238081);
+		float_t3 a = v * (v + promote<float_t3>(0.0245786)) - promote<float_t3>(0.000090537);
+		float_t3 b = v * (v * promote<float_t3>(0.983729) + promote<float_t3>(0.4329510)) + promote<float_t3>(0.238081);
 		v = a / b;
 
 		// ODT_SAT => XYZ

From ee5affe6f20f25e1c7eb2675e07fe340be9204fb Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Wed, 21 Aug 2024 17:07:34 +0530
Subject: [PATCH 19/56] Add static create to MeteringWindow

---
 include/nbl/builtin/hlsl/luma_meter/common.hlsl | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/include/nbl/builtin/hlsl/luma_meter/common.hlsl b/include/nbl/builtin/hlsl/luma_meter/common.hlsl
index 210039390e..55d1713619 100644
--- a/include/nbl/builtin/hlsl/luma_meter/common.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/common.hlsl
@@ -16,8 +16,16 @@ namespace luma_meter
 
 struct MeteringWindow
 {
+	using this_t = MeteringWindow;
 	float32_t2 meteringWindowScale;
 	float32_t2 meteringWindowOffset;
+
+	static this_t create(float32_t2 scale, float32_t2 offset) {
+		this_t retval;
+		retval.meteringWindowScale = scale;
+		retval.meteringWindowOffset = offset;
+		return retval;
+	}
 };
 
 }

From 56389f45a6f5689889d232fb051a15b0001e43f7 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Wed, 21 Aug 2024 18:31:28 +0530
Subject: [PATCH 20/56] Infer sample count from viewportSize

---
 .../builtin/hlsl/luma_meter/luma_meter.hlsl   | 49 +++++++------------
 .../builtin/hlsl/tonemapper/operators.hlsl    |  4 +-
 2 files changed, 21 insertions(+), 32 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
index af128b0f98..23deac8bbe 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -27,12 +27,10 @@ template<uint32_t GroupSize, typename ValueAccessor, typename SharedAccessor, ty
 struct geom_meter {
     using this_t = geom_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>;
 
-    static this_t create(NBL_CONST_REF_ARG(MeteringWindow) window, float32_t lumaMinimum, float32_t lumaMaximum)
+    static this_t create(float32_t2 lumaMinMax)
     {
         this_t retval;
-        retval.window = window;
-        retval.minLuma = lumaMinimum;
-        retval.maxLuma = lumaMaximum;
+        retval.lumaMinMax = lumaMinMax;
         return retval;
     }
 
@@ -43,61 +41,52 @@ struct geom_meter {
     }
 
     float32_t computeLuma(
+        NBL_CONST_REF_ARG(MeteringWindow) window,
         NBL_REF_ARG(TexAccessor) tex,
-        uint32_t2 sampleCount,
-        uint32_t2 sampleIndex,
-        float32_t2 viewportSize
+        float32_t2 shiftedCoord
     )
     {
-        float32_t2 stride = window.meteringWindowScale / (sampleCount + float32_t2(1.0f, 1.0f));
-        float32_t2 samplePos = stride * sampleIndex;
-        float32_t2 uvPos = (samplePos + float32_t2(0.5f, 0.5f)) / viewportSize;
+        float32_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
         float32_t3 color = colorspace::oetf::sRGB(tex.get(uvPos));
         float32_t luma = dot(colorspace::sRGBtoXYZ[1], color);
 
-        luma = clamp(luma, minLuma, maxLuma);
+        luma = clamp(luma, lumaMinMax.x, lumaMinMax.y);
 
-        return log2(luma / minLuma) / log2(maxLuma / minLuma);
+        return log2(luma / lumaMinMax.x) / log2(lumaMinMax.y / lumaMinMax.x);
     }
 
     void gatherLuma(
+        NBL_CONST_REF_ARG(MeteringWindow) window,
         NBL_REF_ARG(ValueAccessor) val,
         NBL_REF_ARG(TexAccessor) tex,
         NBL_REF_ARG(SharedAccessor) sdata,
-        uint32_t2 sampleCount,
-        float32_t2 viewportSize
+        float32_t2 tileOffset
     )
     {
-
         uint32_t tid = workgroup::SubgroupContiguousIndex();
         uint32_t2 coord = {
             morton2d_decode_x(tid),
             morton2d_decode_y(tid)
         };
 
-        uint32_t2 sampleIndex = coord * GroupSize + float32_t2(glsl::gl_SubgroupID() + 1, glsl::gl_SubgroupInvocationID() + 1);
         float32_t luma = 0.0f;
+        luma = computeLuma(window, tex, tileOffset + (float32_t2)(coord));
+        float32_t lumaSum = reduction(luma, sdata);
 
-        if (sampleIndex.x <= sampleCount.x && sampleIndex.y <= sampleCount.y) {
-            luma = computeLuma(tex, sampleCount, sampleIndex, viewportSize);
-            float32_t lumaSum = reduction(luma, sdata);
-
-            if (tid == GroupSize - 1) {
-                uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
-                uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
+        if (tid == GroupSize - 1) {
+            uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
+            uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
 
-                uint32_t lumaSumBitPattern = uint32_t(clamp((lumaSum - log2(minLuma)) * (log2(maxLuma) - log2(minLuma)), 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
+            uint32_t lumaSumBitPattern = uint32_t(clamp((lumaSum - log2(lumaMinMax.x)) * (log2(lumaMinMax.y) - log2(lumaMinMax.x)), 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
 
-                uint32_t3 workgroupSize = glsl::gl_WorkGroupSize();
-                uint32_t workgroupIndex = dot(uint32_t3(workgroupSize.y * workgroupSize.z, workgroupSize.z, 1), glsl::gl_WorkGroupID());
+            uint32_t3 workgroupSize = glsl::gl_WorkGroupSize();
+            uint32_t workgroupIndex = dot(uint32_t3(workgroupSize.y * workgroupSize.z, workgroupSize.z, 1), glsl::gl_WorkGroupID());
 
-                val.atomicAdd(workgroupIndex & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern);
-            }
+            val.atomicAdd(workgroupIndex & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern);
         }
     }
 
-    MeteringWindow window;
-    float32_t minLuma, maxLuma;
+    float32_t2 lumaMinMax;
 };
 }
 }
diff --git a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
index e5e6a9a97c..824e31d68a 100644
--- a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
+++ b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
@@ -36,8 +36,8 @@ struct Reinhard
 		return rawCIEXYZcolor * colorMultiplier;
 	}
 
-	float_t3 keyAndManualLinearExposure;
-	float_t3 rcpWhite2;
+	float_t keyAndManualLinearExposure;
+	float_t rcpWhite2;
 };
 
 template<typename T = float32_t>

From 23771d1610b50e2af60b2f4661d11c06e50d854f Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Thu, 22 Aug 2024 23:02:11 +0530
Subject: [PATCH 21/56] Rename gatherLuma, add toXYZ method and templatize the
 float type

---
 .../builtin/hlsl/luma_meter/luma_meter.hlsl   | 36 +++++++++----------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
index 23deac8bbe..b0b19b3a82 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -11,9 +11,6 @@
 #include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl"
 #include "nbl/builtin/hlsl/type_traits.hlsl"
 #include "nbl/builtin/hlsl/math/morton.hlsl"
-#include "nbl/builtin/hlsl/colorspace/EOTF.hlsl"
-#include "nbl/builtin/hlsl/colorspace/OETF.hlsl"
-#include "nbl/builtin/hlsl/colorspace/encodeCIEXYZ.hlsl"
 #include "nbl/builtin/hlsl/luma_meter/common.hlsl"
 
 namespace nbl
@@ -25,42 +22,45 @@ namespace luma_meter
 
 template<uint32_t GroupSize, typename ValueAccessor, typename SharedAccessor, typename TexAccessor>
 struct geom_meter {
+    using float_t = typename SharedAccessor::type;
+    using float_t2 = typename conditional<is_same_v<float_t, float32_t>, float32_t2, float16_t2>::type;
+    using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
     using this_t = geom_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>;
 
-    static this_t create(float32_t2 lumaMinMax)
+    static this_t create(float_t2 lumaMinMax)
     {
         this_t retval;
         retval.lumaMinMax = lumaMinMax;
         return retval;
     }
 
-    float32_t reduction(float32_t value, NBL_REF_ARG(SharedAccessor) sdata)
+    float_t reduction(float_t value, NBL_REF_ARG(SharedAccessor) sdata)
     {
-        return workgroup::reduction < plus < float32_t >, GroupSize >::
+        return workgroup::reduction < plus < float_t >, GroupSize >::
             template __call <SharedAccessor>(value, sdata);
     }
 
-    float32_t computeLuma(
+    float_t computeLumaLog2(
         NBL_CONST_REF_ARG(MeteringWindow) window,
         NBL_REF_ARG(TexAccessor) tex,
-        float32_t2 shiftedCoord
+        float_t2 shiftedCoord
     )
     {
-        float32_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
-        float32_t3 color = colorspace::oetf::sRGB(tex.get(uvPos));
-        float32_t luma = dot(colorspace::sRGBtoXYZ[1], color);
+        float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
+        float_t3 color = tex.get(uvPos);
+        float_t luma = TexAccessor::toXYZ(color);
 
         luma = clamp(luma, lumaMinMax.x, lumaMinMax.y);
 
-        return log2(luma / lumaMinMax.x) / log2(lumaMinMax.y / lumaMinMax.x);
+        return max(log2(luma), log2(lumaMinMax.x));
     }
 
-    void gatherLuma(
+    void sampleLuma(
         NBL_CONST_REF_ARG(MeteringWindow) window,
         NBL_REF_ARG(ValueAccessor) val,
         NBL_REF_ARG(TexAccessor) tex,
         NBL_REF_ARG(SharedAccessor) sdata,
-        float32_t2 tileOffset
+        float_t2 tileOffset
     )
     {
         uint32_t tid = workgroup::SubgroupContiguousIndex();
@@ -69,9 +69,9 @@ struct geom_meter {
             morton2d_decode_y(tid)
         };
 
-        float32_t luma = 0.0f;
-        luma = computeLuma(window, tex, tileOffset + (float32_t2)(coord));
-        float32_t lumaSum = reduction(luma, sdata);
+        float_t luma = 0.0f;
+        luma = computeLumaLog2(window, tex, tileOffset + (float32_t2)(coord));
+        float_t lumaSum = reduction(luma, sdata);
 
         if (tid == GroupSize - 1) {
             uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
@@ -86,7 +86,7 @@ struct geom_meter {
         }
     }
 
-    float32_t2 lumaMinMax;
+    float_t2 lumaMinMax;
 };
 }
 }

From ac390393cca2c89237532b57f12d95cc5584f0be Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 27 Aug 2024 00:41:14 +0530
Subject: [PATCH 22/56] Add uploadFloat, downloadFloat and gatherLuma

---
 .../builtin/hlsl/luma_meter/luma_meter.hlsl   | 63 ++++++++++++++++---
 1 file changed, 56 insertions(+), 7 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
index b0b19b3a82..c39b2e3ab6 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -7,6 +7,7 @@
 
 #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
 #include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/subgroup_arithmetic.hlsl"
 #include "nbl/builtin/hlsl/workgroup/basic.hlsl"
 #include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl"
 #include "nbl/builtin/hlsl/type_traits.hlsl"
@@ -27,10 +28,11 @@ struct geom_meter {
     using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
     using this_t = geom_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>;
 
-    static this_t create(float_t2 lumaMinMax)
+    static this_t create(float_t2 lumaMinMax, float_t sampleCount)
     {
         this_t retval;
         retval.lumaMinMax = lumaMinMax;
+        retval.sampleCount = sampleCount;
         return retval;
     }
 
@@ -55,6 +57,34 @@ struct geom_meter {
         return max(log2(luma), log2(lumaMinMax.x));
     }
 
+    void uploadFloat(
+        NBL_REF_ARG(ValueAccessor) val_accessor,
+        uint32_t index,
+        float_t val,
+        float_t minLog2,
+        float_t rangeLog2
+    )
+    {
+        uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
+        uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
+
+        uint32_t lumaSumBitPattern = uint32_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
+
+        val_accessor.atomicAdd(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern);
+    }
+
+    float_t downloadFloat(
+        NBL_REF_ARG(ValueAccessor) val_accessor,
+        uint32_t index,
+        float_t minLog2,
+        float_t rangeLog2
+    )
+    {
+        float_t luma = (float_t)val.get(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1));
+        luma = luma / rangeLog2 + minLog2;
+        return luma;
+    }
+
     void sampleLuma(
         NBL_CONST_REF_ARG(MeteringWindow) window,
         NBL_REF_ARG(ValueAccessor) val,
@@ -74,18 +104,37 @@ struct geom_meter {
         float_t lumaSum = reduction(luma, sdata);
 
         if (tid == GroupSize - 1) {
-            uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
-            uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
-
-            uint32_t lumaSumBitPattern = uint32_t(clamp((lumaSum - log2(lumaMinMax.x)) * (log2(lumaMinMax.y) - log2(lumaMinMax.x)), 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
-
             uint32_t3 workgroupSize = glsl::gl_WorkGroupSize();
             uint32_t workgroupIndex = dot(uint32_t3(workgroupSize.y * workgroupSize.z, workgroupSize.z, 1), glsl::gl_WorkGroupID());
 
-            val.atomicAdd(workgroupIndex & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern);
+            uploadFloat(
+                val,
+                workgroupIndex,
+                lumaSum,
+                log2(lumaMinMax.x),
+                log2(lumaMinMax.y / lumaMinMax.x)
+            );
         }
     }
 
+    void gatherLuma(
+        NBL_REF_ARG(ValueAccessor) val
+    )
+    {
+        uint32_t tid = workgroup::SubgroupContiguousIndex();
+        float_t lumaSum = glsl::subgroupAdd(
+            downloadFloat(
+                val,
+                tid,
+                log2(lumaMinMax.x),
+                log2(lumaMinMax.y / lumaMinMax.x)
+            )
+        );
+
+        uploadFloat(val, 0, lumaSum, log2(lumaMinMax.x), log2(lumaMinMax.y / lumaMinMax.x));
+    }
+
+    float_t sampleCount;
     float_t2 lumaMinMax;
 };
 }

From 49a80499c4ee3c7b09ce20e1f7a995d63cc7a73d Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 27 Aug 2024 19:37:11 +0530
Subject: [PATCH 23/56] Normalize tileOffset and coord to uv before computing
 Luma

---
 .../nbl/builtin/hlsl/luma_meter/luma_meter.hlsl    | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
index c39b2e3ab6..6804c1d631 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -50,7 +50,7 @@ struct geom_meter {
     {
         float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
         float_t3 color = tex.get(uvPos);
-        float_t luma = TexAccessor::toXYZ(color);
+        float_t luma = (float_t)TexAccessor::toXYZ(color);
 
         luma = clamp(luma, lumaMinMax.x, lumaMinMax.y);
 
@@ -80,7 +80,7 @@ struct geom_meter {
         float_t rangeLog2
     )
     {
-        float_t luma = (float_t)val.get(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1));
+        float_t luma = (float_t)val_accessor.get(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1));
         luma = luma / rangeLog2 + minLog2;
         return luma;
     }
@@ -90,7 +90,8 @@ struct geom_meter {
         NBL_REF_ARG(ValueAccessor) val,
         NBL_REF_ARG(TexAccessor) tex,
         NBL_REF_ARG(SharedAccessor) sdata,
-        float_t2 tileOffset
+        float_t2 tileOffset,
+        float_t2 viewportSize
     )
     {
         uint32_t tid = workgroup::SubgroupContiguousIndex();
@@ -100,7 +101,8 @@ struct geom_meter {
         };
 
         float_t luma = 0.0f;
-        luma = computeLumaLog2(window, tex, tileOffset + (float32_t2)(coord));
+        float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
+        luma = computeLumaLog2(window, tex, shiftedCoord);
         float_t lumaSum = reduction(luma, sdata);
 
         if (tid == GroupSize - 1) {
@@ -117,7 +119,7 @@ struct geom_meter {
         }
     }
 
-    void gatherLuma(
+    float_t gatherLuma(
         NBL_REF_ARG(ValueAccessor) val
     )
     {
@@ -131,7 +133,7 @@ struct geom_meter {
             )
         );
 
-        uploadFloat(val, 0, lumaSum, log2(lumaMinMax.x), log2(lumaMinMax.y / lumaMinMax.x));
+        return lumaSum;
     }
 
     float_t sampleCount;

From 8a10ae2e12f36d48f39ff3350920d800da1cc47e Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Sun, 29 Sep 2024 18:16:56 +0100
Subject: [PATCH 24/56] Simplify return statement

---
 include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
index 6804c1d631..266d6e6a2a 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -81,8 +81,7 @@ struct geom_meter {
     )
     {
         float_t luma = (float_t)val_accessor.get(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1));
-        luma = luma / rangeLog2 + minLog2;
-        return luma;
+        return luma / rangeLog2 + minLog2;
     }
 
     void sampleLuma(

From 6b01b6ddd4e687684e6e7a5f8073f7e556ad6967 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Wed, 11 Dec 2024 00:26:02 +0000
Subject: [PATCH 25/56] Update submodule pointers

---
 3rdparty/dxc/dxc         | 2 +-
 3rdparty/libexpat        | 2 +-
 3rdparty/nbl_spirv_cross | 2 +-
 3rdparty/openexr         | 2 +-
 3rdparty/volk            | 2 +-
 examples_tests           | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc
index a08b6cbeb1..29a5e1258e 160000
--- a/3rdparty/dxc/dxc
+++ b/3rdparty/dxc/dxc
@@ -1 +1 @@
-Subproject commit a08b6cbeb1038d14d0586d10a8cfa507b2fda8eb
+Subproject commit 29a5e1258e2f01dd15ef1f58e24a02337c96c8f7
diff --git a/3rdparty/libexpat b/3rdparty/libexpat
index e2004f9195..39e487da35 160000
--- a/3rdparty/libexpat
+++ b/3rdparty/libexpat
@@ -1 +1 @@
-Subproject commit e2004f9195700bb8248c8c954578f14fda58be27
+Subproject commit 39e487da353b20bb3a724311d179ba0fddffc65b
diff --git a/3rdparty/nbl_spirv_cross b/3rdparty/nbl_spirv_cross
index f4accc2a4b..b52e6a55ca 160000
--- a/3rdparty/nbl_spirv_cross
+++ b/3rdparty/nbl_spirv_cross
@@ -1 +1 @@
-Subproject commit f4accc2a4b478c42038c920aa0e43a8aab7d135c
+Subproject commit b52e6a55ca2d9805a18dccfc45c7a2e692c1d8e1
diff --git a/3rdparty/openexr b/3rdparty/openexr
index fca936a964..824ed557b3 160000
--- a/3rdparty/openexr
+++ b/3rdparty/openexr
@@ -1 +1 @@
-Subproject commit fca936a964da5983daecdbed7cd249934701b41a
+Subproject commit 824ed557b3c59288a685356c708e5806b1122fe1
diff --git a/3rdparty/volk b/3rdparty/volk
index b6be5ba0af..efb96f9031 160000
--- a/3rdparty/volk
+++ b/3rdparty/volk
@@ -1 +1 @@
-Subproject commit b6be5ba0af5567974cc8a0261471573418f0f34f
+Subproject commit efb96f90317e1c902d6b45ae95d14e67779a2241
diff --git a/examples_tests b/examples_tests
index 8b6675b3ba..36633f5c2c 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 8b6675b3ba9fe1ca00f2c6573a4888abb8477da7
+Subproject commit 36633f5c2cae3e8e870a837c86e71f3a50061a3e

From f95f1c1e7eb5fe5c930b1c0badba345f4e27033e Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Wed, 11 Dec 2024 00:54:41 +0000
Subject: [PATCH 26/56] Update submodule pointer

---
 3rdparty/imgui | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/imgui b/3rdparty/imgui
index e489e40a85..a29e9dba30 160000
--- a/3rdparty/imgui
+++ b/3rdparty/imgui
@@ -1 +1 @@
-Subproject commit e489e40a853426767de9ce0637bc0c9ceb431c1e
+Subproject commit a29e9dba3012eca9f80bdc4c39ca61a1df8e7175

From 1a5827379821023273130a547b8ba50141cd85a9 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Fri, 13 Dec 2024 04:34:45 +0000
Subject: [PATCH 27/56] Update submodule pointer

---
 3rdparty/Vulkan-Headers | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/Vulkan-Headers b/3rdparty/Vulkan-Headers
index 2c823b7f27..31aa7f634b 160000
--- a/3rdparty/Vulkan-Headers
+++ b/3rdparty/Vulkan-Headers
@@ -1 +1 @@
-Subproject commit 2c823b7f27590ec0a489f7fbe14b154e13fa5cfb
+Subproject commit 31aa7f634b052d87ede4664053e85f3f4d1d50d3

From b6e1f57110c4e34715bd6c15223a1db9224c47ff Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Fri, 13 Dec 2024 04:46:17 +0000
Subject: [PATCH 28/56] Update submodule pointer

---
 3rdparty/volk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/3rdparty/volk b/3rdparty/volk
index efb96f9031..b6be5ba0af 160000
--- a/3rdparty/volk
+++ b/3rdparty/volk
@@ -1 +1 @@
-Subproject commit efb96f90317e1c902d6b45ae95d14e67779a2241
+Subproject commit b6be5ba0af5567974cc8a0261471573418f0f34f

From 5239c29945cd2f609d13f40c66af3dcc4bd2f6a2 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Tue, 14 Jan 2025 00:42:26 +0000
Subject: [PATCH 29/56] Update submodule pointer

---
 3rdparty/Vulkan-Headers   | 2 +-
 3rdparty/dxc/dxc          | 2 +-
 3rdparty/imgui            | 2 +-
 3rdparty/libexpat         | 2 +-
 3rdparty/nbl_spirv_cross  | 2 +-
 3rdparty/openexr          | 2 +-
 3rdparty/parallel-hashmap | 2 +-
 examples_tests            | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/3rdparty/Vulkan-Headers b/3rdparty/Vulkan-Headers
index 31aa7f634b..2c823b7f27 160000
--- a/3rdparty/Vulkan-Headers
+++ b/3rdparty/Vulkan-Headers
@@ -1 +1 @@
-Subproject commit 31aa7f634b052d87ede4664053e85f3f4d1d50d3
+Subproject commit 2c823b7f27590ec0a489f7fbe14b154e13fa5cfb
diff --git a/3rdparty/dxc/dxc b/3rdparty/dxc/dxc
index 5adc27f9e4..a08b6cbeb1 160000
--- a/3rdparty/dxc/dxc
+++ b/3rdparty/dxc/dxc
@@ -1 +1 @@
-Subproject commit 5adc27f9e42de7681d65a98873048af661b9b367
+Subproject commit a08b6cbeb1038d14d0586d10a8cfa507b2fda8eb
diff --git a/3rdparty/imgui b/3rdparty/imgui
index a29e9dba30..e489e40a85 160000
--- a/3rdparty/imgui
+++ b/3rdparty/imgui
@@ -1 +1 @@
-Subproject commit a29e9dba3012eca9f80bdc4c39ca61a1df8e7175
+Subproject commit e489e40a853426767de9ce0637bc0c9ceb431c1e
diff --git a/3rdparty/libexpat b/3rdparty/libexpat
index 39e487da35..e2004f9195 160000
--- a/3rdparty/libexpat
+++ b/3rdparty/libexpat
@@ -1 +1 @@
-Subproject commit 39e487da353b20bb3a724311d179ba0fddffc65b
+Subproject commit e2004f9195700bb8248c8c954578f14fda58be27
diff --git a/3rdparty/nbl_spirv_cross b/3rdparty/nbl_spirv_cross
index b52e6a55ca..f4accc2a4b 160000
--- a/3rdparty/nbl_spirv_cross
+++ b/3rdparty/nbl_spirv_cross
@@ -1 +1 @@
-Subproject commit b52e6a55ca2d9805a18dccfc45c7a2e692c1d8e1
+Subproject commit f4accc2a4b478c42038c920aa0e43a8aab7d135c
diff --git a/3rdparty/openexr b/3rdparty/openexr
index c8a74d9ac9..fca936a964 160000
--- a/3rdparty/openexr
+++ b/3rdparty/openexr
@@ -1 +1 @@
-Subproject commit c8a74d9ac97dd579a47a7913f361a87349c0fffd
+Subproject commit fca936a964da5983daecdbed7cd249934701b41a
diff --git a/3rdparty/parallel-hashmap b/3rdparty/parallel-hashmap
index 7684faf186..fd7b8fb87d 160000
--- a/3rdparty/parallel-hashmap
+++ b/3rdparty/parallel-hashmap
@@ -1 +1 @@
-Subproject commit 7684faf186806e2c88554a78188c18185b21f127
+Subproject commit fd7b8fb87d74cc990591c3443b2ef21e9e137500
diff --git a/examples_tests b/examples_tests
index 36633f5c2c..f79caed8b5 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 36633f5c2cae3e8e870a837c86e71f3a50061a3e
+Subproject commit f79caed8b54499c1a4e848672dec38ce85d9a184

From 06c915e42162869f11ae951b7a081c722505d4e8 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Tue, 21 Jan 2025 16:11:27 +0100
Subject: [PATCH 30/56] stop rolling back my modules!

---
 3rdparty/Vulkan-Headers | 2 +-
 3rdparty/imgui          | 2 +-
 3rdparty/imguizmo       | 2 +-
 3rdparty/openexr        | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/3rdparty/Vulkan-Headers b/3rdparty/Vulkan-Headers
index 2c823b7f27..31aa7f634b 160000
--- a/3rdparty/Vulkan-Headers
+++ b/3rdparty/Vulkan-Headers
@@ -1 +1 @@
-Subproject commit 2c823b7f27590ec0a489f7fbe14b154e13fa5cfb
+Subproject commit 31aa7f634b052d87ede4664053e85f3f4d1d50d3
diff --git a/3rdparty/imgui b/3rdparty/imgui
index e489e40a85..a29e9dba30 160000
--- a/3rdparty/imgui
+++ b/3rdparty/imgui
@@ -1 +1 @@
-Subproject commit e489e40a853426767de9ce0637bc0c9ceb431c1e
+Subproject commit a29e9dba3012eca9f80bdc4c39ca61a1df8e7175
diff --git a/3rdparty/imguizmo b/3rdparty/imguizmo
index 6f4b2197ef..b10e91756d 160000
--- a/3rdparty/imguizmo
+++ b/3rdparty/imguizmo
@@ -1 +1 @@
-Subproject commit 6f4b2197efd715d16b19775b00f36c6c6f5aacb6
+Subproject commit b10e91756d32395f5c1fefd417899b657ed7cb88
diff --git a/3rdparty/openexr b/3rdparty/openexr
index fca936a964..c8a74d9ac9 160000
--- a/3rdparty/openexr
+++ b/3rdparty/openexr
@@ -1 +1 @@
-Subproject commit fca936a964da5983daecdbed7cd249934701b41a
+Subproject commit c8a74d9ac97dd579a47a7913f361a87349c0fffd

From 90d20c44783c9f3837f554ae8a05beb1ecd9f956 Mon Sep 17 00:00:00 2001
From: devsh <devsh@devsh.eu>
Date: Tue, 21 Jan 2025 16:49:29 +0100
Subject: [PATCH 31/56] point submodule at head

---
 examples_tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples_tests b/examples_tests
index f79caed8b5..9e26a74aa1 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit f79caed8b54499c1a4e848672dec38ce85d9a184
+Subproject commit 9e26a74aa1bcbe5e26ee14a79d4f2ef9e2701e0d

From 4edd38c002531e3bbf55a8f0649af187223a1077 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Thu, 13 Mar 2025 11:57:14 +0000
Subject: [PATCH 32/56] Add capabilities for atomic ops

---
 include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl
index 2ecb08cdb2..973a313e9c 100644
--- a/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl
+++ b/include/nbl/builtin/hlsl/spirv_intrinsics/core.hlsl
@@ -61,37 +61,45 @@ pointer_t<StorageClass,T> copyObject([[vk::ext_reference]] T v);
 // Here's the thing with atomics, it's not only the data type that dictates whether you can do an atomic or not.
 // It's the storage class that has the most effect (shared vs storage vs image) and we can't check that easily
 template<typename T> // integers operate on 2s complement so same op for signed and unsigned
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_instruction(spv::OpAtomicIAdd)]]
 enable_if_t<is_same_v<T,uint32_t> || is_same_v<T,int32_t>, T> atomicIAdd([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
 template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_instruction(spv::OpAtomicIAdd)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint32_t> || is_same_v<T,int32_t>), T> atomicIAdd(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
 template<typename T> // integers operate on 2s complement so same op for signed and unsigned
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_capability(spv::CapabilityInt64Atomics)]]
 [[vk::ext_instruction(spv::OpAtomicIAdd)]]
 enable_if_t<is_same_v<T,uint64_t> || is_same_v<T,int64_t>, T> atomicIAdd([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
 template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_capability(spv::CapabilityInt64Atomics)]]
 [[vk::ext_instruction(spv::OpAtomicIAdd)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint64_t> || is_same_v<T,int64_t>), T> atomicIAdd(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
 template<typename T> // integers operate on 2s complement so same op for signed and unsigned
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_instruction(spv::OpAtomicISub)]]
 enable_if_t<is_same_v<T,uint32_t> || is_same_v<T,int32_t>, T> atomicISub([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
 template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_instruction(spv::OpAtomicISub)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint32_t> || is_same_v<T,int32_t>), T> atomicISub(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
 template<typename T> // integers operate on 2s complement so same op for signed and unsigned
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_capability(spv::CapabilityInt64Atomics)]]
 [[vk::ext_instruction(spv::OpAtomicISub)]]
 enable_if_t<is_same_v<T,uint64_t> || is_same_v<T,int64_t>, T> atomicISub([[vk::ext_reference]] T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);
 
 template<typename T, typename Ptr_T> // DXC Workaround
+[[vk::ext_capability(spv::CapabilityPhysicalStorageBufferAddresses)]]
 [[vk::ext_capability(spv::CapabilityInt64Atomics)]]
 [[vk::ext_instruction(spv::OpAtomicISub)]]
 enable_if_t<is_spirv_type_v<Ptr_T> && (is_same_v<T,uint64_t> || is_same_v<T,int64_t>), T> atomicISub(Ptr_T ptr, uint32_t memoryScope, uint32_t memorySemantics, T value);

From f1e3e9866682fc79fa830d4a1c888674e24f58f7 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Thu, 13 Mar 2025 11:58:01 +0000
Subject: [PATCH 33/56] Fix luma_meter workgroup index and gatherLuma normalization

---
 include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
index 266d6e6a2a..9808b9e26d 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -105,8 +105,8 @@ struct geom_meter {
         float_t lumaSum = reduction(luma, sdata);
 
         if (tid == GroupSize - 1) {
-            uint32_t3 workgroupSize = glsl::gl_WorkGroupSize();
-            uint32_t workgroupIndex = dot(uint32_t3(workgroupSize.y * workgroupSize.z, workgroupSize.z, 1), glsl::gl_WorkGroupID());
+            uint32_t3 workgroupCount = glsl::gl_NumWorkGroups();
+            uint32_t workgroupIndex = (workgroupCount.x * workgroupCount.y * workgroupCount.z) / 64;
 
             uploadFloat(
                 val,
@@ -122,8 +122,8 @@ struct geom_meter {
         NBL_REF_ARG(ValueAccessor) val
     )
     {
-        uint32_t tid = workgroup::SubgroupContiguousIndex();
-        float_t lumaSum = glsl::subgroupAdd(
+        uint32_t tid = glsl::gl_SubgroupInvocationID();
+        float_t luma = glsl::subgroupAdd(
             downloadFloat(
                 val,
                 tid,
@@ -132,7 +132,10 @@ struct geom_meter {
             )
         );
 
-        return lumaSum;
+        uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
+        uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
+
+        return (luma / (1 << fixedPointBitsLeft)) / sampleCount;
     }
 
     float_t sampleCount;

From f1b7d170718d1ba0d48eef0b69af842be0463bea Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Sun, 16 Mar 2025 11:07:47 +0000
Subject: [PATCH 34/56] Add median_luma_meter

---
 .../builtin/hlsl/luma_meter/luma_meter.hlsl   | 145 ++++++++++++++++++
 1 file changed, 145 insertions(+)

diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
index 9808b9e26d..c17a64c437 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -141,6 +141,151 @@ struct geom_meter {
     float_t sampleCount;
     float_t2 lumaMinMax;
 };
+
+template<uint32_t GroupSize, uint16_t BinCount, typename HistogramAccessor, typename SharedAccessor, typename TexAccessor>
+struct median_meter {
+    using int_t = typename SharedAccessor::type;
+    using float_t  = float32_t;
+    using float_t2 = typename conditional<is_same_v<float_t, float32_t>, float32_t2, float16_t2>::type;
+    using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
+    using this_t = median_meter<GroupSize, BinCount, HistogramAccessor, SharedAccessor, TexAccessor>;
+
+    static this_t create(float_t2 lumaMinMax, float_t sampleCount) {
+        this_t retval;
+        retval.lumaMinMax = lumaMinMax;
+        retval.sampleCount = sampleCount;
+        return retval;
+    }
+
+    int_t inclusive_scan(float_t value, NBL_REF_ARG(SharedAccessor) sdata) {
+        return workgroup::inclusive_scan < plus < int_t >, GroupSize >::
+            template __call <SharedAccessor>(value, sdata);
+    }
+
+    float_t computeLuma(
+        NBL_CONST_REF_ARG(MeteringWindow) window,
+        NBL_REF_ARG(TexAccessor) tex,
+        float_t2 shiftedCoord
+    ) {
+        float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
+        float_t3 color = tex.get(uvPos);
+        float_t luma = (float_t)TexAccessor::toXYZ(color);
+
+        return clamp(luma, lumaMinMax.x, lumaMinMax.y);
+    }
+
+    int_t float2Int(
+        float_t val,
+        float_t minLog2,
+        float_t rangeLog2
+    ) {
+        uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
+        uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
+
+        return int_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
+    }
+
+    float_t int2Float(
+        int_t val,
+        float_t minLog2,
+        float_t rangeLog2
+    ) {
+        return val / rangeLog2 + minLog2;
+    }
+
+    void sampleLuma(
+        NBL_CONST_REF_ARG(MeteringWindow) window,
+        NBL_REF_ARG(HistogramAccessor) histo,
+        NBL_REF_ARG(TexAccessor) tex,
+        NBL_REF_ARG(SharedAccessor) sdata,
+        float_t2 tileOffset,
+        float_t2 viewportSize
+    ) {
+        uint32_t tid = workgroup::SubgroupContiguousIndex();
+        
+        for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) {
+            sdata.set(vid, 0);
+        }
+
+        sdata.workgroupExecutionAndMemoryBarrier();
+
+        uint32_t2 coord = {
+            morton2d_decode_x(tid),
+            morton2d_decode_y(tid)
+        };
+
+        float_t luma = 0.0f;
+        float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
+        luma = computeLuma(window, tex, shiftedCoord);
+
+        float_t binSize = (lumaMinMax.y - lumaMinMax.x) / BinCount;
+        uint32_t binIndex = (uint32_t)((luma - lumaMinMax.x) / binSize);
+
+        sdata.atomicAdd(binIndex, float2Int(luma, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
+
+        sdata.workgroupExecutionAndMemoryBarrier();
+
+        float_t histogram_value;
+        sdata.get(tid, histogram_value);
+
+        sdata.workgroupExecutionAndMemoryBarrier();
+
+        float_t sum = inclusive_scan(histogram_value, sdata);
+        histo.atomicAdd(tid, float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
+
+        const bool is_last_wg_invocation = tid == (GroupSize - 1);
+        const static uint32_t RoundedBinCount = 1 + (BinCount - 1) / GroupSize;
+
+        for (int i = 1; i < RoundedBinCount; i++) {
+            uint32_t keyBucketStart = GroupSize * i;
+            uint32_t vid = tid + keyBucketStart;
+
+            // no if statement about the last iteration needed
+            if (is_last_wg_invocation) {
+                float_t beforeSum;
+                sdata.get(keyBucketStart, beforeSum);
+                sdata.set(keyBucketStart, beforeSum + sum);
+            }
+
+            // propagate last block tail to next block head and protect against subsequent scans stepping on each other's toes
+            sdata.workgroupExecutionAndMemoryBarrier();
+
+            // no aliasing anymore
+            float_t atVid;
+            sdata.get(vid, atVid);
+            sum = inclusive_scan(atVid, sdata);
+            if (vid < BinCount) {
+                histo.atomicAdd(vid, float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
+            }
+        }
+    }
+
+    float_t gatherLuma(
+        NBL_REF_ARG(HistogramAccessor) histo,
+        NBL_REF_ARG(SharedAccessor) sdata
+    ) {
+        uint32_t tid = workgroup::SubgroupContiguousIndex();
+
+        for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) {
+            sdata.set(
+                vid,
+                histo.get(vid & (BinCount - 1))
+            );
+        }
+
+        sdata.workgroupExecutionAndMemoryBarrier();
+
+        uint32_t percentile40, percentile60;
+        sdata.get(BinCount * 0.4, percentile40);
+        sdata.get(BinCount * 0.6, percentile60);
+
+        return (int2Float(percentile40, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x) + int2Float(percentile60, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)) / 2;
+    }
+
+    float_t sampleCount;
+    float_t2 lumaMinMax;
+};
+
 }
 }
 }

From 83ac633896008509ea16f8d896e4048f98eb888d Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Sun, 16 Mar 2025 11:49:58 +0000
Subject: [PATCH 35/56] Update submodule pointer

---
 examples_tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples_tests b/examples_tests
index 06dad8c118..498ffd21a0 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 06dad8c118027d6ebc8ee04e19340ba643079a63
+Subproject commit 498ffd21a06b9e9c74d20b37860421d17fe7cf49

From 2b5e502d23c14b8cba96cb8a7ff7a4b6d4d5b4e3 Mon Sep 17 00:00:00 2001
From: Nipun Garg <24457793+nipunG314@users.noreply.github.com>
Date: Mon, 17 Mar 2025 16:11:48 +0000
Subject: [PATCH 36/56] Make changes to luma_meter

---
 .../builtin/hlsl/luma_meter/luma_meter.hlsl   | 48 ++++++++-----------
 .../builtin/hlsl/tonemapper/operators.hlsl    | 20 +++++---
 2 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
index c17a64c437..20af804603 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -36,13 +36,13 @@ struct geom_meter {
         return retval;
     }
 
-    float_t reduction(float_t value, NBL_REF_ARG(SharedAccessor) sdata)
+    float_t __reduction(float_t value, NBL_REF_ARG(SharedAccessor) sdata)
     {
         return workgroup::reduction < plus < float_t >, GroupSize >::
             template __call <SharedAccessor>(value, sdata);
     }
 
-    float_t computeLumaLog2(
+    float_t __computeLumaLog2(
         NBL_CONST_REF_ARG(MeteringWindow) window,
         NBL_REF_ARG(TexAccessor) tex,
         float_t2 shiftedCoord
@@ -54,26 +54,26 @@ struct geom_meter {
 
         luma = clamp(luma, lumaMinMax.x, lumaMinMax.y);
 
-        return max(log2(luma), log2(lumaMinMax.x));
+        return log2(luma);
     }
 
-    void uploadFloat(
+    void __uploadFloat(
         NBL_REF_ARG(ValueAccessor) val_accessor,
-        uint32_t index,
         float_t val,
         float_t minLog2,
         float_t rangeLog2
     )
     {
         uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
+        uint32_t workgroupIndex = (workGroupCount.x * workGroupCount.y * workGroupCount.z) / 64;
         uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
 
         uint32_t lumaSumBitPattern = uint32_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
 
-        val_accessor.atomicAdd(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern);
+        val_accessor.atomicAdd(workgroupIndex & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern);
     }
 
-    float_t downloadFloat(
+    float_t __downloadFloat(
         NBL_REF_ARG(ValueAccessor) val_accessor,
         uint32_t index,
         float_t minLog2,
@@ -101,17 +101,13 @@ struct geom_meter {
 
         float_t luma = 0.0f;
         float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
-        luma = computeLumaLog2(window, tex, shiftedCoord);
-        float_t lumaSum = reduction(luma, sdata);
-
-        if (tid == GroupSize - 1) {
-            uint32_t3 workgroupCount = glsl::gl_NumWorkGroups();
-            uint32_t workgroupIndex = (workgroupCount.x * workgroupCount.y * workgroupCount.z) / 64;
+        float_t lumaLog2 = __computeLumaLog2(window, tex, shiftedCoord);
+        float_t lumaLog2Sum = __reduction(lumaLog2, sdata);
 
-            uploadFloat(
+        if (tid == 0) {
+            __uploadFloat(
                 val,
-                workgroupIndex,
-                lumaSum,
+                lumaLog2Sum,
                 log2(lumaMinMax.x),
                 log2(lumaMinMax.y / lumaMinMax.x)
             );
@@ -124,7 +120,7 @@ struct geom_meter {
     {
         uint32_t tid = glsl::gl_SubgroupInvocationID();
         float_t luma = glsl::subgroupAdd(
-            downloadFloat(
+            __downloadFloat(
                 val,
                 tid,
                 log2(lumaMinMax.x),
@@ -150,19 +146,18 @@ struct median_meter {
     using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
     using this_t = median_meter<GroupSize, BinCount, HistogramAccessor, SharedAccessor, TexAccessor>;
 
-    static this_t create(float_t2 lumaMinMax, float_t sampleCount) {
+    static this_t create(float_t2 lumaMinMax) {
         this_t retval;
         retval.lumaMinMax = lumaMinMax;
-        retval.sampleCount = sampleCount;
         return retval;
     }
 
-    int_t inclusive_scan(float_t value, NBL_REF_ARG(SharedAccessor) sdata) {
+    int_t __inclusive_scan(float_t value, NBL_REF_ARG(SharedAccessor) sdata) {
         return workgroup::inclusive_scan < plus < int_t >, GroupSize >::
             template __call <SharedAccessor>(value, sdata);
     }
 
-    float_t computeLuma(
+    float_t __computeLuma(
         NBL_CONST_REF_ARG(MeteringWindow) window,
         NBL_REF_ARG(TexAccessor) tex,
         float_t2 shiftedCoord
@@ -174,7 +169,7 @@ struct median_meter {
         return clamp(luma, lumaMinMax.x, lumaMinMax.y);
     }
 
-    int_t float2Int(
+    int_t __float2Int(
         float_t val,
         float_t minLog2,
         float_t rangeLog2
@@ -185,7 +180,7 @@ struct median_meter {
         return int_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
     }
 
-    float_t int2Float(
+    float_t __int2Float(
         int_t val,
         float_t minLog2,
         float_t rangeLog2
@@ -216,7 +211,7 @@ struct median_meter {
 
         float_t luma = 0.0f;
         float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
-        luma = computeLuma(window, tex, shiftedCoord);
+        luma = __computeLuma(window, tex, shiftedCoord);
 
         float_t binSize = (lumaMinMax.y - lumaMinMax.x) / BinCount;
         uint32_t binIndex = (uint32_t)((luma - lumaMinMax.x) / binSize);
@@ -255,7 +250,7 @@ struct median_meter {
             sdata.get(vid, atVid);
             sum = inclusive_scan(atVid, sdata);
             if (vid < BinCount) {
-                histo.atomicAdd(vid, float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
+                histo.atomicAdd(vid, __float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
             }
         }
     }
@@ -279,10 +274,9 @@ struct median_meter {
         sdata.get(BinCount * 0.4, percentile40);
         sdata.get(BinCount * 0.6, percentile60);
 
-        return (int2Float(percentile40, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x) + int2Float(percentile60, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)) / 2;
+        return (__int2Float(percentile40, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x) + __int2Float(percentile60, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)) / 2;
     }
 
-    float_t sampleCount;
     float_t2 lumaMinMax;
 };
 
diff --git a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
index 824e31d68a..46d241c76c 100644
--- a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
+++ b/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
@@ -19,20 +19,25 @@ template<typename T = float32_t>
 struct Reinhard
 {
 	using float_t = enable_if_t<is_floating_point<T>::value, T>;
-	using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
+	using float_t3 = vector<float_t, 3>;
 	using this_t = Reinhard<float_t>;
+
 	static this_t create(float_t EV, float_t key = 0.18f, float_t WhitePointRelToEV = 16.f)
 	{
 		this_t retval;
+
+		const float_t unit = 1.0;
 		retval.keyAndManualLinearExposure = key * exp2(EV);
-		retval.rcpWhite2 = 1.f / (WhitePointRelToEV * WhitePointRelToEV);
+		retval.rcpWhite2 = unit / (WhitePointRelToEV * WhitePointRelToEV);
+
 		return retval;
 	}
 
 	float_t3 operator()(float_t3 rawCIEXYZcolor) {
+		const float_t unit = 1.0;
 		float_t exposureFactors = keyAndManualLinearExposure;
 		float_t exposedLuma = rawCIEXYZcolor.y * exposureFactors;
-		float_t colorMultiplier = (exposureFactors * (1.0 + exposedLuma * rcpWhite2) / (1.0 + exposedLuma));
+		float_t colorMultiplier = (exposureFactors * (unit + exposedLuma * rcpWhite2) / (unit + exposedLuma));
 		return rawCIEXYZcolor * colorMultiplier;
 	}
 
@@ -44,8 +49,8 @@ template<typename T = float32_t>
 struct ACES
 {
 	using float_t = enable_if_t<is_floating_point<T>::value, T>;
-	using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
-	using float_t3x3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3x3, float16_t3x3>::type;
+	using float_t3 = vector<float_t, 3>;
+	using float_t3x3 = matrix<float_t, 3, 3>;
 
 	using this_t = ACES<T>;
 	static this_t create(float_t EV, float_t key = 0.18f, float_t Contrast = 1.f) {
@@ -57,9 +62,10 @@ struct ACES
 	}
 
 	float_t3 operator()(float_t3 rawCIEXYZcolor) {
+		const float_t unit = 1.0;
 		float_t3 tonemapped = rawCIEXYZcolor;
-		if (tonemapped.y > 1.175494351e-38)
-			tonemapped *= exp2(log2(tonemapped.y) * (gamma - 1.0) + (exposure) * gamma);
+		if (tonemapped.y > bit_cast<float_t>(numeric_limits<float_t>::min))
+			tonemapped *= exp2(log2(tonemapped.y) * (gamma - unit) + (exposure) * gamma);
 
 		// XYZ => RRT_SAT
 		// this seems to be a matrix for some hybrid colorspace, coefficients are somewhere inbetween BT2020 and ACEScc(t)

From c1524a9ecd0ddf480f4bbee3df1988c20ee54324 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 12 Jan 2026 10:54:14 +0700
Subject: [PATCH 37/56] refactor morton usage

---
 include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
index 20af804603..1bca324d13 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
@@ -94,10 +94,7 @@ struct geom_meter {
     )
     {
         uint32_t tid = workgroup::SubgroupContiguousIndex();
-        uint32_t2 coord = {
-            morton2d_decode_x(tid),
-            morton2d_decode_y(tid)
-        };
+        uint32_t2 coord = math::Morton<uint32_t>::decode2d(tid);
 
         float_t luma = 0.0f;
         float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
@@ -204,10 +201,7 @@ struct median_meter {
 
         sdata.workgroupExecutionAndMemoryBarrier();
 
-        uint32_t2 coord = {
-            morton2d_decode_x(tid),
-            morton2d_decode_y(tid)
-        };
+        uint32_t2 coord = math::Morton<uint32_t>::decode2d(tid);
 
         float_t luma = 0.0f;
         float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;

From d8a2b81c9c830db5175d77180980f93aeb460541 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 12 Jan 2026 12:08:21 +0700
Subject: [PATCH 38/56] split out luma_meter and tonemapper operators into
 their own separate files

---
 .../builtin/hlsl/luma_meter/geom_mean.hlsl    | 143 ++++++++++++++++++
 .../{luma_meter.hlsl => histogram.hlsl}       | 138 ++---------------
 .../{operators.hlsl => operators/aces.hlsl}   |  41 +----
 .../hlsl/tonemapper/operators/reinhard.hlsl   |  54 +++++++
 src/nbl/builtin/CMakeLists.txt                |   6 +-
 5 files changed, 224 insertions(+), 158 deletions(-)
 create mode 100644 include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
 rename include/nbl/builtin/hlsl/luma_meter/{luma_meter.hlsl => histogram.hlsl} (59%)
 rename include/nbl/builtin/hlsl/tonemapper/{operators.hlsl => operators/aces.hlsl} (71%)
 create mode 100644 include/nbl/builtin/hlsl/tonemapper/operators/reinhard.hlsl

diff --git a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
new file mode 100644
index 0000000000..7c85f786fa
--- /dev/null
+++ b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
@@ -0,0 +1,143 @@
+// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_BUILTIN_HLSL_LUMA_METER_GEOM_MEAN_INCLUDED_ // guard is specific to geom_mean.hlsl so it cannot mask another luma_meter header
+#define _NBL_BUILTIN_HLSL_LUMA_METER_GEOM_MEAN_INCLUDED_
+
+#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
+#include "nbl/builtin/hlsl/glsl_compat/subgroup_arithmetic.hlsl"
+#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
+#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl"
+#include "nbl/builtin/hlsl/type_traits.hlsl"
+#include "nbl/builtin/hlsl/math/morton.hlsl"
+#include "nbl/builtin/hlsl/luma_meter/common.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace luma_meter
+{
+
+template<uint32_t GroupSize, typename ValueAccessor, typename SharedAccessor, typename TexAccessor>
+struct geom_meter // Luma meter: sums log2(luma) over a workgroup, accumulates the sums through ValueAccessor atomics, and averages them in gatherLuma.
+{
+    using float_t = typename SharedAccessor::type; // scalar type is taken from the shared accessor
+    using float_t2 = typename conditional<is_same_v<float_t, float32_t>, float32_t2, float16_t2>::type; // float32_t2 when float_t is float32_t, float16_t2 otherwise
+    using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type; // same selection for 3-component vectors
+    using this_t = geom_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>;
+
+    static this_t create(float_t2 lumaMinMax, float_t sampleCount) // Factory: stores the clamp range (x = min, y = max) and the divisor used by gatherLuma.
+    {
+        this_t retval;
+        retval.lumaMinMax = lumaMinMax;
+        retval.sampleCount = sampleCount;
+        return retval;
+    }
+
+    float_t __reduction(float_t value, NBL_REF_ARG(SharedAccessor) sdata) // Internal: additive workgroup::reduction of `value` over GroupSize invocations, using sdata.
+    {
+        return workgroup::reduction < plus < float_t >, GroupSize >::
+            template __call <SharedAccessor>(value, sdata);
+    }
+
+    float_t __computeLumaLog2( // Internal: samples the texture for one invocation and returns log2 of its clamped luma.
+        NBL_CONST_REF_ARG(MeteringWindow) window,
+        NBL_REF_ARG(TexAccessor) tex,
+        float_t2 shiftedCoord
+    )
+    {
+        float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset; // map the coordinate into the metering window
+        float_t3 color = tex.get(uvPos);
+        float_t luma = (float_t)TexAccessor::toXYZ(color); // NOTE(review): toXYZ result is cast to a scalar, presumably it yields the Y (luminance) term - confirm
+
+        luma = clamp(luma, lumaMinMax.x, lumaMinMax.y); // keeps the log2 below within [log2(min), log2(max)]
+
+        return log2(luma);
+    }
+
+    void __uploadFloat( // Internal: converts `val` to an unsigned integer and atomically adds it into val_accessor.
+        NBL_REF_ARG(ValueAccessor) val_accessor,
+        float_t val,
+        float_t minLog2,
+        float_t rangeLog2
+    )
+    {
+        uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
+        uint32_t workgroupIndex = (workGroupCount.x * workGroupCount.y * workGroupCount.z) / 64; // NOTE(review): derived from the dispatch-wide workgroup count only, so it looks identical for every workgroup - confirm this is the intended slot
+        uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); // NOTE(review): this is >= 32 whenever ceil(log2(total workgroups)) <= gl_SubgroupSizeLog2(), which would overflow the 32-bit shifts below - confirm
+
+        uint32_t lumaSumBitPattern = uint32_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1))); // offset by minLog2, multiplied by rangeLog2, clamped to [0, 2^bits - 1]; NOTE(review): clamped to but never scaled by 2^bits, while gatherLuma divides by 2^bits - confirm
+
+        val_accessor.atomicAdd(workgroupIndex & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern); // slot index is wrapped to the subgroup size
+    }
+
+    float_t __downloadFloat( // Internal: reads one accumulator slot and applies the algebraic inverse of the mapping used in __uploadFloat.
+        NBL_REF_ARG(ValueAccessor) val_accessor,
+        uint32_t index,
+        float_t minLog2,
+        float_t rangeLog2
+    )
+    {
+        float_t luma = (float_t)val_accessor.get(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1)); // slot index is wrapped to the subgroup size
+        return luma / rangeLog2 + minLog2;
+    }
+
+    void sampleLuma( // Per-workgroup pass: every invocation computes log2(luma), the workgroup sums them, invocation 0 uploads the sum. NOTE(review): the local `luma` in the body is never read.
+        NBL_CONST_REF_ARG(MeteringWindow) window,
+        NBL_REF_ARG(ValueAccessor) val,
+        NBL_REF_ARG(TexAccessor) tex,
+        NBL_REF_ARG(SharedAccessor) sdata,
+        float_t2 tileOffset,
+        float_t2 viewportSize
+    )
+    {
+        uint32_t tid = workgroup::SubgroupContiguousIndex();
+        uint32_t2 coord = math::Morton<uint32_t>::decode2d(tid);
+
+        float_t luma = 0.0f;
+        float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
+        float_t lumaLog2 = __computeLumaLog2(window, tex, shiftedCoord);
+        float_t lumaLog2Sum = __reduction(lumaLog2, sdata); // sum of log2(luma) across the workgroup
+
+        if (tid == 0) { // a single invocation per workgroup performs the atomic add
+            __uploadFloat(
+                val,
+                lumaLog2Sum,
+                log2(lumaMinMax.x),
+                log2(lumaMinMax.y / lumaMinMax.x)
+            );
+        }
+    }
+
+    float_t gatherLuma( // Gather pass: each subgroup invocation downloads one slot, the values are summed across the subgroup and then averaged.
+        NBL_REF_ARG(ValueAccessor) val
+    )
+    {
+        uint32_t tid = glsl::gl_SubgroupInvocationID();
+        float_t luma = glsl::subgroupAdd(
+            __downloadFloat(
+                val,
+                tid,
+                log2(lumaMinMax.x),
+                log2(lumaMinMax.y / lumaMinMax.x)
+            )
+        );
+
+        uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
+        uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); // same expression as in __uploadFloat; the two must stay in sync
+
+        return (luma / (1 << fixedPointBitsLeft)) / sampleCount; // divides by 2^fixedPointBitsLeft and then by the sample count given to create()
+    }
+
+    float_t sampleCount; // divisor applied in gatherLuma
+    float_t2 lumaMinMax; // x = minimum luma, y = maximum luma used for clamping
+};
+
+}
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
similarity index 59%
rename from include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
rename to include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
index 1bca324d13..51c27c8e9e 100644
--- a/include/nbl/builtin/hlsl/luma_meter/luma_meter.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
@@ -21,135 +21,24 @@ namespace hlsl
 namespace luma_meter
 {
 
-template<uint32_t GroupSize, typename ValueAccessor, typename SharedAccessor, typename TexAccessor>
-struct geom_meter {
-    using float_t = typename SharedAccessor::type;
-    using float_t2 = typename conditional<is_same_v<float_t, float32_t>, float32_t2, float16_t2>::type;
-    using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
-    using this_t = geom_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>;
-
-    static this_t create(float_t2 lumaMinMax, float_t sampleCount)
-    {
-        this_t retval;
-        retval.lumaMinMax = lumaMinMax;
-        retval.sampleCount = sampleCount;
-        return retval;
-    }
-
-    float_t __reduction(float_t value, NBL_REF_ARG(SharedAccessor) sdata)
-    {
-        return workgroup::reduction < plus < float_t >, GroupSize >::
-            template __call <SharedAccessor>(value, sdata);
-    }
-
-    float_t __computeLumaLog2(
-        NBL_CONST_REF_ARG(MeteringWindow) window,
-        NBL_REF_ARG(TexAccessor) tex,
-        float_t2 shiftedCoord
-    )
-    {
-        float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
-        float_t3 color = tex.get(uvPos);
-        float_t luma = (float_t)TexAccessor::toXYZ(color);
-
-        luma = clamp(luma, lumaMinMax.x, lumaMinMax.y);
-
-        return log2(luma);
-    }
-
-    void __uploadFloat(
-        NBL_REF_ARG(ValueAccessor) val_accessor,
-        float_t val,
-        float_t minLog2,
-        float_t rangeLog2
-    )
-    {
-        uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
-        uint32_t workgroupIndex = (workGroupCount.x * workGroupCount.y * workGroupCount.z) / 64;
-        uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
-
-        uint32_t lumaSumBitPattern = uint32_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
-
-        val_accessor.atomicAdd(workgroupIndex & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern);
-    }
-
-    float_t __downloadFloat(
-        NBL_REF_ARG(ValueAccessor) val_accessor,
-        uint32_t index,
-        float_t minLog2,
-        float_t rangeLog2
-    )
-    {
-        float_t luma = (float_t)val_accessor.get(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1));
-        return luma / rangeLog2 + minLog2;
-    }
-
-    void sampleLuma(
-        NBL_CONST_REF_ARG(MeteringWindow) window,
-        NBL_REF_ARG(ValueAccessor) val,
-        NBL_REF_ARG(TexAccessor) tex,
-        NBL_REF_ARG(SharedAccessor) sdata,
-        float_t2 tileOffset,
-        float_t2 viewportSize
-    )
-    {
-        uint32_t tid = workgroup::SubgroupContiguousIndex();
-        uint32_t2 coord = math::Morton<uint32_t>::decode2d(tid);
-
-        float_t luma = 0.0f;
-        float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
-        float_t lumaLog2 = __computeLumaLog2(window, tex, shiftedCoord);
-        float_t lumaLog2Sum = __reduction(lumaLog2, sdata);
-
-        if (tid == 0) {
-            __uploadFloat(
-                val,
-                lumaLog2Sum,
-                log2(lumaMinMax.x),
-                log2(lumaMinMax.y / lumaMinMax.x)
-            );
-        }
-    }
-
-    float_t gatherLuma(
-        NBL_REF_ARG(ValueAccessor) val
-    )
-    {
-        uint32_t tid = glsl::gl_SubgroupInvocationID();
-        float_t luma = glsl::subgroupAdd(
-            __downloadFloat(
-                val,
-                tid,
-                log2(lumaMinMax.x),
-                log2(lumaMinMax.y / lumaMinMax.x)
-            )
-        );
-
-        uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
-        uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
-
-        return (luma / (1 << fixedPointBitsLeft)) / sampleCount;
-    }
-
-    float_t sampleCount;
-    float_t2 lumaMinMax;
-};
-
 template<uint32_t GroupSize, uint16_t BinCount, typename HistogramAccessor, typename SharedAccessor, typename TexAccessor>
-struct median_meter {
+struct median_meter
+{
     using int_t = typename SharedAccessor::type;
     using float_t  = float32_t;
     using float_t2 = typename conditional<is_same_v<float_t, float32_t>, float32_t2, float16_t2>::type;
     using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
     using this_t = median_meter<GroupSize, BinCount, HistogramAccessor, SharedAccessor, TexAccessor>;
 
-    static this_t create(float_t2 lumaMinMax) {
+    static this_t create(float_t2 lumaMinMax)
+    {
         this_t retval;
         retval.lumaMinMax = lumaMinMax;
         return retval;
     }
 
-    int_t __inclusive_scan(float_t value, NBL_REF_ARG(SharedAccessor) sdata) {
+    int_t __inclusive_scan(float_t value, NBL_REF_ARG(SharedAccessor) sdata)
+    {
         return workgroup::inclusive_scan < plus < int_t >, GroupSize >::
             template __call <SharedAccessor>(value, sdata);
     }
@@ -158,7 +47,8 @@ struct median_meter {
         NBL_CONST_REF_ARG(MeteringWindow) window,
         NBL_REF_ARG(TexAccessor) tex,
         float_t2 shiftedCoord
-    ) {
+    )
+    {
         float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
         float_t3 color = tex.get(uvPos);
         float_t luma = (float_t)TexAccessor::toXYZ(color);
@@ -170,7 +60,8 @@ struct median_meter {
         float_t val,
         float_t minLog2,
         float_t rangeLog2
-    ) {
+    )
+    {
         uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
         uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
 
@@ -181,7 +72,8 @@ struct median_meter {
         int_t val,
         float_t minLog2,
         float_t rangeLog2
-    ) {
+    )
+    {
         return val / rangeLog2 + minLog2;
     }
 
@@ -192,7 +84,8 @@ struct median_meter {
         NBL_REF_ARG(SharedAccessor) sdata,
         float_t2 tileOffset,
         float_t2 viewportSize
-    ) {
+    )
+    {
         uint32_t tid = workgroup::SubgroupContiguousIndex();
         
         for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) {
@@ -252,7 +145,8 @@ struct median_meter {
     float_t gatherLuma(
         NBL_REF_ARG(HistogramAccessor) histo,
         NBL_REF_ARG(SharedAccessor) sdata
-    ) {
+    )
+    {
         uint32_t tid = workgroup::SubgroupContiguousIndex();
 
         for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) {
diff --git a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators/aces.hlsl
similarity index 71%
rename from include/nbl/builtin/hlsl/tonemapper/operators.hlsl
rename to include/nbl/builtin/hlsl/tonemapper/operators/aces.hlsl
index 46d241c76c..b2e0e4b053 100644
--- a/include/nbl/builtin/hlsl/tonemapper/operators.hlsl
+++ b/include/nbl/builtin/hlsl/tonemapper/operators/aces.hlsl
@@ -1,4 +1,4 @@
-// Copyright (C) 2018-2024 - DevSH Graphics Programming Sp. z O.O.
+// Copyright (C) 2018-2026 - DevSH Graphics Programming Sp. z O.O.
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
@@ -7,6 +7,7 @@
 
 #include "nbl/builtin/hlsl/cpp_compat.hlsl"
 #include "nbl/builtin/hlsl/type_traits.hlsl"
+#include "nbl/builtin/hlsl/concepts/core.hlsl"
 
 namespace nbl
 {
@@ -15,37 +16,7 @@ namespace hlsl
 namespace tonemapper
 {
 
-template<typename T = float32_t>
-struct Reinhard
-{
-	using float_t = enable_if_t<is_floating_point<T>::value, T>;
-	using float_t3 = vector<float_t, 3>;
-	using this_t = Reinhard<float_t>;
-
-	static this_t create(float_t EV, float_t key = 0.18f, float_t WhitePointRelToEV = 16.f)
-	{
-		this_t retval;
-
-		const float_t unit = 1.0;
-		retval.keyAndManualLinearExposure = key * exp2(EV);
-		retval.rcpWhite2 = unit / (WhitePointRelToEV * WhitePointRelToEV);
-
-		return retval;
-	}
-
-	float_t3 operator()(float_t3 rawCIEXYZcolor) {
-		const float_t unit = 1.0;
-		float_t exposureFactors = keyAndManualLinearExposure;
-		float_t exposedLuma = rawCIEXYZcolor.y * exposureFactors;
-		float_t colorMultiplier = (exposureFactors * (unit + exposedLuma * rcpWhite2) / (unit + exposedLuma));
-		return rawCIEXYZcolor * colorMultiplier;
-	}
-
-	float_t keyAndManualLinearExposure;
-	float_t rcpWhite2;
-};
-
-template<typename T = float32_t>
+template<typename T NBL_PRIMARY_REQUIRES(concepts::FloatingPointLikeScalar<T>)
 struct ACES
 {
 	using float_t = enable_if_t<is_floating_point<T>::value, T>;
@@ -53,7 +24,8 @@ struct ACES
 	using float_t3x3 = matrix<float_t, 3, 3>;
 
 	using this_t = ACES<T>;
-	static this_t create(float_t EV, float_t key = 0.18f, float_t Contrast = 1.f) {
+	static this_t create(float_t EV, float_t key = 0.18f, float_t Contrast = 1.f)
+    {
 		this_t retval;
 		retval.gamma = Contrast;
 		const float_t reinhardMatchCorrection = 0.77321666f; // middle grays get exposed to different values between tonemappers given the same key
@@ -61,7 +33,8 @@ struct ACES
 		return retval;
 	}
 
-	float_t3 operator()(float_t3 rawCIEXYZcolor) {
+	float_t3 operator()(float_t3 rawCIEXYZcolor)
+    {
 		const float_t unit = 1.0;
 		float_t3 tonemapped = rawCIEXYZcolor;
 		if (tonemapped.y > bit_cast<float_t>(numeric_limits<float_t>::min))
diff --git a/include/nbl/builtin/hlsl/tonemapper/operators/reinhard.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators/reinhard.hlsl
new file mode 100644
index 0000000000..de73959f86
--- /dev/null
+++ b/include/nbl/builtin/hlsl/tonemapper/operators/reinhard.hlsl
@@ -0,0 +1,54 @@
+// Copyright (C) 2018-2026 - DevSH Graphics Programming Sp. z O.O.
+// This file is part of the "Nabla Engine".
+// For conditions of distribution and use, see copyright notice in nabla.h
+
+#ifndef _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_REINHARD_INCLUDED_ // guard is specific to reinhard.hlsl so it cannot mask another tonemapper operator header
+#define _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_REINHARD_INCLUDED_
+
+#include "nbl/builtin/hlsl/cpp_compat.hlsl"
+#include "nbl/builtin/hlsl/type_traits.hlsl"
+#include "nbl/builtin/hlsl/concepts/core.hlsl"
+
+namespace nbl
+{
+namespace hlsl
+{
+namespace tonemapper
+{
+
+template<typename T NBL_PRIMARY_REQUIRES(concepts::FloatingPointLikeScalar<T>)
+struct Reinhard // Tonemapping operator: scales a colour by a factor computed from its .y component, the stored exposure and the stored reciprocal squared white point.
+{
+	using float_t = enable_if_t<is_floating_point<T>::value, T>; // only resolves for floating-point T
+	using float_t3 = vector<float_t, 3>;
+	using this_t = Reinhard<float_t>;
+
+	static this_t create(float_t EV, float_t key = 0.18f, float_t WhitePointRelToEV = 16.f) // Factory: precomputes the two factors used by operator().
+	{
+		this_t retval;
+
+		const float_t unit = 1.0; // typed constant keeps the arithmetic in float_t
+		retval.keyAndManualLinearExposure = key * exp2(EV); // key * 2^EV
+		retval.rcpWhite2 = unit / (WhitePointRelToEV * WhitePointRelToEV); // 1 / WhitePointRelToEV^2
+
+		return retval;
+	}
+
+	float_t3 operator()(float_t3 rawCIEXYZcolor) // Returns the input multiplied by the tonemapping factor; the parameter name suggests CIE XYZ input, so .y is presumably luminance - confirm.
+    {
+		const float_t unit = 1.0; // typed constant keeps the arithmetic in float_t
+		float_t exposureFactors = keyAndManualLinearExposure;
+		float_t exposedLuma = rawCIEXYZcolor.y * exposureFactors; // .y component after exposure
+		float_t colorMultiplier = (exposureFactors * (unit + exposedLuma * rcpWhite2) / (unit + exposedLuma)); // exposure * (1 + L * rcpWhite2) / (1 + L), with L = exposedLuma
+		return rawCIEXYZcolor * colorMultiplier; // all three components are scaled by the same factor
+	}
+
+	float_t keyAndManualLinearExposure; // key * 2^EV, set by create()
+	float_t rcpWhite2; // 1 / WhitePointRelToEV^2, set by create()
+};
+
+}
+}
+}
+
+#endif
\ No newline at end of file
diff --git a/src/nbl/builtin/CMakeLists.txt b/src/nbl/builtin/CMakeLists.txt
index d5d293a564..00c5f021d5 100644
--- a/src/nbl/builtin/CMakeLists.txt
+++ b/src/nbl/builtin/CMakeLists.txt
@@ -29,9 +29,11 @@ LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/struct_declare.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/bda_accessor.hlsl")
 # luma metering
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/common.hlsl")
-LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/luma_meter.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/geom_mean.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/luma_meter/histogram.hlsl")
 # tonemapper
-LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tonemapper/operators.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tonemapper/operators/reinhard.hlsl")
+LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/tonemapper/operators/aces.hlsl")
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "hlsl/bda/legacy_bda_accessor.hlsl")
 # bump mapping
 LIST_BUILTIN_RESOURCE(NBL_RESOURCES_TO_EMBED "glsl/bump_mapping/fragment.glsl") # TODO: rename to `frag.glsl`

From 918c6a4e04d492019e797080e88b4a3cfc910886 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 12 Jan 2026 14:49:34 +0700
Subject: [PATCH 39/56] remove obsolete morton code

---
 include/nbl/builtin/hlsl/math/morton.hlsl | 140 ----------------------
 1 file changed, 140 deletions(-)

diff --git a/include/nbl/builtin/hlsl/math/morton.hlsl b/include/nbl/builtin/hlsl/math/morton.hlsl
index 203eca80b3..7af5aadb8b 100644
--- a/include/nbl/builtin/hlsl/math/morton.hlsl
+++ b/include/nbl/builtin/hlsl/math/morton.hlsl
@@ -13,146 +13,6 @@ namespace hlsl
 namespace math
 {
 
-// TODO: this is is the old stuff before merging morton pr (I think), I don't know if it's been replaced
-namespace impl
-{
-
-template <typename T>
-NBL_CONSTEXPR_FUNC T morton2d_mask(uint16_t _n)
-{
-    const static uint64_t mask[5] =
-    {
-        0x5555555555555555ull,
-        0x3333333333333333ull,
-        0x0F0F0F0F0F0F0F0Full,
-        0x00FF00FF00FF00FFull,
-        0x0000FFFF0000FFFFull
-    };
-    return nbl::hlsl::_static_cast<T>(mask[_n]);
-}
-
-template <typename T>
-NBL_CONSTEXPR_FUNC T morton3d_mask(uint16_t _n)
-{
-    const static uint64_t mask[5] =
-    {
-        0x1249249249249249ull,
-        0x10C30C30C30C30C3ull,
-        0x010F00F00F00F00Full,
-        0x001F0000FF0000FFull,
-        0x001F00000000FFFFull
-    };
-    return nbl::hlsl::_static_cast<T>(mask[_n]);
-}
-template <typename T>
-NBL_CONSTEXPR_FUNC T morton4d_mask(uint16_t _n)
-{
-    const static uint64_t mask[4] =
-    {
-        0x1111111111111111ull,
-        0x0303030303030303ull,
-        0x000F000F000F000Full,
-        0x000000FF000000FFull
-    };
-    return nbl::hlsl::_static_cast<T>(mask[_n]);
-}
-
-template <typename T, uint32_t bitDepth>
-inline T morton2d_decode(T x)
-{
-    x = x & morton2d_mask<T>(0);
-    x = (x | (x >> 1)) & morton2d_mask<T>(1);
-    x = (x | (x >> 2)) & morton2d_mask<T>(2);
-    if (bitDepth > 8u)
-    {
-        x = (x | (x >> 4)) & morton2d_mask<T>(3);
-    }
-    if (bitDepth > 16u)
-    {
-        x = (x | (x >> 8)) & morton2d_mask<T>(4);
-    }
-    if (bitDepth > 32u)
-    {
-        x = (x | (x >> 16));
-    }
-    return x;
-}
-
-//! Puts bits on even positions filling gaps with 0s
-template <typename T, uint32_t bitDepth>
-inline T separate_bits_2d(T x)
-{
-    if (bitDepth > 32u)
-    {
-        x = (x | (x << 16)) & morton2d_mask<T>(4);
-    }
-    if (bitDepth > 16u)
-    {
-        x = (x | (x << 8)) & morton2d_mask<T>(3);
-    }
-    if (bitDepth > 8u)
-    {
-        x = (x | (x << 4)) & morton2d_mask<T>(2);
-    }
-    x = (x | (x << 2)) & morton2d_mask<T>(1);
-    x = (x | (x << 1)) & morton2d_mask<T>(0);
-
-    return x;
-}
-template <typename T, uint32_t bitDepth>
-inline T separate_bits_3d(T x)
-{
-    if (bitDepth > 32u)
-    {
-        x = (x | (x << 32)) & morton3d_mask<T>(4);
-    }
-    if (bitDepth > 16u)
-    {
-        x = (x | (x << 16)) & morton3d_mask<T>(3);
-    }
-    if (bitDepth > 8u)
-    {
-        x = (x | (x << 8)) & morton3d_mask<T>(2);
-    }
-    x = (x | (x << 4)) & morton3d_mask<T>(1);
-    x = (x | (x << 2)) & morton3d_mask<T>(0);
-
-    return x;
-}
-template <typename T, uint32_t bitDepth>
-inline T separate_bits_4d(T x)
-{
-    if (bitDepth > 32u)
-    {
-        x = (x | (x << 24)) & morton4d_mask<T>(3);
-    }
-    if (bitDepth > 16u)
-    {
-        x = (x | (x << 12)) & morton4d_mask<T>(2);
-    }
-    if (bitDepth > 8u)
-    {
-        x = (x | (x << 6)) & morton4d_mask<T>(1);
-    }
-    x = (x | (x << 3)) & morton4d_mask<T>(0);
-
-    return x;
-}
-}
-
-template<typename T, uint32_t bitDepth = sizeof(T) * 8u>
-T morton2d_decode_x(T _morton) { return impl::morton2d_decode<T, bitDepth>(_morton); }
-template<typename T, uint32_t bitDepth = sizeof(T) * 8u>
-T morton2d_decode_y(T _morton) { return impl::morton2d_decode<T, bitDepth>(_morton >> 1); }
-
-template<typename T, uint32_t bitDepth = sizeof(T) * 8u>
-T morton2d_encode(T x, T y) { return impl::separate_bits_2d<T, bitDepth>(x) | (impl::separate_bits_2d<T, bitDepth>(y) << 1); }
-template<typename T, uint32_t bitDepth = sizeof(T) * 8u>
-T morton3d_encode(T x, T y, T z) { return impl::separate_bits_3d<T, bitDepth>(x) | (impl::separate_bits_3d<T, bitDepth>(y) << 1) | (impl::separate_bits_3d<T, bitDepth>(z) << 2); }
-template<typename T, uint32_t bitDepth = sizeof(T) * 8u>
-T morton4d_encode(T x, T y, T z, T w) { return impl::separate_bits_4d<T, bitDepth>(x) | (impl::separate_bits_4d<T, bitDepth>(y) << 1) | (impl::separate_bits_4d<T, bitDepth>(z) << 2) | (impl::separate_bits_4d<T, bitDepth>(w) << 3); }
-// TODO: end of old stuff
-
 namespace impl
 {
 

From 3de93db0beea29318486082669da61e2794b6170 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 30 Jan 2026 15:09:27 +0700
Subject: [PATCH 40/56] use new morton class

---
 include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl | 6 ++++--
 include/nbl/builtin/hlsl/luma_meter/histogram.hlsl | 6 ++++--
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
index 7c85f786fa..971017993c 100644
--- a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
@@ -11,7 +11,7 @@
 #include "nbl/builtin/hlsl/workgroup/basic.hlsl"
 #include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl"
 #include "nbl/builtin/hlsl/type_traits.hlsl"
-#include "nbl/builtin/hlsl/math/morton.hlsl"
+#include "nbl/builtin/hlsl/morton.hlsl"
 #include "nbl/builtin/hlsl/luma_meter/common.hlsl"
 
 namespace nbl
@@ -95,7 +95,9 @@ struct geom_meter
     )
     {
         uint32_t tid = workgroup::SubgroupContiguousIndex();
-        uint32_t2 coord = math::Morton<uint32_t>::decode2d(tid);
+        morton::code<false, 32, 2> mc;
+        mc.value = tid;
+        uint32_t2 coord = _static_cast<uint32_t2>(mc);
 
         float_t luma = 0.0f;
         float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
diff --git a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
index 51c27c8e9e..71a9ca2e3b 100644
--- a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
@@ -11,7 +11,7 @@
 #include "nbl/builtin/hlsl/workgroup/basic.hlsl"
 #include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl"
 #include "nbl/builtin/hlsl/type_traits.hlsl"
-#include "nbl/builtin/hlsl/math/morton.hlsl"
+#include "nbl/builtin/hlsl/morton.hlsl"
 #include "nbl/builtin/hlsl/luma_meter/common.hlsl"
 
 namespace nbl
@@ -94,7 +94,9 @@ struct median_meter
 
         sdata.workgroupExecutionAndMemoryBarrier();
 
-        uint32_t2 coord = math::Morton<uint32_t>::decode2d(tid);
+        morton::code<false, 32, 2> mc;
+        mc.value = tid;
+        uint32_t2 coord = _static_cast<uint32_t2>(mc);
 
         float_t luma = 0.0f;
         float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;

From 87fca818209beb0621b89b82503ee39a26be0443 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 30 Jan 2026 17:14:10 +0700
Subject: [PATCH 41/56] removed optimizations that are broken, make it work
 like glsl version

---
 .../builtin/hlsl/luma_meter/geom_mean.hlsl    | 145 ++++++++++++++++--
 .../hlsl/tonemapper/operators/reinhard.hlsl   |   2 +-
 2 files changed, 136 insertions(+), 11 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
index 971017993c..ab0c27c340 100644
--- a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
@@ -69,9 +69,11 @@ struct geom_meter
         uint32_t workgroupIndex = (workGroupCount.x * workGroupCount.y * workGroupCount.z) / 64;
         uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
 
-        uint32_t lumaSumBitPattern = uint32_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
+        val /= 32.0 * 32.0;
+        // uint32_t lumaSumBitPattern = uint32_t(((val - minLog2) / rangeLog2) * 4096.0 + 0.5); // 32*32 subgroups
+        uint32_t lumaSumBitPattern = uint32_t(val * 4096.0 + 0.5); // 32*32 subgroups
 
-        val_accessor.atomicAdd(workgroupIndex & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern);
+        val_accessor.atomicAdd(0u, lumaSumBitPattern);
     }
 
     float_t __downloadFloat(
@@ -81,8 +83,8 @@ struct geom_meter
         float_t rangeLog2
     )
     {
-        float_t luma = (float_t)val_accessor.get(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1));
-        return luma / rangeLog2 + minLog2;
+        float_t luma = (float_t)val_accessor.get(0u);
+        return (luma / float_t(4096 * 60 * 34)) * rangeLog2 + minLog2;
     }
 
     void sampleLuma(
@@ -100,8 +102,10 @@ struct geom_meter
         uint32_t2 coord = _static_cast<uint32_t2>(mc);
 
         float_t luma = 0.0f;
-        float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
+        // float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
+        float_t2 shiftedCoord = float_t2(glsl::gl_GlobalInvocationID().xy) / viewportSize;
         float_t lumaLog2 = __computeLumaLog2(window, tex, shiftedCoord);
+        lumaLog2 = (lumaLog2 - log2(lumaMinMax.x)) / log2(lumaMinMax.y / lumaMinMax.x);
         float_t lumaLog2Sum = __reduction(lumaLog2, sdata);
 
         if (tid == 0) {
@@ -119,25 +123,146 @@ struct geom_meter
     )
     {
         uint32_t tid = glsl::gl_SubgroupInvocationID();
-        float_t luma = glsl::subgroupAdd(
-            __downloadFloat(
+        // float_t luma = glsl::subgroupAdd(
+        //     __downloadFloat(
+        //         val,
+        //         tid,
+        //         log2(lumaMinMax.x),
+        //         log2(lumaMinMax.y / lumaMinMax.x)
+        //     )
+        // );
+        float_t luma = __downloadFloat(
                 val,
                 tid,
                 log2(lumaMinMax.x),
                 log2(lumaMinMax.y / lumaMinMax.x)
-            )
-        );
+            );
 
         uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
         uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
 
-        return (luma / (1 << fixedPointBitsLeft)) / sampleCount;
+        return luma;// / sampleCount;
     }
 
     float_t sampleCount;
     float_t2 lumaMinMax;
 };
 
+// template<uint32_t GroupSize, typename ValueAccessor, typename SharedAccessor, typename TexAccessor>
+// struct geom_meter
+// {
+//     using float_t = typename SharedAccessor::type;
+//     using float_t2 = typename conditional<is_same_v<float_t, float32_t>, float32_t2, float16_t2>::type;
+//     using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
+//     using this_t = geom_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>;
+
+//     static this_t create(float_t2 lumaMinMax, float_t sampleCount)
+//     {
+//         this_t retval;
+//         retval.lumaMinMax = lumaMinMax;
+//         retval.sampleCount = sampleCount;
+//         return retval;
+//     }
+
+//     float_t __reduction(float_t value, NBL_REF_ARG(SharedAccessor) sdata)
+//     {
+//         return workgroup::reduction < plus < float_t >, GroupSize >::
+//             template __call <SharedAccessor>(value, sdata);
+//     }
+
+//     float_t __computeLumaLog2(
+//         NBL_CONST_REF_ARG(MeteringWindow) window,
+//         NBL_REF_ARG(TexAccessor) tex,
+//         float_t2 shiftedCoord
+//     )
+//     {
+//         float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
+//         float_t3 color = tex.get(uvPos);
+//         float_t luma = (float_t)TexAccessor::toXYZ(color);
+
+//         luma = clamp(luma, lumaMinMax.x, lumaMinMax.y);
+
+//         return log2(luma);
+//     }
+
+//     void __uploadFloat(
+//         NBL_REF_ARG(ValueAccessor) val_accessor,
+//         float_t val,
+//         float_t minLog2,
+//         float_t rangeLog2
+//     )
+//     {
+//         uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
+//         uint32_t workgroupIndex = (workGroupCount.x * workGroupCount.y * workGroupCount.z) / 64;
+//         uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
+
+//         uint32_t lumaSumBitPattern = uint32_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
+
+//         val_accessor.atomicAdd(workgroupIndex & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern);
+//     }
+
+//     float_t __downloadFloat(
+//         NBL_REF_ARG(ValueAccessor) val_accessor,
+//         uint32_t index,
+//         float_t minLog2,
+//         float_t rangeLog2
+//     )
+//     {
+//         float_t luma = (float_t)val_accessor.get(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1));
+//         return luma / rangeLog2 + minLog2;
+//     }
+
+//     void sampleLuma(
+//         NBL_CONST_REF_ARG(MeteringWindow) window,
+//         NBL_REF_ARG(ValueAccessor) val,
+//         NBL_REF_ARG(TexAccessor) tex,
+//         NBL_REF_ARG(SharedAccessor) sdata,
+//         float_t2 tileOffset,
+//         float_t2 viewportSize
+//     )
+//     {
+//         uint32_t tid = workgroup::SubgroupContiguousIndex();
+//         uint32_t2 coord = math::Morton<uint32_t>::decode2d(tid);
+
+//         float_t luma = 0.0f;
+//         float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
+//         float_t lumaLog2 = __computeLumaLog2(window, tex, shiftedCoord);
+//         float_t lumaLog2Sum = __reduction(lumaLog2, sdata);
+
+//         if (tid == 0) {
+//             __uploadFloat(
+//                 val,
+//                 lumaLog2Sum,
+//                 log2(lumaMinMax.x),
+//                 log2(lumaMinMax.y / lumaMinMax.x)
+//             );
+//         }
+//     }
+
+//     float_t gatherLuma(
+//         NBL_REF_ARG(ValueAccessor) val
+//     )
+//     {
+//         uint32_t tid = glsl::gl_SubgroupInvocationID();
+//         float_t luma = glsl::subgroupAdd(
+//             __downloadFloat(
+//                 val,
+//                 tid,
+//                 log2(lumaMinMax.x),
+//                 log2(lumaMinMax.y / lumaMinMax.x)
+//             )
+//         );
+
+//         uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
+//         uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
+
+//         return (luma / (1 << fixedPointBitsLeft)) / sampleCount;
+//     }
+
+//     float_t sampleCount;
+//     float_t2 lumaMinMax;
+// };
+
 }
 }
 }
diff --git a/include/nbl/builtin/hlsl/tonemapper/operators/reinhard.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators/reinhard.hlsl
index de73959f86..da48fbf66d 100644
--- a/include/nbl/builtin/hlsl/tonemapper/operators/reinhard.hlsl
+++ b/include/nbl/builtin/hlsl/tonemapper/operators/reinhard.hlsl
@@ -28,7 +28,7 @@ struct Reinhard
 		this_t retval;
 
 		const float_t unit = 1.0;
-		retval.keyAndManualLinearExposure = key * exp2(EV);
+		retval.keyAndManualLinearExposure = key * exp2(-EV);
 		retval.rcpWhite2 = unit / (WhitePointRelToEV * WhitePointRelToEV);
 
 		return retval;

From af209328ffb0020a6ab0a968aecd52583875fb6c Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 2 Feb 2026 14:16:00 +0700
Subject: [PATCH 42/56] fix minor bugs so it compiles

---
 include/nbl/builtin/hlsl/luma_meter/histogram.hlsl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
index 71a9ca2e3b..52b711a923 100644
--- a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
@@ -105,7 +105,7 @@ struct median_meter
         float_t binSize = (lumaMinMax.y - lumaMinMax.x) / BinCount;
         uint32_t binIndex = (uint32_t)((luma - lumaMinMax.x) / binSize);
 
-        sdata.atomicAdd(binIndex, float2Int(luma, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
+        sdata.atomicAdd(binIndex, __float2Int(luma, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
 
         sdata.workgroupExecutionAndMemoryBarrier();
 
@@ -114,8 +114,8 @@ struct median_meter
 
         sdata.workgroupExecutionAndMemoryBarrier();
 
-        float_t sum = inclusive_scan(histogram_value, sdata);
-        histo.atomicAdd(tid, float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
+        float_t sum = __inclusive_scan(histogram_value, sdata);
+        histo.atomicAdd(tid, __float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
 
         const bool is_last_wg_invocation = tid == (GroupSize - 1);
         const static uint32_t RoundedBinCount = 1 + (BinCount - 1) / GroupSize;
@@ -137,7 +137,7 @@ struct median_meter
             // no aliasing anymore
             float_t atVid;
             sdata.get(vid, atVid);
-            sum = inclusive_scan(atVid, sdata);
+            sum = __inclusive_scan(atVid, sdata);
             if (vid < BinCount) {
                 histo.atomicAdd(vid, __float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
             }

From dc7e751b663391afaf00d2c18d77111fe689bb10 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 3 Feb 2026 14:51:39 +0700
Subject: [PATCH 43/56] histogram autoexposure working

---
 .../builtin/hlsl/luma_meter/geom_mean.hlsl    |   4 +-
 .../builtin/hlsl/luma_meter/histogram.hlsl    | 264 ++++++++++++++----
 2 files changed, 204 insertions(+), 64 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
index ab0c27c340..25e67ec35b 100644
--- a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
@@ -101,9 +101,7 @@ struct geom_meter
         mc.value = tid;
         uint32_t2 coord = _static_cast<uint32_t2>(mc);
 
-        float_t luma = 0.0f;
-        // float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
-        float_t2 shiftedCoord = float_t2(glsl::gl_GlobalInvocationID().xy) / viewportSize;
+        float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
         float_t lumaLog2 = __computeLumaLog2(window, tex, shiftedCoord);
         lumaLog2 = (lumaLog2 - log2(lumaMinMax.x)) / log2(lumaMinMax.y / lumaMinMax.x);
         float_t lumaLog2Sum = __reduction(lumaLog2, sdata);
diff --git a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
index 52b711a923..58fd085cd2 100644
--- a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
@@ -56,27 +56,6 @@ struct median_meter
         return clamp(luma, lumaMinMax.x, lumaMinMax.y);
     }
 
-    int_t __float2Int(
-        float_t val,
-        float_t minLog2,
-        float_t rangeLog2
-    )
-    {
-        uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
-        uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
-
-        return int_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
-    }
-
-    float_t __int2Float(
-        int_t val,
-        float_t minLog2,
-        float_t rangeLog2
-    )
-    {
-        return val / rangeLog2 + minLog2;
-    }
-
     void sampleLuma(
         NBL_CONST_REF_ARG(MeteringWindow) window,
         NBL_REF_ARG(HistogramAccessor) histo,
@@ -98,50 +77,22 @@ struct median_meter
         mc.value = tid;
         uint32_t2 coord = _static_cast<uint32_t2>(mc);
 
-        float_t luma = 0.0f;
         float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
-        luma = __computeLuma(window, tex, shiftedCoord);
-
-        float_t binSize = (lumaMinMax.y - lumaMinMax.x) / BinCount;
-        uint32_t binIndex = (uint32_t)((luma - lumaMinMax.x) / binSize);
+        float_t luma = __computeLuma(window, tex, shiftedCoord);
 
-        sdata.atomicAdd(binIndex, __float2Int(luma, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
+        float_t scaledLogLuma = log2(luma / lumaMinMax.x) / log2(lumaMinMax.y / lumaMinMax.x);
+        uint32_t binIndex = int_t(scaledLogLuma * float_t(BinCount-1u) + 0.5);
+        sdata.atomicAdd(binIndex, 1u);
 
         sdata.workgroupExecutionAndMemoryBarrier();
 
-        float_t histogram_value;
+        int_t histogram_value;
         sdata.get(tid, histogram_value);
 
         sdata.workgroupExecutionAndMemoryBarrier();
 
-        float_t sum = __inclusive_scan(histogram_value, sdata);
-        histo.atomicAdd(tid, __float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
-
-        const bool is_last_wg_invocation = tid == (GroupSize - 1);
-        const static uint32_t RoundedBinCount = 1 + (BinCount - 1) / GroupSize;
-
-        for (int i = 1; i < RoundedBinCount; i++) {
-            uint32_t keyBucketStart = GroupSize * i;
-            uint32_t vid = tid + keyBucketStart;
-
-            // no if statement about the last iteration needed
-            if (is_last_wg_invocation) {
-                float_t beforeSum;
-                sdata.get(keyBucketStart, beforeSum);
-                sdata.set(keyBucketStart, beforeSum + sum);
-            }
-
-            // propagate last block tail to next block head and protect against subsequent scans stepping on each other's toes
-            sdata.workgroupExecutionAndMemoryBarrier();
-
-            // no aliasing anymore
-            float_t atVid;
-            sdata.get(vid, atVid);
-            sum = __inclusive_scan(atVid, sdata);
-            if (vid < BinCount) {
-                histo.atomicAdd(vid, __float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
-            }
-        }
+        int_t sum = __inclusive_scan(histogram_value, sdata);
+        histo.atomicAdd(tid, sum);
     }
 
     float_t gatherLuma(
@@ -154,22 +105,213 @@ struct median_meter
         for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) {
             sdata.set(
                 vid,
-                histo.get(vid & (BinCount - 1))
+                histo.get(vid)
             );
         }
 
         sdata.workgroupExecutionAndMemoryBarrier();
 
-        uint32_t percentile40, percentile60;
-        sdata.get(BinCount * 0.4, percentile40);
-        sdata.get(BinCount * 0.6, percentile60);
+        // TODO: choose percentile in push constant
+        int_t lower, upper;
+        if (tid == 0)
+        {
+            uint32_t percentile40 = uint32_t(BinCount * 0.4);
+            // lower bound
+            uint32_t lo = 0u;
+            uint32_t hi = BinCount;
+            int_t v;
+            while (lo < hi)
+            {
+                uint32_t mid = lo + (hi - lo) / 2;
+                sdata.get(mid, v);
+                if (percentile40 <= v)
+                    hi = mid;
+                else
+                    lo = mid + 1;
+            }
+
+            lower = lo;
+        }
+        if (tid == 1)
+        {
+            uint32_t percentile60 = uint32_t(BinCount * 0.6);
+            // upper bound
+            uint32_t lo = 0u;
+            uint32_t hi = BinCount;
+            int_t v;
+            while (lo < hi)
+            {
+                uint32_t mid = lo + (hi - lo) / 2;
+                sdata.get(mid, v);
+                if (percentile60 >= v)
+                    lo = mid + 1;
+                else
+                    hi = mid;
+            }
+
+            upper = lo;
+        }
 
-        return (__int2Float(percentile40, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x) + __int2Float(percentile60, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)) / 2;
+        sdata.workgroupExecutionAndMemoryBarrier();
+
+        lower = workgroup::Broadcast(lower, sdata, 0);
+        upper = workgroup::Broadcast(upper, sdata, 1);
+
+        return ((float_t(lower) + float_t(upper)) * 0.5 / float_t(BinCount-1u)) * log2(lumaMinMax.y/lumaMinMax.x) + log2(lumaMinMax.x);
     }
 
     float_t2 lumaMinMax;
 };
 
+// template<uint32_t GroupSize, uint16_t BinCount, typename HistogramAccessor, typename SharedAccessor, typename TexAccessor>
+// struct median_meter
+// {
+//     using int_t = typename SharedAccessor::type;
+//     using float_t  = float32_t;
+//     using float_t2 = typename conditional<is_same_v<float_t, float32_t>, float32_t2, float16_t2>::type;
+//     using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
+//     using this_t = median_meter<GroupSize, BinCount, HistogramAccessor, SharedAccessor, TexAccessor>;
+
+//     static this_t create(float_t2 lumaMinMax)
+//     {
+//         this_t retval;
+//         retval.lumaMinMax = lumaMinMax;
+//         return retval;
+//     }
+
+//     int_t __inclusive_scan(float_t value, NBL_REF_ARG(SharedAccessor) sdata)
+//     {
+//         return workgroup::inclusive_scan < plus < int_t >, GroupSize >::
+//             template __call <SharedAccessor>(value, sdata);
+//     }
+
+//     float_t __computeLuma(
+//         NBL_CONST_REF_ARG(MeteringWindow) window,
+//         NBL_REF_ARG(TexAccessor) tex,
+//         float_t2 shiftedCoord
+//     )
+//     {
+//         float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
+//         float_t3 color = tex.get(uvPos);
+//         float_t luma = (float_t)TexAccessor::toXYZ(color);
+
+//         return clamp(luma, lumaMinMax.x, lumaMinMax.y);
+//     }
+
+//     int_t __float2Int(
+//         float_t val,
+//         float_t minLog2,
+//         float_t rangeLog2
+//     )
+//     {
+//         uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
+//         uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
+
+//         return int_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
+//     }
+
+//     float_t __int2Float(
+//         int_t val,
+//         float_t minLog2,
+//         float_t rangeLog2
+//     )
+//     {
+//         return val / rangeLog2 + minLog2;
+//     }
+
+//     void sampleLuma(
+//         NBL_CONST_REF_ARG(MeteringWindow) window,
+//         NBL_REF_ARG(HistogramAccessor) histo,
+//         NBL_REF_ARG(TexAccessor) tex,
+//         NBL_REF_ARG(SharedAccessor) sdata,
+//         float_t2 tileOffset,
+//         float_t2 viewportSize
+//     )
+//     {
+//         uint32_t tid = workgroup::SubgroupContiguousIndex();
+        
+//         for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) {
+//             sdata.set(vid, 0);
+//         }
+
+//         sdata.workgroupExecutionAndMemoryBarrier();
+
+//         morton::code<false, 32, 2> mc;
+//         mc.value = tid;
+//         uint32_t2 coord = _static_cast<uint32_t2>(mc);
+
+//         float_t luma = 0.0f;
+//         float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
+//         luma = __computeLuma(window, tex, shiftedCoord);
+
+//         float_t binSize = (lumaMinMax.y - lumaMinMax.x) / BinCount;
+//         uint32_t binIndex = (uint32_t)((luma - lumaMinMax.x) / binSize);
+
+//         sdata.atomicAdd(binIndex, __float2Int(luma, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
+
+//         sdata.workgroupExecutionAndMemoryBarrier();
+
+//         float_t histogram_value;
+//         sdata.get(tid, histogram_value);
+
+//         sdata.workgroupExecutionAndMemoryBarrier();
+
+//         float_t sum = __inclusive_scan(histogram_value, sdata);
+//         histo.atomicAdd(tid, __float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
+
+//         const bool is_last_wg_invocation = tid == (GroupSize - 1);
+//         const static uint32_t RoundedBinCount = 1 + (BinCount - 1) / GroupSize;
+
+//         for (int i = 1; i < RoundedBinCount; i++) {
+//             uint32_t keyBucketStart = GroupSize * i;
+//             uint32_t vid = tid + keyBucketStart;
+
+//             // no if statement about the last iteration needed
+//             if (is_last_wg_invocation) {
+//                 float_t beforeSum;
+//                 sdata.get(keyBucketStart, beforeSum);
+//                 sdata.set(keyBucketStart, beforeSum + sum);
+//             }
+
+//             // propagate last block tail to next block head and protect against subsequent scans stepping on each other's toes
+//             sdata.workgroupExecutionAndMemoryBarrier();
+
+//             // no aliasing anymore
+//             float_t atVid;
+//             sdata.get(vid, atVid);
+//             sum = __inclusive_scan(atVid, sdata);
+//             if (vid < BinCount) {
+//                 histo.atomicAdd(vid, __float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
+//             }
+//         }
+//     }
+
+//     float_t gatherLuma(
+//         NBL_REF_ARG(HistogramAccessor) histo,
+//         NBL_REF_ARG(SharedAccessor) sdata
+//     )
+//     {
+//         uint32_t tid = workgroup::SubgroupContiguousIndex();
+
+//         for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) {
+//             sdata.set(
+//                 vid,
+//                 histo.get(vid & (BinCount - 1))
+//             );
+//         }
+
+//         sdata.workgroupExecutionAndMemoryBarrier();
+
+//         uint32_t percentile40, percentile60;
+//         sdata.get(BinCount * 0.4, percentile40);
+//         sdata.get(BinCount * 0.6, percentile60);
+
+//         return (__int2Float(percentile40, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x) + __int2Float(percentile60, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)) / 2;
+//     }
+
+//     float_t2 lumaMinMax;
+// };
+
 }
 }
 }

From f57886d20370a46b19d9ec8ae88309748fea537c Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 3 Feb 2026 16:17:06 +0700
Subject: [PATCH 44/56] more values passed in through push constants at create

---
 .../builtin/hlsl/luma_meter/geom_mean.hlsl    | 42 +++++++++----------
 .../builtin/hlsl/luma_meter/histogram.hlsl    | 31 +++++++-------
 2 files changed, 35 insertions(+), 38 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
index 25e67ec35b..e6be2e3a60 100644
--- a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
@@ -29,11 +29,15 @@ struct geom_meter
     using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
     using this_t = geom_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>;
 
-    static this_t create(float_t2 lumaMinMax, float_t sampleCount)
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t MaxWorkgroupIncrement = 0x1000u;
+
+    static this_t create(float_t lumaMin, float_t lumaMax, float_t sampleCount, float_t rcpFirstPassWGCount)
     {
         this_t retval;
-        retval.lumaMinMax = lumaMinMax;
+        retval.lumaMin = lumaMin;
+        retval.lumaMax = lumaMax;
         retval.sampleCount = sampleCount;
+        retval.rcpFirstPassWGCount = rcpFirstPassWGCount;
         return retval;
     }
 
@@ -53,7 +57,7 @@ struct geom_meter
         float_t3 color = tex.get(uvPos);
         float_t luma = (float_t)TexAccessor::toXYZ(color);
 
-        luma = clamp(luma, lumaMinMax.x, lumaMinMax.y);
+        luma = clamp(luma, lumaMin, lumaMax);
 
         return log2(luma);
     }
@@ -65,15 +69,8 @@ struct geom_meter
         float_t rangeLog2
     )
     {
-        uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
-        uint32_t workgroupIndex = (workGroupCount.x * workGroupCount.y * workGroupCount.z) / 64;
-        uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
-
-        val /= 32.0 * 32.0;
-        // uint32_t lumaSumBitPattern = uint32_t(((val - minLog2) / rangeLog2) * 4096.0 + 0.5); // 32*32 subgroups
-        uint32_t lumaSumBitPattern = uint32_t(val * 4096.0 + 0.5); // 32*32 subgroups
-
-        val_accessor.atomicAdd(0u, lumaSumBitPattern);
+        uint32_t lumaVal = uint32_t((val / (32.0 * 32.0)) * float_t(MaxWorkgroupIncrement) + 0.5); // 32*32 subgroups
+        val_accessor.atomicAdd(0u, lumaVal);
     }
 
     float_t __downloadFloat(
@@ -83,8 +80,8 @@ struct geom_meter
         float_t rangeLog2
     )
     {
-        float_t luma = (float_t)val_accessor.get(0u);
-        return (luma / float_t(4096 * 60 * 34)) * rangeLog2 + minLog2;
+        float_t luma = float_t(val_accessor.get(0u));
+        return luma / float_t(MaxWorkgroupIncrement) * rcpFirstPassWGCount * rangeLog2 + minLog2;
     }
 
     void sampleLuma(
@@ -103,15 +100,15 @@ struct geom_meter
 
         float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
         float_t lumaLog2 = __computeLumaLog2(window, tex, shiftedCoord);
-        lumaLog2 = (lumaLog2 - log2(lumaMinMax.x)) / log2(lumaMinMax.y / lumaMinMax.x);
+        lumaLog2 = (lumaLog2 - log2(lumaMin)) / log2(lumaMax / lumaMin);
         float_t lumaLog2Sum = __reduction(lumaLog2, sdata);
 
         if (tid == 0) {
             __uploadFloat(
                 val,
                 lumaLog2Sum,
-                log2(lumaMinMax.x),
-                log2(lumaMinMax.y / lumaMinMax.x)
+                log2(lumaMin),
+                log2(lumaMax / lumaMin)
             );
         }
     }
@@ -132,18 +129,17 @@ struct geom_meter
         float_t luma = __downloadFloat(
                 val,
                 tid,
-                log2(lumaMinMax.x),
-                log2(lumaMinMax.y / lumaMinMax.x)
+                log2(lumaMin),
+                log2(lumaMax / lumaMin)
             );
 
-        uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
-        uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
-
         return luma;// / sampleCount;
     }
 
+    float_t lumaMin;
+    float_t lumaMax;
     float_t sampleCount;
-    float_t2 lumaMinMax;
+    float_t rcpFirstPassWGCount;
 };
 
 // template<uint32_t GroupSize, typename ValueAccessor, typename SharedAccessor, typename TexAccessor>
diff --git a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
index 58fd085cd2..2025f28f8b 100644
--- a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
@@ -30,10 +30,13 @@ struct median_meter
     using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
     using this_t = median_meter<GroupSize, BinCount, HistogramAccessor, SharedAccessor, TexAccessor>;
 
-    static this_t create(float_t2 lumaMinMax)
+    static this_t create(float_t lumaMin, float_t lumaMax, float_t lowerBoundPercentile, float_t upperBoundPercentile)
     {
         this_t retval;
-        retval.lumaMinMax = lumaMinMax;
+        retval.lumaMin = lumaMin;
+        retval.lumaMax = lumaMax;
+        retval.lowerBoundPercentile = lowerBoundPercentile;
+        retval.upperBoundPercentile = upperBoundPercentile;
         return retval;
     }
 
@@ -53,7 +56,7 @@ struct median_meter
         float_t3 color = tex.get(uvPos);
         float_t luma = (float_t)TexAccessor::toXYZ(color);
 
-        return clamp(luma, lumaMinMax.x, lumaMinMax.y);
+        return clamp(luma, lumaMin, lumaMax);
     }
 
     void sampleLuma(
@@ -80,7 +83,7 @@ struct median_meter
         float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
         float_t luma = __computeLuma(window, tex, shiftedCoord);
 
-        float_t scaledLogLuma = log2(luma / lumaMinMax.x) / log2(lumaMinMax.y / lumaMinMax.x);
+        float_t scaledLogLuma = log2(luma / lumaMin) / log2(lumaMax / lumaMin);
         uint32_t binIndex = int_t(scaledLogLuma * float_t(BinCount-1u) + 0.5);
         sdata.atomicAdd(binIndex, 1u);
 
@@ -108,15 +111,12 @@ struct median_meter
                 histo.get(vid)
             );
         }
-
         sdata.workgroupExecutionAndMemoryBarrier();
 
-        // TODO: choose percentile in push constant
         int_t lower, upper;
         if (tid == 0)
         {
-            uint32_t percentile40 = uint32_t(BinCount * 0.4);
-            // lower bound
+            const uint32_t lowerPercentile = uint32_t(BinCount * lowerBoundPercentile);
             uint32_t lo = 0u;
             uint32_t hi = BinCount;
             int_t v;
@@ -124,7 +124,7 @@ struct median_meter
             {
                 uint32_t mid = lo + (hi - lo) / 2;
                 sdata.get(mid, v);
-                if (percentile40 <= v)
+                if (lowerPercentile <= v)
                     hi = mid;
                 else
                     lo = mid + 1;
@@ -134,8 +134,7 @@ struct median_meter
         }
         if (tid == 1)
         {
-            uint32_t percentile60 = uint32_t(BinCount * 0.6);
-            // upper bound
+            const uint32_t upperPercentile = uint32_t(BinCount * upperBoundPercentile);
             uint32_t lo = 0u;
             uint32_t hi = BinCount;
             int_t v;
@@ -143,7 +142,7 @@ struct median_meter
             {
                 uint32_t mid = lo + (hi - lo) / 2;
                 sdata.get(mid, v);
-                if (percentile60 >= v)
+                if (upperPercentile >= v)
                     lo = mid + 1;
                 else
                     hi = mid;
@@ -151,16 +150,18 @@ struct median_meter
 
             upper = lo;
         }
-
         sdata.workgroupExecutionAndMemoryBarrier();
 
         lower = workgroup::Broadcast(lower, sdata, 0);
         upper = workgroup::Broadcast(upper, sdata, 1);
 
-        return ((float_t(lower) + float_t(upper)) * 0.5 / float_t(BinCount-1u)) * log2(lumaMinMax.y/lumaMinMax.x) + log2(lumaMinMax.x);
+        return ((float_t(lower) + float_t(upper)) * 0.5 / float_t(BinCount-1u)) * log2(lumaMax/lumaMin) + log2(lumaMin);
     }
 
-    float_t2 lumaMinMax;
+    float_t lumaMin;
+    float_t lumaMax;
+    float_t lowerBoundPercentile;
+    float_t upperBoundPercentile;
 };
 
 // template<uint32_t GroupSize, uint16_t BinCount, typename HistogramAccessor, typename SharedAccessor, typename TexAccessor>

From 8466a9db6ee2d060abccb5710461f3f2d97ecf5a Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 3 Feb 2026 17:01:23 +0700
Subject: [PATCH 45/56] make template names clearer, mean stores to a subgroup
 size buffer instead of one value

---
 .../builtin/hlsl/luma_meter/geom_mean.hlsl    | 28 ++++++++-----------
 .../builtin/hlsl/luma_meter/histogram.hlsl    | 10 +++----
 2 files changed, 17 insertions(+), 21 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
index e6be2e3a60..7ab959fd5d 100644
--- a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
@@ -21,13 +21,13 @@ namespace hlsl
 namespace luma_meter
 {
 
-template<uint32_t GroupSize, typename ValueAccessor, typename SharedAccessor, typename TexAccessor>
+template<uint32_t WorkgroupSize, uint16_t SubgroupSize, typename ValueAccessor, typename SharedAccessor, typename TexAccessor>
 struct geom_meter
 {
     using float_t = typename SharedAccessor::type;
     using float_t2 = typename conditional<is_same_v<float_t, float32_t>, float32_t2, float16_t2>::type;
     using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
-    using this_t = geom_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>;
+    using this_t = geom_meter<WorkgroupSize, SubgroupSize, ValueAccessor, SharedAccessor, TexAccessor>;
 
     NBL_CONSTEXPR_STATIC_INLINE uint32_t MaxWorkgroupIncrement = 0x1000u;
 
@@ -43,7 +43,7 @@ struct geom_meter
 
     float_t __reduction(float_t value, NBL_REF_ARG(SharedAccessor) sdata)
     {
-        return workgroup::reduction < plus < float_t >, GroupSize >::
+        return workgroup::reduction < plus < float_t >, WorkgroupSize >::
             template __call <SharedAccessor>(value, sdata);
     }
 
@@ -69,8 +69,11 @@ struct geom_meter
         float_t rangeLog2
     )
     {
-        uint32_t lumaVal = uint32_t((val / (32.0 * 32.0)) * float_t(MaxWorkgroupIncrement) + 0.5); // 32*32 subgroups
-        val_accessor.atomicAdd(0u, lumaVal);
+        const uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
+        const uint32_t3 workgroupID = glsl::gl_WorkGroupID();
+        const uint32_t index = (workgroupID.y * workGroupCount.x + workgroupID.x) & (SubgroupSize - 1u);
+        uint32_t lumaVal = uint32_t(val / float_t(WorkgroupSize) * float_t(MaxWorkgroupIncrement) + 0.5);
+        val_accessor.atomicAdd(index, lumaVal);
     }
 
     float_t __downloadFloat(
@@ -80,8 +83,9 @@ struct geom_meter
         float_t rangeLog2
     )
     {
-        float_t luma = float_t(val_accessor.get(0u));
-        return luma / float_t(MaxWorkgroupIncrement) * rcpFirstPassWGCount * rangeLog2 + minLog2;
+        uint32_t lumaVal = val_accessor.get(index);
+        lumaVal = glsl::subgroupAdd(lumaVal);
+        return float_t(lumaVal) / float_t(MaxWorkgroupIncrement) * rcpFirstPassWGCount * rangeLog2 + minLog2;
     }
 
     void sampleLuma(
@@ -118,14 +122,6 @@ struct geom_meter
     )
     {
         uint32_t tid = glsl::gl_SubgroupInvocationID();
-        // float_t luma = glsl::subgroupAdd(
-        //     __downloadFloat(
-        //         val,
-        //         tid,
-        //         log2(lumaMinMax.x),
-        //         log2(lumaMinMax.y / lumaMinMax.x)
-        //     )
-        // );
         float_t luma = __downloadFloat(
                 val,
                 tid,
@@ -133,7 +129,7 @@ struct geom_meter
                 log2(lumaMax / lumaMin)
             );
 
-        return luma;// / sampleCount;
+        return luma;
     }
 
     float_t lumaMin;
diff --git a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
index 2025f28f8b..58aea923f0 100644
--- a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
@@ -21,14 +21,14 @@ namespace hlsl
 namespace luma_meter
 {
 
-template<uint32_t GroupSize, uint16_t BinCount, typename HistogramAccessor, typename SharedAccessor, typename TexAccessor>
+template<uint32_t WorkgroupSize, uint16_t BinCount, typename HistogramAccessor, typename SharedAccessor, typename TexAccessor>
 struct median_meter
 {
     using int_t = typename SharedAccessor::type;
     using float_t  = float32_t;
     using float_t2 = typename conditional<is_same_v<float_t, float32_t>, float32_t2, float16_t2>::type;
     using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
-    using this_t = median_meter<GroupSize, BinCount, HistogramAccessor, SharedAccessor, TexAccessor>;
+    using this_t = median_meter<WorkgroupSize, BinCount, HistogramAccessor, SharedAccessor, TexAccessor>;
 
     static this_t create(float_t lumaMin, float_t lumaMax, float_t lowerBoundPercentile, float_t upperBoundPercentile)
     {
@@ -42,7 +42,7 @@ struct median_meter
 
     int_t __inclusive_scan(float_t value, NBL_REF_ARG(SharedAccessor) sdata)
     {
-        return workgroup::inclusive_scan < plus < int_t >, GroupSize >::
+        return workgroup::inclusive_scan < plus < int_t >, WorkgroupSize >::
             template __call <SharedAccessor>(value, sdata);
     }
 
@@ -70,7 +70,7 @@ struct median_meter
     {
         uint32_t tid = workgroup::SubgroupContiguousIndex();
         
-        for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) {
+        for (uint32_t vid = tid; vid < BinCount; vid += WorkgroupSize) {
             sdata.set(vid, 0);
         }
 
@@ -105,7 +105,7 @@ struct median_meter
     {
         uint32_t tid = workgroup::SubgroupContiguousIndex();
 
-        for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) {
+        for (uint32_t vid = tid; vid < BinCount; vid += WorkgroupSize) {
             sdata.set(
                 vid,
                 histo.get(vid)

From 3a891b7329b5be80d6f7bc68fac912a84ed8fded Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Wed, 4 Feb 2026 16:42:28 +0700
Subject: [PATCH 46/56] fixes to aces tonemap

---
 include/nbl/builtin/hlsl/tonemapper/operators/aces.hlsl     | 6 +++---
 include/nbl/builtin/hlsl/tonemapper/operators/reinhard.hlsl | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/nbl/builtin/hlsl/tonemapper/operators/aces.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators/aces.hlsl
index b2e0e4b053..5384c7dc84 100644
--- a/include/nbl/builtin/hlsl/tonemapper/operators/aces.hlsl
+++ b/include/nbl/builtin/hlsl/tonemapper/operators/aces.hlsl
@@ -2,8 +2,8 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
-#ifndef _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_INCLUDED_
-#define _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_INCLUDED_
+#ifndef _NBL_BUILTIN_HLSL_TONE_MAPPER_ACES_INCLUDED_
+#define _NBL_BUILTIN_HLSL_TONE_MAPPER_ACES_INCLUDED_
 
 #include "nbl/builtin/hlsl/cpp_compat.hlsl"
 #include "nbl/builtin/hlsl/type_traits.hlsl"
@@ -29,7 +29,7 @@ struct ACES
 		this_t retval;
 		retval.gamma = Contrast;
 		const float_t reinhardMatchCorrection = 0.77321666f; // middle grays get exposed to different values between tonemappers given the same key
-		retval.exposure = EV + log2(key * reinhardMatchCorrection);
+		retval.exposure = -EV + log2(key * reinhardMatchCorrection);
 		return retval;
 	}
 
diff --git a/include/nbl/builtin/hlsl/tonemapper/operators/reinhard.hlsl b/include/nbl/builtin/hlsl/tonemapper/operators/reinhard.hlsl
index da48fbf66d..b442093b6e 100644
--- a/include/nbl/builtin/hlsl/tonemapper/operators/reinhard.hlsl
+++ b/include/nbl/builtin/hlsl/tonemapper/operators/reinhard.hlsl
@@ -2,8 +2,8 @@
 // This file is part of the "Nabla Engine".
 // For conditions of distribution and use, see copyright notice in nabla.h
 
-#ifndef _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_INCLUDED_
-#define _NBL_BUILTIN_HLSL_TONE_MAPPER_OPERATORS_INCLUDED_
+#ifndef _NBL_BUILTIN_HLSL_TONE_MAPPER_REINHARD_INCLUDED_
+#define _NBL_BUILTIN_HLSL_TONE_MAPPER_REINHARD_INCLUDED_
 
 #include "nbl/builtin/hlsl/cpp_compat.hlsl"
 #include "nbl/builtin/hlsl/type_traits.hlsl"

From c8a8f9ff9b9fc5bfb7e4d058da8bda6c898c74cf Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 5 Feb 2026 17:00:46 +0700
Subject: [PATCH 47/56] use workgroup2 for average metering

---
 .../builtin/hlsl/luma_meter/geom_mean.hlsl    | 157 ++++--------------
 .../builtin/hlsl/luma_meter/histogram.hlsl    | 149 -----------------
 2 files changed, 34 insertions(+), 272 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
index 7ab959fd5d..9ba05e2088 100644
--- a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
@@ -8,8 +8,8 @@
 #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
 #include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
 #include "nbl/builtin/hlsl/glsl_compat/subgroup_arithmetic.hlsl"
-#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
-#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl"
+#include "nbl/builtin/hlsl/workgroup2/basic.hlsl"
+#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl"
 #include "nbl/builtin/hlsl/type_traits.hlsl"
 #include "nbl/builtin/hlsl/morton.hlsl"
 #include "nbl/builtin/hlsl/luma_meter/common.hlsl"
@@ -21,13 +21,34 @@ namespace hlsl
 namespace luma_meter
 {
 
-template<uint32_t WorkgroupSize, uint16_t SubgroupSize, typename ValueAccessor, typename SharedAccessor, typename TexAccessor>
+namespace impl
+{
+template<typename T>
+struct data_proxy
+{
+    template<typename AccessType, typename IndexType>
+    void get(const IndexType idx, NBL_REF_ARG(AccessType) value)
+    {
+        value = data[idx];
+    }
+
+    T data;
+};
+}
+
+template<class WorkgroupConfig, typename ValueAccessor, typename SharedAccessor, typename TexAccessor, class device_capabilities>
 struct geom_meter
 {
     using float_t = typename SharedAccessor::type;
     using float_t2 = typename conditional<is_same_v<float_t, float32_t>, float32_t2, float16_t2>::type;
     using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
-    using this_t = geom_meter<WorkgroupSize, SubgroupSize, ValueAccessor, SharedAccessor, TexAccessor>;
+    
+    using proxy_data_t = vector<float_t, WorkgroupConfig::ItemsPerInvocation_0>;
+    using proxy_t = impl::data_proxy<proxy_data_t>;
+
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkgroupSize = WorkgroupConfig::WorkgroupSize;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = WorkgroupConfig::SubgroupSize;
+    using this_t = geom_meter<WorkgroupConfig, ValueAccessor, SharedAccessor, TexAccessor, device_capabilities>;
 
     NBL_CONSTEXPR_STATIC_INLINE uint32_t MaxWorkgroupIncrement = 0x1000u;
 
@@ -41,10 +62,12 @@ struct geom_meter
         return retval;
     }
 
-    float_t __reduction(float_t value, NBL_REF_ARG(SharedAccessor) sdata)
+    float_t __reduction(NBL_REF_ARG(proxy_t) data, NBL_REF_ARG(SharedAccessor) sdata)
     {
-        return workgroup::reduction < plus < float_t >, WorkgroupSize >::
-            template __call <SharedAccessor>(value, sdata);
+        // return workgroup::reduction < plus < float_t >, WorkgroupSize >::
+        //     template __call <SharedAccessor>(value, sdata);
+        return workgroup2::reduction< WorkgroupConfig, plus<float_t>, device_capabilities >::
+            template __call <proxy_t, SharedAccessor>(data, sdata);
     }
 
     float_t __computeLumaLog2(
@@ -105,7 +128,10 @@ struct geom_meter
         float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
         float_t lumaLog2 = __computeLumaLog2(window, tex, shiftedCoord);
         lumaLog2 = (lumaLog2 - log2(lumaMin)) / log2(lumaMax / lumaMin);
-        float_t lumaLog2Sum = __reduction(lumaLog2, sdata);
+
+        proxy_t data;
+        data.data[0] = lumaLog2;
+        float_t lumaLog2Sum = __reduction(data, sdata);
 
         if (tid == 0) {
             __uploadFloat(
@@ -138,121 +164,6 @@ struct geom_meter
     float_t rcpFirstPassWGCount;
 };
 
-// template<uint32_t GroupSize, typename ValueAccessor, typename SharedAccessor, typename TexAccessor>
-// struct geom_meter
-// {
-//     using float_t = typename SharedAccessor::type;
-//     using float_t2 = typename conditional<is_same_v<float_t, float32_t>, float32_t2, float16_t2>::type;
-//     using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
-//     using this_t = geom_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>;
-
-//     static this_t create(float_t2 lumaMinMax, float_t sampleCount)
-//     {
-//         this_t retval;
-//         retval.lumaMinMax = lumaMinMax;
-//         retval.sampleCount = sampleCount;
-//         return retval;
-//     }
-
-//     float_t __reduction(float_t value, NBL_REF_ARG(SharedAccessor) sdata)
-//     {
-//         return workgroup::reduction < plus < float_t >, GroupSize >::
-//             template __call <SharedAccessor>(value, sdata);
-//     }
-
-//     float_t __computeLumaLog2(
-//         NBL_CONST_REF_ARG(MeteringWindow) window,
-//         NBL_REF_ARG(TexAccessor) tex,
-//         float_t2 shiftedCoord
-//     )
-//     {
-//         float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
-//         float_t3 color = tex.get(uvPos);
-//         float_t luma = (float_t)TexAccessor::toXYZ(color);
-
-//         luma = clamp(luma, lumaMinMax.x, lumaMinMax.y);
-
-//         return log2(luma);
-//     }
-
-//     void __uploadFloat(
-//         NBL_REF_ARG(ValueAccessor) val_accessor,
-//         float_t val,
-//         float_t minLog2,
-//         float_t rangeLog2
-//     )
-//     {
-//         uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
-//         uint32_t workgroupIndex = (workGroupCount.x * workGroupCount.y * workGroupCount.z) / 64;
-//         uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
-
-//         uint32_t lumaSumBitPattern = uint32_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
-
-//         val_accessor.atomicAdd(workgroupIndex & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern);
-//     }
-
-//     float_t __downloadFloat(
-//         NBL_REF_ARG(ValueAccessor) val_accessor,
-//         uint32_t index,
-//         float_t minLog2,
-//         float_t rangeLog2
-//     )
-//     {
-//         float_t luma = (float_t)val_accessor.get(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1));
-//         return luma / rangeLog2 + minLog2;
-//     }
-
-//     void sampleLuma(
-//         NBL_CONST_REF_ARG(MeteringWindow) window,
-//         NBL_REF_ARG(ValueAccessor) val,
-//         NBL_REF_ARG(TexAccessor) tex,
-//         NBL_REF_ARG(SharedAccessor) sdata,
-//         float_t2 tileOffset,
-//         float_t2 viewportSize
-//     )
-//     {
-//         uint32_t tid = workgroup::SubgroupContiguousIndex();
-//         uint32_t2 coord = math::Morton<uint32_t>::decode2d(tid);
-
-//         float_t luma = 0.0f;
-//         float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
-//         float_t lumaLog2 = __computeLumaLog2(window, tex, shiftedCoord);
-//         float_t lumaLog2Sum = __reduction(lumaLog2, sdata);
-
-//         if (tid == 0) {
-//             __uploadFloat(
-//                 val,
-//                 lumaLog2Sum,
-//                 log2(lumaMinMax.x),
-//                 log2(lumaMinMax.y / lumaMinMax.x)
-//             );
-//         }
-//     }
-
-//     float_t gatherLuma(
-//         NBL_REF_ARG(ValueAccessor) val
-//     )
-//     {
-//         uint32_t tid = glsl::gl_SubgroupInvocationID();
-//         float_t luma = glsl::subgroupAdd(
-//             __downloadFloat(
-//                 val,
-//                 tid,
-//                 log2(lumaMinMax.x),
-//                 log2(lumaMinMax.y / lumaMinMax.x)
-//             )
-//         );
-
-//         uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
-//         uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
-
-//         return (luma / (1 << fixedPointBitsLeft)) / sampleCount;
-//     }
-
-//     float_t sampleCount;
-//     float_t2 lumaMinMax;
-// };
-
 }
 }
 }
diff --git a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
index 58aea923f0..40a4db59a7 100644
--- a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
@@ -164,155 +164,6 @@ struct median_meter
     float_t upperBoundPercentile;
 };
 
-// template<uint32_t GroupSize, uint16_t BinCount, typename HistogramAccessor, typename SharedAccessor, typename TexAccessor>
-// struct median_meter
-// {
-//     using int_t = typename SharedAccessor::type;
-//     using float_t  = float32_t;
-//     using float_t2 = typename conditional<is_same_v<float_t, float32_t>, float32_t2, float16_t2>::type;
-//     using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
-//     using this_t = median_meter<GroupSize, BinCount, HistogramAccessor, SharedAccessor, TexAccessor>;
-
-//     static this_t create(float_t2 lumaMinMax)
-//     {
-//         this_t retval;
-//         retval.lumaMinMax = lumaMinMax;
-//         return retval;
-//     }
-
-//     int_t __inclusive_scan(float_t value, NBL_REF_ARG(SharedAccessor) sdata)
-//     {
-//         return workgroup::inclusive_scan < plus < int_t >, GroupSize >::
-//             template __call <SharedAccessor>(value, sdata);
-//     }
-
-//     float_t __computeLuma(
-//         NBL_CONST_REF_ARG(MeteringWindow) window,
-//         NBL_REF_ARG(TexAccessor) tex,
-//         float_t2 shiftedCoord
-//     )
-//     {
-//         float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
-//         float_t3 color = tex.get(uvPos);
-//         float_t luma = (float_t)TexAccessor::toXYZ(color);
-
-//         return clamp(luma, lumaMinMax.x, lumaMinMax.y);
-//     }
-
-//     int_t __float2Int(
-//         float_t val,
-//         float_t minLog2,
-//         float_t rangeLog2
-//     )
-//     {
-//         uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
-//         uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();
-
-//         return int_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
-//     }
-
-//     float_t __int2Float(
-//         int_t val,
-//         float_t minLog2,
-//         float_t rangeLog2
-//     )
-//     {
-//         return val / rangeLog2 + minLog2;
-//     }
-
-//     void sampleLuma(
-//         NBL_CONST_REF_ARG(MeteringWindow) window,
-//         NBL_REF_ARG(HistogramAccessor) histo,
-//         NBL_REF_ARG(TexAccessor) tex,
-//         NBL_REF_ARG(SharedAccessor) sdata,
-//         float_t2 tileOffset,
-//         float_t2 viewportSize
-//     )
-//     {
-//         uint32_t tid = workgroup::SubgroupContiguousIndex();
-        
-//         for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) {
-//             sdata.set(vid, 0);
-//         }
-
-//         sdata.workgroupExecutionAndMemoryBarrier();
-
-//         morton::code<false, 32, 2> mc;
-//         mc.value = tid;
-//         uint32_t2 coord = _static_cast<uint32_t2>(mc);
-
-//         float_t luma = 0.0f;
-//         float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
-//         luma = __computeLuma(window, tex, shiftedCoord);
-
-//         float_t binSize = (lumaMinMax.y - lumaMinMax.x) / BinCount;
-//         uint32_t binIndex = (uint32_t)((luma - lumaMinMax.x) / binSize);
-
-//         sdata.atomicAdd(binIndex, __float2Int(luma, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
-
-//         sdata.workgroupExecutionAndMemoryBarrier();
-
-//         float_t histogram_value;
-//         sdata.get(tid, histogram_value);
-
-//         sdata.workgroupExecutionAndMemoryBarrier();
-
-//         float_t sum = __inclusive_scan(histogram_value, sdata);
-//         histo.atomicAdd(tid, __float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
-
-//         const bool is_last_wg_invocation = tid == (GroupSize - 1);
-//         const static uint32_t RoundedBinCount = 1 + (BinCount - 1) / GroupSize;
-
-//         for (int i = 1; i < RoundedBinCount; i++) {
-//             uint32_t keyBucketStart = GroupSize * i;
-//             uint32_t vid = tid + keyBucketStart;
-
-//             // no if statement about the last iteration needed
-//             if (is_last_wg_invocation) {
-//                 float_t beforeSum;
-//                 sdata.get(keyBucketStart, beforeSum);
-//                 sdata.set(keyBucketStart, beforeSum + sum);
-//             }
-
-//             // propagate last block tail to next block head and protect against subsequent scans stepping on each other's toes
-//             sdata.workgroupExecutionAndMemoryBarrier();
-
-//             // no aliasing anymore
-//             float_t atVid;
-//             sdata.get(vid, atVid);
-//             sum = __inclusive_scan(atVid, sdata);
-//             if (vid < BinCount) {
-//                 histo.atomicAdd(vid, __float2Int(sum, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x));
-//             }
-//         }
-//     }
-
-//     float_t gatherLuma(
-//         NBL_REF_ARG(HistogramAccessor) histo,
-//         NBL_REF_ARG(SharedAccessor) sdata
-//     )
-//     {
-//         uint32_t tid = workgroup::SubgroupContiguousIndex();
-
-//         for (uint32_t vid = tid; vid < BinCount; vid += GroupSize) {
-//             sdata.set(
-//                 vid,
-//                 histo.get(vid & (BinCount - 1))
-//             );
-//         }
-
-//         sdata.workgroupExecutionAndMemoryBarrier();
-
-//         uint32_t percentile40, percentile60;
-//         sdata.get(BinCount * 0.4, percentile40);
-//         sdata.get(BinCount * 0.6, percentile60);
-
-//         return (__int2Float(percentile40, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x) + __int2Float(percentile60, lumaMinMax.x, lumaMinMax.y - lumaMinMax.x)) / 2;
-//     }
-
-//     float_t2 lumaMinMax;
-// };
-
 }
 }
 }

From 7598f2f51e3fad81f7f290b926031314dcec91ca Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 6 Feb 2026 12:08:33 +0700
Subject: [PATCH 48/56] use workgroup2 with histogram metering

---
 .../builtin/hlsl/luma_meter/geom_mean.hlsl    | 10 ++-
 .../builtin/hlsl/luma_meter/histogram.hlsl    | 70 +++++++++++++------
 2 files changed, 52 insertions(+), 28 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
index 9ba05e2088..e8a8b7b15c 100644
--- a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
@@ -29,7 +29,7 @@ struct data_proxy
     template<typename AccessType, typename IndexType>
     void get(const IndexType idx, NBL_REF_ARG(AccessType) value)
     {
-        value = data[idx];
+        value = data;
     }
 
     T data;
@@ -43,7 +43,7 @@ struct geom_meter
     using float_t2 = typename conditional<is_same_v<float_t, float32_t>, float32_t2, float16_t2>::type;
     using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
     
-    using proxy_data_t = vector<float_t, WorkgroupConfig::ItemsPerInvocation_0>;
+    using proxy_data_t = float_t;
     using proxy_t = impl::data_proxy<proxy_data_t>;
 
     NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkgroupSize = WorkgroupConfig::WorkgroupSize;
@@ -64,10 +64,8 @@ struct geom_meter
 
     float_t __reduction(NBL_REF_ARG(proxy_t) data, NBL_REF_ARG(SharedAccessor) sdata)
     {
-        // return workgroup::reduction < plus < float_t >, WorkgroupSize >::
-        //     template __call <SharedAccessor>(value, sdata);
         return workgroup2::reduction< WorkgroupConfig, plus<float_t>, device_capabilities >::
-            template __call <proxy_t, SharedAccessor>(data, sdata);
+            template __call<proxy_t, SharedAccessor>(data, sdata);
     }
 
     float_t __computeLumaLog2(
@@ -130,7 +128,7 @@ struct geom_meter
         lumaLog2 = (lumaLog2 - log2(lumaMin)) / log2(lumaMax / lumaMin);
 
         proxy_t data;
-        data.data[0] = lumaLog2;
+        data.data = lumaLog2;
         float_t lumaLog2Sum = __reduction(data, sdata);
 
         if (tid == 0) {
diff --git a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
index 40a4db59a7..61f662cb06 100644
--- a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
@@ -8,8 +8,8 @@
 #include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
 #include "nbl/builtin/hlsl/glsl_compat/subgroup_basic.hlsl"
 #include "nbl/builtin/hlsl/glsl_compat/subgroup_arithmetic.hlsl"
-#include "nbl/builtin/hlsl/workgroup/basic.hlsl"
-#include "nbl/builtin/hlsl/workgroup/arithmetic.hlsl"
+#include "nbl/builtin/hlsl/workgroup2/basic.hlsl"
+#include "nbl/builtin/hlsl/workgroup2/arithmetic.hlsl"
 #include "nbl/builtin/hlsl/type_traits.hlsl"
 #include "nbl/builtin/hlsl/morton.hlsl"
 #include "nbl/builtin/hlsl/luma_meter/common.hlsl"
@@ -21,14 +21,41 @@ namespace hlsl
 namespace luma_meter
 {
 
-template<uint32_t WorkgroupSize, uint16_t BinCount, typename HistogramAccessor, typename SharedAccessor, typename TexAccessor>
+namespace impl
+{
+template<typename T>
+struct data_proxy
+{
+    template<typename AccessType, typename IndexType>
+    void get(const IndexType idx, NBL_REF_ARG(AccessType) value)
+    {
+        value = data;
+    }
+
+    template<typename AccessType, typename IndexType>
+    void set(const IndexType ix, const AccessType value)
+    {
+        data = value;
+    }
+
+    T data;
+};
+}
+
+template<class WorkgroupConfig, uint16_t BinCount, typename HistogramAccessor, typename SharedAccessor, typename TexAccessor, class device_capabilities>
 struct median_meter
 {
     using int_t = typename SharedAccessor::type;
     using float_t  = float32_t;
     using float_t2 = typename conditional<is_same_v<float_t, float32_t>, float32_t2, float16_t2>::type;
     using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
-    using this_t = median_meter<WorkgroupSize, BinCount, HistogramAccessor, SharedAccessor, TexAccessor>;
+
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkgroupSize = WorkgroupConfig::WorkgroupSize;
+    NBL_CONSTEXPR_STATIC_INLINE uint32_t ScanItemsPerInvoc = WorkgroupConfig::ItemsPerInvocation_0;
+    using proxy_data_t = vector<int_t, ScanItemsPerInvoc>;
+    using proxy_t = impl::data_proxy<proxy_data_t>;
+
+    using this_t = median_meter<WorkgroupConfig, BinCount, HistogramAccessor, SharedAccessor, TexAccessor, device_capabilities>;
 
     static this_t create(float_t lumaMin, float_t lumaMax, float_t lowerBoundPercentile, float_t upperBoundPercentile)
     {
@@ -40,10 +67,12 @@ struct median_meter
         return retval;
     }
 
-    int_t __inclusive_scan(float_t value, NBL_REF_ARG(SharedAccessor) sdata)
+    void __inclusive_scan(NBL_REF_ARG(proxy_t) data, NBL_REF_ARG(SharedAccessor) sdata)
     {
-        return workgroup::inclusive_scan < plus < int_t >, WorkgroupSize >::
-            template __call <SharedAccessor>(value, sdata);
+        // return workgroup::inclusive_scan < plus < int_t >, WorkgroupSize >::
+        //     template __call <SharedAccessor>(value, sdata);
+        workgroup2::inclusive_scan< WorkgroupConfig, plus<int_t>, device_capabilities >::
+            template __call<proxy_t, SharedAccessor>(data, sdata);
     }
 
     float_t __computeLuma(
@@ -70,9 +99,8 @@ struct median_meter
     {
         uint32_t tid = workgroup::SubgroupContiguousIndex();
         
-        for (uint32_t vid = tid; vid < BinCount; vid += WorkgroupSize) {
-            sdata.set(vid, 0);
-        }
+        for (uint32_t vid = tid; vid < BinCount; vid += WorkgroupSize)
+            sdata.template set<uint32_t,uint32_t>(vid, 0u);
 
         sdata.workgroupExecutionAndMemoryBarrier();
 
@@ -89,13 +117,15 @@ struct median_meter
 
         sdata.workgroupExecutionAndMemoryBarrier();
 
-        int_t histogram_value;
-        sdata.get(tid, histogram_value);
+        proxy_t histogram_data;
+        NBL_UNROLL for (uint32_t i = 0; i < ScanItemsPerInvoc; i++)
+            sdata.template get<uint32_t,uint32_t>(tid * ScanItemsPerInvoc + i, histogram_data.data[i]);
 
         sdata.workgroupExecutionAndMemoryBarrier();
 
-        int_t sum = __inclusive_scan(histogram_value, sdata);
-        histo.atomicAdd(tid, sum);
+        __inclusive_scan(histogram_data, sdata);
+        NBL_UNROLL for (uint32_t i = 0; i < ScanItemsPerInvoc; i++)
+            histo.atomicAdd(tid * ScanItemsPerInvoc + i, histogram_data.data[i]);
     }
 
     float_t gatherLuma(
@@ -105,12 +135,8 @@ struct median_meter
     {
         uint32_t tid = workgroup::SubgroupContiguousIndex();
 
-        for (uint32_t vid = tid; vid < BinCount; vid += WorkgroupSize) {
-            sdata.set(
-                vid,
-                histo.get(vid)
-            );
-        }
+        for (uint32_t vid = tid; vid < BinCount; vid += WorkgroupSize)
+            sdata.template set<uint32_t,uint32_t>(vid, histo.get(vid));
         sdata.workgroupExecutionAndMemoryBarrier();
 
         int_t lower, upper;
@@ -123,7 +149,7 @@ struct median_meter
             while (lo < hi)
             {
                 uint32_t mid = lo + (hi - lo) / 2;
-                sdata.get(mid, v);
+                sdata.template get<uint32_t,uint32_t>(mid, v);
                 if (lowerPercentile <= v)
                     hi = mid;
                 else
@@ -141,7 +167,7 @@ struct median_meter
             while (lo < hi)
             {
                 uint32_t mid = lo + (hi - lo) / 2;
-                sdata.get(mid, v);
+                sdata.template get<uint32_t,uint32_t>(mid, v);
                 if (upperPercentile >= v)
                     lo = mid + 1;
                 else

From c7442832ba98b36eca30be4f1ebf569d04319564 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 6 Feb 2026 15:02:18 +0700
Subject: [PATCH 49/56] removed commented out code

---
 include/nbl/builtin/hlsl/luma_meter/histogram.hlsl | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
index 61f662cb06..fcfc06ed5c 100644
--- a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
@@ -69,8 +69,6 @@ struct median_meter
 
     void __inclusive_scan(NBL_REF_ARG(proxy_t) data, NBL_REF_ARG(SharedAccessor) sdata)
     {
-        // return workgroup::inclusive_scan < plus < int_t >, WorkgroupSize >::
-        //     template __call <SharedAccessor>(value, sdata);
         workgroup2::inclusive_scan< WorkgroupConfig, plus<int_t>, device_capabilities >::
             template __call<proxy_t, SharedAccessor>(data, sdata);
     }

From 79fad2f7d16ba75347c94a8b4327042b4d786547 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Fri, 6 Feb 2026 15:11:09 +0700
Subject: [PATCH 50/56] latest example

---
 examples_tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples_tests b/examples_tests
index 5e27920875..77ec3d54bb 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 5e279208751e882805a62f12e1ba86f6389e4954
+Subproject commit 77ec3d54bba6bd54236b14fc9f1a105f5ca562ff

From fe2b5bcd16e84b1e7ded41475ba4a6e8ab953819 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 9 Feb 2026 15:04:54 +0700
Subject: [PATCH 51/56] fixes converting thread to image uv

---
 include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl | 5 ++---
 include/nbl/builtin/hlsl/luma_meter/histogram.hlsl | 5 ++---
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
index e8a8b7b15c..40139d2863 100644
--- a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
@@ -114,8 +114,7 @@ struct geom_meter
         NBL_REF_ARG(ValueAccessor) val,
         NBL_REF_ARG(TexAccessor) tex,
         NBL_REF_ARG(SharedAccessor) sdata,
-        float_t2 tileOffset,
-        float_t2 viewportSize
+        float_t2 tileOffset
     )
     {
         uint32_t tid = workgroup::SubgroupContiguousIndex();
@@ -123,7 +122,7 @@ struct geom_meter
         mc.value = tid;
         uint32_t2 coord = _static_cast<uint32_t2>(mc);
 
-        float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
+        float_t2 shiftedCoord = tileOffset + float32_t2(coord);
         float_t lumaLog2 = __computeLumaLog2(window, tex, shiftedCoord);
         lumaLog2 = (lumaLog2 - log2(lumaMin)) / log2(lumaMax / lumaMin);
 
diff --git a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
index fcfc06ed5c..0e71b46925 100644
--- a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
@@ -91,8 +91,7 @@ struct median_meter
         NBL_REF_ARG(HistogramAccessor) histo,
         NBL_REF_ARG(TexAccessor) tex,
         NBL_REF_ARG(SharedAccessor) sdata,
-        float_t2 tileOffset,
-        float_t2 viewportSize
+        float_t2 tileOffset
     )
     {
         uint32_t tid = workgroup::SubgroupContiguousIndex();
@@ -106,7 +105,7 @@ struct median_meter
         mc.value = tid;
         uint32_t2 coord = _static_cast<uint32_t2>(mc);
 
-        float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
+        float_t2 shiftedCoord = tileOffset + float32_t2(coord);
         float_t luma = __computeLuma(window, tex, shiftedCoord);
 
         float_t scaledLogLuma = log2(luma / lumaMin) / log2(lumaMax / lumaMin);

From da066716735e81905623f81fb7effb98ed1b0dca Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 9 Feb 2026 15:07:15 +0700
Subject: [PATCH 52/56] latest example

---
 examples_tests | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples_tests b/examples_tests
index 77ec3d54bb..d104945cb3 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 77ec3d54bba6bd54236b14fc9f1a105f5ca562ff
+Subproject commit d104945cb3c41e89c20dc60135faecbd0778ed83

From ec58514620a2d0de5055d7ec5048e08caea765d1 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Mon, 9 Feb 2026 17:00:20 +0700
Subject: [PATCH 53/56] fix histogram metering percentiles by using sample
 count instead

---
 examples_tests                                     |  2 +-
 include/nbl/builtin/hlsl/luma_meter/histogram.hlsl | 10 ++++------
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/examples_tests b/examples_tests
index d104945cb3..07ad5db796 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit d104945cb3c41e89c20dc60135faecbd0778ed83
+Subproject commit 07ad5db7968fbab38a3fed4b93e97d17504f1b83
diff --git a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
index 0e71b46925..e86068ca87 100644
--- a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
@@ -139,7 +139,6 @@ struct median_meter
         int_t lower, upper;
         if (tid == 0)
         {
-            const uint32_t lowerPercentile = uint32_t(BinCount * lowerBoundPercentile);
             uint32_t lo = 0u;
             uint32_t hi = BinCount;
             int_t v;
@@ -147,7 +146,7 @@ struct median_meter
             {
                 uint32_t mid = lo + (hi - lo) / 2;
                 sdata.template get<uint32_t,uint32_t>(mid, v);
-                if (lowerPercentile <= v)
+                if (lowerBoundPercentile <= v)
                     hi = mid;
                 else
                     lo = mid + 1;
@@ -157,7 +156,6 @@ struct median_meter
         }
         if (tid == 1)
         {
-            const uint32_t upperPercentile = uint32_t(BinCount * upperBoundPercentile);
             uint32_t lo = 0u;
             uint32_t hi = BinCount;
             int_t v;
@@ -165,7 +163,7 @@ struct median_meter
             {
                 uint32_t mid = lo + (hi - lo) / 2;
                 sdata.template get<uint32_t,uint32_t>(mid, v);
-                if (upperPercentile >= v)
+                if (upperBoundPercentile >= v)
                     lo = mid + 1;
                 else
                     hi = mid;
@@ -183,8 +181,8 @@ struct median_meter
 
     float_t lumaMin;
     float_t lumaMax;
-    float_t lowerBoundPercentile;
-    float_t upperBoundPercentile;
+    int_t lowerBoundPercentile;
+    int_t upperBoundPercentile;
 };
 
 }

From 93582da0a8ad93719ce32ca5ab92e27d6fe9dfc0 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Tue, 10 Feb 2026 15:39:23 +0700
Subject: [PATCH 54/56] removed sample count from average metering

---
 include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
index 40139d2863..feeb260a3f 100644
--- a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
@@ -52,12 +52,11 @@ struct geom_meter
 
     NBL_CONSTEXPR_STATIC_INLINE uint32_t MaxWorkgroupIncrement = 0x1000u;
 
-    static this_t create(float_t lumaMin, float_t lumaMax, float_t sampleCount, float_t rcpFirstPassWGCount)
+    static this_t create(float_t lumaMin, float_t lumaMax, float_t rcpFirstPassWGCount)
     {
         this_t retval;
         retval.lumaMin = lumaMin;
         retval.lumaMax = lumaMax;
-        retval.sampleCount = sampleCount;
         retval.rcpFirstPassWGCount = rcpFirstPassWGCount;
         return retval;
     }
@@ -157,7 +156,6 @@ struct geom_meter
 
     float_t lumaMin;
     float_t lumaMax;
-    float_t sampleCount;
     float_t rcpFirstPassWGCount;
 };
 

From 86fba58bcf128ed4ec8147352af56c4cef3d5ac7 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 12 Feb 2026 14:53:40 +0700
Subject: [PATCH 55/56] luma meter push constants in common

---
 examples_tests                                |  2 +-
 .../nbl/builtin/hlsl/luma_meter/common.hlsl   | 26 +++++++++++++++++++
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/examples_tests b/examples_tests
index 07ad5db796..9482f15f67 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 07ad5db7968fbab38a3fed4b93e97d17504f1b83
+Subproject commit 9482f15f6730176bc5910d89d9b91d8ac0ccaa13
diff --git a/include/nbl/builtin/hlsl/luma_meter/common.hlsl b/include/nbl/builtin/hlsl/luma_meter/common.hlsl
index 55d1713619..e469e1103d 100644
--- a/include/nbl/builtin/hlsl/luma_meter/common.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/common.hlsl
@@ -28,6 +28,32 @@ struct MeteringWindow
 	}
 };
 
+struct GeomMeanParameters // parameters for the geometric-mean meter; embedded in PushConstants as `meanParams`
+{
+	float32_t rcpFirstPassWGCount; // presumably 1 / (first-pass workgroup count), matching geom_meter's same-named member — TODO confirm against caller
+};
+
+struct HistogramParameters // parameters for the histogram meter; embedded in PushConstants as `histoParams`
+{
+	uint32_t lowerBoundPercentile; // NOTE(review): median_meter keeps its same-named bound as int_t and compares it directly with scanned histogram values, so this looks like a sample count rather than a fraction — confirm
+    uint32_t upperBoundPercentile; // upper counterpart of lowerBoundPercentile; same unit assumption applies — confirm
+};
+
+struct PushConstants // one block holding the shared metering inputs plus the per-meter parameter structs
+{
+    MeteringWindow window; // metering window; the meters read its meteringWindowScale / meteringWindowOffset to turn coordinates into UVs
+    float32_t lumaMin; // presumably forwarded to the meters' create(), where sampled luma is clamped to [lumaMin, lumaMax] — confirm
+    float32_t lumaMax; // upper end of the luma range (see lumaMin)
+    uint32_t2 viewportSize; // NOTE(review): the meters' sampleLuma no longer takes viewportSize — confirm who still consumes this
+    float32_t2 exposureAdaptationFactors; // not consumed by the meter headers visible here; presumably used by an adaptation pass — TODO confirm
+    uint64_t pLumaMeterBuf; // 64-bit value; presumably a buffer device address — TODO confirm
+    uint64_t pLastFrameEVBuf; // 64-bit value; presumably a buffer device address — TODO confirm
+    uint64_t pCurrFrameEVBuf;    // 64-bit value; presumably a buffer device address — TODO confirm
+
+    GeomMeanParameters meanParams; // geometric-mean meter parameters
+	HistogramParameters histoParams;    // histogram meter parameters
+};
+
 }
 }
 }

From 7c4b40160bd1940acfa30ab52be6f7ffc32948f2 Mon Sep 17 00:00:00 2001
From: keptsecret <sorchon@gmail.com>
Date: Thu, 12 Feb 2026 15:48:14 +0700
Subject: [PATCH 56/56] precompute log2 values, minor changes to functions

---
 examples_tests                                |  2 +-
 .../builtin/hlsl/luma_meter/geom_mean.hlsl    | 44 +++++++++----------
 .../builtin/hlsl/luma_meter/histogram.hlsl    | 27 +++++++-----
 3 files changed, 37 insertions(+), 36 deletions(-)

diff --git a/examples_tests b/examples_tests
index 9482f15f67..ca0e50e78e 160000
--- a/examples_tests
+++ b/examples_tests
@@ -1 +1 @@
-Subproject commit 9482f15f6730176bc5910d89d9b91d8ac0ccaa13
+Subproject commit ca0e50e78e58e9c0472c779af5e37dcf96a88f12
diff --git a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
index feeb260a3f..40d706351a 100644
--- a/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/geom_mean.hlsl
@@ -57,6 +57,8 @@ struct geom_meter
         this_t retval;
         retval.lumaMin = lumaMin;
         retval.lumaMax = lumaMax;
+        retval.log2LumaMin = log2(lumaMin);
+        retval.log2LumaRange = log2(lumaMax) - retval.log2LumaMin;
         retval.rcpFirstPassWGCount = rcpFirstPassWGCount;
         return retval;
     }
@@ -70,12 +72,12 @@ struct geom_meter
     float_t __computeLumaLog2(
         NBL_CONST_REF_ARG(MeteringWindow) window,
         NBL_REF_ARG(TexAccessor) tex,
-        float_t2 shiftedCoord
+        const float_t2 shiftedCoord
     )
     {
-        float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
-        float_t3 color = tex.get(uvPos);
-        float_t luma = (float_t)TexAccessor::toXYZ(color);
+        const float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
+        const float_t3 color = tex.get(uvPos);
+        float_t luma = TexAccessor::toXYZ(color);
 
         luma = clamp(luma, lumaMin, lumaMax);
 
@@ -84,36 +86,31 @@ struct geom_meter
 
     void __uploadFloat(
         NBL_REF_ARG(ValueAccessor) val_accessor,
-        float_t val,
-        float_t minLog2,
-        float_t rangeLog2
+        float_t val // value to accumulate; converted to fixed point below before the atomicAdd
     )
     {
         const uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
         const uint32_t3 workgroupID = glsl::gl_WorkGroupID();
         const uint32_t index = (workgroupID.y * workGroupCount.x + workgroupID.x) & (SubgroupSize - 1u);
-        uint32_t lumaVal = uint32_t(val / float_t(WorkgroupSize) * float_t(MaxWorkgroupIncrement) + 0.5);
+        const uint32_t lumaVal = uint32_t(val / float_t(WorkgroupSize) * float_t(MaxWorkgroupIncrement) + 0.5); // divide by WorkgroupSize, scale by MaxWorkgroupIncrement; the +0.5 before the cast rounds non-negative values to nearest
         val_accessor.atomicAdd(index, lumaVal);
     }
 
     float_t __downloadFloat(
         NBL_REF_ARG(ValueAccessor) val_accessor,
-        uint32_t index,
-        float_t minLog2,
-        float_t rangeLog2
+        uint32_t index // slot read from val_accessor; the visible caller passes gl_SubgroupInvocationID()
     )
     {
         uint32_t lumaVal = val_accessor.get(index);
         lumaVal = glsl::subgroupAdd(lumaVal);
-        return float_t(lumaVal) / float_t(MaxWorkgroupIncrement) * rcpFirstPassWGCount * rangeLog2 + minLog2;
+        return float_t(lumaVal) / float_t(MaxWorkgroupIncrement) * rcpFirstPassWGCount * log2LumaRange + log2LumaMin; // undo the MaxWorkgroupIncrement scaling, apply rcpFirstPassWGCount, then denormalize with the log2LumaRange / log2LumaMin members set in create()
     }
 
     void sampleLuma(
         NBL_CONST_REF_ARG(MeteringWindow) window,
         NBL_REF_ARG(ValueAccessor) val,
         NBL_REF_ARG(TexAccessor) tex,
-        NBL_REF_ARG(SharedAccessor) sdata,
-        float_t2 tileOffset
+        NBL_REF_ARG(SharedAccessor) sdata
     )
     {
         uint32_t tid = workgroup::SubgroupContiguousIndex();
@@ -121,9 +118,10 @@ struct geom_meter
         mc.value = tid;
         uint32_t2 coord = _static_cast<uint32_t2>(mc);
 
-        float_t2 shiftedCoord = tileOffset + float32_t2(coord);
+        const float_t2 tileOffset = float32_t2((glsl::gl_WorkGroupID() * SubgroupSize).xy);
+        const float_t2 shiftedCoord = tileOffset + float32_t2(coord);
         float_t lumaLog2 = __computeLumaLog2(window, tex, shiftedCoord);
-        lumaLog2 = (lumaLog2 - log2(lumaMin)) / log2(lumaMax / lumaMin);
+        lumaLog2 = (lumaLog2 - log2LumaMin) / log2LumaRange;
 
         proxy_t data;
         data.data = lumaLog2;
@@ -132,9 +130,7 @@ struct geom_meter
         if (tid == 0) {
             __uploadFloat(
                 val,
-                lumaLog2Sum,
-                log2(lumaMin),
-                log2(lumaMax / lumaMin)
+                lumaLog2Sum
             );
         }
     }
@@ -143,12 +139,10 @@ struct geom_meter
         NBL_REF_ARG(ValueAccessor) val
     )
     {
-        uint32_t tid = glsl::gl_SubgroupInvocationID();
-        float_t luma = __downloadFloat(
+        const uint32_t tid = glsl::gl_SubgroupInvocationID();
+        const float_t luma = __downloadFloat(
                 val,
-                tid,
-                log2(lumaMin),
-                log2(lumaMax / lumaMin)
+                tid
             );
 
         return luma;
@@ -156,6 +150,8 @@ struct geom_meter
 
     float_t lumaMin;
     float_t lumaMax;
+    float_t log2LumaMin;
+    float_t log2LumaRange;
     float_t rcpFirstPassWGCount;
 };
 
diff --git a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
index e86068ca87..eb9672b15e 100644
--- a/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
+++ b/include/nbl/builtin/hlsl/luma_meter/histogram.hlsl
@@ -51,6 +51,7 @@ struct median_meter
     using float_t3 = typename conditional<is_same_v<float_t, float32_t>, float32_t3, float16_t3>::type;
 
     NBL_CONSTEXPR_STATIC_INLINE uint32_t WorkgroupSize = WorkgroupConfig::WorkgroupSize;
+    NBL_CONSTEXPR_STATIC_INLINE uint16_t SubgroupSize = WorkgroupConfig::SubgroupSize;
     NBL_CONSTEXPR_STATIC_INLINE uint32_t ScanItemsPerInvoc = WorkgroupConfig::ItemsPerInvocation_0;
     using proxy_data_t = vector<int_t, ScanItemsPerInvoc>;
     using proxy_t = impl::data_proxy<proxy_data_t>;
@@ -62,6 +63,8 @@ struct median_meter
         this_t retval;
         retval.lumaMin = lumaMin;
         retval.lumaMax = lumaMax;
+        retval.log2LumaMin = log2(lumaMin);
+        retval.log2LumaRange = log2(lumaMax) - retval.log2LumaMin;
         retval.lowerBoundPercentile = lowerBoundPercentile;
         retval.upperBoundPercentile = upperBoundPercentile;
         return retval;
@@ -76,12 +79,12 @@ struct median_meter
     float_t __computeLuma(
         NBL_CONST_REF_ARG(MeteringWindow) window,
         NBL_REF_ARG(TexAccessor) tex,
-        float_t2 shiftedCoord
+        const float_t2 shiftedCoord
     )
     {
-        float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
-        float_t3 color = tex.get(uvPos);
-        float_t luma = (float_t)TexAccessor::toXYZ(color);
+        const float_t2 uvPos = shiftedCoord * window.meteringWindowScale + window.meteringWindowOffset;
+        const float_t3 color = tex.get(uvPos);
+        const float_t luma = TexAccessor::toXYZ(color);
 
         return clamp(luma, lumaMin, lumaMax);
     }
@@ -90,8 +93,7 @@ struct median_meter
         NBL_CONST_REF_ARG(MeteringWindow) window,
         NBL_REF_ARG(HistogramAccessor) histo,
         NBL_REF_ARG(TexAccessor) tex,
-        NBL_REF_ARG(SharedAccessor) sdata,
-        float_t2 tileOffset
+        NBL_REF_ARG(SharedAccessor) sdata
     )
     {
         uint32_t tid = workgroup::SubgroupContiguousIndex();
@@ -105,11 +107,12 @@ struct median_meter
         mc.value = tid;
         uint32_t2 coord = _static_cast<uint32_t2>(mc);
 
-        float_t2 shiftedCoord = tileOffset + float32_t2(coord);
-        float_t luma = __computeLuma(window, tex, shiftedCoord);
+        const float_t2 tileOffset = float32_t2((glsl::gl_WorkGroupID() * SubgroupSize).xy);
+        const float_t2 shiftedCoord = tileOffset + float32_t2(coord);
+        const float_t luma = __computeLuma(window, tex, shiftedCoord);
 
-        float_t scaledLogLuma = log2(luma / lumaMin) / log2(lumaMax / lumaMin);
-        uint32_t binIndex = int_t(scaledLogLuma * float_t(BinCount-1u) + 0.5);
+        const float_t scaledLogLuma = (log2(luma) - log2LumaMin) / log2LumaRange;
+        const uint32_t binIndex = int_t(scaledLogLuma * float_t(BinCount-1u) + 0.5);
         sdata.atomicAdd(binIndex, 1u);
 
         sdata.workgroupExecutionAndMemoryBarrier();
@@ -176,11 +179,13 @@ struct median_meter
         lower = workgroup::Broadcast(lower, sdata, 0);
         upper = workgroup::Broadcast(upper, sdata, 1);
 
-        return ((float_t(lower) + float_t(upper)) * 0.5 / float_t(BinCount-1u)) * log2(lumaMax/lumaMin) + log2(lumaMin);
+        return ((float_t(lower) + float_t(upper)) * 0.5 / float_t(BinCount-1u)) * log2LumaRange + log2LumaMin;
     }
 
     float_t lumaMin;
     float_t lumaMax;
+    float_t log2LumaMin;
+    float_t log2LumaRange;
     int_t lowerBoundPercentile;
     int_t upperBoundPercentile;
 };