diff --git a/.github/workflows/advanced_gltf_example_ci.yml b/.github/workflows/advanced_gltf_example_ci.yml
new file mode 100644
index 000000000..870445bec
--- /dev/null
+++ b/.github/workflows/advanced_gltf_example_ci.yml
@@ -0,0 +1,343 @@
+name: Advanced glTF Example CI
+
+on:
+  workflow_dispatch:  # Manual runs from the Actions tab.
+  pull_request:
+    paths:  # Same filter as `push`: the example, the engine it builds on, and this workflow.
+      - "attachments/advanced_gltf/**"
+      - "attachments/simple_engine/**"
+      - ".github/workflows/advanced_gltf_example_ci.yml"
+  push:
+    paths:
+      - "attachments/advanced_gltf/**"
+      - "attachments/simple_engine/**"
+      - ".github/workflows/advanced_gltf_example_ci.yml"
+
+jobs:
+  build:  # Configures and builds the example once per OS in the matrix below.
+    name: Build (${{ matrix.os }})
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false  # Keep the other OS running when one of them fails.
+      matrix:
+        os: [ubuntu-latest, windows-latest]
+
+    defaults:
+      run:
+        working-directory: attachments/advanced_gltf  # Relative paths in `run:` steps resolve from here.
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      # -----------------------------------------------------------------------
+      # Linux toolchain
+      # -----------------------------------------------------------------------
+      - name: Install Clang + Ninja + ccache (Linux)
+        if: runner.os == 'Linux'
+        shell: bash
+        run: |  # Refresh the apt index, then install the compiler, build tool, compiler cache and SPIR-V tools.
+          set -euo pipefail
+          linux_packages=(clang ninja-build ccache spirv-tools)
+          sudo apt-get update; sudo apt-get install --yes "${linux_packages[@]}"
+
+      - name: Select Clang toolchain (Linux)
+        if: runner.os == 'Linux'
+        shell: bash
+        run: |
+          # Persist the compiler choice for every later step of this job.
+          printf '%s\n' "CC=clang" "CXX=clang++" >> "$GITHUB_ENV"
+
+      # -----------------------------------------------------------------------
+      # Windows toolchain
+      # -----------------------------------------------------------------------
+      - name: Set up MSVC dev environment (Windows)  # Third-party action; see its README for what it exports.
+        if: runner.os == 'Windows'
+        uses: ilammy/msvc-dev-cmd@v1
+
+      - name: Set up Ninja + sccache (Windows)
+        if: runner.os == 'Windows'
+        shell: pwsh
+        run: |  # Install both tools with Chocolatey, then export the choco bin dir (PATH) and SCCACHE_DIR to later steps.
+          choco install -y ninja sccache
+          $chocoBin = "C:\ProgramData\chocolatey\bin"
+          if (Test-Path $chocoBin) {
+            $chocoBin | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          }
+          "SCCACHE_DIR=$env:LOCALAPPDATA\Mozilla\sccache" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+
+      # -----------------------------------------------------------------------
+      # Compiler cache
+      # -----------------------------------------------------------------------
+      - name: ccache (Linux)
+        if: runner.os == 'Linux'
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/ccache
+          key: ${{ runner.os }}-advanced-gltf-ccache-${{ github.sha }}  # Exact key changes with every commit.
+          restore-keys: ${{ runner.os }}-advanced-gltf-ccache-  # Prefix fallback used when the exact key misses.
+
+      - name: sccache (Windows)
+        if: runner.os == 'Windows'
+        uses: actions/cache@v4
+        with:
+          path: ${{ env.SCCACHE_DIR }}  # Exported by the "Set up Ninja + sccache (Windows)" step.
+          key: ${{ runner.os }}-advanced-gltf-sccache-${{ github.sha }}
+          restore-keys: ${{ runner.os }}-advanced-gltf-sccache-
+
+      # -----------------------------------------------------------------------
+      # Vulkan SDK — provides slangc + headers; reuses same cache keys as
+      # simple_engine_ci so the two workflows share cached SDK tarballs.
+      # -----------------------------------------------------------------------
+      - name: Cache Vulkan SDK (Windows)
+        if: runner.os == 'Windows'
+        id: cache-vulkan-windows  # The install step reads this step's `cache-hit` output.
+        uses: actions/cache@v4
+        with:
+          path: C:\VulkanSDK
+          key: ${{ runner.os }}-vulkan-sdk  # Static key: a saved SDK is reused until the cache entry disappears.
+
+      - name: Install Vulkan SDK (Windows)
+        if: runner.os == 'Windows'
+        shell: pwsh
+        run: |  # On a cache miss: download and run the installer. Always: export VULKAN_SDK, CMake hints and the SDK Bin dir.
+          $ErrorActionPreference = 'Stop'
+          if ("${{ steps.cache-vulkan-windows.outputs.cache-hit }}" -ne "true") {
+            choco install -y aria2
+            $installer = Join-Path $env:TEMP "vulkan-sdk.exe"
+            aria2c --split=8 --max-connection-per-server=8 --min-split-size=1M `
+              --dir="$env:TEMP" --out="vulkan-sdk.exe" `
+              "https://sdk.lunarg.com/sdk/download/latest/windows/vulkan-sdk.exe"
+            Start-Process -FilePath $installer `
+              -ArgumentList "--accept-licenses --default-answer --confirm-command install" `
+              -Wait -NoNewWindow
+          }
+          $vulkanPath = Get-ChildItem "C:\VulkanSDK" |
+            Sort-Object Name -Descending | Select-Object -First 1 -ExpandProperty FullName
+          "VULKAN_SDK=$vulkanPath"                  | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+          "$vulkanPath\Bin"                          | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          "CMAKE_PREFIX_PATH=$vulkanPath"            | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+          "Vulkan_INCLUDE_DIR=$vulkanPath\Include"   | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+          "Vulkan_LIBRARY=$vulkanPath\Lib\vulkan-1.lib" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+
+      - name: Cache Vulkan SDK (Linux)
+        if: runner.os == 'Linux'
+        id: cache-vulkan-linux  # The install step reads this step's `cache-hit` output.
+        uses: actions/cache@v4
+        with:
+          path: ${{ runner.temp }}/VulkanSDK  # Same directory the install step extracts the tarball into.
+          key: ${{ runner.os }}-vulkan-sdk  # Static key: a saved SDK is reused until the cache entry disappears.
+
+      - name: Install Vulkan SDK (Linux)
+        if: runner.os == 'Linux'
+        shell: bash
+        run: |  # On a cache miss: download + extract the SDK. Always: export SDK paths, CMake hints and loader paths.
+          set -euo pipefail
+          SDK_DIR="${RUNNER_TEMP}/VulkanSDK"
+
+          if [ "${{ steps.cache-vulkan-linux.outputs.cache-hit }}" != "true" ]; then
+            SDK_TGZ="${RUNNER_TEMP}/vulkan-sdk.tar.xz"
+            download_ok=0
+            for url in \
+              "https://sdk.lunarg.com/sdk/download/latest/linux/vulkan-sdk.tar.xz" \
+              "https://sdk.lunarg.com/sdk/download/latest/linux/vulkansdk-linux-x86_64.tar.xz"
+            do
+              if curl -L --fail -o "$SDK_TGZ" "$url"; then
+                download_ok=1; break
+              fi
+            done
+            [ "$download_ok" -eq 1 ] || { echo "Vulkan SDK download failed" >&2; exit 1; }
+            rm -rf "$SDK_DIR"; mkdir -p "$SDK_DIR"
+            tar -xJf "$SDK_TGZ" -C "$SDK_DIR"
+          fi
+
+          # Version-aware sort (1.3.280 ranks above 1.3.99); fail fast when no SDK directory exists.
+          VULKAN_SDK_PATH="$(find "$SDK_DIR" -maxdepth 1 -type d -name '1.*' | sort -rV | head -n 1)"
+          [ -n "$VULKAN_SDK_PATH" ] || { echo "No Vulkan SDK version directory found in $SDK_DIR" >&2; exit 1; }
+          SDK_SYSROOT="$VULKAN_SDK_PATH"
+          [ -d "$VULKAN_SDK_PATH/x86_64" ] && SDK_SYSROOT="$VULKAN_SDK_PATH/x86_64"
+          echo "VULKAN_SDK=$VULKAN_SDK_PATH"               >> "$GITHUB_ENV"
+          echo "VULKAN_SDK_SYSROOT=$SDK_SYSROOT"           >> "$GITHUB_ENV"
+          echo "CMAKE_PREFIX_PATH=$SDK_SYSROOT"            >> "$GITHUB_ENV"
+          echo "Vulkan_INCLUDE_DIR=$SDK_SYSROOT/include"   >> "$GITHUB_ENV"
+
+          for libname in libvulkan.so libvulkan.so.1; do
+            if [ -f "$SDK_SYSROOT/lib/$libname" ]; then
+              echo "Vulkan_LIBRARY=$SDK_SYSROOT/lib/$libname" >> "$GITHUB_ENV"; break
+            fi
+          done
+
+          [ -d "$VULKAN_SDK_PATH/bin"  ] && echo "$VULKAN_SDK_PATH/bin"  >> "$GITHUB_PATH"
+          [ -d "$SDK_SYSROOT/bin"      ] && echo "$SDK_SYSROOT/bin"      >> "$GITHUB_PATH"
+          # GITHUB_PATH only affects later steps, so search the SDK bin dirs explicitly (then the existing PATH).
+          slangc_path="$(PATH="$SDK_SYSROOT/bin:$VULKAN_SDK_PATH/bin:$PATH" command -v slangc || true)"
+          if [ -n "$slangc_path" ]; then echo "SLANGC_EXECUTABLE=$slangc_path" >> "$GITHUB_ENV"; fi
+
+          compat_dir="${RUNNER_TEMP}/slang-compat"
+          mkdir -p "$compat_dir"
+          pthread_path="$(ldconfig -p | awk '/libpthread\.so\.0/{print $NF; exit 0}' || true)"
+          if [ -n "${pthread_path:-}" ] && [ -f "$pthread_path" ]; then
+            ln -sf "$pthread_path" "$compat_dir/libpthread.so"
+          fi
+          echo "LD_LIBRARY_PATH=$compat_dir:${SDK_SYSROOT}/lib:${VULKAN_SDK_PATH}/lib:${LD_LIBRARY_PATH:-}" >> "$GITHUB_ENV"
+
+      # -----------------------------------------------------------------------
+      # simple_engine system dependencies
+      # The advanced_gltf project links against SimpleEngineLib which is built
+      # from simple_engine's sources, so we need the same library dependencies.
+      # -----------------------------------------------------------------------
+      - name: Install simple_engine dependencies (Linux)
+        if: runner.os == 'Linux'
+        shell: bash
+        run: bash ../simple_engine/install_dependencies_linux.sh  # Path is relative to the job's default working directory.
+
+      - name: Set up vcpkg (Windows)
+        if: runner.os == 'Windows'
+        shell: pwsh
+        run: |  # Reuse C:\vcpkg when it exists, otherwise clone + bootstrap it; then add it to PATH and export VCPKG_ROOT.
+          $vcpkgRoot = "C:\vcpkg"
+          if (-not (Test-Path $vcpkgRoot)) {
+            git clone https://github.com/microsoft/vcpkg.git $vcpkgRoot
+            & "$vcpkgRoot\bootstrap-vcpkg.bat" -disableMetrics
+          }
+          "$vcpkgRoot" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
+          "VCPKG_ROOT=$vcpkgRoot" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+
+      - name: Install simple_engine dependencies (Windows)
+        if: runner.os == 'Windows'
+        shell: cmd
+        run: call ..\simple_engine\install_dependencies_windows.bat  # Path is relative to the job's default working directory.
+
+      # -----------------------------------------------------------------------
+      # FetchContent dependency cache (Jolt only; simple_engine deps are system-installed)
+      # -----------------------------------------------------------------------
+      - name: Cache FetchContent (Linux)
+        if: runner.os == 'Linux'
+        uses: actions/cache@v4
+        with:
+          path: attachments/advanced_gltf/build/_deps  # Repo-relative: `with:` inputs do not use defaults.run.working-directory.
+          key: ${{ runner.os }}-advanced-gltf-deps-v1  # Static key; change the -v1 suffix to start a fresh cache.
+          restore-keys: ${{ runner.os }}-advanced-gltf-deps-
+
+      - name: Cache FetchContent (Windows)
+        if: runner.os == 'Windows'
+        uses: actions/cache@v4
+        with:
+          path: attachments/advanced_gltf/build/_deps  # Same path and key scheme as the Linux step above.
+          key: ${{ runner.os }}-advanced-gltf-deps-v1
+          restore-keys: ${{ runner.os }}-advanced-gltf-deps-
+
+      # -----------------------------------------------------------------------
+      # Configure
+      # -----------------------------------------------------------------------
+      - name: Configure (Linux)
+        if: runner.os == 'Linux'
+        shell: bash
+        run: |  # Fixed arguments first; Vulkan / slangc hints are appended only when an earlier step exported them.
+          set -euo pipefail
+          cmake_args=(
+            -S . -B build -G Ninja
+            -DCMAKE_BUILD_TYPE=Release
+            -DCMAKE_C_COMPILER=clang
+            -DCMAKE_CXX_COMPILER=clang++
+            -DCMAKE_CXX_COMPILER_LAUNCHER=ccache
+            -DCMAKE_C_COMPILER_LAUNCHER=ccache
+            "-DCMAKE_PREFIX_PATH=${VULKAN_SDK_SYSROOT:-}"
+          )
+          for hint in Vulkan_INCLUDE_DIR Vulkan_LIBRARY SLANGC_EXECUTABLE; do
+            if [ -n "${!hint:-}" ]; then cmake_args+=("-D${hint}=${!hint}"); fi
+          done
+          cmake "${cmake_args[@]}"
+
+      - name: Configure (Windows)
+        if: runner.os == 'Windows'
+        shell: pwsh
+        run: |  # Optional Vulkan hints and the vcpkg toolchain file are passed only when the env vars are set.
+          $extra = @()
+          if ($env:Vulkan_INCLUDE_DIR) { $extra += "-DVulkan_INCLUDE_DIR=$env:Vulkan_INCLUDE_DIR" }
+          if ($env:Vulkan_LIBRARY)     { $extra += "-DVulkan_LIBRARY=$env:Vulkan_LIBRARY" }
+          if ($env:VCPKG_ROOT)         { $extra += "-DCMAKE_TOOLCHAIN_FILE=$env:VCPKG_ROOT/scripts/buildsystems/vcpkg.cmake" }
+
+          cmake -S . -B build -G Ninja `
+            -DCMAKE_BUILD_TYPE=Release `
+            -DCMAKE_CXX_COMPILER_LAUNCHER=sccache `
+            -DCMAKE_C_COMPILER_LAUNCHER=sccache `
+            $extra
+
+      # -----------------------------------------------------------------------
+      # Build — the tutorial executable + all shader compilation targets
+      # -----------------------------------------------------------------------
+      - name: Build  # No `shell:` key, so each runner's default shell is used.
+        run: cmake --build build --target AdvancedGLTF --parallel 4
+
+      - name: Verify shaders compiled
+        shell: bash
+        run: |  # Report every expected .spv file, then fail once at the end if any are absent.
+          set -euo pipefail
+          expected_spv=(
+            build/shaders/skinning.spv
+            build/shaders/morph_accumulate.spv
+            build/shaders/pbr_heatmap_vertex.spv
+            build/shaders/pbr_heatmap_fragment_dominant_bone.spv
+            build/shaders/pbr_heatmap_fragment_weight_distribution.spv)
+          absent=0
+          for artifact in "${expected_spv[@]}"; do
+            if [ ! -f "$artifact" ]; then
+              echo "  MISSING  $artifact" >&2
+              absent=$((absent + 1))
+            else
+              echo "  OK  $artifact"
+            fi
+          done
+          if [ "$absent" -ne 0 ]; then echo "$absent shader(s) missing" >&2; exit 1; fi
+
+      - name: Cache stats
+        if: always()
+        shell: bash
+        run: |  # Best effort: print statistics for whichever compiler cache is installed; never fail the job.
+          if command -v ccache  >/dev/null 2>&1; then ccache -s  || true; fi
+          if command -v sccache >/dev/null 2>&1; then sccache -s || true; fi
+
+  # ---------------------------------------------------------------------------
+  # Validate Python asset tool (no deps beyond stdlib)
+  # ---------------------------------------------------------------------------
+  validate-python-tool:  # Runs the asset annotation script on a sample model and checks its output.
+    name: Validate add_physics_extras.py
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Syntax check
+        run: python3 -m py_compile attachments/advanced_gltf/assets/add_physics_extras.py
+
+      - name: Download SimpleSkin and annotate
+        shell: bash
+        run: |
+          set -euo pipefail
+          # Fetch a minimal CC0 glTF that has a skin (SimpleSkin from Khronos samples)
+          curl -fsSL \
+            "https://raw.githubusercontent.com/KhronosGroup/glTF-Sample-Assets/main/Models/SimpleSkin/glTF/SimpleSkin.gltf" \
+            -o /tmp/SimpleSkin.gltf
+
+          python3 attachments/advanced_gltf/assets/add_physics_extras.py \
+            /tmp/SimpleSkin.gltf \
+            /tmp/SimpleSkin_physics.gltf
+
+          # Verify extras were written to every skin joint; an output without joints is a failure, not a pass
+          python3 -c "
+          import json, sys
+          with open('/tmp/SimpleSkin_physics.gltf') as f:
+              g = json.load(f)
+          joints = {j for skin in g.get('skins', []) for j in skin.get('joints', [])}
+          if not joints:
+              sys.exit('No skin joints found in annotated glTF; nothing was verified')
+          for idx in sorted(joints):
+              node = g['nodes'][idx]
+              extras = node.get('extras', {})
+              if 'collider' not in extras:
+                  sys.exit(f'Node {idx} ({node.get(\"name\",\"?\")}) missing collider extras')
+              if 'constraint' not in extras:
+                  sys.exit(f'Node {idx} ({node.get(\"name\",\"?\")}) missing constraint extras')
+          print(f'All {len(joints)} joint(s) annotated correctly.')
+          "
diff --git a/antora/modules/ROOT/nav.adoc b/antora/modules/ROOT/nav.adoc
index 89c2dc6fd..bc1b2d327 100644
--- a/antora/modules/ROOT/nav.adoc
+++ b/antora/modules/ROOT/nav.adoc
@@ -149,3 +149,51 @@
 *** xref:Building_a_Simple_Engine/Advanced_Topics/Robustness2.adoc[Robustness2]
 ** Appendix
 *** xref:Building_a_Simple_Engine/Appendix/appendix.adoc[Appendix]
+
+* Advanced glTF
+** xref:Advanced_glTF/introduction.adoc[Introduction]
+** Scene Graph Hierarchy
+*** xref:Advanced_glTF/Scene_Graph_Hierarchy/01_introduction.adoc[Introduction]
+*** xref:Advanced_glTF/Scene_Graph_Hierarchy/02_engine_expansion.adoc[Engine expansion]
+*** xref:Advanced_glTF/Scene_Graph_Hierarchy/03_physics_syncing.adoc[Physics syncing]
+*** xref:Advanced_glTF/Scene_Graph_Hierarchy/04_metadata_and_physics_extras.adoc[Metadata & physics extras]
+*** xref:Advanced_glTF/Scene_Graph_Hierarchy/05_conclusion.adoc[Conclusion]
+** Skeletal Compute Skinning
+*** xref:Advanced_glTF/Skeletal_Compute_Skinning/01_introduction.adoc[Introduction]
+*** xref:Advanced_glTF/Skeletal_Compute_Skinning/02_skinning_math.adoc[The mathematics of skinning]
+*** xref:Advanced_glTF/Skeletal_Compute_Skinning/03_compute_skinning.adoc[Compute skinning]
+*** xref:Advanced_glTF/Skeletal_Compute_Skinning/04_shared_vertex_buffer.adoc[Shared vertex buffer]
+*** xref:Advanced_glTF/Skeletal_Compute_Skinning/05_interpolation_blending.adoc[Interpolation & blending]
+*** xref:Advanced_glTF/Skeletal_Compute_Skinning/06_conclusion.adoc[Conclusion]
+** Physics Integration
+*** xref:Advanced_glTF/Physics_Integration/01_introduction.adoc[Introduction]
+*** xref:Advanced_glTF/Physics_Integration/02_bone_proxy_colliders.adoc[Bone proxy colliders]
+*** xref:Advanced_glTF/Physics_Integration/03_constraints_and_joint_limits.adoc[Constraints & joint limits]
+*** xref:Advanced_glTF/Physics_Integration/04_ragdoll_handoff.adoc[Ragdoll handoff]
+*** xref:Advanced_glTF/Physics_Integration/05_self_collision_filtering.adoc[Self-collision filtering]
+*** xref:Advanced_glTF/Physics_Integration/06_conclusion.adoc[Conclusion]
+** Procedural Animation & IK
+*** xref:Advanced_glTF/Procedural_Animation_IK/01_introduction.adoc[Introduction]
+*** xref:Advanced_glTF/Procedural_Animation_IK/02_ccd_ik.adoc[CCD inverse kinematics]
+*** xref:Advanced_glTF/Procedural_Animation_IK/03_fabrik.adoc[FABRIK]
+*** xref:Advanced_glTF/Procedural_Animation_IK/04_foot_placement.adoc[Foot placement]
+*** xref:Advanced_glTF/Procedural_Animation_IK/05_look_at.adoc[Look-at]
+*** xref:Advanced_glTF/Procedural_Animation_IK/06_physics_driven_lean.adoc[Physics-driven lean]
+*** xref:Advanced_glTF/Procedural_Animation_IK/07_conclusion.adoc[Conclusion]
+** Morph Targets & Facial Animation
+*** xref:Advanced_glTF/Morph_Targets_Facial_Animation/01_introduction.adoc[Introduction]
+*** xref:Advanced_glTF/Morph_Targets_Facial_Animation/02_shape_key_ingestion.adoc[Shape key ingestion]
+*** xref:Advanced_glTF/Morph_Targets_Facial_Animation/03_bindless_morph_buffers.adoc[Bindless morph buffers]
+*** xref:Advanced_glTF/Morph_Targets_Facial_Animation/04_conclusion.adoc[Conclusion]
+** Debugging & Visual Auditing
+*** xref:Advanced_glTF/Debugging_Visual_Auditing/01_introduction.adoc[Introduction]
+*** xref:Advanced_glTF/Debugging_Visual_Auditing/02_debug_drawers.adoc[Debug drawers]
+*** xref:Advanced_glTF/Debugging_Visual_Auditing/03_skinning_heatmaps.adoc[Skinning heatmaps]
+*** xref:Advanced_glTF/Debugging_Visual_Auditing/04_renderdoc_analysis.adoc[RenderDoc analysis]
+*** xref:Advanced_glTF/Debugging_Visual_Auditing/05_conclusion.adoc[Conclusion]
+** Tooling & Production Pipeline
+*** xref:Advanced_glTF/Tooling_Production_Pipeline/01_introduction.adoc[Introduction]
+*** xref:Advanced_glTF/Tooling_Production_Pipeline/02_blender_workflow.adoc[Blender workflow]
+*** xref:Advanced_glTF/Tooling_Production_Pipeline/03_validation.adoc[Validation]
+*** xref:Advanced_glTF/Tooling_Production_Pipeline/04_gltf_viewer_audit.adoc[glTF viewer audit]
+*** xref:Advanced_glTF/Tooling_Production_Pipeline/05_conclusion.adoc[Conclusion]
diff --git a/attachments/advanced_gltf/CMakeLists.txt b/attachments/advanced_gltf/CMakeLists.txt
new file mode 100644
index 000000000..1e9f9b8f3
--- /dev/null
+++ b/attachments/advanced_gltf/CMakeLists.txt
@@ -0,0 +1,301 @@
+# Copyright (c) 2026 Holochip Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+cmake_minimum_required(VERSION 3.29)
+project(AdvancedGLTF VERSION 1.0.0 LANGUAGES CXX C)  # C is enabled too; the source list below includes mikktspace.c.
+
+set(CMAKE_CXX_STANDARD 20)  # Default C++ standard for targets created after this point.
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# ---------------------------------------------------------------------------
+# Simple Engine — bootstrap all engine dependencies.
+#
+# IMPORTED targets created inside add_subdirectory() are local to that
+# subdirectory's scope and are NOT visible back in this parent scope.  We
+# therefore call find_package() ourselves (using simple_engine's custom Find
+# modules) so that all targets (glm::glm, tinygltf::tinygltf, etc.) are
+# created in THIS scope before SimpleEngineLib is defined.
+#
+# Cache variables set here are warm when add_subdirectory runs, so the
+# find_package() calls inside simple_engine are no-ops.  Parent-scope targets
+# ARE visible in child scopes, so the if(NOT TARGET ...) guards inside each
+# Find module skip duplicate creation.
+#
+# EXCLUDE_FROM_ALL prevents the SimpleEngine executable from building by
+# default; this project builds AdvancedGLTF instead.
+# ---------------------------------------------------------------------------
+set(SE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../simple_engine")  # Sibling engine checkout; used for sources, shaders and Find modules.
+if(NOT IS_DIRECTORY "${SE_DIR}")  # Stop configuring early with an actionable message.
+    message(FATAL_ERROR
+        "simple_engine not found at: ${SE_DIR}\n"
+        "This project must be located at attachments/advanced_gltf/ alongside attachments/simple_engine/.")
+endif()
+
+# simple_engine's custom Find modules create INTERFACE/UNKNOWN IMPORTED targets
+# in the subdirectory scope, which are NOT visible back in this parent scope.
+# We must therefore run every find_package() in our own scope, using the same
+# custom Find modules, so the resulting targets exist here.  All cache
+# variables set by these calls (KTX_LIBRARY, tinygltf_INCLUDE_DIR, etc.) are
+# visible everywhere, so when add_subdirectory later re-runs the same
+# find_package() calls they become no-ops: the cache is warm and the
+# if(NOT TARGET ...) guards inside each Find module see the targets we just
+# created (parent-scope targets ARE visible in child scopes).
+list(APPEND CMAKE_MODULE_PATH "${SE_DIR}/CMake")  # Lets find_package() pick up simple_engine's Find modules.
+
+find_package(glm       REQUIRED)
+find_package(Vulkan    REQUIRED)
+find_package(tinygltf  REQUIRED)
+find_package(KTX       REQUIRED)
+find_package(VulkanHpp REQUIRED)
+if(NOT ANDROID)  # Windowing and audio packages are only looked up for non-Android builds.
+    find_package(glfw3  REQUIRED)
+    find_package(OpenAL REQUIRED)
+endif()
+
+add_subdirectory("${SE_DIR}" simple_engine_build EXCLUDE_FROM_ALL)  # Binary dir: <build>/simple_engine_build.
+
+# ---------------------------------------------------------------------------
+# JoltPhysics v5.2.0 — physics engine used in chapters 3 and 4.
+# NOTE: The Jolt API is not stable across major versions. Check CHANGELOG.md
+#       for BodyInterface and constraint API changes before updating the tag.
+# ---------------------------------------------------------------------------
+include(FetchContent)
+FetchContent_Declare(
+    JoltPhysics
+    GIT_REPOSITORY https://github.com/jrouwe/JoltPhysics.git
+    GIT_TAG        v5.2.0
+    SOURCE_SUBDIR  Build)  # The CMakeLists.txt used for Jolt is taken from its Build/ subdirectory.
+set(DOUBLE_PRECISION                    OFF CACHE BOOL "" FORCE)  # FORCE overwrites any value already in the cache.
+set(GENERATE_DEBUG_SYMBOLS              ON  CACHE BOOL "" FORCE)
+set(PROFILER_IN_DEBUG_AND_RELEASE       ON  CACHE BOOL "" FORCE)
+set(DEBUG_RENDERER_IN_DEBUG_AND_RELEASE ON  CACHE BOOL "" FORCE)
+set(ENABLE_OBJECT_STREAM                ON  CACHE BOOL "" FORCE)
+set(FLOATING_POINT_EXCEPTIONS_ENABLED   ON  CACHE BOOL "" FORCE)
+set(INTERPROCEDURAL_OPTIMIZATION        ON  CACHE BOOL "" FORCE)  # SimpleEngineLib / AdvancedGLTF enable IPO under MSVC to match.
+set(WARNINGS_AS_ERRORS                  OFF CACHE BOOL "" FORCE)
+if(MSVC)
+    set(USE_STATIC_MSVC_RUNTIME_LIBRARY OFF CACHE BOOL "" FORCE)
+endif()
+FetchContent_MakeAvailable(JoltPhysics)  # All options above must be cached before this call configures Jolt.
+
+# ---------------------------------------------------------------------------
+# Advanced glTF shader compilation.
+# Uses SLANGC_EXECUTABLE set by simple_engine's find_program() call above.
+# Target names are prefixed 'agltf_' (local shaders) or 'se_shader_' (engine shaders) to avoid
+# colliding with the 'shaders' target already created inside simple_engine_build.
+# ---------------------------------------------------------------------------
+set(AGLTF_SHADER_DIR "${CMAKE_CURRENT_SOURCE_DIR}")  # Local .slang sources sit next to this CMakeLists.txt.
+set(AGLTF_SHADER_OUT "${CMAKE_CURRENT_BINARY_DIR}/shaders")  # Every compiled .spv file is written here.
+file(MAKE_DIRECTORY "${AGLTF_SHADER_OUT}")  # Created at configure time so slangc can write into it.
+
+function(agltf_compile_shader target_name slang_file entry_point out_stem)
+    # Creates custom target <target_name> that compiles <slang_file> to <out_stem>.spv in AGLTF_SHADER_OUT.
+    # An empty <entry_point> omits the -entry flag. Without slangc the target is an empty placeholder.
+    if(NOT SLANGC_EXECUTABLE)
+        add_custom_target("${target_name}")
+        message(STATUS "slangc not found — skipping ${out_stem}.spv")
+        return()
+    endif()
+
+    set(spirv_path "${AGLTF_SHADER_OUT}/${out_stem}.spv")
+    set(entry_flag "")
+    if(NOT "${entry_point}" STREQUAL "")
+        set(entry_flag -entry "${entry_point}")
+    endif()
+
+    add_custom_command(
+        OUTPUT  "${spirv_path}"
+        COMMAND "${SLANGC_EXECUTABLE}" "${slang_file}"
+                -target spirv -profile spirv_1_5 -emit-spirv-directly
+                ${entry_flag}
+                -o "${spirv_path}"
+                -I "${AGLTF_SHADER_DIR}"
+                -I "${SE_DIR}/shaders"
+        DEPENDS "${slang_file}"
+        COMMENT "slangc → ${out_stem}.spv"
+        VERBATIM)
+    add_custom_target("${target_name}" DEPENDS "${spirv_path}")
+endfunction()
+
+# Compile each simple_engine shader (minus the modules excluded by the filter) into our output directory
+file(GLOB SE_SHADERS "${SE_DIR}/shaders/*.slang")
+list(FILTER SE_SHADERS EXCLUDE REGEX ".*/(common_types|pbr_utils|lighting_utils|tonemapping_utils)\\.slang$")
+set(SE_SHADER_TARGETS "")
+foreach(se_shader IN LISTS SE_SHADERS)
+    get_filename_component(se_stem "${se_shader}" NAME_WE)
+    agltf_compile_shader("se_shader_${se_stem}" "${se_shader}" "" "${se_stem}")
+    list(APPEND SE_SHADER_TARGETS "se_shader_${se_stem}")
+endforeach()
+
+agltf_compile_shader(agltf_shader_skinning  # -> shaders/skinning.spv
+    "${AGLTF_SHADER_DIR}/skinning.slang"
+    main "skinning")
+
+agltf_compile_shader(agltf_shader_morph  # -> shaders/morph_accumulate.spv
+    "${AGLTF_SHADER_DIR}/morph_accumulate.slang"
+    main "morph_accumulate")
+
+agltf_compile_shader(agltf_shader_heatmap_vert  # pbr_heatmap.slang is compiled three times, once per entry point.
+    "${AGLTF_SHADER_DIR}/pbr_heatmap.slang"
+    vertex_main "pbr_heatmap_vertex")
+
+agltf_compile_shader(agltf_shader_heatmap_dominant
+    "${AGLTF_SHADER_DIR}/pbr_heatmap.slang"
+    fragment_dominant_bone "pbr_heatmap_fragment_dominant_bone")
+
+agltf_compile_shader(agltf_shader_heatmap_weights
+    "${AGLTF_SHADER_DIR}/pbr_heatmap.slang"
+    fragment_weight_distribution "pbr_heatmap_fragment_weight_distribution")
+
+add_custom_target(advanced_gltf_shaders)  # Aggregate target: building it builds every shader target listed below.
+add_dependencies(advanced_gltf_shaders  # Target-to-target ordering belongs in add_dependencies(), not DEPENDS.
+        agltf_shader_skinning
+        agltf_shader_morph
+        agltf_shader_heatmap_vert
+        agltf_shader_heatmap_dominant
+        agltf_shader_heatmap_weights
+        ${SE_SHADER_TARGETS})
+
+# ---------------------------------------------------------------------------
+# SimpleEngineLib — static library compiled from simple_engine's SOURCES_COMMON.
+#
+# simple_engine builds a desktop executable, not a linkable library. We
+# replicate the explicit SOURCES_COMMON list here so AdvancedGLTF can link
+# against engine code without modifying simple_engine's build system.
+#
+# This list MUST stay in sync with SOURCES_COMMON in simple_engine/CMakeLists.txt.
+# Do NOT replace it with file(GLOB): vulkan_dispatch.cpp lives in the same
+# directory but is intentionally absent from SOURCES_COMMON and must remain
+# excluded.
+# ---------------------------------------------------------------------------
+set(SE_SOURCES_COMMON  # Entries under CMAKE_CURRENT_SOURCE_DIR are local files; entries under SE_DIR come from simple_engine.
+    ${CMAKE_CURRENT_SOURCE_DIR}/engine.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/scene_loading.cpp
+    ${SE_DIR}/platform.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/renderer_core.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/renderer_rendering.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/renderer_pipelines.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/renderer_compute.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/renderer_utils.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/renderer_resources.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/renderer_ray_query.cpp
+    ${SE_DIR}/memory_pool.cpp
+    ${SE_DIR}/resource_manager.cpp
+    ${SE_DIR}/entity.cpp
+    ${SE_DIR}/component.cpp
+    ${SE_DIR}/transform_component.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/mesh_component.cpp
+    ${SE_DIR}/camera_component.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/animation_component.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/model_loader.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/audio_system.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/physics_system.cpp
+    ${SE_DIR}/imgui/imgui.cpp
+    ${SE_DIR}/imgui/imgui_draw.cpp
+    ${SE_DIR}/vulkan_device.cpp
+    ${SE_DIR}/pipeline.cpp
+    ${SE_DIR}/descriptor_manager.cpp
+    ${SE_DIR}/renderdoc_debug_system.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/imgui_system.cpp
+    ${SE_DIR}/mikktspace.c
+)
+
+# Resolve relative entries against ${SE_DIR}; every entry above is already absolute, so this currently changes nothing.
+set(SE_SOURCES_ABSOLUTE "")
+foreach(src ${SE_SOURCES_COMMON})
+    if(IS_ABSOLUTE "${src}")
+        list(APPEND SE_SOURCES_ABSOLUTE "${src}")
+    else()
+        list(APPEND SE_SOURCES_ABSOLUTE "${SE_DIR}/${src}")
+    endif()
+endforeach()
+set(SE_SOURCES_COMMON ${SE_SOURCES_ABSOLUTE})
+
+add_library(SimpleEngineLib STATIC ${SE_SOURCES_COMMON})
+add_dependencies(SimpleEngineLib advanced_gltf_shaders)  # Shaders are compiled before the library is built.
+set_target_properties(SimpleEngineLib PROPERTIES CXX_STANDARD 20)
+
+target_include_directories(SimpleEngineLib PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}" "${SE_DIR}")  # Local directory is listed first.
+
+target_compile_definitions(SimpleEngineLib PUBLIC  # PUBLIC: consumers of the library get the same definitions.
+    GLM_ENABLE_EXPERIMENTAL
+    _USE_MATH_DEFINES
+    VULKAN_HPP_NO_STRUCT_CONSTRUCTORS
+    VULKAN_HPP_DISPATCH_LOADER_DYNAMIC
+    PLATFORM_DESKTOP)
+
+target_link_libraries(SimpleEngineLib PUBLIC
+    Vulkan::Vulkan
+    VulkanHpp::VulkanHpp
+    glm::glm
+    tinygltf::tinygltf
+    KTX::ktx
+    Jolt)
+
+target_link_libraries(SimpleEngineLib PRIVATE  # NOTE(review): linked unconditionally although find_package() above is skipped on Android.
+    glfw
+    OpenAL::OpenAL)
+
+# Definitions shared by every Windows toolchain first, then the MSVC-only flags.
+if(WIN32)
+    # NOMINMAX needs to inherit so consumers never see the min / max macros from the Windows headers.
+    target_compile_definitions(SimpleEngineLib PUBLIC
+        NOMINMAX WIN32_LEAN_AND_MEAN _CRT_SECURE_NO_WARNINGS)
+endif()
+
+if(MSVC)
+    target_compile_options(SimpleEngineLib PUBLIC
+        /permissive- /Zc:__cplusplus /EHsc /W3 /MP /bigobj /wd4714 /wd4723)
+    target_link_libraries(SimpleEngineLib PUBLIC Dbghelp)
+
+    # Enable Link Time Code Generation to match Jolt and resolve potential LTO mismatches
+    set_target_properties(SimpleEngineLib PROPERTIES INTERPROCEDURAL_OPTIMIZATION TRUE)
+endif()
+
+# ---------------------------------------------------------------------------
+# AdvancedGLTF — tutorial demo executable
+# ---------------------------------------------------------------------------
+add_executable(AdvancedGLTF  # Source paths are relative to this directory.
+    main.cpp
+    tutorial_demo.cpp
+    physics_world_jolt.cpp)
+
+target_include_directories(AdvancedGLTF PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}")
+
+target_compile_definitions(AdvancedGLTF PRIVATE
+    ADVANCED_GLTF_REQUIRES_DESCRIPTOR_INDEXING=1)
+
+target_link_libraries(AdvancedGLTF PRIVATE
+    SimpleEngineLib
+    Jolt  # Also reachable through SimpleEngineLib's PUBLIC link interface; listed explicitly here.
+    glm::glm)
+
+if(MSVC)  # Same IPO setting as SimpleEngineLib under MSVC.
+    set_target_properties(AdvancedGLTF PROPERTIES INTERPROCEDURAL_OPTIMIZATION TRUE)
+endif()
+
+# Ensure all shader targets (simple_engine and local) are built before the executable.
+add_dependencies(AdvancedGLTF advanced_gltf_shaders)
+
+# Copy the assets directory next to the executable; the existence check runs at configure time, not build time.
+if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/assets")
+    add_custom_command(TARGET AdvancedGLTF POST_BUILD
+        COMMAND "${CMAKE_COMMAND}" -E copy_directory
+                "${CMAKE_CURRENT_SOURCE_DIR}/assets"
+                "$<TARGET_FILE_DIR:AdvancedGLTF>/assets"
+        COMMENT "Copying assets alongside executable"
+        VERBATIM)
+endif()
diff --git a/attachments/advanced_gltf/animation.h b/attachments/advanced_gltf/animation.h
new file mode 100644
index 000000000..206a69292
--- /dev/null
+++ b/attachments/advanced_gltf/animation.h
@@ -0,0 +1,118 @@
+#pragma once
+#include "node.h"
+#include <algorithm>
+#include <cstring>
+#include <string>
+#include <vector>
+#include <glm/glm.hpp>
+#include <glm/gtc/quaternion.hpp>
+
+// Canonical from appendix_types.adoc
+// Keyframe interpolation mode stored on each AnimationSampler (which defaults to LINEAR).
+enum InterpolationMode { STEP, LINEAR, CUBICSPLINE };
+
+// One animation curve: keyframe timestamps plus the values recorded at those timestamps.
+// find_keyframe() binary-searches `inputs`, so the timestamps must be sorted ascending.
+struct AnimationSampler {
+    InterpolationMode    interpolation = LINEAR;
+    std::vector<float>   inputs;          // Timestamps in seconds
+    std::vector<glm::vec4> outputs_raw;   // Packed: for CUBICSPLINE stores in_tan/value/out_tan triples
+
+    // Unpacked for CUBICSPLINE — filled at load time from outputs_raw
+    std::vector<glm::vec4> in_tangents;
+    std::vector<glm::vec4> values;
+    std::vector<glm::vec4> out_tangents;
+};
+
+// Binds one sampler to one animated property of one node.
+// None of the members has a default initializer, so they must be set before use.
+struct AnimationChannel {
+    // Which node property the channel drives.
+    enum PathType { TRANSLATION, ROTATION, SCALE, WEIGHTS };
+    PathType path;
+    uint32_t node_index;     // Index of the targeted node
+    uint32_t sampler_index;  // Index of the sampler that supplies the values
+};
+
+// Per-joint transform components stored as parallel arrays.
+// apply_pose_to_scene_graph() reads entry i of each array for the i-th joint index it is given.
+struct Pose {
+    std::vector<glm::vec3> translations;
+    std::vector<glm::quat> rotations;
+    std::vector<glm::vec3> scales;
+};
+
+// Matches the glTF skin object.
+// compute_joint_matrices() pairs joints[i] with inverse_bind_matrices[i].
+struct Skin {
+    std::string              name;
+    std::vector<uint32_t>    joints;                // Node indices
+    std::vector<glm::mat4>   inverse_bind_matrices; // One per joint
+    uint32_t                 skeleton_root = INVALID_NODE_INDEX; // INVALID_NODE_INDEX until a root is assigned
+};
+
+// Binary search for the largest keyframe index whose timestamp <= time.
+//
+// Returns 0 when the sampler has fewer than two keyframes or when `time` precedes the
+// first timestamp. For `time` at or beyond the last timestamp the result is the last
+// index, so callers must not read index + 1 without checking it against inputs.size().
+// `sampler.inputs` must be sorted ascending.
+inline uint32_t find_keyframe(const AnimationSampler& sampler, float time) {
+    if (sampler.inputs.size() < 2) return 0;
+    // upper_bound yields the first timestamp strictly greater than `time`, so the element
+    // just before it is the last one <= time. lower_bound would step back one keyframe
+    // too far whenever `time` lands exactly on a timestamp.
+    auto it  = std::upper_bound(sampler.inputs.begin(), sampler.inputs.end(), time);
+    uint32_t idx = static_cast<uint32_t>(std::distance(sampler.inputs.begin(), it));
+    return (idx > 0) ? idx - 1 : 0;
+}
+
+// Cubic Hermite blend between two vec3 keyframes (glTF CUBICSPLINE mode).
+//   t        normalized position inside the keyframe interval
+//   dt       interval duration (t1 - t0); tangents are scaled by it
+//   p0, p1   keyframe values at the start / end of the interval
+//   out_tan0 out-tangent of the start keyframe
+//   in_tan1  in-tangent of the end keyframe
+inline glm::vec3 cubic_spline_interpolate_vec3(
+    float t, float dt,
+    glm::vec3 p0, glm::vec3 out_tan0,
+    glm::vec3 p1, glm::vec3 in_tan1)
+{
+    const float t_sq = t * t;
+    const float t_cu = t_sq * t;
+
+    // Hermite basis weights; the two tangent weights already carry the dt scale.
+    const float w_start   =  2.0f * t_cu - 3.0f * t_sq + 1.0f;
+    const float w_out_tan = (t_cu - 2.0f * t_sq + t) * dt;
+    const float w_end     = -2.0f * t_cu + 3.0f * t_sq;
+    const float w_in_tan  = (t_cu - t_sq) * dt;
+
+    return w_start * p0 + w_out_tan * out_tan0
+         + w_end * p1 + w_in_tan * in_tan1;
+}
+
+// Cubic Hermite blend between two quaternion keyframes, applied per component.
+// The glTF spec requires the blended quaternion to be normalized before use, so the
+// result is normalized here. Parameters mirror cubic_spline_interpolate_vec3.
+inline glm::quat cubic_spline_interpolate_quat(
+    float t, float dt,
+    glm::quat p0, glm::quat out_tan0,
+    glm::quat p1, glm::quat in_tan1)
+{
+    // Treat each quaternion as a plain (x, y, z, w) 4-vector for the blend.
+    const auto as_vec4 = [](const glm::quat& q) { return glm::vec4(q.x, q.y, q.z, q.w); };
+
+    const float t_sq = t * t;
+    const float t_cu = t_sq * t;
+    const float w_start   =  2.0f * t_cu - 3.0f * t_sq + 1.0f;
+    const float w_out_tan = (t_cu - 2.0f * t_sq + t) * dt;
+    const float w_end     = -2.0f * t_cu + 3.0f * t_sq;
+    const float w_in_tan  = (t_cu - t_sq) * dt;
+
+    const glm::vec4 mixed =
+          w_start * as_vec4(p0)
+        + w_out_tan * as_vec4(out_tan0)
+        + w_end * as_vec4(p1)
+        + w_in_tan * as_vec4(in_tan1);
+
+    // glm::quat takes (w, x, y, z).
+    return glm::normalize(glm::quat(mixed.w, mixed.x, mixed.y, mixed.z));
+}
+
+// Canonical signature from appendix_types.adoc.
+// Copies entry i of the pose (translation, rotation, scale) onto the node named by
+// joint_indices[i], then marks that node dirty. No bounds checking is performed: every
+// joint index must be valid for `nodes`, and the pose arrays must hold at least
+// joint_indices.size() entries.
+inline void apply_pose_to_scene_graph(
+    std::vector<Node>&           nodes,
+    const Pose&                  pose,
+    const std::vector<uint32_t>& joint_indices)
+{
+    size_t slot = 0;
+    for (const uint32_t node_index : joint_indices) {
+        Node& target = nodes[node_index];
+        target.translation    = pose.translations[slot];
+        target.local_rotation = pose.rotations[slot];
+        target.scale          = pose.scales[slot];
+        target.mark_dirty();
+        ++slot;
+    }
+}
+
+// Pre-computes joint matrices (J * IB) on the CPU before uploading to the GPU skinning shader.
+// Call after the animation update and scene graph update, once per frame.
+//
+//   skin               joints to evaluate and their inverse bind matrices
+//   nodes              scene graph nodes; every entry of skin.joints must index into it
+//   joint_matrices_out resized to skin.joints.size() and overwritten
+//
+// glTF allows a skin to omit its inverse bind matrices, in which case each one is the
+// identity. Joints without a stored inverse bind matrix therefore use the joint's world
+// matrix unchanged instead of reading past the end of the array.
+inline void compute_joint_matrices(
+    const Skin&              skin,
+    const std::vector<Node>& nodes,
+    std::vector<glm::mat4>&  joint_matrices_out)
+{
+    const size_t joint_count = skin.joints.size();
+    const size_t ibm_count   = skin.inverse_bind_matrices.size();
+
+    joint_matrices_out.resize(joint_count);
+    for (size_t i = 0; i < joint_count; ++i) {
+        const Node& joint_node = nodes[skin.joints[i]];
+        if (i < ibm_count) {
+            joint_matrices_out[i] = joint_node.world_matrix * skin.inverse_bind_matrices[i];
+        } else {
+            joint_matrices_out[i] = joint_node.world_matrix;
+        }
+    }
+}
diff --git a/attachments/advanced_gltf/animation_component.cpp b/attachments/advanced_gltf/animation_component.cpp
new file mode 100644
index 000000000..e2f4e5b0a
--- /dev/null
+++ b/attachments/advanced_gltf/animation_component.cpp
@@ -0,0 +1,448 @@
+/* Copyright (c) 2025 Holochip Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 the "License";
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <algorithm>
+#include <iostream>
+#include <functional>
+#include <chrono>
+#include <sstream>
+#include <shared_mutex>
+#include <mutex>
+#include <vector>
+#include <unordered_map>
+#include <string>
+
+
+#include "animation_component.h"
+
+#include "entity.h"
+#include "transform_component.h"
+#include "renderer_advanced_types.h"
+#include "mesh_component.h"
+
+// Registers (or replaces) the node hierarchy and rest-pose data used when `anim` is updated.
+// The data is copied into the shared per-component state table under an exclusive lock;
+// an entry for `anim` is created if none exists yet.
+void AnimationComponent_SetHierarchy(AnimationComponent* anim,
+                                    const std::unordered_map<int, std::vector<int>>& nodeChildren,
+                                    const std::unordered_map<int, glm::mat4>& initialLocalTransforms,
+                                    const std::unordered_map<int, glm::vec3>& initialLocalTranslations,
+                                    const std::unordered_map<int, glm::quat>& initialLocalRotations,
+                                    const std::unordered_map<int, glm::vec3>& initialLocalScales,
+                                    const std::vector<int>& rootNodes)
+{
+    // Writers take the mutex exclusively; Update() only takes it shared.
+    std::lock_guard<std::shared_mutex> exclusive(g_advancedStateMutex);
+
+    auto& entry = g_animationAdvancedStates[anim];
+
+    // Hierarchy topology.
+    entry.nodeChildren = nodeChildren;
+
+    // Rest pose, both as full matrices and as separate T / R / S components.
+    entry.initialLocalTransforms   = initialLocalTransforms;
+    entry.initialLocalTranslations = initialLocalTranslations;
+    entry.initialLocalRotations    = initialLocalRotations;
+    entry.initialLocalScales       = initialLocalScales;
+
+    // Traversal entry points.
+    entry.rootNodes = rootNodes;
+}
+
+// Advances playback time, samples every channel of the current clip, rebuilds the world
+// transforms of the node hierarchy and pushes the result to the mapped entities
+// (transform, morph weights and joint matrix palette).
+//
+// Returns without doing anything when playback is stopped, no valid clip is selected,
+// no hierarchy was registered for this component through AnimationComponent_SetHierarchy,
+// or the clip duration is not positive.
+//
+// deltaTime: time elapsed since the previous update; scaled by playbackSpeed.
+void AnimationComponent::Update(std::chrono::milliseconds deltaTime)
+{
+	if (!playing || currentAnimationIndex < 0 ||
+	    currentAnimationIndex >= static_cast<int>(animations.size()))
+	{
+		return;
+	}
+
+	// Work on a private copy of the hierarchy data so the shared lock is only held
+	// for the duration of the copy, not for the whole sampling / traversal pass.
+	AdvancedAnimationState state;
+	{
+		std::shared_lock<std::shared_mutex> lock(g_advancedStateMutex);
+		auto                                stateIt = g_animationAdvancedStates.find(this);
+		if (stateIt == g_animationAdvancedStates.end())
+		{
+			return;
+		}
+		state = stateIt->second;
+	}
+
+	const Animation &anim     = animations[currentAnimationIndex];
+	float            duration = anim.GetDuration();
+
+	if (duration <= 0.0f)
+	{
+		return;
+	}
+
+	// Advance time (milliseconds -> seconds, scaled by the playback speed)
+	float dt = static_cast<float>(deltaTime.count()) * 0.001f * playbackSpeed;
+	currentTime += dt;
+
+	// Handle looping or stopping at the end
+	if (currentTime >= duration)
+	{
+		if (looping)
+		{
+			currentTime = std::fmod(currentTime, duration);
+		}
+		else
+		{
+			currentTime = duration;
+			playing     = false;
+		}
+	}
+
+	// 1. Collect all LOCAL transforms for the current time.
+	//    Start from the rest pose and overwrite whatever the clip animates.
+	std::unordered_map<int, glm::vec3>          currentTranslations = state.initialLocalTranslations;
+	std::unordered_map<int, glm::quat>          currentRotations    = state.initialLocalRotations;
+	std::unordered_map<int, glm::vec3>          currentScales       = state.initialLocalScales;
+	std::unordered_map<int, std::vector<float>> currentWeights;
+
+	for (const auto &channel : anim.channels)
+	{
+		if (channel.samplerIndex < 0 || channel.samplerIndex >= static_cast<int>(anim.samplers.size()))
+		{
+			continue;
+		}
+
+		const AnimationSampler &sampler = anim.samplers[channel.samplerIndex];
+
+		switch (channel.path)
+		{
+			case AnimationPath::Translation:
+				currentTranslations[channel.targetNode] = SampleVec3(sampler, currentTime);
+				break;
+			case AnimationPath::Rotation:
+				currentRotations[channel.targetNode] = SampleQuat(sampler, currentTime);
+				break;
+			case AnimationPath::Scale:
+				currentScales[channel.targetNode] = SampleVec3(sampler, currentTime);
+				break;
+			case AnimationPath::Weights:
+			{
+				// The number of morph targets comes from the first entity mapped to the node.
+				int  numTargets = 0;
+				auto it         = nodeToEntities.find(channel.targetNode);
+				if (it != nodeToEntities.end() && !it->second.empty() && it->second[0])
+				{
+					if (auto *mesh = it->second[0]->GetComponent<MeshComponent>())
+					{
+						numTargets = GetMeshComponentMorphTargets(mesh);
+					}
+				}
+				if (numTargets > 0)
+				{
+					currentWeights[channel.targetNode] = SampleWeights(sampler, currentTime, numTargets);
+				}
+				break;
+			}
+			default:
+				break;
+		}
+	}
+
+	// 2. Compute world transforms by traversing hierarchy
+	std::unordered_map<int, glm::mat4> worldTransforms;
+
+	std::function<void(int, const glm::mat4 &)> computeWorldTransforms = [&](int nodeIndex, const glm::mat4 &parentTransform) {
+		const auto translationIt = currentTranslations.find(nodeIndex);
+		const auto rotationIt    = currentRotations.find(nodeIndex);
+		const auto scaleIt       = currentScales.find(nodeIndex);
+
+		const bool hasTranslation = translationIt != currentTranslations.end();
+		const bool hasRotation    = rotationIt != currentRotations.end();
+		const bool hasScale       = scaleIt != currentScales.end();
+
+		glm::mat4 localTransform(1.0f);
+		if (hasTranslation || hasRotation || hasScale)
+		{
+			// Build T * R * S from whichever components exist. A missing component uses
+			// its identity value rather than a default-constructed map entry, and a node
+			// that only has rotation or scale data is no longer skipped.
+			const glm::vec3 translation = hasTranslation ? translationIt->second : glm::vec3(0.0f);
+			const glm::quat rotation    = hasRotation ? rotationIt->second : glm::quat(1.0f, 0.0f, 0.0f, 0.0f);
+			const glm::vec3 scale       = hasScale ? scaleIt->second : glm::vec3(1.0f);
+
+			const glm::mat4 T = glm::translate(glm::mat4(1.0f), translation);
+			const glm::mat4 R = glm::mat4_cast(rotation);
+			const glm::mat4 S = glm::scale(glm::mat4(1.0f), scale);
+			localTransform    = T * R * S;
+		}
+		else
+		{
+			// No TRS data at all: fall back to the stored rest matrix, else identity.
+			const auto initialIt = state.initialLocalTransforms.find(nodeIndex);
+			if (initialIt != state.initialLocalTransforms.end())
+			{
+				localTransform = initialIt->second;
+			}
+		}
+
+		const glm::mat4 worldTransform = parentTransform * localTransform;
+		worldTransforms[nodeIndex]     = worldTransform;
+
+		const auto childrenIt = state.nodeChildren.find(nodeIndex);
+		if (childrenIt != state.nodeChildren.end())
+		{
+			for (int childIndex : childrenIt->second)
+			{
+				computeWorldTransforms(childIndex, worldTransform);
+			}
+		}
+	};
+
+	// The owner's model matrix (if any) is the parent of every root node.
+	glm::mat4 rootTransform = glm::mat4(1.0f);
+	if (owner)
+	{
+		auto *transform = owner->GetComponent<TransformComponent>();
+		if (transform)
+		{
+			rootTransform = transform->GetModelMatrix();
+		}
+	}
+
+	// With no root nodes there is nothing to traverse. This shouldn't happen for glTF.
+	for (int rootIndex : state.rootNodes)
+	{
+		computeWorldTransforms(rootIndex, rootTransform);
+	}
+
+	// 3. Apply world transforms to entities AND compute matrix palettes for skins
+	for (const auto &[nodeIndex, entities] : nodeToEntities)
+	{
+		const auto worldIt = worldTransforms.find(nodeIndex);
+
+		for (Entity *entity : entities)
+		{
+			// Check for null before handing the pointer to any helper.
+			// Once the physics system owns this entity (e.g. the Fox after it is thrown),
+			// it drives the transform; skip it here so we don't fight physics and cause
+			// the object to oscillate between its physics pose and its animated pose.
+			if (!entity || IsEntityPhysicsOwned(entity) || worldIt == worldTransforms.end())
+			{
+				continue;
+			}
+
+			auto *transform = entity->GetComponent<TransformComponent>();
+			if (!transform)
+			{
+				continue;
+			}
+
+			const glm::mat4 &worldMatrix = worldIt->second;
+
+			// Extract position, rotation, scale from world matrix
+			transform->SetPosition(glm::vec3(worldMatrix[3]));
+
+			// Extract rotation by normalizing axes to remove scale
+			glm::mat4 rotationMatrix = worldMatrix;
+			rotationMatrix[0]        = glm::normalize(rotationMatrix[0]);
+			rotationMatrix[1]        = glm::normalize(rotationMatrix[1]);
+			rotationMatrix[2]        = glm::normalize(rotationMatrix[2]);
+			glm::quat worldRot       = glm::quat_cast(rotationMatrix);
+			transform->SetRotation(glm::eulerAngles(worldRot));
+
+			// Scale is the length of each basis column.
+			float sx = glm::length(glm::vec3(worldMatrix[0]));
+			float sy = glm::length(glm::vec3(worldMatrix[1]));
+			float sz = glm::length(glm::vec3(worldMatrix[2]));
+			transform->SetScale(glm::vec3(sx, sy, sz));
+
+			// If this entity has a deformable mesh, compute its matrix palette
+			auto *mesh = entity->GetComponent<MeshComponent>();
+			if (!mesh)
+			{
+				continue;
+			}
+
+			// Update morph weights if present
+			auto wIt = currentWeights.find(nodeIndex);
+			if (wIt != currentWeights.end())
+			{
+				SetMeshComponentMorphWeights(mesh, wIt->second);
+			}
+
+			// Copy the skin description out under the shared lock, then release it.
+			std::vector<int>       jointNodes;
+			std::vector<glm::mat4> ibms;
+			{
+				std::shared_lock<std::shared_mutex> lock(g_advancedStateMutex);
+				auto                                meshIt = g_meshComponentData.find(mesh);
+				if (meshIt != g_meshComponentData.end() && meshIt->second.isDeformable)
+				{
+					jointNodes = meshIt->second.joints;
+					ibms       = meshIt->second.inverseBindMatrices;
+				}
+			}
+
+			if (!jointNodes.empty() && jointNodes.size() == ibms.size())
+			{
+				// The inverse of the mesh node's world matrix is the same for every joint,
+				// so compute it once instead of once per joint.
+				const glm::mat4 inverseWorld = glm::inverse(worldMatrix);
+
+				std::vector<glm::mat4> palette(jointNodes.size());
+				for (size_t i = 0; i < jointNodes.size(); ++i)
+				{
+					// Joints that were never reached by the traversal use identity.
+					const auto      jointIt    = worldTransforms.find(jointNodes[i]);
+					const glm::mat4 jointWorld = (jointIt != worldTransforms.end()) ? jointIt->second : glm::mat4(1.0f);
+					palette[i]                 = inverseWorld * jointWorld * ibms[i];
+				}
+				SetMeshComponentJointMatrices(mesh, palette);
+			}
+			else if (IsMeshComponentDeformable(mesh))
+			{
+				// Morph-only or incomplete skinning: provide identity joint matrix for joint 0
+				SetMeshComponentJointMatrices(mesh, {glm::mat4(1.0f)});
+			}
+		}
+	}
+}
+
+// Locates the pair of keyframes that bracket `time` and the normalized position between them.
+// Both indices are equal and outT is 0 when no interpolation is possible: fewer than two
+// keyframes, `time` at or before the first keyframe (both 0), or `time` at or after the
+// last keyframe (both the last index).
+void AnimationComponent::FindKeyframes(const std::vector<float> &times, float time,
+                                       size_t &outIndex0, size_t &outIndex1, float &outT) const
+{
+	// Default answer: clamp to the first keyframe with no blend.
+	outIndex0 = 0;
+	outIndex1 = 0;
+	outT      = 0.0f;
+
+	// Size is tested first so front() is never evaluated on an empty vector.
+	if (times.size() < 2 || time <= times.front())
+	{
+		return;
+	}
+
+	const size_t lastIndex = times.size() - 1;
+	if (time >= times.back())
+	{
+		outIndex0 = lastIndex;
+		outIndex1 = lastIndex;
+		return;
+	}
+
+	// Binary search for the first keyframe whose timestamp is not less than `time`.
+	const auto upper = std::lower_bound(times.begin(), times.end(), time);
+	if (upper == times.begin())
+	{
+		// Only reachable when `time` fails both range tests above (e.g. NaN).
+		return;
+	}
+
+	outIndex1 = static_cast<size_t>(std::distance(times.begin(), upper));
+	outIndex0 = outIndex1 - 1;
+
+	// Guard against duplicate timestamps (zero-length interval).
+	const float startTime = times[outIndex0];
+	const float span      = times[outIndex1] - startTime;
+	outT                  = (span > 0.0f) ? (time - startTime) / span : 0.0f;
+}
+
+// Samples a vec3 curve (translation or scale) at `time`.
+//
+// Step and Linear samplers store one vec3 (3 floats) per keyframe. CubicSpline samplers
+// store three vec3s per keyframe (in-tangent, value, out-tangent = 9 floats), the same
+// layout SampleWeights uses, and are evaluated as a cubic Hermite spline whose tangents
+// are scaled by the keyframe interval.
+//
+// Returns a zero vector when the sampler has no timestamps or fewer than 3 output floats.
+// A CubicSpline sampler whose output is too short for the 9-float layout falls back to
+// the tightly packed linear path.
+glm::vec3 AnimationComponent::SampleVec3(const AnimationSampler &sampler, float time) const
+{
+	const auto &values = sampler.outputValues;
+	if (sampler.inputTimes.empty() || values.size() < 3)
+	{
+		return glm::vec3(0.0f);
+	}
+
+	size_t index0, index1;
+	float  t;
+	FindKeyframes(sampler.inputTimes, time, index0, index1, t);
+
+	const auto readVec3 = [&values](size_t offset) {
+		return glm::vec3(values[offset], values[offset + 1], values[offset + 2]);
+	};
+
+	constexpr size_t cubicStride = 9;        // in-tangent + value + out-tangent
+	if (sampler.interpolation == AnimationInterpolation::CubicSpline &&
+	    values.size() >= (index1 + 1) * cubicStride)
+	{
+		const glm::vec3 p0 = readVec3(index0 * cubicStride + 3);
+		if (index0 == index1)
+		{
+			// Clamped to a single keyframe: no interval to blend over.
+			return p0;
+		}
+
+		const float     dt = sampler.inputTimes[index1] - sampler.inputTimes[index0];
+		const glm::vec3 m0 = readVec3(index0 * cubicStride + 6) * dt;        // out-tangent of keyframe 0
+		const glm::vec3 p1 = readVec3(index1 * cubicStride + 3);
+		const glm::vec3 m1 = readVec3(index1 * cubicStride) * dt;            // in-tangent of keyframe 1
+
+		const float t2 = t * t;
+		const float t3 = t2 * t;
+		return (2.0f * t3 - 3.0f * t2 + 1.0f) * p0 + (t3 - 2.0f * t2 + t) * m0 +
+		       (-2.0f * t3 + 3.0f * t2) * p1 + (t3 - t2) * m1;
+	}
+
+	// Tightly packed path (3 floats per keyframe); clamp offsets to the last full vec3.
+	size_t offset0 = index0 * 3;
+	size_t offset1 = index1 * 3;
+	if (offset0 + 2 >= values.size())
+	{
+		offset0 = values.size() - 3;
+	}
+	if (offset1 + 2 >= values.size())
+	{
+		offset1 = values.size() - 3;
+	}
+
+	const glm::vec3 v0 = readVec3(offset0);
+	const glm::vec3 v1 = readVec3(offset1);
+
+	// Step holds the earlier keyframe; everything else blends linearly.
+	if (sampler.interpolation == AnimationInterpolation::Step)
+	{
+		return v0;
+	}
+	return glm::mix(v0, v1, t);
+}
+
+// Samples a rotation curve at `time`.
+//
+// glTF stores quaternions as (x, y, z, w). Step and Linear samplers store one quaternion
+// (4 floats) per keyframe. CubicSpline samplers store three (in-tangent, value,
+// out-tangent = 12 floats) and are evaluated as a per-component cubic Hermite spline
+// whose result is normalized.
+//
+// Returns the identity rotation when the sampler has no timestamps or fewer than 4 output
+// floats. A CubicSpline sampler whose output is too short for the 12-float layout falls
+// back to the tightly packed slerp path.
+glm::quat AnimationComponent::SampleQuat(const AnimationSampler &sampler, float time) const
+{
+	const auto &values = sampler.outputValues;
+	if (sampler.inputTimes.empty() || values.size() < 4)
+	{
+		return glm::quat(1.0f, 0.0f, 0.0f, 0.0f);
+	}
+
+	size_t index0, index1;
+	float  t;
+	FindKeyframes(sampler.inputTimes, time, index0, index1, t);
+
+	// Reads four consecutive floats in file order (x, y, z, w).
+	const auto readVec4 = [&values](size_t offset) {
+		return glm::vec4(values[offset], values[offset + 1], values[offset + 2], values[offset + 3]);
+	};
+
+	constexpr size_t cubicStride = 12;        // in-tangent + value + out-tangent
+	if (sampler.interpolation == AnimationInterpolation::CubicSpline &&
+	    values.size() >= (index1 + 1) * cubicStride)
+	{
+		const glm::vec4 p0      = readVec4(index0 * cubicStride + 4);
+		glm::vec4       blended = p0;
+		if (index0 != index1)
+		{
+			const float     dt = sampler.inputTimes[index1] - sampler.inputTimes[index0];
+			const glm::vec4 m0 = readVec4(index0 * cubicStride + 8) * dt;        // out-tangent of keyframe 0
+			const glm::vec4 p1 = readVec4(index1 * cubicStride + 4);
+			const glm::vec4 m1 = readVec4(index1 * cubicStride) * dt;            // in-tangent of keyframe 1
+
+			const float t2 = t * t;
+			const float t3 = t2 * t;
+			blended = (2.0f * t3 - 3.0f * t2 + 1.0f) * p0 + (t3 - 2.0f * t2 + t) * m0 +
+			          (-2.0f * t3 + 3.0f * t2) * p1 + (t3 - t2) * m1;
+		}
+		// glm::quat takes (w, x, y, z); the blended spline value is not unit length.
+		return glm::normalize(glm::quat(blended.w, blended.x, blended.y, blended.z));
+	}
+
+	// Tightly packed path (4 floats per keyframe); clamp offsets to the last full quaternion.
+	size_t offset0 = index0 * 4;
+	size_t offset1 = index1 * 4;
+	if (offset0 + 3 >= values.size())
+	{
+		offset0 = values.size() - 4;
+	}
+	if (offset1 + 3 >= values.size())
+	{
+		offset1 = values.size() - 4;
+	}
+
+	const glm::vec4 raw0 = readVec4(offset0);
+	const glm::vec4 raw1 = readVec4(offset1);
+	const glm::quat q0(raw0.w, raw0.x, raw0.y, raw0.z);
+	const glm::quat q1(raw1.w, raw1.x, raw1.y, raw1.z);
+
+	// Step holds the earlier keyframe; everything else uses spherical interpolation.
+	if (sampler.interpolation == AnimationInterpolation::Step)
+	{
+		return q0;
+	}
+	return glm::slerp(q0, q1, t);
+}
+
+// Samples a morph-target weight curve at `time` and returns one weight per target.
+//
+// Each keyframe stores `numTargets` floats, or 3 * numTargets for CubicSpline
+// (in-tangents, then values, then out-tangents). A vector of `numTargets` zeros is
+// returned when the sampler has no timestamps or its output is too short for the
+// keyframes that bracket `time`.
+std::vector<float> AnimationComponent::SampleWeights(const AnimationSampler &sampler, float time, size_t numTargets) const
+{
+	const auto &values = sampler.outputValues;
+	if (sampler.inputTimes.empty() || values.size() < numTargets)
+	{
+		return std::vector<float>(numTargets, 0.0f);
+	}
+
+	size_t index0, index1;
+	float  t;
+	FindKeyframes(sampler.inputTimes, time, index0, index1, t);
+
+	const bool   isCubic = sampler.interpolation == AnimationInterpolation::CubicSpline;
+	const size_t stride  = isCubic ? 3 * numTargets : numTargets;
+
+	// The later keyframe must be fully present in the output array.
+	if (values.size() < (index1 + 1) * stride)
+	{
+		return std::vector<float>(numTargets, 0.0f);
+	}
+
+	const size_t base0 = index0 * stride;
+	const size_t base1 = index1 * stride;
+
+	std::vector<float> weights(numTargets);
+	if (isCubic)
+	{
+		// Cubic Hermite spline; tangents are scaled by the keyframe interval.
+		const float dt = sampler.inputTimes[index1] - sampler.inputTimes[index0];
+		const float t2 = t * t;
+		const float t3 = t2 * t;
+
+		for (size_t target = 0; target < numTargets; ++target)
+		{
+			const float p0 = values[base0 + numTargets + target];
+			const float m0 = values[base0 + 2 * numTargets + target] * dt;        // out-tangent of keyframe 0
+			const float p1 = values[base1 + numTargets + target];
+			const float m1 = values[base1 + target] * dt;                         // in-tangent of keyframe 1
+
+			weights[target] = (2 * t3 - 3 * t2 + 1) * p0 + (t3 - 2 * t2 + t) * m0 + (-2 * t3 + 3 * t2) * p1 + (t3 - t2) * m1;
+		}
+		return weights;
+	}
+
+	// Step holds the earlier keyframe; anything else blends linearly.
+	const bool isStep = sampler.interpolation == AnimationInterpolation::Step;
+	for (size_t target = 0; target < numTargets; ++target)
+	{
+		const float v0 = values[base0 + target];
+		const float v1 = values[base1 + target];
+		weights[target] = isStep ? v0 : glm::mix(v0, v1, t);
+	}
+	return weights;
+}
diff --git a/attachments/advanced_gltf/animation_component.h b/attachments/advanced_gltf/animation_component.h
new file mode 100644
index 000000000..40900bd19
--- /dev/null
+++ b/attachments/advanced_gltf/animation_component.h
@@ -0,0 +1,235 @@
+/* Copyright (c) 2025 Holochip Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 the "License";
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <chrono>
+#include <glm/glm.hpp>
+#include <glm/gtc/quaternion.hpp>
+#include <unordered_map>
+#include <vector>
+
+#include "component.h"
+#include "model_loader.h"
+
+class Entity;
+class TransformComponent;
+
+/**
+ * @brief Plays back glTF animation clips on a set of entities.
+ *
+ * The component owns a list of clips and a mapping from glTF node indices to the
+ * entities that represent those nodes. Update() advances the playback clock, samples
+ * the active clip and writes the result to the mapped entities.
+ */
+class AnimationComponent final : public Component
+{
+  public:
+	/**
+	 * @brief Create the component.
+	 * @param componentName Name forwarded to the Component base class.
+	 */
+	explicit AnimationComponent(const std::string &componentName = "AnimationComponent") :
+	    Component(componentName)
+	{}
+
+	/**
+	 * @brief Replace the stored clips.
+	 *
+	 * Selects clip 0 when the new list is non-empty; an empty list leaves the
+	 * current selection untouched.
+	 * @param anims Clips to copy into the component.
+	 */
+	void SetAnimations(const std::vector<Animation> &anims)
+	{
+		animations = anims;
+		if (animations.empty())
+		{
+			return;
+		}
+		currentAnimationIndex = 0;
+	}
+
+	/**
+	 * @brief Access the stored clips.
+	 * @return Read-only reference to the clip list.
+	 */
+	[[nodiscard]] const std::vector<Animation> &GetAnimations() const
+	{
+		return animations;
+	}
+
+	/**
+	 * @brief Replace the glTF-node-to-entity mapping.
+	 *
+	 * Update() uses this mapping to decide which entities receive each node's transform.
+	 * @param mapping Node index -> entities representing that node.
+	 */
+	void SetNodeToEntityMap(const std::unordered_map<int, std::vector<Entity *>> &mapping)
+	{
+		nodeToEntities = mapping;
+	}
+
+	/**
+	 * @brief Start a clip from its beginning.
+	 *
+	 * An out-of-range index is ignored and leaves the playback state unchanged.
+	 * @param index Clip index to play.
+	 * @param loop Whether playback wraps around at the end (default: true).
+	 */
+	void Play(size_t index, bool loop = true)
+	{
+		if (index >= animations.size())
+		{
+			return;
+		}
+		currentAnimationIndex = static_cast<int>(index);
+		currentTime           = 0.0f;
+		playing               = true;
+		looping               = loop;
+	}
+
+	/**
+	 * @brief Start the first clip whose name matches.
+	 *
+	 * Does nothing when no clip has the requested name.
+	 * @param name Clip name to look for.
+	 * @param loop Whether playback wraps around at the end (default: true).
+	 */
+	void PlayByName(const std::string &name, bool loop = true)
+	{
+		size_t clipIndex = 0;
+		for (const Animation &clip : animations)
+		{
+			if (clip.name == name)
+			{
+				Play(clipIndex, loop);
+				return;
+			}
+			++clipIndex;
+		}
+	}
+
+	/**
+	 * @brief Halt playback. The playback clock is not rewound.
+	 */
+	void Stop()
+	{
+		playing = false;
+	}
+
+	/**
+	 * @brief Halt playback, keeping the current playback time.
+	 */
+	void Pause()
+	{
+		playing = false;
+	}
+
+	/**
+	 * @brief Continue playback from the current playback time.
+	 */
+	void Resume()
+	{
+		playing = true;
+	}
+
+	/**
+	 * @brief Query the playback flag.
+	 * @return True while playback is active.
+	 */
+	[[nodiscard]] bool IsPlaying() const
+	{
+		return playing;
+	}
+
+	/**
+	 * @brief Change the playback speed multiplier.
+	 * @param speed Multiplier applied to elapsed time (1.0 = normal speed).
+	 */
+	void SetSpeed(float speed)
+	{
+		playbackSpeed = speed;
+	}
+
+	/**
+	 * @brief Read the playback speed multiplier.
+	 * @return The current multiplier.
+	 */
+	[[nodiscard]] float GetSpeed() const
+	{
+		return playbackSpeed;
+	}
+
+	/**
+	 * @brief Read the playback clock.
+	 * @return Current playback time in seconds.
+	 */
+	[[nodiscard]] float GetCurrentTime() const
+	{
+		return currentTime;
+	}
+
+	/**
+	 * @brief Duration of the selected clip.
+	 * @return Duration in seconds, or 0 when no valid clip is selected.
+	 */
+	[[nodiscard]] float GetCurrentDuration() const
+	{
+		const bool hasSelection = currentAnimationIndex >= 0 &&
+		                          currentAnimationIndex < static_cast<int>(animations.size());
+		if (!hasSelection)
+		{
+			return 0.0f;
+		}
+		return animations[currentAnimationIndex].GetDuration();
+	}
+
+	/**
+	 * @brief Advance playback and apply the sampled pose.
+	 * @param deltaTime Time elapsed since the previous update.
+	 */
+	void Update(std::chrono::milliseconds deltaTime) override;
+
+  private:
+	std::vector<Animation>                         animations;            // Clips available for playback
+	std::unordered_map<int, std::vector<Entity *>> nodeToEntities;        // glTF node index -> entities for that node
+
+	// Base transforms per animated node.
+	// NOTE(review): nothing in the visible implementation reads or writes these three maps — confirm before relying on them.
+	std::unordered_map<int, glm::vec3> basePositions;
+	std::unordered_map<int, glm::quat> baseRotations;        // Quaternions for proper rotation composition
+	std::unordered_map<int, glm::vec3> baseScales;
+
+	int   currentAnimationIndex = -1;          // -1 means no clip selected
+	float currentTime           = 0.0f;        // Playback clock in seconds
+	float playbackSpeed         = 1.0f;        // Multiplier applied to elapsed time
+	bool  playing               = false;
+	bool  looping               = true;
+
+	/**
+	 * @brief Evaluate a vec3 curve.
+	 * @param sampler Sampler holding the keyframes.
+	 * @param time Time in seconds to evaluate at.
+	 * @return The sampled vec3.
+	 */
+	[[nodiscard]] glm::vec3 SampleVec3(const AnimationSampler &sampler, float time) const;
+
+	/**
+	 * @brief Evaluate a quaternion curve.
+	 * @param sampler Sampler holding the keyframes.
+	 * @param time Time in seconds to evaluate at.
+	 * @return The sampled quaternion.
+	 */
+	[[nodiscard]] glm::quat SampleQuat(const AnimationSampler &sampler, float time) const;
+
+	/**
+	 * @brief Evaluate a morph-target weight curve.
+	 * @param sampler Sampler holding the keyframes.
+	 * @param time Time in seconds to evaluate at.
+	 * @param numTargets Number of morph targets, i.e. weights per keyframe.
+	 * @return One weight per morph target.
+	 */
+	[[nodiscard]] std::vector<float> SampleWeights(const AnimationSampler &sampler, float time, size_t numTargets) const;
+
+	/**
+	 * @brief Locate the keyframes that bracket a time value.
+	 * @param times Keyframe timestamps.
+	 * @param time Time to locate.
+	 * @param outIndex0 Output: index of the earlier keyframe.
+	 * @param outIndex1 Output: index of the later keyframe.
+	 * @param outT Output: interpolation factor between the two (0-1).
+	 */
+	void FindKeyframes(const std::vector<float> &times, float time,
+	                   size_t &outIndex0, size_t &outIndex1, float &outT) const;
+};
diff --git a/attachments/advanced_gltf/assets/add_physics_extras.py b/attachments/advanced_gltf/assets/add_physics_extras.py
new file mode 100755
index 000000000..821b74b34
--- /dev/null
+++ b/attachments/advanced_gltf/assets/add_physics_extras.py
@@ -0,0 +1,260 @@
+#!/usr/bin/env python3
+# Copyright (c) 2026 Holochip Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+add_physics_extras.py  —  Annotates glTF skeleton nodes with the physics extras
+schema used by the Advanced glTF tutorial (ColliderDef / ConstraintDef).
+
+Usage:
+    python add_physics_extras.py input.gltf output.gltf [--config ragdoll.json]
+
+If --config is omitted, the script uses built-in heuristics that recognise
+common bone-naming conventions (Mixamo, Blender Rigify, generic "bone_*").
+
+Output extras schema (matches ColliderDef / ConstraintDef in node.h):
+    node.extras.collider  = { "shape", "radius", "half_height",
+                               "box_half_extents", "mass",
+                               "collision_group", "collision_mask" }
+    node.extras.constraint = { "type", "swing_limit_deg", "twist_limit_deg",
+                                "hinge_min_deg", "hinge_max_deg",
+                                "hinge_axis", "parent_bone" }
+
+NOTE: Physics extras are a tutorial-specific extension, not a registered glTF
+extension.  They will pass the Khronos glTF-Validator only if the validator is
+run without the --strict-extra flag.
+"""
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+# ---------------------------------------------------------------------------
+# Bone heuristic database
+# Each entry is (pattern_substrings, collider, constraint).
+# Patterns are checked case-insensitively; first match wins.
+# ---------------------------------------------------------------------------
+# Rule order matters: match_bone() returns the first rule for which any
+# pattern is a substring of the (lower-cased, side-stripped) bone name.
+# NOTE(review): lengths look like metres, masses like kilograms and limits
+# are named *_deg (degrees) — confirm against ColliderDef / ConstraintDef.
+BONE_RULES = [
+    # HEAD / NECK
+    (["head"],
+     {"shape": "CAPSULE", "radius": 0.10, "half_height": 0.06,
+      "mass": 4.0, "collision_group": "character", "collision_mask": "world"},
+     {"type": "BALL_SOCKET", "swing_limit_deg": 50.0, "twist_limit_deg": 30.0}),
+
+    (["neck"],
+     {"shape": "CAPSULE", "radius": 0.06, "half_height": 0.06,
+      "mass": 2.0, "collision_group": "character", "collision_mask": "world"},
+     {"type": "BALL_SOCKET", "swing_limit_deg": 40.0, "twist_limit_deg": 25.0}),
+
+    # SPINE / PELVIS / HIPS
+    # The pelvis/hips/root rule is the only one with constraint type "NONE".
+    (["pelvis", "hips", "root"],
+     {"shape": "BOX", "box_half_extents": [0.14, 0.08, 0.10],
+      "mass": 8.0, "collision_group": "character", "collision_mask": "world"},
+     {"type": "NONE"}),
+
+    (["spine", "chest", "torso"],
+     {"shape": "BOX", "box_half_extents": [0.12, 0.10, 0.08],
+      "mass": 6.0, "collision_group": "character", "collision_mask": "world"},
+     {"type": "BALL_SOCKET", "swing_limit_deg": 20.0, "twist_limit_deg": 15.0}),
+
+    # UPPER LIMBS
+    (["upperarm", "upper_arm", "arm_upper", "uparm"],
+     {"shape": "CAPSULE", "radius": 0.05, "half_height": 0.14,
+      "mass": 2.5, "collision_group": "character", "collision_mask": "world"},
+     {"type": "BALL_SOCKET", "swing_limit_deg": 80.0, "twist_limit_deg": 60.0}),
+
+    (["lowerarm", "lower_arm", "arm_lower", "forearm", "loarm"],
+     {"shape": "CAPSULE", "radius": 0.04, "half_height": 0.13,
+      "mass": 1.5, "collision_group": "character", "collision_mask": "world"},
+     {"type": "HINGE",
+      "hinge_axis": [0.0, 0.0, 1.0],
+      "hinge_min_deg": -140.0, "hinge_max_deg": 0.0}),
+
+    (["hand", "wrist"],
+     {"shape": "BOX", "box_half_extents": [0.04, 0.03, 0.07],
+      "mass": 0.5, "collision_group": "character", "collision_mask": "world"},
+     {"type": "BALL_SOCKET", "swing_limit_deg": 60.0, "twist_limit_deg": 30.0}),
+
+    # LOWER LIMBS
+    (["upperleg", "upper_leg", "leg_upper", "thigh", "upleg"],
+     {"shape": "CAPSULE", "radius": 0.07, "half_height": 0.20,
+      "mass": 5.0, "collision_group": "character", "collision_mask": "world"},
+     {"type": "BALL_SOCKET", "swing_limit_deg": 70.0, "twist_limit_deg": 30.0}),
+
+    (["lowerleg", "lower_leg", "leg_lower", "shin", "calf", "loleg"],
+     {"shape": "CAPSULE", "radius": 0.05, "half_height": 0.18,
+      "mass": 3.0, "collision_group": "character", "collision_mask": "world"},
+     {"type": "HINGE",
+      "hinge_axis": [1.0, 0.0, 0.0],
+      "hinge_min_deg": 0.0, "hinge_max_deg": 140.0}),
+
+    (["foot", "ankle"],
+     {"shape": "BOX", "box_half_extents": [0.05, 0.03, 0.10],
+      "mass": 1.0, "collision_group": "character", "collision_mask": "world"},
+     {"type": "HINGE",
+      "hinge_axis": [1.0, 0.0, 0.0],
+      "hinge_min_deg": -30.0, "hinge_max_deg": 45.0}),
+
+    (["toe"],
+     {"shape": "CAPSULE", "radius": 0.02, "half_height": 0.02,
+      "mass": 0.2, "collision_group": "character", "collision_mask": "world"},
+     {"type": "HINGE",
+      "hinge_axis": [1.0, 0.0, 0.0],
+      "hinge_min_deg": -30.0, "hinge_max_deg": 30.0}),
+]
+
+# Fallback for joints whose name matches none of the rules above.
+# match_bone() hands out copies of these, so callers never mutate the
+# module-level dicts themselves.
+DEFAULT_COLLIDER = {
+    "shape": "CAPSULE", "radius": 0.04, "half_height": 0.06,
+    "mass": 1.0, "collision_group": "character", "collision_mask": "world",
+}
+DEFAULT_CONSTRAINT = {
+    "type": "BALL_SOCKET", "swing_limit_deg": 45.0, "twist_limit_deg": 30.0,
+}
+
+
+def match_bone(name: str):
+    """Return (collider, constraint) dicts for the given bone name."""
+    lower = name.lower()
+    # Strip common left/right prefixes/suffixes: l_, r_, _l, _r, left_, right_, .L, .R
+    for prefix in ("left_", "right_", "l_", "r_"):
+        if lower.startswith(prefix):
+            lower = lower[len(prefix):]
+    for suffix in ("_left", "_right", "_l", "_r", ".l", ".r"):
+        if lower.endswith(suffix):
+            lower = lower[: -len(suffix)]
+
+    for patterns, collider, constraint in BONE_RULES:
+        if any(p in lower for p in patterns):
+            return dict(collider), dict(constraint)
+    return dict(DEFAULT_COLLIDER), dict(DEFAULT_CONSTRAINT)
+
+
+def find_parent_bone_name(gltf: dict, node_idx: int) -> str:
+    """Return the name of node_idx's parent if it is also a joint, else ''."""
+    for idx, node in enumerate(gltf.get("nodes", [])):
+        if node_idx in node.get("children", []):
+            return node.get("name", "")
+    return ""
+
+
+def collect_joint_indices(gltf: dict) -> set:
+    """Return the set of node indices that are referenced by any skin."""
+    joints: set = set()
+    for skin in gltf.get("skins", []):
+        joints.update(skin.get("joints", []))
+    return joints
+
+
+def annotate(gltf: dict, config: dict | None, dry_run: bool) -> tuple[int, int]:
+    """
+    Add physics extras to skeleton nodes.
+    Returns (nodes_annotated, nodes_skipped).
+    """
+    joint_indices = collect_joint_indices(gltf)
+    if not joint_indices:
+        print("  WARNING: No skins found in file — no joints to annotate.", file=sys.stderr)
+        return 0, 0
+
+    nodes = gltf.setdefault("nodes", [])
+    annotated = 0
+    skipped = 0
+
+    for idx in sorted(joint_indices):
+        if idx >= len(nodes):
+            skipped += 1
+            continue
+        node = nodes[idx]
+        name = node.get("name", f"node_{idx}")
+
+        # Look up in explicit config first, then heuristics.
+        if config and name in config:
+            collider   = config[name].get("collider",   dict(DEFAULT_COLLIDER))
+            constraint = config[name].get("constraint", dict(DEFAULT_CONSTRAINT))
+        else:
+            collider, constraint = match_bone(name)
+
+        # Attach parent bone name to constraint for context.
+        parent_name = find_parent_bone_name(gltf, idx)
+        if parent_name:
+            constraint["parent_bone"] = parent_name
+
+        if not dry_run:
+            extras = node.setdefault("extras", {})
+            extras["collider"]   = collider
+            extras["constraint"] = constraint
+
+        print(f"  [{idx:3d}] {name:<30s}  shape={collider['shape']:<8s}  "
+              f"constraint={constraint['type']}")
+        annotated += 1
+
+    return annotated, skipped
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Annotate glTF skeleton joints with tutorial physics extras.")
+    parser.add_argument("input",  type=Path, help="Source .gltf file")
+    parser.add_argument("output", type=Path, help="Destination .gltf file")
+    parser.add_argument("--config", type=Path, default=None,
+                        help="Optional JSON config mapping bone names to extras "
+                             "(overrides heuristics for named bones)")
+    parser.add_argument("--dry-run", action="store_true",
+                        help="Print what would be annotated without writing output")
+    args = parser.parse_args()
+
+    if not args.input.exists():
+        sys.exit(f"ERROR: Input file not found: {args.input}")
+
+    if args.input.suffix.lower() != ".gltf":
+        sys.exit("ERROR: Only text-format .gltf files are supported "
+                 "(not binary .glb). Extract with gltf-pipeline first.")
+
+    with args.input.open(encoding="utf-8") as fh:
+        gltf = json.load(fh)
+
+    config: dict | None = None
+    if args.config:
+        with args.config.open(encoding="utf-8") as fh:
+            config = json.load(fh)
+        print(f"Using explicit config: {args.config}")
+
+    print(f"\nAnnotating joints in: {args.input}")
+    annotated, skipped = annotate(gltf, config, dry_run=args.dry_run)
+    print(f"\n  {annotated} joints annotated, {skipped} skipped.")
+
+    if args.dry_run:
+        print("  Dry-run mode — no output file written.")
+        return
+
+    if annotated == 0:
+        print("  Nothing to write.", file=sys.stderr)
+        return
+
+    args.output.parent.mkdir(parents=True, exist_ok=True)
+    with args.output.open("w", encoding="utf-8") as fh:
+        json.dump(gltf, fh, indent=2, ensure_ascii=False)
+    print(f"  Written: {args.output}")
+    print()
+    print("  Next steps:")
+    print("    1. Run glTF-Validator to confirm the file is still valid.")
+    print("    2. Open in your engine and verify collider shapes visually with DebugDrawer.")
+    print("    3. Adjust masses/radii in the output file or create a --config JSON")
+    print("       for per-bone overrides, then re-run this script.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/attachments/advanced_gltf/assets/download_samples.sh b/attachments/advanced_gltf/assets/download_samples.sh
new file mode 100755
index 000000000..6561b884b
--- /dev/null
+++ b/attachments/advanced_gltf/assets/download_samples.sh
@@ -0,0 +1,106 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026 Holochip Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Downloads the subset of Khronos glTF Sample Assets needed for the Advanced glTF tutorial.
+# All models are from https://github.com/KhronosGroup/glTF-Sample-Assets
+# and are licensed CC0-1.0 or CC-BY-4.0 as noted.
+#
+# Pin: commit 2bac6f8c57bf471df0d2a1e8a8ec023c7801dddf (2026-04-27)
+# Update this hash after auditing the CHANGELOG for breaking directory renames.
+
+set -euo pipefail
+
+REPO="https://raw.githubusercontent.com/KhronosGroup/glTF-Sample-Assets"
+COMMIT="2bac6f8c57bf471df0d2a1e8a8ec023c7801dddf"
+BASE="${REPO}/${COMMIT}/Models"
+OUT="$(dirname "$0")"
+
+download() {
+    local dir="$1"; shift
+    mkdir -p "${OUT}/${dir}"
+    for file in "$@"; do
+        local dest="${OUT}/${dir}/${file##*/}"
+        if [ ! -f "${dest}" ]; then
+            echo "  ↓  ${dir}/${file##*/}"
+            curl -fsSL "${BASE}/${dir}/${file}" -o "${dest}"
+        else
+            echo "  ✓  ${dir}/${file##*/} (cached)"
+        fi
+    done
+}
+
+# Each download call below takes the model directory (relative to ${BASE})
+# followed by the files to fetch from that directory.
+echo "=== Downloading glTF Sample Assets ==="
+
+# --- Chapter 1: Scene Graph Hierarchy ---
+# BoxAnimated  — CC0   — simple node hierarchy with animated transforms
+echo "→ BoxAnimated"
+download "BoxAnimated/glTF" \
+    "BoxAnimated.gltf" \
+    "BoxAnimated0.bin"
+
+# RiggedSimple — CC0 — minimal 2-joint skinned mesh
+echo "→ RiggedSimple"
+download "RiggedSimple/glTF" \
+    "RiggedSimple.gltf" \
+    "RiggedSimple0.bin"
+
+# --- Chapter 2: Skeletal Compute Skinning ---
+# SimpleSkin   — CC0 — the canonical 2-joint skinning tutorial model
+echo "→ SimpleSkin"
+download "SimpleSkin/glTF" \
+    "SimpleSkin.gltf" \
+    "SimpleSkin_animation.bin" \
+    "SimpleSkin_geometry.bin" \
+    "SimpleSkin_inverseBindMatrices.bin" \
+    "SimpleSkin_skinningData.bin"
+
+# Fox          — CC-BY-4.0 (Sketchfab)
+echo "→ Fox (CC-BY-4.0)"
+download "Fox/glTF" \
+    "Fox.gltf" \
+    "Fox.bin" \
+    "Texture.png"
+
+# --- Chapter 3: Interpolation ---
+# InterpolationTest — CC0 — exercises STEP / LINEAR / CUBICSPLINE
+echo "→ InterpolationTest"
+download "InterpolationTest/glTF" \
+    "InterpolationTest.gltf" \
+    "InterpolationTest_data.bin" \
+    "InterpolationTest_img0.png"
+
+# --- Chapter 5: Morph Targets ---
+# AnimatedMorphCube — CC0 — simplest morph target demo
+echo "→ AnimatedMorphCube"
+download "AnimatedMorphCube/glTF" \
+    "AnimatedMorphCube.gltf" \
+    "AnimatedMorphCube.bin"
+
+# MorphPrimitivesTest — CC0 — multiple target/primitive combinations
+echo "→ MorphPrimitivesTest"
+download "MorphPrimitivesTest/glTF" \
+    "MorphPrimitivesTest.gltf" \
+    "MorphPrimitivesTest.bin" \
+    "uv_texture.jpg"
+
+# Closing banner plus a reminder that the physics extras must be added
+# separately with add_physics_extras.py.
+echo ""
+echo "=== Done ==="
+echo ""
+echo "NOTE: Physics ragdoll metadata (glTF extras.collider / extras.constraint)"
+echo "      is a tutorial-specific schema — no standard Khronos sample includes it."
+echo "      Use the add_physics_extras.py script to annotate RiggedSimple or Fox"
+echo "      before testing the Physics Integration chapter."
diff --git a/attachments/advanced_gltf/audio_system.cpp b/attachments/advanced_gltf/audio_system.cpp
new file mode 100644
index 000000000..06f46b70e
--- /dev/null
+++ b/attachments/advanced_gltf/audio_system.cpp
@@ -0,0 +1,1825 @@
+/* Copyright (c) 2025 Holochip Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "audio_system.h"
+
+#include <algorithm>
+#include <cassert>
+#include <chrono>
+#include <cmath>
+#include <cstring>
+#include <numbers>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <mutex>
+#include <queue>
+#include <thread>
+#include <unordered_map>
+#include <utility>
+
+#if defined(PLATFORM_ANDROID)
+#	include <SLES/OpenSLES.h>
+#	include <SLES/OpenSLES_Android.h>
+#else
+// OpenAL headers
+#	ifdef __APPLE__
+#		include <OpenAL/al.h>
+#		include <OpenAL/alc.h>
+#	else
+#		include <AL/al.h>
+#		include <AL/alc.h>
+#	endif
+#endif
+
+#include "engine.h"
+#include "renderer.h"
+
+#if !defined(PLATFORM_ANDROID)
+// OpenAL error checking utility
+static void CheckOpenALError(const std::string& operation) {
+  ALenum error = alGetError();
+  if (error != AL_NO_ERROR) {
+    std::cerr << "OpenAL Error in " << operation << ": ";
+    switch (error) {
+      case AL_INVALID_NAME:
+        std::cerr << "AL_INVALID_NAME";
+        break;
+      case AL_INVALID_ENUM:
+        std::cerr << "AL_INVALID_ENUM";
+        break;
+      case AL_INVALID_VALUE:
+        std::cerr << "AL_INVALID_VALUE";
+        break;
+      case AL_INVALID_OPERATION:
+        std::cerr << "AL_INVALID_OPERATION";
+        break;
+      case AL_OUT_OF_MEMORY:
+        std::cerr << "AL_OUT_OF_MEMORY";
+        break;
+      default:
+        std::cerr << "Unknown error " << error;
+        break;
+    }
+    std::cerr << std::endl;
+  }
+}
+#endif
+
+// AudioSource implementation that keeps its playback state (position, loop
+// delay, volume, 3D position/velocity) in plain member variables.
+class ConcreteAudioSource : public AudioSource {
+  public:
+    explicit ConcreteAudioSource(std::string name) : name(std::move(name)) {
+    }
+    ~ConcreteAudioSource() override = default;
+
+    // Starts playback from sample 0, discarding any loop delay in progress.
+    void Play() override {
+      RewindState();
+      playing = true;
+    }
+
+    // Halts playback; the position and delay state are left untouched.
+    void Pause() override {
+      playing = false;
+    }
+
+    // Halts playback and rewinds to sample 0.
+    void Stop() override {
+      RewindState();
+      playing = false;
+    }
+
+    void SetVolume(float volume) override {
+      this->volume = volume;
+    }
+
+    void SetLoop(bool loop) override {
+      this->loop = loop;
+    }
+
+    void SetPosition(float x, float y, float z) override {
+      const float xyz[3] = {x, y, z};
+      std::copy(xyz, xyz + 3, position);
+    }
+
+    void SetVelocity(float x, float y, float z) override {
+      const float xyz[3] = {x, y, z};
+      std::copy(xyz, xyz + 3, velocity);
+    }
+
+    [[nodiscard]] bool IsPlaying() const override {
+      return playing;
+    }
+
+    // Total clip length in samples; 0 disables end-of-clip detection.
+    void SetAudioLength(uint32_t lengthInSamples) {
+      audioLengthSamples = lengthInSamples;
+    }
+
+    // Advances the playback state machine.
+    //  - While waiting between loops, deltaTime accumulates until
+    //    delayDuration has elapsed, then playback restarts at sample 0.
+    //  - Otherwise samplesProcessed is added to the position; on reaching the
+    //    end a looping source enters the delay phase and a non-looping one
+    //    stops and rewinds.
+    void UpdatePlayback(std::chrono::milliseconds deltaTime, uint32_t samplesProcessed) {
+      if (!playing) {
+        return;
+      }
+
+      if (inDelayPhase) {
+        delayTimer += deltaTime;
+        if (delayTimer < delayDuration) {
+          return; // still waiting between playthroughs
+        }
+        // Delay elapsed: restart from the beginning
+        inDelayPhase = false;
+        playbackPosition = 0;
+        delayTimer = std::chrono::milliseconds::zero();
+        return;
+      }
+
+      playbackPosition += samplesProcessed;
+
+      const bool reachedEnd = audioLengthSamples > 0 && playbackPosition >= audioLengthSamples;
+      if (!reachedEnd) {
+        return;
+      }
+
+      if (loop) {
+        // Pause for delayDuration before the next playthrough
+        inDelayPhase = true;
+        delayTimer = std::chrono::milliseconds::zero();
+        return;
+      }
+
+      // One-shot source: finished
+      playing = false;
+      playbackPosition = 0;
+    }
+
+    // True only while actively playing (not stopped, not in the loop delay).
+    [[nodiscard]] bool ShouldProcessAudio() const {
+      return playing && !inDelayPhase;
+    }
+
+    [[nodiscard]] uint32_t GetPlaybackPosition() const {
+      return playbackPosition;
+    }
+
+    [[nodiscard]] const std::string& GetName() const {
+      return name;
+    }
+
+    // Pointer to the three stored position components (x, y, z).
+    [[nodiscard]] const float* GetPosition() const {
+      return position;
+    }
+
+    [[nodiscard]] double GetSampleAccumulator() const {
+      return sampleAccumulator;
+    }
+
+    void SetSampleAccumulator(double value) {
+      sampleAccumulator = value;
+    }
+
+  private:
+    // Shared by Play() and Stop(): back to sample 0 with no delay pending.
+    void RewindState() {
+      playbackPosition = 0;
+      delayTimer = std::chrono::milliseconds::zero();
+      inDelayPhase = false;
+      sampleAccumulator = 0.0;
+    }
+
+    std::string name;
+    bool playing = false;
+    bool loop = false;
+    float volume = 1.0f;
+    float position[3] = {0.0f, 0.0f, 0.0f};
+    float velocity[3] = {0.0f, 0.0f, 0.0f};
+
+    // Playback timing
+    uint32_t playbackPosition = 0;   // current position, in samples
+    uint32_t audioLengthSamples = 0; // clip length, in samples
+    std::chrono::milliseconds delayTimer{0}; // time spent in the current loop delay
+    bool inDelayPhase = false;       // waiting between two playthroughs
+    static constexpr std::chrono::milliseconds delayDuration{1500}; // 1.5-second delay between loops
+    double sampleAccumulator = 0.0;  // per-source sample accumulator
+};
+
+#if defined(PLATFORM_ANDROID)
+
+// OpenSL ES audio output device implementation
+//
+// Float samples passed to WriteAudio() are queued; EnqueueNextBuffer()
+// converts them to 16-bit PCM and hands them to the OpenSL ES buffer queue.
+// It is invoked from Start() and from the buffer-queue callback registered
+// in Initialize().
+class OpenSLESAudioOutputDevice : public AudioOutputDevice {
+  public:
+    OpenSLESAudioOutputDevice() = default;
+    ~OpenSLESAudioOutputDevice() override {
+      OpenSLESAudioOutputDevice::Stop();
+      Cleanup();
+    }
+
+    // Creates the engine, output mix and audio player and registers the
+    // buffer-queue callback. A channels/bufferSize value of 0 selects the
+    // default (2 channels / 1024 frames). Returns false, with all partially
+    // created objects destroyed, if any OpenSL ES call fails.
+    bool Initialize(uint32_t sampleRate, uint32_t channels, uint32_t bufferSize) override {
+      // Re-initialising must not leak the objects of a previous session.
+      if (initialized) {
+        OpenSLESAudioOutputDevice::Stop();
+        Cleanup();
+      }
+
+      this->sampleRate = sampleRate;
+      this->channels = channels == 0 ? 2u : channels;
+      this->bufferSize = bufferSize == 0 ? 1024u : bufferSize;
+
+      // Create and realize engine
+      SLresult result = slCreateEngine(&engineObject, 0, nullptr, 0, nullptr, nullptr);
+      if (result != SL_RESULT_SUCCESS) {
+        LOGE("OpenSLES: slCreateEngine failed (%d)", result);
+        return false;
+      }
+      result = (*engineObject)->Realize(engineObject, SL_BOOLEAN_FALSE);
+      if (result != SL_RESULT_SUCCESS) {
+        LOGE("OpenSLES: Engine Realize failed (%d)", result);
+        Cleanup();
+        return false;
+      }
+      result = (*engineObject)->GetInterface(engineObject, SL_IID_ENGINE, &engineEngine);
+      if (result != SL_RESULT_SUCCESS) {
+        LOGE("OpenSLES: GetInterface SL_IID_ENGINE failed (%d)", result);
+        Cleanup();
+        return false;
+      }
+
+      // Create output mix
+      result = (*engineEngine)->CreateOutputMix(engineEngine, &outputMixObject, 0, nullptr, nullptr);
+      if (result != SL_RESULT_SUCCESS) {
+        LOGE("OpenSLES: CreateOutputMix failed (%d)", result);
+        Cleanup();
+        return false;
+      }
+      result = (*outputMixObject)->Realize(outputMixObject, SL_BOOLEAN_FALSE);
+      if (result != SL_RESULT_SUCCESS) {
+        LOGE("OpenSLES: OutputMix Realize failed (%d)", result);
+        Cleanup();
+        return false;
+      }
+
+      // Configure source: buffer queue + PCM format
+      SLDataLocator_AndroidSimpleBufferQueue loc_bufq{SL_DATALOCATOR_ANDROIDSIMPLEBUFFERQUEUE, (SLuint32) NUM_BUFFERS};
+      SLDataFormat_PCM format_pcm{};
+      format_pcm.formatType = SL_DATAFORMAT_PCM;
+      format_pcm.numChannels = (SLuint32) this->channels;
+      format_pcm.samplesPerSec = ToSLSampleRate(this->sampleRate);
+      format_pcm.bitsPerSample = SL_PCMSAMPLEFORMAT_FIXED_16;
+      format_pcm.containerSize = 16;
+      format_pcm.channelMask = (this->channels == 1) ? (SL_SPEAKER_FRONT_CENTER) : (SL_SPEAKER_FRONT_LEFT | SL_SPEAKER_FRONT_RIGHT);
+      format_pcm.endianness = SL_BYTEORDER_LITTLEENDIAN;
+
+      SLDataSource audioSrc{&loc_bufq, &format_pcm};
+
+      // Sink: OutputMix
+      SLDataLocator_OutputMix loc_outmix{SL_DATALOCATOR_OUTPUTMIX, outputMixObject};
+      SLDataSink audioSnk{&loc_outmix, nullptr};
+
+      // Create audio player; request buffer queue interface
+      const SLInterfaceID ids[] = {SL_IID_BUFFERQUEUE};
+      const SLboolean req[] = {SL_BOOLEAN_TRUE};
+      result = (*engineEngine)->CreateAudioPlayer(engineEngine, &playerObject, &audioSrc, &audioSnk, (SLuint32)(sizeof(ids) / sizeof(ids[0])), ids, req);
+      if (result != SL_RESULT_SUCCESS) {
+        LOGE("OpenSLES: CreateAudioPlayer failed (%d)", result);
+        Cleanup();
+        return false;
+      }
+      result = (*playerObject)->Realize(playerObject, SL_BOOLEAN_FALSE);
+      if (result != SL_RESULT_SUCCESS) {
+        LOGE("OpenSLES: Player Realize failed (%d)", result);
+        Cleanup();
+        return false;
+      }
+
+      // Interfaces
+      result = (*playerObject)->GetInterface(playerObject, SL_IID_PLAY, &playItf);
+      if (result != SL_RESULT_SUCCESS) {
+        LOGE("OpenSLES: GetInterface SL_IID_PLAY failed (%d)", result);
+        Cleanup();
+        return false;
+      }
+      result = (*playerObject)->GetInterface(playerObject, SL_IID_BUFFERQUEUE, &bufferQueueItf);
+      if (result != SL_RESULT_SUCCESS) {
+        LOGE("OpenSLES: GetInterface SL_IID_BUFFERQUEUE failed (%d)", result);
+        Cleanup();
+        return false;
+      }
+
+      // Setup buffers
+      pcmBuffers.assign(NUM_BUFFERS, std::vector<int16_t>(this->bufferSize * this->channels));
+      nextBufferIndex = 0;
+
+      // Register callback
+      result = (*bufferQueueItf)->RegisterCallback(bufferQueueItf, &OpenSLESAudioOutputDevice::BufferQueueCallback, this);
+      if (result != SL_RESULT_SUCCESS) {
+        LOGE("OpenSLES: RegisterCallback failed (%d)", result);
+        Cleanup();
+        return false;
+      }
+
+      initialized = true;
+      return true;
+    }
+
+    // Switches the player to PLAYING and primes the queue with two buffers.
+    // Returns false if the device is not initialized or the state change fails.
+    bool Start() override {
+      if (!initialized)
+        return false;
+      if (playing)
+        return true;
+
+      SLresult result = (*playItf)->SetPlayState(playItf, SL_PLAYSTATE_PLAYING);
+      if (result != SL_RESULT_SUCCESS) {
+        LOGE("OpenSLES: SetPlayState PLAYING failed (%d)", result);
+        return false;
+      }
+      // Flag playback only once the player really is playing, so a failed
+      // Start() does not leave IsPlaying() reporting true.
+      playing = true;
+
+      // Enqueue initial buffers to kick off the callback chain
+      for (int i = 0; i < 2; ++i) {
+        EnqueueNextBuffer();
+      }
+
+      return true;
+    }
+
+    // Stops the player and clears the OpenSL ES buffer queue. Always returns true.
+    bool Stop() override {
+      if (!initialized)
+        return true;
+      playing = false;
+      if (playItf) {
+        (*playItf)->SetPlayState(playItf, SL_PLAYSTATE_STOPPED);
+      }
+      if (bufferQueueItf) {
+        (*bufferQueueItf)->Clear(bufferQueueItf);
+      }
+      return true;
+    }
+
+    // Appends sampleCount * channels float samples from data to the pending
+    // queue. Returns false if the device is not initialized.
+    bool WriteAudio(const float* data, uint32_t sampleCount) override {
+      if (!initialized)
+        return false;
+      std::lock_guard<std::mutex> lock(bufferMutex);
+      const uint32_t total = sampleCount * channels;
+      for (uint32_t i = 0; i < total; ++i) {
+        audioQueue.push(data[i]);
+      }
+      return true;
+    }
+
+    bool IsPlaying() const override {
+      return playing;
+    }
+    // Number of frames handed to OpenSL ES so far (bufferSize per enqueued buffer).
+    uint32_t GetPosition() const override {
+      return playbackPosition.load();
+    }
+
+  private:
+    static constexpr int NUM_BUFFERS = 4;
+    uint32_t sampleRate = 44100;
+    uint32_t channels = 2;
+    uint32_t bufferSize = 1024;
+    bool initialized = false;
+    std::atomic<bool> playing{false};
+    // Atomic: advanced in EnqueueNextBuffer() (callback path) while
+    // GetPosition() reads it without taking bufferMutex.
+    std::atomic<uint32_t> playbackPosition{0};
+
+    SLObjectItf engineObject = nullptr;
+    SLEngineItf engineEngine = nullptr;
+    SLObjectItf outputMixObject = nullptr;
+    SLObjectItf playerObject = nullptr;
+    SLPlayItf playItf = nullptr;
+    SLAndroidSimpleBufferQueueItf bufferQueueItf = nullptr;
+
+    std::vector<std::vector<int16_t>> pcmBuffers; // ring of PCM buffers handed to OpenSL ES
+    int nextBufferIndex = 0;                       // next slot of pcmBuffers to fill
+    std::queue<float> audioQueue;                  // samples written but not yet enqueued
+    std::mutex bufferMutex;                        // guards audioQueue and the PCM ring
+
+    // Maps a rate in Hz to the OpenSL ES constant. Only 44.1 kHz and 48 kHz
+    // are recognised; any other value falls back to 44.1 kHz.
+    static SLuint32 ToSLSampleRate(uint32_t rate) {
+      switch (rate) {
+        case 44100:
+          return SL_SAMPLINGRATE_44_1;
+        case 48000:
+          return SL_SAMPLINGRATE_48;
+        default:
+          return SL_SAMPLINGRATE_44_1;
+      }
+    }
+
+    // Buffer-queue callback: refills the queue while playback is active.
+    static void BufferQueueCallback(SLAndroidSimpleBufferQueueItf, void* context) {
+      auto* self = static_cast<OpenSLESAudioOutputDevice *>(context);
+      if (self && self->playing) {
+        self->EnqueueNextBuffer();
+      }
+    }
+
+    // Fills the next PCM buffer from audioQueue (zero-padding when the queue
+    // runs dry) and enqueues it.
+    void EnqueueNextBuffer() {
+      std::lock_guard<std::mutex> lock(bufferMutex);
+      if (bufferQueueItf == nullptr || pcmBuffers.empty()) {
+        return;
+      }
+      auto& buf = pcmBuffers[nextBufferIndex];
+      const uint32_t totalSamples = bufferSize * channels;
+
+      for (uint32_t i = 0; i < totalSamples; ++i) {
+        if (!audioQueue.empty()) {
+          float s = audioQueue.front();
+          audioQueue.pop();
+          buf[i] = static_cast<int16_t>(std::clamp(s, -1.0f, 1.0f) * 32767.0f);
+        } else {
+          buf[i] = 0;
+        }
+      }
+
+      const SLresult result = (*bufferQueueItf)->Enqueue(bufferQueueItf, buf.data(), totalSamples * sizeof(int16_t));
+      if (result != SL_RESULT_SUCCESS) {
+        // The buffer was not accepted: do not count it as played and keep
+        // the same slot for the next attempt.
+        LOGE("OpenSLES: Enqueue failed (%d)", result);
+        return;
+      }
+      playbackPosition += bufferSize;
+      nextBufferIndex = (nextBufferIndex + 1) % NUM_BUFFERS;
+    }
+
+    // Destroys player, output mix and engine (in that order) and resets all
+    // state derived from them so no stale interface pointer can be used.
+    void Cleanup() {
+      if (playerObject) {
+        (*playerObject)->Destroy(playerObject);
+        playerObject = nullptr;
+      }
+      // Interfaces obtained from the player are invalid once it is destroyed.
+      playItf = nullptr;
+      bufferQueueItf = nullptr;
+      if (outputMixObject) {
+        (*outputMixObject)->Destroy(outputMixObject);
+        outputMixObject = nullptr;
+      }
+      if (engineObject) {
+        (*engineObject)->Destroy(engineObject);
+        engineObject = nullptr;
+      }
+      engineEngine = nullptr;
+      playing = false;
+      initialized = false;
+    }
+};
+
+#else
+
+// OpenAL audio output device implementation
+class OpenALAudioOutputDevice : public AudioOutputDevice {
+  public:
+    OpenALAudioOutputDevice() = default;
+    // Stops playback (joining the audio thread if it is running) and then
+    // releases the OpenAL resources. Stop() is called with an explicit class
+    // qualifier, so the call does not go through virtual dispatch.
+    ~OpenALAudioOutputDevice() override {
+      OpenALAudioOutputDevice::Stop();
+      Cleanup();
+    }
+
+    // Opens the default OpenAL device, creates a context, and generates the
+    // streaming source and NUM_BUFFERS buffers.
+    //
+    // A value of 0 for sampleRate, channels or bufferSize selects the default
+    // (44100 Hz / 2 channels / 1024 frames); a zero sample rate would
+    // otherwise reach the division in AudioThreadFunction().
+    //
+    // Returns false, with everything created so far released, when the
+    // device, context, source or buffers cannot be created.
+    bool Initialize(uint32_t sampleRate, uint32_t channels, uint32_t bufferSize) override {
+      // Re-initialising must not leak the previous device/context/source.
+      if (initialized) {
+        OpenALAudioOutputDevice::Stop();
+        Cleanup();
+      }
+
+      this->sampleRate = sampleRate == 0 ? 44100u : sampleRate;
+      this->channels = channels == 0 ? 2u : channels;
+      this->bufferSize = bufferSize == 0 ? 1024u : bufferSize;
+
+      // Initialize OpenAL
+      device = alcOpenDevice(nullptr); // Use default device
+      if (!device) {
+        std::cerr << "Failed to open OpenAL device" << std::endl;
+        return false;
+      }
+
+      context = alcCreateContext(device, nullptr);
+      if (!context) {
+        std::cerr << "Failed to create OpenAL context" << std::endl;
+        alcCloseDevice(device);
+        device = nullptr;
+        return false;
+      }
+
+      if (!alcMakeContextCurrent(context)) {
+        std::cerr << "Failed to make OpenAL context current" << std::endl;
+        alcDestroyContext(context);
+        alcCloseDevice(device);
+        context = nullptr;
+        device = nullptr;
+        return false;
+      }
+
+      // Cleanup() is a no-op until `initialized` is set, so failures below
+      // tear the context and device down by hand.
+      const auto releaseContext = [this]() {
+        alcMakeContextCurrent(nullptr);
+        alcDestroyContext(context);
+        alcCloseDevice(device);
+        context = nullptr;
+        device = nullptr;
+      };
+
+      // Generate OpenAL source
+      alGetError(); // discard any stale error so the checks below are accurate
+      alGenSources(1, &source);
+      ALenum genError = alGetError();
+      if (genError != AL_NO_ERROR) {
+        std::cerr << "Failed to generate OpenAL source (error " << genError << ")" << std::endl;
+        source = 0;
+        releaseContext();
+        return false;
+      }
+
+      // Generate OpenAL buffers for streaming
+      alGenBuffers(NUM_BUFFERS, buffers);
+      genError = alGetError();
+      if (genError != AL_NO_ERROR) {
+        std::cerr << "Failed to generate OpenAL buffers (error " << genError << ")" << std::endl;
+        alDeleteSources(1, &source);
+        source = 0;
+        releaseContext();
+        return false;
+      }
+
+      // Set source properties
+      alSourcef(source, AL_PITCH, 1.0f);
+      alSourcef(source, AL_GAIN, 1.0f);
+      alSource3f(source, AL_POSITION, 0.0f, 0.0f, 0.0f);
+      alSource3f(source, AL_VELOCITY, 0.0f, 0.0f, 0.0f);
+      alSourcei(source, AL_LOOPING, AL_FALSE);
+      CheckOpenALError("Source setup");
+
+      // Initialize audio buffer (sized from the validated member values)
+      audioBuffer.resize(this->bufferSize * this->channels);
+
+      // Initialize buffer tracking
+      queuedBufferCount = 0;
+      while (!availableBuffers.empty()) {
+        availableBuffers.pop();
+      }
+
+      initialized = true;
+      return true;
+    }
+
+    // Launches the audio thread if it is not already running.
+    // Returns false only when the device has not been initialized.
+    bool Start() override {
+      if (!initialized) {
+        std::cerr << "OpenAL audio output device not initialized" << std::endl;
+        return false;
+      }
+
+      if (!playing) {
+        // Flag first, then spawn the worker that runs AudioThreadFunction()
+        playing = true;
+        audioThread = std::thread(&OpenALAudioOutputDevice::AudioThreadFunction, this);
+      }
+
+      return true;
+    }
+
+    // Clears the playing flag, joins the audio thread and stops the OpenAL
+    // source. Does nothing when not playing. Always returns true.
+    bool Stop() override {
+      if (playing) {
+        playing = false;
+
+        // Block until the worker thread has exited
+        if (audioThread.joinable()) {
+          audioThread.join();
+        }
+
+        // Halt the source itself
+        if (initialized && source != 0) {
+          alSourceStop(source);
+          CheckOpenALError("alSourceStop");
+        }
+      }
+
+      return true;
+    }
+
+    // Pushes sampleCount * channels float samples from data onto the pending
+    // queue under bufferMutex. Returns false unless the device is both
+    // initialized and playing.
+    bool WriteAudio(const float* data, uint32_t sampleCount) override {
+      if (!(initialized && playing)) {
+        return false;
+      }
+
+      const std::lock_guard<std::mutex> guard(bufferMutex);
+
+      // Interleaved data: one value per channel per sample frame
+      const uint32_t valueCount = sampleCount * channels;
+      for (const float* cursor = data; cursor != data + valueCount; ++cursor) {
+        audioQueue.push(*cursor);
+      }
+
+      return true;
+    }
+
+    // Reports the playing flag, which Start() raises and Stop() clears.
+    [[nodiscard]] bool IsPlaying() const override {
+      return playing;
+    }
+
+    // Returns the running playback position counter. ProcessAudioBuffer advances
+    // it by the number of frames (samples / channels) queued to OpenAL.
+    [[nodiscard]] uint32_t GetPosition() const override {
+      return playbackPosition;
+    }
+
+  private:
+    // Number of OpenAL buffers generated for streaming
+    static constexpr int NUM_BUFFERS = 8;
+
+    // Stream format defaults (presumably overwritten by Initialize - TODO confirm)
+    uint32_t sampleRate = 44100;
+    uint32_t channels = 2;
+    uint32_t bufferSize = 1024; // frames per OpenAL buffer (multiplied by channels when sizing audioBuffer)
+    bool initialized = false;
+    // NOTE(review): written by Start()/Stop() and read by the loop in
+    // AudioThreadFunction on another thread, yet it is a plain bool - confirm
+    // whether this should be std::atomic<bool>.
+    bool playing = false;
+    // Frames queued to OpenAL so far; written in ProcessAudioBuffer, read by GetPosition().
+    // NOTE(review): same cross-thread concern as `playing`.
+    uint32_t playbackPosition = 0;
+
+    // OpenAL objects
+    ALCdevice* device = nullptr;
+    ALCcontext* context = nullptr;
+    ALuint source = 0;
+    ALuint buffers[NUM_BUFFERS]{};
+    // NOTE(review): not referenced in the visible part of the class - possibly unused
+    int currentBuffer = 0;
+
+    // Staging area sized to bufferSize * channels samples by Initialize
+    std::vector<float> audioBuffer;
+    // Interleaved samples waiting to be streamed; filled by WriteAudio, drained by ProcessAudioBuffer
+    std::queue<float> audioQueue;
+    // Guards audioQueue (held by WriteAudio and for the whole of ProcessAudioBuffer)
+    std::mutex bufferMutex;
+    // Worker that runs AudioThreadFunction between Start() and Stop()
+    std::thread audioThread;
+
+    // Buffer management for OpenAL streaming
+    // Buffers that were unqueued from the source and can be refilled
+    std::queue<ALuint> availableBuffers;
+    // Used as the index of the next never-queued entry of `buffers`; capped at NUM_BUFFERS
+    int queuedBufferCount = 0;
+
+    // Releases every OpenAL object owned by this device and resets the buffer
+    // bookkeeping. Does nothing when the device is not initialized.
+    void Cleanup() {
+      if (!initialized) {
+        return;
+      }
+
+      // Source first, then the buffers
+      if (source != 0) {
+        alDeleteSources(1, &source);
+        source = 0;
+      }
+      alDeleteBuffers(NUM_BUFFERS, buffers);
+
+      // Detach and destroy the context before closing the device it belongs to
+      if (context != nullptr) {
+        alcMakeContextCurrent(nullptr);
+        alcDestroyContext(context);
+        context = nullptr;
+      }
+      if (device != nullptr) {
+        alcCloseDevice(device);
+        device = nullptr;
+      }
+
+      // Forget all recycled buffers and start counting fresh ones from zero again
+      availableBuffers = std::queue<ALuint>();
+      queuedBufferCount = 0;
+
+      initialized = false;
+    }
+
+    // Body of the streaming thread: repeatedly feeds OpenAL via
+    // ProcessAudioBuffer and sleeps between iterations until `playing` is cleared.
+    void AudioThreadFunction() {
+      // Poll at one eighth of a buffer's duration for responsiveness.
+      // The math is done in 64 bits so bufferSize * 1000 cannot wrap, a zero
+      // sample rate cannot divide by zero, and the result is clamped to at
+      // least 1 ms so small buffers do not turn this loop into a busy spin.
+      const uint64_t rate = std::max<uint64_t>(sampleRate, 1);
+      const uint64_t eighthBufferMs = (static_cast<uint64_t>(bufferSize) * 1000) / rate / 8;
+      const auto sleepTime = std::chrono::milliseconds(
+        static_cast<std::chrono::milliseconds::rep>(std::max<uint64_t>(eighthBufferMs, 1))
+      );
+
+      while (playing) {
+        ProcessAudioBuffer();
+        std::this_thread::sleep_for(sleepTime);
+      }
+    }
+
+    // One streaming step, called repeatedly by the audio thread.
+    //
+    // Reclaims buffers OpenAL has finished with, and - if a buffer is free and
+    // at least one whole frame is pending - converts up to bufferSize frames
+    // from the float queue to 16-bit PCM, uploads them, queues the buffer on
+    // the source and (re)starts the source if it is not playing.
+    //
+    // The OpenAL buffer is acquired BEFORE any samples are removed from the
+    // queue, so when all buffers are still in flight the pending audio stays
+    // queued for the next call instead of being discarded.
+    void ProcessAudioBuffer() {
+      std::lock_guard<std::mutex> lock(bufferMutex);
+
+      // Only whole frames are sent so channel interleaving stays aligned
+      const uint32_t framesAvailable = static_cast<uint32_t>(audioQueue.size() / channels);
+      if (framesAvailable == 0) {
+        // Not enough data for a whole frame yet
+        return;
+      }
+
+      // Check for processed buffers and unqueue them
+      ALint processed = 0;
+      alGetSourcei(source, AL_BUFFERS_PROCESSED, &processed);
+      CheckOpenALError("alGetSourcei AL_BUFFERS_PROCESSED");
+
+      while (processed > 0) {
+        // Zero is never a generated buffer name, so it doubles as "unqueue failed"
+        ALuint recycled = 0;
+        alSourceUnqueueBuffers(source, 1, &recycled);
+        CheckOpenALError("alSourceUnqueueBuffers");
+
+        if (recycled != 0) {
+          availableBuffers.push(recycled);
+        }
+        processed--;
+      }
+
+      // Pick a buffer: prefer a recycled one, otherwise the next never-used one
+      ALuint buffer = 0;
+      bool usingFreshBuffer = false;
+      if (!availableBuffers.empty()) {
+        buffer = availableBuffers.front();
+        availableBuffers.pop();
+      } else if (queuedBufferCount < NUM_BUFFERS) {
+        buffer = buffers[queuedBufferCount];
+        usingFreshBuffer = true;
+      } else {
+        // Every buffer is still queued on the source; keep the samples for later
+        return;
+      }
+
+      // Drain the queue and convert float samples to 16-bit PCM for OpenAL
+      const uint32_t framesToSend = std::min(framesAvailable, bufferSize);
+      const uint32_t samplesToSend = framesToSend * channels;
+      std::vector<int16_t> pcmBuffer(samplesToSend);
+      for (uint32_t i = 0; i < samplesToSend; i++) {
+        // Clamp and convert to 16-bit PCM
+        const float sample = std::clamp(audioQueue.front(), -1.0f, 1.0f);
+        audioQueue.pop();
+        pcmBuffer[i] = static_cast<int16_t>(sample * 32767.0f);
+      }
+
+      // Determine format based on channels
+      const ALenum format = (channels == 1) ? AL_FORMAT_MONO16 : AL_FORMAT_STEREO16;
+
+      // Upload audio data to OpenAL buffer
+      alBufferData(buffer,
+                   format,
+                   pcmBuffer.data(),
+                   static_cast<ALsizei>(samplesToSend * sizeof(int16_t)),
+                   static_cast<ALsizei>(sampleRate));
+      CheckOpenALError("alBufferData");
+
+      // Queue the buffer
+      alSourceQueueBuffers(source, 1, &buffer);
+      CheckOpenALError("alSourceQueueBuffers");
+
+      // Advance the fresh-buffer index only when a fresh buffer was consumed;
+      // counting recycled buffers too would skip entries of `buffers` forever
+      if (usingFreshBuffer) {
+        queuedBufferCount++;
+      }
+
+      // Start playing if not already playing
+      ALint sourceState = 0;
+      alGetSourcei(source, AL_SOURCE_STATE, &sourceState);
+      CheckOpenALError("alGetSourcei AL_SOURCE_STATE");
+
+      if (sourceState != AL_PLAYING) {
+        alSourcePlay(source);
+        CheckOpenALError("alSourcePlay");
+      }
+
+      playbackPosition += framesToSend;
+    }
+};
+
+#endif
+
+// Tears the audio system down: background thread first, then the output
+// device, then the source and clip containers, and finally the HRTF buffers.
+AudioSystem::~AudioSystem() {
+  // Shut the background processing thread down before releasing anything else
+  stopAudioThread();
+
+  // Stop playback and release the output device, if one was created
+  if (outputDevice != nullptr) {
+    outputDevice->Stop();
+    outputDevice.reset();
+  }
+
+  // Drop all sources and the loaded audio clips
+  sources.clear();
+  audioData.clear();
+
+  // Release the buffers used for HRTF processing
+  cleanupHRTFBuffers();
+}
+
+// Fills `buffer` with `sampleCount` mono samples of a repeating debug signal:
+// a 0.75 s, 800 Hz sine burst followed by 1 s of silence, at 44.1 kHz.
+// `playbackPosition` is the absolute sample index of buffer[0]; it selects
+// where in the ping/silence cycle generation starts.
+void AudioSystem::GenerateSineWavePing(float* buffer, uint32_t sampleCount, uint32_t playbackPosition) {
+  constexpr float kRate = 44100.0f;
+  const float toneHz = 800.0f; // 800Hz ping
+  constexpr float kPingSeconds = 0.75f;
+  constexpr float kSilenceSeconds = 1.0f;
+  constexpr auto kPingLen = static_cast<uint32_t>(kPingSeconds * kRate);
+  constexpr auto kSilenceLen = static_cast<uint32_t>(kSilenceSeconds * kRate);
+  constexpr uint32_t kCycleLen = kPingLen + kSilenceLen;
+  constexpr float kAmplitude = 0.6f;
+
+  // ~1ms ramps at both ends of the burst, purely to avoid clicks
+  const uint32_t fadeInLen = static_cast<uint32_t>(0.001f * kRate);
+  const uint32_t fadeOutLen = static_cast<uint32_t>(0.001f * kRate);
+
+  for (uint32_t idx = 0; idx < sampleCount; idx++) {
+    const uint32_t phase = (playbackPosition + idx) % kCycleLen;
+
+    // Past the burst: silence phase
+    if (phase >= kPingLen) {
+      buffer[idx] = 0.0f;
+      continue;
+    }
+
+    // Linear ramp up at the start, linear ramp down at the end, flat in between
+    float gain = 1.0f;
+    if (phase < fadeInLen) {
+      gain = static_cast<float>(phase) / static_cast<float>(std::max(1u, fadeInLen));
+    } else if (phase > kPingLen - fadeOutLen) {
+      const uint32_t samplesLeft = kPingLen - phase;
+      gain = static_cast<float>(samplesLeft) / static_cast<float>(std::max(1u, fadeOutLen));
+    }
+
+    const float seconds = static_cast<float>(phase) / kRate;
+    const float tone = sinf(2.0f * std::numbers::pi_v<float> * toneHz * seconds);
+    buffer[idx] = kAmplitude * gain * tone;
+  }
+}
+
+// Sets the audio system up: stores the engine/renderer, generates default HRTF
+// data, applies default listener state, creates and starts the platform output
+// device, and launches the background processing thread.
+//
+// engine   - stored for later access to the active camera
+// renderer - optional; when non-null it must already be initialized
+//
+// Returns false (after logging) if the renderer is not initialized or the
+// output device cannot be initialized or started; true otherwise.
+bool AudioSystem::Initialize(Engine* engine, Renderer* renderer) {
+  this->engine = engine;
+
+  // A renderer is optional, but a supplied one has to be usable
+  if (renderer != nullptr && !renderer->IsInitialized()) {
+    std::cerr << "AudioSystem::Initialize: Renderer is not initialized" << std::endl;
+    return false;
+  }
+  // Kept for compute shader support; stays null when none was given
+  this->renderer = renderer;
+
+  // An empty filename forces generation of the default HRTF data
+  LoadHRTFData("");
+
+  // HRTF processing is on by default for 3D spatial audio
+  EnableHRTF(true);
+
+  // Default listener: at the origin, facing -Z with +Y up, at rest, full volume
+  SetListenerPosition(0.0f, 0.0f, 0.0f);
+  SetListenerOrientation(0.0f, 0.0f, -1.0f, 0.0f, 1.0f, 0.0f);
+  SetListenerVelocity(0.0f, 0.0f, 0.0f);
+  SetMasterVolume(1.0f);
+
+  // Pick the output backend for the platform
+#if defined(PLATFORM_ANDROID)
+  outputDevice = std::make_unique<OpenSLESAudioOutputDevice>();
+#else
+  outputDevice = std::make_unique<OpenALAudioOutputDevice>();
+#endif
+
+  const bool deviceReady = outputDevice->Initialize(44100, 2, 1024);
+  if (!deviceReady) {
+    std::cerr << "Failed to initialize audio output device" << std::endl;
+    return false;
+  }
+
+  const bool deviceStarted = outputDevice->Start();
+  if (!deviceStarted) {
+    std::cerr << "Failed to start audio output device" << std::endl;
+    return false;
+  }
+
+  // Background audio processing thread
+  startAudioThread();
+
+  initialized = true;
+  return true;
+}
+
+// Per-frame tick of the audio system.
+//
+// deltaTime - wall-clock time elapsed since the previous call
+//
+// Syncs the listener with the engine's active camera, then for every playing
+// source accumulates elapsed time as samples (44.1 kHz) and, once a full chunk
+// of kChunk samples is due, builds the mono input (loaded PCM or the debug
+// ping), prepends the convolution history and submits it for HRTF processing.
+// Does nothing until Initialize has succeeded.
+void AudioSystem::Update(std::chrono::milliseconds deltaTime) {
+  if (!initialized) {
+    return;
+  }
+
+  // Synchronize HRTF listener position and orientation with active camera
+  if (engine) {
+    const CameraComponent* activeCamera = engine->GetActiveCamera();
+    if (activeCamera) {
+      // Get camera position
+      glm::vec3 cameraPos = activeCamera->GetPosition();
+      SetListenerPosition(cameraPos.x, cameraPos.y, cameraPos.z);
+
+      // The camera looks at its target, so forward = normalize(target - position).
+      // Normalizing a zero-length vector yields NaNs, so when the target coincides
+      // with the camera position the previous orientation is kept instead.
+      glm::vec3 target = activeCamera->GetTarget();
+      glm::vec3 up = activeCamera->GetUp();
+      glm::vec3 toTarget = target - cameraPos;
+      if (glm::dot(toTarget, toTarget) > 0.0f) {
+        glm::vec3 forward = glm::normalize(toTarget);
+        SetListenerOrientation(forward.x, forward.y, forward.z, up.x, up.y, up.z);
+      }
+    }
+  }
+
+  // Update audio sources and process spatial audio
+  for (auto& source : sources) {
+    if (!source->IsPlaying()) {
+      continue;
+    }
+
+    // Cast to ConcreteAudioSource to access timing methods; skip any source of
+    // another concrete type rather than dereferencing a null pointer
+    auto* concreteSource = dynamic_cast<ConcreteAudioSource *>(source.get());
+    if (!concreteSource) {
+      continue;
+    }
+
+    // Update playback timing and delay logic
+    concreteSource->UpdatePlayback(deltaTime, 0);
+
+    // Only process audio if not in the delay phase
+    if (!concreteSource->ShouldProcessAudio()) {
+      continue;
+    }
+
+    // Process audio with HRTF spatial processing (works with or without renderer)
+    if (hrtfEnabled && !hrtfData.empty()) {
+      // Get source position for spatial processing
+      const float* sourcePosition = concreteSource->GetPosition();
+
+      // Accumulate samples based on real time and process in fixed-size chunks to avoid tiny buffers
+      double acc = concreteSource->GetSampleAccumulator();
+      acc += (static_cast<double>(deltaTime.count()) * 44100.0) / 1000.0; // ms -> samples
+      constexpr uint32_t kChunk = 33075; // 0.75 s at 44.1 kHz
+      uint32_t available = static_cast<uint32_t>(acc);
+      if (available < kChunk) {
+        // Not enough for a full chunk; keep accumulating
+        concreteSource->SetSampleAccumulator(acc);
+        continue;
+      }
+      // Process as many full chunks as available this frame
+      while (available >= kChunk) {
+        std::vector<float> inputBuffer(kChunk, 0.0f);
+        std::vector<float> outputBuffer(kChunk * 2, 0.0f);
+        uint32_t actualSamplesProcessed = 0;
+
+        // Generate audio signal from loaded audio data or debug ping
+        auto audioIt = audioData.find(concreteSource->GetName());
+        if (audioIt != audioData.end() && !audioIt->second.empty()) {
+          // Use actual loaded audio data with proper position tracking
+          const auto& data = audioIt->second;
+          uint32_t playbackPos = concreteSource->GetPlaybackPosition();
+
+          for (uint32_t i = 0; i < kChunk; i++) {
+            // 4 bytes per frame (16-bit stereo); only the first (left) sample is used.
+            // Computed in size_t so long clips cannot wrap a 32-bit byte offset.
+            const size_t dataIndex = (static_cast<size_t>(playbackPos) + i) * 4;
+
+            if (dataIndex + 1 < data.size()) {
+              // Convert from 16-bit PCM to float; memcpy avoids reading the byte
+              // buffer through an int16_t pointer (aliasing/alignment)
+              int16_t sample = 0;
+              std::memcpy(&sample, &data[dataIndex], sizeof(sample));
+              inputBuffer[i] = static_cast<float>(sample) / 32768.0f;
+              actualSamplesProcessed++;
+            } else {
+              // Reached end of audio data
+              inputBuffer[i] = 0.0f;
+            }
+          }
+        } else {
+          // Generate sine wave ping for debugging
+          GenerateSineWavePing(inputBuffer.data(), kChunk, concreteSource->GetPlaybackPosition());
+          actualSamplesProcessed = kChunk;
+        }
+
+        // Build extended input [history | current] to preserve convolution continuity across chunks
+        uint32_t histLen = (hrtfSize > 0) ? (hrtfSize - 1) : 0;
+        // NOTE(review): entries are keyed by raw source pointer and never erased here
+        static std::unordered_map<ConcreteAudioSource *, std::vector<float>> hrtfHistories;
+        auto& hist = hrtfHistories[concreteSource];
+        if (hist.size() != histLen) {
+          hist.assign(histLen, 0.0f);
+        }
+        std::vector<float> extendedInput(histLen + kChunk, 0.0f);
+        if (histLen > 0) {
+          std::memcpy(extendedInput.data(), hist.data(), histLen * sizeof(float));
+        }
+        std::memcpy(extendedInput.data() + histLen, inputBuffer.data(), kChunk * sizeof(float));
+
+        // Submit for GPU HRTF processing via the background thread (trim will occur in processAudioTask)
+        submitAudioTask(extendedInput.data(), static_cast<uint32_t>(extendedInput.size()), sourcePosition, actualSamplesProcessed, histLen);
+
+        // Update history with the tail of current input
+        if (histLen > 0) {
+          std::memcpy(hist.data(), inputBuffer.data() + (kChunk - histLen), histLen * sizeof(float));
+        }
+
+        // Update playback timing with actual samples processed
+        concreteSource->UpdatePlayback(std::chrono::milliseconds(0), actualSamplesProcessed);
+
+        // Consume one chunk from the accumulator
+        acc -= static_cast<double>(kChunk);
+        available -= kChunk;
+      }
+      // Store fractional remainder for next frame
+      concreteSource->SetSampleAccumulator(acc);
+    }
+  }
+
+  // Apply master volume changes to all active sources
+  for (auto& source : sources) {
+    if (source->IsPlaying()) {
+      // Master volume is applied during HRTF processing and individual source volume control
+      // Volume scaling is handled in the ProcessHRTF function
+    }
+  }
+
+  // Clean up finished audio sources
+  std::erase_if(sources,
+                [](const std::unique_ptr<AudioSource>& source) {
+                  // Keep all sources active for continuous playback
+                  // Audio sources can be stopped/started via their Play/Stop methods
+                  return false;
+                });
+
+  // Update timing for audio processing with low-latency chunks
+  static std::chrono::milliseconds accumulatedTime = std::chrono::milliseconds(0);
+  accumulatedTime += deltaTime;
+
+  // Process audio in 20ms chunks for optimal latency
+  constexpr std::chrono::milliseconds audioChunkTime = std::chrono::milliseconds(20); // 20ms chunks for real-time audio
+  if (accumulatedTime >= audioChunkTime) {
+    // Trigger audio buffer updates for smooth playback
+    // The HRTF processing ensures spatial audio is updated continuously
+    accumulatedTime = std::chrono::milliseconds(0);
+
+    // Update listener properties if they have changed
+    // This ensures spatial audio positioning stays current with camera movement
+  }
+}
+
+// Loads the sample data of a canonical (44-byte header) PCM WAV file and
+// stores the raw bytes in audioData under `name`.
+//
+// filename - path of the WAV file to read
+// name     - key under which the clip is stored (an existing entry is replaced)
+//
+// Returns false (after logging) when the file cannot be opened, the header is
+// truncated or not RIFF/WAVE/fmt /data, the format is not PCM, or fewer than
+// the declared number of data bytes are present; true otherwise.
+bool AudioSystem::LoadAudio(const std::string& filename, const std::string& name) {
+  // Open the WAV file (closed automatically when `file` goes out of scope)
+  std::ifstream file(filename, std::ios::binary);
+  if (!file.is_open()) {
+    std::cerr << "Failed to open audio file: " << filename << std::endl;
+    return false;
+  }
+
+  // Canonical WAV header layout; the "data" chunk must directly follow "fmt "
+  struct WAVHeader {
+    char riff[4]; // "RIFF"
+    uint32_t fileSize; // File size - 8
+    char wave[4]; // "WAVE"
+    char fmt[4]; // "fmt "
+    uint32_t fmtSize; // Format chunk size
+    uint16_t audioFormat; // Audio format (1 = PCM)
+    uint16_t numChannels; // Number of channels
+    uint32_t sampleRate; // Sample rate
+    uint32_t byteRate; // Byte rate
+    uint16_t blockAlign; // Block align
+    uint16_t bitsPerSample; // Bits per sample
+    char data[4]; // "data"
+    uint32_t dataSize; // Data size
+  };
+
+  WAVHeader header{};
+  file.read(reinterpret_cast<char *>(&header), sizeof(WAVHeader));
+
+  // Validate WAV header, including that all of it was actually read
+  if (file.gcount() != static_cast<std::streamsize>(sizeof(WAVHeader)) ||
+    std::strncmp(header.riff, "RIFF", 4) != 0 ||
+    std::strncmp(header.wave, "WAVE", 4) != 0 ||
+    std::strncmp(header.fmt, "fmt ", 4) != 0 ||
+    std::strncmp(header.data, "data", 4) != 0) {
+    std::cerr << "Invalid WAV file format: " << filename << std::endl;
+    return false;
+  }
+
+  // Only support PCM format for now
+  if (header.audioFormat != 1) {
+    std::cerr << "Unsupported audio format (only PCM supported): " << filename << std::endl;
+    return false;
+  }
+
+  // Playback reads the data as 4-byte frames of 16-bit stereo at 44.1 kHz;
+  // other layouts still load (as before) but will not play back correctly
+  if (header.numChannels != 2 || header.bitsPerSample != 16 || header.sampleRate != 44100) {
+    std::cerr << "Warning: " << filename
+              << " is not 16-bit stereo 44.1kHz PCM; playback assumes that layout" << std::endl;
+  }
+
+  // dataSize comes from the file and is untrusted: when the stream is seekable,
+  // compare it with the bytes actually present before allocating for it
+  const std::streamoff dataStart = file.tellg();
+  if (dataStart >= 0 && file.seekg(0, std::ios::end)) {
+    const std::streamoff fileEnd = file.tellg();
+    file.seekg(dataStart, std::ios::beg);
+    if (fileEnd >= dataStart && static_cast<uint64_t>(fileEnd - dataStart) < header.dataSize) {
+      std::cerr << "Failed to read complete audio data from: " << filename << std::endl;
+      return false;
+    }
+  }
+  // A failed seek on a non-seekable stream must not poison the read below
+  file.clear();
+
+  // Read audio data and verify the byte count while the stream is still open
+  std::vector<uint8_t> data(header.dataSize);
+  file.read(reinterpret_cast<char *>(data.data()), static_cast<std::streamsize>(header.dataSize));
+  if (file.gcount() != static_cast<std::streamsize>(header.dataSize)) {
+    std::cerr << "Failed to read complete audio data from: " << filename << std::endl;
+    return false;
+  }
+
+  // Store the audio data
+  audioData[name] = std::move(data);
+
+  return true;
+}
+
+// Creates a source that plays the clip previously stored under `name`.
+// Returns a non-owning pointer to the new source (owned by `sources`), or
+// nullptr (after logging) when no clip with that name has been loaded.
+AudioSource* AudioSystem::CreateAudioSource(const std::string& name) {
+  const auto clip = audioData.find(name);
+  if (clip == audioData.end()) {
+    std::cerr << "AudioSystem::CreateAudioSource: Audio data not found: " << name << std::endl;
+    return nullptr;
+  }
+
+  auto created = std::make_unique<ConcreteAudioSource>(name);
+
+  // The clip is treated as 16-bit stereo, i.e. 4 bytes per frame, matching the
+  // `(playbackPos + i) * 4` indexing used when the data is read for playback.
+  // An empty clip leaves the source's length untouched.
+  if (const auto& pcmBytes = clip->second; !pcmBytes.empty()) {
+    const uint32_t frameCount = static_cast<uint32_t>(pcmBytes.size()) / 4;
+    created->SetAudioLength(frameCount);
+  }
+
+  // Hand ownership to the system and return a view of the stored source
+  sources.push_back(std::move(created));
+  return sources.back().get();
+}
+
+// Creates a source with no backing clip, intended for the generated debug ping.
+// Returns a non-owning pointer to the new source (owned by `sources`).
+AudioSource* AudioSystem::CreateDebugPingSource(const std::string& name) {
+  // Create a new audio source for debugging
+  auto source = std::make_unique<ConcreteAudioSource>(name);
+
+  // The ping/silence cycle (0.75s ping + 1.0s silence) is produced entirely by
+  // GenerateSineWavePing, so no cycle parameters are needed here.
+  // Disable source-level length/delay to avoid double-silence and audible resets.
+  source->SetAudioLength(0);
+
+  // Store the source
+  sources.push_back(std::move(source));
+
+  return sources.back().get();
+}
+
+// Stores the listener's position as (x, y, z) in listenerPosition[0..2].
+void AudioSystem::SetListenerPosition(const float x, const float y, const float z) {
+  const float components[3] = {x, y, z};
+  for (int axis = 0; axis < 3; ++axis) {
+    listenerPosition[axis] = components[axis];
+  }
+}
+
+// Stores the listener's orientation: the forward vector goes into
+// listenerOrientation[0..2] and the up vector into listenerOrientation[3..5].
+// The values are stored as given; no normalization is performed here.
+void AudioSystem::SetListenerOrientation(const float forwardX,
+                                         const float forwardY,
+                                         const float forwardZ,
+                                         const float upX,
+                                         const float upY,
+                                         const float upZ) {
+  const float packed[6] = {forwardX, forwardY, forwardZ, upX, upY, upZ};
+  for (int slot = 0; slot < 6; ++slot) {
+    listenerOrientation[slot] = packed[slot];
+  }
+}
+
+// Stores the listener's velocity as (x, y, z) in listenerVelocity[0..2].
+void AudioSystem::SetListenerVelocity(const float x, const float y, const float z) {
+  const float components[3] = {x, y, z};
+  for (int axis = 0; axis < 3; ++axis) {
+    listenerVelocity[axis] = components[axis];
+  }
+}
+
+// Stores the master volume exactly as given; no clamping or validation is done here.
+void AudioSystem::SetMasterVolume(const float volume) {
+  masterVolume = volume;
+}
+
+// Turns HRTF spatial processing on or off by setting the hrtfEnabled flag.
+void AudioSystem::EnableHRTF(const bool enable) {
+  hrtfEnabled = enable;
+}
+
+// Returns the current value of the hrtfEnabled flag.
+bool AudioSystem::IsHRTFEnabled() const {
+  return hrtfEnabled;
+}
+
+// Intentionally ignores its argument: the CPU-only flag is always forced to
+// false, so a request for CPU-only HRTF processing has no effect.
+void AudioSystem::SetHRTFCPUOnly([[maybe_unused]] const bool cpuOnly) {
+  // Enforce GPU-only HRTF processing: ignore CPU-only requests
+  hrtfCPUOnly = false;
+}
+
+// Returns the hrtfCPUOnly flag (SetHRTFCPUOnly always stores false in it).
+bool AudioSystem::IsHRTFCPUOnly() const {
+  return hrtfCPUOnly;
+}
+
+// Populates hrtfData, hrtfSize and numHrtfPositions.
+//
+// filename - optional path to a custom HRTF file: the 4 bytes "HRTF", three
+//            uint32 values (impulse length, position count, channel count),
+//            then impulse*positions*channels floats. Pass "" to skip the file.
+//
+// When the file is missing, malformed, truncated, declares zero sizes or a
+// channel count other than 2, a synthetic HRTF set is generated instead
+// (36 azimuths x 13 elevations, 256 samples per ear). Always returns true.
+bool AudioSystem::LoadHRTFData(const std::string& filename) {
+  // HRTF parameters
+  constexpr uint32_t hrtfSampleCount = 256; // Number of samples per impulse response
+  constexpr uint32_t positionCount = 36 * 13; // 36 azimuths (10-degree steps) * 13 elevations (15-degree steps)
+  constexpr uint32_t channelCount = 2; // Stereo (left and right ears)
+  const float sampleRate = 44100.0f; // Sample rate for HRTF data
+  const float speedOfSound = 343.0f; // Speed of sound in m/s
+
+  // Try to load from a file first (only if the filename is provided)
+  if (!filename.empty()) {
+    if (std::ifstream file(filename, std::ios::binary); file.is_open()) {
+      // Read the magic and the three size fields
+      char header[4] = {};
+      uint32_t fileHrtfSize = 0, filePositionCount = 0, fileChannelCount = 0;
+      file.read(header, 4);
+      file.read(reinterpret_cast<char *>(&fileHrtfSize), sizeof(uint32_t));
+      file.read(reinterpret_cast<char *>(&filePositionCount), sizeof(uint32_t));
+      file.read(reinterpret_cast<char *>(&fileChannelCount), sizeof(uint32_t));
+
+      // The whole header must have been read, and zero sizes are rejected
+      // because later processing computes hrtfSize - 1
+      const bool headerOk = file.good() &&
+                            std::strncmp(header, "HRTF", 4) == 0 &&
+                            fileChannelCount == channelCount &&
+                            fileHrtfSize > 0 &&
+                            filePositionCount > 0;
+
+      if (headerOk) {
+        // The sizes come from the file and are untrusted: bound them by the
+        // number of floats actually present before allocating for them
+        const std::streamoff payloadStart = file.tellg();
+        file.seekg(0, std::ios::end);
+        const std::streamoff fileEnd = file.tellg();
+        file.seekg(payloadStart, std::ios::beg);
+
+        const bool sizeKnown = file.good() && payloadStart >= 0 && fileEnd >= payloadStart;
+        const uint64_t floatsInFile = sizeKnown ? static_cast<uint64_t>(fileEnd - payloadStart) / sizeof(float) : 0;
+        const uint64_t floatsPerPosition = static_cast<uint64_t>(fileHrtfSize) * fileChannelCount;
+
+        // Division form keeps the comparison free of 64-bit overflow
+        if (filePositionCount <= floatsInFile / floatsPerPosition) {
+          const auto totalFloats = static_cast<size_t>(floatsPerPosition * filePositionCount);
+          const auto totalBytes = static_cast<std::streamsize>(totalFloats * sizeof(float));
+
+          hrtfData.resize(totalFloats);
+          file.read(reinterpret_cast<char *>(hrtfData.data()), totalBytes);
+
+          if (file.gcount() == totalBytes) {
+            hrtfSize = fileHrtfSize;
+            numHrtfPositions = filePositionCount;
+            return true;
+          }
+          // Short read: fall through; the generator below overwrites hrtfData
+        }
+      }
+    }
+  }
+
+  // Generate realistic HRTF data based on acoustic modeling
+  // Resize the HRTF data vector
+  hrtfData.resize(hrtfSampleCount * positionCount * channelCount);
+
+  // Generate HRTF impulse responses for each position
+  for (uint32_t pos = 0; pos < positionCount; pos++) {
+    // Calculate azimuth and elevation for this position
+    uint32_t azimuthIndex = pos % 36;
+    uint32_t elevationIndex = pos / 36;
+
+    float azimuth = (static_cast<float>(azimuthIndex) * 10.0f - 180.0f) * std::numbers::pi_v<float> / 180.0f;
+    float elevation = (static_cast<float>(elevationIndex) * 15.0f - 90.0f) * std::numbers::pi_v<float> / 180.0f;
+
+    // Convert to Cartesian coordinates
+    float x = std::cos(elevation) * std::sin(azimuth);
+    float y = std::sin(elevation);
+    float z = std::cos(elevation) * std::cos(azimuth);
+
+    for (uint32_t channel = 0; channel < channelCount; channel++) {
+      // Calculate ear position (left ear: -0.1m, right ear: +0.1m on x-axis)
+      float earX = (channel == 0) ? -0.1f : 0.1f;
+
+      // Calculate distance from source to ear
+      float dx = x - earX;
+      float dy = y;
+      float dz = z;
+      float distance = std::sqrt(dx * dx + dy * dy + dz * dz);
+
+      // Calculate time delay (ITD - Interaural Time Difference)
+      float timeDelay = distance / speedOfSound;
+      auto sampleDelay = static_cast<uint32_t>(timeDelay * sampleRate);
+
+      // Calculate head shadow effect (ILD - Interaural Level Difference)
+      float shadowFactor = 1.0f;
+      if (channel == 0 && azimuth > 0) {
+        // Left ear, source on right
+        shadowFactor = 0.3f + 0.7f * std::exp(-azimuth * 2.0f);
+      } else if (channel == 1 && azimuth < 0) {
+        // Right ear, source on left
+        shadowFactor = 0.3f + 0.7f * std::exp(azimuth * 2.0f);
+      }
+
+      // Generate impulse response: a 10-sample decaying 1 kHz burst starting at
+      // the ear's delay, zero everywhere else
+      for (uint32_t i = 0; i < hrtfSampleCount; i++) {
+        float value = 0.0f;
+
+        // Direct path impulse
+        if (i >= sampleDelay && i < sampleDelay + 10) {
+          float t = static_cast<float>(i - sampleDelay) / sampleRate;
+          value = shadowFactor * std::exp(-t * 1000.0f) * std::cos(2.0f * std::numbers::pi_v<float> * 1000.0f * t);
+        }
+
+        // Apply distance attenuation
+        value /= std::max(1.0f, distance);
+
+        // Layout: [position][channel][sample]
+        uint32_t index = pos * hrtfSampleCount * channelCount + channel * hrtfSampleCount + i;
+        hrtfData[index] = value;
+      }
+    }
+  }
+
+  // Store HRTF parameters
+  hrtfSize = hrtfSampleCount;
+  numHrtfPositions = positionCount;
+
+  return true;
+}
+
+bool AudioSystem::ProcessHRTF(const float* inputBuffer, float* outputBuffer, uint32_t sampleCount, const float* sourcePosition) {
+  if (!hrtfEnabled) {
+    // If HRTF is disabled, just copy input to output
+    for (uint32_t i = 0; i < sampleCount; i++) {
+      outputBuffer[i * 2] = inputBuffer[i]; // Left channel
+      outputBuffer[i * 2 + 1] = inputBuffer[i]; // Right channel
+    }
+    return true;
+  }
+
+  // Check if we should use CPU-only processing or if Vulkan is not available
+  // Also force CPU processing if we've detected threading issues previously
+  static bool forceGPUFallback = false;
+  if (hrtfCPUOnly || !renderer || !renderer->IsInitialized() || forceGPUFallback) {
+    // Use CPU-based HRTF processing (either forced or fallback)
+
+    // Create buffers for HRTF processing if they don't exist or if the sample count has changed
+    if (!createHRTFBuffers(sampleCount)) {
+      std::cerr << "Failed to create HRTF buffers" << std::endl;
+      return false;
+    }
+
+    // Copy input data to input buffer
+    void* data = inputBufferMemory.mapMemory(0, sampleCount * sizeof(float));
+    memcpy(data, inputBuffer, sampleCount * sizeof(float));
+    inputBufferMemory.unmapMemory();
+
+    // Copy source and listener positions
+    memcpy(params.sourcePosition, sourcePosition, sizeof(float) * 3);
+    memcpy(params.listenerPosition, listenerPosition, sizeof(float) * 3);
+    memcpy(params.listenerOrientation, listenerOrientation, sizeof(float) * 6);
+    params.sampleCount = sampleCount;
+    params.hrtfSize = hrtfSize;
+    params.numHrtfPositions = numHrtfPositions;
+    params.padding = 0.0f;
+
+    // Copy parameters to parameter buffer using persistent memory mapping
+    if (persistentParamsMemory) {
+      memcpy(persistentParamsMemory, &params, sizeof(HRTFParams));
+    } else {
+      std::cerr << "WARNING: Persistent memory not available, falling back to map/unmap" << std::endl;
+      data = paramsBufferMemory.mapMemory(0, sizeof(HRTFParams));
+      memcpy(data, &params, sizeof(HRTFParams));
+      paramsBufferMemory.unmapMemory();
+    }
+
+    // Perform HRTF processing using CPU-based convolution
+    // This implementation provides real-time 3D audio spatialization
+
+    // Calculate direction from listener to source
+    float direction[3];
+    direction[0] = sourcePosition[0] - listenerPosition[0];
+    direction[1] = sourcePosition[1] - listenerPosition[1];
+    direction[2] = sourcePosition[2] - listenerPosition[2];
+
+    // Normalize direction
+    float length = std::sqrt(direction[0] * direction[0] + direction[1] * direction[1] + direction[2] * direction[2]);
+    if (length > 0.0001f) {
+      direction[0] /= length;
+      direction[1] /= length;
+      direction[2] /= length;
+    } else {
+      direction[0] = 0.0f;
+      direction[1] = 0.0f;
+      direction[2] = -1.0f; // Default to front
+    }
+
+    // Calculate azimuth and elevation
+    float azimuth = std::atan2(direction[0], direction[2]);
+    float elevation = std::asin(std::max(-1.0f, std::min(1.0f, direction[1])));
+
+    // Convert to indices
+    int azimuthIndex = static_cast<int>((azimuth + std::numbers::pi_v<float>) / (2.0f * std::numbers::pi_v<float>) * 36.0f) % 36;
+    int elevationIndex = static_cast<int>((elevation + std::numbers::pi_v<float> / 2.0f) / std::numbers::pi_v<float> * 13.0f);
+    elevationIndex = std::max(0, std::min(12, elevationIndex));
+
+    // Get HRTF index
+    int hrtfIndex = elevationIndex * 36 + azimuthIndex;
+    hrtfIndex = std::min(hrtfIndex, static_cast<int>(numHrtfPositions) - 1);
+
+    // Perform convolution for left and right ears with simple overlap-add using per-direction input history
+    static std::unordered_map<int, std::vector<float>> convHistories; // mono histories keyed by hrtfIndex
+    const uint32_t histLenDesired = (hrtfSize > 0) ? (hrtfSize - 1) : 0;
+    auto& convHistory = convHistories[hrtfIndex];
+    if (convHistory.size() != histLenDesired) {
+      convHistory.assign(histLenDesired, 0.0f);
+    }
+
+    // Build extended input: [history | current input]
+    std::vector<float> extInput(histLenDesired + sampleCount, 0.0f);
+    if (histLenDesired > 0) {
+      std::memcpy(extInput.data(), convHistory.data(), histLenDesired * sizeof(float));
+    }
+    if (sampleCount > 0) {
+      std::memcpy(extInput.data() + histLenDesired, inputBuffer, sampleCount * sizeof(float));
+    }
+
+    for (uint32_t i = 0; i < sampleCount; i++) {
+      float leftSample = 0.0f;
+      float rightSample = 0.0f;
+
+      // Convolve with HRTF impulse response using extended input
+      // extIndex = histLenDesired + i - j; ensure extIndex >= 0
+      uint32_t jMax = std::min<uint32_t>(hrtfSize - 1, histLenDesired + i);
+      for (uint32_t j = 0; j <= jMax; j++) {
+        uint32_t extIndex = histLenDesired + i - j;
+        uint32_t hrtfLeftIndex = hrtfIndex * hrtfSize * 2 + j;
+        uint32_t hrtfRightIndex = hrtfIndex * hrtfSize * 2 + hrtfSize + j;
+
+        if (hrtfLeftIndex < hrtfData.size() && hrtfRightIndex < hrtfData.size()) {
+          float in = extInput[extIndex];
+          leftSample += in * hrtfData[hrtfLeftIndex];
+          rightSample += in * hrtfData[hrtfRightIndex];
+        }
+      }
+
+      // Apply distance attenuation
+      float distanceAttenuation = 1.0f / std::max(1.0f, length);
+      leftSample *= distanceAttenuation;
+      rightSample *= distanceAttenuation;
+
+      // Write to output buffer
+      outputBuffer[i * 2] = leftSample;
+      outputBuffer[i * 2 + 1] = rightSample;
+    }
+
+    // Update history with the tail of the extended input
+    if (histLenDesired > 0) {
+      std::memcpy(convHistory.data(), extInput.data() + sampleCount, histLenDesired * sizeof(float));
+    }
+
+    return true;
+  } else {
+    // Use Vulkan shader-based HRTF processing with fallback to CPU
+    try {
+      // Validate HRTF data exists
+      if (hrtfData.empty()) {
+        LoadHRTFData(""); // Generate HRTF data
+      }
+
+      // Create buffers for HRTF processing if they don't exist or if the sample count has changed
+      if (!createHRTFBuffers(sampleCount)) {
+        std::cerr << "Failed to create HRTF buffers, falling back to CPU processing" << std::endl;
+        throw std::runtime_error("Buffer creation failed");
+      }
+
+      // Copy input data to input buffer
+      void* data = inputBufferMemory.mapMemory(0, sampleCount * sizeof(float));
+      memcpy(data, inputBuffer, sampleCount * sizeof(float));
+
+      inputBufferMemory.unmapMemory();
+
+      // Set up HRTF parameters with proper std140 uniform buffer layout
+      struct alignas(16) HRTFParams {
+        float listenerPosition[4]; // vec3 + padding (16 bytes) - offset 0
+        float listenerForward[4]; // vec3 + padding (16 bytes) - offset 16
+        float listenerUp[4]; // vec3 + padding (16 bytes) - offset 32
+        float sourcePosition[4]; // vec3 + padding (16 bytes) - offset 48
+        float sampleCount; // float (4 bytes) - offset 64
+        float padding1[3]; // Padding to align to 16-byte boundary - offset 68
+        uint32_t inputChannels; // uint (4 bytes) - offset 80
+        uint32_t outputChannels; // uint (4 bytes) - offset 84
+        uint32_t hrtfSize; // uint (4 bytes) - offset 88
+        uint32_t numHrtfPositions; // uint (4 bytes) - offset 92
+        float distanceAttenuation; // float (4 bytes) - offset 96
+        float dopplerFactor; // float (4 bytes) - offset 100
+        float reverbMix; // float (4 bytes) - offset 104
+        float padding2; // Padding to complete 16-byte alignment - offset 108
+      } params{};
+
+      // Copy listener and source positions with proper padding for GPU alignment
+      memcpy(params.listenerPosition, listenerPosition, sizeof(float) * 3);
+      params.listenerPosition[3] = 0.0f; // Padding for float3 alignment
+      memcpy(params.listenerForward, &listenerOrientation[0], sizeof(float) * 3); // Forward vector
+      params.listenerForward[3] = 0.0f; // Padding for float3 alignment
+      memcpy(params.listenerUp, &listenerOrientation[3], sizeof(float) * 3); // Up vector
+      params.listenerUp[3] = 0.0f; // Padding for float3 alignment
+      memcpy(params.sourcePosition, sourcePosition, sizeof(float) * 3);
+      params.sourcePosition[3] = 0.0f; // Padding for float3 alignment
+      params.sampleCount = static_cast<float>(sampleCount); // Number of samples to process
+      params.padding1[0] = params.padding1[1] = params.padding1[2] = 0.0f; // Initialize padding
+      params.inputChannels = 1; // Mono input
+      params.outputChannels = 2; // Stereo output
+      params.hrtfSize = hrtfSize;
+      params.numHrtfPositions = numHrtfPositions;
+      params.distanceAttenuation = 1.0f;
+      params.dopplerFactor = 1.0f;
+      params.reverbMix = 0.0f;
+      params.padding2 = 0.0f; // Initialize padding
+
+      // Copy parameters to parameter buffer using persistent memory mapping
+      if (persistentParamsMemory) {
+        memcpy(persistentParamsMemory, &params, sizeof(HRTFParams));
+      } else {
+        std::cerr << "ERROR: Persistent memory not available for GPU processing!" << std::endl;
+        throw std::runtime_error("Persistent memory required for GPU processing");
+      }
+
+      // Use renderer's main compute pipeline instead of dedicated HRTF pipeline
+      uint32_t workGroupSize = 64; // Must match the numthreads in the shader
+      uint32_t groupCountX = (sampleCount + workGroupSize - 1) / workGroupSize;
+
+      // Use renderer's main compute pipeline dispatch method
+      auto computeFence = renderer->DispatchCompute(groupCountX,
+                                                    1,
+                                                    1,
+                                                    *this->inputBuffer,
+                                                    *this->outputBuffer,
+                                                    *this->hrtfBuffer,
+                                                    *this->paramsBuffer);
+
+      // Wait for compute shader to complete using fence-based synchronization
+      const vk::raii::Device& device = renderer->GetRaiiDevice();
+      vk::Result result = device.waitForFences(*computeFence, VK_TRUE, UINT64_MAX);
+      if (result != vk::Result::eSuccess) {
+        std::cerr << "Failed to wait for compute fence: " << vk::to_string(result) << std::endl;
+        throw std::runtime_error("Fence wait failed");
+      }
+
+      // Copy results from output buffer to the output array
+      void* outputData = outputBufferMemory.mapMemory(0, sampleCount * 2 * sizeof(float));
+
+      memcpy(outputBuffer, outputData, sampleCount * 2 * sizeof(float));
+      outputBufferMemory.unmapMemory();
+
+      return true;
+    } catch (const std::exception& e) {
+      std::cerr << "GPU HRTF processing failed: " << e.what() << std::endl;
+      std::cerr << "CPU fallback disabled - GPU path required" << std::endl;
+      throw; // Re-throw the exception to ensure failure without CPU fallback
+    }
+  }
+}
+
+// Create (or reuse) the Vulkan buffers used by the compute-shader HRTF path.
+//
+// Four host-visible, host-coherent buffers are (re)created: mono input,
+// stereo output, the HRTF coefficient table (uploaded here) and the uniform
+// parameter block (left persistently mapped in persistentParamsMemory).
+//
+// sampleCount: number of mono input frames the buffers must hold.
+// Returns true when the buffers are ready for use, false on any failure
+// (in which case all HRTF buffers have been released).
+bool AudioSystem::createHRTFBuffers(uint32_t sampleCount) {
+  // Smart buffer reuse: only recreate if the sample count changed or buffers don't exist
+  if (currentSampleCount == sampleCount && *inputBuffer && *outputBuffer && *hrtfBuffer && *paramsBuffer) {
+    return true;
+  }
+
+  // Ensure all GPU operations complete before cleaning up existing buffers.
+  // External synchronization required (VVL): use renderer helper which serializes against queue usage.
+  if (renderer) {
+    renderer->WaitIdle();
+  }
+
+  // Clean up existing buffers only if we need to recreate them
+  cleanupHRTFBuffers();
+
+  if (!renderer) {
+    std::cerr << "AudioSystem::createHRTFBuffers: Renderer is null" << std::endl;
+    return false;
+  }
+
+  // Vulkan requires VkBufferCreateInfo::size > 0 and a non-zero map range,
+  // so refuse the request instead of handing invalid sizes to the driver.
+  if (sampleCount == 0 || hrtfData.empty()) {
+    std::cerr << "AudioSystem::createHRTFBuffers: Cannot create zero-sized HRTF buffers" << std::endl;
+    return false;
+  }
+
+  const vk::raii::Device& device = renderer->GetRaiiDevice();
+
+  // Helper: create an exclusive buffer of byteSize bytes, back it with
+  // host-visible + host-coherent memory and bind the two together.
+  auto createHostBuffer = [this, &device](vk::DeviceSize byteSize,
+                                          vk::BufferUsageFlags usage,
+                                          vk::raii::Buffer& buffer,
+                                          vk::raii::DeviceMemory& memory) {
+    vk::BufferCreateInfo bufferInfo;
+    bufferInfo.size = byteSize;
+    bufferInfo.usage = usage;
+    bufferInfo.sharingMode = vk::SharingMode::eExclusive;
+
+    buffer = vk::raii::Buffer(device, bufferInfo);
+
+    vk::MemoryRequirements memRequirements = buffer.getMemoryRequirements();
+
+    vk::MemoryAllocateInfo allocInfo;
+    allocInfo.allocationSize = memRequirements.size;
+    allocInfo.memoryTypeIndex = renderer->FindMemoryType(
+      memRequirements.memoryTypeBits,
+      vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+
+    memory = vk::raii::DeviceMemory(device, allocInfo);
+    buffer.bindMemory(*memory, 0);
+  };
+
+  try {
+    // Input buffer (mono audio)
+    createHostBuffer(sampleCount * sizeof(float),
+                     vk::BufferUsageFlagBits::eStorageBuffer,
+                     inputBuffer,
+                     inputBufferMemory);
+
+    // Output buffer (stereo audio: 2 floats per frame)
+    createHostBuffer(sampleCount * 2 * sizeof(float),
+                     vk::BufferUsageFlagBits::eStorageBuffer,
+                     outputBuffer,
+                     outputBufferMemory);
+
+    // HRTF coefficient buffer
+    const vk::DeviceSize hrtfBytes = hrtfData.size() * sizeof(float);
+    createHostBuffer(hrtfBytes,
+                     vk::BufferUsageFlagBits::eStorageBuffer,
+                     hrtfBuffer,
+                     hrtfBufferMemory);
+
+    // Copy HRTF data to buffer
+    void* hrtfMappedMemory = hrtfBufferMemory.mapMemory(0, hrtfBytes);
+    memcpy(hrtfMappedMemory, hrtfData.data(), hrtfData.size() * sizeof(float));
+    hrtfBufferMemory.unmapMemory();
+
+    // Parameters buffer - sized for the padded GPU-side structure (112 bytes).
+    // This local struct exists only to provide sizeof(); its fields are not
+    // accessed in this function.
+    struct alignas(16) GPUHRTFParams {
+      float listenerPosition[4]; // vec3 + padding (16 bytes)
+      float listenerForward[4]; // vec3 + padding (16 bytes)
+      float listenerUp[4]; // vec3 + padding (16 bytes)
+      float sourcePosition[4]; // vec3 + padding (16 bytes)
+      float sampleCount; // float (4 bytes)
+      float padding1[3]; // Padding to align to 16-byte boundary
+      uint32_t inputChannels; // uint (4 bytes)
+      uint32_t outputChannels; // uint (4 bytes)
+      uint32_t hrtfSize; // uint (4 bytes)
+      uint32_t numHrtfPositions; // uint (4 bytes)
+      float distanceAttenuation; // float (4 bytes)
+      float dopplerFactor; // float (4 bytes)
+      float reverbMix; // float (4 bytes)
+      float padding2; // Padding to complete 16-byte alignment
+    };
+
+    createHostBuffer(sizeof(GPUHRTFParams),
+                     vk::BufferUsageFlagBits::eUniformBuffer,
+                     paramsBuffer,
+                     paramsBufferMemory);
+
+    // Set up persistent memory mapping for parameters buffer to avoid repeated map/unmap operations
+    persistentParamsMemory = paramsBufferMemory.mapMemory(0, sizeof(GPUHRTFParams));
+
+    // Update current sample count to track buffer size
+    currentSampleCount = sampleCount;
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Error creating HRTF buffers: " << e.what() << std::endl;
+    cleanupHRTFBuffers();
+    return false;
+  }
+}
+
+// Release every GPU resource created by createHRTFBuffers() and reset the
+// bookkeeping so the next call to createHRTFBuffers() rebuilds from scratch.
+void AudioSystem::cleanupHRTFBuffers() {
+  // Unmap the persistent parameter mapping while the memory object still exists
+  if (persistentParamsMemory && *paramsBufferMemory) {
+    paramsBufferMemory.unmapMemory();
+  }
+  // Always drop the cached pointer: once the memory below is released it
+  // must never be written through again, whether or not it was unmapped here.
+  persistentParamsMemory = nullptr;
+
+  // With RAII, we just need to set the resources to nullptr
+  // The destructors will handle the cleanup
+  inputBuffer = nullptr;
+  inputBufferMemory = nullptr;
+  outputBuffer = nullptr;
+  outputBufferMemory = nullptr;
+  hrtfBuffer = nullptr;
+  hrtfBufferMemory = nullptr;
+  paramsBuffer = nullptr;
+  paramsBufferMemory = nullptr;
+
+  // Reset sample count tracking
+  currentSampleCount = 0;
+}
+
+// Threading implementation methods
+
+// Launch the background audio worker running audioThreadLoop().
+// Does nothing when the worker is already flagged as running.
+// Propagates any exception thrown while creating the thread.
+void AudioSystem::startAudioThread() {
+  if (audioThreadRunning.load()) {
+    return; // Thread already running
+  }
+
+  audioThreadShouldStop.store(false);
+  audioThreadRunning.store(true);
+
+  try {
+    audioThread = std::thread(&AudioSystem::audioThreadLoop, this);
+  } catch (...) {
+    // Thread creation failed: without this reset submitAudioTask() would keep
+    // queueing work for a worker that does not exist.
+    audioThreadRunning.store(false);
+    throw;
+  }
+}
+
+// Ask the background audio worker to exit and block until it has finished.
+// Does nothing when the worker is not flagged as running.
+void AudioSystem::stopAudioThread() {
+  if (!audioThreadRunning.load()) {
+    return; // Thread not running
+  }
+
+  // Signal the thread to stop. The flag is part of the condition-variable
+  // predicate, so it must be changed while holding the same mutex the waiter
+  // uses; otherwise the worker can test the predicate, miss this store and the
+  // notification below, and then sleep forever while join() waits on it.
+  {
+    std::lock_guard<std::mutex> lock(taskQueueMutex);
+    audioThreadShouldStop.store(true);
+  }
+
+  // Wake up the thread if it's waiting
+  audioCondition.notify_all();
+
+  // Wait for the thread to finish
+  if (audioThread.joinable()) {
+    audioThread.join();
+  }
+
+  audioThreadRunning.store(false);
+}
+
+// Body of the background audio worker: sleep until a task is queued or a stop
+// is requested, then process tasks one at a time outside the queue lock.
+void AudioSystem::audioThreadLoop() {
+  for (;;) {
+    if (audioThreadShouldStop.load()) {
+      return;
+    }
+
+    std::shared_ptr<AudioTask> pending;
+    {
+      std::unique_lock<std::mutex> guard(taskQueueMutex);
+      audioCondition.wait(guard, [this] {
+        return audioThreadShouldStop.load() || !audioTaskQueue.empty();
+      });
+
+      // A stop request wins over any work still sitting in the queue
+      if (audioThreadShouldStop.load()) {
+        return;
+      }
+
+      if (!audioTaskQueue.empty()) {
+        pending = std::move(audioTaskQueue.front());
+        audioTaskQueue.pop();
+      }
+    }
+
+    // The lock is released here so producers are not blocked during processing
+    if (pending) {
+      processAudioTask(pending);
+    }
+  }
+}
+
+// Run HRTF processing for one queued task and, if the task's output device is
+// playing, send the non-history part of the stereo result to it.
+//
+// NOTE(review): ProcessHRTF rethrows on GPU failure; this function runs on the
+// audio worker thread, so an exception escaping from here would terminate the
+// process - confirm that is intended.
+void AudioSystem::processAudioTask(const std::shared_ptr<AudioTask>& task) {
+  const bool processed = ProcessHRTF(task->inputBuffer.data(),
+                                     task->outputBuffer.data(),
+                                     task->sampleCount,
+                                     task->sourcePosition);
+
+  if (!processed || !task->outputDevice || !task->outputDevice->IsPlaying()) {
+    return;
+  }
+
+  // The input was history + new frames, so skip the first trimFront stereo
+  // frames and emit at most actualSamplesProcessed frames after them.
+  const auto totalFloats = task->outputBuffer.size();
+
+  uint32_t firstFrame = task->trimFront;
+  if (firstFrame * 2 > totalFloats) {
+    firstFrame = 0; // safety
+  }
+
+  uint32_t frameCount = task->actualSamplesProcessed;
+  if (firstFrame * 2 + frameCount * 2 > totalFloats) {
+    frameCount = static_cast<uint32_t>((totalFloats / 2) - firstFrame);
+  }
+
+  float* const rangeBegin = task->outputBuffer.data() + firstFrame * 2;
+  float* const rangeEnd = rangeBegin + frameCount * 2;
+
+  // Master volume is applied only to the samples that will be written
+  for (float* sample = rangeBegin; sample != rangeEnd; ++sample) {
+    *sample *= task->masterVolume;
+  }
+
+  // Hand the processed audio to the output device from this background thread
+  const bool written = task->outputDevice->WriteAudio(rangeBegin, frameCount);
+  if (!written) {
+    std::cerr << "Failed to write audio data to output device from background thread" << std::endl;
+  }
+}
+
+// Submit one block of mono audio for HRTF processing and playback.
+//
+// inputBuffer:            sampleCount mono samples (history frames followed by new frames).
+// sampleCount:            total frames in inputBuffer, including history.
+// sourcePosition:         3 floats, position of the sound source.
+// actualSamplesProcessed: number of new (non-history) frames to play.
+// trimFront:              accepted for interface compatibility; the history
+//                         length is always derived as sampleCount - actualSamplesProcessed.
+//
+// When the worker thread is running the task is queued and true is returned
+// immediately. Otherwise the block is processed synchronously and the result
+// reflects ProcessHRTF / WriteAudio success.
+bool AudioSystem::submitAudioTask(const float* inputBuffer,
+                                  uint32_t sampleCount,
+                                  const float* sourcePosition,
+                                  uint32_t actualSamplesProcessed,
+                                  uint32_t trimFront) {
+  (void) trimFront;
+
+  // Clamp so the subtraction below cannot wrap when the caller reports more
+  // new frames than the buffer contains.
+  const uint32_t newFrames = std::min(actualSamplesProcessed, sampleCount);
+  const uint32_t historyFrames = sampleCount - newFrames;
+
+  if (!audioThreadRunning.load()) {
+    // Fallback to synchronous processing if the thread is not running
+    std::vector<float> outputBuffer(sampleCount * 2);
+    bool success = ProcessHRTF(inputBuffer, outputBuffer.data(), sampleCount, sourcePosition);
+
+    if (success && outputDevice && outputDevice->IsPlaying()) {
+      // Skip the history frames exactly like the background path does, so the
+      // already-played history is not sent to the device a second time.
+      float* startPtr = outputBuffer.data() + static_cast<size_t>(historyFrames) * 2;
+
+      // Apply master volume only to the range we will write
+      for (uint32_t i = 0; i < newFrames * 2; i++) {
+        startPtr[i] *= masterVolume;
+      }
+
+      // Send to audio output device
+      if (!outputDevice->WriteAudio(startPtr, newFrames)) {
+        std::cerr << "Failed to write audio data to output device" << std::endl;
+        return false;
+      }
+    }
+    return success;
+  }
+
+  // Create a new task for asynchronous processing
+  auto task = std::make_shared<AudioTask>();
+  task->inputBuffer.assign(inputBuffer, inputBuffer + sampleCount);
+  task->outputBuffer.resize(sampleCount * 2); // Stereo output
+  memcpy(task->sourcePosition, sourcePosition, sizeof(float) * 3);
+  task->sampleCount = sampleCount; // includes history frames
+  task->actualSamplesProcessed = newFrames; // new frames only
+  task->trimFront = historyFrames; // history length
+  task->outputDevice = outputDevice.get();
+  task->masterVolume = masterVolume;
+
+  // Submit the task to the queue (non-blocking)
+  {
+    std::lock_guard<std::mutex> lock(taskQueueMutex);
+    audioTaskQueue.push(task);
+  }
+  audioCondition.notify_one();
+
+  return true; // Return immediately without waiting
+}
+
+// Discard all queued and in-flight audio: stop the worker, empty the task
+// queue, restart the output device and start the worker again.
+void AudioSystem::FlushOutput() {
+  // The worker must be stopped first so nothing races with the flush
+  stopAudioThread();
+
+  // Drop every pending task by swapping the queue with an empty temporary
+  {
+    std::lock_guard<std::mutex> guard(taskQueueMutex);
+    std::queue<std::shared_ptr<AudioTask>>().swap(audioTaskQueue);
+  }
+
+  // Restarting the device flushes its own buffers and queues
+  if (outputDevice) {
+    outputDevice->Stop();
+    outputDevice->Start();
+  }
+
+  // Resume background processing
+  startAudioThread();
+}
\ No newline at end of file
diff --git a/attachments/advanced_gltf/engine.cpp b/attachments/advanced_gltf/engine.cpp
new file mode 100644
index 000000000..1dc47daa1
--- /dev/null
+++ b/attachments/advanced_gltf/engine.cpp
@@ -0,0 +1,1084 @@
+/* Copyright (c) 2025 Holochip Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <algorithm>
+#include <chrono>
+#include <iostream>
+#include <random>
+#include <ranges>
+#include <stdexcept>
+#include <shared_mutex>
+#include <mutex>
+#include <vector>
+#include <string>
+#include <memory>
+#include <map>
+#include <unordered_map>
+
+
+#include "engine.h"
+#include "renderer.h"
+
+#include "mesh_component.h"
+#include "scene_loading.h"
+#include "renderer_advanced_types.h"
+
+// This implementation corresponds to the Engine_Architecture chapter in the tutorial:
+// @see en/Building_a_Simple_Engine/Engine_Architecture/02_architectural_patterns.adoc
+
+// Construct an engine shell. Only the resource manager is created here; the
+// remaining subsystems are created by Initialize().
+Engine::Engine()
+  : resourceManager(std::make_unique<ResourceManager>()) {}
+
+// Report whether the caller is running on the thread recorded in mainThreadId.
+bool Engine::IsMainThread() const {
+  const std::thread::id callerId = std::this_thread::get_id();
+  return callerId == mainThreadId;
+}
+
+// Apply entity removals that were deferred by background threads.
+// Only the main thread performs the removals; on any other thread the queue
+// is left untouched so the next main-thread tick can process it.
+void Engine::ProcessPendingEntityRemovals() {
+  // Check the thread first: this avoids draining the queue only to copy every
+  // name back into it (which could also reorder entries queued meanwhile).
+  if (!IsMainThread()) {
+    return;
+  }
+
+  std::vector<std::string> names;
+  {
+    std::lock_guard<std::mutex> lk(pendingEntityRemovalsMutex);
+    if (pendingEntityRemovalNames.empty()) {
+      return;
+    }
+    names.swap(pendingEntityRemovalNames);
+  }
+
+  // Apply removals using the normal API (which takes the appropriate locks).
+  // The queue mutex is no longer held here.
+  for (const auto& name : names) {
+    (void) RemoveEntity(name);
+  }
+}
+
+// Destroying the engine releases everything via Cleanup().
+Engine::~Engine()
+{
+  Cleanup();
+}
+
+// Create the platform window, the renderer and all engine subsystems.
+//
+// appName:                application name passed to the platform and renderer.
+// width, height:          initial window size, also passed to the ImGui system.
+// enableValidationLayers: forwarded to Renderer::Initialize.
+//
+// Returns true when everything was created; false if the platform or renderer
+// failed to initialize or a subsystem constructor threw. On Android builds
+// this always returns false (initialization happens elsewhere).
+bool Engine::Initialize(const std::string& appName, int width, int height, bool enableValidationLayers) {
+  // Create platform
+#if defined(PLATFORM_ANDROID)
+  // For Android, the platform is created with the android_app
+  // This will be handled in the android_main function
+  return false;
+#else
+  // Record main thread identity for deferring destructive operations from background threads
+  mainThreadId = std::this_thread::get_id();
+
+  platform = CreatePlatform();
+  // Guard against a factory that yields no platform before dereferencing it
+  if (!platform || !platform->Initialize(appName, width, height)) {
+    return false;
+  }
+
+  // Set resize callback (parameter names chosen so they do not shadow width/height)
+  platform->SetResizeCallback([this](int newWidth, int newHeight) {
+    HandleResize(newWidth, newHeight);
+  });
+
+  // Set mouse callback
+  platform->SetMouseCallback([this](float x, float y, uint32_t buttons) {
+    handleMouseInput(x, y, buttons);
+  });
+
+  // Set keyboard callback
+  platform->SetKeyboardCallback([this](uint32_t key, bool pressed) {
+    handleKeyInput(key, pressed);
+  });
+
+  // Set char callback; ImGui may not exist yet when the first events arrive
+  platform->SetCharCallback([this](uint32_t c) {
+    if (imguiSystem) {
+      imguiSystem->HandleChar(c);
+    }
+  });
+
+  // Create renderer
+  renderer = std::make_unique<Renderer>(platform.get());
+  if (!renderer->Initialize(appName, enableValidationLayers)) {
+    return false;
+  }
+
+  try {
+    // Model loader via constructor; also wire into renderer
+    modelLoader = std::make_unique<ModelLoader>(renderer.get());
+    renderer->SetModelLoader(modelLoader.get());
+
+    // Audio system via constructor
+    audioSystem = std::make_unique<AudioSystem>(this, renderer.get());
+
+    // Physics system via constructor (GPU enabled)
+    physicsSystem = std::make_unique<PhysicsSystem>(renderer.get(), true);
+
+    // ImGui via constructor, then connect audio system
+    imguiSystem = std::make_unique<ImGuiSystem>(renderer.get(), width, height);
+    imguiSystem->SetAudioSystem(audioSystem.get());
+  } catch (const std::exception& e) {
+    std::cerr << "Subsystem initialization failed: " << e.what() << std::endl;
+    return false;
+  }
+
+  // Generate ball material properties once at load time
+  GenerateBallMaterial();
+
+  // Initialize physics scaling system
+  InitializePhysicsScaling();
+
+  initialized = true;
+  return true;
+#endif
+}
+
+// Run the main loop until the platform requests shutdown or `running` is
+// cleared. Starts the physics thread, then each iteration pumps events,
+// measures frame time, refreshes the window title once per second and calls
+// Update() and Render().
+// Throws std::runtime_error when the engine has not been initialized.
+void Engine::Run() {
+  if (!initialized) {
+    throw std::runtime_error("Engine not initialized");
+  }
+
+  running = true;
+
+  // Physics runs on its own thread for the lifetime of the loop
+  physicsThreadRunning.store(true, std::memory_order_relaxed);
+  physicsThread = std::thread(&Engine::PhysicsThreadFunc, this);
+
+  while (running) {
+    // Pump platform events; a false result means the window is closing
+    const bool keepGoing = platform->ProcessEvents();
+    if (!keepGoing) {
+      running = false;
+      break;
+    }
+
+    // Frame timing and FPS bookkeeping
+    deltaTimeMs = CalculateDeltaTimeMs();
+    frameCount++;
+    fpsUpdateTimer += deltaTimeMs.count() * 0.001f;
+
+    // Refresh the window title roughly once per second
+    if (fpsUpdateTimer >= 1.0f) {
+      const uint64_t elapsedFrames = frameCount - lastFPSUpdateFrame;
+      const bool haveSamples = elapsedFrames > 0 && fpsUpdateTimer > 0.0f;
+
+      double averageMs = 0.0;
+      if (!haveSamples) {
+        // Avoid divide-by-zero; keep previous FPS and estimate from last delta
+        currentFPS = std::max(currentFPS, 1.0f);
+        averageMs = static_cast<double>(deltaTimeMs.count());
+      } else {
+        currentFPS = static_cast<float>(static_cast<double>(elapsedFrames) / static_cast<double>(fpsUpdateTimer));
+        averageMs = (fpsUpdateTimer / static_cast<double>(elapsedFrames)) * 1000.0;
+      }
+
+      // Title shows the frame counter, whole FPS and whole milliseconds
+      std::string windowTitle = "Simple Engine - Frame: ";
+      windowTitle += std::to_string(frameCount);
+      windowTitle += " | FPS: ";
+      windowTitle += std::to_string(static_cast<int>(currentFPS));
+      windowTitle += " | ms: ";
+      windowTitle += std::to_string(static_cast<int>(averageMs));
+      platform->SetWindowTitle(windowTitle);
+
+      // Start the next measurement window
+      fpsUpdateTimer = 0.0f;
+      lastFPSUpdateFrame = frameCount;
+    }
+
+    Update(deltaTimeMs);
+    Render();
+  }
+}
+
+// Shut the engine down: stop the physics thread, wait for the GPU, drop all
+// entities and destroy the subsystems. Does nothing unless the engine is
+// currently initialized.
+void Engine::Cleanup() {
+  if (!initialized) {
+    return;
+  }
+
+  // Stop the physics thread (exchange clears the flag and reports whether it was set)
+  if (physicsThreadRunning.exchange(false, std::memory_order_relaxed)) {
+    if (physicsThread.joinable()) {
+      physicsThread.join();
+    }
+  }
+
+  // The device must be idle before anything it uses is destroyed
+  if (renderer) {
+    renderer->WaitIdle();
+  }
+
+  // Drop all entities under the exclusive lock
+  {
+    std::unique_lock<std::shared_mutex> exclusive(entitiesMutex);
+    entities.clear();
+    entityMap.clear();
+  }
+
+  // Subsystems go away in reverse order of creation
+  imguiSystem.reset();
+  physicsSystem.reset();
+  audioSystem.reset();
+  modelLoader.reset();
+  renderer.reset();
+  platform.reset();
+
+  initialized = false;
+}
+
+// Create a new entity called `name`, take ownership of it and return a
+// non-owning pointer. Duplicate names are allowed; the name map is updated to
+// refer to the most recently created entity with that name.
+Entity* Engine::CreateEntity(const std::string& name) {
+  std::unique_lock<std::shared_mutex> exclusive(entitiesMutex);
+
+  // Construct directly into the owning vector and keep the raw pointer
+  Entity* const created = entities.emplace_back(std::make_unique<Entity>(name)).get();
+
+  // The newest entity becomes the representative for this name
+  entityMap[name] = created;
+  return created;
+}
+
+// Look up the entity currently registered under `name` in the name map.
+// Returns nullptr when no entry exists.
+Entity* Engine::GetEntity(const std::string& name) {
+  std::shared_lock<std::shared_mutex> readLock(entitiesMutex);
+  const auto found = entityMap.find(name);
+  return found == entityMap.end() ? nullptr : found->second;
+}
+
+// Remove the given entity.
+//
+// On a background thread the removal is deferred: the entity's name is queued
+// and true is returned (the removal is later applied by name on the main
+// thread). On the main thread the entity is erased immediately.
+//
+// Returns false for a null pointer or when the pointer is not owned by this
+// engine; true otherwise.
+bool Engine::RemoveEntity(Entity* entity) {
+  if (!entity) {
+    return false;
+  }
+
+  // If called from a background thread, defer removal to avoid deleting entities
+  // while the render thread may be iterating a snapshot.
+  if (!IsMainThread()) {
+    std::lock_guard<std::mutex> lk(pendingEntityRemovalsMutex);
+    pendingEntityRemovalNames.push_back(entity->GetName());
+    return true;
+  }
+
+  std::unique_lock<std::shared_mutex> lk(entitiesMutex);
+
+  // Find the entity in the vector by identity. The pointer itself is not
+  // dereferenced until we know we own it, so a stale pointer is rejected
+  // without touching freed memory.
+  auto it = std::ranges::find_if(entities,
+                                 [entity](const std::unique_ptr<Entity>& e) {
+                                   return e.get() == entity;
+                                 });
+  if (it == entities.end()) {
+    return false;
+  }
+
+  // Copy the name before erasing ownership (the entity is destroyed by erase)
+  const std::string name = (*it)->GetName();
+
+  // Remove from the vector (ownership)
+  entities.erase(it);
+
+  // Update the map: point to another entity with the same name if one exists
+  auto remainingIt = std::ranges::find_if(entities,
+                                          [&name](const std::unique_ptr<Entity>& e) {
+                                            return e && e->GetName() == name;
+                                          });
+
+  if (remainingIt != entities.end()) {
+    entityMap[name] = remainingIt->get();
+  } else {
+    entityMap.erase(name);
+  }
+
+  return true;
+}
+
+// Remove the entity currently registered under `name`.
+//
+// On a background thread the name is queued for later removal and true is
+// returned. On the main thread the entity the name map points at is erased
+// and the map is re-pointed at another entity with the same name, if any.
+//
+// Returns false when the name is unknown or the mapped entity is not owned.
+bool Engine::RemoveEntity(const std::string& name) {
+  // If called from a background thread, defer removal to avoid deleting entities
+  // while the render thread may be iterating a snapshot.
+  if (!IsMainThread()) {
+    std::lock_guard<std::mutex> lk(pendingEntityRemovalsMutex);
+    pendingEntityRemovalNames.push_back(name);
+    return true;
+  }
+
+  std::unique_lock<std::shared_mutex> lk(entitiesMutex);
+  auto it = entityMap.find(name);
+  if (it == entityMap.end())
+    return false;
+  Entity* entity = it->second;
+  if (!entity)
+    return false;
+
+  // Find the entity in the vector
+  auto vecIt = std::ranges::find_if(entities,
+                                    [entity](const std::unique_ptr<Entity>& e) {
+                                      return e.get() == entity;
+                                    });
+  if (vecIt == entities.end()) {
+    entityMap.erase(name);
+    return false;
+  }
+
+  // `name` may refer to storage owned by the entity being removed (e.g. a
+  // caller passing the entity's own name), so take a copy before destroying it.
+  const std::string removedName = name;
+
+  entities.erase(vecIt);
+
+  // Update the map: point to another entity with the same name if one exists
+  auto remainingIt = std::ranges::find_if(entities,
+                                          [&removedName](const std::unique_ptr<Entity>& e) {
+                                            return e && e->GetName() == removedName;
+                                          });
+  if (remainingIt != entities.end()) {
+    entityMap[removedName] = remainingIt->get();
+  } else {
+    entityMap.erase(removedName);
+  }
+  return true;
+}
+
+// Store the given camera component pointer as the active camera.
+void Engine::SetActiveCamera(CameraComponent* cameraComponent) {
+  activeCamera = cameraComponent;
+}
+
+// Return the stored active camera pointer (whatever was last set).
+const CameraComponent* Engine::GetActiveCamera() const {
+  return activeCamera;
+}
+
+// Return a non-owning pointer to the resource manager created in the constructor.
+const ResourceManager* Engine::GetResourceManager() const {
+  return resourceManager.get();
+}
+
+// Return a non-owning pointer to the platform; null before Initialize() and after Cleanup().
+const Platform* Engine::GetPlatform() const {
+  return platform.get();
+}
+
+// Return a non-owning pointer to the renderer; null before Initialize() and after Cleanup().
+Renderer* Engine::GetRenderer() {
+  return renderer.get();
+}
+
+// Return a non-owning pointer to the model loader; null before Initialize() and after Cleanup().
+ModelLoader* Engine::GetModelLoader() {
+  return modelLoader.get();
+}
+
+// Return a non-owning pointer to the audio system; null before Initialize() and after Cleanup().
+const AudioSystem* Engine::GetAudioSystem() const {
+  return audioSystem.get();
+}
+
+// Return a non-owning pointer to the physics system; null before Initialize() and after Cleanup().
+PhysicsSystem* Engine::GetPhysicsSystem() {
+  return physicsSystem.get();
+}
+
+// Return a non-owning pointer to the ImGui system; null before Initialize() and after Cleanup().
+const ImGuiSystem* Engine::GetImGuiSystem() const {
+  return imguiSystem.get();
+}
+
+// Handle one mouse event.
+//
+// x, y:    cursor position reported by the platform.
+// buttons: button bitmask (bit 0 = left, bit 1 = right).
+//
+// Unless ImGui wants the mouse, a fresh right-button press throws a ball and
+// a held left button rotates the camera. The event is always forwarded to
+// ImGui (when present) and to hover detection.
+void Engine::handleMouseInput(float x, float y, uint32_t buttons) {
+  // Ask ImGui first whether it wants to capture the mouse
+  const bool imguiWantsMouse = imguiSystem && imguiSystem->WantCaptureMouse();
+
+  // Right-click is suppressed while the renderer reports it is loading
+  if (renderer && renderer->IsLoading()) {
+    buttons &= ~2u; // clear right button bit
+  }
+
+  if (!imguiWantsMouse) {
+    // Right button (bit 1): throw a ball once per press
+    const bool rightDown = (buttons & 2) != 0;
+    if (!rightDown) {
+      cameraControl.mouseRightPressed = false;
+    } else if (!cameraControl.mouseRightPressed) {
+      cameraControl.mouseRightPressed = true;
+      ThrowBall(x, y);
+    }
+
+    // Left button (bit 0): mouse-look while held
+    const bool leftDown = (buttons & 1) != 0;
+    if (!leftDown) {
+      cameraControl.mouseLeftPressed = false;
+    } else {
+      if (!cameraControl.mouseLeftPressed) {
+        cameraControl.mouseLeftPressed = true;
+        cameraControl.firstMouse = true;
+      }
+
+      // On the first sample of a drag, anchor to the current position so the
+      // view does not jump
+      if (cameraControl.firstMouse) {
+        cameraControl.lastMouseX = x;
+        cameraControl.lastMouseY = y;
+        cameraControl.firstMouse = false;
+      }
+
+      float deltaX = x - cameraControl.lastMouseX;
+      float deltaY = y - cameraControl.lastMouseY;
+      cameraControl.lastMouseX = x;
+      cameraControl.lastMouseY = y;
+
+      deltaX *= cameraControl.mouseSensitivity;
+      deltaY *= cameraControl.mouseSensitivity;
+
+      // Mouse look: positive X moves view to the right; positive Y moves view up.
+      // Platform mouse coordinates increase downward, so invert Y.
+      cameraControl.yaw -= deltaX;
+      cameraControl.pitch -= deltaY;
+
+      // Keep pitch within +/-89 degrees to avoid gimbal lock
+      if (cameraControl.pitch > 89.0f) {
+        cameraControl.pitch = 89.0f;
+      }
+      if (cameraControl.pitch < -89.0f) {
+        cameraControl.pitch = -89.0f;
+      }
+    }
+  }
+
+  if (imguiSystem) {
+    imguiSystem->HandleMouse(x, y, buttons);
+  }
+
+  // Hover detection always runs, even when ImGui is active
+  HandleMouseHover(x, y);
+}
+
+// Body of the physics thread: step the physics system with the elapsed
+// wall-clock time (in whole milliseconds) until physicsThreadRunning is
+// cleared, sleeping between steps.
+void Engine::PhysicsThreadFunc() {
+  auto lastTime = std::chrono::steady_clock::now();
+  while (physicsThreadRunning.load(std::memory_order_relaxed)) {
+    auto currentTime = std::chrono::steady_clock::now();
+    auto deltaTime = std::chrono::duration_cast<std::chrono::milliseconds>(currentTime - lastTime);
+
+    // Advance only by the whole milliseconds actually handed to the
+    // simulation. The truncated sub-millisecond remainder stays in
+    // (currentTime - lastTime) and is picked up by the next iteration, so
+    // simulated time does not fall behind real time.
+    lastTime += deltaTime;
+
+    if (deltaTime.count() > 0) {
+      if (physicsSystem) {
+        physicsSystem->Update(deltaTime);
+      }
+    }
+
+    // Cap physics to a reasonable rate (8 ms sleep, i.e. at most ~125Hz) to
+    // avoid 100% CPU usage and provide a stable simulation environment.
+    std::this_thread::sleep_for(std::chrono::milliseconds(8));
+  }
+}
+// Handle one keyboard event: update the camera movement flags for the
+// WASD/arrow/Q/E/PageUp/PageDown keys and forward the event to ImGui.
+// On Android builds the arguments are ignored.
+void Engine::handleKeyInput(uint32_t key, bool pressed) {
+#if defined(PLATFORM_ANDROID)
+  // Android uses different input handling via touch events
+  (void) key;
+  (void) pressed;
+#else
+  if (key == GLFW_KEY_W || key == GLFW_KEY_UP) {
+    cameraControl.moveForward = pressed;
+  } else if (key == GLFW_KEY_S || key == GLFW_KEY_DOWN) {
+    cameraControl.moveBackward = pressed;
+  } else if (key == GLFW_KEY_A || key == GLFW_KEY_LEFT) {
+    cameraControl.moveLeft = pressed;
+  } else if (key == GLFW_KEY_D || key == GLFW_KEY_RIGHT) {
+    cameraControl.moveRight = pressed;
+  } else if (key == GLFW_KEY_Q || key == GLFW_KEY_PAGE_UP) {
+    cameraControl.moveUp = pressed;
+  } else if (key == GLFW_KEY_E || key == GLFW_KEY_PAGE_DOWN) {
+    cameraControl.moveDown = pressed;
+  }
+
+  // Every key event is also forwarded to ImGui when it exists
+  if (imguiSystem) {
+    imguiSystem->HandleKeyboard(key, pressed);
+  }
+#endif
+}
+
+// Advances one frame of engine state.
+//
+// Order of work: deferred entity removals, (early-out while the renderer is
+// loading), queued ball creation, camera position hand-off to physics, audio,
+// ImGui frame start, camera controls, and finally per-entity updates.
+// Subsystem pointers are checked before use so a missing subsystem skips its
+// step instead of dereferencing null.
+void Engine::Update(TimeDelta deltaTime) {
+  // Apply any entity removals requested by background threads.
+  ProcessPendingEntityRemovals();
+
+  // During background scene loading we avoid touching the live entity
+  // list from the main thread. This lets the loading thread construct
+  // entities/components safely while the main thread only drives the
+  // UI/loading overlay.
+  if (renderer && renderer->IsLoading()) {
+    if (imguiSystem) {
+      imguiSystem->NewFrame();
+    }
+    return;
+  }
+
+  // Process pending ball creations (outside rendering loop to avoid memory pool constraints)
+  ProcessPendingBalls();
+
+  // Physics itself runs on a separate thread (see PhysicsThreadFunc); here we
+  // only publish the current camera position to it.
+  if (activeCamera && physicsSystem) {
+    glm::vec3 currentCameraPosition = activeCamera->GetPosition();
+    physicsSystem->SetCameraPosition(currentCameraPosition);
+  }
+
+  // Update audio system
+  if (audioSystem) {
+    audioSystem->Update(deltaTime);
+  }
+
+  // Start the ImGui frame (guarded the same way as in the loading branch above)
+  if (imguiSystem) {
+    imguiSystem->NewFrame();
+  }
+
+  // Update camera controls
+  if (activeCamera) {
+    UpdateCameraControls(deltaTime);
+  }
+
+  // Update all entities.
+  // Do not hold `entitiesMutex` while calling `Entity::Update()`.
+  // Background threads may need the unique lock to add entities during loading,
+  // and holding a shared lock for a long time can starve them.
+  std::vector<Entity*> snapshot;
+  {
+    std::shared_lock<std::shared_mutex> lk(entitiesMutex);
+    snapshot.reserve(entities.size());
+    for (auto& uptr : entities) {
+      snapshot.push_back(uptr.get());
+    }
+  }
+  for (Entity* entity : snapshot) {
+    if (!entity || !entity->IsActive())
+      continue;
+    entity->Update(deltaTime);
+  }
+}
+
+// Draws one frame. Does nothing unless the renderer exists and reports itself
+// initialized and an active camera is set.
+void Engine::Render() {
+  const bool rendererReady = renderer && renderer->IsInitialized();
+  if (!rendererReady || !activeCamera) {
+    return;
+  }
+
+  // Flush removals queued by background threads before we capture pointers,
+  // so the snapshot below is taken after those removals are applied.
+  ProcessPendingEntityRemovals();
+
+  // Copy the raw entity pointers while holding the shared lock only briefly;
+  // rendering then proceeds without the lock so threads that need the unique
+  // lock (loader/physics creating entities or components) are not starved.
+  std::vector<Entity*> visibleEntities;
+  {
+    std::shared_lock<std::shared_mutex> guard(entitiesMutex);
+    visibleEntities.reserve(entities.size());
+    for (const auto& owned : entities) {
+      visibleEntities.push_back(owned.get());
+    }
+  }
+
+  // The ImGui system is passed to the renderer together with the scene.
+  renderer->Render(visibleEntities, activeCamera, imguiSystem.get());
+}
+
+// Returns the whole milliseconds elapsed since the previous call, measured on
+// std::chrono::steady_clock. The very first call (detected by
+// `lastFrameTimeMs == 0`) has no previous sample and returns a 16 ms estimate.
+std::chrono::milliseconds Engine::CalculateDeltaTimeMs() {
+  using std::chrono::milliseconds;
+
+  // steady_clock is monotonic, so wall-clock adjustments cannot produce jumps.
+  const auto sinceEpoch = std::chrono::steady_clock::now().time_since_epoch();
+  const uint64_t nowMs =
+    static_cast<uint64_t>(std::chrono::duration_cast<milliseconds>(sinceEpoch).count());
+
+  if (lastFrameTimeMs == 0) {
+    // First frame: just record the timestamp and report a nominal frame time.
+    lastFrameTimeMs = nowMs;
+    return milliseconds(16);
+  }
+
+  const uint64_t elapsedMs = nowMs - lastFrameTimeMs;
+  lastFrameTimeMs = nowMs;
+  return milliseconds(static_cast<long long>(elapsedMs));
+}
+
+// Propagates a new framebuffer size to the camera, the renderer and ImGui.
+// Sizes with a non-positive dimension are ignored.
+void Engine::HandleResize(int width, int height) const {
+  const bool hasArea = width > 0 && height > 0;
+  if (!hasArea) {
+    return;
+  }
+
+  // Keep the active camera's aspect ratio in sync with the new size.
+  if (activeCamera) {
+    const float aspect = static_cast<float>(width) / static_cast<float>(height);
+    activeCamera->SetAspectRatio(aspect);
+  }
+
+  // Tell the renderer that the framebuffer size changed.
+  if (renderer) {
+    renderer->SetFramebufferResized();
+  }
+
+  // ImGui receives the new size as unsigned values.
+  if (imguiSystem) {
+    const auto uiWidth = static_cast<uint32_t>(width);
+    const auto uiHeight = static_cast<uint32_t>(height);
+    imguiSystem->HandleResize(uiWidth, uiHeight);
+  }
+}
+
+// Moves and orients the active camera for this frame.
+//
+// Two modes:
+//  * Tracking (when ImGui reports camera tracking enabled and an active ball
+//    with a TransformComponent exists): the camera is placed at a fixed
+//    offset from the ball and aimed at it; manual input is skipped.
+//  * Manual: yaw/pitch accumulated from the mouse are applied on top of the
+//    orientation captured from the camera transform the first time this runs,
+//    and the movement flags translate the camera along its local axes.
+//
+// `deltaTime` is in milliseconds; movement speed is scaled by it.
+void Engine::UpdateCameraControls(TimeDelta deltaTime) {
+  if (!activeCamera)
+    return;
+
+  // The camera component may not be attached to an entity; bail out rather
+  // than dereferencing a null owner.
+  auto* cameraOwner = activeCamera->GetOwner();
+  if (!cameraOwner)
+    return;
+
+  // Get a camera transform component
+  auto* cameraTransform = cameraOwner->GetComponent<TransformComponent>();
+  if (!cameraTransform)
+    return;
+
+  // Check if camera tracking is enabled
+  if (imguiSystem && imguiSystem->IsCameraTrackingEnabled()) {
+    // Use the last spawned ball for tracking if it's still active
+    // NOTE(review): g_lastSpawnedBall is a raw pointer defined outside this
+    // function; confirm it is cleared when that entity is removed.
+    Entity* ballEntity = (g_lastSpawnedBall && g_lastSpawnedBall->IsActive()) ? g_lastSpawnedBall : nullptr;
+
+    if (!ballEntity) {
+      // Fallback: Find the first active ball entity (only if we lost the tracked one)
+      std::shared_lock<std::shared_mutex> lk(entitiesMutex);
+      auto ballEntityIt = std::ranges::find_if(entities,
+                                               [](auto const& entity) {
+                                                 return entity && entity->IsActive() && (entity->GetName().compare(0, 5, "Ball_") == 0);
+                                               });
+      ballEntity = (ballEntityIt != entities.end()) ? ballEntityIt->get() : nullptr;
+      g_lastSpawnedBall = ballEntity;
+    }
+
+    if (ballEntity) {
+      // Get ball's transform component
+      auto* ballTransform = ballEntity->GetComponent<TransformComponent>();
+      if (ballTransform) {
+        glm::vec3 ballPosition = ballTransform->GetPosition();
+
+        // Position camera at a fixed offset from the ball for good viewing
+        glm::vec3 cameraOffset = glm::vec3(2.0f, 1.5f, 2.0f); // Offset along +X/+Z and above the ball
+        glm::vec3 cameraPosition = ballPosition + cameraOffset;
+
+        // Update camera position and target
+        cameraTransform->SetPosition(cameraPosition);
+        activeCamera->SetTarget(ballPosition);
+
+        return; // Skip manual controls when tracking
+      }
+    }
+  }
+
+  // Manual camera controls (tracking disabled, or no trackable ball found)
+  // Movement for this frame: speed (units/s) * elapsed ms * 0.001
+  float velocity = cameraControl.cameraSpeed * deltaTime.count() * .001f;
+
+  // Capture base orientation from GLTF camera once and then apply mouse deltas relative to it
+  if (!cameraControl.baseOrientationCaptured) {
+    // TransformComponent stores Euler in radians; convert to quaternion
+    glm::vec3 baseEuler = cameraTransform->GetRotation();
+    const glm::quat qx = glm::angleAxis(baseEuler.x, glm::vec3(1.0f, 0.0f, 0.0f));
+    const glm::quat qy = glm::angleAxis(baseEuler.y, glm::vec3(0.0f, 1.0f, 0.0f));
+    const glm::quat qz = glm::angleAxis(baseEuler.z, glm::vec3(0.0f, 0.0f, 1.0f));
+    // Match CameraComponent::UpdateViewMatrix composition (q = qz * qy * qx)
+    cameraControl.baseOrientation = qz * qy * qx;
+    cameraControl.baseOrientationCaptured = true;
+  }
+
+  // Build delta orientation from yaw/pitch mouse deltas (degrees -> radians)
+  const float yawRad = glm::radians(cameraControl.yaw);
+  const float pitchRad = glm::radians(cameraControl.pitch);
+  const glm::quat qDeltaY = glm::angleAxis(yawRad, glm::vec3(0.0f, 1.0f, 0.0f));
+  const glm::quat qDeltaX = glm::angleAxis(pitchRad, glm::vec3(1.0f, 0.0f, 0.0f));
+  // Apply yaw then pitch in the same convention as CameraComponent (ZYX overall), so delta = Ry * Rx
+  glm::quat qDelta = qDeltaY * qDeltaX;
+  glm::quat qFinal = cameraControl.baseOrientation * qDelta;
+
+  // Derive camera basis directly from rotated axes to avoid ambiguity
+  glm::vec3 right = glm::normalize(qFinal * glm::vec3(1.0f, 0.0f, 0.0f));
+  glm::vec3 up = glm::normalize(qFinal * glm::vec3(0.0f, 1.0f, 0.0f));
+  // Camera forward in world space.
+  // Our view/projection conventions assume the camera looks down -Z in its local space.
+  glm::vec3 front = glm::normalize(qFinal * glm::vec3(0.0f, 0.0f, -1.0f));
+
+  // Get the current camera position
+  glm::vec3 position = cameraTransform->GetPosition();
+
+  // Apply movement based on input
+  if (cameraControl.moveForward) {
+    position += front * velocity;
+  }
+  if (cameraControl.moveBackward) {
+    position -= front * velocity;
+  }
+  if (cameraControl.moveLeft) {
+    position -= right * velocity;
+  }
+  if (cameraControl.moveRight) {
+    position += right * velocity;
+  }
+  if (cameraControl.moveUp) {
+    position += up * velocity;
+  }
+  if (cameraControl.moveDown) {
+    position -= up * velocity;
+  }
+
+  // Update camera position
+  cameraTransform->SetPosition(position);
+  // Apply rotation to the camera transform based on GLTF base orientation plus mouse deltas
+  // TransformComponent expects radians Euler (ZYX order in our CameraComponent).
+  cameraTransform->SetRotation(glm::eulerAngles(qFinal));
+
+  // Update camera target based on a direction
+  glm::vec3 target = position + front;
+  activeCamera->SetTarget(target);
+
+  // Ensure the camera view matrix reflects the new transform immediately this frame
+  activeCamera->ForceViewMatrixUpdate();
+}
+
+// Fills `ballMaterial` with randomized PBR parameters (albedo, metallic,
+// roughness, ambient occlusion, emissive) plus a restitution value.
+// The generator is seeded from std::random_device, so results differ per run.
+void Engine::GenerateBallMaterial() {
+  std::random_device seedSource;
+  std::mt19937 rng(seedSource());
+  std::uniform_real_distribution<float> unit(0.0f, 1.0f);
+  // Albedo channels are drawn from [0.6, 1.0) so the colour stays bright.
+  std::uniform_real_distribution<float> bright(0.6f, 1.0f);
+
+  const float albedoR = bright(rng);
+  const float albedoG = bright(rng);
+  const float albedoB = bright(rng);
+  ballMaterial.albedo = glm::vec3(albedoR, albedoG, albedoB);
+
+  // Metallic and roughness each span the unit range.
+  ballMaterial.metallic = unit(rng);
+  ballMaterial.roughness = unit(rng);
+
+  // Ambient occlusion is kept high: 0.8 .. 1.0.
+  ballMaterial.ao = 0.8f + unit(rng) * 0.2f;
+
+  // Emissive is kept subtle: each channel in 0 .. 0.3.
+  const float emissiveR = unit(rng) * 0.3f;
+  const float emissiveG = unit(rng) * 0.3f;
+  const float emissiveB = unit(rng) * 0.3f;
+  ballMaterial.emissive = glm::vec3(emissiveR, emissiveG, emissiveB);
+
+  // Restitution in 0.6 .. 0.9 so bounces are clearly visible.
+  ballMaterial.bounciness = 0.6f + unit(rng) * 0.3f;
+}
+
+// Sets the game-unit/physics conversion factors and pushes the resulting
+// gravity vector into the physics system.
+//
+// Background (from the original tuning notes): an earlier force scale of
+// 200.0f produced ball speeds above 120 m/s and positions far outside the
+// scene, so the factors here are deliberately conservative.
+void Engine::InitializePhysicsScaling() {
+  // One game unit corresponds to 0.1 m (10 cm).
+  physicsScaling.gameUnitsToMeters = 0.1f;
+
+  // Force, time and gravity multipliers are all left at identity, so the
+  // only effective scaling comes from gameUnitsToMeters.
+  physicsScaling.forceScale = 1.0f;
+  physicsScaling.physicsTimeScale = 1.0f;
+  physicsScaling.gravityScale = 1.0f;
+
+  // Convert real-world gravity (9.81 m/s^2 downward) with the factors above
+  // and hand it to the physics system.
+  const glm::vec3 earthGravity(0.0f, -9.81f, 0.0f);
+  glm::vec3 physicsGravity = ScaleGravityForPhysics(earthGravity);
+  physicsSystem->SetGravity(physicsGravity);
+}
+
+// Converts a force expressed in game units into physics units by applying
+// `forceScale` and then `gameUnitsToMeters` (in that order).
+float Engine::ScaleForceForPhysics(float gameForce) const {
+  const float tunedForce = gameForce * physicsScaling.forceScale;
+  return tunedForce * physicsScaling.gameUnitsToMeters;
+}
+
+// Converts a real-world gravity vector into game physics units by applying
+// `gravityScale` and then `gameUnitsToMeters` (in that order).
+// Example: with gravityScale = 1 and 1 game unit = 0.1 m, -9.81 becomes -0.981.
+glm::vec3 Engine::ScaleGravityForPhysics(const glm::vec3& realWorldGravity) const {
+  const glm::vec3 tunedGravity = realWorldGravity * physicsScaling.gravityScale;
+  return tunedGravity * physicsScaling.gameUnitsToMeters;
+}
+
+// Applies `physicsTimeScale` to a time step, allowing the simulation to run
+// slower or faster than rendering.
+float Engine::ScaleTimeForPhysics(float deltaTime) const {
+  const float timeScale = physicsScaling.physicsTimeScale;
+  return deltaTime * timeScale;
+}
+
+// Queues a ball to be thrown from in front of the camera toward the given
+// mouse position.
+//
+// The ball is not created here: a PendingBall record is appended to
+// `pendingBalls` and later turned into an entity by ProcessPendingBalls().
+//
+// @param mouseX Mouse x position in window coordinates.
+// @param mouseY Mouse y position in window coordinates (increasing downward).
+void Engine::ThrowBall(float mouseX, float mouseY) {
+  if (!activeCamera || !physicsSystem || !platform) {
+    return;
+  }
+
+  // Get window dimensions. They are used as divisors below, so a missing or
+  // zero-sized window (e.g. minimized) must not reach the NDC conversion:
+  // it would yield inf/NaN coordinates and a NaN throw direction.
+  int windowWidth = 0;
+  int windowHeight = 0;
+  platform->GetWindowSize(&windowWidth, &windowHeight);
+  if (windowWidth <= 0 || windowHeight <= 0) {
+    return;
+  }
+
+  // Convert mouse coordinates to normalized device coordinates (-1 to 1)
+  float ndcX = (2.0f * mouseX) / static_cast<float>(windowWidth) - 1.0f;
+  float ndcY = 1.0f - (2.0f * mouseY) / static_cast<float>(windowHeight);
+
+  // Get camera matrices
+  glm::mat4 viewMatrix = activeCamera->GetViewMatrix();
+  glm::mat4 projMatrix = activeCamera->GetProjectionMatrix();
+
+  // Calculate inverse matrices
+  glm::mat4 invView = glm::inverse(viewMatrix);
+  glm::mat4 invProj = glm::inverse(projMatrix);
+
+  // Convert NDC to a world-space direction through the mouse position.
+  // w = 0 in eye space makes this a direction (no translation applied).
+  glm::vec4 rayClip = glm::vec4(ndcX, ndcY, -1.0f, 1.0f);
+  glm::vec4 rayEye = invProj * rayClip;
+  rayEye = glm::vec4(rayEye.x, rayEye.y, -1.0f, 0.0f);
+  glm::vec4 rayWorld = invView * rayEye;
+
+  // Same conversion for the screen center, NDC (0, 0)
+  glm::vec4 screenCenterClip = glm::vec4(0.0f, 0.0f, -1.0f, 1.0f);
+  glm::vec4 screenCenterEye = invProj * screenCenterClip;
+  screenCenterEye = glm::vec4(screenCenterEye.x, screenCenterEye.y, -1.0f, 0.0f);
+  glm::vec4 screenCenterWorld = invView * screenCenterEye;
+  glm::vec3 screenCenterDirection = glm::normalize(glm::vec3(screenCenterWorld));
+
+  // Spawn point: 2 units in front of the camera along the view center
+  glm::vec3 cameraPosition = activeCamera->GetPosition();
+  glm::vec3 screenCenterWorldPos = cameraPosition + screenCenterDirection * 2.0f;
+
+  // Throw direction follows the ray through the mouse position
+  glm::vec3 throwDirection = glm::normalize(glm::vec3(rayWorld));
+
+  // Add upward component for an arcing trajectory, then re-normalize
+  throwDirection.y += 0.3f;
+  throwDirection = glm::normalize(throwDirection);
+
+  // Generate ball properties now; names are "Ball_<n>" with n counting up
+  // across all throws in this process.
+  static int ballCounter = 0;
+  std::string ballName = "Ball_" + std::to_string(ballCounter++);
+
+  std::random_device rd;
+  std::mt19937 gen(rd());
+
+  // Launch balls from in front of the camera
+  glm::vec3 spawnPosition = screenCenterWorldPos;
+
+  // Add small random variation to avoid identical paths
+  std::uniform_real_distribution<float> posDis(-0.1f, 0.1f);
+  spawnPosition.x += posDis(gen);
+  spawnPosition.y += posDis(gen);
+  spawnPosition.z += posDis(gen);
+
+  std::uniform_real_distribution<float> spinDis(-10.0f, 10.0f);
+  std::uniform_real_distribution<float> forceDis(15.0f, 35.0f); // Force range in game units, scaled below
+
+  // Store ball creation data for processing outside rendering loop
+  PendingBall pendingBall;
+  pendingBall.spawnPosition = spawnPosition;
+  pendingBall.throwDirection = throwDirection;
+  pendingBall.throwForce = ScaleForceForPhysics(forceDis(gen)); // Apply physics scaling to force
+  pendingBall.randomSpin = glm::vec3(spinDis(gen), spinDis(gen), spinDis(gen));
+  pendingBall.ballName = ballName;
+
+  pendingBalls.push_back(pendingBall);
+}
+
+// Turns every queued PendingBall into a live entity: transform, sphere mesh,
+// renderer resources and a rigid body that receives the throw impulse/spin.
+// The queue is emptied afterwards, whether or not each ball succeeded.
+//
+// NOTE(review): when a step fails after CreateEntity() succeeded, the
+// partially built entity is left in the scene; confirm whether it should be
+// removed again.
+void Engine::ProcessPendingBalls() {
+  // Common case: nothing was thrown since the last frame.
+  if (pendingBalls.empty()) {
+    return;
+  }
+
+  // Both subsystems are dereferenced below. Update() treats the renderer as
+  // optional, so guard here instead of crashing; the queued throws are dropped.
+  if (!renderer || !physicsSystem) {
+    std::cerr << "Dropping " << pendingBalls.size()
+              << " pending ball(s): renderer or physics system unavailable" << std::endl;
+    pendingBalls.clear();
+    return;
+  }
+
+  // Process all pending balls
+  for (const auto& pendingBall : pendingBalls) {
+    // Create ball entity
+    Entity* ballEntity = CreateEntity(pendingBall.ballName);
+    if (!ballEntity) {
+      std::cerr << "Failed to create ball entity: " << pendingBall.ballName << std::endl;
+      continue;
+    }
+
+    // Add transform component
+    auto* transform = ballEntity->AddComponent<TransformComponent>();
+    if (!transform) {
+      std::cerr << "Failed to add TransformComponent to ball: " << pendingBall.ballName << std::endl;
+      continue;
+    }
+    transform->SetPosition(pendingBall.spawnPosition);
+    transform->SetScale(glm::vec3(1.0f)); // Unit scale; the size comes from the sphere radius below
+
+    // Add mesh component with sphere geometry
+    auto* mesh = ballEntity->AddComponent<MeshComponent>();
+    if (!mesh) {
+      std::cerr << "Failed to add MeshComponent to ball: " << pendingBall.ballName << std::endl;
+      continue;
+    }
+    // Create tennis ball-sized, bright red sphere
+    glm::vec3 brightRed(1.0f, 0.0f, 0.0f);
+    mesh->CreateSphere(0.0335f, brightRed, 32); // Radius 0.0335, bright color, 32 segments
+    mesh->SetTexturePath(renderer->SHARED_BRIGHT_RED_ID); // Use bright red texture for visibility
+
+    // Verify mesh geometry was created
+    const auto& vertices = mesh->GetVertices();
+    const auto& indices = mesh->GetIndices();
+    if (vertices.empty() || indices.empty()) {
+      std::cerr << "ERROR: CreateSphere failed to generate geometry!" << std::endl;
+      continue;
+    }
+
+    // Pre-allocate Vulkan resources for this entity (now outside rendering loop)
+    if (!renderer->preAllocateEntityResources(ballEntity)) {
+      std::cerr << "Failed to pre-allocate resources for ball: " << pendingBall.ballName << std::endl;
+      continue;
+    }
+
+    // Create rigid body with sphere collision shape
+    RigidBody* rigidBody = physicsSystem->CreateRigidBody(ballEntity, CollisionShape::Sphere, 1.0f);
+    if (rigidBody) {
+      // Set bounciness from material
+      rigidBody->SetRestitution(ballMaterial.bounciness);
+
+      // Request an acceleration structure build so the new ball is included in Ray Query mode.
+      // We do this after creating the rigid body and initializing the entity.
+      renderer->RequestAccelerationStructureBuild("Ball spawned");
+
+      // Remember the newest ball so camera tracking can find it without a search
+      g_lastSpawnedBall = ballEntity;
+
+      // Apply throw force and spin
+      glm::vec3 throwImpulse = pendingBall.throwDirection * pendingBall.throwForce;
+      rigidBody->ApplyImpulse(throwImpulse, glm::vec3(0.0f));
+      rigidBody->SetAngularVelocity(pendingBall.randomSpin);
+    }
+  }
+
+  // Clear processed balls
+  pendingBalls.clear();
+}
+
+// Records the latest mouse position in `currentMouseX` / `currentMouseY`.
+// No hit-testing is performed here; the values are only stored.
+void Engine::HandleMouseHover(float mouseX, float mouseY) {
+  currentMouseY = mouseY;
+  currentMouseX = mouseX;
+}
+
+#if defined(PLATFORM_ANDROID)
+// Android-specific implementation
+// Initializes the engine on Android: creates the platform, installs the
+// input/resize callbacks, creates the renderer, then constructs the model
+// loader, audio, physics and ImGui subsystems.
+//
+// Returns false if platform or renderer initialization fails, or if any
+// subsystem constructor throws a std::exception; returns true and sets
+// `initialized` otherwise.
+bool Engine::InitializeAndroid(android_app* app, const std::string& appName, bool enableValidationLayers) {
+  // Create platform
+  // NOTE(review): the result of CreatePlatform() is dereferenced without a
+  // null check — confirm it cannot return null.
+  platform = CreatePlatform(app);
+  // Width and height are passed as 0 here.
+  if (!platform->Initialize(appName, 0, 0)) {
+    return false;
+  }
+
+  // Set resize callback
+  platform->SetResizeCallback([this](int width, int height) {
+    HandleResize(width, height);
+  });
+
+  // Set mouse callback
+  platform->SetMouseCallback([this](float x, float y, uint32_t buttons) {
+    // Check if ImGui wants to capture mouse input first
+    bool imguiWantsMouse = imguiSystem && imguiSystem->WantCaptureMouse();
+
+    if (!imguiWantsMouse) {
+      // Handle mouse click for ball throwing (right mouse button)
+      if (buttons & 2) {
+        // Right mouse button (bit 1)
+        // Only throw on the transition from released to pressed, so holding
+        // the button does not spawn a ball on every event.
+        if (!cameraControl.mouseRightPressed) {
+          cameraControl.mouseRightPressed = true;
+          // Throw a ball on mouse click
+          ThrowBall(x, y);
+        }
+      } else {
+        cameraControl.mouseRightPressed = false;
+      }
+    }
+
+    // ImGui always receives the event, even when it did not capture the mouse.
+    if (imguiSystem) {
+      imguiSystem->HandleMouse(x, y, buttons);
+    }
+  });
+
+  // Set keyboard callback (forwarded to ImGui only; no camera key handling here)
+  platform->SetKeyboardCallback([this](uint32_t key, bool pressed) {
+    if (imguiSystem) {
+      imguiSystem->HandleKeyboard(key, pressed);
+    }
+  });
+
+  // Set char callback
+  platform->SetCharCallback([this](uint32_t c) {
+    if (imguiSystem) {
+      imguiSystem->HandleChar(c);
+    }
+  });
+
+  // Create renderer
+  renderer = std::make_unique<Renderer>(platform.get());
+  if (!renderer->Initialize(appName, enableValidationLayers)) {
+    return false;
+  }
+
+  // Get window dimensions from platform for ImGui initialization
+  int width, height;
+  platform->GetWindowSize(&width, &height);
+
+  try {
+    // Model loader via constructor; also wire into renderer
+    modelLoader = std::make_unique<ModelLoader>(renderer.get());
+    renderer->SetModelLoader(modelLoader.get());
+
+    // Audio system via constructor
+    audioSystem = std::make_unique<AudioSystem>(this, renderer.get());
+
+    // Physics system via constructor (GPU enabled)
+    physicsSystem = std::make_unique<PhysicsSystem>(renderer.get(), true);
+
+    // ImGui via constructor, then connect audio system
+    imguiSystem = std::make_unique<ImGuiSystem>(renderer.get(), width, height);
+    imguiSystem->SetAudioSystem(audioSystem.get());
+  } catch (const std::exception& e) {
+    std::cerr << "Subsystem initialization failed: " << e.what() << std::endl;
+    return false;
+  }
+
+  // Generate ball material properties once at load time
+  GenerateBallMaterial();
+
+  // Initialize physics scaling system (requires physicsSystem, created above)
+  InitializePhysicsScaling();
+
+  initialized = true;
+  return true;
+}
+
+// Runs a single update + render pass on Android. There is no loop here:
+// each call measures the frame delta, updates, and renders once.
+// Throws std::runtime_error if the engine has not been initialized.
+void Engine::RunAndroid() {
+  if (!initialized) {
+    throw std::runtime_error("Engine not initialized");
+  }
+
+  running = true;
+
+  // Measure the time since the previous frame, then advance and draw.
+  const std::chrono::milliseconds frameDelta = CalculateDeltaTimeMs();
+  deltaTimeMs = frameDelta;
+  Update(deltaTimeMs);
+  Render();
+}
+#endif
\ No newline at end of file
diff --git a/attachments/advanced_gltf/engine.h b/attachments/advanced_gltf/engine.h
new file mode 100644
index 000000000..e74b29ac3
--- /dev/null
+++ b/attachments/advanced_gltf/engine.h
@@ -0,0 +1,412 @@
+/* Copyright (c) 2025 Holochip Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 the "License";
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <atomic>
+#include <chrono>
+#include <memory>
+#include <mutex>
+#include <shared_mutex>
+#include <string>
+#include <thread>
+#include <unordered_map>
+#include <vector>
+
+#include "audio_system.h"
+#include "camera_component.h"
+#include "entity.h"
+#include "imgui_system.h"
+#include "model_loader.h"
+#include "physics_system.h"
+#include "platform.h"
+#include "renderer.h"
+#include "resource_manager.h"
+
+/**
+ * @brief Main engine class that manages the game loop and subsystems.
+ *
+ * This class implements the core engine architecture as described in the Engine_Architecture chapter:
+ * @see en/Building_a_Simple_Engine/Engine_Architecture/02_architectural_patterns.adoc
+ */
+class Engine
+{
+  public:
+	using TimeDelta = std::chrono::milliseconds;
+	/**
+	 * @brief Default constructor.
+	 */
+	Engine();
+
+	/**
+	 * @brief Destructor for proper cleanup.
+	 */
+	~Engine();
+
+	/**
+	 * @brief Initialize the engine.
+	 * @param appName The name of the application.
+	 * @param width The width of the window.
+	 * @param height The height of the window.
+	 * @param enableValidationLayers Whether to enable Vulkan validation layers.
+	 * @return True if initialization was successful, false otherwise.
+	 */
+	bool Initialize(const std::string &appName, int width, int height, bool enableValidationLayers = true);
+
+	/**
+	 * @brief Run the main game loop.
+	 */
+	void Run();
+
+	/**
+	 * @brief Clean up engine resources.
+	 */
+	void Cleanup();
+
+	/**
+	 * @brief Create a new entity.
+	 * @param name The name of the entity.
+	 * @return A pointer to the newly created entity.
+	 */
+	Entity *CreateEntity(const std::string &name);
+
+	/**
+	 * @brief Get an entity by name.
+	 * @param name The name of the entity.
+	 * @return A pointer to the entity, or nullptr if not found.
+	 */
+	Entity *GetEntity(const std::string &name);
+
+	/**
+	 * @brief Get all entities.
+	 *
+	 * The returned reference is not protected by `entitiesMutex`; callers that
+	 * may run concurrently with entity creation must take that lock themselves.
+	 *
+	 * @return A const reference to the vector of entities.
+	 */
+	const std::vector<std::unique_ptr<Entity>> &GetEntities() const
+	{
+		return entities;
+	}
+
+	/**
+	 * @brief Remove an entity.
+	 * @param entity The entity to remove.
+	 * @return True if the entity was removed, false otherwise.
+	 */
+	bool RemoveEntity(Entity *entity);
+
+	/**
+	 * @brief Remove an entity by name.
+	 * @param name The name of the entity to remove.
+	 * @return True if the entity was removed, false otherwise.
+	 */
+	bool RemoveEntity(const std::string &name);
+
+	/**
+	 * @brief Set the active camera.
+	 * @param cameraComponent The camera component to set as active.
+	 */
+	void SetActiveCamera(CameraComponent *cameraComponent);
+
+	/**
+	 * @brief Get the active camera.
+	 * @return A pointer to the active camera component, or nullptr if none is set.
+	 */
+	const CameraComponent *GetActiveCamera() const;
+
+	/**
+	 * @brief Get the resource manager.
+	 * @return A pointer to the resource manager.
+	 */
+	const ResourceManager *GetResourceManager() const;
+
+	/**
+	 * @brief Get the platform.
+	 * @return A pointer to the platform.
+	 */
+	const Platform *GetPlatform() const;
+
+	/**
+	 * @brief Get the renderer.
+	 * @return A pointer to the renderer.
+	 */
+	Renderer *GetRenderer();
+
+	/**
+	 * @brief Get the model loader.
+	 * @return A pointer to the model loader.
+	 */
+	ModelLoader *GetModelLoader();
+
+	/**
+	 * @brief Get the audio system.
+	 * @return A pointer to the audio system.
+	 */
+	const AudioSystem *GetAudioSystem() const;
+
+	/**
+	 * @brief Get the physics system.
+	 * @return A pointer to the physics system.
+	 */
+	PhysicsSystem *GetPhysicsSystem();
+
+	/**
+	 * @brief Get the ImGui system.
+	 * @return A pointer to the ImGui system.
+	 */
+	const ImGuiSystem *GetImGuiSystem() const;
+
+	/**
+	 * @brief Handles mouse input for interaction and camera control.
+	 *
+	 * This method processes mouse input for various functionalities, including interacting with the scene,
+	 * camera rotation, and delegating handling to ImGui or hover systems.
+	 *
+	 * @param x The x-coordinate of the mouse position.
+	 * @param y The y-coordinate of the mouse position.
+	 * @param buttons A bitmask representing the state of mouse buttons.
+	 *                Bit 0 corresponds to the left button, and Bit 1 corresponds to the right button.
+	 */
+	void handleMouseInput(float x, float y, uint32_t buttons);
+
+	/**
+	 * @brief Handles keyboard input events for controlling the camera and other subsystems.
+	 *
+	 * This method processes key press and release events to update the camera's movement state.
+	 * It also forwards the input to other subsystems like the ImGui interface if applicable.
+	 *
+	 * @param key The key code of the keyboard input.
+	 * @param pressed Indicates whether the key is pressed (true) or released (false).
+	 */
+	void handleKeyInput(uint32_t key, bool pressed);
+
+#if defined(PLATFORM_ANDROID)
+	/**
+	 * @brief Initialize the engine for Android.
+	 *
+	 * Validation layers default to off in NDEBUG builds and on otherwise.
+	 *
+	 * @param app The Android app.
+	 * @param appName The name of the application.
+	 * @param enableValidationLayers Whether to enable Vulkan validation layers.
+	 * @return True if initialization was successful, false otherwise.
+	 */
+#	if defined(NDEBUG)
+	bool InitializeAndroid(android_app *app, const std::string &appName, bool enableValidationLayers = false);
+#	else
+	bool InitializeAndroid(android_app *app, const std::string &appName, bool enableValidationLayers = true);
+#	endif
+
+	/**
+	 * @brief Run the engine on Android.
+	 */
+	void RunAndroid();
+#endif
+
+  private:
+	// Subsystems
+	std::unique_ptr<Platform>        platform;
+	std::unique_ptr<Renderer>        renderer;
+	std::unique_ptr<ResourceManager> resourceManager;
+	std::unique_ptr<ModelLoader>     modelLoader;
+	std::unique_ptr<AudioSystem>     audioSystem;
+	std::unique_ptr<PhysicsSystem>   physicsSystem;
+	std::unique_ptr<ImGuiSystem>     imguiSystem;
+
+	// Entities
+	// NOTE: Entities can be created from a background loading thread (see `main.cpp`).
+	// Protect the containers to avoid iterator invalidation/data races while the render thread
+	// iterates them.
+	//
+	// NOTE: Everything from here to the end of the class is PUBLIC (there is no
+	// later access specifier), including the state and helper methods below.
+  public:
+	mutable std::shared_mutex                 entitiesMutex;
+	std::vector<std::unique_ptr<Entity>>      entities;
+	std::unordered_map<std::string, Entity *> entityMap;
+
+	// Main thread identity (used to defer destructive operations from background threads)
+	std::thread::id mainThreadId{};
+
+	// Background threads may request entity removal while the render thread is iterating snapshots.
+	// To keep `Entity*` snapshots safe, defer removals to the main thread at a safe point.
+	std::mutex               pendingEntityRemovalsMutex;
+	std::vector<std::string> pendingEntityRemovalNames;
+	void                     ProcessPendingEntityRemovals();
+	bool                     IsMainThread() const;
+
+	// Active camera
+	CameraComponent *activeCamera = nullptr;
+
+	// Engine state
+	bool initialized = false;
+	bool running     = false;
+
+	// Delta time calculation
+	// deltaTimeMs: time since last frame in milliseconds (for clarity)
+	// lastFrameTimeMs: steady_clock timestamp of the previous frame; 0 means "no frame yet"
+	std::chrono::milliseconds deltaTimeMs{0};
+	uint64_t                  lastFrameTimeMs = 0;
+
+	// Frame counter and FPS calculation
+	uint64_t frameCount         = 0;
+	float    fpsUpdateTimer     = 0.0f;
+	float    currentFPS         = 0.0f;
+	uint64_t lastFPSUpdateFrame = 0;
+
+	// Camera control state
+	struct CameraControlState
+	{
+		bool      moveForward             = false;
+		bool      moveBackward            = false;
+		bool      moveLeft                = false;
+		bool      moveRight               = false;
+		bool      moveUp                  = false;
+		bool      moveDown                = false;
+		bool      mouseLeftPressed        = false;
+		bool      mouseRightPressed       = false;
+		float     lastMouseX              = 0.0f;
+		float     lastMouseY              = 0.0f;
+		float     yaw                     = 0.0f;        // Degrees, accumulated from mouse movement
+		float     pitch                   = 0.0f;        // Degrees, clamped to [-89, 89] by the mouse handler
+		bool      firstMouse              = true;
+		float     cameraSpeed             = 5.0f;
+		float     mouseSensitivity        = 0.1f;
+		bool      baseOrientationCaptured = false;
+		glm::quat baseOrientation{1.0f, 0.0f, 0.0f, 0.0f};        // Identity (w, x, y, z)
+	} cameraControl;
+
+	// Mouse position tracking
+	float currentMouseX = 0.0f;
+	float currentMouseY = 0.0f;
+
+	// Ball material properties for PBR.
+	// Members carry neutral defaults so the struct never holds indeterminate
+	// values before GenerateBallMaterial() overwrites them.
+	struct BallMaterial
+	{
+		glm::vec3 albedo{1.0f};
+		float     metallic   = 0.0f;
+		float     roughness  = 0.5f;
+		float     ao         = 1.0f;
+		glm::vec3 emissive{0.0f};
+		float     bounciness = 0.6f;
+	};
+
+	BallMaterial ballMaterial;
+
+	// Physics scaling configuration
+	// The bistro scene spans roughly 20 game units and represents a realistic cafe/bistro space
+	// Based on issue feedback: game units should NOT equal 1m and need proper scaling
+	// Analysis shows bistro geometry pieces are much smaller than assumed
+	//
+	// NOTE: InitializePhysicsScaling() overwrites all four values at startup
+	// (0.1 / 1.0 / 1.0 / 1.0); the initializers below only apply before that call.
+	struct PhysicsScaling
+	{
+		float gameUnitsToMeters = 0.1f;        // 1 game unit = 0.1 meter (10cm)
+		float physicsTimeScale  = 1.0f;        // Normal time scale for stable physics
+		float forceScale        = 2.0f;        // Pre-initialization default (was 10.0f)
+		float gravityScale      = 0.1f;        // Pre-initialization default
+	};
+
+	PhysicsScaling physicsScaling;
+
+	// Pending ball creation data (filled by ThrowBall, consumed by ProcessPendingBalls).
+	// Members are zero-initialized so a default-constructed record is well defined.
+	struct PendingBall
+	{
+		glm::vec3   spawnPosition{0.0f};
+		glm::vec3   throwDirection{0.0f};
+		float       throwForce = 0.0f;
+		glm::vec3   randomSpin{0.0f};
+		std::string ballName;
+	};
+
+	std::vector<PendingBall> pendingBalls;
+
+	/**
+	 * @brief Update the engine state.
+	 * @param deltaTime The time elapsed since the last update, in milliseconds.
+	 */
+	void Update(TimeDelta deltaTime);
+
+	// Physics Threading
+	// PhysicsThreadFunc() is the thread body; it loops while physicsThreadRunning is true.
+	std::thread               physicsThread;
+	std::atomic<bool>         physicsThreadRunning{false};
+	std::mutex                physicsDeltaTimeMutex;
+	std::chrono::milliseconds physicsDeltaTime{0};
+	void                      PhysicsThreadFunc();
+
+	/**
+	 * @brief Render the scene.
+	 */
+	void Render();
+
+	/**
+	 * @brief Calculate the time delta between frames.
+	 * @return The delta time in milliseconds (steady_clock based).
+	 */
+	std::chrono::milliseconds CalculateDeltaTimeMs();
+
+	/**
+	 * @brief Handle window resize events.
+	 * @param width The new width of the window.
+	 * @param height The new height of the window.
+	 */
+	void HandleResize(int width, int height) const;
+
+	/**
+	 * @brief Update camera controls based on input state.
+	 * @param deltaTime The time elapsed since the last update.
+	 */
+	void UpdateCameraControls(TimeDelta deltaTime);
+
+	/**
+	 * @brief Generate random PBR material properties for the ball.
+	 */
+	void GenerateBallMaterial();
+
+	/**
+	 * @brief Initialize physics scaling based on scene analysis.
+	 */
+	void InitializePhysicsScaling();
+
+	/**
+	 * @brief Convert a force value from game units to physics units.
+	 * @param gameForce Force in game units.
+	 * @return Force scaled for physics simulation.
+	 */
+	float ScaleForceForPhysics(float gameForce) const;
+
+	/**
+	 * @brief Convert gravity from real-world units to game physics units.
+	 * @param realWorldGravity Gravity in m/s².
+	 * @return Gravity scaled for game physics.
+	 */
+	glm::vec3 ScaleGravityForPhysics(const glm::vec3 &realWorldGravity) const;
+
+	/**
+	 * @brief Convert time delta for physics simulation.
+	 * @param deltaTime Real delta time.
+	 * @return Scaled delta time for physics.
+	 */
+	float ScaleTimeForPhysics(float deltaTime) const;
+
+	/**
+	 * @brief Throw a ball into the scene with random properties.
+	 * @param mouseX The x-coordinate of the mouse click.
+	 * @param mouseY The y-coordinate of the mouse click.
+	 */
+	void ThrowBall(float mouseX, float mouseY);
+
+	/**
+	 * @brief Process pending ball creations outside the rendering loop.
+	 */
+	void ProcessPendingBalls();
+
+	/**
+	 * @brief Handle mouse hover to track current mouse position.
+	 * @param mouseX The x-coordinate of the mouse position.
+	 * @param mouseY The y-coordinate of the mouse position.
+	 */
+	void HandleMouseHover(float mouseX, float mouseY);
+};
diff --git a/attachments/advanced_gltf/imgui_system.cpp b/attachments/advanced_gltf/imgui_system.cpp
new file mode 100644
index 000000000..abb793a6e
--- /dev/null
+++ b/attachments/advanced_gltf/imgui_system.cpp
@@ -0,0 +1,1108 @@
+/* Copyright (c) 2025 Holochip Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 the "License";
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "imgui_system.h"
+#include "audio_system.h"
+#include "renderer.h"
+
+// Include ImGui headers
+#include "imgui/imgui.h"
+
+#include <iostream>
+
+// This implementation corresponds to the GUI chapter in the tutorial:
+// @see en/Building_a_Simple_Engine/GUI/02_imgui_setup.adoc
+
+// Intentionally does no work: all setup happens later in Initialize().
+// Members are left default-initialized.
+ImGuiSystem::ImGuiSystem() = default;
+
+// Delegates teardown to Cleanup(), which does nothing unless the system is initialized.
+ImGuiSystem::~ImGuiSystem() {
+  this->Cleanup();
+}
+
+// Creates the ImGui context, its Vulkan resources and the per-frame buffer slots.
+// Returns true on success (or if already initialized) and false on any failure.
+bool ImGuiSystem::Initialize(Renderer* renderer, uint32_t width, uint32_t height) {
+  if (initialized) {
+    return true;
+  }
+  // createResources() dereferences the renderer, so reject null up front.
+  if (!renderer) {
+    std::cerr << "Cannot initialize ImGui without a renderer" << std::endl;
+    return false;
+  }
+  this->renderer = renderer;
+  this->width = width;
+  this->height = height;
+
+  context = ImGui::CreateContext();
+  if (!context) {
+    std::cerr << "Failed to create ImGui context" << std::endl;
+    return false;
+  }
+
+  // Configure ImGui: display size, 1:1 framebuffer scale and the dark style
+  ImGuiIO& io = ImGui::GetIO();
+  io.DisplaySize = ImVec2(static_cast<float>(width), static_cast<float>(height));
+  io.DisplayFramebufferScale = ImVec2(1.0f, 1.0f);
+  ImGui::StyleColorsDark();
+  if (!createResources()) {
+    std::cerr << "Failed to create ImGui Vulkan resources" << std::endl;
+    // Cleanup() returns early while `initialized` is false, so destroy the
+    // context here; otherwise it leaks and a retry would create a second one.
+    ImGui::DestroyContext(context);
+    context = nullptr;
+    return false;
+  }
+
+  // Initialize per-frame buffer containers with null handles
+  const uint32_t frames = renderer->GetMaxFramesInFlight();
+  vertexBuffers.clear();
+  vertexBuffers.reserve(frames);
+  vertexBufferMemories.clear();
+  vertexBufferMemories.reserve(frames);
+  indexBuffers.clear();
+  indexBuffers.reserve(frames);
+  indexBufferMemories.clear();
+  indexBufferMemories.reserve(frames);
+  for (uint32_t i = 0; i < frames; ++i) {
+    vertexBuffers.emplace_back(nullptr);
+    vertexBufferMemories.emplace_back(nullptr);
+    indexBuffers.emplace_back(nullptr);
+    indexBufferMemories.emplace_back(nullptr);
+  }
+  vertexCounts.assign(frames, 0);
+  indexCounts.assign(frames, 0);
+  initialized = true;
+  return true;
+}
+
+// Destroys the ImGui context after the renderer has gone idle. Calling it
+// while the system is not initialized (including a second call) does nothing.
+void ImGuiSystem::Cleanup() {
+  if (initialized) {
+    // Wait for the renderer to report idle before tearing anything down
+    if (renderer != nullptr) {
+      renderer->WaitIdle();
+    }
+
+    if (context != nullptr) {
+      ImGui::DestroyContext(context);
+      context = nullptr;
+    }
+
+    initialized = false;
+  }
+}
+
+// Stores the audio system and, if one is given, creates the two looping demo sources (footstep sample and debug ping).
+void ImGuiSystem::SetAudioSystem(AudioSystem* audioSystem) {
+  this->audioSystem = audioSystem;
+  if (!audioSystem) {
+    return;
+  }
+  // The second asset path is only tried when the first one fails to load
+  const bool loaded = audioSystem->LoadAudio("../../Assets/grass-step-right.wav", "grass_step") || audioSystem->LoadAudio("../Assets/grass-step-right.wav", "grass_step");
+  if (loaded) {
+    audioSource = audioSystem->CreateAudioSource("grass_step");
+    if (audioSource) {
+      audioSource->SetPosition(audioSourceX, audioSourceY, audioSourceZ);
+      audioSource->SetVolume(0.8f);
+      audioSource->SetLoop(true);
+      std::cout << "Audio source created and configured for HRTF demo" << std::endl;
+    }
+  }
+
+  debugPingSource = audioSystem->CreateDebugPingSource("debug_ping");
+  if (debugPingSource) {
+    debugPingSource->SetPosition(audioSourceX, audioSourceY, audioSourceZ);
+    debugPingSource->SetVolume(0.8f);
+    debugPingSource->SetLoop(true);
+    std::cout << "Debug ping source created for audio debugging" << std::endl;
+  }
+}
+
+// Begins a new ImGui frame and builds all UI for it: the fullscreen loading
+// overlay, or else the AS-build/streaming status windows and the control panel.
+void ImGuiSystem::NewFrame() {
+  if (!initialized) {
+    return;
+  }
+
+  frameAlreadyRendered = false;  // Reset the flag at the start of each frame
+  ImGui::NewFrame();
+
+  // Loading overlay: show a fullscreen progress bar while the initial scene is loading.
+  // The bar resets between phases (Textures -> Physics -> AS -> Finalizing) so users
+  // don't stare at a 100% bar while the engine is still doing work.
+  if (renderer) {
+    const bool modelLoading = renderer->IsLoading();
+    if (modelLoading) {
+      ImGuiIO& io = ImGui::GetIO();
+      // Suppress right-click while loading
+      if (io.MouseDown[1])
+        io.MouseDown[1] = false;
+
+      const ImVec2 dispSize = io.DisplaySize;
+
+      ImGui::SetNextWindowPos(ImVec2(0, 0));
+      ImGui::SetNextWindowSize(dispSize);
+      ImGuiWindowFlags flags = ImGuiWindowFlags_NoTitleBar |
+          ImGuiWindowFlags_NoResize |
+          ImGuiWindowFlags_NoMove |
+          ImGuiWindowFlags_NoScrollbar |
+          ImGuiWindowFlags_NoCollapse |
+          ImGuiWindowFlags_NoSavedSettings |
+          ImGuiWindowFlags_NoBringToFrontOnFocus |
+          ImGuiWindowFlags_NoNav;
+
+      if (ImGui::Begin("##LoadingOverlay", nullptr, flags)) {
+        ImGui::PushStyleVar(ImGuiStyleVar_WindowPadding, ImVec2(0, 0));
+        // Center the progress elements
+        const float barWidth = dispSize.x * 0.8f;
+        const float barX = (dispSize.x - barWidth) * 0.5f;
+        const float barY = dispSize.y * 0.45f;
+        ImGui::SetCursorPos(ImVec2(barX, barY));
+        ImGui::BeginGroup();
+
+        // Phase-aware progress (resets between phases).
+        float frac = 0.0f;
+        auto phase = renderer->GetLoadingPhase();
+        if (phase == Renderer::LoadingPhase::Textures) {
+          const uint32_t scheduled = renderer->GetTextureTasksScheduled();
+          const uint32_t completed = renderer->GetTextureTasksCompleted();
+          frac = (scheduled > 0) ? (static_cast<float>(completed) / static_cast<float>(scheduled)) : 0.0f;
+        } else if (phase == Renderer::LoadingPhase::AccelerationStructures) {
+          frac = renderer->GetASBuildProgress();
+        } else {
+          frac = renderer->GetLoadingPhaseProgress();
+        }
+        ImGui::ProgressBar(frac, ImVec2(barWidth, 0.0f));
+        ImGui::Dummy(ImVec2(0.0f, 10.0f));
+        ImGui::SetCursorPosX(barX);
+        ImGui::Text("Loading: %s", renderer->GetLoadingPhaseName());
+        if (phase == Renderer::LoadingPhase::Textures) {
+          const uint32_t scheduled = renderer->GetTextureTasksScheduled();
+          const uint32_t completed = renderer->GetTextureTasksCompleted();
+          ImGui::Text("Textures: %u/%u", completed, scheduled);
+        } else if (phase == Renderer::LoadingPhase::AccelerationStructures) {
+          const uint32_t done = renderer->GetASBuildItemsDone();
+          const uint32_t total = renderer->GetASBuildItemsTotal();
+          ImGui::Text("%s (%u/%u, %.1fs)", renderer->GetASBuildStage(), done, total, renderer->GetASBuildElapsedSeconds());
+        }
+        ImGui::EndGroup();
+        ImGui::PopStyleVar();
+      }
+      ImGui::End();
+      return;
+    }
+  }
+
+  // --- Streaming status: small progress indicator in the upper-right ---
+  // Once the scene is visible, textures may continue streaming to the GPU.
+  // Show a compact progress bar in the top-right while there are still
+  // outstanding texture tasks, and hide it once everything is fully loaded.
+  if (renderer) {
+    const uint32_t uploadTotal = renderer->GetUploadJobsTotal();
+    const uint32_t uploadDone = renderer->GetUploadJobsCompleted();
+    const bool modelLoading = renderer->IsLoading();
+    const bool showASBuild = renderer->ShouldShowASBuildProgressInUI();
+
+    // Acceleration structure build can happen after initial load completes.
+    // If it takes a long time, show a compact progress window.
+    if (!modelLoading && showASBuild) {
+      ImGuiIO& io = ImGui::GetIO();
+      const ImVec2 dispSize = io.DisplaySize;
+
+      const float windowWidth = std::min(320.0f, dispSize.x * 0.42f);
+      const float windowHeight = 90.0f;
+      const ImVec2 winPos(dispSize.x - windowWidth - 10.0f, 10.0f);
+
+      ImGui::SetNextWindowPos(winPos, ImGuiCond_Always);
+      ImGui::SetNextWindowSize(ImVec2(windowWidth, windowHeight));
+      ImGuiWindowFlags flags = ImGuiWindowFlags_NoResize |
+          ImGuiWindowFlags_NoMove |
+          ImGuiWindowFlags_NoCollapse |
+          ImGuiWindowFlags_NoSavedSettings;
+
+      if (ImGui::Begin("##ASBuildStatus", nullptr, flags)) {
+        ImGui::Text("Building acceleration structures...");
+        const float asFrac = renderer->GetASBuildProgress();
+        ImGui::ProgressBar(asFrac, ImVec2(-1.0f, 0.0f));
+        const uint32_t done = renderer->GetASBuildItemsDone();
+        const uint32_t total = renderer->GetASBuildItemsTotal();
+        ImGui::Text("%s (%u/%u, %.1fs)",
+                    renderer->GetASBuildStage(),
+                    done,
+                    total,
+                    renderer->GetASBuildElapsedSeconds());
+      }
+      ImGui::End();
+    }
+
+    if (!modelLoading && uploadTotal > 0 && uploadDone < uploadTotal) {
+      ImGuiIO& io = ImGui::GetIO();
+      const ImVec2 dispSize = io.DisplaySize;
+
+      const float windowWidth = std::min(260.0f, dispSize.x * 0.35f);
+      const float windowHeight = 120.0f;
+      // If the AS build status window is visible, offset streaming window below it.
+      const float yBase = 10.0f + (showASBuild ? (90.0f + 10.0f) : 0.0f);
+      const ImVec2 winPos(dispSize.x - windowWidth - 10.0f, yBase);
+
+      ImGui::SetNextWindowPos(winPos, ImGuiCond_Always);
+      ImGui::SetNextWindowSize(ImVec2(windowWidth, windowHeight));
+      ImGuiWindowFlags flags = ImGuiWindowFlags_NoTitleBar |
+          ImGuiWindowFlags_NoResize |
+          ImGuiWindowFlags_NoMove |
+          ImGuiWindowFlags_NoScrollbar |
+          ImGuiWindowFlags_NoSavedSettings |
+          ImGuiWindowFlags_NoCollapse;
+
+      if (ImGui::Begin("##StreamingTextures", nullptr, flags)) {
+        ImGui::TextUnformatted("Streaming textures to GPU");
+        float frac = (uploadTotal > 0) ? (float) uploadDone / (float) uploadTotal : 0.0f;
+        ImGui::ProgressBar(frac, ImVec2(-1.0f, 0.0f));
+
+        // Perf counters
+        const double mbps = renderer->GetUploadThroughputMBps();
+        const double avgMs = renderer->GetAverageUploadMs();
+        const double totalMB = (double) renderer->GetBytesUploadedTotal() / (1024.0 * 1024.0);
+        ImGui::Text("Throughput: %.1f MB/s", mbps);
+        ImGui::SameLine();
+        ImGui::Text("Avg upload: %.2f ms/tex", avgMs);
+        ImGui::Text("Total uploaded: %.1f MB", totalMB);
+      }
+      ImGui::End();
+    }
+  }
+
+  // Create HRTF Audio Control UI
+  ImGui::Begin("HRTF Audio Controls");
+  ImGui::Text("3D Audio Position Control");
+
+  // Audio source selection
+  ImGui::Separator();
+  ImGui::Text("Audio Source Selection:");
+
+  // Function-local static: the selection must persist across frames
+  static bool useDebugPing = false;
+  if (ImGui::Checkbox("Use Debug Ping (800Hz sine wave)", &useDebugPing)) {
+    // Stop whichever source is playing so the two never overlap
+    if (audioSource && audioSource->IsPlaying()) {
+      audioSource->Stop();
+    }
+    if (debugPingSource && debugPingSource->IsPlaying()) {
+      debugPingSource->Stop();
+    }
+    // The directional buttons only move the active source, so the newly
+    // selected one may still be at an older position. Re-apply the shared
+    // position that the UI displays before the user presses Play.
+    AudioSource* selected = useDebugPing ? debugPingSource : audioSource;
+    if (selected) {
+      selected->SetPosition(audioSourceX, audioSourceY, audioSourceZ);
+    }
+    std::cout << "Switched to " << (useDebugPing ? "debug ping" : "file audio") << " source" << std::endl;
+  }
+
+  // Display current audio source position
+  ImGui::Text("Audio Source Position: (%.2f, %.2f, %.2f)", audioSourceX, audioSourceY, audioSourceZ);
+  ImGui::Text("Current Source: %s", useDebugPing ? "Debug Ping (800Hz)" : "grass-step-right.wav");
+
+  // Directional control buttons
+  ImGui::Separator();
+  ImGui::Text("Directional Controls:");
+
+  // Get current active source
+  AudioSource* currentSource = useDebugPing ? debugPingSource : audioSource;
+
+  // Up button
+  if (ImGui::Button("Up")) {
+    audioSourceY += 0.5f;
+    if (currentSource) {
+      currentSource->SetPosition(audioSourceX, audioSourceY, audioSourceZ);
+    }
+    std::cout << (useDebugPing ? "Debug ping" : "Audio") << " moved up to (" << audioSourceX << ", " << audioSourceY << ", " << audioSourceZ << ")" << std::endl;
+  }
+
+  // Left and Right buttons on same line
+  if (ImGui::Button("Left")) {
+    audioSourceX -= 0.5f;
+    if (currentSource) {
+      currentSource->SetPosition(audioSourceX, audioSourceY, audioSourceZ);
+    }
+    std::cout << (useDebugPing ? "Debug ping" : "Audio") << " moved left to (" << audioSourceX << ", " << audioSourceY << ", " << audioSourceZ << ")" << std::endl;
+  }
+  ImGui::SameLine();
+  if (ImGui::Button("Right")) {
+    audioSourceX += 0.5f;
+    if (currentSource) {
+      currentSource->SetPosition(audioSourceX, audioSourceY, audioSourceZ);
+    }
+    std::cout << (useDebugPing ? "Debug ping" : "Audio") << " moved right to (" << audioSourceX << ", " << audioSourceY << ", " << audioSourceZ << ")" << std::endl;
+  }
+
+  // Down button
+  if (ImGui::Button("Down")) {
+    audioSourceY -= 0.5f;
+    if (currentSource) {
+      currentSource->SetPosition(audioSourceX, audioSourceY, audioSourceZ);
+    }
+    std::cout << (useDebugPing ? "Debug ping" : "Audio") << " moved down to (" << audioSourceX << ", " << audioSourceY << ", " << audioSourceZ << ")" << std::endl;
+  }
+
+  // Audio playback controls
+  ImGui::Separator();
+  ImGui::Text("Playback Controls:");
+
+  // Play button
+  if (ImGui::Button("Play")) {
+    if (currentSource) {
+      currentSource->Play();
+      if (audioSystem) {
+        audioSystem->FlushOutput();
+      }
+      if (useDebugPing) {
+        std::cout << "Started playing debug ping (800Hz sine wave) with HRTF processing" << std::endl;
+      } else {
+        std::cout << "Started playing grass-step-right.wav with HRTF processing" << std::endl;
+      }
+    } else {
+      std::cout << "No audio source available - audio system not initialized" << std::endl;
+    }
+  }
+  ImGui::SameLine();
+
+  // Stop button
+  if (ImGui::Button("Stop")) {
+    if (currentSource) {
+      currentSource->Stop();
+      if (useDebugPing) {
+        std::cout << "Stopped debug ping playback" << std::endl;
+      } else {
+        std::cout << "Stopped audio playback" << std::endl;
+      }
+    }
+  }
+
+  // Additional info: HRTF status.
+  // A missing audio system is reported the same way as an audio system
+  // with HRTF processing turned off.
+  ImGui::Separator();
+  const bool hrtfEnabled = audioSystem && audioSystem->IsHRTFEnabled();
+
+  if (hrtfEnabled) {
+    ImGui::Text("HRTF Processing: ENABLED");
+    ImGui::Text("Use directional buttons to move the audio source in 3D space");
+    ImGui::Text("You should hear the audio move around you!");
+
+    // HRTF Processing Mode: GPU only (checkbox removed)
+    // The mode lines below are informational only; there is no toggle.
+    ImGui::Separator();
+    ImGui::Text("HRTF Processing Mode:");
+    ImGui::Text("Current Mode: Vulkan shader processing (GPU)");
+  } else {
+    ImGui::Text("HRTF Processing: DISABLED");
+  }
+
+  // Ball Debugging Controls
+  ImGui::Separator();
+  ImGui::Text("Ball Debugging Controls:");
+
+  if (ImGui::Checkbox("Ball-Only Rendering", &ballOnlyRenderingEnabled)) {
+    std::cout << "Ball-only rendering " << (ballOnlyRenderingEnabled ? "enabled" : "disabled") << std::endl;
+  }
+  ImGui::SameLine();
+  if (ImGui::Button("?##BallOnlyHelp")) {
+    // Help tooltip will be shown on hover
+  }
+  if (ImGui::IsItemHovered()) {
+    ImGui::SetTooltip("When enabled, only balls will be rendered.\nAll other geometry (bistro scene) will be hidden.");
+  }
+
+  if (ImGui::Checkbox("Camera Track Ball", &cameraTrackingEnabled)) {
+    std::cout << "Camera tracking " << (cameraTrackingEnabled ? "enabled" : "disabled") << std::endl;
+  }
+  ImGui::SameLine();
+  if (ImGui::Button("?##CameraTrackHelp")) {
+    // Help tooltip will be shown on hover
+  }
+  if (ImGui::IsItemHovered()) {
+    ImGui::SetTooltip("When enabled, camera will automatically\nfollow and look at the ball.");
+  }
+
+  // Status display
+  if (ballOnlyRenderingEnabled) {
+    ImGui::Text("Status: Only balls are being rendered");
+  } else {
+    ImGui::Text("Status: All geometry is being rendered");
+  }
+
+  if (cameraTrackingEnabled) {
+    ImGui::Text("Camera: Tracking ball automatically");
+  } else {
+    ImGui::Text("Camera: Manual control (WASD + mouse)");
+  }
+
+  // Texture loading progress
+  if (renderer) {
+    const uint32_t scheduled = renderer->GetTextureTasksScheduled();
+    const uint32_t completed = renderer->GetTextureTasksCompleted();
+    if (scheduled > 0 && completed < scheduled) {
+      ImGui::Separator();
+      float frac = scheduled ? (float) completed / (float) scheduled : 1.0f;
+      ImGui::Text("Loading textures: %u / %u", completed, scheduled);
+      ImGui::ProgressBar(frac, ImVec2(-FLT_MIN, 0.0f));
+      ImGui::Text("You can continue interacting while textures stream in...");
+    }
+  }
+
+  ImGui::End();
+}
+
+// Finalizes the current ImGui frame and records its draw commands into
+// `commandBuffer`, using the vertex/index buffers of slot `frameIndex`.
+void ImGuiSystem::Render(vk::raii::CommandBuffer& commandBuffer, uint32_t frameIndex) {
+  if (!initialized) {
+    return;
+  }
+
+  // End the frame and update the vertex/index buffers for this frame
+  ImGui::Render();
+  updateBuffers(frameIndex);
+
+  ImDrawData* drawData = ImGui::GetDrawData();
+  if (!drawData || drawData->CmdListsCount == 0) {
+    return;
+  }
+  // A zero-sized display would make the scale below divide by zero, and
+  // there is nothing visible to draw in that case.
+  const ImVec2 displaySize = ImGui::GetIO().DisplaySize;
+  if (displaySize.x <= 0.0f || displaySize.y <= 0.0f) {
+    return;
+  }
+
+  try {
+    commandBuffer.bindPipeline(vk::PipelineBindPoint::eGraphics, *pipeline);
+    vk::Viewport viewport;
+    viewport.width = displaySize.x;
+    viewport.height = displaySize.y;
+    viewport.minDepth = 0.0f;
+    viewport.maxDepth = 1.0f;
+    commandBuffer.setViewport(0, {viewport});
+
+    // Scale/translate pair pushed to the vertex stage
+    struct PushConstBlock {
+      float scale[2];
+      float translate[2];
+    } pushConstBlock{};
+    pushConstBlock.scale[0] = 2.0f / displaySize.x;
+    pushConstBlock.scale[1] = 2.0f / displaySize.y;
+    pushConstBlock.translate[0] = -1.0f;
+    pushConstBlock.translate[1] = -1.0f;
+    commandBuffer.pushConstants<PushConstBlock>(*pipelineLayout, vk::ShaderStageFlagBits::eVertex, 0, pushConstBlock);
+
+    // Bind this frame's buffers and the font descriptor set once for all draws
+    commandBuffer.bindVertexBuffers(0, *vertexBuffers[frameIndex], vk::DeviceSize{0});
+    commandBuffer.bindIndexBuffer(*indexBuffers[frameIndex], 0, vk::IndexType::eUint16);
+    commandBuffer.bindDescriptorSets(vk::PipelineBindPoint::eGraphics, *pipelineLayout, 0, {*descriptorSet}, {});
+
+    // Offsets accumulate across command lists: one buffer pair serves them all
+    int vertexOffset = 0;
+    int indexOffset = 0;
+    for (int i = 0; i < drawData->CmdListsCount; i++) {
+      const ImDrawList* cmdList = drawData->CmdLists[i];
+      for (int j = 0; j < cmdList->CmdBuffer.Size; j++) {
+        const ImDrawCmd* pcmd = &cmdList->CmdBuffer[j];
+        // Clamp both corners to the display and derive the extent from the clamped
+        // corners, so an off-screen origin cannot inflate it; skip empty rectangles.
+        const float clipMinX = std::max(pcmd->ClipRect.x, 0.0f);
+        const float clipMinY = std::max(pcmd->ClipRect.y, 0.0f);
+        const float clipMaxX = std::min(pcmd->ClipRect.z, displaySize.x);
+        const float clipMaxY = std::min(pcmd->ClipRect.w, displaySize.y);
+        if (clipMaxX > clipMinX && clipMaxY > clipMinY) {
+          vk::Rect2D scissor;
+          scissor.offset.x = static_cast<int32_t>(clipMinX);
+          scissor.offset.y = static_cast<int32_t>(clipMinY);
+          scissor.extent.width = static_cast<uint32_t>(clipMaxX - clipMinX);
+          scissor.extent.height = static_cast<uint32_t>(clipMaxY - clipMinY);
+          commandBuffer.setScissor(0, {scissor});
+          commandBuffer.drawIndexed(pcmd->ElemCount, 1, indexOffset, vertexOffset, 0);
+        }
+        // Advance past the command's indices even when the draw was skipped
+        indexOffset += pcmd->ElemCount;
+      }
+      vertexOffset += cmdList->VtxBuffer.Size;
+    }
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to render ImGui: " << e.what() << std::endl;
+  }
+}
+
+// Feeds the pointer position and a button bitmask to ImGui.
+void ImGuiSystem::HandleMouse(float x, float y, uint32_t buttons) {
+  if (!initialized) {
+    return;
+  }
+
+  ImGuiIO& inputState = ImGui::GetIO();
+  inputState.MousePos = ImVec2(x, y);
+
+  // Bit n of `buttons` drives ImGui mouse button n:
+  // 0 = left, 1 = right, 2 = middle. Higher bits are ignored.
+  for (int button = 0; button < 3; ++button) {
+    inputState.MouseDown[button] = ((buttons >> button) & 0x1u) != 0;
+  }
+}
+
+// Records a key transition in ImGui's key table and refreshes the modifier flags.
+void ImGuiSystem::HandleKeyboard(uint32_t key, bool pressed) {
+  if (!initialized) {
+    return;
+  }
+
+  ImGuiIO& inputState = ImGui::GetIO();
+  // Only codes below 512 are written to the key table
+  if (key < 512) {
+    inputState.KeysDown[key] = pressed;
+  }
+
+  // Modifiers are derived from the GLFW key codes of each left/right pair
+  const auto eitherDown = [&inputState](int left, int right) { return inputState.KeysDown[left] || inputState.KeysDown[right]; };
+  inputState.KeyCtrl = eitherDown(341, 345);   // Left/Right Control
+  inputState.KeyShift = eitherDown(340, 344);  // Left/Right Shift
+  inputState.KeyAlt = eitherDown(342, 346);    // Left/Right Alt
+  inputState.KeySuper = eitherDown(343, 347);  // Left/Right Super
+}
+
+// Forwards the typed character `c` to ImGui through ImGuiIO::AddInputCharacter.
+// Does nothing until Initialize() has succeeded.
+void ImGuiSystem::HandleChar(uint32_t c) {
+  if (initialized) {
+    ImGuiIO& inputState = ImGui::GetIO();
+    inputState.AddInputCharacter(c);
+  }
+}
+
+// Stores the new size and reports it to ImGui as the display size; ignored until initialized.
+void ImGuiSystem::HandleResize(uint32_t width, uint32_t height) {
+  if (!initialized) {
+    return;
+  }
+
+  this->width = width;
+  this->height = height;
+  const ImVec2 newSize(static_cast<float>(width), static_cast<float>(height));
+  ImGui::GetIO().DisplaySize = newSize;
+}
+
+// Reports whether ImGui wants to consume keyboard input.
+// Always false until Initialize() has succeeded.
+bool ImGuiSystem::WantCaptureKeyboard() const {
+  // Short-circuit: ImGui is only queried once the system is initialized
+  const bool wanted = initialized && ImGui::GetIO().WantCaptureKeyboard;
+  return wanted;
+}
+
+// Reports whether ImGui wants to consume mouse input.
+// Always false until Initialize() has succeeded.
+bool ImGuiSystem::WantCaptureMouse() const {
+  // Short-circuit: ImGui is only queried once the system is initialized
+  const bool wanted = initialized && ImGui::GetIO().WantCaptureMouse;
+  return wanted;
+}
+
+// Creates every Vulkan object needed for ImGui rendering.
+// The steps run in the order listed and the first failure aborts the
+// sequence, so no later step runs after an earlier one has failed.
+// Returns true only if every step succeeded.
+bool ImGuiSystem::createResources() {
+  using Step = bool (ImGuiSystem::*)();
+  const Step steps[] = {
+      // Font atlas image, its memory, view and sampler
+      &ImGuiSystem::createFontTexture,
+      // Layout with the single combined image sampler binding
+      &ImGuiSystem::createDescriptorSetLayout,
+      // Pool the descriptor set is allocated from
+      &ImGuiSystem::createDescriptorPool,
+      // Descriptor set pointing at the font view and sampler
+      &ImGuiSystem::createDescriptorSet,
+      // Pipeline layout: descriptor set layout plus push constants
+      &ImGuiSystem::createPipelineLayout,
+      // Graphics pipeline built from shaders/imgui.spv
+      &ImGuiSystem::createPipeline,
+  };
+
+  for (const Step step : steps) {
+    if (!(this->*step)()) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Builds the ImGui font atlas, uploads it into a device-local RGBA8 image and
+// creates the matching image view and sampler. Returns false on any failure.
+bool ImGuiSystem::createFontTexture() {
+  unsigned char* fontData = nullptr;
+  int texWidth = 0, texHeight = 0;
+  ImGui::GetIO().Fonts->GetTexDataAsRGBA32(&fontData, &texWidth, &texHeight);
+  if (!fontData || texWidth <= 0 || texHeight <= 0) {
+    std::cerr << "Failed to create font texture: ImGui returned an empty font atlas" << std::endl;
+    return false;
+  }
+  // Widen before multiplying so the byte count (4 bytes per texel) is never computed in int
+  const vk::DeviceSize uploadSize = static_cast<vk::DeviceSize>(texWidth) * static_cast<vk::DeviceSize>(texHeight) * 4;
+
+  try {
+    // Create the font image
+    vk::ImageCreateInfo imageInfo;
+    imageInfo.imageType = vk::ImageType::e2D;
+    imageInfo.format = vk::Format::eR8G8B8A8Unorm;
+    imageInfo.extent.width = static_cast<uint32_t>(texWidth);
+    imageInfo.extent.height = static_cast<uint32_t>(texHeight);
+    imageInfo.extent.depth = 1;
+    imageInfo.mipLevels = 1;
+    imageInfo.arrayLayers = 1;
+    imageInfo.samples = vk::SampleCountFlagBits::e1;
+    imageInfo.tiling = vk::ImageTiling::eOptimal;
+    imageInfo.usage = vk::ImageUsageFlagBits::eSampled | vk::ImageUsageFlagBits::eTransferDst;
+    imageInfo.sharingMode = vk::SharingMode::eExclusive;
+    imageInfo.initialLayout = vk::ImageLayout::eUndefined;
+
+    const vk::raii::Device& device = renderer->GetRaiiDevice();
+    fontImage = vk::raii::Image(device, imageInfo);
+
+    // Allocate memory for the image
+    vk::MemoryRequirements memRequirements = fontImage.getMemoryRequirements();
+    vk::MemoryAllocateInfo allocInfo;
+    allocInfo.allocationSize = memRequirements.size;
+    allocInfo.memoryTypeIndex = renderer->FindMemoryType(memRequirements.memoryTypeBits, vk::MemoryPropertyFlagBits::eDeviceLocal);
+
+    fontMemory = vk::raii::DeviceMemory(device, allocInfo);
+    fontImage.bindMemory(*fontMemory, 0);
+
+    // Create a staging buffer for uploading the font data (it and its memory are released by RAII at scope exit)
+    vk::BufferCreateInfo bufferInfo;
+    bufferInfo.size = uploadSize;
+    bufferInfo.usage = vk::BufferUsageFlagBits::eTransferSrc;
+    bufferInfo.sharingMode = vk::SharingMode::eExclusive;
+
+    vk::raii::Buffer stagingBuffer(device, bufferInfo);
+    vk::MemoryRequirements stagingMemRequirements = stagingBuffer.getMemoryRequirements();
+    vk::MemoryAllocateInfo stagingAllocInfo;
+    stagingAllocInfo.allocationSize = stagingMemRequirements.size;
+    stagingAllocInfo.memoryTypeIndex = renderer->FindMemoryType(stagingMemRequirements.memoryTypeBits,
+                                                                vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+
+    vk::raii::DeviceMemory stagingBufferMemory(device, stagingAllocInfo);
+    stagingBuffer.bindMemory(*stagingBufferMemory, 0);
+
+    // Copy font data to staging buffer
+    void* data = stagingBufferMemory.mapMemory(0, uploadSize);
+    memcpy(data, fontData, uploadSize);
+    stagingBufferMemory.unmapMemory();
+
+    // Transition image layout and copy data
+    renderer->TransitionImageLayout(*fontImage,
+                                    vk::Format::eR8G8B8A8Unorm,
+                                    vk::ImageLayout::eUndefined,
+                                    vk::ImageLayout::eTransferDstOptimal);
+    renderer->CopyBufferToImage(*stagingBuffer,
+                                *fontImage,
+                                static_cast<uint32_t>(texWidth),
+                                static_cast<uint32_t>(texHeight));
+    renderer->TransitionImageLayout(*fontImage,
+                                    vk::Format::eR8G8B8A8Unorm,
+                                    vk::ImageLayout::eTransferDstOptimal,
+                                    vk::ImageLayout::eShaderReadOnlyOptimal);
+
+    // Create image view
+    vk::ImageViewCreateInfo viewInfo;
+    viewInfo.image = *fontImage;
+    viewInfo.viewType = vk::ImageViewType::e2D;
+    viewInfo.format = vk::Format::eR8G8B8A8Unorm;
+    viewInfo.subresourceRange.aspectMask = vk::ImageAspectFlagBits::eColor;
+    viewInfo.subresourceRange.baseMipLevel = 0;
+    viewInfo.subresourceRange.levelCount = 1;
+    viewInfo.subresourceRange.baseArrayLayer = 0;
+    viewInfo.subresourceRange.layerCount = 1;
+
+    fontView = vk::raii::ImageView(device, viewInfo);
+
+    // Create sampler
+    vk::SamplerCreateInfo samplerInfo;
+    samplerInfo.magFilter = vk::Filter::eLinear;
+    samplerInfo.minFilter = vk::Filter::eLinear;
+    samplerInfo.mipmapMode = vk::SamplerMipmapMode::eLinear;
+    samplerInfo.addressModeU = vk::SamplerAddressMode::eClampToEdge;
+    samplerInfo.addressModeV = vk::SamplerAddressMode::eClampToEdge;
+    samplerInfo.addressModeW = vk::SamplerAddressMode::eClampToEdge;
+    samplerInfo.mipLodBias = 0.0f;
+    samplerInfo.anisotropyEnable = VK_FALSE;
+    samplerInfo.maxAnisotropy = 1.0f;
+    samplerInfo.compareEnable = VK_FALSE;
+    samplerInfo.compareOp = vk::CompareOp::eAlways;
+    samplerInfo.minLod = 0.0f;
+    samplerInfo.maxLod = 0.0f;
+    samplerInfo.borderColor = vk::BorderColor::eFloatOpaqueWhite;
+    samplerInfo.unnormalizedCoordinates = VK_FALSE;
+
+    fontSampler = vk::raii::Sampler(device, samplerInfo);
+
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create font texture: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Creates the descriptor set layout: one combined image sampler at binding 0,
+// visible to the fragment stage. Returns false if Vulkan throws.
+bool ImGuiSystem::createDescriptorSetLayout() {
+  try {
+    vk::DescriptorSetLayoutBinding fontBinding;
+    fontBinding.binding = 0;
+    fontBinding.descriptorCount = 1;
+    fontBinding.descriptorType = vk::DescriptorType::eCombinedImageSampler;
+    fontBinding.stageFlags = vk::ShaderStageFlagBits::eFragment;
+
+    vk::DescriptorSetLayoutCreateInfo createInfo;
+    createInfo.bindingCount = 1;
+    createInfo.pBindings = &fontBinding;
+
+    descriptorSetLayout = vk::raii::DescriptorSetLayout(renderer->GetRaiiDevice(), createInfo);
+    return true;
+  } catch (const std::exception& error) {
+    std::cerr << "Failed to create descriptor set layout: " << error.what() << std::endl;
+    return false;
+  }
+}
+
+// Creates a descriptor pool sized for exactly one set holding one combined
+// image sampler; sets may be freed individually. Returns false if Vulkan throws.
+bool ImGuiSystem::createDescriptorPool() {
+  try {
+    vk::DescriptorPoolSize samplerCapacity;
+    samplerCapacity.descriptorCount = 1;
+    samplerCapacity.type = vk::DescriptorType::eCombinedImageSampler;
+
+    vk::DescriptorPoolCreateInfo createInfo;
+    createInfo.maxSets = 1;
+    createInfo.poolSizeCount = 1;
+    createInfo.pPoolSizes = &samplerCapacity;
+    createInfo.flags = vk::DescriptorPoolCreateFlagBits::eFreeDescriptorSet;
+
+    descriptorPool = vk::raii::DescriptorPool(renderer->GetRaiiDevice(), createInfo);
+    return true;
+  } catch (const std::exception& error) {
+    std::cerr << "Failed to create descriptor pool: " << error.what() << std::endl;
+    return false;
+  }
+}
+
+// Allocates the single descriptor set from descriptorPool and points its
+// binding 0 at the font image view and sampler. Returns false if Vulkan throws.
+bool ImGuiSystem::createDescriptorSet() {
+  try {
+    const vk::raii::Device& device = renderer->GetRaiiDevice();
+
+    vk::DescriptorSetAllocateInfo setRequest;
+    setRequest.descriptorSetCount = 1;
+    setRequest.descriptorPool = *descriptorPool;
+    setRequest.pSetLayouts = &(*descriptorSetLayout);
+    vk::raii::DescriptorSets allocated(device, setRequest);
+    descriptorSet = std::move(allocated.front());  // Exactly one set was requested
+    std::cout << "ImGui created descriptor set with handle: " << *descriptorSet << std::endl;
+
+    // Describe the font texture as it will be sampled by the shader
+    vk::DescriptorImageInfo fontInfo;
+    fontInfo.sampler = *fontSampler;
+    fontInfo.imageView = *fontView;
+    fontInfo.imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal;
+
+    vk::WriteDescriptorSet fontWrite;
+    fontWrite.dstSet = *descriptorSet;
+    fontWrite.dstBinding = 0;
+    fontWrite.descriptorCount = 1;
+    fontWrite.descriptorType = vk::DescriptorType::eCombinedImageSampler;
+    fontWrite.pImageInfo = &fontInfo;
+    device.updateDescriptorSets({fontWrite}, {});
+    return true;
+  } catch (const std::exception& error) {
+    std::cerr << "Failed to create descriptor set: " << error.what() << std::endl;
+    return false;
+  }
+}
+
+// Creates the pipeline layout: the font descriptor set layout plus one
+// vertex-stage push constant range. Returns false if Vulkan throws.
+bool ImGuiSystem::createPipelineLayout() {
+  try {
+    // Four floats at offset 0: 2 for scale, 2 for translate
+    vk::PushConstantRange transformRange;
+    transformRange.offset = 0;
+    transformRange.size = sizeof(float) * 4;
+    transformRange.stageFlags = vk::ShaderStageFlagBits::eVertex;
+
+    vk::PipelineLayoutCreateInfo createInfo;
+    createInfo.pushConstantRangeCount = 1;
+    createInfo.pPushConstantRanges = &transformRange;
+    createInfo.setLayoutCount = 1;
+    createInfo.pSetLayouts = &(*descriptorSetLayout);
+
+    pipelineLayout = vk::raii::PipelineLayout(renderer->GetRaiiDevice(), createInfo);
+
+    return true;
+  } catch (const std::exception& error) {
+    std::cerr << "Failed to create pipeline layout: " << error.what() << std::endl;
+    return false;
+  }
+}
+
+// Creates the graphics pipeline that draws ImGui geometry using dynamic
+// rendering. Both shader stages come from the single module
+// "shaders/imgui.spv" (entry points "VSMain" and "PSMain") and the vertex
+// layout mirrors ImDrawVert. Viewport and scissor are dynamic state, depth and
+// stencil tests are disabled, and alpha blending is enabled.
+// Returns true on success; on failure the error is logged to stderr and
+// false is returned.
+bool ImGuiSystem::createPipeline() {
+  try {
+    // One shader module provides both entry points
+    vk::raii::ShaderModule imguiShader = renderer->CreateShaderModule("shaders/imgui.spv");
+
+    const auto makeStage = [&imguiShader](vk::ShaderStageFlagBits stage, const char* entryPoint) {
+      vk::PipelineShaderStageCreateInfo stageInfo;
+      stageInfo.stage = stage;
+      stageInfo.module = *imguiShader;
+      stageInfo.pName = entryPoint;
+      return stageInfo;
+    };
+    const std::array<vk::PipelineShaderStageCreateInfo, 2> stages = {
+      makeStage(vk::ShaderStageFlagBits::eVertex, "VSMain"),
+      makeStage(vk::ShaderStageFlagBits::eFragment, "PSMain")};
+
+    // Vertex input: one interleaved binding holding ImDrawVert records
+    vk::VertexInputBindingDescription vertexBinding;
+    vertexBinding.binding = 0;
+    vertexBinding.stride = sizeof(ImDrawVert);
+    vertexBinding.inputRate = vk::VertexInputRate::eVertex;
+
+    const auto makeAttribute = [](uint32_t location, vk::Format format, size_t byteOffset) {
+      vk::VertexInputAttributeDescription attribute;
+      attribute.binding = 0;
+      attribute.location = location;
+      attribute.format = format;
+      attribute.offset = static_cast<uint32_t>(byteOffset);
+      return attribute;
+    };
+    const std::array<vk::VertexInputAttributeDescription, 3> vertexAttributes = {
+      makeAttribute(0, vk::Format::eR32G32Sfloat, offsetof(ImDrawVert, pos)),
+      makeAttribute(1, vk::Format::eR32G32Sfloat, offsetof(ImDrawVert, uv)),
+      makeAttribute(2, vk::Format::eR8G8B8A8Unorm, offsetof(ImDrawVert, col))};
+
+    vk::PipelineVertexInputStateCreateInfo vertexInput;
+    vertexInput.vertexBindingDescriptionCount = 1;
+    vertexInput.pVertexBindingDescriptions = &vertexBinding;
+    vertexInput.vertexAttributeDescriptionCount = static_cast<uint32_t>(vertexAttributes.size());
+    vertexInput.pVertexAttributeDescriptions = vertexAttributes.data();
+
+    // Input assembly: plain triangle lists
+    vk::PipelineInputAssemblyStateCreateInfo assembly;
+    assembly.topology = vk::PrimitiveTopology::eTriangleList;
+    assembly.primitiveRestartEnable = VK_FALSE;
+
+    // Viewport and scissor: only the counts are fixed, the values are dynamic state
+    vk::PipelineViewportStateCreateInfo viewportInfo;
+    viewportInfo.viewportCount = 1;
+    viewportInfo.pViewports = nullptr;
+    viewportInfo.scissorCount = 1;
+    viewportInfo.pScissors = nullptr;
+
+    std::array<vk::DynamicState, 2> dynamicList = {
+      vk::DynamicState::eViewport,
+      vk::DynamicState::eScissor};
+
+    vk::PipelineDynamicStateCreateInfo dynamicInfo;
+    dynamicInfo.dynamicStateCount = static_cast<uint32_t>(dynamicList.size());
+    dynamicInfo.pDynamicStates = dynamicList.data();
+
+    // Rasterization: filled polygons, no culling
+    vk::PipelineRasterizationStateCreateInfo raster;
+    raster.polygonMode = vk::PolygonMode::eFill;
+    raster.cullMode = vk::CullModeFlagBits::eNone;
+    raster.frontFace = vk::FrontFace::eCounterClockwise;
+    raster.lineWidth = 1.0f;
+    raster.depthClampEnable = VK_FALSE;
+    raster.rasterizerDiscardEnable = VK_FALSE;
+    raster.depthBiasEnable = VK_FALSE;
+
+    // Multisampling: single sample
+    vk::PipelineMultisampleStateCreateInfo msaa;
+    msaa.rasterizationSamples = vk::SampleCountFlagBits::e1;
+    msaa.sampleShadingEnable = VK_FALSE;
+
+    // Depth and stencil: every test and the depth write are switched off
+    vk::PipelineDepthStencilStateCreateInfo depthState;
+    depthState.depthCompareOp = vk::CompareOp::eLessOrEqual;
+    depthState.depthTestEnable = VK_FALSE;
+    depthState.depthWriteEnable = VK_FALSE;
+    depthState.depthBoundsTestEnable = VK_FALSE;
+    depthState.stencilTestEnable = VK_FALSE;
+
+    // Color blending: source-alpha blending on the single color attachment
+    vk::PipelineColorBlendAttachmentState blendAttachment;
+    blendAttachment.blendEnable = VK_TRUE;
+    blendAttachment.colorBlendOp = vk::BlendOp::eAdd;
+    blendAttachment.srcColorBlendFactor = vk::BlendFactor::eSrcAlpha;
+    blendAttachment.dstColorBlendFactor = vk::BlendFactor::eOneMinusSrcAlpha;
+    blendAttachment.alphaBlendOp = vk::BlendOp::eAdd;
+    blendAttachment.srcAlphaBlendFactor = vk::BlendFactor::eOneMinusSrcAlpha;
+    blendAttachment.dstAlphaBlendFactor = vk::BlendFactor::eZero;
+    blendAttachment.colorWriteMask =
+        vk::ColorComponentFlagBits::eR | vk::ColorComponentFlagBits::eG |
+        vk::ColorComponentFlagBits::eB | vk::ColorComponentFlagBits::eA;
+
+    vk::PipelineColorBlendStateCreateInfo blendState;
+    blendState.logicOpEnable = VK_FALSE;
+    blendState.attachmentCount = 1;
+    blendState.pAttachments = &blendAttachment;
+
+    // Dynamic rendering: attachment formats are supplied through the pNext chain
+    const vk::Format depthAttachmentFormat = renderer->findDepthFormat();
+    const vk::Format swapchainFormat = renderer->GetSwapChainImageFormat(); // the actual swapchain format
+
+    vk::PipelineRenderingCreateInfo dynamicRendering;
+    dynamicRendering.colorAttachmentCount = 1;
+    dynamicRendering.pColorAttachmentFormats = &swapchainFormat;
+    dynamicRendering.depthAttachmentFormat = depthAttachmentFormat;
+
+    vk::GraphicsPipelineCreateInfo createInfo;
+    createInfo.pNext = &dynamicRendering;
+    createInfo.stageCount = static_cast<uint32_t>(stages.size());
+    createInfo.pStages = stages.data();
+    createInfo.pVertexInputState = &vertexInput;
+    createInfo.pInputAssemblyState = &assembly;
+    createInfo.pViewportState = &viewportInfo;
+    createInfo.pRasterizationState = &raster;
+    createInfo.pMultisampleState = &msaa;
+    createInfo.pDepthStencilState = &depthState;
+    createInfo.pColorBlendState = &blendState;
+    createInfo.pDynamicState = &dynamicInfo;
+    createInfo.layout = *pipelineLayout;
+    createInfo.basePipelineHandle = nullptr;
+
+    const vk::raii::Device& raiiDevice = renderer->GetRaiiDevice();
+    pipeline = vk::raii::Pipeline(raiiDevice, nullptr, createInfo);
+    return true;
+  } catch (const std::exception& error) {
+    std::cerr << "Failed to create graphics pipeline: " << error.what() << std::endl;
+    return false;
+  }
+}
+
+// Uploads the current ImGui draw data into the vertex and index buffers that
+// belong to `frameIndex`, re-creating a buffer (host-visible, host-coherent
+// memory) whenever the draw data needs more elements than the recorded
+// capacity for that frame.
+//
+// Does nothing when there is no draw data, no command list, or when
+// `frameIndex` has no per-frame slot. Failures are logged to stderr and are
+// not propagated to the caller.
+void ImGuiSystem::updateBuffers(uint32_t frameIndex) {
+  ImDrawData* drawData = ImGui::GetDrawData();
+  if (!drawData || drawData->CmdListsCount == 0) {
+    return;
+  }
+
+  // Safety: there is no per-frame slot for this index
+  if (frameIndex >= vertexCounts.size())
+    return;
+
+  try {
+    const vk::raii::Device& device = renderer->GetRaiiDevice();
+
+    // Calculate required buffer sizes
+    const vk::DeviceSize vertexBufferSize = drawData->TotalVtxCount * sizeof(ImDrawVert);
+    const vk::DeviceSize indexBufferSize = drawData->TotalIdxCount * sizeof(ImDrawIdx);
+
+    // Replaces one buffer/memory pair with a freshly allocated one of `size` bytes.
+    // The recorded capacity is zeroed *before* anything that can throw, so a
+    // failed allocation can never leave a null buffer that still claims its old
+    // capacity (which would let a later, smaller frame map null memory).
+    const auto recreateBuffer = [&](vk::raii::Buffer& buffer, vk::raii::DeviceMemory& memory, auto& capacity,
+                                    vk::DeviceSize size, vk::BufferUsageFlags usage, int elementCount) {
+      // Clean up old buffer
+      buffer = vk::raii::Buffer(nullptr);
+      memory = vk::raii::DeviceMemory(nullptr);
+      capacity = 0;
+
+      vk::BufferCreateInfo bufferInfo;
+      bufferInfo.size = size;
+      bufferInfo.usage = usage;
+      bufferInfo.sharingMode = vk::SharingMode::eExclusive;
+
+      buffer = vk::raii::Buffer(device, bufferInfo);
+
+      const vk::MemoryRequirements memRequirements = buffer.getMemoryRequirements();
+
+      vk::MemoryAllocateInfo allocInfo;
+      allocInfo.allocationSize = memRequirements.size;
+      allocInfo.memoryTypeIndex = renderer->FindMemoryType(memRequirements.memoryTypeBits,
+                                                           vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+
+      memory = vk::raii::DeviceMemory(device, allocInfo);
+      buffer.bindMemory(*memory, 0);
+
+      // Only record the new capacity once the buffer is fully usable
+      capacity = elementCount;
+    };
+
+    // Resize buffers if needed for this frame
+    if (static_cast<uint32_t>(drawData->TotalVtxCount) > vertexCounts[frameIndex]) {
+      recreateBuffer(vertexBuffers[frameIndex], vertexBufferMemories[frameIndex], vertexCounts[frameIndex],
+                     vertexBufferSize, vk::BufferUsageFlagBits::eVertexBuffer, drawData->TotalVtxCount);
+    }
+
+    if (static_cast<uint32_t>(drawData->TotalIdxCount) > indexCounts[frameIndex]) {
+      recreateBuffer(indexBuffers[frameIndex], indexBufferMemories[frameIndex], indexCounts[frameIndex],
+                     indexBufferSize, vk::BufferUsageFlagBits::eIndexBuffer, drawData->TotalIdxCount);
+    }
+
+    // Upload data to buffers for this frame (only if we have data to upload).
+    // Each memory object is mapped, filled and unmapped on its own: nothing
+    // between map and unmap can throw, so a failing second map can no longer
+    // leave the first allocation mapped.
+    if (drawData->TotalVtxCount > 0 && drawData->TotalIdxCount > 0) {
+      auto* vtxDst = static_cast<ImDrawVert*>(vertexBufferMemories[frameIndex].mapMemory(0, vertexBufferSize));
+      for (int n = 0; n < drawData->CmdListsCount; n++) {
+        const ImDrawList* cmdList = drawData->CmdLists[n];
+        memcpy(vtxDst, cmdList->VtxBuffer.Data, cmdList->VtxBuffer.Size * sizeof(ImDrawVert));
+        vtxDst += cmdList->VtxBuffer.Size;
+      }
+      vertexBufferMemories[frameIndex].unmapMemory();
+
+      auto* idxDst = static_cast<ImDrawIdx*>(indexBufferMemories[frameIndex].mapMemory(0, indexBufferSize));
+      for (int n = 0; n < drawData->CmdListsCount; n++) {
+        const ImDrawList* cmdList = drawData->CmdLists[n];
+        memcpy(idxDst, cmdList->IdxBuffer.Data, cmdList->IdxBuffer.Size * sizeof(ImDrawIdx));
+        idxDst += cmdList->IdxBuffer.Size;
+      }
+      indexBufferMemories[frameIndex].unmapMemory();
+    }
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to update buffers: " << e.what() << std::endl;
+  }
+}
\ No newline at end of file
diff --git a/attachments/advanced_gltf/install_dependencies_linux.sh b/attachments/advanced_gltf/install_dependencies_linux.sh
new file mode 100755
index 000000000..fe2157ee7
--- /dev/null
+++ b/attachments/advanced_gltf/install_dependencies_linux.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+# Copyright (c) 2026 Holochip Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Installs all dependencies for the Advanced glTF tutorial on Linux.
+# The actual package installation is delegated to the simple_engine install
+# script (which handles glm, GLFW, OpenAL, tinygltf, KTX, etc.). Afterwards
+# this script only prints a reminder that JoltPhysics is fetched automatically
+# by CMake via FetchContent, followed by the build instructions.
+set -euo pipefail
+
+# Locate the simple_engine installer relative to this script's own directory.
+script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+engine_installer="${script_dir}/../simple_engine/install_dependencies_linux.sh"
+
+[[ -f "${engine_installer}" ]] || {
+    printf '%s\n' "Error: simple_engine install script not found at ${engine_installer}" >&2
+    exit 1
+}
+
+printf '%s\n' "=== Installing simple_engine dependencies ==="
+bash "${engine_installer}"
+
+# Closing summary: one printf argument per output line.
+printf '%s\n' \
+    "" \
+    "=== Advanced glTF tutorial additional dependencies ===" \
+    "JoltPhysics v5.2.0 is fetched automatically by CMake (FetchContent)." \
+    "No additional manual installation is required." \
+    "" \
+    "Build instructions:" \
+    "  cd attachments/advanced_gltf" \
+    "  mkdir build && cd build" \
+    "  cmake .." \
+    "  cmake --build . --parallel"
diff --git a/attachments/advanced_gltf/install_dependencies_windows.bat b/attachments/advanced_gltf/install_dependencies_windows.bat
new file mode 100644
index 000000000..e90986e97
--- /dev/null
+++ b/attachments/advanced_gltf/install_dependencies_windows.bat
@@ -0,0 +1,48 @@
+@echo off
+rem Copyright (c) 2026 Holochip Corporation
+rem
+rem SPDX-License-Identifier: Apache-2.0
+rem
+rem Licensed under the Apache License, Version 2.0 (the "License");
+rem you may not use this file except in compliance with the License.
+rem You may obtain a copy of the License at
+rem
+rem     http://www.apache.org/licenses/LICENSE-2.0
+rem
+rem Unless required by applicable law or agreed to in writing, software
+rem distributed under the License is distributed on an "AS IS" BASIS,
+rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+rem See the License for the specific language governing permissions and
+rem limitations under the License.
+
+rem Installs all dependencies for the Advanced glTF tutorial on Windows.
+rem Delegates to the simple_engine install script (which uses vcpkg for glm,
+rem GLFW, OpenAL, tinygltf, KTX, etc.) then notes that JoltPhysics is fetched
+rem automatically by CMake via FetchContent.
+rem Exits with code 1 when the simple_engine script is missing or fails.
+setlocal enabledelayedexpansion
+
+rem Quoted assignments keep stray trailing spaces and special characters
+rem out of the stored paths.
+set "SCRIPT_DIR=%~dp0"
+set "SE_SCRIPT=%SCRIPT_DIR%..\simple_engine\install_dependencies_windows.bat"
+
+rem The path is printed with delayed expansion inside the block below. With
+rem percent expansion a closing parenthesis in the checkout path, as found in
+rem directories such as Program Files x86, would end the block early.
+if not exist "%SE_SCRIPT%" (
+    echo Error: simple_engine install script not found at !SE_SCRIPT! >&2
+    exit /b 1
+)
+
+echo === Installing simple_engine dependencies ===
+call "%SE_SCRIPT%"
+if errorlevel 1 (
+    echo Error: simple_engine dependency installation failed. >&2
+    exit /b 1
+)
+
+echo.
+echo === Advanced glTF tutorial additional dependencies ===
+echo JoltPhysics v5.2.0 is fetched automatically by CMake (FetchContent^).
+echo No additional manual installation is required.
+echo.
+echo Build instructions:
+echo   cd attachments\advanced_gltf
+echo   mkdir build ^&^& cd build
+echo   cmake ..
+echo   cmake --build . --parallel
diff --git a/attachments/advanced_gltf/main.cpp b/attachments/advanced_gltf/main.cpp
new file mode 100644
index 000000000..d0a2129ca
--- /dev/null
+++ b/attachments/advanced_gltf/main.cpp
@@ -0,0 +1,187 @@
+/* Copyright (c) 2025 Holochip Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "camera_component.h"
+#include "crash_reporter.h"
+#include "engine.h"
+#include "scene_loading.h"
+#include "transform_component.h"
+#include "tutorial_demo.h"
+
+#include <chrono>
+#include <exception>
+#include <iostream>
+#include <stdexcept>
+#include <thread>
+
+
+// Constants
+constexpr int WINDOW_WIDTH  = 800;
+constexpr int WINDOW_HEIGHT = 600;
+#if defined(NDEBUG)
+constexpr bool ENABLE_VALIDATION_LAYERS = false;
+#else
+constexpr bool ENABLE_VALIDATION_LAYERS = true;
+#endif
+
+/**
+ * @brief Set up a simple scene with a camera and some objects.
+ *
+ * Creates the "Camera" entity (transform + camera component) and makes it the
+ * active camera, adds a "TutorialDemo" entity, then starts a detached
+ * background thread that loads the bistro scene followed by the Fox and the
+ * AnimatedMorphCube models.
+ *
+ * @param engine The engine to set up the scene in. The pointer is captured by
+ *               the detached loader thread, so it is used after this function
+ *               has returned.
+ * @throws std::runtime_error if the camera entity cannot be created.
+ */
+void SetupScene(Engine *engine)
+{
+	// Create a camera entity
+	Entity *cameraEntity = engine->CreateEntity("Camera");
+	if (!cameraEntity)
+	{
+		throw std::runtime_error("Failed to create camera entity");
+	}
+
+	// Add a transform component to the camera
+	auto *cameraTransform = cameraEntity->AddComponent<TransformComponent>();
+	cameraTransform->SetPosition(glm::vec3(0.0f, 0.0f, 3.0f));
+
+	// Add a camera component to the camera entity
+	auto *camera = cameraEntity->AddComponent<CameraComponent>();
+	camera->SetAspectRatio(static_cast<float>(WINDOW_WIDTH) / static_cast<float>(WINDOW_HEIGHT));
+
+	// Set the camera as the active camera
+	engine->SetActiveCamera(camera);
+
+	// Add the TutorialDemoComponent to a dedicated entity
+	if (Entity *tutorialDemo = engine->CreateEntity("TutorialDemo"))
+	{
+		tutorialDemo->AddComponent<TutorialDemoComponent>(engine);
+	}
+
+	// Kick off GLTF model loading on a background thread so the main loop
+	// can start and render the UI/progress bar while the scene is being
+	// constructed. Engine::Update will avoid updating entities while
+	// loading is in progress to prevent data races.
+	if (auto *renderer = engine->GetRenderer())
+	{
+		renderer->SetLoading(true);
+		renderer->SetLoadingPhase(Renderer::LoadingPhase::Textures);
+	}
+	std::thread([engine] {
+		// An exception that leaves a thread's entry function calls std::terminate,
+		// so every failure of the background load is caught and reported here.
+		try
+		{
+			LoadGLTFModel(engine, "../../Assets/bistro/bistro.gltf");
+
+			// Give the Bistro a moment to settle and position the camera
+			std::this_thread::sleep_for(std::chrono::milliseconds(500));
+
+			// Position the animated models relative to the camera AFTER the bistro's glTF
+			// camera has repositioned it (that happens synchronously during the bistro load
+			// above). We sample the camera pose ONCE here and bake fixed world positions into
+			// the entities, so the models stay put in the scene and do NOT follow the camera
+			// as the user looks around afterwards.
+			glm::vec3 foxPos(0.0f, 0.0f, 0.0f);
+			glm::vec3 cubePos(2.0f, 0.0f, 0.0f);
+			if (Entity *loadedCamera = engine->GetEntity("Camera"))
+			{
+				if (auto *transform = loadedCamera->GetComponent<TransformComponent>())
+				{
+					const glm::vec3 camPos = transform->GetPosition();
+					// TransformComponent stores radians; glm::quat(vec3) expects radians.
+					const glm::quat q       = glm::quat(transform->GetRotation());
+					const glm::vec3 forward = q * glm::vec3(0.0f, 0.0f, -1.0f);
+					const glm::vec3 right   = q * glm::vec3(1.0f, 0.0f, 0.0f);
+					// Fox a few metres in front of the camera; cube just to its right so both
+					// animated models are framed together.
+					foxPos  = camPos + forward * 5.0f;
+					cubePos = foxPos + right * 2.0f;
+					std::cout << "[Spawn] Fox at " << foxPos.x << ", " << foxPos.y << ", " << foxPos.z << std::endl;
+				}
+			}
+
+			// Load the Fox character. The Khronos Fox model is authored in centimetre-scale
+			// units (~140 units tall), so at unit scale it dwarfs the metre-scale bistro.
+			// Scale by 0.01 to bring it to a realistic ~1.4 m.
+			LoadGLTFModel(engine, "../assets/Fox/glTF/Fox.gltf", foxPos, glm::vec3(0.0f, 0.0f, 0.0f), glm::vec3(0.01f), 0.0f);
+
+			// Also load the Morph Cube, just beside the Fox. Unit scale keeps it proportionate
+			// to the ~1.4 m Fox.
+			LoadGLTFModel(engine, "../assets/AnimatedMorphCube/glTF/AnimatedMorphCube.gltf", cubePos, glm::vec3(0.0f), glm::vec3(1.0f), 0.0f);
+		}
+		catch (const std::exception &e)
+		{
+			// NOTE(review): the renderer's loading state set above is not reset here;
+			// confirm whether LoadGLTFModel or the renderer clears it after a failure.
+			std::cerr << "Background scene loading failed: " << e.what() << std::endl;
+		}
+		catch (...)
+		{
+			std::cerr << "Background scene loading failed: unknown exception" << std::endl;
+		}
+	}).detach();
+}
+
+#if defined(PLATFORM_ANDROID)
+/**
+ * @brief Android entry point: initializes the engine, builds the scene and runs it.
+ *
+ * Any std::exception raised along the way is reported through LOGE.
+ *
+ * @param app The Android app.
+ */
+void android_main(android_app *app)
+{
+	try
+	{
+		Engine engine;
+
+		// Bring the engine up; a failed initialization is reported as an exception
+		const bool initialized = engine.InitializeAndroid(app, "Simple Engine", ENABLE_VALIDATION_LAYERS);
+		if (!initialized)
+		{
+			throw std::runtime_error("Failed to initialize engine");
+		}
+
+		// Populate the scene, then hand control to the engine loop
+		SetupScene(&engine);
+		engine.RunAndroid();
+	}
+	catch (const std::exception &error)
+	{
+		LOGE("Exception: %s", error.what());
+	}
+}
+#else
+/**
+ * @brief Desktop entry point: initializes crash reporting and the engine,
+ *        builds the scene and runs the main loop.
+ * @return 0 after a normal run, 1 if a std::exception was caught.
+ */
+int main(int, char *[])
+{
+	int exitCode = 0;
+
+	try
+	{
+		// Enable minidump generation for Release-only crashes (e.g., stack cookie failures / fast-fail).
+		// Writes dumps under the current working directory (the build/run directory).
+		CrashReporter::GetInstance().Initialize("crashes", "SimpleEngine", "1.0.0");
+
+		Engine engine;
+
+		const bool initialized = engine.Initialize("Simple Engine", WINDOW_WIDTH, WINDOW_HEIGHT, ENABLE_VALIDATION_LAYERS);
+		if (!initialized)
+		{
+			throw std::runtime_error("Failed to initialize engine");
+		}
+
+		auto *renderer = engine.GetRenderer();
+		if (renderer != nullptr)
+		{
+			renderer->SetRayQueryStaticOnly(false);
+		}
+
+		// Populate the scene, then run until the engine loop returns
+		SetupScene(&engine);
+		engine.Run();
+
+		CrashReporter::GetInstance().Cleanup();
+	}
+	catch (const std::exception &error)
+	{
+		std::cerr << "Exception: " << error.what() << std::endl;
+		CrashReporter::GetInstance().Cleanup();
+		exitCode = 1;
+	}
+
+	return exitCode;
+}
+#endif
diff --git a/attachments/advanced_gltf/mesh_component.cpp b/attachments/advanced_gltf/mesh_component.cpp
new file mode 100644
index 000000000..57daa29c8
--- /dev/null
+++ b/attachments/advanced_gltf/mesh_component.cpp
@@ -0,0 +1,202 @@
+/* Copyright (c) 2025 Holochip Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "mesh_component.h"
+#include "model_loader.h"
+#include <cmath>
+#include <limits>
+
+// Transforms the axis-aligned box [localMin, localMax] by M and writes the
+// axis-aligned box enclosing the result to outMin/outMax. The box is handled
+// in center/half-extent form: the center goes through the full matrix, the
+// half-extents through the component-wise absolute value of its upper 3x3.
+static void transformAABBLocal(const glm::mat4 &M,
+                               const glm::vec3 &localMin,
+                               const glm::vec3 &localMax,
+                               glm::vec3       &outMin,
+                               glm::vec3       &outMax)
+{
+	const glm::vec3 center      = 0.5f * (localMin + localMax);
+	const glm::vec3 halfExtents = 0.5f * (localMax - localMin);
+
+	// Upper-left 3x3 of M with every column replaced by its absolute value
+	glm::mat3 absLinear = glm::mat3(M);
+	for (int column = 0; column < 3; ++column)
+	{
+		absLinear[column] = glm::abs(absLinear[column]);
+	}
+
+	const glm::vec3 movedCenter  = glm::vec3(M * glm::vec4(center, 1.0f));
+	const glm::vec3 movedExtents = absLinear * halfExtents;
+
+	outMin = movedCenter - movedExtents;
+	outMax = movedCenter + movedExtents;
+}
+
+// Computes the bounding box of the raw vertex positions and caches it in
+// meshAABBMin/meshAABBMax. Returns immediately while the cached box is
+// flagged valid. An empty vertex list yields a zero box flagged invalid.
+void MeshComponent::RecomputeMeshAABB()
+{
+	if (meshAABBValid)
+	{
+		return;        // cached result is still flagged valid
+	}
+
+	if (vertices.empty())
+	{
+		meshAABBMin = meshAABBMax = glm::vec3(0.0f);
+		meshAABBValid             = false;
+		return;
+	}
+
+	// Seed both bounds with the first vertex, then grow them over the rest
+	glm::vec3 lower = vertices.front().position;
+	glm::vec3 upper = lower;
+	for (size_t i = 1; i < vertices.size(); ++i)
+	{
+		const glm::vec3 &p = vertices[i].position;
+		lower              = glm::min(lower, p);
+		upper              = glm::max(upper, p);
+	}
+
+	meshAABBMin   = lower;
+	meshAABBMax   = upper;
+	meshAABBValid = true;
+}
+
+// Recomputes the component's local bounding box. Without instances it equals
+// the mesh box; with instances it is the union of the mesh box transformed by
+// every instance's model matrix. When no valid mesh box exists the local box
+// is zeroed and flagged invalid.
+void MeshComponent::RecomputeLocalAABB()
+{
+	// Make sure the base mesh box is available first
+	RecomputeMeshAABB();
+
+	if (!meshAABBValid)
+	{
+		localAABBMin = localAABBMax = glm::vec3(0.0f);
+		localAABBValid              = false;
+		return;
+	}
+
+	if (instances.empty())
+	{
+		// No instances: the local box is just the mesh box
+		localAABBMin   = meshAABBMin;
+		localAABBMax   = meshAABBMax;
+		localAABBValid = true;
+		return;
+	}
+
+	// Start from an inverted box so the first instance initializes both bounds
+	glm::vec3 unionMin(std::numeric_limits<float>::max());
+	glm::vec3 unionMax(-std::numeric_limits<float>::max());
+
+	for (const auto &instance : instances)
+	{
+		glm::vec3 lower, upper;
+		transformAABBLocal(instance.modelMatrix, meshAABBMin, meshAABBMax, lower, upper);
+		unionMin = glm::min(unionMin, lower);
+		unionMax = glm::max(unionMax, upper);
+	}
+
+	localAABBMin   = unionMin;
+	localAABBMax   = unionMax;
+	localAABBValid = true;
+}
+
+// Most of the MeshComponent class implementation is in the header file
+// This file is mainly for any methods that might need additional implementation
+
+// Replaces the mesh with a UV sphere centered at the origin.
+//
+// radius   - sphere radius.
+// color    - not used by this implementation; no per-vertex color is written.
+// segments - number of subdivisions along both latitude and longitude.
+//            Values below 1 leave the mesh empty, because the angle and
+//            texture-coordinate computations divide by this value.
+//
+// The cached bounding boxes are invalidated and recomputed afterwards.
+void MeshComponent::CreateSphere(float radius, const glm::vec3 &color, int segments)
+{
+	vertices.clear();
+	indices.clear();
+
+	// The geometry is being replaced: drop the cached mesh AABB, otherwise
+	// RecomputeMeshAABB() returns early and keeps the bounds of the old mesh.
+	meshAABBValid = false;
+
+	if (segments < 1)
+	{
+		RecomputeLocalAABB();
+		return;
+	}
+
+	const auto ringSize = static_cast<size_t>(segments) + 1;
+	vertices.reserve(ringSize * ringSize);
+	indices.reserve(static_cast<size_t>(segments) * static_cast<size_t>(segments) * 6);
+
+	// Generate sphere vertices using parametric equations
+	for (int lat = 0; lat <= segments; ++lat)
+	{
+		const auto  theta    = static_cast<float>(lat * M_PI / segments);        // Latitude angle (0 to PI)
+		const float sinTheta = sinf(theta);
+		const float cosTheta = cosf(theta);
+
+		for (int lon = 0; lon <= segments; ++lon)
+		{
+			const auto  phi    = static_cast<float>(lon * 2.0 * M_PI / segments);        // Longitude angle (0 to 2*PI)
+			const float sinPhi = sinf(phi);
+			const float cosPhi = cosf(phi);
+
+			// Calculate position
+			glm::vec3 position = {
+			    radius * sinTheta * cosPhi,
+			    radius * cosTheta,
+			    radius * sinTheta * sinPhi};
+
+			// Normal is the same as normalized position for a sphere centered at origin
+			glm::vec3 normal = glm::normalize(position);
+
+			// Texture coordinates
+			const glm::vec2 texCoord = {
+			    static_cast<float>(lon) / static_cast<float>(segments),
+			    static_cast<float>(lat) / static_cast<float>(segments)};
+
+			// Calculate tangent (derivative with respect to longitude). Handle poles robustly.
+			glm::vec3 tangent = {
+			    -sinTheta * sinPhi,
+			    0.0f,
+			    sinTheta * cosPhi};
+			float len2 = glm::dot(tangent, tangent);
+			if (len2 < 1e-12f)
+			{
+				// At poles sinTheta ~ 0 -> fallback tangent orthogonal to normal
+				glm::vec3 t = glm::cross(normal, glm::vec3(0.0f, 0.0f, 1.0f));
+				if (glm::length(t) < 1e-12f)
+				{
+					t = glm::cross(normal, glm::vec3(1.0f, 0.0f, 0.0f));
+				}
+				tangent = glm::normalize(t);
+			}
+			else
+			{
+				tangent = glm::normalize(tangent);
+			}
+
+			vertices.push_back({position,
+			                    normal,
+			                    texCoord,
+			                    glm::vec4(tangent, 1.0f)});
+		}
+	}
+
+	// Generate indices for triangles
+	for (int lat = 0; lat < segments; ++lat)
+	{
+		for (int lon = 0; lon < segments; ++lon)
+		{
+			const int current = lat * (segments + 1) + lon;
+			const int next    = current + segments + 1;
+
+			// Create two triangles for each quad
+			indices.push_back(current);
+			indices.push_back(next);
+			indices.push_back(current + 1);
+
+			indices.push_back(current + 1);
+			indices.push_back(next);
+			indices.push_back(next + 1);
+		}
+	}
+
+	RecomputeLocalAABB();
+}
+
+// Copies the vertex and index data of `model` into this component and
+// refreshes the cached bounding boxes. A null `model` is ignored.
+void MeshComponent::LoadFromModel(const Model *model)
+{
+	if (!model)
+	{
+		return;
+	}
+
+	// Copy vertex and index data from the model
+	vertices = model->GetVertices();
+	indices  = model->GetIndices();
+
+	// The geometry changed: drop the cached mesh AABB, otherwise
+	// RecomputeMeshAABB() returns early and keeps the bounds of the old mesh.
+	meshAABBValid = false;
+
+	RecomputeLocalAABB();
+}
diff --git a/attachments/advanced_gltf/model_loader.cpp b/attachments/advanced_gltf/model_loader.cpp
new file mode 100644
index 000000000..128548171
--- /dev/null
+++ b/attachments/advanced_gltf/model_loader.cpp
@@ -0,0 +1,2215 @@
+/* Copyright (c) 2025 Holochip Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <algorithm>
+#include <cctype>
+#include <filesystem>
+#include <iostream>
+#include <numeric>
+#include <set>
+#include <tiny_gltf.h>
+#define STB_IMAGE_IMPLEMENTATION
+#include <stb_image.h>
+#include "mikktspace.h"
+#include <ktx.h>
+#include <shared_mutex>
+#include <mutex>
+#include <chrono>
+#include <vector>
+#include <unordered_map>
+#include <string>
+
+#include "model_loader.h"
+#include "mesh_component.h"
+#include "renderer.h"
+#include "renderer_advanced_types.h"
+
+// Returns the advanced data record stored for `model` in g_modelData,
+// creating the entry on first use. g_advancedStateMutex is held exclusively
+// for the lookup only; it is released before the reference is returned.
+AdvancedModelData& GetAdvancedModelData(const Model* model) {
+    const std::lock_guard<std::shared_mutex> exclusiveGuard(g_advancedStateMutex);
+    AdvancedModelData& record = g_modelData[model];
+    return record;
+}
+
+// Rebuilds the skin list stored in the advanced data of `model` from the
+// skins of `gltfModel`.
+//
+// For every glTF skin the name, skeleton root and joint list are copied. When
+// the skin references inverse bind matrices, they are read from the
+// referenced accessor after validating it: the accessor, buffer view and
+// buffer indices must be in range, the accessor must hold float MAT4 values,
+// and the data (honouring the buffer view's byte stride) must lie inside the
+// buffer. A skin whose matrices fail validation is still added, with an empty
+// matrix list, and a message is written to stderr.
+//
+// `loader` is not used by this function.
+void AdvancedModel_ProcessSkins(ModelLoader* loader, const tinygltf::Model& gltfModel, Model* model) {
+    auto& advanced = GetAdvancedModelData(model);
+    advanced.skins.clear();
+    for (const auto& gltfSkin : gltfModel.skins) {
+        Skin skin;
+        skin.name = gltfSkin.name;
+        skin.skeletonRoot = gltfSkin.skeleton;
+        skin.joints = gltfSkin.joints;
+
+        if (gltfSkin.inverseBindMatrices > -1) {
+            // Set to a description of the first validation failure, if any
+            const char* problem = nullptr;
+            const auto accessorIndex = static_cast<size_t>(gltfSkin.inverseBindMatrices);
+
+            if (accessorIndex >= gltfModel.accessors.size()) {
+                problem = "accessor index out of range";
+            } else {
+                const auto& accessor = gltfModel.accessors[accessorIndex];
+                if (accessor.type != TINYGLTF_TYPE_MAT4 || accessor.componentType != TINYGLTF_COMPONENT_TYPE_FLOAT) {
+                    problem = "accessor does not hold float MAT4 values";
+                } else if (accessor.bufferView < 0 || static_cast<size_t>(accessor.bufferView) >= gltfModel.bufferViews.size()) {
+                    // A negative index means the accessor has no backing buffer view
+                    problem = "accessor has no valid buffer view";
+                } else {
+                    const auto& bufferView = gltfModel.bufferViews[accessor.bufferView];
+                    if (bufferView.buffer < 0 || static_cast<size_t>(bufferView.buffer) >= gltfModel.buffers.size()) {
+                        problem = "buffer view has no valid buffer";
+                    } else {
+                        const auto& buffer = gltfModel.buffers[bufferView.buffer];
+                        // A byte stride of 0 means the matrices are tightly packed
+                        const size_t stride = bufferView.byteStride != 0 ? bufferView.byteStride : sizeof(glm::mat4);
+                        const size_t start = accessor.byteOffset + bufferView.byteOffset;
+                        const size_t count = accessor.count;
+                        const size_t required = count == 0 ? 0 : start + (count - 1) * stride + sizeof(glm::mat4);
+
+                        if (stride < sizeof(glm::mat4) || required > buffer.data.size()) {
+                            problem = "matrix data does not fit inside the buffer";
+                        } else {
+                            skin.inverseBindMatrices.resize(count);
+                            for (size_t i = 0; i < count; ++i) {
+                                std::memcpy(&skin.inverseBindMatrices[i], buffer.data.data() + start + i * stride, sizeof(glm::mat4));
+                            }
+                        }
+                    }
+                }
+            }
+
+            if (problem) {
+                std::cerr << "AdvancedModel_ProcessSkins: skipping inverse bind matrices of skin '" << gltfSkin.name
+                          << "': " << problem << std::endl;
+            }
+        }
+        advanced.skins.push_back(std::move(skin));
+    }
+}
+// Bridge between the MikkTSpace callbacks defined below and our C++ mesh data:
+// it holds pointers to the vertex and index arrays the callbacks operate on.
+// It's passed via the m_pUserData pointer.
+struct MikkTSpaceInterface {
+  std::vector<Vertex>* vertices;   // vertices read by the callbacks; tangents are written back into them
+  std::vector<uint32_t>* indices;  // index list, consumed three entries per face
+};
+
+// These static callback functions are required by the MikkTSpace library.
+// They are defined here at file-scope so they are not part of the ModelLoader class.
+// MikkTSpace callback: number of faces, i.e. the index count divided by three.
+static int getNumFaces(const SMikkTSpaceContext* pContext) {
+  const auto* mesh = static_cast<const MikkTSpaceInterface*>(pContext->m_pUserData);
+  const size_t faceCount = mesh->indices->size() / 3;
+  return static_cast<int>(faceCount);
+}
+
+// MikkTSpace callback: vertex count of face iFace. Always 3; neither argument
+// is consulted.
+static int getNumVerticesOfFace(const SMikkTSpaceContext* pContext, const int iFace) {
+  return 3;
+}
+
+// MikkTSpace callback: writes the position (x, y, z) of corner iVert of face
+// iFace into fvPosOut.
+static void getPosition(const SMikkTSpaceContext* pContext, float fvPosOut[], const int iFace, const int iVert) {
+  const auto* mesh = static_cast<const MikkTSpaceInterface*>(pContext->m_pUserData);
+  const std::vector<uint32_t>& faceIndices = *mesh->indices;
+  const uint32_t vertexIndex = faceIndices[iFace * 3 + iVert];
+  const glm::vec3& position = (*mesh->vertices)[vertexIndex].position;
+  for (int axis = 0; axis < 3; ++axis) {
+    fvPosOut[axis] = position[axis];
+  }
+}
+
+// MikkTSpace callback: writes the normal (x, y, z) of corner iVert of face
+// iFace into fvNormOut.
+static void getNormal(const SMikkTSpaceContext* pContext, float fvNormOut[], const int iFace, const int iVert) {
+  const auto* mesh = static_cast<const MikkTSpaceInterface*>(pContext->m_pUserData);
+  const std::vector<uint32_t>& faceIndices = *mesh->indices;
+  const uint32_t vertexIndex = faceIndices[iFace * 3 + iVert];
+  const glm::vec3& normal = (*mesh->vertices)[vertexIndex].normal;
+  for (int axis = 0; axis < 3; ++axis) {
+    fvNormOut[axis] = normal[axis];
+  }
+}
+
+// MikkTSpace callback: writes the texture coordinate (u, v) of corner iVert
+// of face iFace into fvTexcOut.
+static void getTexCoord(const SMikkTSpaceContext* pContext, float fvTexcOut[], const int iFace, const int iVert) {
+  const auto* mesh = static_cast<const MikkTSpaceInterface*>(pContext->m_pUserData);
+  const std::vector<uint32_t>& faceIndices = *mesh->indices;
+  const uint32_t vertexIndex = faceIndices[iFace * 3 + iVert];
+  const glm::vec2& texCoord = (*mesh->vertices)[vertexIndex].texCoord;
+  for (int component = 0; component < 2; ++component) {
+    fvTexcOut[component] = texCoord[component];
+  }
+}
+
+// MikkTSpace callback: stores the tangent computed for corner iVert of face
+// iFace in the referenced vertex. xyz receive the tangent direction and w the
+// handedness sign.
+static void setTSpaceBasic(const SMikkTSpaceContext* pContext, const float fvTangent[], const float fSign, const int iFace, const int iVert) {
+  auto* mesh = static_cast<MikkTSpaceInterface*>(pContext->m_pUserData);
+  const uint32_t vertexIndex = (*mesh->indices)[iFace * 3 + iVert];
+  Vertex& target = (*mesh->vertices)[vertexIndex];
+
+  // Snap the handedness to exactly +/-1 to avoid tiny floating deviations
+  float handedness = -1.0f;
+  if (fSign >= 0.0f) {
+    handedness = 1.0f;
+  }
+
+  target.tangent.x = fvTangent[0];
+  target.tangent.y = fvTangent[1];
+  target.tangent.z = fvTangent[2];
+  target.tangent.w = handedness;
+}
+
+// KTX2 decoding for GLTF images
+#include <ktx.h>
+
+// Helper: load the base image (level 0, layer 0, face 0) of a KTX2 file from
+// disk into a CPU buffer.
+//
+// Textures that need transcoding are transcoded to RGBA32, so the output then
+// holds width * height * 4 bytes. Textures that do not need transcoding are
+// copied as stored.
+// NOTE(review): in the non-transcoded path `channels` is still reported as 4
+// although the stored format is not inspected here - confirm that callers only
+// pass RGBA8 files on that path.
+//
+// filePath - path of the .ktx2 file.
+// outData  - receives the image bytes.
+// width, height, channels - receive the base level dimensions and the channel
+//            count (always 4).
+//
+// Returns true on success. On failure false is returned and none of the
+// output parameters are modified.
+static bool LoadKTX2FileToRGBA(const std::string& filePath, std::vector<uint8_t>& outData, int& width, int& height, int& channels) {
+  ktxTexture2* ktxTex = nullptr;
+  KTX_error_code result = ktxTexture2_CreateFromNamedFile(filePath.c_str(), KTX_TEXTURE_CREATE_LOAD_IMAGE_DATA_BIT, &ktxTex);
+  if (result != KTX_SUCCESS || !ktxTex) {
+    return false;
+  }
+  ktxTexture* baseTex = reinterpret_cast<ktxTexture *>(ktxTex);
+
+  // All decoding happens in one lambda so the texture is destroyed in exactly
+  // one place, whichever step fails.
+  const auto decode = [&]() -> bool {
+    const bool needsTranscode = ktxTexture2_NeedsTranscoding(ktxTex);
+    if (needsTranscode && ktxTexture2_TranscodeBasis(ktxTex, KTX_TTF_RGBA32, 0) != KTX_SUCCESS) {
+      return false;
+    }
+
+    const int baseWidth = static_cast<int>(ktxTex->baseWidth);
+    const int baseHeight = static_cast<int>(ktxTex->baseHeight);
+
+    // The offset is only meaningful when the query succeeds
+    ktx_size_t offset = 0;
+    if (ktxTexture_GetImageOffset(baseTex, 0, 0, 0, &offset) != KTX_SUCCESS) {
+      return false;
+    }
+
+    const uint8_t* textureData = ktxTexture_GetData(baseTex);
+    if (!textureData) {
+      return false;
+    }
+
+    const size_t levelSize = needsTranscode
+                                 ? static_cast<size_t>(baseWidth) * static_cast<size_t>(baseHeight) * 4
+                                 : ktxTexture_GetImageSize(baseTex, 0);
+
+    // Never read past the end of the texture's data block
+    const size_t dataSize = ktxTexture_GetDataSize(baseTex);
+    if (offset > dataSize || levelSize > dataSize - offset) {
+      return false;
+    }
+
+    outData.resize(levelSize);
+    if (levelSize > 0) {
+      std::memcpy(outData.data(), textureData + offset, levelSize);
+    }
+    width = baseWidth;
+    height = baseHeight;
+    channels = 4;
+    return true;
+  };
+
+  const bool loaded = decode();
+  ktxTexture_Destroy(baseTex);
+  return loaded;
+}
+
+// Emissive scaling factor to convert from Blender units to engine units
+#define EMISSIVE_SCALE_FACTOR (1.0f / 638.0f)
+#define LIGHT_SCALE_FACTOR (1.0f / 638.0f)
+
+// Destructor: empties the loader's model and material containers.
+ModelLoader::~ModelLoader() {
+  // Models are cleared first, then materials.
+  models.clear();
+  materials.clear();
+}
+
+// Stores the renderer that later texture-loading calls are issued against.
+// Returns true when a renderer was supplied; logs to stderr and returns false when it is null.
+bool ModelLoader::Initialize(Renderer* _renderer) {
+  renderer = _renderer;
+  if (renderer) {
+    return true;
+  }
+
+  std::cerr << "ModelLoader::Initialize: Renderer is null" << std::endl;
+  return false;
+}
+
+// Returns the model for `filename`, parsing the glTF file on first request and serving the
+// stored instance afterwards. Returns nullptr (after logging to stderr) when parsing fails;
+// a failed model is not stored.
+Model* ModelLoader::LoadGLTF(const std::string& filename) {
+  // Serve a previously loaded model straight from the map.
+  if (auto cached = models.find(filename); cached != models.end()) {
+    return cached->second.get();
+  }
+
+  // Build a fresh model and keep a raw handle to it before ownership moves into the map.
+  auto loaded = std::make_unique<Model>(filename);
+  Model* const loadedRaw = loaded.get();
+
+  if (!ParseGLTF(filename, loadedRaw)) {
+    std::cerr << "ModelLoader::LoadGLTF: Failed to parse GLTF file: " << filename << std::endl;
+    return nullptr;
+  }
+
+  // Hand ownership to the map; the raw handle stays valid because the object itself does not move.
+  models[filename] = std::move(loaded);
+  return loadedRaw;
+}
+
+// Looks up an already-loaded model by the key it was stored under.
+// Returns nullptr when no model with that name is present; never triggers a load.
+Model* ModelLoader::GetModel(const std::string& name) {
+  const auto found = models.find(name);
+  return (found == models.end()) ? nullptr : found->second.get();
+}
+
+// Returns a copy of `s` with every character passed through std::tolower (ASCII only).
+static std::string ToLower(const std::string& s) {
+  std::string lowered(s);
+  for (char& ch : lowered) {
+    // Go through unsigned char so negative char values are never handed to std::tolower.
+    const unsigned char asUnsigned = static_cast<unsigned char>(ch);
+    ch = static_cast<char>(std::tolower(asUnsigned));
+  }
+  return lowered;
+}
+
+// Static helper function for decoding glTF images from an in-memory byte range.
+// The parameter list matches the image-loader callback shape used with tinygltf
+// (presumably registered via the loader's SetImageLoader; the registration is not in this block).
+//
+//   image      - receives width/height/component/bits/pixel_type and the decoded bytes.
+//   image_idx  - unused.
+//   err        - optional; receives a message when decoding fails.
+//   warn       - unused.
+//   req_width  - unused.
+//   req_height - unused.
+//   bytes/size - the encoded image data.
+//   user_data  - unused.
+//
+// Decoding order: KTX2 via libktx when the data starts with the KTX2 identifier, otherwise
+// (or when libktx cannot create the texture) stb_image forced to 4 channels.
+// Returns true when `image` was filled, false otherwise.
+static bool LoadKTX2Image(tinygltf::Image* image,
+                          const int image_idx,
+                          std::string* err,
+                          std::string* warn,
+                          int req_width,
+                          int req_height,
+                          const unsigned char* bytes,
+                          int size,
+                          void* user_data) {
+  // 1. Try KTX2 first using libktx (recognized by the 12-byte KTX2 file identifier)
+  if (size >= 12 && std::memcmp(bytes, "\xABKTX 20\xBB\r\n\x1A\n", 12) == 0) {
+    ktxTexture2* ktxTex = nullptr;
+    KTX_error_code result = ktxTexture2_CreateFromMemory(bytes, size, KTX_TEXTURE_CREATE_LOAD_IMAGE_DATA_BIT, &ktxTex);
+    if (result == KTX_SUCCESS && ktxTex) {
+      // The generic ktxTexture view is what the data accessors below operate on.
+      ktxTexture* baseTex = reinterpret_cast<ktxTexture *>(ktxTex);
+
+      const bool needsTranscode = ktxTexture2_NeedsTranscoding(ktxTex);
+      if (needsTranscode) {
+        result = ktxTexture2_TranscodeBasis(ktxTex, KTX_TTF_RGBA32, 0);
+        if (result != KTX_SUCCESS) {
+          if (err)
+            *err = "Failed to transcode KTX2 image: " + std::to_string(result);
+          ktxTexture_Destroy(baseTex);
+          return false;
+        }
+      }
+
+      // Locate the base image (level 0, layer 0, face 0). The offset is only valid when
+      // the query succeeds, so check the result before using it for pointer arithmetic.
+      ktx_size_t offset = 0;
+      result = ktxTexture_GetImageOffset(baseTex, 0, 0, 0, &offset);
+      const uint8_t* texData = ktxTexture_GetData(baseTex);
+      if (result != KTX_SUCCESS || !texData) {
+        if (err)
+          *err = "Failed to locate KTX2 base image: " + std::to_string(result);
+        ktxTexture_Destroy(baseTex);
+        return false;
+      }
+
+      image->width = static_cast<int>(ktxTex->baseWidth);
+      image->height = static_cast<int>(ktxTex->baseHeight);
+      image->component = 4;
+      image->bits = 8;
+      image->pixel_type = TINYGLTF_COMPONENT_TYPE_UNSIGNED_BYTE;
+
+      const uint8_t* levelData = texData + offset;
+      // Transcoded data is tightly packed RGBA8; otherwise ask libktx for the stored image size.
+      const size_t levelSize = needsTranscode ? static_cast<size_t>(image->width) * static_cast<size_t>(image->height) * 4 : ktxTexture_GetImageSize(baseTex, 0);
+      image->image.resize(levelSize);
+      if (levelSize > 0) {
+        std::memcpy(image->image.data(), levelData, levelSize);
+      }
+      ktxTexture_Destroy(baseTex);
+      return true;
+    }
+  }
+
+  // 2. Fallback to stb_image for common formats (PNG, JPEG, etc.)
+  int w, h, comp;
+  unsigned char* data = stbi_load_from_memory(bytes, size, &w, &h, &comp, 4);
+  if (data) {
+    image->width = w;
+    image->height = h;
+    image->component = 4;
+    image->bits = 8;
+    image->pixel_type = TINYGLTF_COMPONENT_TYPE_UNSIGNED_BYTE;
+    // Compute the byte count in size_t: w * h * 4 evaluated in int can overflow for large images.
+    const size_t pixelBytes = static_cast<size_t>(w) * static_cast<size_t>(h) * 4;
+    image->image.assign(data, data + pixelBytes);
+    stbi_image_free(data);
+    return true;
+  }
+
+  if (err) {
+    *err = "Failed to load image via KTX2 or stb_image (size=" + std::to_string(size) + ")";
+  }
+  std::cerr << "ModelLoader: Image decoding failed (size=" << size << ")" << std::endl;
+  return false;
+}
+// Converts every glTF material of `gltfModel` into a Material, stores it in `materials`
+// (keyed by material name) and in `materialsByIndex` (keyed by glTF material index), and
+// schedules the referenced textures for loading through `renderer`.
+//
+//   gltfModel       - parsed glTF document whose materials/textures/images are read.
+//   baseTexturePath - string prepended to image URIs to form external texture file paths.
+//   loadedTextures  - not referenced anywhere in this function body.
+//
+// After the main per-material loop, three fallback passes try to fill an empty
+// albedoTexturePath:
+//   1. the KHR_materials_pbrSpecularGlossiness diffuseTexture of the material,
+//   2. a file derived from the normal-map path by swapping its suffix,
+//   3. any glTF image whose URI looks like a base color map and matches the material name.
+// NOTE(review): passes 2 and 3 iterate the whole `materials` map, which is not cleared here,
+// so they presumably also visit materials from earlier loads - confirm this is intended.
+void ModelLoader::ProcessMaterials(const tinygltf::Model& gltfModel,
+                                   const std::string& baseTexturePath,
+                                   std::set<std::string>& loadedTextures) {
+  // Build/refresh an index -> material mapping that matches glTF material indices.
+  materialsByIndex.clear();
+  materialsByIndex.resize(gltfModel.materials.size(), nullptr);
+
+  // Reserve a globally-unique index range for this model's materials so ray-query material
+  // slots don't collide with previously-loaded models (e.g. Fox material 0 vs bistro material 0).
+  m_currentModelMaterialBase = m_globalMaterialBase;
+  m_globalMaterialBase += static_cast<uint32_t>(gltfModel.materials.size());
+
+  // Process materials first
+  for (size_t i = 0; i < gltfModel.materials.size(); ++i) {
+    const auto& gltfMaterial = gltfModel.materials[i];
+
+    // Create PBR material
+    // Unnamed materials get the synthetic name "material_<index>".
+    auto material = std::make_unique<Material>(gltfMaterial.name.empty() ? ("material_" + std::to_string(i)) : gltfMaterial.name);
+
+    // Extract PBR properties
+    if (gltfMaterial.pbrMetallicRoughness.baseColorFactor.size() >= 3) {
+      material->albedo = glm::vec3(
+        gltfMaterial.pbrMetallicRoughness.baseColorFactor[0],
+        gltfMaterial.pbrMetallicRoughness.baseColorFactor[1],
+        gltfMaterial.pbrMetallicRoughness.baseColorFactor[2]);
+      if (gltfMaterial.pbrMetallicRoughness.baseColorFactor.size() >= 4) {
+        material->alpha = static_cast<float>(gltfMaterial.pbrMetallicRoughness.baseColorFactor[3]);
+      }
+    }
+    material->metallic = static_cast<float>(gltfMaterial.pbrMetallicRoughness.metallicFactor);
+    material->roughness = static_cast<float>(gltfMaterial.pbrMetallicRoughness.roughnessFactor);
+
+    if (gltfMaterial.emissiveFactor.size() >= 3) {
+      material->emissive = glm::vec3(
+        gltfMaterial.emissiveFactor[0],
+        gltfMaterial.emissiveFactor[1],
+        gltfMaterial.emissiveFactor[2]);
+      // light_scale is declared outside this function.
+      material->emissive *= light_scale;
+    }
+
+    // Parse KHR_materials_emissive_strength extension
+    auto extensionIt = gltfMaterial.extensions.find("KHR_materials_emissive_strength");
+    if (extensionIt != gltfMaterial.extensions.end()) {
+      hasEmissiveStrengthExtension = true;
+      const tinygltf::Value& extension = extensionIt->second;
+      if (extension.Has("emissiveStrength") && extension.Get("emissiveStrength").IsNumber()) {
+        material->emissiveStrength = static_cast<float>(extension.Get("emissiveStrength").Get<double>());
+      }
+    } else {
+      // Extension absent: use a fixed fallback strength (the origin of this constant is not
+      // documented in this function).
+      material->emissiveStrength = 0.00058f;
+    }
+
+    // Alpha mode / cutoff
+    material->alphaMode = gltfMaterial.alphaMode.empty() ? std::string("OPAQUE") : gltfMaterial.alphaMode;
+    material->alphaCutoff = static_cast<float>(gltfMaterial.alphaCutoff);
+
+    // Transmission (KHR_materials_transmission)
+    auto transIt = gltfMaterial.extensions.find("KHR_materials_transmission");
+    if (transIt != gltfMaterial.extensions.end()) {
+      const tinygltf::Value& ext = transIt->second;
+      if (ext.Has("transmissionFactor") && ext.Get("transmissionFactor").IsNumber()) {
+        material->transmissionFactor = static_cast<float>(ext.Get("transmissionFactor").Get<double>());
+      }
+    }
+
+    // Classify obvious architectural glass and liquid materials for
+    // specialized rendering. This is a heuristic based primarily on
+    // material name.
+    {
+      std::string lowerName = ToLower(material->GetName());
+      bool nameSuggestsGlass =
+          (lowerName.find("glass") != std::string::npos) ||
+          (lowerName.find("window") != std::string::npos);
+
+      bool probablyLiquid =
+          (lowerName.find("beer") != std::string::npos) ||
+          (lowerName.find("wine") != std::string::npos) ||
+          (lowerName.find("liquid") != std::string::npos);
+
+      // A liquid name wins over a glass name (e.g. a name containing both "glass" and "wine").
+      if (nameSuggestsGlass && !probablyLiquid) {
+        material->isGlass = true;
+      }
+
+      if (probablyLiquid) {
+        material->isLiquid = true;
+
+        // Slightly boost liquid visibility.
+        material->albedo *= 1.4f;
+        material->albedo = glm::clamp(material->albedo, glm::vec3(0.0f), glm::vec3(4.0f));
+
+        // Slightly reduce roughness so specular highlights from
+        // lights help liquids stand out.
+        material->roughness = glm::clamp(material->roughness * 0.8f, 0.0f, 1.0f);
+
+        // Ensure the liquid is not fully transparent by default.
+        material->alpha = glm::clamp(material->alpha * 1.2f, 0.15f, 1.0f);
+      }
+    }
+
+    // Specular-Glossiness (KHR_materials_pbrSpecularGlossiness)
+    // Values read here overwrite the albedo/alpha taken from pbrMetallicRoughness above
+    // (including any liquid adjustment applied to them).
+    auto sgIt = gltfMaterial.extensions.find("KHR_materials_pbrSpecularGlossiness");
+    if (sgIt != gltfMaterial.extensions.end()) {
+      const tinygltf::Value& ext = sgIt->second;
+      material->useSpecularGlossiness = true;
+      // diffuseFactor -> albedo and alpha
+      if (ext.Has("diffuseFactor") && ext.Get("diffuseFactor").IsArray()) {
+        const auto& arr = ext.Get("diffuseFactor").Get<tinygltf::Value::Array>();
+        if (arr.size() >= 3) {
+          // Non-numeric array entries keep the component already stored on the material.
+          material->albedo = glm::vec3(
+            arr[0].IsNumber() ? static_cast<float>(arr[0].Get<double>()) : material->albedo.r,
+            arr[1].IsNumber() ? static_cast<float>(arr[1].Get<double>()) : material->albedo.g,
+            arr[2].IsNumber() ? static_cast<float>(arr[2].Get<double>()) : material->albedo.b);
+          if (arr.size() >= 4 && arr[3].IsNumber()) {
+            material->alpha = static_cast<float>(arr[3].Get<double>());
+          }
+        }
+      }
+      // specularFactor (vec3)
+      if (ext.Has("specularFactor") && ext.Get("specularFactor").IsArray()) {
+        const auto& arr = ext.Get("specularFactor").Get<tinygltf::Value::Array>();
+        if (arr.size() >= 3) {
+          material->specularFactor = glm::vec3(
+            arr[0].IsNumber() ? static_cast<float>(arr[0].Get<double>()) : material->specularFactor.r,
+            arr[1].IsNumber() ? static_cast<float>(arr[1].Get<double>()) : material->specularFactor.g,
+            arr[2].IsNumber() ? static_cast<float>(arr[2].Get<double>()) : material->specularFactor.b);
+        }
+      }
+      // glossinessFactor (float)
+      if (ext.Has("glossinessFactor") && ext.Get("glossinessFactor").IsNumber()) {
+        material->glossinessFactor = static_cast<float>(ext.Get("glossinessFactor").Get<double>());
+      }
+
+      // Load diffuseTexture into albedoTexturePath if present
+      if (ext.Has("diffuseTexture") && ext.Get("diffuseTexture").IsObject()) {
+        const auto& diffObj = ext.Get("diffuseTexture");
+        if (diffObj.Has("index") && diffObj.Get("index").IsInt()) {
+          int texIndex = diffObj.Get("index").Get<int>();
+          if (texIndex >= 0 && texIndex < static_cast<int>(gltfModel.textures.size())) {
+            const auto& texture = gltfModel.textures[texIndex];
+            // Resolve the image: prefer texture.source, otherwise the KHR_texture_basisu source.
+            int imageIndex = -1;
+            if (texture.source >= 0 && texture.source < static_cast<int>(gltfModel.images.size())) {
+              imageIndex = texture.source;
+            } else {
+              auto extBasis = texture.extensions.find("KHR_texture_basisu");
+              if (extBasis != texture.extensions.end()) {
+                const tinygltf::Value& e = extBasis->second;
+                if (e.Has("source") && e.Get("source").IsInt()) {
+                  int src = e.Get("source").Get<int>();
+                  if (src >= 0 && src < static_cast<int>(gltfModel.images.size()))
+                    imageIndex = src;
+                }
+              }
+            }
+            if (imageIndex >= 0) {
+              const auto& image = gltfModel.images[imageIndex];
+              std::string textureId = "gltf_baseColor_" + std::to_string(texIndex);
+              if (!image.image.empty()) {
+                renderer->LoadTextureFromMemoryAsync(textureId, image.image.data(), image.width, image.height, image.component);
+                material->albedoTexturePath = textureId;
+              } else if (!image.uri.empty()) {
+                std::string filePath = baseTexturePath + image.uri;
+                renderer->LoadTextureAsync(filePath);
+                material->albedoTexturePath = filePath;
+              }
+            }
+          }
+        }
+      }
+      // Load specularGlossinessTexture into specGlossTexturePath and mirror to metallicRoughnessTexturePath (binding 2)
+      if (ext.Has("specularGlossinessTexture") && ext.Get("specularGlossinessTexture").IsObject()) {
+        const auto& sgObj = ext.Get("specularGlossinessTexture");
+        if (sgObj.Has("index") && sgObj.Get("index").IsInt()) {
+          int texIndex = sgObj.Get("index").Get<int>();
+          if (texIndex >= 0 && texIndex < static_cast<int>(gltfModel.textures.size())) {
+            const auto& texture = gltfModel.textures[texIndex];
+            if (texture.source >= 0 && texture.source < static_cast<int>(gltfModel.images.size())) {
+              std::string textureId = "gltf_specGloss_" + std::to_string(texIndex);
+              const auto& image = gltfModel.images[texture.source];
+              if (!image.image.empty()) {
+                // Embedded image data (already decoded by tinygltf image loader)
+                renderer->LoadTextureFromMemoryAsync(textureId, image.image.data(), image.width, image.height, image.component, false);
+                material->specGlossTexturePath = textureId;
+                material->metallicRoughnessTexturePath = textureId; // reuse binding 2
+              } else if (!image.uri.empty()) {
+                // External KTX2 file: offload libktx decode + upload to renderer worker threads
+                std::string filePath = baseTexturePath + image.uri;
+                renderer->RegisterTextureAlias(textureId, filePath);
+                renderer->LoadTextureAsync(filePath);
+                material->specGlossTexturePath = textureId;
+                material->metallicRoughnessTexturePath = textureId; // reuse binding 2
+              }
+            }
+          }
+        }
+      }
+    }
+
+    // Extract texture information and load embedded texture data
+    // Base color texture. When present it overwrites any albedoTexturePath set by the
+    // specular-glossiness branch above.
+    // NOTE(review): texture IDs below are built from the glTF texture index only
+    // ("gltf_baseColor_<n>", "gltf_texture_<n>"), so two models would produce the same IDs -
+    // confirm how the renderer's texture cache treats repeated IDs.
+    if (gltfMaterial.pbrMetallicRoughness.baseColorTexture.index >= 0) {
+      int texIndex = gltfMaterial.pbrMetallicRoughness.baseColorTexture.index;
+      if (texIndex < gltfModel.textures.size()) {
+        const auto& texture = gltfModel.textures[texIndex];
+        int imageIndex = -1;
+        if (texture.source >= 0 && texture.source < gltfModel.images.size()) {
+          imageIndex = texture.source;
+        } else {
+          auto extIt = texture.extensions.find("KHR_texture_basisu");
+          if (extIt != texture.extensions.end()) {
+            const tinygltf::Value& ext = extIt->second;
+            if (ext.Has("source") && ext.Get("source").IsInt()) {
+              int src = ext.Get("source").Get<int>();
+              if (src >= 0 && src < static_cast<int>(gltfModel.images.size())) {
+                imageIndex = src;
+              }
+            }
+          }
+        }
+        if (imageIndex >= 0) {
+          std::string textureId = "gltf_baseColor_" + std::to_string(texIndex);
+          material->albedoTexturePath = textureId;
+
+          // Load texture data (embedded or external)
+          const auto& image = gltfModel.images[imageIndex];
+          if (!image.image.empty()) {
+            // Always use memory-based upload (KTX2 already decoded by SetImageLoader)
+            renderer->LoadTextureFromMemoryAsync(textureId, image.image.data(), image.width, image.height, image.component, false);
+            material->albedoTexturePath = textureId;
+          } else if (!image.uri.empty()) {
+            // Offload KTX2 file reading/upload to renderer thread pool
+            std::string filePath = baseTexturePath + image.uri;
+            renderer->RegisterTextureAlias(textureId, filePath);
+            renderer->LoadTextureAsync(filePath, false);
+            material->albedoTexturePath = textureId;
+          } else {
+            std::cerr << "    Warning: No decoded image bytes for base color texture index " << texIndex << std::endl;
+          }
+        }
+      }
+    }
+
+    // Metallic-roughness texture. Only texture.source is consulted here; unlike base color
+    // and normal there is no KHR_texture_basisu fallback.
+    if (gltfMaterial.pbrMetallicRoughness.metallicRoughnessTexture.index >= 0) {
+      int texIndex = gltfMaterial.pbrMetallicRoughness.metallicRoughnessTexture.index;
+      if (texIndex < gltfModel.textures.size()) {
+        const auto& texture = gltfModel.textures[texIndex];
+        if (texture.source >= 0 && texture.source < gltfModel.images.size()) {
+          std::string textureId = "gltf_texture_" + std::to_string(texIndex);
+          material->metallicRoughnessTexturePath = textureId;
+
+          // Load texture data (embedded or external)
+          const auto& image = gltfModel.images[texture.source];
+          if (!image.image.empty()) {
+            // Load embedded texture data asynchronously
+            renderer->LoadTextureFromMemoryAsync(textureId, image.image.data(), image.width, image.height, image.component);
+          } else if (!image.uri.empty()) {
+            // Offload KTX2 file reading/upload to renderer thread pool
+            std::string filePath = baseTexturePath + image.uri;
+            renderer->RegisterTextureAlias(textureId, filePath);
+            renderer->LoadTextureAsync(filePath);
+            material->metallicRoughnessTexturePath = textureId;
+          } else {
+            std::cerr << "    Warning: No decoded bytes for metallic-roughness texture index " << texIndex << std::endl;
+          }
+        }
+      }
+    }
+
+    // Normal texture (texture.source first, then the KHR_texture_basisu source).
+    if (gltfMaterial.normalTexture.index >= 0) {
+      int texIndex = gltfMaterial.normalTexture.index;
+      if (texIndex < gltfModel.textures.size()) {
+        const auto& texture = gltfModel.textures[texIndex];
+        int imageIndex = -1;
+        if (texture.source >= 0 && texture.source < gltfModel.images.size()) {
+          imageIndex = texture.source;
+        } else {
+          auto extIt = texture.extensions.find("KHR_texture_basisu");
+          if (extIt != texture.extensions.end()) {
+            const tinygltf::Value& ext = extIt->second;
+            if (ext.Has("source") && ext.Get("source").IsInt()) {
+              int src = ext.Get("source").Get<int>();
+              if (src >= 0 && src < static_cast<int>(gltfModel.images.size())) {
+                imageIndex = src;
+              }
+            }
+          }
+        }
+        if (imageIndex >= 0) {
+          std::string textureId = "gltf_texture_" + std::to_string(texIndex);
+          material->normalTexturePath = textureId;
+
+          // Load texture data (embedded or external)
+          const auto& image = gltfModel.images[imageIndex];
+          if (!image.image.empty()) {
+            renderer->LoadTextureFromMemoryAsync(textureId, image.image.data(), image.width, image.height, image.component);
+            material->normalTexturePath = textureId;
+          } else if (!image.uri.empty()) {
+            // Offload KTX2 file reading/upload to renderer thread pool
+            std::string filePath = baseTexturePath + image.uri;
+            renderer->RegisterTextureAlias(textureId, filePath);
+            renderer->LoadTextureAsync(filePath);
+            material->normalTexturePath = textureId;
+          } else {
+            std::cerr << "    Warning: No decoded bytes for normal texture index " << texIndex << std::endl;
+          }
+        }
+      }
+    }
+
+    // Occlusion texture (texture.source only).
+    if (gltfMaterial.occlusionTexture.index >= 0) {
+      int texIndex = gltfMaterial.occlusionTexture.index;
+      if (texIndex < gltfModel.textures.size()) {
+        const auto& texture = gltfModel.textures[texIndex];
+        if (texture.source >= 0 && texture.source < gltfModel.images.size()) {
+          std::string textureId = "gltf_texture_" + std::to_string(texIndex);
+          material->occlusionTexturePath = textureId;
+
+          // Load texture data (embedded or external)
+          const auto& image = gltfModel.images[texture.source];
+          if (!image.image.empty()) {
+            // Schedule embedded texture upload
+            renderer->LoadTextureFromMemoryAsync(textureId, image.image.data(), image.width, image.height, image.component);
+          } else if (!image.uri.empty()) {
+            // Offload KTX2 file reading/upload to renderer thread pool
+            std::string filePath = baseTexturePath + image.uri;
+            renderer->RegisterTextureAlias(textureId, filePath);
+            renderer->LoadTextureAsync(filePath);
+            material->occlusionTexturePath = textureId;
+          } else {
+            std::cerr << "    Warning: No decoded bytes for occlusion texture index " << texIndex << std::endl;
+          }
+        }
+      }
+    }
+
+    // Emissive texture (texture.source only).
+    if (gltfMaterial.emissiveTexture.index >= 0) {
+      int texIndex = gltfMaterial.emissiveTexture.index;
+      if (texIndex < gltfModel.textures.size()) {
+        const auto& texture = gltfModel.textures[texIndex];
+        if (texture.source >= 0 && texture.source < gltfModel.images.size()) {
+          std::string textureId = "gltf_texture_" + std::to_string(texIndex);
+          material->emissiveTexturePath = textureId;
+
+          // Load texture data (embedded or external)
+          const auto& image = gltfModel.images[texture.source];
+          if (!image.image.empty()) {
+            // Schedule embedded texture upload
+            renderer->LoadTextureFromMemoryAsync(textureId, image.image.data(), image.width, image.height, image.component);
+          } else if (!image.uri.empty()) {
+            // Offload KTX2 file reading/upload to renderer thread pool
+            std::string filePath = baseTexturePath + image.uri;
+            renderer->RegisterTextureAlias(textureId, filePath);
+            renderer->LoadTextureAsync(filePath);
+            material->emissiveTexturePath = textureId;
+          } else {
+            std::cerr << "    Warning: No decoded bytes for emissive texture index " << texIndex << std::endl;
+          }
+        }
+      }
+    }
+
+    // Store the material
+    // NOTE(review): the map is keyed by name, so a material whose name is already present
+    // replaces that entry. The replaced object is destroyed while a raw pointer to it may
+    // still sit in materialsByIndex (or be held by earlier callers) - confirm names are unique.
+    Material* rawPtr = material.get();
+    materials[material->GetName()] = std::move(material);
+    if (i < materialsByIndex.size()) {
+      materialsByIndex[i] = rawPtr;
+    }
+  }
+
+  // Handle KHR_materials_pbrSpecularGlossiness.diffuseTexture for baseColor when still missing
+  // Materials are looked up again by the same name rule used when they were created above.
+  for (size_t i = 0; i < gltfModel.materials.size(); ++i) {
+    const auto& gltfMaterial = gltfModel.materials[i];
+    std::string matName = gltfMaterial.name.empty() ? ("material_" + std::to_string(i)) : gltfMaterial.name;
+    auto matIt = materials.find(matName);
+    if (matIt == materials.end())
+      continue;
+    Material* mat = matIt->second.get();
+    if (!mat || !mat->albedoTexturePath.empty())
+      continue;
+    auto extIt = gltfMaterial.extensions.find("KHR_materials_pbrSpecularGlossiness");
+    if (extIt != gltfMaterial.extensions.end()) {
+      const tinygltf::Value& ext = extIt->second;
+      if (ext.Has("diffuseTexture") && ext.Get("diffuseTexture").IsObject()) {
+        const auto& diffObj = ext.Get("diffuseTexture");
+        if (diffObj.Has("index") && diffObj.Get("index").IsInt()) {
+          int texIndex = diffObj.Get("index").Get<int>();
+          if (texIndex >= 0 && texIndex < static_cast<int>(gltfModel.textures.size())) {
+            const auto& texture = gltfModel.textures[texIndex];
+            int imageIndex = -1;
+            if (texture.source >= 0 && texture.source < static_cast<int>(gltfModel.images.size())) {
+              imageIndex = texture.source;
+            } else {
+              auto extBasis = texture.extensions.find("KHR_texture_basisu");
+              if (extBasis != texture.extensions.end()) {
+                const tinygltf::Value& e = extBasis->second;
+                if (e.Has("source") && e.Get("source").IsInt()) {
+                  int src = e.Get("source").Get<int>();
+                  if (src >= 0 && src < static_cast<int>(gltfModel.images.size()))
+                    imageIndex = src;
+                }
+              }
+            }
+            if (imageIndex >= 0) {
+              const auto& image = gltfModel.images[imageIndex];
+              std::string texIdOrPath;
+              // In this pass an external URI takes priority over embedded bytes.
+              if (!image.uri.empty()) {
+                texIdOrPath = baseTexturePath + image.uri;
+                // Schedule async load; libktx decoding will occur on renderer worker threads
+                renderer->LoadTextureAsync(texIdOrPath, false);
+                mat->albedoTexturePath = texIdOrPath;
+              }
+              if (mat->albedoTexturePath.empty() && !image.image.empty()) {
+                // Upload embedded image data (already decoded via our image loader when KTX2)
+                texIdOrPath = "gltf_baseColor_" + std::to_string(texIndex);
+                renderer->LoadTextureFromMemoryAsync(texIdOrPath, image.image.data(), image.width, image.height, image.component, false);
+                mat->albedoTexturePath = texIdOrPath;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // Heuristic pass: fill missing baseColor (albedo) by deriving from normal map filenames
+  // Many Bistro materials have no baseColorTexture index. When that happens, try inferring
+  // the base color from the normal map by replacing common suffixes like _ddna -> _d/_c/_diffuse/_basecolor/_albedo.
+  // NOTE(review): every normalTexturePath assigned earlier in this function is a
+  // "gltf_texture_<n>" ID, which the gltf_ check below skips, so this pass appears to act
+  // only on normal paths set somewhere else - confirm.
+  for (auto& kv : materials) {
+    auto& material = kv.second;
+    Material* mat = material.get();
+    if (!mat)
+      continue;
+    if (!mat->albedoTexturePath.empty())
+      continue; // already set
+    // Only attempt if we have an external normal texture path to derive from
+    if (mat->normalTexturePath.empty())
+      continue;
+    const std::string& normalPath = mat->normalTexturePath;
+    // Skip embedded IDs like gltf_* which were already handled by memory uploads
+    if (normalPath.rfind("gltf_", 0) == 0)
+      continue;
+
+    std::string candidateBase = normalPath;
+    std::string normalLower = candidateBase;
+    for (auto& ch : normalLower)
+      ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
+    size_t pos = normalLower.find("_ddna");
+    if (pos == std::string::npos) {
+      // Try a few additional normal suffixes seen in the wild
+      // find() returns the first "_n" anywhere in the path, which may lie inside a
+      // directory or file stem rather than at the suffix.
+      pos = normalLower.find("_n");
+    }
+    if (pos != std::string::npos) {
+      static const char* suffixes[] = {"_d", "_c", "_cm", "_diffuse", "_basecolor", "_albedo"};
+      for (const char* suf : suffixes) {
+        std::string cand = candidateBase;
+        // Replace 5 characters when the match is "_ddna", otherwise the 2 characters of "_n".
+        cand.replace(pos, normalLower[pos] == '_' && normalLower.compare(pos, 5, "_ddna") == 0 ? 5 : 2, suf);
+        // Ensure the file exists before attempting to load
+        if (std::filesystem::exists(cand)) {
+          // Schedule async load; libktx decoding will occur on renderer worker threads
+          renderer->LoadTextureAsync(cand, false);
+          mat->albedoTexturePath = cand;
+          break;
+        }
+      }
+    }
+  }
+
+  // Secondary heuristic: scan glTF images for base color by material-name match when still missing
+  for (auto& [materialName, materialPtr] : materials) {
+    Material* mat = materialPtr.get();
+    if (!mat)
+      continue;
+    if (!mat->albedoTexturePath.empty())
+      continue; // already resolved
+    // Try to find an image URI that looks like the base color for this material
+    std::string materialNameLower = materialName;
+    std::ranges::transform(materialNameLower, materialNameLower.begin(), [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+    for (const auto& image : gltfModel.images) {
+      if (image.uri.empty())
+        continue;
+      std::string imageUri = image.uri;
+      std::string imageUriLower = imageUri;
+      std::ranges::transform(imageUriLower, imageUriLower.begin(), [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+      // The URI must contain one of the usual base color keywords.
+      bool looksBase = imageUriLower.find("basecolor") != std::string::npos ||
+          imageUriLower.find("albedo") != std::string::npos ||
+          imageUriLower.find("diffuse") != std::string::npos;
+      if (!looksBase)
+        continue;
+      bool nameMatches = imageUriLower.find(materialNameLower) != std::string::npos;
+      if (!nameMatches) {
+        // Best-effort: try prefix of image name before '_' against material name
+        size_t underscore = imageUriLower.find('_');
+        if (underscore != std::string::npos) {
+          std::string prefix = imageUriLower.substr(0, underscore);
+          nameMatches = materialNameLower.find(prefix) != std::string::npos;
+        }
+      }
+      if (!nameMatches)
+        continue;
+
+      // The first matching image wins; both branches below stop the search.
+      std::string textureId = baseTexturePath + imageUri; // use path string as ID for cache
+      if (!image.image.empty()) {
+        renderer->LoadTextureFromMemoryAsync(textureId, image.image.data(), image.width, image.height, image.component);
+        mat->albedoTexturePath = textureId;
+        break;
+      } else {
+        // Fallback: offload KTX2 file load to renderer threads
+        renderer->LoadTextureAsync(textureId);
+        mat->albedoTexturePath = textureId;
+        break;
+      }
+    }
+  }
+}
+// Copies every camera defined in the glTF document into model->cameras, logging what was
+// found to stdout. Projection parameters come from the camera definition; position and
+// rotation come from the first node that references the camera (node-local values only).
+// Does nothing when the document has no cameras.
+void ModelLoader::ProcessCameras(const tinygltf::Model& gltfModel, Model* model) {
+  const size_t cameraCount = gltfModel.cameras.size();
+  if (cameraCount == 0) {
+    return;
+  }
+
+  std::cout << "Found " << cameraCount << " camera(s) in GLTF file" << std::endl;
+
+  for (size_t camIdx = 0; camIdx < cameraCount; ++camIdx) {
+    const auto& srcCamera = gltfModel.cameras[camIdx];
+    std::cout << "  Camera " << camIdx << ": " << srcCamera.name << std::endl;
+
+    // Collect the camera description; unnamed cameras get a synthetic "camera_<index>" name.
+    CameraData parsed;
+    if (srcCamera.name.empty()) {
+      parsed.name = "camera_" + std::to_string(camIdx);
+    } else {
+      parsed.name = srcCamera.name;
+    }
+
+    // Any other camera type leaves the projection fields at their CameraData defaults.
+    if (srcCamera.type == "perspective") {
+      const auto& persp = srcCamera.perspective;
+      parsed.isPerspective = true;
+      parsed.fov = static_cast<float>(persp.yfov);
+      parsed.aspectRatio = static_cast<float>(persp.aspectRatio);
+      parsed.nearPlane = static_cast<float>(persp.znear);
+      parsed.farPlane = static_cast<float>(persp.zfar);
+      std::cout << "    Perspective camera: FOV=" << parsed.fov
+          << ", Aspect=" << parsed.aspectRatio
+          << ", Near=" << parsed.nearPlane
+          << ", Far=" << parsed.farPlane << std::endl;
+    } else if (srcCamera.type == "orthographic") {
+      const auto& ortho = srcCamera.orthographic;
+      parsed.isPerspective = false;
+      parsed.orthographicSize = static_cast<float>(ortho.ymag);
+      parsed.nearPlane = static_cast<float>(ortho.znear);
+      parsed.farPlane = static_cast<float>(ortho.zfar);
+      std::cout << "    Orthographic camera: Size=" << parsed.orthographicSize
+          << ", Near=" << parsed.nearPlane
+          << ", Far=" << parsed.farPlane << std::endl;
+    }
+
+    // Take the transform from the first node that references this camera, if any.
+    const int wantedCamera = static_cast<int>(camIdx);
+    const auto ownerNode = std::find_if(gltfModel.nodes.begin(),
+                                        gltfModel.nodes.end(),
+                                        [wantedCamera](const auto& candidate) { return candidate.camera == wantedCamera; });
+    if (ownerNode != gltfModel.nodes.end()) {
+      const auto& t = ownerNode->translation;
+      if (t.size() == 3) {
+        parsed.position = glm::vec3(static_cast<float>(t[0]),
+                                    static_cast<float>(t[1]),
+                                    static_cast<float>(t[2]));
+      }
+
+      const auto& r = ownerNode->rotation;
+      if (r.size() == 4) {
+        // glTF stores quaternions as (x, y, z, w); glm::quat takes (w, x, y, z).
+        const float qw = static_cast<float>(r[3]);
+        const float qx = static_cast<float>(r[0]);
+        const float qy = static_cast<float>(r[1]);
+        const float qz = static_cast<float>(r[2]);
+        parsed.rotation = glm::quat(qw, qx, qy, qz);
+      }
+
+      std::cout << "    Position: (" << parsed.position.x << ", "
+          << parsed.position.y << ", " << parsed.position.z << ")" << std::endl;
+    }
+
+    model->cameras.push_back(parsed);
+  }
+}
+// Parses every animation in the glTF document and hands the result to the model
+// through Model::SetAnimations. Does nothing when the document has no animations.
+//
+// For each animation:
+//   - each sampler's interpolation mode, keyframe times (input accessor) and
+//     keyframe values (output accessor) are copied into an AnimationSampler;
+//   - each channel's sampler index, target node and target path are copied
+//     into an AnimationChannel. An unrecognised target path leaves
+//     AnimationChannel::path at its default value.
+//
+// Accessor data is validated before it is read: the accessor, buffer view and
+// buffer indices must be in range (an accessor without a buffer view, e.g. a
+// purely sparse one, is rejected), the component type must be supported, and
+// the requested byte range must lie inside the buffer. Keyframe times must be
+// 32-bit floats. Keyframe values may be floats or 8/16-bit integers, which are
+// decoded with the glTF normalized-integer equations. When validation fails the
+// corresponding vector is left empty and a warning is written to std::cerr.
+//
+// Data is assumed to be tightly packed (bufferView.byteStride is not consulted).
+//
+// gltfModel: parsed glTF document to read animations, accessors and buffers from.
+// model:     destination that receives the parsed animations.
+void ModelLoader::ProcessAnimations(const tinygltf::Model& gltfModel, Model* model) {
+  if (gltfModel.animations.empty()) {
+    return;
+  }
+
+  std::cout << "Found " << gltfModel.animations.size() << " animation(s) in GLTF file" << std::endl;
+
+  // Copies the contents of accessor `accessorIndex` into `out` as floats.
+  // isOutput == false: keyframe times  -> one float per element, FLOAT only.
+  // isOutput == true:  keyframe values -> 1/3/4 floats per element (scalar/VEC3/VEC4),
+  //                    FLOAT or normalized 8/16-bit integers.
+  // On any validation failure `out` is left empty.
+  auto readAccessorAsFloats = [&gltfModel](int accessorIndex, bool isOutput, auto& out) {
+    const char* label = isOutput ? "output" : "input";
+    out.clear();
+
+    if (accessorIndex < 0 || accessorIndex >= static_cast<int>(gltfModel.accessors.size())) {
+      std::cerr << "  Animation sampler " << label << " accessor index " << accessorIndex
+          << " is out of range; skipping" << std::endl;
+      return;
+    }
+    const tinygltf::Accessor& accessor = gltfModel.accessors[accessorIndex];
+
+    if (accessor.bufferView < 0 || accessor.bufferView >= static_cast<int>(gltfModel.bufferViews.size())) {
+      std::cerr << "  Animation sampler " << label << " accessor " << accessorIndex
+          << " has no valid buffer view; skipping" << std::endl;
+      return;
+    }
+    const tinygltf::BufferView& view = gltfModel.bufferViews[accessor.bufferView];
+
+    if (view.buffer < 0 || view.buffer >= static_cast<int>(gltfModel.buffers.size())) {
+      std::cerr << "  Animation sampler " << label << " accessor " << accessorIndex
+          << " references an invalid buffer; skipping" << std::endl;
+      return;
+    }
+    const auto& bytes = gltfModel.buffers[view.buffer].data;
+
+    // Number of floats per element, based on the accessor type (outputs only).
+    size_t componentsPerElement = 1;
+    if (isOutput) {
+      if (accessor.type == TINYGLTF_TYPE_VEC3) {
+        componentsPerElement = 3;
+      } else if (accessor.type == TINYGLTF_TYPE_VEC4) {
+        componentsPerElement = 4;
+      }
+    }
+
+    // Size of one stored component; 0 marks an unsupported component type.
+    size_t componentSize = 0;
+    switch (accessor.componentType) {
+      case TINYGLTF_COMPONENT_TYPE_FLOAT:
+        componentSize = sizeof(float);
+        break;
+      case TINYGLTF_COMPONENT_TYPE_BYTE:
+      case TINYGLTF_COMPONENT_TYPE_UNSIGNED_BYTE:
+        componentSize = isOutput ? sizeof(uint8_t) : 0;
+        break;
+      case TINYGLTF_COMPONENT_TYPE_SHORT:
+      case TINYGLTF_COMPONENT_TYPE_UNSIGNED_SHORT:
+        componentSize = isOutput ? sizeof(uint16_t) : 0;
+        break;
+      default:
+        break;
+    }
+    if (componentSize == 0) {
+      std::cerr << "  Animation sampler " << label << " accessor " << accessorIndex
+          << " has unsupported component type " << accessor.componentType
+          << "; skipping" << std::endl;
+      return;
+    }
+
+    // Make sure the whole range [start, start + count * components * size) is inside the buffer.
+    // The comparison is arranged with divisions so it cannot overflow.
+    const size_t start = view.byteOffset + accessor.byteOffset;
+    if (start > bytes.size()) {
+      std::cerr << "  Animation sampler " << label << " accessor " << accessorIndex
+          << " starts beyond the end of its buffer; skipping" << std::endl;
+      return;
+    }
+    const size_t maxComponents = (bytes.size() - start) / componentSize;
+    if (accessor.count > maxComponents / componentsPerElement) {
+      std::cerr << "  Animation sampler " << label << " accessor " << accessorIndex
+          << " extends beyond the end of its buffer; skipping" << std::endl;
+      return;
+    }
+
+    const size_t total = accessor.count * componentsPerElement;
+    out.resize(total);
+    if (total == 0) {
+      return;
+    }
+
+    const unsigned char* base = bytes.data() + start;
+    switch (accessor.componentType) {
+      case TINYGLTF_COMPONENT_TYPE_FLOAT: {
+        const float* src = reinterpret_cast<const float *>(base);
+        for (size_t i = 0; i < total; ++i) {
+          out[i] = src[i];
+        }
+        break;
+      }
+      case TINYGLTF_COMPONENT_TYPE_BYTE: {
+        // glTF: f = max(c / 127.0, -1.0)
+        const int8_t* src = reinterpret_cast<const int8_t *>(base);
+        for (size_t i = 0; i < total; ++i) {
+          const float f = static_cast<float>(src[i]) / 127.0f;
+          out[i] = (f < -1.0f) ? -1.0f : f;
+        }
+        break;
+      }
+      case TINYGLTF_COMPONENT_TYPE_UNSIGNED_BYTE: {
+        // glTF: f = c / 255.0
+        for (size_t i = 0; i < total; ++i) {
+          out[i] = static_cast<float>(base[i]) / 255.0f;
+        }
+        break;
+      }
+      case TINYGLTF_COMPONENT_TYPE_SHORT: {
+        // glTF: f = max(c / 32767.0, -1.0)
+        const int16_t* src = reinterpret_cast<const int16_t *>(base);
+        for (size_t i = 0; i < total; ++i) {
+          const float f = static_cast<float>(src[i]) / 32767.0f;
+          out[i] = (f < -1.0f) ? -1.0f : f;
+        }
+        break;
+      }
+      case TINYGLTF_COMPONENT_TYPE_UNSIGNED_SHORT: {
+        // glTF: f = c / 65535.0
+        const uint16_t* src = reinterpret_cast<const uint16_t *>(base);
+        for (size_t i = 0; i < total; ++i) {
+          out[i] = static_cast<float>(src[i]) / 65535.0f;
+        }
+        break;
+      }
+      default:
+        // Unreachable: unsupported component types were rejected above.
+        break;
+    }
+  };
+
+  std::vector<Animation> parsedAnimations;
+  parsedAnimations.reserve(gltfModel.animations.size());
+
+  for (size_t animIdx = 0; animIdx < gltfModel.animations.size(); ++animIdx) {
+    const auto& gltfAnim = gltfModel.animations[animIdx];
+
+    Animation anim;
+    anim.name = gltfAnim.name.empty() ? ("animation_" + std::to_string(animIdx)) : gltfAnim.name;
+
+    // Parse samplers
+    anim.samplers.reserve(gltfAnim.samplers.size());
+    for (const auto& gltfSampler : gltfAnim.samplers) {
+      AnimationSampler sampler;
+
+      // Parse interpolation type; anything other than STEP/CUBICSPLINE is treated as LINEAR
+      if (gltfSampler.interpolation == "STEP") {
+        sampler.interpolation = AnimationInterpolation::Step;
+      } else if (gltfSampler.interpolation == "CUBICSPLINE") {
+        sampler.interpolation = AnimationInterpolation::CubicSpline;
+      } else {
+        sampler.interpolation = AnimationInterpolation::Linear;
+      }
+
+      // Read input (time) and output (value) accessors with full validation
+      readAccessorAsFloats(gltfSampler.input, false, sampler.inputTimes);
+      readAccessorAsFloats(gltfSampler.output, true, sampler.outputValues);
+
+      anim.samplers.push_back(std::move(sampler));
+    }
+
+    // Parse channels
+    anim.channels.reserve(gltfAnim.channels.size());
+    for (const auto& gltfChannel : gltfAnim.channels) {
+      AnimationChannel channel;
+      channel.samplerIndex = gltfChannel.sampler;
+      channel.targetNode = gltfChannel.target_node;
+
+      // Parse target path
+      if (gltfChannel.target_path == "translation") {
+        channel.path = AnimationPath::Translation;
+      } else if (gltfChannel.target_path == "rotation") {
+        channel.path = AnimationPath::Rotation;
+      } else if (gltfChannel.target_path == "scale") {
+        channel.path = AnimationPath::Scale;
+      } else if (gltfChannel.target_path == "weights") {
+        channel.path = AnimationPath::Weights;
+      }
+
+      anim.channels.push_back(channel);
+    }
+
+    std::cout << "  Animation '" << anim.name << "': "
+        << anim.samplers.size() << " samplers, "
+        << anim.channels.size() << " channels, "
+        << "duration=" << anim.GetDuration() << "s" << std::endl;
+
+    parsedAnimations.push_back(std::move(anim));
+  }
+
+  model->SetAnimations(parsedAnimations);
+  std::cout << "Loaded " << parsedAnimations.size() << " animations into model" << std::endl;
+}
+
+
+bool ModelLoader::ParseGLTF(const std::string& filename, Model* model) {
+  std::cout << "Parsing GLTF file: " << filename << std::endl;
+
+  // Extract the directory path from the model file to use as a base path for textures
+  std::filesystem::path modelPath(filename);
+  std::filesystem::path baseDir = std::filesystem::absolute(modelPath).parent_path();
+  std::string baseTexturePath = baseDir.string();
+  if (!baseTexturePath.empty() && baseTexturePath.back() != '/') {
+    baseTexturePath += "/";
+  }
+  std::cout << "Using base texture path: " << baseTexturePath << std::endl;
+
+  // Create tinygltf loader
+  tinygltf::Model gltfModel;
+  tinygltf::TinyGLTF loader;
+  std::string err;
+  std::string warn;
+
+  // Set up image loader: prefer KTX2 via libktx; fallback to stb for other formats
+  loader.SetImageLoader(LoadKTX2Image, nullptr);
+
+  // Load the GLTF file
+  bool ret = false;
+  if (filename.find(".glb") != std::string::npos) {
+    ret = loader.LoadBinaryFromFile(&gltfModel, &err, &warn, filename);
+  } else {
+    ret = loader.LoadASCIIFromFile(&gltfModel, &err, &warn, filename);
+  }
+
+  if (!warn.empty()) {
+    std::cout << "GLTF Warning: " << warn << std::endl;
+  }
+
+  if (!err.empty()) {
+    std::cerr << "GLTF Error: " << err << std::endl;
+    return false;
+  }
+
+  if (!ret) {
+    std::cerr << "Failed to parse GLTF file: " << filename << std::endl;
+    return false;
+  }
+
+  // Extract mesh data from the first mesh (for now, we'll handle multiple meshes later)
+  if (gltfModel.meshes.empty()) {
+    std::cerr << "No meshes found in GLTF file" << std::endl;
+    return false;
+  }
+
+  light_scale = 1.0f;
+  // Test if generator is blender and apply the blender factor see the issue here: https://github.com/KhronosGroup/glTF/issues/2473
+  if (gltfModel.asset.generator.find("blender") != std::string::npos) {
+    std::cout << "Blender generator detected, applying blender factor" << std::endl;
+    light_scale = EMISSIVE_SCALE_FACTOR;
+  }
+
+  // Track loaded textures to prevent loading the same texture multiple times
+  std::set<std::string> loadedTextures;
+
+  // Process materials first
+  ProcessMaterials(gltfModel, baseTexturePath, loadedTextures);
+
+  // Process cameras from the GLTF file
+  ProcessCameras(gltfModel, model);
+
+  // Process animations from the GLTF file
+  ProcessAnimations(gltfModel, model);
+
+  // Process skins from the GLTF file
+  AdvancedModel_ProcessSkins(this, gltfModel, model);
+
+  // Collect all animated node indices from parsed animations
+  std::set<int> animatedNodeIndices;
+  for (const auto& anim : model->GetAnimations()) {
+    for (const auto& channel : anim.channels) {
+      if (channel.targetNode >= 0) {
+        animatedNodeIndices.insert(channel.targetNode);
+      }
+    }
+  }
+  if (!animatedNodeIndices.empty()) {
+    std::cout << "[Animation] Found " << animatedNodeIndices.size() << " unique animated node(s)" << std::endl;
+  }
+
+  std::map<int, std::vector<glm::mat4>> meshInstanceTransforms; // Map from mesh index to all instance transforms
+  std::unordered_map<int, glm::mat4> animatedNodeTransforms; // Map from animated node index to world transform
+  std::unordered_map<int, std::vector<int>> nodeChildren;
+  std::unordered_map<int, glm::mat4> nodeLocalTransforms;
+  std::unordered_map<int, glm::vec3> nodeLocalTranslations;
+  std::unordered_map<int, glm::quat> nodeLocalRotations;
+  std::unordered_map<int, glm::vec3> nodeLocalScales;
+  std::vector<int> rootNodes;
+  std::unordered_map<int, int> animatedNodeMeshes; // Map from animated node index to mesh index
+  std::unordered_map<int, int> nodeSkins; // nodeIndex -> skinIndex
+
+  // Helper function to calculate transform matrix from the GLTF node
+  auto calculateNodeTransform = [](const tinygltf::Node& node) -> glm::mat4 {
+    glm::mat4 transform;
+
+    // Apply matrix if present
+    if (node.matrix.size() == 16) {
+      // GLTF matrices are column-major, the same as GLM
+      transform = glm::mat4(
+        node.matrix[0],
+        node.matrix[1],
+        node.matrix[2],
+        node.matrix[3],
+        node.matrix[4],
+        node.matrix[5],
+        node.matrix[6],
+        node.matrix[7],
+        node.matrix[8],
+        node.matrix[9],
+        node.matrix[10],
+        node.matrix[11],
+        node.matrix[12],
+        node.matrix[13],
+        node.matrix[14],
+        node.matrix[15]);
+    } else {
+      // Build transform from TRS components
+      glm::mat4 translation = glm::mat4(1.0f);
+      glm::mat4 rotation = glm::mat4(1.0f);
+      glm::mat4 scale = glm::mat4(1.0f);
+
+      // Translation
+      if (node.translation.size() == 3) {
+        translation = glm::translate(glm::mat4(1.0f),
+                                     glm::vec3(
+                                       static_cast<float>(node.translation[0]),
+                                       static_cast<float>(node.translation[1]),
+                                       static_cast<float>(node.translation[2])));
+      }
+
+      // Rotation (quaternion)
+      if (node.rotation.size() == 4) {
+        glm::quat quat(
+          static_cast<float>(node.rotation[3]),
+          // w
+          static_cast<float>(node.rotation[0]),
+          // x
+          static_cast<float>(node.rotation[1]),
+          // y
+          static_cast<float>(node.rotation[2]) // z
+        );
+        rotation = glm::mat4_cast(quat);
+      }
+
+      // Scale
+      if (node.scale.size() == 3) {
+        scale = glm::scale(glm::mat4(1.0f),
+                           glm::vec3(
+                             static_cast<float>(node.scale[0]),
+                             static_cast<float>(node.scale[1]),
+                             static_cast<float>(node.scale[2])));
+      }
+
+      // Combine: T * R * S
+      transform = translation * rotation * scale;
+    }
+
+    return transform;
+  };
+
+  // Recursive function to traverse scene hierarchy
+  std::function < void(int, const glm::mat4 &) > traverseNode = [&](int nodeIndex, const glm::mat4& parentTransform) {
+    if (nodeIndex < 0 || nodeIndex >= gltfModel.nodes.size()) {
+      return;
+    }
+
+    const tinygltf::Node& node = gltfModel.nodes[nodeIndex];
+
+    // Calculate this node's transform
+    glm::mat4 nodeTransform = calculateNodeTransform(node);
+    nodeLocalTransforms[nodeIndex] = nodeTransform;
+
+    // Capture decomposed TRS for animation
+    if (node.translation.size() == 3) {
+        nodeLocalTranslations[nodeIndex] = glm::vec3(node.translation[0], node.translation[1], node.translation[2]);
+    } else {
+        nodeLocalTranslations[nodeIndex] = glm::vec3(0.0f);
+    }
+    if (node.rotation.size() == 4) {
+        nodeLocalRotations[nodeIndex] = glm::quat(static_cast<float>(node.rotation[3]), static_cast<float>(node.rotation[0]), 
+                                                 static_cast<float>(node.rotation[1]), static_cast<float>(node.rotation[2]));
+    } else {
+        nodeLocalRotations[nodeIndex] = glm::quat(1.0f, 0.0f, 0.0f, 0.0f);
+    }
+    if (node.scale.size() == 3) {
+        nodeLocalScales[nodeIndex] = glm::vec3(node.scale[0], node.scale[1], node.scale[2]);
+    } else {
+        nodeLocalScales[nodeIndex] = glm::vec3(1.0f);
+    }
+
+    glm::mat4 worldTransform = parentTransform * nodeTransform;
+
+    // Capture children info
+    if (!node.children.empty()) {
+        nodeChildren[nodeIndex] = node.children;
+    }
+
+    // If this node has a mesh, add the transform to the instances list
+    if (node.mesh >= 0 && node.mesh < gltfModel.meshes.size()) {
+      meshInstanceTransforms[node.mesh].push_back(worldTransform);
+      if (node.skin >= 0) {
+        nodeSkins[nodeIndex] = node.skin;
+        std::cout << "[Animation] Node " << nodeIndex << " has skin " << node.skin << std::endl;
+      }
+    }
+
+    // If this node is animated or has a skin, capture its world transform and mesh reference
+    if (animatedNodeIndices.contains(nodeIndex) || node.skin >= 0) {
+      animatedNodeTransforms[nodeIndex] = worldTransform;
+      if (node.mesh >= 0) {
+        animatedNodeMeshes[nodeIndex] = node.mesh;
+        std::cout << "[Animation] Captured transform for animated node " << nodeIndex
+            << " (" << node.name << ") with mesh " << node.mesh << std::endl;
+      } else {
+        std::cout << "[Animation] Captured transform for animated node " << nodeIndex
+            << " (" << node.name << ") - no mesh" << std::endl;
+      }
+    }
+
+    // Recursively process children
+    for (int childIndex : node.children) {
+      traverseNode(childIndex, worldTransform);
+    }
+  };
+
+  // Process all scenes (typically there's only one default scene)
+  if (!gltfModel.scenes.empty()) {
+    int defaultScene = gltfModel.defaultScene >= 0 ? gltfModel.defaultScene : 0;
+    if (defaultScene < gltfModel.scenes.size()) {
+      const tinygltf::Scene& scene = gltfModel.scenes[defaultScene];
+
+      // Traverse all root nodes in the scene
+      for (int rootNodeIndex : scene.nodes) {
+        rootNodes.push_back(rootNodeIndex);
+        traverseNode(rootNodeIndex, glm::mat4(1.0f));
+      }
+    }
+  }
+
+  // Store hierarchy info in the model
+  auto& advanced = GetAdvancedModelData(model);
+  advanced.nodeChildren = nodeChildren;
+  advanced.nodeLocalTransforms = nodeLocalTransforms;
+  advanced.nodeLocalTranslations = nodeLocalTranslations;
+  advanced.nodeLocalRotations = nodeLocalRotations;
+  advanced.nodeLocalScales = nodeLocalScales;
+  advanced.rootNodes = rootNodes;
+  advanced.nodeSkins = nodeSkins;
+
+  // Store animated node transforms in the model for use by AnimationComponent
+  if (!animatedNodeTransforms.empty()) {
+    model->SetAnimatedNodeTransforms(animatedNodeTransforms);
+    std::cout << "[Animation] Stored " << animatedNodeTransforms.size()
+        << " animated node transform(s) in model" << std::endl;
+  }
+
+  // Store animated node mesh mappings for linking geometry entities to animations
+  if (!animatedNodeMeshes.empty()) {
+    model->SetAnimatedNodeMeshes(animatedNodeMeshes);
+    std::cout << "[Animation] Stored " << animatedNodeMeshes.size()
+        << " animated node mesh mapping(s) in model" << std::endl;
+  }
+
+  std::map<std::string, MaterialMesh> geometryMaterialMeshMap; // Map from geometry+material hash to unique MaterialMesh
+
+  struct DeformableData {
+      bool isDeformable = false;
+      std::vector<glm::uvec4> jointIndices;
+      std::vector<glm::vec4> jointWeights;
+      int numMorphTargets = 0;
+      std::vector<std::vector<glm::vec3>> morphPositions;
+  };
+  std::map<std::string, DeformableData> geometryDeformableDataMap;
+
+  // Helper function to create a geometry hash for deduplication
+  auto createGeometryHash = [](const tinygltf::Primitive& primitive, int materialIndex) -> std::string {
+    std::string hash = "mat_" + std::to_string(materialIndex);
+
+    // Add primitive attribute hashes to ensure unique geometry identification
+    if (primitive.indices >= 0) {
+      hash += "_idx_" + std::to_string(primitive.indices);
+    }
+
+    for (const auto& [attrName, type] : primitive.attributes) {
+      hash += "_" + attrName + "_" + std::to_string(type);
+    }
+
+    return hash;
+  };
+
+  // Process all meshes with improved instancing support
+  for (size_t meshIndex = 0; meshIndex < gltfModel.meshes.size(); ++meshIndex) {
+    const auto& mesh = gltfModel.meshes[meshIndex];
+
+    // Check if this mesh has instances
+    auto instanceIt = meshInstanceTransforms.find(static_cast<int>(meshIndex));
+    std::vector<glm::mat4> instances;
+
+    if (instanceIt == meshInstanceTransforms.end() || instanceIt->second.empty()) {
+      instances.emplace_back(1.0f); // Identity transform at origin
+    } else {
+      instances = instanceIt->second;
+    }
+
+    // Process each primitive (material group) in this mesh
+    for (const auto& primitive : mesh.primitives) {
+      // Get the material index for this primitive
+      int materialIndex = primitive.material;
+      if (materialIndex < 0) {
+        materialIndex = -1; // Use -1 for primitives without materials
+      }
+
+      // Create a unique geometry hash for this primitive and material combination
+      std::string geometryHash = createGeometryHash(primitive, materialIndex);
+
+      // Use try_emplace to efficiently insert if not present and get reference
+      auto [it, inserted] = geometryMaterialMeshMap.try_emplace(geometryHash);
+
+      if (inserted) {
+        // New entry was created - initialize it
+        MaterialMesh& materialMesh = it->second;
+        materialMesh.materialIndex = materialIndex;
+        materialMesh.globalMaterialIndex = (materialIndex >= 0)
+            ? static_cast<int>(static_cast<uint32_t>(materialIndex) + m_currentModelMaterialBase)
+            : -1;
+        materialMesh.sourceMeshIndex = static_cast<int>(meshIndex); // Track source mesh for animations
+        
+        // Set number of morph targets
+        int numTargets = static_cast<int>(primitive.targets.size());
+        if (numTargets > 0) {
+          std::vector<std::vector<glm::vec3>> morphPositions(numTargets);
+          for (int t = 0; t < numTargets; ++t) {
+            const auto& target = primitive.targets[t];
+            auto posIt = target.find("POSITION");
+            if (posIt != target.end()) {
+              const tinygltf::Accessor& acc = gltfModel.accessors[posIt->second];
+              const tinygltf::BufferView& bv = gltfModel.bufferViews[acc.bufferView];
+              const tinygltf::Buffer& buf = gltfModel.buffers[bv.buffer];
+              const float* data = reinterpret_cast<const float*>(&buf.data[bv.byteOffset + acc.byteOffset]);
+              morphPositions[t].resize(acc.count);
+              for (size_t i = 0; i < acc.count; ++i) {
+                morphPositions[t][i] = glm::vec3(data[i*3], data[i*3+1], data[i*3+2]);
+              }
+            }
+          }
+          
+          auto& defData = geometryDeformableDataMap[geometryHash];
+          defData.numMorphTargets = numTargets;
+          defData.morphPositions = std::move(morphPositions);
+
+          std::cout << "      Extracted " << numTargets << " morph target(s) for material " << materialMesh.materialName << std::endl;
+        }
+
+        // Set material name
+        if (materialIndex >= 0 && materialIndex < gltfModel.materials.size()) {
+          const auto& gltfMaterial = gltfModel.materials[materialIndex];
+          materialMesh.materialName = gltfMaterial.name.empty() ? ("material_" + std::to_string(materialIndex)) : gltfMaterial.name;
+        } else {
+          materialMesh.materialName = "no_material";
+        }
+      }
+
+      MaterialMesh& materialMesh = it->second;
+
+      // Only process geometry if this MaterialMesh is empty (first time processing this geometry)
+      if (materialMesh.vertices.empty()) {
+        auto vertexOffsetInMaterialMesh = static_cast<uint32_t>(materialMesh.vertices.size());
+
+        // Get the position accessor, which defines the vertex count.
+        auto posIt = primitive.attributes.find("POSITION");
+        if (posIt == primitive.attributes.end())
+          continue;
+        const tinygltf::Accessor& posAccessor = gltfModel.accessors[posIt->second];
+
+        // Get indices for this primitive (your existing code is correct)
+        if (primitive.indices >= 0) {
+          const tinygltf::Accessor& indexAccessor = gltfModel.accessors[primitive.indices];
+          const tinygltf::BufferView& indexBufferView = gltfModel.bufferViews[indexAccessor.bufferView];
+          const tinygltf::Buffer& indexBuffer = gltfModel.buffers[indexBufferView.buffer];
+          const void* indexData = &indexBuffer.data[indexBufferView.byteOffset + indexAccessor.byteOffset];
+          if (indexAccessor.componentType == TINYGLTF_COMPONENT_TYPE_UNSIGNED_SHORT) {
+            const auto* buf = static_cast<const uint16_t *>(indexData);
+            for (size_t i = 0; i < indexAccessor.count; ++i) {
+              materialMesh.indices.push_back(buf[i] + vertexOffsetInMaterialMesh);
+            }
+          } else if (indexAccessor.componentType == TINYGLTF_COMPONENT_TYPE_UNSIGNED_INT) {
+            const auto* buf = static_cast<const uint32_t *>(indexData);
+            for (size_t i = 0; i < indexAccessor.count; ++i) {
+              materialMesh.indices.push_back(buf[i] + vertexOffsetInMaterialMesh);
+            }
+          }
+        } else {
+          // Generate sequential indices for non-indexed geometry
+          std::cout << "      Generating sequential indices for non-indexed primitive of material " << materialMesh.materialName << std::endl;
+          for (uint32_t i = 0; i < static_cast<uint32_t>(posAccessor.count); ++i) {
+            materialMesh.indices.push_back(i + vertexOffsetInMaterialMesh);
+          }
+        }
+
+        // --- START: FINAL SAFE AND CORRECT VERTEX LOADING ---
+
+        // Get data pointers and strides for all available attributes ONCE before the loop.
+        const tinygltf::BufferView& posBufferView = gltfModel.bufferViews[posAccessor.bufferView];
+        const tinygltf::Buffer& buffer = gltfModel.buffers[posBufferView.buffer];
+        const unsigned char* pPositions = &buffer.data[posBufferView.byteOffset + posAccessor.byteOffset];
+        const size_t posByteStride = posBufferView.byteStride == 0 ? sizeof(glm::vec3) : posBufferView.byteStride;
+
+        const unsigned char* pNormals = nullptr;
+        size_t normalByteStride = 0;
+        auto normalIt = primitive.attributes.find("NORMAL");
+        if (normalIt != primitive.attributes.end()) {
+          const tinygltf::Accessor& normalAccessor = gltfModel.accessors[normalIt->second];
+          const tinygltf::BufferView& normalBufferView = gltfModel.bufferViews[normalAccessor.bufferView];
+          pNormals = &gltfModel.buffers[normalBufferView.buffer].data[normalBufferView.byteOffset + normalAccessor.byteOffset];
+          normalByteStride = normalBufferView.byteStride == 0 ? sizeof(glm::vec3) : normalBufferView.byteStride;
+        }
+
+        const unsigned char* pTexCoords = nullptr;
+        size_t texCoordByteStride = 0;
+        auto texCoordIt = primitive.attributes.find("TEXCOORD_0");
+        if (texCoordIt != primitive.attributes.end()) {
+          const tinygltf::Accessor& texCoordAccessor = gltfModel.accessors[texCoordIt->second];
+          const tinygltf::BufferView& texCoordBufferView = gltfModel.bufferViews[texCoordAccessor.bufferView];
+          pTexCoords = &gltfModel.buffers[texCoordBufferView.buffer].data[texCoordBufferView.byteOffset + texCoordAccessor.byteOffset];
+          texCoordByteStride = texCoordBufferView.byteStride == 0 ? sizeof(glm::vec2) : texCoordBufferView.byteStride;
+        }
+
+        const unsigned char* pTangents = nullptr;
+        size_t tangentByteStride = 0;
+        auto tangentIt = primitive.attributes.find("TANGENT");
+        bool hasTangents = (tangentIt != primitive.attributes.end());
+        
+        std::vector<glm::uvec4> materialMeshJointIndices;
+        std::vector<glm::vec4> materialMeshJointWeights;
+        materialMeshJointIndices.reserve(posAccessor.count);
+        materialMeshJointWeights.reserve(posAccessor.count);
+        if (hasTangents) {
+          const tinygltf::Accessor& tangentAccessor = gltfModel.accessors[tangentIt->second];
+          const tinygltf::BufferView& tangentBufferView = gltfModel.bufferViews[tangentAccessor.bufferView];
+          pTangents = &gltfModel.buffers[tangentBufferView.buffer].data[tangentBufferView.byteOffset + tangentAccessor.byteOffset];
+          tangentByteStride = tangentBufferView.byteStride == 0 ? sizeof(glm::vec4) : tangentBufferView.byteStride;
+        }
+
+        const unsigned char* pJoints = nullptr;
+        size_t jointsByteStride = 0;
+        int jointsComponentType = 0;
+        auto jointsIt = primitive.attributes.find("JOINTS_0");
+        if (jointsIt != primitive.attributes.end()) {
+          const tinygltf::Accessor& jointsAccessor = gltfModel.accessors[jointsIt->second];
+          const tinygltf::BufferView& jointsBufferView = gltfModel.bufferViews[jointsAccessor.bufferView];
+          pJoints = &gltfModel.buffers[jointsBufferView.buffer].data[jointsBufferView.byteOffset + jointsAccessor.byteOffset];
+          jointsByteStride = jointsBufferView.byteStride == 0 ? (jointsAccessor.componentType == TINYGLTF_COMPONENT_TYPE_UNSIGNED_SHORT ? 4 * sizeof(uint16_t) : 4 * sizeof(uint8_t)) : jointsBufferView.byteStride;
+          jointsComponentType = jointsAccessor.componentType;
+        }
+
+        const unsigned char* pWeights = nullptr;
+        size_t weightsByteStride = 0;
+        auto weightsIt = primitive.attributes.find("WEIGHTS_0");
+        if (weightsIt != primitive.attributes.end()) {
+          const tinygltf::Accessor& weightsAccessor = gltfModel.accessors[weightsIt->second];
+          const tinygltf::BufferView& weightsBufferView = gltfModel.bufferViews[weightsAccessor.bufferView];
+          pWeights = &gltfModel.buffers[weightsBufferView.buffer].data[weightsBufferView.byteOffset + weightsAccessor.byteOffset];
+          weightsByteStride = weightsBufferView.byteStride == 0 ? 4 * sizeof(float) : weightsBufferView.byteStride;
+        }
+
+        // Append vertices for this primitive preserving prior vertices
+        size_t baseVertex = materialMesh.vertices.size();
+        materialMesh.vertices.resize(baseVertex + posAccessor.count);
+
+        // Use a SINGLE, SAFE loop to load all vertex data.
+        for (size_t i = 0; i < posAccessor.count; ++i) {
+          auto& v = materialMesh.vertices[baseVertex + i];
+
+          v.position = *reinterpret_cast<const glm::vec3 *>(pPositions + i * posByteStride);
+
+          if (pNormals) {
+            v.normal = *reinterpret_cast<const glm::vec3 *>(pNormals + i * normalByteStride);
+          } else {
+            v.normal = glm::vec3(0.0f, 0.0f, 1.0f);
+          }
+          // Normalize normals to ensure consistent magnitude
+          if (glm::dot(v.normal, v.normal) > 0.0f) {
+            v.normal = glm::normalize(v.normal);
+          } else {
+            v.normal = glm::vec3(0.0f, 0.0f, 1.0f);
+          }
+
+          if (pTexCoords) {
+            v.texCoord = *reinterpret_cast<const glm::vec2 *>(pTexCoords + i * texCoordByteStride);
+          } else {
+            v.texCoord = glm::vec2(0.0f, 0.0f);
+          }
+
+          if (hasTangents && pTangents) {
+            // Load glTF tangent and ensure it is normalized and orthogonal to the normal.
+            glm::vec4 t4 = *reinterpret_cast<const glm::vec4 *>(pTangents + i * tangentByteStride);
+            glm::vec3 T = glm::vec3(t4);
+            // Normalize tangent and make it orthogonal to normal to avoid skewed TBN
+            if (glm::dot(T, T) > 0.0f) {
+              T = glm::normalize(T);
+              T = glm::normalize(T - v.normal * glm::dot(v.normal, T));
+            } else {
+              T = glm::vec3(1.0f, 0.0f, 0.0f);
+            }
+            float w = (t4.w >= 0.0f) ? 1.0f : -1.0f; // clamp handedness to +/-1
+            v.tangent = glm::vec4(T, w);
+          } else {
+            // No tangents in source: use a safe default tangent (T=+X, handedness=+1)
+            v.tangent = glm::vec4(1.0f, 0.0f, 0.0f, 1.0f);
+          }
+
+          if (pJoints) {
+            if (jointsComponentType == TINYGLTF_COMPONENT_TYPE_UNSIGNED_SHORT) {
+              const uint16_t* j = reinterpret_cast<const uint16_t *>(pJoints + i * jointsByteStride);
+              materialMeshJointIndices.push_back(glm::uvec4(j[0], j[1], j[2], j[3]));
+            } else {
+              const uint8_t* j = reinterpret_cast<const uint8_t *>(pJoints + i * jointsByteStride);
+              materialMeshJointIndices.push_back(glm::uvec4(j[0], j[1], j[2], j[3]));
+            }
+          } else {
+            materialMeshJointIndices.push_back(glm::uvec4(0));
+          }
+
+          if (pWeights) {
+            materialMeshJointWeights.push_back(*reinterpret_cast<const glm::vec4 *>(pWeights + i * weightsByteStride));
+          } else {
+            materialMeshJointWeights.push_back(glm::vec4(0.0f));
+          }
+        }
+        
+        if (pJoints || pWeights || !primitive.targets.empty()) {
+            auto& defData = geometryDeformableDataMap[geometryHash];
+            defData.isDeformable = true;
+            if (pJoints || pWeights) {
+                defData.jointIndices = materialMeshJointIndices;
+                defData.jointWeights = materialMeshJointWeights;
+            } else {
+                // For morph-only meshes, provide identity skinning data
+                defData.jointIndices.assign(posAccessor.count, glm::uvec4(0));
+                defData.jointWeights.assign(posAccessor.count, glm::vec4(1.0f, 0.0f, 0.0f, 0.0f));
+            }
+        }
+
+        // AFTER the mesh is fully built, generate tangents via MikkTSpace ONLY if the source mesh lacks glTF tangents.
+        if (!hasTangents) {
+          if (pNormals && pTexCoords && !materialMesh.indices.empty()) {
+            MikkTSpaceInterface mikkInterface;
+            mikkInterface.vertices = &materialMesh.vertices;
+            mikkInterface.indices = &materialMesh.indices;
+
+            SMikkTSpaceInterface sm_interface{};
+            sm_interface.m_getNumFaces = getNumFaces;
+            sm_interface.m_getNumVerticesOfFace = getNumVerticesOfFace;
+            sm_interface.m_getPosition = getPosition;
+            sm_interface.m_getNormal = getNormal;
+            sm_interface.m_getTexCoord = getTexCoord;
+            sm_interface.m_setTSpaceBasic = setTSpaceBasic;
+
+            SMikkTSpaceContext mikk_context{};
+            mikk_context.m_pInterface = &sm_interface;
+            mikk_context.m_pUserData = &mikkInterface;
+
+            if (genTangSpaceDefault(&mikk_context)) {
+              std::cout << "      Generated tangents (MikkTSpace) for material: " << materialMesh.materialName << std::endl;
+            } else {
+              std::cerr << "      Failed to generate tangents for material: " << materialMesh.materialName << std::endl;
+            }
+          } else {
+            std::cout << "      Skipping tangent generation (missing normals, UVs, or indices) for material: " << materialMesh.materialName << std::endl;
+          }
+        } else {
+          std::cout << "      Using glTF-provided tangents for material: " << materialMesh.materialName << std::endl;
+        }
+        // --- END: FINAL SAFE AND CORRECT VERTEX LOADING ---
+      }
+
+      // Add all instances to this MaterialMesh (both new and existing geometry)
+      for (const glm::mat4& instanceTransform : instances) {
+        materialMesh.AddInstance(instanceTransform, static_cast<uint32_t>(materialIndex));
+      }
+    }
+  }
+
+  // Convert geometry-based material mesh map to vector
+  std::vector<MaterialMesh> modelMaterialMeshes;
+  std::vector<std::string> modelMaterialMeshHashes;
+  modelMaterialMeshes.reserve(geometryMaterialMeshMap.size());
+  modelMaterialMeshHashes.reserve(geometryMaterialMeshMap.size());
+  for (auto& kv : geometryMaterialMeshMap) {
+    modelMaterialMeshHashes.push_back(kv.first);
+    modelMaterialMeshes.push_back(std::move(kv.second));
+  }
+
+  // Process texture loading for each MaterialMesh
+  std::vector<Vertex> combinedVertices;
+  std::vector<uint32_t> combinedIndices;
+
+  // Reserve space for combined mesh data to avoid reallocations
+  size_t totalVertices = 0;
+  size_t totalIndices = 0;
+  for (const auto& materialMesh : modelMaterialMeshes) {
+    if (!materialMesh.instances.empty()) {
+      totalVertices += materialMesh.vertices.size();
+      totalIndices += materialMesh.indices.size();
+    }
+  }
+  combinedVertices.reserve(totalVertices);
+  combinedIndices.reserve(totalIndices);
+
+  // Process texture loading for each MaterialMesh
+  for (auto& materialMesh : modelMaterialMeshes) {
+    int materialIndex = materialMesh.materialIndex;
+
+    // Get ALL texture paths for this material (same as ParseGLTFDataOnly)
+    if (materialIndex >= 0 && materialIndex < gltfModel.materials.size()) {
+      const auto& gltfMaterial = gltfModel.materials[materialIndex];
+
+      // Extract base color texture
+      if (gltfMaterial.pbrMetallicRoughness.baseColorTexture.index >= 0) {
+        int texIndex = gltfMaterial.pbrMetallicRoughness.baseColorTexture.index;
+        if (texIndex < gltfModel.textures.size()) {
+          const auto& texture = gltfModel.textures[texIndex];
+          int imageIndex = -1;
+          if (texture.source >= 0 && texture.source < gltfModel.images.size()) {
+            imageIndex = texture.source;
+          } else {
+            auto extIt = texture.extensions.find("KHR_texture_basisu");
+            if (extIt != texture.extensions.end()) {
+              const tinygltf::Value& ext = extIt->second;
+              if (ext.Has("source") && ext.Get("source").IsInt()) {
+                int src = ext.Get("source").Get<int>();
+                if (src >= 0 && src < static_cast<int>(gltfModel.images.size())) {
+                  imageIndex = src;
+                }
+              }
+            }
+          }
+          if (imageIndex >= 0) {
+            std::string textureId = "gltf_baseColor_" + std::to_string(texIndex);
+            materialMesh.baseColorTexturePath = textureId;
+            materialMesh.texturePath = textureId; // Keep for backward compatibility (now baseColor‑tagged)
+
+            // Load texture data (embedded or external) with caching
+            const auto& image = gltfModel.images[imageIndex];
+            if (!image.image.empty()) {
+              if (!loadedTextures.contains(textureId)) {
+                renderer->LoadTextureFromMemoryAsync(textureId, image.image.data(), image.width, image.height, image.component, false);
+                loadedTextures.insert(textureId);
+              }
+            } else {
+              std::cerr << "      Warning: No decoded bytes for baseColor texture index " << texIndex << std::endl;
+            }
+          }
+        }
+      } else {
+        // Since texture indices are -1, try to find external texture files by material name
+        std::string materialName = materialMesh.materialName;
+
+        // Look for external texture files that match this specific material (case-insensitive)
+        for (const auto& image : gltfModel.images) {
+          if (!image.uri.empty()) {
+            std::string imageUri = image.uri;
+            // Lowercase copies for robust matching
+            std::string imageUriLower = imageUri;
+            std::ranges::transform(imageUriLower, imageUriLower.begin(), [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+            std::string materialNameLower = materialName;
+            std::ranges::transform(materialNameLower, materialNameLower.begin(), [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+
+            // Check if this image belongs to this specific material based on naming patterns
+            // Look for basecolor/albedo/diffuse textures that match the material name
+            if ((imageUriLower.find("basecolor") != std::string::npos ||
+                imageUriLower.find("albedo") != std::string::npos ||
+                imageUriLower.find("diffuse") != std::string::npos) &&
+              (imageUriLower.find(materialNameLower) != std::string::npos ||
+                materialNameLower.find(imageUriLower.substr(0, imageUriLower.find('_'))) != std::string::npos)) {
+              // Use the relative path from the GLTF directory
+              std::string textureId = baseTexturePath + imageUri;
+              if (!image.image.empty()) {
+                renderer->LoadTextureFromMemoryAsync(textureId, image.image.data(), image.width, image.height, image.component);
+                materialMesh.baseColorTexturePath = textureId;
+                materialMesh.texturePath = textureId;
+              } else {
+                // Fallback: offload KTX2 file load to renderer worker threads
+                renderer->LoadTextureAsync(textureId, false);
+                materialMesh.baseColorTexturePath = textureId;
+                materialMesh.texturePath = textureId;
+              }
+              break;
+            }
+          }
+        }
+      }
+
+      // Extract normal texture
+      if (gltfMaterial.normalTexture.index >= 0) {
+        int texIndex = gltfMaterial.normalTexture.index;
+        if (texIndex < gltfModel.textures.size()) {
+          const auto& texture = gltfModel.textures[texIndex];
+          if (texture.source >= 0 && texture.source < gltfModel.images.size()) {
+            std::string textureId = "gltf_texture_" + std::to_string(texIndex);
+            materialMesh.normalTexturePath = textureId;
+
+            // Load texture data (embedded or external)
+            const auto& image = gltfModel.images[texture.source];
+            if (!image.image.empty()) {
+              // Load embedded texture data
+              renderer->LoadTextureFromMemoryAsync(textureId, image.image.data(), image.width, image.height, image.component);
+            } else if (!image.uri.empty()) {
+              // Fallback: offload KTX2 normal map load to renderer worker threads
+              std::string filePath = baseTexturePath + image.uri;
+              renderer->RegisterTextureAlias(textureId, filePath);
+              renderer->LoadTextureAsync(filePath);
+              materialMesh.normalTexturePath = textureId;
+            } else {
+              std::cerr << "    Warning: No decoded bytes for normal texture index " << texIndex << std::endl;
+            }
+          }
+        }
+      } else {
+        // Heuristic: search images for a normal texture for this material and load from memory
+        std::string materialName = materialMesh.materialName;
+        for (const auto& image : gltfModel.images) {
+          if (!image.uri.empty()) {
+            std::string imageUri = image.uri;
+            if (imageUri.find("Normal") != std::string::npos &&
+              (imageUri.find(materialName) != std::string::npos ||
+                materialName.find(imageUri.substr(0, imageUri.find('_'))) != std::string::npos)) {
+              std::string textureId = baseTexturePath + imageUri;
+              if (!image.image.empty()) {
+                renderer->LoadTextureFromMemoryAsync(textureId, image.image.data(), image.width, image.height, image.component);
+                materialMesh.normalTexturePath = textureId;
+              } else {
+                std::cerr << "      Warning: Heuristic normal image has no decoded bytes: " << imageUri << std::endl;
+              }
+              break;
+            }
+          }
+        }
+      }
+
+      // Extract metallic-roughness texture
+      if (gltfMaterial.pbrMetallicRoughness.metallicRoughnessTexture.index >= 0) {
+        int texIndex = gltfMaterial.pbrMetallicRoughness.metallicRoughnessTexture.index;
+        if (texIndex < gltfModel.textures.size()) {
+          const auto& texture = gltfModel.textures[texIndex];
+          if (texture.source >= 0 && texture.source < gltfModel.images.size()) {
+            std::string textureId = "gltf_texture_" + std::to_string(texIndex);
+            materialMesh.metallicRoughnessTexturePath = textureId;
+
+            // Load texture data (embedded or external)
+            const auto& image = gltfModel.images[texture.source];
+            if (!image.image.empty()) {
+              renderer->LoadTextureFromMemoryAsync(textureId, image.image.data(), image.width, image.height, image.component);
+              materialMesh.metallicRoughnessTexturePath = textureId;
+            } else {
+              std::cerr << "      Warning: No decoded bytes for metallic-roughness texture index " << texIndex << std::endl;
+            }
+          }
+        }
+      } else {
+        // Look for external metallic-roughness texture files that match this specific material
+        std::string materialName = materialMesh.materialName;
+        for (const auto& image : gltfModel.images) {
+          if (!image.uri.empty()) {
+            std::string imageUri = image.uri;
+            if ((imageUri.find("Metallic") != std::string::npos ||
+                imageUri.find("Roughness") != std::string::npos ||
+                imageUri.find("Specular") != std::string::npos) &&
+              (imageUri.find(materialName) != std::string::npos ||
+                materialName.find(imageUri.substr(0, imageUri.find('_'))) != std::string::npos)) {
+              std::string texturePath = baseTexturePath + imageUri;
+              materialMesh.metallicRoughnessTexturePath = texturePath;
+              std::cout << "      Found external metallic-roughness texture for " << materialName << ": " << texturePath << std::endl;
+              break;
+            }
+          }
+        }
+      }
+
+      // Extract occlusion texture
+      if (gltfMaterial.occlusionTexture.index >= 0) {
+        int texIndex = gltfMaterial.occlusionTexture.index;
+        if (texIndex < gltfModel.textures.size()) {
+          const auto& texture = gltfModel.textures[texIndex];
+          if (texture.source >= 0 && texture.source < gltfModel.images.size()) {
+            std::string textureId = "gltf_texture_" + std::to_string(texIndex);
+            materialMesh.occlusionTexturePath = textureId;
+
+            // Load texture data (embedded or external)
+            const auto& image = gltfModel.images[texture.source];
+            if (!image.image.empty()) {
+              if (renderer->LoadTextureFromMemory(textureId,
+                                                  image.image.data(),
+                                                  image.width,
+                                                  image.height,
+                                                  image.component)) {
+                materialMesh.occlusionTexturePath = textureId;
+                std::cout << "      Loaded occlusion texture from memory: " << textureId
+                    << " (" << image.width << "x" << image.height << ")" << std::endl;
+              } else {
+                std::cerr << "      Failed to load occlusion texture from memory: " << textureId << std::endl;
+              }
+            } else {
+              std::cerr << "      Warning: No decoded bytes for occlusion texture index " << texIndex << std::endl;
+            }
+          }
+        }
+      } else {
+        // Heuristic: search images for an occlusion texture for this material and load from memory
+        std::string materialName = materialMesh.materialName;
+        for (const auto& image : gltfModel.images) {
+          if (!image.uri.empty()) {
+            std::string imageUri = image.uri;
+            if ((imageUri.find("Occlusion") != std::string::npos ||
+                imageUri.find("AO") != std::string::npos) &&
+              (imageUri.find(materialName) != std::string::npos ||
+                materialName.find(imageUri.substr(0, imageUri.find('_'))) != std::string::npos)) {
+              std::string textureId = baseTexturePath + imageUri;
+              if (!image.image.empty()) {
+                renderer->LoadTextureFromMemoryAsync(textureId, image.image.data(), image.width, image.height, image.component);
+                materialMesh.occlusionTexturePath = textureId;
+              } else {
+                std::cerr << "      Warning: Heuristic occlusion image has no decoded bytes: " << imageUri << std::endl;
+              }
+              break;
+            }
+          }
+        }
+      }
+
+      // Extract emissive texture
+      if (gltfMaterial.emissiveTexture.index >= 0) {
+        int texIndex = gltfMaterial.emissiveTexture.index;
+        if (texIndex < gltfModel.textures.size()) {
+          const auto& texture = gltfModel.textures[texIndex];
+          if (texture.source >= 0 && texture.source < gltfModel.images.size()) {
+            std::string textureId = "gltf_texture_" + std::to_string(texIndex);
+            materialMesh.emissiveTexturePath = textureId;
+
+            // Load texture data (embedded or external)
+            const auto& image = gltfModel.images[texture.source];
+            if (!image.image.empty()) {
+              // Load embedded texture data
+              renderer->LoadTextureFromMemoryAsync(textureId, image.image.data(), image.width, image.height, image.component);
+            } else if (!image.uri.empty()) {
+              // Record external texture file path (loaded later by renderer)
+              std::string texturePath = baseTexturePath + image.uri;
+              materialMesh.emissiveTexturePath = texturePath;
+            }
+          }
+        }
+      } else {
+        // Look for external emissive texture files that match this specific material
+        std::string materialName = materialMesh.materialName;
+        for (const auto& image : gltfModel.images) {
+          if (!image.uri.empty()) {
+            std::string imageUri = image.uri;
+            if ((imageUri.find("Emissive") != std::string::npos ||
+                imageUri.find("Emission") != std::string::npos) &&
+              (imageUri.find(materialName) != std::string::npos ||
+                materialName.find(imageUri.substr(0, imageUri.find('_'))) != std::string::npos)) {
+              std::string texturePath = baseTexturePath + imageUri;
+              materialMesh.emissiveTexturePath = texturePath;
+              break;
+            }
+          }
+        }
+      }
+    }
+
+    // Add to combined mesh for backward compatibility (keep vertices in an original coordinate system)
+    if (!materialMesh.instances.empty()) {
+      size_t vertexOffset = combinedVertices.size();
+
+      // Instance transforms should be handled by the instancing system, not applied to vertex data
+      for (const auto& vertex : materialMesh.vertices) {
+        // Use vertices as-is without any transformation
+        combinedVertices.push_back(vertex);
+      }
+
+      for (uint32_t index : materialMesh.indices) {
+        combinedIndices.push_back(index + static_cast<uint32_t>(vertexOffset));
+      }
+    }
+  }
+
+  // Store material meshes for this model
+  materialMeshes[filename] = modelMaterialMeshes;
+  
+  // Re-apply deformable/skinning/morph data to the final MaterialMesh objects (now that their addresses are stable)
+  auto& finalMeshes = materialMeshes[filename];
+  for (size_t i = 0; i < finalMeshes.size(); ++i) {
+      const std::string& hash = modelMaterialMeshHashes[i];
+      auto it = geometryDeformableDataMap.find(hash);
+      if (it != geometryDeformableDataMap.end()) {
+          SetMaterialMeshDeformable(&finalMeshes[i], it->second.isDeformable);
+          SetMaterialMeshJointsAndWeights(&finalMeshes[i], it->second.jointIndices, it->second.jointWeights);
+          SetMaterialMeshMorphTargetCount(&finalMeshes[i], it->second.numMorphTargets);
+          if (it->second.numMorphTargets > 0) {
+              SetMaterialMeshMorphPositions(&finalMeshes[i], it->second.morphPositions);
+          }
+      }
+  }
+
+  // Set the combined mesh data in the model for backward compatibility
+  model->SetVertices(combinedVertices);
+  model->SetIndices(combinedIndices);
+
+  // Extract lights from the GLTF model
+  std::cout << "Extracting lights from GLTF model..." << std::endl;
+
+  // Extract punctual lights (KHR_lights_punctual extension)
+  if (ExtractPunctualLights(gltfModel, filename)) {
+    std::cerr << "Warning: Failed to extract punctual lights from " << filename << std::endl;
+  }
+
+  std::cout << "GLTF model loaded successfully with " << combinedVertices.size() << " vertices and " << combinedIndices.size() << " indices" << std::endl;
+  return true;
+}
+
+// Returns every light known for `modelName`:
+//   1. the punctual lights previously stored in `extractedLights` for that model, followed by
+//   2. one synthetic "emissive" light per instance of every material mesh whose material emits light.
+//
+// Emissive lights are placed at the (instance-transformed) vertex centroid of the mesh, point along
+// the (instance-transformed) average vertex normal, and get a range/intensity derived from the
+// local bounding-box diagonal. A mesh without instances yields a single light in local space.
+//
+// @param modelName Key used to look up both `extractedLights` and `materialMeshes`.
+// @return A copy of the combined light list (empty if the model is unknown).
+std::vector<ExtractedLight> ModelLoader::GetExtractedLights(const std::string& modelName) const {
+  std::vector<ExtractedLight> lights;
+
+  // First, try to get punctual lights from the extracted lights storage
+  auto lightIt = extractedLights.find(modelName);
+  if (lightIt != extractedLights.end()) {
+    lights = lightIt->second;
+    std::cout << "Found " << lights.size() << " punctual lights for model: " << modelName << std::endl;
+  }
+
+  // Normalizes `v`, or returns `fallback` when `v` is zero-length, NaN or infinite.
+  // Plain glm::normalize would propagate NaN into the light direction in those cases.
+  auto safeNormalize = [](const glm::vec3& v, const glm::vec3& fallback) -> glm::vec3 {
+    const float len = glm::length(v);
+    if (len > 1e-6f && len <= std::numeric_limits<float>::max()) {
+      return glm::normalize(v);
+    }
+    return fallback;
+  };
+
+  // Now extract emissive materials as light sources
+  auto materialMeshIt = materialMeshes.find(modelName);
+  if (materialMeshIt != materialMeshes.end()) {
+    for (const auto& materialMesh : materialMeshIt->second) {
+      // Get the material for this mesh
+      auto materialIt = materials.find(materialMesh.materialName);
+      if (materialIt == materials.end()) {
+        continue;
+      }
+      const Material* material = materialIt->second.get();
+
+      // Skip materials whose emissive contribution is negligible (below the 0.1 threshold)
+      float emissiveIntensity = glm::length(material->emissive) * material->emissiveStrength;
+      if (emissiveIntensity < 0.1f) {
+        continue;
+      }
+
+      // Calculate the center position and an approximate size of the emissive surface
+      glm::vec3 center(0.0f);
+      glm::vec3 minB(std::numeric_limits<float>::max());
+      glm::vec3 maxB(-std::numeric_limits<float>::max());
+      if (!materialMesh.vertices.empty()) {
+        for (const auto& vertex : materialMesh.vertices) {
+          center += vertex.position;
+          minB = glm::min(minB, vertex.position);
+          maxB = glm::max(maxB, vertex.position);
+        }
+        center /= static_cast<float>(materialMesh.vertices.size());
+      }
+      // With no vertices maxB - minB is negative; clamping to zero gives an empty extent
+      glm::vec3 extent = glm::max(maxB - minB, glm::vec3(0.0f));
+      float diag = glm::length(extent);
+      float baseRange = std::max(0.5f * diag, 0.25f); // base range in local units
+
+      // Calculate a reasonable direction (average normal of the surface).
+      // Normals of closed or symmetric meshes can cancel out to ~zero, so fall back to the
+      // default downward direction instead of normalizing a zero vector.
+      const glm::vec3 defaultDirection(0.0f, -1.0f, 0.0f);
+      glm::vec3 avgNormal = defaultDirection;
+      if (!materialMesh.vertices.empty()) {
+        glm::vec3 normalSum = std::accumulate(
+          materialMesh.vertices.begin(),
+          materialMesh.vertices.end(),
+          glm::vec3(0.0f),
+          [](const glm::vec3& acc, const Vertex& vertex) { return acc + vertex.normal; }
+        );
+        avgNormal = safeNormalize(normalSum / static_cast<float>(materialMesh.vertices.size()), defaultDirection);
+      }
+
+      // Separate chroma from intensity to avoid double-powering color and intensity
+      const glm::vec3 chroma = material->emissive;
+      const float chromaMag = glm::length(chroma);
+      const glm::vec3 lightColor = (chromaMag > 1e-6f) ? (chroma / chromaMag) : chroma;
+      const float strength = hasEmissiveStrengthExtension ? material->emissiveStrength : 1.0f;
+
+      // Builds one emissive light. `scale` is the instance size factor (1.0 for identity).
+      auto makeEmissiveLight = [&](const glm::vec3& position, const glm::vec3& direction, float scale) {
+        ExtractedLight emissiveLight;
+        emissiveLight.type = ExtractedLight::Type::Emissive;
+        emissiveLight.position = position;
+        emissiveLight.color = lightColor;
+        // Use a surface-area proxy from local bounds (diag^2) scaled by instance size, not range^2
+        float areaProxy = std::max(diag * diag * scale, 0.01f);
+        float intensityRaw = strength * chromaMag * areaProxy * 0.08f; // conservative scalar
+        // Clamp to a reasonable band to avoid blowing out exposure
+        emissiveLight.intensity = glm::clamp(intensityRaw, 0.25f, 50.0f);
+        // Slightly conservative halo; avoid massive ranges that wash out the scene
+        emissiveLight.range = baseRange * scale * 1.25f;
+        emissiveLight.sourceMaterial = material->GetName();
+        emissiveLight.direction = direction;
+        return emissiveLight;
+      };
+
+      // Create emissive light(s) transformed by each instance's model matrix
+      if (!materialMesh.instances.empty()) {
+        for (const auto& inst : materialMesh.instances) {
+          glm::mat4 M = inst.getModelMatrix();
+          glm::vec3 worldCenter = glm::vec3(M * glm::vec4(center, 1.0f));
+          // Inverse-transpose keeps normals perpendicular under non-uniform scale. A singular
+          // matrix produces NaN/inf here, in which case the untransformed normal is used.
+          glm::mat3 normalMat = glm::transpose(glm::inverse(glm::mat3(M)));
+          glm::vec3 worldNormal = safeNormalize(normalMat * avgNormal, avgNormal);
+
+          // Estimate a uniform scale factor from the instance transform (largest basis length)
+          float sx = glm::length(glm::vec3(M[0]));
+          float sy = glm::length(glm::vec3(M[1]));
+          float sz = glm::length(glm::vec3(M[2]));
+          float sMax = std::max(sx, std::max(sy, sz));
+
+          lights.push_back(makeEmissiveLight(worldCenter, worldNormal, std::max(1.0f, sMax)));
+
+          std::cout << "Created emissive light from material '" << material->GetName()
+              << "' at world position (" << worldCenter.x << ", " << worldCenter.y << ", " << worldCenter.z
+              << ") with intensity " << emissiveIntensity << std::endl;
+        }
+      } else {
+        // No explicit instances; use identity transform
+        lights.push_back(makeEmissiveLight(center, avgNormal, 1.0f));
+
+        std::cout << "Created emissive light from material '" << material->GetName()
+            << "' at position (" << center.x << ", " << center.y << ", " << center.z
+            << ") with intensity " << emissiveIntensity << std::endl;
+      }
+    }
+  }
+
+  std::cout << "Total lights extracted for model '" << modelName << "': " << lights.size()
+      << " (including emissive-derived lights)" << std::endl;
+
+  return lights;
+}
+
+// Looks up the material meshes stored for `modelName`.
+// Returns a reference to the stored vector, or to a shared empty vector when the model is unknown.
+const std::vector<MaterialMesh>& ModelLoader::GetMaterialMeshes(const std::string& modelName) const {
+  if (const auto entry = materialMeshes.find(modelName); entry != materialMeshes.end()) {
+    return entry->second;
+  }
+  // A function-local static outlives the call, so the returned reference stays valid.
+  static const std::vector<MaterialMesh> kNoMeshes;
+  return kNoMeshes;
+}
+
+// Finds a material by name in `materials`.
+// Returns a non-owning pointer to it, or nullptr when no material has that name.
+const Material* ModelLoader::GetMaterial(const std::string& materialName) const {
+  const auto found = materials.find(materialName);
+  return (found == materials.end()) ? nullptr : found->second.get();
+}
+
+// Returns the entry of `materialsByIndex` at `materialIndex`, or nullptr when the index is out of range.
+const Material* ModelLoader::GetMaterialByIndex(uint32_t materialIndex) const {
+  // Guard clause: reject out-of-range indices before touching the vector
+  if (materialIndex >= materialsByIndex.size()) {
+    return nullptr;
+  }
+  return materialsByIndex[materialIndex];
+}
+
+// Returns the animations of the model registered under `modelName`.
+// Falls back to a shared empty vector when the model is unknown or its stored pointer is null.
+const std::vector<Animation>& ModelLoader::GetAnimations(const std::string& modelName) const {
+  if (const auto entry = models.find(modelName); entry != models.end() && entry->second) {
+    return entry->second->GetAnimations();
+  }
+  // A function-local static outlives the call, so the returned reference stays valid.
+  static const std::vector<Animation> kNoAnimations;
+  return kNoAnimations;
+}
+
+// Parses the KHR_lights_punctual lights of a glTF model and stores them in
+// `extractedLights[modelName]` (replacing any previous entry).
+//
+// Steps:
+//   1. Read the light definitions (type, color, intensity, range, spot cone angles) from the
+//      model-level extension.
+//   2. Compute a world transform for every node reachable from the default scene (or from all
+//      root nodes when the file defines no scenes).
+//   3. For every node that references a light, write the node's world position into that light
+//      and, for directional/spot lights, its world-space -Z axis as the direction.
+//
+// @param gltfModel Parsed glTF document.
+// @param modelName Key under which the resulting lights are stored.
+// @return true when NO lights were extracted, false when at least one light was found.
+//         NOTE: the visible caller treats a true result as the "warn" case, so this
+//         polarity is kept as-is.
+bool ModelLoader::ExtractPunctualLights(const tinygltf::Model& gltfModel, const std::string& modelName) {
+  std::cout << "Extracting punctual lights from model: " << modelName << std::endl;
+
+  std::vector<ExtractedLight> lights;
+
+  // Check if the model has the KHR_lights_punctual extension
+  auto extensionIt = gltfModel.extensions.find("KHR_lights_punctual");
+  if (extensionIt != gltfModel.extensions.end()) {
+    std::cout << "  Found KHR_lights_punctual extension" << std::endl;
+
+    // Parse the punctual lights from the extension
+    const tinygltf::Value& extension = extensionIt->second;
+    if (extension.Has("lights") && extension.Get("lights").IsArray()) {
+      const tinygltf::Value::Array& lightsArray = extension.Get("lights").Get<tinygltf::Value::Array>();
+
+      for (size_t i = 0; i < lightsArray.size(); ++i) {
+        const tinygltf::Value& lightValue = lightsArray[i];
+        if (!lightValue.IsObject())
+          continue;
+
+        ExtractedLight light;
+
+        // Parse light type (unknown type strings leave the ExtractedLight default untouched)
+        if (lightValue.Has("type") && lightValue.Get("type").IsString()) {
+          std::string type = lightValue.Get("type").Get<std::string>();
+          if (type == "directional") {
+            light.type = ExtractedLight::Type::Directional;
+          } else if (type == "point") {
+            light.type = ExtractedLight::Type::Point;
+          } else if (type == "spot") {
+            light.type = ExtractedLight::Type::Spot;
+          }
+        }
+
+        // Parse light color (non-numeric components fall back to 1.0)
+        if (lightValue.Has("color") && lightValue.Get("color").IsArray()) {
+          const tinygltf::Value::Array& colorArray = lightValue.Get("color").Get<tinygltf::Value::Array>();
+          if (colorArray.size() >= 3) {
+            light.color = glm::vec3(
+              colorArray[0].IsNumber() ? static_cast<float>(colorArray[0].Get<double>()) : 1.0f,
+              colorArray[1].IsNumber() ? static_cast<float>(colorArray[1].Get<double>()) : 1.0f,
+              colorArray[2].IsNumber() ? static_cast<float>(colorArray[2].Get<double>()) : 1.0f);
+          }
+        }
+
+        // Parse light intensity
+        if (lightValue.Has("intensity") && lightValue.Get("intensity").IsNumber()) {
+          light.intensity = static_cast<float>(lightValue.Get("intensity").Get<double>()) * LIGHT_SCALE_FACTOR;
+        }
+
+        // Parse light range (for point and spotlights)
+        if (lightValue.Has("range") && lightValue.Get("range").IsNumber()) {
+          light.range = static_cast<float>(lightValue.Get("range").Get<double>());
+        }
+
+        // Parse spotlight-specific parameters
+        if (light.type == ExtractedLight::Type::Spot && lightValue.Has("spot")) {
+          const tinygltf::Value& spotValue = lightValue.Get("spot");
+          if (spotValue.Has("innerConeAngle") && spotValue.Get("innerConeAngle").IsNumber()) {
+            light.innerConeAngle = static_cast<float>(spotValue.Get("innerConeAngle").Get<double>());
+          }
+          if (spotValue.Has("outerConeAngle") && spotValue.Get("outerConeAngle").IsNumber()) {
+            light.outerConeAngle = static_cast<float>(spotValue.Get("outerConeAngle").Get<double>());
+          }
+        }
+
+        lights.push_back(light);
+        std::cout << "    Parsed punctual light " << i << ": type=" << static_cast<int>(light.type)
+            << ", intensity=" << light.intensity << std::endl;
+      }
+    }
+  } else {
+    std::cout << "  No KHR_lights_punctual extension found" << std::endl;
+  }
+
+  // Compute world transforms for all nodes in the default scene
+  const int nodeCount = static_cast<int>(gltfModel.nodes.size());
+  std::vector<glm::mat4> nodeWorldTransforms(gltfModel.nodes.size(), glm::mat4(1.0f));
+
+  auto calcLocal = [](const tinygltf::Node& n) -> glm::mat4 {
+    // If matrix is provided, use it
+    if (n.matrix.size() == 16) {
+      // glTF stores node.matrix in column-major order, and glm indexes as m[column][row],
+      // so the element at (column c, row r) lives at flat index c * 4 + r.
+      glm::mat4 m(1.0f);
+      for (int c = 0; c < 4; ++c) {
+        for (int r = 0; r < 4; ++r) {
+          m[c][r] = static_cast<float>(n.matrix[c * 4 + r]);
+        }
+      }
+      return m;
+    }
+    // Otherwise compose TRS
+    glm::mat4 T(1.0f), R(1.0f), S(1.0f);
+    if (n.translation.size() == 3) {
+      T = glm::translate(glm::mat4(1.0f),
+                         glm::vec3(
+                           static_cast<float>(n.translation[0]),
+                           static_cast<float>(n.translation[1]),
+                           static_cast<float>(n.translation[2])));
+    }
+    if (n.rotation.size() == 4) {
+      // glTF quaternions are (x, y, z, w); glm::quat's constructor takes (w, x, y, z)
+      glm::quat q(
+        static_cast<float>(n.rotation[3]),
+        static_cast<float>(n.rotation[0]),
+        static_cast<float>(n.rotation[1]),
+        static_cast<float>(n.rotation[2]));
+      R = glm::mat4_cast(q);
+    }
+    if (n.scale.size() == 3) {
+      S = glm::scale(glm::mat4(1.0f),
+                     glm::vec3(
+                       static_cast<float>(n.scale[0]),
+                       static_cast<float>(n.scale[1]),
+                       static_cast<float>(n.scale[2])));
+    }
+    return T * R * S;
+  };
+
+  // Tracks visited nodes so a malformed file with a cyclic hierarchy cannot recurse forever.
+  // In a valid glTF every node has at most one parent, so this never skips a legitimate visit.
+  std::vector<bool> visited(gltfModel.nodes.size(), false);
+
+  std::function < void(int, const glm::mat4 &) > traverseNode = [&](int nodeIndex, const glm::mat4& parent) {
+    if (nodeIndex < 0 || nodeIndex >= nodeCount)
+      return;
+    if (visited[static_cast<size_t>(nodeIndex)])
+      return;
+    visited[static_cast<size_t>(nodeIndex)] = true;
+    const tinygltf::Node& n = gltfModel.nodes[nodeIndex];
+    glm::mat4 local = calcLocal(n);
+    glm::mat4 world = parent * local;
+    nodeWorldTransforms[nodeIndex] = world;
+    for (int child : n.children) {
+      traverseNode(child, world);
+    }
+  };
+
+  if (!gltfModel.scenes.empty()) {
+    int sceneIndex = gltfModel.defaultScene >= 0 ? gltfModel.defaultScene : 0;
+    if (sceneIndex < static_cast<int>(gltfModel.scenes.size())) {
+      const tinygltf::Scene& scene = gltfModel.scenes[sceneIndex];
+      for (int root : scene.nodes) {
+        traverseNode(root, glm::mat4(1.0f));
+      }
+    }
+  } else {
+    // Fallback: no scenes defined. Start only from nodes that no other node lists as a child;
+    // starting from every node would overwrite a child's world transform with its local one.
+    std::vector<bool> isChild(gltfModel.nodes.size(), false);
+    for (const auto& candidateParent : gltfModel.nodes) {
+      for (int child : candidateParent.children) {
+        if (child >= 0 && child < nodeCount) {
+          isChild[static_cast<size_t>(child)] = true;
+        }
+      }
+    }
+    for (int i = 0; i < nodeCount; ++i) {
+      if (!isChild[static_cast<size_t>(i)]) {
+        traverseNode(i, glm::mat4(1.0f));
+      }
+    }
+  }
+
+  // Now assign positions and directions using world transforms
+  for (size_t nodeIndex = 0; nodeIndex < gltfModel.nodes.size(); ++nodeIndex) {
+    const auto& node = gltfModel.nodes[nodeIndex];
+    if (node.extensions.contains("KHR_lights_punctual")) {
+      const tinygltf::Value& nodeExtension = node.extensions.at("KHR_lights_punctual");
+      if (nodeExtension.Has("light") && nodeExtension.Get("light").IsInt()) {
+        int lightIndex = nodeExtension.Get("light").Get<int>();
+        if (lightIndex >= 0 && lightIndex < static_cast<int>(lights.size())) {
+          const glm::mat4& W = nodeWorldTransforms[nodeIndex];
+          // Position from world transform origin
+          glm::vec3 pos = glm::vec3(W * glm::vec4(0, 0, 0, 1));
+          lights[lightIndex].position = pos;
+
+          // Direction for directional/spot: transform -Z
+          if (lights[lightIndex].type == ExtractedLight::Type::Directional ||
+            lights[lightIndex].type == ExtractedLight::Type::Spot) {
+            glm::vec3 dir = glm::mat3(W) * glm::vec3(0.0f, 0.0f, -1.0f);
+            float dirLen = glm::length(dir);
+            // A zero-scale node collapses the axis; keep the light's existing direction
+            // rather than storing NaN from normalizing a zero vector.
+            if (dirLen > 1e-6f) {
+              lights[lightIndex].direction = dir / dirLen;
+            }
+          }
+
+          std::cout << "    Light " << lightIndex << " positioned at ("
+              << lights[lightIndex].position.x << ", "
+              << lights[lightIndex].position.y << ", "
+              << lights[lightIndex].position.z << ")" << std::endl;
+        }
+      }
+    }
+  }
+
+  // Store the extracted lights
+  extractedLights[modelName] = lights;
+
+  std::cout << "  Extracted " << lights.size() << " total lights from model" << std::endl;
+  // true means "nothing extracted"; the caller uses this to emit its warning
+  return lights.empty();
+}
\ No newline at end of file
diff --git a/attachments/advanced_gltf/model_loader.h b/attachments/advanced_gltf/model_loader.h
new file mode 100644
index 000000000..6eedf7093
--- /dev/null
+++ b/attachments/advanced_gltf/model_loader.h
@@ -0,0 +1,515 @@
+/* Copyright (c) 2025 Holochip Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "mesh_component.h"
+#include <glm/glm.hpp>
+#include <glm/gtc/quaternion.hpp>
+#include <memory>
+#include <set>
+#include <stdexcept>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+class Renderer;
+class Mesh;
+class Material;
+
+// Forward declaration for tinygltf
+namespace tinygltf {
+class Model;
+}
+
+class Material { // Named set of PBR parameters and texture paths; public fields are plain data
+  public:
+    explicit Material(std::string name) : name(std::move(name)) {
+    }
+    ~Material() = default;
+
+    [[nodiscard]] const std::string& GetName() const { // The name is fixed at construction (no setter)
+      return name;
+    }
+
+    // PBR properties (the Metallic-Roughness workflow is the default)
+    glm::vec3 albedo = glm::vec3(1.0f); // Base color RGB; the alpha channel is kept in `alpha` below
+    float metallic = 0.0f;
+    float roughness = 1.0f;
+    float ao = 1.0f;
+    glm::vec3 emissive = glm::vec3(0.0f);
+    float ior = 1.5f; // Index of refraction
+    float emissiveStrength = 1.0f; // KHR_materials_emissive_strength extension
+    float alpha = 1.0f; // Base color alpha (from MR baseColorFactor or SpecGloss diffuseFactor)
+    float transmissionFactor = 0.0f; // KHR_materials_transmission: 0=opaque, 1=fully transmissive
+
+    // Specular-Glossiness workflow (KHR_materials_pbrSpecularGlossiness); off unless the flag is set
+    bool useSpecularGlossiness = false;
+    glm::vec3 specularFactor = glm::vec3(0.04f);
+    float glossinessFactor = 1.0f;
+    std::string specGlossTexturePath; // Stored separately; also mirrored to metallicRoughnessTexturePath for binding 2
+
+    // Alpha handling (glTF alphaMode and cutoff)
+    std::string alphaMode = "OPAQUE"; // "OPAQUE", "MASK", or "BLEND"
+    float alphaCutoff = 0.5f; // Used when alphaMode == MASK
+
+    // Texture paths for PBR materials (default-constructed, i.e. empty, until assigned)
+    std::string albedoTexturePath;
+    std::string normalTexturePath;
+    std::string metallicRoughnessTexturePath;
+    std::string occlusionTexturePath;
+    std::string emissiveTexturePath;
+
+    // Hint used by the renderer to select a specialized glass rendering path
+    // for architectural glass (windows, lamp glass, etc.). Set by ModelLoader
+    // based on material name/properties; defaults to false so non-glass
+    // materials continue to use the generic PBR path.
+    bool isGlass = false;
+
+    // Hint used by the renderer to preferentially render inner liquid volumes
+    // before outer glass shells (e.g., beer/wine in bar glasses). Set by
+    // ModelLoader based on material name/properties; defaults to false.
+    bool isLiquid = false;
+
+  private:
+    std::string name;
+};
+
+/**
+ * @brief A light source extracted from a glTF file (punctual light or emissive-material light).
+ */
+struct ExtractedLight {
+  enum class Type {
+    Directional,
+    Point,
+    Spot,
+    Emissive // Light derived from emissive material
+  };
+
+  Type type = Type::Point;
+  glm::vec3 position = glm::vec3(0.0f); // Punctual lights: origin of the light node's world transform
+  glm::vec3 direction = glm::vec3(0.0f, -1.0f, 0.0f); // Directional/spot: the node's world-rotated -Z axis, normalized
+  glm::vec3 color = glm::vec3(1.0f);
+  float intensity = 1.0f;
+  float range = 100.0f; // For point/spotlights
+  float innerConeAngle = 0.0f; // For spotlights
+  float outerConeAngle = 0.785398f; // For spotlights (45 degrees, expressed in radians)
+  std::string sourceMaterial; // Name of source material (for emissive lights)
+};
+
+/**
+ * @brief Camera parameters and placement extracted from a glTF file.
+ */
+struct CameraData {
+  std::string name;
+  bool isPerspective = true; // Selects which of the two property groups below applies
+
+  // Perspective camera properties
+  float fov = 0.785398f; // 45 degrees in radians
+  float aspectRatio = 1.0f;
+
+  // Orthographic camera properties
+  float orthographicSize = 1.0f;
+
+  // Properties shared by both projection types
+  float nearPlane = 0.1f;
+  float farPlane = 1000.0f;
+
+  // Transform properties
+  glm::vec3 position = glm::vec3(0.0f);
+  glm::quat rotation = glm::quat(1.0f, 0.0f, 0.0f, 0.0f); // Identity quaternion (constructor order is w, x, y, z)
+};
+
+/**
+ * @brief Interpolation mode stored per AnimationSampler (the sampler default is Linear).
+ */
+enum class AnimationInterpolation {
+  Linear,
+  Step,
+  CubicSpline
+};
+
+/**
+ * @brief Node property that an AnimationChannel drives (the channel default is Translation).
+ */
+enum class AnimationPath {
+  Translation,
+  Rotation,
+  Scale,
+  Weights // For morph targets (not yet implemented)
+};
+
+/**
+ * @brief Keyframe data for one animated property.
+ * inputTimes holds the keyframe timestamps and outputValues the matching values.
+ */
+struct AnimationSampler {
+  std::vector<float> inputTimes; // Keyframe timestamps in seconds
+  std::vector<float> outputValues; // Keyframe values (vec3 for T/S, vec4 for R)
+  AnimationInterpolation interpolation = AnimationInterpolation::Linear;
+
+  [[nodiscard]] float GetDuration() const { // Last timestamp, or 0 when there are no keyframes
+    if (inputTimes.empty()) return 0.0f;
+    return inputTimes.back();
+  }
+};
+
+/**
+ * @brief Binds one sampler of an Animation to one property of one node.
+ */
+struct AnimationChannel {
+  int samplerIndex = -1; // Index into Animation::samplers (-1 until assigned)
+  int targetNode = -1; // glTF node index being animated (-1 until assigned)
+  AnimationPath path = AnimationPath::Translation;
+};
+
+/**
+ * @brief A named animation clip: a set of samplers plus the channels that apply them.
+ */
+struct Animation {
+  std::string name;
+  std::vector<AnimationSampler> samplers;
+  std::vector<AnimationChannel> channels;
+
+  // Clip length: the largest sampler duration, or 0 when there are no samplers
+  [[nodiscard]] float GetDuration() const {
+    float longest = 0.0f;
+    for (const AnimationSampler& s : samplers) {
+      if (const float d = s.GetDuration(); longest < d) longest = d;
+    }
+    return longest;
+  }
+};
+
+/**
+ * @brief Geometry of one model that shares a single material, plus optional per-instance data.
+ */
+struct MaterialMesh {
+  int materialIndex = -1;     // per-glTF-file material index (used to index gltfModel.materials); -1 = unset
+  int globalMaterialIndex = -1; // globally-unique index across all loaded models; used for the
+                                // entity name and the ray-query material slot so materials from
+                                // different models (e.g. bistro vs Fox) don't collide at the same slot
+  std::string materialName;
+  std::vector<Vertex> vertices;
+  std::vector<uint32_t> indices;
+
+  // Track which glTF mesh index this MaterialMesh came from (for animation targeting)
+  int sourceMeshIndex = -1;
+
+  // All PBR texture paths for this material
+  std::string texturePath; // Primary texture path (baseColor) - kept for backward compatibility
+  std::string baseColorTexturePath; // Base color (albedo) texture
+  std::string normalTexturePath; // Normal map texture
+  std::string metallicRoughnessTexturePath; // Metallic-roughness texture
+  std::string occlusionTexturePath; // Ambient occlusion texture
+  std::string emissiveTexturePath; // Emissive texture
+
+  // Instancing support
+  std::vector<InstanceData> instances; // Instance data for instanced rendering
+  bool isInstanced = false; // True once more than one instance has been added (see AddInstance)
+
+  /**
+   * @brief Append an instance with the given transform matrix and refresh isInstanced.
+   * @param transform The transform matrix for this instance.
+   * @param matIndex Material index for the instance; 0 (also the default) means "use materialIndex".
+   */
+  void AddInstance(const glm::mat4& transform, uint32_t matIndex = 0) {
+    if (matIndex == 0 && materialIndex > 0) // an unset (-1) index must not wrap into a huge slot number
+      matIndex = static_cast<uint32_t>(materialIndex);
+    instances.emplace_back(transform, matIndex);
+    isInstanced = instances.size() > 1;
+  }
+
+  /**
+   * @brief Get the number of instances added so far.
+   * @return instances.size(); note that a count of exactly 1 is still reported as not instanced.
+   */
+  [[nodiscard]] size_t GetInstanceCount() const {
+    return instances.size();
+  }
+
+  /**
+   * @brief Check if this mesh uses instancing.
+   * @return True if instanced (more than 1 instance), false otherwise.
+   */
+  [[nodiscard]] bool IsInstanced() const {
+    return isInstanced;
+  }
+};
+
+/**
+ * @brief A loaded 3D model: combined geometry plus cameras, animations and node-hierarchy data.
+ */
+class Model {
+  public:
+    explicit Model(std::string name) : name(std::move(name)) {
+    }
+    ~Model() = default;
+
+    [[nodiscard]] const std::string& GetName() const {
+      return name;
+    }
+
+    // Mesh data access methods (read-only views of the privately stored geometry)
+    [[nodiscard]] const std::vector<Vertex>& GetVertices() const {
+      return vertices;
+    }
+    [[nodiscard]] const std::vector<uint32_t>& GetIndices() const {
+      return indices;
+    }
+
+    // Methods to set mesh data (used by parser); each one copies its argument
+    void SetVertices(const std::vector<Vertex>& newVertices) {
+      vertices = newVertices;
+    }
+    void SetIndices(const std::vector<uint32_t>& newIndices) {
+      indices = newIndices;
+    }
+
+    // Camera data access methods
+    [[nodiscard]] const std::vector<CameraData>& GetCameras() const {
+      return cameras;
+    }
+
+    // Animation data access methods
+    [[nodiscard]] const std::vector<Animation>& GetAnimations() const {
+      return animations;
+    }
+    void SetAnimations(const std::vector<Animation>& anims) {
+      animations = anims;
+    }
+
+    // Animated node transforms: maps glTF node index to its base world transform
+    // Used by AnimationComponent to find entities for animation targets
+    [[nodiscard]] const std::unordered_map<int, glm::mat4>& GetAnimatedNodeTransforms() const {
+      return animatedNodeTransforms;
+    }
+    void SetAnimatedNodeTransforms(const std::unordered_map<int, glm::mat4>& transforms) {
+      animatedNodeTransforms = transforms;
+    }
+
+    // Node hierarchy and local transforms for skeletal animation
+    [[nodiscard]] const std::unordered_map<int, std::vector<int>>& GetNodeChildren() const {
+      return nodeChildren;
+    }
+    void SetNodeChildren(const std::unordered_map<int, std::vector<int>>& children) {
+      nodeChildren = children;
+    }
+
+    [[nodiscard]] const std::unordered_map<int, glm::mat4>& GetNodeLocalTransforms() const {
+      return nodeLocalTransforms;
+    }
+    void SetNodeLocalTransforms(const std::unordered_map<int, glm::mat4>& transforms) {
+      nodeLocalTransforms = transforms;
+    }
+
+    [[nodiscard]] const std::vector<int>& GetRootNodes() const {
+      return rootNodes;
+    }
+    void SetRootNodes(const std::vector<int>& roots) {
+      rootNodes = roots;
+    }
+
+    // Animated node to mesh mapping: maps glTF node index to mesh index
+    // Used to link animated nodes to their geometry entities
+    [[nodiscard]] const std::unordered_map<int, int>& GetAnimatedNodeMeshes() const {
+      return animatedNodeMeshes;
+    }
+    void SetAnimatedNodeMeshes(const std::unordered_map<int, int>& meshes) {
+      animatedNodeMeshes = meshes;
+    }
+    // NOTE: the members below are public, so they can also be accessed directly (cameras has no setter).
+    std::vector<CameraData> cameras;
+    std::vector<Animation> animations;
+    std::unordered_map<int, glm::mat4> animatedNodeTransforms;
+    std::unordered_map<int, std::vector<int>> nodeChildren;
+    std::unordered_map<int, glm::mat4> nodeLocalTransforms;
+    std::vector<int> rootNodes;
+    std::unordered_map<int, int> animatedNodeMeshes; // nodeIndex -> meshIndex
+
+  private:
+    std::string name;
+    std::vector<Vertex> vertices;
+    std::vector<uint32_t> indices;
+};
+
+/**
+ * @brief Loads glTF files and owns the resulting models, materials, lights and per-material meshes.
+ */
+class ModelLoader {
+  public:
+    /**
+     * @brief Default constructor; leaves `renderer` as nullptr (Initialize is not called).
+     */
+    ModelLoader() = default;
+    // Calls Initialize(_renderer) and throws std::runtime_error if it reports failure
+    explicit ModelLoader(Renderer* _renderer) {
+      if (!Initialize(_renderer)) {
+        throw std::runtime_error("ModelLoader: initialization failed");
+      }
+    }
+
+    /**
+     * @brief Destructor for proper cleanup (defined out of line).
+     */
+    ~ModelLoader();
+
+    /**
+     * @brief Load a model from a GLTF file.
+     * @param filename The path to the GLTF file.
+     * @return Pointer to the loaded model, or nullptr if loading failed.
+     */
+    Model* LoadGLTF(const std::string& filename);
+
+    /**
+     * @brief Get a model by name.
+     * @param name The name of the model.
+     * @return Pointer to the model, or nullptr if not found.
+     */
+    Model* GetModel(const std::string& name);
+
+    /**
+     * @brief Get extracted lights from a loaded model.
+     * @param modelName The name of the model.
+     * @return Copy of the lights recorded for the model (returned by value).
+     */
+    std::vector<ExtractedLight> GetExtractedLights(const std::string& modelName) const;
+
+    /**
+     * @brief Get material-specific meshes from a loaded model.
+     * @param modelName The name of the model.
+     * @return Reference to the model's material meshes (result for an unknown name: see the .cpp, TODO confirm).
+     */
+    const std::vector<MaterialMesh>& GetMaterialMeshes(const std::string& modelName) const;
+
+    /**
+     * @brief Get a material by name.
+     * @param materialName The name of the material.
+     * @return Pointer to the material, or nullptr if not found.
+     */
+    const Material* GetMaterial(const std::string& materialName) const;
+
+    /**
+     * @brief Get a material by its glTF material index.
+     * @param materialIndex The material index as used by glTF primitives/instances.
+     * @return Pointer to the material, or nullptr if out of range / not available.
+     */
+    const Material* GetMaterialByIndex(uint32_t materialIndex) const;
+
+    /**
+     * @brief Get animations from a loaded model.
+     * @param modelName The name of the model.
+     * @return Reference to the model's animations (result for an unknown name: see the .cpp, TODO confirm).
+     */
+    const std::vector<Animation>& GetAnimations(const std::string& modelName) const;
+
+  private:
+    /**
+     * @brief Initialize the model loader (called by constructor).
+     * @param _renderer Pointer to the renderer.
+     * @return True if initialization was successful, false otherwise.
+     */
+    bool Initialize(Renderer* _renderer);
+
+    /**
+     * @brief Helper function to process materials from GLTF model.
+     * @param gltfModel The loaded GLTF model.
+     * @param baseTexturePath Base path for loading textures.
+     * @param loadedTextures Set to track already-loaded textures.
+     */
+    void ProcessMaterials(const tinygltf::Model& gltfModel,
+                          const std::string& baseTexturePath,
+                          std::set<std::string>& loadedTextures);
+
+    /**
+     * @brief Helper function to process cameras from GLTF model.
+     * @param gltfModel The loaded GLTF model.
+     * @param model The model to add cameras to.
+     */
+    void ProcessCameras(const tinygltf::Model& gltfModel, Model* model);
+
+    /**
+     * @brief Helper function to process animations from GLTF model.
+     * @param gltfModel The loaded GLTF model.
+     * @param model The model to add animations to.
+     */
+    void ProcessAnimations(const tinygltf::Model& gltfModel, Model* model);
+
+    /**
+     * @brief Helper function to process scene hierarchy and meshes from GLTF model.
+     * @param gltfModel The loaded GLTF model.
+     * @param baseTexturePath Base path for loading textures.
+     * @param animatedNodeIndices Set of animated node indices.
+     * @param loadedTextures Set to track already-loaded textures.
+     * @param filename The model filename.
+     * @param model The model to add mesh data to.
+     * @return Pair of combined vertices and indices vectors.
+     */
+    std::pair<std::vector<Vertex>, std::vector<uint32_t>> ProcessMeshes(
+      const tinygltf::Model& gltfModel,
+      const std::string& baseTexturePath,
+      const std::set<int>& animatedNodeIndices,
+      std::set<std::string>& loadedTextures,
+      const std::string& filename,
+      Model* model);
+
+    // Non-owning pointer to the renderer (nullptr until Initialize stores one, TODO confirm)
+    Renderer* renderer = nullptr;
+
+    // Loaded models, owned by the loader and keyed by name
+    std::unordered_map<std::string, std::unique_ptr<Model>> models;
+
+    // Loaded materials, owned by the loader and keyed by name
+    std::unordered_map<std::string, std::unique_ptr<Material>> materials;
+
+    // Running base for assigning globally-unique material indices. Materials accumulate
+    // across model loads (the `materials` map is keyed by name and never cleared per-load),
+    // but each glTF reuses per-file indices starting at 0. To keep ray-query material slots
+    // unique across models we offset each model's indices by the number of materials loaded
+    // before it. m_currentModelMaterialBase is the base for the model currently being parsed.
+    uint32_t m_globalMaterialBase = 0;
+    uint32_t m_currentModelMaterialBase = 0;
+
+    // Mapping from glTF material index -> Material pointer (rebuilt on each model load).
+    std::vector<Material*> materialsByIndex;
+
+    // Extracted lights per model
+    std::unordered_map<std::string, std::vector<ExtractedLight>> extractedLights;
+
+    // Material meshes per model
+    std::unordered_map<std::string, std::vector<MaterialMesh>> materialMeshes;
+
+    bool hasEmissiveStrengthExtension = false;
+
+    float light_scale = 1.0f;
+
+    /**
+     * @brief Parse a GLTF file.
+     * @param filename The path to the GLTF file.
+     * @param model The model to populate.
+     * @return True if parsing was successful, false otherwise.
+     */
+    bool ParseGLTF(const std::string& filename, Model* model);
+
+    /**
+     * @brief Extract lights from GLTF punctual lights extension.
+     * @param gltfModel The loaded GLTF model.
+     * @param modelName The name of the model.
+     * @return NOTE(review): documented as "true on success", but the .cpp appears to end with `return lights.empty();` - confirm.
+     */
+    bool ExtractPunctualLights(const class tinygltf::Model& gltfModel, const std::string& modelName);
+};
\ No newline at end of file
diff --git a/attachments/advanced_gltf/morph_accumulate.slang b/attachments/advanced_gltf/morph_accumulate.slang
new file mode 100644
index 000000000..b8948d0a1
--- /dev/null
+++ b/attachments/advanced_gltf/morph_accumulate.slang
@@ -0,0 +1,143 @@
+/* Copyright (c) 2026 Holochip Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+// Morph target accumulation + skeletal skinning — Chapter 5 (Morph Targets).
+//
+// REQUIREMENTS:
+//   Vulkan device feature: descriptorBindingVariableDescriptorCount (VK_EXT_descriptor_indexing)
+//   Vulkan device feature: runtimeDescriptorArray
+//   The descriptor set layout for set=1 must be created with VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT.
+//
+// Two-pass operation per vertex:
+//   Pass 1: accumulate weighted morph target position deltas onto the base position.
+//   Pass 2: apply skeletal skinning via the blended joint matrix.
+
+struct InputVertex { // One element of base_vertices; 12 floats stored as scalar arrays
+    float p[3];  // Position (read as base_pos in main)
+    float n[3];  // Normal (read as base_nrm in main)
+    float uv[2]; // Texture coordinates, copied to the output unchanged
+    float t[4];  // Tangent; the w component is copied to the output unchanged
+};
+
+struct OutputVertex { // One element of output_vertices; same field order as InputVertex
+    float p[3];  // Morphed (and optionally skinned) position
+    float n[3];  // Normal, skinned only when apply_skinning is set
+    float uv[2]; // Texture coordinates copied from the input vertex
+    float t[4];  // Tangent xyz (skinned when apply_skinning is set) plus the input w
+};
+
+// Set 0: per-mesh buffers
+[[vk::binding(0, 0)]] StructuredBuffer<InputVertex>    base_vertices;
+[[vk::binding(1, 0)]] RWStructuredBuffer<OutputVertex> output_vertices;
+[[vk::binding(2, 0)]] StructuredBuffer<float4x4>       joint_matrices;
+[[vk::binding(3, 0)]] StructuredBuffer<uint4>          joint_indices;
+[[vk::binding(4, 0)]] StructuredBuffer<float4>         joint_weights;
+
+// Set 1: runtime-length array of morph delta buffers (one StructuredBuffer per target).
+// Requires VK_EXT_descriptor_indexing + runtimeDescriptorArray.
+struct MorphDelta {
+    float d[3]; // Position delta of one vertex for one morph target (indexed by vertex in main)
+};
+[[vk::binding(0, 1)]] StructuredBuffer<MorphDelta>         morph_targets[];
+
+struct MorphWeightBlock { // Embedded in PushConstants as pc.morph_weights
+    float    weights[24];    // weights[m] scales the delta of the m-th active target
+    uint32_t active_count;   // Number of leading entries of weights/morph_indices that main iterates
+    uint32_t apply_skinning; // 1 = apply skeletal skinning to the position, 0 = morph only
+    uint32_t pad[2];         // Not read by the shader
+};
+
+struct PushConstants { // NOTE(review): 212 bytes if tightly packed, above Vulkan's 128-byte guaranteed minimum - confirm device limit
+    uint32_t       vertex_count;      // Threads with an index at or beyond this exit early
+    uint32_t       morph_indices[24]; // Which descriptor slot each active target lives in
+    MorphWeightBlock morph_weights;   // Per-target weights, active target count and skinning flag
+};
+[[vk::push_constant]] PushConstants pc;
+
+[shader("compute")]
+[numthreads(64, 1, 1)]
+void main(uint3 dispatch_id : SV_DispatchThreadID)
+{
+    uint vertex_idx = dispatch_id.x;
+    if (vertex_idx >= pc.vertex_count) return; // One thread per vertex; surplus threads do nothing
+
+    InputVertex base = base_vertices[vertex_idx];
+    float3 base_pos = float3(base.p[0], base.p[1], base.p[2]);
+    float3 base_nrm = float3(base.n[0], base.n[1], base.n[2]);
+    float4 base_tan = float4(base.t[0], base.t[1], base.t[2], base.t[3]);
+
+    // --- Pass 1: morph accumulation ---
+    // Only positions are morphed; no normal-delta buffer is bound, so the base normal is kept.
+    float3 morphed_position = base_pos;
+    float3 morphed_normal   = base_nrm;
+
+    // Clamp so a bad active_count can never index past the 24-entry push-constant arrays.
+    uint active_targets = min(pc.morph_weights.active_count, 24u);
+    for (uint m = 0; m < active_targets; ++m) {
+        float weight = pc.morph_weights.weights[m];
+        if (abs(weight) < 1e-5f) continue; // Skip targets with negligible influence
+
+        uint slot = pc.morph_indices[m];
+        MorphDelta delta_raw = morph_targets[NonUniformResourceIndex(slot)][vertex_idx];
+        float3 delta = float3(delta_raw.d[0], delta_raw.d[1], delta_raw.d[2]);
+        morphed_position += weight * delta;
+    }
+
+    // --- Pass 2: skeletal skinning ---
+    // Morph-only meshes (no skin) leave the joint buffers unbound/zero, so joint data is read only
+    // when apply_skinning is set; otherwise position, normal and tangent pass through unchanged.
+    float3 final_position = morphed_position;
+    float3 final_normal   = morphed_normal;
+    float3 final_tangent  = base_tan.xyz;
+    if (pc.morph_weights.apply_skinning != 0u) {
+        uint4 j_idx = joint_indices[vertex_idx];
+        float4 j_w = joint_weights[vertex_idx];
+
+        // Weighted blend of the four joint matrices referenced by this vertex.
+        float4x4 skin_matrix =
+            j_w.x * joint_matrices[j_idx.x] +
+            j_w.y * joint_matrices[j_idx.y] +
+            j_w.z * joint_matrices[j_idx.z] +
+            j_w.w * joint_matrices[j_idx.w];
+        // Upper-left 3x3 of the blend, applied to direction vectors (normal and tangent).
+        float3x3 skin_rot = float3x3(
+            skin_matrix[0].xyz,
+            skin_matrix[1].xyz,
+            skin_matrix[2].xyz);
+        final_position = mul(skin_matrix, float4(morphed_position, 1.0)).xyz;
+        final_normal   = normalize(mul(skin_rot, morphed_normal));
+        final_tangent  = normalize(mul(skin_rot, base_tan.xyz));
+    }
+
+    OutputVertex result; // Not named `out`, which is also a parameter-direction modifier
+    result.p[0] = final_position.x;
+    result.p[1] = final_position.y;
+    result.p[2] = final_position.z;
+
+    result.n[0] = final_normal.x;
+    result.n[1] = final_normal.y;
+    result.n[2] = final_normal.z;
+
+    result.t[0] = final_tangent.x;
+    result.t[1] = final_tangent.y;
+    result.t[2] = final_tangent.z;
+    result.t[3] = base_tan.w; // The tangent's w component is copied through untouched
+
+    result.uv[0] = base.uv[0];
+    result.uv[1] = base.uv[1];
+
+    output_vertices[vertex_idx] = result;
+}
diff --git a/attachments/advanced_gltf/node.h b/attachments/advanced_gltf/node.h
new file mode 100644
index 000000000..0ba18672f
--- /dev/null
+++ b/attachments/advanced_gltf/node.h
@@ -0,0 +1,136 @@
+#pragma once
+#include <cstdint>
+#include <cmath>
+#include <string>
+#include <vector>
+#include <glm/glm.hpp>
+#include <glm/gtc/quaternion.hpp>
+#include <glm/gtx/quaternion.hpp>
+#include <glm/gtx/matrix_decompose.hpp>
+
+// Canonical from appendix_types.adoc
+static constexpr uint32_t INVALID_NODE_INDEX = 0xFFFFFFFF;
+
+enum TransformStatus : uint8_t { // Bit flags combined in Node::status
+    Clean      = 0,      // No flag set
+    LocalDirty = 1 << 0, // Set together with WorldDirty by Node::mark_dirty()
+    WorldDirty = 1 << 1, // Makes SceneGraph::update_transforms() recompute world_matrix
+};
+
+// Physics collider metadata extracted from glTF node extras (Shape::NONE means no collider)
+struct ColliderDef {
+    enum class Shape { CAPSULE, BOX, NONE };
+    Shape     shape           = Shape::NONE;
+    float     radius          = 0.0f; // presumably used by CAPSULE - TODO confirm against the consumer
+    float     half_height     = 0.0f; // presumably used by CAPSULE - TODO confirm against the consumer
+    glm::vec3 box_half_extents = {0, 0, 0}; // presumably used by BOX - TODO confirm against the consumer
+    float     mass            = 1.0f;
+    std::string collision_group; // Stored as text; interpretation is left to the consumer
+    std::string collision_mask;  // Stored as text; interpretation is left to the consumer
+};
+
+// Joint constraint metadata extracted from glTF node extras (Type::NONE means no constraint)
+struct ConstraintDef {
+    enum class Type { NONE, BALL_SOCKET, HINGE };
+    Type      type            = Type::NONE;
+    float     swing_limit_deg = 180.0f;  // Degrees; presumably for BALL_SOCKET - TODO confirm
+    float     twist_limit_deg = 180.0f;  // Degrees; presumably for BALL_SOCKET - TODO confirm
+    float     hinge_min_deg   = -180.0f; // Degrees; presumably for HINGE - TODO confirm
+    float     hinge_max_deg   =  180.0f; // Degrees; presumably for HINGE - TODO confirm
+    glm::vec3 hinge_axis      = {0, 0, 1};
+    std::string parent_bone; // Stored by name; resolution to a node is left to the consumer
+};
+
+struct Node { // One scene-graph node: hierarchy links, local SRT, cached world matrix, physics metadata
+    uint32_t              node_index   = INVALID_NODE_INDEX; // Defined default, like every other member
+    uint32_t              parent_index = INVALID_NODE_INDEX; // INVALID_NODE_INDEX marks a root node
+    std::vector<uint32_t> child_indices;                     // Indices into the owning node array
+    std::string           name;
+
+    // Local SRT components (combined by get_local_matrix)
+    glm::vec3 translation    = {0, 0, 0};
+    glm::quat local_rotation = glm::identity<glm::quat>();
+    glm::vec3 scale          = {1, 1, 1};
+
+    glm::mat4 world_matrix = glm::mat4(1.0f);       // Cached; refreshed by the update functions
+    uint8_t   status       = TransformStatus::Clean; // Bitwise OR of TransformStatus flags
+    bool      is_joint     = false;
+
+    ColliderDef  collider_def;
+    ConstraintDef constraint_def;
+
+    void mark_dirty() { // Call after changing translation/local_rotation/scale
+        status |= TransformStatus::LocalDirty | TransformStatus::WorldDirty;
+    }
+
+    glm::mat4 get_local_matrix() const { // Composes translate * rotate * scale
+        return glm::translate(glm::mat4(1.0f), translation) *
+               glm::mat4_cast(local_rotation) *
+               glm::scale(glm::mat4(1.0f), scale);
+    }
+
+    // Extracts rotation from world_matrix by normalizing each basis column (a zero-length column yields NaN).
+    glm::quat get_world_rotation() const {
+        glm::mat3 rs = glm::mat3(world_matrix);
+        glm::mat3 r;
+        r[0] = glm::normalize(rs[0]);
+        r[1] = glm::normalize(rs[1]);
+        r[2] = glm::normalize(rs[2]);
+        return glm::quat_cast(r);
+    }
+};
+
+class SceneGraph {
+public:
+    std::vector<Node> nodes;
+
+    // Single front-to-back sweep. Produces correct results only if every parent is stored
+    // before its children, because a child reads its parent's already-updated world_matrix.
+    void update_transforms() {
+        for (Node& current : nodes) {
+            if ((current.status & TransformStatus::WorldDirty) == 0)
+                continue;
+            const glm::mat4 local = current.get_local_matrix();
+            current.world_matrix = (current.parent_index == INVALID_NODE_INDEX)
+                ? local
+                : nodes[current.parent_index].world_matrix * local;
+            // Propagate the dirty flag so children later in the array are recomputed too.
+            for (uint32_t child_index : current.child_indices)
+                nodes[child_index].status |= TransformStatus::WorldDirty;
+            current.status = TransformStatus::Clean;
+        }
+    }
+
+    // Depth-first refresh of `index` and all of its descendants, independent of storage
+    // order; used after IK/physics writes. The parent's world_matrix is read as-is.
+    void update_world_matrices_subtree(uint32_t index) {
+        Node& current = nodes[index];
+        const glm::mat4 local = current.get_local_matrix();
+        current.world_matrix = (current.parent_index == INVALID_NODE_INDEX)
+            ? local
+            : nodes[current.parent_index].world_matrix * local;
+        // Each child is flagged dirty and then cleaned again by its own recursive call.
+        for (uint32_t child_index : current.child_indices) {
+            nodes[child_index].status |= TransformStatus::WorldDirty;
+            update_world_matrices_subtree(child_index);
+        }
+        current.status = TransformStatus::Clean;
+    }
+};
+
+// Free-function counterpart of SceneGraph::update_world_matrices_subtree that works on a bare
+// node array (same algorithm, re-implemented rather than forwarded).
+// IK chapters call update_world_matrices_subtree(nodes, idx) — this overload serves that call site.
+inline void update_world_matrices_subtree(std::vector<Node>& nodes, uint32_t index) {
+    Node& current = nodes[index];
+    const glm::mat4 local = current.get_local_matrix();
+    current.world_matrix = (current.parent_index == INVALID_NODE_INDEX)
+        ? local
+        : nodes[current.parent_index].world_matrix * local;
+    // Recurse depth-first; each child is flagged dirty and then cleaned by its own call.
+    for (uint32_t child_index : current.child_indices) {
+        nodes[child_index].status |= TransformStatus::WorldDirty;
+        update_world_matrices_subtree(nodes, child_index);
+    }
+    current.status = TransformStatus::Clean;
+}
diff --git a/attachments/advanced_gltf/pbr_heatmap.slang b/attachments/advanced_gltf/pbr_heatmap.slang
new file mode 100644
index 000000000..4315b2aeb
--- /dev/null
+++ b/attachments/advanced_gltf/pbr_heatmap.slang
@@ -0,0 +1,116 @@
+/* Copyright (c) 2026 Holochip Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+// Skinning heatmap diagnostic shader — Chapter 6 (Debugging & Visual Auditing).
+//
+// Two rendering modes controlled by pipeline selection (not specialization constants,
+// for clarity — see Chapter 6 for the specialization-constant approach):
+//
+//   DOMINANT BONE:   Colors each pixel by which joint has the largest weight.
+//                    Exposed via fragment_dominant_bone().
+//
+//   WEIGHT DISTRIBUTION: Colors by how many joints have non-trivial influence
+//                    (blue = 1 joint, red = 4 joints). Exposed via fragment_weight_distribution().
+//
+// Both fragment shaders share the same vertex shader (vertex_main).
+
+// Per-camera data passed via a uniform/push constant in the real pipeline.
+// Declared here as a push constant for simplicity.
+struct CameraPushConstants {
+    float4x4 view_proj; // Multiplied with the skinned position in vertex_main to produce SV_Position
+};
+[[vk::push_constant]] CameraPushConstants camera;
+
+// Vertex input uses the skinned OutputVertex layout (float arrays: p, n, uv, t); joint data comes from parallel buffers.
+struct Vertex { // Scalar arrays instead of float3/float4 so vector alignment padding cannot change the stride
+    float p[3];  // position
+    float n[3];  // normal (unused by this shader)
+    float uv[2]; // texture coordinates (unused by this shader)
+    float t[4];  // tangent (unused by this shader)
+};
+
+[[vk::binding(0, 0)]] StructuredBuffer<Vertex>    vertices;
+[[vk::binding(1, 0)]] StructuredBuffer<float4x4>  joint_matrices;  // For world position
+[[vk::binding(2, 0)]] StructuredBuffer<uint4>     joint_indices;
+[[vk::binding(3, 0)]] StructuredBuffer<float4>    joint_weights;
+[[vk::binding(4, 0)]] StructuredBuffer<float4>    joint_colors;    // One RGBA per joint
+
+struct VertexOut {
+    float4 position : SV_Position;
+    float4 weights  : TEXCOORD0;               // Interpolated across the triangle
+    nointerpolation uint4 joints : TEXCOORD1;  // Integer varyings cannot be interpolated
+};
+
+[shader("vertex")]
+VertexOut vertex_main(uint vertex_id : SV_VertexID)
+{
+    Vertex v = vertices[vertex_id];
+    uint4 j_idx = joint_indices[vertex_id];
+    float4 j_w = joint_weights[vertex_id];
+
+    float4x4 skin_matrix = // Weighted blend of the four joint matrices referenced by this vertex
+        j_w.x * joint_matrices[j_idx.x] +
+        j_w.y * joint_matrices[j_idx.y] +
+        j_w.z * joint_matrices[j_idx.z] +
+        j_w.w * joint_matrices[j_idx.w];
+
+    VertexOut result; // Not named `out`, which is also a parameter-direction modifier
+    result.position = mul(camera.view_proj, mul(skin_matrix, float4(v.p[0], v.p[1], v.p[2], 1.0)));
+    result.weights  = j_w;   // Forwarded so the fragment stage can color by influence
+    result.joints   = j_idx; // Forwarded so the fragment stage can look up joint colors
+    return result;
+}
+
+// Picks the joint slot whose interpolated weight is largest and returns that joint's entry
+// from the per-joint color table, which makes stray assignments and painting territories visible.
+[shader("fragment")]
+float4 fragment_dominant_bone(VertexOut input) : SV_Target
+{
+    float4 w = input.weights;
+    uint   best_slot   = 0;
+    float  best_weight = w.x;
+
+    // Strict '>' keeps the earliest slot when weights tie.
+    if (w.y > best_weight) { best_weight = w.y; best_slot = 1; }
+    if (w.z > best_weight) { best_weight = w.z; best_slot = 2; }
+    if (w.w > best_weight) { best_weight = w.w; best_slot = 3; }
+    return joint_colors[input.joints[best_slot]];
+}
+
+// Colors by number of joints with weight > 5%: blue (0 or 1 bone) through green to red (4 bones).
+// Helps identify regions where weight painting is unnecessarily noisy.
+[shader("fragment")]
+float4 fragment_weight_distribution(VertexOut input) : SV_Target
+{
+    float complexity = 0.0f; // Count of influences above the 5% threshold
+    float4 w = input.weights;
+    if (w.x > 0.05f) complexity += 1.0f;
+    if (w.y > 0.05f) complexity += 1.0f;
+    if (w.z > 0.05f) complexity += 1.0f;
+    if (w.w > 0.05f) complexity += 1.0f;
+
+    float t = saturate((complexity - 1.0f) / 3.0f); // [0..1]; clamped so a count of 0 cannot extrapolate the ramp
+
+    float3 cool = float3(0, 0, 1);
+    float3 mid  = float3(0, 1, 0);
+    float3 warm = float3(1, 0, 0);
+
+    float3 color; // Two-segment ramp: cool -> mid over [0, 0.5), mid -> warm over [0.5, 1]
+    if (t < 0.5f) color = lerp(cool, mid, t * 2.0f);
+    else          color = lerp(mid, warm, (t - 0.5f) * 2.0f);
+
+    return float4(color, 1.0f);
+}
diff --git a/attachments/advanced_gltf/physics_interface.h b/attachments/advanced_gltf/physics_interface.h
new file mode 100644
index 000000000..ef61873d7
--- /dev/null
+++ b/attachments/advanced_gltf/physics_interface.h
@@ -0,0 +1,79 @@
+/* Copyright (c) 2026 Holochip Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include <glm/glm.hpp>
+#include <glm/gtc/quaternion.hpp>
+#include <cstdint>
+#include <memory>
+
+// Forward declarations — callers that only use PhysicsWorld* need not include Jolt headers.
+namespace JPH {
+    class BodyID;
+    class BodyCreationSettings;
+    enum class EMotionType : std::uint8_t;
+}
+
+// Plain position + orientation pair used to hand poses across the physics boundary.
+struct PhysicsPose {
+    glm::vec3 position;
+    glm::quat orientation;
+
+    // Returns the 4x4 matrix translation * rotation, i.e. `orientation` is
+    // applied first and the result is then moved to `position`.
+    glm::mat4 to_matrix() const {
+        const glm::mat4 translation = glm::translate(glm::mat4(1.0f), position);
+        const glm::mat4 rotation    = glm::mat4_cast(orientation);
+        return translation * rotation;
+    }
+};
+
+// Abstract physics world interface — canonical from appendix_types.adoc.
+// JoltPhysicsWorld (physics_world_jolt.cpp) is the only concrete implementation in this tutorial.
+//
+// Every instance method is pure virtual; the three static functions are declared
+// here and defined elsewhere. Bodies are identified by JPH::BodyID and poses
+// cross the boundary as PhysicsPose (glm types).
+class PhysicsWorld {
+public:
+    virtual ~PhysicsWorld() = default;
+
+    // --- Global lifecycle ---
+    // Process-wide setup and teardown, plus the factory that returns an owning
+    // pointer to a world.
+    // NOTE(review): presumably global_init() must precede create() and
+    // global_shutdown() must follow destruction of every world — confirm in the implementation.
+    static void global_init();
+    static void global_shutdown();
+    static std::unique_ptr<PhysicsWorld> create();
+
+    // --- Body management ---
+    // create_body returns the id of the new body; the remaining calls address an
+    // existing body by that id.
+    virtual JPH::BodyID create_body(const JPH::BodyCreationSettings& settings) = 0;
+    virtual void        destroy_body(JPH::BodyID body_id) = 0;
+    virtual void        set_motion_type(JPH::BodyID body_id, JPH::EMotionType type) = 0;
+    virtual void        set_object_layer(JPH::BodyID body_id, uint16_t layer) = 0;
+    virtual void        activate_body(JPH::BodyID body_id) = 0;
+
+    // --- Kinematic/dynamic sync ---
+    // move_kinematic pushes a target pose into the simulation; the getters read
+    // the simulated pose and linear velocity back out.
+    virtual void        move_kinematic(JPH::BodyID body_id, const PhysicsPose& pose) = 0;
+    virtual PhysicsPose get_body_pose(JPH::BodyID body_id) const = 0;
+    virtual glm::vec3   get_linear_velocity(JPH::BodyID body_id) const = 0;
+    virtual void        set_linear_velocity(JPH::BodyID body_id, const glm::vec3& velocity) = 0;
+
+    // --- Constraints ---
+    // Each call links the two bodies p1 and p2.
+    // swing/twist limits are in radians.
+    virtual void create_ball_socket_constraint(JPH::BodyID p1, JPH::BodyID p2,
+                                               float swing_rad, float twist_rad) = 0;
+    // min_angle_rad/max_angle_rad are in radians; axis is the hinge axis.
+    // NOTE(review): the coordinate space of `axis` is not stated here — confirm in the implementation.
+    virtual void create_hinge_constraint(JPH::BodyID p1, JPH::BodyID p2,
+                                         const glm::vec3& axis,
+                                         float min_angle_rad, float max_angle_rad) = 0;
+
+    // --- Simulation ---
+    // Advances the simulation by delta_seconds (seconds).
+    virtual void step(float delta_seconds) = 0;
+
+    // --- Queries ---
+    // Casts a ray from origin along direction, limited by max_distance. Returns
+    // whether something was hit; the out_* arguments receive the hit details.
+    // NOTE(review): callers compute the hit point as origin + direction * out_distance,
+    // which presumes a unit-length direction and a true distance — confirm in the implementation.
+    virtual bool raycast(const glm::vec3& origin, const glm::vec3& direction, float max_distance,
+                         float& out_distance, glm::vec3& out_normal, JPH::BodyID& out_body_id) const = 0;
+};
diff --git a/attachments/advanced_gltf/physics_system.cpp b/attachments/advanced_gltf/physics_system.cpp
new file mode 100644
index 000000000..525044344
--- /dev/null
+++ b/attachments/advanced_gltf/physics_system.cpp
@@ -0,0 +1,514 @@
+#include <Jolt/Jolt.h>
+#include "physics_system.h"
+#include "entity.h"
+#include "mesh_component.h"
+#include "renderer.h"
+#include "renderer_advanced_types.h"
+#include "transform_component.h"
+#include "physics_interface.h"
+#include <iostream>
+#include <algorithm>
+#include <chrono>
+#include <unordered_map>
+#include <glm/gtc/quaternion.hpp>
+#include <glm/gtx/norm.hpp>
+#include <Jolt/Physics/Body/BodyCreationSettings.h>
+#include <Jolt/Physics/Collision/Shape/BoxShape.h>
+#include <Jolt/Physics/Collision/Shape/SphereShape.h>
+#include <Jolt/Physics/Collision/Shape/CapsuleShape.h>
+#include <Jolt/Physics/Collision/Shape/MeshShape.h>
+#include <Jolt/Physics/Collision/Shape/RotatedTranslatedShape.h>
+
+
+namespace {
+    // Object layers assigned to bodies at creation time: static world geometry
+    // versus everything else.
+    namespace ObjLayers {
+        constexpr uint16_t NON_MOVING = 0;
+        constexpr uint16_t MOVING     = 1;
+    }
+
+    // Registry pairing each PhysicsSystem instance with the world it owns.
+    // All accesses are serialized through g_worldsMutex.
+    std::unordered_map<const PhysicsSystem*, std::unique_ptr<PhysicsWorld>> g_physicsWorlds;
+    std::mutex g_worldsMutex;
+
+    // Looks up the world registered for `system`; yields nullptr when there is none.
+    PhysicsWorld* GetWorld(const PhysicsSystem* system) {
+        std::lock_guard<std::mutex> guard(g_worldsMutex);
+        const auto entry = g_physicsWorlds.find(system);
+        if (entry == g_physicsWorlds.end()) {
+            return nullptr;
+        }
+        return entry->second.get();
+    }
+}
+
+// RigidBody implementation backed by a body inside a PhysicsWorld.
+//
+// The object owns its body: the destructor calls destroy_body() on the world it
+// was created with. Several setters are intentionally empty because the minimal
+// PhysicsWorld wrapper has no matching call; see the per-method comments.
+class ConcreteRigidBody final : public RigidBody {
+public:
+    // entity   - entity this body represents (not owned).
+    // world    - world that holds the body (not owned); used by every forwarding call.
+    // bodyId   - id returned by PhysicsWorld::create_body.
+    // isStatic - true when the body was created as static; such bodies ignore SetKinematic.
+    ConcreteRigidBody(Entity* entity, PhysicsWorld* world, JPH::BodyID bodyId, bool isStatic)
+        : entity(entity), physicsWorld(world), bodyId(bodyId), isStatic(isStatic) {}
+
+    // The destructor destroys the underlying body, so a copy would destroy the
+    // same body twice. Instances are only ever held through owning pointers.
+    ConcreteRigidBody(const ConcreteRigidBody&) = delete;
+    ConcreteRigidBody& operator=(const ConcreteRigidBody&) = delete;
+
+    ~ConcreteRigidBody() override {
+        if (physicsWorld && !bodyId.IsInvalid()) {
+            physicsWorld->destroy_body(bodyId);
+        }
+    }
+
+    // Keeps the current orientation and moves the body to `position`.
+    void SetPosition(const glm::vec3& position) override {
+        PhysicsPose pose = physicsWorld->get_body_pose(bodyId);
+        pose.position = position;
+        physicsWorld->move_kinematic(bodyId, pose);
+    }
+
+    // Keeps the current position and turns the body to `rotation`.
+    void SetRotation(const glm::quat& rotation) override {
+        PhysicsPose pose = physicsWorld->get_body_pose(bodyId);
+        pose.orientation = rotation;
+        physicsWorld->move_kinematic(bodyId, pose);
+    }
+
+    void SetScale(const glm::vec3& scale) override {
+        // Jolt doesn't support dynamic scaling of shapes easily.
+        // Scale should be set during creation.
+    }
+
+    void SetMass(float mass) override {
+        // Mass is handled via EMotionType and mass properties in Jolt.
+        // For simplicity, we toggle Kinematic/Dynamic in PhysicsSystem::Update.
+    }
+
+    void SetRestitution(float restitution) override {
+        // Jolt handles this in BodyCreationSettings.
+    }
+
+    void SetFriction(float friction) override {
+        // Jolt handles this in BodyCreationSettings.
+    }
+
+    void ApplyForce(const glm::vec3& force, const glm::vec3& localPosition) override {
+        // Not implemented in the minimal wrapper, but could be added.
+    }
+
+    void ApplyImpulse(const glm::vec3& impulse, const glm::vec3& localPosition) override {
+        // Not implemented in the minimal wrapper.
+    }
+
+    void SetLinearVelocity(const glm::vec3& velocity) override {
+        physicsWorld->set_linear_velocity(bodyId, velocity);
+    }
+
+    void SetAngularVelocity(const glm::vec3& velocity) override {
+        // Not implemented in the minimal wrapper.
+    }
+
+    [[nodiscard]] glm::vec3 GetPosition() const override {
+        return physicsWorld->get_body_pose(bodyId).position;
+    }
+
+    [[nodiscard]] glm::quat GetRotation() const override {
+        return physicsWorld->get_body_pose(bodyId).orientation;
+    }
+
+    [[nodiscard]] glm::vec3 GetLinearVelocity() const override {
+        return physicsWorld->get_linear_velocity(bodyId);
+    }
+
+    // Always zero: the wrapper exposes no angular-velocity query.
+    [[nodiscard]] glm::vec3 GetAngularVelocity() const override {
+        return glm::vec3(0.0f);
+    }
+
+    // Switches a non-static body between Kinematic and Dynamic motion.
+    // Static bodies are left untouched and keep reporting IsKinematic() == false.
+    void SetKinematic(bool kinematic) override {
+        if (isStatic) return;
+        isKinematic = kinematic;
+        physicsWorld->set_motion_type(bodyId, kinematic ? JPH::EMotionType::Kinematic : JPH::EMotionType::Dynamic);
+    }
+
+    [[nodiscard]] bool IsKinematic() const override {
+        return isKinematic;
+    }
+
+    [[nodiscard]] bool IsStatic() const { return isStatic; }
+
+    JPH::BodyID GetBodyID() const { return bodyId; }
+    Entity* GetEntity() const { return entity; }
+
+private:
+    Entity* entity;
+    PhysicsWorld* physicsWorld;
+    JPH::BodyID bodyId;
+    bool isKinematic = false;
+    bool isStatic = false;
+};
+
+// Tears the system down in dependency order: bodies first (their destructors
+// call back into the world), then the world itself, then the global physics state.
+PhysicsSystem::~PhysicsSystem() {
+    std::lock_guard<std::mutex> lock(rigidBodiesMutex);
+    rigidBodies.clear(); // Destroy all bodies before shutting down the world
+    {
+        std::lock_guard<std::mutex> lockW(g_worldsMutex);
+        g_physicsWorlds.erase(this);
+    }
+    // Only undo global_init() if Initialize() actually ran; a default-constructed
+    // system that was never initialized must not shut the global state down.
+    if (initialized) {
+        PhysicsWorld::global_shutdown();
+    }
+}
+
+// Creates the physics world for this system and registers it in the
+// per-instance world map.
+//
+// Returns true on success (including when the system is already initialized)
+// and false when the world could not be created.
+bool PhysicsSystem::Initialize() {
+    // Re-initializing would replace the world while existing rigid bodies still
+    // point at the old one, so a second call is a no-op.
+    if (initialized) {
+        return true;
+    }
+
+    PhysicsWorld::global_init();
+
+    std::unique_ptr<PhysicsWorld> world = PhysicsWorld::create();
+    if (!world) {
+        // Leave `initialized` false so Update/CreateRigidBody keep seeing "no world".
+        return false;
+    }
+    {
+        std::lock_guard<std::mutex> lock(g_worldsMutex);
+        g_physicsWorlds[this] = std::move(world);
+    }
+    initialized = true;
+    return true;
+}
+
+// Runs one physics frame. In order:
+//   0. stream static colliders in/out based on distance to the camera,
+//   1. create a bounded batch of queued rigid bodies,
+//   2. step the simulation by deltaTime,
+//   3. copy poses between physics bodies and entity transforms.
+// Does nothing when no world is registered for this system.
+//
+// Lock usage: streamingMutex is held while scanning candidates (pendingMutex is
+// taken inside it for promotions); rigidBodiesMutex is taken separately for
+// eviction and again for the final sync, never while streamingMutex is held.
+void PhysicsSystem::Update(std::chrono::milliseconds deltaTime) {
+    PhysicsWorld* physicsWorld = GetWorld(this);
+    if (!physicsWorld) return;
+
+    // 0. Streaming: promote/evict static colliders based on camera distance.
+    // This keeps the active body count bounded (only colliders near the camera),
+    // which avoids the multi-second hitch caused by creating 500+ Jolt bodies
+    // up-front and the associated per-frame sync cost.
+    glm::vec3 camPos; {
+        std::lock_guard<std::mutex> lk(cameraPositionMutex);
+        camPos = cameraPosition;
+    }
+    // Distances are compared squared to avoid a sqrt per candidate.
+    const float promoteR2 = streamingRadius * streamingRadius;
+    const float evictR2 = streamingEvictRadius * streamingEvictRadius;
+    std::vector<Entity*> toEvict;
+    {
+        std::lock_guard<std::mutex> lk(streamingMutex);
+        // Bounded work per frame: only promote up to N candidates / frame so we
+        // never block the physics thread on a sudden flood (e.g. teleport).
+        constexpr size_t MAX_PROMOTIONS_PER_FRAME = 32;
+        size_t promoted = 0;
+        for (auto& sc : streamingCandidates) {
+            float d2 = glm::length2(sc.center - camPos);
+            if (!sc.active) {
+                if (d2 <= promoteR2 && promoted < MAX_PROMOTIONS_PER_FRAME) {
+                    // Promotion only queues the creation; the body itself is built in step 1.
+                    std::lock_guard<std::mutex> lk2(pendingMutex);
+                    pendingCreations.push_back({sc.entity, sc.shape, sc.mass, sc.kinematic, sc.restitution, sc.friction});
+                    sc.active = true;
+                    ++promoted;
+                }
+            } else {
+                if (d2 > evictR2) {
+                    toEvict.push_back(sc.entity);
+                    sc.active = false;
+                }
+            }
+        }
+    }
+    // Evict outside the streaming lock to avoid nested-lock acquisition on rigidBodiesMutex.
+    // NOTE(review): a candidate whose creation is still waiting in pendingCreations
+    // has no body to erase here, yet is marked inactive above; the queued creation
+    // will still run later — confirm whether that can leave a stray/duplicate body.
+    if (!toEvict.empty()) {
+        std::lock_guard<std::mutex> lock(rigidBodiesMutex);
+        for (Entity* e : toEvict) {
+            // Only static bodies are evicted; a non-static body on the same entity is kept.
+            auto it = std::find_if(rigidBodies.begin(), rigidBodies.end(),
+                [e](const auto& rb) {
+                    auto* crb = static_cast<ConcreteRigidBody*>(rb.get());
+                    return crb && crb->GetEntity() == e && crb->IsStatic();
+                });
+            if (it != rigidBodies.end()) {
+                rigidBodies.erase(it);
+            }
+        }
+    }
+
+    // 1. Process pending creations (batched to avoid hanging the first frame)
+    static constexpr uint32_t MAX_CREATIONS_PER_FRAME = 100;
+    // The batch is moved into a local vector so bodies are created without holding pendingMutex.
+    std::vector<PendingCreation> toCreate; {
+        std::lock_guard<std::mutex> lk(pendingMutex);
+        size_t count = std::min<size_t>(pendingCreations.size(), MAX_CREATIONS_PER_FRAME);
+        if (count > 0) {
+            toCreate.assign(pendingCreations.begin(), pendingCreations.begin() + count);
+            pendingCreations.erase(pendingCreations.begin(), pendingCreations.begin() + count);
+        }
+    }
+    
+    for (auto& pc : toCreate) {
+        // CreateRigidBody takes rigidBodiesMutex itself, so it must not be held here.
+        RigidBody* rb = CreateRigidBody(pc.entity, pc.shape, pc.mass);
+        if (rb) {
+            rb->SetKinematic(pc.kinematic);
+            rb->SetRestitution(pc.restitution);
+            rb->SetFriction(pc.friction);
+        }
+    }
+
+    // 2. Step the simulation
+    // deltaTime is in milliseconds; the world expects seconds.
+    float dt = deltaTime.count() * 0.001f;
+    physicsWorld->step(dt);
+
+    // 3. Sync physics back to entities
+    std::lock_guard<std::mutex> lock(rigidBodiesMutex);
+    for (auto& rb : rigidBodies) {
+        auto* crb = static_cast<ConcreteRigidBody*>(rb.get());
+        
+        // Static bodies never move, so there is nothing to sync in either direction.
+        if (crb->IsStatic()) continue;
+
+        Entity* entity = crb->GetEntity();
+        if (!entity) continue;
+
+        auto* transform = entity->GetComponent<TransformComponent>();
+        if (!transform) continue;
+
+        // If it's a dynamic body (Fox when released), update the entity
+        if (!crb->IsKinematic()) {
+            PhysicsPose pose = physicsWorld->get_body_pose(crb->GetBodyID());
+            // The transform stores rotation as Euler angles; convert to compare and to write back.
+            if (pose.position != transform->GetPosition() || 
+                pose.orientation != glm::quat(transform->GetRotation())) {
+                transform->SetPosition(pose.position);
+                transform->SetRotation(glm::eulerAngles(pose.orientation));
+            }
+        } else {
+            // If it's kinematic, move the physics body to follow the entity (e.g. Fox when grabbed)
+            PhysicsPose pose;
+            pose.position = transform->GetPosition();
+            pose.orientation = glm::quat(transform->GetRotation());
+            physicsWorld->move_kinematic(crb->GetBodyID(), pose);
+        }
+    }
+}
+
+// Creates a physics body for `entity`, registers it with this system and returns
+// a non-owning pointer to it (the system keeps ownership).
+//
+// entity - entity the body represents; its TransformComponent (if any) supplies
+//          the initial position, rotation and scale.
+// shape  - collider type. Box and Sphere are sized from the MeshComponent's local
+//          AABB when available, otherwise from the transform scale; Capsule uses
+//          the scale only; Mesh bakes the mesh geometry (and all of its
+//          instances) into one world-space triangle collider.
+// mass   - mass <= 0 creates a static body, anything else a dynamic one. Mesh
+//          colliders are always static regardless of mass.
+//
+// Returns nullptr when the entity or world is missing, when no shape could be
+// built (e.g. Mesh with no usable triangles or over the triangle cap), or when
+// the world rejects the body.
+RigidBody* PhysicsSystem::CreateRigidBody(Entity* entity, CollisionShape shape, float mass) {
+    PhysicsWorld* physicsWorld = GetWorld(this);
+    if (!entity || !physicsWorld) return nullptr;
+
+    auto* transform = entity->GetComponent<TransformComponent>();
+    glm::vec3 pos = transform ? transform->GetPosition() : glm::vec3(0.0f);
+    glm::quat rot = transform ? glm::quat(transform->GetRotation()) : glm::quat(1.0f, 0.0f, 0.0f, 0.0f);
+    glm::vec3 scale = transform ? transform->GetScale() : glm::vec3(1.0f);
+
+    JPH::BodyCreationSettings settings;
+    settings.mPosition = JPH::RVec3(pos.x, pos.y, pos.z);
+    settings.mRotation = JPH::Quat(rot.x, rot.y, rot.z, rot.w);
+    
+    // Choose layer based on mass/kinematic.
+    // Triangle-mesh shapes are static-only, so a Mesh collider is forced static
+    // even if the caller passes a positive mass; a dynamic mesh body would have
+    // no valid mass properties.
+    const bool isStatic = (mass <= 0.0f) || (shape == CollisionShape::Mesh);
+    settings.mObjectLayer = isStatic ? ObjLayers::NON_MOVING : ObjLayers::MOVING;
+    settings.mMotionType = isStatic ? JPH::EMotionType::Static : JPH::EMotionType::Dynamic;
+    
+    if (!isStatic) {
+        settings.mAllowSleeping = false;
+        settings.mMotionQuality = JPH::EMotionQuality::LinearCast;
+    }
+
+    if (shape == CollisionShape::Box) {
+        auto* mc = entity->GetComponent<MeshComponent>();
+        if (mc && mc->HasLocalAABB()) {
+            glm::vec3 localMin = mc->GetLocalAABBMin();
+            glm::vec3 localMax = mc->GetLocalAABBMax();
+            glm::vec3 center = (localMin + localMax) * 0.5f;
+            glm::vec3 halfExtents = (localMax - localMin) * 0.5f;
+
+            // Add minimum thickness (especially for floors/walls) to prevent tunnelling
+            float hx = std::max(0.01f, std::abs(halfExtents.x * scale.x));
+            float hy = std::max(0.01f, std::abs(halfExtents.y * scale.y));
+            float hz = std::max(0.01f, std::abs(halfExtents.z * scale.z));
+            
+            if (isStatic) {
+                // For static environment objects, ensure at least 10cm thickness in each dimension
+                // to make them "solid" enough for fast-moving dynamic objects.
+                hx = std::max(hx, 0.1f);
+                hy = std::max(hy, 0.1f);
+                hz = std::max(hz, 0.1f);
+            }
+
+            auto* boxSettings = new JPH::BoxShapeSettings(JPH::Vec3(hx, hy, hz));
+            boxSettings->mConvexRadius = 0.0f;
+            
+            // Use RotatedTranslatedShape to offset the box geometry so the body's origin matches the entity's origin.
+            // This ensures that syncing physics back to the entity (and moving kinematic bodies) works correctly without manual offsets.
+            auto* s = new JPH::RotatedTranslatedShapeSettings(JPH::Vec3(center.x * scale.x, center.y * scale.y, center.z * scale.z), JPH::Quat::sIdentity(), boxSettings);
+            settings.SetShapeSettings(s);
+        } else {
+            // No AABB available: treat the transform scale as the full box size.
+            auto* s = new JPH::BoxShapeSettings(JPH::Vec3(std::max(0.01f, std::abs(scale.x) * 0.5f),
+                                                         std::max(0.01f, std::abs(scale.y) * 0.5f),
+                                                         std::max(0.01f, std::abs(scale.z) * 0.5f)));
+            s->mConvexRadius = 0.0f;
+            settings.SetShapeSettings(s);
+        }
+    } else if (shape == CollisionShape::Sphere) {
+        auto* mc = entity->GetComponent<MeshComponent>();
+        if (mc && mc->HasLocalAABB()) {
+            // Radius is the largest scaled half-extent of the local AABB.
+            glm::vec3 localMin = mc->GetLocalAABBMin();
+            glm::vec3 localMax = mc->GetLocalAABBMax();
+            glm::vec3 halfExtents = (localMax - localMin) * 0.5f;
+            float hx = std::abs(halfExtents.x * scale.x);
+            float hy = std::abs(halfExtents.y * scale.y);
+            float hz = std::abs(halfExtents.z * scale.z);
+            float radius = std::max(hx, std::max(hy, hz));
+            settings.SetShapeSettings(new JPH::SphereShapeSettings(std::max(0.01f, radius)));
+        } else {
+            settings.SetShapeSettings(new JPH::SphereShapeSettings(std::max(0.01f, std::abs(scale.x) * 0.5f)));
+        }
+    } else if (shape == CollisionShape::Capsule) {
+        // First argument is derived from scale.y, second from scale.x (both halved, min 0.01).
+        settings.SetShapeSettings(new JPH::CapsuleShapeSettings(std::max(0.01f, std::abs(scale.y) * 0.5f),
+                                                               std::max(0.01f, std::abs(scale.x) * 0.5f)));
+    } else if (shape == CollisionShape::Mesh) {
+        // Static triangle-mesh collider (mesh shapes are static-only). Bistro
+        // geometry is heavily GPU-instanced: a mesh authored in a small local
+        // space is placed, often many times, via per-instance model matrices.
+        // The renderer draws each instance at
+        //   worldPos = entityModel * instanceModel * localPos
+        // so we bake that same transform into the collider vertices and create
+        // the body at the identity pose. Baking the full transform (rather than
+        // using the TransformComponent's position/scale alone) is what keeps the
+        // collider aligned with the visible geometry; it also scales tiny local
+        // triangles up to world size so they survive the degenerate-area filter.
+        // A non-instanced mesh is treated as a single identity instance.
+        auto* mc = entity->GetComponent<MeshComponent>();
+        if (mc) {
+            const auto& vertices = mc->GetVertices();
+            const auto& indices = mc->GetIndices();
+
+            if (!vertices.empty() && !indices.empty()) {
+                const glm::mat4 entityModel = transform ? transform->GetModelMatrix() : glm::mat4(1.0f);
+
+                const auto& instances = mc->GetInstances();
+                std::vector<glm::mat4> worldXforms;
+                if (instances.empty()) {
+                    worldXforms.push_back(entityModel);
+                } else {
+                    worldXforms.reserve(instances.size());
+                    for (const auto& inst : instances) {
+                        worldXforms.push_back(entityModel * inst.getModelMatrix());
+                    }
+                }
+
+                // Cap total triangles so dense instanced foliage (hundreds of
+                // copies of a high-poly bush) doesn't build a multi-million-tri
+                // shape on the physics thread. Such decorative meshes simply get
+                // no collider, which is fine for a walkable environment.
+                const size_t triCount = indices.size() / 3;
+                const size_t vertexCount = vertices.size();
+                constexpr size_t MAX_TOTAL_TRIS = 200000;
+                if (triCount * worldXforms.size() <= MAX_TOTAL_TRIS) {
+                    JPH::VertexList joltVertices;
+                    joltVertices.reserve(vertexCount * worldXforms.size());
+                    JPH::IndexedTriangleList joltTriangles;
+                    joltTriangles.reserve(triCount * worldXforms.size());
+
+                    for (const glm::mat4& M : worldXforms) {
+                        // Each instance appends its own copy of the vertices; `base`
+                        // rebases the mesh-local indices onto that copy.
+                        const uint32_t base = static_cast<uint32_t>(joltVertices.size());
+                        for (const auto& v : vertices) {
+                            glm::vec3 wp = glm::vec3(M * glm::vec4(v.position, 1.0f));
+                            joltVertices.push_back(JPH::Float3(wp.x, wp.y, wp.z));
+                        }
+                        for (size_t i = 0; i + 2 < indices.size(); i += 3) {
+                            const size_t a = static_cast<size_t>(indices[i]);
+                            const size_t b = static_cast<size_t>(indices[i + 1]);
+                            const size_t c = static_cast<size_t>(indices[i + 2]);
+                            // Skip triangles that reference vertices the mesh does not
+                            // have; reading them would index past this instance's
+                            // vertex copy (or past the end of joltVertices).
+                            if (a >= vertexCount || b >= vertexCount || c >= vertexCount) continue;
+
+                            const uint32_t i0 = base + static_cast<uint32_t>(a);
+                            const uint32_t i1 = base + static_cast<uint32_t>(b);
+                            const uint32_t i2 = base + static_cast<uint32_t>(c);
+                            JPH::Vec3 v0(joltVertices[i0].x, joltVertices[i0].y, joltVertices[i0].z);
+                            JPH::Vec3 v1(joltVertices[i1].x, joltVertices[i1].y, joltVertices[i1].z);
+                            JPH::Vec3 v2(joltVertices[i2].x, joltVertices[i2].y, joltVertices[i2].z);
+                            // Filter out degenerate (zero-area) triangles in world space.
+                            if ((v1 - v0).Cross(v2 - v0).LengthSq() > 1e-12f) {
+                                joltTriangles.push_back(JPH::IndexedTriangle(i0, i1, i2));
+                            }
+                        }
+                    }
+
+                    if (!joltTriangles.empty()) {
+                        settings.SetShapeSettings(new JPH::MeshShapeSettings(joltVertices, joltTriangles));
+                        // Vertices are already in world space — the body must sit
+                        // at the identity pose so the collider lands on the geometry.
+                        settings.mPosition = JPH::RVec3(0.0f, 0.0f, 0.0f);
+                        settings.mRotation = JPH::Quat::sIdentity();
+                    }
+                }
+            }
+        }
+    }
+
+    // Covers both "no shape settings were assigned above" and "shape creation failed".
+    if (!settings.GetShape()) {
+        std::cerr << "PhysicsSystem: Failed to create shape for entity " << entity->GetName() << std::endl;
+        return nullptr;
+    }
+
+    JPH::BodyID bodyId = physicsWorld->create_body(settings);
+    if (bodyId.IsInvalid()) return nullptr;
+
+    auto rb = std::make_unique<ConcreteRigidBody>(entity, physicsWorld, bodyId, isStatic);
+    RigidBody* ptr = rb.get();
+    
+    std::lock_guard<std::mutex> lock(rigidBodiesMutex);
+    rigidBodies.push_back(std::move(rb));
+    
+    return ptr;
+}
+
+// Removes `rigidBody` from this system, which destroys it.
+// Returns true when the body was found and erased, false for nullptr or an
+// unknown pointer.
+bool PhysicsSystem::DestroyRigidBody(RigidBody* rigidBody) {
+    if (rigidBody == nullptr) {
+        return false;
+    }
+
+    std::lock_guard<std::mutex> guard(rigidBodiesMutex);
+    for (auto it = rigidBodies.begin(); it != rigidBodies.end(); ++it) {
+        if (it->get() == rigidBody) {
+            rigidBodies.erase(it);
+            return true;
+        }
+    }
+    return false;
+}
+
+// Intentionally a no-op: the requested gravity is ignored because the
+// PhysicsWorld wrapper exposes no gravity setter. GetGravity() therefore keeps
+// returning its fixed value regardless of what is passed here.
+void PhysicsSystem::SetGravity(const glm::vec3& _gravity) {
+    // Jolt physics world doesn't have a simple SetGravity in our wrapper, 
+    // but it's usually handled in JPH::PhysicsSystem.
+}
+
+// Reports a fixed gravity vector pointing down the -Y axis; the value is
+// hard-coded rather than queried from the physics world.
+glm::vec3 PhysicsSystem::GetGravity() const {
+    const glm::vec3 gravity(0.0f, -9.81f, 0.0f);
+    return gravity;
+}
+
+// Casts a ray through the physics world.
+//
+// origin, direction, maxDistance - ray definition, forwarded unchanged to the world.
+// hitPosition - optional; receives origin + direction * distance on a hit.
+// hitNormal   - optional; receives the surface normal reported by the world.
+// hitEntity   - optional; receives the entity owning the hit body, or nullptr
+//               when the hit body is not tracked by this system.
+//
+// Returns true on a hit. On a miss (or when no world exists) the output
+// arguments are left untouched.
+bool PhysicsSystem::Raycast(const glm::vec3& origin, const glm::vec3& direction, float maxDistance,
+                           glm::vec3* hitPosition, glm::vec3* hitNormal, Entity** hitEntity) const {
+    PhysicsWorld* physicsWorld = GetWorld(this);
+    if (!physicsWorld) return false;
+    
+    JPH::BodyID bodyId;
+    float distance;
+    glm::vec3 normal;
+    if (physicsWorld->raycast(origin, direction, maxDistance, distance, normal, bodyId)) {
+        if (hitPosition) *hitPosition = origin + direction * distance;
+        if (hitNormal) *hitNormal = normal;
+        if (hitEntity) {
+            // Start from a defined value: if no tracked body matches the hit id the
+            // caller would otherwise read whatever *hitEntity held before the call.
+            *hitEntity = nullptr;
+            std::lock_guard<std::mutex> lock(rigidBodiesMutex);
+            for (const auto& rb : rigidBodies) {
+                auto* crb = static_cast<ConcreteRigidBody*>(rb.get());
+                if (crb->GetBodyID() == bodyId) {
+                    *hitEntity = crb->GetEntity();
+                    break;
+                }
+            }
+        }
+        return true;
+    }
+    return false;
+}
+
+// Queues a rigid-body creation request; Update() drains the queue in bounded
+// batches and applies the kinematic/restitution/friction settings after creating
+// each body.
+void PhysicsSystem::EnqueueRigidBodyCreation(Entity* entity, CollisionShape shape, float mass, bool kinematic, float restitution, float friction) {
+    PendingCreation request{entity, shape, mass, kinematic, restitution, friction};
+
+    std::lock_guard<std::mutex> guard(pendingMutex);
+    pendingCreations.push_back(request);
+}
+
+// Records a collider that Update() may create and destroy on demand depending on
+// how far `center` is from the camera. The candidate starts inactive, so no body
+// exists until it is first promoted. The entity's slot in the candidate list is
+// remembered in entityToStreamingIndex.
+void PhysicsSystem::RegisterStreamingCollider(Entity* entity,
+                                              CollisionShape shape,
+                                              float mass,
+                                              bool kinematic,
+                                              float restitution,
+                                              float friction,
+                                              const glm::vec3& center) {
+    std::lock_guard<std::mutex> guard(streamingMutex);
+
+    StreamingCandidate candidate;
+    candidate.entity      = entity;
+    candidate.center      = center;
+    candidate.active      = false;
+    candidate.shape       = shape;
+    candidate.mass        = mass;
+    candidate.kinematic   = kinematic;
+    candidate.restitution = restitution;
+    candidate.friction    = friction;
+
+    // The new candidate is appended, so its index is the current list size.
+    const auto slot = streamingCandidates.size();
+    entityToStreamingIndex[entity] = slot;
+    streamingCandidates.push_back(candidate);
+}
+
+
+// Intentionally empty in this implementation: bodies are removed immediately by
+// DestroyRigidBody, so there is no deferred "marked for deletion" list to process.
+void PhysicsSystem::CleanupMarkedBodies() {
+    // Handled via DestroyRigidBody
+}
+
+// GPU-physics entry points. All of them are empty stubs in this file: they
+// exist so the definitions match the declarations, and they perform no work.
+void PhysicsSystem::UpdateGPUPhysicsData(std::chrono::milliseconds deltaTime) const {}
+void PhysicsSystem::ReadbackGPUPhysicsData() const {}
+void PhysicsSystem::SimulatePhysicsOnGPU(std::chrono::milliseconds deltaTime) const {}
+void PhysicsSystem::CleanupVulkanResources() {}
+
+// Empty stub: no buffer or memory is created and the output arguments are left untouched.
+void PhysicsSystem::CreateMappedBuffer(vk::DeviceSize size, vk::BufferUsageFlags usage, vk::raii::Buffer& buffer, vk::raii::DeviceMemory& memory, const std::string& errorPrefix) {}
\ No newline at end of file
diff --git a/attachments/advanced_gltf/physics_system.h b/attachments/advanced_gltf/physics_system.h
new file mode 100644
index 000000000..acc715756
--- /dev/null
+++ b/attachments/advanced_gltf/physics_system.h
@@ -0,0 +1,491 @@
+/* Copyright (c) 2025 Holochip Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <chrono>
+#include <glm/glm.hpp>
+#include <memory>
+#include <mutex>
+#include <stdexcept>
+#include <unordered_map>
+#include <vector>
+#include <vulkan/vulkan_raii.hpp>
+
+class Entity;
+class Renderer;
+
+/**
+ * @brief Enum for different collision shapes.
+ *
+ * Passed to PhysicsSystem::CreateRigidBody and the enqueue/streaming helpers to
+ * select which collider is built for an entity.
+ */
+enum class CollisionShape {
+  Box,     // Axis-aligned box collider.
+  Sphere,  // Sphere collider.
+  Capsule, // Capsule collider.
+  Mesh     // Triangle-mesh collider built from the entity's mesh geometry.
+};
+
+/**
+ * @brief Class representing a rigid body for physics simulation.
+ *
+ * Abstract interface: every operation is pure virtual and is supplied by a
+ * concrete implementation. Instances are handed out by
+ * PhysicsSystem::CreateRigidBody and released with PhysicsSystem::DestroyRigidBody.
+ */
+class RigidBody {
+  public:
+    /**
+     * @brief Default constructor.
+     */
+    RigidBody() = default;
+
+    /**
+     * @brief Destructor for proper cleanup.
+     */
+    virtual ~RigidBody() = default;
+
+    /**
+     * @brief Set the position of the rigid body.
+     * @param position The position.
+     */
+    virtual void SetPosition(const glm::vec3& position) = 0;
+
+    /**
+     * @brief Set the rotation of the rigid body.
+     * @param rotation The rotation quaternion.
+     */
+    virtual void SetRotation(const glm::quat& rotation) = 0;
+
+    /**
+     * @brief Set the scale of the rigid body.
+     * @param scale The scale.
+     */
+    virtual void SetScale(const glm::vec3& scale) = 0;
+
+    /**
+     * @brief Set the mass of the rigid body.
+     * @param mass The mass.
+     */
+    virtual void SetMass(float mass) = 0;
+
+    /**
+     * @brief Set the restitution (bounciness) of the rigid body.
+     * @param restitution The restitution (0.0f to 1.0f).
+     */
+    virtual void SetRestitution(float restitution) = 0;
+
+    /**
+     * @brief Set the friction of the rigid body.
+     * @param friction The friction (0.0f to 1.0f).
+     */
+    virtual void SetFriction(float friction) = 0;
+
+    /**
+     * @brief Apply a force to the rigid body.
+     * @param force The force vector.
+     * @param localPosition The local position to apply the force at
+     *        (defaults to the origin, glm::vec3(0.0f)).
+     */
+    virtual void ApplyForce(const glm::vec3& force, const glm::vec3& localPosition = glm::vec3(0.0f)) = 0;
+
+    /**
+     * @brief Apply an impulse to the rigid body.
+     * @param impulse The impulse vector.
+     * @param localPosition The local position to apply the impulse at
+     *        (defaults to the origin, glm::vec3(0.0f)).
+     */
+    virtual void ApplyImpulse(const glm::vec3& impulse, const glm::vec3& localPosition = glm::vec3(0.0f)) = 0;
+
+    /**
+     * @brief Set the linear velocity of the rigid body.
+     * @param velocity The linear velocity.
+     */
+    virtual void SetLinearVelocity(const glm::vec3& velocity) = 0;
+
+    /**
+     * @brief Set the angular velocity of the rigid body.
+     * @param velocity The angular velocity.
+     */
+    virtual void SetAngularVelocity(const glm::vec3& velocity) = 0;
+
+    /**
+     * @brief Get the position of the rigid body.
+     * @return The position.
+     */
+    [[nodiscard]] virtual glm::vec3 GetPosition() const = 0;
+
+    /**
+     * @brief Get the rotation of the rigid body.
+     * @return The rotation quaternion.
+     */
+    [[nodiscard]] virtual glm::quat GetRotation() const = 0;
+
+    /**
+     * @brief Get the linear velocity of the rigid body.
+     * @return The linear velocity.
+     */
+    [[nodiscard]] virtual glm::vec3 GetLinearVelocity() const = 0;
+
+    /**
+     * @brief Get the angular velocity of the rigid body.
+     * @return The angular velocity.
+     */
+    [[nodiscard]] virtual glm::vec3 GetAngularVelocity() const = 0;
+
+    /**
+     * @brief Set whether the rigid body is kinematic.
+     * @param kinematic Whether the rigid body is kinematic.
+     */
+    virtual void SetKinematic(bool kinematic) = 0;
+
+    /**
+     * @brief Check if the rigid body is kinematic.
+     * @return True if kinematic, false otherwise.
+     */
+    [[nodiscard]] virtual bool IsKinematic() const = 0;
+};
+
+/**
+ * @brief Structure for GPU physics data.
+ *
+ * Per-body record made of eight glm::vec4 fields. Scalar properties are packed
+ * into the otherwise unused w component of each vector, as noted per field.
+ * NOTE(review): the field order presumably has to match a GPU-side struct that
+ * is not visible here — confirm before reordering.
+ */
+struct GPUPhysicsData {
+  glm::vec4 position; // xyz = position, w = inverse mass
+  glm::vec4 rotation; // quaternion
+  glm::vec4 linearVelocity; // xyz = velocity, w = restitution
+  glm::vec4 angularVelocity; // xyz = angular velocity, w = friction
+  glm::vec4 force; // xyz = force, w = is kinematic (0 or 1)
+  glm::vec4 torque; // xyz = torque, w = use gravity (0 or 1)
+  glm::vec4 colliderData; // type-specific data (e.g., radius for spheres)
+  glm::vec4 colliderData2; // additional collider data (e.g., box half extents)
+};
+
+/**
+ * @brief Structure for GPU collision data.
+ *
+ * One contact between two bodies, identified by bodyA and bodyB.
+ * NOTE(review): two 32-bit values precede the first vec4 and, unlike
+ * PhysicsParams, there is no explicit padding between them — confirm that the
+ * CPU layout matches the GPU-side declaration.
+ */
+struct GPUCollisionData {
+  uint32_t bodyA; // first body of the contact pair
+  uint32_t bodyB; // second body of the contact pair
+  glm::vec4 contactNormal; // xyz = normal, w = penetration depth
+  glm::vec4 contactPoint; // xyz = contact point, w = unused
+};
+
+/**
+ * @brief Structure for physics simulation parameters.
+ *
+ * Four 4-byte scalars followed by one 16-byte vector; the explicit padding
+ * member exists solely to place gravity on a 16-byte boundary.
+ */
+struct PhysicsParams {
+  float deltaTime; // Time step - 4 bytes
+  uint32_t numBodies; // Number of rigid bodies - 4 bytes
+  uint32_t maxCollisions; // Maximum number of collisions - 4 bytes
+  float padding; // Explicit padding to align gravity to 16-byte boundary - 4 bytes
+  glm::vec4 gravity; // Gravity vector (xyz) + padding (w) - 16 bytes
+  // Total: 32 bytes (aligned to 16-byte boundaries for std140 layout)
+};
+
+/**
+ * @brief Structure to store collision prediction data for a ray-based collision system.
+ *
+ * A default-constructed prediction represents "no collision": isValid is false,
+ * collisionTime is -1, hitEntity is null and all vectors are zero.
+ */
+struct CollisionPrediction {
+  float collisionTime = -1.0f; // Time within deltaTime when the collision occurs (-1 = no collision)
+  // The vectors are zero-initialized explicitly so every field of a
+  // default-constructed prediction has a defined value, like the scalars.
+  glm::vec3 collisionPoint{0.0f}; // World position where collision occurs
+  glm::vec3 collisionNormal{0.0f}; // Surface normal at collision point
+  glm::vec3 newVelocity{0.0f}; // Predicted velocity after bounce
+  Entity* hitEntity = nullptr; // Entity that was hit
+  bool isValid = false; // Whether this prediction is valid
+};
+
+/**
+ * @brief Owns the rigid bodies and the Vulkan compute resources used to simulate them.
+ *
+ * This class implements the physics system as described in the Subsystems chapter:
+ * @see en/Building_a_Simple_Engine/Subsystems/04_physics_basics.adoc
+ * @see en/Building_a_Simple_Engine/Subsystems/05_vulkan_physics.adoc
+ */
+class PhysicsSystem {
+  public:
+    /**
+	 * @brief Default constructor; stores no renderer and does not call Initialize().
+	 */
+    PhysicsSystem() = default;
+
+    // Stores the renderer, then runs Initialize() and throws std::runtime_error if it fails.
+    explicit PhysicsSystem(Renderer* _renderer, bool enableGPU = true) {
+      SetRenderer(_renderer);
+      SetGPUAccelerationEnabled(enableGPU); // the value is ignored: the setter always enables GPU
+      if (!Initialize()) {
+        throw std::runtime_error("PhysicsSystem: initialization failed");
+      }
+    }
+
+    /**
+	 * @brief Destructor for proper cleanup.
+	 */
+    ~PhysicsSystem();
+
+    /**
+	 * @brief Update the physics system.
+	 * @param deltaTime The time elapsed since the last update, in milliseconds.
+	 */
+    void Update(std::chrono::milliseconds deltaTime);
+
+    /**
+	 * @brief Create a rigid body.
+	 * @param entity The entity to attach the rigid body to.
+	 * @param shape The collision shape.
+	 * @param mass The mass.
+	 * @return Pointer to the created rigid body, or nullptr if creation failed.
+	 */
+    RigidBody* CreateRigidBody(Entity* entity, CollisionShape shape, float mass);
+
+    /**
+	 * @brief Destroy a rigid body.
+	 * @param rigidBody The rigid body to destroy.
+	 * @return True if destruction was successful, false otherwise.
+	 */
+    bool DestroyRigidBody(RigidBody* rigidBody);
+
+    /**
+	 * @brief Set the gravity of the physics world.
+	 * @param _gravity The gravity vector.
+	 */
+    void SetGravity(const glm::vec3& _gravity);
+
+    /**
+	 * @brief Get the gravity of the physics world.
+	 * @return The gravity vector.
+	 */
+    [[nodiscard]] glm::vec3 GetGravity() const;
+
+    /**
+	 * @brief Perform a raycast.
+	 * @param origin The origin of the ray.
+	 * @param direction The direction of the ray.
+	 * @param maxDistance The maximum distance of the ray.
+	 * @param hitPosition Output parameter for the hit position.
+	 * @param hitNormal Output parameter for the hit normal.
+	 * @param hitEntity Output parameter for the hit entity.
+	 * @return True if the ray hit something, false otherwise.
+	 */
+    bool Raycast(const glm::vec3& origin,
+                 const glm::vec3& direction,
+                 float maxDistance,
+                 glm::vec3* hitPosition,
+                 glm::vec3* hitNormal,
+                 Entity** hitEntity) const;
+
+    /**
+	 * @brief Force GPU acceleration on; requests to disable it are ignored.
+	 * @param enabled Ignored: the flag is always set to true.
+	 */
+    void SetGPUAccelerationEnabled(bool enabled) {
+      // Enforce GPU-only policy: disabling GPU acceleration is not allowed in this project.
+      // Ignore attempts to disable and keep GPU acceleration enabled.
+      gpuAccelerationEnabled = true;
+    }
+
+    /**
+	 * @brief Check if GPU acceleration is enabled.
+	 * @return The stored flag: false until SetGPUAccelerationEnabled() has been called, true afterwards.
+	 */
+    [[nodiscard]] bool IsGPUAccelerationEnabled() const {
+      return gpuAccelerationEnabled;
+    }
+
+    /**
+	 * @brief Set the maximum number of objects that can be simulated on the GPU.
+	 * @param maxObjects The maximum number of objects.
+	 */
+    void SetMaxGPUObjects(uint32_t maxObjects) {
+      maxGPUObjects = maxObjects;
+    }
+
+    /**
+	 * @brief Set the renderer to use during GPU acceleration.
+	 * @param _renderer The renderer; the pointer is stored as-is (non-owning).
+	 */
+    void SetRenderer(Renderer* _renderer) {
+      renderer = _renderer;
+    }
+
+    /**
+	 * @brief Set the current camera position for geometry-relative ball checking.
+	 * @param _cameraPosition The current camera position; copied under cameraPositionMutex.
+	 */
+    void SetCameraPosition(const glm::vec3& _cameraPosition) {
+      std::lock_guard<std::mutex> lock(cameraPositionMutex);
+      cameraPosition = _cameraPosition;
+    }
+
+    // Thread-safe enqueue for rigid body creation from any thread
+    void EnqueueRigidBodyCreation(Entity* entity,
+                                  CollisionShape shape,
+                                  float mass,
+                                  bool kinematic,
+                                  float restitution,
+                                  float friction);
+
+  private:
+    /**
+	 * @brief Initialize the physics system (called by constructor).
+	 * @return True if initialization was successful, false otherwise.
+	 */
+    bool Initialize();
+
+    /**
+	 * @brief Clean up rigid bodies that are marked for removal.
+	 */
+    void CleanupMarkedBodies();
+
+    /**
+	 * @brief Helper function to create a mapped buffer with memory allocation.
+	 * @param size The size of the buffer in bytes.
+	 * @param usage The buffer usage flags.
+	 * @param buffer Reference to the buffer RAII object.
+	 * @param memory Reference to the memory RAII object.
+	 * @param errorPrefix Prefix for error messages.
+	 */
+    void CreateMappedBuffer(vk::DeviceSize size,
+                            vk::BufferUsageFlags usage,
+                            vk::raii::Buffer& buffer,
+                            vk::raii::DeviceMemory& memory,
+                            const std::string& errorPrefix);
+
+    // Pending rigid body creations queued from background threads
+    struct PendingCreation {
+      Entity* entity;
+      CollisionShape shape;
+      float mass;
+      bool kinematic;
+      float restitution;
+      float friction;
+    };
+    std::mutex pendingMutex;
+    std::vector<PendingCreation> pendingCreations;
+
+    // ------------------------------------------------------------------
+    // Streaming colliders: for very large scenes (Bistro has ~500+ static
+    // colliders), creating Jolt bodies for everything up-front causes
+    // multi-second hitches and burns memory. Instead, static colliders are
+    // registered as "streaming candidates" with a precomputed world-space
+    // center; PhysicsSystem::Update promotes candidates within
+    // `streamingRadius` of the camera to real bodies, and removes bodies
+    // that drift beyond `streamingEvictRadius`.
+    // ------------------------------------------------------------------
+    struct StreamingCandidate {
+      Entity* entity;
+      CollisionShape shape;
+      float mass;
+      bool kinematic;
+      float restitution;
+      float friction;
+      glm::vec3 center;
+      bool active = false; // true while a real body exists for this candidate
+    };
+    std::mutex streamingMutex;
+    std::vector<StreamingCandidate> streamingCandidates;
+    // Map entity -> index into streamingCandidates for fast eviction lookup.
+    std::unordered_map<Entity*, size_t> entityToStreamingIndex;
+    // Streaming distances in world units. NOTE(review): earlier wording called these "squared", but the
+    // defaults (20, 30) read like plain distances next to the ~15m Bistro remark — confirm how Update() compares them.
+    float streamingRadius = 20.0f;
+    float streamingEvictRadius = 30.0f;
+
+  public:
+    // Register a static collider for distance-based streaming. The body is NOT
+    // created until the camera moves within `streamingRadius` of `center`.
+    void RegisterStreamingCollider(Entity* entity,
+                                   CollisionShape shape,
+                                   float mass,
+                                   bool kinematic,
+                                   float restitution,
+                                   float friction,
+                                   const glm::vec3& center);
+
+  private:
+
+    // Rigid bodies
+    mutable std::mutex rigidBodiesMutex; // Protect concurrent access to rigidBodies
+    std::vector<std::unique_ptr<RigidBody>> rigidBodies;
+
+    // Gravity
+    glm::vec3 gravity = glm::vec3(0.0f, -9.81f, 0.0f);
+
+    // Whether the physics system is initialized
+    bool initialized = false;
+
+    // GPU acceleration (the flag starts false; SetGPUAccelerationEnabled always sets it to true)
+    bool gpuAccelerationEnabled = false;
+    uint32_t maxGPUObjects = 1024;
+    uint32_t maxGPUCollisions = 4096;
+    Renderer* renderer = nullptr;
+
+    // Camera position for geometry-relative ball checking
+    mutable std::mutex cameraPositionMutex;
+    glm::vec3 cameraPosition = glm::vec3(0.0f, 0.0f, 0.0f);
+
+    // Vulkan resources for physics simulation
+    struct VulkanResources {
+      // Shader modules
+      vk::raii::ShaderModule integrateShaderModule = nullptr;
+      vk::raii::ShaderModule broadPhaseShaderModule = nullptr;
+      vk::raii::ShaderModule narrowPhaseShaderModule = nullptr;
+      vk::raii::ShaderModule resolveShaderModule = nullptr;
+
+      // Pipeline layouts and compute pipelines
+      vk::raii::DescriptorSetLayout descriptorSetLayout = nullptr;
+      vk::raii::PipelineLayout pipelineLayout = nullptr;
+      vk::raii::Pipeline integratePipeline = nullptr;
+      vk::raii::Pipeline broadPhasePipeline = nullptr;
+      vk::raii::Pipeline narrowPhasePipeline = nullptr;
+      vk::raii::Pipeline resolvePipeline = nullptr;
+
+      // Descriptor pool and sets
+      vk::raii::DescriptorPool descriptorPool = nullptr;
+      std::vector<vk::raii::DescriptorSet> descriptorSets;
+
+      // Buffers for physics data
+      vk::raii::Buffer physicsBuffer = nullptr;
+      vk::raii::DeviceMemory physicsBufferMemory = nullptr;
+      vk::raii::Buffer collisionBuffer = nullptr;
+      vk::raii::DeviceMemory collisionBufferMemory = nullptr;
+      vk::raii::Buffer pairBuffer = nullptr;
+      vk::raii::DeviceMemory pairBufferMemory = nullptr;
+      vk::raii::Buffer counterBuffer = nullptr;
+      vk::raii::DeviceMemory counterBufferMemory = nullptr;
+      vk::raii::Buffer paramsBuffer = nullptr;
+      vk::raii::DeviceMemory paramsBufferMemory = nullptr;
+
+      // Persistent mapped memory pointers for improved performance
+      void* persistentPhysicsMemory = nullptr;
+      void* persistentCounterMemory = nullptr;
+      void* persistentParamsMemory = nullptr;
+
+      // Command buffer for compute operations
+      vk::raii::CommandPool commandPool = nullptr;
+      vk::raii::CommandBuffer commandBuffer = nullptr;
+
+      // Dedicated fence for compute synchronization
+      vk::raii::Fence computeFence = nullptr;
+    };
+
+    VulkanResources vulkanResources;
+
+    // Create / release the Vulkan objects held in vulkanResources
+    bool InitializeVulkanResources();
+    void CleanupVulkanResources();
+
+    // Update physics data on the GPU
+    void UpdateGPUPhysicsData(std::chrono::milliseconds deltaTime) const;
+
+    // Read back physics data from the GPU
+    void ReadbackGPUPhysicsData() const;
+
+    // Perform GPU-accelerated physics simulation
+    void SimulatePhysicsOnGPU(std::chrono::milliseconds deltaTime) const;
+};
\ No newline at end of file
diff --git a/attachments/advanced_gltf/physics_world_jolt.cpp b/attachments/advanced_gltf/physics_world_jolt.cpp
new file mode 100644
index 000000000..078af6fba
--- /dev/null
+++ b/attachments/advanced_gltf/physics_world_jolt.cpp
@@ -0,0 +1,268 @@
+/* Copyright (c) 2026 Holochip Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 the "License";
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+// Jolt Physics 5.x wrapper implementing PhysicsWorld.
+// Tested against JoltPhysics commit pinned in CMakeLists.txt (tag v5.2.0).
+// The Jolt API is not ABI-stable across major versions — check their CHANGELOG if you update.
+#include "physics_interface.h"
+
+#include <Jolt/Jolt.h>
+#include <Jolt/RegisterTypes.h>
+#include <Jolt/Core/Factory.h>
+#include <Jolt/Core/TempAllocator.h>
+#include <Jolt/Core/JobSystemThreadPool.h>
+#include <Jolt/Physics/PhysicsSettings.h>
+#include <Jolt/Physics/PhysicsSystem.h>
+#include <Jolt/Physics/Body/BodyCreationSettings.h>
+#include <Jolt/Physics/Body/BodyActivationListener.h>
+#include <Jolt/Physics/Body/BodyLock.h>
+#include <Jolt/Physics/Body/BodyLockMulti.h>
+#include <Jolt/Physics/Collision/Shape/BoxShape.h>
+#include <Jolt/Physics/Collision/Shape/CapsuleShape.h>
+#include <Jolt/Physics/Collision/Shape/SphereShape.h>
+#include <Jolt/Physics/Constraints/PointConstraint.h>
+#include <Jolt/Physics/Constraints/HingeConstraint.h>
+#include <Jolt/Physics/Constraints/SwingTwistConstraint.h>
+#include <Jolt/Physics/Collision/RayCast.h>
+#include <Jolt/Physics/Collision/CastResult.h>
+
+#include <cassert>
+
+// Broad-phase layers: static world geometry vs everything else.
+namespace BPLayers {
+    static constexpr JPH::BroadPhaseLayer NON_MOVING(0); // layer for static geometry
+    static constexpr JPH::BroadPhaseLayer MOVING(1);     // layer for everything else
+    static constexpr uint32_t NUM_LAYERS = 2;            // number of layers declared above
+}
+
+// Object layers: maps to broad-phase layers.
+namespace ObjLayers {
+    static constexpr JPH::ObjectLayer NON_MOVING = 0; // same numeric value as BPLayers::NON_MOVING
+    static constexpr JPH::ObjectLayer MOVING     = 1; // same numeric value as BPLayers::MOVING
+    static constexpr JPH::ObjectLayer NUM_LAYERS = 2; // number of object layers; sizes the lookup table
+}
+
+class BPLayerInterfaceImpl final : public JPH::BroadPhaseLayerInterface { // object layer -> broad-phase layer table
+public:
+    BPLayerInterfaceImpl() { // each object layer maps to the like-named broad-phase layer
+        obj_to_bp[ObjLayers::MOVING]     = BPLayers::MOVING;
+        obj_to_bp[ObjLayers::NON_MOVING] = BPLayers::NON_MOVING;
+    }
+    uint32_t GetNumBroadPhaseLayers() const override { return BPLayers::NUM_LAYERS; }
+    JPH::BroadPhaseLayer GetBroadPhaseLayer(JPH::ObjectLayer layer) const override {
+        assert(layer < ObjLayers::NUM_LAYERS); // debug-only bounds check on the table index
+        return obj_to_bp[layer];
+    }
+#if defined(JPH_EXTERNAL_PROFILE) || defined(JPH_PROFILE_ENABLED)
+    const char* GetBroadPhaseLayerName(JPH::BroadPhaseLayer layer) const override { // profiling builds only
+        return !(layer == BPLayers::NON_MOVING) ? "MOVING" : "NON_MOVING";
+    }
+#endif
+private:
+    JPH::BroadPhaseLayer obj_to_bp[ObjLayers::NUM_LAYERS]; // indexed by object layer, filled in the constructor
+};
+
+class ObjVsBPFilter final : public JPH::ObjectVsBroadPhaseLayerFilter {
+public:
+    // NON_MOVING objects are tested only against the MOVING broad-phase layer; every other object tests all layers.
+    bool ShouldCollide(JPH::ObjectLayer obj, JPH::BroadPhaseLayer bp) const override {
+        return obj != ObjLayers::NON_MOVING || bp == BPLayers::MOVING;
+    }
+};
+
+class ObjVsObjFilter final : public JPH::ObjectLayerPairFilter {
+public:
+    // A NON_MOVING first layer collides only with MOVING; any other first layer collides with everything.
+    bool ShouldCollide(JPH::ObjectLayer a, JPH::ObjectLayer b) const override {
+        return a != ObjLayers::NON_MOVING || b == ObjLayers::MOVING;
+    }
+};
+
+// Concrete PhysicsWorld backend: owns the JPH::PhysicsSystem plus the allocator and job system it runs on.
+class JoltPhysicsWorld final : public PhysicsWorld {
+public:
+    // Initializes Jolt's global factory and registers built-in types.
+    // Call this once at application startup before constructing any JoltPhysicsWorld.
+    static void global_init() {
+        JPH::RegisterDefaultAllocator();
+        JPH::Factory::sInstance = new JPH::Factory();
+        JPH::RegisterTypes();
+    }
+
+    // Call at application shutdown after all JoltPhysicsWorld instances are destroyed.
+    static void global_shutdown() {
+        JPH::UnregisterTypes();
+        delete JPH::Factory::sInstance;
+        JPH::Factory::sInstance = nullptr;
+    }
+
+    explicit JoltPhysicsWorld(
+        uint32_t max_bodies        = 20480,
+        uint32_t max_body_pairs    = 65536,
+        uint32_t max_contact_constraints = 32768,
+        uint32_t num_worker_threads = 4)
+    {
+        temp_allocator_ = std::make_unique<JPH::TempAllocatorImpl>(32 * 1024 * 1024); // 32 MiB scratch for Update()
+        job_system_     = std::make_unique<JPH::JobSystemThreadPool>(
+            JPH::cMaxPhysicsJobs, JPH::cMaxPhysicsBarriers, num_worker_threads);
+
+        system_.Init(max_bodies, 0, max_body_pairs, max_contact_constraints,
+                     bp_layer_interface_, obj_vs_bp_filter_, obj_vs_obj_filter_);
+        body_interface_ = &system_.GetBodyInterface();
+    }
+
+    JPH::BodyID create_body(const JPH::BodyCreationSettings& settings) override {
+        JPH::Body* body = body_interface_->CreateBody(settings);
+        if (!body) return JPH::BodyID(); // creation failed: hand back an invalid id
+        body_interface_->AddBody(body->GetID(), JPH::EActivation::Activate);
+        return body->GetID();
+    }
+
+    void destroy_body(JPH::BodyID id) override {
+        if (id.IsInvalid()) return; // e.g. the id create_body() returns on failure: nothing to remove
+        body_interface_->RemoveBody(id);
+        body_interface_->DestroyBody(id);
+    }
+
+    void set_motion_type(JPH::BodyID id, JPH::EMotionType type) override {
+        body_interface_->SetMotionType(id, type, JPH::EActivation::Activate);
+    }
+
+    void set_object_layer(JPH::BodyID id, uint16_t layer) override {
+        body_interface_->SetObjectLayer(id, layer);
+    }
+
+    void activate_body(JPH::BodyID id) override {
+        body_interface_->ActivateBody(id);
+    }
+
+    void move_kinematic(JPH::BodyID id, const PhysicsPose& pose) override { // sets the pose directly and wakes the body
+        body_interface_->SetPositionAndRotation(
+            id,
+            JPH::RVec3(pose.position.x, pose.position.y, pose.position.z),
+            JPH::Quat(pose.orientation.x, pose.orientation.y,
+                      pose.orientation.z, pose.orientation.w),
+            JPH::EActivation::Activate);
+    }
+
+    PhysicsPose get_body_pose(JPH::BodyID id) const override {
+        JPH::RVec3 pos;
+        JPH::Quat  rot;
+        body_interface_->GetPositionAndRotation(id, pos, rot);
+        return {
+            glm::vec3(pos.GetX(), pos.GetY(), pos.GetZ()),
+            glm::quat(rot.GetW(), rot.GetX(), rot.GetY(), rot.GetZ()), // glm::quat takes (w, x, y, z)
+        };
+    }
+
+    glm::vec3 get_linear_velocity(JPH::BodyID id) const override {
+        JPH::Vec3 v = body_interface_->GetLinearVelocity(id);
+        return glm::vec3(v.GetX(), v.GetY(), v.GetZ());
+    }
+
+    void set_linear_velocity(JPH::BodyID id, const glm::vec3& v) override {
+        body_interface_->SetLinearVelocity(id, JPH::Vec3(v.x, v.y, v.z));
+    }
+
+    // Approximates a ball-socket using SwingTwistConstraint; one swing limit is used for both cone half-angles.
+    void create_ball_socket_constraint(JPH::BodyID p1, JPH::BodyID p2,
+                                       float swing_rad, float twist_rad) override {
+        JPH::SwingTwistConstraintSettings s;
+        s.mSpace               = JPH::EConstraintSpace::LocalToBodyCOM;
+        s.mNormalHalfConeAngle = swing_rad;
+        s.mPlaneHalfConeAngle  = swing_rad;
+        s.mTwistMinAngle       = -twist_rad;
+        s.mTwistMaxAngle       =  twist_rad;
+        add_two_body_constraint(s, p1, p2);
+    }
+
+    // Hinge about `axis` (world space). NOTE(review): no anchor is passed in, so mPoint1/mPoint2 keep Jolt's defaults.
+    void create_hinge_constraint(JPH::BodyID p1, JPH::BodyID p2, const glm::vec3& axis,
+                                 float min_angle_rad, float max_angle_rad) override {
+        // Jolt expects a unit hinge axis; a zero-length input falls back to +Y rather than producing NaNs.
+        const JPH::Vec3 hinge = JPH::Vec3(axis.x, axis.y, axis.z).NormalizedOr(JPH::Vec3::sAxisY());
+        JPH::HingeConstraintSettings s;
+        s.mSpace       = JPH::EConstraintSpace::WorldSpace;
+        s.mHingeAxis1  = s.mHingeAxis2  = hinge;
+        // The normal axis must be perpendicular to the hinge; a fixed (0, 1, 0) is not when the hinge is vertical.
+        s.mNormalAxis1 = s.mNormalAxis2 = hinge.GetNormalizedPerpendicular();
+        s.mLimitsMin   = min_angle_rad;
+        s.mLimitsMax   = max_angle_rad;
+        add_two_body_constraint(s, p1, p2);
+    }
+
+    void step(float delta_seconds) override {
+        // cCollisionSteps=1 is fine for game-rate updates (60 Hz).
+        // Increase to 2 or 3 for ragdoll simulations to improve constraint stability.
+        system_.Update(delta_seconds, /*cCollisionSteps=*/1, temp_allocator_.get(), job_system_.get());
+    }
+
+    bool raycast(const glm::vec3& origin, const glm::vec3& direction, float max_distance,
+                 float& out_distance, glm::vec3& out_normal, JPH::BodyID& out_body_id) const override {
+        JPH::RRayCast ray(JPH::RVec3(origin.x, origin.y, origin.z),
+                          JPH::Vec3(direction.x * max_distance, direction.y * max_distance, direction.z * max_distance));
+        JPH::RayCastResult result;
+        if (system_.GetNarrowPhaseQuery().CastRay(ray, result)) {
+            out_distance = result.mFraction * max_distance; // a true distance only when `direction` is unit length
+            out_body_id  = result.mBodyID;
+
+            JPH::BodyLockRead lock(system_.GetBodyLockInterface(), out_body_id);
+            if (lock.Succeeded()) {
+                JPH::Vec3 normal = lock.GetBody().GetWorldSpaceSurfaceNormal(result.mSubShapeID2, ray.GetPointOnRay(result.mFraction));
+                out_normal = glm::vec3(normal.GetX(), normal.GetY(), normal.GetZ());
+            } else {
+                out_normal = glm::vec3(0, 1, 0); // body could not be locked: report +Y as a placeholder normal
+            }
+            return true;
+        }
+        return false; // no hit: the output parameters are left untouched
+    }
+
+    JPH::PhysicsSystem& get_physics_system() { return system_; }
+
+private:
+    // Builds `settings` between p1 and p2 and adds it to the system. Both bodies are locked through a single
+    // BodyLockMultiWrite; chaining two BodyLockWrite guards can deadlock when the ids map to the same mutex.
+    void add_two_body_constraint(const JPH::TwoBodyConstraintSettings& settings,
+                                 JPH::BodyID p1, JPH::BodyID p2) {
+        if (p1 == p2) return; // a two-body constraint needs two distinct bodies
+        const JPH::BodyID ids[2] = {p1, p2};
+        JPH::BodyLockMultiWrite lock(system_.GetBodyLockInterface(), ids, 2);
+        JPH::Body* first  = lock.GetBody(0); // nullptr when the id is invalid or stale
+        JPH::Body* second = lock.GetBody(1);
+        if (first != nullptr && second != nullptr) system_.AddConstraint(settings.Create(*first, *second));
+    }
+
+    BPLayerInterfaceImpl bp_layer_interface_; // the three layer objects are passed by reference to system_.Init()
+    ObjVsBPFilter        obj_vs_bp_filter_;
+    ObjVsObjFilter       obj_vs_obj_filter_;
+    std::unique_ptr<JPH::TempAllocatorImpl>    temp_allocator_; // handed to system_.Update() on every step
+    std::unique_ptr<JPH::JobSystemThreadPool>  job_system_;     // handed to system_.Update() on every step
+    JPH::PhysicsSystem   system_;                    // declared after the objects above, so it is destroyed first
+    JPH::BodyInterface*  body_interface_ = nullptr;  // points into system_; set by the constructor
+};
+
+// Static entry point declared on PhysicsWorld; forwards to the Jolt backend's
+// process-wide setup (JoltPhysicsWorld::global_init).
+void PhysicsWorld::global_init() { JoltPhysicsWorld::global_init(); }
+
+// Static entry point declared on PhysicsWorld; forwards to the Jolt backend's
+// process-wide teardown (JoltPhysicsWorld::global_shutdown).
+void PhysicsWorld::global_shutdown() { JoltPhysicsWorld::global_shutdown(); }
+
+// Factory: returns a JoltPhysicsWorld built with its default constructor
+// arguments, owned through the PhysicsWorld interface.
+std::unique_ptr<PhysicsWorld> PhysicsWorld::create() { return std::make_unique<JoltPhysicsWorld>(); }
diff --git a/attachments/advanced_gltf/renderer.h b/attachments/advanced_gltf/renderer.h
new file mode 100644
index 000000000..fafb1db6c
--- /dev/null
+++ b/attachments/advanced_gltf/renderer.h
@@ -0,0 +1,1932 @@
+/* Copyright (c) 2025 Holochip Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 the "License";
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <algorithm>
+#include <atomic>
+#include <chrono>
+#include <condition_variable>
+#include <cstddef>
+#include <deque>
+#include <future>
+#include <iostream>
+#include <memory>
+#include <mutex>
+#include <optional>
+#include <shared_mutex>
+#include <string>
+#include <thread>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include <vulkan/vk_platform.h>
+#include <vulkan/vulkan_hpp_macros.hpp>
+#include <vulkan/vulkan_raii.hpp>
+
+#include "camera_component.h"
+#include "entity.h"
+#include "memory_pool.h"
+#include "mesh_component.h"
+#include "model_loader.h"
+#include "platform.h"
+#include "thread_pool.h"
+
+// Fallback defines for optional extension names (allow compiling against older headers)
+#ifndef VK_EXT_ROBUSTNESS_2_EXTENSION_NAME
+#	define VK_EXT_ROBUSTNESS_2_EXTENSION_NAME "VK_EXT_robustness2"
+#endif
+#ifndef VK_KHR_DYNAMIC_RENDERING_LOCAL_READ_EXTENSION_NAME
+#	define VK_KHR_DYNAMIC_RENDERING_LOCAL_READ_EXTENSION_NAME "VK_KHR_dynamic_rendering_local_read"
+#endif
+#ifndef VK_EXT_SHADER_TILE_IMAGE_EXTENSION_NAME
+#	define VK_EXT_SHADER_TILE_IMAGE_EXTENSION_NAME "VK_EXT_shader_tile_image"
+#endif
+
+// Forward declarations
+class ImGuiSystem;
+
+/**
+ * @brief Vulkan queue family indices; an empty optional means that family has not been set.
+ */
+struct QueueFamilyIndices {
+  std::optional<uint32_t> graphicsFamily;
+  std::optional<uint32_t> presentFamily;
+  std::optional<uint32_t> computeFamily;
+  std::optional<uint32_t> transferFamily; // optional dedicated transfer queue family
+
+  [[nodiscard]] bool isComplete() const { // true once graphics, present and compute are all set
+    return graphicsFamily && presentFamily && computeFamily; // transferFamily is not required
+  }
+};
+
+/**
+ * @brief Surface capabilities, formats and present modes bundled together for swap chain setup.
+ */
+struct SwapChainSupportDetails {
+  vk::SurfaceCapabilitiesKHR capabilities; // presumably queried from the physical device/surface pair — TODO confirm
+  std::vector<vk::SurfaceFormatKHR> formats; // candidate surface formats
+  std::vector<vk::PresentModeKHR> presentModes; // candidate presentation modes
+};
+
+/**
+ * @brief One light record in the storage buffer: four 16-byte-aligned fields, then four 4-byte scalars.
+ */
+struct LightData {
+  alignas(16) glm::vec4 position; // Light position (w component used for direction vs position)
+  alignas(16) glm::vec4 color; // Light color and intensity
+  alignas(16) glm::mat4 lightSpaceMatrix; // Light space matrix for shadow mapping
+  alignas(16) glm::vec4 direction; // Light direction (for directional/spotlights)
+  alignas(4) int lightType; // 0=Point, 1=Directional, 2=Spot, 3=Emissive
+  alignas(4) float range; // Light range
+  alignas(4) float innerConeAngle; // For spotlights. NOTE(review): unit (radians/degrees/cosine) not visible here
+  alignas(4) float outerConeAngle; // For spotlights; same unit as innerConeAngle presumably — TODO confirm
+};
+
+struct ShadowUniforms { // view + projection pair; presumably the light's matrices for the shadow pass — TODO confirm
+  alignas(16) glm::mat4 view; // view matrix
+  alignas(16) glm::mat4 proj; // projection matrix
+};
+
+struct ShadowPushConstants { // single-matrix block; by its name, sent as push constants in the shadow pass
+  alignas(16) glm::mat4 model; // model matrix
+};
+
+/**
+ * @brief Shared uniform buffer layout (no fixed light arrays); explicit pads keep it in step with the shader UBO.
+ */
+struct UniformBufferObject {
+  alignas(16) glm::mat4 model;
+  alignas(16) glm::mat4 view;
+  alignas(16) glm::mat4 proj;
+  alignas(16) glm::vec4 camPos;
+  alignas(4) float exposure;
+  alignas(4) float gamma;
+  alignas(4) float prefilteredCubeMipLevels;
+  alignas(4) float scaleIBLAmbient;
+  alignas(4) int lightCount;
+  alignas(4) int padding0; // match shader UBO layout
+  alignas(4) float padding1; // match shader UBO layout
+  alignas(4) float padding2; // match shader UBO layout
+  alignas(8) glm::vec2 screenDimensions;
+  alignas(4) float nearZ;
+  alignas(4) float farZ;
+  alignas(4) float slicesZ;
+  alignas(4) float _uboPad3; // pad slot after slicesZ
+  // Planar reflections
+  alignas(16) glm::mat4 reflectionVP; // projection * mirroredView
+  alignas(4) int reflectionEnabled; // 1 when sampling reflection in main pass
+  alignas(4) int reflectionPass; // 1 during reflection render pass
+  alignas(8) glm::vec2 _reflectPad0; // pad slot before the 16-byte-aligned clip plane
+  alignas(16) glm::vec4 clipPlaneWS; // world-space plane ax+by+cz+d=0
+  // Controls
+  alignas(4) float reflectionIntensity; // scales reflection mix in glass
+  alignas(4) int enableRayQueryReflections = 1; // 1 to enable reflections in ray query mode
+  alignas(4) int enableRayQueryTransparency = 1; // 1 to enable transparency/refraction in ray query mode
+  alignas(4) float _padReflect[1]{}; // zero-initialized pad slot
+  // Ray-query specific: number of per-instance geometry infos in buffer
+  alignas(4) int geometryInfoCount{0};
+  alignas(4) int _padGeo0{0};
+  alignas(4) int _padGeo1{0};
+  alignas(4) int _padGeo2{0};
+  alignas(16) glm::vec4 _rqReservedWorldPos{0.0f, 0.0f, 0.0f, 0.0f}; // reserved slot, zero by default
+  // Ray-query specific: number of materials in materialBuffer
+  alignas(4) int materialCount{0};
+  alignas(4) int _padMat0{0};
+  alignas(4) int _padMat1{0};
+  alignas(4) int _padMat2{0};
+};
+
+// Ray Query uses a dedicated uniform buffer with its own tightly-defined layout.
+// This avoids relying on the (much larger) shared raster UBO layout and prevents
+// CPU↔shader layout drift from breaking Ray Query-only fields.
+//
+// IMPORTANT: This layout must match `RayQueryUniforms` in `shaders/ray_query.slang`.
+struct RayQueryUniformBufferObject {
+  alignas(16) glm::mat4 model;
+  alignas(16) glm::mat4 view;
+  alignas(16) glm::mat4 proj;
+  alignas(16) glm::vec4 camPos;
+
+  alignas(4) float exposure;
+  alignas(4) float gamma;
+  // Match raster UBO conventions so Ray Query can run the same lighting math.
+  alignas(4) float scaleIBLAmbient;
+  alignas(4) int lightCount;
+  alignas(4) int enableRayQueryReflections;
+  alignas(4) int enableRayQueryTransparency;
+
+  alignas(8) glm::vec2 screenDimensions;
+  alignas(4) int geometryInfoCount;
+  alignas(4) int materialCount;
+  alignas(4) int _pad0; // not padding in practice: used for rayQueryMaxBounces
+  // Thick-glass controls (RQ-only)
+  alignas(4) int enableThickGlass; // 0/1 toggle
+  alignas(4) float thicknessClamp; // max thickness in meters
+  alignas(4) float absorptionScale; // scales sigma_a
+  alignas(4) int _pad1; // not padding in practice: Ray Query hard-shadow toggle for direct lighting (0/1)
+  // Ray Query soft shadows (area-light approximation)
+  alignas(4) int shadowSampleCount; // 1 = hard shadows; >1 = multi-sample
+  alignas(4) float shadowSoftness; // 0 = hard; otherwise scales effective light radius (fraction of range)
+  alignas(4) float reflectionIntensity; // User control for glass reflection strength
+  alignas(4) float _padShadow[2]{}; // zero-initialized tail padding
+};
+
+static_assert(sizeof(RayQueryUniformBufferObject) == 288, "RayQueryUniformBufferObject size must match shader layout");
+static_assert(offsetof(RayQueryUniformBufferObject, model) == 0); // per-field offsets follow, in declaration order
+static_assert(offsetof(RayQueryUniformBufferObject, view) == 64);
+static_assert(offsetof(RayQueryUniformBufferObject, proj) == 128);
+static_assert(offsetof(RayQueryUniformBufferObject, camPos) == 192);
+static_assert(offsetof(RayQueryUniformBufferObject, exposure) == 208);
+static_assert(offsetof(RayQueryUniformBufferObject, gamma) == 212);
+static_assert(offsetof(RayQueryUniformBufferObject, scaleIBLAmbient) == 216);
+static_assert(offsetof(RayQueryUniformBufferObject, lightCount) == 220);
+static_assert(offsetof(RayQueryUniformBufferObject, enableRayQueryReflections) == 224);
+static_assert(offsetof(RayQueryUniformBufferObject, enableRayQueryTransparency) == 228);
+static_assert(offsetof(RayQueryUniformBufferObject, screenDimensions) == 232);
+static_assert(offsetof(RayQueryUniformBufferObject, geometryInfoCount) == 240);
+static_assert(offsetof(RayQueryUniformBufferObject, materialCount) == 244);
+static_assert(offsetof(RayQueryUniformBufferObject, _pad0) == 248); // slot described as rayQueryMaxBounces in the struct
+static_assert(offsetof(RayQueryUniformBufferObject, enableThickGlass) == 252);
+static_assert(offsetof(RayQueryUniformBufferObject, thicknessClamp) == 256);
+static_assert(offsetof(RayQueryUniformBufferObject, absorptionScale) == 260);
+static_assert(offsetof(RayQueryUniformBufferObject, _pad1) == 264); // slot described as the hard-shadow toggle in the struct
+static_assert(offsetof(RayQueryUniformBufferObject, shadowSampleCount) == 268);
+static_assert(offsetof(RayQueryUniformBufferObject, shadowSoftness) == 272); // the last three floats are not pinned here
+
+/**
+ * @brief Structure for PBR material properties.
+ * This structure must match the PushConstants structure in the PBR shader.
+ */
+struct MaterialProperties {
+  alignas(16) glm::vec4 baseColorFactor;
+  alignas(4) float metallicFactor;
+  alignas(4) float roughnessFactor;
+  alignas(4) int baseColorTextureSet;
+  alignas(4) int physicalDescriptorTextureSet;
+  alignas(4) int normalTextureSet;
+  alignas(4) int occlusionTextureSet;
+  alignas(4) int emissiveTextureSet;
+  alignas(4) float alphaMask;
+  alignas(4) float alphaMaskCutoff;
+  alignas(16) glm::vec3 emissiveFactor; // Emissive factor for HDR emissive sources
+  alignas(4) float emissiveStrength; // KHR_materials_emissive_strength extension
+  alignas(4) float transmissionFactor; // KHR_materials_transmission
+  alignas(4) int useSpecGlossWorkflow; // 1 if using KHR_materials_pbrSpecularGlossiness
+  alignas(4) float glossinessFactor; // SpecGloss glossiness scalar
+  alignas(16) glm::vec3 specularFactor; // SpecGloss specular color factor
+  alignas(4) float ior = 1.5f; // index of refraction
+  alignas(4) uint32_t hasEmissiveStrengthExtension; // 0/1 flag; 32-bit so no indeterminate bytes (same offset/size as before)
+};
+
+/**
+ * @brief Selects the rendering path; stored and switched via Renderer::SetRenderMode / ToggleRenderMode.
+ */
+enum class RenderMode {
+  Rasterization, // Traditional rasterization pipeline
+  RayQuery // Ray query compute shader
+};
+
+/**
+ * @brief Class for managing Vulkan rendering.
+ *
+ * This class implements the rendering pipeline as described in the Engine_Architecture chapter:
+ * @see en/Building_a_Simple_Engine/Engine_Architecture/05_rendering_pipeline.adoc
+ */
+class Renderer {
+  public:
+    /**
+	 * @brief Constructor with a platform.
+	 * @param platform The platform to use for rendering.
+	 */
+    explicit Renderer(Platform* platform);
+
+    /**
+	 * @brief Destructor for proper cleanup.
+	 */
+    ~Renderer();
+
+    /**
+	 * @brief Initialize the renderer.
+	 * @param appName The name of the application.
+	 * @param enableValidationLayers Whether to enable validation layers.
+	 * @return True if initialization was successful, false otherwise.
+	 */
+    bool Initialize(const std::string& appName, bool enableValidationLayers = true);
+
+    /**
+	 * @brief Clean up renderer resources.
+	 */
+    void Cleanup();
+
+    /**
+	 * @brief Render the scene.
+	 * @param entities The entities to render.
+	 * @param camera The camera to use for rendering.
+	 * @param imguiSystem The ImGui system for UI rendering (optional).
+	 */
+    void Render(const std::vector<std::unique_ptr<Entity>>& entities, CameraComponent* camera, ImGuiSystem* imguiSystem = nullptr);
+
+    // Render overload that accepts a snapshot of raw entity pointers.
+    // This allows the Engine to release its entity-container lock before rendering
+    // (avoiding writer starvation of background loading/physics threads).
+    void Render(const std::vector<Entity *>& entities, CameraComponent* camera, ImGuiSystem* imguiSystem = nullptr);
+
+    /**
+	 * @brief Wait for the device to be idle.
+	 */
+    void WaitIdle();
+
+    /**
+	 * @brief Wait for fences with periodic watchdog kicks to prevent false hang detection.
+	 * Must be called from the render thread.
+	 */
+    vk::Result waitForFencesSafe(const std::vector<vk::Fence>& fences, vk::Bool32 waitAll, uint64_t timeoutNs = 100'000'000ULL);
+
+    /**
+	 * @brief Wait for fences with periodic watchdog kicks to prevent false hang detection.
+	 * Must be called from the render thread. Overload for a single fence.
+	 */
+    vk::Result waitForFencesSafe(vk::Fence fence, vk::Bool32 waitAll, uint64_t timeoutNs = 100'000'000ULL);
+
+    /**
+	 * @brief Dispatch a compute shader.
+	 * @param groupCountX The number of local workgroups to dispatch in the X dimension.
+	 * @param groupCountY The number of local workgroups to dispatch in the Y dimension.
+	 * @param groupCountZ The number of local workgroups to dispatch in the Z dimension.
+	 * @param inputBuffer The input buffer.
+	 * @param outputBuffer The output buffer.
+	 * @param hrtfBuffer The HRTF data buffer.
+	 * @param paramsBuffer The parameters buffer.
+	 * @return A fence that can be used to synchronize with the compute operation.
+	 */
+    vk::raii::Fence DispatchCompute(uint32_t groupCountX,
+                                    uint32_t groupCountY,
+                                    uint32_t groupCountZ,
+                                    vk::Buffer inputBuffer,
+                                    vk::Buffer outputBuffer,
+                                    vk::Buffer hrtfBuffer,
+                                    vk::Buffer paramsBuffer);
+
+    /**
+	 * @brief Check if the renderer is initialized.
+	 * @return True if the renderer is initialized, false otherwise.
+	 */
+    bool IsInitialized() const {
+      return initialized;
+    }
+
+    /**
+	 * @brief Get the Vulkan device.
+	 * @return The Vulkan device.
+	 */
+    vk::Device GetDevice() const {
+      return *device;
+    }
+
+    // Expose max frames in flight for per-frame resource duplication
+    uint32_t GetMaxFramesInFlight() const {
+      return MAX_FRAMES_IN_FLIGHT;
+    }
+
+    /**
+	 * @brief Get the Vulkan RAII device.
+	 * @return The Vulkan RAII device.
+	 */
+    const vk::raii::Device& GetRaiiDevice() const {
+      return device;
+    }
+
+    // Expose uploads timeline semaphore and last value for external waits
+    vk::Semaphore GetUploadsTimelineSemaphore() const {
+      return *uploadsTimeline;
+    }
+    uint64_t GetUploadsTimelineValue() const {
+      return uploadTimelineLastSubmitted.load(std::memory_order_relaxed);
+    }
+
+    /**
+	 * @brief Get the compute queue handle (copied while queueMutex is held).
+	 * @return The compute queue handle. NOTE(review): SubmitToComputeQueue treats a null handle as possible — callers may need the same check.
+	 */
+    vk::Queue GetComputeQueue() const {
+      std::lock_guard<std::mutex> lock(queueMutex);
+      return *computeQueue;
+    }
+
+    /**
+	 * @brief Find a suitable memory type.
+	 * @param typeFilter The type filter.
+	 * @param properties The memory properties.
+	 * @return The memory type index.
+	 */
+    uint32_t FindMemoryType(uint32_t typeFilter, vk::MemoryPropertyFlags properties) const {
+      return findMemoryType(typeFilter, properties);
+    }
+
+    /**
+	 * @brief Get the queue family index to use for compute work.
+	 * @return The compute family index when one is recorded, otherwise the graphics family index.
+	 */
+    uint32_t GetComputeQueueFamilyIndex() const {
+      const auto& families = queueFamilyIndices;
+      // Without a recorded compute family, fall back to the graphics family so devices
+      // lacking a separate compute queue keep working.
+      return families.computeFamily.has_value() ? families.computeFamily.value()
+                                                : families.graphicsFamily.value();
+    }
+
+    /**
+	 * @brief Submit one command buffer to the compute queue, or to the graphics queue if the compute queue handle is null.
+	 * @param commandBuffer The command buffer to submit.
+	 * @param fence The fence handed to the queue submit call.
+	 */
+    void SubmitToComputeQueue(vk::CommandBuffer commandBuffer, vk::Fence fence) const {
+      // Describe a submission that consists of exactly one command buffer.
+      vk::SubmitInfo submission{};
+      submission.commandBufferCount = 1;
+      submission.pCommandBuffers = &commandBuffer;
+
+      // queueMutex is held for the whole submit call.
+      std::lock_guard<std::mutex> guard(queueMutex);
+      // Prefer the compute queue when its handle is non-null; otherwise use the
+      // graphics queue so the submit does not go through a null queue.
+      const bool hasComputeQueue = static_cast<bool>(*computeQueue);
+      const vk::raii::Queue& target = hasComputeQueue ? computeQueue : graphicsQueue;
+      target.submit(submission, fence);
+    }
+
+    /**
+	 * @brief Create a shader module from SPIR-V code.
+	 * @param code The SPIR-V code.
+	 * @return The shader module.
+	 */
+    vk::raii::ShaderModule CreateShaderModule(const std::vector<char>& code) {
+      return createShaderModule(code);
+    }
+
+    /**
+	 * @brief Create a shader module from a file.
+	 * @param filename The filename.
+	 * @return The shader module.
+	 */
+    vk::raii::ShaderModule CreateShaderModule(const std::string& filename) {
+      auto code = readFile(filename);
+      return createShaderModule(code);
+    }
+
+    /**
+	 * @brief Load a texture from a file.
+	 * @param texturePath The path to the texture file.
+	 * @return True if the texture was loaded successfully, false otherwise.
+	 */
+    bool LoadTexture(const std::string& texturePath);
+
+    // Asynchronous texture loading APIs (thread-pool backed).
+    // The 'critical' flag is used to front-load important textures (e.g.,
+    // baseColor/albedo) so the scene looks mostly correct before the loading
+    // screen disappears. Non-critical textures (normals, MR, AO, emissive)
+    // can stream in after geometry is visible.
+    std::future<bool> LoadTextureAsync(const std::string& texturePath, bool critical = false);
+
+    /**
+	 * @brief Load a texture from raw image data in memory.
+	 * @param textureId The identifier for the texture.
+	 * @param imageData The raw image data.
+	 * @param width The width of the image.
+	 * @param height The height of the image.
+	 * @param channels The number of channels in the image.
+	 * @return True if the texture was loaded successfully, false otherwise.
+	 */
+    bool LoadTextureFromMemory(const std::string& textureId,
+                               const unsigned char* imageData,
+                               int width,
+                               int height,
+                               int channels);
+
+    // Asynchronous upload from memory (RGBA/RGB/other). Safe for concurrent calls.
+    std::future<bool> LoadTextureFromMemoryAsync(const std::string& textureId,
+                                                 const unsigned char* imageData,
+                                                 int width,
+                                                 int height,
+                                                 int channels,
+                                                 bool critical = false);
+
+    // Progress query for UI
+    uint32_t GetTextureTasksScheduled() const {
+      return textureTasksScheduled.load();
+    }
+    uint32_t GetTextureTasksCompleted() const {
+      return textureTasksCompleted.load();
+    }
+
+    // GPU upload progress (per-texture jobs processed on the main thread).
+    uint32_t GetUploadJobsTotal() const {
+      return uploadJobsTotal.load();
+    }
+    uint32_t GetUploadJobsCompleted() const {
+      return uploadJobsCompleted.load();
+    }
+
+    // --- Acceleration structure build progress (for UI) ---
+    // Exposed so the loading overlay can show meaningful progress when
+    // BLAS/TLAS builds take a long time (>= ~10 seconds).
+    bool IsASBuildInProgress() const {
+      return asBuildUiActive.load(std::memory_order_relaxed);
+    }
+    float GetASBuildProgress() const {
+      return asBuildUiProgress.load(std::memory_order_relaxed);
+    }
+    uint32_t GetASBuildItemsDone() const {
+      return asBuildUiDone.load(std::memory_order_relaxed);
+    }
+    uint32_t GetASBuildItemsTotal() const {
+      return asBuildUiTotal.load(std::memory_order_relaxed);
+    }
+    const char* GetASBuildStage() const {
+      return asBuildUiStage.load(std::memory_order_relaxed);
+    }
+    double GetASBuildElapsedSeconds() const {
+      // A start stamp of 0 is treated as "nothing to time" and reports 0 seconds.
+      const uint64_t beganNs = asBuildUiStartNs.load(std::memory_order_relaxed);
+      if (beganNs == 0) {
+        return 0.0;
+      }
+      using std::chrono::duration_cast;
+      using std::chrono::nanoseconds;
+      const auto sinceEpoch = std::chrono::steady_clock::now().time_since_epoch();
+      const auto nowNs = static_cast<uint64_t>(duration_cast<nanoseconds>(sinceEpoch).count());
+      return (nowNs > beganNs) ? static_cast<double>(nowNs - beganNs) / 1'000'000'000.0 : 0.0;
+    }
+    bool ShouldShowASBuildProgressInUI() const {
+      return IsASBuildInProgress() && GetASBuildElapsedSeconds() >= 10.0; // build active AND at least 10 s elapsed
+    }
+
+    // Block until all currently-scheduled texture tasks have completed.
+    // Intended for use during initial scene loading so that descriptor
+    // creation sees the final textureResources instead of fallbacks.
+    void WaitForAllTextureTasks();
+
+    // Process pending texture GPU uploads on the calling thread.
+    // This should be invoked from the main/render thread so that all
+    // Vulkan work happens from a single thread while worker threads
+    // perform only CPU-side decoding.
+    //
+    // Parameters allow us to:
+    //  - limit the number of jobs processed per call (for streaming), and
+    //  - choose whether to include critical and/or non-critical jobs.
+    void ProcessPendingTextureJobs(uint32_t maxJobs = UINT32_MAX,
+                                   bool includeCritical = true,
+                                   bool includeNonCritical = true);
+
+    // Track which entities use a given texture ID so that descriptor sets
+    // can be refreshed when textures finish streaming in.
+    void RegisterTextureUser(const std::string& textureId, Entity* entity);
+    void OnTextureUploaded(const std::string& textureId);
+
+    // Global loading state (model/scene). IsLoading() reports "loading" while the
+    // model is being parsed/instantiated, while critical texture uploads (e.g.,
+    // baseColor/albedo) are outstanding, or until the initial load is marked complete.
+    // The blocking overlay is shown only until the initial scene is ready; background
+    // streaming may continue after that. LoadingPhase identifies the current stage.
+    enum class LoadingPhase : uint32_t {
+      Scene = 0,
+      Textures,
+      Physics,
+      AccelerationStructures,
+      Finalizing
+    };
+    LoadingPhase GetLoadingPhase() const {
+      return static_cast<LoadingPhase>(loadingPhase.load(std::memory_order_relaxed));
+    }
+    const char* GetLoadingPhaseName() const {
+      // Read the phase once, then map it to its display string.
+      const LoadingPhase phase = GetLoadingPhase();
+      if (phase == LoadingPhase::Scene)
+        return "Scene";
+      if (phase == LoadingPhase::Textures)
+        return "Textures";
+      if (phase == LoadingPhase::Physics)
+        return "Physics";
+      if (phase == LoadingPhase::AccelerationStructures)
+        return "Acceleration Structures";
+      if (phase == LoadingPhase::Finalizing)
+        return "Finalizing";
+      // Any stored value that is not one of the enumerators gets the generic label.
+      return "Loading";
+    }
+    float GetLoadingPhaseProgress() const {
+      return std::clamp(loadingPhaseProgress.load(std::memory_order_relaxed), 0.0f, 1.0f);
+    }
+    void SetLoadingPhase(LoadingPhase phase) {
+      loadingPhase.store(static_cast<uint32_t>(phase), std::memory_order_relaxed);
+      loadingPhaseProgress.store(0.0f, std::memory_order_relaxed);
+    }
+    void SetLoadingPhaseProgress(float v) {
+      loadingPhaseProgress.store(std::clamp(v, 0.0f, 1.0f), std::memory_order_relaxed);
+    }
+    void MarkInitialLoadComplete() {
+      initialLoadComplete.store(true, std::memory_order_relaxed);
+      SetLoadingPhase(LoadingPhase::Finalizing); // also resets loadingPhaseProgress to 0
+      loadingPhaseProgress.store(1.0f, std::memory_order_relaxed); // then report the Finalizing phase as fully done
+    }
+    bool IsLoading() const {
+      // The blocking overlay stays up until post-load blockers (AS build, descriptor cold-init, etc.) finish.
+      if (loadingFlag.load(std::memory_order_relaxed))
+        return true;
+      return criticalJobsOutstanding.load(std::memory_order_relaxed) > 0u || !initialLoadComplete.load(std::memory_order_relaxed);
+    }
+    // True only while the model/scene loading flag (raised via SetLoading(true)) is set.
+    // Unlike IsLoading(), this ignores outstanding critical texture jobs and the
+    // "finalizing" stage where post-load work (AS build, descriptor init) may still run.
+    //
+    // IMPORTANT: Do NOT use critical texture completion as a gate for starting TLAS/BLAS builds.
+    // AS builds depend on geometry buffers and instance transforms, not on texture readiness.
+    bool IsSceneLoaderActive() const {
+      return loadingFlag.load(std::memory_order_relaxed);
+    }
+    void SetLoading(bool v) {
+      loadingFlag.store(v, std::memory_order_relaxed);
+      if (!v)
+        return; // lowering the flag leaves completion state and phase untouched
+      // Raising the flag starts a new load cycle: clear completion and restart at the Scene phase.
+      initialLoadComplete.store(false, std::memory_order_relaxed);
+      SetLoadingPhase(LoadingPhase::Scene);
+    }
+
+	// Descriptor set deferred update machinery
+	void MarkEntityDescriptorsDirty(Entity *entity);
+	void ProcessDirtyDescriptorsForFrame(uint32_t frameIndex);
+
+    // Texture aliasing: map canonical IDs to actual loaded keys (e.g., file paths) to avoid duplicates.
+    // The alias stores the target as resolved at registration time; empty IDs are ignored.
+    inline void RegisterTextureAlias(const std::string& aliasId, const std::string& targetId) {
+      std::unique_lock<std::shared_mutex> writeGuard(textureResourcesMutex);
+      if (aliasId.empty() || targetId.empty())
+        return;
+      // Walk the alias map directly (the lock is already held) for at most 8 hops.
+      std::string resolved = targetId;
+      for (int hop = 0; hop < 8; ++hop) {
+        const auto link = textureAliases.find(resolved);
+        if (link == textureAliases.end() || link->second == resolved)
+          break;
+        resolved = link->second;
+      }
+      // An alias that would point at itself is removed instead of stored.
+      if (aliasId != resolved) {
+        textureAliases[aliasId] = resolved;
+      } else {
+        textureAliases.erase(aliasId);
+      }
+    }
+    inline std::string ResolveTextureId(const std::string& id) const {
+      std::shared_lock<std::shared_mutex> readGuard(textureResourcesMutex);
+      // Follow alias links for at most 8 hops so a pathological cycle cannot loop forever.
+      std::string resolved = id;
+      for (int hop = 0; hop < 8; ++hop) {
+        const auto link = textureAliases.find(resolved);
+        // Stop at the first id without an alias entry, or at a self-alias.
+        const bool terminal = (link == textureAliases.end()) || (link->second == resolved);
+        if (terminal)
+          break;
+        resolved = link->second;
+      }
+      return resolved;
+    }
+
+    /**
+	 * @brief Transition an image layout.
+	 * @param image The image.
+	 * @param format The image format.
+	 * @param oldLayout The old layout.
+	 * @param newLayout The new layout.
+	 */
+    void TransitionImageLayout(vk::Image image, vk::Format format, vk::ImageLayout oldLayout, vk::ImageLayout newLayout) {
+      transitionImageLayout(image, format, oldLayout, newLayout);
+    }
+
+    /**
+	 * @brief Copy a buffer into mip level 0, array layer 0 of a color image.
+	 * @param buffer The source buffer.
+	 * @param image The destination image.
+	 * @param width The width of the copied region.
+	 * @param height The height of the copied region.
+	 */
+    void CopyBufferToImage(vk::Buffer buffer, vk::Image image, uint32_t width, uint32_t height) {
+      // Default single region kept for backward compatibility: it starts at buffer
+      // offset 0 and covers width x height x 1 texels at image offset (0, 0, 0).
+      vk::BufferImageCopy fullImage{};
+      fullImage.bufferOffset = 0;
+      fullImage.bufferRowLength = 0;
+      fullImage.bufferImageHeight = 0;
+      fullImage.imageSubresource.aspectMask = vk::ImageAspectFlagBits::eColor;
+      fullImage.imageSubresource.mipLevel = 0;
+      fullImage.imageSubresource.baseArrayLayer = 0;
+      fullImage.imageSubresource.layerCount = 1;
+      fullImage.imageOffset = vk::Offset3D{0, 0, 0};
+      fullImage.imageExtent = vk::Extent3D{width, height, 1};
+
+      // The internal helper takes a list of regions; wrap the one region for it.
+      std::vector<vk::BufferImageCopy> regions;
+      regions.push_back(fullImage);
+      copyBufferToImage(buffer, image, width, height, regions);
+    }
+
+    /**
+	 * @brief Get the current command buffer.
+	 * @return The current command buffer.
+	 */
+    vk::raii::CommandBuffer& GetCurrentCommandBuffer() {
+      return commandBuffers[currentFrame];
+    }
+
+    /**
+	 * @brief Get the swap chain image format.
+	 * @return The swap chain image format.
+	 */
+    vk::Format GetSwapChainImageFormat() const {
+      return swapChainImageFormat;
+    }
+
+    /**
+	 * @brief Set the framebuffer resized flag.
+	 * This should be called when the window is resized to trigger swap chain recreation.
+	 */
+    void SetFramebufferResized() {
+      framebufferResized.store(true, std::memory_order_relaxed);
+    }
+
+    /**
+	 * @brief Set the model loader pointer and invalidate every entity's cached material info (materials are resolved via the loader).
+	 * @param _modelLoader Pointer to the model loader.
+	 */
+    void SetModelLoader(ModelLoader* _modelLoader) {
+      modelLoader = _modelLoader;
+      for (auto& entry : entityResources) {
+        auto& cache = entry.second;
+        cache.materialCacheValid = false;
+        cache.cachedMaterial = nullptr;
+        cache.cachedIsBlended = false;
+        cache.cachedIsGlass = false;
+        cache.cachedIsLiquid = false;
+        cache.cachedMaterialProps = MaterialProperties{};
+      }
+    }
+
+    /**
+	 * @brief Set static lights loaded during model initialization.
+	 * @param lights The lights to store statically.
+	 */
+    void SetStaticLights(const std::vector<ExtractedLight>& lights) {
+      staticLights = lights;
+      std::cout << "[Lights] staticLights set: " << staticLights.size() << " entries" << std::endl;
+    }
+
+    /**
+	 * @brief Set the gamma correction value for PBR rendering.
+	 * @param _gamma The gamma correction value (typically 2.2).
+	 */
+    void SetGamma(float _gamma) {
+      gamma = _gamma;
+    }
+
+    /**
+	 * @brief Set the exposure value for HDR tone mapping.
+	 * @param _exposure The exposure value (1.0 = no adjustment).
+	 */
+    void SetExposure(float _exposure) {
+      exposure = _exposure;
+    }
+
+    // Reflection intensity (UI + shader control)
+    void SetReflectionIntensity(float v) {
+      reflectionIntensity = v;
+    }
+    float GetReflectionIntensity() const {
+      return reflectionIntensity;
+    }
+
+    void SetPlanarReflectionsEnabled(bool enabled);
+    void TogglePlanarReflections();
+    bool IsPlanarReflectionsEnabled() const {
+      return enablePlanarReflections;
+    }
+
+    // Ray query rendering mode control
+    void SetRenderMode(RenderMode mode) {
+      currentRenderMode = mode;
+    }
+    RenderMode GetRenderMode() const {
+      return currentRenderMode;
+    }
+    void ToggleRenderMode() {
+      currentRenderMode = (currentRenderMode == RenderMode::Rasterization) ? RenderMode::RayQuery : RenderMode::Rasterization;
+    }
+
+    // Ray query capability getters
+    bool GetRayQueryEnabled() const {
+      return rayQueryEnabled;
+    }
+    bool GetAccelerationStructureEnabled() const {
+      return accelerationStructureEnabled;
+    }
+
+    // Ray Query static-only mode (disable animation/physics updates and TLAS refits to render a static opaque scene)
+    void SetRayQueryStaticOnly(bool v) {
+      rayQueryStaticOnly = v;
+    }
+    bool IsRayQueryStaticOnly() const {
+      return rayQueryStaticOnly;
+    }
+
+    /**
+	 * @brief Request acceleration structure build at next safe frame point.
+	 * Safe to call from any thread (e.g., background loading thread).
+	 */
+    void RequestAccelerationStructureBuild() {
+      const bool supported = accelerationStructureEnabled && rayQueryEnabled;
+      if (!supported)
+        return;
+      // Arm the request timestamp only if it is not set yet. `asBuildRequested` may
+      // already be true from another trigger; the timestamp lets the render loop bound
+      // how long it defers the build instead of waiting for "perfect" readiness forever.
+      const bool timestampUnset = (asBuildRequestStartNs.load(std::memory_order_relaxed) == 0);
+      if (timestampUnset) {
+        const auto sinceEpoch = std::chrono::steady_clock::now().time_since_epoch();
+        const auto stampNs = static_cast<uint64_t>(
+          std::chrono::duration_cast<std::chrono::nanoseconds>(sinceEpoch).count());
+        asBuildRequestStartNs.store(stampNs, std::memory_order_relaxed);
+      }
+      // AS builds may outlast the watchdog threshold (large scenes in Debug), so suppress it first.
+      watchdogSuppressed.store(true, std::memory_order_relaxed);
+      asBuildRequested.store(true, std::memory_order_release);
+    }
+    // Overload that additionally records why the rebuild was requested (for diagnostics).
+    void RequestAccelerationStructureBuild(const char* reason) {
+      if (!(accelerationStructureEnabled && rayQueryEnabled))
+        return;
+      // Arm the request timestamp only while it is still zero (i.e. not armed yet).
+      if (asBuildRequestStartNs.load(std::memory_order_relaxed) == 0) {
+        const auto sinceEpoch = std::chrono::steady_clock::now().time_since_epoch();
+        const auto stampNs = static_cast<uint64_t>(
+          std::chrono::duration_cast<std::chrono::nanoseconds>(sinceEpoch).count());
+        asBuildRequestStartNs.store(stampNs, std::memory_order_relaxed);
+      }
+      // Remember the reason (or a placeholder); only a non-null reason is logged.
+      lastASBuildRequestReason = (reason != nullptr) ? reason : "(no reason)";
+      if (reason != nullptr) {
+        std::cout << "[AS] Requesting rebuild. Reason: " << reason << std::endl;
+      }
+
+      // Explicit requests bypass the freeze to ensure dynamic objects (like balls) are added
+      asDevOverrideAllowRebuild = true;
+
+      // NOTE(review): the two plain assignments above are not visibly atomic here — confirm thread safety.
+      watchdogSuppressed.store(true, std::memory_order_relaxed);
+      asBuildRequested.store(true, std::memory_order_release);
+    }
+
+    /**
+	 * @brief Build acceleration structures for ray query rendering.
+	 * @param entities The entities to include in the acceleration structures.
+	 * @return True if successful, false otherwise.
+	 */
+    bool buildAccelerationStructures(const std::vector<Entity *>& entities);
+
+    // Refit/UPDATE the TLAS with latest entity transforms (no rebuild)
+    bool refitTopLevelAS(const std::vector<Entity *>& entities, CameraComponent* camera);
+
+    // Record BLAS UPDATE commands for all deformable meshes into an already-open command buffer.
+    // Must be called after skinning so BVH and vertex data are both current-frame.
+    void refitBLASInline(vk::raii::CommandBuffer& cmd);
+
+    /** @brief Update ray query descriptor sets with current resources.
+	 * @param frameIndex The frame index to update (required; this parameter has no default).
+	 * @param entities The entities supplied by the caller. NOTE(review): presumably those whose resources the sets reference — confirm in the implementation.
+	 * @return True if successful, false otherwise.
+	 */
+    bool updateRayQueryDescriptorSets(uint32_t frameIndex, const std::vector<Entity *>& entities);
+
+    /**
+	 * @brief Create or resize light storage buffers to accommodate the given number of lights.
+	 * @param lightCount The number of lights to accommodate.
+	 * @return True if successful, false otherwise.
+	 */
+    bool createOrResizeLightStorageBuffers(size_t lightCount);
+
+    /** @brief Update the light storage buffer with current light data.
+	 * @param frameIndex The current frame index.
+	 * @param lights The light data to upload.
+	 * @param camera Optional camera (defaults to nullptr). NOTE(review): its use is not visible in this header — confirm in the implementation.
+	 * @return True if successful, false otherwise.
+	 */
+    bool updateLightStorageBuffer(uint32_t frameIndex, const std::vector<ExtractedLight>& lights, CameraComponent* camera = nullptr);
+
+    /**
+	 * @brief Update existing PBR descriptor sets to point to the latest light storage buffers (SSBOs).
+	 * Called when light storage buffers are recreated to ensure descriptor sets reference valid buffers.
+	 * @param allFrames When true, refresh all frames (use only when the device is idle, e.g., after
+	 *        waitIdle()). When false (the default), refresh only the current frame at the frame
+	 *        safe point to avoid touching in-flight frames.
+	 */
+    void updateAllDescriptorSetsWithNewLightBuffers(bool allFrames = false);
+
+    // Upload helper: record both layout transitions and the copy in a single submit with a fence
+    void uploadImageFromStaging(vk::Buffer staging,
+                                vk::Image image,
+                                vk::Format format,
+                                const std::vector<vk::BufferImageCopy>& regions,
+                                uint32_t mipLevels = 1);
+
+    // Generate full mip chain for a 2D color image using GPU blits
+    void generateMipmaps(vk::Image image,
+                         vk::Format format,
+                         int32_t texWidth,
+                         int32_t texHeight,
+                         uint32_t mipLevels);
+
+    vk::Format findDepthFormat();
+
+    /**
+	 * @brief Pre-allocate all Vulkan resources for an entity during scene loading.
+	 * @param entity The entity to pre-allocate resources for.
+	 * @return True if pre-allocation was successful, false otherwise.
+	 */
+    bool preAllocateEntityResources(Entity* entity);
+
+    /**
+	 * @brief Pre-allocate Vulkan resources for a batch of entities, batching mesh uploads.
+	 *
+	 * This variant is optimized for large scene loads (e.g., GLTF Bistro). It will:
+	 *  - Create per-mesh GPU buffers as usual, but record all buffer copy commands
+	 *    into a single command buffer and submit them in one batch.
+	 *  - Then create uniform buffers and descriptor sets per entity.
+	 *
+	 * Callers that load many geometry entities at once (like GLTF scene loading)
+	 * should prefer this over repeated preAllocateEntityResources() calls.
+	 */
+    bool preAllocateEntityResourcesBatch(const std::vector<Entity *>& entities);
+
+    // Thread-safe: enqueue entities that need GPU-side resource preallocation.
+    // The actual Vulkan work will be performed on the render thread at the frame-start safe point.
+    void EnqueueEntityPreallocationBatch(const std::vector<Entity *>& entities);
+    void EnqueueInstanceBufferRecreation(Entity* entity);
+
+    /**
+	 * @brief Recreate the instance buffer for an entity that had its instances cleared.
+	 *
+	 * When an entity that was originally set up for instanced rendering needs to be
+	 * converted to a single non-instanced entity (e.g., for animation), this method
+	 * recreates the GPU instance buffer with a single identity instance.
+	 *
+	 * @param entity The entity whose instance buffer should be recreated.
+	 * @return True if successful, false otherwise.
+	 */
+    bool recreateInstanceBuffer(Entity* entity);
+
+    // Shared default PBR texture identifiers (to avoid creating hundreds of identical textures)
+    static const std::string SHARED_DEFAULT_ALBEDO_ID;
+    static const std::string SHARED_DEFAULT_NORMAL_ID;
+    static const std::string SHARED_DEFAULT_METALLIC_ROUGHNESS_ID;
+    static const std::string SHARED_DEFAULT_OCCLUSION_ID;
+    static const std::string SHARED_DEFAULT_EMISSIVE_ID;
+    static const std::string SHARED_BRIGHT_RED_ID;
+
+    /**
+	 * @brief Determine the appropriate texture format based on the texture type.
+	 * @param textureId The texture identifier to analyze.
+	 * @return The appropriate Vulkan format (sRGB for baseColor, linear for others).
+	 */
+    static vk::Format determineTextureFormat(const std::string& textureId);
+
+  private:
+    // Platform
+    Platform* platform = nullptr;
+
+    // Model loader reference for accessing extracted lights
+    class ModelLoader* modelLoader = nullptr;
+
+    // PBR rendering parameters
+    float gamma = 2.2f; // Gamma correction value
+    float exposure = 1.2f; // HDR exposure value (default tuned to avoid washout)
+    float reflectionIntensity = 1.0f; // User control for glass reflection strength
+    // Raster shadows (experimental): use ray queries in the raster PBR fragment shader.
+    // Wired through `UniformBufferObject.padding2` to avoid UBO layout churn.
+    bool enableRasterRayQueryShadows = false;
+
+    // Ray Query tuning
+    int rayQueryMaxBounces = 1; // 0 = no secondary rays, 1 = one-bounce reflection/refraction
+    bool enableRayQueryShadows = true; // Hard shadows for Ray Query direct lighting (shadow rays)
+    int rayQueryShadowSampleCount = 1; // 1 = hard; >1 enables soft-shadow sampling in the shader
+    float rayQueryShadowSoftness = 0.0f; // 0 = hard; otherwise scales effective light radius (fraction of range)
+    // Thick-glass controls (RQ-only)
+    bool enableThickGlass = true;
+    float thickGlassAbsorptionScale = 1.0f;
+    float thickGlassThicknessClamp = 0.2f; // meters
+
+    // Vulkan RAII context
+    vk::raii::Context context;
+
+    // Vulkan instance and debug messenger
+    vk::raii::Instance instance = nullptr;
+    vk::raii::DebugUtilsMessengerEXT debugMessenger = nullptr;
+
+    // Vulkan device
+    vk::raii::PhysicalDevice physicalDevice = nullptr;
+    vk::raii::Device device = nullptr;
+
+    // Memory pool for efficient memory management
+    std::unique_ptr<MemoryPool> memoryPool;
+
+    // Vulkan queues
+    vk::raii::Queue graphicsQueue = nullptr;
+    vk::raii::Queue presentQueue = nullptr;
+    vk::raii::Queue computeQueue = nullptr;
+
+    // Vulkan surface
+    vk::raii::SurfaceKHR surface = nullptr;
+
+    // Swap chain
+    vk::raii::SwapchainKHR swapChain = nullptr;
+    std::vector<vk::Image> swapChainImages;
+    vk::Format swapChainImageFormat = vk::Format::eUndefined;
+    vk::Extent2D swapChainExtent = {0, 0};
+    std::vector<vk::raii::ImageView> swapChainImageViews;
+    // Tracked layouts for swapchain images (VVL requires correct oldLayout in barriers).
+    // Initialized at swapchain creation and updated as we transition.
+    std::vector<vk::ImageLayout> swapChainImageLayouts;
+
+    // Dynamic rendering info
+    vk::RenderingInfo renderingInfo;
+    std::vector<vk::RenderingAttachmentInfo> colorAttachments;
+    vk::RenderingAttachmentInfo depthAttachment;
+
+    // Pipelines
+    vk::raii::PipelineLayout pipelineLayout = nullptr;
+    vk::raii::Pipeline graphicsPipeline = nullptr;
+    vk::raii::PipelineLayout pbrPipelineLayout = nullptr;
+    vk::raii::Pipeline pbrGraphicsPipeline = nullptr;
+    vk::raii::Pipeline pbrBlendGraphicsPipeline = nullptr;
+    // Transparent PBR pipeline variant for premultiplied alpha content
+    vk::raii::Pipeline pbrPremulBlendGraphicsPipeline = nullptr;
+    // Opaque PBR pipeline variant used after a depth pre-pass (depth read-only, compare with pre-pass depth)
+    vk::raii::Pipeline pbrPrepassGraphicsPipeline = nullptr;
+    // Reflection PBR pipeline used for mirrored off-screen pass (cull none to avoid winding issues)
+    vk::raii::Pipeline pbrReflectionGraphicsPipeline = nullptr;
+    // Specialized pipeline for architectural glass (windows, lamp glass, etc.).
+    // Shares descriptor layouts and vertex input with the PBR pipelines but uses
+    // a dedicated fragment shader entry point for more stable glass shading.
+    vk::raii::Pipeline glassGraphicsPipeline = nullptr;
+    vk::raii::PipelineLayout lightingPipelineLayout = nullptr;
+    vk::raii::Pipeline lightingPipeline = nullptr;
+
+    // Fullscreen composite pipeline to draw the opaque off-screen color to the swapchain
+    // (used to avoid gamma-incorrect vkCmdCopyImage and to apply tone mapping when desired).
+    vk::raii::PipelineLayout compositePipelineLayout = nullptr;
+    vk::raii::Pipeline compositePipeline = nullptr;
+    vk::raii::DescriptorSetLayout compositeDescriptorSetLayout = nullptr; // not used; reuse transparentDescriptorSetLayout
+    std::vector<vk::raii::DescriptorSet> compositeDescriptorSets; // unused; reuse transparentDescriptorSets
+
+    // Pipeline rendering create info structures (for proper lifetime management)
+    vk::PipelineRenderingCreateInfo mainPipelineRenderingCreateInfo;
+    vk::PipelineRenderingCreateInfo pbrPipelineRenderingCreateInfo;
+    vk::PipelineRenderingCreateInfo lightingPipelineRenderingCreateInfo;
+    vk::PipelineRenderingCreateInfo compositePipelineRenderingCreateInfo;
+
+    // Create composite pipeline
+    bool createCompositePipeline();
+
+    // Compute pipeline
+    vk::raii::PipelineLayout computePipelineLayout = nullptr;
+    vk::raii::Pipeline computePipeline = nullptr;
+    vk::raii::DescriptorSetLayout computeDescriptorSetLayout = nullptr;
+    vk::raii::DescriptorPool computeDescriptorPool = nullptr;
+    std::vector<vk::raii::DescriptorSet> computeDescriptorSets;
+    vk::raii::CommandPool computeCommandPool = nullptr;
+
+    // Thread safety for queue access - unified mutex since queues may share the same underlying VkQueue
+    mutable std::mutex queueMutex;
+    // Thread safety for descriptor pool/set operations across all engine threads
+    mutable std::mutex descriptorMutex;
+    // Monotonic generation counter for descriptor pool rebuilds (future use for hardening)
+    std::atomic<uint64_t> descriptorPoolGeneration{0};
+
+    // Command pool and buffers
+    vk::raii::CommandPool commandPool = nullptr;
+    std::vector<vk::raii::CommandBuffer> commandBuffers;
+    // Protect usage of shared commandPool for transient command buffers
+    mutable std::mutex commandMutex;
+
+    // Dedicated transfer queue (falls back to graphics if unavailable)
+    vk::raii::Queue transferQueue = nullptr;
+
+  public:
+    // Synchronization objects
+    std::vector<vk::raii::Semaphore> imageAvailableSemaphores;
+    std::vector<vk::raii::Semaphore> renderFinishedSemaphores;
+    std::vector<vk::raii::Fence> inFlightFences;
+
+    // Upload timeline semaphore for transfer -> graphics handoff (signaled per upload)
+    vk::raii::Semaphore uploadsTimeline = nullptr;
+    // Tracks last timeline value that has been submitted for signaling on uploadsTimeline
+    std::atomic<uint64_t> uploadTimelineLastSubmitted{0};
+
+    // Depth buffer
+    vk::raii::Image depthImage = nullptr;
+    std::unique_ptr<MemoryPool::Allocation> depthImageAllocation = nullptr;
+    vk::raii::ImageView depthImageView = nullptr;
+
+    // Forward+ configuration
+    bool useForwardPlus = true; // default enabled
+    uint32_t forwardPlusTileSizeX = 16;
+    uint32_t forwardPlusTileSizeY = 16;
+    uint32_t forwardPlusSlicesZ = 16; // clustered depth slices
+    static constexpr uint32_t MAX_LIGHTS_PER_TILE = 256; // conservative cap
+
+    struct TileHeader { // per-tile light-list header: four 32-bit words (16 bytes)
+      uint32_t offset; // into tileLightIndices (units not shown here; presumably elements, not bytes — confirm)
+      uint32_t count; // number of indices for this tile
+      uint32_t pad0; // filler word; no meaning is assigned to it in this declaration
+      uint32_t pad1; // filler word; with pad0 it rounds the header up to 16 bytes
+    };
+
+    struct ForwardPlusPerFrame { // Forward+ resources kept once per frame in flight; every member starts null/zero/false
+      // SSBOs for per-tile light lists
+      vk::raii::Buffer tileHeaders = nullptr; // presumably an array of TileHeader records — confirm
+      std::unique_ptr<MemoryPool::Allocation> tileHeadersAlloc = nullptr;
+      vk::raii::Buffer tileLightIndices = nullptr; // index storage that TileHeader::offset refers into
+      std::unique_ptr<MemoryPool::Allocation> tileLightIndicesAlloc = nullptr;
+      size_t tilesCapacity = 0; // number of tiles allocated
+      size_t indicesCapacity = 0; // number of indices allocated
+
+      // Uniform buffer with view/proj, screen size, tile size, etc.
+      vk::raii::Buffer params = nullptr;
+      std::unique_ptr<MemoryPool::Allocation> paramsAlloc = nullptr;
+      void* paramsMapped = nullptr; // presumably the host pointer used to write params — TODO confirm
+
+      // Optional compute debug output buffer (uints), host-visible
+      vk::raii::Buffer debugOut = nullptr;
+      std::unique_ptr<MemoryPool::Allocation> debugOutAlloc = nullptr;
+      bool debugOutAwaitingReadback = false; // NOTE(review): looks like "debugOut readback still pending" — confirm
+
+      // One-frame color probes (host-visible, small buffers)
+      vk::raii::Buffer probeOffscreen = nullptr;
+      std::unique_ptr<MemoryPool::Allocation> probeOffscreenAlloc = nullptr;
+      vk::raii::Buffer probeSwapchain = nullptr;
+      std::unique_ptr<MemoryPool::Allocation> probeSwapchainAlloc = nullptr;
+      bool probeAwaitingReadback = false; // NOTE(review): looks like "probe readback still pending" — confirm
+
+      // Compute descriptor set for culling
+      vk::raii::DescriptorSet computeSet = nullptr;
+    };
+    std::vector<ForwardPlusPerFrame> forwardPlusPerFrame; // size MAX_FRAMES_IN_FLIGHT
+    // Per-frame light count used by shaders (set once before main pass)
+    uint32_t lastFrameLightCount = 0;
+
+    // Forward+ compute resources
+    vk::raii::PipelineLayout forwardPlusPipelineLayout = nullptr;
+    vk::raii::Pipeline forwardPlusPipeline = nullptr;
+    vk::raii::DescriptorSetLayout forwardPlusDescriptorSetLayout = nullptr;
+
+    // Depth pre-pass pipeline
+    vk::raii::Pipeline depthPrepassPipeline = nullptr;
+
+    // Ray query rendering mode
+    RenderMode currentRenderMode = RenderMode::RayQuery;
+
+    // Ray query pipeline and resources
+    vk::raii::PipelineLayout rayQueryPipelineLayout = nullptr;
+    vk::raii::Pipeline rayQueryPipeline = nullptr;
+    vk::raii::DescriptorSetLayout rayQueryDescriptorSetLayout = nullptr;
+    std::vector<vk::raii::DescriptorSet> rayQueryDescriptorSets;
+    // Track when the ray query descriptor set for each frame has been written.
+    // Updating binding 6 (large texture table) can be expensive; avoid doing it every frame.
+    std::vector<bool> rayQueryDescriptorsWritten; // size = MAX_FRAMES_IN_FLIGHT
+    // Bitmask of frames whose ray query descriptor set needs a refresh (e.g., after TLAS rebuild or texture upload).
+    std::atomic<uint32_t> rayQueryDescriptorsDirtyMask{0};
+
+    // Dedicated ray query UBO (one per frame in flight) - separate from entity UBOs
+    std::vector<vk::raii::Buffer> rayQueryUniformBuffers;
+    std::vector<std::unique_ptr<MemoryPool::Allocation>> rayQueryUniformAllocations;
+    std::vector<void *> rayQueryUniformBuffersMapped;
+
+    // Ray query output image (storage image for compute shader output)
+    vk::raii::Image rayQueryOutputImage = nullptr;
+    std::unique_ptr<MemoryPool::Allocation> rayQueryOutputImageAllocation = nullptr;
+    vk::raii::ImageView rayQueryOutputImageView = nullptr;
+
+    // Acceleration structures for ray query
+    struct AccelerationStructure { // one acceleration structure plus its storage; used for both BLAS and TLAS entries
+      vk::raii::Buffer buffer = nullptr; // presumably the storage the structure is built into — confirm
+      std::unique_ptr<MemoryPool::Allocation> allocation = nullptr; // presumably the memory backing buffer — confirm
+      vk::raii::AccelerationStructureKHR handle = nullptr; // RAII handle; declared after buffer, so it is destroyed before buffer
+      vk::DeviceAddress deviceAddress = 0; // 0 until assigned; presumably the device address of handle — confirm
+    };
+    std::vector<AccelerationStructure> blasStructures; // Bottom-level AS (one per mesh)
+    AccelerationStructure tlasStructure; // Top-level AS (scene)
+
+    // Deferred deletion queue for old AS structures
+    // Keeps old AS buffers alive until all frames in flight have finished using them
+    struct PendingASDelete { // one retired set of acceleration structures waiting in pendingASDeletions
+      std::vector<AccelerationStructure> blasStructures; // old bottom-level structures kept alive until release
+      AccelerationStructure tlasStructure; // old top-level structure kept alive until release
+      uint32_t framesSinceDestroy = 0; // Increment each frame, delete when >= MAX_FRAMES_IN_FLIGHT
+    };
+    std::vector<PendingASDelete> pendingASDeletions;
+
+    // GPU data structures for ray query proper normal and material access
+    struct GeometryInfo { // no default initializers: whoever fills a record must write every field
+      uint64_t vertexBufferAddress; // Device address of vertex buffer
+      uint64_t indexBufferAddress; // Device address of index buffer
+      uint32_t vertexCount; // Number of vertices
+      uint32_t materialIndex; // Index into material buffer
+      uint32_t indexCount; // Number of indices (to bound primitiveIndex in shader)
+      uint32_t _pad0; // explicit filler for bytes 28..31, so the 16-byte-aligned columns below start at byte 32
+      // Instance-space -> world-space normal transform (3 columns). Matches raster convention.
+      // Stored as float4 columns (xyz used, w unused) for stable std430 layout.
+      alignas(16) glm::vec4 normalMatrix0; // column 0
+      alignas(16) glm::vec4 normalMatrix1; // column 1
+      alignas(16) glm::vec4 normalMatrix2; // column 2
+    };
+
+    struct MaterialData { // GPU-facing material record; only the thick-glass members have defaults, the rest must be filled by the producer
+      alignas(16) glm::vec3 albedo;
+      alignas(4) float metallic;
+      alignas(16) glm::vec3 emissive;
+      alignas(4) float roughness;
+      alignas(4) float ao;
+      alignas(4) float ior;
+      alignas(4) float emissiveStrength;
+      alignas(4) float alpha;
+      alignas(4) float transmissionFactor;
+      alignas(4) float alphaCutoff;
+      // glTF alpha mode encoding (matches shader): 0=OPAQUE, 1=MASK, 2=BLEND
+      alignas(4) int32_t alphaMode;
+      alignas(4) uint32_t isGlass; // bool as uint32
+      alignas(4) uint32_t isLiquid; // bool as uint32
+
+      // Thick-glass parameters (RQ-only)
+      alignas(16) glm::vec3 absorptionColor{1.0f, 1.0f, 1.0f}; // alignas(16) leaves an implicit gap after isLiquid
+      alignas(4) float absorptionDistance = 1.0f; // meters
+      alignas(4) uint32_t thinWalled = 1u; // 1 = thin surface, 0 = thick volume
+
+      // Raster parity: texture-set flags (-1 = no texture; 0 = sample from binding 6 table).
+      // Ray Query uses a single texture table (binding 6); indices are always valid even when
+      // the set flag is -1, so the shader can choose the correct no-texture behavior.
+      alignas(4) int32_t baseColorTextureSet;
+      alignas(4) int32_t physicalDescriptorTextureSet;
+      alignas(4) int32_t normalTextureSet;
+      alignas(4) int32_t occlusionTextureSet;
+      alignas(4) int32_t emissiveTextureSet;
+
+      // Ray Query texture table indices (binding 6). These always reference a valid descriptor
+      // (real streamed texture or a shared default slot).
+      alignas(4) int32_t baseColorTexIndex;
+      alignas(4) int32_t normalTexIndex;
+      alignas(4) int32_t physicalTexIndex; // metallic-roughness (default) or spec-gloss when useSpecGlossWorkflow=1
+      alignas(4) int32_t occlusionTexIndex;
+      alignas(4) int32_t emissiveTexIndex;
+
+      // Specular-glossiness workflow support (KHR_materials_pbrSpecularGlossiness)
+      alignas(4) int32_t useSpecGlossWorkflow; // 1 if SpecGloss
+      alignas(4) float glossinessFactor;
+      alignas(16) glm::vec3 specularFactor; // alignas(16) leaves an implicit gap after glossinessFactor
+      alignas(4) int32_t hasEmissiveStrengthExt;
+      alignas(4) uint32_t _padMat[3]; // explicit tail filler (3 x 4 bytes); by offset arithmetic the record is 192 bytes if glm::vec3 is 12 bytes — verify against the shader struct
+    };
+
+    // Ray query geometry and material buffers
+    vk::raii::Buffer geometryInfoBuffer = nullptr;
+    std::unique_ptr<MemoryPool::Allocation> geometryInfoAllocation = nullptr;
+    vk::raii::Buffer materialBuffer = nullptr;
+    std::unique_ptr<MemoryPool::Allocation> materialAllocation = nullptr;
+
+    // Ray query texture table (binding 6); it holds all material texture types, not only baseColor (see reserved slots below)
+    static constexpr uint32_t RQ_MAX_TEX = 2048;
+    // Reserved slots in the Ray Query texture table (binding 6)
+    static constexpr uint32_t RQ_SLOT_DEFAULT_BASECOLOR = 0;
+    static constexpr uint32_t RQ_SLOT_DEFAULT_NORMAL = 1;
+    static constexpr uint32_t RQ_SLOT_DEFAULT_METALROUGH = 2;
+    static constexpr uint32_t RQ_SLOT_DEFAULT_OCCLUSION = 3;
+    static constexpr uint32_t RQ_SLOT_DEFAULT_EMISSIVE = 4;
+    // NOTE: Textures can stream in asynchronously and their underlying VkImageView/VkSampler
+    // can be destroyed/recreated. Therefore, the Ray Query texture table must NOT cache
+    // VkDescriptorImageInfo (which contains raw handles). Instead, cache only the canonical
+    // texture key per slot and rebuild VkDescriptorImageInfo each descriptor update.
+    //
+    // Slots 0..4 are reserved for shared default PBR textures.
+    std::vector<std::string> rayQueryTexKeys; // slot -> canonical texture key
+    std::vector<uint32_t> rayQueryTexFallbackSlots; // slot -> fallback slot (type-appropriate default)
+    uint32_t rayQueryTexCount = 0; // number of valid slots in rayQueryTexKeys
+    std::unordered_map<std::string, uint32_t> rayQueryTexIndex; // canonicalKey -> slot
+
+    // Per-material texture path mapping captured at AS build time; used for streaming requests
+    // and debugging, but Ray Query primarily uses per-material texture indices.
+    struct RQMaterialTexPaths { // texture paths of one material, one string per texture role
+      std::string baseColor; // base color texture path
+      std::string normal; // normal map path
+      std::string physical; // presumably metallic-roughness or spec-gloss, like physicalTexIndex — confirm
+      std::string occlusion; // occlusion texture path
+      std::string emissive; // emissive texture path
+    };
+    std::vector<RQMaterialTexPaths> rqMaterialTexPaths;
+
+    // Count of GeometryInfo instances currently uploaded (CPU-side tracking)
+    size_t geometryInfoCountCPU = 0;
+    // Count of materials currently uploaded (CPU-side tracking)
+    size_t materialCountCPU = 0;
+
+    // --- Pending GPU uploads (to be executed on the render thread safe point) ---
+    std::mutex pendingMeshUploadsMutex;
+    std::vector<class MeshComponent *> pendingMeshUploads; // meshes with staged data to copy
+
+    struct InFlightMeshUploadBatch { // one submitted mesh-upload batch, stored in inFlightMeshUploads
+      uint64_t signalValue = 0; // NOTE(review): presumably the uploadsTimeline value that marks this batch done — confirm
+      std::vector<class MeshComponent *> meshes; // meshes whose uploads belong to this batch (raw pointers)
+      std::unique_ptr<vk::raii::CommandPool> commandPool; // declared before commandBuffers, so it is destroyed after them
+      std::unique_ptr<vk::raii::CommandBuffers> commandBuffers; // presumably allocated from commandPool — confirm
+    };
+    std::mutex inFlightMeshUploadsMutex;
+    std::deque<InFlightMeshUploadBatch> inFlightMeshUploads;
+
+    // Enqueue mesh uploads collected on background/loading threads
+    void EnqueueMeshUploads(const std::vector<class MeshComponent *>& meshes);
+    // Execute pending mesh uploads on the render thread (called from Render after fence wait)
+    void ProcessPendingMeshUploads();
+
+    // --- Pending entity GPU preallocation (enqueued by scene loader thread; executed on render thread) ---
+    std::mutex pendingEntityPreallocMutex;
+    std::vector<Entity *> pendingEntityPrealloc;
+    std::vector<Entity *> pendingInstanceBufferRecreations;
+    std::atomic<bool> pendingEntityPreallocQueued{false};
+    void ProcessPendingEntityPreallocations();
+
+    // Descriptor set layouts (declared before pools and sets)
+    vk::raii::DescriptorSetLayout descriptorSetLayout = nullptr;
+    vk::raii::DescriptorSetLayout pbrDescriptorSetLayout = nullptr;
+    vk::raii::DescriptorSetLayout transparentDescriptorSetLayout = nullptr;
+    vk::raii::PipelineLayout pbrTransparentPipelineLayout = nullptr;
+
+    // The texture that will hold a snapshot of the opaque scene
+    // One off-screen color image per frame-in-flight to avoid cross-frame read/write hazards.
+    std::vector<vk::raii::Image> opaqueSceneColorImages;
+    std::vector<std::unique_ptr<MemoryPool::Allocation>> opaqueSceneColorImageAllocations;
+    std::vector<vk::raii::ImageView> opaqueSceneColorImageViews;
+    // Track the current layout per frame (initialized to eUndefined at creation)
+    std::vector<vk::ImageLayout> opaqueSceneColorImageLayouts;
+    vk::raii::Sampler opaqueSceneColorSampler{nullptr};
+
+    // A descriptor set for the opaque scene color texture. One per frame in flight.
+    std::vector<vk::raii::DescriptorSet> transparentDescriptorSets;
+    // Fallback descriptor sets for opaque pass (binds a default SHADER_READ_ONLY texture as Set 1)
+    std::vector<vk::raii::DescriptorSet> transparentFallbackDescriptorSets;
+
+    // Ray Query composite descriptor sets: sample the rayQueryOutputImage in a fullscreen pass
+    std::vector<vk::raii::DescriptorSet> rqCompositeDescriptorSets;
+    // Fallback sampler for the RQ composite if no other sampler is available at init time
+    vk::raii::Sampler rqCompositeSampler{nullptr};
+
+  public:
+    // Mesh resources
+    struct MeshResources { // GPU-side buffers and bookkeeping for one mesh (value type of meshResources)
+      // Device-local vertex/index buffers used for rendering
+      vk::raii::Buffer vertexBuffer = nullptr;
+      std::unique_ptr<MemoryPool::Allocation> vertexBufferAllocation = nullptr;
+      vk::raii::Buffer indexBuffer = nullptr;
+      std::unique_ptr<MemoryPool::Allocation> indexBufferAllocation = nullptr;
+      uint32_t indexCount = 0; // number of indices; 0 until set
+
+      // Optional per-mesh staging buffers used when uploads are batched.
+      // These are populated when createMeshResources(..., deferUpload=true) is used
+      // and are consumed and cleared by preAllocateEntityResourcesBatch().
+      vk::raii::Buffer stagingVertexBuffer = nullptr;
+      vk::raii::DeviceMemory stagingVertexBufferMemory = nullptr; // plain vk::raii::DeviceMemory, unlike the MemoryPool allocations above
+      vk::DeviceSize vertexBufferSizeBytes = 0; // size in bytes; 0 until set
+
+      vk::raii::Buffer stagingIndexBuffer = nullptr;
+      vk::raii::DeviceMemory stagingIndexBufferMemory = nullptr; // plain vk::raii::DeviceMemory, unlike the MemoryPool allocations above
+      vk::DeviceSize indexBufferSizeBytes = 0; // size in bytes; 0 until set
+
+      // Material index for ray query (extracted from entity name or MaterialMesh)
+      int32_t materialIndex = -1; // -1 = no material/default
+    };
+    std::unordered_map<MeshComponent *, MeshResources> meshResources;
+
+    // Texture resources
+    struct TextureResources { // image, view and sampler of one texture (value type of textureResources)
+      vk::raii::Image textureImage = nullptr;
+      std::unique_ptr<MemoryPool::Allocation> textureImageAllocation = nullptr; // presumably the memory backing textureImage — confirm
+      vk::raii::ImageView textureImageView = nullptr;
+      vk::raii::Sampler textureSampler = nullptr;
+      vk::Format format = vk::Format::eR8G8B8A8Srgb; // Texture format, kept for color-space handling; defaults to 8-bit sRGB RGBA
+      uint32_t mipLevels = 1; // Number of mipmap levels; defaults to 1
+      // Hint: true if source texture appears to use alpha masking (any alpha < ~1.0)
+      bool alphaMaskedHint = false;
+    };
+    std::unordered_map<std::string, TextureResources> textureResources;
+
+    // Pending texture jobs that require GPU-side work. Worker threads
+    // enqueue these jobs; the main thread drains them and performs the
+    // actual LoadTexture/LoadTextureFromMemory calls.
+    struct PendingTextureJob { // one queued texture load; every member now has a defined default
+      enum class Type {
+        FromFile,
+        FromMemory
+      } type = Type::FromFile; // source of the pixels; default is the first enumerator, same as value-initialization
+      enum class Priority {
+        Critical,
+        NonCritical
+      } priority = Priority::Critical; // scheduling class; default is the first enumerator, same as value-initialization
+      std::string idOrPath; // texture ID or file path, depending on type
+      std::vector<unsigned char> data; // only used for FromMemory
+      int width = 0; // image width; 0 until set
+      int height = 0; // image height; 0 until set
+      int channels = 0; // channel count; 0 until set
+    };
+
+    std::mutex pendingTextureJobsMutex;
+    std::condition_variable pendingTextureCv;
+    std::vector<PendingTextureJob> pendingTextureJobs;
+    // Track outstanding critical texture jobs (for IsLoading)
+    std::atomic<uint32_t> criticalJobsOutstanding{0};
+
+    // Background uploader worker controls (multiple workers)
+    std::atomic<bool> stopUploadsWorker{false};
+    std::vector<std::thread> uploadsWorkerThreads;
+
+  public:
+    // Track how many texture upload jobs have been scheduled vs completed
+    // on the GPU side. Used only for UI feedback during streaming.
+    std::atomic<uint32_t> uploadJobsTotal{0};
+    std::atomic<uint32_t> uploadJobsCompleted{0};
+    // When true, initial scene load is complete and the loading overlay should be hidden
+    std::atomic<bool> initialLoadComplete{false};
+    // Loading-phase UI state (atomic because ImGui may query at any point)
+    std::atomic<uint32_t> loadingPhase{static_cast<uint32_t>(LoadingPhase::Scene)};
+    std::atomic<float> loadingPhaseProgress{0.0f};
+
+    // Performance counters for texture uploads
+    std::atomic<uint64_t> bytesUploadedTotal{0};
+    // Streaming window start time in nanoseconds from steady_clock epoch (0 when inactive)
+    std::atomic<uint64_t> uploadWindowStartNs{0};
+    // Aggregate per-texture CPU upload durations (nanoseconds) and count
+    std::atomic<uint64_t> totalUploadNs{0};
+    std::atomic<uint32_t> uploadCount{0};
+
+    // Reverse mapping from texture ID to entities that reference it. Used to
+    // update descriptor sets when a streamed texture finishes uploading.
+    std::mutex textureUsersMutex;
+    std::unordered_map<std::string, std::vector<Entity *>> textureToEntities;
+
+    // Entities needing descriptor set refresh due to streamed textures
+    std::mutex dirtyEntitiesMutex;
+    // Map of entity -> bitmask of frames-in-flight that still need a descriptor refresh.
+    // This avoids the “frame 0 updated / frame 1 still default” oscillation when
+    // MAX_FRAMES_IN_FLIGHT > 1 and a texture becomes available mid-stream.
+    std::unordered_map<Entity *, uint32_t> descriptorDirtyEntities;
+
+    // Protect concurrent access to textureResources
+    mutable std::shared_mutex textureResourcesMutex;
+
+    // Texture aliasing: maps alias (canonical) IDs to actual loaded keys
+    std::unordered_map<std::string, std::string> textureAliases;
+
+    // Per-texture load de-duplication (serialize loads of the same texture ID only)
+    mutable std::mutex textureLoadStateMutex;
+    std::condition_variable textureLoadStateCv;
+    std::unordered_set<std::string> texturesLoading;
+
+    // Serialize GPU-side texture upload (image/buffer creation, transitions) to avoid driver/memory pool races
+    mutable std::mutex textureUploadMutex;
+
+    // Thread pool for background tasks (textures, etc.)
+    std::unique_ptr<ThreadPool> threadPool;
+    // Mutex to protect threadPool access during initialization/cleanup
+    mutable std::shared_mutex threadPoolMutex;
+
+    // Texture loading progress (for UI)
+    std::atomic<uint32_t> textureTasksScheduled{0};
+    std::atomic<uint32_t> textureTasksCompleted{0};
+    std::atomic<bool> loadingFlag{false};
+
+    // Acceleration structure build UI progress (written on render thread).
+    // Kept as atomics because ImGui can query at any point during the frame.
+    std::atomic<bool> asBuildUiActive{false};
+    std::atomic<float> asBuildUiProgress{0.0f};
+    std::atomic<uint32_t> asBuildUiDone{0};
+    std::atomic<uint32_t> asBuildUiTotal{0};
+    std::atomic<const char *> asBuildUiStage{"idle"};
+    std::atomic<uint64_t> asBuildUiStartNs{0};
+
+    // Default texture resources (used when no texture is provided)
+    TextureResources defaultTextureResources;
+
+  public:
+    // Performance clamps (to reduce per-frame cost)
+    static constexpr uint32_t MAX_ACTIVE_LIGHTS = 1024; // Limit the number of lights processed per frame
+
+    // Static lights loaded during model initialization
+    std::vector<ExtractedLight> staticLights;
+
+    // Dynamic lighting system using storage buffers
+    struct LightStorageBuffer { // storage buffer holding the light list for one frame in flight
+      vk::raii::Buffer buffer = nullptr;
+      std::unique_ptr<MemoryPool::Allocation> allocation = nullptr; // presumably the memory backing buffer — confirm
+      void* mapped = nullptr; // presumably the host pointer used to write lights into buffer — TODO confirm
+      size_t capacity = 0; // Current capacity in number of lights
+      size_t size = 0; // Current number of lights
+    };
+    std::vector<LightStorageBuffer> lightStorageBuffers; // One per frame in flight
+
+    // Entity resources (contains descriptor sets - must be declared before descriptor pool)
+    struct EntityResources { // per-entity GPU resources and cached render state (value type of entityResources)
+      std::vector<vk::raii::Buffer> uniformBuffers; // presumably one per frame in flight — confirm
+      std::vector<std::unique_ptr<MemoryPool::Allocation>> uniformBufferAllocations;
+      std::vector<void *> uniformBuffersMapped; // presumably host pointers matching uniformBuffers by index — confirm
+      std::vector<vk::raii::DescriptorSet> basicDescriptorSets; // For basic pipeline
+      std::vector<vk::raii::DescriptorSet> pbrDescriptorSets; // For PBR pipeline
+
+      // Instance buffer for instanced rendering
+      vk::raii::Buffer instanceBuffer = nullptr;
+      std::unique_ptr<MemoryPool::Allocation> instanceBufferAllocation = nullptr;
+      void* instanceBufferMapped = nullptr; // presumably the host pointer for instanceBuffer — TODO confirm
+
+      // Tracks whether binding 0 (UBO) has been written at least once for each frame
+      // for each pipeline type. Descriptor sets for non-current frames are allocated
+      // but not necessarily initialized immediately (to avoid update-after-bind hazards),
+      // so each frame needs a one-time initialization at its safe point.
+      std::vector<bool> pbrUboBindingWritten; // size = MAX_FRAMES_IN_FLIGHT
+      std::vector<bool> basicUboBindingWritten; // size = MAX_FRAMES_IN_FLIGHT
+
+      // Tracks whether image bindings have been written at least once for each frame.
+      // If false for the current frame at the safe point, we cold-initialize the
+      // image bindings (PBR: b1..b5 [+b6 when applicable], Basic: b1) with either
+      // real textures or shared defaults to avoid per-frame "black" flashes.
+      std::vector<bool> pbrImagesWritten; // size = MAX_FRAMES_IN_FLIGHT
+      std::vector<bool> basicImagesWritten; // size = MAX_FRAMES_IN_FLIGHT
+
+      // Tracks whether the remaining required bindings in the PBR set 0 layout have
+      // been written at least once for each frame.
+      // This includes bindings like Forward+ tile buffers (7/8), reflection sampler (10),
+      // and TLAS (11). These bindings are required by the pipeline layout and must be
+      // valid before any draw that uses the PBR/glass pipelines.
+      std::vector<bool> pbrFixedBindingsWritten; // size = MAX_FRAMES_IN_FLIGHT
+
+      // Cached material lookup/classification for raster rendering.
+      // Avoids per-frame string parsing of entity names ("_Material_") and repeated
+      // ModelLoader material lookups across culling, sorting, and draw loops.
+      bool materialCacheValid = false; // presumably false until the cached* members below are filled — confirm
+      const Material* cachedMaterial = nullptr; // raw pointer; NOTE(review): presumably not owned here — confirm
+      // Derived flags used by render queues and sorting heuristics
+		bool cachedIsBlended = false;
+		bool cachedIsGlass   = false;
+		bool cachedIsLiquid  = false;
+		// Material-derived push constants defaults (static per-entity unless material changes)
+		MaterialProperties cachedMaterialProps{};
+	};
+
+	// Cached job for rendering a single entity in a frame
+	struct RenderJob        // per-entity draw record; members default to null/false so an unfilled job is detectable
+	{
+		Entity             *entity        = nullptr;        // entity this job draws
+		EntityResources    *entityRes     = nullptr;        // that entity's EntityResources
+		MeshResources      *meshRes       = nullptr;        // that entity's MeshResources
+		MeshComponent      *meshComp      = nullptr;        // that entity's mesh component
+		TransformComponent *transformComp = nullptr;        // that entity's transform component
+		bool                isAlphaMasked = false;          // NOTE(review): presumably selects alpha-masked handling — confirm
+	};
+	std::unordered_map<Entity *, EntityResources> entityResources;
+
+  public:
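+    // Descriptor pool. NOTE(review): members are destroyed in reverse declaration order, so this pool is destroyed BEFORE entityResources and its descriptor sets; confirm cleanup releases those sets explicitly first.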
+    vk::raii::DescriptorPool descriptorPool = nullptr;
+
+    // Current frame index
+    uint32_t currentFrame = 0;
+
+    // Queue family indices
+    QueueFamilyIndices queueFamilyIndices;
+
+    // Validation layers
+    const std::vector<const char *> validationLayers = {
+      "VK_LAYER_KHRONOS_validation"
+    };
+
+    // Required device extensions
+    const std::vector<const char *> requiredDeviceExtensions = {
+      VK_KHR_SWAPCHAIN_EXTENSION_NAME
+    };
+
+    // Optional device extensions
+    const std::vector<const char *> optionalDeviceExtensions = {
+      VK_KHR_DYNAMIC_RENDERING_EXTENSION_NAME,
+      VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME,
+      VK_KHR_DEPTH_STENCIL_RESOLVE_EXTENSION_NAME,
+      VK_EXT_DESCRIPTOR_INDEXING_EXTENSION_NAME,
+      // Robustness and safety
+      VK_EXT_ROBUSTNESS_2_EXTENSION_NAME,
+      // Tile/local memory friendly dynamic rendering readback
+      VK_KHR_DYNAMIC_RENDERING_LOCAL_READ_EXTENSION_NAME,
+      // Shader tile image for fast tile access
+      VK_EXT_SHADER_TILE_IMAGE_EXTENSION_NAME,
+      // Ray query support for ray-traced rendering
+      VK_KHR_DEFERRED_HOST_OPERATIONS_EXTENSION_NAME,
+      VK_KHR_ACCELERATION_STRUCTURE_EXTENSION_NAME,
+      VK_KHR_RAY_QUERY_EXTENSION_NAME
+    };
+
+    // All device extensions (required + optional)
+    std::vector<const char *> deviceExtensions;
+
+    // Initialization flag
+    bool initialized = false;
+    // Whether VK_EXT_descriptor_indexing (update-after-bind) path is enabled
+    bool descriptorIndexingEnabled = false;
+    bool storageAfterBindEnabled = false;
+    // Feature toggles detected/enabled at device creation
+    bool robustness2Enabled = false;
+    bool dynamicRenderingLocalReadEnabled = false;
+    bool shaderTileImageEnabled = false;
+    bool rayQueryEnabled = false;
+    bool accelerationStructureEnabled = false;
+
+    // When true and current render mode is RayQuery, the engine renders a static opaque scene:
+    // - Animation/physics updates are suppressed by the Engine (input/Update hook)
+    // - TLAS refit per-frame is skipped to avoid any animation-driven changes
+    // - The AS is built once after loading completes
+    // Default now OFF so animation is enabled again for AS (per user request)
+    bool rayQueryStaticOnly = false;
+
+    // Deformable (skinned/morph) meshes are included in the AS. Their BLAS is built with
+    // eAllowUpdate and refit each frame from the skinning compute output so they animate
+    // correctly under ray query (see refitBLASInline / buildAccelerationStructures).
+    bool enableBLASRefit = true;
+
+    // Framebuffer resized flag (atomic to handle platform callback vs. render thread)
+    std::atomic<bool> framebufferResized{false};
+    // Guard to prevent descriptor updates while a command buffer is recording
+    std::atomic<bool> isRecordingCmd{false};
+    // Descriptor sets may be temporarily invalid during swapchain recreation; suppress updates then.
+    std::atomic<bool> descriptorSetsValid{true};
+    // Request flag for acceleration structure build (set by loading thread, cleared by render thread)
+    std::atomic<bool> asBuildRequested{false};
+    // Timestamp of the most recent AS build request (steady_clock ns). Used to prevent infinite deferral.
+    std::atomic<uint64_t> asBuildRequestStartNs{0};
+
+    // Track last successfully built AS sizes to avoid rebuilding with a smaller subset
+    // (e.g., during incremental streaming where not all meshes are ready yet).
+    // We only accept AS builds that are monotonically non-decreasing in counts.
+    size_t lastASBuiltBLASCount = 0;
+    // NOTE: This is the number of renderable ENTITIES included in the AS build (not TLAS instances).
+    size_t lastASBuiltInstanceCount = 0;
+    // TLAS instance count (includes per-mesh instancing). Used for logging and shader bounds.
+    size_t lastASBuiltTlasInstanceCount = 0;
+
+    // Freeze TLAS rebuilds after a full build to prevent regressions (e.g., animation-only TLAS)
+    bool asFreezeAfterFullBuild = true; // enable freezing behavior
+    bool asFrozen = false; // once frozen, ignore rebuilds unless explicitly overridden
+    // Optional developer override to allow rebuild while frozen
+    bool asDevOverrideAllowRebuild = false;
+    // Reason string for the last time a build was requested (for logging)
+    std::string lastASBuildRequestReason;
+
+    // Opportunistic rebuilds (when counts increase) can cause unintended TLAS churn during animation.
+    // Leave this disabled by default; TLAS builds should be explicit (on mode switch / scene ready).
+    bool asOpportunisticRebuildEnabled = false;
+
+    // --- AS UPDATE/Refit state ---
+    // Persistent TLAS instances buffer & order for UPDATE (refit)
+    struct TlasInstanceRef { // identifies the source of one TLAS instance (element type of tlasInstanceOrder)
+      class Entity* entity{nullptr}; // entity this TLAS entry refers to; null by default
+      uint32_t instanceIndex{0}; // valid only when instanced==true
+      bool instanced{false}; // true when this TLAS entry comes from MeshComponent instancing
+    };
+    vk::raii::Buffer tlasInstancesBuffer{nullptr};
+    std::unique_ptr<MemoryPool::Allocation> tlasInstancesAllocation;
+    uint32_t tlasInstanceCount = 0;
+    std::vector<TlasInstanceRef> tlasInstanceOrder; // order must match buffer instances
+
+    // Scratch buffer for TLAS UPDATE operations
+    vk::raii::Buffer tlasUpdateScratchBuffer{nullptr};
+    std::unique_ptr<MemoryPool::Allocation> tlasUpdateScratchAllocation;
+
+    // Maximum number of frames in flight
+    // More than 1 allows CPU/GPU overlap and reduces per-frame stalls.
+    // All per-frame resources (UBOs, descriptor sets, reflection RTs, etc.)
+    // are sized dynamically based on this value.
+    const uint32_t MAX_FRAMES_IN_FLIGHT = 2u;
+
+    // --- Performance & diagnostics ---
+    UniformBufferObject frameUboTemplate{};
+    bool enableFrustumCulling = true;
+    uint32_t lastCullingVisibleCount = 0;
+    uint32_t lastCullingCulledCount = 0;
+    // Distance-based LOD (projected-size skip in pixels)
+    bool enableDistanceLOD = false;
+    float lodPixelThresholdOpaque = 1.5f;
+    float lodPixelThresholdTransparent = 2.5f;
+    // Sampler anisotropy preference (clamped to device limits)
+    float samplerMaxAnisotropy = 8.0f;
+    // Upper bound on auto-generated mip levels (to avoid excessive VRAM use on huge textures)
+    uint32_t maxAutoGeneratedMipLevels = 4;
+
+    // --- Planar reflections (scaffolding) ---
+    bool enablePlanarReflections = false; // UI toggle to enable/disable planar reflections
+    float reflectionResolutionScale = 0.5f; // Scale relative to swapchain size
+    // Cached per-frame reflection data used by UBO population
+    // Current frame's reflection VP (for rendering the reflection pass)
+    glm::mat4 currentReflectionVP{1.0f};
+    glm::vec4 currentReflectionPlane{0.0f, 1.0f, 0.0f, 0.0f};
+    // Per-frame stored reflection VP (written during reflection pass)
+    std::vector<glm::mat4> reflectionVPs; // size MAX_FRAMES_IN_FLIGHT
+    // The VP to sample in the main pass (prev-frame VP to match prev-frame texture)
+    glm::mat4 sampleReflectionVP{1.0f};
+    bool reflectionResourcesDirty = false; // recreate reflection RTs at safe point
+
+    // --- Ray query rendering options ---
+    bool enableRayQueryReflections = true; // UI toggle to enable reflections in ray query mode
+    bool enableRayQueryTransparency = true; // UI toggle to enable transparency/refraction in ray query mode
+
+  public:
+    // === Watchdog system to detect application hangs ===
+    // Atomic timestamp updated every frame - watchdog thread checks if stale
+    std::atomic<std::chrono::steady_clock::time_point> lastFrameUpdateTime;
+    // Low-noise progress marker to pinpoint where the render thread stalled when the watchdog fires
+    std::atomic<const char *> watchdogProgressLabel{"init"};
+    // Optional numeric marker to help pinpoint stalls inside large loops
+    std::atomic<uint32_t> watchdogProgressIndex{0};
+    std::thread watchdogThread;
+    std::atomic<bool> watchdogRunning{false};
+    // Some operations (notably BLAS/TLAS builds in Debug on large scenes) can legitimately take
+    // longer than the watchdog threshold. When set, the watchdog will not abort.
+    std::atomic<bool> watchdogSuppressed{false};
+
+    // === Descriptor update deferral while recording ===
+    struct PendingDescOp { // one deferred descriptor update; fields appear to mirror the updateDescriptorSetsForFrame arguments
+      Entity* entity = nullptr; // defaulted so a default-constructed op never carries an indeterminate pointer
+      std::string texPath;
+      bool usePBR = false;
+      uint32_t frameIndex = 0;
+      bool imagesOnly = false; // same default as updateDescriptorSetsForFrame's imagesOnly parameter
+    };
+    std::mutex pendingDescMutex;
+    std::vector<PendingDescOp> pendingDescOps; // flushed at frame safe point
+    std::atomic<bool> descriptorRefreshPending{false};
+
+    struct ReflectionRT {
+      vk::raii::Image color = nullptr; // null handle until reflection resources are created
+      std::unique_ptr<MemoryPool::Allocation> colorAlloc = nullptr; // presumably the pool memory backing `color` — confirm
+      vk::raii::ImageView colorView = nullptr;
+      vk::raii::Sampler colorSampler = nullptr;
+
+      vk::raii::Image depth = nullptr;
+      std::unique_ptr<MemoryPool::Allocation> depthAlloc = nullptr; // presumably the pool memory backing `depth` — confirm
+      vk::raii::ImageView depthView = nullptr;
+
+      uint32_t width = 0; // 0 x 0 until sized
+      uint32_t height = 0;
+    };
+    std::vector<ReflectionRT> reflections; // one per frame-in-flight
+
+    // Internal helper methods (note: declared under the public: label above, so not actually private)
+    bool createInstance(const std::string& appName, bool enableValidationLayers);
+    bool setupDebugMessenger(bool enableValidationLayers);
+    bool createSurface();
+    bool checkValidationLayerSupport() const;
+    bool pickPhysicalDevice();
+    void addSupportedOptionalExtensions();
+    bool createLogicalDevice(bool enableValidationLayers);
+    bool createSwapChain();
+    bool createImageViews();
+    bool setupDynamicRendering();
+    bool createDescriptorSetLayout();
+    bool createPBRDescriptorSetLayout();
+    bool createGraphicsPipeline();
+
+    bool createPBRPipeline();
+    bool createLightingPipeline();
+    bool createDepthPrepassPipeline();
+    bool createForwardPlusPipelinesAndResources();
+
+    // Ray query pipeline creation
+    bool createRayQueryDescriptorSetLayout();
+    bool createRayQueryPipeline();
+    bool createRayQueryResources();
+    // If updateOnlyCurrentFrame is true, only descriptor sets for currentFrame will be updated.
+    // Use updateOnlyCurrentFrame=false during initialization/swapchain recreation when the device is idle.
+    bool createOrResizeForwardPlusBuffers(uint32_t tilesX, uint32_t tilesY, uint32_t slicesZ, bool updateOnlyCurrentFrame = false);
+    void updateForwardPlusParams(uint32_t frameIndex, const glm::mat4& view, const glm::mat4& proj, uint32_t lightCount, uint32_t tilesX, uint32_t tilesY, uint32_t slicesZ, float nearZ, float farZ);
+    void dispatchForwardPlus(vk::raii::CommandBuffer& cmd, uint32_t tilesX, uint32_t tilesY, uint32_t slicesZ);
+    // Ensure Forward+ compute descriptor set binding 0 (lights SSBO) is bound for a frame
+    void refreshForwardPlusComputeLightsBindingForFrame(uint32_t frameIndex);
+    bool createComputePipeline();
+    void pushMaterialProperties(vk::CommandBuffer commandBuffer, const MaterialProperties& material) const;
+    bool createCommandPool();
+
+    // Command pool, depth, texture, mesh, uniform buffer and descriptor resource creation
+    bool createComputeCommandPool();
+    bool createDepthResources();
+    bool createTextureImage(const std::string& texturePath, TextureResources& resources);
+    bool createTextureImageView(TextureResources& resources);
+    bool createTextureSampler(TextureResources& resources);
+    bool createDefaultTextureResources();
+    bool createSharedDefaultPBRTextures();
+    bool createMeshResources(MeshComponent* meshComponent, bool deferUpload = false);
+    bool createUniformBuffers(Entity* entity);
+    bool createDescriptorPool();
+    bool createDescriptorSets(Entity* entity, const std::string& texturePath, bool usePBR = false);
+    bool createDescriptorSets(Entity *entity, EntityResources &res, const std::string &texturePath, bool usePBR = false);
+	bool updateDescriptorSetsForFrame(Entity            *entity,
+	                                  const std::string &texturePath,
+	                                  bool               usePBR,
+	                                  uint32_t           frameIndex,
+	                                  bool               imagesOnly = false,
+	                                  bool               uboOnly    = false);
+	bool updateDescriptorSetsForFrame(Entity            *entity,
+	                                  EntityResources   &res,
+	                                  const std::string &texturePath,
+	                                  bool               usePBR,
+	                                  uint32_t           frameIndex,
+	                                  bool               imagesOnly = false,
+	                                  bool               uboOnly    = false);
+	// Refresh only the currentFrame PBR descriptor set bindings that Forward+ relies on
+	// (b6 = lights SSBO, b7 = tile headers, b8 = tile indices). Safe to call after
+	// we've waited on the frame fence at the start of Render().
+	void refreshPBRForwardPlusBindingsForFrame(uint32_t frameIndex);
+	bool createCommandBuffers();
+	bool createSyncObjects();
+
+    void cleanupSwapChain();
+
+    // Planar reflection helpers (initial scaffolding)
+    bool createReflectionResources(uint32_t width, uint32_t height);
+    void destroyReflectionResources();
+    // Render the scene into the reflection RT (mirrored about a plane) — to be fleshed out next step
+    void renderReflectionPass(vk::raii::CommandBuffer& cmd,
+                              const glm::vec4& planeWS,
+                              CameraComponent* camera,
+                              const std::vector<RenderJob>      &jobs);
+
+    // Ensure Vulkan-Hpp dispatcher is initialized for the current thread when using RAII objects on worker threads
+    void ensureThreadLocalVulkanInit() const;
+
+    // Cache and classify an entity's material for raster rendering (opaque vs blended, glass/liquid flags,
+    // and push-constant defaults). This avoids repeated per-frame string parsing and material lookups.
+    void ensureEntityMaterialCache(Entity* entity, EntityResources &res);
+
+    // ===================== Culling helpers =====================
+    struct FrustumPlanes {
+      // A point counts as inside a plane when ax + by + cz + d >= 0
+      glm::vec4 planes[6] = {}; // order: 0=L, 1=R, 2=B, 3=T, 4=N, 5=F
+    };
+
+    static FrustumPlanes extractFrustumPlanes(const glm::mat4& vp);
+
+    static void transformAABB(const glm::mat4& M,
+                              const glm::vec3& localMin,
+                              const glm::vec3& localMax,
+                              glm::vec3& outMin,
+                              glm::vec3& outMax);
+
+    static bool aabbIntersectsFrustum(const glm::vec3& worldMin,
+                                      const glm::vec3& worldMax,
+                                      const FrustumPlanes& frustum);
+    void recreateSwapChain();
+
+    void updateUniformBuffer(uint32_t currentImage, Entity* entity, EntityResources *entityRes, CameraComponent* camera, TransformComponent *tc = nullptr);
+    void updateUniformBuffer(uint32_t currentImage, Entity* entity, EntityResources *entityRes, CameraComponent* camera, const glm::mat4& customTransform);
+    void updateUniformBufferInternal(uint32_t currentImage, Entity* entity, EntityResources *entityRes, CameraComponent* camera, UniformBufferObject& ubo);
+	void prepareFrameUboTemplate(CameraComponent *camera);
+
+    vk::raii::ShaderModule createShaderModule(const std::vector<char>& code);
+
+    QueueFamilyIndices findQueueFamilies(const vk::raii::PhysicalDevice& device);
+    SwapChainSupportDetails querySwapChainSupport(const vk::raii::PhysicalDevice& device);
+    bool isDeviceSuitable(vk::raii::PhysicalDevice& device);
+    bool checkDeviceExtensionSupport(vk::raii::PhysicalDevice& device);
+
+    vk::SurfaceFormatKHR chooseSwapSurfaceFormat(const std::vector<vk::SurfaceFormatKHR>& availableFormats);
+    vk::PresentModeKHR chooseSwapPresentMode(const std::vector<vk::PresentModeKHR>& availablePresentModes);
+    vk::Extent2D chooseSwapExtent(const vk::SurfaceCapabilitiesKHR& capabilities);
+
+    uint32_t findMemoryType(uint32_t typeFilter, vk::MemoryPropertyFlags properties) const;
+
+    std::pair<vk::raii::Buffer, vk::raii::DeviceMemory> createBuffer(vk::DeviceSize size, vk::BufferUsageFlags usage, vk::MemoryPropertyFlags properties);
+    bool createOpaqueSceneColorResources();
+    void createTransparentDescriptorSets();
+    void createTransparentFallbackDescriptorSets();
+    std::pair<vk::raii::Buffer, std::unique_ptr<MemoryPool::Allocation>> createBufferPooled(vk::DeviceSize size, vk::BufferUsageFlags usage, vk::MemoryPropertyFlags properties);
+    void copyBuffer(vk::raii::Buffer& srcBuffer, vk::raii::Buffer& dstBuffer, vk::DeviceSize size);
+
+    std::pair<vk::raii::Image, vk::raii::DeviceMemory> createImage(uint32_t width, uint32_t height, vk::Format format, vk::ImageTiling tiling, vk::ImageUsageFlags usage, vk::MemoryPropertyFlags properties);
+    std::pair<vk::raii::Image, std::unique_ptr<MemoryPool::Allocation>> createImagePooled(uint32_t width, uint32_t height, vk::Format format, vk::ImageTiling tiling, vk::ImageUsageFlags usage, vk::MemoryPropertyFlags properties, uint32_t mipLevels = 1, vk::SharingMode sharingMode = vk::SharingMode::eExclusive, const std::vector<uint32_t>& queueFamilies = {});
+    void transitionImageLayout(vk::Image image, vk::Format format, vk::ImageLayout oldLayout, vk::ImageLayout newLayout, uint32_t mipLevels = 1);
+    void copyBufferToImage(vk::Buffer buffer, vk::Image image, uint32_t width, uint32_t height, vk::ArrayProxy<const vk::BufferImageCopy> regions);
+    // Extended: track stagedBytes for perf stats
+    void uploadImageFromStaging(vk::Buffer staging,
+                                vk::Image image,
+                                vk::Format format,
+                                vk::ArrayProxy<const vk::BufferImageCopy> regions,
+                                uint32_t mipLevels,
+                                vk::DeviceSize stagedBytes);
+
+    vk::raii::ImageView createImageView(vk::raii::Image& image, vk::Format format, vk::ImageAspectFlags aspectFlags, uint32_t mipLevels = 1);
+    vk::Format findSupportedFormat(const std::vector<vk::Format>& candidates, vk::ImageTiling tiling, vk::FormatFeatureFlags features);
+    bool hasStencilComponent(vk::Format format);
+
+    std::vector<char> readFile(const std::string& filename);
+
+    // Background uploader helpers
+    void StartUploadsWorker(size_t workerCount = 0);
+    void StopUploadsWorker();
+
+    // Serialize descriptor writes vs command buffer recording to avoid mid-record updates during recording
+    std::mutex renderRecordMutex;
+
+    // (Descriptor API wrappers were considered but avoided here to keep RAII types intact.)
+
+    // Upload perf getters
+  public:
+    uint64_t GetBytesUploadedTotal() const { // returns the current value of the bytesUploadedTotal counter
+      return bytesUploadedTotal.load(std::memory_order_relaxed); // relaxed load: no ordering with other memory is requested
+    }
+    double GetAverageUploadMs() const {
+      // Mean upload duration in milliseconds (totalUploadNs / uploadCount); 0.0 while uploadCount is still zero.
+      const uint64_t elapsedNs = totalUploadNs.load(std::memory_order_relaxed);
+      const uint32_t samples = uploadCount.load(std::memory_order_relaxed);
+      const double elapsedMs = static_cast<double>(elapsedNs) / 1e6;
+      return samples == 0 ? 0.0 : elapsedMs / static_cast<double>(samples);
+    }
+    double GetUploadThroughputMBps() const {
+      // bytesUploadedTotal in MiB (1024 * 1024 bytes) per second elapsed since uploadWindowStartNs; 0.0 if that start is unset (0) or not in the past.
+      const uint64_t windowStartNs = uploadWindowStartNs.load(std::memory_order_relaxed);
+      if (windowStartNs == 0)
+        return 0.0;
+      const auto currentNs = static_cast<uint64_t>(std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now().time_since_epoch()).count());
+      if (currentNs <= windowStartNs)
+        return 0.0;
+      const double elapsedSec = static_cast<double>(currentNs - windowStartNs) / 1e9;
+      const double uploadedMiB = static_cast<double>(bytesUploadedTotal.load(std::memory_order_relaxed)) / (1024.0 * 1024.0);
+      return elapsedSec > 0.0 ? (uploadedMiB / elapsedSec) : 0.0;
+    }
+    
+    // Track the last timeline value associated with a critical upload
+    std::atomic<uint64_t> lastCriticalUploadValue{0};
+
+    // Track frames since loading complete for gradual startup
+    uint32_t framesSinceLoadingComplete = 0;
+    std::atomic<bool> pauseBackgroundUploads{false};
+};
\ No newline at end of file
diff --git a/attachments/advanced_gltf/renderer_advanced_types.h b/attachments/advanced_gltf/renderer_advanced_types.h
new file mode 100644
index 000000000..4ad70dbf0
--- /dev/null
+++ b/attachments/advanced_gltf/renderer_advanced_types.h
@@ -0,0 +1,204 @@
+#pragma once
+#include <vulkan/vulkan_raii.hpp>
+#include <glm/glm.hpp>
+#include <glm/gtc/quaternion.hpp>
+#include <vector>
+#include <unordered_map>
+#include <shared_mutex>
+#include <mutex>
+#include <memory>
+#include <string>
+
+// Forward declarations
+class Renderer;
+class Entity;
+class Model;
+class MeshComponent;
+namespace tinygltf { class Model; }
+#include "memory_pool.h"
+
+struct OutputVertex { // NOTE(review): presumably the per-vertex record stored in outputVertexBuffer — confirm against the skinning shader before changing the layout
+    glm::vec3 position;
+    glm::vec3 normal;
+    glm::vec2 texcoord;
+    glm::vec4 tangent; // vec4 rather than vec3; presumably w carries the handedness sign (glTF convention) — TODO confirm
+};
+
+struct SkinPushConstants { // 212 bytes with 4-byte members and no padding. NOTE(review): exceeds the 128-byte minimum Vulkan guarantees for maxPushConstantsSize — confirm the device limit is checked
+    uint32_t vertexCount;
+    uint32_t morphIndices[24]; // same length as MorphWeightBlock::weights
+    struct MorphWeightBlock {
+        float weights[24];
+        uint32_t activeCount; // presumably how many of the 24 slots are in use — TODO confirm against the shader
+        uint32_t applySkinning; // 1 = apply skeletal skinning to the position, 0 = morph only
+        uint32_t pad[2]; // brings the block to 112 bytes, a multiple of 16
+    } morphWeights;
+};
+
+struct AdvancedRendererState { // extension state stored per Renderer in g_rendererStates; handles start out null
+    vk::raii::DescriptorSetLayout skinDescriptorSetLayout{nullptr};
+    vk::raii::DescriptorSetLayout morphDescriptorSetLayout{nullptr};
+    vk::raii::DescriptorSet dummyMorphDescriptorSet{nullptr};
+    vk::raii::PipelineLayout skinPipelineLayout{nullptr};
+    vk::raii::Pipeline skinPipeline{nullptr};
+
+    // BLAS index recorded for each mesh component, consulted when refitting
+    std::unordered_map<MeshComponent*, uint32_t> meshToBLAS;
+};
+
+struct AdvancedEntityResources { // stored in g_meshAdvancedResources (keyed by MeshComponent*); every handle starts null
+    bool isDeformable{false};
+    vk::raii::Buffer outputVertexBuffer{nullptr};
+    std::unique_ptr<MemoryPool::Allocation> outputVertexBufferAllocation{nullptr};
+    vk::raii::Buffer jointMatricesBuffer{nullptr};
+    std::unique_ptr<MemoryPool::Allocation> jointMatricesBufferAllocation{nullptr};
+
+    // Staging allocations, kept around so repeated uploads avoid redundant OS-level allocations
+    std::unique_ptr<MemoryPool::Allocation> stagingVertexBufferAllocation{nullptr};
+    std::unique_ptr<MemoryPool::Allocation> stagingIndexBufferAllocation{nullptr};
+
+    // Skinning inputs on the GPU: joint indices and joint weights, each with a staging twin and a byte size
+    vk::raii::Buffer jointIndicesBuffer{nullptr};
+    std::unique_ptr<MemoryPool::Allocation> jointIndicesBufferAllocation{nullptr};
+    vk::raii::Buffer jointWeightsBuffer{nullptr};
+    std::unique_ptr<MemoryPool::Allocation> jointWeightsBufferAllocation{nullptr};
+    vk::raii::Buffer stagingJointIndicesBuffer{nullptr};
+    std::unique_ptr<MemoryPool::Allocation> stagingJointIndicesAllocation{nullptr};
+    vk::raii::Buffer stagingJointWeightsBuffer{nullptr};
+    std::unique_ptr<MemoryPool::Allocation> stagingJointWeightsAllocation{nullptr};
+    vk::DeviceSize jointIndicesSize{0};
+    vk::DeviceSize jointWeightsSize{0};
+
+    // Morph target buffers on the GPU, with matching staging buffers and sizes
+    std::vector<vk::raii::Buffer> morphTargetBuffers;
+    std::vector<std::unique_ptr<MemoryPool::Allocation>> morphTargetBufferAllocations;
+    std::vector<vk::raii::Buffer> stagingMorphTargetBuffers;
+    std::vector<std::unique_ptr<MemoryPool::Allocation>> stagingMorphTargetAllocations;
+    std::vector<vk::DeviceSize> morphTargetSizes;
+
+    std::vector<vk::raii::DescriptorSet> skinDescriptorSets;
+    std::vector<vk::raii::DescriptorSet> morphDescriptorSets;
+
+    // Scratch memory used when refitting the BLAS
+    vk::raii::Buffer blasScratchBuffer{nullptr};
+    std::unique_ptr<MemoryPool::Allocation> blasScratchBufferAllocation{nullptr};
+    vk::DeviceSize blasScratchBufferSize{0};
+
+    // Flags cached for TLAS optimization
+    bool isEnvironment{false};
+    bool isEnvironmentChecked{false};
+    bool isGeometryChecked{false};
+};
+
+struct Skin { // one skin record; AdvancedModelData::skins holds a list of these
+    std::string name;
+    int skeletonRoot{-1}; // -1 until assigned; presumably a node index — TODO confirm
+    std::vector<int> joints;
+    std::vector<glm::mat4> inverseBindMatrices;
+};
+
+struct AdvancedModelData { // per-Model extension data, stored in g_modelData
+    bool isDeformable{false};
+    int skinIndex{-1}; // -1 until assigned; presumably an index into `skins` — TODO confirm
+    std::vector<Skin> skins;
+    std::unordered_map<int, std::vector<int>> nodeChildren; // int-keyed, like every per-node map below
+    std::unordered_map<int, glm::mat4> nodeLocalTransforms;
+    std::unordered_map<int, glm::vec3> nodeLocalTranslations;
+    std::unordered_map<int, glm::quat> nodeLocalRotations;
+    std::unordered_map<int, glm::vec3> nodeLocalScales;
+    std::vector<int> rootNodes;
+    std::unordered_map<int, int> nodeSkins;
+};
+
+struct AdvancedAnimationState { // per-AnimationComponent extension state, stored in g_animationAdvancedStates
+    std::unordered_map<int, std::vector<int>> nodeChildren; // field names match the AnimationComponent_SetHierarchy parameters; presumably filled by it — confirm
+    std::unordered_map<int, glm::mat4> initialLocalTransforms;
+    std::unordered_map<int, glm::vec3> initialLocalTranslations;
+    std::unordered_map<int, glm::quat> initialLocalRotations;
+    std::unordered_map<int, glm::vec3> initialLocalScales;
+    std::vector<int> rootNodes;
+};
+
+struct AdvancedMeshComponentData { // per-MeshComponent extension data, stored in g_meshComponentData
+    bool isDeformable{false};
+    int numMorphTargets{0};
+    std::vector<int> joints;
+    std::vector<glm::mat4> inverseBindMatrices;
+    std::vector<glm::mat4> jointMatrices;
+    std::vector<float> morphWeights;
+    std::vector<std::vector<glm::vec3>> morphTargetPositions;
+
+    // Skinning data held as two parallel arrays (indices and weights)
+    std::vector<glm::uvec4> jointIndices;
+    std::vector<glm::vec4> jointWeights;
+};
+
+extern std::unordered_map<const Renderer*, AdvancedRendererState> g_rendererStates; // one AdvancedRendererState per Renderer
+extern std::unordered_map<const void*, AdvancedEntityResources> g_meshAdvancedResources; // Keyed by MeshComponent*
+extern std::unordered_map<const Model*, AdvancedModelData> g_modelData;
+extern std::unordered_map<const class AnimationComponent*, AdvancedAnimationState> g_animationAdvancedStates;
+extern std::unordered_map<const class MeshComponent*, AdvancedMeshComponentData> g_meshComponentData;
+extern std::unordered_map<const void*, bool> g_materialMeshDeformable; // Keyed by MaterialMesh*
+extern std::unordered_map<const void*, std::vector<glm::uvec4>> g_materialMeshJointIndices; // presumably keyed by MaterialMesh* as well — confirm
+extern std::unordered_map<const void*, std::vector<glm::vec4>> g_materialMeshJointWeights; // presumably keyed by MaterialMesh* as well — confirm
+extern std::unordered_map<const void*, int> g_materialMeshMorphTargetCount; // presumably keyed by MaterialMesh* as well — confirm
+extern std::unordered_map<const void*, std::vector<std::vector<glm::vec3>>> g_materialMeshMorphPositions; // presumably keyed by MaterialMesh* as well — confirm
+extern std::shared_mutex g_advancedStateMutex; // NOTE(review): presumably guards the g_* maps above — confirm at the definition site
+
+// Global pointer for tracking the last spawned ball to optimize camera tracking and avoid O(N) string searches
+extern Entity* g_lastSpawnedBall;
+
+// Mark/query entities whose transform is owned by the physics system (e.g. a thrown Fox).
+// While owned, the animation system must not overwrite the entity transform.
+void SetEntityPhysicsOwned(const class Entity* entity, bool owned);
+bool IsEntityPhysicsOwned(const class Entity* entity);
+
+// Extension functions for Engine
+std::vector<class Entity*> SnapshotEntities(const class Engine* engine);
+
+// Extension functions for Entity
+std::recursive_mutex& GetEntityMutex(const class Entity* entity);
+
+// Extension functions for Renderer
+bool AdvancedRenderer_createSkinningResources(Renderer* renderer);
+void AdvancedRenderer_updateSkins(Renderer* renderer, vk::raii::CommandBuffer& cmd, uint32_t frameIndex, const std::vector<Entity*>& entities);
+void AdvancedRenderer_Cleanup(Renderer* renderer);
+void AdvancedRenderer_KickWatchdog(Renderer* renderer);
+
+// Extension functions for Model
+AdvancedModelData& GetAdvancedModelData(const Model* model);
+void AdvancedModel_ProcessSkins(class ModelLoader* loader, const tinygltf::Model& gltfModel, Model* model);
+
+// Extension functions for MeshComponent
+void SetMeshComponentDeformable(class MeshComponent* mesh, bool deformable);
+void SetMeshComponentMorphTargets(class MeshComponent* mesh, int numTargets);
+int GetMeshComponentMorphTargets(const class MeshComponent* mesh);
+void SetMeshComponentMorphWeights(class MeshComponent* mesh, const std::vector<float>& weights);
+const std::vector<float>& GetMeshComponentMorphWeights(const class MeshComponent* mesh);
+bool IsMeshComponentDeformable(const class MeshComponent* mesh);
+void SetMeshComponentSkinData(class MeshComponent* mesh, const std::vector<int>& joints, const std::vector<glm::mat4>& inverseBindMatrices);
+void SetMeshComponentJointMatrices(class MeshComponent* mesh, const std::vector<glm::mat4>& matrices);
+const std::vector<glm::mat4>& GetMeshComponentJointMatrices(const class MeshComponent* mesh);
+void SetMeshComponentJointsAndWeights(class MeshComponent* mesh, const std::vector<glm::uvec4>& joints, const std::vector<glm::vec4>& weights);
+void SetMeshComponentMorphPositions(class MeshComponent* mesh, const std::vector<std::vector<glm::vec3>>& positions);
+void SetMeshComponentEnvironment(class MeshComponent* mesh, bool isEnvironment);
+
+// Extension functions for MaterialMesh
+void SetMaterialMeshDeformable(const void* materialMesh, bool deformable);
+bool IsMaterialMeshDeformable(const void* materialMesh);
+void SetMaterialMeshJointsAndWeights(const void* materialMesh, const std::vector<glm::uvec4>& joints, const std::vector<glm::vec4>& weights);
+const std::vector<glm::uvec4>& GetMaterialMeshJoints(const void* materialMesh);
+const std::vector<glm::vec4>& GetMaterialMeshWeights(const void* materialMesh);
+int GetMaterialMeshMorphTargetCount(const void* materialMesh);
+void SetMaterialMeshMorphTargetCount(const void* materialMesh, int count);
+void SetMaterialMeshMorphPositions(const void* materialMesh, const std::vector<std::vector<glm::vec3>>& positions);
+const std::vector<std::vector<glm::vec3>>& GetMaterialMeshMorphPositions(const void* materialMesh);
+
+// Extension functions for AnimationComponent
+void AnimationComponent_SetHierarchy(class AnimationComponent* anim,
+                                    const std::unordered_map<int, std::vector<int>>& nodeChildren,
+                                    const std::unordered_map<int, glm::mat4>& initialLocalTransforms,
+                                    const std::unordered_map<int, glm::vec3>& initialLocalTranslations,
+                                    const std::unordered_map<int, glm::quat>& initialLocalRotations,
+                                    const std::unordered_map<int, glm::vec3>& initialLocalScales,
+                                    const std::vector<int>& rootNodes);
diff --git a/attachments/advanced_gltf/renderer_compute.cpp b/attachments/advanced_gltf/renderer_compute.cpp
new file mode 100644
index 000000000..9a595e777
--- /dev/null
+++ b/attachments/advanced_gltf/renderer_compute.cpp
@@ -0,0 +1,565 @@
+/* Copyright (c) 2025 Holochip Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 the "License";
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "renderer.h"
+#include <array>
+#include <fstream>
+#include <iostream>
+
+// This file contains compute-related methods from the Renderer class
+
+// Creates the compute pipeline for shaders/hrtf.spv with its set layout, pipeline layout, descriptor pool and command pool; returns false (logging to stderr) if a step throws
+bool Renderer::createComputePipeline() {
+  try {
+    // Load the compiled compute shader (SPIR-V) from disk
+    auto computeShaderCode = readFile("shaders/hrtf.spv");
+
+    // Wrap the code in a shader module; it is local to this try block and destroyed when the block exits
+    vk::raii::ShaderModule computeShaderModule = createShaderModule(computeShaderCode);
+
+    // Compute shader stage using the module's "main" entry point
+    vk::PipelineShaderStageCreateInfo computeShaderStageInfo{
+      .stage = vk::ShaderStageFlagBits::eCompute,
+      .module = *computeShaderModule,
+      .pName = "main"
+    };
+
+    // Descriptor set layout: bindings 0-2 are storage buffers, binding 3 is a uniform buffer; all are compute-stage only
+    std::array<vk::DescriptorSetLayoutBinding, 4> computeBindings = {
+      vk::DescriptorSetLayoutBinding{
+        .binding = 0,
+        .descriptorType = vk::DescriptorType::eStorageBuffer,
+        .descriptorCount = 1,
+        .stageFlags = vk::ShaderStageFlagBits::eCompute,
+        .pImmutableSamplers = nullptr
+      },
+      vk::DescriptorSetLayoutBinding{
+        .binding = 1,
+        .descriptorType = vk::DescriptorType::eStorageBuffer,
+        .descriptorCount = 1,
+        .stageFlags = vk::ShaderStageFlagBits::eCompute,
+        .pImmutableSamplers = nullptr
+      },
+      vk::DescriptorSetLayoutBinding{
+        .binding = 2,
+        .descriptorType = vk::DescriptorType::eStorageBuffer,
+        .descriptorCount = 1,
+        .stageFlags = vk::ShaderStageFlagBits::eCompute,
+        .pImmutableSamplers = nullptr
+      },
+      vk::DescriptorSetLayoutBinding{
+        .binding = 3,
+        .descriptorType = vk::DescriptorType::eUniformBuffer,
+        .descriptorCount = 1,
+        .stageFlags = vk::ShaderStageFlagBits::eCompute,
+        .pImmutableSamplers = nullptr
+      }
+    };
+
+    vk::DescriptorSetLayoutCreateInfo computeLayoutInfo{
+      .bindingCount = static_cast<uint32_t>(computeBindings.size()),
+      .pBindings = computeBindings.data()
+    };
+
+    computeDescriptorSetLayout = vk::raii::DescriptorSetLayout(device, computeLayoutInfo);
+
+    // Pipeline layout: the single set layout created above, no push constants
+    vk::PipelineLayoutCreateInfo pipelineLayoutInfo{
+      .setLayoutCount = 1,
+      .pSetLayouts = &*computeDescriptorSetLayout,
+      .pushConstantRangeCount = 0,
+      .pPushConstantRanges = nullptr
+    };
+
+    computePipelineLayout = vk::raii::PipelineLayout(device, pipelineLayoutInfo);
+
+    // Compute pipeline from the stage and layout above
+    vk::ComputePipelineCreateInfo pipelineInfo{
+      .stage = computeShaderStageInfo,
+      .layout = *computePipelineLayout
+    };
+
+    computePipeline = vk::raii::Pipeline(device, nullptr, pipelineInfo); // nullptr: no pipeline cache
+
+    // Compute descriptor pool, sized per frame in flight; createForwardPlusPipelinesAndResources also allocates from it
+    std::array<vk::DescriptorPoolSize, 2> poolSizes = {
+      vk::DescriptorPoolSize{
+        .type = vk::DescriptorType::eStorageBuffer,
+        .descriptorCount = 6u * MAX_FRAMES_IN_FLIGHT // room for multiple compute pipelines
+      },
+      vk::DescriptorPoolSize{
+        .type = vk::DescriptorType::eUniformBuffer,
+        .descriptorCount = 2u * MAX_FRAMES_IN_FLIGHT
+      }
+    };
+
+    vk::DescriptorPoolCreateInfo poolInfo{
+      .flags = vk::DescriptorPoolCreateFlagBits::eFreeDescriptorSet, // lets individual sets be freed back to the pool
+      .maxSets = 2u * MAX_FRAMES_IN_FLIGHT,
+      .poolSizeCount = static_cast<uint32_t>(poolSizes.size()),
+      .pPoolSizes = poolSizes.data()
+    };
+
+    computeDescriptorPool = vk::raii::DescriptorPool(device, poolInfo);
+
+    return createComputeCommandPool(); // final step; its result becomes the return value
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create compute pipeline: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Forward+ compute (tiled light culling)
+bool Renderer::createForwardPlusPipelinesAndResources() {
+  // Builds the Forward+ light-culling compute pipeline, allocates one descriptor set per
+  // frame in flight from computeDescriptorPool, then sizes the tile buffers for the swapchain.
+  // Returns false (after logging to stderr) if anything below throws.
+  try {
+    // Load the culling compute shader (SPIR-V) and wrap it in a shader module
+    const auto cullShaderCode = readFile("shaders/forward_plus_cull.spv");
+    vk::raii::ShaderModule cullShader = createShaderModule(cullShaderCode);
+
+    // Set layout, all compute-stage: 0=lights SSBO (RO), 1=tile headers SSBO (RW),
+    // 2=tile indices SSBO (RW), 3=params UBO (RO)
+    const auto computeBinding = [](uint32_t slot, vk::DescriptorType kind) {
+      return vk::DescriptorSetLayoutBinding{.binding = slot, .descriptorType = kind, .descriptorCount = 1, .stageFlags = vk::ShaderStageFlagBits::eCompute};
+    };
+    const std::array<vk::DescriptorSetLayoutBinding, 4> cullBindings = {
+      computeBinding(0, vk::DescriptorType::eStorageBuffer),
+      computeBinding(1, vk::DescriptorType::eStorageBuffer),
+      computeBinding(2, vk::DescriptorType::eStorageBuffer),
+      computeBinding(3, vk::DescriptorType::eUniformBuffer)
+    };
+    forwardPlusDescriptorSetLayout = vk::raii::DescriptorSetLayout(
+      device, vk::DescriptorSetLayoutCreateInfo{.bindingCount = static_cast<uint32_t>(cullBindings.size()), .pBindings = cullBindings.data()});
+
+    // Pipeline layout: a single descriptor set, no push constants
+    const vk::DescriptorSetLayout cullSetLayout = *forwardPlusDescriptorSetLayout;
+    forwardPlusPipelineLayout = vk::raii::PipelineLayout(device, vk::PipelineLayoutCreateInfo{.setLayoutCount = 1, .pSetLayouts = &cullSetLayout});
+
+    // Compute pipeline using the shader's "main" entry point
+    const vk::PipelineShaderStageCreateInfo cullStage{.stage = vk::ShaderStageFlagBits::eCompute, .module = *cullShader, .pName = "main"};
+    forwardPlusPipeline = vk::raii::Pipeline(device, nullptr, vk::ComputePipelineCreateInfo{.stage = cullStage, .layout = *forwardPlusPipelineLayout});
+
+    // One per-frame record and one descriptor set (from computeDescriptorPool) per frame in flight
+    forwardPlusPerFrame.resize(MAX_FRAMES_IN_FLIGHT);
+    const std::vector<vk::DescriptorSetLayout> perFrameLayouts(MAX_FRAMES_IN_FLIGHT, cullSetLayout);
+    const vk::DescriptorSetAllocateInfo setAllocInfo{.descriptorPool = *computeDescriptorPool, .descriptorSetCount = MAX_FRAMES_IN_FLIGHT, .pSetLayouts = perFrameLayouts.data()};
+    vk::raii::DescriptorSets allocatedSets(device, setAllocInfo);
+    for (size_t frame = 0; frame < MAX_FRAMES_IN_FLIGHT; ++frame) {
+      forwardPlusPerFrame[frame].computeSet = std::move(allocatedSets[frame]);
+    }
+
+    // Size the tile buffers for the current swapchain extent (ceil-divide by the tile size);
+    // the resize call runs with its default updateOnlyCurrentFrame=false and reports success itself.
+    const uint32_t tileCountX = (swapChainExtent.width + forwardPlusTileSizeX - 1) / forwardPlusTileSizeX;
+    const uint32_t tileCountY = (swapChainExtent.height + forwardPlusTileSizeY - 1) / forwardPlusTileSizeY;
+    return createOrResizeForwardPlusBuffers(tileCountX, tileCountY, forwardPlusSlicesZ);
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create Forward+ compute resources: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Creates or grows the per-frame Forward+ buffers (tile headers, tile light indices, params UBO)
+// so they can hold tilesX * tilesY * slicesZ clusters, then rewrites the descriptors that
+// reference them (the compute set and bindings 7/8 of every entity's PBR set).
+//
+//   tilesX, tilesY, slicesZ  - cluster grid dimensions.
+//   updateOnlyCurrentFrame   - when true only the slot for `currentFrame` is touched; otherwise
+//                              every frame-in-flight slot is processed.
+//
+// Returns true on success (including when there was nothing to do) and false if buffer
+// creation or the compute descriptor update throws. Failures while updating the PBR sets are
+// logged but do not change the return value.
+bool Renderer::createOrResizeForwardPlusBuffers(uint32_t tilesX, uint32_t tilesY, uint32_t slicesZ, bool updateOnlyCurrentFrame) {
+  try {
+    size_t clusters = static_cast<size_t>(tilesX) * static_cast<size_t>(tilesY) * static_cast<size_t>(slicesZ);
+    size_t indices = clusters * static_cast<size_t>(MAX_LIGHTS_PER_TILE);
+
+    // Range of frames to touch this call
+    size_t beginFrame = 0;
+    size_t endFrame = MAX_FRAMES_IN_FLIGHT;
+    if (updateOnlyCurrentFrame) {
+      beginFrame = static_cast<size_t>(currentFrame);
+      endFrame = beginFrame + 1;
+    }
+    // Never index past the per-frame array (e.g. if it has not been sized yet). The PBR loop
+    // further down already applies the same bound.
+    if (endFrame > forwardPlusPerFrame.size()) {
+      endFrame = forwardPlusPerFrame.size();
+    }
+
+    for (size_t i = beginFrame; i < endFrame; ++i) {
+      auto& f = forwardPlusPerFrame[i];
+      // A buffer is (re)created when it does not exist yet or is too small; buffers never shrink.
+      bool needTiles = (f.tilesCapacity < clusters) || (!*f.tileHeaders);
+      bool needIdx = (f.indicesCapacity < indices) || (!*f.tileLightIndices);
+
+      if (needTiles) {
+        if (!!*f.tileHeaders) {
+          f.tileHeaders = vk::raii::Buffer(nullptr);
+          f.tileHeadersAlloc.reset();
+        }
+        auto [buf, alloc] = createBufferPooled(clusters * sizeof(TileHeader), vk::BufferUsageFlagBits::eStorageBuffer, vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+        f.tileHeaders = std::move(buf);
+        f.tileHeadersAlloc = std::move(alloc);
+        f.tilesCapacity = clusters;
+        // Initialize headers to zero so that count==0 when Forward+ is disabled or before first dispatch
+        if (!!f.tileHeadersAlloc && f.tileHeadersAlloc->mappedPtr) {
+          std::memset(f.tileHeadersAlloc->mappedPtr, 0, clusters * sizeof(TileHeader));
+        }
+      }
+      if (needIdx) {
+        if (!!*f.tileLightIndices) {
+          f.tileLightIndices = vk::raii::Buffer(nullptr);
+          f.tileLightIndicesAlloc.reset();
+        }
+        auto [buf, alloc] = createBufferPooled(indices * sizeof(uint32_t), vk::BufferUsageFlagBits::eStorageBuffer, vk::MemoryPropertyFlagBits::eDeviceLocal | vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+        f.tileLightIndices = std::move(buf);
+        f.tileLightIndicesAlloc = std::move(alloc);
+        f.indicesCapacity = indices;
+        // Initialize indices to zero to avoid stray reads
+        if (!!f.tileLightIndicesAlloc && f.tileLightIndicesAlloc->mappedPtr) {
+          std::memset(f.tileLightIndicesAlloc->mappedPtr, 0, indices * sizeof(uint32_t));
+        }
+      }
+      if (!*f.params) {
+        // Size must match the ParamsCPU struct written by updateForwardPlusParams():
+        // two mat4 followed by three 16-byte vectors.
+        auto [pbuf, palloc] = createBufferPooled(sizeof(glm::mat4) * 2 + sizeof(glm::vec4) * 3, vk::BufferUsageFlagBits::eUniformBuffer, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+        f.params = std::move(pbuf);
+        f.paramsAlloc = std::move(palloc);
+        // Guard against a missing allocation, matching the checks on the tile buffers above.
+        f.paramsMapped = !!f.paramsAlloc ? f.paramsAlloc->mappedPtr : nullptr;
+      }
+
+      // Update compute descriptor set writes for this frame
+      if (!!*forwardPlusPerFrame[i].computeSet) {
+        if (!descriptorSetsValid.load(std::memory_order_relaxed)) {
+          // Descriptor sets are being recreated; skip writes this iteration
+          continue;
+        }
+        if (isRecordingCmd.load(std::memory_order_relaxed)) {
+          // Avoid update-after-bind while a command buffer is recording
+          continue;
+        }
+        // NOTE(review): f.params is created just above when missing, so the last term is
+        // normally true and the writes are rebuilt on every call that reaches this point.
+        // That also re-applies writes that an earlier call skipped via the `continue`s above.
+        if (needTiles || needIdx || !!*f.params) {
+          // Build writes conditionally to avoid dereferencing uninitialized light buffers
+          std::vector<vk::WriteDescriptorSet> writes;
+
+          // Binding 0: lights SSBO (only if available)
+          bool haveLightBuffer = (i < lightStorageBuffers.size()) && !!*lightStorageBuffers[i].buffer;
+          vk::DescriptorBufferInfo lightsInfo{};
+          if (haveLightBuffer) {
+            lightsInfo = vk::DescriptorBufferInfo{.buffer = *lightStorageBuffers[i].buffer, .offset = 0, .range = VK_WHOLE_SIZE};
+            writes.push_back(vk::WriteDescriptorSet{.dstSet = *forwardPlusPerFrame[i].computeSet, .dstBinding = 0, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &lightsInfo});
+          }
+
+          // Binding 1: tile headers
+          vk::DescriptorBufferInfo headersInfo{.buffer = *f.tileHeaders, .offset = 0, .range = VK_WHOLE_SIZE};
+          writes.push_back(vk::WriteDescriptorSet{.dstSet = *forwardPlusPerFrame[i].computeSet, .dstBinding = 1, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &headersInfo});
+
+          // Binding 2: tile indices
+          vk::DescriptorBufferInfo indicesInfo{.buffer = *f.tileLightIndices, .offset = 0, .range = VK_WHOLE_SIZE};
+          writes.push_back(vk::WriteDescriptorSet{.dstSet = *forwardPlusPerFrame[i].computeSet, .dstBinding = 2, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &indicesInfo});
+
+          // Binding 3: params UBO
+          vk::DescriptorBufferInfo paramsInfo{.buffer = *f.params, .offset = 0, .range = VK_WHOLE_SIZE};
+          writes.push_back(vk::WriteDescriptorSet{.dstSet = *forwardPlusPerFrame[i].computeSet, .dstBinding = 3, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eUniformBuffer, .pBufferInfo = &paramsInfo});
+
+          if (!writes.empty()) {
+            std::lock_guard<std::mutex> lk(descriptorMutex);
+            device.updateDescriptorSets(writes, {});
+          }
+        }
+      }
+    }
+
+    // Update PBR descriptor sets to bind new tile buffers for forward shading.
+    // Avoid updating sets that may be in use by in-flight command buffers.
+    // If updateOnlyCurrentFrame=true, only update the current frame's sets (safe point after fence wait).
+    try {
+      // Frame range for the PBR sets (bindings 7/8):
+      //  - every frame slot when called from initialization/device-idle paths
+      //    (updateOnlyCurrentFrame=false), or
+      //  - only the current frame's slot otherwise
+      size_t beginFrameSets = 0;
+      size_t endFrameSets = forwardPlusPerFrame.size();
+      if (updateOnlyCurrentFrame) {
+        beginFrameSets = static_cast<size_t>(currentFrame);
+        endFrameSets = beginFrameSets + 1;
+      }
+
+      for (auto& kv : entityResources) {
+        auto& resources = kv.second;
+        if (resources.pbrDescriptorSets.empty())
+          continue;
+        for (size_t i = beginFrameSets; i < endFrameSets && i < resources.pbrDescriptorSets.size() && i < forwardPlusPerFrame.size(); ++i) {
+          if (!descriptorSetsValid.load(std::memory_order_relaxed))
+            continue;
+          if (isRecordingCmd.load(std::memory_order_relaxed))
+            continue;
+          if (!(*resources.pbrDescriptorSets[i]))
+            continue;
+          auto& f = forwardPlusPerFrame[i];
+          if (!*f.tileHeaders || !*f.tileLightIndices)
+            continue;
+          vk::DescriptorBufferInfo headersInfo{.buffer = *f.tileHeaders, .offset = 0, .range = VK_WHOLE_SIZE};
+          vk::DescriptorBufferInfo indicesInfo{.buffer = *f.tileLightIndices, .offset = 0, .range = VK_WHOLE_SIZE};
+          std::array<vk::WriteDescriptorSet, 2> writes = {
+            vk::WriteDescriptorSet{.dstSet = *resources.pbrDescriptorSets[i], .dstBinding = 7, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &headersInfo},
+            vk::WriteDescriptorSet{.dstSet = *resources.pbrDescriptorSets[i], .dstBinding = 8, .dstArrayElement = 0, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &indicesInfo}
+          };
+          {
+            std::lock_guard<std::mutex> lk(descriptorMutex);
+            device.updateDescriptorSets(writes, {});
+          }
+        }
+      }
+    } catch (const std::exception& e) {
+      // Best-effort step: the buffers themselves are valid, so report the problem instead of
+      // failing the whole call - but do not hide it.
+      std::cerr << "Failed to update PBR descriptor sets with Forward+ tile buffers: " << e.what() << std::endl;
+    } catch (...) {
+      std::cerr << "Failed to update PBR descriptor sets with Forward+ tile buffers: unknown exception" << std::endl;
+    }
+
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create/resize Forward+ buffers: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Writes the Forward+ culling parameters for `frameIndex` into that frame's mapped params buffer.
+//
+//   frameIndex        - slot in forwardPlusPerFrame to update.
+//   view, proj        - camera matrices copied verbatim.
+//   lightCount        - stored in counts.x.
+//   tilesX, tilesY    - stored in counts.z / counts.w.
+//   slicesZ           - stored (as float) in zParams.z.
+//   nearZ, farZ       - stored in zParams.x / zParams.y.
+//
+// Does nothing when frameIndex is out of range or the frame has no mapped params pointer.
+void Renderer::updateForwardPlusParams(uint32_t frameIndex, const glm::mat4& view, const glm::mat4& proj, uint32_t lightCount, uint32_t tilesX, uint32_t tilesY, uint32_t slicesZ, float nearZ, float farZ) {
+  if (frameIndex >= forwardPlusPerFrame.size())
+    return;
+  auto& f = forwardPlusPerFrame[frameIndex];
+  if (!f.paramsMapped)
+    return;
+
+  // Pack: [view][proj][screen xy, tile xy][lightCount, maxPerTile, tilesX, tilesY][near, far, slicesZ, 0]
+  struct ParamsCPU {
+    glm::mat4 view;
+    glm::mat4 proj;
+    glm::vec4 screenTile; // x=width,y=height,z=tileX,w=tileY
+    glm::uvec4 counts; // x=lightCount,y=maxPerTile,z=tilesX,w=tilesY
+    glm::vec4 zParams; // x=nearZ,y=farZ,z=slicesZ,w=0
+  };
+  // The params buffer is allocated in createOrResizeForwardPlusBuffers() with exactly this many
+  // bytes; fail the build if the struct ever outgrows that allocation.
+  static_assert(sizeof(ParamsCPU) == sizeof(glm::mat4) * 2 + sizeof(glm::vec4) * 3,
+                "ParamsCPU must match the size of the Forward+ params uniform buffer");
+
+  ParamsCPU p{};
+  p.view = view;
+  p.proj = proj;
+  p.screenTile = glm::vec4(static_cast<float>(swapChainExtent.width), static_cast<float>(swapChainExtent.height), static_cast<float>(forwardPlusTileSizeX), static_cast<float>(forwardPlusTileSizeY));
+  p.counts = glm::uvec4(lightCount, MAX_LIGHTS_PER_TILE, tilesX, tilesY);
+  p.zParams = glm::vec4(nearZ, farZ, static_cast<float>(slicesZ), 0.0f);
+
+  // Write through the pointer that was null-checked above rather than re-deriving it from
+  // f.paramsAlloc, which this function never validates.
+  std::memcpy(f.paramsMapped, &p, sizeof(ParamsCPU));
+}
+
+// Records the Forward+ light-culling dispatch for the current frame into `cmd`, followed by a
+// compute -> fragment memory barrier. Records nothing when the pipeline, the frame's compute
+// descriptor set, or the frame's light storage buffer is missing.
+void Renderer::dispatchForwardPlus(vk::raii::CommandBuffer& cmd, uint32_t tilesX, uint32_t tilesY, uint32_t slicesZ) {
+  if (!*forwardPlusPipeline || currentFrame >= forwardPlusPerFrame.size())
+    return;
+
+  auto& frame = forwardPlusPerFrame[currentFrame];
+  if (!*frame.computeSet)
+    return;
+
+  // Without a valid lights buffer for this frame the compute pass is skipped entirely
+  if (currentFrame >= lightStorageBuffers.size() || !*lightStorageBuffers[currentFrame].buffer)
+    return;
+
+  cmd.bindPipeline(vk::PipelineBindPoint::eCompute, *forwardPlusPipeline);
+
+  vk::DescriptorSet cullingSet = *frame.computeSet;
+  cmd.bindDescriptorSets(vk::PipelineBindPoint::eCompute, *forwardPlusPipelineLayout, 0, cullingSet, {});
+
+  // Workgroup grid is tilesX x tilesY x slicesZ
+  cmd.dispatch(tilesX, tilesY, slicesZ);
+
+  // Synchronization2 barrier: shader writes from the compute stage must be visible to shader
+  // reads in the fragment stage
+  vk::MemoryBarrier2 computeToFragment{};
+  computeToFragment.srcStageMask = vk::PipelineStageFlagBits2::eComputeShader;
+  computeToFragment.srcAccessMask = vk::AccessFlagBits2::eShaderWrite;
+  computeToFragment.dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader;
+  computeToFragment.dstAccessMask = vk::AccessFlagBits2::eShaderRead;
+
+  vk::DependencyInfo dependency{};
+  dependency.memoryBarrierCount = 1;
+  dependency.pMemoryBarriers = &computeToFragment;
+  cmd.pipelineBarrier2(dependency);
+}
+
+// Points binding 0 (lights SSBO) of the given frame's Forward+ compute descriptor set at that
+// frame's light storage buffer. Silently returns when either object is missing or while a
+// command buffer is being recorded; exceptions are logged, not propagated.
+void Renderer::refreshForwardPlusComputeLightsBindingForFrame(uint32_t frameIndex) {
+  try {
+    // Both the compute set and the light buffer must exist for this frame
+    if (frameIndex >= forwardPlusPerFrame.size() || !*forwardPlusPerFrame[frameIndex].computeSet)
+      return;
+    if (frameIndex >= lightStorageBuffers.size() || !*lightStorageBuffers[frameIndex].buffer)
+      return;
+
+    // Writing this descriptor while a command buffer is recording produced validation errors
+    // and invalidated the command buffer, so the update is skipped in that state. The binding
+    // written earlier at a safe point remains in effect.
+    const bool recording = isRecordingCmd.load(std::memory_order_relaxed);
+    if (recording)
+      return;
+
+    vk::DescriptorBufferInfo lightBufferInfo{};
+    lightBufferInfo.buffer = *lightStorageBuffers[frameIndex].buffer;
+    lightBufferInfo.offset = 0;
+    lightBufferInfo.range = VK_WHOLE_SIZE;
+
+    vk::WriteDescriptorSet lightsWrite{};
+    lightsWrite.dstSet = *forwardPlusPerFrame[frameIndex].computeSet;
+    lightsWrite.dstBinding = 0;
+    lightsWrite.dstArrayElement = 0;
+    lightsWrite.descriptorCount = 1;
+    lightsWrite.descriptorType = vk::DescriptorType::eStorageBuffer;
+    lightsWrite.pBufferInfo = &lightBufferInfo;
+
+    std::lock_guard<std::mutex> guard(descriptorMutex);
+    device.updateDescriptorSets(lightsWrite, {});
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to refresh Forward+ compute lights binding for frame " << frameIndex << ": " << e.what() << std::endl;
+  }
+}
+
+// Creates the command pool that compute command buffers are allocated from.
+// Returns false (after logging) if the compute queue family is unset or pool creation throws.
+bool Renderer::createComputeCommandPool() {
+  try {
+    vk::CommandPoolCreateInfo createInfo{};
+    // Allow individual command buffers from this pool to be reset
+    createInfo.flags = vk::CommandPoolCreateFlagBits::eResetCommandBuffer;
+    createInfo.queueFamilyIndex = queueFamilyIndices.computeFamily.value();
+
+    computeCommandPool = vk::raii::CommandPool(device, createInfo);
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create compute command pool: " << e.what() << std::endl;
+    return false;
+  }
+  return true;
+}
+
+// Records and submits a one-shot compute dispatch on the compute queue.
+//
+//   groupCountX/Y/Z - workgroup counts passed to dispatch().
+//   inputBuffer     - storage buffer bound at binding 0.
+//   outputBuffer    - storage buffer bound at binding 1.
+//   hrtfBuffer      - storage buffer bound at binding 2.
+//   paramsBuffer    - uniform buffer bound at binding 3.
+//
+// Returns a fence that is signaled when the submitted work completes. If anything throws, the
+// error is logged and an already-signaled fence is returned so that a caller waiting on the
+// result does not block forever on work that was never submitted.
+vk::raii::Fence Renderer::DispatchCompute(uint32_t groupCountX,
+                                          uint32_t groupCountY,
+                                          uint32_t groupCountZ,
+                                          vk::Buffer inputBuffer,
+                                          vk::Buffer outputBuffer,
+                                          vk::Buffer hrtfBuffer,
+                                          vk::Buffer paramsBuffer) {
+  try {
+    // Create fence for synchronization
+    vk::FenceCreateInfo fenceInfo{};
+    vk::raii::Fence computeFence(device, fenceInfo);
+
+    // Allocate a descriptor set for this dispatch. Assigning to the member replaces whatever
+    // sets it held from a previous call.
+    vk::DescriptorSetAllocateInfo allocInfo{
+      .descriptorPool = *computeDescriptorPool,
+      .descriptorSetCount = 1,
+      .pSetLayouts = &*computeDescriptorSetLayout
+    };
+    {
+      std::lock_guard<std::mutex> lk(descriptorMutex);
+      computeDescriptorSets = device.allocateDescriptorSets(allocInfo);
+    }
+
+    // Update descriptor sets
+    vk::DescriptorBufferInfo inputBufferInfo{
+      .buffer = inputBuffer,
+      .offset = 0,
+      .range = VK_WHOLE_SIZE
+    };
+
+    vk::DescriptorBufferInfo outputBufferInfo{
+      .buffer = outputBuffer,
+      .offset = 0,
+      .range = VK_WHOLE_SIZE
+    };
+
+    vk::DescriptorBufferInfo hrtfBufferInfo{
+      .buffer = hrtfBuffer,
+      .offset = 0,
+      .range = VK_WHOLE_SIZE
+    };
+
+    vk::DescriptorBufferInfo paramsBufferInfo{
+      .buffer = paramsBuffer,
+      .offset = 0,
+      .range = VK_WHOLE_SIZE
+    };
+
+    std::array<vk::WriteDescriptorSet, 4> descriptorWrites = {
+      vk::WriteDescriptorSet{
+        .dstSet = *computeDescriptorSets[0],
+        .dstBinding = 0,
+        .dstArrayElement = 0,
+        .descriptorCount = 1,
+        .descriptorType = vk::DescriptorType::eStorageBuffer,
+        .pBufferInfo = &inputBufferInfo
+      },
+      vk::WriteDescriptorSet{
+        .dstSet = *computeDescriptorSets[0],
+        .dstBinding = 1,
+        .dstArrayElement = 0,
+        .descriptorCount = 1,
+        .descriptorType = vk::DescriptorType::eStorageBuffer,
+        .pBufferInfo = &outputBufferInfo
+      },
+      vk::WriteDescriptorSet{
+        .dstSet = *computeDescriptorSets[0],
+        .dstBinding = 2,
+        .dstArrayElement = 0,
+        .descriptorCount = 1,
+        .descriptorType = vk::DescriptorType::eStorageBuffer,
+        .pBufferInfo = &hrtfBufferInfo
+      },
+      vk::WriteDescriptorSet{
+        .dstSet = *computeDescriptorSets[0],
+        .dstBinding = 3,
+        .dstArrayElement = 0,
+        .descriptorCount = 1,
+        .descriptorType = vk::DescriptorType::eUniformBuffer,
+        .pBufferInfo = &paramsBufferInfo
+      }
+    };
+    {
+      std::lock_guard<std::mutex> lk(descriptorMutex);
+      device.updateDescriptorSets(descriptorWrites, {});
+    }
+
+    // Create command buffer using dedicated compute command pool
+    vk::CommandBufferAllocateInfo cmdAllocInfo{
+      .commandPool = *computeCommandPool,
+      .level = vk::CommandBufferLevel::ePrimary,
+      .commandBufferCount = 1
+    };
+
+    auto commandBuffers = device.allocateCommandBuffers(cmdAllocInfo);
+    // Use RAII wrapper temporarily for recording to preserve dispatch loader
+    vk::raii::CommandBuffer commandBufferRaii = std::move(commandBuffers[0]);
+
+    // Begin command buffer
+    vk::CommandBufferBeginInfo beginInfo{
+      .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit
+    };
+
+    commandBufferRaii.begin(beginInfo);
+
+    // Bind compute pipeline
+    commandBufferRaii.bindPipeline(vk::PipelineBindPoint::eCompute, *computePipeline);
+
+    // Bind descriptor set
+    commandBufferRaii.bindDescriptorSets(vk::PipelineBindPoint::eCompute, *computePipelineLayout, 0, *computeDescriptorSets[0], {});
+
+    // Dispatch compute shader
+    commandBufferRaii.dispatch(groupCountX, groupCountY, groupCountZ);
+
+    // End command buffer
+    commandBufferRaii.end();
+
+    // Extract raw command buffer for submission and release RAII ownership
+    // This prevents premature destruction while preserving the recorded commands
+    // NOTE(review): after release() nothing in this function frees the command buffer, so each
+    // call leaves one allocated in computeCommandPool - confirm it is reclaimed elsewhere
+    // (e.g. when the pool is reset or destroyed).
+    vk::CommandBuffer rawCommandBuffer = *commandBufferRaii;
+    commandBufferRaii.release(); // Release RAII ownership to prevent destruction
+
+    // Submit command buffer with fence for synchronization
+    vk::SubmitInfo submitInfo{
+      .commandBufferCount = 1,
+      .pCommandBuffers = &rawCommandBuffer
+    };
+
+    // Use mutex to ensure thread-safe access to compute queue
+    {
+      std::lock_guard<std::mutex> lock(queueMutex);
+      computeQueue.submit(submitInfo, *computeFence);
+    }
+
+    // Return fence for non-blocking synchronization
+    return computeFence;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to dispatch compute shader: " << e.what() << std::endl;
+    // No work was submitted, so an unsignaled fence would never be signaled and any caller
+    // waiting on it would hang. Return a fence that is created in the signaled state instead.
+    vk::FenceCreateInfo fenceInfo{.flags = vk::FenceCreateFlagBits::eSignaled};
+    return {device, fenceInfo};
+  }
+}
\ No newline at end of file
diff --git a/attachments/advanced_gltf/renderer_core.cpp b/attachments/advanced_gltf/renderer_core.cpp
new file mode 100644
index 000000000..30efee0b1
--- /dev/null
+++ b/attachments/advanced_gltf/renderer_core.cpp
@@ -0,0 +1,1405 @@
+/* Copyright (c) 2025 Holochip Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <algorithm>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <ranges>
+#include <set>
+#include <thread>
+#include <type_traits>
+#include <chrono>
+#include <sstream>
+#include <vector>
+#include <unordered_map>
+#include <unordered_set>
+#include <string>
+#include <shared_mutex>
+#include <mutex>
+
+
+#include "renderer.h"
+
+#include "renderer_advanced_types.h"
+
+// Global maps for advanced renderer state
+// Each map attaches extra "advanced" data to an object identified by its address.
+// NOTE(review): the lifetime of the keyed objects and which lock guards these maps are not
+// visible in this part of the file - confirm before relying on either.
+std::unordered_map<const Renderer*, AdvancedRendererState> g_rendererStates;
+std::unordered_map<const void*, AdvancedEntityResources> g_meshAdvancedResources;
+std::unordered_map<const Model*, AdvancedModelData> g_modelData;
+std::unordered_map<const AnimationComponent*, AdvancedAnimationState> g_animationAdvancedStates;
+std::unordered_map<const class MeshComponent*, AdvancedMeshComponentData> g_meshComponentData;
+
+// Mutex pool for entities to avoid global lock contention
+// GetEntityMutex() selects a slot as hash(entity address) % ENTITY_MUTEX_POOL_SIZE, so two
+// different entities can map to the same mutex.
+static constexpr size_t ENTITY_MUTEX_POOL_SIZE = 1024;
+static std::recursive_mutex g_entityMutexPool[ENTITY_MUTEX_POOL_SIZE];
+
+// Side tables keyed by an opaque pointer.
+// NOTE(review): presumably keyed by material-mesh address, going by the names - confirm
+// against the code that fills them.
+std::unordered_map<const void*, bool> g_materialMeshDeformable;
+std::unordered_map<const void*, std::vector<glm::uvec4>> g_materialMeshJointIndices;
+std::unordered_map<const void*, std::vector<glm::vec4>> g_materialMeshJointWeights;
+std::unordered_map<const void*, int> g_materialMeshMorphTargetCount;
+std::unordered_map<const void*, std::vector<std::vector<glm::vec3>>> g_materialMeshMorphPositions;
+std::shared_mutex g_advancedStateMutex;
+Entity* g_lastSpawnedBall = nullptr;
+
+// Entities whose transform is owned by the physics system (e.g. the Fox once thrown).
+// The animation system must not overwrite their transform, or it fights physics and the
+// object oscillates between its physics pose and its animated pose every frame.
+static std::unordered_set<const Entity*> g_physicsOwnedEntities;
+// Guards g_physicsOwnedEntities: SetEntityPhysicsOwned() takes it exclusively,
+// IsEntityPhysicsOwned() takes it shared.
+static std::shared_mutex g_physicsOwnedMutex;
+
+// Adds `entity` to (owned == true) or removes it from (owned == false) the set of
+// physics-owned entities. A null entity is ignored.
+void SetEntityPhysicsOwned(const Entity* entity, bool owned) {
+    if (entity == nullptr) {
+        return;
+    }
+
+    // Exclusive lock: the set is modified below
+    std::unique_lock<std::shared_mutex> guard(g_physicsOwnedMutex);
+    if (!owned) {
+        g_physicsOwnedEntities.erase(entity);
+        return;
+    }
+    g_physicsOwnedEntities.insert(entity);
+}
+
+// Returns true when `entity` is currently in the physics-owned set; false for a null entity.
+bool IsEntityPhysicsOwned(const Entity* entity) {
+    if (entity == nullptr) {
+        return false;
+    }
+
+    // Shared lock: read-only lookup
+    std::shared_lock<std::shared_mutex> guard(g_physicsOwnedMutex);
+    const bool isOwned = g_physicsOwnedEntities.count(entity) != 0;
+    return isOwned;
+}
+
+VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE; // In a .cpp file
+
+#include <vulkan/vk_platform.h>
+#include <vulkan/vulkan.h>          // For PFN_vkGetInstanceProcAddr and C types
+#include <vulkan/vulkan_raii.hpp>
+
+// Debug-utils messenger callback written against the raw Vulkan C types.
+// Messages of warning severity or higher go to stderr, everything else to stdout.
+// Always returns VK_FALSE.
+static VKAPI_ATTR VkBool32 VKAPI_CALL debugCallbackVkRaii(
+  VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity,
+  [[maybe_unused]] VkDebugUtilsMessageTypeFlagsEXT messageType,
+  const VkDebugUtilsMessengerCallbackDataEXT* pCallbackData,
+  [[maybe_unused]] void* pUserData) {
+  const bool warningOrWorse = messageSeverity >= VK_DEBUG_UTILS_MESSAGE_SEVERITY_WARNING_BIT_EXT;
+
+  // Pick the console stream by severity, then emit the message once
+  std::ostream& sink = warningOrWorse ? std::cerr : std::cout;
+  sink << "Validation layer: " << pCallbackData->pMessage << std::endl;
+
+  return VK_FALSE;
+}
+
+// Same callback as debugCallbackVkRaii, but with the vk:: typed signature expected by newer
+// Vulkan-Hpp headers. Warning-or-higher messages go to stderr, the rest to stdout.
+// Always returns vk::False.
+static VKAPI_ATTR vk::Bool32 VKAPI_CALL debugCallbackVkHpp(
+  vk::DebugUtilsMessageSeverityFlagBitsEXT messageSeverity,
+  [[maybe_unused]] vk::DebugUtilsMessageTypeFlagsEXT messageType,
+  const vk::DebugUtilsMessengerCallbackDataEXT* pCallbackData,
+  [[maybe_unused]] void* pUserData) {
+  const bool warningOrWorse = messageSeverity >= vk::DebugUtilsMessageSeverityFlagBitsEXT::eWarning;
+
+  std::ostream& sink = warningOrWorse ? std::cerr : std::cout;
+  sink << "Validation layer: " << pCallbackData->pMessage << std::endl;
+
+  return vk::False;
+}
+
+// Watchdog thread entry point: wakes every 5 seconds while *running is true and aborts the
+// process when *lastFrameTime has not advanced within the allowed window (60 s while
+// *suppressed is true, 30 s otherwise).
+//
+//   lastFrameTime - timestamp of the most recent frame update; must not be null.
+//   running       - cleared by the owner to request shutdown; must not be null.
+//   suppressed    - optional; selects the longer grace period when true.
+//   progressLabel - optional; label included in the hang report.
+//   progressIndex - optional; index included in the hang report.
+static void WatchdogThreadFunc(std::atomic<std::chrono::steady_clock::time_point>* lastFrameTime,
+                               std::atomic<bool>* running,
+                               std::atomic<bool>* suppressed,
+                               std::atomic<const char *>* progressLabel,
+                               std::atomic<uint32_t>* progressIndex) {
+  using Clock = std::chrono::steady_clock;
+  using std::chrono::duration_cast;
+  using std::chrono::seconds;
+
+  for (;;) {
+    if (!running->load(std::memory_order_relaxed)) {
+      break;
+    }
+
+    std::this_thread::sleep_for(seconds(5));
+
+    // Shutdown may have been requested while sleeping
+    if (!running->load(std::memory_order_relaxed)) {
+      break;
+    }
+
+    // Some operations (e.g., BLAS/TLAS builds in Debug on large scenes) can legitimately stall
+    // the frame loop for a long time, so the suppressed state gets a longer grace period.
+    const auto wakeTime = Clock::now();
+    auto lastStamp = lastFrameTime->load(std::memory_order_relaxed);
+    auto elapsed = duration_cast<seconds>(wakeTime - lastStamp).count();
+
+    const bool longGrace = (suppressed != nullptr) && suppressed->load(std::memory_order_relaxed);
+    const int64_t allowedSeconds = longGrace ? 60 : 30;
+
+    if (elapsed < allowedSeconds) {
+      continue;
+    }
+
+    // Sample again right away so sleep timing alone cannot trigger a false positive
+    lastStamp = lastFrameTime->load(std::memory_order_relaxed);
+    elapsed = duration_cast<seconds>(Clock::now() - lastStamp).count();
+    if (elapsed < allowedSeconds) {
+      continue;
+    }
+
+    // The application is considered hung: no frame update within the allowed window
+    const char* label = (progressLabel != nullptr) ? progressLabel->load(std::memory_order_relaxed) : nullptr;
+    const uint32_t idx = (progressIndex != nullptr) ? progressIndex->load(std::memory_order_relaxed) : 0;
+
+    std::cerr << "\n\n"
+              << "========================================\n"
+              << "WATCHDOG: APPLICATION HAS HUNG!\n"
+              << "========================================\n"
+              << "Last frame update was " << elapsed << " seconds ago (threshold=" << allowedSeconds << "s).\n";
+    if (label && label[0] != '\0') {
+      std::cerr << "Last progress marker: " << label << "\n";
+    }
+    if (progressIndex) {
+      std::cerr << "Progress index: " << idx << "\n";
+    }
+    std::cerr << "The render loop is not progressing.\n"
+              << "Aborting to generate stack trace...\n"
+              << "========================================\n\n";
+    std::abort(); // Force crash with stack trace
+  }
+
+  std::cout << "[Watchdog] Stopped\n";
+}
+
+// Renderer core implementation for the "Rendering Pipeline" chapter of the tutorial.
+Renderer::Renderer(Platform* platform) : platform(platform) {
+  // Start from the required device extensions only; optional extensions are appended later,
+  // after device support has been checked
+  deviceExtensions = requiredDeviceExtensions;
+
+  // Zero the frame counter and the critical-upload value
+  framesSinceLoadingComplete = 0;
+  lastCriticalUploadValue.store(0);
+}
+
+// Destructor: all teardown is delegated to Cleanup()
+Renderer::~Renderer() {
+  this->Cleanup();
+}
+
+
+#include "engine.h"
+#include "entity.h"
+
+
+// Returns the raw pointers of all entities currently held by `engine`, copied while holding a
+// shared lock on engine->entitiesMutex. A null engine yields an empty vector.
+std::vector<Entity*> SnapshotEntities(const Engine* engine) {
+    std::vector<Entity*> result;
+    if (engine == nullptr) {
+        return result;
+    }
+
+    std::shared_lock<std::shared_mutex> guard(engine->entitiesMutex);
+    const auto& owned = engine->entities;
+    result.reserve(owned.size());
+    for (const auto& holder : owned) {
+        result.push_back(holder.get());
+    }
+    return result;
+}
+
+// Returns the recursive mutex associated with `entity`. Null entities all share one dedicated
+// fallback mutex; otherwise the mutex comes from the fixed-size pool, so distinct entities may
+// receive the same mutex.
+std::recursive_mutex& GetEntityMutex(const Entity* entity) {
+    static std::recursive_mutex s_fallbackMutex;
+    if (entity == nullptr) {
+        return s_fallbackMutex;
+    }
+
+    // The pool slot is derived from the hash of the entity's address, which avoids taking a
+    // global unique lock on every GetEntityMutex call (3000+ times per frame).
+    const size_t slot = std::hash<const Entity*>{}(entity) % ENTITY_MUTEX_POOL_SIZE;
+    return g_entityMutexPool[slot];
+}
+
+// Initialize the renderer
+bool Renderer::Initialize(const std::string& appName, bool enableValidationLayers) {
+  // Initialize the Vulkan-Hpp default dispatcher using the global symbol directly.
+  // This avoids differences across Vulkan-Hpp versions for DynamicLoader placement.
+  VULKAN_HPP_DEFAULT_DISPATCHER.init(vkGetInstanceProcAddr);
+  // Create a Vulkan instance
+  if (!createInstance(appName, enableValidationLayers)) {
+    std::cerr << "Failed to create Vulkan instance" << std::endl;
+    return false;
+  }
+
+  // Setup debug messenger
+  if (!setupDebugMessenger(enableValidationLayers)) {
+    std::cerr << "Failed to setup debug messenger" << std::endl;
+    return false;
+  }
+
+  // Create surface
+  if (!createSurface()) {
+    std::cerr << "Failed to create surface" << std::endl;
+    return false;
+  }
+
+  // Pick the physical device
+  if (!pickPhysicalDevice()) {
+    std::cerr << "Failed to pick physical device" << std::endl;
+    return false;
+  }
+
+  // Create logical device
+  if (!createLogicalDevice(enableValidationLayers)) {
+    std::cerr << "Failed to create logical device" << std::endl;
+    return false;
+  }
+
+  // Create the descriptor pool early to avoid uninitialized pool issues
+  if (!createDescriptorPool()) {
+    std::cerr << "Failed to create descriptor pool" << std::endl;
+    return false;
+  }
+
+  // Initialize memory pool for efficient memory management
+  try {
+    memoryPool = std::make_unique<MemoryPool>(device, physicalDevice);
+    if (!memoryPool->initialize()) {
+      std::cerr << "Failed to initialize memory pool" << std::endl;
+      return false;
+    }
+
+    // Optionally pre-allocate initial memory blocks for pools.
+    // For large scenes (e.g., Bistro) on mid-range GPUs this can cause early OOM.
+    // Skip pre-allocation to reduce peak memory pressure; blocks will be created on demand.
+    // if (!memoryPool->preAllocatePools()) { /* non-fatal */ }
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create memory pool: " << e.what() << std::endl;
+    return false;
+  }
+
+  // Create swap chain
+  if (!createSwapChain()) {
+    std::cerr << "Failed to create swap chain" << std::endl;
+    return false;
+  }
+
+  // Create image views
+  if (!createImageViews()) {
+    std::cerr << "Failed to create image views" << std::endl;
+    return false;
+  }
+
+  // Setup dynamic rendering
+  if (!setupDynamicRendering()) {
+    std::cerr << "Failed to setup dynamic rendering" << std::endl;
+    return false;
+  }
+
+  // Create the descriptor set layout
+  if (!createDescriptorSetLayout()) {
+    std::cerr << "Failed to create descriptor set layout" << std::endl;
+    return false;
+  }
+
+  // Create the graphics pipeline
+  if (!createGraphicsPipeline()) {
+    std::cerr << "Failed to create graphics pipeline" << std::endl;
+    return false;
+  }
+
+  // Create PBR pipeline
+  if (!createPBRPipeline()) {
+    std::cerr << "Failed to create PBR pipeline" << std::endl;
+    return false;
+  }
+
+  // Create the lighting pipeline
+  if (!createLightingPipeline()) {
+    std::cerr << "Failed to create lighting pipeline" << std::endl;
+    return false;
+  }
+
+  // Create composite pipeline (fullscreen pass for off-screen → swapchain)
+  if (!createCompositePipeline()) {
+    std::cerr << "Failed to create composite pipeline" << std::endl;
+    return false;
+  }
+
+  // Create compute pipeline
+  if (!createComputePipeline()) {
+    std::cerr << "Failed to create compute pipeline" << std::endl;
+    return false;
+  }
+
+  // Create Forward+ compute and depth pre-pass pipelines/resources
+  if (useForwardPlus) {
+    if (!createForwardPlusPipelinesAndResources()) {
+      std::cerr << "Failed to create Forward+ resources" << std::endl;
+      return false;
+    }
+  }
+
+  // Ensure light storage buffers exist before creating Forward+ resources
+  // so that compute descriptor binding 0 (lights SSBO) can be populated safely.
+  if (!createOrResizeLightStorageBuffers(1)) {
+    std::cerr << "Failed to create initial light storage buffers" << std::endl;
+    return false;
+  }
+
+  // Create ray query descriptor set layout and pipeline (but not resources yet - need descriptor pool first)
+  if (!createRayQueryDescriptorSetLayout()) {
+    std::cerr << "Failed to create ray query descriptor set layout" << std::endl;
+    return false;
+  }
+  if (!createRayQueryPipeline()) {
+    std::cerr << "Failed to create ray query pipeline" << std::endl;
+    return false;
+  }
+
+  // Create the command pool
+  if (!createCommandPool()) {
+    std::cerr << "Failed to create command pool" << std::endl;
+    return false;
+  }
+
+  // Create depth resources
+  if (!createDepthResources()) {
+    std::cerr << "Failed to create depth resources" << std::endl;
+    return false;
+  }
+
+  if (useForwardPlus) {
+    if (!createDepthPrepassPipeline()) {
+      std::cerr << "Failed to create depth prepass pipeline" << std::endl;
+      return false;
+    }
+  }
+
+
+  // Create ray query resources AFTER descriptor pool (needs pool for descriptor set allocation)
+  if (!createRayQueryResources()) {
+    std::cerr << "Failed to create ray query resources" << std::endl;
+    return false;
+  }
+
+  // Create skinning resources
+  if (!AdvancedRenderer_createSkinningResources(this)) {
+    std::cerr << "Failed to create skinning resources" << std::endl;
+    return false;
+  }
+
+  // Note: Acceleration structure build is requested by scene_loading.cpp after entities load
+  // No need to request it here during init
+
+  // Light storage buffers were already created earlier to satisfy Forward+ binding requirements
+
+  if (!createOpaqueSceneColorResources()) {
+    std::cerr << "Failed to create opaque scene color resources" << std::endl;
+    return false;
+  }
+
+  createTransparentDescriptorSets();
+
+  // Create default texture resources
+  if (!createDefaultTextureResources()) {
+    std::cerr << "Failed to create default texture resources" << std::endl;
+    return false;
+  }
+
+  // Create fallback transparent descriptor sets (must occur after default textures exist)
+  createTransparentFallbackDescriptorSets();
+
+  // Create shared default PBR textures (to avoid creating hundreds of identical textures)
+  if (!createSharedDefaultPBRTextures()) {
+    std::cerr << "Failed to create shared default PBR textures" << std::endl;
+    return false;
+  }
+
+  // Create command buffers
+  if (!createCommandBuffers()) {
+    std::cerr << "Failed to create command buffers" << std::endl;
+    return false;
+  }
+
+  // Create sync objects
+  if (!createSyncObjects()) {
+    std::cerr << "Failed to create sync objects" << std::endl;
+    return false;
+  }
+
+  // Initialize background thread pool for async tasks (textures, etc.) AFTER all Vulkan resources are ready
+  try {
+    // Size the thread pool based on hardware concurrency, clamped to a sensible range
+    unsigned int hw = std::max(2u, std::min(8u, std::thread::hardware_concurrency() ? std::thread::hardware_concurrency() : 4u));
+    threadPool = std::make_unique<ThreadPool>(hw);
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create thread pool: " << e.what() << std::endl;
+    return false;
+  }
+
+  // Start background uploads worker now that queues/semaphores exist
+  StartUploadsWorker();
+
+  // Start watchdog thread to detect application hangs
+  lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed);
+  watchdogRunning.store(true, std::memory_order_relaxed);
+  watchdogThread = std::thread(WatchdogThreadFunc, &lastFrameUpdateTime, &watchdogRunning, &watchdogSuppressed, &watchdogProgressLabel, &watchdogProgressIndex);
+
+  std::cout << "[Watchdog] Started - will abort if no frame updates for 10+ seconds\n";
+
+  initialized = true;
+  return true;
+}
+
+// Runs the Vulkan-Hpp default dispatcher init sequence at most once (successfully)
+// per calling thread. The guard flag is thread_local, so every thread that calls
+// this performs its own init; a failed attempt leaves the flag unset and is retried
+// on the next call.
+void Renderer::ensureThreadLocalVulkanInit() const {
+  static thread_local bool s_dispatcherReady = false;
+  if (!s_dispatcherReady) {
+    try {
+      // Bootstrap from the global loader entry point first, then refine with the
+      // instance and device handles when they are non-null.
+      VULKAN_HPP_DEFAULT_DISPATCHER.init(vkGetInstanceProcAddr);
+      if (*instance) {
+        VULKAN_HPP_DEFAULT_DISPATCHER.init(*instance);
+      }
+      if (*device) {
+        VULKAN_HPP_DEFAULT_DISPATCHER.init(*device);
+      }
+      s_dispatcherReady = true;
+    } catch (...) {
+      // Best-effort: swallow the error and leave the flag unset.
+    }
+  }
+}
+
+// Drops the advanced state registered for `renderer`. When no renderer state is
+// left afterwards, the shared per-mesh/per-model maps are emptied as well.
+// Takes g_advancedStateMutex exclusively for the whole operation.
+//
+// NOTE(review): g_materialMeshJointIndices, g_materialMeshJointWeights,
+// g_materialMeshMorphTargetCount and g_materialMeshMorphPositions are not cleared
+// here although g_materialMeshDeformable is - confirm whether that is intentional.
+void AdvancedRenderer_Cleanup(Renderer* renderer) {
+    std::unique_lock<std::shared_mutex> guard(g_advancedStateMutex);
+    g_rendererStates.erase(renderer);
+    if (!g_rendererStates.empty()) {
+        // Another renderer is still registered; keep the shared maps.
+        return;
+    }
+    g_meshAdvancedResources.clear();
+    g_modelData.clear();
+    g_animationAdvancedStates.clear();
+    g_meshComponentData.clear();
+    g_materialMeshDeformable.clear();
+}
+
+// Stamps the renderer's lastFrameUpdateTime with the current steady-clock time.
+// A null `renderer` is ignored.
+void AdvancedRenderer_KickWatchdog(Renderer* renderer) {
+    if (renderer == nullptr) {
+        return;
+    }
+    const auto now = std::chrono::steady_clock::now();
+    renderer->lastFrameUpdateTime.store(now, std::memory_order_relaxed);
+}
+
+// Stores the deformable flag for `mesh`, creating its entry in
+// g_meshComponentData if none exists yet. Holds the state mutex exclusively.
+void SetMeshComponentDeformable(MeshComponent* mesh, bool deformable) {
+    std::unique_lock<std::shared_mutex> guard(g_advancedStateMutex);
+    auto& entry = g_meshComponentData[mesh];
+    entry.isDeformable = deformable;
+}
+
+// Records the morph-target count for `mesh` (creating its entry if needed) and,
+// when the count is positive, marks the mesh deformable.
+void SetMeshComponentMorphTargets(MeshComponent* mesh, int numTargets) {
+    std::unique_lock<std::shared_mutex> guard(g_advancedStateMutex);
+    auto& entry = g_meshComponentData[mesh];
+    entry.numMorphTargets = numTargets;
+    // Only ever raise the deformable flag here. A mesh without morph targets must
+    // not be flagged (static geometry would otherwise be wrongly excluded from the
+    // ray-query acceleration structure), and a flag already set true by skin data
+    // must not be cleared.
+    if (numTargets <= 0) {
+        return;
+    }
+    entry.isDeformable = true;
+}
+
+// Returns the stored morph-target count for `mesh`, or 0 when the mesh has no
+// entry. Read-only lookup under a shared lock; never creates an entry.
+int GetMeshComponentMorphTargets(const MeshComponent* mesh) {
+    std::shared_lock<std::shared_mutex> guard(g_advancedStateMutex);
+    const auto found = g_meshComponentData.find(mesh);
+    if (found == g_meshComponentData.end()) {
+        return 0;
+    }
+    return found->second.numMorphTargets;
+}
+
+// Replaces the stored morph weights for `mesh` with a copy of `weights`,
+// creating the mesh's entry if it does not exist yet.
+void SetMeshComponentMorphWeights(MeshComponent* mesh, const std::vector<float>& weights) {
+    std::unique_lock<std::shared_mutex> guard(g_advancedStateMutex);
+    auto& entry = g_meshComponentData[mesh];
+    entry.morphWeights = weights;
+}
+
+// Returns a reference to the morph weights stored for `mesh`, or to a shared
+// empty vector when the mesh has no entry.
+// NOTE(review): the shared lock is released when this function returns, so the
+// returned reference is not protected against a later Set* call or cleanup -
+// confirm callers do not hold it across such writes.
+const std::vector<float>& GetMeshComponentMorphWeights(const MeshComponent* mesh) {
+    static const std::vector<float> kNoWeights;
+    std::shared_lock<std::shared_mutex> guard(g_advancedStateMutex);
+    const auto found = g_meshComponentData.find(mesh);
+    if (found == g_meshComponentData.end()) {
+        return kNoWeights;
+    }
+    return found->second.morphWeights;
+}
+
+// Reports the stored deformable flag for `mesh`; meshes without an entry are
+// reported as not deformable. Read-only lookup under a shared lock.
+bool IsMeshComponentDeformable(const MeshComponent* mesh) {
+    std::shared_lock<std::shared_mutex> guard(g_advancedStateMutex);
+    const auto found = g_meshComponentData.find(mesh);
+    if (found == g_meshComponentData.end()) {
+        return false;
+    }
+    return found->second.isDeformable;
+}
+
+// Stores copies of the skin's joint list and inverse bind matrices for `mesh`
+// (creating its entry if needed) and unconditionally marks the mesh deformable.
+void SetMeshComponentSkinData(MeshComponent* mesh, const std::vector<int>& joints, const std::vector<glm::mat4>& inverseBindMatrices) {
+    std::unique_lock<std::shared_mutex> guard(g_advancedStateMutex);
+    auto& entry = g_meshComponentData[mesh];
+    entry.joints = joints;
+    entry.inverseBindMatrices = inverseBindMatrices;
+    entry.isDeformable = true;
+}
+
+// Replaces the stored joint matrices for `mesh` with a copy of `matrices`,
+// creating the mesh's entry if it does not exist yet.
+void SetMeshComponentJointMatrices(MeshComponent* mesh, const std::vector<glm::mat4>& matrices) {
+    std::unique_lock<std::shared_mutex> guard(g_advancedStateMutex);
+    auto& entry = g_meshComponentData[mesh];
+    entry.jointMatrices = matrices;
+}
+
+// Returns a reference to the joint matrices stored for `mesh`, or to a shared
+// empty vector when the mesh has no entry.
+// NOTE(review): the shared lock is released on return, so the reference is not
+// protected against a later Set* call or cleanup - confirm caller usage.
+const std::vector<glm::mat4>& GetMeshComponentJointMatrices(const MeshComponent* mesh) {
+    static const std::vector<glm::mat4> kNoMatrices;
+    std::shared_lock<std::shared_mutex> guard(g_advancedStateMutex);
+    const auto found = g_meshComponentData.find(mesh);
+    if (found == g_meshComponentData.end()) {
+        return kNoMatrices;
+    }
+    return found->second.jointMatrices;
+}
+
+// Stores copies of the joint-index and joint-weight arrays for `mesh`, creating
+// its entry if needed. Does not touch the deformable flag.
+void SetMeshComponentJointsAndWeights(MeshComponent* mesh, const std::vector<glm::uvec4>& joints, const std::vector<glm::vec4>& weights) {
+    std::unique_lock<std::shared_mutex> guard(g_advancedStateMutex);
+    auto& entry = g_meshComponentData[mesh];
+    entry.jointIndices = joints;
+    entry.jointWeights = weights;
+}
+
+// Replaces the stored morph-target position sets for `mesh` with a copy of
+// `positions`, creating the mesh's entry if it does not exist yet.
+void SetMeshComponentMorphPositions(MeshComponent* mesh, const std::vector<std::vector<glm::vec3>>& positions) {
+    std::unique_lock<std::shared_mutex> guard(g_advancedStateMutex);
+    auto& entry = g_meshComponentData[mesh];
+    entry.morphTargetPositions = positions;
+}
+
+// Records whether `mesh` is an environment mesh in g_meshAdvancedResources
+// (creating the entry if needed) and flags the entry as already checked.
+void SetMeshComponentEnvironment(MeshComponent* mesh, bool isEnvironment) {
+    std::unique_lock<std::shared_mutex> guard(g_advancedStateMutex);
+    auto& resources = g_meshAdvancedResources[mesh];
+    resources.isEnvironment = isEnvironment;
+    resources.isEnvironmentChecked = true;
+}
+
+// Stores the deformable flag for the material mesh identified by the opaque
+// pointer `materialMesh`, inserting or overwriting its map entry.
+void SetMaterialMeshDeformable(const void* materialMesh, bool deformable) {
+    std::unique_lock<std::shared_mutex> guard(g_advancedStateMutex);
+    auto& flag = g_materialMeshDeformable[materialMesh];
+    flag = deformable;
+}
+
+// Reports the stored deformable flag for `materialMesh`; unknown material
+// meshes are reported as not deformable. Read-only lookup under a shared lock.
+bool IsMaterialMeshDeformable(const void* materialMesh) {
+    std::shared_lock<std::shared_mutex> guard(g_advancedStateMutex);
+    const auto found = g_materialMeshDeformable.find(materialMesh);
+    if (found == g_materialMeshDeformable.end()) {
+        return false;
+    }
+    return found->second;
+}
+
+// Stores copies of the joint-index and joint-weight arrays for `materialMesh`
+// in their two separate maps, both under one exclusive lock.
+void SetMaterialMeshJointsAndWeights(const void* materialMesh, const std::vector<glm::uvec4>& joints, const std::vector<glm::vec4>& weights) {
+    std::unique_lock<std::shared_mutex> guard(g_advancedStateMutex);
+    auto& storedJoints = g_materialMeshJointIndices[materialMesh];
+    storedJoints = joints;
+    auto& storedWeights = g_materialMeshJointWeights[materialMesh];
+    storedWeights = weights;
+}
+
+// Returns a reference to the joint indices stored for `materialMesh`, or to a
+// shared empty vector when nothing is stored.
+// NOTE(review): the shared lock is released on return, so the reference is not
+// protected against a later Set* call - confirm caller usage.
+const std::vector<glm::uvec4>& GetMaterialMeshJoints(const void* materialMesh) {
+    static const std::vector<glm::uvec4> kNoJoints;
+    std::shared_lock<std::shared_mutex> guard(g_advancedStateMutex);
+    const auto found = g_materialMeshJointIndices.find(materialMesh);
+    if (found == g_materialMeshJointIndices.end()) {
+        return kNoJoints;
+    }
+    return found->second;
+}
+
+// Returns a reference to the joint weights stored for `materialMesh`, or to a
+// shared empty vector when nothing is stored.
+// NOTE(review): the shared lock is released on return, so the reference is not
+// protected against a later Set* call - confirm caller usage.
+const std::vector<glm::vec4>& GetMaterialMeshWeights(const void* materialMesh) {
+    static const std::vector<glm::vec4> kNoWeights;
+    std::shared_lock<std::shared_mutex> guard(g_advancedStateMutex);
+    const auto found = g_materialMeshJointWeights.find(materialMesh);
+    if (found == g_materialMeshJointWeights.end()) {
+        return kNoWeights;
+    }
+    return found->second;
+}
+
+// Returns the morph-target count stored for `materialMesh`, or 0 when nothing
+// is stored. Read-only lookup under a shared lock.
+int GetMaterialMeshMorphTargetCount(const void* materialMesh) {
+    std::shared_lock<std::shared_mutex> guard(g_advancedStateMutex);
+    const auto found = g_materialMeshMorphTargetCount.find(materialMesh);
+    if (found == g_materialMeshMorphTargetCount.end()) {
+        return 0;
+    }
+    return found->second;
+}
+
+// Stores the morph-target count for `materialMesh`, inserting or overwriting
+// its map entry. Unlike the MeshComponent variant, no deformable flag is touched.
+void SetMaterialMeshMorphTargetCount(const void* materialMesh, int count) {
+    std::unique_lock<std::shared_mutex> guard(g_advancedStateMutex);
+    auto& storedCount = g_materialMeshMorphTargetCount[materialMesh];
+    storedCount = count;
+}
+
+// Replaces the morph-target position sets stored for `materialMesh` with a copy
+// of `positions`, inserting the map entry if needed.
+void SetMaterialMeshMorphPositions(const void* materialMesh, const std::vector<std::vector<glm::vec3>>& positions) {
+    std::unique_lock<std::shared_mutex> guard(g_advancedStateMutex);
+    auto& storedPositions = g_materialMeshMorphPositions[materialMesh];
+    storedPositions = positions;
+}
+
+// Returns a reference to the morph-target position sets stored for
+// `materialMesh`, or to a shared empty vector when nothing is stored.
+// NOTE(review): the shared lock is released on return, so the reference is not
+// protected against a later Set* call - confirm caller usage.
+const std::vector<std::vector<glm::vec3>>& GetMaterialMeshMorphPositions(const void* materialMesh) {
+    static const std::vector<std::vector<glm::vec3>> kNoPositions;
+    std::shared_lock<std::shared_mutex> guard(g_advancedStateMutex);
+    const auto found = g_materialMeshMorphPositions.find(materialMesh);
+    if (found == g_materialMeshMorphPositions.end()) {
+        return kNoPositions;
+    }
+    return found->second;
+}
+
+// Clean up renderer resources.
+//
+// Shutdown sequence:
+//   - stop the watchdog thread and the uploads worker, invalidate descriptor
+//     writes, drop pending descriptor ops and the thread pool (always runs);
+//   - if the renderer was never initialized, stop there;
+//   - otherwise wait for the device to go idle and release members in the
+//     numbered order below. The order matters: every descriptor set is released
+//     before the descriptor pools are reset, and the memory pool is reset near
+//     the end.
+// The logical device itself is not reset here.
+void Renderer::Cleanup() {
+  // Stop watchdog thread first to prevent false hang detection during shutdown
+  if (watchdogRunning.load(std::memory_order_relaxed)) {
+    watchdogRunning.store(false, std::memory_order_relaxed);
+    if (watchdogThread.joinable()) {
+      watchdogThread.join();
+    }
+  }
+
+  // Ensure background workers are stopped before tearing down Vulkan resources
+  StopUploadsWorker();
+
+  // Disallow any further descriptor writes during shutdown.
+  // This prevents late updates/frees racing against pool destruction.
+  descriptorSetsValid.store(false, std::memory_order_relaxed); {
+    // Scope 1: discard queued descriptor operations under their mutex.
+    std::lock_guard<std::mutex> lk(pendingDescMutex);
+    pendingDescOps.clear();
+    descriptorRefreshPending.store(false, std::memory_order_relaxed);
+  } {
+    // Scope 2: destroy the thread pool under an exclusive lock.
+    std::unique_lock<std::shared_mutex> lock(threadPoolMutex);
+    if (threadPool) {
+      threadPool.reset();
+    }
+  }
+
+  // Everything above runs unconditionally; the Vulkan teardown below is skipped
+  // when `initialized` is false.
+  if (!initialized) {
+    return;
+  }
+
+  std::cout << "Starting renderer cleanup..." << std::endl;
+
+  // Wait for the device to be idle before cleaning up
+  try {
+    WaitIdle();
+  } catch (...) {
+    // Best-effort: a failed wait must not abort the teardown.
+  }
+
+  // 1) Clean up any swapchain-scoped resources first
+  cleanupSwapChain();
+
+  // 2) Clear per-entity resources (descriptor sets and buffers) while descriptor pools still exist
+  for (auto& kv : entityResources) {
+    auto& resources = kv.second;
+    resources.basicDescriptorSets.clear();
+    resources.pbrDescriptorSets.clear();
+    resources.uniformBuffers.clear();
+    resources.uniformBufferAllocations.clear();
+    resources.uniformBuffersMapped.clear();
+    resources.instanceBuffer = nullptr;
+    resources.instanceBufferAllocation = nullptr;
+    resources.instanceBufferMapped = nullptr;
+  }
+  entityResources.clear();
+
+  // 3) Clear any global descriptor sets that are allocated from pools to avoid dangling refs
+  transparentDescriptorSets.clear();
+  transparentFallbackDescriptorSets.clear();
+  compositeDescriptorSets.clear();
+  computeDescriptorSets.clear();
+  rqCompositeDescriptorSets.clear();
+
+  // 3.5) Clear ray query descriptor sets BEFORE destroying descriptor pool
+  // Without this, rayQueryDescriptorSets' RAII destructor tries to free them after
+  // the pool is destroyed, causing "Invalid VkDescriptorPool Object" validation errors
+  rayQueryDescriptorSets.clear();
+
+  // Ray Query composite sampler/sets are allocated from the shared descriptor pool.
+  // Ensure they are released before destroying the pool.
+  rqCompositeSampler = nullptr;
+
+  // 4) Destroy/Reset pipelines and pipeline layouts (graphics/compute/forward+/skinning)
+  graphicsPipeline = nullptr;
+  pbrGraphicsPipeline = nullptr;
+  pbrBlendGraphicsPipeline = nullptr;
+  pbrPremulBlendGraphicsPipeline = nullptr;
+  pbrPrepassGraphicsPipeline = nullptr;
+  glassGraphicsPipeline = nullptr;
+  lightingPipeline = nullptr;
+  compositePipeline = nullptr;
+  forwardPlusPipeline = nullptr;
+  depthPrepassPipeline = nullptr;
+
+  pipelineLayout = nullptr;
+  pbrPipelineLayout = nullptr;
+  lightingPipelineLayout = nullptr;
+  compositePipelineLayout = nullptr;
+  pbrTransparentPipelineLayout = nullptr;
+  forwardPlusPipelineLayout = nullptr;
+
+  // 4.3) Ray query pipelines and layouts
+  rayQueryPipeline = nullptr;
+  rayQueryPipelineLayout = nullptr;
+
+  // 4.5) Forward+ per-frame resources (including descriptor sets) must be released
+  // BEFORE destroying descriptor pools to avoid vkFreeDescriptorSets with invalid pool
+  for (auto& fp : forwardPlusPerFrame) {
+    fp.tileHeaders = nullptr;
+    fp.tileHeadersAlloc = nullptr;
+    fp.tileLightIndices = nullptr;
+    fp.tileLightIndicesAlloc = nullptr;
+    fp.params = nullptr;
+    fp.paramsAlloc = nullptr;
+    fp.paramsMapped = nullptr;
+    fp.debugOut = nullptr;
+    fp.debugOutAlloc = nullptr;
+    fp.probeOffscreen = nullptr;
+    fp.probeOffscreenAlloc = nullptr;
+    fp.probeSwapchain = nullptr;
+    fp.probeSwapchainAlloc = nullptr;
+    fp.computeSet = nullptr; // descriptor set allocated from compute/graphics pools
+  }
+  forwardPlusPerFrame.clear();
+
+  // 4.7) Deformable meshes' skin/morph descriptor sets are allocated from descriptorPool but
+  // stored in the global g_meshAdvancedResources, which AdvancedRenderer_Cleanup only clears
+  // AFTER this function destroys the pool. Release them here, while the pool is still valid,
+  // so their RAII destructors don't call vkFreeDescriptorSets on an invalid pool at exit.
+  {
+    std::unique_lock<std::shared_mutex> advLock(g_advancedStateMutex);
+    for (auto& kv : g_meshAdvancedResources) {
+      kv.second.skinDescriptorSets.clear();
+      kv.second.morphDescriptorSets.clear();
+    }
+    // The skinning/morph compute state (g_rendererStates) also holds a descriptor set
+    // (dummyMorphDescriptorSet) allocated from descriptorPool; release it before the pool too.
+    auto stateIt = g_rendererStates.find(this);
+    if (stateIt != g_rendererStates.end()) {
+      stateIt->second.dummyMorphDescriptorSet = nullptr;
+    }
+  }
+
+  // 5) Destroy descriptor set layouts and pools (compute + graphics)
+  descriptorSetLayout = nullptr;
+  pbrDescriptorSetLayout = nullptr;
+  transparentDescriptorSetLayout = nullptr;
+  compositeDescriptorSetLayout = nullptr;
+  forwardPlusDescriptorSetLayout = nullptr;
+  computeDescriptorSetLayout = nullptr;
+  rayQueryDescriptorSetLayout = nullptr;
+
+  // Pools last, after sets are cleared
+  computeDescriptorPool = nullptr;
+  descriptorPool = nullptr;
+
+  // 6) Clear textures and aliases, including default resources
+  {
+    std::unique_lock<std::shared_mutex> lk(textureResourcesMutex);
+    textureResources.clear();
+    textureAliases.clear();
+  }
+  // Reset default texture resources
+  defaultTextureResources.textureSampler = nullptr;
+  defaultTextureResources.textureImageView = nullptr;
+  defaultTextureResources.textureImage = nullptr;
+  defaultTextureResources.textureImageAllocation = nullptr;
+
+  // 7) Opaque scene color and related descriptors
+  opaqueSceneColorSampler = nullptr;
+  opaqueSceneColorImages.clear();
+  opaqueSceneColorImageAllocations.clear();
+  opaqueSceneColorImageViews.clear();
+  opaqueSceneColorImageLayouts.clear();
+
+  // 7.5) Ray query output image and acceleration structures
+  rayQueryOutputImageView = nullptr;
+  rayQueryOutputImage = nullptr;
+  rayQueryOutputImageAllocation = nullptr;
+
+  // Clear acceleration structures (BLAS and TLAS buffers)
+  blasStructures.clear();
+  tlasStructure = AccelerationStructure{};
+
+  // 8) (moved above) Forward+ per-frame buffers cleared prior to pool destruction
+
+  // 9) Command buffers/pools
+  commandBuffers.clear();
+  commandPool = nullptr;
+  computeCommandPool = nullptr;
+
+  // 10) Sync objects
+  imageAvailableSemaphores.clear();
+  renderFinishedSemaphores.clear();
+  inFlightFences.clear();
+  uploadsTimeline = nullptr;
+
+  // 11) Queues and surface (RAII handles will release upon reset; keep device alive until the end)
+  graphicsQueue = nullptr;
+  presentQueue = nullptr;
+  computeQueue = nullptr;
+  transferQueue = nullptr;
+  surface = nullptr;
+
+  // 12) Memory pool last
+  memoryPool.reset();
+
+  // 13) Clean up advanced resources
+  // NOTE(review): this runs after memoryPool.reset(). If the entries erased/cleared by
+  // AdvancedRenderer_Cleanup (g_rendererStates, g_meshAdvancedResources) own buffers or
+  // allocations backed by memoryPool, they are destroyed after the pool - confirm that
+  // ordering is safe, or move this call above step 12.
+  AdvancedRenderer_Cleanup(this);
+
+  // Finally mark uninitialized
+  initialized = false;
+  std::cout << "Renderer cleanup completed." << std::endl;
+}
+
+// Creates the Vulkan instance and stores it in `instance`.
+//
+// appName                - application name reported in vk::ApplicationInfo.
+// enableValidationLayers - when true, adds the debug-utils extension, enables the
+//                          layers listed in `validationLayers` and chains an (empty)
+//                          vk::ValidationFeaturesEXT into the create info.
+//
+// Returns true on success. Returns false (after logging to std::cerr) when GLFW
+// cannot report its required instance extensions, when validation layers are
+// requested but unavailable, or when instance creation throws.
+bool Renderer::createInstance(const std::string& appName, bool enableValidationLayers) {
+  try {
+    // Create application info
+    vk::ApplicationInfo appInfo{
+      .pApplicationName = appName.c_str(),
+      .applicationVersion = VK_MAKE_VERSION(1, 0, 0),
+      .pEngineName = "Simple Engine",
+      .engineVersion = VK_MAKE_VERSION(1, 0, 0),
+      .apiVersion = VK_API_VERSION_1_3
+    };
+
+    // Get required extensions
+    std::vector<const char *> extensions;
+
+    // Add required extensions for GLFW
+#if defined(PLATFORM_DESKTOP)
+    uint32_t glfwExtensionCount = 0;
+    const char** glfwExtensions = glfwGetRequiredInstanceExtensions(&glfwExtensionCount);
+    // GLFW returns NULL when it cannot determine the surface extensions (for example
+    // when Vulkan is unavailable). Continuing would create an instance without any
+    // surface extension and only fail later at surface creation, so fail here with a
+    // clear message instead.
+    if (glfwExtensions == nullptr) {
+      std::cerr << "Failed to query required Vulkan instance extensions from GLFW" << std::endl;
+      return false;
+    }
+    extensions.insert(extensions.end(), glfwExtensions, glfwExtensions + glfwExtensionCount);
+#endif
+
+    // Add debug extension if validation layers are enabled
+    if (enableValidationLayers) {
+      extensions.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME);
+    }
+
+    // Create instance info
+    vk::InstanceCreateInfo createInfo{
+      .pApplicationInfo = &appInfo,
+      .enabledExtensionCount = static_cast<uint32_t>(extensions.size()),
+      .ppEnabledExtensionNames = extensions.data()
+    };
+
+    // Enable validation layers if requested.
+    // Both objects are declared at this scope because createInfo.pNext may point at
+    // validationFeatures, which in turn points into enabledValidationFeatures; they
+    // must stay alive until the instance has been created.
+    vk::ValidationFeaturesEXT validationFeatures{};
+    std::vector<vk::ValidationFeatureEnableEXT> enabledValidationFeatures;
+
+    if (enableValidationLayers) {
+      if (!checkValidationLayerSupport()) {
+        std::cerr << "Validation layers requested, but not available" << std::endl;
+        return false;
+      }
+
+      createInfo.enabledLayerCount = static_cast<uint32_t>(validationLayers.size());
+      createInfo.ppEnabledLayerNames = validationLayers.data();
+
+      // Keep validation output quiet by default (no DebugPrintf feature).
+      // Ray Query debugPrintf/printf diagnostics are intentionally removed.
+
+      validationFeatures.enabledValidationFeatureCount = static_cast<uint32_t>(enabledValidationFeatures.size());
+      validationFeatures.pEnabledValidationFeatures = enabledValidationFeatures.data();
+
+      createInfo.pNext = &validationFeatures;
+    }
+
+    // Create instance
+    instance = vk::raii::Instance(context, createInfo);
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create instance: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Installs the debug-utils messenger into `debugMessenger`.
+// Does nothing and reports success when validation layers are disabled.
+// Returns false (after logging to std::cerr) if messenger creation throws.
+bool Renderer::setupDebugMessenger(bool enableValidationLayers) {
+  if (enableValidationLayers) {
+    try {
+      using Severity = vk::DebugUtilsMessageSeverityFlagBitsEXT;
+      using MessageType = vk::DebugUtilsMessageTypeFlagBitsEXT;
+
+      // Subscribe to every severity level and to the general, validation and
+      // performance message categories.
+      vk::DebugUtilsMessengerCreateInfoEXT messengerInfo{};
+      messengerInfo.messageSeverity =
+          Severity::eVerbose | Severity::eInfo | Severity::eWarning | Severity::eError;
+      messengerInfo.messageType =
+          MessageType::eGeneral | MessageType::eValidation | MessageType::ePerformance;
+
+      // Select callback via simple platform macro: Android typically expects C PFN types in headers
+      // while desktop (newer Vulkan-Hpp) expects vk:: types.
+#if defined(__ANDROID__)
+      messengerInfo.pfnUserCallback = &debugCallbackVkRaii;
+#else
+      messengerInfo.pfnUserCallback = &debugCallbackVkHpp;
+#endif
+
+      debugMessenger = vk::raii::DebugUtilsMessengerEXT(instance, messengerInfo);
+    } catch (const std::exception& e) {
+      std::cerr << "Failed to set up debug messenger: " << e.what() << std::endl;
+      return false;
+    }
+  }
+  return true;
+}
+
+// Asks the platform layer for a window surface and wraps the raw handle in
+// `surface`. Returns false (after logging to std::cerr) when the platform
+// reports failure or an exception is thrown.
+bool Renderer::createSurface() {
+  try {
+    // Raw handle filled in by the platform layer.
+    VkSurfaceKHR rawSurface;
+    if (platform->CreateVulkanSurface(*instance, &rawSurface)) {
+      surface = vk::raii::SurfaceKHR(instance, rawSurface);
+      return true;
+    }
+
+    std::cerr << "Failed to create window surface" << std::endl;
+    return false;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create surface: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Chooses the physical device to render with.
+//
+// Every enumerated device is screened, in this order, for: Vulkan 1.3 API version,
+// complete queue families, required device extensions, non-empty swap chain
+// formats/present modes, and the dynamicRendering feature. Devices that pass are
+// scored (discrete GPU +1000, integrated GPU +100, plus 1 point per GiB of the
+// first device-local memory heap) and the highest score wins.
+//
+// On success stores the choice in `physicalDevice`, refreshes `queueFamilyIndices`,
+// appends supported optional extensions, and returns true. Returns false (after
+// logging to std::cerr) when no device qualifies or an exception is thrown.
+bool Renderer::pickPhysicalDevice() {
+  try {
+    std::vector<vk::raii::PhysicalDevice> devices = instance.enumeratePhysicalDevices();
+    if (devices.empty()) {
+      std::cerr << "Failed to find GPUs with Vulkan support" << std::endl;
+      return false;
+    }
+
+    // Candidates keyed by score; the multimap keeps them ordered so the best one
+    // is the last element.
+    std::multimap<int, vk::raii::PhysicalDevice> rankedCandidates;
+
+    for (auto& candidate : devices) {
+      // Print device properties for debugging
+      const vk::PhysicalDeviceProperties props = candidate.getProperties();
+      std::cout << "Checking device: " << props.deviceName
+          << " (Type: " << vk::to_string(props.deviceType) << ")" << std::endl;
+
+      // --- Screening: any failed requirement skips the device ---
+
+      if (props.apiVersion < VK_API_VERSION_1_3) {
+        std::cout << "  - Does not support Vulkan 1.3" << std::endl;
+        continue;
+      }
+
+      QueueFamilyIndices families = findQueueFamilies(candidate);
+      if (!families.isComplete()) {
+        std::cout << "  - Missing required queue families" << std::endl;
+        continue;
+      }
+
+      if (!checkDeviceExtensionSupport(candidate)) {
+        std::cout << "  - Missing required extensions" << std::endl;
+        continue;
+      }
+
+      SwapChainSupportDetails swapSupport = querySwapChainSupport(candidate);
+      if (swapSupport.formats.empty() || swapSupport.presentModes.empty()) {
+        std::cout << "  - Inadequate swap chain support" << std::endl;
+        continue;
+      }
+
+      auto featureChain = candidate.getFeatures2<vk::PhysicalDeviceFeatures2, vk::PhysicalDeviceVulkan13Features>();
+      if (!featureChain.get<vk::PhysicalDeviceVulkan13Features>().dynamicRendering) {
+        std::cout << "  - Does not support required features (dynamicRendering)" << std::endl;
+        continue;
+      }
+
+      // --- Scoring: prioritize discrete GPUs, then integrated ones ---
+
+      int score = 0;
+      switch (props.deviceType) {
+        case vk::PhysicalDeviceType::eDiscreteGpu:
+          score += 1000;
+          std::cout << "  - Discrete GPU: +1000 points" << std::endl;
+          break;
+        case vk::PhysicalDeviceType::eIntegratedGpu:
+          score += 100;
+          std::cout << "  - Integrated GPU: +100 points" << std::endl;
+          break;
+        default:
+          // Other device types receive no type bonus.
+          break;
+      }
+
+      // Add 1 point per GB of the first device-local heap (more VRAM is better).
+      const vk::PhysicalDeviceMemoryProperties memory = candidate.getMemoryProperties();
+      for (uint32_t heapIndex = 0; heapIndex < memory.memoryHeapCount; ++heapIndex) {
+        const auto& heap = memory.memoryHeaps[heapIndex];
+        if (heap.flags & vk::MemoryHeapFlagBits::eDeviceLocal) {
+          score += static_cast<int>(heap.size / (1024 * 1024 * 1024));
+          break;
+        }
+      }
+
+      std::cout << "  - Device is suitable with score: " << score << std::endl;
+      rankedCandidates.emplace(score, candidate);
+    }
+
+    if (rankedCandidates.empty()) {
+      std::cerr << "Failed to find a suitable GPU. Make sure your GPU supports Vulkan and has the required extensions." << std::endl;
+      return false;
+    }
+
+    // Highest score sits at the back of the ordered multimap.
+    const auto best = rankedCandidates.rbegin();
+    physicalDevice = best->second;
+    const vk::PhysicalDeviceProperties chosenProps = physicalDevice.getProperties();
+    std::cout << "Selected device: " << chosenProps.deviceName
+        << " (Type: " << vk::to_string(chosenProps.deviceType)
+        << ", Score: " << best->first << ")" << std::endl;
+
+    // Store queue family indices for the selected device
+    queueFamilyIndices = findQueueFamilies(physicalDevice);
+
+    // Add supported optional extensions
+    addSupportedOptionalExtensions();
+
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to pick physical device: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Appends to `deviceExtensions` every entry of `optionalDeviceExtensions` that
+// the selected physical device reports as available. Failures are logged as a
+// warning and otherwise ignored; the function never throws std::exception out.
+void Renderer::addSupportedOptionalExtensions() {
+  try {
+    // Collect the device's extension names into a set for quick lookup.
+    std::set<std::string> supportedNames;
+    for (const auto& props : physicalDevice.enumerateDeviceExtensionProperties()) {
+      supportedNames.insert(props.extensionName);
+    }
+
+    for (const auto& optionalExt : optionalDeviceExtensions) {
+      if (!supportedNames.contains(optionalExt)) {
+        continue;
+      }
+      deviceExtensions.push_back(optionalExt);
+      std::cout << "Adding optional extension: " << optionalExt << std::endl;
+    }
+  } catch (const std::exception& e) {
+    std::cerr << "Warning: Failed to add optional extensions: " << e.what() << std::endl;
+  }
+}
+
+// Create logical device
+bool Renderer::createLogicalDevice(bool enableValidationLayers) {
+  try {
+    // Create queue create info for each unique queue family
+    std::vector<vk::DeviceQueueCreateInfo> queueCreateInfos;
+    std::set uniqueQueueFamilies = {
+      queueFamilyIndices.graphicsFamily.value(),
+      queueFamilyIndices.presentFamily.value(),
+      queueFamilyIndices.computeFamily.value(),
+      queueFamilyIndices.transferFamily.value()
+    };
+
+    float queuePriority = 1.0f;
+    for (uint32_t queueFamily : uniqueQueueFamilies) {
+      vk::DeviceQueueCreateInfo queueCreateInfo{
+        .queueFamilyIndex = queueFamily,
+        .queueCount = 1,
+        .pQueuePriorities = &queuePriority
+      };
+      queueCreateInfos.push_back(queueCreateInfo);
+    }
+
+    // Query supported features before enabling them
+    auto supportedFeatures = physicalDevice.getFeatures2<
+      vk::PhysicalDeviceFeatures2,
+      vk::PhysicalDeviceTimelineSemaphoreFeatures,
+      vk::PhysicalDeviceVulkanMemoryModelFeatures,
+      vk::PhysicalDeviceBufferDeviceAddressFeatures,
+      vk::PhysicalDevice8BitStorageFeatures,
+      vk::PhysicalDeviceVulkan11Features,
+      vk::PhysicalDeviceVulkan13Features>();
+
+    // Verify critical features are supported
+    const auto& coreSupported = supportedFeatures.get<vk::PhysicalDeviceFeatures2>().features;
+    const auto& timelineSupported = supportedFeatures.get<vk::PhysicalDeviceTimelineSemaphoreFeatures>();
+    const auto& memoryModelSupported = supportedFeatures.get<vk::PhysicalDeviceVulkanMemoryModelFeatures>();
+    const auto& bufferAddressSupported = supportedFeatures.get<vk::PhysicalDeviceBufferDeviceAddressFeatures>();
+    const auto& storage8BitSupported = supportedFeatures.get<vk::PhysicalDevice8BitStorageFeatures>();
+    const auto& vulkan11Supported = supportedFeatures.get<vk::PhysicalDeviceVulkan11Features>();
+    const auto& vulkan13Supported = supportedFeatures.get<vk::PhysicalDeviceVulkan13Features>();
+
+    // Check for required features
+    if (!coreSupported.samplerAnisotropy ||
+      !timelineSupported.timelineSemaphore ||
+      !memoryModelSupported.vulkanMemoryModel ||
+      !bufferAddressSupported.bufferDeviceAddress ||
+      !vulkan11Supported.shaderDrawParameters ||
+      !vulkan13Supported.dynamicRendering ||
+      !vulkan13Supported.synchronization2) {
+      throw std::runtime_error("Required Vulkan features not supported by physical device");
+    }
+
+    // Enable required features (now verified to be supported)
+    auto features = physicalDevice.getFeatures2();
+    features.features.samplerAnisotropy = vk::True;
+    features.features.depthBiasClamp = coreSupported.depthBiasClamp ? vk::True : vk::False;
+
+    // Explicitly configure device features to prevent validation layer warnings
+    // These features are required by extensions or other features, so we enable them explicitly
+
+    // Timeline semaphore features (required for synchronization2)
+    vk::PhysicalDeviceTimelineSemaphoreFeatures timelineSemaphoreFeatures;
+    timelineSemaphoreFeatures.timelineSemaphore = vk::True;
+
+    // Vulkan memory model features (required for some shader operations)
+    vk::PhysicalDeviceVulkanMemoryModelFeatures memoryModelFeatures;
+    memoryModelFeatures.vulkanMemoryModel = vk::True;
+    memoryModelFeatures.vulkanMemoryModelDeviceScope = memoryModelSupported.vulkanMemoryModelDeviceScope ? vk::True : vk::False;
+
+    // Buffer device address features (required for some buffer operations)
+    vk::PhysicalDeviceBufferDeviceAddressFeatures bufferDeviceAddressFeatures;
+    bufferDeviceAddressFeatures.bufferDeviceAddress = vk::True;
+
+    // 8-bit storage features (required for some shader storage operations)
+    vk::PhysicalDevice8BitStorageFeatures storage8BitFeatures;
+    storage8BitFeatures.storageBuffer8BitAccess = storage8BitSupported.storageBuffer8BitAccess ? vk::True : vk::False;
+
+    // Enable Vulkan 1.3 features
+    vk::PhysicalDeviceVulkan13Features vulkan13Features;
+    vulkan13Features.dynamicRendering = vk::True;
+    vulkan13Features.synchronization2 = vk::True;
+
+    // Vulkan 1.1 features: shaderDrawParameters to satisfy SPIR-V DrawParameters capability
+    vk::PhysicalDeviceVulkan11Features vulkan11Features{};
+    vulkan11Features.shaderDrawParameters = vk::True;
+    // Query extended feature support
+#if !defined(PLATFORM_ANDROID)
+    auto featureChain = physicalDevice.getFeatures2<
+      vk::PhysicalDeviceFeatures2,
+      vk::PhysicalDeviceDescriptorIndexingFeatures,
+      vk::PhysicalDeviceRobustness2FeaturesEXT,
+      vk::PhysicalDeviceDynamicRenderingLocalReadFeaturesKHR,
+      vk::PhysicalDeviceShaderTileImageFeaturesEXT,
+      vk::PhysicalDeviceAccelerationStructureFeaturesKHR,
+      vk::PhysicalDeviceRayQueryFeaturesKHR>();
+    const auto& localReadSupported = featureChain.get<vk::PhysicalDeviceDynamicRenderingLocalReadFeaturesKHR>();
+    const auto& tileImageSupported = featureChain.get<vk::PhysicalDeviceShaderTileImageFeaturesEXT>();
+#else
+    auto featureChain = physicalDevice.getFeatures2<
+      vk::PhysicalDeviceFeatures2,
+      vk::PhysicalDeviceDescriptorIndexingFeatures,
+      vk::PhysicalDeviceRobustness2FeaturesEXT,
+      vk::PhysicalDeviceAccelerationStructureFeaturesKHR,
+      vk::PhysicalDeviceRayQueryFeaturesKHR>();
+#endif
+    const auto& coreFeaturesSupported = featureChain.get<vk::PhysicalDeviceFeatures2>().features;
+    const auto& indexingFeaturesSupported = featureChain.get<vk::PhysicalDeviceDescriptorIndexingFeatures>();
+    const auto& robust2Supported = featureChain.get<vk::PhysicalDeviceRobustness2FeaturesEXT>();
+    const auto& accelerationStructureSupported = featureChain.get<vk::PhysicalDeviceAccelerationStructureFeaturesKHR>();
+    const auto& rayQuerySupported = featureChain.get<vk::PhysicalDeviceRayQueryFeaturesKHR>();
+
+    // Ray Query shader uses indexing into a (large) sampled-image array.
+    // Some drivers require this core feature to be explicitly enabled.
+    if (coreFeaturesSupported.shaderSampledImageArrayDynamicIndexing) {
+      features.features.shaderSampledImageArrayDynamicIndexing = vk::True;
+    }
+
+    // Prepare descriptor indexing features to enable if supported
+    vk::PhysicalDeviceDescriptorIndexingFeatures indexingFeaturesEnable{};
+    descriptorIndexingEnabled = false;
+    // Enable non-uniform indexing of sampled image arrays when supported — required for
+    // `NonUniformResourceIndex()` in the ray-query shader to actually take effect.
+    if (indexingFeaturesSupported.shaderSampledImageArrayNonUniformIndexing) {
+      indexingFeaturesEnable.shaderSampledImageArrayNonUniformIndexing = vk::True;
+      descriptorIndexingEnabled = true;
+    }
+    if (indexingFeaturesSupported.runtimeDescriptorArray) {
+      indexingFeaturesEnable.runtimeDescriptorArray = vk::True;
+    }
+    if (indexingFeaturesSupported.descriptorBindingVariableDescriptorCount) {
+      indexingFeaturesEnable.descriptorBindingVariableDescriptorCount = vk::True;
+    }
+
+    // These are not strictly required when writing a fully-populated descriptor array,
+    // but enabling them when available avoids edge-case driver behavior for large arrays.
+    if (descriptorIndexingEnabled) {
+      if (indexingFeaturesSupported.descriptorBindingPartiallyBound) {
+        indexingFeaturesEnable.descriptorBindingPartiallyBound = vk::True;
+      }
+      if (indexingFeaturesSupported.descriptorBindingUpdateUnusedWhilePending) {
+        indexingFeaturesEnable.descriptorBindingUpdateUnusedWhilePending = vk::True;
+      }
+    }
+    // Optionally enable UpdateAfterBind flags when supported (not strictly required for RQ textures)
+    if (indexingFeaturesSupported.descriptorBindingSampledImageUpdateAfterBind)
+      indexingFeaturesEnable.descriptorBindingSampledImageUpdateAfterBind = vk::True;
+    if (indexingFeaturesSupported.descriptorBindingUniformBufferUpdateAfterBind)
+      indexingFeaturesEnable.descriptorBindingUniformBufferUpdateAfterBind = vk::True;
+    if (indexingFeaturesSupported.descriptorBindingUpdateUnusedWhilePending)
+      indexingFeaturesEnable.descriptorBindingUpdateUnusedWhilePending = vk::True;
+
+    // Helper to check if an extension is enabled (using string comparison)
+    auto hasExtension = [&](const char* name) {
+      return std::find_if(deviceExtensions.begin(),
+                          deviceExtensions.end(),
+                          [&](const char* ext) {
+                            return std::strcmp(ext, name) == 0;
+                          }) != deviceExtensions.end();
+    };
+
+    // Prepare Robustness2 features if the extension is enabled and device supports
+    auto hasRobust2 = hasExtension(VK_EXT_ROBUSTNESS_2_EXTENSION_NAME);
+    vk::PhysicalDeviceRobustness2FeaturesEXT robust2Enable{};
+    if (hasRobust2) {
+      if (robust2Supported.robustBufferAccess2)
+        robust2Enable.robustBufferAccess2 = vk::True;
+      if (robust2Supported.robustImageAccess2)
+        robust2Enable.robustImageAccess2 = vk::True;
+      if (robust2Supported.nullDescriptor)
+        robust2Enable.nullDescriptor = vk::True;
+    }
+
+#if !defined(PLATFORM_ANDROID)
+    // Prepare Dynamic Rendering Local Read features if extension is enabled and supported
+    auto hasLocalRead = hasExtension(VK_KHR_DYNAMIC_RENDERING_LOCAL_READ_EXTENSION_NAME);
+    vk::PhysicalDeviceDynamicRenderingLocalReadFeaturesKHR localReadEnable{};
+    if (hasLocalRead && localReadSupported.dynamicRenderingLocalRead) {
+      localReadEnable.dynamicRenderingLocalRead = vk::True;
+    }
+
+    // Prepare Shader Tile Image features if extension is enabled and supported
+    auto hasTileImage = hasExtension(VK_EXT_SHADER_TILE_IMAGE_EXTENSION_NAME);
+    vk::PhysicalDeviceShaderTileImageFeaturesEXT tileImageEnable{};
+    if (hasTileImage) {
+      if (tileImageSupported.shaderTileImageColorReadAccess)
+        tileImageEnable.shaderTileImageColorReadAccess = vk::True;
+      if (tileImageSupported.shaderTileImageDepthReadAccess)
+        tileImageEnable.shaderTileImageDepthReadAccess = vk::True;
+      if (tileImageSupported.shaderTileImageStencilReadAccess)
+        tileImageEnable.shaderTileImageStencilReadAccess = vk::True;
+    }
+#endif
+
+    // Prepare Acceleration Structure features if extension is enabled and supported
+    auto hasAccelerationStructure = hasExtension(VK_KHR_ACCELERATION_STRUCTURE_EXTENSION_NAME);
+    vk::PhysicalDeviceAccelerationStructureFeaturesKHR accelerationStructureEnable{};
+    if (hasAccelerationStructure && accelerationStructureSupported.accelerationStructure) {
+      accelerationStructureEnable.accelerationStructure = vk::True;
+    }
+
+    // Prepare Ray Query features if extension is enabled and supported
+    auto hasRayQuery = hasExtension(VK_KHR_RAY_QUERY_EXTENSION_NAME);
+    vk::PhysicalDeviceRayQueryFeaturesKHR rayQueryEnable{};
+    if (hasRayQuery && rayQuerySupported.rayQuery) {
+      rayQueryEnable.rayQuery = vk::True;
+    }
+
+    // Chain the feature structures together (build pNext chain explicitly)
+    // Base
+    features.pNext = &timelineSemaphoreFeatures;
+    timelineSemaphoreFeatures.pNext = &memoryModelFeatures;
+    memoryModelFeatures.pNext = &bufferDeviceAddressFeatures;
+    bufferDeviceAddressFeatures.pNext = &storage8BitFeatures;
+    storage8BitFeatures.pNext = &vulkan11Features; // link 1.1 first
+    vulkan11Features.pNext = &vulkan13Features; // then 1.3 features
+
+    // Build tail chain starting at Vulkan 1.3 features
+    void** tailNext = reinterpret_cast<void **>(&vulkan13Features.pNext);
+    if (descriptorIndexingEnabled) {
+      *tailNext = &indexingFeaturesEnable;
+      tailNext = reinterpret_cast<void **>(&indexingFeaturesEnable.pNext);
+    }
+    if (hasRobust2) {
+      *tailNext = &robust2Enable;
+      tailNext = reinterpret_cast<void **>(&robust2Enable.pNext);
+    }
+#if !defined(PLATFORM_ANDROID)
+    if (hasLocalRead) {
+      *tailNext = &localReadEnable;
+      tailNext = reinterpret_cast<void **>(&localReadEnable.pNext);
+    }
+    if (hasTileImage) {
+      *tailNext = &tileImageEnable;
+      tailNext = reinterpret_cast<void **>(&tileImageEnable.pNext);
+    }
+#endif
+    if (hasAccelerationStructure) {
+      *tailNext = &accelerationStructureEnable;
+      tailNext = reinterpret_cast<void **>(&accelerationStructureEnable.pNext);
+    }
+    if (hasRayQuery) {
+      *tailNext = &rayQueryEnable;
+      tailNext = reinterpret_cast<void **>(&rayQueryEnable.pNext);
+    }
+
+    // Record which features ended up enabled (for runtime decisions/tutorial diagnostics)
+    robustness2Enabled = hasRobust2 && (robust2Enable.robustBufferAccess2 == vk::True ||
+      robust2Enable.robustImageAccess2 == vk::True ||
+      robust2Enable.nullDescriptor == vk::True);
+#if !defined(PLATFORM_ANDROID)
+    dynamicRenderingLocalReadEnabled = hasLocalRead && (localReadEnable.dynamicRenderingLocalRead == vk::True);
+    shaderTileImageEnabled = hasTileImage && (tileImageEnable.shaderTileImageColorReadAccess == vk::True ||
+      tileImageEnable.shaderTileImageDepthReadAccess == vk::True ||
+      tileImageEnable.shaderTileImageStencilReadAccess == vk::True);
+#else
+    dynamicRenderingLocalReadEnabled = false;
+    shaderTileImageEnabled = false;
+#endif
+    accelerationStructureEnabled = hasAccelerationStructure;
+    rayQueryEnabled = hasRayQuery;
+
+    // One-time startup diagnostics (Ray Query + texture array indexing)
+    static bool printedFeatureDiag = false;
+    if (!printedFeatureDiag) {
+      printedFeatureDiag = true;
+      std::cout << "[DeviceFeatures] shaderSampledImageArrayDynamicIndexing="
+          << (features.features.shaderSampledImageArrayDynamicIndexing == vk::True ? "ON" : "OFF")
+          << ", shaderSampledImageArrayNonUniformIndexing="
+          << (indexingFeaturesEnable.shaderSampledImageArrayNonUniformIndexing == vk::True ? "ON" : "OFF")
+          << ", descriptorIndexingEnabled="
+          << (descriptorIndexingEnabled ? "true" : "false")
+          << "\n";
+    }
+
+    // Create a device. Device layers are deprecated and ignored, so we
+    // only configure extensions and features here; validation is enabled
+    // via instance layers.
+    vk::DeviceCreateInfo createInfo{
+      .pNext = &features,
+      .queueCreateInfoCount = static_cast<uint32_t>(queueCreateInfos.size()),
+      .pQueueCreateInfos = queueCreateInfos.data(),
+      .enabledExtensionCount = static_cast<uint32_t>(deviceExtensions.size()),
+      .ppEnabledExtensionNames = deviceExtensions.data(),
+      .pEnabledFeatures = nullptr // Using pNext for features
+    };
+
+    // Create the logical device
+    device = vk::raii::Device(physicalDevice, createInfo);
+
+    // Get queue handles
+    graphicsQueue = vk::raii::Queue(device, queueFamilyIndices.graphicsFamily.value(), 0);
+    presentQueue = vk::raii::Queue(device, queueFamilyIndices.presentFamily.value(), 0);
+    computeQueue = vk::raii::Queue(device, queueFamilyIndices.computeFamily.value(), 0);
+    transferQueue = vk::raii::Queue(device, queueFamilyIndices.transferFamily.value(), 0);
+
+    // Create global timeline semaphore for uploads early (needed before default texture creation)
+    vk::StructureChain<vk::SemaphoreCreateInfo, vk::SemaphoreTypeCreateInfo> timelineChain(
+      {},
+      {.semaphoreType = vk::SemaphoreType::eTimeline, .initialValue = 0});
+    uploadsTimeline = vk::raii::Semaphore(device, timelineChain.get<vk::SemaphoreCreateInfo>());
+    uploadTimelineLastSubmitted.store(0, std::memory_order_relaxed);
+    lastCriticalUploadValue.store(0, std::memory_order_relaxed);
+
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create logical device: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Report whether every layer listed in `validationLayers` appears in the set returned by
+// `context.enumerateInstanceLayerProperties()`.
+// Returns false as soon as one requested layer is missing, true otherwise (including
+// when no layers are requested).
+bool Renderer::checkValidationLayerSupport() const {
+  // Snapshot of the instance layers currently reported as available
+  const std::vector<vk::LayerProperties> installedLayers = context.enumerateInstanceLayerProperties();
+
+  // Exact (case-sensitive) name lookup over the snapshot
+  const auto isInstalled = [&installedLayers](const char* wanted) {
+    for (const auto& candidate : installedLayers) {
+      if (strcmp(wanted, candidate.layerName) == 0) {
+        return true;
+      }
+    }
+    return false;
+  };
+
+  // A single missing layer is enough to fail the check
+  for (const char* requested : validationLayers) {
+    if (!isInstalled(requested)) {
+      return false;
+    }
+  }
+
+  return true;
+}
\ No newline at end of file
diff --git a/attachments/advanced_gltf/renderer_pipelines.cpp b/attachments/advanced_gltf/renderer_pipelines.cpp
new file mode 100644
index 000000000..55fdcf8c7
--- /dev/null
+++ b/attachments/advanced_gltf/renderer_pipelines.cpp
@@ -0,0 +1,1542 @@
+/* Copyright (c) 2025 Holochip Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 the "License";
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <array>
+#include <fstream>
+#include <iostream>
+#include <chrono>
+#include <sstream>
+#include <vector>
+#include <unordered_map>
+#include <string>
+
+#include "mesh_component.h"
+
+#include "renderer.h"
+
+#include "renderer_advanced_types.h"
+
+// This file contains pipeline-related methods from the Renderer class
+
+// Create `descriptorSetLayout`, the two-binding layout used by the basic textured pipeline:
+//   binding 0 - uniform buffer, visible to the vertex and fragment stages
+//   binding 1 - combined image sampler, visible to the fragment stage
+// When `descriptorIndexingEnabled` is set, both bindings are flagged
+// UPDATE_AFTER_BIND | UPDATE_UNUSED_WHILE_PENDING and the layout itself is created with
+// the update-after-bind-pool flag.
+// Returns true on success; on any exception the error is logged to stderr and false is returned.
+bool Renderer::createDescriptorSetLayout() {
+  try {
+    const std::array<vk::DescriptorSetLayoutBinding, 2> layoutBindings = {
+      // Binding 0: uniform buffer
+      vk::DescriptorSetLayoutBinding{
+        .binding = 0,
+        .descriptorType = vk::DescriptorType::eUniformBuffer,
+        .descriptorCount = 1,
+        .stageFlags = vk::ShaderStageFlagBits::eVertex | vk::ShaderStageFlagBits::eFragment,
+        .pImmutableSamplers = nullptr
+      },
+      // Binding 1: texture sampler
+      vk::DescriptorSetLayoutBinding{
+        .binding = 1,
+        .descriptorType = vk::DescriptorType::eCombinedImageSampler,
+        .descriptorCount = 1,
+        .stageFlags = vk::ShaderStageFlagBits::eFragment,
+        .pImmutableSamplers = nullptr
+      }
+    };
+
+    // Per-binding flags stay zero-initialised (and unchained) unless descriptor indexing is on.
+    // Both objects live until the layout is created, so the pNext chain never dangles.
+    std::array<vk::DescriptorBindingFlags, 2> perBindingFlags{};
+    vk::DescriptorSetLayoutBindingFlagsCreateInfo perBindingFlagsInfo{};
+
+    vk::DescriptorSetLayoutCreateInfo createInfo{};
+    createInfo.bindingCount = static_cast<uint32_t>(layoutBindings.size());
+    createInfo.pBindings = layoutBindings.data();
+
+    if (descriptorIndexingEnabled) {
+      perBindingFlags.fill(vk::DescriptorBindingFlagBits::eUpdateAfterBind | vk::DescriptorBindingFlagBits::eUpdateUnusedWhilePending);
+      perBindingFlagsInfo.bindingCount = static_cast<uint32_t>(perBindingFlags.size());
+      perBindingFlagsInfo.pBindingFlags = perBindingFlags.data();
+
+      createInfo.flags |= vk::DescriptorSetLayoutCreateFlagBits::eUpdateAfterBindPool;
+      createInfo.pNext = &perBindingFlagsInfo;
+    }
+
+    descriptorSetLayout = vk::raii::DescriptorSetLayout(device, createInfo);
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create descriptor set layout: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Create the two descriptor set layouts used by the PBR path.
+//
+// `pbrDescriptorSetLayout` (14 bindings, one descriptor each):
+//    0       uniform buffer (vertex + fragment)
+//    1..5    combined image samplers: base color, metallic-roughness, normal, occlusion, emissive
+//    6       light storage buffer
+//    7, 8    Forward+ tile headers / tile light indices storage buffers
+//    9       fragment debug output storage buffer
+//    10      reflection texture (combined image sampler)
+//    11      TLAS (acceleration structure) for ray-query shadows
+//    12, 13  ray-query geometry info / material storage buffers
+//   Everything except binding 0 is visible to the fragment stage only.
+//
+// `transparentDescriptorSetLayout`: a single combined image sampler at binding 0
+// carrying the scene color texture.
+//
+// Returns true on success; on any exception the error is logged to stderr and false is returned.
+bool Renderer::createPBRDescriptorSetLayout() {
+  try {
+    constexpr std::size_t kPbrBindingCount = 14;
+
+    const vk::ShaderStageFlags fragmentOnly = vk::ShaderStageFlagBits::eFragment;
+    const vk::ShaderStageFlags vertexAndFragment = vk::ShaderStageFlagBits::eVertex | vk::ShaderStageFlagBits::eFragment;
+    const vk::DescriptorBindingFlags updateAfterBind =
+      vk::DescriptorBindingFlagBits::eUpdateAfterBind | vk::DescriptorBindingFlagBits::eUpdateUnusedWhilePending;
+
+    // Every binding in this function holds exactly one descriptor and no immutable samplers
+    const auto makeBinding = [](uint32_t slot, vk::DescriptorType type, vk::ShaderStageFlags stages) {
+      return vk::DescriptorSetLayoutBinding{
+        .binding = slot,
+        .descriptorType = type,
+        .descriptorCount = 1,
+        .stageFlags = stages,
+        .pImmutableSamplers = nullptr
+      };
+    };
+
+    // ---- Set 0: PBR material / lighting layout ----
+    const std::array<vk::DescriptorSetLayoutBinding, kPbrBindingCount> pbrBindings = {
+      makeBinding(0, vk::DescriptorType::eUniformBuffer, vertexAndFragment),        // Uniform buffer (UBO)
+      makeBinding(1, vk::DescriptorType::eCombinedImageSampler, fragmentOnly),      // Base color map and sampler
+      makeBinding(2, vk::DescriptorType::eCombinedImageSampler, fragmentOnly),      // Metallic roughness map and sampler
+      makeBinding(3, vk::DescriptorType::eCombinedImageSampler, fragmentOnly),      // Normal map and sampler
+      makeBinding(4, vk::DescriptorType::eCombinedImageSampler, fragmentOnly),      // Occlusion map and sampler
+      makeBinding(5, vk::DescriptorType::eCombinedImageSampler, fragmentOnly),      // Emissive map and sampler
+      makeBinding(6, vk::DescriptorType::eStorageBuffer, fragmentOnly),             // Light storage buffer (shadows removed)
+      makeBinding(7, vk::DescriptorType::eStorageBuffer, fragmentOnly),             // Forward+ tile headers SSBO
+      makeBinding(8, vk::DescriptorType::eStorageBuffer, fragmentOnly),             // Forward+ tile light indices SSBO
+      makeBinding(9, vk::DescriptorType::eStorageBuffer, fragmentOnly),             // Fragment debug output buffer (optional)
+      makeBinding(10, vk::DescriptorType::eCombinedImageSampler, fragmentOnly),     // Reflection texture (planar reflections)
+      makeBinding(11, vk::DescriptorType::eAccelerationStructureKHR, fragmentOnly), // TLAS (ray-query shadows in raster fragment shader)
+      makeBinding(12, vk::DescriptorType::eStorageBuffer, fragmentOnly),            // Ray-query geometry info (per-instance addresses + material indices)
+      makeBinding(13, vk::DescriptorType::eStorageBuffer, fragmentOnly)             // Ray-query material buffer (PBR material properties)
+    };
+
+    // Per-binding flags stay zero unless descriptor indexing is enabled; in that case only
+    // bindings 0 (UBO), 1 (base color) and 10 (reflection texture) are marked update-after-bind.
+    std::array<vk::DescriptorBindingFlags, kPbrBindingCount> pbrBindingFlags{};
+    vk::DescriptorSetLayoutBindingFlagsCreateInfo pbrBindingFlagsInfo{};
+
+    vk::DescriptorSetLayoutCreateInfo pbrLayoutInfo{};
+    pbrLayoutInfo.bindingCount = static_cast<uint32_t>(pbrBindings.size());
+    pbrLayoutInfo.pBindings = pbrBindings.data();
+
+    if (descriptorIndexingEnabled) {
+      for (const std::size_t slot : {std::size_t{0}, std::size_t{1}, std::size_t{10}}) {
+        pbrBindingFlags[slot] = updateAfterBind;
+      }
+      pbrBindingFlagsInfo.bindingCount = static_cast<uint32_t>(pbrBindingFlags.size());
+      pbrBindingFlagsInfo.pBindingFlags = pbrBindingFlags.data();
+
+      pbrLayoutInfo.flags |= vk::DescriptorSetLayoutCreateFlagBits::eUpdateAfterBindPool;
+      pbrLayoutInfo.pNext = &pbrBindingFlagsInfo;
+    }
+
+    pbrDescriptorSetLayout = vk::raii::DescriptorSetLayout(device, pbrLayoutInfo);
+
+    // ---- Set 1: scene color input for the transparent passes ----
+    const vk::DescriptorSetLayoutBinding sceneColorBinding =
+      makeBinding(0, vk::DescriptorType::eCombinedImageSampler, fragmentOnly);
+
+    // Declared at function scope so the pNext chain is still alive when the layout is created
+    vk::DescriptorBindingFlags sceneColorFlags{};
+    vk::DescriptorSetLayoutBindingFlagsCreateInfo sceneColorFlagsInfo{};
+
+    vk::DescriptorSetLayoutCreateInfo transparentLayoutInfo{};
+    transparentLayoutInfo.bindingCount = 1;
+    transparentLayoutInfo.pBindings = &sceneColorBinding;
+
+    if (descriptorIndexingEnabled) {
+      // Make this sampler binding update-after-bind safe as well (optional)
+      sceneColorFlags = updateAfterBind;
+      sceneColorFlagsInfo.bindingCount = 1;
+      sceneColorFlagsInfo.pBindingFlags = &sceneColorFlags;
+
+      transparentLayoutInfo.flags |= vk::DescriptorSetLayoutCreateFlagBits::eUpdateAfterBindPool;
+      transparentLayoutInfo.pNext = &sceneColorFlagsInfo;
+    }
+
+    transparentDescriptorSetLayout = vk::raii::DescriptorSetLayout(device, transparentLayoutInfo);
+
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create PBR descriptor set layout: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Create the main textured-mesh graphics pipeline.
+//
+// Loads "shaders/texturedMesh.spv" and uses its VSMain / PSMain entry points, builds
+// `pipelineLayout` from `descriptorSetLayout` (no push constants), and creates
+// `graphicsPipeline` without a render pass: attachment formats are supplied through
+// `mainPipelineRenderingCreateInfo` chained into the pipeline create info. Viewport and
+// scissor are dynamic state.
+//
+// Returns true on success; on any exception the error is logged to stderr and false is returned.
+bool Renderer::createGraphicsPipeline() {
+  try {
+    // Read shader code
+    auto shaderCode = readFile("shaders/texturedMesh.spv");
+
+    // Create shader module (a single module provides both entry points)
+    vk::raii::ShaderModule shaderModule = createShaderModule(shaderCode);
+
+    // Create shader stage info
+    vk::PipelineShaderStageCreateInfo vertShaderStageInfo{
+      .stage = vk::ShaderStageFlagBits::eVertex,
+      .module = *shaderModule,
+      .pName = "VSMain"
+    };
+
+    vk::PipelineShaderStageCreateInfo fragShaderStageInfo{
+      .stage = vk::ShaderStageFlagBits::eFragment,
+      .module = *shaderModule,
+      .pName = "PSMain"
+    };
+
+    vk::PipelineShaderStageCreateInfo shaderStages[] = {vertShaderStageInfo, fragShaderStageInfo};
+
+    // Create vertex input info with instancing support:
+    // binding descriptions come from Vertex and InstanceData respectively
+    auto vertexBindingDescription = Vertex::getBindingDescription();
+    auto instanceBindingDescription = InstanceData::getBindingDescription();
+    std::array<vk::VertexInputBindingDescription, 2> bindingDescriptions = {
+      vertexBindingDescription,
+      instanceBindingDescription
+    };
+
+    auto vertexAttributeDescriptions = Vertex::getAttributeDescriptions();
+    auto instanceAttributeDescriptions = InstanceData::getAttributeDescriptions();
+
+    // Combine all attribute descriptions (no duplicates)
+    std::vector<vk::VertexInputAttributeDescription> allAttributeDescriptions;
+    allAttributeDescriptions.insert(allAttributeDescriptions.end(), vertexAttributeDescriptions.begin(), vertexAttributeDescriptions.end());
+    allAttributeDescriptions.insert(allAttributeDescriptions.end(), instanceAttributeDescriptions.begin(), instanceAttributeDescriptions.end());
+
+    // Note: materialIndex attribute (Location 11) is not used by current shaders
+
+    vk::PipelineVertexInputStateCreateInfo vertexInputInfo{
+      .vertexBindingDescriptionCount = static_cast<uint32_t>(bindingDescriptions.size()),
+      .pVertexBindingDescriptions = bindingDescriptions.data(),
+      .vertexAttributeDescriptionCount = static_cast<uint32_t>(allAttributeDescriptions.size()),
+      .pVertexAttributeDescriptions = allAttributeDescriptions.data()
+    };
+
+    // Create input assembly info
+    vk::PipelineInputAssemblyStateCreateInfo inputAssembly{
+      .topology = vk::PrimitiveTopology::eTriangleList,
+      .primitiveRestartEnable = VK_FALSE
+    };
+
+    // Create viewport state info (counts only; the actual viewport/scissor are dynamic state)
+    vk::PipelineViewportStateCreateInfo viewportState{
+      .viewportCount = 1,
+      .scissorCount = 1
+    };
+
+    // Create rasterization state info.
+    // Face culling is disabled to avoid disappearing geometry when instance/model
+    // transforms flip winding (ensures PASS 1 actually shades pixels).
+    vk::PipelineRasterizationStateCreateInfo rasterizer{
+      .depthClampEnable = VK_FALSE,
+      .rasterizerDiscardEnable = VK_FALSE,
+      .polygonMode = vk::PolygonMode::eFill,
+      .cullMode = vk::CullModeFlagBits::eNone,
+      .frontFace = vk::FrontFace::eCounterClockwise,
+      .depthBiasEnable = VK_FALSE,
+      .lineWidth = 1.0f
+    };
+
+    // Create multisample state info
+    vk::PipelineMultisampleStateCreateInfo multisampling{
+      .rasterizationSamples = vk::SampleCountFlagBits::e1,
+      .sampleShadingEnable = VK_FALSE
+    };
+
+    // Create depth stencil state info
+    vk::PipelineDepthStencilStateCreateInfo depthStencil{
+      .depthTestEnable = VK_TRUE,
+      .depthWriteEnable = VK_TRUE,
+      // Use LessOrEqual so that the main shading pass works after a depth pre-pass
+      .depthCompareOp = vk::CompareOp::eLessOrEqual,
+      .depthBoundsTestEnable = VK_FALSE,
+      .stencilTestEnable = VK_FALSE
+    };
+
+    // Create a color blend attachment state (blending off, all channels written)
+    vk::PipelineColorBlendAttachmentState colorBlendAttachment{
+      .blendEnable = VK_FALSE,
+      .colorWriteMask = vk::ColorComponentFlagBits::eR | vk::ColorComponentFlagBits::eG | vk::ColorComponentFlagBits::eB | vk::ColorComponentFlagBits::eA
+    };
+
+    // Create color blend state info
+    vk::PipelineColorBlendStateCreateInfo colorBlending{
+      .logicOpEnable = VK_FALSE,
+      .logicOp = vk::LogicOp::eCopy,
+      .attachmentCount = 1,
+      .pAttachments = &colorBlendAttachment
+    };
+
+    // Create dynamic state info
+    std::vector dynamicStates = {
+      vk::DynamicState::eViewport,
+      vk::DynamicState::eScissor
+    };
+
+    vk::PipelineDynamicStateCreateInfo dynamicState{
+      .dynamicStateCount = static_cast<uint32_t>(dynamicStates.size()),
+      .pDynamicStates = dynamicStates.data()
+    };
+
+    // Create pipeline layout (single descriptor set, no push constants)
+    vk::PipelineLayoutCreateInfo pipelineLayoutInfo{
+      .setLayoutCount = 1,
+      .pSetLayouts = &*descriptorSetLayout,
+      .pushConstantRangeCount = 0,
+      .pPushConstantRanges = nullptr
+    };
+
+    pipelineLayout = vk::raii::PipelineLayout(device, pipelineLayoutInfo);
+
+    // Create pipeline rendering info
+    vk::Format depthFormat = findDepthFormat();
+    std::cout << "Creating main graphics pipeline with depth format: " << static_cast<int>(depthFormat) << std::endl;
+
+    // Initialize member variable for proper lifetime management.
+    // It stores a pointer to swapChainImageFormat rather than a copy of the format.
+    mainPipelineRenderingCreateInfo = vk::PipelineRenderingCreateInfo{
+      .colorAttachmentCount = 1,
+      .pColorAttachmentFormats = &swapChainImageFormat,
+      .depthAttachmentFormat = depthFormat,
+      .stencilAttachmentFormat = vk::Format::eUndefined
+    };
+
+    // Create the graphics pipeline; renderPass stays null because the attachment
+    // formats come from the rendering info chained through pNext
+    vk::GraphicsPipelineCreateInfo pipelineInfo{
+      .pNext = &mainPipelineRenderingCreateInfo,
+      .flags = vk::PipelineCreateFlags{},
+      .stageCount = 2,
+      .pStages = shaderStages,
+      .pVertexInputState = &vertexInputInfo,
+      .pInputAssemblyState = &inputAssembly,
+      .pViewportState = &viewportState,
+      .pRasterizationState = &rasterizer,
+      .pMultisampleState = &multisampling,
+      .pDepthStencilState = &depthStencil,
+      .pColorBlendState = &colorBlending,
+      .pDynamicState = &dynamicState,
+      .layout = *pipelineLayout,
+      .renderPass = nullptr,
+      .subpass = 0,
+      .basePipelineHandle = nullptr,
+      .basePipelineIndex = -1
+    };
+
+    graphicsPipeline = vk::raii::Pipeline(device, nullptr, pipelineInfo);
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create graphics pipeline: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Create the PBR pipeline family from shaders/pbr.spv: the opaque,
+// opaque-after-prepass, reflection, alpha-blended and glass pipelines, plus
+// the two pipeline layouts they are built against.
+// Returns true on success. Any std::exception raised along the way is logged
+// to stderr and reported as false.
+bool Renderer::createPBRPipeline() {
+  try {
+    // Create PBR descriptor set layout
+    if (!createPBRDescriptorSetLayout()) {
+      return false;
+    }
+
+    // Read shader code
+    auto shaderCode = readFile("shaders/pbr.spv");
+
+    // A single shader module carries every entry point used below
+    vk::raii::ShaderModule shaderModule = createShaderModule(shaderCode);
+
+    // Create shader stage info
+    vk::PipelineShaderStageCreateInfo vertShaderStageInfo{
+      .stage = vk::ShaderStageFlagBits::eVertex,
+      .module = *shaderModule,
+      .pName = "VSMain"
+    };
+
+    vk::PipelineShaderStageCreateInfo fragShaderStageInfo{
+      .stage = vk::ShaderStageFlagBits::eFragment,
+      .module = *shaderModule,
+      .pName = "PSMain"
+    };
+
+    // Fragment entry point specialized for architectural glass
+    vk::PipelineShaderStageCreateInfo fragGlassStageInfo{
+      .stage = vk::ShaderStageFlagBits::eFragment,
+      .module = *shaderModule,
+      .pName = "GlassPSMain"
+    };
+
+    vk::PipelineShaderStageCreateInfo shaderStages[] = {vertShaderStageInfo, fragShaderStageInfo};
+    vk::PipelineShaderStageCreateInfo glassStages[] = {vertShaderStageInfo, fragGlassStageInfo};
+
+    // Define vertex and instance binding descriptions
+    auto vertexBindingDescription = Vertex::getBindingDescription();
+    auto instanceBindingDescription = InstanceData::getBindingDescription();
+    std::array<vk::VertexInputBindingDescription, 2> bindingDescriptions = {
+      vertexBindingDescription,
+      instanceBindingDescription
+    };
+
+    // Define vertex and instance attribute descriptions
+    auto vertexAttributeDescriptions = Vertex::getAttributeDescriptions();
+    auto instanceModelMatrixAttributes = InstanceData::getModelMatrixAttributeDescriptions();
+    auto instanceNormalMatrixAttributes = InstanceData::getNormalMatrixAttributeDescriptions();
+
+    // Combine all attribute descriptions
+    std::vector<vk::VertexInputAttributeDescription> allAttributeDescriptions;
+    allAttributeDescriptions.insert(allAttributeDescriptions.end(), vertexAttributeDescriptions.begin(), vertexAttributeDescriptions.end());
+    allAttributeDescriptions.insert(allAttributeDescriptions.end(), instanceModelMatrixAttributes.begin(), instanceModelMatrixAttributes.end());
+    allAttributeDescriptions.insert(allAttributeDescriptions.end(), instanceNormalMatrixAttributes.begin(), instanceNormalMatrixAttributes.end());
+
+    vk::PipelineVertexInputStateCreateInfo vertexInputInfo{
+      .vertexBindingDescriptionCount = static_cast<uint32_t>(bindingDescriptions.size()),
+      .pVertexBindingDescriptions = bindingDescriptions.data(),
+      .vertexAttributeDescriptionCount = static_cast<uint32_t>(allAttributeDescriptions.size()),
+      .pVertexAttributeDescriptions = allAttributeDescriptions.data()
+    };
+
+    // Create input assembly info
+    vk::PipelineInputAssemblyStateCreateInfo inputAssembly{
+      .topology = vk::PrimitiveTopology::eTriangleList,
+      .primitiveRestartEnable = VK_FALSE
+    };
+
+    // Viewport and scissor are dynamic; only their counts are fixed here
+    vk::PipelineViewportStateCreateInfo viewportState{
+      .viewportCount = 1,
+      .scissorCount = 1
+    };
+
+    // Base rasterization state; each variant below copies it and picks a cull mode
+    vk::PipelineRasterizationStateCreateInfo rasterizer{
+      .depthClampEnable = VK_FALSE,
+      .rasterizerDiscardEnable = VK_FALSE,
+      .polygonMode = vk::PolygonMode::eFill,
+      .cullMode = vk::CullModeFlagBits::eNone,
+      .frontFace = vk::FrontFace::eCounterClockwise,
+      .depthBiasEnable = VK_FALSE,
+      .lineWidth = 1.0f
+    };
+
+    // Create multisample state info
+    vk::PipelineMultisampleStateCreateInfo multisampling{
+      .rasterizationSamples = vk::SampleCountFlagBits::e1,
+      .sampleShadingEnable = VK_FALSE
+    };
+
+    // Base depth/stencil state; variants below copy and adjust it
+    vk::PipelineDepthStencilStateCreateInfo depthStencil{
+      .depthTestEnable = VK_TRUE,
+      .depthWriteEnable = VK_TRUE,
+      .depthCompareOp = vk::CompareOp::eLess,
+      .depthBoundsTestEnable = VK_FALSE,
+      .stencilTestEnable = VK_FALSE
+    };
+
+    // Base color blend attachment (blending off, all channels written)
+    vk::PipelineColorBlendAttachmentState colorBlendAttachment{
+      .blendEnable = VK_FALSE,
+      .colorWriteMask = vk::ColorComponentFlagBits::eR | vk::ColorComponentFlagBits::eG | vk::ColorComponentFlagBits::eB | vk::ColorComponentFlagBits::eA
+    };
+
+    // Create dynamic state info
+    std::vector dynamicStates = {
+      vk::DynamicState::eViewport,
+      vk::DynamicState::eScissor
+    };
+
+    vk::PipelineDynamicStateCreateInfo dynamicState{
+      .dynamicStateCount = static_cast<uint32_t>(dynamicStates.size()),
+      .pDynamicStates = dynamicStates.data()
+    };
+
+    // Create push constant range for material properties
+    vk::PushConstantRange pushConstantRange{
+      .stageFlags = vk::ShaderStageFlagBits::eFragment,
+      .offset = 0,
+      .size = sizeof(MaterialProperties)
+    };
+
+    // Both pipeline layouts are created with the same two descriptor sets
+    // (PBR set 0 + scene color set 1) and the same push constant range, so a
+    // single create-info serves both.
+    std::array<vk::DescriptorSetLayout, 2> transparentSetLayouts = {*pbrDescriptorSetLayout, *transparentDescriptorSetLayout};
+    vk::PipelineLayoutCreateInfo pipelineLayoutInfo{
+      .setLayoutCount = static_cast<uint32_t>(transparentSetLayouts.size()),
+      .pSetLayouts = transparentSetLayouts.data(),
+      .pushConstantRangeCount = 1,
+      .pPushConstantRanges = &pushConstantRange
+    };
+
+    pbrPipelineLayout = vk::raii::PipelineLayout(device, pipelineLayoutInfo);
+    pbrTransparentPipelineLayout = vk::raii::PipelineLayout(device, pipelineLayoutInfo);
+
+    // Create pipeline rendering info
+    vk::Format depthFormat = findDepthFormat();
+
+    // Stored in a member variable so the struct outlives this function
+    pbrPipelineRenderingCreateInfo = vk::PipelineRenderingCreateInfo{
+      .colorAttachmentCount = 1,
+      .pColorAttachmentFormats = &swapChainImageFormat,
+      .depthAttachmentFormat = depthFormat,
+      .stencilAttachmentFormat = vk::Format::eUndefined
+    };
+
+    // Helper: fill a graphics pipeline create-info with the state shared by
+    // every PBR variant. Only the stages, rasterizer, depth/stencil, blend
+    // state and layout vary. The result stores the addresses of the arguments,
+    // so they must stay alive until the pipeline has been created.
+    auto makePipelineInfo = [&](const vk::PipelineShaderStageCreateInfo* stages,
+                                const vk::PipelineRasterizationStateCreateInfo& rasterState,
+                                const vk::PipelineDepthStencilStateCreateInfo& depthState,
+                                const vk::PipelineColorBlendStateCreateInfo& blendState,
+                                vk::PipelineLayout layout) {
+      return vk::GraphicsPipelineCreateInfo{
+        .pNext = &pbrPipelineRenderingCreateInfo,
+        .flags = vk::PipelineCreateFlags{},
+        .stageCount = 2,
+        .pStages = stages,
+        .pVertexInputState = &vertexInputInfo,
+        .pInputAssemblyState = &inputAssembly,
+        .pViewportState = &viewportState,
+        .pRasterizationState = &rasterState,
+        .pMultisampleState = &multisampling,
+        .pDepthStencilState = &depthState,
+        .pColorBlendState = &blendState,
+        .pDynamicState = &dynamicState,
+        .layout = layout,
+        .renderPass = nullptr,
+        .subpass = 0,
+        .basePipelineHandle = nullptr,
+        .basePipelineIndex = -1
+      };
+    };
+
+    // Opaque state: no blending, depth writes enabled
+    vk::PipelineColorBlendAttachmentState opaqueBlendAttachment = colorBlendAttachment;
+    opaqueBlendAttachment.blendEnable = VK_FALSE;
+    vk::PipelineColorBlendStateCreateInfo colorBlendingOpaque{
+      .logicOpEnable = VK_FALSE,
+      .logicOp = vk::LogicOp::eCopy,
+      .attachmentCount = 1,
+      .pAttachments = &opaqueBlendAttachment
+    };
+    vk::PipelineDepthStencilStateCreateInfo depthStencilOpaque = depthStencil;
+    depthStencilOpaque.depthWriteEnable = VK_TRUE;
+
+    // Back-face culling for the generic PBR pipelines
+    vk::PipelineRasterizationStateCreateInfo rasterizerBack = rasterizer;
+    rasterizerBack.cullMode = vk::CullModeFlagBits::eBack;
+
+    // For architectural glass we often want to see both the inner and outer
+    // walls of thin shells (e.g., bar glasses viewed from above). Use
+    // no culling for the glass pipeline to render both sides, while
+    // keeping back-face culling for the generic PBR pipelines.
+    vk::PipelineRasterizationStateCreateInfo rasterizerGlass = rasterizer;
+    rasterizerGlass.cullMode = vk::CullModeFlagBits::eNone;
+
+    // 1) Opaque PBR pipeline
+    const vk::GraphicsPipelineCreateInfo opaquePipelineInfo =
+      makePipelineInfo(shaderStages, rasterizerBack, depthStencilOpaque, colorBlendingOpaque, *pbrPipelineLayout);
+    pbrGraphicsPipeline = vk::raii::Pipeline(device, nullptr, opaquePipelineInfo);
+
+    // 1b) Opaque PBR pipeline variant for color pass after a depth pre-pass.
+    // Depth writes disabled (read-only) and compare against pre-pass depth.
+    vk::PipelineDepthStencilStateCreateInfo depthStencilAfterPrepass = depthStencil;
+    depthStencilAfterPrepass.depthTestEnable = VK_TRUE;
+    depthStencilAfterPrepass.depthWriteEnable = VK_FALSE;
+    depthStencilAfterPrepass.depthCompareOp = vk::CompareOp::eEqual;
+
+    const vk::GraphicsPipelineCreateInfo opaqueAfterPrepassInfo =
+      makePipelineInfo(shaderStages, rasterizerBack, depthStencilAfterPrepass, colorBlendingOpaque, *pbrPipelineLayout);
+    pbrPrepassGraphicsPipeline = vk::raii::Pipeline(device, nullptr, opaqueAfterPrepassInfo);
+
+    // 1c) Reflection PBR pipeline for mirrored off-screen pass (cull none to avoid winding issues)
+    vk::PipelineRasterizationStateCreateInfo rasterizerReflection = rasterizer;
+    rasterizerReflection.cullMode = vk::CullModeFlagBits::eNone;
+
+    const vk::GraphicsPipelineCreateInfo reflectionPipelineInfo =
+      makePipelineInfo(shaderStages, rasterizerReflection, depthStencilOpaque, colorBlendingOpaque, *pbrPipelineLayout);
+    pbrReflectionGraphicsPipeline = vk::raii::Pipeline(device, nullptr, reflectionPipelineInfo);
+
+    // 2) Blended PBR pipeline (straight alpha blending, depth writes disabled for translucency)
+    vk::PipelineColorBlendAttachmentState blendedAttachment = colorBlendAttachment;
+    blendedAttachment.blendEnable = VK_TRUE;
+    // Straight alpha blending: out.rgb = src.rgb*src.a + dst.rgb*(1-src.a)
+    blendedAttachment.srcColorBlendFactor = vk::BlendFactor::eSrcAlpha;
+    blendedAttachment.dstColorBlendFactor = vk::BlendFactor::eOneMinusSrcAlpha;
+    // Alpha channel keeps destination scaled by inverse src alpha
+    blendedAttachment.srcAlphaBlendFactor = vk::BlendFactor::eOne;
+    blendedAttachment.dstAlphaBlendFactor = vk::BlendFactor::eOneMinusSrcAlpha;
+    vk::PipelineColorBlendStateCreateInfo colorBlendingBlended{.attachmentCount = 1, .pAttachments = &blendedAttachment};
+    vk::PipelineDepthStencilStateCreateInfo depthStencilBlended = depthStencil;
+    depthStencilBlended.depthWriteEnable = VK_FALSE;
+    depthStencilBlended.depthCompareOp = vk::CompareOp::eLessOrEqual;
+
+    // Use back-face culling for the blended pipeline to avoid rendering both
+    // front and back faces of thin transparent geometry, which can cause
+    // flickering as the camera rotates due to overlapping transparent
+    // surfaces passing the depth test.
+    const vk::GraphicsPipelineCreateInfo blendedPipelineInfo =
+      makePipelineInfo(shaderStages, rasterizerBack, depthStencilBlended, colorBlendingBlended, *pbrTransparentPipelineLayout);
+    pbrBlendGraphicsPipeline = vk::raii::Pipeline(device, nullptr, blendedPipelineInfo);
+
+    // 3) Glass pipeline (architectural glass) - uses the same vertex input and
+    // descriptor layouts, but a dedicated fragment shader entry point
+    // (GlassPSMain) and no culling.
+    const vk::GraphicsPipelineCreateInfo glassPipelineInfo =
+      makePipelineInfo(glassStages, rasterizerGlass, depthStencilBlended, colorBlendingBlended, *pbrTransparentPipelineLayout);
+    glassGraphicsPipeline = vk::raii::Pipeline(device, nullptr, glassPipelineInfo);
+
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create PBR pipeline: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Create the fullscreen composite pipeline, which samples the off-screen color
+// image and writes the result to the swapchain format.
+// Returns true on success; exceptions are logged to stderr and yield false.
+bool Renderer::createCompositePipeline() {
+  try {
+    // The transparent descriptor set layout (binding 0 = combined image
+    // sampler) is reused here. If it is still null, run the PBR pipeline path
+    // first so that it gets created.
+    if (*transparentDescriptorSetLayout == nullptr && !createPBRPipeline()) {
+      return false;
+    }
+
+    // One SPIR-V module provides both entry points
+    auto compositeSpirv = readFile("shaders/composite.spv");
+    vk::raii::ShaderModule compositeModule = createShaderModule(compositeSpirv);
+
+    // Shader stages: [0] vertex, [1] fragment
+    vk::PipelineShaderStageCreateInfo stageInfos[2]{};
+    stageInfos[0].stage = vk::ShaderStageFlagBits::eVertex;
+    stageInfos[0].module = *compositeModule;
+    stageInfos[0].pName = "VSMain";
+    stageInfos[1].stage = vk::ShaderStageFlagBits::eFragment;
+    stageInfos[1].module = *compositeModule;
+    stageInfos[1].pName = "PSMain";
+
+    // No vertex buffers: the fullscreen triangle is generated from SV_VertexID
+    vk::PipelineVertexInputStateCreateInfo emptyVertexInput{};
+
+    vk::PipelineInputAssemblyStateCreateInfo assemblyState{
+      .topology = vk::PrimitiveTopology::eTriangleList
+    };
+
+    // Viewport and scissor are supplied dynamically
+    vk::PipelineViewportStateCreateInfo viewportInfo{
+      .viewportCount = 1,
+      .scissorCount = 1
+    };
+
+    vk::PipelineRasterizationStateCreateInfo rasterState{
+      .polygonMode = vk::PolygonMode::eFill,
+      .cullMode = vk::CullModeFlagBits::eNone,
+      .frontFace = vk::FrontFace::eCounterClockwise,
+      .lineWidth = 1.0f
+    };
+
+    vk::PipelineMultisampleStateCreateInfo sampleState{
+      .rasterizationSamples = vk::SampleCountFlagBits::e1
+    };
+
+    // Depth is neither tested nor written
+    vk::PipelineDepthStencilStateCreateInfo depthState{
+      .depthTestEnable = VK_FALSE,
+      .depthWriteEnable = VK_FALSE
+    };
+
+    // No blending (we clear swapchain before this and blend transparents later)
+    const vk::ColorComponentFlags writeAllChannels =
+      vk::ColorComponentFlagBits::eR | vk::ColorComponentFlagBits::eG |
+      vk::ColorComponentFlagBits::eB | vk::ColorComponentFlagBits::eA;
+    vk::PipelineColorBlendAttachmentState blendAttachment{
+      .blendEnable = VK_FALSE,
+      .colorWriteMask = writeAllChannels
+    };
+    vk::PipelineColorBlendStateCreateInfo blendState{
+      .attachmentCount = 1,
+      .pAttachments = &blendAttachment
+    };
+
+    std::array dynamicList = {vk::DynamicState::eViewport, vk::DynamicState::eScissor};
+    vk::PipelineDynamicStateCreateInfo dynamicInfo{
+      .dynamicStateCount = static_cast<uint32_t>(dynamicList.size()),
+      .pDynamicStates = dynamicList.data()
+    };
+
+    // Pipeline layout: single set (combined image sampler) + push constants for exposure/gamma/srgb flag
+    vk::DescriptorSetLayout compositeSetLayouts[] = {*transparentDescriptorSetLayout};
+    vk::PushConstantRange compositePushRange{
+      .stageFlags = vk::ShaderStageFlagBits::eFragment,
+      .offset = 0,
+      .size = 16 // matches struct Push in composite.slang
+    };
+    vk::PipelineLayoutCreateInfo layoutCreateInfo{
+      .setLayoutCount = 1,
+      .pSetLayouts = compositeSetLayouts,
+      .pushConstantRangeCount = 1,
+      .pPushConstantRanges = &compositePushRange
+    };
+    compositePipelineLayout = vk::raii::PipelineLayout(device, layoutCreateInfo);
+
+    // Dynamic rendering: one color attachment in the swapchain format, no depth/stencil
+    compositePipelineRenderingCreateInfo = vk::PipelineRenderingCreateInfo{
+      .colorAttachmentCount = 1,
+      .pColorAttachmentFormats = &swapChainImageFormat,
+      .depthAttachmentFormat = vk::Format::eUndefined,
+      .stencilAttachmentFormat = vk::Format::eUndefined
+    };
+
+    vk::GraphicsPipelineCreateInfo compositeCreateInfo{
+      .pNext = &compositePipelineRenderingCreateInfo,
+      .stageCount = 2,
+      .pStages = stageInfos,
+      .pVertexInputState = &emptyVertexInput,
+      .pInputAssemblyState = &assemblyState,
+      .pViewportState = &viewportInfo,
+      .pRasterizationState = &rasterState,
+      .pMultisampleState = &sampleState,
+      .pDepthStencilState = &depthState,
+      .pColorBlendState = &blendState,
+      .pDynamicState = &dynamicInfo,
+      .layout = *compositePipelineLayout,
+      .renderPass = nullptr,
+      .subpass = 0
+    };
+
+    compositePipeline = vk::raii::Pipeline(device, nullptr, compositeCreateInfo);
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create composite pipeline: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Create the depth pre-pass pipeline: vertex stage only, no color attachments,
+// depth test and depth write enabled.
+// Returns true on success; exceptions are logged to stderr and yield false.
+bool Renderer::createDepthPrepassPipeline() {
+  try {
+    // This pipeline is built against the PBR pipeline layout, so make sure the
+    // PBR descriptor set layout and pipeline layout exist before continuing.
+    const bool pbrLayoutsMissing = *pbrDescriptorSetLayout == nullptr || *pbrPipelineLayout == nullptr;
+    if (pbrLayoutsMissing && !createPBRPipeline()) {
+      return false;
+    }
+
+    // Only the vertex entry point of the PBR shader is used
+    auto pbrSpirv = readFile("shaders/pbr.spv");
+    vk::raii::ShaderModule pbrModule = createShaderModule(pbrSpirv);
+
+    vk::PipelineShaderStageCreateInfo vertexStage{};
+    vertexStage.stage = vk::ShaderStageFlagBits::eVertex;
+    vertexStage.module = *pbrModule;
+    vertexStage.pName = "VSMain";
+
+    // Vertex/instance bindings & attributes same as PBR
+    std::array<vk::VertexInputBindingDescription, 2> inputBindings = {
+      Vertex::getBindingDescription(),
+      InstanceData::getBindingDescription()
+    };
+
+    auto perVertexAttributes = Vertex::getAttributeDescriptions();
+    auto modelMatrixAttributes = InstanceData::getModelMatrixAttributeDescriptions();
+    auto normalMatrixAttributes = InstanceData::getNormalMatrixAttributeDescriptions();
+
+    // Concatenate in order: per-vertex, model matrix, normal matrix
+    std::vector<vk::VertexInputAttributeDescription> inputAttributes;
+    inputAttributes.insert(inputAttributes.end(), perVertexAttributes.begin(), perVertexAttributes.end());
+    inputAttributes.insert(inputAttributes.end(), modelMatrixAttributes.begin(), modelMatrixAttributes.end());
+    inputAttributes.insert(inputAttributes.end(), normalMatrixAttributes.begin(), normalMatrixAttributes.end());
+
+    vk::PipelineVertexInputStateCreateInfo vertexInputState{
+      .vertexBindingDescriptionCount = static_cast<uint32_t>(inputBindings.size()),
+      .pVertexBindingDescriptions = inputBindings.data(),
+      .vertexAttributeDescriptionCount = static_cast<uint32_t>(inputAttributes.size()),
+      .pVertexAttributeDescriptions = inputAttributes.data()
+    };
+
+    vk::PipelineInputAssemblyStateCreateInfo assemblyState{
+      .topology = vk::PrimitiveTopology::eTriangleList,
+      .primitiveRestartEnable = VK_FALSE
+    };
+
+    // Counts only; the actual viewport/scissor are dynamic state
+    vk::PipelineViewportStateCreateInfo viewportInfo{
+      .viewportCount = 1,
+      .scissorCount = 1
+    };
+
+    vk::PipelineRasterizationStateCreateInfo rasterState{
+      .depthClampEnable = VK_FALSE,
+      .rasterizerDiscardEnable = VK_FALSE,
+      .polygonMode = vk::PolygonMode::eFill,
+      .cullMode = vk::CullModeFlagBits::eBack,
+      .frontFace = vk::FrontFace::eCounterClockwise,
+      .depthBiasEnable = VK_FALSE,
+      .lineWidth = 1.0f
+    };
+
+    vk::PipelineMultisampleStateCreateInfo sampleState{
+      .rasterizationSamples = vk::SampleCountFlagBits::e1
+    };
+
+    vk::PipelineDepthStencilStateCreateInfo depthState{
+      .depthTestEnable = VK_TRUE,
+      .depthWriteEnable = VK_TRUE,
+      .depthCompareOp = vk::CompareOp::eLessOrEqual,
+      .depthBoundsTestEnable = VK_FALSE,
+      .stencilTestEnable = VK_FALSE
+    };
+
+    // No color attachments
+    vk::PipelineColorBlendStateCreateInfo blendState{
+      .logicOpEnable = VK_FALSE,
+      .attachmentCount = 0,
+      .pAttachments = nullptr
+    };
+
+    std::array dynamicList = {vk::DynamicState::eViewport, vk::DynamicState::eScissor};
+    vk::PipelineDynamicStateCreateInfo dynamicInfo{
+      .dynamicStateCount = static_cast<uint32_t>(dynamicList.size()),
+      .pDynamicStates = dynamicList.data()
+    };
+
+    // Dynamic rendering: depth attachment only
+    vk::PipelineRenderingCreateInfo depthOnlyRendering{
+      .colorAttachmentCount = 0,
+      .pColorAttachmentFormats = nullptr,
+      .depthAttachmentFormat = findDepthFormat()
+    };
+
+    vk::GraphicsPipelineCreateInfo prepassCreateInfo{
+      .pNext = &depthOnlyRendering,
+      .stageCount = 1,
+      .pStages = &vertexStage,
+      .pVertexInputState = &vertexInputState,
+      .pInputAssemblyState = &assemblyState,
+      .pViewportState = &viewportInfo,
+      .pRasterizationState = &rasterState,
+      .pMultisampleState = &sampleState,
+      .pDepthStencilState = &depthState,
+      .pColorBlendState = &blendState,
+      .pDynamicState = &dynamicInfo,
+      .layout = *pbrPipelineLayout
+    };
+
+    depthPrepassPipeline = vk::raii::Pipeline(device, nullptr, prepassCreateInfo);
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create depth pre-pass pipeline: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Create the lighting pipeline from shaders/lighting.spv together with its
+// pipeline layout (one descriptor set + fragment push constants).
+// Returns true on success; exceptions are logged to stderr and yield false.
+bool Renderer::createLightingPipeline() {
+  try {
+    // Load the SPIR-V and wrap it in a shader module
+    auto lightingSpirv = readFile("shaders/lighting.spv");
+    vk::raii::ShaderModule lightingModule = createShaderModule(lightingSpirv);
+
+    // Shader stages: [0] vertex, [1] fragment
+    vk::PipelineShaderStageCreateInfo stageInfos[2]{};
+    stageInfos[0].stage = vk::ShaderStageFlagBits::eVertex;
+    stageInfos[0].module = *lightingModule;
+    stageInfos[0].pName = "VSMain";
+    stageInfos[1].stage = vk::ShaderStageFlagBits::eFragment;
+    stageInfos[1].module = *lightingModule;
+    stageInfos[1].pName = "PSMain";
+
+    // Per-vertex input only (no instance stream)
+    auto vertexBinding = Vertex::getBindingDescription();
+    auto vertexAttributes = Vertex::getAttributeDescriptions();
+
+    vk::PipelineVertexInputStateCreateInfo vertexInputState{
+      .vertexBindingDescriptionCount = 1,
+      .pVertexBindingDescriptions = &vertexBinding,
+      .vertexAttributeDescriptionCount = static_cast<uint32_t>(vertexAttributes.size()),
+      .pVertexAttributeDescriptions = vertexAttributes.data()
+    };
+
+    vk::PipelineInputAssemblyStateCreateInfo assemblyState{
+      .topology = vk::PrimitiveTopology::eTriangleList,
+      .primitiveRestartEnable = VK_FALSE
+    };
+
+    // Viewport and scissor are dynamic; only the counts are fixed
+    vk::PipelineViewportStateCreateInfo viewportInfo{
+      .viewportCount = 1,
+      .scissorCount = 1
+    };
+
+    // Filled polygons with back-face culling
+    vk::PipelineRasterizationStateCreateInfo rasterState{
+      .depthClampEnable = VK_FALSE,
+      .rasterizerDiscardEnable = VK_FALSE,
+      .polygonMode = vk::PolygonMode::eFill,
+      .cullMode = vk::CullModeFlagBits::eBack,
+      .frontFace = vk::FrontFace::eCounterClockwise,
+      .depthBiasEnable = VK_FALSE,
+      .lineWidth = 1.0f
+    };
+
+    vk::PipelineMultisampleStateCreateInfo sampleState{
+      .rasterizationSamples = vk::SampleCountFlagBits::e1,
+      .sampleShadingEnable = VK_FALSE
+    };
+
+    vk::PipelineDepthStencilStateCreateInfo depthState{
+      .depthTestEnable = VK_TRUE,
+      .depthWriteEnable = VK_TRUE,
+      .depthCompareOp = vk::CompareOp::eLess,
+      .depthBoundsTestEnable = VK_FALSE,
+      .stencilTestEnable = VK_FALSE
+    };
+
+    // Alpha blending on color; the alpha channel takes the source value
+    const vk::ColorComponentFlags writeAllChannels =
+      vk::ColorComponentFlagBits::eR | vk::ColorComponentFlagBits::eG |
+      vk::ColorComponentFlagBits::eB | vk::ColorComponentFlagBits::eA;
+    vk::PipelineColorBlendAttachmentState blendAttachment{
+      .blendEnable = VK_TRUE,
+      .srcColorBlendFactor = vk::BlendFactor::eSrcAlpha,
+      .dstColorBlendFactor = vk::BlendFactor::eOneMinusSrcAlpha,
+      .colorBlendOp = vk::BlendOp::eAdd,
+      .srcAlphaBlendFactor = vk::BlendFactor::eOne,
+      .dstAlphaBlendFactor = vk::BlendFactor::eZero,
+      .alphaBlendOp = vk::BlendOp::eAdd,
+      .colorWriteMask = writeAllChannels
+    };
+
+    vk::PipelineColorBlendStateCreateInfo blendState{
+      .logicOpEnable = VK_FALSE,
+      .logicOp = vk::LogicOp::eCopy,
+      .attachmentCount = 1,
+      .pAttachments = &blendAttachment
+    };
+
+    std::vector dynamicList = {
+      vk::DynamicState::eViewport,
+      vk::DynamicState::eScissor
+    };
+    vk::PipelineDynamicStateCreateInfo dynamicInfo{
+      .dynamicStateCount = static_cast<uint32_t>(dynamicList.size()),
+      .pDynamicStates = dynamicList.data()
+    };
+
+    // Material properties are passed to the fragment stage as push constants
+    vk::PushConstantRange materialPushRange{
+      .stageFlags = vk::ShaderStageFlagBits::eFragment,
+      .offset = 0,
+      .size = sizeof(MaterialProperties)
+    };
+
+    vk::PipelineLayoutCreateInfo layoutCreateInfo{
+      .setLayoutCount = 1,
+      .pSetLayouts = &*descriptorSetLayout,
+      .pushConstantRangeCount = 1,
+      .pPushConstantRanges = &materialPushRange
+    };
+    lightingPipelineLayout = vk::raii::PipelineLayout(device, layoutCreateInfo);
+
+    // Dynamic rendering info is stored in a member variable so the struct
+    // outlives this function
+    lightingPipelineRenderingCreateInfo = vk::PipelineRenderingCreateInfo{
+      .colorAttachmentCount = 1,
+      .pColorAttachmentFormats = &swapChainImageFormat,
+      .depthAttachmentFormat = findDepthFormat(),
+      .stencilAttachmentFormat = vk::Format::eUndefined
+    };
+
+    vk::GraphicsPipelineCreateInfo lightingCreateInfo{
+      .pNext = &lightingPipelineRenderingCreateInfo,
+      .flags = vk::PipelineCreateFlags{},
+      .stageCount = 2,
+      .pStages = stageInfos,
+      .pVertexInputState = &vertexInputState,
+      .pInputAssemblyState = &assemblyState,
+      .pViewportState = &viewportInfo,
+      .pRasterizationState = &rasterState,
+      .pMultisampleState = &sampleState,
+      .pDepthStencilState = &depthState,
+      .pColorBlendState = &blendState,
+      .pDynamicState = &dynamicInfo,
+      .layout = *lightingPipelineLayout,
+      .renderPass = nullptr,
+      .subpass = 0,
+      .basePipelineHandle = nullptr,
+      .basePipelineIndex = -1
+    };
+
+    lightingPipeline = vk::raii::Pipeline(device, nullptr, lightingCreateInfo);
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create lighting pipeline: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Record a push-constant update on `commandBuffer` that uploads `material`
+// for the fragment stage, using the PBR pipeline layout at offset 0.
+void Renderer::pushMaterialProperties(vk::CommandBuffer commandBuffer, const MaterialProperties& material) const {
+  constexpr uint32_t kOffset = 0;
+  constexpr uint32_t kByteCount = sizeof(MaterialProperties);
+  const vk::PipelineLayout layout = *pbrPipelineLayout;
+  commandBuffer.pushConstants(layout, vk::ShaderStageFlagBits::eFragment, kOffset, kByteCount, &material);
+}
+
+// Create the descriptor set layout used by the ray query compute shader.
+// Production layout: 7 bindings (0..6), all compute-stage; no debug buffer at 7.
+// Returns true on success; exceptions are logged to stderr and yield false.
+bool Renderer::createRayQueryDescriptorSetLayout() {
+  constexpr std::size_t kBindingCount = 7;
+  std::array<vk::DescriptorSetLayoutBinding, kBindingCount> bindings{};
+
+  // Fill one compute-visible binding; the array slot doubles as the binding index
+  const auto setBinding = [&bindings](uint32_t slot, vk::DescriptorType type, uint32_t count) {
+    bindings[slot].binding = slot;
+    bindings[slot].descriptorType = type;
+    bindings[slot].descriptorCount = count;
+    bindings[slot].stageFlags = vk::ShaderStageFlagBits::eCompute;
+  };
+
+  setBinding(0, vk::DescriptorType::eUniformBuffer, 1);            // UBO (UniformBufferObject)
+  setBinding(1, vk::DescriptorType::eAccelerationStructureKHR, 1); // TLAS (top-level acceleration structure)
+  setBinding(2, vk::DescriptorType::eStorageImage, 1);             // Output image
+  setBinding(3, vk::DescriptorType::eStorageBuffer, 1);            // Light buffer
+  setBinding(4, vk::DescriptorType::eStorageBuffer, 1);            // Geometry info (BLAS geometry index -> vertex/index buffer addresses)
+  setBinding(5, vk::DescriptorType::eStorageBuffer, 1);            // Material buffer (array of material properties)
+  setBinding(6, vk::DescriptorType::eCombinedImageSampler, RQ_MAX_TEX); // BaseColor textures (large static array)
+
+  // Descriptor indexing / update-after-bind support:
+  // The ray query shader indexes the large combined-image-sampler array with a
+  // per-pixel varying index. On some drivers this requires descriptor indexing
+  // features + layout binding flags to avoid the array collapsing to slot 0
+  // (resulting in "no textures" even when `texIndex>0`).
+  std::array<vk::DescriptorBindingFlags, kBindingCount> bindingFlags{};
+  vk::DescriptorSetLayoutBindingFlagsCreateInfo bindingFlagsInfo{};
+  vk::DescriptorSetLayoutCreateInfo layoutInfo{};
+
+  if (descriptorIndexingEnabled) {
+    // Only binding 6 (the texture array) gets non-default flags
+    bindingFlags[6] = vk::DescriptorBindingFlagBits::eUpdateAfterBind |
+        vk::DescriptorBindingFlagBits::eUpdateUnusedWhilePending |
+        vk::DescriptorBindingFlagBits::ePartiallyBound;
+
+    bindingFlagsInfo.bindingCount = static_cast<uint32_t>(bindingFlags.size());
+    bindingFlagsInfo.pBindingFlags = bindingFlags.data();
+
+    layoutInfo.pNext = &bindingFlagsInfo;
+    layoutInfo.flags = vk::DescriptorSetLayoutCreateFlagBits::eUpdateAfterBindPool;
+  }
+
+  layoutInfo.bindingCount = static_cast<uint32_t>(bindings.size());
+  layoutInfo.pBindings = bindings.data();
+
+  try {
+    rayQueryDescriptorSetLayout = vk::raii::DescriptorSetLayout(device, layoutInfo);
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create ray query descriptor set layout: " << e.what() << std::endl;
+    return false;
+  }
+  return true;
+}
+
+// Builds the compute pipeline (and its layout) that runs the ray query shader.
+// Returns true on success or when ray query is unsupported (nothing to build);
+// returns false, after logging to stderr, if the shader or any Vulkan object fails.
+bool Renderer::createRayQueryPipeline() {
+  // Without both extensions there is nothing to build; this is reported as success.
+  if (!rayQueryEnabled || !accelerationStructureEnabled) {
+    std::cout << "Ray query rendering not available on this device (missing VK_KHR_ray_query or VK_KHR_acceleration_structure support)\n";
+    return true; // Not an error - just skip ray query pipeline creation
+  }
+
+  // vk::raii constructors report failure by throwing, so every step stays inside the try.
+  try {
+    // Load the compiled SPIR-V; Vulkan requires codeSize to be a non-zero multiple of 4.
+    auto shaderCode = readFile("shaders/ray_query.spv");
+    if (shaderCode.empty() || shaderCode.size() % sizeof(uint32_t) != 0) {
+      std::cerr << "Failed to load ray query shader\n";
+      return false;
+    }
+
+    vk::ShaderModuleCreateInfo createInfo{};
+    createInfo.codeSize = shaderCode.size();
+    createInfo.pCode = reinterpret_cast<const uint32_t *>(shaderCode.data());
+    vk::raii::ShaderModule shaderModule(device, createInfo);
+
+    vk::PipelineShaderStageCreateInfo shaderStage{};
+    shaderStage.stage = vk::ShaderStageFlagBits::eCompute;
+    shaderStage.module = *shaderModule;
+    shaderStage.pName = "main";
+
+    // Pipeline layout: a single descriptor set, no push constants.
+    vk::PipelineLayoutCreateInfo pipelineLayoutInfo{};
+    pipelineLayoutInfo.setLayoutCount = 1;
+    pipelineLayoutInfo.pSetLayouts = &(*rayQueryDescriptorSetLayout);
+    rayQueryPipelineLayout = vk::raii::PipelineLayout(device, pipelineLayoutInfo);
+
+    vk::ComputePipelineCreateInfo pipelineInfo{};
+    pipelineInfo.stage = shaderStage;
+    pipelineInfo.layout = *rayQueryPipelineLayout;
+    rayQueryPipeline = vk::raii::Pipeline(device, nullptr, pipelineInfo);
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create ray query pipeline: " << e.what() << std::endl;
+    return false;
+  }
+}
+// Creates the ray query output image/view, descriptor sets, composite sampler and per-frame UBOs; returns false if any step throws.
+bool Renderer::createRayQueryResources() {
+  try {
+    // Create the output image through the memory pool (storage image written by the compute shader).
+    // Prefer an HDR-capable format for Ray Query so PBR lighting can accumulate in linear space
+    // before composite applies exposure/gamma.
+    // Fall back to R8G8B8A8_UNORM if that format lacks optimal-tiling storage-image support.
+    vk::Format rqFormat = vk::Format::eR16G16B16A16Sfloat; {
+      auto props = physicalDevice.getFormatProperties(rqFormat);
+      if (!(props.optimalTilingFeatures & vk::FormatFeatureFlagBits::eStorageImage)) {
+        rqFormat = vk::Format::eR8G8B8A8Unorm;
+      }
+    }
+    auto [image, allocation] = memoryPool->createImage(
+      swapChainExtent.width,
+      swapChainExtent.height,
+      rqFormat,
+      vk::ImageTiling::eOptimal,
+      vk::ImageUsageFlagBits::eStorage | vk::ImageUsageFlagBits::eTransferSrc | vk::ImageUsageFlagBits::eSampled,
+      vk::MemoryPropertyFlagBits::eDeviceLocal,
+      1,
+      // mipLevels
+      vk::SharingMode::eExclusive,
+      {} // queueFamilies
+    );
+
+    rayQueryOutputImage = std::move(image);
+    rayQueryOutputImageAllocation = std::move(allocation);
+
+    // Create a 2D color view covering the image's single mip level and array layer
+    vk::ImageViewCreateInfo viewInfo{};
+    viewInfo.image = *rayQueryOutputImage;
+    viewInfo.viewType = vk::ImageViewType::e2D;
+    viewInfo.format = rqFormat;
+    viewInfo.subresourceRange.aspectMask = vk::ImageAspectFlagBits::eColor;
+    viewInfo.subresourceRange.baseMipLevel = 0;
+    viewInfo.subresourceRange.levelCount = 1;
+    viewInfo.subresourceRange.baseArrayLayer = 0;
+    viewInfo.subresourceRange.layerCount = 1;
+
+    rayQueryOutputImageView = vk::raii::ImageView(device, viewInfo);
+
+    // Transition output image to GENERAL layout for compute shader writes
+    transitionImageLayout(*rayQueryOutputImage,
+                          rqFormat,
+                          vk::ImageLayout::eUndefined,
+                          vk::ImageLayout::eGeneral,
+                          1);
+
+    // Allocate the ray query descriptor sets (one per frame in flight) from descriptorPool
+    std::vector<vk::DescriptorSetLayout> layouts(MAX_FRAMES_IN_FLIGHT, *rayQueryDescriptorSetLayout);
+    vk::DescriptorSetAllocateInfo allocInfo{};
+    allocInfo.descriptorPool = *descriptorPool;
+    allocInfo.descriptorSetCount = MAX_FRAMES_IN_FLIGHT;
+    allocInfo.pSetLayouts = layouts.data();
+
+    // Allocate into a temporary owning container, then move the individual RAII sets into our vector.
+    // (Avoid assigning `vk::raii::DescriptorSets` directly into `std::vector<vk::raii::DescriptorSet>`.)
+    {
+      auto sets = vk::raii::DescriptorSets(device, allocInfo);
+      rayQueryDescriptorSets.clear();
+      rayQueryDescriptorSets.reserve(sets.size());
+      for (auto& s : sets) {
+        rayQueryDescriptorSets.emplace_back(std::move(s));
+      }
+    }
+
+    // Create descriptor sets for composite pass to sample the rayQueryOutputImage
+    // Reuse the transparentDescriptorSetLayout (binding 0 = combined image sampler)
+    if (*transparentDescriptorSetLayout == nullptr) {
+      // Not created yet: createPBRPipeline() is expected to create it; its result is ignored and the layout is re-checked below
+      createPBRPipeline();
+    }
+    if (*transparentDescriptorSetLayout != nullptr) {
+      // Ensure we have a valid sampler for sampling the ray-query output image
+      if (*rqCompositeSampler == nullptr) {
+        vk::SamplerCreateInfo sci{
+          .magFilter = vk::Filter::eLinear,
+          .minFilter = vk::Filter::eLinear,
+          .mipmapMode = vk::SamplerMipmapMode::eNearest,
+          .addressModeU = vk::SamplerAddressMode::eClampToEdge,
+          .addressModeV = vk::SamplerAddressMode::eClampToEdge,
+          .addressModeW = vk::SamplerAddressMode::eClampToEdge,
+          .mipLodBias = 0.0f,
+          .anisotropyEnable = VK_FALSE,
+          .maxAnisotropy = 1.0f,
+          .compareEnable = VK_FALSE,
+          .compareOp = vk::CompareOp::eAlways,
+          .minLod = 0.0f,
+          .maxLod = 0.0f,
+          .borderColor = vk::BorderColor::eIntOpaqueBlack,
+          .unnormalizedCoordinates = VK_FALSE
+        };
+        rqCompositeSampler = vk::raii::Sampler(device, sci);
+      }
+      std::vector<vk::DescriptorSetLayout> rqLayouts(MAX_FRAMES_IN_FLIGHT, *transparentDescriptorSetLayout);
+      vk::DescriptorSetAllocateInfo rqAllocInfo{
+        .descriptorPool = *descriptorPool,
+        .descriptorSetCount = MAX_FRAMES_IN_FLIGHT,
+        .pSetLayouts = rqLayouts.data()
+      }; {
+        auto sets = vk::raii::DescriptorSets(device, rqAllocInfo);
+        rqCompositeDescriptorSets.clear();
+        rqCompositeDescriptorSets.reserve(sets.size());
+        for (auto& s : sets) {
+          rqCompositeDescriptorSets.emplace_back(std::move(s));
+        }
+      }
+      // NOTE(review): the descriptor below declares eShaderReadOnlyOptimal although the image was put in eGeneral above; confirm a later transition.
+      // Update each set to sample the rayQueryOutputImage
+      for (size_t i = 0; i < rqCompositeDescriptorSets.size(); ++i) {
+        // Use a dedicated sampler to avoid null sampler issues during early init
+        vk::Sampler samplerHandle = *rqCompositeSampler;
+        vk::DescriptorImageInfo imgInfo{
+          .sampler = samplerHandle,
+          .imageView = *rayQueryOutputImageView,
+          .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal
+        };
+        vk::WriteDescriptorSet write{
+          .dstSet = *rqCompositeDescriptorSets[i],
+          .dstBinding = 0,
+          .dstArrayElement = 0,
+          .descriptorCount = 1,
+          .descriptorType = vk::DescriptorType::eCombinedImageSampler,
+          .pImageInfo = &imgInfo
+        };
+        device.updateDescriptorSets({write}, {});
+      }
+    }
+
+    // Create dedicated host-visible, host-coherent UBOs for ray query (one per frame in flight) and record each allocation's mappedPtr
+    rayQueryUniformBuffers.clear();
+    rayQueryUniformAllocations.clear();
+    rayQueryUniformBuffersMapped.clear();
+
+    for (size_t i = 0; i < MAX_FRAMES_IN_FLIGHT; i++) {
+      auto [uboBuffer, uboAlloc] = createBufferPooled(
+        sizeof(RayQueryUniformBufferObject),
+        vk::BufferUsageFlagBits::eUniformBuffer,
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+
+      rayQueryUniformBuffers.push_back(std::move(uboBuffer));
+      rayQueryUniformAllocations.push_back(std::move(uboAlloc));
+      rayQueryUniformBuffersMapped.push_back(rayQueryUniformAllocations.back()->mappedPtr);
+    }
+
+    std::cout << "Ray query resources created successfully (including " << MAX_FRAMES_IN_FLIGHT << " dedicated UBOs)\n";
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create ray query resources: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+/**
+ * @brief Create the compute-skinning descriptor set layouts, pipeline layout, and pipeline.
+ *
+ * Results are stored in the AdvancedRendererState entry that g_rendererStates holds for
+ * @p renderer; g_advancedStateMutex stays locked exclusively for the whole call. A
+ * zero-descriptor placeholder morph set is also allocated from the renderer's pool.
+ *
+ * @param renderer Renderer supplying the device, descriptor pool, and shader helpers.
+ * @return True on success; false if any step throws (the error is logged to stderr).
+ */
+bool AdvancedRenderer_createSkinningResources(Renderer* renderer) {
+  try {
+    // Exclusive lock: the renderer's state entry is fetched via operator[] and written below.
+    std::unique_lock<std::shared_mutex> lock(g_advancedStateMutex);
+    AdvancedRendererState& state = g_rendererStates[renderer];
+    const vk::raii::Device& device = renderer->GetRaiiDevice();
+
+    // ---- 1. Descriptor set layouts ----
+
+    // Set 0: per-mesh storage buffers (Input, Output, JointMatrices, JointIndices,
+    // JointWeights). The five slots differ only in their binding number, so they are
+    // filled in a loop.
+    std::array<vk::DescriptorSetLayoutBinding, 5> meshBufferBindings{};
+    for (uint32_t slot = 0; slot < static_cast<uint32_t>(meshBufferBindings.size()); ++slot) {
+      vk::DescriptorSetLayoutBinding& entry = meshBufferBindings[slot];
+      entry.binding = slot;
+      entry.descriptorType = vk::DescriptorType::eStorageBuffer;
+      entry.descriptorCount = 1;
+      entry.stageFlags = vk::ShaderStageFlagBits::eCompute;
+    }
+
+    // Plain layout: no create flags and no pNext chain.
+    vk::DescriptorSetLayoutCreateInfo meshBufferLayoutInfo{};
+    meshBufferLayoutInfo.bindingCount = static_cast<uint32_t>(meshBufferBindings.size());
+    meshBufferLayoutInfo.pBindings = meshBufferBindings.data();
+    state.skinDescriptorSetLayout = vk::raii::DescriptorSetLayout(device, meshBufferLayoutInfo);
+
+    // Set 1: a single array binding of morph-target storage buffers.
+    vk::DescriptorSetLayoutBinding morphTargets{};
+    morphTargets.binding = 0;
+    morphTargets.descriptorType = vk::DescriptorType::eStorageBuffer;
+    morphTargets.descriptorCount = 64; // max morph targets
+    morphTargets.stageFlags = vk::ShaderStageFlagBits::eCompute;
+
+    // The array may be partially populated and its length is chosen per allocation.
+    vk::DescriptorBindingFlags morphFlags =
+        vk::DescriptorBindingFlagBits::ePartiallyBound | vk::DescriptorBindingFlagBits::eVariableDescriptorCount;
+
+    // Per-binding flags are chained into the layout create info through pNext.
+    vk::DescriptorSetLayoutBindingFlagsCreateInfo morphFlagsInfo{};
+    morphFlagsInfo.bindingCount = 1;
+    morphFlagsInfo.pBindingFlags = &morphFlags;
+
+    // NOTE(review): eUpdateAfterBindPool is set here; confirm renderer->descriptorPool allows it.
+    vk::DescriptorSetLayoutCreateInfo morphLayoutInfo{};
+    morphLayoutInfo.pNext = &morphFlagsInfo;
+    morphLayoutInfo.flags = vk::DescriptorSetLayoutCreateFlagBits::eUpdateAfterBindPool;
+    morphLayoutInfo.bindingCount = 1;
+    morphLayoutInfo.pBindings = &morphTargets;
+    state.morphDescriptorSetLayout = vk::raii::DescriptorSetLayout(device, morphLayoutInfo);
+
+    // Placeholder morph set for meshes without morph targets: its variable-length
+    // array is allocated with zero descriptors.
+    uint32_t emptyCount = 0;
+    vk::DescriptorSetVariableDescriptorCountAllocateInfo placeholderCounts{};
+    placeholderCounts.descriptorSetCount = 1;
+    placeholderCounts.pDescriptorCounts = &emptyCount;
+
+    // The variable-count info rides on the allocate info's pNext chain.
+    vk::DescriptorSetAllocateInfo placeholderAlloc{};
+    placeholderAlloc.pNext = &placeholderCounts;
+    placeholderAlloc.descriptorPool = *renderer->descriptorPool;
+    placeholderAlloc.descriptorSetCount = 1;
+    placeholderAlloc.pSetLayouts = &*state.morphDescriptorSetLayout;
+
+    // Exactly one set was requested, so keep the first element of the result.
+    auto placeholderSets = device.allocateDescriptorSets(placeholderAlloc);
+    state.dummyMorphDescriptorSet = std::move(placeholderSets[0]);
+
+    // ---- 2. Pipeline layout ----
+    // Set 0 (skinning buffers) and set 1 (morph targets), in that order.
+    std::array<vk::DescriptorSetLayout, 2> setLayouts = {
+      *state.skinDescriptorSetLayout,
+      *state.morphDescriptorSetLayout
+    };
+
+    // One push-constant block (SkinPushConstants) visible to the compute stage.
+    vk::PushConstantRange skinPushRange{};
+    skinPushRange.stageFlags = vk::ShaderStageFlagBits::eCompute;
+    skinPushRange.offset = 0;
+    skinPushRange.size = sizeof(SkinPushConstants);
+
+    // Combine both set layouts with the push-constant range.
+    vk::PipelineLayoutCreateInfo skinLayoutInfo{};
+    skinLayoutInfo.setLayoutCount = static_cast<uint32_t>(setLayouts.size());
+    skinLayoutInfo.pSetLayouts = setLayouts.data();
+    skinLayoutInfo.pushConstantRangeCount = 1;
+    skinLayoutInfo.pPushConstantRanges = &skinPushRange;
+    state.skinPipelineLayout = vk::raii::PipelineLayout(device, skinLayoutInfo);
+
+    // ---- 3. Compute pipeline ----
+    std::vector<char> shaderBytes = renderer->readFile("shaders/morph_accumulate.spv");
+    vk::raii::ShaderModule shaderModule = renderer->createShaderModule(shaderBytes);
+
+    // The stage runs the "main" entry point of the module loaded above.
+    vk::PipelineShaderStageCreateInfo computeStage{};
+    computeStage.stage = vk::ShaderStageFlagBits::eCompute;
+    computeStage.module = *shaderModule;
+    computeStage.pName = "main";
+
+    // No pipeline cache is supplied (nullptr).
+    vk::ComputePipelineCreateInfo skinPipelineInfo{};
+    skinPipelineInfo.stage = computeStage;
+    skinPipelineInfo.layout = *state.skinPipelineLayout;
+    state.skinPipeline = vk::raii::Pipeline(device, nullptr, skinPipelineInfo);
+
+    std::cout << "Skinning compute pipeline created successfully.\n";
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create skinning resources: " << e.what() << std::endl;
+    return false;
+  }
+}
\ No newline at end of file
diff --git a/attachments/advanced_gltf/renderer_ray_query.cpp b/attachments/advanced_gltf/renderer_ray_query.cpp
new file mode 100644
index 000000000..5296e1ab8
--- /dev/null
+++ b/attachments/advanced_gltf/renderer_ray_query.cpp
@@ -0,0 +1,2025 @@
+/* Copyright (c) 2025 Holochip Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 the "License";
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "entity.h"
+#include "mesh_component.h"
+#include "renderer.h"
+#include "renderer_advanced_types.h"
+#include "transform_component.h"
+#include <algorithm>
+#include <cmath>
+#include <glm/gtc/type_ptr.hpp>
+#include <iostream>
+#include <map>
+#include <unordered_set>
+
+// Returns the GPU device address of `buffer`, queried through `device`.
+vk::DeviceAddress getBufferDeviceAddress(const vk::raii::Device& device, vk::Buffer buffer) {
+  const vk::BufferDeviceAddressInfo query{.buffer = buffer};
+  const vk::DeviceAddress address = device.getBufferAddress(query);
+  return address;
+}
+
+/**
+ * @brief Build acceleration structures for ray query rendering.
+ *
+ * Builds BLAS for each unique mesh and a TLAS for the entire scene.
+ *
+ * @param entities The entities to include in the acceleration structures.
+ * @return True if successful, false otherwise.
+ */
+bool Renderer::buildAccelerationStructures(const std::vector<Entity *>& entities) {
+  if (!accelerationStructureEnabled || !rayQueryEnabled) {
+    std::cout << "Acceleration structures not supported on this device\n";
+    return false;
+  }
+
+  try {
+    const auto asStartCpu = std::chrono::steady_clock::now();
+
+    // --- UI progress instrumentation (for long AS builds) ---
+    // We update these frequently during BLAS/TLAS builds so the loading overlay
+    // can display meaningful progress if the build takes > ~10 seconds.
+    auto nowNs = []() -> uint64_t {
+      return static_cast<uint64_t>(
+        std::chrono::duration_cast<std::chrono::nanoseconds>(
+          std::chrono::steady_clock::now().time_since_epoch())
+        .count());
+    };
+    auto setASUi = [&](bool active, const char* stage, float progress, uint32_t done, uint32_t total) {
+      asBuildUiActive.store(active, std::memory_order_relaxed);
+      asBuildUiStage.store(stage ? stage : "", std::memory_order_relaxed);
+      asBuildUiProgress.store(std::clamp(progress, 0.0f, 1.0f), std::memory_order_relaxed);
+      asBuildUiDone.store(done, std::memory_order_relaxed);
+      asBuildUiTotal.store(total, std::memory_order_relaxed);
+      // Also drive the main loading overlay progress while we're in the AS phase.
+      if (GetLoadingPhase() == LoadingPhase::AccelerationStructures) {
+        SetLoadingPhaseProgress(progress);
+      }
+    };
+    // Start timer if not already running
+    if (asBuildUiStartNs.load(std::memory_order_relaxed) == 0) {
+      asBuildUiStartNs.store(nowNs(), std::memory_order_relaxed);
+    }
+    setASUi(true, "AS: prepare", 0.0f, 0u, 0u);
+    struct ASBuildUiGuard {
+      Renderer* r;
+      explicit ASBuildUiGuard(Renderer* rr) : r(rr) {
+      }
+      ~ASBuildUiGuard() {
+        if (!r)
+          return;
+        r->asBuildUiActive.store(false, std::memory_order_relaxed);
+        r->asBuildUiStage.store("idle", std::memory_order_relaxed);
+        r->asBuildUiProgress.store(0.0f, std::memory_order_relaxed);
+        r->asBuildUiDone.store(0u, std::memory_order_relaxed);
+        r->asBuildUiTotal.store(0u, std::memory_order_relaxed);
+        r->asBuildUiStartNs.store(0u, std::memory_order_relaxed);
+      }
+    } asUiGuard(this);
+
+    // Large scenes can take seconds to build BLAS/TLAS. Keep the watchdog alive while we work.
+    auto lastKick = std::chrono::steady_clock::now();
+    auto kickWatchdog = [&]() {
+      auto now = std::chrono::steady_clock::now();
+      if (now - lastKick > std::chrono::milliseconds(200)) {
+        lastFrameUpdateTime.store(now, std::memory_order_relaxed);
+        lastKick = now;
+      }
+    };
+    kickWatchdog();
+
+    std::cout << "Building acceleration structures for " << entities.size() << " entities..." << std::endl;
+
+    // PRECHECK: Determine how many renderable entities and unique meshes are READY right now.
+    // If the counts would shrink compared to the last successful build (e.g., streaming not done),
+    // skip rebuilding to avoid producing a TLAS that only contains a small subset (like animated fans).
+    size_t readyRenderableCount = 0;
+    size_t readyUniqueMeshCount = 0; {
+      size_t skippedInactive = 0;
+      size_t skippedNoMesh = 0;
+      size_t skippedNoRes = 0;
+      size_t skippedException = 0;
+
+      std::unordered_map<MeshComponent *, uint32_t> meshToBLASProbe;
+      for (Entity* entity : entities) {
+        kickWatchdog();
+        if (!entity || !entity->IsActive()) {
+          skippedInactive++;
+          continue;
+        }
+        auto meshComp = entity->GetComponent<MeshComponent>();
+        if (!meshComp) {
+          skippedNoMesh++;
+          continue;
+        }
+
+        // Match the filtering logic used in the main build loop below and in renderer_rendering.cpp
+        if (IsRayQueryStaticOnly()) {
+          const std::string& nm = entity->GetName();
+          if (nm.find("_AnimNode_") != std::string::npos)
+            continue;
+          if (!nm.empty() && nm.rfind("Ball_", 0) == 0)
+            continue;
+        }
+        // Deformable (skinned/morph) meshes are included in the AS: their BLAS is built
+        // with eAllowUpdate and refit each frame from the skinning compute output. They
+        // must be counted here so the readiness scan matches the main build loop.
+
+        try {
+          auto meshIt = meshResources.find(meshComp);
+          if (meshIt == meshResources.end()) {
+            skippedNoRes++;
+            continue;
+          }
+        } catch (...) {
+          skippedException++;
+          continue;
+        }
+
+        readyRenderableCount++;
+        meshToBLASProbe.try_emplace(meshComp, static_cast<uint32_t>(meshToBLASProbe.size()));
+      }
+      readyUniqueMeshCount = meshToBLASProbe.size();
+
+      // Keep this precheck quiet; any meaningful summary is printed in the main AS build block below.
+      (void) skippedInactive;
+      (void) skippedNoMesh;
+      (void) skippedNoRes;
+      (void) skippedException;
+    }
+
+    if (readyRenderableCount == 0 || readyUniqueMeshCount == 0) {
+      std::cout << "AS build skipped: no ready meshes yet (renderables=" << readyRenderableCount
+          << ", uniqueMeshes=" << readyUniqueMeshCount << ")\n";
+      return false;
+    }
+
+    // Move old AS structures to pending deletion queue
+    // They will be deleted after MAX_FRAMES_IN_FLIGHT frames to ensure all GPU work finishes
+    // This prevents "buffer destroyed while in use" errors without needing device.waitIdle()
+    // which would invalidate entity descriptor sets
+    if (!blasStructures.empty() || *tlasStructure.handle) {
+      PendingASDelete pendingDelete;
+      pendingDelete.blasStructures = std::move(blasStructures);
+      pendingDelete.tlasStructure = std::move(tlasStructure);
+      pendingDelete.framesSinceDestroy = 0;
+      pendingASDeletions.push_back(std::move(pendingDelete));
+    }
+
+    // Clear the moved-from containers (they're now empty)
+    blasStructures.clear();
+    tlasStructure = AccelerationStructure{};
+
+    // Map mesh components to BLAS indices
+    std::unordered_map<MeshComponent *, uint32_t> meshToBLAS;
+    std::vector<MeshComponent *> uniqueMeshes;
+
+    // Collect unique meshes and entities
+    std::vector<Entity *> renderableEntities;
+    auto containsCaseInsensitive = [](const std::string& haystack, const std::string& needle) -> bool {
+      std::string h = haystack;
+      std::string n = needle;
+      std::transform(h.begin(), h.end(), h.begin(), [](unsigned char c) { return std::tolower(c); });
+      std::transform(n.begin(), n.end(), n.begin(), [](unsigned char c) { return std::tolower(c); });
+      return h.find(n) != std::string::npos;
+    };
+
+    // Collect renderable entities for AS build without spamming logs.
+    size_t skippedInactive = 0;
+    size_t skippedNoMesh = 0;
+    size_t skippedNoRes = 0;
+    size_t skippedPendingUploads = 0;
+    size_t skippedNullBuffers = 0;
+    size_t skippedZeroIndices = 0;
+    size_t skippedException = 0;
+    // Count entities filtered out by the static-only / deformable rules so the build
+    // summary never silently under-reports (a previously-uncounted drop here once hid
+    // the whole static scene from the TLAS).
+    size_t skippedAnimNode = 0;
+    size_t skippedBall = 0;
+
+    for (Entity* entity : entities) {
+      kickWatchdog();
+      if (!entity || !entity->IsActive()) {
+        skippedInactive++;
+        continue;
+      }
+
+      auto meshComp = entity->GetComponent<MeshComponent>();
+      if (!meshComp) {
+        skippedNoMesh++;
+        continue;
+      }
+
+      // In Ray Query static-only mode, ignore dynamic/animated entities (fans, balls)
+      // to match the readiness scan in renderer_rendering.cpp.
+      if (IsRayQueryStaticOnly()) {
+        const std::string& nm = entity->GetName();
+        if (nm.find("_AnimNode_") != std::string::npos) {
+          skippedAnimNode++;
+          continue;
+        }
+        if (!nm.empty() && nm.rfind("Ball_", 0) == 0) {
+          skippedBall++;
+          continue;
+        }
+      }
+
+      // Deformable (skinned/morph) meshes are included: their BLAS is built with
+      // eAllowUpdate and refit each frame from the skinning compute output
+      // (see refitBLASInline), so they animate correctly under ray query.
+
+      // Safely check if mesh resources exist - catch any exceptions from dereferencing potentially stale pointers
+      try {
+        auto meshIt = meshResources.find(meshComp);
+        if (meshIt == meshResources.end()) {
+          skippedNoRes++;
+          continue;
+        }
+
+        // Validate that the mesh resources have valid buffers before adding to AS build
+        const auto& meshRes = meshIt->second;
+        // Only include when uploads finished (staging sizes are zero)
+        if (meshRes.vertexBufferSizeBytes != 0 || meshRes.indexBufferSizeBytes != 0) {
+          // Skip meshes still uploading to avoid partial TLAS builds
+          skippedPendingUploads++;
+          continue;
+        }
+        // RAII handles: check if they contain valid Vulkan handles by dereferencing
+        if (!*meshRes.vertexBuffer || !*meshRes.indexBuffer) {
+          skippedNullBuffers++;
+          continue;
+        }
+
+        if (meshRes.indexCount == 0) {
+          skippedZeroIndices++;
+          continue;
+        }
+      } catch (const std::exception&) {
+        // Avoid spamming; a rebuild on the next safe frame should succeed.
+        skippedException++;
+        continue;
+      }
+
+      renderableEntities.push_back(entity);
+
+      if (meshToBLAS.find(meshComp) == meshToBLAS.end()) {
+        meshToBLAS[meshComp] = static_cast<uint32_t>(uniqueMeshes.size());
+        uniqueMeshes.push_back(meshComp);
+      }
+    }
+
+    if (uniqueMeshes.empty()) {
+      // Nothing ready yet (e.g., mesh uploads still pending). Treat as a transient
+      // condition so the caller can retry next frame without clearing the request.
+      setASUi(true, "AS: waiting on meshes", 0.0f, 0u, 0u);
+      return false;
+    }
+
+    // One concise build summary (no per-entity spam)
+    std::cout << "Building AS: uniqueMeshes=" << uniqueMeshes.size()
+        << ", entities=" << renderableEntities.size()
+        << " (skipped inactive=" << skippedInactive
+        << ", noMesh=" << skippedNoMesh
+        << ", noRes=" << skippedNoRes
+        << ", pendingUploads=" << skippedPendingUploads
+        << ", nullBuffers=" << skippedNullBuffers
+        << ", zeroIndices=" << skippedZeroIndices
+        << ", exception=" << skippedException
+        << ", animNode=" << skippedAnimNode
+        << ", ball=" << skippedBall
+        << ")\n";
+
+    // Building only a partial set (e.g. while chunked GPU-resource preallocation is
+    // still in progress for the remaining meshes) would produce a TLAS whose instances
+    // point at non-existent BLAS, hanging the GPU on the next ray-query dispatch.
+    //
+    // Instead, treat "noRes>0" as a transient condition: ask for another rebuild and
+    // return false. The render loop will retry on subsequent frames until preallocation
+    // catches up. The loading screen remains visible because asBuildRequested stays set.
+    if (skippedNoRes > 0) {
+      setASUi(true, "AS: waiting on mesh resources", 0.0f, 0u, 0u);
+      RequestAccelerationStructureBuild("BLAS coverage incomplete; deferring TLAS build");
+      return false;
+    }
+
+    // Create a dedicated command pool for AS building to avoid threading issues
+    // The main commandPool may be in use by the render thread
+    vk::CommandPoolCreateInfo poolInfo{};
+    poolInfo.flags = vk::CommandPoolCreateFlagBits::eTransient;
+    poolInfo.queueFamilyIndex = queueFamilyIndices.graphicsFamily.value();
+
+    vk::raii::CommandPool asBuildCommandPool(device, poolInfo);
+
+    // Create command buffer for AS building
+    vk::CommandBufferAllocateInfo allocInfo{};
+    allocInfo.commandPool = *asBuildCommandPool;
+    allocInfo.level = vk::CommandBufferLevel::ePrimary;
+    allocInfo.commandBufferCount = 1;
+
+    vk::raii::CommandBuffers cmdBuffers(device, allocInfo);
+    vk::raii::CommandBuffer& cmdBuffer = cmdBuffers[0];
+
+    cmdBuffer.begin(vk::CommandBufferBeginInfo{
+      .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit
+    });
+
+    // (Vespa-only debugging removed; keep logs quiet.)
+
+    // Build BLAS for each unique mesh
+    blasStructures.resize(uniqueMeshes.size());
+
+    // Progress model: BLAS phase dominates. Treat TLAS + post buffers as a few extra steps.
+    const uint32_t totalSteps = static_cast<uint32_t>(uniqueMeshes.size()) + 3u;
+    setASUi(true, "AS: build BLAS", 0.0f, 0u, totalSteps);
+
+    // Keep scratch buffers alive until GPU execution completes (after fence wait)
+    // Destroying them early causes "VkBuffer was destroy" validation errors and crashes
+    std::vector<vk::raii::Buffer> scratchBuffers;
+    std::vector<std::unique_ptr<MemoryPool::Allocation>> scratchAllocations;
+
+    for (size_t i = 0; i < uniqueMeshes.size(); ++i) {
+      kickWatchdog();
+      // Update UI progress (BLAS)
+      setASUi(true,
+              "AS: build BLAS",
+              totalSteps > 0 ? static_cast<float>(static_cast<uint32_t>(i)) / static_cast<float>(totalSteps) : 0.0f,
+              static_cast<uint32_t>(i),
+              totalSteps);
+
+      MeshComponent* meshComp = uniqueMeshes[i];
+      auto& meshRes = meshResources.at(meshComp);
+
+      bool isDeformable = IsMeshComponentDeformable(meshComp);
+      // Always use the static vertex buffer for the initial BLAS build.
+      // outputVertexBuffer is device-local and uninitialised at build time;
+      // using it would produce NaN-filled bounding boxes and corrupt the BVH,
+      // causing traversal hardware to loop indefinitely. BLAS refit after
+      // the first skinning dispatch brings deformable meshes up to date.
+      vk::Buffer activeVB = *meshRes.vertexBuffer;
+
+      // Get buffer device addresses
+      vk::DeviceAddress vertexAddress = getBufferDeviceAddress(device, activeVB);
+      vk::DeviceAddress indexAddress = getBufferDeviceAddress(device, *meshRes.indexBuffer);
+
+      // Compute vertex and index counts for this mesh
+      const uint32_t vertexCount = static_cast<uint32_t>(meshComp->GetVertices().size());
+
+      // Create geometry info
+      vk::AccelerationStructureGeometryKHR geometry{};
+      geometry.geometryType = vk::GeometryTypeKHR::eTriangles;
+      // Mark geometry as OPAQUE to ensure closest hits are committed reliably for primary rays
+      // (we can re-introduce transparency later with any-hit/candidate handling)
+      geometry.flags = vk::GeometryFlagBitsKHR::eOpaque;
+
+      geometry.geometry.triangles.vertexFormat = vk::Format::eR32G32B32Sfloat;
+      geometry.geometry.triangles.vertexData = vertexAddress;
+      geometry.geometry.triangles.vertexStride = sizeof(Vertex);
+      // Set maxVertex to the total vertex count for this mesh. This is the most robust
+      // setting across drivers and content, and avoids culling triangles that reference
+      // high vertex indices (observed to hide unique, single-instance meshes).
+      geometry.geometry.triangles.maxVertex = vertexCount;
+      geometry.geometry.triangles.indexType = vk::IndexType::eUint32;
+      geometry.geometry.triangles.indexData = indexAddress;
+
+      // Build info
+      vk::AccelerationStructureBuildGeometryInfoKHR buildInfo{};
+      buildInfo.type = vk::AccelerationStructureTypeKHR::eBottomLevel;
+      buildInfo.flags = vk::BuildAccelerationStructureFlagBitsKHR::ePreferFastTrace;
+      if (isDeformable) {
+          // Skinned meshes will be refitted (updated) every frame to match animation.
+          buildInfo.flags |= vk::BuildAccelerationStructureFlagBitsKHR::eAllowUpdate;
+      }
+      buildInfo.mode = vk::BuildAccelerationStructureModeKHR::eBuild;
+      buildInfo.geometryCount = 1;
+      buildInfo.pGeometries = &geometry;
+
+      uint32_t primitiveCount = meshRes.indexCount / 3;
+
+      // Get size requirements
+      vk::AccelerationStructureBuildSizesInfoKHR sizeInfo = device.getAccelerationStructureBuildSizesKHR(
+        vk::AccelerationStructureBuildTypeKHR::eDevice,
+        buildInfo,
+        primitiveCount);
+
+      // Create BLAS buffer
+      auto [blasBuffer, blasAlloc] = createBufferPooled(
+        sizeInfo.accelerationStructureSize,
+        vk::BufferUsageFlagBits::eAccelerationStructureStorageKHR | vk::BufferUsageFlagBits::eShaderDeviceAddress,
+        vk::MemoryPropertyFlagBits::eDeviceLocal);
+
+      // Create acceleration structure
+      vk::AccelerationStructureCreateInfoKHR createInfo{};
+      createInfo.buffer = *blasBuffer;
+      createInfo.size = sizeInfo.accelerationStructureSize;
+      createInfo.type = vk::AccelerationStructureTypeKHR::eBottomLevel;
+
+      vk::raii::AccelerationStructureKHR blasHandle(device, createInfo);
+
+      // Create scratch buffer
+      auto [scratchBuffer, scratchAlloc] = createBufferPooled(
+        sizeInfo.buildScratchSize,
+        vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eShaderDeviceAddress,
+        vk::MemoryPropertyFlagBits::eDeviceLocal);
+
+      vk::DeviceAddress scratchAddress = getBufferDeviceAddress(device, *scratchBuffer);
+
+      // Update build info with handles (dereference RAII handle)
+      buildInfo.dstAccelerationStructure = *blasHandle;
+      buildInfo.scratchData = scratchAddress;
+
+      // Keep scratch buffer alive until after GPU execution (after fence wait)
+      scratchBuffers.push_back(std::move(scratchBuffer));
+      scratchAllocations.push_back(std::move(scratchAlloc));
+
+      // Build range info
+      vk::AccelerationStructureBuildRangeInfoKHR rangeInfo{};
+      rangeInfo.primitiveCount = primitiveCount;
+      rangeInfo.primitiveOffset = 0;
+      rangeInfo.firstVertex = 0;
+      rangeInfo.transformOffset = 0;
+
+      // Record build command - Vulkan-Hpp RAII takes array spans, not pointers
+      std::array<const vk::AccelerationStructureBuildRangeInfoKHR *, 1> rangeInfos = {&rangeInfo};
+      cmdBuffer.buildAccelerationStructuresKHR(buildInfo, rangeInfos);
+
+      // Get device address (dereference RAII handle)
+      vk::AccelerationStructureDeviceAddressInfoKHR addressInfo{};
+      addressInfo.accelerationStructure = *blasHandle;
+      vk::DeviceAddress blasAddress = device.getAccelerationStructureAddressKHR(addressInfo);
+
+      // Store BLAS (move RAII handle to avoid copy)
+      blasStructures[i].buffer = std::move(blasBuffer);
+      blasStructures[i].allocation = std::move(blasAlloc);
+      blasStructures[i].handle = std::move(blasHandle);
+      blasStructures[i].deviceAddress = blasAddress;
+
+    // (Per-BLAS logging removed; keep logs quiet in production.)
+    }
+
+    // Persist meshToBLAS mapping for refit
+    {
+        std::unique_lock<std::shared_mutex> lock(g_advancedStateMutex);
+        g_rendererStates[this].meshToBLAS = meshToBLAS;
+    }
+
+    // BLAS done
+    setASUi(true,
+            "AS: build TLAS",
+            totalSteps > 0 ? static_cast<float>(static_cast<uint32_t>(uniqueMeshes.size())) / static_cast<float>(totalSteps) : 0.0f,
+            static_cast<uint32_t>(uniqueMeshes.size()),
+            totalSteps);
+
+    // Barrier between BLAS and TLAS builds.
+    // The TLAS build recorded later in this command buffer references the BLASes written by
+    // the builds recorded above, so order AS-build writes before subsequent AS-build reads.
+    vk::MemoryBarrier2 barrier{};
+    barrier.srcStageMask = vk::PipelineStageFlagBits2::eAccelerationStructureBuildKHR;
+    barrier.srcAccessMask = vk::AccessFlagBits2::eAccelerationStructureWriteKHR;
+    barrier.dstStageMask = vk::PipelineStageFlagBits2::eAccelerationStructureBuildKHR;
+    barrier.dstAccessMask = vk::AccessFlagBits2::eAccelerationStructureReadKHR;
+
+    vk::DependencyInfo depInfo{};
+    depInfo.memoryBarrierCount = 1;
+    depInfo.pMemoryBarriers = &barrier;
+    cmdBuffer.pipelineBarrier2(depInfo);
+
+    // Build TLAS with instances.
+    // Instanced entities contribute one TLAS instance per mesh instance, so estimate the total up front for reserve().
+    size_t estimatedInstances = 0;
+    for (Entity* candidate : renderableEntities) {
+      if (candidate == nullptr) {
+        continue;
+      }
+      // Entities without a mesh component, or without explicit instances, count as one.
+      auto* meshForCount = candidate->GetComponent<MeshComponent>();
+      const size_t explicitInstances = meshForCount ? meshForCount->GetInstanceCount() : 0;
+      estimatedInstances += std::max<size_t>(explicitInstances, 1);
+      if (estimatedInstances > 1000000) {
+        break; // safety cap: stop counting once the estimate is already very large
+      }
+    }
+    std::vector<vk::AccelerationStructureInstanceKHR> instances;
+    instances.reserve(std::max<size_t>(renderableEntities.size(), estimatedInstances));
+
+    // Build per-instance geometry info in the SAME order as TLAS instances
+    std::vector<GeometryInfo> geometryInfos; // defined later in file; we reuse the type
+    geometryInfos.reserve(instances.capacity());
+    tlasInstanceOrder.clear();
+
+    // Ray Query texture table (binding 6): seed reserved shared-default slots.
+    // We will assign per-material texture indices into this table, and the descriptor update
+    // will resolve each slot to either the streamed texture or a type-appropriate fallback.
+    rayQueryTexKeys.clear();
+    rayQueryTexFallbackSlots.clear();
+    rayQueryTexIndex.clear();
+    rayQueryTexCount = 0;
+
+    auto seedReservedSlot = [&](uint32_t slot, const std::string& textureId) {
+      if (slot >= rayQueryTexKeys.size()) { // grow both parallel tables so `slot` is addressable
+        rayQueryTexKeys.resize(slot + 1);
+        rayQueryTexFallbackSlots.resize(slot + 1);
+      }
+      const std::string resolvedKey = ResolveTextureId(textureId);
+      rayQueryTexIndex[resolvedKey] = slot;
+      rayQueryTexFallbackSlots[slot] = slot; // a reserved slot is its own fallback
+      rayQueryTexKeys[slot] = resolvedKey;
+    };
+
+    seedReservedSlot(RQ_SLOT_DEFAULT_BASECOLOR, SHARED_DEFAULT_ALBEDO_ID);
+    seedReservedSlot(RQ_SLOT_DEFAULT_NORMAL, SHARED_DEFAULT_NORMAL_ID);
+    seedReservedSlot(RQ_SLOT_DEFAULT_METALROUGH, SHARED_DEFAULT_METALLIC_ROUGHNESS_ID);
+    seedReservedSlot(RQ_SLOT_DEFAULT_OCCLUSION, SHARED_DEFAULT_OCCLUSION_ID);
+    seedReservedSlot(RQ_SLOT_DEFAULT_EMISSIVE, SHARED_DEFAULT_EMISSIVE_ID);
+    rayQueryTexCount = static_cast<uint32_t>(rayQueryTexKeys.size());
+
+    // Build an authoritative lookup from `materialIndex` -> `Material*`.
+    // We already embed both the numeric `materialIndex` and the material name in entity names
+    // (`modelName_Material_<index>_<materialName>`). Use this mapping so TLAS instance flags
+    // can be set per-instance using the resolved `materialIndex` (critical for MASK/BLEND decals).
+    std::unordered_map<uint32_t, const Material*> materialByIndex;
+    if (modelLoader) {
+      static constexpr uint32_t kMaxSupportedMaterialIndex = 100000u;
+      materialByIndex.reserve(renderableEntities.size());
+      for (Entity* candidate : renderableEntities) {
+        if (candidate == nullptr) {
+          continue;
+        }
+        const std::string& entityName = candidate->GetName();
+        if (entityName.rfind("Ball_", 0) == 0) {
+          // Entities whose name starts with "Ball_" share the reserved index 9999.
+          if (const Material* ballMaterial = modelLoader->GetMaterial("BallMaterial")) {
+            materialByIndex[9999u] = ballMaterial;
+          }
+          continue;
+        }
+        // Expected layout: <model>_Material_<index>_<materialName>; anything else is skipped.
+        const size_t tagPos = entityName.find("_Material_");
+        if (tagPos == std::string::npos) {
+          continue;
+        }
+        const size_t digitsBegin = tagPos + 10; // length of "_Material_"
+        const size_t digitsEnd = entityName.find('_', digitsBegin);
+        if (digitsEnd == std::string::npos || digitsEnd + 1 >= entityName.size()) {
+          continue; // no index terminator, or nothing after it to use as the material name
+        }
+        uint32_t parsedIndex = 0;
+        try {
+          parsedIndex = static_cast<uint32_t>(std::stoi(entityName.substr(digitsBegin, digitsEnd - digitsBegin)));
+        } catch (...) {
+          continue; // index field is not a parsable int
+        }
+        if (parsedIndex > kMaxSupportedMaterialIndex) {
+          continue;
+        }
+        if (const Material* resolved = modelLoader->GetMaterial(entityName.substr(digitsEnd + 1))) {
+          materialByIndex[parsedIndex] = resolved;
+        }
+      }
+    }
+
+    // Maps texId to its ray-query texture-table slot, allocating a new slot on first use; returns fallbackSlot when texId is empty or RQ_MAX_TEX slots are already in use.
+    auto addTextureSlot = [&](const std::string& texId, uint32_t fallbackSlot) -> uint32_t {
+      if (texId.empty())
+        return fallbackSlot;
+      std::string key = ResolveTextureId(texId);
+      auto it = rayQueryTexIndex.find(key);
+      if (it != rayQueryTexIndex.end())
+        return it->second;
+      if (rayQueryTexCount >= RQ_MAX_TEX)
+        return fallbackSlot;
+      uint32_t slot = rayQueryTexCount;
+      rayQueryTexKeys.push_back(key);
+      rayQueryTexFallbackSlots.push_back(fallbackSlot);
+      rayQueryTexIndex[key] = slot;
+      rayQueryTexCount++;
+      // Ensure streaming is requested (CPU-side decode can happen off-thread; GPU upload stays on main thread).
+      // Best-effort: a failed request must not abort the build, but it is reported instead of silently swallowed.
+      try {
+        RegisterTextureUser(key, nullptr);
+      } catch (...) {
+        std::cerr << "Ray query: failed to request streaming for texture '" << key << "'\n";
+      }
+      return slot;
+    };
+
+    uint32_t runningInstanceIndex = 0;
+    {
+        // Pre-calculate the environment flag once per mesh component (used to optimize TLAS
+        // refit). An entity counts as environment when its lowercased name contains one of
+        // the keywords below, or when any component of its scale exceeds 400.
+        static constexpr const char* kEnvironmentKeywords[] = {
+            "sky", "dome", "env", "bg", "atmosphere",
+            "cloud", "fog", "background", "exterior"};
+
+        std::unique_lock<std::shared_mutex> lock(g_advancedStateMutex);
+        for (auto entity : renderableEntities) {
+          kickWatchdog();
+          auto& cached = g_meshAdvancedResources[entity->GetComponent<MeshComponent>()];
+          if (cached.isEnvironmentChecked) {
+              continue; // already classified
+          }
+
+          std::string lowered = entity->GetName();
+          std::transform(lowered.begin(), lowered.end(), lowered.begin(), [](unsigned char c){ return std::tolower(c); });
+          bool looksLikeEnvironment = std::any_of(
+              std::begin(kEnvironmentKeywords), std::end(kEnvironmentKeywords),
+              [&lowered](const char* keyword) { return lowered.find(keyword) != std::string::npos; });
+
+          if (!looksLikeEnvironment) {
+             // Name gave no hint: fall back to the scale heuristic.
+             auto xform = entity->GetComponent<TransformComponent>();
+             const glm::vec3 scale = xform ? xform->GetScale() : glm::vec3(1.0f);
+             looksLikeEnvironment = (scale.x > 400.0f || scale.y > 400.0f || scale.z > 400.0f);
+          }
+
+          cached.isEnvironment = looksLikeEnvironment;
+          cached.isEnvironmentChecked = true;
+        }
+    }
+
+    for (auto entity : renderableEntities) {
+      kickWatchdog();
+      auto meshComp = entity->GetComponent<MeshComponent>();
+      uint32_t blasIndex = meshToBLAS.at(meshComp);
+
+      auto transform = entity->GetComponent<TransformComponent>();
+      const glm::mat4 entityModel = transform ? transform->GetModelMatrix() : glm::mat4(1.0f);
+
+      // Use per-instance transforms whenever at least one instance exists, even if only one.
+      const size_t meshInstCount = meshComp->GetInstanceCount();
+      const bool hasInstance = (meshInstCount > 0);
+      const size_t instCount = std::max<size_t>(1, meshInstCount);
+
+      for (size_t iInst = 0; iInst < instCount; ++iInst) {
+        kickWatchdog();
+        glm::mat4 finalModel = entityModel;
+        if (hasInstance && iInst < meshInstCount) {
+          const InstanceData& id = meshComp->GetInstance(iInst);
+          finalModel = entityModel * id.getModelMatrix(); // match raster path: ubo.model * instanceModel
+        }
+
+        // Extract material index early so we can set TLAS instance flags per-instance.
+        uint32_t resolvedMaterialIndex = 0;
+        if (hasInstance && iInst < meshInstCount) {
+          resolvedMaterialIndex = meshComp->GetInstance(iInst).materialIndex;
+        } else {
+          // Special case: Ball entities (named "Ball_N") use a red material
+          // Use strict prefix match to avoid turning other objects red
+          if (entity->GetName().find("Ball_") == 0) {
+            resolvedMaterialIndex = 9999; // Reserve index 9999 for the ball material
+          } else {
+            // Extract material index from entity name (model_Material_{index}_materialName)
+            const std::string& entityName = entity->GetName();
+            size_t matPos = entityName.find("_Material_");
+            if (matPos != std::string::npos) {
+              size_t numStart = matPos + 10; // length of "_Material_"
+              size_t numEnd = entityName.find('_', numStart);
+              if (numEnd != std::string::npos) {
+                try {
+                  resolvedMaterialIndex = static_cast<uint32_t>(std::max(0, std::stoi(entityName.substr(numStart, numEnd - numStart))));
+                } catch (...) {
+                  resolvedMaterialIndex = 0;
+                }
+              }
+            }
+          }
+        }
+
+        // Convert to Vulkan 3x4 row-major transform
+        const float* m = glm::value_ptr(finalModel);
+        vk::TransformMatrixKHR vkTransform;
+        for (int row = 0; row < 3; row++) {
+          for (int col = 0; col < 4; col++) {
+            vkTransform.matrix[row][col] = m[col * 4 + row];
+          }
+        }
+
+        // (Debug TLAS-XFORM logs removed for production)
+
+        vk::AccelerationStructureInstanceKHR AS_Instance{};
+        AS_Instance.transform = vkTransform;
+        AS_Instance.instanceCustomIndex = runningInstanceIndex; // per-instance sequential index
+        // NOTE: AS_Instance.mask is assigned once further down, after the environment
+        // classification is known (0x01 = regular geometry, 0x02 = environment).
+        // Mirror the per-instance index into the SBT record offset so either
+        // CommittedInstanceID() or CommittedInstanceContributionToHitGroupIndex()
+        // can be used in the shader to recover the per-instance index.
+        AS_Instance.instanceShaderBindingTableRecordOffset = runningInstanceIndex;
+        // Determine alpha mode and environment status for this entity's material.
+        VkGeometryInstanceFlagsKHR instFlags = VK_GEOMETRY_INSTANCE_TRIANGLE_FACING_CULL_DISABLE_BIT_KHR;
+        bool forceNoOpaque = false;
+        bool forceOpaque = false;
+        bool isEnvironment = false;
+        {
+          // Determine environment status from entity name (standard naming convention)
+          std::string nameLower = entity->GetName();
+          std::transform(nameLower.begin(), nameLower.end(), nameLower.begin(), [](unsigned char c){ return std::tolower(c); });
+          if (nameLower.find("sky") != std::string::npos ||
+              nameLower.find("dome") != std::string::npos ||
+              nameLower.find("env") != std::string::npos ||
+              nameLower.find("bg") != std::string::npos ||
+              nameLower.find("atmosphere") != std::string::npos ||
+              nameLower.find("cloud") != std::string::npos ||
+              nameLower.find("fog") != std::string::npos ||
+              nameLower.find("background") != std::string::npos ||
+              nameLower.find("exterior") != std::string::npos) {
+            isEnvironment = true;
+          }
+
+          // Safety check: if object is enormous, treat it as environment to prevent occlusion
+          if (!isEnvironment) {
+             glm::vec3 scale = transform ? transform->GetScale() : glm::vec3(1.0f);
+             if (scale.x > 400.0f || scale.y > 400.0f || scale.z > 400.0f) {
+                 isEnvironment = true;
+                 std::cout << "Entity '" << entity->GetName() << "' auto-classified as ENVIRONMENT (scale > 400)" << std::endl;
+             }
+          }
+
+          // Determine opacity classification for ray queries.
+          // IMPORTANT: Be conservative here.
+          // If *either* the renderer material cache OR the model-loader material says this is non-opaque,
+          // we must set FORCE_NO_OPAQUE so the ray-query shader can correctly alpha-test / skip blends.
+          bool forceNoOpaqueCache = false;
+          bool forceOpaqueCache = false;
+          const Material* cachedMat = nullptr;
+
+          auto itRes = entityResources.find(entity);
+          if (itRes != entityResources.end()) {
+            ensureEntityMaterialCache(entity, itRes->second);
+            const MaterialProperties& mp = itRes->second.cachedMaterialProps;
+            const bool masked = (mp.alphaMask > 0.5f);
+            const bool transmissive = (mp.transmissionFactor > 0.01f);
+            const bool blended = itRes->second.cachedIsBlended;
+            const bool glassHint = itRes->second.cachedIsGlass || itRes->second.cachedIsLiquid;
+            forceNoOpaqueCache = masked || blended || transmissive || glassHint;
+            forceOpaqueCache = (!masked) && (!blended) && (!transmissive) && (!glassHint);
+            cachedMat = itRes->second.cachedMaterial;
+          }
+
+          bool forceNoOpaqueMat = false;
+          bool forceOpaqueMat = false;
+          const Material* mat = nullptr;
+          auto itByIndex = materialByIndex.find(resolvedMaterialIndex);
+          if (itByIndex != materialByIndex.end()) {
+            mat = itByIndex->second;
+          }
+          if (!mat) {
+            mat = cachedMat;
+          }
+          if (!mat && modelLoader) {
+            // Legacy lookup: Entity name format "modelName_Material_<index>_<materialName>".
+            const std::string& entityName = entity->GetName();
+            size_t matPos = entityName.find("_Material_");
+            if (matPos != std::string::npos) {
+              size_t numStart = matPos + 10;
+              size_t numEnd = entityName.find('_', numStart);
+              if (numEnd != std::string::npos && numEnd + 1 < entityName.size()) {
+                std::string matName = entityName.substr(numEnd + 1);
+                mat = modelLoader->GetMaterial(matName);
+              }
+            }
+          }
+          if (mat) {
+            // - MASK: needs candidate hits so we can alpha-test in-shader
+            // - BLEND / glass / transmission: should not fully block rays
+            forceNoOpaqueMat = (mat->alphaMode == "MASK") || (mat->alphaMode == "BLEND") || mat->isGlass || (mat->transmissionFactor > 0.01f);
+            forceOpaqueMat = (mat->alphaMode == "OPAQUE") && (!mat->isGlass) && (mat->transmissionFactor <= 0.01f);
+          }
+
+          forceNoOpaque = forceNoOpaqueCache || forceNoOpaqueMat;
+
+          // If we are confident this is a solid opaque surface, force opaque.
+          // This improves stability/perf and prevents "missing geometry" when BLAS geometry flags
+          // cause everything to be treated as non-opaque candidates.
+          forceOpaque = (!forceNoOpaque) && (!isEnvironment) && (forceOpaqueCache || forceOpaqueMat);
+        }
+        // Force instance opacity behavior only when we are confident.
+        if (forceNoOpaque) {
+          instFlags |= VK_GEOMETRY_INSTANCE_FORCE_NO_OPAQUE_BIT_KHR;
+        } else if (forceOpaque) {
+          instFlags |= VK_GEOMETRY_INSTANCE_FORCE_OPAQUE_BIT_KHR;
+        }
+        AS_Instance.flags = static_cast<VkGeometryInstanceFlagsKHR>(instFlags);
+
+        // Use mask bits: 0x01 = regular geometry, 0x02 = environment (skybox).
+        // Shadow rays will use mask 0x01 to avoid occlusion by the skybox.
+        AS_Instance.mask = isEnvironment ? 0x02 : 0x01;
+
+        AS_Instance.accelerationStructureReference = blasStructures[blasIndex].deviceAddress;
+        instances.push_back(AS_Instance);
+
+        // Track mapping for refit
+        TlasInstanceRef ref{};
+        ref.entity = entity;
+        ref.instanced = hasInstance;
+        ref.instanceIndex = static_cast<uint32_t>(hasInstance ? iInst : 0);
+        tlasInstanceOrder.push_back(ref);
+
+        // Build geometry info entry for this instance (addresses identical for all instances of same mesh)
+        const auto& meshRes = meshResources.at(meshComp);
+        bool isDeformable = IsMeshComponentDeformable(meshComp);
+        vk::Buffer activeVB = *meshRes.vertexBuffer;
+        if (isDeformable) {
+            std::shared_lock<std::shared_mutex> lock(g_advancedStateMutex);
+            auto it = g_meshAdvancedResources.find(meshComp);
+            if (it != g_meshAdvancedResources.end() && *it->second.outputVertexBuffer) {
+                activeVB = *it->second.outputVertexBuffer;
+            }
+        }
+        vk::DeviceAddress vertexAddr = getBufferDeviceAddress(device, activeVB);
+        vk::DeviceAddress indexAddr = getBufferDeviceAddress(device, *meshRes.indexBuffer);
+
+        GeometryInfo gi{};
+        gi.vertexBufferAddress = vertexAddr;
+        gi.indexBufferAddress = indexAddr;
+        gi.vertexCount = static_cast<uint32_t>(meshComp->GetVertices().size());
+        gi.materialIndex = resolvedMaterialIndex;
+        // Provide indexCount so shader can bound-check primitiveIndex safely
+        gi.indexCount = meshRes.indexCount;
+        gi._pad0 = 0;
+        // Store normal transform for correct world-space normals and tangent-space normal mapping.
+        // Use the full per-instance finalModel (entityModel * instanceModel) to match raster.
+        {
+          glm::mat3 nrm = glm::transpose(glm::inverse(glm::mat3(finalModel)));
+          gi.normalMatrix0 = glm::vec4(nrm[0], 0.0f);
+          gi.normalMatrix1 = glm::vec4(nrm[1], 0.0f);
+          gi.normalMatrix2 = glm::vec4(nrm[2], 0.0f);
+        }
+        geometryInfos.push_back(gi);
+
+        runningInstanceIndex++;
+      }
+    }
+
+    // Build TLAS
+
+    // Create instances buffer (persistent for TLAS UPDATE/Refit)
+    vk::DeviceSize instancesSize = sizeof(vk::AccelerationStructureInstanceKHR) * instances.size();
+    auto [instancesBufferTmp, instancesAllocTmp] = createBufferPooled(
+      instancesSize,
+      vk::BufferUsageFlagBits::eAccelerationStructureBuildInputReadOnlyKHR | vk::BufferUsageFlagBits::eShaderDeviceAddress,
+      vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+
+    // Upload instances - use mappedPtr directly
+    void* instancesData = instancesAllocTmp->mappedPtr;
+    if (!instancesData) {
+      std::cerr << "Failed to get mapped pointer for instances buffer\n";
+      return false;
+    }
+    memcpy(instancesData, instances.data(), instancesSize);
+
+    // Persist instances buffer/allocation and order for UPDATE (refit)
+    tlasInstancesBuffer = std::move(instancesBufferTmp);
+    tlasInstancesAllocation = std::move(instancesAllocTmp);
+    tlasInstanceCount = static_cast<uint32_t>(instances.size());
+    // tlasInstanceOrder already filled above in the same order as 'instances'
+
+    // (Debug TLAS composition logs removed.)
+
+    vk::DeviceAddress instancesAddress = getBufferDeviceAddress(device, *tlasInstancesBuffer);
+
+    // TLAS geometry
+    vk::AccelerationStructureGeometryKHR tlasGeometry{};
+    tlasGeometry.geometryType = vk::GeometryTypeKHR::eInstances;
+    // Do not force OPAQUE here; leave flags at 0 so ray queries may process
+    // transparency/glass more flexibly (any-hit not used in our path).
+    tlasGeometry.flags = {};
+    tlasGeometry.geometry.instances = vk::AccelerationStructureGeometryInstancesDataKHR{
+      .arrayOfPointers = VK_FALSE,
+      .data = instancesAddress
+    };
+
+    // TLAS build info
+    vk::AccelerationStructureBuildGeometryInfoKHR tlasBuildInfo{};
+    tlasBuildInfo.type = vk::AccelerationStructureTypeKHR::eTopLevel;
+    tlasBuildInfo.flags = vk::BuildAccelerationStructureFlagBitsKHR::ePreferFastTrace |
+        vk::BuildAccelerationStructureFlagBitsKHR::eAllowUpdate; // enable UPDATE/Refit
+    tlasBuildInfo.mode = vk::BuildAccelerationStructureModeKHR::eBuild;
+    tlasBuildInfo.geometryCount = 1;
+    tlasBuildInfo.pGeometries = &tlasGeometry;
+
+    auto instanceCount = static_cast<uint32_t>(instances.size());
+
+    // Get TLAS size requirements
+    vk::AccelerationStructureBuildSizesInfoKHR tlasSizeInfo = device.getAccelerationStructureBuildSizesKHR(
+      vk::AccelerationStructureBuildTypeKHR::eDevice,
+      tlasBuildInfo,
+      instanceCount);
+
+    // Create TLAS buffer
+    auto [tlasBuffer, tlasAlloc] = createBufferPooled(
+      tlasSizeInfo.accelerationStructureSize,
+      vk::BufferUsageFlagBits::eAccelerationStructureStorageKHR | vk::BufferUsageFlagBits::eShaderDeviceAddress,
+      vk::MemoryPropertyFlagBits::eDeviceLocal);
+
+    // Create TLAS
+    vk::AccelerationStructureCreateInfoKHR tlasCreateInfo{};
+    tlasCreateInfo.buffer = *tlasBuffer;
+    tlasCreateInfo.size = tlasSizeInfo.accelerationStructureSize;
+    tlasCreateInfo.type = vk::AccelerationStructureTypeKHR::eTopLevel;
+
+    vk::raii::AccelerationStructureKHR tlasHandle(device, tlasCreateInfo);
+
+    // Create TLAS scratch buffer (for initial build)
+    auto [tlasScratchBuffer, tlasScratchAlloc] = createBufferPooled(
+      tlasSizeInfo.buildScratchSize,
+      vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eShaderDeviceAddress,
+      vk::MemoryPropertyFlagBits::eDeviceLocal);
+
+    vk::DeviceAddress tlasScratchAddress = getBufferDeviceAddress(device, *tlasScratchBuffer);
+
+    // Update TLAS build info (dereference RAII handle)
+    tlasBuildInfo.dstAccelerationStructure = *tlasHandle;
+    tlasBuildInfo.scratchData = tlasScratchAddress;
+
+    // Keep TLAS scratch buffer alive until after GPU execution (after fence wait)
+    scratchBuffers.push_back(std::move(tlasScratchBuffer));
+    scratchAllocations.push_back(std::move(tlasScratchAlloc));
+
+    // Ensure/update a persistent scratch buffer for TLAS UPDATE (refit)
+    // Allocate once sized to updateScratchSize; recreate if needed for larger scenes
+    if (!*tlasUpdateScratchBuffer || !tlasUpdateScratchAllocation || tlasUpdateScratchAllocation->size < tlasSizeInfo.updateScratchSize) {
+      auto [updBuf, updAlloc] = createBufferPooled(
+        tlasSizeInfo.updateScratchSize,
+        vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eShaderDeviceAddress,
+        vk::MemoryPropertyFlagBits::eDeviceLocal);
+      tlasUpdateScratchBuffer = std::move(updBuf);
+      tlasUpdateScratchAllocation = std::move(updAlloc);
+    }
+
+    // TLAS build range
+    vk::AccelerationStructureBuildRangeInfoKHR tlasRangeInfo{};
+    tlasRangeInfo.primitiveCount = instanceCount;
+    tlasRangeInfo.primitiveOffset = 0;
+    tlasRangeInfo.firstVertex = 0;
+    tlasRangeInfo.transformOffset = 0;
+
+    // Build TLAS - Vulkan-Hpp RAII takes array spans, not pointers
+    std::array<const vk::AccelerationStructureBuildRangeInfoKHR *, 1> tlasRangeInfos = {&tlasRangeInfo};
+    cmdBuffer.buildAccelerationStructuresKHR(tlasBuildInfo, tlasRangeInfos);
+
+    // Get TLAS device address (dereference RAII handle)
+    vk::AccelerationStructureDeviceAddressInfoKHR tlasAddressInfo{};
+    tlasAddressInfo.accelerationStructure = *tlasHandle;
+    vk::DeviceAddress tlasAddress = device.getAccelerationStructureAddressKHR(tlasAddressInfo);
+
+    // Store TLAS (move RAII handle to avoid copy)
+    tlasStructure.buffer = std::move(tlasBuffer);
+    tlasStructure.allocation = std::move(tlasAlloc);
+    tlasStructure.handle = std::move(tlasHandle);
+    tlasStructure.deviceAddress = tlasAddress;
+
+    cmdBuffer.end();
+
+    // Submit and wait
+    vk::SubmitInfo submitInfo{};
+    submitInfo.commandBufferCount = 1;
+    submitInfo.pCommandBuffers = &(*cmdBuffer);
+
+    vk::raii::Fence fence(device, vk::FenceCreateInfo{}); {
+      std::lock_guard<std::mutex> lock(queueMutex);
+      graphicsQueue.submit(submitInfo, *fence);
+    }
+
+    // Wait with periodic watchdog kicks to avoid false hang detection on large scenes.
+    (void) waitForFencesSafe(*fence, VK_TRUE);
+    // TLAS build completed on GPU
+    setASUi(true,
+            "AS: upload buffers",
+            totalSteps > 0 ? static_cast<float>(static_cast<uint32_t>(uniqueMeshes.size()) + 1u) / static_cast<float>(totalSteps) : 0.0f,
+            static_cast<uint32_t>(uniqueMeshes.size()) + 1u,
+            totalSteps);
+
+    // (Verbose TLAS composition dumps removed; keep logs quiet.)
+
+    // Record the counts we just built so we don't rebuild with smaller subsets later.
+    // Keep entity counts and TLAS instance counts separate to avoid unit mismatches.
+    lastASBuiltBLASCount = blasStructures.size();
+    lastASBuiltInstanceCount = renderableEntities.size();
+    lastASBuiltTlasInstanceCount = instanceCount;
+
+    // Build geometry info buffer PER INSTANCE (same order as TLAS instances)
+    // geometryInfos already populated above in TLAS instance loop
+
+    // Create and upload geometry info buffer (one GeometryInfo per TLAS instance, same order)
+    if (!geometryInfos.empty()) {
+      vk::DeviceSize geoInfoSize = sizeof(GeometryInfo) * geometryInfos.size();
+      auto [geoInfoBuf, geoInfoAlloc] = createBufferPooled(
+        geoInfoSize,
+        vk::BufferUsageFlagBits::eStorageBuffer,
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+
+      void* geoInfoData = geoInfoAlloc->mappedPtr;
+      if (geoInfoData) {
+        memcpy(geoInfoData, geometryInfos.data(), geoInfoSize);
+      } else {
+        // Nothing was uploaded, so the buffer contents are undefined; report it rather than failing silently.
+        std::cerr << "Failed to get mapped pointer for geometry info buffer\n";
+      }
+      geometryInfoBuffer = std::move(geoInfoBuf);
+      geometryInfoAllocation = std::move(geoInfoAlloc);
+      geometryInfoCountCPU = geometryInfos.size();
+    }
+    // Post buffers done
+    setASUi(true,
+            "AS: finalize",
+            totalSteps > 0 ? static_cast<float>(static_cast<uint32_t>(uniqueMeshes.size()) + 2u) / static_cast<float>(totalSteps) : 1.0f,
+            static_cast<uint32_t>(uniqueMeshes.size()) + 2u,
+            totalSteps);
+
+    // Build material buffer with real materials from ModelLoader
+    {
+      // Build material buffer
+
+      // Collect material indices used by this TLAS build.
+      // Do not rely only on entity-name parsing here: instanced geometry uses
+      // `InstanceData.materialIndex`, which may not appear in entity names (decals/foliage).
+      std::unordered_set<uint32_t> usedMaterialIndices;
+      usedMaterialIndices.reserve(geometryInfos.size() + 16);
+      usedMaterialIndices.insert(0u);
+
+      std::map<uint32_t, std::string> materialIndexToName; // legacy fallback (debug/heuristics)
+      static constexpr uint32_t kMaxSupportedMaterialIndex = 100000u;
+
+      // Legacy discovery: scan entity names of the form
+      // "modelName_Material_{materialIndex}_materialName" and record every referenced
+      // material index together with its material name.
+      for (Entity* entity : renderableEntities) {
+        const std::string& entityName = entity->GetName();
+
+        size_t matPos = entityName.find("_Material_");
+        if (matPos != std::string::npos) {
+          size_t numStart = matPos + 10; // length of "_Material_"
+          size_t numEnd = entityName.find('_', numStart);
+
+          if (numEnd != std::string::npos) {
+            try {
+              // A negative number wraps to a huge unsigned value and is rejected by the range check below.
+              const uint32_t matIndex = static_cast<uint32_t>(std::stoi(entityName.substr(numStart, numEnd - numStart)));
+              if (matIndex > kMaxSupportedMaterialIndex) {
+                // Malformed entity name (or unexpected content) could yield a huge index.
+                // Skip to avoid allocating an enormous material table or writing out of bounds.
+                continue;
+              }
+
+              // Extract material name (everything after materialIndex_)
+              const std::string materialName = entityName.substr(numEnd + 1);
+              materialIndexToName[matIndex] = materialName;
+              usedMaterialIndices.insert(matIndex);
+            } catch (...) {
+              // Failed to parse, skip
+            }
+          }
+        } else if (entityName.find("Ball_") == 0) {
+          // Entities whose name starts with "Ball_" use the reserved material index 9999.
+          materialIndexToName[9999] = "BallMaterial";
+          usedMaterialIndices.insert(9999u);
+        }
+      }
+
+      // Authoritative: include indices referenced by built geometry infos (covers instanced materials).
+      for (const GeometryInfo& gi : geometryInfos) {
+        if (gi.materialIndex <= kMaxSupportedMaterialIndex) {
+          usedMaterialIndices.insert(gi.materialIndex);
+        }
+      }
+
+      // (Verbose material discovery logs removed.)
+
+      // Create default material for index 0 and any missing indices
+      MaterialData defaultMat{};
+      defaultMat.albedo = glm::vec3(0.8f, 0.8f, 0.8f);
+      defaultMat.metallic = 0.0f;
+      defaultMat.roughness = 0.5f;
+      defaultMat.emissive = glm::vec3(0.0f);
+      defaultMat.ao = 1.0f;
+      defaultMat.ior = 1.5f;
+      defaultMat.emissiveStrength = 1.0f;
+      defaultMat.alpha = 1.0f;
+      defaultMat.transmissionFactor = 0.0f;
+      defaultMat.alphaCutoff = 0.5f;
+      defaultMat.alphaMode = 0; // OPAQUE
+      defaultMat.isGlass = 0;
+      defaultMat.isLiquid = 0;
+      // Thick-glass defaults
+      defaultMat.absorptionColor = glm::vec3(1.0f);
+      defaultMat.absorptionDistance = 1.0f;
+      defaultMat.thinWalled = 1u; // default to thin to avoid over-darkening
+      // Texture-set flags: -1 = no texture bound for that channel
+      defaultMat.baseColorTextureSet = -1;
+      defaultMat.physicalDescriptorTextureSet = -1;
+      defaultMat.normalTextureSet = -1;
+      defaultMat.occlusionTextureSet = -1;
+      defaultMat.emissiveTextureSet = -1;
+      // Default texture indices (reserved slots)
+      defaultMat.baseColorTexIndex = static_cast<int32_t>(RQ_SLOT_DEFAULT_BASECOLOR);
+      defaultMat.normalTexIndex = static_cast<int32_t>(RQ_SLOT_DEFAULT_NORMAL);
+      defaultMat.physicalTexIndex = static_cast<int32_t>(RQ_SLOT_DEFAULT_METALROUGH);
+      defaultMat.occlusionTexIndex = static_cast<int32_t>(RQ_SLOT_DEFAULT_OCCLUSION);
+      defaultMat.emissiveTexIndex = static_cast<int32_t>(RQ_SLOT_DEFAULT_EMISSIVE);
+      defaultMat.useSpecGlossWorkflow = 0;
+      defaultMat.glossinessFactor = 1.0f;
+      defaultMat.specularFactor = glm::vec3(0.04f);
+      defaultMat.hasEmissiveStrengthExt = 0;
+      defaultMat._padMat[0] = defaultMat._padMat[1] = defaultMat._padMat[2] = 0;
+
+      // Build material array with proper indexing
+      std::vector<MaterialData> materials;
+
+      // Determine max material index to size the array
+      uint32_t maxMaterialIndex = 0;
+      for (uint32_t index : usedMaterialIndices) {
+        maxMaterialIndex = std::max(maxMaterialIndex, index);
+      }
+      maxMaterialIndex = std::min(maxMaterialIndex, kMaxSupportedMaterialIndex);
+
+      // Ensure minimum size of 100 materials for safety (matches original implementation)
+      uint32_t materialCount = std::max(maxMaterialIndex + 1, 100u);
+      materials.resize(materialCount, defaultMat);
+
+      // Capture per-material texture paths (for streaming requests and debugging)
+      rqMaterialTexPaths.clear();
+      rqMaterialTexPaths.resize(materials.size());
+
+      // Populate materials from ModelLoader
+      uint32_t loadedCount = 0;
+      uint32_t glassCount = 0;
+      uint32_t transparentCount = 0;
+      if (modelLoader) {
+        // Populate materials from ModelLoader
+        size_t matProcessed = 0;
+        for (uint32_t index : usedMaterialIndices) {
+          if (index >= materials.size())
+            continue;
+
+          // `materialIndex` in this engine is not guaranteed to match the glTF
+          // material array index (especially for instanced meshes). Do not resolve by numeric
+          // index here; prefer the name mapping when available and otherwise fall back to a
+          // safe default material.
+          const Material* sourceMat = nullptr;
+          std::string materialName;
+          auto itName = materialIndexToName.find(index);
+          if (itName != materialIndexToName.end()) {
+            materialName = itName->second;
+            sourceMat = modelLoader->GetMaterial(materialName);
+          }
+
+          if (!sourceMat && index == 9999u) {
+            // Create a virtual red material for spawned balls
+            MaterialData& matData = materials[index];
+            matData.albedo = glm::vec3(1.0f, 0.05f, 0.05f); // Bright red
+            matData.roughness = 0.4f;
+            matData.metallic = 0.0f;
+            matData.ao = 1.0f;
+            matData.emissive = glm::vec3(0.0f);
+            matData.alpha = 1.0f;
+            matData.alphaMode = 0; // OPAQUE
+            matData.isGlass = 0;
+            matData.transmissionFactor = 0.0f;
+            matData.baseColorTextureSet = -1;
+            matData.normalTextureSet = -1;
+            matData.physicalDescriptorTextureSet = -1;
+            matData.occlusionTextureSet = -1;
+            matData.emissiveTextureSet = -1;
+            continue;
+          }
+
+          if (sourceMat) {
+            MaterialData& matData = materials[index];
+
+            // Copy PBR properties from Material to MaterialData
+            matData.albedo = sourceMat->albedo;
+            matData.metallic = sourceMat->metallic;
+            matData.emissive = sourceMat->emissive;
+            matData.roughness = sourceMat->roughness;
+            matData.ao = sourceMat->ao;
+            matData.ior = sourceMat->ior;
+            matData.emissiveStrength = sourceMat->emissiveStrength;
+            matData.alpha = sourceMat->alpha;
+            matData.transmissionFactor = sourceMat->transmissionFactor;
+            matData.alphaCutoff = sourceMat->alphaCutoff;
+
+            // Thick-glass parameters (no glTF volume parsing yet; use sensible defaults)
+            matData.absorptionColor = glm::vec3(1.0f);
+            matData.absorptionDistance = 1.0f;
+            // Consider engine-tagged glass as thick by default; others thin
+            matData.thinWalled = (sourceMat->isGlass ? 0u : 1u);
+            // Alpha mode encoding must match `shaders/ray_query.slang`:
+            // 0=OPAQUE, 1=MASK, 2=BLEND
+            if (sourceMat->alphaMode == "MASK") {
+              matData.alphaMode = 1;
+            } else if (sourceMat->alphaMode == "BLEND") {
+              matData.alphaMode = 2;
+            } else {
+              matData.alphaMode = 0;
+            }
+            // Heuristics to improve glass tagging for Ray Query path
+            // Many Bistro assets do not carry transmission extensions; use name hints
+            {
+              std::string lower = materialName;
+              std::transform(lower.begin(), lower.end(), lower.begin(), [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+
+              // Decal materials in Bistro are primarily identified by their baseColor texture path.
+              // `Material::albedoTexturePath` is often an alias ID like `gltf_texture_#`.
+              // Resolve it to the canonical path so we can detect `textures/decals/...` reliably.
+              if (matData.alphaMode == 0 && !sourceMat->albedoTexturePath.empty()) {
+                std::string resolvedBase = ResolveTextureId(sourceMat->albedoTexturePath);
+                std::string baseLower = resolvedBase;
+                std::transform(baseLower.begin(), baseLower.end(), baseLower.begin(), [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
+                if (baseLower.find("/decals/") != std::string::npos ||
+                    baseLower.find("\\decals\\") != std::string::npos ||
+                    baseLower.find("_decal") != std::string::npos) {
+                  matData.alphaMode = 2;
+                }
+              }
+
+              const bool hasGlassWord = lower.find("glass") != std::string::npos;
+              const bool isWindowPane = (lower.find("window") != std::string::npos) || (lower.find("pane") != std::string::npos);
+              const bool isLampGlass = (lower.find("lamp") != std::string::npos) && hasGlassWord;
+              const bool isGlassware = (lower.find("goblet") != std::string::npos) || (lower.find("bottle") != std::string::npos) || (lower.find("wine") != std::string::npos);
+
+              bool markGlass = sourceMat->isGlass || hasGlassWord || isWindowPane || isLampGlass || isGlassware;
+              matData.isGlass = markGlass ? 1u : 0u;
+              matData.isLiquid = sourceMat->isLiquid ? 1u : 0u;
+
+              // Ensure non-zero transmission for glass-like materials lacking the extension
+              if (markGlass && matData.transmissionFactor < 0.85f) {
+                matData.transmissionFactor = 0.9f;
+              }
+
+              // Thin/thick hint refinement: panes/lamps are thin shells; glassware is thick
+              if (isWindowPane || isLampGlass) {
+                matData.thinWalled = 1u;
+              } else if (isGlassware) {
+                matData.thinWalled = 0u;
+              }
+            }
+
+            // Texture-set flags (raster parity): -1 means no texture is authored for that slot.
+            matData.baseColorTextureSet = sourceMat->albedoTexturePath.empty() ? -1 : 0;
+            if (sourceMat->useSpecularGlossiness) {
+              matData.physicalDescriptorTextureSet = sourceMat->specGlossTexturePath.empty() ? -1 : 0;
+            } else {
+              matData.physicalDescriptorTextureSet = sourceMat->metallicRoughnessTexturePath.empty() ? -1 : 0;
+            }
+            matData.normalTextureSet = sourceMat->normalTexturePath.empty() ? -1 : 0;
+            matData.occlusionTextureSet = sourceMat->occlusionTexturePath.empty() ? -1 : 0;
+            matData.emissiveTextureSet = sourceMat->emissiveTexturePath.empty() ? -1 : 0;
+
+            // Texture paths and stable indices into the Ray Query texture table (binding 6)
+            if (index < rqMaterialTexPaths.size()) {
+              RQMaterialTexPaths& paths = rqMaterialTexPaths[index];
+              // Resolve alias IDs (`gltf_texture_*`) to canonical keys (file paths) so RayQuery
+              // samples the correct textures and decal heuristics can match paths.
+              paths.baseColor = ResolveTextureId(sourceMat->albedoTexturePath);
+              paths.normal = ResolveTextureId(sourceMat->normalTexturePath);
+              paths.physical = ResolveTextureId(sourceMat->useSpecularGlossiness ? sourceMat->specGlossTexturePath : sourceMat->metallicRoughnessTexturePath);
+              paths.occlusion = ResolveTextureId(sourceMat->occlusionTexturePath);
+              paths.emissive = ResolveTextureId(sourceMat->emissiveTexturePath);
+
+              matData.baseColorTexIndex = static_cast<int32_t>(addTextureSlot(paths.baseColor, RQ_SLOT_DEFAULT_BASECOLOR));
+              matData.normalTexIndex = static_cast<int32_t>(addTextureSlot(paths.normal, RQ_SLOT_DEFAULT_NORMAL));
+              matData.physicalTexIndex = static_cast<int32_t>(addTextureSlot(paths.physical, RQ_SLOT_DEFAULT_METALROUGH));
+              matData.occlusionTexIndex = static_cast<int32_t>(addTextureSlot(paths.occlusion, RQ_SLOT_DEFAULT_OCCLUSION));
+              matData.emissiveTexIndex = static_cast<int32_t>(addTextureSlot(paths.emissive, RQ_SLOT_DEFAULT_EMISSIVE));
+
+            }
+
+            // Specular-glossiness workflow support
+            matData.useSpecGlossWorkflow = sourceMat->useSpecularGlossiness ? 1 : 0;
+            matData.glossinessFactor = sourceMat->glossinessFactor;
+            matData.specularFactor = sourceMat->specularFactor;
+            matData.hasEmissiveStrengthExt = (std::abs(sourceMat->emissiveStrength - 1.0f) > 1e-6f) ? 1 : 0;
+            matData._padMat[0] = matData._padMat[1] = matData._padMat[2] = 0;
+
+            // Track glass and transparent materials for statistics
+            if (sourceMat->isGlass) {
+              glassCount++;
+            }
+            if (sourceMat->transmissionFactor > 0.1f) {
+              transparentCount++;
+            }
+
+            loadedCount++;
+          } else {
+            std::cerr << "Warning: Material '" << materialName
+                << "' not found in ModelLoader for index " << index << "\n";
+          }
+
+          matProcessed++;
+        }
+      } else {
+        std::cerr << "Warning: ModelLoader not available, using default materials\n";
+      }
+
+      // Create and upload material buffer (always create, even if no materials found)
+      vk::DeviceSize matSize = sizeof(MaterialData) * materials.size();
+      auto [matBuf, matAlloc] = createBufferPooled(
+        matSize,
+        vk::BufferUsageFlagBits::eStorageBuffer,
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+
+      void* matData = matAlloc->mappedPtr;
+      if (matData) {
+        memcpy(matData, materials.data(), matSize);
+      }
+
+      materialBuffer = std::move(matBuf);
+      materialAllocation = std::move(matAlloc);
+
+      // (Verbose material buffer stats removed.)
+
+      // Record material count for shader-side bounds (provided via UBO)
+      materialCountCPU = materials.size();
+    }
+
+    // The TLAS/material/geometry buffers and texture table contents may have changed.
+    // Mark ray query descriptor sets dirty so the render thread refreshes them at the next safe point.
+    const uint32_t allFramesMask = (MAX_FRAMES_IN_FLIGHT >= 32u) ? 0xFFFFFFFFu : ((1u << MAX_FRAMES_IN_FLIGHT) - 1u);
+    rayQueryDescriptorsDirtyMask.fetch_or(allFramesMask, std::memory_order_relaxed);
+
+    setASUi(true, "AS: done", 1.0f, totalSteps, totalSteps);
+    const auto elapsedMs = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - asStartCpu).count();
+    std::cout << "AS build completed in " << (static_cast<double>(elapsedMs) / 1000.0)
+        << "s (uniqueMeshes=" << uniqueMeshes.size()
+        << ", entities=" << renderableEntities.size()
+        << ", tlasInstances=" << instanceCount << ")\n";
+    return true;
+  } catch (const std::exception& e) {
+    const uint64_t startNs = asBuildUiStartNs.load(std::memory_order_relaxed);
+    if (startNs != 0) {
+      const uint64_t nowNs = static_cast<uint64_t>(
+        std::chrono::duration_cast<std::chrono::nanoseconds>(
+          std::chrono::steady_clock::now().time_since_epoch())
+        .count());
+      const double secs = (nowNs > startNs) ? (static_cast<double>(nowNs - startNs) / 1'000'000'000.0) : 0.0;
+      std::cerr << "Failed to build acceleration structures after " << secs << "s: " << e.what() << std::endl;
+    } else {
+      std::cerr << "Failed to build acceleration structures: " << e.what() << std::endl;
+    }
+    return false;
+  }
+}
+
+/**
+ * @brief Refit the existing TLAS in place (UPDATE-mode build) from current instance state.
+ *
+ * Rewrites the transform and mask of every entry of the host-mapped TLAS instance buffer
+ * (in `tlasInstanceOrder` order), records an UPDATE build with the TLAS as both source and
+ * destination into a transient command buffer, submits it on the graphics queue and then
+ * waits on a fence before returning.
+ *
+ * @param entities Not referenced by this function; instances are taken from `tlasInstanceOrder`.
+ * @param camera   Optional camera. Only consulted by the TLAS-mask culling path, which is
+ *                 currently compiled out via the `apply*ToRayQueryTLASMask` switches below.
+ * @return True if the update build was submitted and the fence wait returned. False if loading
+ *         is in progress, ray query / acceleration structures are disabled, a required resource
+ *         (TLAS handle, instance buffer or mapping, update scratch buffer) is missing, the
+ *         instance order does not match the instance count, or an exception was caught.
+ */
+bool Renderer::refitTopLevelAS(const std::vector<Entity *>& entities, CameraComponent* camera) {
+  (void) entities; // Unused: the instance list is tlasInstanceOrder, captured at build time.
+  try {
+    if (IsLoading())
+      return false;
+    if (!rayQueryEnabled || !accelerationStructureEnabled)
+      return false;
+    if (!*tlasStructure.handle)
+      return false;
+    if (!*tlasInstancesBuffer || !tlasInstancesAllocation || tlasInstanceOrder.size() != tlasInstanceCount)
+      return false;
+
+    // Update instance transforms in the persistent instances buffer
+    auto* instPtr = reinterpret_cast<vk::AccelerationStructureInstanceKHR *>(tlasInstancesAllocation->mappedPtr);
+    if (!instPtr)
+      return false;
+
+    // (RayQuery): Avoid dropping instances from TLAS via mask=0 based on heuristic culling.
+    // AABBs for some instanced meshes can be unreliable, and `mask=0` removes the instance from all
+    // ray queries, causing visible objects to vanish. Keep TLAS coverage stable.
+    // These switches are loop-invariant, so they are declared once here rather than per instance.
+    constexpr bool applyFrustumCullingToRayQueryTLASMask = false;
+    constexpr bool applyDistanceLodToRayQueryTLASMask = false;
+
+    // Optional culling parity with raster: mask TLAS instances using the same frustum + distance-LOD
+    // checks and the same toggles as the raster path. Folding the switches in here means the frustum
+    // is only extracted when the result can actually affect an instance mask.
+    const bool doFrustumCulling = applyFrustumCullingToRayQueryTLASMask && enableFrustumCulling && camera;
+    const bool doDistanceLOD = applyDistanceLodToRayQueryTLASMask && enableDistanceLOD && camera;
+    FrustumPlanes frustum{};
+    if (doFrustumCulling) {
+      const glm::mat4 vp = camera->GetProjectionMatrix() * camera->GetViewMatrix();
+      frustum = extractFrustumPlanes(vp);
+    }
+    const float camFovRad = camera ? glm::radians(camera->GetFieldOfView()) : glm::radians(60.0f);
+
+    AdvancedRenderer_KickWatchdog(this);
+    {
+      // Shared (read) lock covers the g_meshAdvancedResources lookups inside the loop.
+      std::shared_lock<std::shared_mutex> stateLock(g_advancedStateMutex);
+      for (uint32_t i = 0; i < tlasInstanceCount; ++i) {
+        const TlasInstanceRef& ref = tlasInstanceOrder[i];
+        Entity* entity = ref.entity;
+        if (!entity || !entity->IsActive()) {
+          // Missing/inactive entities stay in the TLAS but are hidden from all ray queries.
+          instPtr[i].mask = 0u;
+          continue;
+        }
+        auto* transform = entity->GetComponent<TransformComponent>();
+        glm::mat4 entityModel = transform ? transform->GetModelMatrix() : glm::mat4(1.0f);
+
+        // If this TLAS entry represents a MeshComponent instance, multiply by the instance's model
+        glm::mat4 finalModel = entityModel;
+        auto* meshComp = entity->GetComponent<MeshComponent>();
+        if (ref.instanced) {
+          if (meshComp && ref.instanceIndex < meshComp->GetInstanceCount()) {
+            const InstanceData& id = meshComp->GetInstance(ref.instanceIndex);
+            finalModel = entityModel * id.getModelMatrix();
+          }
+        }
+
+        // glm stores matrices column-major; VkTransformMatrixKHR is a row-major 3x4, so transpose
+        // while copying and drop the last row.
+        const float* m = glm::value_ptr(finalModel);
+        vk::TransformMatrixKHR vkTransform;
+        for (int row = 0; row < 3; row++) {
+          for (int col = 0; col < 4; col++) {
+            vkTransform.matrix[row][col] = m[col * 4 + row];
+          }
+        }
+        instPtr[i].transform = vkTransform;
+
+        // Apply culling via instance mask (mask=0 => skipped by ray queries with mask=0xFF).
+        uint32_t mask = 0x01u; // Default: geometry caster
+
+        if (meshComp) {
+          auto it = g_meshAdvancedResources.find(meshComp);
+          if (it != g_meshAdvancedResources.end()) {
+            const auto& advRes = it->second;
+            if (advRes.isEnvironmentChecked) {
+              mask = advRes.isEnvironment ? 0x02u : 0x01u;
+            }
+          }
+        }
+
+        // doFrustumCulling / doDistanceLOD already imply a non-null camera and an enabled switch.
+        if ((doFrustumCulling || doDistanceLOD) && meshComp && meshComp->HasLocalAABB()) {
+          bool visible = true;
+          glm::vec3 wmin{}, wmax{};
+          transformAABB(finalModel, meshComp->GetBaseMeshAABBMin(), meshComp->GetBaseMeshAABBMax(), wmin, wmax);
+
+          if (doFrustumCulling && !aabbIntersectsFrustum(wmin, wmax, frustum)) {
+            visible = false;
+          }
+
+          if (visible && doDistanceLOD) {
+            // Match raster LOD heuristic (projected-size skip)
+            glm::vec3 center = 0.5f * (wmin + wmax);
+            glm::vec3 extents = 0.5f * (wmax - wmin);
+            float radius = glm::length(extents);
+            if (radius > 0.0f) {
+              glm::vec4 centerVS4 = camera->GetViewMatrix() * glm::vec4(center, 1.0f);
+              float z = std::abs(centerVS4.z);
+              if (z > 1e-3f) {
+                float pixelRadius = (radius * static_cast<float>(swapChainExtent.height)) /
+                    (z * 2.0f * std::tan(camFovRad * 0.5f));
+                float pixelDiameter = pixelRadius * 2.0f;
+
+                bool useBlended = false;
+                auto it = entityResources.find(entity);
+                if (it != entityResources.end()) {
+                  ensureEntityMaterialCache(entity, it->second);
+                  useBlended = it->second.cachedIsBlended;
+                }
+
+                float threshold = useBlended ? lodPixelThresholdTransparent : lodPixelThresholdOpaque;
+                if (pixelDiameter < threshold) {
+                  visible = false;
+                }
+              }
+            }
+          }
+
+          if (!visible) {
+            mask = 0u;
+          }
+        }
+        instPtr[i].mask = mask;
+      }
+    } // stateLock released here, before any GPU work is recorded or submitted
+    AdvancedRenderer_KickWatchdog(this);
+
+    // Prepare UPDATE build info
+    vk::DeviceAddress instancesAddress = getBufferDeviceAddress(device, *tlasInstancesBuffer);
+
+    vk::AccelerationStructureGeometryKHR tlasGeometry{};
+    tlasGeometry.geometryType = vk::GeometryTypeKHR::eInstances;
+    tlasGeometry.flags = {};
+    tlasGeometry.geometry.instances = vk::AccelerationStructureGeometryInstancesDataKHR{
+      .arrayOfPointers = VK_FALSE,
+      .data = instancesAddress
+    };
+
+    // Source and destination are the same handle: the TLAS is updated in place.
+    vk::AccelerationStructureBuildGeometryInfoKHR tlasBuildInfo{};
+    tlasBuildInfo.type = vk::AccelerationStructureTypeKHR::eTopLevel;
+    tlasBuildInfo.flags = vk::BuildAccelerationStructureFlagBitsKHR::ePreferFastTrace |
+        vk::BuildAccelerationStructureFlagBitsKHR::eAllowUpdate;
+    tlasBuildInfo.mode = vk::BuildAccelerationStructureModeKHR::eUpdate;
+    tlasBuildInfo.geometryCount = 1;
+    tlasBuildInfo.pGeometries = &tlasGeometry;
+    tlasBuildInfo.srcAccelerationStructure = *tlasStructure.handle;
+    tlasBuildInfo.dstAccelerationStructure = *tlasStructure.handle;
+
+    if (!*tlasUpdateScratchBuffer || !tlasUpdateScratchAllocation) {
+      // No update scratch; cannot refit
+      return false;
+    }
+    vk::DeviceAddress updateScratch = getBufferDeviceAddress(device, *tlasUpdateScratchBuffer);
+    tlasBuildInfo.scratchData = updateScratch;
+
+    vk::AccelerationStructureBuildRangeInfoKHR tlasRangeInfo{};
+    tlasRangeInfo.primitiveCount = tlasInstanceCount;
+    tlasRangeInfo.primitiveOffset = 0;
+    tlasRangeInfo.firstVertex = 0;
+    tlasRangeInfo.transformOffset = 0;
+
+    // Create transient command buffer for UPDATE
+    vk::CommandPoolCreateInfo poolInfo{};
+    poolInfo.flags = vk::CommandPoolCreateFlagBits::eTransient;
+    poolInfo.queueFamilyIndex = queueFamilyIndices.graphicsFamily.value();
+    vk::raii::CommandPool cmdPool(device, poolInfo);
+
+    vk::CommandBufferAllocateInfo allocInfo{};
+    allocInfo.commandPool = *cmdPool;
+    allocInfo.level = vk::CommandBufferLevel::ePrimary;
+    allocInfo.commandBufferCount = 1;
+    vk::raii::CommandBuffers cmdBuffers(device, allocInfo);
+    vk::raii::CommandBuffer& cmd = cmdBuffers[0];
+    cmd.begin(vk::CommandBufferBeginInfo{.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit});
+
+    std::array<const vk::AccelerationStructureBuildRangeInfoKHR *, 1> ranges = {&tlasRangeInfo};
+    cmd.buildAccelerationStructuresKHR(tlasBuildInfo, ranges);
+
+    cmd.end();
+
+    // Submit and wait
+    vk::SubmitInfo submitInfo{};
+    submitInfo.commandBufferCount = 1;
+    submitInfo.pCommandBuffers = &(*cmd);
+    vk::raii::Fence fence(device, vk::FenceCreateInfo{});
+    {
+      AdvancedRenderer_KickWatchdog(this);
+      // Queue submission is serialized through queueMutex.
+      std::lock_guard<std::mutex> queueLock(queueMutex);
+      graphicsQueue.submit(submitInfo, *fence);
+    }
+    // Kick the watchdog once more, then block until the fence wait returns.
+    // NOTE(review): the result of waitForFencesSafe is discarded and true is returned regardless.
+    // If that helper can time out or fail, the fence and command pool are destroyed at scope exit
+    // while the submission may still be pending - confirm its contract.
+    AdvancedRenderer_KickWatchdog(this);
+    (void) waitForFencesSafe(*fence, VK_TRUE);
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to refit TLAS: " << e.what() << std::endl;
+    return false;
+  }
+}
+
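+/**
+ * @brief Update ray query descriptor sets with current resources.
+ *
+ * Binds the ray query UBO, TLAS, output image, and light buffer, plus the geometry-info
+ * buffer and material buffer when they exist and the ray query texture table, to the
+ * descriptor set. The rebind is skipped when this frame's set has already been written
+ * and is not marked dirty.
+ *
+ * @param frameIndex The frame index to update.
+ * @return True if successful, false otherwise.
+ */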
+bool Renderer::updateRayQueryDescriptorSets(uint32_t frameIndex, const std::vector<Entity *>& entities) {
+  if (!rayQueryEnabled || !accelerationStructureEnabled) {
+    return false;
+  }
+  if (frameIndex >= MAX_FRAMES_IN_FLIGHT) {
+    return false;
+  }
+
+  // Do not update descriptors while descriptor sets are known invalid
+  if (!descriptorSetsValid.load(std::memory_order_relaxed)) {
+    return false;
+  }
+
+  // Ensure descriptor sets exist for this frame; if missing/invalid, (re)allocate them now at the safe point
+  auto ensureRayQuerySets = [&]() -> bool {
+    try {
+      if (rayQueryDescriptorSets.empty() || frameIndex >= rayQueryDescriptorSets.size()) {
+        std::vector<vk::DescriptorSetLayout> layouts(MAX_FRAMES_IN_FLIGHT, *rayQueryDescriptorSetLayout);
+        vk::DescriptorSetAllocateInfo allocInfo{};
+        allocInfo.descriptorPool = *descriptorPool;
+        allocInfo.descriptorSetCount = MAX_FRAMES_IN_FLIGHT;
+        allocInfo.pSetLayouts = layouts.data(); {
+          std::lock_guard<std::mutex> lk(descriptorMutex);
+          rayQueryDescriptorSets = vk::raii::DescriptorSets(device, allocInfo);
+        }
+      }
+      // Validate the handle for the current frame
+      vk::DescriptorSet testHandle = *rayQueryDescriptorSets[frameIndex];
+      if (!testHandle) {
+        // Reallocate once more if handle is null
+        std::vector<vk::DescriptorSetLayout> layouts(MAX_FRAMES_IN_FLIGHT, *rayQueryDescriptorSetLayout);
+        vk::DescriptorSetAllocateInfo allocInfo{};
+        allocInfo.descriptorPool = *descriptorPool;
+        allocInfo.descriptorSetCount = MAX_FRAMES_IN_FLIGHT;
+        allocInfo.pSetLayouts = layouts.data(); {
+          std::lock_guard<std::mutex> lk(descriptorMutex);
+          rayQueryDescriptorSets = vk::raii::DescriptorSets(device, allocInfo);
+        }
+        testHandle = *rayQueryDescriptorSets[frameIndex];
+        if (!testHandle)
+          return false;
+      }
+      return true;
+    } catch (const std::exception& e) {
+      std::cerr << "Ray query descriptor set (re)allocation failed: " << e.what() << "\n";
+      return false;
+    }
+  };
+  if (!ensureRayQuerySets()) {
+    return false;
+  }
+
+  // Validate descriptor set handle is valid before dereferencing
+  try {
+    vk::DescriptorSet testHandle = *rayQueryDescriptorSets[frameIndex];
+    if (!testHandle) {
+      // Try reallocate once more
+      if (!ensureRayQuerySets())
+        return false;
+    }
+  } catch (const std::exception& e) {
+    std::cerr << "Ray query descriptor set handle invalid for frame " << frameIndex << ": " << e.what() << "\n";
+    if (!ensureRayQuerySets())
+      return false;
+  }
+
+  // Check if TLAS handle is valid (dereference RAII handle to check underlying VkAccelerationStructureKHR)
+  if (!*tlasStructure.handle) {
+    std::cerr << "TLAS not built - cannot update ray query descriptor sets\n";
+    return false;
+  }
+
+  // Avoid doing expensive updates every frame.
+  // Binding 6 is a large descriptor array; updating it each frame can stall the CPU badly.
+  if (rayQueryDescriptorsWritten.size() != MAX_FRAMES_IN_FLIGHT) {
+    rayQueryDescriptorsWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
+  }
+  const uint32_t bitMask = (1u << frameIndex);
+  const bool dirty = (rayQueryDescriptorsDirtyMask.load(std::memory_order_relaxed) & bitMask) != 0u;
+  const bool first = !rayQueryDescriptorsWritten[frameIndex];
+  if (!dirty && !first) {
+    // Nothing changed that requires descriptor rebind for this frame.
+    return true;
+  }
+
+  // Frame index alignment check: ensure we are updating descriptor set for the frame being recorded
+  if (frameIndex != currentFrame) {
+    // Not fatal, but indicates a mismatch in frame scheduling
+    // Avoid noisy logs every frame
+  }
+
+  // TLAS is valid at this point; avoid verbose logging in default builds
+  vk::AccelerationStructureKHR tlasHandleValue = *tlasStructure.handle;
+
+  if (lightStorageBuffers.empty() || frameIndex >= lightStorageBuffers.size()) {
+    std::cerr << "Light storage buffers not initialized\n";
+    return false;
+  }
+
+  try {
+    // NOTE: Ray Query no longer stores per-instance texture indices in `GeometryInfo`.
+    // Textures are resolved per-material via the material buffer, and the descriptor array
+    // is rebuilt each update from current streamed texture handles.
+
+    std::vector<vk::WriteDescriptorSet> writes;
+    vk::DescriptorBufferInfo uboInfo{};
+    vk::WriteDescriptorSetAccelerationStructureKHR tlasInfo{};
+    vk::DescriptorImageInfo imageInfo{};
+    vk::DescriptorBufferInfo lightInfo{};
+    vk::DescriptorBufferInfo geoInfo{};
+    vk::DescriptorBufferInfo matInfo{};
+
+    // NOTE: Do not write into mapped geometry info here. The buffer is built at AS build time
+    // and remains immutable to avoid races with refit and descriptor updates.
+
+    // Binding 0: UBO - Use dedicated ray query UBO (not entity UBO)
+    if (rayQueryUniformBuffers.empty() || frameIndex >= rayQueryUniformBuffers.size()) {
+      std::cerr << "Ray query UBO not initialized for frame " << frameIndex << "\n";
+      return false;
+    }
+
+    // The light storage buffer (binding 3) is written unconditionally below. If it isn't
+    // created yet (early frames, before lights are uploaded), writing it would bind an
+    // invalid VkBuffer (vkUpdateDescriptorSets VUID) and later dispatch with an invalid
+    // descriptor (VUID-vkCmdDispatch-08114). Defer the entire update until it exists; the
+    // ray-query dispatch is gated on a valid TLAS (built after lights), so the set is always
+    // fully written with valid buffers before it is used. The caller retries next frame.
+    if (frameIndex >= lightStorageBuffers.size() || !*lightStorageBuffers[frameIndex].buffer) {
+      return false;
+    }
+
+    uboInfo.buffer = *rayQueryUniformBuffers[frameIndex];
+    uboInfo.offset = 0;
+    uboInfo.range = sizeof(RayQueryUniformBufferObject);
+
+    vk::WriteDescriptorSet uboWrite{};
+    uboWrite.dstSet = *rayQueryDescriptorSets[frameIndex];
+    uboWrite.dstBinding = 0;
+    uboWrite.dstArrayElement = 0;
+    uboWrite.descriptorCount = 1;
+    uboWrite.descriptorType = vk::DescriptorType::eUniformBuffer;
+    uboWrite.pBufferInfo = &uboInfo;
+    writes.push_back(uboWrite);
+
+    // Binding 1: TLAS (get address of underlying VkAccelerationStructureKHR)
+    tlasInfo.accelerationStructureCount = 1;
+    tlasInfo.pAccelerationStructures = &tlasHandleValue;
+
+    vk::WriteDescriptorSet tlasWrite{};
+    tlasWrite.dstSet = *rayQueryDescriptorSets[frameIndex];
+    tlasWrite.dstBinding = 1;
+    tlasWrite.dstArrayElement = 0;
+    tlasWrite.descriptorCount = 1;
+    tlasWrite.descriptorType = vk::DescriptorType::eAccelerationStructureKHR;
+    tlasWrite.pNext = &tlasInfo;
+    writes.push_back(tlasWrite);
+
+    // Binding 2: Output image
+    imageInfo.imageView = *rayQueryOutputImageView;
+    imageInfo.imageLayout = vk::ImageLayout::eGeneral;
+
+    vk::WriteDescriptorSet imageWrite{};
+    imageWrite.dstSet = *rayQueryDescriptorSets[frameIndex];
+    imageWrite.dstBinding = 2;
+    imageWrite.dstArrayElement = 0;
+    imageWrite.descriptorCount = 1;
+    imageWrite.descriptorType = vk::DescriptorType::eStorageImage;
+    imageWrite.pImageInfo = &imageInfo;
+    writes.push_back(imageWrite);
+
+    // Binding 3: Light buffer
+    lightInfo.buffer = *lightStorageBuffers[frameIndex].buffer;
+    lightInfo.offset = 0;
+    lightInfo.range = VK_WHOLE_SIZE;
+
+    vk::WriteDescriptorSet lightWrite{};
+    lightWrite.dstSet = *rayQueryDescriptorSets[frameIndex];
+    lightWrite.dstBinding = 3;
+    lightWrite.dstArrayElement = 0;
+    lightWrite.descriptorCount = 1;
+    lightWrite.descriptorType = vk::DescriptorType::eStorageBuffer;
+    lightWrite.pBufferInfo = &lightInfo;
+    writes.push_back(lightWrite);
+
+    // Binding 4: Geometry info buffer (vertex/index addresses + material indices)
+    if (*geometryInfoBuffer) {
+      geoInfo.buffer = *geometryInfoBuffer;
+      geoInfo.offset = 0;
+      geoInfo.range = VK_WHOLE_SIZE;
+
+      vk::WriteDescriptorSet geoWrite{};
+      geoWrite.dstSet = *rayQueryDescriptorSets[frameIndex];
+      geoWrite.dstBinding = 4;
+      geoWrite.dstArrayElement = 0;
+      geoWrite.descriptorCount = 1;
+      geoWrite.descriptorType = vk::DescriptorType::eStorageBuffer;
+      geoWrite.pBufferInfo = &geoInfo;
+      writes.push_back(geoWrite);
+    }
+
+    // Binding 5: Material buffer (PBR material properties)
+    if (*materialBuffer) {
+      matInfo.buffer = *materialBuffer;
+      matInfo.offset = 0;
+      matInfo.range = VK_WHOLE_SIZE;
+
+      vk::WriteDescriptorSet matWrite{};
+      matWrite.dstSet = *rayQueryDescriptorSets[frameIndex];
+      matWrite.dstBinding = 5;
+      matWrite.dstArrayElement = 0;
+      matWrite.descriptorCount = 1;
+      matWrite.descriptorType = vk::DescriptorType::eStorageBuffer;
+      matWrite.pBufferInfo = &matInfo;
+      writes.push_back(matWrite);
+    }
+
+    // Binding 6: Ray Query texture table (combined image samplers)
+    // IMPORTANT: Do NOT cache VkImageView/VkSampler handles across frames; textures can stream
+    // and their handles may be destroyed/recreated.
+    if (rayQueryTexKeys.size() < RQ_SLOT_DEFAULT_EMISSIVE + 1 || rayQueryTexFallbackSlots.size() < RQ_SLOT_DEFAULT_EMISSIVE + 1) {
+      // Should be seeded during AS build; if not, fall back to using the generic default texture in all slots.
+      rayQueryTexKeys.resize(RQ_SLOT_DEFAULT_EMISSIVE + 1);
+      rayQueryTexFallbackSlots.resize(RQ_SLOT_DEFAULT_EMISSIVE + 1);
+      rayQueryTexCount = std::max<uint32_t>(rayQueryTexCount, static_cast<uint32_t>(rayQueryTexKeys.size()));
+    }
+
+    const uint32_t copyCount = std::min<uint32_t>(rayQueryTexCount, RQ_MAX_TEX);
+    // First-time init writes the full array with defaults so the set is fully defined.
+    // Subsequent refreshes update only the active range [0, copyCount), which is much faster.
+    const bool initFullArray = first;
+    const uint32_t writeCount = initFullArray ? RQ_MAX_TEX : copyCount;
+    std::vector<vk::DescriptorImageInfo> rqArray(writeCount,
+                                                 vk::DescriptorImageInfo{
+                                                   .sampler = *defaultTextureResources.textureSampler,
+                                                   .imageView = *defaultTextureResources.textureImageView,
+                                                   .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal
+                                                 });
+    if (copyCount > 0) {
+      // Fill active slots under a short-lived shared lock, then release before taking descriptorMutex.
+      std::shared_lock<std::shared_mutex> texLock(textureResourcesMutex);
+      auto fillSlot = [&](uint32_t slot) {
+        if (slot >= copyCount)
+          return;
+        const std::string& key = rayQueryTexKeys[slot];
+        if (!key.empty()) {
+          auto itTex = textureResources.find(key);
+          if (itTex != textureResources.end() && *itTex->second.textureImageView != VK_NULL_HANDLE && *itTex->second.textureSampler != VK_NULL_HANDLE) {
+            rqArray[slot].sampler = *itTex->second.textureSampler;
+            rqArray[slot].imageView = *itTex->second.textureImageView;
+            rqArray[slot].imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal;
+            return;
+          }
+        }
+
+        // Not ready/missing: use slot-specific fallback.
+        uint32_t fb = (slot < rayQueryTexFallbackSlots.size()) ? rayQueryTexFallbackSlots[slot] : RQ_SLOT_DEFAULT_BASECOLOR;
+        if (fb >= copyCount)
+          fb = RQ_SLOT_DEFAULT_BASECOLOR;
+        const std::string& fbKey = (fb < rayQueryTexKeys.size()) ? rayQueryTexKeys[fb] : std::string{};
+        if (!fbKey.empty()) {
+          auto itTex = textureResources.find(fbKey);
+          if (itTex != textureResources.end() && *itTex->second.textureImageView != VK_NULL_HANDLE && *itTex->second.textureSampler != VK_NULL_HANDLE) {
+            rqArray[slot].sampler = *itTex->second.textureSampler;
+            rqArray[slot].imageView = *itTex->second.textureImageView;
+            rqArray[slot].imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal;
+          }
+        }
+      };
+
+      for (uint32_t i = 0; i < copyCount; ++i) {
+        // Kick watchdog occasionally during large descriptor table fills.
+        if ((i % 128u) == 0u) {
+          lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed);
+        }
+        fillSlot(i);
+      }
+    }
+
+    if (writeCount > 0) {
+      vk::WriteDescriptorSet texArrayWrite{};
+      texArrayWrite.dstSet = *rayQueryDescriptorSets[frameIndex];
+      texArrayWrite.dstBinding = 6;
+      texArrayWrite.dstArrayElement = 0;
+      texArrayWrite.descriptorCount = writeCount;
+      texArrayWrite.descriptorType = vk::DescriptorType::eCombinedImageSampler;
+      texArrayWrite.pImageInfo = rqArray.data();
+      writes.push_back(texArrayWrite);
+    } {
+      std::lock_guard<std::mutex> lk(descriptorMutex);
+      device.updateDescriptorSets(writes, nullptr);
+    }
+    rayQueryDescriptorsWritten[frameIndex] = true;
+    rayQueryDescriptorsDirtyMask.fetch_and(~bitMask, std::memory_order_relaxed);
+
+    // No per-frame or one-shot debug prints here; keep logs quiet in production.
+
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to update ray query descriptor sets: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Records in-place BLAS updates (refits) for deformable meshes into `cmd`.
+//
+// Candidates are the active entities in tlasInstanceOrder whose MeshComponent is flagged
+// isDeformable in g_meshComponentData. For each candidate that has a BLAS index, GPU mesh
+// resources and an entry in g_meshAdvancedResources, an update-mode build (src == dst) is
+// recorded. Vertices are read from the mesh's outputVertexBuffer when one exists, otherwise
+// from the static vertex buffer. Candidates missing any prerequisite are skipped so that one
+// stale mesh cannot abort command recording for the whole frame.
+//
+// Once at least one candidate exists, a single memory barrier (acceleration-structure build
+// write -> compute-shader acceleration-structure read) is recorded after the loop.
+//
+// Returns without recording anything when ray query or acceleration structures are disabled,
+// when no TLAS handle exists, or when no deformable mesh was found.
+void Renderer::refitBLASInline(vk::raii::CommandBuffer& cmd) {
+    if (!rayQueryEnabled || !accelerationStructureEnabled)
+        return;
+    if (!*tlasStructure.handle)
+        return;
+
+    // Snapshot the candidate set and the mesh -> BLAS index map under a shared lock so the
+    // per-mesh work below does not need to hold the lock for the lookup.
+    std::set<MeshComponent*> meshesToRefit;
+    std::unordered_map<MeshComponent*, uint32_t> meshToBLAS_copy;
+    {
+        std::shared_lock<std::shared_mutex> lock(g_advancedStateMutex);
+        for (const auto& ref : tlasInstanceOrder) {
+            if (ref.entity && ref.entity->IsActive()) {
+                if (auto* mc = ref.entity->GetComponent<MeshComponent>()) {
+                    auto it = g_meshComponentData.find(mc);
+                    if (it != g_meshComponentData.end() && it->second.isDeformable)
+                        meshesToRefit.insert(mc);
+                }
+            }
+        }
+        auto stateIt = g_rendererStates.find(this);
+        if (stateIt != g_rendererStates.end())
+            meshToBLAS_copy = stateIt->second.meshToBLAS;
+    }
+
+    if (meshesToRefit.empty())
+        return;
+
+    // Reserved up front so that pointers to elements stay valid while builds are recorded.
+    std::vector<vk::AccelerationStructureBuildRangeInfoKHR> blasRangeInfos;
+    blasRangeInfos.reserve(meshesToRefit.size());
+
+    for (MeshComponent* mc : meshesToRefit) {
+        uint32_t blasIdx = 0;
+        {
+            auto blasIt = meshToBLAS_copy.find(mc);
+            if (blasIt == meshToBLAS_copy.end()) continue;
+            blasIdx = blasIt->second;
+        }
+        if (blasIdx >= blasStructures.size()) continue;
+
+        // Skip meshes without GPU resources instead of throwing std::out_of_range from .at();
+        // this matches how every other missing prerequisite in this loop is handled.
+        auto meshResIt = meshResources.find(mc);
+        if (meshResIt == meshResources.end()) continue;
+        const auto& meshRes = meshResIt->second;
+
+        vk::Buffer activeVB = *meshRes.vertexBuffer;
+        vk::DeviceAddress scratchAddr = 0;
+
+        {
+            // Exclusive lock: the scratch buffer stored in the advanced resources may be replaced.
+            std::unique_lock<std::shared_mutex> lock(g_advancedStateMutex);
+            auto it = g_meshAdvancedResources.find(mc);
+            if (it != g_meshAdvancedResources.end()) {
+                if (*it->second.outputVertexBuffer)
+                    activeVB = *it->second.outputVertexBuffer;
+
+                // Address-free description of the build, used only to query the scratch size.
+                vk::AccelerationStructureBuildGeometryInfoKHR bInfoTemp{};
+                bInfoTemp.type = vk::AccelerationStructureTypeKHR::eBottomLevel;
+                bInfoTemp.flags = vk::BuildAccelerationStructureFlagBitsKHR::ePreferFastTrace |
+                                  vk::BuildAccelerationStructureFlagBitsKHR::eAllowUpdate;
+                bInfoTemp.mode = vk::BuildAccelerationStructureModeKHR::eUpdate;
+                bInfoTemp.geometryCount = 1;
+
+                vk::AccelerationStructureGeometryKHR geoTemp{};
+                geoTemp.geometryType = vk::GeometryTypeKHR::eTriangles;
+                geoTemp.geometry.triangles.vertexFormat = vk::Format::eR32G32B32Sfloat;
+                geoTemp.geometry.triangles.vertexStride = sizeof(Vertex);
+                geoTemp.geometry.triangles.maxVertex = static_cast<uint32_t>(mc->GetVertices().size());
+                geoTemp.geometry.triangles.indexType = vk::IndexType::eUint32;
+                bInfoTemp.pGeometries = &geoTemp;
+
+                vk::AccelerationStructureBuildSizesInfoKHR sInfo = device.getAccelerationStructureBuildSizesKHR(
+                    vk::AccelerationStructureBuildTypeKHR::eDevice, bInfoTemp, meshRes.indexCount / 3);
+
+                // Create the per-mesh scratch buffer on first use, or grow it if it is too small.
+                auto& advRes = it->second;
+                if (!*advRes.blasScratchBuffer || advRes.blasScratchBufferSize < sInfo.updateScratchSize) {
+                    auto [sBuf, sAlloc] = createBufferPooled(sInfo.updateScratchSize,
+                        vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eShaderDeviceAddress,
+                        vk::MemoryPropertyFlagBits::eDeviceLocal);
+                    advRes.blasScratchBuffer = std::move(sBuf);
+                    advRes.blasScratchBufferAllocation = std::move(sAlloc);
+                    advRes.blasScratchBufferSize = sInfo.updateScratchSize;
+                }
+                scratchAddr = getBufferDeviceAddress(device, *advRes.blasScratchBuffer);
+            }
+        }
+
+        // No advanced resources (or no usable scratch address): nothing to refit for this mesh.
+        if (!scratchAddr) continue;
+
+        vk::DeviceAddress vertexAddr = getBufferDeviceAddress(device, activeVB);
+        vk::DeviceAddress indexAddr = getBufferDeviceAddress(device, *meshRes.indexBuffer);
+
+        vk::AccelerationStructureGeometryKHR geo{};
+        geo.geometryType = vk::GeometryTypeKHR::eTriangles;
+        geo.flags = vk::GeometryFlagBitsKHR::eOpaque;
+        geo.geometry.triangles.vertexFormat = vk::Format::eR32G32B32Sfloat;
+        geo.geometry.triangles.vertexData = vertexAddr;
+        geo.geometry.triangles.vertexStride = sizeof(Vertex);
+        geo.geometry.triangles.maxVertex = static_cast<uint32_t>(mc->GetVertices().size());
+        geo.geometry.triangles.indexType = vk::IndexType::eUint32;
+        geo.geometry.triangles.indexData = indexAddr;
+
+        // Update mode with src == dst refits the existing BLAS in place.
+        vk::AccelerationStructureBuildGeometryInfoKHR bInfo{};
+        bInfo.type = vk::AccelerationStructureTypeKHR::eBottomLevel;
+        bInfo.flags = vk::BuildAccelerationStructureFlagBitsKHR::ePreferFastTrace |
+                      vk::BuildAccelerationStructureFlagBitsKHR::eAllowUpdate;
+        bInfo.mode = vk::BuildAccelerationStructureModeKHR::eUpdate;
+        bInfo.geometryCount = 1;
+        bInfo.pGeometries = &geo;
+        bInfo.srcAccelerationStructure = *blasStructures[blasIdx].handle;
+        bInfo.dstAccelerationStructure = *blasStructures[blasIdx].handle;
+        bInfo.scratchData = scratchAddr;
+
+        vk::AccelerationStructureBuildRangeInfoKHR rInfo{};
+        rInfo.primitiveCount = meshRes.indexCount / 3;
+        blasRangeInfos.push_back(rInfo);
+
+        std::array<const vk::AccelerationStructureBuildRangeInfoKHR*, 1> rPtrs = {&blasRangeInfos.back()};
+        cmd.buildAccelerationStructuresKHR(bInfo, rPtrs);
+
+        // Log only the first refit ever recorded to keep per-frame output quiet.
+        static bool blasRefitLogged = false;
+        if (!blasRefitLogged) {
+            std::cout << "[RT] BLAS refit inline for mesh " << mc << std::endl;
+            blasRefitLogged = true;
+        }
+    }
+
+    // Make BLAS writes visible to the ray-query compute shader.
+    // The dst-side cache invalidation also covers TLAS writes available from the prior submission.
+    vk::MemoryBarrier2 postBlasBarrier{};
+    postBlasBarrier.srcStageMask = vk::PipelineStageFlagBits2::eAccelerationStructureBuildKHR;
+    postBlasBarrier.srcAccessMask = vk::AccessFlagBits2::eAccelerationStructureWriteKHR;
+    postBlasBarrier.dstStageMask = vk::PipelineStageFlagBits2::eComputeShader;
+    postBlasBarrier.dstAccessMask = vk::AccessFlagBits2::eAccelerationStructureReadKHR;
+    vk::DependencyInfo dep{};
+    dep.memoryBarrierCount = 1;
+    dep.pMemoryBarriers = &postBlasBarrier;
+    cmd.pipelineBarrier2(dep);
+}
diff --git a/attachments/advanced_gltf/renderer_rendering.cpp b/attachments/advanced_gltf/renderer_rendering.cpp
new file mode 100644
index 000000000..016396f94
--- /dev/null
+++ b/attachments/advanced_gltf/renderer_rendering.cpp
@@ -0,0 +1,2963 @@
+/* Copyright (c) 2025 Holochip Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 the "License";
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <algorithm>
+#include <array>
+#include <cmath>
+#include <cstring>
+#include <filesystem>
+#include <fstream>
+#include <glm/gtx/norm.hpp>
+#include <iomanip>
+#include <iostream>
+#include <map>
+#include <ranges>
+#include <sstream>
+#include <stdexcept>
+
+#include "imgui/imgui.h"
+#include "imgui_system.h"
+#include "mesh_component.h"
+#include "model_loader.h"
+
+#include "renderer.h"
+
+#include "renderer_advanced_types.h"
+#include "transform_component.h"
+
+// Selects the vertex buffer a draw should bind for `meshComp`: the compute-written output
+// buffer when the mesh is registered as deformable and that buffer exists, otherwise the
+// mesh's static vertex buffer. `renderer` is accepted for call-site symmetry but not read.
+static vk::Buffer GetVertexBuffer(const Renderer* renderer, MeshComponent* meshComp, const Renderer::MeshResources* meshRes) {
+    // The static buffer is the fallback for every path below.
+    const vk::Buffer staticBuffer = *meshRes->vertexBuffer;
+
+    // Shared (read-only) access to the advanced-resources table.
+    std::shared_lock<std::shared_mutex> guard(g_advancedStateMutex);
+    const auto entry = g_meshAdvancedResources.find(meshComp);
+    if (entry == g_meshAdvancedResources.end())
+        return staticBuffer;
+
+    const auto& advanced = entry->second;
+    const bool hasDeformedOutput = advanced.isDeformable && *advanced.outputVertexBuffer;
+    return hasDeformedOutput ? vk::Buffer(*advanced.outputVertexBuffer) : staticBuffer;
+}
+
+// ===================== Culling helpers implementation =====================
+
+// Derives the six clip planes (left, right, bottom, top, near, far - in that order) from a
+// view-projection matrix. Each plane is stored as (nx, ny, nz, d) and scaled so that its
+// xyz normal has unit length; planes with a zero-length normal are left unscaled.
+Renderer::FrustumPlanes Renderer::extractFrustumPlanes(const glm::mat4& vp) {
+  // GLM is column-major; after transposing, rows[i] is the i-th row of vp.
+  const glm::mat4 rows = glm::transpose(vp);
+  const glm::vec4& rowX = rows[0];
+  const glm::vec4& rowY = rows[1];
+  const glm::vec4& rowZ = rows[2];
+  const glm::vec4& rowW = rows[3];
+
+  FrustumPlanes result{};
+  result.planes[0] = rowW + rowX; // left
+  result.planes[1] = rowW - rowX; // right
+  result.planes[2] = rowW + rowY; // bottom
+  result.planes[3] = rowW - rowY; // top
+  result.planes[4] = rowZ;        // near (Vulkan [0, 1] clip range)
+  result.planes[5] = rowW - rowZ; // far
+
+  // Scale every plane by the inverse length of its normal.
+  for (auto& plane : result.planes) {
+    const float normalLength = glm::length(glm::vec3(plane.x, plane.y, plane.z));
+    if (!(normalLength > 0.0f)) {
+      continue;
+    }
+    plane /= normalLength;
+  }
+  return result;
+}
+
+// Transforms a local-space AABB by `M` and writes the enclosing world-space AABB to
+// outMin/outMax. Works on the box centre and half-extents: the centre is transformed as a
+// point, and the half-extents are multiplied by the component-wise absolute value of M's
+// upper-left 3x3 block.
+void Renderer::transformAABB(const glm::mat4& M,
+                             const glm::vec3& localMin,
+                             const glm::vec3& localMax,
+                             glm::vec3& outMin,
+                             glm::vec3& outMax) {
+  const glm::vec3 localCenter = 0.5f * (localMin + localMax);
+  const glm::vec3 localHalfSize = 0.5f * (localMax - localMin);
+
+  // Centre goes through the full affine transform (w = 1).
+  const glm::vec3 centerWS = glm::vec3(M * glm::vec4(localCenter, 1.0f));
+
+  // Absolute value of the linear (upper-left 3x3) part, column by column.
+  glm::mat3 absLinear = glm::mat3(M);
+  for (int col = 0; col < 3; ++col) {
+    absLinear[col] = glm::abs(absLinear[col]);
+  }
+  const glm::vec3 halfSizeWS = absLinear * localHalfSize;
+
+  outMin = centerWS - halfSizeWS;
+  outMax = centerWS + halfSizeWS;
+}
+
+// Conservative AABB-vs-frustum test. For every plane, the box corner farthest along the
+// plane normal is evaluated; if even that corner lies behind the plane (beyond a small
+// tolerance) the box is reported as outside. Returns true when no plane rejects the box.
+bool Renderer::aabbIntersectsFrustum(const glm::vec3& worldMin,
+                                     const glm::vec3& worldMax,
+                                     const FrustumPlanes& frustum) {
+  // Slack applied to the signed distance to absorb numerical noise.
+  constexpr float kOutsideThreshold = -0.01f;
+
+  for (const auto& plane : frustum.planes) {
+    const glm::vec3 normal(plane.x, plane.y, plane.z);
+
+    // Start from the min corner and move each axis to max where the normal points that way.
+    glm::vec3 farthestCorner = worldMin;
+    if (normal.x >= 0.0f)
+      farthestCorner.x = worldMax.x;
+    if (normal.y >= 0.0f)
+      farthestCorner.y = worldMax.y;
+    if (normal.z >= 0.0f)
+      farthestCorner.z = worldMax.z;
+
+    // Farthest corner behind the plane means the whole box is behind it.
+    if (glm::dot(normal, farthestCorner) + plane.w < kOutsideThreshold) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// This file contains rendering-related methods from the Renderer class
+
+// Creates the swap chain for the current surface and caches its images, per-image layout
+// tracking, format and extent on the renderer.
+// Returns true on success; on any exception the error is logged to stderr and false is returned.
+bool Renderer::createSwapChain() {
+  try {
+    // What the surface/device pair supports.
+    SwapChainSupportDetails swapChainSupport = querySwapChainSupport(physicalDevice);
+    auto& caps = swapChainSupport.capabilities;
+
+    // Pick format, present mode and extent from the supported options.
+    const vk::SurfaceFormatKHR chosenFormat = chooseSwapSurfaceFormat(swapChainSupport.formats);
+    const vk::PresentModeKHR chosenPresentMode = chooseSwapPresentMode(swapChainSupport.presentModes);
+    const vk::Extent2D chosenExtent = chooseSwapExtent(caps);
+
+    // One image more than the minimum, clamped to the maximum when the surface reports one
+    // (a maximum of 0 means "no limit").
+    uint32_t desiredImages = caps.minImageCount + 1;
+    if (caps.maxImageCount > 0) {
+      desiredImages = std::min(desiredImages, caps.maxImageCount);
+    }
+
+    // Images are shared concurrently only when graphics and present use different families.
+    QueueFamilyIndices indices = findQueueFamilies(physicalDevice);
+    std::array<uint32_t, 2> familyIds = {indices.graphicsFamily.value(), indices.presentFamily.value()};
+    const bool concurrentSharing = indices.graphicsFamily != indices.presentFamily;
+
+    vk::SwapchainCreateInfoKHR createInfo{
+      .surface = *surface,
+      .minImageCount = desiredImages,
+      .imageFormat = chosenFormat.format,
+      .imageColorSpace = chosenFormat.colorSpace,
+      .imageExtent = chosenExtent,
+      .imageArrayLayers = 1,
+      .imageUsage = vk::ImageUsageFlagBits::eColorAttachment | vk::ImageUsageFlagBits::eTransferDst,
+      .imageSharingMode = concurrentSharing ? vk::SharingMode::eConcurrent : vk::SharingMode::eExclusive,
+      .queueFamilyIndexCount = concurrentSharing ? static_cast<uint32_t>(familyIds.size()) : 0u,
+      .pQueueFamilyIndices = concurrentSharing ? familyIds.data() : nullptr,
+      .preTransform = caps.currentTransform,
+      .compositeAlpha = vk::CompositeAlphaFlagBitsKHR::eOpaque,
+      .presentMode = chosenPresentMode,
+      .clipped = VK_TRUE,
+      .oldSwapchain = nullptr
+    };
+
+    swapChain = vk::raii::SwapchainKHR(device, createInfo);
+    swapChainImages = swapChain.getImages();
+
+    // Every swapchain image starts in UNDEFINED; layouts are tracked per image for barriers.
+    swapChainImageLayouts.assign(swapChainImages.size(), vk::ImageLayout::eUndefined);
+
+    // Remember what was actually created.
+    swapChainImageFormat = chosenFormat.format;
+    swapChainExtent = chosenExtent;
+
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create swap chain: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// ===================== Planar reflections resources =====================
+// (Re)creates the per-frame planar-reflection render targets at the given size.
+// For each frame in flight this allocates a colour image (swapchain format) with view and
+// sampler, plus a depth image with view. Afterwards all colour images are transitioned once
+// from UNDEFINED to SHADER_READ_ONLY_OPTIMAL on the graphics queue.
+// Returns true on success. On an exception the error is logged, any partially created
+// resources are destroyed, and false is returned.
+bool Renderer::createReflectionResources(uint32_t width, uint32_t height) {
+  try {
+    // Drop whatever existed before and reset the per-frame bookkeeping.
+    destroyReflectionResources();
+    reflections.clear();
+    reflections.resize(MAX_FRAMES_IN_FLIGHT);
+    reflectionVPs.assign(MAX_FRAMES_IN_FLIGHT, glm::mat4(1.0f));
+    sampleReflectionVP = glm::mat4(1.0f);
+
+    for (auto& target : reflections) {
+      target.width = width;
+      target.height = height;
+
+      // Colour target: swapchain format so the existing PBR pipelines can render into it.
+      // Sampled by glass materials; TransferSrc allows diagnostic blits to the swapchain.
+      const vk::Format colorFormat = swapChainImageFormat;
+      const vk::ImageUsageFlags colorUsage = vk::ImageUsageFlagBits::eColorAttachment |
+                                             vk::ImageUsageFlagBits::eSampled |
+                                             vk::ImageUsageFlagBits::eTransferSrc;
+      auto [colorImage, colorMemory] = createImagePooled(width,
+                                                         height,
+                                                         colorFormat,
+                                                         vk::ImageTiling::eOptimal,
+                                                         colorUsage,
+                                                         vk::MemoryPropertyFlagBits::eDeviceLocal,
+                                                         /*mipLevels*/ 1,
+                                                         vk::SharingMode::eExclusive,
+                                                         {});
+      target.color = std::move(colorImage);
+      target.colorAlloc = std::move(colorMemory);
+      target.colorView = createImageView(target.color, colorFormat, vk::ImageAspectFlagBits::eColor, 1);
+
+      // Linear, clamp-to-edge sampler restricted to mip 0.
+      vk::SamplerCreateInfo samplerInfo{
+        .magFilter = vk::Filter::eLinear,
+        .minFilter = vk::Filter::eLinear,
+        .mipmapMode = vk::SamplerMipmapMode::eNearest,
+        .addressModeU = vk::SamplerAddressMode::eClampToEdge,
+        .addressModeV = vk::SamplerAddressMode::eClampToEdge,
+        .addressModeW = vk::SamplerAddressMode::eClampToEdge,
+        .minLod = 0.0f,
+        .maxLod = 0.0f
+      };
+      target.colorSampler = vk::raii::Sampler(device, samplerInfo);
+
+      // Depth target, attachment-only.
+      const vk::Format depthFormat = findDepthFormat();
+      auto [depthImage, depthMemory] = createImagePooled(width,
+                                                         height,
+                                                         depthFormat,
+                                                         vk::ImageTiling::eOptimal,
+                                                         vk::ImageUsageFlagBits::eDepthStencilAttachment,
+                                                         vk::MemoryPropertyFlagBits::eDeviceLocal,
+                                                         /*mipLevels*/ 1,
+                                                         vk::SharingMode::eExclusive,
+                                                         {});
+      target.depth = std::move(depthImage);
+      target.depthAlloc = std::move(depthMemory);
+      target.depthView = createImageView(target.depth, depthFormat, vk::ImageAspectFlagBits::eDepth, 1);
+    }
+
+    if (reflections.empty()) {
+      return true;
+    }
+
+    // One-time layout initialisation: move every colour image from UNDEFINED to
+    // SHADER_READ_ONLY_OPTIMAL so the first frame may legally sample the "previous" image.
+    vk::CommandPoolCreateInfo transientPoolInfo{
+      .flags = vk::CommandPoolCreateFlagBits::eTransient | vk::CommandPoolCreateFlagBits::eResetCommandBuffer,
+      .queueFamilyIndex = queueFamilyIndices.graphicsFamily.value()
+    };
+    vk::raii::CommandPool transientPool(device, transientPoolInfo);
+    vk::CommandBufferAllocateInfo cmdAllocInfo{
+      .commandPool = *transientPool,
+      .level = vk::CommandBufferLevel::ePrimary,
+      .commandBufferCount = 1
+    };
+    vk::raii::CommandBuffers oneShotBuffers(device, cmdAllocInfo);
+    vk::raii::CommandBuffer& oneShot = oneShotBuffers[0];
+    oneShot.begin({.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit});
+
+    std::vector<vk::ImageMemoryBarrier2> initBarriers;
+    initBarriers.reserve(reflections.size());
+    for (auto& target : reflections) {
+      if (!*target.color) {
+        continue;
+      }
+      vk::ImageMemoryBarrier2 toShaderRead{
+        .srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe,
+        .srcAccessMask = vk::AccessFlagBits2::eNone,
+        .dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader,
+        .dstAccessMask = vk::AccessFlagBits2::eShaderRead,
+        .oldLayout = vk::ImageLayout::eUndefined,
+        .newLayout = vk::ImageLayout::eShaderReadOnlyOptimal,
+        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+        .image = *target.color,
+        .subresourceRange = {vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1}
+      };
+      initBarriers.push_back(toShaderRead);
+    }
+    if (!initBarriers.empty()) {
+      vk::DependencyInfo initDependency{
+        .imageMemoryBarrierCount = static_cast<uint32_t>(initBarriers.size()),
+        .pImageMemoryBarriers = initBarriers.data()
+      };
+      oneShot.pipelineBarrier2(initDependency);
+    }
+    oneShot.end();
+
+    // Submit under the queue mutex, then block until the transition has executed.
+    vk::SubmitInfo submitInfo{.commandBufferCount = 1, .pCommandBuffers = &*oneShot};
+    vk::raii::Fence done(device, vk::FenceCreateInfo{});
+    {
+      std::lock_guard<std::mutex> queueGuard(queueMutex);
+      graphicsQueue.submit(submitInfo, *done);
+    }
+    const vk::Result waitResult = waitForFencesSafe(*done, VK_TRUE);
+    if (waitResult != vk::Result::eSuccess) {
+      std::cerr << "Error: Failed to wait for reflection resource fence: " << vk::to_string(waitResult) << std::endl;
+    }
+
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create reflection resources: " << e.what() << std::endl;
+    destroyReflectionResources();
+    return false;
+  }
+}
+
+// Releases the Vulkan objects of every reflection render target and zeroes its size.
+// The entries themselves stay in `reflections`; only their handles are reset to null.
+void Renderer::destroyReflectionResources() {
+  for (auto& target : reflections) {
+    // Colour side: sampler and view go before the allocation and the image they refer to.
+    target.colorSampler = vk::raii::Sampler(nullptr);
+    target.colorView = vk::raii::ImageView(nullptr);
+    target.colorAlloc = nullptr;
+    target.color = vk::raii::Image(nullptr);
+
+    // Depth side, same order.
+    target.depthView = vk::raii::ImageView(nullptr);
+    target.depthAlloc = nullptr;
+    target.depth = vk::raii::Image(nullptr);
+
+    // A zero size marks the target as unusable for renderReflectionPass.
+    target.height = 0;
+    target.width = 0;
+  }
+}
+
+// Renders the non-blended render jobs into the current frame's reflection target using a
+// view mirrored about `planeWS`, then leaves the colour image in SHADER_READ_ONLY_OPTIMAL
+// so the main pass can sample it.
+//
+// cmd:     command buffer to record into.
+// planeWS: reflection plane; its xyz is used as the plane normal and the whole vector is
+//          forwarded to the shader as the clip plane. Plane translation is not applied to
+//          the mirror matrix.
+// camera:  source of the view/projection matrices and camera position; when null, identity
+//          matrices and a zero position are used instead.
+// jobs:    candidate draws; entries flagged cachedIsBlended are skipped, as are entries
+//          whose world AABB falls outside the mirrored frustum.
+//
+// Returns without recording anything when no reflection targets exist or when the current
+// frame's target is not fully created.
+void Renderer::renderReflectionPass(vk::raii::CommandBuffer& cmd,
+                                    const glm::vec4& planeWS,
+                                    CameraComponent* camera,
+                                    const std::vector<RenderJob>& jobs) {
+  if (reflections.empty())
+    return;
+  auto& rt = reflections[currentFrame];
+  if (rt.width == 0 || rt.height == 0 || !*rt.colorView || !*rt.depthView)
+    return;
+
+  // Transition reflection color to COLOR_ATTACHMENT_OPTIMAL (Sync2)
+  vk::ImageMemoryBarrier2 toColor2{
+    .srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe,
+    .srcAccessMask = {},
+    .dstStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput,
+    .dstAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite | vk::AccessFlagBits2::eColorAttachmentRead,
+    .oldLayout = vk::ImageLayout::eShaderReadOnlyOptimal,
+    .newLayout = vk::ImageLayout::eColorAttachmentOptimal,
+    .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+    .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+    .image = *rt.color,
+    .subresourceRange = {vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1}
+  };
+  // Transition reflection depth to DEPTH_STENCIL_ATTACHMENT_OPTIMAL (Sync2).
+  // The target layout must be the same one declared in depthAtt.imageLayout below; previous
+  // contents are discarded (oldLayout UNDEFINED) because the attachment is cleared on load.
+  vk::ImageMemoryBarrier2 toDepth2{
+    .srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe,
+    .srcAccessMask = {},
+    .dstStageMask = vk::PipelineStageFlagBits2::eEarlyFragmentTests,
+    .dstAccessMask = vk::AccessFlagBits2::eDepthStencilAttachmentWrite | vk::AccessFlagBits2::eDepthStencilAttachmentRead,
+    .oldLayout = vk::ImageLayout::eUndefined,
+    .newLayout = vk::ImageLayout::eDepthStencilAttachmentOptimal,
+    .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+    .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+    .image = *rt.depth,
+    .subresourceRange = {vk::ImageAspectFlagBits::eDepth, 0, 1, 0, 1}
+  };
+  std::array<vk::ImageMemoryBarrier2, 2> preBarriers{toColor2, toDepth2};
+  vk::DependencyInfo depInfoToColor{.imageMemoryBarrierCount = static_cast<uint32_t>(preBarriers.size()), .pImageMemoryBarriers = preBarriers.data()};
+  cmd.pipelineBarrier2(depInfoToColor);
+
+  vk::RenderingAttachmentInfo colorAtt{
+    .imageView = *rt.colorView,
+    .imageLayout = vk::ImageLayout::eColorAttachmentOptimal,
+    .loadOp = vk::AttachmentLoadOp::eClear,
+    .storeOp = vk::AttachmentStoreOp::eStore,
+    // Clear to black so scene content dominates reflections
+    .clearValue = vk::ClearValue{vk::ClearColorValue{std::array<float, 4>{0.0f, 0.0f, 0.0f, 1.0f}}}
+  };
+  vk::RenderingAttachmentInfo depthAtt{
+    .imageView = *rt.depthView,
+    .imageLayout = vk::ImageLayout::eDepthStencilAttachmentOptimal,
+    .loadOp = vk::AttachmentLoadOp::eClear,
+    // Depth is only needed while this pass runs.
+    .storeOp = vk::AttachmentStoreOp::eDontCare,
+    .clearValue = vk::ClearValue{vk::ClearDepthStencilValue{1.0f, 0}}
+  };
+  vk::RenderingInfo rinfo{
+    .renderArea = vk::Rect2D({0, 0}, {rt.width, rt.height}),
+    .layerCount = 1,
+    .colorAttachmentCount = 1,
+    .pColorAttachments = &colorAtt,
+    .pDepthAttachment = &depthAtt
+  };
+  cmd.beginRendering(rinfo);
+  // Compute mirrored view matrix about planeWS (default Y=0 plane)
+  glm::mat4 reflectM(1.0f);
+  // For Y=0 plane, reflection is simply flip Y
+  if (glm::length(glm::vec3(planeWS.x, planeWS.y, planeWS.z)) > 0.5f && fabsf(planeWS.y - 1.0f) < 1e-3f && fabsf(planeWS.x) < 1e-3f && fabsf(planeWS.z) < 1e-3f) {
+    reflectM[1][1] = -1.0f;
+  } else {
+    // General plane reflection matrix R = I - 2*n*n^T for normalized plane; ignore translation for now
+    glm::vec3 n = glm::normalize(glm::vec3(planeWS));
+    glm::mat3 R = glm::mat3(1.0f) - 2.0f * glm::outerProduct(n, n);
+    reflectM = glm::mat4(R);
+  }
+
+  glm::mat4 viewReflected = camera ? (camera->GetViewMatrix() * reflectM) : reflectM;
+  glm::mat4 projReflected = camera ? camera->GetProjectionMatrix() : glm::mat4(1.0f);
+  // Publish the mirrored view-projection and plane on the renderer, including the
+  // per-frame slot for the current frame.
+  currentReflectionVP = projReflected * viewReflected;
+  currentReflectionPlane = planeWS;
+  if (currentFrame < reflectionVPs.size()) {
+    reflectionVPs[currentFrame] = currentReflectionVP;
+  }
+
+  // Set viewport/scissor to reflection RT size
+  vk::Viewport rv(0.0f, 0.0f, static_cast<float>(rt.width), static_cast<float>(rt.height), 0.0f, 1.0f);
+  cmd.setViewport(0, rv);
+  vk::Rect2D rs({0, 0}, {rt.width, rt.height});
+  cmd.setScissor(0, rs);
+
+  // Draw opaque entities with mirrored view
+  // Use reflection-specific pipeline (cull none) to avoid mirrored winding issues.
+  if (!!*pbrReflectionGraphicsPipeline) {
+    cmd.bindPipeline(vk::PipelineBindPoint::eGraphics, *pbrReflectionGraphicsPipeline);
+  } else if (!!*pbrGraphicsPipeline) {
+    cmd.bindPipeline(vk::PipelineBindPoint::eGraphics, *pbrGraphicsPipeline);
+  }
+
+  // Prepare frustum for mirrored view to allow culling
+  FrustumPlanes reflectFrustum = extractFrustumPlanes(currentReflectionVP);
+
+  // Render all jobs (skip transparency)
+  for (const auto& job : jobs) {
+    Entity* entity = job.entity;
+    MeshComponent* meshComponent = job.meshComp;
+    EntityResources* entityRes = job.entityRes;
+    MeshResources* meshRes = job.meshRes;
+
+    if (entityRes->cachedIsBlended)
+      continue;
+
+    // Frustum culling for mirrored view
+    if (meshComponent->HasLocalAABB()) {
+      const glm::mat4 model = job.transformComp ? job.transformComp->GetModelMatrix() : glm::mat4(1.0f);
+      glm::vec3 wmin, wmax;
+      transformAABB(model, meshComponent->GetLocalAABBMin(), meshComponent->GetLocalAABBMax(), wmin, wmax);
+      if (!aabbIntersectsFrustum(wmin, wmax, reflectFrustum)) {
+        continue; // culled from reflection
+      }
+    }
+
+    // Bind geometry: binding 0 = vertices (deformed output when available), binding 1 = instances
+    vk::Buffer vb = GetVertexBuffer(this, meshComponent, meshRes);
+    std::array<vk::Buffer, 2> buffers = {vb, *entityRes->instanceBuffer};
+    std::array<vk::DeviceSize, 2> offsets = {0, 0};
+    cmd.bindVertexBuffers(0, buffers, offsets);
+    cmd.bindIndexBuffer(*meshRes->indexBuffer, 0, vk::IndexType::eUint32);
+
+    // Populate UBO with mirrored view + clip plane and reflection flags
+    UniformBufferObject ubo{};
+    if (job.transformComp)
+      ubo.model = job.transformComp->GetModelMatrix();
+    else
+      ubo.model = glm::mat4(1.0f);
+    ubo.view = viewReflected;
+    ubo.proj = projReflected;
+    ubo.camPos = glm::vec4(camera ? camera->GetPosition() : glm::vec3(0), 1.0f);
+    ubo.reflectionPass = 1;
+    ubo.reflectionEnabled = 0;
+    ubo.reflectionVP = currentReflectionVP;
+    ubo.clipPlaneWS = planeWS;
+    // Ray query shadows in reflection pass
+    ubo.padding2 = enableRasterRayQueryShadows ? 1.0f : 0.0f;
+
+    updateUniformBufferInternal(currentFrame, entity, entityRes, camera, ubo);
+
+    // Bind descriptor set (PBR set 0)
+    cmd.bindDescriptorSets(vk::PipelineBindPoint::eGraphics,
+                           *pbrPipelineLayout,
+                           0,
+                           *entityRes->pbrDescriptorSets[currentFrame],
+                           nullptr);
+
+    // Push material properties
+    MaterialProperties mp = entityRes->cachedMaterialProps;
+    // Transmission suppressed during reflection pass via UBO (reflectionPass=1)
+    mp.transmissionFactor = 0.0f;
+    pushMaterialProperties(*cmd, mp);
+
+    // Issue draw (always at least one instance)
+    uint32_t instanceCount = std::max(1u, static_cast<uint32_t>(meshComponent->GetInstanceCount()));
+    cmd.drawIndexed(meshRes->indexCount, instanceCount, 0, 0, 0);
+  }
+
+  cmd.endRendering();
+
+  // Transition reflection color to SHADER_READ_ONLY for sampling in main pass (Sync2)
+  vk::ImageMemoryBarrier2 toSample2{
+    .srcStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput,
+    .srcAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite,
+    .dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader,
+    .dstAccessMask = vk::AccessFlagBits2::eShaderRead,
+    .oldLayout = vk::ImageLayout::eColorAttachmentOptimal,
+    .newLayout = vk::ImageLayout::eShaderReadOnlyOptimal,
+    .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+    .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+    .image = *rt.color,
+    .subresourceRange = {vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1}
+  };
+  vk::DependencyInfo depInfoToSample{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &toSample2};
+  cmd.pipelineBarrier2(depInfoToSample);
+}
+
+// Create image views
+// Drops the cached opaque-scene-color resources, then builds one 2D color view per
+// swapchain image. Returns false (after logging) if any step throws.
+bool Renderer::createImageViews() {
+  try {
+    // Discard the off-screen opaque scene color state
+    opaqueSceneColorImages.clear();
+    opaqueSceneColorImageAllocations.clear();
+    opaqueSceneColorImageViews.clear();
+    opaqueSceneColorImageLayouts.clear();
+    opaqueSceneColorSampler.clear();
+
+    // Start from an empty view list sized for the current swapchain
+    swapChainImageViews.clear();
+    swapChainImageViews.reserve(swapChainImages.size());
+
+    // Shared description: identity swizzle, single mip level, single array layer, color aspect
+    const vk::ComponentMapping identitySwizzle{
+      .r = vk::ComponentSwizzle::eIdentity,
+      .g = vk::ComponentSwizzle::eIdentity,
+      .b = vk::ComponentSwizzle::eIdentity,
+      .a = vk::ComponentSwizzle::eIdentity
+    };
+    const vk::ImageSubresourceRange singleColorSubresource{
+      .aspectMask = vk::ImageAspectFlagBits::eColor,
+      .baseMipLevel = 0,
+      .levelCount = 1,
+      .baseArrayLayer = 0,
+      .layerCount = 1
+    };
+    vk::ImageViewCreateInfo viewInfo{
+      .viewType = vk::ImageViewType::e2D,
+      .format = swapChainImageFormat,
+      .components = identitySwizzle,
+      .subresourceRange = singleColorSubresource
+    };
+
+    // Only the target image differs between views
+    for (const auto& swapImage : swapChainImages) {
+      viewInfo.image = swapImage;
+      swapChainImageViews.emplace_back(device, viewInfo);
+    }
+
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create image views: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Setup dynamic rendering
+// Fills the cached color attachment, depth attachment and vk::RenderingInfo members.
+// Both attachments clear on load and store on completion; the render area covers the
+// whole swapchain extent. Returns false (after logging) if anything throws.
+bool Renderer::setupDynamicRendering() {
+  try {
+    // Color target: cleared to opaque black
+    const vk::RenderingAttachmentInfo colorTarget{
+      .imageLayout = vk::ImageLayout::eColorAttachmentOptimal,
+      .loadOp = vk::AttachmentLoadOp::eClear,
+      .storeOp = vk::AttachmentStoreOp::eStore,
+      .clearValue = vk::ClearColorValue(std::array < float, 4 >{0.0f, 0.0f, 0.0f, 1.0f})
+    };
+    colorAttachments = {colorTarget};
+
+    // Depth target: cleared to depth 1.0, stencil 0
+    depthAttachment = vk::RenderingAttachmentInfo{
+      .imageLayout = vk::ImageLayout::eDepthStencilAttachmentOptimal,
+      .loadOp = vk::AttachmentLoadOp::eClear,
+      .storeOp = vk::AttachmentStoreOp::eStore,
+      .clearValue = vk::ClearDepthStencilValue(1.0f, 0)
+    };
+
+    // Rendering info points at the member attachments above
+    const vk::Rect2D fullArea(vk::Offset2D(0, 0), swapChainExtent);
+    const auto colorCount = static_cast<uint32_t>(colorAttachments.size());
+    renderingInfo = vk::RenderingInfo{
+      .renderArea = fullArea,
+      .layerCount = 1,
+      .colorAttachmentCount = colorCount,
+      .pColorAttachments = colorAttachments.data(),
+      .pDepthAttachment = &depthAttachment
+    };
+
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to setup dynamic rendering: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Create command pool
+// Creates the command pool on the graphics queue family, with per-buffer reset enabled.
+// Returns false (after logging) if the family lookup or pool creation throws.
+bool Renderer::createCommandPool() {
+  try {
+    // Look up the queue families for the selected physical device
+    QueueFamilyIndices families = findQueueFamilies(physicalDevice);
+
+    // Describe the pool field by field
+    vk::CommandPoolCreateInfo createInfo{};
+    createInfo.flags = vk::CommandPoolCreateFlagBits::eResetCommandBuffer;
+    // value() throws if no graphics family was found; the catch below reports it
+    createInfo.queueFamilyIndex = families.graphicsFamily.value();
+
+    commandPool = vk::raii::CommandPool(device, createInfo);
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create command pool: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Create command buffers
+// Allocates MAX_FRAMES_IN_FLIGHT primary command buffers from commandPool, replacing
+// whatever the container held before. Returns false (after logging) on failure.
+bool Renderer::createCommandBuffers() {
+  try {
+    // Release any previous buffers and pre-size the container
+    commandBuffers.clear();
+    commandBuffers.reserve(MAX_FRAMES_IN_FLIGHT);
+
+    // One primary buffer per frame in flight
+    vk::CommandBufferAllocateInfo request{};
+    request.commandPool = *commandPool;
+    request.level = vk::CommandBufferLevel::ePrimary;
+    request.commandBufferCount = static_cast<uint32_t>(MAX_FRAMES_IN_FLIGHT);
+
+    commandBuffers = vk::raii::CommandBuffers(device, request);
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create command buffers: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Create sync objects
+// Rebuilds all frame synchronization primitives:
+//   - one imageAvailable/renderFinished semaphore pair per swapchain image, and
+//   - one fence per frame in flight, created in the signaled state.
+// Per-image semaphores follow the swapchain semaphore reuse guidance:
+// https://docs.vulkan.org/guide/latest/swapchain_semaphore_reuse.html
+// Returns false (after logging) if any object creation throws.
+bool Renderer::createSyncObjects() {
+  try {
+    // Drop whatever was there before
+    imageAvailableSemaphores.clear();
+    renderFinishedSemaphores.clear();
+    inFlightFences.clear();
+
+    // Size the containers up front
+    const uint32_t perImageCount = static_cast<uint32_t>(swapChainImages.size());
+    imageAvailableSemaphores.reserve(perImageCount);
+    renderFinishedSemaphores.reserve(perImageCount);
+    inFlightFences.reserve(MAX_FRAMES_IN_FLIGHT);
+
+    // Semaphores: created pairwise so both lists stay the same length
+    const vk::SemaphoreCreateInfo defaultSemaphoreInfo{};
+    for (uint32_t imageSlot = 0; imageSlot != perImageCount; ++imageSlot) {
+      imageAvailableSemaphores.emplace_back(device, defaultSemaphoreInfo);
+      renderFinishedSemaphores.emplace_back(device, defaultSemaphoreInfo);
+    }
+
+    // Fences: start signaled so the first wait on each frame slot does not block
+    vk::FenceCreateInfo signaledFenceInfo{};
+    signaledFenceInfo.flags = vk::FenceCreateFlagBits::eSignaled;
+    for (uint32_t frameSlot = 0; frameSlot < MAX_FRAMES_IN_FLIGHT; ++frameSlot) {
+      inFlightFences.emplace_back(device, signaledFenceInfo);
+    }
+
+    // The uploads timeline semaphore is not touched here (per the original note it is
+    // created early in createLogicalDevice).
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create sync objects: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Clean up swap chain
+// Releases the swapchain and the resources this function ties to it: depth image/view,
+// swapchain image views, reflection targets, the basic/PBR/lighting pipelines and their
+// layouts, the per-frame sync objects, and finally the swapchain handle itself.
+// RAII handles are released by assigning a null-constructed handle over them.
+// The descriptor pool is intentionally left alone (see note below).
+void Renderer::cleanupSwapChain() {
+  // Clean up depth resources
+  depthImageView = vk::raii::ImageView(nullptr);
+  depthImage = vk::raii::Image(nullptr);
+  depthImageAllocation = nullptr;
+
+  // Clean up swap chain image views
+  swapChainImageViews.clear();
+
+  // Note: Keep descriptor pool alive here to ensure descriptor sets remain valid during swapchain recreation.
+  // descriptorPool is preserved; it will be managed during full renderer teardown.
+
+  // Destroy reflection render targets if present
+  destroyReflectionResources();
+
+  // Clean up pipelines
+  // NOTE(review): only these three pipelines are reset here; any others are not touched by this function.
+  graphicsPipeline = vk::raii::Pipeline(nullptr);
+  pbrGraphicsPipeline = vk::raii::Pipeline(nullptr);
+  lightingPipeline = vk::raii::Pipeline(nullptr);
+
+  // Clean up pipeline layouts
+  pipelineLayout = vk::raii::PipelineLayout(nullptr);
+  pbrPipelineLayout = vk::raii::PipelineLayout(nullptr);
+  lightingPipelineLayout = vk::raii::PipelineLayout(nullptr);
+
+  // Clean up sync objects (they need to be recreated with new swap chain image count)
+  imageAvailableSemaphores.clear();
+  renderFinishedSemaphores.clear();
+  inFlightFences.clear();
+
+  // Clean up swap chain
+  // Released last, after the image views created from its images.
+  swapChain = vk::raii::SwapchainKHR(nullptr);
+}
+
+// Recreate swap chain
+// Rebuilds the swapchain and everything this function ties to it. Visible sequence:
+//   1. stop the uploads worker, mark descriptor sets invalid and drop queued descriptor ops;
+//   2. wait on all in-flight fences, then WaitIdle();
+//   3. cleanupSwapChain(), then recreate swapchain, image views, rendering info, depth,
+//      optional reflection targets, sync objects and the opaque-scene-color/transparent sets;
+//   4. clear per-entity and ray-query descriptor state, rebuild pipelines, Forward+ buffers,
+//      command buffers and ray-query resources;
+//   5. recreate per-entity descriptor sets, mark descriptors valid, restart the uploads worker.
+// NOTE(review): the bool results of createImageViews(), setupDynamicRendering() and
+// createCommandBuffers() are not checked here; a failure would go unnoticed by this function.
+void Renderer::recreateSwapChain() {
+  // Prevent background uploads worker from mutating descriptors while we rebuild
+  StopUploadsWorker();
+
+  // Block descriptor writes while we rebuild swapchain and descriptor pools
+  // (the brace after the store opens a scope that bounds the pendingDescMutex lock)
+  descriptorSetsValid.store(false, std::memory_order_relaxed); {
+    // Drop any deferred descriptor updates that target old descriptor sets
+    std::lock_guard<std::mutex> lk(pendingDescMutex);
+    pendingDescOps.clear();
+    descriptorRefreshPending.store(false, std::memory_order_relaxed);
+  }
+
+  // Wait for all frames in flight to complete before recreating the swap chain
+  std::vector<vk::Fence> allFences;
+  allFences.reserve(inFlightFences.size());
+  for (const auto& fence : inFlightFences) {
+    allFences.push_back(*fence);
+  }
+  if (!allFences.empty()) {
+    vk::Result result = waitForFencesSafe(allFences, VK_TRUE);
+    if (result != vk::Result::eSuccess) {
+      // Logged only; recreation continues regardless of the wait result
+      std::cerr << "Error: Failed to wait for in-flight fences during swap chain recreation: " << vk::to_string(result) << std::endl;
+    }
+  }
+
+  // Wait for the device to be idle before recreating the swap chain
+  // External synchronization required (VVL): serialize against queue submits/present.
+  WaitIdle();
+
+  // Clean up old swap chain resources
+  cleanupSwapChain();
+
+  // Recreate swap chain and related resources
+  createSwapChain();
+  createImageViews();
+  setupDynamicRendering();
+  createDepthResources();
+
+  // (Re)create reflection resources if enabled
+  if (enablePlanarReflections) {
+    // Reflection target size = swapchain extent scaled by reflectionResolutionScale, at least 1x1
+    uint32_t rw = std::max(1u, static_cast<uint32_t>(static_cast<float>(swapChainExtent.width) * reflectionResolutionScale));
+    uint32_t rh = std::max(1u, static_cast<uint32_t>(static_cast<float>(swapChainExtent.height) * reflectionResolutionScale));
+    createReflectionResources(rw, rh);
+  }
+
+  // Recreate sync objects with correct sizing for new swap chain
+  createSyncObjects();
+
+  // Recreate off-screen opaque scene color and descriptor sets needed by transparent pass
+  createOpaqueSceneColorResources();
+  createTransparentDescriptorSets();
+  createTransparentFallbackDescriptorSets();
+
+  // Wait for all command buffers to complete before clearing resources
+  // NOTE(review): these fences were just recreated by createSyncObjects() in the signaled
+  // state, so this wait presumably returns immediately — confirm whether it is still needed.
+  for (const auto& fence : inFlightFences) {
+    vk::Result result = waitForFencesSafe(*fence, VK_TRUE);
+    if (result != vk::Result::eSuccess) {
+      std::cerr << "Error: Failed to wait for fence before clearing resources: " << vk::to_string(result) << std::endl;
+    }
+  }
+
+  // Clear all entity descriptor sets since they're now invalid (allocated from the old pool)
+  {
+    // Serialize descriptor frees against any other descriptor operations
+    std::lock_guard<std::mutex> lk(descriptorMutex);
+    for (auto& kv : entityResources) {
+      auto& resources = kv.second;
+      resources.basicDescriptorSets.clear();
+      resources.pbrDescriptorSets.clear();
+      // Descriptor initialization flags must be reset because new descriptor sets
+      // will be allocated and only the current frame will be initialized at runtime.
+      resources.pbrUboBindingWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
+      resources.basicUboBindingWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
+      resources.pbrImagesWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
+      resources.basicImagesWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
+      resources.pbrFixedBindingsWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
+    }
+  }
+
+  // Clear ray query descriptor sets - they reference the old output image which will be destroyed
+  // Must clear before recreating to avoid descriptor set corruption
+  rayQueryDescriptorSets.clear();
+  rayQueryDescriptorsWritten.clear();
+  rayQueryDescriptorsDirtyMask.store(0u, std::memory_order_relaxed);
+
+  // Destroy ray query output image resources - they're sized to old swapchain dimensions
+  rayQueryOutputImageView = vk::raii::ImageView(nullptr);
+  rayQueryOutputImage = vk::raii::Image(nullptr);
+  rayQueryOutputImageAllocation = nullptr;
+
+  // Rebuild the pipelines (and layouts) that cleanupSwapChain() released, plus the composite pipeline
+  createGraphicsPipeline();
+  createPBRPipeline();
+  createLightingPipeline();
+  createCompositePipeline();
+
+  // Recreate Forward+ specific pipelines/resources and resize tile buffers for new extent
+  if (useForwardPlus) {
+    createDepthPrepassPipeline();
+    // Ceiling division: number of tiles needed to cover the extent in each dimension
+    uint32_t tilesX = (swapChainExtent.width + forwardPlusTileSizeX - 1) / forwardPlusTileSizeX;
+    uint32_t tilesY = (swapChainExtent.height + forwardPlusTileSizeY - 1) / forwardPlusTileSizeY;
+    createOrResizeForwardPlusBuffers(tilesX, tilesY, forwardPlusSlicesZ);
+  }
+
+  // Re-create command buffers to ensure fresh recording against new swapchain state
+  commandBuffers.clear();
+  createCommandBuffers();
+  // Restart frame-slot cycling from the first slot
+  currentFrame = 0;
+
+  // Recreate ray query resources with new swapchain dimensions
+  // This must happen after descriptor pool is valid but before marking descriptor sets valid
+  if (rayQueryEnabled && accelerationStructureEnabled) {
+    if (!createRayQueryResources()) {
+      std::cerr << "Warning: Failed to recreate ray query resources after swapchain recreation\n";
+    }
+  }
+
+  // Recreate descriptor sets for all entities after swapchain/pipeline rebuild
+  for (const auto& kv : entityResources) {
+    const auto& entity = kv.first;
+    if (!entity)
+      continue;
+    auto meshComponent = entity->GetComponent<MeshComponent>();
+    if (!meshComponent)
+      continue;
+
+    std::string texturePath = meshComponent->GetTexturePath();
+    // Fallback for basic pipeline: use baseColor when legacy path is empty
+    if (texturePath.empty()) {
+      const std::string& baseColor = meshComponent->GetBaseColorTexturePath();
+      if (!baseColor.empty()) {
+        texturePath = baseColor;
+      }
+    }
+    // Recreate basic descriptor sets (ignore failures here to avoid breaking resize)
+    createDescriptorSets(entity, texturePath, false);
+    // Recreate PBR descriptor sets
+    createDescriptorSets(entity, texturePath, true);
+  }
+
+  // Descriptor sets are now valid again
+  descriptorSetsValid.store(true, std::memory_order_relaxed);
+
+  // Resume background uploads worker now that swapchain and descriptors are recreated
+  StartUploadsWorker();
+}
+
+// Fills frameUboTemplate with the values shared by every entity this frame (camera
+// matrices, exposure/gamma, screen size, Forward+ and ray-query flags, reflection state).
+// With a null camera the template is reset to its default-constructed state and left as is.
+void Renderer::prepareFrameUboTemplate(CameraComponent* camera) {
+  frameUboTemplate = UniformBufferObject{};
+  if (camera == nullptr) {
+    return;
+  }
+
+  UniformBufferObject& tpl = frameUboTemplate;
+
+  // Camera
+  tpl.view = camera->GetViewMatrix();
+  tpl.proj = camera->GetProjectionMatrix();
+  tpl.proj[1][1] *= -1; // Flip Y for Vulkan
+  tpl.camPos = glm::vec4(camera->GetPosition(), 1.0f);
+  tpl.nearZ = camera->GetNearPlane();
+  tpl.farZ = camera->GetFarPlane();
+
+  // Lighting / tonemapping / screen
+  tpl.lightCount = static_cast<int>(lastFrameLightCount);
+  tpl.exposure = std::clamp(this->exposure, 0.2f, 4.0f);
+  tpl.gamma = this->gamma;
+  tpl.screenDimensions = glm::vec2(swapChainExtent.width, swapChainExtent.height);
+  tpl.slicesZ = static_cast<float>(forwardPlusSlicesZ);
+
+  // padding0 carries "swapchain format is sRGB" (1) or not (0)
+  const bool srgbSwapchain = swapChainImageFormat == vk::Format::eR8G8B8A8Srgb ||
+                             swapChainImageFormat == vk::Format::eB8G8R8A8Srgb;
+  const int outputIsSRGB = srgbSwapchain ? 1 : 0;
+  tpl.padding0 = outputIsSRGB;
+  // Raster PBR shader uses padding1 as the Forward+ enable flag:
+  // 0 = disabled (global light loop), non-zero = enabled (culled tile lists).
+  tpl.padding1 = useForwardPlus ? 1.0f : 0.0f;
+  // padding2 carries the raster ray-query shadows toggle
+  tpl.padding2 = enableRasterRayQueryShadows ? 1.0f : 0.0f;
+
+  // Reflections count as ready only when the previous slot's color view and sampler both exist
+  bool previousReflectionUsable = false;
+  if (enablePlanarReflections && !reflections.empty()) {
+    const uint32_t slotCount = static_cast<uint32_t>(reflections.size());
+    const uint32_t previousSlot = (currentFrame + slotCount - 1u) % slotCount;
+    auto& previousTarget = reflections[previousSlot];
+    previousReflectionUsable = (!!*previousTarget.colorView) && (!!*previousTarget.colorSampler);
+  }
+  tpl.reflectionEnabled = previousReflectionUsable ? 1 : 0;
+  tpl.reflectionVP = sampleReflectionVP;
+  tpl.clipPlaneWS = currentReflectionPlane;
+  tpl.reflectionIntensity = std::clamp(reflectionIntensity, 0.0f, 2.0f);
+  tpl.enableRayQueryReflections = enableRayQueryReflections ? 1 : 0;
+  tpl.enableRayQueryTransparency = enableRayQueryTransparency ? 1 : 0;
+
+  // Ray-query shared buffers are also used by raster PBR for ray-query shadows, so the
+  // counts are always populated to let shaders bounds-check in raster mode too.
+  tpl.geometryInfoCount = static_cast<int>(geometryInfoCountCPU);
+  tpl.materialCount = static_cast<int>(materialCountCPU);
+}
+
+// Update uniform buffer
+// Builds the per-entity UBO from the entity's transform and forwards it to
+// updateUniformBufferInternal for the given frame slot.
+//   currentImage - frame slot whose mapped uniform buffer is written
+//   entity       - entity being drawn; used to look up the transform when tc is null
+//   entityRes    - per-entity GPU resources; the call is a no-op when null
+//   camera       - source of view/projection; may be null, in which case view/proj stay
+//                  at their value-initialized defaults instead of dereferencing null
+//   tc           - optional pre-fetched transform component (avoids a component lookup)
+void Renderer::updateUniformBuffer(uint32_t currentImage, Entity* entity, EntityResources* entityRes, CameraComponent* camera, TransformComponent* tc) {
+  if (!entityRes) {
+    return;
+  }
+
+  // Get transform component (prefer the one supplied by the caller)
+  auto transformComponent = tc ? tc : (entity ? entity->GetComponent<TransformComponent>() : nullptr);
+  if (!transformComponent) {
+    return;
+  }
+
+  // Create uniform buffer object
+  UniformBufferObject ubo{};
+  ubo.model = transformComponent->GetModelMatrix();
+  // Guard the camera: other code paths in this file tolerate a null camera, so this one
+  // must not crash on it either.
+  if (camera) {
+    ubo.view = camera->GetViewMatrix();
+    ubo.proj = camera->GetProjectionMatrix();
+    ubo.proj[1][1] *= -1; // Flip Y for Vulkan
+  }
+
+  // Continue with the rest of the uniform buffer setup
+  updateUniformBufferInternal(currentImage, entity, entityRes, camera, ubo);
+}
+
+// Overloaded version that accepts a custom transform matrix
+// Same as the TransformComponent overload, but the model matrix is supplied directly.
+//   currentImage    - frame slot whose mapped uniform buffer is written
+//   entity          - entity being drawn (forwarded for diagnostics)
+//   entityRes       - per-entity GPU resources; the call is a no-op when null
+//   camera          - source of view/projection; may be null, in which case view/proj
+//                     stay at their value-initialized defaults
+//   customTransform - model matrix to upload
+void Renderer::updateUniformBuffer(uint32_t currentImage, Entity* entity, EntityResources* entityRes, CameraComponent* camera, const glm::mat4& customTransform) {
+  if (!entityRes) return;
+  // Create the uniform buffer object with custom transform
+  UniformBufferObject ubo{};
+  ubo.model = customTransform;
+  // Guard the camera so a null pointer is not dereferenced
+  if (camera) {
+    ubo.view = camera->GetViewMatrix();
+    ubo.proj = camera->GetProjectionMatrix();
+    ubo.proj[1][1] *= -1; // Flip Y for Vulkan
+  }
+
+  // Continue with the rest of the uniform buffer setup
+  updateUniformBufferInternal(currentImage, entity, entityRes, camera, ubo);
+}
+
+// Internal helper function to complete uniform buffer setup
+// Starts from frameUboTemplate, applies the caller's model matrix and, for reflection
+// passes (ubo.reflectionPass == 1), the caller's view/proj/reflection fields, then copies
+// the result into the entity's mapped uniform buffer for slot `currentImage`.
+// Does nothing when entityRes is null; logs a warning and skips the copy when the mapped
+// pointer for that slot is null.
+void Renderer::updateUniformBufferInternal(uint32_t currentImage, Entity* entity, EntityResources* entityRes, CameraComponent* camera, UniformBufferObject& ubo) {
+  if (entityRes == nullptr)
+    return;
+
+  // Frame-constant fields come from the template; only the model is per-entity
+  UniformBufferObject merged = frameUboTemplate;
+  merged.model = ubo.model;
+
+  // Reflection pass: the mirrored camera and clip plane replace the template values
+  const bool isReflectionPass = (ubo.reflectionPass == 1);
+  if (isReflectionPass) {
+    merged.reflectionPass = 1;
+    merged.reflectionEnabled = 0;
+    merged.view = ubo.view;
+    merged.proj = ubo.proj;
+    merged.reflectionVP = ubo.reflectionVP;
+    merged.clipPlaneWS = ubo.clipPlaneWS;
+    merged.padding2 = ubo.padding2;
+  }
+
+  // Upload through the persistently mapped pointer when it is available
+  void* mapped = entityRes->uniformBuffersMapped[currentImage];
+  if (mapped != nullptr) {
+    std::memcpy(mapped, &merged, sizeof(UniformBufferObject));
+    return;
+  }
+
+  std::cerr << "Warning: UBO mapped ptr null for entity '" << (entity ? entity->GetName() : "unknown") << "' frame " << currentImage << std::endl;
+}
+
+// Populates the per-entity material cache (res.cached*) once per entity.
+// No-op when entity is null or the cache is already valid. Otherwise the cache is marked
+// valid, default material properties are prepared, and - when modelLoader is set and the
+// entity name has the form "modelName_Material_<index>_<materialName>" - the named
+// material is looked up and its values override the defaults.
+void Renderer::ensureEntityMaterialCache(Entity* entity, EntityResources& res) {
+  if (!entity || res.materialCacheValid)
+    return;
+
+  // Reset cache state before (re)filling it
+  res.materialCacheValid = true;
+  res.cachedMaterial = nullptr;
+  res.cachedIsBlended = false;
+  res.cachedIsGlass = false;
+  res.cachedIsLiquid = false;
+
+  // Defaults for entities without an explicit material; textures come from descriptor bindings.
+  MaterialProperties props{};
+  props.baseColorFactor = glm::vec4(1.0f);
+  props.metallicFactor = 0.0f;
+  props.roughnessFactor = 1.0f;
+  props.baseColorTextureSet = 0;
+  props.physicalDescriptorTextureSet = 0;
+  props.normalTextureSet = -1;
+  props.occlusionTextureSet = -1;
+  props.emissiveTextureSet = -1;
+  props.alphaMask = 0.0f;
+  props.alphaMaskCutoff = 0.5f;
+  props.emissiveFactor = glm::vec3(0.0f);
+  props.emissiveStrength = 1.0f;
+  props.transmissionFactor = 0.0f;
+  props.useSpecGlossWorkflow = 0;
+  props.glossinessFactor = 0.0f;
+  props.specularFactor = glm::vec3(1.0f);
+  props.ior = 1.5f;
+  props.hasEmissiveStrengthExtension = 0;
+
+  // Step 1: resolve the material from the entity name, if possible.
+  const Material* material = nullptr;
+  if (modelLoader) {
+    constexpr char kMaterialTag[] = "_Material_";
+    constexpr size_t kMaterialTagLen = sizeof(kMaterialTag) - 1;
+
+    const std::string& entityName = entity->GetName();
+    const size_t tagPos = entityName.find(kMaterialTag);
+    const size_t afterTag = (tagPos == std::string::npos) ? std::string::npos : tagPos + kMaterialTagLen;
+    if (afterTag != std::string::npos && afterTag < entityName.length()) {
+      // Skip "<index>_": the material name starts after the first underscore past the tag
+      const size_t separator = entityName.find('_', afterTag);
+      if (separator != std::string::npos && separator + 1 < entityName.length()) {
+        const std::string materialName = entityName.substr(separator + 1);
+        material = modelLoader->GetMaterial(materialName);
+      }
+    }
+  }
+
+  // Step 2: override the defaults with the material's values.
+  if (material) {
+    res.cachedMaterial = material;
+    res.cachedIsGlass = material->isGlass;
+    res.cachedIsLiquid = material->isLiquid;
+
+    // Base factors
+    props.baseColorFactor = glm::vec4(material->albedo, material->alpha);
+    props.metallicFactor = material->metallic;
+    props.roughnessFactor = material->roughness;
+
+    // Texture set flags: 0 when a texture path is present, -1 when there is none
+    props.baseColorTextureSet = material->albedoTexturePath.empty() ? -1 : 0;
+    props.normalTextureSet = material->normalTexturePath.empty() ? -1 : 0;
+    props.occlusionTextureSet = material->occlusionTexturePath.empty() ? -1 : 0;
+    props.emissiveTextureSet = material->emissiveTexturePath.empty() ? -1 : 0;
+
+    // Physical descriptor: specular-glossiness or metallic-roughness workflow
+    if (!material->useSpecularGlossiness) {
+      props.useSpecGlossWorkflow = 0;
+      props.physicalDescriptorTextureSet = material->metallicRoughnessTexturePath.empty() ? -1 : 0;
+    } else {
+      props.useSpecGlossWorkflow = 1;
+      props.physicalDescriptorTextureSet = material->specGlossTexturePath.empty() ? -1 : 0;
+      props.glossinessFactor = material->glossinessFactor;
+      props.specularFactor = material->specularFactor;
+    }
+
+    // Emissive and transmission/IOR
+    props.emissiveFactor = material->emissive;
+    props.emissiveStrength = material->emissiveStrength;
+    // Heuristic: treat the emissive strength extension as present when strength != 1.0
+    const bool strengthDiffersFromOne = std::abs(material->emissiveStrength - 1.0f) > 1e-6f;
+    props.hasEmissiveStrengthExtension = strengthDiffersFromOne ? 1 : 0;
+    props.transmissionFactor = material->transmissionFactor;
+    props.ior = material->ior;
+
+    // Alpha mask handling
+    props.alphaMask = (material->alphaMode == "MASK") ? 1.0f : 0.0f;
+    props.alphaMaskCutoff = material->alphaCutoff;
+
+    // Blended classification (opaque materials stay in the opaque pass)
+    const bool usesAlphaBlend = (material->alphaMode == "BLEND");
+    const bool transmitsStrongly = (material->transmissionFactor > 0.2f);
+    res.cachedIsBlended = usesAlphaBlend || transmitsStrongly || res.cachedIsGlass || res.cachedIsLiquid;
+  }
+
+  res.cachedMaterialProps = props;
+}
+
+// Render the scene (unique_ptr container overload)
+// Flattens the owning container into a vector of raw pointers and forwards to the
+// raw-pointer overload, so callers can release their container locks once this snapshot exists.
+void Renderer::Render(const std::vector<std::unique_ptr<Entity>>& entities, CameraComponent* camera, ImGuiSystem* imguiSystem) {
+  const size_t entityCount = entities.size();
+
+  std::vector<Entity *> rawEntities;
+  rawEntities.reserve(entityCount);
+  for (size_t i = 0; i < entityCount; ++i) {
+    rawEntities.push_back(entities[i].get());
+  }
+
+  Render(rawEntities, camera, imguiSystem);
+}
+
+// Render the scene (raw pointer snapshot overload)
+void Renderer::Render(const std::vector<Entity *>& entities, CameraComponent* camera, ImGuiSystem* imguiSystem) {
+  // Update watchdog timestamp to prove frame is progressing
+  lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed);
+  watchdogProgressLabel.store("Render: frame begin", std::memory_order_relaxed);
+
+  if (memoryPool)
+    memoryPool->setRenderingActive(true);
+  struct RenderingStateGuard {
+    MemoryPool* pool;
+    explicit RenderingStateGuard(MemoryPool* p) : pool(p) {
+    }
+    ~RenderingStateGuard() {
+      if (pool)
+        pool->setRenderingActive(false);
+    }
+  } guard(memoryPool.get());
+
+  // Track if ray query rendered successfully this frame to skip rasterization code path
+  bool rayQueryRenderedThisFrame = false;
+
+  // --- Extract lights for the frame ---
+  // Build a single light list once per frame (emissive lights only for this scene)
+  std::vector<ExtractedLight> lightsSubset;
+  if (!staticLights.empty()) {
+    lightsSubset.reserve(std::min(staticLights.size(), static_cast<size_t>(MAX_ACTIVE_LIGHTS)));
+    for (const auto& L : staticLights) {
+      // Include all lights (Directional, Point, Emissive) up to the limit
+      lightsSubset.push_back(L);
+      if (lightsSubset.size() >= MAX_ACTIVE_LIGHTS)
+        break;
+    }
+  }
+  lastFrameLightCount = static_cast<uint32_t>(lightsSubset.size());
+  if (!lightsSubset.empty()) {
+    updateLightStorageBuffer(currentFrame, lightsSubset, camera);
+  }
+
+  // Pre-calculate frame-constant UBO data
+  prepareFrameUboTemplate(camera);
+
+  // Wait for the previous frame's work on this frame slot to complete
+  // Use a finite timeout loop so we can keep the watchdog alive during long GPU work
+  // (e.g., acceleration structure builds/refits can legitimately take seconds on large scenes).
+  watchdogProgressLabel.store("Render: wait inFlightFence", std::memory_order_relaxed);
+  vk::Result fenceResult = waitForFencesSafe(*inFlightFences[currentFrame], VK_TRUE);
+  if (fenceResult != vk::Result::eSuccess) {
+    std::cerr << "Error: Failed to wait for in-flight fence: " << vk::to_string(fenceResult) << std::endl;
+  }
+
+  // Reset the fence immediately after successful wait, before any new work
+  watchdogProgressLabel.store("Render: reset inFlightFence", std::memory_order_relaxed);
+  device.resetFences(*inFlightFences[currentFrame]);
+
+  // Execute any pending GPU uploads (enqueued by worker/loading threads) on the render thread
+  // at this safe point to ensure all Vulkan submits happen on a single thread.
+  // This prevents validation/GPU-AV PostSubmit crashes due to cross-thread queue usage.
+  watchdogProgressLabel.store("Render: ProcessPendingMeshUploads", std::memory_order_relaxed);
+  ProcessPendingMeshUploads();
+  // Execute any pending per-entity GPU resource preallocation requested by the scene loader.
+  // This prevents background threads from mutating `entityResources`/`meshResources` concurrently
+  // with rendering (which can corrupt unordered_map internals and crash).
+  watchdogProgressLabel.store("Render: ProcessPendingEntityPreallocations", std::memory_order_relaxed);
+  ProcessPendingEntityPreallocations();
+  watchdogProgressLabel.store("Render: after ProcessPendingEntityPreallocations", std::memory_order_relaxed);
+
+  // Process deferred AS deletion queue at safe point (after fence wait)
+  // Increment frame counters and delete AS structures that are no longer in use
+  // Wait for MAX_FRAMES_IN_FLIGHT + 1 frames to ensure GPU has finished all work
+  // (The +1 ensures we've waited through a full cycle of all frame slots)
+  {
+    auto it = pendingASDeletions.begin();
+    while (it != pendingASDeletions.end()) {
+      it->framesSinceDestroy++;
+      if (it->framesSinceDestroy > MAX_FRAMES_IN_FLIGHT) {
+        // Safe to delete - all frames have finished using these AS structures
+        it = pendingASDeletions.erase(it);
+      } else {
+        ++it;
+      }
+    }
+  }
+  watchdogProgressLabel.store("Render: after pendingASDeletions", std::memory_order_relaxed);
+
+  // Opportunistically request AS rebuild when more meshes become ready than in the last built AS.
+  // This makes the TLAS grow as streaming/allocations complete, then settle (no rebuild spam).
+  // NOTE: This scan can be relatively heavy and is not needed for the default startup path.
+  // Only run it when opportunistic rebuilds are enabled.
+  // While loading, allow opportunistic AS rebuild scanning even if the user-facing toggle is off.
+  // This prevents nondeterministic “missing outdoor props” across app restarts when the first TLAS
+  // build happens before all entities exist.
+  if (rayQueryEnabled && accelerationStructureEnabled && (asOpportunisticRebuildEnabled || IsLoading())) {
+    watchdogProgressLabel.store("Render: AS readiness scan", std::memory_order_relaxed);
+    size_t readyRenderableCount = 0;
+    size_t readyUniqueMeshCount = 0; {
+      std::shared_lock<std::shared_mutex> advLock(g_advancedStateMutex);
+      auto lastKick = std::chrono::steady_clock::now();
+      auto kickWatchdog = [&]() {
+        auto now = std::chrono::steady_clock::now();
+        if (now - lastKick > std::chrono::milliseconds(200)) {
+          lastFrameUpdateTime.store(now, std::memory_order_relaxed);
+          lastKick = now;
+        }
+      };
+      std::map<MeshComponent *, uint32_t> meshToBLASProbe;
+      for (Entity* e : entities) {
+        kickWatchdog();
+        if (!e || !e->IsActive())
+          continue;
+        // In Ray Query static-only mode, ignore dynamic/animated entities for readiness
+        if (IsRayQueryStaticOnly()) {
+          const std::string& nm = e->GetName();
+          if (nm.find("_AnimNode_") != std::string::npos)
+            continue;
+          if (!nm.empty() && nm.rfind("Ball_", 0) == 0)
+            continue;
+        }
+        auto meshComp = e->GetComponent<MeshComponent>();
+        if (!meshComp)
+          continue;
+
+        // Deformable meshes are included in the AS (refit each frame); count them here
+        // so this readiness scan matches the build loop in buildAccelerationStructures.
+        try {
+          auto it = meshResources.find(meshComp);
+          if (it == meshResources.end())
+            continue;
+          const auto& res = it->second;
+          // STRICT readiness: uploads must be finished (staging sizes zero)
+          if (res.vertexBufferSizeBytes != 0 || res.indexBufferSizeBytes != 0)
+            continue;
+          if (!*res.vertexBuffer || !*res.indexBuffer)
+            continue;
+          if (res.indexCount == 0)
+            continue;
+        } catch (...) {
+          continue;
+        }
+        readyRenderableCount++;
+        if (meshToBLASProbe.find(meshComp) == meshToBLASProbe.end()) {
+          meshToBLASProbe[meshComp] = static_cast<uint32_t>(meshToBLASProbe.size());
+        }
+      }
+      readyUniqueMeshCount = meshToBLASProbe.size();
+    }
+    // During scene loading/finalization, the TLAS may be built before all entities exist.
+    // Allow rebuilds even if AS is "frozen" so the TLAS converges to the full scene across restarts.
+    if ((!asFrozen || IsLoading()) && (readyRenderableCount > lastASBuiltInstanceCount || readyUniqueMeshCount > lastASBuiltBLASCount) && !asBuildRequested.load(std::memory_order_relaxed)) {
+      std::cout << "AS rebuild requested: counts increased (built instances=" << lastASBuiltInstanceCount
+          << ", ready instances=" << readyRenderableCount
+          << ", built meshes=" << lastASBuiltBLASCount
+          << ", ready meshes=" << readyUniqueMeshCount << ")\n";
+      RequestAccelerationStructureBuild("counts increased");
+    }
+
+    // Post-load repair: if loading is done and the current TLAS instance count is far below readiness,
+    // force a one-time rebuild even when frozen so we include the whole scene.
+    if (!IsLoading() && !asBuildRequested.load(std::memory_order_relaxed)) {
+      const size_t targetInstances = readyRenderableCount;
+      if (targetInstances > 0 && lastASBuiltInstanceCount < static_cast<size_t>(static_cast<double>(targetInstances) * 0.95)) {
+        asDevOverrideAllowRebuild = true; // allow rebuild even if frozen
+        std::cout << "AS rebuild requested: post-load full build (built instances=" << lastASBuiltInstanceCount
+            << ", ready instances=" << targetInstances << ")\n";
+        RequestAccelerationStructureBuild("post-load full build");
+      }
+    }
+  }
+
+  // If in Ray Query static-only mode and TLAS not yet built post-load, request a one-time build now.
+  // (Does not require a readiness scan.)
+  if (rayQueryEnabled&& accelerationStructureEnabled && currentRenderMode
+  ==
+  RenderMode::RayQuery&& IsRayQueryStaticOnly() &&
+  !IsLoading() &&
+      !*tlasStructure.handle && !asBuildRequested.load(std::memory_order_relaxed)
+  ) {
+    RequestAccelerationStructureBuild("static-only initial build");
+  }
+
+  // Check if acceleration structure build was requested (e.g., after scene loading or counts grew)
+  // Build at this safe frame point to avoid threading issues
+  watchdogProgressLabel.store("Render: AS build request check", std::memory_order_relaxed);
+  if (rayQueryEnabled && accelerationStructureEnabled && asBuildRequested.load(std::memory_order_acquire)) {
+    watchdogProgressLabel.store("Render: AS build request handling", std::memory_order_relaxed);
+
+    // Defer TLAS/BLAS build while the scene loader is still active to avoid partial builds.
+    // IMPORTANT: Do NOT use IsLoading() here; IsLoading() also includes the post-load
+    // "finalizing" stage, and deferring on that would deadlock the AS build forever.
+    if (IsSceneLoaderActive()) {
+      // Keep the request flag set; we'll build once the loader (and critical textures) finish.
+    } else if (asFrozen && !asDevOverrideAllowRebuild && !IsLoading()) {
+      // Ignore rebuilds while frozen to avoid wiping TLAS during animation playback
+      std::cout << "AS rebuild request ignored (frozen). Reason: " << lastASBuildRequestReason << "\n";
+      asBuildRequested.store(false, std::memory_order_release);
+      asBuildRequestStartNs.store(0, std::memory_order_relaxed);
+      watchdogSuppressed.store(false, std::memory_order_relaxed);
+    } else {
+      // Gate initial build until readiness is high enough to represent the full scene
+      size_t totalRenderableEntities = 0;
+      size_t readyRenderableCount = 0;
+      size_t readyUniqueMeshCount = 0;
+      size_t missingMeshResources = 0;
+      size_t pendingUploadsCount = 0;
+      size_t nullBuffersCount = 0;
+      size_t zeroIndicesCount = 0; {
+        std::shared_lock<std::shared_mutex> advLock(g_advancedStateMutex);
+        auto lastKick = std::chrono::steady_clock::now();
+        // Stores the current time into lastFrameUpdateTime, but only when more than
+        // 200 ms have passed since the previous store made through this lambda.
+        auto kickWatchdog = [&]() {
+          const auto current = std::chrono::steady_clock::now();
+          const bool intervalElapsed = (current - lastKick) > std::chrono::milliseconds(200);
+          if (!intervalElapsed) {
+            return;
+          }
+          lastFrameUpdateTime.store(current, std::memory_order_relaxed);
+          lastKick = current;
+        };
+        std::map<MeshComponent *, uint32_t> meshToBLASProbe;
+        for (Entity* e : entities) {
+          kickWatchdog();
+          if (!e || !e->IsActive())
+            continue;
+          // In Ray Query static-only mode, ignore dynamic/animated entities for totals/readiness
+          if (IsRayQueryStaticOnly()) {
+            const std::string& nm = e->GetName();
+            if (nm.find("_AnimNode_") != std::string::npos)
+              continue;
+            if (!nm.empty() && nm.rfind("Ball_", 0) == 0)
+              continue;
+          }
+          auto meshComp = e->GetComponent<MeshComponent>();
+          if (!meshComp)
+            continue;
+
+          // Deformable meshes are included in the AS (refit each frame); count them here
+          // so totals/readiness match the build loop in buildAccelerationStructures.
+          totalRenderableEntities++;
+          try {
+            auto it = meshResources.find(meshComp);
+            if (it == meshResources.end()) {
+              missingMeshResources++;
+              continue;
+            }
+            const auto& res = it->second;
+            // STRICT readiness here too: uploads finished
+            if (res.vertexBufferSizeBytes != 0 || res.indexBufferSizeBytes != 0) {
+              pendingUploadsCount++;
+              continue;
+            }
+            if (!*res.vertexBuffer || !*res.indexBuffer) {
+              nullBuffersCount++;
+              continue;
+            }
+            if (res.indexCount == 0) {
+              zeroIndicesCount++;
+              continue;
+            }
+          } catch (...) {
+            continue;
+          }
+          readyRenderableCount++;
+          if (meshToBLASProbe.find(meshComp) == meshToBLASProbe.end()) {
+            meshToBLASProbe[meshComp] = static_cast<uint32_t>(meshToBLASProbe.size());
+          }
+        }
+        readyUniqueMeshCount = meshToBLASProbe.size();
+      }
+      const double readiness = (totalRenderableEntities > 0) ? static_cast<double>(readyRenderableCount) / static_cast<double>(totalRenderableEntities) : 0.0;
+      const double buildThreshold = 0.95; // prefer building when ~full scene is ready
+
+      // Bounded deferral: avoid getting stuck forever waiting for perfect readiness.
+      // After a short timeout from the original request, build with the best available data.
+      const uint64_t reqNs = asBuildRequestStartNs.load(std::memory_order_relaxed);
+      const uint64_t nowNs = std::chrono::steady_clock::now().time_since_epoch().count();
+      const double maxDeferralSeconds = 15.0;
+      const bool deferralTimedOut = (reqNs != 0) && (nowNs > reqNs) &&
+          (static_cast<double>(nowNs - reqNs) / 1'000'000'000.0) >= maxDeferralSeconds;
+
+      if (readiness < buildThreshold && !asDevOverrideAllowRebuild && !deferralTimedOut) {
+        // Intentionally no stdout spam here (Windows consoles are slow and there's no user-facing benefit).
+        // Keep the request flag set; try again next frame
+      } else {
+        if (deferralTimedOut && readiness < buildThreshold && !asDevOverrideAllowRebuild) {
+          std::cout << "AS build forced after " << maxDeferralSeconds
+              << "s deferral (readiness " << readyRenderableCount << "/" << totalRenderableEntities
+              << ", uniqueMeshesReady=" << readyUniqueMeshCount << ")\n";
+        }
+        // Scope guard: sets the referenced flag to true when constructed and stores
+        // false into it when destroyed, on every path out of the enclosing scope.
+        struct WatchdogSuppressGuard {
+          explicit WatchdogSuppressGuard(std::atomic<bool>& target) : suppressFlag(target) {
+            suppressFlag.store(true, std::memory_order_relaxed);
+          }
+          ~WatchdogSuppressGuard() {
+            suppressFlag.store(false, std::memory_order_relaxed);
+          }
+          std::atomic<bool>& suppressFlag;
+        } watchdogGuard(watchdogSuppressed);
+
+        // Ensure previous GPU work is complete BEFORE building AS.
+        //
+        // Wait for all *other* frame-in-flight fences to signal using a finite timeout loop
+        // and kick the watchdog while we wait.
+        // Do NOT include `currentFrame` here because its fence was reset at frame start
+        // and will not signal until we submit the current frame.
+        {
+          std::vector<vk::Fence> fencesToWait;
+          if (inFlightFences.size() > 1) {
+            fencesToWait.reserve(inFlightFences.size() - 1);
+          }
+          for (uint32_t i = 0; i < static_cast<uint32_t>(inFlightFences.size()); ++i) {
+            if (i == currentFrame)
+              continue;
+            if (!!*inFlightFences[i]) {
+              fencesToWait.push_back(*inFlightFences[i]);
+            }
+          }
+          if (!fencesToWait.empty()) {
+            vk::Result result = waitForFencesSafe(fencesToWait, VK_TRUE);
+            if (result != vk::Result::eSuccess) {
+              std::cerr << "Error: Failed to wait for fences before acceleration structure build: " << vk::to_string(result) << std::endl;
+            }
+          }
+        }
+
+        watchdogProgressLabel.store("Render: buildAccelerationStructures", std::memory_order_relaxed);
+        if (buildAccelerationStructures(entities)) {
+          watchdogProgressLabel.store("Render: after buildAccelerationStructures", std::memory_order_relaxed);
+          asBuildRequested.store(false, std::memory_order_release);
+          asBuildRequestStartNs.store(0, std::memory_order_relaxed);
+          // AS build request resolved; restore normal watchdog sensitivity.
+          watchdogSuppressed.store(false, std::memory_order_relaxed);
+          // Transition the loading UI to a finalizing phase (descriptor cold-init, etc.).
+          if (IsLoading()) {
+            SetLoadingPhase(LoadingPhase::Finalizing);
+            SetLoadingPhaseProgress(0.0f);
+          }
+
+          // The TLAS handle can transition from null -> valid (or change on rebuild).
+          // Ensure raster PBR descriptor sets (set 0, binding 11 `tlas`) are rewritten after an AS build
+          // so subsequent Raster draws never see an unwritten/stale acceleration-structure descriptor.
+          for (auto& kv : entityResources) {
+            kv.second.pbrFixedBindingsWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
+          }
+          for (Entity* e : entities) {
+            MarkEntityDescriptorsDirty(e);
+          }
+
+          // Freeze only when the built AS covers essentially the full set of renderable entities.
+          // NOTE: `lastASBuiltInstanceCount` is an ENTITY count; TLAS instance count (instancing) is tracked separately.
+          if (asFreezeAfterFullBuild) {
+            const double threshold = 0.95;
+            if (totalRenderableEntities > 0 &&
+              static_cast<double>(lastASBuiltInstanceCount) >= threshold * static_cast<double>(totalRenderableEntities)) {
+              asFrozen = true;
+            }
+          }
+
+          // One concise TLAS summary with consistent units.
+          if (!!*tlasStructure.handle) {
+            if (IsRayQueryStaticOnly()) {
+              std::cout << "TLAS ready (static-only): tlasInstances=" << lastASBuiltTlasInstanceCount
+                  << ", entities=" << lastASBuiltInstanceCount
+                  << ", BLAS=" << lastASBuiltBLASCount
+                  << ", addr=0x" << std::hex << tlasStructure.deviceAddress << std::dec << std::endl;
+            } else {
+              std::cout << "TLAS ready: tlasInstances=" << lastASBuiltTlasInstanceCount
+                  << ", entities=" << lastASBuiltInstanceCount
+                  << ", BLAS=" << lastASBuiltBLASCount
+                  << ", addr=0x" << std::hex << tlasStructure.deviceAddress << std::dec << std::endl;
+            }
+          }
+        } else {
+          if (!accelerationStructureEnabled || !rayQueryEnabled) {
+            // Permanent failure due to lack of support; do not retry.
+            asBuildRequested.store(false, std::memory_order_release);
+            asBuildRequestStartNs.store(0, std::memory_order_relaxed);
+            watchdogSuppressed.store(false, std::memory_order_relaxed);
+          } else {
+            // If nothing is ready yet (e.g., mesh uploads still pending), don't spam logs.
+            if (readyRenderableCount > 0 || readyUniqueMeshCount > 0) {
+              std::cout << "Failed to build acceleration structures, will retry next frame" << std::endl;
+            }
+          }
+        }
+        // Reset dev override after one use
+        asDevOverrideAllowRebuild = false;
+      }
+    }
+  }
+
+  // Safe point: the previous work referencing this frame's descriptor sets is complete.
+  // Apply any deferred descriptor set updates for entities whose textures finished streaming.
+  watchdogProgressLabel.store("Render: ProcessDirtyDescriptorsForFrame", std::memory_order_relaxed);
+  ProcessDirtyDescriptorsForFrame(currentFrame);
+  watchdogProgressLabel.store("Render: after ProcessDirtyDescriptorsForFrame", std::memory_order_relaxed);
+
+  if (!IsLoading()) {
+    framesSinceLoadingComplete++;
+    pauseBackgroundUploads.store(framesSinceLoadingComplete < 30, std::memory_order_relaxed);
+  } else {
+    framesSinceLoadingComplete = 0;
+    pauseBackgroundUploads.store(false, std::memory_order_relaxed);
+  }
+
+  // Calculate max draws for gradual startup (avoid GPU stall in Debug)
+  // Set to a very large value to ensure all entities (including Fox/Cube) are drawn.
+  const uint32_t maxDraws = 100000;
+
+  // --- 1. PREPARATION PASS ---
+  // Gather active entities with mesh resources, perform per-frame descriptor initialization,
+  // and execute culling. This single pass replaces multiple redundant scans and reduces map lookups.
+  std::vector<RenderJob> opaqueJobs;
+  std::vector<RenderJob> transparentJobs;
+  opaqueJobs.reserve(entities.size());
+
+  {
+    watchdogProgressLabel.store("Render: preparation pass", std::memory_order_relaxed);
+
+    // Prepare frustum once per frame for culling
+    FrustumPlanes frustum{};
+    const bool doCulling = enableFrustumCulling && camera;
+    if (doCulling && camera) {
+      glm::mat4 proj = camera->GetProjectionMatrix();
+      proj[1][1] *= -1.0f;
+      const glm::mat4 vp = proj * camera->GetViewMatrix();
+      frustum = extractFrustumPlanes(vp);
+    }
+    lastCullingVisibleCount = 0;
+    lastCullingCulledCount = 0;
+
+    uint32_t entityProcessCount = 0;
+    static int frameCount = 0;
+    frameCount++;
+    for (Entity* entity : entities) {
+      if (!entity) continue;
+      
+      bool isTarget = (entity->GetName().find("Fox") != std::string::npos || entity->GetName().find("Cube") != std::string::npos);
+
+      if (!entity->IsActive()) {
+        continue;
+      }
+
+      auto meshComponent = entity->GetComponent<MeshComponent>();
+      if (!meshComponent) {
+        continue;
+      }
+
+      auto entityIt = entityResources.find(entity);
+      if (entityIt == entityResources.end()) {
+        continue;
+      }
+
+      auto meshIt = meshResources.find(meshComponent);
+      if (meshIt == meshResources.end()) {
+        continue;
+      }
+      
+      EntityResources& entityRes = entityIt->second;
+      MeshResources& meshRes = meshIt->second;
+
+      // Ensure material cache is valid once per frame
+      ensureEntityMaterialCache(entity, entityRes);
+
+      // --- Per-frame Descriptor Cold-Init (Integrated) ---
+      if (entityRes.basicDescriptorSets.empty() || entityRes.pbrDescriptorSets.empty()) {
+        std::string texPath = meshComponent->GetBaseColorTexturePath();
+        if (texPath.empty()) texPath = meshComponent->GetTexturePath();
+        if (entityRes.basicDescriptorSets.empty()) createDescriptorSets(entity, entityRes, texPath, false);
+        if (entityRes.pbrDescriptorSets.empty()) createDescriptorSets(entity, entityRes, texPath, true);
+      }
+
+      // Initialize binding 0 (UBO) for the current frame slot if not already done.
+      if (!entityRes.pbrUboBindingWritten[currentFrame] || !entityRes.basicUboBindingWritten[currentFrame]) {
+        std::string texPath = meshComponent->GetBaseColorTexturePath();
+        if (texPath.empty()) texPath = meshComponent->GetTexturePath();
+        if (!entityRes.pbrUboBindingWritten[currentFrame]) {
+          updateDescriptorSetsForFrame(entity, entityRes, texPath, true, currentFrame, false, true);
+        }
+        if (!entityRes.basicUboBindingWritten[currentFrame]) {
+          updateDescriptorSetsForFrame(entity, entityRes, texPath, false, currentFrame, false, true);
+        }
+      }
+
+      // Initialize images for the current frame slot if not already done.
+      if (!entityRes.pbrImagesWritten[currentFrame] || !entityRes.basicImagesWritten[currentFrame]) {
+        std::string texPath = meshComponent->GetBaseColorTexturePath();
+        if (texPath.empty()) texPath = meshComponent->GetTexturePath();
+        if (!entityRes.pbrImagesWritten[currentFrame]) {
+          updateDescriptorSetsForFrame(entity, entityRes, texPath, true, currentFrame, true, false);
+          entityRes.pbrImagesWritten[currentFrame] = true;
+        }
+        if (!entityRes.basicImagesWritten[currentFrame]) {
+          updateDescriptorSetsForFrame(entity, entityRes, texPath, false, currentFrame, true, false);
+          entityRes.basicImagesWritten[currentFrame] = true;
+        }
+      }
+
+      // --- Culling & Classification ---
+      auto* tc = entity->GetComponent<TransformComponent>();
+      bool useBlended = entityRes.cachedIsBlended;
+
+      if (meshComponent->HasLocalAABB()) {
+        const glm::mat4 model = tc ? tc->GetModelMatrix() : glm::mat4(1.0f);
+        glm::vec3 wmin, wmax;
+        transformAABB(model, meshComponent->GetLocalAABBMin(), meshComponent->GetLocalAABBMax(), wmin, wmax);
+
+        // 1. Frustum Culling
+        if (doCulling && !isTarget && !aabbIntersectsFrustum(wmin, wmax, frustum)) {
+          lastCullingCulledCount++;
+          continue;
+        }
+
+        // 2. Distance-based LOD
+        if (enableDistanceLOD && camera && !isTarget) {
+          glm::vec3 camPos = camera->GetPosition();
+          bool cameraInside = (camPos.x >= wmin.x && camPos.x <= wmax.x &&
+                               camPos.y >= wmin.y && camPos.y <= wmax.y &&
+                               camPos.z >= wmin.z && camPos.z <= wmax.z);
+          if (!cameraInside) {
+            float dx = std::max({0.0f, wmin.x - camPos.x, camPos.x - wmax.x});
+            float dy = std::max({0.0f, wmin.y - camPos.y, camPos.y - wmax.y});
+            float dz = std::max({0.0f, wmin.z - camPos.z, camPos.z - wmax.z});
+            float dist = std::sqrt(dx * dx + dy * dy + dz * dz);
+            float z_eff = std::max(0.1f, dist);
+            float fov = glm::radians(camera->GetFieldOfView());
+            float radius = glm::length(0.5f * (wmax - wmin));
+            float pixelDiameter = (radius * 2.0f * static_cast<float>(swapChainExtent.height)) / (z_eff * 2.0f * std::tan(fov * 0.5f));
+            float threshold = useBlended ? lodPixelThresholdTransparent : lodPixelThresholdOpaque;
+            if (pixelDiameter < threshold) {
+              lastCullingCulledCount++;
+              continue;
+            }
+          }
+        }
+      }
+
+      lastCullingVisibleCount++;
+      bool isAlphaMasked = false;
+      if (entityRes.materialCacheValid) {
+        isAlphaMasked = (entityRes.cachedMaterialProps.alphaMask > 0.5f);
+      }
+
+      // Update UBO for visible entity once per frame (shared across all main passes)
+      updateUniformBuffer(currentFrame, entity, &entityRes, camera, tc);
+
+      RenderJob job{entity, &entityRes, &meshRes, meshComponent, tc, isAlphaMasked};
+      if (useBlended) {
+        if (transparentJobs.size() < maxDraws)
+            transparentJobs.push_back(job);
+      } else {
+        if (opaqueJobs.size() < maxDraws)
+            opaqueJobs.push_back(job);
+      }
+
+      // Update watchdog periodically
+      if (++entityProcessCount % 100 == 0) {
+        lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed);
+      }
+    }
+    watchdogProgressLabel.store("Render: after preparation pass", std::memory_order_relaxed);
+    lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed);
+  }
+
+  // If the scene loader has finished and there are no remaining blocking tasks,
+  // hide the fullscreen loading overlay.
+  static uint32_t finalizingFrames = 0;
+  if (IsLoading() && GetLoadingPhase() == LoadingPhase::Finalizing) {
+    const bool loaderDone = !loadingFlag.load(std::memory_order_relaxed);
+    const bool criticalDone = (criticalJobsOutstanding.load(std::memory_order_relaxed) == 0u);
+    const bool noASPending = !asBuildRequested.load(std::memory_order_relaxed);
+    const bool noPreallocPending = !pendingEntityPreallocQueued.load(std::memory_order_relaxed);
+    const bool noDirtyEntities = descriptorDirtyEntities.empty();
+    const bool noDeferredDescOps = !descriptorRefreshPending.load(std::memory_order_relaxed);
+    if (loaderDone && criticalDone && noASPending && noPreallocPending && noDirtyEntities && noDeferredDescOps) {
+      // Wait for a few frames in Finalizing state to allow the GPU to establish a stable loop
+      // and ensure the very first draw calls don't trigger a watchdog abort.
+      if (++finalizingFrames > 15) {
+        MarkInitialLoadComplete();
+      }
+    }
+  } else {
+    finalizingFrames = 0;
+  }
+
+  // Safe point: flush any descriptor updates that were deferred while a command buffer
+  // was recording in a prior frame. Only apply ops for the current frame to avoid
+  // update-after-bind on pending frames.
+  if (descriptorRefreshPending.load(std::memory_order_relaxed)) {
+    watchdogProgressLabel.store("Render: flush deferred descriptor ops", std::memory_order_relaxed);
+    std::vector<PendingDescOp> ops; {
+      std::lock_guard<std::mutex> lk(pendingDescMutex);
+      ops.swap(pendingDescOps);
+      descriptorRefreshPending.store(false, std::memory_order_relaxed);
+    }
+    uint32_t opCount = 0;
+    for (auto& op : ops) {
+      // Kick watchdog periodically during potentially heavy descriptor update bursts
+      if ((++opCount % 50u) == 0u) {
+        lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed);
+      }
+
+      if (op.frameIndex == currentFrame) {
+        // Now not recording; safe to apply updates for this frame
+        updateDescriptorSetsForFrame(op.entity, op.texPath, op.usePBR, op.frameIndex, op.imagesOnly);
+      } else {
+        // Keep other frame ops queued for next frame’s safe point
+        std::lock_guard<std::mutex> lk(pendingDescMutex);
+        pendingDescOps.push_back(op);
+        descriptorRefreshPending.store(true, std::memory_order_relaxed);
+      }
+    }
+    watchdogProgressLabel.store("Render: after deferred descriptor ops", std::memory_order_relaxed);
+  }
+
+  // Safe point: handle any pending reflection resource (re)creation and per-frame descriptor refreshes
+  if (reflectionResourcesDirty) {
+    if (enablePlanarReflections) {
+      uint32_t rw = std::max(1u, static_cast<uint32_t>(static_cast<float>(swapChainExtent.width) * reflectionResolutionScale));
+      uint32_t rh = std::max(1u, static_cast<uint32_t>(static_cast<float>(swapChainExtent.height) * reflectionResolutionScale));
+      createReflectionResources(rw, rh);
+    } else {
+      destroyReflectionResources();
+    }
+    reflectionResourcesDirty = false;
+  }
+
+  // Reflection descriptor binding refresh is handled elsewhere; avoid redundant per-frame mass updates here.
+  // Pick the VP associated with the previous frame's reflection texture for sampling in the main pass
+  if (enablePlanarReflections && !reflectionVPs.empty()) {
+    uint32_t prev = (currentFrame > 0) ? (currentFrame - 1) : (static_cast<uint32_t>(reflectionVPs.size()) - 1);
+    sampleReflectionVP = reflectionVPs[prev];
+  }
+
+  // This function updates bindings 6/7/8 (storage buffers) which don't have UPDATE_AFTER_BIND.
+  // Updating these every frame causes "updated without UPDATE_AFTER_BIND" errors with MAX_FRAMES_IN_FLIGHT > 1.
+  // These bindings are already initialized in createDescriptorSets and updated when buffers change.
+  // Binding 10 (reflection map) has UPDATE_AFTER_BIND and can be updated separately if needed.
+  // refreshPBRForwardPlusBindingsForFrame(currentFrame);
+
+  // Acquire next swapchain image
+  const uint32_t acquireSemaphoreIndex = currentFrame % static_cast<uint32_t>(imageAvailableSemaphores.size());
+  static int acquireLogCount = 0;
+
+  uint32_t imageIndex;
+  vk::Result acquireResultCode = vk::Result::eSuccess;
+  // Generic helper that unpacks the value returned by acquireNextImage into a result
+  // code and an image index, whichever of the two supported return shapes it has.
+  auto extractAcquire = [](auto const& ret, vk::Result& code, uint32_t& idx) {
+    using AcquireT = std::decay_t<decltype(ret)>;
+    if constexpr (!std::is_same_v<AcquireT, vk::ResultValue<uint32_t>>) {
+      // Anything else is assumed to be the older std::pair<vk::Result, uint32_t>.
+      code = ret.first;
+      idx = ret.second;
+    } else {
+      // vk::ResultValue<uint32_t> exposes .result / .value.
+      code = ret.result;
+      idx = ret.value;
+    }
+  };
+  try {
+    watchdogProgressLabel.store("Render: acquireNextImage", std::memory_order_relaxed);
+    auto acquireRet = swapChain.acquireNextImage(UINT64_MAX, *imageAvailableSemaphores[acquireSemaphoreIndex]);
+    // Vulkan-Hpp changed the return type of acquireNextImage for RAII swapchain across versions.
+    // Support both vk::ResultValue<uint32_t> (newer) and std::pair<vk::Result, uint32_t> (older).
+    extractAcquire(acquireRet, acquireResultCode, imageIndex);
+  } catch (const vk::OutOfDateKHRError&) {
+    watchdogProgressLabel.store("Render: acquireNextImage out-of-date", std::memory_order_relaxed);
+    // Swapchain is out of date (e.g., window resized) before we could
+    // query the result. Trigger recreation and exit this frame cleanly.
+    framebufferResized.store(true, std::memory_order_relaxed);
+    if (imguiSystem)
+      ImGui::EndFrame();
+    // IMPORTANT: We already reset the in-flight fence at the start of the frame.
+    // Because we're exiting early (no submit), signal it via an empty submit so
+    // swapchain recreation won't hang waiting for an unsignaled fence.
+    {
+      vk::SubmitInfo2 emptySubmit2{};
+      std::lock_guard<std::mutex> lock(queueMutex);
+      graphicsQueue.submit2(emptySubmit2, *inFlightFences[currentFrame]);
+    }
+    recreateSwapChain();
+    return;
+  }
+
+  // imageIndex already populated above
+  watchdogProgressLabel.store("Render: acquired swapchain image", std::memory_order_relaxed);
+  if (acquireResultCode == vk::Result::eSuboptimalKHR || framebufferResized.load(std::memory_order_relaxed)) {
+    framebufferResized.store(false, std::memory_order_relaxed);
+    if (imguiSystem)
+      ImGui::EndFrame();
+    // Fence was reset earlier; ensure it is signaled before we bail out
+    // to avoid a deadlock in swapchain recreation.
+    {
+      vk::SubmitInfo2 emptySubmit2{};
+      std::lock_guard<std::mutex> lock(queueMutex);
+      graphicsQueue.submit2(emptySubmit2, *inFlightFences[currentFrame]);
+    }
+    recreateSwapChain();
+    return;
+  }
+  if (acquireResultCode != vk::Result::eSuccess) {
+    throw std::runtime_error("Failed to acquire swap chain image");
+  }
+
+  if (framebufferResized.load(std::memory_order_relaxed)) {
+    // Signal the fence via empty submit since no real work will be submitted
+    // this frame, preventing a wait on an unsignaled fence during resize.
+    {
+      vk::SubmitInfo2 emptySubmit2{};
+      std::lock_guard<std::mutex> lock(queueMutex);
+      graphicsQueue.submit2(emptySubmit2, *inFlightFences[currentFrame]);
+    }
+    recreateSwapChain();
+    return;
+  }
+
+  // Perform any descriptor updates that must not happen during command buffer recording
+  if (useForwardPlus) {
+    uint32_t tilesX_pre = (swapChainExtent.width + forwardPlusTileSizeX - 1) / forwardPlusTileSizeX;
+    uint32_t tilesY_pre = (swapChainExtent.height + forwardPlusTileSizeY - 1) / forwardPlusTileSizeY;
+    // Only update current frame's descriptors to avoid touching in-flight frames
+    createOrResizeForwardPlusBuffers(tilesX_pre, tilesY_pre, forwardPlusSlicesZ, /*updateOnlyCurrentFrame=*/true);
+    // After (re)creating Forward+ buffers, bindings 7/8 will be refreshed as needed.
+  }
+
+  // Ensure light buffers are sufficiently large before recording to avoid resizing while in use
+  {
+    // Reserve capacity based on emissive lights only (punctual lights disabled for now)
+    size_t desiredLightCapacity = 0;
+    if (!staticLights.empty()) {
+      size_t emissiveCount = 0;
+      for (const auto& L : staticLights) {
+        if (L.type == ExtractedLight::Type::Emissive) {
+          ++emissiveCount;
+          if (emissiveCount >= MAX_ACTIVE_LIGHTS)
+            break;
+        }
+      }
+      desiredLightCapacity = emissiveCount;
+    }
+    if (desiredLightCapacity > 0) {
+      createOrResizeLightStorageBuffers(desiredLightCapacity);
+      // Ensure compute (binding 0) sees the current frame's lights buffer
+      refreshForwardPlusComputeLightsBindingForFrame(currentFrame);
+      // Bindings 6/7/8 for PBR are refreshed only when buffers change (handled in resize path).
+    }
+  }
+
+  // Safe point: Update ray query descriptor sets if ray query mode is active
+  // This MUST happen before command buffer recording starts to avoid "descriptor updated without UPDATE_AFTER_BIND" errors
+  if (currentRenderMode == RenderMode::RayQuery && rayQueryEnabled && accelerationStructureEnabled) {
+    if (!!*tlasStructure.handle) {
+      updateRayQueryDescriptorSets(currentFrame, entities);
+    }
+  }
+
+  // TLAS build/refit logic
+  const bool needTLAS = (currentRenderMode == RenderMode::RayQuery || enableRasterRayQueryShadows) && accelerationStructureEnabled;
+  // Skip during the very first frames of scene transition to unblock the GPU.
+  if (needTLAS && !!*tlasStructure.handle && (framesSinceLoadingComplete > 40 || IsLoading()) && (framesSinceLoadingComplete != 1)) {
+    if (!IsRayQueryStaticOnly()) {
+      watchdogProgressLabel.store("Render: refitTopLevelAS", std::memory_order_relaxed);
+      lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed);
+      refitTopLevelAS(entities, camera);
+    }
+  }
+
+  watchdogProgressLabel.store("Render: reset cmdBuffer", std::memory_order_relaxed);
+  static int resetLogCount = 0;
+  commandBuffers[currentFrame].reset();
+  // Begin command buffer recording for this frame
+  watchdogProgressLabel.store("Render: begin cmdBuffer", std::memory_order_relaxed);
+  commandBuffers[currentFrame].begin(vk::CommandBufferBeginInfo());
+  isRecordingCmd.store(true, std::memory_order_relaxed);
+
+  // Perform skinning compute dispatch before any rendering that might use it
+  // Skip during the very first frames of scene transition to unblock the GPU.
+  if ((framesSinceLoadingComplete > 30 || IsLoading()) && (framesSinceLoadingComplete != 1)) {
+    static int skinLogCount = 0;
+    watchdogProgressLabel.store("Render: updateSkins", std::memory_order_relaxed);
+    AdvancedRenderer_updateSkins(this, commandBuffers[currentFrame], currentFrame, entities);
+    lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed);
+  }
+  // Inline BLAS refit: runs after skinning so BVH and vertex data are both current-frame.
+  // Gated by enableBLASRefit so this step can be disabled for diagnosis.
+  if (enableBLASRefit && rayQueryEnabled && accelerationStructureEnabled && !!*tlasStructure.handle) {
+    watchdogProgressLabel.store("Render: BLAS refit", std::memory_order_relaxed);
+    refitBLASInline(commandBuffers[currentFrame]);
+  }
+  // A resize was flagged after recording began: abandon this frame and rebuild the
+  // swapchain, leaving the renderer in the same state as the earlier early-exit paths.
+  if (framebufferResized.load(std::memory_order_relaxed)) {
+    commandBuffers[currentFrame].end();
+    // Recording has ended; clear the flag that was set right after begin() so it
+    // does not stay stuck at true for the frames that follow.
+    isRecordingCmd.store(false, std::memory_order_relaxed);
+    // Close the UI frame the same way the other bail-outs in this function do.
+    if (imguiSystem)
+      ImGui::EndFrame();
+    // The in-flight fence was reset at frame start and the recorded commands are
+    // never submitted, so signal the fence with an empty submit; otherwise a later
+    // wait on it (e.g. during swapchain recreation) would never return.
+    {
+      vk::SubmitInfo2 emptySubmit2{};
+      std::lock_guard<std::mutex> lock(queueMutex);
+      graphicsQueue.submit2(emptySubmit2, *inFlightFences[currentFrame]);
+    }
+    recreateSwapChain();
+    return;
+  }
+
+  // Ray query rendering mode dispatch
+  if (currentRenderMode == RenderMode::RayQuery && rayQueryEnabled && accelerationStructureEnabled) {
+    // Check if TLAS handle is valid (dereference RAII handle)
+    if (!*tlasStructure.handle) {
+      // TLAS not built yet.
+      // During loading, allow the raster path (and the progress overlay) to render normally
+      // instead of presenting a diagnostic magenta frame.
+      if (!IsLoading()) {
+        // Present a diagnostic frame from the ray-query path to avoid accidentally showing
+        // rasterized content in RayQuery mode.
+        // Transition swapchain image from PRESENT to TRANSFER_DST
+        vk::ImageMemoryBarrier2 swapchainBarrier{};
+        swapchainBarrier.srcStageMask = vk::PipelineStageFlagBits2::eBottomOfPipe;
+        swapchainBarrier.srcAccessMask = vk::AccessFlagBits2::eNone;
+        swapchainBarrier.dstStageMask = vk::PipelineStageFlagBits2::eTransfer;
+        swapchainBarrier.dstAccessMask = vk::AccessFlagBits2::eTransferWrite;
+        swapchainBarrier.oldLayout = (imageIndex < swapChainImageLayouts.size()) ? swapChainImageLayouts[imageIndex] : vk::ImageLayout::eUndefined;
+        swapchainBarrier.newLayout = vk::ImageLayout::eTransferDstOptimal;
+        swapchainBarrier.image = swapChainImages[imageIndex];
+        swapchainBarrier.subresourceRange.aspectMask = vk::ImageAspectFlagBits::eColor;
+        swapchainBarrier.subresourceRange.levelCount = 1;
+        swapchainBarrier.subresourceRange.layerCount = 1;
+
+        vk::DependencyInfo depInfoSwap{};
+        depInfoSwap.imageMemoryBarrierCount = 1;
+        depInfoSwap.pImageMemoryBarriers = &swapchainBarrier;
+        commandBuffers[currentFrame].pipelineBarrier2(depInfoSwap);
+        if (imageIndex < swapChainImageLayouts.size())
+          swapChainImageLayouts[imageIndex] = swapchainBarrier.newLayout;
+
+        // Clear to a distinct magenta diagnostic color
+        vk::ClearColorValue clearColor{std::array < float, 4 >{1.0f, 0.0f, 1.0f, 1.0f}};
+        vk::ImageSubresourceRange clearRange{vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1};
+        commandBuffers[currentFrame].clearColorImage(swapChainImages[imageIndex], vk::ImageLayout::eTransferDstOptimal, clearColor, clearRange);
+
+        // Transition back to PRESENT
+        swapchainBarrier.srcStageMask = vk::PipelineStageFlagBits2::eTransfer;
+        swapchainBarrier.srcAccessMask = vk::AccessFlagBits2::eTransferWrite;
+        swapchainBarrier.dstStageMask = vk::PipelineStageFlagBits2::eBottomOfPipe;
+        swapchainBarrier.dstAccessMask = vk::AccessFlagBits2::eNone;
+        swapchainBarrier.oldLayout = vk::ImageLayout::eTransferDstOptimal;
+        swapchainBarrier.newLayout = vk::ImageLayout::ePresentSrcKHR;
+        commandBuffers[currentFrame].pipelineBarrier2(depInfoSwap);
+        if (imageIndex < swapChainImageLayouts.size())
+          swapChainImageLayouts[imageIndex] = swapchainBarrier.newLayout;
+
+        rayQueryRenderedThisFrame = true; // Skip raster; ensure we are looking at RQ path only
+      }
+    } else {
+      // TLAS is valid and descriptor sets were already updated at safe point
+      // Proceed with ray query rendering
+      // Bind ray query compute pipeline
+      commandBuffers[currentFrame].bindPipeline(vk::PipelineBindPoint::eCompute, *rayQueryPipeline);
+
+      // Bind descriptor set
+      commandBuffers[currentFrame].bindDescriptorSets(
+        vk::PipelineBindPoint::eCompute,
+        *rayQueryPipelineLayout,
+        0,
+        *rayQueryDescriptorSets[currentFrame],
+        nullptr);
+
+      // This dedicated UBO is separate from entity UBOs and uses a Ray Query-specific layout.
+      if (rayQueryUniformBuffersMapped.size() > currentFrame && rayQueryUniformBuffersMapped[currentFrame]) {
+        RayQueryUniformBufferObject ubo{};
+        ubo.model = glm::mat4(1.0f); // Identity - not used for ray query
+
+        // Force view matrix update to reflect current camera position
+        // (the dirty flag isn't automatically set when camera position changes)
+        camera->ForceViewMatrixUpdate();
+
+        // Get camera matrices
+        glm::mat4 camView = camera->GetViewMatrix();
+        ubo.view = camView;
+        ubo.proj = camera->GetProjectionMatrix();
+        ubo.proj[1][1] *= -1; // Flip Y for Vulkan
+        ubo.camPos = glm::vec4(camera->GetPosition(), 1.0f);
+        // Clamp to sane ranges to avoid black output (exposure=0 → 1-exp(0)=0)
+        ubo.exposure = std::clamp(exposure, 0.2f, 4.0f);
+        ubo.gamma = std::clamp(gamma, 1.6f, 2.6f);
+        // Match raster convention: ambient scale factor for simple IBL/ambient term.
+        // (Raster defaults to ~1.0 in the main pass; keep Ray Query consistent.)
+        ubo.scaleIBLAmbient = 1.0f;
+        // Provide the per-frame light count so the ray query shader can iterate lights.
+        ubo.lightCount = static_cast<int>(lastFrameLightCount);
+        ubo.screenDimensions = glm::vec2(swapChainExtent.width, swapChainExtent.height);
+        ubo.enableRayQueryReflections = enableRayQueryReflections ? 1 : 0;
+        ubo.enableRayQueryTransparency = enableRayQueryTransparency ? 1 : 0;
+        // Max secondary bounces (reflection/refraction). Stored in the padding slot to avoid UBO layout churn.
+        // Shader clamps this value.
+        ubo._pad0 = rayQueryMaxBounces;
+        // Thick-glass toggles and tuning
+        ubo.enableThickGlass = enableThickGlass ? 1 : 0;
+        ubo.thicknessClamp = thickGlassThicknessClamp;
+        ubo.absorptionScale = thickGlassAbsorptionScale;
+        // Ray Query hard shadows (see `shaders/ray_query.slang`)
+        ubo._pad1 = enableRayQueryShadows ? 1 : 0;
+        ubo.shadowSampleCount = std::clamp(rayQueryShadowSampleCount, 1, 32);
+        ubo.shadowSoftness = std::clamp(rayQueryShadowSoftness, 0.0f, 1.0f);
+        ubo.reflectionIntensity = reflectionIntensity;
+        // Provide geometry info count for shader-side bounds checking (per-instance)
+        ubo.geometryInfoCount = static_cast<int>(tlasInstanceCount);
+        // Provide material buffer count for shader-side bounds checking
+        ubo.materialCount = static_cast<int>(materialCountCPU);
+
+        // Copy to mapped memory
+        std::memcpy(rayQueryUniformBuffersMapped[currentFrame], &ubo, sizeof(RayQueryUniformBufferObject));
+      } else {
+        // Keep concise error for visibility
+        std::cerr << "Ray Query UBO not mapped for frame " << currentFrame << "\n";
+      }
+
+      // Dispatch compute shader (8x8 workgroups as defined in shader)
+      uint32_t workgroupsX = (swapChainExtent.width + 7) / 8;
+      uint32_t workgroupsY = (swapChainExtent.height + 7) / 8;
+      commandBuffers[currentFrame].dispatch(workgroupsX, workgroupsY, 1);
+
+      // Barrier: wait for compute shader to finish writing to output image,
+      // then make it readable by fragment shader for sampling in composite pass
+      vk::ImageMemoryBarrier2 rqToSample{};
+      rqToSample.srcStageMask = vk::PipelineStageFlagBits2::eComputeShader;
+      rqToSample.srcAccessMask = vk::AccessFlagBits2::eShaderWrite;
+      rqToSample.dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader;
+      rqToSample.dstAccessMask = vk::AccessFlagBits2::eShaderRead;
+      rqToSample.oldLayout = vk::ImageLayout::eGeneral;
+      rqToSample.newLayout = vk::ImageLayout::eShaderReadOnlyOptimal;
+      rqToSample.image = *rayQueryOutputImage;
+      rqToSample.subresourceRange.aspectMask = vk::ImageAspectFlagBits::eColor;
+      rqToSample.subresourceRange.levelCount = 1;
+      rqToSample.subresourceRange.layerCount = 1;
+
+      vk::DependencyInfo depRQToSample{};
+      depRQToSample.imageMemoryBarrierCount = 1;
+      depRQToSample.pImageMemoryBarriers = &rqToSample;
+      commandBuffers[currentFrame].pipelineBarrier2(depRQToSample);
+
+      // Composite fullscreen: sample rayQueryOutputImage to the swapchain using the composite pipeline
+      // Transition swapchain image to COLOR_ATTACHMENT_OPTIMAL
+      vk::ImageMemoryBarrier2 swapchainToColor{};
+      swapchainToColor.srcStageMask = vk::PipelineStageFlagBits2::eBottomOfPipe;
+      swapchainToColor.srcAccessMask = vk::AccessFlagBits2::eNone;
+      swapchainToColor.dstStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput;
+      swapchainToColor.dstAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite | vk::AccessFlagBits2::eColorAttachmentRead;
+      swapchainToColor.oldLayout = (imageIndex < swapChainImageLayouts.size()) ? swapChainImageLayouts[imageIndex] : vk::ImageLayout::eUndefined;
+      swapchainToColor.newLayout = vk::ImageLayout::eColorAttachmentOptimal;
+      swapchainToColor.image = swapChainImages[imageIndex];
+      swapchainToColor.subresourceRange.aspectMask = vk::ImageAspectFlagBits::eColor;
+      swapchainToColor.subresourceRange.levelCount = 1;
+      swapchainToColor.subresourceRange.layerCount = 1;
+      vk::DependencyInfo depSwapToColor{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &swapchainToColor};
+      commandBuffers[currentFrame].pipelineBarrier2(depSwapToColor);
+      if (imageIndex < swapChainImageLayouts.size())
+        swapChainImageLayouts[imageIndex] = swapchainToColor.newLayout;
+
+      // Begin dynamic rendering for composite (no depth)
+      colorAttachments[0].imageView = *swapChainImageViews[imageIndex];
+      colorAttachments[0].loadOp = vk::AttachmentLoadOp::eClear;
+      depthAttachment.loadOp = vk::AttachmentLoadOp::eDontCare;
+      renderingInfo.renderArea = vk::Rect2D({0, 0}, swapChainExtent);
+      auto savedDepthPtr2 = renderingInfo.pDepthAttachment;
+      renderingInfo.pDepthAttachment = nullptr;
+      commandBuffers[currentFrame].beginRendering(renderingInfo);
+
+      if (!!*compositePipeline) {
+        commandBuffers[currentFrame].bindPipeline(vk::PipelineBindPoint::eGraphics, *compositePipeline);
+      }
+      vk::Viewport vp(0.0f, 0.0f, static_cast<float>(swapChainExtent.width), static_cast<float>(swapChainExtent.height), 0.0f, 1.0f);
+      vk::Rect2D sc({0, 0}, swapChainExtent);
+      commandBuffers[currentFrame].setViewport(0, vp);
+      commandBuffers[currentFrame].setScissor(0, sc);
+
+      // Bind the RQ composite descriptor set (samples rayQueryOutputImage)
+      if (!rqCompositeDescriptorSets.empty()) {
+        commandBuffers[currentFrame].bindDescriptorSets(
+          vk::PipelineBindPoint::eGraphics,
+          *compositePipelineLayout,
+          0,
+          {*rqCompositeDescriptorSets[currentFrame]},
+          {});
+      }
+
+      // Push exposure/gamma and sRGB flag
+      struct CompositePush { // Push-constant payload sent to the fragment stage of the composite pass (offset 0).
+        float exposure;      // Set from this->exposure, clamped to [0.2, 4.0] before the push.
+        float gamma;         // Set from this->gamma as-is (no clamp is applied here).
+        int outputIsSRGB;    // 1 when the swapchain format is R8G8B8A8Srgb or B8G8R8A8Srgb, otherwise 0.
+        float _pad;          // Never assigned; stays zero from pc2{}. Presumably pads the block to 16 bytes for the shader-side layout (TODO confirm).
+      } pc2{};               // Value-initialized, so every member starts at zero.
+      pc2.exposure = std::clamp(this->exposure, 0.2f, 4.0f);
+      pc2.gamma = this->gamma;
+      pc2.outputIsSRGB = (swapChainImageFormat == vk::Format::eR8G8B8A8Srgb || swapChainImageFormat == vk::Format::eB8G8R8A8Srgb) ? 1 : 0;
+      commandBuffers[currentFrame].pushConstants<CompositePush>(*compositePipelineLayout, vk::ShaderStageFlagBits::eFragment, 0, pc2);
+
+      commandBuffers[currentFrame].draw(3, 1, 0, 0);
+      commandBuffers[currentFrame].endRendering();
+      renderingInfo.pDepthAttachment = savedDepthPtr2;
+
+      // Transition swapchain back to PRESENT and RQ image back to GENERAL for next frame
+      vk::ImageMemoryBarrier2 swapchainToPresent{};
+      swapchainToPresent.srcStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput;
+      swapchainToPresent.srcAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite;
+      swapchainToPresent.dstStageMask = vk::PipelineStageFlagBits2::eBottomOfPipe;
+      swapchainToPresent.dstAccessMask = vk::AccessFlagBits2::eNone;
+      swapchainToPresent.oldLayout = vk::ImageLayout::eColorAttachmentOptimal;
+      swapchainToPresent.newLayout = vk::ImageLayout::ePresentSrcKHR;
+      swapchainToPresent.image = swapChainImages[imageIndex];
+      swapchainToPresent.subresourceRange.aspectMask = vk::ImageAspectFlagBits::eColor;
+      swapchainToPresent.subresourceRange.levelCount = 1;
+      swapchainToPresent.subresourceRange.layerCount = 1;
+
+      vk::ImageMemoryBarrier2 rqBackToGeneral{};
+      rqBackToGeneral.srcStageMask = vk::PipelineStageFlagBits2::eFragmentShader;
+      rqBackToGeneral.srcAccessMask = vk::AccessFlagBits2::eShaderRead;
+      rqBackToGeneral.dstStageMask = vk::PipelineStageFlagBits2::eComputeShader;
+      rqBackToGeneral.dstAccessMask = vk::AccessFlagBits2::eShaderWrite;
+      rqBackToGeneral.oldLayout = vk::ImageLayout::eShaderReadOnlyOptimal;
+      rqBackToGeneral.newLayout = vk::ImageLayout::eGeneral;
+      rqBackToGeneral.image = *rayQueryOutputImage;
+      rqBackToGeneral.subresourceRange.aspectMask = vk::ImageAspectFlagBits::eColor;
+      rqBackToGeneral.subresourceRange.levelCount = 1;
+      rqBackToGeneral.subresourceRange.layerCount = 1;
+
+      std::array<vk::ImageMemoryBarrier2, 2> barriers{swapchainToPresent, rqBackToGeneral};
+      vk::DependencyInfo depEnd{.imageMemoryBarrierCount = static_cast<uint32_t>(barriers.size()), .pImageMemoryBarriers = barriers.data()};
+      commandBuffers[currentFrame].pipelineBarrier2(depEnd);
+      if (imageIndex < swapChainImageLayouts.size())
+        swapChainImageLayouts[imageIndex] = swapchainToPresent.newLayout;
+
+      // Ray query rendering complete - set flag to skip rasterization code path
+      rayQueryRenderedThisFrame = true;
+    }
+  }
+
+  // Process texture streaming uploads (see Renderer::ProcessPendingTextureJobs)
+
+  vk::raii::Pipeline* currentPipeline = nullptr;
+  vk::raii::PipelineLayout* currentLayout = nullptr;
+
+  // Incrementally process pending texture uploads on the main thread so that
+  // all Vulkan submits happen from a single place while worker threads only
+  // handle CPU-side decoding. While the loading screen is up, prioritize
+  // critical textures so the first rendered frame looks mostly correct.
+  if (IsLoading()) {
+    // Larger budget while loading screen is visible so we don't stall
+    // streaming of near-field baseColor textures.
+    ProcessPendingTextureJobs(/*maxJobs=*/16, /*includeCritical=*/true, /*includeNonCritical=*/false);
+  } else {
+    // After loading screen disappears, we want the scene to remain
+    // responsive (~20 fps) while textures stream in. Limit the number
+    // of non-critical uploads per frame so we don't tank frame time.
+    static uint32_t streamingFrameCounter = 0;
+    streamingFrameCounter++;
+    // Ray Query needs textures visible quickly; process more streaming work when in Ray Query mode.
+    if (currentRenderMode == RenderMode::RayQuery) {
+      // Aggressively drain both critical and non-critical queues each frame for faster bring-up.
+      ProcessPendingTextureJobs(/*maxJobs=*/32, /*includeCritical=*/true, /*includeNonCritical=*/true);
+    } else {
+      // Raster path: keep previous throttling to avoid stalls.
+      if ((streamingFrameCounter % 3) == 0) {
+        ProcessPendingTextureJobs(/*maxJobs=*/1, /*includeCritical=*/false, /*includeNonCritical=*/true);
+      }
+    }
+  }
+
+  // Renderer UI - available for both ray query and rasterization modes.
+  // Hide UI during loading; the progress overlay is handled by ImGuiSystem::NewFrame().
+  if (imguiSystem && !imguiSystem->IsFrameRendered() && !IsLoading()) {
+    if (ImGui::Begin("Renderer")) {
+      // Declare variables that need to persist across conditional blocks
+      bool prevFwdPlus = useForwardPlus;
+
+      // === RENDERING MODE SELECTION (TOP) ===
+      ImGui::Text("Rendering Mode:");
+      if (rayQueryEnabled && accelerationStructureEnabled) {
+        const char* modeNames[] = {"Rasterization", "Ray Query"};
+        int currentMode = (currentRenderMode == RenderMode::RayQuery) ? 1 : 0;
+        if (ImGui::Combo("Mode", &currentMode, modeNames, 2)) {
+          RenderMode newMode = (currentMode == 1) ? RenderMode::RayQuery : RenderMode::Rasterization;
+          if (newMode != currentRenderMode) {
+            currentRenderMode = newMode;
+            std::cout << "Switched to " << modeNames[currentMode] << " mode\n";
+
+            // Request acceleration structure build when switching to ray query mode
+            if (currentRenderMode == RenderMode::RayQuery) {
+              std::cout << "Requesting acceleration structure build...\n";
+              RequestAccelerationStructureBuild();
+            }
+
+            // Switching modes can change which pipelines are bound and whether ray-query-dependent
+            // descriptor bindings (e.g., PBR binding 11 `tlas`) become statically used.
+            // Mark entity descriptor sets dirty so the next safe point refreshes bindings for this frame.
+            for (auto& kv : entityResources) {
+              kv.second.pbrFixedBindingsWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
+            }
+            for (Entity* e : entities) {
+              MarkEntityDescriptorsDirty(e);
+            }
+          }
+        }
+      } else {
+        ImGui::TextColored(ImVec4(0.7f, 0.7f, 0.7f, 1.0f), "Rasterization only (ray query not supported)");
+      }
+
+      // === RASTERIZATION-SPECIFIC OPTIONS ===
+      if (currentRenderMode == RenderMode::Rasterization) {
+        ImGui::Separator();
+        ImGui::Text("Rasterization Options:");
+
+        // Lighting Controls - BRDF/PBR is now the default lighting model
+        bool useBasicLighting = imguiSystem && !imguiSystem->IsPBREnabled();
+        if (ImGui::Checkbox("Use Basic Lighting (Phong)", &useBasicLighting)) {
+          imguiSystem->SetPBREnabled(!useBasicLighting);
+          std::cout << "Lighting mode: " << (!useBasicLighting ? "BRDF/PBR (default)" : "Basic Phong") << std::endl;
+        }
+
+        if (!useBasicLighting) {
+          ImGui::Text("Status: BRDF/PBR pipeline active (default)");
+          ImGui::Text("All models rendered with physically-based lighting");
+        } else {
+          ImGui::Text("Status: Basic Phong pipeline active");
+          ImGui::Text("All models rendered with basic Phong shading");
+        }
+
+        ImGui::Checkbox("Forward+ (tiled light culling)", &useForwardPlus);
+        if (useForwardPlus && !prevFwdPlus) {
+          // Lazily create Forward+ resources if enabled at runtime
+          if (!*forwardPlusPipeline || !*forwardPlusDescriptorSetLayout || forwardPlusPerFrame.empty()) {
+            createForwardPlusPipelinesAndResources();
+          }
+          if (!*depthPrepassPipeline) {
+            createDepthPrepassPipeline();
+          }
+        }
+
+        // Raster shadows via ray queries (experimental)
+        if (rayQueryEnabled && accelerationStructureEnabled) {
+          ImGui::Checkbox("RayQuery shadows (raster)", &enableRasterRayQueryShadows);
+        } else {
+          ImGui::TextDisabled("RayQuery shadows (raster) (requires ray query + AS)");
+        }
+
+        // Planar reflections controls
+        ImGui::Spacing();
+        /*
+        if (ImGui::Checkbox("Planar reflections (experimental)", &enablePlanarReflections)) {
+          // Defer actual (re)creation/destruction to the next safe point at frame start
+          reflectionResourcesDirty = true;
+        }
+        */
+        enablePlanarReflections = false;
+        float scaleBefore = reflectionResolutionScale;
+        if (ImGui::SliderFloat("Reflection resolution scale", &reflectionResolutionScale, 0.25f, 1.0f, "%.2f")) {
+          reflectionResolutionScale = std::clamp(reflectionResolutionScale, 0.25f, 1.0f);
+          if (enablePlanarReflections&& std::abs(scaleBefore - reflectionResolutionScale)
+          >
+          1e-3f
+          ) {
+            reflectionResourcesDirty = true;
+          }
+        }
+        if (enablePlanarReflections && !reflections.empty()) {
+          auto& rt = reflections[currentFrame];
+          if (rt.width > 0) {
+            ImGui::Text("Reflection RT: %ux%u", rt.width, rt.height);
+          }
+        }
+      }
+
+      // === RAY QUERY-SPECIFIC OPTIONS ===
+      if (currentRenderMode == RenderMode::RayQuery && rayQueryEnabled && accelerationStructureEnabled) {
+        ImGui::Separator();
+        ImGui::Text("Ray Query Status:");
+
+        // Show acceleration structure status
+        if (!!*tlasStructure.handle) {
+          ImGui::TextColored(ImVec4(0.0f, 1.0f, 0.0f, 1.0f), "Acceleration Structures: Built (%zu meshes)", blasStructures.size());
+        } else {
+          ImGui::TextColored(ImVec4(1.0f, 0.5f, 0.0f, 1.0f), "Acceleration Structures: Not built");
+        }
+
+        ImGui::Spacing();
+        ImGui::Text("Ray Query Features:");
+        ImGui::Checkbox("Enable Hard Shadows", &enableRayQueryShadows);
+        if (enableRayQueryShadows) {
+          ImGui::SliderInt("Shadow samples", &rayQueryShadowSampleCount, 1, 32);
+          ImGui::SliderFloat("Shadow softness (fraction of range)", &rayQueryShadowSoftness, 0.0f, 0.2f, "%.3f");
+        }
+        ImGui::Checkbox("Enable Reflections", &enableRayQueryReflections);
+        ImGui::Checkbox("Enable Transparency/Refraction", &enableRayQueryTransparency);
+        ImGui::SliderInt("Max secondary bounces", &rayQueryMaxBounces, 0, 10);
+        // Thick-glass realism controls
+        ImGui::Separator();
+        ImGui::Text("Thick Glass");
+        ImGui::Checkbox("Enable Thick Glass", &enableThickGlass);
+        ImGui::SliderFloat("Thickness Clamp (m)", &thickGlassThicknessClamp, 0.0f, 0.5f, "%.3f");
+        ImGui::SliderFloat("Absorption Scale", &thickGlassAbsorptionScale, 0.0f, 4.0f, "%.2f");
+      }
+
+      // === SHARED OPTIONS (BOTH MODES) ===
+      ImGui::Separator();
+      ImGui::Text("Culling & LOD:");
+      if (ImGui::Checkbox("Frustum culling", &enableFrustumCulling)) {
+        // no-op, takes effect immediately
+      }
+      if (ImGui::Checkbox("Distance LOD (projected-size skip)", &enableDistanceLOD)) {
+      }
+      ImGui::SliderFloat("LOD threshold opaque (px)", &lodPixelThresholdOpaque, 0.5f, 8.0f, "%.1f");
+      ImGui::SliderFloat("LOD threshold transparent (px)", &lodPixelThresholdTransparent, 0.5f, 12.0f, "%.1f");
+      // Anisotropy control (recreate samplers on change)
+      {
+        float deviceMaxAniso = physicalDevice.getProperties().limits.maxSamplerAnisotropy;
+        if (ImGui::SliderFloat("Sampler max anisotropy", &samplerMaxAnisotropy, 1.0f, deviceMaxAniso, "%.1f")) {
+          // Recreate samplers for all textures to apply new anisotropy
+          std::unique_lock<std::shared_mutex> texLock(textureResourcesMutex);
+          for (auto& kv : textureResources) {
+            createTextureSampler(kv.second);
+          }
+          // Default texture
+          createTextureSampler(defaultTextureResources);
+        }
+      }
+      if (lastCullingVisibleCount + lastCullingCulledCount > 0) {
+        ImGui::Text("Culling: visible=%u, culled=%u", lastCullingVisibleCount, lastCullingCulledCount);
+      }
+
+      // Basic tone mapping controls
+      ImGui::Separator();
+      ImGui::Text("Tone Mapping & Tuning:");
+      ImGui::SliderFloat("Reflection intensity", &reflectionIntensity, 0.0f, 2.0f, "%.2f");
+      ImGui::SliderFloat("Exposure", &exposure, 0.1f, 4.0f, "%.2f");
+      ImGui::SliderFloat("Gamma", &gamma, 1.6f, 2.6f, "%.2f");
+    }
+    ImGui::End();
+  }
+
+  // Rasterization rendering: only execute if ray query did not render this frame.
+  if (!rayQueryRenderedThisFrame) {
+    // Optional: render planar reflections first
+    /*
+    if (enablePlanarReflections) {
+      glm::vec4 planeWS(0.0f, 1.0f, 0.0f, 0.0f);
+      renderReflectionPass(commandBuffers[currentFrame], planeWS, camera, opaqueJobs);
+    }
+    */
+
+    // Sort transparent entities back-to-front for correct blending of nested glass/liquids
+    if (!transparentJobs.empty()) {
+      glm::vec3 camPos = camera ? camera->GetPosition() : glm::vec3(0.0f);
+      std::ranges::sort(transparentJobs,
+                        [camPos](const RenderJob& a, const RenderJob& b) {
+                          glm::vec3 pa = a.transformComp ? a.transformComp->GetPosition() : glm::vec3(0.0f);
+                          glm::vec3 pb = b.transformComp ? b.transformComp->GetPosition() : glm::vec3(0.0f);
+                          float da2 = glm::length2(pa - camPos);
+                          float db2 = glm::length2(pb - camPos);
+                          if (da2 != db2) return da2 > db2;
+                          if (a.entityRes->cachedIsLiquid != b.entityRes->cachedIsLiquid) return a.entityRes->cachedIsLiquid;
+                          return a.entity < b.entity;
+                        });
+    }
+
+
+    // Track whether we executed a depth pre-pass this frame (used to choose depth load op and pipeline state)
+    bool didOpaqueDepthPrepass = false;
+
+    // Optional Forward+ depth pre-pass for opaque geometry
+    if (useForwardPlus) {
+      if (!opaqueJobs.empty()) {
+        // Transition depth image for attachment write (Sync2)
+        vk::ImageMemoryBarrier2 depthBarrier2{
+          .srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe,
+          .srcAccessMask = {},
+          .dstStageMask = vk::PipelineStageFlagBits2::eEarlyFragmentTests,
+          .dstAccessMask = vk::AccessFlagBits2::eDepthStencilAttachmentWrite,
+          .oldLayout = vk::ImageLayout::eUndefined,
+          .newLayout = vk::ImageLayout::eDepthAttachmentOptimal,
+          .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+          .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+          .image = *depthImage,
+          .subresourceRange = {vk::ImageAspectFlagBits::eDepth, 0, 1, 0, 1}
+        };
+        vk::DependencyInfo depInfoDepth{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &depthBarrier2};
+        commandBuffers[currentFrame].pipelineBarrier2(depInfoDepth);
+
+        // Depth-only rendering
+        vk::RenderingAttachmentInfo depthOnlyAttachment{.imageView = *depthImageView, .imageLayout = vk::ImageLayout::eDepthAttachmentOptimal, .loadOp = vk::AttachmentLoadOp::eClear, .storeOp = vk::AttachmentStoreOp::eStore, .clearValue = vk::ClearDepthStencilValue{1.0f, 0}};
+        vk::RenderingInfo depthOnlyInfo{.renderArea = vk::Rect2D({0, 0}, swapChainExtent), .layerCount = 1, .colorAttachmentCount = 0, .pColorAttachments = nullptr, .pDepthAttachment = &depthOnlyAttachment};
+        commandBuffers[currentFrame].beginRendering(depthOnlyInfo);
+        vk::Viewport viewport(0.0f, 0.0f, static_cast<float>(swapChainExtent.width), static_cast<float>(swapChainExtent.height), 0.0f, 1.0f);
+        commandBuffers[currentFrame].setViewport(0, viewport);
+        vk::Rect2D scissor({0, 0}, swapChainExtent);
+        commandBuffers[currentFrame].setScissor(0, scissor);
+
+        // Bind depth pre-pass pipeline
+        if (!!*depthPrepassPipeline) {
+          commandBuffers[currentFrame].bindPipeline(vk::PipelineBindPoint::eGraphics, *depthPrepassPipeline);
+        }
+
+        for (const auto& job : opaqueJobs) {
+          if (job.isAlphaMasked) continue;
+
+          // Bind geometry
+          vk::Buffer vb = GetVertexBuffer(this, job.meshComp, job.meshRes);
+          std::array<vk::Buffer, 2> buffers = {vb, *job.entityRes->instanceBuffer};
+          std::array<vk::DeviceSize, 2> offsets = {0, 0};
+          commandBuffers[currentFrame].bindVertexBuffers(0, buffers, offsets);
+          commandBuffers[currentFrame].bindIndexBuffer(*job.meshRes->indexBuffer, 0, vk::IndexType::eUint32);
+
+          // Bind descriptor set (PBR set 0)
+          commandBuffers[currentFrame].bindDescriptorSets(vk::PipelineBindPoint::eGraphics,
+                                                           *pbrPipelineLayout,
+                                                           0,
+                                                           *job.entityRes->pbrDescriptorSets[currentFrame],
+                                                           nullptr);
+
+          // Issue draw
+          uint32_t instanceCount = std::max(1u, static_cast<uint32_t>(job.meshComp->GetInstanceCount()));
+          commandBuffers[currentFrame].drawIndexed(job.meshRes->indexCount, instanceCount, 0, 0, 0);
+        }
+
+        commandBuffers[currentFrame].endRendering();
+
+        // Barrier to ensure depth is visible for subsequent passes (Sync2)
+        vk::ImageMemoryBarrier2 depthToRead2{
+          .srcStageMask = vk::PipelineStageFlagBits2::eLateFragmentTests,
+          .srcAccessMask = vk::AccessFlagBits2::eDepthStencilAttachmentWrite,
+          .dstStageMask = vk::PipelineStageFlagBits2::eEarlyFragmentTests,
+          .dstAccessMask = vk::AccessFlagBits2::eDepthStencilAttachmentRead,
+          .oldLayout = vk::ImageLayout::eDepthAttachmentOptimal,
+          .newLayout = vk::ImageLayout::eDepthAttachmentOptimal,
+          .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+          .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+          .image = *depthImage,
+          .subresourceRange = {vk::ImageAspectFlagBits::eDepth, 0, 1, 0, 1}
+        };
+        vk::DependencyInfo depInfoDepthToRead{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &depthToRead2};
+        commandBuffers[currentFrame].pipelineBarrier2(depInfoDepthToRead);
+
+        didOpaqueDepthPrepass = true;
+      }
+
+      // Forward+ compute culling based on current camera and screen tiles
+      uint32_t tilesX = (swapChainExtent.width + forwardPlusTileSizeX - 1) / forwardPlusTileSizeX;
+      uint32_t tilesY = (swapChainExtent.height + forwardPlusTileSizeY - 1) / forwardPlusTileSizeY;
+
+      // Lights already extracted at frame start - use lastFrameLightCount for Forward+ params
+      glm::mat4 view = camera->GetViewMatrix();
+      glm::mat4 proj = camera->GetProjectionMatrix();
+      proj[1][1] *= -1.0f;
+      float nearZ = camera->GetNearPlane();
+      float farZ = camera->GetFarPlane();
+      updateForwardPlusParams(currentFrame, view, proj, lastFrameLightCount, tilesX, tilesY, forwardPlusSlicesZ, nearZ, farZ);
+      // As a last guard before dispatch, make sure compute binding 0 is valid for this frame
+      refreshForwardPlusComputeLightsBindingForFrame(currentFrame);
+
+      dispatchForwardPlus(commandBuffers[currentFrame], tilesX, tilesY, forwardPlusSlicesZ);
+    }
+
+    // PASS 1: RENDER OPAQUE OBJECTS TO OFF-SCREEN TEXTURE
+    // Transition off-screen color to attachment write (Sync2). On first use after creation or after switching
+    // from a mode that never produced this image, the layout may still be UNDEFINED.
+    vk::ImageLayout oscOldLayout = vk::ImageLayout::eUndefined;
+    vk::PipelineStageFlags2 oscSrcStage = vk::PipelineStageFlagBits2::eTopOfPipe;
+    vk::AccessFlags2 oscSrcAccess = vk::AccessFlagBits2::eNone;
+    if (currentFrame < opaqueSceneColorImageLayouts.size()) {
+      oscOldLayout = opaqueSceneColorImageLayouts[currentFrame];
+      if (oscOldLayout == vk::ImageLayout::eShaderReadOnlyOptimal) {
+        oscSrcStage = vk::PipelineStageFlagBits2::eFragmentShader;
+        oscSrcAccess = vk::AccessFlagBits2::eShaderRead;
+      } else if (oscOldLayout == vk::ImageLayout::eColorAttachmentOptimal) {
+        oscSrcStage = vk::PipelineStageFlagBits2::eColorAttachmentOutput;
+        oscSrcAccess = vk::AccessFlagBits2::eColorAttachmentWrite;
+      } else {
+        oscOldLayout = vk::ImageLayout::eUndefined;
+        oscSrcStage = vk::PipelineStageFlagBits2::eTopOfPipe;
+        oscSrcAccess = vk::AccessFlagBits2::eNone;
+      }
+    }
+    vk::ImageMemoryBarrier2 oscToColor2{
+      .srcStageMask = oscSrcStage,
+      .srcAccessMask = oscSrcAccess,
+      .dstStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput,
+      .dstAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite | vk::AccessFlagBits2::eColorAttachmentRead,
+      .oldLayout = oscOldLayout,
+      .newLayout = vk::ImageLayout::eColorAttachmentOptimal,
+      .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+      .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+      .image = *opaqueSceneColorImages[currentFrame],
+      .subresourceRange = {vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1}
+    };
+    vk::DependencyInfo depOscToColor{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &oscToColor2};
+    commandBuffers[currentFrame].pipelineBarrier2(depOscToColor);
+    if (currentFrame < opaqueSceneColorImageLayouts.size()) {
+      opaqueSceneColorImageLayouts[currentFrame] = vk::ImageLayout::eColorAttachmentOptimal;
+    }
+    // PASS 1: OFF-SCREEN COLOR (Opaque)
+    // Clear the off-screen target at the start of opaque rendering to a neutral black background
+    vk::RenderingAttachmentInfo colorAttachment{.imageView = *opaqueSceneColorImageViews[currentFrame], .imageLayout = vk::ImageLayout::eColorAttachmentOptimal, .loadOp = vk::AttachmentLoadOp::eClear, .storeOp = vk::AttachmentStoreOp::eStore, .clearValue = vk::ClearColorValue(std::array < float, 4 >{0.0f, 0.0f, 0.0f, 1.0f})};
+    depthAttachment.imageView = *depthImageView;
+    depthAttachment.loadOp = (didOpaqueDepthPrepass) ? vk::AttachmentLoadOp::eLoad : vk::AttachmentLoadOp::eClear;
+    vk::RenderingInfo passInfo{.renderArea = vk::Rect2D({0, 0}, swapChainExtent), .layerCount = 1, .colorAttachmentCount = 1, .pColorAttachments = &colorAttachment, .pDepthAttachment = &depthAttachment};
+    commandBuffers[currentFrame].beginRendering(passInfo);
+    vk::Viewport viewport(0.0f, 0.0f, static_cast<float>(swapChainExtent.width), static_cast<float>(swapChainExtent.height), 0.0f, 1.0f);
+    commandBuffers[currentFrame].setViewport(0, viewport);
+    vk::Rect2D scissor({0, 0}, swapChainExtent);
+    commandBuffers[currentFrame].setScissor(0, scissor); {
+      uint32_t opaqueDrawsThisPass = 0;
+      for (const auto& job : opaqueJobs) {
+        bool useBasic = (imguiSystem && !imguiSystem->IsPBREnabled());
+        vk::raii::Pipeline* selectedPipeline = nullptr;
+        vk::raii::PipelineLayout* selectedLayout = nullptr;
+        if (useBasic) {
+          selectedPipeline = &graphicsPipeline;
+          selectedLayout = &pipelineLayout;
+        } else {
+          // If masked, we need depth writes with alpha test; otherwise, after-prepass read-only is fine.
+          if (job.isAlphaMasked) {
+            selectedPipeline = &pbrGraphicsPipeline; // writes depth, compare Less
+          } else {
+            selectedPipeline = didOpaqueDepthPrepass && !!*pbrPrepassGraphicsPipeline ? &pbrPrepassGraphicsPipeline : &pbrGraphicsPipeline;
+          }
+          selectedLayout = &pbrPipelineLayout;
+        }
+        if (currentPipeline != selectedPipeline) {
+          commandBuffers[currentFrame].bindPipeline(vk::PipelineBindPoint::eGraphics, **selectedPipeline);
+          currentPipeline = selectedPipeline;
+          currentLayout = selectedLayout;
+        }
+
+        vk::Buffer vb = GetVertexBuffer(this, job.meshComp, job.meshRes);
+        std::array<vk::Buffer, 2> buffers = {vb, *job.entityRes->instanceBuffer};
+        std::array<vk::DeviceSize, 2> offsets = {0, 0};
+        commandBuffers[currentFrame].bindVertexBuffers(0, buffers, offsets);
+        commandBuffers[currentFrame].bindIndexBuffer(*job.meshRes->indexBuffer, 0, vk::IndexType::eUint32);
+
+        auto* descSetsPtr = useBasic ? &job.entityRes->basicDescriptorSets : &job.entityRes->pbrDescriptorSets;
+        if (descSetsPtr->empty() || currentFrame >= descSetsPtr->size()) {
+          continue;
+        }
+
+        if (useBasic) {
+          commandBuffers[currentFrame].bindDescriptorSets(
+            vk::PipelineBindPoint::eGraphics,
+            **selectedLayout,
+            0,
+            {*(*descSetsPtr)[currentFrame]},
+            {});
+        } else {
+          vk::DescriptorSet set1Opaque = (transparentDescriptorSets.empty() || IsLoading())
+                                           ? *transparentFallbackDescriptorSets[currentFrame]
+                                           : *transparentDescriptorSets[currentFrame];
+          commandBuffers[currentFrame].bindDescriptorSets(
+            vk::PipelineBindPoint::eGraphics,
+            **selectedLayout,
+            0,
+            {*(*descSetsPtr)[currentFrame], set1Opaque},
+            {});
+
+          commandBuffers[currentFrame].pushConstants<MaterialProperties>(**selectedLayout, vk::ShaderStageFlagBits::eFragment, 0, {job.entityRes->cachedMaterialProps});
+        }
+        uint32_t instanceCount = std::max(1u, static_cast<uint32_t>(job.meshComp->GetInstanceCount()));
+        commandBuffers[currentFrame].drawIndexed(job.meshRes->indexCount, instanceCount, 0, 0, 0);
+        ++opaqueDrawsThisPass;
+      }
+    }
+    commandBuffers[currentFrame].endRendering();
+    // PASS 1b: PRESENT – composite path
+    {
+      // Transition off-screen to SHADER_READ for sampling (Sync2)
+      vk::ImageMemoryBarrier2 opaqueToSample2{
+        .srcStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput,
+        .srcAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite,
+        .dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader,
+        .dstAccessMask = vk::AccessFlagBits2::eShaderRead,
+        .oldLayout = vk::ImageLayout::eColorAttachmentOptimal,
+        .newLayout = vk::ImageLayout::eShaderReadOnlyOptimal,
+        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+        .image = *opaqueSceneColorImages[currentFrame],
+        .subresourceRange = {vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1}
+      };
+      vk::DependencyInfo depOpaqueToSample{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &opaqueToSample2};
+      commandBuffers[currentFrame].pipelineBarrier2(depOpaqueToSample);
+      if (currentFrame < opaqueSceneColorImageLayouts.size()) {
+        opaqueSceneColorImageLayouts[currentFrame] = vk::ImageLayout::eShaderReadOnlyOptimal;
+      }
+
+      // Make the swapchain image ready for color attachment output and clear it (Sync2)
+      vk::ImageMemoryBarrier2 swapchainToColor2{
+        .srcStageMask = vk::PipelineStageFlagBits2::eBottomOfPipe,
+        .srcAccessMask = vk::AccessFlagBits2::eNone,
+        .dstStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput,
+        .dstAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite | vk::AccessFlagBits2::eColorAttachmentRead,
+        .oldLayout = vk::ImageLayout::eUndefined,
+        .newLayout = vk::ImageLayout::eColorAttachmentOptimal,
+        .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+        .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+        .image = swapChainImages[imageIndex],
+        .subresourceRange = {vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1}
+      };
+      vk::DependencyInfo depSwapchainToColor{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &swapchainToColor2};
+      commandBuffers[currentFrame].pipelineBarrier2(depSwapchainToColor);
+
+      // Begin rendering to swapchain for composite
+      colorAttachments[0].imageView = *swapChainImageViews[imageIndex];
+      colorAttachments[0].loadOp = vk::AttachmentLoadOp::eClear; // clear before composing base layer (full-screen composite overwrites all pixels)
+      depthAttachment.loadOp = vk::AttachmentLoadOp::eDontCare; // no depth for composite
+      renderingInfo.renderArea = vk::Rect2D({0, 0}, swapChainExtent);
+      // IMPORTANT: Composite pass does not use a depth attachment. Avoid binding it to satisfy dynamic rendering VUIDs.
+      auto savedDepthPtr = renderingInfo.pDepthAttachment; // save to restore later
+      renderingInfo.pDepthAttachment = nullptr;
+      commandBuffers[currentFrame].beginRendering(renderingInfo);
+
+      // Bind composite pipeline
+      if (!!*compositePipeline) {
+        commandBuffers[currentFrame].bindPipeline(vk::PipelineBindPoint::eGraphics, *compositePipeline);
+      }
+      vk::Viewport vp(0.0f, 0.0f, static_cast<float>(swapChainExtent.width), static_cast<float>(swapChainExtent.height), 0.0f, 1.0f);
+      commandBuffers[currentFrame].setViewport(0, vp);
+      vk::Rect2D sc({0, 0}, swapChainExtent);
+      commandBuffers[currentFrame].setScissor(0, sc);
+
+      // Bind descriptor set 0 for the composite. During loading, force fallback to avoid sampling uninitialized off-screen color.
+      vk::DescriptorSet setComposite = (transparentDescriptorSets.empty() || IsLoading())
+                                         ? *transparentFallbackDescriptorSets[currentFrame]
+                                         : *transparentDescriptorSets[currentFrame];
+      commandBuffers[currentFrame].bindDescriptorSets(
+        vk::PipelineBindPoint::eGraphics,
+        *compositePipelineLayout,
+        0,
+        {setComposite},
+        {});
+
+      // Push exposure/gamma and sRGB flag
+      struct CompositePush {
+        float exposure;
+        float gamma;
+        int outputIsSRGB;
+        float _pad;
+      } pc{};
+      pc.exposure = std::clamp(this->exposure, 0.2f, 4.0f);
+      pc.gamma = this->gamma;
+      pc.outputIsSRGB = (swapChainImageFormat == vk::Format::eR8G8B8A8Srgb || swapChainImageFormat == vk::Format::eB8G8R8A8Srgb) ? 1 : 0;
+      commandBuffers[currentFrame].pushConstants<CompositePush>(*compositePipelineLayout, vk::ShaderStageFlagBits::eFragment, 0, pc);
+
+      // Draw fullscreen triangle
+      commandBuffers[currentFrame].draw(3, 1, 0, 0);
+
+      commandBuffers[currentFrame].endRendering();
+      // Restore depth attachment pointer for subsequent passes
+      renderingInfo.pDepthAttachment = savedDepthPtr;
+    }
+    // PASS 2: RENDER TRANSPARENT OBJECTS TO THE SWAPCHAIN
+    {
+      // Ensure depth attachment is bound again for the transparent pass
+      renderingInfo.pDepthAttachment = &depthAttachment;
+      colorAttachments[0].imageView = *swapChainImageViews[imageIndex];
+      colorAttachments[0].loadOp = vk::AttachmentLoadOp::eLoad;
+      depthAttachment.loadOp = vk::AttachmentLoadOp::eLoad;
+      renderingInfo.renderArea = vk::Rect2D({0, 0}, swapChainExtent);
+      commandBuffers[currentFrame].beginRendering(renderingInfo);
+      commandBuffers[currentFrame].setViewport(0, viewport);
+      commandBuffers[currentFrame].setScissor(0, scissor);
+
+      if (!transparentJobs.empty()) {
+        currentLayout = &pbrTransparentPipelineLayout;
+        vk::raii::Pipeline* activeTransparentPipeline = nullptr;
+
+        for (const auto& job : transparentJobs) {
+          vk::raii::Pipeline* desiredPipeline = job.entityRes->cachedIsGlass ? &glassGraphicsPipeline : &pbrBlendGraphicsPipeline;
+          if (desiredPipeline != activeTransparentPipeline) {
+            commandBuffers[currentFrame].bindPipeline(vk::PipelineBindPoint::eGraphics, **desiredPipeline);
+            activeTransparentPipeline = desiredPipeline;
+          }
+
+          vk::Buffer vb = GetVertexBuffer(this, job.meshComp, job.meshRes);
+          std::array<vk::Buffer, 2> buffers = {vb, *job.entityRes->instanceBuffer};
+          std::array<vk::DeviceSize, 2> offsets = {0, 0};
+          commandBuffers[currentFrame].bindVertexBuffers(0, buffers, offsets);
+          commandBuffers[currentFrame].bindIndexBuffer(*job.meshRes->indexBuffer, 0, vk::IndexType::eUint32);
+
+          vk::DescriptorSet set1 = (transparentDescriptorSets.empty() || IsLoading())
+                                     ? *transparentFallbackDescriptorSets[currentFrame]
+                                     : *transparentDescriptorSets[currentFrame];
+          commandBuffers[currentFrame].bindDescriptorSets(
+            vk::PipelineBindPoint::eGraphics,
+            **currentLayout,
+            0,
+            {*job.entityRes->pbrDescriptorSets[currentFrame], set1},
+            {});
+
+          MaterialProperties pushConstants = job.entityRes->cachedMaterialProps;
+          if (job.entityRes->cachedIsLiquid) {
+            pushConstants.transmissionFactor = 0.0f;
+          }
+          commandBuffers[currentFrame].pushConstants < MaterialProperties > (**currentLayout, vk::ShaderStageFlagBits::eFragment, 0,  {
+            pushConstants
+          }
+          )
+          ;
+          uint32_t instanceCountT = std::max(1u, static_cast<uint32_t>(job.meshComp->GetInstanceCount()));
+          commandBuffers[currentFrame].drawIndexed(job.meshRes->indexCount, instanceCountT, 0, 0, 0);
+        }
+      }
+      // End transparent rendering pass before any layout transitions (even if no transparent draws)
+      commandBuffers[currentFrame].endRendering();
+    } {
+      // Screenshot and final present transition are handled in rasterization path only
+      // Ray query path handles these separately
+
+      // Final layout transition for present (rasterization path only)
+      {
+        vk::ImageMemoryBarrier2 presentBarrier2{
+          .srcStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput,
+          .srcAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite,
+          .dstStageMask = vk::PipelineStageFlagBits2::eNone,
+          .dstAccessMask = {},
+          .oldLayout = vk::ImageLayout::eColorAttachmentOptimal,
+          .newLayout = vk::ImageLayout::ePresentSrcKHR,
+          .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+          .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+          .image = swapChainImages[imageIndex],
+          .subresourceRange = {vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1}
+        };
+        vk::DependencyInfo depToPresentFinal{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &presentBarrier2};
+        commandBuffers[currentFrame].pipelineBarrier2(depToPresentFinal);
+        if (imageIndex < swapChainImageLayouts.size())
+          swapChainImageLayouts[imageIndex] = presentBarrier2.newLayout;
+      }
+    }
+  } // skip rasterization when ray query has rendered
+
+  // Render ImGui UI overlay AFTER rasterization/ray query (must always execute regardless of render mode)
+  // ImGui expects Render() to be called every frame after NewFrame() - skipping it causes hangs
+  if (imguiSystem && !imguiSystem->IsFrameRendered()) {
+    // When ray query renders, swapchain is in PRESENT layout with valid content.
+    // When rasterization renders, swapchain is also in PRESENT layout with valid content.
+    // Transition to COLOR_ATTACHMENT with loadOp=eLoad to preserve existing pixels for ImGui overlay.
+    vk::ImageMemoryBarrier2 presentToColor{
+      .srcStageMask = vk::PipelineStageFlagBits2::eBottomOfPipe,
+      .srcAccessMask = vk::AccessFlagBits2::eNone,
+      .dstStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput,
+      .dstAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite | vk::AccessFlagBits2::eColorAttachmentRead,
+      .oldLayout = (imageIndex < swapChainImageLayouts.size()) ? swapChainImageLayouts[imageIndex] : vk::ImageLayout::eUndefined,
+      .newLayout = vk::ImageLayout::eColorAttachmentOptimal,
+      .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+      .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+      .image = swapChainImages[imageIndex],
+      .subresourceRange = {vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1}
+    };
+    vk::DependencyInfo depInfo{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &presentToColor};
+    commandBuffers[currentFrame].pipelineBarrier2(depInfo);
+    if (imageIndex < swapChainImageLayouts.size())
+      swapChainImageLayouts[imageIndex] = presentToColor.newLayout;
+
+    // Begin a dedicated render pass for ImGui (UI overlay)
+    vk::RenderingAttachmentInfo imguiColorAttachment{
+      .imageView = *swapChainImageViews[imageIndex],
+      .imageLayout = vk::ImageLayout::eColorAttachmentOptimal,
+      .loadOp = vk::AttachmentLoadOp::eLoad, // Load existing content
+      .storeOp = vk::AttachmentStoreOp::eStore
+    };
+    vk::RenderingInfo imguiRenderingInfo{
+      .renderArea = vk::Rect2D({0, 0}, swapChainExtent),
+      .layerCount = 1,
+      .colorAttachmentCount = 1,
+      .pColorAttachments = &imguiColorAttachment,
+      .pDepthAttachment = nullptr
+    };
+    commandBuffers[currentFrame].beginRendering(imguiRenderingInfo);
+
+    imguiSystem->Render(commandBuffers[currentFrame], currentFrame);
+
+    commandBuffers[currentFrame].endRendering();
+
+    // Transition swapchain back to PRESENT layout after ImGui renders
+    vk::ImageMemoryBarrier2 colorToPresent{
+      .srcStageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput,
+      .srcAccessMask = vk::AccessFlagBits2::eColorAttachmentWrite,
+      .dstStageMask = vk::PipelineStageFlagBits2::eBottomOfPipe,
+      .dstAccessMask = vk::AccessFlagBits2::eNone,
+      .oldLayout = vk::ImageLayout::eColorAttachmentOptimal,
+      .newLayout = vk::ImageLayout::ePresentSrcKHR,
+      .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+      .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+      .image = swapChainImages[imageIndex],
+      .subresourceRange = {vk::ImageAspectFlagBits::eColor, 0, 1, 0, 1}
+    };
+    vk::DependencyInfo depInfoBack{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &colorToPresent};
+    commandBuffers[currentFrame].pipelineBarrier2(depInfoBack);
+    if (imageIndex < swapChainImageLayouts.size())
+      swapChainImageLayouts[imageIndex] = colorToPresent.newLayout;
+  }
+
+  commandBuffers[currentFrame].end();
+  lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed);
+  isRecordingCmd.store(false, std::memory_order_relaxed);
+
+  // Submit and present (Synchronization 2)
+  uint64_t uploadsValueToWait = (framesSinceLoadingComplete > 100) ? lastCriticalUploadValue.load(std::memory_order_relaxed) : 0;
+  uint64_t lastSubmitted = uploadTimelineLastSubmitted.load(std::memory_order_relaxed);
+
+  // Use acquireSemaphoreIndex for imageAvailable semaphore (same as we used in acquireNextImage)
+  // Use imageIndex for renderFinished semaphore (matches the image being presented)
+
+  std::array<vk::SemaphoreSubmitInfo, 2> waitInfos = {
+    vk::SemaphoreSubmitInfo{
+      .semaphore = *imageAvailableSemaphores[acquireSemaphoreIndex],
+      .value = 0,
+      .stageMask = vk::PipelineStageFlagBits2::eColorAttachmentOutput,
+      .deviceIndex = 0
+    },
+    vk::SemaphoreSubmitInfo{
+      .semaphore = *uploadsTimeline,
+      .value = uploadsValueToWait,
+      .stageMask = vk::PipelineStageFlagBits2::eFragmentShader,
+      .deviceIndex = 0
+    }
+  };
+
+  vk::CommandBufferSubmitInfo cmdInfo{.commandBuffer = *commandBuffers[currentFrame], .deviceMask = 0};
+  vk::SemaphoreSubmitInfo signalInfo{.semaphore = *renderFinishedSemaphores[imageIndex], .value = 0, .stageMask = vk::PipelineStageFlagBits2::eAllGraphics, .deviceIndex = 0};
+  vk::SubmitInfo2 submit2{
+    .waitSemaphoreInfoCount = static_cast<uint32_t>(waitInfos.size()),
+    .pWaitSemaphoreInfos = waitInfos.data(),
+    .commandBufferInfoCount = 1,
+    .pCommandBufferInfos = &cmdInfo,
+    .signalSemaphoreInfoCount = 1,
+    .pSignalSemaphoreInfos = &signalInfo
+  };
+
+  if (framebufferResized.load(std::memory_order_relaxed)) {
+    vk::SubmitInfo2 emptySubmit2{}; {
+      std::lock_guard<std::mutex> lock(queueMutex);
+      graphicsQueue.submit2(emptySubmit2, *inFlightFences[currentFrame]);
+    }
+    recreateSwapChain();
+    return;
+  }
+
+  // Update watchdog BEFORE queue submit because submit can block waiting for GPU
+  // This proves frame CPU work is complete even if GPU queue is busy
+  lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed); {
+    std::lock_guard<std::mutex> lock(queueMutex);
+    graphicsQueue.submit2(submit2, *inFlightFences[currentFrame]);
+  }
+
+  watchdogProgressLabel.store("Render: Presenting", std::memory_order_relaxed);
+  vk::PresentInfoKHR presentInfo{.waitSemaphoreCount = 1, .pWaitSemaphores = &*renderFinishedSemaphores[imageIndex], .swapchainCount = 1, .pSwapchains = &*swapChain, .pImageIndices = &imageIndex};
+  vk::Result presentResult = vk::Result::eSuccess;
+  try {
+    // Note: Queue submission and presentation must be synchronized. 
+    // However, presentKHR is already thread-safe for the specific queue and we use semaphores for cross-queue sync.
+    // Moving it outside the mutex reduces contention and avoids watchdog triggers during long GPU stalls.
+    presentResult = presentQueue.presentKHR(presentInfo);
+  } catch (const vk::OutOfDateKHRError&) {
+    framebufferResized.store(true, std::memory_order_relaxed);
+  }
+  if (presentResult == vk::Result::eSuboptimalKHR || framebufferResized.load(std::memory_order_relaxed)) {
+    framebufferResized.store(false, std::memory_order_relaxed);
+    recreateSwapChain();
+  } else if (presentResult != vk::Result::eSuccess) {
+    throw std::runtime_error("Failed to present swap chain image");
+  }
+
+  currentFrame = (currentFrame + 1) % MAX_FRAMES_IN_FLIGHT;
+}
+
+// Public toggle APIs for planar reflections (keyboard/UI)
+//
+// Sets the planar-reflection mode to `enabled` and raises
+// reflectionResourcesDirty. The dirty flag is raised on every call, including
+// calls that leave the mode unchanged.
+void Renderer::SetPlanarReflectionsEnabled(bool enabled) {
+  // Store the requested mode and mark resources dirty so RTs are created/destroyed at the next safe point
+  // NOTE(review): the deferred create/destroy is performed by whoever consumes the flag — confirm where.
+  enablePlanarReflections = enabled;
+  reflectionResourcesDirty = true;
+}
+
+// Inverts the current planar-reflection mode. The negated value is routed
+// through SetPlanarReflectionsEnabled so the setter's bookkeeping runs as well.
+void Renderer::TogglePlanarReflections() {
+  const bool flipped = !enablePlanarReflections;
+  SetPlanarReflectionsEnabled(flipped);
+}
+
+// Records the compute-skinning work for every deformable entity into `cmd`.
+//
+// For each entity whose MeshComponent is flagged `isDeformable` in
+// g_meshComponentData, this copies the current joint matrices into the mapped
+// joint-matrix buffer, binds the skin (and morph) descriptor sets, pushes the
+// morph weights, dispatches one workgroup per 64 vertices and then records a
+// buffer barrier on the skinned output vertex buffer.
+//
+// Parameters:
+//   renderer   - renderer whose advanced state and mesh resources are looked up;
+//                its watchdog progress label/index are updated while recording.
+//   cmd        - command buffer the compute commands are recorded into.
+//   frameIndex - selects the per-frame skin descriptor set; entities with no
+//                set for this index are skipped.
+//   entities   - candidate entities; null entries and entities without a
+//                deformable MeshComponent are ignored.
+//
+// Returns early (recording nothing) when there are no deformable entities or
+// when `renderer` has no entry in g_rendererStates.
+void AdvancedRenderer_updateSkins(Renderer* renderer, vk::raii::CommandBuffer& cmd, uint32_t frameIndex, const std::vector<Entity*>& entities) {
+  renderer->watchdogProgressLabel.store("Render: updateSkins", std::memory_order_relaxed);
+  renderer->watchdogProgressIndex.store(0, std::memory_order_relaxed);
+
+  // Collect the deformable entities once so the recording loop only visits those.
+  std::vector<Entity*> deformableCache;
+  {
+    std::shared_lock<std::shared_mutex> lock(g_advancedStateMutex);
+    for (const auto& entity : entities) {
+      if (!entity) continue;
+      auto* meshComp = entity->GetComponent<MeshComponent>();
+      if (!meshComp) continue;
+      auto it = g_meshComponentData.find(meshComp);
+      if (it != g_meshComponentData.end() && it->second.isDeformable) {
+        deformableCache.push_back(entity);
+      }
+    }
+  }
+  if (deformableCache.empty()) return;
+
+  const AdvancedRendererState* state = nullptr;
+  {
+    std::shared_lock<std::shared_mutex> lock(g_advancedStateMutex);
+    auto stateIt = g_rendererStates.find(renderer);
+    if (stateIt == g_rendererStates.end()) return;
+    // NOTE(review): `state` (and `advRes` below) point into the global maps and are
+    // used after the lock is released — presumably entries are never erased or
+    // relocated while a frame is being recorded; confirm.
+    state = &stateIt->second;
+  }
+
+  cmd.bindPipeline(vk::PipelineBindPoint::eCompute, *state->skinPipeline);
+
+  uint32_t processedCount = 0;
+  for (Entity* entity : deformableCache) {
+    // Report progress to the watchdog every 10 entities.
+    if ((processedCount % 10) == 0) {
+      renderer->watchdogProgressIndex.store(processedCount, std::memory_order_relaxed);
+      AdvancedRenderer_KickWatchdog(renderer);
+    }
+    processedCount++;
+    auto* meshComp = entity->GetComponent<MeshComponent>();
+
+    AdvancedEntityResources* advRes = nullptr;
+    SkinPushConstants pc{};
+    {
+      // Everything that reads the mutex-guarded per-mesh animation data happens
+      // inside this scope; the lock is released on every exit path, including `continue`.
+      std::shared_lock<std::shared_mutex> lock(g_advancedStateMutex);
+
+      auto compDataIt = g_meshComponentData.find(meshComp);
+      if (compDataIt == g_meshComponentData.end()) continue;
+
+      if (renderer->meshResources.find(meshComp) == renderer->meshResources.end()) continue;
+
+      auto advIt = g_meshAdvancedResources.find(meshComp);
+      if (advIt == g_meshAdvancedResources.end()) continue;
+      advRes = &advIt->second;
+
+      if (!*advRes->outputVertexBuffer || !*advRes->jointMatricesBuffer) continue;
+
+      // Upload joint matrices if available (at most 256 matrices are copied).
+      const auto& matrices = compDataIt->second.jointMatrices;
+      if (!matrices.empty()) {
+        const size_t size = std::min(matrices.size(), size_t(256)) * sizeof(glm::mat4);
+        void* data = advRes->jointMatricesBufferAllocation->mappedPtr;
+        if (data) {
+          std::memcpy(data, matrices.data(), size);
+        }
+      }
+
+      // Populate morph weights (at most 24), assigning sequential slots.
+      const auto& mWeights = compDataIt->second.morphWeights;
+      pc.morphWeights.activeCount = static_cast<uint32_t>(std::min(mWeights.size(), size_t(24)));
+      for (uint32_t i = 0; i < pc.morphWeights.activeCount; ++i) {
+        pc.morphWeights.weights[i] = mWeights[i];
+        pc.morphIndices[i] = i;
+      }
+
+      // Request skeletal skinning only for meshes that actually carry joint indices.
+      // Morph-only meshes (e.g. AnimatedMorphCube) get 0 here so the shader is told
+      // not to apply a skin matrix to the morphed position.
+      pc.morphWeights.applySkinning = !compDataIt->second.jointIndices.empty() ? 1u : 0u;
+    }
+
+    // Bind descriptor sets: set 0 is the per-frame skin set, set 1 is the mesh's
+    // morph set or the renderer's dummy morph set when the mesh has none.
+    if (advRes->skinDescriptorSets.size() <= frameIndex) {
+      continue;
+    }
+    const std::array<vk::DescriptorSet, 2> computeSets = {
+      *advRes->skinDescriptorSets[frameIndex],
+      advRes->morphDescriptorSets.empty() ? *state->dummyMorphDescriptorSet : *advRes->morphDescriptorSets[0]
+    };
+    cmd.bindDescriptorSets(vk::PipelineBindPoint::eCompute, *state->skinPipelineLayout, 0, computeSets, {});
+
+    // Dispatch: one invocation per vertex, 64 vertices per workgroup (rounded up).
+    pc.vertexCount = static_cast<uint32_t>(meshComp->GetVertices().size());
+    cmd.pushConstants<SkinPushConstants>(*state->skinPipelineLayout, vk::ShaderStageFlagBits::eCompute, 0, pc);
+
+    uint32_t groupCount = (pc.vertexCount + 63) / 64;
+    cmd.dispatch(groupCount, 1, 1);
+
+    // Barrier: skinning compute write → vertex input read + AS build read (vertex/index data)
+    vk::BufferMemoryBarrier barrier{
+      .srcAccessMask = vk::AccessFlagBits::eShaderWrite,
+      .dstAccessMask = vk::AccessFlagBits::eVertexAttributeRead | vk::AccessFlagBits::eShaderRead,
+      .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+      .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+      .buffer = *advRes->outputVertexBuffer,
+      .offset = 0,
+      .size = VK_WHOLE_SIZE
+    };
+    cmd.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader,
+                        vk::PipelineStageFlagBits::eVertexInput | vk::PipelineStageFlagBits::eAccelerationStructureBuildKHR,
+                        {}, {}, {barrier}, {});
+  }
+}
\ No newline at end of file
diff --git a/attachments/advanced_gltf/renderer_resources.cpp b/attachments/advanced_gltf/renderer_resources.cpp
new file mode 100644
index 000000000..292975523
--- /dev/null
+++ b/attachments/advanced_gltf/renderer_resources.cpp
@@ -0,0 +1,4565 @@
+/* Copyright (c) 2025 Holochip Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <algorithm>
+#include <array>
+#include <cstring>
+#include <filesystem>
+#include <fstream>
+#include <functional>
+#include <iostream>
+#include <stdexcept>
+#include <ktx.h>
+#include <ranges>
+#include <chrono>
+#include <sstream>
+#include <vector>
+#include <unordered_map>
+#include <string>
+#include <stb_image.h>
+
+#include "mesh_component.h"
+#include "model_loader.h"
+
+#include "renderer.h"
+
+#include "renderer_advanced_types.h"
+#include "transform_component.h"
+
+// This file contains resource-related methods from the Renderer class
+
+// Out-of-class definitions of Renderer's static identifier strings for the
+// shared default PBR textures (plus the shared bright-red texture).
+const std::string Renderer::SHARED_DEFAULT_ALBEDO_ID{"__shared_default_albedo__"};
+const std::string Renderer::SHARED_DEFAULT_NORMAL_ID{"__shared_default_normal__"};
+const std::string Renderer::SHARED_DEFAULT_METALLIC_ROUGHNESS_ID{"__shared_default_metallic_roughness__"};
+const std::string Renderer::SHARED_DEFAULT_OCCLUSION_ID{"__shared_default_occlusion__"};
+const std::string Renderer::SHARED_DEFAULT_EMISSIVE_ID{"__shared_default_emissive__"};
+const std::string Renderer::SHARED_BRIGHT_RED_ID{"__shared_bright_red__"};
+
+// Builds the depth attachment at the current swap-chain extent.
+//
+// Steps, in order: pick the depth format via findDepthFormat(), allocate a
+// device-local, optimally tiled depth image through createImagePooled(), create
+// its depth-aspect view, and transition the image from eUndefined to
+// eDepthStencilAttachmentOptimal.
+//
+// Returns true on success. Any std::exception thrown along the way is logged
+// to std::cerr and reported as false.
+bool Renderer::createDepthResources() {
+  try {
+    const vk::Format fmt = findDepthFormat();
+    const auto width = swapChainExtent.width;
+    const auto height = swapChainExtent.height;
+
+    // Image + backing allocation come from the memory pool.
+    std::tie(depthImage, depthImageAllocation) = createImagePooled(width,
+                                                                   height,
+                                                                   fmt,
+                                                                   vk::ImageTiling::eOptimal,
+                                                                   vk::ImageUsageFlagBits::eDepthStencilAttachment,
+                                                                   vk::MemoryPropertyFlagBits::eDeviceLocal);
+
+    // View covering the depth aspect only.
+    depthImageView = createImageView(depthImage, fmt, vk::ImageAspectFlagBits::eDepth);
+
+    // Move the fresh image into the layout used when it is bound as a depth attachment.
+    transitionImageLayout(*depthImage, fmt, vk::ImageLayout::eUndefined, vk::ImageLayout::eDepthStencilAttachmentOptimal);
+  } catch (const std::exception& err) {
+    std::cerr << "Failed to create depth resources: " << err.what() << std::endl;
+    return false;
+  }
+  return true;
+}
+
+// Helper: map `fmt` to the sRGB or UNORM member of its format family.
+//
+// Recognised families are R8G8B8A8, BC1 RGB, BC1 RGBA, BC2, BC3 and BC7. Either
+// member of a family resolves to the family's sRGB format when `wantSRGB` is
+// true and to its UNORM format otherwise, so the block type is preserved.
+// Any format outside these families is returned unchanged.
+static vk::Format CoerceFormatSRGB(vk::Format fmt, bool wantSRGB) {
+  switch (fmt) {
+    case vk::Format::eR8G8B8A8Unorm:
+    case vk::Format::eR8G8B8A8Srgb:
+      return wantSRGB ? vk::Format::eR8G8B8A8Srgb : vk::Format::eR8G8B8A8Unorm;
+
+    case vk::Format::eBc1RgbUnormBlock:
+    case vk::Format::eBc1RgbSrgbBlock:
+      return wantSRGB ? vk::Format::eBc1RgbSrgbBlock : vk::Format::eBc1RgbUnormBlock;
+
+    case vk::Format::eBc1RgbaUnormBlock:
+    case vk::Format::eBc1RgbaSrgbBlock:
+      return wantSRGB ? vk::Format::eBc1RgbaSrgbBlock : vk::Format::eBc1RgbaUnormBlock;
+
+    case vk::Format::eBc2UnormBlock:
+    case vk::Format::eBc2SrgbBlock:
+      return wantSRGB ? vk::Format::eBc2SrgbBlock : vk::Format::eBc2UnormBlock;
+
+    case vk::Format::eBc3UnormBlock:
+    case vk::Format::eBc3SrgbBlock:
+      return wantSRGB ? vk::Format::eBc3SrgbBlock : vk::Format::eBc3UnormBlock;
+
+    case vk::Format::eBc7UnormBlock:
+    case vk::Format::eBc7SrgbBlock:
+      return wantSRGB ? vk::Format::eBc7SrgbBlock : vk::Format::eBc7UnormBlock;
+
+    default:
+      // No known sRGB/UNORM sibling: pass the format through untouched.
+      return fmt;
+  }
+}
+
+// Load a KTX2 texture from disk, upload it to the GPU and register it in the texture cache.
+//
+// texturePath_  Logical texture path or alias; it is passed through ResolveTextureId().
+// resources     Filled with the image, view and sampler while loading; on success it is
+//               moved into textureResources under the resolved ID.
+//
+// Returns true when the texture is present in the cache afterwards (already cached, loaded
+// by another thread while we waited, or loaded here) and false on any failure. Exceptions
+// derived from std::exception are caught, logged and reported as false.
+bool Renderer::createTextureImage(const std::string& texturePath_, TextureResources& resources) {
+  try {
+    ensureThreadLocalVulkanInit();
+    const std::string textureId = ResolveTextureId(texturePath_);
+
+    // Fast path: texture already loaded and cached; leave cache intact and return success
+    {
+      std::shared_lock<std::shared_mutex> texLock(textureResourcesMutex);
+      if (textureResources.find(textureId) != textureResources.end()) {
+        return true;
+      }
+    }
+
+    // Ensure command pool is initialized before any GPU work
+    if (!*commandPool) {
+      std::cerr << "createTextureImage: commandPool not initialized yet for '" << textureId << "'" << std::endl;
+      return false;
+    }
+
+    // Per-texture de-duplication (serialize loads of the same texture ID only).
+    // Waiting and marking happen under ONE lock acquisition: releasing the mutex between
+    // the wait and the insert would let two threads both decide to load the same ID.
+    {
+      std::unique_lock<std::mutex> lk(textureLoadStateMutex);
+      textureLoadStateCv.wait(lk, [&] { return !texturesLoading.contains(textureId); });
+      texturesLoading.insert(textureId);
+    }
+    // Clear the "loading" mark and wake waiters on every exit path
+    auto _loadingGuard = std::unique_ptr<void, std::function<void(void*)>>(reinterpret_cast<void *>(1),
+                                                                           [this, textureId](void*) {
+                                                                             std::lock_guard<std::mutex> lk(textureLoadStateMutex);
+                                                                             texturesLoading.erase(textureId);
+                                                                             textureLoadStateCv.notify_all();
+                                                                           });
+
+    // Double-check the cache: another thread may have finished this texture while we waited
+    {
+      std::shared_lock<std::shared_mutex> texLock(textureResourcesMutex);
+      if (textureResources.find(textureId) != textureResources.end()) {
+        return true;
+      }
+    }
+
+    // Resolve on-disk path (may differ from logical ID)
+    std::string resolvedPath = textureId;
+
+    // Non-KTX texture loading via file path is disabled to simplify pipeline.
+    if (!resolvedPath.ends_with(".ktx2")) {
+      std::cerr << "Unsupported non-KTX2 texture path: " << textureId << std::endl;
+      return false;
+    }
+
+    // Known suffix variants near the end of filename before extension
+    // Examples: *_c.ktx2, *_d.ktx2, *_cm.ktx2, *_diffuse.ktx2, *_basecolor.ktx2, *_albedo.ktx2
+    // If the file name ends in one of them, every sibling spelling that exists on disk is
+    // offered to `attempt` until one is accepted. Returns true if a sibling was accepted.
+    const auto trySiblingVariants = [](const std::filesystem::path& original,
+                                       const std::function<bool(const std::filesystem::path&)>& attempt) -> bool {
+      const std::string suffixes[] = {"_c", "_d", "_cm", "_diffuse", "_basecolor", "_albedo"};
+      const std::string extension = ".ktx2";
+      const std::string fname = original.filename().string();
+      const std::filesystem::path dir = original.parent_path();
+      for (const std::string& current : suffixes) {
+        const std::string key = current + extension;
+        if (!fname.ends_with(key)) {
+          continue;
+        }
+        const std::string stem = fname.substr(0, fname.size() - key.size());
+        for (const std::string& alt : suffixes) {
+          if (alt == current) {
+            continue;
+          }
+          const std::filesystem::path candidate = dir / (stem + alt + extension);
+          if (std::filesystem::exists(candidate) && attempt(candidate)) {
+            return true;
+          }
+        }
+        // No suffix in the list is a tail of another, so at most one can match
+        return false;
+      }
+      return false;
+    };
+
+    // If the path doesn't exist, try common fallback filename variants
+    if (!std::filesystem::exists(std::filesystem::path(resolvedPath))) {
+      (void) trySiblingVariants(std::filesystem::path(resolvedPath), [&](const std::filesystem::path& candidate) {
+        std::cout << "Resolved missing texture '" << resolvedPath << "' to existing file '" << candidate.string() << "'" << std::endl;
+        resolvedPath = candidate.string();
+        return true;
+      });
+    }
+
+    // libktx handle plus a scope guard so it is destroyed on every exit path, including
+    // exceptions thrown by the Vulkan calls below (previously those paths leaked it)
+    ktxTexture2* ktxTex = nullptr;
+    const auto releaseKtx = [&ktxTex]() {
+      if (ktxTex) {
+        ktxTexture_Destroy(reinterpret_cast<ktxTexture *>(ktxTex));
+        ktxTex = nullptr;
+      }
+    };
+    auto _ktxGuard = std::unique_ptr<void, std::function<void(void*)>>(reinterpret_cast<void *>(1),
+                                                                      [&releaseKtx](void*) { releaseKtx(); });
+
+    // Load KTX2 file
+    KTX_error_code result = ktxTexture2_CreateFromNamedFile(resolvedPath.c_str(),
+                                                            KTX_TEXTURE_CREATE_LOAD_IMAGE_DATA_BIT,
+                                                            &ktxTex);
+    if (result != KTX_SUCCESS) {
+      // Retry with sibling suffix variants if file exists but cannot be parsed/opened
+      (void) trySiblingVariants(std::filesystem::path(resolvedPath), [&](const std::filesystem::path& candidate) {
+        const std::string candStr = candidate.string();
+        std::cout << "Retrying KTX2 load with sibling candidate '" << candStr << "' for original '" << resolvedPath << "'" << std::endl;
+        result = ktxTexture2_CreateFromNamedFile(candStr.c_str(), KTX_TEXTURE_CREATE_LOAD_IMAGE_DATA_BIT, &ktxTex);
+        if (result != KTX_SUCCESS) {
+          return false;
+        }
+        resolvedPath = candStr; // Use the successfully opened candidate
+        return true;
+      });
+    }
+
+    // Bail out if we still failed to load
+    if (result != KTX_SUCCESS || ktxTex == nullptr) {
+      std::cerr << "Failed to load KTX2 texture: " << resolvedPath << " (error: " << result << ")" << std::endl;
+      return false;
+    }
+
+    // Read header-provided vkFormat (if already GPU-compressed/transcoded offline);
+    // eUndefined corresponds to VK_FORMAT_UNDEFINED (0)
+    const vk::Format headerFormat = static_cast<vk::Format>(ktxTex->vkFormat);
+
+    // Transcode target together with the Vulkan formats that describe its output.
+    // Remembering the choice keeps the image format in lock-step with the data that was
+    // actually produced instead of re-deriving it later from a second capability query.
+    struct TranscodeChoice {
+      ktx_transcode_fmt_e target;
+      vk::Format unorm;
+      vk::Format srgb;
+    };
+    // Fallback to RGBA32 if no BC formats are supported
+    TranscodeChoice chosen{KTX_TTF_RGBA32, vk::Format::eR8G8B8A8Unorm, vk::Format::eR8G8B8A8Srgb};
+
+    // Check if the texture needs BasisU transcoding; prefer GPU-compressed targets to save VRAM
+    const bool wasTranscoded = ktxTexture2_NeedsTranscoding(ktxTex);
+    if (wasTranscoded) {
+      // Select a compressed target supported by the device (prefer BC7 RGBA, then BC3 RGBA, then BC1 RGB)
+      const auto supportsFormat = [&](vk::Format f) {
+        auto props = physicalDevice.getFormatProperties(f);
+        return static_cast<bool>(props.optimalTilingFeatures & vk::FormatFeatureFlagBits::eSampledImage);
+      };
+      const TranscodeChoice preferred[] = {
+        {KTX_TTF_BC7_RGBA, vk::Format::eBc7UnormBlock, vk::Format::eBc7SrgbBlock},
+        {KTX_TTF_BC3_RGBA, vk::Format::eBc3UnormBlock, vk::Format::eBc3SrgbBlock},
+        {KTX_TTF_BC1_RGB, vk::Format::eBc1RgbUnormBlock, vk::Format::eBc1RgbSrgbBlock},
+      };
+      for (const TranscodeChoice& option : preferred) {
+        if (supportsFormat(option.unorm) || supportsFormat(option.srgb)) {
+          chosen = option;
+          break;
+        }
+      }
+      const KTX_error_code tcErr = ktxTexture2_TranscodeBasis(ktxTex, chosen.target, 0);
+      if (tcErr != KTX_SUCCESS) {
+        std::cerr << "Failed to transcode KTX2 BasisU texture: " << resolvedPath << " (error: " << tcErr << ")" << std::endl;
+        return false;
+      }
+    }
+
+    const uint32_t texWidth = ktxTex->baseWidth;
+    const uint32_t texHeight = ktxTex->baseHeight;
+
+    // Use all levels present in the KTX container
+    uint32_t mipLevels = std::max(1u, ktxTex->numLevels);
+
+    // Total data size across all mip levels
+    const vk::DeviceSize imageSize = ktxTexture_GetDataSize(reinterpret_cast<ktxTexture *>(ktxTex));
+
+    // Build copy regions for every mip level in the file
+    std::vector<vk::BufferImageCopy> copyRegions;
+    copyRegions.reserve(mipLevels);
+    for (uint32_t level = 0; level < mipLevels; ++level) {
+      ktx_size_t levelOffset = 0;
+      if (ktxTexture_GetImageOffset(reinterpret_cast<ktxTexture *>(ktxTex), level, 0, 0, &levelOffset) != KTX_SUCCESS) {
+        // Without a valid offset the copy would read the wrong bytes; give up instead
+        std::cerr << "Failed to query KTX2 mip level " << level << " offset: " << resolvedPath << std::endl;
+        return false;
+      }
+      const uint32_t w = std::max(1u, texWidth >> level);
+      const uint32_t h = std::max(1u, texHeight >> level);
+      copyRegions.push_back({
+        .bufferOffset = static_cast<vk::DeviceSize>(levelOffset),
+        .bufferRowLength = 0,
+        .bufferImageHeight = 0,
+        .imageSubresource = {
+          .aspectMask = vk::ImageAspectFlagBits::eColor,
+          .mipLevel = level,
+          .baseArrayLayer = 0,
+          .layerCount = 1
+        },
+        .imageOffset = {0, 0, 0},
+        .imageExtent = {w, h, 1}
+      });
+    }
+
+    // Create staging buffer
+    auto [stagingBuffer, stagingBufferMemory] = createBuffer(
+      imageSize,
+      vk::BufferUsageFlagBits::eTransferSrc,
+      vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+
+    // Copy entire KTX2 image data blob (all mip levels) to the staging buffer
+    void* data = stagingBufferMemory.mapMemory(0, imageSize);
+    memcpy(data, ktxTexture_GetData(reinterpret_cast<ktxTexture *>(ktxTex)), static_cast<size_t>(imageSize));
+    stagingBufferMemory.unmapMemory();
+
+    // Determine appropriate texture format
+    const bool wantSRGB = (Renderer::determineTextureFormat(textureId) == vk::Format::eR8G8B8A8Srgb);
+    vk::Format textureFormat;
+    bool alphaMaskedHint = false;
+    if (!wasTranscoded) {
+      // If the KTX2 provided a valid VkFormat and we did NOT transcode, respect its block type
+      // but coerce the sRGB/UNORM variant based on texture usage (baseColor vs data maps)
+      if (headerFormat != vk::Format::eUndefined) {
+        textureFormat = CoerceFormatSRGB(headerFormat, wantSRGB);
+      } else {
+        textureFormat = wantSRGB ? vk::Format::eR8G8B8A8Srgb : vk::Format::eR8G8B8A8Unorm;
+      }
+      // Can't easily scan alpha in compressed formats here; leave hint at default false
+    } else {
+      // We transcoded; use the format family recorded when the transcode target was chosen
+      textureFormat = wantSRGB ? chosen.srgb : chosen.unorm;
+      if (chosen.target == KTX_TTF_RGBA32) {
+        // We have CPU-visible RGBA data; detect alpha for masked hint
+        ktx_size_t offsetScan = 0;
+        if (ktxTexture_GetImageOffset(reinterpret_cast<ktxTexture *>(ktxTex), 0, 0, 0, &offsetScan) == KTX_SUCCESS) {
+          const uint8_t* rgba = ktxTexture_GetData(reinterpret_cast<ktxTexture *>(ktxTex)) + offsetScan;
+          const size_t pixelCount = static_cast<size_t>(texWidth) * static_cast<size_t>(texHeight);
+          for (size_t i = 0; i < pixelCount; ++i) {
+            if (rgba[i * 4 + 3] < 250) {
+              alphaMaskedHint = true;
+              break;
+            }
+          }
+        }
+      }
+    }
+
+    // Done reading libktx data: free it now rather than holding it during the GPU upload
+    releaseKtx();
+
+    // Create texture image using memory pool
+    const bool differentFamilies = queueFamilyIndices.graphicsFamily.value() != queueFamilyIndices.transferFamily.value();
+    std::vector<uint32_t> families;
+    if (differentFamilies) {
+      families = {queueFamilyIndices.graphicsFamily.value(), queueFamilyIndices.transferFamily.value()};
+    }
+    // mipLevels already reflects what the file contains. KTX2 files come with pre-generated
+    // mips, so we don't need TRANSFER_SRC for blit generation
+    const vk::ImageUsageFlags usageFlags = vk::ImageUsageFlagBits::eTransferDst | vk::ImageUsageFlagBits::eSampled;
+
+    const auto allocateImage = [&](uint32_t levels) {
+      auto [image, allocation] = createImagePooled(
+        texWidth,
+        texHeight,
+        textureFormat,
+        vk::ImageTiling::eOptimal,
+        usageFlags,
+        vk::MemoryPropertyFlagBits::eDeviceLocal,
+        /*mipLevels*/
+        levels,
+        differentFamilies ? vk::SharingMode::eConcurrent : vk::SharingMode::eExclusive,
+        families);
+      resources.textureImage = std::move(image);
+      resources.textureImageAllocation = std::move(allocation);
+    };
+
+    // Create image with OOM fallback: retry with mipLevels=1 if the full chain does not fit
+    try {
+      allocateImage(mipLevels);
+    } catch (const std::exception& e) {
+      std::cerr << "Image allocation failed (" << resolvedPath << "): " << e.what() << ". Retrying with mipLevels=1..." << std::endl;
+      mipLevels = 1;
+      // The image now only has level 0, so drop the copy regions that target the other levels
+      copyRegions.erase(copyRegions.begin() + 1, copyRegions.end());
+      allocateImage(mipLevels);
+    }
+
+    // GPU upload for this texture (copies all regions provided)
+    uploadImageFromStaging(*stagingBuffer, *resources.textureImage, textureFormat, copyRegions, mipLevels, imageSize);
+
+    // KTX2 files provide their own mip levels; no runtime generation needed
+    // Store the format and mipLevels for createTextureImageView
+    resources.format = textureFormat;
+    resources.mipLevels = mipLevels;
+    resources.alphaMaskedHint = alphaMaskedHint;
+
+    // Create texture image view
+    if (!createTextureImageView(resources)) {
+      return false;
+    }
+
+    // Create texture sampler
+    if (!createTextureSampler(resources)) {
+      return false;
+    }
+
+    // Add to texture resources map (guarded)
+    {
+      std::unique_lock<std::shared_mutex> texLock(textureResourcesMutex);
+      textureResources[textureId] = std::move(resources);
+    }
+
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create texture image: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Build the color image view for a texture whose image has already been created.
+// The view is created with the format and mip level count stored in `resources`
+// (not a hardcoded sRGB format). Returns false, after logging, if creation throws.
+bool Renderer::createTextureImageView(TextureResources& resources) {
+  try {
+    ensureThreadLocalVulkanInit();
+
+    const auto storedFormat = resources.format;
+    const auto storedMipLevels = resources.mipLevels;
+    resources.textureImageView = createImageView(resources.textureImage,
+                                                 storedFormat,
+                                                 vk::ImageAspectFlagBits::eColor,
+                                                 storedMipLevels);
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create texture image view: " << e.what() << std::endl;
+    return false;
+  }
+  return true;
+}
+
+// Create the shared 1x1 default PBR textures (to avoid creating hundreds of identical textures).
+// Textures are created in a fixed order; the first failure is logged and stops the sequence.
+// Returns true only when every texture was created.
+bool Renderer::createSharedDefaultPBRTextures() {
+  try {
+    // Upload one solid RGBA pixel under `id`; log `failureMessage` if that fails
+    const auto createSolid = [this](const auto& id,
+                                    unsigned char r,
+                                    unsigned char g,
+                                    unsigned char b,
+                                    unsigned char a,
+                                    const char* failureMessage) -> bool {
+      const unsigned char pixel[4] = {r, g, b, a};
+      if (LoadTextureFromMemory(id, pixel, 1, 1, 4)) {
+        return true;
+      }
+      std::cerr << failureMessage << std::endl;
+      return false;
+    };
+
+    // && short-circuits, so creation stops at the first texture that fails
+    return
+      // Albedo: mid grey, 50% alpha (128/255)
+      createSolid(SHARED_DEFAULT_ALBEDO_ID, 128, 128, 128, 128,
+                  "Failed to create shared default albedo texture") &&
+      // Flat normal: (0.5, 0.5, 1.0, 1.0) in 0-255 range
+      createSolid(SHARED_DEFAULT_NORMAL_ID, 128, 128, 255, 255,
+                  "Failed to create shared default normal texture") &&
+      // Metallic-roughness: (unused, roughness=1.0, metallic=0.0, alpha=1.0)
+      createSolid(SHARED_DEFAULT_METALLIC_ROUGHNESS_ID, 0, 255, 0, 255,
+                  "Failed to create shared default metallic-roughness texture") &&
+      // Occlusion: white - no occlusion
+      createSolid(SHARED_DEFAULT_OCCLUSION_ID, 255, 255, 255, 255,
+                  "Failed to create shared default occlusion texture") &&
+      // Emissive: black - no emission
+      createSolid(SHARED_DEFAULT_EMISSIVE_ID, 0, 0, 0, 255,
+                  "Failed to create shared default emissive texture") &&
+      // Bright red (R=255, G=0, B=0, A=255) for ball visibility
+      createSolid(SHARED_BRIGHT_RED_ID, 255, 0, 0, 255,
+                  "Failed to create shared bright red texture");
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create shared default PBR textures: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Create default texture resources (1x1 white sRGB texture) in defaultTextureResources.
+// Uploads one white pixel through a staging buffer, then creates the image view and sampler.
+// Returns false, after logging, if any step throws or the sampler cannot be created.
+bool Renderer::createDefaultTextureResources() {
+  try {
+    // Create a 1x1 white texture
+    const uint32_t width = 1;
+    const uint32_t height = 1;
+    const uint32_t pixelSize = 4; // RGBA
+    const uint8_t whitePixel[pixelSize] = {255, 255, 255, 255}; // White pixel (RGBA)
+    const vk::Format defaultFormat = vk::Format::eR8G8B8A8Srgb;
+
+    // Create staging buffer
+    const vk::DeviceSize imageSize = width * height * pixelSize;
+    auto [stagingBuffer, stagingBufferMemory] = createBuffer(
+      imageSize,
+      vk::BufferUsageFlagBits::eTransferSrc,
+      vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+
+    // Copy pixel data to staging buffer
+    void* data = stagingBufferMemory.mapMemory(0, imageSize);
+    memcpy(data, whitePixel, static_cast<size_t>(imageSize));
+    stagingBufferMemory.unmapMemory();
+
+    // Create texture image using memory pool
+    auto [textureImg, textureImgAllocation] = createImagePooled(
+      width,
+      height,
+      defaultFormat,
+      vk::ImageTiling::eOptimal,
+      vk::ImageUsageFlagBits::eTransferDst | vk::ImageUsageFlagBits::eSampled,
+      vk::MemoryPropertyFlagBits::eDeviceLocal);
+
+    defaultTextureResources.textureImage = std::move(textureImg);
+    defaultTextureResources.textureImageAllocation = std::move(textureImgAllocation);
+
+    // Record format and mip count explicitly, as createTextureImage does for loaded textures:
+    // createTextureSampler derives maxLod from mipLevels, so do not rely on the struct's defaults
+    defaultTextureResources.format = defaultFormat;
+    defaultTextureResources.mipLevels = 1;
+
+    // Transition image layout for copy
+    transitionImageLayout(
+      *defaultTextureResources.textureImage,
+      defaultFormat,
+      vk::ImageLayout::eUndefined,
+      vk::ImageLayout::eTransferDstOptimal);
+
+    // Copy buffer to image
+    vk::BufferImageCopy region{
+      .bufferOffset = 0,
+      .bufferRowLength = 0,
+      .bufferImageHeight = 0,
+      .imageSubresource = {
+        .aspectMask = vk::ImageAspectFlagBits::eColor,
+        .mipLevel = 0,
+        .baseArrayLayer = 0,
+        .layerCount = 1
+      },
+      .imageOffset = {0, 0, 0},
+      .imageExtent = {width, height, 1}
+    };
+    copyBufferToImage(
+      *stagingBuffer,
+      *defaultTextureResources.textureImage,
+      width,
+      height,
+      region);
+
+    // Transition image layout for shader access
+    transitionImageLayout(
+      *defaultTextureResources.textureImage,
+      defaultFormat,
+      vk::ImageLayout::eTransferDstOptimal,
+      vk::ImageLayout::eShaderReadOnlyOptimal);
+
+    // Create texture image view
+    defaultTextureResources.textureImageView = createImageView(
+      defaultTextureResources.textureImage,
+      defaultFormat,
+      vk::ImageAspectFlagBits::eColor);
+
+    // Create texture sampler
+    return createTextureSampler(defaultTextureResources);
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create default texture resources: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Create the sampler for a texture and store it in resources.textureSampler.
+// Trilinear filtering with repeat addressing; anisotropy is the configured
+// samplerMaxAnisotropy clamped to [1, device limit] and only enabled above 1.
+// The LOD range ends at the texture's last mip level.
+// Returns false, after logging, if sampler creation throws.
+bool Renderer::createTextureSampler(TextureResources& resources) {
+  try {
+    ensureThreadLocalVulkanInit();
+
+    // Anisotropy: clamp the requested value to what the physical device allows
+    const vk::PhysicalDeviceProperties deviceProps = physicalDevice.getProperties();
+    const float anisoLimit = deviceProps.limits.maxSamplerAnisotropy;
+    const float aniso = std::clamp(samplerMaxAnisotropy, 1.0f, anisoLimit);
+
+    // Highest addressable LOD is the index of the last mip level
+    float lastLod = 0.0f;
+    if (resources.mipLevels > 1) {
+      lastLod = static_cast<float>(resources.mipLevels - 1);
+    }
+
+    vk::SamplerCreateInfo info{};
+    info.magFilter = vk::Filter::eLinear;
+    info.minFilter = vk::Filter::eLinear;
+    info.mipmapMode = vk::SamplerMipmapMode::eLinear;
+    info.addressModeU = vk::SamplerAddressMode::eRepeat;
+    info.addressModeV = vk::SamplerAddressMode::eRepeat;
+    info.addressModeW = vk::SamplerAddressMode::eRepeat;
+    info.mipLodBias = 0.0f;
+    info.anisotropyEnable = aniso > 1.0f ? VK_TRUE : VK_FALSE;
+    info.maxAnisotropy = aniso;
+    info.compareEnable = VK_FALSE;
+    info.compareOp = vk::CompareOp::eAlways;
+    info.minLod = 0.0f;
+    info.maxLod = lastLod;
+    info.borderColor = vk::BorderColor::eIntOpaqueBlack;
+    info.unnormalizedCoordinates = VK_FALSE;
+
+    resources.textureSampler = vk::raii::Sampler(device, info);
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create texture sampler: " << e.what() << std::endl;
+    return false;
+  }
+  return true;
+}
+
+// Load a texture from a file path and register it in the texture cache.
+//
+// ".ktx2" paths are loaded through createTextureImage(); ".png", ".jpg" and ".jpeg" paths
+// are decoded with stb_image and uploaded through LoadTextureFromMemory(). Any other
+// extension is rejected. Extension matching is case-sensitive.
+//
+// Returns true when the texture is in the cache afterwards (already present or loaded
+// here); otherwise logs one line naming the step that failed and returns false.
+bool Renderer::LoadTexture(const std::string& texturePath) {
+  ensureThreadLocalVulkanInit();
+  if (texturePath.empty()) {
+    std::cerr << "LoadTexture: Empty texture path provided" << std::endl;
+    return false;
+  }
+
+  // Resolve aliases (canonical ID -> actual key)
+  const std::string resolvedId = ResolveTextureId(texturePath);
+
+  // Check if texture is already loaded
+  {
+    std::shared_lock<std::shared_mutex> texLock(textureResourcesMutex);
+    if (textureResources.find(resolvedId) != textureResources.end()) {
+      // Texture already loaded
+      return true;
+    }
+  }
+
+  // Reason reported if loading fails. It is set by whichever branch actually ran, so the
+  // log never shows a stale stb_image message for a failure stb_image had no part in.
+  const char* failureReason = "unknown";
+
+  if (resolvedId.ends_with(".ktx2")) {
+    // Temporary output; createTextureImage inserts into textureResources on success
+    TextureResources tempResources;
+    if (createTextureImage(resolvedId, tempResources)) {
+      return true;
+    }
+    failureReason = "KTX2 load failed (see earlier log output)";
+  } else if (resolvedId.ends_with(".png") || resolvedId.ends_with(".jpg") || resolvedId.ends_with(".jpeg")) {
+    // PNG/JPG: decode from file to RGBA, then upload from memory
+    int width = 0;
+    int height = 0;
+    int channels = 0;
+    unsigned char* data = stbi_load(resolvedId.c_str(), &width, &height, &channels, 4);
+    if (data) {
+      const bool uploaded = LoadTextureFromMemory(resolvedId, data, width, height, 4);
+      stbi_image_free(data);
+      if (uploaded) {
+        return true;
+      }
+      failureReason = "upload of decoded image failed (see earlier log output)";
+    } else {
+      const char* stbReason = stbi_failure_reason();
+      failureReason = stbReason ? stbReason : "unknown";
+    }
+  } else {
+    failureReason = "unsupported texture file extension";
+  }
+
+  std::cerr << "Failed to load texture: " << texturePath << " (resolved as " << resolvedId << "). Reason: " << failureReason << std::endl;
+  return false;
+}
+
+// Choose between sRGB and linear RGBA8 for a texture based on its identifier.
+// Color data is sampled as sRGB; everything else (normal, metallic-roughness,
+// occlusion, ...) is non-color data and stays linear so it is not gamma corrected.
+vk::Format Renderer::determineTextureFormat(const std::string& textureId) {
+  // Keyword matching is case-insensitive, so work on a lower-cased copy
+  std::string lowered = textureId;
+  for (char& ch : lowered) {
+    ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
+  }
+
+  // Substrings that mark color textures: BaseColor/Albedo/Diffuse, SpecGloss RGB, Emissive
+  const char* const colorKeywords[] = {
+    "basecolor",
+    "base_color",
+    "albedo",
+    "diffuse",
+    "specgloss",
+    "specularglossiness",
+    "emissive",
+  };
+  bool isColorTexture = false;
+  for (const char* keyword : colorKeywords) {
+    if (lowered.find(keyword) != std::string::npos) {
+      isColorTexture = true;
+      break;
+    }
+  }
+
+  // Shared default color textures are matched by their exact (case-sensitive) ID
+  if (!isColorTexture) {
+    isColorTexture = textureId == Renderer::SHARED_DEFAULT_ALBEDO_ID ||
+                     textureId == Renderer::SHARED_DEFAULT_EMISSIVE_ID ||
+                     textureId == Renderer::SHARED_BRIGHT_RED_ID;
+  }
+
+  if (isColorTexture) {
+    return vk::Format::eR8G8B8A8Srgb;
+  }
+  return vk::Format::eR8G8B8A8Unorm;
+}
+
+// Load texture from raw image data in memory
+bool Renderer::LoadTextureFromMemory(const std::string& textureId,
+                                     const unsigned char* imageData,
+                                     int width,
+                                     int height,
+                                     int channels) {
+  ensureThreadLocalVulkanInit();
+  const std::string resolvedId = ResolveTextureId(textureId);
+  std::cout << "[LoadTextureFromMemory] start id=" << textureId << " -> resolved=" << resolvedId << " size=" << width << "x" << height << " ch=" << channels << std::endl;
+  if (resolvedId.empty() || !imageData || width <= 0 || height <= 0 || channels <= 0) {
+    std::cerr << "LoadTextureFromMemory: Invalid parameters" << std::endl;
+    return false;
+  }
+
+  // Check if texture is already loaded
+  {
+    std::shared_lock<std::shared_mutex> texLock(textureResourcesMutex);
+    auto it = textureResources.find(resolvedId);
+    if (it != textureResources.end()) {
+      // Texture already loaded
+      return true;
+    }
+  }
+
+  // Per-texture de-duplication (serialize loads of the same texture ID only)
+  {
+    std::unique_lock<std::mutex> lk(textureLoadStateMutex);
+    while (texturesLoading.contains(resolvedId)) {
+      textureLoadStateCv.wait(lk);
+    }
+  }
+  // Double-check cache after the wait
+  {
+    std::shared_lock<std::shared_mutex> texLock(textureResourcesMutex);
+    auto it2 = textureResources.find(resolvedId);
+    if (it2 != textureResources.end()) {
+      return true;
+    }
+  }
+  // Mark as loading and ensure we notify on all exit paths
+  {
+    std::lock_guard<std::mutex> lk(textureLoadStateMutex);
+    texturesLoading.insert(resolvedId);
+  }
+  auto _loadingGuard = std::unique_ptr<void, std::function<void(void*)>>(reinterpret_cast<void *>(1),
+                                                                         [this, resolvedId](void*) {
+                                                                           std::lock_guard<std::mutex> lk(textureLoadStateMutex);
+                                                                           texturesLoading.erase(resolvedId);
+                                                                           textureLoadStateCv.notify_all();
+                                                                         });
+
+  try {
+    TextureResources resources;
+
+    // Calculate image size (ensure 4 channels for RGBA)
+    int targetChannels = 4; // Always use RGBA for consistency
+    vk::DeviceSize imageSize = width * height * targetChannels;
+
+    // Create a staging buffer
+    auto [stagingBuffer, stagingBufferMemory] = createBuffer(
+      imageSize,
+      vk::BufferUsageFlagBits::eTransferSrc,
+      vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+
+    // Copy and convert pixel data to staging buffer
+    void* data = stagingBufferMemory.mapMemory(0, imageSize);
+    auto* stagingData = static_cast<unsigned char *>(data);
+
+    if (channels == 4) {
+      // Already RGBA, direct copy
+      memcpy(stagingData, imageData, imageSize);
+    } else if (channels == 3) {
+      // RGB to RGBA conversion
+      for (int i = 0; i < width * height; ++i) {
+        stagingData[i * 4 + 0] = imageData[i * 3 + 0]; // R
+        stagingData[i * 4 + 1] = imageData[i * 3 + 1]; // G
+        stagingData[i * 4 + 2] = imageData[i * 3 + 2]; // B
+        stagingData[i * 4 + 3] = 255; // A
+      }
+    } else if (channels == 2) {
+      // Grayscale + Alpha to RGBA conversion
+      for (int i = 0; i < width * height; ++i) {
+        stagingData[i * 4 + 0] = imageData[i * 2 + 0]; // R (grayscale)
+        stagingData[i * 4 + 1] = imageData[i * 2 + 0]; // G (grayscale)
+        stagingData[i * 4 + 2] = imageData[i * 2 + 0]; // B (grayscale)
+        stagingData[i * 4 + 3] = imageData[i * 2 + 1]; // A (alpha)
+      }
+    } else if (channels == 1) {
+      // Grayscale to RGBA conversion
+      for (int i = 0; i < width * height; ++i) {
+        stagingData[i * 4 + 0] = imageData[i]; // R
+        stagingData[i * 4 + 1] = imageData[i]; // G
+        stagingData[i * 4 + 2] = imageData[i]; // B
+        stagingData[i * 4 + 3] = 255; // A
+      }
+    } else {
+      std::cerr << "LoadTextureFromMemory: Unsupported channel count: " << channels << std::endl;
+      stagingBufferMemory.unmapMemory();
+      return false;
+    }
+
+    // Analyze alpha to set alphaMaskedHint (treat as masked if any pixel alpha < ~1.0)
+    bool alphaMaskedHint = false;
+    for (int i = 0, n = width * height; i < n; ++i) {
+      if (stagingData[i * 4 + 3] < 250) {
+        alphaMaskedHint = true;
+        break;
+      }
+    }
+
+    stagingBufferMemory.unmapMemory();
+
+    // Determine the appropriate texture format based on the texture type
+    vk::Format textureFormat = determineTextureFormat(textureId);
+
+    // Create texture image using memory pool (with optional mipmap generation)
+    bool differentFamilies = queueFamilyIndices.graphicsFamily.value() != queueFamilyIndices.transferFamily.value();
+    std::vector<uint32_t> families;
+    if (differentFamilies) {
+      families = {queueFamilyIndices.graphicsFamily.value(), queueFamilyIndices.transferFamily.value()};
+    }
+    // Decide mip count and usage for memory textures; cap to reduce VRAM pressure
+    uint32_t mipLevels = 1;
+    if (width > 1 && height > 1) {
+      uint32_t full = static_cast<uint32_t>(std::floor(std::log2(std::max(width, height)))) + 1;
+      mipLevels = std::max(1u, std::min(full, maxAutoGeneratedMipLevels));
+    }
+    vk::ImageUsageFlags usageFlags = vk::ImageUsageFlagBits::eTransferDst | vk::ImageUsageFlagBits::eSampled;
+    if (mipLevels > 1)
+      usageFlags |= vk::ImageUsageFlagBits::eTransferSrc;
+
+    // OOM-resilient allocation
+    try {
+      auto [textureImg, textureImgAllocation] = createImagePooled(
+        width,
+        height,
+        textureFormat,
+        vk::ImageTiling::eOptimal,
+        usageFlags,
+        vk::MemoryPropertyFlagBits::eDeviceLocal,
+        mipLevels,
+        differentFamilies ? vk::SharingMode::eConcurrent : vk::SharingMode::eExclusive,
+        families);
+
+      resources.textureImage = std::move(textureImg);
+      resources.textureImageAllocation = std::move(textureImgAllocation);
+    } catch (const std::exception& e) {
+      std::cerr << "Image allocation failed (memory texture): " << e.what() << ". Retrying with mipLevels=1..." << std::endl;
+      mipLevels = 1;
+      usageFlags &= ~vk::ImageUsageFlagBits::eTransferSrc;
+      auto [textureImg, textureImgAllocation] = createImagePooled(
+        width,
+        height,
+        textureFormat,
+        vk::ImageTiling::eOptimal,
+        usageFlags,
+        vk::MemoryPropertyFlagBits::eDeviceLocal,
+        mipLevels,
+        differentFamilies ? vk::SharingMode::eConcurrent : vk::SharingMode::eExclusive,
+        families);
+      resources.textureImage = std::move(textureImg);
+      resources.textureImageAllocation = std::move(textureImgAllocation);
+    }
+
+    // GPU upload. Copy buffer to image in a single submit.
+    vk::BufferImageCopy region{
+      .bufferOffset = 0,
+      .bufferRowLength = 0,
+      .bufferImageHeight = 0,
+      .imageSubresource = {
+        .aspectMask = vk::ImageAspectFlagBits::eColor,
+        .mipLevel = 0,
+        .baseArrayLayer = 0,
+        .layerCount = 1
+      },
+      .imageOffset = {0, 0, 0},
+      .imageExtent = {static_cast<uint32_t>(width), static_cast<uint32_t>(height), 1}
+    };
+    uploadImageFromStaging(*stagingBuffer, *resources.textureImage, textureFormat, region, mipLevels, imageSize);
+
+    // Generate mip chain if requested and format is uncompressed RGBA
+    if (mipLevels > 1 && (textureFormat == vk::Format::eR8G8B8A8Srgb || textureFormat == vk::Format::eR8G8B8A8Unorm)) {
+      generateMipmaps(*resources.textureImage, textureFormat, width, height, mipLevels);
+    }
+
+    // Store the format for createTextureImageView
+    resources.format = textureFormat;
+    resources.mipLevels = mipLevels;
+    resources.alphaMaskedHint = alphaMaskedHint;
+
+    // Use resolvedId as the cache key to avoid duplicates
+    const std::string& cacheId = resolvedId;
+
+    // Create texture image view
+    resources.textureImageView = createImageView(
+      resources.textureImage,
+      textureFormat,
+      vk::ImageAspectFlagBits::eColor,
+      mipLevels);
+
+    // Create texture sampler
+    if (!createTextureSampler(resources)) {
+      return false;
+    }
+
+    // Add to texture resources map (guarded)
+    {
+      std::unique_lock<std::shared_mutex> texLock(textureResourcesMutex);
+      textureResources[cacheId] = std::move(resources);
+    }
+
+    std::cout << "Successfully loaded texture from memory: " << cacheId
+        << " (" << width << "x" << height << ", " << channels << " channels)" << std::endl;
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to load texture from memory: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Create the GPU resources for a mesh component.
+//
+// If an entry for meshComponent already exists in meshResources nothing is recreated;
+// when deferUpload is false, any staging copies still pending from an earlier deferred
+// call are submitted with copyBuffer and the staging objects are released.
+//
+// Otherwise the vertex/index data is written into host-visible staging buffers,
+// device-local vertex/index buffers are created through createBufferPooled, and the
+// entry is stored in meshResources. For meshes flagged deformable in
+// g_meshComponentData the skinning/morph buffers and descriptor sets are built too and
+// published in g_meshAdvancedResources.
+//
+// meshComponent: mesh whose vertices/indices are uploaded; also used as the map key.
+// deferUpload:   true  -> keep the staging buffers in the resource records so the
+//                         copies can be submitted later;
+//                false -> issue the copyBuffer calls before returning.
+//
+// Returns true on success or when resources already existed. Returns false when the
+// mesh has no vertices or indices, when a required descriptor set layout is not
+// available, or when a std::exception is caught (partially built entries are erased).
+bool Renderer::createMeshResources(MeshComponent* meshComponent, bool deferUpload) {
+  ensureThreadLocalVulkanInit();
+  try {
+    // If resources already exist, no need to recreate them.
+    auto it = meshResources.find(meshComponent);
+    if (it != meshResources.end()) {
+      // If we previously created this mesh with deferred uploads, but the caller now
+      // wants an immediate/ready mesh (e.g., during loading or before AS build),
+      // flush the pending staging copies right here.
+      if (!deferUpload) {
+        MeshResources& res = it->second;
+        if (res.vertexBufferSizeBytes > 0 && !!*res.stagingVertexBuffer && !!*res.vertexBuffer) {
+          copyBuffer(res.stagingVertexBuffer, res.vertexBuffer, res.vertexBufferSizeBytes);
+          res.stagingVertexBuffer = vk::raii::Buffer(nullptr);
+          res.stagingVertexBufferMemory = vk::raii::DeviceMemory(nullptr);
+          res.vertexBufferSizeBytes = 0;
+        }
+        if (res.indexBufferSizeBytes > 0 && !!*res.stagingIndexBuffer && !!*res.indexBuffer) {
+          copyBuffer(res.stagingIndexBuffer, res.indexBuffer, res.indexBufferSizeBytes);
+          res.stagingIndexBuffer = vk::raii::Buffer(nullptr);
+          res.stagingIndexBufferMemory = vk::raii::DeviceMemory(nullptr);
+          res.indexBufferSizeBytes = 0;
+        }
+      }
+      return true;
+    }
+
+    // Get mesh data
+    const auto& vertices = meshComponent->GetVertices();
+    const auto& indices = meshComponent->GetIndices();
+
+    if (vertices.empty() || indices.empty()) {
+      std::cerr << "Mesh has no vertices or indices" << std::endl;
+      return false;
+    }
+
+    // --- 1. Create and fill per-mesh staging buffers on the host ---
+    // Use direct RAII allocation (not pool) so vkFreeMemory is called when the
+    // staging buffer goes out of scope.  Pool allocations have no custom deleter,
+    // so setting unique_ptr<Allocation> to nullptr leaks the pool free-list entry
+    // permanently.  With 554 meshes x 2 staging buffers = 1108 leaked allocations
+    // the 128 MB staging block is exhausted and a second vkAllocateMemory is triggered
+    // mid-frame — causing the NVIDIA driver to stall for hundreds of milliseconds.
+    vk::DeviceSize vertexBufferSize = sizeof(vertices[0]) * vertices.size();
+    auto [stagingVertexBuffer, stagingVertexBufferMemory] = createBuffer(
+      vertexBufferSize,
+      vk::BufferUsageFlagBits::eTransferSrc,
+      vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+
+    {
+      void* vertexData = stagingVertexBufferMemory.mapMemory(0, vertexBufferSize);
+      std::memcpy(vertexData, vertices.data(), static_cast<size_t>(vertexBufferSize));
+      stagingVertexBufferMemory.unmapMemory();
+    }
+
+    vk::DeviceSize indexBufferSize = sizeof(indices[0]) * indices.size();
+    auto [stagingIndexBuffer, stagingIndexBufferMemory] = createBuffer(
+      indexBufferSize,
+      vk::BufferUsageFlagBits::eTransferSrc,
+      vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+
+    {
+      void* indexData = stagingIndexBufferMemory.mapMemory(0, indexBufferSize);
+      std::memcpy(indexData, indices.data(), static_cast<size_t>(indexBufferSize));
+      stagingIndexBufferMemory.unmapMemory();
+    }
+
+    // --- 2. Create device-local vertex and index buffers via the memory pool ---
+    // Add ray tracing flags: eShaderDeviceAddress for vkGetBufferDeviceAddress and
+    // eAccelerationStructureBuildInputReadOnlyKHR for acceleration structure building
+    vk::BufferUsageFlags vbUsage = vk::BufferUsageFlagBits::eTransferDst | vk::BufferUsageFlagBits::eVertexBuffer |
+      vk::BufferUsageFlagBits::eShaderDeviceAddress | vk::BufferUsageFlagBits::eAccelerationStructureBuildInputReadOnlyKHR;
+
+    // Deformable meshes additionally bind the base vertex buffer as a storage buffer
+    // (skin descriptor binding 0 below).
+    if (IsMeshComponentDeformable(meshComponent)) {
+      vbUsage |= vk::BufferUsageFlagBits::eStorageBuffer;
+    }
+
+    auto [vertexBuffer, vertexBufferAllocation] = createBufferPooled(
+      vertexBufferSize,
+      vbUsage,
+      vk::MemoryPropertyFlagBits::eDeviceLocal);
+
+    auto [indexBuffer, indexBufferAllocation] = createBufferPooled(
+      indexBufferSize,
+      vk::BufferUsageFlagBits::eTransferDst | vk::BufferUsageFlagBits::eIndexBuffer |
+      vk::BufferUsageFlagBits::eShaderDeviceAddress | vk::BufferUsageFlagBits::eAccelerationStructureBuildInputReadOnlyKHR,
+      vk::MemoryPropertyFlagBits::eDeviceLocal);
+
+    // --- 3. Register the device-local buffers; the copies are issued or deferred at the end ---
+    MeshResources resources;
+    resources.vertexBuffer = std::move(vertexBuffer);
+    resources.vertexBufferAllocation = std::move(vertexBufferAllocation);
+    resources.indexBuffer = std::move(indexBuffer);
+    resources.indexBufferAllocation = std::move(indexBufferAllocation);
+    resources.indexCount = static_cast<uint32_t>(indices.size());
+
+    // Add to mesh resources map
+    meshResources[meshComponent] = std::move(resources);
+    MeshResources& finalRes = meshResources[meshComponent];
+
+    // --- 4. Handle deformable mesh resources (Skinning/Morphing) ---
+    // Snapshot deformable data under a shared lock, then release the lock
+    // before performing any GPU work (createBufferPooled, copyBuffer, descriptor
+    // allocations). This prevents deadlocks with the render thread which holds
+    // queueMutex / takes shared locks on g_advancedStateMutex during rendering.
+    bool isDeformable = false;
+    std::vector<glm::uvec4> jointIndices;
+    std::vector<glm::vec4> jointWeights;
+    std::vector<std::vector<glm::vec3>> morphPositions;
+    {
+      std::shared_lock<std::shared_mutex> snapLock(g_advancedStateMutex);
+      auto itComp = g_meshComponentData.find(meshComponent);
+      if (itComp != g_meshComponentData.end() && itComp->second.isDeformable) {
+        isDeformable = true;
+        jointIndices = itComp->second.jointIndices;
+        jointWeights = itComp->second.jointWeights;
+        morphPositions = itComp->second.morphTargetPositions;
+      }
+    }
+    if (isDeformable) {
+      // Build deformable resources WITHOUT holding g_advancedStateMutex.
+      AdvancedEntityResources advResources;
+      advResources.isDeformable = true;
+
+      // Output vertex buffer (standard layout for drawing)
+      vk::DeviceSize outputVertexBufferSize = sizeof(OutputVertex) * vertices.size();
+      auto [outVB, outVBAlloc] = createBufferPooled(
+        outputVertexBufferSize,
+        vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eVertexBuffer |
+        vk::BufferUsageFlagBits::eShaderDeviceAddress | vk::BufferUsageFlagBits::eAccelerationStructureBuildInputReadOnlyKHR,
+        vk::MemoryPropertyFlagBits::eDeviceLocal);
+
+      advResources.outputVertexBuffer = std::move(outVB);
+      advResources.outputVertexBufferAllocation = std::move(outVBAlloc);
+      std::cout << "[Resources] Allocated output vertex buffer for skinned mesh: " << meshComponent << " (size: " << outputVertexBufferSize << ")" << std::endl;
+
+      // Matrix palette buffer (assume max 256 joints for now)
+      vk::DeviceSize jointBufferSize = sizeof(glm::mat4) * 256;
+      auto [jointB, jointBAlloc] = createBufferPooled(
+        jointBufferSize,
+        vk::BufferUsageFlagBits::eStorageBuffer,
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+
+      advResources.jointMatricesBuffer = std::move(jointB);
+      advResources.jointMatricesBufferAllocation = std::move(jointBAlloc);
+
+      // --- Parallel skinning data buffers ---
+      if (!jointIndices.empty() && !jointWeights.empty()) {
+        vk::DeviceSize indicesSize = jointIndices.size() * sizeof(glm::uvec4);
+        vk::DeviceSize weightsSize = jointWeights.size() * sizeof(glm::vec4);
+
+        auto [indB, indBAlloc] = createBufferPooled(
+          indicesSize,
+          vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferDst,
+          vk::MemoryPropertyFlagBits::eDeviceLocal);
+
+        auto [weiB, weiBAlloc] = createBufferPooled(
+          weightsSize,
+          vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferDst,
+          vk::MemoryPropertyFlagBits::eDeviceLocal);
+
+        advResources.jointIndicesBuffer = std::move(indB);
+        advResources.jointIndicesBufferAllocation = std::move(indBAlloc);
+        advResources.jointWeightsBuffer = std::move(weiB);
+        advResources.jointWeightsBufferAllocation = std::move(weiBAlloc);
+
+        // Upload now using staging buffers (or park them for the deferred copy)
+        {
+          auto [stagingIndicesB, stagingIndicesAlloc] = createBufferPooled(
+            indicesSize, vk::BufferUsageFlagBits::eTransferSrc,
+            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+          std::memcpy(stagingIndicesAlloc->mappedPtr, jointIndices.data(), static_cast<size_t>(indicesSize));
+
+          if (deferUpload) {
+            advResources.stagingJointIndicesBuffer = std::move(stagingIndicesB);
+            advResources.stagingJointIndicesAllocation = std::move(stagingIndicesAlloc);
+            advResources.jointIndicesSize = indicesSize;
+          } else {
+            copyBuffer(stagingIndicesB, advResources.jointIndicesBuffer, indicesSize);
+          }
+        }
+        {
+          auto [stagingWeightsB, stagingWeightsAlloc] = createBufferPooled(
+            weightsSize, vk::BufferUsageFlagBits::eTransferSrc,
+            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+          std::memcpy(stagingWeightsAlloc->mappedPtr, jointWeights.data(), static_cast<size_t>(weightsSize));
+
+          if (deferUpload) {
+            advResources.stagingJointWeightsBuffer = std::move(stagingWeightsB);
+            advResources.stagingJointWeightsAllocation = std::move(stagingWeightsAlloc);
+            advResources.jointWeightsSize = weightsSize;
+          } else {
+            copyBuffer(stagingWeightsB, advResources.jointWeightsBuffer, weightsSize);
+          }
+        }
+      } else {
+        // Morph-only (non-skinned) deformable mesh: it has no joint data, but the skin
+        // descriptor set still binds joint index/weight buffers (bindings 3 & 4). Create
+        // valid zero-filled buffers sized to the vertex count so the descriptor writes
+        // reference real VkBuffers (avoids "Invalid VkBuffer" validation errors). The shader
+        // ignores them because applySkinning is 0 for these meshes.
+        const size_t vcount = std::max<size_t>(1, vertices.size());
+        vk::DeviceSize zIndicesSize = vcount * sizeof(glm::uvec4);
+        vk::DeviceSize zWeightsSize = vcount * sizeof(glm::vec4);
+
+        auto [zIndB, zIndBAlloc] = createBufferPooled(zIndicesSize,
+          vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferDst,
+          vk::MemoryPropertyFlagBits::eDeviceLocal);
+        auto [zWeiB, zWeiBAlloc] = createBufferPooled(zWeightsSize,
+          vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferDst,
+          vk::MemoryPropertyFlagBits::eDeviceLocal);
+        advResources.jointIndicesBuffer = std::move(zIndB);
+        advResources.jointIndicesBufferAllocation = std::move(zIndBAlloc);
+        advResources.jointWeightsBuffer = std::move(zWeiB);
+        advResources.jointWeightsBufferAllocation = std::move(zWeiBAlloc);
+
+        std::vector<glm::uvec4> zeroIdx(vcount, glm::uvec4(0u));
+        std::vector<glm::vec4> zeroWei(vcount, glm::vec4(0.0f));
+        {
+          auto [sIdxB, sIdxAlloc] = createBufferPooled(zIndicesSize, vk::BufferUsageFlagBits::eTransferSrc,
+            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+          std::memcpy(sIdxAlloc->mappedPtr, zeroIdx.data(), static_cast<size_t>(zIndicesSize));
+          if (deferUpload) {
+            advResources.stagingJointIndicesBuffer = std::move(sIdxB);
+            advResources.stagingJointIndicesAllocation = std::move(sIdxAlloc);
+            advResources.jointIndicesSize = zIndicesSize;
+          } else {
+            copyBuffer(sIdxB, advResources.jointIndicesBuffer, zIndicesSize);
+          }
+        }
+        {
+          auto [sWeiB, sWeiAlloc] = createBufferPooled(zWeightsSize, vk::BufferUsageFlagBits::eTransferSrc,
+            vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+          std::memcpy(sWeiAlloc->mappedPtr, zeroWei.data(), static_cast<size_t>(zWeightsSize));
+          if (deferUpload) {
+            advResources.stagingJointWeightsBuffer = std::move(sWeiB);
+            advResources.stagingJointWeightsAllocation = std::move(sWeiAlloc);
+            advResources.jointWeightsSize = zWeightsSize;
+          } else {
+            copyBuffer(sWeiB, advResources.jointWeightsBuffer, zWeightsSize);
+          }
+        }
+      }
+
+      // Allocate descriptor sets for skinning (one per frame in flight)
+      // Access g_rendererStates briefly under a shared lock to obtain layouts.
+      vk::DescriptorSetLayout skinLayoutHandle{};
+      vk::DescriptorSetLayout morphLayoutHandle{};
+      {
+        std::shared_lock<std::shared_mutex> stateLock(g_advancedStateMutex);
+        auto stIt = g_rendererStates.find(this);
+        if (stIt != g_rendererStates.end()) {
+          skinLayoutHandle = *stIt->second.skinDescriptorSetLayout;
+          morphLayoutHandle = *stIt->second.morphDescriptorSetLayout;
+        }
+      }
+      if (!skinLayoutHandle) {
+        // No renderer state (or no skinning layout) was found for this renderer.
+        // Allocating descriptor sets with a null layout handle is invalid, so undo
+        // the registration made in step 3 and report failure. finalRes is not used
+        // after the erase.
+        std::cerr << "Failed to create mesh resources: skinning descriptor set layout is unavailable" << std::endl;
+        meshResources.erase(meshComponent);
+        return false;
+      }
+      std::vector<vk::DescriptorSetLayout> skinLayouts(MAX_FRAMES_IN_FLIGHT, skinLayoutHandle);
+      vk::DescriptorSetAllocateInfo skinAllocInfo{
+        .descriptorPool = *descriptorPool,
+        .descriptorSetCount = MAX_FRAMES_IN_FLIGHT,
+        .pSetLayouts = skinLayouts.data()
+      };
+
+      auto sets = device.allocateDescriptorSets(skinAllocInfo);
+      for (auto& s : sets) {
+        advResources.skinDescriptorSets.emplace_back(std::move(s));
+      }
+
+      // Update descriptor sets: bindings 0..4 are all storage buffers
+      // (base vertices, output vertices, joint matrices, joint indices, joint weights).
+      for (size_t i = 0; i < MAX_FRAMES_IN_FLIGHT; i++) {
+        vk::DescriptorBufferInfo baseInfo{*finalRes.vertexBuffer, 0, vertexBufferSize};
+        vk::DescriptorBufferInfo outInfo{*advResources.outputVertexBuffer, 0, outputVertexBufferSize};
+        vk::DescriptorBufferInfo jointInfo{*advResources.jointMatricesBuffer, 0, jointBufferSize};
+        vk::DescriptorBufferInfo indicesInfo{*advResources.jointIndicesBuffer, 0, VK_WHOLE_SIZE};
+        vk::DescriptorBufferInfo weightsInfo{*advResources.jointWeightsBuffer, 0, VK_WHOLE_SIZE};
+
+        std::array<vk::WriteDescriptorSet, 5> writes;
+        writes[0].setDstSet(*advResources.skinDescriptorSets[i])
+                 .setDstBinding(0)
+                 .setDescriptorCount(1)
+                 .setDescriptorType(vk::DescriptorType::eStorageBuffer)
+                 .setPBufferInfo(&baseInfo);
+
+        writes[1].setDstSet(*advResources.skinDescriptorSets[i])
+                 .setDstBinding(1)
+                 .setDescriptorCount(1)
+                 .setDescriptorType(vk::DescriptorType::eStorageBuffer)
+                 .setPBufferInfo(&outInfo);
+
+        writes[2].setDstSet(*advResources.skinDescriptorSets[i])
+                 .setDstBinding(2)
+                 .setDescriptorCount(1)
+                 .setDescriptorType(vk::DescriptorType::eStorageBuffer)
+                 .setPBufferInfo(&jointInfo);
+
+        writes[3].setDstSet(*advResources.skinDescriptorSets[i])
+                 .setDstBinding(3)
+                 .setDescriptorCount(1)
+                 .setDescriptorType(vk::DescriptorType::eStorageBuffer)
+                 .setPBufferInfo(&indicesInfo);
+
+        writes[4].setDstSet(*advResources.skinDescriptorSets[i])
+                 .setDstBinding(4)
+                 .setDescriptorCount(1)
+                 .setDescriptorType(vk::DescriptorType::eStorageBuffer)
+                 .setPBufferInfo(&weightsInfo);
+
+        device.updateDescriptorSets(writes, {});
+      }
+
+      // --- Morph Target Buffers ---
+      if (!morphPositions.empty()) {
+        if (!morphLayoutHandle) {
+          // Same reasoning as for the skinning layout: a null layout handle cannot
+          // be used to allocate the morph descriptor set.
+          std::cerr << "Failed to create mesh resources: morph descriptor set layout is unavailable" << std::endl;
+          meshResources.erase(meshComponent);
+          return false;
+        }
+
+        for (const auto& targetPos : morphPositions) {
+          vk::DeviceSize targetSize = targetPos.size() * sizeof(glm::vec3);
+          auto [targetB, targetBAlloc] = createBufferPooled(
+            targetSize,
+            vk::BufferUsageFlagBits::eStorageBuffer | vk::BufferUsageFlagBits::eTransferDst,
+            vk::MemoryPropertyFlagBits::eDeviceLocal);
+
+          advResources.morphTargetBuffers.push_back(std::move(targetB));
+          advResources.morphTargetBufferAllocations.push_back(std::move(targetBAlloc));
+
+          // Upload
+          {
+            auto [stagingB, stagingAlloc] = createBufferPooled(
+              targetSize, vk::BufferUsageFlagBits::eTransferSrc,
+              vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+
+            std::memcpy(stagingAlloc->mappedPtr, targetPos.data(), static_cast<size_t>(targetSize));
+
+            if (deferUpload) {
+              advResources.stagingMorphTargetBuffers.push_back(std::move(stagingB));
+              advResources.stagingMorphTargetAllocations.push_back(std::move(stagingAlloc));
+              advResources.morphTargetSizes.push_back(targetSize);
+            } else {
+              copyBuffer(stagingB, advResources.morphTargetBuffers.back(), targetSize);
+            }
+          }
+        }
+
+        // Allocate descriptor sets for morphing; the descriptor count of the set is
+        // supplied through the variable-descriptor-count structure chained via pNext.
+        uint32_t targetCount = static_cast<uint32_t>(advResources.morphTargetBuffers.size());
+        vk::DescriptorSetVariableDescriptorCountAllocateInfo variableDescriptorCountAllocInfo{
+          .descriptorSetCount = 1,
+          .pDescriptorCounts = &targetCount
+        };
+
+        vk::DescriptorSetAllocateInfo morphAllocInfo{
+          .pNext = &variableDescriptorCountAllocInfo,
+          .descriptorPool = *descriptorPool,
+          .descriptorSetCount = 1,
+          .pSetLayouts = &morphLayoutHandle
+        };
+
+        auto mSets = device.allocateDescriptorSets(morphAllocInfo);
+        advResources.morphDescriptorSets.emplace_back(std::move(mSets[0]));
+
+        // Update morph descriptor set
+        std::vector<vk::DescriptorBufferInfo> morphInfos;
+        for (auto& b : advResources.morphTargetBuffers) {
+          morphInfos.push_back(vk::DescriptorBufferInfo{*b, 0, VK_WHOLE_SIZE});
+        }
+
+        vk::WriteDescriptorSet mWrite{};
+        mWrite.setDstSet(*advResources.morphDescriptorSets[0])
+              .setDstBinding(0)
+              .setDstArrayElement(0)
+              .setDescriptorCount(targetCount)
+              .setDescriptorType(vk::DescriptorType::eStorageBuffer)
+              .setPBufferInfo(morphInfos.data());
+
+        device.updateDescriptorSets(mWrite, {});
+        std::cout << "[Resources] Allocated " << targetCount << " morph target buffers for mesh: " << meshComponent << std::endl;
+      }
+
+      // Publish the locally-built advanced resources in the global map under a
+      // brief unique lock. Any entry already stored for this mesh is replaced by
+      // the move-assignment.
+      {
+        std::unique_lock<std::shared_mutex> lock(g_advancedStateMutex);
+        auto& dest = g_meshAdvancedResources[meshComponent];
+        dest = std::move(advResources);
+      }
+    }
+
+    if (deferUpload) {
+      // Keep staging buffers alive until the deferred copy completes.
+      // Both the vk::raii::Buffer and the vk::raii::DeviceMemory must survive
+      // until preAllocateEntityResourcesBatch() runs copyBuffer and then nulls
+      // the fields.  The DeviceMemory RAII object calls vkFreeMemory on destruction,
+      // so this correctly returns host memory to the OS — no pool leak.
+      finalRes.stagingVertexBuffer = std::move(stagingVertexBuffer);
+      finalRes.stagingVertexBufferMemory = std::move(stagingVertexBufferMemory);
+      finalRes.vertexBufferSizeBytes = vertexBufferSize;
+
+      finalRes.stagingIndexBuffer = std::move(stagingIndexBuffer);
+      finalRes.stagingIndexBufferMemory = std::move(stagingIndexBufferMemory);
+      finalRes.indexBufferSizeBytes = indexBufferSize;
+    } else {
+      // Immediate upload path: copy now, staging RAII objects freed on scope exit.
+      copyBuffer(stagingVertexBuffer, finalRes.vertexBuffer, vertexBufferSize);
+      copyBuffer(stagingIndexBuffer, finalRes.indexBuffer, indexBufferSize);
+    }
+
+    return true;
+  } catch (const std::exception& e) {
+    // Roll back whatever was registered for this mesh before the failure.
+    meshResources.erase(meshComponent);
+    {
+      // Use try_lock to avoid blocking in the catch block; if the lock is not
+      // acquired, the advanced-resources entry (if any) is left in place.
+      std::unique_lock<std::shared_mutex> lock(g_advancedStateMutex, std::defer_lock);
+      if (lock.try_lock()) {
+        g_meshAdvancedResources.erase(meshComponent);
+      }
+    }
+    std::cerr << "Failed to create mesh resources: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Create the per-entity uniform buffers and, for entities with a mesh, the instance buffer.
+//
+// Returns true when the entity already has an entityResources entry or when the new
+// entry was stored; returns false if a std::exception was caught along the way.
+bool Renderer::createUniformBuffers(Entity* entity) {
+  ensureThreadLocalVulkanInit();
+  try {
+    // Already set up for this entity: nothing to do.
+    if (entityResources.find(entity) != entityResources.end()) {
+      return true;
+    }
+
+    EntityResources created;
+
+    // One pooled, host-visible + host-coherent uniform buffer per frame in flight.
+    vk::DeviceSize uboBytes = sizeof(UniformBufferObject);
+    for (size_t frame = 0; frame < MAX_FRAMES_IN_FLIGHT; ++frame) {
+      auto [ubo, uboAllocation] = createBufferPooled(
+        uboBytes,
+        vk::BufferUsageFlagBits::eUniformBuffer,
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+
+      // Read the pool's mapped pointer before the allocation is moved away;
+      // a null pointer is only reported, and is still recorded below.
+      void* uboMapped = uboAllocation->mappedPtr;
+      if (uboMapped == nullptr) {
+        std::cerr << "Warning: Uniform buffer allocation is not mapped" << std::endl;
+      }
+
+      created.uniformBuffers.emplace_back(std::move(ubo));
+      created.uniformBufferAllocations.emplace_back(std::move(uboAllocation));
+      created.uniformBuffersMapped.emplace_back(uboMapped);
+    }
+
+    // Every per-frame "descriptor written" flag starts out false.
+    created.pbrUboBindingWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
+    created.basicUboBindingWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
+    created.pbrImagesWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
+    created.basicImagesWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
+    created.pbrFixedBindingsWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
+
+    // Entities with a mesh always get an instance buffer (shaders always expect instance data).
+    if (auto* mesh = entity->GetComponent<MeshComponent>()) {
+      std::vector<InstanceData> instances;
+
+      if (mesh->GetInstanceCount() > 0) {
+        // Reuse the instance data supplied by GLTF loading (one or many instances).
+        instances = mesh->GetInstances();
+      } else {
+        // Fall back to a single IDENTITY instance to avoid double-transform with UBO.model.
+        InstanceData identityInstance;
+        identityInstance.setModelMatrix(glm::mat4(1.0f));
+        instances.push_back(identityInstance);
+      }
+
+      vk::DeviceSize instanceBytes = sizeof(InstanceData) * instances.size();
+
+      auto [instanceVB, instanceVBAllocation] = createBufferPooled(
+        instanceBytes,
+        vk::BufferUsageFlagBits::eVertexBuffer,
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+
+      // Fill the buffer through the mapped pointer when there is one.
+      void* instanceMapped = instanceVBAllocation->mappedPtr;
+      if (instanceMapped == nullptr) {
+        std::cerr << "Warning: Instance buffer allocation is not mapped" << std::endl;
+      } else {
+        std::memcpy(instanceMapped, instances.data(), instanceBytes);
+      }
+
+      created.instanceBuffer = std::move(instanceVB);
+      created.instanceBufferAllocation = std::move(instanceVBAllocation);
+      created.instanceBufferMapped = instanceMapped;
+    }
+
+    // Register the finished record for this entity.
+    entityResources[entity] = std::move(created);
+
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create uniform buffers: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Create the renderer's descriptor pool.
+//
+// Capacities are fixed budgets derived from MAX_FRAMES_IN_FLIGHT and a hard-coded
+// entity limit. Returns true once descriptorPool has been assigned, or false if a
+// std::exception was caught during creation.
+bool Renderer::createDescriptorPool() {
+  try {
+    // Budget for all Bistro materials plus additional entities: the model creates
+    // many more entities than initially expected, and each entity needs descriptor
+    // sets for both the basic and the PBR pipeline (hence the factor 2).
+    const uint32_t entityBudget = 4000;
+    const uint32_t setBudget = MAX_FRAMES_IN_FLIGHT * entityBudget * 2;
+
+    // Combined image samplers, worst case of every entity using PBR:
+    // 21 per set (5 PBR textures + 16 shadow maps); the basic pipeline uses only 1.
+    const uint32_t samplerBudget = MAX_FRAMES_IN_FLIGHT * entityBudget * 21;
+
+    // Storage buffers used per PBR descriptor set:
+    //  - Binding 6:  light storage buffer
+    //  - Binding 7:  Forward+ tile headers buffer
+    //  - Binding 8:  Forward+ tile indices buffer
+    //  - Binding 9:  Fragment debug output buffer (optional)
+    //  - Binding 12: Ray-query geometry info buffer (for raster ray-query shadows)
+    //  - Binding 13: Ray-query material buffer (for raster ray-query shadows)
+    const uint32_t storageBufferBudget = MAX_FRAMES_IN_FLIGHT * entityBudget * 6u;
+
+    // Ray query: 1 TLAS descriptor and 1 output storage image descriptor per frame.
+    const uint32_t tlasBudget = MAX_FRAMES_IN_FLIGHT;
+    const uint32_t storageImageBudget = MAX_FRAMES_IN_FLIGHT;
+
+    // Extra combined image samplers for Ray Query binding 6 (baseColor texture array).
+    const uint32_t rayQuerySamplerBudget = MAX_FRAMES_IN_FLIGHT * RQ_MAX_TEX;
+
+    // Uniform buffers: 1 per descriptor set, so the set budget is reused for them.
+    std::array<vk::DescriptorPoolSize, 5> poolSizes = {
+      vk::DescriptorPoolSize{vk::DescriptorType::eUniformBuffer, setBudget},
+      vk::DescriptorPoolSize{vk::DescriptorType::eCombinedImageSampler, samplerBudget + rayQuerySamplerBudget},
+      vk::DescriptorPoolSize{vk::DescriptorType::eStorageBuffer, storageBufferBudget},
+      vk::DescriptorPoolSize{vk::DescriptorType::eAccelerationStructureKHR, tlasBudget},
+      vk::DescriptorPoolSize{vk::DescriptorType::eStorageImage, storageImageBudget}
+    };
+
+    // Individual sets can always be freed; update-after-bind is requested only
+    // when descriptor indexing is enabled.
+    const vk::DescriptorPoolCreateFlags poolFlags = descriptorIndexingEnabled
+      ? (vk::DescriptorPoolCreateFlagBits::eFreeDescriptorSet | vk::DescriptorPoolCreateFlagBits::eUpdateAfterBind)
+      : vk::DescriptorPoolCreateFlags(vk::DescriptorPoolCreateFlagBits::eFreeDescriptorSet);
+
+    vk::DescriptorPoolCreateInfo poolInfo{};
+    poolInfo.setFlags(poolFlags)
+            .setMaxSets(setBudget)
+            .setPoolSizeCount(static_cast<uint32_t>(poolSizes.size()))
+            .setPPoolSizes(poolSizes.data());
+
+    descriptorPool = vk::raii::DescriptorPool(device, poolInfo);
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create descriptor pool: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Create descriptor sets
+//
+// Convenience overload: resolves the entity's entry in entityResources and forwards
+// to the overload that takes the resources explicitly. Returns false when the entity
+// has no entry; otherwise returns whatever the forwarded call returns.
+bool Renderer::createDescriptorSets(Entity* entity, const std::string& texturePath, bool usePBR) {
+  auto found = entityResources.find(entity);
+  if (found != entityResources.end()) {
+    return createDescriptorSets(entity, found->second, texturePath, usePBR);
+  }
+  return false;
+}
+
+// Allocates (on first use) and writes the descriptor sets of one entity for either the
+// basic or the PBR pipeline.
+//
+//  entity       entity whose descriptors are being created; used for its name in log
+//               output and, on the PBR path, for its MeshComponent texture paths
+//  res          per-entity resource record; the sets are stored in res.pbrDescriptorSets
+//               or res.basicDescriptorSets and the UBO comes from res.uniformBuffers
+//  texturePath  texture id used by the basic pipeline (resolved through ResolveTextureId)
+//  usePBR       selects the PBR layout/sets when true, the basic layout/sets otherwise
+//
+// Returns true when the sets exist and the current frame's set has been written.
+// Returns false when fewer than MAX_FRAMES_IN_FLIGHT sets are available, when the current
+// frame's set handle is null, or when an exception is thrown (it is logged to stderr).
+//
+// NOTE(review): `entity` is null-checked only for the `isTarget` debug print; the error
+// messages, the catch handler and the PBR path dereference it unconditionally.
+bool Renderer::createDescriptorSets(Entity* entity, EntityResources& res, const std::string& texturePath, bool usePBR) {
+  // Kick watchdog periodically during heavy descriptor creation (if called from a long loop)
+  // NOTE(review): the counter is a plain function-local static; confirm this function is
+  // only ever entered from one thread at a time.
+  static uint32_t descWatchdogCounter = 0;
+  if (++descWatchdogCounter % 50 == 0) {
+    lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed);
+  }
+  
+  // Debug trace, limited to entities whose name contains "Fox" or "Cube".
+  bool isTarget = (entity && (entity->GetName().find("Fox") != std::string::npos || entity->GetName().find("Cube") != std::string::npos));
+  if (isTarget) std::cout << "[Descriptors] Creating sets for " << entity->GetName() << " usePBR=" << usePBR << " tex=" << texturePath << std::endl;
+
+  // Resolve alias before taking the shared lock to avoid nested shared_lock on the same mutex
+  const std::string resolvedTexturePath = ResolveTextureId(texturePath);
+  try {
+    // One set per frame in flight, all using the same layout.
+    vk::DescriptorSetLayout selectedLayout = usePBR ? *pbrDescriptorSetLayout : *descriptorSetLayout;
+    std::vector<vk::DescriptorSetLayout> layouts(MAX_FRAMES_IN_FLIGHT, selectedLayout);
+    vk::DescriptorSetAllocateInfo allocInfo{.descriptorPool = *descriptorPool, .descriptorSetCount = MAX_FRAMES_IN_FLIGHT, .pSetLayouts = layouts.data()};
+
+    // Allocation happens only once per entity and pipeline kind; later calls reuse the sets.
+    auto& targetDescriptorSets = usePBR ? res.pbrDescriptorSets : res.basicDescriptorSets;
+    if (targetDescriptorSets.empty()) {
+      std::lock_guard<std::mutex> lk(descriptorMutex);
+      // Allocate into a temporary owning container, then move the individual RAII sets into our vector.
+      // (Avoid assigning `vk::raii::DescriptorSets` directly into `std::vector<vk::raii::DescriptorSet>`.)
+      auto sets = vk::raii::DescriptorSets(device, allocInfo);
+      targetDescriptorSets.clear();
+      targetDescriptorSets.reserve(sets.size());
+      for (auto& s : sets) {
+        targetDescriptorSets.emplace_back(std::move(s));
+      }
+    }
+
+    // Checking validity prevents SIGSEGV crash when Vulkan tries to access invalid handles.
+    if (targetDescriptorSets.empty() || targetDescriptorSets.size() < MAX_FRAMES_IN_FLIGHT) {
+      std::cerr << "ERROR: Descriptor set allocation failed for entity " << entity->GetName()
+          << " (usePBR=" << usePBR << "). Descriptor pool may be exhausted." << std::endl;
+      return false;
+    }
+
+    // Only initialize the current frame's descriptor set at runtime to avoid
+    // updating descriptor sets that may be in use by pending command buffers.
+    // Other frames will be initialized at their own safe points.
+    // The loop below therefore runs exactly once, with i == currentFrame.
+    size_t startIndex = static_cast<size_t>(currentFrame);
+    size_t endIndex = startIndex + 1;
+    for (size_t i = startIndex; i < endIndex; i++) {
+      // Validate descriptor set handle before dereferencing to prevent crash
+      // Check if the underlying VkDescriptorSet handle is valid (not null/default)
+      vk::DescriptorSet handleCheck = *targetDescriptorSets[i];
+      if (handleCheck == vk::DescriptorSet{}) {
+        std::cerr << "ERROR: Invalid descriptor set handle for entity " << entity->GetName()
+            << " frame " << i << " (usePBR=" << usePBR << ")" << std::endl;
+        return false;
+      }
+      // Binding 0 (both pipelines): this frame's uniform buffer.
+      // NOTE(review): res.uniformBuffers[i] is indexed without a size check.
+      vk::DescriptorBufferInfo bufferInfo{.buffer = *res.uniformBuffers[i], .range = sizeof(UniformBufferObject)};
+
+      if (usePBR) {
+        // Build descriptor writes dynamically to avoid writing unused bindings
+        std::vector<vk::WriteDescriptorSet> descriptorWrites;
+        std::array<vk::DescriptorImageInfo, 5> imageInfos;
+        // Keep additional descriptor infos alive until updateDescriptorSets completes.
+        // (descriptorWrites stores raw pointers to these locals.)
+        vk::DescriptorImageInfo reflInfo;
+        vk::WriteDescriptorSetAccelerationStructureKHR tlasInfo{};
+        vk::AccelerationStructureKHR tlasHandleValue{};
+        vk::DescriptorBufferInfo lightBufferInfo;
+        vk::DescriptorBufferInfo headersInfo;
+        vk::DescriptorBufferInfo indicesInfo;
+
+        descriptorWrites.push_back({.dstSet = *targetDescriptorSets[i], .dstBinding = 0, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eUniformBuffer, .pBufferInfo = &bufferInfo});
+
+        // Collect the five PBR texture ids in binding order (1..5): base color,
+        // metallic-roughness, normal, occlusion, emissive. Empty paths fall back to the
+        // shared default ids; base color additionally falls back to the legacy texture path.
+        auto meshComponent = entity->GetComponent<MeshComponent>();
+        std::vector<std::string> pbrTexturePaths;
+        {
+          const std::string legacyPath = (meshComponent ? meshComponent->GetTexturePath() : std::string());
+          const std::string baseColorPath = (meshComponent && !meshComponent->GetBaseColorTexturePath().empty()) ? meshComponent->GetBaseColorTexturePath() : (!legacyPath.empty() ? legacyPath : SHARED_DEFAULT_ALBEDO_ID);
+          const std::string mrPath = (meshComponent && !meshComponent->GetMetallicRoughnessTexturePath().empty()) ? meshComponent->GetMetallicRoughnessTexturePath() : SHARED_DEFAULT_METALLIC_ROUGHNESS_ID;
+          const std::string normalPath = (meshComponent && !meshComponent->GetNormalTexturePath().empty()) ? meshComponent->GetNormalTexturePath() : SHARED_DEFAULT_NORMAL_ID;
+          const std::string occlusionPath = (meshComponent && !meshComponent->GetOcclusionTexturePath().empty()) ? meshComponent->GetOcclusionTexturePath() : SHARED_DEFAULT_OCCLUSION_ID;
+          const std::string emissivePath = (meshComponent && !meshComponent->GetEmissiveTexturePath().empty()) ? meshComponent->GetEmissiveTexturePath() : SHARED_DEFAULT_EMISSIVE_ID;
+
+          pbrTexturePaths = {baseColorPath, mrPath, normalPath, occlusionPath, emissivePath};
+        }
+
+        // Bindings 1..5: look each texture up under the shared lock and copy out only the
+        // raw handles; textures that are not (yet) registered use defaultTextureResources.
+        for (int j = 0; j < 5; j++) {
+          const auto resolvedBindingPath = ResolveTextureId(pbrTexturePaths[j]);
+          vk::Sampler samplerHandle{};
+          vk::ImageView viewHandle{}; {
+            std::shared_lock<std::shared_mutex> lock(textureResourcesMutex);
+            auto textureIt = textureResources.find(resolvedBindingPath);
+            TextureResources* texRes = (textureIt != textureResources.end()) ? &textureIt->second : &defaultTextureResources;
+            samplerHandle = *texRes->textureSampler;
+            viewHandle = *texRes->textureImageView;
+          }
+          imageInfos[j] = {.sampler = samplerHandle, .imageView = viewHandle, .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal};
+          descriptorWrites.push_back({.dstSet = *targetDescriptorSets[i], .dstBinding = static_cast<uint32_t>(j + 1), .descriptorCount = 1, .descriptorType = vk::DescriptorType::eCombinedImageSampler, .pImageInfo = &imageInfos[j]});
+        }
+
+        // Binding 6: this frame's light storage buffer.
+        // NOTE(review): lightStorageBuffers[i] is indexed without a size check.
+        lightBufferInfo = vk::DescriptorBufferInfo{.buffer = *lightStorageBuffers[i].buffer, .range = VK_WHOLE_SIZE};
+        descriptorWrites.push_back({.dstSet = *targetDescriptorSets[i], .dstBinding = 6, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &lightBufferInfo});
+
+        // Ensure Forward+ per-frame array exists
+        if (forwardPlusPerFrame.empty()) {
+          forwardPlusPerFrame.resize(MAX_FRAMES_IN_FLIGHT);
+        }
+
+        // Ensure tile headers buffer exists (binding 7) - create minimal dummy if needed
+        if (i < forwardPlusPerFrame.size()) {
+          auto& f = forwardPlusPerFrame[i];
+          if (!*f.tileHeaders) {
+            vk::DeviceSize minSize = sizeof(uint32_t) * 4; // Single TileHeader {offset, count, pad0, pad1}
+            auto [buf, alloc] = createBufferPooled(minSize,
+                                                   vk::BufferUsageFlagBits::eStorageBuffer,
+                                                   vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+            f.tileHeaders = std::move(buf);
+            f.tileHeadersAlloc = std::move(alloc);
+            // Zero the dummy buffer when the allocation exposes a mapped pointer.
+            if (!!f.tileHeadersAlloc && f.tileHeadersAlloc->mappedPtr) {
+              std::memset(f.tileHeadersAlloc->mappedPtr, 0, minSize);
+            }
+          }
+          headersInfo = vk::DescriptorBufferInfo{.buffer = *f.tileHeaders, .offset = 0, .range = VK_WHOLE_SIZE};
+        }
+
+        // Ensure tile light indices buffer exists (binding 8) - create minimal dummy if needed
+        if (i < forwardPlusPerFrame.size()) {
+          auto& f = forwardPlusPerFrame[i];
+          if (!*f.tileLightIndices) {
+            vk::DeviceSize minSize = sizeof(uint32_t) * 4; // Minimal array of 4 uints
+            auto [buf, alloc] = createBufferPooled(minSize,
+                                                   vk::BufferUsageFlagBits::eStorageBuffer,
+                                                   vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+            f.tileLightIndices = std::move(buf);
+            f.tileLightIndicesAlloc = std::move(alloc);
+            // Zero the dummy buffer when the allocation exposes a mapped pointer.
+            if (!!f.tileLightIndicesAlloc && f.tileLightIndicesAlloc->mappedPtr) {
+              std::memset(f.tileLightIndicesAlloc->mappedPtr, 0, minSize);
+            }
+          }
+          indicesInfo = vk::DescriptorBufferInfo{.buffer = *f.tileLightIndices, .offset = 0, .range = VK_WHOLE_SIZE};
+        }
+
+        // headersInfo and indicesInfo reference real buffers whenever i < forwardPlusPerFrame.size().
+        // NOTE(review): if forwardPlusPerFrame is non-empty but shorter than i + 1, both infos stay
+        // default-constructed and are still written below - confirm that cannot happen.
+        descriptorWrites.push_back({.dstSet = *targetDescriptorSets[i], .dstBinding = 7, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &headersInfo});
+        descriptorWrites.push_back({.dstSet = *targetDescriptorSets[i], .dstBinding = 8, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &indicesInfo});
+
+        // Binding 10: reflection sampler (planar reflections)
+        // Always bind a safe fallback (default texture) so the descriptor is valid.
+        reflInfo = vk::DescriptorImageInfo{
+          .sampler = *defaultTextureResources.textureSampler,
+          .imageView = *defaultTextureResources.textureImageView,
+          .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal
+        };
+        descriptorWrites.push_back({
+          .dstSet = *targetDescriptorSets[i],
+          .dstBinding = 10,
+          .descriptorCount = 1,
+          .descriptorType = vk::DescriptorType::eCombinedImageSampler,
+          .pImageInfo = &reflInfo
+        });
+
+        // Binding 11: TLAS (ray-query shadows in raster fragment shader)
+        // The PBR pipeline layout always declares this binding; it must be written before any draw.
+        // Bind the current TLAS when AS is enabled.
+        // NOTE(review): when AS is disabled or the TLAS handle is null, tlasHandleValue stays a
+        // null handle and is written anyway - confirm the device permits null descriptors here.
+        if (accelerationStructureEnabled) {
+          vk::AccelerationStructureKHR h = *tlasStructure.handle;
+          if (!!h)
+            tlasHandleValue = h;
+        }
+        // Acceleration-structure writes carry their payload in the pNext chain.
+        tlasInfo.accelerationStructureCount = 1;
+        tlasInfo.pAccelerationStructures = &tlasHandleValue;
+        vk::WriteDescriptorSet tlasWrite{};
+        tlasWrite.dstSet = *targetDescriptorSets[i];
+        tlasWrite.dstBinding = 11;
+        tlasWrite.dstArrayElement = 0;
+        tlasWrite.descriptorCount = 1;
+        tlasWrite.descriptorType = vk::DescriptorType::eAccelerationStructureKHR;
+        tlasWrite.pNext = &tlasInfo;
+        // Bindings 9, 12 and 13 are not written by this function.
+        // Submit all PBR writes in one call while holding descriptorMutex.
+        descriptorWrites.push_back(tlasWrite); {
+          std::lock_guard<std::mutex> lk(descriptorMutex);
+          device.updateDescriptorSets(descriptorWrites, {});
+        }
+      } else {
+        // Basic Pipeline
+        // Binding 0 is the UBO, binding 1 the single texture resolved from texturePath
+        // (falling back to defaultTextureResources when it is not registered).
+        vk::Sampler samplerHandle{};
+        vk::ImageView viewHandle{}; {
+          std::shared_lock<std::shared_mutex> lock(textureResourcesMutex);
+          auto textureIt = textureResources.find(resolvedTexturePath);
+          TextureResources* texRes = (textureIt != textureResources.end()) ? &textureIt->second : &defaultTextureResources;
+          samplerHandle = *texRes->textureSampler;
+          viewHandle = *texRes->textureImageView;
+        }
+        vk::DescriptorImageInfo imageInfo{.sampler = samplerHandle, .imageView = viewHandle, .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal};
+        std::array<vk::WriteDescriptorSet, 2> descriptorWrites = {
+          vk::WriteDescriptorSet{.dstSet = *targetDescriptorSets[i], .dstBinding = 0, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eUniformBuffer, .pBufferInfo = &bufferInfo},
+          vk::WriteDescriptorSet{.dstSet = *targetDescriptorSets[i], .dstBinding = 1, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eCombinedImageSampler, .pImageInfo = &imageInfo}
+        }; {
+          std::lock_guard<std::mutex> lk(descriptorMutex);
+          device.updateDescriptorSets(descriptorWrites, {});
+        }
+      }
+    }
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create descriptor sets for " << entity->GetName() << ": " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Pre-allocate all Vulkan resources for an entity during scene loading
+//
+// Creates, in order: the mesh's vertex/index resources, the entity's uniform buffers,
+// and both the basic and the PBR descriptor sets.
+//
+//  entity  entity to prepare; must own a MeshComponent
+//
+// Returns true when every step succeeded. Returns false (after logging to stderr) when
+// the entity is null, has no mesh component, any step fails, or an exception is thrown.
+bool Renderer::preAllocateEntityResources(Entity* entity) {
+  // Reject null up front: everything below (including the catch handler's log line)
+  // dereferences the entity.
+  if (!entity) {
+    std::cerr << "Cannot pre-allocate resources for a null entity" << std::endl;
+    return false;
+  }
+
+  try {
+    // Get the mesh component
+    auto meshComponent = entity->GetComponent<MeshComponent>();
+    if (!meshComponent) {
+      std::cerr << "Entity " << entity->GetName() << " has no mesh component" << std::endl;
+      return false;
+    }
+
+    // Ensure local AABB is available for debug/probes
+    meshComponent->RecomputeLocalAABB();
+
+    // 1. Create mesh resources (vertex/index buffers)
+    if (!createMeshResources(meshComponent)) {
+      std::cerr << "Failed to create mesh resources for entity: " << entity->GetName() << std::endl;
+      return false;
+    }
+
+    // 2. Create uniform buffers
+    if (!createUniformBuffers(entity)) {
+      std::cerr << "Failed to create uniform buffers for entity: " << entity->GetName() << std::endl;
+      return false;
+    }
+
+    // Initialize per-frame UBO and image binding write flags
+    // (all reset to false, one flag per frame in flight)
+    {
+      auto it = entityResources.find(entity);
+      if (it != entityResources.end()) {
+        it->second.pbrUboBindingWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
+        it->second.basicUboBindingWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
+        it->second.pbrImagesWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
+        it->second.basicImagesWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
+        it->second.pbrFixedBindingsWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
+      }
+    }
+
+    // 3. Pre-allocate BOTH basic and PBR descriptor sets
+    std::string texturePath = meshComponent->GetTexturePath();
+    // Fallback: if legacy texturePath is empty, use PBR baseColor texture
+    if (texturePath.empty()) {
+      const std::string& baseColor = meshComponent->GetBaseColorTexturePath();
+      if (!baseColor.empty()) {
+        texturePath = baseColor;
+      }
+    }
+
+    // Create basic descriptor sets
+    if (!createDescriptorSets(entity, texturePath, false)) {
+      std::cerr << "Failed to create basic descriptor sets for entity: " << entity->GetName() << std::endl;
+      return false;
+    }
+
+    // Create PBR descriptor sets
+    if (!createDescriptorSets(entity, texturePath, true)) {
+      std::cerr << "Failed to create PBR descriptor sets for entity: " << entity->GetName() << std::endl;
+      return false;
+    }
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to pre-allocate resources for entity " << entity->GetName() << ": " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Pre-allocate Vulkan resources for a batch of entities, batching mesh uploads
+//
+// Runs in three phases:
+//   1. create mesh resources for every entity with uploads deferred,
+//   2. enqueue the staged meshes for upload (and flush them immediately while loading
+//      or while an acceleration-structure build is pending),
+//   3. create uniform buffers and basic + PBR descriptor sets per entity.
+// Null entities and entities without a MeshComponent are skipped in every phase.
+// Progress is published through watchdogProgressLabel / watchdogProgressIndex.
+//
+//  entities  entities to prepare; entries may be null
+//
+// Returns true only if no entity failed. A failing entity is logged and skipped, the
+// remaining entities are still processed. An escaping exception is logged and yields false.
+bool Renderer::preAllocateEntityResourcesBatch(const std::vector<Entity *>& entities) {
+  watchdogProgressLabel.store("Batch: ensureThreadLocalVulkanInit", std::memory_order_relaxed);
+  watchdogProgressIndex.store(0, std::memory_order_relaxed);
+  ensureThreadLocalVulkanInit();
+  try {
+    // --- 1. For all entities, create mesh resources with deferred uploads ---
+    // Then, during initial loading (and while an AS build is pending), flush the queued
+    // uploads immediately in a single batched submit (much faster than per-mesh submits).
+    watchdogProgressLabel.store("Batch: createMeshResources loop", std::memory_order_relaxed);
+    std::vector<MeshComponent *> meshesNeedingUpload;
+    meshesNeedingUpload.reserve(entities.size());
+    const bool flushUploadsNow = IsLoading() || asBuildRequested.load(std::memory_order_relaxed);
+
+    bool allSuccess = true;
+    uint32_t meshLoopIndex = 0;
+    for (Entity* entity : entities) {
+      watchdogProgressIndex.store(meshLoopIndex++, std::memory_order_relaxed);
+
+      // Same null guard as the per-entity resources loop below; without it a null
+      // entry would be dereferenced by GetComponent.
+      if (!entity) {
+        continue;
+      }
+
+      auto meshComponent = entity->GetComponent<MeshComponent>();
+      if (!meshComponent) {
+        continue;
+      }
+
+      // Ensure local AABB is available for debug/probes
+      watchdogProgressLabel.store("Batch: RecomputeLocalAABB", std::memory_order_relaxed);
+      meshComponent->RecomputeLocalAABB();
+
+      watchdogProgressLabel.store("Batch: createMeshResources", std::memory_order_relaxed);
+      if (!createMeshResources(meshComponent, /*deferUpload=*/true)) {
+        std::cerr << "Failed to create mesh resources for entity (batch): "
+            << entity->GetName() << std::endl;
+        allSuccess = false;
+        continue;
+      }
+
+      auto it = meshResources.find(meshComponent);
+      if (it == meshResources.end()) {
+        continue;
+      }
+      MeshResources& res = it->second;
+
+      // Only schedule meshes that still have staged data pending upload
+      if (res.vertexBufferSizeBytes > 0 || res.indexBufferSizeBytes > 0) {
+        meshesNeedingUpload.push_back(meshComponent);
+      }
+    }
+
+    // --- 2. Defer all GPU copies to the render thread safe point ---
+    if (!meshesNeedingUpload.empty()) {
+      watchdogProgressLabel.store("Batch: EnqueueMeshUploads", std::memory_order_relaxed);
+      EnqueueMeshUploads(meshesNeedingUpload);
+      if (flushUploadsNow) {
+        watchdogProgressLabel.store("Batch: Flush mesh uploads now", std::memory_order_relaxed);
+        ProcessPendingMeshUploads();
+      }
+    }
+
+    // --- 3. Create uniform buffers and descriptor sets per entity ---
+    watchdogProgressLabel.store("Batch: per-entity resources loop", std::memory_order_relaxed);
+    uint32_t resourceLoopIndex = 0;
+    for (Entity* entity : entities) {
+      watchdogProgressIndex.store(resourceLoopIndex++, std::memory_order_relaxed);
+
+      if (!entity) {
+        continue;
+      }
+
+      auto meshComponent = entity->GetComponent<MeshComponent>();
+      if (!meshComponent) {
+        continue;
+      }
+
+      watchdogProgressLabel.store("Batch: createUniformBuffers", std::memory_order_relaxed);
+      if (!createUniformBuffers(entity)) {
+        std::cerr << "Failed to create uniform buffers for entity (batch): "
+            << entity->GetName() << std::endl;
+        allSuccess = false;
+        continue;
+      }
+
+      std::string texturePath = meshComponent->GetTexturePath();
+      // Fallback: if legacy texturePath is empty, use PBR baseColor texture
+      if (texturePath.empty()) {
+        const std::string& baseColor = meshComponent->GetBaseColorTexturePath();
+        if (!baseColor.empty()) {
+          texturePath = baseColor;
+        }
+      }
+
+      watchdogProgressLabel.store("Batch: createDescriptorSets (basic)", std::memory_order_relaxed);
+      if (!createDescriptorSets(entity, texturePath, false)) {
+        std::cerr << "Failed to create basic descriptor sets for entity (batch): "
+            << entity->GetName() << std::endl;
+        allSuccess = false;
+        continue;
+      }
+
+      watchdogProgressLabel.store("Batch: createDescriptorSets (pbr)", std::memory_order_relaxed);
+      if (!createDescriptorSets(entity, texturePath, true)) {
+        std::cerr << "Failed to create PBR descriptor sets for entity (batch): "
+            << entity->GetName() << std::endl;
+        allSuccess = false;
+        continue;
+      }
+    }
+
+    return allSuccess;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to batch pre-allocate resources for entities: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Enqueue a set of meshes to upload on the render thread (safe point)
+//
+// Appends the given meshes to pendingMeshUploads under pendingMeshUploadsMutex.
+// Null pointers are ignored, and a mesh that is already pending (or repeated within
+// `meshes`) is queued only once so the upload pass never sees the same mesh twice.
+// Relative order of first occurrences is preserved.
+void Renderer::EnqueueMeshUploads(const std::vector<MeshComponent *>& meshes) {
+  if (meshes.empty())
+    return;
+  std::lock_guard<std::mutex> lk(pendingMeshUploadsMutex);
+  // Avoid duplicates by using a temporary set of current entries
+  std::unordered_set<MeshComponent *> alreadyQueued(pendingMeshUploads.begin(), pendingMeshUploads.end());
+  for (MeshComponent* m : meshes) {
+    if (!m)
+      continue;
+    // insert().second is true only the first time this pointer is seen.
+    if (alreadyQueued.insert(m).second) {
+      pendingMeshUploads.push_back(m);
+    }
+  }
+}
+
+// Queue entities for GPU resource pre-allocation.
+//
+// Non-null entries are appended to pendingEntityPrealloc under
+// pendingEntityPreallocMutex; afterwards pendingEntityPreallocQueued is raised so the
+// drain in ProcessPendingEntityPreallocations picks them up. An empty input is a no-op.
+void Renderer::EnqueueEntityPreallocationBatch(const std::vector<Entity *>& entities) {
+  if (entities.empty()) {
+    return;
+  }
+
+  {
+    std::lock_guard<std::mutex> guard(pendingEntityPreallocMutex);
+    for (Entity* candidate : entities) {
+      if (candidate) {
+        pendingEntityPrealloc.push_back(candidate);
+      }
+    }
+  }
+
+  // Set the flag only after the lock has been released.
+  pendingEntityPreallocQueued.store(true, std::memory_order_relaxed);
+}
+
+// Queue an entity whose instance buffer must be recreated.
+//
+// A null entity is ignored. Otherwise the entity is appended to
+// pendingInstanceBufferRecreations under pendingEntityPreallocMutex and
+// pendingEntityPreallocQueued is raised once the lock has been released.
+void Renderer::EnqueueInstanceBufferRecreation(Entity* entity) {
+  if (entity) {
+    {
+      std::lock_guard<std::mutex> guard(pendingEntityPreallocMutex);
+      pendingInstanceBufferRecreations.push_back(entity);
+    }
+    pendingEntityPreallocQueued.store(true, std::memory_order_relaxed);
+  }
+}
+
+// Drain the queues filled by EnqueueEntityPreallocationBatch and
+// EnqueueInstanceBufferRecreation.
+//
+// Steps: take both queues under the lock, de-duplicate and filter the preallocation
+// list, run it as a single batch (re-queueing it on failure), then recreate instance
+// buffers after waiting on every in-flight fence except the current frame's.
+// Progress is published through watchdogProgressLabel throughout.
+void Renderer::ProcessPendingEntityPreallocations() {
+  // Nothing queued since the last drain.
+  if (!pendingEntityPreallocQueued.load(std::memory_order_relaxed)) {
+    return;
+  }
+
+  watchdogProgressLabel.store("Prealloc: drain queues", std::memory_order_relaxed);
+
+  std::vector<Entity *> preallocQueue;
+  std::vector<Entity *> instanceQueue;
+  {
+    std::lock_guard<std::mutex> guard(pendingEntityPreallocMutex);
+
+    // Take every pending preallocation at once; drip-feeding causes multi-second lag
+    // when thousands of entities are loaded.
+    preallocQueue.swap(pendingEntityPrealloc);
+
+    // Instance recreations are usually few and often gate animation starts, so they
+    // are always taken in full as well.
+    instanceQueue.swap(pendingInstanceBufferRecreations);
+
+    const bool queuesDrained = pendingEntityPrealloc.empty() && pendingInstanceBufferRecreations.empty();
+    if (queuesDrained) {
+      pendingEntityPreallocQueued.store(false, std::memory_order_relaxed);
+    }
+  }
+
+  // Collapse duplicate entity pointers (sort + unique).
+  watchdogProgressLabel.store("Prealloc: dedup", std::memory_order_relaxed);
+  std::sort(preallocQueue.begin(), preallocQueue.end());
+  const auto uniqueEnd = std::unique(preallocQueue.begin(), preallocQueue.end());
+  preallocQueue.erase(uniqueEnd, preallocQueue.end());
+
+  // Keep only live entities that actually own a mesh.
+  std::vector<Entity *> eligible;
+  eligible.reserve(preallocQueue.size());
+  for (Entity* candidate : preallocQueue) {
+    const bool usable = candidate && candidate->IsActive() && candidate->GetComponent<MeshComponent>();
+    if (usable) {
+      eligible.push_back(candidate);
+    }
+  }
+
+  if (!eligible.empty()) {
+    watchdogProgressLabel.store("Prealloc: preAllocateEntityResourcesBatch", std::memory_order_relaxed);
+    // Large initial-load batches legitimately exceed the normal 10s frame budget, so
+    // the watchdog is suppressed for their duration.
+    const bool suppressWatchdog = eligible.size() > 200;
+    if (suppressWatchdog) {
+      watchdogSuppressed.store(true, std::memory_order_relaxed);
+    }
+    const bool batchSucceeded = preAllocateEntityResourcesBatch(eligible);
+    if (suppressWatchdog) {
+      watchdogSuppressed.store(false, std::memory_order_relaxed);
+      lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed);
+    }
+    if (!batchSucceeded) {
+      std::cerr << "Warning: batch entity GPU preallocation failed; will retry" << std::endl;
+      // Put the whole batch back so a later call retries it.
+      std::lock_guard<std::mutex> guard(pendingEntityPreallocMutex);
+      pendingEntityPrealloc.insert(pendingEntityPrealloc.end(), eligible.begin(), eligible.end());
+      pendingEntityPreallocQueued.store(true, std::memory_order_relaxed);
+    }
+  }
+
+  // Instance buffer recreations: wait for the other frames once, up front, so old
+  // buffers can be destroyed safely.
+  if (!instanceQueue.empty()) {
+    watchdogProgressLabel.store("Prealloc: wait other inFlightFences (before recreateInstanceBuffer)", std::memory_order_relaxed);
+    // IMPORTANT: this runs on the render thread at the frame-start safe point, *after*
+    // `inFlightFences[currentFrame]` was waited on and reset. That fence is not signaled
+    // again until the current frame is submitted, so waiting on it here would deadlock.
+    std::vector<vk::Fence> otherFrameFences;
+    const size_t fenceCount = inFlightFences.size();
+    if (fenceCount > 1) {
+      otherFrameFences.reserve(fenceCount - 1);
+    }
+    for (uint32_t frame = 0; frame < static_cast<uint32_t>(fenceCount); ++frame) {
+      if (frame != currentFrame) {
+        const auto& fence = *inFlightFences[frame];
+        if (!!fence && fence != vk::Fence{}) {
+          otherFrameFences.push_back(fence);
+        }
+      }
+    }
+    if (!otherFrameFences.empty()) {
+      (void) waitForFencesSafe(otherFrameFences, VK_TRUE);
+    }
+
+    watchdogProgressLabel.store("Prealloc: recreateInstanceBuffer loop", std::memory_order_relaxed);
+    uint32_t handled = 0;
+    for (Entity* target : instanceQueue) {
+      if (!target || !target->IsActive()) {
+        continue;
+      }
+
+      // Refresh the watchdog timestamp on every 10th processed entity.
+      ++handled;
+      if (handled % 10 == 0) {
+        lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed);
+      }
+
+      const bool recreated = recreateInstanceBuffer(target);
+      if (!recreated) {
+        std::cerr << "Warning: failed to recreate instance buffer for entity " << target->GetName() << std::endl;
+      }
+    }
+  }
+
+  watchdogProgressLabel.store("Prealloc: done", std::memory_order_relaxed);
+}
+
+// Execute pending mesh uploads on the render thread after the per-frame fence wait
+void Renderer::ProcessPendingMeshUploads() {
+  // 0. Retire completed async upload batches (if timeline semaphore is available)
+  if (!!*uploadsTimeline && *uploadsTimeline != vk::Semaphore{}) {
+    uint64_t completedValue = 0;
+    try {
+      // vk::raii::Device doesn't expose getSemaphoreCounterValue in all Vulkan-Hpp versions;
+      // use the underlying vk::Device handle.
+      completedValue = (*device).getSemaphoreCounterValue(*uploadsTimeline);
+    } catch (...) {
+      completedValue = 0;
+    }
+
+    bool anyCompleted = false;
+    while (true) {
+      InFlightMeshUploadBatch completedBatch; {
+        std::lock_guard<std::mutex> lk(inFlightMeshUploadsMutex);
+        if (inFlightMeshUploads.empty())
+          break;
+        if (inFlightMeshUploads.front().signalValue == 0 || inFlightMeshUploads.front().signalValue > completedValue)
+          break;
+        completedBatch = std::move(inFlightMeshUploads.front());
+        inFlightMeshUploads.pop_front();
+      }
+
+      // Clear staging once copies are complete
+      for (auto* meshComponent : completedBatch.meshes) {
+        auto it = meshResources.find(meshComponent);
+        if (it == meshResources.end())
+          continue;
+        MeshResources& res = it->second;
+        res.stagingVertexBuffer = vk::raii::Buffer(nullptr);
+        res.stagingVertexBufferMemory = vk::raii::DeviceMemory(nullptr);
+        res.vertexBufferSizeBytes = 0;
+        res.stagingIndexBuffer = vk::raii::Buffer(nullptr);
+        res.stagingIndexBufferMemory = vk::raii::DeviceMemory(nullptr);
+        res.indexBufferSizeBytes = 0;
+
+        // Clear advanced staging allocations
+        {
+          std::unique_lock<std::shared_mutex> lock(g_advancedStateMutex);
+          auto advIt = g_meshAdvancedResources.find(meshComponent);
+          if (advIt != g_meshAdvancedResources.end()) {
+            advIt->second.stagingVertexBufferAllocation = nullptr;
+            advIt->second.stagingIndexBufferAllocation = nullptr;
+            advIt->second.stagingJointIndicesAllocation = nullptr;
+            advIt->second.stagingJointWeightsAllocation = nullptr;
+            advIt->second.stagingMorphTargetAllocations.clear();
+            advIt->second.morphTargetSizes.clear();
+          }
+        }
+      }
+
+      anyCompleted = true;
+    }
+
+    if (anyCompleted) {
+      // Now that more meshes are READY (uploads finished), request a TLAS rebuild so
+      // non‑instanced and previously missing meshes are included in the acceleration structure.
+      asDevOverrideAllowRebuild = true; // allow rebuild even if frozen
+      RequestAccelerationStructureBuild("uploads completed");
+    }
+  }
+
+  // Grab the list atomically
+  std::vector<MeshComponent *> toProcess; {
+    std::lock_guard<std::mutex> lk(pendingMeshUploadsMutex);
+    if (pendingMeshUploads.empty())
+      return;
+    toProcess.swap(pendingMeshUploads);
+  }
+
+  // Build a quick lookup of meshes already in flight so we don't submit duplicate copies
+  std::unordered_set<MeshComponent *> inFlightMeshes; {
+    std::lock_guard<std::mutex> lk(inFlightMeshUploadsMutex);
+    for (const auto& b : inFlightMeshUploads) {
+      for (auto* m : b.meshes) {
+        inFlightMeshes.insert(m);
+      }
+    }
+  }
+
+  // Filter to meshes that still have staged data
+  std::vector<MeshComponent *> needsCopy;
+  needsCopy.reserve(toProcess.size());
+  for (auto* meshComponent : toProcess) {
+    if (inFlightMeshes.find(meshComponent) != inFlightMeshes.end())
+      continue;
+    auto it = meshResources.find(meshComponent);
+    if (it == meshResources.end())
+      continue;
+    const MeshResources& res = it->second;
+    if (res.vertexBufferSizeBytes > 0 || res.indexBufferSizeBytes > 0) {
+      needsCopy.push_back(meshComponent);
+    }
+  }
+  if (needsCopy.empty())
+    return;
+
+  // Record copies on GRAPHICS queue to avoid cross-queue hazards while stabilizing
+  vk::CommandPoolCreateInfo poolInfo{
+    .flags = vk::CommandPoolCreateFlagBits::eTransient | vk::CommandPoolCreateFlagBits::eResetCommandBuffer,
+    .queueFamilyIndex = queueFamilyIndices.graphicsFamily.value()
+  };
+
+  // Prefer async submission via the uploads timeline semaphore to avoid blocking the render thread.
+  // However, during initial loading (and when an AS build is pending), we want mesh uploads to
+  // complete promptly so readiness can increase and the AS can be built within the target budget.
+  const bool forceSynchronous = IsLoading() || asBuildRequested.load(std::memory_order_relaxed);
+  const bool canSignalTimeline = (!!*uploadsTimeline && *uploadsTimeline != vk::Semaphore{}) && !forceSynchronous;
+  if (canSignalTimeline) {
+    auto tempPool = std::make_unique<vk::raii::CommandPool>(device, poolInfo);
+    vk::CommandBufferAllocateInfo allocInfo{
+      .commandPool = **tempPool,
+      .level = vk::CommandBufferLevel::ePrimary,
+      .commandBufferCount = 1
+    };
+    auto cbs = std::make_unique<vk::raii::CommandBuffers>(device, allocInfo);
+    vk::raii::CommandBuffer& cb = (*cbs)[0];
+    cb.begin({.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit});
+
+    uint32_t uploadCount = 0;
+    for (auto* meshComponent : needsCopy) {
+      auto it = meshResources.find(meshComponent);
+      if (it == meshResources.end())
+        continue;
+      MeshResources& res = it->second;
+      if (res.vertexBufferSizeBytes > 0 && !!*res.stagingVertexBuffer && !!*res.vertexBuffer) {
+        vk::BufferCopy region{.srcOffset = 0, .dstOffset = 0, .size = res.vertexBufferSizeBytes};
+        cb.copyBuffer(*res.stagingVertexBuffer, *res.vertexBuffer, region);
+      }
+      if (res.indexBufferSizeBytes > 0 && !!*res.stagingIndexBuffer && !!*res.indexBuffer) {
+        vk::BufferCopy region{.srcOffset = 0, .dstOffset = 0, .size = res.indexBufferSizeBytes};
+        cb.copyBuffer(*res.stagingIndexBuffer, *res.indexBuffer, region);
+      }
+      
+      // Copy advanced skinning/morph buffers if any are staged
+      {
+          std::shared_lock<std::shared_mutex> lock(g_advancedStateMutex);
+          auto advIt = g_meshAdvancedResources.find(meshComponent);
+          if (advIt != g_meshAdvancedResources.end()) {
+              auto& advRes = advIt->second;
+              if (advRes.stagingJointIndicesAllocation && !!*advRes.jointIndicesBuffer) {
+                  vk::BufferCopy region{.srcOffset = 0, .dstOffset = 0, .size = advRes.jointIndicesSize};
+                  cb.copyBuffer(*advRes.stagingJointIndicesBuffer, *advRes.jointIndicesBuffer, region);
+              }
+              if (advRes.stagingJointWeightsAllocation && !!*advRes.jointWeightsBuffer) {
+                  vk::BufferCopy region{.srcOffset = 0, .dstOffset = 0, .size = advRes.jointWeightsSize};
+                  cb.copyBuffer(*advRes.stagingJointWeightsBuffer, *advRes.jointWeightsBuffer, region);
+              }
+              for (size_t i = 0; i < advRes.stagingMorphTargetBuffers.size() && i < advRes.morphTargetBuffers.size(); ++i) {
+                  if (!!*advRes.stagingMorphTargetBuffers[i] && !!*advRes.morphTargetBuffers[i]) {
+                      vk::BufferCopy region{.srcOffset = 0, .dstOffset = 0, .size = advRes.morphTargetSizes[i]};
+                      cb.copyBuffer(*advRes.stagingMorphTargetBuffers[i], *advRes.morphTargetBuffers[i], region);
+                  }
+              }
+          }
+      }
+    }
+
+    cb.end();
+
+    uint64_t signalValue = 0; {
+      std::lock_guard<std::mutex> lock(queueMutex);
+      vk::SubmitInfo submitInfo{};
+      vk::TimelineSemaphoreSubmitInfo timelineInfo{}; // keep alive through submit
+      signalValue = uploadTimelineLastSubmitted.fetch_add(1, std::memory_order_relaxed) + 1;
+      timelineInfo.signalSemaphoreValueCount = 1;
+      timelineInfo.pSignalSemaphoreValues = &signalValue;
+      submitInfo.pNext = &timelineInfo;
+      submitInfo.commandBufferCount = 1;
+      submitInfo.pCommandBuffers = &*cb;
+      submitInfo.signalSemaphoreCount = 1;
+      submitInfo.pSignalSemaphores = &*uploadsTimeline;
+      graphicsQueue.submit(submitInfo, vk::Fence{});
+    }
+
+    InFlightMeshUploadBatch batch;
+    batch.signalValue = signalValue;
+    batch.meshes = std::move(needsCopy);
+    batch.commandPool = std::move(tempPool);
+    batch.commandBuffers = std::move(cbs); {
+      std::lock_guard<std::mutex> lk(inFlightMeshUploadsMutex);
+      inFlightMeshUploads.push_back(std::move(batch));
+    }
+  } else {
+    // Fallback: submit and wait on the GRAPHICS queue (single-threaded via queueMutex)
+    vk::raii::CommandPool tempPool(device, poolInfo);
+    vk::CommandBufferAllocateInfo allocInfo{
+      .commandPool = *tempPool,
+      .level = vk::CommandBufferLevel::ePrimary,
+      .commandBufferCount = 1
+    };
+    vk::raii::CommandBuffers cbs(device, allocInfo);
+    vk::raii::CommandBuffer& cb = cbs[0];
+    cb.begin({.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit});
+
+    uint32_t uploadCountSync = 0;
+    for (auto* meshComponent : needsCopy) {
+      auto it = meshResources.find(meshComponent);
+      if (it == meshResources.end())
+        continue;
+      MeshResources& res = it->second;
+      if (res.vertexBufferSizeBytes > 0 && !!*res.stagingVertexBuffer && !!*res.vertexBuffer) {
+        vk::BufferCopy region{.srcOffset = 0, .dstOffset = 0, .size = res.vertexBufferSizeBytes};
+        cb.copyBuffer(*res.stagingVertexBuffer, *res.vertexBuffer, region);
+      }
+      if (res.indexBufferSizeBytes > 0 && !!*res.stagingIndexBuffer && !!*res.indexBuffer) {
+        vk::BufferCopy region{.srcOffset = 0, .dstOffset = 0, .size = res.indexBufferSizeBytes};
+        cb.copyBuffer(*res.stagingIndexBuffer, *res.indexBuffer, region);
+      }
+      
+      // Copy advanced skinning/morph buffers if any are staged
+      {
+          std::shared_lock<std::shared_mutex> lock(g_advancedStateMutex);
+          auto advIt = g_meshAdvancedResources.find(meshComponent);
+          if (advIt != g_meshAdvancedResources.end()) {
+              auto& advRes = advIt->second;
+              if (advRes.stagingJointIndicesAllocation && !!*advRes.jointIndicesBuffer) {
+                  vk::BufferCopy region{.srcOffset = 0, .dstOffset = 0, .size = advRes.jointIndicesSize};
+                  cb.copyBuffer(*advRes.stagingJointIndicesBuffer, *advRes.jointIndicesBuffer, region);
+              }
+              if (advRes.stagingJointWeightsAllocation && !!*advRes.jointWeightsBuffer) {
+                  vk::BufferCopy region{.srcOffset = 0, .dstOffset = 0, .size = advRes.jointWeightsSize};
+                  cb.copyBuffer(*advRes.stagingJointWeightsBuffer, *advRes.jointWeightsBuffer, region);
+              }
+              for (size_t i = 0; i < advRes.stagingMorphTargetBuffers.size() && i < advRes.morphTargetBuffers.size(); ++i) {
+                  if (!!*advRes.stagingMorphTargetBuffers[i] && !!*advRes.morphTargetBuffers[i]) {
+                      vk::BufferCopy region{.srcOffset = 0, .dstOffset = 0, .size = advRes.morphTargetSizes[i]};
+                      cb.copyBuffer(*advRes.stagingMorphTargetBuffers[i], *advRes.morphTargetBuffers[i], region);
+                  }
+              }
+          }
+      }
+    }
+
+    cb.end();
+
+    vk::SubmitInfo submitInfo{.commandBufferCount = 1, .pCommandBuffers = &*cb};
+    vk::raii::Fence fence(device, vk::FenceCreateInfo{}); {
+      std::lock_guard<std::mutex> lock(queueMutex);
+      graphicsQueue.submit(submitInfo, *fence);
+    }
+    (void) waitForFencesSafe(*fence, VK_TRUE);
+
+    for (auto* meshComponent : needsCopy) {
+      auto it = meshResources.find(meshComponent);
+      if (it == meshResources.end())
+        continue;
+      MeshResources& res = it->second;
+      res.stagingVertexBuffer.clear();
+      res.stagingVertexBufferMemory.clear();
+      res.vertexBufferSizeBytes = 0;
+      res.stagingIndexBuffer.clear();
+      res.stagingIndexBufferMemory.clear();
+      res.indexBufferSizeBytes = 0;
+
+      // Clear advanced staging allocations
+      {
+        std::unique_lock<std::shared_mutex> lock(g_advancedStateMutex);
+        auto advIt = g_meshAdvancedResources.find(meshComponent);
+        if (advIt != g_meshAdvancedResources.end()) {
+          advIt->second.stagingVertexBufferAllocation = nullptr;
+          advIt->second.stagingIndexBufferAllocation = nullptr;
+          advIt->second.stagingMorphTargetAllocations.clear();
+          advIt->second.morphTargetSizes.clear();
+        }
+      }
+    }
+
+    asDevOverrideAllowRebuild = true;
+    RequestAccelerationStructureBuild("uploads completed");
+  }
+}
+
+// Rebuilds the instance buffer of an entity so that it contains exactly one instance with an
+// identity model matrix (e.g., after instances were cleared for animation).
+//
+// Returns true once the new buffer has been stored in entityResources; returns false when the
+// entity has no entry in entityResources or when a std::exception is raised along the way.
+// If the pooled allocation is not mapped, a warning is printed, the instance data is NOT
+// written, and the (unwritten) buffer is still installed.
+//
+// The previous buffer is released when it is overwritten, so the caller must guarantee the GPU
+// is idle before calling this method.
+bool Renderer::recreateInstanceBuffer(Entity* entity) {
+  ensureThreadLocalVulkanInit();
+  try {
+    // Look up the per-entity GPU resources
+    const auto found = entityResources.find(entity);
+    if (found == entityResources.end()) {
+      std::cerr << "Entity " << entity->GetName() << " not found in entityResources" << std::endl;
+      return false;
+    }
+    EntityResources& target = found->second;
+
+    // One instance only, carrying the identity model matrix
+    InstanceData identityInstance;
+    identityInstance.setModelMatrix(glm::mat4(1.0f));
+    const std::vector<InstanceData> instances{identityInstance};
+    const vk::DeviceSize byteCount = sizeof(InstanceData) * instances.size();
+
+    // Host-visible, host-coherent vertex buffer taken from the memory pool
+    auto [freshBuffer, freshAllocation] = createBufferPooled(
+      byteCount,
+      vk::BufferUsageFlagBits::eVertexBuffer,
+      vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+
+    // Upload through the allocation's mapped pointer when one is available
+    void* mapped = freshAllocation->mappedPtr;
+    if (mapped == nullptr) {
+      std::cerr << "Warning: Instance buffer allocation is not mapped" << std::endl;
+    } else {
+      std::memcpy(mapped, instances.data(), byteCount);
+    }
+
+    // Swap the new buffer in; the old one is destroyed by the move-assignment
+    target.instanceBuffer = std::move(freshBuffer);
+    target.instanceBufferAllocation = std::move(freshAllocation);
+    target.instanceBufferMapped = mapped;
+
+    std::cout << "[Animation] Recreated instance buffer for entity '" << entity->GetName()
+        << "' with single identity instance" << std::endl;
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to recreate instance buffer for entity " << entity->GetName()
+        << ": " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Creates a buffer whose memory comes from the renderer's memory pool.
+//
+// size/usage/properties are forwarded unchanged to memoryPool->createBuffer. Returns the buffer
+// together with the pool allocation that backs it. Any std::exception (including the
+// "Memory pool not initialized" error raised here) is logged to stderr and rethrown.
+std::pair<vk::raii::Buffer, std::unique_ptr<MemoryPool::Allocation>> Renderer::createBufferPooled(
+  vk::DeviceSize size,
+  vk::BufferUsageFlags usage,
+  vk::MemoryPropertyFlags properties) {
+  try {
+    // The pool check stays inside the try block so its failure is logged like any other
+    if (!memoryPool) {
+      throw std::runtime_error("Memory pool not initialized");
+    }
+
+    auto [pooledBuffer, pooledAllocation] = memoryPool->createBuffer(size, usage, properties);
+    return {std::move(pooledBuffer), std::move(pooledAllocation)};
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create buffer with memory pool: " << e.what() << std::endl;
+    throw;
+  }
+}
+
+// Legacy buffer creation with a dedicated vk::raii::DeviceMemory allocation.
+//
+// Intended only for temporary, host-visible staging buffers; persistent resources should go
+// through createBufferPooled. Throws std::runtime_error when the memory pool is missing or when
+// `properties` does not include eHostVisible. Failures during buffer/memory creation are logged
+// to stderr and rethrown.
+std::pair<vk::raii::Buffer, vk::raii::DeviceMemory> Renderer::createBuffer(
+  vk::DeviceSize size,
+  vk::BufferUsageFlags usage,
+  vk::MemoryPropertyFlags properties) {
+  if (!memoryPool) {
+    throw std::runtime_error("Memory pool not available - cannot create buffer");
+  }
+
+  // Reject anything that is not a host-visible (staging) request
+  const bool hostVisible = static_cast<bool>(properties & vk::MemoryPropertyFlagBits::eHostVisible);
+  if (!hostVisible) {
+    std::cerr << "ERROR: Legacy createBuffer should only be used for staging buffers!" << std::endl;
+    throw std::runtime_error("Legacy createBuffer used for non-staging buffer");
+  }
+
+  try {
+    vk::BufferCreateInfo createInfo{
+      .size = size,
+      .usage = usage,
+      .sharingMode = vk::SharingMode::eExclusive
+    };
+    vk::raii::Buffer stagingBuffer(device, createInfo);
+
+    const vk::MemoryRequirements requirements = stagingBuffer.getMemoryRequirements();
+
+    // Round the allocation up to a multiple of the assumed nonCoherentAtomSize so that mapped
+    // memory ranges can be flushed with an atom-aligned size
+    // (VUID-VkMappedMemoryRange-size-01390).
+    // TODO: 64 is only a typical value; the real limit should be queried from the device
+    // properties.
+    const vk::DeviceSize assumedAtomSize = 64;
+    const vk::DeviceSize remainder = requirements.size % assumedAtomSize;
+    const vk::DeviceSize paddedSize =
+      remainder == 0 ? requirements.size : requirements.size + (assumedAtomSize - remainder);
+
+    const uint32_t memoryType = findMemoryType(requirements.memoryTypeBits, properties);
+    vk::MemoryAllocateInfo memoryInfo{
+      .allocationSize = paddedSize,
+      .memoryTypeIndex = memoryType
+    };
+    vk::raii::DeviceMemory stagingMemory(device, memoryInfo);
+
+    // Attach the memory at offset 0
+    stagingBuffer.bindMemory(*stagingMemory, 0);
+
+    return {std::move(stagingBuffer), std::move(stagingMemory)};
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create staging buffer: " << e.what() << std::endl;
+    throw;
+  }
+}
+
+// Allocates one transparent-pass descriptor set per frame in flight and points binding 0 of
+// each set (combined image sampler) at that frame's off-screen opaque scene color image view.
+// Allocation and every descriptor update are performed while holding descriptorMutex.
+void Renderer::createTransparentDescriptorSets() {
+  // Same layout repeated once per frame in flight
+  const std::vector<vk::DescriptorSetLayout> perFrameLayouts(MAX_FRAMES_IN_FLIGHT, *transparentDescriptorSetLayout);
+  vk::DescriptorSetAllocateInfo setAllocInfo{
+    .descriptorPool = *descriptorPool,
+    .descriptorSetCount = static_cast<uint32_t>(MAX_FRAMES_IN_FLIGHT),
+    .pSetLayouts = perFrameLayouts.data()
+  };
+  {
+    // Serialize allocation vs other descriptor ops
+    std::lock_guard<std::mutex> guard(descriptorMutex);
+    transparentDescriptorSets = vk::raii::DescriptorSets(device, setAllocInfo);
+  }
+
+  for (size_t frame = 0; frame < MAX_FRAMES_IN_FLIGHT; ++frame) {
+    // Sample this frame's opaque color target in shader-read-only layout
+    vk::DescriptorImageInfo sceneColorInfo{
+      .sampler = *opaqueSceneColorSampler,
+      .imageView = *opaqueSceneColorImageViews[frame],
+      .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal
+    };
+
+    vk::WriteDescriptorSet sceneColorWrite{
+      .dstSet = *transparentDescriptorSets[frame],
+      .dstBinding = 0, // Binding 0 in Set 1
+      .descriptorCount = 1,
+      .descriptorType = vk::DescriptorType::eCombinedImageSampler,
+      .pImageInfo = &sceneColorInfo
+    };
+
+    // Held until the end of this iteration
+    std::lock_guard<std::mutex> guard(descriptorMutex);
+    device.updateDescriptorSets(sceneColorWrite, nullptr);
+  }
+}
+
+// Allocates the fallback descriptor sets (one per frame in flight, same layout as the
+// transparent sets: a single combined image sampler at binding 0) and binds every one of them
+// to the default texture. Allocation and updates are performed while holding descriptorMutex.
+void Renderer::createTransparentFallbackDescriptorSets() {
+  const std::vector<vk::DescriptorSetLayout> perFrameLayouts(MAX_FRAMES_IN_FLIGHT, *transparentDescriptorSetLayout);
+  vk::DescriptorSetAllocateInfo setAllocInfo{
+    .descriptorPool = *descriptorPool,
+    .descriptorSetCount = static_cast<uint32_t>(MAX_FRAMES_IN_FLIGHT),
+    .pSetLayouts = perFrameLayouts.data()
+  };
+  {
+    std::lock_guard<std::mutex> guard(descriptorMutex);
+    transparentFallbackDescriptorSets = vk::raii::DescriptorSets(device, setAllocInfo);
+  }
+
+  // Every set references the same default texture, which is guaranteed to be in
+  // SHADER_READ_ONLY_OPTIMAL when used in the opaque pass, so the image info is built once.
+  vk::DescriptorImageInfo defaultTextureInfo{
+    .sampler = *defaultTextureResources.textureSampler,
+    .imageView = *defaultTextureResources.textureImageView,
+    .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal
+  };
+
+  for (size_t frame = 0; frame < MAX_FRAMES_IN_FLIGHT; ++frame) {
+    vk::WriteDescriptorSet fallbackWrite{
+      .dstSet = *transparentFallbackDescriptorSets[frame],
+      .dstBinding = 0,
+      .descriptorCount = 1,
+      .descriptorType = vk::DescriptorType::eCombinedImageSampler,
+      .pImageInfo = &defaultTextureInfo
+    };
+
+    // Held until the end of this iteration
+    std::lock_guard<std::mutex> guard(descriptorMutex);
+    device.updateDescriptorSets(fallbackWrite, nullptr);
+  }
+}
+
+// (Re)creates the off-screen opaque scene color targets: one pooled image, image view and
+// tracked layout per frame in flight, sized and formatted like the swapchain, plus a single
+// linear clamp-to-edge sampler shared by all frames.
+// Returns true on success; on any std::exception the error is logged and false is returned.
+bool Renderer::createOpaqueSceneColorResources() {
+  try {
+    // Release whatever a previous call created
+    opaqueSceneColorImages.clear();
+    opaqueSceneColorImageAllocations.clear();
+    opaqueSceneColorImageViews.clear();
+    opaqueSceneColorImageLayouts.clear();
+
+    opaqueSceneColorImages.reserve(MAX_FRAMES_IN_FLIGHT);
+    opaqueSceneColorImageAllocations.reserve(MAX_FRAMES_IN_FLIGHT);
+    opaqueSceneColorImageViews.reserve(MAX_FRAMES_IN_FLIGHT);
+    opaqueSceneColorImageLayouts.reserve(MAX_FRAMES_IN_FLIGHT);
+
+    // Rendered to, sampled from, and used as a transfer source
+    const vk::ImageUsageFlags colorTargetUsage =
+      vk::ImageUsageFlagBits::eColorAttachment | vk::ImageUsageFlagBits::eSampled | vk::ImageUsageFlagBits::eTransferSrc;
+
+    for (uint32_t frame = 0; frame < MAX_FRAMES_IN_FLIGHT; ++frame) {
+      // Same extent and format as the swapchain
+      auto [colorImage, colorAllocation] = createImagePooled(
+        swapChainExtent.width,
+        swapChainExtent.height,
+        swapChainImageFormat,
+        vk::ImageTiling::eOptimal,
+        colorTargetUsage,
+        vk::MemoryPropertyFlagBits::eDeviceLocal);
+
+      opaqueSceneColorImages.push_back(std::move(colorImage));
+      opaqueSceneColorImageAllocations.push_back(std::move(colorAllocation));
+
+      // The view must reference the image now owned by the vector
+      auto& storedImage = opaqueSceneColorImages.back();
+      opaqueSceneColorImageViews.push_back(createImageView(storedImage, swapChainImageFormat, vk::ImageAspectFlagBits::eColor));
+      opaqueSceneColorImageLayouts.push_back(vk::ImageLayout::eUndefined);
+    }
+
+    // Create (or recreate) the sampler shared across frames
+    vk::SamplerCreateInfo samplerInfo{};
+    samplerInfo.magFilter = vk::Filter::eLinear;
+    samplerInfo.minFilter = vk::Filter::eLinear;
+    samplerInfo.addressModeU = vk::SamplerAddressMode::eClampToEdge;
+    samplerInfo.addressModeV = vk::SamplerAddressMode::eClampToEdge;
+    samplerInfo.addressModeW = vk::SamplerAddressMode::eClampToEdge;
+    opaqueSceneColorSampler = vk::raii::Sampler(device, samplerInfo);
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create opaque scene color resources: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Copies the first `size` bytes of srcBuffer into dstBuffer (both at offset 0).
+//
+// The copy is recorded into a one-time command buffer from a temporary transient pool created
+// for the transfer queue family, submitted to transferQueue while holding queueMutex, and then
+// waited on through a fence before returning. Any std::exception is logged and rethrown.
+void Renderer::copyBuffer(vk::raii::Buffer& srcBuffer, vk::raii::Buffer& dstBuffer, vk::DeviceSize size) {
+  ensureThreadLocalVulkanInit();
+  try {
+    // Private pool + command buffer so this call does not share command state with other threads
+    vk::CommandPoolCreateInfo transientPoolInfo{
+      .flags = vk::CommandPoolCreateFlagBits::eTransient | vk::CommandPoolCreateFlagBits::eResetCommandBuffer,
+      .queueFamilyIndex = queueFamilyIndices.transferFamily.value()
+    };
+    vk::raii::CommandPool transientPool(device, transientPoolInfo);
+
+    vk::CommandBufferAllocateInfo cmdAllocInfo{
+      .commandPool = *transientPool,
+      .level = vk::CommandBufferLevel::ePrimary,
+      .commandBufferCount = 1
+    };
+    vk::raii::CommandBuffers allocated(device, cmdAllocInfo);
+    vk::raii::CommandBuffer& cmd = allocated[0];
+
+    // Record: begin -> single copy region -> end
+    cmd.begin(vk::CommandBufferBeginInfo{.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit});
+
+    const vk::BufferCopy wholeRange{
+      .srcOffset = 0,
+      .dstOffset = 0,
+      .size = size
+    };
+    cmd.copyBuffer(*srcBuffer, *dstBuffer, wholeRange);
+
+    cmd.end();
+
+    vk::SubmitInfo submission{
+      .commandBufferCount = 1,
+      .pCommandBuffers = &*cmd
+    };
+
+    // Queue access is guarded by queueMutex; the fence wait happens outside the lock
+    vk::raii::Fence copyDone(device, vk::FenceCreateInfo{});
+    {
+      std::lock_guard<std::mutex> guard(queueMutex);
+      transferQueue.submit(submission, *copyDone);
+    }
+    (void) waitForFencesSafe(*copyDone, VK_TRUE);
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to copy buffer: " << e.what() << std::endl;
+    throw;
+  }
+}
+
+// Creates a 2D image (single mip level, single array layer, one sample, exclusive sharing,
+// initial layout UNDEFINED) and binds it to a dedicated device memory allocation.
+//
+// Returns the image together with the memory that backs it. Any std::exception is logged to
+// stderr and rethrown.
+std::pair<vk::raii::Image, vk::raii::DeviceMemory> Renderer::createImage(
+  uint32_t width,
+  uint32_t height,
+  vk::Format format,
+  vk::ImageTiling tiling,
+  vk::ImageUsageFlags usage,
+  vk::MemoryPropertyFlags properties) {
+  try {
+    vk::ImageCreateInfo createInfo{
+      .imageType = vk::ImageType::e2D,
+      .format = format,
+      .extent = {width, height, 1},
+      .mipLevels = 1,
+      .arrayLayers = 1,
+      .samples = vk::SampleCountFlagBits::e1,
+      .tiling = tiling,
+      .usage = usage,
+      .sharingMode = vk::SharingMode::eExclusive,
+      .initialLayout = vk::ImageLayout::eUndefined
+    };
+    vk::raii::Image newImage(device, createInfo);
+
+    // Size the allocation from the image's own requirements and pick a matching memory type
+    const vk::MemoryRequirements requirements = newImage.getMemoryRequirements();
+    const uint32_t memoryType = findMemoryType(requirements.memoryTypeBits, properties);
+    vk::MemoryAllocateInfo memoryInfo{
+      .allocationSize = requirements.size,
+      .memoryTypeIndex = memoryType
+    };
+    vk::raii::DeviceMemory newMemory(device, memoryInfo);
+
+    // Attach the memory at offset 0
+    newImage.bindMemory(*newMemory, 0);
+
+    return {std::move(newImage), std::move(newMemory)};
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create image: " << e.what() << std::endl;
+    throw;
+  }
+}
+
+// Creates an image whose memory comes from the renderer's memory pool.
+//
+// All arguments are forwarded unchanged to memoryPool->createImage. Returns the image together
+// with the pool allocation that backs it. Any std::exception (including the
+// "Memory pool not initialized" error raised here) is logged to stderr and rethrown.
+std::pair<vk::raii::Image, std::unique_ptr<MemoryPool::Allocation>> Renderer::createImagePooled(
+  uint32_t width,
+  uint32_t height,
+  vk::Format format,
+  vk::ImageTiling tiling,
+  vk::ImageUsageFlags usage,
+  vk::MemoryPropertyFlags properties,
+  uint32_t mipLevels,
+  vk::SharingMode sharingMode,
+  const std::vector<uint32_t>& queueFamilies) {
+  try {
+    // The pool check stays inside the try block so its failure is logged like any other
+    if (!memoryPool) {
+      throw std::runtime_error("Memory pool not initialized");
+    }
+
+    auto [pooledImage, pooledAllocation] = memoryPool->createImage(
+      width, height, format, tiling, usage, properties, mipLevels, sharingMode, queueFamilies);
+
+    return {std::move(pooledImage), std::move(pooledAllocation)};
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create image with memory pool: " << e.what() << std::endl;
+    throw;
+  }
+}
+
+// Creates a 2D image view over `image` covering mip levels [0, mipLevels) of array layer 0
+// for the given aspect(s). Any std::exception is logged to stderr and rethrown.
+vk::raii::ImageView Renderer::createImageView(vk::raii::Image& image, vk::Format format, vk::ImageAspectFlags aspectFlags, uint32_t mipLevels) {
+  try {
+    ensureThreadLocalVulkanInit();
+
+    // Sub-range of the image exposed through the view
+    const vk::ImageSubresourceRange viewRange{
+      .aspectMask = aspectFlags,
+      .baseMipLevel = 0,
+      .levelCount = mipLevels,
+      .baseArrayLayer = 0,
+      .layerCount = 1
+    };
+
+    vk::ImageViewCreateInfo createInfo{
+      .image = *image,
+      .viewType = vk::ImageViewType::e2D,
+      .format = format,
+      .subresourceRange = viewRange
+    };
+
+    return vk::raii::ImageView(device, createInfo);
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create image view: " << e.what() << std::endl;
+    throw;
+  }
+}
+
+// Transition image layout
+//
+// Records one Synchronization2 image memory barrier that moves mip levels [0, mipLevels) of
+// array layer 0 of `image` from `oldLayout` to `newLayout`, submits it on the graphics queue
+// (under queueMutex) and blocks until the submission's fence has been waited on.
+//
+// Parameters:
+//   image     - image to transition
+//   format    - format of the image; used only to derive the barrier's aspect mask
+//   oldLayout - layout the image is currently in
+//   newLayout - layout to transition to
+//   mipLevels - number of mip levels (starting at level 0) covered by the barrier
+//
+// Supported transitions:
+//   Undefined          -> TransferDstOptimal
+//   TransferDstOptimal -> ShaderReadOnlyOptimal
+//   Undefined          -> DepthStencilAttachmentOptimal
+//   Undefined          -> DepthStencilReadOnlyOptimal
+//   Undefined          -> General
+//   Undefined          -> ShaderReadOnlyOptimal
+// Any other pair raises std::invalid_argument. All std::exceptions are logged and rethrown.
+//
+// When the uploads timeline semaphore exists, the submission also signals it with the next
+// value taken from uploadTimelineLastSubmitted.
+void Renderer::transitionImageLayout(vk::Image image, vk::Format format, vk::ImageLayout oldLayout, vk::ImageLayout newLayout, uint32_t mipLevels) {
+  ensureThreadLocalVulkanInit();
+  try {
+    // Create a temporary transient command pool and command buffer to isolate per-thread usage
+    vk::CommandPoolCreateInfo poolInfo{
+      .flags = vk::CommandPoolCreateFlagBits::eTransient | vk::CommandPoolCreateFlagBits::eResetCommandBuffer,
+      .queueFamilyIndex = queueFamilyIndices.graphicsFamily.value()
+    };
+    vk::raii::CommandPool tempPool(device, poolInfo);
+    vk::CommandBufferAllocateInfo allocInfo{
+      .commandPool = *tempPool,
+      .level = vk::CommandBufferLevel::ePrimary,
+      .commandBufferCount = 1
+    };
+
+    vk::raii::CommandBuffers commandBuffers(device, allocInfo);
+    vk::raii::CommandBuffer& commandBuffer = commandBuffers[0];
+
+    // Begin command buffer
+    vk::CommandBufferBeginInfo beginInfo{
+      .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit
+    };
+
+    commandBuffer.begin(beginInfo);
+
+    // Derive the aspect mask from the format. Combined depth/stencil formats must name BOTH
+    // aspects in a layout-transition barrier unless separateDepthStencilLayouts is enabled,
+    // and every depth-only / stencil-only format must not be treated as a color image.
+    vk::ImageAspectFlags aspectMask{};
+    switch (format) {
+      case vk::Format::eD16Unorm:
+      case vk::Format::eX8D24UnormPack32:
+      case vk::Format::eD32Sfloat:
+        aspectMask = vk::ImageAspectFlagBits::eDepth;
+        break;
+      case vk::Format::eD16UnormS8Uint:
+      case vk::Format::eD24UnormS8Uint:
+      case vk::Format::eD32SfloatS8Uint:
+        aspectMask = vk::ImageAspectFlagBits::eDepth | vk::ImageAspectFlagBits::eStencil;
+        break;
+      case vk::Format::eS8Uint:
+        aspectMask = vk::ImageAspectFlagBits::eStencil;
+        break;
+      default:
+        aspectMask = vk::ImageAspectFlagBits::eColor;
+        break;
+    }
+
+    // Create an image barrier (Sync2)
+    vk::ImageMemoryBarrier2 barrier2{
+      .oldLayout = oldLayout,
+      .newLayout = newLayout,
+      .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+      .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+      .image = image,
+      .subresourceRange = {
+        .aspectMask = aspectMask,
+        .baseMipLevel = 0,
+        .levelCount = mipLevels,
+        .baseArrayLayer = 0,
+        .layerCount = 1
+      }
+    };
+
+    // Set stage and access masks based on layouts
+    if (oldLayout == vk::ImageLayout::eUndefined && newLayout == vk::ImageLayout::eTransferDstOptimal) {
+      barrier2.srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe;
+      barrier2.srcAccessMask = vk::AccessFlagBits2::eNone;
+      barrier2.dstStageMask = vk::PipelineStageFlagBits2::eTransfer;
+      barrier2.dstAccessMask = vk::AccessFlagBits2::eTransferWrite;
+    } else if (oldLayout == vk::ImageLayout::eTransferDstOptimal && newLayout == vk::ImageLayout::eShaderReadOnlyOptimal) {
+      barrier2.srcStageMask = vk::PipelineStageFlagBits2::eTransfer;
+      barrier2.srcAccessMask = vk::AccessFlagBits2::eTransferWrite;
+      barrier2.dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader;
+      barrier2.dstAccessMask = vk::AccessFlagBits2::eShaderRead;
+    } else if (oldLayout == vk::ImageLayout::eUndefined && newLayout == vk::ImageLayout::eDepthStencilAttachmentOptimal) {
+      barrier2.srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe;
+      barrier2.srcAccessMask = vk::AccessFlagBits2::eNone;
+      barrier2.dstStageMask = vk::PipelineStageFlagBits2::eEarlyFragmentTests;
+      barrier2.dstAccessMask = vk::AccessFlagBits2::eDepthStencilAttachmentRead | vk::AccessFlagBits2::eDepthStencilAttachmentWrite;
+    } else if (oldLayout == vk::ImageLayout::eUndefined && newLayout == vk::ImageLayout::eDepthStencilReadOnlyOptimal) {
+      // Support for shadow map creation: transition from undefined to read-only depth layout
+      barrier2.srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe;
+      barrier2.srcAccessMask = vk::AccessFlagBits2::eNone;
+      barrier2.dstStageMask = vk::PipelineStageFlagBits2::eEarlyFragmentTests;
+      barrier2.dstAccessMask = vk::AccessFlagBits2::eDepthStencilAttachmentRead;
+    } else if (oldLayout == vk::ImageLayout::eUndefined && newLayout == vk::ImageLayout::eGeneral) {
+      // Support for compute shader storage images: transition from undefined to general layout
+      barrier2.srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe;
+      barrier2.srcAccessMask = vk::AccessFlagBits2::eNone;
+      barrier2.dstStageMask = vk::PipelineStageFlagBits2::eComputeShader;
+      barrier2.dstAccessMask = vk::AccessFlagBits2::eShaderWrite | vk::AccessFlagBits2::eShaderRead;
+    } else if (oldLayout == vk::ImageLayout::eUndefined && newLayout == vk::ImageLayout::eShaderReadOnlyOptimal) {
+      // Support for textures that skip staging buffer (e.g., preloaded, generated, or default textures)
+      // Transition directly from undefined to shader read-only for sampling
+      barrier2.srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe;
+      barrier2.srcAccessMask = vk::AccessFlagBits2::eNone;
+      barrier2.dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader;
+      barrier2.dstAccessMask = vk::AccessFlagBits2::eShaderRead;
+    } else {
+      throw std::invalid_argument("Unsupported layout transition!");
+    }
+
+    // Add a barrier to command buffer (Sync2)
+    vk::DependencyInfo depInfo{
+      .dependencyFlags = vk::DependencyFlagBits::eByRegion,
+      .imageMemoryBarrierCount = 1,
+      .pImageMemoryBarriers = &barrier2
+    };
+    commandBuffer.pipelineBarrier2(depInfo);
+    std::cout << "[transitionImageLayout] recorded barrier image=" << (void *) image << " old=" << static_cast<int>(oldLayout) << " new=" << static_cast<int>(newLayout) << std::endl;
+
+    // End command buffer
+    commandBuffer.end();
+
+    vk::raii::Fence fence(device, vk::FenceCreateInfo{});
+    bool canSignalTimeline = !!*uploadsTimeline;
+    uint64_t signalValue = 0; {
+      // The timeline value is reserved and the submit is issued under the same lock so that
+      // signal values reach the queue in increasing order.
+      std::lock_guard<std::mutex> lock(queueMutex);
+      vk::SubmitInfo submitInfo{};
+      vk::TimelineSemaphoreSubmitInfo timelineInfo{}; // keep alive through submit
+      if (canSignalTimeline) {
+        signalValue = uploadTimelineLastSubmitted.fetch_add(1, std::memory_order_relaxed) + 1;
+        timelineInfo.signalSemaphoreValueCount = 1;
+        timelineInfo.pSignalSemaphoreValues = &signalValue;
+        submitInfo.pNext = &timelineInfo;
+        submitInfo.signalSemaphoreCount = 1;
+        submitInfo.pSignalSemaphores = &*uploadsTimeline;
+      }
+      submitInfo.commandBufferCount = 1;
+      submitInfo.pCommandBuffers = &*commandBuffer;
+      graphicsQueue.submit(submitInfo, *fence);
+    }
+    // Block until the transition has been waited on; the temporary pool is destroyed on return
+    (void) waitForFencesSafe(*fence, VK_TRUE);
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to transition image layout: " << e.what() << std::endl;
+    throw;
+  }
+}
+
+// Copy buffer to image
+//
+// Records a buffer-to-image copy of `regions` from `buffer` into `image` (which must already be
+// in TRANSFER_DST_OPTIMAL layout, the layout passed to the copy command), submits it on the
+// graphics queue under queueMutex and blocks until the submission's fence has been waited on.
+// When the uploads timeline semaphore exists, the submission also signals it with the next
+// value taken from uploadTimelineLastSubmitted.
+//
+// Note: `width` and `height` are not referenced in the body; the copied area is defined
+// entirely by `regions`.
+// Any std::exception is logged to stderr and rethrown.
+void Renderer::copyBufferToImage(vk::Buffer buffer, vk::Image image, uint32_t width, uint32_t height, vk::ArrayProxy<const vk::BufferImageCopy> regions) {
+  ensureThreadLocalVulkanInit();
+  try {
+    // Create a temporary transient command pool for the GRAPHICS queue to avoid cross-queue races
+    vk::CommandPoolCreateInfo poolInfo{
+      .flags = vk::CommandPoolCreateFlagBits::eTransient | vk::CommandPoolCreateFlagBits::eResetCommandBuffer,
+      .queueFamilyIndex = queueFamilyIndices.graphicsFamily.value()
+    };
+    vk::raii::CommandPool tempPool(device, poolInfo);
+    vk::CommandBufferAllocateInfo allocInfo{
+      .commandPool = *tempPool,
+      .level = vk::CommandBufferLevel::ePrimary,
+      .commandBufferCount = 1
+    };
+
+    vk::raii::CommandBuffers commandBuffers(device, allocInfo);
+    vk::raii::CommandBuffer& commandBuffer = commandBuffers[0];
+
+    // Begin command buffer
+    vk::CommandBufferBeginInfo beginInfo{
+      .flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit
+    };
+
+    commandBuffer.begin(beginInfo);
+
+    // Copy buffer to image using provided regions
+    commandBuffer.copyBufferToImage(
+      buffer,
+      image,
+      vk::ImageLayout::eTransferDstOptimal,
+      regions);
+    std::cout << "[copyBufferToImage] recorded copy img=" << (void *) image << std::endl;
+
+    // End command buffer
+    commandBuffer.end();
+
+    // Fence used to wait for this submission; created unsignaled
+    vk::raii::Fence fence(device, vk::FenceCreateInfo{});
+    // Only signal the uploads timeline if the semaphore handle is non-null
+    bool canSignalTimeline = !!*uploadsTimeline;
+    uint64_t signalValue = 0; {
+      // The timeline value is reserved and the submit is issued under the same queueMutex lock
+      std::lock_guard<std::mutex> lock(queueMutex);
+      vk::SubmitInfo submitInfo{};
+      vk::TimelineSemaphoreSubmitInfo timelineInfo{}; // keep alive through submit
+      if (canSignalTimeline) {
+        // Next timeline value = previous counter value + 1
+        signalValue = uploadTimelineLastSubmitted.fetch_add(1, std::memory_order_relaxed) + 1;
+        timelineInfo.signalSemaphoreValueCount = 1;
+        timelineInfo.pSignalSemaphoreValues = &signalValue;
+        submitInfo.pNext = &timelineInfo;
+        submitInfo.signalSemaphoreCount = 1;
+        submitInfo.pSignalSemaphores = &*uploadsTimeline;
+      }
+      submitInfo.commandBufferCount = 1;
+      submitInfo.pCommandBuffers = &*commandBuffer;
+      graphicsQueue.submit(submitInfo, *fence);
+    }
+    // Wait outside the queue lock; the result of the wait is deliberately discarded
+    (void) waitForFencesSafe(*fence, VK_TRUE);
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to copy buffer to image: " << e.what() << std::endl;
+    throw;
+  }
+}
+
+// Create or resize the per-frame light storage buffers so each can hold at least
+// `lightCount` LightData elements.
+//
+// Returns true when no reallocation was needed or when reallocation succeeded, and
+// false when an exception was thrown along the way. A reallocation replaces the
+// buffers of every frame in flight, resets their `size` to 0, and rewrites the
+// descriptors that reference them.
+bool Renderer::createOrResizeLightStorageBuffers(size_t lightCount) {
+  try {
+    // Ensure we have one storage-buffer slot per frame in flight
+    if (lightStorageBuffers.size() != MAX_FRAMES_IN_FLIGHT) {
+      lightStorageBuffers.resize(MAX_FRAMES_IN_FLIGHT);
+    }
+
+    // A slot must be (re)created when it is too small OR when it has no buffer yet.
+    // Checking capacity alone misses the lightCount == 0 case, where an unallocated
+    // slot would be reported as "large enough" even though nothing is mapped.
+    const bool needsResize = std::any_of(
+      lightStorageBuffers.begin(),
+      lightStorageBuffers.end(),
+      [lightCount](const auto& slot) { return slot.capacity < lightCount || !*slot.buffer; });
+
+    if (!needsResize) {
+      return true; // Every slot already has a large-enough buffer
+    }
+
+    // Calculate new capacity (with some headroom for growth, never fewer than 64 lights)
+    const size_t newCapacity = std::max(lightCount * 2, static_cast<size_t>(64));
+    const vk::DeviceSize bufferSize = sizeof(LightData) * newCapacity;
+
+    // Wait for device to be idle before destroying old buffers to prevent validation errors.
+    // External synchronization required (VVL): serialize against queue submits/present.
+    WaitIdle();
+
+    // Replace the buffer of every frame in flight
+    for (auto& slot : lightStorageBuffers) {
+      // Release the old buffer first, then its allocation (safe after WaitIdle)
+      if (!!slot.allocation) {
+        slot.buffer = vk::raii::Buffer(nullptr);
+        slot.allocation.reset();
+        slot.mapped = nullptr;
+      }
+
+      // Host-visible + coherent storage buffer; the pool hands back its mapped pointer
+      auto [newBuffer, newAllocation] = createBufferPooled(
+        bufferSize,
+        vk::BufferUsageFlagBits::eStorageBuffer,
+        vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+
+      void* mapped = newAllocation->mappedPtr;
+
+      // Zero-fill so elements beyond `size` never hold garbage (matches the dummy-buffer path)
+      if (mapped) {
+        std::memset(mapped, 0, static_cast<size_t>(bufferSize));
+      }
+
+      slot.buffer = std::move(newBuffer);
+      slot.allocation = std::move(newAllocation);
+      slot.mapped = mapped;
+      slot.capacity = newCapacity;
+      slot.size = 0;
+    }
+
+    // Update all existing descriptor sets to reference the new light storage buffers
+    updateAllDescriptorSetsWithNewLightBuffers();
+
+    // Also refresh Forward+ compute descriptor sets (binding 0) so compute reads valid buffers.
+    // A failure here is logged but does not fail the resize itself.
+    try {
+      const size_t frameCount = std::min(forwardPlusPerFrame.size(), lightStorageBuffers.size());
+      for (size_t i = 0; i < frameCount; ++i) {
+        // Skip frames whose compute set or light buffer is not available
+        if (!*forwardPlusPerFrame[i].computeSet || !*lightStorageBuffers[i].buffer) {
+          continue;
+        }
+        vk::DescriptorBufferInfo lightsInfo{.buffer = *lightStorageBuffers[i].buffer, .offset = 0, .range = VK_WHOLE_SIZE};
+        vk::WriteDescriptorSet write{
+          .dstSet = *forwardPlusPerFrame[i].computeSet,
+          .dstBinding = 0,
+          .dstArrayElement = 0,
+          .descriptorCount = 1,
+          .descriptorType = vk::DescriptorType::eStorageBuffer,
+          .pBufferInfo = &lightsInfo
+        };
+        std::lock_guard<std::mutex> lk(descriptorMutex);
+        device.updateDescriptorSets(write, {});
+      }
+    } catch (const std::exception& e) {
+      std::cerr << "Failed to update Forward+ compute descriptors after light buffer resize: " << e.what() << std::endl;
+    }
+
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create or resize light storage buffers: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Rebind the light storage buffer (binding 6) in every entity's PBR descriptor sets.
+//
+// When `allFrames` is true, every per-frame set is rewritten; otherwise only the set
+// belonging to `currentFrame`. The call is a no-op while descriptor sets are flagged
+// invalid or while a command buffer is being recorded. Exceptions are logged, not
+// propagated.
+void Renderer::updateAllDescriptorSetsWithNewLightBuffers(bool allFrames) {
+  try {
+    if (!descriptorSetsValid.load(std::memory_order_relaxed) || isRecordingCmd.load(std::memory_order_relaxed)) {
+      return;
+    }
+
+    for (auto& entry : entityResources) {
+      // Only PBR descriptor sets carry a light buffer binding
+      auto& pbrSets = entry.second.pbrDescriptorSets;
+      if (pbrSets.empty()) {
+        continue;
+      }
+
+      // Frame range to touch, clamped to the sets and light buffers that actually exist
+      const size_t firstFrame = allFrames ? 0 : static_cast<size_t>(currentFrame);
+      const size_t requestedEnd = allFrames ? pbrSets.size() : (firstFrame + 1);
+      const size_t available = std::min<size_t>(pbrSets.size(), lightStorageBuffers.size());
+      const size_t lastFrame = std::min<size_t>(requestedEnd, available);
+
+      for (size_t i = firstFrame; i < lastFrame; ++i) {
+        // Skip sets that look invalid/uninitialized and frames without a light buffer
+        if (!(*pbrSets[i]) || !*lightStorageBuffers[i].buffer) {
+          continue;
+        }
+
+        // Descriptor write for the light storage buffer (binding 6)
+        vk::DescriptorBufferInfo lightBufferInfo{
+          .buffer = *lightStorageBuffers[i].buffer,
+          .offset = 0,
+          .range = VK_WHOLE_SIZE
+        };
+        vk::WriteDescriptorSet descriptorWrite{
+          .dstSet = *pbrSets[i],
+          .dstBinding = 6,
+          .dstArrayElement = 0,
+          .descriptorCount = 1,
+          .descriptorType = vk::DescriptorType::eStorageBuffer,
+          .pBufferInfo = &lightBufferInfo
+        };
+
+        // Descriptor updates are serialized through descriptorMutex
+        std::lock_guard<std::mutex> lk(descriptorMutex);
+        device.updateDescriptorSets(descriptorWrite, {});
+      }
+    }
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to update descriptor sets with new light buffers: " << e.what() << std::endl;
+  }
+}
+
+// Refresh only the given frame's PBR descriptor bindings used by Forward+ (bindings 6-13).
+// Safe to call after waiting on inFlightFences[currentFrame] and before command recording.
+//
+// For every entity whose PBR set for `frameIndex` is valid, this writes:
+//    6 lights SSBO, 7 tile headers, 8 tile light indices,
+//    9 fragment debug buffer (only when it exists),
+//   10 reflection sampler (always the default fallback texture),
+//   11 TLAS, 12 geometry info buffer, 13 material buffer.
+// Missing lights / tile buffers for this frame are lazily replaced by small zero-filled
+// dummy buffers so the bindings always have something to point at.
+// The call is a no-op when `frameIndex` is out of range, descriptor sets are flagged
+// invalid, or a command buffer is being recorded. Exceptions are logged, not propagated.
+void Renderer::refreshPBRForwardPlusBindingsForFrame(uint32_t frameIndex) {
+  try {
+    if (frameIndex >= MAX_FRAMES_IN_FLIGHT)
+      return;
+    if (!descriptorSetsValid.load(std::memory_order_relaxed))
+      return;
+    if (isRecordingCmd.load(std::memory_order_relaxed))
+      return;
+
+    // Creates a host-visible storage buffer of `byteSize` bytes, stores it (and its
+    // allocation) in the given destinations and zero-fills it when it is mapped.
+    auto createZeroedDummy = [this](auto& dstBuffer, auto& dstAlloc, vk::DeviceSize byteSize) {
+      auto [buf, alloc] = createBufferPooled(byteSize,
+                                             vk::BufferUsageFlagBits::eStorageBuffer,
+                                             vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+      dstBuffer = std::move(buf);
+      dstAlloc = std::move(alloc);
+      if (!!dstAlloc && dstAlloc->mappedPtr) {
+        std::memset(dstAlloc->mappedPtr, 0, static_cast<size_t>(byteSize));
+      }
+    };
+
+    // Binding 6: lights buffer - use the real one, or lazily create a single-element dummy
+    // (used when Forward+ is disabled and no lights were uploaded yet)
+    vk::Buffer lightsBuf{};
+    if (frameIndex < lightStorageBuffers.size() && !!*lightStorageBuffers[frameIndex].buffer) {
+      lightsBuf = *lightStorageBuffers[frameIndex].buffer;
+    } else {
+      if (lightStorageBuffers.empty()) {
+        lightStorageBuffers.resize(MAX_FRAMES_IN_FLIGHT);
+      }
+      if (frameIndex < lightStorageBuffers.size()) {
+        auto& slot = lightStorageBuffers[frameIndex];
+        if (!*slot.buffer) {
+          createZeroedDummy(slot.buffer, slot.allocation, sizeof(LightData)); // Single light element
+          slot.mapped = slot.allocation->mappedPtr;
+          slot.capacity = 1;
+          slot.size = 0;
+        }
+        if (!!*slot.buffer) {
+          lightsBuf = *slot.buffer;
+        }
+      }
+    }
+
+    // Bindings 7/8/9: Forward+ per-frame buffers. Tile headers and tile light indices get a
+    // minimal dummy when missing; the debug buffer is optional and never created here.
+    vk::Buffer headersBuf{};
+    vk::Buffer indicesBuf{};
+    vk::DescriptorBufferInfo fragDbgInfo{};
+    if (forwardPlusPerFrame.empty()) {
+      forwardPlusPerFrame.resize(MAX_FRAMES_IN_FLIGHT);
+    }
+    if (frameIndex < forwardPlusPerFrame.size()) {
+      auto& fpf = forwardPlusPerFrame[frameIndex];
+      const vk::DeviceSize dummySize = sizeof(uint32_t) * 4; // One TileHeader {offset, count, pad0, pad1} / 4 uints
+      if (!*fpf.tileHeaders) {
+        createZeroedDummy(fpf.tileHeaders, fpf.tileHeadersAlloc, dummySize);
+      }
+      if (!*fpf.tileLightIndices) {
+        createZeroedDummy(fpf.tileLightIndices, fpf.tileLightIndicesAlloc, dummySize);
+      }
+      headersBuf = *fpf.tileHeaders;
+      indicesBuf = *fpf.tileLightIndices;
+      // Fragment debug buffer (reuses the compute debugOut buffer)
+      if (!!*fpf.debugOut) {
+        fragDbgInfo = vk::DescriptorBufferInfo{.buffer = *fpf.debugOut, .offset = 0, .range = VK_WHOLE_SIZE};
+      }
+    }
+
+    // Buffer infos live for the whole function: the queued writes below store pointers to them.
+    vk::DescriptorBufferInfo lightsInfo{};
+    vk::DescriptorBufferInfo headersInfo{};
+    vk::DescriptorBufferInfo indicesInfo{};
+    if (!!lightsBuf) {
+      lightsInfo = vk::DescriptorBufferInfo{.buffer = lightsBuf, .offset = 0, .range = VK_WHOLE_SIZE};
+    }
+    if (!!headersBuf) {
+      headersInfo = vk::DescriptorBufferInfo{.buffer = headersBuf, .offset = 0, .range = VK_WHOLE_SIZE};
+    }
+    if (!!indicesBuf) {
+      indicesInfo = vk::DescriptorBufferInfo{.buffer = indicesBuf, .offset = 0, .range = VK_WHOLE_SIZE};
+    }
+
+    // Bindings 12/13: ray-query geometry/material buffers for material-aware raster shadow
+    // queries. Fall back to a tile buffer so something valid is bound; the shader guards on
+    // `ubo.geometryInfoCount/materialCount`. These do not depend on the entity, so they are
+    // resolved once instead of per loop iteration.
+    const vk::Buffer fallbackBuf = headersBuf ? headersBuf : indicesBuf;
+    const vk::Buffer geoBuf = (!!*geometryInfoBuffer) ? *geometryInfoBuffer : fallbackBuf;
+    const vk::Buffer matBuf = (!!*materialBuffer) ? *materialBuffer : fallbackBuf;
+    vk::DescriptorBufferInfo geoInfoInfo{.buffer = geoBuf, .offset = 0, .range = VK_WHOLE_SIZE};
+    vk::DescriptorBufferInfo matInfoInfo{.buffer = matBuf, .offset = 0, .range = VK_WHOLE_SIZE};
+
+    // Binding 10: reflection sampler - always bind the fallback texture while the reflection
+    // pass is disabled (it is commented out in renderer_rendering.cpp), so reflection RTs that
+    // may exist but contain stale data are never sampled. When reflection rendering is
+    // re-enabled, restore the conditional logic to bind the previous frame's RT.
+    vk::DescriptorImageInfo reflInfo{.sampler = *defaultTextureResources.textureSampler, .imageView = *defaultTextureResources.textureImageView, .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal};
+
+    // Binding 11: TLAS (for raster ray-query shadows)
+    // Raster PBR shaders can statically declare/use `tlas` even when ray-query mode is disabled,
+    // so the descriptor is written for every set; the handle is null when acceleration
+    // structures are disabled or the TLAS is not built yet.
+    // NOTE(review): writing a null acceleration structure handle is only valid with the
+    // nullDescriptor feature - confirm it is enabled at device creation.
+    vk::AccelerationStructureKHR tlasHandleValue = accelerationStructureEnabled ? *tlasStructure.handle : vk::AccelerationStructureKHR{};
+    vk::WriteDescriptorSetAccelerationStructureKHR tlasInfo{};
+    tlasInfo.accelerationStructureCount = 1;
+    tlasInfo.pAccelerationStructures = &tlasHandleValue;
+
+    std::vector<vk::WriteDescriptorSet> writes;
+    writes.reserve(entityResources.size() * 8); // at most bindings 6..13 per entity
+
+    // Queues one storage-buffer write; `info` must outlive the updateDescriptorSets call below.
+    auto pushStorageWrite = [&writes](vk::DescriptorSet set, uint32_t binding, const vk::DescriptorBufferInfo* info) {
+      writes.push_back(vk::WriteDescriptorSet{.dstSet = set, .dstBinding = binding, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = info});
+    };
+
+    for (auto& kv : entityResources) {
+      auto& res = kv.second;
+      if (res.pbrDescriptorSets.empty() || frameIndex >= res.pbrDescriptorSets.size())
+        continue;
+
+      // This prevents "Invalid VkDescriptorSet Object" errors when sets have been freed/invalidated
+      if (!(*res.pbrDescriptorSets[frameIndex])) {
+        std::cerr << "Warning: Invalid descriptor set handle for entity at frame " << frameIndex << ", skipping" << std::endl;
+        continue;
+      }
+      const vk::DescriptorSet set = *res.pbrDescriptorSets[frameIndex];
+
+      // Bindings 6-8 are required by the layout; they are skipped only if no buffer could be resolved
+      if (!!lightsBuf) {
+        pushStorageWrite(set, 6, &lightsInfo);
+      }
+      if (!!headersBuf) {
+        pushStorageWrite(set, 7, &headersInfo);
+      }
+      if (!!indicesBuf) {
+        pushStorageWrite(set, 8, &indicesInfo);
+      }
+      // Binding 9: fragment debug output buffer (optional - only bind if exists)
+      if (!!fragDbgInfo.buffer) {
+        pushStorageWrite(set, 9, &fragDbgInfo);
+      }
+      // Binding 10: reflection sampler - ALWAYS bind (required by layout)
+      writes.push_back(vk::WriteDescriptorSet{.dstSet = set, .dstBinding = 10, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eCombinedImageSampler, .pImageInfo = &reflInfo});
+
+      // Binding 11: TLAS - ALWAYS bind; the shader must not trace when disabled
+      vk::WriteDescriptorSet tlasWrite{};
+      tlasWrite.dstSet = set;
+      tlasWrite.dstBinding = 11;
+      tlasWrite.dstArrayElement = 0;
+      tlasWrite.descriptorCount = 1;
+      tlasWrite.descriptorType = vk::DescriptorType::eAccelerationStructureKHR;
+      tlasWrite.pNext = &tlasInfo;
+      writes.push_back(tlasWrite);
+
+      // Bindings 12/13: never queue a write that would reference a null buffer
+      if (!!geoBuf) {
+        pushStorageWrite(set, 12, &geoInfoInfo);
+      }
+      if (!!matBuf) {
+        pushStorageWrite(set, 13, &matInfoInfo);
+      }
+    }
+
+    if (!writes.empty()) {
+      std::lock_guard<std::mutex> lk(descriptorMutex);
+      device.updateDescriptorSets(writes, {});
+    }
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to refresh PBR Forward+ bindings for frame " << frameIndex << ": " << e.what() << std::endl;
+  }
+}
+
+// Upload the current light list into the light storage buffer of frame `frameIndex`.
+//
+// Each ExtractedLight is converted to the LightData layout: position (or direction for
+// directional lights), colour pre-multiplied by intensity, direction, a light-space matrix
+// for shadow mapping, a numeric type code and the range / cone angles.
+//
+// `camera` is optional; when given, the directional-light shadow frustum is centred on it.
+// Returns false when the buffers cannot be (re)sized, `frameIndex` is out of range, the
+// buffer is not mapped, or an exception is thrown.
+bool Renderer::updateLightStorageBuffer(uint32_t frameIndex, const std::vector<ExtractedLight>& lights, CameraComponent* camera) {
+  try {
+    // Ensure buffers are large enough and properly initialized
+    if (!createOrResizeLightStorageBuffers(lights.size())) {
+      return false;
+    }
+
+    // Now check frame index after buffers are properly initialized
+    if (frameIndex >= lightStorageBuffers.size()) {
+      std::cerr << "Invalid frame index for light storage buffer update: " << frameIndex
+          << " >= " << lightStorageBuffers.size() << std::endl;
+      return false;
+    }
+
+    auto& buffer = lightStorageBuffers[frameIndex];
+    if (!buffer.mapped) {
+      std::cerr << "Light storage buffer not mapped" << std::endl;
+      return false;
+    }
+
+    // Convert ExtractedLight data to LightData format, writing straight into mapped memory
+    auto* lightData = static_cast<LightData *>(buffer.mapped);
+    for (size_t i = 0; i < lights.size(); ++i) {
+      const auto& light = lights[i];
+      auto& out = lightData[i];
+      const bool isDirectional = (light.type == ExtractedLight::Type::Directional);
+
+      // For directional lights, store direction in position field (they don't need position)
+      // For other lights, store position
+      out.position = isDirectional
+                       ? glm::vec4(light.direction, 0.0f) // w=0 indicates direction
+                       : glm::vec4(light.position, 1.0f); // w=1 indicates position
+      out.color = glm::vec4(light.color * light.intensity, 1.0f);
+      out.direction = glm::vec4(light.direction, 0.0f);
+
+      // View direction for the shadow matrix. A zero-length direction (e.g. a point light
+      // without one) would turn normalize/lookAt into NaNs, so fall back to straight down.
+      const bool hasDirection = glm::length(light.direction) > 1e-6f;
+      const glm::vec3 viewDir = hasDirection ? light.direction : glm::vec3(0.0f, -1.0f, 0.0f);
+      const glm::vec3 lightDir = glm::normalize(viewDir);
+
+      // Robust up vector to avoid LookAt singularities with vertical lights (both branches)
+      const glm::vec3 up = (std::abs(lightDir.y) > 0.99f) ? glm::vec3(0.0f, 0.0f, 1.0f) : glm::vec3(0.0f, 1.0f, 0.0f);
+
+      // Calculate light space matrix for shadow mapping
+      glm::mat4 lightProjection, lightView;
+      if (isDirectional) {
+        const float orthoSize = 50.0f;
+        glm::vec3 shadowCamPos = light.position;
+        if (camera) {
+          // Center shadow map on camera frustum
+          shadowCamPos = camera->GetPosition() - lightDir * 50.0f;
+        }
+        lightProjection = glm::ortho(-orthoSize, orthoSize, -orthoSize, orthoSize, 0.1f, 200.0f);
+        lightView = glm::lookAt(shadowCamPos, shadowCamPos + lightDir, up);
+      } else {
+        // NOTE(review): light.range is used as the far plane as-is; a range <= 0.1 yields a
+        // degenerate projection - confirm callers always provide a positive range.
+        lightProjection = glm::perspective(glm::radians(90.0f), 1.0f, 0.1f, light.range);
+        lightView = glm::lookAt(light.position, light.position + viewDir, up);
+      }
+      out.lightSpaceMatrix = lightProjection * lightView;
+
+      // Set light type; start from the point-light code so an unhandled enum value never
+      // leaves this field holding whatever was previously in the mapped memory
+      out.lightType = 0;
+      switch (light.type) {
+        case ExtractedLight::Type::Point:
+          out.lightType = 0;
+          break;
+        case ExtractedLight::Type::Directional:
+          out.lightType = 1;
+          break;
+        case ExtractedLight::Type::Spot:
+          out.lightType = 2;
+          break;
+        case ExtractedLight::Type::Emissive:
+          out.lightType = 3;
+          break;
+      }
+
+      // Set other light properties
+      out.range = light.range;
+      out.innerConeAngle = light.innerConeAngle;
+      out.outerConeAngle = light.outerConeAngle;
+    }
+
+    // Record how many elements of the buffer are valid for this frame
+    buffer.size = lights.size();
+
+    return true;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to update light storage buffer: " << e.what() << std::endl;
+    return false;
+  }
+}
+
+// Asynchronous texture loading implementations using ThreadPool
+// Queue a texture file for upload and return a future for the scheduling result.
+//
+// - An empty path yields a deferred future that resolves to false.
+// - While 0 < framesSinceLoadingComplete < 30 the texture is loaded synchronously via
+//   LoadTexture and the returned future is already resolved with its result.
+// - Otherwise a lightweight task is scheduled that only enqueues a PendingTextureJob; the
+//   future resolves to true once the job is queued, not once the upload has finished.
+// `critical` marks the job as Critical priority and counts it in criticalJobsOutstanding.
+std::future<bool> Renderer::LoadTextureAsync(const std::string& texturePath, bool critical) {
+  if (texturePath.empty()) {
+    return std::async(std::launch::deferred, [] { return false; });
+  }
+
+  // Force synchronous upload during early scene ramp-up to ensure GPU stability
+  const bool forceSync = (framesSinceLoadingComplete > 0 && framesSinceLoadingComplete < 30);
+  if (forceSync) {
+    std::promise<bool> ready;
+    ready.set_value(LoadTexture(texturePath));
+    return ready.get_future();
+  }
+
+  // Schedule a CPU-light job that enqueues a pending GPU upload to be
+  // processed later. This avoids submitting Vulkan command buffers from
+  // the scheduling thread, which can confuse GPU-assisted validation.
+  textureTasksScheduled.fetch_add(1, std::memory_order_relaxed);
+  uploadJobsTotal.fetch_add(1, std::memory_order_relaxed);
+  auto task = [this, texturePath, critical]() {
+    PendingTextureJob job;
+    job.type = PendingTextureJob::Type::FromFile;
+    job.priority = critical ? PendingTextureJob::Priority::Critical : PendingTextureJob::Priority::NonCritical;
+    job.idOrPath = texturePath;
+
+    // Count the critical job BEFORE it becomes visible to consumers: once it is in the
+    // queue a consumer may finish it and decrement the counter at any moment, and a
+    // decrement that runs ahead of this increment would make the counter transiently wrong.
+    if (critical) {
+      criticalJobsOutstanding.fetch_add(1, std::memory_order_relaxed);
+    }
+    {
+      std::lock_guard<std::mutex> lk(pendingTextureJobsMutex);
+      pendingTextureJobs.emplace_back(std::move(job));
+    }
+    pendingTextureCv.notify_one();
+    textureTasksCompleted.fetch_add(1, std::memory_order_relaxed);
+    return true;
+  };
+
+  // Fall back to std::async when no thread pool is available
+  std::shared_lock<std::shared_mutex> lock(threadPoolMutex);
+  if (!threadPool) {
+    return std::async(std::launch::async, task);
+  }
+  return threadPool->enqueue(task);
+}
+
+// Queue an in-memory image for upload and return a future for the scheduling result.
+//
+// `imageData` must point to width * height * channels bytes; they are copied immediately, so
+// the caller may free or modify its buffer as soon as this function returns.
+// - Invalid arguments (null data, empty id, non-positive dimensions/channels) yield a
+//   deferred future that resolves to false.
+// - While 0 < framesSinceLoadingComplete < 30 the texture is loaded synchronously via
+//   LoadTextureFromMemory and the returned future is already resolved with its result.
+// - Otherwise a task is scheduled that enqueues a PendingTextureJob carrying the pixel copy;
+//   the future resolves to true once the job is queued, not once the upload has finished.
+std::future<bool> Renderer::LoadTextureFromMemoryAsync(const std::string& textureId,
+                                                       const unsigned char* imageData,
+                                                       int width,
+                                                       int height,
+                                                       int channels,
+                                                       bool critical) {
+  if (!imageData || textureId.empty() || width <= 0 || height <= 0 || channels <= 0) {
+    return std::async(std::launch::deferred, [] { return false; });
+  }
+  // Copy the source bytes so the caller can free/modify their buffer immediately
+  const size_t srcSize = static_cast<size_t>(width) * static_cast<size_t>(height) * static_cast<size_t>(channels);
+  std::vector<unsigned char> dataCopy(imageData, imageData + srcSize);
+
+  // Force synchronous upload during early scene ramp-up to ensure GPU stability
+  const bool forceSync = (framesSinceLoadingComplete > 0 && framesSinceLoadingComplete < 30);
+  if (forceSync) {
+    std::promise<bool> ready;
+    ready.set_value(LoadTextureFromMemory(textureId, dataCopy.data(), width, height, channels));
+    return ready.get_future();
+  }
+
+  textureTasksScheduled.fetch_add(1, std::memory_order_relaxed);
+  uploadJobsTotal.fetch_add(1, std::memory_order_relaxed);
+  auto task = [this, textureId, data = std::move(dataCopy), width, height, channels, critical]() mutable {
+    PendingTextureJob job;
+    job.type = PendingTextureJob::Type::FromMemory;
+    job.priority = critical ? PendingTextureJob::Priority::Critical : PendingTextureJob::Priority::NonCritical;
+    job.idOrPath = textureId;
+    // Hand the pixel payload and its dimensions to the job; the upload worker reads
+    // job.data / job.width / job.height / job.channels, so leaving them unset would make
+    // it upload an empty image.
+    job.data = std::move(data);
+    job.width = width;
+    job.height = height;
+    job.channels = channels;
+
+    // Count the critical job BEFORE it becomes visible to the upload workers: once it is
+    // in the queue a worker may finish it and decrement the counter at any moment.
+    if (critical) {
+      criticalJobsOutstanding.fetch_add(1, std::memory_order_relaxed);
+    }
+    {
+      std::lock_guard<std::mutex> lk(pendingTextureJobsMutex);
+      pendingTextureJobs.emplace_back(std::move(job));
+    }
+    pendingTextureCv.notify_one();
+    textureTasksCompleted.fetch_add(1, std::memory_order_relaxed);
+    return true;
+  };
+
+  // Fall back to std::async when no thread pool is available
+  std::shared_lock<std::shared_mutex> lock(threadPoolMutex);
+  if (!threadPool) {
+    return std::async(std::launch::async, std::move(task));
+  }
+  return threadPool->enqueue(std::move(task));
+}
+
+void Renderer::WaitForAllTextureTasks() {
+  // Blocking poll: returns once the completed-task counter has caught up with the
+  // scheduled-task counter (or nothing was ever scheduled). Meant for initial scene
+  // loading only, where a short stall is an acceptable price for descriptor sets
+  // seeing all real textures.
+  const auto tasksOutstanding = [this]() {
+    const uint32_t scheduled = textureTasksScheduled.load(std::memory_order_relaxed);
+    const uint32_t completed = textureTasksCompleted.load(std::memory_order_relaxed);
+    return scheduled != 0 && completed < scheduled;
+  };
+
+  while (tasksOutstanding()) {
+    // Give up the CPU for a moment while background texture jobs make progress
+    std::this_thread::sleep_for(std::chrono::milliseconds(1));
+  }
+}
+
+// Start background worker threads that drain pending texture jobs and perform GPU uploads
+void Renderer::StartUploadsWorker(size_t workerCount) {
+  stopUploadsWorker.store(false, std::memory_order_relaxed);
+  if (workerCount == 0) {
+    unsigned int hw = std::thread::hardware_concurrency();
+    // Heuristic: at least 2 workers, at most 4, and not exceeding half of HW threads
+    unsigned int target = std::max(2u, std::min(4u, hw > 0 ? hw / 2 : 2u));
+    workerCount = static_cast<size_t>(target);
+  }
+  uploadsWorkerThreads.reserve(workerCount);
+  for (size_t t = 0; t < workerCount; ++t) {
+    uploadsWorkerThreads.emplace_back([this]() {
+      ensureThreadLocalVulkanInit();
+      while (!stopUploadsWorker.load(std::memory_order_relaxed)) {
+        if (pauseBackgroundUploads.load(std::memory_order_relaxed)) {
+            std::this_thread::sleep_for(std::chrono::milliseconds(50));
+            continue;
+        }
+        // Wait for work or stop signal
+        {
+          std::unique_lock<std::mutex> lk(pendingTextureJobsMutex);
+          pendingTextureCv.wait(lk,
+                                [this]() {
+                                  return stopUploadsWorker.load(std::memory_order_relaxed) || !pendingTextureJobs.empty();
+                                });
+        }
+        if (stopUploadsWorker.load(std::memory_order_relaxed))
+          break;
+
+        // Drain a batch of jobs
+        std::vector<PendingTextureJob> batch; {
+          std::lock_guard<std::mutex> lk(pendingTextureJobsMutex);
+          const size_t maxBatch = 16; // simple batch size to limit command overhead
+          const size_t take = std::min(maxBatch, pendingTextureJobs.size());
+          batch.reserve(take);
+          for (size_t i = 0; i < take; ++i) {
+            batch.emplace_back(std::move(pendingTextureJobs.back()));
+            pendingTextureJobs.pop_back();
+          }
+        }
+
+        // Process critical jobs first
+        auto isCritical = [](const PendingTextureJob& j) { return j.priority == PendingTextureJob::Priority::Critical; };
+        std::stable_sort(batch.begin(),
+                         batch.end(),
+                         [&](const PendingTextureJob& a, const PendingTextureJob& b) {
+                           return isCritical(a) && !isCritical(b);
+                         });
+
+        // Try to batch FromMemory jobs together for a single transfer submit
+        std::vector<PendingTextureJob> memJobs;
+        for (auto& j : batch)
+          if (j.type == PendingTextureJob::Type::FromMemory)
+            memJobs.push_back(std::move(j));
+        // Remove moved jobs from batch
+        batch.erase(std::remove_if(batch.begin(), batch.end(), [](const PendingTextureJob& j) { return j.type == PendingTextureJob::Type::FromMemory; }), batch.end());
+
+        if (!memJobs.empty()) {
+          try {
+            // Process batched memory uploads with a single submit
+            // Fallback to per-job if batching fails for any reason
+            auto processSingle = [&](const PendingTextureJob& job) {
+              (void) LoadTextureFromMemory(job.idOrPath,
+                                           job.data.data(),
+                                           job.width,
+                                           job.height,
+                                           job.channels);
+              OnTextureUploaded(job.idOrPath);
+              if (job.priority == PendingTextureJob::Priority::Critical) {
+                criticalJobsOutstanding.fetch_sub(1, std::memory_order_relaxed);
+              }
+              uploadJobsCompleted.fetch_add(1, std::memory_order_relaxed);
+            };
+
+            // Build staging buffers and images without submitting yet
+            struct Item {
+              std::string id;
+              vk::raii::Buffer staging;
+              std::unique_ptr<MemoryPool::Allocation> stagingAlloc;
+              std::vector<uint8_t> tmp;
+              uint32_t w, h;
+              vk::Format format;
+              std::vector<vk::BufferImageCopy> regions;
+              uint32_t mipLevels;
+              vk::raii::Image image;
+              std::unique_ptr<MemoryPool::Allocation> imageAlloc;
+            };
+            std::vector<Item> items;
+            items.reserve(memJobs.size());
+
+            for (auto& job : memJobs) {
+              try {
+                // Create staging buffer and copy data
+                const vk::DeviceSize imgSize = static_cast<vk::DeviceSize>(job.width * job.height * 4);
+                auto [stagingBuf, stagingAlloc] = createBufferPooled(imgSize, vk::BufferUsageFlagBits::eTransferSrc, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+                void* mapped = stagingAlloc->mappedPtr;
+                // Convert to RGBA if not already
+                std::vector<uint8_t> rgba;
+                rgba.resize(static_cast<size_t>(imgSize));
+                const uint8_t* src = job.data.data();
+                if (job.channels == 4) {
+                  std::memcpy(rgba.data(), src, static_cast<size_t>(imgSize));
+                } else if (job.channels == 3) {
+                  for (int y = 0; y < job.height; ++y) {
+                    for (int x = 0; x < job.width; ++x) {
+                      size_t si = (y * job.width + x) * 3;
+                      size_t di = (y * job.width + x) * 4;
+                      rgba[di + 0] = src[si + 0];
+                      rgba[di + 1] = src[si + 1];
+                      rgba[di + 2] = src[si + 2];
+                      rgba[di + 3] = 255;
+                    }
+                  }
+                } else if (job.channels == 1) {
+                  for (int i = 0, n = job.width * job.height; i < n; ++i) {
+                    uint8_t v = src[i];
+                    size_t di = i * 4;
+                    rgba[di + 0] = v;
+                    rgba[di + 1] = v;
+                    rgba[di + 2] = v;
+                    rgba[di + 3] = 255;
+                  }
+                } else {
+                  // unsupported layout, fallback to single path which will handle it
+                  processSingle(job);
+                  continue;
+                }
+                std::memcpy(mapped, rgba.data(), static_cast<size_t>(imgSize));
+                // Persistent mapping via memory pool; no explicit unmap needed here
+
+                // Create image (concurrent sharing if needed)
+                bool differentFamilies = queueFamilyIndices.graphicsFamily.value() != queueFamilyIndices.transferFamily.value();
+                std::vector<uint32_t> families;
+                if (differentFamilies)
+                  families = {queueFamilyIndices.graphicsFamily.value(), queueFamilyIndices.transferFamily.value()};
+                vk::Format texFormat = determineTextureFormat(job.idOrPath);
+                auto [image, imageAlloc] = createImagePooled(job.width, job.height, texFormat, vk::ImageTiling::eOptimal, vk::ImageUsageFlagBits::eTransferDst | vk::ImageUsageFlagBits::eSampled, vk::MemoryPropertyFlagBits::eDeviceLocal, 1, differentFamilies ? vk::SharingMode::eConcurrent : vk::SharingMode::eExclusive, families);
+
+                // Prepare one region
+                std::vector<vk::BufferImageCopy> regions{
+                  vk::BufferImageCopy{
+                    .bufferOffset = 0,
+                    .bufferRowLength = 0,
+                    .bufferImageHeight = 0,
+                    .imageSubresource = {.aspectMask = vk::ImageAspectFlagBits::eColor, .mipLevel = 0, .baseArrayLayer = 0, .layerCount = 1},
+                    .imageOffset = {0, 0, 0},
+                    .imageExtent = {static_cast<uint32_t>(job.width), static_cast<uint32_t>(job.height), 1}
+                  }
+                };
+
+                items.push_back(Item{job.idOrPath, std::move(stagingBuf), std::move(stagingAlloc), std::move(rgba), static_cast<uint32_t>(job.width), static_cast<uint32_t>(job.height), texFormat, std::move(regions), 1, std::move(image), std::move(imageAlloc)});
+              } catch (const std::exception& e) {
+                std::cerr << "Batch prepare failed for '" << job.idOrPath << "': " << e.what() << ". Falling back to single." << std::endl;
+                processSingle(job);
+                continue;
+              }
+            }
+
+            if (!items.empty()) {
+              // Record a single command buffer for all items
+              vk::CommandPoolCreateInfo poolInfo{.flags = vk::CommandPoolCreateFlagBits::eTransient | vk::CommandPoolCreateFlagBits::eResetCommandBuffer, .queueFamilyIndex = queueFamilyIndices.transferFamily.value()};
+              vk::raii::CommandPool tempPool(device, poolInfo);
+              vk::CommandBufferAllocateInfo allocInfo{.commandPool = *tempPool, .level = vk::CommandBufferLevel::ePrimary, .commandBufferCount = 1};
+              vk::raii::CommandBuffers cbs(device, allocInfo);
+              vk::raii::CommandBuffer& cb = cbs[0];
+              cb.begin(vk::CommandBufferBeginInfo{.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit});
+
+              for (auto& it : items) {
+                // Transition undefined->transfer dst (Sync2)
+                vk::ImageMemoryBarrier2 toDst2{
+                  .srcStageMask = vk::PipelineStageFlagBits2::eTopOfPipe,
+                  .srcAccessMask = vk::AccessFlagBits2::eNone,
+                  .dstStageMask = vk::PipelineStageFlagBits2::eTransfer,
+                  .dstAccessMask = vk::AccessFlagBits2::eTransferWrite,
+                  .oldLayout = vk::ImageLayout::eUndefined,
+                  .newLayout = vk::ImageLayout::eTransferDstOptimal,
+                  .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                  .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                  .image = *it.image,
+                  .subresourceRange = {.aspectMask = vk::ImageAspectFlagBits::eColor, .baseMipLevel = 0, .levelCount = 1, .baseArrayLayer = 0, .layerCount = 1}
+                };
+                vk::DependencyInfo depToDst{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &toDst2};
+                cb.pipelineBarrier2(depToDst);
+
+                cb.copyBufferToImage(*it.staging, *it.image, vk::ImageLayout::eTransferDstOptimal, it.regions);
+
+                // Transition to shader-read (Sync2)
+                vk::ImageMemoryBarrier2 toShader2{
+                  .srcStageMask = vk::PipelineStageFlagBits2::eTransfer,
+                  .srcAccessMask = vk::AccessFlagBits2::eTransferWrite,
+                  .dstStageMask = vk::PipelineStageFlagBits2::eFragmentShader,
+                  .dstAccessMask = vk::AccessFlagBits2::eShaderRead,
+                  .oldLayout = vk::ImageLayout::eTransferDstOptimal,
+                  .newLayout = vk::ImageLayout::eShaderReadOnlyOptimal,
+                  .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                  .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+                  .image = *it.image,
+                  .subresourceRange = {.aspectMask = vk::ImageAspectFlagBits::eColor, .baseMipLevel = 0, .levelCount = 1, .baseArrayLayer = 0, .layerCount = 1}
+                };
+                vk::DependencyInfo depToShader{.imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &toShader2};
+                cb.pipelineBarrier2(depToShader);
+              }
+
+              cb.end();
+
+              vk::raii::Fence fence(device, vk::FenceCreateInfo{});
+              uint64_t signalValue = 0; {
+                std::lock_guard<std::mutex> lock(queueMutex);
+                vk::SubmitInfo submit{};
+                vk::TimelineSemaphoreSubmitInfo timelineInfo{}; // keep alive through submit
+                
+                signalValue = uploadTimelineLastSubmitted.fetch_add(1, std::memory_order_relaxed) + 1;
+                
+                // Track the last value associated with a critical upload
+                bool hasCritical = false;
+                for (auto& job : memJobs) {
+                  if (job.priority == PendingTextureJob::Priority::Critical) {
+                    hasCritical = true;
+                    break;
+                  }
+                }
+                if (hasCritical) {
+                  lastCriticalUploadValue.store(signalValue, std::memory_order_relaxed);
+                }
+
+                timelineInfo.signalSemaphoreValueCount = 1;
+                timelineInfo.pSignalSemaphoreValues = &signalValue;
+                submit.pNext = &timelineInfo;
+                submit.signalSemaphoreCount = 1;
+                submit.pSignalSemaphores = &*uploadsTimeline;
+
+                submit.commandBufferCount = 1;
+                submit.pCommandBuffers = &*cb;
+                transferQueue.submit(submit, *fence);
+              }
+              (void) waitForFencesSafe(*fence, VK_TRUE);
+
+              // Perf accounting for the batch
+              uint64_t batchBytes = 0;
+              for (auto& it : items)
+                batchBytes += static_cast<uint64_t>(it.w) * it.h * 4ull;
+              bytesUploadedTotal.fetch_add(batchBytes, std::memory_order_relaxed);
+              uploadCount.fetch_add(static_cast<uint32_t>(items.size()), std::memory_order_relaxed);
+
+              // Finalize resources and notify
+              for (auto& it : items) {
+                // Store in textureResources
+                TextureResources res;
+                res.textureImage = std::move(it.image);
+                res.textureImageAllocation = std::move(it.imageAlloc);
+                res.format = it.format;
+                res.mipLevels = it.mipLevels;
+                res.alphaMaskedHint = false; // heuristic omitted in batch
+                // Create sampler/view
+                createTextureSampler(res);
+                res.textureImageView = createImageView(res.textureImage, res.format, vk::ImageAspectFlagBits::eColor, res.mipLevels); {
+                  std::unique_lock<std::shared_mutex> lk(textureResourcesMutex);
+                  textureResources[it.id] = std::move(res);
+                }
+                OnTextureUploaded(it.id);
+                // Update counters
+                uploadJobsCompleted.fetch_add(1, std::memory_order_relaxed);
+              }
+              // Decrement outstanding critical jobs if any
+              for (auto& job : memJobs)
+                if (job.priority == PendingTextureJob::Priority::Critical)
+                  criticalJobsOutstanding.fetch_sub(1, std::memory_order_relaxed);
+            }
+          } catch (const std::exception& e) {
+            std::cerr << "UploadsWorker: batch processing failed: " << e.what() << std::endl;
+            // Fallback: per-job processing
+            for (auto& job : memJobs) {
+              try {
+                (void) LoadTextureFromMemory(job.idOrPath,
+                                             job.data.data(),
+                                             job.width,
+                                             job.height,
+                                             job.channels);
+                OnTextureUploaded(job.idOrPath);
+                if (job.priority == PendingTextureJob::Priority::Critical) {
+                  criticalJobsOutstanding.fetch_sub(1, std::memory_order_relaxed);
+                }
+                uploadJobsCompleted.fetch_add(1, std::memory_order_relaxed);
+              } catch (...) {
+              }
+            }
+          }
+        }
+
+        // Process remaining non-memory jobs individually
+        for (auto& job : batch) {
+          try {
+            if (job.type == PendingTextureJob::Type::FromFile) {
+              (void) LoadTexture(job.idOrPath);
+              OnTextureUploaded(job.idOrPath);
+              if (job.priority == PendingTextureJob::Priority::Critical) {
+                criticalJobsOutstanding.fetch_sub(1, std::memory_order_relaxed);
+              }
+              uploadJobsCompleted.fetch_add(1, std::memory_order_relaxed);
+            }
+          } catch (const std::exception& e) {
+            std::cerr << "UploadsWorker: failed to process job for '" << job.idOrPath << "': " << e.what() << std::endl;
+          }
+        }
+      }
+    });
+  }
+}
+
+// Asks every uploads worker thread to stop, wakes any that are blocked on
+// pendingTextureCv, joins them and finally clears the thread list.
+void Renderer::StopUploadsWorker() {
+  {
+    // Publish the stop flag while holding the job-queue mutex. Without the lock a
+    // worker that has already evaluated its wait predicate, but has not yet
+    // blocked on the condition variable, could miss the notification below and
+    // sleep forever, which would hang the join.
+    // NOTE(review): assumes workers wait on pendingTextureCv under
+    // pendingTextureJobsMutex (the wait site is outside this function) - confirm.
+    std::lock_guard<std::mutex> lk(pendingTextureJobsMutex);
+    stopUploadsWorker.store(true, std::memory_order_relaxed);
+  }
+  pendingTextureCv.notify_all();
+  for (auto& th : uploadsWorkerThreads) {
+    if (th.joinable())
+      th.join();
+  }
+  uploadsWorkerThreads.clear();
+}
+
+// Records `entity` as a user of `textureId` so that it can be marked dirty once
+// the texture has been uploaded. Calls with an empty ID or a null entity are ignored.
+void Renderer::RegisterTextureUser(const std::string& textureId, Entity* entity) {
+  if (entity == nullptr || textureId.empty()) {
+    return;
+  }
+
+  // Key the registration by the resolved ID, which is the same key that
+  // OnTextureUploaded looks up. Fall back to the raw ID when resolution
+  // yields an empty string.
+  const std::string resolved = ResolveTextureId(textureId);
+  const std::string& key = resolved.empty() ? textureId : resolved;
+
+  std::lock_guard<std::mutex> guard(textureUsersMutex);
+  textureToEntities[key].push_back(entity);
+}
+
+// Called once a texture has been uploaded. Marks every entity registered for the
+// texture as needing a descriptor refresh and, when ray query is active, flags
+// the ray-query descriptor sets of all frames-in-flight as dirty.
+void Renderer::OnTextureUploaded(const std::string& textureId) {
+  // Resolve alias to canonical ID used for tracking and descriptor
+  // creation. RegisterTextureUser also stores under this canonical ID.
+  std::string canonicalId = ResolveTextureId(textureId);
+  if (canonicalId.empty()) {
+    canonicalId = textureId;
+  }
+
+  // Copy the user list under the lock so the entities can be marked dirty
+  // without holding textureUsersMutex. A texture without registered users
+  // simply yields an empty list; we must NOT return early in that case because
+  // the ray-query texture table below is global and does not depend on
+  // per-entity registration.
+  std::vector<Entity *> users; {
+    std::lock_guard<std::mutex> lk(textureUsersMutex);
+    auto it = textureToEntities.find(canonicalId);
+    if (it != textureToEntities.end()) {
+      users = it->second;
+    }
+  }
+
+  // Always defer descriptor updates to the safe point at the start of Render()
+  // (after the in-flight fence for the current frame has been signaled).
+  // This avoids UPDATE_AFTER_BIND violations and mid-recording invalidation.
+  // If descriptor indexing / UPDATE_AFTER_BIND is enabled, we still prefer
+  // this safer path for consistency across devices.
+  for (Entity* entity : users) {
+    if (!entity)
+      continue;
+    MarkEntityDescriptorsDirty(entity);
+  }
+
+  // Ray Query uses a global texture table (binding 6) that may reference this texture.
+  // Mark the ray query descriptor sets dirty for all frames so the render-thread safe point
+  // can refresh the table when the texture becomes available.
+  if (rayQueryEnabled && accelerationStructureEnabled) {
+    // One bit per frame-in-flight; the >= 32 guard avoids an out-of-range shift.
+    const uint32_t allFramesMask = (MAX_FRAMES_IN_FLIGHT >= 32u) ? 0xFFFFFFFFu : ((1u << MAX_FRAMES_IN_FLIGHT) - 1u);
+    rayQueryDescriptorsDirtyMask.fetch_or(allFramesMask, std::memory_order_relaxed);
+  }
+}
+
+// Flags `entity` as needing a descriptor refresh on every frame-in-flight.
+// A null entity is ignored.
+void Renderer::MarkEntityDescriptorsDirty(Entity* entity) {
+  if (entity == nullptr) {
+    return;
+  }
+
+  // Bit i of the mask stands for frame-in-flight i. The >= 32 branch keeps the
+  // shift in range for a 32-bit mask.
+  const uint32_t everyFrameBits = (MAX_FRAMES_IN_FLIGHT >= 32u) ? 0xFFFFFFFFu : ((1u << MAX_FRAMES_IN_FLIGHT) - 1u);
+
+  std::lock_guard<std::mutex> guard(dirtyEntitiesMutex);
+  descriptorDirtyEntities[entity] |= everyFrameBits;
+}
+
+// Convenience overload: looks up the resources stored for `entity` in
+// entityResources and forwards to the overload that takes them explicitly.
+// Returns false when the entity has no entry.
+bool Renderer::updateDescriptorSetsForFrame(Entity* entity,
+                                            const std::string& texturePath,
+                                            bool usePBR,
+                                            uint32_t frameIndex,
+                                            bool imagesOnly,
+                                            bool uboOnly) {
+  auto found = entityResources.find(entity);
+  if (found != entityResources.end()) {
+    return updateDescriptorSetsForFrame(entity, found->second, texturePath, usePBR, frameIndex, imagesOnly, uboOnly);
+  }
+  return false;
+}
+
+// Writes the descriptor bindings of one frame-in-flight descriptor set of `entity`.
+//
+//   entity       Entity being updated; a null pointer makes the call return false.
+//   res          Per-entity resources: uniform buffers, descriptor sets and the
+//                per-frame "already written" flags.
+//   texturePath  Texture bound at binding 1 of the basic (non-PBR) set. The PBR
+//                path takes its texture paths from the entity's MeshComponent.
+//   usePBR       Selects the PBR set (bindings 0-8, 10-13) instead of the basic
+//                set (bindings 0-1).
+//   frameIndex   Index of the frame-in-flight whose set is written.
+//   imagesOnly   Refresh image bindings without rewriting the UBO binding. On the
+//                basic path this only applies when the sets already existed.
+//   uboOnly      Only initialise binding 0 (plus, for PBR, the fixed bindings
+//                7/8/10/11/12/13) if that has not been done for this frame yet.
+//
+// Returns false when the entity is null, descriptor sets are flagged invalid, the
+// frame has no usable uniform buffer, or `frameIndex` is outside the allocated
+// sets. Returns true otherwise, including when the write was queued because a
+// command buffer is currently being recorded.
+bool Renderer::updateDescriptorSetsForFrame(Entity* entity,
+                                            EntityResources& res,
+                                            const std::string& texturePath,
+                                            bool usePBR,
+                                            uint32_t frameIndex,
+                                            bool imagesOnly,
+                                            bool uboOnly) {
+  if (!entity)
+    return false;
+  if (!descriptorSetsValid.load(std::memory_order_relaxed)) {
+    // Descriptor sets are being recreated; skip updates for now
+    return false;
+  }
+  // Defer descriptor writes if the command buffer is currently being recorded.
+  if (isRecordingCmd.load(std::memory_order_relaxed)) {
+    std::lock_guard<std::mutex> qlk(pendingDescMutex);
+    // NOTE(review): `uboOnly` is not carried in the queued op, so a deferred
+    // uboOnly request may be replayed as a wider update - confirm this is intended.
+    pendingDescOps.push_back(PendingDescOp{entity, texturePath, usePBR, frameIndex, imagesOnly});
+    descriptorRefreshPending.store(true, std::memory_order_relaxed);
+    return true;
+  }
+  // IMPORTANT: Do NOT hold `textureResourcesMutex` across this function.
+  // We may call `ResolveTextureId()` (which also locks it), and `std::shared_mutex` is not recursive.
+
+  // Ensure we have a valid UBO for this frame before attempting descriptor writes
+  if (frameIndex >= res.uniformBuffers.size() ||
+    frameIndex >= res.uniformBuffersMapped.size() ||
+    *res.uniformBuffers[frameIndex] == vk::Buffer{}) {
+    // Missing UBO for this frame; skip to avoid writing invalid descriptors
+    return false;
+  }
+
+  // Ensure descriptor sets exist for this entity
+  auto& targetDescriptorSets = usePBR ? res.pbrDescriptorSets : res.basicDescriptorSets;
+  bool newlyAllocated = false;
+  if (targetDescriptorSets.empty()) {
+    // Build the allocation request only when sets are actually missing, so the
+    // common "already allocated" call does not pay for a temporary vector.
+    vk::DescriptorSetLayout selectedLayout = usePBR ? *pbrDescriptorSetLayout : *descriptorSetLayout;
+    std::vector<vk::DescriptorSetLayout> layouts(MAX_FRAMES_IN_FLIGHT, selectedLayout);
+    vk::DescriptorSetAllocateInfo allocInfo{.descriptorPool = *descriptorPool, .descriptorSetCount = MAX_FRAMES_IN_FLIGHT, .pSetLayouts = layouts.data()};
+    std::lock_guard<std::mutex> lk(descriptorMutex);
+    targetDescriptorSets = vk::raii::DescriptorSets(device, allocInfo);
+    newlyAllocated = true;
+  }
+  if (frameIndex >= targetDescriptorSets.size())
+    return false;
+
+  vk::DescriptorBufferInfo bufferInfo{.buffer = *res.uniformBuffers[frameIndex], .range = sizeof(UniformBufferObject)};
+
+  // Ensure per-pipeline UBO init tracking is sized
+  if (res.pbrUboBindingWritten.size() != MAX_FRAMES_IN_FLIGHT) {
+    res.pbrUboBindingWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
+  }
+  if (res.basicUboBindingWritten.size() != MAX_FRAMES_IN_FLIGHT) {
+    res.basicUboBindingWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
+  }
+  if (res.pbrFixedBindingsWritten.size() != MAX_FRAMES_IN_FLIGHT) {
+    res.pbrFixedBindingsWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
+  }
+  if (res.pbrImagesWritten.size() != MAX_FRAMES_IN_FLIGHT) {
+    res.pbrImagesWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
+  }
+  if (res.basicImagesWritten.size() != MAX_FRAMES_IN_FLIGHT) {
+    res.basicImagesWritten.assign(MAX_FRAMES_IN_FLIGHT, false);
+  }
+
+  if (usePBR) {
+    // We'll fill descriptor writes. Binding 0 (UBO) is written only when explicitly requested (uboOnly)
+    // or when doing a full update (imagesOnly == false). For imagesOnly updates we must NOT touch UBO
+    // to avoid update-after-bind hazards.
+    std::vector<vk::WriteDescriptorSet> writes;
+    std::array<vk::DescriptorImageInfo, 5> imageInfos;
+    // Helper: ensure required PBR layout bindings (7/8/10/11) are written at least once per frame.
+    // IMPORTANT: descriptor infos must remain alive until `updateDescriptorSets` is called.
+    vk::DescriptorBufferInfo headersInfo{};
+    vk::DescriptorBufferInfo indicesInfo{};
+    vk::DescriptorBufferInfo geoInfoInfo{};
+    vk::DescriptorBufferInfo matInfoInfo{};
+    // Declared at this scope (not inside the `if` that fills it) so the pointer
+    // stored in `writes` is still valid when `updateDescriptorSets` runs.
+    vk::DescriptorBufferInfo lightBufferInfo{};
+    vk::DescriptorImageInfo reflInfo{};
+    vk::AccelerationStructureKHR tlasHandleValue{};
+    vk::WriteDescriptorSetAccelerationStructureKHR tlasInfo{};
+    vk::WriteDescriptorSet tlasWrite{};
+    const bool needFixedWrites = !res.pbrFixedBindingsWritten[frameIndex];
+    auto appendPbrFixedWrites = [&](std::vector<vk::WriteDescriptorSet>& dstWrites) {
+      if (!needFixedWrites)
+        return;
+
+      // Binding 7/8: Forward+ tile buffers (must be valid even when Forward+ is disabled)
+      if (forwardPlusPerFrame.empty()) {
+        forwardPlusPerFrame.resize(MAX_FRAMES_IN_FLIGHT);
+      }
+      vk::Buffer headersBuf{};
+      vk::Buffer indicesBuf{};
+      if (frameIndex < forwardPlusPerFrame.size()) {
+        auto& f = forwardPlusPerFrame[frameIndex];
+        if (!!*f.tileHeaders)
+          headersBuf = *f.tileHeaders;
+        if (!!*f.tileLightIndices)
+          indicesBuf = *f.tileLightIndices;
+        if (!headersBuf) {
+          // No tile-header buffer yet: create a minimal zero-filled placeholder.
+          vk::DeviceSize minSize = sizeof(uint32_t) * 4; // Single TileHeader
+          auto [buf, alloc] = createBufferPooled(minSize,
+                                                 vk::BufferUsageFlagBits::eStorageBuffer,
+                                                 vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+          f.tileHeaders = std::move(buf);
+          f.tileHeadersAlloc = std::move(alloc);
+          if (!!f.tileHeadersAlloc && f.tileHeadersAlloc->mappedPtr) {
+            std::memset(f.tileHeadersAlloc->mappedPtr, 0, minSize);
+          }
+          headersBuf = *f.tileHeaders;
+        }
+        if (!indicesBuf) {
+          // Same placeholder treatment for the tile light-index buffer.
+          vk::DeviceSize minSize = sizeof(uint32_t) * 4;
+          auto [buf, alloc] = createBufferPooled(minSize,
+                                                 vk::BufferUsageFlagBits::eStorageBuffer,
+                                                 vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
+          f.tileLightIndices = std::move(buf);
+          f.tileLightIndicesAlloc = std::move(alloc);
+          if (!!f.tileLightIndicesAlloc && f.tileLightIndicesAlloc->mappedPtr) {
+            std::memset(f.tileLightIndicesAlloc->mappedPtr, 0, minSize);
+          }
+          indicesBuf = *f.tileLightIndices;
+        }
+      }
+      headersInfo = vk::DescriptorBufferInfo{.buffer = headersBuf, .offset = 0, .range = VK_WHOLE_SIZE};
+      indicesInfo = vk::DescriptorBufferInfo{.buffer = indicesBuf, .offset = 0, .range = VK_WHOLE_SIZE};
+      dstWrites.push_back({.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 7, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &headersInfo});
+      dstWrites.push_back({.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 8, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &indicesInfo});
+
+      // Binding 10: reflection sampler (always bind safe fallback)
+      reflInfo = vk::DescriptorImageInfo{
+        .sampler = *defaultTextureResources.textureSampler,
+        .imageView = *defaultTextureResources.textureImageView,
+        .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal
+      };
+      dstWrites.push_back({.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 10, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eCombinedImageSampler, .pImageInfo = &reflInfo});
+
+      // Binding 11: TLAS (ray-query shadows in raster PBR fragment shader)
+      tlasHandleValue = accelerationStructureEnabled ? *tlasStructure.handle : vk::AccelerationStructureKHR{};
+      tlasInfo.accelerationStructureCount = 1;
+      tlasInfo.pAccelerationStructures = &tlasHandleValue;
+      tlasWrite.dstSet = *targetDescriptorSets[frameIndex];
+      tlasWrite.dstBinding = 11;
+      tlasWrite.dstArrayElement = 0;
+      tlasWrite.descriptorCount = 1;
+      tlasWrite.descriptorType = vk::DescriptorType::eAccelerationStructureKHR;
+      tlasWrite.pNext = &tlasInfo;
+      dstWrites.push_back(tlasWrite);
+
+      // Binding 12/13: Ray-query geometry/material buffers for material-aware raster shadow queries.
+      // Always bind something valid; shader guards on `ubo.geometryInfoCount/materialCount`.
+      vk::Buffer fallbackBuf = headersBuf ? headersBuf : indicesBuf;
+      vk::Buffer geoBuf = (!!*geometryInfoBuffer) ? *geometryInfoBuffer : fallbackBuf;
+      vk::Buffer matBuf = (!!*materialBuffer) ? *materialBuffer : fallbackBuf;
+      geoInfoInfo = vk::DescriptorBufferInfo{.buffer = geoBuf, .offset = 0, .range = VK_WHOLE_SIZE};
+      matInfoInfo = vk::DescriptorBufferInfo{.buffer = matBuf, .offset = 0, .range = VK_WHOLE_SIZE};
+      dstWrites.push_back({.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 12, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &geoInfoInfo});
+      dstWrites.push_back({.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 13, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &matInfoInfo});
+    };
+
+    // Optionally write only the UBO (binding 0) — used at safe point to initialize per-frame sets once
+    if (uboOnly) {
+      // Avoid re-writing if we already initialized this frame's UBO binding
+      if (!res.pbrUboBindingWritten[frameIndex]) {
+        writes.push_back({.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 0, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eUniformBuffer, .pBufferInfo = &bufferInfo});
+      }
+      appendPbrFixedWrites(writes);
+      if (!writes.empty()) {
+        std::lock_guard<std::mutex> lk(descriptorMutex);
+        device.updateDescriptorSets(writes, {});
+        if (!res.pbrUboBindingWritten[frameIndex]) {
+          res.pbrUboBindingWritten[frameIndex] = true;
+        }
+        if (needFixedWrites) {
+          res.pbrFixedBindingsWritten[frameIndex] = true;
+        }
+      }
+      return true;
+    }
+
+    // For full updates (imagesOnly == false), include UBO write; for imagesOnly, skip it
+    if (!imagesOnly) {
+      writes.push_back({.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 0, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eUniformBuffer, .pBufferInfo = &bufferInfo});
+    }
+
+    auto meshComponent = entity->GetComponent<MeshComponent>();
+    // Determine PBR texture paths in the same manner as createDescriptorSets
+    std::string legacyPath = (meshComponent ? meshComponent->GetTexturePath() : std::string());
+    const std::string baseColorPath = (meshComponent && !meshComponent->GetBaseColorTexturePath().empty()) ? meshComponent->GetBaseColorTexturePath() : (!legacyPath.empty() ? legacyPath : SHARED_DEFAULT_ALBEDO_ID);
+    const std::string mrPath = (meshComponent && !meshComponent->GetMetallicRoughnessTexturePath().empty()) ? meshComponent->GetMetallicRoughnessTexturePath() : SHARED_DEFAULT_METALLIC_ROUGHNESS_ID;
+    const std::string normalPath = (meshComponent && !meshComponent->GetNormalTexturePath().empty()) ? meshComponent->GetNormalTexturePath() : SHARED_DEFAULT_NORMAL_ID;
+    const std::string occlusionPath = (meshComponent && !meshComponent->GetOcclusionTexturePath().empty()) ? meshComponent->GetOcclusionTexturePath() : SHARED_DEFAULT_OCCLUSION_ID;
+    const std::string emissivePath = (meshComponent && !meshComponent->GetEmissiveTexturePath().empty()) ? meshComponent->GetEmissiveTexturePath() : SHARED_DEFAULT_EMISSIVE_ID;
+    std::array<std::string, 5> pbrTexturePaths = {baseColorPath, mrPath, normalPath, occlusionPath, emissivePath};
+
+    // Bindings 1..5: one combined image sampler per PBR texture slot. Textures
+    // not present in textureResources fall back to defaultTextureResources.
+    for (int j = 0; j < 5; ++j) {
+      const std::string resolvedBindingPath = ResolveTextureId(pbrTexturePaths[j]);
+      vk::Sampler samplerHandle{};
+      vk::ImageView viewHandle{}; {
+        // Copy the raw handles out under a shared lock; the lock is released
+        // before the descriptor write.
+        std::shared_lock<std::shared_mutex> lock(textureResourcesMutex);
+        auto textureIt = textureResources.find(resolvedBindingPath);
+        TextureResources* texRes = (textureIt != textureResources.end()) ? &textureIt->second : &defaultTextureResources;
+        samplerHandle = *texRes->textureSampler;
+        viewHandle = *texRes->textureImageView;
+      }
+      imageInfos[j] = {.sampler = samplerHandle, .imageView = viewHandle, .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal};
+      writes.push_back({.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = static_cast<uint32_t>(j + 1), .descriptorCount = 1, .descriptorType = vk::DescriptorType::eCombinedImageSampler, .pImageInfo = &imageInfos[j]});
+    }
+    // Ensure Forward+ light buffer (binding 6) is written for the current frame when available.
+    // Do this even on imagesOnly updates so set 0 is fully valid for PBR shading.
+    if (frameIndex < lightStorageBuffers.size() && !!*lightStorageBuffers[frameIndex].buffer) {
+      lightBufferInfo = vk::DescriptorBufferInfo{.buffer = *lightStorageBuffers[frameIndex].buffer, .range = VK_WHOLE_SIZE};
+      writes.push_back({.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 6, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eStorageBuffer, .pBufferInfo = &lightBufferInfo});
+    }
+    appendPbrFixedWrites(writes); {
+      std::lock_guard<std::mutex> lk(descriptorMutex);
+      device.updateDescriptorSets(writes, {});
+    }
+    if (needFixedWrites) {
+      res.pbrFixedBindingsWritten[frameIndex] = true;
+    }
+    if (!imagesOnly) {
+      res.pbrUboBindingWritten[frameIndex] = true;
+    }
+  } else {
+    // Basic pipeline: binding 0 is the UBO, binding 1 the single texture.
+    const std::string resolvedTexturePath = ResolveTextureId(texturePath);
+    vk::Sampler samplerHandle{};
+    vk::ImageView viewHandle{}; {
+      std::shared_lock<std::shared_mutex> lock(textureResourcesMutex);
+      auto textureIt = textureResources.find(resolvedTexturePath);
+      TextureResources* texRes = (textureIt != textureResources.end()) ? &textureIt->second : &defaultTextureResources;
+      samplerHandle = *texRes->textureSampler;
+      viewHandle = *texRes->textureImageView;
+    }
+    vk::DescriptorImageInfo imageInfo{.sampler = samplerHandle, .imageView = viewHandle, .imageLayout = vk::ImageLayout::eShaderReadOnlyOptimal};
+    if (imagesOnly && !newlyAllocated) {
+      // Existing sets: refresh only the image binding and leave the UBO untouched.
+      std::array<vk::WriteDescriptorSet, 1> descriptorWrites = {
+        vk::WriteDescriptorSet{.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 1, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eCombinedImageSampler, .pImageInfo = &imageInfo}
+      }; {
+        std::lock_guard<std::mutex> lk(descriptorMutex);
+        device.updateDescriptorSets(descriptorWrites, {});
+      }
+    } else {
+      // If uboOnly is requested for basic pipeline, only write binding 0
+      if (uboOnly) {
+        if (!res.basicUboBindingWritten[frameIndex]) {
+          std::array<vk::WriteDescriptorSet, 1> descriptorWrites = {
+            vk::WriteDescriptorSet{.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 0, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eUniformBuffer, .pBufferInfo = &bufferInfo}
+          }; {
+            std::lock_guard<std::mutex> lk(descriptorMutex);
+            device.updateDescriptorSets(descriptorWrites, {});
+          }
+          res.basicUboBindingWritten[frameIndex] = true;
+        }
+        return true;
+      }
+      // Full update: write both the UBO and the image binding.
+      std::array<vk::WriteDescriptorSet, 2> descriptorWrites = {
+        vk::WriteDescriptorSet{.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 0, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eUniformBuffer, .pBufferInfo = &bufferInfo},
+        vk::WriteDescriptorSet{.dstSet = *targetDescriptorSets[frameIndex], .dstBinding = 1, .descriptorCount = 1, .descriptorType = vk::DescriptorType::eCombinedImageSampler, .pImageInfo = &imageInfo}
+      }; {
+        std::lock_guard<std::mutex> lk(descriptorMutex);
+        device.updateDescriptorSets(descriptorWrites, {});
+      }
+      res.basicUboBindingWritten[frameIndex] = true;
+    }
+  }
+  return true;
+}
+
+// Refreshes the image descriptors of every entity whose dirty mask has the bit
+// for `frameIndex` set, then clears that bit. An entity is dropped from the
+// dirty map once no frame bits remain.
+void Renderer::ProcessDirtyDescriptorsForFrame(uint32_t frameIndex) {
+  // The dirty mask is 32 bits wide, so higher frame indices cannot be represented.
+  if (frameIndex >= 32u) {
+    return;
+  }
+  const uint32_t thisFrameBit = 1u << frameIndex;
+
+  const auto startTime = std::chrono::steady_clock::now();
+
+  // Snapshot the entities flagged for this frame so that the descriptor updates
+  // below run without dirtyEntitiesMutex held.
+  std::vector<Entity *> pending;
+  {
+    std::lock_guard<std::mutex> guard(dirtyEntitiesMutex);
+    if (descriptorDirtyEntities.empty()) {
+      return;
+    }
+    pending.reserve(descriptorDirtyEntities.size());
+    for (auto& entry : descriptorDirtyEntities) {
+      if (entry.first != nullptr && (entry.second & thisFrameBit) != 0u) {
+        pending.push_back(entry.first);
+      }
+    }
+  }
+  if (pending.empty()) {
+    return;
+  }
+
+  uint32_t visited = 0;
+  for (Entity* current : pending) {
+    if (current == nullptr) {
+      continue;
+    }
+
+    // Kick the watchdog timestamp every 50 entities during long refreshes.
+    ++visited;
+    if (visited % 50 == 0) {
+      lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed);
+    }
+
+    auto mesh = current->GetComponent<MeshComponent>();
+    if (!mesh) {
+      continue;
+    }
+
+    // Texture for the basic set: the legacy path, else the base-color path.
+    std::string texPath = mesh->GetTexturePath();
+    if (texPath.empty()) {
+      texPath = mesh->GetBaseColorTexturePath();
+    }
+
+    auto resourcesIt = entityResources.find(current);
+    if (resourcesIt == entityResources.end()) {
+      continue;
+    }
+    // Refresh the basic set first, then the PBR set, both as image-only updates.
+    updateDescriptorSetsForFrame(current, resourcesIt->second, texPath, false, frameIndex, /*imagesOnly=*/true);
+    updateDescriptorSetsForFrame(current, resourcesIt->second, texPath, true, frameIndex, /*imagesOnly=*/true);
+  }
+
+  // Report unusually slow passes (more than 50 ms).
+  const auto elapsedMs = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - startTime).count();
+  if (elapsedMs > 50) {
+    std::cout << "[Renderer] ProcessDirtyDescriptorsForFrame took " << elapsedMs << "ms for " << pending.size() << " entities" << std::endl;
+  }
+
+  // Drop this frame's bit; an entity stays in the map until every frame bit is cleared.
+  {
+    std::lock_guard<std::mutex> guard(dirtyEntitiesMutex);
+    for (Entity* current : pending) {
+      auto dirtyIt = descriptorDirtyEntities.find(current);
+      if (dirtyIt != descriptorDirtyEntities.end()) {
+        dirtyIt->second &= ~thisFrameBit;
+        if (dirtyIt->second == 0u) {
+          descriptorDirtyEntities.erase(dirtyIt);
+        }
+      }
+    }
+  }
+}
+
+// Drain up to `maxJobs` queued texture jobs on the calling thread.
+//
+// maxJobs            : upper bound on the number of jobs executed by this call.
+// includeCritical    : whether jobs with Priority::Critical are eligible.
+// includeNonCritical : whether all other jobs are eligible.
+//
+// Jobs that are not eligible, or that exceed the budget, are appended back to
+// the pending queue. Intended to be called from the main/render thread.
+void Renderer::ProcessPendingTextureJobs(uint32_t maxJobs,
+                                         bool includeCritical,
+                                         bool includeNonCritical) {
+  // While the background uploads worker is running it drains texture jobs
+  // itself, so this call is a no-op for render-thread paths that still use it.
+  const bool workerActive = !uploadsWorkerThreads.empty() &&
+                            !stopUploadsWorker.load(std::memory_order_relaxed);
+  if (workerActive) {
+    return;
+  }
+
+  // Take the whole pending list under the lock; the actual texture loads
+  // (including Vulkan work) then happen without holding the mutex.
+  std::vector<PendingTextureJob> drained;
+  {
+    std::lock_guard<std::mutex> guard(pendingTextureJobsMutex);
+    if (pendingTextureJobs.empty()) {
+      return;
+    }
+    drained.swap(pendingTextureJobs);
+  }
+
+  std::vector<PendingTextureJob> deferred;
+  deferred.reserve(drained.size());
+
+  uint32_t completed = 0;
+  uint32_t visited = 0;
+  for (auto& job : drained) {
+    // Kick the watchdog every tenth job so long drains are not seen as a hang.
+    ++visited;
+    if (visited % 10 == 0) {
+      lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed);
+    }
+
+    const bool critical = (job.priority == PendingTextureJob::Priority::Critical);
+    const bool categoryEnabled = critical ? includeCritical : includeNonCritical;
+    if (completed >= maxJobs || !categoryEnabled) {
+      // Over budget or filtered out: hand the job back to the queue later.
+      deferred.emplace_back(std::move(job));
+      continue;
+    }
+
+    if (job.type == PendingTextureJob::Type::FromFile) {
+      // LoadTexture resolves aliases and performs the full GPU upload.
+      LoadTexture(job.idOrPath);
+    } else if (job.type == PendingTextureJob::Type::FromMemory) {
+      // LoadTextureFromMemory creates the GPU resources for this ID.
+      LoadTextureFromMemory(job.idOrPath,
+                            job.data.data(),
+                            job.width,
+                            job.height,
+                            job.channels);
+    }
+
+    // Refresh descriptors of entities using this texture so the streamed
+    // upload becomes visible in the scene.
+    OnTextureUploaded(job.idOrPath);
+    if (critical) {
+      criticalJobsOutstanding.fetch_sub(1, std::memory_order_relaxed);
+    }
+    uploadJobsCompleted.fetch_add(1, std::memory_order_relaxed);
+    ++completed;
+  }
+
+  if (deferred.empty()) {
+    return;
+  }
+
+  // Append the jobs we did not run behind anything queued in the meantime.
+  std::lock_guard<std::mutex> guard(pendingTextureJobsMutex);
+  pendingTextureJobs.insert(pendingTextureJobs.end(),
+                            std::make_move_iterator(deferred.begin()),
+                            std::make_move_iterator(deferred.end()));
+}
+
+// Upload staged data into an image with a single fenced submission on the transfer queue:
+// UNDEFINED -> TRANSFER_DST barrier, buffer-to-image copy, post-copy barrier, then a fence wait.
+//
+// staging, image, regions : source buffer, destination image and the copy regions to record.
+// format                  : only used to choose the depth vs. color aspect for the barriers.
+// mipLevels               : number of mip levels covered by the barriers.
+// stagedBytes             : added to the uploaded-bytes counter when non-zero.
+//
+// Exceptions derived from std::exception are logged to stderr and rethrown.
+void Renderer::uploadImageFromStaging(vk::Buffer staging,
+                                      vk::Image image,
+                                      vk::Format format,
+                                      vk::ArrayProxy<const vk::BufferImageCopy> regions,
+                                      uint32_t mipLevels,
+                                      vk::DeviceSize stagedBytes) {
+  ensureThreadLocalVulkanInit();
+  try {
+    // The first upload opens the perf-measurement window.
+    if (uploadWindowStartNs.load(std::memory_order_relaxed) == 0) {
+      const auto sinceEpoch = std::chrono::steady_clock::now().time_since_epoch();
+      const auto windowStartNs = static_cast<uint64_t>(std::chrono::duration_cast<std::chrono::nanoseconds>(sinceEpoch).count());
+      uploadWindowStartNs.store(windowStartNs, std::memory_order_relaxed);
+    }
+    const auto uploadBegin = std::chrono::steady_clock::now();
+
+    // Temporary transient command pool on the TRANSFER queue family.
+    vk::CommandPoolCreateInfo transferPoolInfo{};
+    transferPoolInfo.flags = vk::CommandPoolCreateFlagBits::eTransient | vk::CommandPoolCreateFlagBits::eResetCommandBuffer;
+    transferPoolInfo.queueFamilyIndex = queueFamilyIndices.transferFamily.value();
+    vk::raii::CommandPool transferPool(device, transferPoolInfo);
+
+    vk::CommandBufferAllocateInfo cmdAllocInfo{};
+    cmdAllocInfo.commandPool = *transferPool;
+    cmdAllocInfo.level = vk::CommandBufferLevel::ePrimary;
+    cmdAllocInfo.commandBufferCount = 1;
+    vk::raii::CommandBuffers commandBuffers(device, cmdAllocInfo);
+    vk::raii::CommandBuffer& cmd = commandBuffers[0];
+
+    vk::CommandBufferBeginInfo oneShot{};
+    oneShot.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit;
+    cmd.begin(oneShot);
+
+    // The three depth formats handled here use the depth aspect; everything else is color.
+    const bool depthFormat = format == vk::Format::eD32Sfloat ||
+                             format == vk::Format::eD32SfloatS8Uint ||
+                             format == vk::Format::eD24UnormS8Uint;
+    const vk::ImageAspectFlags aspect = depthFormat ? vk::ImageAspectFlagBits::eDepth : vk::ImageAspectFlagBits::eColor;
+
+    // Records one Sync2 image barrier covering mip levels [0, levelCount) of array layer 0.
+    const auto recordBarrier = [&](vk::PipelineStageFlags2 srcStage,
+                                   vk::AccessFlags2 srcAccess,
+                                   vk::PipelineStageFlags2 dstStage,
+                                   vk::AccessFlags2 dstAccess,
+                                   vk::ImageLayout fromLayout,
+                                   vk::ImageLayout toLayout,
+                                   uint32_t levelCount) {
+      vk::ImageMemoryBarrier2 barrier{};
+      barrier.srcStageMask = srcStage;
+      barrier.srcAccessMask = srcAccess;
+      barrier.dstStageMask = dstStage;
+      barrier.dstAccessMask = dstAccess;
+      barrier.oldLayout = fromLayout;
+      barrier.newLayout = toLayout;
+      barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+      barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+      barrier.image = image;
+      barrier.subresourceRange.aspectMask = aspect;
+      barrier.subresourceRange.baseMipLevel = 0;
+      barrier.subresourceRange.levelCount = levelCount;
+      barrier.subresourceRange.baseArrayLayer = 0;
+      barrier.subresourceRange.layerCount = 1;
+
+      vk::DependencyInfo dependency{};
+      dependency.dependencyFlags = vk::DependencyFlagBits::eByRegion;
+      dependency.imageMemoryBarrierCount = 1;
+      dependency.pImageMemoryBarriers = &barrier;
+      cmd.pipelineBarrier2(dependency);
+    };
+
+    // Undefined -> TransferDstOptimal for every mip level.
+    recordBarrier(vk::PipelineStageFlagBits2::eTopOfPipe,
+                  vk::AccessFlagBits2::eNone,
+                  vk::PipelineStageFlagBits2::eTransfer,
+                  vk::AccessFlagBits2::eTransferWrite,
+                  vk::ImageLayout::eUndefined,
+                  vk::ImageLayout::eTransferDstOptimal,
+                  mipLevels);
+
+    cmd.copyBufferToImage(staging, image, vk::ImageLayout::eTransferDstOptimal, regions);
+
+    // If mips will be generated afterwards, only level 0 moves to TRANSFER_SRC and the other
+    // levels stay in TRANSFER_DST. Otherwise ALL levels go straight to SHADER_READ_ONLY.
+    const bool willGenerateMips = (mipLevels > 1 && regions.size() == 1);
+    recordBarrier(vk::PipelineStageFlagBits2::eTransfer,
+                  vk::AccessFlagBits2::eTransferWrite,
+                  vk::PipelineStageFlagBits2::eTransfer,
+                  vk::AccessFlagBits2::eNone,
+                  vk::ImageLayout::eTransferDstOptimal,
+                  willGenerateMips ? vk::ImageLayout::eTransferSrcOptimal : vk::ImageLayout::eShaderReadOnlyOptimal,
+                  willGenerateMips ? 1u : mipLevels);
+    cmd.end();
+
+    // Submit once on the TRANSFER queue; also signal the uploads timeline when it exists.
+    vk::raii::Fence uploadFence(device, vk::FenceCreateInfo{});
+    const bool canSignalTimeline = static_cast<bool>(*uploadsTimeline);
+    uint64_t timelineValue = 0;
+    {
+      std::lock_guard<std::mutex> queueGuard(queueMutex);
+      vk::TimelineSemaphoreSubmitInfo timelineInfo{}; // must stay alive through the submit call
+      vk::SubmitInfo submitInfo{};
+      submitInfo.commandBufferCount = 1;
+      submitInfo.pCommandBuffers = &*cmd;
+      if (canSignalTimeline) {
+        timelineValue = uploadTimelineLastSubmitted.fetch_add(1, std::memory_order_relaxed) + 1;
+
+        // Uploads are deliberately NOT marked critical here: treating every upload as
+        // critical during loading causes massive stalls. Only assets truly needed for the
+        // very first frame should be waited on.
+        timelineInfo.signalSemaphoreValueCount = 1;
+        timelineInfo.pSignalSemaphoreValues = &timelineValue;
+        submitInfo.pNext = &timelineInfo;
+        submitInfo.signalSemaphoreCount = 1;
+        submitInfo.pSignalSemaphores = &*uploadsTimeline;
+      }
+
+      transferQueue.submit(submitInfo, *uploadFence);
+    }
+    (void) waitForFencesSafe(*uploadFence, VK_TRUE);
+
+    // Perf accounting
+    const auto uploadEnd = std::chrono::steady_clock::now();
+    const auto elapsedNs = std::chrono::duration_cast<std::chrono::nanoseconds>(uploadEnd - uploadBegin).count();
+    totalUploadNs.fetch_add(static_cast<uint64_t>(elapsedNs), std::memory_order_relaxed);
+    uploadCount.fetch_add(1, std::memory_order_relaxed);
+    if (stagedBytes > 0) {
+      bytesUploadedTotal.fetch_add(static_cast<uint64_t>(stagedBytes), std::memory_order_relaxed);
+    }
+  } catch (const std::exception& e) {
+    std::cerr << "uploadImageFromStaging failed: " << e.what() << std::endl;
+    throw;
+  }
+}
+
+// Generate the full mip chain of a color image with linear blits on the graphics queue.
+//
+// Expects level 0 to already be in TRANSFER_SRC_OPTIMAL (the layout uploadImageFromStaging
+// leaves it in when mip generation is planned). Levels 1..mipLevels-1 are transitioned from
+// UNDEFINED, so whatever they contained before is discarded. When commands are recorded,
+// every level ends in SHADER_READ_ONLY_OPTIMAL.
+//
+// image, format       : target image and its format (used for the linear-filter check).
+// texWidth, texHeight : dimensions of level 0.
+// mipLevels           : number of levels in the image; 0 is treated as "nothing to do".
+//
+// If the format lacks linear-filter support the function returns before recording any
+// command, so image layouts are left exactly as the caller provided them.
+void Renderer::generateMipmaps(vk::Image image,
+                               vk::Format format,
+                               int32_t texWidth,
+                               int32_t texHeight,
+                               uint32_t mipLevels) {
+  ensureThreadLocalVulkanInit();
+
+  // Nothing to generate or transition; also keeps `mipLevels - 1` below from wrapping around.
+  if (mipLevels == 0) {
+    return;
+  }
+
+  // Verify format supports linear blit
+  auto props = physicalDevice.getFormatProperties(format);
+  if ((props.optimalTilingFeatures & vk::FormatFeatureFlagBits::eSampledImageFilterLinear) == vk::FormatFeatureFlags{}) {
+    return; // no linear filter support; skip
+  }
+
+  vk::CommandPoolCreateInfo poolInfo{.flags = vk::CommandPoolCreateFlagBits::eTransient | vk::CommandPoolCreateFlagBits::eResetCommandBuffer, .queueFamilyIndex = queueFamilyIndices.graphicsFamily.value()};
+  vk::raii::CommandPool tempPool(device, poolInfo);
+  vk::CommandBufferAllocateInfo allocInfo{.commandPool = *tempPool, .level = vk::CommandBufferLevel::ePrimary, .commandBufferCount = 1};
+  vk::raii::CommandBuffers cbs(device, allocInfo);
+  vk::raii::CommandBuffer& cb = cbs[0];
+  cb.begin({.flags = vk::CommandBufferUsageFlagBits::eOneTimeSubmit});
+
+  // Records one Sync2 barrier for a single mip level (color aspect, array layer 0).
+  const auto transitionLevel = [&](uint32_t level,
+                                   vk::PipelineStageFlags2 srcStage,
+                                   vk::AccessFlags2 srcAccess,
+                                   vk::PipelineStageFlags2 dstStage,
+                                   vk::AccessFlags2 dstAccess,
+                                   vk::ImageLayout oldLayout,
+                                   vk::ImageLayout newLayout) {
+    vk::ImageMemoryBarrier2 barrier{
+      .srcStageMask = srcStage,
+      .srcAccessMask = srcAccess,
+      .dstStageMask = dstStage,
+      .dstAccessMask = dstAccess,
+      .oldLayout = oldLayout,
+      .newLayout = newLayout,
+      .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+      .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
+      .image = image,
+      .subresourceRange = {vk::ImageAspectFlagBits::eColor, level, 1, 0, 1}
+    };
+    vk::DependencyInfo dep{.dependencyFlags = vk::DependencyFlagBits::eByRegion, .imageMemoryBarrierCount = 1, .pImageMemoryBarriers = &barrier};
+    cb.pipelineBarrier2(dep);
+  };
+
+  // Level 0 keeps its TRANSFER_SRC_OPTIMAL layout (UNDEFINED must not be used as the old
+  // layout here, since that would discard the uploaded pixels). The barrier only makes the
+  // earlier transfer writes visible to the blit reads below.
+  transitionLevel(0,
+                  vk::PipelineStageFlagBits2::eTransfer,
+                  vk::AccessFlagBits2::eTransferWrite,
+                  vk::PipelineStageFlagBits2::eTransfer,
+                  vk::AccessFlagBits2::eTransferRead,
+                  vk::ImageLayout::eTransferSrcOptimal,
+                  vk::ImageLayout::eTransferSrcOptimal);
+
+  int32_t mipW = texWidth;
+  int32_t mipH = texHeight;
+
+  for (uint32_t i = 1; i < mipLevels; ++i) {
+    // Each level is at least 1x1, halving the previous one.
+    const int32_t nextW = std::max(1, mipW / 2);
+    const int32_t nextH = std::max(1, mipH / 2);
+
+    // Transition level i to TRANSFER_DST_OPTIMAL (previous contents are discarded)
+    transitionLevel(i,
+                    vk::PipelineStageFlagBits2::eTopOfPipe,
+                    vk::AccessFlagBits2::eNone,
+                    vk::PipelineStageFlagBits2::eTransfer,
+                    vk::AccessFlagBits2::eTransferWrite,
+                    vk::ImageLayout::eUndefined,
+                    vk::ImageLayout::eTransferDstOptimal);
+
+    // Blit from i-1 to i
+    vk::ImageBlit blit{};
+    blit.srcSubresource.aspectMask = vk::ImageAspectFlagBits::eColor;
+    blit.srcSubresource.mipLevel = i - 1;
+    blit.srcSubresource.baseArrayLayer = 0;
+    blit.srcSubresource.layerCount = 1;
+    blit.srcOffsets[0] = vk::Offset3D{0, 0, 0};
+    blit.srcOffsets[1] = vk::Offset3D{mipW, mipH, 1};
+    blit.dstSubresource.aspectMask = vk::ImageAspectFlagBits::eColor;
+    blit.dstSubresource.mipLevel = i;
+    blit.dstSubresource.baseArrayLayer = 0;
+    blit.dstSubresource.layerCount = 1;
+    blit.dstOffsets[0] = vk::Offset3D{0, 0, 0};
+    blit.dstOffsets[1] = vk::Offset3D{nextW, nextH, 1};
+    cb.blitImage(image, vk::ImageLayout::eTransferSrcOptimal, image, vk::ImageLayout::eTransferDstOptimal, blit, vk::Filter::eLinear);
+
+    // Level i-1 is finished as a blit source: hand it to the fragment shader.
+    transitionLevel(i - 1,
+                    vk::PipelineStageFlagBits2::eTransfer,
+                    vk::AccessFlagBits2::eTransferRead,
+                    vk::PipelineStageFlagBits2::eFragmentShader,
+                    vk::AccessFlagBits2::eShaderRead,
+                    vk::ImageLayout::eTransferSrcOptimal,
+                    vk::ImageLayout::eShaderReadOnlyOptimal);
+
+    // Transition level i to TRANSFER_SRC_OPTIMAL (so it can be source for i+1)
+    if (i + 1 < mipLevels) {
+      transitionLevel(i,
+                      vk::PipelineStageFlagBits2::eTransfer,
+                      vk::AccessFlagBits2::eTransferWrite,
+                      vk::PipelineStageFlagBits2::eTransfer,
+                      vk::AccessFlagBits2::eTransferRead,
+                      vk::ImageLayout::eTransferDstOptimal,
+                      vk::ImageLayout::eTransferSrcOptimal);
+    }
+
+    mipW = nextW;
+    mipH = nextH;
+  }
+
+  // Transition the last level to SHADER_READ_ONLY_OPTIMAL. With more than one level it was
+  // the final blit destination (TRANSFER_DST_OPTIMAL). With a single level the loop never
+  // ran and level 0 is still in TRANSFER_SRC_OPTIMAL, so the old layout must say so.
+  const bool lastLevelWasBlitTarget = mipLevels > 1;
+  transitionLevel(mipLevels - 1,
+                  vk::PipelineStageFlagBits2::eTransfer,
+                  lastLevelWasBlitTarget ? vk::AccessFlagBits2::eTransferWrite : vk::AccessFlagBits2::eTransferRead,
+                  vk::PipelineStageFlagBits2::eFragmentShader,
+                  vk::AccessFlagBits2::eShaderRead,
+                  lastLevelWasBlitTarget ? vk::ImageLayout::eTransferDstOptimal : vk::ImageLayout::eTransferSrcOptimal,
+                  vk::ImageLayout::eShaderReadOnlyOptimal);
+
+  cb.end();
+
+  // Submit on the graphics queue with a fence; also signal the uploads timeline if it exists.
+  vk::raii::Fence fence(device, vk::FenceCreateInfo{});
+  bool canSignalTimeline = !!*uploadsTimeline;
+  uint64_t signalValue = 0;
+  {
+    std::lock_guard<std::mutex> lock(queueMutex);
+    vk::SubmitInfo submit{};
+    vk::TimelineSemaphoreSubmitInfo timelineInfo{}; // keep alive through submit
+    if (canSignalTimeline) {
+      signalValue = uploadTimelineLastSubmitted.fetch_add(1, std::memory_order_relaxed) + 1;
+      timelineInfo.signalSemaphoreValueCount = 1;
+      timelineInfo.pSignalSemaphoreValues = &signalValue;
+      submit.pNext = &timelineInfo;
+      submit.signalSemaphoreCount = 1;
+      submit.pSignalSemaphores = &*uploadsTimeline;
+    }
+    submit.commandBufferCount = 1;
+    submit.pCommandBuffers = &*cb;
+    graphicsQueue.submit(submit, *fence);
+  }
+  (void) waitForFencesSafe(*fence, VK_TRUE);
+}
\ No newline at end of file
diff --git a/attachments/advanced_gltf/renderer_utils.cpp b/attachments/advanced_gltf/renderer_utils.cpp
new file mode 100644
index 000000000..46319fbf2
--- /dev/null
+++ b/attachments/advanced_gltf/renderer_utils.cpp
@@ -0,0 +1,344 @@
+/* Copyright (c) 2025 Holochip Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "renderer.h"
+#include <array>
+#include <fstream>
+#include <iostream>
+#include <set>
+#include <stdexcept>
+
+// This file contains utility methods from the Renderer class
+
+// Find the index of the first memory type that is allowed by `typeFilter`
+// (bit i set means memory type i is acceptable) and that has every flag in `properties`.
+//
+// Throws std::runtime_error when no memory type matches; the failure is also
+// logged to stderr before being rethrown.
+uint32_t Renderer::findMemoryType(uint32_t typeFilter, vk::MemoryPropertyFlags properties) const {
+  try {
+    const vk::PhysicalDeviceMemoryProperties memProperties = physicalDevice.getMemoryProperties();
+
+    for (uint32_t i = 0; i < memProperties.memoryTypeCount; ++i) {
+      // Unsigned mask: avoids shifting a signed 1 into the sign bit for index 31.
+      const bool allowedByFilter = (typeFilter & (uint32_t{1} << i)) != 0;
+      const bool hasAllProperties = (memProperties.memoryTypes[i].propertyFlags & properties) == properties;
+      if (allowedByFilter && hasAllProperties) {
+        return i;
+      }
+    }
+
+    throw std::runtime_error("Failed to find suitable memory type");
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to find memory type: " << e.what() << std::endl;
+    throw;
+  }
+}
+
+// Return the first candidate whose format properties contain all of `features`
+// for the requested tiling (linear or optimal).
+//
+// Throws std::runtime_error when no candidate qualifies; the failure is logged
+// to stderr before being rethrown.
+vk::Format Renderer::findSupportedFormat(const std::vector<vk::Format>& candidates, vk::ImageTiling tiling, vk::FormatFeatureFlags features) {
+  try {
+    for (const vk::Format candidate : candidates) {
+      const vk::FormatProperties formatProps = physicalDevice.getFormatProperties(candidate);
+
+      const bool linearMatch = tiling == vk::ImageTiling::eLinear &&
+                               (formatProps.linearTilingFeatures & features) == features;
+      const bool optimalMatch = tiling == vk::ImageTiling::eOptimal &&
+                                (formatProps.optimalTilingFeatures & features) == features;
+      if (linearMatch || optimalMatch) {
+        return candidate;
+      }
+    }
+
+    throw std::runtime_error("Failed to find supported format");
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to find supported format: " << e.what() << std::endl;
+    throw;
+  }
+}
+
+// Pick a depth format usable as an optimally-tiled depth/stencil attachment,
+// trying D32_SFLOAT, D32_SFLOAT_S8_UINT and D24_UNORM_S8_UINT in that order.
+// If the lookup throws, the error is logged and D32_SFLOAT is returned.
+vk::Format Renderer::findDepthFormat() {
+  try {
+    const std::vector<vk::Format> depthCandidates{
+      vk::Format::eD32Sfloat,
+      vk::Format::eD32SfloatS8Uint,
+      vk::Format::eD24UnormS8Uint
+    };
+    const vk::Format chosen = findSupportedFormat(depthCandidates,
+                                                  vk::ImageTiling::eOptimal,
+                                                  vk::FormatFeatureFlagBits::eDepthStencilAttachment);
+    std::cout << "Found depth format: " << static_cast<int>(chosen) << std::endl;
+    return chosen;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to find supported depth format, falling back to D32_SFLOAT: " << e.what() << std::endl;
+    // Best-effort fallback instead of propagating the failure.
+    return vk::Format::eD32Sfloat;
+  }
+}
+
+// True for the two combined depth/stencil formats this renderer considers
+// (D32_SFLOAT_S8_UINT and D24_UNORM_S8_UINT); false for every other format.
+bool Renderer::hasStencilComponent(vk::Format format) {
+  switch (format) {
+    case vk::Format::eD32SfloatS8Uint:
+    case vk::Format::eD24UnormS8Uint:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Read an entire file into memory as raw bytes.
+//
+// filename : path of the file to load (opened in binary mode).
+// Returns the file contents; an empty file yields an empty vector.
+// Throws std::runtime_error if the file cannot be opened, its size cannot be
+// determined, or fewer bytes than expected could be read. Failures are logged
+// to stderr before being rethrown.
+std::vector<char> Renderer::readFile(const std::string& filename) {
+  try {
+    // Open positioned at the end so tellg() reports the file size.
+    std::ifstream file(filename, std::ios::ate | std::ios::binary);
+
+    if (!file.is_open()) {
+      throw std::runtime_error("Failed to open file: " + filename);
+    }
+
+    // tellg() reports -1 on failure; converting that straight to size_t would
+    // request an enormous allocation, so validate it first.
+    const std::streamoff fileSize = file.tellg();
+    if (fileSize < 0) {
+      throw std::runtime_error("Failed to determine size of file: " + filename);
+    }
+    std::vector<char> buffer(static_cast<size_t>(fileSize));
+
+    // Go back to the beginning and read the whole file in one call.
+    file.seekg(0);
+    file.read(buffer.data(), static_cast<std::streamsize>(fileSize));
+    if (!file) {
+      // A short read would otherwise hand back a partially filled buffer.
+      throw std::runtime_error("Incomplete read of file: " + filename);
+    }
+
+    file.close();
+
+    return buffer;
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to read file: " << e.what() << std::endl;
+    throw;
+  }
+}
+
+// Wrap a shader binary blob in a vk::raii::ShaderModule.
+//
+// code : shader bytes; the buffer is reinterpreted as 32-bit words and its
+//        byte length is passed as the code size.
+// Failures are logged to stderr and rethrown.
+vk::raii::ShaderModule Renderer::createShaderModule(const std::vector<char>& code) {
+  try {
+    vk::ShaderModuleCreateInfo moduleInfo{};
+    moduleInfo.codeSize = code.size();
+    moduleInfo.pCode = reinterpret_cast<const uint32_t *>(code.data());
+
+    return vk::raii::ShaderModule(device, moduleInfo);
+  } catch (const std::exception& e) {
+    std::cerr << "Failed to create shader module: " << e.what() << std::endl;
+    throw;
+  }
+}
+
+// Select queue family indices for graphics, compute, present and transfer on `device`.
+//
+// For graphics, compute and present the first matching family wins. For transfer, the
+// first family that advertises transfer but NOT graphics is used; if none exists, the
+// graphics family is reused for transfer.
+QueueFamilyIndices Renderer::findQueueFamilies(const vk::raii::PhysicalDevice& device) {
+  QueueFamilyIndices indices;
+
+  const std::vector<vk::QueueFamilyProperties> families = device.getQueueFamilyProperties();
+
+  for (uint32_t familyIndex = 0; familyIndex < families.size(); ++familyIndex) {
+    const vk::QueueFlags flags = families[familyIndex].queueFlags;
+    const bool hasGraphics = static_cast<bool>(flags & vk::QueueFlagBits::eGraphics);
+    const bool hasCompute = static_cast<bool>(flags & vk::QueueFlagBits::eCompute);
+    const bool hasTransfer = static_cast<bool>(flags & vk::QueueFlagBits::eTransfer);
+
+    if (hasGraphics && !indices.graphicsFamily.has_value()) {
+      indices.graphicsFamily = familyIndex;
+    }
+    if (hasCompute && !indices.computeFamily.has_value()) {
+      indices.computeFamily = familyIndex;
+    }
+    // Only query surface support while no present family has been chosen yet.
+    if (!indices.presentFamily.has_value() && device.getSurfaceSupportKHR(familyIndex, *surface)) {
+      indices.presentFamily = familyIndex;
+    }
+    // Dedicated transfer family: transfer bit set, graphics bit clear.
+    const bool dedicatedTransfer = hasTransfer && !hasGraphics;
+    if (dedicatedTransfer && !indices.transferFamily.has_value()) {
+      indices.transferFamily = familyIndex;
+    }
+
+    // Keep scanning until the required families AND a dedicated transfer family are known.
+    if (indices.isComplete() && indices.transferFamily.has_value()) {
+      break;
+    }
+  }
+
+  // No dedicated transfer family was found: share the graphics family.
+  if (!indices.transferFamily.has_value() && indices.graphicsFamily.has_value()) {
+    indices.transferFamily = indices.graphicsFamily;
+  }
+
+  return indices;
+}
+
+// Collect what `device` reports for the renderer's surface: capabilities,
+// supported surface formats and supported present modes.
+SwapChainSupportDetails Renderer::querySwapChainSupport(const vk::raii::PhysicalDevice& device) {
+  SwapChainSupportDetails support;
+  support.capabilities = device.getSurfaceCapabilitiesKHR(*surface);
+  support.formats = device.getSurfaceFormatsKHR(*surface);
+  support.presentModes = device.getSurfacePresentModesKHR(*surface);
+  return support;
+}
+
+// Return true when every entry of requiredDeviceExtensions is offered by `device`.
+// Otherwise the missing extension names are printed to stdout and false is returned.
+bool Renderer::checkDeviceExtensionSupport(vk::raii::PhysicalDevice& device) {
+  const auto offered = device.enumerateDeviceExtensionProperties();
+
+  // Start from the full required set and strike out everything the device offers.
+  std::set<std::string> missing(requiredDeviceExtensions.begin(), requiredDeviceExtensions.end());
+  for (const auto& props : offered) {
+    missing.erase(props.extensionName);
+  }
+
+  if (missing.empty()) {
+    return true;
+  }
+
+  std::cout << "Missing required extensions:" << std::endl;
+  for (const auto& name : missing) {
+    std::cout << "  " << name << std::endl;
+  }
+  return false;
+}
+
+// Decide whether `device` can be used by the renderer. It must provide complete
+// queue families, all required device extensions, at least one surface format and
+// one present mode, and the Vulkan 1.3 dynamicRendering feature.
+bool Renderer::isDeviceSuitable(vk::raii::PhysicalDevice& device) {
+  const QueueFamilyIndices families = findQueueFamilies(device);
+  const bool hasExtensions = checkDeviceExtensionSupport(device);
+
+  // Swap chain details are only queried once the required extensions are known to exist.
+  bool hasUsableSwapChain = false;
+  if (hasExtensions) {
+    const SwapChainSupportDetails support = querySwapChainSupport(device);
+    hasUsableSwapChain = !support.formats.empty() && !support.presentModes.empty();
+  }
+
+  const auto featureChain = device.getFeatures2<vk::PhysicalDeviceFeatures2, vk::PhysicalDeviceVulkan13Features>();
+  const bool hasDynamicRendering = featureChain.get<vk::PhysicalDeviceVulkan13Features>().dynamicRendering != VK_FALSE;
+
+  return families.isComplete() && hasExtensions && hasUsableSwapChain && hasDynamicRendering;
+}
+
+// Prefer B8G8R8A8_SRGB with the sRGB non-linear color space; if it is not offered,
+// fall back to the first entry. `availableFormats` is indexed without a bounds
+// check, so it must not be empty.
+vk::SurfaceFormatKHR Renderer::chooseSwapSurfaceFormat(const std::vector<vk::SurfaceFormatKHR>& availableFormats) {
+  for (const auto& candidate : availableFormats) {
+    const bool isBgraSrgb = candidate.format == vk::Format::eB8G8R8A8Srgb;
+    const bool isSrgbNonlinear = candidate.colorSpace == vk::ColorSpaceKHR::eSrgbNonlinear;
+    if (isBgraSrgb && isSrgbNonlinear) {
+      return candidate;
+    }
+  }
+
+  return availableFormats[0];
+}
+
+// Use mailbox presentation when the surface offers it; otherwise use FIFO.
+vk::PresentModeKHR Renderer::chooseSwapPresentMode(const std::vector<vk::PresentModeKHR>& availablePresentModes) {
+  for (size_t i = 0; i < availablePresentModes.size(); ++i) {
+    if (availablePresentModes[i] == vk::PresentModeKHR::eMailbox) {
+      return availablePresentModes[i];
+    }
+  }
+
+  return vk::PresentModeKHR::eFifo;
+}
+
+// Determine the swap chain extent. When the surface reports a concrete current
+// extent it is used as-is; a width of UINT32_MAX means the size is taken from the
+// platform window instead and clamped to the surface's min/max image extent.
+vk::Extent2D Renderer::chooseSwapExtent(const vk::SurfaceCapabilitiesKHR& capabilities) {
+  const bool surfaceDictatesExtent = capabilities.currentExtent.width != std::numeric_limits<uint32_t>::max();
+  if (surfaceDictatesExtent) {
+    return capabilities.currentExtent;
+  }
+
+  // Ask the platform layer for the current window size.
+  int windowWidth, windowHeight;
+  platform->GetWindowSize(&windowWidth, &windowHeight);
+
+  const vk::Extent2D& minExtent = capabilities.minImageExtent;
+  const vk::Extent2D& maxExtent = capabilities.maxImageExtent;
+
+  vk::Extent2D extent{};
+  extent.width = std::clamp(static_cast<uint32_t>(windowWidth), minExtent.width, maxExtent.width);
+  extent.height = std::clamp(static_cast<uint32_t>(windowHeight), minExtent.height, maxExtent.height);
+  return extent;
+}
+
+// Block until the GPU has finished all known work, in three stages:
+//   1. wait on every valid in-flight frame fence,
+//   2. wait for the uploads timeline semaphore to reach the last submitted value,
+//   3. call device.waitIdle() while holding the queue mutex.
+// The first two stages wait in slices and refresh lastFrameUpdateTime on timeouts
+// so the watchdog is kept alive during long waits.
+void Renderer::WaitIdle() {
+  // 1. In-flight fences (null handles are skipped).
+  std::vector<vk::Fence> pendingFences;
+  pendingFences.reserve(inFlightFences.size());
+  for (const auto& frameFence : inFlightFences) {
+    if (*frameFence) {
+      pendingFences.push_back(*frameFence);
+    }
+  }
+  if (!pendingFences.empty()) {
+    (void) waitForFencesSafe(pendingFences, VK_TRUE);
+  }
+
+  // 2. Uploads timeline semaphore, if it exists.
+  if (*uploadsTimeline) {
+    const uint64_t lastSubmitted = uploadTimelineLastSubmitted.load(std::memory_order_relaxed);
+
+    vk::Result waitResult = vk::Result::eTimeout;
+    while (waitResult == vk::Result::eTimeout) {
+      vk::SemaphoreWaitInfo waitInfo{};
+      waitInfo.semaphoreCount = 1;
+      waitInfo.pSemaphores = &*uploadsTimeline;
+      waitInfo.pValues = &lastSubmitted;
+
+      waitResult = device.waitSemaphores(waitInfo, 100'000'000ULL); // 100ms slice
+      if (waitResult == vk::Result::eTimeout) {
+        // Still waiting: kick the watchdog and try again.
+        lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed);
+      }
+      // Success or any other result ends the loop.
+    }
+  }
+
+  // 3. Final blocking wait. The queue mutex provides the external synchronization
+  // needed so no queue submit/present from another thread overlaps the device-wide
+  // idle (required for validation-layer cleanliness).
+  std::lock_guard<std::mutex> queueGuard(queueMutex);
+  device.waitIdle();
+}
+
+// Wait on `fences`, retrying after every timeout so the watchdog timestamp
+// (lastFrameUpdateTime) keeps being refreshed while the wait is in progress.
+//
+// fences    : fences to wait on; an empty list succeeds immediately.
+// waitAll   : forwarded to device.waitForFences.
+// timeoutNs : length of each individual wait slice in nanoseconds.
+// Returns eSuccess once the wait succeeds, or the first result that is neither
+// eSuccess nor eTimeout.
+vk::Result Renderer::waitForFencesSafe(const std::vector<vk::Fence>& fences, vk::Bool32 waitAll, uint64_t timeoutNs) {
+  if (fences.empty()) {
+    return vk::Result::eSuccess;
+  }
+
+  for (;;) {
+    const vk::Result status = device.waitForFences(fences, waitAll, timeoutNs);
+    if (status != vk::Result::eTimeout) {
+      // Success and every non-timeout result end the wait.
+      return status;
+    }
+    // Timed out: kick the watchdog and wait for another slice.
+    lastFrameUpdateTime.store(std::chrono::steady_clock::now(), std::memory_order_relaxed);
+  }
+}
+
+// Single-fence convenience overload: wraps `fence` in a one-element list and
+// forwards to the vector overload.
+vk::Result Renderer::waitForFencesSafe(vk::Fence fence, vk::Bool32 waitAll, uint64_t timeoutNs) {
+  const std::vector<vk::Fence> singleFence(1, fence);
+  return waitForFencesSafe(singleFence, waitAll, timeoutNs);
+}
\ No newline at end of file
diff --git a/attachments/advanced_gltf/scene_loading.cpp b/attachments/advanced_gltf/scene_loading.cpp
new file mode 100644
index 000000000..45543bb9a
--- /dev/null
+++ b/attachments/advanced_gltf/scene_loading.cpp
@@ -0,0 +1,671 @@
+/* Copyright (c) 2025 Holochip Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 the "License";
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <filesystem>
+#include <glm/gtx/matrix_decompose.hpp>
+#include <iostream>
+#include <chrono>
+#include <sstream>
+#include <vector>
+#include <unordered_map>
+#include <unordered_set>
+#include <string>
+
+#include "renderer.h"
+
+#include "scene_loading.h"
+#include "animation_component.h"
+#include "camera_component.h"
+#include "engine.h"
+#include "mesh_component.h"
+#include "renderer_advanced_types.h"
+#include "transform_component.h"
+
+/**
+ * @brief Measure the axis-aligned bounding box of a MaterialMesh.
+ * @param materialMesh The MaterialMesh whose vertex positions are scanned.
+ * @return Per-axis extent (max - min); a zero vector when the mesh has no vertices.
+ */
+glm::vec3 CalculateBoundingBoxSize(const MaterialMesh& materialMesh) {
+  const auto& verts = materialMesh.vertices;
+  if (verts.empty()) {
+    return glm::vec3(0.0f);
+  }
+
+  // Seed both corners with the first vertex, then widen them over every vertex.
+  glm::vec3 lo = verts[0].position;
+  glm::vec3 hi = lo;
+  for (const auto& v : verts) {
+    lo = glm::min(lo, v.position);
+    hi = glm::max(hi, v.position);
+  }
+  return hi - lo;
+}
+
+/**
+ * @brief Load a GLTF model and build its entities, lights, camera, colliders and animations.
+ * @return true on success; false on any failure (null loader/renderer, load error, exception).
+ * @param engine The engine to create entities in.
+ * @param modelPath The path to the GLTF model file.
+ * @param position The position to place the model.
+ * @param rotation The rotation to apply to the model (Euler angles in degrees).
+ * @param scale The scale to apply to the model.
+ * @param mass Physics mass: > 0 queues dynamic box bodies, otherwise static mesh colliders.
+ */
+bool LoadGLTFModel(Engine* engine,
+                   const std::string& modelPath,
+                   const glm::vec3& position,
+                   const glm::vec3& rotation,
+                   const glm::vec3& scale,
+                   float mass) {
+  // Get the model loader and renderer
+  ModelLoader* modelLoader = engine->GetModelLoader();
+  Renderer* renderer = engine->GetRenderer();
+
+  if (!modelLoader || !renderer) {
+    std::cerr << "Error: ModelLoader or Renderer is null" << std::endl;
+    return false;
+  }
+
+  // Only show the blocking loading overlay for the initial load.
+  // Subsequent loads (Fox, Cube) happen in the background without UI interruption.
+  bool wasInitialLoadComplete = renderer->initialLoadComplete.load(std::memory_order_relaxed);
+  if (!wasInitialLoadComplete) {
+    renderer->SetLoading(true);
+  }
+  renderer->SetLoadingPhase(Renderer::LoadingPhase::Textures);
+  // Ensure loading flag is cleared on any exit from this function
+  struct LoadingGuard {
+    Renderer* r;
+    bool needsClear;
+    ~LoadingGuard() {
+      if (needsClear) {
+        r->SetLoading(false);
+      }
+    }
+  } loadingGuard{renderer, !wasInitialLoadComplete};
+
+  // Extract model name from file path for entity naming
+  std::filesystem::path modelFilePath(modelPath);
+  std::string modelName = modelFilePath.stem().string(); // Get filename without extension
+
+  try {
+    const auto loadStart = std::chrono::steady_clock::now();
+    std::cout << "[Loading] Begin: " << modelPath << std::endl;
+
+    // Suppress watchdog during heavy loading (allowed 60s instead of 10s). The guard
+    // re-enables it on every exit from this try block: success, early return or throw.
+    renderer->watchdogSuppressed.store(true, std::memory_order_relaxed);
+    struct WatchdogGuard {
+      Renderer* r;
+      ~WatchdogGuard() { r->watchdogSuppressed.store(false, std::memory_order_relaxed); }
+    } watchdogGuard{renderer};
+
+    // Loading large scenes can produce tens of thousands of entities.
+    // Avoid per-entity stdout spam (very slow on Windows consoles) and instead
+    // keep counters + print occasional summaries.
+    size_t physicsBodiesQueued = 0;
+    size_t physicsBodiesSkipped = 0;
+    size_t physicsNoGeometry = 0;
+    auto maybeLogPhysicsProgress = [&]() {
+      const size_t total = physicsBodiesQueued + physicsBodiesSkipped + physicsNoGeometry;
+      // Log infrequently to keep visibility without tanking load time.
+      if (total > 0 && (total % 5000u) == 0u) {
+        std::cout << "[Loading] Physics bodies: queued=" << physicsBodiesQueued
+            << ", skipped=" << physicsBodiesSkipped
+            << ", noGeometry=" << physicsNoGeometry << std::endl;
+      }
+    };
+    // Load the complete GLTF model with all textures and lighting (blocking call on this thread)
+    Model* loadedModel = modelLoader->LoadGLTF(modelPath);
+    if (!loadedModel) {
+      std::cerr << "Failed to load GLTF model: " << modelPath << std::endl;
+      return false;
+    }
+
+    std::cout << "Successfully loaded GLTF model with all textures and lighting: " << modelPath << std::endl;
+
+    // Extract lights from the model and transform them to world space
+    std::vector<ExtractedLight> extractedLights = modelLoader->GetExtractedLights(modelPath);
+
+    // Create a transformation matrix from position, rotation, and scale
+    glm::mat4 transformMatrix = glm::mat4(1.0f);
+    transformMatrix = glm::translate(transformMatrix, position);
+    transformMatrix = glm::rotate(transformMatrix, glm::radians(rotation.x), glm::vec3(1.0f, 0.0f, 0.0f));
+    transformMatrix = glm::rotate(transformMatrix, glm::radians(rotation.y), glm::vec3(0.0f, 1.0f, 0.0f));
+    transformMatrix = glm::rotate(transformMatrix, glm::radians(rotation.z), glm::vec3(0.0f, 0.0f, 1.0f));
+    transformMatrix = glm::scale(transformMatrix, scale);
+
+    // Transform all light positions from local model space to world space
+    // Also transform the light direction (for directional lights)
+    glm::mat3 normalMatrix = glm::mat3(glm::transpose(glm::inverse(transformMatrix)));
+    for (auto& light : extractedLights) {
+      glm::vec4 worldPos = transformMatrix * glm::vec4(light.position, 1.0f);
+      light.position = glm::vec3(worldPos);
+      light.direction = glm::normalize(normalMatrix * light.direction);
+    }
+
+    // Append rather than overwrite
+    for (const auto& light : extractedLights) {
+      renderer->staticLights.push_back(light);
+    }
+    std::cout << "[Lights] staticLights appended: " << extractedLights.size() << " entries (total: " << renderer->staticLights.size() << ")" << std::endl;
+
+    // Extract and apply cameras from the GLTF model
+    const std::vector<CameraData>& cameras = loadedModel->GetCameras();
+    if (!cameras.empty()) {
+      const CameraData& gltfCamera = cameras[0]; // Use the first camera
+
+      // Find or create a camera entity to replace the default one
+      Entity* cameraEntity = engine->GetEntity("Camera");
+      if (!cameraEntity) {
+        // Create a new camera entity if none exists
+        cameraEntity = engine->CreateEntity("Camera");
+        if (cameraEntity) {
+          cameraEntity->AddComponent<TransformComponent>();
+          cameraEntity->AddComponent<CameraComponent>();
+        }
+      }
+
+      if (cameraEntity) {
+        // Update the camera transform with GLTF data
+        auto* cameraTransform = cameraEntity->GetComponent<TransformComponent>();
+        if (cameraTransform) {
+          // Apply the transformation matrix to the camera position
+          glm::vec4 worldPos = transformMatrix * glm::vec4(gltfCamera.position, 1.0f);
+          cameraTransform->SetPosition(glm::vec3(worldPos));
+
+          // Apply rotation from GLTF camera
+          glm::vec3 eulerAngles = glm::eulerAngles(gltfCamera.rotation);
+          cameraTransform->SetRotation(eulerAngles);
+        }
+
+        // Update the camera component with GLTF properties
+        auto* camera = cameraEntity->GetComponent<CameraComponent>();
+        if (camera) {
+          camera->ForceViewMatrixUpdate(); // Only sets viewMatrixDirty flag, doesn't change camera orientation
+          if (gltfCamera.isPerspective) {
+            camera->SetFieldOfView(glm::degrees(gltfCamera.fov)); // Convert radians to degrees
+            camera->SetClipPlanes(gltfCamera.nearPlane, gltfCamera.farPlane);
+            if (gltfCamera.aspectRatio > 0.0f) {
+              camera->SetAspectRatio(gltfCamera.aspectRatio);
+            }
+          } else {
+            // Handle orthographic camera if needed
+            camera->SetProjectionType(CameraComponent::ProjectionType::Orthographic);
+            camera->SetOrthographicSize(gltfCamera.orthographicSize, gltfCamera.orthographicSize);
+            camera->SetClipPlanes(gltfCamera.nearPlane, gltfCamera.farPlane);
+          }
+
+          // Set this as the active camera
+          engine->SetActiveCamera(camera);
+        }
+      }
+    }
+
+    // Get the material meshes from the loaded model
+    const std::vector<MaterialMesh>& materialMeshes = modelLoader->GetMaterialMeshes(modelPath);
+    if (materialMeshes.empty()) {
+      std::cerr << "No material meshes found in loaded model: " << modelPath << std::endl;
+      return false;
+    }
+
+    // Collect all geometry entities so we can batch Vulkan uploads for their meshes
+    std::vector<Entity *> geometryEntities;
+    geometryEntities.reserve(materialMeshes.size());
+    // Entity built for materialMeshes[i]; nullptr when that mesh was skipped or creation failed.
+    std::vector<Entity *> entityForMaterialMesh(materialMeshes.size(), nullptr);
+
+    // Phase: Physics (queue colliders / rigid bodies). This is CPU-side work that can
+    // take noticeable time even after textures have finished scheduling.
+    renderer->SetLoadingPhase(Renderer::LoadingPhase::Physics);
+    renderer->SetLoadingPhaseProgress(0.0f);
+
+    for (size_t meshIdx = 0; meshIdx < materialMeshes.size(); ++meshIdx) {
+      AdvancedRenderer_KickWatchdog(renderer);
+      const auto& materialMesh = materialMeshes[meshIdx];
+
+      // Skip materials that have no geometry assigned to them
+      if (materialMesh.vertices.empty() || materialMesh.indices.empty()) {
+        continue;
+      }
+
+      if ((meshIdx % 64u) == 0u) {
+        renderer->SetLoadingPhaseProgress(static_cast<float>(meshIdx) / static_cast<float>(materialMeshes.size()));
+      }
+      // Create an entity name based on model and material. Use the globally-unique material
+      // index so the ray-query material slot (parsed back out of this name) doesn't collide
+      // with same-numbered materials from other models.
+      const int entityMaterialIndex = (materialMesh.globalMaterialIndex >= 0)
+          ? materialMesh.globalMaterialIndex : materialMesh.materialIndex;
+      std::string entityName = modelName + "_Material_" + std::to_string(entityMaterialIndex) +
+          "_" + materialMesh.materialName;
+
+      if (Entity* materialEntity = engine->CreateEntity(entityName)) {
+        // Add a transform component with provided parameters
+        auto* transform = materialEntity->AddComponent<TransformComponent>();
+        transform->SetPosition(position);
+        transform->SetRotation(glm::radians(rotation));
+        transform->SetScale(scale);
+
+        // Add a mesh component with material-specific data
+        auto* mesh = materialEntity->AddComponent<MeshComponent>();
+        mesh->SetVertices(materialMesh.vertices);
+        mesh->SetIndices(materialMesh.indices);
+
+        // Optimization: Pre-calculate local AABB on the background thread.
+        // This caches the AABB once per mesh and avoids slow vertex scans on the main thread
+        // or during the physics ground check below.
+        mesh->RecomputeLocalAABB();
+
+        bool isDef = IsMaterialMeshDeformable(&materialMesh);
+        SetMeshComponentDeformable(mesh, isDef);
+        int numTargets = GetMaterialMeshMorphTargetCount(&materialMesh);
+        SetMeshComponentMorphTargets(mesh, numTargets);
+        SetMeshComponentEnvironment(mesh, entityName.find("bistro") != std::string::npos);
+        if (numTargets > 0) {
+            SetMeshComponentMorphPositions(mesh, GetMaterialMeshMorphPositions(&materialMesh));
+        }
+        if (isDef || numTargets > 0) {
+            std::cout << "[Loading] Entity " << materialEntity->GetName() << " has deformable/morph data (skinned=" << isDef << ", morphTargets=" << numTargets << ")" << std::endl;
+            if (isDef) {
+                SetMeshComponentJointsAndWeights(mesh, GetMaterialMeshJoints(&materialMesh), GetMaterialMeshWeights(&materialMesh));
+            }
+        }
+
+        if (materialMesh.GetInstanceCount() > 0) {
+          mesh->SetInstances(materialMesh.instances);
+        }
+
+        // Set ALL PBR texture paths for this material
+        // Set primary texture path for backward compatibility
+        if (!materialMesh.texturePath.empty()) {
+          mesh->SetTexturePath(materialMesh.texturePath);
+        }
+
+        // Set all PBR texture paths
+        if (!materialMesh.baseColorTexturePath.empty()) {
+          mesh->SetBaseColorTexturePath(materialMesh.baseColorTexturePath);
+        }
+        if (!materialMesh.normalTexturePath.empty()) {
+          mesh->SetNormalTexturePath(materialMesh.normalTexturePath);
+        }
+        if (!materialMesh.metallicRoughnessTexturePath.empty()) {
+          mesh->SetMetallicRoughnessTexturePath(materialMesh.metallicRoughnessTexturePath);
+        }
+        if (!materialMesh.occlusionTexturePath.empty()) {
+          mesh->SetOcclusionTexturePath(materialMesh.occlusionTexturePath);
+        }
+        if (!materialMesh.emissiveTexturePath.empty()) {
+          mesh->SetEmissiveTexturePath(materialMesh.emissiveTexturePath);
+        }
+
+        // Fallback: Use material DB (from ModelLoader) if any PBR texture is still missing
+        if (const Material* mat = modelLoader->GetMaterial(materialMesh.materialName)) {
+          if (mesh->GetBaseColorTexturePath().empty() && !mat->albedoTexturePath.empty()) {
+            mesh->SetBaseColorTexturePath(mat->albedoTexturePath);
+          }
+          if (mesh->GetNormalTexturePath().empty() && !mat->normalTexturePath.empty()) {
+            mesh->SetNormalTexturePath(mat->normalTexturePath);
+          }
+          if (mesh->GetMetallicRoughnessTexturePath().empty() && !mat->metallicRoughnessTexturePath.empty()) {
+            mesh->SetMetallicRoughnessTexturePath(mat->metallicRoughnessTexturePath);
+          }
+          if (mesh->GetOcclusionTexturePath().empty() && !mat->occlusionTexturePath.empty()) {
+            mesh->SetOcclusionTexturePath(mat->occlusionTexturePath);
+          }
+          if (mesh->GetEmissiveTexturePath().empty() && !mat->emissiveTexturePath.empty()) {
+            mesh->SetEmissiveTexturePath(mat->emissiveTexturePath);
+          }
+        }
+
+        // Register all effective texture IDs this mesh uses so that when
+        // textures finish streaming in, the renderer can refresh
+        // descriptor sets for the appropriate entities. This must
+        // happen *after* material fallbacks so we see the final IDs.
+        auto registerTex = [&](const std::string& texId) {
+          if (!texId.empty()) {
+            renderer->RegisterTextureUser(texId, materialEntity);
+          }
+        };
+
+        registerTex(mesh->GetTexturePath());
+        registerTex(mesh->GetBaseColorTexturePath());
+        registerTex(mesh->GetNormalTexturePath());
+        registerTex(mesh->GetMetallicRoughnessTexturePath());
+        registerTex(mesh->GetOcclusionTexturePath());
+        registerTex(mesh->GetEmissiveTexturePath());
+
+        // Track this entity for batched Vulkan resource pre-allocation later
+        geometryEntities.push_back(materialEntity);
+        entityForMaterialMesh[meshIdx] = materialEntity;
+
+        // Create a physics body so dynamic objects (balls, the released Fox)
+        // collide with this geometry.
+        PhysicsSystem* physicsSystem = engine->GetPhysicsSystem();
+        if (physicsSystem) {
+          auto* mc = materialEntity->GetComponent<MeshComponent>();
+          if (mc && !mc->GetVertices().empty() && !mc->GetIndices().empty()) {
+            // Compute the world-space bounds (from the entity transform and the
+            // mesh's local AABB, or the vertices as a fallback) to derive the
+            // collider's center used for distance-based streaming.
+            glm::vec3 minWS(std::numeric_limits<float>::max());
+            glm::vec3 maxWS(-std::numeric_limits<float>::max());
+
+            auto* xform = materialEntity->GetComponent<TransformComponent>();
+            glm::mat4 model = xform ? xform->GetModelMatrix() : glm::mat4(1.0f);
+
+            if (mc->HasLocalAABB()) {
+              glm::vec3 localMin = mc->GetLocalAABBMin();
+              glm::vec3 localMax = mc->GetLocalAABBMax();
+
+              // Transform the 8 corners of the local AABB to world space
+              for (int ix = 0; ix < 2; ++ix) {
+                for (int iy = 0; iy < 2; ++iy) {
+                  for (int iz = 0; iz < 2; ++iz) {
+                    glm::vec3 corner(
+                      ix ? localMax.x : localMin.x,
+                      iy ? localMax.y : localMin.y,
+                      iz ? localMax.z : localMin.z);
+                    glm::vec3 cWS = glm::vec3(model * glm::vec4(corner, 1.0f));
+                    minWS = glm::min(minWS, cWS);
+                    maxWS = glm::max(maxWS, cWS);
+                  }
+                }
+              }
+            } else {
+              // Fallback: compute bounds directly from vertices in world space
+              const auto& verts = mc->GetVertices();
+              for (const auto& v : verts) {
+                glm::vec3 pWS = glm::vec3(model * glm::vec4(v.position, 1.0f));
+                minWS = glm::min(minWS, pWS);
+                maxWS = glm::max(maxWS, pWS);
+              }
+            }
+
+            if (mass > 0.0f) {
+              // Dynamic objects (Fox, balls, etc.) — created up-front since
+              // they need to be active immediately.
+              physicsSystem->EnqueueRigidBodyCreation(
+                materialEntity,
+                CollisionShape::Box,
+                mass,
+                false,
+                0.15f,
+                0.5f
+              );
+              ++physicsBodiesQueued;
+              maybeLogPhysicsProgress();
+            } else {
+              // Static environment colliders use a triangle MESH shape so
+              // dynamic objects collide with the actual surface rather than the
+              // filled volume of an axis-aligned bounding box. They are
+              // registered for distance-based streaming: each mesh is a
+              // candidate, and only those near the camera are promoted to live
+              // Jolt bodies, so we keep ~tens of active bodies instead of 500+.
+              glm::vec3 center = (minWS + maxWS) * 0.5f;
+              physicsSystem->RegisterStreamingCollider(
+                materialEntity,
+                CollisionShape::Mesh,
+                mass,
+                false,
+                0.15f,
+                0.5f,
+                center
+              );
+              ++physicsBodiesQueued;
+              maybeLogPhysicsProgress();
+            }
+          } else {
+            ++physicsNoGeometry;
+            maybeLogPhysicsProgress();
+          }
+        }
+      } else {
+        std::cerr << "Failed to create entity for material " << materialMesh.materialName << std::endl;
+      }
+    }
+    renderer->SetLoadingPhaseProgress(1.0f);
+
+    // Pre-allocate Vulkan resources for all geometry entities in a single batched pass
+    if (!geometryEntities.empty()) {
+      // Scene loading runs on a background thread. Do NOT perform Vulkan allocations
+      // or mutate renderer resource maps here. Enqueue the batch so the render thread can
+      // perform the GPU work safely at its frame-start safe point.
+      renderer->EnqueueEntityPreallocationBatch(geometryEntities);
+    }
+
+    // Final loading summary (useful for profiling, low-noise)
+    std::cout << "[Loading] Physics bodies summary: queued=" << physicsBodiesQueued
+        << ", skipped=" << physicsBodiesSkipped
+        << ", noGeometry=" << physicsNoGeometry << std::endl;
+
+    const auto loadEnd = std::chrono::steady_clock::now();
+    const auto loadMs = std::chrono::duration_cast<std::chrono::milliseconds>(loadEnd - loadStart).count();
+    const auto loadSecs = static_cast<double>(loadMs) / 1000.0;
+    std::cout << "[Loading] End: " << modelPath << " in " << loadSecs << "s" << (loadSecs <= 60.0 ? "" : " (SLOW)") << std::endl;
+
+    // Set up animations if the model has any
+    const std::vector<Animation>& animations = loadedModel->GetAnimations();
+    std::cout << "[Animation] Model has " << animations.size() << " animation(s)" << std::flush << std::endl;
+    if (!animations.empty()) {
+      std::cout << "[Animation] Setting up " << animations.size() << " animation(s) for playback" << std::flush << std::endl;
+
+      // Create an animation controller entity
+      Entity* animController = engine->CreateEntity(modelName + "_AnimController");
+      if (animController) {
+        auto* animTransform = animController->AddComponent<TransformComponent>();
+        animTransform->SetPosition(position);
+        animTransform->SetRotation(glm::radians(rotation));
+        animTransform->SetScale(scale);
+
+        auto* animComponent = animController->AddComponent<AnimationComponent>();
+        animComponent->SetAnimations(animations);
+
+        // Build node-to-entity mapping using actual glTF node indices
+        // Get animated node mesh mappings to link geometry entities to animated nodes
+        const auto& advanced = GetAdvancedModelData(loadedModel);
+        const auto& animatedNodeMeshes = loadedModel->GetAnimatedNodeMeshes();
+
+        // Get the base transforms for animated nodes
+        const auto& animatedNodeTransforms = loadedModel->GetAnimatedNodeTransforms();
+        const auto& nodeSkins = advanced.nodeSkins;
+        const auto& modelSkins = advanced.skins;
+
+        std::cout << "[Animation] Processing " << animatedNodeMeshes.size() << " animated nodes" << std::endl;
+
+        // Build nodeToEntity mapping by creating or finding entities for each animated node
+        std::unordered_map<int, std::vector<Entity *>> nodeToEntities;
+        std::unordered_map<int, int> meshUsageCount; // Track how many times each mesh is used
+
+        // First pass: count how many animated nodes use each mesh
+        for (const auto& [nodeIndex, meshIndex] : animatedNodeMeshes) {
+          meshUsageCount[meshIndex]++;
+        }
+
+        // Optimization: build a quick lookup map from sourceMeshIndex to materialMesh indices
+        std::unordered_map<int, std::vector<size_t>> meshToMaterialIdx;
+        for (size_t i = 0; i < materialMeshes.size(); ++i) {
+            meshToMaterialIdx[materialMeshes[i].sourceMeshIndex].push_back(i);
+        }
+
+        // Second pass: create entities for animated nodes.
+        // Each base geometry entity (created in the static pass above) may be claimed by
+        // exactly ONE animated node — the first one that references that primitive — which
+        // repurposes it as an animated entity and clears its static instances. Any further
+        // nodes that share the primitive get their own _AnimNode_ entity. Tracking the
+        // claimed base entities is essential: if a primitive used by several nodes (e.g. the
+        // repeated bistro fans) never claims/clears its base entity, that base keeps
+        // rendering a frozen, un-animated copy at its load-time pose alongside the animated
+        // nodes — i.e. a ghost duplicate of every animated object.
+        std::unordered_set<size_t> claimedGeometryIdx;
+        for (const auto& [nodeIndex, meshIndex] : animatedNodeMeshes) {
+          std::cout << "[Animation] Processing animated node " << nodeIndex << " with mesh " << meshIndex << std::endl;
+
+          auto it = meshToMaterialIdx.find(meshIndex);
+          if (it == meshToMaterialIdx.end()) continue;
+
+          for (size_t sourceMaterialMeshIdx : it->second) {
+            const MaterialMesh* sourceMaterialMesh = &materialMeshes[sourceMaterialMeshIdx];
+            if (!sourceMaterialMesh || sourceMaterialMesh->vertices.empty() || sourceMaterialMesh->indices.empty()) continue;
+
+            Entity* nodeEntity = nullptr;
+
+            // Reuse the base geometry entity the first time this primitive is claimed by an
+            // animated node; subsequent nodes (and primitives with no base entity) create fresh
+            // _AnimNode_ entities. Look up by material-mesh index, not geometryEntities position.
+            Entity* baseEntity = entityForMaterialMesh[sourceMaterialMeshIdx];
+            const bool firstClaim = baseEntity && claimedGeometryIdx.insert(sourceMaterialMeshIdx).second;
+            if (firstClaim) {
+              nodeEntity = baseEntity;
+              auto* mesh = nodeEntity->GetComponent<MeshComponent>();
+              if (mesh && mesh->GetInstanceCount() > 0) {
+                mesh->ClearInstances();
+                renderer->EnqueueInstanceBufferRecreation(nodeEntity);
+              }
+            } else {
+              const int animMaterialIndex = (sourceMaterialMesh->globalMaterialIndex >= 0)
+                  ? sourceMaterialMesh->globalMaterialIndex : sourceMaterialMesh->materialIndex;
+              std::string entityName = modelName + "_AnimNode_" + std::to_string(nodeIndex) +
+                  "_Material_" + std::to_string(animMaterialIndex);
+              nodeEntity = engine->CreateEntity(entityName);
+              if (nodeEntity) {
+                nodeEntity->AddComponent<TransformComponent>();
+                auto* mesh = nodeEntity->AddComponent<MeshComponent>();
+                mesh->SetVertices(sourceMaterialMesh->vertices);
+                mesh->SetIndices(sourceMaterialMesh->indices);
+
+                bool isDef = IsMaterialMeshDeformable(sourceMaterialMesh);
+                SetMeshComponentDeformable(mesh, isDef);
+                if (isDef) {
+                    SetMeshComponentJointsAndWeights(mesh, GetMaterialMeshJoints(sourceMaterialMesh), GetMaterialMeshWeights(sourceMaterialMesh));
+                    SetMeshComponentMorphTargets(mesh, GetMaterialMeshMorphTargetCount(sourceMaterialMesh));
+                    SetMeshComponentMorphPositions(mesh, GetMaterialMeshMorphPositions(sourceMaterialMesh));
+                }
+
+                if (!sourceMaterialMesh->baseColorTexturePath.empty()) mesh->SetBaseColorTexturePath(sourceMaterialMesh->baseColorTexturePath);
+                if (!sourceMaterialMesh->normalTexturePath.empty()) mesh->SetNormalTexturePath(sourceMaterialMesh->normalTexturePath);
+                if (!sourceMaterialMesh->metallicRoughnessTexturePath.empty()) mesh->SetMetallicRoughnessTexturePath(sourceMaterialMesh->metallicRoughnessTexturePath);
+                if (!sourceMaterialMesh->occlusionTexturePath.empty()) mesh->SetOcclusionTexturePath(sourceMaterialMesh->occlusionTexturePath);
+                if (!sourceMaterialMesh->emissiveTexturePath.empty()) mesh->SetEmissiveTexturePath(sourceMaterialMesh->emissiveTexturePath);
+
+                renderer->RegisterTextureUser(mesh->GetBaseColorTexturePath(), nodeEntity);
+                renderer->RegisterTextureUser(mesh->GetNormalTexturePath(), nodeEntity);
+                renderer->RegisterTextureUser(mesh->GetMetallicRoughnessTexturePath(), nodeEntity);
+                renderer->RegisterTextureUser(mesh->GetOcclusionTexturePath(), nodeEntity);
+                renderer->RegisterTextureUser(mesh->GetEmissiveTexturePath(), nodeEntity);
+
+                renderer->EnqueueEntityPreallocationBatch({nodeEntity});
+                std::cout << "[Animation] Created new entity '" << entityName << "' for node " << nodeIndex << std::endl;
+              }
+            }
+
+            if (nodeEntity) {
+              auto transformIt = animatedNodeTransforms.find(nodeIndex);
+              if (transformIt != animatedNodeTransforms.end()) {
+                glm::mat4 worldNodeTransform = transformMatrix * transformIt->second;
+                glm::vec3 nodePosition, nodeScale, skew;
+                glm::quat nodeRotation;
+                glm::vec4 perspective;
+                glm::decompose(worldNodeTransform, nodeScale, nodeRotation, nodePosition, skew, perspective);
+
+                auto* transform = nodeEntity->GetComponent<TransformComponent>();
+                if (transform) {
+                  transform->SetPosition(nodePosition);
+                  transform->SetRotation(glm::eulerAngles(nodeRotation));
+                  transform->SetScale(nodeScale);
+                }
+              }
+
+              nodeToEntities[nodeIndex].push_back(nodeEntity);
+
+              auto skinIt = nodeSkins.find(nodeIndex);
+              if (skinIt != nodeSkins.end()) {
+                int skinIndex = skinIt->second;
+                if (skinIndex >= 0 && skinIndex < static_cast<int>(modelSkins.size())) {
+                  const auto& skin = modelSkins[skinIndex];
+                  auto* mesh = nodeEntity->GetComponent<MeshComponent>();
+                  if (mesh) {
+                    SetMeshComponentSkinData(mesh, skin.joints, skin.inverseBindMatrices);
+                  }
+                }
+              }
+            }
+          }
+        }
+
+        animComponent->SetNodeToEntityMap(nodeToEntities);
+        AnimationComponent_SetHierarchy(animComponent,
+                                   advanced.nodeChildren,
+                                   advanced.nodeLocalTransforms,
+                                   advanced.nodeLocalTranslations,
+                                   advanced.nodeLocalRotations,
+                                   advanced.nodeLocalScales,
+                                   advanced.rootNodes);
+
+        std::cout << "[Animation] Node-to-entity mapping has " << nodeToEntities.size()
+            << " entries (of " << animatedNodeMeshes.size() << " animated nodes)" << std::endl;
+
+        // Auto-play the first animation, looping (animations is non-empty in this branch)
+        animComponent->Play(0, true);
+        std::cout << "Auto-playing animation: " << animations[0].name
+            << " (duration: " << animations[0].GetDuration() << "s)" << std::endl;
+      }
+    }
+  } catch (const std::exception& e) {
+    std::cerr << "Error loading GLTF model: " << e.what() << std::endl;
+    return false;
+  }
+
+  // Request acceleration structure build at next safe frame point
+  // Don't build here in background thread to avoid threading issues with command pools
+  const bool needsAS = renderer->GetRayQueryEnabled() && renderer->GetAccelerationStructureEnabled();
+  if (needsAS) {
+    renderer->SetLoadingPhase(Renderer::LoadingPhase::AccelerationStructures);
+    renderer->SetLoadingPhaseProgress(0.0f);
+    std::cout << "Requesting acceleration structure build for loaded scene..." << std::endl;
+    renderer->RequestAccelerationStructureBuild();
+  }
+
+  // Clear the scene loader flag so the render thread knows asset construction is done.
+  // IMPORTANT: We deliberately do NOT call MarkInitialLoadComplete() here. Doing so would
+  // hide the loading overlay before the acceleration structure (BLAS+TLAS) has actually
+  // finished building. With chunked GPU resource preallocation (100 entities/frame), the
+  // TLAS cannot be safely built until all BLAS are ready -- otherwise the GPU will hang
+  // dereferencing instances that point to non-existent BLAS.
+  //
+  // Instead, transition to the Finalizing phase and let the render loop's auto-completion
+  // check (renderer_rendering.cpp, around the "MarkInitialLoadComplete" call site) flip
+  // the flag once asBuildRequested == false (i.e., a full successful AS build).
+  renderer->SetLoading(false);
+  // With AS requested: stay in the AS phase; the render loop switches to Finalizing on
+  // the first successful build. Without ray query there is nothing to wait on.
+  if (!needsAS) {
+    renderer->MarkInitialLoadComplete();
+  }
+
+  return true;
+}
+
+/**
+ * @brief Load a GLTF model at the origin with no rotation and unit scale.
+ * @param engine The engine to create entities in.
+ * @param modelPath The path to the GLTF model file.
+ */
+void LoadGLTFModel(Engine* engine, const std::string& modelPath) {
+  // Forward to the full overload; its bool result is discarded because this overload returns void.
+  LoadGLTFModel(engine, modelPath, glm::vec3(0.0f), glm::vec3(0.0f), glm::vec3(1.0f));
+}
diff --git a/attachments/advanced_gltf/scene_loading.h b/attachments/advanced_gltf/scene_loading.h
new file mode 100644
index 000000000..024d4fbe0
--- /dev/null
+++ b/attachments/advanced_gltf/scene_loading.h
@@ -0,0 +1,45 @@
+/* Copyright (c) 2025 Holochip Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 the "License";
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "model_loader.h"
+#include <glm/glm.hpp>
+#include <string>
+#include <vector>
+
+// Forward declarations
+class Engine;
+class ModelLoader;
+
+/**
+ * @brief Load a GLTF model synchronously on the main thread.
+ * @param engine The engine to create entities in.
+ * @param modelPath The path to the GLTF model file.
+ * @param position,rotation,scale The placement transform to apply to the model (rotation units: TODO confirm).
+ * @param mass Defaults to 0.0f. NOTE(review): presumably the physics mass of the loaded bodies — confirm.
+ * @return NOTE(review): presumably true when the model was loaded; confirm against scene_loading.cpp.
+ */
+bool LoadGLTFModel(Engine *engine, const std::string &modelPath,
+                   const glm::vec3 &position, const glm::vec3 &rotation, const glm::vec3 &scale,
+                   float mass = 0.0f);
+
+/**
+ * @brief Load a GLTF model with default transform values; unlike the overload above, no status is returned.
+ * @param engine The engine to create entities in.
+ * @param modelPath The path to the GLTF model file.
+ */
+void LoadGLTFModel(Engine *engine, const std::string &modelPath);
diff --git a/attachments/advanced_gltf/skinning.slang b/attachments/advanced_gltf/skinning.slang
new file mode 100644
index 000000000..4e3806d92
--- /dev/null
+++ b/attachments/advanced_gltf/skinning.slang
@@ -0,0 +1,97 @@
+/* Copyright (c) 2026 Holochip Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 the "License";
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+// GPU compute skinning — Chapter 3: Compute Skinning.
+// Reads rest-pose vertices and joint matrices, writes animated vertices.
+// Dispatched once per skinned mesh per frame, after joint matrices are uploaded.
+
+struct InputVertex {        // source vertex as read by main(); plain float arrays rather than vector types
+    float p[3];             // position
+    float n[3];             // normal
+    float uv[2];            // texture coordinate, copied to the output unchanged
+    float t[4];             // tangent; the 4th component is the handedness and is passed through as-is
+};
+
+struct OutputVertex {       // skinned vertex written by main(); same field order as InputVertex
+    float p[3];             // position after applying the blended joint matrix
+    float n[3];             // normalized normal, transformed by the upper-left 3x3 of the blended matrix
+    float uv[2];            // texture coordinate copied from the input vertex
+    float t[4];             // xyz: normalized transformed tangent; w: input handedness, unchanged
+};
+
+[[vk::binding(0, 0)]] StructuredBuffer<InputVertex>    input_vertices;   // read-only source vertices, indexed by vertex id
+[[vk::binding(1, 0)]] RWStructuredBuffer<OutputVertex> output_vertices;  // destination, written at the same vertex id
+[[vk::binding(2, 0)]] StructuredBuffer<float4x4>       joint_matrices;   // looked up with the per-vertex joint indices
+[[vk::binding(3, 0)]] StructuredBuffer<uint4>          joint_indices;    // four joint indices per vertex
+[[vk::binding(4, 0)]] StructuredBuffer<float4>         joint_weights;    // four blend weights per vertex (not renormalized here)
+
+struct SkinPushConstants {
+    uint vertex_count;      // number of vertices to process; threads with an id at or beyond it exit early
+};
+[[vk::push_constant]] SkinPushConstants push_constants;
+
+[shader("compute")]
+[numthreads(64, 1, 1)]
+void main(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    // One vertex per thread; threads whose id is not below vertex_count exit without writing.
+    const uint vid = dispatchThreadID.x;
+    if (vid >= push_constants.vertex_count)
+        return;
+
+    const InputVertex src     = input_vertices[vid];
+    const uint4       joints  = joint_indices[vid];
+    const float4      weights = joint_weights[vid];
+
+    // Linear blend skinning: scale each of the four referenced joint matrices by its weight,
+    // then add the terms in x, y, z, w order.
+    const float4x4 term_x  = weights.x * joint_matrices[joints.x];
+    const float4x4 term_y  = weights.y * joint_matrices[joints.y];
+    const float4x4 term_z  = weights.z * joint_matrices[joints.z];
+    const float4x4 term_w  = weights.w * joint_matrices[joints.w];
+    const float4x4 blended = term_x + term_y + term_z + term_w;
+
+    // Direction vectors use only the upper-left 3x3 of the blended matrix (no translation).
+    const float3x3 basis = float3x3(blended[0].xyz, blended[1].xyz, blended[2].xyz);
+
+    const float3 rest_pos = float3(src.p[0], src.p[1], src.p[2]);
+    const float3 rest_nrm = float3(src.n[0], src.n[1], src.n[2]);
+    const float3 rest_tan = float3(src.t[0], src.t[1], src.t[2]);
+
+    const float4 skinned_pos = mul(blended, float4(rest_pos, 1.0));
+    const float3 skinned_nrm = normalize(mul(basis, rest_nrm));
+    const float3 skinned_tan = normalize(mul(basis, rest_tan));
+
+    // Assemble the output vertex; UVs and tangent handedness are copied straight from the input.
+    OutputVertex dst;
+    dst.p[0] = skinned_pos.x;
+    dst.p[1] = skinned_pos.y;
+    dst.p[2] = skinned_pos.z;
+
+    dst.n[0] = skinned_nrm.x;
+    dst.n[1] = skinned_nrm.y;
+    dst.n[2] = skinned_nrm.z;
+
+    dst.uv[0] = src.uv[0];
+    dst.uv[1] = src.uv[1];
+
+    dst.t[0] = skinned_tan.x;
+    dst.t[1] = skinned_tan.y;
+    dst.t[2] = skinned_tan.z;
+    dst.t[3] = src.t[3]; // preserve handedness
+
+    output_vertices[vid] = dst;
+}
diff --git a/attachments/advanced_gltf/tutorial_demo.cpp b/attachments/advanced_gltf/tutorial_demo.cpp
new file mode 100644
index 000000000..36d155c4e
--- /dev/null
+++ b/attachments/advanced_gltf/tutorial_demo.cpp
@@ -0,0 +1,575 @@
+/* Copyright (c) 2026 Holochip Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 the "License";
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "tutorial_demo.h"
+#include "scene_loading.h"
+#include "renderer_advanced_types.h"
+
+#include "animation_component.h"
+#include "camera_component.h"
+#include "engine.h"
+#include "entity.h"
+#include "imgui/imgui.h"
+#include "physics_system.h"
+#include "transform_component.h"
+
+#include <glm/gtc/matrix_inverse.hpp>
+#include <glm/gtc/quaternion.hpp>
+
+#include <algorithm>
+#include <cmath>
+#include <string>
+
+// Passes the name "TutorialDemoComponent" to the Component base and keeps the engine pointer as given.
+TutorialDemoComponent::TutorialDemoComponent(Engine *engine) :
+    Component("TutorialDemoComponent"), m_engine(engine) {}
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+Entity *TutorialDemoComponent::FindFoxMesh() const
+{
+    // First entity named "Fox_*" whose name does not contain "AnimController"; nullptr if there is none.
+    const auto &all = m_engine->GetEntities();
+    const auto  hit = std::find_if(all.begin(), all.end(), [](const auto &candidate) {
+        const std::string &name = candidate->GetName();
+        return name.rfind("Fox_", 0) == 0 && name.find("AnimController") == std::string::npos;
+    });
+    return (hit != all.end()) ? hit->get() : nullptr;
+}
+
+glm::vec3 TutorialDemoComponent::MouseToWorld(float depth) const
+{
+    // Returns the point `depth` units along the ray un-projected through the ImGui mouse position.
+    // A default-constructed vector is returned when there is no active camera or the display size is not positive.
+    // GetViewMatrix/GetProjectionMatrix are not const-qualified, hence the const_cast.
+    auto *camera = const_cast<CameraComponent *>(m_engine->GetActiveCamera());
+    if (camera == nullptr)
+        return {};
+
+    const ImGuiIO &io     = ImGui::GetIO();
+    const float    width  = io.DisplaySize.x;
+    const float    height = io.DisplaySize.y;
+    if (width <= 0.0f || height <= 0.0f)
+        return {};
+
+    // Map pixel coordinates to normalised device coordinates; Y is flipped (top of the display maps to +1).
+    const float ndcX = (io.MousePos.x / width) * 2.0f - 1.0f;
+    const float ndcY = 1.0f - (io.MousePos.y / height) * 2.0f;
+
+    // Un-project the NDC point at depth z (0 and 1 are used below) and apply the perspective divide.
+    const glm::mat4 invViewProj = glm::inverse(camera->GetProjectionMatrix() * camera->GetViewMatrix());
+    const auto      unproject   = [&](float z) {
+        const glm::vec4 world = invViewProj * glm::vec4{ndcX, ndcY, z, 1.0f};
+        return glm::vec3(world) / world.w;
+    };
+    const glm::vec3 nearPt = unproject(0.0f);
+    const glm::vec3 farPt  = unproject(1.0f);
+    return nearPt + glm::normalize(farPt - nearPt) * depth;
+}
+
+// ---------------------------------------------------------------------------
+// Chapter 1 — Scene Graph
+// ---------------------------------------------------------------------------
+
+void TutorialDemoComponent::DrawSceneGraphPanel()
+{
+    // Lists every entity with a visibility checkbox, plus a bulk toggle for entities named "bistro_*".
+    if (!ImGui::CollapsingHeader("Chapter 1 — Scene Graph"))
+        return;
+
+    auto entities = SnapshotEntities(m_engine);
+    ImGui::Text("Total entities: %zu", entities.size());
+    ImGui::Separator();
+    static bool bistroVisible = true;        // function-local static: persists across frames
+    if (ImGui::Checkbox("Toggle All Bistro Models", &bistroVisible))
+    {
+        for (auto *e : entities)
+        {
+            // rfind(prefix, 0) is a real starts-with test; find(prefix) == 0 scans the whole name on a miss.
+            if (e->GetName().rfind("bistro_", 0) == 0)
+                e->SetActive(bistroVisible);
+        }
+    }
+    ImGui::Separator();
+
+    ImGui::BeginChild("EntityList", ImVec2(0.0f, 200.0f), true);
+    for (auto *e : entities)
+    {
+        bool active = e->IsActive();
+        ImGui::PushID(e);        // the entity pointer keeps the "##active" checkbox IDs unique per row
+        if (ImGui::Checkbox("##active", &active))
+            e->SetActive(active);
+        ImGui::SameLine();
+        ImGui::Text("%s", e->GetName().c_str());
+        if (auto *t = e->GetComponent<TransformComponent>())
+        {
+            const glm::vec3 &p = t->GetPosition();
+            ImGui::SameLine();
+            ImGui::TextDisabled("(%.1f, %.1f, %.1f)", p.x, p.y, p.z);
+        }
+        ImGui::PopID();
+    }
+    ImGui::EndChild();
+    ImGui::TextWrapped("Tip: uncheck the box next to an entity to hide it. "
+                       "Each entity is a node in the scene graph; TransformComponent "
+                       "stores local SRT and recomputes the world matrix on demand, "
+                       "using the same dirty-flag pattern as Node::mark_dirty() in node.h.");
+}
+
+// ---------------------------------------------------------------------------
+// Chapter 2 — Skeletal Animation
+// ---------------------------------------------------------------------------
+
+void TutorialDemoComponent::DrawAnimationPanel()
+{
+    // Playback controls for the AnimationComponent on the "Fox_AnimController" entity: clip selection,
+    // pause/resume/restart, speed, and a read-only progress bar. Shows a hint when it cannot be found.
+    if (!ImGui::CollapsingHeader("Chapter 2 — Skeletal Animation", ImGuiTreeNodeFlags_DefaultOpen))
+        return;
+
+    AnimationComponent *anim = nullptr;
+    if (Entity *controller = m_engine->GetEntity("Fox_AnimController"))
+        anim = controller->GetComponent<AnimationComponent>();
+    if (anim == nullptr)
+    {
+        ImGui::TextColored(ImVec4(1.0f, 0.5f, 0.0f, 1.0f),
+                           "Fox model not loaded.\n"
+                           "Run:  assets/download_samples.sh\n"
+                           "then rebuild to enable animation controls.");
+        return;
+    }
+
+    const auto &clips = anim->GetAnimations();
+    if (clips.empty())
+    {
+        ImGui::Text("Model has no animation clips.");
+        return;
+    }
+
+    // ImGui::Combo takes an array of C strings, so collect a pointer to each clip name.
+    std::vector<const char *> labels(clips.size());
+    std::transform(clips.begin(), clips.end(), labels.begin(),
+                   [](const auto &clip) { return clip.name.c_str(); });
+    if (ImGui::Combo("Clip", &m_selectedAnim, labels.data(), static_cast<int>(labels.size())))
+        anim->Play(static_cast<size_t>(m_selectedAnim), true);
+
+    // Transport: one button that pauses or resumes depending on the current state, then a restart button.
+    const bool playing = anim->IsPlaying();
+    if (ImGui::Button(playing ? "Pause" : "Play"))
+    {
+        if (playing)
+            anim->Pause();
+        else
+            anim->Resume();
+    }
+    ImGui::SameLine();
+    if (ImGui::Button("Restart"))
+        anim->Play(static_cast<size_t>(m_selectedAnim), true);
+
+    // Playback speed, pushed to the component only when the slider value changes.
+    if (ImGui::SliderFloat("Speed", &m_animSpeed, 0.0f, 3.0f, "%.2f x"))
+        anim->SetSpeed(m_animSpeed);
+
+    // Read-only progress bar: current time over duration, with both values overlaid as text.
+    const float duration = anim->GetCurrentDuration();
+    float       fraction = 0.0f;
+    if (duration > 0.0f)
+        fraction = anim->GetCurrentTime() / duration;
+    char label[32];
+    std::snprintf(label, sizeof(label), "%.2f / %.2f s",
+                  anim->GetCurrentTime(), duration);
+    ImGui::ProgressBar(fraction, ImVec2(-1.0f, 0.0f), label);
+
+    ImGui::Spacing();
+    ImGui::TextWrapped("AnimationComponent samples each channel with STEP, LINEAR, or "
+                       "CUBICSPLINE interpolation matching the glTF spec.  The helper "
+                       "functions in animation.h (find_keyframe, cubic_spline_interpolate_vec3, "
+                       "apply_pose_to_scene_graph) implement these algorithms directly.");
+}
+
+// ---------------------------------------------------------------------------
+// Chapter 3 — Rigid-Body Physics
+// ---------------------------------------------------------------------------
+
+void TutorialDemoComponent::DrawPhysicsPanel()
+{
+    // Gravity / throw-force sliders plus Add / Throw / Reset controls for a rigid body on the Fox mesh.
+    if (!ImGui::CollapsingHeader("Chapter 3 — Rigid-Body Physics"))
+        return;
+    // GetPhysicsSystem() is null-checked in DrawDebugPanel, so treat it as nullable here as well.
+    auto *physics = m_engine->GetPhysicsSystem();
+    if (!physics)
+    {
+        ImGui::TextColored(ImVec4(1.0f, 0.5f, 0.0f, 1.0f), "Physics system unavailable.");
+        return;
+    }
+
+    if (ImGui::SliderFloat("Gravity Scale", &m_gravityScale, 0.0f, 3.0f, "%.2f"))
+        physics->SetGravity(glm::vec3(0.0f, -9.81f * m_gravityScale, 0.0f));
+    ImGui::SliderFloat("Throw Force", &m_throwForce, 1.0f, 30.0f, "%.1f");
+    ImGui::Separator();
+    ImGui::Text("Fox rigid body:");
+    Entity *foxMesh = FindFoxMesh();
+    if (!foxMesh)
+    {
+        ImGui::TextColored(ImVec4(1.0f, 0.5f, 0.0f, 1.0f), "Fox mesh not found — load Fox.gltf first.");
+        return;
+    }
+
+    if (!m_activeBody)
+    {
+        if (ImGui::Button("Add Rigid Body"))
+        {
+            m_activeEntity = foxMesh;
+            m_activeBody   = physics->CreateRigidBody(foxMesh, CollisionShape::Box, 1.0f);
+            if (m_activeBody)
+                m_activeBody->SetKinematic(true);
+        }
+        ImGui::SameLine();
+        ImGui::TextDisabled("(hold G to grab while kinematic, release to throw)");
+    }
+    else
+    {
+        glm::vec3 vel = m_activeBody->GetLinearVelocity();
+        ImGui::Text("Velocity: (%.2f, %.2f, %.2f) m/s", vel.x, vel.y, vel.z);
+        // NOTE(review): the G-release path in Update() also calls SetEntityPhysicsOwned(entity, true)
+        // when it makes the body dynamic; this button does not — confirm whether that is intended.
+        if (ImGui::Button("Throw!"))
+        {
+            m_activeBody->SetKinematic(false);
+            m_activeBody->SetLinearVelocity(glm::vec3(0.0f, m_throwForce, -m_throwForce * 0.5f));
+        }
+        ImGui::SameLine();
+        if (ImGui::Button("Reset"))
+        {
+            physics->DestroyRigidBody(m_activeBody);
+            m_activeBody   = nullptr;
+            m_activeEntity = nullptr;
+            m_grabMode     = false;
+            if (auto *t = foxMesh->GetComponent<TransformComponent>())
+                t->SetPosition(glm::vec3(0.0f, 0.0f, 0.0f));
+        }
+    }
+    ImGui::Spacing();
+    ImGui::TextWrapped("PhysicsSystem drives a GPU compute pipeline: broadphase, narrowphase, "
+                       "and constraint resolution all run as Vulkan compute dispatches.  "
+                       "ColliderDef and ConstraintDef in node.h describe the per-node physics "
+                       "metadata that add_physics_extras.py injects into a glTF file.");
+}
+
+// ---------------------------------------------------------------------------
+// Chapter 4 — Inverse Kinematics (FABRIK)
+// ---------------------------------------------------------------------------
+
+void TutorialDemoComponent::DrawIKPanel()
+{
+    // Describes the G-key grab & throw interaction and draws a small 2D FABRIK demo: a 3-bone
+    // chain anchored near the bottom-centre of a canvas, reaching for a target whose position
+    // is a function of m_fabrikPhase.
+    if (!ImGui::CollapsingHeader("Chapter 4 — Inverse Kinematics (FABRIK)"))
+        return;
+
+    ImGui::TextWrapped("Hold [G] in the viewport (not over this panel) to grab the "
+                       "Fox mesh and drag it.  Release G to throw it using the drag "
+                       "velocity — demonstrating the IK-target-follow → physics-handoff "
+                       "pattern from Chapter 4.");
+
+    ImGui::SliderFloat("Grab Depth", &m_grabDepth, 1.0f, 20.0f, "%.1f");
+
+    if (m_grabMode)
+    {
+        ImGui::TextColored(ImVec4(0.2f, 1.0f, 0.2f, 1.0f), " GRAB ACTIVE  — release G to throw");
+    }
+
+    ImGui::Separator();
+    ImGui::Text("FABRIK solver visualisation (3-bone chain, animated target):");
+
+    // Reserve a canvas as wide as the remaining content region and 160 high, then paint its background.
+    const float  canvasW   = ImGui::GetContentRegionAvail().x;
+    const float  canvasH   = 160.0f;
+    const ImVec2 canvasMin = ImGui::GetCursorScreenPos();
+    ImGui::InvisibleButton("##fab", ImVec2(canvasW, canvasH));
+    ImDrawList *draw = ImGui::GetWindowDrawList();
+
+    draw->AddRectFilled(canvasMin, ImVec2(canvasMin.x + canvasW, canvasMin.y + canvasH),
+                        IM_COL32(18, 18, 30, 220), 4.0f);
+
+    // The target moves around the canvas centre; the chain root sits 12 units above the bottom edge.
+    const float     centreX = canvasMin.x + canvasW * 0.5f;
+    const float     centreY = canvasMin.y + canvasH * 0.5f;
+    const glm::vec2 target  = {centreX + std::cos(m_fabrikPhase) * canvasW * 0.38f,
+                               centreY + std::sin(m_fabrikPhase * 0.7f) * canvasH * 0.38f};
+    const glm::vec2 root    = {centreX, canvasMin.y + canvasH - 12.0f};
+    const float     boneLen = canvasH * 0.27f;
+
+    // Moves `joint` to bone length from `anchor` along the anchor-to-joint direction. A joint that
+    // (almost) coincides with its anchor has no usable direction and is returned unchanged.
+    const auto reach = [boneLen](const glm::vec2 &anchor, const glm::vec2 &joint) -> glm::vec2 {
+        const glm::vec2 dir  = joint - anchor;
+        const float     dist = glm::length(dir);
+        if (dist > 0.001f)
+            return anchor + (dir / dist) * boneLen;
+        return joint;
+    };
+
+    // Every joint starts at the root except the end effector, which is placed on the target.
+    glm::vec2 chain[4] = {root, root, root, target};
+
+    // Forward pass: walk from the end effector back towards the root.
+    for (int i = 2; i >= 0; --i)
+        chain[i] = reach(chain[i + 1], chain[i]);
+
+    // Backward pass: re-anchor the root and walk out to the end effector.
+    chain[0] = root;
+    for (int i = 1; i <= 3; ++i)
+        chain[i] = reach(chain[i - 1], chain[i]);
+
+    // Draw each bone as a line followed by a dot on its first joint; the last joint gets a dot only.
+    const ImU32 boneColour  = IM_COL32(90, 190, 255, 255);
+    const ImU32 jointColour = IM_COL32(255, 200, 70, 255);
+
+    for (int i = 0; i <= 3; ++i)
+    {
+        const ImVec2 here(chain[i].x, chain[i].y);
+        if (i < 3)
+            draw->AddLine(here, ImVec2(chain[i + 1].x, chain[i + 1].y),
+                          boneColour, 3.0f);
+        draw->AddCircleFilled(here, 5.0f, jointColour);
+    }
+
+    // Target marker with its label.
+    draw->AddCircle(ImVec2(target.x, target.y), 9.0f, IM_COL32(255, 70, 70, 255), 0, 2.0f);
+    draw->AddText(ImVec2(target.x + 12.0f, target.y - 8.0f),
+                  IM_COL32(255, 100, 100, 255), "target");
+
+    // Root label.
+    draw->AddText(ImVec2(root.x + 6.0f, root.y - 8.0f), IM_COL32(180, 180, 180, 200), "root");
+
+    ImGui::Spacing();
+    ImGui::TextWrapped("update_world_matrices_subtree() in node.h propagates the dirty flag "
+                       "upward after each FABRIK pass, so only the affected subtree is "
+                       "recomputed — matching the pattern shown in Chapter 4.");
+}
+
+// ---------------------------------------------------------------------------
+// Chapter 5 — Morph Targets
+// ---------------------------------------------------------------------------
+
+void TutorialDemoComponent::DrawMorphPanel()
+{
+    // Shows a hint when no morph-target sample model is in the scene, then four weight sliders.
+    // The sliders only edit m_morphWeights; nothing in this function forwards them to the engine.
+    if (!ImGui::CollapsingHeader("Chapter 5 — Morph Targets"))
+        return;
+
+    // A morph model counts as loaded when any entity name starts with one of the two sample
+    // prefixes below.
+    const auto &entities      = m_engine->GetEntities();
+    const bool  hasMorphModel = std::any_of(entities.begin(), entities.end(), [](const auto &e) {
+        const auto &name = e->GetName();
+        return name.rfind("AnimatedMorphCube_", 0) == 0 ||
+               name.rfind("MorphPrimitivesTest_", 0) == 0;
+    });
+
+    if (!hasMorphModel)
+    {
+        ImGui::TextColored(ImVec4(1.0f, 0.5f, 0.0f, 1.0f),
+                           "No morph-target model loaded.\n"
+                           "Run:  assets/download_samples.sh\n"
+                           "then call LoadGLTFModel for AnimatedMorphCube.gltf.");
+    }
+
+    // The sliders are drawn whether or not a morph model was found.
+    ImGui::Text("Morph target weights (for reference; engine applies via compute):");
+    // One slider per entry of m_morphWeights.
+    static const char *const kLabels[4] = {"Weight 0", "Weight 1", "Weight 2", "Weight 3"};
+    for (int i = 0; i < 4; ++i)
+        ImGui::SliderFloat(kLabels[i], &m_morphWeights[i], 0.0f, 1.0f);
+
+    ImGui::Spacing();
+    ImGui::TextWrapped("morph_accumulate.slang accumulates weighted position/normal/tangent "
+                       "deltas in a single compute dispatch.  Weight animation uses the glTF "
+                       "WEIGHTS AnimationPath, sampled by the same keyframe pipeline as "
+                       "translation and rotation channels.");
+}
+
+// ---------------------------------------------------------------------------
+// Chapter 6 — Debug Heatmap
+// ---------------------------------------------------------------------------
+
+void TutorialDemoComponent::DrawDebugPanel()
+{
+    // Describes the heatmap shader and prints scene statistics: entity counts and, when a physics
+    // system is available, its gravity vector and whether GPU acceleration is enabled.
+    if (!ImGui::CollapsingHeader("Chapter 6 — Debug & Skinning Heatmap"))
+        return;
+
+    ImGui::TextWrapped("pbr_heatmap.slang provides two fragment entry points that share "
+                       "one vertex shader: fragment_dominant_bone colours each pixel by "
+                       "the bone index with highest weight; fragment_weight_distribution "
+                       "goes green for balanced weights and red for a single dominant bone.");
+
+    ImGui::Spacing();
+    ImGui::Text("Scene summary:");
+
+    auto entities    = SnapshotEntities(m_engine);
+    int  animated    = 0;
+    int  transformed = 0;
+    for (auto *entity : entities)
+    {
+        animated += (entity->GetComponent<AnimationComponent>() != nullptr) ? 1 : 0;
+        transformed += (entity->GetComponent<TransformComponent>() != nullptr) ? 1 : 0;
+    }
+    ImGui::BulletText("Entities total        : %zu", entities.size());
+    ImGui::BulletText("With AnimationComponent: %d", animated);
+    ImGui::BulletText("With TransformComponent: %d", transformed);
+
+    auto *physics = m_engine->GetPhysicsSystem();
+    if (physics == nullptr)
+        return;        // nothing more to report without a physics system
+    const glm::vec3 gravity = physics->GetGravity();
+    ImGui::BulletText("Gravity               : (%.2f, %.2f, %.2f)", gravity.x, gravity.y, gravity.z);
+    ImGui::BulletText("GPU physics           : %s",
+                      physics->IsGPUAccelerationEnabled() ? "enabled" : "disabled");
+}
+
+// ---------------------------------------------------------------------------
+// Update — called every frame after ImGui::NewFrame()
+// ---------------------------------------------------------------------------
+
+void TutorialDemoComponent::Update(std::chrono::milliseconds deltaTime)
+{
+    // Advance FABRIK canvas animation (the phase accumulates elapsed seconds)
+    m_fabrikPhase += static_cast<float>(deltaTime.count()) * 0.001f;
+
+    // GetPhysicsSystem() is null-checked in DrawDebugPanel, so fetch it once and guard each use below.
+    auto *physics = m_engine->GetPhysicsSystem();
+
+    // ---- G-key grab & throw ----
+    // ImGui::GetIO().WantCaptureKeyboard is true when the user is typing into an
+    // ImGui input field; skip grab input in that case so text entry still works.
+    const ImGuiIO &io      = ImGui::GetIO();
+    // GLFW stores raw key codes in KeysDown; 'G' == GLFW_KEY_G == 71 (ASCII).
+    // NOTE(review): newer Dear ImGui expects an ImGuiKey (ImGuiKey_G) here — confirm the bundled version.
+    bool           grabKey = ImGui::IsKeyDown(static_cast<int>('G')) && !io.WantCaptureKeyboard;
+
+    if (grabKey)
+    {
+        glm::vec3 worldPt = MouseToWorld(m_grabDepth);
+
+        if (!m_grabMode)
+        {
+            // First frame of grab — initialise state
+            m_grabMode      = true;
+            m_grabCurrent   = worldPt;
+            m_throwVelocity = {};
+            m_grabOffset    = {};
+
+            // Attach a kinematic rigid body to the first Fox mesh entity found.
+            // If a body already exists (e.g. from the physics panel), reuse it.
+            if (!m_activeBody)
+            {
+                Entity *fox = FindFoxMesh();
+                if (fox)
+                {
+                    m_activeEntity = fox;
+                    if (physics)
+                        m_activeBody = physics->CreateRigidBody(fox, CollisionShape::Box, 1.0f);
+                }
+            }
+            if (m_activeBody)
+                m_activeBody->SetKinematic(true);
+
+            // Capture the offset between the object's current position and the cursor-ray
+            // point so the object is grabbed in place (no teleport to the cursor) and then
+            // moves relative to cursor motion.
+            if (m_activeEntity)
+            {
+                if (auto* t = m_activeEntity->GetComponent<TransformComponent>())
+                    m_grabOffset = t->GetPosition() - worldPt;
+            }
+        }
+
+        // Compute instantaneous velocity from position delta
+        float dt = static_cast<float>(deltaTime.count()) * 0.001f;
+        if (dt > 0.0f)
+            m_throwVelocity = (worldPt - m_grabCurrent) / dt;
+        m_grabCurrent = worldPt;
+
+        // Drag the rigid body to follow the cursor, preserving the initial grab offset.
+        if (m_activeBody)
+            m_activeBody->SetPosition(worldPt + m_grabOffset);
+    }
+    else if (m_grabMode)
+    {
+        // G released — switch to dynamic and apply throw velocity
+        m_grabMode = false;
+        if (m_activeBody)
+        {
+            m_activeBody->SetKinematic(false);
+            m_activeBody->SetLinearVelocity(m_throwVelocity);
+            // The physics system now owns this entity's transform. Tell the animation
+            // system to stop driving it, otherwise the animation resets the transform to
+            // the animated pose every frame and the object oscillates with the physics pose.
+            SetEntityPhysicsOwned(m_activeEntity, true);
+            // Body is now owned by the physics system; we stop tracking it here
+            // so the throw arc plays out without further interference.
+            m_activeBody   = nullptr;
+            m_activeEntity = nullptr;
+        }
+    }
+
+    // ---- Right-click to spawn balls ----
+    // Right-click in the viewport spawns a procedural-sphere "ball" with a dynamic rigid body (mass=1.0).
+    if (ImGui::IsMouseClicked(1) && !io.WantCaptureMouse)
+    {
+        glm::vec3 spawnPos = MouseToWorld(2.0f); // Spawn 2m in front of camera
+        std::string ballName = "Ball_" + std::to_string(m_ballCounter++);
+        if (Entity* ball = m_engine->CreateEntity(ballName)) {
+            // Update the engine's last spawned ball global for optimized camera tracking
+            extern Entity* g_lastSpawnedBall;
+            g_lastSpawnedBall = ball;
+            ball->AddComponent<TransformComponent>()->SetPosition(spawnPos);
+            auto* mesh = ball->AddComponent<MeshComponent>();
+            mesh->CreateSphere(0.2f, glm::vec3(1.0f, 0.2f, 0.2f), 16);
+            if (physics)        // without a physics system the ball is still created, just with no body
+                physics->CreateRigidBody(ball, CollisionShape::Sphere, 1.0f);
+            m_engine->GetRenderer()->preAllocateEntityResources(ball);
+        }
+    }
+
+    // ---- Tutorial ImGui window ----
+    ImGui::SetNextWindowPos(ImVec2(io.DisplaySize.x - 385.0f, 10.0f), ImGuiCond_FirstUseEver);
+    ImGui::SetNextWindowSize(ImVec2(375.0f, io.DisplaySize.y - 20.0f), ImGuiCond_FirstUseEver);
+    ImGui::SetNextWindowBgAlpha(0.88f);
+
+    ImGui::Begin("Advanced glTF Tutorial", nullptr, ImGuiWindowFlags_NoCollapse);
+    ImGui::Text("%.0f FPS  |  [G] Grab & Throw  |  [WASD] Camera", io.Framerate);
+    if (m_grabMode)
+        ImGui::TextColored(ImVec4(0.2f, 1.0f, 0.2f, 1.0f), "  GRAB MODE ACTIVE");
+    ImGui::Separator();
+
+    DrawSceneGraphPanel();
+    DrawAnimationPanel();
+    DrawPhysicsPanel();
+    DrawIKPanel();
+    DrawMorphPanel();
+    DrawDebugPanel();
+
+    ImGui::End();
+}
diff --git a/attachments/advanced_gltf/tutorial_demo.h b/attachments/advanced_gltf/tutorial_demo.h
new file mode 100644
index 000000000..af8da111d
--- /dev/null
+++ b/attachments/advanced_gltf/tutorial_demo.h
@@ -0,0 +1,76 @@
+/* Copyright (c) 2026 Holochip Corporation
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 the "License";
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "component.h"
+
+#include <chrono>
+#include <glm/glm.hpp>
+
+class Engine;
+class RigidBody;
+class Entity;
+
+/**
+ * Component that renders the Advanced glTF tutorial ImGui panels and handles
+ * interactive grab-and-throw via the G key and right-click ball spawning.  One instance lives
+ * on a dedicated "TutorialDemo" entity and its Update() is called every frame after
+ * ImGui::NewFrame(), so all ImGui calls inside are safe.
+ */
+class TutorialDemoComponent final : public Component
+{
+  public:
+    explicit TutorialDemoComponent(Engine *engine);        // stores the pointer; it is not null-checked
+    void Update(std::chrono::milliseconds deltaTime) override;        // input handling, then all panels
+
+  private:
+    Engine *m_engine;        // used to reach entities, the active camera, physics and the renderer
+
+    // Chapter 2 — Animation
+    int   m_selectedAnim = 0;        // clip index chosen in the combo, passed to Play()
+    float m_animSpeed    = 1.0f;        // slider value (0..3) pushed to SetSpeed()
+
+    // Chapter 3 — Physics
+    float      m_gravityScale  = 1.0f;        // multiplies -9.81 on the Y axis of gravity
+    float      m_throwForce    = 8.0f;        // velocity scale used by the "Throw!" button
+    RigidBody *m_activeBody    = nullptr;        // body currently tracked; cleared on Reset and on G release
+    Entity    *m_activeEntity  = nullptr;        // entity the tracked body was created for
+
+    // Chapter 4 — IK / Grab & Throw
+    bool      m_grabMode      = false;        // true while a G-key grab is in progress
+    glm::vec3 m_grabCurrent   = {};        // cursor-ray world point from the latest grab frame
+    glm::vec3 m_throwVelocity = {};        // cursor-point velocity, applied to the body on release
+    glm::vec3 m_grabOffset    = {}; // offset from cursor-ray point to the grabbed object, captured on grab
+    float     m_grabDepth     = 4.0f;        // distance along the cursor ray used for grabbing
+    float     m_fabrikPhase   = 0.0f;   // drives the animated FABRIK canvas target
+
+    // Chapter 5 — Morph
+    float m_morphWeights[4] = {};        // slider-edited values; see DrawMorphPanel
+
+    // helpers
+    Entity    *FindFoxMesh() const;        // first "Fox_*" entity whose name lacks "AnimController"
+    glm::vec3  MouseToWorld(float depth) const;        // point `depth` along the ray through the mouse cursor
+
+    int m_ballCounter = 0;        // numeric suffix for the names of right-click spawned "Ball_N" entities
+    // One collapsible ImGui section per tutorial chapter, drawn in this order by Update().
+    void DrawSceneGraphPanel();
+    void DrawAnimationPanel();
+    void DrawPhysicsPanel();
+    void DrawIKPanel();
+    void DrawMorphPanel();
+    void DrawDebugPanel();
+};
diff --git a/attachments/simple_engine/camera_component.h b/attachments/simple_engine/camera_component.h
index 0d3225b00..63ff1aecf 100644
--- a/attachments/simple_engine/camera_component.h
+++ b/attachments/simple_engine/camera_component.h
@@ -149,13 +149,13 @@ class CameraComponent : public Component
 
 	/**
 	 * @brief Set the near and far planes.
-	 * @param near The near plane distance.
-	 * @param far The far plane distance.
+	 * @param nearPlaneDistance The near plane distance.
+	 * @param farPlaneDistance The far plane distance.
 	 */
-	void SetClipPlanes(float near, float far)
+	void SetClipPlanes(float nearPlaneDistance, float farPlaneDistance)
 	{
-		nearPlane             = near;
-		farPlane              = far;
+		nearPlane             = nearPlaneDistance;
+		farPlane              = farPlaneDistance;
 		projectionMatrixDirty = true;
 	}
 
diff --git a/en/Advanced_glTF/Debugging_Visual_Auditing/01_introduction.adoc b/en/Advanced_glTF/Debugging_Visual_Auditing/01_introduction.adoc
new file mode 100644
index 000000000..ef72dc5ae
--- /dev/null
+++ b/en/Advanced_glTF/Debugging_Visual_Auditing/01_introduction.adoc
@@ -0,0 +1,14 @@
+:pp: {plus}{plus}
+= Debugging & Visual Auditing
+
+== When the Asset Is Right and the Engine Is Wrong
+
+The previous chapter established how to ensure that a glTF asset is correct before it reaches the engine. This chapter picks up from the moment the asset passes validation and the reference viewer shows the correct result—but your engine still renders something wrong.
+
+At this stage, the problem is in the engine code. The question is which part of the code, and that is where visual debugging tools become essential. The character pipeline we have built over this series has several independently complex stages: the scene graph transform hierarchy, the compute skinning pass, the physics simulation, and the procedural animation layers. A visual artifact could originate in any of them. Without tooling that makes each stage's state visible, you are reduced to blind guessing.
+
+The techniques in this chapter are about making invisible state visible. The physics simulation has a complete geometric representation of every bone in the character's body—but you cannot see it, because the physics engine draws nothing by default. The skinning shader computes a per-vertex output buffer that contains the final animated positions—but you cannot inspect it in a GPU debugger unless you know what format to look for. The bone influence weights that drive vertex deformation are stored as four float values per vertex—but you cannot tell from looking at the rendered mesh whether they are correct.
+
+We will implement tools to address each of these blind spots. Debug drawers render wireframe geometry that overlays the character with a visualization of the skeleton, collision shapes, and physics constraints. Skinning heatmaps render the mesh with vertex colors derived from bone influence weights, making incorrect or extreme weight painting immediately visible. RenderDoc capture and analysis lets you freeze a frame and examine the post-skinning output buffer directly, verifying vertex positions and normals at the GPU level.
+
+xref:Advanced_glTF/Tooling_Production_Pipeline/05_conclusion.adoc[Previous: Tooling Conclusion] | xref:Advanced_glTF/Debugging_Visual_Auditing/02_debug_drawers.adoc[Next: Debug Drawers]
diff --git a/en/Advanced_glTF/Debugging_Visual_Auditing/02_debug_drawers.adoc b/en/Advanced_glTF/Debugging_Visual_Auditing/02_debug_drawers.adoc
new file mode 100644
index 000000000..61b9c8f2d
--- /dev/null
+++ b/en/Advanced_glTF/Debugging_Visual_Auditing/02_debug_drawers.adoc
@@ -0,0 +1,330 @@
+:pp: {plus}{plus}
+= Debug Drawers: Visualizing Physics and Skeleton
+
+== The Principle of Debug Visualization
+
+Before writing any debug drawing code, it is worth understanding the design principle: a debug drawer is not a permanent part of the rendering pipeline. It is a mode-switched overlay that is compiled away in release builds and active in debug or development builds. It should be:
+
+- **Simple.** Debug geometry is rendered with a minimal pipeline—no lighting, no textures, no shadows. Solid or wireframe colored lines and shapes.
+- **Always on top.** Debug geometry is typically rendered with depth testing disabled (or with a large depth bias) so it is never hidden by opaque geometry. If a collision capsule is inside a mesh, you still want to see it.
+- **Zero-overhead when inactive.** In release builds, all debug draw calls should compile to nothing. Use a preprocessor flag or a compile-time constant to eliminate them entirely.
+
+The implementation pattern is a **deferred line buffer**: instead of issuing draw calls immediately for each debug shape, accumulate all the line segments for the frame into a CPU-side buffer, upload the buffer once at the end of the frame, and draw it in a single non-indexed draw call with a simple unlit shader. This is fast, simple, and GPU-friendly.
+
+== The Debug Line Buffer
+
+[source,cpp]
+----
+struct DebugLine {  // One segment stored as two interleaved (position, color) vertices, uploaded to the GPU as-is
+    glm::vec3 start, color;      // Vertex 0: position, then color (the pipeline's attribute offsets point here)
+    glm::vec3 end,   end_color;  // Vertex 1: same layout, so the per-vertex stride is sizeof(DebugLine) / 2
+    DebugLine(const glm::vec3& s, const glm::vec3& e, const glm::vec3& c) : start(s), color(c), end(e), end_color(c) {}
+};
+
+class DebugDrawer {
+public:
+    void draw_line(const glm::vec3& start, const glm::vec3& end, const glm::vec3& color)
+    {
+        lines_.push_back({ start, end, color });
+    }
+
+    void draw_sphere(const glm::vec3& center, float radius, const glm::vec3& color,
+                     int segments = 16)
+    {
+        // Draw three orthogonal circles to represent a sphere
+        draw_circle(center, radius, glm::vec3(1,0,0), glm::vec3(0,1,0), color, segments);
+        draw_circle(center, radius, glm::vec3(1,0,0), glm::vec3(0,0,1), color, segments);
+        draw_circle(center, radius, glm::vec3(0,1,0), glm::vec3(0,0,1), color, segments);
+    }
+
+    void draw_capsule(const glm::vec3& base, const glm::vec3& tip,
+                      float radius, const glm::vec3& color, int segments = 12)
+    {
+        // Wireframe capsule whose axis runs from `base` to `tip`; falls back to a sphere when they coincide.
+        const glm::vec3 axis   = tip - base;
+        const float     height = glm::length(axis);
+        if (height < 1e-6f) { draw_sphere(base, radius, color, segments); return; }
+        const glm::vec3 dir  = axis / height;           // unit axis from base to tip
+        const glm::vec3 perp = get_perpendicular(dir);  // together with `side`, spans the cross-section plane
+        const glm::vec3 side = glm::cross(dir, perp);
+
+        // Rim circles where the cylindrical body meets each cap
+        draw_circle(base, radius, perp, side, color, segments);
+        draw_circle(tip,  radius, perp, side, color, segments);
+        // Four body lines, 90 degrees apart, joining the two rims
+        for (int quarter = 0; quarter < 4; ++quarter) {
+            const float theta = glm::radians(90.0f * quarter);
+            const glm::vec3 rim = radius * (std::cos(theta) * perp + std::sin(theta) * side);
+            draw_line(base + rim, tip + rim, color);
+        }
+
+        // Hemispherical caps (latitude rings plus meridian segments), bulging away from the body
+        draw_hemisphere(base, radius, -dir, perp, color, segments);
+        draw_hemisphere(tip,  radius,  dir, perp, color, segments);
+    }
+
+    void draw_box(const glm::mat4& transform, const glm::vec3& half_extents,
+                  const glm::vec3& color)
+    {
+        // Generate 8 corners in local space and transform to world space
+        const glm::vec3 e = half_extents;
+        glm::vec3 corners[8] = {
+            { -e.x, -e.y, -e.z }, {  e.x, -e.y, -e.z },
+            {  e.x,  e.y, -e.z }, { -e.x,  e.y, -e.z },
+            { -e.x, -e.y,  e.z }, {  e.x, -e.y,  e.z },
+            {  e.x,  e.y,  e.z }, { -e.x,  e.y,  e.z }
+        };
+        for (auto& c : corners) c = glm::vec3(transform * glm::vec4(c, 1.0f));
+
+        // Draw 12 edges
+        static const int edges[12][2] = {
+            {0,1},{1,2},{2,3},{3,0}, // bottom face
+            {4,5},{5,6},{6,7},{7,4}, // top face
+            {0,4},{1,5},{2,6},{3,7}  // side edges
+        };
+        for (auto& [a, b] : edges) draw_line(corners[a], corners[b], color);
+    }
+
+    // Upload this frame's lines to the GPU, record the draw into `cmd`, and clear the queue.
+    // Call this once per frame after all debug draws are issued; it does nothing when no lines were queued.
+    void flush(VkCommandBuffer cmd, VkPipelineLayout layout)
+    {
+        if (lines_.empty()) return;
+        upload_to_gpu_and_draw(cmd, layout);
+        lines_.clear();
+    }
+
+private:
+    std::vector<DebugLine> lines_;
+
+    void draw_circle(const glm::vec3& center, float radius,
+                     const glm::vec3& x_axis, const glm::vec3& y_axis,
+                     const glm::vec3& color, int segments)
+    {
+        glm::vec3 prev = center + radius * x_axis;
+        for (int i = 1; i <= segments; ++i) {
+            float angle = glm::two_pi<float>() * i / segments;
+            glm::vec3 curr = center + radius * (std::cos(angle) * x_axis +
+                                                std::sin(angle) * y_axis);
+            draw_line(prev, curr, color);
+            prev = curr;
+        }
+    }
+
+    static glm::vec3 get_perpendicular(const glm::vec3& v)
+    {
+        // Find a vector not parallel to v, then cross to get a perpendicular
+        glm::vec3 candidate = (std::abs(v.x) < 0.9f) ? glm::vec3(1,0,0) : glm::vec3(0,1,0);
+        return glm::normalize(glm::cross(v, candidate));
+    }
+
+    void draw_hemisphere(const glm::vec3& center, float radius, const glm::vec3& axis,
+                         const glm::vec3& perp, const glm::vec3& color, int segments)
+    {
+        glm::vec3 cross = glm::cross(axis, perp);
+        for (int ring = 0; ring < segments / 4; ++ring) {
+            float phi0 = glm::half_pi<float>() * ring / (segments / 4);
+            float phi1 = glm::half_pi<float>() * (ring + 1) / (segments / 4);
+            float r0 = radius * std::cos(phi0), h0 = radius * std::sin(phi0);
+            float r1 = radius * std::cos(phi1), h1 = radius * std::sin(phi1);
+            draw_circle(center + h0 * axis, r0, perp, cross, color, segments);
+            for (int j = 0; j < segments; ++j) {
+                float angle = glm::two_pi<float>() * j / segments;
+                glm::vec3 p0 = center + h0*axis + r0*(std::cos(angle)*perp + std::sin(angle)*cross);
+                glm::vec3 p1 = center + h1*axis + r1*(std::cos(angle)*perp + std::sin(angle)*cross);
+                draw_line(p0, p1, color);
+            }
+        }
+    }
+
+    void upload_to_gpu_and_draw(VkCommandBuffer cmd, VkPipelineLayout layout)
+    {
+        // Copies lines_ into the debug vertex buffer and records one draw into `cmd` (`layout` is unused here).
+        // 1. Upload. A real engine would use a persistently mapped buffer or a staging pool;
+        //    this example assumes debug_buffer is host-visible and large enough to hold lines_.
+        const size_t buffer_size = lines_.size() * sizeof(DebugLine);
+        void* data = nullptr;
+        if (vmaMapMemory(allocator, debug_buffer_allocation, &data) != VK_SUCCESS) return; // never memcpy through a bad pointer
+        memcpy(data, lines_.data(), buffer_size);
+        vmaFlushAllocation(allocator, debug_buffer_allocation, 0, buffer_size); // required when the memory is not HOST_COHERENT
+        vmaUnmapMemory(allocator, debug_buffer_allocation);
+
+        // 2. Bind the Debug Pipeline (lines, no depth write, unlit)
+        vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, debug_pipeline);
+
+        // 3. Bind the buffer as a vertex buffer
+        VkDeviceSize offset = 0;
+        vkCmdBindVertexBuffers(cmd, 0, 1, &debug_buffer, &offset);
+
+        // 4. Draw!
+        // Each DebugLine contains 2 vertices, so we draw lines_.size() * 2 vertices.
+        vkCmdDraw(cmd, static_cast<uint32_t>(lines_.size() * 2), 1, 0, 0);
+    }
+};
+----
+
+=== Creating the Debug Renderer Pipeline
+
+Rendering colored lines requires a specialized graphics pipeline. Because we want debug geometry to be visible even through walls, we typically disable depth testing or use a very large depth bias. Here is the complete configuration for a debug line pipeline:
+
+[source,cpp]
+----
+void create_debug_pipeline(VkDevice device, VkRenderPass render_pass, VkPipelineLayout layout, VkPipeline& out_pipeline)
+{
+    // Fills in the fixed-function state for the unlit debug line pipeline (depth test and write disabled).
+    // 1. Shaders: the vertex shader takes vec3 pos + vec3 color and multiplies by ViewProj;
+    //    the fragment shader simply outputs the color.
+
+    // 2. Primitive Topology: Line List
+    VkPipelineInputAssemblyStateCreateInfo inputAssembly{};
+    inputAssembly.sType = VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO;
+    inputAssembly.topology = VK_PRIMITIVE_TOPOLOGY_LINE_LIST;
+
+    // 3. Vertex Input Description
+    VkVertexInputBindingDescription bindingDescription{};
+    bindingDescription.binding = 0;
+    bindingDescription.stride = sizeof(DebugLine) / 2; // Per-vertex stride: requires DebugLine to hold two interleaved (Pos+Color) vertices
+    bindingDescription.inputRate = VK_VERTEX_INPUT_RATE_VERTEX;
+
+    std::vector<VkVertexInputAttributeDescription> attributeDescriptions(2);
+    attributeDescriptions[0].binding = 0;
+    attributeDescriptions[1].binding = 0;
+    attributeDescriptions[0].location = 0; // Position
+    attributeDescriptions[1].location = 1; // Color
+    attributeDescriptions[0].format = VK_FORMAT_R32G32B32_SFLOAT;
+    attributeDescriptions[1].format = VK_FORMAT_R32G32B32_SFLOAT;
+    attributeDescriptions[0].offset = offsetof(DebugLine, start); // Offsets are relative to the first vertex stored in a DebugLine
+    attributeDescriptions[1].offset = offsetof(DebugLine, color);
+
+    VkPipelineVertexInputStateCreateInfo vertexInputInfo{};
+    vertexInputInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO;
+    vertexInputInfo.vertexBindingDescriptionCount = 1;
+    vertexInputInfo.pVertexBindingDescriptions = &bindingDescription;
+    vertexInputInfo.vertexAttributeDescriptionCount = static_cast<uint32_t>(attributeDescriptions.size());
+    vertexInputInfo.pVertexAttributeDescriptions = attributeDescriptions.data();
+
+    // 4. Rasterizer: line width. polygonMode only affects polygon primitives; FILL avoids requiring fillModeNonSolid
+    VkPipelineRasterizationStateCreateInfo rasterizer{};
+    rasterizer.sType = VK_STRUCTURE_TYPE_PIPELINE_RASTERIZATION_STATE_CREATE_INFO;
+    rasterizer.polygonMode = VK_POLYGON_MODE_FILL;
+    rasterizer.lineWidth = 1.0f;
+    rasterizer.cullMode = VK_CULL_MODE_NONE;
+
+    // 5. Disable Depth Testing (to see lines through geometry)
+    VkPipelineDepthStencilStateCreateInfo depthStencil{};
+    depthStencil.sType = VK_STRUCTURE_TYPE_PIPELINE_DEPTH_STENCIL_STATE_CREATE_INFO;
+    depthStencil.depthTestEnable = VK_FALSE;
+    depthStencil.depthWriteEnable = VK_FALSE;
+
+    // ... Create the pipeline ...
+}
+----
+
+== Drawing the Skeleton
+
+The skeleton is the most important debug overlay. It makes the scene graph transform hierarchy visible, showing exactly where each joint is in world space and which joint is which bone's parent:
+
+[source,cpp]
+----
+const glm::vec3 BONE_COLOR = { 1.0f, 1.0f, 0.0f };  // Yellow for bones
+const glm::vec3 JOINT_COLOR = { 1.0f, 0.5f, 0.0f }; // Orange for joint spheres
+
+void draw_skeleton(
+    DebugDrawer&              drawer,
+    const std::vector<Node>&  nodes,
+    const Skin&               skin)
+{
+    for (uint32_t joint_idx = 0; joint_idx < skin.joints.size(); ++joint_idx) {
+        uint32_t node_idx = skin.joints[joint_idx];
+        const Node& node  = nodes[node_idx];
+
+        glm::vec3 joint_world = glm::vec3(node.world_matrix[3]);
+
+        // Draw a small sphere at the joint location
+        drawer.draw_sphere(joint_world, 0.015f, JOINT_COLOR);
+
+        // Draw a line from this joint to its parent
+        if (node.parent_index != INVALID_NODE_INDEX) {
+            glm::vec3 parent_world = glm::vec3(nodes[node.parent_index].world_matrix[3]);
+            drawer.draw_line(joint_world, parent_world, BONE_COLOR);
+        }
+    }
+}
+----
+
+== Drawing Collision Shapes and Constraints
+
+Drawing collision shapes requires querying the physics engine for the current transform of each body, then using the stored `ColliderDef` to determine the shape parameters:
+
+[source,cpp]
+----
+const glm::vec3 KINEMATIC_COLOR = { 0.0f, 1.0f, 0.0f };  // Green: animation-driven
+const glm::vec3 DYNAMIC_COLOR   = { 1.0f, 0.0f, 0.0f };  // Red: physics-driven
+
+void draw_bone_bodies(
+    DebugDrawer&                    drawer,
+    const std::vector<BoneBody>&    bodies,
+    const PhysicsWorld&             physics_world,
+    RagdollState                    ragdoll_state)
+{
+    for (const auto& bone_body : bodies) {
+        PhysicsPose pose = physics_world.get_body_pose(bone_body.physics_body);
+
+        glm::vec3 color = (ragdoll_state == RagdollState::RAGDOLL)
+                        ? DYNAMIC_COLOR
+                        : KINEMATIC_COLOR;
+
+        glm::mat4 body_transform = pose_to_matrix(pose);
+
+        const ColliderDef& def = bone_body.collider_def;
+        if (def.shape == ColliderDef::Shape::CAPSULE) {
+            // Compute capsule base and tip in world space
+            // (assumes capsule local axis is Y)
+            glm::vec3 local_base = glm::vec3(0, -def.half_height, 0);
+            glm::vec3 local_tip  = glm::vec3(0,  def.half_height, 0);
+            glm::vec3 world_base = glm::vec3(body_transform * glm::vec4(local_base, 1));
+            glm::vec3 world_tip  = glm::vec3(body_transform * glm::vec4(local_tip,  1));
+            drawer.draw_capsule(world_base, world_tip, def.radius, color);
+
+        } else if (def.shape == ColliderDef::Shape::BOX) {
+            drawer.draw_box(body_transform, def.box_half_extents, color);
+        }
+    }
+}
+----
+
+For physics constraints, most physics engines provide a query API to retrieve the constraint's anchor points and axis direction. Drawing a constraint as a line between its two anchor points plus a small indicator for the constraint axis gives you enough information to visually verify that the constraint setup is correct:
+
+[source,cpp]
+----
+const glm::vec3 CONSTRAINT_COLOR = { 0.0f, 0.5f, 1.0f }; // Blue for constraints
+const glm::vec3 LIMIT_COLOR      = { 1.0f, 0.0f, 1.0f }; // Magenta for limits
+
+void draw_constraints(
+    DebugDrawer&              drawer,
+    const PhysicsWorld&       physics_world,
+    const std::vector<void*>& constraint_handles)
+{
+    for (void* constraint : constraint_handles) {
+        ConstraintDebugInfo info = physics_world.get_constraint_debug_info(constraint);
+
+        // Draw a line connecting the two constraint anchor points
+        drawer.draw_line(info.anchor_a, info.anchor_b, CONSTRAINT_COLOR);
+
+        // For hinge constraints, draw the hinge axis as a small arrow
+        if (info.type == ConstraintType::HINGE) {
+            glm::vec3 axis_end = info.anchor_a + info.hinge_axis_world * 0.05f;
+            drawer.draw_line(info.anchor_a, axis_end, LIMIT_COLOR);
+        }
+
+        // For ball-socket constraints, draw a small sphere at the pivot
+        if (info.type == ConstraintType::BALL_SOCKET) {
+            drawer.draw_sphere(info.anchor_a, 0.01f, LIMIT_COLOR);
+        }
+    }
+}
+----
+
+When the ragdoll handoff from Chapter 4 goes wrong and bodies explode apart, these overlays will immediately show you whether the problem is the constraints failing to hold (the capsules are no longer touching, and the connecting lines are very long) or the bodies being in the wrong initial position (the capsules are offset from the rendered mesh).
+
+xref:Advanced_glTF/Debugging_Visual_Auditing/01_introduction.adoc[Previous: Introduction] | xref:Advanced_glTF/Debugging_Visual_Auditing/03_skinning_heatmaps.adoc[Next: Skinning Heatmaps]
diff --git a/en/Advanced_glTF/Debugging_Visual_Auditing/03_skinning_heatmaps.adoc b/en/Advanced_glTF/Debugging_Visual_Auditing/03_skinning_heatmaps.adoc
new file mode 100644
index 000000000..e146ccabc
--- /dev/null
+++ b/en/Advanced_glTF/Debugging_Visual_Auditing/03_skinning_heatmaps.adoc
@@ -0,0 +1,177 @@
+:pp: {plus}{plus}
+= Skinning Heatmaps: Visualizing Weight Painting
+
+== What Heatmaps Reveal
+
+The skeletal skinning process is driven entirely by the per-vertex joint indices and weight values stored in the `JOINTS_0` and `WEIGHTS_0` vertex attributes. These are invisible during normal rendering—the deformation they produce is visible, but the weights themselves are not. When something is wrong with the skinning—pinched geometry at a shoulder, a vertex that stays rigidly attached to the wrong bone, a region that stretches impossibly during animation—the cause is almost always incorrect joint weights. But without a way to visualize the weights, diagnosing the problem is extremely difficult.
+
+A skinning heatmap is a rendering mode where each pixel's color is determined not by the material's PBR properties, but by the skinning weight data for the corresponding vertex. The most useful heatmap is the **dominant bone heatmap**: each vertex is colored based on which joint has the largest influence on it. This immediately shows you the "territory" of each bone—which region of the mesh each bone primarily controls—and makes joint boundary errors obvious. A vertex that is colored with Bone A's color when it should be colored with Bone B's indicates a weight painting error that would cause that vertex to follow the wrong bone.
+
+A second useful heatmap is the **weight distribution heatmap**: each vertex is colored on a heat scale (blue → green → red) based on how many joints have a significant influence on it. A vertex influenced entirely by one bone shows as pure blue (or some other "cold" color indicating simple, predictable behavior). A vertex with four significant influences shows as red (complex, potentially artifact-prone). This heatmap helps you identify regions where the weight painting has become noisy or overly complex without artistic intention.
+
+== Implementing the Heatmap Render Mode
+
+The heatmap is rendered by replacing the standard PBR fragment shader with a diagnostic shader that reads the joint indices and weights from the vertex data and converts them to a color. Because the skinning data is part of the vertex format, no additional buffers are needed—we just need to pass the relevant attributes through the vertex pipeline to the fragment stage and compute the color there:
+
+[source,slang]
+----
+// Vertex-to-fragment interface: clip-space position plus the raw skinning attributes
+struct VertexOut {
+    float4     position  : SV_Position;
+    float4     weights   : TEXCOORD0;  // The four bone weights (interpolated across the triangle)
+    nointerpolation uint4 joints : TEXCOORD1;  // The four bone indices (integer varyings must not be interpolated)
+};
+
+// Vertex input: the element type of the `vertices` structured buffer declared below
+struct InputVertex {
+    float3 position;
+    float3 normal;
+    float4 tangent;
+    float2 texcoord;
+    uint4  joint_indices;
+    float4 joint_weights;
+};
+
+[[vk::binding(0, 0)]] StructuredBuffer<InputVertex>    vertices;
+[[vk::binding(1, 0)]] StructuredBuffer<float4x4>     joint_matrices;
+[[vk::binding(0, 2)]] StructuredBuffer<float4>       joint_colors; // One color per joint; in its own descriptor set (set 2), bound only in heatmap mode
+
+[shader("vertex")]
+VertexOut vertex_main(uint vertex_id : SV_VertexID)
+{
+    InputVertex v = vertices[vertex_id];  // fetched by index from the structured buffer
+
+    // Apply skinning to get world position (same as Chapter 3)
+    float4x4 skin_matrix =
+        v.joint_weights.x * joint_matrices[v.joint_indices.x] +
+        v.joint_weights.y * joint_matrices[v.joint_indices.y] +
+        v.joint_weights.z * joint_matrices[v.joint_indices.z] +
+        v.joint_weights.w * joint_matrices[v.joint_indices.w];
+
+    VertexOut result;  // deliberately not named `out`, which is a parameter-modifier keyword
+    result.position = mul(camera.view_proj, mul(skin_matrix, float4(v.position, 1.0)));
+    result.weights  = v.joint_weights;
+    result.joints   = v.joint_indices;
+    return result;
+}
+
+// Fragment shader: colorize by dominant joint
+// Each joint gets a unique color from a lookup table.
+[shader("fragment")]
+float4 fragment_dominant_bone(VertexOut input) : SV_Target
+{
+    // Find the joint with the highest weight
+    uint   dominant_joint = 0;
+    float  max_weight     = input.weights.x;
+
+    if (input.weights.y > max_weight) { max_weight = input.weights.y; dominant_joint = 1; }
+    if (input.weights.z > max_weight) { max_weight = input.weights.z; dominant_joint = 2; }
+    if (input.weights.w > max_weight) { max_weight = input.weights.w; dominant_joint = 3; }
+
+    uint actual_joint_idx = input.joints[dominant_joint];
+    return joint_colors[actual_joint_idx];
+}
+----
+
+The `joint_colors` buffer contains one RGBA color per joint in the skeleton. These colors should be visually distinct so that adjacent bone territories are immediately differentiable. A simple approach is to distribute colors uniformly around the HSV color wheel:
+
+[source,cpp]
+----
+std::vector<glm::vec4> generate_joint_colors(uint32_t joint_count)
+{
+    std::vector<glm::vec4> colors(joint_count);
+    for (uint32_t i = 0; i < joint_count; ++i) {
+        float hue = static_cast<float>(i) / joint_count; // 0..1
+        // Convert HSV (hue, 1, 1) to RGB
+        float h = hue * 6.0f;
+        float x = 1.0f - std::abs(std::fmod(h, 2.0f) - 1.0f);
+        glm::vec3 rgb;
+        if      (h < 1) rgb = {1, x, 0};
+        else if (h < 2) rgb = {x, 1, 0};
+        else if (h < 3) rgb = {0, 1, x};
+        else if (h < 4) rgb = {0, x, 1};
+        else if (h < 5) rgb = {x, 0, 1};
+        else            rgb = {1, 0, x};
+        colors[i] = glm::vec4(rgb, 1.0f);
+    }
+    return colors;
+}
+----
+
+== Reading the Heatmap
+
+Looking at the dominant bone heatmap of a well-painted character, you should see clearly separated color regions with smooth gradients at the boundaries. The thigh should be entirely one color. The shin should be entirely another. The transition between them at the knee should be a narrow band where the skin blends between the two bones.
+
+**Problems to look for:**
+
+If a small patch of one color appears in the middle of another color's territory—say, a yellow vertex in the middle of a red region—this is a stray vertex group assignment: a vertex that was accidentally assigned to the wrong bone. During animation, this vertex will follow the wrong bone and produce a visible spike or dent in the surface. In Blender, you can locate this vertex using Weight Paint mode on the affected bone.
+
+If the color transition between two bones is very abrupt—the colors change immediately with no blending zone—this indicates that the weight painting has zero smooth falloff between the bones. The mesh will crease sharply at the joint rather than deforming smoothly. Add a gradient of weight values in the transition zone.
+
+If the entire mesh is a single color, the character is either very simple (only one bone influences all vertices, which is rare) or the weight data was not exported correctly (all vertices defaulted to the first bone).
+
+== The Weight Distribution Heatmap
+
+For the weight distribution heatmap, the fragment shader computes a "complexity score" for each vertex—how many joints have significant influence—and maps it to a heat color scale:
+
+[source,slang]
+----
+[shader("fragment")]
+float4 fragment_weight_distribution(VertexOut input) : SV_Target
+{
+    // Count how many joints have non-trivial influence on this vertex.
+    // "Non-trivial" means weight > 0.05 (more than 5%).
+    float complexity = 0.0f;
+    float4 w = input.weights;
+    if (w.x > 0.05f) complexity += 1.0f;
+    if (w.y > 0.05f) complexity += 1.0f;
+    if (w.z > 0.05f) complexity += 1.0f;
+    if (w.w > 0.05f) complexity += 1.0f;
+
+    // Normalize to 0..1 (0 = at most one bone, 1 = four bones); saturate() keeps complexity == 0 from going negative
+    float t = saturate((complexity - 1.0f) / 3.0f);
+
+    // Map to a blue (simple) -> green (moderate) -> red (complex) heat scale
+    float3 cool_color = float3(0, 0, 1);
+    float3 warm_color = float3(1, 0, 0);
+    float3 mid_color  = float3(0, 1, 0);
+
+    float3 color;
+    if (t < 0.5f) color = lerp(cool_color, mid_color, t * 2.0f);
+    else          color = lerp(mid_color, warm_color, (t - 0.5f) * 2.0f);
+
+    return float4(color, 1.0f);
+}
+----
+
+A well-painted character will show blue and green regions everywhere except at the major joints (shoulder, hip, knee, elbow), where you expect and want multiple bone influences. If large regions of the mesh are red, the weight painting is unnecessarily complex in those areas and may benefit from cleanup.
+
+== Switching to Heatmap Mode at Runtime
+
+To switch between the normal PBR pipeline and the heatmap pipeline at runtime, you have two options:
+
+1. **Multiple Pipelines**: Create two separate `VkPipeline` objects using the same `VkPipelineLayout`. One pipeline uses the PBR fragment shader, and the other uses the heatmap fragment shader. At render time, simply bind the desired pipeline.
+2. **Specialization Constants**: Use a Vulkan specialization constant to toggle the heatmap logic within a single fragment shader.
+
+If you choose the multiple pipeline approach (recommended for clarity), you must ensure the `joint_colors` buffer is bound when in heatmap mode:
+
+[source,cpp]
+----
+// Bind the appropriate pipeline
+if (render_mode == RenderMode::HEATMAP) {
+    vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, heatmap_pipeline);
+
+    // Bind the joint colors buffer (assumes it's in a separate descriptor set)
+    vkCmdBindDescriptorSets(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline_layout,
+                           2, 1, &joint_colors_descriptor_set, 0, nullptr);
+} else {
+    vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_GRAPHICS, pbr_pipeline);
+}
+
+// Issue the draw call as usual
+vkCmdDrawIndexed(cmd, index_count, 1, 0, 0, 0);
+----
+
+This allows you to audit your assets live in the engine, which is far more effective than relying on static analysis in a 3D tool.
+
+xref:Advanced_glTF/Debugging_Visual_Auditing/02_debug_drawers.adoc[Previous: Debug Drawers] | xref:Advanced_glTF/Debugging_Visual_Auditing/04_renderdoc_analysis.adoc[Next: RenderDoc Analysis]
diff --git a/en/Advanced_glTF/Debugging_Visual_Auditing/04_renderdoc_analysis.adoc b/en/Advanced_glTF/Debugging_Visual_Auditing/04_renderdoc_analysis.adoc
new file mode 100644
index 000000000..c67159bdf
--- /dev/null
+++ b/en/Advanced_glTF/Debugging_Visual_Auditing/04_renderdoc_analysis.adoc
@@ -0,0 +1,86 @@
+:pp: {plus}{plus}
+= RenderDoc Analysis: Inspecting the Compute Skinning Output
+
+== What RenderDoc Gives You
+
+RenderDoc is a frame capture and GPU debugger that lets you freeze a single rendered frame, inspect every GPU resource and command at the moment of capture, and re-execute individual draw calls and compute dispatches in isolation. For our character pipeline, its most valuable feature is the ability to examine the contents of any GPU buffer at any point in the frame—including the output buffer of our compute skinning dispatch, before the rasterizer reads from it.
+
+This is the ground-truth check at the GPU level. The reference viewer (Chapter 7) established that the asset itself is correct. The debug drawers show the physics simulation state. RenderDoc answers the narrowest possible question: after the compute skinning shader runs, are the vertex positions and normals in the output buffer geometrically correct? If they are, the rasterizer, materials, and scene graph are fine. If they are not, the error is in the compute skinning pipeline—the shader, the joint matrices, or the input vertex format.
+
+== Capturing a Frame
+
+RenderDoc captures are straightforward: launch your application through RenderDoc, press F12 (or your configured capture key) at the moment you want to capture, and the frame is frozen. You can then open it in RenderDoc's UI.
+
+One important setup step for Vulkan applications: ensure that your application is built without the `-DNDEBUG` flag (or equivalent), that the `VK_EXT_debug_utils` instance extension is enabled, and that debug object names are set using `vkSetDebugUtilsObjectNameEXT` (an extension function, so its pointer must be loaded with `vkGetInstanceProcAddr` or a loader library). This makes RenderDoc's resource browser comprehensible—instead of seeing `VkBuffer (0x12345678)`, you see `character_skin_output_buffer`. For the compute skinning output buffer specifically:
+
+[source,cpp]
+----
+void set_debug_name(VkDevice device, VkBuffer buffer, const char* name)
+{
+#ifdef VULKAN_DEBUG
+    VkDebugUtilsObjectNameInfoEXT info{};
+    info.sType        = VK_STRUCTURE_TYPE_DEBUG_UTILS_OBJECT_NAME_INFO_EXT;
+    info.objectType   = VK_OBJECT_TYPE_BUFFER;
+    info.objectHandle = reinterpret_cast<uint64_t>(buffer);
+    info.pObjectName  = name;
+    vkSetDebugUtilsObjectNameEXT(device, &info);
+#endif
+}
+
+// When creating the skinning output buffer:
+set_debug_name(device, skinning_output.buffer, "skinning_output_buffer");
+----
+
+Similarly, label your command buffer sections using `vkCmdBeginDebugUtilsLabelEXT` and `vkCmdEndDebugUtilsLabelEXT`:
+
+[source,cpp]
+----
+VkDebugUtilsLabelEXT label{};
+label.sType      = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT;
+label.pLabelName = "Compute Skinning Pass";
+label.color[0]   = 0.2f; label.color[1] = 0.8f;
+label.color[2]   = 0.2f; label.color[3] = 1.0f;
+vkCmdBeginDebugUtilsLabelEXT(cmd, &label);
+
+// ... dispatch compute skinning ...
+
+vkCmdEndDebugUtilsLabelEXT(cmd);
+----
+
+These labels appear in RenderDoc's event list as named sections, making it trivial to navigate to the skinning dispatch even in a frame with thousands of GPU commands.
+
+== Inspecting the Output Buffer
+
+Once you have a capture, navigate to the compute skinning dispatch in the event list. Select it and open the **Resource Inspector** or **Buffer Viewer** panel. Find the output buffer (named `skinning_output_buffer` if you set the debug name). RenderDoc will let you view the raw contents of this buffer as a structured table.
+
+You need to tell RenderDoc the layout of your vertex struct. In the Buffer Viewer, you can configure the byte offset, stride, and format for each field. For a typical vertex struct:
+
+[source,slang]
+----
+Position:    float3, offset 0,  stride sizeof(Vertex)
+Normal:      float3, offset 12, stride sizeof(Vertex)
+Tangent:     float4, offset 24, stride sizeof(Vertex)
+TexCoord:    float2, offset 40, stride sizeof(Vertex)
+JointIdx:    uint4,  offset 48, stride sizeof(Vertex)
+JointWeight: float4, offset 64, stride sizeof(Vertex)
+----
+
+With this layout configured, RenderDoc will show you a row per vertex with the position, normal, and other fields as human-readable values. Now you can:
+
+**Verify the rest pose.** Pause your application at a frame where the character is in the bind pose (no animation applied, so every joint sits at its bind transform). The output vertex positions should match the original glTF vertex positions exactly—there should be no deformation in the rest pose. If positions differ from the glTF data while the skeleton is in its bind pose, the inverse bind matrices or the joint hierarchy traversal is wrong.
+
+**Verify a known pose.** Move the animation to a frame where you know exactly what the character should look like—say, a T-pose with one arm raised to 90 degrees. Examine a vertex on the raised arm. Its output position should be the original position rotated by 90 degrees around the appropriate axis. If it has moved in the wrong direction, the coordinate space of the joint rotation is wrong.
+
+**Check normals.** Select a vertex on a flat surface. Its normal should be a unit vector pointing away from the surface. If it has significant X or Z components when it should point in Y, or if it has magnitude significantly different from 1.0, the normal transform in the compute shader is incorrect.
+
+**Verify winding order.** In RenderDoc, you can re-draw the mesh as a wireframe overlay. If you see back-facing triangles where the geometry should be front-facing, the winding order in the output buffer has been reversed. This can happen if the skinning shader flips the coordinate system by applying a matrix with a negative determinant (e.g., from a scale of -1 on one axis).
+
+== Using the Shader Debugger
+
+RenderDoc's shader debugger allows you to select any pixel in the captured frame and step through the shader code that produced it. For the compute skinning dispatch, you can select any invocation (by thread group and thread index) and step through the shader execution to see exactly how the output values were computed.
+
+This is most useful when you have a specific suspicious vertex—one whose position in the buffer viewer looks wrong. Find the vertex index, compute the dispatch thread ID (assuming one vertex per thread and a workgroup size of 64: vertex_index / 64 for the group, vertex_index % 64 for the thread), and launch the debugger for that invocation. You can then step through the joint matrix lookup, the linear blend computation, and the position transform, watching each intermediate value. At the point where a value diverges from what you expect, you have found the bug.
+
+The shader debugger is slower to launch and use than simply examining buffer contents, but for subtle numerical errors—wrong sign in a matrix element, wrong axis in a quaternion-to-matrix conversion—it provides the most direct diagnostic path.
+
+xref:Advanced_glTF/Debugging_Visual_Auditing/03_skinning_heatmaps.adoc[Previous: Skinning Heatmaps] | xref:Advanced_glTF/Debugging_Visual_Auditing/05_conclusion.adoc[Next: Conclusion]
diff --git a/en/Advanced_glTF/Debugging_Visual_Auditing/05_conclusion.adoc b/en/Advanced_glTF/Debugging_Visual_Auditing/05_conclusion.adoc
new file mode 100644
index 000000000..c706c1fd8
--- /dev/null
+++ b/en/Advanced_glTF/Debugging_Visual_Auditing/05_conclusion.adoc
@@ -0,0 +1,36 @@
+:pp: {plus}{plus}
+= Debugging: Summary & Series Conclusion
+
+== What We Built in This Chapter
+
+Debugging is not a single tool or technique—it is a methodology: a sequence of questions to ask, in order, that systematically narrows the space of possible failure causes.
+
+We started with debug drawers, which make the invisible physics world visible. The deferred line buffer approach—accumulate segments per frame, upload once, draw in a single call—keeps debug rendering fast enough to leave enabled during development without significantly affecting frame rate. The skeleton drawer shows exactly where the scene graph places each joint in world space, making transform hierarchy errors immediately visible. The collision shape drawer (color-coded green for kinematic, red for dynamic) shows the physics representation alongside the rendered mesh, so you can see instantly whether the physics bodies are tracking the animation correctly. The constraint drawer shows the connection structure of the ragdoll, so when a ragdoll explodes you can see whether the constraints failed (bodies far apart) or were incorrectly set up (wrong anchor points).
+
+Skinning heatmaps moved the diagnostic lens to the vertex skinning data itself. The dominant bone heatmap colorizes each vertex by its most influential joint, revealing stray weight assignments and incorrect bone territories as anomalous colored patches. The weight distribution heatmap colorizes by the complexity of the weight distribution, flagging regions with noisy or overcomplicated weight painting.
+
+RenderDoc analysis provided the GPU-level ground truth: the ability to examine the exact contents of the skinning compute output buffer at the frame level, verify vertex positions against expected values, check normal vector correctness, and step through the shader invocation for a specific suspicious vertex. The discipline of labeling GPU commands and buffers with `vkSetDebugUtilsObjectNameEXT` and `vkCmdBeginDebugUtilsLabelEXT` makes RenderDoc's resource browser navigable rather than overwhelming.
+
+== The Series in Review
+
+Across these eight chapters, we have built a complete production-grade character pipeline on top of Vulkan. Let's trace the full path of data from file to frame:
+
+The **glTF file** carries the skeleton hierarchy, vertex skinning data, animations, physics extras metadata, and morph targets. **Chapter 2** showed how to load that hierarchy into a recursive scene graph with a dirty-flag propagation system that efficiently updates world matrices. **Chapter 3** used the skinning data to build a Vulkan Compute pipeline that deforms the mesh on the GPU, writing the result to a shared buffer consumed by the rasterizer, ray tracing system, and physics queries.
+
+**Chapter 4** established the physics simulation layer: proxy colliders derived from the bone hierarchy, constraints encoding the human range of motion, the three-state ANIMATED/BLENDED/RAGDOLL handoff, and collision filtering via bitmasks. **Chapter 5** added procedural animation on top: CCD and FABRIK IK for foot placement and reaching, look-at controllers for head tracking, and physics-derived lean applied as spine rotations.
+
+**Chapter 6** extended the compute skinning pipeline to support morph targets, using Vulkan 1.4's descriptor indexing to handle an unbounded array of displacement buffers without per-draw descriptor swaps. **Chapter 7** stepped back to establish the production workflow: naming conventions, export settings, automated validation, and reference viewer auditing. **Chapter 8** closed the loop with engine-side debugging tools.
+
+The systems you have built are not demonstrations—they are a production framework. Each one is designed to compose with the others: the IK operates on the same scene graph that the physics system drives; the morph targets integrate with the same compute shader that the ragdoll reads from; the debug drawers use the same physics world data that the ragdoll system writes. This composability is the payoff for the careful architectural decisions made at each stage.
+
+The next characters you build on this foundation will benefit from every lesson learned here. The framework will grow. The asset pipeline will evolve. But the fundamental architecture—scene graph, compute skinning, physics integration, procedural layers—will remain the skeleton on which everything else hangs.
+
+== Verification: What to Look For
+
+To verify your debugging tools:
+
+1.  **Flush Behavior**: Ensure that the `DebugDrawer` correctly clears its line buffer after every `flush()` call. If lines persist from previous frames, the buffer is not being cleared.
+2.  **Coordinate Space**: Verify that your debug lines appear in world space. A common error is passing local-space coordinates to the drawer, which will result in debug geometry following the character incorrectly.
+3.  **RenderDoc Inspection**: Use the **Buffer Viewer** in RenderDoc to inspect your compute skinning output buffer. If the values in the buffer match your expectations but the character is not rendering correctly, the problem is likely in your vertex input description or graphics pipeline state.
+
+xref:Advanced_glTF/Debugging_Visual_Auditing/04_renderdoc_analysis.adoc[Previous: RenderDoc Analysis]
diff --git a/en/Advanced_glTF/Morph_Targets_Facial_Animation/01_introduction.adoc b/en/Advanced_glTF/Morph_Targets_Facial_Animation/01_introduction.adoc
new file mode 100644
index 000000000..e34cbdf40
--- /dev/null
+++ b/en/Advanced_glTF/Morph_Targets_Facial_Animation/01_introduction.adoc
@@ -0,0 +1,40 @@
+:pp: {plus}{plus}
+= Morph Targets & Facial Animation
+
+== A Different Kind of Deformation
+
+Every technique we have covered so far—scene graphs, compute skinning, ragdolls, IK—has been built on the same foundation: a skeleton of joints, each with a transform, driving the deformation of a mesh. Skeletal animation is powerful and efficient for bodies, limbs, and any mesh region whose deformation is roughly rotational in nature. But it has an Achilles heel, and that heel is the face.
+
+A human face deforms in ways that are fundamentally different from a limb. The skin around the mouth doesn't rotate like a knee bends—it slides, bulges, compresses, and stretches in complex patterns driven by dozens of independent muscles pulling in partially conflicting directions. Capturing all of this with joints requires an impractical number of them (professional facial rigs in film production can have hundreds of joints just for the face), and even then the interpolation between poses can produce artifacts because spherical linear interpolation of joint rotations is not the same as the linear blending of skin displacement.
+
+The industry's solution for facial animation is **morph targets**, which are also called **blend shapes** or **shape keys** depending on which software package you are using. The idea is straightforward: instead of animating joint rotations, you store multiple complete sets of vertex positions representing different facial expressions. A "smile" morph target stores the positions of every face vertex in the smiled configuration. A "brow raise" morph target stores them in the raised-brow configuration. At runtime, you blend linearly between the base mesh and one or more morph targets by weighting each target's contribution. If the smile weight is 0.7, every vertex moves 70% of the way from its rest position to its smile position. If both a smile (weight 0.5) and a brow raise (weight 0.8) are active simultaneously, the displacements are added and the vertex moves to the combined position.
+
+This is the **Linear Blend Shapes** model, and glTF animates it through channels whose target path is `weights`.
+
+== What glTF Stores for Morph Targets
+
+The glTF specification stores morph targets as a list of accessors, one per target, each containing per-vertex displacement vectors. Specifically, for a mesh primitive with `N` vertices and `K` morph targets, the glTF file contains:
+
+- The base vertex buffer: N entries with positions, normals, tangents, and texture coordinates.
+- K additional "sparse" or dense accessor arrays, each containing N position displacement vectors (the difference between the morph target position and the base position for each vertex).
+- Optionally, K additional normal displacement arrays and K additional tangent displacement arrays, for morph targets that change the surface curvature (like puffing cheeks).
+
+The animation system drives a set of `K` weights, one per morph target, in the range [0, 1]. The final vertex position is:
+
+`final_position = base_position + sum(weight[i] * displacement[i])` for all K targets.
+
+This is linear blending, and it has the same virtue that skeletal linear blend skinning has: it is trivially GPU-parallelizable. Every vertex is independent, and every displacement is just a vector addition and scalar multiply.
+
+== The Memory Challenge
+
+Morph targets have one serious problem: memory. If you have a face mesh with 50,000 vertices and 50 morph targets (a reasonable number for a photorealistic character with full FACS—Facial Action Coding System—coverage), you have 50 × 50,000 = 2,500,000 displacement vectors. At 12 bytes per 3D float vector, that is 30 megabytes of displacement data for a single character's face. If you also store normal displacements, that doubles to 60 MB. For a game with multiple speaking characters simultaneously, this adds up rapidly.
+
+The approach we will use is **bindless** morph target buffers combined with **sparse activation**: rather than loading all morph target data for all characters into GPU-resident buffers unconditionally, we only promote the data for *currently active* morph targets and use Vulkan 1.4's descriptor indexing to access any of them from a single shader. This is the same design philosophy as bindless texture atlases for materials—instead of binding a descriptor per resource, we bind a large array of descriptors once and index into it with a push constant or uniform value.
+
+== What This Chapter Covers
+
+We will implement the full morph target pipeline in three stages. First, we parse and upload the morph target displacement data from glTF—this requires special handling of glTF's sparse accessor format, which stores only the vertices that actually change (a significant optimization for morph targets like "blink" where only eyelid vertices differ from the base). Second, we build the bindless Vulkan descriptor infrastructure—a large buffer array that the skinning compute shader can index at will. Third, we extend the compute skinning shader from Chapter 3 to apply morph displacements before the skeletal skinning step, producing a final vertex buffer that reflects both the facial expression and the body pose simultaneously.
+
+We will also briefly discuss the relationship between morph targets and skeletal rigs in production. Professional characters typically use both: a coarse joint rig for the head and jaw (to handle large-scale deformation like opening the mouth), and a dense morph target layer for the fine-grained surface detail (lip sync, nostril flare, brow wrinkles). Understanding how these two layers cooperate will help you make the right tradeoffs for your character pipeline.
+
+xref:Advanced_glTF/Procedural_Animation_IK/07_conclusion.adoc[Previous: Procedural Animation Conclusion] | xref:Advanced_glTF/Morph_Targets_Facial_Animation/02_shape_key_ingestion.adoc[Next: Shape Key Ingestion]
diff --git a/en/Advanced_glTF/Morph_Targets_Facial_Animation/02_shape_key_ingestion.adoc b/en/Advanced_glTF/Morph_Targets_Facial_Animation/02_shape_key_ingestion.adoc
new file mode 100644
index 000000000..be544d57b
--- /dev/null
+++ b/en/Advanced_glTF/Morph_Targets_Facial_Animation/02_shape_key_ingestion.adoc
@@ -0,0 +1,226 @@
+:pp: {plus}{plus}
+= Shape Key Ingestion: Parsing Morph Targets from glTF
+
+== The Sparse Accessor Problem
+
+When you export a character from Blender with 60 morph targets and open the resulting glTF file, you might notice that the displacement arrays are far smaller than you expected. A face mesh with 50,000 vertices and a "smile" morph target might produce a displacement buffer with only 800 entries, not 50,000. This is because glTF supports **sparse accessors**: a way of storing only the entries in an array that differ from a default value (in this case, the zero vector, since vertices that don't move in a given morph target have zero displacement).
+
+Understanding sparse accessors is essential because tinygltf does not automatically expand them for you—you receive the raw sparse data and are responsible for building the full dense array yourself. The sparse accessor structure in glTF has three components: a count (the number of entries that deviate from the default—not the size of the conceptual array, which is the enclosing accessor's own `count`), an indices sub-accessor (a list of element indices that have non-default values), and a values sub-accessor (the corresponding non-default values, in the same order as the indices). To reconstruct the full array, you allocate a zero-initialized buffer sized by the accessor's `count`, then scatter the values into the positions indicated by the indices.
+
+Let's look at this in practice. In tinygltf, after loading a model, each mesh primitive's `targets` array contains one entry per morph target. Each entry is a map from attribute name (e.g., `"POSITION"`, `"NORMAL"`) to accessor index. We need to walk this structure carefully:
+
+[source,cpp]
+----
+struct MorphTargetData {
+    std::string              name;            // Exported target name, or "target_<index>" when none exists
+    std::vector<glm::vec3>   position_deltas; // Size = vertex count; zero where unchanged
+    std::vector<glm::vec3>   normal_deltas;   // Dense like position_deltas; empty if no normal targets
+    VkBuffer                 gpu_buffer     = VK_NULL_HANDLE; // Device-local copy of position_deltas once uploaded
+    VmaAllocation            gpu_allocation = VK_NULL_HANDLE; // Allocation backing gpu_buffer
+};
+
+// Expand a potentially-sparse accessor into a dense vector of vec3.
+// Always returns exactly 'vertex_count' elements: entries the accessor does not
+// cover stay zero, and nothing is ever written past the end of the result.
+static std::vector<glm::vec3> expand_sparse_accessor(
+    const tinygltf::Model&    model,
+    int                       accessor_index,
+    uint32_t                  vertex_count)
+{
+    std::vector<glm::vec3> result(vertex_count, glm::vec3(0.0f));
+    const bool known_accessor = accessor_index >= 0 &&
+        static_cast<size_t>(accessor_index) < model.accessors.size();
+    if (!known_accessor) return result;
+
+    const tinygltf::Accessor& accessor = model.accessors[accessor_index];
+
+    // Base values. A sparse accessor may have no buffer view (bufferView < 0), which
+    // means an all-zero base; when a view exists, sparse entries overwrite it below.
+    if (accessor.bufferView >= 0) {
+        const tinygltf::BufferView& view = model.bufferViews[accessor.bufferView];
+        const uint8_t* data = model.buffers[view.buffer].data.data()
+                            + view.byteOffset
+                            + accessor.byteOffset;
+        const int    byte_stride = accessor.ByteStride(view);
+        const size_t stride = byte_stride > 0 ? static_cast<size_t>(byte_stride)
+                                              : sizeof(glm::vec3);
+
+        // Clamp to the destination size so an accessor with more elements than
+        // the mesh has vertices cannot overflow 'result'.
+        const size_t dense_count =
+            accessor.count < vertex_count ? accessor.count : vertex_count;
+        for (size_t i = 0; i < dense_count; ++i) {
+            std::memcpy(&result[i], data + i * stride, sizeof(glm::vec3));
+        }
+    }
+
+    if (accessor.sparse.isSparse) {
+        // 'auto' avoids spelling out tinygltf's nested sparse struct type.
+        const auto& sparse = accessor.sparse;
+        const size_t sparse_count =
+            sparse.count > 0 ? static_cast<size_t>(sparse.count) : 0;
+
+        // glTF sparse indices can be UNSIGNED_BYTE, UNSIGNED_SHORT, or UNSIGNED_INT
+        size_t index_size = 0;
+        switch (sparse.indices.componentType) {
+            case TINYGLTF_COMPONENT_TYPE_UNSIGNED_BYTE:  index_size = 1; break;
+            case TINYGLTF_COMPONENT_TYPE_UNSIGNED_SHORT: index_size = 2; break;
+            case TINYGLTF_COMPONENT_TYPE_UNSIGNED_INT:   index_size = 4; break;
+            default: return result; // Not a valid index type: keep the base values
+        }
+
+        const tinygltf::BufferView& idx_view =
+            model.bufferViews[sparse.indices.bufferView];
+        const uint8_t* idx_data = model.buffers[idx_view.buffer].data.data()
+                                + idx_view.byteOffset
+                                + sparse.indices.byteOffset;
+        const tinygltf::BufferView& val_view =
+            model.bufferViews[sparse.values.bufferView];
+        const uint8_t* val_data = model.buffers[val_view.buffer].data.data()
+                                + val_view.byteOffset
+                                + sparse.values.byteOffset;
+
+        // Scatter each stored value to the vertex its index names. memcpy keeps the
+        // index loads alignment-safe; like the glTF data, it assumes little-endian.
+        for (size_t i = 0; i < sparse_count; ++i) {
+            uint32_t vertex_idx = 0;
+            std::memcpy(&vertex_idx, idx_data + i * index_size, index_size);
+            if (vertex_idx < vertex_count) {
+                std::memcpy(&result[vertex_idx],
+                            val_data + i * sizeof(glm::vec3), sizeof(glm::vec3));
+            }
+        }
+    }
+
+    return result;
+}
+----
+
+The switch statement on the index component type is important—do not assume all glTF exporters use the same index width. Blender typically uses `UNSIGNED_SHORT` for meshes with fewer than 65,536 vertices, but other exporters may use `UNSIGNED_INT` by default.
+
+== Reading Morph Target Names
+
+glTF allows—but does not require—morph target names to be stored in the mesh's `extras` field as a JSON array. Blender exports them this way, under the key `"targetNames"`. These names are important: they are the strings you will use to map animation channels (which reference morph targets by index) to human-readable names for debugging, and to connect lip sync events to the correct morph target.
+
+[source,cpp]
+----
+std::vector<std::string> read_morph_target_names(const tinygltf::Mesh& mesh)
+{
+    std::vector<std::string> result;
+    if (mesh.extras.Has("targetNames")) { // No such key: report no names at all
+        const tinygltf::Value& list = mesh.extras.Get("targetNames");
+        for (int idx = 0; idx < static_cast<int>(list.ArrayLen()); ++idx) {
+            result.push_back(list.Get(idx).Get<std::string>());
+        }
+    }
+    return result;
+}
+----
+
+== Loading All Morph Targets for a Mesh
+
+With the sparse accessor expansion and name reading in place, we can write the top-level function that loads all morph targets for a mesh primitive:
+
+[source,cpp]
+----
+std::vector<MorphTargetData> load_morph_targets(
+    const tinygltf::Model&     model,
+    const tinygltf::Primitive& primitive,
+    const tinygltf::Mesh&      mesh,
+    uint32_t                   vertex_count)
+{
+    const std::vector<std::string> target_names = read_morph_target_names(mesh);
+    std::vector<MorphTargetData>   loaded;
+    size_t slot = 0;
+    for (const auto& attributes : primitive.targets) {
+        MorphTargetData entry;
+        // Use the exported name when there is one, otherwise a generated fallback
+        entry.name = (slot < target_names.size()) ? target_names[slot]
+                                                  : ("target_" + std::to_string(slot));
+
+        // POSITION: a target without it gets an all-zero displacement set
+        const auto position = attributes.find("POSITION");
+        entry.position_deltas = (position != attributes.end())
+            ? expand_sparse_accessor(model, position->second, vertex_count)
+            : std::vector<glm::vec3>(vertex_count, glm::vec3(0.0f));
+
+        // NORMAL: optional, left empty when the target does not provide it
+        const auto normal = attributes.find("NORMAL");
+        if (normal != attributes.end()) {
+            entry.normal_deltas = expand_sparse_accessor(model, normal->second, vertex_count);
+        }
+
+        loaded.push_back(std::move(entry));
+        ++slot;
+    }
+
+    return loaded;
+}
+----
+
+== Uploading to the GPU
+
+Once we have the morph target data on the CPU, we upload each target's displacement array to a dedicated GPU buffer. Because we plan to use bindless access, we do not need to pack all targets into a single buffer—each target gets its own `VkBuffer`, and we store the buffer address (or descriptor index) for later shader use.
+
+[source,cpp]
+----
+// Records the upload of target.position_deltas into a new device-local buffer.
+// Only records commands onto 'cmd'; targets with no deltas are skipped entirely.
+void upload_morph_target_to_gpu(
+    VmaAllocator    allocator,
+    VkDevice        device,
+    VkCommandBuffer cmd,
+    MorphTargetData& target)
+{
+    const VkDeviceSize buffer_size = target.position_deltas.size() * sizeof(glm::vec3);
+    if (buffer_size == 0) return; // A VkBuffer must have a non-zero size
+
+    // Create a host-visible staging buffer
+    VkBuffer      staging_buf;
+    VmaAllocation staging_alloc;
+    create_buffer(allocator, buffer_size, VK_BUFFER_USAGE_TRANSFER_SRC_BIT,
+                  VMA_MEMORY_USAGE_CPU_TO_GPU, staging_buf, staging_alloc);
+
+    void* mapped;
+    vmaMapMemory(allocator, staging_alloc, &mapped);
+    std::memcpy(mapped, target.position_deltas.data(), buffer_size);
+    vmaUnmapMemory(allocator, staging_alloc);
+
+    // Create the device-local GPU buffer
+    create_buffer(allocator,
+                  buffer_size,
+                  VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |
+                  VK_BUFFER_USAGE_TRANSFER_DST_BIT   |
+                  VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT,
+                  VMA_MEMORY_USAGE_GPU_ONLY,
+                  target.gpu_buffer, target.gpu_allocation);
+
+    // Copy staging → GPU
+    VkBufferCopy region { 0, 0, buffer_size };
+    vkCmdCopyBuffer(cmd, staging_buf, target.gpu_buffer, 1, &region);
+
+    // NOTE(review): the staging handles are locals that are never returned or stored,
+    // so nothing can free them after submission; pass them to a deletion queue.
+}
+----
+
+The `VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT` flag is what allows us to use this buffer's GPU address in the bindless approach we will build in the next section. With this flag set, we can call `vkGetBufferDeviceAddress` to retrieve a 64-bit GPU pointer that the compute shader can dereference directly, without needing a descriptor.
+
+== The Weight Animation Channel
+
+Morph target weights are driven by a glTF animation channel targeting the `WEIGHTS` path on a mesh node. Parsing and interpolating this channel uses the same accessor and sampler infrastructure we built for the joint rotation channels in Chapter 3—the only difference is that instead of writing a quaternion to a joint's local rotation, we write a scalar float to a morph target weight array.
+
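+A common pattern is to maintain a per-mesh `std::vector<float>` of morph weights, one per target, that the animation system updates each frame. This array is then uploaded to the GPU before each skinning dispatch, either as a small uniform buffer or, where the device's `maxPushConstantsSize` allows it, as a push constant (the block below is 272 bytes, which is more than the 256 bytes that Vulkan 1.4 guarantees):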
+
+[source,cpp]
+----
+struct MorphWeightBlock {
+    float weights[64]; // Support up to 64 active morph targets
+    uint32_t active_count; // How many entries of weights[] the shader should iterate over
+    uint32_t _pad[3]; // Explicit padding; with 4-byte floats the block is 272 bytes, a multiple of 16
+};
+----
+
+Having the weights in a simple flat array makes the compute shader straightforward: for each vertex, iterate over `active_count` morph targets, read the weight and the displacement, add the weighted displacement to the base position. We will write this shader code in the next section.
+
+xref:Advanced_glTF/Morph_Targets_Facial_Animation/01_introduction.adoc[Previous: Introduction] | xref:Advanced_glTF/Morph_Targets_Facial_Animation/03_bindless_morph_buffers.adoc[Next: Bindless Morph Buffers]
diff --git a/en/Advanced_glTF/Morph_Targets_Facial_Animation/03_bindless_morph_buffers.adoc b/en/Advanced_glTF/Morph_Targets_Facial_Animation/03_bindless_morph_buffers.adoc
new file mode 100644
index 000000000..8f0a1d7e1
--- /dev/null
+++ b/en/Advanced_glTF/Morph_Targets_Facial_Animation/03_bindless_morph_buffers.adoc
@@ -0,0 +1,286 @@
+:pp: {plus}{plus}
+= Bindless Morph Buffers & Compute Integration
+
+== Why Bindless?
+
+Before Vulkan 1.2's descriptor indexing (formalized in Vulkan 1.3 and fully required in Vulkan 1.4), accessing an array of GPU buffers from a shader required binding each buffer individually, either as a separate descriptor set or as an element in a fixed-size array declared in advance. For morph targets, this creates a practical problem: you would need to either bind all morph target buffers—even the inactive ones—or rebuild descriptor sets every frame as the active set changes.
+
+**Descriptor indexing** (also called bindless rendering) solves this by allowing a shader to index into a descriptor array at runtime using a value computed during execution. You declare a large array of descriptors in your shader, populate it with all your morph target buffers at load time, and then at dispatch time supply a list of *which indices to read from*—the currently active morph targets and their weights. The GPU uses a descriptor index (a plain integer) to look up the correct buffer handle, without any CPU-side re-binding.
+
+The Vulkan feature we specifically need is `VK_EXT_descriptor_indexing` (core in Vulkan 1.2) with the `shaderStorageBufferArrayNonUniformIndexing` feature enabled.
+
+=== Enabling Device Features
+
+To use bindless morph buffers, you must enable the corresponding features during logical device creation. In Vulkan 1.4, these features are part of the core spec, but they still must be explicitly requested in the `VkPhysicalDeviceFeatures2` chain:
+
+[source,cpp]
+----
+// 1. Check for support of every feature the bindless morph array relies on
+VkPhysicalDeviceDescriptorIndexingFeatures indexing_features {};
+indexing_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DESCRIPTOR_INDEXING_FEATURES;
+VkPhysicalDeviceFeatures2 device_features {};
+device_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
+device_features.pNext = &indexing_features;
+vkGetPhysicalDeviceFeatures2(physical_device, &device_features);
+const VkPhysicalDeviceDescriptorIndexingFeatures& f = indexing_features;
+if (!f.shaderStorageBufferArrayNonUniformIndexing || !f.runtimeDescriptorArray ||
+    !f.descriptorBindingPartiallyBound || !f.descriptorBindingVariableDescriptorCount ||
+    !f.descriptorBindingStorageBufferUpdateAfterBind) {
+    throw std::runtime_error("GPU does not support non-uniform indexing of storage buffers!");
+}
+
+// 2. Enable during device creation
+VkDeviceCreateInfo device_info {};
+device_info.pNext = &device_features; // Includes the requested indexing features
+// ... rest of device creation ...
+----
+
+== Setting Up the Descriptor Infrastructure
+
+We need a descriptor set layout that contains a runtime-sized array of storage buffer descriptors. In Vulkan, this is declared with the `VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT` flag (the array's real length is chosen when the set is allocated) and the `VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT` flag (only the entries the shader actually accesses need to be valid). The layout below also sets `VK_DESCRIPTOR_BINDING_UPDATE_AFTER_BIND_BIT`, so descriptors can still be written after the set has been bound:
+
+[source,cpp]
+----
+void create_morph_descriptor_set_layout(
+    VkDevice device,
+    uint32_t max_morph_targets, // Upper bound on the total number of morph target buffers
+    VkDescriptorSetLayout& out_layout)
+{
+    // Flags for binding 0: variable length, sparsely populated, writable after bind
+    const VkDescriptorBindingFlags morph_binding_flags =
+        VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT |
+        VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT           |
+        VK_DESCRIPTOR_BINDING_UPDATE_AFTER_BIND_BIT;
+    VkDescriptorSetLayoutBindingFlagsCreateInfo flags_info {};
+    flags_info.sType         = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_BINDING_FLAGS_CREATE_INFO;
+    flags_info.bindingCount  = 1;
+    flags_info.pBindingFlags = &morph_binding_flags;
+
+    // Binding 0: storage-buffer array visible to compute, one slot per morph target
+    VkDescriptorSetLayoutBinding morph_binding {};
+    morph_binding.stageFlags      = VK_SHADER_STAGE_COMPUTE_BIT;
+    morph_binding.descriptorCount = max_morph_targets;
+    morph_binding.descriptorType  = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+    morph_binding.binding         = 0;
+
+    VkDescriptorSetLayoutCreateInfo create_info {};
+    create_info.sType        = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO;
+    create_info.flags        = VK_DESCRIPTOR_SET_LAYOUT_CREATE_UPDATE_AFTER_BIND_POOL_BIT;
+    create_info.pNext        = &flags_info;
+    create_info.pBindings    = &morph_binding;
+    create_info.bindingCount = 1;
+
+    vkCreateDescriptorSetLayout(device, &create_info, nullptr, &out_layout);
+}
+
+// Writes morph_buffers[i] into element i of the bindless array at binding 0.
+void update_morph_descriptors(
+    VkDevice device,
+    VkDescriptorSet descriptor_set,
+    const std::vector<VkBuffer>& morph_buffers)
+{
+    if (morph_buffers.empty()) return; // A write with descriptorCount 0 is invalid
+    // Size the array up front: taking the address of an element while still calling
+    // push_back leaves earlier pointers dangling as soon as the vector reallocates.
+    std::vector<VkDescriptorBufferInfo> buffer_infos(morph_buffers.size());
+    for (size_t i = 0; i < morph_buffers.size(); ++i) {
+        buffer_infos[i].buffer = morph_buffers[i];
+        buffer_infos[i].offset = 0;
+        buffer_infos[i].range  = VK_WHOLE_SIZE;
+    }
+
+    // One write covers the whole contiguous range starting at array element 0
+    VkWriteDescriptorSet write {};
+    write.sType           = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET;
+    write.dstSet          = descriptor_set;
+    write.dstBinding      = 0;
+    write.dstArrayElement = 0;
+    write.descriptorType  = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+    write.descriptorCount = static_cast<uint32_t>(buffer_infos.size());
+    write.pBufferInfo     = buffer_infos.data();
+
+    vkUpdateDescriptorSets(device, 1, &write, 0, nullptr);
+}
+----
+
+=== Creating the Morph Compute Pipeline
+
+To run our extended compute shader, we need a pipeline that includes the bindless descriptor set. The pipeline layout must account for the bindless array at binding 0 of set 1 (matching our `[[vk::binding(0, 1)]]` declaration):
+
+[source,cpp]
+----
+void create_morph_pipeline(
+    VkDevice device,
+    VkDescriptorSetLayout base_layout,  // Binding set 0 (input/output vertices, joint matrices)
+    VkDescriptorSetLayout morph_layout, // Binding set 1 (bindless morph buffers)
+    VkShaderModule shader_module,
+    VkPipelineLayout& out_layout,
+    VkPipeline& out_pipeline)
+{
+    // 1. Pipeline layout: set 0 = base skinning resources, set 1 = morph buffers
+    const VkDescriptorSetLayout set_layouts[] = { base_layout, morph_layout };
+
+    VkPushConstantRange constants {};
+    constants.size       = sizeof(PushConstants);
+    constants.offset     = 0;
+    constants.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+
+    VkPipelineLayoutCreateInfo layout_ci {};
+    layout_ci.sType                  = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
+    layout_ci.pSetLayouts            = set_layouts;
+    layout_ci.setLayoutCount         = 2;
+    layout_ci.pPushConstantRanges    = &constants;
+    layout_ci.pushConstantRangeCount = 1;
+
+    vkCreatePipelineLayout(device, &layout_ci, nullptr, &out_layout);
+
+    // 2. Compute pipeline: one compute stage entering the module at "main"
+    VkComputePipelineCreateInfo pipeline_ci {};
+    pipeline_ci.sType        = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
+    pipeline_ci.stage.sType  = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
+    pipeline_ci.stage.pName  = "main";
+    pipeline_ci.stage.module = shader_module;
+    pipeline_ci.stage.stage  = VK_SHADER_STAGE_COMPUTE_BIT;
+    pipeline_ci.layout       = out_layout;
+
+    vkCreateComputePipelines(device, VK_NULL_HANDLE, 1, &pipeline_ci, nullptr, &out_pipeline);
+}
+----
+
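+When you allocate the descriptor set from this layout, you specify the actual count of descriptors you want (which can be less than `max_morph_targets`) by chaining a `VkDescriptorSetVariableDescriptorCountAllocateInfo` into the `VkDescriptorSetAllocateInfo`; because the layout uses update-after-bind, the descriptor pool must also be created with `VK_DESCRIPTOR_POOL_CREATE_UPDATE_AFTER_BIND_BIT`. Once allocated, you populate the array using `vkUpdateDescriptorSets` as shown above. This is a one-time setup at load time—the buffers themselves don't change, only the weights used to blend them.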
+
+== Morph Target Animation
+
+Driving morph weights at runtime is done via glTF animation channels with the `weights` path. Unlike translation (vec3) or rotation (quat), a morph weight channel produces a scalar value (or an array of scalars if multiple morph targets are animated in the same channel).
+
+[source,cpp]
+----
+void update_morph_weights(
+    const AnimationChannel& channel,
+    const AnimationSampler& sampler,
+    float current_time,
+    std::vector<float>& mesh_weights)
+{
+    if (channel.path != AnimationChannel::WEIGHTS) return;
+
+    // Find the keyframe interval [t0, t1]; assumes key k+1 exists — TODO confirm
+    uint32_t k = find_keyframe(sampler, current_time);
+    float t0 = sampler.inputs[k];
+    float t1 = sampler.inputs[k+1];
+    // A zero-length interval (duplicate key times) would divide by zero: hold key k
+    float factor = (t1 > t0) ? (current_time - t0) / (t1 - t0) : 0.0f;
+
+    // Sample the weights for all targets in this mesh
+    uint32_t target_count = static_cast<uint32_t>(mesh_weights.size());
+    for (uint32_t i = 0; i < target_count; ++i) {
+        float w0 = sampler.values[k * target_count + i].x;
+        float w1 = sampler.values[(k+1) * target_count + i].x;
+        // Linear interpolation is typically sufficient for facial expressions
+        mesh_weights[i] = glm::mix(w0, w1, factor);
+    }
+}
+----
+
+These weights are then packed into the `PushConstants` and sent to the compute shader.
+
+== The Extended Compute Shader
+
+We need to extend the skinning compute shader from Chapter 3 to apply morph target displacements before the skeletal skinning step. The morph application and the skeletal skinning are both linear operations on the same vertex buffer, so they compose cleanly: apply morphs first (which moves the vertex in the bind pose), then apply skeletal skinning (which transforms the morphed bind pose into the animated world pose).
+
+Here is the extended shader in Slang. The key additions are the `morphTargets` bindless buffer array, the `MorphWeightBlock` parameter, and the morph accumulation loop:
+
+[source,slang]
+----
+// Bindless array of morph target displacement buffers.
+// Each buffer contains one vec3 per vertex in the mesh.
+[[vk::binding(0, 1)]]
+StructuredBuffer<float3> morphTargets[];
+
+// Weights for the currently active morph targets.
+// weights[m] pairs with PushConstants::morph_indices[m]; only the first
+// active_count entries are read by the shader.
+struct MorphWeightBlock {
+    float    weights[64];
+    uint32_t active_count;
+    uint32_t pad[3];
+};
+// Per-dispatch parameters. With tightly packed 4-byte members this block is
+// 4 + 4 + 256 + (256 + 4 + 12) = 536 bytes, far above the 128 bytes of push
+// constant space Vulkan guarantees - check maxPushConstantsSize before using it.
+[[vk::push_constant]]
+struct PushConstants {
+    // From Chapter 3: number of vertices to process and number of skeleton joints
+    uint32_t vertex_count;
+    uint32_t joint_count;
+    // Morph target indices (which slots in morphTargets[] are active)
+    uint32_t morph_indices[64];
+    MorphWeightBlock morph_weights;
+} pc;
+
+[[vk::binding(0, 0)]] StructuredBuffer<InputVertex>  base_vertices;
+[[vk::binding(1, 0)]] RWStructuredBuffer<OutputVertex> output_vertices;
+[[vk::binding(2, 0)]] StructuredBuffer<float4x4> joint_matrices;
+
+// Skinning entry point: one thread per vertex. Adds the weighted morph target
+// displacements to the bind-pose position, then applies the blended joint
+// matrix and writes the result to output_vertices.
+[shader("compute")]
+[numthreads(64, 1, 1)]
+void main(uint3 dispatch_id : SV_DispatchThreadID)
+{
+    uint vertex_idx = dispatch_id.x;
+    // The dispatch is rounded up to a multiple of 64 threads; drop the overhang.
+    if (vertex_idx >= pc.vertex_count) return;
+
+    InputVertex base = base_vertices[vertex_idx];
+
+    // --- Step 1: Apply morph target displacements ---
+    float3 morphed_position = base.position;
+    float3 morphed_normal   = base.normal;
+
+    // Never trust the CPU-side count beyond the size of the 64-entry arrays.
+    uint active_count = min(pc.morph_weights.active_count, 64u);
+
+    for (uint m = 0; m < active_count; ++m) {
+        uint   target_slot = pc.morph_indices[m];
+        float  weight      = pc.morph_weights.weights[m];
+        // Compare the magnitude: negative weights are valid and must not be skipped.
+        if (abs(weight) < 1e-5f) continue;
+
+        // NonUniformResourceIndex is required whenever a bindless array is indexed
+        // with a value that may differ between invocations. Here the slot comes
+        // from push constants (identical for every thread), so it is a safeguard.
+        float3 delta = morphTargets[NonUniformResourceIndex(target_slot)][vertex_idx];
+        morphed_position += weight * delta;
+        // Normal deltas would go here if you have a second morphNormals[] array
+    }
+
+    // --- Step 2: Skeletal skinning (same as Chapter 3) ---
+    float4x4 skin_matrix =
+        base.joint_weights.x * joint_matrices[base.joint_indices.x] +
+        base.joint_weights.y * joint_matrices[base.joint_indices.y] +
+        base.joint_weights.z * joint_matrices[base.joint_indices.z] +
+        base.joint_weights.w * joint_matrices[base.joint_indices.w];
+
+    float4 world_pos = mul(skin_matrix, float4(morphed_position, 1.0));
+    float3 world_nrm = normalize(mul(float3x3(skin_matrix), morphed_normal));
+
+    // Write to output buffer (shared with rasterizer, ray tracing, physics).
+    // The local is not called `out` because that is a parameter-direction keyword.
+    OutputVertex skinned;
+    skinned.position = world_pos.xyz;
+    skinned.normal   = world_nrm;
+    skinned.tangent  = base.tangent;
+    skinned.texcoord = base.texcoord;
+    output_vertices[vertex_idx] = skinned;
+}
+----
+
+The `NonUniformResourceIndex` annotation is what keeps bindless indexing correct when the index can vary between invocations. Without it, the GPU driver may assume all threads in a wave (a group of threads that execute in lockstep) are accessing the same descriptor index—which would be incorrect when different characters are being skinned in the same dispatch. `NonUniformResourceIndex` tells the compiler to emit a `NonUniform` decoration on the index, which instructs the hardware to perform a non-uniform texture/buffer fetch that handles the per-thread variation correctly. In the shader above the slot index is read from push constants and is therefore identical for every thread, so the annotation is a safeguard there; it becomes mandatory as soon as the index is derived from per-vertex or per-instance data.
+
+== Integrating with the Dispatch Loop
+
+The morph system integrates into the frame loop naturally:
+
+1. **Update weights.** The animation system advances the `WEIGHTS` animation channel and writes new float values into the per-mesh morph weight array.
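+2. **Cull inactive targets.** Any morph target whose weight has an absolute value below a threshold (typically 0.005) is excluded from the dispatch. Compare the magnitude rather than the signed value, because glTF does not restrict weights to be non-negative. Build the compact `morph_indices` and `weights` arrays from only the active targets.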
+3. **Push constants.** Fill the `PushConstants` struct with the active target count, the descriptor indices of the active buffers, and their weights.
+4. **Dispatch.** Bind the descriptor set (which has all morph target buffers registered), push constants, and dispatch the compute shader. One thread per vertex.
+5. **Barrier.** Issue a `VkBufferMemoryBarrier2` on the output buffer (same as Chapter 3) before the rasterizer or ray tracing pass reads from it.
+
+The only part that requires care is the per-frame push constant size. If you have up to 64 active morph targets, the push constants become large—64 indices (256 bytes) plus 64 weights (256 bytes) plus metadata equals over 512 bytes. The Vulkan specification guarantees a minimum of only 128 bytes of push constant space, and although many desktop GPUs expose 256 bytes, that is still well short of what this layout needs. Query `VkPhysicalDeviceLimits::maxPushConstantsSize` at startup; if you exceed the available push constant space, move the index and weight data into a small uniform buffer instead.
+
+== Morph Targets and Skeletal Rigs Together
+
+The final question is how morph targets and the joint skeleton cooperate. When a character opens their mouth, this involves both a morph target (sliding the lips and revealing teeth in a complex, non-rigid deformation) and a joint rotation (rotating the jaw bone downward to pull the lower jaw open). These two systems must be applied in the right order and must agree on the base pose.
+
+The rule is: **apply morphs in the bind pose, then apply skeletal skinning.** The bind pose is the rest configuration that the artist sets up in Blender. Morph targets are defined as displacements from this pose. The skeletal inverse bind matrices are also defined relative to this pose. If you apply morphs first (as our shader does), then run the joint influence weighting with the inverse bind matrices, the result is mathematically correct: the morphed geometry in bind pose is transformed by the animation pose, yielding the final animated-and-morphed mesh.
+
+Getting this wrong—applying morph targets after skeletal skinning, or in world space rather than bind space—produces smearing artifacts where facial geometry stretches or fails to follow the jaw bone.
+
+xref:Advanced_glTF/Morph_Targets_Facial_Animation/02_shape_key_ingestion.adoc[Previous: Shape Key Ingestion] | xref:Advanced_glTF/Morph_Targets_Facial_Animation/04_conclusion.adoc[Next: Conclusion]
diff --git a/en/Advanced_glTF/Morph_Targets_Facial_Animation/04_conclusion.adoc b/en/Advanced_glTF/Morph_Targets_Facial_Animation/04_conclusion.adoc
new file mode 100644
index 000000000..8ea7b76aa
--- /dev/null
+++ b/en/Advanced_glTF/Morph_Targets_Facial_Animation/04_conclusion.adoc
@@ -0,0 +1,38 @@
+:pp: {plus}{plus}
+= Morph Targets: Summary & What's Next
+
+== What We Built
+
+This chapter completed the deformation pipeline that began in Chapter 3. Skeletal skinning handles the body: large-scale, rotation-driven deformation of limbs, spine, and head. Morph targets handle the face: subtle, non-rigid deformation of skin that slides, wrinkles, and bulges in response to muscle activation.
+
+We started with the glTF file format's approach to morph targets: sparse accessors that store only the vertices that actually change between the base mesh and each expression. Parsing sparse accessors correctly—expanding them into full dense displacement arrays—is the first and most error-prone step, particularly because the index component type can vary between exporters. We wrote a robust `expand_sparse_accessor` function that handles byte, short, and integer index widths and gracefully returns zeros for vertices that are not explicitly listed.
+
+With the displacement data on the CPU, we uploaded each morph target to its own GPU buffer using the staging buffer pattern from earlier chapters. We tagged these buffers with `VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT` in preparation for the bindless architecture.
+
+The bindless descriptor infrastructure uses `VK_DESCRIPTOR_BINDING_VARIABLE_DESCRIPTOR_COUNT_BIT` and `VK_DESCRIPTOR_BINDING_PARTIALLY_BOUND_BIT` to create a descriptor set with a runtime-sized array of storage buffer descriptors. All morph target buffers are registered in this array at load time. At dispatch time, we supply only the indices and weights of the *active* morph targets, avoiding the cost of touching inactive buffers.
+
+The extended compute shader applies morph displacements in the bind pose before the skeletal skinning step, using `NonUniformResourceIndex` to safely index the bindless array with per-thread values. The output is the same shared vertex buffer that the rasterizer, ray tracer, and physics system already consume—morph target support adds no new synchronization requirements to the downstream pipeline.
+
+== Tradeoffs to Keep in Mind
+
+Morph targets are fast to evaluate on modern GPU hardware because they are embarrassingly parallel: each vertex is independent, and the work per vertex is just a few multiply-add operations per active target. The bottleneck is almost always memory bandwidth—fetching the displacement arrays from GPU memory—not arithmetic. This is why the "cull inactive targets" step matters: if you have 60 morph targets but only 8 are nonzero this frame, dispatching with 8 active targets rather than 60 reduces the bandwidth by 7.5x.
+
+The push constant size concern we raised in the previous section deserves a follow-up. For large numbers of simultaneously active morph targets, moving the weight and index data into a small uniform buffer (updated with `vkCmdUpdateBuffer` or a staging copy) is the right call. The performance difference between push constants and a small uniform buffer fetch is negligible for this use case—you will not notice it in a profile—but the robustness benefit of not depending on beyond-minimum push constant sizes is real.
+
+Normal morph targets (displacement vectors for surface normals) were mentioned but not fully implemented in this chapter. If your character has morph targets that significantly change the curvature of the skin—puffing cheeks, wrinkling a brow—you should implement normal morphing alongside position morphing. The shader extension is straightforward: add a second bindless array for normal deltas and accumulate them the same way. Without normal morphing, the lighting on a morphed face will look slightly wrong, particularly under harsh directional lights.
+
+== What Comes Next
+
+In Chapter 7 we step back from the rendering and physics code to examine the **production pipeline**: the workflow from artist's Blender file to a validated, engine-ready glTF asset. We will cover the naming conventions and export settings that make the engine-side code reliable, how to use the Khronos glTF-Validator to catch format errors before they reach your loader, and how to use professional glTF viewers to establish a "ground truth" for materials and animation before you spend hours debugging what turns out to be an export problem.
+
+Understanding the production pipeline is as important as understanding the rendering code—many apparent engine bugs are actually asset bugs, and knowing how to distinguish between them will save you significant debugging time.
+
+== Verification: What to Look For
+
+To verify your morph target implementation:
+
+1.  **Sparse Accessor Expansion**: Verify that vertices with no displacement in a particular morph target correctly remain at their base positions.
+2.  **Bindless Indexing**: Use a GPU debugger (like RenderDoc) to inspect the `morphTargets[]` array. Ensure that the descriptor indices passed in `PushConstants` match the correct buffer slots.
+3.  **Order of Operations**: Ensure that morph displacements are applied **before** skeletal skinning. If they are applied after, facial features will appear to "float" or detach when the head rotates.
+
+xref:Advanced_glTF/Morph_Targets_Facial_Animation/03_bindless_morph_buffers.adoc[Previous: Bindless Morph Buffers] | xref:Advanced_glTF/Tooling_Production_Pipeline/01_introduction.adoc[Next: Tooling & The Production Pipeline]
diff --git a/en/Advanced_glTF/Physics_Integration/01_introduction.adoc b/en/Advanced_glTF/Physics_Integration/01_introduction.adoc
new file mode 100644
index 000000000..c1b46abe9
--- /dev/null
+++ b/en/Advanced_glTF/Physics_Integration/01_introduction.adoc
@@ -0,0 +1,173 @@
+:pp: {plus}{plus}
+= Physics Integration: Colliders & Ragdolls
+
+== The Gap Between Looking Right and Behaving Right
+
+Everything we have built so far has been about making a character *look* correct. The scene graph ensures that child bones follow parent bones in a physically plausible hierarchy. The compute skinning pipeline deforms the mesh in a GPU buffer that every downstream system can read. The interpolation and blending system ensures that transitions between animations are smooth and natural.
+
+But a character that looks correct and a character that *behaves* correctly in a physics simulation are two very different things. A character can have a flawlessly animated walk cycle and completely nonsensical ragdoll physics. The arm that looks graceful during animation can fly off at impossible angles the moment the physics engine takes over. The body that falls down a flight of stairs can pass straight through the steps as if they weren't there. These are not rendering problems—they are physics integration problems, and they are the focus of this chapter.
+
+The fundamental challenge is that animation systems and physics engines have fundamentally different world views. An animation system is deterministic and artist-controlled: it follows keyframe data, blends between clips, and drives joint transforms in a predictable sequence. A physics engine is a solver: it works with shapes, masses, forces, and constraints, and it produces joint transforms as an *output* rather than as an input. Bridging these two world views—making them cooperate rather than conflict—requires careful design and a clear definition of who is in control at any given moment.
+
+== What This Chapter Will Build
+
+We will construct the physical representation of our character's body from the same glTF skeleton that drives the visual mesh. The key insight is that we don't need—and shouldn't want—a high-fidelity mesh collider for each body part. That would be enormously expensive and would produce an unstable simulation. Instead, we will generate **proxy colliders**: simple geometric shapes (capsules and boxes) that approximate each bone's volume closely enough for plausible collision behavior, while remaining fast enough to simulate in real time.
+
+We will attach those collider shapes to the skeleton using **physics constraints**—specifically, hinge constraints and ball-and-socket constraints that mirror the joints of the human body. A real knee only bends in one plane. A real shoulder has a large but finite range of motion. By encoding these limits into our constraints, we prevent the ragdoll from producing the grotesque contortions that unconstrained simulations are infamous for.
+
+We will then build the machinery for the **Ragdoll Handoff**: the state transition where the animation controller, which has been driving the skeleton every frame, yields control to the physics solver. This is a surprisingly subtle problem. If you simply hand over control at the wrong moment—or without properly initializing the physics bodies' velocities from the character's current motion—the ragdoll will snap, jitter, or pop in a way that immediately breaks immersion.
+
+Finally, we will address **self-collision filtering**. Without it, the character's arm will collide with its own torso, the thighs will prevent the knees from bending, and the simulation will explode with internal constraint violations. We will use Vulkan-independent bitmask logic (since collision filtering is a physics engine concept, not a Vulkan one) to tell the simulation which body parts are allowed to interact with each other.
+
+== Recommended Engine: Jolt Physics
+
+Throughout this chapter, we will use **Jolt Physics** for our examples. Jolt is a high-performance, multi-threaded physics engine used in titles like *Horizon Forbidden West*. It is particularly well-suited for character physics due to its robust constraint solver and clean C++ API.
+
+=== Jolt Physics Setup
+To integrate Jolt into your project, you can use CMake's `FetchContent` as shown in the series introduction. Once integrated, you must initialize the engine:
+
+[source,cpp]
+----
+#include <Jolt/Jolt.h>
+#include <Jolt/RegisterTypes.h>
+#include <Jolt/Core/Factory.h>
+#include <Jolt/Physics/PhysicsSystem.h>
+
+// 1. Initialize Jolt global state
+// The allocator, factory, and type registration are set up before the
+// PhysicsSystem below is constructed. The Factory is heap-allocated here and
+// this snippet does not free it.
+JPH::RegisterDefaultAllocator();
+JPH::Factory::sInstance = new JPH::Factory();
+JPH::RegisterTypes();
+
+// 2. Create the Physics System
+// NOTE: every argument to Init is a placeholder that this snippet does not
+// declare; the capacity limits and the three layer/filter objects must be
+// defined before this call.
+JPH::PhysicsSystem physics_system;
+physics_system.Init(
+    max_bodies,
+    num_body_mutexes,
+    max_body_pairs,
+    max_contact_constraints,
+    broad_phase_layer_interface,
+    object_vs_broad_phase_layer_filter,
+    object_vs_object_layer_filter
+);
+----
+
+=== The PhysicsWorld Interface
+To keep our code clean and engine-agnostic, we will wrap the physics engine calls in a `PhysicsWorld` abstraction. Here is the interface we will use:
+
+[source,cpp]
+----
+// Rigid-body pose: a translation plus a rotation, with no scale component.
+struct PhysicsPose {
+    glm::vec3 position;
+    glm::quat orientation;
+
+    // Returns the 4x4 transform that rotates by `orientation` and then
+    // translates by `position` (translation * rotation).
+    glm::mat4 to_matrix() const {
+        const glm::mat4 translation_part = glm::translate(glm::mat4(1.0f), position);
+        const glm::mat4 rotation_part    = glm::mat4_cast(orientation);
+        return translation_part * rotation_part;
+    }
+};
+
+// Abstract interface through which the rest of the engine talks to the physics
+// backend. Every method is pure virtual. Note that the signatures use Jolt
+// types (JPH::BodyID, JPH::BodyCreationSettings, JPH::EMotionType) directly.
+class PhysicsWorld {
+public:
+    virtual ~PhysicsWorld() = default;
+
+    // Body Management
+    // Creates a body from `settings` and returns its ID.
+    virtual JPH::BodyID create_body(const JPH::BodyCreationSettings& settings) = 0;
+    // Destroys the body identified by `body_id`.
+    virtual void        destroy_body(JPH::BodyID body_id) = 0;
+    // Changes the body's motion type (e.g. kinematic <-> dynamic).
+    virtual void        set_motion_type(JPH::BodyID body_id, JPH::EMotionType type) = 0;
+    // Activates the body.
+    virtual void        activate_body(JPH::BodyID body_id) = 0;
+
+    // Syncing
+    // Drives a body to `pose`.
+    virtual void        move_kinematic(JPH::BodyID body_id, const PhysicsPose& pose) = 0;
+    // Reads back the body's current position and orientation.
+    virtual PhysicsPose get_body_pose(JPH::BodyID body_id) const = 0;
+    // Sets the body's linear velocity.
+    virtual void        set_linear_velocity(JPH::BodyID body_id, const glm::vec3& velocity) = 0;
+
+    // Constraints
+    // Joins p1 and p2 with a ball-and-socket joint; `swing` and `twist` are the
+    // intended angular limits (units are not specified here - presumably radians).
+    virtual void        create_ball_socket_constraint(JPH::BodyID p1, JPH::BodyID p2, float swing, float twist) = 0;
+    // Joins p1 and p2 with a hinge about `axis`, limited to [min_angle, max_angle].
+    virtual void        create_hinge_constraint(JPH::BodyID p1, JPH::BodyID p2, const glm::vec3& axis, float min_angle, float max_angle) = 0;
+};
+----
+
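+In a real implementation, `move_kinematic` would call `JPH::BodyInterface::SetPositionAndRotation` with the `EActivation::Activate` flag, and `get_body_pose` would read from `JPH::BodyInterface::GetPositionAndRotation`. Note that `SetPositionAndRotation` teleports the body without touching its velocity; if you need the kinematic body to carry a correct velocity (so that it pushes dynamic bodies realistically), use `JPH::BodyInterface::MoveKinematic` instead, which additionally takes the frame's delta time.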
+
+=== Concrete Implementation: Jolt Physics
+
+To bridge our engine-agnostic `PhysicsWorld` to Jolt, we implement the methods using Jolt's `BodyInterface`. The `BodyInterface` is the primary way to interact with bodies in Jolt, handling thread safety and state synchronization automatically.
+
+[source,cpp]
+----
+// Jolt-backed implementation of the PhysicsWorld interface.
+// Both pointers are non-owning: the JPH::PhysicsSystem passed to the
+// constructor must outlive this object.
+class JoltPhysicsWorld : public PhysicsWorld {
+    JPH::PhysicsSystem* system;
+    JPH::BodyInterface* body_interface;
+
+public:
+    JoltPhysicsWorld(JPH::PhysicsSystem* s) : system(s) {
+        body_interface = &system->GetBodyInterface();
+    }
+
+    // Creates a body and adds it to the simulation.
+    // Returns an invalid (default-constructed) BodyID when Jolt cannot allocate
+    // another body; callers should test the result with BodyID::IsInvalid().
+    JPH::BodyID create_body(const JPH::BodyCreationSettings& settings) override {
+        // CreateBody returns nullptr once the body budget given to
+        // PhysicsSystem::Init is exhausted, so it must not be dereferenced blindly.
+        JPH::Body* body = body_interface->CreateBody(settings);
+        if (body == nullptr) return JPH::BodyID();
+
+        body_interface->AddBody(body->GetID(), JPH::EActivation::Activate);
+        return body->GetID();
+    }
+
+    // Removes the body from the simulation and frees it. Invalid IDs are ignored.
+    void destroy_body(JPH::BodyID body_id) override {
+        if (body_id.IsInvalid()) return;
+        body_interface->RemoveBody(body_id);
+        body_interface->DestroyBody(body_id);
+    }
+
+    void set_motion_type(JPH::BodyID body_id, JPH::EMotionType type) override {
+        body_interface->SetMotionType(body_id, type, JPH::EActivation::Activate);
+    }
+
+    void activate_body(JPH::BodyID body_id) override {
+        body_interface->ActivateBody(body_id);
+    }
+
+    // Places the body at `pose` and wakes it up.
+    void move_kinematic(JPH::BodyID body_id, const PhysicsPose& pose) override {
+        // Convert from glm to Jolt types. Positions use RVec3 so the code also
+        // compiles when Jolt is built in double-precision mode.
+        JPH::RVec3 position(pose.position.x, pose.position.y, pose.position.z);
+        JPH::Quat rotation(pose.orientation.x, pose.orientation.y, pose.orientation.z, pose.orientation.w);
+
+        // SetPositionAndRotation teleports the body; it does NOT derive a
+        // velocity from the motion. If the kinematic body must push dynamic
+        // bodies realistically, call BodyInterface::MoveKinematic with the
+        // frame's delta time instead.
+        body_interface->SetPositionAndRotation(body_id, position, rotation, JPH::EActivation::Activate);
+    }
+
+    // Reads the body's current world-space position and orientation.
+    PhysicsPose get_body_pose(JPH::BodyID body_id) const override {
+        JPH::RVec3 position;
+        JPH::Quat rotation;
+        body_interface->GetPositionAndRotation(body_id, position, rotation);
+
+        // glm::quat's constructor takes (w, x, y, z), unlike JPH::Quat's (x, y, z, w).
+        return {
+            glm::vec3(position.GetX(), position.GetY(), position.GetZ()),
+            glm::quat(rotation.GetW(), rotation.GetX(), rotation.GetY(), rotation.GetZ())
+        };
+    }
+
+    void set_linear_velocity(JPH::BodyID body_id, const glm::vec3& v) override {
+        body_interface->SetLinearVelocity(body_id, JPH::Vec3(v.x, v.y, v.z));
+    }
+
+    // Pins the two bodies together at their centers of mass.
+    // `swing` and `twist` are currently unused: a JPH::PointConstraint has no
+    // angular limits. To enforce them, switch to JPH::SwingTwistConstraintSettings.
+    void create_ball_socket_constraint(JPH::BodyID p1, JPH::BodyID p2, float swing, float twist) override {
+        (void)swing;
+        (void)twist;
+        if (p1.IsInvalid() || p2.IsInvalid()) return;
+
+        JPH::PointConstraintSettings settings;
+        settings.mSpace = JPH::EConstraintSpace::LocalToBodyCOM;
+        // In a real implementation, you would set the anchor points
+        // (mPoint1 / mPoint2) to the joint position in each body's local space.
+
+        // BodyInterface::CreateConstraint locks both bodies while the constraint
+        // is built; the constraint is then registered with the PhysicsSystem.
+        JPH::TwoBodyConstraint* constraint = body_interface->CreateConstraint(&settings, p1, p2);
+        if (constraint != nullptr) system->AddConstraint(constraint);
+    }
+
+    // Creates a hinge between the two bodies.
+    // `axis` is interpreted in each body's local space; Jolt expects
+    // min_angle in [-pi, 0] and max_angle in [0, pi] (radians).
+    void create_hinge_constraint(JPH::BodyID p1, JPH::BodyID p2, const glm::vec3& axis, float min_angle, float max_angle) override {
+        if (p1.IsInvalid() || p2.IsInvalid()) return;
+
+        JPH::HingeConstraintSettings settings;
+        // Without this the zero anchor below would mean the WORLD origin,
+        // because constraint settings default to world space.
+        settings.mSpace = JPH::EConstraintSpace::LocalToBodyCOM;
+        settings.mPoint1 = settings.mPoint2 = JPH::RVec3::sZero(); // Anchor at COM for simplicity
+
+        // The hinge axis must be unit length and the normal axis perpendicular
+        // to it; the default normal axis (X) would be degenerate for an X hinge.
+        const JPH::Vec3 hinge_axis = JPH::Vec3(axis.x, axis.y, axis.z).NormalizedOr(JPH::Vec3::sAxisY());
+        settings.mHingeAxis1  = settings.mHingeAxis2  = hinge_axis;
+        settings.mNormalAxis1 = settings.mNormalAxis2 = hinge_axis.GetNormalizedPerpendicular();
+        settings.mLimitsMin = min_angle;
+        settings.mLimitsMax = max_angle;
+
+        JPH::TwoBodyConstraint* constraint = body_interface->CreateConstraint(&settings, p1, p2);
+        if (constraint != nullptr) system->AddConstraint(constraint);
+    }
+};
+----
+
+xref:Advanced_glTF/Skeletal_Compute_Skinning/06_conclusion.adoc[Previous: Compute Skinning Conclusion] | xref:Advanced_glTF/Physics_Integration/02_bone_proxy_colliders.adoc[Next: Bone Proxy Colliders]
diff --git a/en/Advanced_glTF/Physics_Integration/02_bone_proxy_colliders.adoc b/en/Advanced_glTF/Physics_Integration/02_bone_proxy_colliders.adoc
new file mode 100644
index 000000000..b4f0b7ae7
--- /dev/null
+++ b/en/Advanced_glTF/Physics_Integration/02_bone_proxy_colliders.adoc
@@ -0,0 +1,208 @@
+:pp: {plus}{plus}
+= Bone Proxy Colliders
+
+== Why Not Use the Real Mesh?
+
+The most intuitive approach to character physics collision is to use the actual mesh surface as the collision geometry. After all, the compute shader already produces an animated vertex buffer—why not hand that directly to the physics engine and let it work with the real shape?
+
+The answer is performance and stability. A physics engine's collision detection system works best with simple, convex shapes. The moment you give it a concave mesh—which is what almost every character body part is—the engine must switch to a much more expensive decomposition algorithm to handle the concavity correctly. A typical humanoid character mesh has tens of thousands of vertices. Running full mesh-vs-mesh collision detection for even a small crowd of characters would make your frame budget disappear instantly.
+
+More subtly, complex collision shapes produce more contact points per frame, which places a larger burden on the constraint solver. A physics constraint solver works by iteratively resolving all of the contact forces and joint constraints in a scene until it finds a configuration that violates none of them. More contacts mean more iterations, and more iterations mean more time—often with diminishing returns, since the extra precision of the complex shapes rarely produces visible improvements in gameplay.
+
+The solution the industry settled on decades ago is **proxy colliders**: simple shapes that are "close enough" to the bone's volume to produce believable collisions, but simple enough that the physics engine can process them at the speed required for real-time simulation. The word "proxy" is deliberate—these shapes are stand-ins for the real geometry, not representations of it.
+
+== Choosing the Right Shape
+
+For humanoid characters, two shapes cover the vast majority of cases: the **capsule** and the **box**.
+
+A capsule is a cylinder with hemispherical caps. It is the ideal shape for limbs because it is rotationally symmetric, smooth (which prevents tunneling artifacts at joints), and requires only two parameters to define: a half-height and a radius. The physics engine can perform capsule-vs-capsule collision detection extremely efficiently. Long bones—the humerus (upper arm), ulna/radius (forearm), femur (thigh), tibia (shin)—are all well-approximated by capsules.
+
+A box is better suited for the torso and pelvis, which are roughly rectangular and benefit from having distinct width, height, and depth dimensions. Boxes can also represent the skull reasonably well. They are slightly more expensive than capsules, but still vastly cheaper than convex mesh colliders.
+
+For hands and feet, you have a choice. A single capsule per hand (representing the palm and fingers as a unified blob) is the cheapest option and is fine for most games. If you need more precise hand interaction—for a game where characters grip objects, for example—you can use a small box for the palm and individual small capsules for each finger, but you should profile this carefully.
+
+== Defining Collider Properties in glTF Extras
+
+In Chapter 2 we introduced the concept of glTF "extras"—the arbitrary JSON metadata that can be attached to any node in a glTF file. This is the right place to store your collider definitions. By embedding the collider parameters in the glTF asset, you keep the physics setup co-located with the skeleton data, and you give artists a way to tune the shapes directly in Blender without touching engine code.
+
+A typical collider extra for a forearm bone might look like this in your Blender custom properties:
+
+[source,json]
+----
+{
+  "physics": {
+    "collider": "capsule",
+    "radius": 0.045,
+    "half_height": 0.13,
+    "mass": 1.2,
+    "collision_group": "arm",
+    "collision_mask": "world,props"
+  }
+}
+----
+
+The `collider` field specifies the shape type. The `radius` and `half_height` define the capsule's dimensions in the bone's local coordinate space. The `mass` gives the physics engine the weight of this body segment (important for realistic ragdoll behavior—a forearm should weigh significantly less than a torso). The `collision_group` and `collision_mask` fields we will use for self-collision filtering in a later section.
+
+On the C{pp} side, we extend the node parsing code from Chapter 2 to also extract physics metadata:
+
+[source,cpp]
+----
+// Collider parameters read from a node's glTF "extras" -> "physics" object.
+// A default-constructed ColliderDef has shape NONE, meaning "no collider".
+struct ColliderDef {
+    enum class Shape { CAPSULE, BOX, NONE };
+    Shape shape     = Shape::NONE;
+    // Capsule dimensions ("radius" / "half_height" keys).
+    float radius    = 0.0f;
+    float half_height = 0.0f;
+    // Box dimensions ("box_half_extents" key, three numbers).
+    glm::vec3 box_half_extents = {};
+    // Mass of the body segment ("mass" key); defaults to 1 when absent.
+    float mass      = 1.0f;
+    // Raw strings from "collision_group" / "collision_mask"; stored unparsed.
+    std::string collision_group;
+    std::string collision_mask;
+};
+
+// Extracts a ColliderDef from a node's glTF extras.
+//
+// Returns a default ColliderDef (shape NONE) when the extras carry no usable
+// collider: no "physics" object, a missing / non-string / unknown "collider"
+// value, or dimensions that are missing or non-positive for the chosen shape.
+// Fields of the wrong JSON type are ignored and keep their defaults.
+ColliderDef parse_collider_extras(const tinygltf::Value& extras)
+{
+    ColliderDef def;
+    if (!extras.Has("physics")) return def;
+
+    const auto& phys = extras.Get("physics");
+    if (!phys.Has("collider") || !phys.Get("collider").IsString()) return def;
+
+    const std::string shape_str = phys.Get("collider").Get<std::string>();
+    if      (shape_str == "capsule") def.shape = ColliderDef::Shape::CAPSULE;
+    else if (shape_str == "box")     def.shape = ColliderDef::Shape::BOX;
+    else                             return def; // unknown shape, skip
+
+    // Reads an optional numeric field; anything that is not a number is ignored.
+    const auto read_float = [&phys](const char* key, float& target) {
+        if (phys.Has(key) && phys.Get(key).IsNumber()) {
+            target = static_cast<float>(phys.Get(key).GetNumberAsDouble());
+        }
+    };
+    // Reads an optional string field; anything that is not a string is ignored.
+    const auto read_string = [&phys](const char* key, std::string& target) {
+        if (phys.Has(key) && phys.Get(key).IsString()) {
+            target = phys.Get(key).Get<std::string>();
+        }
+    };
+
+    read_float("radius",      def.radius);
+    read_float("half_height", def.half_height);
+    read_float("mass",        def.mass);
+
+    if (phys.Has("box_half_extents")) {
+        const auto& ext = phys.Get("box_half_extents");
+        // Require exactly the three components we read below.
+        if (ext.IsArray() && ext.ArrayLen() == 3) {
+            def.box_half_extents = {
+                static_cast<float>(ext.Get(0).GetNumberAsDouble()),
+                static_cast<float>(ext.Get(1).GetNumberAsDouble()),
+                static_cast<float>(ext.Get(2).GetNumberAsDouble())
+            };
+        }
+    }
+
+    read_string("collision_group", def.collision_group);
+    read_string("collision_mask",  def.collision_mask);
+
+    // A recognised shape with no usable dimensions cannot become a physics
+    // body; treat it as "no collider" instead of handing it to the physics engine.
+    const bool capsule_ok = def.radius > 0.0f && def.half_height > 0.0f;
+    const bool box_ok     = def.box_half_extents.x > 0.0f &&
+                            def.box_half_extents.y > 0.0f &&
+                            def.box_half_extents.z > 0.0f;
+    if (def.shape == ColliderDef::Shape::CAPSULE && !capsule_ok) return ColliderDef{};
+    if (def.shape == ColliderDef::Shape::BOX     && !box_ok)     return ColliderDef{};
+
+    return def;
+}
+----
+
+Notice that we return early with a default `NONE` shape if the extras don't contain the data we expect. This graceful degradation is important: bones without physics metadata should simply have no collider, rather than crashing the loader.
+
+== Creating the Physics Bodies
+
+With the collider definitions parsed, we can create the actual physics bodies. Each bone with a collider definition needs a **rigid body**: a physics object that has a position, orientation, mass, and collision shape, and that participates in the physics simulation.
+
+During the animation phase, we drive these rigid bodies from the scene graph (making them **kinematic**—they follow our transforms rather than being simulated). During the ragdoll phase, we flip them to **dynamic**—they become simulated objects that produce transforms for us to read back. We will handle that state switch in the ragdoll handoff section; for now, we create them all as kinematic.
+
+[source,cpp]
+----
+// Links one skeleton joint to the physics body that represents it.
+struct BoneBody {
+    uint32_t    node_index;      // Index into our scene graph Node array
+    JPH::BodyID physics_body;    // Jolt Physics body ID
+    ColliderDef collider_def;    // The parameters we parsed from extras
+};
+
+// Creates one kinematic physics body for every joint of `skin` whose node
+// carries a collider definition.
+//
+// nodes         - the scene graph; `skin.joints` holds indices into it.
+// skin          - the skin whose joints are turned into bodies.
+// physics_world - backend that owns the created bodies.
+//
+// Returns the joint-to-body links. Joints are skipped (not reported as errors)
+// when they have no collider, reference a node outside `nodes`, describe a
+// shape Jolt rejects, or when the physics world cannot allocate another body.
+std::vector<BoneBody> create_bone_bodies(
+    const std::vector<Node>& nodes,
+    const Skin& skin,
+    PhysicsWorld& physics_world)
+{
+    std::vector<BoneBody> bodies;
+    bodies.reserve(skin.joints.size());
+
+    for (const uint32_t node_index : skin.joints) {
+        // A corrupt skin must not index past the node array.
+        if (node_index >= nodes.size()) continue;
+        const ColliderDef& def = nodes[node_index].collider_def;
+
+        // Build the shape through Jolt's *ShapeSettings::Create(), which
+        // validates the dimensions and reports an error instead of asserting.
+        JPH::Ref<JPH::Shape> shape;
+        switch (def.shape) {
+        case ColliderDef::Shape::CAPSULE: {
+            JPH::CapsuleShapeSettings capsule(def.half_height, def.radius);
+            capsule.SetEmbedded(); // stack object: must not be freed by ref counting
+            const JPH::ShapeSettings::ShapeResult result = capsule.Create();
+            if (result.IsValid()) shape = result.Get();
+            break;
+        }
+        case ColliderDef::Shape::BOX: {
+            JPH::BoxShapeSettings box(JPH::Vec3(
+                def.box_half_extents.x,
+                def.box_half_extents.y,
+                def.box_half_extents.z));
+            box.SetEmbedded();
+            const JPH::ShapeSettings::ShapeResult result = box.Create();
+            if (result.IsValid()) shape = result.Get();
+            break;
+        }
+        case ColliderDef::Shape::NONE:
+        default:
+            break;
+        }
+
+        if (!shape) continue;
+
+        // Create the rigid body at the bone's current world position.
+        // We start it as KINEMATIC - it will be driven by the animation system.
+        JPH::BodyCreationSettings settings;
+        settings.SetShape(shape);
+        settings.mMassPropertiesOverride.mMass = def.mass;
+        settings.mOverrideMassProperties = JPH::EOverrideMassProperties::CalculateInertia;
+
+        const PhysicsPose pose = decompose_to_pose(nodes[node_index].world_matrix);
+        // mPosition is an RVec3, which is double precision in some Jolt builds.
+        settings.mPosition = JPH::RVec3(pose.position.x, pose.position.y, pose.position.z);
+        settings.mRotation = JPH::Quat(pose.orientation.x, pose.orientation.y, pose.orientation.z, pose.orientation.w);
+        settings.mMotionType = JPH::EMotionType::Kinematic;
+
+        const JPH::BodyID body = physics_world.create_body(settings);
+        // Do not record a body the backend failed to create.
+        if (body.IsInvalid()) continue;
+
+        bodies.push_back({ node_index, body, def });
+    }
+
+    return bodies;
+}
+----
+
+The `decompose_to_pose` function is worth examining briefly. Physics engines typically represent a body's state as a position and a quaternion, not as a 4x4 matrix. We need to decompose the scene graph's matrix representation into these components:
+
+[source,cpp]
+----
+PhysicsPose decompose_to_pose(const glm::mat4& world_matrix)
+{
+    // Rotation: start from the upper-left 3x3 block, which still carries any
+    // scale, and rescale each basis column to unit length. What remains is a
+    // pure rotation matrix that glm::quat_cast can turn into a quaternion.
+    glm::mat3 basis(world_matrix);
+    for (int axis = 0; axis < 3; ++axis) {
+        basis[axis] = glm::normalize(basis[axis]);
+    }
+
+    PhysicsPose result;
+    // Translation: the xyz components of the matrix's last column.
+    result.position    = glm::vec3(world_matrix[3]);
+    result.orientation = glm::quat_cast(basis);
+
+    return result;
+}
+----
+
+== Updating Kinematic Bodies Each Frame
+
+Every frame, while the character is animated (not ragdolling), we need to update each bone's physics body to match the scene graph's current world matrix. This is the "Animation to Physics" direction of the bi-directional link we discussed in Chapter 2.
+
+[source,cpp]
+----
+void sync_animation_to_physics(
+    const std::vector<Node>& nodes,
+    std::vector<BoneBody>& bodies,
+    PhysicsWorld& physics_world)
+{
+    // Push each bone's animated world transform onto its physics body.
+    for (BoneBody& entry : bodies) {
+        PhysicsPose target = decompose_to_pose(nodes[entry.node_index].world_matrix);
+
+        // Go through move_kinematic instead of teleporting the body: the
+        // physics engine then works out the body's velocity from how far it
+        // travelled since the previous frame, and the ragdoll handoff
+        // relies on that velocity.
+        physics_world.move_kinematic(entry.physics_body, target);
+    }
+}
+----
+
+There is a subtle but important reason to use `move_kinematic` rather than simply teleporting the body to the new position: most physics engines use the delta between the previous and current position to estimate the body's velocity. When we later flip the body to dynamic mode for the ragdoll, the physics engine needs a realistic initial velocity so the ragdoll doesn't start from a dead stop. If we had been teleporting the body rather than moving it continuously, that velocity estimate would be wrong, and the ragdoll handoff would look wrong.
+
+xref:Advanced_glTF/Physics_Integration/01_introduction.adoc[Previous: Introduction] | xref:Advanced_glTF/Physics_Integration/03_constraints_and_joint_limits.adoc[Next: Constraints & Joint Limits]
diff --git a/en/Advanced_glTF/Physics_Integration/03_constraints_and_joint_limits.adoc b/en/Advanced_glTF/Physics_Integration/03_constraints_and_joint_limits.adoc
new file mode 100644
index 000000000..ca0dedc70
--- /dev/null
+++ b/en/Advanced_glTF/Physics_Integration/03_constraints_and_joint_limits.adoc
@@ -0,0 +1,188 @@
+:pp: {plus}{plus}
+= Constraints & Joint Limits
+
+== Why Unconstrained Ragdolls Are Unwatchable
+
+If you took the physics bodies we created in the previous section, switched them all to dynamic mode right now, and let the physics solver run, you would get a ragdoll—but a terrible one. Every body part would flail independently. The arm would not be connected to the shoulder. The knee would bend sideways, backward, forward, and in a complete circle. The spine would crumple into a chaotic pile of disconnected body segments. The character would look less like a falling person and more like a bag of loosely related objects that happened to occupy the same space a moment ago.
+
+The problem is that physics bodies, by default, have no knowledge of each other. They are individual simulated objects. The thing that connects them—the thing that makes a knee *a knee* and a shoulder *a shoulder*—is a **constraint**. A physics constraint is a rule that the solver must obey: it can say "these two bodies must stay a fixed distance apart," or "this body can rotate relative to that one, but only within this angular range," or "this body is attached to that one and can pivot freely in any direction."
+
+Without constraints, you have a pile of separate bricks. With constraints, you have a skeleton.
+
+== The Two Constraints Every Humanoid Needs
+
+For a humanoid ragdoll, two constraint types cover the entire body: the **Ball-and-Socket** constraint and the **Hinge** constraint.
+
+A Ball-and-Socket constraint (sometimes called a Spherical constraint) allows rotation in any direction but prevents the bodies from separating. Think of how a human shoulder or hip works: you can rotate your arm or leg in a wide cone of directions, but the joint itself doesn't translate—the ball stays in the socket. This is the right constraint for shoulders, hips, the neck, and the ankles. In physics engine terms, it removes all three translational degrees of freedom but leaves three rotational degrees of freedom active, subject to angular limits.
+
+A Hinge constraint allows rotation around exactly one axis. Think of a knee: it bends in one plane and does not rotate laterally or axially (at least not significantly). A hinge constraint enforces this single-axis rotation and is the right choice for knees, elbows, and sometimes the wrist depending on how much detail your ragdoll requires. It removes all three translational degrees of freedom and two of the three rotational degrees of freedom, leaving only one.
+
+The distinction matters because it directly determines how a ragdoll reads to the eye. A knee that bends sideways is immediately, viscerally wrong. A shoulder that bends in a cone looks right even if the exact cone angles aren't biologically precise. Matching the constraint type to the anatomical function is the single most important thing you can do to make a ragdoll look plausible.
+
+== Angular Limits: The Human Range of Motion
+
+A constraint type alone is not enough. A ball-and-socket joint with no angular limits allows the arm to rotate 360 degrees—which means a character's arm could point straight backward behind their shoulder, or spin like a helicopter blade. Even though a real shoulder is a ball-and-socket joint, it has a finite range of motion defined by soft tissue, and exceeding that range produces the "arm rotating impossibly" look that makes ragdolls laughable.
+
+Angular limits tell the physics solver the valid range of rotation for each axis. For a Hinge constraint, this is straightforward: a minimum and maximum angle around the single hinge axis. For a Ball-and-Socket, it is more complex—we typically specify a **swing limit** (the cone half-angle describing how far the bone can deviate from its neutral axis) and a **twist limit** (how far the bone can rotate around its own primary axis).
+
+Storing these limits in glTF extras follows the same pattern as the collider definitions. A shoulder bone's extras might look like this:
+
+[source,json]
+----
+{
+  "physics": {
+    "collider": "capsule",
+    "radius": 0.06,
+    "half_height": 0.14,
+    "mass": 2.5,
+    "constraint": {
+      "type": "ball_socket",
+      "swing_limit_deg": 80,
+      "twist_limit_deg": 45,
+      "parent_bone": "spine_upper"
+    },
+    "collision_group": "arm",
+    "collision_mask": "world,props"
+  }
+}
+----
+
+And a knee bone's extras:
+
+[source,json]
+----
+{
+  "physics": {
+    "collider": "capsule",
+    "radius": 0.055,
+    "half_height": 0.20,
+    "mass": 3.0,
+    "constraint": {
+      "type": "hinge",
+      "hinge_axis": [0, 0, 1],
+      "limit_min_deg": -140,
+      "limit_max_deg": 0,
+      "parent_bone": "thigh"
+    },
+    "collision_group": "leg",
+    "collision_mask": "world,props"
+  }
+}
+----
+
+The `parent_bone` field tells us which body this constraint connects to. The constraint will be created between the parent bone's physics body and this bone's physics body. The `hinge_axis` specifies the rotation axis in the bone's local space.
+
+== Parsing Constraint Definitions
+
+We extend our extras parsing to capture constraint definitions:
+
+[source,cpp]
+----
+struct ConstraintDef {
+    enum class Type { NONE, BALL_SOCKET, HINGE };
+
+    Type        type              = Type::NONE;   // NONE: no constraint is created for this bone
+    float       swing_limit_deg   = 180.0f;   // Ball-socket: swing cone half-angle, in degrees
+    float       twist_limit_deg   = 180.0f;   // Ball-socket: twist limit, in degrees
+    float       hinge_min_deg     = -180.0f;  // Hinge: minimum angle, in degrees ("limit_min_deg" in extras)
+    float       hinge_max_deg     =  180.0f;  // Hinge: maximum angle, in degrees ("limit_max_deg" in extras)
+    glm::vec3   hinge_axis        = {0,0,1};  // Hinge: rotation axis in the bone's local space
+    std::string parent_bone;                  // Name of the parent node; empty means no constraint is created
+};
+
+ConstraintDef parse_constraint_def(const tinygltf::Value& phys_extras)
+{
+    // A missing or non-object "constraint", or an unknown "type", yields Type::NONE.
+    ConstraintDef def;
+    if (!phys_extras.Has("constraint")) return def;
+    const auto& con = phys_extras.Get("constraint");
+    if (!con.IsObject()) return def;
+
+    const std::string type_str = con.Get("type").Get<std::string>();
+    if      (type_str == "ball_socket") def.type = ConstraintDef::Type::BALL_SOCKET;
+    else if (type_str == "hinge")       def.type = ConstraintDef::Type::HINGE;
+    else                                return def;
+
+    // Optional limits: a key that is absent keeps the struct default.
+    if (con.Has("swing_limit_deg")) def.swing_limit_deg = static_cast<float>(con.Get("swing_limit_deg").GetNumberAsDouble());
+    if (con.Has("twist_limit_deg")) def.twist_limit_deg = static_cast<float>(con.Get("twist_limit_deg").GetNumberAsDouble());
+    if (con.Has("limit_min_deg"))   def.hinge_min_deg   = static_cast<float>(con.Get("limit_min_deg").GetNumberAsDouble());
+    if (con.Has("limit_max_deg"))   def.hinge_max_deg   = static_cast<float>(con.Get("limit_max_deg").GetNumberAsDouble());
+
+    // Use hinge_axis only if it is a non-zero 3-element array; store it normalized.
+    const auto& ax = con.Get("hinge_axis");
+    if (ax.IsArray() && ax.ArrayLen() == 3) {
+        const glm::vec3 axis(static_cast<float>(ax.Get(0).GetNumberAsDouble()),
+                             static_cast<float>(ax.Get(1).GetNumberAsDouble()),
+                             static_cast<float>(ax.Get(2).GetNumberAsDouble()));
+        if (glm::length(axis) > 0.0f) def.hinge_axis = glm::normalize(axis);
+    }
+    if (con.Has("parent_bone")) def.parent_bone = con.Get("parent_bone").Get<std::string>();
+    return def;
+}
+----
+
+== Creating the Constraints
+
+With the definitions parsed, we create the actual physics constraints. This happens after all bone bodies have been created, because a constraint must reference two bodies—and both must exist before the constraint can be registered with the physics engine.
+
+[source,cpp]
+----
+void create_ragdoll_constraints(
+    const std::vector<Node>& nodes,
+    const std::map<std::string, uint32_t>& name_to_node,
+    std::vector<BoneBody>& bodies,
+    std::map<uint32_t, JPH::BodyID>& node_to_body,
+    PhysicsWorld& physics_world)
+{
+    // Links each bone body to its parent's body as described by the bone's ConstraintDef.
+    // Bones with no constraint, no parent name, or an unresolvable parent are skipped.
+    for (auto& child : bodies) {
+        const ConstraintDef& def = nodes[child.node_index].constraint_def;
+        if (def.type == ConstraintDef::Type::NONE || def.parent_bone.empty()) continue;
+
+        // Resolve the parent: bone name -> node index -> physics body.
+        const auto name_hit = name_to_node.find(def.parent_bone);
+        if (name_hit == name_to_node.end()) continue;
+
+        const auto body_hit = node_to_body.find(name_hit->second);
+        if (body_hit == node_to_body.end()) continue;
+
+        JPH::BodyID parent_id = body_hit->second;
+        JPH::BodyID child_id  = child.physics_body;
+
+        // Limits are stored in degrees and converted to radians before being passed on.
+        switch (def.type) {
+        case ConstraintDef::Type::BALL_SOCKET:
+            physics_world.create_ball_socket_constraint(
+                parent_id, child_id,
+                glm::radians(def.swing_limit_deg),
+                glm::radians(def.twist_limit_deg));
+            break;
+        case ConstraintDef::Type::HINGE:
+            physics_world.create_hinge_constraint(
+                parent_id, child_id, def.hinge_axis,
+                glm::radians(def.hinge_min_deg),
+                glm::radians(def.hinge_max_deg));
+            break;
+        default: break;
+        }
+    }
+}
+----
+
+There is an important ordering consideration here. The constraints should be created after all bodies are added to the physics world, because some engines (Jolt Physics, for example) require all bodies to be present in the simulation's broadphase before constraints referencing them can be safely registered.
+
+== Reference Limits for a Humanoid
+
+To give you a starting point, here are reasonable angular limits for a generic humanoid ragdoll. These are approximate biomechanical values from sports medicine literature, slightly loosened so the ragdoll reads as believable without requiring precise tuning:
+
+The shoulder (ball-and-socket) can swing roughly 90 to 100 degrees from neutral and twist around 45 degrees in each direction. The elbow (hinge) flexes from 0 to about 145 degrees, with no meaningful angular freedom in any other axis. The wrist has some lateral deviation (about ±20 degrees) and some flexion/extension (about 70 degrees flexion, 60 degrees extension)—often simplified to a single hinge for ragdoll purposes.
+
+The hip (ball-and-socket) swings about 120 degrees forward, 15 degrees backward, and 45 degrees laterally, with about 45 degrees of axial twist. The knee (hinge) flexes from 0 to about 135 degrees. The ankle has flexion and extension of roughly 50 degrees total, often simplified to a hinge.
+
+The spine is the most complex case. In reality it is a series of small joints, each with a few degrees of freedom. For a ragdoll, it is common to model the spine as two or three ball-and-socket constraints (lower spine to mid-spine, mid-spine to upper spine) each with a small swing limit of about 15 to 20 degrees—enough to allow the spine to curve and crumple, but not enough to allow grotesque folding.
+
+These values are a starting point, not a specification. You will need to tune them for your specific art style and gameplay feel. A stylized action game may want exaggerated ranges. A realistic simulation may need tighter limits derived from biomechanical measurement. The right answer is the one that feels right to play.
+
+xref:Advanced_glTF/Physics_Integration/02_bone_proxy_colliders.adoc[Previous: Bone Proxy Colliders] | xref:Advanced_glTF/Physics_Integration/04_ragdoll_handoff.adoc[Next: The Ragdoll Handoff]
diff --git a/en/Advanced_glTF/Physics_Integration/04_ragdoll_handoff.adoc b/en/Advanced_glTF/Physics_Integration/04_ragdoll_handoff.adoc
new file mode 100644
index 000000000..ecf8231a4
--- /dev/null
+++ b/en/Advanced_glTF/Physics_Integration/04_ragdoll_handoff.adoc
@@ -0,0 +1,203 @@
+:pp: {plus}{plus}
+= The Ragdoll Handoff
+
+== The Problem with Simply Switching Modes
+
+By the time a character dies, is knocked unconscious, or is hit by a physics impulse, you have a skeleton that is being driven by an animation system. Every frame, the animation system is looking up keyframes, blending them, and depositing the results into the scene graph's joint transforms. The compute shader is then using those joint transforms to skin the mesh. Everything is deterministic, clean, and animator-controlled.
+
+Then something happens—a death event fires, a health counter hits zero, a collision impulse arrives—and you need to hand control of the skeleton over to the physics engine. The physics engine needs to take those joint positions, give each body segment a starting velocity, and then simulate the fall purely from the laws of physics.
+
+The naive approach is to simply set every bone body from kinematic mode to dynamic mode and let the physics engine run. In theory, this should work. In practice, it usually produces a jarring visual artifact: the character snaps to a slightly different pose, or the limbs pop outward from their constrained positions, or the ragdoll appears to explode away from its animated position. This is the **ragdoll handoff problem**, and it is caused by three closely related issues that we need to address explicitly.
+
+The first issue is **pose initialization**. If the physics bodies were being moved kinematically, they should already be at the correct world-space positions—provided you have been using `move_kinematic` correctly as we discussed in the Bone Proxy Colliders section, not teleporting the bodies. But we need to verify that the physics engine's internal state actually matches the scene graph's current state before we flip the mode switch.
+
+The second issue is **velocity initialization**. A character in the middle of a running animation is not stationary. Their arms are swinging, their body is rotating, their center of mass is moving forward. If we switch to ragdoll physics and the starting velocity of every body segment is zero, the ragdoll will immediately produce a wrong result: the character will not continue moving forward, the arms will snap to hang at the sides, and the simulation will "teleport" the character into a state that doesn't match where the animation left off.
+
+The third issue is **constraint resolution**. When we switch from kinematic to dynamic, the constraints between bodies become active in the solver. If the bodies are not positioned such that the constraints are already nearly satisfied, the solver's first frame will produce large corrective forces to pull the bodies into valid positions—which looks like an explosion.
+
+== The State Machine
+
+The cleanest way to manage the handoff is with an explicit state machine for each character's ragdoll. The states we need are:
+
+**ANIMATED**: The animation system is fully in control. All bone bodies are kinematic. The physics engine knows where the bodies are but does not simulate them.
+
+**BLENDED**: A transitional state used when we want to blend from animated to ragdoll over a few frames, rather than switching instantly. The animation system is still running, but we are progressively reducing the influence of the animated pose. This state is optional but produces much better-looking results.
+
+**RAGDOLL**: The physics engine is fully in control. All bone bodies are dynamic. The animation system is still running its logic (so we can blend back out later if needed), but we read from the physics engine's output, not the animation system's output.
+
+[source,cpp]
+----
+enum class RagdollState {
+    ANIMATED,  // Animation drives the skeleton; all bone bodies are kinematic
+    BLENDED,   // Transitional - animation blending out, physics blending in
+    RAGDOLL    // Physics drives the skeleton; all bone bodies are dynamic
+};
+
+struct RagdollController {
+    RagdollState state          = RagdollState::ANIMATED;  // Controllers start fully animation-driven
+    float        blend_weight   = 0.0f;   // 0 = full animation, 1 = full ragdoll
+    float        blend_duration = 0.15f;  // Transition length in seconds; blend_elapsed is divided by it
+    float        blend_elapsed  = 0.0f;   // Seconds spent in the BLENDED state so far
+};
+----
+
+The `blend_weight` is what we passed to the animation system in Chapter 3's cross-fade blending section—it controls how much of the final skeleton pose comes from the animation versus the physics. By linearly increasing it from 0 to 1 over `blend_duration` seconds, we get a smooth visual transition instead of a hard snap.
+
+== Triggering the Handoff
+
+When we want to start a ragdoll transition, we call a function that initializes the transition state and seeds every bone body with the character's current velocity:
+
+[source,cpp]
+----
+void begin_ragdoll_transition(
+    RagdollController& controller,
+    std::vector<BoneBody>& bodies,
+    const glm::vec3& character_root_velocity,
+    PhysicsWorld& physics_world)
+{
+    // Only an ANIMATED character can begin a transition; otherwise do nothing.
+    if (controller.state != RagdollState::ANIMATED) return;
+
+    // Restart the blend from "100% animation".
+    controller.blend_weight  = 0.0f;
+    controller.blend_elapsed = 0.0f;
+    controller.state         = RagdollState::BLENDED;
+
+    for (auto& segment : bodies) {
+        // Seed every bone body (not only the root) with the character's root
+        // velocity so the ragdoll does not start from a dead stop. Per-bone
+        // velocities from the animation's joint velocities would be more accurate.
+        // NOTE(review): move_kinematic runs during BLENDED and may override this - confirm.
+        physics_world.set_linear_velocity(segment.physics_body, character_root_velocity);
+
+        // Activate the body so the engine simulates it next frame. It stays
+        // kinematic for now; the switch to dynamic waits until blend_weight reaches 1.
+        physics_world.activate_body(segment.physics_body);
+    }
+}
+----
+
+The velocity initialization is the most important step here. The `character_root_velocity` should come from your character controller—it is the velocity vector the character was moving at when the ragdoll trigger fired. In a typical game, this is the velocity you were applying to the character capsule each frame. Providing this as the initial ragdoll velocity ensures the character continues moving in the same direction after the handoff, rather than collapsing in place.
+
+A more accurate (and more complex) approach is to compute per-bone velocities from the animation's joint velocity data. Many animation systems track the rate of change of each joint, and you can use that to initialize each bone body's angular velocity independently—giving the arms their own swing momentum rather than uniform root velocity. This produces more realistic ragdolls, especially for characters in the middle of vigorous animations, but requires more work to implement correctly.
+
+== The Transition Frame
+
+Each frame while in the BLENDED state, we need to advance the blend weight and, when it reaches 1, complete the mode switch:
+
+[source,cpp]
+----
+void update_ragdoll_controller(
+    RagdollController& controller,
+    std::vector<BoneBody>& bodies,
+    std::vector<Node>& nodes,
+    PhysicsWorld& physics_world,
+    float delta_time)
+{
+    // Advances the ragdoll state machine by one frame of delta_time seconds.
+    if (controller.state == RagdollState::ANIMATED) {
+        // Sync animation transforms to kinematic physics bodies every frame
+        sync_animation_to_physics(nodes, bodies, physics_world);
+        return;
+    }
+
+    if (controller.state == RagdollState::BLENDED) {
+        controller.blend_elapsed += delta_time;
+        // blend_duration <= 0 would give NaN/inf or a weight that never reaches 1: treat it as instant.
+        controller.blend_weight = (controller.blend_duration > 0.0f)
+            ? std::min(controller.blend_elapsed / controller.blend_duration, 1.0f) : 1.0f;
+
+        // Keep syncing the kinematic bodies while blending so the physics engine has up-to-date positions
+        sync_animation_to_physics(nodes, bodies, physics_world);
+
+        if (controller.blend_weight >= 1.0f) {
+            // Transition complete: switch all bodies from kinematic to dynamic
+            for (auto& bone_body : bodies) {
+                physics_world.set_motion_type(bone_body.physics_body, JPH::EMotionType::Dynamic);
+            }
+            controller.state = RagdollState::RAGDOLL;
+        }
+        return;
+    }
+
+    // RAGDOLL: read back physics positions into the scene graph
+    if (controller.state == RagdollState::RAGDOLL) sync_physics_to_animation(nodes, bodies, physics_world);
+}
+----
+
+Notice that we continue syncing the kinematic bodies during the BLENDED phase. This is important: during the blend, the physics engine may still be resolving constraint violations from the pose at the moment the transition began. Keeping the bodies moving in sync prevents the constraints from fighting each other during the transition.
+
+== Reading Physics Results Back into the Scene Graph
+
+Once we are in RAGDOLL state, the physics engine owns the skeleton. Each frame, we need to read the simulated body positions back into our scene graph so that the joint world matrices—and therefore the compute skinning pipeline—reflect the physics output.
+
+[source,cpp]
+----
+void sync_physics_to_animation(
+    std::vector<Node>& nodes,
+    const std::vector<BoneBody>& bodies,
+    PhysicsWorld& physics_world)
+{
+    // Copies each simulated body's pose back into the node it drives.
+    // Only nodes referenced by `bodies` are touched here: everything else
+    // (scene props, non-ragdoll characters) must still go through the normal
+    // dirty-flag propagation, whose update pass should skip Clean nodes.
+    for (const BoneBody& entry : bodies) {
+        Node& driven = nodes[entry.node_index];
+
+        // World-space position and orientation produced by the simulation.
+        PhysicsPose simulated = physics_world.get_body_pose(entry.physics_body);
+
+        // Rebuild a world matrix from that position + quaternion.
+        driven.world_matrix = simulated.to_matrix();
+
+        // The world matrix was written directly, so flag the node Clean to
+        // keep the scene graph from recalculating it from local transforms
+        // this frame.
+        driven.status = TransformStatus::Clean;
+    }
+}
+----
+
+There is a subtle problem here that requires care: our scene graph from Chapter 2 calculates world matrices from local transforms and parent matrices, in topological order. But during ragdoll, we are writing world matrices directly—bypassing that entire calculation. We need to ensure the scene graph update loop does not overwrite our physics-derived world matrices by recalculating them from the local transforms.
+
+The `TransformStatus::Clean` status handles this if the scene graph update code is written to skip clean nodes entirely. If your scene graph always recalculates, you need to add an explicit `is_physics_driven` flag that the update loop checks before computing the world matrix.
+
+== The Visual Blend
+
+The `blend_weight` we have been tracking in the controller is what gets passed to the character's animation blending system. When it is 0, the character's pose comes entirely from the animation. When it is 1, it comes entirely from the ragdoll. In between, we use linear interpolation to blend the two:
+
+[source,cpp]
+----
+// After updating physics and animation separately:
+if (controller.state == RagdollState::BLENDED) {
+    for (const auto& bone_body : bodies) {
+        Node& node = nodes[bone_body.node_index];
+
+        // Animated world matrix (from scene graph dirty-flag pass)
+        glm::mat4& anim_matrix = node.world_matrix_animated;
+
+        // Physics world matrix (from sync_physics_to_animation)
+        PhysicsPose pose = physics_world.get_body_pose(bone_body.physics_body);
+        glm::mat4 phys_matrix = glm::translate(glm::mat4(1.0f), pose.position)
+                              * glm::mat4_cast(pose.orientation);
+
+        // Decompose both to TRS, lerp/slerp, recompose
+        glm::vec3 anim_pos = glm::vec3(anim_matrix[3]);
+        glm::quat anim_rot = glm::quat_cast(glm::mat3(anim_matrix));
+        glm::vec3 phys_pos = glm::vec3(phys_matrix[3]);
+        glm::quat phys_rot = glm::quat_cast(glm::mat3(phys_matrix));
+
+        float w = controller.blend_weight;
+        glm::vec3 blended_pos = glm::mix(anim_pos, phys_pos, w);
+        glm::quat blended_rot = glm::slerp(anim_rot, phys_rot, w);
+
+        node.world_matrix = glm::translate(glm::mat4(1.0f), blended_pos)
+                          * glm::mat4_cast(blended_rot);
+    }
+}
+----
+
+The use of `glm::slerp` (Spherical Linear intERPolation) for quaternion blending is essential here. Direct linear interpolation of quaternions does not preserve unit length, and even after renormalizing the result the rotation does not advance at a constant rate: it moves faster in the middle of the blend than at either end. SLERP produces smooth, constant-speed rotation transitions that are geometrically correct.
+
+xref:Advanced_glTF/Physics_Integration/03_constraints_and_joint_limits.adoc[Previous: Constraints & Joint Limits] | xref:Advanced_glTF/Physics_Integration/05_self_collision_filtering.adoc[Next: Self-Collision Filtering]
diff --git a/en/Advanced_glTF/Physics_Integration/05_self_collision_filtering.adoc b/en/Advanced_glTF/Physics_Integration/05_self_collision_filtering.adoc
new file mode 100644
index 000000000..b3f829559
--- /dev/null
+++ b/en/Advanced_glTF/Physics_Integration/05_self_collision_filtering.adoc
@@ -0,0 +1,152 @@
+:pp: {plus}{plus}
+= Self-Collision Filtering
+
+== Why Characters Explode Without Filtering
+
+When you enable full collision detection between all of a ragdoll's body parts, the simulation becomes unstable almost immediately. The problem is geometric overlap: in a real human body, adjacent body segments share volume. The arm is in contact with the torso. The thigh overlaps slightly with the pelvis at the hip socket. The spine segments interpenetrate. This is fine for visual purposes—the skin mesh hides all of this—but the physics engine doesn't know about the skin. It sees the underlying collision shapes, and if those shapes overlap, the solver interprets the overlap as a penetration that must be resolved with a large separating force.
+
+The result is that adjacent bone bodies push each other apart violently. The arm gets ejected from the torso. The thighs drive the pelvis upward. The spine segments fly apart. The constraints try to pull everything back together, the contacts push everything apart, and the simulation enters a state of permanent internal conflict that produces the classic "exploding ragdoll" artifact.
+
+The solution is to tell the physics engine which pairs of bodies should *not* collide with each other. This is called **collision filtering**. Physics engines implement it using bitmasks: each body belongs to one or more **collision groups**, and each body has a **collision mask** that specifies which groups it is allowed to collide with. Two bodies will only generate collision contacts if the first body's group bit is set in the second body's mask, and vice versa.
+
+== Designing the Bitmask Layout
+
+The art of collision filtering is in choosing a bitmask layout that correctly expresses the collision rules for your ragdoll without requiring you to enumerate every forbidden pair explicitly. A well-designed layout lets you express the rules compactly and extend them easily.
+
+For a humanoid character, a practical layout organizes body parts by anatomical region. Each region becomes a named bit:
+
+[source,cpp]
+----
+// Group membership bits for the humanoid ragdoll.
+// Each anatomical region (plus the two environment categories) owns exactly
+// one bit, so a set of groups can be expressed by OR-ing these constants.
+namespace CollisionGroup {
+    constexpr uint32_t WORLD     = 0x001u; // Static world geometry
+    constexpr uint32_t PROPS     = 0x002u; // Dynamic world objects (crates, barrels, etc.)
+    constexpr uint32_t TORSO     = 0x004u; // Chest, spine, pelvis
+    constexpr uint32_t HEAD      = 0x008u; // Skull and neck
+    constexpr uint32_t ARM_L     = 0x010u; // Left upper arm and forearm
+    constexpr uint32_t ARM_R     = 0x020u; // Right upper arm and forearm
+    constexpr uint32_t HAND_L    = 0x040u; // Left hand and wrist
+    constexpr uint32_t HAND_R    = 0x080u; // Right hand and wrist
+    constexpr uint32_t LEG_L     = 0x100u; // Left thigh and shin
+    constexpr uint32_t LEG_R     = 0x200u; // Right thigh and shin
+    constexpr uint32_t FOOT_L    = 0x400u; // Left foot and ankle
+    constexpr uint32_t FOOT_R    = 0x800u; // Right foot and ankle
+}
+----
+
+With this layout, the torso can be assigned to the `TORSO` group, and its collision mask can be set to allow collisions only with `WORLD` and `PROPS`—meaning it will collide with the environment and movable objects, but not with any of the character's own limbs. The arms similarly collide with the world and props, and additionally with the arm on the opposite side of the body, but not with the torso or with other segments of the same arm.
+
+[source,cpp]
+----
+// Collision masks define what each group can collide WITH.
+//
+// Every group bit below is qualified explicitly (through the alias CG) rather
+// than pulled in with `using namespace CollisionGroup;`. The mask constants
+// deliberately share their names with the group constants, and a name declared
+// in this namespace hides the same name made visible by a using-directive. With
+// unqualified names, `ARM_R = WORLD | PROPS | ARM_L` would silently pick up the
+// *mask* ARM_L declared one line earlier instead of the group bit, so the
+// right-hand limbs would end up admitting their own side instead of the
+// opposite one.
+//
+// Contacts are only generated when each body's group is present in the other
+// body's mask, so every left/right pairing is listed on both sides.
+namespace CollisionMask {
+    namespace CG = CollisionGroup;
+
+    // The torso collides with world and props, but NOT with its own arms, legs, or head.
+    // Internal ragdoll self-collision is handled by constraints, not contacts.
+    constexpr uint32_t TORSO = CG::WORLD | CG::PROPS;
+
+    // The head doesn't collide with the neck/torso (they are constrained together),
+    // but can hit the world.
+    constexpr uint32_t HEAD = CG::WORLD | CG::PROPS;
+
+    // Arms and hands collide with world and props, and with the arm and hand of
+    // the opposing side (left arm can interact with right arm if they cross),
+    // but not with the torso or with segments on their own side.
+    constexpr uint32_t ARM_L  = CG::WORLD | CG::PROPS | CG::ARM_R | CG::HAND_R;
+    constexpr uint32_t ARM_R  = CG::WORLD | CG::PROPS | CG::ARM_L | CG::HAND_L;
+    constexpr uint32_t HAND_L = CG::WORLD | CG::PROPS | CG::ARM_R | CG::HAND_R;
+    constexpr uint32_t HAND_R = CG::WORLD | CG::PROPS | CG::ARM_L | CG::HAND_L;
+
+    // Legs and feet collide with world, props, and with the leg and foot of the
+    // opposing side (they can interact when the character's legs cross during
+    // a fall).
+    constexpr uint32_t LEG_L  = CG::WORLD | CG::PROPS | CG::LEG_R | CG::FOOT_R;
+    constexpr uint32_t LEG_R  = CG::WORLD | CG::PROPS | CG::LEG_L | CG::FOOT_L;
+    constexpr uint32_t FOOT_L = CG::WORLD | CG::PROPS | CG::LEG_R | CG::FOOT_R;
+    constexpr uint32_t FOOT_R = CG::WORLD | CG::PROPS | CG::LEG_L | CG::FOOT_L;
+}
+----
+
+This design allows left and right limbs to interact with each other—which is correct, since a falling character's arms can and do cross—while preventing adjacent constrained segments from generating destructive contact forces.
+
+== Applying Masks from glTF Extras
+
+Earlier, when we defined our collider extras format, we included `collision_group` and `collision_mask` as string fields. Now we can write a resolver that maps those human-readable strings to the bitmask values:
+
+[source,cpp]
+----
+// Translate a collision group name from the glTF extras (e.g. "arm_l") into
+// its CollisionGroup bit. Names that are not in the table yield 0, i.e. they
+// contribute no bits.
+uint32_t resolve_collision_group(const std::string& group_name)
+{
+    // Lookup table from extras name to group bit; constructed once, on first use.
+    static const std::unordered_map<std::string, uint32_t> bits_by_name{
+        { "world",  CollisionGroup::WORLD  },
+        { "props",  CollisionGroup::PROPS  },
+        { "torso",  CollisionGroup::TORSO  },
+        { "head",   CollisionGroup::HEAD   },
+        { "arm_l",  CollisionGroup::ARM_L  },
+        { "arm_r",  CollisionGroup::ARM_R  },
+        { "hand_l", CollisionGroup::HAND_L },
+        { "hand_r", CollisionGroup::HAND_R },
+        { "leg_l",  CollisionGroup::LEG_L  },
+        { "leg_r",  CollisionGroup::LEG_R  },
+        { "foot_l", CollisionGroup::FOOT_L },
+        { "foot_r", CollisionGroup::FOOT_R },
+    };
+
+    const auto match = bits_by_name.find(group_name);
+    if (match == bits_by_name.end()) {
+        return 0;
+    }
+    return match->second;
+}
+
+// Build a collision mask from the extras string, which is a comma-separated
+// list of group names: "world,props,leg_r" => WORLD | PROPS | LEG_R.
+// Spaces and tabs around each name are ignored; every name is resolved through
+// resolve_collision_group and the resulting bits are OR-ed together.
+uint32_t resolve_collision_mask(const std::string& mask_str)
+{
+    static const char* const blanks = " \t";
+
+    uint32_t combined = 0;
+    std::stringstream fields(mask_str);
+    for (std::string field; std::getline(fields, field, ','); ) {
+        // Locate the non-blank core of the field. A field made up solely of
+        // blanks (or an empty one) resolves as the empty name.
+        const std::string::size_type first = field.find_first_not_of(blanks);
+        std::string name;
+        if (first != std::string::npos) {
+            const std::string::size_type last = field.find_last_not_of(blanks);
+            name = field.substr(first, last - first + 1);
+        }
+        combined |= resolve_collision_group(name);
+    }
+    return combined;
+}
+----
+
+When we create a physics body for a bone, we apply these masks:
+
+[source,cpp]
+----
+// Creation settings for the physics body that will represent one bone.
+JPH::BodyCreationSettings settings;
+// ... set shape, mass, pose ...
+// The body is created with the Kinematic motion type.
+settings.mMotionType = JPH::EMotionType::Kinematic;
+
+// Apply collision filtering from the parsed extras
+// NOTE(review): the assignment below is left commented out, and
+// resolve_collision_layer is not defined anywhere in this section. As written,
+// this snippet creates the body without applying the group/mask values; a helper
+// that packs them into an object layer has to be supplied before enabling it.
+// settings.mObjectLayer = resolve_collision_layer(node.collider_def.collision_group, node.collider_def.collision_mask);
+
+// Hand the settings to the physics world wrapper and keep the returned body ID.
+JPH::BodyID body = physics_world.create_body(settings);
+----
+
+== Filtering Between Characters
+
+The bitmask design above handles self-collision for a single character. But what about multiple characters? If two characters are fighting, you probably want their ragdolls to interact—one character's falling body should be able to knock the other character's body. But you still don't want Character A's arm to collide with Character A's own torso.
+
+The standard technique for multi-character filtering is to add a **character instance layer** on top of the body-part groups. Each character instance gets a unique numeric ID, and the collision mask for Character A's body parts explicitly excludes Character A's own other body parts, while including the corresponding body parts of all other characters.
+
+For small numbers of characters, the simplest approach is to just not worry about it. With proper joint limits, self-collision from adjacent constrained segments is prevented by the constraints themselves—only distant self-collisions (like the arm crossing to the far side of the body) would need explicit filtering, and those are relatively rare. The important filtering is between adjacent segments that are connected by constraints, which we always want to filter out.
+
+For large numbers of characters—a crowd simulation, for example—you will need a more sophisticated filtering system. One common approach is to allocate a block of 8 or 16 bits per character and assign one bit per body region within that block. This allows the broadphase to quickly cull collisions between same-character body parts at the layer level, without requiring per-pair contact filtering.
+
+== Verifying Your Filters
+
+After setting up collision filtering, it is worth taking a few minutes to test it deliberately. The easiest test is to spawn a single character, trigger the ragdoll, and watch it fall. Signs of filtering problems:
+
+If the ragdoll immediately explodes on the first frame, adjacent body parts are generating penetration contacts—you have a filtering gap between constrained neighbors. Check that the groups and masks for adjacent bones (e.g., upper arm and forearm) exclude each other.
+
+If the character's arms pass straight through walls or floors, a mask is too restrictive—the body part is not colliding with WORLD geometry. Check that `WORLD` is included in every body part's collision mask.
+
+If two characters' ragdolls merge together and pass through each other when they fall on top of each other, the mask does not include the opposing character's body part groups. This is expected if you filtered out all character-vs-character collision, but undesirable if you wanted characters to interact.
+
+The debug drawer we will build in Chapter 8 will be invaluable for visualizing collision shapes and their assigned layers, making these issues much easier to diagnose visually.
+
+xref:Advanced_glTF/Physics_Integration/04_ragdoll_handoff.adoc[Previous: The Ragdoll Handoff] | xref:Advanced_glTF/Physics_Integration/06_conclusion.adoc[Next: Conclusion]
diff --git a/en/Advanced_glTF/Physics_Integration/06_conclusion.adoc b/en/Advanced_glTF/Physics_Integration/06_conclusion.adoc
new file mode 100644
index 000000000..164401275
--- /dev/null
+++ b/en/Advanced_glTF/Physics_Integration/06_conclusion.adoc
@@ -0,0 +1,39 @@
+:pp: {plus}{plus}
+= Physics Integration: Conclusion
+
+== What We Built
+
+This chapter constructed the physical layer of our character system, and it is worth stepping back to appreciate how many distinct problems we solved.
+
+We started by establishing *why* you cannot simply hand the physics engine your mesh geometry and expect good results. The practical answer—proxy colliders—sounds like a compromise, but it is not. Capsules and boxes produce more stable simulations, run faster, and require less memory than mesh colliders, all while producing ragdoll behavior that is visually indistinguishable from higher-fidelity approximations. The art of choosing the right shape for each body part is a craft skill that develops with experience, and the glTF extras system we put in place gives artists direct control over those shapes without requiring code changes.
+
+We connected the proxy colliders into a functioning skeleton using physics constraints. The conceptual leap here—understanding that a Ball-and-Socket constraint represents a shoulder, and a Hinge represents a knee—is more important than any of the code. Once you have that mental model, the angular limit values are just numbers to be tuned. The reference humanoid ranges we provided are a starting point, not a specification; every game has a different feel it wants from its characters.
+
+We then solved the **ragdoll handoff problem** properly, which means: initializing physics body velocities from the character's current motion, using a state machine to manage the transition, continuing to feed kinematic updates to the physics engine during the blend so constraint positions are always approximately valid, and then reading physics results back into the scene graph so the compute skinning pipeline sees the correct animated state.
+
+Finally, we addressed self-collision filtering with a bitmask design that prevents the classic "exploding ragdoll" artifact by excluding adjacent constrained body parts from generating contacts with each other, while still allowing the character to interact correctly with the world and with other characters.
+
+== The Tradeoffs You Should Understand
+
+The system we built assumes a fixed topology: one ragdoll per character, one set of bones, one set of constraints. This works well for humanoid characters with a predictable skeletal structure. It is less well-suited for quadrupeds, creatures with unusual anatomy, or procedurally generated characters where the skeleton cannot be known at asset creation time. For those cases, the extras-driven approach becomes more valuable—you can define constraints generically and resolve them at runtime—but you will need more sophisticated tools for authoring and validating the constraint graph.
+
+The blend weight approach we used for the handoff (linearly increasing from 0 to 1 over a fixed duration) is simple and reliable, but it has limitations. A 150ms blend looks great for a character that is falling smoothly. It can look wrong for a character that is hit by a sudden large impulse—in that case, you might want to snap to ragdoll immediately rather than blending. The state machine design we built supports this: you can bypass the BLENDED state entirely and transition directly to RAGDOLL if the incoming impulse exceeds a threshold.
+
+The collision filtering system we built uses a flat bitmask with one bit per body region. This gives you 32 distinct collision layers in a 32-bit mask, which is more than enough for a single character but becomes a constraint for large crowd systems. Jolt Physics and PhysX both offer more sophisticated filtering mechanisms (broad-phase layers, narrow-phase callbacks) that allow you to scale the filtering system to hundreds or thousands of characters.
+
+== What Comes Next
+
+With our visual rendering and physics systems both operating from the same scene graph and producing correct, physics-accurate behavior, we are in a strong position to add the final layer of sophistication: **procedural animation**.
+
+Procedural animation is the discipline of generating or modifying joint transforms at runtime using algorithms, rather than artist-authored keyframes. The most important application for games is **Inverse Kinematics (IK)**: rather than animating the foot's position directly, we compute the necessary joint angles from where we want the foot to land. This allows characters to place their feet correctly on uneven terrain, to reach for objects that aren't in the exact position the animator expected, and to respond to physics events in ways that look natural rather than canned.
+
+Chapter 5 will build an IK system from scratch, starting with the theory of why forward kinematics cannot solve the foot-placement problem and why IK can. We will implement both the simple CCD (Cyclic Coordinate Descent) algorithm and the more accurate FABRIK (Forward And Backward Reaching IK) algorithm, and we will wire them into the same scene graph and animation blending infrastructure we have built throughout this series.
+
+== Verification: What to Look For
+
+To verify your physics integration:
+
+1.  **Kinematic Sync**: With the character in `ANIMATED` state, enable debug visualization. The collision capsules should perfectly track the rendered mesh as it moves.
+2.  **Ragdoll Handoff**: When triggering a ragdoll, the character should not "pop" or "snap" to a new pose. The transition should be smooth, with the character's momentum preserved.
+3.  **Self-Collision**: Ensure that adjacent, constraint-connected body parts do not generate contacts with each other during movement. If neighboring segments are "pushing" each other apart, check your collision filtering bitmasks.
+
+xref:Advanced_glTF/Physics_Integration/05_self_collision_filtering.adoc[Previous: Self-Collision Filtering] | xref:Advanced_glTF/Procedural_Animation_IK/01_introduction.adoc[Next Chapter: Procedural Animation & IK]
diff --git a/en/Advanced_glTF/Procedural_Animation_IK/01_introduction.adoc b/en/Advanced_glTF/Procedural_Animation_IK/01_introduction.adoc
new file mode 100644
index 000000000..9231a9b5e
--- /dev/null
+++ b/en/Advanced_glTF/Procedural_Animation_IK/01_introduction.adoc
@@ -0,0 +1,30 @@
+:pp: {plus}{plus}
+= Procedural Animation & Inverse Kinematics
+
+== The Limits of Keyframe Animation
+
+Everything we have built so far—the scene graph, the compute skinning pipeline, the physics ragdoll system—has been in service of playing back and blending animations that an artist created in Blender. Keyframe animation is a powerful tool, and for the majority of a character's movement it is exactly the right approach. An artist can spend hours perfecting a walk cycle, a combat strike, or an idle breathing animation, and the investment shows.
+
+But keyframe animation has a fundamental limitation: it was authored for a specific environment. A walk cycle assumes flat ground. A hand-reaching animation assumes the object being reached for is exactly where the animator placed it. A death animation assumes the character falls on an open, featureless floor. The real world—or the real game world—is none of those things.
+
+A character crossing a rocky hillside needs each foot to land precisely on the surface it encounters, not float half a meter above the ground or clip into a slope. A character reaching for a dynamically placed pickup needs their hand to arrive at the actual object position, which changes every frame. A soldier diving behind cover needs their head to track the enemy even while their body is committed to a pre-authored crouch animation. A character running at full speed around a sharp corner should lean into the turn, not stand rigidly upright as if skating on rails.
+
+These problems cannot be solved by making more keyframe animations. There are infinitely many terrain configurations, object positions, and dynamic scenarios. The solution is **procedural animation**: the real-time, mathematical computation of joint transforms that are not authored by an artist but are instead computed by the engine in response to the current world state.
+
+== What Procedural Animation Is and Isn't
+
+It is worth being precise about what we mean by procedural animation, because the term is used loosely in the industry. In this chapter, we mean the runtime modification of a skeleton's joint transforms using algorithms that respond to the current state of the scene. We are not talking about procedural content generation, particle systems, or anything that generates geometry. We are talking about computing joint rotations.
+
+Procedural animation is typically layered *on top of* keyframe animation, not used instead of it. The base pose comes from the animator's clips; the procedural layer adjusts specific joints to respond to the environment. This is an additive process: the keyframe system provides a biologically plausible base motion, and the procedural layer makes small, targeted corrections that the artist couldn't anticipate.
+
+The central technique for positional correction is **Inverse Kinematics**, abbreviated IK. Standard keyframe animation works *forward kinematically*: you set joint rotations, and the end effector (the hand, the foot) ends up wherever the math puts it. IK works in reverse: you specify where the end effector *must* be, and the algorithm computes the joint rotations required to put it there. This is not a trivial inversion—for a chain with multiple joints, there are typically infinitely many valid solutions, and the algorithm must pick the one that looks most natural.
+
+We will cover two IK algorithms in depth: **Cyclic Coordinate Descent (CCD)** and **FABRIK** (Forward And Backward Reaching IK). Both are practical, real-time algorithms that are widely used in game engines. We will also cover the two most common applications: foot placement on uneven terrain and look-at controllers for head tracking. Finally, we will implement physics-driven procedural lean, which derives character body tilt from the physics simulation's velocity data.
+
+== How This Chapter Fits the Rest of the Series
+
+This chapter assumes you have a functioning scene graph (Chapter 2) with properly propagated world matrices, and that you understand the animation blending system (Chapter 3). The IK algorithms we implement will read joint world matrices from the scene graph and write corrected joint local transforms back into it, triggering the dirty flag propagation system to update all descendant nodes.
+
+The ragdoll system from Chapter 4 is also relevant here, particularly in the foot placement section. When a character is standing on uneven terrain, the foot IK system must not fight the physics system—they must cooperate. We will discuss the hierarchy of control and how to ensure that IK corrections and physics feedback reinforce rather than contradict each other.
+
+xref:Advanced_glTF/Physics_Integration/06_conclusion.adoc[Previous: Physics Integration Conclusion] | xref:Advanced_glTF/Procedural_Animation_IK/02_ccd_ik.adoc[Next: Cyclic Coordinate Descent IK]
diff --git a/en/Advanced_glTF/Procedural_Animation_IK/02_ccd_ik.adoc b/en/Advanced_glTF/Procedural_Animation_IK/02_ccd_ik.adoc
new file mode 100644
index 000000000..33b824b68
--- /dev/null
+++ b/en/Advanced_glTF/Procedural_Animation_IK/02_ccd_ik.adoc
@@ -0,0 +1,155 @@
+:pp: {plus}{plus}
+= Cyclic Coordinate Descent IK
+
+== Understanding the Problem Geometrically
+
+Before we look at any algorithm, it helps to understand what we are actually asking the computer to solve. Imagine a three-link chain: a thigh connected to a shin connected to a foot. Each joint can rotate. The foot (the **end effector**) is currently somewhere in the air, and we want it to land exactly on a target point on the ground. The question is: what rotations do we need to apply to the thigh and shin joints to make that happen?
+
+If you have ever tried to solve this with a direct analytical formula, you quickly discover the problem. For a two-bone chain (think: a simple arm with a shoulder and elbow), there is a closed-form solution—it involves a bit of trigonometry and produces either two solutions (elbow up or elbow down) or no solution if the target is out of reach. But for a three-bone chain, or a five-bone spine, there is no clean closed form. The system becomes **underdetermined**: there are more degrees of freedom (rotation axes) than constraints (the three coordinates of the target), which means there are infinitely many poses that place the end effector at the target, and we need an algorithm to pick among them.
+
+**Cyclic Coordinate Descent**, or CCD, solves this with a beautifully simple heuristic: instead of trying to solve the whole chain at once, it rotates one joint at a time to bring the end effector as close as possible to the target, then moves to the next joint, then the next, cycling through the chain repeatedly until the end effector is close enough or we have run out of iterations.
+
+== The CCD Algorithm Step by Step
+
+Let's trace through one iteration of CCD for a leg chain. Assume we have joints `[thigh, knee, ankle]` and a target position on the ground. CCD works from the end of the chain backward toward the root.
+
+**Step 1 — Start at the joint closest to the end effector.** For a leg chain, this is the ankle. Compute two vectors: one from the ankle to the current end effector position (the toe), and one from the ankle to the target position. Rotate the ankle joint so that the first vector aligns with the second. Because the ankle is the joint closest to the end effector, rotating it produces the most direct movement of the effector toward the target.
+
+**Step 2 — Move to the next joint inward.** Now consider the knee. Again, compute the vector from the knee to the end effector's *new* position (it moved when we rotated the ankle), and the vector from the knee to the target. Rotate the knee to align them. The knee has a longer lever arm than the ankle—rotating it sweeps the end effector through a larger arc—so this step often produces a large correction.
+
+**Step 3 — Move to the root joint.** Repeat for the thigh. By now, the end effector may already be very close to the target—the algorithm's convergence property means that each pass through the chain reduces the error.
+
+**Step 4 — Check convergence.** Compute the distance from the end effector to the target. If it is below some threshold (typically a millimeter or less in world space), stop. If not, start another pass from the ankle and repeat.
+
+The genius of CCD is that each individual rotation step is trivially computed: it is just the quaternion that rotates one vector onto another. The algorithm's convergence is not guaranteed in the worst case, but in practice for humanoid skeletons with reasonable joint limits, it converges in two to five iterations.
+
+== Implementing CCD
+
+Let's write a self-contained CCD solver. It takes the joint chain (as a list of node indices into our scene graph), the target world position, and the maximum number of iterations:
+
+[source,cpp]
+----
+// Description of one IK chain: which scene-graph nodes take part, which node
+// is being positioned, and the solver's stopping criteria.
+struct IKChain {
+    std::vector<uint32_t> joints;  // Node indices of the joints, ordered from root to end effector
+    uint32_t effector_node;        // The node whose position we are trying to place
+    float    threshold;            // Convergence threshold in world-space units
+    int      max_iterations;       // Safety cap on the number of solver passes
+    glm::vec3 target_world;        // Target position (NOTE(review): solve_ccd takes its target as a separate argument and does not read this field)
+    glm::vec3 pole_vector;         // For algorithms like FABRIK or constrained CCD; not read by the basic CCD solver
+};
+
+// Rotate the joint at 'joint_idx' so that the direction from the joint to the
+// effector lines up with the direction from the joint to the target. Updates
+// the joint's local rotation, marks it dirty, and recomputes the world matrices
+// of the joint's subtree.
+//
+//   nodes        - scene-graph node array; the joint's local_rotation is modified
+//   joint_idx    - index of the joint to rotate
+//   effector_idx - index of the node being driven toward the target
+//   target_world - desired effector position in world space
+//
+// The call is a no-op when no rotation can be derived: the effector or the
+// target coincides with the joint, or the two directions are already aligned
+// or exactly opposed.
+static void ccd_rotate_joint(
+    std::vector<Node>& nodes,
+    uint32_t           joint_idx,
+    uint32_t           effector_idx,
+    const glm::vec3&   target_world)
+{
+    Node& joint = nodes[joint_idx];
+
+    // We need the world position of the joint. This assumes world matrices
+    // are up to date for this joint and for the effector.
+    glm::vec3 joint_world = glm::vec3(joint.world_matrix[3]);
+
+    // Vector from this joint to the current effector position.
+    // Normalizing a zero-length vector produces NaNs that would be written into
+    // local_rotation and then propagated to every descendant, so bail out when
+    // the effector sits on the joint (for example when the last entry of the
+    // chain is the effector node itself).
+    glm::vec3 to_effector = glm::vec3(nodes[effector_idx].world_matrix[3]) - joint_world;
+    float effector_dist = glm::length(to_effector);
+    if (effector_dist < 1e-6f) return; // Effector is at the joint; rotating it moves nothing
+    to_effector /= effector_dist;
+
+    // Vector from this joint to where we want the effector to be
+    glm::vec3 to_target = target_world - joint_world;
+    float dist = glm::length(to_target);
+    if (dist < 1e-6f) return; // Target is at the joint; no direction to aim along
+    to_target /= dist;
+
+    // Compute the rotation that takes to_effector onto to_target.
+    // The clamp keeps rounding error from pushing the value outside acos' domain.
+    float dot = glm::clamp(glm::dot(to_effector, to_target), -1.0f, 1.0f);
+    if (dot > 0.9999f) return; // Already aligned
+
+    float     angle = std::acos(dot);
+    glm::vec3 axis  = glm::cross(to_effector, to_target);
+    // The aligned case returned above, so a vanishing cross product here means
+    // the directions are opposed and no unique rotation axis exists.
+    if (glm::length(axis) < 1e-6f) return;
+    axis = glm::normalize(axis);
+
+    glm::quat delta_rotation = glm::angleAxis(angle, axis);
+
+    // Apply this rotation in world space, then convert to local space.
+    // The joint's local rotation is relative to its parent's world rotation.
+    // parent_world_rotation * local_rotation = world_rotation
+    // Therefore: local_rotation = inverse(parent_world_rotation) * world_rotation
+
+    glm::quat world_rotation = delta_rotation * joint.get_world_rotation();
+    glm::quat parent_world_rotation = (joint.parent_index != INVALID_NODE_INDEX)
+        ? nodes[joint.parent_index].get_world_rotation()
+        : glm::quat(1,0,0,0); // identity (w, x, y, z) for a joint without a parent
+
+    joint.local_rotation = glm::inverse(parent_world_rotation) * world_rotation;
+    joint.mark_dirty();
+
+    // Recompute world matrices for this joint and all its descendants
+    // so that subsequent CCD steps see correct positions.
+    update_world_matrices_subtree(nodes, joint_idx);
+}
+
+// Run CCD on 'chain', driving its effector node toward 'target_world'.
+// Performs at most chain.max_iterations passes and stops early once the
+// effector is within chain.threshold of the target.
+void solve_ccd(std::vector<Node>& nodes, const IKChain& chain, const glm::vec3& target_world)
+{
+    // No reachability test is made up front. A target beyond the chain's total
+    // length simply makes the chain converge toward the target's direction
+    // without touching it. For foot placement that usually means the body
+    // should be lowered, which is handled at a higher level.
+
+    int pass = 0;
+    while (pass < chain.max_iterations) {
+        // Stop as soon as the effector is within tolerance.
+        const glm::vec3 tip = glm::vec3(nodes[chain.effector_node].world_matrix[3]);
+        const float error = glm::distance(tip, target_world);
+        if (error < chain.threshold) {
+            break;
+        }
+
+        // chain.joints runs root-to-tip; CCD wants tip-to-root, so walk it
+        // with reverse iterators.
+        for (auto joint = chain.joints.rbegin(); joint != chain.joints.rend(); ++joint) {
+            ccd_rotate_joint(nodes, *joint, chain.effector_node, target_world);
+        }
+
+        ++pass;
+    }
+}
+----
+
+The `get_world_rotation` helper (which we added to our `Node` struct in Chapter 2) extracts the pure rotation quaternion from a world matrix by normalizing the basis columns to remove scale.
+
+== Adding Joint Limits to CCD
+
+The basic CCD implementation above has one critical problem: it applies rotations in any direction, without regard for anatomical constraints. The knee might bend sideways, the ankle might rotate in ways that look broken. We need to clamp the joint's rotation to its allowed range after each CCD step.
+
+The cleanest way to integrate joint limits with CCD is to clamp the joint's local rotation after each `ccd_rotate_joint` call. Since our joint limits are stored as part of the physics constraint definition (from Chapter 4), we can re-use that data:
+
+[source,cpp]
+----
+// After computing joint.local_rotation in ccd_rotate_joint, apply limits:
+//
+// Restricts 'joint.local_rotation' to a rotation about the hinge axis whose
+// angle lies within [limits.hinge_min_deg, limits.hinge_max_deg]. Any component
+// of the rotation that is not about the hinge axis is discarded.
+//
+//   joint  - node whose local_rotation is overwritten with the clamped rotation
+//   limits - supplies hinge_axis and the min/max angles in degrees
+//            (NOTE(review): hinge_axis is assumed to be unit length — confirm)
+void apply_hinge_limit(Node& joint, const ConstraintDef& limits)
+{
+    // For a hinge joint, the local rotation should only have a component
+    // around the hinge axis. We extract the angle around that axis and clamp it.
+    glm::vec3 axis = limits.hinge_axis;
+    float angle = 2.0f * std::atan2(
+        glm::dot(glm::vec3(joint.local_rotation.x, joint.local_rotation.y, joint.local_rotation.z), axis),
+        joint.local_rotation.w
+    );
+
+    // q and -q encode the same rotation, but when w is negative the expression
+    // above lands a full turn away from the principal angle (its range is
+    // (-360, 360] degrees). Wrap into [-180, 180] first, otherwise a small
+    // rotation stored with negative w would be clamped to the wrong limit.
+    float angle_deg = glm::degrees(angle);
+    if (angle_deg > 180.0f) {
+        angle_deg -= 360.0f;
+    } else if (angle_deg < -180.0f) {
+        angle_deg += 360.0f;
+    }
+
+    float clamped = glm::clamp(
+        angle_deg,
+        limits.hinge_min_deg,
+        limits.hinge_max_deg
+    );
+    joint.local_rotation = glm::angleAxis(glm::radians(clamped), axis);
+}
+----
+
+For ball-and-socket joints, limiting the rotation is more involved. The swing limit is a cone constraint, and enforcing it requires clamping the swing component of the quaternion's decomposition. Most physics engines provide a helper for this; if yours does not, you can implement it by decomposing the local rotation into swing-and-twist components relative to the joint's rest pose.
+
+== CCD's Strengths and Weaknesses
+
+CCD is fast. Each iteration is a small number of vector operations and quaternion multiplications, and it converges quickly for most humanoid chains. It integrates naturally with joint limits and is straightforward to implement correctly. For these reasons, it was the dominant IK algorithm in real-time game development for many years and remains a good default choice for most applications.
+
+CCD's main weakness is a visual artifact known as **end-effector bias**: because it processes joints from tip to root, the joints near the end effector tend to rotate more than joints near the root. For a leg, this means the ankle tends to be over-rotated while the thigh moves only slightly. This produces a characteristic stiff-thigh, ankle-bent look that becomes obvious when the character is reaching to extremes. For foot placement on gently uneven terrain, this artifact is generally not noticeable. For reaching animations where the arm extends far from the body, it can look wrong.
+
+The algorithm we will look at next, FABRIK, addresses this weakness with a different strategy.
+
+xref:Advanced_glTF/Procedural_Animation_IK/01_introduction.adoc[Previous: Introduction] | xref:Advanced_glTF/Procedural_Animation_IK/03_fabrik.adoc[Next: FABRIK]
diff --git a/en/Advanced_glTF/Procedural_Animation_IK/03_fabrik.adoc b/en/Advanced_glTF/Procedural_Animation_IK/03_fabrik.adoc
new file mode 100644
index 000000000..ddda85010
--- /dev/null
+++ b/en/Advanced_glTF/Procedural_Animation_IK/03_fabrik.adoc
@@ -0,0 +1,169 @@
+:pp: {plus}{plus}
+= FABRIK: Forward And Backward Reaching IK
+
+== A Different Philosophy
+
+Where CCD thinks about joints and rotations, FABRIK—**Forward And Backward Reaching Inverse Kinematics**—thinks about positions. This is a fundamental difference in approach, and it produces fundamentally different visual results.
+
+FABRIK was published by Andreas Aristidou and Joan Lasenby in 2011, and it became influential quickly because it is both visually superior to CCD in many scenarios and remarkably simple to implement. The core idea is to treat each joint as a point in space and each bone as a rigid rod connecting two consecutive points. Given a target for the end effector, FABRIK redistributes the joints along the chain in two phases: a backward phase that anchors the end effector at the target and propagates the chain toward the root, and a forward phase that re-anchors the root at its original position and propagates back toward the end effector. Repeating these two phases rapidly converges to a solution.
+
+The reason FABRIK avoids the end-effector bias that CCD can exhibit is that both phases treat every joint in the chain symmetrically. No joint has more influence than another by default—each one simply adjusts to satisfy the constraint imposed by its neighbor. The resulting pose tends to look more natural for reaching motions where the whole arm or leg should participate in the movement.
+
+== The FABRIK Algorithm
+
+Let's be concrete. We have a chain of joints at positions `p[0], p[1], ..., p[n]`, where `p[0]` is the root and `p[n]` is the end effector. The bone lengths are `d[i] = |p[i+1] - p[i]|`, and they are fixed—bones don't stretch. The target is `t`.
+
+**Backward phase:**
+
+1. Set `p[n] = t`. The end effector is now at the target.
+2. For each joint from `n-1` down to `0`: compute the direction from `p[i+1]` to `p[i]`, and place `p[i]` along that direction at distance `d[i]` from `p[i+1]`. In other words, `p[i] = p[i+1] + normalize(p[i] - p[i+1]) * d[i]`.
+
+After the backward phase, the end effector is at the target, but the root `p[0]` has moved from its original position.
+
+**Forward phase:**
+
+1. Set `p[0]` back to its original position `b` (the root is fixed; it can't move).
+2. For each joint from `1` to `n`: compute the direction from `p[i-1]` to `p[i]`, and place `p[i]` along that direction at distance `d[i-1]` from `p[i-1]`. In other words, `p[i] = p[i-1] + normalize(p[i] - p[i-1]) * d[i-1]`.
+
+After the forward phase, the root is back in its correct place, but the end effector may have moved slightly away from the target again.
+
+**Iteration:** Repeat backward and forward phases until `|p[n] - t| < threshold` or the maximum iteration count is reached.
+
+The convergence is remarkably fast—typically two to four full iterations (one backward plus one forward each) are sufficient for humanoid chains. FABRIK is a geometric heuristic rather than a gradient-based optimizer: every pass restores all bone lengths exactly while pulling the chain toward the target, so the remaining error shrinks quickly on the short chains found in humanoid rigs.
+
+== Implementing FABRIK
+
+FABRIK operates on positions rather than rotations, so we first extract positions from the scene graph, run the solver, and then convert the result back into joint rotations. This conversion step—positions-to-rotations—is what makes FABRIK slightly more involved than it first appears:
+
+[source,cpp]
+----
+// Solves `chain` with FABRIK so chain.effector_node reaches `target_world`. World matrices are
+// read as-is (they must be current on entry); each chain joint gets a new local_rotation.
+void solve_fabrik(std::vector<Node>& nodes, const IKChain& chain, const glm::vec3& target_world)
+{
+    const int n = static_cast<int>(chain.joints.size());
+    if (n == 0) return;
+
+    // Unit vector along `v`, or `fallback` when `v` is too short to normalize safely.
+    auto direction_or = [](const glm::vec3& v, const glm::vec3& fallback = glm::vec3(0, 1, 0)) {
+        const float len = glm::length(v);
+        return len > 1e-6f ? v / len : fallback;
+    };
+
+    // Step 1: Extract world positions (joints root to tip, then the effector) and bone lengths.
+    std::vector<glm::vec3> positions(n + 1);
+    std::vector<float>     lengths(n);
+    for (int i = 0; i < n; ++i) {
+        positions[i] = glm::vec3(nodes[chain.joints[i]].world_matrix[3]);
+    }
+    positions[n] = glm::vec3(nodes[chain.effector_node].world_matrix[3]);
+
+    float total_length = 0.0f;
+    for (int i = 0; i < n; ++i) {
+        // Clamp so a degenerate zero-length bone cannot cause a division by zero later.
+        lengths[i] = glm::max(glm::distance(positions[i], positions[i + 1]), 1e-6f);
+        total_length += lengths[i];
+    }
+    const glm::vec3 root_pos  = positions[0]; // The root stays anchored here
+    const bool      reachable = glm::distance(root_pos, target_world) < total_length;
+
+    if (!reachable) {
+        // Step 2: Target out of reach. The passes below would use up every iteration without
+        // converging, so lay the chain out straight toward the target instead.
+        const glm::vec3 reach_dir = glm::normalize(target_world - root_pos);
+        for (int i = 0; i < n; ++i) {
+            positions[i + 1] = positions[i] + reach_dir * lengths[i];
+        }
+    }
+
+    // Step 3: Iterate backward-forward passes until the effector is within the threshold.
+    for (int iter = 0; reachable && iter < chain.max_iterations; ++iter) {
+        if (glm::distance(positions[n], target_world) < chain.threshold) break;
+        // --- Backward pass: anchor the end effector at target ---
+        positions[n] = target_world;
+        for (int i = n - 1; i >= 0; --i) {
+            glm::vec3 dir = direction_or(positions[i] - positions[i + 1]);
+            positions[i] = positions[i + 1] + dir * lengths[i];
+        }
+
+        // --- Forward pass: re-anchor the root at its original position ---
+        positions[0] = root_pos;
+        for (int i = 1; i <= n; ++i) {
+            glm::vec3 dir = direction_or(positions[i] - positions[i - 1]);
+            positions[i] = positions[i - 1] + dir * lengths[i - 1];
+        }
+    }
+
+    // Step 4: Convert solved positions back to joint rotations, root first, by swinging each
+    // bone from the direction it currently points onto the direction the solver chose.
+    for (int i = 0; i < n; ++i) {
+        Node&       joint = nodes[chain.joints[i]];
+        const Node& child = nodes[i + 1 < n ? chain.joints[i + 1] : chain.effector_node];
+        glm::vec3 solved_dir = direction_or(positions[i + 1] - positions[i]);
+        // Read from the live world matrices, which earlier iterations have already refreshed.
+        glm::vec3 current_dir = direction_or(
+            glm::vec3(child.world_matrix[3]) - glm::vec3(joint.world_matrix[3]), solved_dir);
+
+        float dot = glm::clamp(glm::dot(current_dir, solved_dir), -1.0f, 1.0f);
+        if (dot > 0.9999f) continue; // Already pointing the right way
+        glm::vec3 axis = glm::cross(current_dir, solved_dir);
+        if (glm::length(axis) < 1e-6f) continue; // Anti-parallel: no unique axis to turn about
+
+        glm::quat world_correction = glm::angleAxis(std::acos(dot), glm::normalize(axis));
+        glm::quat new_world_rot    = world_correction * joint.get_world_rotation();
+
+        glm::quat parent_world_rot = (joint.parent_index != INVALID_NODE_INDEX)
+            ? nodes[joint.parent_index].get_world_rotation()
+            : glm::quat(1, 0, 0, 0);
+
+        joint.local_rotation = glm::inverse(parent_world_rot) * new_world_rot;
+        joint.mark_dirty();
+        update_world_matrices_subtree(nodes, chain.joints[i]);
+    }
+}
+----
+
+The positions-to-rotations conversion (Step 4) may look familiar—it is the same "rotate one vector onto another" operation as CCD, just applied with the solved bone direction as the target rather than the IK target position. The key difference is that when we enter this loop, we already know exactly where every joint *should* be, so we are making precise corrections rather than iterative approximations.
+
+== FABRIK with Constraints
+
+Adding joint limits to FABRIK is slightly more complex than with CCD because FABRIK repositions joints as points rather than rotating them directly. The standard approach is to enforce constraints *during* the backward and forward passes by clamping the direction vector before computing the new joint position.
+
+For a hinge constraint, when computing the direction from joint `i` to joint `i+1`, we project that direction onto the plane perpendicular to the hinge axis, then clamp the angle. For a ball-and-socket constraint, we clamp the direction to the cone defined by the swing limit. The implementation is more involved than the unconstrained case, and getting it right requires careful handling of the reference frames:
+
+[source,cpp]
+----
+// Returns 'd' unchanged if it lies within 'limit_deg' of 'ref', otherwise swings it onto the
+// cone surface. The dot product is read as a cosine, so both must be unit vectors (world space).
+glm::vec3 clamp_to_cone(const glm::vec3& d, const glm::vec3& ref, float limit_deg)
+{
+    const float max_angle = glm::radians(limit_deg);
+    const float theta     = std::acos(glm::clamp(glm::dot(d, ref), -1.0f, 1.0f));
+    if (theta <= max_angle) {
+        return d; // Already within cone
+    }
+
+    // A positive turn about d x ref carries 'd' toward 'ref'.
+    const glm::vec3 swing_axis = glm::cross(d, ref);
+    if (glm::length(swing_axis) < 1e-6f) return ref; // Anti-parallel; snap to ref
+
+    // Turning by the overshoot alone leaves the result exactly 'limit_deg' away from 'ref'.
+    return glm::mat3(glm::angleAxis(theta - max_angle, glm::normalize(swing_axis))) * d;
+}
+----
+
+Applying this during the FABRIK pass changes the position computation for ball-and-socket joints from the unconstrained formula to one that clamps the direction before computing the step. This preserves the bone length while enforcing the cone constraint.
+
+== Choosing Between CCD and FABRIK
+
+Both algorithms are practical, both are fast, and both produce good results for the common cases. The choice comes down to the specific motion:
+
+For **foot placement on uneven terrain**, CCD is usually sufficient and is simpler to implement correctly. The corrections are small, the chain is short (two or three bones), and the end-effector bias of CCD is not perceptible for small adjustments.
+
+For **full-arm reaching**, FABRIK generally looks better. When a character reaches for an object at arm's length, you want the entire arm—shoulder, elbow, wrist—to participate naturally in the reach. FABRIK distributes the correction across the whole chain more evenly than CCD does.
+
+For **spines and longer chains**, FABRIK is strongly preferred. CCD applied to a five-bone spine will over-rotate the bones near the tip of the chain (the upper spine) while the bones near the pelvis barely move, which looks stiff and unnatural. FABRIK produces a smooth, fluid bend that reads correctly.
+
+Many production engines use both: CCD for limb IK and FABRIK for spines and special cases. There is no reason you cannot do the same—the two algorithms have the same interface from the scene graph's perspective.
+
+xref:Advanced_glTF/Procedural_Animation_IK/02_ccd_ik.adoc[Previous: Cyclic Coordinate Descent IK] | xref:Advanced_glTF/Procedural_Animation_IK/04_foot_placement.adoc[Next: Foot Placement on Uneven Terrain]
diff --git a/en/Advanced_glTF/Procedural_Animation_IK/04_foot_placement.adoc b/en/Advanced_glTF/Procedural_Animation_IK/04_foot_placement.adoc
new file mode 100644
index 000000000..a3794373d
--- /dev/null
+++ b/en/Advanced_glTF/Procedural_Animation_IK/04_foot_placement.adoc
@@ -0,0 +1,175 @@
+:pp: {plus}{plus}
+= Foot Placement on Uneven Terrain
+
+== Why This Is Harder Than It Sounds
+
+Foot placement is the first practical application most developers want from an IK system, and it reveals several subtleties that the algorithm descriptions above don't fully prepare you for. Getting the feet to touch the ground sounds simple: cast a ray downward from each foot, find where it hits the terrain, and run IK to place the foot there. In practice, there are at least three additional problems you need to solve before this looks correct.
+
+The first problem is **body height**. If you push the feet down to meet the terrain, but the character's body height (the position of the root joint, typically the pelvis or hips) doesn't adjust, you will get a character whose feet are sinking into the floor on one side of a slope while floating in the air on the other. The pelvis needs to move down to accommodate the terrain's lowest foot contact point.
+
+The second problem is **foot rotation**. Dropping the foot down to the terrain surface is only half the correction—the foot also needs to *rotate* to match the terrain normal. A foot planted on a slope should angle itself to conform to the slope, not remain horizontal as if standing on flat ground. This requires computing the terrain normal at the contact point and rotating the foot to align with it.
+
+The third problem is **smoothing**. If you apply IK corrections directly every frame, the feet will jump to the exact IK solution with no inertia. When the terrain surface changes rapidly—say, the character steps from a slope onto a flat area—the foot will snap instantly. In reality, a foot has momentum and should approach its target position over several frames. We need to smooth the IK targets in time, not just compute them.
+
+== The Foot Placement Pipeline
+
+A complete foot placement system has four distinct stages that run in order every frame:
+
+**Stage 1 — Raycast.** For each foot, fire a ray downward (in world space, along the gravity direction) from a position above the foot. The ray origin should be high enough to account for the character potentially being above the terrain—typically the character's full height above the foot position. Record the hit point and the surface normal at each contact.
+
+**Stage 2 — Smooth the targets.** Apply a low-pass filter to the hit positions and normals over time. A simple exponential moving average is sufficient: `smoothed = lerp(smoothed, raw, smoothing_factor * delta_time)`. The smoothing factor controls how quickly the foot "catches up" to the terrain—higher values mean faster response but more snapping. A value around 8 to 12 typically produces natural-looking results.
+
+**Stage 3 — Adjust the pelvis.** Determine how much each foot needs to move vertically (down from its animated position to the smoothed terrain contact). Take the larger of the two vertical adjustments, and move the entire character's root joint down by that amount. This ensures neither foot needs to extend more than necessary to reach the terrain.
+
+**Stage 4 — Solve IK.** Run the IK solver (CCD or FABRIK) on each leg chain, with the smoothed hit point as the target. Then rotate the foot joint to align with the smoothed terrain normal.
+
+== Implementing the Pelvis Adjustment
+
+The pelvis adjustment is handled before the IK solve. We query the desired foot positions and compute the necessary downward offset:
+
+[source,cpp]
+----
+struct FootPlacementData { // Per-foot IK inputs; no default initializers, so value-initialize before use
+    glm::vec3 target_position;  // Smoothed world-space target for the foot
+    glm::vec3 surface_normal;   // Smoothed terrain normal at the contact point (normalized by update_foot_targets)
+    float     ik_weight;        // 0 = no IK, 1 = full IK (for blending); not written by update_foot_targets
+};
+
+struct FootPlacementState { // Persists across frames; update_foot_targets filters it in place
+    FootPlacementData left;   // Left-foot target, normal and blend weight
+    FootPlacementData right;  // Right-foot counterpart
+    glm::vec3 smoothed_pelvis_offset; // Low-pass filtered pelvis offset; it chases (0, -drop, 0), i.e. downward only
+};
+
+// Raycasts the terrain under both feet, low-pass filters the hit points and normals into
+// `state`, and derives a downward-only pelvis offset. Value-initialize `state` before first use.
+void update_foot_targets(
+    const PhysicsWorld&  world,
+    const glm::vec3&     left_foot_world,
+    const glm::vec3&     right_foot_world,
+    float                character_height,
+    float                delta_time,
+    FootPlacementState&  state)
+{
+    const float SMOOTHING = 10.0f; // Filter rate: the per-frame blend factor is SMOOTHING * delta_time
+
+    // Returns {hit point, surface normal} for the terrain under `foot_world`.
+    auto raycast_foot = [&](const glm::vec3& foot_world) -> std::pair<glm::vec3, glm::vec3> {
+        float ray_length = character_height * 2.0f; // One height above the foot plus one below it
+        JPH::RayCast ray;
+        ray.mOrigin    = JPH::Vec3(foot_world.x, foot_world.y + character_height, foot_world.z);
+        ray.mDirection = JPH::Vec3(0.0f, -ray_length, 0.0f); // Jolt encodes the length in the vector
+        JPH::RayCastResult hit;
+        // We use a collision filter to ensure the ray hits the terrain but ignores the character itself.
+        // NOTE(review): Jolt's broad-phase filter expects a BroadPhaseLayer; confirm Layers::NON_MOVING fits.
+        if (world.get_physics_system().GetNarrowPhaseQuery().CastRay(ray, hit,
+            JPH::SpecifiedBroadPhaseLayerFilter(Layers::NON_MOVING),
+            JPH::SpecifiedObjectLayerFilter(Layers::NON_MOVING))) {
+            JPH::BodyLockRead lock(world.get_physics_system().GetBodyLockInterface(), hit.mBodyID);
+            if (lock.Succeeded()) {
+                JPH::Vec3 pos    = ray.GetPointOnRay(hit.mFraction);
+                JPH::Vec3 normal = lock.GetBody().GetWorldSpaceSurfaceNormal(hit.mSubShapeID2, pos);
+                return { glm::vec3(pos.GetX(), pos.GetY(), pos.GetZ()),
+                         glm::vec3(normal.GetX(), normal.GetY(), normal.GetZ()) };
+            }
+        }
+        // No hit: keep the animated foot position, flat normal
+        return { foot_world, glm::vec3(0, 1, 0) };
+    };
+
+    auto [left_pos, left_normal]   = raycast_foot(left_foot_world);
+    auto [right_pos, right_normal] = raycast_foot(right_foot_world);
+
+    float t = glm::clamp(SMOOTHING * delta_time, 0.0f, 1.0f);
+    // Blends two normals; a blend with no direction (e.g. zeroed state at t == 0) would turn
+    // into NaNs under glm::normalize, so the raw normal is used instead.
+    auto blend_normal = [t](const glm::vec3& current, const glm::vec3& raw) {
+        const glm::vec3 mixed = glm::mix(current, raw, t);
+        return glm::length(mixed) > 1e-6f ? glm::normalize(mixed) : raw;
+    };
+
+    state.left.target_position  = glm::mix(state.left.target_position,  left_pos,  t);
+    state.left.surface_normal   = blend_normal(state.left.surface_normal,  left_normal);
+    state.right.target_position = glm::mix(state.right.target_position, right_pos, t);
+    state.right.surface_normal  = blend_normal(state.right.surface_normal, right_normal);
+
+    // Pelvis adjustment: push the pelvis down by the maximum foot drop
+    float left_drop  = left_foot_world.y  - state.left.target_position.y;
+    float right_drop = right_foot_world.y - state.right.target_position.y;
+    float pelvis_drop = std::max(0.0f, std::max(left_drop, right_drop));
+    state.smoothed_pelvis_offset = glm::mix(state.smoothed_pelvis_offset, glm::vec3(0, -pelvis_drop, 0), t);
+}
+----
+
+Notice the use of `std::max(0.0f, ...)` on the pelvis drop. We only push the pelvis *down*, never up. If the terrain is higher than the animated foot position (the character is about to step on a raised surface), the IK will bend the leg to handle it; we don't want the pelvis being pushed skyward on slopes that rise quickly.
+
+== Applying the IK and Foot Rotation
+
+After adjusting the pelvis, the leg chain has been repositioned slightly. We then run the IK solver on each leg, followed by the foot rotation:
+
+[source,cpp]
+----
+// Places the foot with CCD, then tilts it so its up axis matches the (smoothed) terrain normal.
+// `up_axis` is the foot joint's "up" direction in its own local space; it depends on the rig.
+void apply_foot_ik(
+    std::vector<Node>&          nodes,
+    const IKChain&              leg_chain,
+    uint32_t                    foot_node_idx,
+    const FootPlacementData&    foot_data,
+    const glm::vec3&            up_axis)
+{
+    // Run the IK solver to place the foot at the target position
+    solve_ccd(nodes, leg_chain, foot_data.target_position);
+
+    Node& foot = nodes[foot_node_idx];
+    // A zero-length normal (e.g. a FootPlacementData that was never filled in) gives the foot
+    // nothing to align to, and normalizing it would produce NaNs.
+    if (glm::length(foot_data.surface_normal) < 1e-6f) return;
+    glm::vec3 target_up       = glm::normalize(foot_data.surface_normal);
+    glm::vec3 foot_current_up = glm::normalize(glm::mat3(foot.world_matrix) * up_axis);
+
+    float dot = glm::clamp(glm::dot(foot_current_up, target_up), -1.0f, 1.0f);
+    if (dot >= 0.9999f) return; // Already aligned
+    glm::vec3 axis = glm::cross(foot_current_up, target_up);
+    // Anti-parallel vectors have a zero cross product; normalizing it would yield NaNs.
+    if (glm::length(axis) < 1e-6f) return;
+
+    glm::quat world_correction = glm::angleAxis(std::acos(dot), glm::normalize(axis));
+    glm::quat new_world_rot    = world_correction * foot.get_world_rotation();
+
+    glm::quat parent_world_rot = (foot.parent_index != INVALID_NODE_INDEX)
+        ? nodes[foot.parent_index].get_world_rotation()
+        : glm::quat(1, 0, 0, 0);
+
+    foot.local_rotation = glm::inverse(parent_world_rot) * new_world_rot;
+    foot.mark_dirty();
+    update_world_matrices_subtree(nodes, foot_node_idx);
+}
+----
+
+== Blending IK In and Out
+
+There is one more thing we need to handle: the IK system must not fight the animation system when the character is in motion. Consider what happens when a character starts walking. Their feet are lifting off the ground constantly—the animated foot trajectory deliberately lifts each foot through the air during the swing phase. If we run foot placement IK during the swing, the IK system will immediately try to pull the foot back down to the ground, preventing it from lifting at all.
+
+The solution is to gate the IK correction with a **plant weight**: a per-foot value that is 1.0 when the foot should be firmly planted and 0.0 when the foot is in the air. The plant weight transitions smoothly between these extremes.
+
+The plant weight can be derived from the animation itself. Most animation systems annotate walk cycles with **foot contact events**—keyframe-triggered events that mark the moments when each foot makes and breaks contact with the ground. If your animation tool generates these events, listen for them to drive the plant weight.
+
+If foot contact events are not available, you can derive the plant weight heuristically: when the foot's animated velocity (the rate of change of its world position across frames) drops below a threshold, treat the foot as planted. This works reasonably well for walk cycles but can fail for faster gaits.
+
+Once you have the plant weight, apply it as a blend between the animated position and the IK-corrected position:
+
+[source,cpp]
+----
+// Blend the animated foot position with the IK target according to the plant weight.
+// glm::mix(a, b, w) yields a at w = 0 and b at w = 1: 0.0 keeps the animation, 1.0 uses full IK.
+glm::vec3 blended_target = glm::mix(
+    animated_foot_position,
+    foot_data.target_position,
+    foot_data.ik_weight
+);
+----
+
+This makes the foot placement system a layer that enhances the base animation rather than overriding it. When the foot is planted, IK brings it to precise contact with the terrain. When the foot is in the air, IK gracefully fades out, letting the animation play undisturbed.
+
+xref:Advanced_glTF/Procedural_Animation_IK/03_fabrik.adoc[Previous: FABRIK] | xref:Advanced_glTF/Procedural_Animation_IK/05_look_at.adoc[Next: Look-At Controllers]
diff --git a/en/Advanced_glTF/Procedural_Animation_IK/05_look_at.adoc b/en/Advanced_glTF/Procedural_Animation_IK/05_look_at.adoc
new file mode 100644
index 000000000..3b37cd033
--- /dev/null
+++ b/en/Advanced_glTF/Procedural_Animation_IK/05_look_at.adoc
@@ -0,0 +1,198 @@
+:pp: {plus}{plus}
+= Look-At Controllers
+
+== What a Look-At Controller Does
+
+A look-at controller is a procedural system that rotates one or more joints—typically the neck and head—so that the character appears to be looking at a specific point in the world. Unlike foot placement IK, which corrects the position of an end effector, a look-at controller is purely rotational: it answers the question "what rotation must these joints have so that the eye direction points at the target?"
+
+The application domain is wide. An enemy soldier turning their head to track a player who is moving across their line of sight. A shop NPC whose head follows the player character as they walk through a market. A sniper character whose neck and upper spine slowly rotate toward a distant target. A horse whose head turns toward an interesting sound. In all of these cases, the base body animation continues playing—the character walks, talks, breathes—while the look-at system independently rotates the head joints.
+
+Look-at controllers can be implemented as a special case of IK (the "effector" is the eye direction rather than a position), or as a direct rotation computation. We will implement the direct approach because it is simpler, more controllable, and sufficient for the vast majority of practical use cases.
+
+== Understanding the Coordinate Space Problem
+
+The tricky part of a look-at controller is not the math—the math is simple—but getting the coordinate spaces right. Every joint has a local coordinate system defined by the glTF hierarchy. The "forward" direction of the head joint is not necessarily the world Y or Z axis; it depends on how the artist set up the rig in Blender. You need to know which local axis of the head joint points "forward" (toward the face) and which points "up," because these are the axes you will be aligning.
+
+In a standard Blender humanoid rig following common conventions, the head's local +Y axis typically points upward (toward the crown of the head) and the +Z axis points forward (toward the nose). But this is a convention, not a law—you should verify it for your specific rig by inspecting the joint orientation in Blender. We will parameterize the forward and up axes in our look-at implementation so that it works regardless of rig convention.
+
+== The Look-At Rotation
+
+Given the head joint's current world position and orientation, and a target world position, the look-at rotation is computed as follows:
+
+1. Compute the desired forward direction: `desired_forward = normalize(target - head_world_pos)`.
+2. Extract the current forward direction: `current_forward = normalize(head_world_rotation * local_forward_axis)`.
+3. Compute the rotation from current to desired: this is the same "rotate one vector onto another" quaternion we used in CCD.
+4. Clamp the angle to the allowed range, scale it by the blend weight, and apply the resulting rotation to the joint's local rotation in the scene graph, converting from world space back to parent space the same way we did in the IK solver.
+
+Let's implement this as a reusable function:
+
+[source,cpp]
+----
+struct LookAtController {
+    uint32_t  head_node_idx;           // The primary head/neck joint (index into the node array)
+    glm::vec3 local_forward_axis;      // The "forward" axis in the joint's local space
+    glm::vec3 local_up_axis;           // The "up" axis in the joint's local space (fallback rotation axis)
+    float     max_angle_degrees;       // Cap on the turn applied per call, measured from the current facing
+    float     weight;                  // Blend weight: 0 = no look-at, 1 = full
+};
+
+// Rotates the head joint so that its forward axis turns toward `target_world`.
+//   nodes        - scene-graph nodes; world matrices are read as they are on entry
+//   controller   - joint index, local axes, angle cap and blend weight
+//   target_world - point to look at, in world space
+// The turn is capped at controller.max_angle_degrees, then scaled by controller.weight.
+// Leaves the joint untouched when the target sits on it or is already in view.
+void apply_look_at(
+    std::vector<Node>&     nodes,
+    const LookAtController& controller,
+    const glm::vec3&        target_world)
+{
+    Node& head = nodes[controller.head_node_idx];
+
+    // Unit direction from the head joint to the target, in world space.
+    const glm::vec3 head_pos = glm::vec3(head.world_matrix[3]);
+    glm::vec3       aim_dir  = target_world - head_pos;
+    const float     aim_len  = glm::length(aim_dir);
+    if (aim_len < 1e-4f) return; // Target sits on the joint: there is no direction to face
+    aim_dir /= aim_len;
+
+    // Direction the head faces right now, in world space.
+    const glm::quat head_rot = head.get_world_rotation();
+    const glm::vec3 facing   = head_rot * controller.local_forward_axis;
+
+    // Cosine of the angle still to cover; clamped so rounding cannot push acos out of range.
+    const float cos_angle = glm::clamp(glm::dot(facing, aim_dir), -1.0f, 1.0f);
+    if (cos_angle > 0.9999f) return; // Already looking at target
+
+    // Cap the turn first so the head cannot twist all the way around, then scale by the
+    // blend weight (0.5 turns halfway), which lets the look-at fade in and out.
+    const float max_turn = glm::radians(controller.max_angle_degrees);
+    const float turn     = std::min(std::acos(cos_angle), max_turn) * controller.weight;
+    if (turn < 1e-5f) return;
+
+    // A vanishing cross product here means the vectors are anti-parallel (the aligned case
+    // returned above). Any perpendicular axis would do; the joint's up axis is used.
+    const glm::vec3 cross_axis = glm::cross(facing, aim_dir);
+    const glm::vec3 turn_axis  = (glm::length(cross_axis) < 1e-6f)
+        ? head_rot * controller.local_up_axis
+        : glm::normalize(cross_axis);
+
+    // Apply the correction on top of the current world rotation.
+    const glm::quat corrected_rot = glm::angleAxis(turn, turn_axis) * head_rot;
+
+    // local_rotation is relative to the parent, so strip the parent's world rotation.
+    const glm::quat parent_rot = (head.parent_index == INVALID_NODE_INDEX)
+        ? glm::quat(1, 0, 0, 0)
+        : nodes[head.parent_index].get_world_rotation();
+
+    head.local_rotation = glm::inverse(parent_rot) * corrected_rot;
+    head.mark_dirty();
+    update_world_matrices_subtree(nodes, controller.head_node_idx);
+}
+----
+
+== Distributing the Rotation Across Multiple Joints
+
+A single-joint look-at controller applied only to the head joint works but does not look natural. A real person does not rotate only their head to look at something—they also rotate their neck, and for large angular differences, they rotate their upper spine. The look-at rotation should be **distributed** across the joint chain from spine to head, with each joint taking a share of the total rotation.
+
+This distribution serves two purposes. First, it avoids the mechanical look of a head that rotates on a fixed neck—humans have a flexible neck that participates in the rotation. Second, it allows the total look-at angle to exceed the head's individual maximum range of motion, since multiple joints each contribute a smaller part of the total.
+
+Distributing the rotation is straightforward: instead of applying the full angle to the head joint, apply a fraction of it to each joint in the chain, from spine to head. The fractions should sum to one and can be weighted however you like—typically the head gets the largest share and the spine gets the smallest:
+
+[source,cpp]
+----
+struct LookAtChain {
+    std::vector<uint32_t> joints;      // Ordered from root (spine) to tip (head); joints.back() is treated as the head
+    std::vector<float>    weights;     // Per-joint share of the total rotation, indexed like joints (must sum to 1)
+    glm::vec3 local_forward_axis;      // Forward axis in the *head joint's* local space
+    glm::vec3 local_up_axis;           // Up axis in the *head joint's* local space (fallback rotation axis)
+    float     max_total_angle_degrees; // Maximum total look-at angle across all joints
+    float     blend_weight;            // Global blend weight (0 = none, 1 = full)
+};
+
+// Turns the chain's last joint (the head) toward `target_world`, spreading the rotation over the
+// chain by chain.weights. Joints must be ordered parent-before-child so that the shares add up.
+void apply_look_at_chain(
+    std::vector<Node>&    nodes,
+    const LookAtChain&    chain,
+    const glm::vec3&      target_world)
+{
+    if (chain.joints.empty()) return;
+    uint32_t head_idx = chain.joints.back();
+
+    // Compute the total desired rotation (same as single-joint look-at)
+    glm::vec3 head_world_pos  = glm::vec3(nodes[head_idx].world_matrix[3]);
+    glm::vec3 to_target       = target_world - head_world_pos;
+    float dist = glm::length(to_target);
+    if (dist < 1e-4f) return; // Target sits on the head joint: no direction to face
+    to_target /= dist;
+
+    glm::quat head_world_rot   = nodes[head_idx].get_world_rotation();
+    glm::vec3 current_forward  = head_world_rot * chain.local_forward_axis;
+
+    float dot   = glm::clamp(glm::dot(current_forward, to_target), -1.0f, 1.0f);
+    float angle = std::acos(dot);
+    if (angle < 1e-5f) return;
+
+    // Cap the total turn, then scale it by the global blend weight.
+    float total_angle = std::min(angle, glm::radians(chain.max_total_angle_degrees)) * chain.blend_weight;
+    if (total_angle < 1e-5f) return;
+
+    // Anti-parallel vectors give a zero cross product; fall back to the head's up axis.
+    glm::vec3 rotation_axis = glm::cross(current_forward, to_target);
+    rotation_axis = (glm::length(rotation_axis) < 1e-6f)
+        ? head_world_rot * chain.local_up_axis
+        : glm::normalize(rotation_axis);
+
+    // Distribute the rotation across the chain. Each correction is applied in world space and
+    // the subtree refreshed, so a child starts from its parent's already-corrected pose.
+    for (size_t i = 0; i < chain.joints.size(); ++i) {
+        // A joint without a weight entry gets no share rather than an out-of-bounds read.
+        float joint_angle = total_angle * (i < chain.weights.size() ? chain.weights[i] : 0.0f);
+        if (joint_angle < 1e-5f) continue;
+        Node& joint = nodes[chain.joints[i]];
+        glm::quat world_correction = glm::angleAxis(joint_angle, rotation_axis);
+        glm::quat new_world_rot    = world_correction * joint.get_world_rotation();
+        glm::quat parent_world_rot = (joint.parent_index != INVALID_NODE_INDEX)
+            ? nodes[joint.parent_index].get_world_rotation()
+            : glm::quat(1, 0, 0, 0);
+
+        joint.local_rotation = glm::inverse(parent_world_rot) * new_world_rot;
+        joint.mark_dirty();
+        update_world_matrices_subtree(nodes, chain.joints[i]);
+    }
+}
+----
+
+A typical weight distribution for a three-joint neck-to-head chain might be `[0.2, 0.3, 0.5]`: the lower neck contributes 20% of the rotation, the upper neck contributes 30%, and the head contributes 50%. These values are aesthetic choices—evaluate them visually and adjust until the motion looks natural for your character.
+
+== Smoothing the Look-At Target
+
+Just like foot placement, a look-at controller needs temporal smoothing or the character's head will snap directly to a new target position every frame. The smoothing is the same exponential moving average approach: maintain a smoothed target position that chases the raw target each frame:
+
+[source,cpp]
+----
+struct LookAtState {
+    glm::vec3 smoothed_target;  // The temporally-smoothed look-at target; unset until initialized is true
+    bool initialized = false;   // Flipped to true by the first update_look_at_target call
+};
+
+// Eases state.smoothed_target toward `raw_target` by a speed * delta_time fraction of the gap
+// (clamped to [0, 1]). The first call adopts `raw_target` outright and marks the state initialized.
+void update_look_at_target(
+    const glm::vec3& raw_target,
+    float            speed,      // Higher = faster tracking; typical range 3–15
+    float            delta_time,
+    LookAtState&     state)
+{
+    const bool first_call = !state.initialized; // Nothing to ease from yet
+    state.initialized     = true;
+    const float blend     = glm::clamp(speed * delta_time, 0.0f, 1.0f);
+    state.smoothed_target = first_call ? raw_target
+                                       : glm::mix(state.smoothed_target, raw_target, blend);
+}
+----
+
+The `speed` parameter controls the "stiffness" of the head tracking. A speed of 3 produces slow, languid tracking suitable for a character casually watching something in the distance. A speed of 12 produces snappy tracking suitable for an alert soldier scanning for threats. Varying the speed based on the character's state—calm versus alert—adds a subtle layer of expressiveness that pays dividends in cutscenes.
+
+xref:Advanced_glTF/Procedural_Animation_IK/04_foot_placement.adoc[Previous: Foot Placement on Uneven Terrain] | xref:Advanced_glTF/Procedural_Animation_IK/06_physics_driven_lean.adoc[Next: Physics-Driven Lean]
diff --git a/en/Advanced_glTF/Procedural_Animation_IK/06_physics_driven_lean.adoc b/en/Advanced_glTF/Procedural_Animation_IK/06_physics_driven_lean.adoc
new file mode 100644
index 000000000..7438f7b26
--- /dev/null
+++ b/en/Advanced_glTF/Procedural_Animation_IK/06_physics_driven_lean.adoc
@@ -0,0 +1,163 @@
+:pp: {plus}{plus}
+= Physics-Driven Procedural Lean
+
+== Why Characters Need to Lean
+
+The single most common visual artifact in third-person action games—visible to any player who knows what to look for—is characters that run around corners in a perfectly vertical stance. In reality, when a person (or any object with mass) turns, inertia pushes them outward and they lean inward to compensate. A bicycle rider leaning hard into a turn. A sprinter's whole body angled forward. A soldier pivoting sharply to respond to a sound. All of these involve a characteristic tilt of the torso that we read as "momentum" and "weight."
+
+When this lean is missing, characters feel weightless. They look like they are gliding on rails rather than moving through a world that resists them. Adding even a small amount of physically-derived tilt to a character—particularly during acceleration, deceleration, and turning—dramatically improves the sense of mass and presence.
+
+The good news is that this does not require a full physics simulation. The lean angle is a function of the character's current velocity vector and its rate of change, both of which we can compute from the physics simulation's output (or from the character controller, if you have one). We can then apply that lean as a procedural rotation to the spine joints, layered on top of the base animation.
+
+== The Physics of Leaning
+
+Let's understand what lean actually is before we compute it. When a character accelerates forward, inertia resists the change in velocity. From the character's reference frame, this feels like a backward force—which is why you lean forward when accelerating into a run and slightly backward when braking hard. The lean angle is determined by the ratio of the horizontal acceleration to the vertical gravitational acceleration:
+
+`lean_angle = atan2(horizontal_acceleration, gravity)`
+
+For a character accelerating at 5 m/s² on a planet with standard gravity (9.8 m/s²), the lean angle is approximately `atan2(5, 9.8) ≈ 27 degrees`. That is a substantial lean—the kind you see in sprinters—and reflects the fact that aggressive acceleration requires a significant forward tilt to keep the body's center of mass over the feet.
+
+For turning, the physics are slightly different. When a character moves in a circle, centripetal acceleration pulls them toward the center of the turn. The lean required to counteract this is:
+
+`lean_angle = atan2(v² / r, gravity)`
+
+where `v` is the speed and `r` is the turn radius. This is why a tighter turn at the same speed requires more lean, and why faster turns require more lean than slower ones.
+
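+In practice, we compute the acceleration vector from the character's velocity history. Because velocity is a vector, a change of direction at constant speed shows up in that acceleration as the centripetal term, so both effects are captured by the same calculation. We then combine the acceleration with gravity into a single "effective gravity" vector and tilt the spine to align with the negative of that vector.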
+
+== Computing the Lean Vector
+
+We need the character's velocity and the change in velocity (acceleration) over the last frame. If you are using a physics engine, both of these are available from the character controller's rigid body. If you are using a kinematic character controller (one that you move directly), you will need to track the previous velocity and compute the delta yourself:
+
+[source,cpp]
+----
+// Per-character motion data consumed by the lean computation.
+// Members are value-initialized so that a freshly constructed state describes
+// a character at rest with no lean, instead of holding indeterminate values.
+struct CharacterMotionState {
+    glm::vec3 velocity{0.0f};           // Current world-space velocity
+    glm::vec3 previous_velocity{0.0f};  // Velocity from last frame
+    // Low-pass filtered lean direction. Starts pointing straight down, which
+    // corresponds to "no lean".
+    glm::vec3 smoothed_lean_vec{0.0f, -1.0f, 0.0f};
+};
+
+// Compute the "effective downward" direction that the character perceives
+// due to their motion. The spine's up axis should point along the NEGATIVE of
+// this direction (anti-parallel to it), so zero acceleration means no lean.
+//
+//   motion        - current and previous-frame velocity
+//   gravity       - positive magnitude (e.g. 9.8)
+//   delta_time    - seconds between the two velocity samples
+//   lean_strength - scale factor; 1.0 = physically accurate
+//
+// Returns a unit vector. Falls back to straight down (0, -1, 0) when
+// delta_time is not positive or when the result would be degenerate.
+glm::vec3 compute_lean_vector(
+    const CharacterMotionState& motion,
+    float                       gravity,     // Positive value (e.g. 9.8)
+    float                       delta_time,
+    float                       lean_strength) // Scale factor; 1.0 = physically accurate
+{
+    const glm::vec3 straight_down(0.0f, -1.0f, 0.0f);
+
+    // A paused or zero-length frame has no defined acceleration; dividing by
+    // delta_time would yield inf/NaN. The negated comparison also rejects NaN.
+    if (!(delta_time > 0.0f)) return straight_down;
+
+    // Linear acceleration from velocity change
+    glm::vec3 accel = (motion.velocity - motion.previous_velocity) / delta_time;
+
+    // For lean, we care about the horizontal component of acceleration.
+    // Vertical acceleration (e.g., falling) should not tilt the spine.
+    glm::vec3 horizontal_accel(accel.x, 0, accel.z);
+
+    // The effective gravity that the character "feels" in their reference frame
+    // is the sum of real gravity (downward) and the reaction to their acceleration
+    // (opposite to their acceleration direction).
+    // We negate horizontal_accel because the reaction opposes the acceleration.
+    glm::vec3 effective_gravity(
+        -horizontal_accel.x * lean_strength,
+        -gravity,
+        -horizontal_accel.z * lean_strength
+    );
+
+    // glm::normalize() of a zero-length vector produces NaN (e.g. gravity == 0
+    // with no horizontal acceleration), so handle that case explicitly.
+    const float length_sq = glm::dot(effective_gravity, effective_gravity);
+    if (!(length_sq > 1e-12f)) return straight_down;
+
+    return glm::normalize(effective_gravity);
+}
+----
+
+The `lean_strength` parameter is a tuning knob. A value of 1.0 gives you physically accurate lean angles, which may be too extreme for a game character (27 degrees of lean looks natural on a sprinting athlete but might look exaggerated on a game character who is also waving a sword). Values between 0.3 and 0.7 typically look best for game characters—enough to convey weight and momentum without looking like the character is about to fall over.
+
+== Applying the Lean to the Spine
+
+With the lean vector computed, we apply it as a spine rotation. The spine's "up" axis (the direction the torso points) should align with the negative of the effective gravity vector—in other words, the spine should tilt opposite to the perceived gravitational pull.
+
+We distribute the lean across the spine joints using the same approach as the look-at chain: each joint takes a fraction of the total tilt angle, with the lower spine joints contributing more than the upper ones (the lean originates from the hips and is transmitted upward through the spine):
+
+[source,cpp]
+----
+// Configuration for the spine lean applied by apply_physics_lean().
+// spine_joints[i] is paired with joint_weights[i], so both vectors are
+// expected to have the same length.
+// NOTE(review): local_up_axis is not read by apply_physics_lean() as shown in
+// this section — confirm whether another consumer uses it.
+struct LeanController {
+    std::vector<uint32_t> spine_joints;   // Ordered from pelvis to upper spine
+    std::vector<float>    joint_weights;  // Per-joint share (should sum to 1)
+    glm::vec3 local_up_axis;             // The joint's "up" in local space
+    float     max_lean_degrees;          // Safety cap on total lean
+    float     smoothing_speed;           // How quickly lean responds to velocity changes
+};
+
+// Tilts the spine joints so the torso lines up with the negative of the
+// smoothed effective-gravity direction.
+//
+//   nodes             - scene-graph nodes; spine joint rotations are rewritten in place
+//   controller        - joint chain, per-joint weights, lean cap and smoothing speed
+//   motion            - velocity samples forwarded to compute_lean_vector()
+//   gravity           - positive magnitude (e.g. 9.8)
+//   delta_time        - frame time in seconds
+//   lean_strength     - forwarded to compute_lean_vector()
+//   smoothed_lean_vec - persistent filtered lean direction, owned by the caller
+//                       and updated here on every frame where time advanced
+void apply_physics_lean(
+    std::vector<Node>&          nodes,
+    const LeanController&       controller,
+    const CharacterMotionState& motion,
+    float                       gravity,
+    float                       delta_time,
+    float                       lean_strength,
+    glm::vec3&                  smoothed_lean_vec) // persistent state
+{
+    // The world "up" direction, and the direction gravity points with no lean
+    const glm::vec3 world_up(0, 1, 0);
+    const glm::vec3 world_down = -world_up;
+
+    // A zeroed (or NaN) persistent vector carries no direction. Treat it as
+    // "no lean" so it is not misread as a 90-degree tilt by the acos() below.
+    const float state_len_sq = glm::dot(smoothed_lean_vec, smoothed_lean_vec);
+    smoothed_lean_vec = (state_len_sq > 1e-12f) ? glm::normalize(smoothed_lean_vec)
+                                                : world_down;
+
+    // Only advance the filter when time actually passed. With delta_time == 0
+    // the acceleration is undefined, and blending an inf/NaN sample into the
+    // persistent state would corrupt it for every following frame.
+    if (delta_time > 0.0f) {
+        // Compute the raw lean vector from this frame's physics data
+        glm::vec3 raw_lean_vec = compute_lean_vector(motion, gravity, delta_time, lean_strength);
+
+        // Smooth it over time to prevent jittery lean from noisy acceleration data
+        float t = glm::clamp(controller.smoothing_speed * delta_time, 0.0f, 1.0f);
+        glm::vec3 blended = glm::mix(smoothed_lean_vec, raw_lean_vec, t);
+
+        // Keep the previous direction if the blend cancels out or is NaN.
+        if (glm::dot(blended, blended) > 1e-12f) {
+            smoothed_lean_vec = glm::normalize(blended);
+        }
+    }
+
+    // The lean angle is the angle between straight down and the smoothed lean vector
+    float lean_angle = std::acos(glm::clamp(glm::dot(smoothed_lean_vec, world_down), -1.0f, 1.0f));
+    if (lean_angle < glm::radians(0.5f)) return; // Less than half a degree; skip
+
+    // Clamp to the maximum allowed lean
+    lean_angle = std::min(lean_angle, glm::radians(controller.max_lean_degrees));
+
+    // The axis around which we lean is perpendicular to both world down and the lean vector
+    glm::vec3 lean_axis = glm::cross(world_down, smoothed_lean_vec);
+    // The lean vector is (anti-)parallel to world down, so no tilt axis exists
+    if (glm::length(lean_axis) < 1e-6f) return;
+    lean_axis = glm::normalize(lean_axis);
+
+    // Distribute the lean across the spine joints. Iterate only over entries
+    // that have both a joint index and a weight, so mismatched vectors cannot
+    // cause an out-of-bounds read.
+    const size_t joint_count =
+        std::min(controller.spine_joints.size(), controller.joint_weights.size());
+    for (size_t i = 0; i < joint_count; ++i) {
+        uint32_t joint_idx   = controller.spine_joints[i];
+        float    joint_angle = lean_angle * controller.joint_weights[i];
+        if (joint_idx >= nodes.size()) continue; // Stale or invalid joint reference
+        if (joint_angle < 1e-5f) continue;
+
+        Node& joint = nodes[joint_idx];
+        // Apply this joint's share of the tilt in world space...
+        glm::quat world_correction = glm::angleAxis(joint_angle, lean_axis);
+        glm::quat new_world_rot    = world_correction * joint.get_world_rotation();
+
+        glm::quat parent_world_rot = (joint.parent_index != INVALID_NODE_INDEX)
+            ? nodes[joint.parent_index].get_world_rotation()
+            : glm::quat(1, 0, 0, 0);
+
+        // ...then convert the result back into the parent's space.
+        joint.local_rotation = glm::inverse(parent_world_rot) * new_world_rot;
+        joint.mark_dirty();
+        // Refresh descendants now so the next joint in the chain reads a world
+        // rotation that already includes this joint's tilt.
+        update_world_matrices_subtree(nodes, joint_idx);
+    }
+}
+----
+
+== Combining Lean with Look-At and IK
+
+At this point, we have three procedural layers that all modify the same skeleton: foot placement IK (which adjusts the pelvis height and leg chains), look-at (which rotates the neck and head joints), and physics-driven lean (which tilts the spine joints). All three need to play together without interference.
+
+The key is the **order of application**. Procedural corrections to the scene graph are applied sequentially, and each layer builds on the result of the previous one. The correct order for a typical humanoid character is:
+
+1. **Play the base animation.** This sets all joint local rotations from keyframe data. The scene graph now reflects the animation pose.
+
+2. **Propagate world matrices.** Run the dirty-flag update to compute world matrices for all nodes.
+
+3. **Apply foot placement IK.** This adjusts the pelvis translation and leg chain rotations. The foot positions are now correct relative to the terrain.
+
+4. **Propagate world matrices again.** The leg corrections changed world matrices for all leg descendants; we need fresh world matrices before proceeding.
+
+5. **Apply physics-driven lean.** This tilts the spine joints. The lean is applied on top of the animation and foot corrections.
+
+6. **Propagate world matrices again.**
+
+7. **Apply look-at.** This rotates the neck and head. By applying look-at last, it can respond to the spine's lean-adjusted orientation—which is what a real neck does when the body is tilted.
+
+8. **Final world matrix propagation.** The scene graph is now ready for the skinning compute shader.
+
+Performing multiple world matrix propagation passes each frame may seem expensive. In practice, for a humanoid character with a skeleton of ~60 joints, each propagation pass is a handful of matrix multiplications per joint—easily done in microseconds. The total cost is negligible compared to the draw calls and compute dispatches that follow.
+
+xref:Advanced_glTF/Procedural_Animation_IK/05_look_at.adoc[Previous: Look-At Controllers] | xref:Advanced_glTF/Procedural_Animation_IK/07_conclusion.adoc[Next: Conclusion]
diff --git a/en/Advanced_glTF/Procedural_Animation_IK/07_conclusion.adoc b/en/Advanced_glTF/Procedural_Animation_IK/07_conclusion.adoc
new file mode 100644
index 000000000..a2d6cd3f5
--- /dev/null
+++ b/en/Advanced_glTF/Procedural_Animation_IK/07_conclusion.adoc
@@ -0,0 +1,41 @@
+:pp: {plus}{plus}
+= Procedural Animation: Summary & What's Next
+
+== What We Built
+
+Over the course of this chapter we moved from a character that plays back artist-authored animations faithfully, to a character that adapts to its environment in real time. The distance between these two things—while it can be measured in lines of code—is far more significant in terms of what players perceive. A character that steps correctly over rocks, turns its head to watch something interesting, and leans naturally into a corner feels *present* in the game world in a way that a purely keyframed character does not.
+
+Let's review what we implemented. We started with the two fundamental IK algorithms: CCD and FABRIK. CCD works joint by joint, rotating each one to bring the end effector incrementally closer to the target, and converges quickly for short chains with tight joint limits. FABRIK thinks in terms of positions rather than rotations, redistributes the chain more evenly, and tends to look more natural for longer chains and full-arm reaching. Both integrate with the scene graph's dirty-flag system: they read world matrices, compute corrections, and write back local rotations.
+
+We then applied CCD to foot placement—the most universally impactful IK application for humanoid characters. Foot placement required us to solve three problems beyond the basic IK: body height adjustment (pelvis must move down to accommodate uneven terrain), foot rotation (the foot must align to the terrain normal, not just reach the right height), and temporal smoothing (raw IK targets snap; smooth targets feel grounded). We also covered the plant weight system that gates IK correction based on whether the foot is in a planted or swing phase, ensuring the IK enhances rather than overrides the base animation.
+
+The look-at controller introduced a purely rotational procedural technique. By computing the angle between the head's current forward direction and the direction to a target, and distributing that angle across a spine-to-head joint chain, we achieve natural head-and-neck tracking. The smoothed target system ensures the tracking responds with an appropriate sense of inertia—fast for an alert character, slow for a relaxed one.
+
+Finally, the physics-driven lean system connects the character's movement physics to its body posture. By computing the effective gravity vector from the character's acceleration, we derive a lean angle that is applied across the spine joints. The result is a character whose body communicates velocity, weight, and direction of travel without a single artist-authored lean animation.
+
+== Tradeoffs to Remember
+
+Every system we built this chapter involves a tradeoff, and it is worth naming them explicitly.
+
+CCD is simpler to implement correctly than FABRIK but can produce biased poses for long chains or extreme reaches. FABRIK is more visually balanced but requires the additional step of converting solved positions back to joint rotations, which introduces complexity around coordinate spaces. For the foot chains typical in humanoid characters, CCD is usually the right default; for spines and full-arm reach, prefer FABRIK.
+
+Foot placement with raycasts and pelvis adjustment looks significantly better than no foot placement, but it introduces a latency between terrain change and character adaptation (the smoothing time constant). Light smoothing (a high smoothing speed) reduces the latency but increases the chance of visible snapping. Heavy smoothing (a low smoothing speed) feels more natural but means the character's feet will float momentarily after stepping off a ledge. Profile this on your target terrain and tune accordingly.
+
+Look-at controllers based on direct rotation work well for moderate angular ranges—perhaps up to 60 or 70 degrees from the rest pose. Beyond that, distributing the rotation across multiple joints helps, but you will hit the limits of what looks natural. For characters that need to look behind them, you typically need to animate a "turn to look" animation that physically turns the body, rather than trying to stretch the look-at angle to 180 degrees.
+
+Physics-driven lean is purely heuristic. Physically correct lean angles (derived from `atan2(acceleration, gravity)`) are often visually too extreme for game characters who are simultaneously performing other exaggerated motions. The `lean_strength` tuning parameter exists precisely because physical accuracy and visual plausibility are not the same thing. Trust your eyes over the formula.
+
+== What Comes Next
+
+In Chapter 6 we turn our attention to **Morph Targets and Facial Animation**. Everything we have done so far has concerned skeletal animation—the hierarchical joint system that deforms the body. Morph targets are a fundamentally different representation: instead of rotating joints, they store per-vertex displacement vectors that blend between facial expressions, lip sync poses, and other continuous shape changes.
+
+Morph targets present a different set of challenges. The vertex count is large—a high-quality face mesh may have tens of thousands of vertices—and the number of active morph targets can be substantial in a cutscene with heavy facial animation. We will use Vulkan 1.4's descriptor indexing to handle the morph target displacement buffers without per-draw descriptor updates, and integrate the morph results with the skinning compute pipeline we built in Chapter 3.
+
+== Verification: What to Look For
+
+To verify your IK implementation:
+
+1.  **Convergence**: In CCD or FABRIK, monitor the error distance over iterations. It should decrease monotonically. If it oscillates or increases, your joint rotation logic (specifically the parent-to-local space conversion) is likely wrong.
+2.  **Stability**: Test your IK solver with targets that are out of reach. The chain should extend fully toward the target without jittering or spinning wildly.
+3.  **Pelvis Adjustment**: Ensure the character's hips lower correctly when standing on a steep slope. If they do not, the legs will appear over-extended or "stiff."
+
+xref:Advanced_glTF/Procedural_Animation_IK/06_physics_driven_lean.adoc[Previous: Physics-Driven Lean] | xref:Advanced_glTF/Morph_Targets_Facial_Animation/01_introduction.adoc[Next: Morph Targets & Facial Animation]
diff --git a/en/Advanced_glTF/Scene_Graph_Hierarchy/01_introduction.adoc b/en/Advanced_glTF/Scene_Graph_Hierarchy/01_introduction.adoc
new file mode 100644
index 000000000..c6d936def
--- /dev/null
+++ b/en/Advanced_glTF/Scene_Graph_Hierarchy/01_introduction.adoc
@@ -0,0 +1,46 @@
+= The Scene Graph & Transform Hierarchy
+
+== Unifying the World
+
+In the "Building a Simple Engine" series, we implemented a hybrid scene architecture. We maintained a flat collection of `ObjectInstance` entities in our world, each of which contained a `Model` that possessed its own internal glTF node hierarchy. This was a perfect middle ground for learning: it allowed us to load complex glTF assets while keeping the top-level engine logic simple.
+
+However, as we move into advanced character animation and physics, this separation becomes a barrier. A character isn't just a self-contained model; it is a collection of interacting systems. Their hand might be a node that needs to grasp a separate sword entity. Their feet might be nodes that must interact with the terrain's physics collider. To support these cross-entity interactions seamlessly, we need to move to a **Unified Scene Graph**.
+
+In this chapter, we will dissolve the boundary between "the world" and "the model." We will transition our engine to a single, global recursive node system where every object—whether it's a 200-bone character, a static building, or a light source—is simply a `Node` in a universal hierarchy.
+
+== The Performance Barrier: Redundant Computation
+
+The "Simple Engine" used a straightforward recursive traversal to calculate world-space transformation matrices:
+
+[source,cpp]
+----
+// Simple Engine-style traversal: each call recomputes the world matrix of
+// every node in `nodes` from the supplied parent matrix. The result lives in a
+// local variable, so nothing is cached between calls.
+void renderNode(const std::vector<Node*>& nodes, const glm::mat4& parentMatrix) {
+    for (const auto node : nodes) {
+        // World matrix = parent's world matrix * this node's local matrix
+        glm::mat4 nodeMatrix = parentMatrix * node->getLocalMatrix();
+        // ... render and recurse ...
+    }
+}
+----
+
+While elegant, this approach has a major performance flaw: it recalculates the world matrix for every node, every single frame, regardless of whether anything actually moved. In a scene with thousands of nodes—common in modern character-heavy environments—this results in a massive waste of CPU cycles and memory bandwidth.
+
+To solve this, we will implement the **Dirty Flag Pattern**. Each node will track whether its local transform or its parent's world transform has changed. We will only perform the expensive matrix multiplications when a node is "dirty." This optimization is the difference between an engine that chokes on a few characters and one that can handle a whole crowd.
+
+== Bridging Animation and Physics
+
+The Scene Graph is the architectural "nervous system" of our character pipeline. It is where the mathematical theory of skeletal animation meets the physical reality of the engine world.
+
+A character's skeleton is, at its heart, just a specialized scene graph. By building a robust, optimized hierarchy first, we create a system that can handle both simple parent-child attachments and complex skeletal structures with the same underlying logic. Furthermore, this hierarchy provides the hooks for our physics engine. We will attach collision proxies (capsules, boxes) directly to nodes in our hierarchy. When the animation system moves a bone node, the physics engine's collider follows automatically. This bi-directional syncing is what allows a character's arm to react to collisions while it's in the middle of a swing.
+
+== What We'll Build
+
+In this chapter, we will lay the foundational "plumbing" for our advanced character pipeline across four key areas:
+
+*   **The Engine Expansion**: We'll refactor our entity management into a global, recursive `Node` system. We'll implement the Dirty Flag optimization and explore a more data-oriented layout to improve cache locality.
+*   **Metadata & Physics Extras**: We'll explore the use of glTF "extras" to automate our physics setup. We'll leverage custom properties defined in 3D tools to automatically generate colliders and physical properties within our engine.
+*   **Physics Syncing Theory**: We'll establish the mathematical link between our visual nodes and our physics rigid bodies, preparing us for the Kinematic-to-Dynamic handoff (Ragdolls) later in the series.
+*   **The Transformation Pipeline**: We'll refine our math to handle inverse transforms and global-to-local conversions, which are the prerequisites for Inverse Kinematics (IK).
+
+Let's begin by expanding our engine's core and building a scene graph that is ready for the rigors of modern character animation.
+
+xref:Advanced_glTF/introduction.adoc[Previous: Advanced glTF Introduction] | xref:Advanced_glTF/Scene_Graph_Hierarchy/02_engine_expansion.adoc[Next: The Engine Expansion]
diff --git a/en/Advanced_glTF/Scene_Graph_Hierarchy/02_engine_expansion.adoc b/en/Advanced_glTF/Scene_Graph_Hierarchy/02_engine_expansion.adoc
new file mode 100644
index 000000000..04bc38cd9
--- /dev/null
+++ b/en/Advanced_glTF/Scene_Graph_Hierarchy/02_engine_expansion.adoc
@@ -0,0 +1,214 @@
+= The Engine Expansion: Optimizing the Scene Graph
+
+== From Simple Instances to a Global Hierarchy
+
+In the "Building a Simple Engine" series, we managed our world objects using a straightforward `ObjectInstance` vector. Each object had its own position, rotation, and scale, and we rendered them one by one:
+
+[source,cpp]
+----
+// Flat per-object transform record from the "Building a Simple Engine" series.
+// There is no parent link here: each instance carries only its own transform.
+struct ObjectInstance {
+    glm::vec3 position;
+    // NOTE(review): presumably Euler angles — confirm units and axis order
+    // against the Simple Engine source.
+    glm::vec3 rotation;
+    glm::vec3 scale;
+    // ...
+};
+// The whole world is one flat list of these instances.
+std::vector<ObjectInstance> objectInstances;
+----
+
+This approach is perfect for learning the basics of Vulkan, but it creates what we call a **"Shallow World."** In this model, every object is independent. While a model inside an instance could have its own internal glTF hierarchy, the objects in the world couldn't easily interact with each other.
+
+As we move into advanced character animation, this "shallow" architecture becomes a major roadblock. Imagine a character holding a sword. In a shallow world, the sword and the character are two separate entities. If the character's hand moves, we have to manually calculate where the sword should be every single frame. This becomes exponentially more difficult when you add holsters, shields, or even multiple characters riding on a moving vehicle.
+
+To solve this, we must replace our flat list with a **Unified Scene Graph**. In this system, every entity in our engine—whether it's a character's pelvis bone, a sword, or a massive elevator—is a **Node** in a single, global hierarchy. By giving every node an optional `parent_index`, we allow any object to be attached to any other object. When the parent moves, the children follow automatically.
+
+== The Performance Barrier: Redundant Computation
+
+While a recursive hierarchy is architecturally elegant, it introduces a significant performance challenge. In a traditional scene graph, calculating a node's "World Matrix" (its final position in the 3D world) requires multiplying its local transform by its parent's world matrix:
+
+`WorldMatrix = ParentWorldMatrix * LocalMatrix`
+
+In a complex character with a 200-bone skeleton, calculating the world-space position of a fingertip requires traversing the entire hierarchy from the pelvis, through the spine, shoulder, arm, and wrist. If we perform this calculation for every bone, every single frame, we waste a massive amount of CPU cycles—especially if the character isn't even moving!
+
+To achieve professional-grade performance, we will implement the **Dirty Flag Pattern**.
+
+=== Understanding the Dirty Flag
+
+A "Dirty Flag" is a simple but powerful optimization. Instead of blindly recalculating matrices every frame, we add a small "status" variable to each node. When we change a node's position, we mark it as **"Dirty."**
+
+We then only perform the expensive matrix multiplications if a node (or its parent) is dirty. If a node is **"Clean,"** we simply reuse the world matrix we calculated in the previous frame.
+
+[source,cpp]
+----
+// Sentinel index meaning "no node"; a Node whose parent_index holds this value
+// has no parent.
+constexpr uint32_t INVALID_NODE_INDEX = 0xFFFFFFFFu;
+
+// Bit flags stored in Node::status; combine them with bitwise OR.
+enum TransformStatus : uint8_t {
+    Clean      = 0x00,
+    LocalDirty = 0x01,  // This node's position/rotation/scale changed
+    WorldDirty = 0x02   // This node's world matrix needs recalculation
+};
+
+// Description of the physics collider attached to a node. Defaults describe
+// "no collider" (Shape::NONE) with a mass of 1.
+struct ColliderDef {
+    enum class Shape { CAPSULE, BOX, NONE };
+    Shape shape     = Shape::NONE;
+    // NOTE(review): radius / half_height presumably apply to CAPSULE and
+    // box_half_extents to BOX — confirm against the code that builds colliders.
+    float radius    = 0.0f;
+    float half_height = 0.0f;
+    glm::vec3 box_half_extents = {0,0,0};
+    float mass      = 1.0f;
+    // Collision filtering is stored as strings here; how they map onto the
+    // physics engine's layers is not shown in this section.
+    std::string collision_group;
+    std::string collision_mask;
+};
+
+// Description of the joint constraint linking a node to another bone.
+// Defaults describe "no constraint" (Type::NONE) with fully open limits.
+struct ConstraintDef {
+    enum class Type { NONE, BALL_SOCKET, HINGE };
+    Type        type              = Type::NONE;
+    // Angular limits are expressed in degrees (per the _deg suffix).
+    // NOTE(review): swing/twist presumably apply to BALL_SOCKET and the
+    // hinge_* fields to HINGE — confirm against the constraint builder.
+    float       swing_limit_deg   = 180.0f;
+    float       twist_limit_deg   = 180.0f;
+    float       hinge_min_deg     = -180.0f;
+    float       hinge_max_deg     =  180.0f;
+    glm::vec3   hinge_axis        = {0,0,1};
+    // Bone referenced by name; name resolution is not shown in this section.
+    std::string parent_bone;
+};
+
+// A single entry in the unified scene graph. Stores the local transform as
+// separate translation / rotation / scale components plus a cached world
+// matrix, and tracks whether that cache is stale via the status flags.
+struct Node {
+    uint32_t node_index;
+    uint32_t parent_index = INVALID_NODE_INDEX;  // INVALID_NODE_INDEX = no parent
+    std::vector<uint32_t> child_indices;
+    std::string name;
+
+    // Local transform components. They are composed as
+    // Translation * Rotation * Scale (see get_local_matrix()).
+    glm::vec3 translation    = {0,0,0};
+    glm::quat local_rotation = glm::identity<glm::quat>();
+    glm::vec3 scale          = {1,1,1};
+
+    // Cached world matrix - we only update this when dirty!
+    glm::mat4 world_matrix = glm::mat4(1.0f);
+
+    // Bitwise OR of TransformStatus flags
+    uint8_t status = TransformStatus::Clean;
+    bool is_joint = false;
+
+    // Physics metadata (extracted from glTF extras)
+    ColliderDef collider_def;
+    ConstraintDef constraint_def;
+
+    // Call this whenever you change translation, rotation, or scale
+    void mark_dirty() {
+        status |= TransformStatus::LocalDirty | TransformStatus::WorldDirty;
+    }
+
+    // Computes the local transform matrix from the stored components.
+    // The order is TRS (Translation * Rotation * Scale).
+    glm::mat4 get_local_matrix() const {
+        return glm::translate(glm::mat4(1.0f), translation) *
+               glm::mat4_cast(local_rotation) *
+               glm::scale(glm::mat4(1.0f), scale);
+    }
+
+    // Decomposes a TRS matrix into translation, rotation and scale, then
+    // marks the node dirty.
+    // NOTE(review): mirrored matrices (negative determinant) and shear are not
+    // handled; scale is always extracted as a non-negative length.
+    void set_local_matrix(const glm::mat4& m) {
+        translation = glm::vec3(m[3]);
+
+        // Extract scale from the matrix basis vectors
+        glm::mat3 basis(m);
+        scale = glm::vec3(
+            glm::length(basis[0]),
+            glm::length(basis[1]),
+            glm::length(basis[2])
+        );
+
+        // Remove the scale from each axis BEFORE converting to a quaternion.
+        // quat_cast expects a pure rotation matrix; feeding it scaled axes
+        // corrupts the rotation. A zero-length axis is left untouched to
+        // avoid dividing by zero.
+        for (int axis = 0; axis < 3; ++axis) {
+            if (scale[axis] > 1e-8f) {
+                basis[axis] /= scale[axis];
+            }
+        }
+        local_rotation = glm::quat_cast(basis);
+        mark_dirty();
+    }
+
+    // Extracts rotation from the world matrix, stripping any scale.
+    glm::quat get_world_rotation() const {
+        glm::mat3 rot_scale = glm::mat3(world_matrix);
+        glm::mat3 rotation;
+        // We must normalize the axes to remove scale before converting to a quaternion,
+        // otherwise non-uniform scaling will corrupt the rotation data.
+        rotation[0] = glm::normalize(rot_scale[0]);
+        rotation[1] = glm::normalize(rot_scale[1]);
+        rotation[2] = glm::normalize(rot_scale[2]);
+        return glm::quat_cast(rotation);
+    }
+};
+----
+
+This optimization is the difference between an engine that chokes on a few animated characters and one that can handle a whole crowd. By tracking the "dirtiness" of our transforms, we ensure that static environmental objects—which make up the bulk of most scenes—consume almost zero CPU time for transform updates.
+
+== Cache-Friendly Memory: Data-Oriented Design (DOD)
+
+How we store these nodes in memory is just as important as the math we use to update them. Traditional scene graphs often use pointers (e.g., `std::vector<Node*> children`), which leads to **"Pointer Chasing."** When the CPU tries to update the hierarchy, it has to jump to different, unpredictable locations in memory to find each child node. This "random access" is incredibly slow because it misses the CPU's high-speed cache.
+
+To maximize performance, we will store all of our nodes in a **Flat Data-Oriented Array**. Instead of pointers, we use stable indices (`uint32_t`) to link parents and children.
+
+[source,cpp]
+----
+// Owns every node in the world and refreshes their cached world matrices.
+struct SceneGraph {
+    // One contiguous array holds all nodes; parents and children refer to
+    // each other by index into this vector.
+    std::vector<Node> nodes;
+
+    // Single linear sweep over the array. Produces correct results only when
+    // the array is topologically sorted (every parent stored before its children).
+    void update_transforms() {
+        for (Node& current : nodes) {
+            const bool needs_update = (current.status & TransformStatus::WorldDirty) != 0;
+            if (!needs_update) continue;
+
+            const bool has_parent = current.parent_index != INVALID_NODE_INDEX;
+            current.world_matrix = has_parent
+                ? nodes[current.parent_index].world_matrix * current.get_local_matrix()
+                : current.get_local_matrix();
+
+            // Push the dirty state down so children are refreshed later in this sweep.
+            for (uint32_t child : current.child_indices) {
+                nodes[child].status |= TransformStatus::WorldDirty;
+            }
+            current.status = TransformStatus::Clean;
+        }
+    }
+
+    // Depth-first refresh of one node and everything beneath it. Works for any
+    // storage order. The node at `index` is recomputed unconditionally (its
+    // dirty flag is not checked) from its parent's currently cached world matrix.
+    void update_world_matrices_subtree(uint32_t index) {
+        Node& root = nodes[index];
+
+        const bool has_parent = root.parent_index != INVALID_NODE_INDEX;
+        root.world_matrix = has_parent
+            ? nodes[root.parent_index].world_matrix * root.get_local_matrix()
+            : root.get_local_matrix();
+
+        for (uint32_t child : root.child_indices) {
+            // The parent's matrix just changed, so the child is flagged and refreshed.
+            nodes[child].status |= TransformStatus::WorldDirty;
+            update_world_matrices_subtree(child);
+        }
+
+        root.status = TransformStatus::Clean;
+    }
+};
+----
+
+=== The Importance of Order: Topological Sorting
+
+There is a subtle but critical detail in the linear `update_transforms()` loop above. For the "parent-to-child" update to work correctly in a single pass, a parent node must **always** appear in the array before its children.
+
+If the pelvis bone is at index 0 and the thigh bone is at index 50, the pelvis will be updated first, its `world_matrix` will be ready, and its "dirty" state will be passed to the thigh. If the order were reversed, the thigh would try to update using the *old* pelvis matrix from the previous frame, leading to a "jittering" effect where the character's limbs appear to lag behind their bodies.
+
+Ensuring this order is called a **Topological Sort**. By maintaining this specific order in our flat array, we allow the CPU to linearly prefetch node data, resulting in extremely fast, cache-friendly updates that can handle thousands of nodes in a fraction of a millisecond.
+
+== Preparing for the Character Pipeline
+
+This engine expansion isn't just about rendering performance; it's about architectural readiness. By implementing a unified, dirty-flag-driven scene graph, we've prepared the ground for several advanced features we'll implement later:
+
+*   **Skeletal Animation**: Our bones will simply be nodes in this hierarchy.
+*   **Physics Syncing**: We'll attach physics rigid bodies to specific nodes. When a node's `world_matrix` becomes dirty, we'll know exactly when to update the physics proxy.
+*   **Ray Tracing (RTAS)**: Building Acceleration Structures for ray tracing is expensive. Our dirty flags will tell us exactly which BLAS (Bottom-Level Acceleration Structure) needs to be rebuilt because an object moved.
+
+In the next section, we will look at how we link this visual hierarchy to our physics simulation. Before moving on, the checklist below summarizes how to migrate an existing Simple Engine project.
+
+== Migrating from the Simple Engine
+
+If you are coming from the "Building a Simple Engine" series, you likely have a collection of `ObjectInstance` structs. Transitioning to a unified Scene Graph is a major architectural shift. Here is a migration checklist:
+
+1.  **Switch from Pointers to Indices**: Replace any `Node*` or `Entity*` references with `uint32_t` indices into a global `std::vector<Node>`.
+2.  **Unify the Storage**: Instead of each `Model` owning its own internal node hierarchy, load all glTF nodes directly into the global `SceneGraph`.
+3.  **Adopt the Dirty Flag**: Remove any per-frame `Parent * Local` matrix math from your render loop. Instead, call `update_transforms()` once at the start of your frame.
+4.  **Topological Sorting**: When loading a glTF, ensure you add nodes to the `std::vector` in an order where parents always precede their children. This allows the high-speed linear update loop to function correctly.
+
+By making these changes, you move from a collection of independent objects to a cohesive world where every element can react to its parents, setting the stage for complex character interactions.
+
+xref:Advanced_glTF/Scene_Graph_Hierarchy/01_introduction.adoc[Previous: Introduction] | xref:Advanced_glTF/Scene_Graph_Hierarchy/03_physics_syncing.adoc[Next: Physics Syncing]
diff --git a/en/Advanced_glTF/Scene_Graph_Hierarchy/03_physics_syncing.adoc b/en/Advanced_glTF/Scene_Graph_Hierarchy/03_physics_syncing.adoc
new file mode 100644
index 000000000..5a22def15
--- /dev/null
+++ b/en/Advanced_glTF/Scene_Graph_Hierarchy/03_physics_syncing.adoc
@@ -0,0 +1,142 @@
+= Physics Syncing: The Animation-Physics Link
+
+== Bridging Two Worlds: The Animation-Physics Conflict
+
+In modern game engines, a character's movement is almost always a tug-of-war between two fundamentally different systems: the **Animation System** and the **Physics Engine**. To understand why we need to "sync" these systems, we first have to recognize how they view the world in completely different ways.
+
+The **Animation System** is essentially a playback device. It takes pre-recorded or calculated "poses" (bone positions and rotations) and applies them to a visual mesh. It doesn't care if a character's arm passes through a stone wall or if their feet are hovering six inches off the ground. Its only goal is to make the character *look* like they are performing a specific action, like walking, swinging a sword, or reacting to a hit.
+
+The **Physics Engine**, on the other hand, is a mathematical simulation of the physical laws of our world. It calculates how objects move based on forces like gravity, friction, and collisions. It doesn't know about "animations" or "skeletons." It only knows about **Rigid Bodies**—mathematical shapes like boxes, spheres, and capsules that have mass and momentum. Its goal is to ensure that objects don't pass through each other and that they react realistically when they collide.
+
+The challenge we face is that these two systems often disagree on where a character should be. If an animation says a character's hand should be inside a wall, the physics engine says "No, it must stop at the surface." If the physics engine says a character should be falling, the animation system might still be playing a "stand idle" clip.
+
+To resolve this, we need a **Bi-directional Link**: a bridge that allows us to pass transformation data back and forth between our visual scene graph and the physics simulation.
+
+== Understanding the Players: Kinematic vs. Dynamic
+
+To implement this bridge, we must understand the two primary modes that a physics body can operate in. These modes define which system is currently the "Master" and which is the "Slave."
+
+=== 1. Kinematic: Animation in Control
+A **Kinematic** body is a special type of rigid body that is "immune" to the laws of physics simulation. It has infinite mass, it isn't affected by gravity, and it cannot be pushed by other objects. Instead, its position and rotation are set directly by the CPU (in our case, by the animation system).
+
+Think of a kinematic body as a "God Mode" object. It moves exactly where the animation tells it to go, and it will push any dynamic objects (like boxes or other characters) out of its way with unstoppable force. This is how we represent a character's limbs during normal gameplay. The arm moves because the animation says so, and the physics engine uses the kinematic collider to check if that arm has hit anything else in the world.
+
+=== 2. Dynamic: Physics in Control
+A **Dynamic** body (often called a "Ragdoll" body in character contexts) is a standard rigid body that is fully simulated by the physics engine. It reacts to gravity, it can be pushed by other objects, and it follows the laws of momentum and friction.
+
+In this mode, the animation system is effectively turned off for that specific bone. The physics engine becomes the "Source of Truth," and we pull the resulting transformation data back into our scene graph so the visual mesh follows the physical simulation. This is the classic "ragdoll" effect you see when a character is knocked unconscious or killed.
+
+== The Synchronization Pipeline
+
+The synchronization between our Scene Graph and the Physics Engine is a carefully orchestrated loop that occurs every single frame. If we sync at the wrong time, we risk **One-Frame Lag**, where the visual character is always one step behind their physics representation, leading to "ghosting" or clipping artifacts where the character appears to pass through objects they should be colliding with.
+
+The standard engine update loop follows this specific sequence:
+
+1.  **Animation Update**: The engine calculates new local poses for all bones based on time, blending, and animation clips.
+2.  **Scene Graph Update**: We propagate these local poses down the hierarchy to calculate the `world_matrix` for every node (using our **Dirty Flag** system).
+3.  **Kinematic Sync (Animation -> Physics)**: For every node that is in "Kinematic" mode, we push its newly calculated `world_matrix` into the physics engine. This "positions" the colliders for the upcoming simulation step.
+4.  **Physics Step**: The physics engine simulates one tick of time (e.g., 1/60th of a second). It calculates collisions and resolves forces for all dynamic bodies.
+5.  **Dynamic Sync (Physics -> Animation)**: For every node that is in "Dynamic" mode, we pull its new world position from the physics engine and convert it back into our Scene Graph's local-space coordinates.
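+6.  **Final Cleanup**: We mark any modified nodes as "Dirty" and run `update_transforms()` a second time, so that the `world_matrix` of every physics-driven node and its children is recomputed before the renderer and other systems (like ray-tracing acceleration structure updates) read it. Without this second pass they would still see the pre-physics, animation-driven positions.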
+
+== Implementation: Kinematic Sync (Animation Driving Physics)
+
+When our character is performing a standard animation, the Scene Graph is the primary source of truth. We must "teleport" the physics engine's bone proxies (colliders) to match the visual nodes.
+
+Because physics engines usually expect separate position and rotation data rather than a raw 4x4 matrix, we must decompose our node's `world_matrix`.
+
+[source,cpp]
+----
+void sync_kinematic_to_physics(const Node& node, PhysicsBody& body) {
+    // Early out: if the node's world transform is not flagged dirty, the
+    // collider is already where the visual node is and needs no update.
+    if (!(node.status & TransformStatus::WorldDirty)) {
+        return;
+    }
+
+    // The physics body takes a position and an orientation instead of a 4x4
+    // matrix, so we pull the two apart.
+    //
+    // Position: the fourth column (index 3) of the world matrix.
+    glm::vec3 position = glm::vec3(node.world_matrix[3]);
+
+    // Orientation: the node's world-space rotation, free of any scaling factors.
+    glm::quat orientation = node.get_world_rotation();
+
+    // Hand both to the physics engine. This 'teleports' the collider onto the
+    // visual node rather than moving it with forces.
+    body.set_transform(position, orientation);
+}
+----
+
+**Note on Scaling**: Most physics engines assume that rigid bodies have a scale of 1.0. If your artist has scaled a bone in Blender, the `world_matrix` will contain that scale. By decomposing the matrix into a position and a quaternion, we intentionally strip away the scale, providing the physics engine with exactly what it needs for a stable simulation.
+
+== Implementation: Dynamic Sync (The Ragdoll Handoff)
+
+The most complex part of the bi-directional link is **Dynamic Syncing**, where physics takes control. To visualize a ragdoll, we must map the physics body's world-space transform back into our hierarchical Scene Graph nodes.
+
+The math here is a common trap. Because our scene graph is hierarchical, we cannot simply overwrite the `world_matrix`. If we did, the next time `update_transforms()` is called, our parent-child calculation (`ParentWorld * LocalMatrix`) would overwrite the physics position with the (now incorrect) animation position.
+
+Instead, we must calculate what the **local transform** needs to be, relative to the parent, to result in the specific world-space position provided by the physics engine.
+
+[source,cpp]
+----
+// Pulls a simulated body's transform back into the scene graph
+// (Physics -> Animation).
+//
+//   node - the scene graph node driven by `body`; its local matrix is rewritten.
+//   body - the dynamic rigid body whose world transform the physics step produced.
+//
+// `nodes` is the scene graph's node array; this function is assumed to live
+// where that array is in scope (e.g. as a SceneGraph member).
+//
+// Ordering requirement: sync physics-driven nodes in array order, parents
+// before children. A child's local matrix is solved against its parent's world
+// matrix, so a ragdolled parent must already hold this frame's physics result.
+void sync_physics_to_dynamic(Node& node, const PhysicsBody& body) {
+    // 1. Get the new world transform from the physics simulation
+    glm::mat4 new_world_matrix = body.get_world_matrix();
+
+    // 2. Convert world matrix to local transform relative to parent
+    // The fundamental formula for our hierarchy is: World = ParentWorld * Local
+    // To solve for Local, we multiply both sides by the inverse of ParentWorld:
+    // Local = Inverse(ParentWorld) * World
+    if (node.parent_index != INVALID_NODE_INDEX) {
+        const Node& parent = nodes[node.parent_index];
+        glm::mat4 inv_parent_matrix = glm::inverse(parent.world_matrix);
+
+        // Calculate and set the node's new local matrix
+        node.set_local_matrix(inv_parent_matrix * new_world_matrix);
+    } else {
+        // Root nodes have no parent, so world space is identical to local space
+        node.set_local_matrix(new_world_matrix);
+    }
+
+    // 3. Publish the physics result as this node's world matrix right away.
+    // In a ragdoll the parent bone is usually physics-driven too. If we left the
+    // pre-physics world matrix in place, any child synced after this node would
+    // solve its local matrix against a stale parent and end up offset from its
+    // rigid body once update_transforms() runs.
+    node.world_matrix = new_world_matrix;
+
+    // 4. Mark the node dirty so its children and the renderer update correctly
+    node.mark_dirty();
+}
+----
+
+By converting the physics world-space coordinate into a local-space coordinate, we maintain the integrity of our hierarchical scene graph. This ensures that if a character is ragdolled while standing on a moving platform (like an elevator), the ragdoll will correctly follow the parent platform's movement while still being physically simulated.
+
+== Managing the Simulation: Collision Filtering
+
+One final, critical aspect of character physics is **Self-Collision**. In a ragdoll, every limb is a separate rigid body. If the arm's capsule collider is allowed to collide with the torso's collider, the ragdoll will "explode" or jitter violently as the limbs fight for space at the joints.
+
+To prevent this, we use **Collision Bitmasks**. We define groups for our objects and tell the physics engine which groups are allowed to interact.
+
+[source,cpp]
+----
+// Collision groups. Each group occupies its own bit, so several groups can be
+// OR-ed together into a single collision mask.
+enum CollisionGroup {
+    Group_Environment = 1 << 0, // Ground, walls, buildings
+    Group_Character   = 1 << 1, // The character's own limbs (ragdoll bone colliders)
+    Group_Player      = 1 << 2  // Other players or NPCs
+};
+
+// Example: configuring the collision filter for a single ragdoll bone
+void setup_bone_physics(PhysicsBody& body) {
+    // Tag the body as one of the character's limbs.
+    body.set_collision_group(Group_Character);
+
+    // Build the list of groups this limb may collide with: the world and other
+    // players/NPCs. Group_Character is deliberately left out, which is what
+    // prevents limbs of the same ragdoll from colliding with each other.
+    uint32_t collides_with = 0;
+    collides_with |= Group_Environment;
+    collides_with |= Group_Player;
+
+    body.set_collision_mask(collides_with);
+}
+----
+
+This "Social Distancing" for colliders ensures that a character's arms can pass through their own chest without generating phantom forces, while still being able to hit the ground or be struck by a player's weapon.
+
+== Summary
+
+By implementing bi-directional syncing, we've bridged the gap between the aesthetic world of animation and the mathematical world of physics. Our scene graph nodes now act as a unified interface that can be driven by either system, allowing for the complex transitions between scripted animation and emergent physical simulation that define modern character action.
+
+In the next section, we'll see how to automate the setup of these colliders. Instead of manually configuring capsules for every bone in code, we will leverage glTF "extras" and custom metadata to build our physics proxies directly from the artist's source file.
+
+xref:Advanced_glTF/Scene_Graph_Hierarchy/02_engine_expansion.adoc[Previous: The Engine Expansion] | xref:Advanced_glTF/Scene_Graph_Hierarchy/04_metadata_and_physics_extras.adoc[Next: Metadata & Physics Extras]
diff --git a/en/Advanced_glTF/Scene_Graph_Hierarchy/04_metadata_and_physics_extras.adoc b/en/Advanced_glTF/Scene_Graph_Hierarchy/04_metadata_and_physics_extras.adoc
new file mode 100644
index 000000000..74ed8441f
--- /dev/null
+++ b/en/Advanced_glTF/Scene_Graph_Hierarchy/04_metadata_and_physics_extras.adoc
@@ -0,0 +1,66 @@
+= Metadata & Physics Extras: Automating the Pipeline
+
+== Beyond Geometry
+
+A modern character pipeline is about more than just meshes and textures. It is about **intent**. When an artist rigs a character in Blender, they aren't just positioning bones; they are defining how that character should interact with the world. In the "Simple Engine," we manually placed our entities and manually defined their properties. In an advanced character engine, this manual approach is a bottleneck that leads to errors and slow iteration times.
+
+The glTF format provides a powerful but often overlooked feature for solving this: the **`extras`** field. This field is a catch-all container for custom JSON metadata that can be attached to any object in the glTF hierarchy—nodes, meshes, materials, and even animations. By leveraging these extras, we can create a data-driven pipeline where the artist's work in the 3D tool automatically configures the engine's behavior.
+
+== The Power of "Extras"
+
+In a professional production, an artist shouldn't have to write code to define where a character's collision shapes go. Instead, they define them as custom properties in Blender. When the glTF is exported, these properties are embedded in the `extras` field of the corresponding node.
+
+Consider a character's forearm bone. In Blender, we can add a custom property called `physics` with a value that defines a capsule collider. When our engine parses the glTF, it doesn't just see a bone node; it sees a request to attach a physical proxy.
+
+[source,cpp]
+----
+// Example of extracting "extras" during node loading.
+//
+// Reads the optional `physics` entry from a glTF node's `extras` and, when it
+// describes a capsule, attaches a matching collider to the engine node.
+//
+//   gltfNode   - the parsed glTF node whose `extras` are inspected.
+//   engineNode - the scene graph node that receives the physics proxy.
+//
+// `extras` is free-form, artist-authored JSON, so every field is type-checked
+// before it is read. Missing or malformed metadata leaves the node without a
+// physics proxy instead of asserting or producing a garbage collider.
+void processNodeExtras(const tinygltf::Node& gltfNode, Node& engineNode) {
+    // IsObject() already implies the value is not null.
+    if (!gltfNode.extras.IsObject() || !gltfNode.extras.Has("physics")) {
+        return;
+    }
+
+    const tinygltf::Value& physicsData = gltfNode.extras.Get("physics");
+    if (!physicsData.IsObject()) {
+        return; // e.g. "physics": true -- nothing we can interpret
+    }
+
+    // Get() on a missing key yields a null Value, which fails IsString().
+    const tinygltf::Value& typeValue = physicsData.Get("type");
+    if (!typeValue.IsString()) {
+        return;
+    }
+
+    if (typeValue.Get<std::string>() == "capsule") {
+        const tinygltf::Value& radiusValue = physicsData.Get("radius");
+        const tinygltf::Value& heightValue = physicsData.Get("height");
+
+        // Exporters may write whole numbers as JSON integers, so accept both
+        // integer and floating-point values.
+        if (!radiusValue.IsNumber() || !heightValue.IsNumber()) {
+            return;
+        }
+
+        float radius = static_cast<float>(radiusValue.GetNumberAsDouble());
+        float height = static_cast<float>(heightValue.GetNumberAsDouble());
+
+        // The Scene Graph node now acts as a hook for the physics engine
+        engineNode.attachPhysicsProxy(createCapsuleCollider(radius, height));
+    }
+}
+----
+
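+This approach bridges the gap between the visual and the physical. Because the collider description travels with the bone it belongs to, an artist who rescales the character or adjusts the length of a limb only has to update the custom properties on that bone and re-export; the physics proxy—and therefore the character's collision behavior—follows on the next load, with no engine code changes.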
+
+== Automating the Skeleton-Physics Link
+
+The most critical application of this metadata is the **Kinematic-to-Dynamic link**. For a character to react to physics (like a ragdoll), the physics engine needs to know which rigid bodies correspond to which bones in our Scene Graph.
+
+By tagging nodes in the glTF as "physics-enabled," we can automate the creation of the entire ragdoll hierarchy. During the scene loading process, our engine can:
+
+1.  Identify nodes with physics metadata.
+2.  Create corresponding rigid bodies in the physics world (PhysX, Jolt, or Bullet).
+3.  Establish the bi-directional syncing link we discussed in the previous section.
+4.  Generate physics constraints (like hinge or ball-and-socket joints) based on additional "joint_limit" metadata.
+
+This automation ensures that our visual skeleton and our physical skeleton are always in perfect sync, without requiring a single line of hardcoded bone names or magic numbers in our engine core.
+
+== Production Benefits: Iteration Speed
+
+The shift from manual setup to metadata-driven automation is a major leap in engine maturity. It transforms the engine from a static renderer into a flexible platform for artists.
+
+*   **Zero-Code Colliders**: Artists can add, remove, or refine collision shapes without programmer intervention.
+*   **Asset-Specific Logic**: Metadata can define surface types (e.g., "metal" vs. "flesh" for footstep sounds), interaction points ("grab_here" for a sword), or AI hints ("cover_node").
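+*   **Validation**: Because the metadata is part of the asset, it can be checked automatically in the asset pipeline. Note that the **Khronos glTF-Validator** only verifies that the file is well-formed glTF; the contents of `extras` are application-specific and are not checked by it. A small custom validation step (for example, a JSON Schema check over each node's `extras`) can ensure that every character in the game has a valid physics setup before it ever reaches the engine.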
+
+== Summary
+
+By leveraging glTF "extras," we've moved the responsibility of scene configuration from the programmer to the source asset. Our Scene Graph is no longer just a collection of matrices; it is a rich, annotated hierarchy that informs every system in our engine—from the renderer to the physics solver.
+
+With our Scene Graph unified and our asset pipeline automated, we are ready to tackle the core of character movement: **Advanced Skeletal & Compute Skinning**. In the next chapter, we'll move the heavy lifting of vertex deformation into Vulkan Compute shaders, enabling characters that are as performant as they are expressive.
+
+xref:Advanced_glTF/Scene_Graph_Hierarchy/03_physics_syncing.adoc[Previous: Physics Syncing] | xref:Advanced_glTF/Scene_Graph_Hierarchy/05_conclusion.adoc[Next: Conclusion]
diff --git a/en/Advanced_glTF/Scene_Graph_Hierarchy/05_conclusion.adoc b/en/Advanced_glTF/Scene_Graph_Hierarchy/05_conclusion.adoc
new file mode 100644
index 000000000..7ad9c352c
--- /dev/null
+++ b/en/Advanced_glTF/Scene_Graph_Hierarchy/05_conclusion.adoc
@@ -0,0 +1,25 @@
+= Conclusion: A Foundation for Characters
+
+In this chapter, we have significantly evolved our engine's architecture to support the demands of advanced character pipelines. We moved beyond the simple, hybrid scene management of the "Building a Simple Engine" series and implemented a **Unified, Global Scene Graph**.
+
+Key achievements in this chapter:
+
+*   **Unified Hierarchy**: We dissolved the boundary between world entities and internal model nodes, allowing for seamless cross-entity parenting and interaction.
+*   **Dirty Flag Optimization**: We implemented a state-tracking system that eliminates redundant matrix multiplications, ensuring our engine remains performant even as we add complex skeletons with hundreds of bones.
+*   **Data-Oriented Design**: By moving to a flat array of nodes with index-based linking, we maximized CPU cache efficiency and prepared our engine for large-scale character scenes.
+*   **Bi-directional Physics Syncing**: We established the mathematical bridge between our Scene Graph and the physics solver, enabling both kinematic animation control and dynamic ragdoll simulation.
+*   **Metadata-Driven Automation**: We modernized our asset pipeline by extracting glTF "extras," allowing artists to automatically define physics proxies and interaction hooks directly within their 3D tools.
+
+This optimized scene graph is the essential "nervous system" for everything that follows. Without this robust foundation, implementing features like skeletal animation, IK, and physics-driven ragdolls would lead to a fragmented and unoptimized codebase.
+
+Now that our world can efficiently manage complex hierarchies, we are ready to bring our characters to life. In the next chapter, we will implement **Advanced Skeletal & Compute Skinning**, moving the heavy lifting of vertex deformation to the GPU and setting the stage for expressive, high-performance animation.
+
+== Verification: What to Look For
+
+To verify your Scene Graph implementation:
+
+1.  **Topological Sort**: Check that parent nodes are always added to the `nodes` vector before their children.
+2.  **Dirty Flag Propagation**: Set a breakpoint in `update_transforms()`. Ensure that `WorldDirty` is correctly set on all children when a parent is modified.
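+3.  **Matrix Correctness**: Compare your `world_matrix` results against a known good source for a simple two-node hierarchy—for example, the world transforms shown for the same file in Blender or in the Khronos glTF Sample Viewer. (The glTF-Validator checks file validity only; it does not report node world transforms.)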
+
+xref:Advanced_glTF/Scene_Graph_Hierarchy/04_metadata_and_physics_extras.adoc[Previous: Metadata & Physics Extras] | xref:Advanced_glTF/Skeletal_Compute_Skinning/01_introduction.adoc[Next: Skeletal & Compute Skinning Introduction]
diff --git a/en/Advanced_glTF/Skeletal_Compute_Skinning/01_introduction.adoc b/en/Advanced_glTF/Skeletal_Compute_Skinning/01_introduction.adoc
new file mode 100644
index 000000000..23903012c
--- /dev/null
+++ b/en/Advanced_glTF/Skeletal_Compute_Skinning/01_introduction.adoc
@@ -0,0 +1,38 @@
+:pp: {plus}{plus}
+= Advanced Skeletal & Compute Skinning
+
+== What Skinning Actually Is
+
+If you have worked through the "Building a Simple Engine" series, you have already seen skeletal animation in action. We loaded a glTF file, parsed its animation channels, and updated the bone matrices each frame to move the model's joints. But in that tutorial, we deliberately kept things straightforward: the mesh deformation—the actual process of making the vertices *follow* the bones—happened in the vertex shader, one vertex at a time, as part of the normal rasterization pipeline.
+
+That approach works. For simple scenes with a handful of animated characters, it is perfectly acceptable. But it carries a hidden cost that only becomes apparent at scale: every system that needs access to the deformed mesh has to do the math itself, in isolation.
+
+Think about what happens when you add physics to your scene. Your character's ragdoll needs to know where the mesh surface is so that environmental objects collide with the *animated* body, not with the original T-pose. But your physics engine doesn't know about your vertex shader. It can't see the deformed positions. So either you run an entire second skinning calculation on the CPU for the physics system, or you accept that your physics colliders will be in the wrong place—an approximation that looks increasingly bad the more dramatic the animation.
+
+Add ray tracing to that picture and the problem compounds. A Vulkan Ray Tracing Acceleration Structure—the data structure that enables effects like reflections, ambient occlusion, and ray-traced shadows—is built from the mesh's vertex positions. If those positions are still in T-pose because the skinning only happens in the rasterization vertex shader, then your shadows and reflections will be cast by a lifeless mannequin instead of your animated character.
+
+The solution to all of these problems is a strategy we call **"Skin Once, Use Everywhere."** Instead of skinning the mesh as a side effect of rendering it, we skin the mesh *first*, as a dedicated step, and store the resulting deformed vertex positions in a GPU buffer. The rasterizer, the ray tracer, and the physics system all then read from that *same* buffer. Every system sees the correct, animated mesh, and we only do the expensive deformation math once per frame.
+
+Implementing this strategy requires moving the skinning work from the vertex shader into a **Vulkan Compute Shader**. This chapter is about building that compute-based skinning pipeline from the ground up.
+
+== What You Need to Know First
+
+Before diving into the implementation, it is worth being explicit about what this chapter assumes you already understand, and what it will teach you from scratch.
+
+You should be comfortable with the basics of glTF skeletal animation from the "Building a Simple Engine" series. Specifically, you should understand what **joints** and **skins** are in a glTF file, how animation channels drive bone transformations over time, and how the `AnimationSampler` and `AnimationChannel` structures work together to produce a pose. If any of those terms are unfamiliar, you should revisit that material before continuing here.
+
+You should also have a solid grasp of the **Scene Graph** system we built in the previous chapter of this series. The skinning system we are about to build is deeply integrated with that hierarchy. We will be reading joint world matrices directly from our `Node` array, so understanding how those matrices are calculated and cached using the Dirty Flag pattern is essential.
+
+What this chapter *will* teach you is the mathematical relationship between a vertex's rest position and its animated position—specifically, the role of the **Inverse Bind Matrix** and how it anchors the skinning calculation to the original T-pose. It will then show you how to encode that math in a Vulkan Compute shader, how to manage the input and output buffers, and how to structure the pipeline so that the output can be consumed by multiple downstream systems simultaneously.
+
+== The Plan for This Chapter
+
+We will build this system in layers. First, we will establish the mathematical foundation—exactly what computation we need to perform on each vertex, and why. Without understanding the math, the shader code is just a sequence of matrix multiplications that produce magic. With it, every line of the shader becomes a deliberate, principled step.
+
+Second, we will design and implement the Vulkan Compute pipeline. This involves creating shader storage buffers for the input vertices, the joint matrices, and the output vertices, writing the compute shader in Slang, and dispatching it correctly each frame.
+
+Third, we will wire up the output buffer as a shared resource. We will show how the rasterization pipeline reads from it instead of the original vertex buffer, how to use it as the geometry source for a ray tracing Bottom-Level Acceleration Structure (BLAS), and how to expose it to a physics system for accurate collision queries.
+
+Finally, we will tackle the more sophisticated aspects of animation: **cubic spline interpolation** for smooth, natural movement, and **cross-fade blending** for transitioning between animation clips—including the critical case of blending from a scripted animation into a physics-driven ragdoll.
+
+xref:Advanced_glTF/Skeletal_Compute_Skinning/02_skinning_math.adoc[Next: The Mathematics of Skinning]
diff --git a/en/Advanced_glTF/Skeletal_Compute_Skinning/02_skinning_math.adoc b/en/Advanced_glTF/Skeletal_Compute_Skinning/02_skinning_math.adoc
new file mode 100644
index 000000000..da6e14f16
--- /dev/null
+++ b/en/Advanced_glTF/Skeletal_Compute_Skinning/02_skinning_math.adoc
@@ -0,0 +1,117 @@
+:pp: {plus}{plus}
+= The Mathematics of Skinning
+
+== Why the Math Matters
+
+It is tempting to treat skinning as a black box. You give the GPU some bones and a mesh, and it figures out where the vertices should go. Many developers work this way for years. But when something goes wrong—and in skeletal animation, things always go wrong at some point—the developers who understand the underlying math can diagnose the problem in minutes, while those who don't are left staring at a contorted mesh with no idea why their character looks like they were put through a blender.
+
+This section will not be a brief overview. We are going to build the skinning equation from first principles, and by the end of it, you should be able to look at the compute shader code in the next section and understand exactly why every matrix multiplication is there.
+
+== The Bind Pose and Why It Exists
+
+Every skinned character starts its life in what is called the **Bind Pose** (sometimes called the **T-Pose** or **Rest Pose**). This is the specific configuration of the skeleton that the artist used when they attached the mesh to the rig. It is the "neutral" state where the bones and the mesh were in perfect alignment when the skinning weights were painted.
+
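+The bind pose is critically important because it is the shared reference frame that ties the skeleton to the mesh geometry. Note that glTF animation data is *not* stored as a delta from the bind pose: each animation channel supplies absolute local translation, rotation, or scale values that replace the target node's local transform outright. When a joint's animation channel says "rotate 30 degrees around the Z axis," the joint's local rotation becomes exactly that, relative to its parent. What the bind pose records is where each joint was when the mesh was attached, which is the information we need to work out how far a vertex must move once its joint has been animated somewhere else.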
+
+In glTF, the bind pose information is stored as the **Inverse Bind Matrix** for each joint. This matrix is exactly what its name suggests: the mathematical inverse of the joint's transformation matrix at the time the mesh was bound to the skeleton. If the joint's world matrix in the bind pose was `B`, then the Inverse Bind Matrix stored in the file is `B⁻¹`.
+
+We will see exactly why we need this inverse in a moment.
+
+== The Three-Step Skinning Equation
+
+Calculating where an animated vertex should be is a three-step process. Understanding each step individually makes the combined equation obvious rather than mysterious.
+
+=== Step 1: Transform the Vertex to Bone-Local Space
+
+Every vertex in the mesh is stored in **model space**—its position is relative to the model's origin. But to apply a joint's rotation, we need the vertex to be in **bone-local space**, where the joint itself is the origin.
+
+This is exactly what the Inverse Bind Matrix does. When we multiply a vertex position by the Inverse Bind Matrix of a joint, we are expressing that vertex's position *relative to that joint as it existed in the bind pose*. We are essentially asking, "If the bind-pose joint were sitting at the origin of its own coordinate system, where would this vertex be?"
+
+For example, if a character's wrist joint was at position `(0.5, 1.2, 0.0)` in the bind pose, then the Inverse Bind Matrix for the wrist encodes that offset. When we multiply a vertex near the hand by this matrix, the result is a position close to the origin—close to the wrist—rather than somewhere in world space.
+
+=== Step 2: Apply the Current Joint Transform
+
+Now that the vertex is expressed relative to the bind-pose joint, we can apply the *current* joint transformation. This is the animated world matrix of the joint—the one our Scene Graph calculated in the previous chapter using the Dirty Flag system.
+
+When we multiply the bone-local vertex position by the current joint world matrix, we are saying: "Take this vertex that was positioned relative to the bind-pose joint, and move it to where the animated joint is now." If the wrist is currently rotated 45 degrees from its bind pose, the vertex will follow that rotation correctly because we first expressed it relative to the wrist's origin.
+
+The combined operation for a single joint is:
+
+`Animated Position = Joint World Matrix * Inverse Bind Matrix * Rest Position`
+
+Reading this right-to-left (as matrix multiplications work): we take the rest position, transform it into the joint's local space, and then transform it from the joint's local space into the final animated world space.
+
+=== Step 3: Blend Multiple Joint Influences
+
+A real character mesh is not driven by a single bone. Each vertex is influenced by multiple joints simultaneously. This is called **Linear Blend Skinning (LBS)**, or sometimes **Smooth Skinning**. Without it, character meshes fold and crumple at joints like a piece of cardboard, rather than stretching smoothly like skin over a real skeleton.
+
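+In glTF, each vertex stores its joint influences in sets of four: the `JOINTS_0` attribute holds four **joint indices** and `WEIGHTS_0` holds the four corresponding **weights**. (Additional sets such as `JOINTS_1`/`WEIGHTS_1` are permitted, but four influences per vertex is by far the most common case and is what the equation below assumes.) The joint indices identify which bones influence this vertex, and the weights specify how much each bone contributes. The weights must sum to 1.0 to ensure the vertex doesn't get scaled or translated by accident.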
+
+The complete skinning equation for a single vertex is a weighted sum of the per-joint transforms:
+
+----
+Skinned_Position = (w0 * J0 * IB0 + w1 * J1 * IB1 + w2 * J2 * IB2 + w3 * J3 * IB3) * Rest_Position
+----
+
+Where `Jn` is the current world matrix of joint `n`, `IBn` is its Inverse Bind Matrix, and `wn` is the vertex's weight for that joint.
+
+This weighted sum is sometimes called the **Skin Matrix** for a vertex. Different vertices near a joint will have different weight distributions—a vertex right on the elbow might be 50% upper arm and 50% forearm, while a vertex halfway up the arm might be 90% upper arm and 10% forearm. The artist controls these weights using a weight-painting tool in their modeling software.
+
+== The Joint Matrix: Pre-Computing the Key Operation
+
+In the equation above, we are multiplying `Jn * IBn` for every vertex. But notice that this product is the same for every vertex influenced by the same joint. It doesn't matter whether we are computing vertex 100 or vertex 100,000—if both are influenced by joint `5`, they both use the same `J5 * IB5` matrix.
+
+This means we can and should pre-compute these products on the CPU before uploading them to the GPU. Instead of having each shader invocation look up two matrices and multiply them, we create an array of pre-multiplied **Joint Matrices**—one per joint—and upload only those to the GPU.
+
+[source,cpp]
+----
+// Builds the per-joint skinning matrices for one skin.
+// Run once per frame, after both the animation update and the scene graph
+// update, so that the joint nodes' world matrices are current.
+//
+//   skin               - joint node indices plus one inverse bind matrix per joint
+//   nodes              - scene nodes, indexed by the values stored in skin.joints
+//   joint_matrices_out - resized to skin.joints.size(); entry j receives
+//                        world_matrix(joint j) * inverse_bind_matrix(j)
+void compute_joint_matrices(
+    const Skin& skin,
+    const std::vector<Node>& nodes,
+    std::vector<glm::mat4>& joint_matrices_out)
+{
+    const size_t joint_total = skin.joints.size();
+    joint_matrices_out.resize(joint_total);
+
+    for (size_t joint = 0; joint != joint_total; ++joint) {
+        // World transform of the node acting as this joint
+        // (already computed by the scene graph's dirty flag system).
+        const glm::mat4& joint_world = nodes[skin.joints[joint]].world_matrix;
+
+        // Folding the inverse bind matrix in here leaves the shader with a
+        // single matrix per joint to apply to the rest-pose vertex.
+        joint_matrices_out[joint] = joint_world * skin.inverse_bind_matrices[joint];
+    }
+}
+----
+
+The `Skin` structure here mirrors what we parsed from the glTF file: a list of joint node indices and their corresponding inverse bind matrices. The output `joint_matrices_out` is what we will upload to the GPU as a **Shader Storage Buffer Object (SSBO)**—a large, GPU-accessible buffer that the compute shader can index into freely.
+
+== Handling Normals
+
+The skinning equation above correctly transforms vertex positions, but a complete skinning implementation must also transform **vertex normals**. Normals are the vectors perpendicular to the mesh surface, and they drive all of the lighting calculations. If you deform the mesh but don't deform the normals to match, your character will appear to be lit as if it were still in the T-pose—a jarring visual artifact.
+
+Normals cannot be transformed by the full joint matrix (which includes translation). Instead, they must be transformed by the **Normal Matrix**, which is the transpose of the inverse of the 3x3 rotation and scale portion of the joint matrix. In most cases—and this is the standard practice for character animation—we can assume that our joint matrices don't contain non-uniform scale. In that scenario, the inverse transpose differs from the 3x3 portion of the joint matrix only by a constant scale factor, so we can use that 3x3 portion directly and re-normalize the result.
+
+[source,cpp]
+----
+// For joint matrices without non-uniform scale, the normal transform is simpler:
+glm::mat3 normal_matrix = glm::mat3(joint_matrix);
+
+// If non-uniform scale is present (e.g., squash-and-stretch animation):
+// glm::mat3 normal_matrix = glm::transpose(glm::inverse(glm::mat3(joint_matrix)));
+----
+
+This is a tradeoff worth being explicit about: skipping the full normal matrix calculation is a performance optimization that is almost always invisible in practice, because character rigs rarely use non-uniform scale on their skeletal joints. If your engine ever needs to support squash-and-stretch bone animation, you will need to revisit this assumption.
+
+== Tangents and Bitangents
+
+For characters using normal maps—which is nearly universal in AAA production—we also need to skin the **tangent vectors**. The tangent and bitangent define a per-vertex coordinate system that maps the normal map's texture space into the world space of the deformed mesh. If the mesh deforms but the tangent frame doesn't follow, normal-mapped lighting will be incorrect.
+
+Tangents are transformed using the same approach as normals—the 3x3 rotation portion of the joint matrix, without translation. The bitangent is typically not stored directly; it is reconstructed in the fragment shader as the cross product of the normal and tangent, using a handedness sign stored in the tangent's W component.
+
+With this mathematical foundation in place, we now have everything we need to write the compute shader. We know what inputs are required (rest positions, normals, tangents, joint indices, weights, and pre-computed joint matrices), we know what computation to perform, and we know what outputs to produce (animated positions, normals, and tangents in a new buffer that everyone else can read from).
+
+xref:Advanced_glTF/Skeletal_Compute_Skinning/01_introduction.adoc[Previous: Introduction] | xref:Advanced_glTF/Skeletal_Compute_Skinning/03_compute_skinning.adoc[Next: The Compute Skinning Pipeline]
diff --git a/en/Advanced_glTF/Skeletal_Compute_Skinning/03_compute_skinning.adoc b/en/Advanced_glTF/Skeletal_Compute_Skinning/03_compute_skinning.adoc
new file mode 100644
index 000000000..aebf3e28d
--- /dev/null
+++ b/en/Advanced_glTF/Skeletal_Compute_Skinning/03_compute_skinning.adoc
@@ -0,0 +1,349 @@
+:pp: {plus}{plus}
+= The Compute Skinning Pipeline
+
+== Why a Compute Shader?
+
+In a traditional rendering pipeline, the vertex shader runs once per vertex, as the first stage of rasterization. This makes it an obvious candidate for skinning—the vertex shader already has access to each vertex's position and attributes, and it can transform them before the triangle assembly and rasterization stages do their work.
+
+However, the vertex shader has a fundamental constraint: its output is ephemeral. The results of a vertex shader invocation exist only for the duration of that draw call. Once the frame is rendered, those transformed positions are gone. No other system—not the physics engine, not the ray tracer—can read what the vertex shader produced. This is precisely the "pay for it twice" problem we identified in the introduction.
+
+A **Compute Shader** is a general-purpose GPU program that runs outside the rasterization pipeline entirely. It reads from **Shader Storage Buffer Objects (SSBOs)** and writes its results back to other SSBOs. Those output buffers persist on the GPU and can be used by any subsequent pipeline stage or draw call. The compute shader runs once per frame, skins the mesh, and deposits the animated vertex data into a buffer that every downstream consumer can read at no additional cost.
+
+This is the core architectural shift: we go from skinning as a *side effect of rendering* to skinning as a *dedicated, first-class step* in our frame pipeline.
+
+== Buffer Architecture
+
+Before writing a single line of shader code, it is worth designing the buffer layout carefully. A skinned character's compute pipeline uses three key buffers, plus an optional fourth:
+
+**Input Vertex Buffer (Read-Only)**: This stores the mesh vertices in their rest pose—exactly as loaded from the glTF file. It never changes after the initial upload. It contains positions, normals, tangents, texture coordinates, and the joint indices and weights for each vertex.
+
+**Joint Matrix Buffer (Write-Once-Per-Frame)**: This stores the array of pre-computed joint matrices we calculated on the CPU in the previous section. It is uploaded to the GPU at the beginning of each frame, after the animation system and scene graph have been updated. Its contents change every frame.
+
+**Output Vertex Buffer (Write by Compute, Read by Everything Else)**: This is the key buffer. The compute shader writes the animated positions, normals, and tangents here. The rasterizer's draw call uses this as its vertex input. The ray tracer's BLAS update reads from it. The physics system can read it back when it needs mesh-accurate surface queries.
+
+**Indirect Draw Buffer (Optional but Recommended)**: For complex scenes with many animated characters, you may also want a buffer for GPU-driven indirect draw commands, but that is a more advanced topic we will address in a later chapter.
+
+The C{pp} side of this setup looks like this:
+
+[source,cpp]
+----
+struct SkinComputeResources {
+    // The mesh's original, unchanging rest-pose vertices
+    VkBuffer input_vertex_buffer;
+    VkDeviceMemory input_vertex_memory;
+    uint32_t vertex_count;
+
+    // Updated each frame with the current joint matrices
+    VkBuffer joint_matrix_buffer;
+    VkDeviceMemory joint_matrix_memory;
+    uint32_t joint_count;
+
+    // Written by compute, read by rasterizer/raytracer/physics
+    VkBuffer output_vertex_buffer;
+    VkDeviceMemory output_vertex_memory;
+
+    // Descriptor set referencing all three buffers
+    VkDescriptorSet descriptor_set;
+};
+----
+
+Both the input and output vertex buffers should be created with the `VK_BUFFER_USAGE_STORAGE_BUFFER_BIT` flag so the compute shader can read and write them via SSBO bindings. The output buffer additionally needs `VK_BUFFER_USAGE_VERTEX_BUFFER_BIT` so the rasterizer can use it directly as a vertex input source.
+
+[source,cpp]
+----
+// Output buffer usage flags - the key to "skin once, use everywhere".
+// Every consumer of the skinned vertices needs its own usage bit here.
+VkBufferUsageFlags output_usage =
+    VK_BUFFER_USAGE_STORAGE_BUFFER_BIT |        // Compute shader can write to it
+    VK_BUFFER_USAGE_VERTEX_BUFFER_BIT  |        // Rasterizer can read from it
+    VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT | // Its device address can be handed to the BLAS build
+    // Required on any buffer that supplies geometry to an acceleration
+    // structure build or refit; without it the BLAS build is invalid usage.
+    VK_BUFFER_USAGE_ACCELERATION_STRUCTURE_BUILD_INPUT_READ_ONLY_BIT_KHR |
+    VK_BUFFER_USAGE_TRANSFER_SRC_BIT;           // Can be read back to CPU if needed
+----
+
+== The Compute Shader
+
+Now we can write the actual skinning shader. We will use **Slang**, the modern GPU shading language that offers C{pp}-like syntax and excellent cross-compilation support.
+
+=== Why Slang?
+While you may be familiar with GLSL from the "Simple Engine," Slang offers several advantages for complex character pipelines:
+
+* **Natural Alignment**: Slang automatically handles the alignment of structs to match SPIR-V standards, reducing the "padding bugs" common in GLSL SSBOs.
+* **Modern Syntax**: It supports generics, interfaces, and operator overloading, making complex skinning math much cleaner.
+* **C{pp} Compatibility**: Slang's syntax is so close to C{pp} that you can often share struct definitions between your engine and your shaders.
+
+[NOTE]
+====
+To follow this section, you will need the **Slang compiler (`slangc`)**. You can download the latest binaries from the link:https://github.com/shader-slang/slang/releases[Slang GitHub releases page]. Ensure `slangc` is in your system PATH.
+====
+
+=== Compiling and Loading Slang Shaders
+Slang compiles `.slang` files into standard SPIR-V (`.spv`), which your Vulkan engine can load exactly like the GLSL shaders you used previously.
+
+[source,cmake]
+----
+# Example CMake integration for Slang.
+# slangc is expected to be on PATH. The output directory is created first
+# because a fresh build tree may not contain shaders/ yet.
+# Note: this rule only runs when some target depends on the generated .spv.
+add_custom_command(
+    OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/shaders/skinning.spv
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_BINARY_DIR}/shaders
+    COMMAND slangc ${CMAKE_CURRENT_SOURCE_DIR}/shaders/skinning.slang -target spirv -o ${CMAKE_CURRENT_BINARY_DIR}/shaders/skinning.spv
+    DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/shaders/skinning.slang
+    COMMENT "Compiling Slang shader to SPIR-V"
+    VERBATIM
+)
+----
+
+To load the shader, use your existing `VkShaderModule` creation code:
+[source,cpp]
+----
+// Load the compiled .spv file
+auto shaderCode = readFile("shaders/skinning.spv");
+VkShaderModuleCreateInfo createInfo{};
+createInfo.sType = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO;
+createInfo.codeSize = shaderCode.size();
+createInfo.pCode = reinterpret_cast<const uint32_t*>(shaderCode.data());
+vkCreateShaderModule(device, &createInfo, nullptr, &skinningShaderModule);
+----
+
+[source,slang]
+----
+// skinning.slang
+
+// The input vertex layout, matching our rest-pose buffer.
+// NOTE(review): inside a StructuredBuffer the compiler decides the field
+// offsets. With the default SPIR-V buffer layout a float3 is presumably
+// aligned to 16 bytes, so the C++ struct that fills this buffer must use the
+// same padding (or the shader must be compiled with -fvk-use-scalar-layout).
+// Confirm against the engine's vertex definition.
+struct InputVertex {
+    float3 position;
+    float3 normal;
+    float4 tangent;         // w component stores handedness (-1 or +1)
+    float2 texcoord;
+    uint4  joint_indices;   // Up to 4 bone influences
+    float4 joint_weights;   // Corresponding weights (must sum to 1.0)
+};
+
+// The output vertex layout - a subset of the input (no joint data needed).
+// NOTE(review): the bytes written through this struct are later consumed as
+// a vertex buffer and as BLAS geometry with the C++ sizeof(OutputVertex) as
+// the stride, so the shader-side and C++-side layouts (including any float3
+// padding) must agree - TODO confirm.
+struct OutputVertex {
+    float3 position;
+    float3 normal;
+    float4 tangent;
+    float2 texcoord;
+};
+
+// Shader Storage Buffer bindings
+[[vk::binding(0, 0)]] StructuredBuffer<InputVertex>  input_vertices;
+[[vk::binding(1, 0)]] StructuredBuffer<float4x4>     joint_matrices;
+[[vk::binding(2, 0)]] RWStructuredBuffer<OutputVertex> output_vertices;
+
+// Push constants for per-dispatch data
+struct SkinPushConstants {
+    uint vertex_count;
+};
+[[vk::push_constant]] SkinPushConstants push_constants;
+
+[shader("compute")]
+[numthreads(64, 1, 1)]
+void main(uint3 dispatchThreadID : SV_DispatchThreadID)
+{
+    uint vertex_id = dispatchThreadID.x;
+
+    // Guard: don't process vertices beyond the mesh
+    if (vertex_id >= push_constants.vertex_count) {
+        return;
+    }
+
+    InputVertex v = input_vertices[vertex_id];
+
+    // Build the blended skin matrix from the four joint influences.
+    // This is the core of the Linear Blend Skinning (LBS) equation:
+    // SkinMatrix = sum(weight_i * JointMatrix_i) for i in [0, 3]
+    float4x4 skin_matrix =
+        v.joint_weights.x * joint_matrices[v.joint_indices.x] +
+        v.joint_weights.y * joint_matrices[v.joint_indices.y] +
+        v.joint_weights.z * joint_matrices[v.joint_indices.z] +
+        v.joint_weights.w * joint_matrices[v.joint_indices.w];
+
+    // Transform the position using the full 4x4 skin matrix
+    float4 animated_pos = mul(skin_matrix, float4(v.position, 1.0));
+
+    // Normals and tangents use only the 3x3 rotational portion (no translation).
+    // We extract it directly from the skin matrix.
+    float3x3 skin_rot = float3x3(
+        skin_matrix[0].xyz,
+        skin_matrix[1].xyz,
+        skin_matrix[2].xyz
+    );
+
+    float3 animated_normal  = normalize(mul(skin_rot, v.normal));
+    float3 animated_tangent = normalize(mul(skin_rot, v.tangent.xyz));
+
+    // Write the animated data to the output buffer
+    OutputVertex out_v;
+    out_v.position = animated_pos.xyz;
+    out_v.normal   = animated_normal;
+    out_v.tangent  = float4(animated_tangent, v.tangent.w); // Preserve handedness
+    out_v.texcoord = v.texcoord;  // UV coordinates are not affected by skinning
+
+    output_vertices[vertex_id] = out_v;
+}
+----
+
+The structure of this shader maps directly to the math we built in the previous section. Each thread processes exactly one vertex. It looks up the vertex's four joint matrices, builds the blended skin matrix from them, and applies it to the position, normal, and tangent. The output goes into the shared buffer.
+
+The thread group size of 64 is a common starting point for vertex-processing compute shaders. Modern GPU architectures execute threads in lockstep groups of 32 or 64 (called a "warp" on NVIDIA and a "wavefront" on AMD). By choosing a thread group size that is a multiple of these hardware widths, we avoid wasting processing lanes.
+
+== Compute Pipeline Setup
+
+To run our shader, we need to create a `VkComputePipeline`. This requires defining the descriptor set layout that matches our shader's bindings.
+
+[source,cpp]
+----
+// 1. Create Descriptor Set Layout
+VkDescriptorSetLayoutBinding bindings[3] = {};
+// t0: Input vertices
+bindings[0].binding = 0;
+bindings[0].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+bindings[0].descriptorCount = 1;
+bindings[0].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+// t1: Joint matrices
+bindings[1].binding = 1;
+bindings[1].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+bindings[1].descriptorCount = 1;
+bindings[1].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+// u2: Output vertices
+bindings[2].binding = 2;
+bindings[2].descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER;
+bindings[2].descriptorCount = 1;
+bindings[2].stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+
+VkDescriptorSetLayoutCreateInfo layoutInfo{};
+layoutInfo.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO;
+layoutInfo.bindingCount = 3;
+layoutInfo.pBindings = bindings;
+vkCreateDescriptorSetLayout(device, &layoutInfo, nullptr, &computeLayout);
+
+// 2. Create Pipeline Layout (including push constants)
+VkPushConstantRange pushRange{};
+pushRange.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT;
+pushRange.offset = 0;
+pushRange.size = sizeof(SkinPushConstants);
+
+VkPipelineLayoutCreateInfo pipelineLayoutInfo{};
+pipelineLayoutInfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO;
+pipelineLayoutInfo.setLayoutCount = 1;
+pipelineLayoutInfo.pSetLayouts = &computeLayout;
+pipelineLayoutInfo.pushConstantRangeCount = 1;
+pipelineLayoutInfo.pPushConstantRanges = &pushRange;
+vkCreatePipelineLayout(device, &pipelineLayoutInfo, nullptr, &pipelineLayout);
+
+// 3. Create Compute Pipeline
+VkComputePipelineCreateInfo pipelineInfo{};
+pipelineInfo.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO;
+pipelineInfo.layout = pipelineLayout;
+pipelineInfo.stage.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO;
+pipelineInfo.stage.stage = VK_SHADER_STAGE_COMPUTE_BIT;
+pipelineInfo.stage.module = skinningShaderModule;
+pipelineInfo.stage.pName = "main";
+vkCreateComputePipelines(device, VK_NULL_HANDLE, 1, &pipelineInfo, nullptr, &computePipeline);
+----
+
+== Dispatching the Compute Shader
+
+On the CPU side, we need to dispatch this shader once per skinned mesh, per frame, with enough thread groups to cover all the vertices.
+
+[source,cpp]
+----
+// Records one compute-skinning dispatch for a single skinned mesh.
+//   cmd              - command buffer being recorded
+//   skin             - supplies the descriptor set and the vertex count
+//   compute_pipeline - the skinning compute pipeline
+//   pipeline_layout  - layout used for descriptor and push-constant binding
+void dispatch_skinning(
+    VkCommandBuffer cmd,
+    const SkinComputeResources& skin,
+    VkPipeline compute_pipeline,
+    VkPipelineLayout pipeline_layout)
+{
+    // Must equal the thread-group width declared by [numthreads(64, 1, 1)]
+    // in skinning.slang.
+    constexpr uint32_t kVerticesPerGroup = 64;
+
+    vkCmdBindPipeline(cmd, VK_PIPELINE_BIND_POINT_COMPUTE, compute_pipeline);
+    vkCmdBindDescriptorSets(
+        cmd, VK_PIPELINE_BIND_POINT_COMPUTE, pipeline_layout,
+        0, 1, &skin.descriptor_set,   // first set = 0, one descriptor set
+        0, nullptr);                  // no dynamic offsets
+
+    // The shader compares each thread's index against this count so the
+    // surplus threads of the last group return early.
+    SkinPushConstants constants;
+    constants.vertex_count = skin.vertex_count;
+    vkCmdPushConstants(
+        cmd, pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT,
+        0, sizeof(SkinPushConstants), &constants);
+
+    // Ceiling division: enough groups for every vertex to get a thread.
+    const uint32_t group_count =
+        (skin.vertex_count + (kVerticesPerGroup - 1)) / kVerticesPerGroup;
+    vkCmdDispatch(cmd, group_count, 1, 1);
+}
+----
+
+== Synchronization: The Pipeline Barrier
+
+A critical detail that is easy to miss: after the compute shader writes to the output vertex buffer, we must insert a **pipeline barrier** before any other pipeline stage reads from it. Without this barrier, the GPU might start rasterizing with stale data from the previous frame while the compute shader is still writing the current frame's results.
+
+[source,cpp]
+----
+// Makes the skinned vertices written by the compute dispatch visible to the
+// GPU consumers recorded after this call.
+//   cmd                  - command buffer being recorded
+//   output_vertex_buffer - buffer the skinning shader wrote this frame
+void insert_skinning_barrier(VkCommandBuffer cmd, VkBuffer output_vertex_buffer)
+{
+    VkBufferMemoryBarrier2 barrier{};
+    barrier.sType         = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2;
+    barrier.buffer        = output_vertex_buffer;
+    barrier.offset        = 0;
+    barrier.size          = VK_WHOLE_SIZE;
+
+    // No queue family ownership transfer is intended.
+    barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+
+    // The compute stage writes to the buffer...
+    barrier.srcStageMask  = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT;
+    barrier.srcAccessMask = VK_ACCESS_2_SHADER_WRITE_BIT;
+
+    // ...and three kinds of readers follow:
+    //  - vertex input fetches it for rasterization,
+    //  - the acceleration structure build stage reads it during the BLAS
+    //    refit (geometry inputs are read there as shader reads),
+    //  - ray tracing shaders may fetch vertex attributes from it.
+    barrier.dstStageMask  = VK_PIPELINE_STAGE_2_VERTEX_INPUT_BIT |
+                            VK_PIPELINE_STAGE_2_ACCELERATION_STRUCTURE_BUILD_BIT_KHR |
+                            VK_PIPELINE_STAGE_2_RAY_TRACING_SHADER_BIT_KHR;
+    barrier.dstAccessMask = VK_ACCESS_2_VERTEX_ATTRIBUTE_READ_BIT |
+                            VK_ACCESS_2_SHADER_READ_BIT;
+
+    VkDependencyInfo dep_info{};
+    dep_info.sType                    = VK_STRUCTURE_TYPE_DEPENDENCY_INFO;
+    dep_info.bufferMemoryBarrierCount = 1;
+    dep_info.pBufferMemoryBarriers    = &barrier;
+
+    vkCmdPipelineBarrier2(cmd, &dep_info);
+}
+----
+
+This barrier uses the Vulkan 1.3 `vkCmdPipelineBarrier2` command (the synchronization2 API), which lets us express the source and destination stages and access masks per barrier, with fine granularity, in a single call. The barrier guarantees that the compute writes complete and are visible before any vertex input or ray tracing reads begin.
+
+== The Frame Loop
+
+With all the pieces in place, our per-frame compute skinning loop looks like this:
+
+[source,cpp]
+----
+// Records one frame of skinning work into cmd, then renders the scene.
+// Order matters: CPU animation and scene graph updates feed the joint
+// matrices, which feed the compute dispatch, which feeds every consumer.
+// NOTE(review): delta_time, joint_matrices_staging, skinned_pipeline and
+// skinned_pipeline_layout are not declared in this snippet - presumably
+// engine-level state; confirm where they live.
+void frame_update(VkCommandBuffer cmd, Scene& scene)
+{
+    // 1. Update animations (CPU): advance time, sample keyframes
+    scene.animation_system.update(delta_time);
+
+    // 2. Update scene graph (CPU): propagate dirty flags, recalculate world matrices
+    scene.scene_graph.update();
+
+    // 3. Upload joint matrices to GPU (CPU->GPU transfer)
+    // NOTE(review): if upload_joint_matrices records a GPU copy into cmd, a
+    // transfer->compute barrier is needed before step 4 - verify in that helper.
+    for (auto& skinned_mesh : scene.skinned_meshes) {
+        compute_joint_matrices(skinned_mesh.skin, scene.nodes, joint_matrices_staging);
+        upload_joint_matrices(cmd, joint_matrices_staging, skinned_mesh.compute_resources);
+    }
+
+    // 4. Dispatch compute skinning for each skinned mesh
+    for (auto& skinned_mesh : scene.skinned_meshes) {
+        dispatch_skinning(cmd, skinned_mesh.compute_resources,
+            skinned_pipeline, skinned_pipeline_layout);
+    }
+
+    // 5. Insert barrier: compute writes must complete before rasterizer/raytracer reads
+    // (one barrier call is recorded per skinned mesh's output buffer)
+    for (auto& skinned_mesh : scene.skinned_meshes) {
+        insert_skinning_barrier(cmd, skinned_mesh.compute_resources.output_vertex_buffer);
+    }
+
+    // 6. Now rasterize, ray trace, and run physics queries — all reading from the same output buffer
+    render_scene(cmd, scene);
+}
+----
+
+In the next section, we will look in detail at steps 6 and beyond—how the rasterizer, the ray tracing acceleration structure, and the physics system each consume the same output vertex buffer, and what the setup for each of those consumers looks like.
+
+xref:Advanced_glTF/Skeletal_Compute_Skinning/02_skinning_math.adoc[Previous: The Mathematics of Skinning] | xref:Advanced_glTF/Skeletal_Compute_Skinning/04_shared_vertex_buffer.adoc[Next: Skin Once, Use Everywhere]
diff --git a/en/Advanced_glTF/Skeletal_Compute_Skinning/04_shared_vertex_buffer.adoc b/en/Advanced_glTF/Skeletal_Compute_Skinning/04_shared_vertex_buffer.adoc
new file mode 100644
index 000000000..b36eb17fa
--- /dev/null
+++ b/en/Advanced_glTF/Skeletal_Compute_Skinning/04_shared_vertex_buffer.adoc
@@ -0,0 +1,220 @@
+:pp: {plus}{plus}
+= Skin Once, Use Everywhere
+
+== The Payoff
+
+In the previous two sections, we built a compute skinning pipeline that transforms a rest-pose vertex buffer into an animated vertex buffer, and deposits the results into a dedicated output SSBO. Now we collect the payoff from that investment.
+
+The premise of "Skin Once, Use Everywhere" is that every system that needs animated vertex data should read from the same output buffer. We do not run separate skinning passes for the rasterizer and the ray tracer. We do not approximate the physics collision by using T-pose geometry. We skin once, and everyone gets the same, correct, animated mesh. This section explains how to wire up each of those consumers.
+
+== Consumer 1: The Rasterizer
+
+This is the simplest consumer to wire up because the output vertex buffer was already created with the `VK_BUFFER_USAGE_VERTEX_BUFFER_BIT` flag. From Vulkan's perspective, it is just a vertex buffer. The rasterizer does not know or care that it was written by a compute shader.
+
+The change is minimal. Instead of binding your rest-pose vertex buffer when issuing a draw call for a skinned mesh, you bind the output vertex buffer:
+
+[source,cpp]
+----
+void draw_skinned_mesh(
+    VkCommandBuffer cmd,
+    const SkinComputeResources& skin,
+    VkBuffer index_buffer,
+    uint32_t index_count)
+{
+    // The key change: bind the COMPUTE OUTPUT buffer as the vertex source,
+    // not the original rest-pose vertex buffer.
+    VkDeviceSize offset = 0;
+    vkCmdBindVertexBuffers(cmd, 0, 1, &skin.output_vertex_buffer, &offset);
+    vkCmdBindIndexBuffer(cmd, index_buffer, 0, VK_INDEX_TYPE_UINT32);
+
+    // Draw as normal - the rasterizer will read animated positions and normals
+    vkCmdDrawIndexed(cmd, index_count, 1, 0, 0, 0);
+}
+----
+
+One important note: the **index buffer** is unchanged. The topology of the mesh—which vertices form which triangles—doesn't change when a character animates. Only the vertex positions, normals, and tangents change. The index buffer always references the rest-pose vertex layout, which now maps directly onto our output buffer's structure since we kept the vertex order identical.
+
+Your vertex shader for skinned meshes also becomes simpler. It no longer needs to perform any skinning math. It simply reads the already-animated position and normal from the buffer and proceeds to clip-space transformation:
+
+[source,slang]
+----
+// skinned_mesh.vert.slang
+// The vertex shader for skinned meshes does NO skinning itself.
+// The compute shader already did the work.
+struct VertexInput {
+    float3 position  : POSITION;
+    float3 normal    : NORMAL;
+    float4 tangent   : TANGENT;
+    float2 texcoord  : TEXCOORD0;
+};
+
+struct VertexOutput {
+    float4 clip_pos  : SV_Position;
+    float3 world_pos : TEXCOORD1;
+    float3 normal    : TEXCOORD2;
+    float2 texcoord  : TEXCOORD0;
+};
+
+cbuffer CameraUBO {
+    float4x4 view;
+    float4x4 projection;
+};
+
+[shader("vertex")]
+VertexOutput main(VertexInput input)
+{
+    VertexOutput output;
+    // The position is already in world space — compute shader handled the animation.
+    output.clip_pos  = mul(projection, mul(view, float4(input.position, 1.0)));
+    output.world_pos = input.position;
+    output.normal    = input.normal;
+    output.texcoord  = input.texcoord;
+    return output;
+}
+----
+
+== Consumer 2: The Ray Tracing Acceleration Structure
+
+Ray tracing in Vulkan requires a two-level hierarchy of acceleration structures. The **Bottom-Level Acceleration Structure (BLAS)** stores the actual geometry—the triangles of a specific mesh. The **Top-Level Acceleration Structure (TLAS)** stores instances of BLASes, each with a transform matrix, and is what the ray tracing shaders actually traverse.
+
+For static geometry, you build the BLAS once and never touch it again. For animated, skinned geometry, the vertex positions change every frame, which means the BLAS must also be updated every frame.
+
+Vulkan supports this through **BLAS updates**, also called **BLAS refits**. A refit is a fast, in-place update of the acceleration structure that keeps the existing BVH (Bounding Volume Hierarchy) tree and only recomputes its bounding volumes for the new vertex positions, instead of reconstructing the tree from scratch. Refits are significantly faster than full rebuilds (typically 5-10x faster), making them suitable for per-frame updates on skinned geometry.
+
+The key to making this work with our shared buffer is simple: when we specify the geometry for the BLAS, we point it at our compute shader's output buffer, not the rest-pose buffer.
+
+[source,cpp]
+----
+// Called at initialization: create the initial BLAS pointing at our output buffer.
+// The BLAS uses our output buffer as its geometry source.
+// NOTE(review): skin.index_buffer is read below, but SkinComputeResources as
+// declared in the buffer-architecture section has no such member - confirm
+// where the index buffer handle actually lives.
+VkAccelerationStructureGeometryKHR geometry{};
+geometry.sType        = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_KHR;
+geometry.geometryType = VK_GEOMETRY_TYPE_TRIANGLES_KHR;
+
+auto& triangles = geometry.geometry.triangles;
+triangles.sType          = VK_STRUCTURE_TYPE_ACCELERATION_STRUCTURE_GEOMETRY_TRIANGLES_DATA_KHR;
+// Only the position (first three floats of each vertex) is used by the BLAS
+triangles.vertexFormat   = VK_FORMAT_R32G32B32_SFLOAT;
+// NOTE(review): must equal the stride the compute shader actually writes with - TODO confirm
+triangles.vertexStride   = sizeof(OutputVertex); // Stride of our output vertex layout
+
+// The crucial line: use the output vertex buffer's device address as the geometry source
+triangles.vertexData.deviceAddress = get_buffer_device_address(device, skin.output_vertex_buffer);
+// Highest vertex index that may be referenced
+triangles.maxVertex      = skin.vertex_count - 1;
+triangles.indexType      = VK_INDEX_TYPE_UINT32;
+triangles.indexData.deviceAddress = get_buffer_device_address(device, skin.index_buffer);
+----
+
+Then, every frame—after the compute skinning dispatch and its pipeline barrier—we issue a BLAS refit:
+
+[source,cpp]
+----
+// Called every frame, after the skinning barrier.
+// Records an in-place update (refit) of one BLAS.
+//   cmd                 - command buffer being recorded
+//   blas                - acceleration structure used as both source and destination
+//   build_info_template - copied; only mode and src/dst handles are overwritten here
+//   primitive_count     - number of triangles in the geometry
+// NOTE(review): assumes build_info_template already carries the geometry
+// pointers and scratch buffer address, and that the BLAS was originally built
+// with VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_UPDATE_BIT_KHR - confirm at the
+// call site.
+void update_blas(
+    VkCommandBuffer cmd,
+    VkAccelerationStructureKHR blas,
+    const VkAccelerationStructureBuildGeometryInfoKHR& build_info_template,
+    uint32_t primitive_count)
+{
+    VkAccelerationStructureBuildGeometryInfoKHR build_info = build_info_template;
+
+    // Specify UPDATE mode (not BUILD) — this is the "refit" operation
+    build_info.mode = VK_BUILD_ACCELERATION_STRUCTURE_MODE_UPDATE_KHR;
+    build_info.srcAccelerationStructure = blas; // Refit in-place
+    build_info.dstAccelerationStructure = blas;
+
+    // Remaining range fields stay zero-initialized
+    VkAccelerationStructureBuildRangeInfoKHR range_info{};
+    range_info.primitiveCount = primitive_count; // Number of triangles
+
+    // The build command takes an array of pointers to range infos, one per build info
+    const VkAccelerationStructureBuildRangeInfoKHR* p_range_info = &range_info;
+    vkCmdBuildAccelerationStructuresKHR(cmd, 1, &build_info, &p_range_info);
+}
+----
+
+After the refit completes, the BLAS contains the updated geometry, and any ray tracing queries during this frame will intersect the animated mesh correctly. The character's shadows, reflections, and ambient occlusion will all respond to their animated pose, not their T-pose.
+
+One important caveat: BLAS refits work best when the topology doesn't change and the vertex positions don't move dramatically between frames. They are optimal for smooth skeletal animation. If your animation involves vertices moving large distances (such as cloth simulation or a very fast action), a full BLAS rebuild may produce better ray traversal performance, at the cost of the additional rebuild time.
+
+== Consumer 3: Physics System Integration
+
+The physics system is perhaps the most nuanced consumer of the skinned vertex buffer. Unlike the rasterizer and ray tracer—which can directly address GPU buffers—most physics engines (Bullet, Jolt, PhysX) are CPU-side libraries. They calculate collisions and forces entirely on the CPU and have no direct access to GPU memory.
+
+This creates a tension: our animated vertex data is on the GPU, but the physics engine needs it on the CPU.
+
+There are two approaches to resolving this, and the right choice depends on your use case.
+
+=== Approach 1: Bone-Proxy Colliders (Recommended)
+
+For character ragdolls and most gameplay physics, reading back the full mesh vertex buffer is unnecessary and expensive. Instead, we attach simplified **collision shapes**—capsules, boxes, or spheres—to specific bones in the skeleton. We already established this pattern in the "Physics Syncing" section of the previous chapter.
+
+In this approach, we never need to read the actual mesh vertices from the GPU. The physics system operates on the bone proxies, which are driven by the skeleton's joint transforms. Those transforms are already on the CPU (we computed them in the joint matrix calculation step). The mesh skinning on the GPU is purely visual—it makes the character *look* correct.
+
+This is the right approach for 95% of character physics use cases. It is cheap, robust, and gives artists direct control over the collision geometry.
+
+=== Approach 2: Mesh-Accurate Physics Queries (Advanced)
+
+Occasionally, you need physics accuracy that bone proxies cannot provide. Examples include:
+
+- **Cloth and soft-body simulation** where the physics engine must know every vertex's position
+- **Mesh-accurate collision** for very detailed objects like a character's cape interacting with geometry
+- **Physics-based sound triggers** that fire when specific mesh regions collide
+
+In these cases, you need to read back some or all of the output vertex buffer from the GPU to the CPU. This is done using a **staging buffer** and a GPU-to-CPU memory transfer.
+
+[source,cpp]
+----
+// Only do this if you truly need mesh-accurate CPU-side physics.
+// This is expensive: it stalls the GPU pipeline and involves a memory copy.
+//
+// Records a GPU->CPU copy of the skinned vertices, including the barriers
+// that order it against the skinning dispatch and the later host read.
+//   cmd                  - command buffer being recorded
+//   output_vertex_buffer - skinned vertices written by the compute shader
+//   staging_buffer       - CPU-visible destination of the copy
+//   vertex_count         - number of OutputVertex entries to copy
+//   physics_verts_out    - not touched here; it is filled from the mapped
+//                          staging buffer once the submission's fence signals
+void readback_vertices_to_physics(
+    VkCommandBuffer cmd,
+    VkBuffer output_vertex_buffer,
+    VkBuffer staging_buffer,  // CPU-visible (host-coherent) staging buffer
+    uint32_t vertex_count,
+    std::vector<PhysicsVertex>& physics_verts_out)
+{
+    // Nothing to copy; a zero-sized copy region is not valid.
+    if (vertex_count == 0) {
+        return;
+    }
+
+    const VkDeviceSize copy_size =
+        static_cast<VkDeviceSize>(vertex_count) * sizeof(OutputVertex);
+
+    // The skinning dispatch wrote output_vertex_buffer in the compute stage;
+    // make those writes visible to the transfer stage before copying.
+    VkBufferMemoryBarrier2 to_transfer{};
+    to_transfer.sType               = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2;
+    to_transfer.srcStageMask        = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT;
+    to_transfer.srcAccessMask       = VK_ACCESS_2_SHADER_WRITE_BIT;
+    to_transfer.dstStageMask        = VK_PIPELINE_STAGE_2_TRANSFER_BIT;
+    to_transfer.dstAccessMask       = VK_ACCESS_2_TRANSFER_READ_BIT;
+    to_transfer.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    to_transfer.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    to_transfer.buffer              = output_vertex_buffer;
+    to_transfer.offset              = 0;
+    to_transfer.size                = copy_size;
+
+    VkDependencyInfo pre_copy{};
+    pre_copy.sType                    = VK_STRUCTURE_TYPE_DEPENDENCY_INFO;
+    pre_copy.bufferMemoryBarrierCount = 1;
+    pre_copy.pBufferMemoryBarriers    = &to_transfer;
+    vkCmdPipelineBarrier2(cmd, &pre_copy);
+
+    VkBufferCopy copy_region{};
+    copy_region.srcOffset = 0;
+    copy_region.dstOffset = 0;
+    copy_region.size = copy_size;
+
+    // Copy from GPU-local output buffer to CPU-accessible staging buffer
+    vkCmdCopyBuffer(cmd, output_vertex_buffer, staging_buffer, 1, &copy_region);
+
+    // Waiting on a fence alone does not make device writes visible to the
+    // host; the copy's writes need an explicit dependency to host reads.
+    VkBufferMemoryBarrier2 to_host{};
+    to_host.sType               = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER_2;
+    to_host.srcStageMask        = VK_PIPELINE_STAGE_2_TRANSFER_BIT;
+    to_host.srcAccessMask       = VK_ACCESS_2_TRANSFER_WRITE_BIT;
+    to_host.dstStageMask        = VK_PIPELINE_STAGE_2_HOST_BIT;
+    to_host.dstAccessMask       = VK_ACCESS_2_HOST_READ_BIT;
+    to_host.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    to_host.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
+    to_host.buffer              = staging_buffer;
+    to_host.offset              = 0;
+    to_host.size                = copy_size;
+
+    VkDependencyInfo post_copy{};
+    post_copy.sType                    = VK_STRUCTURE_TYPE_DEPENDENCY_INFO;
+    post_copy.bufferMemoryBarrierCount = 1;
+    post_copy.pBufferMemoryBarriers    = &to_host;
+    vkCmdPipelineBarrier2(cmd, &post_copy);
+
+    // After the command buffer executes and fences signal, map the staging buffer
+    // and copy the data into the physics system's vertex array.
+    // (This part happens after vkQueueSubmit and fence wait)
+}
+----
+
+This approach adds latency: you are reading data that was computed *this frame* into physics that will affect *next frame's* simulation. For most games, this one-frame lag is acceptable and invisible. For very fast gameplay requiring tight timing, you may need to architect around this latency.
+
+== Putting It Together: The Data Flow
+
+It is worth pausing to appreciate the data flow we have built:
+
+....
+[glTF File on Disk]
+         |
+         | (load at startup)
+         v
+[CPU: Rest-Pose Vertices]  ──upload──>  [GPU: Input Vertex Buffer]
+[CPU: Inverse Bind Matrices]            (never changes again)
+         |
+    (each frame)
+         |
+[CPU: Animation Update]
+[CPU: Scene Graph Update]
+[CPU: Compute Joint Matrices]  ──upload──>  [GPU: Joint Matrix Buffer]
+                                             (updated each frame)
+         |
+    (GPU Compute Dispatch)
+         v
+         [GPU: Output Vertex Buffer]  (animated, correct, up to date)
+        /           |              \
+       v            v               v
+[Rasterizer]  [Ray Tracing BLAS]  [Physics Readback (if needed)]
+....
+
+Every part of the rendering and simulation stack reads from a single, authoritative source of animated vertex data. If the character's arm is raised, the shadow, the reflection, the collision, and the rasterized image all show the arm raised. There is no inconsistency, no double-computation, and no T-pose artifacts.
+
+In the next section, we turn our attention to the quality of the animation itself. We have been assuming linear keyframe interpolation, which is simple but produces robotic, mechanical movement. We will implement cubic spline interpolation for smooth, natural curves, and add the ability to blend between multiple animation clips—including the critical transition from animation to physics-driven ragdoll.
+
+xref:Advanced_glTF/Skeletal_Compute_Skinning/03_compute_skinning.adoc[Previous: The Compute Skinning Pipeline] | xref:Advanced_glTF/Skeletal_Compute_Skinning/05_interpolation_blending.adoc[Next: Interpolation & Blending]
diff --git a/en/Advanced_glTF/Skeletal_Compute_Skinning/05_interpolation_blending.adoc b/en/Advanced_glTF/Skeletal_Compute_Skinning/05_interpolation_blending.adoc
new file mode 100644
index 000000000..382be569b
--- /dev/null
+++ b/en/Advanced_glTF/Skeletal_Compute_Skinning/05_interpolation_blending.adoc
@@ -0,0 +1,306 @@
+:pp: {plus}{plus}
+= Interpolation & Blending
+
+== Why Interpolation Quality Matters
+
+In the "Building a Simple Engine" series, we implemented **linear interpolation** for animation keyframes, which is sometimes called **LERP** for positions and **SLERP** for rotations. Linear interpolation is simple and fast: given two keyframe values and a normalized time factor `t` (between 0 and 1), the result is a straight-line blend between them.
+
+For many applications, this is perfectly adequate. But if you have ever watched a game character whose arm snaps to a different speed at each keyframe, or whose movement feels stiff and mechanical despite clearly having smooth animation curves in the authoring tool, you have witnessed the visual consequence of linear-only interpolation. The velocity of the motion is constant within each keyframe interval but jumps abruptly at the boundary. Animators spend significant time crafting **easing**—the way a motion accelerates into and decelerates out of a pose—and linear interpolation discards all of that work.
+
+glTF supports three interpolation modes in its animation samplers: `LINEAR`, `STEP`, and `CUBICSPLINE`. We already handle `LINEAR` and `STEP` (which is simply "hold the previous value with no transition"). `CUBICSPLINE` is the mode that preserves the animator's easing, and it is what we need to implement to achieve professional-quality character animation.
+
+== Understanding Cubic Spline Interpolation
+
+A **Cubic Hermite Spline** is a mathematical curve that passes through a set of control points (our keyframes) while also respecting **tangent vectors** at each control point. These tangents define the slope—the velocity—of the curve as it approaches and leaves each keyframe.
+
+In glTF's cubic spline format, each keyframe stores three values instead of one:
+
+- An **in-tangent**: the tangent vector as the curve *arrives at* this keyframe
+- A **value**: the actual pose value at this keyframe
+- An **out-tangent**: the tangent vector as the curve *departs from* this keyframe
+
+These tangents are set by the artist in their animation software (Blender, Maya, etc.) to control the shape of the motion curves. When the animator says "I want this arm to ease in slowly and snap out quickly," those intentions are encoded in the tangent values.
+
+The cubic Hermite interpolation formula for a value at time `t` between keyframes `p0` (with out-tangent `m0`) and `p1` (with in-tangent `m1`) is:
+
+....
+h00 = 2t³ - 3t² + 1       (basis function for p0)
+h10 = t³ - 2t² + t        (basis function for m0)
+h01 = -2t³ + 3t²          (basis function for p1)
+h11 = t³ - t²             (basis function for m1)
+
+result = h00 * p0 + h10 * delta_time * m0 + h01 * p1 + h11 * delta_time * m1
+....
+
+The `delta_time` here is the duration of the keyframe interval (the time difference between `p0` and `p1`). It is necessary to scale the tangents correctly—without it, the curve's shape depends on the keyframe timing in an uncontrolled way.
+
+In C{pp}, the implementation for a `glm::vec3` translation channel looks like this:
+
+[source,cpp]
+----
+glm::vec3 cubic_spline_interpolate_vec3(
+    float t,              // Position within the interval: 0 at p0, 1 at p1
+    float dt,             // Duration of the keyframe interval
+    glm::vec3 p0,         // Value at the earlier keyframe
+    glm::vec3 out_tan0,   // Tangent leaving the earlier keyframe
+    glm::vec3 p1,         // Value at the later keyframe
+    glm::vec3 in_tan1)    // Tangent arriving at the later keyframe
+{
+    const float t_sq   = t * t;
+    const float t_cube = t_sq * t;
+    const float h00 =  2.0f * t_cube - 3.0f * t_sq + 1.0f;   // Hermite weight of p0
+    const float h10 =         t_cube - 2.0f * t_sq + t;      // Hermite weight of out_tan0
+    const float h01 = -2.0f * t_cube + 3.0f * t_sq;          // Hermite weight of p1
+    const float h11 =         t_cube -        t_sq;          // Hermite weight of in_tan1
+
+    // Sum the four weighted terms; both tangent weights are scaled by dt first
+    glm::vec3 result = h00 * p0;
+    result += (h10 * dt) * out_tan0;
+    result += h01 * p1;
+    return result + (h11 * dt) * in_tan1;
+}
+----
+
+For rotations, the process is slightly different. A component-wise blend of unit quaternions does not, in general, produce a unit quaternion, so the raw result is not a valid rotation. You can either use **Squad** (Spherical Quadrangle interpolation) or, more commonly in game engines, apply the Hermite blend to the quaternion's four components as if they were a plain 4-vector and then normalize the result. glTF takes the second approach and requires the interpolated quaternion to be normalized after the Hermite blend:
+
+[source,cpp]
+----
+glm::quat cubic_spline_interpolate_quat(
+    float t,
+    float dt,
+    glm::quat p0, glm::quat out_tan0,
+    glm::quat p1, glm::quat in_tan1)
+{
+    // View a quaternion as a plain (x, y, z, w) vector so it can be blended component-wise
+    auto as_vec4 = [](const glm::quat& q) { return glm::vec4(q.x, q.y, q.z, q.w); };
+    const float t_sq   = t * t;
+    const float t_cube = t_sq * t;
+
+    const float h00 =  2.0f * t_cube - 3.0f * t_sq + 1.0f;
+    const float h10 =         t_cube - 2.0f * t_sq + t;
+    const float h01 = -2.0f * t_cube + 3.0f * t_sq;
+    const float h11 =         t_cube -        t_sq;
+
+    glm::vec4 sum = h00 * as_vec4(p0);
+    sum += (h10 * dt) * as_vec4(out_tan0);
+    sum += h01 * as_vec4(p1);
+    sum += (h11 * dt) * as_vec4(in_tan1);
+
+    // The Hermite sum is generally not unit length, so renormalize it into a rotation
+    return glm::normalize(glm::quat(sum.w, sum.x, sum.y, sum.z));
+}
+----
+
+== Integrating Cubic Splines Into the Animation System
+
+To support cubic splines, our `AnimationSampler` data structure needs to store three values per keyframe instead of one. When the sampler's interpolation type is `CUBICSPLINE`, each keyframe in the glTF binary buffer contains `[in_tangent, value, out_tangent]` in that order.
+
+[source,cpp]
+----
+enum InterpolationMode {   // Keyframe interpolation modes a glTF animation sampler can declare
+    STEP,                  // Hold the previous keyframe's value with no transition
+    LINEAR,                // Straight-line blend between the two surrounding keyframes
+    CUBICSPLINE            // Cubic Hermite spline using per-keyframe in/out tangents
+};
+
+struct AnimationSampler {                   // One animation curve: timestamps plus the data sampled at them
+    InterpolationMode interpolation;        // How values between keyframes are computed
+    std::vector<float> inputs;              // Timestamps
+    std::vector<glm::vec4> outputs_raw;     // Raw packed output data
+
+    // Per-keyframe split of outputs_raw; load_animation_sampler fills the two tangent arrays only for CUBICSPLINE
+    std::vector<glm::vec4> in_tangents;
+    std::vector<glm::vec4> values;
+    std::vector<glm::vec4> out_tangents;
+};
+
+struct AnimationChannel {                                      // Ties one sampler to one animated property of one node
+    enum PathType { TRANSLATION, ROTATION, SCALE, WEIGHTS };   // Which property the channel drives
+    PathType path;
+    uint32_t node_index;      // presumably an index into the scene's node array; confirm against the loader
+    uint32_t sampler_index;   // presumably an index into the owning animation's sampler list; confirm
+};
+----
+
+Our updated sampler loading code must handle this layout:
+
+[source,cpp]
+----
+void load_animation_sampler(                          // Convert one tinygltf sampler into the engine's AnimationSampler
+    const tinygltf::AnimationSampler& gltf_sampler,   // Source sampler: interpolation string plus accessor indices
+    const tinygltf::Model& model,                     // Model that owns the accessors referenced above
+    AnimationSampler& out_sampler)                    // Filled in place with timestamps and keyframe data
+{
+    // Parse the interpolation type; glTF's default is LINEAR, so anything unrecognized falls back to it
+    if (gltf_sampler.interpolation == "STEP")             out_sampler.interpolation = STEP;
+    else if (gltf_sampler.interpolation == "CUBICSPLINE") out_sampler.interpolation = CUBICSPLINE;
+    else                                                  out_sampler.interpolation = LINEAR;
+
+    // Load the input timestamps (these are always plain floats, no tangents)
+    load_accessor_float(model, gltf_sampler.input, out_sampler.inputs);
+
+    // For CUBICSPLINE, the output accessor contains 3 values per timestamp:
+    // [in_tangent, value, out_tangent]. The total count is 3 * keyframe_count.
+    // For LINEAR/STEP, the output count equals the keyframe count.
+    load_accessor_vec4(model, gltf_sampler.output, out_sampler.outputs_raw);
+
+    if (out_sampler.interpolation == CUBICSPLINE) {
+        // Split the interleaved data into separate arrays (assumes outputs_raw holds 3 entries per timestamp)
+        size_t keyframe_count = out_sampler.inputs.size();
+        out_sampler.in_tangents.resize(keyframe_count);
+        out_sampler.values.resize(keyframe_count);
+        out_sampler.out_tangents.resize(keyframe_count);
+
+        for (size_t i = 0; i < keyframe_count; ++i) {
+            out_sampler.in_tangents[i]  = out_sampler.outputs_raw[i * 3 + 0];
+            out_sampler.values[i]       = out_sampler.outputs_raw[i * 3 + 1];
+            out_sampler.out_tangents[i] = out_sampler.outputs_raw[i * 3 + 2];
+        }
+    } else {
+        out_sampler.values = out_sampler.outputs_raw;
+    }
+}
+----
+
+== Animation Blending: Cross-Fading Between Clips
+
+Interpolation improves the quality of motion *within* a single animation clip. But a real character engine needs to transition *between* clips—from a walk to a run, from an idle to an attack, or most critically for our purposes, from a scripted animation to a physics-driven ragdoll.
+
+**Cross-fading** is the most common technique. When a transition is triggered, we don't immediately switch to the new clip. Instead, we play both clips simultaneously and blend their output poses together over a short transition period (typically 0.1 to 0.3 seconds). At the start of the transition, the old clip has weight 1.0 and the new clip has weight 0.0. By the end, the weights have reversed. During the transition, both are non-zero and their poses are blended.
+
+The core of this system is a **Pose Blend** function that takes two poses (sets of joint transforms) and a blend factor, and returns a weighted combination:
+
+[source,cpp]
+----
+// A Pose is a snapshot of all joint local transforms for a single frame
+struct Pose {
+    std::vector<glm::vec3> translations;   // Local translation, one entry per joint
+    std::vector<glm::quat> rotations;      // Local rotation, one entry per joint
+    std::vector<glm::vec3> scales;         // Local scale, one entry per joint
+};
+
+// Blend two poses together: result = (1-t) * pose_a + t * pose_b (t = 0 gives pose_a, t = 1 gives pose_b); out is resized to fit
+void blend_poses(const Pose& pose_a, const Pose& pose_b, float t, Pose& out)
+{
+    // The loop below indexes all six input arrays up to joint_count, so each must hold one entry per joint
+    const size_t joint_count = pose_a.translations.size();
+    assert(pose_b.translations.size() == joint_count);
+    assert(pose_a.rotations.size() == joint_count && pose_b.rotations.size() == joint_count);
+    assert(pose_a.scales.size() == joint_count && pose_b.scales.size() == joint_count);
+    out.translations.resize(joint_count);
+    out.rotations.resize(joint_count);
+    out.scales.resize(joint_count);
+
+    for (size_t i = 0; i < joint_count; ++i) {
+        // Translation and scale use standard linear interpolation
+        out.translations[i] = glm::mix(pose_a.translations[i], pose_b.translations[i], t);
+        out.scales[i]       = glm::mix(pose_a.scales[i],       pose_b.scales[i],       t);
+        // Rotation uses spherical linear interpolation (SLERP) for shortest path
+        out.rotations[i] = glm::slerp(pose_a.rotations[i], pose_b.rotations[i], t);
+    }
+}
+----
+
+A cross-fade controller manages the transition state:
+
+[source,cpp]
+----
+struct CrossFadeState {              // Bookkeeping for one cross-fade between two clips
+    uint32_t  from_clip_index;       // Clip being faded out
+    uint32_t  to_clip_index;         // Clip being faded in
+    float     blend_factor;   // 0.0 = fully from_clip, 1.0 = fully to_clip
+    float     transition_duration;   // Length of the fade, in the same time unit as update_cross_fade's delta_time
+    bool      active = false;        // True while a fade is running; update_cross_fade clears it on completion
+};
+
+void update_cross_fade(CrossFadeState& fade, float delta_time)   // Advance an in-progress cross-fade by delta_time
+{
+    if (!fade.active) return;   // Nothing to do when no fade is running
+    // A non-positive (or NaN) duration finishes at once instead of dividing by zero and stalling on NaN
+    const bool instant = !(fade.transition_duration > 0.0f);
+    if (!instant) fade.blend_factor += delta_time / fade.transition_duration;
+    if (instant || fade.blend_factor >= 1.0f) {
+        // Transition complete - snap to the destination clip; the caller should now update the "current clip" to to_clip_index
+        fade.blend_factor = 1.0f;
+        fade.active = false;
+    }
+}
+----
+
+== The Ragdoll Blend: From Animation to Physics
+
+The cross-fade system becomes especially important for the animation-to-ragdoll handoff. This is not merely a cross-fade between two animation clips—it is a transition between the animation system driving the skeleton and the physics system driving it.
+
+Doing this as an abrupt switch creates a jarring visual artifact: the character instantly snaps from their animated pose into whatever pose the physics engine calculated as its initial state. Even if the physics engine starts from the correct pose, the absence of a blend period makes the handoff look wrong.
+
+Instead, we implement a **Physics Blend Weight**. The animation system continues to calculate its pose. The physics system also calculates its pose (ragdoll). We blend between them based on a `ragdoll_weight` value that we ramp from 0.0 to 1.0 over the transition period.
+
+[source,cpp]
+----
+struct CharacterAnimationState {   // Per-character state for blending clip animation with ragdoll physics
+    Pose    animation_pose;  // Pose from the current animation clip
+    Pose    physics_pose;    // Pose from the ragdoll physics simulation
+    float   ragdoll_weight;  // 0.0 = fully animated, 1.0 = fully ragdoll; raised by update_character_pose while ragdoll_active
+    bool    ragdoll_active;  // Is the physics simulation enabled?
+};
+
+void update_character_pose(
+    CharacterAnimationState& state,
+    float delta_time,
+    std::vector<Node>& nodes,
+    const Skin& skin)
+{
+    // The clip is sampled on every call, ragdoll or not, because its pose is
+    // one of the two inputs to the blend further down.
+    sample_animation(state.animation_pose, delta_time);
+
+    if (!state.ragdoll_active) {
+        // No physics involvement: the sampled animation pose drives the skeleton as-is
+        apply_pose_to_scene_graph(state.animation_pose, nodes, skin);
+        return;
+    }
+
+    // Fetch what the ragdoll simulation produced
+    read_physics_pose(state.physics_pose, nodes, skin);
+
+    // Advance the hand-off: the weight grows by delta_time * 5.0f per call and saturates at 1.0
+    state.ragdoll_weight = glm::min(state.ragdoll_weight + delta_time * 5.0f, 1.0f);
+
+    // Mix animation (weight 0.0) with physics (weight 1.0), then write the result to the nodes
+    Pose mixed;
+    blend_poses(state.animation_pose, state.physics_pose, state.ragdoll_weight, mixed);
+    apply_pose_to_scene_graph(mixed, nodes, skin);
+}
+
+/**
+ * Copies every joint transform stored in `pose` onto the scene-graph node
+ * that `skin.joints` maps that joint to. Only local transforms are written;
+ * each touched node is marked dirty so its world matrix gets recomputed.
+ */
+void apply_pose_to_scene_graph(const Pose& pose, std::vector<Node>& nodes, const Skin& skin) {
+    for (size_t joint = 0; joint < skin.joints.size(); ++joint) {
+        const uint32_t node_slot = skin.joints[joint];
+        Node& target = nodes[node_slot];
+
+        target.translation    = pose.translations[joint];
+        target.local_rotation = pose.rotations[joint];
+        target.scale          = pose.scales[joint];
+
+        // Flag the node so the scene graph recalculates its world matrix
+        target.mark_dirty();
+    }
+}
+----
+
+The `delta_time * 5.0f` term raises `ragdoll_weight` at a rate of 5.0 per second, which means the transition completes in 0.2 seconds (`1.0 / 5.0`). You can tune this constant to taste—faster transitions look more like a sudden "drop," while slower transitions make the character look like they are being gently handed off to gravity.
+
+One critical implementation detail: when `ragdoll_active` becomes true, we must immediately set the physics body's initial state to match the current animation pose. If the physics simulation starts from an arbitrary or zero pose, the blend will look wrong for the first few frames as the physics system "snaps" to the right starting position. By initializing the physics bodies from the animation pose at the moment of the handoff, the blend begins from a state of near-zero difference, ensuring a seamless transition.
+
+== Summary
+
+With cubic spline interpolation, our animation system now faithfully reproduces the subtle easing and velocity curves that animators craft in their authoring tools. Characters no longer feel like they are driven by mechanical, constant-velocity keyframes.
+
+With cross-fade blending, we can transition smoothly between any two animation clips, and most importantly, we can blend from a scripted animation into a full physics ragdoll simulation in a way that looks natural and responsive.
+
+These two systems—interpolation quality and blending—are what separate animation that merely "works" from animation that *feels alive*.
+
+xref:Advanced_glTF/Skeletal_Compute_Skinning/04_shared_vertex_buffer.adoc[Previous: Skin Once, Use Everywhere] | xref:Advanced_glTF/Skeletal_Compute_Skinning/06_conclusion.adoc[Next: Chapter Conclusion]
diff --git a/en/Advanced_glTF/Skeletal_Compute_Skinning/06_conclusion.adoc b/en/Advanced_glTF/Skeletal_Compute_Skinning/06_conclusion.adoc
new file mode 100644
index 000000000..95eda5a6d
--- /dev/null
+++ b/en/Advanced_glTF/Skeletal_Compute_Skinning/06_conclusion.adoc
@@ -0,0 +1,67 @@
+:pp: {plus}{plus}
+= Skeletal & Compute Skinning: Conclusion
+
+== What We Built
+
+In this chapter, we transformed our animation system from a vertex-shader-based approximation into a dedicated, production-grade compute pipeline. Let's take a moment to appreciate the full scope of what we built and why each piece matters.
+
+We started with the mathematical foundation, establishing exactly why the Inverse Bind Matrix exists and what problem it solves. The three-step skinning equation—transform to bone-local space, apply the animated joint transform, blend multiple influences—is no longer a black box. You can now look at a corrupted or misaligned skinned mesh and reason about which step in that equation is likely responsible.
+
+We moved that skinning math from the vertex shader into a Vulkan Compute Shader, and in doing so, we created a persistent, GPU-resident output vertex buffer that holds the authoritative animated state of the character's mesh. This architectural change is what enabled everything that followed.
+
+By pointing the rasterizer, the ray tracing BLAS, and the physics readback system at the same output buffer, we eliminated the fragmentation that plagues naive implementations. There are no more T-pose shadows. No more collision hulls lagging behind the visual mesh. No more paying the skinning cost multiple times for different rendering features. One dispatch, one buffer, everyone benefits.
+
+Finally, we improved the quality of motion itself—moving from linear interpolation (which is accurate but lifeless) to cubic Hermite splines (which preserve the animator's intent), and adding cross-fade blending so our characters transition between states smoothly rather than snapping abruptly.
+
+== Bridging Animation to the Scene Graph
+
+A common point of confusion for beginners is how the output of the animation system—the `Pose`—actually moves the character in the world. The `Pose` contains the local transforms for every joint in a skeleton, but those joints are also `Node` objects in our scene graph. To bridge the two, we must apply the pose data back to the nodes:
+
+[source,cpp]
+----
+void apply_pose_to_scene_graph(
+    std::vector<Node>& nodes,
+    const Pose& pose,
+    const std::vector<uint32_t>& joint_indices)
+{
+    // joint_indices[j] is the node driven by joint j (for example, a skin's joint list)
+    size_t j = 0;
+    for (uint32_t node_index : joint_indices) {
+        Node& target = nodes[node_index];
+        target.translation    = pose.translations[j];
+        target.local_rotation = pose.rotations[j];
+        target.scale          = pose.scales[j];
+        target.mark_dirty();   // Mark as dirty so world matrices are recomputed!
+        ++j;
+    }
+}
+----
+
+This step is the "glue" between the character-specific animation system and the engine-wide scene graph. Once applied, the next scene graph update will propagate these animated transforms down the hierarchy, ensuring that anything attached to the character (like a sword or a helmet) follows the motion perfectly.
+
+== The Tradeoffs You Should Understand
+
+Before moving on, it is worth being clear about the limitations and tradeoffs of the approach we took.
+
+**Linear Blend Skinning is not perfect.** LBS is the industry standard because it is fast and generally "good enough," but it has a well-known artifact called the **"candy wrapper" or "collapsing elbows" problem**. When a joint rotates 180 degrees (like a wrist rotating palm-up to palm-down), LBS interpolates through zero volume—the mesh collapses at the joint before expanding again. The alternative, **Dual Quaternion Skinning (DQS)**, avoids this artifact by blending quaternion-based rigid transforms instead of matrices. Many production games use DQS for their primary characters and LBS for distant or less important objects. Implementing DQS is a straightforward extension of the math we built here—the compute shader structure stays identical; only the blending formula changes.
+
+**BLAS refits have quality costs.** While BLAS refits are much faster than full rebuilds, the BVH tree structure is optimized for the rest-pose geometry and becomes progressively less optimal as the mesh deforms further from that rest pose. In extreme cases (characters performing very large, fast motions), this can lead to reduced ray traversal performance. A common production technique is to occasionally issue a full BLAS rebuild (every N frames, or when the skeleton moves beyond a threshold) while using refits for the frames in between.
+
+**The GPU-to-CPU readback for physics is a last resort.** We covered this option for completeness, but the right default for character physics is bone-proxy colliders driven directly from the CPU-side joint matrices. The readback approach should be reserved for the specific cases we described—cloth, soft bodies, and mesh-accurate queries—where there is no practical alternative.
+
+== What Comes Next
+
+The system we have built handles the "how" of character deformation extremely well. But there is an entire layer of the character pipeline we haven't addressed yet: the physical properties of the character's body.
+
+Right now, we have a visual mesh that deforms correctly and a set of joint transforms that drive it. But we have no formal definition of what physical shape each bone represents. We have no joint limits that prevent a character's elbow from bending the wrong direction. We have no constraints that define how a ragdoll's joints interact with each other.
+
+In the next chapter, **Physics Integration: Colliders & Ragdolls**, we will build out this physical layer. We will auto-generate bone-proxy collision capsules, define joint constraints with proper limits (so the arm bends correctly even in a ragdoll), and implement the state machine that governs the transition between animation control and physics control. We will also address a topic we mentioned in the previous chapter—collision group filtering—in much greater depth, exploring how to build characters that collide correctly with the world while not fighting themselves.
+
+== Verification: What to Look For
+
+To verify your compute skinning implementation:
+1.  **Inverse Bind Matrices**: If your mesh appears "exploded" or turned inside-out, your inverse bind matrices are likely being applied in the wrong space or coordinate system.
+2.  **Pipeline Barrier**: If you see "jittering" or flickering geometry, check that your pipeline barrier (the `vkCmdPipelineBarrier2` call) is correctly placed between the compute dispatch and the draw call.
+3.  **Normal/Tangent Quality**: If your lighting looks wrong on an animated character (e.g., seams appear at joints), verify that you are correctly normalizing the animated normals and tangents in the Slang shader.
+
+xref:Advanced_glTF/Skeletal_Compute_Skinning/05_interpolation_blending.adoc[Previous: Interpolation & Blending] | xref:Advanced_glTF/Physics_Integration/01_introduction.adoc[Next Chapter: Physics Integration]
diff --git a/en/Advanced_glTF/Tooling_Production_Pipeline/01_introduction.adoc b/en/Advanced_glTF/Tooling_Production_Pipeline/01_introduction.adoc
new file mode 100644
index 000000000..6dee401d1
--- /dev/null
+++ b/en/Advanced_glTF/Tooling_Production_Pipeline/01_introduction.adoc
@@ -0,0 +1,14 @@
+:pp: {plus}{plus}
+= Tooling & The Production Pipeline
+
+== The Hidden Source of Engine Bugs
+
+There is a category of problem that every graphics programmer encounters eventually, and it is deeply frustrating because the debugging tools point in the wrong direction: an asset bug that looks like a code bug. The character's limb is pointing backward. The normal map is completely wrong. The animation drives the forearm when it should drive the upper arm. The ragdoll collapses immediately upon activation. After hours of stepping through the skinning shader, checking quaternion conventions, and verifying matrix multiplication order, you discover that the Blender export settings were wrong all along—the normals were flipped in the export, or the joint naming convention didn't match what your loader expected.
+
+This chapter is about preventing that class of problem systematically. We will establish a clear production pipeline: a sequence of steps that takes a Blender scene from the artist's workstation to a validated, engine-ready glTF asset, with checkpoints at each stage so that errors are caught as early as possible and in a context that makes them easy to diagnose.
+
+The pipeline has three stages. The first is **authoring**: the Blender-side choices that determine how the asset will export. Rig naming conventions, vertex group assignments, custom property definitions for physics extras, and export settings all fall here. Getting these right means the resulting glTF file will be correct by construction, without requiring special-case handling in the engine loader. The second stage is **validation**: running the exported file through the Khronos glTF-Validator to check for format violations, missing data, and spec non-compliance before the file ever reaches your loader. The third stage is **ground truth establishment**: using a professional-grade glTF viewer to confirm that the asset looks and behaves correctly before concluding that any remaining problems are in the engine code.
+
+This is not glamorous work. But it is the kind of work that separates a professional production pipeline from a perpetual debugging loop.
+
+xref:Advanced_glTF/Morph_Targets_Facial_Animation/04_conclusion.adoc[Previous: Morph Targets Conclusion] | xref:Advanced_glTF/Tooling_Production_Pipeline/02_blender_workflow.adoc[Next: Blender-to-Vulkan Workflow]
diff --git a/en/Advanced_glTF/Tooling_Production_Pipeline/02_blender_workflow.adoc b/en/Advanced_glTF/Tooling_Production_Pipeline/02_blender_workflow.adoc
new file mode 100644
index 000000000..e82003619
--- /dev/null
+++ b/en/Advanced_glTF/Tooling_Production_Pipeline/02_blender_workflow.adoc
@@ -0,0 +1,112 @@
+:pp: {plus}{plus}
+= Blender-to-Vulkan Workflow
+
+== Naming Conventions: The Unwritten Contract
+
+The most consistent source of confusion between artists and engine programmers is naming. An engine loader that looks for a bone named `"Spine1"` to attach an IK chain will silently fail if the rig uses `"spine_1"`, `"Spine_01"`, or `"spine.001"` (Blender's default naming when you duplicate a bone). This is not a bug in the loader or in the rig—it is a breakdown in the naming contract between the two.
+
+Establishing naming conventions before the first character is built, writing them down, and enforcing them consistently is one of the highest-leverage investments a small team can make. The conventions don't need to be elaborate—they just need to be agreed upon and followed.
+
+For skeleton joints, the standard that works well in practice is: all lowercase, words separated by underscores, side indicated by a `_l` or `_r` suffix (left and right respectively), and a consistent hierarchy naming pattern. A humanoid spine chain under this convention would be `hips`, `spine`, `spine_upper`, `chest`, `neck`, `head`. A leg chain would be `thigh_l`, `shin_l`, `foot_l`, `toes_l`. Note that popular rig sources use their own schemes—Blender's built-in Rigify system marks sides with a `.L`/`.R` suffix (for example `thigh.L`), and Mixamo uses names like `mixamorig:LeftUpLeg`—so content from those sources needs a rename pass or a name-mapping table to fit this convention.
+
+For your engine's benefit, the loader code should never hardcode joint names—it should discover them from the glTF file or from a small configuration file that maps semantic names to glTF names. This allows different assets to use different naming conventions without engine code changes. However, the extras-based physics and constraint metadata we defined in Chapter 2 uses `"parent_bone"` as a string reference to joint names, so those must match the exported joint names exactly.
+
+== Vertex Groups and Skinning Weights
+
+In Blender, skeletal skinning is defined through **vertex groups**: named collections of vertices where each vertex has a weight value (between 0 and 1) indicating how much that group's bone influences it. Blender exports vertex group weights as the `JOINTS` and `WEIGHTS` accessors in the glTF file.
+
+Several Blender-side issues can corrupt the skinning data on export:
+
+**Unnormalized weights.** If the sum of a vertex's bone weights is not 1.0, the skinning math produces incorrect results—the vertex will shrink or expand as it moves. Blender's Weight Paint mode has a normalize option; use it before exporting. The glTF specification does not require normalized weights, but your skinning shader almost certainly computes the blend matrix as `sum(weight_i * joint_matrix_i)`, which assumes normalization.
+
+**More than four influences per vertex.** glTF's standard skinning (`JOINTS_0`/`WEIGHTS_0`) supports exactly four bone influences per vertex. Blender rigs frequently produce more than four influences when automatic weight painting is used—particularly near joint intersections like the armpit or hip crease. Blender's Limit Total option in the Weight Paint tools reduces each vertex to at most N influences; set N to 4 before exporting and re-normalize afterwards.
+
+**Zero-weight vertex groups.** If a vertex group exists but a vertex has zero weight in it, Blender may still include that group as one of the four influences, wasting a slot that could be used for a meaningful influence. Clean this up with Blender's Clean Weights option.
+
+**Rest pose mismatch.** The glTF export captures the rig in its current pose at export time as the bind pose. If the rig has been posed (not in rest pose) when you export, all the inverse bind matrices will be wrong, and the character will appear deformed in the base T-pose. Always ensure the armature is in rest position (`Pose mode → Pose → Apply Pose as Rest Pose` if needed, then export) before exporting for the first time, and make it a workflow rule to export only from rest pose.
+
+== Custom Properties for Physics Extras
+
+In Chapter 2 we described how physics collider and constraint definitions are stored in glTF "extras" JSON and parsed by the engine at load time. On the Blender side, these extras originate from Blender's **Custom Properties** panel, available on bones in Pose mode (Properties → Bone → Custom Properties).
+
+The critical detail is the data format. Blender exports custom properties as raw JSON values in the extras field. A float property becomes a JSON number. A string property becomes a JSON string. Modern Blender glTF exporters handle dictionary properties directly, allowing you to create complex, nested JSON structures in the glTF `extras` field without manual string encoding.
+
+The cleanest approach for physics extras is a Python script run from Blender's Script Editor. The script iterates over all bones in the armature and sets custom properties programmatically based on a configuration dictionary:
+
+[source,python]
+----
+import bpy, json
+
+# Configuration: bone name -> physics settings
+physics_config = {
+    "shin_l": {
+        "physics": {
+            "collider": "capsule",
+            "radius": 0.05,
+            "half_height": 0.18,
+            "mass": 3.0,
+            "constraint": {
+                "type": "hinge",
+                "hinge_axis": [0, 0, 1],
+                "limit_min_deg": -140,
+                "limit_max_deg": 0,
+                "parent_bone": "thigh_l"
+            },
+            "collision_group": "leg",
+            "collision_mask": "world,props"
+        }
+    },
+    # ... more bones
+}
+
+armature = bpy.data.objects["Armature"]
+bpy.context.view_layer.objects.active = armature
+bpy.ops.object.mode_set(mode='POSE')
+
+for bone_name, config in physics_config.items():
+    if bone_name in armature.pose.bones:
+        bone = armature.pose.bones[bone_name]
+        for key, value in config.items():
+            # Modern Blender glTF exporters handle dicts directly,
+            # ensuring 'extras' contains a proper JSON object.
+            bone[key] = value
+
+bpy.ops.object.mode_set(mode='OBJECT')
+print("Physics properties applied.")
+----
+
+This script produces the nested JSON structure that the `parse_collider_extras` and `parse_constraint_def` functions from Chapter 2 expect. Modern Blender ensures that these dictionary properties are exported as proper JSON objects in the glTF `extras` field, matching the C{pp} parsing logic we implemented using `tinygltf`. Store this script alongside the blend file and run it after any rig changes. Version-controlling the script together with the blend file ensures that physics parameters are reproducible and tracked.
+
+== glTF Export Checklist
+
+When exporting from Blender, use the following settings in the `Export glTF 2.0` dialog to ensure your character data arrives correctly in the engine:
+
+*   **Format**: `glTF Binary (.glb)`
+*   **Include**:
+    *   [x] `Selected Objects` (Ensure only Mesh and Armature are selected)
+    *   [x] `Custom Properties` (**Critical**: This is where your physics metadata lives)
+*   **Transform**:
+    *   [x] `+Y Up` (Standard glTF convention)
+*   **Data -> Mesh**:
+    *   [x] `UVs`, `Normals`, `Tangents` (Required for PBR and skinning)
+    *   [x] `Vertex Weights` (Required for skeletal animation)
+    *   [x] `Shape Keys` (Required for morph targets)
+*   **Data -> Armature**:
+    *   [x] `Use Rest Position Armature`
+    *   [ ] `Export Deformation Bones Only` (Keep this **off** if you have non-deforming control bones that IK needs to reference)
+    *   [ ] `Add Leaf Bones` (**Off**: avoids extra bones at chain tips)
+*   **Animation**:
+    *   [x] `Animation` (Enable to export clips)
+    *   [x] `Shape Key Animation` (Required if your facial expressions are keyed)
+
+== Morph Target (Shape Key) Workflow
+
+Morph targets in glTF (called **Shape Keys** in Blender) allow for complex deformations like facial expressions that skeletal animation alone cannot handle.
+
+1.  **Creation**: Create Shape Keys in the `Mesh Data` properties panel. The first key is always the `Basis` (the rest pose). Additional keys define displacements from that basis.
+2.  **Naming**: Give your shape keys semantic names (e.g., `blink_l`, `smile`, `mouth_open`). These names will be exported in the glTF `extras.targetNames` field, allowing your engine to address them by name.
+3.  **Range**: Ensure your shape keys are designed to be additive. If a vertex is moved by both a `smile` morph and a `blink` morph, the engine will sum their displacements.
+4.  **Export**: As noted in the checklist, ensure the **Shape Keys** and **Shape Key Animation** options are enabled in the glTF exporter.
+5.  **Validation**: After exporting, run your `.glb` through the **glTF-Validator**. It will check that your morph target accessors are valid—including any **Sparse Accessors**, which are a critical optimization for complex facial rigs.
+
+xref:Advanced_glTF/Tooling_Production_Pipeline/01_introduction.adoc[Previous: Introduction] | xref:Advanced_glTF/Tooling_Production_Pipeline/03_validation.adoc[Next: Validation]
diff --git a/en/Advanced_glTF/Tooling_Production_Pipeline/03_validation.adoc b/en/Advanced_glTF/Tooling_Production_Pipeline/03_validation.adoc
new file mode 100644
index 000000000..c042bb66a
--- /dev/null
+++ b/en/Advanced_glTF/Tooling_Production_Pipeline/03_validation.adoc
@@ -0,0 +1,90 @@
+:pp: {plus}{plus}
+= Validation: The Khronos glTF-Validator
+
+== Why Validate Before Loading
+
+Your engine's glTF loader is not a general-purpose parser—it is written to handle correctly-formed assets and may silently misbehave or crash when it encounters malformed data. The Khronos glTF-Validator is a reference implementation that checks a glTF file against the full specification and reports every violation, warning, and hint it finds. Running your assets through this tool before loading them in the engine costs seconds and can save hours.
+
+The validator catches a wide range of issues that are easy to accidentally introduce during export:
+
+- Accessor byte offsets that are not properly aligned for the component type (a common issue with custom exporters and Python scripts that write binary data manually).
+- Skin joints that reference nodes not listed in the scene hierarchy.
+- Animation samplers with input arrays that are not strictly monotonically increasing (which violates the spec and can cause interpolation to produce nonsense).
+- Buffer views that overlap or extend beyond the buffer length.
+- Morph target accessors with the wrong vertex count (does not match the base primitive).
+- Missing required fields (a skin without a `joints` array, for example; note that `inverseBindMatrices` is optional and defaults to identity matrices).
+
+Many of these issues would not cause an immediate crash in a lenient loader like tinygltf—they would silently produce wrong data or be ignored. The validator finds them when the asset is still isolated and the source of the error is obvious.
+
+== Installing and Running the Validator
+
+The Khronos glTF-Validator is available as a command-line tool and as a web-based drag-and-drop interface. For integration into a production pipeline, the command-line tool is the right choice. It is distributed as a Dart application; install it via the Dart package manager:
+
+[source,bash]
+----
+# Install Dart SDK (Linux, using apt)
+sudo apt-get update && sudo apt-get install dart
+
+# Install the validator
+dart pub global activate gltf_validator
+
+# Add Dart's global pub cache to your PATH (if not already done)
+export PATH="$PATH:$HOME/.pub-cache/bin"
+----
+
+On macOS with Homebrew:
+
+[source,bash]
+----
+brew install dart
+dart pub global activate gltf_validator
+----
+
+Once installed, validate a file with:
+
+[source,bash]
+----
+gltf_validator character.glb
+----
+
+For CI integration, use the `--stdout` (`-o`) flag to print the machine-readable JSON report to standard output instead of writing it to a `.report.json` file next to the asset:
+
+[source,bash]
+----
+gltf_validator --stdout character.glb > validation_report.json
+----
+
+The validator exits with a non-zero status code if any errors are found, which makes it straightforward to fail a CI build on invalid assets.
+
+== Reading the Validator Output
+
+The validator categorizes its findings into four levels: **Errors**, **Warnings**, **Infos**, and **Hints**.
+
+An **Error** indicates a spec violation that will definitely produce incorrect behavior. If the validator reports any errors, the asset should not be used until they are fixed. Common errors include buffer overflows (an accessor claims to read data that extends beyond the buffer), invalid joint indices (a skin references a node index that doesn't exist), and malformed morph target counts (the number of morph target accessors doesn't match across primitives).
+
+A **Warning** indicates a situation that is technically valid according to the spec but is likely wrong or potentially problematic. Unnormalized skinning weights produce a warning—they are allowed by the spec but almost always indicate an artist workflow problem. Duplicate vertex positions within a primitive are also a warning—they suggest the mesh was not properly welded and may have topology issues.
+
+An **Info** message is informational: the asset has valid but unusual characteristics that might be intentional or might not. A very large number of morph targets (say, 100 or more) generates an info message—not because it's wrong, but because it's unusual enough to warrant attention.
+
+A **Hint** suggests a style or optimization issue that doesn't affect correctness. For example, using 32-bit floats for texture coordinates when 16-bit would be sufficient with no quality loss.
+
+For character assets specifically, pay close attention to warnings about skinning weights and to any errors relating to animation samplers. Animation errors are particularly insidious because they may only manifest at specific points in the animation timeline—a clip that looks correct at frame 0 might produce corrupt data at frame 120 if the sampler's input timestamps are incorrectly formed.
+
+== Integrating Validation into the Build Pipeline
+
+In a team environment, validation should run automatically as part of the asset build pipeline, not as a manual step. The exact integration depends on your build system, but the pattern is always the same: after any glTF export step, run the validator and fail the build if any errors are found.
+
+A simple Makefile rule:
+
+[source,makefile]
+----
+assets/%.glb: blender/%.blend
+	blender -b $< --python scripts/export_gltf.py -- $@
+	gltf_validator $@ || (echo "Validation failed for $@"; exit 1)
+----
+
+This ensures that any `.glb` asset in the `assets/` directory was produced by the Blender export script and passed validation before it was committed. If the validation fails, the build fails, and the error is visible immediately—not three days later when an engineer loads the character and gets a crash.
+
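+For script-driven pipelines, the validator can also be invoked programmatically. The `gltf-validator` npm package exposes a library API that returns the same structured report as the command-line tool; a Python-based pipeline can instead run the command-line tool as a subprocess and parse its JSON report. Either way, you can inspect the result to implement custom policies (for example, treating certain info-level messages as errors in your project).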
+
+xref:Advanced_glTF/Tooling_Production_Pipeline/02_blender_workflow.adoc[Previous: Blender Workflow] | xref:Advanced_glTF/Tooling_Production_Pipeline/04_gltf_viewer_audit.adoc[Next: glTF Viewer Audit]
diff --git a/en/Advanced_glTF/Tooling_Production_Pipeline/04_gltf_viewer_audit.adoc b/en/Advanced_glTF/Tooling_Production_Pipeline/04_gltf_viewer_audit.adoc
new file mode 100644
index 000000000..455b26a4a
--- /dev/null
+++ b/en/Advanced_glTF/Tooling_Production_Pipeline/04_gltf_viewer_audit.adoc
@@ -0,0 +1,48 @@
+:pp: {plus}{plus}
+= glTF Viewer Audit: Establishing Ground Truth
+
+== The Purpose of a Ground Truth
+
+After an asset passes validation, you know it is spec-compliant. You do not yet know whether it looks correct. The glTF-Validator tells you that the file is well-formed; it does not tell you whether the normals point the right way, whether the skeleton drives the mesh correctly, or whether the animation clips play as the animator intended. That is what a reference viewer is for.
+
+A **ground truth** is a known-correct rendering of the asset—one produced by a mature, thoroughly tested implementation of the glTF specification—that you can compare against your engine's output. When the two match, any subsequent discrepancy is almost certainly in your engine code. When they differ, you have evidence that the discrepancy originates in the asset or in your engine's interpretation of the spec, not in some mysterious graphics API behavior.
+
+The discipline of establishing ground truth before debugging engine code is high-leverage. Without it, every visual artifact is ambiguous: it might be a shader bug, a Vulkan API misuse, a wrong matrix, or an asset problem. With a ground truth reference, you can immediately classify the artifact. If the reference viewer shows the same problem, it's an asset problem—fix the blend file. If the reference viewer looks correct but your engine doesn't, it's an engine problem—debug the code. This classification alone can save hours.
+
+== Recommended Viewers
+
+Several mature, high-quality glTF viewers are freely available. For our purposes, we recommend two:
+
+**Khronos glTF Sample Viewer** (https://github.khronos.org/glTF-Sample-Viewer-Release). This is the reference implementation from the Khronos Group itself—the same organization that maintains the glTF specification. It is built on WebGL and runs in any modern browser, which makes it instantly accessible without installation. It implements the full glTF PBR material model, all animation interpolation modes (linear, step, cubic spline), skinning, and morph targets. Because it is the reference implementation, it is the highest-authority viewer for questions about spec compliance. If your asset looks wrong here, the problem is definitively in the asset.
+
+**Babylon.js Sandbox** (https://sandbox.babylonjs.com). This is a mature commercial-grade implementation with good support for all glTF features and a well-maintained UI that makes it easy to isolate and inspect individual animations, bones, and morph targets. Its node inspector allows you to examine the scene graph hierarchy, inspect bone transforms at each animation frame, and verify that morph target weights are being driven correctly. For debugging skeletal animation issues specifically, this is often more useful than the Khronos viewer.
+
+Both viewers accept `.glb` files via drag-and-drop, which makes the audit workflow fast: export from Blender, drag onto the viewer, and within seconds you have a reference rendering.
+
+== What to Look For in the Viewer
+
+A systematic audit should check the following, in order:
+
+**Rest pose.** Open the file and immediately pause any animation. The character should be in the T-pose or A-pose that was established as the bind pose in Blender. If the character is deformed in rest pose—limbs bent, mesh pulled toward wrong bones—the skinning weights or inverse bind matrices are incorrect at the source.
+
+**Materials.** Confirm that all materials render with correct colors, roughness, metalness, and normal maps. Pay particular attention to whether normal maps produce convex or concave results in the right places—a flipped normal map green channel is a common export error that appears as inverted shading on fine surface detail.
+
+**Animation playback.** Play each animation clip in the viewer and confirm that it matches the artist's intent. Watch for bones that drive the wrong mesh region, animations that play at the wrong speed (timestamp scaling issues), and clips that snap or jump rather than interpolating smoothly. If any clip uses cubic spline interpolation, confirm that the tangents produce a smooth curve—a common error is exporting tangents with wrong scaling that produces oscillation artifacts in the middle of a clip.
+
+**Morph targets.** In the viewer's material or animation inspector, manually drive each morph target weight from 0 to 1 and confirm that the face deforms as expected. If a morph target appears to have no effect, the displacement accessor may have been exported as all zeros, or the target may have been assigned to the wrong mesh primitive. If a morph target produces extreme deformation, the displacement vectors may be in the wrong coordinate space (a scale or axis convention issue).
+
+**Skeleton hierarchy.** In viewers that support bone inspection (like Babylon.js Sandbox), expand the scene graph and verify that the parent-child relationships match the Blender rig hierarchy. Confirm that leaf bones (if any were accidentally exported) are not being treated as significant joints.
+
+== From Viewer to Engine: What to Do When They Differ
+
+When your engine rendering differs from the viewer reference, the debugging process becomes structured:
+
+If the **materials** differ: check your PBR shader against the glTF spec's BRDF equations. The most common differences are the roughness-to-perceptual-roughness conversion (glTF uses perceptual roughness, some PBR implementations use linear roughness) and the metalness workflow (glTF always uses metalness/roughness, never specular/glossiness).
+
+If the **skeleton** is wrong: compare the joint world matrices your engine computes at rest pose against what the viewer shows. Export your engine's computed matrices as a debug overlay and compare them joint by joint. The most common causes are wrong matrix multiplication order (`parent * local` vs `local * parent`) and wrong application of the inverse bind matrix (`joint_matrix = global_joint_transform * inverse_bind_matrix`, where global_joint_transform is computed by traversing the scene graph from root to joint).
+
+If **morph targets** are wrong: verify the expand_sparse_accessor implementation by comparing the expanded data against the raw glTF JSON values. A single off-by-one error in the sparse index scatter step will shift every displacement by one vertex, producing a characteristic "swimming" artifact where the face deforms in the wrong region.
+
+If **animations** are wrong: compare the sampler input timestamps from the glTF file against the timestamps your engine is using for interpolation. glTF stores timestamps in seconds; if your engine interprets them as milliseconds (or feeds millisecond clock values into the sampler), every animation will play at 1000x speed or 1/1000 speed. Also verify that your cubic spline interpolation formula matches the glTF spec exactly—the spec uses the Hermite basis with the stored tangent vectors multiplied by the keyframe time interval during evaluation, which is different from some textbook presentations.
+
+xref:Advanced_glTF/Tooling_Production_Pipeline/03_validation.adoc[Previous: Validation] | xref:Advanced_glTF/Tooling_Production_Pipeline/05_conclusion.adoc[Next: Conclusion]
diff --git a/en/Advanced_glTF/Tooling_Production_Pipeline/05_conclusion.adoc b/en/Advanced_glTF/Tooling_Production_Pipeline/05_conclusion.adoc
new file mode 100644
index 000000000..8a9856747
--- /dev/null
+++ b/en/Advanced_glTF/Tooling_Production_Pipeline/05_conclusion.adoc
@@ -0,0 +1,32 @@
+:pp: {plus}{plus}
+= Tooling: Summary & What's Next
+
+== What We Established
+
+This chapter was deliberately not about code. Instead, it was about process—the discipline that makes a character pipeline reliable and maintainable rather than a source of perpetual mystery.
+
+We established naming conventions as the unwritten contract between artist and engineer. Consistent bone naming—lowercase, underscore-separated, left/right suffixed—makes every system in the engine that needs to look up joints by name reliable by default. We discussed the specific Blender-side problems that corrupt skinning data on export: unnormalized weights, more than four influences per vertex, zero-weight ghost influences, and rest pose mismatches at export time. Each of these produces a specific class of visual artifact, and knowing which artifact corresponds to which cause makes diagnosis fast.
+
+We covered the physics extras workflow in depth: how custom properties written by a Python script in Blender's Script Editor become JSON "extras" in the glTF file, and how that data flows into the collider and constraint parsing code from Chapter 2. Storing the configuration script alongside the blend file and in version control ensures that physics parameters are reproducible and auditable.
+
+The glTF export settings discussion covered the choices that matter most: binary vs. separate format (use binary for production), the leaf bones option (disable it), Apply Modifiers (on for final export), and Force Sampling for animations (off by default, with caveats for procedural drivers).
+
+The Khronos glTF-Validator provides a fast, authoritative check for spec compliance. Integrating it into the build pipeline ensures that invalid assets are caught at source rather than at runtime. The four output levels—Error, Warning, Info, Hint—provide a graduated view of asset quality; errors must be fixed, warnings should be investigated.
+
+Finally, reference viewers (the Khronos glTF Sample Viewer and Babylon.js Sandbox) provide the ground truth that makes engine debugging tractable. The discipline of checking assets in a reference viewer before concluding that a visual problem is in the engine code is one of the most useful habits a graphics programmer can develop.
+
+== What Comes Next
+
+Chapter 8 closes the tutorial series with **Debugging and Visual Auditing**: the tools and techniques for identifying problems in the running engine itself. We have established that asset problems should be caught at source; Chapter 8 addresses what happens when the asset is correct and the problem is in the engine.
+
+We will implement engine-side debug drawers for the skeleton, collision shapes, and physics constraints—visual overlays that let you see the physics representation alongside the rendered character. We will cover skinning weight heatmaps, which render vertex bone influence as color to identify "pinched" geometry or incorrect weight painting that only becomes visible in motion. And we will walk through using RenderDoc to inspect the compute skinning output buffer, verifying that the post-skinning vertex positions and normals are correct before the rasterizer reads them.
+
+== Verification: What to Look For
+
+To verify your production pipeline:
+
+1.  **Validator Green Light**: Every exported `.glb` should pass the **glTF-Validator** with zero errors and zero warnings.
+2.  **Naming Consistency**: Check that your physics metadata `"parent_bone"` strings match the actual node names in the glTF JSON.
+3.  **Export Settings**: Verify that **Custom Properties**, **Shape Keys**, and **Vertex Weights** are all included in your exported file by inspecting it in a third-party viewer like the Babylon.js sandbox.
+
+xref:Advanced_glTF/Tooling_Production_Pipeline/04_gltf_viewer_audit.adoc[Previous: glTF Viewer Audit] | xref:Advanced_glTF/Debugging_Visual_Auditing/01_introduction.adoc[Next: Debugging & Visual Auditing]
diff --git a/en/Advanced_glTF/appendix_types.adoc b/en/Advanced_glTF/appendix_types.adoc
new file mode 100644
index 000000000..2abe25248
--- /dev/null
+++ b/en/Advanced_glTF/appendix_types.adoc
@@ -0,0 +1,233 @@
+= Appendix: Common Types Reference
+
+This appendix provides authoritative, consolidated definitions for the shared types and interfaces used throughout the Advanced glTF series.
+
+== Core Scene Graph Types
+
+[source,cpp]
+----
+const uint32_t INVALID_NODE_INDEX = 0xFFFFFFFF;  // Sentinel meaning "no node"; Node::parent_index holds it for nodes without a parent
+
+enum TransformStatus : uint8_t {  // Bit flags OR-ed together in Node::status
+    Clean      = 0,       // No flag set; the update functions store this after recomputing a world matrix
+    LocalDirty = 1 << 0,  // This node's SRT components changed
+    WorldDirty = 1 << 1   // This node's world matrix needs recalculation
+};
+
+struct ColliderDef {  // Collider description kept per node (see Node::collider_def)
+    enum class Shape { CAPSULE, BOX, NONE };
+    Shape shape     = Shape::NONE;  // Default is NONE - presumably "node has no collider"; confirm against the extras parser
+    float radius    = 0.0f;  // Presumably the capsule radius; units are not established here
+    float half_height = 0.0f;  // Presumably the capsule half height - TODO confirm whether the caps are included
+    glm::vec3 box_half_extents = {0,0,0};  // Presumably only meaningful when shape == Shape::BOX
+    float mass      = 1.0f;  // Defaults to 1; units are not established here
+    std::string collision_group;  // Empty by default; how the string is interpreted is up to the consumer
+    std::string collision_mask;   // Empty by default; how the string is interpreted is up to the consumer
+};
+
+struct ConstraintDef {  // Joint constraint description kept per node (see Node::constraint_def)
+    enum class Type { NONE, BALL_SOCKET, HINGE };
+    Type        type              = Type::NONE;  // Default is NONE - presumably "no constraint"; confirm against the parser
+    float       swing_limit_deg   = 180.0f;   // Presumably degrees (per the _deg suffix), for BALL_SOCKET
+    float       twist_limit_deg   = 180.0f;   // Presumably degrees (per the _deg suffix), for BALL_SOCKET
+    float       hinge_min_deg     = -180.0f;  // Presumably degrees, lower HINGE limit
+    float       hinge_max_deg     =  180.0f;  // Presumably degrees, upper HINGE limit
+    glm::vec3   hinge_axis        = {0,0,1};  // Defaults to +Z; the space it is expressed in is not shown here - TODO confirm
+    std::string parent_bone;  // Name of the bone to constrain against; expected to match a glTF node name
+};
+
+struct Node {
+    uint32_t node_index   = INVALID_NODE_INDEX;  // Initialized like parent_index so a default Node never carries an indeterminate index
+    uint32_t parent_index = INVALID_NODE_INDEX;  // INVALID_NODE_INDEX marks a node without a parent
+    std::vector<uint32_t> child_indices;         // Indices of the direct children in the owning node array
+    std::string name;
+
+    // Local transform data (SRT: Scale, Rotation, Translation); call mark_dirty() after editing
+    glm::vec3 translation    = {0,0,0};
+    glm::quat local_rotation = glm::identity<glm::quat>();
+    glm::vec3 scale          = {1,1,1};
+
+    // Cached world matrix; refreshed by the scene graph update functions, not by the setters
+    glm::mat4 world_matrix = glm::mat4(1.0f);
+
+    uint8_t status = TransformStatus::Clean;  // Bitwise OR of TransformStatus flags
+    bool is_joint = false;
+
+    // Physics metadata (extracted from glTF extras)
+    ColliderDef collider_def;
+    ConstraintDef constraint_def;
+
+    // Call this whenever you change translation, rotation, or scale; sets both dirty flags
+    void mark_dirty() {
+        status |= TransformStatus::LocalDirty | TransformStatus::WorldDirty;
+    }
+
+    // Computes the local transform matrix from SRT components (TRS order: scale first, translation last)
+    glm::mat4 get_local_matrix() const {
+        return glm::translate(glm::mat4(1.0f), translation) *
+               glm::mat4_cast(local_rotation) *
+               glm::scale(glm::mat4(1.0f), scale);
+    }
+
+    // Extracts rotation from the cached world matrix by normalizing its basis columns (requires non-zero, non-mirrored scale)
+    glm::quat get_world_rotation() const {
+        glm::mat3 rot_scale = glm::mat3(world_matrix);  // upper-left 3x3 = rotation * scale
+        glm::mat3 rotation;
+        rotation[0] = glm::normalize(rot_scale[0]);  // a zero-length column would produce NaNs here
+        rotation[1] = glm::normalize(rot_scale[1]);
+        rotation[2] = glm::normalize(rot_scale[2]);
+        return glm::quat_cast(rotation);
+    }
+};
+
+class SceneGraph {
+public:
+    std::vector<Node> nodes;
+
+    // Single forward pass over `nodes`; only correct when the array is topologically sorted (parents before children)
+    void update_transforms() {
+        for (Node& current : nodes) {
+            if (!(current.status & TransformStatus::WorldDirty)) {
+                continue;  // cached world matrix is still valid
+            }
+            const bool has_parent = current.parent_index != INVALID_NODE_INDEX;
+            current.world_matrix = has_parent
+                ? nodes[current.parent_index].world_matrix * current.get_local_matrix()
+                : current.get_local_matrix();
+            // Flag the children so they are recomputed when the loop reaches them
+            for (const uint32_t child : current.child_indices) {
+                nodes[child].status |= TransformStatus::WorldDirty;
+            }
+            current.status = TransformStatus::Clean;
+        }
+    }
+
+    // Depth-first update of `index` and all its descendants; independent of node storage order
+    void update_world_matrices_subtree(uint32_t index) {
+        Node& current = nodes[index];
+        const bool is_root = current.parent_index == INVALID_NODE_INDEX;
+        current.world_matrix = is_root
+            ? current.get_local_matrix()
+            : nodes[current.parent_index].world_matrix * current.get_local_matrix();
+        // Every descendant is recomputed unconditionally, whatever its previous status
+        for (const uint32_t child : current.child_indices) {
+            nodes[child].status |= TransformStatus::WorldDirty;
+            update_world_matrices_subtree(child);
+        }
+        current.status = TransformStatus::Clean;
+    }
+};
+
+// Free-function variant of the recursive subtree update, operating on a bare node array (matches IK chapter usage)
+inline void update_world_matrices_subtree(std::vector<Node>& nodes, uint32_t index) {
+    Node& current = nodes[index];
+    const bool is_root = current.parent_index == INVALID_NODE_INDEX;
+    current.world_matrix = is_root
+        ? current.get_local_matrix()
+        : nodes[current.parent_index].world_matrix * current.get_local_matrix();
+    // Every descendant is recomputed unconditionally, whatever its previous status
+    for (const uint32_t child : current.child_indices) {
+        nodes[child].status |= TransformStatus::WorldDirty;
+        update_world_matrices_subtree(nodes, child);
+    }
+    current.status = TransformStatus::Clean;
+}
+----
+
+== Animation & Skinning Types
+
+[source,cpp]
+----
+enum InterpolationMode { STEP, LINEAR, CUBICSPLINE };  // How an AnimationSampler blends between keyframes
+
+struct AnimationSampler {  // Keyframe data for one animated property
+    InterpolationMode interpolation;        // No default initializer: must be set by whoever fills the sampler
+    std::vector<float> inputs;              // Timestamps; find_keyframe() binary-searches them, so they must be sorted ascending
+    std::vector<glm::vec4> outputs_raw;     // Raw packed output data
+
+    // For CUBICSPLINE, we split the raw data for easier interpolation
+    std::vector<glm::vec4> in_tangents;     // Presumably one entry per keyframe - TODO confirm against the loader
+    std::vector<glm::vec4> values;
+    std::vector<glm::vec4> out_tangents;
+};
+
+struct AnimationChannel {  // Connects one sampler to one animated property of one node
+    enum PathType { TRANSLATION, ROTATION, SCALE, WEIGHTS };
+    PathType path;           // Which property of the node the channel drives
+    uint32_t node_index;     // Target node - presumably an index into the scene graph node array; confirm
+    uint32_t sampler_index;  // Presumably an index into the owning animation's sampler list; confirm
+};
+
+struct Pose {  // Local joint transforms; apply_pose_to_scene_graph() reads entry i for joint_indices[i]
+    std::vector<glm::vec3> translations;  // Copied into Node::translation
+    std::vector<glm::quat> rotations;     // Copied into Node::local_rotation
+    std::vector<glm::vec3> scales;        // Copied into Node::scale
+};
+
+// Binary search: index of the last keyframe strictly before `time` (0 if none); a time past the final timestamp yields the last index, so guard idx + 1
+uint32_t find_keyframe(const AnimationSampler& sampler, float time) {
+    if (sampler.inputs.size() < 2) return 0;
+    const auto first_not_below = std::lower_bound(sampler.inputs.begin(), sampler.inputs.end(), time);
+    if (first_not_below == sampler.inputs.begin()) return 0;
+    return static_cast<uint32_t>(std::distance(sampler.inputs.begin(), first_not_below)) - 1;
+}
+
+// Copies pose entry i into nodes[joint_indices[i]] and marks that node dirty; each pose array needs at least joint_indices.size() entries
+void apply_pose_to_scene_graph(std::vector<Node>& nodes, const Pose& pose, const std::vector<uint32_t>& joint_indices) {
+    for (size_t slot = 0, count = joint_indices.size(); slot < count; ++slot) {
+        Node& joint = nodes[joint_indices[slot]];
+        joint.scale          = pose.scales[slot];
+        joint.local_rotation = pose.rotations[slot];
+        joint.translation    = pose.translations[slot];
+        joint.mark_dirty();  // cached world matrix is now stale
+    }
+}
+----
+
+== Physics Types
+
+[source,cpp]
+----
+struct PhysicsPose {  // Position + orientation of a body (no scale component)
+    glm::vec3 position    = {0,0,0};                     // Defaulted: glm leaves default-constructed vectors uninitialized
+    glm::quat orientation = glm::identity<glm::quat>();  // Defaulted to "no rotation" for the same reason
+    // Builds the equivalent 4x4 matrix: rotation applied first, then translation (T * R)
+    glm::mat4 to_matrix() const {
+        return glm::translate(glm::mat4(1.0f), position) * glm::mat4_cast(orientation);
+    }
+};
+
+class PhysicsWorld {  // Abstract physics backend interface; every operation is pure virtual (JPH:: types presumably from Jolt Physics)
+public:
+    virtual ~PhysicsWorld() = default;  // Virtual so implementations can be destroyed through this interface
+
+    // Body Management
+    virtual JPH::BodyID create_body(const JPH::BodyCreationSettings& settings) = 0;
+    virtual void        destroy_body(JPH::BodyID body_id) = 0;
+    virtual void        set_motion_type(JPH::BodyID body_id, JPH::EMotionType type) = 0;
+    virtual void        activate_body(JPH::BodyID body_id) = 0;
+
+    // Syncing (poses cross the interface as PhysicsPose, velocities as glm vectors)
+    virtual void        move_kinematic(JPH::BodyID body_id, const PhysicsPose& pose) = 0;
+    virtual PhysicsPose get_body_pose(JPH::BodyID body_id) const = 0;
+    virtual void        set_linear_velocity(JPH::BodyID body_id, const glm::vec3& velocity) = 0;
+
+    // Constraints (NOTE(review): angle units and the space of `axis` are not established here - confirm against the implementation)
+    virtual void        create_ball_socket_constraint(JPH::BodyID p1, JPH::BodyID p2, float swing, float twist) = 0;
+    virtual void        create_hinge_constraint(JPH::BodyID p1, JPH::BodyID p2, const glm::vec3& axis, float min_angle, float max_angle) = 0;
+};
+----
+
+== IK & Procedural Types
+
+[source,cpp]
+----
+struct IKChain {  // Input for one IK solve; no member has a default initializer, so fill in every field before use
+    std::vector<uint32_t> joints;  // Ordered from root to end effector (presumably node indices - confirm)
+    uint32_t effector_node;        // The node whose position we are trying to place
+    float    threshold;            // Convergence threshold in world-space units
+    int      max_iterations;       // Safety cap
+    glm::vec3 target_world;        // Target position (world space, per the name)
+    glm::vec3 pole_vector;         // For algorithms like FABRIK or constrained CCD
+};
+----
diff --git a/en/Advanced_glTF/introduction.adoc b/en/Advanced_glTF/introduction.adoc
new file mode 100644
index 000000000..ab9177110
--- /dev/null
+++ b/en/Advanced_glTF/introduction.adoc
@@ -0,0 +1,85 @@
+= Advanced glTF: High-Performance Character Pipelines
+
+== Beyond Static Models
+
+The "Building a Simple Engine" series established a solid foundation for Vulkan rendering. It introduced the core mechanics of glTF loading via `tinygltf`, PBR material shaders, and a basic hierarchical scene graph within each `Model`. However, as our engine evolves from rendering static dioramas to complex, interactive worlds, we encounter the limitations of that initial architecture.
+
+In a production-ready engine, a character isn't just a mesh; it's a convergence of multiple high-performance systems. When a character's foot touches uneven terrain, the inverse kinematics (IK) system must adjust the leg's joint rotation. When that same character is struck by a projectile, the animation system must seamlessly hand control over to the physics engine for a ragdoll simulation. This interplay requires an architecture that is significantly more robust and optimized than a basic recursive traversal.
+
+In this tutorial series, we will evolve our engine to handle these advanced character pipelines. We aren't just adding "features" like animation or physics; we are redesigning our core systems to handle the scale, performance, and complexity of modern 3D assets.
+
+== The Technical Roadmap
+
+=== 1. Introduction
+This series is structured around the lifecycle of a modern character, from parsing the glTF file to simulating its physical interaction with the world. We begin with this introduction, establishing the prerequisites and setting up the development environment.
+
+=== 2. The Scene Graph & Transform Hierarchy
+We begin by unifying our scene representation. While our previous engine used a hybrid approach—a flat list of entities containing hierarchical models—we will move to a **unified, global scene graph**. We will also replace the redundant frame-by-frame matrix multiplications with a **Dirty Flag** system and a cache-friendly memory layout based on data-oriented design (DOD). This ensures that we only update world-space transforms when absolutely necessary, a critical optimization as scene complexity grows. We'll also explore automated collider setup by parsing **"extras" metadata** from our glTF files, ensuring our physics proxies are perfectly aligned with our visual meshes.
+
+=== 3. Advanced Skeletal & Compute Skinning
+Skeletal animation is the heartbeat of character movement. We will implement high-performance **Compute Skinning**, moving the heavy lifting of vertex deformation from the vertex shader to specialized compute kernels. This approach allows us to "skin once, use everywhere"—storing the results in a shared buffer that can be accessed by the rasterizer, ray-tracing acceleration structures, and even physics queries.
+
+=== 4. Physics Integration: Colliders & Ragdolls
+Physics and animation are often treated as separate entities, but in advanced pipelines, they are deeply intertwined. We'll explore **Bi-directional Syncing**, where animation can drive physics (Kinematic) and physics can take over and drive the skeleton (Ragdoll).
+
+=== 5. Procedural Animation & IK
+Characters feel alive when they react to their environment. We'll implement **Inverse Kinematics (IK)** using algorithms like CCD (Cyclic Coordinate Descent) or FABRIK (Forward And Backward Reaching Inverse Kinematics) to ensure feet align with terrain and heads track points of interest. This "procedural layer" sits atop our traditional animation clips, providing the final polish that makes a character feel grounded.
+
+=== 6. Morph Targets & Facial Animation
+For expressive characters, we need more than just bones. We'll implement support for **Morph Targets** (shape keys), allowing for complex facial expressions and localized mesh deformations. We'll leverage modern Vulkan features like **Descriptor Indexing** to handle these displacement buffers without the overhead of per-draw descriptor swaps.
+
+=== 7. Tooling & Production Pipeline
+Code is only half the battle. We'll discuss the "Blender-to-Vulkan" workflow, establishing naming conventions and export settings that ensure our assets "just work" when they hit the engine. We'll also integrate the **Khronos glTF-Validator** into our pipeline to catch asset corruption before it causes mysterious engine bugs.
+
+=== 8. Debugging & Visual Auditing
+Even with a perfect pipeline, things go wrong. We'll build specialized debugging tools to visualize the hidden state of our character systems. This includes deferred line drawers for skeletons and collision shapes, skinning weight heatmaps for identifying "pinched" geometry, and techniques for using RenderDoc to audit the compute skinning output.
+
+== The Advanced Toolkit
+
+Throughout this series, we will upgrade our development toolkit to reflect modern standards:
+
+*   **Slang Shaders**: All of our new shader work—from compute skinning to PBR enhancements—will be written in **Slang**. Slang's modularity and natural alignment features make it far more productive than raw GLSL for complex rendering pipelines.
+*   **Vulkan 1.4**: We will utilize features like **Dynamic Rendering** and **Descriptor Indexing** to simplify our pipeline management and improve performance.
+*   **Jolt Physics**: We have selected **Jolt Physics** as our simulation engine. Its performance-first design and clean C++ API make it the ideal choice for real-time character dynamics and ragdolls.
+
+== Setting Up Your Development Environment
+
+Before we dive into the code, you will need to prepare your development environment with several new tools and libraries.
+
+=== 1. The Slang Shader Compiler
+We will use **Slang** for all compute and graphics shaders in this series.
+
+* **Download**: Grab the latest binary release from the link:https://github.com/shader-slang/slang/releases[Slang GitHub Releases].
+* **Setup**: Ensure the `slangc` executable is in your system PATH.
+* **Vulkan Integration**: Slang will compile our `.slang` files directly to SPIR-V (`.spv`), which can then be loaded using your existing Vulkan shader loading code.
+
+=== 2. Jolt Physics
+Jolt is our recommended physics engine for this series.
+* **Integration**: We recommend using CMake's `FetchContent` to integrate Jolt directly into your project:
+[source,cmake]
+----
+include(FetchContent)  # CMake module that provides FetchContent_Declare / FetchContent_MakeAvailable
+FetchContent_Declare(
+    Jolt
+    GIT_REPOSITORY https://github.com/jrouwe/JoltPhysics.git
+    GIT_TAG master         # Prefer a release tag or commit hash here for reproducible builds
+    SOURCE_SUBDIR Build)   # Jolt's top-level CMakeLists.txt lives in Build/, not in the repository root
+FetchContent_MakeAvailable(Jolt)
+----
+* **Initialization**: Jolt requires a small amount of boilerplate code (memory management, job system, and layer interfaces) which we will cover in the Physics Integration chapter.
+
+=== 3. Khronos glTF-Validator
+To ensure our custom glTF assets are valid, we will use the official validator.
+* **Installation**: Follow the instructions on the link:https://github.com/KhronosGroup/glTF-Validator[glTF-Validator repository]. It is available as a standalone CLI tool or via npm.
+
+== Preparing the Starter Project
+
+This series assumes you have a working engine from the "Building a Simple Engine" series. To transition to the Advanced glTF architecture, you should prepare a "Starter Project" that:
+
+1.  **Supports Vulkan 1.4**: Set `VkApplicationInfo::apiVersion` to `VK_API_VERSION_1_4` when creating the instance, and verify that the selected physical device reports API version 1.4 or higher.
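+2.  **Enables Dynamic Rendering**: This series moves away from fixed Render Passes and Framebuffers in favor of dynamic rendering. The functionality of `VK_KHR_dynamic_rendering` has been core since Vulkan 1.3, so on a 1.4 device you only need to enable the `dynamicRendering` feature at device creation.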
+3.  **Integrates GLM**: We will use GLM extensively for math operations, specifically for quaternion support in skeletal animation.
+
+If you are starting fresh, we recommend refactoring your Simple Engine core to support a global scene graph before proceeding to the next chapter.
+
+xref:Building_a_Simple_Engine/introduction.adoc[Previous: Building a Simple Engine] | xref:Advanced_glTF/Scene_Graph_Hierarchy/01_introduction.adoc[Next: Scene Graph & Transform Hierarchy]