Devsh-Graphics-Programming · devshgraphicsprogramming · Sep 30, 2025 · Oct 2, 2025 · Oct 9, 2025 · Oct 9, 2025
diff --git a/.gitignore b/.gitignore
@@ -13,3 +13,4 @@ compiled.spv
 */.vscode/*
 */__main__.py
 /tmp/rtSamples.bin
+imgui.ini
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -98,6 +98,8 @@ if(NBL_BUILD_EXAMPLES)
 	add_subdirectory(71_RayTracingPipeline)
 	add_subdirectory(72_CooperativeBinarySearch)
 
+	add_subdirectory(MeshShader)
+
 	# add new examples *before* NBL_GET_ALL_TARGETS invocation, it gathers recursively all targets created so far in this subdirectory
 	NBL_GET_ALL_TARGETS(TARGETS)
 

diff --git a/MeshShader/CMakeLists.txt b/MeshShader/CMakeLists.txt
@@ -0,0 +1,45 @@
+include(common RESULT_VARIABLE RES)
+if(NOT RES)
+        message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
+endif()
+
+# The sample links the ImGui extension libraries; nothing is configured for it when NBL_BUILD_IMGUI is off.
+if(NBL_BUILD_IMGUI)
+	set(NBL_EXTRA_SOURCES
+		#"${CMAKE_CURRENT_SOURCE_DIR}/src/transform.cpp" #just leaving this so i can easily reference it later
+		"${CMAKE_CURRENT_SOURCE_DIR}/src/SampleApp.cpp"
+		"${CMAKE_CURRENT_SOURCE_DIR}/src/MeshRenderer.cpp"
+	)
+	set(NBL_INCLUDE_SEARCH_DIRECTORIES
+		"${CMAKE_CURRENT_SOURCE_DIR}/include"
+	)
+	list(APPEND NBL_LIBRARIES
+		imtestengine
+		imguizmo
+		"${NBL_EXT_IMGUI_UI_LIB}"
+	)
+
+	# Create the executable BEFORE the builtin-resource block below: that block expands ${EXECUTABLE_NAME}, which is
+	# not set anywhere in this file (presumably nbl_create_executable_project defines it), and links to that target.
+	# TODO; Arek I removed `NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET` from the last parameter here, doesn't this macro have 4 arguments anyway !?
+	nbl_create_executable_project("${NBL_EXTRA_SOURCES}" "" "${NBL_INCLUDE_SEARCH_DIRECTORIES}" "${NBL_LIBRARIES}")
+
+	# Register every file found under app_resources/ as a builtin resource and link the resulting target to the executable.
+	if(NBL_EMBED_BUILTIN_RESOURCES)
+		set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
+		set(RESOURCE_DIR "app_resources")
+
+		get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
+		get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
+		get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)
+
+		file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
+		foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
+			LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
+		endforeach()
+		ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")
+		LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
+	endif()
+	# TODO: Arek temporarily disabled cause I haven't figured out how to make this target yet
+	# LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} nblExamplesGeometrySpirvBRD)
+endif()
diff --git a/MeshShader/README.md b/MeshShader/README.md
@@ -0,0 +1,49 @@
+## 9/30/2025 - GDBobby
+Here's the current plan, front to back
+
+1. Remove all unnecessary parts from my copy of example 61.
+
+    1.1 figure out what IS necessary.
+
+    1.2 trace the graphics pipeline used, so I can figure out how the mesh pipeline should look
+
+2. I don't have much experience with viewports and scissors yet, so I'd like to change
+    how the ImGui viewport is handled just for the fun of it. Example 61 mentions it's rendered to a
+    temporary color attachment which is then sourced as a texture in ImGui. I'd like to change it so
+    that ImGui literally just puts a box around a viewport that's rendered to directly.
+
+3. Create the Mesh Pipeline.
+
+    3.1. I want to support generative (procedural) mesh shaders, which take 0 input vertices
+
+    3.2. I want to support meshlets - small meshes that are defined by pre-existing vertices
+
+    3.3. I want to re-compile the mesh shader into a compute and vertex shader combo, 
+        which can be used on machines that don't support the mesh shader extension 
+        (mostly GPUs older than 2016)
+
+
+I think, to prevent controlling two different branches in two different repos, I'll stuff everything into this example in the beginning. 
+Once everything starts to come together, I'll start moving things, like the Mesh Pipeline class, into more appropriate places, like Nabla itself.
+
+
+## 10/1
+I'll create a mesh shader tomorrow. I don't really know what to do yet but I'll start with procedural gen.
+
+I think I'll also make a different pipeline object that supports the geometry from example 61?
+
+I had my fun with viewports. idk what i expected tbh
+
+I need to search a little deeper in the spec for other mesh pipeline related rules. I need to research subpasses as well.
+
+
+## 10/3
+Beginning shader experimentation. Setting up easy reload of shaders so I don't have to relaunch every test iteration.
+
+## 10/6
+I need to add CPU-side verification that the mesh shader vertex and primitive counts are below the Vulkan limits, the same way the work group size is verified.
+
+Mesh and Task shaders having branches where the output is not defined is incorrect. The glslc compiler won't warn the user, I'll have to check for DXC. Nvidia will assume it's a 0 group output, but AMD will get DEVICE_LOST. If it's possible, having a warning or compile check for that would be nice. Most likely outside the scope of Nabla, but possibly not. I'll have to ask.
+
+## 10/9
+On the bug hunting phase. Should be finished shortly, then I'll hit the cleanup phase.
diff --git a/MeshShader/app_resources/FirstBuild.mesh.hlsl b/MeshShader/app_resources/FirstBuild.mesh.hlsl
@@ -0,0 +1,23 @@
+//https://microsoft.github.io/DirectX-Specs/d3d/MeshShader.html#primitive-attributes
+
+struct SInterpolants{ // per-vertex output record of the mesh shader below
+    float4 ndc : SV_Position;
+};
+struct Primo { // NOTE(review): not referenced anywhere in this file
+    uint vertexID : SV_PrimitiveID;
+};
+
+// Test mesh shader: each thread emits one vertex and one point. NOTE(review): ID is the dispatch-wide index, so it only stays inside the WORKGROUP_SIZE-sized arrays when a single workgroup is dispatched - confirm.
+[numthreads(WORKGROUP_SIZE,1,1)]
+[outputtopology("point")]
+[shader("mesh")]
+void main(
+    in uint3 ID : SV_DispatchThreadID,
+    out vertices SInterpolants verts[WORKGROUP_SIZE],
+    out indices uint prims[WORKGROUP_SIZE]
+)
+{
+    SetMeshOutputCounts(WORKGROUP_SIZE, WORKGROUP_SIZE); // has to precede the first write to verts/prims
+    verts[ID.x].ndc = float32_t4(ID.x, 0.0, 0.0, 1.0);
+    prims[ID.x] = ID.x;
+}
diff --git a/MeshShader/app_resources/geom.frag.hlsl b/MeshShader/app_resources/geom.frag.hlsl
@@ -0,0 +1,13 @@
+
+struct VertexOut { // fragment-stage input; same fields and semantics as VertexOut in geom.mesh.hlsl
+    float32_t4 ndc : SV_Position;
+    float32_t3 meta : COLOR1; // main() normalizes this vector and maps it to a colour
+};
+
+
+// Visualises the interpolated COLOR1 vector: normalised, then remapped per channel from [-1,1] to [0,1]; alpha is 1.
+[shader("pixel")]
+float32_t4 main(VertexOut input) : SV_Target0
+{
+    return float32_t4(0.5f * normalize(input.meta) + float32_t3(0.5f, 0.5f, 0.5f), 1.f);
+}
diff --git a/MeshShader/app_resources/geom.mesh.hlsl b/MeshShader/app_resources/geom.mesh.hlsl
@@ -0,0 +1,51 @@
+//https://microsoft.github.io/DirectX-Specs/d3d/MeshShader.html#primitive-attributes
+
+#include "task_mesh_common.hlsl"
+
+//            (binding, set)
+[[vk::binding(0, 0)]] StructuredBuffer<float3> MeshVertexBuffer; // vertex positions, read by main() with the dispatch thread index
+
+struct VertexOut { // per-vertex output record of the mesh shader
+    float32_t4 ndc : SV_Position; // position after the pc.mvp transform
+    float32_t3 meta : COLOR1; // main() stores the untransformed vertex position here
+};
+
+// Mesh stage entry point: thread i emits vertex i, and the first vertCount/3 threads each also emit one triangle.
+[numthreads(WORKGROUP_SIZE,1,1)]
+[outputtopology("triangle")]
+[shader("mesh")]
+void main(
+    in uint3 id : SV_DispatchThreadID,
+    in uint3 groupThreadID : SV_GroupThreadID,
+    out vertices VertexOut verts[WORKGROUP_SIZE],
+    out indices uint3 prims[WORKGROUP_SIZE]
+)
+{
+    // SetMeshOutputCounts has to run before the first write to verts/prims (D3D mesh shader spec),
+    // so it is issued up front, outside any branch, instead of at the end of the shader.
+    // NOTE(review): the counts are not clamped to WORKGROUP_SIZE (the size of the output arrays), so the CPU
+    // side has to guarantee pc.vertCount <= WORKGROUP_SIZE - confirm. groupThreadID is currently unused.
+    SetMeshOutputCounts(pc.vertCount, pc.vertCount / 3);
+
+    // I haven't benchmarked this personally, but my understanding is that AMD devices prefer mesh shaders to be "by primitive"
+    // and that NVIDIA devices prefer mesh shaders to be "by vertex".
+    // Ideally, I'd benchmark both and set up branches so that each device can specialize the shader based on what it likes
+    // (there's a property in VkMeshProperties that would indicate this).
+    if (id.x < pc.vertCount) {
+        const float32_t3 position = MeshVertexBuffer[id.x];
+
+        // the matrix is the left operand of mul(), i.e. the position is treated as a column vector
+        verts[id.x].ndc = mul(pc.mvp, float32_t4(position, 1.0));
+        // the untransformed position is forwarded through COLOR1
+        verts[id.x].meta = position;
+    }
+
+    // I'm just assuming it's a triangle list right now; it won't work if it's not.
+    if (id.x < pc.vertCount / 3) {
+        prims[id.x] = uint3(
+                        id.x * 3,
+                        id.x * 3 + 1,
+                        id.x * 3 + 2
+                    );
+    }
+}
diff --git a/MeshShader/app_resources/geom.task.hlsl b/MeshShader/app_resources/geom.task.hlsl
@@ -0,0 +1,22 @@
+
+#include "task_mesh_common.hlsl"
+// NOTE(review): TaskToMeshPayload (only present as a commented-out struct), OBJECT_COUNT and pc.objectCount are not provided by task_mesh_common.hlsl as it stands - confirm where they are meant to come from, otherwise this file cannot compile.
+groupshared TaskToMeshPayload taskToMeshPayload;
+// Task stage: one thread fills the payload with one object-type entry per object, then launches one mesh workgroup per entry.
+[numthreads(1,1,1)]
+void main(
+	in uint3 id : SV_DispatchThreadID,
+	in uint3 groupThreadId : SV_GroupThreadID
+	//out payload TaskToMeshPayload taskToMeshPayload, interestingly, thats not how it's done here
+){
+	uint objectCount = 0; // running total of payload entries written so far
+	for(uint i = 0; i < OBJECT_COUNT; i++){ // i is the object type index
+		for(uint j = 0; j < pc.objectCount[i]; j++){ // one payload entry per object of type i
+			taskToMeshPayload.objectType[objectCount] = i;
+			objectCount++;
+		}
+	}
+
+    printf("dispatching meshes - %u", objectCount);
+	DispatchMesh(objectCount, 1, 1, taskToMeshPayload); // objectCount x 1 x 1 mesh workgroups, all receiving the same payload
+}
diff --git a/MeshShader/app_resources/task_mesh_common.hlsl b/MeshShader/app_resources/task_mesh_common.hlsl
@@ -0,0 +1,30 @@
+
+//this is user defined data sent from the task shader to the mesh shader
+//1 packet is sent, but it can use arrays so that each workgroup can receive customized data
+//struct TaskToMeshPayload {
+//    uint objectType[INSTANCE_COUNT * OBJECT_COUNT];
+//};
+
+//1 is cone, 2 is for fan, anything else for triangle list without the special normal calc.
+//cone can be handled in the task shader or the mesh shader, I'm going to handle it in the task shader
+//#define OTHER_OBJECTS 0
+#define CONE_OBJECT_TYPE 1
+#define T_FAN_OBJECT_TYPE 2
+struct MeshData{ // NOTE(review): the C++ MeshletObjectData in MeshRenderer.hpp has no primCount member, so the two layouts differ - confirm which one is authoritative
+    uint vertCount;
+    uint primCount; //we're assuming vertCount is always equal to primCount (no index buffer)
+    uint objType; // presumably one of the *_OBJECT_TYPE values above - confirm
+	uint positionView;
+    uint normalView;
+    uint indexView;
+};
+
+
+// Fully parenthesized so the macro keeps its value inside larger expressions (e.g. `2 * PushDescCount`).
+#define PushDescCount ((0x1<<16)-1)
+struct SPushConstants {
+	float4x4 mvp; // matrix the mesh shader multiplies each vertex position by
+    uint vertCount; // number of vertices the mesh shader emits (it derives the triangle count as vertCount / 3)
+};
+//im not keen on trying to figure out how the push constant abstraction worked before without documentation
+[[vk::push_constant]] SPushConstants pc;
diff --git a/MeshShader/include/MeshRenderer.hpp b/MeshShader/include/MeshRenderer.hpp
@@ -0,0 +1,117 @@
+#pragma once
+
+#include "nbl/builtin/hlsl/math/linalg/fast_affine.hlsl"
+#include "nbl/examples/geometry/SPushConstants.hlsl"
+
+namespace nbl::examples
+{
+
+	enum class MeshletObjectTypes { // object kinds known to this example
+		Cube,
+		Rectangle,
+		Disk,
+		Sphere,
+		Cylinder,
+		Cone,
+		Icosphere,
+		// keep last: its value is used as the number of object types (MeshDataBuffer::MaxObjectCount)
+		COUNT
+	};
+		//this is buffer data
+	struct MeshletObjectData { // NOTE(review): HLSL MeshData in task_mesh_common.hlsl additionally declares primCount right after vertCount, so the layouts differ - confirm
+		uint32_t vertCount;
+		uint32_t objectType;
+		uint32_t positionView;
+		uint32_t normalView;
+		uint32_t indexView;
+	};
+	struct MeshDataBuffer { // geometry bookkeeping held by MeshDebugRenderer as m_geoms
+		//if gpuGeometry is nullptr or std::nullopt or whatever, then mesh object type is invalid, the CPU memory failed to transfer to GPU for whatever reason
+		core::smart_refctd_ptr<const video::IGPUPolygonGeometry> gpuGeometry{};
+
+		static constexpr std::size_t MaxObjectCount = static_cast<std::size_t>(MeshletObjectTypes::COUNT); // one slot per MeshletObjectTypes enumerator
+		static constexpr std::size_t MaxInstanceCount = 8; //for each object
+
+		MeshletObjectData meshData[MaxObjectCount]; // not value-initialized here
+		hlsl::float32_t4x4 transforms[MaxInstanceCount]; // NOTE(review): sized by MaxInstanceCount alone although the limit is documented as "for each object" - confirm
+
+		//remove index type to avoid branch in shader
+		//asset::E_INDEX_TYPE indexType = asset::EIT_UNKNOWN;
+	};
+
+
+// Debug renderer that draws geometry through a mesh pipeline (see SInitParams::pipeline).
+// Reference counted; the constructor is protected, instances are meant to come from create().
+// The destructor idles the device (when a pipeline layout is held) before clearing the geometries.
+class MeshDebugRenderer final : public core::IReferenceCounted {
+public:
+	//
+	constexpr static inline uint16_t VertexAttrubUTBDescBinding = 0;
+
+	constexpr static inline auto MissingView = hlsl::examples::geometry_creator_scene::SPushConstants::DescriptorCount;
+
+	//
+	struct SInstance
+	{
+		// NOTE(review): presumably has to match SPushConstants in task_mesh_common.hlsl (float4x4 followed by a uint) - confirm
+		struct SPushConstants
+		{
+			NBL_CONSTEXPR_STATIC_INLINE uint32_t DescriptorCount = (0x1 << 16) - 1;
+
+			nbl::hlsl::float32_t4x4 viewProj;
+			uint32_t vertCount;
+		};
+
+		hlsl::float32_t3x4 world;
+	};
+
+	static std::array<const core::smart_refctd_ptr<nbl::asset::IShader>, 2> CreateTestShader(asset::IAssetManager* assMan, video::IGPURenderpass* renderpass, const uint32_t subpassIX);
+
+	//
+	static core::smart_refctd_ptr<MeshDebugRenderer> create(asset::IAssetManager* assMan, video::IGPURenderpass* renderpass, const uint32_t subpassIX);
+	//
+	struct SInitParams {
+
+		core::smart_refctd_ptr<video::IGPUDescriptorSet> meshDescriptor;
+		core::smart_refctd_ptr<video::IGPUPipelineLayout> pipe_layout; //when im looking at it from outside the class i need to know what kind of layout this is
+		core::smart_refctd_ptr<video::IGPUMeshPipeline> pipeline;
+	};
+	// NOTE: hands out a mutable reference, so callers are able to replace or reset any of the held objects
+	inline SInitParams& getInitParams() {return m_params;}
+
+	//im not going to go thru every example to fix them up to use this static function instead, so im leaving the old one
+	//device should be const* but im not going to fix it right now 
+	//(scope creep)
+
+	bool addGeometries();
+
+	void removeGeometry(const uint32_t ix, const video::ISemaphore::SWaitInfo& info);
+
+	inline const auto& getGeometries() const {return m_geoms;}
+
+	void render(video::IGPUCommandBuffer* cmdbuf, nbl::hlsl::float32_t4x4 const& mvp) const;
+
+	SInstance m_instance;
+
+	//mesh layout
+	//PVP vertices at set 0 binding 0
+	//mesh data at set 1 binding 0
+	//they should be in the same set but tiny bit slower (1 additional API call) for a tiny bit easier programming
+	nbl::core::smart_refctd_ptr<nbl::video::IGPUDescriptorSetLayout> mesh_layout{};
+
+	MeshDataBuffer m_geoms;
+protected:
+	inline MeshDebugRenderer(SInitParams&& _params) : m_params(std::move(_params)) {}
+	inline ~MeshDebugRenderer()	{
+		// clean shutdown, can also make SubAllocatedDescriptorSet resillient against that, and issue `device->waitIdle` if not everything is freed
+		// pipe_layout can be null here (m_params is writable through getInitParams()), so guard the dereference
+		if (m_params.pipe_layout)
+			const_cast<video::ILogicalDevice*>(m_params.pipe_layout->getOriginDevice())->waitIdle();
+		clearGeometries({});
+	}
+	void clearGeometries(const video::ISemaphore::SWaitInfo& info);
+
+	SInitParams m_params;
+};
+
+}