Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ compiled.spv
*/.vscode/*
*/__main__.py
/tmp/rtSamples.bin
imgui.ini
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,8 @@ if(NBL_BUILD_EXAMPLES)
add_subdirectory(71_RayTracingPipeline)
add_subdirectory(72_CooperativeBinarySearch)

add_subdirectory(MeshShader)

# add new examples *before* NBL_GET_ALL_TARGETS invocation, it gathers recursively all targets created so far in this subdirectory
NBL_GET_ALL_TARGETS(TARGETS)

Expand Down
45 changes: 45 additions & 0 deletions MeshShader/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Build script for the MeshShader example.
include(common RESULT_VARIABLE RES)
if(NOT RES)
	message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
endif()

# Everything below links against the ImGui extension libraries, so the example is only
# configured when NBL_BUILD_IMGUI is enabled.
if(NBL_BUILD_IMGUI)
	set(NBL_EXTRA_SOURCES
		#"${CMAKE_CURRENT_SOURCE_DIR}/src/transform.cpp" #just leaving this so i can easily reference it later
		"${CMAKE_CURRENT_SOURCE_DIR}/src/SampleApp.cpp"
		"${CMAKE_CURRENT_SOURCE_DIR}/src/MeshRenderer.cpp"
	)

	set(NBL_INCLUDE_SEARCH_DIRECTORIES
		"${CMAKE_CURRENT_SOURCE_DIR}/include"
	)

	list(APPEND NBL_LIBRARIES
		imtestengine
		imguizmo
		"${NBL_EXT_IMGUI_UI_LIB}"
	)

	# TODO; Arek I removed `NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET` from the last parameter here, doesn't this macro have 4 arguments anyway !?
	nbl_create_executable_project("${NBL_EXTRA_SOURCES}" "" "${NBL_INCLUDE_SEARCH_DIRECTORIES}" "${NBL_LIBRARIES}")
	# TODO: Arek temporarily disabled cause I haven't figured out how to make this target yet
	# LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} nblExamplesGeometrySpirvBRD)

	# Embedding of the files under app_resources/.
	# This section names its resource target after ${EXECUTABLE_NAME} and links it into the
	# executable target. Nothing in this file sets that variable or creates that target before
	# nbl_create_executable_project() runs, so the section has to come after the call above.
	if(NBL_EMBED_BUILTIN_RESOURCES)
		set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
		set(RESOURCE_DIR "app_resources")

		get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
		get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
		get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)

		# Collect every file below app_resources/, as paths relative to that directory.
		file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
		foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
			LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
		endforeach()

		ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")

		LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
	endif()
endif()
49 changes: 49 additions & 0 deletions MeshShader/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
## 9/30/2025 - GDBobby
Here's the current plan, front to back

1. Remove all unnecessary parts from my copy of example 61.

1.1 figure out what IS necessary.

1.2 trace the graphics pipeline used, so I can figure out how the mesh pipeline should look

2. I don't have much experience with viewports and scissors yet, so I'd like to change
how the ImGui viewport is handled just for the fun of it. Example 61 mentions it's rendered to a
temporary color attachment which is then sourced as a texture in ImGui. I'd like to change it so
that ImGui literally just puts a box around a viewport that's rendered to directly.

3. Create the Mesh Pipeline.

3.1. I want to support generative (procedural) mesh shaders, which take 0 input vertices

3.2. I want to support meshlets - small meshes that are defined by pre-existing vertices

3.3. I want to re-compile the mesh shader into a compute and vertex shader combo,
which can be used on machines that don't support the mesh shader extension
(mostly GPUs older than 2016)


I think, to prevent controlling two different branches in two different repos, I'll stuff everything into this example in the beginning.
Once everything starts to come together, I'll start moving things, like the Mesh Pipeline class, into more appropriate places, like Nabla itself.


## 9/31
I'll create a mesh shader tomorrow. I don't really know what to do yet but I'll start with procedural gen.

I think I'll also make a different pipeline object that supports the geometry from example 61?

I had my fun with viewports. I don't know what I expected, to be honest.

I need to search a little deeper in the spec for other mesh pipeline related rules. I need to research subpasses as well.


## 10/3
Beginning shader experimentation. Setting up easy reload of shaders so I don't have to relaunch every test iteration.

## 10/6
I need to add CPU-side verification that the mesh shader vertex and primitive counts are below the Vulkan limits, the same way the workgroup size is verified.

Mesh and Task shaders having branches where the output is not defined is incorrect. The glslc compiler won't warn the user, I'll have to check for DXC. Nvidia will assume it's a 0 group output, but AMD will get DEVICE_LOST. If it's possible, having a warning or compile check for that would be nice. Most likely outside the scope of Nabla, but possibly not. I'll have to ask.

## 10/9
On the bug hunting phase. Should be finished shortly, then I'll hit the cleanup phase.
23 changes: 23 additions & 0 deletions MeshShader/app_resources/FirstBuild.mesh.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
//https://microsoft.github.io/DirectX-Specs/d3d/MeshShader.html#primitive-attributes

// Per-vertex output record of this mesh shader: a single position bound to SV_Position.
struct SInterpolants
{
    float4 ndc : SV_Position;
};
// Record with one value bound to the SV_PrimitiveID semantic.
// NOTE(review): the entry point in this file does not reference this struct.
struct Primo
{
    uint vertexID : SV_PrimitiveID;
};

// Mesh shader entry point: every thread emits one vertex and one point primitive
// that references that vertex.
//   ID    - dispatch-wide thread index; its x component selects the output slot
//   verts - per-vertex outputs (WORKGROUP_SIZE slots)
//   prims - one vertex index per point primitive (WORKGROUP_SIZE slots)
[numthreads(WORKGROUP_SIZE,1,1)]
[outputtopology("point")]
[shader("mesh")]
void main(
    in uint3 ID : SV_DispatchThreadID,
    out vertices SInterpolants verts[WORKGROUP_SIZE],
    out indices uint prims[WORKGROUP_SIZE]
)
{
    // The output counts have to be declared before the first write to any output array.
    SetMeshOutputCounts(WORKGROUP_SIZE, WORKGROUP_SIZE);

    // NOTE(review): ID.x is the dispatch-wide index, so it only stays inside the
    // WORKGROUP_SIZE-sized arrays for the first workgroup - confirm a single group is dispatched.
    verts[ID.x].ndc = float32_t4(ID.x, 0.0, 0.0, 1.0);
    prims[ID.x] = ID.x;
}
13 changes: 13 additions & 0 deletions MeshShader/app_resources/geom.frag.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@

// Interpolated inputs of the fragment stage.
struct VertexOut
{
    // position, bound to SV_Position
    float32_t4 ndc : SV_Position;
    // extra per-vertex vector, bound to COLOR1
    float32_t3 meta : COLOR1;
};


// Fragment entry point: shows the interpolated `meta` vector as a colour.
// The vector is normalised, scaled by 0.5 and offset by 0.5 per channel; alpha is 1.
[shader("pixel")]
float32_t4 main(VertexOut input) : SV_Target0
{
    const float32_t3 unitDir = normalize(input.meta);
    const float32_t3 rgb = unitDir * 0.5f + float32_t3(0.5f, 0.5f, 0.5f);
    return float32_t4(rgb, 1.f);
}
51 changes: 51 additions & 0 deletions MeshShader/app_resources/geom.mesh.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
//https://microsoft.github.io/DirectX-Specs/d3d/MeshShader.html#primitive-attributes

#include "task_mesh_common.hlsl"

// Read-only buffer of float3 elements that the mesh shader indexes per vertex.
// The vk::binding arguments are (binding, set), i.e. binding 0 of descriptor set 0.
[[vk::binding(0, 0)]] StructuredBuffer<float3> MeshVertexBuffer;

// Per-vertex outputs written by the mesh shader.
struct VertexOut
{
    // position, bound to SV_Position
    float32_t4 ndc : SV_Position;
    // extra per-vertex vector, bound to COLOR1
    float32_t3 meta : COLOR1;
};

// Mesh shader entry point: transforms positions from MeshVertexBuffer by pc.mvp and
// emits them as a non-indexed triangle list (three consecutive vertices per triangle).
//   id            - dispatch-wide thread index; id.x selects the vertex / triangle to write
//   groupThreadID - thread index inside the workgroup (currently unused)
//   verts         - per-vertex outputs (WORKGROUP_SIZE slots)
//   prims         - per-triangle vertex indices (WORKGROUP_SIZE slots)
[numthreads(WORKGROUP_SIZE,1,1)]
[outputtopology("triangle")]
[shader("mesh")]
void main(
    in uint3 id : SV_DispatchThreadID,
    in uint3 groupThreadID : SV_GroupThreadID,
    out vertices VertexOut verts[WORKGROUP_SIZE],
    out indices uint3 prims[WORKGROUP_SIZE]
)
{
    const uint vertCount = pc.vertCount;
    // I'm just assuming it's a triangle list right now; this won't work if it's not.
    const uint primCount = vertCount / 3;

    // The output counts have to be declared before the first write to any output array.
    // NOTE(review): vertCount is not clamped to WORKGROUP_SIZE (the declared array size) here -
    // confirm the CPU side guarantees it fits.
    SetMeshOutputCounts(vertCount, primCount);

    // I haven't benchmarked this personally, but my understanding is that AMD devices prefer mesh shaders to be "by primitive"
    // and that NVIDIA devices prefer mesh shaders to be "by vertex".
    // Ideally, I'd benchmark both and set up branches so that each device can specialize the shader based on what it likes
    // (there's a property in VkMeshProperties that would indicate this).
    if (id.x < vertCount) {
        const float32_t3 position = MeshVertexBuffer[id.x];

        // verts[id.x].ndc = mul(float32_t4(position, 1.0), worldViewProj);
        verts[id.x].ndc = mul(pc.mvp, float32_t4(position, 1.0));

        // the untransformed position is passed along in `meta`
        verts[id.x].meta = position;
    }

    if (id.x < primCount) {
        prims[id.x] = uint3(
            id.x * 3,
            id.x * 3 + 1,
            id.x * 3 + 2
        );
    }
}
22 changes: 22 additions & 0 deletions MeshShader/app_resources/geom.task.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@

#include "task_mesh_common.hlsl"

// Payload handed to the mesh workgroups through DispatchMesh.
groupshared TaskToMeshPayload taskToMeshPayload;

// Task shader entry point, run by a single thread.
// Expands the per-type counts in pc.objectCount into a flat list of object types
// (one entry per mesh workgroup) and then launches that many mesh workgroups.
// NOTE(review): TaskToMeshPayload, OBJECT_COUNT and pc.objectCount are not provided by the
// task_mesh_common.hlsl shown alongside this file - confirm where they are meant to come from.
[numthreads(1,1,1)]
void main(
    in uint3 id : SV_DispatchThreadID,
    in uint3 groupThreadId : SV_GroupThreadID
    //out payload TaskToMeshPayload taskToMeshPayload, interestingly, thats not how it's done here
){
    uint meshGroupCount = 0;
    for (uint typeIx = 0; typeIx < OBJECT_COUNT; typeIx++)
    {
        // one payload entry per instance of this object type
        for (uint instanceIx = 0; instanceIx < pc.objectCount[typeIx]; instanceIx++)
        {
            taskToMeshPayload.objectType[meshGroupCount] = typeIx;
            meshGroupCount++;
        }
    }

    printf("dispatching meshes - %u", meshGroupCount);
    DispatchMesh(meshGroupCount, 1, 1, taskToMeshPayload);
}
30 changes: 30 additions & 0 deletions MeshShader/app_resources/task_mesh_common.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@

//this is user defined data sent from the task shader to the mesh shader
//1 packet is sent, but it can use arrays so that each workgroup can receive customized data
//struct TaskToMeshPayload {
// uint objectType[INSTANCE_COUNT * OBJECT_COUNT];
//};

//1 is for cone, 2 is for fan, anything else is for a triangle list without the special normal calc.
//cone can be handled in the task shader or the mesh shader, I'm going to handle it in the task shader
//#define OTHER_OBJECTS 0
#define CONE_OBJECT_TYPE 1
#define T_FAN_OBJECT_TYPE 2
// Description of a single mesh.
struct MeshData
{
    uint vertCount;
    // we're assuming vertCount is always equal to primCount (no index buffer)
    uint primCount;
    // presumably one of the *_OBJECT_TYPE values - TODO confirm
    uint objType;
    // NOTE(review): the three *View fields look like indices of buffer views - confirm against the CPU side
    uint positionView;
    uint normalView;
    uint indexView;
};


// 0xFFFF. The whole expression is parenthesised so the macro expands as a single value
// inside a larger expression (e.g. `PushDescCount * 2`).
#define PushDescCount ((0x1<<16)-1)
// Layout of the push constant block declared below as `pc`.
struct SPushConstants
{
    // 4x4 transform matrix
    float4x4 mvp;
    // number of vertices
    uint vertCount;
};

//im not keen on trying to figure out how the push constant abstraction worked before without documentation
[[vk::push_constant]] SPushConstants pc;
117 changes: 117 additions & 0 deletions MeshShader/include/MeshRenderer.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
#pragma once

#include "nbl/builtin/hlsl/math/linalg/fast_affine.hlsl"
#include "nbl/examples/geometry/SPushConstants.hlsl"

namespace nbl::examples
{

// Kinds of geometry handled by the meshlet renderer.
// COUNT is not a geometry: it is the number of enumerators that precede it.
enum class MeshletObjectTypes
{
	Cube = 0,
	Rectangle,	// 1
	Disk,		// 2
	Sphere,		// 3
	Cylinder,	// 4
	Cone,		// 5
	Icosphere,	// 6

	COUNT		// 7
};
// Plain record of five 32-bit values describing one object; this is buffer data.
struct MeshletObjectData
{
	uint32_t vertCount;		// number of vertices
	uint32_t objectType;	// presumably a MeshletObjectTypes value - TODO confirm
	uint32_t positionView;	// NOTE(review): the *View fields look like view indices - confirm
	uint32_t normalView;
	uint32_t indexView;
};
// CPU-side bundle of the GPU geometry plus the per-object and per-instance data that goes with it.
struct MeshDataBuffer {
	//if gpuGeometry is nullptr or std::nullopt or whatever, then mesh object type is invalid, the CPU memory failed to transfer to GPU for whatever reason
	core::smart_refctd_ptr<const video::IGPUPolygonGeometry> gpuGeometry{};

	// one slot per MeshletObjectTypes enumerator
	static constexpr std::size_t MaxObjectCount = static_cast<std::size_t>(MeshletObjectTypes::COUNT);
	static constexpr std::size_t MaxInstanceCount = 8; //for each object

	// Both arrays are value-initialised so a default-constructed MeshDataBuffer never
	// exposes indeterminate counts, view indices or matrices.
	MeshletObjectData meshData[MaxObjectCount]{};
	hlsl::float32_t4x4 transforms[MaxInstanceCount]{};

	//remove index type to avoid branch in shader
	//asset::E_INDEX_TYPE indexType = asset::EIT_UNKNOWN;
};


// Reference-counted holder of the mesh pipeline state (descriptor set, pipeline layout, mesh
// pipeline) together with the geometry data it draws. The constructor and destructor are
// protected; instances are meant to be obtained through create().
class MeshDebugRenderer final : public core::IReferenceCounted {
	public:
		//
		constexpr static inline uint16_t VertexAttrubUTBDescBinding = 0;

		constexpr static inline auto MissingView = hlsl::examples::geometry_creator_scene::SPushConstants::DescriptorCount;

		// Per-instance data.
		struct SInstance
		{
			// Push constant block: a 4x4 matrix followed by a vertex count.
			struct SPushConstants
			{
				NBL_CONSTEXPR_STATIC_INLINE uint32_t DescriptorCount = (0x1 << 16) - 1;

				nbl::hlsl::float32_t4x4 viewProj;
				uint32_t vertCount;
			};

			hlsl::float32_t3x4 world;
		};

		// Returns a pair of shaders for the given renderpass/subpass.
		// NOTE(review): which stage sits in which slot is not visible from this header - confirm in the .cpp.
		static std::array<const core::smart_refctd_ptr<nbl::asset::IShader>, 2> CreateTestShader(asset::IAssetManager* assMan, video::IGPURenderpass* renderpass, const uint32_t subpassIX);

		// Factory for the renderer; takes the asset manager, the renderpass and the subpass index to target.
		static core::smart_refctd_ptr<MeshDebugRenderer> create(asset::IAssetManager* assMan, video::IGPURenderpass* renderpass, const uint32_t subpassIX);

		// Objects the renderer is constructed from and keeps alive.
		struct SInitParams {

			core::smart_refctd_ptr<video::IGPUDescriptorSet> meshDescriptor;
			core::smart_refctd_ptr<video::IGPUPipelineLayout> pipe_layout; //when im looking at it from outside the class i need to know what kind of layout this is
			core::smart_refctd_ptr<video::IGPUMeshPipeline> pipeline;
		};
		// Mutable access to the stored init params.
		inline SInitParams& getInitParams() {return m_params;}

		//im not going to go thru every example to fix them up to use this static function instead, so im leaving the old one
		//device should be const* but im not going to fix it right now
		//(scope creep)

		bool addGeometries();

		void removeGeometry(const uint32_t ix, const video::ISemaphore::SWaitInfo& info);

		inline const auto& getGeometries() const {return m_geoms;}

		// Draws using the given command buffer and model-view-projection matrix.
		void render(video::IGPUCommandBuffer* cmdbuf, nbl::hlsl::float32_t4x4 const& mvp) const;

		SInstance m_instance;

		//mesh layout
		//PVP vertices at set 0 binding 0
		//mesh data at set 1 binding 0
		//they should be in the same set but tiny bit slower (1 additional API call) for a tiny bit easier programming
		nbl::core::smart_refctd_ptr<nbl::video::IGPUDescriptorSetLayout> mesh_layout{};

		MeshDataBuffer m_geoms;
	protected:
		inline MeshDebugRenderer(SInitParams&& _params) : m_params(std::move(_params)) {}
		inline ~MeshDebugRenderer() {
			// clean shutdown, can also make SubAllocatedDescriptorSet resilient against that, and issue `device->waitIdle` if not everything is freed
			// getInitParams() hands out a mutable reference, so the layout may have been reset by
			// the time we get here; only wait on the device when there is a layout to ask for it.
			if (const auto* layout = m_params.pipe_layout.get())
				const_cast<video::ILogicalDevice*>(layout->getOriginDevice())->waitIdle();
			clearGeometries({});
		}
		void clearGeometries(const video::ISemaphore::SWaitInfo& info);

		SInitParams m_params;
};

}
Loading