Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 64 additions & 0 deletions 73_ImageUploadBenchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# Load the shared "common" CMake module. OPTIONAL is required for the check
# below to be reachable: without it include() aborts on its own with a generic
# "could not find requested file" error and RES is never inspected. With it,
# RES is set to NOTFOUND on failure and the tailored message is printed.
include(common OPTIONAL RESULT_VARIABLE RES)
if(NOT RES)
    message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory")
endif()

# Project helper from the included module; the statements further down rely on
# ${EXECUTABLE_NAME}, which is presumably defined by this call - confirm in common.cmake.
nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}")

# Optional step, enabled by NBL_EMBED_BUILTIN_RESOURCES: collect every file under
# app_resources/ and hand the list to the project's builtin-resource macros.
if(NBL_EMBED_BUILTIN_RESOURCES)
# Name of the resource target and the directory (relative to this CMakeLists.txt) that is scanned.
set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData)
set(RESOURCE_DIR "app_resources")

# Absolute paths: the search root is this source directory, the two output
# directories live under the binary directory (src/ and include/).
get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE)
get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE)

# Paths are globbed relative to app_resources/; CONFIGURE_DEPENDS asks the build
# system to re-check the glob so added or removed files trigger a re-configure.
file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*")
# Register each file; RESOURCES_TO_EMBED is passed by name, so the macro presumably appends to it.
foreach(RES_FILE ${BUILTIN_RESOURCE_FILES})
LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}")
endforeach()

# Project macro; by its name and arguments it creates the resource target in the
# nbl::this_example::builtin namespace - see its definition for the exact outputs.
ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}")

# Attach the resource target to the example executable.
LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_})
endif()

# Directory under the binary tree that is passed as BINARY_DIR to the compile
# rules and as BIND to the resource archive below.
set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen")
# HLSL files listed as sources of the executable (and as DEPENDS of the compile rules).
set(DEPENDS
app_resources/common.hlsl
app_resources/tile_upload.comp.hlsl
)
# HEADER_FILE_ONLY keeps the shaders listed on the target without CMake trying to compile them.
target_sources(${EXECUTABLE_NAME} PRIVATE ${DEPENDS})
set_source_files_properties(${DEPENDS} PROPERTIES HEADER_FILE_ONLY ON)

# Shader model suffix; expanded into the "-T lib_${SM}" profile option below.
set(SM 6_8)
# Compile inputs as a JSON array: one entry mapping the shader file to the key "snakeStore".
set(JSON [=[
[
{
"INPUT": "app_resources/tile_upload.comp.hlsl",
"KEY": "snakeStore"
}
]
]=])
# Expands @VAR@ / ${VAR} references inside the JSON text; the literal above
# contains none, so the text is currently left unchanged.
string(CONFIGURE "${JSON}" JSON)

# Project macro: sets up the shader compile rules for the inputs above and
# returns the resulting keys through OUTPUT_VAR (KEYS), which is consumed below.
NBL_CREATE_NSC_COMPILE_RULES(
TARGET ${EXECUTABLE_NAME}SPIRV
LINK_TO ${EXECUTABLE_NAME}
DEPENDS ${DEPENDS}
BINARY_DIR ${OUTPUT_DIRECTORY}
MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT
COMMON_OPTIONS -I "${CMAKE_CURRENT_SOURCE_DIR}" -T lib_${SM}
OUTPUT_VAR KEYS
INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp
NAMESPACE nbl::this_example::builtin::build
INPUTS ${JSON}
)

# Project macro: builds a resource archive from the keys produced above, bound
# to the same output directory and linked to the example executable.
NBL_CREATE_RESOURCE_ARCHIVE(
NAMESPACE nbl::this_example::builtin::build
TARGET ${EXECUTABLE_NAME}_builtinsBuild
LINK_TO ${EXECUTABLE_NAME}
BIND ${OUTPUT_DIRECTORY}
BUILTINS ${KEYS}
)
8 changes: 8 additions & 0 deletions 73_ImageUploadBenchmark/app_resources/common.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
// Push-constant block shared by every entry point in tile_upload.comp.hlsl.
struct PushConstantData
{
// Base device address of the packed-texel buffer; thread gIdx accesses the 32-bit word at +gIdx*4 bytes.
uint64_t deviceBufferAddress;
// Texel offset added to the computed position before indexing dstImage (not applied by the Batched* entry points).
uint32_t2 dstOffset;
// Extent used for bounds checks and index decoding; the Batched* entry points pass these as the per-tile width/height.
uint32_t srcWidth;
uint32_t srcHeight;
// Number of tiles per row of the destination grid; only read by the Batched* entry points.
uint32_t tilesPerRow;
};
260 changes: 260 additions & 0 deletions 73_ImageUploadBenchmark/app_resources/tile_upload.comp.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,260 @@
#include "common.hlsl"

// Image written by the *Store entry points and read by the *Load entry points (binding 0, descriptor set 0).
[[vk::binding(0,0)]] RWTexture2D<float32_t4> dstImage;
// Per-dispatch parameters, see PushConstantData in common.hlsl.
[[vk::push_constant]] PushConstantData pc;

using namespace nbl::hlsl;

// Sub-tile extent used by the snake-order mappings: 16 x 8 = 128 texels,
// the same count as the numthreads(128, 1, 1) workgroup size of the entry points.
static const uint32_t TILE_WIDTH = 16u;
static const uint32_t TILE_HEIGHT = 8u;

// Buffer -> image copy in plain row-major order.
// Thread gIdx reads the 32-bit word at deviceBufferAddress + gIdx*4, expands its
// four 8-bit channels (R in the lowest byte) to floats in [0,1] and writes the
// texel at dstOffset + (gIdx % srcWidth, gIdx / srcWidth).
[numthreads(128, 1, 1)]
[shader("compute")]
void linearStore(uint32_t3 ID : SV_DispatchThreadID)
{
    const uint32_t texel = ID.x;
    const uint32_t2 coord = uint32_t2(texel % pc.srcWidth, texel / pc.srcWidth);

    // Threads that fall outside the source extent do nothing.
    if (coord.x < pc.srcWidth && coord.y < pc.srcHeight)
    {
        const uint32_t word = vk::RawBufferLoad<uint32_t>(pc.deviceBufferAddress + texel * 4u);

        // One byte per channel, lowest byte first.
        const uint32_t4 channels = uint32_t4(word >> 0u, word >> 8u, word >> 16u, word >> 24u) & 0xFFu;

        dstImage[pc.dstOffset + coord] = float32_t4(channels) / 255.0f;
    }
}

// Image -> buffer copy in plain row-major order (inverse of linearStore).
// Thread gIdx reads the texel at dstOffset + (gIdx % srcWidth, gIdx / srcWidth),
// converts each channel to an 8-bit value with round-to-nearest (x*255 + 0.5)
// and stores the packed word at deviceBufferAddress + gIdx*4.
[numthreads(128, 1, 1)]
[shader("compute")]
void linearLoad(uint32_t3 ID : SV_DispatchThreadID)
{
    const uint32_t texel = ID.x;
    const uint32_t2 coord = uint32_t2(texel % pc.srcWidth, texel / pc.srcWidth);

    // Threads that fall outside the source extent do nothing.
    if (coord.x < pc.srcWidth && coord.y < pc.srcHeight)
    {
        const float32_t4 value = dstImage[pc.dstOffset + coord];

        // Per-channel quantisation; no clamping is applied, same as a plain cast.
        const uint32_t4 q = uint32_t4(value * 255.0f + 0.5f);
        const uint32_t word = (q.x << 0u) | (q.y << 8u) | (q.z << 16u) | (q.w << 24u);

        vk::RawBufferStore<uint32_t>(pc.deviceBufferAddress + texel * 4u, word);
    }
}


// Maps a linear index to a texel position using TILE_WIDTH x TILE_HEIGHT tiles
// visited in "snake" order: tiles on even tile rows run left to right, tiles on
// odd tile rows run right to left. Texels inside a tile are row-major.
//   gIdx     - linear texel index
//   srcWidth - image width in texels; the tile count per row is srcWidth / TILE_WIDTH
//              (integer division)
// NOTE(review): srcWidth < TILE_WIDTH makes the tile count zero and the divisions
// below undefined; presumably callers guarantee a non-zero multiple of TILE_WIDTH - confirm.
uint32_t2 snakePixelPos(uint32_t gIdx, uint32_t srcWidth)
{
    const uint32_t texelsPerTile = TILE_WIDTH * TILE_HEIGHT;
    const uint32_t columns = srcWidth / TILE_WIDTH;

    const uint32_t tile = gIdx / texelsPerTile;
    const uint32_t within = gIdx % texelsPerTile;

    const uint32_t band = tile / columns;
    const uint32_t step = tile % columns;
    // Odd bands are traversed in the opposite X direction.
    const uint32_t column = ((band & 1u) != 0u) ? (columns - 1u - step) : step;

    const uint32_t2 tileOrigin = uint32_t2(column * TILE_WIDTH, band * TILE_HEIGHT);
    const uint32_t2 inTile = uint32_t2(within % TILE_WIDTH, within / TILE_WIDTH);
    return tileOrigin + inTile;
}

// Buffer -> image copy where the thread-to-texel mapping comes from snakePixelPos.
// Thread gIdx always reads the word at deviceBufferAddress + gIdx*4, so the
// buffer is consumed in snake order rather than row-major order.
[numthreads(128, 1, 1)]
[shader("compute")]
void SnakeOrderStore(uint32_t3 ID : SV_DispatchThreadID)
{
    const uint32_t texel = ID.x;
    const uint32_t2 coord = snakePixelPos(texel, pc.srcWidth);

    // Threads mapped outside the source extent do nothing.
    if (coord.x < pc.srcWidth && coord.y < pc.srcHeight)
    {
        const uint32_t word = vk::RawBufferLoad<uint32_t>(pc.deviceBufferAddress + texel * 4u);

        // One byte per channel, lowest byte first, scaled to [0,1].
        const uint32_t4 channels = uint32_t4(word >> 0u, word >> 8u, word >> 16u, word >> 24u) & 0xFFu;

        dstImage[pc.dstOffset + coord] = float32_t4(channels) / 255.0f;
    }
}

// Image -> buffer copy using the snakePixelPos mapping (inverse of SnakeOrderStore).
// The texel at dstOffset + snakePixelPos(gIdx) is quantised to four 8-bit
// channels and written to deviceBufferAddress + gIdx*4.
[numthreads(128, 1, 1)]
[shader("compute")]
void SnakeOrderLoad(uint32_t3 ID : SV_DispatchThreadID)
{
    const uint32_t texel = ID.x;
    const uint32_t2 coord = snakePixelPos(texel, pc.srcWidth);

    // Threads mapped outside the source extent do nothing.
    if (coord.x < pc.srcWidth && coord.y < pc.srcHeight)
    {
        const float32_t4 value = dstImage[pc.dstOffset + coord];

        // Round-to-nearest per channel, then pack with R in the lowest byte.
        const uint32_t4 q = uint32_t4(value * 255.0f + 0.5f);
        const uint32_t word = (q.x << 0u) | (q.y << 8u) | (q.z << 16u) | (q.w << 24u);

        vk::RawBufferStore<uint32_t>(pc.deviceBufferAddress + texel * 4u, word);
    }
}

// Gathers the even-indexed bits of x (bits 0, 2, 4, ...) into the low 16 bits
// of the result; odd-indexed bits are discarded by the first mask.
uint32_t mortonCompact1By1(uint32_t x)
{
    uint32_t bits = x & 0x55555555u;

    // Each step halves the spacing between the surviving bits.
    bits ^= bits >> 1u;
    bits &= 0x33333333u;

    bits ^= bits >> 2u;
    bits &= 0x0f0f0f0fu;

    bits ^= bits >> 4u;
    bits &= 0x00ff00ffu;

    bits ^= bits >> 8u;
    bits &= 0x0000ffffu;

    return bits;
}

// Splits an interleaved (Morton) code into a 2D coordinate:
// x is taken from the even-indexed bits, y from the odd-indexed bits.
uint32_t2 mortonDecode(uint32_t code)
{
    const uint32_t evenBits = mortonCompact1By1(code);
    const uint32_t oddBits = mortonCompact1By1(code >> 1u);
    return uint32_t2(evenBits, oddBits);
}
Comment on lines +120 to +136
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We already have a Morton code function in our HLSL library.
This was already mentioned here: https://discord.com/channels/593902898015109131/1450452175234011147/1471719738525614334

Remove this and include "nbl/builtin/hlsl/morton.hlsl" instead.
If the usage is not clear, search on Discord and ask.


// Decomposes a linear index over a grid of equally sized tiles.
//   gIdx        - linear index; consecutive runs of tileW*tileH indices belong to one tile
//   tileW/tileH - tile extent in texels
//   tilesPerRow - number of tiles per grid row (tiles are numbered row-major)
//   tileBase    - [out] texel position of the tile's origin: (column*tileW, row*tileH)
//   localIdx    - [out] index of the texel inside its tile, in [0, tileW*tileH)
void batchedTileInfo(uint32_t gIdx, uint32_t tileW, uint32_t tileH, uint32_t tilesPerRow,
    out uint32_t2 tileBase, out uint32_t localIdx)
{
    const uint32_t texelsPerTile = tileW * tileH;
    const uint32_t tile = gIdx / texelsPerTile;

    localIdx = gIdx % texelsPerTile;

    const uint32_t2 gridPos = uint32_t2(tile % tilesPerRow, tile / tilesPerRow);
    tileBase = gridPos * uint32_t2(tileW, tileH);
}

// Expands a packed 32-bit value holding four 8-bit channels (R in the lowest
// byte, A in the highest) into a float vector with each channel divided by 255.
float32_t4 unpackRGBA(uint32_t packed)
{
    const uint32_t4 shifted = uint32_t4(packed >> 0u, packed >> 8u, packed >> 16u, packed >> 24u);
    const uint32_t4 channels = shifted & 0xFFu;
    return float32_t4(channels) / 255.0f;
}

// Buffer -> image copy for a batch of tiles laid out on a grid.
// Each tile is srcWidth x srcHeight texels; batchedTileInfo yields the tile's
// origin and the texel's index inside the tile, which is then placed row-major.
// Unlike the non-batched entry points this one applies no dstOffset and has no
// bounds check.
[numthreads(128, 1, 1)]
[shader("compute")]
void BatchedLinearStore(uint32_t3 ID : SV_DispatchThreadID)
{
    const uint32_t texel = ID.x;

    uint32_t2 origin;
    uint32_t within;
    batchedTileInfo(texel, pc.srcWidth, pc.srcHeight, pc.tilesPerRow, origin, within);

    const uint32_t2 inTile = uint32_t2(within % pc.srcWidth, within / pc.srcWidth);

    const uint32_t word = vk::RawBufferLoad<uint32_t>(pc.deviceBufferAddress + texel * 4u);
    dstImage[origin + inTile] = unpackRGBA(word);
}

// Buffer -> image copy for a batch of tiles, with snake ordering inside each tile.
// Every srcWidth x srcHeight tile is subdivided into TILE_WIDTH x TILE_HEIGHT
// sub-tiles; sub-tiles on odd sub-tile rows are visited right to left, and
// texels inside a sub-tile are row-major. No dstOffset and no bounds check are applied.
[numthreads(128, 1, 1)]
[shader("compute")]
void BatchedSnakeStore(uint32_t3 ID : SV_DispatchThreadID)
{
    const uint32_t texel = ID.x;

    uint32_t2 origin;
    uint32_t within;
    batchedTileInfo(texel, pc.srcWidth, pc.srcHeight, pc.tilesPerRow, origin, within);

    const uint32_t subTileTexels = TILE_WIDTH * TILE_HEIGHT;
    const uint32_t subTilesPerRow = pc.srcWidth / TILE_WIDTH;

    const uint32_t subTile = within / subTileTexels;
    const uint32_t subTexel = within % subTileTexels;

    const uint32_t band = subTile / subTilesPerRow;
    const uint32_t step = subTile % subTilesPerRow;
    // Zigzag: odd bands run in the opposite X direction.
    const uint32_t column = ((band & 1u) != 0u) ? (subTilesPerRow - 1u - step) : step;

    const uint32_t2 inTile = uint32_t2(
        column * TILE_WIDTH + (subTexel % TILE_WIDTH),
        band * TILE_HEIGHT + (subTexel / TILE_WIDTH));

    const uint32_t word = vk::RawBufferLoad<uint32_t>(pc.deviceBufferAddress + texel * 4u);
    dstImage[origin + inTile] = unpackRGBA(word);
}

// Buffer -> image copy for a batch of tiles, with Morton (Z-order) placement
// inside each srcWidth x srcHeight tile. Threads whose decoded in-tile position
// lies outside the tile extent are skipped. No dstOffset is applied.
[numthreads(128, 1, 1)]
[shader("compute")]
void BatchedMortonStore(uint32_t3 ID : SV_DispatchThreadID)
{
    const uint32_t texel = ID.x;

    uint32_t2 origin;
    uint32_t within;
    batchedTileInfo(texel, pc.srcWidth, pc.srcHeight, pc.tilesPerRow, origin, within);

    const uint32_t2 inTile = mortonDecode(within);

    // The Morton code can address positions beyond the tile; those are dropped.
    if (inTile.x < pc.srcWidth && inTile.y < pc.srcHeight)
    {
        const uint32_t word = vk::RawBufferLoad<uint32_t>(pc.deviceBufferAddress + texel * 4u);
        dstImage[origin + inTile] = unpackRGBA(word);
    }
}

// Buffer -> image copy where thread gIdx is placed at the Morton-decoded
// position of gIdx. The word at deviceBufferAddress + gIdx*4 is unpacked into
// four [0,1] channels and written at dstOffset + mortonDecode(gIdx).
[numthreads(128, 1, 1)]
[shader("compute")]
void MortonOrderStore(uint32_t3 ID : SV_DispatchThreadID)
{
    const uint32_t texel = ID.x;
    const uint32_t2 coord = mortonDecode(texel);

    // Codes that decode outside the source extent are skipped.
    if (coord.x < pc.srcWidth && coord.y < pc.srcHeight)
    {
        const uint32_t word = vk::RawBufferLoad<uint32_t>(pc.deviceBufferAddress + texel * 4u);

        // One byte per channel, lowest byte first, scaled to [0,1].
        const uint32_t4 channels = uint32_t4(word >> 0u, word >> 8u, word >> 16u, word >> 24u) & 0xFFu;

        dstImage[pc.dstOffset + coord] = float32_t4(channels) / 255.0f;
    }
}

// Image -> buffer copy using the Morton mapping (inverse of MortonOrderStore).
// The texel at dstOffset + mortonDecode(gIdx) is quantised to four 8-bit
// channels and written to deviceBufferAddress + gIdx*4.
[numthreads(128, 1, 1)]
[shader("compute")]
void MortonOrderLoad(uint32_t3 ID : SV_DispatchThreadID)
{
    const uint32_t texel = ID.x;
    const uint32_t2 coord = mortonDecode(texel);

    // Codes that decode outside the source extent are skipped.
    if (coord.x < pc.srcWidth && coord.y < pc.srcHeight)
    {
        const float32_t4 value = dstImage[pc.dstOffset + coord];

        // Round-to-nearest per channel, then pack with R in the lowest byte.
        const uint32_t4 q = uint32_t4(value * 255.0f + 0.5f);
        const uint32_t word = (q.x << 0u) | (q.y << 8u) | (q.z << 16u) | (q.w << 24u);

        vk::RawBufferStore<uint32_t>(pc.deviceBufferAddress + texel * 4u, word);
    }
}
28 changes: 28 additions & 0 deletions 73_ImageUploadBenchmark/config.json.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
{
"enableParallelBuild": true,
"threadsPerBuildProcess" : 2,
"isExecuted": false,
"scriptPath": "",
"cmake": {
"configurations": [ "Release", "Debug", "RelWithDebInfo" ],
"buildModes": [],
"requiredOptions": []
},
"profiles": [
{
"backend": "vulkan", // should be none
"platform": "windows",
"buildModes": [],
"runConfiguration": "Release", // we also need to run in Debug and RWDI because this is a foundational example
"gpuArchitectures": []
}
],
"dependencies": [],
"data": [
{
"dependencies": [],
"command": [""],
"outputs": []
}
]
}
Loading