-
Notifications
You must be signed in to change notification settings - Fork 16
Image upload benchmark #238
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
CrabExtra
wants to merge
7
commits into
master
Choose a base branch
from
image_upload_benchamark
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
6635ba9
Add 73_ImageUploadBenchmark example
CrabExtra 951e2fd
Simple benchmark HOST_VISIBLE vs HOST_VISIBLE & DEVICE_LOCAL
CrabExtra 141295b
Measurement was weird, added some detail and also fixed a bug related to…
CrabExtra 874814a
Resolved PR comments + adding timestamp query
CrabExtra ddb7bfc
Adding more logs to release build
CrabExtra f1fc8d5
Added image to image copy
CrabExtra 7abe408
compute shader added
CrabExtra File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,64 @@ | ||
| # Load the shared `common` CMake module; RES evaluates to false (NOTFOUND) when include() cannot locate it. | ||
| include(common RESULT_VARIABLE RES) | ||
| if(NOT RES) | ||
| message(FATAL_ERROR "common.cmake not found. Should be in {repo_root}/cmake directory") | ||
| endif() | ||
|
|
||
| # Create the example's executable through the project helper. | ||
| # NOTE(review): the helper is defined outside this file; presumably it sets ${EXECUTABLE_NAME}, which the rest of this script relies on - confirm, along with the meaning of the four empty positional arguments. | ||
| nbl_create_executable_project("" "" "" "" "${NBL_EXECUTABLE_PROJECT_CREATION_PCH_TARGET}") | ||
|
|
||
| # Optional path, taken only when NBL_EMBED_BUILTIN_RESOURCES is set: collect every file under app_resources/ and hand the list to the builtin-resource macros. | ||
| # NOTE(review): LIST_BUILTIN_RESOURCE / ADD_CUSTOM_BUILTIN_RESOURCES / LINK_BUILTIN_RESOURCES_TO_TARGET are defined elsewhere; presumably they generate and link an embedded-resource library - confirm. | ||
| if(NBL_EMBED_BUILTIN_RESOURCES) | ||
| set(_BR_TARGET_ ${EXECUTABLE_NAME}_builtinResourceData) | ||
| set(RESOURCE_DIR "app_resources") | ||
|
|
||
| # Absolute paths: the search root is this source directory, generated sources/headers go under the binary directory. | ||
| get_filename_component(_SEARCH_DIRECTORIES_ "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE) | ||
| get_filename_component(_OUTPUT_DIRECTORY_SOURCE_ "${CMAKE_CURRENT_BINARY_DIR}/src" ABSOLUTE) | ||
| get_filename_component(_OUTPUT_DIRECTORY_HEADER_ "${CMAKE_CURRENT_BINARY_DIR}/include" ABSOLUTE) | ||
|
|
||
| # Paths are globbed relative to app_resources/; CONFIGURE_DEPENDS re-checks the glob at build time so added/removed files trigger a re-configure. | ||
| file(GLOB_RECURSE BUILTIN_RESOURCE_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}" CONFIGURE_DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${RESOURCE_DIR}/*") | ||
| foreach(RES_FILE ${BUILTIN_RESOURCE_FILES}) | ||
| LIST_BUILTIN_RESOURCE(RESOURCES_TO_EMBED "${RES_FILE}") | ||
| endforeach() | ||
|
|
||
| ADD_CUSTOM_BUILTIN_RESOURCES(${_BR_TARGET_} RESOURCES_TO_EMBED "${_SEARCH_DIRECTORIES_}" "${RESOURCE_DIR}" "nbl::this_example::builtin" "${_OUTPUT_DIRECTORY_HEADER_}" "${_OUTPUT_DIRECTORY_SOURCE_}") | ||
|
|
||
| LINK_BUILTIN_RESOURCES_TO_TARGET(${EXECUTABLE_NAME} ${_BR_TARGET_}) | ||
| endif() | ||
|
|
||
| # Directory under the binary tree that is later passed as BINARY_DIR / BIND for the shader build outputs. | ||
| set(OUTPUT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/auto-gen") | ||
| # Shader sources: attached to the executable target but marked HEADER_FILE_ONLY so the C++ toolchain does not try to compile them. | ||
| set(DEPENDS | ||
| app_resources/common.hlsl | ||
| app_resources/tile_upload.comp.hlsl | ||
| ) | ||
| target_sources(${EXECUTABLE_NAME} PRIVATE ${DEPENDS}) | ||
| set_source_files_properties(${DEPENDS} PROPERTIES HEADER_FILE_ONLY ON) | ||
|
|
||
| # Shader model suffix; consumed below as the `-T lib_${SM}` compile option. | ||
| set(SM 6_8) | ||
| # Compile inputs as a JSON array of {INPUT, KEY} objects. | ||
| # NOTE(review): the single key is named "snakeStore" although tile_upload.comp.hlsl defines many entry points - confirm the key name is intended. | ||
| set(JSON [=[ | ||
| [ | ||
| { | ||
| "INPUT": "app_resources/tile_upload.comp.hlsl", | ||
| "KEY": "snakeStore" | ||
| } | ||
| ] | ||
| ]=]) | ||
| # Expand any CMake variable references inside the JSON text (none are present at the moment). | ||
| string(CONFIGURE "${JSON}" JSON) | ||
|
|
||
| # Register the shader compile rules for the JSON inputs and link the resulting target to the executable. | ||
| # The list of produced keys is returned in KEYS and reused by the archive call below. | ||
| # NOTE(review): NBL_CREATE_NSC_COMPILE_RULES is defined outside this file; the exact meaning of each keyword should be confirmed there. | ||
| NBL_CREATE_NSC_COMPILE_RULES( | ||
| TARGET ${EXECUTABLE_NAME}SPIRV | ||
| LINK_TO ${EXECUTABLE_NAME} | ||
| DEPENDS ${DEPENDS} | ||
| BINARY_DIR ${OUTPUT_DIRECTORY} | ||
| MOUNT_POINT_DEFINE NBL_THIS_EXAMPLE_BUILD_MOUNT_POINT | ||
| COMMON_OPTIONS -I "${CMAKE_CURRENT_SOURCE_DIR}" -T lib_${SM} | ||
| OUTPUT_VAR KEYS | ||
| INCLUDE nbl/this_example/builtin/build/spirv/keys.hpp | ||
| NAMESPACE nbl::this_example::builtin::build | ||
| INPUTS ${JSON} | ||
| ) | ||
|
|
||
| # Bundle the outputs listed in KEYS, found under OUTPUT_DIRECTORY, into a resource archive target linked to the executable. | ||
| # NOTE(review): presumably this embeds the compiled shaders under the same namespace as above - confirm against the macro definition. | ||
| NBL_CREATE_RESOURCE_ARCHIVE( | ||
| NAMESPACE nbl::this_example::builtin::build | ||
| TARGET ${EXECUTABLE_NAME}_builtinsBuild | ||
| LINK_TO ${EXECUTABLE_NAME} | ||
| BIND ${OUTPUT_DIRECTORY} | ||
| BUILTINS ${KEYS} | ||
| ) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,8 @@ | ||
| // Push-constant block shared by all entry points of tile_upload.comp.hlsl. | ||
| // NOTE(review): presumably mirrored by a host-side struct, so field order and sizes should not change without updating it - confirm. | ||
| struct PushConstantData | ||
| { | ||
| // Address of the packed RGBA8 texel buffer; the shaders access it at deviceBufferAddress + index * 4. | ||
| uint64_t deviceBufferAddress; | ||
| // Pixel offset added to the computed coordinate by the non-batched entry points; the Batched* entry points do not read it. | ||
| uint32_t2 dstOffset; | ||
| // Source extent in pixels; the Batched* entry points also use it as the size of one batched image. | ||
| uint32_t srcWidth; | ||
| uint32_t srcHeight; | ||
| // Number of batched images placed side by side per row; only read by the Batched* entry points. | ||
| uint32_t tilesPerRow; | ||
| }; |
260 changes: 260 additions & 0 deletions
260
73_ImageUploadBenchmark/app_resources/tile_upload.comp.hlsl
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,260 @@ | ||
| #include "common.hlsl" | ||
|
|
||
| // Storage image that every entry point in this file reads from or writes to (descriptor binding 0, set 0). | ||
| [[vk::binding(0,0)]] RWTexture2D<float32_t4> dstImage; | ||
| // Per-dispatch parameters; the struct is declared in common.hlsl. | ||
| [[vk::push_constant]] PushConstantData pc; | ||
|
|
||
| using namespace nbl::hlsl; | ||
|
|
||
| // Tile footprint used by the snake-order traversals: 16 x 8 = 128 texels, the same count as the 128-thread workgroups used below. | ||
| static const uint32_t TILE_WIDTH = 16u; | ||
| static const uint32_t TILE_HEIGHT = 8u; | ||
|
|
||
| // Buffer -> image upload in row-major order. | ||
| // Thread gIdx reads the packed RGBA8 texel at buffer slot gIdx (R in bits 0-7, G 8-15, B 16-23, A 24-31), | ||
| // converts each channel to [0, 1] and writes it to dstImage at pc.dstOffset + (gIdx % srcWidth, gIdx / srcWidth). | ||
| // NOTE(review): assumes the host dispatches enough 128-thread groups to cover srcWidth * srcHeight texels - confirm. | ||
| [numthreads(128, 1, 1)] | ||
| [shader("compute")] | ||
| void linearStore(uint32_t3 ID : SV_DispatchThreadID) | ||
| { | ||
| const uint32_t texelIndex = ID.x; | ||
| const uint32_t2 srcCoord = uint32_t2(texelIndex % pc.srcWidth, texelIndex / pc.srcWidth); | ||
| // Only the row test can actually reject a thread here: x is a remainder of srcWidth. | ||
| const bool insideSource = srcCoord.x < pc.srcWidth && srcCoord.y < pc.srcHeight; | ||
| if (!insideSource) | ||
| return; | ||
| const uint32_t texel = vk::RawBufferLoad<uint32_t>(pc.deviceBufferAddress + texelIndex * 4u); | ||
| float32_t4 rgba; | ||
| rgba.r = float32_t((texel >> 0u) & 0xFFu) / 255.0f; | ||
| rgba.g = float32_t((texel >> 8u) & 0xFFu) / 255.0f; | ||
| rgba.b = float32_t((texel >> 16u) & 0xFFu) / 255.0f; | ||
| rgba.a = float32_t((texel >> 24u) & 0xFFu) / 255.0f; | ||
| dstImage[pc.dstOffset + srcCoord] = rgba; | ||
| } | ||
|
|
||
| // Image -> buffer readback in row-major order. | ||
| // Thread gIdx reads dstImage at pc.dstOffset + (gIdx % srcWidth, gIdx / srcWidth), scales each channel by 255, | ||
| // rounds by adding 0.5 before the integer conversion, and stores the packed RGBA8 value in buffer slot gIdx. | ||
| [numthreads(128, 1, 1)] | ||
| [shader("compute")] | ||
| void linearLoad(uint32_t3 ID : SV_DispatchThreadID) | ||
| { | ||
| const uint32_t texelIndex = ID.x; | ||
| const uint32_t2 srcCoord = uint32_t2(texelIndex % pc.srcWidth, texelIndex / pc.srcWidth); | ||
| const bool insideSource = srcCoord.x < pc.srcWidth && srcCoord.y < pc.srcHeight; | ||
| if (!insideSource) | ||
| return; | ||
| const float32_t4 texel = dstImage[pc.dstOffset + srcCoord]; | ||
| const uint32_t red = uint32_t(texel.r * 255.0f + 0.5f); | ||
| const uint32_t green = uint32_t(texel.g * 255.0f + 0.5f); | ||
| const uint32_t blue = uint32_t(texel.b * 255.0f + 0.5f); | ||
| const uint32_t alpha = uint32_t(texel.a * 255.0f + 0.5f); | ||
| // R occupies the lowest byte, A the highest. | ||
| const uint32_t rgba8 = (red << 0u) | (green << 8u) | (blue << 16u) | (alpha << 24u); | ||
| vk::RawBufferStore<uint32_t>(pc.deviceBufferAddress + texelIndex * 4u, rgba8); | ||
| } | ||
|
|
||
|
|
||
| // Maps a linear index to a pixel coordinate that walks the image in TILE_WIDTH x TILE_HEIGHT tiles. | ||
| // Texels inside a tile are row-major; tiles are visited left-to-right on even tile rows and right-to-left on odd ones. | ||
| // NOTE(review): the tile count per row uses truncating division, so srcWidth < TILE_WIDTH divides by zero and a srcWidth | ||
| // that is not a multiple of TILE_WIDTH never maps to the rightmost columns - confirm callers only pass multiples of TILE_WIDTH. | ||
| uint32_t2 snakePixelPos(uint32_t gIdx, uint32_t srcWidth) | ||
| { | ||
| static const uint32_t TEXELS_PER_TILE = TILE_WIDTH * TILE_HEIGHT; | ||
| const uint32_t tileColumns = srcWidth / TILE_WIDTH; | ||
| const uint32_t tileIndex = gIdx / TEXELS_PER_TILE; | ||
| const uint32_t inTileIndex = gIdx % TEXELS_PER_TILE; | ||
| const uint32_t2 inTile = uint32_t2(inTileIndex % TILE_WIDTH, inTileIndex / TILE_WIDTH); | ||
| const uint32_t tileRow = tileIndex / tileColumns; | ||
| uint32_t tileCol = tileIndex % tileColumns; | ||
| // Mirror the column on odd tile rows so consecutive tiles stay adjacent. | ||
| if ((tileRow & 1u) != 0u) | ||
| tileCol = tileColumns - 1u - tileCol; | ||
| return uint32_t2(tileCol * TILE_WIDTH + inTile.x, tileRow * TILE_HEIGHT + inTile.y); | ||
| } | ||
|
|
||
| // Buffer -> image upload in snake (zigzag tile) order. | ||
| // Buffer slot gIdx holds the packed RGBA8 texel for pixel snakePixelPos(gIdx, srcWidth); the unpacked colour is | ||
| // written to dstImage at pc.dstOffset + that pixel. Threads whose pixel falls outside the source extent do nothing. | ||
| [numthreads(128, 1, 1)] | ||
| [shader("compute")] | ||
| void SnakeOrderStore(uint32_t3 ID : SV_DispatchThreadID) | ||
| { | ||
| const uint32_t texelIndex = ID.x; | ||
| const uint32_t2 srcCoord = snakePixelPos(texelIndex, pc.srcWidth); | ||
| const bool insideSource = srcCoord.x < pc.srcWidth && srcCoord.y < pc.srcHeight; | ||
| if (!insideSource) | ||
| return; | ||
| const uint32_t texel = vk::RawBufferLoad<uint32_t>(pc.deviceBufferAddress + texelIndex * 4u); | ||
| float32_t4 rgba; | ||
| rgba.r = float32_t((texel >> 0u) & 0xFFu) / 255.0f; | ||
| rgba.g = float32_t((texel >> 8u) & 0xFFu) / 255.0f; | ||
| rgba.b = float32_t((texel >> 16u) & 0xFFu) / 255.0f; | ||
| rgba.a = float32_t((texel >> 24u) & 0xFFu) / 255.0f; | ||
| dstImage[pc.dstOffset + srcCoord] = rgba; | ||
| } | ||
|
|
||
| // Image -> buffer readback in snake (zigzag tile) order. | ||
| // Thread gIdx reads dstImage at pc.dstOffset + snakePixelPos(gIdx, srcWidth) and stores the colour, | ||
| // re-packed as RGBA8 with round-to-nearest, in buffer slot gIdx. | ||
| [numthreads(128, 1, 1)] | ||
| [shader("compute")] | ||
| void SnakeOrderLoad(uint32_t3 ID : SV_DispatchThreadID) | ||
| { | ||
| const uint32_t texelIndex = ID.x; | ||
| const uint32_t2 srcCoord = snakePixelPos(texelIndex, pc.srcWidth); | ||
| const bool insideSource = srcCoord.x < pc.srcWidth && srcCoord.y < pc.srcHeight; | ||
| if (!insideSource) | ||
| return; | ||
| const float32_t4 texel = dstImage[pc.dstOffset + srcCoord]; | ||
| const uint32_t red = uint32_t(texel.r * 255.0f + 0.5f); | ||
| const uint32_t green = uint32_t(texel.g * 255.0f + 0.5f); | ||
| const uint32_t blue = uint32_t(texel.b * 255.0f + 0.5f); | ||
| const uint32_t alpha = uint32_t(texel.a * 255.0f + 0.5f); | ||
| // R occupies the lowest byte, A the highest. | ||
| const uint32_t rgba8 = (red << 0u) | (green << 8u) | (blue << 16u) | (alpha << 24u); | ||
| vk::RawBufferStore<uint32_t>(pc.deviceBufferAddress + texelIndex * 4u, rgba8); | ||
| } | ||
|
|
||
| // Gathers the even-indexed bits of x (bits 0, 2, 4, ...) into the low 16 bits of the result; odd-indexed bits are discarded. | ||
| // Each step halves the spacing between the surviving bits, so the statement order matters. | ||
| // NOTE(review): the PR review on this change asks to use nbl/builtin/hlsl/morton.hlsl instead of a local Morton implementation. | ||
| uint32_t mortonCompact1By1(uint32_t x) | ||
| { | ||
| x &= 0x55555555u; | ||
| x = (x ^ (x >> 1u)) & 0x33333333u; | ||
| x = (x ^ (x >> 2u)) & 0x0f0f0f0fu; | ||
| x = (x ^ (x >> 4u)) & 0x00ff00ffu; | ||
| x = (x ^ (x >> 8u)) & 0x0000ffffu; | ||
| return x; | ||
| } | ||
|
|
||
| // Splits a Morton (Z-order) code into its 2D coordinate: x comes from the even-indexed bits, y from the odd-indexed bits. | ||
| uint32_t2 mortonDecode(uint32_t code) | ||
| { | ||
| const uint32_t xFromEvenBits = mortonCompact1By1(code); | ||
| const uint32_t yFromOddBits = mortonCompact1By1(code >> 1u); | ||
| return uint32_t2(xFromEvenBits, yFromOddBits); | ||
| } | ||
|
|
||
| // Decomposes a global texel index for a batch of equally sized tileW x tileH tiles laid out tilesPerRow per row. | ||
| // tileBase receives the pixel origin of the tile that owns gIdx; localIdx receives the texel's index inside that tile. | ||
| void batchedTileInfo(uint32_t gIdx, uint32_t tileW, uint32_t tileH, uint32_t tilesPerRow, | ||
| out uint32_t2 tileBase, out uint32_t localIdx) | ||
| { | ||
| const uint32_t texelsPerTile = tileW * tileH; | ||
| const uint32_t tile = gIdx / texelsPerTile; | ||
| const uint32_t column = tile % tilesPerRow; | ||
| const uint32_t row = tile / tilesPerRow; | ||
| tileBase = uint32_t2(column * tileW, row * tileH); | ||
| localIdx = gIdx % texelsPerTile; | ||
| } | ||
|
|
||
| // Expands a packed RGBA8 value (R in bits 0-7, G 8-15, B 16-23, A 24-31) into a float colour with each channel in [0, 1]. | ||
| float32_t4 unpackRGBA(uint32_t packed) | ||
| { | ||
| float32_t4 rgba; | ||
| rgba.r = float32_t((packed >> 0u) & 0xFFu) / 255.0f; | ||
| rgba.g = float32_t((packed >> 8u) & 0xFFu) / 255.0f; | ||
| rgba.b = float32_t((packed >> 16u) & 0xFFu) / 255.0f; | ||
| rgba.a = float32_t((packed >> 24u) & 0xFFu) / 255.0f; | ||
| return rgba; | ||
| } | ||
|
|
||
| // Batched buffer -> image upload: the buffer holds several srcWidth x srcHeight images back to back, each row-major, | ||
| // and they are placed in dstImage as a grid with pc.tilesPerRow images per row. | ||
| // NOTE(review): unlike the non-batched stores this applies neither pc.dstOffset nor any bounds check, so it relies on | ||
| // the dispatch size matching the batch exactly - confirm on the host side. | ||
| [numthreads(128, 1, 1)] | ||
| [shader("compute")] | ||
| void BatchedLinearStore(uint32_t3 ID : SV_DispatchThreadID) | ||
| { | ||
| const uint32_t texelIndex = ID.x; | ||
| uint32_t2 imageOrigin; | ||
| uint32_t inImageIndex; | ||
| batchedTileInfo(texelIndex, pc.srcWidth, pc.srcHeight, pc.tilesPerRow, imageOrigin, inImageIndex); | ||
| // Row-major position inside the owning image. | ||
| const uint32_t2 inImage = uint32_t2(inImageIndex % pc.srcWidth, inImageIndex / pc.srcWidth); | ||
| const uint32_t texel = vk::RawBufferLoad<uint32_t>(pc.deviceBufferAddress + texelIndex * 4u); | ||
| dstImage[imageOrigin + inImage] = unpackRGBA(texel); | ||
| } | ||
|
|
||
| // Batched buffer -> image upload: the buffer holds several srcWidth x srcHeight images back to back, each stored in | ||
| // snake (zigzag tile) order, and they are placed in dstImage as a grid with pc.tilesPerRow images per row. | ||
| // NOTE(review): unlike the non-batched stores this applies neither pc.dstOffset nor any bounds check, so it relies on | ||
| // the dispatch size matching the batch exactly - confirm on the host side. | ||
| [numthreads(128, 1, 1)] | ||
| [shader("compute")] | ||
| void BatchedSnakeStore(uint32_t3 ID : SV_DispatchThreadID) | ||
| { | ||
| uint32_t gIdx = ID.x; | ||
| uint32_t2 tileBase; | ||
| uint32_t localIdx; | ||
| batchedTileInfo(gIdx, pc.srcWidth, pc.srcHeight, pc.tilesPerRow, tileBase, localIdx); | ||
|
|
||
| // The in-image traversal is exactly the snake order used by SnakeOrderStore, so reuse the shared mapping | ||
| // instead of repeating the tile/zigzag arithmetic here. | ||
| uint32_t2 pixelPos = tileBase + snakePixelPos(localIdx, pc.srcWidth); | ||
|
|
||
| uint32_t packed = vk::RawBufferLoad<uint32_t>(pc.deviceBufferAddress + gIdx * 4u); | ||
| dstImage[pixelPos] = unpackRGBA(packed); | ||
| } | ||
|
|
||
| // Batched buffer -> image upload: the buffer holds several srcWidth x srcHeight images back to back, each addressed in | ||
| // Morton (Z-order) order, and they are placed in dstImage as a grid with pc.tilesPerRow images per row. | ||
| // NOTE(review): the in-image index never exceeds srcWidth * srcHeight - 1, so Z-order only reaches every pixel when the | ||
| // image is a square with a power-of-two side - confirm that is the only case used. pc.dstOffset is not applied here. | ||
| [numthreads(128, 1, 1)] | ||
| [shader("compute")] | ||
| void BatchedMortonStore(uint32_t3 ID : SV_DispatchThreadID) | ||
| { | ||
| const uint32_t texelIndex = ID.x; | ||
| uint32_t2 imageOrigin; | ||
| uint32_t inImageIndex; | ||
| batchedTileInfo(texelIndex, pc.srcWidth, pc.srcHeight, pc.tilesPerRow, imageOrigin, inImageIndex); | ||
| const uint32_t2 inImage = mortonDecode(inImageIndex); | ||
| // Skip Z-order positions that fall outside the image they belong to. | ||
| const bool insideImage = inImage.x < pc.srcWidth && inImage.y < pc.srcHeight; | ||
| if (!insideImage) | ||
| return; | ||
| const uint32_t texel = vk::RawBufferLoad<uint32_t>(pc.deviceBufferAddress + texelIndex * 4u); | ||
| dstImage[imageOrigin + inImage] = unpackRGBA(texel); | ||
| } | ||
|
|
||
| // Buffer -> image upload in Morton (Z-order) order. | ||
| // Buffer slot gIdx holds the packed RGBA8 texel for pixel mortonDecode(gIdx); the unpacked colour is written to | ||
| // dstImage at pc.dstOffset + that pixel. | ||
| [numthreads(128, 1, 1)] | ||
| [shader("compute")] | ||
| void MortonOrderStore(uint32_t3 ID : SV_DispatchThreadID) | ||
| { | ||
| uint32_t gIdx = ID.x; | ||
| uint32_t2 pixelPos = mortonDecode(gIdx); | ||
|
|
||
| // Z-order indices span a square power-of-two grid, so positions outside the source extent are skipped. | ||
| if (pixelPos.x >= pc.srcWidth || pixelPos.y >= pc.srcHeight) | ||
| return; | ||
|
|
||
| uint32_t packed = vk::RawBufferLoad<uint32_t>(pc.deviceBufferAddress + gIdx * 4u); | ||
|
|
||
| // Use the shared RGBA8 decoder (as the Batched* stores do) rather than repeating the per-channel unpacking inline. | ||
| dstImage[pc.dstOffset + pixelPos] = unpackRGBA(packed); | ||
| } | ||
|
|
||
| // Image -> buffer readback in Morton (Z-order) order. | ||
| // Thread gIdx reads dstImage at pc.dstOffset + mortonDecode(gIdx) and stores the colour, | ||
| // re-packed as RGBA8 with round-to-nearest, in buffer slot gIdx. | ||
| [numthreads(128, 1, 1)] | ||
| [shader("compute")] | ||
| void MortonOrderLoad(uint32_t3 ID : SV_DispatchThreadID) | ||
| { | ||
| const uint32_t texelIndex = ID.x; | ||
| const uint32_t2 srcCoord = mortonDecode(texelIndex); | ||
| // Skip Z-order positions that fall outside the source extent. | ||
| const bool insideSource = srcCoord.x < pc.srcWidth && srcCoord.y < pc.srcHeight; | ||
| if (!insideSource) | ||
| return; | ||
| const float32_t4 texel = dstImage[pc.dstOffset + srcCoord]; | ||
| const uint32_t red = uint32_t(texel.r * 255.0f + 0.5f); | ||
| const uint32_t green = uint32_t(texel.g * 255.0f + 0.5f); | ||
| const uint32_t blue = uint32_t(texel.b * 255.0f + 0.5f); | ||
| const uint32_t alpha = uint32_t(texel.a * 255.0f + 0.5f); | ||
| // R occupies the lowest byte, A the highest. | ||
| const uint32_t rgba8 = (red << 0u) | (green << 8u) | (blue << 16u) | (alpha << 24u); | ||
| vk::RawBufferStore<uint32_t>(pc.deviceBufferAddress + texelIndex * 4u, rgba8); | ||
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,28 @@ | ||
| { | ||
| "enableParallelBuild": true, | ||
| "threadsPerBuildProcess" : 2, | ||
| "isExecuted": false, | ||
| "scriptPath": "", | ||
| "cmake": { | ||
| "configurations": [ "Release", "Debug", "RelWithDebInfo" ], | ||
| "buildModes": [], | ||
| "requiredOptions": [] | ||
| }, | ||
| "profiles": [ | ||
| { | ||
| "backend": "vulkan", // should be none | ||
| "platform": "windows", | ||
| "buildModes": [], | ||
| "runConfiguration": "Release", // we also need to run in Debug and RWDI because this is a foundational example | ||
| "gpuArchitectures": [] | ||
| } | ||
| ], | ||
| "dependencies": [], | ||
| "data": [ | ||
| { | ||
| "dependencies": [], | ||
| "command": [""], | ||
| "outputs": [] | ||
| } | ||
| ] | ||
| } |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We already have a Morton code function in our HLSL library.
This was already mentioned here: https://discord.com/channels/593902898015109131/1450452175234011147/1471719738525614334
Remove this and include "nbl/builtin/hlsl/morton.hlsl".
If the usage is not clear, search on Discord and ask.