From 9ae0346e0287060611365fe73dc63b21ae28f17b Mon Sep 17 00:00:00 2001 From: yumemiso Date: Thu, 11 Jun 2026 22:38:52 +0800 Subject: [PATCH] add client hook methods for gpu virtualization in cloud --- .../gpu-remoting/include/clientEndpoint.h | 95 + .../gpu-remoting/src/client/cublasHook.cc | 204 ++ .../gpu-remoting/src/client/cublasLtHook.cc | 306 ++ .../gpu-remoting/src/client/cudaHook.cc | 123 + .../gpu-remoting/src/client/cudaHook.cc.tmp | 3013 +++++++++++++++++ .../gpu-remoting/src/client/cudartHook.cc | 1185 +++++++ .../gpu-remoting/src/client/cudnnHook.cc | 1412 ++++++++ .../gpu-remoting/src/client/ncclHook.cc | 690 ++++ .../gpu-remoting/src/client/nvmlHook.cc | 65 + 9 files changed, 7093 insertions(+) create mode 100644 GPU-Virtual-Service/gpu-remoting/include/clientEndpoint.h create mode 100644 GPU-Virtual-Service/gpu-remoting/src/client/cublasHook.cc create mode 100644 GPU-Virtual-Service/gpu-remoting/src/client/cublasLtHook.cc create mode 100644 GPU-Virtual-Service/gpu-remoting/src/client/cudaHook.cc create mode 100644 GPU-Virtual-Service/gpu-remoting/src/client/cudaHook.cc.tmp create mode 100644 GPU-Virtual-Service/gpu-remoting/src/client/cudartHook.cc create mode 100644 GPU-Virtual-Service/gpu-remoting/src/client/cudnnHook.cc create mode 100644 GPU-Virtual-Service/gpu-remoting/src/client/ncclHook.cc create mode 100644 GPU-Virtual-Service/gpu-remoting/src/client/nvmlHook.cc diff --git a/GPU-Virtual-Service/gpu-remoting/include/clientEndpoint.h b/GPU-Virtual-Service/gpu-remoting/include/clientEndpoint.h new file mode 100644 index 0000000..604ba12 --- /dev/null +++ b/GPU-Virtual-Service/gpu-remoting/include/clientEndpoint.h @@ -0,0 +1,95 @@ +#ifndef CLIENT_ENDPOINT_H +#define CLIENT_ENDPOINT_H + +#include "configure.h" +#include "shmqueue/shmUtil.h" +#include "ucpConnection.h" +#include "requestBuffer.h" +#include "requestIOV.h" +#include "registerIOV.h" +#include "gpuIdMap.h" +#include "./hashing/robin_hood.h" +#include "./conqueue/readerwriterqueue.h" + 
+class SpinLock { +private: + std::atomic_flag flag = ATOMIC_FLAG_INIT; + +public: + void lock() { + while (flag.test_and_set(std::memory_order_acquire)) { + // Busy wait + } + } + + void unlock() { + flag.clear(std::memory_order_release); + } +}; + +class ClientEndpoint { + private: + const char* myName_ = "ClientEndpoint"; + char* clientIP_; + char* clientPort_; + size_t priority_ = 0; + + ConnStatus_t connStatus_ = {false, true}; + + vector clearTimes_; + + RequestIOV* recordedReq = NULL; + SpinLock listLock_; + bool memcpyRecord_ = false; + boost::intrusive::list reqIOVList; + // int ckptIter = 0; + // int ckptCnt = 0; + + public: + uint64_t _clientID; + int _myDevIdx = 0; + int _threadID = 0; + int _processID = 0; + + ucp_worker_h _dataWorker; + ucp_ep_h _serverEp; + + size_t _copySize = 0; // data size that has been copied + SharedMemoryOpt* _shmOpt = NULL; + SharedMemoryOpt* _GpuIdMap = NULL; + + ClientEndpoint(uint64_t clientID, size_t priority, ucp_worker_h clientWorker, int dev); + + ~ClientEndpoint(); + + void Connect(bool replay = false); + void CloseEp(uint64_t flags); + + static void SendRegisterRequest(ClientEndpoint* curEp, bool forcedEager = true); + ucs_status_t SendRequest(RequestIOV* reqBuffer, bool forcedEager = false, bool isCheckpoint = true); + void SendRequestH2D(RequestIOV* reqBuffer, uint8_t* header, size_t headerSize, bool forcedEager = false, bool isCheckpoint = true); + ucs_status_t SendRequestRecvResponse(RequestIOV* reqBuffer, RequestIOV* responseBuffer, bool forcedEager = false, bool isCheckpoint = true); + void SendNewIterRequest(size_t iterNum); + + void Checkpointing(); + void Replay(); + void UpdateReqIOVList(RequestIOV* reqBuffer); + void UpdateReqIOVList(RequestIOV* reqBuffer, uint8_t* header, size_t headerSize); + void ShrinkReqIOVList(); + + void SendMainDevice(bool replay); +}; + +extern std::mutex reConnectMutex; +extern bool isReConnected; +extern thread_local int ttID; +extern SharedMemoryOpt* shmOpt; +extern std::vector 
regIOVList; +extern GPUidMap* gpuIdMap; +extern Configure* config_; +extern thread_local std::vector tensorByteList; +extern void CheckTensors(int reqType); +extern ucs_status_t ServerStatusCallback(void *arg, const void *header, size_t header_length, void *data, size_t length, const ucp_am_recv_param_t *param); +extern void ClientErrorCallback(void *arg, ucp_ep_h ep, ucs_status_t status); + +#endif \ No newline at end of file diff --git a/GPU-Virtual-Service/gpu-remoting/src/client/cublasHook.cc b/GPU-Virtual-Service/gpu-remoting/src/client/cublasHook.cc new file mode 100644 index 0000000..4c677b0 --- /dev/null +++ b/GPU-Virtual-Service/gpu-remoting/src/client/cublasHook.cc @@ -0,0 +1,204 @@ +#include "../../include/hook/hook.h" + + +cublasStatus_t cublasCreate_v2(cublasHandle_t *handle) { + const char* func_name = "cublasCreate_v2"; + HookLog(func_name); + using func_ptr = cublasStatus_t (*)(cublasHandle_t *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUBLAS_CREATE_V2); + *handle = NULL; + reqBuf.Push(handle); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(handle); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + + return CUBLAS_STATUS_SUCCESS; + + //return func_entry(handle); +} + +cublasStatus_t cublasSgemm_v2(cublasHandle_t handle, + cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, + const float *alpha, const float *A, int lda, + const float *B, int ldb, const float *beta, + float *C, int ldc) { + const char* func_name = "cublasSgemm_v2"; + HookLog(func_name); + using func_ptr = + cublasStatus_t (*)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float *, + const float *, int, const float *, int, const float *, float *, int); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cublasSgemm_v2")); + + // printf("alpha: %p, beta: %p\n", alpha, beta); + + // RequestBuffer reqBuf = 
RequestBuffer(sizeof(uint64_t) + sizeof(cublasOperation_t) + sizeof(cublasOperation_t) + 3 * sizeof(int) + 2 * (sizeof(size_t)+sizeof(const float)) + 3 * sizeof(uint64_t) + 3 * sizeof(int)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUBLAS_SGEMM_V2); + reqBuf.Push64BitPointer(handle); + reqBuf.Push(transa); + reqBuf.Push(transb); + reqBuf.Push(m); + reqBuf.Push(n); + reqBuf.Push(k); + reqBuf.PushConst(alpha); + reqBuf.Push64BitPointer(A); // device pointer + reqBuf.Push(lda); + reqBuf.Push64BitPointer(B); // device pointer + reqBuf.Push(ldb); + reqBuf.PushConst(beta); + reqBuf.Push64BitPointer(C); // device pointer + reqBuf.Push(ldc); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUBLAS_STATUS_SUCCESS; + + //return func_entry(handle, transa, transb, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); +} + +cublasStatus_t cublasSgemmStridedBatched(cublasHandle_t handle, + cublasOperation_t transa, cublasOperation_t transb, + int m, int n, int k, + const float* alpha, const float* A, int lda, long long int strideA, + const float* B, int ldb, long long int strideB, const float* beta, + float* C, int ldc, long long int strideC, int batchCount) { + const char* func_name = "cublasSgemmStridedBatched"; + HookLog(func_name); + using func_ptr = cublasStatus_t (*)(cublasHandle_t, cublasOperation_t, cublasOperation_t, int, int, int, const float *, + const float *, int, long long int, const float *, int, long long int, const float *, + float *, int, long long int, int); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cublasSgemmStridedBatched")); + + // printf("alpha: %p, beta: %p\n", alpha, beta); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(cublasOperation_t) + sizeof(cublasOperation_t) + 3 * sizeof(int) + 2 * (sizeof(size_t)+sizeof(const float)) + 3 * sizeof(uint64_t) + 3 * sizeof(int) + 3 * sizeof(long long int) + sizeof(int)); + 
RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUBLAS_SGEMM_STRIDED_BATCHED); + reqBuf.Push64BitPointer(handle); + reqBuf.Push(transa); + reqBuf.Push(transb); + reqBuf.Push(m); + reqBuf.Push(n); + reqBuf.Push(k); + reqBuf.PushConst(alpha); + reqBuf.Push64BitPointer(A); // device pointer + reqBuf.Push(lda); + reqBuf.Push(strideA); + reqBuf.Push64BitPointer(B); // device pointer + reqBuf.Push(ldb); + reqBuf.Push(strideB); + reqBuf.PushConst(beta); + reqBuf.Push64BitPointer(C); // device pointer + reqBuf.Push(ldc); + reqBuf.Push(strideC); + reqBuf.Push(batchCount); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUBLAS_STATUS_SUCCESS; + + //return func_entry(handle, transa, transb, m, n, k, alpha, A, lda, strideA, B, ldb, strideB, beta, C, ldc, strideC, batchCount); +} + +cublasStatus_t cublasDestroy_v2(cublasHandle_t handle) { + const char* func_name = "cublasDestroy_v2"; + HookLog(func_name); + using func_ptr = cublasStatus_t (*)(cublasHandle_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cublasDestroy_v2")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUBLAS_DESTROY_V2); + reqBuf.Push64BitPointer(handle); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUBLAS_STATUS_SUCCESS; + + //return func_entry(handle); +} + +//todo: to be validated +cublasStatus_t cublasSetStream_v2(cublasHandle_t handle, cudaStream_t streamId) { + const char* func_name = "cublasSetStream_v2"; + HookLog(func_name); + using func_ptr = cublasStatus_t (*)(cublasHandle_t, cudaStream_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cublasSetStream_v2")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(uint64_t)); + RequestIOV reqBuf = RequestIOV(); + 
reqBuf.PushRequestType(CUBLAS_SET_STREAM_V2); + reqBuf.Push64BitPointer(handle); + reqBuf.Push64BitPointer(streamId); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUBLAS_STATUS_SUCCESS; + + //return func_entry(handle, streamId); +} + +cublasStatus_t cublasSetWorkspace_v2(cublasHandle_t handle, + void *workspace, size_t workspaceSizeInBytes) { + const char* func_name = "cublasSetWorkspace_v2"; + HookLog(func_name); + using func_ptr = cublasStatus_t (*)(cublasHandle_t, void *, size_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cublasSetWorkspace_v2")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(uint64_t) + sizeof(size_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUBLAS_SET_WORKSPACE_V2); + reqBuf.Push64BitPointer(handle); + reqBuf.Push64BitPointer(workspace); + reqBuf.Push(workspaceSizeInBytes); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUBLAS_STATUS_SUCCESS; + + //return func_entry(handle, workspace, workspaceSizeInBytes); +} + +cublasStatus_t cublasSetMathMode(cublasHandle_t handle, cublasMath_t mode) { + const char* func_name = "cublasSetMathMode"; + HookLog(func_name); + using func_ptr = cublasStatus_t (*)(cublasHandle_t, cublasMath_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cublasSetMathMode")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(cublasMath_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUBLAS_SET_MATH_MODE); + reqBuf.Push64BitPointer(handle); + reqBuf.Push(mode); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUBLAS_STATUS_SUCCESS; + + //return func_entry(handle, mode); +} + +cublasStatus_t cublasGetMathMode(cublasHandle_t handle, 
cublasMath_t* mode) { + const char* func_name = "cublasGetMathMode"; + HookLog(func_name); + using func_ptr = cublasStatus_t (*)(cublasHandle_t, cublasMath_t*); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cublasGetMathMode")); + + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUBLAS_GET_MATH_MODE); + reqBuf.Push64BitPointer(handle); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(mode); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + + return CUBLAS_STATUS_SUCCESS; + + //return func_entry(handle, mode); +} + diff --git a/GPU-Virtual-Service/gpu-remoting/src/client/cublasLtHook.cc b/GPU-Virtual-Service/gpu-remoting/src/client/cublasLtHook.cc new file mode 100644 index 0000000..d25c19f --- /dev/null +++ b/GPU-Virtual-Service/gpu-remoting/src/client/cublasLtHook.cc @@ -0,0 +1,306 @@ +#include "../../include/hook/hook.h" + +cublasStatus_t cublasLtCreate(cublasLtHandle_t *lightHandle) { + const char* func_name = "cublasLtCreate"; + HookLog(func_name); + using func_ptr = cublasStatus_t (*)(cublasLtHandle_t *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cublasLtCreate")); + + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUBLASLT_CREATE); + *lightHandle = NULL; + reqBuf.Push(lightHandle); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(lightHandle); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + + return CUBLAS_STATUS_SUCCESS; + + //return func_entry(lightHandle); +} + +cublasStatus_t cublasLtDestroy(cublasLtHandle_t lightHandle) { + const char* func_name = "cublasLtDestroy"; + HookLog(func_name); + using func_ptr = cublasStatus_t (*)(cublasLtHandle_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cublasLtDestroy")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUBLASLT_DESTROY); + reqBuf.Push64BitPointer(lightHandle); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // 
clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUBLAS_STATUS_SUCCESS; + + //return func_entry(lightHandle); +} + +cublasStatus_t cublasLtMatmulDescCreate(cublasLtMatmulDesc_t *matmulDesc, + cublasComputeType_t computeType, cudaDataType_t scaleType) { + const char* func_name = "cublasLtMatmulDescCreate"; + HookLog(func_name); + using func_ptr = cublasStatus_t (*)(cublasLtMatmulDesc_t *, cublasComputeType_t, cudaDataType_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cublasLtMatmulDescCreate")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(cublasComputeType_t) + sizeof(cudaDataType_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUBLASLT_MATMULDESC_CREATE); + *matmulDesc = NULL; + reqBuf.Push(matmulDesc); + reqBuf.Push(computeType); + reqBuf.Push(scaleType); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(matmulDesc); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + + return CUBLAS_STATUS_SUCCESS; + + //return func_entry(matmulDesc, computeType, scaleType); +} + +cublasStatus_t cublasLtMatmulDescDestroy(cublasLtMatmulDesc_t matmulDesc) { + const char* func_name = "cublasLtMatmulDescDestroy"; + HookLog(func_name); + using func_ptr = cublasStatus_t (*)(cublasLtMatmulDesc_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cublasLtMatmulDescDestroy")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUBLASLT_MATMULDESC_DESTROY); + reqBuf.Push64BitPointer(matmulDesc); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUBLAS_STATUS_SUCCESS; + + //return func_entry(matmulDesc); +} + +cublasStatus_t cublasLtMatmulDescSetAttribute(cublasLtMatmulDesc_t matmulDesc, + cublasLtMatmulDescAttributes_t attr, const void *buf, size_t sizeInBytes) { + const char* func_name = "cublasLtMatmulDescSetAttribute"; + HookLog(func_name); + 
using func_ptr = cublasStatus_t (*)(cublasLtMatmulDesc_t, cublasLtMatmulDescAttributes_t, const void *, size_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cublasLtMatmulDescSetAttribute")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(cublasLtMatmulDescAttributes_t) + sizeof(size_t)+sizeInBytes + sizeof(size_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUBLASLT_MATMULDESC_SETATTRIBUTE); + reqBuf.Push64BitPointer(matmulDesc); + reqBuf.Push(attr); + reqBuf.PushConst((uint8_t*)buf, sizeInBytes); + reqBuf.Push(sizeInBytes); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUBLAS_STATUS_SUCCESS; + + //return func_entry(matmulDesc, attr, buf, sizeInBytes); +} + +cublasStatus_t cublasLtMatrixLayoutCreate(cublasLtMatrixLayout_t *matLayout, cudaDataType type, + uint64_t rows, uint64_t cols, int64_t ld) { + const char* func_name = "cublasLtMatrixLayoutCreate"; + HookLog(func_name); + using func_ptr = cublasStatus_t (*)(cublasLtMatrixLayout_t *, cudaDataType, uint64_t, uint64_t, int64_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cublasLtMatrixLayoutCreate")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(cudaDataType) + sizeof(uint64_t) + sizeof(uint64_t) + sizeof(int64_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUBLASLT_MATRIX_LAYOUT_CREATE); + *matLayout = NULL; + reqBuf.Push(matLayout); + reqBuf.Push(type); + reqBuf.Push(rows); + reqBuf.Push(cols); + reqBuf.Push(ld); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(matLayout); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + + return CUBLAS_STATUS_SUCCESS; + + //return func_entry(matLayout, type, rows, cols, ld); +} + +cublasStatus_t cublasLtMatrixLayoutDestroy(cublasLtMatrixLayout_t matLayout) { + const char* func_name = "cublasLtMatrixLayoutDestroy"; + HookLog(func_name); + using func_ptr = cublasStatus_t 
(*)(cublasLtMatrixLayout_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cublasLtMatrixLayoutDestroy")); + tool::Logging(LOG_DEBUG, func_name, "matLayout: %p\n", matLayout); + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUBLASLT_MATRIX_LAYOUT_DESTROY); + reqBuf.Push64BitPointer(matLayout); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUBLAS_STATUS_SUCCESS; + + //return func_entry(matLayout); +} + +cublasStatus_t cublasLtMatrixLayoutSetAttribute(cublasLtMatrixLayout_t matLayout, + cublasLtMatrixLayoutAttribute_t attr, + const void *buf, size_t sizeInBytes) { + const char* func_name = "cublasLtMatrixLayoutSetAttribute"; + HookLog(func_name); + using func_ptr = cublasStatus_t (*)(cublasLtMatrixLayout_t, cublasLtMatrixLayoutAttribute_t, const void *, size_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cublasLtMatrixLayoutSetAttribute")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(cublasLtMatrixLayoutAttribute_t) + sizeof(size_t)+sizeInBytes + sizeof(size_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUBLASLT_MATRIX_LAYOUT_SETATTRIBUTE); + reqBuf.Push64BitPointer(matLayout); + reqBuf.Push(attr); + reqBuf.PushConst((uint8_t*)buf, sizeInBytes); + reqBuf.Push(sizeInBytes); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUBLAS_STATUS_SUCCESS; + + //return func_entry(matLayout, attr, buf, sizeInBytes); +} + +cublasStatus_t cublasLtMatmulPreferenceCreate(cublasLtMatmulPreference_t *pref) { + const char* func_name = "cublasLtMatmulPreferenceCreate"; + HookLog(func_name); + using func_ptr = cublasStatus_t (*)(cublasLtMatmulPreference_t *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cublasLtMatmulPreferenceCreate")); + + // 
RequestBuffer reqBuf = RequestBuffer(sizeof(int)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUBLASLT_MATMULPREFERENCE_CREATE); + *pref = NULL; + reqBuf.Push(pref); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(pref); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + + return CUBLAS_STATUS_SUCCESS; + + //return func_entry(pref); +} + +cublasStatus_t cublasLtMatmulPreferenceDestroy(cublasLtMatmulPreference_t pref) { + const char* func_name = "cublasLtMatmulPreferenceDestroy"; + HookLog(func_name); + using func_ptr = cublasStatus_t (*)(cublasLtMatmulPreference_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cublasLtMatmulPreferenceDestroy")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUBLASLT_MATMULPREFERENCE_DESTROY); + reqBuf.Push64BitPointer(pref); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUBLAS_STATUS_SUCCESS; + + //return func_entry(pref); +} + +cublasStatus_t cublasLtMatmulPreferenceSetAttribute(cublasLtMatmulPreference_t pref, + cublasLtMatmulPreferenceAttributes_t attr, const void *buf, size_t sizeInBytes) { + const char* func_name = "cublasLtMatmulPreferenceSetAttribute"; + HookLog(func_name); + using func_ptr = + cublasStatus_t (*)(cublasLtMatmulPreference_t, cublasLtMatmulPreferenceAttributes_t, const void *, size_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cublasLtMatmulPreferenceSetAttribute")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(cublasLtMatmulPreferenceAttributes_t) + sizeof(size_t)+sizeInBytes + sizeInBytes); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUBLASLT_MATMULPREFERENCE_SETATTRIBUTE); + reqBuf.Push64BitPointer(pref); + reqBuf.Push(attr); + reqBuf.PushConst((uint8_t*)buf, sizeInBytes); + reqBuf.Push(sizeInBytes); + // 
clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUBLAS_STATUS_SUCCESS; + + //return func_entry(pref, attr, buf, sizeInBytes); +} + +cublasStatus_t cublasLtMatmulAlgoGetHeuristic( + cublasLtHandle_t lightHandle, cublasLtMatmulDesc_t operationDesc, cublasLtMatrixLayout_t Adesc, + cublasLtMatrixLayout_t Bdesc, cublasLtMatrixLayout_t Cdesc, cublasLtMatrixLayout_t Ddesc, + cublasLtMatmulPreference_t preference, int requestedAlgoCount, + cublasLtMatmulHeuristicResult_t heuristicResultsArray[], int *returnAlgoCount) { + const char* func_name = "cublasLtMatmulAlgoGetHeuristic"; + HookLog(func_name); + using func_ptr = cublasStatus_t (*)(cublasLtHandle_t, cublasLtMatmulDesc_t, cublasLtMatrixLayout_t, + cublasLtMatrixLayout_t, cublasLtMatrixLayout_t, cublasLtMatrixLayout_t, + cublasLtMatmulPreference_t, int, cublasLtMatmulHeuristicResult_t[], int *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cublasLtMatmulAlgoGetHeuristic")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) * 7 + sizeof(int)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUBLASLT_MATMULALGO_GETHEURISTIC); + reqBuf.Push64BitPointer(lightHandle); + reqBuf.Push64BitPointer(operationDesc); + reqBuf.Push64BitPointer(Adesc); + reqBuf.Push64BitPointer(Bdesc); + reqBuf.Push64BitPointer(Cdesc); + reqBuf.Push64BitPointer(Ddesc); + reqBuf.Push64BitPointer(preference); + reqBuf.Push(requestedAlgoCount); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(returnAlgoCount); + resBuf.Push(heuristicResultsArray, requestedAlgoCount); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + + return CUBLAS_STATUS_SUCCESS; + + //return func_entry(lightHandle, operationDesc, Adesc, Bdesc, Cdesc, Ddesc, preference, requestedAlgoCount, heuristicResultsArray, returnAlgoCount); +} + +cublasStatus_t cublasLtMatmul( + cublasLtHandle_t lightHandle, cublasLtMatmulDesc_t computeDesc, const void *alpha, 
const void *A, + cublasLtMatrixLayout_t Adesc, const void *B, cublasLtMatrixLayout_t Bdesc, const void *beta, const void *C, + cublasLtMatrixLayout_t Cdesc, void *D, cublasLtMatrixLayout_t Ddesc, const cublasLtMatmulAlgo_t *algo, + void *workspace, size_t workspaceSizeInBytes, cudaStream_t stream) { + const char* func_name = "cublasLtMatmul"; + HookLog(func_name); + using func_ptr = + cublasStatus_t (*)(cublasLtHandle_t, cublasLtMatmulDesc_t, const void *, const void *, cublasLtMatrixLayout_t, + const void *, cublasLtMatrixLayout_t, const void *, const void *, cublasLtMatrixLayout_t, + void *, cublasLtMatrixLayout_t, const cublasLtMatmulAlgo_t *, void *, size_t, cudaStream_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cublasLtMatmul")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t)*2 + (sizeof(size_t)+sizeof(const float))*2 + sizeof(uint64_t)*8 + sizeof(size_t)+sizeof(const cublasLtMatmulAlgo_t) + sizeof(uint64_t) + sizeof(size_t) + sizeof(uint64_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUBLASLT_MATMUL); + reqBuf.Push64BitPointer(lightHandle); + reqBuf.Push64BitPointer(computeDesc); + reqBuf.PushConst((const float*)alpha); //todo: device or host + reqBuf.Push64BitPointer(A); // device pointer + reqBuf.Push64BitPointer(Adesc); + reqBuf.Push64BitPointer(B); // device pointer + reqBuf.Push64BitPointer(Bdesc); + reqBuf.PushConst((const float*)beta); //todo: device or host + reqBuf.Push64BitPointer(C); // device pointer + reqBuf.Push64BitPointer(Cdesc); + reqBuf.Push64BitPointer(D); // device pointer + reqBuf.Push64BitPointer(Ddesc); + reqBuf.PushConst(algo); + reqBuf.Push64BitPointer(workspace); // device pointer + reqBuf.Push(workspaceSizeInBytes); + reqBuf.Push64BitPointer(stream); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUBLAS_STATUS_SUCCESS; + + //return func_entry(lightHandle, computeDesc, alpha, A, Adesc, B, 
Bdesc, beta, C, Cdesc, D, Ddesc, algo, workspace,workspaceSizeInBytes, stream); +} \ No newline at end of file diff --git a/GPU-Virtual-Service/gpu-remoting/src/client/cudaHook.cc b/GPU-Virtual-Service/gpu-remoting/src/client/cudaHook.cc new file mode 100644 index 0000000..4421776 --- /dev/null +++ b/GPU-Virtual-Service/gpu-remoting/src/client/cudaHook.cc @@ -0,0 +1,123 @@ +#include "../../include/hook/hook.h" +#include <dlfcn.h> // NOTE(review): header name was missing here; dlfcn.h chosen because this file calls dlopen, dlsym and dlerror. Please confirm. + +void* get_cuda_handle() { // Lazily dlopen()s libcuda.so and caches the handle in a function-local static; returns nullptr (after logging dlerror()) when the load fails, and the load is retried on the next call. + static void* handle = nullptr; + if (!handle) { + handle = dlopen("/usr/lib/x86_64-linux-gnu/libcuda.so", RTLD_NOW | RTLD_LOCAL); + if (!handle) { + tool::Logging(LOG_ERROR, HOOK_LOG_TAG, "Failed to load libcuda.so: %s\n", dlerror()); // %s is required so the dlerror() text is actually printed + } + } + return handle; +} + +CUresult cuInit(unsigned int Flags) { + const char* func_name = "cuInit"; + // HookLog(func_name); + using func_ptr = CUresult (*)(unsigned int); + auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name)); + return func_entry(Flags); +} + +CUresult cuDeviceGetCount(int *count) { + const char* func_name = "cuDeviceGetCount"; + // HookLog(func_name); + using func_ptr = CUresult (*)(int * ); + auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name)); + return func_entry(count); +} + +CUresult cuModuleLoadData(CUmodule *module, const void *image) { + const char* func_name = "cuModuleLoadData"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUmodule * , const void * ); + auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name)); + return func_entry(module, image); +} + +CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name) { + const char* func_name = "cuModuleGetFunction"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUfunction * , CUmodule, const char * ); + auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name)); + return func_entry(hfunc, hmod, name); +} + +CUresult cuOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, CUfunction func, int blockSize, 
size_t dynamicSMemSize) { + const char* func_name = "cuOccupancyMaxActiveBlocksPerMultiprocessor"; + HookLog(func_name); + using func_ptr = CUresult (*)(int * , CUfunction, int, size_t); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(numBlocks, func, blockSize, dynamicSMemSize); +} + +CUresult cuGetErrorString(CUresult error, const char **pStr) { + const char* func_name = "cuGetErrorString"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUresult, const char **); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + *pStr = "CUDA_SUCCESS"; + return CUDA_SUCCESS; + // return func_entry(error, pStr); +} + +CUresult cuCtxGetCurrent(CUcontext *pctx) { + const char* func_name = "cuCtxGetCurrent"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUcontext * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pctx); +} + +CUresult cuModuleUnload(CUmodule hmod) { + const char* func_name = "cuModuleUnload"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUmodule); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hmod); +} + +CUresult cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags, int *active) { + const char* func_name = "cuDevicePrimaryCtxGetState"; + HookLog(func_name, false); + using func_ptr = CUresult (*)(CUdevice, unsigned int * , int * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + *flags = 0; + *active = 1; + tool::Logging(LOG_DEBUG, func_name, "dev: %d\n", dev); + return CUDA_SUCCESS; + // return func_entry(dev, flags, active); +} + +CUresult cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut) { + const char* func_name = "cuLinkCreate"; + HookLog(func_name); + using func_ptr = CUresult (*)(unsigned int, CUjit_option * , void * * , CUlinkState * ); + auto func_entry = 
reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(numOptions, options, optionValues, stateOut); +} + +CUresult cuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut) { + const char* func_name = "cuLinkComplete"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUlinkState, void * * , size_t * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(state, cubinOut, sizeOut); +} + +CUresult cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value) { + const char* func_name = "cuFuncSetAttribute"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUfunction, CUfunction_attribute, int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hfunc, attrib, value); +} + +CUresult cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc) { + const char* func_name = "cuFuncGetAttribute"; + HookLog(func_name); + using func_ptr = CUresult (*)(int * , CUfunction_attribute, CUfunction); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pi, attrib, hfunc); +} diff --git a/GPU-Virtual-Service/gpu-remoting/src/client/cudaHook.cc.tmp b/GPU-Virtual-Service/gpu-remoting/src/client/cudaHook.cc.tmp new file mode 100644 index 0000000..9fff9cd --- /dev/null +++ b/GPU-Virtual-Service/gpu-remoting/src/client/cudaHook.cc.tmp @@ -0,0 +1,3013 @@ +#include "../../include/hook/hook.h" + +CUresult cuGetErrorString(CUresult error, const char **pStr) { + const char* func_name = "cuGetErrorString"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUresult, const char * * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(error, pStr); +} + +CUresult cuGetErrorName(CUresult error, const char **pStr) { + const char* func_name = "cuGetErrorName"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUresult, const char * * ); + auto 
func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(error, pStr); +} + +CUresult cuInit(unsigned int Flags) { + const char* func_name = "cuInit"; + HookLog(func_name); + using func_ptr = CUresult (*)(unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + //return func_entry(Flags); + return CUDA_SUCCESS; +} + +CUresult cuDriverGetVersion(int *driverVersion) { + const char* func_name = "cuDriverGetVersion"; + HookLog(func_name); + using func_ptr = CUresult (*)(int * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(driverVersion); +} + +CUresult cuDeviceGet(CUdevice *device, int ordinal) { + const char* func_name = "cuDeviceGet"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdevice * , int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(device, ordinal); +} + +CUresult cuDeviceGetCount(int *count) { + const char* func_name = "cuDeviceGetCount"; + HookLog(func_name); + using func_ptr = CUresult (*)(int * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + // return func_entry(count); + *count = 0; + return CUDA_SUCCESS; +} + +CUresult cuDeviceGetName(char *name, int len, CUdevice dev) { + const char* func_name = "cuDeviceGetName"; + HookLog(func_name); + using func_ptr = CUresult (*)(char * , int, CUdevice); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(name, len, dev); +} + +CUresult cuDeviceGetUuid(CUuuid *uuid, CUdevice dev) { + const char* func_name = "cuDeviceGetUuid"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUuuid * , CUdevice); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(uuid, dev); +} + +CUresult cuDeviceGetUuid_v2(CUuuid *uuid, CUdevice dev) { + const char* func_name = "cuDeviceGetUuid_v2"; + HookLog(func_name); + using func_ptr = CUresult 
(*)(CUuuid * , CUdevice); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(uuid, dev); +} + +CUresult cuDeviceGetLuid(char *luid, unsigned int *deviceNodeMask, CUdevice dev) { + const char* func_name = "cuDeviceGetLuid"; + HookLog(func_name); + using func_ptr = CUresult (*)(char * , unsigned int * , CUdevice); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(luid, deviceNodeMask, dev); +} + +CUresult cuDeviceTotalMem(size_t *bytes, CUdevice dev) { + const char* func_name = "cuDeviceTotalMem"; + HookLog(func_name); + using func_ptr = CUresult (*)(size_t * , CUdevice); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(bytes, dev); +} + +CUresult cuDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements, CUarray_format format, unsigned numChannels, CUdevice dev) { + const char* func_name = "cuDeviceGetTexture1DLinearMaxWidth"; + HookLog(func_name); + using func_ptr = CUresult (*)(size_t * , CUarray_format, unsigned, CUdevice); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(maxWidthInElements, format, numChannels, dev); +} + +CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev) { + const char* func_name = "cuDeviceGetAttribute"; + HookLog(func_name); + using func_ptr = CUresult (*)(int * , CUdevice_attribute, CUdevice); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pi, attrib, dev); +} + +CUresult cuDeviceGetNvSciSyncAttributes(void *nvSciSyncAttrList, CUdevice dev, int flags) { + const char* func_name = "cuDeviceGetNvSciSyncAttributes"; + HookLog(func_name); + using func_ptr = CUresult (*)(void * , CUdevice, int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(nvSciSyncAttrList, dev, flags); +} + +CUresult cuDeviceSetMemPool(CUdevice dev, CUmemoryPool pool) { + 
const char* func_name = "cuDeviceSetMemPool"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdevice, CUmemoryPool); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dev, pool); +} + +CUresult cuDeviceGetMemPool(CUmemoryPool *pool, CUdevice dev) { + const char* func_name = "cuDeviceGetMemPool"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUmemoryPool * , CUdevice); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pool, dev); +} + +CUresult cuDeviceGetDefaultMemPool(CUmemoryPool *pool_out, CUdevice dev) { + const char* func_name = "cuDeviceGetDefaultMemPool"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUmemoryPool * , CUdevice); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pool_out, dev); +} + +CUresult cuFlushGPUDirectRDMAWrites(CUflushGPUDirectRDMAWritesTarget target, CUflushGPUDirectRDMAWritesScope scope) { + const char* func_name = "cuFlushGPUDirectRDMAWrites"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUflushGPUDirectRDMAWritesTarget, CUflushGPUDirectRDMAWritesScope); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(target, scope); +} + +CUresult cuDeviceGetProperties(CUdevprop *prop, CUdevice dev) { + const char* func_name = "cuDeviceGetProperties"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdevprop * , CUdevice); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(prop, dev); +} + +CUresult cuDeviceComputeCapability(int *major, int *minor, CUdevice dev) { + const char* func_name = "cuDeviceComputeCapability"; + HookLog(func_name); + using func_ptr = CUresult (*)(int * , int * , CUdevice); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(major, minor, dev); +} + +CUresult cuDevicePrimaryCtxRetain(CUcontext *pctx, CUdevice dev) { + 
const char* func_name = "cuDevicePrimaryCtxRetain"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUcontext * , CUdevice); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pctx, dev); +} + +CUresult cuDevicePrimaryCtxRelease(CUdevice dev) { + const char* func_name = "cuDevicePrimaryCtxRelease"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdevice); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dev); +} + +CUresult cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags) { + const char* func_name = "cuDevicePrimaryCtxSetFlags"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdevice, unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dev, flags); +} + +CUresult cuDevicePrimaryCtxGetState(CUdevice dev, unsigned int *flags, int *active) { + const char* func_name = "cuDevicePrimaryCtxGetState"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdevice, unsigned int * , int * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dev, flags, active); +} + +CUresult cuDevicePrimaryCtxReset(CUdevice dev) { + const char* func_name = "cuDevicePrimaryCtxReset"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdevice); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dev); +} + +CUresult cuDeviceGetExecAffinitySupport(int *pi, CUexecAffinityType type, CUdevice dev) { + const char* func_name = "cuDeviceGetExecAffinitySupport"; + HookLog(func_name); + using func_ptr = CUresult (*)(int * , CUexecAffinityType, CUdevice); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pi, type, dev); +} + +CUresult cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev) { + const char* func_name = "cuCtxCreate"; + HookLog(func_name); + using func_ptr = CUresult 
(*)(CUcontext * , unsigned int, CUdevice); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pctx, flags, dev); +} + +CUresult cuCtxCreate_v3(CUcontext *pctx, CUexecAffinityParam *paramsArray, int numParams, unsigned int flags, CUdevice dev) { + const char* func_name = "cuCtxCreate_v3"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUcontext * , CUexecAffinityParam * , int, unsigned int, CUdevice); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pctx, paramsArray, numParams, flags, dev); +} + +CUresult cuCtxDestroy(CUcontext ctx) { + const char* func_name = "cuCtxDestroy"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUcontext); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(ctx); +} + +CUresult cuCtxPushCurrent(CUcontext ctx) { + const char* func_name = "cuCtxPushCurrent"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUcontext); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(ctx); +} + +CUresult cuCtxPopCurrent(CUcontext *pctx) { + const char* func_name = "cuCtxPopCurrent"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUcontext * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pctx); +} + +CUresult cuCtxSetCurrent(CUcontext ctx) { + const char* func_name = "cuCtxSetCurrent"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUcontext); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(ctx); +} + +CUresult cuCtxGetCurrent(CUcontext *pctx) { + const char* func_name = "cuCtxGetCurrent"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUcontext * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pctx); +} + +CUresult cuCtxGetDevice(CUdevice *device) { + const char* func_name = "cuCtxGetDevice"; + 
HookLog(func_name); + using func_ptr = CUresult (*)(CUdevice * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(device); +} + +CUresult cuCtxGetFlags(unsigned int *flags) { + const char* func_name = "cuCtxGetFlags"; + HookLog(func_name); + using func_ptr = CUresult (*)(unsigned int * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(flags); +} + +CUresult cuCtxSynchronize() { + const char* func_name = "cuCtxSynchronize"; + HookLog(func_name); + using func_ptr = CUresult (*)(); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(); +} + +CUresult cuCtxSetLimit(CUlimit limit, size_t value) { + const char* func_name = "cuCtxSetLimit"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUlimit, size_t); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(limit, value); +} + +CUresult cuCtxGetLimit(size_t *pvalue, CUlimit limit) { + const char* func_name = "cuCtxGetLimit"; + HookLog(func_name); + using func_ptr = CUresult (*)(size_t * , CUlimit); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pvalue, limit); +} + +CUresult cuCtxGetCacheConfig(CUfunc_cache *pconfig) { + const char* func_name = "cuCtxGetCacheConfig"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUfunc_cache * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pconfig); +} + +CUresult cuCtxSetCacheConfig(CUfunc_cache config) { + const char* func_name = "cuCtxSetCacheConfig"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUfunc_cache); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(config); +} + +CUresult cuCtxGetSharedMemConfig(CUsharedconfig *pConfig) { + const char* func_name = "cuCtxGetSharedMemConfig"; + HookLog(func_name); + using func_ptr = CUresult 
(*)(CUsharedconfig * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pConfig); +} + +CUresult cuCtxSetSharedMemConfig(CUsharedconfig config) { + const char* func_name = "cuCtxSetSharedMemConfig"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUsharedconfig); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(config); +} + +CUresult cuCtxGetApiVersion(CUcontext ctx, unsigned int *version) { + const char* func_name = "cuCtxGetApiVersion"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUcontext, unsigned int * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(ctx, version); +} + +CUresult cuCtxGetStreamPriorityRange(int *leastPriority, int *greatestPriority) { + const char* func_name = "cuCtxGetStreamPriorityRange"; + HookLog(func_name); + using func_ptr = CUresult (*)(int * , int * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(leastPriority, greatestPriority); +} + +CUresult cuCtxResetPersistingL2Cache() { + const char* func_name = "cuCtxResetPersistingL2Cache"; + HookLog(func_name); + using func_ptr = CUresult (*)(); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(); +} + +CUresult cuCtxGetExecAffinity(CUexecAffinityParam *pExecAffinity, CUexecAffinityType type) { + const char* func_name = "cuCtxGetExecAffinity"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUexecAffinityParam * , CUexecAffinityType); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pExecAffinity, type); +} + +CUresult cuCtxAttach(CUcontext *pctx, unsigned int flags) { + const char* func_name = "cuCtxAttach"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUcontext * , unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pctx, 
flags); +} + +CUresult cuCtxDetach(CUcontext ctx) { + const char* func_name = "cuCtxDetach"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUcontext); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(ctx); +} + +CUresult cuModuleLoad(CUmodule *module, const char *fname) { + const char* func_name = "cuModuleLoad"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUmodule * , const char * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(module, fname); +} + +CUresult cuModuleLoadData(CUmodule *module, const void *image) { + const char* func_name = "cuModuleLoadData"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUmodule * , const void * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(module, image); +} + +CUresult cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues) { + const char* func_name = "cuModuleLoadDataEx"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUmodule * , const void * , unsigned int, CUjit_option * , void * * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(module, image, numOptions, options, optionValues); +} + +CUresult cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin) { + const char* func_name = "cuModuleLoadFatBinary"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUmodule * , const void * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(module, fatCubin); +} + +CUresult cuModuleUnload(CUmodule hmod) { + const char* func_name = "cuModuleUnload"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUmodule); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hmod); +} + +CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char 
*name) { + const char* func_name = "cuModuleGetFunction"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUfunction * , CUmodule, const char * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hfunc, hmod, name); +} + +CUresult cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name) { + const char* func_name = "cuModuleGetGlobal"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr * , size_t * , CUmodule, const char * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dptr, bytes, hmod, name); +} + +CUresult cuModuleGetTexRef(CUtexref *pTexRef, CUmodule hmod, const char *name) { + const char* func_name = "cuModuleGetTexRef"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUtexref * , CUmodule, const char * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pTexRef, hmod, name); +} + +CUresult cuModuleGetSurfRef(CUsurfref *pSurfRef, CUmodule hmod, const char *name) { + const char* func_name = "cuModuleGetSurfRef"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUsurfref * , CUmodule, const char * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pSurfRef, hmod, name); +} + +CUresult cuLinkCreate(unsigned int numOptions, CUjit_option *options, void **optionValues, CUlinkState *stateOut) { + const char* func_name = "cuLinkCreate"; + HookLog(func_name); + using func_ptr = CUresult (*)(unsigned int, CUjit_option * , void * * , CUlinkState * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(numOptions, options, optionValues, stateOut); +} + +CUresult cuLinkAddData(CUlinkState state, CUjitInputType type, void *data, size_t size, const char *name, unsigned int numOptions, CUjit_option *options, void **optionValues) { + const char* func_name = "cuLinkAddData"; + 
HookLog(func_name); + using func_ptr = CUresult (*)(CUlinkState, CUjitInputType, void * , size_t, const char * , unsigned int, CUjit_option * , void * * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(state, type, data, size, name, numOptions, options, optionValues); +} + +CUresult cuLinkAddFile(CUlinkState state, CUjitInputType type, const char *path, unsigned int numOptions, CUjit_option *options, void **optionValues) { + const char* func_name = "cuLinkAddFile"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUlinkState, CUjitInputType, const char * , unsigned int, CUjit_option * , void * * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(state, type, path, numOptions, options, optionValues); +} + +CUresult cuLinkComplete(CUlinkState state, void **cubinOut, size_t *sizeOut) { + const char* func_name = "cuLinkComplete"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUlinkState, void * * , size_t * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(state, cubinOut, sizeOut); +} + +CUresult cuLinkDestroy(CUlinkState state) { + const char* func_name = "cuLinkDestroy"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUlinkState); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(state); +} + +CUresult cuMemGetInfo(size_t *free, size_t *total) { + const char* func_name = "cuMemGetInfo"; + HookLog(func_name); + using func_ptr = CUresult (*)(size_t * , size_t * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(free, total); +} + +CUresult cuMemAlloc(CUdeviceptr *dptr, size_t bytesize) { + const char* func_name = "cuMemAlloc"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr * , size_t); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dptr, bytesize); +} 
+ +CUresult cuMemAllocPitch(CUdeviceptr *dptr, size_t *pPitch, size_t WidthInBytes, size_t Height, unsigned int ElementSizeBytes) { + const char* func_name = "cuMemAllocPitch"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr * , size_t * , size_t, size_t, unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dptr, pPitch, WidthInBytes, Height, ElementSizeBytes); +} + +CUresult cuMemFree(CUdeviceptr dptr) { + const char* func_name = "cuMemFree"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dptr); +} + +CUresult cuMemGetAddressRange(CUdeviceptr *pbase, size_t *psize, CUdeviceptr dptr) { + const char* func_name = "cuMemGetAddressRange"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr * , size_t * , CUdeviceptr); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pbase, psize, dptr); +} + +CUresult cuMemAllocHost(void **pp, size_t bytesize) { + const char* func_name = "cuMemAllocHost"; + HookLog(func_name); + using func_ptr = CUresult (*)(void * * , size_t); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pp, bytesize); +} + +CUresult cuMemFreeHost(void *p) { + const char* func_name = "cuMemFreeHost"; + HookLog(func_name); + using func_ptr = CUresult (*)(void * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(p); +} + +CUresult cuMemHostAlloc(void **pp, size_t bytesize, unsigned int Flags) { + const char* func_name = "cuMemHostAlloc"; + HookLog(func_name); + using func_ptr = CUresult (*)(void * * , size_t, unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pp, bytesize, Flags); +} + +CUresult cuMemHostGetDevicePointer(CUdeviceptr *pdptr, void *p, unsigned int Flags) { 
+ const char* func_name = "cuMemHostGetDevicePointer"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr * , void * , unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pdptr, p, Flags); +} + +CUresult cuMemHostGetFlags(unsigned int *pFlags, void *p) { + const char* func_name = "cuMemHostGetFlags"; + HookLog(func_name); + using func_ptr = CUresult (*)(unsigned int * , void * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pFlags, p); +} + +CUresult cuMemAllocManaged(CUdeviceptr *dptr, size_t bytesize, unsigned int flags) { + const char* func_name = "cuMemAllocManaged"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr * , size_t, unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dptr, bytesize, flags); +} + +CUresult cuDeviceGetByPCIBusId(CUdevice *dev, const char *pciBusId) { + const char* func_name = "cuDeviceGetByPCIBusId"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdevice * , const char * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dev, pciBusId); +} + +CUresult cuDeviceGetPCIBusId(char *pciBusId, int len, CUdevice dev) { + const char* func_name = "cuDeviceGetPCIBusId"; + HookLog(func_name); + using func_ptr = CUresult (*)(char * , int, CUdevice); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pciBusId, len, dev); +} + +CUresult cuIpcGetEventHandle(CUipcEventHandle *pHandle, CUevent event) { + const char* func_name = "cuIpcGetEventHandle"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUipcEventHandle * , CUevent); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pHandle, event); +} + +CUresult cuIpcOpenEventHandle(CUevent *phEvent, CUipcEventHandle handle) { + const char* func_name = 
"cuIpcOpenEventHandle"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUevent * , CUipcEventHandle); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(phEvent, handle); +} + +CUresult cuIpcGetMemHandle(CUipcMemHandle *pHandle, CUdeviceptr dptr) { + const char* func_name = "cuIpcGetMemHandle"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUipcMemHandle * , CUdeviceptr); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pHandle, dptr); +} + +CUresult cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags) { + const char* func_name = "cuIpcOpenMemHandle"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr * , CUipcMemHandle, unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pdptr, handle, Flags); +} + +CUresult cuIpcCloseMemHandle(CUdeviceptr dptr) { + const char* func_name = "cuIpcCloseMemHandle"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dptr); +} + +CUresult cuMemHostRegister(void *p, size_t bytesize, unsigned int Flags) { + const char* func_name = "cuMemHostRegister"; + HookLog(func_name); + using func_ptr = CUresult (*)(void * , size_t, unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(p, bytesize, Flags); +} + +CUresult cuMemHostUnregister(void *p) { + const char* func_name = "cuMemHostUnregister"; + HookLog(func_name); + using func_ptr = CUresult (*)(void * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(p); +} + +CUresult cuMemcpy(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount) { + const char* func_name = "cuMemcpy"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr, CUdeviceptr, size_t); + auto func_entry = 
reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dst, src, ByteCount); +} + +CUresult cuMemcpyPeer(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount) { + const char* func_name = "cuMemcpyPeer"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dstDevice, dstContext, srcDevice, srcContext, ByteCount); +} + +CUresult cuMemcpyHtoD(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount) { + const char* func_name = "cuMemcpyHtoD"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr, const void * , size_t); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dstDevice, srcHost, ByteCount); +} + +CUresult cuMemcpyDtoH(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount) { + const char* func_name = "cuMemcpyDtoH"; + HookLog(func_name); + using func_ptr = CUresult (*)(void * , CUdeviceptr, size_t); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dstHost, srcDevice, ByteCount); +} + +CUresult cuMemcpyDtoD(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount) { + const char* func_name = "cuMemcpyDtoD"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr, CUdeviceptr, size_t); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dstDevice, srcDevice, ByteCount); +} + +CUresult cuMemcpyDtoA(CUarray dstArray, size_t dstOffset, CUdeviceptr srcDevice, size_t ByteCount) { + const char* func_name = "cuMemcpyDtoA"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUarray, size_t, CUdeviceptr, size_t); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dstArray, dstOffset, srcDevice, ByteCount); +} + +CUresult 
cuMemcpyAtoD(CUdeviceptr dstDevice, CUarray srcArray, size_t srcOffset, size_t ByteCount) { + const char* func_name = "cuMemcpyAtoD"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr, CUarray, size_t, size_t); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dstDevice, srcArray, srcOffset, ByteCount); +} + +CUresult cuMemcpyHtoA(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount) { + const char* func_name = "cuMemcpyHtoA"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUarray, size_t, const void * , size_t); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dstArray, dstOffset, srcHost, ByteCount); +} + +CUresult cuMemcpyAtoH(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount) { + const char* func_name = "cuMemcpyAtoH"; + HookLog(func_name); + using func_ptr = CUresult (*)(void * , CUarray, size_t, size_t); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dstHost, srcArray, srcOffset, ByteCount); +} + +CUresult cuMemcpyAtoA(CUarray dstArray, size_t dstOffset, CUarray srcArray, size_t srcOffset, size_t ByteCount) { + const char* func_name = "cuMemcpyAtoA"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUarray, size_t, CUarray, size_t, size_t); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dstArray, dstOffset, srcArray, srcOffset, ByteCount); +} + +CUresult cuMemcpy2D(const CUDA_MEMCPY2D *pCopy) { + const char* func_name = "cuMemcpy2D"; + HookLog(func_name); + using func_ptr = CUresult (*)(const CUDA_MEMCPY2D * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pCopy); +} + +CUresult cuMemcpy2DUnaligned(const CUDA_MEMCPY2D *pCopy) { + const char* func_name = "cuMemcpy2DUnaligned"; + HookLog(func_name); + using func_ptr = CUresult (*)(const CUDA_MEMCPY2D * ); + auto 
func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pCopy); +} + +CUresult cuMemcpy3D(const CUDA_MEMCPY3D *pCopy) { + const char* func_name = "cuMemcpy3D"; + HookLog(func_name); + using func_ptr = CUresult (*)(const CUDA_MEMCPY3D * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pCopy); +} + +CUresult cuMemcpy3DPeer(const CUDA_MEMCPY3D_PEER *pCopy) { + const char* func_name = "cuMemcpy3DPeer"; + HookLog(func_name); + using func_ptr = CUresult (*)(const CUDA_MEMCPY3D_PEER * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pCopy); +} + +CUresult cuMemcpyAsync(CUdeviceptr dst, CUdeviceptr src, size_t ByteCount, CUstream hStream) { + const char* func_name = "cuMemcpyAsync"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr, CUdeviceptr, size_t, CUstream); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dst, src, ByteCount, hStream); +} + +CUresult cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, size_t ByteCount, CUstream hStream) { + const char* func_name = "cuMemcpyPeerAsync"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr, CUcontext, CUdeviceptr, CUcontext, size_t, CUstream); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dstDevice, dstContext, srcDevice, srcContext, ByteCount, hStream); +} + +CUresult cuMemcpyHtoDAsync(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream) { + const char* func_name = "cuMemcpyHtoDAsync"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr, const void * , size_t, CUstream); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dstDevice, srcHost, ByteCount, hStream); +} + +CUresult cuMemcpyDtoHAsync(void *dstHost, CUdeviceptr srcDevice, 
size_t ByteCount, CUstream hStream) { + const char* func_name = "cuMemcpyDtoHAsync"; + HookLog(func_name); + using func_ptr = CUresult (*)(void * , CUdeviceptr, size_t, CUstream); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dstHost, srcDevice, ByteCount, hStream); +} + +CUresult cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream) { + const char* func_name = "cuMemcpyDtoDAsync"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr, CUdeviceptr, size_t, CUstream); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dstDevice, srcDevice, ByteCount, hStream); +} + +CUresult cuMemcpyHtoAAsync(CUarray dstArray, size_t dstOffset, const void *srcHost, size_t ByteCount, CUstream hStream) { + const char* func_name = "cuMemcpyHtoAAsync"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUarray, size_t, const void * , size_t, CUstream); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dstArray, dstOffset, srcHost, ByteCount, hStream); +} + +CUresult cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, size_t srcOffset, size_t ByteCount, CUstream hStream) { + const char* func_name = "cuMemcpyAtoHAsync"; + HookLog(func_name); + using func_ptr = CUresult (*)(void * , CUarray, size_t, size_t, CUstream); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dstHost, srcArray, srcOffset, ByteCount, hStream); +} + +CUresult cuMemcpy2DAsync(const CUDA_MEMCPY2D *pCopy, CUstream hStream) { + const char* func_name = "cuMemcpy2DAsync"; + HookLog(func_name); + using func_ptr = CUresult (*)(const CUDA_MEMCPY2D * , CUstream); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pCopy, hStream); +} + +CUresult cuMemcpy3DAsync(const CUDA_MEMCPY3D *pCopy, CUstream hStream) { + const char* func_name = 
"cuMemcpy3DAsync"; + HookLog(func_name); + using func_ptr = CUresult (*)(const CUDA_MEMCPY3D * , CUstream); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pCopy, hStream); +} + +CUresult cuMemcpy3DPeerAsync(const CUDA_MEMCPY3D_PEER *pCopy, CUstream hStream) { + const char* func_name = "cuMemcpy3DPeerAsync"; + HookLog(func_name); + using func_ptr = CUresult (*)(const CUDA_MEMCPY3D_PEER * , CUstream); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pCopy, hStream); +} + +CUresult cuMemsetD8(CUdeviceptr dstDevice, unsigned char uc, size_t N) { + const char* func_name = "cuMemsetD8"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr, unsigned char, size_t); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dstDevice, uc, N); +} + +CUresult cuMemsetD16(CUdeviceptr dstDevice, unsigned short us, size_t N) { + const char* func_name = "cuMemsetD16"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr, unsigned short, size_t); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dstDevice, us, N); +} + +CUresult cuMemsetD32(CUdeviceptr dstDevice, unsigned int ui, size_t N) { + const char* func_name = "cuMemsetD32"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr, unsigned int, size_t); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dstDevice, ui, N); +} + +CUresult cuMemsetD2D8(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height) { + const char* func_name = "cuMemsetD2D8"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr, size_t, unsigned char, size_t, size_t); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dstDevice, dstPitch, uc, Width, Height); +} + +CUresult cuMemsetD2D16(CUdeviceptr dstDevice, size_t 
dstPitch, unsigned short us, size_t Width, size_t Height) { + const char* func_name = "cuMemsetD2D16"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr, size_t, unsigned short, size_t, size_t); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dstDevice, dstPitch, us, Width, Height); +} + +CUresult cuMemsetD2D32(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height) { + const char* func_name = "cuMemsetD2D32"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr, size_t, unsigned int, size_t, size_t); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dstDevice, dstPitch, ui, Width, Height); +} + +CUresult cuMemsetD8Async(CUdeviceptr dstDevice, unsigned char uc, size_t N, CUstream hStream) { + const char* func_name = "cuMemsetD8Async"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr, unsigned char, size_t, CUstream); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dstDevice, uc, N, hStream); +} + +CUresult cuMemsetD16Async(CUdeviceptr dstDevice, unsigned short us, size_t N, CUstream hStream) { + const char* func_name = "cuMemsetD16Async"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr, unsigned short, size_t, CUstream); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dstDevice, us, N, hStream); +} + +CUresult cuMemsetD32Async(CUdeviceptr dstDevice, unsigned int ui, size_t N, CUstream hStream) { + const char* func_name = "cuMemsetD32Async"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr, unsigned int, size_t, CUstream); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dstDevice, ui, N, hStream); +} + +CUresult cuMemsetD2D8Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned char uc, size_t Width, size_t Height, CUstream 
hStream) { + const char* func_name = "cuMemsetD2D8Async"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr, size_t, unsigned char, size_t, size_t, CUstream); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dstDevice, dstPitch, uc, Width, Height, hStream); +} + +CUresult cuMemsetD2D16Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned short us, size_t Width, size_t Height, CUstream hStream) { + const char* func_name = "cuMemsetD2D16Async"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr, size_t, unsigned short, size_t, size_t, CUstream); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dstDevice, dstPitch, us, Width, Height, hStream); +} + +CUresult cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsigned int ui, size_t Width, size_t Height, CUstream hStream) { + const char* func_name = "cuMemsetD2D32Async"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr, size_t, unsigned int, size_t, size_t, CUstream); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dstDevice, dstPitch, ui, Width, Height, hStream); +} + +CUresult cuArrayCreate(CUarray *pHandle, const CUDA_ARRAY_DESCRIPTOR *pAllocateArray) { + const char* func_name = "cuArrayCreate"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUarray * , const CUDA_ARRAY_DESCRIPTOR * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pHandle, pAllocateArray); +} + +CUresult cuArrayGetDescriptor(CUDA_ARRAY_DESCRIPTOR *pArrayDescriptor, CUarray hArray) { + const char* func_name = "cuArrayGetDescriptor"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUDA_ARRAY_DESCRIPTOR * , CUarray); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pArrayDescriptor, hArray); +} + +CUresult 
cuArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUarray array) { + const char* func_name = "cuArrayGetSparseProperties"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUDA_ARRAY_SPARSE_PROPERTIES * , CUarray); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(sparseProperties, array); +} + +CUresult cuMipmappedArrayGetSparseProperties(CUDA_ARRAY_SPARSE_PROPERTIES *sparseProperties, CUmipmappedArray mipmap) { + const char* func_name = "cuMipmappedArrayGetSparseProperties"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUDA_ARRAY_SPARSE_PROPERTIES * , CUmipmappedArray); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(sparseProperties, mipmap); +} + +CUresult cuArrayGetMemoryRequirements(CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements, CUarray array, CUdevice device) { + const char* func_name = "cuArrayGetMemoryRequirements"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUDA_ARRAY_MEMORY_REQUIREMENTS * , CUarray, CUdevice); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(memoryRequirements, array, device); +} + +CUresult cuMipmappedArrayGetMemoryRequirements(CUDA_ARRAY_MEMORY_REQUIREMENTS *memoryRequirements, CUmipmappedArray mipmap, CUdevice device) { + const char* func_name = "cuMipmappedArrayGetMemoryRequirements"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUDA_ARRAY_MEMORY_REQUIREMENTS * , CUmipmappedArray, CUdevice); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(memoryRequirements, mipmap, device); +} + +CUresult cuArrayGetPlane(CUarray *pPlaneArray, CUarray hArray, unsigned int planeIdx) { + const char* func_name = "cuArrayGetPlane"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUarray * , CUarray, unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return 
func_entry(pPlaneArray, hArray, planeIdx); +} + +CUresult cuArrayDestroy(CUarray hArray) { + const char* func_name = "cuArrayDestroy"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUarray); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hArray); +} + +CUresult cuArray3DCreate(CUarray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pAllocateArray) { + const char* func_name = "cuArray3DCreate"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUarray * , const CUDA_ARRAY3D_DESCRIPTOR * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pHandle, pAllocateArray); +} + +CUresult cuArray3DGetDescriptor(CUDA_ARRAY3D_DESCRIPTOR *pArrayDescriptor, CUarray hArray) { + const char* func_name = "cuArray3DGetDescriptor"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUDA_ARRAY3D_DESCRIPTOR * , CUarray); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pArrayDescriptor, hArray); +} + +CUresult cuMipmappedArrayCreate(CUmipmappedArray *pHandle, const CUDA_ARRAY3D_DESCRIPTOR *pMipmappedArrayDesc, unsigned int numMipmapLevels) { + const char* func_name = "cuMipmappedArrayCreate"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUmipmappedArray * , const CUDA_ARRAY3D_DESCRIPTOR * , unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pHandle, pMipmappedArrayDesc, numMipmapLevels); +} + +CUresult cuMipmappedArrayGetLevel(CUarray *pLevelArray, CUmipmappedArray hMipmappedArray, unsigned int level) { + const char* func_name = "cuMipmappedArrayGetLevel"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUarray * , CUmipmappedArray, unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pLevelArray, hMipmappedArray, level); +} + +CUresult cuMipmappedArrayDestroy(CUmipmappedArray hMipmappedArray) { + const char* 
func_name = "cuMipmappedArrayDestroy"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUmipmappedArray); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hMipmappedArray); +} + +CUresult cuMemAddressReserve(CUdeviceptr *ptr, size_t size, size_t alignment, CUdeviceptr addr, unsigned long long flags) { + const char* func_name = "cuMemAddressReserve"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr * , size_t, size_t, CUdeviceptr, unsigned long long); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(ptr, size, alignment, addr, flags); +} + +CUresult cuMemAddressFree(CUdeviceptr ptr, size_t size) { + const char* func_name = "cuMemAddressFree"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr, size_t); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(ptr, size); +} + +CUresult cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size, const CUmemAllocationProp *prop, unsigned long long flags) { + const char* func_name = "cuMemCreate"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUmemGenericAllocationHandle * , size_t, const CUmemAllocationProp * , unsigned long long); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(handle, size, prop, flags); +} + +CUresult cuMemRelease(CUmemGenericAllocationHandle handle) { + const char* func_name = "cuMemRelease"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUmemGenericAllocationHandle); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(handle); +} + +CUresult cuMemMap(CUdeviceptr ptr, size_t size, size_t offset, CUmemGenericAllocationHandle handle, unsigned long long flags) { + const char* func_name = "cuMemMap"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr, size_t, size_t, CUmemGenericAllocationHandle, unsigned long 
long); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(ptr, size, offset, handle, flags); +} + +CUresult cuMemMapArrayAsync(CUarrayMapInfo *mapInfoList, unsigned int count, CUstream hStream) { + const char* func_name = "cuMemMapArrayAsync"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUarrayMapInfo * , unsigned int, CUstream); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(mapInfoList, count, hStream); +} + +CUresult cuMemUnmap(CUdeviceptr ptr, size_t size) { + const char* func_name = "cuMemUnmap"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr, size_t); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(ptr, size); +} + +CUresult cuMemSetAccess(CUdeviceptr ptr, size_t size, const CUmemAccessDesc *desc, size_t count) { + const char* func_name = "cuMemSetAccess"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr, size_t, const CUmemAccessDesc * , size_t); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(ptr, size, desc, count); +} + +CUresult cuMemGetAccess(unsigned long long *flags, const CUmemLocation *location, CUdeviceptr ptr) { + const char* func_name = "cuMemGetAccess"; + HookLog(func_name); + using func_ptr = CUresult (*)(unsigned long long * , const CUmemLocation * , CUdeviceptr); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(flags, location, ptr); +} + +CUresult cuMemExportToShareableHandle(void *shareableHandle, CUmemGenericAllocationHandle handle, CUmemAllocationHandleType handleType, unsigned long long flags) { + const char* func_name = "cuMemExportToShareableHandle"; + HookLog(func_name); + using func_ptr = CUresult (*)(void * , CUmemGenericAllocationHandle, CUmemAllocationHandleType, unsigned long long); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); 
+ return func_entry(shareableHandle, handle, handleType, flags); +} + +CUresult cuMemImportFromShareableHandle(CUmemGenericAllocationHandle *handle, void *osHandle, CUmemAllocationHandleType shHandleType) { + const char* func_name = "cuMemImportFromShareableHandle"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUmemGenericAllocationHandle * , void * , CUmemAllocationHandleType); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(handle, osHandle, shHandleType); +} + +CUresult cuMemGetAllocationGranularity(size_t *granularity, const CUmemAllocationProp *prop, CUmemAllocationGranularity_flags option) { + const char* func_name = "cuMemGetAllocationGranularity"; + HookLog(func_name); + using func_ptr = CUresult (*)(size_t * , const CUmemAllocationProp * , CUmemAllocationGranularity_flags); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(granularity, prop, option); +} + +CUresult cuMemGetAllocationPropertiesFromHandle(CUmemAllocationProp *prop, CUmemGenericAllocationHandle handle) { + const char* func_name = "cuMemGetAllocationPropertiesFromHandle"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUmemAllocationProp * , CUmemGenericAllocationHandle); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(prop, handle); +} + +CUresult cuMemRetainAllocationHandle(CUmemGenericAllocationHandle *handle, void *addr) { + const char* func_name = "cuMemRetainAllocationHandle"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUmemGenericAllocationHandle * , void * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(handle, addr); +} + +CUresult cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream) { + const char* func_name = "cuMemFreeAsync"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr, CUstream); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), 
func_name)); + return func_entry(dptr, hStream); +} + +CUresult cuMemAllocAsync(CUdeviceptr *dptr, size_t bytesize, CUstream hStream) { + const char* func_name = "cuMemAllocAsync"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr * , size_t, CUstream); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dptr, bytesize, hStream); +} + +CUresult cuMemPoolTrimTo(CUmemoryPool pool, size_t minBytesToKeep) { + const char* func_name = "cuMemPoolTrimTo"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUmemoryPool, size_t); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pool, minBytesToKeep); +} + +CUresult cuMemPoolSetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void *value) { + const char* func_name = "cuMemPoolSetAttribute"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUmemoryPool, CUmemPool_attribute, void * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pool, attr, value); +} + +CUresult cuMemPoolGetAttribute(CUmemoryPool pool, CUmemPool_attribute attr, void *value) { + const char* func_name = "cuMemPoolGetAttribute"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUmemoryPool, CUmemPool_attribute, void * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pool, attr, value); +} + +CUresult cuMemPoolSetAccess(CUmemoryPool pool, const CUmemAccessDesc *map, size_t count) { + const char* func_name = "cuMemPoolSetAccess"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUmemoryPool, const CUmemAccessDesc * , size_t); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pool, map, count); +} + +CUresult cuMemPoolGetAccess(CUmemAccess_flags *flags, CUmemoryPool memPool, CUmemLocation *location) { + const char* func_name = "cuMemPoolGetAccess"; + HookLog(func_name); + using func_ptr = CUresult 
(*)(CUmemAccess_flags * , CUmemoryPool, CUmemLocation * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(flags, memPool, location); +} + +CUresult cuMemPoolCreate(CUmemoryPool *pool, const CUmemPoolProps *poolProps) { + const char* func_name = "cuMemPoolCreate"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUmemoryPool * , const CUmemPoolProps * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pool, poolProps); +} + +CUresult cuMemPoolDestroy(CUmemoryPool pool) { + const char* func_name = "cuMemPoolDestroy"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUmemoryPool); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pool); +} + +CUresult cuMemAllocFromPoolAsync(CUdeviceptr *dptr, size_t bytesize, CUmemoryPool pool, CUstream hStream) { + const char* func_name = "cuMemAllocFromPoolAsync"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr * , size_t, CUmemoryPool, CUstream); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dptr, bytesize, pool, hStream); +} + +CUresult cuMemPoolExportToShareableHandle(void *handle_out, CUmemoryPool pool, CUmemAllocationHandleType handleType, unsigned long long flags) { + const char* func_name = "cuMemPoolExportToShareableHandle"; + HookLog(func_name); + using func_ptr = CUresult (*)(void * , CUmemoryPool, CUmemAllocationHandleType, unsigned long long); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(handle_out, pool, handleType, flags); +} + +CUresult cuMemPoolImportFromShareableHandle(CUmemoryPool *pool_out, void *handle, CUmemAllocationHandleType handleType, unsigned long long flags) { + const char* func_name = "cuMemPoolImportFromShareableHandle"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUmemoryPool * , void * , CUmemAllocationHandleType, unsigned long 
long); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pool_out, handle, handleType, flags); +} + +CUresult cuMemPoolExportPointer(CUmemPoolPtrExportData *shareData_out, CUdeviceptr ptr) { + const char* func_name = "cuMemPoolExportPointer"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUmemPoolPtrExportData * , CUdeviceptr); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(shareData_out, ptr); +} + +CUresult cuMemPoolImportPointer(CUdeviceptr *ptr_out, CUmemoryPool pool, CUmemPoolPtrExportData *shareData) { + const char* func_name = "cuMemPoolImportPointer"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr * , CUmemoryPool, CUmemPoolPtrExportData * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(ptr_out, pool, shareData); +} + +CUresult cuPointerGetAttribute(void *data, CUpointer_attribute attribute, CUdeviceptr ptr) { + const char* func_name = "cuPointerGetAttribute"; + HookLog(func_name); + using func_ptr = CUresult (*)(void * , CUpointer_attribute, CUdeviceptr); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(data, attribute, ptr); +} + +CUresult cuMemPrefetchAsync(CUdeviceptr devPtr, size_t count, CUdevice dstDevice, CUstream hStream) { + const char* func_name = "cuMemPrefetchAsync"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr, size_t, CUdevice, CUstream); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(devPtr, count, dstDevice, hStream); +} + +CUresult cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advice, CUdevice device) { + const char* func_name = "cuMemAdvise"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr, size_t, CUmem_advise, CUdevice); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(devPtr, 
count, advice, device); +} + +CUresult cuMemRangeGetAttribute(void *data, size_t dataSize, CUmem_range_attribute attribute, CUdeviceptr devPtr, size_t count) { + const char* func_name = "cuMemRangeGetAttribute"; + HookLog(func_name); + using func_ptr = CUresult (*)(void * , size_t, CUmem_range_attribute, CUdeviceptr, size_t); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(data, dataSize, attribute, devPtr, count); +} + +CUresult cuMemRangeGetAttributes(void **data, size_t *dataSizes, CUmem_range_attribute *attributes, size_t numAttributes, CUdeviceptr devPtr, size_t count) { + const char* func_name = "cuMemRangeGetAttributes"; + HookLog(func_name); + using func_ptr = CUresult (*)(void * * , size_t * , CUmem_range_attribute * , size_t, CUdeviceptr, size_t); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(data, dataSizes, attributes, numAttributes, devPtr, count); +} + +CUresult cuPointerSetAttribute(const void *value, CUpointer_attribute attribute, CUdeviceptr ptr) { + const char* func_name = "cuPointerSetAttribute"; + HookLog(func_name); + using func_ptr = CUresult (*)(const void * , CUpointer_attribute, CUdeviceptr); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(value, attribute, ptr); +} + +CUresult cuPointerGetAttributes(unsigned int numAttributes, CUpointer_attribute *attributes, void **data, CUdeviceptr ptr) { + const char* func_name = "cuPointerGetAttributes"; + HookLog(func_name); + using func_ptr = CUresult (*)(unsigned int, CUpointer_attribute * , void * * , CUdeviceptr); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(numAttributes, attributes, data, ptr); +} + +CUresult cuStreamCreate(CUstream *phStream, unsigned int Flags) { + const char* func_name = "cuStreamCreate"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream * , unsigned int); + auto func_entry 
= reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(phStream, Flags); +} + +CUresult cuStreamCreateWithPriority(CUstream *phStream, unsigned int flags, int priority) { + const char* func_name = "cuStreamCreateWithPriority"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream * , unsigned int, int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(phStream, flags, priority); +} + +CUresult cuStreamGetPriority(CUstream hStream, int *priority) { + const char* func_name = "cuStreamGetPriority"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream, int * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hStream, priority); +} + +CUresult cuStreamGetFlags(CUstream hStream, unsigned int *flags) { + const char* func_name = "cuStreamGetFlags"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream, unsigned int * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hStream, flags); +} + +CUresult cuStreamGetCtx(CUstream hStream, CUcontext *pctx) { + const char* func_name = "cuStreamGetCtx"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream, CUcontext * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hStream, pctx); +} + +CUresult cuStreamWaitEvent(CUstream hStream, CUevent hEvent, unsigned int Flags) { + const char* func_name = "cuStreamWaitEvent"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream, CUevent, unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hStream, hEvent, Flags); +} + +CUresult cuStreamAddCallback(CUstream hStream, CUstreamCallback callback, void *userData, unsigned int flags) { + const char* func_name = "cuStreamAddCallback"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream, CUstreamCallback, void * , unsigned int); + 
auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hStream, callback, userData, flags); +} + +CUresult cuStreamBeginCapture(CUstream hStream, CUstreamCaptureMode mode) { + const char* func_name = "cuStreamBeginCapture"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream, CUstreamCaptureMode); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hStream, mode); +} + +CUresult cuThreadExchangeStreamCaptureMode(CUstreamCaptureMode *mode) { + const char* func_name = "cuThreadExchangeStreamCaptureMode"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstreamCaptureMode * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(mode); +} + +CUresult cuStreamEndCapture(CUstream hStream, CUgraph *phGraph) { + const char* func_name = "cuStreamEndCapture"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream, CUgraph * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hStream, phGraph); +} + +CUresult cuStreamIsCapturing(CUstream hStream, CUstreamCaptureStatus *captureStatus) { + const char* func_name = "cuStreamIsCapturing"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream, CUstreamCaptureStatus * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hStream, captureStatus); +} + +CUresult cuStreamGetCaptureInfo(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out) { + const char* func_name = "cuStreamGetCaptureInfo"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream, CUstreamCaptureStatus * , cuuint64_t * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hStream, captureStatus_out, id_out); +} + +CUresult cuStreamGetCaptureInfo_v2(CUstream hStream, CUstreamCaptureStatus *captureStatus_out, cuuint64_t *id_out, CUgraph *graph_out, 
const CUgraphNode **dependencies_out, size_t *numDependencies_out) { + const char* func_name = "cuStreamGetCaptureInfo_v2"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream, CUstreamCaptureStatus * , cuuint64_t * , CUgraph * , const CUgraphNode * * , size_t * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hStream, captureStatus_out, id_out, graph_out, dependencies_out, numDependencies_out); +} + +CUresult cuStreamUpdateCaptureDependencies(CUstream hStream, CUgraphNode *dependencies, size_t numDependencies, unsigned int flags) { + const char* func_name = "cuStreamUpdateCaptureDependencies"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream, CUgraphNode * , size_t, unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hStream, dependencies, numDependencies, flags); +} + +CUresult cuStreamAttachMemAsync(CUstream hStream, CUdeviceptr dptr, size_t length, unsigned int flags) { + const char* func_name = "cuStreamAttachMemAsync"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream, CUdeviceptr, size_t, unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hStream, dptr, length, flags); +} + +CUresult cuStreamQuery(CUstream hStream) { + const char* func_name = "cuStreamQuery"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hStream); +} + +CUresult cuStreamSynchronize(CUstream hStream) { + const char* func_name = "cuStreamSynchronize"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hStream); +} + +CUresult cuStreamDestroy(CUstream hStream) { + const char* func_name = "cuStreamDestroy"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream); + 
auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hStream); +} + +CUresult cuStreamCopyAttributes(CUstream dst, CUstream src) { + const char* func_name = "cuStreamCopyAttributes"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream, CUstream); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dst, src); +} + +CUresult cuStreamGetAttribute(CUstream hStream, CUstreamAttrID attr, CUstreamAttrValue *value_out) { + const char* func_name = "cuStreamGetAttribute"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream, CUstreamAttrID, CUstreamAttrValue * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hStream, attr, value_out); +} + +CUresult cuStreamSetAttribute(CUstream hStream, CUstreamAttrID attr, const CUstreamAttrValue *value) { + const char* func_name = "cuStreamSetAttribute"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream, CUstreamAttrID, const CUstreamAttrValue * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hStream, attr, value); +} + +CUresult cuEventCreate(CUevent *phEvent, unsigned int Flags) { + const char* func_name = "cuEventCreate"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUevent * , unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(phEvent, Flags); +} + +CUresult cuEventRecord(CUevent hEvent, CUstream hStream) { + const char* func_name = "cuEventRecord"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUevent, CUstream); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hEvent, hStream); +} + +CUresult cuEventRecordWithFlags(CUevent hEvent, CUstream hStream, unsigned int flags) { + const char* func_name = "cuEventRecordWithFlags"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUevent, CUstream, unsigned 
int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hEvent, hStream, flags); +} + +CUresult cuEventQuery(CUevent hEvent) { + const char* func_name = "cuEventQuery"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUevent); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hEvent); +} + +CUresult cuEventSynchronize(CUevent hEvent) { + const char* func_name = "cuEventSynchronize"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUevent); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hEvent); +} + +CUresult cuEventDestroy(CUevent hEvent) { + const char* func_name = "cuEventDestroy"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUevent); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hEvent); +} + +CUresult cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd) { + const char* func_name = "cuEventElapsedTime"; + HookLog(func_name); + using func_ptr = CUresult (*)(float * , CUevent, CUevent); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pMilliseconds, hStart, hEnd); +} + +CUresult cuImportExternalMemory(CUexternalMemory *extMem_out, const CUDA_EXTERNAL_MEMORY_HANDLE_DESC *memHandleDesc) { + const char* func_name = "cuImportExternalMemory"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUexternalMemory * , const CUDA_EXTERNAL_MEMORY_HANDLE_DESC * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(extMem_out, memHandleDesc); +} + +CUresult cuExternalMemoryGetMappedBuffer(CUdeviceptr *devPtr, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_BUFFER_DESC *bufferDesc) { + const char* func_name = "cuExternalMemoryGetMappedBuffer"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr * , CUexternalMemory, const 
CUDA_EXTERNAL_MEMORY_BUFFER_DESC * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(devPtr, extMem, bufferDesc); +} + +CUresult cuExternalMemoryGetMappedMipmappedArray(CUmipmappedArray *mipmap, CUexternalMemory extMem, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC *mipmapDesc) { + const char* func_name = "cuExternalMemoryGetMappedMipmappedArray"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUmipmappedArray * , CUexternalMemory, const CUDA_EXTERNAL_MEMORY_MIPMAPPED_ARRAY_DESC * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(mipmap, extMem, mipmapDesc); +} + +CUresult cuDestroyExternalMemory(CUexternalMemory extMem) { + const char* func_name = "cuDestroyExternalMemory"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUexternalMemory); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(extMem); +} + +CUresult cuImportExternalSemaphore(CUexternalSemaphore *extSem_out, const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC *semHandleDesc) { + const char* func_name = "cuImportExternalSemaphore"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUexternalSemaphore * , const CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(extSem_out, semHandleDesc); +} + +CUresult cuSignalExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream) { + const char* func_name = "cuSignalExternalSemaphoresAsync"; + HookLog(func_name); + using func_ptr = CUresult (*)(const CUexternalSemaphore * , const CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS * , unsigned int, CUstream); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(extSemArray, paramsArray, numExtSems, stream); +} + +CUresult 
cuWaitExternalSemaphoresAsync(const CUexternalSemaphore *extSemArray, const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS *paramsArray, unsigned int numExtSems, CUstream stream) { + const char* func_name = "cuWaitExternalSemaphoresAsync"; + HookLog(func_name); + using func_ptr = CUresult (*)(const CUexternalSemaphore * , const CUDA_EXTERNAL_SEMAPHORE_WAIT_PARAMS * , unsigned int, CUstream); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(extSemArray, paramsArray, numExtSems, stream); +} + +CUresult cuDestroyExternalSemaphore(CUexternalSemaphore extSem) { + const char* func_name = "cuDestroyExternalSemaphore"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUexternalSemaphore); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(extSem); +} + +CUresult cuStreamWaitValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags) { + const char* func_name = "cuStreamWaitValue32"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream, CUdeviceptr, cuuint32_t, unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(stream, addr, value, flags); +} + +CUresult cuStreamWaitValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags) { + const char* func_name = "cuStreamWaitValue64"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream, CUdeviceptr, cuuint64_t, unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(stream, addr, value, flags); +} + +CUresult cuStreamWriteValue32(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags) { + const char* func_name = "cuStreamWriteValue32"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream, CUdeviceptr, cuuint32_t, unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(stream, addr, value, flags); +} 
+ +CUresult cuStreamWriteValue64(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags) { + const char* func_name = "cuStreamWriteValue64"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream, CUdeviceptr, cuuint64_t, unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(stream, addr, value, flags); +} + +CUresult cuStreamBatchMemOp(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags) { + const char* func_name = "cuStreamBatchMemOp"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream, unsigned int, CUstreamBatchMemOpParams * , unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(stream, count, paramArray, flags); +} + +CUresult cuStreamWaitValue32_v2(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags) { + const char* func_name = "cuStreamWaitValue32_v2"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream, CUdeviceptr, cuuint32_t, unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(stream, addr, value, flags); +} + +CUresult cuStreamWaitValue64_v2(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags) { + const char* func_name = "cuStreamWaitValue64_v2"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream, CUdeviceptr, cuuint64_t, unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(stream, addr, value, flags); +} + +CUresult cuStreamWriteValue32_v2(CUstream stream, CUdeviceptr addr, cuuint32_t value, unsigned int flags) { + const char* func_name = "cuStreamWriteValue32_v2"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream, CUdeviceptr, cuuint32_t, unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(stream, addr, value, flags); 
+} + +CUresult cuStreamWriteValue64_v2(CUstream stream, CUdeviceptr addr, cuuint64_t value, unsigned int flags) { + const char* func_name = "cuStreamWriteValue64_v2"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream, CUdeviceptr, cuuint64_t, unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(stream, addr, value, flags); +} + +CUresult cuStreamBatchMemOp_v2(CUstream stream, unsigned int count, CUstreamBatchMemOpParams *paramArray, unsigned int flags) { + const char* func_name = "cuStreamBatchMemOp_v2"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream, unsigned int, CUstreamBatchMemOpParams * , unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(stream, count, paramArray, flags); +} + +CUresult cuFuncGetAttribute(int *pi, CUfunction_attribute attrib, CUfunction hfunc) { + const char* func_name = "cuFuncGetAttribute"; + HookLog(func_name); + using func_ptr = CUresult (*)(int * , CUfunction_attribute, CUfunction); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pi, attrib, hfunc); +} + +CUresult cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value) { + const char* func_name = "cuFuncSetAttribute"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUfunction, CUfunction_attribute, int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hfunc, attrib, value); +} + +CUresult cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config) { + const char* func_name = "cuFuncSetCacheConfig"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUfunction, CUfunc_cache); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hfunc, config); +} + +CUresult cuFuncSetSharedMemConfig(CUfunction hfunc, CUsharedconfig config) { + const char* func_name = "cuFuncSetSharedMemConfig"; + 
HookLog(func_name); + using func_ptr = CUresult (*)(CUfunction, CUsharedconfig); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hfunc, config); +} + +CUresult cuFuncGetModule(CUmodule *hmod, CUfunction hfunc) { + const char* func_name = "cuFuncGetModule"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUmodule * , CUfunction); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hmod, hfunc); +} + +CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra) { + const char* func_name = "cuLaunchKernel"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void * * , void * * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra); +} + +CUresult cuLaunchCooperativeKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams) { + const char* func_name = "cuLaunchCooperativeKernel"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void * * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams); +} + +CUresult cuLaunchCooperativeKernelMultiDevice(CUDA_LAUNCH_PARAMS 
*launchParamsList, unsigned int numDevices, unsigned int flags) { + const char* func_name = "cuLaunchCooperativeKernelMultiDevice"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUDA_LAUNCH_PARAMS * , unsigned int, unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(launchParamsList, numDevices, flags); +} + +CUresult cuLaunchHostFunc(CUstream hStream, CUhostFn fn, void *userData) { + const char* func_name = "cuLaunchHostFunc"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream, CUhostFn, void * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hStream, fn, userData); +} + +CUresult cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z) { + const char* func_name = "cuFuncSetBlockShape"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUfunction, int, int, int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hfunc, x, y, z); +} + +CUresult cuFuncSetSharedSize(CUfunction hfunc, unsigned int bytes) { + const char* func_name = "cuFuncSetSharedSize"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUfunction, unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hfunc, bytes); +} + +CUresult cuParamSetSize(CUfunction hfunc, unsigned int numbytes) { + const char* func_name = "cuParamSetSize"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUfunction, unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hfunc, numbytes); +} + +CUresult cuParamSeti(CUfunction hfunc, int offset, unsigned int value) { + const char* func_name = "cuParamSeti"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUfunction, int, unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hfunc, offset, value); +} + +CUresult 
cuParamSetf(CUfunction hfunc, int offset, float value) { + const char* func_name = "cuParamSetf"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUfunction, int, float); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hfunc, offset, value); +} + +CUresult cuParamSetv(CUfunction hfunc, int offset, void *ptr, unsigned int numbytes) { + const char* func_name = "cuParamSetv"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUfunction, int, void * , unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hfunc, offset, ptr, numbytes); +} + +CUresult cuLaunch(CUfunction f) { + const char* func_name = "cuLaunch"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUfunction); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(f); +} + +CUresult cuLaunchGrid(CUfunction f, int grid_width, int grid_height) { + const char* func_name = "cuLaunchGrid"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUfunction, int, int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(f, grid_width, grid_height); +} + +CUresult cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream) { + const char* func_name = "cuLaunchGridAsync"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUfunction, int, int, CUstream); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(f, grid_width, grid_height, hStream); +} + +CUresult cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef) { + const char* func_name = "cuParamSetTexRef"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUfunction, int, CUtexref); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hfunc, texunit, hTexRef); +} + +CUresult cuGraphCreate(CUgraph *phGraph, unsigned int flags) { + const char* 
func_name = "cuGraphCreate"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraph * , unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(phGraph, flags); +} + +CUresult cuGraphAddKernelNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_KERNEL_NODE_PARAMS *nodeParams) { + const char* func_name = "cuGraphAddKernelNode"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode * , CUgraph, const CUgraphNode * , size_t, const CUDA_KERNEL_NODE_PARAMS * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(phGraphNode, hGraph, dependencies, numDependencies, nodeParams); +} + +CUresult cuGraphKernelNodeGetParams(CUgraphNode hNode, CUDA_KERNEL_NODE_PARAMS *nodeParams) { + const char* func_name = "cuGraphKernelNodeGetParams"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode, CUDA_KERNEL_NODE_PARAMS * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hNode, nodeParams); +} + +CUresult cuGraphKernelNodeSetParams(CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams) { + const char* func_name = "cuGraphKernelNodeSetParams"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode, const CUDA_KERNEL_NODE_PARAMS * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hNode, nodeParams); +} + +CUresult cuGraphAddMemcpyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMCPY3D *copyParams, CUcontext ctx) { + const char* func_name = "cuGraphAddMemcpyNode"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode * , CUgraph, const CUgraphNode * , size_t, const CUDA_MEMCPY3D * , CUcontext); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(phGraphNode, hGraph, 
dependencies, numDependencies, copyParams, ctx); +} + +CUresult cuGraphMemcpyNodeGetParams(CUgraphNode hNode, CUDA_MEMCPY3D *nodeParams) { + const char* func_name = "cuGraphMemcpyNodeGetParams"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode, CUDA_MEMCPY3D * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hNode, nodeParams); +} + +CUresult cuGraphMemcpyNodeSetParams(CUgraphNode hNode, const CUDA_MEMCPY3D *nodeParams) { + const char* func_name = "cuGraphMemcpyNodeSetParams"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode, const CUDA_MEMCPY3D * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hNode, nodeParams); +} + +CUresult cuGraphAddMemsetNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx) { + const char* func_name = "cuGraphAddMemsetNode"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode * , CUgraph, const CUgraphNode * , size_t, const CUDA_MEMSET_NODE_PARAMS * , CUcontext); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(phGraphNode, hGraph, dependencies, numDependencies, memsetParams, ctx); +} + +CUresult cuGraphMemsetNodeGetParams(CUgraphNode hNode, CUDA_MEMSET_NODE_PARAMS *nodeParams) { + const char* func_name = "cuGraphMemsetNodeGetParams"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode, CUDA_MEMSET_NODE_PARAMS * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hNode, nodeParams); +} + +CUresult cuGraphMemsetNodeSetParams(CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *nodeParams) { + const char* func_name = "cuGraphMemsetNodeSetParams"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode, const CUDA_MEMSET_NODE_PARAMS * ); + auto func_entry = 
reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hNode, nodeParams); +} + +CUresult cuGraphAddHostNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_HOST_NODE_PARAMS *nodeParams) { + const char* func_name = "cuGraphAddHostNode"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode * , CUgraph, const CUgraphNode * , size_t, const CUDA_HOST_NODE_PARAMS * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(phGraphNode, hGraph, dependencies, numDependencies, nodeParams); +} + +CUresult cuGraphHostNodeGetParams(CUgraphNode hNode, CUDA_HOST_NODE_PARAMS *nodeParams) { + const char* func_name = "cuGraphHostNodeGetParams"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode, CUDA_HOST_NODE_PARAMS * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hNode, nodeParams); +} + +CUresult cuGraphHostNodeSetParams(CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams) { + const char* func_name = "cuGraphHostNodeSetParams"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode, const CUDA_HOST_NODE_PARAMS * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hNode, nodeParams); +} + +CUresult cuGraphAddChildGraphNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUgraph childGraph) { + const char* func_name = "cuGraphAddChildGraphNode"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode * , CUgraph, const CUgraphNode * , size_t, CUgraph); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(phGraphNode, hGraph, dependencies, numDependencies, childGraph); +} + +CUresult cuGraphChildGraphNodeGetGraph(CUgraphNode hNode, CUgraph *phGraph) { + const char* func_name = "cuGraphChildGraphNodeGetGraph"; + 
HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode, CUgraph * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hNode, phGraph); +} + +CUresult cuGraphAddEmptyNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies) { + const char* func_name = "cuGraphAddEmptyNode"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode * , CUgraph, const CUgraphNode * , size_t); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(phGraphNode, hGraph, dependencies, numDependencies); +} + +CUresult cuGraphAddEventRecordNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event) { + const char* func_name = "cuGraphAddEventRecordNode"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode * , CUgraph, const CUgraphNode * , size_t, CUevent); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(phGraphNode, hGraph, dependencies, numDependencies, event); +} + +CUresult cuGraphEventRecordNodeGetEvent(CUgraphNode hNode, CUevent *event_out) { + const char* func_name = "cuGraphEventRecordNodeGetEvent"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode, CUevent * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hNode, event_out); +} + +CUresult cuGraphEventRecordNodeSetEvent(CUgraphNode hNode, CUevent event) { + const char* func_name = "cuGraphEventRecordNodeSetEvent"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode, CUevent); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hNode, event); +} + +CUresult cuGraphAddEventWaitNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUevent event) { + const char* func_name = "cuGraphAddEventWaitNode"; + 
HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode * , CUgraph, const CUgraphNode * , size_t, CUevent); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(phGraphNode, hGraph, dependencies, numDependencies, event); +} + +CUresult cuGraphEventWaitNodeGetEvent(CUgraphNode hNode, CUevent *event_out) { + const char* func_name = "cuGraphEventWaitNodeGetEvent"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode, CUevent * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hNode, event_out); +} + +CUresult cuGraphEventWaitNodeSetEvent(CUgraphNode hNode, CUevent event) { + const char* func_name = "cuGraphEventWaitNodeSetEvent"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode, CUevent); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hNode, event); +} + +CUresult cuGraphAddExternalSemaphoresSignalNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams) { + const char* func_name = "cuGraphAddExternalSemaphoresSignalNode"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode * , CUgraph, const CUgraphNode * , size_t, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(phGraphNode, hGraph, dependencies, numDependencies, nodeParams); +} + +CUresult cuGraphExternalSemaphoresSignalNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *params_out) { + const char* func_name = "cuGraphExternalSemaphoresSignalNodeGetParams"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode, CUDA_EXT_SEM_SIGNAL_NODE_PARAMS * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hNode, params_out); +} + +CUresult 
cuGraphExternalSemaphoresSignalNodeSetParams(CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams) { + const char* func_name = "cuGraphExternalSemaphoresSignalNodeSetParams"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hNode, nodeParams); +} + +CUresult cuGraphAddExternalSemaphoresWaitNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams) { + const char* func_name = "cuGraphAddExternalSemaphoresWaitNode"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode * , CUgraph, const CUgraphNode * , size_t, const CUDA_EXT_SEM_WAIT_NODE_PARAMS * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(phGraphNode, hGraph, dependencies, numDependencies, nodeParams); +} + +CUresult cuGraphExternalSemaphoresWaitNodeGetParams(CUgraphNode hNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS *params_out) { + const char* func_name = "cuGraphExternalSemaphoresWaitNodeGetParams"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode, CUDA_EXT_SEM_WAIT_NODE_PARAMS * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hNode, params_out); +} + +CUresult cuGraphExternalSemaphoresWaitNodeSetParams(CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams) { + const char* func_name = "cuGraphExternalSemaphoresWaitNodeSetParams"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hNode, nodeParams); +} + +CUresult cuGraphAddBatchMemOpNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, const 
CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams) { + const char* func_name = "cuGraphAddBatchMemOpNode"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode * , CUgraph, const CUgraphNode * , size_t, const CUDA_BATCH_MEM_OP_NODE_PARAMS * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(phGraphNode, hGraph, dependencies, numDependencies, nodeParams); +} + +CUresult cuGraphBatchMemOpNodeGetParams(CUgraphNode hNode, CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams_out) { + const char* func_name = "cuGraphBatchMemOpNodeGetParams"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode, CUDA_BATCH_MEM_OP_NODE_PARAMS * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hNode, nodeParams_out); +} + +CUresult cuGraphBatchMemOpNodeSetParams(CUgraphNode hNode, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams) { + const char* func_name = "cuGraphBatchMemOpNodeSetParams"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode, const CUDA_BATCH_MEM_OP_NODE_PARAMS * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hNode, nodeParams); +} + +CUresult cuGraphExecBatchMemOpNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_BATCH_MEM_OP_NODE_PARAMS *nodeParams) { + const char* func_name = "cuGraphExecBatchMemOpNodeSetParams"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphExec, CUgraphNode, const CUDA_BATCH_MEM_OP_NODE_PARAMS * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hGraphExec, hNode, nodeParams); +} + +CUresult cuGraphAddMemAllocNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUDA_MEM_ALLOC_NODE_PARAMS *nodeParams) { + const char* func_name = "cuGraphAddMemAllocNode"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode * , CUgraph, const CUgraphNode * , size_t, 
CUDA_MEM_ALLOC_NODE_PARAMS * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(phGraphNode, hGraph, dependencies, numDependencies, nodeParams); +} + +CUresult cuGraphMemAllocNodeGetParams(CUgraphNode hNode, CUDA_MEM_ALLOC_NODE_PARAMS *params_out) { + const char* func_name = "cuGraphMemAllocNodeGetParams"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode, CUDA_MEM_ALLOC_NODE_PARAMS * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hNode, params_out); +} + +CUresult cuGraphAddMemFreeNode(CUgraphNode *phGraphNode, CUgraph hGraph, const CUgraphNode *dependencies, size_t numDependencies, CUdeviceptr dptr) { + const char* func_name = "cuGraphAddMemFreeNode"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode * , CUgraph, const CUgraphNode * , size_t, CUdeviceptr); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(phGraphNode, hGraph, dependencies, numDependencies, dptr); +} + +CUresult cuGraphMemFreeNodeGetParams(CUgraphNode hNode, CUdeviceptr *dptr_out) { + const char* func_name = "cuGraphMemFreeNodeGetParams"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode, CUdeviceptr * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hNode, dptr_out); +} + +CUresult cuDeviceGraphMemTrim(CUdevice device) { + const char* func_name = "cuDeviceGraphMemTrim"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdevice); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(device); +} + +CUresult cuDeviceGetGraphMemAttribute(CUdevice device, CUgraphMem_attribute attr, void* value) { + const char* func_name = "cuDeviceGetGraphMemAttribute"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdevice, CUgraphMem_attribute, void* ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); 
+ return func_entry(device, attr, value); +} + +CUresult cuDeviceSetGraphMemAttribute(CUdevice device, CUgraphMem_attribute attr, void* value) { + const char* func_name = "cuDeviceSetGraphMemAttribute"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdevice, CUgraphMem_attribute, void* ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(device, attr, value); +} + +CUresult cuGraphClone(CUgraph *phGraphClone, CUgraph originalGraph) { + const char* func_name = "cuGraphClone"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraph * , CUgraph); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(phGraphClone, originalGraph); +} + +CUresult cuGraphNodeFindInClone(CUgraphNode *phNode, CUgraphNode hOriginalNode, CUgraph hClonedGraph) { + const char* func_name = "cuGraphNodeFindInClone"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode * , CUgraphNode, CUgraph); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(phNode, hOriginalNode, hClonedGraph); +} + +CUresult cuGraphNodeGetType(CUgraphNode hNode, CUgraphNodeType *type) { + const char* func_name = "cuGraphNodeGetType"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode, CUgraphNodeType * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hNode, type); +} + +CUresult cuGraphGetNodes(CUgraph hGraph, CUgraphNode *nodes, size_t *numNodes) { + const char* func_name = "cuGraphGetNodes"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraph, CUgraphNode * , size_t * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hGraph, nodes, numNodes); +} + +CUresult cuGraphGetRootNodes(CUgraph hGraph, CUgraphNode *rootNodes, size_t *numRootNodes) { + const char* func_name = "cuGraphGetRootNodes"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraph, 
CUgraphNode * , size_t * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hGraph, rootNodes, numRootNodes); +} + +CUresult cuGraphGetEdges(CUgraph hGraph, CUgraphNode *from, CUgraphNode *to, size_t *numEdges) { + const char* func_name = "cuGraphGetEdges"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraph, CUgraphNode * , CUgraphNode * , size_t * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hGraph, from, to, numEdges); +} + +CUresult cuGraphNodeGetDependencies(CUgraphNode hNode, CUgraphNode *dependencies, size_t *numDependencies) { + const char* func_name = "cuGraphNodeGetDependencies"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode, CUgraphNode * , size_t * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hNode, dependencies, numDependencies); +} + +CUresult cuGraphNodeGetDependentNodes(CUgraphNode hNode, CUgraphNode *dependentNodes, size_t *numDependentNodes) { + const char* func_name = "cuGraphNodeGetDependentNodes"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode, CUgraphNode * , size_t * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hNode, dependentNodes, numDependentNodes); +} + +CUresult cuGraphAddDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies) { + const char* func_name = "cuGraphAddDependencies"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraph, const CUgraphNode * , const CUgraphNode * , size_t); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hGraph, from, to, numDependencies); +} + +CUresult cuGraphRemoveDependencies(CUgraph hGraph, const CUgraphNode *from, const CUgraphNode *to, size_t numDependencies) { + const char* func_name = "cuGraphRemoveDependencies"; + HookLog(func_name); + using func_ptr 
= CUresult (*)(CUgraph, const CUgraphNode * , const CUgraphNode * , size_t); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hGraph, from, to, numDependencies); +} + +CUresult cuGraphDestroyNode(CUgraphNode hNode) { + const char* func_name = "cuGraphDestroyNode"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphNode); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hNode); +} + +CUresult cuGraphInstantiate(CUgraphExec *phGraphExec, CUgraph hGraph, CUgraphNode *phErrorNode, char *logBuffer, size_t bufferSize) { + const char* func_name = "cuGraphInstantiate"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphExec * , CUgraph, CUgraphNode * , char * , size_t); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(phGraphExec, hGraph, phErrorNode, logBuffer, bufferSize); +} + +CUresult cuGraphInstantiateWithFlags(CUgraphExec *phGraphExec, CUgraph hGraph, unsigned long long flags) { + const char* func_name = "cuGraphInstantiateWithFlags"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphExec * , CUgraph, unsigned long long); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(phGraphExec, hGraph, flags); +} + +CUresult cuGraphExecKernelNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_KERNEL_NODE_PARAMS *nodeParams) { + const char* func_name = "cuGraphExecKernelNodeSetParams"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphExec, CUgraphNode, const CUDA_KERNEL_NODE_PARAMS * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hGraphExec, hNode, nodeParams); +} + +CUresult cuGraphExecMemcpyNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMCPY3D *copyParams, CUcontext ctx) { + const char* func_name = "cuGraphExecMemcpyNodeSetParams"; + HookLog(func_name); + using 
func_ptr = CUresult (*)(CUgraphExec, CUgraphNode, const CUDA_MEMCPY3D * , CUcontext); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hGraphExec, hNode, copyParams, ctx); +} + +CUresult cuGraphExecMemsetNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_MEMSET_NODE_PARAMS *memsetParams, CUcontext ctx) { + const char* func_name = "cuGraphExecMemsetNodeSetParams"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphExec, CUgraphNode, const CUDA_MEMSET_NODE_PARAMS * , CUcontext); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hGraphExec, hNode, memsetParams, ctx); +} + +CUresult cuGraphExecHostNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_HOST_NODE_PARAMS *nodeParams) { + const char* func_name = "cuGraphExecHostNodeSetParams"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphExec, CUgraphNode, const CUDA_HOST_NODE_PARAMS * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hGraphExec, hNode, nodeParams); +} + +CUresult cuGraphExecChildGraphNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, CUgraph childGraph) { + const char* func_name = "cuGraphExecChildGraphNodeSetParams"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphExec, CUgraphNode, CUgraph); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hGraphExec, hNode, childGraph); +} + +CUresult cuGraphExecEventRecordNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent event) { + const char* func_name = "cuGraphExecEventRecordNodeSetEvent"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUgraphExec, CUgraphNode, CUevent); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hGraphExec, hNode, event); +} + +CUresult cuGraphExecEventWaitNodeSetEvent(CUgraphExec hGraphExec, CUgraphNode hNode, CUevent 
event) {
+    const char* func_name = "cuGraphExecEventWaitNodeSetEvent";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUgraphExec, CUgraphNode, CUevent);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(hGraphExec, hNode, event);
+}
+// Pass-through hook: calls HookLog, resolves the same-named symbol with dlsym(get_cuda_handle(), ...), and forwards its arguments unchanged. NOTE(review): the dlsym() result is called without a null check.
+CUresult cuGraphExecExternalSemaphoresSignalNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS *nodeParams) {
+    const char* func_name = "cuGraphExecExternalSemaphoresSignalNodeSetParams";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUgraphExec, CUgraphNode, const CUDA_EXT_SEM_SIGNAL_NODE_PARAMS * );
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(hGraphExec, hNode, nodeParams);
+}
+
+CUresult cuGraphExecExternalSemaphoresWaitNodeSetParams(CUgraphExec hGraphExec, CUgraphNode hNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS *nodeParams) {
+    const char* func_name = "cuGraphExecExternalSemaphoresWaitNodeSetParams";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUgraphExec, CUgraphNode, const CUDA_EXT_SEM_WAIT_NODE_PARAMS * );
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(hGraphExec, hNode, nodeParams);
+}
+
+CUresult cuGraphNodeSetEnabled(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int isEnabled) {
+    const char* func_name = "cuGraphNodeSetEnabled";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUgraphExec, CUgraphNode, unsigned int);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(hGraphExec, hNode, isEnabled);
+}
+
+CUresult cuGraphNodeGetEnabled(CUgraphExec hGraphExec, CUgraphNode hNode, unsigned int *isEnabled) {
+    const char* func_name = "cuGraphNodeGetEnabled";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUgraphExec, CUgraphNode, unsigned int * );
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(hGraphExec, hNode,
isEnabled);
+}
+// Pass-through hook: calls HookLog, resolves the same-named symbol with dlsym(get_cuda_handle(), ...), and forwards its arguments unchanged. NOTE(review): the dlsym() result is called without a null check.
+CUresult cuGraphUpload(CUgraphExec hGraphExec, CUstream hStream) {
+    const char* func_name = "cuGraphUpload";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUgraphExec, CUstream);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(hGraphExec, hStream);
+}
+
+CUresult cuGraphLaunch(CUgraphExec hGraphExec, CUstream hStream) {
+    const char* func_name = "cuGraphLaunch";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUgraphExec, CUstream);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(hGraphExec, hStream);
+}
+
+CUresult cuGraphExecDestroy(CUgraphExec hGraphExec) {
+    const char* func_name = "cuGraphExecDestroy";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUgraphExec);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(hGraphExec);
+}
+
+CUresult cuGraphDestroy(CUgraph hGraph) {
+    const char* func_name = "cuGraphDestroy";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUgraph);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(hGraph);
+}
+
+CUresult cuGraphExecUpdate(CUgraphExec hGraphExec, CUgraph hGraph, CUgraphNode *hErrorNode_out, CUgraphExecUpdateResult *updateResult_out) {
+    const char* func_name = "cuGraphExecUpdate";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUgraphExec, CUgraph, CUgraphNode * , CUgraphExecUpdateResult * );
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(hGraphExec, hGraph, hErrorNode_out, updateResult_out);
+}
+
+CUresult cuGraphKernelNodeCopyAttributes(CUgraphNode dst, CUgraphNode src) {
+    const char* func_name = "cuGraphKernelNodeCopyAttributes";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUgraphNode, CUgraphNode);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(dst, src);
+}
+
+CUresult
cuGraphKernelNodeGetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr, CUkernelNodeAttrValue *value_out) {
+    const char* func_name = "cuGraphKernelNodeGetAttribute";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUgraphNode, CUkernelNodeAttrID, CUkernelNodeAttrValue * );
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(hNode, attr, value_out);
+}
+// Pass-through hook: calls HookLog, resolves the same-named symbol with dlsym(get_cuda_handle(), ...), and forwards its arguments unchanged. NOTE(review): the dlsym() result is called without a null check.
+CUresult cuGraphKernelNodeSetAttribute(CUgraphNode hNode, CUkernelNodeAttrID attr, const CUkernelNodeAttrValue *value) {
+    const char* func_name = "cuGraphKernelNodeSetAttribute";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUgraphNode, CUkernelNodeAttrID, const CUkernelNodeAttrValue * );
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(hNode, attr, value);
+}
+
+CUresult cuGraphDebugDotPrint(CUgraph hGraph, const char *path, unsigned int flags) {
+    const char* func_name = "cuGraphDebugDotPrint";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUgraph, const char * , unsigned int);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(hGraph, path, flags);
+}
+
+CUresult cuUserObjectCreate(CUuserObject *object_out, void *ptr, CUhostFn destroy, unsigned int initialRefcount, unsigned int flags) {
+    const char* func_name = "cuUserObjectCreate";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUuserObject * , void * , CUhostFn, unsigned int, unsigned int);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(object_out, ptr, destroy, initialRefcount, flags);
+}
+
+CUresult cuUserObjectRetain(CUuserObject object, unsigned int count) {
+    const char* func_name = "cuUserObjectRetain";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUuserObject, unsigned int);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(object, count);
+}
+
+CUresult cuUserObjectRelease(CUuserObject
object, unsigned int count) {
+    const char* func_name = "cuUserObjectRelease";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUuserObject, unsigned int);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(object, count);
+}
+// Pass-through hook: calls HookLog, resolves the same-named symbol with dlsym(get_cuda_handle(), ...), and forwards its arguments unchanged. NOTE(review): the dlsym() result is called without a null check.
+CUresult cuGraphRetainUserObject(CUgraph graph, CUuserObject object, unsigned int count, unsigned int flags) {
+    const char* func_name = "cuGraphRetainUserObject";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUgraph, CUuserObject, unsigned int, unsigned int);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(graph, object, count, flags);
+}
+
+CUresult cuGraphReleaseUserObject(CUgraph graph, CUuserObject object, unsigned int count) {
+    const char* func_name = "cuGraphReleaseUserObject";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUgraph, CUuserObject, unsigned int);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(graph, object, count);
+}
+
+CUresult cuOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize) {
+    const char* func_name = "cuOccupancyMaxActiveBlocksPerMultiprocessor";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(int * , CUfunction, int, size_t);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(numBlocks, func, blockSize, dynamicSMemSize);
+}
+
+CUresult cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int *numBlocks, CUfunction func, int blockSize, size_t dynamicSMemSize, unsigned int flags) {
+    const char* func_name = "cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(int * , CUfunction, int, size_t, unsigned int);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(numBlocks, func, blockSize, dynamicSMemSize, flags);
+}
+
+CUresult
cuOccupancyMaxPotentialBlockSize(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit) { + const char* func_name = "cuOccupancyMaxPotentialBlockSize"; + HookLog(func_name); + using func_ptr = CUresult (*)(int * , int * , CUfunction, CUoccupancyB2DSize, size_t, int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(minGridSize, blockSize, func, blockSizeToDynamicSMemSize, dynamicSMemSize, blockSizeLimit); +} + +CUresult cuOccupancyMaxPotentialBlockSizeWithFlags(int *minGridSize, int *blockSize, CUfunction func, CUoccupancyB2DSize blockSizeToDynamicSMemSize, size_t dynamicSMemSize, int blockSizeLimit, unsigned int flags) { + const char* func_name = "cuOccupancyMaxPotentialBlockSizeWithFlags"; + HookLog(func_name); + using func_ptr = CUresult (*)(int * , int * , CUfunction, CUoccupancyB2DSize, size_t, int, unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(minGridSize, blockSize, func, blockSizeToDynamicSMemSize, dynamicSMemSize, blockSizeLimit, flags); +} + +CUresult cuOccupancyAvailableDynamicSMemPerBlock(size_t *dynamicSmemSize, CUfunction func, int numBlocks, int blockSize) { + const char* func_name = "cuOccupancyAvailableDynamicSMemPerBlock"; + HookLog(func_name); + using func_ptr = CUresult (*)(size_t * , CUfunction, int, int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dynamicSmemSize, func, numBlocks, blockSize); +} + +CUresult cuTexRefSetArray(CUtexref hTexRef, CUarray hArray, unsigned int Flags) { + const char* func_name = "cuTexRefSetArray"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUtexref, CUarray, unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hTexRef, hArray, Flags); +} + +CUresult cuTexRefSetMipmappedArray(CUtexref hTexRef, 
CUmipmappedArray hMipmappedArray, unsigned int Flags) {
+    const char* func_name = "cuTexRefSetMipmappedArray";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUtexref, CUmipmappedArray, unsigned int);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(hTexRef, hMipmappedArray, Flags);
+}
+// Pass-through hook: calls HookLog, resolves the same-named symbol with dlsym(get_cuda_handle(), ...), and forwards its arguments unchanged. NOTE(review): the dlsym() result is called without a null check.
+CUresult cuTexRefSetAddress(size_t *ByteOffset, CUtexref hTexRef, CUdeviceptr dptr, size_t bytes) {
+    const char* func_name = "cuTexRefSetAddress";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(size_t * , CUtexref, CUdeviceptr, size_t);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(ByteOffset, hTexRef, dptr, bytes);
+}
+
+CUresult cuTexRefSetAddress2D(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch) {
+    const char* func_name = "cuTexRefSetAddress2D";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUtexref, const CUDA_ARRAY_DESCRIPTOR * , CUdeviceptr, size_t);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(hTexRef, desc, dptr, Pitch);
+}
+
+CUresult cuTexRefSetFormat(CUtexref hTexRef, CUarray_format fmt, int NumPackedComponents) {
+    const char* func_name = "cuTexRefSetFormat";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUtexref, CUarray_format, int);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(hTexRef, fmt, NumPackedComponents);
+}
+
+CUresult cuTexRefSetAddressMode(CUtexref hTexRef, int dim, CUaddress_mode am) {
+    const char* func_name = "cuTexRefSetAddressMode";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUtexref, int, CUaddress_mode);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(hTexRef, dim, am);
+}
+
+CUresult cuTexRefSetFilterMode(CUtexref hTexRef, CUfilter_mode fm) {
+    const char* func_name = "cuTexRefSetFilterMode";
+    HookLog(func_name);
+    using func_ptr =
CUresult (*)(CUtexref, CUfilter_mode); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hTexRef, fm); +} + +CUresult cuTexRefSetMipmapFilterMode(CUtexref hTexRef, CUfilter_mode fm) { + const char* func_name = "cuTexRefSetMipmapFilterMode"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUtexref, CUfilter_mode); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hTexRef, fm); +} + +CUresult cuTexRefSetMipmapLevelBias(CUtexref hTexRef, float bias) { + const char* func_name = "cuTexRefSetMipmapLevelBias"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUtexref, float); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hTexRef, bias); +} + +CUresult cuTexRefSetMipmapLevelClamp(CUtexref hTexRef, float minMipmapLevelClamp, float maxMipmapLevelClamp) { + const char* func_name = "cuTexRefSetMipmapLevelClamp"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUtexref, float, float); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hTexRef, minMipmapLevelClamp, maxMipmapLevelClamp); +} + +CUresult cuTexRefSetMaxAnisotropy(CUtexref hTexRef, unsigned int maxAniso) { + const char* func_name = "cuTexRefSetMaxAnisotropy"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUtexref, unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hTexRef, maxAniso); +} + +CUresult cuTexRefSetBorderColor(CUtexref hTexRef, float *pBorderColor) { + const char* func_name = "cuTexRefSetBorderColor"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUtexref, float * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hTexRef, pBorderColor); +} + +CUresult cuTexRefSetFlags(CUtexref hTexRef, unsigned int Flags) { + const char* func_name = "cuTexRefSetFlags"; + HookLog(func_name); + using 
func_ptr = CUresult (*)(CUtexref, unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hTexRef, Flags); +} + +CUresult cuTexRefGetAddress(CUdeviceptr *pdptr, CUtexref hTexRef) { + const char* func_name = "cuTexRefGetAddress"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUdeviceptr * , CUtexref); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pdptr, hTexRef); +} + +CUresult cuTexRefGetArray(CUarray *phArray, CUtexref hTexRef) { + const char* func_name = "cuTexRefGetArray"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUarray * , CUtexref); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(phArray, hTexRef); +} + +CUresult cuTexRefGetMipmappedArray(CUmipmappedArray *phMipmappedArray, CUtexref hTexRef) { + const char* func_name = "cuTexRefGetMipmappedArray"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUmipmappedArray * , CUtexref); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(phMipmappedArray, hTexRef); +} + +CUresult cuTexRefGetAddressMode(CUaddress_mode *pam, CUtexref hTexRef, int dim) { + const char* func_name = "cuTexRefGetAddressMode"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUaddress_mode * , CUtexref, int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pam, hTexRef, dim); +} + +CUresult cuTexRefGetFilterMode(CUfilter_mode *pfm, CUtexref hTexRef) { + const char* func_name = "cuTexRefGetFilterMode"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUfilter_mode * , CUtexref); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pfm, hTexRef); +} + +CUresult cuTexRefGetFormat(CUarray_format *pFormat, int *pNumChannels, CUtexref hTexRef) { + const char* func_name = "cuTexRefGetFormat"; + HookLog(func_name); + using func_ptr = 
CUresult (*)(CUarray_format * , int * , CUtexref); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pFormat, pNumChannels, hTexRef); +} + +CUresult cuTexRefGetMipmapFilterMode(CUfilter_mode *pfm, CUtexref hTexRef) { + const char* func_name = "cuTexRefGetMipmapFilterMode"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUfilter_mode * , CUtexref); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pfm, hTexRef); +} + +CUresult cuTexRefGetMipmapLevelBias(float *pbias, CUtexref hTexRef) { + const char* func_name = "cuTexRefGetMipmapLevelBias"; + HookLog(func_name); + using func_ptr = CUresult (*)(float * , CUtexref); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pbias, hTexRef); +} + +CUresult cuTexRefGetMipmapLevelClamp(float *pminMipmapLevelClamp, float *pmaxMipmapLevelClamp, CUtexref hTexRef) { + const char* func_name = "cuTexRefGetMipmapLevelClamp"; + HookLog(func_name); + using func_ptr = CUresult (*)(float * , float * , CUtexref); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pminMipmapLevelClamp, pmaxMipmapLevelClamp, hTexRef); +} + +CUresult cuTexRefGetMaxAnisotropy(int *pmaxAniso, CUtexref hTexRef) { + const char* func_name = "cuTexRefGetMaxAnisotropy"; + HookLog(func_name); + using func_ptr = CUresult (*)(int * , CUtexref); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pmaxAniso, hTexRef); +} + +CUresult cuTexRefGetBorderColor(float *pBorderColor, CUtexref hTexRef) { + const char* func_name = "cuTexRefGetBorderColor"; + HookLog(func_name); + using func_ptr = CUresult (*)(float * , CUtexref); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pBorderColor, hTexRef); +} + +CUresult cuTexRefGetFlags(unsigned int *pFlags, CUtexref hTexRef) { + const char* func_name = 
"cuTexRefGetFlags"; + HookLog(func_name); + using func_ptr = CUresult (*)(unsigned int * , CUtexref); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pFlags, hTexRef); +} + +CUresult cuTexRefCreate(CUtexref *pTexRef) { + const char* func_name = "cuTexRefCreate"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUtexref * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pTexRef); +} + +CUresult cuTexRefDestroy(CUtexref hTexRef) { + const char* func_name = "cuTexRefDestroy"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUtexref); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hTexRef); +} + +CUresult cuSurfRefSetArray(CUsurfref hSurfRef, CUarray hArray, unsigned int Flags) { + const char* func_name = "cuSurfRefSetArray"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUsurfref, CUarray, unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hSurfRef, hArray, Flags); +} + +CUresult cuSurfRefGetArray(CUarray *phArray, CUsurfref hSurfRef) { + const char* func_name = "cuSurfRefGetArray"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUarray * , CUsurfref); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(phArray, hSurfRef); +} + +CUresult cuTexObjectCreate(CUtexObject *pTexObject, const CUDA_RESOURCE_DESC *pResDesc, const CUDA_TEXTURE_DESC *pTexDesc, const CUDA_RESOURCE_VIEW_DESC *pResViewDesc) { + const char* func_name = "cuTexObjectCreate"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUtexObject * , const CUDA_RESOURCE_DESC * , const CUDA_TEXTURE_DESC * , const CUDA_RESOURCE_VIEW_DESC * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pTexObject, pResDesc, pTexDesc, pResViewDesc); +} + +CUresult cuTexObjectDestroy(CUtexObject texObject) { + 
const char* func_name = "cuTexObjectDestroy"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUtexObject); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(texObject); +} + +CUresult cuTexObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUtexObject texObject) { + const char* func_name = "cuTexObjectGetResourceDesc"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUDA_RESOURCE_DESC * , CUtexObject); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pResDesc, texObject); +} + +CUresult cuTexObjectGetTextureDesc(CUDA_TEXTURE_DESC *pTexDesc, CUtexObject texObject) { + const char* func_name = "cuTexObjectGetTextureDesc"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUDA_TEXTURE_DESC * , CUtexObject); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pTexDesc, texObject); +} + +CUresult cuTexObjectGetResourceViewDesc(CUDA_RESOURCE_VIEW_DESC *pResViewDesc, CUtexObject texObject) { + const char* func_name = "cuTexObjectGetResourceViewDesc"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUDA_RESOURCE_VIEW_DESC * , CUtexObject); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pResViewDesc, texObject); +} + +CUresult cuSurfObjectCreate(CUsurfObject *pSurfObject, const CUDA_RESOURCE_DESC *pResDesc) { + const char* func_name = "cuSurfObjectCreate"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUsurfObject * , const CUDA_RESOURCE_DESC * ); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(pSurfObject, pResDesc); +} + +CUresult cuSurfObjectDestroy(CUsurfObject surfObject) { + const char* func_name = "cuSurfObjectDestroy"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUsurfObject); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(surfObject); +} + +CUresult 
cuSurfObjectGetResourceDesc(CUDA_RESOURCE_DESC *pResDesc, CUsurfObject surfObject) {
+    const char* func_name = "cuSurfObjectGetResourceDesc";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUDA_RESOURCE_DESC * , CUsurfObject);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(pResDesc, surfObject);
+}
+// Pass-through hook: calls HookLog, resolves the same-named symbol with dlsym(get_cuda_handle(), ...), and forwards its arguments unchanged. NOTE(review): the dlsym() result is called without a null check.
+CUresult cuDeviceCanAccessPeer(int *canAccessPeer, CUdevice dev, CUdevice peerDev) {
+    const char* func_name = "cuDeviceCanAccessPeer";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(int * , CUdevice, CUdevice);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(canAccessPeer, dev, peerDev);
+}
+
+CUresult cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int Flags) {
+    const char* func_name = "cuCtxEnablePeerAccess";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUcontext, unsigned int);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(peerContext, Flags);
+}
+
+CUresult cuCtxDisablePeerAccess(CUcontext peerContext) {
+    const char* func_name = "cuCtxDisablePeerAccess";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUcontext);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(peerContext);
+}
+
+CUresult cuDeviceGetP2PAttribute(int* value, CUdevice_P2PAttribute attrib, CUdevice srcDevice, CUdevice dstDevice) {
+    const char* func_name = "cuDeviceGetP2PAttribute";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(int* , CUdevice_P2PAttribute, CUdevice, CUdevice);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(value, attrib, srcDevice, dstDevice);
+}
+
+CUresult cuGraphicsUnregisterResource(CUgraphicsResource resource) {
+    const char* func_name = "cuGraphicsUnregisterResource";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUgraphicsResource);
+    auto func_entry =
reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(resource);
+}
+// Pass-through hook: calls HookLog, resolves the same-named symbol with dlsym(get_cuda_handle(), ...), and forwards its arguments unchanged. NOTE(review): the dlsym() result is called without a null check.
+CUresult cuGraphicsSubResourceGetMappedArray(CUarray *pArray, CUgraphicsResource resource, unsigned int arrayIndex, unsigned int mipLevel) {
+    const char* func_name = "cuGraphicsSubResourceGetMappedArray";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUarray * , CUgraphicsResource, unsigned int, unsigned int);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(pArray, resource, arrayIndex, mipLevel);
+}
+
+CUresult cuGraphicsResourceGetMappedMipmappedArray(CUmipmappedArray *pMipmappedArray, CUgraphicsResource resource) {
+    const char* func_name = "cuGraphicsResourceGetMappedMipmappedArray";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUmipmappedArray * , CUgraphicsResource);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(pMipmappedArray, resource);
+}
+
+CUresult cuGraphicsResourceGetMappedPointer(CUdeviceptr *pDevPtr, size_t *pSize, CUgraphicsResource resource) {
+    const char* func_name = "cuGraphicsResourceGetMappedPointer";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUdeviceptr * , size_t * , CUgraphicsResource);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(pDevPtr, pSize, resource);
+}
+
+CUresult cuGraphicsResourceSetMapFlags(CUgraphicsResource resource, unsigned int flags) {
+    const char* func_name = "cuGraphicsResourceSetMapFlags";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUgraphicsResource, unsigned int);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(resource, flags);
+}
+
+CUresult cuGraphicsMapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream) {
+    const char* func_name = "cuGraphicsMapResources";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(unsigned int, CUgraphicsResource * , CUstream);
+    auto
func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(count, resources, hStream);
+}
+// Pass-through hook: calls HookLog, resolves the same-named symbol with dlsym(get_cuda_handle(), ...), and forwards its arguments unchanged. NOTE(review): the dlsym() result is called without a null check.
+CUresult cuGraphicsUnmapResources(unsigned int count, CUgraphicsResource *resources, CUstream hStream) {
+    const char* func_name = "cuGraphicsUnmapResources";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(unsigned int, CUgraphicsResource * , CUstream);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(count, resources, hStream);
+}
+
+CUresult cuGetProcAddress(const char *symbol, void **pfn, int cudaVersion, cuuint64_t flags) {
+    const char* func_name = "cuGetProcAddress";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(const char * , void * * , int, cuuint64_t);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(symbol, pfn, cudaVersion, flags);
+}
+
+CUresult cuModuleGetLoadingMode(CUmoduleLoadingMode *mode) {
+    const char* func_name = "cuModuleGetLoadingMode";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUmoduleLoadingMode * );
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(mode);
+}
+
+CUresult cuMemGetHandleForAddressRange(void *handle, CUdeviceptr dptr, size_t size, CUmemRangeHandleType handleType, unsigned long long flags) {
+    const char* func_name = "cuMemGetHandleForAddressRange";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(void * , CUdeviceptr, size_t, CUmemRangeHandleType, unsigned long long);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(handle, dptr, size, handleType, flags);
+}
+
+CUresult cuGetExportTable(const void **ppExportTable, const CUuuid *pExportTableId) {
+    const char* func_name = "cuGetExportTable";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(const void * * , const CUuuid * );
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(ppExportTable,
pExportTableId);
+}
+// Pass-through hook: calls HookLog, resolves the same-named symbol with dlsym(get_cuda_handle(), ...), and forwards its arguments unchanged. NOTE(review): the dlsym() result is called without a null check.
+CUresult cuTexRefSetAddress2D_v2(CUtexref hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, CUdeviceptr dptr, size_t Pitch) {
+    const char* func_name = "cuTexRefSetAddress2D_v2";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUtexref, const CUDA_ARRAY_DESCRIPTOR * , CUdeviceptr, size_t);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(hTexRef, desc, dptr, Pitch);
+}
+
+CUresult cuDeviceTotalMem(unsigned int *bytes, CUdevice dev) {
+    const char* func_name = "cuDeviceTotalMem";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(unsigned int * , CUdevice);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(bytes, dev);
+}
+
+CUresult cuMemGetInfo(unsigned int *free, unsigned int *total) {
+    const char* func_name = "cuMemGetInfo";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(unsigned int * , unsigned int * );
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(free, total);
+}
+
+CUresult cuMemAllocHost(void **pp, unsigned int bytesize) {
+    const char* func_name = "cuMemAllocHost";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(void * * , unsigned int);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(pp, bytesize);
+}
+
+CUresult cuMemcpyHtoA(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount) {
+    const char* func_name = "cuMemcpyHtoA";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(CUarray, unsigned int, const void * , unsigned int);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_cuda_handle(), func_name));
+    return func_entry(dstArray, dstOffset, srcHost, ByteCount);
+}
+
+CUresult cuMemcpyAtoH(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount) {
+    const char* func_name = "cuMemcpyAtoH";
+    HookLog(func_name);
+    using func_ptr = CUresult (*)(void * , CUarray, unsigned int,
unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dstHost, srcArray, srcOffset, ByteCount); +} + +CUresult cuMemcpyAtoA(CUarray dstArray, unsigned int dstOffset, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount) { + const char* func_name = "cuMemcpyAtoA"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUarray, unsigned int, CUarray, unsigned int, unsigned int); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dstArray, dstOffset, srcArray, srcOffset, ByteCount); +} + +CUresult cuMemcpyHtoAAsync(CUarray dstArray, unsigned int dstOffset, const void *srcHost, unsigned int ByteCount, CUstream hStream) { + const char* func_name = "cuMemcpyHtoAAsync"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUarray, unsigned int, const void * , unsigned int, CUstream); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dstArray, dstOffset, srcHost, ByteCount, hStream); +} + +CUresult cuMemcpyAtoHAsync(void *dstHost, CUarray srcArray, unsigned int srcOffset, unsigned int ByteCount, CUstream hStream) { + const char* func_name = "cuMemcpyAtoHAsync"; + HookLog(func_name); + using func_ptr = CUresult (*)(void * , CUarray, unsigned int, unsigned int, CUstream); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(dstHost, srcArray, srcOffset, ByteCount, hStream); +} + +CUresult cuStreamBeginCapture(CUstream hStream) { + const char* func_name = "cuStreamBeginCapture"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream); + auto func_entry = reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hStream); +} + +CUresult cuStreamBeginCapture_ptsz(CUstream hStream) { + const char* func_name = "cuStreamBeginCapture_ptsz"; + HookLog(func_name); + using func_ptr = CUresult (*)(CUstream); + auto func_entry = 
reinterpret_cast(dlsym(get_cuda_handle(), func_name)); + return func_entry(hStream); +} + diff --git a/GPU-Virtual-Service/gpu-remoting/src/client/cudartHook.cc b/GPU-Virtual-Service/gpu-remoting/src/client/cudartHook.cc new file mode 100644 index 0000000..f8e4456 --- /dev/null +++ b/GPU-Virtual-Service/gpu-remoting/src/client/cudartHook.cc @@ -0,0 +1,1185 @@ +#include "../../include/hook/hook.h" +#include "../../include/hook/elfHandle.h" + +/* ---- CUDA Runtime Internal API ---- */ + +extern "C" void** __cudaRegisterFatBinary(void* fatCubin) { + std::call_once(initFlag, Intialize); + const char* func_name = "__cudaRegisterFatBinary"; + HookLog(func_name, false, LOG_REGS); + using func_ptr = void **(*)(void *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + FatHeader_t* fatBinHeader = (FatHeader_t*)fatCubin; + size_t fatBinSize = 0; + uint8_t* fatBinText = NULL; + size_t kernelNum = registeredKernels.size(); + if (GetFatbinInfo(fatBinHeader, ®isteredKernels, + (uint8_t **)&fatBinText, &fatBinSize) != 0) { + tool::Logging(LOG_ERROR, func_name, "error getting fatbin info\n"); + return NULL; + } + kernelNum = registeredKernels.size() - kernelNum; + void ** fatBinHandle = (void**)calloc(1, 0x58); + // uint8_t* fatBinTextCopy = (uint8_t*)malloc(fatBinSize); //todo + // memcpy(fatBinTextCopy, fatBinText, fatBinSize); // data needs to be in the heap instead of the stack or the data segment for ucx rdma + + regIOVList.push_back(new RegisterIOV()); + regIOVList.back()->PushSubRequestType(__CUDA_REGISTER_FAT_BINARY); + regIOVList.back()->Push64BitPointer(fatBinHandle); + regIOVList.back()->Push(fatBinSize); + regIOVList.back()->Push(kernelNum); + regIOVList.back()->Push(fatBinText, fatBinSize); + tool::Logging(LOG_REGS, func_name, "fatBinHandle: %p, fatBinSize: %zu, kernelNum: %zu\n", fatBinHandle, fatBinSize, kernelNum); + + // free(fatBinTextCopy); + return fatBinHandle; + //return func_entry(fatCubin); +} + +extern "C" void __cudaRegisterVar(void 
**fatCubinHandle, char *hostVar, char *deviceAddress, + const char *deviceName, int ext, size_t size, int constant, + int global) { + const char* func_name = "__cudaRegisterVar"; + HookLog(func_name, false, LOG_REGS); + using func_ptr = void (*)(void **, char *, char *, const char *, int, size_t, int, int); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + tool::Logging(LOG_REGS, func_name, "hostVar: %p , deviceAddr: %p, deviceName: %s , ext: %d , size: %zu , constant: %d , global: %d\n", hostVar, deviceAddress, deviceName, ext, size, constant, global); + // std::cout << " ,deviceName:" << deviceName << " ,ext:" << ext << " ,size:" << size << " ,constant:" << constant + // << " ,global:" << global << std::endl; + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(uint64_t) + sizeof(size_t)+strlen(deviceAddress)+1 + sizeof(size_t)+strlen(deviceName)+1 + sizeof(int)*3 + sizeof(size_t)); + + regIOVList.back()->PushSubRequestType(__CUDA_REGISTER_VAR); + regIOVList.back()->Push64BitPointer(fatCubinHandle); + regIOVList.back()->Push64BitPointer(hostVar); + regIOVList.back()->PushCString(deviceName); + tool::Logging(LOG_REGS, func_name, "fatCubinHandle: %p, hostVar: %p", fatCubinHandle, hostVar); + + return; + //return func_entry(fatCubinHandle, hostVar, deviceAddress, deviceName, ext, size, constant, global); +} + +extern "C" void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun, char *deviceFun, + const char *deviceName, int thread_limit, uint3 *tid, + uint3 *bid, dim3 *bDim, dim3 *gDim, int *wSize) { + const char* func_name = "__cudaRegisterFunction"; + HookLog(func_name, false, LOG_REGS); + using func_ptr = + void (*)(void **, const char *, char *, const char *, int, uint3 *, uint3 *, dim3 *, dim3 *, int *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + // tool::Logging(LOG_REGS, func_name, "fatCubinHandle: %p, hostFun pointer: %p, deviceFun: %s", fatCubinHandle, hostFun, deviceFun); + + 
KernelInfo_t *info = GetKernelInfoByKernelName(®isteredKernels, (char *)deviceName); + if (info == NULL) { + tool::Logging(LOG_ERROR, func_name, "error: kernel info not found\n"); + return; + } + + regIOVList.back()->PushSubRequestType(__CUDA_REGISTER_FUNCTION); + regIOVList.back()->Push64BitPointer(fatCubinHandle); + regIOVList.back()->Push64BitPointer(hostFun); + regIOVList.back()->PushCString(deviceName); + regIOVList.back()->Push(info->paramNum); + tool::Logging(LOG_REGS, func_name, "fatCubinHandle: %p, hostFun: %p, paramNum: %d\n", fatCubinHandle, hostFun, info->paramNum); + + info->host_fun = (void*)hostFun; + mapHost2KernelInfo.insert({(uint64_t)hostFun, info}); + + // // free(newDeviceFun); + // free(newDeviceName); + + return; + //return func_entry(fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit, tid, bid, bDim, gDim, wSize); +} + +extern "C" void __cudaRegisterFatBinaryEnd(void **fatCubinHandle) { + const char* func_name = "__cudaRegisterFatBinaryEnd"; + HookLog(func_name, false, LOG_REGS); + using func_ptr = void (*)(void **); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + regIOVList.back()->PushThreadID(ttID); + registeredKernels.clear(); // no need to free them since they are stored in the mapHost2KernelInfo + + return func_entry(fatCubinHandle); // it is neccessary to call +} + +extern "C" void __cudaUnregisterFatBinary(void **fatCubinHandle) { + const char* func_name = "__cudaUnregisterFatBinary"; + DestoryResources(); + // HookLog(func_name); + // using func_ptr = void (*)(void **); + // auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t)); + // reqBuf.PushRequestType(__CUDA_UNREGISTER_FAT_BINARY); + // reqBuf.Push64BitPointer(fatCubinHandle); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + + // std::call_once(destroyFlag, ClientDestory); + + //return func_entry(fatCubinHandle); +} + +/* 
---- CUDA Runtime Execution API ---- */ + +cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, + size_t sharedMem, cudaStream_t stream) { + const char* func_name = "cudaLaunchKernel"; + HookLog(func_name); + using func_ptr = cudaError_t (*)(const void *, dim3, dim3, void **, size_t, cudaStream_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + auto it = mapHost2KernelInfo.find((uint64_t)func); + if (it == mapHost2KernelInfo.end()) { + tool::Logging(LOG_ERROR, func_name, "error: kernel info not found\n"); + return cudaErrorInvalidDeviceFunction; + } + KernelInfo_t *info = it->second; + tool::Logging(LOG_DEBUG, func_name, "kernel info found(name: %s, paramSize: %zd, paramNum: %zd)\n", info->name, info->paramSize, info->paramNum); + + uint8_t* paraValList = (uint8_t*)malloc(info->paramSize); + for (size_t j = 0; j < info->paramNum; j++) { + memcpy(paraValList + info->paramOffsets[j], args[j], info->paramSizes[j]); + // std::cout << "arg[" << j << "]: " << "size=" << info->paramSizes[j] << " " << "offset=" << info->paramOffsets[j] << std::endl; + // if (info->paramSizes[j] == sizeof(uint64_t)) { + // void* actualVal; + // memcpy(&actualVal, args[j], info->paramSizes[j]); + // printf("arg[%zd]: %p ", j, actualVal); + // } + } + // printf("\n"); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(dim3)*2 + sizeof(size_t)+info->paramSize + sizeof(size_t) + sizeof(uint64_t) + sizeof(size_t) + 2*sizeof(size_t)+2*sizeof(uint16_t)*info->paramNum); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDA_LAUNCH_KERNEL); + reqBuf.Push64BitPointer(func); // func is host function + reqBuf.PushCString(info->name); + reqBuf.Push(gridDim); + reqBuf.Push(blockDim); + reqBuf.Push(paraValList, info->paramSize); //? 
+ reqBuf.Push(sharedMem); + reqBuf.Push64BitPointer(stream); + reqBuf.Push(info->paramNum); + reqBuf.Push(info->paramOffsets, info->paramNum); + reqBuf.Push(info->paramSizes, info->paramNum); + + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + + free(paraValList); + + return cudaSuccess; + + // return func_entry(func, gridDim, blockDim, args, sharedMem, stream); +} + +cudaError_t cudaFuncGetAttributes(struct cudaFuncAttributes *attr, const void *func) { + const char* func_name = "cudaFuncGetAttributes"; + HookLog(func_name); + using func_ptr = cudaError_t (*)(struct cudaFuncAttributes *, const void *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDA_FUNC_GET_ATTRIBUTES); + reqBuf.Push64BitPointer(func); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(attr); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + return cudaSuccess; + + //return func_entry(attr, func); +} + +const char *cudaGetErrorName(cudaError_t error) { + const char* func_name = "cudaGetErrorName"; + HookLog(func_name); + using func_ptr = const char *(*)(cudaError_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + return func_entry(error); +} + +/* ---- CUDA Runtime Memory API ---- */ + +cudaError_t cudaMalloc(void** devPtr, size_t size) { + const char* func_name = "cudaMalloc"; + HookLog(func_name); + using func_ptr = cudaError_t (*)(void **, size_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + std::call_once(registerFlag, ClientEndpoint::SendRegisterRequest, clientEpObj, true); // send the register requests first + + // bool essential = !(tool::CheckPyStackTrace("forward")); + bool essential = true; + // std::string filename = "cudaMalloc-trace-" + std::to_string(processID) + ".log"; + if (ttID == 1) { + if 
(curIter > 1) { // upon checkpoint, the memory state will be saved + essential = false; + } + else { + essential = !(tool::CheckPyStackTrace("forward")); + } + // tool::PrintPyStackTrace(filename, true); + } + else { + essential = false; + // tool::PrintStackTrace(filename, true); + } + + if (isTraining == false && essential == false) { // todo: maybe ttID > 1 + isTraining = true; + tool::Logging(LOG_DEBUG, func_name, "start training\n"); + + // busy waiting for 10ms, avoid the deadlock + auto wait_start = std::chrono::high_resolution_clock::now(); + while (true) { + auto wait_now = std::chrono::high_resolution_clock::now(); + auto elapsed = std::chrono::duration_cast(wait_now - wait_start).count(); + if (elapsed >= 1000) { + break; + } + } + } + + RequestIOV reqBuf = RequestIOV(); + // RequestBuffer reqBuf = RequestBuffer(sizeof(size_t)); + reqBuf.PushRequestType(CUDA_MALLOC); + reqBuf.Push(size); + reqBuf.Push(essential); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(devPtr); // pass a pointer instead of a value + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + + // clientEpObj->ReceiveResponse(sizeof(uint64_t), devPtr); // pass a pointer that refers to a unit storing the CUDA memory address + tool::Logging(LOG_DEBUG, func_name, "[pid:%d, tid:%d, ttid:%d] allocated devPtr = %p, size = %zu, essential = %d\n\n", processID, threadID, ttID, *devPtr, size, essential); + return cudaSuccess; + + //return func_entry(devPtr, size); +} + +cudaError_t cudaMallocHost(void **ptr, size_t size) { + const char* func_name = "cudaMallocHost"; + HookLog(func_name); + using func_ptr = cudaError_t (*)(void **, size_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + *ptr = malloc(size); + return cudaSuccess; + + //return func_entry(ptr, size); +} + +cudaError_t cudaHostAlloc(void **pHost, size_t size, unsigned int flags) { + const char* func_name = "cudaHostAlloc"; + HookLog(func_name); + using func_ptr = cudaError_t (*)(void **, size_t, 
unsigned int); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + if (flags != 0x00) { // only support cudaHostAllocDefault + return cudaErrorInvalidValue; + } + + *pHost = malloc(size); + return cudaSuccess; + + //return func_entry(pHost, size, flags); +} + +cudaError_t cudaMemset(void *devPtr, int value, size_t count) { + const char* func_name = "cudaMemset"; + HookLog(func_name); + using func_ptr = cudaError_t (*)(void *, int, size_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(int) + sizeof(size_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDA_MEMSET); + reqBuf.Push64BitPointer(devPtr); + reqBuf.Push(value); + reqBuf.Push(count); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return cudaSuccess; + + //return func_entry(devPtr, value, count); +} + +cudaError_t cudaMemsetAsync(void *devPtr, int value, size_t count, cudaStream_t stream) { + const char* func_name = "cudaMemsetAsync"; + HookLog(func_name); + using func_ptr = cudaError_t (*)(void *, int, size_t, cudaStream_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(int) + sizeof(size_t) + sizeof(uint64_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDA_MEMSET_ASYNC); + reqBuf.Push64BitPointer(devPtr); + reqBuf.Push(value); + reqBuf.Push(count); + reqBuf.Push64BitPointer(stream); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return cudaSuccess; + + //return func_entry(devPtr, value, count); +} + +cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, enum cudaMemcpyKind kind) { + const char* func_name = "cudaMemcpy"; + HookLog(func_name); + using func_ptr = cudaError_t (*)(void *, 
const void *, size_t, enum cudaMemcpyKind); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(cudaMemcpyKind) + sizeof(size_t) + sizeof(uint64_t) + sizeof(uint64_t)); + + RequestIOV reqBuf = RequestIOV(); + if (kind == cudaMemcpyHostToDevice) { + // uint8_t headers[sizeof(cudaMemcpyKind) + sizeof(size_t) + sizeof(uint64_t) + sizeof(int)]; + uint8_t headers[sizeof(cudaMemcpyKind) + sizeof(size_t) + sizeof(uint64_t)]; + size_t headerSize = 0; + memcpy(headers + headerSize, &kind, sizeof(cudaMemcpyKind)); + headerSize += sizeof(cudaMemcpyKind); + memcpy(headers + headerSize, &count, sizeof(size_t)); + headerSize += sizeof(size_t); + memcpy(headers + headerSize, &dst, sizeof(uint64_t)); + headerSize += sizeof(uint64_t); + + reqBuf.PushRequestType(CUDA_MEMCPY_H2D); + reqBuf.Push((uint8_t*)src, count); + + clientEpObj->SendRequestH2D(&reqBuf, headers, headerSize); + tool::Logging(LOG_DEBUG, func_name, "H2D: Host -> Device(%p) with %zu B\n", dst, count); + } + else if (kind == cudaMemcpyDeviceToHost) { + reqBuf.PushRequestType(CUDA_MEMCPY_D2H); + reqBuf.Push(kind); + reqBuf.Push(count); + reqBuf.Push64BitPointer(src); + + tool::Logging(LOG_DEBUG, func_name, "D2H: Device(%p) -> Host with %zu B\n", src, count); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push((uint8_t*)dst, count); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + } + else if (kind == cudaMemcpyDeviceToDevice) { + reqBuf.PushRequestType(CUDA_MEMCPY_D2D); + reqBuf.Push(kind); + reqBuf.Push(count); + reqBuf.Push64BitPointer(src); + reqBuf.Push64BitPointer(dst); + clientEpObj->SendRequest(&reqBuf); + tool::Logging(LOG_DEBUG, func_name, "D2D: Device(%p) -> Device(%p) with %zu B\n", src, dst, count); + } + else { + tool::Logging(LOG_ERROR, func_name, "failed: invalid direction: %d\n", kind); + return cudaErrorInvalidMemcpyDirection; + //todo; + } + + return cudaSuccess; + //return func_entry(dst, src, count, kind); +} + +cudaError_t 
cudaMemcpyAsync(void *dst, const void *src, size_t count, + enum cudaMemcpyKind kind, cudaStream_t stream) { + const char* func_name = "cudaMemcpyAsync"; + HookLog(func_name); + using func_ptr = cudaError_t (*)(void *, const void *, size_t, enum cudaMemcpyKind, cudaStream_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + RequestIOV reqBuf = RequestIOV(); + if (kind == cudaMemcpyHostToDevice) { + BatchInfo_t batchInfo; + if (clientEpObj->_shmOpt->ReadCurBatchInfo(&batchInfo) == false) { + tool::Logging(LOG_ERROR, func_name, "failed: cannot read CurType from shared memory\n"); + return cudaErrorUnknown; + } + else { + tool::Logging(LOG_DEBUG, func_name, "batchInfo curType: %d, curSize: %zu\n", batchInfo.curType, batchInfo.curBatchSize); + } + + // tool::PrintPyStackTrace("cudaMemcpy-trace.log", true); + + uint8_t headers[sizeof(cudaMemcpyKind)+sizeof(size_t)+sizeof(uint64_t)+sizeof(uint8_t)+sizeof(uint64_t)]; + size_t headerSize = 0; + memcpy(headers + headerSize, &kind, sizeof(cudaMemcpyKind)); + headerSize += sizeof(cudaMemcpyKind); + memcpy(headers + headerSize, &count, sizeof(size_t)); + headerSize += sizeof(size_t); + memcpy(headers + headerSize, &stream, sizeof(uint64_t)); + headerSize += sizeof(uint64_t); + memcpy(headers + headerSize, &batchInfo.curType, sizeof(uint8_t)); + headerSize += sizeof(uint8_t); + memcpy(headers + headerSize, &dst, sizeof(uint64_t)); + headerSize += sizeof(uint64_t); + + reqBuf.PushRequestType(CUDA_MEMCPY_ASYNC_H2D); + if (batchInfo.curType != MEMCPY_OTHER) { // with data preloading optimization, data is already in remote node + clientEpObj->_copySize += count; + if (clientEpObj->_copySize >= batchInfo.curBatchSize) { + clientEpObj->_shmOpt->ResetBatchInfo(MEMCPY_OTHER); // avoid conflicts with copy model parameters in DDP + clientEpObj->_copySize = 0; + } + } + else { // without data preloading, data will be sent to remote node + reqBuf.Push((uint8_t*)src, count); + } + clientEpObj->SendRequestH2D(&reqBuf, 
headers, headerSize); + tool::Logging(LOG_DEBUG, func_name, "H2D: Host -> Device(%p) with %zu B\n", dst, count); + + CheckIteration(dst, count); + } + else if (kind == cudaMemcpyDeviceToHost) { + cudaStreamSynchronize(stream); // for eager/rndv synchronization between host and device + reqBuf.PushRequestType(CUDA_MEMCPY_ASYNC_D2H); + reqBuf.Push(kind); + reqBuf.Push(count); + reqBuf.Push64BitPointer(stream); + reqBuf.Push64BitPointer(src); + + tool::Logging(LOG_DEBUG, func_name, "D2H: Device(%p) -> Host with %zu B\n", src, count); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push((uint8_t*)dst, count); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + } + else if (kind == cudaMemcpyDeviceToDevice) { + reqBuf.PushRequestType(CUDA_MEMCPY_ASYNC_D2D); + reqBuf.Push(kind); + reqBuf.Push(count); + reqBuf.Push64BitPointer(stream); + reqBuf.Push64BitPointer(src); + reqBuf.Push64BitPointer(dst); + clientEpObj->SendRequest(&reqBuf); + tool::Logging(LOG_DEBUG, func_name, "D2D: Device(%p) -> Device(%p) with %zu B\n", src, dst, count); + } + else { + tool::Logging(LOG_ERROR, func_name, "failed: invalid direction(%d)\n", kind); + return cudaErrorInvalidMemcpyDirection; + } + + return cudaSuccess; + //return func_entry(dst, src, count, kind); +} + +cudaError_t cudaMemcpyToSymbol(const void *symbol, const void *src, size_t count, + size_t offset, enum cudaMemcpyKind kind) { + const char* func_name = "cudaMemcpyToSymbol"; + HookLog(func_name); + using func_ptr = cudaError_t (*)(const void *, const void *, size_t, size_t, enum cudaMemcpyKind); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(cudaMemcpyKind) + sizeof(size_t)*2 + sizeof(uint64_t) + sizeof(size_t)+count); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDA_MEMCPY_TO_SYMBOL); + reqBuf.Push(kind); + reqBuf.Push(count); + reqBuf.Push(offset); + + if (kind == cudaMemcpyHostToDevice || kind == cudaMemcpyDefault) { + 
reqBuf.Push64BitPointer(symbol); + reqBuf.Push((uint8_t*)src, count); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + + // clientEpObj->AddIOV(count, src); + // clientEpObj->SendRequest(false); + tool::Logging(LOG_DEBUG, func_name, "send host data to remote device symbol\n"); + } + else { + return cudaErrorInvalidMemcpyDirection; + } + + return cudaSuccess; + //return func_entry(symbol, src, count, offset, kind); +} + +cudaError_t cudaMemGetInfo(size_t *free, size_t *total) { + const char* func_name = "cudaMemGetInfo"; + HookLog(func_name); + using func_ptr = cudaError_t (*)(size_t *, size_t *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(size_t) + sizeof(size_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDA_MEM_GET_INFO); + int tmpDev = 0; + reqBuf.Push(tmpDev); // dummy + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(free); + resBuf.Push(total); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + + return cudaSuccess; + + //return func_entry(free, total); +} + +cudaError_t cudaFree(void *devPtr) { + const char* func_name = "cudaFree"; + HookLog(func_name); + using func_ptr = cudaError_t (*)(void *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDA_FREE); + reqBuf.Push64BitPointer(devPtr); + + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + clientEpObj->SendRequest(&reqBuf); + return cudaSuccess; + //return func_entry(devPtr); +} + +cudaError_t cudaFreeHost(void *ptr) { + const char* func_name = "cudaFreeHost"; + HookLog(func_name); + using func_ptr = cudaError_t (*)(void *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + free(ptr); + return cudaSuccess; + //return func_entry(ptr); +} + 
+/* ---- CUDA Runtime Device API ---- */ + +cudaError_t cudaGetDeviceProperties(struct cudaDeviceProp *prop, int device) { + const char* func_name = "cudaGetDeviceProperties"; + HookLog(func_name, false); + using func_ptr = cudaError_t (*)(struct cudaDeviceProp *, int); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + +#ifdef GV_GPUMAP + return (gpuIdMap->GetGPUprop(device, prop)) ? cudaSuccess : cudaErrorInvalidDevice; +#endif + + // RequestBuffer reqBuf = RequestBuffer(sizeof(int)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDA_GET_DEVICE_PROPERTIES); + int gpuIdInNode = device; + reqBuf.Push(gpuIdInNode); + + // tool::Logging(LOG_INFO, func_name, "device = %d, gpuIdInNode = %d\n", device, gpuIdInNode); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(prop); + + SwitchClientEp(device); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + SwitchClientEp(myDevIdx); + + return cudaSuccess; + + //return func_entry(prop, device); +} + +cudaError_t cudaDeviceGetAttribute(int *value, enum cudaDeviceAttr attr, int device) { + const char* func_name = "cudaDeviceGetAttribute"; + HookLog(func_name); + using func_ptr = cudaError_t (*)(int *, enum cudaDeviceAttr, int); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + +/* local optimization + RequestBuffer reqBuf = RequestBuffer(sizeof(int)); + reqBuf.PushRequestType(CUDA_GET_DEVICE_PROPERTIES); + reqBuf.Push(device); + clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + clientEpObj->SendRequest(); + + struct cudaDeviceProp deviceProp; + clientEpObj->ReceiveResponse(sizeof(struct cudaDeviceProp), &deviceProp); + + switch (attr) { + case cudaDevAttrMaxThreadsPerBlock: + *value = deviceProp.maxThreadsPerBlock; + break; + case cudaDevAttrMaxBlockDimX: + *value = deviceProp.maxThreadsDim[0]; + break; + case cudaDevAttrMaxBlockDimY: + *value = deviceProp.maxThreadsDim[1]; + break; + case cudaDevAttrMaxBlockDimZ: + *value = deviceProp.maxThreadsDim[2]; + 
break; + case cudaDevAttrMaxGridDimX: + *value = deviceProp.maxGridSize[0]; + break; + case cudaDevAttrMaxGridDimY: + *value = deviceProp.maxGridSize[1]; + break; + case cudaDevAttrMaxGridDimZ: + *value = deviceProp.maxGridSize[2]; + break; + case cudaDevAttrMaxSharedMemoryPerBlock: + *value = static_cast(deviceProp.sharedMemPerBlock); + break; + case cudaDevAttrTotalConstantMemory: + *value = static_cast(deviceProp.totalConstMem); + break; + case cudaDevAttrWarpSize: + *value = deviceProp.warpSize; + break; + case cudaDevAttrComputeMode: + *value = deviceProp.computeMode; + break; + case cudaDevAttrComputeCapabilityMajor: + *value = deviceProp.major; + break; + case cudaDevAttrComputeCapabilityMinor: + *value = deviceProp.minor; + break; + case cudaDevAttrMultiProcessorCount: + *value = deviceProp.multiProcessorCount; + break; + case cudaDevAttrClockRate: + *value = deviceProp.clockRate; + break; + case cudaDevAttrIntegrated: + *value = deviceProp.integrated; + break; + // ... Handling of other attributes ... 
+ default: + // Handles unknown or unsupported attributes + *value = -1; + break; + } + + if (*value == -1) { + tool::Logging(LOG_ERROR, func_name, "unsupported attribute\n"); + return cudaErrorInvalidValue; + } + else { + return cudaSuccess; + } +*/ + // RequestBuffer reqBuf = RequestBuffer(sizeof(enum cudaDeviceAttr) + sizeof(int)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDA_DEVICE_GET_ATTRIBUTE); + reqBuf.Push(attr); + int gpuIdInNode = device; +#ifdef GV_GPUMAP + gpuIdMap->GetGPUId(device, &gpuIdInNode); +#endif + reqBuf.Push(gpuIdInNode); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(value); + int oldDevice = clientEpObj->_myDevIdx; + SwitchClientEp(device); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + SwitchClientEp(oldDevice); + + return cudaSuccess; + + //return func_entry(value, attr, device); +} + +cudaError_t cudaGetDeviceCount(int* count) { + const char* func_name = "cudaGetDeviceCount"; + + using func_ptr = cudaError_t (*)(int *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + if (connectionObj == nullptr) { + // when creating the ucp connection, cudaGetDeviceCount() will be called + *count = 0; + } + else { + HookLog(func_name, false); + // RequestBuffer reqBuf = RequestBuffer(sizeof(int)); + // RequestIOV reqBuf = RequestIOV(); + // reqBuf.PushRequestType(CUDA_GET_DEVICE_COUNT); + // int tmpDev = 0; + // reqBuf.Push(tmpDev); // dummy + + // RequestIOV resBuf = RequestIOV(); + // resBuf.Push(count); + // clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + // Configure* config = new Configure("config.json", true); + *count = config_->GetReqGPUnum(); + + // *count = 1; + } + return cudaSuccess; + + //return func_entry(count); +} + +cudaError_t cudaGetDevice(int *device) { + const char* func_name = "cudaGetDevice"; + HookLog(func_name, false); + using func_ptr = cudaError_t (*)(int *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + *device = myDevIdx; // 
return the mapped device id + + tool::Logging(LOG_DEBUG, func_name, "[pid:%d, tid:%d] device = %d\n", processID, threadID, *device); + + // *device = 0; + // todo: recv device id from remote gpu node + return cudaSuccess; + + //return func_entry(device); +} + +cudaError_t cudaSetDevice(int device) { + const char* func_name = "cudaSetDevice"; + HookLog(func_name, false); + using func_ptr = cudaError_t (*)(int); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + // void* callstack[128]; + // int frames = backtrace(callstack, 128); + // char** strs = backtrace_symbols(callstack, frames); + // for (int i = 0; i < frames; ++i) { + // std::cout << strs[i] << std::endl; + // } + // free(strs); + + // exit(EXIT_FAILURE); + + tool::Logging(LOG_DEBUG, func_name, "[pid:%d, tid:%d, ttid:%d] device = %d\n", processID, threadID, ttID, device); + + myDevIdx = device; + // if (clientEpObj) { // if not null, means that the current clientEp has been initialized and ready to participate in the training + // SwitchClientEp(device); + // } + + // RequestIOV reqBuf = RequestIOV(); + // RequestBuffer reqBuf = RequestBuffer(sizeof(int)); + // reqBuf.PushRequestType(CUDA_SET_DEVICE); + // reqBuf.Push(device); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(&reqBuf); + return cudaSuccess; + + //return func_entry(device); +} + +cudaError_t cudaDeviceSynchronize() { + const char* func_name = "cudaDeviceSynchronize"; + HookLog(func_name); + using func_ptr = cudaError_t (*)(); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(int)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDA_DEVICE_SYNCHRONIZE); + int tmpDev = 0; + reqBuf.Push(tmpDev); // dummy + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + // func_entry(); + clientEpObj->SendRequest(&reqBuf); + return cudaSuccess; + //return func_entry(); 
+} + +/* ---- CUDA Runtime Stream API ---- */ + +cudaError_t cudaStreamIsCapturing(cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus) { + const char* func_name = "cudaStreamIsCapturing"; + HookLog(func_name, false); + using func_ptr = cudaError_t (*)(cudaStream_t, enum cudaStreamCaptureStatus *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(cudaStream_t) + sizeof(enum cudaStreamCaptureStatus *)); + + if (clientEpObj == nullptr) { + *pCaptureStatus = cudaStreamCaptureStatusNone; + } + else { + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDA_STREAM_IS_CAPTURING); + reqBuf.Push64BitPointer(stream); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(pCaptureStatus); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + } + + tool::Logging(LOG_DEBUG, func_name, "success, stream = %p, captureStatus = %d\n", stream, *pCaptureStatus); + return cudaSuccess; + + //return func_entry(stream, pCaptureStatus); +} + +cudaError_t cudaStreamGetCaptureInfo(cudaStream_t stream, enum cudaStreamCaptureStatus *pCaptureStatus, + unsigned long long *pId) { + const char* func_name = "cudaStreamGetCaptureInfo"; + HookLog(func_name); + using func_ptr = cudaError_t (*)(cudaStream_t, enum cudaStreamCaptureStatus *, unsigned long long *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(cudaStream_t) + sizeof(enum cudaStreamCaptureStatus *) + sizeof(unsigned long long *)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDA_STREAM_GET_CAPTURE_INFO); + reqBuf.Push64BitPointer(stream); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(pCaptureStatus); + resBuf.Push(pId); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + tool::Logging(LOG_DEBUG, func_name, "success, status = %d, pId = %llu\n", *pCaptureStatus, *pId); + + return cudaSuccess; + + //return func_entry(stream, pCaptureStatus, pId); 
+} + +cudaError_t cudaStreamWaitEvent(cudaStream_t stream, cudaEvent_t event, + unsigned int flags) { + const char* func_name = "cudaStreamWaitEvent"; + HookLog(func_name); + using func_ptr = cudaError_t (*)(cudaStream_t, cudaEvent_t, unsigned int); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(cudaStream_t) + sizeof(cudaEvent_t) + sizeof(uint)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDA_STREAM_WAIT_EVENT); + reqBuf.Push64BitPointer(stream); + reqBuf.Push64BitPointer(event); + reqBuf.Push(flags); + + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return cudaSuccess; + //end + //return func_entry(stream, event, flags); +} + +cudaError_t cudaStreamSynchronize(cudaStream_t stream) { + const char* func_name = "cudaStreamSynchronize"; + HookLog(func_name); + using func_ptr = cudaError_t (*)(cudaStream_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(cudaStream_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDA_STREAM_SYNCHRONIZE); + reqBuf.Push64BitPointer(stream); + + cudaError_t result = cudaSuccess; + RequestIOV resBuf = RequestIOV(); + resBuf.Push(&result); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + // clientEpObj->SendRequest(&reqBuf); + // tool::Logging(LOG_DEBUG, func_name, "send cudaStreamSynchronize request, stream = %p\n", stream); + + return result; + + //return func_entry(stream); +} + +cudaError_t cudaStreamCreate(cudaStream_t *pStream) { + const char* func_name = "cudaStreamCreate"; + HookLog(func_name); + using func_ptr = cudaError_t (*)(cudaStream_t *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + // RequestBuffer reqBuf = RequestBuffer(); + 
RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDA_STREAM_CREATE); + *pStream = NULL; + reqBuf.Push(pStream); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(pStream); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + + tool::Logging(LOG_DEBUG, func_name, "create stream success, pStream = %p\n", *pStream); + + return cudaSuccess; +} + +cudaError_t cudaStreamCreateWithFlags(cudaStream_t *pStream, unsigned int flags) { + const char* func_name = "cudaStreamCreateWithFlags"; + HookLog(func_name); + using func_ptr = cudaError_t (*)(cudaStream_t *, unsigned int); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDA_STREAM_CREATE_WITH_FLAGS); + *pStream = NULL; + reqBuf.Push(pStream); + reqBuf.Push(flags); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(pStream); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + tool::Logging(LOG_DEBUG, func_name, "create stream with flags success, pStream = %p\n", *pStream); + + return cudaSuccess; + + //return func_entry(pStream, flags); +} + +cudaError_t cudaStreamCreateWithPriority(cudaStream_t *pStream, unsigned int flags, + int priority) { + const char* func_name = "cudaStreamCreateWithPriority"; + HookLog(func_name); + using func_ptr = cudaError_t (*)(cudaStream_t *, unsigned int, int); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint) + sizeof(int)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDA_STREAM_CREATE_WITH_PRIORITY); + *pStream = NULL; + reqBuf.Push(pStream); + reqBuf.Push(flags); + reqBuf.Push(priority); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(pStream); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + tool::Logging(LOG_DEBUG, func_name, "create stream with priority success, pStream = %p\n", *pStream); + + return cudaSuccess; + 
//return func_entry(pStream, flags, priority); +} + +cudaError_t cudaStreamDestroy(cudaStream_t stream) { + const char* func_name = "cudaStreamDestroy"; + HookLog(func_name); + using func_ptr = cudaError_t (*)(cudaStream_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(cudaStream_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDA_STREAM_DESTROY); + reqBuf.Push64BitPointer(stream); + + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return cudaSuccess; + //return func_entry(stream); +} + +/* ---- CUDA Runtime Event API ---- */ + +cudaError_t cudaEventRecord(cudaEvent_t event, cudaStream_t stream) { + const char* func_name = "cudaEventRecord"; + HookLog(func_name); + using func_ptr = cudaError_t (*)(cudaEvent_t, cudaStream_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + // uint32_t size = sizeof(uint64_t) * 2; + // RequestBuffer reqBuf = RequestBuffer(size); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDA_EVENT_RECORD); + reqBuf.Push64BitPointer(event); + reqBuf.Push64BitPointer(stream); + + // clientEpObj->AddIOV(reqBuf.GetSize(),reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return cudaSuccess; + // return func_entry(event, stream); +} + +cudaError_t cudaEventCreate(cudaEvent_t *event) { + const char* func_name = "cudaEventCreate"; + HookLog(func_name); + using func_ptr = cudaError_t (*)(cudaEvent_t *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + // uint32_t size = 0; + // RequestBuffer reqBuf = RequestBuffer(size); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDA_EVENT_CREATE); + *event = NULL; + reqBuf.Push(event); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(event); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + 
tool::Logging(LOG_DEBUG, func_name, "allocated eventPtr = %p\n", *event); + return cudaSuccess; + // return func_entry(event); +} + +cudaError_t cudaEventCreateWithFlags(cudaEvent_t *event, unsigned int flags) { + const char* func_name = "cudaEventCreateWithFlags"; + HookLog(func_name); + using func_ptr = cudaError_t (*)(cudaEvent_t *, unsigned int); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + // uint32_t size = sizeof(unsigned int); + // RequestBuffer reqBuf = RequestBuffer(size); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDA_EVENT_CREATE_WITH_FLAGS); + *event = NULL; + reqBuf.Push(event); + reqBuf.Push(flags); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(event); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + + tool::Logging(LOG_DEBUG, func_name, "allocated eventPtr = %p\n", *event); + return cudaSuccess; + // return func_entry(event, flags); +} + +cudaError_t cudaEventQuery(cudaEvent_t event) { //todo: DDP + const char* func_name = "cudaEventQuery"; + // tool::Logging(LOG_DEBUG, func_name, "[pid:%d, tid:%d] ready to query event, eventPtr = %p\n", processID, threadID, event); + HookLog(func_name); + using func_ptr = cudaError_t (*)(cudaEvent_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDA_EVENT_QUERY); + reqBuf.Push64BitPointer(event); + + cudaError_t result = cudaSuccess; + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(&result); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + + if(result == cudaSuccess) + tool::Logging(LOG_DEBUG, func_name, "success\n"); + else + tool::Logging(LOG_DEBUG, func_name, "error: %s\n",cudaGetErrorName(result)); + return result; // return the event query result + + // //return func_entry(event); +} + +cudaError_t cudaEventDestroy(cudaEvent_t event) { + const char* func_name = "cudaEventDestroy"; + HookLog(func_name); + using func_ptr = cudaError_t (*)(cudaEvent_t); 
+ auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + // uint32_t size = sizeof(uint64_t); + // RequestBuffer reqBuf = RequestBuffer(size); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDA_EVENT_DESTROY); + reqBuf.Push64BitPointer(event); + // clientEpObj->AddIOV(reqBuf.GetSize(),reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + + tool::Logging(LOG_DEBUG, func_name, "destroyed eventPtr = %p\n", event); + + return cudaSuccess; + // return func_entry(event); +} + +cudaError_t cudaEventElapsedTime(float *ms, cudaEvent_t start, cudaEvent_t end) { + const char* func_name = "cudaEventElapsedTime"; + HookLog(func_name); + using func_ptr = cudaError_t (*)(float *, cudaEvent_t, cudaEvent_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) * 2); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDA_EVENT_ELAPSED_TIME); + reqBuf.Push64BitPointer(start); + reqBuf.Push64BitPointer(end); + + cudaError_t result = cudaSuccess; + RequestIOV resBuf = RequestIOV(); + resBuf.Push(result); + resBuf.Push(ms); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + + return result; + //return func_entry(ms, start, end); +} + +/* ---- CUDA Runtime Other API ---- */ + +cudaError_t cudaGetLastError() { + const char* func_name = "cudaGetLastError"; + HookLog(func_name, false); + using func_ptr = cudaError_t (*)(); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + return cudaSuccess; //todo: return the remote device error +} + +cudaError_t cudaPeekAtLastError() { + const char* func_name = "cudaPeekAtLastError"; + HookLog(func_name); + using func_ptr = cudaError_t (*)(); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + return cudaSuccess; //todo: return the remote device error +} + +cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(int *numBlocks, const void *func, + int 
blockSize, size_t dynamicSMemSize) { + const char* func_name = "cudaOccupancyMaxActiveBlocksPerMultiprocessor"; + HookLog(func_name); + using func_ptr = cudaError_t (*)(int *, const void *, int, size_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + std::call_once(registerFlag, ClientEndpoint::SendRegisterRequest, clientEpObj, true); // send the register requests first + + // RequestBuffer reqBuf = RequestBuffer(sizeof(int)*2 + sizeof(uint64_t) + sizeof(int) + sizeof(size_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDA_OCCUPANCY_MAX_ACTIVE_BLOCKS_PER_MULTIPROCESSOR); + reqBuf.Push(numBlocks); + reqBuf.Push64BitPointer(func); + reqBuf.Push(blockSize); + reqBuf.Push(dynamicSMemSize); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(numBlocks); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + + return cudaSuccess; + + //return func_entry(numBlocks, func, blockSize, dynamicSMemSize); +} + +cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + int *numBlocks, const void *func, int blockSize, size_t dynamicSMemSize, unsigned int flags) { + const char* func_name = "cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags"; + HookLog(func_name); + using func_ptr = cudaError_t (*)(int *, const void *, int, size_t, unsigned int); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, func_name)); + + std::call_once(registerFlag, ClientEndpoint::SendRegisterRequest, clientEpObj, true); // send the register requests first + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(int) + sizeof(size_t) + sizeof(unsigned int)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDA_OCCUPANCY_MAX_ACTIVE_BLOCKS_PER_MULTIPROCESSOR_WITH_FLAGS); + // reqBuf.Push(numBlocks); + reqBuf.Push64BitPointer(func); + reqBuf.Push(blockSize); + reqBuf.Push(dynamicSMemSize); + reqBuf.Push(flags); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(numBlocks); + 
clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + + tool::Logging(LOG_DEBUG, func_name, "numBlocks = %d\n", *numBlocks); + return cudaSuccess; + + //return func_entry(numBlocks, func, blockSize, dynamicSMemSize); +} \ No newline at end of file diff --git a/GPU-Virtual-Service/gpu-remoting/src/client/cudnnHook.cc b/GPU-Virtual-Service/gpu-remoting/src/client/cudnnHook.cc new file mode 100644 index 0000000..0123ba8 --- /dev/null +++ b/GPU-Virtual-Service/gpu-remoting/src/client/cudnnHook.cc @@ -0,0 +1,1412 @@ +#include "../../include/hook/hook.h" + +inline size_t getSizeOfAttributeType(cudnnBackendAttributeType_t attributeType) { + size_t varSize = 0; + switch (attributeType) { + // ref: https://docs.nvidia.com/deeplearning/cudnn/api/cudnn-graph-library.html?highlight=cudnnbackendcreatedescriptor#id89 + case CUDNN_TYPE_HANDLE: + varSize = sizeof(cudnnHandle_t); + break; + case CUDNN_TYPE_DATA_TYPE: + varSize = sizeof(cudnnDataType_t); + break; + case CUDNN_TYPE_BOOLEAN: + varSize = sizeof(bool); + break; + case CUDNN_TYPE_INT64: + varSize = sizeof(int64_t); + break; + case CUDNN_TYPE_FLOAT: + varSize = sizeof(float); + break; + case CUDNN_TYPE_DOUBLE: + varSize = sizeof(double); + break; + case CUDNN_TYPE_VOID_PTR: + varSize = sizeof(uint64_t); // maybe device pointer + break; + case CUDNN_TYPE_CONVOLUTION_MODE: + varSize = sizeof(cudnnConvolutionMode_t); + break; + case CUDNN_TYPE_HEUR_MODE: + varSize = sizeof(cudnnBackendHeurMode_t); + break; + case CUDNN_TYPE_KNOB_TYPE: + varSize = sizeof(cudnnBackendKnobType_t); + break; + case CUDNN_TYPE_NAN_PROPOGATION: + varSize = sizeof(cudnnNanPropagation_t); + break; + case CUDNN_TYPE_NUMERICAL_NOTE: + varSize = sizeof(cudnnBackendNumericalNote_t); + break; + case CUDNN_TYPE_LAYOUT_TYPE: + varSize = sizeof(cudnnBackendLayoutType_t); + break; + case CUDNN_TYPE_ATTRIB_NAME: + varSize = sizeof(cudnnBackendAttributeName_t); + break; + case CUDNN_TYPE_POINTWISE_MODE: + varSize = sizeof(cudnnPointwiseMode_t); + break; + 
case CUDNN_TYPE_BACKEND_DESCRIPTOR: + varSize = sizeof(cudnnBackendDescriptor_t); + break; + case CUDNN_TYPE_GENSTATS_MODE: + varSize = sizeof(cudnnGenStatsMode_t); + break; + case CUDNN_TYPE_BN_FINALIZE_STATS_MODE: + varSize = sizeof(cudnnBnFinalizeStatsMode_t); + break; + case CUDNN_TYPE_REDUCTION_OPERATOR_TYPE: + varSize = sizeof(cudnnReduceTensorOp_t); + break; + case CUDNN_TYPE_BEHAVIOR_NOTE: + varSize = sizeof(cudnnBackendBehaviorNote_t); + break; + case CUDNN_TYPE_TENSOR_REORDERING_MODE: + varSize = sizeof(cudnnBackendTensorReordering_t); + break; + case CUDNN_TYPE_RESAMPLE_MODE: + varSize = sizeof(cudnnResampleMode_t); + break; + case CUDNN_TYPE_PADDING_MODE: + varSize = sizeof(cudnnPaddingMode_t); + break; + case CUDNN_TYPE_INT32: + varSize = sizeof(int32_t); + break; + case CUDNN_TYPE_CHAR: + varSize = sizeof(char); + break; + case CUDNN_TYPE_SIGNAL_MODE: + varSize = sizeof(cudnnSignalMode_t); + break; + case CUDNN_TYPE_FRACTION: + varSize = sizeof(cudnnFraction_t); + break; + case CUDNN_TYPE_NORM_MODE: + varSize = sizeof(cudnnBackendNormMode_t); + break; + case CUDNN_TYPE_NORM_FWD_PHASE: + varSize = sizeof(cudnnBackendNormFwdPhase_t); + break; + default: + varSize = sizeof(uint64_t); // Unknown type + break; + // case CUDNN_TYPE_RNG_DISTRIBUTION: + // varSize = sizeof(cudnnRngDistribution_t); + // break; + } + return varSize; +} + +cudnnStatus_t cudnnCreate(cudnnHandle_t *handle) { + const char* func_name = "cudnnCreate"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnHandle_t *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnCreate")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(int)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_CREATE); + *handle = NULL; + reqBuf.Push(handle); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(handle); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(handle); +} + +cudnnStatus_t 
cudnnDestroy(cudnnHandle_t handle) { + const char* func_name = "cudnnDestroy"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnHandle_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnDestroy")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_DESTROY); + reqBuf.Push64BitPointer(handle); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(handle); +} + +cudnnStatus_t cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc) { + const char* func_name = "cudnnCreateTensorDescriptor"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnTensorDescriptor_t *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnCreateTensorDescriptor")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(int)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_CREATE_TENSOR_DESCRIPTOR); + *tensorDesc = NULL; + reqBuf.Push(tensorDesc); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(tensorDesc); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + // clientEpObj->ReceiveResponse(sizeof(cudnnTensorDescriptor_t), tensorDesc); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(tensorDesc); +} + +cudnnStatus_t cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc) { + const char* func_name = "cudnnDestroyTensorDescriptor"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnTensorDescriptor_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnDestroyTensorDescriptor")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_DESTROY_TENSOR_DESCRIPTOR); + reqBuf.Push64BitPointer(tensorDesc); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // 
clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(tensorDesc); +} + +cudnnStatus_t cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, + size_t *size){ + const char* func_name = "cudnnGetTensorSizeInBytes"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(const cudnnTensorDescriptor_t, size_t *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnGetTensorSizeInBytes")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_GET_TENSOR_SIZE_IN_BYTES); + reqBuf.Push64BitPointer((cudnnTensorDescriptor_t)tensorDesc); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(size); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(tensorDesc, size); +} + +cudnnStatus_t cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc, + cudnnTensorFormat_t format, cudnnDataType_t dataType, + int n, int c, int h, int w) { + const char* func_name = "cudnnSetTensor4dDescriptor"; + HookLog(func_name); + using func_ptr = + cudnnStatus_t (*)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, int, int, int); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnSetTensor4dDescriptor")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(cudnnTensorFormat_t) + sizeof(cudnnDataType_t) + 4 * sizeof(int)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_SET_TENSOR_4D_DESCRIPTOR); + reqBuf.Push64BitPointer(tensorDesc); + reqBuf.Push(format); + reqBuf.Push(dataType); + reqBuf.Push(n); + reqBuf.Push(c); + reqBuf.Push(h); + reqBuf.Push(w); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(tensorDesc, format, dataType, n, c, h, w); +} + +cudnnStatus_t 
cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc, cudnnDataType_t dataType, + int nbDims, const int dimA[], const int strideA[]) { + const char* func_name = "cudnnSetTensorNdDescriptor"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnTensorDescriptor_t, cudnnDataType_t, int, const int[], const int[]); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnSetTensorNdDescriptor")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(cudnnDataType_t) + sizeof(int) + sizeof(size_t)+sizeof(const int)*nbDims + sizeof(size_t)+sizeof(const int)*nbDims); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_SET_TENSOR_ND_DESCRIPTOR); + reqBuf.Push64BitPointer(tensorDesc); + reqBuf.Push(dataType); + reqBuf.Push(nbDims); + reqBuf.PushConst(dimA, nbDims); + reqBuf.PushConst(strideA, nbDims); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(tensorDesc, dataType, nbDims, dimA, strideA); +} + +cudnnStatus_t cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc, cudnnTensorFormat_t format, + cudnnDataType_t dataType, int nbDims, const int dimA[]) { + const char* func_name = "cudnnSetTensorNdDescriptorEx"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnTensorDescriptor_t, cudnnTensorFormat_t, cudnnDataType_t, int, const int[]); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnSetTensorNdDescriptorEx")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(cudnnTensorFormat_t) + sizeof(cudnnDataType_t) + sizeof(int) + sizeof(size_t)+sizeof(int)*nbDims); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_SET_TENSOR_ND_DESCRIPTOR_EX); + reqBuf.Push64BitPointer(tensorDesc); + reqBuf.Push(format); + reqBuf.Push(dataType); + reqBuf.Push(nbDims); + reqBuf.PushConst(dimA, nbDims); + // 
clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + + return CUDNN_STATUS_SUCCESS; + + //return func_entry(tensorDesc, format, dataType, nbDims, dimA); +} + +cudnnStatus_t cudnnCreateTensorTransformDescriptor(cudnnTensorTransformDescriptor_t *transformDesc) { + const char* func_name = "cudnnCreateTensorTransformDescriptor"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnTensorTransformDescriptor_t *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnCreateTensorTransformDescriptor")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(int)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_CREATE_TENSOR_TRANSFORM_DESCRIPTOR); + *transformDesc = NULL; + reqBuf.Push(transformDesc); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(transformDesc); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(transformDesc); +} + +cudnnStatus_t cudnnSetTensorTransformDescriptor( + cudnnTensorTransformDescriptor_t transformDesc, const uint32_t nbDims, + const cudnnTensorFormat_t destFormat, + const int32_t padBeforeA[], const int32_t padAfterA[], + const uint32_t foldA[], const cudnnFoldingDirection_t direction) { + const char* func_name = "cudnnSetTensorTransformDescriptor"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnTensorTransformDescriptor_t, const uint32_t, const cudnnTensorFormat_t, + const int32_t[], const int32_t[], const uint32_t[], const cudnnFoldingDirection_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnSetTensorTransformDescriptor")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(uint32_t) + sizeof(cudnnTensorFormat_t) + sizeof(size_t)+sizeof(int32_t)*nbDims + sizeof(size_t)+sizeof(int32_t)*nbDims + sizeof(size_t)+sizeof(uint32_t)*nbDims + sizeof(cudnnFoldingDirection_t)); + RequestIOV reqBuf = RequestIOV(); + 
reqBuf.PushRequestType(CUDNN_SET_TENSOR_TRANSFORM_DESCRIPTOR); + reqBuf.Push64BitPointer(transformDesc); + reqBuf.PushConst(nbDims); + reqBuf.PushConst(destFormat); + reqBuf.PushConst(padBeforeA, nbDims); + reqBuf.PushConst(padAfterA, nbDims); + reqBuf.PushConst(foldA, nbDims - 2); // spatial dimension (dimensions 2 and up) + reqBuf.PushConst(direction); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(transformDesc, nbDims, destFormat, padBeforeA, padAfterA, foldA, direction); +} + +cudnnStatus_t cudnnDestroyTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc) { + const char* func_name = "cudnnDestroyTensorTransformDescriptor"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnTensorTransformDescriptor_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnDestroyTensorTransformDescriptor")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_DESTROY_TENSOR_TRANSFORM_DESCRIPTOR); + reqBuf.Push64BitPointer(transformDesc); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(transformDesc); +} + +cudnnStatus_t cudnnInitTransformDest( + const cudnnTensorTransformDescriptor_t transformDesc, + const cudnnTensorDescriptor_t srcDesc, + cudnnTensorDescriptor_t destDesc, size_t *destSizeInBytes) { + const char* func_name = "cudnnInitTransformDest"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(const cudnnTensorTransformDescriptor_t, const cudnnTensorDescriptor_t, + cudnnTensorDescriptor_t, size_t *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnInitTransformDest")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(uint64_t) + 
sizeof(uint64_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_INIT_TRANSFORM_DEST); + reqBuf.Push64BitPointer(transformDesc); + reqBuf.Push64BitPointer(srcDesc); + reqBuf.Push64BitPointer(destDesc); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(destSizeInBytes); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(transformDesc, srcDesc, destDesc, destSizeInBytes); +} + +cudnnStatus_t cudnnTransformTensorEx( + cudnnHandle_t handle, + const cudnnTensorTransformDescriptor_t transDesc, + const void *alpha, + const cudnnTensorDescriptor_t srcDesc, const void *srcData, + const void *beta,const cudnnTensorDescriptor_t destDesc, + void *destData) { + const char* func_name = "cudnnTransformTensorEx"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnHandle_t, const cudnnTensorTransformDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const void *, + const cudnnTensorDescriptor_t, void *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnTransformTensorEx")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(uint64_t) + (sizeof(size_t)+sizeof(const float))*2 + sizeof(uint64_t)*2 + sizeof(uint64_t)*2); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_TRANSFORM_TENSOR_EX); + reqBuf.Push64BitPointer(handle); + reqBuf.Push64BitPointer(transDesc); + reqBuf.PushConst((const float*)alpha); + reqBuf.Push64BitPointer(srcDesc); + reqBuf.Push64BitPointer(srcData); // Data pointer to GPU memory, not to host memory in official documetation + reqBuf.PushConst((const float*)beta); + reqBuf.Push64BitPointer(destDesc); + reqBuf.Push64BitPointer(destData); // Data pointer to GPU memory, not to host memory in official documetation + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUDNN_STATUS_SUCCESS; + + //return 
func_entry(handle, transDesc, alpha, srcDesc, srcData, beta, destDesc, destData); +} + +cudnnStatus_t cudnnTransformFilter( + cudnnHandle_t handle, + const cudnnTensorTransformDescriptor_t transDesc, + const void *alpha, + const cudnnFilterDescriptor_t srcDesc, const void *srcData, + const void *beta, + const cudnnFilterDescriptor_t destDesc, void *destData) { + const char* func_name = "cudnnTransformFilter"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnHandle_t, const cudnnTensorTransformDescriptor_t, const void *, + const cudnnFilterDescriptor_t, const void *, const void *, + const cudnnFilterDescriptor_t, void *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnTransformFilter")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(uint64_t) + (sizeof(size_t)+sizeof(const float))*2 + sizeof(uint64_t)*2 + sizeof(uint64_t)*2); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_TRANSFORM_FILTER); + reqBuf.Push64BitPointer(handle); + reqBuf.Push64BitPointer(transDesc); + reqBuf.PushConst((const float*)alpha); + reqBuf.Push64BitPointer(srcDesc); + reqBuf.Push64BitPointer(srcData); // Data pointer to GPU memory, not to host memory in official documetation + reqBuf.PushConst((const float*)beta); + reqBuf.Push64BitPointer(destDesc); + reqBuf.Push64BitPointer(destData); // Data pointer to GPU memory, not to host memory in official documetation + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(handle, transDesc, alpha, srcDesc, srcData, beta, destDesc, destData); +} + +cudnnStatus_t cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc) { + const char* func_name = "cudnnCreateFilterDescriptor"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnFilterDescriptor_t *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, 
"cudnnCreateFilterDescriptor")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(int)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_CREATE_FILTER_DESCRIPTOR); + *filterDesc = NULL; + reqBuf.Push(filterDesc); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(filterDesc); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(filterDesc); +} + +cudnnStatus_t cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc, + cudnnDataType_t dataType, cudnnTensorFormat_t format, + int nbDims, const int filterDimA[]) { + const char* func_name = "cudnnSetFilterNdDescriptor"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnFilterDescriptor_t, cudnnDataType_t, cudnnTensorFormat_t, int, const int[]); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnSetFilterNdDescriptor")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(cudnnDataType_t) + sizeof(cudnnTensorFormat_t) + sizeof(int) + sizeof(size_t)+sizeof(int)*nbDims); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_SET_FILTER_ND_DESCRIPTOR); + reqBuf.Push64BitPointer(filterDesc); + reqBuf.Push(dataType); + reqBuf.Push(format); + reqBuf.Push(nbDims); + reqBuf.PushConst(filterDimA, nbDims); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(filterDesc, dataType, format, nbDims, filterDimA); +} + +cudnnStatus_t cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc) { + const char* func_name = "cudnnDestroyFilterDescriptor"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnFilterDescriptor_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnDestroyFilterDescriptor")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t)); + RequestIOV reqBuf = RequestIOV(); + 
reqBuf.PushRequestType(CUDNN_DESTROY_FILTER_DESCRIPTOR); + reqBuf.Push64BitPointer(filterDesc); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(filterDesc); +} + +cudnnStatus_t cudnnGetFilterSizeInBytes(const cudnnFilterDescriptor_t filterDesc, size_t *size) { + const char* func_name = "cudnnGetFilterSizeInBytes"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(const cudnnFilterDescriptor_t, size_t *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnGetFilterSizeInBytes")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_GET_FILTER_SIZE_IN_BYTES); + reqBuf.Push64BitPointer(filterDesc); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(size); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(filterDesc, size); +} + +cudnnStatus_t cudnnGetFoldedConvBackwardDataDescriptors( + const cudnnHandle_t handle, const cudnnFilterDescriptor_t filterDesc, + const cudnnTensorDescriptor_t diffDesc, const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t gradDesc, const cudnnTensorFormat_t transformFormat, + cudnnFilterDescriptor_t foldedFilterDesc, cudnnTensorDescriptor_t paddedDiffDesc, + cudnnConvolutionDescriptor_t foldedConvDesc, cudnnTensorDescriptor_t foldedGradDesc, + cudnnTensorTransformDescriptor_t filterFoldTransDesc, cudnnTensorTransformDescriptor_t diffPadTransDesc, + cudnnTensorTransformDescriptor_t gradFoldTransDesc, cudnnTensorTransformDescriptor_t gradUnfoldTransDesc) { + const char* func_name = "cudnnGetFoldedConvBackwardDataDescriptors"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(const cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, + const cudnnConvolutionDescriptor_t, const 
cudnnTensorDescriptor_t, const cudnnTensorFormat_t, + cudnnFilterDescriptor_t, cudnnTensorDescriptor_t, cudnnConvolutionDescriptor_t, cudnnTensorDescriptor_t, + cudnnTensorTransformDescriptor_t, cudnnTensorTransformDescriptor_t, cudnnTensorTransformDescriptor_t, cudnnTensorTransformDescriptor_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnGetFoldedConvBackwardDataDescriptors")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(uint64_t)*4 + sizeof(cudnnTensorFormat_t) + sizeof(uint64_t)*8); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_GET_FOLDED_CONV_BACKWARD_DATA_DESCRIPTORS); + reqBuf.Push64BitPointer(handle); + reqBuf.Push64BitPointer(filterDesc); + reqBuf.Push64BitPointer(diffDesc); + reqBuf.Push64BitPointer(convDesc); + reqBuf.Push64BitPointer(gradDesc); + reqBuf.PushConst(transformFormat); + reqBuf.Push64BitPointer(foldedFilterDesc); + reqBuf.Push64BitPointer(paddedDiffDesc); + reqBuf.Push64BitPointer(foldedConvDesc); + reqBuf.Push64BitPointer(foldedGradDesc); + reqBuf.Push64BitPointer(filterFoldTransDesc); + reqBuf.Push64BitPointer(diffPadTransDesc); + reqBuf.Push64BitPointer(gradFoldTransDesc); + reqBuf.Push64BitPointer(gradUnfoldTransDesc); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(handle, filterDesc, diffDesc, convDesc, gradDesc, transformFormat, foldedFilterDesc, paddedDiffDesc, foldedConvDesc, foldedGradDesc, filterFoldTransDesc, diffPadTransDesc, gradFoldTransDesc, gradUnfoldTransDesc); +} + +cudnnStatus_t cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId) { + const char* func_name = "cudnnSetStream"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnHandle_t, cudaStream_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnSetStream")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + 
sizeof(uint64_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_SET_STREAM); + reqBuf.Push64BitPointer(handle); + reqBuf.Push64BitPointer(streamId); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(handle, streamId); +} + +cudnnStatus_t cudnnBatchNormalizationBackwardEx( + cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps, + const void *alphaDataDiff, const void *betaDataDiff, + const void *alphaParamDiff, const void *betaParamDiff, + const cudnnTensorDescriptor_t xDesc, const void *xData, + const cudnnTensorDescriptor_t yDesc, const void *yData, + const cudnnTensorDescriptor_t dyDesc, const void *dyData, + const cudnnTensorDescriptor_t dzDesc, void *dzData, + const cudnnTensorDescriptor_t dxDesc, void *dxData, + const cudnnTensorDescriptor_t dBnScaleBiasDesc, + const void *bnScaleData, const void *bnBiasData, + void *dBnScaleData, void *dBnBiasData, double epsilon, + const void *savedMean, const void *savedInvVariance, + cudnnActivationDescriptor_t activationDesc, + void *workSpace, size_t workSpaceSizeInBytes, + void *reserveSpace, size_t reserveSpaceSizeInBytes) { + const char* func_name = "cudnnBatchNormalizationBackwardEx"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)( + cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const void *, const void *, const void *, + const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, void *, + const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, void *, + void *, double, const void *, const void *, cudnnActivationDescriptor_t, void *, size_t, void *, size_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnBatchNormalizationBackwardEx")); + 
+ // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(cudnnBatchNormMode_t) + sizeof(cudnnBatchNormOps_t) + (sizeof(size_t)+sizeof(const float))*4 + sizeof(uint64_t)*15 + sizeof(double) + sizeof(uint64_t)*5 + sizeof(uint64_t)*2); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_BATCH_NORMALIZATION_BACKWARD_EX); + reqBuf.Push64BitPointer(handle); + reqBuf.Push(mode); + reqBuf.Push(bnOps); + reqBuf.PushConst((const float*)alphaDataDiff); // Pointers to scaling factors (in host memory) + reqBuf.PushConst((const float*)betaDataDiff); + reqBuf.PushConst((const float*)alphaParamDiff); + reqBuf.PushConst((const float*)betaParamDiff); + reqBuf.Push64BitPointer(xDesc); + reqBuf.Push64BitPointer(xData); // Data pointer to GPU memory + reqBuf.Push64BitPointer(yDesc); + reqBuf.Push64BitPointer(yData); // Data pointer to GPU memory + reqBuf.Push64BitPointer(dyDesc); + reqBuf.Push64BitPointer(dyData); // Data pointer to GPU memory + reqBuf.Push64BitPointer(dzDesc); + reqBuf.Push64BitPointer(dzData); // Data pointer to GPU memory + reqBuf.Push64BitPointer(dxDesc); + reqBuf.Push64BitPointer(dxData); // Data pointer to GPU memory + reqBuf.Push64BitPointer(dBnScaleBiasDesc); + reqBuf.Push64BitPointer(bnScaleData); // located in GPU memory + reqBuf.Push64BitPointer(bnBiasData); + reqBuf.Push64BitPointer(dBnScaleData); + reqBuf.Push64BitPointer(dBnBiasData); + reqBuf.Push(epsilon); + reqBuf.Push64BitPointer(savedMean); + reqBuf.Push64BitPointer(savedInvVariance); + reqBuf.Push64BitPointer(activationDesc); + reqBuf.Push64BitPointer(workSpace); + reqBuf.Push(workSpaceSizeInBytes); + reqBuf.Push64BitPointer(reserveSpace); + reqBuf.Push(reserveSpaceSizeInBytes); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(handle, mode, bnOps, alphaDataDiff, betaDataDiff, alphaParamDiff, betaParamDiff, xDesc, xData, yDesc, yData, 
dyDesc, dyData, dzDesc, dzData, dxDesc, dxData, dBnScaleBiasDesc, bnScaleData, bnBiasData, dBnScaleData, dBnBiasData, epsilon, savedMean, savedInvVariance, activationDesc, workSpace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); +} + +cudnnStatus_t cudnnBatchNormalizationForwardTrainingEx( + cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps, + const void *alpha, const void *beta, + const cudnnTensorDescriptor_t xDesc, const void *xData, + const cudnnTensorDescriptor_t zDesc, const void *zData, + const cudnnTensorDescriptor_t yDesc, void *yData, + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + const void *bnScaleData, const void *bnBiasData, double exponentialAverageFactor, + void *resultRunningMean, void *resultRunningVariance, double epsilon, + void *resultSaveMean, void *resultSaveInvVariance, + const cudnnActivationDescriptor_t activationDesc, + void *workspace, size_t workSpaceSizeInBytes, + void *reserveSpace, size_t reserveSpaceSizeInBytes) { + const char* func_name = "cudnnBatchNormalizationForwardTrainingEx"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)( + cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const void *, const void *, + const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, + const cudnnTensorDescriptor_t, void *, const cudnnTensorDescriptor_t, const void *, const void *, double, + void *, void *, double, void *, void *, const cudnnActivationDescriptor_t, void *, size_t, void *, size_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnBatchNormalizationForwardTrainingEx")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(cudnnBatchNormMode_t) + sizeof(cudnnBatchNormOps_t) + (sizeof(size_t)+sizeof(const float))*2 + sizeof(uint64_t)*9 + sizeof(double) + sizeof(uint64_t)*2 + sizeof(double) + sizeof(uint64_t)*3 + sizeof(uint64_t)*2 + sizeof(size_t)*2); + RequestIOV reqBuf = RequestIOV(); + 
reqBuf.PushRequestType(CUDNN_BATCH_NORMALIZATION_FORWARD_TRAINING_EX); + reqBuf.Push64BitPointer(handle); + reqBuf.Push(mode); + reqBuf.Push(bnOps); + reqBuf.PushConst((const float*)alpha); // Pointers to scaling factors (in host memory) + reqBuf.PushConst((const float*)beta); + reqBuf.Push64BitPointer(xDesc); + reqBuf.Push64BitPointer(xData); // Data pointer to GPU memory + reqBuf.Push64BitPointer(zDesc); + reqBuf.Push64BitPointer(zData); + reqBuf.Push64BitPointer(yDesc); + reqBuf.Push64BitPointer(yData); + reqBuf.Push64BitPointer(bnScaleBiasMeanVarDesc); + reqBuf.Push64BitPointer(bnScaleData); // located in GPU memory + reqBuf.Push64BitPointer(bnBiasData); + reqBuf.Push(exponentialAverageFactor); + reqBuf.Push64BitPointer(resultRunningMean); + reqBuf.Push64BitPointer(resultRunningVariance); + reqBuf.Push(epsilon); + reqBuf.Push64BitPointer(resultSaveMean); + reqBuf.Push64BitPointer(resultSaveInvVariance); + reqBuf.Push64BitPointer(activationDesc); + reqBuf.Push64BitPointer(workspace); + reqBuf.Push(workSpaceSizeInBytes); + reqBuf.Push64BitPointer(reserveSpace); + reqBuf.Push(reserveSpaceSizeInBytes); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(handle, mode, bnOps, alpha, beta, xDesc, xData, zDesc, zData, yDesc, yData, bnScaleBiasMeanVarDesc, bnScaleData, bnBiasData, exponentialAverageFactor, resultRunningMean, resultRunningVariance, epsilon, resultSaveMean, resultSaveInvVariance, activationDesc, workspace, workSpaceSizeInBytes, reserveSpace, reserveSpaceSizeInBytes); +} + +cudnnStatus_t cudnnBatchNormalizationForwardInference( + cudnnHandle_t handle, cudnnBatchNormMode_t mode, + const void *alpha, const void *beta, + const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnTensorDescriptor_t yDesc, void *y, + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + const void *bnScale, const void *bnBias, + const 
void *estimatedMean, const void *estimatedVariance, double epsilon) { + const char* func_name = "cudnnBatchNormalizationForwardInference"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnHandle_t, cudnnBatchNormMode_t, const void *, const void *, + const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, + void *, const cudnnTensorDescriptor_t, const void *, const void *, const void *, + const void *, double); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnBatchNormalizationForwardInference")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(cudnnBatchNormMode_t) + (sizeof(size_t)+sizeof(const float))*2 + sizeof(uint64_t)*9 + sizeof(double)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_BATCH_NORMALIZATION_FORWARD_INFERENCE); + reqBuf.Push64BitPointer(handle); + reqBuf.Push(mode); + reqBuf.PushConst((const float*)alpha); // Pointers to scaling factors (in host memory) + reqBuf.PushConst((const float*)beta); + reqBuf.Push64BitPointer(xDesc); + reqBuf.Push64BitPointer(x); // Data pointer to GPU memory + reqBuf.Push64BitPointer(yDesc); + reqBuf.Push64BitPointer(y); + reqBuf.Push64BitPointer(bnScaleBiasMeanVarDesc); + reqBuf.Push64BitPointer(bnScale); // located in GPU memory + reqBuf.Push64BitPointer(bnBias); + reqBuf.Push64BitPointer(estimatedMean); + reqBuf.Push64BitPointer(estimatedVariance); + reqBuf.Push(epsilon); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(handle, mode, alpha, beta, xDesc, x, yDesc, y, bnScaleBiasMeanVarDesc, bnScale, bnBias, estimatedMean, estimatedVariance, epsilon); +} + +cudnnStatus_t cudnnBackendCreateDescriptor(cudnnBackendDescriptorType_t descriptorType, cudnnBackendDescriptor_t *descriptor) { + const char* func_name = "cudnnBackendCreateDescriptor"; + HookLog(func_name); + using func_ptr = cudnnStatus_t 
(*)(cudnnBackendDescriptorType_t, cudnnBackendDescriptor_t *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnBackendCreateDescriptor")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(cudnnBackendDescriptorType_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_BACKEND_CREATE_DESCRIPTOR); + reqBuf.Push(descriptorType); + *descriptor = NULL; + reqBuf.Push(descriptor); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(descriptor); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(descriptorType, descriptor); +} + +cudnnStatus_t cudnnBackendDestroyDescriptor(cudnnBackendDescriptor_t descriptor) { + const char* func_name = "cudnnBackendDestroyDescriptor"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnBackendDescriptor_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnBackendDestroyDescriptor")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_BACKEND_DESTROY_DESCRIPTOR); + reqBuf.Push64BitPointer(descriptor); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(descriptor); +} + +cudnnStatus_t cudnnBackendSetAttribute( + cudnnBackendDescriptor_t descriptor, cudnnBackendAttributeName_t attributeName, + cudnnBackendAttributeType_t attributeType, + int64_t elementCount, const void *arrayOfElements) { //! 
arrayOfElements is const void* type, not void* type declared in the official document + const char* func_name = "cudnnBackendSetAttribute"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnBackendDescriptor_t, cudnnBackendAttributeName_t, cudnnBackendAttributeType_t, int64_t, const void *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnBackendSetAttribute")); + + size_t varSize = getSizeOfAttributeType(attributeType); + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(cudnnBackendAttributeName_t) + sizeof(cudnnBackendAttributeType_t) + sizeof(int64_t) + sizeof(size_t)+varSize*elementCount); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_BACKEND_SET_ATTRIBUTE); + reqBuf.Push64BitPointer(descriptor); + reqBuf.Push(attributeName); + reqBuf.Push(attributeType); + reqBuf.Push(elementCount); + reqBuf.PushConst((const uint8_t*)arrayOfElements, varSize * elementCount); + + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(descriptor, attributeName, attributeType, elementCount, arrayOfElements); +} + +cudnnStatus_t cudnnBackendGetAttribute( + cudnnBackendDescriptor_t descriptor, cudnnBackendAttributeName_t attributeName, + cudnnBackendAttributeType_t attributeType, int64_t requestedElementCount, + int64_t *elementCount, void *arrayOfElements) { + const char* func_name = "cudnnBackendGetAttribute"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnBackendDescriptor_t, cudnnBackendAttributeName_t, cudnnBackendAttributeType_t, int64_t, int64_t *, void *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnBackendGetAttribute")); + + size_t varSize = getSizeOfAttributeType(attributeType); + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(cudnnBackendAttributeName_t) + sizeof(cudnnBackendAttributeType_t) + sizeof(int64_t) + 
sizeof(size_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_BACKEND_GET_ATTRIBUTE); + reqBuf.Push64BitPointer(descriptor); + reqBuf.Push(attributeName); + reqBuf.Push(attributeType); + reqBuf.Push(requestedElementCount); + reqBuf.Push(varSize); // notify the server the size of each element + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(elementCount); + resBuf.Push((uint8_t*)arrayOfElements, varSize * requestedElementCount); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + + return CUDNN_STATUS_SUCCESS; + + // return func_entry(descriptor, attributeName, attributeType, requestedElementCount, elementCount, arrayOfElements); +} + +cudnnStatus_t cudnnBackendExecute(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, + cudnnBackendDescriptor_t varianPack) { + const char* func_name = "cudnnBackendExecute"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnHandle_t, cudnnBackendDescriptor_t, cudnnBackendDescriptor_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnBackendExecute")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(uint64_t) + sizeof(uint64_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_BACKEND_EXECUTE); + reqBuf.Push64BitPointer(handle); + reqBuf.Push64BitPointer(executionPlan); + reqBuf.Push64BitPointer(varianPack); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(handle, executionPlan, varianPack); +} + +cudnnStatus_t cudnnBackendFinalize(cudnnBackendDescriptor_t descriptor) { + const char* func_name = "cudnnBackendFinalize"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnBackendDescriptor_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnBackendFinalize")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t)); + RequestIOV reqBuf = RequestIOV(); + 
reqBuf.PushRequestType(CUDNN_BACKEND_FINALIZE); + reqBuf.Push64BitPointer(descriptor); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(descriptor); +} + +cudnnStatus_t cudnnGetBatchNormalizationBackwardExWorkspaceSize( + cudnnHandle_t handle, cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t yDesc, const cudnnTensorDescriptor_t dyDesc, + const cudnnTensorDescriptor_t dzDesc,const cudnnTensorDescriptor_t dxDesc, + const cudnnTensorDescriptor_t dBnScaleBiasDesc, + const cudnnActivationDescriptor_t activationDesc, size_t *sizeInBytes) { + const char* func_name = "cudnnGetBatchNormalizationBackwardExWorkspaceSize"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)( + cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnActivationDescriptor_t, size_t *); + auto func_entry = + reinterpret_cast(dlsym(RTLD_NEXT, "cudnnGetBatchNormalizationBackwardExWorkspaceSize")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(cudnnBatchNormMode_t) + sizeof(cudnnBatchNormOps_t) + sizeof(uint64_t) * 7); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_GET_BATCH_NORMALIZATION_BACKWARD_EX_WORKSPACE_SIZE); + reqBuf.Push64BitPointer(handle); + reqBuf.Push(mode); + reqBuf.Push(bnOps); + reqBuf.Push64BitPointer(xDesc); + reqBuf.Push64BitPointer(yDesc); + reqBuf.Push64BitPointer(dyDesc); + reqBuf.Push64BitPointer(dzDesc); + reqBuf.Push64BitPointer(dxDesc); + reqBuf.Push64BitPointer(dBnScaleBiasDesc); + reqBuf.Push64BitPointer(activationDesc); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(sizeInBytes); + 
clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + + return CUDNN_STATUS_SUCCESS; + + //return func_entry(handle, mode, bnOps, xDesc, yDesc, dyDesc, dzDesc, dxDesc, dBnScaleBiasDesc, activationDesc, sizeInBytes); +} + +cudnnStatus_t cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( + cudnnHandle_t handle, cudnnBatchNormMode_t mode, + cudnnBatchNormOps_t bnOps, const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t zDesc, const cudnnTensorDescriptor_t yDesc, + const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc, + const cudnnActivationDescriptor_t activationDesc, size_t *sizeInBytes) { + const char* func_name = "cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize"; + HookLog(func_name); + using func_ptr = + cudnnStatus_t (*)(cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, const cudnnTensorDescriptor_t, + const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, + const cudnnActivationDescriptor_t, size_t *); + auto func_entry = + reinterpret_cast(dlsym(RTLD_NEXT, "cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(cudnnBatchNormMode_t) + sizeof(cudnnBatchNormOps_t) + sizeof(uint64_t) * 5); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_GET_BATCH_NORMALIZATION_FORWARD_TRAINING_EX_WORKSPACE_SIZE); + reqBuf.Push64BitPointer(handle); + reqBuf.Push(mode); + reqBuf.Push(bnOps); + reqBuf.Push64BitPointer(xDesc); + reqBuf.Push64BitPointer(zDesc); + reqBuf.Push64BitPointer(yDesc); + reqBuf.Push64BitPointer(bnScaleBiasMeanVarDesc); + reqBuf.Push64BitPointer(activationDesc); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(sizeInBytes); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + + return CUDNN_STATUS_SUCCESS; + + //return func_entry(handle, mode, bnOps, xDesc, zDesc, yDesc, bnScaleBiasMeanVarDesc, activationDesc, sizeInBytes); +} + +cudnnStatus_t 
cudnnGetBatchNormalizationTrainingExReserveSpaceSize( + cudnnHandle_t handle, cudnnBatchNormMode_t mode, cudnnBatchNormOps_t bnOps, + const cudnnActivationDescriptor_t activationDesc, + const cudnnTensorDescriptor_t xDesc, size_t *sizeInBytes) { + const char* func_name = "cudnnGetBatchNormalizationTrainingExReserveSpaceSize"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnHandle_t, cudnnBatchNormMode_t, cudnnBatchNormOps_t, + const cudnnActivationDescriptor_t, const cudnnTensorDescriptor_t, size_t *); + auto func_entry = + reinterpret_cast(dlsym(RTLD_NEXT, "cudnnGetBatchNormalizationTrainingExReserveSpaceSize")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(cudnnBatchNormMode_t) + sizeof(cudnnBatchNormOps_t) + sizeof(uint64_t) * 2); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_GET_BATCH_NORMALIZATION_TRAINING_EX_RESERVE_SPACE_SIZE); + reqBuf.Push64BitPointer(handle); + reqBuf.Push(mode); + reqBuf.Push(bnOps); + reqBuf.Push64BitPointer(activationDesc); + reqBuf.Push64BitPointer(xDesc); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(sizeInBytes); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(handle, mode, bnOps, activationDesc, xDesc, sizeInBytes); +} + +cudnnStatus_t cudnnCreateConvolutionDescriptor(cudnnConvolutionDescriptor_t *convDesc) { + const char* func_name = "cudnnCreateConvolutionDescriptor"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnConvolutionDescriptor_t *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnCreateConvolutionDescriptor")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(int)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_CREATE_CONVOLUTION_DESCRIPTOR); + *convDesc = NULL; + reqBuf.Push(convDesc); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(convDesc); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + return CUDNN_STATUS_SUCCESS; + + 
//return func_entry(convDesc); +} + +cudnnStatus_t cudnnDestroyConvolutionDescriptor(cudnnConvolutionDescriptor_t convDesc) { + const char* func_name = "cudnnDestroyConvolutionDescriptor"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnConvolutionDescriptor_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnDestroyConvolutionDescriptor")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_DESTROY_CONVOLUTION_DESCRIPTOR); + reqBuf.Push64BitPointer(convDesc); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(convDesc); +} + +cudnnStatus_t cudnnSetConvolutionGroupCount(cudnnConvolutionDescriptor_t convDesc, + int groupCount) { + const char* func_name = "cudnnSetConvolutionGroupCount"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnConvolutionDescriptor_t, int); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnSetConvolutionGroupCount")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(int)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_SET_CONVOLUTION_GROUP_COUNT); + reqBuf.Push64BitPointer(convDesc); + reqBuf.Push(groupCount); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(convDesc, groupCount); +} + +cudnnStatus_t cudnnSetConvolutionMathType(cudnnConvolutionDescriptor_t convDesc, + cudnnMathType_t mathType) { + const char* func_name = "cudnnSetConvolutionMathType"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnConvolutionDescriptor_t, cudnnMathType_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnSetConvolutionMathType")); + + // RequestBuffer reqBuf = 
RequestBuffer(sizeof(uint64_t) + sizeof(cudnnMathType_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_SET_CONVOLUTION_MATH_TYPE); + reqBuf.Push64BitPointer(convDesc); + reqBuf.Push(mathType); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(convDesc, mathType); +} + +cudnnStatus_t cudnnSetConvolutionNdDescriptor(cudnnConvolutionDescriptor_t convDesc, + int arrayLength, const int padA[], + const int filterStrideA[], const int dilationA[], + cudnnConvolutionMode_t mode, cudnnDataType_t dataType) { + const char* func_name = "cudnnSetConvolutionNdDescriptor"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnConvolutionDescriptor_t, int, const int[], const int[], const int[], cudnnConvolutionMode_t, cudnnDataType_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnSetConvolutionNdDescriptor")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(int) + (sizeof(size_t)+sizeof(int)*arrayLength)* 3 + sizeof(cudnnConvolutionMode_t) + sizeof(cudnnDataType_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_SET_CONVOLUTION_ND_DESCRIPTOR); + reqBuf.Push64BitPointer(convDesc); + reqBuf.Push(arrayLength); + reqBuf.PushConst(padA, arrayLength); + reqBuf.PushConst(filterStrideA, arrayLength); + reqBuf.PushConst(dilationA, arrayLength); + reqBuf.Push(mode); + reqBuf.Push(dataType); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(convDesc, arrayLength, padA, filterStrideA, dilationA, mode, dataType); +} + +cudnnStatus_t cudnnSetConvolutionReorderType(cudnnConvolutionDescriptor_t convDesc, + cudnnReorderType_t reorderType) { + const char* func_name = "cudnnSetConvolutionReorderType"; + HookLog(func_name); + using 
func_ptr = cudnnStatus_t (*)(cudnnConvolutionDescriptor_t, cudnnReorderType_t); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnSetConvolutionReorderType")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(cudnnReorderType_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_SET_CONVOLUTION_REORDER_TYPE); + reqBuf.Push64BitPointer(convDesc); + reqBuf.Push(reorderType); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(convDesc, reorderType); +} + +// cudnnStatus_t cudnnReorderFilterAndBias( +// cudnnHandle_t handle, +// const cudnnFilterDescriptor_t filterDesc, cudnnReorderType_t reorderType, +// const void *filterData, void *reorderedFilterData, +// int reorderBias, const void *biasData, void *reorderedBiasData) { +// const char* func_name = "cudnnReorderFilterAndBias"; +// HookLog(func_name); +// using func_ptr = cudnnStatus_t (*)(cudnnHandle_t, const cudnnFilterDescriptor_t, cudnnReorderType_t, const void *, void *, int, const void *, void *); +// auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnReorderFilterAndBias")); + +// RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(uint64_t) + sizeof(cudnnReorderType_t) + sizeof(uint64_t) * 2 + sizeof(int) + sizeof(uint64_t) * 2); +// } + +cudnnStatus_t cudnnGetConvolutionForwardAlgorithm_v7( + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + const int requestedAlgoCount, int *returnedAlgoCount, + cudnnConvolutionFwdAlgoPerf_t *perfResults) { + const char* func_name = "cudnnGetConvolutionForwardAlgorithm_v7"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)( cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const 
cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionFwdAlgoPerf_t *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnGetConvolutionForwardAlgorithm_v7")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(uint64_t) * 4 + sizeof(const int)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_GET_CONVOLUTION_FORWARD_ALGORITHM_V7); + reqBuf.Push64BitPointer(handle); + reqBuf.Push64BitPointer(xDesc); + reqBuf.Push64BitPointer(wDesc); + reqBuf.Push64BitPointer(convDesc); + reqBuf.Push64BitPointer(yDesc); + reqBuf.PushConst(requestedAlgoCount); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(returnedAlgoCount); + resBuf.Push(perfResults, requestedAlgoCount); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + + return CUDNN_STATUS_SUCCESS; + + //return func_entry(handle, xDesc, wDesc, convDesc, yDesc, requestedAlgoCount, returnedAlgoCount, perfResults); +} + +cudnnStatus_t cudnnGetConvolutionBackwardFilterAlgorithm_v7( + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, + const int requestedAlgoCount, int *returnedAlgoCount, + cudnnConvolutionBwdFilterAlgoPerf_t *perfResults) { + const char* func_name = "cudnnGetConvolutionBackwardFilterAlgorithm_v7"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)( cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, const int, int *, cudnnConvolutionBwdFilterAlgoPerf_t *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnGetConvolutionBackwardFilterAlgorithm_v7")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(uint64_t) * 4 + sizeof(const int)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_GET_CONVOLUTION_BACKWARD_FILTER_ALGORITHM_V7); + 
reqBuf.Push64BitPointer(handle); + reqBuf.Push64BitPointer(xDesc); + reqBuf.Push64BitPointer(dyDesc); + reqBuf.Push64BitPointer(convDesc); + reqBuf.Push64BitPointer(dwDesc); + reqBuf.PushConst(requestedAlgoCount); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(returnedAlgoCount); + resBuf.Push(perfResults, requestedAlgoCount); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + + return CUDNN_STATUS_SUCCESS; + + //return func_entry(handle, xDesc, dyDesc, convDesc, dwDesc, requestedAlgoCount, returnedAlgoCount, perfResults); +} + +cudnnStatus_t cudnnGetConvolutionBackwardDataAlgorithm_v7( + cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + const int requestedAlgoCount, int *returnedAlgoCount, + cudnnConvolutionBwdDataAlgoPerf_t *perfResults) { + const char* func_name = "cudnnGetConvolutionBackwardDataAlgorithm_v7"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)( cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, const int, int *, cudnnConvolutionBwdDataAlgoPerf_t *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnGetConvolutionBackwardDataAlgorithm_v7")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(uint64_t) * 4 + sizeof(const int)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_GET_CONVOLUTION_BACKWARD_DATA_ALGORITHM_V7); + reqBuf.Push64BitPointer(handle); + reqBuf.Push64BitPointer(wDesc); + reqBuf.Push64BitPointer(dyDesc); + reqBuf.Push64BitPointer(convDesc); + reqBuf.Push64BitPointer(dxDesc); + reqBuf.PushConst(requestedAlgoCount); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(returnedAlgoCount); + resBuf.Push(perfResults, requestedAlgoCount); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + + return CUDNN_STATUS_SUCCESS; + + //return 
func_entry(handle, wDesc, dyDesc, convDesc, dxDesc, requestedAlgoCount, returnedAlgoCount, perfResults); +} + +cudnnStatus_t cudnnGetConvolutionForwardWorkspaceSize( + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnFilterDescriptor_t wDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t yDesc, + cudnnConvolutionFwdAlgo_t algo, size_t *sizeInBytes) { + const char* func_name = "cudnnGetConvolutionForwardWorkspaceSize"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnFilterDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionFwdAlgo_t, size_t *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnGetConvolutionForwardWorkspaceSize")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(uint64_t) * 4 + sizeof(cudnnConvolutionFwdAlgo_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_GET_CONVOLUTION_FORWARD_WORKSPACE_SIZE); + reqBuf.Push64BitPointer(handle); + reqBuf.Push64BitPointer(xDesc); + reqBuf.Push64BitPointer(wDesc); + reqBuf.Push64BitPointer(convDesc); + reqBuf.Push64BitPointer(yDesc); + reqBuf.Push(algo); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(sizeInBytes); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(handle, xDesc, wDesc, convDesc, yDesc, algo, sizeInBytes); +} + +cudnnStatus_t cudnnConvolutionForward( + cudnnHandle_t handle, const void *alpha, + const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionFwdAlgo_t algo, + void *workSpace, size_t workSpaceSizeInBytes, + const void *beta, + const cudnnTensorDescriptor_t yDesc, void *y){ + const char* func_name = "cudnnConvolutionForward"; + HookLog(func_name); + using func_ptr = cudnnStatus_t 
(*)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionFwdAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, void *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnConvolutionForward")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + (sizeof(size_t)+sizeof(const float))*2 + sizeof(uint64_t)*6 + sizeof(uint64_t) + sizeof(cudnnConvolutionFwdAlgo_t) + sizeof(uint64_t) + sizeof(size_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_CONVOLUTION_FORWARD); + reqBuf.Push64BitPointer(handle); + reqBuf.PushConst((const float*)alpha); + reqBuf.Push64BitPointer(xDesc); + reqBuf.Push64BitPointer(x); + reqBuf.Push64BitPointer(wDesc); + reqBuf.Push64BitPointer(w); + reqBuf.Push64BitPointer(convDesc); + reqBuf.Push(algo); + reqBuf.Push64BitPointer(workSpace); + reqBuf.Push(workSpaceSizeInBytes); + reqBuf.PushConst((const float*)beta); + reqBuf.Push64BitPointer(yDesc); + reqBuf.Push64BitPointer(y); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(handle, alpha, xDesc, x, wDesc, w, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, yDesc, y); +} + +cudnnStatus_t cudnnGetConvolutionBackwardDataWorkspaceSize( + cudnnHandle_t handle, const cudnnFilterDescriptor_t wDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnTensorDescriptor_t dxDesc, + cudnnConvolutionBwdDataAlgo_t algo, size_t *sizeInBytes) { + const char* func_name = "cudnnGetConvolutionBackwardDataWorkspaceSize"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnHandle_t, const cudnnFilterDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnTensorDescriptor_t, cudnnConvolutionBwdDataAlgo_t, 
size_t *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnGetConvolutionBackwardDataWorkspaceSize")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(uint64_t) * 4 + sizeof(cudnnConvolutionBwdDataAlgo_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_GET_CONVOLUTION_BACKWARD_DATA_WORKSPACE_SIZE); + reqBuf.Push64BitPointer(handle); + reqBuf.Push64BitPointer(wDesc); + reqBuf.Push64BitPointer(dyDesc); + reqBuf.Push64BitPointer(convDesc); + reqBuf.Push64BitPointer(dxDesc); + reqBuf.Push(algo); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(sizeInBytes); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + + return CUDNN_STATUS_SUCCESS; + + //return func_entry(handle, wDesc, dyDesc, convDesc, dxDesc, algo, sizeInBytes); +} + +cudnnStatus_t cudnnConvolutionBackwardFilter( + cudnnHandle_t handle, const void *alpha, + const cudnnTensorDescriptor_t xDesc, const void *x, + const cudnnTensorDescriptor_t dyDesc,const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdFilterAlgo_t algo, + void *workSpace, size_t workSpaceSizeInBytes, + const void *beta, + const cudnnFilterDescriptor_t dwDesc,void *dw) { + const char* func_name = "cudnnConvolutionBackwardFilter"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnHandle_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, void *, size_t, const void *, const cudnnFilterDescriptor_t, void *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnConvolutionBackwardFilter")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + (sizeof(size_t)+sizeof(const float))*2 + sizeof(uint64_t)*6 + sizeof(uint64_t) + sizeof(cudnnConvolutionBwdFilterAlgo_t) + sizeof(uint64_t) + sizeof(size_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_CONVOLUTION_BACKWARD_FILTER); + 
reqBuf.Push64BitPointer(handle); + reqBuf.PushConst((const float*)alpha); + reqBuf.Push64BitPointer(xDesc); + reqBuf.Push64BitPointer(x); + reqBuf.Push64BitPointer(dyDesc); + reqBuf.Push64BitPointer(dy); + reqBuf.Push64BitPointer(convDesc); + reqBuf.Push(algo); + reqBuf.Push64BitPointer(workSpace); + reqBuf.Push(workSpaceSizeInBytes); + reqBuf.PushConst((const float*)beta); + reqBuf.Push64BitPointer(dwDesc); + reqBuf.Push64BitPointer(dw); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(handle, alpha, xDesc, x, dyDesc, dy, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, dwDesc, dw); +} + +cudnnStatus_t cudnnGetConvolutionBackwardFilterWorkspaceSize( + cudnnHandle_t handle, const cudnnTensorDescriptor_t xDesc, + const cudnnTensorDescriptor_t dyDesc, + const cudnnConvolutionDescriptor_t convDesc, + const cudnnFilterDescriptor_t dwDesc, + cudnnConvolutionBwdFilterAlgo_t algo, size_t *sizeInBytes) { + const char* func_name = "cudnnGetConvolutionBackwardFilterWorkspaceSize"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnHandle_t, const cudnnTensorDescriptor_t, const cudnnTensorDescriptor_t, const cudnnConvolutionDescriptor_t, const cudnnFilterDescriptor_t, cudnnConvolutionBwdFilterAlgo_t, size_t *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnGetConvolutionBackwardFilterWorkspaceSize")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + sizeof(uint64_t) * 4 + sizeof(cudnnConvolutionBwdFilterAlgo_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_GET_CONVOLUTION_BACKWARD_FILTER_WORKSPACE_SIZE); + reqBuf.Push64BitPointer(handle); + reqBuf.Push64BitPointer(xDesc); + reqBuf.Push64BitPointer(dyDesc); + reqBuf.Push64BitPointer(convDesc); + reqBuf.Push64BitPointer(dwDesc); + reqBuf.Push(algo); + + RequestIOV resBuf = RequestIOV(); + resBuf.Push(sizeInBytes); + 
clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + + return CUDNN_STATUS_SUCCESS; + + //return func_entry(handle, xDesc, dyDesc, convDesc, dwDesc, algo, sizeInBytes); +} + +cudnnStatus_t cudnnConvolutionBackwardData( + cudnnHandle_t handle, const void *alpha, + const cudnnFilterDescriptor_t wDesc, const void *w, + const cudnnTensorDescriptor_t dyDesc,const void *dy, + const cudnnConvolutionDescriptor_t convDesc, + cudnnConvolutionBwdDataAlgo_t algo, + void *workSpace, size_t workSpaceSizeInBytes, + const void *beta, + const cudnnTensorDescriptor_t dxDesc, void *dx) { + const char* func_name = "cudnnConvolutionBackwardData"; + HookLog(func_name); + using func_ptr = cudnnStatus_t (*)(cudnnHandle_t, const void *, const cudnnFilterDescriptor_t, const void *, const cudnnTensorDescriptor_t, const void *, const cudnnConvolutionDescriptor_t, cudnnConvolutionBwdDataAlgo_t, void *, size_t, const void *, const cudnnTensorDescriptor_t, void *); + auto func_entry = reinterpret_cast(dlsym(RTLD_NEXT, "cudnnConvolutionBackwardData")); + + // RequestBuffer reqBuf = RequestBuffer(sizeof(uint64_t) + (sizeof(size_t)+sizeof(const float))*2 + sizeof(uint64_t)*6 + sizeof(uint64_t) + sizeof(cudnnConvolutionBwdDataAlgo_t) + sizeof(uint64_t) + sizeof(size_t)); + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(CUDNN_CONVOLUTION_BACKWARD_DATA); + reqBuf.Push64BitPointer(handle); + reqBuf.PushConst((const float*)alpha); + reqBuf.Push64BitPointer(wDesc); + reqBuf.Push64BitPointer(w); + reqBuf.Push64BitPointer(dyDesc); + reqBuf.Push64BitPointer(dy); + reqBuf.Push64BitPointer(convDesc); + reqBuf.Push(algo); + reqBuf.Push64BitPointer(workSpace); + reqBuf.Push(workSpaceSizeInBytes); + reqBuf.PushConst((const float*)beta); + reqBuf.Push64BitPointer(dxDesc); + reqBuf.Push64BitPointer(dx); + // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer); + // clientEpObj->SendRequest(); + clientEpObj->SendRequest(&reqBuf); + return CUDNN_STATUS_SUCCESS; + + //return func_entry(handle, 
alpha, wDesc, w, dyDesc, dy, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, dxDesc, dx); +} \ No newline at end of file diff --git a/GPU-Virtual-Service/gpu-remoting/src/client/ncclHook.cc b/GPU-Virtual-Service/gpu-remoting/src/client/ncclHook.cc new file mode 100644 index 0000000..1de4449 --- /dev/null +++ b/GPU-Virtual-Service/gpu-remoting/src/client/ncclHook.cc @@ -0,0 +1,690 @@ +#include "../../include/hook/hook.h" + +inline int GetNcclTypeSize(ncclDataType_t type) { + switch (type) { + case ncclInt8: + case ncclUint8: + return 1; + case ncclFloat16: +#if defined(__CUDA_BF16_TYPES_EXIST__) + case ncclBfloat16: +#endif + return 2; + case ncclInt32: + case ncclUint32: + case ncclFloat32: + return 4; + case ncclInt64: + case ncclUint64: + case ncclFloat64: + return 8; + default: + return -1; + } +} + +void* get_nccl_handle() { + static void* handle = nullptr; + if (!handle) { + handle = dlopen("libnccl.so", RTLD_LAZY); + if (!handle) { + tool::Logging(LOG_ERROR, HOOK_LOG_TAG, "Failed to load 'libnccl.so': \n", dlerror()); + std::cerr << "Failed to load 'libnccl.so': " << dlerror() << std::endl; + } + } + return handle; +} + +ncclResult_t ncclMemAlloc(void** ptr, size_t size) { + const char* func_name = "ncclMemAlloc"; + HookLog(func_name); + using func_ptr = ncclResult_t (*)(void* * , size_t); + auto func_entry = reinterpret_cast(dlsym(get_nccl_handle(), func_name)); + + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(NCCL_MEM_ALLOC); + reqBuf.Push(size); + // clientEpObj->SendRequest(&reqBuf); + + // ucp_dt_iov_t res_iov; + // res_iov.buffer = ptr; + // res_iov.length = sizeof(uint64_t); + // clientEpObj->RecvResponse(NCCL_MEM_ALLOC, &res_iov, 1); + RequestIOV resBuf = RequestIOV(); + resBuf.Push(ptr); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + tool::Logging(LOG_DEBUG, func_name, "allocated ptr = %p\n", *ptr, "\n"); + return ncclSuccess; + + // return func_entry(ptr, size); +} + +ncclResult_t ncclMemFree(void *ptr) { + const 
char* func_name = "ncclMemFree"; + HookLog(func_name); + using func_ptr = ncclResult_t (*)(void * ); + auto func_entry = reinterpret_cast(dlsym(get_nccl_handle(), func_name)); + + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(NCCL_MEM_FREE); + reqBuf.Push64BitPointer(ptr); + clientEpObj->SendRequest(&reqBuf); + return ncclSuccess; + + // return func_entry(ptr); +} + +ncclResult_t ncclGetVersion(int *version) { + const char* func_name = "ncclGetVersion"; + HookLog(func_name); + using func_ptr = ncclResult_t (*)(int * ); + auto func_entry = reinterpret_cast(dlsym(get_nccl_handle(), func_name)); + + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(NCCL_GET_VERSION); + int tmpDev = 0; + reqBuf.Push(tmpDev); // dummy + // clientEpObj->SendRequest(&reqBuf); + + // ucp_dt_iov_t res_iov; + // res_iov.buffer = version; + // res_iov.length = sizeof(int); + // clientEpObj->RecvResponse(NCCL_GET_VERSION, &res_iov, 1); + RequestIOV resBuf = RequestIOV(); + resBuf.Push(version); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + return ncclSuccess; + + // return func_entry(version); +} + +ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId) { + const char* func_name = "ncclGetUniqueId"; + HookLog(func_name); + using func_ptr = ncclResult_t (*)(ncclUniqueId* ); + auto func_entry = reinterpret_cast(dlsym(get_nccl_handle(), func_name)); + + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(NCCL_GET_UNIQUE_ID); + int tmpDev = 0; + reqBuf.Push(tmpDev); // dummy + // clientEpObj->SendRequest(&reqBuf); + + // ucp_dt_iov_t res_iov; + // res_iov.buffer = uniqueId; + // res_iov.length = sizeof(ncclUniqueId); + // clientEpObj->RecvResponse(NCCL_GET_UNIQUE_ID, &res_iov, 1); + RequestIOV resBuf = RequestIOV(); + resBuf.Push(uniqueId); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + +#ifdef GV_GPUMAP + gpuIdMap->UpdateUniqueID((uint8_t*)uniqueId, sizeof(ncclUniqueId)); + // ncclUniqueId tmpId; + // 
gpuIdMap->RequestUniqueID((uint8_t*)&tmpId, sizeof(ncclUniqueId)); +#endif + + return ncclSuccess; + + // return func_entry(uniqueId); +} + +ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config) { + const char* func_name = "ncclCommInitRankConfig"; + HookLog(func_name); + using func_ptr = ncclResult_t (*)(ncclComm_t* , int, ncclUniqueId, int, ncclConfig_t* ); + auto func_entry = reinterpret_cast(dlsym(get_nccl_handle(), func_name)); + + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(NCCL_COMM_INIT_RANK_CONFIG); + *comm = NULL; + reqBuf.Push(comm); + reqBuf.Push(nranks); + reqBuf.Push(commId); + reqBuf.Push(rank); + reqBuf.Push(config); + reqBuf.PushCString(config->netName); + + // clientEpObj->SendRequest(&reqBuf); + + // ucp_dt_iov_t res_iov; + // res_iov.buffer = comm; + // res_iov.length = sizeof(uint64_t); + // clientEpObj->RecvResponse(NCCL_COMM_INIT_RANK_CONFIG, &res_iov, 1); + RequestIOV resBuf = RequestIOV(); + resBuf.Push(comm); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + tool::Logging(LOG_DEBUG, func_name, "allocated comm = %p", *comm, "\n"); + commDevIdx = myDevIdx; + return ncclSuccess; + + // return func_entry(comm, nranks, commId, rank, config); +} + +ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank) { + const char* func_name = "ncclCommInitRank"; + HookLog(func_name); + using func_ptr = ncclResult_t (*)(ncclComm_t* , int, ncclUniqueId, int); + auto func_entry = reinterpret_cast(dlsym(get_nccl_handle(), func_name)); + + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(NCCL_COMM_INIT_RANK); + *comm = NULL; + reqBuf.Push(comm); + reqBuf.Push(nranks); + reqBuf.Push(commId); + reqBuf.Push(rank); + // clientEpObj->SendRequest(&reqBuf); + + // ucp_dt_iov_t res_iov; + // res_iov.buffer = comm; + // res_iov.length = sizeof(uint64_t); + // clientEpObj->RecvResponse(NCCL_COMM_INIT_RANK, &res_iov, 1); + RequestIOV resBuf 
= RequestIOV(); + resBuf.Push(comm); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + tool::Logging(LOG_DEBUG, func_name, "[pid:%d, tid:%d] allocated comm = %p\n", processID, threadID, *comm); + commDevIdx = myDevIdx; + return ncclSuccess; + + // return func_entry(comm, nranks, commId, rank); +} + +ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist) { //todo: devlist needs to re-map + const char* func_name = "ncclCommInitAll"; + HookLog(func_name); + using func_ptr = ncclResult_t (*)(ncclComm_t* , int, const int* ); + auto func_entry = reinterpret_cast(dlsym(get_nccl_handle(), func_name)); + + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(NCCL_COMM_INIT_ALL); + for (int i = 0; i < ndev; i++) { + comm[i] = NULL; + } + reqBuf.Push((uint64_t*)comm, ndev); + reqBuf.Push(ndev); + reqBuf.PushConst(devlist, ndev); + // clientEpObj->SendRequest(&reqBuf); + + // ucp_dt_iov_t res_iov; + // res_iov.buffer = comm; + // res_iov.length = sizeof(uint64_t) * ndev; + // clientEpObj->RecvResponse(NCCL_COMM_INIT_ALL, &res_iov, 1); + RequestIOV resBuf = RequestIOV(); + resBuf.Push((uint64_t*)comm, ndev); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + tool::Logging(LOG_DEBUG, func_name, "allocated comm for %d devices\n", ndev); + return ncclSuccess; + + // return func_entry(comm, ndev, devlist); +} + +ncclResult_t ncclCommFinalize(ncclComm_t comm) { + const char* func_name = "ncclCommFinalize"; + HookLog(func_name); + using func_ptr = ncclResult_t (*)(ncclComm_t); + auto func_entry = reinterpret_cast(dlsym(get_nccl_handle(), func_name)); + + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(NCCL_COMM_FINALIZE); + reqBuf.Push64BitPointer(comm); + clientEpObj->SendRequest(&reqBuf); + return ncclSuccess; + + // return func_entry(comm); +} + +ncclResult_t ncclCommDestroy(ncclComm_t comm) { + const char* func_name = "ncclCommDestroy"; + HookLog(func_name); + using func_ptr = ncclResult_t (*)(ncclComm_t); + auto 
func_entry = reinterpret_cast(dlsym(get_nccl_handle(), func_name)); + + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(NCCL_COMM_DESTROY); + reqBuf.Push64BitPointer(comm); + clientEpObj->SendRequest(&reqBuf); + return ncclSuccess; + + // return func_entry(comm); +} + +ncclResult_t ncclCommAbort(ncclComm_t comm) { + const char* func_name = "ncclCommAbort"; + HookLog(func_name); + using func_ptr = ncclResult_t (*)(ncclComm_t); + auto func_entry = reinterpret_cast(dlsym(get_nccl_handle(), func_name)); + + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(NCCL_COMM_ABORT); + reqBuf.Push64BitPointer(comm); + clientEpObj->SendRequest(&reqBuf); + return ncclSuccess; + + // return func_entry(comm); +} + +ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config) { + const char* func_name = "ncclCommSplit"; + HookLog(func_name); + using func_ptr = ncclResult_t (*)(ncclComm_t, int, int, ncclComm_t * , ncclConfig_t* ); + auto func_entry = reinterpret_cast(dlsym(get_nccl_handle(), func_name)); + + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(NCCL_COMM_SPLIT); + reqBuf.Push64BitPointer(comm); + reqBuf.Push(color); + reqBuf.Push(key); + *newcomm = NULL; + reqBuf.Push(newcomm); + reqBuf.Push(config); + reqBuf.PushCString(config->netName); + // clientEpObj->SendRequest(&reqBuf); + + // ucp_dt_iov_t res_iov; + // res_iov.buffer = newcomm; + // res_iov.length = sizeof(uint64_t); + // clientEpObj->RecvResponse(NCCL_COMM_INIT_RANK, &res_iov, 1); + RequestIOV resBuf = RequestIOV(); + resBuf.Push(newcomm); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + tool::Logging(LOG_DEBUG, func_name, "allocated comm = %p", *newcomm, "\n"); + return ncclSuccess; + + // return func_entry(comm, color, key, newcomm, config); +} + +ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError) { //todo: DDP + const char* func_name = "ncclCommGetAsyncError"; + + if (clientEpObj == nullptr) { + 
myDevIdx = commDevIdx; + } // new thread will call this function at the beginning + + HookLog(func_name); + using func_ptr = ncclResult_t (*)(ncclComm_t, ncclResult_t * ); + auto func_entry = reinterpret_cast(dlsym(get_nccl_handle(), func_name)); + + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(NCCL_COMM_GET_ASYNC_ERROR); + reqBuf.Push64BitPointer(comm); + // // clientEpObj->SendRequest(&reqBuf); + + // // ucp_dt_iov_t res_iov; + // // res_iov.buffer = asyncError; + // // res_iov.length = sizeof(ncclResult_t); + // // clientEpObj->RecvResponse(NCCL_COMM_GET_ASYNC_ERROR, &res_iov, 1); + RequestIOV resBuf = RequestIOV(); + resBuf.Push(asyncError); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + // *asyncError = ncclSuccess; + return ncclSuccess; + + // return func_entry(comm, asyncError); +} + +const char* ncclGetLastError(ncclComm_t comm) { + const char* func_name = "ncclGetLastError"; + HookLog(func_name); + using func_ptr = const char* (*)(ncclComm_t); + auto func_entry = reinterpret_cast(dlsym(get_nccl_handle(), func_name)); + return func_entry(comm); +} + +const char* ncclGetErrorString(ncclResult_t result) { + const char* func_name = "ncclGetErrorString"; + HookLog(func_name); + using func_ptr = const char* (*)(ncclResult_t); + auto func_entry = reinterpret_cast(dlsym(get_nccl_handle(), func_name)); + return func_entry(result); +} + +ncclResult_t ncclCommCount(const ncclComm_t comm, int* count) { + const char* func_name = "ncclCommCount"; + HookLog(func_name); + using func_ptr = ncclResult_t (*)(const ncclComm_t, int* ); + auto func_entry = reinterpret_cast(dlsym(get_nccl_handle(), func_name)); + + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(NCCL_COMM_COUNT); + reqBuf.Push64BitPointer(comm); + // clientEpObj->SendRequest(&reqBuf); + + // ucp_dt_iov_t res_iov; + // res_iov.buffer = count; + // res_iov.length = sizeof(int); + // clientEpObj->RecvResponse(NCCL_COMM_COUNT, &res_iov, 1); + RequestIOV resBuf = RequestIOV(); + 
resBuf.Push(count); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + return ncclSuccess; + + // return func_entry(comm, count); +} + +ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device) { + const char* func_name = "ncclCommCuDevice"; + HookLog(func_name); + using func_ptr = ncclResult_t (*)(const ncclComm_t, int* ); + auto func_entry = reinterpret_cast(dlsym(get_nccl_handle(), func_name)); + + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(NCCL_COMM_CU_DEVICE); + reqBuf.Push64BitPointer(comm); + // clientEpObj->SendRequest(&reqBuf); + + // ucp_dt_iov_t res_iov; + // res_iov.buffer = device; + // res_iov.length = sizeof(int); + // clientEpObj->RecvResponse(NCCL_COMM_CU_DEVICE, &res_iov, 1); + int gpuIdInNode = 0; + RequestIOV resBuf = RequestIOV(); + resBuf.Push(gpuIdInNode); // todo: re-map the device id + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + +#ifdef GV_GPUMAP + if (gpuIdMap->GetGPUKey(gpuIdInNode, device) == false) { + tool::Logging(LOG_ERROR, func_name, "failed to get the virtual device index for the GPU ID %d\n", gpuIdInNode); + return ncclSystemError; + } +#else + *device = gpuIdInNode; +#endif + + return ncclSuccess; + + // return func_entry(comm, device); +} + +ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank) { + const char* func_name = "ncclCommUserRank"; + HookLog(func_name); + using func_ptr = ncclResult_t (*)(const ncclComm_t, int* ); + auto func_entry = reinterpret_cast(dlsym(get_nccl_handle(), func_name)); + + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(NCCL_COMM_USER_RANK); + reqBuf.Push64BitPointer(comm); + // clientEpObj->SendRequest(&reqBuf); + + // ucp_dt_iov_t res_iov; + // res_iov.buffer = rank; + // res_iov.length = sizeof(int); + // clientEpObj->RecvResponse(NCCL_COMM_USER_RANK, &res_iov, 1); + RequestIOV resBuf = RequestIOV(); + resBuf.Push(rank); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + return ncclSuccess; + + // return func_entry(comm, 
rank); +} + +ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle) { + const char* func_name = "ncclCommRegister"; + HookLog(func_name); + using func_ptr = ncclResult_t (*)(const ncclComm_t, void* , size_t, void* * ); + auto func_entry = reinterpret_cast(dlsym(get_nccl_handle(), func_name)); + + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(NCCL_COMM_REGISTER); + reqBuf.Push64BitPointer(comm); + reqBuf.Push64BitPointer(buff); // devptr + reqBuf.Push(size); + *handle = NULL; + reqBuf.Push(handle); + printf("comm: %p, buff: %p, size: %zu\n", comm, buff, size); + // clientEpObj->SendRequest(&reqBuf); + + // ucp_dt_iov_t res_iov; + // res_iov.buffer = handle; + // res_iov.length = sizeof(uint64_t); + // clientEpObj->RecvResponse(NCCL_COMM_REGISTER, &res_iov, 1); + RequestIOV resBuf = RequestIOV(); + resBuf.Push(handle); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + return ncclSuccess; + + // return func_entry(comm, buff, size, handle); +} + +ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle) { + const char* func_name = "ncclCommDeregister"; + HookLog(func_name); + using func_ptr = ncclResult_t (*)(const ncclComm_t, void* ); + auto func_entry = reinterpret_cast(dlsym(get_nccl_handle(), func_name)); + + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(NCCL_COMM_DEREGISTER); + reqBuf.Push64BitPointer(comm); + reqBuf.Push64BitPointer(handle); + clientEpObj->SendRequest(&reqBuf); + return ncclSuccess; + + // return func_entry(comm, handle); +} + +ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm) { + const char* func_name = "ncclRedOpCreatePreMulSum"; + HookLog(func_name); + using func_ptr = ncclResult_t (*)(ncclRedOp_t * , void * , ncclDataType_t, ncclScalarResidence_t, ncclComm_t); + auto func_entry = reinterpret_cast(dlsym(get_nccl_handle(), func_name)); + + RequestIOV reqBuf = 
RequestIOV(); + reqBuf.PushRequestType(NCCL_RED_OP_CREATE_PRE_MUL_SUM); + *op = ncclSum; + reqBuf.Push(op); + reqBuf.Push(datatype); + reqBuf.Push(residence); + reqBuf.Push64BitPointer(comm); + if (residence == ncclScalarHostImmediate) { + reqBuf.PushVar(scalar, GetNcclTypeSize(datatype)); + } else { + reqBuf.Push64BitPointer(scalar); + } + // clientEpObj->SendRequest(&reqBuf); + + // ucp_dt_iov_t res_iov; + // res_iov.buffer = op; + // res_iov.length = sizeof(ncclRedOp_t); + // clientEpObj->RecvResponse(NCCL_RED_OP_CREATE_PRE_MUL_SUM, &res_iov, 1); + RequestIOV resBuf = RequestIOV(); + resBuf.Push(op); + clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf); + return ncclSuccess; + + // return func_entry(op, scalar, datatype, residence, comm); +} + +ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm) { + const char* func_name = "ncclRedOpDestroy"; + HookLog(func_name); + using func_ptr = ncclResult_t (*)(ncclRedOp_t, ncclComm_t); + auto func_entry = reinterpret_cast(dlsym(get_nccl_handle(), func_name)); + + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(NCCL_RED_OP_DESTROY); + reqBuf.Push(op); + reqBuf.Push64BitPointer(comm); + clientEpObj->SendRequest(&reqBuf); + return ncclSuccess; + + // return func_entry(op, comm); +} + +ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream) { + const char* func_name = "ncclReduce"; + HookLog(func_name); + using func_ptr = ncclResult_t (*)(const void* , void* , size_t, ncclDataType_t, ncclRedOp_t, int, ncclComm_t, cudaStream_t); + auto func_entry = reinterpret_cast(dlsym(get_nccl_handle(), func_name)); + + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(NCCL_REDUCE); + reqBuf.Push64BitPointer(sendbuff); // devptr + reqBuf.Push64BitPointer(recvbuff); // devptr + reqBuf.Push(count); + reqBuf.Push(datatype); + reqBuf.Push(op); + reqBuf.Push(root); + reqBuf.Push64BitPointer(comm); + 
reqBuf.Push64BitPointer(stream); + clientEpObj->SendRequest(&reqBuf); + return ncclSuccess; + + // return func_entry(sendbuff, recvbuff, count, datatype, op, root, comm, stream); +} + +ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream) { + const char* func_name = "ncclBroadcast"; + HookLog(func_name); + using func_ptr = ncclResult_t (*)(const void* , void* , size_t, ncclDataType_t, int, ncclComm_t, cudaStream_t); + auto func_entry = reinterpret_cast(dlsym(get_nccl_handle(), func_name)); + + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(NCCL_BROADCAST); + reqBuf.Push64BitPointer(sendbuff); // devptr + reqBuf.Push64BitPointer(recvbuff); // devptr + reqBuf.Push(count); + reqBuf.Push(datatype); + reqBuf.Push(root); + reqBuf.Push64BitPointer(comm); + reqBuf.Push64BitPointer(stream); + clientEpObj->SendRequest(&reqBuf); + return ncclSuccess; + + // return func_entry(sendbuff, recvbuff, count, datatype, root, comm, stream); +} + +ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) { + const char* func_name = "ncclAllReduce"; + HookLog(func_name); + using func_ptr = ncclResult_t (*)(const void* , void* , size_t, ncclDataType_t, ncclRedOp_t, ncclComm_t, cudaStream_t); + auto func_entry = reinterpret_cast(dlsym(get_nccl_handle(), func_name)); + + RequestIOV reqBuf = RequestIOV(); + reqBuf.PushRequestType(NCCL_ALL_REDUCE); + reqBuf.Push64BitPointer(sendbuff); // devptr + reqBuf.Push64BitPointer(recvbuff); // devptr + reqBuf.Push(count); + reqBuf.Push(datatype); + reqBuf.Push(op); + reqBuf.Push64BitPointer(comm); + reqBuf.Push64BitPointer(stream); + clientEpObj->SendRequest(&reqBuf); + return ncclSuccess; + + // return func_entry(sendbuff, recvbuff, count, datatype, op, comm, stream); +} + +ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, 
size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream) {
+    const char* func_name = "ncclReduceScatter";
+    HookLog(func_name);
+    using func_ptr = ncclResult_t (*)(const void* , void* , size_t, ncclDataType_t, ncclRedOp_t, ncclComm_t, cudaStream_t);
+    auto func_entry = reinterpret_cast(dlsym(get_nccl_handle(), func_name));
+
+    RequestIOV reqBuf = RequestIOV();
+    reqBuf.PushRequestType(NCCL_REDUCE_SCATTER);
+    reqBuf.Push64BitPointer(sendbuff); // devptr
+    reqBuf.Push64BitPointer(recvbuff); // devptr
+    reqBuf.Push(recvcount);
+    reqBuf.Push(datatype);
+    reqBuf.Push(op);
+    reqBuf.Push64BitPointer(comm);
+    reqBuf.Push64BitPointer(stream);
+    clientEpObj->SendRequest(&reqBuf);
+    return ncclSuccess;
+
+    // return func_entry(sendbuff, recvbuff, recvcount, datatype, op, comm, stream);
+}
+// Hook for ncclAllGather: packs the call into a RequestIOV tagged NCCL_ALL_GATHER, submits it through clientEpObj->SendRequest() and returns ncclSuccess unconditionally.
+ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream) {
+    const char* func_name = "ncclAllGather";
+    HookLog(func_name);
+    using func_ptr = ncclResult_t (*)(const void* , void* , size_t, ncclDataType_t, ncclComm_t, cudaStream_t);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_nccl_handle(), func_name));
+    // func_entry is resolved but never invoked on this path; the arguments follow the request-type tag in signature order.
+    RequestIOV reqBuf = RequestIOV();
+    reqBuf.PushRequestType(NCCL_ALL_GATHER);
+    reqBuf.Push64BitPointer(sendbuff); // devptr
+    reqBuf.Push64BitPointer(recvbuff); // devptr
+    reqBuf.Push(sendcount);
+    reqBuf.Push(datatype);
+    reqBuf.Push64BitPointer(comm);
+    reqBuf.Push64BitPointer(stream);
+    clientEpObj->SendRequest(&reqBuf);
+    return ncclSuccess;
+    // The direct call into the NCCL library is commented out:
+    // return func_entry(sendbuff, recvbuff, sendcount, datatype, comm, stream);
+}
+// Hook for ncclSend: packs the call into a RequestIOV tagged NCCL_SEND, submits it through clientEpObj->SendRequest() and returns ncclSuccess unconditionally.
+ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream) {
+    const char* func_name = "ncclSend";
+    HookLog(func_name);
+    using func_ptr = ncclResult_t (*)(const void* , size_t, ncclDataType_t, int, ncclComm_t, cudaStream_t);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_nccl_handle(), func_name));
+    // func_entry is resolved but never invoked on this path; the arguments follow the request-type tag in signature order.
+    RequestIOV reqBuf = RequestIOV();
+    reqBuf.PushRequestType(NCCL_SEND);
+    reqBuf.Push64BitPointer(sendbuff); // devptr
+    reqBuf.Push(count);
+    reqBuf.Push(datatype);
+    reqBuf.Push(peer);
+    reqBuf.Push64BitPointer(comm);
+    reqBuf.Push64BitPointer(stream);
+    clientEpObj->SendRequest(&reqBuf);
+    return ncclSuccess;
+    // The direct call into the NCCL library is commented out:
+    // return func_entry(sendbuff, count, datatype, peer, comm, stream);
+}
+// Hook for ncclRecv: packs the call into a RequestIOV tagged NCCL_RECV, submits it through clientEpObj->SendRequest() and returns ncclSuccess unconditionally.
+ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, ncclComm_t comm, cudaStream_t stream) {
+    const char* func_name = "ncclRecv";
+    HookLog(func_name);
+    using func_ptr = ncclResult_t (*)(void* , size_t, ncclDataType_t, int, ncclComm_t, cudaStream_t);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_nccl_handle(), func_name));
+    // func_entry is resolved but never invoked on this path; the arguments follow the request-type tag in signature order.
+    RequestIOV reqBuf = RequestIOV();
+    reqBuf.PushRequestType(NCCL_RECV);
+    reqBuf.Push64BitPointer(recvbuff); // devptr
+    reqBuf.Push(count);
+    reqBuf.Push(datatype);
+    reqBuf.Push(peer);
+    reqBuf.Push64BitPointer(comm);
+    reqBuf.Push64BitPointer(stream);
+    clientEpObj->SendRequest(&reqBuf);
+    return ncclSuccess;
+    // The direct call into the NCCL library is commented out:
+    // return func_entry(recvbuff, count, datatype, peer, comm, stream);
+}
+// Hook for ncclGroupStart: sends a RequestIOV tagged NCCL_GROUP_START through clientEpObj->SendRequest() and returns ncclSuccess unconditionally.
+ncclResult_t ncclGroupStart() {
+    const char* func_name = "ncclGroupStart";
+    HookLog(func_name);
+    using func_ptr = ncclResult_t (*)();
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_nccl_handle(), func_name));
+    // func_entry is resolved but never invoked on this path; the call has no arguments, so a single int 0 is pushed as a dummy payload.
+    RequestIOV reqBuf = RequestIOV();
+    reqBuf.PushRequestType(NCCL_GROUP_START);
+    int tmpDev = 0;
+    reqBuf.Push(tmpDev); // dummy
+    clientEpObj->SendRequest(&reqBuf);
+    return ncclSuccess;
+    // The direct call into the NCCL library is commented out:
+    // return func_entry();
+}
+// Hook for ncclGroupEnd: sends a RequestIOV tagged NCCL_GROUP_END through clientEpObj->SendRequest() and returns ncclSuccess unconditionally.
+ncclResult_t ncclGroupEnd() {
+    const char* func_name = "ncclGroupEnd";
+    HookLog(func_name);
+    using func_ptr = ncclResult_t (*)();
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_nccl_handle(), func_name));
+    // func_entry is resolved but never invoked on this path; the call has no arguments, so a single int 0 is pushed as a dummy payload.
+    RequestIOV reqBuf = RequestIOV();
+    reqBuf.PushRequestType(NCCL_GROUP_END);
+    int tmpDev = 0;
+    reqBuf.Push(tmpDev); // dummy
+    clientEpObj->SendRequest(&reqBuf);
+    return ncclSuccess;
+    // The direct call into the NCCL library is commented out:
+    // return func_entry();
+}
+
diff --git a/GPU-Virtual-Service/gpu-remoting/src/client/nvmlHook.cc b/GPU-Virtual-Service/gpu-remoting/src/client/nvmlHook.cc
new file mode 100644
index 0000000..07e8aeb
--- /dev/null
+++ b/GPU-Virtual-Service/gpu-remoting/src/client/nvmlHook.cc
@@ -0,0 +1,65 @@
+#include "../../include/hook/hook.h"
+// Returns the dlopen() handle for libnvidia-ml.so, loading it with RTLD_LAZY on first use and caching it in a function-local static; logs and returns nullptr when loading fails.
+void* get_nvml_handle() {
+    static void* handle = nullptr;
+    if (!handle) {
+        handle = dlopen("libnvidia-ml.so", RTLD_LAZY);
+        if (!handle) {
+            tool::Logging(LOG_ERROR, HOOK_LOG_TAG, "Failed to load libnvidia-ml.so: \n", dlerror()); // NOTE(review): the format has no placeholder for dlerror() -- confirm tool::Logging prints it
+        }
+    }
+    return handle;
+}
+// Hook for nvmlInit_v2: reports the call through HookLog, resolves the library symbol without calling it, and returns NVML_SUCCESS.
+nvmlReturn_t nvmlInit_v2(void) {
+    const char* func_name = "nvmlInit_v2";
+    HookLog(func_name, false);
+    using func_ptr = nvmlReturn_t (*)();
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_nvml_handle(), func_name));
+    // Nothing is forwarded: func_entry is never invoked and success is reported directly.
+    return NVML_SUCCESS;
+    // The request-forwarding code below is commented out:
+    // RequestBuffer reqBuf = RequestBuffer(sizeof(int));
+    // reqBuf.PushRequestType(NVML_INIT_V2);
+    // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer);
+    // clientEpObj->SendRequest();
+    // return func_entry();
+}
+// Hook for nvmlShutdown: reports the call through HookLog, resolves the library symbol without calling it, and returns NVML_SUCCESS.
+nvmlReturn_t nvmlShutdown(void) {
+    const char* func_name = "nvmlShutdown";
+    HookLog(func_name, false);
+    using func_ptr = nvmlReturn_t (*)();
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_nvml_handle(), func_name));
+
+    // Nothing is forwarded: func_entry is never invoked and success is reported directly.
+    return NVML_SUCCESS;
+
+    // The request-forwarding code below is commented out:
+    // RequestBuffer reqBuf = RequestBuffer(sizeof(int));
+    // reqBuf.PushRequestType(NVML_SHUTDOWN);
+    // clientEpObj->AddIOV(reqBuf.GetSize(), reqBuf._dataBuffer);
+    // clientEpObj->SendRequest();
+    // return func_entry();
+}
+// Hook for nvmlDeviceGetCount_v2: writes config_->GetReqGPUnum() to *deviceCount and returns NVML_SUCCESS; a null deviceCount yields NVML_ERROR_INVALID_ARGUMENT.
+nvmlReturn_t nvmlDeviceGetCount_v2(unsigned int* deviceCount) {
+    std::call_once(initFlag, Intialize); // for nvidia-smi hook
+    const char* func_name = "nvmlDeviceGetCount_v2";
+    HookLog(func_name, false);
+    using func_ptr = nvmlReturn_t (*)(unsigned int*);
+    auto func_entry = reinterpret_cast<func_ptr>(dlsym(get_nvml_handle(), func_name));
+    // Reject a null output pointer (NVML documents NVML_ERROR_INVALID_ARGUMENT for it) before the write below.
+    if (deviceCount == nullptr) return NVML_ERROR_INVALID_ARGUMENT;
+    // RequestIOV reqBuf = RequestIOV();
+    // reqBuf.PushRequestType(CUDA_GET_DEVICE_COUNT);
+    // int tmpDev = 0;
+    // reqBuf.Push(tmpDev);
+
+    // RequestIOV resBuf = RequestIOV();
+    // resBuf.Push(deviceCount);
+    // clientEpObj->SendRequestRecvResponse(&reqBuf, &resBuf);
+    // The count comes from the local configuration object; the remote query above is commented out.
+    *deviceCount = config_->GetReqGPUnum();
+    return NVML_SUCCESS;
+}