Skip to content

Commit f3d117b

Browse files
author
songyuqin0686
committed
XSched integrated with priority scheduling for high-priority task acceleration, auto-detects GPU architecture, adapts preemption level, and runs stably without CUDA errors
1 parent cdeff47 commit f3d117b

3 files changed

Lines changed: 80 additions & 17 deletions

File tree

ggml/src/ggml-cuda/common.cuh

Lines changed: 57 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include <cstdio>
2626
#include <string>
2727
#include <vector>
28+
#include <mutex>
2829

2930
#if defined(GGML_USE_HIP)
3031
#include "vendors/hip.h"
@@ -751,7 +752,7 @@ struct ggml_tensor_extra_gpu {
751752

752753

753754
#if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS))
754-
#define USE_CUDA_GRAPH
755+
// #define USE_CUDA_GRAPH
755756
#endif
756757

757758
struct ggml_graph_node_properties {
@@ -799,8 +800,11 @@ struct ggml_backend_cuda_context {
799800
cudaEvent_t copy_event = nullptr;
800801

801802
cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
803+
HwQueueHandle hwqueues[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { 0 } };
804+
XQueueHandle xqueues[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { 0 } };
802805
cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
803806

807+
mutable std::mutex streams_mutex;
804808
std::unique_ptr<ggml_cuda_graph> cuda_graph;
805809

806810
int priority = 0;
@@ -812,16 +816,63 @@ struct ggml_backend_cuda_context {
812816

813817
~ggml_backend_cuda_context();
814818

819+
// Disable copying and moving to prevent resource management issues
820+
ggml_backend_cuda_context(const ggml_backend_cuda_context&) = delete;
821+
ggml_backend_cuda_context& operator=(const ggml_backend_cuda_context&) = delete;
822+
ggml_backend_cuda_context(ggml_backend_cuda_context&&) = delete;
823+
ggml_backend_cuda_context& operator=(ggml_backend_cuda_context&&) = delete;
824+
825+
int get_max_supported_preempt_level(int device_id) {
826+
cudaDeviceProp prop;
827+
CUDA_CHECK(cudaGetDeviceProperties(&prop, device_id));
828+
int arch = prop.major * 10 + prop.minor;
829+
switch (arch) {
830+
case 35: // Kepler: K20, K40, GTX TITAN
831+
return 2; // kPreemptLevelDeactivate
832+
case 70: // Volta: V100
833+
case 86: // Ampere: RTX 30 series
834+
return 3; // kPreemptLevelInterrupt
835+
default: // other architectures (including A100, arch = 80)
836+
return 1; // kPreemptLevelBlock
837+
}
838+
}
839+
815840
cudaStream_t stream(int device, int stream) {
841+
std::lock_guard<std::mutex> lock(streams_mutex);
842+
843+
// If the stream does not exist yet, create the stream and its XSched queues
816844
if (streams[device][stream] == nullptr) {
817845
ggml_cuda_set_device(device);
818846
CUDA_CHECK(cudaStreamCreateWithFlags(&streams[device][stream], cudaStreamNonBlocking));
819-
HwQueueHandle hwqueue;
820-
CudaQueueCreate(&hwqueue,streams[device][stream]);
821-
XQueueHandle xqueue;
822-
XQueueCreate(&xqueue, hwqueue, kPreemptLevelDeactivate, kQueueCreateFlagNone);
823-
XHintPriority(xqueue, priority); // In XSched, lower number means lower priority
847+
848+
HwQueueHandle hwqueue = 0;
849+
XResult res = CudaQueueCreate(&hwqueue, streams[device][stream]);
850+
if (res != kXSchedSuccess) {
851+
CUDA_CHECK(cudaStreamDestroy(streams[device][stream]));
852+
streams[device][stream] = nullptr;
853+
GGML_ABORT("CudaQueueCreate failed: %d", res);
854+
}
855+
856+
XQueueHandle xqueue = 0;
857+
res = XQueueCreate(&xqueue, hwqueue, get_max_supported_preempt_level(device), kQueueCreateFlagNone);
858+
if (res != kXSchedSuccess) {
859+
HwQueueDestroy(hwqueue);
860+
CUDA_CHECK(cudaStreamDestroy(streams[device][stream]));
861+
streams[device][stream] = nullptr;
862+
GGML_ABORT("XQueueCreate failed: %d", res);
863+
}
864+
865+
hwqueues[device][stream] = hwqueue;
866+
xqueues[device][stream] = xqueue;
867+
868+
// Set the initial priority (always set it, including priority 0)
869+
XHintPriority(xqueue, priority);
870+
}
871+
// If the stream exists but no XSched queue is bound (should not happen; checked for robustness)
872+
else if (xqueues[device][stream] == 0) {
873+
GGML_ABORT("Stream exists but XQueue not bound - internal error");
824874
}
875+
825876
return streams[device][stream];
826877
}
827878

ggml/src/ggml-cuda/ggml-cuda.cu

Lines changed: 20 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -531,14 +531,26 @@ ggml_backend_cuda_context::~ggml_backend_cuda_context() {
531531
if (copy_event != nullptr) {
532532
CUDA_CHECK(cudaEventDestroy(copy_event));
533533
}
534+
535+
// Destroy all XSched queues and hardware queues
534536
for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) {
535537
for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) {
538+
if (xqueues[i][j] != 0) {
539+
XQueueDestroy(xqueues[i][j]);
540+
xqueues[i][j] = 0;
541+
}
542+
if (hwqueues[i][j] != 0) {
543+
HwQueueDestroy(hwqueues[i][j]);
544+
hwqueues[i][j] = 0;
545+
}
536546
if (streams[i][j] != nullptr) {
537547
CUDA_CHECK(cudaStreamDestroy(streams[i][j]));
548+
streams[i][j] = nullptr;
538549
}
539550
}
540551
if (cublas_handles[i] != nullptr) {
541552
CUBLAS_CHECK(cublasDestroy(cublas_handles[i]));
553+
cublas_handles[i] = nullptr;
542554
}
543555
}
544556
}
@@ -2855,19 +2867,19 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev
28552867

28562868
static void ggml_backend_cuda_set_priority(ggml_backend_t backend, int prio) {
28572869
ggml_backend_cuda_context *cuda_ctx = (ggml_backend_cuda_context *)backend->context;
2870+
2871+
std::lock_guard<std::mutex> lock(cuda_ctx->streams_mutex);
2872+
2873+
// Update the priority of every existing XQueue
28582874
for (int device = 0; device < GGML_CUDA_MAX_DEVICES; device++) {
28592875
for (int idx = 0; idx < GGML_CUDA_MAX_STREAMS; idx++) {
2860-
auto stream = cuda_ctx->streams[device][idx];
2861-
if(stream == nullptr) {
2862-
continue;
2876+
if (cuda_ctx->xqueues[device][idx] != 0) {
2877+
XHintPriority(cuda_ctx->xqueues[device][idx], prio);
28632878
}
2864-
HwQueueHandle hwqueue;
2865-
CudaQueueCreate(&hwqueue,stream);
2866-
XQueueHandle xqueue;
2867-
XQueueCreate(&xqueue, hwqueue, kPreemptLevelDeactivate, kQueueCreateFlagNone);
2868-
XHintPriority(xqueue, prio); // In XSched, lower number means lower priority
28692879
}
28702880
}
2881+
2882+
// Store the new priority; XQueues created afterwards will use it
28712883
cuda_ctx->priority = prio;
28722884
}
28732885

tools/server/server.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5010,9 +5010,9 @@ int main(int argc, char ** argv) {
50105010
std::vector<std::thread> threads;
50115011
// this call blocks the main thread until queue_tasks.terminate() is called
50125012
for(int i = 0; i < SERVER_TASK_PRIO_COUNT; i++) {
5013-
threads.emplace_back([&ctx_server, &i]() {
5014-
ctx_server[i].queue_tasks.start_loop();
5015-
});
5013+
threads.emplace_back([&ctx_server](int ind) {
5014+
ctx_server[ind].queue_tasks.start_loop();
5015+
},i);
50165016
}
50175017

50185018
for(auto &thread: threads) {

0 commit comments

Comments
 (0)