Skip to content

Commit 686a492

Browse files
committed
update
1 parent ea5f44d commit 686a492

5 files changed

Lines changed: 1058 additions & 1 deletion

File tree

lib/qemu

Submodule qemu updated from 6c62fdc to 8d22647
Lines changed: 375 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,375 @@
1+
/*
2+
* CXL Bias Mode Benchmark
3+
* Device-bias vs host-bias latency comparison
4+
*
5+
* SPDX-License-Identifier: GPL-2.0-or-later
6+
*/
7+
8+
#include <stdio.h>
9+
#include <stdlib.h>
10+
#include <string.h>
11+
#include <stdint.h>
12+
#include <time.h>
13+
14+
#include "cxl_gpu_cmd.h"
15+
16+
/* CUDA types */
17+
typedef int CUresult;
18+
typedef int CUdevice;
19+
typedef void *CUcontext;
20+
typedef uint64_t CUdeviceptr;
21+
22+
#define CUDA_SUCCESS 0
23+
24+
/*
 * Coherency counters, filled in through cxlGetCoherencyStats() and cleared
 * with cxlResetCoherencyStats(). Every field is a 64-bit counter.
 *
 * NOTE(review): this struct is defined here while the functions that fill it
 * are external, so the field order presumably has to match the library's own
 * definition -- confirm before reordering or adding fields.
 */
25+
typedef struct {
26+
uint64_t snoop_hits;
27+
uint64_t snoop_misses;
28+
uint64_t coherency_requests;
29+
uint64_t back_invalidations;
30+
uint64_t writebacks;
31+
uint64_t evictions;
32+
uint64_t bias_flips;
33+
uint64_t device_bias_hits;
34+
uint64_t host_bias_hits;
35+
uint64_t upgrades;
36+
uint64_t downgrades;
37+
uint64_t directory_entries;
38+
} CXLCoherencyStats;
39+
40+
/* External APIs */
41+
extern CUresult cuInit(unsigned int flags);
42+
extern CUresult cuDeviceGet(CUdevice *device, int ordinal);
43+
extern CUresult cuCtxCreate_v2(CUcontext *ctx, unsigned int flags, CUdevice dev);
44+
45+
extern int cxlCoherentAlloc(uint64_t size, void **host_ptr);
46+
extern int cxlCoherentFree(void *host_ptr);
47+
extern int cxlCoherentFence(void);
48+
extern int cxlSetBias(void *host_ptr, uint64_t size, int bias_mode);
49+
extern int cxlGetBias(void *host_ptr, int *bias_mode);
50+
extern int cxlBiasFlip(void *host_ptr, uint64_t size, int new_bias);
51+
extern int cxlGetCoherencyStats(CXLCoherencyStats *stats);
52+
extern int cxlResetCoherencyStats(void);
53+
54+
/*
 * Benchmark parameters. With these values the strided CPU loops touch
 * REGION_SIZE / STRIDE = 1024 locations per pass.
 */
#define REGION_SIZE (64 * 1024) /* 64KB test region */
55+
/* Number of passes over the region in each timed CPU read/write loop. */
#define NUM_ITERATIONS 10000
56+
#define STRIDE 64 /* Cache line stride */
57+
58+
/*
 * Elapsed time from *start to *end, in nanoseconds.
 *
 * Both terms are reduced to unsigned arithmetic, so when end->tv_nsec is
 * smaller than start->tv_nsec the "negative" nanosecond delta wraps and is
 * cancelled by the seconds term. The result is only meaningful when *end is
 * not earlier than *start.
 */
static uint64_t time_diff_ns(struct timespec *start, struct timespec *end)
{
    uint64_t seconds_part_ns = (end->tv_sec - start->tv_sec) * 1000000000ULL;
    uint64_t nanos_part = end->tv_nsec - start->tv_nsec;

    return seconds_part_ns + nanos_part;
}
63+
64+
static void benchmark_cpu_writes(void *region, size_t size, const char *label)
65+
{
66+
struct timespec t1, t2;
67+
volatile uint64_t *data = (volatile uint64_t *)region;
68+
size_t count = size / STRIDE;
69+
70+
clock_gettime(CLOCK_MONOTONIC, &t1);
71+
for (int iter = 0; iter < NUM_ITERATIONS; iter++) {
72+
for (size_t i = 0; i < count; i++) {
73+
data[i * (STRIDE / sizeof(uint64_t))] = (uint64_t)iter + i;
74+
}
75+
}
76+
clock_gettime(CLOCK_MONOTONIC, &t2);
77+
78+
uint64_t total_ns = time_diff_ns(&t1, &t2);
79+
uint64_t ops = (uint64_t)NUM_ITERATIONS * count;
80+
printf(" %s CPU writes: %lu ns total, %lu ns/op (%lu ops)\n",
81+
label, (unsigned long)total_ns,
82+
(unsigned long)(total_ns / ops), (unsigned long)ops);
83+
}
84+
85+
static void benchmark_cpu_reads(void *region, size_t size, const char *label)
86+
{
87+
struct timespec t1, t2;
88+
volatile uint64_t *data = (volatile uint64_t *)region;
89+
size_t count = size / STRIDE;
90+
volatile uint64_t sink = 0;
91+
92+
clock_gettime(CLOCK_MONOTONIC, &t1);
93+
for (int iter = 0; iter < NUM_ITERATIONS; iter++) {
94+
for (size_t i = 0; i < count; i++) {
95+
sink += data[i * (STRIDE / sizeof(uint64_t))];
96+
}
97+
}
98+
clock_gettime(CLOCK_MONOTONIC, &t2);
99+
100+
(void)sink;
101+
102+
uint64_t total_ns = time_diff_ns(&t1, &t2);
103+
uint64_t ops = (uint64_t)NUM_ITERATIONS * count;
104+
printf(" %s CPU reads: %lu ns total, %lu ns/op (%lu ops)\n",
105+
label, (unsigned long)total_ns,
106+
(unsigned long)(total_ns / ops), (unsigned long)ops);
107+
}
108+
109+
static void test_bias_set_get(void)
110+
{
111+
printf(" [TEST] Bias set/get... ");
112+
113+
void *region = NULL;
114+
int ret = cxlCoherentAlloc(REGION_SIZE, &region);
115+
if (ret != CUDA_SUCCESS || !region) {
116+
printf("FAIL: alloc failed\n");
117+
return;
118+
}
119+
120+
/* Default should be host-biased */
121+
int bias = -1;
122+
ret = cxlGetBias(region, &bias);
123+
if (ret != CUDA_SUCCESS) {
124+
printf("FAIL: get bias failed\n");
125+
cxlCoherentFree(region);
126+
return;
127+
}
128+
129+
/* Set to device-biased */
130+
ret = cxlSetBias(region, REGION_SIZE, CXL_BIAS_DEVICE);
131+
if (ret != CUDA_SUCCESS) {
132+
printf("FAIL: set bias failed\n");
133+
cxlCoherentFree(region);
134+
return;
135+
}
136+
137+
ret = cxlGetBias(region, &bias);
138+
if (ret != CUDA_SUCCESS || bias != CXL_BIAS_DEVICE) {
139+
printf("FAIL: bias not device after set\n");
140+
cxlCoherentFree(region);
141+
return;
142+
}
143+
144+
/* Flip back to host */
145+
ret = cxlBiasFlip(region, REGION_SIZE, CXL_BIAS_HOST);
146+
if (ret != CUDA_SUCCESS) {
147+
printf("FAIL: bias flip failed\n");
148+
cxlCoherentFree(region);
149+
return;
150+
}
151+
152+
ret = cxlGetBias(region, &bias);
153+
if (ret != CUDA_SUCCESS || bias != CXL_BIAS_HOST) {
154+
printf("FAIL: bias not host after flip\n");
155+
cxlCoherentFree(region);
156+
return;
157+
}
158+
159+
cxlCoherentFree(region);
160+
printf("PASS\n");
161+
}
162+
163+
static void benchmark_host_bias(void)
164+
{
165+
printf("\n [BENCH] Host-biased mode:\n");
166+
167+
void *region = NULL;
168+
int ret = cxlCoherentAlloc(REGION_SIZE, &region);
169+
if (ret != CUDA_SUCCESS || !region) {
170+
printf(" SKIP: alloc failed\n");
171+
return;
172+
}
173+
174+
/* Set host-biased */
175+
cxlSetBias(region, REGION_SIZE, CXL_BIAS_HOST);
176+
cxlCoherentFence();
177+
178+
cxlResetCoherencyStats();
179+
180+
benchmark_cpu_writes(region, REGION_SIZE, "Host-bias");
181+
benchmark_cpu_reads(region, REGION_SIZE, "Host-bias");
182+
183+
CXLCoherencyStats stats;
184+
cxlGetCoherencyStats(&stats);
185+
printf(" Host-bias stats: snoop_hits=%lu, host_bias_hits=%lu, "
186+
"device_bias_hits=%lu\n",
187+
(unsigned long)stats.snoop_hits,
188+
(unsigned long)stats.host_bias_hits,
189+
(unsigned long)stats.device_bias_hits);
190+
191+
cxlCoherentFree(region);
192+
}
193+
194+
static void benchmark_device_bias(void)
195+
{
196+
printf("\n [BENCH] Device-biased mode:\n");
197+
198+
void *region = NULL;
199+
int ret = cxlCoherentAlloc(REGION_SIZE, &region);
200+
if (ret != CUDA_SUCCESS || !region) {
201+
printf(" SKIP: alloc failed\n");
202+
return;
203+
}
204+
205+
/* Set device-biased */
206+
cxlSetBias(region, REGION_SIZE, CXL_BIAS_DEVICE);
207+
cxlCoherentFence();
208+
209+
cxlResetCoherencyStats();
210+
211+
benchmark_cpu_writes(region, REGION_SIZE, "Dev-bias");
212+
benchmark_cpu_reads(region, REGION_SIZE, "Dev-bias");
213+
214+
CXLCoherencyStats stats;
215+
cxlGetCoherencyStats(&stats);
216+
printf(" Device-bias stats: snoop_hits=%lu, host_bias_hits=%lu, "
217+
"device_bias_hits=%lu\n",
218+
(unsigned long)stats.snoop_hits,
219+
(unsigned long)stats.host_bias_hits,
220+
(unsigned long)stats.device_bias_hits);
221+
222+
cxlCoherentFree(region);
223+
}
224+
225+
static void benchmark_bias_flip_overhead(void)
226+
{
227+
printf("\n [BENCH] Bias flip overhead:\n");
228+
229+
void *region = NULL;
230+
int ret = cxlCoherentAlloc(REGION_SIZE, &region);
231+
if (ret != CUDA_SUCCESS || !region) {
232+
printf(" SKIP: alloc failed\n");
233+
return;
234+
}
235+
236+
struct timespec t1, t2;
237+
int flips = 100;
238+
239+
cxlResetCoherencyStats();
240+
clock_gettime(CLOCK_MONOTONIC, &t1);
241+
for (int i = 0; i < flips; i++) {
242+
cxlBiasFlip(region, REGION_SIZE,
243+
(i % 2 == 0) ? CXL_BIAS_DEVICE : CXL_BIAS_HOST);
244+
}
245+
clock_gettime(CLOCK_MONOTONIC, &t2);
246+
247+
uint64_t total_ns = time_diff_ns(&t1, &t2);
248+
printf(" %d bias flips: %lu ns total, %lu ns/flip\n",
249+
flips, (unsigned long)total_ns,
250+
(unsigned long)(total_ns / flips));
251+
252+
CXLCoherencyStats stats;
253+
cxlGetCoherencyStats(&stats);
254+
printf(" Flip stats: bias_flips=%lu, writebacks=%lu\n",
255+
(unsigned long)stats.bias_flips,
256+
(unsigned long)stats.writebacks);
257+
258+
cxlCoherentFree(region);
259+
}
260+
261+
static void benchmark_phase_pattern(void)
262+
{
263+
printf("\n [BENCH] Phase-based access pattern (CPU-write then GPU-read):\n");
264+
265+
void *region = NULL;
266+
int ret = cxlCoherentAlloc(REGION_SIZE, &region);
267+
if (ret != CUDA_SUCCESS || !region) {
268+
printf(" SKIP: alloc failed\n");
269+
return;
270+
}
271+
272+
struct timespec t1, t2;
273+
int phases = 50;
274+
volatile uint64_t *data = (volatile uint64_t *)region;
275+
size_t count = REGION_SIZE / sizeof(uint64_t);
276+
277+
/* Without bias control: just do writes and reads */
278+
cxlResetCoherencyStats();
279+
clock_gettime(CLOCK_MONOTONIC, &t1);
280+
for (int phase = 0; phase < phases; phase++) {
281+
/* CPU write phase */
282+
for (size_t i = 0; i < count; i++) {
283+
data[i] = (uint64_t)phase + i;
284+
}
285+
cxlCoherentFence();
286+
287+
/* Simulated GPU read phase (CPU reads standing in) */
288+
volatile uint64_t sink = 0;
289+
for (size_t i = 0; i < count; i++) {
290+
sink += data[i];
291+
}
292+
(void)sink;
293+
}
294+
clock_gettime(CLOCK_MONOTONIC, &t2);
295+
uint64_t no_bias_ns = time_diff_ns(&t1, &t2);
296+
297+
CXLCoherencyStats stats_no_bias;
298+
cxlGetCoherencyStats(&stats_no_bias);
299+
300+
/* With bias control: flip at phase boundaries */
301+
cxlResetCoherencyStats();
302+
clock_gettime(CLOCK_MONOTONIC, &t1);
303+
for (int phase = 0; phase < phases; phase++) {
304+
/* Switch to host-bias for CPU write phase */
305+
cxlBiasFlip(region, REGION_SIZE, CXL_BIAS_HOST);
306+
307+
for (size_t i = 0; i < count; i++) {
308+
data[i] = (uint64_t)phase + i;
309+
}
310+
cxlCoherentFence();
311+
312+
/* Switch to device-bias for GPU read phase */
313+
cxlBiasFlip(region, REGION_SIZE, CXL_BIAS_DEVICE);
314+
315+
volatile uint64_t sink = 0;
316+
for (size_t i = 0; i < count; i++) {
317+
sink += data[i];
318+
}
319+
(void)sink;
320+
}
321+
clock_gettime(CLOCK_MONOTONIC, &t2);
322+
uint64_t bias_ns = time_diff_ns(&t1, &t2);
323+
324+
CXLCoherencyStats stats_bias;
325+
cxlGetCoherencyStats(&stats_bias);
326+
327+
printf(" Without bias control: %lu ns (%lu ns/phase)\n",
328+
(unsigned long)no_bias_ns, (unsigned long)(no_bias_ns / phases));
329+
printf(" coherency_reqs=%lu, back_inv=%lu\n",
330+
(unsigned long)stats_no_bias.coherency_requests,
331+
(unsigned long)stats_no_bias.back_invalidations);
332+
printf(" With bias control: %lu ns (%lu ns/phase)\n",
333+
(unsigned long)bias_ns, (unsigned long)(bias_ns / phases));
334+
printf(" coherency_reqs=%lu, back_inv=%lu, bias_flips=%lu\n",
335+
(unsigned long)stats_bias.coherency_requests,
336+
(unsigned long)stats_bias.back_invalidations,
337+
(unsigned long)stats_bias.bias_flips);
338+
339+
cxlCoherentFree(region);
340+
}
341+
342+
int main(void)
343+
{
344+
printf("=== CXL Bias Mode Benchmark ===\n\n");
345+
346+
CUresult err = cuInit(0);
347+
if (err != CUDA_SUCCESS) {
348+
fprintf(stderr, "cuInit failed: %d\n", err);
349+
return 1;
350+
}
351+
352+
CUdevice dev;
353+
err = cuDeviceGet(&dev, 0);
354+
if (err != CUDA_SUCCESS) {
355+
fprintf(stderr, "cuDeviceGet failed: %d\n", err);
356+
return 1;
357+
}
358+
359+
CUcontext ctx;
360+
err = cuCtxCreate_v2(&ctx, 0, dev);
361+
if (err != CUDA_SUCCESS) {
362+
fprintf(stderr, "cuCtxCreate failed: %d\n", err);
363+
return 1;
364+
}
365+
366+
printf("Running bias mode tests and benchmarks:\n");
367+
test_bias_set_get();
368+
benchmark_host_bias();
369+
benchmark_device_bias();
370+
benchmark_bias_flip_overhead();
371+
benchmark_phase_pattern();
372+
373+
printf("\n=== Benchmark complete ===\n");
374+
return 0;
375+
}

0 commit comments

Comments
 (0)