@@ -280,6 +280,17 @@ static int find_and_map_device(void)
280280 DLOG ("Device mapped successfully, magic=0x%x version=0x%x\n" ,
281281 magic , reg_read32 (CXL_GPU_REG_VERSION ));
282282
283+ /* Check device ready */
284+ uint32_t status = reg_read32 (CXL_GPU_REG_STATUS );
285+ if (!(status & CXL_GPU_STATUS_READY )) {
286+ DLOG ("Device not ready, status=0x%x\n" , status );
287+ munmap (map , g_bar_size );
288+ close (g_pci_fd );
289+ g_pci_fd = -1 ;
290+ g_regs = NULL ;
291+ continue ; /* try next device */
292+ }
293+
283294 closedir (dir );
284295 return 0 ;
285296 }
@@ -1126,6 +1137,166 @@ int cxl_p2p_get_status(int *num_peers, uint64_t *transfers_completed,
11261137 return CUDA_SUCCESS ;
11271138}
11281139
1140+ /* ========================================================================
1141+ * BAR4 Coherent Memory Support
1142+ * ======================================================================== */
1143+
1144+ static int g_bar4_fd = -1 ;
1145+ static volatile uint8_t * g_bar4_ptr = NULL ;
1146+ static size_t g_bar4_size = 0 ;
1147+ static uint64_t g_coh_offset = 0 ; /* bump allocator offset */
1148+
1149+ static volatile uint8_t * ensure_bar4 (void )
1150+ {
1151+ if (g_bar4_ptr ) return g_bar4_ptr ;
1152+
1153+ /* Find BAR4 for the device we already mapped */
1154+ char path [256 ];
1155+ /* Scan sysfs for the device whose BAR2 we have open */
1156+ DIR * dir = opendir ("/sys/bus/pci/devices" );
1157+ if (!dir ) return NULL ;
1158+ struct dirent * ent ;
1159+ while ((ent = readdir (dir )) != NULL ) {
1160+ if (ent -> d_name [0 ] == '.' ) continue ;
1161+ snprintf (path , sizeof (path ), "/sys/bus/pci/devices/%s/resource" , ent -> d_name );
1162+ FILE * fp = fopen (path , "r" );
1163+ if (!fp ) continue ;
1164+ uint64_t start , end , flags ;
1165+ /* Skip BAR0..BAR3 (4 lines) */
1166+ for (int i = 0 ; i < 4 ; i ++ ) {
1167+ if (fscanf (fp , "0x%lx 0x%lx 0x%lx\n" , & start , & end , & flags ) != 3 ) break ;
1168+ }
1169+ /* Read BAR4 */
1170+ if (fscanf (fp , "0x%lx 0x%lx 0x%lx" , & start , & end , & flags ) == 3 && end > start ) {
1171+ g_bar4_size = end - start + 1 ;
1172+ }
1173+ fclose (fp );
1174+ if (g_bar4_size == 0 ) continue ;
1175+
1176+ /* Check vendor/device match */
1177+ snprintf (path , sizeof (path ), "/sys/bus/pci/devices/%s/vendor" , ent -> d_name );
1178+ int fd = open (path , O_RDONLY );
1179+ if (fd < 0 ) continue ;
1180+ char buf [32 ]; int n = read (fd , buf , sizeof (buf )- 1 ); close (fd );
1181+ if (n <= 0 ) continue ; buf [n ] = '\0' ;
1182+ if ((uint16_t )strtol (buf , NULL , 16 ) != CXL_TYPE2_VENDOR_ID ) { g_bar4_size = 0 ; continue ; }
1183+
1184+ snprintf (path , sizeof (path ), "/sys/bus/pci/devices/%s/device" , ent -> d_name );
1185+ fd = open (path , O_RDONLY );
1186+ if (fd < 0 ) continue ;
1187+ n = read (fd , buf , sizeof (buf )- 1 ); close (fd );
1188+ if (n <= 0 ) continue ; buf [n ] = '\0' ;
1189+ if ((uint16_t )strtol (buf , NULL , 16 ) != CXL_TYPE2_DEVICE_ID ) { g_bar4_size = 0 ; continue ; }
1190+
1191+ /* Check this is the same device we're using (status must be READY) */
1192+ snprintf (path , sizeof (path ), "/sys/bus/pci/devices/%s/resource2" , ent -> d_name );
1193+ int bar2_fd = open (path , O_RDWR | O_SYNC );
1194+ if (bar2_fd < 0 ) { g_bar4_size = 0 ; continue ; }
1195+ void * bar2_map = mmap (NULL , 4096 , PROT_READ , MAP_SHARED , bar2_fd , 0 );
1196+ if (bar2_map == MAP_FAILED ) { close (bar2_fd ); g_bar4_size = 0 ; continue ; }
1197+ uint32_t magic = * (volatile uint32_t * )bar2_map ;
1198+ uint32_t status = * (volatile uint32_t * )((uint8_t * )bar2_map + 8 );
1199+ munmap (bar2_map , 4096 );
1200+ close (bar2_fd );
1201+ if (magic != CXL_GPU_MAGIC || !(status & CXL_GPU_STATUS_READY )) { g_bar4_size = 0 ; continue ; }
1202+
1203+ /* Map BAR4 */
1204+ snprintf (path , sizeof (path ), "/sys/bus/pci/devices/%s/resource4" , ent -> d_name );
1205+ g_bar4_fd = open (path , O_RDWR );
1206+ if (g_bar4_fd < 0 ) { g_bar4_size = 0 ; continue ; }
1207+ void * b4 = mmap (NULL , g_bar4_size , PROT_READ | PROT_WRITE , MAP_SHARED , g_bar4_fd , 0 );
1208+ if (b4 == MAP_FAILED ) { close (g_bar4_fd ); g_bar4_fd = -1 ; g_bar4_size = 0 ; continue ; }
1209+ g_bar4_ptr = (volatile uint8_t * )b4 ;
1210+ DLOG ("Mapped BAR4 for %s (%zu MB)\n" , ent -> d_name , g_bar4_size >> 20 );
1211+ closedir (dir );
1212+ return g_bar4_ptr ;
1213+ }
1214+ closedir (dir );
1215+ return NULL ;
1216+ }
1217+
1218+ int cxlCoherentAlloc (uint64_t size , void * * host_ptr )
1219+ {
1220+ DLOG ("cxlCoherentAlloc(size=%lu)\n" , (unsigned long )size );
1221+ if (!host_ptr || size == 0 ) return 1 ;
1222+ volatile uint8_t * bar4 = ensure_bar4 ();
1223+ if (!bar4 ) return 3 ;
1224+
1225+ size = (size + 4095 ) & ~4095UL ;
1226+ if (g_coh_offset + size > g_bar4_size ) return 2 ;
1227+
1228+ * host_ptr = (void * )(bar4 + g_coh_offset );
1229+ g_coh_offset += size ;
1230+ return 0 ;
1231+ }
1232+
/*
 * Release coherent memory previously returned by cxlCoherentAlloc.
 * BAR4 space is handed out by a bump allocator, so individual blocks
 * cannot be reclaimed; this call is intentionally a no-op and always
 * reports success.
 */
int cxlCoherentFree(void *host_ptr)
{
    (void)host_ptr;   /* nothing to release */
    return 0;
}
1238+
1239+ void * cxlDeviceToHost (uint64_t dev_offset )
1240+ {
1241+ volatile uint8_t * bar4 = ensure_bar4 ();
1242+ if (!bar4 ) return NULL ;
1243+ return (void * )(bar4 + dev_offset );
1244+ }
1245+
/*
 * Issue a full memory barrier so that all loads/stores to coherent
 * (BAR4) memory issued before the fence are ordered before any issued
 * after it.  Always succeeds.
 */
int cxlCoherentFence(void)
{
    __sync_synchronize();   /* GCC builtin: full compiler + hardware barrier */
    return 0;
}
1251+
/*
 * Set the coherency bias mode for a memory range.
 *
 * Bias management is not implemented in this backend: the request is
 * accepted and success is reported so callers need no special-casing.
 */
int cxlSetBias(void *host_ptr, uint64_t size, int bias_mode)
{
    (void)host_ptr;
    (void)size;
    (void)bias_mode;
    return 0;
}
1257+
/*
 * Query the coherency bias mode for a range.
 *
 * No bias state is tracked by this backend, so the reported mode is
 * always 0.  bias_mode may be NULL, in which case only the status code
 * is returned.  Always succeeds.
 */
int cxlGetBias(void *host_ptr, int *bias_mode)
{
    (void)host_ptr;
    if (bias_mode != NULL)
        *bias_mode = 0;
    return 0;
}
1264+
/*
 * Request a bias transition for a memory range.
 *
 * This backend maintains no bias state, so the transition trivially
 * "succeeds" with no effect.
 */
int cxlBiasFlip(void *host_ptr, uint64_t size, int new_bias)
{
    (void)host_ptr;
    (void)size;
    (void)new_bias;
    return 0;
}
1270+
/* Coherency protocol counters reported by cxlGetCoherencyStats. */
typedef struct {
    uint64_t snoop_hits;
    uint64_t snoop_misses;
    uint64_t coherency_requests;
    uint64_t back_invalidations;
    uint64_t writebacks;
    uint64_t evictions;
    uint64_t bias_flips;
    uint64_t device_bias_hits;
    uint64_t host_bias_hits;
    uint64_t upgrades;
    uint64_t downgrades;
    uint64_t directory_entries;
} CXLCoherencyStats;

/*
 * Fill *stats with the current coherency counters.
 *
 * Hardware counters are not wired up in this backend, so every field
 * reads as zero.  Returns 0 on success, 1 if stats is NULL.
 */
int cxlGetCoherencyStats(CXLCoherencyStats *stats)
{
    if (stats == NULL)
        return 1;
    *stats = (CXLCoherencyStats){0};   /* zero every counter */
    return 0;
}
1285+
/*
 * Reset all coherency counters to zero.  Counters are not implemented
 * in this backend, so there is nothing to clear; always succeeds.
 */
int cxlResetCoherencyStats(void)
{
    return 0;
}
1290+
1291+ CUresult cuCxlGetCoherentBase (CUdeviceptr * base , size_t * size , CUdevice dev )
1292+ {
1293+ (void )dev ;
1294+ volatile uint8_t * bar4 = ensure_bar4 ();
1295+ if (base ) * base = bar4 ? (CUdeviceptr )(uintptr_t )bar4 : 0 ;
1296+ if (size ) * size = g_bar4_size ;
1297+ return 0 ;
1298+ }
1299+
11291300/* Library initialization/cleanup */
11301301__attribute__((constructor ))
11311302static void libcuda_init (void )
@@ -1137,6 +1308,14 @@ __attribute__((destructor))
11371308static void libcuda_cleanup (void )
11381309{
11391310 DLOG ("libcuda.so unloading\n" );
1311+ if (g_bar4_ptr ) {
1312+ munmap ((void * )g_bar4_ptr , g_bar4_size );
1313+ g_bar4_ptr = NULL ;
1314+ }
1315+ if (g_bar4_fd >= 0 ) {
1316+ close (g_bar4_fd );
1317+ g_bar4_fd = -1 ;
1318+ }
11401319 if (g_regs ) {
11411320 munmap ((void * )g_regs , g_bar_size );
11421321 g_regs = NULL ;
0 commit comments