@@ -760,21 +760,10 @@ static void init_cxl_memory(void) {
760760 header_size = (header_size + CXL_ALIGNMENT - 1 ) & ~(CXL_ALIGNMENT - 1 );
761761 atomic_store (& g_cxl .header -> alloc_offset , header_size );
762762
763- // Initialize only the mailboxes that fit within the mapped region
764- // Use cxl_safe_memset instead of memset to avoid SIMD SIGILL on CXL memory
765- for (int i = 0 ; i < max_usable_ranks ; i ++ ) {
766- atomic_store (& g_cxl .mailboxes [i ].head , 0 );
767- atomic_store (& g_cxl .mailboxes [i ].tail , 0 );
768- atomic_store (& g_cxl .mailboxes [i ].active , 0 );
769- g_cxl .mailboxes [i ].rank = i ;
770- g_cxl .mailboxes [i ].pid = 0 ;
771- cxl_safe_memset (g_cxl .mailboxes [i ].hostname , 0 , sizeof (g_cxl .mailboxes [i ].hostname ));
772-
773- // Initialize message slots
774- for (int j = 0 ; j < CXL_MSG_QUEUE_SIZE ; j ++ ) {
775- atomic_store (& g_cxl .mailboxes [i ].messages [j ].state , CXL_MSG_EMPTY );
776- }
777- }
763+ // Initialize all mailboxes using cxl_safe_memset to avoid SIMD SIGILL.
764+ // Zero-fills ALL fields: rank=0, pid=0, hostname, head=0, tail=0, active=0,
764+ // messages[].state=0(EMPTY). Note: the old loop set rank=i here; rank/pid are
764+ // now assigned per process later in cxl_register_rank.
765+ cxl_safe_memset (g_cxl .mailboxes , 0 ,
766+ max_usable_ranks * sizeof (cxl_rank_mailbox_t ));
778767
779768 // Initialize collective operation fields
780769 atomic_store (& g_cxl .header -> coll_barrier_count , 0 );
@@ -850,14 +839,11 @@ static void cxl_register_rank(int rank, int world_size) {
850839 atomic_store (& g_cxl .header -> alloc_offset , (uint64_t )CXL_HEADER_SIZE );
851840
852841 // Reset ALL mailbox slots to clear stale messages from previous runs.
853- for (int i = 0 ; i < g_cxl .max_ranks ; i ++ ) {
854- atomic_store (& g_cxl .mailboxes [i ].active , 0 );
855- atomic_store (& g_cxl .mailboxes [i ].head , 0 );
856- atomic_store (& g_cxl .mailboxes [i ].tail , 0 );
857- for (int j = 0 ; j < CXL_MSG_QUEUE_SIZE ; j ++ ) {
858- atomic_store (& g_cxl .mailboxes [i ].messages [j ].state , CXL_MSG_EMPTY );
859- }
860- }
842+ // Use cxl_safe_memset (volatile 8-byte writes) instead of atomic_store
843+ // loops — the compiler at -O3 can optimize atomic_store loops into SIMD
844+ // memset which causes SIGILL on CXL device memory.
845+ cxl_safe_memset (g_cxl .mailboxes , 0 ,
846+ g_cxl .max_ranks * sizeof (cxl_rank_mailbox_t ));
861847
862848 __atomic_thread_fence (__ATOMIC_SEQ_CST );
863849
@@ -872,10 +858,16 @@ static void cxl_register_rank(int rank, int world_size) {
872858
873859 cxl_rank_mailbox_t * my_mailbox = & g_cxl .mailboxes [rank ];
874860
875- // Set up mailbox for this rank
876- my_mailbox -> rank = rank ;
877- my_mailbox -> pid = getpid ();
878- gethostname (my_mailbox -> hostname , sizeof (my_mailbox -> hostname ) - 1 );
861+ // Set up mailbox for this rank (use volatile writes for scalar fields,
862+ // and cxl_safe_memcpy for strings — libc functions like gethostname/strncpy
863+ // use SIMD internally and SIGILL on CXL device memory)
864+ * (volatile uint32_t * )& my_mailbox -> rank = rank ;
865+ * (volatile uint32_t * )& my_mailbox -> pid = getpid ();
866+ {
867+ char tmp_hostname [64 ] = {0 };
868+ gethostname (tmp_hostname , sizeof (tmp_hostname ) - 1 );
869+ cxl_safe_memcpy (my_mailbox -> hostname , tmp_hostname , sizeof (my_mailbox -> hostname ));
870+ }
879871 atomic_store (& my_mailbox -> head , 0 );
880872 atomic_store (& my_mailbox -> tail , 0 );
881873 atomic_store (& my_mailbox -> active , 1 );
@@ -2432,8 +2424,8 @@ int MPI_Allreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype da
24322424 // Use MPI_Barrier for reliable synchronization across nodes
24332425 orig_MPI_Barrier (comm );
24342426
2435- // Initialize result with my own data
2436- memcpy (recvbuf , sendbuf , total_size );
2427+ // Initialize result with my own data (sendbuf may be in CXL memory)
2428+ cxl_safe_memcpy (recvbuf , sendbuf , total_size );
24372429
24382430 // Read and reduce data from all other ranks
24392431 for (int r = 0 ; r < g_cxl .world_size ; r ++ ) {
0 commit comments