
Conversation

Owner

@roiser roiser commented Nov 23, 2025

No description provided.

roiser pushed a commit to roiser/madgraph4gpu-generated-processes that referenced this pull request Jan 14, 2026
roiser pushed a commit to roiser/madgraph4gpu-generated-processes that referenced this pull request Jan 14, 2026
roiser pushed a commit to roiser/madgraph4gpu-generated-processes that referenced this pull request Jan 14, 2026
@github-actions

❌ Code Format Check Failed

The generated code does not conform to clang-format rules.

Please update your code generator to produce properly formatted code.
8 files need formatting.
See attached patch for details:

@github-actions

❌ Code Format Check Failed

The generated code does not conform to clang-format rules.

Please update your code generator to produce properly formatted code.
8 files need formatting.
See attached patch for details:
--- src/mgOnGpuConfig.h	2026-01-14 14:00:34.435664702 +0000
+++ -	2026-01-14 14:00:36.427511728 +0000
@@ -180,7 +180,7 @@
#if defined MGONGPU_FPTYPE_DOUBLE
  typedef double fptype; // double precision (8 bytes, fp64)
#elif defined MGONGPU_FPTYPE_FLOAT
-  typedef float fptype;  // single precision (4 bytes, fp32)
+  typedef float fptype; // single precision (4 bytes, fp32)
#endif

  // Floating point type (for color algebra alone #537): fptype2

--- src/mgOnGpuVectors.h	2026-01-14 14:00:34.357664656 +0000
+++ -	2026-01-14 14:00:36.620091862 +0000
@@ -123,7 +123,7 @@
#if defined MGONGPU_FPTYPE_DOUBLE
  typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb
#elif defined MGONGPU_FPTYPE_FLOAT
-  typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) );                         // bbbb
+  typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb
#endif
#else // gcc
  typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) );

--- SubProcesses/P1_uux_ttx/color_sum.cc	2026-01-14 14:00:34.435664702 +0000
+++ -	2026-01-14 14:00:37.369229554 +0000
@@ -379,20 +379,20 @@

#ifdef MGONGPUCPP_GPUIMPL
  void
-  color_sum_gpu( fptype* ghelAllMEs,               // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
-                 const fptype* ghelAllJamps,       // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
-                 fptype2* ghelAllBlasTmp,          // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
-                 gpuBlasHandle_t* pBlasHandle,     // input: cuBLAS/hipBLAS handle
-                 gpuStream_t* ghelStreams,         // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
-                 const int nGoodHel,               // input: number of good helicities
-                 const int gpublocks,              // input: cuda gpublocks
-                 const int gputhreads )            // input: cuda gputhreads
+  color_sum_gpu( fptype* ghelAllMEs,           // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
+                 fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+                 const int nGoodHel,           // input: number of good helicities
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
  {
    const int nevt = gpublocks * gputhreads;
    // CASE 1: KERNEL
    if( !pBlasHandle )
    {
-      assert( ghelAllBlasTmp == nullptr );  // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+      assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
      // Loop over helicities
      for( int ighel = 0; ighel < nGoodHel; ighel++ )
      {
@@ -409,13 +409,13 @@
      assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
#else
      checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
-      // Reset the tmp buffer
+                                          // Reset the tmp buffer
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
#else
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
#endif
-      // Delegate the color sum to BLAS for 
+      // Delegate the color sum to BLAS for
      color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
#endif
    }

--- SubProcesses/P1_uux_ttx/CPPProcess.cc	2026-01-14 14:00:34.434664701 +0000
+++ -	2026-01-14 14:00:37.444504349 +0000
@@ -735,11 +735,11 @@
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                       fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators,      // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators,    // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
#endif
-                       bool* isGoodHel,            // output: isGoodHel[ncomb] - host array
-                       const int nevt )            // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+                       bool* isGoodHel, // output: isGoodHel[ncomb] - host array
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
  {
    //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
    //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
@@ -780,11 +780,11 @@
#endif
        }
        constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering
-        //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
+                                                 //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
        cxtype_sv jamp_sv[2 * ncolor] = {}; // all zeros
#else
-        cxtype_sv jamp_sv[ncolor] = {};  // all zeros
+        cxtype_sv jamp_sv[ncolor] = {}; // all zeros
#endif
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
        constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
@@ -1121,7 +1121,7 @@
    assert( npagV % 2 == 0 );     // SANITY CHECK for mixed fptypes: two neppV-pages are merged to one 2*neppV-page
    const int npagV2 = npagV / 2; // loop on two SIMD pages (neppV events) at a time
#else
-    const int npagV2 = npagV;            // loop on one SIMD page (neppV events) at a time
+    const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time
#endif
#ifdef _OPENMP
    // OMP multithreading #575 (NB: tested only with gcc11 so far)
@@ -1184,7 +1184,7 @@
      // Running sum of partial amplitudes squared for event by event color selection (#402)
      // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
      fptype_sv jamp2_sv[nParity * ncolor] = {};
-      fptype_sv MEs_ighel[ncomb] = {};  // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
+      fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page)
#endif

--- SubProcesses/P1_uux_ttx/check_sa.cc	2026-01-14 14:00:34.434664701 +0000
+++ -	2026-01-14 14:00:37.768768570 +0000
@@ -135,7 +135,7 @@
#ifdef MGONGPUCPP_GPUIMPL
  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
#else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost;   // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
#endif
  // Bridge emulation mode (NB Bridge implies RamboHost!)
  bool bridge = false;
@@ -918,7 +918,7 @@
  wrkflwtxt += "/sse4";
#endif
#else
-  wrkflwtxt += "/????";                                           // no path to this statement
+  wrkflwtxt += "/????"; // no path to this statement
#endif
  // -- Has cxtype_v::operator[] bracket with non-const reference?
#if defined MGONGPU_CPPSIMD
@@ -1143,7 +1143,7 @@
#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX
             << "\"STD::COMPLEX\"," << std::endl
#else
-             << "\"???\"," << std::endl                           // no path to this statement...
+             << "\"???\"," << std::endl // no path to this statement...
#endif
             << "\"RanNumb memory layout\": "
             << "\"AOSOA[" << neppR << "]\""

--- SubProcesses/P1_gg_ttx/color_sum.cc	2026-01-14 14:00:34.387664674 +0000
+++ -	2026-01-14 14:00:38.172421262 +0000
@@ -379,20 +379,20 @@

#ifdef MGONGPUCPP_GPUIMPL
  void
-  color_sum_gpu( fptype* ghelAllMEs,               // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
-                 const fptype* ghelAllJamps,       // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
-                 fptype2* ghelAllBlasTmp,          // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
-                 gpuBlasHandle_t* pBlasHandle,     // input: cuBLAS/hipBLAS handle
-                 gpuStream_t* ghelStreams,         // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
-                 const int nGoodHel,               // input: number of good helicities
-                 const int gpublocks,              // input: cuda gpublocks
-                 const int gputhreads )            // input: cuda gputhreads
+  color_sum_gpu( fptype* ghelAllMEs,           // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
+                 fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+                 const int nGoodHel,           // input: number of good helicities
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
  {
    const int nevt = gpublocks * gputhreads;
    // CASE 1: KERNEL
    if( !pBlasHandle )
    {
-      assert( ghelAllBlasTmp == nullptr );  // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+      assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
      // Loop over helicities
      for( int ighel = 0; ighel < nGoodHel; ighel++ )
      {
@@ -409,13 +409,13 @@
      assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
#else
      checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
-      // Reset the tmp buffer
+                                          // Reset the tmp buffer
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
#else
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
#endif
-      // Delegate the color sum to BLAS for 
+      // Delegate the color sum to BLAS for
      color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
#endif
    }

--- SubProcesses/P1_gg_ttx/CPPProcess.cc	2026-01-14 14:00:34.386664673 +0000
+++ -	2026-01-14 14:00:38.249067813 +0000
@@ -758,11 +758,11 @@
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                       fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators,      // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators,    // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
#endif
-                       bool* isGoodHel,            // output: isGoodHel[ncomb] - host array
-                       const int nevt )            // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+                       bool* isGoodHel, // output: isGoodHel[ncomb] - host array
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
  {
    //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
    //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
@@ -803,11 +803,11 @@
#endif
        }
        constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering
-        //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
+                                                 //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
        cxtype_sv jamp_sv[2 * ncolor] = {}; // all zeros
#else
-        cxtype_sv jamp_sv[ncolor] = {};  // all zeros
+        cxtype_sv jamp_sv[ncolor] = {}; // all zeros
#endif
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
        constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
@@ -1144,7 +1144,7 @@
    assert( npagV % 2 == 0 );     // SANITY CHECK for mixed fptypes: two neppV-pages are merged to one 2*neppV-page
    const int npagV2 = npagV / 2; // loop on two SIMD pages (neppV events) at a time
#else
-    const int npagV2 = npagV;            // loop on one SIMD page (neppV events) at a time
+    const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time
#endif
#ifdef _OPENMP
    // OMP multithreading #575 (NB: tested only with gcc11 so far)
@@ -1207,7 +1207,7 @@
      // Running sum of partial amplitudes squared for event by event color selection (#402)
      // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
      fptype_sv jamp2_sv[nParity * ncolor] = {};
-      fptype_sv MEs_ighel[ncomb] = {};  // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
+      fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page)
#endif

--- SubProcesses/P1_gg_ttx/check_sa.cc	2026-01-14 14:00:34.387664674 +0000
+++ -	2026-01-14 14:00:38.573087912 +0000
@@ -135,7 +135,7 @@
#ifdef MGONGPUCPP_GPUIMPL
  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
#else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost;   // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
#endif
  // Bridge emulation mode (NB Bridge implies RamboHost!)
  bool bridge = false;
@@ -918,7 +918,7 @@
  wrkflwtxt += "/sse4";
#endif
#else
-  wrkflwtxt += "/????";                                           // no path to this statement
+  wrkflwtxt += "/????"; // no path to this statement
#endif
  // -- Has cxtype_v::operator[] bracket with non-const reference?
#if defined MGONGPU_CPPSIMD
@@ -1143,7 +1143,7 @@
#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX
             << "\"STD::COMPLEX\"," << std::endl
#else
-             << "\"???\"," << std::endl                           // no path to this statement...
+             << "\"???\"," << std::endl // no path to this statement...
#endif
             << "\"RanNumb memory layout\": "
             << "\"AOSOA[" << neppR << "]\""

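For reference, a minimal sketch of how the reported violations could be reproduced or fixed locally, assuming clang-format is available and the attached patch has been saved as format.patch (the file name and the exact invocations below are illustrative assumptions, not taken from the CI configuration):

  # reproduce the check on one of the flagged files (non-zero exit code on violations)
  clang-format --dry-run --Werror src/mgOnGpuConfig.h
  # apply the attached patch to the working tree
  git apply format.patch
  # or reformat the flagged files in place
  clang-format -i src/mgOnGpuConfig.h src/mgOnGpuVectors.h

Either route only fixes the generated output; as the bot notes, the underlying fix belongs in the code generator.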
@github-actions

❌ Code Format Check Failed

The generated code does not conform to clang-format rules.

Please update your code generator to produce properly formatted code.
8 files need formatting.
See attached patch for details:

--- src/mgOnGpuConfig.h	2026-01-14 14:07:20.545692722 +0000
+++ -	2026-01-14 14:07:24.334005860 +0000
@@ -180,7 +180,7 @@
#if defined MGONGPU_FPTYPE_DOUBLE
  typedef double fptype; // double precision (8 bytes, fp64)
#elif defined MGONGPU_FPTYPE_FLOAT
-  typedef float fptype;  // single precision (4 bytes, fp32)
+  typedef float fptype; // single precision (4 bytes, fp32)
#endif

  // Floating point type (for color algebra alone #537): fptype2

--- src/mgOnGpuVectors.h	2026-01-14 14:07:20.468692884 +0000
+++ -	2026-01-14 14:07:24.527893709 +0000
@@ -123,7 +123,7 @@
#if defined MGONGPU_FPTYPE_DOUBLE
  typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb
#elif defined MGONGPU_FPTYPE_FLOAT
-  typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) );                         // bbbb
+  typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb
#endif
#else // gcc
  typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) );

--- SubProcesses/P1_uux_ttx/color_sum.cc	2026-01-14 14:07:20.545692722 +0000
+++ -	2026-01-14 14:07:25.256162896 +0000
@@ -379,20 +379,20 @@

#ifdef MGONGPUCPP_GPUIMPL
  void
-  color_sum_gpu( fptype* ghelAllMEs,               // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
-                 const fptype* ghelAllJamps,       // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
-                 fptype2* ghelAllBlasTmp,          // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
-                 gpuBlasHandle_t* pBlasHandle,     // input: cuBLAS/hipBLAS handle
-                 gpuStream_t* ghelStreams,         // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
-                 const int nGoodHel,               // input: number of good helicities
-                 const int gpublocks,              // input: cuda gpublocks
-                 const int gputhreads )            // input: cuda gputhreads
+  color_sum_gpu( fptype* ghelAllMEs,           // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
+                 fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+                 const int nGoodHel,           // input: number of good helicities
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
  {
    const int nevt = gpublocks * gputhreads;
    // CASE 1: KERNEL
    if( !pBlasHandle )
    {
-      assert( ghelAllBlasTmp == nullptr );  // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+      assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
      // Loop over helicities
      for( int ighel = 0; ighel < nGoodHel; ighel++ )
      {
@@ -409,13 +409,13 @@
      assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
#else
      checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
-      // Reset the tmp buffer
+                                          // Reset the tmp buffer
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
#else
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
#endif
-      // Delegate the color sum to BLAS for 
+      // Delegate the color sum to BLAS for
      color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
#endif
    }

--- SubProcesses/P1_uux_ttx/CPPProcess.cc	2026-01-14 14:07:20.544692724 +0000
+++ -	2026-01-14 14:07:25.335112782 +0000
@@ -735,11 +735,11 @@
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                       fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators,      // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators,    // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
#endif
-                       bool* isGoodHel,            // output: isGoodHel[ncomb] - host array
-                       const int nevt )            // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+                       bool* isGoodHel, // output: isGoodHel[ncomb] - host array
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
  {
    //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
    //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
@@ -780,11 +780,11 @@
#endif
        }
        constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering
-        //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
+                                                 //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
        cxtype_sv jamp_sv[2 * ncolor] = {}; // all zeros
#else
-        cxtype_sv jamp_sv[ncolor] = {};  // all zeros
+        cxtype_sv jamp_sv[ncolor] = {}; // all zeros
#endif
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
        constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
@@ -1121,7 +1121,7 @@
    assert( npagV % 2 == 0 );     // SANITY CHECK for mixed fptypes: two neppV-pages are merged to one 2*neppV-page
    const int npagV2 = npagV / 2; // loop on two SIMD pages (neppV events) at a time
#else
-    const int npagV2 = npagV;            // loop on one SIMD page (neppV events) at a time
+    const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time
#endif
#ifdef _OPENMP
    // OMP multithreading #575 (NB: tested only with gcc11 so far)
@@ -1184,7 +1184,7 @@
      // Running sum of partial amplitudes squared for event by event color selection (#402)
      // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
      fptype_sv jamp2_sv[nParity * ncolor] = {};
-      fptype_sv MEs_ighel[ncomb] = {};  // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
+      fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page)
#endif

--- SubProcesses/P1_uux_ttx/check_sa.cc	2026-01-14 14:07:20.544692724 +0000
+++ -	2026-01-14 14:07:25.660834524 +0000
@@ -135,7 +135,7 @@
#ifdef MGONGPUCPP_GPUIMPL
  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
#else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost;   // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
#endif
  // Bridge emulation mode (NB Bridge implies RamboHost!)
  bool bridge = false;
@@ -918,7 +918,7 @@
  wrkflwtxt += "/sse4";
#endif
#else
-  wrkflwtxt += "/????";                                           // no path to this statement
+  wrkflwtxt += "/????"; // no path to this statement
#endif
  // -- Has cxtype_v::operator[] bracket with non-const reference?
#if defined MGONGPU_CPPSIMD
@@ -1143,7 +1143,7 @@
#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX
             << "\"STD::COMPLEX\"," << std::endl
#else
-             << "\"???\"," << std::endl                           // no path to this statement...
+             << "\"???\"," << std::endl // no path to this statement...
#endif
             << "\"RanNumb memory layout\": "
             << "\"AOSOA[" << neppR << "]\""

--- SubProcesses/P1_gg_ttx/color_sum.cc	2026-01-14 14:07:20.498692821 +0000
+++ -	2026-01-14 14:07:26.063950125 +0000
@@ -379,20 +379,20 @@

#ifdef MGONGPUCPP_GPUIMPL
  void
-  color_sum_gpu( fptype* ghelAllMEs,               // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
-                 const fptype* ghelAllJamps,       // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
-                 fptype2* ghelAllBlasTmp,          // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
-                 gpuBlasHandle_t* pBlasHandle,     // input: cuBLAS/hipBLAS handle
-                 gpuStream_t* ghelStreams,         // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
-                 const int nGoodHel,               // input: number of good helicities
-                 const int gpublocks,              // input: cuda gpublocks
-                 const int gputhreads )            // input: cuda gputhreads
+  color_sum_gpu( fptype* ghelAllMEs,           // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
+                 fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+                 const int nGoodHel,           // input: number of good helicities
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
  {
    const int nevt = gpublocks * gputhreads;
    // CASE 1: KERNEL
    if( !pBlasHandle )
    {
-      assert( ghelAllBlasTmp == nullptr );  // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+      assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
      // Loop over helicities
      for( int ighel = 0; ighel < nGoodHel; ighel++ )
      {
@@ -409,13 +409,13 @@
      assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
#else
      checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
-      // Reset the tmp buffer
+                                          // Reset the tmp buffer
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
#else
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
#endif
-      // Delegate the color sum to BLAS for 
+      // Delegate the color sum to BLAS for
      color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
#endif
    }

--- SubProcesses/P1_gg_ttx/CPPProcess.cc	2026-01-14 14:07:20.497692823 +0000
+++ -	2026-01-14 14:07:26.141225849 +0000
@@ -758,11 +758,11 @@
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                       fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators,      // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators,    // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
#endif
-                       bool* isGoodHel,            // output: isGoodHel[ncomb] - host array
-                       const int nevt )            // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+                       bool* isGoodHel, // output: isGoodHel[ncomb] - host array
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
  {
    //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
    //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
@@ -803,11 +803,11 @@
#endif
        }
        constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering
-        //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
+                                                 //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
        cxtype_sv jamp_sv[2 * ncolor] = {}; // all zeros
#else
-        cxtype_sv jamp_sv[ncolor] = {};  // all zeros
+        cxtype_sv jamp_sv[ncolor] = {}; // all zeros
#endif
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
        constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
@@ -1144,7 +1144,7 @@
    assert( npagV % 2 == 0 );     // SANITY CHECK for mixed fptypes: two neppV-pages are merged to one 2*neppV-page
    const int npagV2 = npagV / 2; // loop on two SIMD pages (neppV events) at a time
#else
-    const int npagV2 = npagV;            // loop on one SIMD page (neppV events) at a time
+    const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time
#endif
#ifdef _OPENMP
    // OMP multithreading #575 (NB: tested only with gcc11 so far)
@@ -1207,7 +1207,7 @@
      // Running sum of partial amplitudes squared for event by event color selection (#402)
      // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
      fptype_sv jamp2_sv[nParity * ncolor] = {};
-      fptype_sv MEs_ighel[ncomb] = {};  // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
+      fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page)
#endif

--- SubProcesses/P1_gg_ttx/check_sa.cc	2026-01-14 14:07:20.498692821 +0000
+++ -	2026-01-14 14:07:26.465730473 +0000
@@ -135,7 +135,7 @@
#ifdef MGONGPUCPP_GPUIMPL
  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
#else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost;   // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
#endif
  // Bridge emulation mode (NB Bridge implies RamboHost!)
  bool bridge = false;
@@ -918,7 +918,7 @@
  wrkflwtxt += "/sse4";
#endif
#else
-  wrkflwtxt += "/????";                                           // no path to this statement
+  wrkflwtxt += "/????"; // no path to this statement
#endif
  // -- Has cxtype_v::operator[] bracket with non-const reference?
#if defined MGONGPU_CPPSIMD
@@ -1143,7 +1143,7 @@
#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX
             << "\"STD::COMPLEX\"," << std::endl
#else
-             << "\"???\"," << std::endl                           // no path to this statement...
+             << "\"???\"," << std::endl // no path to this statement...
#endif
             << "\"RanNumb memory layout\": "
             << "\"AOSOA[" << neppR << "]\""


roiser pushed a commit to roiser/madgraph4gpu-generated-processes that referenced this pull request Jan 14, 2026
@github-actions

❌ Code Format Check Failed

The generated code does not conform to clang-format rules.

Please update your code generator to produce properly formatted code.
8 files need formatting.

Note: The CI is set up to fail early if formatting issues are detected for any of the processes.
The report below is for process PROC_pp_ttx with backend avx2.

See attached patch for details:

--- src/mgOnGpuConfig.h	2026-01-14 14:41:45.916186263 +0000
+++ -	2026-01-14 14:41:47.564286723 +0000
@@ -180,7 +180,7 @@
#if defined MGONGPU_FPTYPE_DOUBLE
  typedef double fptype; // double precision (8 bytes, fp64)
#elif defined MGONGPU_FPTYPE_FLOAT
-  typedef float fptype;  // single precision (4 bytes, fp32)
+  typedef float fptype; // single precision (4 bytes, fp32)
#endif

  // Floating point type (for color algebra alone #537): fptype2

--- src/mgOnGpuVectors.h	2026-01-14 14:41:45.841187085 +0000
+++ -	2026-01-14 14:41:47.750103864 +0000
@@ -123,7 +123,7 @@
#if defined MGONGPU_FPTYPE_DOUBLE
  typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb
#elif defined MGONGPU_FPTYPE_FLOAT
-  typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) );                         // bbbb
+  typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb
#endif
#else // gcc
  typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) );

--- SubProcesses/P1_uux_ttx/color_sum.cc	2026-01-14 14:41:45.916186263 +0000
+++ -	2026-01-14 14:41:48.387612967 +0000
@@ -379,20 +379,20 @@

#ifdef MGONGPUCPP_GPUIMPL
  void
-  color_sum_gpu( fptype* ghelAllMEs,               // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
-                 const fptype* ghelAllJamps,       // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
-                 fptype2* ghelAllBlasTmp,          // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
-                 gpuBlasHandle_t* pBlasHandle,     // input: cuBLAS/hipBLAS handle
-                 gpuStream_t* ghelStreams,         // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
-                 const int nGoodHel,               // input: number of good helicities
-                 const int gpublocks,              // input: cuda gpublocks
-                 const int gputhreads )            // input: cuda gputhreads
+  color_sum_gpu( fptype* ghelAllMEs,           // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
+                 fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+                 const int nGoodHel,           // input: number of good helicities
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
  {
    const int nevt = gpublocks * gputhreads;
    // CASE 1: KERNEL
    if( !pBlasHandle )
    {
-      assert( ghelAllBlasTmp == nullptr );  // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+      assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
      // Loop over helicities
      for( int ighel = 0; ighel < nGoodHel; ighel++ )
      {
@@ -409,13 +409,13 @@
      assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
#else
      checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
-      // Reset the tmp buffer
+                                          // Reset the tmp buffer
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
#else
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
#endif
-      // Delegate the color sum to BLAS for 
+      // Delegate the color sum to BLAS for
      color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
#endif
    }

--- SubProcesses/P1_uux_ttx/CPPProcess.cc	2026-01-14 14:41:45.915186274 +0000
+++ -	2026-01-14 14:41:48.458200431 +0000
@@ -735,11 +735,11 @@
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                       fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators,      // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators,    // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
#endif
-                       bool* isGoodHel,            // output: isGoodHel[ncomb] - host array
-                       const int nevt )            // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+                       bool* isGoodHel, // output: isGoodHel[ncomb] - host array
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
  {
    //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
    //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
@@ -780,11 +780,11 @@
#endif
        }
        constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering
-        //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
+                                                 //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
        cxtype_sv jamp_sv[2 * ncolor] = {}; // all zeros
#else
-        cxtype_sv jamp_sv[ncolor] = {};  // all zeros
+        cxtype_sv jamp_sv[ncolor] = {}; // all zeros
#endif
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
        constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
@@ -1121,7 +1121,7 @@
    assert( npagV % 2 == 0 );     // SANITY CHECK for mixed fptypes: two neppV-pages are merged to one 2*neppV-page
    const int npagV2 = npagV / 2; // loop on two SIMD pages (neppV events) at a time
#else
-    const int npagV2 = npagV;            // loop on one SIMD page (neppV events) at a time
+    const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time
#endif
#ifdef _OPENMP
    // OMP multithreading #575 (NB: tested only with gcc11 so far)
@@ -1184,7 +1184,7 @@
      // Running sum of partial amplitudes squared for event by event color selection (#402)
      // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
      fptype_sv jamp2_sv[nParity * ncolor] = {};
-      fptype_sv MEs_ighel[ncomb] = {};  // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
+      fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page)
#endif

--- SubProcesses/P1_uux_ttx/check_sa.cc	2026-01-14 14:41:45.916186263 +0000
+++ -	2026-01-14 14:41:48.774384174 +0000
@@ -135,7 +135,7 @@
#ifdef MGONGPUCPP_GPUIMPL
  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
#else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost;   // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
#endif
  // Bridge emulation mode (NB Bridge implies RamboHost!)
  bool bridge = false;
@@ -918,7 +918,7 @@
  wrkflwtxt += "/sse4";
#endif
#else
-  wrkflwtxt += "/????";                                           // no path to this statement
+  wrkflwtxt += "/????"; // no path to this statement
#endif
  // -- Has cxtype_v::operator[] bracket with non-const reference?
#if defined MGONGPU_CPPSIMD
@@ -1143,7 +1143,7 @@
#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX
             << "\"STD::COMPLEX\"," << std::endl
#else
-             << "\"???\"," << std::endl                           // no path to this statement...
+             << "\"???\"," << std::endl // no path to this statement...
#endif
             << "\"RanNumb memory layout\": "
             << "\"AOSOA[" << neppR << "]\""

--- SubProcesses/P1_gg_ttx/color_sum.cc	2026-01-14 14:41:45.870186767 +0000
+++ -	2026-01-14 14:41:49.157288966 +0000
@@ -379,20 +379,20 @@

#ifdef MGONGPUCPP_GPUIMPL
  void
-  color_sum_gpu( fptype* ghelAllMEs,               // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
-                 const fptype* ghelAllJamps,       // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
-                 fptype2* ghelAllBlasTmp,          // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
-                 gpuBlasHandle_t* pBlasHandle,     // input: cuBLAS/hipBLAS handle
-                 gpuStream_t* ghelStreams,         // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
-                 const int nGoodHel,               // input: number of good helicities
-                 const int gpublocks,              // input: cuda gpublocks
-                 const int gputhreads )            // input: cuda gputhreads
+  color_sum_gpu( fptype* ghelAllMEs,           // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
+                 fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+                 const int nGoodHel,           // input: number of good helicities
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
  {
    const int nevt = gpublocks * gputhreads;
    // CASE 1: KERNEL
    if( !pBlasHandle )
    {
-      assert( ghelAllBlasTmp == nullptr );  // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+      assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
      // Loop over helicities
      for( int ighel = 0; ighel < nGoodHel; ighel++ )
      {
@@ -409,13 +409,13 @@
      assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
#else
      checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
-      // Reset the tmp buffer
+                                          // Reset the tmp buffer
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
#else
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
#endif
-      // Delegate the color sum to BLAS for 
+      // Delegate the color sum to BLAS for
      color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
#endif
    }

--- SubProcesses/P1_gg_ttx/CPPProcess.cc	2026-01-14 14:41:45.869186778 +0000
+++ -	2026-01-14 14:41:49.229550576 +0000
@@ -758,11 +758,11 @@
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                       fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators,      // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators,    // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
#endif
-                       bool* isGoodHel,            // output: isGoodHel[ncomb] - host array
-                       const int nevt )            // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+                       bool* isGoodHel, // output: isGoodHel[ncomb] - host array
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
  {
    //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
    //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
@@ -803,11 +803,11 @@
#endif
        }
        constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering
-        //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
+                                                 //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
        cxtype_sv jamp_sv[2 * ncolor] = {}; // all zeros
#else
-        cxtype_sv jamp_sv[ncolor] = {};  // all zeros
+        cxtype_sv jamp_sv[ncolor] = {}; // all zeros
#endif
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
        constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
@@ -1144,7 +1144,7 @@
    assert( npagV % 2 == 0 );     // SANITY CHECK for mixed fptypes: two neppV-pages are merged to one 2*neppV-page
    const int npagV2 = npagV / 2; // loop on two SIMD pages (neppV events) at a time
#else
-    const int npagV2 = npagV;            // loop on one SIMD page (neppV events) at a time
+    const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time
#endif
#ifdef _OPENMP
    // OMP multithreading #575 (NB: tested only with gcc11 so far)
@@ -1207,7 +1207,7 @@
      // Running sum of partial amplitudes squared for event by event color selection (#402)
      // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
      fptype_sv jamp2_sv[nParity * ncolor] = {};
-      fptype_sv MEs_ighel[ncomb] = {};  // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
+      fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page)
#endif

--- SubProcesses/P1_gg_ttx/check_sa.cc	2026-01-14 14:41:45.869186778 +0000
+++ -	2026-01-14 14:41:49.545385979 +0000
@@ -135,7 +135,7 @@
#ifdef MGONGPUCPP_GPUIMPL
  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
#else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost;   // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
#endif
  // Bridge emulation mode (NB Bridge implies RamboHost!)
  bool bridge = false;
@@ -918,7 +918,7 @@
  wrkflwtxt += "/sse4";
#endif
#else
-  wrkflwtxt += "/????";                                           // no path to this statement
+  wrkflwtxt += "/????"; // no path to this statement
#endif
  // -- Has cxtype_v::operator[] bracket with non-const reference?
#if defined MGONGPU_CPPSIMD
@@ -1143,7 +1143,7 @@
#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX
             << "\"STD::COMPLEX\"," << std::endl
#else
-             << "\"???\"," << std::endl                           // no path to this statement...
+             << "\"???\"," << std::endl // no path to this statement...
#endif
             << "\"RanNumb memory layout\": "
             << "\"AOSOA[" << neppR << "]\""


@github-actions

❌ Code Format Check Failed

The generated code does not conform to clang-format rules.

Please update your code generator to produce properly formatted code.
8 files need formatting.

Note: The CI is set up to fail early if formatting issues are detected for any of the processes.
The report below is for process PROC_pp_ttx with backend avx512.

See attached patch for details:

--- src/mgOnGpuConfig.h	2026-01-14 14:41:46.457628633 +0000
+++ -	2026-01-14 14:41:48.681624263 +0000
@@ -180,7 +180,7 @@
#if defined MGONGPU_FPTYPE_DOUBLE
  typedef double fptype; // double precision (8 bytes, fp64)
#elif defined MGONGPU_FPTYPE_FLOAT
-  typedef float fptype;  // single precision (4 bytes, fp32)
+  typedef float fptype; // single precision (4 bytes, fp32)
#endif

  // Floating point type (for color algebra alone #537): fptype2

--- src/mgOnGpuVectors.h	2026-01-14 14:41:46.383628628 +0000
+++ -	2026-01-14 14:41:48.869405649 +0000
@@ -123,7 +123,7 @@
#if defined MGONGPU_FPTYPE_DOUBLE
  typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb
#elif defined MGONGPU_FPTYPE_FLOAT
-  typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) );                         // bbbb
+  typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb
#endif
#else // gcc
  typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) );

--- SubProcesses/P1_uux_ttx/color_sum.cc	2026-01-14 14:41:46.458628634 +0000
+++ -	2026-01-14 14:41:49.557764425 +0000
@@ -379,20 +379,20 @@

#ifdef MGONGPUCPP_GPUIMPL
  void
-  color_sum_gpu( fptype* ghelAllMEs,               // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
-                 const fptype* ghelAllJamps,       // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
-                 fptype2* ghelAllBlasTmp,          // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
-                 gpuBlasHandle_t* pBlasHandle,     // input: cuBLAS/hipBLAS handle
-                 gpuStream_t* ghelStreams,         // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
-                 const int nGoodHel,               // input: number of good helicities
-                 const int gpublocks,              // input: cuda gpublocks
-                 const int gputhreads )            // input: cuda gputhreads
+  color_sum_gpu( fptype* ghelAllMEs,           // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
+                 fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+                 const int nGoodHel,           // input: number of good helicities
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
  {
    const int nevt = gpublocks * gputhreads;
    // CASE 1: KERNEL
    if( !pBlasHandle )
    {
-      assert( ghelAllBlasTmp == nullptr );  // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+      assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
      // Loop over helicities
      for( int ighel = 0; ighel < nGoodHel; ighel++ )
      {
@@ -409,13 +409,13 @@
      assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
#else
      checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
-      // Reset the tmp buffer
+                                          // Reset the tmp buffer
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
#else
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
#endif
-      // Delegate the color sum to BLAS for 
+      // Delegate the color sum to BLAS for
      color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
#endif
    }

--- SubProcesses/P1_uux_ttx/CPPProcess.cc	2026-01-14 14:41:46.456628633 +0000
+++ -	2026-01-14 14:41:49.632977531 +0000
@@ -735,11 +735,11 @@
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                       fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators,      // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators,    // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
#endif
-                       bool* isGoodHel,            // output: isGoodHel[ncomb] - host array
-                       const int nevt )            // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+                       bool* isGoodHel, // output: isGoodHel[ncomb] - host array
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
  {
    //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
    //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
@@ -780,11 +780,11 @@
#endif
        }
        constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering
-        //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
+                                                 //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
        cxtype_sv jamp_sv[2 * ncolor] = {}; // all zeros
#else
-        cxtype_sv jamp_sv[ncolor] = {};  // all zeros
+        cxtype_sv jamp_sv[ncolor] = {}; // all zeros
#endif
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
        constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
@@ -1121,7 +1121,7 @@
    assert( npagV % 2 == 0 );     // SANITY CHECK for mixed fptypes: two neppV-pages are merged to one 2*neppV-page
    const int npagV2 = npagV / 2; // loop on two SIMD pages (neppV events) at a time
#else
-    const int npagV2 = npagV;            // loop on one SIMD page (neppV events) at a time
+    const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time
#endif
#ifdef _OPENMP
    // OMP multithreading #575 (NB: tested only with gcc11 so far)
@@ -1184,7 +1184,7 @@
      // Running sum of partial amplitudes squared for event by event color selection (#402)
      // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
      fptype_sv jamp2_sv[nParity * ncolor] = {};
-      fptype_sv MEs_ighel[ncomb] = {};  // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
+      fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page)
#endif

--- SubProcesses/P1_uux_ttx/check_sa.cc	2026-01-14 14:41:46.457628633 +0000
+++ -	2026-01-14 14:41:49.952676473 +0000
@@ -135,7 +135,7 @@
#ifdef MGONGPUCPP_GPUIMPL
  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
#else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost;   // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
#endif
  // Bridge emulation mode (NB Bridge implies RamboHost!)
  bool bridge = false;
@@ -918,7 +918,7 @@
  wrkflwtxt += "/sse4";
#endif
#else
-  wrkflwtxt += "/????";                                           // no path to this statement
+  wrkflwtxt += "/????"; // no path to this statement
#endif
  // -- Has cxtype_v::operator[] bracket with non-const reference?
#if defined MGONGPU_CPPSIMD
@@ -1143,7 +1143,7 @@
#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX
             << "\"STD::COMPLEX\"," << std::endl
#else
-             << "\"???\"," << std::endl                           // no path to this statement...
+             << "\"???\"," << std::endl // no path to this statement...
#endif
             << "\"RanNumb memory layout\": "
             << "\"AOSOA[" << neppR << "]\""

--- SubProcesses/P1_gg_ttx/color_sum.cc	2026-01-14 14:41:46.412628630 +0000
+++ -	2026-01-14 14:41:50.343310435 +0000
@@ -379,20 +379,20 @@

#ifdef MGONGPUCPP_GPUIMPL
  void
-  color_sum_gpu( fptype* ghelAllMEs,               // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
-                 const fptype* ghelAllJamps,       // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
-                 fptype2* ghelAllBlasTmp,          // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
-                 gpuBlasHandle_t* pBlasHandle,     // input: cuBLAS/hipBLAS handle
-                 gpuStream_t* ghelStreams,         // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
-                 const int nGoodHel,               // input: number of good helicities
-                 const int gpublocks,              // input: cuda gpublocks
-                 const int gputhreads )            // input: cuda gputhreads
+  color_sum_gpu( fptype* ghelAllMEs,           // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
+                 fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+                 const int nGoodHel,           // input: number of good helicities
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
  {
    const int nevt = gpublocks * gputhreads;
    // CASE 1: KERNEL
    if( !pBlasHandle )
    {
-      assert( ghelAllBlasTmp == nullptr );  // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+      assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
      // Loop over helicities
      for( int ighel = 0; ighel < nGoodHel; ighel++ )
      {
@@ -409,13 +409,13 @@
      assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
#else
      checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
-      // Reset the tmp buffer
+                                          // Reset the tmp buffer
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
#else
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
#endif
-      // Delegate the color sum to BLAS for 
+      // Delegate the color sum to BLAS for
      color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
#endif
    }

--- SubProcesses/P1_gg_ttx/CPPProcess.cc	2026-01-14 14:41:46.411628630 +0000
+++ -	2026-01-14 14:41:50.417472321 +0000
@@ -758,11 +758,11 @@
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                       fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators,      // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators,    // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
#endif
-                       bool* isGoodHel,            // output: isGoodHel[ncomb] - host array
-                       const int nevt )            // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+                       bool* isGoodHel, // output: isGoodHel[ncomb] - host array
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
  {
    //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
    //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
@@ -803,11 +803,11 @@
#endif
        }
        constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering
-        //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
+                                                 //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
        cxtype_sv jamp_sv[2 * ncolor] = {}; // all zeros
#else
-        cxtype_sv jamp_sv[ncolor] = {};  // all zeros
+        cxtype_sv jamp_sv[ncolor] = {}; // all zeros
#endif
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
        constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
@@ -1144,7 +1144,7 @@
    assert( npagV % 2 == 0 );     // SANITY CHECK for mixed fptypes: two neppV-pages are merged to one 2*neppV-page
    const int npagV2 = npagV / 2; // loop on two SIMD pages (neppV events) at a time
#else
-    const int npagV2 = npagV;            // loop on one SIMD page (neppV events) at a time
+    const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time
#endif
#ifdef _OPENMP
    // OMP multithreading #575 (NB: tested only with gcc11 so far)
@@ -1207,7 +1207,7 @@
      // Running sum of partial amplitudes squared for event by event color selection (#402)
      // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
      fptype_sv jamp2_sv[nParity * ncolor] = {};
-      fptype_sv MEs_ighel[ncomb] = {};  // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
+      fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page)
#endif

--- SubProcesses/P1_gg_ttx/check_sa.cc	2026-01-14 14:41:46.411628630 +0000
+++ -	2026-01-14 14:41:50.740263303 +0000
@@ -135,7 +135,7 @@
#ifdef MGONGPUCPP_GPUIMPL
  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
#else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost;   // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
#endif
  // Bridge emulation mode (NB Bridge implies RamboHost!)
  bool bridge = false;
@@ -918,7 +918,7 @@
  wrkflwtxt += "/sse4";
#endif
#else
-  wrkflwtxt += "/????";                                           // no path to this statement
+  wrkflwtxt += "/????"; // no path to this statement
#endif
  // -- Has cxtype_v::operator[] bracket with non-const reference?
#if defined MGONGPU_CPPSIMD
@@ -1143,7 +1143,7 @@
#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX
             << "\"STD::COMPLEX\"," << std::endl
#else
-             << "\"???\"," << std::endl                           // no path to this statement...
+             << "\"???\"," << std::endl // no path to this statement...
#endif
             << "\"RanNumb memory layout\": "
             << "\"AOSOA[" << neppR << "]\""


@github-actions

❌ Code Format Check Failed

The generated code does not conform to clang-format rules.

Please update your code generator to produce properly formatted code.
14 files need formatting.

Note: The CI is set up to fail early if formatting issues are detected for any of the processes.
The report below is for process PROC_pp_ttxj with backend avx2.

See attached patch for details:

--- src/mgOnGpuConfig.h	2026-01-14 14:41:45.641644678 +0000
+++ -	2026-01-14 14:41:47.716162589 +0000
@@ -180,7 +180,7 @@
#if defined MGONGPU_FPTYPE_DOUBLE
  typedef double fptype; // double precision (8 bytes, fp64)
#elif defined MGONGPU_FPTYPE_FLOAT
-  typedef float fptype;  // single precision (4 bytes, fp32)
+  typedef float fptype; // single precision (4 bytes, fp32)
#endif

  // Floating point type (for color algebra alone #537): fptype2

--- src/mgOnGpuVectors.h	2026-01-14 14:41:45.390640573 +0000
+++ -	2026-01-14 14:41:47.909269564 +0000
@@ -123,7 +123,7 @@
#if defined MGONGPU_FPTYPE_DOUBLE
  typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb
#elif defined MGONGPU_FPTYPE_FLOAT
-  typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) );                         // bbbb
+  typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb
#endif
#else // gcc
  typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) );

--- SubProcesses/P1_gu_ttxu/color_sum.cc	2026-01-14 14:41:45.539643010 +0000
+++ -	2026-01-14 14:41:48.461525595 +0000
@@ -381,20 +381,20 @@

#ifdef MGONGPUCPP_GPUIMPL
  void
-  color_sum_gpu( fptype* ghelAllMEs,               // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
-                 const fptype* ghelAllJamps,       // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
-                 fptype2* ghelAllBlasTmp,          // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
-                 gpuBlasHandle_t* pBlasHandle,     // input: cuBLAS/hipBLAS handle
-                 gpuStream_t* ghelStreams,         // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
-                 const int nGoodHel,               // input: number of good helicities
-                 const int gpublocks,              // input: cuda gpublocks
-                 const int gputhreads )            // input: cuda gputhreads
+  color_sum_gpu( fptype* ghelAllMEs,           // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
+                 fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+                 const int nGoodHel,           // input: number of good helicities
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
  {
    const int nevt = gpublocks * gputhreads;
    // CASE 1: KERNEL
    if( !pBlasHandle )
    {
-      assert( ghelAllBlasTmp == nullptr );  // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+      assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
      // Loop over helicities
      for( int ighel = 0; ighel < nGoodHel; ighel++ )
      {
@@ -411,13 +411,13 @@
      assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
#else
      checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
-      // Reset the tmp buffer
+                                          // Reset the tmp buffer
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
#else
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
#endif
-      // Delegate the color sum to BLAS for 
+      // Delegate the color sum to BLAS for
      color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
#endif
    }

--- SubProcesses/P1_gu_ttxu/CPPProcess.cc	2026-01-14 14:41:45.538642994 +0000
+++ -	2026-01-14 14:41:48.539176086 +0000
@@ -813,11 +813,11 @@
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                       fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators,      // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators,    // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
#endif
-                       bool* isGoodHel,            // output: isGoodHel[ncomb] - host array
-                       const int nevt )            // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+                       bool* isGoodHel, // output: isGoodHel[ncomb] - host array
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
  {
    //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
    //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
@@ -858,11 +858,11 @@
#endif
        }
        constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering
-        //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
+                                                 //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
        cxtype_sv jamp_sv[2 * ncolor] = {}; // all zeros
#else
-        cxtype_sv jamp_sv[ncolor] = {};  // all zeros
+        cxtype_sv jamp_sv[ncolor] = {}; // all zeros
#endif
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
        constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
@@ -1199,7 +1199,7 @@
    assert( npagV % 2 == 0 );     // SANITY CHECK for mixed fptypes: two neppV-pages are merged to one 2*neppV-page
    const int npagV2 = npagV / 2; // loop on two SIMD pages (neppV events) at a time
#else
-    const int npagV2 = npagV;            // loop on one SIMD page (neppV events) at a time
+    const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time
#endif
#ifdef _OPENMP
    // OMP multithreading #575 (NB: tested only with gcc11 so far)
@@ -1262,7 +1262,7 @@
      // Running sum of partial amplitudes squared for event by event color selection (#402)
      // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
      fptype_sv jamp2_sv[nParity * ncolor] = {};
-      fptype_sv MEs_ighel[ncomb] = {};  // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
+      fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page)
#endif

--- SubProcesses/P1_gu_ttxu/check_sa.cc	2026-01-14 14:41:45.538642994 +0000
+++ -	2026-01-14 14:41:48.863849626 +0000
@@ -135,7 +135,7 @@
#ifdef MGONGPUCPP_GPUIMPL
  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
#else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost;   // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
#endif
  // Bridge emulation mode (NB Bridge implies RamboHost!)
  bool bridge = false;
@@ -918,7 +918,7 @@
  wrkflwtxt += "/sse4";
#endif
#else
-  wrkflwtxt += "/????";                                           // no path to this statement
+  wrkflwtxt += "/????"; // no path to this statement
#endif
  // -- Has cxtype_v::operator[] bracket with non-const reference?
#if defined MGONGPU_CPPSIMD
@@ -1143,7 +1143,7 @@
#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX
             << "\"STD::COMPLEX\"," << std::endl
#else
-             << "\"???\"," << std::endl                           // no path to this statement...
+             << "\"???\"," << std::endl // no path to this statement...
#endif
             << "\"RanNumb memory layout\": "
             << "\"AOSOA[" << neppR << "]\""

--- SubProcesses/P1_uux_ttxg/color_sum.cc	2026-01-14 14:41:45.642644694 +0000
+++ -	2026-01-14 14:41:49.285530406 +0000
@@ -381,20 +381,20 @@

#ifdef MGONGPUCPP_GPUIMPL
  void
-  color_sum_gpu( fptype* ghelAllMEs,               // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
-                 const fptype* ghelAllJamps,       // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
-                 fptype2* ghelAllBlasTmp,          // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
-                 gpuBlasHandle_t* pBlasHandle,     // input: cuBLAS/hipBLAS handle
-                 gpuStream_t* ghelStreams,         // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
-                 const int nGoodHel,               // input: number of good helicities
-                 const int gpublocks,              // input: cuda gpublocks
-                 const int gputhreads )            // input: cuda gputhreads
+  color_sum_gpu( fptype* ghelAllMEs,           // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
+                 fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+                 const int nGoodHel,           // input: number of good helicities
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
  {
    const int nevt = gpublocks * gputhreads;
    // CASE 1: KERNEL
    if( !pBlasHandle )
    {
-      assert( ghelAllBlasTmp == nullptr );  // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+      assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
      // Loop over helicities
      for( int ighel = 0; ighel < nGoodHel; ighel++ )
      {
@@ -411,13 +411,13 @@
      assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
#else
      checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
-      // Reset the tmp buffer
+                                          // Reset the tmp buffer
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
#else
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
#endif
-      // Delegate the color sum to BLAS for 
+      // Delegate the color sum to BLAS for
      color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
#endif
    }

--- SubProcesses/P1_uux_ttxg/CPPProcess.cc	2026-01-14 14:41:45.641644678 +0000
+++ -	2026-01-14 14:41:49.365265783 +0000
@@ -813,11 +813,11 @@
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                       fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators,      // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators,    // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
#endif
-                       bool* isGoodHel,            // output: isGoodHel[ncomb] - host array
-                       const int nevt )            // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+                       bool* isGoodHel, // output: isGoodHel[ncomb] - host array
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
  {
    //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
    //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
@@ -858,11 +858,11 @@
#endif
        }
        constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering
-        //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
+                                                 //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
        cxtype_sv jamp_sv[2 * ncolor] = {}; // all zeros
#else
-        cxtype_sv jamp_sv[ncolor] = {};  // all zeros
+        cxtype_sv jamp_sv[ncolor] = {}; // all zeros
#endif
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
        constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
@@ -1199,7 +1199,7 @@
    assert( npagV % 2 == 0 );     // SANITY CHECK for mixed fptypes: two neppV-pages are merged to one 2*neppV-page
    const int npagV2 = npagV / 2; // loop on two SIMD pages (neppV events) at a time
#else
-    const int npagV2 = npagV;            // loop on one SIMD page (neppV events) at a time
+    const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time
#endif
#ifdef _OPENMP
    // OMP multithreading #575 (NB: tested only with gcc11 so far)
@@ -1262,7 +1262,7 @@
      // Running sum of partial amplitudes squared for event by event color selection (#402)
      // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
      fptype_sv jamp2_sv[nParity * ncolor] = {};
-      fptype_sv MEs_ighel[ncomb] = {};  // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
+      fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page)
#endif

--- SubProcesses/P1_uux_ttxg/check_sa.cc	2026-01-14 14:41:45.641644678 +0000
+++ -	2026-01-14 14:41:49.687296390 +0000
@@ -135,7 +135,7 @@
#ifdef MGONGPUCPP_GPUIMPL
  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
#else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost;   // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
#endif
  // Bridge emulation mode (NB Bridge implies RamboHost!)
  bool bridge = false;
@@ -918,7 +918,7 @@
  wrkflwtxt += "/sse4";
#endif
#else
-  wrkflwtxt += "/????";                                           // no path to this statement
+  wrkflwtxt += "/????"; // no path to this statement
#endif
  // -- Has cxtype_v::operator[] bracket with non-const reference?
#if defined MGONGPU_CPPSIMD
@@ -1143,7 +1143,7 @@
#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX
             << "\"STD::COMPLEX\"," << std::endl
#else
-             << "\"???\"," << std::endl                           // no path to this statement...
+             << "\"???\"," << std::endl // no path to this statement...
#endif
             << "\"RanNumb memory layout\": "
             << "\"AOSOA[" << neppR << "]\""

--- SubProcesses/P1_gux_ttxux/color_sum.cc	2026-01-14 14:41:45.591643860 +0000
+++ -	2026-01-14 14:41:52.745951450 +0000
@@ -381,20 +381,20 @@

#ifdef MGONGPUCPP_GPUIMPL
  void
-  color_sum_gpu( fptype* ghelAllMEs,               // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
-                 const fptype* ghelAllJamps,       // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
-                 fptype2* ghelAllBlasTmp,          // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
-                 gpuBlasHandle_t* pBlasHandle,     // input: cuBLAS/hipBLAS handle
-                 gpuStream_t* ghelStreams,         // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
-                 const int nGoodHel,               // input: number of good helicities
-                 const int gpublocks,              // input: cuda gpublocks
-                 const int gputhreads )            // input: cuda gputhreads
+  color_sum_gpu( fptype* ghelAllMEs,           // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
+                 fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+                 const int nGoodHel,           // input: number of good helicities
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
  {
    const int nevt = gpublocks * gputhreads;
    // CASE 1: KERNEL
    if( !pBlasHandle )
    {
-      assert( ghelAllBlasTmp == nullptr );  // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+      assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
      // Loop over helicities
      for( int ighel = 0; ighel < nGoodHel; ighel++ )
      {
@@ -411,13 +411,13 @@
      assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
#else
      checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
-      // Reset the tmp buffer
+                                          // Reset the tmp buffer
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
#else
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
#endif
-      // Delegate the color sum to BLAS for 
+      // Delegate the color sum to BLAS for
      color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
#endif
    }

--- SubProcesses/P1_gux_ttxux/CPPProcess.cc	2026-01-14 14:41:45.590643844 +0000
+++ -	2026-01-14 14:41:52.823911749 +0000
@@ -813,11 +813,11 @@
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                       fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators,      // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators,    // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
#endif
-                       bool* isGoodHel,            // output: isGoodHel[ncomb] - host array
-                       const int nevt )            // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+                       bool* isGoodHel, // output: isGoodHel[ncomb] - host array
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
  {
    //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
    //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
@@ -858,11 +858,11 @@
#endif
        }
        constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering
-        //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
+                                                 //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
        cxtype_sv jamp_sv[2 * ncolor] = {}; // all zeros
#else
-        cxtype_sv jamp_sv[ncolor] = {};  // all zeros
+        cxtype_sv jamp_sv[ncolor] = {}; // all zeros
#endif
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
        constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
@@ -1199,7 +1199,7 @@
    assert( npagV % 2 == 0 );     // SANITY CHECK for mixed fptypes: two neppV-pages are merged to one 2*neppV-page
    const int npagV2 = npagV / 2; // loop on two SIMD pages (neppV events) at a time
#else
-    const int npagV2 = npagV;            // loop on one SIMD page (neppV events) at a time
+    const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time
#endif
#ifdef _OPENMP
    // OMP multithreading #575 (NB: tested only with gcc11 so far)
@@ -1262,7 +1262,7 @@
      // Running sum of partial amplitudes squared for event by event color selection (#402)
      // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
      fptype_sv jamp2_sv[nParity * ncolor] = {};
-      fptype_sv MEs_ighel[ncomb] = {};  // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
+      fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page)
#endif

--- SubProcesses/P1_gux_ttxux/check_sa.cc	2026-01-14 14:41:45.590643844 +0000
+++ -	2026-01-14 14:41:53.148642413 +0000
@@ -135,7 +135,7 @@
#ifdef MGONGPUCPP_GPUIMPL
  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
#else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost;   // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
#endif
  // Bridge emulation mode (NB Bridge implies RamboHost!)
  bool bridge = false;
@@ -918,7 +918,7 @@
  wrkflwtxt += "/sse4";
#endif
#else
-  wrkflwtxt += "/????";                                           // no path to this statement
+  wrkflwtxt += "/????"; // no path to this statement
#endif
  // -- Has cxtype_v::operator[] bracket with non-const reference?
#if defined MGONGPU_CPPSIMD
@@ -1143,7 +1143,7 @@
#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX
             << "\"STD::COMPLEX\"," << std::endl
#else
-             << "\"???\"," << std::endl                           // no path to this statement...
+             << "\"???\"," << std::endl // no path to this statement...
#endif
             << "\"RanNumb memory layout\": "
             << "\"AOSOA[" << neppR << "]\""

--- SubProcesses/P1_gg_ttxg/color_sum.cc	2026-01-14 14:41:45.472641914 +0000
+++ -	2026-01-14 14:41:53.603699141 +0000
@@ -383,20 +383,20 @@

#ifdef MGONGPUCPP_GPUIMPL
  void
-  color_sum_gpu( fptype* ghelAllMEs,               // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
-                 const fptype* ghelAllJamps,       // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
-                 fptype2* ghelAllBlasTmp,          // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
-                 gpuBlasHandle_t* pBlasHandle,     // input: cuBLAS/hipBLAS handle
-                 gpuStream_t* ghelStreams,         // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
-                 const int nGoodHel,               // input: number of good helicities
-                 const int gpublocks,              // input: cuda gpublocks
-                 const int gputhreads )            // input: cuda gputhreads
+  color_sum_gpu( fptype* ghelAllMEs,           // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
+                 fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+                 const int nGoodHel,           // input: number of good helicities
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
  {
    const int nevt = gpublocks * gputhreads;
    // CASE 1: KERNEL
    if( !pBlasHandle )
    {
-      assert( ghelAllBlasTmp == nullptr );  // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+      assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
      // Loop over helicities
      for( int ighel = 0; ighel < nGoodHel; ighel++ )
      {
@@ -413,13 +413,13 @@
      assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
#else
      checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
-      // Reset the tmp buffer
+                                          // Reset the tmp buffer
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
#else
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
#endif
-      // Delegate the color sum to BLAS for 
+      // Delegate the color sum to BLAS for
      color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
#endif
    }

--- SubProcesses/P1_gg_ttxg/CPPProcess.cc	2026-01-14 14:41:45.471641898 +0000
+++ -	2026-01-14 14:41:53.689442459 +0000
@@ -975,11 +975,11 @@
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                       fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators,      // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators,    // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
#endif
-                       bool* isGoodHel,            // output: isGoodHel[ncomb] - host array
-                       const int nevt )            // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+                       bool* isGoodHel, // output: isGoodHel[ncomb] - host array
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
  {
    //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
    //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
@@ -1020,11 +1020,11 @@
#endif
        }
        constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering
-        //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
+                                                 //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
        cxtype_sv jamp_sv[2 * ncolor] = {}; // all zeros
#else
-        cxtype_sv jamp_sv[ncolor] = {};  // all zeros
+        cxtype_sv jamp_sv[ncolor] = {}; // all zeros
#endif
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
        constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
@@ -1361,7 +1361,7 @@
    assert( npagV % 2 == 0 );     // SANITY CHECK for mixed fptypes: two neppV-pages are merged to one 2*neppV-page
    const int npagV2 = npagV / 2; // loop on two SIMD pages (neppV events) at a time
#else
-    const int npagV2 = npagV;            // loop on one SIMD page (neppV events) at a time
+    const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time
#endif
#ifdef _OPENMP
    // OMP multithreading #575 (NB: tested only with gcc11 so far)
@@ -1424,7 +1424,7 @@
      // Running sum of partial amplitudes squared for event by event color selection (#402)
      // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
      fptype_sv jamp2_sv[nParity * ncolor] = {};
-      fptype_sv MEs_ighel[ncomb] = {};  // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
+      fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page)
#endif

--- SubProcesses/P1_gg_ttxg/check_sa.cc	2026-01-14 14:41:45.471641898 +0000
+++ -	2026-01-14 14:41:54.022920505 +0000
@@ -135,7 +135,7 @@
#ifdef MGONGPUCPP_GPUIMPL
  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
#else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost;   // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
#endif
  // Bridge emulation mode (NB Bridge implies RamboHost!)
  bool bridge = false;
@@ -918,7 +918,7 @@
  wrkflwtxt += "/sse4";
#endif
#else
-  wrkflwtxt += "/????";                                           // no path to this statement
+  wrkflwtxt += "/????"; // no path to this statement
#endif
  // -- Has cxtype_v::operator[] bracket with non-const reference?
#if defined MGONGPU_CPPSIMD
@@ -1143,7 +1143,7 @@
#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX
             << "\"STD::COMPLEX\"," << std::endl
#else
-             << "\"???\"," << std::endl                           // no path to this statement...
+             << "\"???\"," << std::endl // no path to this statement...
#endif
             << "\"RanNumb memory layout\": "
             << "\"AOSOA[" << neppR << "]\""

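For reference, the patch above consistently applies two trailing-comment rules: comments attached to consecutive lines (such as a function parameter list) are aligned one space past the longest line in the block, and an isolated trailing comment is preceded by exactly one space. The sketch below is purely illustrative (the function name and parameters are hypothetical, not repository code) and only shows the formatted style the code generator would need to emit; it presumably corresponds to the AlignTrailingComments and SpacesBeforeTrailingComments settings in the repository's clang-format configuration.

// Hypothetical illustration of the trailing-comment style enforced in the patch above.
void color_sum_example( double* allMEs,       // output: matrix elements per event (comments in this block are column-aligned)
                        const double* jamps,  // input: color amplitudes per event
                        const int nevt )      // input: number of events
{
  const int npages = nevt; // an isolated trailing comment gets exactly one space before "//"
  for( int i = 0; i < npages; i++ ) allMEs[i] = jamps[i]; // same rule applies here
}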

@github-actions

❌ Code Format Check Failed

The generated code does not conform to clang-format rules.

Please update your code generator to produce properly formatted code.
8 files need formatting.

Note: The CI is set up to fail early if formatting issues are detected for any of the processes.
The report below is for process PROC_pp_ttx with backend cuda.

See attached patch for details:

--- src/mgOnGpuConfig.h	2026-01-14 14:41:46.275649385 +0000
+++ -	2026-01-14 14:41:49.806868900 +0000
@@ -180,7 +180,7 @@
#if defined MGONGPU_FPTYPE_DOUBLE
  typedef double fptype; // double precision (8 bytes, fp64)
#elif defined MGONGPU_FPTYPE_FLOAT
-  typedef float fptype;  // single precision (4 bytes, fp32)
+  typedef float fptype; // single precision (4 bytes, fp32)
#endif

  // Floating point type (for color algebra alone #537): fptype2

--- src/mgOnGpuVectors.h	2026-01-14 14:41:46.199649129 +0000
+++ -	2026-01-14 14:41:49.999500392 +0000
@@ -123,7 +123,7 @@
#if defined MGONGPU_FPTYPE_DOUBLE
  typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb
#elif defined MGONGPU_FPTYPE_FLOAT
-  typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) );                         // bbbb
+  typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb
#endif
#else // gcc
  typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) );

--- SubProcesses/P1_uux_ttx/color_sum.cc	2026-01-14 14:41:46.275649385 +0000
+++ -	2026-01-14 14:41:50.716907218 +0000
@@ -379,20 +379,20 @@

#ifdef MGONGPUCPP_GPUIMPL
  void
-  color_sum_gpu( fptype* ghelAllMEs,               // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
-                 const fptype* ghelAllJamps,       // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
-                 fptype2* ghelAllBlasTmp,          // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
-                 gpuBlasHandle_t* pBlasHandle,     // input: cuBLAS/hipBLAS handle
-                 gpuStream_t* ghelStreams,         // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
-                 const int nGoodHel,               // input: number of good helicities
-                 const int gpublocks,              // input: cuda gpublocks
-                 const int gputhreads )            // input: cuda gputhreads
+  color_sum_gpu( fptype* ghelAllMEs,           // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
+                 fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+                 const int nGoodHel,           // input: number of good helicities
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
  {
    const int nevt = gpublocks * gputhreads;
    // CASE 1: KERNEL
    if( !pBlasHandle )
    {
-      assert( ghelAllBlasTmp == nullptr );  // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+      assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
      // Loop over helicities
      for( int ighel = 0; ighel < nGoodHel; ighel++ )
      {
@@ -409,13 +409,13 @@
      assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
#else
      checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
-      // Reset the tmp buffer
+                                          // Reset the tmp buffer
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
#else
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
#endif
-      // Delegate the color sum to BLAS for 
+      // Delegate the color sum to BLAS for
      color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
#endif
    }

--- SubProcesses/P1_uux_ttx/CPPProcess.cc	2026-01-14 14:41:46.275649385 +0000
+++ -	2026-01-14 14:41:50.792411347 +0000
@@ -735,11 +735,11 @@
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                       fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators,      // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators,    // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
#endif
-                       bool* isGoodHel,            // output: isGoodHel[ncomb] - host array
-                       const int nevt )            // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+                       bool* isGoodHel, // output: isGoodHel[ncomb] - host array
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
  {
    //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
    //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
@@ -780,11 +780,11 @@
#endif
        }
        constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering
-        //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
+                                                 //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
        cxtype_sv jamp_sv[2 * ncolor] = {}; // all zeros
#else
-        cxtype_sv jamp_sv[ncolor] = {};  // all zeros
+        cxtype_sv jamp_sv[ncolor] = {}; // all zeros
#endif
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
        constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
@@ -1121,7 +1121,7 @@
    assert( npagV % 2 == 0 );     // SANITY CHECK for mixed fptypes: two neppV-pages are merged to one 2*neppV-page
    const int npagV2 = npagV / 2; // loop on two SIMD pages (neppV events) at a time
#else
-    const int npagV2 = npagV;            // loop on one SIMD page (neppV events) at a time
+    const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time
#endif
#ifdef _OPENMP
    // OMP multithreading #575 (NB: tested only with gcc11 so far)
@@ -1184,7 +1184,7 @@
      // Running sum of partial amplitudes squared for event by event color selection (#402)
      // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
      fptype_sv jamp2_sv[nParity * ncolor] = {};
-      fptype_sv MEs_ighel[ncomb] = {};  // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
+      fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page)
#endif

--- SubProcesses/P1_uux_ttx/check_sa.cc	2026-01-14 14:41:46.275649385 +0000
+++ -	2026-01-14 14:41:51.115680348 +0000
@@ -135,7 +135,7 @@
#ifdef MGONGPUCPP_GPUIMPL
  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
#else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost;   // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
#endif
  // Bridge emulation mode (NB Bridge implies RamboHost!)
  bool bridge = false;
@@ -918,7 +918,7 @@
  wrkflwtxt += "/sse4";
#endif
#else
-  wrkflwtxt += "/????";                                           // no path to this statement
+  wrkflwtxt += "/????"; // no path to this statement
#endif
  // -- Has cxtype_v::operator[] bracket with non-const reference?
#if defined MGONGPU_CPPSIMD
@@ -1143,7 +1143,7 @@
#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX
             << "\"STD::COMPLEX\"," << std::endl
#else
-             << "\"???\"," << std::endl                           // no path to this statement...
+             << "\"???\"," << std::endl // no path to this statement...
#endif
             << "\"RanNumb memory layout\": "
             << "\"AOSOA[" << neppR << "]\""

--- SubProcesses/P1_gg_ttx/color_sum.cc	2026-01-14 14:41:46.229649230 +0000
+++ -	2026-01-14 14:41:51.518864936 +0000
@@ -379,20 +379,20 @@

#ifdef MGONGPUCPP_GPUIMPL
  void
-  color_sum_gpu( fptype* ghelAllMEs,               // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
-                 const fptype* ghelAllJamps,       // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
-                 fptype2* ghelAllBlasTmp,          // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
-                 gpuBlasHandle_t* pBlasHandle,     // input: cuBLAS/hipBLAS handle
-                 gpuStream_t* ghelStreams,         // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
-                 const int nGoodHel,               // input: number of good helicities
-                 const int gpublocks,              // input: cuda gpublocks
-                 const int gputhreads )            // input: cuda gputhreads
+  color_sum_gpu( fptype* ghelAllMEs,           // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
+                 fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+                 const int nGoodHel,           // input: number of good helicities
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
  {
    const int nevt = gpublocks * gputhreads;
    // CASE 1: KERNEL
    if( !pBlasHandle )
    {
-      assert( ghelAllBlasTmp == nullptr );  // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+      assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
      // Loop over helicities
      for( int ighel = 0; ighel < nGoodHel; ighel++ )
      {
@@ -409,13 +409,13 @@
      assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
#else
      checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
-      // Reset the tmp buffer
+                                          // Reset the tmp buffer
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
#else
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
#endif
-      // Delegate the color sum to BLAS for 
+      // Delegate the color sum to BLAS for
      color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
#endif
    }

--- SubProcesses/P1_gg_ttx/CPPProcess.cc	2026-01-14 14:41:46.228649226 +0000
+++ -	2026-01-14 14:41:51.596814709 +0000
@@ -758,11 +758,11 @@
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                       fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators,      // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators,    // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
#endif
-                       bool* isGoodHel,            // output: isGoodHel[ncomb] - host array
-                       const int nevt )            // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+                       bool* isGoodHel, // output: isGoodHel[ncomb] - host array
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
  {
    //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
    //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
@@ -803,11 +803,11 @@
#endif
        }
        constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering
-        //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
+                                                 //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
        cxtype_sv jamp_sv[2 * ncolor] = {}; // all zeros
#else
-        cxtype_sv jamp_sv[ncolor] = {};  // all zeros
+        cxtype_sv jamp_sv[ncolor] = {}; // all zeros
#endif
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
        constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
@@ -1144,7 +1144,7 @@
    assert( npagV % 2 == 0 );     // SANITY CHECK for mixed fptypes: two neppV-pages are merged to one 2*neppV-page
    const int npagV2 = npagV / 2; // loop on two SIMD pages (neppV events) at a time
#else
-    const int npagV2 = npagV;            // loop on one SIMD page (neppV events) at a time
+    const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time
#endif
#ifdef _OPENMP
    // OMP multithreading #575 (NB: tested only with gcc11 so far)
@@ -1207,7 +1207,7 @@
      // Running sum of partial amplitudes squared for event by event color selection (#402)
      // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
      fptype_sv jamp2_sv[nParity * ncolor] = {};
-      fptype_sv MEs_ighel[ncomb] = {};  // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
+      fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page)
#endif

--- SubProcesses/P1_gg_ttx/check_sa.cc	2026-01-14 14:41:46.228649226 +0000
+++ -	2026-01-14 14:41:51.923222061 +0000
@@ -135,7 +135,7 @@
#ifdef MGONGPUCPP_GPUIMPL
  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
#else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost;   // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
#endif
  // Bridge emulation mode (NB Bridge implies RamboHost!)
  bool bridge = false;
@@ -918,7 +918,7 @@
  wrkflwtxt += "/sse4";
#endif
#else
-  wrkflwtxt += "/????";                                           // no path to this statement
+  wrkflwtxt += "/????"; // no path to this statement
#endif
  // -- Has cxtype_v::operator[] bracket with non-const reference?
#if defined MGONGPU_CPPSIMD
@@ -1143,7 +1143,7 @@
#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX
             << "\"STD::COMPLEX\"," << std::endl
#else
-             << "\"???\"," << std::endl                           // no path to this statement...
+             << "\"???\"," << std::endl // no path to this statement...
#endif
             << "\"RanNumb memory layout\": "
             << "\"AOSOA[" << neppR << "]\""


@github-actions

❌ Code Format Check Failed

The generated code does not conform to clang-format rules.

Please update your code generator to produce properly formatted code.
14 files need formatting.

Note: The CI is set up to fail early if formatting issues are detected for any of the processes.
The report below is for process PROC_pp_ttxj with backend avx512.

See attached patch for details:

--- src/mgOnGpuConfig.h	2026-01-14 14:41:45.457122020 +0000
+++ -	2026-01-14 14:41:48.867643356 +0000
@@ -180,7 +180,7 @@
#if defined MGONGPU_FPTYPE_DOUBLE
  typedef double fptype; // double precision (8 bytes, fp64)
#elif defined MGONGPU_FPTYPE_FLOAT
-  typedef float fptype;  // single precision (4 bytes, fp32)
+  typedef float fptype; // single precision (4 bytes, fp32)
#endif

  // Floating point type (for color algebra alone #537): fptype2

--- src/mgOnGpuVectors.h	2026-01-14 14:41:45.205123714 +0000
+++ -	2026-01-14 14:41:49.053673132 +0000
@@ -123,7 +123,7 @@
#if defined MGONGPU_FPTYPE_DOUBLE
  typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb
#elif defined MGONGPU_FPTYPE_FLOAT
-  typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) );                         // bbbb
+  typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb
#endif
#else // gcc
  typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) );

--- SubProcesses/P1_gu_ttxu/color_sum.cc	2026-01-14 14:41:45.355122706 +0000
+++ -	2026-01-14 14:41:49.595211719 +0000
@@ -381,20 +381,20 @@

#ifdef MGONGPUCPP_GPUIMPL
  void
-  color_sum_gpu( fptype* ghelAllMEs,               // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
-                 const fptype* ghelAllJamps,       // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
-                 fptype2* ghelAllBlasTmp,          // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
-                 gpuBlasHandle_t* pBlasHandle,     // input: cuBLAS/hipBLAS handle
-                 gpuStream_t* ghelStreams,         // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
-                 const int nGoodHel,               // input: number of good helicities
-                 const int gpublocks,              // input: cuda gpublocks
-                 const int gputhreads )            // input: cuda gputhreads
+  color_sum_gpu( fptype* ghelAllMEs,           // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
+                 fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+                 const int nGoodHel,           // input: number of good helicities
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
  {
    const int nevt = gpublocks * gputhreads;
    // CASE 1: KERNEL
    if( !pBlasHandle )
    {
-      assert( ghelAllBlasTmp == nullptr );  // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+      assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
      // Loop over helicities
      for( int ighel = 0; ighel < nGoodHel; ighel++ )
      {
@@ -411,13 +411,13 @@
      assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
#else
      checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
-      // Reset the tmp buffer
+                                          // Reset the tmp buffer
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
#else
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
#endif
-      // Delegate the color sum to BLAS for 
+      // Delegate the color sum to BLAS for
      color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
#endif
    }

--- SubProcesses/P1_gu_ttxu/CPPProcess.cc	2026-01-14 14:41:45.354122712 +0000
+++ -	2026-01-14 14:41:49.671154815 +0000
@@ -813,11 +813,11 @@
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                       fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators,      // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators,    // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
#endif
-                       bool* isGoodHel,            // output: isGoodHel[ncomb] - host array
-                       const int nevt )            // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+                       bool* isGoodHel, // output: isGoodHel[ncomb] - host array
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
  {
    //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
    //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
@@ -858,11 +858,11 @@
#endif
        }
        constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering
-        //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
+                                                 //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
        cxtype_sv jamp_sv[2 * ncolor] = {}; // all zeros
#else
-        cxtype_sv jamp_sv[ncolor] = {};  // all zeros
+        cxtype_sv jamp_sv[ncolor] = {}; // all zeros
#endif
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
        constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
@@ -1199,7 +1199,7 @@
    assert( npagV % 2 == 0 );     // SANITY CHECK for mixed fptypes: two neppV-pages are merged to one 2*neppV-page
    const int npagV2 = npagV / 2; // loop on two SIMD pages (neppV events) at a time
#else
-    const int npagV2 = npagV;            // loop on one SIMD page (neppV events) at a time
+    const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time
#endif
#ifdef _OPENMP
    // OMP multithreading #575 (NB: tested only with gcc11 so far)
@@ -1262,7 +1262,7 @@
      // Running sum of partial amplitudes squared for event by event color selection (#402)
      // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
      fptype_sv jamp2_sv[nParity * ncolor] = {};
-      fptype_sv MEs_ighel[ncomb] = {};  // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
+      fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page)
#endif

--- SubProcesses/P1_gu_ttxu/check_sa.cc	2026-01-14 14:41:45.354122712 +0000
+++ -	2026-01-14 14:41:49.993347181 +0000
@@ -135,7 +135,7 @@
#ifdef MGONGPUCPP_GPUIMPL
  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
#else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost;   // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
#endif
  // Bridge emulation mode (NB Bridge implies RamboHost!)
  bool bridge = false;
@@ -918,7 +918,7 @@
  wrkflwtxt += "/sse4";
#endif
#else
-  wrkflwtxt += "/????";                                           // no path to this statement
+  wrkflwtxt += "/????"; // no path to this statement
#endif
  // -- Has cxtype_v::operator[] bracket with non-const reference?
#if defined MGONGPU_CPPSIMD
@@ -1143,7 +1143,7 @@
#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX
             << "\"STD::COMPLEX\"," << std::endl
#else
-             << "\"???\"," << std::endl                           // no path to this statement...
+             << "\"???\"," << std::endl // no path to this statement...
#endif
             << "\"RanNumb memory layout\": "
             << "\"AOSOA[" << neppR << "]\""

--- SubProcesses/P1_uux_ttxg/color_sum.cc	2026-01-14 14:41:45.457122020 +0000
+++ -	2026-01-14 14:41:50.410946597 +0000
@@ -381,20 +381,20 @@

#ifdef MGONGPUCPP_GPUIMPL
  void
-  color_sum_gpu( fptype* ghelAllMEs,               // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
-                 const fptype* ghelAllJamps,       // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
-                 fptype2* ghelAllBlasTmp,          // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
-                 gpuBlasHandle_t* pBlasHandle,     // input: cuBLAS/hipBLAS handle
-                 gpuStream_t* ghelStreams,         // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
-                 const int nGoodHel,               // input: number of good helicities
-                 const int gpublocks,              // input: cuda gpublocks
-                 const int gputhreads )            // input: cuda gputhreads
+  color_sum_gpu( fptype* ghelAllMEs,           // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
+                 fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+                 const int nGoodHel,           // input: number of good helicities
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
  {
    const int nevt = gpublocks * gputhreads;
    // CASE 1: KERNEL
    if( !pBlasHandle )
    {
-      assert( ghelAllBlasTmp == nullptr );  // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+      assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
      // Loop over helicities
      for( int ighel = 0; ighel < nGoodHel; ighel++ )
      {
@@ -411,13 +411,13 @@
      assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
#else
      checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
-      // Reset the tmp buffer
+                                          // Reset the tmp buffer
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
#else
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
#endif
-      // Delegate the color sum to BLAS for 
+      // Delegate the color sum to BLAS for
      color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
#endif
    }

--- SubProcesses/P1_uux_ttxg/CPPProcess.cc	2026-01-14 14:41:45.456122027 +0000
+++ -	2026-01-14 14:41:50.487291823 +0000
@@ -813,11 +813,11 @@
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                       fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators,      // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators,    // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
#endif
-                       bool* isGoodHel,            // output: isGoodHel[ncomb] - host array
-                       const int nevt )            // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+                       bool* isGoodHel, // output: isGoodHel[ncomb] - host array
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
  {
    //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
    //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
@@ -858,11 +858,11 @@
#endif
        }
        constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering
-        //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
+                                                 //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
        cxtype_sv jamp_sv[2 * ncolor] = {}; // all zeros
#else
-        cxtype_sv jamp_sv[ncolor] = {};  // all zeros
+        cxtype_sv jamp_sv[ncolor] = {}; // all zeros
#endif
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
        constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
@@ -1199,7 +1199,7 @@
    assert( npagV % 2 == 0 );     // SANITY CHECK for mixed fptypes: two neppV-pages are merged to one 2*neppV-page
    const int npagV2 = npagV / 2; // loop on two SIMD pages (neppV events) at a time
#else
-    const int npagV2 = npagV;            // loop on one SIMD page (neppV events) at a time
+    const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time
#endif
#ifdef _OPENMP
    // OMP multithreading #575 (NB: tested only with gcc11 so far)
@@ -1262,7 +1262,7 @@
      // Running sum of partial amplitudes squared for event by event color selection (#402)
      // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
      fptype_sv jamp2_sv[nParity * ncolor] = {};
-      fptype_sv MEs_ighel[ncomb] = {};  // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
+      fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page)
#endif

--- SubProcesses/P1_uux_ttxg/check_sa.cc	2026-01-14 14:41:45.456122027 +0000
+++ -	2026-01-14 14:41:50.807240488 +0000
@@ -135,7 +135,7 @@
#ifdef MGONGPUCPP_GPUIMPL
  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
#else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost;   // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
#endif
  // Bridge emulation mode (NB Bridge implies RamboHost!)
  bool bridge = false;
@@ -918,7 +918,7 @@
  wrkflwtxt += "/sse4";
#endif
#else
-  wrkflwtxt += "/????";                                           // no path to this statement
+  wrkflwtxt += "/????"; // no path to this statement
#endif
  // -- Has cxtype_v::operator[] bracket with non-const reference?
#if defined MGONGPU_CPPSIMD
@@ -1143,7 +1143,7 @@
#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX
             << "\"STD::COMPLEX\"," << std::endl
#else
-             << "\"???\"," << std::endl                           // no path to this statement...
+             << "\"???\"," << std::endl // no path to this statement...
#endif
             << "\"RanNumb memory layout\": "
             << "\"AOSOA[" << neppR << "]\""

--- SubProcesses/P1_gux_ttxux/color_sum.cc	2026-01-14 14:41:45.406122363 +0000
+++ -	2026-01-14 14:41:53.836555139 +0000
@@ -381,20 +381,20 @@

#ifdef MGONGPUCPP_GPUIMPL
  void
-  color_sum_gpu( fptype* ghelAllMEs,               // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
-                 const fptype* ghelAllJamps,       // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
-                 fptype2* ghelAllBlasTmp,          // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
-                 gpuBlasHandle_t* pBlasHandle,     // input: cuBLAS/hipBLAS handle
-                 gpuStream_t* ghelStreams,         // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
-                 const int nGoodHel,               // input: number of good helicities
-                 const int gpublocks,              // input: cuda gpublocks
-                 const int gputhreads )            // input: cuda gputhreads
+  color_sum_gpu( fptype* ghelAllMEs,           // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
+                 fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+                 const int nGoodHel,           // input: number of good helicities
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
  {
    const int nevt = gpublocks * gputhreads;
    // CASE 1: KERNEL
    if( !pBlasHandle )
    {
-      assert( ghelAllBlasTmp == nullptr );  // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+      assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
      // Loop over helicities
      for( int ighel = 0; ighel < nGoodHel; ighel++ )
      {
@@ -411,13 +411,13 @@
      assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
#else
      checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
-      // Reset the tmp buffer
+                                          // Reset the tmp buffer
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
#else
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
#endif
-      // Delegate the color sum to BLAS for 
+      // Delegate the color sum to BLAS for
      color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
#endif
    }

--- SubProcesses/P1_gux_ttxux/CPPProcess.cc	2026-01-14 14:41:45.405122370 +0000
+++ -	2026-01-14 14:41:53.914077071 +0000
@@ -813,11 +813,11 @@
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                       fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators,      // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators,    // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
#endif
-                       bool* isGoodHel,            // output: isGoodHel[ncomb] - host array
-                       const int nevt )            // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+                       bool* isGoodHel, // output: isGoodHel[ncomb] - host array
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
  {
    //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
    //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
@@ -858,11 +858,11 @@
#endif
        }
        constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering
-        //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
+                                                 //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
        cxtype_sv jamp_sv[2 * ncolor] = {}; // all zeros
#else
-        cxtype_sv jamp_sv[ncolor] = {};  // all zeros
+        cxtype_sv jamp_sv[ncolor] = {}; // all zeros
#endif
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
        constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
@@ -1199,7 +1199,7 @@
    assert( npagV % 2 == 0 );     // SANITY CHECK for mixed fptypes: two neppV-pages are merged to one 2*neppV-page
    const int npagV2 = npagV / 2; // loop on two SIMD pages (neppV events) at a time
#else
-    const int npagV2 = npagV;            // loop on one SIMD page (neppV events) at a time
+    const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time
#endif
#ifdef _OPENMP
    // OMP multithreading #575 (NB: tested only with gcc11 so far)
@@ -1262,7 +1262,7 @@
      // Running sum of partial amplitudes squared for event by event color selection (#402)
      // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
      fptype_sv jamp2_sv[nParity * ncolor] = {};
-      fptype_sv MEs_ighel[ncomb] = {};  // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
+      fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page)
#endif

--- SubProcesses/P1_gux_ttxux/check_sa.cc	2026-01-14 14:41:45.406122363 +0000
+++ -	2026-01-14 14:41:54.233630187 +0000
@@ -135,7 +135,7 @@
#ifdef MGONGPUCPP_GPUIMPL
  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
#else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost;   // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
#endif
  // Bridge emulation mode (NB Bridge implies RamboHost!)
  bool bridge = false;
@@ -918,7 +918,7 @@
  wrkflwtxt += "/sse4";
#endif
#else
-  wrkflwtxt += "/????";                                           // no path to this statement
+  wrkflwtxt += "/????"; // no path to this statement
#endif
  // -- Has cxtype_v::operator[] bracket with non-const reference?
#if defined MGONGPU_CPPSIMD
@@ -1143,7 +1143,7 @@
#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX
             << "\"STD::COMPLEX\"," << std::endl
#else
-             << "\"???\"," << std::endl                           // no path to this statement...
+             << "\"???\"," << std::endl // no path to this statement...
#endif
             << "\"RanNumb memory layout\": "
             << "\"AOSOA[" << neppR << "]\""

--- SubProcesses/P1_gg_ttxg/color_sum.cc	2026-01-14 14:41:45.288123156 +0000
+++ -	2026-01-14 14:41:54.684000977 +0000
@@ -383,20 +383,20 @@

#ifdef MGONGPUCPP_GPUIMPL
  void
-  color_sum_gpu( fptype* ghelAllMEs,               // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
-                 const fptype* ghelAllJamps,       // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
-                 fptype2* ghelAllBlasTmp,          // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
-                 gpuBlasHandle_t* pBlasHandle,     // input: cuBLAS/hipBLAS handle
-                 gpuStream_t* ghelStreams,         // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
-                 const int nGoodHel,               // input: number of good helicities
-                 const int gpublocks,              // input: cuda gpublocks
-                 const int gputhreads )            // input: cuda gputhreads
+  color_sum_gpu( fptype* ghelAllMEs,           // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
+                 fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+                 const int nGoodHel,           // input: number of good helicities
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
  {
    const int nevt = gpublocks * gputhreads;
    // CASE 1: KERNEL
    if( !pBlasHandle )
    {
-      assert( ghelAllBlasTmp == nullptr );  // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+      assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
      // Loop over helicities
      for( int ighel = 0; ighel < nGoodHel; ighel++ )
      {
@@ -413,13 +413,13 @@
      assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
#else
      checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
-      // Reset the tmp buffer
+                                          // Reset the tmp buffer
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
#else
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
#endif
-      // Delegate the color sum to BLAS for 
+      // Delegate the color sum to BLAS for
      color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
#endif
    }

--- SubProcesses/P1_gg_ttxg/CPPProcess.cc	2026-01-14 14:41:45.287123163 +0000
+++ -	2026-01-14 14:41:54.769152262 +0000
@@ -975,11 +975,11 @@
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                       fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators,      // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators,    // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
#endif
-                       bool* isGoodHel,            // output: isGoodHel[ncomb] - host array
-                       const int nevt )            // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+                       bool* isGoodHel, // output: isGoodHel[ncomb] - host array
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
  {
    //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
    //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
@@ -1020,11 +1020,11 @@
#endif
        }
        constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering
-        //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
+                                                 //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
        cxtype_sv jamp_sv[2 * ncolor] = {}; // all zeros
#else
-        cxtype_sv jamp_sv[ncolor] = {};  // all zeros
+        cxtype_sv jamp_sv[ncolor] = {}; // all zeros
#endif
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
        constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
@@ -1361,7 +1361,7 @@
    assert( npagV % 2 == 0 );     // SANITY CHECK for mixed fptypes: two neppV-pages are merged to one 2*neppV-page
    const int npagV2 = npagV / 2; // loop on two SIMD pages (neppV events) at a time
#else
-    const int npagV2 = npagV;            // loop on one SIMD page (neppV events) at a time
+    const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time
#endif
#ifdef _OPENMP
    // OMP multithreading #575 (NB: tested only with gcc11 so far)
@@ -1424,7 +1424,7 @@
      // Running sum of partial amplitudes squared for event by event color selection (#402)
      // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
      fptype_sv jamp2_sv[nParity * ncolor] = {};
-      fptype_sv MEs_ighel[ncomb] = {};  // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
+      fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page)
#endif

--- SubProcesses/P1_gg_ttxg/check_sa.cc	2026-01-14 14:41:45.287123163 +0000
+++ -	2026-01-14 14:41:55.097979057 +0000
@@ -135,7 +135,7 @@
#ifdef MGONGPUCPP_GPUIMPL
  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
#else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost;   // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
#endif
  // Bridge emulation mode (NB Bridge implies RamboHost!)
  bool bridge = false;
@@ -918,7 +918,7 @@
  wrkflwtxt += "/sse4";
#endif
#else
-  wrkflwtxt += "/????";                                           // no path to this statement
+  wrkflwtxt += "/????"; // no path to this statement
#endif
  // -- Has cxtype_v::operator[] bracket with non-const reference?
#if defined MGONGPU_CPPSIMD
@@ -1143,7 +1143,7 @@
#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX
             << "\"STD::COMPLEX\"," << std::endl
#else
-             << "\"???\"," << std::endl                           // no path to this statement...
+             << "\"???\"," << std::endl // no path to this statement...
#endif
             << "\"RanNumb memory layout\": "
             << "\"AOSOA[" << neppR << "]\""


@github-actions

❌ Code Format Check Failed

The generated code does not conform to clang-format rules.

Please update your code generator to produce properly formatted code.
8 files need formatting.

Note: The CI is set up to fail early if formatting issues are detected for any of the processes.
The report below is for process PROC_pp_ttx with backend scalar.

See attached patch for details:

--- src/mgOnGpuConfig.h	2026-01-14 14:41:48.436766918 +0000
+++ -	2026-01-14 14:41:50.217650410 +0000
@@ -180,7 +180,7 @@
#if defined MGONGPU_FPTYPE_DOUBLE
  typedef double fptype; // double precision (8 bytes, fp64)
#elif defined MGONGPU_FPTYPE_FLOAT
-  typedef float fptype;  // single precision (4 bytes, fp32)
+  typedef float fptype; // single precision (4 bytes, fp32)
#endif

  // Floating point type (for color algebra alone #537): fptype2

--- src/mgOnGpuVectors.h	2026-01-14 14:41:48.351766523 +0000
+++ -	2026-01-14 14:41:50.409747751 +0000
@@ -123,7 +123,7 @@
#if defined MGONGPU_FPTYPE_DOUBLE
  typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb
#elif defined MGONGPU_FPTYPE_FLOAT
-  typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) );                         // bbbb
+  typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb
#endif
#else // gcc
  typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) );

--- SubProcesses/P1_uux_ttx/color_sum.cc	2026-01-14 14:41:48.436766918 +0000
+++ -	2026-01-14 14:41:51.094459859 +0000
@@ -379,20 +379,20 @@

#ifdef MGONGPUCPP_GPUIMPL
  void
-  color_sum_gpu( fptype* ghelAllMEs,               // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
-                 const fptype* ghelAllJamps,       // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
-                 fptype2* ghelAllBlasTmp,          // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
-                 gpuBlasHandle_t* pBlasHandle,     // input: cuBLAS/hipBLAS handle
-                 gpuStream_t* ghelStreams,         // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
-                 const int nGoodHel,               // input: number of good helicities
-                 const int gpublocks,              // input: cuda gpublocks
-                 const int gputhreads )            // input: cuda gputhreads
+  color_sum_gpu( fptype* ghelAllMEs,           // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
+                 fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+                 const int nGoodHel,           // input: number of good helicities
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
  {
    const int nevt = gpublocks * gputhreads;
    // CASE 1: KERNEL
    if( !pBlasHandle )
    {
-      assert( ghelAllBlasTmp == nullptr );  // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+      assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
      // Loop over helicities
      for( int ighel = 0; ighel < nGoodHel; ighel++ )
      {
@@ -409,13 +409,13 @@
      assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
#else
      checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
-      // Reset the tmp buffer
+                                          // Reset the tmp buffer
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
#else
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
#endif
-      // Delegate the color sum to BLAS for 
+      // Delegate the color sum to BLAS for
      color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
#endif
    }

--- SubProcesses/P1_uux_ttx/CPPProcess.cc	2026-01-14 14:41:48.435766913 +0000
+++ -	2026-01-14 14:41:51.168911526 +0000
@@ -735,11 +735,11 @@
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                       fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators,      // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators,    // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
#endif
-                       bool* isGoodHel,            // output: isGoodHel[ncomb] - host array
-                       const int nevt )            // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+                       bool* isGoodHel, // output: isGoodHel[ncomb] - host array
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
  {
    //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
    //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
@@ -780,11 +780,11 @@
#endif
        }
        constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering
-        //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
+                                                 //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
        cxtype_sv jamp_sv[2 * ncolor] = {}; // all zeros
#else
-        cxtype_sv jamp_sv[ncolor] = {};  // all zeros
+        cxtype_sv jamp_sv[ncolor] = {}; // all zeros
#endif
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
        constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
@@ -1121,7 +1121,7 @@
    assert( npagV % 2 == 0 );     // SANITY CHECK for mixed fptypes: two neppV-pages are merged to one 2*neppV-page
    const int npagV2 = npagV / 2; // loop on two SIMD pages (neppV events) at a time
#else
-    const int npagV2 = npagV;            // loop on one SIMD page (neppV events) at a time
+    const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time
#endif
#ifdef _OPENMP
    // OMP multithreading #575 (NB: tested only with gcc11 so far)
@@ -1184,7 +1184,7 @@
      // Running sum of partial amplitudes squared for event by event color selection (#402)
      // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
      fptype_sv jamp2_sv[nParity * ncolor] = {};
-      fptype_sv MEs_ighel[ncomb] = {};  // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
+      fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page)
#endif

--- SubProcesses/P1_uux_ttx/check_sa.cc	2026-01-14 14:41:48.435766913 +0000
+++ -	2026-01-14 14:41:51.487867143 +0000
@@ -135,7 +135,7 @@
#ifdef MGONGPUCPP_GPUIMPL
  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
#else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost;   // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
#endif
  // Bridge emulation mode (NB Bridge implies RamboHost!)
  bool bridge = false;
@@ -918,7 +918,7 @@
  wrkflwtxt += "/sse4";
#endif
#else
-  wrkflwtxt += "/????";                                           // no path to this statement
+  wrkflwtxt += "/????"; // no path to this statement
#endif
  // -- Has cxtype_v::operator[] bracket with non-const reference?
#if defined MGONGPU_CPPSIMD
@@ -1143,7 +1143,7 @@
#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX
             << "\"STD::COMPLEX\"," << std::endl
#else
-             << "\"???\"," << std::endl                           // no path to this statement...
+             << "\"???\"," << std::endl // no path to this statement...
#endif
             << "\"RanNumb memory layout\": "
             << "\"AOSOA[" << neppR << "]\""

--- SubProcesses/P1_gg_ttx/color_sum.cc	2026-01-14 14:41:48.381766662 +0000
+++ -	2026-01-14 14:41:51.879278498 +0000
@@ -379,20 +379,20 @@

#ifdef MGONGPUCPP_GPUIMPL
  void
-  color_sum_gpu( fptype* ghelAllMEs,               // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
-                 const fptype* ghelAllJamps,       // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
-                 fptype2* ghelAllBlasTmp,          // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
-                 gpuBlasHandle_t* pBlasHandle,     // input: cuBLAS/hipBLAS handle
-                 gpuStream_t* ghelStreams,         // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
-                 const int nGoodHel,               // input: number of good helicities
-                 const int gpublocks,              // input: cuda gpublocks
-                 const int gputhreads )            // input: cuda gputhreads
+  color_sum_gpu( fptype* ghelAllMEs,           // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
+                 fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+                 const int nGoodHel,           // input: number of good helicities
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
  {
    const int nevt = gpublocks * gputhreads;
    // CASE 1: KERNEL
    if( !pBlasHandle )
    {
-      assert( ghelAllBlasTmp == nullptr );  // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+      assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
      // Loop over helicities
      for( int ighel = 0; ighel < nGoodHel; ighel++ )
      {
@@ -409,13 +409,13 @@
      assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
#else
      checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
-      // Reset the tmp buffer
+                                          // Reset the tmp buffer
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
#else
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
#endif
-      // Delegate the color sum to BLAS for 
+      // Delegate the color sum to BLAS for
      color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
#endif
    }

--- SubProcesses/P1_gg_ttx/CPPProcess.cc	2026-01-14 14:41:48.380766658 +0000
+++ -	2026-01-14 14:41:51.953198464 +0000
@@ -758,11 +758,11 @@
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                       fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators,      // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators,    // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
#endif
-                       bool* isGoodHel,            // output: isGoodHel[ncomb] - host array
-                       const int nevt )            // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+                       bool* isGoodHel, // output: isGoodHel[ncomb] - host array
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
  {
    //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
    //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
@@ -803,11 +803,11 @@
#endif
        }
        constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering
-        //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
+                                                 //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
        cxtype_sv jamp_sv[2 * ncolor] = {}; // all zeros
#else
-        cxtype_sv jamp_sv[ncolor] = {};  // all zeros
+        cxtype_sv jamp_sv[ncolor] = {}; // all zeros
#endif
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
        constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
@@ -1144,7 +1144,7 @@
    assert( npagV % 2 == 0 );     // SANITY CHECK for mixed fptypes: two neppV-pages are merged to one 2*neppV-page
    const int npagV2 = npagV / 2; // loop on two SIMD pages (neppV events) at a time
#else
-    const int npagV2 = npagV;            // loop on one SIMD page (neppV events) at a time
+    const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time
#endif
#ifdef _OPENMP
    // OMP multithreading #575 (NB: tested only with gcc11 so far)
@@ -1207,7 +1207,7 @@
      // Running sum of partial amplitudes squared for event by event color selection (#402)
      // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
      fptype_sv jamp2_sv[nParity * ncolor] = {};
-      fptype_sv MEs_ighel[ncomb] = {};  // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
+      fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page)
#endif

--- SubProcesses/P1_gg_ttx/check_sa.cc	2026-01-14 14:41:48.381766662 +0000
+++ -	2026-01-14 14:41:52.275690747 +0000
@@ -135,7 +135,7 @@
#ifdef MGONGPUCPP_GPUIMPL
  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
#else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost;   // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
#endif
  // Bridge emulation mode (NB Bridge implies RamboHost!)
  bool bridge = false;
@@ -918,7 +918,7 @@
  wrkflwtxt += "/sse4";
#endif
#else
-  wrkflwtxt += "/????";                                           // no path to this statement
+  wrkflwtxt += "/????"; // no path to this statement
#endif
  // -- Has cxtype_v::operator[] bracket with non-const reference?
#if defined MGONGPU_CPPSIMD
@@ -1143,7 +1143,7 @@
#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX
             << "\"STD::COMPLEX\"," << std::endl
#else
-             << "\"???\"," << std::endl                           // no path to this statement...
+             << "\"???\"," << std::endl // no path to this statement...
#endif
             << "\"RanNumb memory layout\": "
             << "\"AOSOA[" << neppR << "]\""


@github-actions

❌ Code Format Check Failed

The generated code does not conform to clang-format rules.

Please update your code generator to produce properly formatted code.
8 files need formatting.

Note: The CI is set up so that it fails early if formatting issues are detected for any of the processes.
The report below is for process PROC_pp_ttx with the hip backend.

See attached patch for details (a short sketch of the expected trailing-comment style follows the patch):

--- src/mgOnGpuConfig.h	2026-01-14 14:41:48.546753444 +0000
+++ -	2026-01-14 14:41:50.537749405 +0000
@@ -180,7 +180,7 @@
#if defined MGONGPU_FPTYPE_DOUBLE
  typedef double fptype; // double precision (8 bytes, fp64)
#elif defined MGONGPU_FPTYPE_FLOAT
-  typedef float fptype;  // single precision (4 bytes, fp32)
+  typedef float fptype; // single precision (4 bytes, fp32)
#endif

  // Floating point type (for color algebra alone #537): fptype2

--- src/mgOnGpuVectors.h	2026-01-14 14:41:48.470752483 +0000
+++ -	2026-01-14 14:41:50.717310388 +0000
@@ -123,7 +123,7 @@
#if defined MGONGPU_FPTYPE_DOUBLE
  typedef long int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb
#elif defined MGONGPU_FPTYPE_FLOAT
-  typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) );                         // bbbb
+  typedef int bool_v __attribute__( ( ext_vector_type( neppV ) ) ); // bbbb
#endif
#else // gcc
  typedef unsigned int uint_v __attribute__( ( vector_size( neppV * sizeof( unsigned int ) ) ) );

--- SubProcesses/P1_uux_ttx/color_sum.cc	2026-01-14 14:41:48.546753444 +0000
+++ -	2026-01-14 14:41:51.362661886 +0000
@@ -379,20 +379,20 @@

#ifdef MGONGPUCPP_GPUIMPL
  void
-  color_sum_gpu( fptype* ghelAllMEs,               // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
-                 const fptype* ghelAllJamps,       // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
-                 fptype2* ghelAllBlasTmp,          // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
-                 gpuBlasHandle_t* pBlasHandle,     // input: cuBLAS/hipBLAS handle
-                 gpuStream_t* ghelStreams,         // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
-                 const int nGoodHel,               // input: number of good helicities
-                 const int gpublocks,              // input: cuda gpublocks
-                 const int gputhreads )            // input: cuda gputhreads
+  color_sum_gpu( fptype* ghelAllMEs,           // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
+                 fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+                 const int nGoodHel,           // input: number of good helicities
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
  {
    const int nevt = gpublocks * gputhreads;
    // CASE 1: KERNEL
    if( !pBlasHandle )
    {
-      assert( ghelAllBlasTmp == nullptr );  // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+      assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
      // Loop over helicities
      for( int ighel = 0; ighel < nGoodHel; ighel++ )
      {
@@ -409,13 +409,13 @@
      assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
#else
      checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
-      // Reset the tmp buffer
+                                          // Reset the tmp buffer
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
#else
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
#endif
-      // Delegate the color sum to BLAS for 
+      // Delegate the color sum to BLAS for
      color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
#endif
    }

--- SubProcesses/P1_uux_ttx/CPPProcess.cc	2026-01-14 14:41:48.544753419 +0000
+++ -	2026-01-14 14:41:51.433884888 +0000
@@ -735,11 +735,11 @@
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                       fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators,      // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators,    // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
#endif
-                       bool* isGoodHel,            // output: isGoodHel[ncomb] - host array
-                       const int nevt )            // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+                       bool* isGoodHel, // output: isGoodHel[ncomb] - host array
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
  {
    //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
    //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
@@ -780,11 +780,11 @@
#endif
        }
        constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering
-        //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
+                                                 //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
        cxtype_sv jamp_sv[2 * ncolor] = {}; // all zeros
#else
-        cxtype_sv jamp_sv[ncolor] = {};  // all zeros
+        cxtype_sv jamp_sv[ncolor] = {}; // all zeros
#endif
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
        constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
@@ -1121,7 +1121,7 @@
    assert( npagV % 2 == 0 );     // SANITY CHECK for mixed fptypes: two neppV-pages are merged to one 2*neppV-page
    const int npagV2 = npagV / 2; // loop on two SIMD pages (neppV events) at a time
#else
-    const int npagV2 = npagV;            // loop on one SIMD page (neppV events) at a time
+    const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time
#endif
#ifdef _OPENMP
    // OMP multithreading #575 (NB: tested only with gcc11 so far)
@@ -1184,7 +1184,7 @@
      // Running sum of partial amplitudes squared for event by event color selection (#402)
      // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
      fptype_sv jamp2_sv[nParity * ncolor] = {};
-      fptype_sv MEs_ighel[ncomb] = {};  // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
+      fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page)
#endif

--- SubProcesses/P1_uux_ttx/check_sa.cc	2026-01-14 14:41:48.545753432 +0000
+++ -	2026-01-14 14:41:51.748913363 +0000
@@ -135,7 +135,7 @@
#ifdef MGONGPUCPP_GPUIMPL
  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
#else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost;   // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
#endif
  // Bridge emulation mode (NB Bridge implies RamboHost!)
  bool bridge = false;
@@ -918,7 +918,7 @@
  wrkflwtxt += "/sse4";
#endif
#else
-  wrkflwtxt += "/????";                                           // no path to this statement
+  wrkflwtxt += "/????"; // no path to this statement
#endif
  // -- Has cxtype_v::operator[] bracket with non-const reference?
#if defined MGONGPU_CPPSIMD
@@ -1143,7 +1143,7 @@
#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX
             << "\"STD::COMPLEX\"," << std::endl
#else
-             << "\"???\"," << std::endl                           // no path to this statement...
+             << "\"???\"," << std::endl // no path to this statement...
#endif
             << "\"RanNumb memory layout\": "
             << "\"AOSOA[" << neppR << "]\""

--- SubProcesses/P1_gg_ttx/color_sum.cc	2026-01-14 14:41:48.500752863 +0000
+++ -	2026-01-14 14:41:52.134106287 +0000
@@ -379,20 +379,20 @@

#ifdef MGONGPUCPP_GPUIMPL
  void
-  color_sum_gpu( fptype* ghelAllMEs,               // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
-                 const fptype* ghelAllJamps,       // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
-                 fptype2* ghelAllBlasTmp,          // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
-                 gpuBlasHandle_t* pBlasHandle,     // input: cuBLAS/hipBLAS handle
-                 gpuStream_t* ghelStreams,         // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
-                 const int nGoodHel,               // input: number of good helicities
-                 const int gpublocks,              // input: cuda gpublocks
-                 const int gputhreads )            // input: cuda gputhreads
+  color_sum_gpu( fptype* ghelAllMEs,           // output: allMEs super-buffer for nGoodHel <= ncomb individual helicities (index is ighel)
+                 const fptype* ghelAllJamps,   // input: allJamps super-buffer[2][ncol][nGoodHel][nevt] for nGoodHel <= ncomb individual helicities
+                 fptype2* ghelAllBlasTmp,      // tmp: allBlasTmp super-buffer for nGoodHel <= ncomb individual helicities
+                 gpuBlasHandle_t* pBlasHandle, // input: cuBLAS/hipBLAS handle
+                 gpuStream_t* ghelStreams,     // input: cuda streams (index is ighel: only the first nGoodHel <= ncomb are non-null)
+                 const int nGoodHel,           // input: number of good helicities
+                 const int gpublocks,          // input: cuda gpublocks
+                 const int gputhreads )        // input: cuda gputhreads
  {
    const int nevt = gpublocks * gputhreads;
    // CASE 1: KERNEL
    if( !pBlasHandle )
    {
-      assert( ghelAllBlasTmp == nullptr );  // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
+      assert( ghelAllBlasTmp == nullptr ); // sanity check for HASBLAS=hasNoBlas or CUDACPP_RUNTIME_BLASCOLORSUM not set
      // Loop over helicities
      for( int ighel = 0; ighel < nGoodHel; ighel++ )
      {
@@ -409,13 +409,13 @@
      assert( false ); // sanity check: no path to this statement for HASBLAS=hasNoBlas
#else
      checkGpu( gpuDeviceSynchronize() ); // do not start the BLAS color sum for all helicities until the loop over helicities has completed
-      // Reset the tmp buffer
+                                          // Reset the tmp buffer
#if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( 2 * ncolor * mgOnGpu::nx2 + 1 ) * sizeof( fptype2 ) );
#else
      gpuMemset( ghelAllBlasTmp, 0, nGoodHel * nevt * ( ncolor * mgOnGpu::nx2 ) * sizeof( fptype2 ) );
#endif
-      // Delegate the color sum to BLAS for 
+      // Delegate the color sum to BLAS for
      color_sum_blas( ghelAllMEs, ghelAllJamps, ghelAllBlasTmp, pBlasHandle, ghelStreams, nGoodHel, gpublocks, gputhreads );
#endif
    }

--- SubProcesses/P1_gg_ttx/CPPProcess.cc	2026-01-14 14:41:48.499752850 +0000
+++ -	2026-01-14 14:41:52.206247236 +0000
@@ -758,11 +758,11 @@
                       const fptype* allcouplings, // input: couplings[nevt*ndcoup*2]
                       fptype* allMEs,             // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL
-                       fptype* allNumerators,      // output: multichannel numerators[nevt], running_sum_over_helicities
-                       fptype* allDenominators,    // output: multichannel denominators[nevt], running_sum_over_helicities
+                       fptype* allNumerators,   // output: multichannel numerators[nevt], running_sum_over_helicities
+                       fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities
#endif
-                       bool* isGoodHel,            // output: isGoodHel[ncomb] - host array
-                       const int nevt )            // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
+                       bool* isGoodHel, // output: isGoodHel[ncomb] - host array
+                       const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
  {
    //assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
    //assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment [COMMENT OUT TO TEST MISALIGNED ACCESS]
@@ -803,11 +803,11 @@
#endif
        }
        constexpr fptype_sv* jamp2_sv = nullptr; // no need for color selection during helicity filtering
-        //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
+                                                 //std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
        cxtype_sv jamp_sv[2 * ncolor] = {}; // all zeros
#else
-        cxtype_sv jamp_sv[ncolor] = {};  // all zeros
+        cxtype_sv jamp_sv[ncolor] = {}; // all zeros
#endif
#ifdef MGONGPU_SUPPORTS_MULTICHANNEL /* clang-format off */
        constexpr unsigned int channelId = 0; // disable multichannel single-diagram enhancement
@@ -1144,7 +1144,7 @@
    assert( npagV % 2 == 0 );     // SANITY CHECK for mixed fptypes: two neppV-pages are merged to one 2*neppV-page
    const int npagV2 = npagV / 2; // loop on two SIMD pages (neppV events) at a time
#else
-    const int npagV2 = npagV;            // loop on one SIMD page (neppV events) at a time
+    const int npagV2 = npagV; // loop on one SIMD page (neppV events) at a time
#endif
#ifdef _OPENMP
    // OMP multithreading #575 (NB: tested only with gcc11 so far)
@@ -1207,7 +1207,7 @@
      // Running sum of partial amplitudes squared for event by event color selection (#402)
      // (jamp2[nParity][ncolor][neppV] for the SIMD vector - or the two SIMD vectors - of events processed in calculate_jamps)
      fptype_sv jamp2_sv[nParity * ncolor] = {};
-      fptype_sv MEs_ighel[ncomb] = {};  // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
+      fptype_sv MEs_ighel[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the first - and/or only - neppV page)
#if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT
      fptype_sv MEs_ighel2[ncomb] = {}; // sum of MEs for all good helicities up to ighel (for the second neppV page)
#endif

--- SubProcesses/P1_gg_ttx/check_sa.cc	2026-01-14 14:41:48.499752850 +0000
+++ -	2026-01-14 14:41:52.526936543 +0000
@@ -135,7 +135,7 @@
#ifdef MGONGPUCPP_GPUIMPL
  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU
#else
-  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost;   // default on CPU
+  RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU
#endif
  // Bridge emulation mode (NB Bridge implies RamboHost!)
  bool bridge = false;
@@ -918,7 +918,7 @@
  wrkflwtxt += "/sse4";
#endif
#else
-  wrkflwtxt += "/????";                                           // no path to this statement
+  wrkflwtxt += "/????"; // no path to this statement
#endif
  // -- Has cxtype_v::operator[] bracket with non-const reference?
#if defined MGONGPU_CPPSIMD
@@ -1143,7 +1143,7 @@
#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX
             << "\"STD::COMPLEX\"," << std::endl
#else
-             << "\"???\"," << std::endl                           // no path to this statement...
+             << "\"???\"," << std::endl // no path to this statement...
#endif
             << "\"RanNumb memory layout\": "
             << "\"AOSOA[" << neppR << "]\""

