Skip to content

Commit 6c5cc7f

Browse files
Added Quantize Configs to grouped Quantization
Signed-off-by: Oleg Goncharov <ogoncharov@nvidia.com>
1 parent 1f1ab92 commit 6c5cc7f

3 files changed

Lines changed: 34 additions & 10 deletions

File tree

tests/cpp/operator/test_cast_nvfp4_transpose_grouped.cu

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,7 @@ void compare_nvfp4_tensors(const std::string& name,
227227
}
228228
}
229229

230-
constexpr bool print_detailed_summary = true;
230+
bool print_detailed_summary = false;
231231
if (print_detailed_summary) {
232232
// Always report summary - either success or failure
233233
std::cout << "=== SUMMARY for tensor " << name << " ===" << std::endl;
@@ -492,7 +492,11 @@ void performTest(const ShapeRepresentation shape_rep,
492492
&offsets_tensor, sizeof(offsets_tensor));
493493
}
494494

495-
nvte_group_quantize(in_group_tensor, out_group_tensor, 0);
495+
QuantizationConfigWrapper quant_config;
496+
quant_config.set_use_fast_math(use_fast_math);
497+
quant_config.set_stochastic_rounding(false);
498+
499+
nvte_group_quantize_v2(in_group_tensor, out_group_tensor, quant_config, 0);
496500
cudaDeviceSynchronize();
497501
auto err = cudaGetLastError();
498502
ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err);
@@ -548,14 +552,14 @@ void performTest(const ShapeRepresentation shape_rep,
548552

549553
// {shape_representation, num_tensors, [logical_shape_M, logical_shape_K], [M_i], [K_i]}
550554
std::vector<std::vector<size_t>> grouped_input_config = {
551-
// {SAME_BOTH_DIMS, 1, 128,128},
552-
// {SAME_BOTH_DIMS, 2, 256,128},
553-
// {VARYING_FIRST_DIM, 2, 512,128, 128,384},
554-
// {VARYING_FIRST_DIM, 3, 1024,160, 128,384,512},
555-
// {VARYING_FIRST_DIM, 4, 1536,160, 128,384,512,512},
556-
// {VARYING_FIRST_DIM, 5, 4096,512, 128,256,384,1024,2304},
557-
// {VARYING_LAST_DIM, 3, 256,896, 128,256,512},
558-
// {VARYING_BOTH_DIMS, 2, 1,(128*128)+(256*256), 128,256, 128,256},
555+
{SAME_BOTH_DIMS, 1, 128,128},
556+
{SAME_BOTH_DIMS, 2, 256,128},
557+
{VARYING_FIRST_DIM, 2, 512,128, 128,384},
558+
{VARYING_FIRST_DIM, 3, 1024,160, 128,384,512},
559+
{VARYING_FIRST_DIM, 4, 1536,160, 128,384,512,512},
560+
{VARYING_FIRST_DIM, 5, 4096,512, 128,256,384,1024,2304},
561+
{VARYING_LAST_DIM, 3, 256,896, 128,256,512},
562+
{VARYING_BOTH_DIMS, 2, 1,(128*128)+(256*256), 128,256, 128,256},
559563
{VARYING_BOTH_DIMS, 2, 1,(256*128)+(512*640), 256,512, 128,640},
560564
};
561565

transformer_engine/common/cast/cast.cu

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,15 @@ void nvte_quantize_v2(const NVTETensor input, NVTETensor output,
5656
dispatch::quantize_fwd_helper<IS_ACT, Empty, nullptr>(input, output, quant_config, stream);
5757
}
5858

59+
void nvte_group_quantize_v2(const NVTEGroupedTensor input, NVTEGroupedTensor output,
60+
const NVTEQuantizationConfig quant_config, cudaStream_t stream) {
61+
NVTE_API_CALL(nvte_group_quantize_v2);
62+
using namespace transformer_engine;
63+
64+
constexpr bool IS_ACT = false;
65+
dispatch::group_quantize_fwd_helper<IS_ACT, Empty, nullptr>(input, output, quant_config, stream);
66+
}
67+
5968
void nvte_quantize_dbias(const NVTETensor input, NVTETensor output, NVTETensor dbias,
6069
NVTETensor workspace, cudaStream_t stream) {
6170
NVTE_API_CALL(nvte_quantize_dbias);

transformer_engine/common/include/transformer_engine/cast.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,17 @@ void nvte_quantize_noop(const NVTETensor input, NVTETensor output, NVTETensor no
124124
void nvte_quantize_v2(const NVTETensor input, NVTETensor output,
125125
const NVTEQuantizationConfig quant_config, cudaStream_t stream);
126126

127+
128+
/*! \brief Casts input grouped tensor to quantized grouped output tensor, with advanced quantization options.
129+
*
130+
* \param[in] input Input grouped tensor to be cast.
131+
* \param[in,out] output Output grouped quantized tensor.
132+
* \param[in] quant_config Quantization configuration.
133+
* \param[in] stream CUDA stream used for the operation.
134+
*/
135+
void nvte_group_quantize_v2(const NVTEGroupedTensor input, NVTEGroupedTensor output,
136+
const NVTEQuantizationConfig quant_config, cudaStream_t stream);
137+
127138
/*! \brief Casts input tensor to MXFP8. Additionally, reduces the input along columns.
128139
* If the scaling mode of the output tensor is set to NVTE_MXFP8_1D_SCALING,
129140
* the block quantization (MXFP8) of the specified shape of the block will be used.

0 commit comments

Comments
 (0)