Commit 07c1e73

Fix cuDNN convolution precision on Ampere+ GPUs (#3127)
On Ampere and later GPUs (SM 8.0+), cuDNN's default math mode permits TF32 Tensor Core operations, which use reduced mantissa precision (10 bits versus FP32's 23). This causes numerical differences when comparing CUDA vs CPU convolution results, particularly in cudnnConvolutionBackwardFilter(). Explicitly set CUDNN_FMA_MATH to force true FP32 computation for consistent numerical results across all GPU architectures.
1 parent 60adc65 commit 07c1e73

1 file changed

Lines changed: 9 additions & 0 deletions

File tree

dlib/cuda/cudnn_dlibapi.cpp

@@ -1044,6 +1044,15 @@ namespace dlib
                 CUDNN_CROSS_CORRELATION)); // could also be CUDNN_CONVOLUTION
 #endif
 
+#if CUDNN_MAJOR >= 8
+            // On Ampere and later GPUs, CUDNN_DEFAULT_MATH permits TF32 Tensor Core
+            // operations which have reduced precision. Use CUDNN_FMA_MATH to force
+            // true FP32 computation for consistent numerical results.
+            CHECK_CUDNN(cudnnSetConvolutionMathType(
+                (cudnnConvolutionDescriptor_t)conv_handle,
+                CUDNN_FMA_MATH));
+#endif
 
             CHECK_CUDNN(cudnnGetConvolution2dForwardOutputDim(
                 (const cudnnConvolutionDescriptor_t)conv_handle,
                 descriptor(data),
