Commit 0708bc78 authored by Megvii Engine Team

fix(dnn/cuda): disallow implicit dtype conversion in cublaslt matmul algos

disable tensor op matmul kernels when the input and output tensors are all in f32, to avoid potential accuracy loss from implicit down-conversion

GitOrigin-RevId: 36859cba5a526a7778e12c03ac32815144fe0505
Parent 3f01112a
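Reviewer note: the fix constrains the cublasLt heuristic query with the CUBLASLT_MATMUL_PREF_MATH_MODE_MASK preference, and only on CUDA versions below 11.0 (the #if CUDA_VERSION < 11000 guard in the first hunk). As a rough illustration of the same policy against the plain cuBLAS v2 API (a minimal sketch, not part of this patch; the helper name is made up):

#include <cublas_v2.h>

// Hypothetical helper (illustration only, not MegEngine code): pin the
// handle to the default math mode so cuBLAS will not pick tensor-op
// kernels that implicitly down-convert f32 operands on CUDA 10.x.
inline cublasStatus_t disallow_f32_tensor_op(cublasHandle_t handle) {
    return cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH);
}

In the patch itself the restriction is applied per heuristic query through the preference object rather than on the handle, so other dtype configurations keep their tensor-op algorithms.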
@@ -313,6 +313,19 @@ bool CUBLASLTMatmulDesc::get_algorithm_heuristic(const SizeArgs& args,
    cublas_check(cublasLtMatmulPreferenceSetAttribute(
            algo_pref, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &algo_ws_limit,
            sizeof(algo_ws_limit)));
#if CUDA_VERSION < 11000
    bool is_f32_config = args.layout_a.dtype == dtype::Float32() &&
                         args.layout_b.dtype == dtype::Float32() &&
                         args.layout_c.dtype == dtype::Float32();
    if (is_f32_config) {
        // disable HMMA tensor op matmul when inputs and output are all f32
        // tensors, to avoid the potential accuracy loss
        uint32_t math_mode = CUBLAS_DEFAULT_MATH;
        cublas_check(cublasLtMatmulPreferenceSetAttribute(
                algo_pref, CUBLASLT_MATMUL_PREF_MATH_MODE_MASK, &math_mode,
                sizeof(math_mode)));
    }
#endif
    status = cublasLtMatmulAlgoGetHeuristic(
            cublasLt_handle, matmul_desc,
            dt_c == CUDA_R_32I ? layout_trans_b : layout_b,
......
@@ -215,6 +215,7 @@ std::vector<BenchArgs> get_feat_model_args() {
    return args;
}
#if CUDA_VERSION >= 10020
std::vector<BenchArgs> get_f16_feat_model_args() {
    std::vector<BenchArgs> args;
    args.emplace_back(BenchArgs{128, 9216, 9216});
@@ -222,6 +223,7 @@ std::vector<BenchArgs> get_f16_feat_model_args() {
    args.emplace_back(BenchArgs{128, 5184, 5184});
    return args;
}
#endif
void benchmark_matrix_mul(
        Handle* handle, const std::vector<BenchArgs>& args, DType A_dtype,
......
@@ -473,7 +473,34 @@ TEST_F(CUDA, MATRIX_MUL_CUBLASLT_INT8) {
        execs({A, B, {}});
    }
}

TEST_F(CUDA, MATRIX_MUL_CUBLASLT_F32) {
    require_compute_capability(7, 5);
    size_t m = 128, n = 1024, k = 18432;
    Checker<MatrixMul> checker(handle_cuda());
    checker.set_before_exec_callback(
            AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
    using Param = MatrixMul::Param;
    Param param;
    DType stype = dtype::Float32();
    DType dtype = dtype::Float32();
    TensorShape A, B;
    param.transposeA = param.transposeB = 0;
    if (param.transposeA)
        A = TensorShape{k, m};
    else
        A = TensorShape{m, k};
    if (param.transposeB)
        B = TensorShape{n, k};
    else
        B = TensorShape{k, n};
    checker.set_param(param)
            .set_dtype(0, stype)
            .set_dtype(1, stype)
            .set_dtype(2, dtype)
            .execs({A, B, {}});
}
} // namespace test
} // namespace megdnn
// vim: syntax=cpp.doxygen