diff --git a/dnn/src/cuda/matrix_mul/cublasLt_wrapper.cpp b/dnn/src/cuda/matrix_mul/cublasLt_wrapper.cpp
index d79c743483dc899a0f718873873e8b01415517ea..1226fb6c90429dc50cd82c5b1920f0620abfb9c0 100644
--- a/dnn/src/cuda/matrix_mul/cublasLt_wrapper.cpp
+++ b/dnn/src/cuda/matrix_mul/cublasLt_wrapper.cpp
@@ -313,6 +313,19 @@ bool CUBLASLTMatmulDesc::get_algorithm_heuristic(const SizeArgs& args,
     cublas_check(cublasLtMatmulPreferenceSetAttribute(
             algo_pref, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &algo_ws_limit,
             sizeof(algo_ws_limit)));
+#if CUDA_VERSION < 11000
+    bool is_f32_config = args.layout_a.dtype == dtype::Float32() &&
+                         args.layout_b.dtype == dtype::Float32() &&
+                         args.layout_c.dtype == dtype::Float32();
+    if (is_f32_config) {
+        // disable HMMA tensor op matmul when inputs and output are all f32
+        // tensors, to avoid the potential accuracy loss
+        uint32_t math_mode = CUBLAS_DEFAULT_MATH;
+        cublas_check(cublasLtMatmulPreferenceSetAttribute(
+                algo_pref, CUBLASLT_MATMUL_PREF_MATH_MODE_MASK, &math_mode,
+                sizeof(math_mode)));
+    }
+#endif
     status = cublasLtMatmulAlgoGetHeuristic(
             cublasLt_handle, matmul_desc,
             dt_c == CUDA_R_32I ? layout_trans_b : layout_b,
diff --git a/dnn/test/cuda/cutlass_matmul.cpp b/dnn/test/cuda/cutlass_matmul.cpp
index 50e42e96f835c4df4096b3425300e4dabd147f73..913a6ec618d8559832282e1ef3eb0acc65e862f7 100644
--- a/dnn/test/cuda/cutlass_matmul.cpp
+++ b/dnn/test/cuda/cutlass_matmul.cpp
@@ -215,6 +215,7 @@ std::vector<BenchArgs> get_feat_model_args() {
     return args;
 }
 
+#if CUDA_VERSION >= 10020
 std::vector<BenchArgs> get_f16_feat_model_args() {
     std::vector<BenchArgs> args;
     args.emplace_back(BenchArgs{128, 9216, 9216});
@@ -222,6 +223,7 @@ std::vector<BenchArgs> get_f16_feat_model_args() {
     args.emplace_back(BenchArgs{128, 5184, 5184});
     return args;
 }
+#endif
 
 void benchmark_matrix_mul(
         Handle* handle, const std::vector<BenchArgs>& args, DType A_dtype,
diff --git a/dnn/test/cuda/matrix_mul.cpp b/dnn/test/cuda/matrix_mul.cpp
index 4cc9cf811b1937ce70519bbe5a39225072bc7e03..f2f6465a015a1b7be7892beae2237bb96184db63 100644
--- a/dnn/test/cuda/matrix_mul.cpp
+++ b/dnn/test/cuda/matrix_mul.cpp
@@ -473,7 +473,34 @@ TEST_F(CUDA, MATRIX_MUL_CUBLASLT_INT8) {
                 .execs({A, B, {}});
     }
 }
+TEST_F(CUDA, MATRIX_MUL_CUBLASLT_F32) {
+    require_compute_capability(7, 5);
+    size_t m = 128, n = 1024, k = 18432;
+    Checker<MatrixMul> checker(handle_cuda());
+    checker.set_before_exec_callback(
+            AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
+    using Param = MatrixMul::Param;
+
+    Param param;
+    DType stype = dtype::Float32();
+    DType dtype = dtype::Float32();
+    TensorShape A, B;
+    param.transposeA = param.transposeB = 0;
+    if (param.transposeA)
+        A = TensorShape{k, m};
+    else
+        A = TensorShape{m, k};
+    if (param.transposeB)
+        B = TensorShape{n, k};
+    else
+        B = TensorShape{k, n};
+    checker.set_param(param)
+            .set_dtype(0, stype)
+            .set_dtype(1, stype)
+            .set_dtype(2, dtype)
+            .execs({A, B, {}});
+}
 } // namespace test
 } // namespace megdnn
 
 // vim: syntax=cpp.doxygen