diff --git a/dnn/src/cuda/matrix_mul/cublasLt_wrapper.cpp b/dnn/src/cuda/matrix_mul/cublasLt_wrapper.cpp
index d79c743483dc899a0f718873873e8b01415517ea..1226fb6c90429dc50cd82c5b1920f0620abfb9c0 100644
--- a/dnn/src/cuda/matrix_mul/cublasLt_wrapper.cpp
+++ b/dnn/src/cuda/matrix_mul/cublasLt_wrapper.cpp
@@ -313,6 +313,19 @@ bool CUBLASLTMatmulDesc::get_algorithm_heuristic(const SizeArgs& args,
     cublas_check(cublasLtMatmulPreferenceSetAttribute(
             algo_pref, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &algo_ws_limit,
             sizeof(algo_ws_limit)));
+#if CUDA_VERSION < 11000
+    bool is_f32_config = args.layout_a.dtype == dtype::Float32() &&
+                         args.layout_b.dtype == dtype::Float32() &&
+                         args.layout_c.dtype == dtype::Float32();
+    if (is_f32_config) {
+        // disable HMMA tensor op matmul when inputs and output are all f32
+        // tensors, to avoid the potential accuracy loss
+        uint32_t math_mode = CUBLAS_DEFAULT_MATH;
+        cublas_check(cublasLtMatmulPreferenceSetAttribute(
+                algo_pref, CUBLASLT_MATMUL_PREF_MATH_MODE_MASK, &math_mode,
+                sizeof(math_mode)));
+    }
+#endif
     status = cublasLtMatmulAlgoGetHeuristic(
             cublasLt_handle, matmul_desc,
             dt_c == CUDA_R_32I ? layout_trans_b : layout_b,
diff --git a/dnn/test/cuda/cutlass_matmul.cpp b/dnn/test/cuda/cutlass_matmul.cpp
index 50e42e96f835c4df4096b3425300e4dabd147f73..913a6ec618d8559832282e1ef3eb0acc65e862f7 100644
--- a/dnn/test/cuda/cutlass_matmul.cpp
+++ b/dnn/test/cuda/cutlass_matmul.cpp
@@ -215,6 +215,7 @@ std::vector<BenchArgs> get_feat_model_args() {
     return args;
 }
 
+#if CUDA_VERSION >= 10020
 std::vector<BenchArgs> get_f16_feat_model_args() {
     std::vector<BenchArgs> args;
     args.emplace_back(BenchArgs{128, 9216, 9216});
@@ -222,6 +223,7 @@ std::vector<BenchArgs> get_f16_feat_model_args() {
     args.emplace_back(BenchArgs{128, 5184, 5184});
     return args;
 }
+#endif
 
 void benchmark_matrix_mul(
         Handle* handle, const std::vector<BenchArgs>& args, DType A_dtype,
diff --git a/dnn/test/cuda/matrix_mul.cpp b/dnn/test/cuda/matrix_mul.cpp
index 4cc9cf811b1937ce70519bbe5a39225072bc7e03..f2f6465a015a1b7be7892beae2237bb96184db63 100644
--- a/dnn/test/cuda/matrix_mul.cpp
+++ b/dnn/test/cuda/matrix_mul.cpp
@@ -473,7 +473,34 @@ TEST_F(CUDA, MATRIX_MUL_CUBLASLT_INT8) {
                 .execs({A, B, {}});
     }
 }
+TEST_F(CUDA, MATRIX_MUL_CUBLASLT_F32) {
+    require_compute_capability(7, 5);
+    size_t m = 128, n = 1024, k = 18432;
+    Checker<MatrixMul> checker(handle_cuda());
+    checker.set_before_exec_callback(
+            AlgoChecker<MatrixMulForward>("CUBLAS_LT"));
+    using Param = MatrixMul::Param;
+
+    Param param;
+    DType stype = dtype::Float32();
+    DType dtype = dtype::Float32();
+    TensorShape A, B;
+    param.transposeA = param.transposeB = 0;
+    if (param.transposeA)
+        A = TensorShape{k, m};
+    else
+        A = TensorShape{m, k};
+    if (param.transposeB)
+        B = TensorShape{n, k};
+    else
+        B = TensorShape{k, n};
+    checker.set_param(param)
+            .set_dtype(0, stype)
+            .set_dtype(1, stype)
+            .set_dtype(2, dtype)
+            .execs({A, B, {}});
+}
 } // namespace test
 } // namespace megdnn
 
 // vim: syntax=cpp.doxygen