diff --git a/dnn/src/cuda/conv_bias/batched_matmul.cpp b/dnn/src/cuda/conv_bias/batched_matmul.cpp index 543e88b8bee00ce11199255c0ec083ce14da5594..08f53cc8504e5e88b929d2b6c0229e1b116fff63 100644 --- a/dnn/src/cuda/conv_bias/batched_matmul.cpp +++ b/dnn/src/cuda/conv_bias/batched_matmul.cpp @@ -44,6 +44,9 @@ std::pair sub_opr_config( B.dtype = src_layout.dtype; C = {{dst_layout.shape[0], dst_layout.shape[1], B.shape[2]}, dst_layout.dtype}; + C.stride[2] = 1; + C.stride[1] = dst_layout.stride[1]; + C.stride[0] = dst_layout.stride[0]; MatrixMulForward::Param param; if (opr->param().compute_mode == param::Convolution::ComputeMode::FLOAT32) { @@ -89,6 +92,8 @@ bool ConvBiasForwardImpl::AlgoBatchedMatmul::is_available( return false; auto config = prepare_sub_opr(args); + //! The dst of batched matmul should be contiguous + if (!config.first[2].is_contiguous()) return false; auto&& fm = args.filter_meta; return fm.format == Param::Format::NCHW && diff --git a/dnn/src/cuda/conv_bias/group_conv.cpp b/dnn/src/cuda/conv_bias/group_conv.cpp index ee0df4476d6dabffc7803513a2c720d57dc771cf..49d9e3d5bc9e791027fd93e8217eb00d700d80ac 100644 --- a/dnn/src/cuda/conv_bias/group_conv.cpp +++ b/dnn/src/cuda/conv_bias/group_conv.cpp @@ -109,7 +109,8 @@ void ConvBiasForwardImpl::AlgoGroupConvGeneral::exec( auto sub_args = args; sub_args.dst_tensor = &conv_dst_tensor; sub_args.dst_layout = &conv_dst_tensor.layout; - TensorND tsrc{*args.src_tensor}, tdst{conv_dst_tensor}, tbias{*args.bias_tensor}; + TensorND tsrc{*args.src_tensor}, tdst{conv_dst_tensor}, + tbias{*args.bias_tensor}; SmallVector flt_shape(0); std::vector flt_stride(0); size_t idx = 0; diff --git a/dnn/test/common/matrix_mul.cpp b/dnn/test/common/matrix_mul.cpp index f1f96dc2e6862958c315669470789ead10af00ab..4cbe5c77a00cf312351c0bcac2245c27f1424af7 100644 --- a/dnn/test/common/matrix_mul.cpp +++ b/dnn/test/common/matrix_mul.cpp @@ -17,6 +17,8 @@ using namespace megdnn; using namespace test; +constexpr size_t 
matrix_mul::TestArg::UNSET_STRIDE_VAL; + std::vector matrix_mul::get_matmul_args_no_mask() { std::vector args; @@ -57,7 +59,9 @@ matrix_mul::get_batched_matmul_args_cublaslt() { // so please uncomment it if the bug is fixed for (size_t k : {32, 64}) { - args.emplace_back(m, n, k, 0, 0, 0, 0, 2); + args.emplace_back(m, n, k, 0, TestArg::UNSET_STRIDE_VAL, + TestArg::UNSET_STRIDE_VAL, + TestArg::UNSET_STRIDE_VAL, 2); } } } @@ -70,7 +74,9 @@ matrix_mul::get_batched_matmul_args_int8x8x32() { for (size_t m : {1, 2, 3, 4, 5, 8, 64}) { for (size_t n : {1, 2, 3, 4, 5, 8, 64}) { for (size_t k : {1, 2, 3, 4, 5, 8, 64}) { - args.emplace_back(m, n, k, 0, 0, 0, 0, 2); + args.emplace_back(m, n, k, 0, TestArg::UNSET_STRIDE_VAL, + TestArg::UNSET_STRIDE_VAL, + TestArg::UNSET_STRIDE_VAL, 2); } } } @@ -136,6 +142,30 @@ std::vector matrix_mul::get_batched_matmul_args() { return args; } +std::vector +matrix_mul::get_batched_matmul_broadcast_args() { + std::vector args; + for (size_t mask = 0; mask < 4; ++mask) { + std::vector args_temp = + matrix_mul::get_batched_matmul_broadcast_args_mask(mask); + for (auto arg : args_temp) + args.emplace_back(arg); + } + return args; +} + +std::vector +matrix_mul::get_batched_matmul_broadcast_args_mask(uint8_t mask) { + std::vector args; + std::vector args_temp = + matrix_mul::get_batched_matmul_args_mask(mask); + for (auto arg : args_temp) { + args.emplace_back(arg); + args.back().A_batch_stride = 0; + } + return args; +} + template void matrix_mul::check_matrix_mul(DType A_dtype, DType B_dtype, DType C_dtype, Handle* handle, @@ -170,9 +200,9 @@ void matrix_mul::check_matrix_mul(DType A_dtype, DType B_dtype, DType C_dtype, checker.set_rng(0, rng.get()).set_rng(1, rng.get()); } - //! return expect if stride == 0, stride otherwise + //! 
return expect if stride == -1, stride otherwise auto stride_val = [](size_t stride, size_t expect) -> size_t { - if (stride == 0) { + if (stride == TestArg::UNSET_STRIDE_VAL) { return expect; } else { return stride; diff --git a/dnn/test/common/matrix_mul.h b/dnn/test/common/matrix_mul.h index 5da5bcec0330aa60cc2e373c3f176361f2f2fb2b..d52f1814aac4120902b9dd68ffa87af0bd16c731 100644 --- a/dnn/test/common/matrix_mul.h +++ b/dnn/test/common/matrix_mul.h @@ -24,15 +24,19 @@ namespace matrix_mul { // mask & 1 denotes transposeA; mask & 2 denotes transposeB struct TestArg { + constexpr static size_t UNSET_STRIDE_VAL = static_cast<size_t>(-1); size_t m, n, k, mask; size_t A_stride, B_stride, C_stride, b; size_t A_batch_stride, B_batch_stride, C_batch_stride; // stride = 0 means the default stride, the dim is contiguous, i.e. the // stride value which makes tensor compact. - TestArg(size_t m, size_t n, size_t k, size_t mask, size_t A_stride = 0, - size_t B_stride = 0, size_t C_stride = 0, size_t b = 1, - size_t A_batch_stride = 0, size_t B_batch_stride = 0, - size_t C_batch_stride = 0) + TestArg(size_t m, size_t n, size_t k, size_t mask, + size_t A_stride = UNSET_STRIDE_VAL, + size_t B_stride = UNSET_STRIDE_VAL, + size_t C_stride = UNSET_STRIDE_VAL, size_t b = 1, + size_t A_batch_stride = UNSET_STRIDE_VAL, + size_t B_batch_stride = UNSET_STRIDE_VAL, + size_t C_batch_stride = UNSET_STRIDE_VAL) : m{m}, n{n}, k{k}, @@ -51,6 +55,8 @@ std::vector<TestArg> get_matmul_args_mask(uint8_t mask); std::vector<TestArg> get_matmul_args(); std::vector<TestArg> get_batched_matmul_args_mask(uint8_t mask); std::vector<TestArg> get_batched_matmul_args(); +std::vector<TestArg> get_batched_matmul_broadcast_args(); +std::vector<TestArg> get_batched_matmul_broadcast_args_mask(uint8_t mask); std::vector<TestArg> get_matmul_mk_packed_args(size_t nbase); std::vector<TestArg> get_batched_matmul_args_cublaslt(); std::vector<TestArg> get_batched_matmul_args_int8x8x32(); diff --git a/dnn/test/cuda/batched_matrix_mul.cpp b/dnn/test/cuda/batched_matrix_mul.cpp index 
d237416de52b44906bef7f12e344a7042d0a584f..2a942a44b595da02ad1d24e43b542973023bc19b 100644 --- a/dnn/test/cuda/batched_matrix_mul.cpp +++ b/dnn/test/cuda/batched_matrix_mul.cpp @@ -8,6 +8,7 @@ * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +#include #include "test/cuda/fixture.h" #include "test/common/checker.h" @@ -62,6 +63,30 @@ TEST_F(CUDA, BATCHED_MATRIX_MUL_LT_F32_PART4) { #undef F32_TEST_PART +TEST_F(CUDA, BATCHED_MATRIX_MUL_F32_BROADCAST_PART1){ + matrix_mul::check_batched_matrix_mul( + dtype::Float32{}, dtype::Float32{}, {}, handle_cuda(), "CUBLAS", + 1e-3, matrix_mul::get_batched_matmul_broadcast_args_mask(0)); +} + +TEST_F(CUDA, BATCHED_MATRIX_MUL_F32_BROADCAST_PART2){ + matrix_mul::check_batched_matrix_mul( + dtype::Float32{}, dtype::Float32{}, {}, handle_cuda(), "CUBLAS", + 1e-3, matrix_mul::get_batched_matmul_broadcast_args_mask(1)); +} + +TEST_F(CUDA, BATCHED_MATRIX_MUL_F32_BROADCAST_PART3){ + matrix_mul::check_batched_matrix_mul( + dtype::Float32{}, dtype::Float32{}, {}, handle_cuda(), "CUBLAS", + 1e-3, matrix_mul::get_batched_matmul_broadcast_args_mask(2)); +} + +TEST_F(CUDA, BATCHED_MATRIX_MUL_F32_BROADCAST_PART4){ + matrix_mul::check_batched_matrix_mul( + dtype::Float32{}, dtype::Float32{}, {}, handle_cuda(), "CUBLAS", + 1e-3, matrix_mul::get_batched_matmul_broadcast_args_mask(3)); +} + TEST_F(CUDA, BATCHED_MATRIX_MUL_F32_BRUTE_FORCE_PART1) { matrix_mul::check_batched_matrix_mul( dtype::Float32{}, dtype::Float32{}, {}, handle_cuda(), diff --git a/dnn/test/cuda/dilated_convolution.cpp b/dnn/test/cuda/dilated_convolution.cpp index 97c13a30691af8078d0d21890ed5ac3ddd65b5d4..302ca47f90bf2e9109dec3f6264e4b5c46e8277d 100644 --- a/dnn/test/cuda/dilated_convolution.cpp +++ b/dnn/test/cuda/dilated_convolution.cpp @@ -75,8 +75,8 @@ TEST_F(CUDA, DILATED_CONVOLUTION_BACKWARD_DATA) "CUDNN_CONVOLUTION_BWD_DATA_ALGO_1" CUDNN_VERSION_STRING)); printf("cudnn 
version >= 7.5, use cudnn impl for dilated convolution\n"); #else - checker.set_before_exec_callback( - AlgoChecker<ConvolutionBackwardData>("MATMUL")); + checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardData>( + ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}})); #endif NormalRNG default_rng; for (auto &&arg: args) { @@ -139,8 +139,8 @@ TEST_F(CUDA, DILATED_CONVOLUTION_BACKWARD_FILTER) "CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1" CUDNN_VERSION_STRING)); printf("cudnn version >= 7.5, use cudnn impl for dilated convolution\n"); #else - checker.set_before_exec_callback( - AlgoChecker<ConvolutionBackwardFilter>("MATMUL")); + checker.set_before_exec_callback(AlgoChecker<ConvolutionBackwardFilter>( + ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}})); #endif NormalRNG default_rng; bool first_run = true; diff --git a/dnn/test/cuda/matrix_mul.cpp b/dnn/test/cuda/matrix_mul.cpp index 2b7f293db762193264eb71e7e52592176e403c73..9ba0b3cdb762b9ccf5f0d2c3562ba01ad14ca072 100644 --- a/dnn/test/cuda/matrix_mul.cpp +++ b/dnn/test/cuda/matrix_mul.cpp @@ -51,7 +51,8 @@ TEST_F(CUDA, MATRIX_MUL_QUANTIZED4x4x32) { if (cuda::current_device_prop().major < 7 || (cuda::current_device_prop().major == 7 && cuda::current_device_prop().minor < 5)) { - printf("Skip CUDA.MATRIX_MUL_QUANTIZED4x4x32 test as current device doesn't support\n"); + printf("Skip CUDA.MATRIX_MUL_QUANTIZED4x4x32 test as current device " "doesn't support\n"); return; } Checker<MatrixMul> checker(handle_cuda(), false); @@ -257,19 +258,19 @@ TEST_F(CUDA, MATRIX_MUL) { BS = TensorShape{k, n}; CS = TensorShape{m, n}; TensorLayout AL, BL, CL; - if (arg.A_stride == 0) { + if (arg.A_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) { AL = TensorLayout(AS, dtype::Float32()); } else { AL = TensorLayout(AS, {ptrdiff_t(arg.A_stride), 1}, dtype::Float32()); } - if (arg.B_stride == 0) { + if (arg.B_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) { BL = TensorLayout(BS, dtype::Float32()); } else { BL = TensorLayout(BS, {ptrdiff_t(arg.B_stride), 1}, dtype::Float32()); } - if (arg.C_stride == 0) { + if (arg.C_stride == 
matrix_mul::TestArg::UNSET_STRIDE_VAL) { CL = TensorLayout(CS, dtype::Float32()); } else { CL = TensorLayout(CS, {ptrdiff_t(arg.C_stride), 1}, @@ -285,8 +286,9 @@ TEST_F(CUDA, MATRIX_MUL_CUBLASLT) NormalRNG normal_rng; Checker<MatrixMul> checker(handle_cuda()); checker.set_rng(0, &normal_rng) - .set_rng(1, &normal_rng) - .set_before_exec_callback(AlgoChecker<MatrixMulForward>("CUBLAS_LT")); + .set_rng(1, &normal_rng) + .set_before_exec_callback( + AlgoChecker<MatrixMulForward>("CUBLAS_LT")); using Param = MatrixMul::Param; size_t m = 32, n = 32, k = 32; // test Int8 matmul @@ -350,19 +352,19 @@ TEST_F(CUDA, MATRIX_MUL_CUBLASLT) BS = TensorShape{k, n}; CS = TensorShape{m, n}; TensorLayout AL, BL, CL; - if (arg.A_stride == 0) { + if (arg.A_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) { AL = TensorLayout(AS, dtype::Float32()); } else { AL = TensorLayout(AS, {ptrdiff_t(arg.A_stride), 1}, dtype::Float32()); } - if (arg.B_stride == 0) { + if (arg.B_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) { BL = TensorLayout(BS, dtype::Float32()); } else { BL = TensorLayout(BS, {ptrdiff_t(arg.B_stride), 1}, dtype::Float32()); } - if (arg.C_stride == 0) { + if (arg.C_stride == matrix_mul::TestArg::UNSET_STRIDE_VAL) { CL = TensorLayout(CS, dtype::Float32()); } else { CL = TensorLayout(CS, {ptrdiff_t(arg.C_stride), 1},