diff --git a/dnn/src/cuda/conv_bias/algo.h b/dnn/src/cuda/conv_bias/algo.h
index 8368aafac031912bb634e4e32dd574ea0b3cb3c5..213062c1dc2fb161fa208995e84b5b50d4bdc2f8 100644
--- a/dnn/src/cuda/conv_bias/algo.h
+++ b/dnn/src/cuda/conv_bias/algo.h
@@ -361,9 +361,6 @@ private:
 };
 
 class ConvBiasForwardImpl::AlgoBatchedMatmul final : public AlgoBase {
-    static void extract_matmul_layouts(const SizeArgs& args, TensorLayout& A,
-                                       TensorLayout& B, TensorLayout& C);
-
 public:
     bool is_available(const SizeArgs& args) const override;
     size_t get_workspace_in_bytes(const SizeArgs& args) const override;
@@ -372,10 +369,15 @@ public:
     const char* name() const override {
         if (m_name.empty()) {
             m_name = ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
-                    "BATCHEDMATMUL", {});
+                    "BATCHED_MATMUL", {});
         }
         return m_name.c_str();
     }
+
+    std::vector<SearchItem> get_subopr_list(
+            const TensorLayoutArray& layouts,
+            const OperatorBase* opr) const override;
+
     bool is_reproducible() const override { return true; }
 
     MEGDNN_DECL_ALGO_TYPE(CUDA_BATCHED_MATMUL)
diff --git a/dnn/src/cuda/conv_bias/batched_matmul.cpp b/dnn/src/cuda/conv_bias/batched_matmul.cpp
index cc5da62596e5f1696dbbfb5b6cf041f7e00b6ecd..0bee59852ab0ff653092a544bb4e44aee2982cc1 100644
--- a/dnn/src/cuda/conv_bias/batched_matmul.cpp
+++ b/dnn/src/cuda/conv_bias/batched_matmul.cpp
@@ -6,10 +6,13 @@
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
  */
 
+#include "src/common/algo_chooser.h"
 #include "src/common/conv_bias.h"
+#include "src/cuda/batched_matrix_mul/algo.h"
 #include "src/cuda/conv_bias/algo.h"
 #include "src/cuda/handle.h"
 #include "src/cuda/utils.cuh"
@@ -18,18 +21,72 @@
 using namespace megdnn;
 using namespace cuda;
 using namespace conv_bias;
 
+namespace {
+std::pair<TensorLayoutArray, MatrixMulForward::Param> sub_opr_config(
+        const ConvBiasForwardImpl::CanonizedFilterMeta& fm,
+        const TensorLayout& src_layout, const TensorLayout&,
+        const TensorLayout& dst_layout, const ConvBiasForwardImpl* opr) {
+    // A {N, OC, IC}
+    // B {N, IC, H * W}
+    // C {N, OC, H * W}
+    size_t batched = src_layout.shape[0];
+    TensorLayout A, B, C;
+    A = {{batched, fm.ocpg, fm.icpg}, fm.dtype};
+    A.stride[0] = 0;
+    B.ndim = 3;
+    B.shape[1] = src_layout.shape[1];
+    B.shape[2] = src_layout.shape[2] * src_layout.shape[3];
+    B.shape[0] = batched;
+    B.stride[2] = 1;
+    B.stride[1] = src_layout.stride[1];
+    B.stride[0] = src_layout.stride[0];
+    B.dtype = src_layout.dtype;
+    C = {{dst_layout.shape[0], dst_layout.shape[1], B.shape[2]},
+         dst_layout.dtype};
+
+    MatrixMulForward::Param param;
+    if (opr->param().compute_mode == param::Convolution::ComputeMode::FLOAT32) {
+        param.compute_mode = param::MatrixMul::ComputeMode::FLOAT32;
+    }
+
+    return {{A, B, C}, param};
+}
+}  // namespace
+
+std::vector<Algorithm::SearchItem>
+ConvBiasForwardImpl::AlgoBatchedMatmul::get_subopr_list(
+        const TensorLayoutArray& layouts, const OperatorBase* opr) const {
+    const ConvBiasForwardImpl* conv_bias_opr =
+            static_cast<const ConvBiasForwardImpl*>(opr);
+    CanonizedFilterMeta fm =
+            conv_bias_opr->check_layout_fwd(layouts[0], layouts[1], layouts[4]);
+    auto&& config = sub_opr_config(fm, layouts[0], layouts[1], layouts[4],
+                                   conv_bias_opr);
+
+    std::string param_str;
+    Algorithm::serialize_write_pod(config.second, param_str);
+    return {{Algorithm::OprType::BATCHED_MATRIX_MUL_FORWARD, param_str,
+             config.first}};
+}
+
 bool ConvBiasForwardImpl::AlgoBatchedMatmul::is_available(
         const SizeArgs& args) const {
     if (args.z_layout->ndim > 0)
         return false;
-    //! cudnn batched matmul with discontinuous stride has many bugs, so disable
-    //! here.
-    TensorLayout A, B, C;
-    extract_matmul_layouts(args, A, B, C);
-    if (!B.is_contiguous()) {
-        return false;
+    auto bmatmul_opr = args.handle->create_operator<BatchedMatrixMulForward>();
+    if (args.opr->execution_policy().algo.valid() &&
+        !args.opr->execution_policy().sub_policy.empty()) {
+        megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1);
+        bmatmul_opr->execution_policy() =
+                args.opr->execution_policy().sub_policy[0];
     }
+
+    auto&& config =
+            sub_opr_config(args.filter_meta, *args.src_layout,
+                           *args.filter_layout, *args.dst_layout, args.opr);
+    bmatmul_opr->param() = config.second;
+
     auto&& fm = args.filter_meta;
     return fm.format == Param::Format::NCHW &&
            (fm.dtype.enumv() == DTypeEnum::Float32 ||
@@ -37,29 +94,10 @@ bool ConvBiasForwardImpl::AlgoBatchedMatmul::is_available(
            fm.spatial_ndim == 2 && fm.group == 1 && fm.dilation[0] == 1 &&
            fm.dilation[1] == 1 && fm.spatial[0] == 1 && fm.spatial[1] == 1 &&
            fm.padding[0] == 0 && fm.padding[1] == 0 && fm.stride[0] == 1 &&
-           fm.stride[1] == 1;
-}
-
-void ConvBiasForwardImpl::AlgoBatchedMatmul::extract_matmul_layouts(
-        const SizeArgs& args, TensorLayout& A, TensorLayout& B,
-        TensorLayout& C) {
-    auto&& fm = args.filter_meta;
-    // A {N, OC, IC}
-    // B {N, IC, H * W}
-    // C {N, OC, H * W}
-    size_t batched = args.src_layout->shape[0];
-    A = {{batched, fm.ocpg, fm.icpg}, fm.dtype};
-    A.stride[0] = 0;
-    B.ndim = 3;
-    B.shape[1] = args.src_layout->shape[1];
-    B.shape[2] = args.src_layout->shape[2] * args.src_layout->shape[3];
-    B.shape[0] = batched;
-    B.stride[2] = 1;
-    B.stride[1] = args.src_layout->stride[1];
-    B.stride[0] = args.src_layout->stride[0];
-    B.dtype = args.src_layout->dtype;
-    C = {{args.dst_layout->shape[0], args.dst_layout->shape[1], B.shape[2]},
-         args.dst_layout->dtype};
+           fm.stride[1] == 1 &&
+           get_algorithm(
+                   static_cast<BatchedMatrixMulForwardImpl*>(bmatmul_opr.get()),
+                   config.first[0], config.first[1], config.first[2]);
 }
 
 WorkspaceBundle ConvBiasForwardImpl::AlgoBatchedMatmul::get_workspace_bundle(
@@ -76,11 +114,23 @@ WorkspaceBundle ConvBiasForwardImpl::AlgoBatchedMatmul::get_workspace_bundle(
 
     SizeArgs conv_args = args;
     conv_args.dst_layout = &dst_layout;
-    TensorLayout A, B, C;
-    extract_matmul_layouts(conv_args, A, B, C);
-    sizes.insert(
-            sizes.begin(),
-            args.handle->batched_matrix_mul()->get_workspace_in_bytes(A, B, C));
+
+    auto bmatmul_opr = args.handle->create_operator<BatchedMatrixMulForward>();
+    if (args.opr->execution_policy().algo.valid() &&
+        !args.opr->execution_policy().sub_policy.empty()) {
+        megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1);
+        bmatmul_opr->execution_policy() =
+                args.opr->execution_policy().sub_policy[0];
+    }
+
+    auto&& config =
+            sub_opr_config(args.filter_meta, *args.src_layout,
+                           *args.filter_layout, *args.dst_layout, args.opr);
+    bmatmul_opr->param() = config.second;
+
+    sizes.insert(sizes.begin(),
+                 args.handle->batched_matrix_mul()->get_workspace_in_bytes(
+                         config.first[0], config.first[1], config.first[2]));
     return {ptr, std::move(sizes)};
 }
 
@@ -104,13 +154,23 @@ void ConvBiasForwardImpl::AlgoBatchedMatmul::exec(const ExecArgs& args) const {
     conv_args.dst_tensor = &conv_dst_tensor;
     conv_args.dst_layout = &conv_dst_tensor.layout;
     {
-        TensorND A, B, C;
-        extract_matmul_layouts(args, A.layout, B.layout, C.layout);
-        A.raw_ptr = args.filter_tensor->raw_ptr;
-        B.raw_ptr = args.src_tensor->raw_ptr;
-        C.raw_ptr = args.dst_tensor->raw_ptr;
-        auto mm = args.handle->batched_matrix_mul();
-        mm->exec(A, B, C, bundle.get_workspace(0));
+        auto bmatmul_opr =
+                args.handle->create_operator<BatchedMatrixMulForward>();
+        if (args.opr->execution_policy().algo.valid()) {
+            megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1);
+            bmatmul_opr->execution_policy() =
+                    args.opr->execution_policy().sub_policy[0];
+        }
+
+        auto&& config =
+                sub_opr_config(args.filter_meta, *args.src_layout,
+                               *args.filter_layout, *args.dst_layout, args.opr);
+        bmatmul_opr->param() = config.second;
+
+        TensorND A{args.filter_tensor->raw_ptr, config.first[0]},
+                B{args.src_tensor->raw_ptr, config.first[1]},
+                C{args.dst_tensor->raw_ptr, config.first[2]};
+        bmatmul_opr->exec(A, B, C, bundle.get_workspace(0));
     }
     handle_bias_and_nonlinear(args.handle, args.nonlinear_mode,
                               &conv_dst_tensor, args.dst_tensor,
diff --git a/dnn/test/common/opr_proxy.h b/dnn/test/common/opr_proxy.h
index 388c990f403e907db680f99642d0f7b159df832f..8ca2f68dc653638d5358d2b3d8a236db61483699 100644
--- a/dnn/test/common/opr_proxy.h
+++ b/dnn/test/common/opr_proxy.h
@@ -46,6 +46,7 @@ struct OprTypeFromOprTrait;
     }
 
 cb(MATRIX_MUL_FORWARD, MatrixMulForward);
+cb(BATCHED_MATRIX_MUL_FORWARD, BatchedMatrixMulForward);
 cb(CONVOLUTION_FORWARD, ConvolutionForward);
 cb(CONVOLUTION_BACKWARD_DATA, ConvolutionBackwardData);
 cb(CONVOLUTION_BACKWARD_FILTER, ConvolutionBackwardFilter);
@@ -66,6 +67,7 @@ cb(CONVBIAS_FORWARD, ConvBiasForward);
 // clang-format off
 #define FOREACH_OPR_TYPE(cb) \
     cb(MATRIX_MUL_FORWARD) \
+    cb(BATCHED_MATRIX_MUL_FORWARD) \
     cb(CONVOLUTION_FORWARD) \
     cb(CONVOLUTION_BACKWARD_DATA) \
     cb(CONVOLUTION_BACKWARD_FILTER) \
@@ -83,6 +85,7 @@ cb(CONVBIAS_FORWARD, ConvBiasForward);
 
 #define FOREACH_OPR_TYPE_WITH_STMT(cb, stmt) \
     cb(MATRIX_MUL_FORWARD, stmt) \
+    cb(BATCHED_MATRIX_MUL_FORWARD, stmt) \
     cb(CONVOLUTION_FORWARD, stmt) \
     cb(CONVOLUTION_BACKWARD_DATA, stmt) \
     cb(CONVOLUTION_BACKWARD_FILTER, stmt) \
diff --git a/dnn/test/cuda/conv_bias.cpp b/dnn/test/cuda/conv_bias.cpp
index 5c382d4d450cf3ca23fbf6c9b3d666bae140047a..901bc371dc5afcb537b2904fad042354f688c641 100644
--- a/dnn/test/cuda/conv_bias.cpp
+++ b/dnn/test/cuda/conv_bias.cpp
@@ -821,7 +821,7 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_MATMUL_NCHW4) {
             {{8, 64, 12, 12, 4}, {256, 64, 3, 3, 4}, {1, 64, 1, 1, 4}, {}, {}});
 }
 
-TEST_F(CUDA, CONV_BIAS_FORWARD_MATMUL_1x1) {
+TEST_F(CUDA, CONV_BIAS_FORWARD_BATCHED_MATMUL) {
     using namespace conv_bias;
     std::vector<TestArg> args = get_args_1x1();
     Checker<ConvBiasForward> checker(handle_cuda());
@@ -834,13 +834,15 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_MATMUL_1x1) {
             .set_rng(1, &default_rng)
             .set_rng(2, &default_rng)
             .set_epsilon(1e-3);
+    checker.set_before_exec_callback(
+            AlgoChecker<ConvBiasForward>(ExecutionPolicyAlgoName{
+                    ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
+                            "BATCHED_MATMUL", {})
+                            .c_str(),
+                    {{"CUBLAS", {}}}}));
+
     for (auto&& arg : args) {
         checker.set_param(arg.param);
-        checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<
-                                         ConvBias>(
-                ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
-                        "BATCHEDMATMUL", {})
-                        .c_str()));
         checker.execs({arg.src, arg.filter, arg.bias, {}, {}});
     }
 }