diff --git a/dnn/src/cuda/conv_bias/algo.cpp b/dnn/src/cuda/conv_bias/algo.cpp
index adf6a93c17ad122512608f49c2f7d21b9c759640..fd290e08aa61aa28ff0135f73640ea40babeda74 100644
--- a/dnn/src/cuda/conv_bias/algo.cpp
+++ b/dnn/src/cuda/conv_bias/algo.cpp
@@ -42,24 +42,11 @@ ConvBiasForwardImpl::AlgoPack::AlgoPack() {
     conv_algos.push_back(&matmul);
     conv_algos.push_back(&matmul8x8x32);
     conv_algos.push_back(&batched_matmul);
-
-    conv_algos.reserve(conv_algos.size() * 2);
-    //! add gconv algos by AlgoGroupConvGeneral
-    size_t algo_size = conv_algos.size();
-    for (size_t i = 3; i < algo_size; ++i) {
-        gconv_refhold.emplace_back(new AlgoGroupConvGeneral(conv_algos[i]));
-        algo2gconv[conv_algos[i]] = gconv_refhold.back().get();
-        conv_algos.push_back(gconv_refhold.back().get());
-    }
+    conv_algos.push_back(&group);

     for (auto&& algo : conv_algos) {
         all_algos.push_back(algo);
     }
-    non_cudnn_algos.push_back(all_algos.rbegin()[4]);  // group inplace_matmul
-    non_cudnn_algos.push_back(all_algos.rbegin()[3]);  // group matmul
-    non_cudnn_algos.push_back(all_algos.rbegin()[2]);  // group matmul_8x8x32
-    non_cudnn_algos.push_back(all_algos.rbegin()[1]);  // group batched_matmul
-    non_cudnn_algos.push_back(all_algos.rbegin()[0]);  // group 1x1

     all_algos.push_back(&bfloat16);
     bfloat16_algos.push_back(&bfloat16);
@@ -118,7 +105,7 @@ ConvBiasForwardImpl::AlgoPack ConvBiasForwardImpl::sm_algo_pack;
 MEGDNN_DEF_GET_ALGO_FROM_DESC(ConvBiasForwardImpl)

 ConvBiasForwardImpl::AlgoBase::SizeArgs::SizeArgs(
-        ConvBiasForwardImpl* o, const TensorLayout& src,
+        const ConvBiasForwardImpl* o, const TensorLayout& src,
         const TensorLayout& filter, const TensorLayout& bias,
         const TensorLayout& z, const TensorLayout& dst,
         const PreprocessedFilter* preprocessed_filter)
@@ -127,7 +114,7 @@ ConvBiasForwardImpl::AlgoBase::SizeArgs::SizeArgs(
                    dst, preprocessed_filter) {}

 ConvBiasForwardImpl::AlgoBase::SizeArgs::SizeArgs(
-        ConvBiasForwardImpl* o, const TensorLayout& src,
+        const ConvBiasForwardImpl* o, const TensorLayout& src,
         const TensorLayout& filter, const CanonizedFilterMeta& filter_meta,
         const TensorLayout& bias, const TensorLayout& z,
         const TensorLayout& dst, const PreprocessedFilter* preprocessed_filter)
diff --git a/dnn/src/cuda/conv_bias/algo.h b/dnn/src/cuda/conv_bias/algo.h
index 691298c6dd4ae1557047378e174c6872a733961b..f58e1d483dc6e3a914185a8a8fc3eccfa4dc7c29 100644
--- a/dnn/src/cuda/conv_bias/algo.h
+++ b/dnn/src/cuda/conv_bias/algo.h
@@ -78,15 +78,15 @@ public:
     AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::CUDA; }
     struct SizeArgs : public conv_bias::BiasForwardSizeArgs {
-        ConvBiasForwardImpl* opr;
+        const ConvBiasForwardImpl* opr;
         const PreprocessedFilter* preprocessed_filter;

         std::string to_string() const;
-        SizeArgs(ConvBiasForwardImpl* opr, const TensorLayout& src,
+        SizeArgs(const ConvBiasForwardImpl* opr, const TensorLayout& src,
                  const TensorLayout& filter, const TensorLayout& bias,
                  const TensorLayout& z, const TensorLayout& dst,
                  const PreprocessedFilter* preprocessed_filter = nullptr);
-        SizeArgs(ConvBiasForwardImpl* opr, const TensorLayout& src,
+        SizeArgs(const ConvBiasForwardImpl* opr, const TensorLayout& src,
                  const TensorLayout& filter,
                  const CanonizedFilterMeta& filter_meta,
                  const TensorLayout& bias, const TensorLayout& z,
@@ -434,27 +434,24 @@ private:
 //! implement group conv by another algo
 class ConvBiasForwardImpl::AlgoGroupConvGeneral final : public AlgoBase {
 public:
-    AlgoGroupConvGeneral(AlgoBase* impl);
-
     bool is_available(const SizeArgs& args) const override;
     size_t get_workspace_in_bytes(const SizeArgs& args) const override;
     void exec(const ExecArgs& args) const override;

-    const char* name() const override { return m_name.c_str(); }
+    std::vector<SearchItem> get_subopr_list(
+            const TensorLayoutArray& layouts,
+            const OperatorBase* opr) const override;

-    AlgoAttribute attribute() const override {
-        auto ret = AlgoAttribute::DEFAULT;
-#define cb(attr)                               \
-    if (m_impl->contain_attribute_all(attr)) { \
-        ret |= attr;                           \
+    const char* name() const override {
+        if (m_name.empty()) {
+            m_name = ConvBiasForward::algo_name<DirectParam>("CUDA:GROUP_CONV",
+                                                             {});
+        }
+        return m_name.c_str();
     }
-    MEGDNN_FOREACH_ALGO_ATTRIBUTE_INHERITABLE(cb)
-#undef cb
-        if (m_impl->contain_attribute_all(AlgoAttribute::REPRODUCIBLE)) {
-            ret |= AlgoAttribute::REPRODUCIBLE;
-        }
-        return ret;
+    AlgoAttribute attribute() const override {
+        return AlgoAttribute::REPRODUCIBLE;
     }

     static void modify_size_args(SizeArgs& args, TensorLayout& src_pg,
@@ -463,8 +460,7 @@ public:

 private:
     WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
-    AlgoBase* m_impl;
-    std::string m_name;
+    mutable std::string m_name;
 };

 #if CUDA_VERSION >= 10000
@@ -1087,9 +1083,8 @@ public:
     std::vector<AlgoInt4Int4NHWCIMMAImplicitGemm> int4_int4_nhwc_imma;
     std::vector<AlgoUInt4Int4NHWCIMMAImplicitGemm> uint4_int4_nhwc_imma;
 #endif
-    std::vector<std::unique_ptr<AlgoGroupConvGeneral>> gconv_refhold;
+    AlgoGroupConvGeneral group;
     AlgoBFloat16 bfloat16;
-    std::unordered_map<AlgoBase*, AlgoGroupConvGeneral*> algo2gconv;

     AlgoBase* cudnn_conv_bias_act_from_enum(cudnnConvolutionFwdAlgo_t algo);
diff --git a/dnn/src/cuda/conv_bias/group_conv.cpp b/dnn/src/cuda/conv_bias/group_conv.cpp
index 49d9e3d5bc9e791027fd93e8217eb00d700d80ac..64d9ceb5ee391ef407a0a1f65164604d5bbae8c2 100644
--- a/dnn/src/cuda/conv_bias/group_conv.cpp
+++ b/dnn/src/cuda/conv_bias/group_conv.cpp
@@ -9,6 +9,7 @@
  * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
+#include <utility>

 #include "src/common/conv_bias.h"
 #include "src/cuda/conv_bias/algo.h"
@@ -16,36 +17,80 @@
 using namespace megdnn;
 using namespace cuda;
 using namespace conv_bias;

-void ConvBiasForwardImpl::AlgoGroupConvGeneral::modify_size_args(
-        ConvBiasForwardImpl::AlgoBase::SizeArgs& args, TensorLayout& src_pg,
-        TensorLayout& dst_pg, TensorLayout& bias_pg) {
-    src_pg = *args.src_layout;
-    dst_pg = *args.dst_layout;
-    bias_pg = *args.bias_layout;
+namespace {
+std::pair<TensorLayoutArray, ConvBiasForward::Param> sub_opr_config(
+        const ConvBiasForwardImpl::AlgoBase::SizeArgs& args) {
+    TensorLayout src_pg = *args.src_layout;
+
+    SmallVector<size_t> flt_shape(0);
+    std::vector<ptrdiff_t> flt_stride(0);
+    size_t idx = 0;
+    // check if the first dim is group
+    if (args.filter_layout->ndim > args.src_layout->ndim)
+        ++idx;
+    for (; idx < args.filter_layout->ndim; ++idx) {
+        flt_shape.push_back(args.filter_layout->shape[idx]);
+        flt_stride.push_back(args.filter_layout->stride[idx]);
+    }
+    TensorLayout filter_pg(flt_shape, flt_stride,
+                           args.filter_layout->dtype,
+                           args.filter_layout->format);
+    TensorLayout bias_pg = *args.bias_layout;
+    TensorLayout z_pg = *args.z_layout;
+    TensorLayout dst_pg = *args.dst_layout;
+
     auto nr_grp = args.filter_meta.group;
-    args.filter_meta.group = 1;
     size_t c_pos;
-    if (args.filter_meta.format == Param::Format::NCHW ||
-        args.filter_meta.format == Param::Format::NCHW4) {
+    if (args.filter_meta.format == megdnn::param::ConvBias::Format::NCHW ||
+        args.filter_meta.format == megdnn::param::ConvBias::Format::NCHW4) {
         c_pos = 1;
     } else {
-        megdnn_assert(args.filter_meta.format == Param::Format::NHWC,
+        megdnn_assert(args.filter_meta.format ==
+                              megdnn::param::ConvBias::Format::NHWC,
                       "invalid conv format");
         c_pos = 3;
     }
     src_pg.shape[c_pos] /= nr_grp;
-    dst_pg.shape[c_pos] /= nr_grp;
     bias_pg.ndim = 0;
-    args.src_layout = &src_pg;
-    args.dst_layout = &dst_pg;
-    args.bias_layout = &bias_pg;
-    args.nonlinear_mode = Param::NonlineMode::IDENTITY;
+    dst_pg.shape[c_pos] /= nr_grp;
+
+    megdnn::param::ConvBias param = args.opr->param();
+    param.sparse = megdnn::param::ConvBias::Sparse::DENSE;
+    param.nonlineMode =
+            megdnn::param::ConvBias::NonlineMode::IDENTITY;
+    std::pair<TensorLayoutArray, ConvBiasForward::Param> ret;
+    ret.first = {src_pg, filter_pg, bias_pg, z_pg, dst_pg};
+    ret.second = param;
+
+    return ret;
 }

-ConvBiasForwardImpl::AlgoGroupConvGeneral::AlgoGroupConvGeneral(AlgoBase* impl)
-        : m_impl{impl} {
-    m_name = ConvBiasForward::algo_name<DirectParam>(
-            ssprintf("%s:%s", "CUDA:GROUP_CONV", impl->name()), {});
+std::pair<TensorLayoutArray, std::unique_ptr<ConvBiasForward>> prepare_sub_opr(
+        const ConvBiasForwardImpl::AlgoBase::SizeArgs& args) {
+    auto convbias_opr = args.handle->create_operator<ConvBiasForward>();
+    set_execution_policy<ConvBiasForward, ConvBiasForward*>(
+            args.opr, convbias_opr.get());
+    auto&& config = sub_opr_config(args);
+    convbias_opr->param() = config.second;
+
+    return {config.first, std::move(convbias_opr)};
+}
+}  // namespace
+
+std::vector<Algorithm::SearchItem>
+ConvBiasForwardImpl::AlgoGroupConvGeneral::get_subopr_list(
+        const TensorLayoutArray& layouts, const OperatorBase* opr) const {
+    AlgoBase::SizeArgs args{static_cast<const ConvBiasForwardImpl*>(opr),
+                            layouts[0],
+                            layouts[1],
+                            layouts[2],
+                            layouts[3],
+                            layouts[4]};
+    auto&& config = sub_opr_config(args);
+
+    std::string param_str;
+    Algorithm::serialize_write_pod(config.second, param_str);
+    return {{Algorithm::OprType::CONVBIAS_FORWARD, param_str, config.first}};
 }

 bool ConvBiasForwardImpl::AlgoGroupConvGeneral::is_available(
@@ -62,10 +107,10 @@
         param.format == param::ConvBias::Format::NCHW32)
         return false;

-    auto sub_args = args;
-    TensorLayout src_pg, dst_pg, bias_pg;
-    modify_size_args(sub_args, src_pg, dst_pg, bias_pg);
-    return m_impl->is_available(sub_args);
+    auto config = prepare_sub_opr(args);
+    return get_algorithm(static_cast<ConvBiasForwardImpl*>(config.second.get()),
+                         config.first[0], config.first[1], config.first[2],
+                         config.first[3], config.first[4]);
 }

 WorkspaceBundle ConvBiasForwardImpl::AlgoGroupConvGeneral::get_workspace_bundle(
@@ -80,12 +125,12 @@
         sizes.push_back(dst_layout.span().dist_byte());
     }

-    auto sub_args = args;
-    sub_args.dst_layout = &dst_layout;
-    TensorLayout src_pg, dst_pg, bias_pg;
-    modify_size_args(sub_args, src_pg, dst_pg, bias_pg);
-    sizes.insert(sizes.begin(),
-                 m_impl->get_workspace_in_bytes(sub_args));
+    auto config = prepare_sub_opr(args);
+    size_t mm_ws = config.second->get_workspace_in_bytes(
+            config.first[0], config.first[1], config.first[2],
+            config.first[3], config.first[4], nullptr);
+
+    sizes.insert(sizes.begin(), mm_ws);

     return {ptr, std::move(sizes)};
 }
@@ -109,28 +154,13 @@
     auto sub_args = args;
     sub_args.dst_tensor = &conv_dst_tensor;
     sub_args.dst_layout = &conv_dst_tensor.layout;
-    TensorND tsrc{*args.src_tensor}, tdst{conv_dst_tensor},
-            tbias{*args.bias_tensor};
-    SmallVector<size_t> flt_shape(0);
-    std::vector<ptrdiff_t> flt_stride(0);
-    size_t idx = 0;
-    // check if the first dim is group
-    if (args.filter_tensor->layout.ndim > args.src_layout->ndim)
-        ++idx;
-    for (; idx < args.filter_tensor->layout.ndim; ++idx) {
-        flt_shape.push_back(args.filter_tensor->layout[idx]);
-        flt_stride.push_back(args.filter_tensor->layout.stride[idx]);
-    }
-    TensorND tflt{args.filter_tensor->raw_ptr,
-                  TensorLayout{flt_shape, flt_stride,
-                               args.filter_tensor->layout.dtype,
-                               args.filter_tensor->layout.format}};
-    modify_size_args(sub_args, tsrc.layout, tdst.layout, tbias.layout);
-    sub_args.src_tensor = &tsrc;
-    sub_args.dst_tensor = &tdst;
-    sub_args.filter_tensor = &tflt;
-    sub_args.bias_tensor = &tbias;
+    auto config = prepare_sub_opr(sub_args);
+    TensorND tsrc{args.src_tensor->raw_ptr, config.first[0]};
+    TensorND tfilter{args.filter_tensor->raw_ptr, config.first[1]};
+    TensorND tbias{args.bias_tensor->raw_ptr, config.first[2]};
+    TensorND tz{args.z_tensor->raw_ptr, config.first[3]};
+    TensorND tdst{conv_dst_tensor.raw_ptr, config.first[4]};

     size_t c_pos;
     if (args.filter_meta.format == Param::Format::NCHW ||
@@ -150,16 +180,17 @@
             strd_dst = tdst.layout.stride[c_pos] * fm.ocpg *
                        tdst.layout.dtype.size(),
             strd_flt = fm.icpg * fm.ocpg * fm.spatial[0] * fm.spatial[1] *
-                       tflt.layout.dtype.size();
+                       tfilter.layout.dtype.size();
     if (args.filter_meta.format == Param::Format::NCHW4) {
         strd_src >>= 2;
         strd_dst >>= 2;
     }
     for (uint32_t g = 0; g < grp; ++g) {
-        m_impl->exec(sub_args);
+        config.second->exec(tsrc, tfilter, tbias, tz, tdst, nullptr,
+                            bundle.get_workspace(0));
         incr_voidp(tsrc.raw_ptr, strd_src);
         incr_voidp(tdst.raw_ptr, strd_dst);
-        incr_voidp(tflt.raw_ptr, strd_flt);
+        incr_voidp(tfilter.raw_ptr, strd_flt);
     }
     }
     handle_bias_and_nonlinear(args.handle, args.nonlinear_mode,
diff --git a/dnn/src/cuda/conv_bias/opr_impl.cpp b/dnn/src/cuda/conv_bias/opr_impl.cpp
index 373d6392c60ca18bcd5c02cba40e7421d027e2a7..c6bc73f5d4f6893a48b1908dd0ab8834d99910c1 100644
--- a/dnn/src/cuda/conv_bias/opr_impl.cpp
+++ b/dnn/src/cuda/conv_bias/opr_impl.cpp
@@ -193,25 +193,17 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic(
         return algo;
     }

-    if (args.filter_meta.group > 1) {
-        auto orig_args = conv_args;
-        TensorLayout src, dst, bias;
-        AlgoGroupConvGeneral::modify_size_args(conv_args, src, dst, bias);
-        if (auto algo = get_1x1_algo(conv_args)) {
-            return sm_algo_pack.algo2gconv.at(algo);
-        }
-        if (is_cudnn_supported(conv_args)) {
-            if (auto algo = get_cudnn_algo(cudnn_conv_from_enum_wrapper)) {
-                return sm_algo_pack.algo2gconv.at(algo);
-            }
-        }
-        conv_args = orig_args;
-    }
-
     if (auto algo = get_1x1_algo(args)) {
         return algo;
     }

+    if (args.filter_meta.group > 1) {
+        if (auto algo = megdnn::get_algo_match_attribute<ConvBiasForwardImpl>(
+                    &sm_algo_pack.group, positive_attr, negative_attr)) {
+            return algo;
+        }
+    }
+
     if (sm_algo_pack.fallback_nchw_qs8.is_available_attribute(
                 args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
         return &sm_algo_pack.fallback_nchw_qs8;
diff --git a/dnn/src/cuda/convolution/backward_data/algo.cpp b/dnn/src/cuda/convolution/backward_data/algo.cpp
index bc91fa3f80c612a4270e7350ce51cafe4e06624e..b4a243258e84c593c1606bcd47e4370bd28b2063 100644
--- a/dnn/src/cuda/convolution/backward_data/algo.cpp
+++ b/dnn/src/cuda/convolution/backward_data/algo.cpp
@@ -39,25 +39,9 @@ ConvolutionBackwardDataImpl::AlgoPack::AlgoPack() {
     int8_algos.push_back(&int8_nchw_dotprod);
     all_algos.push_back(&int8_nchw_dotprod);

-    all_algos.reserve(all_algos.size() * 2);
-
-    // add gconv algos by AlgoGroupConvGeneral
-    auto all_algos_data = all_algos.data();
-    size_t group_algo_start = 2;
-    for (size_t i = group_algo_start; i < all_algos.size(); ++i) {
-        gconv.push_back({all_algos[i]});
-    }
-    for (size_t i = group_algo_start; i < all_algos.size(); ++i) {
-        algo2gconv[all_algos[i]] = &gconv[i - group_algo_start];
-    }
-    for (auto&& i : gconv) {
-        all_algos.push_back(&i);
-    }
-    megdnn_assert(all_algos_data == all_algos.data());
-
-    non_cudnn_algos.push_back(all_algos.rbegin()[0]);  // group matmul
     all_algos.push_back(&bfloat16);
     bfloat16_algos.push_back(&bfloat16);
+    all_algos.push_back(&group);

     for (auto&& algo : all_algos) {
         m_all_algos_map.emplace(algo->info().desc, algo);
@@ -80,13 +64,13 @@ ConvolutionBackwardDataImpl::AlgoPack::cudnn_from_enum(
 ConvolutionBackwardDataImpl::AlgoPack ConvolutionBackwardDataImpl::sm_algo_pack;

 ConvolutionBackwardDataImpl::AlgoBase::SizeArgs::SizeArgs(
-        ConvolutionBackwardDataImpl* o, const TensorLayout& filter,
+        const ConvolutionBackwardDataImpl* o, const TensorLayout& filter,
         const TensorLayout& diff, const TensorLayout& grad)
         : SizeArgs(o, filter, o->make_canonized_filter_meta(grad.ndim, filter),
                    diff, grad) {}

 ConvolutionBackwardDataImpl::AlgoBase::SizeArgs::SizeArgs(
-        ConvolutionBackwardDataImpl* o, const TensorLayout& filter,
+        const ConvolutionBackwardDataImpl* o, const TensorLayout& filter,
         const CanonizedFilterMeta& filter_meta, const TensorLayout& diff,
         const TensorLayout& grad)
         : handle{concrete_handle(o->handle())},
@@ -97,7 +81,7 @@ ConvolutionBackwardDataImpl::AlgoBase::SizeArgs::SizeArgs(
           opr{o} {}

 ConvolutionBackwardDataImpl::AlgoBase::ExecArgs::ExecArgs(
-        ConvolutionBackwardDataImpl* opr, _megdnn_tensor_in filter,
+        const ConvolutionBackwardDataImpl* opr, _megdnn_tensor_in filter,
         _megdnn_tensor_in diff, _megdnn_tensor_out grad,
         _megdnn_workspace workspace)
         : SizeArgs(opr, filter.layout, diff.layout, grad.layout),
diff --git a/dnn/src/cuda/convolution/backward_data/algo.h b/dnn/src/cuda/convolution/backward_data/algo.h
index d53b3c617927d196369d0eed16b535c37d98e21b..e9b3545471174b294604ec1445539e9e5ae9b65f 100644
--- a/dnn/src/cuda/convolution/backward_data/algo.h
+++ b/dnn/src/cuda/convolution/backward_data/algo.h
@@ -49,15 +49,17 @@ public:
         HandleImpl* handle;
         CanonizedFilterMeta filter_meta;
         const TensorLayout *diff_layout, *grad_layout, *filter_layout;
-        ConvolutionBackwardDataImpl* opr;
+        const ConvolutionBackwardDataImpl* opr;

         std::string to_string() const;
         void init_desc(convolution::CUDNNBwdDataDescs& desc) const {
             desc.set(filter_meta, *diff_layout, *grad_layout, opr->param());
         }
-        SizeArgs(ConvolutionBackwardDataImpl* opr, const TensorLayout& filter,
-                 const TensorLayout& diff, const TensorLayout& grad);
-        SizeArgs(ConvolutionBackwardDataImpl* opr, const TensorLayout& filter,
+        SizeArgs(const ConvolutionBackwardDataImpl* opr,
+                 const TensorLayout& filter, const TensorLayout& diff,
+                 const TensorLayout& grad);
+        SizeArgs(const ConvolutionBackwardDataImpl* opr,
+                 const TensorLayout& filter,
                  const CanonizedFilterMeta& filter_meta,
                  const TensorLayout& diff, const TensorLayout& grad);

@@ -70,7 +72,7 @@ public:
         const TensorND *filter_tensor, *diff_tensor, *grad_tensor;
         Workspace workspace;

-        ExecArgs(ConvolutionBackwardDataImpl* opr, _megdnn_tensor_in filter,
+        ExecArgs(const ConvolutionBackwardDataImpl* opr, _megdnn_tensor_in filter,
                  _megdnn_tensor_in diff, _megdnn_tensor_out grad,
                  _megdnn_workspace workspace);
     };
@@ -219,35 +221,26 @@ private:
 //! implement group conv by another algo
 class ConvolutionBackwardDataImpl::AlgoGroupConvGeneral final : public AlgoBase {
-    AlgoBase* m_impl;
-    std::string m_name;
-
 public:
-    AlgoGroupConvGeneral(AlgoBase* impl);
-
     bool is_available(const SizeArgs& args) const override;
     size_t get_workspace_in_bytes(const SizeArgs& args) const override;
     void exec(const ExecArgs& args) const override;

-    const char* name() const override { return m_name.c_str(); }
+    std::vector<SearchItem> get_subopr_list(
+            const TensorLayoutArray& layouts,
+            const OperatorBase* opr) const override;

+    const char* name() const override {
+        return "CUDA:GROUP_CONV_BACKWARD_DATA";
+    }

-    static void modify_size_args(SizeArgs& args, TensorLayout& diff_pg,
-                                 TensorLayout& grad_pg);
     MEGDNN_DECL_ALGO_TYPE(CUDA_GROUP_CONV_GENERAL)

     AlgoAttribute attribute() const override {
-        auto ret = AlgoAttribute::DEFAULT;
-#define cb(attr)                               \
-    if (m_impl->contain_attribute_all(attr)) { \
-        ret |= attr;                           \
-    }
-        MEGDNN_FOREACH_ALGO_ATTRIBUTE_INHERITABLE(cb)
-#undef cb
-        if (m_impl->contain_attribute_all(AlgoAttribute::REPRODUCIBLE)) {
-            ret |= AlgoAttribute::REPRODUCIBLE;
-        }
-        return ret;
+        return AlgoAttribute::REPRODUCIBLE;
     }
+
+private:
+    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
 };

 class ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm final
@@ -319,9 +312,8 @@ public:
     AlgoMatmul matmul;
     AlgoChanwise chanwise;
     AlgoChanwiseSmall chanwise_small;
-    std::vector<AlgoGroupConvGeneral> gconv;
-    std::unordered_map<AlgoBase*, AlgoGroupConvGeneral*> algo2gconv;
     AlgoBFloat16 bfloat16;
+    AlgoGroupConvGeneral group;

     std::vector<AlgoInt8NCHW4DotProdImplicitGemm> int8_nchw4_dotprod;
     AlgoInt8NCHWDotProdImplicitGemm int8_nchw_dotprod;
diff --git a/dnn/src/cuda/convolution/backward_data/group_conv.cpp b/dnn/src/cuda/convolution/backward_data/group_conv.cpp
index d32a407fefeae9c0b8198732d764df505d9091f3..5637c769780b61933b71b8cee044a8c353b4d508 100644
--- a/dnn/src/cuda/convolution/backward_data/group_conv.cpp
+++ b/dnn/src/cuda/convolution/backward_data/group_conv.cpp
@@ -16,24 +16,63 @@
 using namespace megdnn;
 using namespace cuda;
 using namespace convolution;

-void ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::modify_size_args(
-        ConvolutionBackwardDataImpl::AlgoBase::SizeArgs& args,
-        TensorLayout& diff_pg, TensorLayout& grad_pg) {
-    diff_pg = *args.diff_layout;
-    grad_pg = *args.grad_layout;
+namespace {
+std::pair<TensorLayoutArray, ConvolutionBackwardDataImpl::Param> sub_opr_config(
+        const ConvolutionBackwardDataImpl::AlgoBase::SizeArgs& args) {
+    SmallVector<size_t> flt_shape(0);
+    std::vector<ptrdiff_t> flt_stride(0);
+    size_t idx = 0;
+    // check if the first dim is group
+    if (args.filter_layout->ndim > args.diff_layout->ndim)
+        ++idx;
+    for (; idx < args.filter_layout->ndim; ++idx) {
+        flt_shape.push_back(args.filter_layout->shape[idx]);
+        flt_stride.push_back(args.filter_layout->stride[idx]);
+    }
+    TensorLayout filter_pg(flt_shape, flt_stride, args.filter_layout->dtype,
+                           args.filter_layout->format);
+    TensorLayout diff_pg = *args.diff_layout;
+    TensorLayout grad_pg = *args.grad_layout;
+
     auto nr_grp = args.filter_meta.group;
-    args.filter_meta.group = 1;
-    diff_pg.shape[1] /= nr_grp;
-    grad_pg.shape[1] /= nr_grp;
-    args.diff_layout = &diff_pg;
-    args.grad_layout = &grad_pg;
+    size_t c_pos = 1;
+    diff_pg.shape[c_pos] /= nr_grp;
+    grad_pg.shape[c_pos] /= nr_grp;
+
+    megdnn::param::Convolution param = args.opr->param();
+    param.sparse = megdnn::param::ConvBias::Sparse::DENSE;
+    std::pair<TensorLayoutArray, ConvolutionBackwardDataImpl::Param> ret;
+    ret.first = {filter_pg, diff_pg, grad_pg};
+    ret.second = param;
+
+    return ret;
 }

-ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::AlgoGroupConvGeneral(
-        AlgoBase* impl)
-        : m_impl{impl} {
-    m_name = "group_conv:";
-    m_name += impl->name();
+std::pair<TensorLayoutArray, std::unique_ptr<ConvolutionBackwardData>>
+prepare_sub_opr(const ConvolutionBackwardDataImpl::AlgoBase::SizeArgs& args) {
+    auto conv_bwd_data_opr =
+            args.handle->create_operator<ConvolutionBackwardData>();
+    set_execution_policy<ConvolutionBackwardData, ConvolutionBackwardData*>(
+            args.opr, conv_bwd_data_opr.get());
+    auto&& config = sub_opr_config(args);
+    conv_bwd_data_opr->param() = config.second;
+
+    return {config.first, std::move(conv_bwd_data_opr)};
+}
+}  // namespace
+
+std::vector<Algorithm::SearchItem>
+ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::get_subopr_list(
+        const TensorLayoutArray& layouts, const OperatorBase* opr) const {
+    AlgoBase::SizeArgs args{
+            static_cast<const ConvolutionBackwardDataImpl*>(opr), layouts[0],
+            layouts[1], layouts[2]};
+    auto&& config = sub_opr_config(args);
+
+    std::string param_str;
+    Algorithm::serialize_write_pod(config.second, param_str);
+    return {{Algorithm::OprType::CONVOLUTION_BACKWARD_DATA, param_str,
+             config.first}};
 }

 bool ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::is_available(
@@ -46,44 +85,60 @@
     }
     if (args.filter_meta.group <= 1)
         return false;
-    auto sub_args = args;
-    TensorLayout diff_pg, grad_pg;
-    modify_size_args(sub_args, diff_pg, grad_pg);
-    return m_impl->is_available(sub_args);
+
+    if (args.filter_meta.format !=
+        megdnn::param::Convolution::Format::NCHW) {
+        return false;
+    }
+
+    auto config = prepare_sub_opr(args);
+    return get_algorithm(
+            static_cast<ConvolutionBackwardDataImpl*>(config.second.get()),
+            config.first[0], config.first[1], config.first[2]);
+}
+
+WorkspaceBundle
+ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::get_workspace_bundle(
+        void* ptr, const SizeArgs& args) const {
+    auto config = prepare_sub_opr(args);
+    size_t sizes = config.second->get_workspace_in_bytes(
+            config.first[0], config.first[1], config.first[2]);
+    return {ptr, {sizes}};
 }

 size_t
 ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::get_workspace_in_bytes(
         const SizeArgs& args) const {
-    auto sub_args = args;
-    TensorLayout diff_pg, grad_pg;
-    modify_size_args(sub_args, diff_pg, grad_pg);
-    return m_impl->get_workspace_in_bytes(sub_args);
+    return get_workspace_bundle(nullptr, args).total_size_in_bytes();
 }

 void ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::exec(
         const ExecArgs& args) const {
-    auto sub_args = args;
-    TensorND tflt{*args.filter_tensor}, tdiff{*args.diff_tensor},
-            tgrad{*args.grad_tensor};
-    modify_size_args(sub_args, tdiff.layout, tgrad.layout);
-    sub_args.filter_tensor = &tflt;
-    sub_args.diff_tensor = &tdiff;
-    sub_args.grad_tensor = &tgrad;
-    auto grp = args.filter_meta.group;
-
-    auto&& fm = args.filter_meta;
-    auto strd_flt = (fm.icpg * fm.ocpg * fm.spatial[0] * fm.spatial[1] *
-                     tflt.layout.dtype.size()),
-         strd_diff =
-                 (tdiff.layout.stride[1] * fm.ocpg * tdiff.layout.dtype.size()),
-         strd_grad =
-                 (tgrad.layout.stride[1] * fm.icpg * tgrad.layout.dtype.size());
-    for (uint32_t g = 0; g < grp; ++g) {
-        m_impl->exec(sub_args);
-        incr_voidp(tflt.raw_ptr, strd_flt);
-        incr_voidp(tdiff.raw_ptr, strd_diff);
-        incr_voidp(tgrad.raw_ptr, strd_grad);
+    auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args);
+    {
+        auto config = prepare_sub_opr(args);
+        TensorND tfilter{args.filter_tensor->raw_ptr, config.first[0]};
+        TensorND tdiff{args.diff_tensor->raw_ptr, config.first[1]};
+        TensorND tgrad{args.grad_tensor->raw_ptr, config.first[2]};
+
+        size_t c_pos = 1;
+
+        auto&& fm = args.filter_meta;
+
+        auto strd_flt = fm.icpg * fm.ocpg * fm.spatial[0] * fm.spatial[1] *
+                        tfilter.layout.dtype.size(),
+             strd_diff = tdiff.layout.stride[c_pos] * fm.ocpg *
+                         tdiff.layout.dtype.size(),
+             strd_grad = (tgrad.layout.stride[c_pos] * fm.icpg *
+                          tgrad.layout.dtype.size());
+
+        auto grp = args.filter_meta.group;
+        for (uint32_t g = 0; g < grp; ++g) {
+            config.second->exec(tfilter, tdiff, tgrad, bundle.get_workspace(0));
+            incr_voidp(tfilter.raw_ptr, strd_flt);
+            incr_voidp(tdiff.raw_ptr, strd_diff);
+            incr_voidp(tgrad.raw_ptr, strd_grad);
+        }
     }
 }
diff --git a/dnn/src/cuda/convolution/backward_filter/algo.cpp b/dnn/src/cuda/convolution/backward_filter/algo.cpp
index 7560e43850dd21549b0af6dbe12c0aa8a9bfa341..9bf5c4ed72317c1ead4465d524b5abc6e9bc1ffc 100644
--- a/dnn/src/cuda/convolution/backward_filter/algo.cpp
+++ b/dnn/src/cuda/convolution/backward_filter/algo.cpp
@@ -26,23 +26,8 @@ ConvolutionBackwardFilterImpl::AlgoPack::AlgoPack() {
         all_algos.push_back(&i);
     }
     all_algos.push_back(&matmul);
+    all_algos.push_back(&group);

-    all_algos.reserve(all_algos.size() * 2);
-
-    // add gconv algos by AlgoGroupConvGeneral
-    auto all_algos_data = all_algos.data();
-    for (size_t i = 1; i < all_algos.size(); ++ i) {
-        gconv.push_back({all_algos[i]});
-    }
-    for (size_t i = 1; i < all_algos.size(); ++ i) {
-        algo2gconv[all_algos[i]] = &gconv[i - 1];
-    }
-    for (auto &&i: gconv) {
-        all_algos.push_back(&i);
-    }
-    megdnn_assert(all_algos_data == all_algos.data());
-
-    non_cudnn_algos.push_back(all_algos.rbegin()[0]); // group matmul
     all_algos.push_back(&bfloat16);
     bfloat16_algos.push_back(&bfloat16);
@@ -68,7 +53,7 @@
 ConvolutionBackwardFilterImpl::AlgoPack
 ConvolutionBackwardFilterImpl::sm_algo_pack;

 ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs::SizeArgs(
-        ConvolutionBackwardFilterImpl *o,
+        const ConvolutionBackwardFilterImpl *o,
         const TensorLayout &src, const TensorLayout &diff,
         const TensorLayout &grad):
     SizeArgs(o, src, diff, grad,
             o->make_canonized_filter_meta(src.ndim, grad))
 {
 }

 ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs::SizeArgs(
-        ConvolutionBackwardFilterImpl* o, const TensorLayout& src,
+        const ConvolutionBackwardFilterImpl* o, const TensorLayout& src,
         const TensorLayout& diff, const TensorLayout& grad,
         const CanonizedFilterMeta& grad_meta)
         : handle{concrete_handle(o->handle())},
@@ -87,7 +72,7 @@ ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs::SizeArgs(
           opr{o} {}

 ConvolutionBackwardFilterImpl::AlgoBase::ExecArgs::ExecArgs(
-        ConvolutionBackwardFilterImpl *opr,
+        const ConvolutionBackwardFilterImpl *opr,
         _megdnn_tensor_in src,
         _megdnn_tensor_in diff,
         _megdnn_tensor_out grad,
diff --git a/dnn/src/cuda/convolution/backward_filter/algo.h b/dnn/src/cuda/convolution/backward_filter/algo.h
index 3d38d65064487834734c733229e3ae5fdabc9588..bb83c4c5d529319cc4f5a90ae9d80cc14d447497 100644
--- a/dnn/src/cuda/convolution/backward_filter/algo.h
+++ b/dnn/src/cuda/convolution/backward_filter/algo.h
@@ -45,16 +45,18 @@ public:
         HandleImpl* handle;
         const TensorLayout *src_layout, *diff_layout, *grad_layout;
         CanonizedFilterMeta grad_filter_meta;
-        ConvolutionBackwardFilterImpl* opr;
+        const ConvolutionBackwardFilterImpl* opr;

         std::string to_string() const;
         void init_desc(convolution::CUDNNBwdFilterDescs& desc) const {
             desc.set(*src_layout, *diff_layout, grad_filter_meta, opr->param());
         }
-        SizeArgs(ConvolutionBackwardFilterImpl* opr, const TensorLayout& src,
-                 const TensorLayout& diff, const TensorLayout& grad);
-        SizeArgs(ConvolutionBackwardFilterImpl* opr, const TensorLayout& src,
-                 const TensorLayout& diff, const TensorLayout& grad,
+        SizeArgs(const ConvolutionBackwardFilterImpl* opr,
+                 const TensorLayout& src, const TensorLayout& diff,
+                 const TensorLayout& grad);
+        SizeArgs(const ConvolutionBackwardFilterImpl* opr,
+                 const TensorLayout& src, const TensorLayout& diff,
+                 const TensorLayout& grad,
                  const CanonizedFilterMeta& grad_meta);

         convolution::ForwardSizeArgs as_fwd_args() const {
@@ -66,9 +68,9 @@ public:
         const TensorND *src_tensor, *diff_tensor, *grad_tensor;
         Workspace workspace;

-        ExecArgs(ConvolutionBackwardFilterImpl* opr, _megdnn_tensor_in src,
-                 _megdnn_tensor_in diff, _megdnn_tensor_out grad,
-                 _megdnn_workspace workspace);
+        ExecArgs(const ConvolutionBackwardFilterImpl* opr,
+                 _megdnn_tensor_in src, _megdnn_tensor_in diff,
+                 _megdnn_tensor_out grad, _megdnn_workspace workspace);
     };
     virtual bool is_available(const SizeArgs& args) const = 0;
     virtual size_t get_workspace_in_bytes(const SizeArgs& args) const = 0;
@@ -203,29 +205,25 @@ private:
 //! implement group conv by another algo
 class ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral final : public AlgoBase {
-    AlgoBase* m_impl;
-    std::string m_name;
-
 public:
-    AlgoGroupConvGeneral(AlgoBase* impl);
-
     bool is_available(const SizeArgs& args) const override;
     size_t get_workspace_in_bytes(const SizeArgs& args) const override;
     void exec(const ExecArgs& args) const override;
+    std::vector<SearchItem> get_subopr_list(
+            const TensorLayoutArray& layouts,
+            const OperatorBase* opr) const override;

-    const char* name() const override { return m_name.c_str(); }
-
-    static void modify_size_args(SizeArgs& args, TensorLayout& src_pg,
-                                 TensorLayout& diff_pg);
+    const char* name() const override {
+        return "CUDA:GROUP_CONV_BACKWARD_FILTER";
+    }
     MEGDNN_DECL_ALGO_TYPE(CUDA_GROUP_CONV_GENERAL)

     AlgoAttribute attribute() const override {
-        auto ret = static_cast<AlgoAttribute>(0);
-        if (m_impl->contain_attribute_all(AlgoAttribute::REPRODUCIBLE)) {
-            ret |= AlgoAttribute::REPRODUCIBLE;
-        }
-        return ret;
+        return AlgoAttribute::REPRODUCIBLE;
     }
+
+private:
+    WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const;
 };

 class ConvolutionBackwardFilterImpl::AlgoPack : NonCopyableObj {
@@ -240,8 +238,7 @@ public:
     std::vector<AlgoCUDNN> cudnn;
     AlgoMatmul matmul;
     AlgoChanwise chanwise;
-    std::vector<AlgoGroupConvGeneral> gconv;
-    std::unordered_map<AlgoBase*, AlgoGroupConvGeneral*> algo2gconv;
+    AlgoGroupConvGeneral group;
     AlgoBFloat16 bfloat16;

     std::vector<AlgoBase*>
diff --git a/dnn/src/cuda/convolution/backward_filter/group_conv.cpp b/dnn/src/cuda/convolution/backward_filter/group_conv.cpp
index 0e0a8e1bd5364799f2fb5f8ba612251148e4ad2c..aa731c18454fb4f64df68eb571114ef62aa6c8da 100644
--- a/dnn/src/cuda/convolution/backward_filter/group_conv.cpp
+++ b/dnn/src/cuda/convolution/backward_filter/group_conv.cpp
@@ -15,25 +15,63 @@
 using namespace megdnn;
 using namespace cuda;
 using namespace convolution;

-void ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral::modify_size_args(
-        ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs &args,
-        TensorLayout &src_pg, TensorLayout &diff_pg) {
-    src_pg = *args.src_layout;
-    diff_pg = *args.diff_layout;
+namespace {
+std::pair<TensorLayoutArray, ConvolutionBackwardFilterImpl::Param>
+sub_opr_config(const ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs& args) {
+    SmallVector<size_t> flt_shape(0);
+    std::vector<ptrdiff_t> flt_stride(0);
+    size_t idx = 0;
+    // check if the first dim is group
+    if (args.grad_layout->ndim > args.diff_layout->ndim)
+        ++idx;
+    for (; idx < args.grad_layout->ndim; ++idx) {
+        flt_shape.push_back(args.grad_layout->shape[idx]);
+        flt_stride.push_back(args.grad_layout->stride[idx]);
+    }
+    TensorLayout filter_pg(flt_shape, flt_stride, args.grad_layout->dtype,
+                           args.grad_layout->format);
+    TensorLayout src_pg = *args.src_layout;
+    TensorLayout diff_pg = *args.diff_layout;
+
     auto nr_grp = args.grad_filter_meta.group;
-    args.grad_filter_meta.group = 1;
-    src_pg.shape[1] /= nr_grp;
-    diff_pg.shape[1] /= nr_grp;
-    args.src_layout = &src_pg;
-    args.diff_layout = &diff_pg;
+    size_t c_pos = 1;
+    src_pg.shape[c_pos] /= nr_grp;
+    diff_pg.shape[c_pos] /= nr_grp;
+
+    megdnn::param::Convolution param = args.opr->param();
+    param.sparse = megdnn::param::ConvBias::Sparse::DENSE;
+    std::pair<TensorLayoutArray, ConvolutionBackwardFilterImpl::Param> ret;
+    ret.first = {src_pg, diff_pg, filter_pg};
+    ret.second = param;
+
+    return ret;
 }

-ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral::AlgoGroupConvGeneral(
-        AlgoBase *impl):
-    m_impl{impl}
-{
-    m_name = "group_conv:";
-    m_name += impl->name();
+std::pair<TensorLayoutArray, std::unique_ptr<ConvolutionBackwardFilter>>
+prepare_sub_opr(const ConvolutionBackwardFilterImpl::AlgoBase::SizeArgs& args) {
+    auto conv_bwd_filter_opr =
+            args.handle->create_operator<ConvolutionBackwardFilter>();
+    set_execution_policy<ConvolutionBackwardFilter, ConvolutionBackwardFilter*>(
+            args.opr, conv_bwd_filter_opr.get());
+    auto&& config = sub_opr_config(args);
+    conv_bwd_filter_opr->param() = config.second;
+
+    return {config.first, std::move(conv_bwd_filter_opr)};
+}
+}  // namespace
+
+std::vector<Algorithm::SearchItem>
+ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral::get_subopr_list(
+        const TensorLayoutArray& layouts, const OperatorBase* opr) const {
+    AlgoBase::SizeArgs args{
+            static_cast<const ConvolutionBackwardFilterImpl*>(opr), layouts[0],
+            layouts[1], layouts[2]};
+    auto&& config = sub_opr_config(args);
+
+    std::string param_str;
+    Algorithm::serialize_write_pod(config.second, param_str);
+    return {{Algorithm::OprType::CONVOLUTION_BACKWARD_FILTER, param_str,
+             config.first}};
 }

 bool ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral::is_available(
@@ -44,44 +82,60 @@
     }
     if (args.grad_filter_meta.group <= 1)
         return false;
-    auto sub_args = args;
-    TensorLayout src_pg, diff_pg;
-    modify_size_args(sub_args, src_pg, diff_pg);
-    return m_impl->is_available(sub_args);
+    if (args.grad_filter_meta.format !=
+        megdnn::param::Convolution::Format::NCHW) {
+        return false;
+    }
+
+    auto config = prepare_sub_opr(args);
+    return get_algorithm(
+            static_cast<ConvolutionBackwardFilterImpl*>(config.second.get()),
+            config.first[0], config.first[1], config.first[2]);
 }

-size_t ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral::
-get_workspace_in_bytes(const SizeArgs &args) const {
-    auto sub_args = args;
-    TensorLayout src_pg, diff_pg;
-    modify_size_args(sub_args, src_pg, diff_pg);
-    return m_impl->get_workspace_in_bytes(sub_args);
+WorkspaceBundle
+ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral::get_workspace_bundle(
+        void* ptr, const SizeArgs& args) const {
+    auto config = prepare_sub_opr(args);
+    size_t sizes = config.second->get_workspace_in_bytes(
+            config.first[0], config.first[1], config.first[2]);
+    return {ptr, {sizes}};
+}
+
+size_t
+ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral::get_workspace_in_bytes(
+        const SizeArgs& args) const {
+    return get_workspace_bundle(nullptr, args).total_size_in_bytes();
 }

 void ConvolutionBackwardFilterImpl::AlgoGroupConvGeneral::exec(
-        const ExecArgs &args) const {
-    auto sub_args = args;
-    TensorND tsrc{*args.src_tensor}, tdiff{*args.diff_tensor},
-            tgrad{*args.grad_tensor};
-    modify_size_args(sub_args, tsrc.layout, tdiff.layout);
-    sub_args.src_tensor = &tsrc;
-    sub_args.diff_tensor = &tdiff;
-    sub_args.grad_tensor = &tgrad;
-
-    auto &&fm = args.grad_filter_meta;
-    auto grp = fm.group;
-
-    auto strd_src = (
-            tsrc.layout.stride[1] * fm.icpg * tsrc.layout.dtype.size()),
-         strd_diff = (
-            tdiff.layout.stride[1] * fm.ocpg * tdiff.layout.dtype.size()),
-         strd_grad = (fm.icpg * fm.ocpg *
-            fm.spatial[0] * fm.spatial[1] * tgrad.layout.dtype.size());
-    for (uint32_t g = 0; g < grp; ++ g) {
-        m_impl->exec(sub_args);
-        incr_voidp(tsrc.raw_ptr, strd_src);
-        incr_voidp(tdiff.raw_ptr, strd_diff);
-        incr_voidp(tgrad.raw_ptr, strd_grad);
+        const ExecArgs& args) const {
+    auto bundle = get_workspace_bundle(args.workspace.raw_ptr, args);
+
+    {
+        auto config = prepare_sub_opr(args);
+        TensorND tsrc{args.src_tensor->raw_ptr, config.first[0]};
+        TensorND tdiff{args.diff_tensor->raw_ptr, config.first[1]};
+        TensorND tgrad{args.grad_tensor->raw_ptr, config.first[2]};
+
+        size_t c_pos = 1;
+
+        auto&& fm = args.grad_filter_meta;
+
+        auto strd_src = tsrc.layout.stride[c_pos] * fm.icpg *
+                        tsrc.layout.dtype.size(),
+             strd_diff = tdiff.layout.stride[c_pos] * fm.ocpg *
+                         tdiff.layout.dtype.size(),
+             strd_grad = fm.icpg * fm.ocpg * fm.spatial[0] * fm.spatial[1] *
+                         tgrad.layout.dtype.size();
+
+        auto grp = fm.group;
+        for (uint32_t g = 0; g < grp; ++g) {
+            config.second->exec(tsrc, tdiff, tgrad, bundle.get_workspace(0));
+            incr_voidp(tsrc.raw_ptr, strd_src);
+            incr_voidp(tdiff.raw_ptr, strd_diff);
+            incr_voidp(tgrad.raw_ptr, strd_grad);
+        }
     }
 }
diff --git a/dnn/src/cuda/convolution/opr_impl.cpp b/dnn/src/cuda/convolution/opr_impl.cpp
index eb3e3e302a26555e0fc6a41efd8e61c9b9166ae2..630cba15a0cd2da989b9cab5a81ac7c08227f99f 100644
--- a/dnn/src/cuda/convolution/opr_impl.cpp
+++ b/dnn/src/cuda/convolution/opr_impl.cpp
@@ -104,19 +104,7 @@ ConvolutionBackwardDataImpl::get_algorithm_heuristic(
         const TensorLayout& grad, size_t workspace_limit_in_bytes,
         const AlgoAttribute& positive_attr,
         const AlgoAttribute& negative_attr) {
-    auto fm = check_layout_fwd(grad, filter, diff);
-    return get_algorithm_heuristic(filter, fm, diff, grad,
-                                   workspace_limit_in_bytes, positive_attr,
-                                   negative_attr);
-}
-
-ConvolutionBackwardDataImpl::Algorithm*
-ConvolutionBackwardDataImpl::get_algorithm_heuristic(const TensorLayout& filter,
-        const CanonizedFilterMeta& filter_meta, const TensorLayout& diff,
-        const TensorLayout& grad, size_t workspace_limit_in_bytes,
-        const AlgoAttribute& positive_attr,
-        const AlgoAttribute& negative_attr) {
-    AlgoBase::SizeArgs args(this, filter, filter_meta, diff, grad);
+    AlgoBase::SizeArgs args(this, filter, diff, grad);

     if (args.filter_meta.group > 1 &&
         sm_algo_pack.chanwise.is_available_attribute(
@@ -186,14 +174,11 @@ ConvolutionBackwardDataImpl::get_algorithm_heuristic(
     }

     if (args.filter_meta.group > 1) {
-        auto orig_args = args;
-        TensorLayout a, b;
-        AlgoGroupConvGeneral::modify_size_args(args, a, b);
-        if (is_cudnn_supported(args.as_fwd_args())) {
-            if (auto algo = get_cudnn_algo())
-                return sm_algo_pack.algo2gconv.at(algo);
+        if (auto algo = megdnn::get_algo_match_attribute<
+                    ConvolutionBackwardDataImpl>(
+                    &sm_algo_pack.group, positive_attr, negative_attr)) {
+            return algo;
         }
-        args = orig_args;
     }

     if (args.filter_layout->dtype.enumv() !=
@@ -212,7 +197,7 @@ size_t ConvolutionBackwardDataImpl::get_workspace_in_bytes(
         const TensorLayout& filter, const TensorLayout& diff,
         const TensorLayout& grad) {
     AlgoBase::SizeArgs args(this, filter, diff, grad);
-    return get_algorithm(this, filter, args.filter_meta, diff, grad)
+    return get_algorithm(this, filter, diff, grad)
             ->get_workspace_in_bytes(args);
 }

@@ -227,8 +212,7 @@ void ConvolutionBackwardFilterImpl::exec(_megdnn_tensor_in src,
                                          _megdnn_tensor_out grad,
                                          _megdnn_workspace workspace) {
     AlgoBase::ExecArgs args(this, src, diff, grad, workspace);
-    auto algo = get_algorithm(this, src.layout, diff.layout, grad.layout,
-                              args.grad_filter_meta);
+    auto algo = get_algorithm(this, src.layout, diff.layout, grad.layout);
     algo->check_workspace(args, workspace).exec(args);
 }

@@ -246,20 +230,7 @@ ConvolutionBackwardFilterImpl::get_algorithm_heuristic(
         const TensorLayout& grad, size_t workspace_limit_in_bytes,
         const AlgoAttribute& positive_attr,
         const AlgoAttribute& negative_attr) {
-    auto fm = check_layout_fwd(src, grad, diff);
-    return get_algorithm_heuristic(src, diff, grad, fm,
-                                   workspace_limit_in_bytes, positive_attr,
-                                   negative_attr);
-}
-
-ConvolutionBackwardFilterImpl::Algorithm*
-ConvolutionBackwardFilterImpl::get_algorithm_heuristic(
-        const TensorLayout& src, const TensorLayout& diff,
-        const TensorLayout& grad, const CanonizedFilterMeta& grad_meta,
-        size_t workspace_limit_in_bytes,
-        const AlgoAttribute& positive_attr,
-        const AlgoAttribute& negative_attr) {
-    AlgoBase::SizeArgs args(this, src, diff, grad, grad_meta);
+    AlgoBase::SizeArgs args(this, src, diff, grad);

     if (args.grad_filter_meta.group > 1 &&
         sm_algo_pack.chanwise.is_available_attribute(
@@ -332,14 +303,11 @@ ConvolutionBackwardFilterImpl::get_algorithm_heuristic(
     }

     if (args.grad_filter_meta.group > 1) {
-        auto orig_args = args;
-        TensorLayout a, b;
-        AlgoGroupConvGeneral::modify_size_args(args, a, b);
-        if (is_cudnn_supported(args.as_fwd_args())) {
-            if (auto algo = get_cudnn_algo())
-                return sm_algo_pack.algo2gconv.at(algo);
+        if (auto algo = megdnn::get_algo_match_attribute<
+                    ConvolutionBackwardFilterImpl>(
+                    &sm_algo_pack.group, positive_attr, negative_attr)) {
+            return algo;
         }
-        args = orig_args;
     }

     if (args.src_layout->dtype.enumv() != DTypeTrait<dtype::BFloat16>::enumv) {
@@ -357,7 +325,7 @@ size_t ConvolutionBackwardFilterImpl::get_workspace_in_bytes(
         const TensorLayout& src, const TensorLayout& diff,
         const TensorLayout& grad) {
     AlgoBase::SizeArgs args(this, src, diff, grad);
-    return get_algorithm(this, src, diff, grad, args.grad_filter_meta)
+    return get_algorithm(this, src, diff, grad)
             ->get_workspace_in_bytes(args);
 }

diff --git a/dnn/src/cuda/convolution/opr_impl.h b/dnn/src/cuda/convolution/opr_impl.h
index f0e624e3f52cfaaa2e272c519cb41b20b76421f1..226a34a6a194b67234f8f7532b1016ea356cbb90 100644
--- a/dnn/src/cuda/convolution/opr_impl.h
+++ b/dnn/src/cuda/convolution/opr_impl.h
@@ -74,17 +74,6 @@ public:
     using ConvolutionBackwardData::ConvolutionBackwardData;
     void exec(_megdnn_tensor_in filter, _megdnn_tensor_in diff,
               _megdnn_tensor_out grad, _megdnn_workspace workspace) override;
-    AlgorithmInfo get_algorithm_info_heuristic(
-            const TensorLayout& filter, const CanonizedFilterMeta& filter_meta,
-            const TensorLayout& diff, const TensorLayout& grad,
-            size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr,
-            const AlgoAttribute& negative_attr) {
-        return get_algorithm_heuristic(filter, filter_meta, diff, grad,
-                                       workspace_limit_in_bytes, positive_attr,
-                                       negative_attr)
-                ->info();
-    }
-
     AlgorithmInfo get_algorithm_info_heuristic(
             const TensorLayout& filter, const TensorLayout& diff,
             const TensorLayout& grad, size_t workspace_limit_in_bytes,
@@ -128,14 +117,6 @@ protected:
             const AlgoAttribute& negative_attr) override;

 private:
-    Algorithm* get_algorithm_heuristic(const TensorLayout& filter,
-                                       const CanonizedFilterMeta& filter_meta,
-                                       const TensorLayout& diff,
-                                       const TensorLayout& grad,
-                                       size_t workspace_limit_in_bytes,
-                                       const AlgoAttribute& positive_attr,
-                                       const AlgoAttribute& negative_attr);
-
     static AlgoPack sm_algo_pack;
 };

@@ -147,17 +128,6 @@ public:
     size_t get_workspace_in_bytes(const TensorLayout& src,
                                   const TensorLayout& diff,
                                   const TensorLayout& grad) override;
-    AlgorithmInfo get_algorithm_info_heuristic(
-            const TensorLayout& src, const TensorLayout& diff,
-            const TensorLayout& grad, const CanonizedFilterMeta& grad_meta,
-            size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr,
-            const AlgoAttribute& negative_attr) {
-        return get_algorithm_heuristic(src, diff, grad, grad_meta,
-                                       workspace_limit_in_bytes, positive_attr,
-                                       negative_attr)
-                ->info();
-    }
-
     AlgorithmInfo get_algorithm_info_heuristic(
             const TensorLayout& filter, const TensorLayout& diff,
             const TensorLayout& grad, size_t workspace_limit_in_bytes,
@@ -195,14 +165,6 @@ protected:
             const AlgoAttribute& negative_attr) override;

 private:
-    Algorithm* get_algorithm_heuristic(const TensorLayout& src,
-                                       const TensorLayout& diff,
-                                       const TensorLayout& grad,
-                                       const CanonizedFilterMeta& grad_meta,
-                                       size_t workspace_limit_in_bytes,
-                                       const AlgoAttribute& positive_attr,
-                                       const AlgoAttribute& negative_attr);
-
     static AlgoPack sm_algo_pack;
 };
diff --git a/dnn/test/cuda/conv_bias.cpp b/dnn/test/cuda/conv_bias.cpp
index a7db681101f84fcc4b8503331c37fdfc048461b1..f48f0c7efeeb11c0c79e724ca21e58e03efebdfe 100644
--- a/dnn/test/cuda/conv_bias.cpp
+++ b/dnn/test/cuda/conv_bias.cpp
@@ -1034,10 +1034,11 @@ TEST_F(CUDA, CONV_BIAS_FORWARD_GROUP) {
     // float case
     Checker<ConvBiasForward> checker(handle_cuda());
     checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker<
-            ConvBias>(
+            ConvBias>(ExecutionPolicyAlgoName{
             ConvBiasForward::algo_name<ConvBiasForward::DirectParam>(
                     "CUDA:GROUP_CONV", {})
-                    .c_str()));
+                    .c_str(),
+            {{"CUDNN", {}}}}));
     ConvBias::Param param;
     param.sparse = ConvBias::Param::Sparse::GROUP;
     param.nonlineMode = mode;
diff --git a/dnn/test/cuda/group_conv.cpp b/dnn/test/cuda/group_conv.cpp
index 4f4d0caee4001e211973e29373faf64082564a9a..d886b957a103c704dc2d6d06767429db9f614f39 100644
--- a/dnn/test/cuda/group_conv.cpp
+++ b/dnn/test/cuda/group_conv.cpp
@@ -108,39 +108,33 @@ TEST_F(CUDA, GROUP_CONV_FORWARD)
 }

 TEST_F(CUDA, GROUP_CONV_FORWARD_1x1) {
-    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW,
-                   size_t FH, size_t FW,
-                   size_t OC, size_t group) {
+    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW, size_t FH,
+                   size_t FW, size_t OC, size_t group) {
         Checker<Convolution> checker(handle_cuda());
-#if CUDNN_MAJOR <= 6
         std::string conv1x1_name =
-                ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
-                        "BATCHEDMATMUL", {});
-        checker.set_before_exec_callback(
-                AlgoChecker<ConvolutionForward>(ExecutionPolicyAlgoName{
-                        "DEFAULT",
-                        {{ConvBiasForward::algo_name<
-                                  ConvBiasForward::DirectParam>(
-                                  ssprintf("%s:%s", "CUDA:GROUP_CONV",
-                                           conv1x1_name.c_str())
-                                          .c_str(),
-                                  {})
-                                  .c_str(),
-                          {}}}}));
-#endif
+                ConvBiasForward::algo_name<ConvBiasForward::MatmulParam>(
+                        "INPLACE_MATMUL", {});
+        checker.set_before_exec_callback(AlgoChecker<ConvolutionForward>(
+                ExecutionPolicyAlgoName{"DEFAULT",
+                                        {{ConvBiasForward::algo_name<
+                                                  ConvBiasForward::DirectParam>(
+                                                  "CUDA:GROUP_CONV", {})
+                                                  .c_str(),
+                                          {{conv1x1_name.c_str(), {}}}}}}));
+
         Convolution::Param param;
         param.sparse = Convolution::Param::Sparse::GROUP;
         auto ICg = IC / group;
         auto OCg = OC / group;
-        checker.set_param(param).exec({{N, IC, IH, IW},
-                                      {group, OCg, ICg, FH, FW}, {}});
+        checker.set_param(param).exec(
+                {{N, IC, IH, IW}, {group, OCg, ICg, FH, FW}, {}});
     };
     size_t ic = 192;
     for (size_t g = 2; g <= 3; g += 1) {
         for (size_t ih = 8; ih <= 128; ih *= 4) {
             size_t iw = ih;
             run(2, ic, ih, iw, 1, 1, ic / g, g);
-            run(2, ic, ih+1, iw+1, 1, 1, ic / g, g);
+            run(2, ic, ih + 1, iw + 1, 1, 1, ic / g, g);
         }
     }
 }
@@ -189,6 +183,54 @@ TEST_F(CUDA, GROUP_CONV_BACKWARD_DATA)
         8);
 }

+TEST_F(CUDA, GROUP_CONV_BACKWARD_DATA_CUDNN)
+{
+    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW,
+                   size_t FH, size_t FW,
+                   size_t OC, size_t OH, size_t OW,
+                   size_t PH, size_t PW,
+                   size_t SH, size_t SW,
+                   size_t group)
+    {
+        Checker<ConvolutionBackwardData> checker(handle_cuda());
+        checker.set_before_exec_callback(
+                AlgoChecker<ConvolutionBackwardData>(ExecutionPolicyAlgoName{
+                        "CUDA:GROUP_CONV_BACKWARD_DATA", {{"CUDNN", {}}}}));
+
+        ConvolutionBackwardData::Param param;
+        param.sparse = Convolution::Param::Sparse::GROUP;
+        param.pad_h = PH;
+        param.pad_w = PW;
+        param.stride_h = SH;
+        param.stride_w = SW;
+        auto ICg = IC / group;
+        auto OCg = OC / group;
+        checker.set_param(param).exec({{group, OCg, ICg, FH, FW},
+                {N, OC, OH, OW}, {N, IC, IH, IW}});
+    };
+    // normal case
+    run(2, 64, 7, 7,
+        3, 3,
+        32, 5, 5,
+        0, 0,
+        1, 1,
+        2);
+    // padded case
+    run(2, 32, 7, 7,
+        3, 3,
+        64, 7, 7,
+        1, 1,
+        1, 1,
+        4);
+    // strided case
+    run(2, 32, 7, 7,
+        3, 3,
+        64, 3, 3,
+        0, 0,
+        2, 2,
+        8);
+}
+
 TEST_F(CUDA, GROUP_CONV_BACKWARD_FILTER)
 {
     auto run = [&](size_t N, size_t IC, size_t IH, size_t IW,
@@ -233,6 +275,52 @@ TEST_F(CUDA, GROUP_CONV_BACKWARD_FILTER)
         8);
 }

+TEST_F(CUDA, GROUP_CONV_BACKWARD_FILTER_CUDNN)
+{
+    auto run = [&](size_t N, size_t IC, size_t IH, size_t IW,
+                   size_t FH, size_t FW,
+                   size_t OC, size_t OH, size_t OW,
+                   size_t PH, size_t PW,
+                   size_t SH, size_t SW,
+                   size_t group)
+    {
+        Checker<ConvolutionBackwardFilter> checker(handle_cuda());
+        checker.set_before_exec_callback(
+                AlgoChecker<ConvolutionBackwardFilter>(ExecutionPolicyAlgoName{
+                        "CUDA:GROUP_CONV_BACKWARD_FILTER", {{"CUDNN", {}}}}));
+        ConvolutionBackwardFilter::Param param;
+        param.sparse = Convolution::Param::Sparse::GROUP;
+        param.pad_h = PH;
+        param.pad_w = PW;
+        param.stride_h = SH;
+        param.stride_w = SW;
+        auto ICg = IC / group;
+        auto OCg = OC / group;
+        checker.set_param(param).exec({{N, IC, IH, IW},
+                {N, OC, OH, OW}, {group, OCg, ICg, FH, FW}});
+    };
+    // normal case
+    run(2, 64, 7, 7,
+        3, 3,
+        32, 5, 5,
+        0, 0,
+        1, 1,
+        2);
+    // padded case
+    run(2, 32, 7, 7,
+        3, 3,
+        64, 7, 7,
+        1, 1,
+        1, 1,
+        4);
+    // strided case
+    run(2, 32, 7, 7,
+        3, 3,
+        64, 3, 3,
+        0, 0,
+        2, 2,
+        8);
+}
 } // namespace test
 } // namespace megdnn
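
Reviewer note on the new dispatch pattern: after this patch, AlgoGroupConvGeneral no longer wraps a fixed inner algorithm. It builds a dense sub-operator once (prepare_sub_opr), shrinks the channel dimension of every layout by the group count while keeping the parent strides, and then calls the sub-operator once per group, advancing the raw device pointers by a fixed per-group byte stride between calls. The standalone sketch below illustrates only that addressing arithmetic for a contiguous NCHW layout, where stride[1] == H * W; every name and size in it is illustrative, not MegDNN API.

#include <cstddef>
#include <cstdio>

// Per-group byte strides for a grouped NCHW convolution, mirroring the
// strd_src / strd_flt / strd_dst computation in AlgoGroupConvGeneral::exec.
// For a contiguous NCHW tensor, stride[1] (the channel stride) equals H * W,
// so stride[1] * channels_per_group elements separate consecutive groups.
struct GroupStrides {
    size_t src, flt, dst;  // byte distance between consecutive groups
};

static GroupStrides group_strides(size_t icpg, size_t ocpg,  // channels/group
                                  size_t fh, size_t fw,      // filter spatial
                                  size_t ih, size_t iw,      // input spatial
                                  size_t oh, size_t ow,      // output spatial
                                  size_t dtype_size) {
    return {ih * iw * icpg * dtype_size,         // one group of input channels
            icpg * ocpg * fh * fw * dtype_size,  // one group of filter weights
            oh * ow * ocpg * dtype_size};        // one group of output channels
}

int main() {
    // group=4 convolution: 32 -> 64 channels, 3x3 filter, 32x32 -> 30x30, f32
    const size_t grp = 4, icpg = 8, ocpg = 16;
    GroupStrides s = group_strides(icpg, ocpg, 3, 3, 32, 32, 30, 30, 4);
    for (size_t g = 0; g < grp; ++g) {
        // A real implementation would launch the dense sub-operator here with
        // base_ptr + offset for src/filter/dst, reusing a single workspace.
        std::printf("group %zu: src+%zu flt+%zu dst+%zu bytes\n", g,
                    g * s.src, g * s.flt, g * s.dst);
    }
    return 0;
}

This only accounts for the channel offsets: batches with N > 1 still work because the sub-layouts built by sub_opr_config keep the original batch stride, so each per-group call walks the full batch even though its tensor view is no longer contiguous.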