diff --git a/dnn/include/megdnn/oprs.h b/dnn/include/megdnn/oprs.h index 1fc1552d075fa51c9cb6ffb4ba1f0f9962497220..36f6b11aedd0b7e618a73074b76ea6003cfa7289 100644 --- a/dnn/include/megdnn/oprs.h +++ b/dnn/include/megdnn/oprs.h @@ -48,6 +48,8 @@ INST_ARITY(megdnn::ConvBias, 4, 1); INST_ARITY(megdnn::DeformableConvBackwardData, 5, 3); INST_ARITY(megdnn::MatrixMul, 2, 1); INST_ARITY(megdnn::BatchedMatrixMul, 2, 1); +INST_ARITY(megdnn::PoolingForward, 1, 1); +INST_ARITY(megdnn::PoolingBackward, 3, 1); #undef INST_ARITY diff --git a/dnn/include/megdnn/oprs/base.h b/dnn/include/megdnn/oprs/base.h index f62b8927f1fa7145bd9874e1fc5f2f1a363faee0..bb50fdbd08979953150a3b81dbc5fef43f0eeee7 100644 --- a/dnn/include/megdnn/oprs/base.h +++ b/dnn/include/megdnn/oprs/base.h @@ -259,6 +259,8 @@ public: DEFORMABLE_CONV_BACKWARD_FILTER, CONVBIAS_FORWARD, BATCH_CONV_FORWARD, + POOLING_FORWARD, + POOLING_BACKWARD, }; struct SearchItem { @@ -334,6 +336,63 @@ private: ExecutionPolicy m_execution_policy; }; +//! specialize for nargs == 2 +template +class MultiAlgoOpr : public MultiAlgoOpr { +public: + using Algorithm = detail::Algorithm; + using AlgorithmInfo = detail::Algorithm::Info; + using AlgoAttribute = detail::Algorithm::Attribute; + + //! get all possible algorithm decriptions for the specified layouts + std::vector get_all_algorithms_info(const TensorLayout& p0, + const TensorLayout& p1) { + std::vector ret; + for (auto&& algo : get_all_algorithms(p0, p1)) { + ret.emplace_back(algo->info()); + } + return ret; + } + + /** + * \brief Returns the best algorithm information which indicate the + * algorithm by heuristic. + * + * The selected algorithm should not use workspace more than + * \p workspace_limit_in_bytes. 
+ */ + AlgorithmInfo get_algorithm_info_heuristic( + const TensorLayout& p0, const TensorLayout& p1, + size_t workspace_limit_in_bytes = + std::numeric_limits::max(), + const AlgoAttribute& positive_attr = AlgoAttribute::DEFAULT, + const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT) { + return get_algorithm_heuristic(p0, p1, workspace_limit_in_bytes, + positive_attr, negative_attr) + ->info(); + } + +protected: + ~MultiAlgoOpr() = default; + + //! get all possible algorithms for the specified layouts + virtual std::vector get_all_algorithms( + const TensorLayout& p0, const TensorLayout& p1) = 0; + + /** + * \brief Returns the best algorithm by heuristic. + * + * The selected algorithm should not use workspace more than + * \p workspace_limit_in_bytes. + */ + virtual Algorithm* get_algorithm_heuristic( + const TensorLayout& p0, const TensorLayout& p1, + size_t workspace_limit_in_bytes = + std::numeric_limits::max(), + const AlgoAttribute& positive_attr = AlgoAttribute::DEFAULT, + const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT) = 0; +}; + //! 
specialize for nargs == 3 template class MultiAlgoOpr : public MultiAlgoOpr { diff --git a/dnn/include/megdnn/oprs/nn.h b/dnn/include/megdnn/oprs/nn.h index b319226efcec55d19a671cf20440a46959c21322..a61adf2599bcf20fafac562d2acc534d133ad97e 100644 --- a/dnn/include/megdnn/oprs/nn.h +++ b/dnn/include/megdnn/oprs/nn.h @@ -713,7 +713,8 @@ protected: void check_layout_fwd(const TensorLayout& src, const TensorLayout& dst); }; -class PoolingForward : public PoolingBase { +class PoolingForward : public PoolingBase, + public detail::MultiAlgoOpr { DEF_OPR_IMPL(PoolingForward, PoolingBase, 1, 1); public: @@ -734,7 +735,8 @@ protected: using Pooling = PoolingForward; -class PoolingBackward : public PoolingBase { +class PoolingBackward : public PoolingBase, + public detail::MultiAlgoOpr { DEF_OPR_IMPL(PoolingBackward, PoolingBase, 3, 1); public: diff --git a/dnn/src/common/algo_chooser.h b/dnn/src/common/algo_chooser.h index 5119964a0f61484a8ee02e124b96b482c4c8087d..f63be95db676e2e844269900e4768b1f6c2cd02f 100644 --- a/dnn/src/common/algo_chooser.h +++ b/dnn/src/common/algo_chooser.h @@ -69,7 +69,7 @@ std::vector get_all_algorithms( ret.push_back(i); } } - megdnn_assert(!ret.empty(), "no conv algorithm for %s", + megdnn_assert(!ret.empty(), "no algorithm for %s", args.to_string().c_str()); return ret; } diff --git a/dnn/src/cuda/cudnn_wrapper.cpp b/dnn/src/cuda/cudnn_wrapper.cpp index 2dcf2b51a84e4928f0ff098ed051cfaa93953068..f1a4c422c4057a91f66e40505cea50c03f73f049 100644 --- a/dnn/src/cuda/cudnn_wrapper.cpp +++ b/dnn/src/cuda/cudnn_wrapper.cpp @@ -294,32 +294,6 @@ void ConvDesc::set(DType data_type, const param::Convolution& param, #endif } -PoolingDesc::PoolingDesc() { - cudnn_check(cudnnCreatePoolingDescriptor(&desc)); -} - -PoolingDesc::~PoolingDesc() { - cudnn_check(cudnnDestroyPoolingDescriptor(desc)); -} - -void PoolingDesc::set(const param::Pooling& param) { - cudnnPoolingMode_t mode; - switch (param.mode) { - case param::Pooling::Mode::MAX: - mode = 
CUDNN_POOLING_MAX; - break; - case param::Pooling::Mode::AVERAGE: - mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; - break; - case param::Pooling::Mode::AVERAGE_COUNT_EXCLUDE_PADDING: - mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; - break; - } - cudnn_check(cudnnSetPooling2dDescriptor( - desc, mode, CUDNN_NOT_PROPAGATE_NAN, param.window_h, param.window_w, - param.pad_h, param.pad_w, param.stride_h, param.stride_w)); -} - LRNDesc::LRNDesc() { cudnn_check(cudnnCreateLRNDescriptor(&desc)); } diff --git a/dnn/src/cuda/cudnn_wrapper.h b/dnn/src/cuda/cudnn_wrapper.h index d07fe2e355a992acfb8a7a5a0aa283e41198dcb3..33fdb6cd0076d40dd327bb4207bf9ea28ccff0d4 100644 --- a/dnn/src/cuda/cudnn_wrapper.h +++ b/dnn/src/cuda/cudnn_wrapper.h @@ -54,14 +54,6 @@ class ConvDesc { cudnnConvolutionDescriptor_t desc; }; -class PoolingDesc { - public: - PoolingDesc(); - void set(const param::Pooling ¶m); - ~PoolingDesc(); - cudnnPoolingDescriptor_t desc; -}; - class LRNDesc { public: LRNDesc(); diff --git a/dnn/src/cuda/pooling/algo.cpp b/dnn/src/cuda/pooling/algo.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9d6ecce2ad0637ad1ee559eaee45c68829491b5f --- /dev/null +++ b/dnn/src/cuda/pooling/algo.cpp @@ -0,0 +1,621 @@ +/** + * \file dnn/src/cuda/pooling/algos.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#include "./algo.h" +#include "./pooling2d_qint.cuh" +#include "src/cuda/utils.h" + +using namespace megdnn; +using namespace cuda; + +namespace { +#define V1(v) #v +#define V(v) V1(v) +#define DEF_NAME(NAME) \ +#NAME "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." 
V(CUDNN_PATCHLEVEL) +} // namespace + +PoolingForwardImpl::AlgoPack::AlgoPack() { + all_algos.push_back(&algo_chwn4); + all_algos.push_back(&algo_nchw4); + all_algos.push_back(&algo_nchw32); + all_algos.push_back(&algo_nhwc); + all_algos.push_back(&algo_nchw64); + all_algos.push_back(&algo_cudnn); +#if CUDNN_VERSION >= 6000 + all_algos.push_back(&algo_cudnn_max_deterministic); +#endif + + for (auto&& algo : all_algos) { + m_all_algos_map.emplace(algo->info().desc, algo); + } +} + +PoolingForwardImpl::AlgoPack PoolingForwardImpl::sm_algo_pack; +MEGDNN_DEF_GET_ALGO_FROM_DESC(PoolingForwardImpl) + +PoolingForwardImpl::AlgoBase::SizeArgs::SizeArgs(PoolingForwardImpl* o, + const TensorLayout& src, + const TensorLayout& dst) + : handle{concrete_handle(o->handle())}, + opr{o}, + layout_src{&src}, + layout_dst{&dst} {} + +PoolingForwardImpl::AlgoBase::ExecArgs::ExecArgs(PoolingForwardImpl* opr, + _megdnn_tensor_in src, + _megdnn_tensor_out dst, + _megdnn_workspace workspace) + : SizeArgs(opr, src.layout, dst.layout), + src_tensor{&src}, + dst_tensor{&dst}, + workspace{workspace} {} + +std::string PoolingForwardImpl::AlgoBase::SizeArgs::to_string() const { + return ssprintf("src=%s, dst=%s", layout_src->to_string().c_str(), + layout_dst->to_string().c_str()); +} + +WorkspaceBundle PoolingForwardImpl::AlgoBase::get_workspace_bundle( + void* ptr, const SizeArgs& args) const { + SmallVector sizes; + TensorLayout fsrc = *args.layout_src; + TensorLayout fdst = *args.layout_dst; + auto get_workspace = [&sizes](TensorLayout& layout) { + if (layout.dtype == dtype::BFloat16()) { + layout.dtype = dtype::Float32(); + sizes.push_back(layout.span().dist_byte()); + } + }; + get_workspace(fsrc); + get_workspace(fdst); + return {ptr, std::move(sizes)}; +} + +size_t PoolingForwardImpl::AlgoBase::get_workspace_in_bytes( + const SizeArgs& args) const { + return get_workspace_bundle(nullptr, args).total_size_in_bytes(); +} + +bool PoolingForwardImpl::AlgoCUDNN::is_available(const SizeArgs& 
args) const { + using Format = param::Pooling::Format; + return (((args.opr->param().format == Format::NCHW || + args.opr->param().format == Format::NHWC) && + (args.layout_src->dtype.enumv() == DTypeEnum::Float16 || + args.layout_src->dtype.enumv() == DTypeEnum::BFloat16 || + args.layout_src->dtype.enumv() == DTypeEnum::Float32 || + args.layout_src->dtype.enumv() == DTypeEnum::Int8 || + args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS32 || + args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS8 || + args.layout_src->dtype.enumv() == DTypeEnum::Quantized8Asymm)) || + ((args.opr->param().format == Format::NCHW4 || + args.opr->param().format == Format::NCHW32) && + (args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS8 || + args.layout_src->dtype.enumv() == DTypeEnum::Quantized8Asymm))); +} + +void PoolingForwardImpl::AlgoCUDNN::init_mode(const ExecArgs& args, + cudnnPoolingMode_t& mode) const { + switch (args.opr->param().mode) { + case param::Pooling::Mode::MAX: + mode = CUDNN_POOLING_MAX; + break; + case param::Pooling::Mode::AVERAGE: + mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; + break; + case param::Pooling::Mode::AVERAGE_COUNT_EXCLUDE_PADDING: + mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; + break; + default: + megdnn_throw(ssprintf("Unspport pooling mode : {%d}", + static_cast(args.opr->param().mode))); + } +} + +void PoolingForwardImpl::AlgoCUDNN::exec(const ExecArgs& args) const { + TensorND src = *args.src_tensor; + TensorND dst = *args.dst_tensor; + auto wsb = get_workspace_bundle(args.workspace.raw_ptr, args); + auto ctypecvt = CompTypeCvter( + concrete_handle(args.handle), &wsb); + if (args.layout_src->dtype.enumv() == DTypeTrait::enumv) { + ctypecvt.src_to_comp_type(*args.src_tensor, src) + .src_to_comp_type(*args.dst_tensor, dst); + } + { + dt_float32 alpha = 1.0f, beta = 0.0f; + TensorDesc src_desc, dst_desc; + src_desc.set(src.layout, args.opr->param().format); + dst_desc.set(dst.layout, args.opr->param().format); + + 
cudnnPoolingMode_t mode; + init_mode(args, mode); + + cudnnPoolingDescriptor_t cudnn_desc; + cudnn_check(cudnnCreatePoolingDescriptor(&cudnn_desc)); + cudnn_check(cudnnSetPooling2dDescriptor( + cudnn_desc, mode, CUDNN_NOT_PROPAGATE_NAN, + args.opr->param().window_h, args.opr->param().window_w, + args.opr->param().pad_h, args.opr->param().pad_w, + args.opr->param().stride_h, args.opr->param().stride_w)); + cudnn_check(cudnnPoolingForward(args.handle->cudnn_handle(), cudnn_desc, + &alpha, src_desc.desc, src.raw_ptr, + &beta, dst_desc.desc, dst.raw_ptr)); + cudnn_check(cudnnDestroyPoolingDescriptor(cudnn_desc)); + } + if (args.layout_src->dtype.enumv() == DTypeTrait::enumv) { + ctypecvt.comp_to_dst_type(dst, *args.dst_tensor); + } +} + +#if CUDNN_VERSION >= 6000 +bool PoolingForwardImpl::AlgoCUDNNMAXDETERMINISTIC::is_available( + const SizeArgs& args) const { + using Format = param::Pooling::Format; + return (args.opr->param().mode == param::Pooling::Mode::MAX && + (((args.opr->param().format == Format::NCHW || + args.opr->param().format == Format::NHWC) && + (args.layout_src->dtype.enumv() == DTypeEnum::Float16 || + args.layout_src->dtype.enumv() == DTypeEnum::BFloat16 || + args.layout_src->dtype.enumv() == DTypeEnum::Float32 || + args.layout_src->dtype.enumv() == DTypeEnum::Int8 || + args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS32 || + args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS8 || + args.layout_src->dtype.enumv() == DTypeEnum::Quantized8Asymm)) || + ((args.opr->param().format == Format::NCHW4 || + args.opr->param().format == Format::NCHW32) && + (args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS8 || + args.layout_src->dtype.enumv() == DTypeEnum::Quantized8Asymm)))); +} + +void PoolingForwardImpl::AlgoCUDNNMAXDETERMINISTIC::init_mode( + const ExecArgs& args, cudnnPoolingMode_t& mode) const { + switch (args.opr->param().mode) { + case param::Pooling::Mode::MAX: + mode = CUDNN_POOLING_MAX_DETERMINISTIC; + break; + default: + 
megdnn_throw(ssprintf("Unspport pooling mode : {%d}", + static_cast(args.opr->param().mode))); + } +} + +void PoolingForwardImpl::AlgoCUDNNMAXDETERMINISTIC::exec( + const ExecArgs& args) const { + TensorND src = *args.src_tensor; + TensorND dst = *args.dst_tensor; + auto wsb = get_workspace_bundle(args.workspace.raw_ptr, args); + auto ctypecvt = CompTypeCvter( + concrete_handle(args.handle), &wsb); + if (args.layout_src->dtype.enumv() == DTypeTrait::enumv) { + ctypecvt.src_to_comp_type(*args.src_tensor, src) + .src_to_comp_type(*args.dst_tensor, dst); + } + { + dt_float32 alpha = 1.0f, beta = 0.0f; + TensorDesc src_desc, dst_desc; + src_desc.set(src.layout, args.opr->param().format); + dst_desc.set(dst.layout, args.opr->param().format); + + cudnnPoolingMode_t mode; + init_mode(args, mode); + + cudnnPoolingDescriptor_t cudnn_desc; + cudnn_check(cudnnCreatePoolingDescriptor(&cudnn_desc)); + cudnn_check(cudnnSetPooling2dDescriptor( + cudnn_desc, mode, CUDNN_NOT_PROPAGATE_NAN, + args.opr->param().window_h, args.opr->param().window_w, + args.opr->param().pad_h, args.opr->param().pad_w, + args.opr->param().stride_h, args.opr->param().stride_w)); + cudnn_check(cudnnPoolingForward(args.handle->cudnn_handle(), cudnn_desc, + &alpha, src_desc.desc, src.raw_ptr, + &beta, dst_desc.desc, dst.raw_ptr)); + cudnn_check(cudnnDestroyPoolingDescriptor(cudnn_desc)); + } + if (args.layout_src->dtype.enumv() == DTypeTrait::enumv) { + ctypecvt.comp_to_dst_type(dst, *args.dst_tensor); + } +} +#endif + +bool PoolingForwardImpl::AlgoCHWN4::is_available(const SizeArgs& args) const { + using Format = param::Pooling::Format; + return (args.opr->param().format == Format::CHWN4 && + (args.layout_src->dtype.enumv() == DTypeEnum::Quantized8Asymm || + args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS8)); +} + +void PoolingForwardImpl::AlgoCHWN4::exec(const ExecArgs& args) const { + pooling2d::Param kern_param; + size_t c = (*args.layout_src)[0], hi = (*args.layout_src)[1], + wi = 
(*args.layout_src)[2], n = (*args.layout_src)[3], + ho = (*args.layout_dst)[1], wo = (*args.layout_dst)[2]; + c = c * 4; + size_t ph = args.opr->param().pad_h, pw = args.opr->param().pad_w; + size_t window_h = args.opr->param().window_h, + window_w = args.opr->param().window_w; + size_t sh = args.opr->param().stride_h, sw = args.opr->param().stride_w; + kern_param.n = n, kern_param.c = c, kern_param.hi = hi, kern_param.wi = wi, + kern_param.ho = ho, kern_param.wo = wo, kern_param.ph = ph, + kern_param.pw = pw, kern_param.window_h = window_h, + kern_param.window_w = window_w, kern_param.sh = sh, kern_param.sw = sw; + auto&& stream = cuda_stream(args.handle); + pooling2d::do_pooling2d_int8_cdiv4hwn4( + args.src_tensor->compatible_ptr(), + args.dst_tensor->compatible_ptr(), kern_param, stream, + static_cast(args.opr->param().mode)); +} + +bool PoolingForwardImpl::AlgoNCHW4::is_available(const SizeArgs& args) const { + using Format = param::Pooling::Format; + return args.opr->param().format == Format::NCHW4 && + (args.layout_src->dtype.enumv() == DTypeEnum::Quantized8Asymm || + args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS8); +} + +void PoolingForwardImpl::AlgoNCHW4::exec(const ExecArgs& args) const { + pooling2d::Param kern_param; + size_t n = (*args.layout_src)[0], hi = (*args.layout_src)[2], + wi = (*args.layout_src)[3], c = (*args.layout_src)[1], + ho = (*args.layout_dst)[2], wo = (*args.layout_dst)[3]; + c = c * 4; + size_t ph = args.opr->param().pad_h, pw = args.opr->param().pad_w; + size_t window_h = args.opr->param().window_h, + window_w = args.opr->param().window_w; + size_t sh = args.opr->param().stride_h, sw = args.opr->param().stride_w; + kern_param.n = n, kern_param.c = c, kern_param.hi = hi, kern_param.wi = wi, + kern_param.ho = ho, kern_param.wo = wo, kern_param.ph = ph, + kern_param.pw = pw, kern_param.window_h = window_h, + kern_param.window_w = window_w, kern_param.sh = sh, kern_param.sw = sw; + auto&& stream = cuda_stream(args.handle); + 
pooling2d::do_pooling2d_int8_ncdiv4hw4( + args.src_tensor->compatible_ptr(), + args.dst_tensor->compatible_ptr(), kern_param, stream, + static_cast(args.opr->param().mode)); +} + +bool PoolingForwardImpl::AlgoNCHW32::is_available(const SizeArgs& args) const { + using Format = param::Pooling::Format; + return (args.opr->param().format == Format::NCHW32 && + (args.layout_src->dtype.enumv() == DTypeEnum::Quantized8Asymm || + args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS8)); +} + +void PoolingForwardImpl::AlgoNCHW32::exec(const ExecArgs& args) const { + pooling2d::Param kern_param; + size_t n = (*args.layout_src)[0], hi = (*args.layout_src)[2], + wi = (*args.layout_src)[3], c = (*args.layout_src)[1], + ho = (*args.layout_dst)[2], wo = (*args.layout_dst)[3]; + c = c * 32; + size_t ph = args.opr->param().pad_h, pw = args.opr->param().pad_w; + size_t window_h = args.opr->param().window_h, + window_w = args.opr->param().window_w; + size_t sh = args.opr->param().stride_h, sw = args.opr->param().stride_w; + kern_param.n = n, kern_param.c = c, kern_param.hi = hi, kern_param.wi = wi, + kern_param.ho = ho, kern_param.wo = wo, kern_param.ph = ph, + kern_param.pw = pw, kern_param.window_h = window_h, + kern_param.window_w = window_w, kern_param.sh = sh, kern_param.sw = sw; + auto&& stream = cuda_stream(args.handle); + pooling2d::do_pooling2d_int8_ncdiv32hw32( + args.src_tensor->compatible_ptr(), + args.dst_tensor->compatible_ptr(), kern_param, stream, + static_cast(args.opr->param().mode)); +} + +bool PoolingForwardImpl::AlgoNHWC::is_available(const SizeArgs& args) const { + using Format = param::Pooling::Format; + return (args.opr->param().format == Format::NHWC && + (args.layout_src->dtype.enumv() == DTypeEnum::Quantized4Asymm || + args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS4)); +} + +void PoolingForwardImpl::AlgoNHWC::exec(const ExecArgs& args) const { + TensorND src = *args.src_tensor; + TensorND dst = *args.dst_tensor; + { + 
megdnn_assert(src.layout.dtype.enumv() == dst.layout.dtype.enumv(), + "src and dst dtype must equal"); + pooling2d::Param kern_param; + size_t n = src.layout[0], hi = src.layout[1], wi = src.layout[2], + c = src.layout[3], ho = dst.layout[1], wo = dst.layout[2]; + size_t ph = args.opr->param().pad_h, pw = args.opr->param().pad_w; + size_t window_h = args.opr->param().window_h, + window_w = args.opr->param().window_w; + size_t sh = args.opr->param().stride_h, sw = args.opr->param().stride_w; + kern_param.n = n, kern_param.c = c, kern_param.hi = hi, + kern_param.wi = wi, kern_param.ho = ho, kern_param.wo = wo, + kern_param.ph = ph, kern_param.pw = pw, kern_param.window_h = window_h, + kern_param.window_w = window_w, kern_param.sh = sh, kern_param.sw = sw; + bool uint_case = false; + int zero_point = 0; + if (src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) { + uint_case = true; + zero_point = + src.layout.dtype.param().zero_point; + } + auto&& stream = cuda_stream(args.handle); + pooling2d::do_pooling2d_int4_nhwc( + (int8_t*)src.raw_ptr, (int8_t*)dst.raw_ptr, kern_param, stream, + static_cast(args.opr->param().mode), uint_case, + zero_point); + } +} + +inline void PoolingForwardImpl::AlgoNCHW64::deduce_reformat_layout( + std::unique_ptr& relayout, + const TensorLayout& src_layout, TensorLayout& dst_layout, + RelayoutFormat::Param::Mode mode, const int oc = 0, + const int group = 1) const { + if (src_layout.ndim > 0) { + RelayoutFormat::Param trans_param; + trans_param.mode = mode; + trans_param.oc = oc; + trans_param.group = group; + relayout->param() = trans_param; + relayout->deduce_layout(src_layout, dst_layout); + } else { + dst_layout = src_layout; + } +} + +void PoolingForwardImpl::AlgoNCHW64::get_inner_layout( + const TensorLayout& src, const TensorLayout& dst, + TensorLayout& inner_src, TensorLayout& inner_dst, Handle* handle, + PoolingForwardImpl::Param::Format format) const { + auto relayout_opr = handle->create_operator(); + 
deduce_reformat_layout(relayout_opr, src, inner_src, + RelayoutFormat::Param::Mode::NCHW_NCHW64, 0, 1); + deduce_reformat_layout(relayout_opr, dst, inner_dst, + RelayoutFormat::Param::Mode::NCHW_NCHW64, 0, 1); +} + +WorkspaceBundle PoolingForwardImpl::AlgoNCHW64::get_workspace_bundle( + void* ptr, const SizeArgs& args) const { + using Format = param::Pooling::Format; + SmallVector sizes; + TensorLayout fsrc = *args.layout_src; + TensorLayout fdst = *args.layout_dst; + if (args.opr->param().format == Format::NCHW) { + get_inner_layout(*args.layout_src, *args.layout_dst, fsrc, fdst, + args.handle, args.opr->param().format); + sizes.push_back(fsrc.span().dist_byte()); + sizes.push_back(fdst.span().dist_byte()); + } + return {ptr, std::move(sizes)}; +} + +bool PoolingForwardImpl::AlgoNCHW64::is_available(const SizeArgs& args) const { + using Format = param::Pooling::Format; + return ((args.opr->param().format == Format::NCHW || + args.opr->param().format == Format::NCHW64) && + (args.layout_src->dtype.enumv() == DTypeEnum::QuantizedS4 || + args.layout_src->dtype.enumv() == DTypeEnum::Quantized4Asymm) && + (args.layout_dst->dtype.enumv() == DTypeEnum::QuantizedS4 || + args.layout_dst->dtype.enumv() == DTypeEnum::Quantized4Asymm)); +} + +void PoolingForwardImpl::AlgoNCHW64::exec(const ExecArgs& args) const { + using Format = param::Pooling::Format; + TensorND src = *args.src_tensor; + TensorND dst = *args.dst_tensor; + if (args.opr->param().format == Format::NCHW) { + auto wsb = get_workspace_bundle(args.workspace.raw_ptr, args); + auto handle_ptr = args.handle; + get_inner_layout(*args.layout_src, *args.layout_dst, src.layout, + dst.layout, handle_ptr, args.opr->param().format); + src.raw_ptr = wsb.get(0); + dst.raw_ptr = wsb.get(1); + auto relayout_opr = handle_ptr->create_operator(); + RelayoutFormat::Param trans_param; + trans_param.mode = RelayoutFormat::Param::Mode::NCHW_NCHW64; + relayout_opr->param() = trans_param; + relayout_opr->exec(*args.src_tensor, src, {}); 
+ } + + { + pooling2d::Param kern_param; + size_t n = src.layout[0], hi = src.layout[2], wi = src.layout[3], + c = src.layout[1], ho = dst.layout[2], wo = dst.layout[3]; + c = c * 64; + size_t ph = args.opr->param().pad_h, pw = args.opr->param().pad_w; + size_t window_h = args.opr->param().window_h, + window_w = args.opr->param().window_w; + size_t sh = args.opr->param().stride_h, sw = args.opr->param().stride_w; + kern_param.n = n, kern_param.c = c, kern_param.hi = hi, + kern_param.wi = wi, kern_param.ho = ho, kern_param.wo = wo, + kern_param.ph = ph, kern_param.pw = pw, kern_param.window_h = window_h, + kern_param.window_w = window_w, kern_param.sh = sh, kern_param.sw = sw; + bool uint_case = false; + int zero_point = 0; + if (src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) { + uint_case = true; + zero_point = + src.layout.dtype.param().zero_point; + } + auto&& stream = cuda_stream(args.handle); + pooling2d::do_pooling2d_int4_ncdiv64hw64( + (int8_t*)src.raw_ptr, (int8_t*)dst.raw_ptr, kern_param, stream, + static_cast(args.opr->param().mode), uint_case, + zero_point); + } + if (args.layout_dst->ndim == 4) { + auto relayout_opr = args.handle->create_operator(); + RelayoutFormat::Param trans_param; + trans_param.mode = RelayoutFormat::Param::Mode::NCHW64_NCHW; + relayout_opr->param() = trans_param; + relayout_opr->exec(dst, *args.dst_tensor, {}); + } +} + +PoolingBackwardImpl::AlgoPack::AlgoPack() { + algo_cudnn.push_back({DEF_NAME(cudnnUnreproducible), false}); + algo_cudnn.push_back({DEF_NAME(cudnnReproducible), true}); + + for (auto&& i : algo_cudnn) { + all_algos.push_back(&i); + } + + for (auto&& algo : all_algos) { + m_all_algos_map.emplace(algo->info().desc, algo); + } +} + +PoolingBackwardImpl::AlgoPack PoolingBackwardImpl::sm_algo_pack; +MEGDNN_DEF_GET_ALGO_FROM_DESC(PoolingBackwardImpl) + +PoolingBackwardImpl::AlgoBase::SizeArgs::SizeArgs(PoolingBackwardImpl* o, + const TensorLayout& src, + const TensorLayout& dst, + const TensorLayout& diff, + 
const TensorLayout& grad) + : handle{concrete_handle(o->handle())}, + opr{o}, + layout_src{&src}, + layout_dst{&dst}, + layout_diff{&diff}, + layout_grad{&grad} {} + +PoolingBackwardImpl::AlgoBase::ExecArgs::ExecArgs(PoolingBackwardImpl* opr, + _megdnn_tensor_in src, + _megdnn_tensor_in dst, + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) + : SizeArgs(opr, src.layout, dst.layout, diff.layout, grad.layout), + src_tensor{&src}, + dst_tensor{&dst}, + diff_tensor{&diff}, + grad_tensor{&grad}, + workspace{workspace} {} + +std::string PoolingBackwardImpl::AlgoBase::SizeArgs::to_string() const { + return ssprintf( + "src=%s, dst=%s, diff=%s, grad=%s", layout_src->to_string().c_str(), + layout_dst->to_string().c_str(), layout_diff->to_string().c_str(), + layout_grad->to_string().c_str()); +} + +bool PoolingBackwardImpl::AlgoCUDNN::is_available(const SizeArgs& args) const { + using Format = param::Pooling::Format; +#if CUDNN_VERSION < 6000 + return ((args.opr->param().format == Format::NCHW || + args.opr->param().format == Format::NHWC || + args.opr->param().format == Format::NCHW4 || + args.opr->param().format == Format::NCHW32) && + (m_is_reproducible ^ + (args.opr->param().mode == param::Pooling::Mode::MAX))); +#else + return ((args.opr->param().format == Format::NCHW || + args.opr->param().format == Format::NHWC || + args.opr->param().format == Format::NCHW4 || + args.opr->param().format == Format::NCHW32) && + (m_is_reproducible || + args.opr->param().mode == param::Pooling::Mode::MAX)); +#endif +} + +WorkspaceBundle PoolingBackwardImpl::AlgoBase::get_workspace_bundle( + void* ptr, const SizeArgs& args) const { + SmallVector sizes; + TensorLayout fsrc = *args.layout_src; + TensorLayout fdst = *args.layout_dst; + TensorLayout fdiff = *args.layout_diff; + TensorLayout fgrad = *args.layout_grad; + auto get_workspace = [&sizes](TensorLayout& layout) { + if (layout.dtype == dtype::BFloat16()) { + layout.dtype = dtype::Float32(); + 
sizes.push_back(layout.span().dist_byte()); + } + }; + get_workspace(fsrc); + get_workspace(fdst); + get_workspace(fdiff); + get_workspace(fgrad); + return {ptr, std::move(sizes)}; +} + +size_t PoolingBackwardImpl::AlgoBase::get_workspace_in_bytes( + const SizeArgs& args) const { + return get_workspace_bundle(nullptr, args).total_size_in_bytes(); +} + +void PoolingBackwardImpl::AlgoCUDNN::init_mode(const ExecArgs& args, + cudnnPoolingMode_t& mode) const { + if (m_is_reproducible) { + switch (args.opr->param().mode) { +#if CUDNN_VERSION >= 6000 + case param::Pooling::Mode::MAX: + mode = CUDNN_POOLING_MAX_DETERMINISTIC; + break; +#endif + case param::Pooling::Mode::AVERAGE: + mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING; + break; + case param::Pooling::Mode::AVERAGE_COUNT_EXCLUDE_PADDING: + mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING; + break; + default: + megdnn_throw( + ssprintf("Unspport pooling mode : {%d}", + static_cast(args.opr->param().mode))); + } + } else if (args.opr->param().mode == param::Pooling::Mode::MAX) { + mode = CUDNN_POOLING_MAX; + } else { + megdnn_throw("init_mode failed\n"); + } +} + +void PoolingBackwardImpl::AlgoCUDNN::exec(const ExecArgs& args) const { + TensorND src = *args.src_tensor; + TensorND dst = *args.dst_tensor; + TensorND diff = *args.diff_tensor; + TensorND grad = *args.grad_tensor; + auto wsb = get_workspace_bundle(args.workspace.raw_ptr, args); + auto ctypecvt = CompTypeCvter( + concrete_handle(args.handle), &wsb); + if (args.layout_src->dtype.enumv() == DTypeTrait::enumv) { + ctypecvt.src_to_comp_type(*args.src_tensor, src) + .src_to_comp_type(*args.dst_tensor, dst) + .src_to_comp_type(*args.diff_tensor, diff) + .src_to_comp_type(*args.grad_tensor, grad); + } + { + dt_float32 alpha = 1.0f, beta = 0.0f; + TensorDesc src_desc, dst_desc, diff_desc, grad_desc; + src_desc.set(src.layout, args.opr->param().format); + dst_desc.set(dst.layout, args.opr->param().format); + diff_desc.set(diff.layout, 
args.opr->param().format); + grad_desc.set(grad.layout, args.opr->param().format); + + cudnnPoolingMode_t mode; + init_mode(args, mode); + + cudnnPoolingDescriptor_t cudnn_desc; + cudnn_check(cudnnCreatePoolingDescriptor(&cudnn_desc)); + cudnn_check(cudnnSetPooling2dDescriptor( + cudnn_desc, mode, CUDNN_NOT_PROPAGATE_NAN, + args.opr->param().window_h, args.opr->param().window_w, + args.opr->param().pad_h, args.opr->param().pad_w, + args.opr->param().stride_h, args.opr->param().stride_w)); + cudnn_check(cudnnPoolingBackward( + args.handle->cudnn_handle(), cudnn_desc, &alpha, dst_desc.desc, + dst.raw_ptr, diff_desc.desc, diff.raw_ptr, src_desc.desc, + src.raw_ptr, &beta, grad_desc.desc, grad.raw_ptr)); + cudnn_check(cudnnDestroyPoolingDescriptor(cudnn_desc)); + } + if (args.layout_src->dtype.enumv() == DTypeTrait::enumv) { + ctypecvt.comp_to_dst_type(grad, *args.grad_tensor); + } +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/pooling/algo.h b/dnn/src/cuda/pooling/algo.h new file mode 100644 index 0000000000000000000000000000000000000000..5cf137a6ed637c9818a6a778636ccbd3171266bf --- /dev/null +++ b/dnn/src/cuda/pooling/algo.h @@ -0,0 +1,269 @@ +/** + * \file dnn/src/cuda/pooling/algo.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#pragma once + +#include +#include "src/common/algo_base.h" +#include "src/common/metahelper.h" +#include "src/cuda/cudnn_wrapper.h" +#include "src/cuda/pooling/opr_impl.h" + +namespace megdnn { +namespace cuda { + +namespace { +#define V1(v) #v +#define V(v) V1(v) +#define DEF_NAME(NAME) \ +#NAME "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." 
V(CUDNN_PATCHLEVEL) +} // namespace + +class PoolingForwardImpl::AlgoBase : public Algorithm { +public: + enum class AlgoType : uint32_t { + CUDA_CUDNN, +#if CUDNN_VERSION >= 6000 + CUDA_CUDNN_MAXDETERMINISTIC, +#endif + CUDA_CHWN4, + CUDA_NCHW4, + CUDA_NCHW32, + CUDA_NHWC, + CUDA_NCHW64 + }; + using Mapper = std::unordered_map; + + AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::CUDA; } + struct SizeArgs { + HandleImpl* handle; + PoolingForwardImpl* opr; + const TensorLayout *layout_src, *layout_dst; + + std::string to_string() const; + SizeArgs(PoolingForwardImpl* opr, const TensorLayout& src, + const TensorLayout& dst); + }; + struct ExecArgs : public SizeArgs { + const TensorND *src_tensor, *dst_tensor; + Workspace workspace; + + ExecArgs(PoolingForwardImpl* opr, _megdnn_tensor_in src, + _megdnn_tensor_out dst, _megdnn_workspace workspace); + }; + + virtual bool is_available(const SizeArgs& args) const = 0; + size_t get_workspace_in_bytes(const SizeArgs& args) const; + virtual void exec(const ExecArgs& args) const = 0; + + bool is_available_attribute( + const SizeArgs& args, + const AlgoAttribute& positive_attr = AlgoAttribute::REPRODUCIBLE, + const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT) { + return contain_attribute_all(positive_attr) && + !contain_attribute_any(negative_attr) && is_available(args); + } + +protected: + ~AlgoBase() = default; + virtual WorkspaceBundle get_workspace_bundle(void* ptr, + const SizeArgs& args) const; +}; + +class PoolingForwardImpl::AlgoCUDNN final : public AlgoBase { + std::string m_algo_name; + +public: + AlgoCUDNN(std::string name) : m_algo_name(name) {} + + bool is_available(const SizeArgs& args) const override; + void init_mode(const ExecArgs& args, cudnnPoolingMode_t& mode) const; + void exec(const ExecArgs& args) const override; + + const char* name() const override { return m_algo_name.c_str(); } + AlgoAttribute attribute() const override { + return AlgoAttribute::REPRODUCIBLE; + } + + 
MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN) + + std::string param() const override { return m_algo_name; } +}; + +#if CUDNN_VERSION >= 6000 +class PoolingForwardImpl::AlgoCUDNNMAXDETERMINISTIC final : public AlgoBase { + std::string m_algo_name; + +public: + AlgoCUDNNMAXDETERMINISTIC(std::string name) : m_algo_name(name) {} + + bool is_available(const SizeArgs& args) const override; + void init_mode(const ExecArgs& args, cudnnPoolingMode_t& mode) const; + void exec(const ExecArgs& args) const override; + + const char* name() const override { return m_algo_name.c_str(); } + AlgoAttribute attribute() const override { + return AlgoAttribute::REPRODUCIBLE; + } + + MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN_MAXDETERMINISTIC) + + std::string param() const override { return m_algo_name; } +}; +#endif + +#define ALGO_LAYOUT_POOLING_IMPL(_layout) \ + class PoolingForwardImpl::Algo##_layout final : public AlgoBase { \ + std::string m_algo_name; \ + \ + public: \ + Algo##_layout( \ + std::string name = std::string("CUDA_").append(#_layout)) \ + : m_algo_name(name) {} \ + bool is_available(const SizeArgs& args) const override; \ + void exec(const ExecArgs& args) const override; \ + const char* name() const override { return m_algo_name.c_str(); } \ + AlgoAttribute attribute() const override { \ + return AlgoAttribute::REPRODUCIBLE; \ + } \ + MEGDNN_DECL_ALGO_TYPE(CUDA_##_layout) + +ALGO_LAYOUT_POOLING_IMPL(CHWN4)}; +ALGO_LAYOUT_POOLING_IMPL(NCHW4)}; +ALGO_LAYOUT_POOLING_IMPL(NCHW32)}; +ALGO_LAYOUT_POOLING_IMPL(NHWC)}; +ALGO_LAYOUT_POOLING_IMPL(NCHW64) //{ +protected: + WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) + const override; + +private: + inline void deduce_reformat_layout( + std::unique_ptr & relayout, + const TensorLayout& src_layout, TensorLayout& dst_layout, + RelayoutFormat::Param::Mode mode, const int oc, const int group) + const; + void get_inner_layout(const TensorLayout& src, const TensorLayout& dst, + TensorLayout& inner_src, TensorLayout& inner_dst, + 
Handle* handle, + PoolingForwardImpl::Param::Format format) const; +}; + +#undef ALGO_LAYOUT_POOLING_IMPL + +class PoolingForwardImpl::AlgoPack : NonCopyableObj { +private: + AlgoBase::Mapper m_all_algos_map; + +public: + AlgoPack(); + AlgoCUDNN algo_cudnn{DEF_NAME(cudnnForward)}; +#if CUDNN_VERSION >= 6000 + AlgoCUDNNMAXDETERMINISTIC algo_cudnn_max_deterministic{ + DEF_NAME(cudnnForwardMaxDeterministic)}; +#endif + AlgoCHWN4 algo_chwn4; + AlgoNCHW4 algo_nchw4; + AlgoNCHW32 algo_nchw32; + AlgoNHWC algo_nhwc; + AlgoNCHW64 algo_nchw64; + + std::vector all_algos; + + const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; } +}; + +class PoolingBackwardImpl::AlgoBase : public Algorithm { +public: + enum class AlgoType : uint32_t { CUDA_CUDNN }; + using Mapper = std::unordered_map; + + AlgoBase() : Algorithm() { m_handle_type = Handle::HandleType::CUDA; } + struct SizeArgs { + HandleImpl* handle; + PoolingBackwardImpl* opr; + const TensorLayout *layout_src, *layout_dst, *layout_diff, *layout_grad; + + std::string to_string() const; + SizeArgs(PoolingBackwardImpl* opr, const TensorLayout& src, + const TensorLayout& dst, const TensorLayout& diff, + const TensorLayout& grad); + }; + struct ExecArgs : public SizeArgs { + const TensorND *src_tensor, *dst_tensor, *diff_tensor, *grad_tensor; + Workspace workspace; + + ExecArgs(PoolingBackwardImpl* opr, _megdnn_tensor_in src, + _megdnn_tensor_in dst, _megdnn_tensor_in diff, + _megdnn_tensor_out grad, _megdnn_workspace workspace); + }; + + virtual bool is_available(const SizeArgs& args) const = 0; + size_t get_workspace_in_bytes(const SizeArgs& args) const; + virtual void exec(const ExecArgs& args) const = 0; + + bool is_available_attribute( + const SizeArgs& args, + const AlgoAttribute& positive_attr = AlgoAttribute::REPRODUCIBLE, + const AlgoAttribute& negative_attr = AlgoAttribute::DEFAULT) { + return contain_attribute_all(positive_attr) && + !contain_attribute_any(negative_attr) && is_available(args); + } + 
+protected: + ~AlgoBase() = default; + virtual WorkspaceBundle get_workspace_bundle(void* ptr, + const SizeArgs& args) const; +}; + +class PoolingBackwardImpl::AlgoCUDNN final : public AlgoBase { + std::string m_algo_name; + bool m_is_reproducible; + +public: + AlgoCUDNN(std::string name, bool is_reproducible) + : m_algo_name(name), m_is_reproducible(is_reproducible) {} + + bool is_available(const SizeArgs& args) const override; + void init_mode(const ExecArgs& args, cudnnPoolingMode_t& mode) const; + void exec(const ExecArgs& args) const override; + + const char* name() const override { return m_algo_name.c_str(); } + AlgoAttribute attribute() const override { + auto ret = AlgoAttribute::DEFAULT; + if (m_is_reproducible) { + ret |= AlgoAttribute::REPRODUCIBLE; + } + return ret; + } + + MEGDNN_DECL_ALGO_TYPE(CUDA_CUDNN) + + std::string param() const override { return m_algo_name; } +}; + +class PoolingBackwardImpl::AlgoPack : NonCopyableObj { +private: + AlgoBase::Mapper m_all_algos_map; + +public: + AlgoPack(); + std::vector algo_cudnn; + std::vector all_algos; + + const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; } +}; + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/pooling/opr_impl.cpp b/dnn/src/cuda/pooling/opr_impl.cpp index 37ff5ee9c8e2c1904436f3fcee6e883bc7a33562..e5efb5e212c0256fd6ba096af8818093b5927668 100644 --- a/dnn/src/cuda/pooling/opr_impl.cpp +++ b/dnn/src/cuda/pooling/opr_impl.cpp @@ -6,275 +6,97 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
*/ #include "src/cuda/pooling/opr_impl.h" -#include "src/cuda/relayout_format/opr_impl.h" - +#include "./algo.h" #include "./pooling2d_qint.cuh" +#include "src/common/algo_chooser.h" +#include "src/cuda/relayout_format/opr_impl.h" #include "src/cuda/utils.h" namespace megdnn { namespace cuda { -namespace { -inline void deduce_reformat_layout(std::unique_ptr& relayout, - const TensorLayout& src_layout, - TensorLayout& dst_layout, - RelayoutFormat::Param::Mode mode, - const int oc = 0, const int group = 1) { - if (src_layout.ndim > 0) { - RelayoutFormat::Param trans_param; - trans_param.mode = mode; - trans_param.oc = oc; - trans_param.group = group; - relayout->param() = trans_param; - relayout->deduce_layout(src_layout, dst_layout); - } else { - dst_layout = src_layout; - } +size_t PoolingForwardImpl::get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& dst) { + AlgoBase::SizeArgs args(this, src, dst); + return get_algorithm(this, src, dst)->get_workspace_in_bytes(args); } -void get_inner_layout(const TensorLayout& src, const TensorLayout& dst, - TensorLayout& inner_src, TensorLayout& inner_dst, - Handle* handle, - PoolingForwardImpl::Param::Format format) { - bool is_nchw = format == PoolingForwardImpl::Param::Format::NCHW; - if (is_nchw) { - auto relayout_opr = handle->create_operator(); - deduce_reformat_layout(relayout_opr, src, inner_src, - RelayoutFormat::Param::Mode::NCHW_NCHW64, 0, 1); - deduce_reformat_layout(relayout_opr, dst, inner_dst, - RelayoutFormat::Param::Mode::NCHW_NCHW64, 0, 1); - } else { - megdnn_assert(0, "not support"); - } +const char* PoolingForwardImpl::get_algorithm_set_name() const { + return "CUDA_POOLING_FORWARD"; } -} // namespace -void PoolingForwardImpl::setup_descs(const TensorLayout& src, - const TensorLayout& dst) { - src_desc.set(src, param().format); - dst_desc.set(dst, param().format); - pooling_desc.set(this->param()); +std::vector +PoolingForwardImpl::get_all_algorithms(const TensorLayout& src, + const 
TensorLayout& dst) { + return megdnn::get_all_algorithms({this, src, dst}); } -WorkspaceBundle PoolingForwardImpl::get_workspace_bundle( - void* ptr, const TensorLayout& src, const TensorLayout& dst) const { - SmallVector sizes; - TensorLayout fsrc = src; - TensorLayout fdst = dst; - bool is_nchw = param().format == Param::Format::NCHW; - if ((src.dtype.enumv() == DTypeEnum::QuantizedS4 || - src.dtype.enumv() == DTypeEnum::Quantized4Asymm) && - (dst.dtype.enumv() == DTypeEnum::QuantizedS4 || - dst.dtype.enumv() == DTypeEnum::Quantized4Asymm) && - is_nchw) { - get_inner_layout(src, dst, fsrc, fdst, handle(), param().format); - sizes.push_back(fsrc.span().dist_byte()); - sizes.push_back(fdst.span().dist_byte()); - } else { - auto get_workspace = [&sizes](TensorLayout& layout) { - if (layout.dtype == dtype::BFloat16()) { - layout.dtype = dtype::Float32(); - sizes.push_back(layout.span().dist_byte()); - } - }; - get_workspace(fsrc); - get_workspace(fdst); +PoolingForwardImpl::Algorithm* PoolingForwardImpl::get_algorithm_heuristic( + const TensorLayout& src, const TensorLayout& dst, + size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr, + const AlgoAttribute& negative_attr) { + MEGDNN_MARK_USED_VAR(workspace_limit_in_bytes); + + AlgoBase::SizeArgs args(this, src, dst); + for (auto&& iter : sm_algo_pack.all_algos) { + if (iter->is_available_attribute(args, positive_attr, negative_attr)) { + return iter; + } } - return {ptr, std::move(sizes)}; + megdnn_throw( + ssprintf("require algorithm with attribute(%s) and without " + "attribute(%s), but can't get suitable algo.\n", + Algorithm::attribute_str(positive_attr).c_str(), + Algorithm::attribute_str(negative_attr).c_str())); + return nullptr; } void PoolingForwardImpl::exec(_megdnn_tensor_in ssrc, _megdnn_tensor_out sdst, _megdnn_workspace sworkspace) { check_exec(ssrc.layout, sdst.layout, sworkspace.size); - TensorND src = ssrc; - TensorND dst = sdst; - Param::Format inner_format = param().format; - auto 
wsb = - get_workspace_bundle(sworkspace.raw_ptr, ssrc.layout, sdst.layout); - auto ctypecvt = CompTypeCvter( - concrete_handle(this->handle()), &wsb); - bool is_nchw = param().format == Param::Format::NCHW; - if (ssrc.layout.dtype.enumv() == DTypeTrait::enumv) { - ctypecvt.src_to_comp_type(ssrc, src).src_to_comp_type(sdst, dst); - } else if ((ssrc.layout.dtype.enumv() == DTypeEnum::QuantizedS4 || - ssrc.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) && - (sdst.layout.dtype.enumv() == DTypeEnum::QuantizedS4 || - sdst.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) && - is_nchw) { - auto handle_ptr = handle(); - get_inner_layout(ssrc.layout, sdst.layout, src.layout, dst.layout, - handle_ptr, param().format); - src.raw_ptr = wsb.get(0); - dst.raw_ptr = wsb.get(1); - auto relayout_opr = handle_ptr->create_operator(); - RelayoutFormat::Param trans_param; - trans_param.mode = RelayoutFormat::Param::Mode::NCHW_NCHW64; - relayout_opr->param() = trans_param; - relayout_opr->exec(ssrc, src, {}); - inner_format = Param::Format::NCHW64; - } { - using Format = param::Pooling::Format; - if (param().format == Format::CHWN4) { - pooling2d::Param kern_param; - size_t c = src.layout[0], hi = src.layout[1], wi = src.layout[2], - n = src.layout[3], ho = dst.layout[1], wo = dst.layout[2]; - c = c * 4; - size_t ph = param().pad_h, pw = param().pad_w; - size_t window_h = param().window_h, window_w = param().window_w; - size_t sh = param().stride_h, sw = param().stride_w; - kern_param.n = n, kern_param.c = c, kern_param.hi = hi, - kern_param.wi = wi, kern_param.ho = ho, kern_param.wo = wo, - kern_param.ph = ph, kern_param.pw = pw, - kern_param.window_h = window_h, kern_param.window_w = window_w, - kern_param.sh = sh, kern_param.sw = sw; - auto&& stream = cuda_stream(handle()); - return pooling2d::do_pooling2d_int8_cdiv4hwn4( - src.compatible_ptr(), dst.compatible_ptr(), - kern_param, stream, static_cast(param().mode)); - } else if (param().format == Format::NCHW4) { - 
pooling2d::Param kern_param; - size_t n = src.layout[0], hi = src.layout[2], wi = src.layout[3], - c = src.layout[1], ho = dst.layout[2], wo = dst.layout[3]; - c = c * 4; - size_t ph = param().pad_h, pw = param().pad_w; - size_t window_h = param().window_h, window_w = param().window_w; - size_t sh = param().stride_h, sw = param().stride_w; - kern_param.n = n, kern_param.c = c, kern_param.hi = hi, - kern_param.wi = wi, kern_param.ho = ho, kern_param.wo = wo, - kern_param.ph = ph, kern_param.pw = pw, - kern_param.window_h = window_h, kern_param.window_w = window_w, - kern_param.sh = sh, kern_param.sw = sw; - auto&& stream = cuda_stream(handle()); - return pooling2d::do_pooling2d_int8_ncdiv4hw4( - src.compatible_ptr(), dst.compatible_ptr(), - kern_param, stream, static_cast(param().mode)); - } else if (param().format == Format::NCHW32) { - pooling2d::Param kern_param; - size_t n = src.layout[0], hi = src.layout[2], wi = src.layout[3], - c = src.layout[1], ho = dst.layout[2], wo = dst.layout[3]; - c = c * 32; - size_t ph = param().pad_h, pw = param().pad_w; - size_t window_h = param().window_h, window_w = param().window_w; - size_t sh = param().stride_h, sw = param().stride_w; - kern_param.n = n, kern_param.c = c, kern_param.hi = hi, - kern_param.wi = wi, kern_param.ho = ho, kern_param.wo = wo, - kern_param.ph = ph, kern_param.pw = pw, - kern_param.window_h = window_h, kern_param.window_w = window_w, - kern_param.sh = sh, kern_param.sw = sw; - auto&& stream = cuda_stream(handle()); - return pooling2d::do_pooling2d_int8_ncdiv32hw32( - src.compatible_ptr(), dst.compatible_ptr(), - kern_param, stream, static_cast(param().mode)); - } else if (param().format == Format::NCHW64 || - inner_format == Format::NCHW64) { - pooling2d::Param kern_param; - size_t n = src.layout[0], hi = src.layout[2], wi = src.layout[3], - c = src.layout[1], ho = dst.layout[2], wo = dst.layout[3]; - c = c * 64; - size_t ph = param().pad_h, pw = param().pad_w; - size_t window_h = param().window_h, 
window_w = param().window_w; - size_t sh = param().stride_h, sw = param().stride_w; - kern_param.n = n, kern_param.c = c, kern_param.hi = hi, - kern_param.wi = wi, kern_param.ho = ho, kern_param.wo = wo, - kern_param.ph = ph, kern_param.pw = pw, - kern_param.window_h = window_h, kern_param.window_w = window_w, - kern_param.sh = sh, kern_param.sw = sw; - bool uint_case = false; - int zero_point = 0; - if (src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) { - uint_case = true; - zero_point = src.layout.dtype.param() - .zero_point; - } - auto&& stream = cuda_stream(handle()); - pooling2d::do_pooling2d_int4_ncdiv64hw64( - (int8_t*)src.raw_ptr, (int8_t*)dst.raw_ptr, kern_param, - stream, static_cast(param().mode), uint_case, - zero_point); - if (sdst.layout.ndim == 4) { - auto relayout_opr = handle()->create_operator(); - RelayoutFormat::Param trans_param; - trans_param.mode = RelayoutFormat::Param::Mode::NCHW64_NCHW; - relayout_opr->param() = trans_param; - relayout_opr->exec(dst, sdst, {}); - } - return; - } else if (param().format == Format::NHWC && - (src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm || - src.layout.dtype.enumv() == DTypeEnum::QuantizedS4)) { - megdnn_assert(src.layout.dtype.enumv() == dst.layout.dtype.enumv(), - "src and dst dtype must equal"); - pooling2d::Param kern_param; - size_t n = src.layout[0], hi = src.layout[1], wi = src.layout[2], - c = src.layout[3], ho = dst.layout[1], wo = dst.layout[2]; - size_t ph = param().pad_h, pw = param().pad_w; - size_t window_h = param().window_h, window_w = param().window_w; - size_t sh = param().stride_h, sw = param().stride_w; - kern_param.n = n, kern_param.c = c, kern_param.hi = hi, - kern_param.wi = wi, kern_param.ho = ho, kern_param.wo = wo, - kern_param.ph = ph, kern_param.pw = pw, - kern_param.window_h = window_h, kern_param.window_w = window_w, - kern_param.sh = sh, kern_param.sw = sw; - bool uint_case = false; - int zero_point = 0; - if (src.layout.dtype.enumv() == 
DTypeEnum::Quantized4Asymm) { - uint_case = true; - zero_point = src.layout.dtype.param() - .zero_point; - } - auto&& stream = cuda_stream(handle()); - pooling2d::do_pooling2d_int4_nhwc( - (int8_t*)src.raw_ptr, (int8_t*)dst.raw_ptr, kern_param, - stream, static_cast(param().mode), uint_case, - zero_point); - return; - } - auto handle = cudnn_handle(this->handle()); - setup_descs(src.layout, dst.layout); - dt_float32 alpha = 1.0f, beta = 0.0f; - cudnn_check(cudnnPoolingForward(handle, pooling_desc.desc, &alpha, - src_desc.desc, src.raw_ptr, &beta, - dst_desc.desc, dst.raw_ptr)); + AlgoBase::ExecArgs args(this, ssrc, sdst, sworkspace); + auto algo = get_algorithm(this, ssrc.layout, sdst.layout); + algo->exec(args); } - if (ssrc.layout.dtype.enumv() == DTypeTrait::enumv) { - ctypecvt.comp_to_dst_type(dst, sdst); - } } -void PoolingBackwardImpl::setup_descs(const TensorLayout& src, - const TensorLayout& dst, - const TensorLayout& diff, - const TensorLayout& grad) { - src_desc.set(src); - dst_desc.set(dst); - diff_desc.set(diff); - grad_desc.set(grad); - pooling_desc.set(this->param()); +const char* PoolingBackwardImpl::get_algorithm_set_name() const { + return "CUDA_POOLING_BACKWARD"; } -WorkspaceBundle PoolingBackwardImpl::get_workspace_bundle( - void* ptr, const TensorLayout& src, const TensorLayout& dst, - const TensorLayout& diff, const TensorLayout& grad) const { - SmallVector sizes; - TensorLayout fsrc = src; - TensorLayout fdst = dst; - TensorLayout fdiff = diff; - TensorLayout fgrad = grad; - auto get_workspace = [&sizes](TensorLayout& layout) { - if (layout.dtype == dtype::BFloat16()) { - layout.dtype = dtype::Float32(); - sizes.push_back(layout.span().dist_byte()); +std::vector +PoolingBackwardImpl::get_all_algorithms(const TensorLayout& src, + const TensorLayout& dst, + const TensorLayout& diff, + const TensorLayout& grad) { + return megdnn::get_all_algorithms( + {this, src, dst, diff, grad}); +} + +PoolingBackwardImpl::Algorithm* 
PoolingBackwardImpl::get_algorithm_heuristic( + const TensorLayout& src, const TensorLayout& dst, + const TensorLayout& diff, const TensorLayout& grad, + size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr, + const AlgoAttribute& negative_attr) { + MEGDNN_MARK_USED_VAR(workspace_limit_in_bytes); + + AlgoBase::SizeArgs args(this, src, dst, diff, grad); + for (auto iter : sm_algo_pack.all_algos) { + if (iter->is_available_attribute(args, positive_attr, negative_attr)) { + return iter; } - }; - get_workspace(fsrc); - get_workspace(fdst); - get_workspace(fdiff); - get_workspace(fgrad); - return {ptr, std::move(sizes)}; + } + megdnn_throw( + ssprintf("require algorithm with attribute(%s) and without " + "attribute(%s), but can't get suitable algo.\n", + Algorithm::attribute_str(positive_attr).c_str(), + Algorithm::attribute_str(negative_attr).c_str())); + return nullptr; } void PoolingBackwardImpl::exec(_megdnn_tensor_in ssrc, _megdnn_tensor_in sdst, @@ -283,34 +105,23 @@ void PoolingBackwardImpl::exec(_megdnn_tensor_in ssrc, _megdnn_tensor_in sdst, _megdnn_workspace sworkspace) { check_exec(ssrc.layout, sdst.layout, sdiff.layout, sgrad.layout, sworkspace.size); - auto handle = cudnn_handle(this->handle()); - TensorND src = ssrc; - TensorND dst = sdst; - TensorND diff = sdiff; - TensorND grad = sgrad; - auto wsb = get_workspace_bundle(sworkspace.raw_ptr, ssrc.layout, - sdst.layout, sdiff.layout, sgrad.layout); - auto ctypecvt = CompTypeCvter( - concrete_handle(this->handle()), &wsb); - if (ssrc.layout.dtype.enumv() == DTypeTrait::enumv) { - ctypecvt.src_to_comp_type(ssrc, src) - .src_to_comp_type(sdst, dst) - .src_to_comp_type(sdiff, diff) - .src_to_comp_type(sgrad, grad); - } { - setup_descs(src.layout, dst.layout, diff.layout, grad.layout); - float alpha = 1.0f, beta = 0.0f; - cudnn_check(cudnnPoolingBackward( - handle, pooling_desc.desc, &alpha, dst_desc.desc, dst.raw_ptr, - diff_desc.desc, diff.raw_ptr, src_desc.desc, src.raw_ptr, &beta, - 
grad_desc.desc, grad.raw_ptr)); - } - if (ssrc.layout.dtype.enumv() == DTypeTrait::enumv) { - ctypecvt.comp_to_dst_type(grad, sgrad); + AlgoBase::ExecArgs args(this, ssrc, sdst, sdiff, sgrad, sworkspace); + auto algo = get_algorithm(this, ssrc.layout, sdst.layout, sdiff.layout, + sgrad.layout); + algo->exec(args); } } +size_t PoolingBackwardImpl::get_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& dst, + const TensorLayout& diff, + const TensorLayout& grad) { + AlgoBase::SizeArgs args(this, src, dst, diff, grad); + return get_algorithm(this, src, dst, diff, grad) + ->get_workspace_in_bytes(args); +} + } // namespace cuda } // namespace megdnn diff --git a/dnn/src/cuda/pooling/opr_impl.h b/dnn/src/cuda/pooling/opr_impl.h index c5fdce1ccf2390ac766b12d3812093f8dbced8e0..a8c3e65f5bc0b8d2fd2fa92d3ee1cf7a993282b1 100644 --- a/dnn/src/cuda/pooling/opr_impl.h +++ b/dnn/src/cuda/pooling/opr_impl.h @@ -23,16 +23,45 @@ public: void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) override; size_t get_workspace_in_bytes(const TensorLayout& src, - const TensorLayout& dst) override { - return get_workspace_bundle(nullptr, src, dst).total_size_in_bytes(); + const TensorLayout& dst) override; + + const char* get_algorithm_set_name() const override; + Algorithm* get_algorithm_from_desc(const AlgorithmDesc& desc) override; + + AlgorithmInfo get_algorithm_info_heuristic( + const TensorLayout& src, const TensorLayout& dst, + size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr, + const AlgoAttribute& negative_attr) { + return get_algorithm_heuristic(src, dst, workspace_limit_in_bytes, + positive_attr, negative_attr) + ->info(); } + class AlgoBase; + class AlgoCUDNN; +#if CUDNN_VERSION >= 6000 + class AlgoCUDNNMAXDETERMINISTIC; +#endif + class AlgoCHWN4; + class AlgoNCHW4; + class AlgoNCHW32; + class AlgoNHWC; + class AlgoNCHW64; + + class AlgoPack; + + static const AlgoPack& algo_pack() { return sm_algo_pack; } + 
+protected: + std::vector get_all_algorithms( + const TensorLayout& src, const TensorLayout& dst) override; + Algorithm* get_algorithm_heuristic( + const TensorLayout& src, const TensorLayout& dst, + size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr, + const AlgoAttribute& negative_attr) override; + private: - TensorDesc src_desc, dst_desc; - PoolingDesc pooling_desc; - void setup_descs(const TensorLayout& src, const TensorLayout& dst); - WorkspaceBundle get_workspace_bundle(void* ptr, const TensorLayout& src, - const TensorLayout& dst) const; + static AlgoPack sm_algo_pack; }; class PoolingBackwardImpl final : public PoolingBackward { @@ -44,23 +73,43 @@ public: size_t get_workspace_in_bytes(const TensorLayout& src, const TensorLayout& dst, const TensorLayout& diff, - const TensorLayout& grad) override { - return get_workspace_bundle(nullptr, src, dst, diff, grad) - .total_size_in_bytes(); + const TensorLayout& grad) override; + + const char* get_algorithm_set_name() const override; + Algorithm* get_algorithm_from_desc(const AlgorithmDesc& desc) override; + + AlgorithmInfo get_algorithm_info_heuristic( + const TensorLayout& src, const TensorLayout& dst, + const TensorLayout& diff, const TensorLayout& grad, + size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr, + const AlgoAttribute& negative_attr) { + return get_algorithm_heuristic(src, dst, diff, grad, + workspace_limit_in_bytes, positive_attr, + negative_attr) + ->info(); } + class AlgoBase; + class AlgoCUDNN; + class AlgoPack; + + static const AlgoPack& algo_pack() { return sm_algo_pack; } + +protected: + std::vector get_all_algorithms( + const TensorLayout& src, const TensorLayout& dst, + const TensorLayout& diff, const TensorLayout& grad) override; + Algorithm* get_algorithm_heuristic( + const TensorLayout& src, const TensorLayout& dst, + const TensorLayout& diff, const TensorLayout& grad, + size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr, + const 
AlgoAttribute& negative_attr) override; + private: - TensorDesc src_desc, dst_desc, diff_desc, grad_desc; - PoolingDesc pooling_desc; - void setup_descs(const TensorLayout& src, const TensorLayout& dst, - const TensorLayout& diff, const TensorLayout& grad); - WorkspaceBundle get_workspace_bundle(void* ptr, const TensorLayout& src, - const TensorLayout& dst, - const TensorLayout& diff, - const TensorLayout& grad) const; + static AlgoPack sm_algo_pack; }; -} // namespace cuda -} // namespace megdnn +} // namespace cuda +} // namespace megdnn // vim: syntax=cpp.doxygen diff --git a/dnn/src/naive/convolution/algorithms.h b/dnn/src/naive/convolution/algorithms.h index dd0bee9411ca2d2f2f27f9b0c34e7f4404401b03..e6db7eeaa171975a52ce7e26ef1249f7f29b7297 100644 --- a/dnn/src/naive/convolution/algorithms.h +++ b/dnn/src/naive/convolution/algorithms.h @@ -57,6 +57,24 @@ class DefaultBatchConvBiasForwardAlgorithm final const char* name() const override { return "DEFAULT"; } }; +class DefaultPoolingForwardAlgorithm final + : public megdnn::PoolingForward::Algorithm { + AlgoAttribute attribute() const override { + return AlgoAttribute::REPRODUCIBLE | AlgoAttribute::NAIVE; + } + uint32_t type() const override { return 0; } + const char* name() const override { return "DEFAULT"; } +}; + +class DefaultPoolingBackwardAlgorithm final + : public megdnn::PoolingBackward::Algorithm { + AlgoAttribute attribute() const override { + return AlgoAttribute::REPRODUCIBLE | AlgoAttribute::NAIVE; + } + uint32_t type() const override { return 0; } + const char* name() const override { return "DEFAULT"; } +}; + } // namespace naive } // namespace megdnn diff --git a/dnn/src/naive/handle.cpp b/dnn/src/naive/handle.cpp index 7cb31fb06935a497235508af7606952c36131db7..4ec7e555e1dee8e58f97164adb614097044c3952 100644 --- a/dnn/src/naive/handle.cpp +++ b/dnn/src/naive/handle.cpp @@ -110,6 +110,9 @@ DefaultLocalShareBackwardFilterAlgorithm DefaultMatrixMulAlgorithm HandleImpl::m_default_matmul_fwd_algo; 
DefaultBatchedMatrixMulAlgorithm HandleImpl::m_default_batched_matmul_fwd_algo; +DefaultPoolingForwardAlgorithm HandleImpl::m_default_pooling_fwd_algo; +DefaultPoolingBackwardAlgorithm HandleImpl::m_default_pooling_bwd_algo; + HandleImpl::HandleImpl(megcoreComputingHandle_t computing_handle, HandleType type) : HandleImplHelper(computing_handle, type), diff --git a/dnn/src/naive/handle.h b/dnn/src/naive/handle.h index d435f3324f41916dc53aaa4501469f3bfed14ae5..e7e1932a50edae39d6e592873519134da70757f5 100644 --- a/dnn/src/naive/handle.h +++ b/dnn/src/naive/handle.h @@ -51,6 +51,9 @@ class HandleImpl : public HandleImplHelper { static DefaultMatrixMulAlgorithm m_default_matmul_fwd_algo; static DefaultBatchedMatrixMulAlgorithm m_default_batched_matmul_fwd_algo; + static DefaultPoolingForwardAlgorithm m_default_pooling_fwd_algo; + static DefaultPoolingBackwardAlgorithm m_default_pooling_bwd_algo; + //! move KernFunc to alloc_kern()->func, destruct func, and call dispatch template void move_kern_func_to_new_kern_and_dispatch(T& func) { @@ -122,6 +125,14 @@ public: return &m_default_batched_matmul_fwd_algo; } + PoolingForward::Algorithm* default_pooling_fwd_algo() { + return &m_default_pooling_fwd_algo; + } + + PoolingBackward::Algorithm* default_pooling_bwd_algo() { + return &m_default_pooling_bwd_algo; + } + Relayout* relayout_opr() override { return get_helper_opr(this); } diff --git a/dnn/src/naive/pooling/opr_impl.cpp b/dnn/src/naive/pooling/opr_impl.cpp index 82cf4d3141a95d19cfad94cecb3a8f09965ffdb4..5242d4f9e9b796051ab6a1b02e92353f808e6625 100644 --- a/dnn/src/naive/pooling/opr_impl.cpp +++ b/dnn/src/naive/pooling/opr_impl.cpp @@ -582,6 +582,52 @@ void PoolingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, megdnn_assert_internal(0); } +PoolingForward::Algorithm* PoolingForwardImpl::get_algorithm_from_desc( + const AlgorithmDesc& desc) { + Algorithm* ret = + static_cast(handle())->default_pooling_fwd_algo(); + megdnn_assert(desc == ret->info().desc); 
+ return ret; +} + +std::vector PoolingForwardImpl::get_all_algorithms( + const TensorLayout&, const TensorLayout&) { + return {static_cast(handle())->default_pooling_fwd_algo()}; +} + +Algorithm* PoolingForwardImpl::get_algorithm_heuristic( + const TensorLayout& /*src*/, const TensorLayout& /*dst*/, + size_t /*workspace_limit_in_bytes*/, const AlgoAttribute& positive_attr, + const AlgoAttribute& negative_attr) { + auto algo = static_cast(handle())->default_pooling_fwd_algo(); + algo->check_attribute(positive_attr, negative_attr); + return algo; +} + +Algorithm* PoolingBackwardImpl::get_algorithm_from_desc( + const AlgorithmDesc& desc) { + Algorithm* ret = + static_cast(handle())->default_pooling_bwd_algo(); + megdnn_assert(desc == ret->info().desc); + return ret; +} + +std::vector PoolingBackwardImpl::get_all_algorithms( + const TensorLayout& /*src*/, const TensorLayout& /*dst*/, + const TensorLayout& /*diff*/, const TensorLayout& /*grad*/) { + return {static_cast(handle())->default_pooling_bwd_algo()}; +} + +Algorithm* PoolingBackwardImpl::get_algorithm_heuristic( + const TensorLayout& /*src*/, const TensorLayout& /*dst*/, + const TensorLayout& /*diff*/, const TensorLayout& /*grad*/, + size_t /*workspace_limit_in_bytes*/, const AlgoAttribute& positive_attr, + const AlgoAttribute& negative_attr) { + auto algo = static_cast(handle())->default_pooling_bwd_algo(); + algo->check_attribute(positive_attr, negative_attr); + return algo; +} + WorkspaceBundle PoolingBackwardImpl::get_workspace_bundle( void* ptr, const TensorLayout& src, const TensorLayout& dst, const TensorLayout& diff, const TensorLayout& grad) const { diff --git a/dnn/src/naive/pooling/opr_impl.h b/dnn/src/naive/pooling/opr_impl.h index 1590785195dff32d3a82432a2eaa1a3af07e80ed..fe34fbf7b33701b5e3195b331290e1c9c8df2ff7 100644 --- a/dnn/src/naive/pooling/opr_impl.h +++ b/dnn/src/naive/pooling/opr_impl.h @@ -26,6 +26,21 @@ class PoolingForwardImpl: public PoolingForward { private: WorkspaceBundle 
get_workspace_bundle(void* ptr, const TensorLayout&, const TensorLayout&) const; + + const char* get_algorithm_set_name() const override { + return "DEFALUT"; + } + + Algorithm* get_algorithm_from_desc(const AlgorithmDesc&) override; + + std::vector get_all_algorithms( + const TensorLayout& src, const TensorLayout& dst) override; + + Algorithm* get_algorithm_heuristic( + const TensorLayout& src, const TensorLayout& dst, + size_t workspace_limit_in_bytes, + const AlgoAttribute& positive_attr, + const AlgoAttribute& negative_attr) override; }; class PoolingBackwardImpl : public PoolingBackward { @@ -38,6 +53,20 @@ public: const TensorLayout&, const TensorLayout&) override; + const char* get_algorithm_set_name() const override { return "DEFALUT"; } + + Algorithm* get_algorithm_from_desc(const AlgorithmDesc&) override; + + std::vector get_all_algorithms( + const TensorLayout& src, const TensorLayout& dst, + const TensorLayout& diff, const TensorLayout& grad) override; + + Algorithm* get_algorithm_heuristic( + const TensorLayout& src, const TensorLayout& dst, + const TensorLayout& diff, const TensorLayout& grad, + size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr, + const AlgoAttribute& negative_attr) override; + private: WorkspaceBundle get_workspace_bundle(void* ptr, const TensorLayout&, const TensorLayout&, diff --git a/dnn/test/common/opr_algo_proxy.h b/dnn/test/common/opr_algo_proxy.h index 79361f07e5005766e280e31dfccfc29f1736e841..3ee9f746f1e97b9cc2c3e6551d9630929ec40d3d 100644 --- a/dnn/test/common/opr_algo_proxy.h +++ b/dnn/test/common/opr_algo_proxy.h @@ -46,6 +46,12 @@ struct AlgoProxy; } \ } +#define LAYOUTS layouts[0], layouts[1] +#define TENSORS tensors[0], tensors[1] +DEF_ALGO_PROXY(2); +#undef LAYOUTS +#undef TENSORS + #define LAYOUTS layouts[0], layouts[1], layouts[2] #define TENSORS tensors[0], tensors[1], tensors[2] DEF_ALGO_PROXY(3); diff --git a/dnn/test/cuda/pooling.cpp b/dnn/test/cuda/pooling.cpp index 
d7dc1faed2b8bd6b1c09eb84b35be1de834013b2..f2c2297e6bfee56f676e49482194a6469570db85 100644 --- a/dnn/test/cuda/pooling.cpp +++ b/dnn/test/cuda/pooling.cpp @@ -21,6 +21,13 @@ #include #include "test/cuda/benchmark.h" +namespace { +#define V1(v) #v +#define V(v) V1(v) +#define DEF_NAME(NAME) \ +#NAME "v" V(CUDNN_MAJOR) "." V(CUDNN_MINOR) "." V(CUDNN_PATCHLEVEL) +} // namespace + namespace megdnn { namespace test { @@ -263,19 +270,24 @@ TEST_F(CUDA, POOLING_FORWARD_NCHW_Q4) { checker.set_param(param).exec({{20, 24, 22, 33}, {}}); } -TEST_F(CUDA, POOLING_FORWARD_NCHW4) { +TEST_F(CUDA, POOLING_FORWARD_NCHW4_NCHW32) { require_compute_capability(7, 5); using Param = param::Pooling; Checker checker(handle_cuda()); Param param; checker.set_dtype(0, dtype::QuantizedS8(0.1f)); - param.format = Param::Format::NCHW4; checker.set_epsilon(1 + 1e-3); - checker.set_param(param).exec({{20, 3, 50, 50, 4}, {}}); - param.mode = Param::Mode::AVERAGE; - checker.set_param(param).exec({{20, 3, 50, 50, 4}, {}}); - param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING; - checker.set_param(param).exec({{20, 3, 50, 50, 4}, {}}); + checker.set_before_exec_callback( + AlgoChecker(DEF_NAME(cudnnForward))); + for (auto format : {Param::Format::NCHW4, Param::Format::NCHW32}) { + param.format = format; + param.mode = Param::Mode::MAX; + checker.set_param(param).exec({{4, 3, 28, 28, 32}, {}}); + param.mode = Param::Mode::AVERAGE; + checker.set_param(param).exec({{4, 3, 28, 28, 64}, {}}); + param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING; + checker.set_param(param).exec({{4, 3, 28, 28, 32}, {}}); + } } #if CUDNN_VERSION >= 7500 @@ -288,6 +300,8 @@ TEST_F(CUDA, POOLING_FORWARD_NCHW32) { auto i8_max = std::numeric_limits().max(); UniformIntRNG int_rng{i8_min, i8_max}; checker.set_dtype(0, dtype::QuantizedS8(0.1f)); + checker.set_before_exec_callback( + AlgoChecker("CUDA_NCHW32")); param.format = Param::Format::NCHW32; checker.set_epsilon(1e-3).set_rng(0, &int_rng); 
checker.set_param(param).exec({{64, 8, 28, 28, 32}, {}}); @@ -394,6 +408,7 @@ TEST_F(CUDA, POOLING_FORWARD_INT8_NCHW4) { UniformIntRNG int_rng{i8_min, i8_max}; checker.set_dtype(0, dtype::QuantizedS8(0.1f)); param.format = Param::Format::NCHW4; + checker.set_before_exec_callback(AlgoChecker("CUDA_NCHW4")); for (auto mode : {Param::Mode::MAX, Param::Mode::AVERAGE, Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING}) { param.mode = mode; @@ -413,6 +428,8 @@ TEST_F(CUDA, POOLING_FORWARD_INT8_NCHW32) { auto i8_max = std::numeric_limits().max(); UniformIntRNG int_rng{i8_min, i8_max}; checker.set_dtype(0, dtype::QuantizedS8(0.1f)); + checker.set_before_exec_callback( + AlgoChecker("CUDA_NCHW32")); param.format = Param::Format::NCHW32; for (auto mode : {Param::Mode::MAX, Param::Mode::AVERAGE, Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING}) { diff --git a/src/opr/impl/dnn/pooling.cpp b/src/opr/impl/dnn/pooling.cpp index 21fc94806d2fc213955e5426fe351fea1b2a610a..d7dd6aa361c724aa7b1c2d433363408608ea3855 100644 --- a/src/opr/impl/dnn/pooling.cpp +++ b/src/opr/impl/dnn/pooling.cpp @@ -10,14 +10,52 @@ */ #include "megbrain/opr/dnn/pooling.h" #include "megbrain/graph/grad_impl.h" - +#include "megbrain/opr/search_policy/algo_chooser.h" #include "../internal/megdnn_opr_wrapper.inl" +#include "../search_policy/workspace_need_limit_getter.inl" + using namespace mgb; using namespace opr; MGB_DYN_TYPE_OBJ_FINAL_IMPL(PoolingForward); -MEGDNN_OPR_INIT1(PoolingForward, "pooling") + +PoolingForward::PoolingForward(VarNode* i0, const Param& param, + const ExecutionPolicy& policy, + const OperatorNodeConfig& config) + : Super(OperatorNodeBaseCtorParam{ + i0->owner_graph(), config, "pooling", {i0}}) { + init_megdnn_opr(*this, param); + add_input({i0}); + m_policy = policy; + + intl::MegDNNOprInitPostCtor::apply(*this); +} + +SymbolVar PoolingForward::make(SymbolVar i0, const Param& param, + const OperatorNodeConfig& config, + const ExecutionPolicy& policy) { + 
intl::MegDNNOprInitInputsModifier::apply(param, {&i0}); + return i0.insert_single_output_opr(i0.node(), param, policy, + config); +} + +void PoolingForward::init_output_static_infer_desc() { + Super::set_nr_managed_outputs(this->output().size() - 1); + Super::Super::init_output_static_infer_desc(); + init_output_static_infer_desc_workspace( + intl::AutoAddWorkspaceNeedLimitGetter::val); +} + +size_t PoolingForward::get_workspace_size_bytes( + const TensorShapeArray& input_shapes, + const TensorShapeArray& output_shapes) const { + return AlgoChooser::setup_algo( + {TensorLayout{input_shapes[0], input(0)->dtype(), + input(0)->format()}, + {output_shapes[0], output(0)->dtype(), output(0)->format()}}, + megdnn_opr(), this, false); +} #if MGB_ENABLE_GRAD MGB_IMPL_OPR_GRAD(PoolingForward) { @@ -29,7 +67,41 @@ MGB_IMPL_OPR_GRAD(PoolingForward) { #endif MGB_DYN_TYPE_OBJ_FINAL_IMPL(PoolingBackward); -MEGDNN_OPR_INIT3(PoolingBackward, "pooling_bwd", 0, true); + +PoolingBackward::PoolingBackward(VarNode* i0, VarNode* i1, VarNode* i2, + const Param& param, + const ExecutionPolicy& policy, + const OperatorNodeConfig& config) + : Super( + OperatorNodeBaseCtorParam{ + i0->owner_graph(), config, "pooling_bwd", {i0}}, + 0, true) { + init_megdnn_opr(*this, param); + add_input({i0, i1, i2}); + intl::MegDNNOprInitPostCtor::apply(*this); +} + +SymbolVar PoolingBackward::make(SymbolVar i0, SymbolVar i1, SymbolVar i2, + const Param& param, + const OperatorNodeConfig& config, + const ExecutionPolicy& policy) { + intl::MegDNNOprInitInputsModifier::apply(param, + {&i0, &i1, &i2}); + return i0.insert_single_output_opr( + i0.node(), i1.node(), i2.node(), param, policy, config); +} + +size_t PoolingBackward::get_workspace_size_bytes( + const TensorShapeArray& input_shapes, + const TensorShapeArray& output_shapes) const { + return AlgoChooser::setup_algo( + {TensorLayout{input_shapes[0], input(0)->dtype(), + input(0)->format()}, + {input_shapes[1], input(1)->dtype(), input(1)->format()}, + 
{input_shapes[2], input(2)->dtype(), input(2)->format()}, + {output_shapes[0], output(0)->dtype(), output(0)->format()}}, + megdnn_opr(), this, false); +} // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/src/opr/impl/search_policy/algo_chooser.cpp b/src/opr/impl/search_policy/algo_chooser.cpp index 51bab36453ee6c1c10af027cfcfe0353d996d1d9..ed9932cf9aa83d2eecc3bcb0a4b931d8b7a81338 100644 --- a/src/opr/impl/search_policy/algo_chooser.cpp +++ b/src/opr/impl/search_policy/algo_chooser.cpp @@ -175,6 +175,8 @@ cb(DEFORMABLE_CONV_BACKWARD_DATA, DeformableConvBackwardData); cb(DEFORMABLE_CONV_BACKWARD_FILTER, DeformableConvBackwardFilter); cb(BATCH_CONV_FORWARD, BatchConvBiasForward); cb(CONVBIAS_FORWARD, ConvBiasForward); +cb(POOLING_FORWARD, PoolingForward); +cb(POOLING_BACKWARD, PoolingBackward); #undef cb @@ -195,7 +197,9 @@ cb(CONVBIAS_FORWARD, ConvBiasForward); cb(DEFORMABLE_CONV_BACKWARD_DATA, stmt) \ cb(DEFORMABLE_CONV_BACKWARD_FILTER, stmt) \ cb(BATCH_CONV_FORWARD, stmt) \ - cb(CONVBIAS_FORWARD, stmt) + cb(CONVBIAS_FORWARD, stmt) \ + cb(POOLING_FORWARD, stmt) \ + cb(POOLING_BACKWARD, stmt) // clang-format on #define _OPR_TYPE_CASE(_opr_type, _stmt) \ @@ -521,11 +525,14 @@ AlgoChooser<Opr>::AlgoChooserHelper::AlgoChooserHelper( mgb_assert(m_fastrun_layouts.size() == layouts.size()); - static_assert(std::tuple_size<FixedTensorLayouts>::value == 3 || - std::tuple_size<FixedTensorLayouts>::value == 5 || - std::tuple_size<FixedTensorLayouts>::value == 8, - "Convolution AlgoChooser assumes arity = 3 , 5 or 8 (for " - "deformable conv)"); + static_assert( + std::tuple_size<FixedTensorLayouts>::value == 2 || + std::tuple_size<FixedTensorLayouts>::value == 3 || + std::tuple_size<FixedTensorLayouts>::value == 4 || + std::tuple_size<FixedTensorLayouts>::value == 5 || + std::tuple_size<FixedTensorLayouts>::value == 8, + "Pooling assumes arity = 2 or 4, Convolution AlgoChooser assumes " + "arity = 3, 5 or 8 (for deformable conv)"); } template diff --git a/src/opr/impl/search_policy/profiler.cpp b/src/opr/impl/search_policy/profiler.cpp index 
705c02b27afa77cc95e6784fd10da6d7d1161ba9..f3befdc2f9391c3e6db89d6e285c61f4d135310d 100644 --- a/src/opr/impl/search_policy/profiler.cpp +++ b/src/opr/impl/search_policy/profiler.cpp @@ -284,7 +284,7 @@ typename TimedProfiler::TResult TimedProfiler::prof_impl( mdn_workspace), std::forward_as_tuple(layouts[0], inp_val[1].as_megdnn()), - array_skip<2>(layouts)); + array_skip(layouts)); }); } }); diff --git a/src/opr/include/megbrain/opr/dnn/pooling.h b/src/opr/include/megbrain/opr/dnn/pooling.h index 83b7ba804a6ae83421eccfe75680c916da201f4a..54cfadd6688d1e37cbf7f5a12ce893625789eccc 100644 --- a/src/opr/include/megbrain/opr/dnn/pooling.h +++ b/src/opr/include/megbrain/opr/dnn/pooling.h @@ -12,34 +12,50 @@ #pragma once #include "megbrain/opr/internal/megdnn_opr_wrapper.h" +#include "megbrain/opr/search_policy/algo_chooser_helper.h" #include "megdnn/oprs.h" namespace mgb { namespace opr { MGB_DEFINE_OPR_CLASS(PoolingForward, - intl::MegDNNOprWrapperFwd) // { - - public: - PoolingForward(VarNode *src, const Param ¶m, - const OperatorNodeConfig &config); - static SymbolVar make(SymbolVar src, const Param ¶m, - const OperatorNodeConfig &config = {}); + intl::MegDNNOprWrapperFwd, + public mixin::AlgoChooserHelper) //{ +public: + PoolingForward(VarNode * src, const Param& param, + const ExecutionPolicy& policy, + const OperatorNodeConfig& config); + static SymbolVar make(SymbolVar src, const Param& param, + const OperatorNodeConfig& config = {}, + const ExecutionPolicy& policy = {}); + + void init_output_static_infer_desc() override; + + size_t get_workspace_size_bytes(const TensorShapeArray& input_shapes, + const TensorShapeArray& output_shapes) + const override; }; using Pooling = PoolingForward; MGB_DEFINE_OPR_CLASS(PoolingBackward, - intl::MegDNNOprWrapperBwd) // { - - public: - PoolingBackward(VarNode *src, VarNode *dst, VarNode *diff, - const Param ¶m, const OperatorNodeConfig &config); - static SymbolVar make(SymbolVar src, SymbolVar dst, SymbolVar diff, - const Param 
¶m, - const OperatorNodeConfig &config = {}); + intl::MegDNNOprWrapperBwd, + public mixin::AlgoChooserHelper) //{ +public: + PoolingBackward(VarNode * src, VarNode * dst, VarNode * diff, + const Param& param, const ExecutionPolicy& policy, + const OperatorNodeConfig& config); + + static SymbolVar make(SymbolVar src, SymbolVar dst, SymbolVar diff, + const Param& param, + const OperatorNodeConfig& config = {}, + const ExecutionPolicy& policy = {}); + + size_t get_workspace_size_bytes(const TensorShapeArray& input_shapes, + const TensorShapeArray& output_shapes) + const override final; }; -} // namespace opr -} // namespace mgb +} // namespace opr +} // namespace mgb // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/src/opr/include/megbrain/opr/search_policy/algo_chooser.h b/src/opr/include/megbrain/opr/search_policy/algo_chooser.h index 599c94895a794690684de153a82b9e9203e8fdc0..fdde56cb37c3e742668d075d60e5f5adb5e59388 100644 --- a/src/opr/include/megbrain/opr/search_policy/algo_chooser.h +++ b/src/opr/include/megbrain/opr/search_policy/algo_chooser.h @@ -18,6 +18,7 @@ #include "megbrain/opr/search_policy/algo_chooser_helper.h" #include "megbrain/opr/search_policy/profiler.h" #include "megbrain/opr/dnn/convolution.h" +#include "megbrain/opr/dnn/pooling.h" #include "megbrain/opr/blas.h" #include "megdnn/oprs/base.h" diff --git a/src/opr/include/megbrain/opr/search_policy/profiler.h b/src/opr/include/megbrain/opr/search_policy/profiler.h index 045c33f749247143cc7c86047f8f05bfa027ca79..062e58671ee3cb1adf674ac7a4ffe18f31245e8a 100644 --- a/src/opr/include/megbrain/opr/search_policy/profiler.h +++ b/src/opr/include/megbrain/opr/search_policy/profiler.h @@ -40,7 +40,9 @@ namespace opr { cb(DeformableConvBackwardData) \ cb(BatchConvBiasForward) \ cb(MatrixMul) \ - cb(BatchedMatrixMul) + cb(BatchedMatrixMul) \ + cb(PoolingForward) \ + cb(PoolingBackward) // clang-format on template diff --git a/src/opr/test/dnn/pooling.cpp 
b/src/opr/test/dnn/pooling.cpp index 932060f456a8436c3a7910dd9ce0d35929437758..023fd4c8d54049782f9cedbb8c9b0c0e0e46dc21 100644 --- a/src/opr/test/dnn/pooling.cpp +++ b/src/opr/test/dnn/pooling.cpp @@ -11,6 +11,9 @@ #include "./legacy_checker.h" #include "megbrain/opr/dnn/pooling.h" +#include "megbrain/utils/persistent_cache.h" +#include "megbrain/opr/basic_arith.h" +#include "megbrain/opr/basic_arith_wrapper.h" using namespace std; using namespace mgb; @@ -104,6 +107,58 @@ TEST(TestOprDNN, PoolingBackward) } } +TEST(TestOprDNN, PoolingExePolicy) { + using Param = opr::Pooling::Param; + Param param; + using Policy = opr::Pooling::ExecutionPolicy; + using S = Policy::Strategy; + + REQUIRE_GPU(1); + auto cn = CompNode::load("gpu0"); + cn.activate(); + + auto orig_impl = PersistentCache::set_impl( + std::make_shared()); + + HostTensorND host_y, host_y_copy; + S strategy = S::HEURISTIC | S::REPRODUCIBLE; + + auto graph = ComputingGraph::make(); + + HostTensorGenerator<> gen; + TensorShape shape = {1, 20, 24, 24}; + auto input = opr::Host2DeviceCopy::make(*graph, gen(shape, cn)); + + param.mode = Param::Mode::MAX; + param.window_h = param.window_w = 2; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = 0; + param.format = Param::Format::NCHW; + + Policy policy; + policy.strategy = strategy; + + auto pooling = opr::PoolingForward::make(input, param, {}, policy); + + auto loss0 = opr::reduce_sum_sqr(pooling, pooling.make_scalar(1)); + auto grad = cg::grad(loss0, input, true, false); + + opr::PoolingBackward* found = nullptr; + auto cb = [&found](cg::OperatorNodeBase* opr) { + if (opr->same_type()) { + found = &opr->cast_final_safe(); + } + }; + cg::DepOprIter{cb}.add(grad.node()->owner_opr()); + found->set_execution_policy(strategy); + + auto func = graph->compile({make_callback_copy(grad, host_y)}); + func->execute().wait(); + + mgb_assert(found->megdnn_opr()->execution_policy().algo.name.find( + "cudnnReproducible") != std::string::npos); +} + } // 
anonymous namespace // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}