diff --git a/dnn/include/megdnn/oprs/nn.h b/dnn/include/megdnn/oprs/nn.h index d740929fd023ccdd281dfd8ff14099918d87429b..a5caeeca04b4ca7ff692382ec562aca1f7b71744 100644 --- a/dnn/include/megdnn/oprs/nn.h +++ b/dnn/include/megdnn/oprs/nn.h @@ -234,10 +234,10 @@ public: const TensorLayout& dst) = 0; protected: - CanonizedFilterMeta check_exec(const TensorLayout& src, - const TensorLayout& filter, - const TensorLayout& dst, - size_t workspace_in_bytes); + CanonizedFilterMeta check_exec( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& dst, size_t workspace_in_bytes, + const PreprocessedFilter* preprocessed_filter); }; using Convolution = ConvolutionForward; @@ -408,12 +408,11 @@ public: static WinogradParam parse_winograd_name(const std::string& algo_name); protected: - CanonizedFilterMeta check_exec(const TensorLayout& src, - const TensorLayout& filter, - const TensorLayout& bias, - const TensorLayout& z, - const TensorLayout& dst, - size_t workspace_in_bytes); + CanonizedFilterMeta check_exec( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& bias, const TensorLayout& z, + const TensorLayout& dst, size_t workspace_in_bytes, + const PreprocessedFilter* preprocessed_filter); }; using ConvBias = ConvBiasForward; diff --git a/dnn/src/common/conv_bias.cpp b/dnn/src/common/conv_bias.cpp index 75e3d8fffccba3ea8804aabd2b9740e280d3c654..a420f0548c4c02a5379ef86dd9fa0a31b2a52ce8 100644 --- a/dnn/src/common/conv_bias.cpp +++ b/dnn/src/common/conv_bias.cpp @@ -32,7 +32,8 @@ void ConvBiasForward::deduce_layout(const TensorLayout& src, ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec( const TensorLayout& src, const TensorLayout& filter, const TensorLayout& bias, const TensorLayout& z, - const TensorLayout& dst, size_t workspace_in_bytes) { + const TensorLayout& dst, size_t workspace_in_bytes, + const PreprocessedFilter* preprocessed_filter) { if ((param().format == param::ConvBias::Format::NCHW_WINOGRAD || 
param().format == param::ConvBias::Format::NCHW88_WINOGRAD || param().format == param::ConvBias::Format::NCHW44_WINOGRAD) && @@ -82,9 +83,11 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec( auto ret = check_layout_fwd(src, filter, dst); megdnn_assert_contiguous(bias); - auto required_workspace_in_bytes = - get_workspace_in_bytes(src, filter, bias, z, dst, nullptr); - megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); + auto required_workspace_in_bytes = get_workspace_in_bytes( + src, filter, bias, z, dst, preprocessed_filter); + megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes, + "worksapce have size of %zu, but need %zu", + workspace_in_bytes, required_workspace_in_bytes); if (bias.ndim != 0) { //! bias.layout == dst.layout failed, no assert information auto check_eq = [](const TensorLayout& bias, const TensorLayout& dst) { diff --git a/dnn/src/common/convolution.cpp b/dnn/src/common/convolution.cpp index 914d23515a6e90bd7098aad7532483b5a82572ac..f79eeb713bf2412f005b326f2c53d638b754a1ee 100644 --- a/dnn/src/common/convolution.cpp +++ b/dnn/src/common/convolution.cpp @@ -1028,10 +1028,11 @@ void ConvolutionForward::deduce_layout(const TensorLayout& src, ConvolutionForward::CanonizedFilterMeta ConvolutionForward::check_exec( const TensorLayout& src, const TensorLayout& filter, - const TensorLayout& dst, size_t workspace_in_bytes) { + const TensorLayout& dst, size_t workspace_in_bytes, + const PreprocessedFilter* preprocessed_filter) { auto ret = check_layout_fwd(src, filter, dst); auto required_workspace_in_bytes = - get_workspace_in_bytes(src, filter, dst, nullptr); + get_workspace_in_bytes(src, filter, dst, preprocessed_filter); megdnn_assert(workspace_in_bytes >= required_workspace_in_bytes); return ret; } diff --git a/dnn/src/cuda/conv_bias/opr_impl.cpp b/dnn/src/cuda/conv_bias/opr_impl.cpp index f96a64bf2f57d24d17302bac525b186eedddd42b..d487c5a30890fa61e3eae39bd65612c2cb63c162 100644 --- 
a/dnn/src/cuda/conv_bias/opr_impl.cpp +++ b/dnn/src/cuda/conv_bias/opr_impl.cpp @@ -25,10 +25,10 @@ namespace cuda { void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_in bias, _megdnn_tensor_in z, _megdnn_tensor_out dst, - const PreprocessedFilter*, + const PreprocessedFilter* preprocessed_filter, _megdnn_workspace workspace) { check_exec(src.layout, filter.layout, bias.layout, z.layout, dst.layout, - workspace.size); + workspace.size, preprocessed_filter); AlgoBase::ExecArgs args(this, src, filter, bias, z, dst, workspace); auto algo = get_algorithm(this, src.layout, filter.layout, bias.layout, z.layout, dst.layout); diff --git a/dnn/src/cuda/conv_bias/opr_impl.h b/dnn/src/cuda/conv_bias/opr_impl.h index 222c381dffeca653ba46dc6f5988a20d65a7e5be..12260b9eef90527f4d5b8f8098dbe76748d817e1 100644 --- a/dnn/src/cuda/conv_bias/opr_impl.h +++ b/dnn/src/cuda/conv_bias/opr_impl.h @@ -52,13 +52,10 @@ public: const TensorLayout&, const TensorLayout&) override { return {}; } - void exec_preprocess(const TensorLayout& , - _megdnn_tensor_in , - const TensorLayout& , - const TensorLayout& , - const TensorLayout& , - PreprocessedFilter* , - _megdnn_workspace ) override { + void exec_preprocess(const TensorLayout&, _megdnn_tensor_in, + const TensorLayout&, const TensorLayout&, + const TensorLayout&, PreprocessedFilter*, + _megdnn_workspace) override { megdnn_throw("cuda conv_bias exec_preprocess has not implemeted yet"); } diff --git a/dnn/src/fallback/conv_bias/opr_impl.cpp b/dnn/src/fallback/conv_bias/opr_impl.cpp index 2b9b5597473e5fbf5acdac7c1384c8c0f5b12986..01026d15035bfc63ff7d132acfdf57ba339847d1 100644 --- a/dnn/src/fallback/conv_bias/opr_impl.cpp +++ b/dnn/src/fallback/conv_bias/opr_impl.cpp @@ -119,17 +119,22 @@ SmallVector ConvBiasImpl::algo_pack() { bool ConvBiasImpl::is_naive_algo(ConvBiasImpl::Algorithm* algo) { return algo == nullptr || strcmp(algo->name(), "DEFAULT") == 0; } + +#define NCB_ALGO_FUNC(name, algo, param) \ 
+ static_cast(algo)->name(this, param) + void ConvBiasImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_in bias, _megdnn_tensor_in z, _megdnn_tensor_out dst, const PreprocessedFilter* preprocessed_filter, _megdnn_workspace workspace) { check_exec(src.layout, filter.layout, bias.layout, z.layout, dst.layout, - workspace.size); - auto fparam = make_ncb_kern_param(src, filter, bias, dst, workspace); + workspace.size, preprocessed_filter); + auto fparam = make_ncb_kern_param(src, filter, bias, dst, workspace, + preprocessed_filter); ConvBiasImpl::Algorithm* algo = get_algorithm(fparam, workspace.size); if (!is_naive_algo(algo) && - ncb_algo_get_workspace(algo, fparam) <= workspace.size) { + NCB_ALGO_FUNC(get_workspace, algo, fparam) <= workspace.size) { exec_with_ncb_kern(fparam, algo); } else { naive::ConvBiasForwardImpl::exec(src, filter, bias, z, dst, @@ -137,18 +142,71 @@ void ConvBiasImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, } } +void ConvBiasImpl::exec_preprocess(const TensorLayout& src_layout, + _megdnn_tensor_in filter, + const TensorLayout& bias_layout, + const TensorLayout& z_layout, + const TensorLayout& dst_layout, + PreprocessedFilter* preprocessed_filter, + _megdnn_workspace workspace) { + //! exec_preprocess currently only support preprocess weights before exec, + //! 
src/dst/bias/z will be ignored, just set to nullptr + TensorND src{nullptr, src_layout}, dst{nullptr, dst_layout}, + bias{nullptr, bias_layout}; + auto fparam = make_ncb_kern_param(src, filter, bias, dst, workspace, + preprocessed_filter); + ConvolutionImpl::Algorithm* algo = get_algorithm(fparam, workspace.size); + if (!is_naive_algo(algo) && NCB_ALGO_FUNC(get_preprocess_workspace, algo, + fparam) <= workspace.size) { + exec_preprocess_with_ncb_kern(fparam, algo); + } else { + naive::ConvBiasForwardImpl::exec_preprocess( + src_layout, filter, bias_layout, z_layout, dst_layout, + preprocessed_filter, workspace); + } +} + size_t ConvBiasImpl::get_workspace_in_bytes( const TensorLayout& src, const TensorLayout& filter, const TensorLayout& bias, const TensorLayout& z, const TensorLayout& dst, const PreprocessedFilter* preprocessed_filter) { - auto fparam = make_ncb_kern_size_param(src, filter, bias, dst); + auto fparam = make_ncb_kern_size_param(src, filter, bias, dst, + preprocessed_filter); ConvBiasImpl::Algorithm* algo = get_algorithm(fparam); if (is_naive_algo(algo)) { return naive::ConvBiasForwardImpl::get_workspace_in_bytes( src, filter, bias, z, dst, preprocessed_filter); } else { - return ncb_algo_get_workspace(algo, fparam); + return NCB_ALGO_FUNC(get_workspace, algo, fparam); + } +} + +size_t ConvBiasImpl::get_preprocess_workspace_in_bytes( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& bias, const TensorLayout& z, + const TensorLayout& dst) { + auto fparam = make_ncb_kern_size_param(src, filter, bias, dst, nullptr); + Algorithm* algo = get_algorithm(fparam); + if (is_naive_algo(algo)) { + return naive::ConvBiasForwardImpl::get_preprocess_workspace_in_bytes( + src, filter, bias, z, dst); + } else { + return NCB_ALGO_FUNC(get_preprocess_workspace, algo, fparam); + } +} + +SmallVector ConvBiasImpl::deduce_preprocessed_filter_layout( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& bias, const 
TensorLayout& z, + const TensorLayout& dst) { + auto fparam = make_ncb_kern_size_param(src, filter, bias, dst, nullptr); + Algorithm* algo = get_algorithm(fparam); + if (is_naive_algo(algo)) { + return naive::ConvBiasForwardImpl::deduce_preprocessed_filter_layout( + src, filter, bias, z, dst); + } else { + return NCB_ALGO_FUNC(deduce_preprocessed_filter_layout, algo, fparam); } } @@ -156,7 +214,7 @@ std::vector ConvBiasImpl::get_all_algorithms( const TensorLayout& src, const TensorLayout& filter, const TensorLayout& bias, const TensorLayout& z, const TensorLayout& dst) { - auto fparam = make_ncb_kern_size_param(src, filter, bias, dst); + auto fparam = make_ncb_kern_size_param(src, filter, bias, dst, nullptr); auto ret = get_all_algorithms_with_ncb(fparam); if (ret.empty()) { return naive::ConvBiasForwardImpl::get_all_algorithms(src, filter, bias, @@ -170,7 +228,7 @@ ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm_heuristic( const TensorLayout& bias, const TensorLayout& z, const TensorLayout& dst, size_t workspace_limit_in_bytes, bool reproducible) { - auto fparam = make_ncb_kern_size_param(src, filter, bias, dst); + auto fparam = make_ncb_kern_size_param(src, filter, bias, dst, nullptr); auto result = get_algorithm_heuristic_with_ncb( fparam, workspace_limit_in_bytes, reproducible); if (result == nullptr) { @@ -181,9 +239,25 @@ ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm_heuristic( return result; } +ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm_heuristic_with_ncb( + const NCBKernSizeParam& param, size_t workspace_limit_in_bytes, + bool reproducible) { + for (auto i : get_all_algorithms_with_ncb(param)) { + size_t need_workspace = NCB_ALGO_FUNC(get_workspace, i, param); + if (static_cast(i)->usable_reproducible( + this, param, AlgoSelectionStrategy::HEURISTIC, + reproducible) && + need_workspace <= workspace_limit_in_bytes) { + return i; + } + } + return nullptr; +} + ConvBiasImpl::NCBKernSizeParam ConvBiasImpl::make_ncb_kern_size_param( const 
TensorLayout& src, const TensorLayout& filter, - const TensorLayout& bias, const TensorLayout& dst) { + const TensorLayout& bias, const TensorLayout& dst, + const PreprocessedFilter* preprocessed_filter) { auto safe_u32 = [](size_t v) -> uint32_t { megdnn_assert(v <= std::numeric_limits::max(), "value too large: %zu", v); @@ -258,7 +332,9 @@ ConvBiasImpl::NCBKernSizeParam ConvBiasImpl::make_ncb_kern_size_param( {src.stride[0], src.stride[1], src.stride[2], src.stride[3]}, {dst.stride[0], dst.stride[1], dst.stride[2], dst.stride[3]}, param().compute_mode, - nr_threads}, + nr_threads, + reinterpret_cast( + preprocessed_filter)}, param().output_block_size, format, bias.dtype, @@ -269,10 +345,12 @@ ConvBiasImpl::NCBKernSizeParam ConvBiasImpl::make_ncb_kern_size_param( ConvBiasImpl::NCBKernParam ConvBiasImpl::make_ncb_kern_param( _megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_in bias, - _megdnn_tensor_out dst, _megdnn_workspace workspace) { + _megdnn_tensor_out dst, _megdnn_workspace workspace, + const PreprocessedFilter* preprocessed_filter) { NCBKernParam ret; - static_cast(ret) = make_ncb_kern_size_param( - src.layout, filter.layout, bias.layout, dst.layout); + static_cast(ret) = + make_ncb_kern_size_param(src.layout, filter.layout, bias.layout, + dst.layout, preprocessed_filter); ret.src_ptr = src.raw_ptr; ret.filter_ptr = filter.raw_ptr; ret.bias_ptr = bias.raw_ptr; @@ -284,7 +362,7 @@ ConvBiasImpl::NCBKernParam ConvBiasImpl::make_ncb_kern_param( void ConvBiasImpl::exec_with_ncb_kern(const NCBKernParam& param, ConvBiasImpl::Algorithm* algo) { - auto ncb_kerns = ncb_algo_dispatch_kerns(algo, param); + auto ncb_kerns = NCB_ALGO_FUNC(dispatch_kerns, algo, param); for (auto&& kernel : ncb_kerns) { auto run = [kernel, param](size_t index, size_t thread_id) { CpuNDRange ndrange_id(kernel.global_size, index); @@ -295,21 +373,17 @@ void ConvBiasImpl::exec_with_ncb_kern(const NCBKernParam& param, } } -ConvBiasImpl::Algorithm* 
ConvBiasImpl::get_algorithm_heuristic_with_ncb( - const NCBKernSizeParam& param, size_t workspace_limit_in_bytes, - bool reproducible) { - return ncb_algo_get_algorithm_heuristic(param, workspace_limit_in_bytes, - reproducible); -} - -size_t ConvBiasImpl::ncb_algo_get_workspace(Algorithm* algo, - const NCBKernSizeParam& param) { - return static_cast(algo)->get_workspace(this, param); -} - -SmallVector ConvBiasImpl::ncb_algo_dispatch_kerns( - Algorithm* algo, const NCBKernSizeParam& param) { - return static_cast(algo)->dispatch_kerns(this, param); +void ConvBiasImpl::exec_preprocess_with_ncb_kern( + const NCBKernParam& param, ConvBiasImpl::Algorithm* algo) { + auto ncb_kerns = NCB_ALGO_FUNC(dispatch_preprocess_kerns, algo, param); + for (auto&& kernel : ncb_kerns) { + auto run = [kernel, param](size_t index, size_t thread_id) { + CpuNDRange ndrange_id(kernel.global_size, index); + kernel.kern(param, {thread_id, ndrange_id}); + }; + static_cast(handle())->dispatch_kern( + run, kernel.global_size.total_size()); + } } std::vector ConvBiasImpl::get_all_algorithms_with_ncb( @@ -332,20 +406,6 @@ std::vector ConvBiasImpl::get_all_algorithms_with_ncb( return algos; } -ConvBiasImpl::Algorithm* ConvBiasImpl::ncb_algo_get_algorithm_heuristic( - const NCBKernSizeParam& param, size_t workspace_limit_in_bytes, - bool reproducible) { - for (auto i : get_all_algorithms_with_ncb(param)) { - if (static_cast(i)->usable_reproducible( - this, param, AlgoSelectionStrategy::HEURISTIC, - reproducible) && - ncb_algo_get_workspace(i, param) <= workspace_limit_in_bytes) { - return i; - } - } - return nullptr; -} - ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm( const NCBKernSizeParam& param, size_t workspace_size) { if (auto set = execution_policy().algorithm) { diff --git a/dnn/src/fallback/conv_bias/opr_impl.h b/dnn/src/fallback/conv_bias/opr_impl.h index a70fd4741bfd72c32c4a2a6dc176ef5482b9564c..e35541dca47d9b6f464479d59246d1fedc539d00 100644 --- 
a/dnn/src/fallback/conv_bias/opr_impl.h +++ b/dnn/src/fallback/conv_bias/opr_impl.h @@ -51,6 +51,25 @@ public: _megdnn_tensor_out dst, const PreprocessedFilter*, _megdnn_workspace workspace) override; + void exec_preprocess(const TensorLayout& src_layout, + _megdnn_tensor_in filter, + const TensorLayout& bias_layout, + const TensorLayout& z_layout, + const TensorLayout& dst_layout, + PreprocessedFilter* preprocessed_filter, + _megdnn_workspace workspace) override; + + SmallVector deduce_preprocessed_filter_layout( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& bias, const TensorLayout& z, + const TensorLayout& dst) override; + + size_t get_preprocess_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& bias, + const TensorLayout& z, + const TensorLayout& dst) override; + //! implemented by get_workspace_with_ncb() size_t get_workspace_in_bytes(const TensorLayout& src, const TensorLayout& filter, @@ -198,6 +217,23 @@ public: virtual SmallVector dispatch_kerns( ConvBiasImpl* opr, const NCBKernSizeParam& param) const = 0; + virtual SmallVector dispatch_preprocess_kerns( + ConvBiasImpl*, const NCBKernSizeParam&) const { + return {}; + }; + + //! get the layouts of weight_prerocess dst + virtual SmallVector deduce_preprocessed_filter_layout( + ConvBiasImpl*, const NCBKernSizeParam&) const { + return {}; + }; + + //! get the workspace when weight_prerocess + virtual size_t get_preprocess_workspace(ConvBiasImpl*, + const NCBKernSizeParam&) const { + return 0_z; + }; + //! Temporarily used to identify whether the matmul algorithm is //! is_preferred. virtual bool is_preferred(ConvBiasImpl*, @@ -219,40 +255,19 @@ public: virtual SmallVector algo_pack(); protected: - //! default impl calls ncb_algo_dispatch_kern() virtual void exec_with_ncb_kern(const NCBKernParam& param, ConvBiasImpl::Algorithm* algo); - //! 
default impl calls ncb_algo_get_all_algorithms() + virtual void exec_preprocess_with_ncb_kern(const NCBKernParam& param, + Algorithm* algo); + virtual std::vector get_all_algorithms_with_ncb( const NCBKernSizeParam& param); - //! default impl calls ncb_algo_get_algorithm_heuristic() virtual Algorithm* get_algorithm_heuristic_with_ncb( const NCBKernSizeParam& param, size_t workspace_limit_in_bytes, bool reproducible = false); - /** - * \brief get kernel pointer for non-contiguous batch kernel or - * simply conv bias kernel. - * - * whether the kernel processing batch 1-group is decided by the - * algo. - */ - - virtual SmallVector ncb_algo_dispatch_kerns( - Algorithm* algo, const NCBKernSizeParam& param); - - virtual size_t ncb_algo_get_workspace(Algorithm* algo, - const NCBKernSizeParam& param); - /*! - * the default impl iterates over all ncb_algo_get_all_algorithms() - * and return the first one whose workspace does not exceed the limit. - */ - virtual Algorithm* ncb_algo_get_algorithm_heuristic( - const NCBKernSizeParam& param, size_t workspace_limit_in_bytes, - bool reproducible = false); - const char* get_algorithm_set_name() const override; private: @@ -276,16 +291,16 @@ private: const NCBKernSizeParam& param, size_t workspace_size = std::numeric_limits::max()); - NCBKernSizeParam make_ncb_kern_size_param(const TensorLayout& src, - const TensorLayout& filter, - const TensorLayout& bias, - const TensorLayout& dst); - - NCBKernParam make_ncb_kern_param(_megdnn_tensor_in src, - _megdnn_tensor_in filter, - _megdnn_tensor_in bias, - _megdnn_tensor_out dst, - _megdnn_workspace workspace); + NCBKernSizeParam make_ncb_kern_size_param( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& bias, const TensorLayout& dst, + const PreprocessedFilter* preprocessed_filter); + + NCBKernParam make_ncb_kern_param( + _megdnn_tensor_in src, _megdnn_tensor_in filter, + _megdnn_tensor_in bias, _megdnn_tensor_out dst, + _megdnn_workspace workspace, + const 
PreprocessedFilter* preprocessed_filter); }; } // namespace fallback diff --git a/dnn/src/fallback/convolution/algos.cpp b/dnn/src/fallback/convolution/algos.cpp index 7135b9579edd131f2c45c72233fd71563a68b493..392d54b0eb7a050e32471293ec4403330886c022 100644 --- a/dnn/src/fallback/convolution/algos.cpp +++ b/dnn/src/fallback/convolution/algos.cpp @@ -376,7 +376,67 @@ size_t ConvolutionImpl::AlgoDefault::get_workspace( return get_bundle(param).total_size_in_bytes(); } -//! Return the implment kernel +size_t ConvolutionImpl::AlgoDefault::get_preprocess_workspace( + ConvolutionImpl*, const NCBKernSizeParam& param) const { + ::ConvBiasImpl::NCBKernSizeParam conv_bias_param = + init_convbias_opr_and_param(m_conv_bias_opr, param); + m_conv_bias_opr->execution_policy() = {m_algorithm}; + return m_algorithm->get_preprocess_workspace(m_conv_bias_opr, + conv_bias_param); +} + +SmallVector +ConvolutionImpl::AlgoDefault::deduce_preprocessed_filter_layout( + ConvolutionImpl*, const NCBKernSizeParam& param) const { + ::ConvBiasImpl::NCBKernSizeParam conv_bias_param = + init_convbias_opr_and_param(m_conv_bias_opr, param); + m_conv_bias_opr->execution_policy() = {m_algorithm}; + return m_algorithm->deduce_preprocessed_filter_layout(m_conv_bias_opr, + conv_bias_param); +} + +//! 
Return the implement preprocess kernel +SmallVector +ConvolutionImpl::AlgoDefault::get_preprocess_kimpl( + ::ConvBiasImpl* conv_bias_opr, ConvBiasImpl::AlgoBase* algo, + const NCBKernSizeParam& param) { + MIDOUT_BEGIN(megdnn_fallback_conv, midout_iv("get_preprocess_kimpl"_hash)) { + // construct the conv_bias kern param + ::ConvBiasImpl::NCBKernParam conv_bias_param; + ::ConvBiasImpl::NCBKernSizeParam conv_bias_size_param = + init_convbias_opr_and_param(conv_bias_opr, param); + static_cast<::ConvBiasImpl::NCBKernSizeParam&>(conv_bias_param) = + conv_bias_size_param; + auto conv_bias_preprocess_kerns = + algo->dispatch_preprocess_kerns(conv_bias_opr, conv_bias_param); + SmallVector convolution_preprocess_kerns; + + //! Set the conv_bias param using convolution param + auto set_copy_param_filter_workspace_ptr = + [](const NCBKernParam& conv_param, + ::ConvBiasImpl::NCBKernParam& copied_param) { + copied_param.filter_ptr = conv_param.filter_ptr; + copied_param.workspace_ptr = conv_param.workspace_ptr; + copied_param.workspace_size = conv_param.workspace_size; + }; + for (size_t i = 0; i < conv_bias_preprocess_kerns.size(); i++) { + auto kernel = conv_bias_preprocess_kerns[i]; + //! If the kerenl batch parallel + auto run = [=](const NCBKernParam& p, + const NCBKernIndex& ncb_index) { + auto copy_param = conv_bias_param; + set_copy_param_filter_workspace_ptr(p, copy_param); + kernel.kern(copy_param, + {ncb_index.thread_id, ncb_index.ndrange_id}); + }; + convolution_preprocess_kerns.push_back({run, kernel.global_size}); + } + return convolution_preprocess_kerns; + } + MIDOUT_END(); +} + +//! Return the implement kernel SmallVector ConvolutionImpl::AlgoDefault::get_kimpl( ::ConvBiasImpl* conv_bias_opr, ConvBiasImpl::AlgoBase* algo, const NCBKernSizeParam& param) { @@ -392,7 +452,7 @@ SmallVector ConvolutionImpl::AlgoDefault::get_kimpl( SmallVector convolution_kerns; //! 
Set the conv_bias param using convolution param - auto set_copy_param_run_time_address = + auto set_copy_param_compute_address = [](const NCBKernParam& conv_param, ::ConvBiasImpl::NCBKernParam& copied_param) { copied_param.src_ptr = conv_param.src_ptr; @@ -407,7 +467,7 @@ SmallVector ConvolutionImpl::AlgoDefault::get_kimpl( auto run = [=](const NCBKernParam& p, const NCBKernIndex& ncb_index) { auto copy_param = conv_bias_param; - set_copy_param_run_time_address(p, copy_param); + set_copy_param_compute_address(p, copy_param); kernel.kern(copy_param, {ncb_index.thread_id, ncb_index.ndrange_id}); }; diff --git a/dnn/src/fallback/convolution/algos.h b/dnn/src/fallback/convolution/algos.h index 091be295d248484b5dbde2bcf5b42ba78d7d520a..d5a44ae8ae10ddf62d8c4c94b50a1754db5a6bcf 100644 --- a/dnn/src/fallback/convolution/algos.h +++ b/dnn/src/fallback/convolution/algos.h @@ -110,6 +110,9 @@ class ConvolutionImpl::AlgoDefault final : public AlgoBase { static SmallVector get_kimpl(ConvBiasImpl* conv_bias_opr, ConvBiasImpl::AlgoBase* algo, const NCBKernSizeParam& param); + static SmallVector get_preprocess_kimpl( + ConvBiasImpl* conv_bias_opr, ConvBiasImpl::AlgoBase* algo, + const NCBKernSizeParam& param); public: AlgoDefault(fallback::ConvBiasImpl* conv_bias_opr, ConvBiasImpl::AlgoBase*); @@ -121,6 +124,17 @@ public: size_t get_workspace(ConvolutionImpl* opr, const NCBKernSizeParam& param) const override; + size_t get_preprocess_workspace(ConvolutionImpl*, + const NCBKernSizeParam&) const override; + + SmallVector deduce_preprocessed_filter_layout( + ConvolutionImpl*, const NCBKernSizeParam&) const override; + + SmallVector dispatch_preprocess_kern( + ConvolutionImpl*, const NCBKernSizeParam& param) const override { + return get_preprocess_kimpl(m_conv_bias_opr, m_algorithm, param); + } + SmallVector dispatch_kern( ConvolutionImpl* /*opr*/, const NCBKernSizeParam& param) const override { diff --git a/dnn/src/fallback/convolution/opr_impl.cpp 
b/dnn/src/fallback/convolution/opr_impl.cpp index 6f75f13092ba394d49285b7338869fb77d6ec54d..059f1c8e882029b8e52b4943e78682c9a8ad1fda 100644 --- a/dnn/src/fallback/convolution/opr_impl.cpp +++ b/dnn/src/fallback/convolution/opr_impl.cpp @@ -80,14 +80,19 @@ SmallVector ConvolutionImpl::algo_pack() { bool ConvolutionImpl::is_naive_algo(ConvolutionImpl::Algorithm* algo) { return algo == nullptr || strcmp(algo->name(), "DEFAULT") == 0; } + +#define NCB_ALGO_FUNC(name, algo, param) \ + static_cast(algo)->name(this, fparam) + void ConvolutionImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_out dst, const PreprocessedFilter* preprocessed_filter, _megdnn_workspace workspace) { - auto fparam = make_ncb_kern_param(src, filter, dst, workspace); + auto fparam = make_ncb_kern_param(src, filter, dst, preprocessed_filter, + workspace); ConvolutionImpl::Algorithm* algo = get_algorithm(fparam, workspace.size); if (!is_naive_algo(algo) && - ncb_algo_get_workspace(algo, fparam) <= workspace.size) { + NCB_ALGO_FUNC(get_workspace, algo, fparam) <= workspace.size) { exec_with_ncb_kern(fparam, algo); } else { naive::ConvolutionForwardImpl::exec(src, filter, dst, @@ -95,24 +100,73 @@ void ConvolutionImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, } } +void ConvolutionImpl::exec_preprocess(const TensorLayout& src_layout, + _megdnn_tensor_in filter, + const TensorLayout& dst_layout, + PreprocessedFilter* preprocessed_filter, + _megdnn_workspace workspace) { + //! exec_preprocess currently only support preprocess weights before exec, + //! 
src/dst will be ignored, just set to nullptr + TensorND src{nullptr, src_layout}, dst{nullptr, dst_layout}; + auto fparam = make_ncb_kern_param(src, filter, dst, preprocessed_filter, + workspace); + ConvolutionImpl::Algorithm* algo = get_algorithm(fparam, workspace.size); + if (!is_naive_algo(algo) && NCB_ALGO_FUNC(get_preprocess_workspace, algo, + fparam) <= workspace.size) { + exec_preprocess_with_ncb_kern(fparam, algo); + } else { + naive::ConvolutionForwardImpl::exec_preprocess( + src_layout, filter, dst_layout, preprocessed_filter, workspace); + } +} + size_t ConvolutionImpl::get_workspace_in_bytes( const TensorLayout& src, const TensorLayout& filter, const TensorLayout& dst, const PreprocessedFilter* preprocessed_filter) { - auto fparam = make_ncb_kern_size_param(src, filter, dst); + auto fparam = + make_ncb_kern_size_param(src, filter, dst, preprocessed_filter); Algorithm* algo = get_algorithm(fparam); if (is_naive_algo(algo)) { return naive::ConvolutionForwardImpl::get_workspace_in_bytes( src, filter, dst, preprocessed_filter); } else { - return ncb_algo_get_workspace(algo, fparam); + return static_cast(algo)->get_workspace(this, fparam); + } +} + +size_t ConvolutionImpl::get_preprocess_workspace_in_bytes( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& dst) { + auto fparam = make_ncb_kern_size_param(src, filter, dst, nullptr); + Algorithm* algo = get_algorithm(fparam); + if (is_naive_algo(algo)) { + return naive::ConvolutionForwardImpl::get_preprocess_workspace_in_bytes( + src, filter, dst); + } else { + return static_cast(algo)->get_preprocess_workspace(this, + fparam); + } +} + +SmallVector ConvolutionImpl::deduce_preprocessed_filter_layout( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& dst){ + auto fparam = make_ncb_kern_size_param(src, filter, dst, nullptr); + Algorithm* algo = get_algorithm(fparam); + if (is_naive_algo(algo)) { + return 
naive::ConvolutionForwardImpl::deduce_preprocessed_filter_layout( + src, filter, dst); + } else { + return static_cast(algo)->deduce_preprocessed_filter_layout( + this, fparam); } } std::vector ConvolutionImpl::get_all_algorithms( const TensorLayout& src, const TensorLayout& filter, const TensorLayout& dst) { - auto fparam = make_ncb_kern_size_param(src, filter, dst); + auto fparam = make_ncb_kern_size_param(src, filter, dst, nullptr); auto ret = get_all_algorithms_with_ncb(fparam); if (ret.empty()) { return naive::ConvolutionForwardImpl::get_all_algorithms(src, filter, @@ -125,7 +179,7 @@ ConvolutionImpl::Algorithm* ConvolutionImpl::get_algorithm_heuristic( const TensorLayout& src, const TensorLayout& filter, const TensorLayout& dst, size_t workspace_limit_in_bytes, bool reproducible) { - auto fparam = make_ncb_kern_size_param(src, filter, dst); + auto fparam = make_ncb_kern_size_param(src, filter, dst, nullptr); auto result = get_algorithm_heuristic_with_ncb( fparam, workspace_limit_in_bytes, reproducible); if (result == nullptr) { @@ -137,7 +191,8 @@ ConvolutionImpl::Algorithm* ConvolutionImpl::get_algorithm_heuristic( ConvolutionImpl::NCBKernSizeParam ConvolutionImpl::make_ncb_kern_size_param( const TensorLayout& src, const TensorLayout& filter, - const TensorLayout& dst) { + const TensorLayout& dst, + const PreprocessedFilter* preprocessed_filter) { auto safe_u32 = [](size_t v) -> uint32_t { megdnn_assert(v <= std::numeric_limits::max(), "value too large: %zu", v); @@ -175,15 +230,17 @@ ConvolutionImpl::NCBKernSizeParam ConvolutionImpl::make_ncb_kern_size_param( {src.stride[0], src.stride[1], src.stride[2], src.stride[3]}, {dst.stride[0], dst.stride[1], dst.stride[2], dst.stride[3]}, param().compute_mode, - nr_threads}; + nr_threads, + preprocessed_filter}; } ConvolutionImpl::NCBKernParam ConvolutionImpl::make_ncb_kern_param( _megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_out dst, + const PreprocessedFilter* preprocessed_filter, 
_megdnn_workspace workspace) { NCBKernParam ret; - static_cast(ret) = - make_ncb_kern_size_param(src.layout, filter.layout, dst.layout); + static_cast(ret) = make_ncb_kern_size_param( + src.layout, filter.layout, dst.layout, preprocessed_filter); ret.src_ptr = src.raw_ptr; ret.filter_ptr = filter.raw_ptr; ret.dst_ptr = dst.raw_ptr; @@ -192,9 +249,30 @@ ConvolutionImpl::NCBKernParam ConvolutionImpl::make_ncb_kern_param( return ret; } +void ConvolutionImpl::exec_preprocess_with_ncb_kern(const NCBKernParam& param, + Algorithm* algo) { + auto kerns = + static_cast(algo)->dispatch_preprocess_kern(this, param); + auto fallback_handle = handle(); + for (auto kernel : kerns) { + megdnn_assert( + param.filter_meta.format == Param::Format::NCHW || + param.filter_meta.format == Param::Format::NHWC || + param.filter_meta.format == Param::Format::NCHW88 || + param.filter_meta.format == Param::Format::NCHW44, + "invalid conv format"); + auto run = [param, kernel](size_t index, size_t thread_id) { + CpuNDRange ndrange_id(kernel.global_size, index); + kernel.kern(param, {thread_id, ndrange_id}); + }; + static_cast(fallback_handle) + ->dispatch_kern(run, kernel.global_size.total_size()); + } +} + void ConvolutionImpl::exec_with_ncb_kern(const NCBKernParam& param, Algorithm* algo) { - auto kerns = ncb_algo_dispatch_kern(algo, param); + auto kerns = static_cast(algo)->dispatch_kern(this, param); auto fallback_handle = handle(); for (auto kernel : kerns) { megdnn_assert(param.filter_meta.format == Param::Format::NCHW || @@ -215,10 +293,13 @@ ConvolutionImpl::Algorithm* ConvolutionImpl::get_algorithm_heuristic_with_ncb( const NCBKernSizeParam& param, size_t workspace_limit_in_bytes, bool reproducible) { for (auto i : get_all_algorithms_with_ncb(param)) { - if (static_cast(i)->usable_reproducible( - this, param, AlgoSelectionStrategy::HEURISTIC, - reproducible) && - ncb_algo_get_workspace(i, param) <= workspace_limit_in_bytes) { + size_t need_workspace = + 
static_cast(i)->get_workspace(this, param); + bool usable_reproducible = + static_cast(i)->usable_reproducible( + this, param, AlgoSelectionStrategy::HEURISTIC, + reproducible); + if (usable_reproducible && need_workspace <= workspace_limit_in_bytes) { return i; } } diff --git a/dnn/src/fallback/convolution/opr_impl.h b/dnn/src/fallback/convolution/opr_impl.h index 42dad0d1a574619f322ad107b77928d58850fe9b..77c6d7402bd902f7048b5a89ae862a91e743db1c 100644 --- a/dnn/src/fallback/convolution/opr_impl.h +++ b/dnn/src/fallback/convolution/opr_impl.h @@ -39,12 +39,26 @@ public: _megdnn_tensor_out dst, const PreprocessedFilter*, _megdnn_workspace workspace) override; + void exec_preprocess(const TensorLayout& src_layout, + _megdnn_tensor_in filter, + const TensorLayout& dst_layout, + PreprocessedFilter* preprocessed_filter, + _megdnn_workspace workspace) override; + //! implemented by get_workspace_with_ncb() size_t get_workspace_in_bytes(const TensorLayout& src, const TensorLayout& filter, const TensorLayout& dst, const PreprocessedFilter*) override; + SmallVector deduce_preprocessed_filter_layout( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& dst) override; + + size_t get_preprocess_workspace_in_bytes(const TensorLayout& src, + const TensorLayout& filter, + const TensorLayout& dst) override; + //! implemented by get_all_algorithms_with_ncb() std::vector get_all_algorithms( const TensorLayout& src, const TensorLayout& filter, @@ -70,6 +84,8 @@ public: ptrdiff_t inp_s[4], out_s[4]; Param::ComputeMode compute_mode; size_t nr_threads; + //! weight_preprocess info + const PreprocessedFilter* preprocessed_filter; }; //! memory param for kernels with non-contiguous batch @@ -169,6 +185,23 @@ public: virtual SmallVector dispatch_kern( ConvolutionImpl* opr, const NCBKernSizeParam& param) const = 0; + virtual SmallVector dispatch_preprocess_kern( + ConvolutionImpl*, const NCBKernSizeParam&) const { + return {}; + }; + + //! 
get the layouts of weight_preprocess dst + virtual SmallVector deduce_preprocessed_filter_layout( + ConvolutionImpl*, const NCBKernSizeParam&) const { + return {}; + }; + + //! get the workspace when weight_preprocess + virtual size_t get_preprocess_workspace(ConvolutionImpl*, + const NCBKernSizeParam&) const { + return 0_z; + }; + + //! Temporarily used to identify whether the matmul algorithm is + //! is_preferred. virtual bool is_preferred(ConvolutionImpl*, @@ -192,6 +225,9 @@ public: protected: virtual void exec_with_ncb_kern(const NCBKernParam& param, Algorithm* algo); + virtual void exec_preprocess_with_ncb_kern(const NCBKernParam& param, + Algorithm* algo); + virtual std::vector get_all_algorithms_with_ncb( const NCBKernSizeParam& param); @@ -199,21 +235,6 @@ protected: const NCBKernSizeParam& param, size_t workspace_limit_in_bytes, bool reproducible = false); - //! get kernel pointer - virtual SmallVector ncb_algo_dispatch_kern( - Algorithm* algo, const NCBKernSizeParam& param) { - return static_cast(algo)->dispatch_kern(this, param); - } - //! get algo workspace - virtual size_t ncb_algo_get_workspace(Algorithm* algo, - const NCBKernSizeParam& param) { - return static_cast(algo)->get_workspace(this, param); - } - /*! - * the default impl iterates over all ncb_1g_get_all_algorithms() - * and return the first one whose workspace does not exceed the limit. 
- */ - const char* get_algorithm_set_name() const override; class AlgoFallback; @@ -231,14 +252,16 @@ private: const NCBKernSizeParam& param, size_t workspace_size = std::numeric_limits::max()); - NCBKernSizeParam make_ncb_kern_size_param(const TensorLayout& src, - const TensorLayout& filter, - const TensorLayout& dst); - - NCBKernParam make_ncb_kern_param(_megdnn_tensor_in src, - _megdnn_tensor_in filter, - _megdnn_tensor_out dst, - _megdnn_workspace workspace); + NCBKernSizeParam make_ncb_kern_size_param( + const TensorLayout& src, const TensorLayout& filter, + const TensorLayout& dst, + const PreprocessedFilter* preprocessed_filter); + + NCBKernParam make_ncb_kern_param( + _megdnn_tensor_in src, _megdnn_tensor_in filter, + _megdnn_tensor_out dst, + const PreprocessedFilter* preprocessed_filter, + _megdnn_workspace workspace); }; class ConvolutionBackwardDataImpl : public naive::ConvolutionBackwardDataImpl { diff --git a/dnn/src/naive/conv_bias/opr_impl.cpp b/dnn/src/naive/conv_bias/opr_impl.cpp index 87abe1d374df5be5738d9a0a1395d3601c590aa4..e63eb83bd77031d1dd5ccb1c7d82e787c55b52cd 100644 --- a/dnn/src/naive/conv_bias/opr_impl.cpp +++ b/dnn/src/naive/conv_bias/opr_impl.cpp @@ -80,14 +80,15 @@ size_t ConvBiasForwardImpl::get_workspace_in_bytes(const TensorLayout& src, void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_in bias, _megdnn_tensor_in z, _megdnn_tensor_out dst, - const PreprocessedFilter*, + const PreprocessedFilter* preprocessed_filter, _megdnn_workspace workspace) { MIDOUT_BEGIN(megdnn_naive_conv_bias_fwd) { dt_byte *workspace_ptr = workspace.raw_ptr; // ============================w * f + b================================ - auto filter_meta = check_exec(src.layout, filter.layout, bias.layout, - z.layout, dst.layout, workspace.size); + auto filter_meta = + check_exec(src.layout, filter.layout, bias.layout, z.layout, + dst.layout, workspace.size, preprocessed_filter); auto sfb = dst; if 
(bias.layout.dtype.enumv() != dst.layout.dtype.enumv()) { // intermediate result diff --git a/dnn/src/naive/conv_bias/opr_impl.h b/dnn/src/naive/conv_bias/opr_impl.h index b410d10fb2aa2709e9c2802c772f356ab6ae7ab9..ad021ce3cd2af8b6a6cc105070f8f189457e3af5 100644 --- a/dnn/src/naive/conv_bias/opr_impl.h +++ b/dnn/src/naive/conv_bias/opr_impl.h @@ -61,9 +61,7 @@ public: void exec_preprocess(const TensorLayout&, _megdnn_tensor_in, const TensorLayout&, const TensorLayout&, const TensorLayout&, PreprocessedFilter*, - _megdnn_workspace) override{ - megdnn_throw("conv_bias exec_preprocess is not impl yet"); - } + _megdnn_workspace) override {} const char* get_algorithm_set_name() const override; }; diff --git a/dnn/src/naive/convolution/convolution.cpp b/dnn/src/naive/convolution/convolution.cpp index 1b48101a74cf4a8ad9d6ea5714b89dd2859f394d..134c6f0afb63e604dfda60400146bc1ce608c642 100644 --- a/dnn/src/naive/convolution/convolution.cpp +++ b/dnn/src/naive/convolution/convolution.cpp @@ -28,11 +28,11 @@ using namespace naive; void ConvolutionForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_out dst, - const PreprocessedFilter*, + const PreprocessedFilter* preprocessed_filter, _megdnn_workspace workspace) { MIDOUT_BEGIN(megdnn_naive_conv_fwd) { auto filter_meta = check_exec(src.layout, filter.layout, dst.layout, - workspace.size); + workspace.size, preprocessed_filter); using ComputeMode = Param::ComputeMode; #define DISPATCH_CMODE(in_dt, out_dt, in_ct, out_ct, comp_ct, cmode) \ do { \ diff --git a/dnn/src/naive/convolution/opr_impl.h b/dnn/src/naive/convolution/opr_impl.h index b7253be21f2b6bb53b8afe513f4905687bb224f6..b3d9a4d1e558eab8e99d2bd758028e3ff2d720f8 100644 --- a/dnn/src/naive/convolution/opr_impl.h +++ b/dnn/src/naive/convolution/opr_impl.h @@ -44,9 +44,7 @@ class ConvolutionForwardImpl: public ConvolutionForward { void exec_preprocess(const TensorLayout&, _megdnn_tensor_in, const TensorLayout&, PreprocessedFilter*, - 
_megdnn_workspace) override { - megdnn_throw("convolution exec_preprocess in not impl yet"); - } + _megdnn_workspace) override {} SmallVector deduce_preprocessed_filter_layout( const TensorLayout& , const TensorLayout& , diff --git a/dnn/test/common/opr_proxy.h b/dnn/test/common/opr_proxy.h index 121380c610156b03cbd8b70cfc3d2a8a6c369408..c34ff06aa9b37459abcfe665e3b79966ab7042ac 100644 --- a/dnn/test/common/opr_proxy.h +++ b/dnn/test/common/opr_proxy.h @@ -18,6 +18,9 @@ #include "test/common/workspace_wrapper.h" #include +#include + + namespace megdnn { namespace test { @@ -32,6 +35,9 @@ struct OprProxyDefaultImpl template struct OprProxy : public OprProxyDefaultImpl {}; +template +struct OprWeightPreprocessProxy : public OprProxyDefaultImpl {}; + template struct OprProxyVectorToSingle {}; @@ -139,6 +145,28 @@ struct OprProxyProfilingBase typename Opr::Algorithm* target_algo = nullptr; OprProxyProfilingBase(bool profile = false) { m_profiling = profile; } + + //! used for alloc tensor for weight preprocess + static std::shared_ptr alloc_tensors( + Handle* handle, const TensorLayoutArray& layouts) { + auto deleter = [handle](TensorNDArray* ptr) { + for (auto&& i : *ptr) { + auto pdata = static_cast(i.raw_ptr) + + i.layout.span().low_byte; + megdnn_free(handle, pdata); + } + delete ptr; + }; + std::shared_ptr ret{new TensorNDArray, deleter}; + for (size_t i = 0; i < layouts.size(); ++i) { + auto span = layouts[i].span(); + ret->emplace_back(static_cast( + megdnn_malloc(handle, span.dist_byte())) - + span.low_byte, + layouts[i]); + } + return ret; + } }; template @@ -207,7 +235,6 @@ DEF_PROF3(LocalShareBackwardData); DEF_PROF3(LocalShareBackwardFilter); #undef DEF_PROF3 -//! 
TODO: it should adapt weight preprocess later template <> struct OprProxy : public OprProxyProfilingTernary { @@ -263,6 +290,100 @@ struct OprProxy } }; +template <> +struct OprWeightPreprocessProxy + : public OprProxyProfilingTernary { + using OprProxyProfilingTernary::OprProxyProfilingTernary; + void exec(ConvolutionForward* opr, const TensorNDArray& tensors) { + megdnn_assert(tensors.size() == 3); + if (!Base::W.valid()) { + Base::W = WorkspaceWrapper(opr->handle(), 0); + } + if (Base::m_profiling && !Base::target_algo) { + size_t min_time = std::numeric_limits::max(); + for (auto algo : + opr->get_all_algorithms(tensors[0].layout, tensors[1].layout, + tensors[2].layout)) { + opr->execution_policy().algorithm = algo; + + auto preprocess_tensors = weight_prerocess(opr, tensors, algo); + megcoreSynchronize(opr->handle()->megcore_computing_handle()); + ConvolutionForward::PreprocessedFilter preprocessed_filter{ + algo, *preprocess_tensors}; + + auto workspace_size = opr->get_workspace_in_bytes( + tensors[0].layout, tensors[1].layout, tensors[2].layout, + &preprocessed_filter); + Base::W.update(workspace_size); + + for (size_t times = 0; times < Base::warmup_times; ++times) + opr->exec(tensors[0], tensors[1], tensors[2], + &preprocessed_filter, Base::W.workspace()); + megcoreSynchronize(opr->handle()->megcore_computing_handle()); + Timer timer; + timer.start(); + for (size_t times = 0; times < Base::exec_times; ++times) { + opr->exec(tensors[0], tensors[1], tensors[2], + &preprocessed_filter, Base::W.workspace()); + } + megcoreSynchronize(opr->handle()->megcore_computing_handle()); + timer.stop(); + printf("%.3fms %s\n", timer.get_time_in_us() / 1e3, + algo->name()); + if (min_time > timer.get_time_in_us()) { + min_time = timer.get_time_in_us(); + Base::target_algo = algo; + } + } + opr->execution_policy().algorithm = Base::target_algo; + auto preprocess_tensors = + weight_prerocess(opr, tensors, Base::target_algo); + 
megcoreSynchronize(opr->handle()->megcore_computing_handle()); + ConvolutionForward::PreprocessedFilter preprocessed_filter{ + Base::target_algo, *preprocess_tensors}; + auto workspace_size = opr->get_workspace_in_bytes( + tensors[0].layout, tensors[1].layout, tensors[2].layout, + &preprocessed_filter); + Base::W.update(workspace_size); + } + auto preprocess_tensors = + weight_prerocess(opr, tensors, Base::target_algo); + megcoreSynchronize(opr->handle()->megcore_computing_handle()); + ConvolutionForward::PreprocessedFilter preprocessed_filter{ + Base::target_algo, *preprocess_tensors}; + if (!Base::target_algo) { + auto workspace_size = opr->get_workspace_in_bytes( + tensors[0].layout, tensors[1].layout, tensors[2].layout, + &preprocessed_filter); + Base::W.update(workspace_size); + } + opr->exec(tensors[0], tensors[1], tensors[2], &preprocessed_filter, + Base::W.workspace()); + } + + //! handle weight preprocess + std::shared_ptr weight_prerocess( + ConvolutionForward* opr, const TensorNDArray& tensors, + ConvolutionForward::Algorithm* algo) { + auto weight_perprocess_layouts = opr->deduce_preprocessed_filter_layout( + tensors[0].layout, tensors[1].layout, tensors[2].layout); + auto preprocessed_filter_tensors_ptr = + alloc_tensors(opr->handle(), weight_perprocess_layouts); + ConvolutionForward::PreprocessedFilter preprocessed_filter{ + algo, *preprocessed_filter_tensors_ptr}; + size_t preprocess_workspace_size = + opr->get_preprocess_workspace_in_bytes(tensors[0].layout, + tensors[1].layout, + tensors[2].layout); + WorkspaceWrapper preprocess_workspace(opr->handle(), + preprocess_workspace_size); + opr->exec_preprocess(tensors[0].layout, tensors[1], tensors[2].layout, + &preprocessed_filter, + preprocess_workspace.workspace()); + return preprocessed_filter_tensors_ptr; + } +}; + template struct OprProxyProfiling5 : public OprProxyProfilingBase { @@ -329,11 +450,9 @@ struct OprProxyProfiling5 : public OprProxyProfilingBase { DEF_PROF5(DeformableConvForward); 
DEF_PROF5(DeformableConvBackwardFilter); -//DEF_PROF5(ConvBiasForward); DEF_PROF5(BatchConvBiasForward); #undef DEF_PROF5 -//! TODO: it should adapt weight preprocess later template <> struct OprProxy : public OprProxyProfiling5 { using OprProxyProfiling5::OprProxyProfiling5; @@ -390,6 +509,106 @@ struct OprProxy : public OprProxyProfiling5 { } }; +template <> +struct OprWeightPreprocessProxy + : public OprProxyProfiling5 { + using OprProxyProfiling5::OprProxyProfiling5; + void exec(ConvBiasForward* opr, const TensorNDArray& tensors) { + megdnn_assert(tensors.size() == 5); + if (!Base::W.valid()) { + Base::W = WorkspaceWrapper(opr->handle(), 0); + } + if (Base::m_profiling && !Base::target_algo) { + size_t min_time = std::numeric_limits::max(); + for (auto algo : + opr->get_all_algorithms(tensors[0].layout, tensors[1].layout, + tensors[2].layout, tensors[3].layout, + tensors[4].layout)) { + opr->execution_policy().algorithm = algo; + + auto preprocess_tensors = weight_prerocess(opr, tensors, algo); + megcoreSynchronize(opr->handle()->megcore_computing_handle()); + ConvBiasForward::PreprocessedFilter preprocessed_filter{ + algo, *preprocess_tensors}; + + auto workspace_size = opr->get_workspace_in_bytes( + tensors[0].layout, tensors[1].layout, tensors[2].layout, + tensors[3].layout, tensors[4].layout, + &preprocessed_filter); + Base::W.update(workspace_size); + + for (size_t times = 0; times < Base::warmup_times; ++times) + opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], + tensors[4], &preprocessed_filter, + Base::W.workspace()); + megcoreSynchronize(opr->handle()->megcore_computing_handle()); + Timer timer; + timer.start(); + for (size_t times = 0; times < Base::exec_times; ++times) { + opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], + tensors[4], &preprocessed_filter, + Base::W.workspace()); + } + megcoreSynchronize(opr->handle()->megcore_computing_handle()); + timer.stop(); + printf("%.3fms %s\n", timer.get_time_in_us() / 1e3, + algo->name()); 
+ if (min_time > timer.get_time_in_us()) { + min_time = timer.get_time_in_us(); + Base::target_algo = algo; + } + } + opr->execution_policy().algorithm = Base::target_algo; + auto preprocess_tensors = + weight_prerocess(opr, tensors, Base::target_algo); + megcoreSynchronize(opr->handle()->megcore_computing_handle()); + ConvBiasForward::PreprocessedFilter preprocessed_filter{ + Base::target_algo, *preprocess_tensors}; + auto workspace_size = opr->get_workspace_in_bytes( + tensors[0].layout, tensors[1].layout, tensors[2].layout, + tensors[3].layout, tensors[4].layout, &preprocessed_filter); + Base::W.update(workspace_size); + } + auto preprocess_tensors = + weight_prerocess(opr, tensors, Base::target_algo); + megcoreSynchronize(opr->handle()->megcore_computing_handle()); + ConvBiasForward::PreprocessedFilter preprocessed_filter{ + Base::target_algo, *preprocess_tensors}; + if (!Base::target_algo) { + auto workspace_size = opr->get_workspace_in_bytes( + tensors[0].layout, tensors[1].layout, tensors[2].layout, + tensors[3].layout, tensors[4].layout, &preprocessed_filter); + Base::W.update(workspace_size); + } + opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], tensors[4], + &preprocessed_filter, Base::W.workspace()); + } + + //! 
handle weight preprocess + std::shared_ptr weight_prerocess( + ConvBiasForward* opr, const TensorNDArray& tensors, + ConvBiasForward::Algorithm* algo) { + auto weight_perprocess_layouts = opr->deduce_preprocessed_filter_layout( + tensors[0].layout, tensors[1].layout, tensors[2].layout, + tensors[3].layout, tensors[4].layout); + auto preprocessed_filter_tensors_ptr = + alloc_tensors(opr->handle(), weight_perprocess_layouts); + ConvBiasForward::PreprocessedFilter preprocessed_filter{ + algo, *preprocessed_filter_tensors_ptr}; + size_t preprocess_workspace_size = + opr->get_preprocess_workspace_in_bytes( + tensors[0].layout, tensors[1].layout, tensors[2].layout, + tensors[3].layout, tensors[4].layout); + WorkspaceWrapper preprocess_workspace(opr->handle(), + preprocess_workspace_size); + opr->exec_preprocess(tensors[0].layout, tensors[1], tensors[2].layout, + tensors[3].layout, tensors[4].layout, + &preprocessed_filter, + preprocess_workspace.workspace()); + return preprocessed_filter_tensors_ptr; + } +}; + template struct OprProxyProfiling8 : public OprProxyProfilingBase { using Base = OprProxyProfilingBase;