From f14e0c17e7a25973c5279b4f1c6415e32220d12b Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Tue, 15 Dec 2020 20:28:32 +0800 Subject: [PATCH] feat(mgb): add recursive for fastrun and megdnn test GitOrigin-RevId: 743846f64536cd604a24024378ee93a7e333a50b --- dnn/include/megdnn/basic_types.h | 2 + dnn/include/megdnn/oprs/base.h | 73 ++- dnn/src/common/algo_chooser.h | 8 +- dnn/src/common/basic_types.cpp | 71 ++- .../cuda/batched_matrix_mul/brute_force.cpp | 6 +- dnn/src/cuda/conv_bias/bfloat16.cpp | 4 +- .../cuda/convolution/backward_data/algo.cpp | 8 +- dnn/src/cuda/convolution/backward_data/algo.h | 22 +- .../convolution/backward_data/bfloat16.cpp | 92 +-- .../convolution/backward_filter/bfloat16.cpp | 4 +- dnn/src/cuda/convolution/opr_impl.cpp | 6 +- dnn/src/cuda/convolution/opr_impl.h | 11 + dnn/src/cuda/matrix_mul/bfloat16.cpp | 2 +- dnn/src/fallback/conv_bias/opr_impl.cpp | 7 +- dnn/src/fallback/convolution/opr_impl.cpp | 4 +- dnn/src/fallback/matrix_mul/opr_impl.cpp | 2 +- dnn/test/common/benchmarker.h | 2 +- dnn/test/common/checker.h | 100 +++- dnn/test/common/convolution.cpp | 6 +- dnn/test/common/fast_run_cache.cpp | 47 ++ dnn/test/common/fast_run_cache.h | 58 ++ dnn/test/common/opr_proxy.h | 365 +++++++++--- dnn/test/cuda/batch_conv_bias.cpp | 2 +- dnn/test/cuda/chanwise_convolution.cpp | 12 +- dnn/test/cuda/conv_bias_int8.cpp | 4 +- dnn/test/cuda/convolution.cpp | 125 +++- dnn/test/cuda/local_share.cpp | 18 +- dnn/test/gtest_main.cpp | 24 +- dnn/test/x86/convolution.cpp | 1 + src/core/test/graph/misc.cpp | 17 +- src/opr/impl/search_policy/algo_chooser.cpp | 537 +++++++++++++----- src/opr/impl/search_policy/profiler.cpp | 104 +++- src/opr/include/megbrain/opr/blas.h | 4 +- .../megbrain/opr/search_policy/algo_chooser.h | 122 ++-- .../opr/search_policy/algo_chooser_helper.h | 4 +- .../megbrain/opr/search_policy/profiler.h | 13 +- src/opr/test/dnn/convolution.cpp | 255 ++++++--- src/plugin/test/opr_io_dump_text_out.h | 98 ++-- 38 files changed, 1680 
insertions(+), 560 deletions(-) create mode 100644 dnn/test/common/fast_run_cache.cpp create mode 100644 dnn/test/common/fast_run_cache.h diff --git a/dnn/include/megdnn/basic_types.h b/dnn/include/megdnn/basic_types.h index b942b22a8..61f580a5f 100644 --- a/dnn/include/megdnn/basic_types.h +++ b/dnn/include/megdnn/basic_types.h @@ -330,6 +330,8 @@ struct TensorLayout : public TensorShape { /* =================== properties =================== */ std::string to_string() const; + + std::string serialize() const; #endif // MEGDNN_CC_HOST /*! diff --git a/dnn/include/megdnn/oprs/base.h b/dnn/include/megdnn/oprs/base.h index 734109501..1c185c8fc 100644 --- a/dnn/include/megdnn/oprs/base.h +++ b/dnn/include/megdnn/oprs/base.h @@ -11,6 +11,7 @@ */ #pragma once +#include #include "megdnn/basic_types.h" #include "megdnn/handle.h" @@ -144,8 +145,11 @@ public: return {{handle_type(), type(), param()}, name(), is_reproducible()}; } + Info::Desc desc() const { return {handle_type(), type(), param()}; } + template static void serialize_write_pod(const T& val, std::string& result) { + static_assert(std::is_standard_layout::value, "invalid type"); result.append(reinterpret_cast(&val), sizeof(T)); } @@ -155,6 +159,7 @@ public: template static T deserialize_read_pod(const std::string& data, size_t offset = 0) { + static_assert(std::is_standard_layout::value, "invalid type"); T ret; //! A pointer to an object or incomplete type may be converted to a //! pointer to a different object or incomplete type. If the resulting @@ -167,10 +172,69 @@ public: return ret; } + template + static T deserialize_read_pod(const char* data, size_t offset = 0) { + static_assert(std::is_standard_layout::value, "invalid type"); + T ret; + //! A pointer to an object or incomplete type may be converted to a + //! pointer to a different object or incomplete type. If the resulting + //! pointer is not correctly aligned for the pointed-to type, the + //! behavior is undefined. + //! + //! 
so here we should use memcpy instead of + //! *reinterpret_cast(&data[offset]); + memcpy(&ret, data + offset, sizeof(T)); + return ret; + } + + enum class OprType : uint32_t { + MATRIX_MUL_FORWARD, + BATCHED_MATRIX_MUL_FORWARD, + CONVOLUTION_FORWARD, + CONVOLUTION_BACKWARD_DATA, + CONVOLUTION_BACKWARD_FILTER, + CONVOLUTION3D_FORWARD, + CONVOLUTION3D_BACKWARD_DATA, + CONVOLUTION3D_BACKWARD_FILTER, + LOCAL_SHARE_FORWARD, + LOCAL_SHARE_BACKWARD_DATA, + LOCAL_SHARE_BACKWARD_FILTER, + DEFORMABLE_CONV_FORWARD, + DEFORMABLE_CONV_BACKWARD_DATA, + DEFORMABLE_CONV_BACKWARD_FILTER, + CONVBIAS_FORWARD, + BATCH_CONV_FORWARD, + }; + + struct SearchItem { + OprType opr_type; + //! serialized param + std::string param; + TensorLayoutArray layouts; + }; + + /** + * \brief get subopr list of the algo + * + * \param layouts origin layouts of the parent opr + * \param opr parent opr + */ + virtual std::vector get_subopr_list(const TensorLayoutArray&, + const OperatorBase*) const { + return {}; + } + protected: Handle::HandleType m_handle_type = Handle::HandleType::NAIVE; }; +//! policy for executing the operator +struct ExecutionPolicy { + //! INVALID_ALGO_TYPE algo_type means using heuristic + Algorithm::Info::Desc algo; + std::vector sub_policy; +}; + /*! * \brief define Algorithm and ExecutionPolicy for oprs that have * multiple impl algos @@ -198,12 +262,6 @@ public: */ virtual const char* get_algorithm_set_name() const = 0; - //! policy for executing the operator - struct ExecutionPolicy { - //! 
INVALID_ALGO_TYPE algo_type means using heuristic - AlgorithmInfo algo; - }; - ExecutionPolicy& execution_policy() { return m_execution_policy; } const ExecutionPolicy& execution_policy() const { @@ -464,6 +522,9 @@ protected: bool reproducible = false) = 0; }; } // namespace detail + +using Algorithm = detail::Algorithm; +using ExecutionPolicy = detail::ExecutionPolicy; } // namespace megdnn #include "megdnn/internal/visibility_epilogue.h" diff --git a/dnn/src/common/algo_chooser.h b/dnn/src/common/algo_chooser.h index f7486c2ad..4db9df59f 100644 --- a/dnn/src/common/algo_chooser.h +++ b/dnn/src/common/algo_chooser.h @@ -25,17 +25,17 @@ namespace megdnn { */ template typename Opr::AlgoBase* get_algorithm(Opr* opr, Args&&... args) { - typename Opr::AlgorithmInfo ret; + typename Opr::AlgorithmDesc ret; auto set = opr->execution_policy().algo; if (set.valid()) { ret = set; } else { ret = opr->get_algorithm_info_heuristic( std::forward(args)..., std::numeric_limits::max(), - false); + false).desc; } return static_cast( - opr->get_algorithm_from_desc(ret.desc)); + opr->get_algorithm_from_desc(ret)); } /*! @@ -46,7 +46,7 @@ template typename Opr::AlgoBase* get_algorithm_or_construct(Opr* opr, Args&&... 
args) { auto set = opr->execution_policy().algo; if (set.valid()) { - return opr->algo_pack().construct_and_get_algo(set.desc); + return opr->algo_pack().construct_and_get_algo(set); } else { return static_cast( opr->get_algorithm_heuristic(std::forward(args)..., diff --git a/dnn/src/common/basic_types.cpp b/dnn/src/common/basic_types.cpp index 48b4ecc11..ac869af44 100644 --- a/dnn/src/common/basic_types.cpp +++ b/dnn/src/common/basic_types.cpp @@ -20,6 +20,7 @@ #include #include #include +#include using namespace megdnn; @@ -35,6 +36,26 @@ class DefaultErrorHandler final : public ErrorHandler { #endif } }; + +template +void serialize_pod(const T& val, std::string& result) { + static_assert(std::is_standard_layout::value, "invalid type"); + result.append(reinterpret_cast(&val), sizeof(T)); +} + +template +void serialize_vec(const T* val, size_t size, std::string& result) { + result.append(reinterpret_cast(val), sizeof(T) * size); +} + +template +T deserialize_pod(const std::string& data, size_t& offset) { + T ret; + memcpy(&ret, data.data() + offset, sizeof(T)); + offset += sizeof(T); + return ret; +} + } // namespace ErrorHandler* ErrorHandler::sm_inst; @@ -126,17 +147,23 @@ bool TensorShape::eq_shape(const TensorShape& rhs) const { size_t eq = 0; switch (ndim) { case 7: - eq += shape[6] == rhs.shape[6]; MEGDNN_FALLTHRU + eq += shape[6] == rhs.shape[6]; + MEGDNN_FALLTHRU case 6: - eq += shape[5] == rhs.shape[5]; MEGDNN_FALLTHRU + eq += shape[5] == rhs.shape[5]; + MEGDNN_FALLTHRU case 5: - eq += shape[4] == rhs.shape[4]; MEGDNN_FALLTHRU + eq += shape[4] == rhs.shape[4]; + MEGDNN_FALLTHRU case 4: - eq += shape[3] == rhs.shape[3]; MEGDNN_FALLTHRU + eq += shape[3] == rhs.shape[3]; + MEGDNN_FALLTHRU case 3: - eq += shape[2] == rhs.shape[2]; MEGDNN_FALLTHRU + eq += shape[2] == rhs.shape[2]; + MEGDNN_FALLTHRU case 2: - eq += shape[1] == rhs.shape[1]; MEGDNN_FALLTHRU + eq += shape[1] == rhs.shape[1]; + MEGDNN_FALLTHRU case 1: eq += shape[0] == rhs.shape[0]; } @@ -435,8 
+462,8 @@ bool TensorLayout::try_reshape(TensorLayout& result, for (size_t i = 0; i < tshp.ndim; ++i) { if (!tshp.shape[i]) { megdnn_throw_if(!format.is_default(), tensor_reshape_error, - megdnn_mangle(ssprintf("bad target tshp: %s", - tshp.to_string().c_str()))); + megdnn_mangle(ssprintf("bad target tshp: %s", + tshp.to_string().c_str()))); is_empty_shape = true; break; } @@ -510,8 +537,36 @@ std::string TensorLayout::to_string() const { rst.append(" @ "); rst.append(format.impl()->to_string()); } + rst.append(std::string(" ") + dtype.name()); rst.append("}"); return rst; } +std::string TensorLayout::serialize() const { + std::string rst; + serialize_pod(ndim, rst); + serialize_vec(shape, ndim, rst); + serialize_vec(stride, ndim, rst); + rst.append(format.impl()->to_string()); + + //! serialize dtype + serialize_pod(dtype.enumv(), rst); + if (dtype.has_param()) { + switch (dtype.enumv()) { +#define cb(_dt) \ + case DTypeTrait::enumv: \ + serialize_pod(dtype::_dt::downcast_from(dtype).param(), rst); \ + break; + MEGDNN_FOREACH_PARAMETERIZED_DTYPE(cb) +#undef cb + default: + megdnn_assert(false, + "cannot serialize unknown parameterized DType"); + break; + } + } + + return rst; +} + // vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/batched_matrix_mul/brute_force.cpp b/dnn/src/cuda/batched_matrix_mul/brute_force.cpp index e8c389150..f3c093ef5 100644 --- a/dnn/src/cuda/batched_matrix_mul/brute_force.cpp +++ b/dnn/src/cuda/batched_matrix_mul/brute_force.cpp @@ -24,7 +24,7 @@ bool BatchedMatrixMulForwardImpl::AlgoBruteForce::is_available( const SizeArgs& args) const { MatrixMulForwardImpl mm{args.opr->handle()}; mm.param() = {args.opr->param().transposeA, args.opr->param().transposeB}; - mm.execution_policy() = {m_algorithm->info()}; + mm.execution_policy() = {m_algorithm->desc(), {}}; auto mm_layout_a = args.layout_a.remove_axis(0); auto mm_layout_b = args.layout_b.remove_axis(0); @@ -39,7 +39,7 @@ size_t 
BatchedMatrixMulForwardImpl::AlgoBruteForce::get_workspace_in_bytes( auto mm_opr = args.opr->handle()->create_operator(); mm_opr->param() = {args.opr->param().transposeA, args.opr->param().transposeB}; - mm_opr->execution_policy() = {m_algorithm->info()}; + mm_opr->execution_policy() = {m_algorithm->desc(), {}}; return mm_opr->get_workspace_in_bytes(args.layout_a, args.layout_b, args.layout_c); @@ -50,7 +50,7 @@ void BatchedMatrixMulForwardImpl::AlgoBruteForce::exec( auto&& mm_opr = args.opr->handle()->create_operator(); mm_opr->param() = {args.opr->param().transposeA, args.opr->param().transposeB}; - mm_opr->execution_policy() = {m_algorithm->info()}; + mm_opr->execution_policy() = {m_algorithm->desc(), {}}; rep(n, N) { TensorND A_, B_, C_; auto tensor_n_from_batch = [n](const TensorND& in, TensorND& out) { diff --git a/dnn/src/cuda/conv_bias/bfloat16.cpp b/dnn/src/cuda/conv_bias/bfloat16.cpp index d9387a85a..7d4cf0390 100644 --- a/dnn/src/cuda/conv_bias/bfloat16.cpp +++ b/dnn/src/cuda/conv_bias/bfloat16.cpp @@ -47,7 +47,7 @@ ConvBiasForwardImpl::AlgoBFloat16::float_args( change_dtype(fdst); opr->param() = args.opr->param(); opr->param().compute_mode = Param::ComputeMode::DEFAULT; - opr->execution_policy() = {m_impl->info()}; + opr->execution_policy() = {m_impl->desc(), {}}; return SizeArgs(opr, fsrc, ffilter, fbias, fz, fdst); } @@ -110,7 +110,7 @@ void ConvBiasForwardImpl::AlgoBFloat16::exec(const ExecArgs& args) const { auto convbias_opr = args.handle->create_operator(); convbias_opr->param() = args.opr->param(); convbias_opr->param().compute_mode = Param::ComputeMode::DEFAULT; - convbias_opr->execution_policy() = {m_impl->info()}; + convbias_opr->execution_policy() = {m_impl->desc(), {}}; convbias_opr->exec(fsrc_tensor, ffilter_tensor, fbias_tensor, fz_tensor, fdst_tensor, nullptr, cvter.workspace()); } diff --git a/dnn/src/cuda/convolution/backward_data/algo.cpp b/dnn/src/cuda/convolution/backward_data/algo.cpp index 1c3f69916..dcdcd5896 100644 --- 
a/dnn/src/cuda/convolution/backward_data/algo.cpp +++ b/dnn/src/cuda/convolution/backward_data/algo.cpp @@ -46,12 +46,8 @@ ConvolutionBackwardDataImpl::AlgoPack::AlgoPack() { megdnn_assert(all_algos_data == all_algos.data()); non_cudnn_algos.push_back(all_algos.rbegin()[0]); // group matmul - size_t algo_size = all_algos.size(); - for (size_t i=0; iinfo().desc, algo); diff --git a/dnn/src/cuda/convolution/backward_data/algo.h b/dnn/src/cuda/convolution/backward_data/algo.h index 942d0b3ed..e6a3f6b5b 100644 --- a/dnn/src/cuda/convolution/backward_data/algo.h +++ b/dnn/src/cuda/convolution/backward_data/algo.h @@ -170,28 +170,22 @@ public: class ConvolutionBackwardDataImpl::AlgoBFloat16 final : public AlgoBase { public: - AlgoBFloat16(ConvolutionBackwardDataImpl::AlgoBase*); bool is_available(const SizeArgs& args) const override; size_t get_workspace_in_bytes(const SizeArgs& args) const override; void exec(const ExecArgs& args) const override; - const char* name() const override { return m_name.c_str(); } + std::vector get_subopr_list( + const TensorLayoutArray& layouts, + const OperatorBase* opr) const override; + + const char* name() const override { + return "CONVOLUTION_BACKWARD_DATD_BFLOAT16"; + } bool is_reproducible() const override { return true; } private: - std::string m_name; - ConvolutionBackwardDataImpl::AlgoBase* m_algorithm = nullptr; - SizeArgs float_args(const SizeArgs& args, ConvolutionBackwardDataImpl* opr, - TensorLayout& fsrc, TensorLayout& ffilter, - TensorLayout& fdst) const; WorkspaceBundle get_workspace_bundle(void* ptr, const SizeArgs& args) const; MEGDNN_DECL_ALGO_TYPE(CUDA_BFLOAT16) - - std::string param() const override { - std::string ret; - serialize_write_pod(m_algorithm, ret); - return ret; - } }; //! implement group conv by another algo @@ -237,7 +231,7 @@ public: AlgoChanwiseSmall chanwise_small; std::vector gconv; std::unordered_map algo2gconv; - std::vector> bfloat16_refhold; + AlgoBFloat16 bfloat16; std::vector //! 
all algorithms diff --git a/dnn/src/cuda/convolution/backward_data/bfloat16.cpp b/dnn/src/cuda/convolution/backward_data/bfloat16.cpp index 3e4c6304e..322a8e8b8 100644 --- a/dnn/src/cuda/convolution/backward_data/bfloat16.cpp +++ b/dnn/src/cuda/convolution/backward_data/bfloat16.cpp @@ -17,33 +17,39 @@ using namespace megdnn; using namespace cuda; using namespace convolution; -ConvolutionBackwardDataImpl::AlgoBFloat16::AlgoBFloat16( - ConvolutionBackwardDataImpl::AlgoBase* algorithm) - : m_algorithm(algorithm) { - megdnn_assert_internal(algorithm); - m_name = ssprintf("CONVOLUTION_BACKWARD_DATD_BFLOAT16:%s", - m_algorithm->name()); -} - -ConvolutionBackwardDataImpl::AlgoBase::SizeArgs -ConvolutionBackwardDataImpl::AlgoBFloat16::float_args( - const SizeArgs& args, ConvolutionBackwardDataImpl* opr, - TensorLayout& ffilter, TensorLayout& fdiff, TensorLayout& fgrad) const { - ffilter = *args.filter_layout; - fdiff = *args.diff_layout; - fgrad = *args.grad_layout; +namespace { +std::pair sub_opr_config( + const TensorLayoutArray& layouts, + const ConvolutionBackwardDataImpl* opr) { + megdnn_assert(layouts.size() >= 3); + std::pair ret; + ret.first = layouts; auto change_dtype = [](TensorLayout& layout) { if (layout.dtype == dtype::BFloat16()) { layout.dtype = dtype::Float32(); } }; - change_dtype(ffilter); - change_dtype(fdiff); - change_dtype(fgrad); - opr->param() = args.opr->param(); - opr->param().compute_mode = Param::ComputeMode::DEFAULT; - opr->execution_policy() = {m_algorithm->info()}; - return SizeArgs(opr, ffilter, fdiff, fgrad); + change_dtype(ret.first[0]); + change_dtype(ret.first[1]); + change_dtype(ret.first[2]); + + ret.second = opr->param(); + ret.second.compute_mode = + ConvolutionBackwardData::Param::ComputeMode::DEFAULT; + return ret; +} +} + +std::vector +ConvolutionBackwardDataImpl::AlgoBFloat16::get_subopr_list( + const TensorLayoutArray& layouts, const OperatorBase* opr) const { + auto&& config = sub_opr_config( + layouts, static_cast(opr)); + + 
std::string param_str; + Algorithm::serialize_write_pod(config.second, param_str); + return {{Algorithm::OprType::CONVOLUTION_BACKWARD_DATA, param_str, + config.first}}; } bool ConvolutionBackwardDataImpl::AlgoBFloat16::is_available( @@ -51,24 +57,30 @@ bool ConvolutionBackwardDataImpl::AlgoBFloat16::is_available( TensorLayout ffilter, fdiff, fgrad; auto conv_back_data_opr = args.handle->create_operator(); - SizeArgs fargs = float_args( - args, - static_cast(conv_back_data_opr.get()), - ffilter, fdiff, fgrad); + auto&& config = sub_opr_config( + {*args.filter_layout, *args.diff_layout, *args.grad_layout}, + args.opr); + conv_back_data_opr->param() = config.second; return args.diff_layout->dtype == args.filter_layout->dtype && args.diff_layout->dtype == dtype::BFloat16() && - m_algorithm->is_available(fargs); + get_algorithm(static_cast( + conv_back_data_opr.get()), + config.first[0], config.first[1], config.first[2]); } WorkspaceBundle ConvolutionBackwardDataImpl::AlgoBFloat16::get_workspace_bundle( void* ptr, const SizeArgs& args) const { - TensorLayout ffilter, fdiff, fgrad; auto conv_back_data_opr = args.handle->create_operator(); - SizeArgs fargs = float_args( - args, - static_cast(conv_back_data_opr.get()), - ffilter, fdiff, fgrad); + if (args.opr->execution_policy().algo.valid()) { + megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1); + conv_back_data_opr->execution_policy() = + args.opr->execution_policy().sub_policy[0]; + } + auto&& config = sub_opr_config( + {*args.filter_layout, *args.diff_layout, *args.grad_layout}, + args.opr); + conv_back_data_opr->param() = config.second; SmallVector sizes; auto get_workspace = [&sizes](const TensorLayout& src, const TensorLayout& dst) { @@ -76,10 +88,12 @@ WorkspaceBundle ConvolutionBackwardDataImpl::AlgoBFloat16::get_workspace_bundle( sizes.push_back(dst.span().dist_byte()); } }; - get_workspace(*args.filter_layout, ffilter); - get_workspace(*args.diff_layout, fdiff); - 
get_workspace(*args.grad_layout, fgrad); - sizes.push_back(m_algorithm->get_workspace_in_bytes(fargs)); + get_workspace(*args.filter_layout, config.first[0]); + get_workspace(*args.diff_layout, config.first[1]); + get_workspace(*args.grad_layout, config.first[2]); + + sizes.push_back(conv_back_data_opr->get_workspace_in_bytes( + config.first[0], config.first[1], config.first[2])); return {ptr, std::move(sizes)}; } @@ -103,9 +117,13 @@ void ConvolutionBackwardDataImpl::AlgoBFloat16::exec( { auto conv_back_data_opr = args.handle->create_operator(); + if (args.opr->execution_policy().algo.valid()) { + megdnn_assert(args.opr->execution_policy().sub_policy.size() == 1); + conv_back_data_opr->execution_policy() = + args.opr->execution_policy().sub_policy[0]; + } conv_back_data_opr->param() = args.opr->param(); conv_back_data_opr->param().compute_mode = Param::ComputeMode::DEFAULT; - conv_back_data_opr->execution_policy() = {m_algorithm->info()}; conv_back_data_opr->exec(ffilter_tensor, fdiff_tensor, fgrad_tensor, cvter.workspace()); } diff --git a/dnn/src/cuda/convolution/backward_filter/bfloat16.cpp b/dnn/src/cuda/convolution/backward_filter/bfloat16.cpp index 911410cd6..6e8dd7097 100644 --- a/dnn/src/cuda/convolution/backward_filter/bfloat16.cpp +++ b/dnn/src/cuda/convolution/backward_filter/bfloat16.cpp @@ -42,7 +42,7 @@ ConvolutionBackwardFilterImpl::AlgoBFloat16::float_args( change_dtype(fgrad); opr->param() = args.opr->param(); opr->param().compute_mode = Param::ComputeMode::DEFAULT; - opr->execution_policy() = {m_algorithm->info()}; + opr->execution_policy() = {m_algorithm->desc(), {}}; return SizeArgs(opr, fsrc, fdiff, fgrad); } @@ -107,7 +107,7 @@ void ConvolutionBackwardFilterImpl::AlgoBFloat16::exec( conv_back_filter_opr->param() = args.opr->param(); conv_back_filter_opr->param().compute_mode = Param::ComputeMode::DEFAULT; - conv_back_filter_opr->execution_policy() = {m_algorithm->info()}; + conv_back_filter_opr->execution_policy() = {m_algorithm->desc(), {}}; 
conv_back_filter_opr->exec(fsrc_tensor, fdiff_tensor, fgrad_tensor, cvter.workspace()); } diff --git a/dnn/src/cuda/convolution/opr_impl.cpp b/dnn/src/cuda/convolution/opr_impl.cpp index 1d76b9d7f..288fdb34d 100644 --- a/dnn/src/cuda/convolution/opr_impl.cpp +++ b/dnn/src/cuda/convolution/opr_impl.cpp @@ -69,7 +69,7 @@ ConvolutionForwardImpl::conv_bias_extra_data(const TensorLayout& src, conv_param.dilate_h, conv_param.dilate_w, conv_param.compute_mode}; - ret.convbias_opr->execution_policy() = {this->execution_policy().algo}; + ret.convbias_opr->execution_policy() = {this->execution_policy().algo, {}}; return ret; } @@ -102,7 +102,7 @@ ConvolutionForwardImpl::get_algorithm_from_desc( conv_param.dilate_h, conv_param.dilate_w, conv_param.compute_mode}; - convbias_opr->execution_policy() = {this->execution_policy().algo}; + convbias_opr->execution_policy() = {this->execution_policy().algo, {}}; return static_cast(convbias_opr.get()) ->get_algorithm_from_desc(desc); @@ -160,7 +160,7 @@ void ConvolutionBackwardDataImpl::exec(_megdnn_tensor_in filter, _megdnn_tensor_out grad, _megdnn_workspace workspace) { AlgoBase::ExecArgs args(this, filter, diff, grad, workspace); - auto algo = get_algorithm(this, filter.layout, args.filter_meta, + auto algo = get_algorithm(this, filter.layout, diff.layout, grad.layout); algo->check_workspace(args, workspace).exec(args); } diff --git a/dnn/src/cuda/convolution/opr_impl.h b/dnn/src/cuda/convolution/opr_impl.h index f10a2739f..ec2d2671b 100644 --- a/dnn/src/cuda/convolution/opr_impl.h +++ b/dnn/src/cuda/convolution/opr_impl.h @@ -83,6 +83,17 @@ public: workspace_limit_in_bytes, reproducible) ->info(); } + + AlgorithmInfo get_algorithm_info_heuristic(const TensorLayout& filter, + const TensorLayout& diff, + const TensorLayout& grad, + size_t workspace_limit_in_bytes, + bool reproducible) { + return get_algorithm_heuristic(filter, diff, grad, + workspace_limit_in_bytes, reproducible) + ->info(); + } + size_t get_workspace_in_bytes(const 
TensorLayout& filter, const TensorLayout& diff, const TensorLayout& grad) override; diff --git a/dnn/src/cuda/matrix_mul/bfloat16.cpp b/dnn/src/cuda/matrix_mul/bfloat16.cpp index cf8ce254f..e3e9f1261 100644 --- a/dnn/src/cuda/matrix_mul/bfloat16.cpp +++ b/dnn/src/cuda/matrix_mul/bfloat16.cpp @@ -82,7 +82,7 @@ void MatrixMulForwardImpl::AlgoBFloat16::exec(const ExecArgs& args) const { args.opr->handle()->create_operator(); matmul_opr->param() = args.opr->param(); matmul_opr->param().compute_mode = Param::ComputeMode::DEFAULT; - matmul_opr->execution_policy() = {m_algorithm->info()}; + matmul_opr->execution_policy() = {m_algorithm->desc(), {}}; matmul_opr->exec(a, b, c, ctypecvt.workspace()); } ctypecvt.comp_to_dst_type(c, args.tensor_c); diff --git a/dnn/src/fallback/conv_bias/opr_impl.cpp b/dnn/src/fallback/conv_bias/opr_impl.cpp index 7471541cb..ee8593586 100644 --- a/dnn/src/fallback/conv_bias/opr_impl.cpp +++ b/dnn/src/fallback/conv_bias/opr_impl.cpp @@ -1,6 +1,5 @@ /** - * \file dnn/src/fallback/conv_bias/opr_impl.cpp - * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") * * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
* @@ -367,7 +366,7 @@ ConvBiasImpl::NCBKernSizeParam ConvBiasImpl::make_ncb_kern_size_param( "should be equal"); auto&& fm = check_layout_fwd(src, filter, dst); auto& conv_fm = reinterpret_cast(fm); - + size_t nr_threads = static_cast(handle()) ->megcore_dispatcher() ->nr_threads(); @@ -495,7 +494,7 @@ ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm_from_desc( ConvBiasImpl::Algorithm* ConvBiasImpl::get_algorithm( const NCBKernSizeParam& param, size_t workspace_size) { - if (auto algo = get_algorithm_from_desc(execution_policy().algo.desc)) { + if (auto algo = get_algorithm_from_desc(execution_policy().algo)) { return algo; } if (!m_prev_selected_algo || diff --git a/dnn/src/fallback/convolution/opr_impl.cpp b/dnn/src/fallback/convolution/opr_impl.cpp index d15c0af92..068da98ec 100644 --- a/dnn/src/fallback/convolution/opr_impl.cpp +++ b/dnn/src/fallback/convolution/opr_impl.cpp @@ -387,7 +387,7 @@ ConvolutionImpl::Algorithm* ConvolutionImpl::get_algorithm_from_desc( ConvolutionImpl::Algorithm* ConvolutionImpl::get_algorithm( const NCBKernSizeParam& param, size_t workspace_size) { - if (auto algo = get_algorithm_from_desc(execution_policy().algo.desc)) { + if (auto algo = get_algorithm_from_desc(execution_policy().algo)) { return algo; } if (!m_prev_selected_algo || @@ -783,7 +783,7 @@ ConvolutionBackwardDataImpl::get_algorithm_from_desc( ConvolutionBackwardDataImpl::Algorithm* ConvolutionBackwardDataImpl::get_algorithm(const NCBKernSizeParam& param) { - if (auto algo = get_algorithm_from_desc(execution_policy().algo.desc)) { + if (auto algo = get_algorithm_from_desc(execution_policy().algo)) { return algo; } if (!m_prev_selected_algo || diff --git a/dnn/src/fallback/matrix_mul/opr_impl.cpp b/dnn/src/fallback/matrix_mul/opr_impl.cpp index c1df44e90..25a47b807 100644 --- a/dnn/src/fallback/matrix_mul/opr_impl.cpp +++ b/dnn/src/fallback/matrix_mul/opr_impl.cpp @@ -134,7 +134,7 @@ MatrixMul::Algorithm* MatrixMulImpl::get_algorithm_heuristic( size_t 
workspace_limit_in_bytes, bool reproducible) { auto kern_size_param = make_kern_size_param(A, B, C); if (auto algo = static_cast( - get_algorithm_from_desc(execution_policy().algo.desc))) { + get_algorithm_from_desc(execution_policy().algo))) { megdnn_assert(algo->get_workspace(kern_size_param) < workspace_limit_in_bytes); auto cur = megdnn::get_reproducible_algo(algo, diff --git a/dnn/test/common/benchmarker.h b/dnn/test/common/benchmarker.h index ecfb150d3..89aef66de 100644 --- a/dnn/test/common/benchmarker.h +++ b/dnn/test/common/benchmarker.h @@ -382,7 +382,7 @@ float algo_benchmark(Benchmarker& benchmark, TensorLayoutArray layouts, for (auto i : algos) { if (std::regex_match(i.name, std::regex("(" + algo_base + ")(.*)"))) { - opr->execution_policy().algo = i; + opr->execution_policy().algo = i.desc; auto used = benchmark.exec(layouts); min_used = std::min(min_used, used); printf("run algo: %s used: %f ms min_used: %f ms\n", i.name.c_str(), diff --git a/dnn/test/common/checker.h b/dnn/test/common/checker.h index aa76336e6..18eeb95ff 100644 --- a/dnn/test/common/checker.h +++ b/dnn/test/common/checker.h @@ -242,6 +242,11 @@ public: return *this; } + Checker& reset_before_exec_callback() { + m_before_exec_callback = nullptr; + return *this; + } + //! set a tensors constraints function, for the purpose of manipulating //! tensors when testing. Checker& set_tensors_constraint( @@ -435,6 +440,17 @@ public: Testcase operator=(const Testcase&) = delete; }; +struct ExecutionPolicyAlgoName { + std::string name; + std::vector sub_policy_names; + + ExecutionPolicyAlgoName(const char* name) : name{name} {} + + ExecutionPolicyAlgoName( + const char* name, + const std::vector& sub_policy) + : name{name}, sub_policy_names{sub_policy} {} +}; /*! 
* \brief a callable to check that given algorithm is used for heuristic * \param require_algo if its value is true, then requires @@ -444,48 +460,76 @@ public: */ template > class AlgoChecker { - std::string m_name; - typename Opr::Algorithm* m_algo = nullptr; - bool* m_require_algo; - public: - AlgoChecker(const char* name, bool* require_algo = nullptr) - : m_name{name}, m_require_algo{require_algo} {} - AlgoChecker(typename Opr::Algorithm* algo, bool* require_algo = nullptr) - : m_algo{algo}, m_require_algo{require_algo} {} + AlgoChecker(ExecutionPolicyAlgoName name, bool* require_algo = nullptr) + : m_policy_name{name}, m_require_algo{require_algo} {} + + AlgoChecker(ExecutionPolicy policy, bool* require_algo = nullptr) + : m_policy{policy}, m_require_algo{require_algo} {} + + static ExecutionPolicy construct_execution_policy_from_name( + const ExecutionPolicyAlgoName& policy_name, + const TensorLayoutArray& layouts, const std::string& param, + Handle* handle) { + ExecutionPolicy ret; + megdnn_assert(layouts.size() == OprTrait::arity); + auto opr = handle->create_operator(); + opr->param() = + Algorithm::deserialize_read_pod(param); + for (auto algo_info : + AlgoProxy::arity>::get_all_algorithms_info( + opr.get(), layouts)) { + if (std::regex_match( + algo_info.name, + std::regex("(" + policy_name.name + ")(.*)"))) { + ret.algo = algo_info.desc; + } else { + continue; + } + + Algorithm* algo = opr->get_algorithm_from_desc(algo_info.desc); + std::vector&& sub_items = + algo->get_subopr_list(layouts, opr.get()); + FOREACH_OPR_TYPE_DISPATCH(sub_items, { + ExecutionPolicy policy = + AlgoChecker<_Opr>::construct_execution_policy_from_name( + policy_name.sub_policy_names[_item_idx], + _item.layouts, _item.param, handle); + ret.sub_policy.push_back(policy); + }); + return ret; + } + return ret; + } void operator()(Opr* opr, const CheckerHelper::TensorValueArray& arr) { TensorLayoutArray layouts; for (auto&& val : arr) { layouts.push_back(val.layout); } + if 
(!m_policy_name.name.empty()) { + std::string param_str; + Algorithm::serialize_write_pod(opr->param(), param_str); + m_policy = construct_execution_policy_from_name( + m_policy_name, layouts, param_str, opr->handle()); + ASSERT_TRUE(m_policy.algo.valid()) + << "algorithm " << m_policy_name.name << " not found"; + } if (m_require_algo && *m_require_algo) { auto algo = OprAlgoProxy::get_algorithm_info_heuristic(opr, layouts); - if (m_name.empty()) { - ASSERT_EQ(m_algo->name(), algo.name.c_str()); - } else { - ASSERT_TRUE(std::regex_match( - algo.name.c_str(), std::regex("(" + m_name + ")(.*)"))); - } + ASSERT_STREQ(opr->get_algorithm_from_desc(m_policy.algo)->name(), + algo.name.c_str()); } else { - if (m_name.empty()) { - opr->execution_policy().algo = m_algo->info(); - return; - } else { - for (auto i : - OprAlgoProxy::get_all_algorithms_info(opr, layouts)) { - if (std::regex_match(i.name, - std::regex("(" + m_name + ")(.*)"))) { - opr->execution_policy().algo = i; - return; - } - } - } - ASSERT_TRUE(false) << "algorithm " << m_name << " not found"; + opr->execution_policy() = m_policy; } } + +private: + ExecutionPolicyAlgoName m_policy_name; + ExecutionPolicy m_policy; + bool* m_require_algo; }; } // namespace test diff --git a/dnn/test/common/convolution.cpp b/dnn/test/common/convolution.cpp index 00166a924..75cfaae37 100644 --- a/dnn/test/common/convolution.cpp +++ b/dnn/test/common/convolution.cpp @@ -580,7 +580,7 @@ void convolution::test_conv_config_combinations(int k_size, checker.set_rng(0, &rng).set_rng(1, &rng); for (auto algo : opr->get_all_algorithms_info(ily, fly, oly)) { used_algos.insert(algo.desc); - opr->execution_policy().algo = algo; + opr->execution_policy().algo = algo.desc; checker .set_epsilon(eps_getter(dtype == 1, 0, algo.name.c_str())) .execs({ishp, fshp, {}}); @@ -599,7 +599,7 @@ void convolution::test_conv_config_combinations(int k_size, opr->param() = param; for (auto algo: opr->get_all_algorithms_info(fly, oly, ily)) { 
used_algos_bwd_data.insert(algo.desc); - opr->execution_policy().algo = algo; + opr->execution_policy().algo = algo.desc; checker_bwd_data .set_epsilon(eps_getter(dtype == 1, 1, algo.name.c_str())) .execl({fly, oly, ily}); @@ -620,7 +620,7 @@ void convolution::test_conv_config_combinations(int k_size, opr->param() = param; for (auto algo: opr->get_all_algorithms_info(ily, oly, fly)) { used_algos_bwd_flt.insert(algo.desc); - opr->execution_policy().algo = algo; + opr->execution_policy().algo = algo.desc; checker_bwd_filter .set_epsilon(eps_getter(dtype == 1, 2, algo.name.c_str())) .execl({ily, oly, fly}); diff --git a/dnn/test/common/fast_run_cache.cpp b/dnn/test/common/fast_run_cache.cpp new file mode 100644 index 000000000..dd11e2747 --- /dev/null +++ b/dnn/test/common/fast_run_cache.cpp @@ -0,0 +1,47 @@ +/** + * \file dnn/test/common/fast_run_cache.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ + +#include "test/common/fast_run_cache.h" +#include "src/common/utils.h" + +using namespace megdnn; +using namespace test; + +FastRunCache::SearchItemStorage::SearchItemStorage( + const Algorithm::SearchItem& item) { + Algorithm::serialize_write_pod(item.opr_type, data_hold); + for (auto&& layout : item.layouts) { + data_hold += layout.serialize(); + } + data_hold += item.param; +} + +Algorithm::Info::Desc FastRunCache::get(const Algorithm::SearchItem& key) { + SearchItemStorage key_storage(key); + key_storage.init_hash(); + + auto iter = m_cache.find(key_storage); + if (iter == m_cache.end()) { + return {}; + } + return iter->second; +} + +void FastRunCache::put(const Algorithm::SearchItem& key, + const Algorithm::Info::Desc& val) { + SearchItemStorage key_storage(key); + key_storage.init_hash(); + megdnn_assert(m_cache.find(key_storage) == m_cache.end()); + m_cache[std::move(key_storage)] = val; +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/test/common/fast_run_cache.h b/dnn/test/common/fast_run_cache.h new file mode 100644 index 000000000..23c173a55 --- /dev/null +++ b/dnn/test/common/fast_run_cache.h @@ -0,0 +1,58 @@ +/** + * \file dnn/test/common/fast_run_cache.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#pragma once + +#include "megdnn/oprs.h" +#include "src/common/hash_ct.h" + +#include + +namespace megdnn { +namespace test { +class FastRunCache { + struct SearchItemStorage { + std::string data_hold; + size_t hash = 0; + + SearchItemStorage(const Algorithm::SearchItem& item); + + SearchItemStorage& init_hash() { + hash = XXHash64CT::hash(data_hold.data(), data_hold.size(), + 20201225); + return *this; + } + + bool operator==(const SearchItemStorage& rhs) const { + return data_hold == rhs.data_hold; + } + + struct Hash { + size_t operator()(const SearchItemStorage& s) const { + return s.hash; + } + }; + }; + + std::unordered_map + m_cache; + +public: + Algorithm::Info::Desc get(const Algorithm::SearchItem& key); + void put(const Algorithm::SearchItem& key, + const Algorithm::Info::Desc& val); +}; + +} // namespace test +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/test/common/opr_proxy.h b/dnn/test/common/opr_proxy.h index ec592986e..0aafd518a 100644 --- a/dnn/test/common/opr_proxy.h +++ b/dnn/test/common/opr_proxy.h @@ -13,6 +13,7 @@ #include "test/common/deduce_layout_proxy.h" #include "test/common/exec_proxy.h" +#include "test/common/fast_run_cache.h" #include "test/common/inspect_type.h" #include "test/common/opr_algo_proxy.h" #include "test/common/opr_trait.h" @@ -20,11 +21,104 @@ #include "test/common/workspace_wrapper.h" #include +#include #include +#include namespace megdnn { namespace test { +template +struct OprFromOprTypeTrait; + +template +struct OprTypeFromOprTrait; + +#define cb(_opr_type, _opr) \ + template <> \ + struct OprFromOprTypeTrait { \ + using Opr = megdnn::_opr; \ + }; \ + template <> \ + struct OprTypeFromOprTrait { \ + constexpr static Algorithm::OprType opr_type = \ + Algorithm::OprType::_opr_type; \ + } + +cb(MATRIX_MUL_FORWARD, MatrixMulForward); +cb(CONVOLUTION_FORWARD, ConvolutionForward); +cb(CONVOLUTION_BACKWARD_DATA, ConvolutionBackwardData); +cb(CONVOLUTION_BACKWARD_FILTER, 
ConvolutionBackwardFilter); +cb(CONVOLUTION3D_FORWARD, Convolution3DForward); +cb(CONVOLUTION3D_BACKWARD_DATA, Convolution3DBackwardData); +cb(CONVOLUTION3D_BACKWARD_FILTER, Convolution3DBackwardFilter); +cb(LOCAL_SHARE_FORWARD, LocalShareForward); +cb(LOCAL_SHARE_BACKWARD_DATA, LocalShareBackwardData); +cb(LOCAL_SHARE_BACKWARD_FILTER, LocalShareBackwardFilter); +cb(DEFORMABLE_CONV_FORWARD, DeformableConvForward); +cb(DEFORMABLE_CONV_BACKWARD_DATA, DeformableConvBackwardData); +cb(DEFORMABLE_CONV_BACKWARD_FILTER, DeformableConvBackwardFilter); +cb(BATCH_CONV_FORWARD, BatchConvBiasForward); +cb(CONVBIAS_FORWARD, ConvBiasForward); + +#undef cb + +// clang-format off +#define FOREACH_OPR_TYPE(cb) \ + cb(MATRIX_MUL_FORWARD) \ + cb(CONVOLUTION_FORWARD) \ + cb(CONVOLUTION_BACKWARD_DATA) \ + cb(CONVOLUTION_BACKWARD_FILTER) \ + cb(CONVOLUTION3D_FORWARD) \ + cb(CONVOLUTION3D_BACKWARD_DATA) \ + cb(CONVOLUTION3D_BACKWARD_FILTER) \ + cb(LOCAL_SHARE_FORWARD) \ + cb(LOCAL_SHARE_BACKWARD_DATA) \ + cb(LOCAL_SHARE_BACKWARD_FILTER) \ + cb(DEFORMABLE_CONV_FORWARD) \ + cb(DEFORMABLE_CONV_BACKWARD_DATA) \ + cb(DEFORMABLE_CONV_BACKWARD_FILTER) \ + cb(BATCH_CONV_FORWARD) \ + cb(CONVBIAS_FORWARD) + +#define FOREACH_OPR_TYPE_WITH_STMT(cb, stmt) \ + cb(MATRIX_MUL_FORWARD, stmt) \ + cb(CONVOLUTION_FORWARD, stmt) \ + cb(CONVOLUTION_BACKWARD_DATA, stmt) \ + cb(CONVOLUTION_BACKWARD_FILTER, stmt) \ + cb(CONVOLUTION3D_FORWARD, stmt) \ + cb(CONVOLUTION3D_BACKWARD_DATA, stmt) \ + cb(CONVOLUTION3D_BACKWARD_FILTER, stmt) \ + cb(LOCAL_SHARE_FORWARD, stmt) \ + cb(LOCAL_SHARE_BACKWARD_DATA, stmt) \ + cb(LOCAL_SHARE_BACKWARD_FILTER, stmt) \ + cb(DEFORMABLE_CONV_FORWARD, stmt) \ + cb(DEFORMABLE_CONV_BACKWARD_DATA, stmt) \ + cb(DEFORMABLE_CONV_BACKWARD_FILTER, stmt) \ + cb(BATCH_CONV_FORWARD, stmt) \ + cb(CONVBIAS_FORWARD, stmt) + +// clang-format on + +#define _OPR_TYPE_CASE(_opr_type, _stmt) \ + case Algorithm::OprType::_opr_type: { \ + using _Opr = typename OprFromOprTypeTrait< \ + 
Algorithm::OprType::_opr_type>::Opr; \ + _stmt; \ + break; \ + } + +#define FOREACH_OPR_TYPE_DISPATCH(_search_items, _stmt) \ + for (size_t _item_idx = 0; _item_idx < _search_items.size(); \ + _item_idx++) { \ + auto&& _item = _search_items[_item_idx]; \ + switch (_item.opr_type) { \ + FOREACH_OPR_TYPE_WITH_STMT(_OPR_TYPE_CASE, _stmt) \ + default: \ + megdnn_throw("unknown opr_type"); \ + } \ + } + template ::arity, bool has_workspace = OprTrait::has_workspace, bool can_deduce_layout = OprTrait::can_deduce_layout> @@ -130,10 +224,11 @@ struct OprProxy : DeduceLayoutProxy { }; //! OprProxy impl for tenary oprs with profiling support -template +template struct OprProxyProfilingBase - : public DeduceLayoutProxy::arity, OprTrait::can_deduce_layout> { + static constexpr int arity = OprTrait::arity; size_t warmup_times = 10, exec_times = 100; //! whether to enable profiling @@ -142,7 +237,7 @@ struct OprProxyProfilingBase //! target algo setup by profiler; it can also be directly specified by the //! caller - typename Opr::AlgorithmInfo target_algo_info; + ExecutionPolicy target_execution_policy; OprProxyProfilingBase(bool profile = false) { m_profiling = profile; } @@ -168,6 +263,154 @@ struct OprProxyProfilingBase return ret; } + /** + * flatten search space in postorder traversal + * The subopr search construct a search tree + * + * A + * / \ + * B1B2 C + * / \ + * D1D2D3 E + * We use postorder traverse the search tree. 
+ * D1 -> D2 -> D3 -> E -> B1 -> B2 -> C -> A + */ + static std::vector flatten_search_space( + const TensorLayoutArray layouts, const std::string& param, + Handle* handle) { + megdnn_assert(layouts.size() == arity); + auto opr = handle->create_operator(); + opr->param() = + Algorithm::deserialize_read_pod(param); + + std::vector ret; + for (auto algo_info : AlgoProxy::get_all_algorithms_info( + opr.get(), layouts)) { + Algorithm* algo = opr->get_algorithm_from_desc(algo_info.desc); + std::vector&& sub_items = + algo->get_subopr_list(layouts, opr.get()); + + FOREACH_OPR_TYPE_DISPATCH(sub_items, { + auto space = OprProxyProfilingBase<_Opr>::flatten_search_space( + _item.layouts, _item.param, handle); + ret.insert(ret.end(), space.begin(), space.end()); + }); + } + ret.push_back({OprTypeFromOprTrait::opr_type, param, layouts}); + return ret; + } + + static void construct_execution_policy( + const TensorLayoutArray& layouts, const std::string& param, + Handle* handle, FastRunCache& cache, + ExecutionPolicy& policy) { + megdnn_assert(layouts.size() == arity); + auto opr = handle->create_operator(); + opr->param() = + Algorithm::deserialize_read_pod(param); + if (!policy.algo.valid()) { + policy.algo = cache.get(Algorithm::SearchItem{ + OprTypeFromOprTrait::opr_type, param, layouts}); + megdnn_assert(policy.algo.valid(), + "No cache found, maybe some error occured in " + "flatten_search_space or get_subopr_list"); + } + policy.sub_policy.clear(); + Algorithm* algo = opr->get_algorithm_from_desc(policy.algo); + std::vector&& sub_items = + algo->get_subopr_list(layouts, opr.get()); + FOREACH_OPR_TYPE_DISPATCH(sub_items, { + policy.sub_policy.push_back({}); + OprProxyProfilingBase<_Opr>::construct_execution_policy( + _item.layouts, _item.param, handle, cache, + policy.sub_policy.back()); + }); + return; + } + + /** + * \brief search and get the best execution_policy + */ + static void search(const TensorLayoutArray& layouts, + const std::string& param, + WorkspaceWrapper& 
workspace_wrapper, Handle* handle, + size_t warmup_times, size_t exec_times, + FastRunCache& cache) { + megdnn_assert(layouts.size() == arity); + auto opr = handle->create_operator(); + opr->param() = + Algorithm::deserialize_read_pod(param); + SmallVector sizes_in_bytes; + for (const auto& layout : layouts) { + sizes_in_bytes.push_back(layout.span().dist_byte()); + } + + float min_time = std::numeric_limits::max(); + Algorithm::Info::Desc best_algo; + + std::string log_info = "Profiling start: "; + for (auto&& layout : layouts) { + log_info += layout.to_string() + " "; + } + megdnn_log("%s", log_info.c_str()); + best_algo = cache.get(Algorithm::SearchItem{ + OprTypeFromOprTrait::opr_type, param, layouts}); + + if (best_algo.valid()) { + auto&& algo = opr->get_algorithm_from_desc(best_algo); + MEGDNN_MARK_USED_VAR(algo); + megdnn_log("Find best algo %s in cache", algo->name()); + return; + } + for (auto algo : AlgoProxy::get_all_algorithms_info( + opr.get(), layouts)) { + //! construct execution_policy + opr->execution_policy().algo = algo.desc; + construct_execution_policy(layouts, param, handle, cache, + opr->execution_policy()); + + auto workspace_size = AlgoProxy::get_workspace_in_bytes( + opr.get(), layouts); + sizes_in_bytes.push_back(workspace_size); + + WorkspaceBundle wb(nullptr, sizes_in_bytes); + workspace_wrapper.update(wb.total_size_in_bytes()); + wb.set(workspace_wrapper.workspace().raw_ptr); + TensorNDArray tensors; + for (size_t i = 0; i < arity; i++) { + tensors.push_back({wb.get(i), layouts[i]}); + } + + for (size_t times = 0; times < warmup_times; ++times) { + AlgoProxy::exec(opr.get(), tensors, + wb.get_workspace(arity)); + } + megcoreSynchronize(opr->handle()->megcore_computing_handle()); + Timer timer; + timer.start(); + for (size_t times = 0; times < exec_times; ++times) { + AlgoProxy::exec(opr.get(), tensors, + wb.get_workspace(arity)); + } + megcoreSynchronize(opr->handle()->megcore_computing_handle()); + timer.stop(); + megdnn_log("%.3fms 
%s", timer.get_time_in_us() / 1e3, + algo.name.c_str()); + if (min_time > timer.get_time_in_us()) { + min_time = timer.get_time_in_us(); + best_algo = algo.desc; + } + + sizes_in_bytes.pop_back(); + } + auto&& algo = opr->get_algorithm_from_desc(best_algo); + MEGDNN_MARK_USED_VAR(algo); + megdnn_log("Profiling end, got best algo: %s", algo->name()); + cache.put(Algorithm::SearchItem{OprTypeFromOprTrait::opr_type, + param, layouts}, + best_algo); + } + void exec(Opr* opr, const TensorNDArray& tensors) { megdnn_assert(tensors.size() == arity); if (!W.valid()) { @@ -177,39 +420,26 @@ struct OprProxyProfilingBase for (auto&& tensor : tensors) { layouts.push_back(tensor.layout); } - if (m_profiling && !target_algo_info.valid()) { - size_t min_time = std::numeric_limits::max(); - for (auto algo : - AlgoProxy::get_all_algorithms_info(opr, layouts)) { - opr->execution_policy().algo = algo; - auto workspace_size = - AlgoProxy::get_workspace_in_bytes(opr, - layouts); - W.update(workspace_size); - - for (size_t times = 0; times < warmup_times; ++times) - AlgoProxy::exec(opr, tensors, W.workspace()); - megcoreSynchronize(opr->handle()->megcore_computing_handle()); - Timer timer; - timer.start(); - for (size_t times = 0; times < exec_times; ++times) { - AlgoProxy::exec(opr, tensors, W.workspace()); - } - megcoreSynchronize(opr->handle()->megcore_computing_handle()); - timer.stop(); - printf("%.3fms %s\n", timer.get_time_in_us() / 1e3, - algo.name.c_str()); - if (min_time > timer.get_time_in_us()) { - min_time = timer.get_time_in_us(); - target_algo_info = algo; - } - } - opr->execution_policy().algo = target_algo_info; + if (m_profiling && !target_execution_policy.algo.valid()) { + FastRunCache cache; + std::string param_str; + Algorithm::serialize_write_pod(opr->param(), param_str); + auto&& search_items = + flatten_search_space(layouts, param_str, opr->handle()); + FOREACH_OPR_TYPE_DISPATCH(search_items, { + OprProxyProfilingBase<_Opr>::search(_item.layouts, param_str, W, + 
opr->handle(), warmup_times, + exec_times, cache); + }); + + construct_execution_policy(layouts, param_str, opr->handle(), cache, + opr->execution_policy()); + target_execution_policy = opr->execution_policy(); auto workspace_size = AlgoProxy::get_workspace_in_bytes(opr, layouts); W.update(workspace_size); } - if (!target_algo_info.valid()) { + if (!target_execution_policy.algo.valid()) { auto workspace_size = AlgoProxy::get_workspace_in_bytes(opr, layouts); W.update(workspace_size); @@ -218,30 +448,32 @@ struct OprProxyProfilingBase } }; -#define DEF_PROF(c, arity) \ - template <> \ - struct OprProxy : public OprProxyProfilingBase { \ - using OprProxyProfilingBase::OprProxyProfilingBase; \ +#define DEF_PROF(c) \ + template <> \ + struct OprProxy : public OprProxyProfilingBase { \ + using OprProxyProfilingBase::OprProxyProfilingBase; \ } -DEF_PROF(ConvolutionForward, 3); -DEF_PROF(ConvolutionBackwardData, 3); -DEF_PROF(ConvolutionBackwardFilter, 3); -DEF_PROF(LocalShareForward, 3); -DEF_PROF(LocalShareBackwardData, 3); -DEF_PROF(LocalShareBackwardFilter, 3); +DEF_PROF(MatrixMulForward); +DEF_PROF(ConvolutionForward); +DEF_PROF(ConvolutionBackwardData); +DEF_PROF(ConvolutionBackwardFilter); +DEF_PROF(LocalShareForward); +DEF_PROF(LocalShareBackwardData); +DEF_PROF(LocalShareBackwardFilter); -DEF_PROF(DeformableConvForward, 5); -DEF_PROF(DeformableConvBackwardFilter, 5); -DEF_PROF(BatchConvBiasForward, 5); -DEF_PROF(ConvBiasForward, 5); +DEF_PROF(DeformableConvForward); +DEF_PROF(DeformableConvBackwardFilter); +DEF_PROF(BatchConvBiasForward); +DEF_PROF(ConvBiasForward); -DEF_PROF(DeformableConvBackwardData, 8); +DEF_PROF(DeformableConvBackwardData); #undef DEF_PROF -template -struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase { - using Base = OprProxyProfilingBase; +template +struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase { + using Base = OprProxyProfilingBase; + static constexpr int arity = OprTrait::arity; void exec(Opr* opr, 
const TensorNDArray& tensors) { megdnn_assert(tensors.size() == arity); if (!Base::W.valid()) { @@ -252,11 +484,11 @@ struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase { for (auto&& tensor : tensors) { layouts.push_back(tensor.layout); } - if (Base::m_profiling && !Base::target_algo_info.desc.valid()) { + if (Base::m_profiling && !Base::target_execution_policy.algo.valid()) { size_t min_time = std::numeric_limits::max(); for (auto algo : AlgoProxy::get_all_algorithms_info(opr, layouts)) { - opr->execution_policy().algo = algo; + opr->execution_policy().algo = algo.desc; auto preprocess_tensors = weight_prerocess(opr, tensors, algo.desc); @@ -288,12 +520,12 @@ struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase { algo.name.c_str()); if (min_time > timer.get_time_in_us()) { min_time = timer.get_time_in_us(); - Base::target_algo_info = algo; + Base::target_execution_policy.algo = algo.desc; } } - opr->execution_policy().algo = Base::target_algo_info; - auto preprocess_tensors = - weight_prerocess(opr, tensors, Base::target_algo_info.desc); + opr->execution_policy() = Base::target_execution_policy; + auto preprocess_tensors = weight_prerocess( + opr, tensors, Base::target_execution_policy.algo); megcoreSynchronize(opr->handle()->megcore_computing_handle()); typename Opr::PreprocessedFilter preprocessed_filter{ nullptr, *preprocess_tensors}; @@ -301,12 +533,12 @@ struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase { opr, layouts, &preprocessed_filter); Base::W.update(workspace_size); } - auto preprocess_tensors = - weight_prerocess(opr, tensors, Base::target_algo_info.desc); + auto preprocess_tensors = weight_prerocess( + opr, tensors, Base::target_execution_policy.algo); megcoreSynchronize(opr->handle()->megcore_computing_handle()); typename Opr::PreprocessedFilter preprocessed_filter{ nullptr, *preprocess_tensors}; - if (!Base::target_algo_info.valid()) { + if (!Base::target_execution_policy.algo.valid()) { auto 
workspace_size = AlgoProxy::get_workspace_in_bytes( opr, layouts, &preprocessed_filter); Base::W.update(workspace_size); @@ -342,16 +574,15 @@ struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase { } }; -#define DEF_PROF(c, arity) \ - template <> \ - struct OprWeightPreprocessProxy \ - : public OprWeightPreprocessProxyImpl { \ - using OprWeightPreprocessProxyImpl< \ - c, arity>::OprWeightPreprocessProxyImpl; \ +#define DEF_PROF(c) \ + template <> \ + struct OprWeightPreprocessProxy \ + : public OprWeightPreprocessProxyImpl { \ + using OprWeightPreprocessProxyImpl::OprWeightPreprocessProxyImpl; \ } -DEF_PROF(ConvolutionForward, 3); -DEF_PROF(ConvBias, 5); +DEF_PROF(ConvolutionForward); +DEF_PROF(ConvBias); #undef DEF_PROF } // namespace test diff --git a/dnn/test/cuda/batch_conv_bias.cpp b/dnn/test/cuda/batch_conv_bias.cpp index dff1b2206..3ccb155b9 100644 --- a/dnn/test/cuda/batch_conv_bias.cpp +++ b/dnn/test/cuda/batch_conv_bias.cpp @@ -279,7 +279,7 @@ void benchmark_target_algo(Handle* handle, const std::vector& args, benchmarker.set_param(bparam); if (!algo) { - benchmarker.proxy()->target_algo_info.reset(); + benchmarker.proxy()->target_execution_policy.algo.reset(); } auto time_in_ms = benchmarker.execs( diff --git a/dnn/test/cuda/chanwise_convolution.cpp b/dnn/test/cuda/chanwise_convolution.cpp index 57d202236..8a41d7265 100644 --- a/dnn/test/cuda/chanwise_convolution.cpp +++ b/dnn/test/cuda/chanwise_convolution.cpp @@ -514,7 +514,7 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_FWD) { auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, size_t FW) { - checker.proxy()->target_algo_info.reset(); + checker.proxy()->target_execution_policy.algo.reset(); checker.execs({{N, C, IH, IW}, {C, 1, 1, FH, FW}, {}}); }; @@ -538,7 +538,7 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_BWD_DATA) { auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, size_t FW) { - checker.proxy()->target_algo_info.reset(); + 
checker.proxy()->target_execution_policy.algo.reset(); checker.execs({{C, 1, 1, FH, FW}, {N, C, IH - FH + 1, IW - FW + 1}, {N, C, IH, IW}}); @@ -564,7 +564,7 @@ TEST_F(CUDA, CHANWISE_CONVOLUTION_BENCH_ALL_ALGO_BWD_FILTER) { auto run = [&](size_t N, size_t C, size_t IH, size_t IW, size_t FH, size_t FW) { - checker.proxy()->target_algo_info.reset(); + checker.proxy()->target_execution_policy.algo.reset(); checker.execs({{N, C, IH, IW}, {N, C, IH - FH + 1, IW - FW + 1}, {C, 1, 1, FH, FW}}); @@ -614,7 +614,7 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_ALL_ALGO_FORWARD) { .set_dtype(2, dtype::Float32()) .set_rng(0, &rng) .set_rng(1, &rng); - bencher.proxy()->target_algo_info.reset(); + bencher.proxy()->target_execution_policy.algo.reset(); auto time_in_ms_fp32 = bencher.execs({src, filter, {}}) / RUNS; bencher.set_param(param) @@ -623,10 +623,10 @@ TEST_F(CUDA, BENCHMARK_CHANWISE_CONV_ALL_ALGO_FORWARD) { .set_dtype(2, dtype::Float16()) .set_rng(0, &rng) .set_rng(1, &rng); - bencher.proxy()->target_algo_info.reset(); + bencher.proxy()->target_execution_policy.algo.reset(); auto time_in_ms_fp16 = bencher.execs({src, filter, {}}) / RUNS; - bencher.proxy()->target_algo_info.reset(); + bencher.proxy()->target_execution_policy.algo.reset(); param.compute_mode = param::Convolution::ComputeMode::FLOAT32; bencher.set_param(param); auto time_in_ms_pseudo_fp16 = bencher.execs({src, filter, {}}) / RUNS; diff --git a/dnn/test/cuda/conv_bias_int8.cpp b/dnn/test/cuda/conv_bias_int8.cpp index 631dafb2b..9591e40d4 100644 --- a/dnn/test/cuda/conv_bias_int8.cpp +++ b/dnn/test/cuda/conv_bias_int8.cpp @@ -168,7 +168,7 @@ void benchmark_target_algo( benchmarker.set_param(param); if (!algo) { - benchmarker.proxy()->target_algo_info.reset(); + benchmarker.proxy()->target_execution_policy.algo.reset(); } TensorShape src{arg.n, arg.ci, arg.hi, arg.wi}, filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1}, @@ -327,7 +327,7 @@ void benchmark_target_algo_with_cudnn_tsc( 
benchmarker.set_param(param); if (!algo) { - benchmarker.proxy()->target_algo_info.reset(); + benchmarker.proxy()->target_execution_policy.algo.reset(); } TensorShape src{arg.n, arg.ci, arg.hi, arg.wi}, filter{arg.co, arg.ci, arg.f, arg.f}, bias{1, arg.co, 1, 1}, diff --git a/dnn/test/cuda/convolution.cpp b/dnn/test/cuda/convolution.cpp index 2230aa953..18ade4c12 100644 --- a/dnn/test/cuda/convolution.cpp +++ b/dnn/test/cuda/convolution.cpp @@ -8,6 +8,7 @@ * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ +#include "megdnn/dtype.h" #include "megdnn/oprs.h" #include "megdnn/opr_param_defs.h" #include "test/cuda/fixture.h" @@ -223,14 +224,19 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) .set_epsilon(1e-1) .set_param(arg.param) .exec(TensorLayoutArray{filter, dst, src}); - src.dtype = dst.dtype = filter.dtype = dtype::BFloat16(); - checker. - set_rng(0, &rng). - set_rng(1, &rng). - set_epsilon(1e-1). - set_param(arg.param). 
- exec(TensorLayoutArray{filter, dst, src}); } + checker.set_before_exec_callback(AlgoChecker( + ExecutionPolicyAlgoName{"CONVOLUTION_BACKWARD_DATD_BFLOAT16", + {{"MATMUL", {}}}})); + src.dtype = dst.dtype = filter.dtype = dtype::BFloat16(); + arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32; + checker.set_rng(0, &rng) + .set_rng(1, &rng) + .set_epsilon(1e-1) + .set_param(arg.param) + .exec(TensorLayoutArray{filter, dst, src}); + checker.reset_before_exec_callback(); + checker.opr()->execution_policy() = {}; } } @@ -382,32 +388,35 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_1) { #if MEGDNN_WITH_BENCHMARK TEST_F(CUDA, CONV_FWD_BENCHMARK) { - auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, size_t SH=1, - size_t SW=1, size_t FH=1, size_t FW=1, size_t PH=0, size_t PW=0, bool fp16io_c32=false) { + auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, + size_t SH = 1, size_t SW = 1, size_t FH = 1, size_t FW = 1, + size_t PH = 0, size_t PW = 0, bool fp16io_c32 = false) { auto benchmarker = Benchmarker(handle_cuda()); benchmarker.set_dtype(0, dtype::Float16()) - .set_dtype(1, dtype::Float16()) - .set_dtype(2, dtype::Float16()); + .set_dtype(1, dtype::Float16()) + .set_dtype(2, dtype::Float16()); ConvolutionForward::Param param; param.stride_h = SH; param.stride_w = SW; param.pad_h = PH; param.pad_w = PW; if (fp16io_c32) { - param.compute_mode = ConvolutionForward::Param::ComputeMode::FLOAT32; + param.compute_mode = + ConvolutionForward::Param::ComputeMode::FLOAT32; } benchmarker.set_param(param); - std::unique_ptr> proxy{new OprProxy{true}}; + std::unique_ptr> proxy{ + new OprProxy{true}}; benchmarker.set_proxy(proxy); size_t OH = (IH - FH + 2 * PH) / SH + 1; size_t OW = (IW - FW + 2 * PW) / SW + 1; - auto time = benchmarker.execs({ - {N, IC, IH, IW}, {OC, IC, FH, FW}, {N, OC, OH, OW}}); + auto time = benchmarker.execs( + {{N, IC, IH, IW}, {OC, IC, FH, FW}, {N, OC, OH, OW}}); time /= 1000.0 * 10.0; - auto flo = (double) N 
* OC * IC * OH * OW * FH * FW * 2; + auto flo = (double)N * OC * IC * OH * OW * FH * FW * 2; auto flops = flo / time / 1e12; printf("comp_type %s: ", fp16io_c32 ? "32" : "16"); - printf("%.3fG FLO, flops %.3fTFLOPS\n", flo/1e9, flops); + printf("%.3fG FLO, flops %.3fTFLOPS\n", flo / 1e9, flops); }; run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, false); run(32, 512, 256, 56, 56, 1, 1, 1, 1, 0, 0, true); @@ -415,7 +424,8 @@ TEST_F(CUDA, CONV_FWD_BENCHMARK) { TEST_F(CUDA, CONVOLUTION_FWD_BENCHMARK) { CUBenchmarker bench{handle_cuda()}; - std::unique_ptr> proxy{new OprProxy{true}}; + std::unique_ptr> proxy{ + new OprProxy{true}}; size_t RUNS = 10; bench.set_proxy(proxy).set_times(RUNS); @@ -429,7 +439,7 @@ TEST_F(CUDA, CONVOLUTION_FWD_BENCHMARK) { param.pad_h = param.pad_w = PH; param.compute_mode = param::Convolution::ComputeMode::DEFAULT; bench.set_param(param); - bench.proxy()->target_algo_info.reset(); + bench.proxy()->target_execution_policy.algo.reset(); TensorLayout src{{N, IC, IH, IW}, dtype::Float32()}, filter{{OC, IC, FH, FH}, dtype::Float32()}; TensorLayout dst; @@ -440,13 +450,13 @@ TEST_F(CUDA, CONVOLUTION_FWD_BENCHMARK) { } auto time_ms_fp32 = bench.execl({src, filter, dst}) / RUNS; src.dtype = filter.dtype = dst.dtype = dtype::Float16(); - bench.proxy()->target_algo_info.reset(); + bench.proxy()->target_execution_policy.algo.reset(); bench.set_dtype(0, dtype::Float16()) .set_dtype(1, dtype::Float16()) .set_dtype(2, dtype::Float16()); auto time_ms_true_fp16 = bench.execl({src, filter, dst}) / RUNS; param.compute_mode = param::Convolution::ComputeMode::FLOAT32; - bench.proxy()->target_algo_info.reset(); + bench.proxy()->target_execution_policy.algo.reset(); bench.set_param(param); auto time_ms_pseudo_fp16 = bench.execl({src, filter, dst}) / RUNS; float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH; @@ -500,7 +510,7 @@ TEST_F(CUDA, CONVOLUTION_BWD_DATA_BENCHMARK) { param.pad_h = param.pad_w = PH; param.compute_mode = 
param::Convolution::ComputeMode::DEFAULT; bench.set_param(param); - bench.proxy()->target_algo_info.reset(); + bench.proxy()->target_execution_policy.algo.reset(); TensorLayout src{{N, IC, IH, IW}, dtype::Float32()}, filter{{OC, IC, FH, FH}, dtype::Float32()}; TensorLayout dst; @@ -511,13 +521,13 @@ TEST_F(CUDA, CONVOLUTION_BWD_DATA_BENCHMARK) { } auto time_ms_fp32 = bench.execl({filter, dst, src}) / RUNS; src.dtype = filter.dtype = dst.dtype = dtype::Float16(); - bench.proxy()->target_algo_info.reset(); + bench.proxy()->target_execution_policy.algo.reset(); bench.set_dtype(0, dtype::Float16()) .set_dtype(1, dtype::Float16()) .set_dtype(2, dtype::Float16()); auto time_ms_true_fp16 = bench.execl({filter, dst, src}) / RUNS; param.compute_mode = param::Convolution::ComputeMode::FLOAT32; - bench.proxy()->target_algo_info.reset(); + bench.proxy()->target_execution_policy.algo.reset(); bench.set_param(param); auto time_ms_pseudo_fp16 = bench.execl({filter, dst, src}) / RUNS; float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH; @@ -554,6 +564,62 @@ TEST_F(CUDA, CONVOLUTION_BWD_DATA_BENCHMARK) { run(32, 64, 64, 56, 56, 1, 1, 0); } +TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_BF16) { + CUBenchmarker bench{handle_cuda()}; + std::unique_ptr> proxy{ + new OprProxy{true}}; + size_t RUNS = 10; + bench.set_proxy(proxy).set_times(RUNS); + + auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, + size_t FH, size_t SH, size_t PH) { + bench.set_dtype(0, dtype::BFloat16()) + .set_dtype(1, dtype::BFloat16()) + .set_dtype(2, dtype::BFloat16()); + param::Convolution param; + param.stride_h = param.stride_w = SH; + param.pad_h = param.pad_w = PH; + param.compute_mode = param::Convolution::ComputeMode::DEFAULT; + bench.set_param(param); + bench.proxy()->target_execution_policy = {}; + TensorLayout src{{N, IC, IH, IW}, dtype::BFloat16()}, + filter{{OC, IC, FH, FH}, dtype::BFloat16()}; + TensorLayout dst; + { + auto&& opr = handle_cuda()->create_operator(); + 
opr->param() = param; + opr->deduce_layout(src, filter, dst); + } + auto used = bench.execl({filter, dst, src}) / RUNS; + float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH; + printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(), + filter.to_string().c_str(), dst.to_string().c_str()); + printf("time_fp32=%.2fms, flops=%.3fTFLOPS\n", used, + (flo / (used * 1e9))); + }; + run(32, 64, 3, 224, 224, 7, 2, 3); + run(32, 128, 128, 28, 28, 3, 1, 1); + run(32, 256, 256, 14, 14, 3, 1, 1); + run(32, 512, 512, 7, 7, 3, 1, 1); + run(32, 64, 64, 56, 56, 3, 1, 1); + run(32, 512, 256, 56, 56, 1, 2, 0); + run(32, 1024, 512, 28, 28, 1, 2, 0); + run(32, 2048, 1024, 14, 14, 1, 2, 0); + run(32, 512, 128, 28, 28, 1, 1, 0); + run(32, 128, 512, 28, 28, 1, 1, 0); + run(32, 1024, 256, 14, 14, 1, 1, 0); + run(32, 256, 1024, 14, 14, 1, 1, 0); + run(32, 2048, 512, 7, 7, 1, 1, 0); + run(32, 512, 2048, 7, 7, 1, 1, 0); + run(32, 256, 64, 56, 56, 1, 1, 0); + run(32, 64, 256, 56, 56, 1, 1, 0); + run(32, 128, 256, 56, 56, 1, 2, 0); + run(32, 256, 512, 28, 28, 1, 2, 0); + run(32, 512, 1024, 14, 14, 1, 2, 0); + run(32, 64, 64, 56, 56, 1, 1, 0); +} + + TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) { CUBenchmarker bench{handle_cuda()}; std::unique_ptr> proxy{ @@ -571,7 +637,7 @@ TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) { param.pad_h = param.pad_w = PH; param.compute_mode = param::Convolution::ComputeMode::DEFAULT; bench.set_param(param); - bench.proxy()->target_algo_info.reset(); + bench.proxy()->target_execution_policy.algo.reset(); TensorLayout src{{N, IC, IH, IW}, dtype::Float32()}, filter{{OC, IC, FH, FH}, dtype::Float32()}; TensorLayout dst; @@ -582,13 +648,13 @@ TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) { } auto time_ms_fp32 = bench.execl({src, dst, filter}) / RUNS; src.dtype = filter.dtype = dst.dtype = dtype::Float16(); - bench.proxy()->target_algo_info.reset(); + bench.proxy()->target_execution_policy.algo.reset(); bench.set_dtype(0, dtype::Float16()) .set_dtype(1, 
dtype::Float16()) .set_dtype(2, dtype::Float16()); auto time_ms_true_fp16 = bench.execl({src, dst, filter}) / RUNS; param.compute_mode = param::Convolution::ComputeMode::FLOAT32; - bench.proxy()->target_algo_info.reset(); + bench.proxy()->target_execution_policy.algo.reset(); bench.set_param(param); auto time_ms_pseudo_fp16 = bench.execl({src, dst, filter}) / RUNS; float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH; @@ -630,8 +696,7 @@ TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) { #undef V #undef V1 - -} // namespace test -} // namespace megdnn +} // namespace test +} // namespace megdnn // vim: syntax=cpp.doxygen diff --git a/dnn/test/cuda/local_share.cpp b/dnn/test/cuda/local_share.cpp index 5c07d36ad..0795e9571 100644 --- a/dnn/test/cuda/local_share.cpp +++ b/dnn/test/cuda/local_share.cpp @@ -778,7 +778,7 @@ TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_BWD_FILTER) { .set_dtype(2, dtype::Float32()) .set_rng(0, &rng) .set_rng(1, &rng); - bencher.proxy()->target_algo_info.reset(); + bencher.proxy()->target_execution_policy.algo.reset(); auto time_in_ms = bencher.execs({src, diff, grad}) / RUNS; printf("src=%s, diff=%s, grad=%s, float32: %.2fms " @@ -856,7 +856,7 @@ TEST_F(CUDA, BENCHMARK_GROUP_LOCAL_SHARE_FORWARD) { .set_dtype(2, dtype::Float32()) .set_rng(0, &rng) .set_rng(1, &rng); - bencher.proxy()->target_algo_info.reset(); + bencher.proxy()->target_execution_policy.algo.reset(); auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS; ; @@ -915,7 +915,7 @@ TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_BWD_DATA) { .set_dtype(2, dtype::Float32()) .set_rng(0, &rng) .set_rng(1, &rng); - bencher.proxy()->target_algo_info.reset(); + bencher.proxy()->target_execution_policy.algo.reset(); auto time_in_ms = bencher.execs({filter, diff, grad}) / RUNS; printf("filter=%s, diff=%s, grad=%s, float32: %.2fms " @@ -1002,11 +1002,11 @@ TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_FORWARD_BOTTLENECK) { .set_dtype(2, dtype::Float32()) .set_rng(0, &rng) .set_rng(1, &rng); - 
bencher.proxy()->target_algo_info.reset(); + bencher.proxy()->target_execution_policy.algo.reset(); auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS; bencher_conv.set_param(conv_param); - bencher_conv.proxy()->target_algo_info.reset(); + bencher_conv.proxy()->target_execution_policy.algo.reset(); auto time_in_ms_conv = bencher_conv.execs({src, {oc, ic, f, f}, {}}) / RUNS; @@ -1094,11 +1094,11 @@ TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_FORWARD_FROM_RESEARCH) { .set_dtype(2, dtype::Float32()) .set_rng(0, &rng) .set_rng(1, &rng); - bencher.proxy()->target_algo_info.reset(); + bencher.proxy()->target_execution_policy.algo.reset(); auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS; bencher_conv.set_param(conv_param); - bencher_conv.proxy()->target_algo_info.reset(); + bencher_conv.proxy()->target_execution_policy.algo.reset(); auto time_in_ms_conv = bencher_conv.execs({src, {oc, ic, f, f}, {}}) / RUNS; @@ -1177,11 +1177,11 @@ TEST_F(CUDA, BENCHMARK_LOCAL_SHARE_FORWARD) { .set_dtype(2, dtype::Float32()) .set_rng(0, &rng) .set_rng(1, &rng); - bencher.proxy()->target_algo_info.reset(); + bencher.proxy()->target_execution_policy.algo.reset(); auto time_in_ms = bencher.execs({src, filter, {}}) / RUNS; bencher_conv.set_param(conv_param); - bencher_conv.proxy()->target_algo_info.reset(); + bencher_conv.proxy()->target_execution_policy.algo.reset(); auto time_in_ms_conv = bencher_conv.execs({src, {oc, ic, f, f}, {}}) / RUNS; diff --git a/dnn/test/gtest_main.cpp b/dnn/test/gtest_main.cpp index 011ed4f95..56616fb62 100644 --- a/dnn/test/gtest_main.cpp +++ b/dnn/test/gtest_main.cpp @@ -10,6 +10,7 @@ */ #include +#include "megdnn/basic_types.h" #include "src/common/utils.h" #include "test/common/random_state.h" @@ -21,9 +22,29 @@ class ResetSeedListener : public ::testing::EmptyTestEventListener { } }; +megdnn::LogLevel min_log_level; + +void set_log_level() { + megdnn::LogLevel level = megdnn::LogLevel::INFO; + auto setting = std::getenv("MEGDNN_LOG_LEVEL"); + if 
(setting) { + if (!strcmp(setting, "INFO")) { + level = megdnn::LogLevel::INFO; + } else if (!strcmp(setting, "DEBUG")) { + level = megdnn::LogLevel::DEBUG; + } else if (!strcmp(setting, "WARN")) { + level = megdnn::LogLevel::WARN; + } else { + megdnn_assert(!strcmp(setting, "ERROR")); + level = megdnn::LogLevel::ERROR; + } + } + min_log_level = level; +} + void log_handler(megdnn::LogLevel level, const char* file, const char* func, int line, const char* fmt, va_list ap) { - if (level < megdnn::LogLevel::ERROR) { + if (level < min_log_level) { return; } char msg[1024]; @@ -39,6 +60,7 @@ void log_handler(megdnn::LogLevel level, const char* file, const char* func, extern "C" int gtest_main(int argc, char** argv) { ::megdnn::set_log_handler(log_handler); + set_log_level(); ResetSeedListener listener; auto&& listeners = ::testing::UnitTest::GetInstance()->listeners(); ::testing::InitGoogleTest(&argc, argv); diff --git a/dnn/test/x86/convolution.cpp b/dnn/test/x86/convolution.cpp index 4631f78c5..2f5a0dc89 100644 --- a/dnn/test/x86/convolution.cpp +++ b/dnn/test/x86/convolution.cpp @@ -450,6 +450,7 @@ TEST_F(X86, BENCHMARK_CONVOLUTION_I8x8x32_MKLDNN) { } } #endif + #endif } // namespace test diff --git a/src/core/test/graph/misc.cpp b/src/core/test/graph/misc.cpp index 4c75e0cb2..e6b722a27 100644 --- a/src/core/test/graph/misc.cpp +++ b/src/core/test/graph/misc.cpp @@ -27,6 +27,7 @@ #include "megbrain/gopt/inference.h" #include "megbrain/test/helper.h" +#include "megdnn/oprs/base.h" #include #include @@ -1924,19 +1925,19 @@ TEST(TestGraph, NaiveRecord2NCHW44) { namespace { template -typename DnnOp::AlgorithmInfo try_find_any_weight_preprocess_algo( +typename megdnn::ExecutionPolicy try_find_any_weight_preprocess_algo( DnnOp* dnn_op, const char* mgb_info, Maybe& found, Args&& ...args) { if (found.valid()) { if (found.val()) { - return dnn_op->execution_policy().algo; + return dnn_op->execution_policy(); } else { return {}; } } for (auto&& algo : 
dnn_op->get_all_algorithms_info( std::forward(args)...)) { - dnn_op->execution_policy().algo = algo; + dnn_op->execution_policy().algo = algo.desc; auto layouts = dnn_op->deduce_preprocessed_filter_layout( std::forward(args)...); if (layouts.empty()) continue; @@ -1949,7 +1950,7 @@ typename DnnOp::AlgorithmInfo try_find_any_weight_preprocess_algo( } if (valid) { found.emplace(true); - return algo; + return {algo.desc, {}}; } } found.emplace(false); @@ -1958,19 +1959,19 @@ typename DnnOp::AlgorithmInfo try_find_any_weight_preprocess_algo( } template -typename DnnOp::AlgorithmInfo try_find_any_bias_preprocess_algo( +typename megdnn::ExecutionPolicy try_find_any_bias_preprocess_algo( DnnOp* dnn_op, const char* mgb_info, Maybe& found, Args&& ...args) { if (found.valid()) { if (found.val()) { - return dnn_op->execution_policy().algo; + return dnn_op->execution_policy(); } else { return {}; } } for (auto&& algo : dnn_op->get_all_algorithms_info( std::forward(args)...)) { - dnn_op->execution_policy().algo = algo; + dnn_op->execution_policy().algo = algo.desc; auto layouts = dnn_op->deduce_preprocessed_filter_layout( std::forward(args)...); if (layouts.size() <= 1) @@ -1981,7 +1982,7 @@ typename DnnOp::AlgorithmInfo try_find_any_bias_preprocess_algo( } if (valid) { found.emplace(true); - return algo; + return {algo.desc, {}}; } } found.emplace(false); diff --git a/src/opr/impl/search_policy/algo_chooser.cpp b/src/opr/impl/search_policy/algo_chooser.cpp index 4860a4d56..c28001a3d 100644 --- a/src/opr/impl/search_policy/algo_chooser.cpp +++ b/src/opr/impl/search_policy/algo_chooser.cpp @@ -11,6 +11,7 @@ */ #include "megbrain/opr/search_policy/algo_chooser.h" +#include "megbrain/opr/internal/megdnn_opr_wrapper.h" #include "megbrain/opr/search_policy/algo_chooser_helper.h" #include "megbrain/opr/search_policy/profiler.h" @@ -21,6 +22,7 @@ //! TODO: here has to be know some megdnn::opr when there is produced midout.h //! fix it if there is another graceful way. 
#include "megdnn/oprs.h" +#include "megdnn/oprs/base.h" #include "midout.h" MIDOUT_DECL(megbrain_opr_algo_chooser) #define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_algo_chooser, __VA_ARGS__) { @@ -29,6 +31,8 @@ MIDOUT_DECL(megbrain_opr_algo_chooser) MIDOUT_END(); using mgb::opr::intl::WorkspaceLimitGetter; +using namespace megdnn; +using namespace mgb; #define APPLY(statement, ...) \ mgb::apply([&](const auto&... args) { return statement; }, \ @@ -37,7 +41,7 @@ using mgb::opr::intl::WorkspaceLimitGetter; // timeout delta to be added with fastest known algorithm for new algos constexpr double TIMEOUT_TOLERANCE = 2; -#define CACHE_KEY_VERSION "v3" +#define CACHE_KEY_VERSION "v4" namespace { template @@ -48,44 +52,191 @@ std::string profile_name(Opr* opr) { ret.append(opr->get_algorithm_set_name()); return ret; } + +template +std::string format_fixlayouts( + const typename opr::AlgoChooser::FixedTensorLayouts& layouts, + size_t arity_in, size_t arity_out) { + std::string ret; + ret.append(": tensor layouts("); + for (size_t i = 0; i < arity_in; ++i) { + if (i) { + ret.append(", "); + } + ret.append(layouts[i].to_string() + " "); + ret.append(layouts[i].dtype.name()); + } + ret.append(") -> ("); + for (size_t i = 0; i < arity_out; ++i) { + if (i) { + ret.append(", "); + } + ret.append(layouts[i + arity_in].to_string() + " "); + ret.append(layouts[i + arity_in].dtype.name()); + } + return ret; +} + +///////////////// OprTypeTrait ///////////////////////////// +template +struct OprFromOprTypeTrait; + +template +struct OprTypeFromOprTrait; + +#define cb(_opr_type, _opr) \ + template <> \ + struct OprFromOprTypeTrait { \ + using Opr = megdnn::_opr; \ + }; \ + template <> \ + struct OprTypeFromOprTrait { \ + constexpr static megdnn::Algorithm::OprType opr_type = \ + megdnn::Algorithm::OprType::_opr_type; \ + } + +cb(MATRIX_MUL_FORWARD, MatrixMulForward); +cb(BATCHED_MATRIX_MUL_FORWARD, BatchedMatrixMulForward); +cb(CONVOLUTION_FORWARD, ConvolutionForward); 
+cb(CONVOLUTION_BACKWARD_DATA, ConvolutionBackwardData); +cb(CONVOLUTION_BACKWARD_FILTER, ConvolutionBackwardFilter); +cb(CONVOLUTION3D_FORWARD, Convolution3DForward); +cb(CONVOLUTION3D_BACKWARD_DATA, Convolution3DBackwardData); +cb(CONVOLUTION3D_BACKWARD_FILTER, Convolution3DBackwardFilter); +cb(LOCAL_SHARE_FORWARD, LocalShareForward); +cb(LOCAL_SHARE_BACKWARD_DATA, LocalShareBackwardData); +cb(LOCAL_SHARE_BACKWARD_FILTER, LocalShareBackwardFilter); +cb(DEFORMABLE_CONV_FORWARD, DeformableConvForward); +cb(DEFORMABLE_CONV_BACKWARD_DATA, DeformableConvBackwardData); +cb(DEFORMABLE_CONV_BACKWARD_FILTER, DeformableConvBackwardFilter); +cb(BATCH_CONV_FORWARD, BatchConvBiasForward); +cb(CONVBIAS_FORWARD, ConvBiasForward); + +#undef cb + +// clang-format off +#define FOREACH_OPR_TYPE_WITH_STMT(cb, stmt) \ + cb(MATRIX_MUL_FORWARD, stmt) \ + cb(BATCHED_MATRIX_MUL_FORWARD, stmt) \ + cb(CONVOLUTION_FORWARD, stmt) \ + cb(CONVOLUTION_BACKWARD_DATA, stmt) \ + cb(CONVOLUTION_BACKWARD_FILTER, stmt) \ + cb(CONVOLUTION3D_FORWARD, stmt) \ + cb(CONVOLUTION3D_BACKWARD_DATA, stmt) \ + cb(CONVOLUTION3D_BACKWARD_FILTER, stmt) \ + cb(LOCAL_SHARE_FORWARD, stmt) \ + cb(LOCAL_SHARE_BACKWARD_DATA, stmt) \ + cb(LOCAL_SHARE_BACKWARD_FILTER, stmt) \ + cb(DEFORMABLE_CONV_FORWARD, stmt) \ + cb(DEFORMABLE_CONV_BACKWARD_DATA, stmt) \ + cb(DEFORMABLE_CONV_BACKWARD_FILTER, stmt) \ + cb(BATCH_CONV_FORWARD, stmt) \ + cb(CONVBIAS_FORWARD, stmt) +// clang-format on + +#define _OPR_TYPE_CASE(_opr_type, _stmt) \ + case Algorithm::OprType::_opr_type: { \ + using _Opr = typename OprFromOprTypeTrait< \ + Algorithm::OprType::_opr_type>::Opr; \ + _stmt; \ + break; \ + } + +#define FOREACH_OPR_TYPE_DISPATCH(_search_items, _stmt) \ + for (size_t _item_idx = 0; _item_idx < _search_items.size(); \ + _item_idx++) { \ + auto&& _item = _search_items[_item_idx]; \ + switch (_item.opr_type) { \ + FOREACH_OPR_TYPE_WITH_STMT(_OPR_TYPE_CASE, _stmt) \ + default: \ + mgb_throw(MegBrainError, "unknown opr_type"); \ + } \ + } + 
+template +TensorLayoutArray to_layout_array( + const typename opr::AlgoChooser::FixedTensorLayouts& layouts) { + TensorLayoutArray ret; + for (auto&& layout : layouts) { + ret.push_back(layout); + } + return ret; } +template +typename opr::AlgoChooser::FixedTensorLayouts to_fixed_layouts( + const TensorLayoutArray& layouts) { + typename opr::AlgoChooser::FixedTensorLayouts ret; + mgb_assert(ret.size() == layouts.size()); + size_t idx = 0; + for (auto&& layout : layouts) { + ret[idx++] = layout; + } + return ret; +} + +} // namespace + namespace mgb { namespace opr { template -AlgoChooserProfileCache::Result AlgoChooser::get_profile_result( - ExeContext& ctx, bool enable_update) { - AlgoChooserProfileCache cache(ctx.mgb_opr()->comp_node(), - profile_name(ctx.megdnn_opr()).c_str()); - - TensorLayoutArray origin_layouts = ctx.layouts(); - typename Opr::Param origin_param = ctx.mgb_opr()->param(); - AlgoChooserProfileCache::Key cache_key{origin_layouts.data(), - origin_layouts.size(), &origin_param, - sizeof(origin_param)}; - { - auto&& rst = cache.get(cache_key); - if (rst.valid()) - return rst.val(); +std::vector +AlgoChooser::flatten_search_space(const ExeContext& ctx) { + std::vector ret; + for (auto algo_info : ctx.get_all_candidates()) { + megdnn::Algorithm* algo = ctx.get_algorithm_from_desc(algo_info.desc); + mgb_assert(algo, "Unknown algo description"); + std::vector&& sub_items = + algo->get_subopr_list(to_layout_array(ctx.layouts()), + ctx.megdnn_opr()); + + FOREACH_OPR_TYPE_DISPATCH(sub_items, { + auto&& megdnn_opr = intl::create_megdnn_opr<_Opr>(ctx.comp_node()); + megdnn_opr->param() = + Algorithm::deserialize_read_pod( + _item.param); + typename AlgoChooser<_Opr>::ExeContext sub_ctx( + to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(), + _item.param, ctx.mgb_opr(), ctx.comp_node(), + ctx.execution_policy(), ctx.allow_weight_preprocess()); + auto space = AlgoChooser<_Opr>::flatten_search_space(sub_ctx); + ret.insert(ret.end(), space.begin(), 
space.end()); + }); } + ret.push_back({OprTypeFromOprTrait::opr_type, ctx.param(), + to_layout_array(ctx.layouts())}); + return ret; +} +template +void AlgoChooser::profile(ExeContext& ctx, bool require_reproducible) { + if (ctx.get_profile_result_from_cache(require_reproducible).valid()) + return; AlgoChooserProfileCache::Result prof_rst; - if (!enable_update) - return prof_rst; std::string str_on_inp_shape = ssprintf( "on input layouts (%s, %s)", ctx.layouts()[0].to_string().c_str(), ctx.layouts()[1].to_string().c_str()); double cur_timeout = 0; + + auto workspace_limit = WorkspaceLimitGetter::get_workspace_limit( + ctx.owner_graph(), ctx.comp_node(), + ctx.execution_policy().workspace_limit); RealTimer timer; - for (auto algo : ctx.get_all_candidates_with_workspace_limit()) { + for (auto algo : ctx.get_all_candidates()) { Maybe cur_rst; std::string msg = ssprintf("profiling %s algorithm %s %s", ctx.mgb_opr()->dyn_typeinfo()->name, algo.name.c_str(), str_on_inp_shape.c_str()); + ImplExecutionPolicy policy; + policy.algo = algo.desc; + ctx.construct_execution_policy_from_cache(require_reproducible, policy); + if (ctx.get_workspace_size_bytes(policy) >= workspace_limit) + continue; + timer.reset(); - MGB_TRY { cur_rst = ctx.profile_single_algo(algo, cur_timeout); } + MGB_TRY { cur_rst = ctx.profile_single_algo(policy, cur_timeout); } MGB_CATCH(std::exception & exc, { mgb_log_warn("caught exception during %s: %s", msg.c_str(), exc.what()); @@ -114,120 +265,100 @@ AlgoChooserProfileCache::Result AlgoChooser::get_profile_result( mgb_assert(!prof_rst.empty(), "no usable convolution algorithm %s", str_on_inp_shape.c_str()); + FixedTensorLayouts origin_layouts = ctx.layouts(); + typename Opr::Param origin_param = ctx.megdnn_opr()->param(); + AlgoChooserProfileCache::Key cache_key{origin_layouts.data(), + origin_layouts.size(), &origin_param, + sizeof(origin_param)}; + + AlgoChooserProfileCache cache(ctx.comp_node(), + profile_name(ctx.megdnn_opr()).c_str()); 
cache.put(cache_key, prof_rst); - return prof_rst; } template -typename AlgoChooser::ImplAlgo AlgoChooser::choose_by_profile( - ExeContext& ctx, bool require_reproducible, bool enable_update) { +typename AlgoChooser::ImplExecutionPolicy +AlgoChooser::choose_by_profile(ExeContext& ctx, bool require_reproducible, + bool enable_update) { MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("AlgoChooser::choose_by_profile"))) - auto opr = ctx.mgb_opr(); - if (opr->owner_graph()->options().no_profiling_on_shape_change) { - auto algo = ctx.megdnn_opr()->execution_policy().algo; - if (algo.valid()) - return algo; + if (ctx.owner_graph()->options().no_profiling_on_shape_change) { + auto policy = ctx.megdnn_opr()->execution_policy(); + if (policy.algo.valid()) + return policy; } - std::unordered_map algo_map; - for (auto i : ctx.get_all_candidates()) { - auto ins = algo_map.emplace(i.name.c_str(), i); - mgb_assert(ins.second, "duplicated algo name: %s", i.name.c_str()); + if (enable_update) { + auto&& search_items = flatten_search_space(ctx); + FOREACH_OPR_TYPE_DISPATCH(search_items, { + auto&& megdnn_opr = intl::create_megdnn_opr<_Opr>(ctx.comp_node()); + megdnn_opr->param() = + Algorithm::deserialize_read_pod( + _item.param); + typename AlgoChooser<_Opr>::ExeContext sub_ctx( + to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(), + _item.param, ctx.mgb_opr(), ctx.comp_node(), + ctx.execution_policy(), ctx.allow_weight_preprocess()); + AlgoChooser<_Opr>::profile(sub_ctx, require_reproducible); + }); } - - auto&& prof = get_profile_result(ctx, enable_update); - if (prof.empty()) - return {}; - for (auto&& i : prof) { - if ((!require_reproducible || i.reproducible)) { - auto iter = algo_map.find(i.algo); - mgb_assert(iter != algo_map.end(), - "algorithm %s exists in " - "profiling result but not in algo_map; please " - "report this " - "bug; opr: %s{%s}, shapes: %s %s %s", - i.algo.c_str(), - ctx.mgb_opr()->cname(), - ctx.mgb_opr()->dyn_typeinfo()->name, - 
ctx.layouts()[0].TensorShape::to_string().c_str(), - ctx.layouts()[1].TensorShape::to_string().c_str(), - ctx.layouts()[2].TensorShape::to_string().c_str()); - return iter->second; - } - } - - mgb_log_error( - "Workspace requirement (%zu) could not be satisfied. Abort now " - "to " - "avoid further problems", - WorkspaceLimitGetter::get_workspace_limit( - opr->owner_graph(), opr->comp_node(), - opr->execution_policy().workspace_limit)); - mgb_trap(); + typename AlgoChooser::ImplExecutionPolicy policy; + ctx.construct_execution_policy_from_cache(require_reproducible, policy); + return policy; MIDOUT_E } template -size_t AlgoChooser::setup_algo(const TensorLayoutArray& layouts, +size_t AlgoChooser::setup_algo(const FixedTensorLayouts& layouts, Opr* megdnn_opr, const MGBOpr* mgb_opr, bool allow_weight_preprocess) { if (WorkspaceLimitGetter::is_prealloc_run(mgb_opr->owner_graph())) { return 0; } - ImplAlgo algo = {}; - ExeContext ctx(layouts, megdnn_opr, mgb_opr, allow_weight_preprocess); + std::string param_str; + Algorithm::serialize_write_pod(megdnn_opr->param(), param_str); + ExeContext ctx(layouts, megdnn_opr, param_str, mgb_opr, + mgb_opr->comp_node(), mgb_opr->execution_policy(), + allow_weight_preprocess); + ImplExecutionPolicy policy; if (auto algo_choose_hook = mgb_opr->algo_chooser()) { - algo = algo_choose_hook(mgb_opr); + policy = algo_choose_hook(mgb_opr); } - if (!algo.valid()) { - algo = get_algo(ctx); + if (!policy.algo.valid()) { + policy = get_policy(ctx); } - size_t workspace = ctx.get_workspace_size_bytes(algo); + size_t workspace = ctx.get_workspace_size_bytes(policy); std::string ret; ret.append(mgb_opr->dyn_typeinfo()->name); - ret.append(": tensor layouts("); - for (size_t i = 0; i < arity_in; ++i) { - if (i) { - ret.append(", "); - } - ret.append(layouts[i].to_string() + " "); - ret.append(layouts[i].dtype.name()); - } - ret.append(") -> ("); - for (size_t i = 0; i < arity_out; ++i) { - if (i) { - ret.append(", "); - } - ret.append(layouts[i + 
arity_in].to_string() + " "); - ret.append(layouts[i + arity_in].dtype.name()); - } - ret.append("): algo=" + algo.name); + ret += format_fixlayouts(layouts, arity_in, arity_out); + Algorithm* palgo = megdnn_opr->get_algorithm_from_desc(policy.algo); + mgb_assert(palgo, "Unknown algo description"); + ret.append("): algo=" + std::string(palgo->name())); ret.append(ssprintf(" workspace=%.2fMiB reproducible=%d", - workspace / (1024 * 1024.0), algo.is_reproducible)); + workspace / (1024 * 1024.0), palgo->is_reproducible())); mgb_log_debug("%s", ret.c_str()); - megdnn_opr->execution_policy() = {algo}; + megdnn_opr->execution_policy() = policy; return workspace; } template -typename AlgoChooser::ImplAlgo AlgoChooser::get_algo( +typename AlgoChooser::ImplExecutionPolicy AlgoChooser::get_policy( ExeContext& ctx) { using S = mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; MGB_MARK_USED_VAR(TIMEOUT_TOLERANCE); - switch (ctx.mgb_opr()->execution_policy().strategy) { + switch (ctx.execution_policy().strategy) { case S::HEURISTIC: return ctx.choose_by_heuristic(); case S::HEURISTIC_REPRODUCIBLE: return ctx.choose_by_heuristic(true); case S::PROFILE_HEURISTIC: { - ImplAlgo algo = choose_by_profile(ctx, false, false); - if (!algo.valid()) - algo = ctx.choose_by_heuristic(); - return algo; + ImplExecutionPolicy policy = choose_by_profile(ctx, false, false); + if (!policy.algo.valid()) + policy = ctx.choose_by_heuristic(); + return policy; } #if MGB_ENABLE_FASTRUN case S::PROFILE: @@ -241,16 +372,17 @@ typename AlgoChooser::ImplAlgo AlgoChooser::get_algo( } #define INST(Opr) \ - template AlgoChooser::ImplAlgo \ - AlgoChooser::get_algo(ExeContext& ctx); \ - template AlgoChooserProfileCache::Result \ - AlgoChooser::get_profile_result(ExeContext& ctx, \ - bool enable_update); \ - template AlgoChooser::ImplAlgo \ + template AlgoChooser::ImplExecutionPolicy \ + AlgoChooser::get_policy(ExeContext& ctx); \ + template void AlgoChooser::profile( \ + ExeContext& ctx, bool 
require_reproducible); \ + template std::vector \ + AlgoChooser::flatten_search_space(const ExeContext& ctx); \ + template AlgoChooser::ImplExecutionPolicy \ AlgoChooser::choose_by_profile( \ ExeContext& ctx, bool require_reproducible, bool enable_update); \ template size_t AlgoChooser::setup_algo( \ - const TensorLayoutArray& layouts, megdnn::Opr* megdnn_opr, \ + const FixedTensorLayouts& layouts, megdnn::Opr* megdnn_opr, \ const MGBOpr* mgb_opr, bool allow_weight_preprocess); MGB_FOREACH_FASTRUN_OPR(INST) @@ -258,17 +390,109 @@ MGB_FOREACH_FASTRUN_OPR(INST) #undef INST //////////////////////////////// ExeContext ///////////////////////////// +template +AlgoChooser::ExeContext::ExeContext( + const FixedTensorLayouts& layouts, Opr* megdnn_opr, + const std::string& param_str, const cg::OperatorNodeBase* mgb_opr, + const CompNode& cn, + const megdnn::param::ExecutionPolicy& execution_policy, + bool allow_weight_preprocess) + : m_layouts{layouts}, + m_megdnn_opr{megdnn_opr}, + m_param{param_str}, + m_base_mgb_opr{mgb_opr}, + m_cn{cn}, + m_execution_policy{execution_policy}, + m_allow_weight_preprocess{allow_weight_preprocess} { + mgb_assert(m_layouts.size() == layouts.size()); + static_assert(std::tuple_size::value == 3 || + std::tuple_size::value == 5 || + std::tuple_size::value == 8, + "Convolution AlgoChooser assumes arity = 3 , 5 or 8 (for " + "deformable conv)"); +} template typename AlgoChooser::ImplAlgo +AlgoChooser::ExeContext::get_profile_result_from_cache( + bool require_reproducible) const { + MIDOUT_B(Opr, + midout_iv(MGB_HASH_STR( + "AlgoChooser::ExeContext::get_profile_result_from_cache"))) + AlgoChooserProfileCache cache(m_cn, + profile_name(m_megdnn_opr).c_str()); + + typename Opr::Param origin_param = m_megdnn_opr->param(); + AlgoChooserProfileCache::Key cache_key{m_layouts.data(), m_layouts.size(), + &origin_param, sizeof(origin_param)}; + auto&& rst = cache.get(cache_key); + if (!rst.valid()) + return {}; + + auto&& prof = rst.val(); + 
std::unordered_map algo_map; + for (auto i : get_all_candidates()) { + auto ins = algo_map.emplace(i.name.c_str(), i); + mgb_assert(ins.second, "duplicated algo name: %s", i.name.c_str()); + } + + if (prof.empty()) + return {}; + for (auto&& i : prof) { + if ((!require_reproducible || i.reproducible)) { + auto iter = algo_map.find(i.algo); + mgb_assert(iter != algo_map.end(), + "algorithm %s exists in " + "profiling result but not in algo_map; please " + "report this " + "bug; opr: %s{%s}, layouts: %s ", + i.algo.c_str(), m_base_mgb_opr->cname(), + m_base_mgb_opr->dyn_typeinfo()->name, + format_fixlayouts(m_layouts, arity_in, arity_out) + .c_str()); + return iter->second; + } + } + + mgb_log_error( + "Workspace requirement (%zu) could not be satisfied. Abort now " + "to " + "avoid further problems", + WorkspaceLimitGetter::get_workspace_limit( + m_base_mgb_opr->owner_graph(), m_cn, + m_execution_policy.workspace_limit)); + mgb_trap(); + MIDOUT_E +} + +template +typename AlgoChooser::ImplExecutionPolicy AlgoChooser::ExeContext::choose_by_heuristic(bool reproducible) const { - auto opr = m_mgb_opr; auto workspace_limit = WorkspaceLimitGetter::get_workspace_limit( - opr->owner_graph(), opr->comp_node(), - opr->execution_policy().workspace_limit); - return APPLY(m_megdnn_opr->get_algorithm_info_heuristic( - args..., workspace_limit, reproducible), - m_layouts); + owner_graph(), m_cn, m_execution_policy.workspace_limit); + ImplExecutionPolicy policy; + policy.algo = APPLY(m_megdnn_opr->get_algorithm_info_heuristic( + args..., workspace_limit, reproducible), + m_layouts).desc; + + Algorithm* algo = m_megdnn_opr->get_algorithm_from_desc(policy.algo); + mgb_assert(algo, "Unknown algo description"); + std::vector&& sub_items = algo->get_subopr_list( + to_layout_array(m_layouts), m_megdnn_opr); + + FOREACH_OPR_TYPE_DISPATCH(sub_items, { + auto&& megdnn_opr = intl::create_megdnn_opr<_Opr>(m_cn); + megdnn_opr->param() = + Algorithm::deserialize_read_pod( + _item.param); + 
typename AlgoChooser<_Opr>::ExeContext sub_ctx( + to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(), + _item.param, m_base_mgb_opr, m_cn, m_execution_policy, + m_allow_weight_preprocess); + policy.sub_policy.push_back(sub_ctx.choose_by_heuristic(reproducible)); + }); + + return policy; } template @@ -279,40 +503,58 @@ AlgoChooser::ExeContext::get_all_candidates() const { APPLY(m_megdnn_opr->get_all_algorithms_info(args...), m_layouts); bool found = false; for (size_t i = 0; i < ret.size(); ++i) { - if (ret[i] == heu) { + if (ret[i].desc == heu.algo) { found = true; std::swap(ret[i], ret[0]); break; } } + + Algorithm* palgo = m_megdnn_opr->get_algorithm_from_desc(heu.algo); + mgb_assert(palgo, "Unknown algo description"); mgb_assert(found, "algo %s got by heuristic not found in " "candidate list", - heu.name.c_str()); + palgo->name()); return std::move(ret); } template -std::vector::ImplAlgo> -AlgoChooser::ExeContext::get_all_candidates_with_workspace_limit() const { - auto&& all_algos = get_all_candidates(); - auto opr = m_mgb_opr; - auto workspace_limit = WorkspaceLimitGetter::get_workspace_limit( - opr->owner_graph(), opr->comp_node(), - opr->execution_policy().workspace_limit); - std::vector ret; - for (auto&& algo : all_algos) { - if (get_workspace_size_bytes(algo) <= workspace_limit) { - ret.push_back(algo); - } +void AlgoChooser::ExeContext::construct_execution_policy_from_cache( + bool require_reproducible, + typename AlgoChooser::ImplExecutionPolicy& policy) const { + if (!policy.algo.valid()) { + policy.algo = get_profile_result_from_cache(require_reproducible).desc; + mgb_assert(policy.algo.valid(), + "No cache found, maybe some error occured"); } - return ret; + + Algorithm* algo = m_megdnn_opr->get_algorithm_from_desc(policy.algo); + mgb_assert(algo, "Unknown algo description"); + std::vector&& sub_items = algo->get_subopr_list( + to_layout_array(m_layouts), m_megdnn_opr); + + FOREACH_OPR_TYPE_DISPATCH(sub_items, { + auto&& megdnn_opr = 
intl::create_megdnn_opr<_Opr>(m_cn); + megdnn_opr->param() = + Algorithm::deserialize_read_pod( + _item.param); + typename AlgoChooser<_Opr>::ExeContext sub_ctx( + to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(), + _item.param, m_base_mgb_opr, m_cn, m_execution_policy, + m_allow_weight_preprocess); + policy.sub_policy.push_back({}); + sub_ctx.construct_execution_policy_from_cache(require_reproducible, + policy.sub_policy.back()); + }); + + return; } template size_t AlgoChooser::ExeContext::get_workspace_size_bytes( - ImplAlgo algo) const { - m_megdnn_opr->execution_policy() = {algo}; + const ImplExecutionPolicy& policy) const { + m_megdnn_opr->execution_policy() = policy; size_t result; if_constexpr()>( [&](auto _) { @@ -336,17 +578,13 @@ size_t AlgoChooser::ExeContext::get_workspace_size_bytes( template Maybe -AlgoChooser::ExeContext::profile_single_algo(ImplAlgo algo, - double& timeout) const { +AlgoChooser::ExeContext::profile_single_algo( + const ImplExecutionPolicy& policy, double& timeout) const { typename TimedProfiler::Param param; - auto name = algo.name.c_str(); // force check copy size <= dest len-1 from gcc8 for safe - auto len = sizeof(param.algo_name); - strncpy(param.algo_name, name, len - 1); - param.algo_name[len - 1] = '\0'; - mgb_assert(!param.algo_name[sizeof(param.algo_name) - 2], - "algo name too long: %s; len=%zu", name, strlen(name)); - param.workspace = get_workspace_size_bytes(algo); + param.execution_policy = + TimedProfiler::Param::ExecutionPolicyBlob::serialize(policy); + param.workspace = get_workspace_size_bytes(policy); for (int i = 0; i < arity; ++i) { auto&& src = m_layouts[i]; mgb_assert(src.format.is_default() && @@ -357,23 +595,25 @@ AlgoChooser::ExeContext::profile_single_algo(ImplAlgo algo, src.to_string().c_str()); param.dtypes[i] = src.dtype.enumv(); } - param.comp_node_loc = m_mgb_opr->output(0)->comp_node().locator(); + param.comp_node_loc = m_cn.locator(); mgb_assert(param.shapes.size() == m_layouts.size()); for 
(size_t i = 0; i < param.shapes.size(); ++i) param.shapes[i] = m_layouts[i]; param.opr_param = m_megdnn_opr->param(); param.allow_weight_preprocess = m_allow_weight_preprocess; + Algorithm* palgo = m_megdnn_opr->get_algorithm_from_desc(policy.algo); + mgb_assert(palgo, "Unknown algo description"); auto rst = TimedProfiler::profile(param, timeout); // MIOpen conv profiles all available algos when a specfic shape is // provided for the first time, which probably adds to the result time. // Therefore, a second profile execution is needed. - if (strncmp(name, "MIOpen", 6) == 0) + if (strncmp(palgo->name(), "MIOpen", 6) == 0) rst = TimedProfiler::profile(param, timeout); if (!rst.valid()) return None; return AlgoChooserProfileCache::ResultEntry{ - algo.name.c_str(), algo.is_reproducible, rst.val().time, + palgo->name(), palgo->is_reproducible(), rst.val().time, param.workspace}; } @@ -414,21 +654,34 @@ AlgoChooser::ExeContext::construct_fake_preprocess_filter() const { } #define INST(Opr) \ - template typename AlgoChooser::ImplAlgo \ + template AlgoChooser::ExeContext::ExeContext( \ + const FixedTensorLayouts& layouts, megdnn::Opr* megdnn_opr, \ + const std::string& param_str, const cg::OperatorNodeBase* mgb_opr, \ + const CompNode& cn, \ + const megdnn::param::ExecutionPolicy& execution_policy, \ + bool allow_weight_preprocess); \ + template typename AlgoChooser::ImplExecutionPolicy \ AlgoChooser::ExeContext::choose_by_heuristic( \ bool reproducible) const; \ + template typename AlgoChooser::ImplAlgo \ + AlgoChooser::ExeContext::get_profile_result_from_cache( \ + bool require_reproducible) const; \ template std::vector::ImplAlgo> \ AlgoChooser::ExeContext::get_all_candidates() const; \ - template std::vector::ImplAlgo> \ - AlgoChooser::ExeContext:: \ - get_all_candidates_with_workspace_limit() const; \ template size_t \ AlgoChooser::ExeContext::get_workspace_size_bytes( \ - typename AlgoChooser::ImplAlgo algo) const; \ + const typename AlgoChooser::ImplExecutionPolicy& 
\ + policy) const; \ + template void AlgoChooser::ExeContext:: \ + construct_execution_policy_from_cache( \ + bool require_reproducible, \ + typename AlgoChooser::ImplExecutionPolicy& \ + policy) const; \ template Maybe \ AlgoChooser::ExeContext::profile_single_algo( \ - typename AlgoChooser::ImplAlgo algo, double& timeout) \ - const; \ + const typename AlgoChooser::ImplExecutionPolicy& \ + policy, \ + double& timeout) const; MGB_FOREACH_FASTRUN_OPR(INST) diff --git a/src/opr/impl/search_policy/profiler.cpp b/src/opr/impl/search_policy/profiler.cpp index 2d25eab71..3073d4d39 100644 --- a/src/opr/impl/search_policy/profiler.cpp +++ b/src/opr/impl/search_policy/profiler.cpp @@ -14,6 +14,8 @@ #include "../internal/invoke.h" #include "../internal/megdnn_opr_wrapper.inl" +#include "megdnn/handle.h" +#include "megdnn/oprs/base.h" #if MGB_ROCM #include "hcc_detail/hcc_defs_prologue.h" @@ -32,12 +34,96 @@ MIDOUT_DECL(megbrain_opr_profile) } \ MIDOUT_END(); +namespace { +std::string serialize_policy(const megdnn::ExecutionPolicy& policy) { + std::string ret; + //! serialize AlgorithmDesc + megdnn::Algorithm::serialize_write_pod(policy.algo.handle_type, ret); + megdnn::Algorithm::serialize_write_pod(policy.algo.type, ret); + uint32_t param_size = policy.algo.param.size(); + megdnn::Algorithm::serialize_write_pod(param_size, ret); + ret += policy.algo.param; + + //! 
serialize sub_policy + uint32_t size = policy.sub_policy.size(); + megdnn::Algorithm::serialize_write_pod(size, ret); + for (auto&& sub : policy.sub_policy) { + ret += serialize_policy(sub); + } + return ret; +} + +megdnn::ExecutionPolicy deserialize_policy(const char* buf, uint32_t size, + uint32_t& offset) { + megdnn::ExecutionPolicy ret; +#define cb(_val, _type) \ + _val = megdnn::Algorithm::deserialize_read_pod<_type>(buf, offset); \ + offset += sizeof(_val) + + cb(ret.algo.handle_type, megdnn::Handle::HandleType); + cb(ret.algo.type, uint32_t); + + uint32_t param_size = 0; + cb(param_size, uint32_t); + if (param_size > 0) { + ret.algo.param = std::string(buf + offset, param_size); + offset += param_size; + } + + uint32_t nr_policy = 0; + cb(nr_policy, uint32_t); +#undef cb + + for (uint32_t i = 0; i < nr_policy; i++) { + ret.sub_policy.push_back(deserialize_policy(buf, size, offset)); + } + return ret; +} +} + namespace mgb { namespace opr { #define APPLY(statement, ...) \ mgb::apply([&](const auto&... 
args) { return statement; }, \ std::tuple_cat(__VA_ARGS__)) +////////////// TimedProfiler::Param::ExecutionPolicyBlob ////////////////////// + +template +typename TimedProfiler::Param::ExecutionPolicyBlob +TimedProfiler::Param::ExecutionPolicyBlob::serialize( + const megdnn::ExecutionPolicy& policy) { + ExecutionPolicyBlob ret; + std::string serialize_bin = serialize_policy(policy); + mgb_assert(serialize_bin.size() < MAX_SIZE_IN_BYTES); + memcpy(ret.data, serialize_bin.data(), serialize_bin.size()); + ret.size = serialize_bin.size(); + return ret; +} + +template +megdnn::ExecutionPolicy +TimedProfiler::Param::ExecutionPolicyBlob::deserialize() const { + uint32_t offset = 0; + auto&& ret = deserialize_policy(data, size, offset); + mgb_assert(offset == size); + return std::move(ret); +} + +#define INST(Opr) \ + template typename TimedProfiler::Param::ExecutionPolicyBlob \ + TimedProfiler::Param::ExecutionPolicyBlob::serialize( \ + const megdnn::ExecutionPolicy& policy); \ + template megdnn::ExecutionPolicy \ + TimedProfiler::Param::ExecutionPolicyBlob::deserialize() \ + const; + +MGB_FOREACH_FASTRUN_OPR(INST) +#undef INST + + +////////////////// TimedProfiler ////////////////////////////// + template const double TimedProfiler::timeout_setting = TimedProfiler::init_timeout_setting(); @@ -99,18 +185,7 @@ typename TimedProfiler::TResult TimedProfiler::prof_impl( } megdnn_opr->param() = param.opr_param; - { - typename Opr::AlgorithmInfo algo; - for (auto i : - APPLY(megdnn_opr->get_all_algorithms_info(args...), layouts)) { - if (!strcmp(i.name.c_str(), param.algo_name)) { - algo = i; - break; - } - } - mgb_assert(algo.valid(), "algorithm %s not found", param.algo_name); - megdnn_opr->execution_policy() = {algo}; - } + megdnn_opr->execution_policy() = param.execution_policy.deserialize(); // Allocate preprocessed weight buffers. 
TensorLayoutArray preprocessed_layout; @@ -222,13 +297,16 @@ typename TimedProfiler::TResult TimedProfiler::prof_impl( }); ev_end->record(); + megdnn::Algorithm* algo = megdnn_opr->get_algorithm_from_desc( + megdnn_opr->execution_policy().algo); + mgb_assert(algo); double next_report_time = 0.5; while (!ev_end->finished()) { if (timer.get_secs() >= next_report_time) { mgb_log_warn( "profiling conv algo %s already took %.3f/%.3f secs" " (limit can be set by MGB_CONV_PROFILING_TIMEOUT) ", - param.algo_name, timer.get_secs(), param.actual_timeout); + algo->name(), timer.get_secs(), param.actual_timeout); next_report_time = timer.get_secs() + 1; } using namespace std::literals; diff --git a/src/opr/include/megbrain/opr/blas.h b/src/opr/include/megbrain/opr/blas.h index 12227cdcd..93f9049ae 100644 --- a/src/opr/include/megbrain/opr/blas.h +++ b/src/opr/include/megbrain/opr/blas.h @@ -46,7 +46,7 @@ private: static bool check_layout(const TensorLayout& layout, int transpose); //! store the policy of all transpose situations - megdnn::MatrixMul::ExecutionPolicy m_cadidate_execution_policies[4]; + megdnn::ExecutionPolicy m_cadidate_execution_policies[4]; }; /*! @@ -76,7 +76,7 @@ private: static bool check_layout(const TensorLayout& layout, bool transpose); //! store the policy of all transpose situations - megdnn::BatchedMatrixMul::ExecutionPolicy m_cadidate_execution_policies[4]; + megdnn::ExecutionPolicy m_cadidate_execution_policies[4]; }; /*! 
diff --git a/src/opr/include/megbrain/opr/search_policy/algo_chooser.h b/src/opr/include/megbrain/opr/search_policy/algo_chooser.h index d4ffbbd17..eb4f390b3 100644 --- a/src/opr/include/megbrain/opr/search_policy/algo_chooser.h +++ b/src/opr/include/megbrain/opr/search_policy/algo_chooser.h @@ -12,9 +12,14 @@ #pragma once +#include +#include "megbrain/graph/cg.h" +#include "megbrain/graph/operator_node.h" +#include "megbrain/opr/search_policy/algo_chooser_helper.h" #include "megbrain/opr/search_policy/profiler.h" #include "megbrain/opr/dnn/convolution.h" #include "megbrain/opr/blas.h" +#include "megdnn/oprs/base.h" template struct MegDNNOpr2MGBOpr; @@ -49,52 +54,64 @@ class AlgoChooser { static constexpr int arity = OprArityTrait::arity; using ImplAlgo = typename Opr::AlgorithmInfo; + using ImplExecutionPolicy = megdnn::ExecutionPolicy; using MGBOpr = typename MegDNNOpr2MGBOpr::MGBOpr; - using TensorLayoutArray = std::array; +public: + using FixedTensorLayouts = std::array; class ExeContext { - const TensorLayoutArray& m_layouts; + FixedTensorLayouts m_layouts; Opr* m_megdnn_opr; - const MGBOpr* m_mgb_opr; + std::string m_param; + const cg::OperatorNodeBase* m_base_mgb_opr; + CompNode m_cn; + megdnn::param::ExecutionPolicy m_execution_policy; bool m_allow_weight_preprocess; public: - ExeContext(const TensorLayoutArray& layouts, Opr* megdnn_opr, - const MGBOpr* mgb_opr, bool allow_weight_preprocess) - : m_layouts{layouts}, - m_megdnn_opr{megdnn_opr}, - m_mgb_opr{mgb_opr}, - m_allow_weight_preprocess{allow_weight_preprocess} { - mgb_assert(m_layouts.size() == layouts.size()); - static_assert( - std::tuple_size::value == 3 || - std::tuple_size::value == 5 || - std::tuple_size::value == 8, - "Convolution AlgoChooser assumes arity = 3 , 5 or 8 (for " - "deformable conv)"); - } + ExeContext(const FixedTensorLayouts& layouts, Opr* megdnn_opr, + const std::string& param_str, + const cg::OperatorNodeBase* mgb_opr, const CompNode& cn, + const megdnn::param::ExecutionPolicy& 
execution_policy, + bool allow_weight_preprocess); Opr* megdnn_opr() const { return m_megdnn_opr; } - const MGBOpr* mgb_opr() const { return m_mgb_opr; } - const TensorLayout& inp_layout(size_t idx) const { return m_layouts[idx]; } - const TensorLayoutArray& layouts() const { return m_layouts; } + cg::ComputingGraph* owner_graph() const { + return m_base_mgb_opr->owner_graph(); + } + const cg::OperatorNodeBase* mgb_opr() const { return m_base_mgb_opr; } + const megdnn::param::ExecutionPolicy& execution_policy() const { + return m_execution_policy; + } + CompNode comp_node() const { return m_cn; } + const std::string& param() const { return m_param; } + + bool allow_weight_preprocess() const { + return m_allow_weight_preprocess; + } + + megdnn::Algorithm* get_algorithm_from_desc( + const megdnn::Algorithm::Info::Desc& desc) const { + return m_megdnn_opr->get_algorithm_from_desc(desc); + } + + const FixedTensorLayouts& layouts() const { return m_layouts; } - ImplAlgo choose_by_heuristic(bool reproducible = false) const; + ImplExecutionPolicy choose_by_heuristic( + bool reproducible = false) const; //! get all candidate algos, and the one choose_by_heuristic() is //! put first std::vector get_all_candidates() const; - //! get candidate algos with workspace limit. - std::vector get_all_candidates_with_workspace_limit() const; - - //! get workspace size required for specific algo - size_t get_workspace_size_bytes(ImplAlgo algo) const; + //! get workspace size required for specific execution policy + size_t get_workspace_size_bytes( + const ImplExecutionPolicy& policy) const; /*! * \brief profile a single algorithm @@ -106,28 +123,59 @@ class AlgoChooser { * timeout used during profiling */ Maybe profile_single_algo( - ImplAlgo algo, double& timeout) const; + const ImplExecutionPolicy& policy, double& timeout) const; + + //! 
get all profile algorithm from cache, return invalid if not exists + ImplAlgo get_profile_result_from_cache(bool require_reproducible) const; + + /** + * \brief construct execution policy from cache. + * + * \param require_reproducible select algo which is reproducible + * \param policy execution policy + */ + void construct_execution_policy_from_cache( + bool require_reproducible, ImplExecutionPolicy& policy) const; private: Maybe> construct_fake_preprocess_filter() const; }; - //! entrance for getting algorithm according to execution strategy - static ImplAlgo get_algo(ExeContext& ctx); - - //! get all profile result, either by retrieving cache or profiling - static AlgoChooserProfileCache::Result get_profile_result( - ExeContext& ctx, bool enable_update); + template + friend class AlgoChooser; - static ImplAlgo choose_by_profile(ExeContext& ctx, - bool require_reproducible, - bool enable_update = true); +private: + //! entrance for getting algorithm according to execution strategy + static ImplExecutionPolicy get_policy(ExeContext& ctx); + + + //! profile and save to cache + static void profile(ExeContext& ctx, bool require_reproducible); + + static ImplExecutionPolicy choose_by_profile(ExeContext& ctx, + bool require_reproducible, + bool enable_update = true); + + /** + * flatten search space in postorder traversal + * The subopr search construct a search tree + * + * A + * / \ + * B1B2 C + * / \ + * D1D2D3 E + * We use postorder traverse the search tree. + * D1 -> D2 -> D3 -> E -> B1 -> B2 -> C -> A + */ + static std::vector flatten_search_space( + const ExeContext& ctx); public: /*! 
* \brief setup algorithm and return workspace size */ - static size_t setup_algo(const TensorLayoutArray& layouts, Opr* megdnn_opr, + static size_t setup_algo(const FixedTensorLayouts& layouts, Opr* megdnn_opr, const MGBOpr* mgb_opr, bool allow_weight_preprocess = false); }; diff --git a/src/opr/include/megbrain/opr/search_policy/algo_chooser_helper.h b/src/opr/include/megbrain/opr/search_policy/algo_chooser_helper.h index 8c69ab35d..bea97d101 100644 --- a/src/opr/include/megbrain/opr/search_policy/algo_chooser_helper.h +++ b/src/opr/include/megbrain/opr/search_policy/algo_chooser_helper.h @@ -28,9 +28,9 @@ namespace mixin { class AlgoChooserHelper : cg::OperatorNodeMixinBase { public: using ExecutionPolicy = megdnn::param::ExecutionPolicy; - using AlgorithmInfo = megdnn::detail::Algorithm::Info; + using AlgorithmPolicy = megdnn::ExecutionPolicy; using AlgoChooserHook = - std::function; + std::function; const ExecutionPolicy& execution_policy() const { if (!m_policy_accessed) { diff --git a/src/opr/include/megbrain/opr/search_policy/profiler.h b/src/opr/include/megbrain/opr/search_policy/profiler.h index 02d076326..da91abab8 100644 --- a/src/opr/include/megbrain/opr/search_policy/profiler.h +++ b/src/opr/include/megbrain/opr/search_policy/profiler.h @@ -18,6 +18,7 @@ #include "megbrain/comp_node.h" #include "megdnn/basic_types.h" +#include "megdnn/oprs/base.h" #include "megdnn/oprs/linalg.h" #include "megdnn/oprs/nn.h" @@ -139,7 +140,17 @@ class TimedProfiler { public: struct Param { - char algo_name[128]; + struct ExecutionPolicyBlob { + //! 
enlarge the max size if needed + constexpr static size_t MAX_SIZE_IN_BYTES = 10240; + char data[MAX_SIZE_IN_BYTES]; + uint32_t size; + + static ExecutionPolicyBlob serialize( + const megdnn::ExecutionPolicy& policy); + megdnn::ExecutionPolicy deserialize() const; + }; + ExecutionPolicyBlob execution_policy; size_t workspace; megdnn::DTypeEnum dtypes[arity]; CompNode::Locator comp_node_loc; diff --git a/src/opr/test/dnn/convolution.cpp b/src/opr/test/dnn/convolution.cpp index 9d8ad5cd2..6583dc09c 100644 --- a/src/opr/test/dnn/convolution.cpp +++ b/src/opr/test/dnn/convolution.cpp @@ -20,11 +20,13 @@ #include "megbrain/opr/basic_arith.h" #include "megbrain/gopt/inference.h" #include "megbrain/opr/tensor_manip.h" +#include "megdnn/dtype.h" #include "megdnn/oprs/base.h" #include #include +#include #include using namespace mgb; @@ -37,6 +39,73 @@ using Mode = Param::Mode; Mode modes_to_check[] = {Mode::CONVOLUTION, Mode::CROSS_CORRELATION}; +void conv_bwd_data_brute(const std::vector>& inps, + std::shared_ptr& dest, + const opr::ConvolutionBackwardData::Param& param) { + mgb_assert(param.format == Param::Format::NCHW); + auto &&data = *inps[0], &&filter = *inps[1]; + size_t N = data.shape(0), IH = data.shape(2), IW = data.shape(3); + size_t GROUP, ICPG, OCPG, FH, FW; + + if (param.sparse == Param::Sparse::DENSE) { + GROUP = 1, ICPG = filter.shape(0), OCPG = filter.shape(1), + FH = filter.shape(2), FW = filter.shape(3); + } else { + mgb_assert(param.sparse == Param::Sparse::GROUP); + GROUP = filter.shape(0), ICPG = filter.shape(1), OCPG = filter.shape(2), + FH = filter.shape(3), FW = filter.shape(4); + } + auto get_shp = [](size_t inp, size_t filter, size_t stride, size_t pad, + size_t dilate) { + return (inp - 1) * stride + (filter - 1) * dilate + 1 - pad * 2; + }; + size_t OH = get_shp(IH, FH, param.stride_h, param.pad_h, param.dilate_h), + OW = get_shp(IW, FW, param.stride_w, param.pad_w, param.dilate_w); + dest = std::make_shared(CompNode::load("xpu0"), + 
TensorShape{N, OCPG * GROUP, OH, OW}); + auto&& out = *dest; + auto fptr = filter.ptr(), dptr = data.ptr(), + optr = out.ptr(); + memset(optr, 0, sizeof(float) * out.shape().total_nr_elems()); + auto ol = out.layout(), fl = filter.layout(); + +#define FOR2(a, A, b, B) \ + for (size_t a = 0; a < A; ++a) \ + for (size_t b = 0; b < B; ++b) +#define FOR3(a, A, b, B, c, C) \ + FOR2(a, A, b, B) \ + for (size_t c = 0; c < C; ++c) + + FOR3(n, N, group, GROUP, icg, ICPG) + FOR2(ih, IH, iw, IW) { + float scale = *(dptr++); + + FOR3(ocg, OCPG, fh, FH, fw, FW) { + auto oc_tot = group * OCPG + ocg; + int oh = int(ih * param.stride_h + fh * param.dilate_h) - + int(param.pad_h), + ow = int(iw * param.stride_w + fw * param.dilate_w) - + int(param.pad_w); + if (oh >= 0 && ow >= 0 && oh < static_cast(OH) && + ow < static_cast(OW)) { + auto out_off = n * ol.stride[0] + oc_tot * ol.stride[1] + + oh * ol.stride[2] + ow; + size_t flt_off = 0; + if (param.sparse == Param::Convolution::Sparse::DENSE) { + flt_off = icg * fl.stride[0] + + ocg * fl.stride[1] + fh * fl.stride[2] + fw; + } else { + flt_off = group * fl.stride[0] + icg * fl.stride[1] + + ocg * fl.stride[2] + fh * fl.stride[3] + fw; + } + optr[out_off] += scale * fptr[flt_off]; + } + } + } +#undef FOR3 +#undef FOR2 +} + void conv_bwd_flt_brute(const std::vector>& inps, std::shared_ptr& out, const opr::ConvolutionBackwardFilter::Param& param) { @@ -370,7 +439,8 @@ TEST(TestOprDNN, ConvolutionExePolicy) { PersistentCacheHook cache_hook{on_get}; #if MGB_ENABLE_FASTRUN - for (auto strategy: {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE, S::PROFILE_HEURISTIC}) { + for (auto strategy : {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE, + S::PROFILE_HEURISTIC}) { #else for (auto strategy: {S:HEURISTIC, S::PROFILE_HEURISTIC}) { #endif @@ -406,6 +476,95 @@ TEST(TestOprDNN, ConvolutionExePolicy) { } } +TEST(TestOprDNN, ConvolutionBackwardDataBfloat16ExePolicy) { + REQUIRE_GPU(1); + Param param{Mode::CROSS_CORRELATION, 1, 1, 1, 1}; + 
param.compute_mode = Param::ComputeMode::FLOAT32; + using Policy = opr::Convolution::ExecutionPolicy; + using S = Policy::Strategy; + + auto gen_bfp16 = [](HostTensorND& dest) { + RNGxorshf rng{next_rand_seed()}; + auto rand_real = [&rng]() { + std::uniform_real_distribution dist(-1, 1); + return dist(rng); + }; + auto ptr = dest.ptr(); + size_t elems = dest.shape().total_nr_elems(); + for (size_t i = 0; i < elems; i++) { + ptr[i] = dt_bfloat16(rand_real()); + } + }; + + auto f32_to_bf16 = [](const std::shared_ptr& src) + -> std::shared_ptr { + auto ret = std::make_shared( + src->comp_node(), src->shape(), dtype::BFloat16{}); + for (size_t i = 0; i < src->layout().total_nr_elems(); i++) { + ret->ptr()[i] = src->ptr()[i]; + } + return ret; + }; + + auto bf16_to_f32 = [](const std::shared_ptr& src) + -> std::shared_ptr { + auto ret = std::make_shared( + src->comp_node(), src->shape(), dtype::Float32{}); + for (size_t i = 0; i < src->layout().total_nr_elems(); i++) { + ret->ptr()[i] = src->ptr()[i]; + } + return ret; + }; + + int nr_get = 0; + auto on_get = [&nr_get](const std::string&, const void*, size_t, + const void*, size_t) { ++nr_get; }; + PersistentCacheHook cache_hook{on_get}; + +#if MGB_ENABLE_FASTRUN + for (auto strategy : {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE, + S::PROFILE_HEURISTIC}) { +#else + for (auto strategy: {S:HEURISTIC, S::PROFILE_HEURISTIC}) { +#endif + using Checker = AutoOprChecker<2, 1>; + + auto make_graph = [&](const Checker::SymInpArray& inputs) + -> Checker::SymOutArray { + Policy policy; + policy.strategy = strategy; + return {opr::ConvolutionBackwardData::make_deconv( + inputs[0], inputs[1], param, policy)}; + }; + + auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) { + std::shared_ptr out; + conv_bwd_data_brute( + {bf16_to_f32(inp[0]), bf16_to_f32(inp[1])}, out, + param); + dest[0] = *f32_to_bf16(out); + }; + + Checker::RunOptions opt; + opt.outputs_max_err = 1e-3; + nr_get = 0; + Checker(make_graph, fwd) 
+ .disable_grad_check() + .set_input_dtype(0, dtype::BFloat16{}) + .set_input_dtype(1, dtype::BFloat16{}) + .set_input_generator(0, gen_bfp16) + .set_input_generator(1, gen_bfp16) + .run({TensorShape{3, 4, 10, 6}, {4, 2, 3, 3}}, opt) + .run({TensorShape{2, 2, 4, 3}, {2, 2, 3, 3}}, opt) + .run({TensorShape{1, 3, 10, 6}, {3, 2, 3, 3}}, opt); + if (strategy == S::HEURISTIC) { + ASSERT_EQ(0, nr_get); + } else { + ASSERT_LT(0, nr_get); + } + } +} + TEST(TestOprDNN, Deconvolution) { // dilated grouped deconv using Checker = AutoOprChecker<2, 1>; @@ -420,55 +579,9 @@ TEST(TestOprDNN, Deconvolution) { }; auto fwd = [&](Checker::NumOutArray& dest, Checker::NumInpArray inp) { - auto &&data = *inp[0], &&filter = *inp[1]; - size_t N = data.shape(0), IH = data.shape(2), IW = data.shape(3); - size_t GROUP = filter.shape(0), ICPG = filter.shape(1), - OCPG = filter.shape(2), FH = filter.shape(3), - FW = filter.shape(4); - auto get_shp = [](size_t inp, size_t filter, size_t stride, size_t pad, - size_t dilate) { - return (inp - 1) * stride + (filter - 1) * dilate + 1 - pad * 2; - }; - auto &&out = dest[0]; - size_t OH = get_shp(IH, FH, param.stride_h, param.pad_h, - param.dilate_h), - OW = get_shp(IW, FW, param.stride_w, param.pad_w, - param.dilate_w); - out.resize({N, OCPG * GROUP, OH, OW}); - auto fptr = filter.ptr(), dptr = data.ptr(), - optr = out.ptr(); - memset(optr, 0, sizeof(float) * out.shape().total_nr_elems()); - auto ol = out.layout(), fl = filter.layout(); - -#define FOR2(a, A, b, B) \ - for (size_t a = 0; a < A; ++a) \ - for (size_t b = 0; b < B; ++b) -#define FOR3(a, A, b, B, c, C) \ - FOR2(a, A, b, B) \ - for (size_t c = 0; c < C; ++c) - - FOR3(n, N, group, GROUP, icg, ICPG) - FOR2(ih, IH, iw, IW) { - float scale = *(dptr++); - - FOR3(ocg, OCPG, fh, FH, fw, FW) { - auto oc_tot = group * OCPG + ocg; - int oh = int(ih * param.stride_h + fh * param.dilate_h) - - int(param.pad_h), - ow = int(iw * param.stride_w + fw * param.dilate_w) - - int(param.pad_w); - if (oh >= 0 
&& ow >= 0 && oh < static_cast(OH) && - ow < static_cast(OW)) { - auto out_off = n * ol.stride[0] + oc_tot * ol.stride[1] + - oh * ol.stride[2] + ow, - flt_off = group * fl.stride[0] + icg * fl.stride[1] + - ocg * fl.stride[2] + fh * fl.stride[3] + fw; - optr[out_off] += scale * fptr[flt_off]; - } - } - } -#undef FOR3 -#undef FOR2 + std::shared_ptr out; + conv_bwd_data_brute({inp[0], inp[1]}, out, param); + dest[0] = *out; }; Checker::RunOptions opt; @@ -1547,7 +1660,8 @@ TEST(TestOprDNN, LocalShareForwardExecPolicy) { PersistentCacheHook cache_hook{on_get}; #if MGB_ENABLE_FASTRUN - for (auto strategy: {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE, S::PROFILE_HEURISTIC}) { + for (auto strategy : {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE, + S::PROFILE_HEURISTIC}) { #else for (auto strategy: {S:HEURISTIC, S::PROFILE_HEURISTIC}) { #endif @@ -2004,29 +2118,34 @@ TEST(TestOprDNN, HeuristicReproducible) { .run(inp_tensor(1, 5, 3, 7, 9, 3, 3), opt) .run(inp_tensor(3, 4, 4, 9, 9, 3, 3), opt); - auto algo = static_cast( + auto&& megdnn_opr = static_cast( static_cast( bwd_flt->owner_opr()) - ->megdnn_opr()) - ->execution_policy() - .algo; + ->megdnn_opr()); + auto&& algo = megdnn_opr->execution_policy().algo; + megdnn::Algorithm* palgo = + megdnn_opr->get_algorithm_from_desc(algo); + mgb_assert(palgo, "Unknown algo description"); if (strategy == S::HEURISTIC_REPRODUCIBLE) { - EXPECT_TRUE(algo.is_reproducible); + EXPECT_TRUE(palgo->is_reproducible()); } - algo_name0 = algo.name.c_str(); + algo_name0 = palgo->name(); } { Checker checker(make_graph, fwd); checker.run(inp_tensor(2, 3, 4, 9, 8, 3, 3), opt) .run(inp_tensor(1, 5, 3, 7, 9, 3, 3), opt) .run(inp_tensor(3, 4, 4, 9, 9, 3, 3), opt); - auto algo = static_cast( - static_cast( - bwd_flt->owner_opr()) - ->megdnn_opr()) - ->execution_policy() - .algo; - algo_name1 = algo.name.c_str(); + auto&& megdnn_opr = static_cast( + static_cast( + bwd_flt->owner_opr()) + ->megdnn_opr()); + auto&& algo = 
megdnn_opr->execution_policy().algo; + megdnn::Algorithm* palgo = + megdnn_opr->get_algorithm_from_desc(algo); + mgb_assert(palgo, "Unknown algo description"); + + algo_name1 = palgo->name(); } EXPECT_TRUE(algo_name0 == algo_name1); } @@ -2286,6 +2405,8 @@ TEST_F(TestWeightPreprocess, NoPreprocessNeeded) { MockAlgorithm algo; EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _)) .WillRepeatedly(Return(&algo)); + EXPECT_CALL(mock, get_algorithm_from_desc(_)) + .WillRepeatedly(Return(&algo)); EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _)) .WillRepeatedly(Return(0)); EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _)) @@ -2318,6 +2439,9 @@ TEST_F(TestWeightPreprocess, PreprocessCalledOnlyOnce) { EXPECT_CALL(mock, deduce_preprocessed_filter_layout(_, _, _)) .WillRepeatedly(Return(filter_layout)); + EXPECT_CALL(mock, get_algorithm_from_desc(_)) + .WillRepeatedly(Return(&algo)); + Expectation algo_call = EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _)) .WillOnce(Return(&algo)); @@ -2349,7 +2473,6 @@ TEST_F(TestWeightPreprocess, PreprocessCalledOnlyOnce) { pf->tensors[0].ptr()[0] = 114.514f; pf->tensors[1].ptr()[0] = 1926.0817f; })); - // Run the graph multiple times. 
for (int i = 0; i < 3; i++) { if (i > 0) { @@ -2381,6 +2504,8 @@ TEST_F(TestNoWeightPreprocess, NoPreprocess) { MockAlgorithm algo; EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _)) .WillRepeatedly(Return(&algo)); + EXPECT_CALL(mock, get_algorithm_from_desc(_)) + .WillRepeatedly(Return(&algo)); EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _)) .WillRepeatedly(Return(0)); EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _)) diff --git a/src/plugin/test/opr_io_dump_text_out.h b/src/plugin/test/opr_io_dump_text_out.h index edf6eb506..8d443551c 100644 --- a/src/plugin/test/opr_io_dump_text_out.h +++ b/src/plugin/test/opr_io_dump_text_out.h @@ -16,157 +16,157 @@ namespace { const char* EXPECTED_TEXT_OUT_REC[3] = { // rec level 0 R"OUTPUT( -var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2 +var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2 deps: val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s -var1 produced: name=var1 layout={2(3),3(1)} owner_opr=opr0{Host2DeviceCopy} opr0 +var1 produced: name=var1 layout={2(3),3(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0 deps: val: [2.352, 0.1114, -0.2721, 0.7569, -0.2438, ...]min=-0.272 max=2.35 mean=0.471 l2=1.02 sd=0.994 s -var17 produced: name=var17 layout={2(3),3(1)} owner_opr=opr16{Elemwise} opr16 +var17 produced: name=var17 layout={2(3),3(1) Float32} owner_opr=opr16{Elemwise} opr16 deps: [i0]var1: [2.352, 0.1114, -0.2721, 0.7569, -0.2438, ...] s val: [2.352, 0.1114, 0, 0.7569, 0, ...]min=0 max=2.35 mean=0.557 l2=1.01 sd=0.924 s -var11 produced: name=var11 layout={1(3),3(1)} owner_opr=opr10{Subtensor} opr10 +var11 produced: name=var11 layout={1(3),3(1) Float32} owner_opr=opr10{Subtensor} opr10 deps: [i0]var1: [2.352, 0.1114, -0.2721, 0.7569, -0.2438, ...] 
s [i1]var5: [0] s [i2]var7: [1] s val: [2.352, 0.1114, -0.2721]min=-0.272 max=2.35 mean=0.731 l2=1.37 sd=1.42 s -var13 produced: name=var13 layout={2(0),3(1)} owner_opr=opr12{Broadcast} opr12 +var13 produced: name=var13 layout={2(0),3(1) Float32} owner_opr=opr12{Broadcast} opr12 deps: [i0]var11: [2.352, 0.1114, -0.2721] s [i1]var9: [2, 3] s val: [2.352, 0.1114, -0.2721, 2.352, 0.1114, ...]min=-0.272 max=2.35 mean=0.731 l2=1.37 sd=1.27 s -var15 produced: name=var15 layout={2(3),3(1)} owner_opr=opr14{Elemwise} opr14 +var15 produced: name=var15 layout={2(3),3(1) Float32} owner_opr=opr14{Elemwise} opr14 deps: [i0]var3: [2] s [i1]var13: [2.352, 0.1114, -0.2721, 2.352, 0.1114, ...] s val: [4.352, 2.111, 1.728, 4.352, 2.111, ...]min=1.73 max=4.35 mean=2.73 l2=2.97 sd=1.27 s -var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18 +var19 produced: name=var19 layout={2(3),3(1) Float32} owner_opr=opr18{Elemwise} opr18 deps: [i0]var15: [10.24, 0.2352, 0, 3.294, 0, ...] s [i1]var17: [2.352, 0.1114, 0, 0.7569, 0, ...] s val: [10.24, 0.2352, 0, 3.294, 0, ...]min=0 max=10.2 mean=2.33 l2=4.39 sd=4.08 s -var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2 +var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2 deps: val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s -var1 produced: name=var1 layout={2(3),3(1)} owner_opr=opr0{Host2DeviceCopy} opr0 +var1 produced: name=var1 layout={2(3),3(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0 deps: val: [0.05521, 0.724, 1.134, -0.2697, -1.545, ...]min=-1.54 max=1.13 mean=-0.105 l2=0.895 sd=0.974 s -var17 produced: name=var17 layout={2(3),3(1)} owner_opr=opr16{Elemwise} opr16 +var17 produced: name=var17 layout={2(3),3(1) Float32} owner_opr=opr16{Elemwise} opr16 deps: [i0]var1: [0.05521, 0.724, 1.134, -0.2697, -1.545, ...] 
s val: [0.05521, 0.724, 1.134, 0, 0, ...]min=0 max=1.13 mean=0.319 l2=0.55 sd=0.491 s -var11 produced: name=var11 layout={1(3),3(1)} owner_opr=opr10{Subtensor} opr10 +var11 produced: name=var11 layout={1(3),3(1) Float32} owner_opr=opr10{Subtensor} opr10 deps: [i0]var1: [0.05521, 0.724, 1.134, -0.2697, -1.545, ...] s [i1]var5: [0] s [i2]var7: [1] s val: [0.05521, 0.724, 1.134]min=0.0552 max=1.13 mean=0.638 l2=0.778 sd=0.545 s -var13 produced: name=var13 layout={2(0),3(1)} owner_opr=opr12{Broadcast} opr12 +var13 produced: name=var13 layout={2(0),3(1) Float32} owner_opr=opr12{Broadcast} opr12 deps: [i0]var11: [0.05521, 0.724, 1.134] s [i1]var9: [2, 3] s val: [0.05521, 0.724, 1.134, 0.05521, 0.724, ...]min=0.0552 max=1.13 mean=0.638 l2=0.778 sd=0.487 s -var15 produced: name=var15 layout={2(3),3(1)} owner_opr=opr14{Elemwise} opr14 +var15 produced: name=var15 layout={2(3),3(1) Float32} owner_opr=opr14{Elemwise} opr14 deps: [i0]var3: [2] s [i1]var13: [0.05521, 0.724, 1.134, 0.05521, 0.724, ...] s val: [2.055, 2.724, 3.134, 2.055, 2.724, ...]min=2.06 max=3.13 mean=2.64 l2=2.68 sd=0.487 s -var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18 +var19 produced: name=var19 layout={2(3),3(1) Float32} owner_opr=opr18{Elemwise} opr18 deps: [i0]var15: [0.1135, 1.972, 3.556, 0, 0, ...] s [i1]var17: [0.05521, 0.724, 1.134, 0, 0, ...] 
s val: [0.1135, 1.972, 3.556, 0, 0, ...]min=0 max=3.56 mean=0.94 l2=1.66 sd=1.5 s -var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2 +var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2 deps: val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s -var1 produced: name=var1 layout={2(3),3(1)} owner_opr=opr0{Host2DeviceCopy} opr0 +var1 produced: name=var1 layout={2(3),3(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0 deps: val: [-0.5069, 0.4525, 0.1695, -0.02793, -0.1907, ...]min=-0.507 max=1.32 mean=0.203 l2=0.616 sd=0.637 s -var17 produced: name=var17 layout={2(3),3(1)} owner_opr=opr16{Elemwise} opr16 +var17 produced: name=var17 layout={2(3),3(1) Float32} owner_opr=opr16{Elemwise} opr16 deps: [i0]var1: [-0.5069, 0.4525, 0.1695, -0.02793, -0.1907, ...] s val: [0, 0.4525, 0.1695, 0, 0, ...]min=0 max=1.32 mean=0.324 l2=0.574 sd=0.52 s -var11 produced: name=var11 layout={1(3),3(1)} owner_opr=opr10{Subtensor} opr10 +var11 produced: name=var11 layout={1(3),3(1) Float32} owner_opr=opr10{Subtensor} opr10 deps: [i0]var1: [-0.5069, 0.4525, 0.1695, -0.02793, -0.1907, ...] s [i1]var5: [0] s [i2]var7: [1] s val: [-0.5069, 0.4525, 0.1695]min=-0.507 max=0.453 mean=0.0384 l2=0.404 sd=0.493 s -var13 produced: name=var13 layout={2(0),3(1)} owner_opr=opr12{Broadcast} opr12 +var13 produced: name=var13 layout={2(0),3(1) Float32} owner_opr=opr12{Broadcast} opr12 deps: [i0]var11: [-0.5069, 0.4525, 0.1695] s [i1]var9: [2, 3] s val: [-0.5069, 0.4525, 0.1695, -0.5069, 0.4525, ...]min=-0.507 max=0.453 mean=0.0384 l2=0.404 sd=0.441 s -var15 produced: name=var15 layout={2(3),3(1)} owner_opr=opr14{Elemwise} opr14 +var15 produced: name=var15 layout={2(3),3(1) Float32} owner_opr=opr14{Elemwise} opr14 deps: [i0]var3: [2] s [i1]var13: [-0.5069, 0.4525, 0.1695, -0.5069, 0.4525, ...] 
s val: [1.493, 2.453, 2.17, 1.493, 2.453, ...]min=1.49 max=2.45 mean=2.04 l2=2.08 sd=0.441 s -var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18 +var19 produced: name=var19 layout={2(3),3(1) Float32} owner_opr=opr18{Elemwise} opr18 deps: [i0]var15: [0, 1.11, 0.3678, 0, 0, ...] s [i1]var17: [0, 0.4525, 0.1695, 0, 0, ...] s val: [0, 1.11, 0.3678, 0, 0, ...]min=0 max=2.87 mean=0.724 l2=1.26 sd=1.13 s -var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2 +var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2 deps: val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s -var1 produced: name=var1 layout={2(3),3(1)} owner_opr=opr0{Host2DeviceCopy} opr0 +var1 produced: name=var1 layout={2(3),3(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0 deps: val: [-0.03637, 2.111, 0.3236, -0.4861, -2.071, ...]min=-2.07 max=2.11 mean=0.0589 l2=1.25 sd=1.37 s -var17 produced: name=var17 layout={2(3),3(1)} owner_opr=opr16{Elemwise} opr16 +var17 produced: name=var17 layout={2(3),3(1) Float32} owner_opr=opr16{Elemwise} opr16 deps: [i0]var1: [-0.03637, 2.111, 0.3236, -0.4861, -2.071, ...] s val: [0, 2.111, 0.3236, 0, 0, ...]min=0 max=2.11 mean=0.491 l2=0.897 sd=0.822 s -var11 produced: name=var11 layout={1(3),3(1)} owner_opr=opr10{Subtensor} opr10 +var11 produced: name=var11 layout={1(3),3(1) Float32} owner_opr=opr10{Subtensor} opr10 deps: [i0]var1: [-0.03637, 2.111, 0.3236, -0.4861, -2.071, ...] 
s [i1]var5: [0] s [i2]var7: [1] s val: [-0.03637, 2.111, 0.3236]min=-0.0364 max=2.11 mean=0.799 l2=1.23 sd=1.15 s -var13 produced: name=var13 layout={2(0),3(1)} owner_opr=opr12{Broadcast} opr12 +var13 produced: name=var13 layout={2(0),3(1) Float32} owner_opr=opr12{Broadcast} opr12 deps: [i0]var11: [-0.03637, 2.111, 0.3236] s [i1]var9: [2, 3] s val: [-0.03637, 2.111, 0.3236, -0.03637, 2.111, ...]min=-0.0364 max=2.11 mean=0.799 l2=1.23 sd=1.03 s -var15 produced: name=var15 layout={2(3),3(1)} owner_opr=opr14{Elemwise} opr14 +var15 produced: name=var15 layout={2(3),3(1) Float32} owner_opr=opr14{Elemwise} opr14 deps: [i0]var3: [2] s [i1]var13: [-0.03637, 2.111, 0.3236, -0.03637, 2.111, ...] s val: [1.964, 4.111, 2.324, 1.964, 4.111, ...]min=1.96 max=4.11 mean=2.8 l2=2.95 sd=1.03 s -var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18 +var19 produced: name=var19 layout={2(3),3(1) Float32} owner_opr=opr18{Elemwise} opr18 deps: [i0]var15: [0, 8.675, 0.7518, 0, 0, ...] s [i1]var17: [0, 2.111, 0.3236, 0, 0, ...] s val: [0, 8.675, 0.7518, 0, 0, ...]min=0 max=8.68 mean=1.77 l2=3.59 sd=3.42 s -var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2 +var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2 deps: val: [2]min=2 max=2 mean=2 l2=2 sd=N/A s -var1 produced: name=var1 layout={5(4),4(1)} owner_opr=opr0{Host2DeviceCopy} opr0 +var1 produced: name=var1 layout={5(4),4(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0 deps: val: [-1.199, -1.02, 1.098, -1.472, -0.3848, ...]min=-2.24 max=1.25 mean=-0.347 l2=1.04 sd=1.01 s -var17 produced: name=var17 layout={5(4),4(1)} owner_opr=opr16{Elemwise} opr16 +var17 produced: name=var17 layout={5(4),4(1) Float32} owner_opr=opr16{Elemwise} opr16 deps: [i0]var1: [-1.199, -1.02, 1.098, -1.472, -0.3848, ...] 
s val: [0, 0, 1.098, 0, 0, ...]min=0 max=1.25 mean=0.262 l2=0.471 sd=0.402 s -var11 produced: name=var11 layout={1(4),4(1)} owner_opr=opr10{Subtensor} opr10 +var11 produced: name=var11 layout={1(4),4(1) Float32} owner_opr=opr10{Subtensor} opr10 deps: [i0]var1: [-1.199, -1.02, 1.098, -1.472, -0.3848, ...] s [i1]var5: [0] s [i2]var7: [1] s val: [-1.199, -1.02, 1.098, -1.472]min=-1.47 max=1.1 mean=-0.648 l2=1.21 sd=1.18 s -var13 produced: name=var13 layout={5(0),4(1)} owner_opr=opr12{Broadcast} opr12 +var13 produced: name=var13 layout={5(0),4(1) Float32} owner_opr=opr12{Broadcast} opr12 deps: [i0]var11: [-1.199, -1.02, 1.098, -1.472] s [i1]var9: [5, 4] s val: [-1.199, -1.02, 1.098, -1.472, -1.199, ...]min=-1.47 max=1.1 mean=-0.648 l2=1.21 sd=1.05 s -var15 produced: name=var15 layout={5(4),4(1)} owner_opr=opr14{Elemwise} opr14 +var15 produced: name=var15 layout={5(4),4(1) Float32} owner_opr=opr14{Elemwise} opr14 deps: [i0]var3: [2] s [i1]var13: [-1.199, -1.02, 1.098, -1.472, -1.199, ...] s val: [0.8006, 0.9802, 3.098, 0.5279, 0.8006, ...]min=0.528 max=3.1 mean=1.35 l2=1.69 sd=1.05 s -var19 produced: name=var19 layout={5(4),4(1)} owner_opr=opr18{Elemwise} opr18 +var19 produced: name=var19 layout={5(4),4(1) Float32} owner_opr=opr18{Elemwise} opr18 deps: [i0]var15: [0, 0, 3.401, 0, 0, ...] s [i1]var17: [0, 0, 1.098, 0, 0, ...] 
s @@ -176,33 +176,33 @@ var19 produced: name=var19 layout={5(4),4(1)} owner_opr=opr18{Elemwise} opr18 // rec level 1 R"OUTPUT( ==== begin lazy value recording -var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2 +var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2 deps: val: s -var1 produced: name=var1 layout={2(3),3(1)} owner_opr=opr0{Host2DeviceCopy} opr0 +var1 produced: name=var1 layout={2(3),3(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0 deps: val: s -var17 produced: name=var17 layout={2(3),3(1)} owner_opr=opr16{Elemwise} opr16 +var17 produced: name=var17 layout={2(3),3(1) Float32} owner_opr=opr16{Elemwise} opr16 deps: [i0]var1: s val: s -var11 produced: name=var11 layout={1(3),3(1)} owner_opr=opr10{Subtensor} opr10 +var11 produced: name=var11 layout={1(3),3(1) Float32} owner_opr=opr10{Subtensor} opr10 deps: [i0]var1: s [i1]var5: [0] s [i2]var7: [1] s val: s -var13 produced: name=var13 layout={2(0),3(1)} owner_opr=opr12{Broadcast} opr12 +var13 produced: name=var13 layout={2(0),3(1) Float32} owner_opr=opr12{Broadcast} opr12 deps: [i0]var11: s [i1]var9: [2, 3] s val: s -var15 produced: name=var15 layout={2(3),3(1)} owner_opr=opr14{Elemwise} opr14 +var15 produced: name=var15 layout={2(3),3(1) Float32} owner_opr=opr14{Elemwise} opr14 deps: [i0]var3: s [i1]var13: s val: s -var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18 +var19 produced: name=var19 layout={2(3),3(1) Float32} owner_opr=opr18{Elemwise} opr18 deps: [i0]var15: s [i1]var17: s @@ -242,33 +242,33 @@ var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18 // rec level 2 R"OUTPUT( ==== begin lazy value recording -var3 produced: name=var3 layout={1(1)} owner_opr=opr2{ImmutableTensor} opr2 +var3 produced: name=var3 layout={1(1) Float32} owner_opr=opr2{ImmutableTensor} opr2 deps: val: s -var1 produced: name=var1 layout={2(3),3(1)} owner_opr=opr0{Host2DeviceCopy} opr0 +var1 produced: name=var1 
layout={2(3),3(1) Float32} owner_opr=opr0{Host2DeviceCopy} opr0 deps: val: s -var17 produced: name=var17 layout={2(3),3(1)} owner_opr=opr16{Elemwise} opr16 +var17 produced: name=var17 layout={2(3),3(1) Float32} owner_opr=opr16{Elemwise} opr16 deps: [i0]var1: s val: s -var11 produced: name=var11 layout={1(3),3(1)} owner_opr=opr10{Subtensor} opr10 +var11 produced: name=var11 layout={1(3),3(1) Float32} owner_opr=opr10{Subtensor} opr10 deps: [i0]var1: s [i1]var5: [0] s [i2]var7: [1] s val: s -var13 produced: name=var13 layout={2(0),3(1)} owner_opr=opr12{Broadcast} opr12 +var13 produced: name=var13 layout={2(0),3(1) Float32} owner_opr=opr12{Broadcast} opr12 deps: [i0]var11: s [i1]var9: [2, 3] s val: s -var15 produced: name=var15 layout={2(3),3(1)} owner_opr=opr14{Elemwise} opr14 +var15 produced: name=var15 layout={2(3),3(1) Float32} owner_opr=opr14{Elemwise} opr14 deps: [i0]var3: s [i1]var13: s val: s -var19 produced: name=var19 layout={2(3),3(1)} owner_opr=opr18{Elemwise} opr18 +var19 produced: name=var19 layout={2(3),3(1) Float32} owner_opr=opr18{Elemwise} opr18 deps: [i0]var15: s [i1]var17: s -- GitLab