From c14e5719f84fbffec731711a3be41b5706fbd87a Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Mon, 2 Aug 2021 10:43:00 +0800 Subject: [PATCH] feat(mgb/gopt): add profile impl for global layout transform pass GitOrigin-RevId: 8ef62baf792c97c7a226dd791af167ab2e8707b4 --- dnn/src/aarch64/relayout/opr_impl.cpp | 7 + dnn/src/armv7/relayout/opr_impl.cpp | 7 + src/gopt/impl/opr_format_modifier.cpp | 313 ++++++++++ src/gopt/impl/opr_format_modifier.h | 36 ++ src/gopt/impl/opr_tensor_formats_config.cpp | 582 ++++++++++++++++++ src/gopt/impl/profiler_impl.cpp | 527 ++++++++++++++++ src/gopt/impl/reformat_emitter.cpp | 24 +- src/gopt/impl/reformat_manager.cpp | 189 +++--- src/gopt/impl/utils.h | 105 ++++ .../megbrain/gopt/global_layout_transform.h | 176 ++++++ .../include/megbrain/gopt/reformat_emitter.h | 6 +- .../include/megbrain/gopt/reformat_manager.h | 14 + .../megbrain/gopt/subgraph_extractor.h | 5 +- src/gopt/test/profiler.cpp | 429 +++++++++++++ src/gopt/test/reformat_manager.cpp | 2 - 15 files changed, 2341 insertions(+), 81 deletions(-) create mode 100644 src/gopt/impl/opr_format_modifier.cpp create mode 100644 src/gopt/impl/opr_format_modifier.h create mode 100644 src/gopt/impl/opr_tensor_formats_config.cpp create mode 100644 src/gopt/impl/profiler_impl.cpp create mode 100644 src/gopt/impl/utils.h create mode 100644 src/gopt/include/megbrain/gopt/global_layout_transform.h create mode 100644 src/gopt/test/profiler.cpp diff --git a/dnn/src/aarch64/relayout/opr_impl.cpp b/dnn/src/aarch64/relayout/opr_impl.cpp index 8e7f8524d..8af0b8cc2 100644 --- a/dnn/src/aarch64/relayout/opr_impl.cpp +++ b/dnn/src/aarch64/relayout/opr_impl.cpp @@ -166,6 +166,13 @@ void aarch64::RelayoutForwardImpl::exec(_megdnn_tensor_in src0, TensorND src = src0, dst = dst0; check_layout_and_canonize(src.layout, dst.layout); + // FIXME: optimize for lowbit cases + if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS4 || + src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) { + fallback::RelayoutForwardImpl::exec(src0, dst0, src_handle); + return; + } + relayout::TransposeParam trans_param; bool trans = relayout::is_transpose(src.layout, dst.layout, trans_param); if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 1) { diff --git a/dnn/src/armv7/relayout/opr_impl.cpp b/dnn/src/armv7/relayout/opr_impl.cpp index 1701dadc6..074d1a52d 100644 --- a/dnn/src/armv7/relayout/opr_impl.cpp +++ b/dnn/src/armv7/relayout/opr_impl.cpp @@ -134,6 +134,13 @@ void armv7::RelayoutForwardImpl::exec(_megdnn_tensor_in src0, TensorND src = src0, dst = dst0; check_layout_and_canonize(src.layout, dst.layout); + // FIXME: optimize for lowbit cases + if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS4 || + src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) { + fallback::RelayoutForwardImpl::exec(src0, dst0, src_handle); + return; + } + relayout::TransposeParam trans_param; bool trans = relayout::is_transpose(src.layout, dst.layout, trans_param); if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 1) { diff --git a/src/gopt/impl/opr_format_modifier.cpp b/src/gopt/impl/opr_format_modifier.cpp new file mode 100644 index 000000000..5bc6d280d --- /dev/null +++ b/src/gopt/impl/opr_format_modifier.cpp @@ -0,0 +1,313 @@ +/** + * \file src/gopt/impl/opr_format_modifier.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. 
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ */
+
+#include "./opr_format_modifier.h"
+#include "megbrain/opr/dnn/convolution.h"
+#include "megbrain/opr/dnn/pooling.h"
+#include "megbrain/opr/imgproc.h"
+#include "megbrain/opr/io.h"
+#include "megbrain/serialization/sereg.h"
+
+#include "midout.h"
+MIDOUT_DECL(megbrain_opr_format_modifier)
+#define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_format_modifier, __VA_ARGS__) {
+#define MIDOUT_E \
+ } \
+ MIDOUT_END();
+
+using namespace mgb;
+using namespace opr;
+
+namespace {
+template
+struct MakeConvCaller2 {
+ template
+ static VarNode* make(const cg::VarNodeArray& inputs,
+ const typename MegDNNConv::Param& param,
+ const megdnn::param::ExecutionPolicy& execution_policy,
+ const OperatorNodeConfig& config) {
+ if (inputs.size() == 2) {
+ return Opr::make(inputs[0], inputs[1], param, execution_policy,
+ config)
+ .node();
+ }
+ return nullptr;
+ }
+};
+
+template
+struct MakeConvCaller3 {
+ template
+ static VarNode* make(const cg::VarNodeArray& inputs,
+ const typename MegDNNConv::Param& param,
+ const megdnn::param::ExecutionPolicy& execution_policy,
+ const OperatorNodeConfig& config) {
+ if (inputs.size() == 3) {
+ return Opr::make(inputs[0], inputs[1], inputs[2], param,
+ execution_policy, config)
+ .node();
+ }
+ return nullptr;
+ }
+};
+
+template
+struct MakeConvCaller4 {
+ template
+ static VarNode* make(const cg::VarNodeArray& inputs,
+ const typename MegDNNConv::Param& param,
+ const megdnn::param::ExecutionPolicy& execution_policy,
+ const OperatorNodeConfig& config) {
+ if (inputs.size() == 4) {
+ return Opr::make(inputs[0], inputs[1], inputs[2], inputs[3], param,
+ execution_policy, config)
+ .node();
+ }
+ return nullptr;
+ }
+};
+
+template
+struct MakeConvCaller5 {
+ template
+ static VarNode* make(const cg::VarNodeArray& inputs,
+ const typename MegDNNConv::Param& param,
+ const megdnn::param::ExecutionPolicy& execution_policy,
+ const OperatorNodeConfig& config) {
+ if (inputs.size() == 5) {
+ return Opr::make(inputs[0], inputs[1], inputs[2], inputs[3],
+ inputs[4], param, execution_policy, config)
+ .node();
+ }
+ return nullptr;
+ }
+};
+
+template
+struct MakeConvCallerEmpty {
+ template
+ static VarNode* make(const cg::VarNodeArray&,
+ const typename MegDNNConv::Param&,
+ const megdnn::param::ExecutionPolicy&,
+ const OperatorNodeConfig&) {
+ return nullptr;
+ }
+};
+
+template ,
+ class Maker2 = MakeConvCallerEmpty,
+ typename ConvParam = megdnn::param::Convolution>
+struct ConvMakerImpl {
+ static VarNode* make(const cg::VarNodeArray& inputs, const ConvParam& param,
+ const megdnn::param::ExecutionPolicy& execution_policy,
+ const OperatorNodeConfig& config) {
+ VarNode* ret = Maker0::template make(inputs, param,
+ execution_policy, config);
+ if (!ret) {
+ ret = Maker1::template make(inputs, param, execution_policy,
+ config);
+ }
+ if (!ret) {
+ ret = Maker2::template make(inputs, param, execution_policy,
+ config);
+ }
+ mgb_assert(ret);
+ return ret;
+ }
+};
+
+template
+struct ConvMaker;
+
+template <>
+struct ConvMaker
+ : public ConvMakerImpl,
+ megdnn::Convolution> {};
+template <>
+struct ConvMaker
+ : public ConvMakerImpl,
+ megdnn::Convolution,
+ MakeConvCaller3> {};
+
+template <>
+struct ConvMaker
+ : public ConvMakerImpl,
+ megdnn::ConvBiasForward,
+ MakeConvCaller3,
+ MakeConvCaller4,
+ megdnn::param::ConvBias> {};
+template <> +struct ConvMaker + : public ConvMakerImpl, + megdnn::BatchConvBiasForward, + MakeConvCaller3, + MakeConvCaller4, + megdnn::param::BatchConvBias> {}; + +#if 0 +#include "../../opr/impl/internal/invoke.h" +template +struct MultiAlgoOprTrait; + +#define APPLY(statement, ...) \ + mgb::apply([&](const auto&... args) { return statement; }, \ + std::tuple_cat(__VA_ARGS__)) + +#define INST(_Opr) \ + template <> \ + struct MultiAlgoOprTrait<_Opr> { \ + static constexpr bool has_algo = true; \ + using MegDNNOpr = megdnn::_Opr; \ + static constexpr int arity = OprArityTrait::arity; \ + using FixedTensorLayouts = std::array; \ + static bool has_available_algo(const VarNodeArray& i, \ + const cg::OperatorNodeBase* opr_) { \ + MIDOUT_B(midout_iv(MGB_HASH_STR(#_Opr)), \ + midout_iv(MGB_HASH_STR("has_available_algo"))) \ + auto&& opr = opr_->cast_final_safe<_Opr>(); \ + auto&& megdnn_opr = \ + reinterpret_cast(opr.megdnn_opr()); \ + FixedTensorLayouts array_layouts; \ + size_t in = i.size() - 1; \ + for (size_t idx = 0; idx < in; idx++) { \ + const auto& v = i[idx]; \ + array_layouts[idx] = \ + TensorLayout{v->shape(), v->dtype(), v->format()}; \ + } \ + const auto& v = i[in]; \ + array_layouts[arity - 1] = \ + TensorLayout{v->shape(), v->dtype(), v->format()}; \ + return APPLY(::megdnn::has_available_algo(megdnn_opr, args...), \ + array_layouts); \ + MIDOUT_E \ + } \ + }; +INST(Convolution) +INST(ConvBiasForward) +INST(ConvolutionBackwardData) +INST(PoolingForward) +#undef APPLY +#undef INST +#endif +} // namespace + +namespace mgb { +namespace gopt { +namespace intl { + +template +struct OprFormatModifier; + +#define INST(_Opr) \ + template <> \ + struct OprFormatModifier<_Opr> { \ + using OprFormat = typename _Opr::Param::Format; \ + static VarNode* make(OprFormat opr_format, const VarNodeArray& i, \ + const cg::OperatorNodeBase* opr_) { \ + MIDOUT_B(_Opr) \ + auto&& opr = opr_->cast_final_safe<_Opr>(); \ + auto param = opr.param(); \ + param.format = opr_format; \ + return ConvMaker<_Opr>::make(i, param, opr.execution_policy(), \ + opr.config()); \ + MIDOUT_E \ + } \ + }; +INST(Convolution); +INST(ConvBiasForward); +INST(ConvolutionBackwardData); +INST(BatchConvBiasForward); +#undef INST + +template <> +struct OprFormatModifier { + using Opr = opr::WarpPerspective; + using OprFormat = typename Opr::Param::Format; + static VarNode* make(OprFormat opr_format, const VarNodeArray& i, + const cg::OperatorNodeBase* opr_) { + MIDOUT_B(Opr) + auto&& opr = opr_->cast_final_safe(); + auto param = opr.param(); + param.format = opr_format; + if (i.size() == 3) { + return Opr::make(i[0], i[1], i[2], param, opr.config()).node(); + } else { + mgb_assert(i.size() == 4); + return Opr::make(i[0], i[1], i[2], i[3], param, opr.config()) + .node(); + } + MIDOUT_E + } +}; + +#define INST(_Opr, _arity) \ + template <> \ + struct OprFormatModifier<_Opr> { \ + using OprFormat = typename _Opr::Param::Format; \ + static VarNode* make(OprFormat opr_format, const VarNodeArray& i, \ + const cg::OperatorNodeBase* opr_) { \ + MIDOUT_B(_Opr) \ + auto&& opr = opr_->cast_final_safe<_Opr>(); \ + auto param = opr.param(); \ + param.format = opr_format; \ + return serialization::OprMaker<_Opr, _arity>::make( \ + param, i, *i[0]->owner_graph(), opr.config()) \ + ->output(0); \ + MIDOUT_E \ + } \ + }; +INST(PoolingForward, 1); +INST(Resize, 2); +#undef INST + +VarNode* modify_opr_format(opr::ConvBias::Param::Format opr_format, + const VarNodeArray& i, + const cg::OperatorNodeBase* opr) { +#define cb(_Opr) \ + if 
(opr->dyn_typeinfo() == _Opr::typeinfo()) { \
+ return OprFormatModifier<_Opr>::make(opr_format, i, opr); \
+ } else
+ FOREACH_FORMAT_AWARE_OPR(cb) {
+ mgb_throw(InternalError, "invalid format aware operator(got:%s)",
+ opr->dyn_typeinfo()->name);
+ }
+#undef cb
+}
+
+#if 0
+bool has_available_algo(const VarNodeArray& i,
+ const cg::OperatorNodeBase* opr) {
+#define cb(_Opr) \
+ if (opr->dyn_typeinfo() == _Opr::typeinfo()) { \
+ MGB_MARK_USED_VAR(MultiAlgoOprTrait<_Opr>::has_algo); \
+ VarNodeArray _ = i; \
+ _.emplace_back(opr->output(0)); \
+ return MultiAlgoOprTrait<_Opr>::has_available_algo(_, opr); \
+ } else
+ cb(Convolution) cb(ConvBiasForward) cb(ConvolutionBackwardData)
+ cb(PoolingForward) {
+ mgb_throw(InternalError, "invalid multi-algo operator(got:%s)",
+ opr->dyn_typeinfo()->name);
+ }
+}
+#endif
+
+} // namespace intl
+} // namespace gopt
+} // namespace mgb
+
+// vim: syntax=cpp.doxygen
diff --git a/src/gopt/impl/opr_format_modifier.h b/src/gopt/impl/opr_format_modifier.h
new file mode 100644
index 000000000..e3221ff46
--- /dev/null
+++ b/src/gopt/impl/opr_format_modifier.h
@@ -0,0 +1,36 @@
+/**
+ * \file src/gopt/impl/opr_format_modifier.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ */
+
+#pragma once
+#include "megbrain/graph.h"
+#include "megbrain/opr/dnn/convolution.h"
+
+namespace mgb {
+namespace gopt {
+namespace intl {
+
+#define FOREACH_FORMAT_AWARE_OPR(cb) \
+ cb(Convolution) cb(ConvBiasForward) cb(ConvolutionBackwardData) \
+ cb(PoolingForward) cb(WarpPerspective) cb(Resize)
+#if 0
+bool has_available_algo(const VarNodeArray& i, const cg::OperatorNodeBase* opr);
+#endif
+
+VarNode* modify_opr_format(opr::ConvBias::Param::Format opr_format,
+ const VarNodeArray& i,
+ const cg::OperatorNodeBase* opr);
+
+} // namespace intl
+} // namespace gopt
+} // namespace mgb
+
+// vim: syntax=cpp.doxygen
diff --git a/src/gopt/impl/opr_tensor_formats_config.cpp b/src/gopt/impl/opr_tensor_formats_config.cpp
new file mode 100644
index 000000000..db74bcec3
--- /dev/null
+++ b/src/gopt/impl/opr_tensor_formats_config.cpp
@@ -0,0 +1,582 @@
+/**
+ * \file src/gopt/impl/opr_tensor_formats_config.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ */
+
+#include "./utils.h"
+#include "megbrain/gopt/global_layout_transform.h"
+#include "megbrain/opr/dnn/pooling.h"
+#include "megbrain/opr/imgproc.h"
+
+#include "midout.h"
+MIDOUT_DECL(megbrain_opr_tensor_formats_config)
+#define MIDOUT_B(...)
\ + MIDOUT_BEGIN(megbrain_opr_tensor_formats_config, __VA_ARGS__) { +#define MIDOUT_E \ + } \ + MIDOUT_END(); + +using namespace mgb; +using namespace cg; +using namespace gopt; +using OprFormat = opr::ConvBias::Param::Format; + +namespace { +template +struct ConvParamTrait; + +#define INST(_conv, _weight_idx, _bias_idx, _has_bias) \ + template <> \ + struct ConvParamTrait { \ + static constexpr int weight_idx = _weight_idx; \ + static constexpr int bias_idx = _bias_idx; \ + static constexpr bool has_bias = _has_bias; \ + } +INST(ConvBias, 1, 2, true); +INST(ConvolutionForward, 1, 0, false); +INST(ConvolutionBackwardData, 0, 0, false); + +template ::weight_idx> +static bool is_channel_wise_conv(const OperatorNodeBase* opr) { + MGB_MARK_USED_VAR(ConvParamTrait::has_bias); + MGB_MARK_USED_VAR(ConvParamTrait::bias_idx); + auto&& conv = opr->cast_final_safe(); + auto format = conv.param().format; + auto weight = opr->input(weight_idx); + auto weight_shp = weight->shape(); + if (conv.param().sparse == Opr::Param::Sparse::DENSE) + return false; + size_t ocpg, icpg; + if (format == Opr::Param::Format::NCHW) { + ocpg = weight_shp[1], icpg = weight_shp[2]; + return ocpg == 1 && icpg == 1; + } + return false; +} + +template +struct OprSingleInOutTensorFormatsDispatcherImpl; + +template <> +struct OprSingleInOutTensorFormatsDispatcherImpl { + static Maybe dispatch( + const OperatorNodeBase* opr) { + OprTensorFormatsConfiguration config; + config.typeinfo = opr->dyn_typeinfo(); + config.opr_format = OprFormat::NCHW; + config.input_dtypes = {opr->input(0)->dtype().enumv()}; + config.input_tensor_types = {TensorType::FEATURE}; + config.output_dtypes = {opr->output(0)->dtype().enumv()}; + config.input_tensor_formats = {TensorFormats::NCHW}; + config.output_tensor_formats = {TensorFormats::NCHW}; + return config; + } +}; + +template <> +struct OprSingleInOutTensorFormatsDispatcherImpl { + static Maybe dispatch( + const OperatorNodeBase* opr) { + OprTensorFormatsConfiguration config; + config.typeinfo = opr->dyn_typeinfo(); + config.opr_format = OprFormat::NCHW4; + bool available = true; + available &= opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS8; + config.input_dtypes = {opr->input(0)->dtype().enumv()}; + config.input_tensor_types = {TensorType::FEATURE}; + available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8; + config.output_dtypes = {opr->output(0)->dtype().enumv()}; + config.input_tensor_formats = {TensorFormats::NCHWc4}; + config.output_tensor_formats = {TensorFormats::NCHWc4}; + if (available) + return config; + return None; + } +}; + +template <> +struct OprSingleInOutTensorFormatsDispatcherImpl { + static Maybe dispatch( + const OperatorNodeBase* opr) { + OprTensorFormatsConfiguration config; + config.typeinfo = opr->dyn_typeinfo(); + config.opr_format = OprFormat::CHWN4; + bool available = true; + available &= opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS8; + config.input_dtypes = {opr->input(0)->dtype().enumv()}; + config.input_tensor_types = {TensorType::FEATURE}; + available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8; + config.output_dtypes = {opr->output(0)->dtype().enumv()}; + config.input_tensor_formats = {TensorFormats::CHWNc4}; + config.output_tensor_formats = {TensorFormats::CHWNc4}; + if (available) + return config; + return None; + } +}; + +template <> +struct OprSingleInOutTensorFormatsDispatcherImpl { + static Maybe dispatch( + const OperatorNodeBase* opr) { + OprTensorFormatsConfiguration config; + config.typeinfo = 
opr->dyn_typeinfo(); + config.opr_format = OprFormat::NCHW32; + bool available = true; + available &= opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS8; + config.input_dtypes = {opr->input(0)->dtype().enumv()}; + config.input_tensor_types = {TensorType::FEATURE}; + available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8; + config.output_dtypes = {opr->output(0)->dtype().enumv()}; + config.input_tensor_formats = {TensorFormats::NCHWc32}; + config.output_tensor_formats = {TensorFormats::NCHWc32}; + if (available) + return config; + return None; + } +}; + +template <> +struct OprSingleInOutTensorFormatsDispatcherImpl { + static Maybe dispatch( + const OperatorNodeBase* opr) { + OprTensorFormatsConfiguration config; + config.typeinfo = opr->dyn_typeinfo(); + config.opr_format = OprFormat::NHWC; + bool available = true; + available &= + opr->input(0)->dtype().enumv() == DTypeEnum::Quantized4Asymm || + opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS4; + config.input_dtypes = {opr->input(0)->dtype().enumv()}; + config.input_tensor_types = {TensorType::FEATURE}; + available &= opr->output(0)->dtype().enumv() == + opr->input(0)->dtype().enumv(); + config.output_dtypes = {opr->output(0)->dtype().enumv()}; + config.input_tensor_formats = {TensorFormats::NHWC}; + config.output_tensor_formats = {TensorFormats::NHWC}; + if (available) + return config; + return None; + } +}; + +template <> +struct OprSingleInOutTensorFormatsDispatcherImpl { + static Maybe dispatch( + const OperatorNodeBase* opr) { + OprTensorFormatsConfiguration config; + config.typeinfo = opr->dyn_typeinfo(); + config.opr_format = OprFormat::NCHW64; + bool available = true; + available &= + opr->input(0)->dtype().enumv() == DTypeEnum::Quantized4Asymm || + opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS4; + config.input_dtypes = {opr->input(0)->dtype().enumv()}; + config.input_tensor_types = {TensorType::FEATURE}; + available &= opr->output(0)->dtype().enumv() == + opr->input(0)->dtype().enumv(); + config.output_dtypes = {opr->output(0)->dtype().enumv()}; + config.input_tensor_formats = {TensorFormats::NCHWc64}; + config.output_tensor_formats = {TensorFormats::NCHWc64}; + if (available) + return config; + return None; + } +}; + +template +struct ConvTensorFormatsDispatcherImpl; + +template +struct ConvTensorFormatsDispatcherImpl { + static Maybe dispatch( + const OperatorNodeBase* opr) { + const auto& conv = opr->cast_final_safe(); + OprTensorFormatsConfiguration config; + config.typeinfo = opr->dyn_typeinfo(); + config.opr_format = OprFormat::NCHW; + // setup dtypes + for (size_t i = 0; i < opr->input().size(); ++i) { + config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv()); + TensorType tensor_type = + i == 1 ? 
TensorType::WEIGHT : TensorType::FEATURE; + config.input_tensor_types.emplace_back(tensor_type); + } + config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv()); + // setup tensor formats + if (conv.param().sparse == Opr::Param::Sparse::DENSE) { + config.input_tensor_formats = { + TensorFormats::NCHW, TensorFormats::NCHW, + TensorFormats::NCHW, TensorFormats::NCHW}; + } else { + mgb_assert(conv.param().sparse == Opr::Param::Sparse::GROUP); + if (is_channel_wise_conv(opr)) { + config.input_tensor_formats = { + TensorFormats::NCHW, TensorFormats::C11RS, + TensorFormats::NCHW, TensorFormats::NCHW}; + } else { + config.input_tensor_formats = { + TensorFormats::NCHW, TensorFormats::GKCRS, + TensorFormats::NCHW, TensorFormats::NCHW}; + } + } + config.output_tensor_formats = {TensorFormats::NCHW}; + return config; + } +}; + +template +struct ConvTensorFormatsDispatcherImpl { + static Maybe dispatch( + const OperatorNodeBase* opr) { + const auto& conv = opr->cast_final_safe(); + OprTensorFormatsConfiguration config; + config.typeinfo = opr->dyn_typeinfo(); + config.opr_format = OprFormat::NHWC; + bool available = true; + for (size_t i = 0; i < opr->input().size(); ++i) { + if (i == 2) + available &= opr->input(i)->dtype().enumv() == + DTypeEnum::QuantizedS32; + else + available &= opr->input(i)->dtype().enumv() == + DTypeEnum::Quantized4Asymm || + opr->input(i)->dtype().enumv() == + DTypeEnum::QuantizedS4; + config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv()); + TensorType tensor_type = + i == 1 ? TensorType::WEIGHT : TensorType::FEATURE; + config.input_tensor_types.emplace_back(tensor_type); + } + available &= + opr->output(0)->dtype().enumv() == DTypeEnum::Quantized4Asymm || + opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS4; + config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv()); + available &= conv.param().sparse == Opr::Param::Sparse::DENSE; + config.input_tensor_formats = {TensorFormats::NHWC, TensorFormats::NHWC, + TensorFormats::NHWC, + TensorFormats::NHWC}; + config.output_tensor_formats = {TensorFormats::NHWC}; + if (available) + return config; + return None; + } +}; + +template +struct ConvTensorFormatsDispatcherImpl { + static Maybe dispatch( + const OperatorNodeBase* opr) { + const auto& conv = opr->cast_final_safe(); + OprTensorFormatsConfiguration config; + config.typeinfo = opr->dyn_typeinfo(); + config.opr_format = OprFormat::NCHW4; + bool available = true; + // setup dtypes + for (size_t i = 0; i < opr->input().size(); ++i) { + if (i == 2) + available &= opr->input(i)->dtype().enumv() == + DTypeEnum::QuantizedS32; + else + available &= opr->input(i)->dtype().enumv() == + DTypeEnum::QuantizedS8; + config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv()); + TensorType tensor_type = + i == 1 ? 
TensorType::WEIGHT : TensorType::FEATURE; + config.input_tensor_types.emplace_back(tensor_type); + } + available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8; + config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv()); + // setup tensor formats + if (conv.param().sparse == Opr::Param::Sparse::DENSE) { + config.input_tensor_formats = { + TensorFormats::NCHWc4, TensorFormats::NCHWc4, + TensorFormats::NCHWc4, TensorFormats::NCHWc4}; + } else { + mgb_assert(conv.param().sparse == Opr::Param::Sparse::GROUP); + if (is_channel_wise_conv(opr)) { + config.input_tensor_formats = { + TensorFormats::NCHWc4, TensorFormats::C11RSc4, + TensorFormats::NCHWc4, TensorFormats::NCHWc4}; + } else { + config.input_tensor_formats = { + TensorFormats::NCHWc4, TensorFormats::GKCRSc4, + TensorFormats::NCHWc4, TensorFormats::NCHWc4}; + } + } + config.output_tensor_formats = {TensorFormats::NCHWc4}; + if (available) + return config; + return None; + } +}; + +template +struct ConvTensorFormatsDispatcherImpl { + static Maybe dispatch( + const OperatorNodeBase* opr) { + const auto& conv = opr->cast_final_safe(); + OprTensorFormatsConfiguration config; + config.typeinfo = opr->dyn_typeinfo(); + config.opr_format = OprFormat::NCHW32; + bool available = true; + for (size_t i = 0; i < opr->input().size(); ++i) { + if (i == 2) + available &= opr->input(i)->dtype().enumv() == + DTypeEnum::QuantizedS32; + else + available &= opr->input(i)->dtype().enumv() == + DTypeEnum::QuantizedS8; + config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv()); + TensorType tensor_type = + i == 1 ? TensorType::WEIGHT : TensorType::FEATURE; + config.input_tensor_types.emplace_back(tensor_type); + } + available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8; + config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv()); + available &= conv.param().sparse == Opr::Param::Sparse::DENSE; + config.input_tensor_formats = { + TensorFormats::NCHWc32, TensorFormats::NCHWc32, + TensorFormats::NCHWc32, TensorFormats::NCHWc32}; + config.output_tensor_formats = {TensorFormats::NCHWc32}; + if (available) + return config; + return None; + } +}; + +template +struct ConvTensorFormatsDispatcherImpl { + static Maybe dispatch( + const OperatorNodeBase* opr) { + const auto& conv = opr->cast_final_safe(); + OprTensorFormatsConfiguration config; + config.typeinfo = opr->dyn_typeinfo(); + config.opr_format = OprFormat::NCHW64; + bool available = true; + for (size_t i = 0; i < opr->input().size(); ++i) { + if (i == 2) + available &= opr->input(i)->dtype().enumv() == + DTypeEnum::QuantizedS32; + else + available &= opr->input(i)->dtype().enumv() == + DTypeEnum::Quantized4Asymm || + opr->input(i)->dtype().enumv() == + DTypeEnum::QuantizedS4; + config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv()); + TensorType tensor_type = + i == 1 ? 
TensorType::WEIGHT : TensorType::FEATURE; + config.input_tensor_types.emplace_back(tensor_type); + } + available &= + opr->output(0)->dtype().enumv() == DTypeEnum::Quantized4Asymm || + opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS4; + config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv()); + available &= conv.param().sparse == Opr::Param::Sparse::DENSE; + config.input_tensor_formats = { + TensorFormats::NCHWc64, TensorFormats::NCHWc64, + TensorFormats::NCHWc64, TensorFormats::NCHWc64}; + config.output_tensor_formats = {TensorFormats::NCHWc64}; + if (available) + return config; + return None; + } +}; + +template +struct ConvTensorFormatsDispatcherImpl { + static Maybe dispatch( + const OperatorNodeBase* opr) { + const auto& conv = opr->cast_final_safe(); + OprTensorFormatsConfiguration config; + config.typeinfo = opr->dyn_typeinfo(); + config.opr_format = OprFormat::CHWN4; + bool available = true; + for (size_t i = 0; i < opr->input().size(); ++i) { + if (i == 2) + available &= opr->input(i)->dtype().enumv() == + DTypeEnum::QuantizedS32; + else + available &= opr->input(i)->dtype().enumv() == + DTypeEnum::QuantizedS8; + config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv()); + TensorType tensor_type = + i == 1 ? TensorType::WEIGHT : TensorType::FEATURE; + config.input_tensor_types.emplace_back(tensor_type); + } + available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8; + config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv()); + available &= conv.param().sparse == Opr::Param::Sparse::DENSE; + config.input_tensor_formats = { + TensorFormats::CHWNc4, TensorFormats::CHWNc4, + TensorFormats::CHWNc4, TensorFormats::CHWNc4}; + config.output_tensor_formats = {TensorFormats::CHWNc4}; + if (available) + return config; + return None; + } +}; + +template <> +struct ConvTensorFormatsDispatcherImpl { + using Opr = opr::ConvolutionBackwardData; + static Maybe dispatch( + const OperatorNodeBase* opr) { + const auto& conv = opr->cast_final_safe(); + OprTensorFormatsConfiguration config; + config.typeinfo = opr->dyn_typeinfo(); + config.opr_format = OprFormat::NCHW; + // setup dtypes + for (size_t i = 0; i < opr->input().size(); ++i) { + config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv()); + TensorType tensor_type = + i == 0 ? 
TensorType::WEIGHT : TensorType::FEATURE; + config.input_tensor_types.emplace_back(tensor_type); + } + config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv()); + // setup tensor formats + if (conv.param().sparse == Opr::Param::Sparse::DENSE) { + config.input_tensor_formats = { + TensorFormats::NCHW, TensorFormats::NCHW, + TensorFormats::NCHW, TensorFormats::NCHW}; + } else { + mgb_assert(conv.param().sparse == Opr::Param::Sparse::GROUP); + if (is_channel_wise_conv(opr)) { + config.input_tensor_formats = { + TensorFormats::C11RS, TensorFormats::NCHW, + TensorFormats::NCHW, TensorFormats::NCHW}; + } else { + config.input_tensor_formats = { + TensorFormats::GKCRS, TensorFormats::NCHW, + TensorFormats::NCHW, TensorFormats::NCHW}; + } + } + config.output_tensor_formats = {TensorFormats::NCHW}; + return config; + } +}; + +template <> +struct ConvTensorFormatsDispatcherImpl { + using Opr = opr::ConvolutionBackwardData; + static Maybe dispatch( + const OperatorNodeBase* opr) { + const auto& conv = opr->cast_final_safe(); + OprTensorFormatsConfiguration config; + config.typeinfo = opr->dyn_typeinfo(); + config.opr_format = OprFormat::NCHW4; + bool available = true; + for (size_t i = 0; i < opr->input().size(); ++i) { + available &= + opr->input(i)->dtype().enumv() == DTypeEnum::QuantizedS8; + config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv()); + TensorType tensor_type = + i == 0 ? TensorType::WEIGHT : TensorType::FEATURE; + config.input_tensor_types.emplace_back(tensor_type); + } + available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8; + config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv()); + available &= conv.param().sparse == opr::ConvBias::Param::Sparse::DENSE; + config.input_tensor_formats = { + TensorFormats::NCHWc4, TensorFormats::NCHWc4, + TensorFormats::NCHWc4, TensorFormats::NCHWc4}; + config.output_tensor_formats = {TensorFormats::NCHWc4}; + if (available) + return config; + return None; + } +}; + +struct StaticData { + struct KeyHash { + size_t operator()(const std::pair& val) const { + size_t h1 = mgb::hash(val.first); + size_t h2 = + std::hash()(static_cast(val.second)); + return mgb::hash_pair_combine(h1, h2); + } + }; + using OprTensorFormatsDispatcher = + OprTensorFormatsConfiguration::OprTensorFormatsDispatcher; + std::unordered_map, + OprTensorFormatsDispatcher, KeyHash> + typefmt2dispatcher; + StaticData(); +}; + +StaticData::StaticData() { +#define OPR_TENSOR_FORMATS_CONFIG_REG(_Opr, _fmt) \ + typefmt2dispatcher[{opr::_Opr::typeinfo(), OprFormat::_fmt}] = \ + [](const OperatorNodeBase* opr) { \ + MIDOUT_B(opr::_Opr, midout_iv(OprFormat::_fmt)) \ + return ConvTensorFormatsDispatcherImpl< \ + opr::_Opr, OprFormat::_fmt>::dispatch(opr); \ + MIDOUT_E \ + } + +#define OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(_Opr, _fmt) \ + typefmt2dispatcher[{opr::_Opr::typeinfo(), OprFormat::_fmt}] = \ + [](const OperatorNodeBase* opr) { \ + MIDOUT_B(opr::_Opr, midout_iv(OprFormat::_fmt)) \ + return OprSingleInOutTensorFormatsDispatcherImpl< \ + OprFormat::_fmt>::dispatch(opr); \ + MIDOUT_E \ + } + + OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW); + OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NHWC); + OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW4); + OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, CHWN4); + OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW32); + OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW64); + + OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionForward, NCHW); + OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionForward, NCHW4); + + 
OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionBackwardData, NCHW);
+ OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionBackwardData, NCHW4);
+
+ OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(WarpPerspectiveForward, NCHW);
+ OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(WarpPerspectiveForward, NHWC);
+ OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(WarpPerspectiveForward, NCHW4);
+ OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(WarpPerspectiveForward, NCHW64);
+
+ OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, NCHW);
+ OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, NHWC);
+ OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, NCHW4);
+ OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, CHWN4);
+ OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, NCHW32);
+ OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, NCHW64);
+
+#undef OPR_TENSOR_FORMATS_CONFIG_REG
+#undef OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG
+}
+
+StaticData& static_data() {
+ static StaticData inst;
+ return inst;
+}
+} // namespace
+
+OprTensorFormatsConfiguration::OprTensorFormatsDispatcher*
+OprTensorFormatsConfiguration::find_dispatcher_by_type_format(
+ Typeinfo* type, OprFormat opr_format) {
+ auto&& typefmt2dispatcher = static_data().typefmt2dispatcher;
+ auto iter = typefmt2dispatcher.find(std::make_pair(type, opr_format));
+ mgb_assert(iter != typefmt2dispatcher.end(),
+ "cannot find OprTensorFormatsDispatcher for opr type(%s) and "
+ "opr format(%s)",
+ type->name, opr_format_to_string(opr_format));
+ return &iter->second;
+}
+
+// vim: syntax=cpp.doxygen
diff --git a/src/gopt/impl/profiler_impl.cpp b/src/gopt/impl/profiler_impl.cpp
new file mode 100644
index 000000000..aa9529644
--- /dev/null
+++ b/src/gopt/impl/profiler_impl.cpp
@@ -0,0 +1,527 @@
+/**
+ * \file src/gopt/impl/profiler_impl.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ */ + +#include "./opr_format_modifier.h" +#include "./utils.h" +#include "megbrain/gopt/framework.h" +#include "megbrain/gopt/global_layout_transform.h" +#include "megbrain/graph/event.h" +#include "megbrain/opr/dnn/pooling.h" +#include "megbrain/opr/imgproc.h" +#include "megbrain/opr/io.h" +#include "megbrain/plugin/base.h" +#include "megbrain/serialization/sereg.h" + +using namespace mgb; +using namespace cg; +using namespace opr; +using namespace gopt; +using ReformatKey = ReformatManager::ReformatKey; + +namespace { +using OprFormat = Problem::OprFormat; +OprFormat tensor_formats_to_opr_format(TensorFormats tensor_format) { + switch (tensor_format) { + case TensorFormats::NCHW: + return OprFormat::NCHW; + case TensorFormats::NCHWc4: + return OprFormat::NCHW4; + case TensorFormats::NCHWc8: + return OprFormat::NCHW8; + case TensorFormats::NCHWc32: + return OprFormat::NCHW32; + case TensorFormats::NCHWc64: + return OprFormat::NCHW64; + case TensorFormats::NHWC: + return OprFormat::NHWC; + case TensorFormats::CHWNc4: + return OprFormat::CHWN4; + default: + mgb_throw(MegBrainError, "tensor format(%u) is not supported", + static_cast(tensor_format)); + } +} + +class GraphPartitionProfiler final : public PluginBase { + using CompNodeEventPtr = std::unique_ptr; + +public: + using OprFilter = thin_function; + struct OprKernEvent { + CompNodeEventPtr start, end; + }; + GraphPartitionProfiler(ComputingGraph* graph, OprFilter opr_filter); + ~GraphPartitionProfiler() noexcept; + float duration_in_usec() const; + +private: + void record_event(CompNodeEventPtr& dest, CompNode cn) { + if (dest == nullptr) + dest = cn.create_event(CompNode::Event::NEED_TIMER); + dest->record(); + } + ThinHashMap m_kern_event; + OprFilter m_opr_filter; +}; + +GraphPartitionProfiler::GraphPartitionProfiler(ComputingGraph* graph, + OprFilter opr_filter) + : PluginBase(graph), m_opr_filter(opr_filter) { + using namespace event; + auto on_before_kern = [this](BeforeKernel const& event) { + if (!m_opr_filter(event.opr)) + return; + auto evptr = &m_kern_event[event.opr].start; + record_event(*evptr, event.comp_node); + }; + auto on_after_kern = [this](AfterKernel const& event) { + if (!m_opr_filter(event.opr)) + return; + auto evptr = &m_kern_event[event.opr].end; + record_event(*evptr, event.comp_node); + }; + auto&& ev = graph->event(); + add_event_handler(ev.register_receiver(on_before_kern)); + add_event_handler(ev.register_receiver(on_after_kern)); +} + +GraphPartitionProfiler::~GraphPartitionProfiler() noexcept { + auto wait = [](const CompNodeEventPtr& ev) { + if (ev) + ev->host_wait(); + }; + for (auto&& i : m_kern_event) { + wait(i.second.start); + wait(i.second.end); + } +} + +float GraphPartitionProfiler::duration_in_usec() const { + float device_duration = 0.f; + for (auto&& kern_ev : m_kern_event) { + auto&& event = kern_ev.second; + event.end->host_wait(); + device_duration += 1e6 * event.start->elapsed_time_until(*event.end); + } + return device_duration; +} + +/*! 
+ * \brief An operator that indicates its input var node is contiguous + */ +// clang-format off +MGB_DEFINE_OPR_CLASS(MarkInputContiguous, SingleCNOperatorNodeBase) //{ + void scn_do_execute() override {}; + void init_output_static_infer_desc() override; + void add_input_layout_constraint() override { + input(0)->add_layout_constraint_contiguous(); + } +public: + MarkInputContiguous(VarNode* input, const OperatorNodeConfig& config); + static SymbolVar make(SymbolVar input, const OperatorNodeConfig& config = {}); +}; +// clang-format on + +MGB_DYN_TYPE_OBJ_FINAL_IMPL(MarkInputContiguous); + +MarkInputContiguous::MarkInputContiguous(VarNode* input, + const OperatorNodeConfig& config) + : Super(input->owner_graph(), config, "mark_contiguous", {input}) { + add_input({input}); + add_output(None); +} + +SymbolVar MarkInputContiguous::make(SymbolVar input, + const OperatorNodeConfig& config) { + return input.insert_single_output_opr(input.node(), + config); +} + +void MarkInputContiguous::init_output_static_infer_desc() { + using namespace cg::static_infer; + auto&& mgr = owner_graph()->static_infer_manager(); + mgr.register_shape_infer(output(0), + ShapeInferDesc::make_identity(input(0))); +} +} // namespace + +/* ================== ProfilerImpl =================*/ +class ProfilerImpl final : public ProfilerBase { +public: + ProfilerImpl(int runs = 10) : m_runs{runs} {}; + ~ProfilerImpl() = default; + ProfilingResult profile(const Problem& problem) const override; + +private: + static constexpr float PROFILE_TIME_OUT = 1e7; + /*! + * \brief profile opr format agnostic operators (like elemwise, elemwise multi type, typecvt etc.) + * + * \param opr pointer to the operator node to be profiled + * \param base_format the original tensor format of the operator node. + * \param available_tensor_formats the available tensor formats + * \return the operator node record + */ + OperatorNodeRecord profile_operator( + const OperatorNodeBase* opr, TensorFormats base_format, + const SmallVector& available_tensor_formats) const; + float profile_operator(const OperatorNodeBase* opr, + TensorFormats base_format, + TensorFormats tensor_format) const; + /*! + * \brief profile opr format aware operators (like conv, deconv, conv_bias, etc.) + * + * \param opr pointer to the operator node to be profiled + * \param base_config the tensor formats configuration of base opr format + * \param config all the available configuration + * \return the operator node record + */ + OperatorNodeRecord profile_operator( + const OperatorNodeBase* opr, + const OprTensorFormatsConfiguration& base_config, + const SmallVector& available_configs) + const; + float profile_operator(const OperatorNodeBase* opr, + const OprTensorFormatsConfiguration& base_config, + const OprTensorFormatsConfiguration& config) const; + /*! 
+ * \brief profile layout transform of the var node
+ *
+ * \param var pointer to the var node to be profiled
+ * \param base_format the original tensor formats in which the var node is stored
+ * \param available_tensor_formats the available tensor formats
+ * \param extra_attribute the extra attributes (options) of the problem
+ * \return the var node record
+ */
+ VarNodeRecord profile_var_node(
+ const VarNode* var, TensorFormats base_format,
+ const SmallVector& available_tensor_formats,
+ ReformatKey::Attribute extra_attribute =
+ ReformatKey::Attribute::DEFAULT) const;
+ float profile_var_node(const VarNode* var, TensorFormats base_format,
+ const ReformatKey& key) const;
+ int m_runs; /// number of runs for each profiling measurement
+};
+
+ProfilerImpl::OperatorNodeRecord ProfilerImpl::profile_operator(
+ const OperatorNodeBase* opr, TensorFormats base_format,
+ const SmallVector& available_tensor_formats) const {
+ OperatorNodeRecord record;
+ record.opr = opr;
+ auto& costs = record.costs;
+ for (auto&& f : available_tensor_formats) {
+ auto opr_format = tensor_formats_to_opr_format(f);
+ costs[opr_format] = profile_operator(opr, base_format, f);
+ }
+ return record;
+}
+
+float ProfilerImpl::profile_operator(const OperatorNodeBase* opr,
+ TensorFormats base_format,
+ TensorFormats tensor_format) const {
+ auto graph = ComputingGraph::make();
+ graph->options().graph_opt_level = 0;
+ graph->options().var_sanity_check_first_run = false;
+ VarNodeArray new_inps(opr->input().size());
+ for (size_t i = 0; i < opr->input().size(); ++i) {
+ auto&& var = opr->input(i);
+ auto&& cn = var->comp_node();
+ auto&& dtype = var->dtype();
+ auto dval = std::make_shared(cn, dtype);
+ auto aligned_tensor_shape =
+ make_aligned_tensor_shape(var, base_format, tensor_format);
+ dval->resize(aligned_tensor_shape);
+ auto aligned_var = opr::VolatileSharedDeviceTensor::make(*graph, dval);
+ new_inps[i] = aligned_var.node();
+ }
+ auto new_opr = serialization::copy_opr_shallow(
+ *opr, new_inps, opr->config(), {graph.get()});
+ auto y = new_opr->output(0);
+ auto mark = MarkInputContiguous::make(SymbolVar(y));
+ auto func = graph->compile({{mark, {}}});
+ auto filter = [new_opr](OperatorNodeBase* opr) { return opr == new_opr; };
+ auto profiler = std::make_unique(graph.get(),
+ std::move(filter));
+ for (int i = 0; i < m_runs; ++i)
+ func->execute();
+ return profiler->duration_in_usec();
+}
+
+ProfilerImpl::OperatorNodeRecord ProfilerImpl::profile_operator(
+ const OperatorNodeBase* opr,
+ const OprTensorFormatsConfiguration& base_config,
+ const SmallVector& available_configs)
+ const {
+ OperatorNodeRecord record;
+ record.opr = opr;
+ auto& costs = record.costs;
+ for (auto&& i : available_configs) {
+ costs[i.opr_format] = profile_operator(opr, base_config, i);
+ }
+ return record;
+}
+
+float ProfilerImpl::profile_operator(
+ const OperatorNodeBase* opr,
+ const OprTensorFormatsConfiguration& base_config,
+ const OprTensorFormatsConfiguration& config) const {
+ auto graph = ComputingGraph::make();
+ graph->options().graph_opt_level = 0;
+ graph->options().var_sanity_check_first_run = false;
+ VarNodeArray new_inps(opr->input().size());
+ size_t i = 0;
+ size_t nr_input_tensor =
+ std::min(config.input_tensor_formats.size(), opr->input().size());
+ for (; i < nr_input_tensor; ++i) {
+ auto&& var = opr->input(i);
+ auto&& cn = var->comp_node();
+ auto&& dtype = var->dtype();
+ auto dval = std::make_shared(cn, dtype);
+ TensorShape aligned_shape;
+ if (config.input_tensor_types[i] == TensorType::WEIGHT) {
mgb_assert(base_config.input_tensor_types[i] == TensorType::WEIGHT); + aligned_shape = make_aligned_weight_shape( + var, base_config.input_tensor_formats[i], + config.input_tensor_formats[i], + config.output_tensor_formats[0]); + } else { + mgb_assert(base_config.input_tensor_types[i] == + config.input_tensor_types[i]); + mgb_assert(base_config.input_tensor_types[i] == + TensorType::FEATURE); + aligned_shape = make_aligned_tensor_shape( + var, base_config.input_tensor_formats[i], + config.input_tensor_formats[i]); + } + dval->resize(aligned_shape); + auto aligned_var = opr::VolatileSharedDeviceTensor::make(*graph, dval); + new_inps[i] = aligned_var.node(); + } + for (; i < opr->input().size(); ++i) { + auto&& var = opr->input(i); + auto&& cn = var->comp_node(); + auto&& dtype = var->dtype(); + auto hval = std::make_shared(cn, dtype); + hval->resize(var->shape()); + auto cb = [&](DeviceTensorND& d) { hval->copy_from(d).sync(); }; + { + auto cg = var->owner_graph(); + cg->compile({{var, cb}})->execute(); + } + auto imm = opr::ImmutableTensor::make(*graph, *hval); + new_inps[i] = imm.node(); + } + VarNode* y = mgb::gopt::intl::modify_opr_format(config.opr_format, new_inps, + opr); +#if 0 + static const ThinHashSet multi_algo_oprs = { + opr::Convolution::typeinfo(), + opr::ConvBiasForward::typeinfo(), + opr::ConvolutionBackwardData::typeinfo(), + opr::PoolingForward::typeinfo(), + }; + if (multi_algo_oprs.count(opr->dyn_typeinfo()) && + !mgb::gopt::intl::has_available_algo(new_inps, y->owner_opr())) + return PROFILE_TIME_OUT; +#endif + auto mark = MarkInputContiguous::make(SymbolVar(y)); + auto func = graph->compile({{mark, {}}}); + auto new_opr = y->owner_opr(); + auto filter = [&new_opr](OperatorNodeBase* opr) { return opr == new_opr; }; + auto profiler = std::make_unique(graph.get(), + std::move(filter)); + for (int i = 0; i < m_runs; ++i) + func->execute(); + return profiler->duration_in_usec(); +} + +ProfilerImpl::VarNodeRecord ProfilerImpl::profile_var_node( + const VarNode* var, TensorFormats base_format, + const SmallVector& available_tensor_formats, + ReformatKey::Attribute attribute) const { + VarNodeRecord record; + record.var = var; + auto& costs = record.costs; + for (auto&& i : available_tensor_formats) { + for (auto&& o : available_tensor_formats) { + if (i == o) + continue; + ReformatKey key{i, o, attribute, var->dtype().enumv(), + var->dtype().enumv()}; + costs[{i, o}] = profile_var_node(var, base_format, key); + } + } + return record; +} + +float ProfilerImpl::profile_var_node(const VarNode* var, + TensorFormats base_format, + const ReformatKey& key) const { + auto&& cn = var->comp_node(); + auto&& dtype = var->dtype(); + auto dval = std::make_shared(cn, dtype); + auto aligned_tensor_shape = + make_aligned_tensor_shape(var, base_format, key.input_format); + dval->resize(aligned_tensor_shape); + auto graph = ComputingGraph::make(); + graph->options().graph_opt_level = 0; + graph->options().var_sanity_check_first_run = false; + auto aligned_var = opr::VolatileSharedDeviceTensor::make(*graph, dval); + auto builder = ReformatManager::instance().auto_aligned_reformat_featrue( + var, base_format, key); + auto y = builder({aligned_var.node()}); + ThinHashSet set; + DepOprIter iter([&set](OperatorNodeBase* opr) { set.insert(opr); }); + iter.add(y->owner_opr()); + iter.set_visited(aligned_var.node()->owner_opr()); + auto mark = MarkInputContiguous::make(SymbolVar(y)); + auto func = graph->compile({{mark, {}}}); + auto filter = [&set](OperatorNodeBase* opr) { return set.count(opr) > 0; 
}; + auto profiler = std::make_unique(graph.get(), + std::move(filter)); + for (int i = 0; i < m_runs; ++i) + func->execute(); + return profiler->duration_in_usec(); +} + +ProfilerImpl::ProfilingResult ProfilerImpl::profile( + const Problem& problem) const { + ConstVarPropogate cvprop{ConstVarType::IMMUTABLE_AND_PARAM}; + { + auto cb = [&cvprop](OperatorNodeBase* opr) { cvprop.add_opr(opr); }; + DepOprIter iter{cb}; + for (auto&& o : problem.graph_partition().output()) { + iter.add(o->owner_opr()); + } + } + + static const ThinHashMap format_aware_input_tensors = { +#define cb(_Opr, _arity) {_Opr::typeinfo(), _arity} + cb(Convolution, 2), + cb(ConvBiasForward, 4), + cb(ConvolutionBackwardData, 2), + cb(PoolingForward, 1), + cb(WarpPerspective, 1), + cb(Resize, 1), +#undef cb + }; + ThinHashSet vars; + ThinHashSet oprs; + { + auto cb = [&cvprop, &vars, &oprs](OperatorNodeBase* opr) { + if (cvprop.is_const(opr)) + return; + oprs.insert(opr); + auto find = format_aware_input_tensors.find(opr->dyn_typeinfo()); + if (find == format_aware_input_tensors.end()) { + for (auto&& i : opr->input()) { + if (!cvprop.is_const(i)) { + vars.insert(i); + } + } + } else { + size_t nr_input_tensor = + std::min(find->second, opr->input().size()); + for (size_t i = 0; i < nr_input_tensor; ++i) { + if (!cvprop.is_const(opr->input(i))) { + vars.insert(opr->input(i)); + } + } + } + vars.insert(opr->output(0)); + }; + DepOprIter iter{cb}; + for (auto&& i : problem.graph_partition().input()) { + iter.set_visited(i->owner_opr()); + } + for (auto&& o : problem.graph_partition().output()) { + iter.add(o->owner_opr()); + } + } + + auto base_format = problem.base_format(); + auto&& available_tensor_formats = problem.available_tensor_formats(); + + ProfilingResult profiling_result; + auto& opr_record = profiling_result.opr_record; + auto& var_record = profiling_result.var_record; + for (auto&& var : vars) { + var_record[var] = + profile_var_node(var, base_format, available_tensor_formats); + } + for (auto&& opr : oprs) { + auto&& opr_configs = problem.opr_configs(); + auto find = opr_configs.find(opr->dyn_typeinfo()); + if (find == opr_configs.end()) { + opr_record[opr] = profile_operator(opr, base_format, + available_tensor_formats); + } else { + auto&& dispatchers = find->second; + SmallVector configs; + for (const auto& item : dispatchers) { + auto config = (*item.second)(opr); + if (config.valid()) { + configs.emplace_back(config.val()); + } + } + auto base_config = problem.base_config(opr); + opr_record[opr] = profile_operator(opr, base_config, configs); + } + } + for (auto&& rpair : opr_record) { + mgb_log_debug("%s", rpair.second.to_string().c_str()); + } + for (auto&& rpair : var_record) { + mgb_log_debug("%s", rpair.second.to_string().c_str()); + } + return profiling_result; +} + +/* ================== ProfilerBase =================*/ +std::string ProfilerBase::OperatorNodeRecord::to_string() const { + auto str = ssprintf("\nopr type: %s\nopr name: %s\ninputs:\n", + opr->dyn_typeinfo()->name, opr->cname()); + for (auto&& i : opr->input()) { + str += ssprintf("\tvar: %s\n\tshape: %s\n", i->cname(), + i->shape().to_string().c_str()); + } + str += ssprintf("outputs:\n\tvar: %s\n\tshape: %s\ncosts:\n", + opr->output(0)->cname(), + opr->output(0)->shape().to_string().c_str()); + for (auto&& cpair : costs) { + str += ssprintf("\tformat: %s; cost:%f", + opr_format_to_string(cpair.first), cpair.second); + } + return str; +} + +std::string ProfilerBase::VarNodeRecord::to_string() const { + auto str = ssprintf("\nvar: 
%s\ncosts:", var->cname()); + for (auto&& cpair : costs) { + auto&& formats = cpair.first; + str += ssprintf("\n\tformat: (i:%s;o:%s); cost:%f", + tensor_formats_to_named_tensor_shape(formats.first) + .to_string() + .c_str(), + tensor_formats_to_named_tensor_shape(formats.second) + .to_string() + .c_str(), + cpair.second); + } + return str; +} + +std::unique_ptr ProfilerBase::make_profiler() { + return std::make_unique(); +} + +// vim: syntax=cpp.doxygen diff --git a/src/gopt/impl/reformat_emitter.cpp b/src/gopt/impl/reformat_emitter.cpp index db3a3d0c4..5739626c6 100644 --- a/src/gopt/impl/reformat_emitter.cpp +++ b/src/gopt/impl/reformat_emitter.cpp @@ -247,16 +247,36 @@ ReformatEmitter::UnderlyingBuilders ReformatEmitter::analyze() const { /* ============== PaddingEmitter ================= */ PaddingEmitter::EmitResult PaddingEmitter::emit() const { + auto&& padshp = m_padshp; auto&& const_extent = m_const_extent; auto&& axis = m_axis; - auto builder = [const_extent, axis](const VarNodeArray& vars) { + auto builder = [padshp, const_extent, axis](const VarNodeArray& vars) { auto i = vars[0]; auto padding_shp_var = vars[1]; TensorShape shape; shape.ndim = i->shape().ndim; for (size_t ax = 0; ax < shape.ndim; ++ax) shape[ax] = 1; - shape[axis] = const_extent; + // avoid making a scalar lowbit tensor + if (!i->dtype().is_low_bit() || const_extent != 1) + shape[axis] = const_extent; + else { + size_t const_axis = 0; + size_t new_const_extent = const_extent; + for (size_t i = 0; i < padshp.ndim; ++i) { + const auto& dim = padshp[i]; + if (dim.extent() != Dimension::UNDETERMINED_EXTENT && + dim.extent() != 1) { + new_const_extent = dim.extent(); + const_axis = i; + break; + } + } + mgb_assert(new_const_extent != 1, + "cannot make an scalar lowbit tensor(got:%s)", + i->dtype().name()); + shape[const_axis] = new_const_extent; + } auto host_val = std::make_shared(i->comp_node(), i->dtype()); host_val->resize(shape); diff --git a/src/gopt/impl/reformat_manager.cpp b/src/gopt/impl/reformat_manager.cpp index decf4521c..79a4e2c81 100644 --- a/src/gopt/impl/reformat_manager.cpp +++ b/src/gopt/impl/reformat_manager.cpp @@ -13,6 +13,7 @@ #include "megbrain/gopt/reformat_manager.h" #include "megbrain/opr/tensor_manip.h" #include "megbrain/utils/arith_helper.h" +#include "./utils.h" using namespace mgb; using namespace gopt; @@ -32,68 +33,6 @@ int gcd(const int& p, const int& q) { } return x; } - -NamedTensorShape tensor_formats_to_named_tensor_shape(TensorFormats format) { - switch (format) { - case TensorFormats::NCHW: - return {{"N"}, {"C"}, {"H"}, {"W"}}; - case TensorFormats::NHWC: - return {{"N"}, {"H"}, {"W"}, {"C"}}; - case TensorFormats::NCHWc4: - return {{"N"}, {"C//4"}, {"H"}, {"W"}, {"C%4"}}; - case TensorFormats::NCHWc8: - return {{"N"}, {"C//8"}, {"H"}, {"W"}, {"C%8"}}; - case TensorFormats::NCHWc32: - return {{"N"}, {"C//32"}, {"H"}, {"W"}, {"C%32"}}; - case TensorFormats::NCHWc64: - return {{"N"}, {"C//64"}, {"H"}, {"W"}, {"C%64"}}; - case TensorFormats::CHWNc4: - return {{"C//4"}, {"H"}, {"W"}, {"N"}, {"C%4"}}; - case TensorFormats::NHCWc4: - return {{"N"}, {"H"}, {"C//4"}, {"W"}, {"C%4"}}; - case TensorFormats::KRSCk4: - return {{"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}}; - case TensorFormats::GKRSCk4: - return {{"G"}, {"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}}; - case TensorFormats::C1RSc4: - return {{"C//4"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}}; - case TensorFormats::KRSCk4c4: - return {{"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}}; - case TensorFormats::GKRSCk4c4: - return {{"G"}, 
{"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}}; - case TensorFormats::KCRSk4c4: - return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}}; - case TensorFormats::GKCRSk4c4: - return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}}; - case TensorFormats::KCRSc4k4: - return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}}; - case TensorFormats::GKCRSc4k4: - return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}}; - case TensorFormats::C11RSc4: - return {{"C//4"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}}; - case TensorFormats::KCRSc8k8: - return {{"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}}; - case TensorFormats::GKCRSc8k8: - return {{"G"}, {"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}}; - case TensorFormats::C11RSc8: - return {{"C//8"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%8"}}; - case TensorFormats::KRSCk8: - return {{"K//8"}, {"R"}, {"S"}, {"C"}, {"K%8"}}; - case TensorFormats::KCRSc4: - return {{"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}}; - case TensorFormats::GKCRSc4: - return {{"G"}, {"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}}; - case TensorFormats::KCRS: - return {{"K"}, {"C"}, {"R"}, {"S"}}; - case TensorFormats::GKCRS: - return {{"G"}, {"K"}, {"C"}, {"R"}, {"S"}}; - case TensorFormats::C11RS: - return {{"C"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}}; - default: - mgb_throw(AssertionError, "invalid tensor formats(%u)", - static_cast(format)); - } -} }; // namespace // =================== ReformatManager::ReformatKey ====================*/ @@ -393,8 +332,10 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue( tensor_formats_to_named_tensor_shape(key.input_format); NamedTensorShape output_shape = tensor_formats_to_named_tensor_shape(key.output_format); - size_t input_alignment, output_alignment; - size_t input_channel_idx, output_channel_idx; + size_t input_alignment = 0; + size_t output_alignment = 0; + size_t input_channel_idx = input_shape.ndim, + output_channel_idx = input_shape.ndim; for (size_t i = 0; i < input_shape.ndim; ++i) { if (input_shape[i].name() == Dimension::Name::C && input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) { @@ -411,6 +352,15 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue( break; } } + mgb_assert(input_channel_idx < input_shape.ndim && + output_channel_idx < input_shape.ndim, + "invalid channel idx(in_channel:%zu, out_channel:%zu, shp:%s)", + input_channel_idx, output_channel_idx, + input_shape.to_string().c_str()); + mgb_assert(input_alignment > 0 && output_alignment > 0, + "invalid alignment(in_channel:%zu, out_channel:%zu, shp:%s)", + input_alignment, output_alignment, + input_shape.to_string().c_str()); NamedTensorShape orig_shape = tensor_formats_to_named_tensor_shape(orig_format); size_t orig_channel = 0; @@ -448,8 +398,9 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue( auto make_shape = std::get<0>( MakeShapeEmitter{input_shape, padding_shape}.emit()); auto padding_shp_var = make_shape({x}); - auto padding = std::get<0>( - PaddingEmitter{const_extent, input_channel_idx}.emit()); + auto padding = std::get<0>(PaddingEmitter{ + padding_shape, const_extent, input_channel_idx} + .emit()); cur = padding({cur, padding_shp_var}); } cur = ReformatManager::instance().get(key)({cur}); @@ -469,9 +420,10 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight( const VarNode* orig_var, const ReformatKey& key, const AlignmentDesc& extra_alignment) const { size_t in_channels = 0, out_channels = 0; - size_t 
input_channel_idx, output_channel_idx; - Dimension::Name out_channel_name; + Dimension::Name out_channel_name = Dimension::Name::C; auto input_shape = tensor_formats_to_named_tensor_shape(key.input_format); + size_t input_channel_idx = input_shape.ndim, + output_channel_idx = input_shape.ndim; for (size_t i = 0; i < input_shape.ndim; ++i) { if (input_shape[i].name() == Dimension::Name::C && input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) { @@ -491,7 +443,15 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight( input_shape.to_string().c_str()); } } - size_t in_channel_alignment, out_channel_alignment = 1; + mgb_assert(out_channel_name == Dimension::Name::K || + out_channel_name == Dimension::Name::N, + "invalid out channel(shp:%s)", input_shape.to_string().c_str()); + mgb_assert(input_channel_idx < input_shape.ndim && + output_channel_idx < input_shape.ndim, + "invalid channel idx(in_channel:%zu, out_channel:%zu, shp:%s)", + input_channel_idx, output_channel_idx, + input_shape.to_string().c_str()); + size_t in_channel_alignment = 0, out_channel_alignment = 0; auto output_shape = tensor_formats_to_named_tensor_shape(key.output_format); for (size_t i = 0; i < output_shape.ndim; ++i) { if (output_shape[i].name() == Dimension::Name::C && @@ -502,6 +462,10 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight( out_channel_alignment = output_shape[i].stride(); } } + mgb_assert(in_channel_alignment > 0 && out_channel_alignment > 0, + "invalid alignment(in_align:%zu, out_align:%zu, shp:%s)", + in_channel_alignment, out_channel_alignment, + output_shape.to_string().c_str()); size_t aligned_in_channel = divup(in_channels, in_channel_alignment) * in_channel_alignment; if (extra_alignment.name == out_channel_name) { @@ -526,8 +490,9 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight( auto make_shape = std::get<0>( MakeShapeEmitter{input_shape, padding_shape}.emit()); auto padding_shp_var = make_shape({x}); - auto padding = std::get<0>( - PaddingEmitter{const_extent, input_channel_idx}.emit()); + auto padding = std::get<0>(PaddingEmitter{ + padding_shape, const_extent, input_channel_idx} + .emit()); cur = padding({cur, padding_shp_var}); } if (aligned_out_channel > out_channels) { @@ -540,8 +505,9 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight( auto make_shape = std::get<0>( MakeShapeEmitter{input_shape, padding_shape}.emit()); auto padding_shp_var = make_shape({cur}); - auto padding = std::get<0>( - PaddingEmitter{const_extent, output_channel_idx}.emit()); + auto padding = std::get<0>(PaddingEmitter{ + padding_shape, const_extent, output_channel_idx} + .emit()); cur = padding({cur, padding_shp_var}); } cur = ReformatManager::instance().get(key)({cur}); @@ -554,4 +520,81 @@ const ReformatManager& ReformatManager::instance() { static ReformatManager inst; return inst; } + +TensorShape mgb::gopt::make_aligned_tensor_shape(const VarNode* var, + TensorFormats orig_formats, + TensorFormats target_formats) { + using Dimension = megdnn::Dimension; + static constexpr uint32_t UNDETERMINED_EXTENT = + Dimension::UNDETERMINED_EXTENT; + auto orig_shape = tensor_formats_to_named_tensor_shape(orig_formats); + auto target_shape = tensor_formats_to_named_tensor_shape(target_formats); + + TensorShape oshp = var->shape(); + mgb_assert(oshp.is_scalar() || oshp.ndim == orig_shape.ndim, + "orig shape of var node is not compatible with tensor " + "formats(var:%s;shp:%s;fmt:%s)", + var->cname(),
oshp.to_string().c_str(), + orig_shape.to_string().c_str()); + if (oshp.is_scalar()) return oshp; + TensorShape tshp; + ThinHashMap<Dimension::Name, int> name2dominant; + for (size_t i = 0; i < orig_shape.ndim; ++i) { + auto name = orig_shape[i].name(); + if (orig_shape[i].extent() == UNDETERMINED_EXTENT) { + auto insert = name2dominant.insert(std::make_pair(name, i)); + mgb_assert(insert.second); + } + } + + tshp.ndim = target_shape.ndim; + for (size_t i = 0; i < target_shape.ndim; ++i) { + auto name = target_shape[i].name(); + if (target_shape[i].extent() == UNDETERMINED_EXTENT) { + int idx = name2dominant.at(name); + bool mul = orig_shape[idx] < target_shape[i]; + size_t factor = mul ? (target_shape[i] / orig_shape[idx]).extent() + : (orig_shape[idx] / target_shape[i]).extent(); + if (mul) + tshp[i] = oshp[idx] * factor; + else + tshp[i] = divup(oshp[idx], factor); + } else { + tshp[i] = target_shape[i].extent(); + } + } + return tshp; +} + +TensorShape mgb::gopt::make_aligned_weight_shape(const VarNode* var, + TensorFormats orig_formats, + TensorFormats target_formats, + TensorFormats extra_formats) { + auto tshp = make_aligned_tensor_shape(var, orig_formats, target_formats); + auto extra_shape = tensor_formats_to_named_tensor_shape(extra_formats); + using Dimension = megdnn::Dimension; + static constexpr uint32_t UNDETERMINED_EXTENT = + Dimension::UNDETERMINED_EXTENT; + size_t out_channel_alignment = 1; + for (size_t i = 0; i < extra_shape.ndim; ++i) { + auto name = extra_shape[i].name(); + if (name == Dimension::Name::C && + extra_shape[i].extent() == UNDETERMINED_EXTENT) { + out_channel_alignment = extra_shape[i].stride(); + } + } + + auto target_shape = tensor_formats_to_named_tensor_shape(target_formats); + for (size_t i = 0; i < target_shape.ndim; ++i) { + auto name = target_shape[i].name(); + if ((name == Dimension::Name::K || name == Dimension::Name::N) && + target_shape[i].extent() == UNDETERMINED_EXTENT) { + size_t out_channels = tshp[i] * target_shape[i].stride(); + tshp[i] = divup(out_channels, out_channel_alignment) * + out_channel_alignment / target_shape[i].stride(); + } + } + return tshp; +} + // vim: syntax=cpp.doxygen diff --git a/src/gopt/impl/utils.h b/src/gopt/impl/utils.h new file mode 100644 index 000000000..335302081 --- /dev/null +++ b/src/gopt/impl/utils.h @@ -0,0 +1,105 @@ +/** + * \file src/gopt/impl/utils.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied.
+ */ + +#pragma once +#include "megbrain/gopt/global_layout_transform.h" + +namespace mgb { +namespace gopt { + +static inline const char* opr_format_to_string( + OprTensorFormatsConfiguration::OprFormat opr_format) { + using OprFormat = OprTensorFormatsConfiguration::OprFormat; +#define cb(_fmt) \ + case OprFormat::_fmt: \ + return #_fmt + switch (opr_format) { + cb(NCHW); + cb(NHWC); + cb(NCHW4); + cb(NCHW32); + cb(NCHW64); + cb(CHWN4); + default: + mgb_assert(false, "Invalid opr format(got:%u)", + static_cast<uint32_t>(opr_format)); + } +#undef cb +} + +static inline megdnn::NamedTensorShape tensor_formats_to_named_tensor_shape( + TensorFormats format) { + switch (format) { + case TensorFormats::NCHW: + return {{"N"}, {"C"}, {"H"}, {"W"}}; + case TensorFormats::NHWC: + return {{"N"}, {"H"}, {"W"}, {"C"}}; + case TensorFormats::NCHWc4: + return {{"N"}, {"C//4"}, {"H"}, {"W"}, {"C%4"}}; + case TensorFormats::NCHWc8: + return {{"N"}, {"C//8"}, {"H"}, {"W"}, {"C%8"}}; + case TensorFormats::NCHWc32: + return {{"N"}, {"C//32"}, {"H"}, {"W"}, {"C%32"}}; + case TensorFormats::NCHWc64: + return {{"N"}, {"C//64"}, {"H"}, {"W"}, {"C%64"}}; + case TensorFormats::CHWNc4: + return {{"C//4"}, {"H"}, {"W"}, {"N"}, {"C%4"}}; + case TensorFormats::NHCWc4: + return {{"N"}, {"H"}, {"C//4"}, {"W"}, {"C%4"}}; + case TensorFormats::KRSCk4: + return {{"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}}; + case TensorFormats::GKRSCk4: + return {{"G"}, {"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}}; + case TensorFormats::C1RSc4: + return {{"C//4"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}}; + case TensorFormats::KRSCk4c4: + return {{"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}}; + case TensorFormats::GKRSCk4c4: + return {{"G"}, {"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}}; + case TensorFormats::KCRSk4c4: + return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}}; + case TensorFormats::GKCRSk4c4: + return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}}; + case TensorFormats::KCRSc4k4: + return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}}; + case TensorFormats::GKCRSc4k4: + return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}}; + case TensorFormats::C11RSc4: + return {{"C//4"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}}; + case TensorFormats::KCRSc8k8: + return {{"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}}; + case TensorFormats::GKCRSc8k8: + return {{"G"}, {"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}}; + case TensorFormats::C11RSc8: + return {{"C//8"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%8"}}; + case TensorFormats::KRSCk8: + return {{"K//8"}, {"R"}, {"S"}, {"C"}, {"K%8"}}; + case TensorFormats::KCRSc4: + return {{"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}}; + case TensorFormats::GKCRSc4: + return {{"G"}, {"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}}; + case TensorFormats::KCRS: + return {{"K"}, {"C"}, {"R"}, {"S"}}; + case TensorFormats::GKCRS: + return {{"G"}, {"K"}, {"C"}, {"R"}, {"S"}}; + case TensorFormats::C11RS: + return {{"C"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}}; + default: + mgb_throw(AssertionError, "invalid tensor formats(%u)", + static_cast<uint32_t>(format)); + } +} + +} // namespace gopt +} // namespace mgb + +// vim: syntax=cpp.doxygen diff --git a/src/gopt/include/megbrain/gopt/global_layout_transform.h b/src/gopt/include/megbrain/gopt/global_layout_transform.h new file mode 100644 index 000000000..98432204d --- /dev/null +++ b/src/gopt/include/megbrain/gopt/global_layout_transform.h @@ -0,0 +1,176 @@ +/** + * \file src/gopt/include/megbrain/gopt/global_layout_transform.h + * MegEngine is Licensed
under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#pragma once +#include "megbrain/gopt/reformat_manager.h" +#include "megbrain/gopt/subgraph_extractor.h" +#include "megbrain/opr/dnn/convolution.h" + +namespace mgb { +namespace gopt { + +/*! + * \brief A structure that describes the data types and tensor formats + * configuration of the opr format + */ +struct OprTensorFormatsConfiguration { + using OprFormat = opr::ConvBias::Param::Format; + using OprTensorFormatsDispatcher = + thin_function<Maybe<OprTensorFormatsConfiguration>( + const cg::OperatorNodeBase*)>; + Typeinfo* typeinfo; + OprFormat opr_format; + SmallVector<DTypeEnum> input_dtypes; + SmallVector<DTypeEnum> output_dtypes; + SmallVector<TensorFormats> input_tensor_formats; + SmallVector<TensorType> input_tensor_types; + SmallVector<TensorFormats> output_tensor_formats; + static OprTensorFormatsDispatcher* find_dispatcher_by_type_format( + Typeinfo* type, OprFormat opr_format); +}; + +/*! + * \brief A structure that describes the global layout transform problem + */ +class Problem { +public: + using OprFormat = OprTensorFormatsConfiguration::OprFormat; + using OprTensorFormatsDispatcher = + OprTensorFormatsConfiguration::OprTensorFormatsDispatcher; + using OprConfigTrait = + ThinHashMap<Typeinfo*, + ThinHashMap<OprFormat, OprTensorFormatsDispatcher*>>; + struct Attribute { + OprFormat base_opr_format; /// the base opr format indicates that the + /// network to be optimized is constructed + /// in the base opr format, i.e. all the + /// format aware operators (conv, conv_bias, + /// deconv, pooling etc.) are built in + /// this format. + TensorFormats + base_tensor_formats; /// the base tensor format indicates that + /// all the format agnostic operators + /// (like elemwise, elemwise multi type, + /// typecvt etc.) are built in the base + /// tensor format.
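+ /// + /// A typical configuration, as exercised by the profiler tests in + /// src/gopt/test/profiler.cpp below, describes a network built with + /// NCHW-format operators and NCHW feature maps: + /// Attribute{OprFormat::NCHW, TensorFormats::NCHW}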
+ }; + Problem(const GraphPartition& graph_partition, + const SmallVector<TensorFormats>& available_tensor_formats, + const OprConfigTrait& opr_config, const Attribute& attribute) + : m_graph_partition{graph_partition}, + m_available_tensor_formats{available_tensor_formats}, + m_opr_configs{opr_config}, + m_attribute{attribute} {} + ~Problem() noexcept = default; + + const GraphPartition& graph_partition() const { return m_graph_partition; } + const OprConfigTrait& opr_configs() const { return m_opr_configs; } + const SmallVector<TensorFormats>& available_tensor_formats() const { + return m_available_tensor_formats; + } + TensorFormats base_format() const { + return m_attribute.base_tensor_formats; + } + OprTensorFormatsConfiguration base_config( + const cg::OperatorNodeBase* opr) const { + auto _ = OprTensorFormatsConfiguration::find_dispatcher_by_type_format( + opr->dyn_typeinfo(), m_attribute.base_opr_format); + auto rst = (*_)(opr); + if (rst.valid()) + return rst.val(); + OprTensorFormatsConfiguration config; + config.typeinfo = opr->dyn_typeinfo(); + config.opr_format = m_attribute.base_opr_format; + for (const auto& i : opr->input()) { + config.input_dtypes.emplace_back(i->dtype().enumv()); + config.input_tensor_formats.emplace_back( + m_attribute.base_tensor_formats); + config.input_tensor_types.emplace_back(TensorType::FEATURE); + } + config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv()); + config.output_tensor_formats.emplace_back( + m_attribute.base_tensor_formats); + return config; + } + +private: + const GraphPartition& m_graph_partition; /// the graph partition + const SmallVector<TensorFormats>& + m_available_tensor_formats; /// the available tensor formats, used + /// for format agnostic operators (like + /// elemwise, elemwise multi type, + /// typecvt, etc.) + const OprConfigTrait& + m_opr_configs; /// the available opr format configurations, used + /// for format aware operators (like conv, deconv, + /// conv_bias, etc.) + Attribute m_attribute; /// the extra attributes to describe the problem +}; + +/*! + * \brief A profiler that collects all the performance data to describe the + * global layout transform problem. + */ +class ProfilerBase { +public: + using OprFormat = Problem::OprFormat; + struct OperatorNodeRecord { + const cg::OperatorNodeBase* opr; ///< pointer to operator node + ThinHashMap<OprFormat, float> + costs; ///< costs of operator node, i.e. the elapsed device + ///< time of the operator node on different opr formats + ///< (layout configurations). + std::string to_string() const; + }; + struct VarNodeRecord { + struct KeyHash { + size_t operator()( + const std::pair<TensorFormats, TensorFormats>& val) const { + size_t h1 = + std::hash<uint32_t>()(static_cast<uint32_t>(val.first)); + size_t h2 = std::hash<uint32_t>()( + static_cast<uint32_t>(val.second)); + return mgb::hash_pair_combine(h1, h2); + } + }; + const VarNode* var; ///< pointer to var node + std::unordered_map<std::pair<TensorFormats, TensorFormats>, float, + KeyHash> + costs; ///< costs of var node, i.e. the elapsed + ///< device time of the layout transform. + ///< Key of the hashmap indicates the + ///< source tensor format and the target + ///< tensor format. + std::string to_string() const; + }; + /*!
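+ * \brief the collected profiling data of a layout transform problem. + * + * A consumer combines opr_record and var_record to estimate the total + * device time of a candidate layout assignment. A minimal sketch, given + * a ProfilingResult result and a hypothetical selector choose_format() + * (not part of this interface): + * + * float total = 0.f; + * for (auto&& kv : result.opr_record) + * total += kv.second.costs.at(choose_format(kv.first)); + *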
+ * \note the profiler assumes all the input and output var nodes are stored + * in contiguous layout in memory + */ + struct ProfilingResult { + /// A hashmap that maps the operator node to the costs (device elapsed + /// time) of different layout configurations + ThinHashMap<cg::OperatorNodeBase*, OperatorNodeRecord> opr_record; + /// A hashmap that maps the var node to the costs of layout transform + ThinHashMap<VarNode*, VarNodeRecord> var_record; + }; + + ProfilerBase() = default; + virtual ~ProfilerBase() = default; + virtual ProfilingResult profile(const Problem& problem) const = 0; + static std::unique_ptr<ProfilerBase> make_profiler(); +}; + +} // namespace gopt +} // namespace mgb + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/src/gopt/include/megbrain/gopt/reformat_emitter.h b/src/gopt/include/megbrain/gopt/reformat_emitter.h index 9a1a2af18..27ddb86a5 100644 --- a/src/gopt/include/megbrain/gopt/reformat_emitter.h +++ b/src/gopt/include/megbrain/gopt/reformat_emitter.h @@ -80,11 +80,13 @@ private: class PaddingEmitter final : public Emitter { public: - PaddingEmitter(size_t const_extent, size_t axis) - : m_const_extent{const_extent}, m_axis{axis} {} + PaddingEmitter(const megdnn::NamedTensorShape& padshp, size_t const_extent, + size_t axis) + : m_padshp{padshp}, m_const_extent{const_extent}, m_axis{axis} {} EmitResult emit() const override; private: + megdnn::NamedTensorShape m_padshp; size_t m_const_extent, m_axis; }; diff --git a/src/gopt/include/megbrain/gopt/reformat_manager.h b/src/gopt/include/megbrain/gopt/reformat_manager.h index 58a01bc58..9b1c2652a 100644 --- a/src/gopt/include/megbrain/gopt/reformat_manager.h +++ b/src/gopt/include/megbrain/gopt/reformat_manager.h @@ -17,6 +17,11 @@ namespace mgb { namespace gopt { +enum class TensorType : uint32_t { + FEATURE = 0, + WEIGHT = 1, +}; + enum class TensorFormats : uint32_t { // input tensor formats NCHW = 0, ///< [N, C, H, W] @@ -116,6 +121,15 @@ public: private: ReformatCache m_cache; }; + +TensorShape make_aligned_tensor_shape(const VarNode* var, + TensorFormats orig_formats, + TensorFormats target_formats); + +TensorShape make_aligned_weight_shape(const VarNode* var, + TensorFormats orig_formats, + TensorFormats target_formats, + TensorFormats extra_formats); } // namespace gopt } // namespace mgb diff --git a/src/gopt/include/megbrain/gopt/subgraph_extractor.h b/src/gopt/include/megbrain/gopt/subgraph_extractor.h index f6e09a48d..317a8c434 100644 --- a/src/gopt/include/megbrain/gopt/subgraph_extractor.h +++ b/src/gopt/include/megbrain/gopt/subgraph_extractor.h @@ -20,6 +20,7 @@ class GraphPartition { public: using VarNodeSet = ThinHashSet<VarNode*>; using OperatorNodeSet = ThinHashSet<cg::OperatorNodeBase*>; + class InputPlaceholder; GraphPartition() = default; @@ -45,13 +46,13 @@ private: class SubGraphExtractor { public: using OprList = ThinHashSet<Typeinfo*>; - SubGraphExtractor(OprList opr_list) : m_opr_list{opr_list} {}; + SubGraphExtractor(const OprList& opr_list) : m_opr_list{opr_list} {}; std::vector<GraphPartition> extract( const SymbolVarArray& endpoint_vars) const; private: class Impl; - OprList m_opr_list; + const OprList& m_opr_list; }; } // namespace gopt diff --git a/src/gopt/test/profiler.cpp b/src/gopt/test/profiler.cpp new file mode 100644 index 000000000..b3be17e2e --- /dev/null +++ b/src/gopt/test/profiler.cpp @@ -0,0 +1,429 @@ +/** + * \file src/gopt/test/profiler.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#include "./helper.h" +#include "megbrain/gopt/global_layout_transform.h" +#include "megbrain/gopt/inference.h" +#include "megbrain/opr/dnn/pooling.h" +#include "megbrain/opr/imgproc.h" +#include "megbrain/opr/nn_int.h" +#include "megbrain/serialization/serializer.h" + +using namespace mgb; +using namespace gopt; +using namespace serialization; + +namespace { +class LayoutTransformContext : public NonCopyableObj { +public: + using OprList = SubGraphExtractor::OprList; + using OprFormat = Problem::OprFormat; + using OprConfigTrait = Problem::OprConfigTrait; + + LayoutTransformContext() = delete; + LayoutTransformContext(OprList opr_list, + SmallVector<TensorFormats> available_tensor_formats, + OprConfigTrait opr_configs) + : m_opr_list{std::move(opr_list)}, + m_available_tensor_formats{std::move(available_tensor_formats)}, + m_opr_configs{std::move(opr_configs)} {} + const OprList& opr_list() const { return m_opr_list; } + const SmallVector<TensorFormats>& available_tensor_formats() const { + return m_available_tensor_formats; + } + const OprConfigTrait& opr_configs() const { return m_opr_configs; } + static std::unique_ptr<LayoutTransformContext> make() { + OprList opr_list = { + opr::ConvBiasForward::typeinfo(), + opr::ConvolutionForward::typeinfo(), + opr::ConvolutionBackwardData::typeinfo(), + opr::ElemwiseMultiType::typeinfo(), + opr::Elemwise::typeinfo(), + opr::TypeCvt::typeinfo(), + opr::PoolingForward::typeinfo(), + opr::WarpPerspectiveForward::typeinfo(), + }; + OprConfigTrait opr_configs; + { + auto& dispatchers = opr_configs[opr::ConvBias::typeinfo()]; +#define cb(_fmt) \ + dispatchers[OprFormat::_fmt] = \ + OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \ + opr::ConvBias::typeinfo(), OprFormat::_fmt); + cb(NCHW4); + cb(NCHW32); + cb(NHWC); + cb(NCHW64); + cb(CHWN4); +#undef cb + } + { + auto& dispatchers = + opr_configs[opr::ConvolutionBackwardData::typeinfo()]; +#define cb(_fmt) \ + dispatchers[OprFormat::_fmt] = \ + OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \ + opr::ConvolutionBackwardData::typeinfo(), \ + OprFormat::_fmt); + cb(NCHW4); +#undef cb + } + + { + auto& dispatchers = + opr_configs[opr::ConvolutionForward::typeinfo()]; +#define cb(_fmt) \ + dispatchers[OprFormat::_fmt] = \ + OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \ + opr::ConvolutionForward::typeinfo(), OprFormat::_fmt); + cb(NCHW4); +#undef cb + } + + { + auto& dispatchers = opr_configs[opr::PoolingForward::typeinfo()]; +#define cb(_fmt) \ + dispatchers[OprFormat::_fmt] = \ + OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \ + opr::PoolingForward::typeinfo(), OprFormat::_fmt); + cb(NCHW4); + cb(NCHW32); + cb(NHWC); + cb(NCHW64); + cb(CHWN4); +#undef cb + } + + { + auto& dispatchers = + opr_configs[opr::WarpPerspectiveForward::typeinfo()]; +#define cb(_fmt) \ + dispatchers[OprFormat::_fmt] = \ + OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \ + opr::WarpPerspectiveForward::typeinfo(), OprFormat::_fmt); + cb(NHWC); + cb(NCHW4); + cb(NCHW64); +#undef cb + } + + SmallVector<TensorFormats> available_tensor_formats = { + TensorFormats::NHWC, TensorFormats::NCHWc4, + TensorFormats::NCHWc32, TensorFormats::NCHWc64}; + return std::make_unique<LayoutTransformContext>( + std::move(opr_list), std::move(available_tensor_formats), + std::move(opr_configs)); + } + +private: + OprList m_opr_list; +
SmallVector<TensorFormats> m_available_tensor_formats; + OprConfigTrait m_opr_configs; +}; +}  // namespace + +#if MGB_CUDA +#if CUDA_VERSION >= 10020 +TEST(TestProfiler, Conv) { + REQUIRE_GPU(1); + auto cn = CompNode::load("gpu0"); + cn.activate(); + REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); + auto ctx = LayoutTransformContext::make(); + + HostTensorGenerator<dtype::Int8> gen; + auto graph = ComputingGraph::make(); + graph->options().graph_opt_level = 0; + auto mkvar = [&](const char* name, const TensorShape& shp, + const DType& dtype) { + return opr::TypeCvt::make( + opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), + dtype); + }; + auto mkcvar = [&](const char* name, const TensorShape& shp, + const DType& dtype) { + return opr::TypeCvt::make( + opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)) + .rename(name), + dtype); + }; + auto x = mkvar("x", {64, 48, 14, 14}, + dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4))); + auto w1 = mkcvar("w1", {48, 48, 3, 3}, dtype::QuantizedS4(2.5f)); + auto b1 = mkcvar("b1", {1, 48, 1, 1}, dtype::QuantizedS32(6.25f)); + opr::ConvBias::Param param; + param.format = opr::ConvBias::Param::Format::NCHW; + param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY; + param.stride_h = param.stride_w = 1; + param.pad_h = param.pad_w = 1; + auto c1 = opr::ConvBias::make(x, w1, b1, param, {}, + OperatorNodeConfig(dtype::Quantized4Asymm( + 12.345f, static_cast<uint8_t>(5)))); + x = opr::TypeCvt::make(c1, dtype::QuantizedS8(12.345f)); + auto w2 = mkcvar("w2", {48, 48, 3, 3}, dtype::QuantizedS8(2.5f)); + auto b2 = mkcvar("b2", {1, 48, 1, 1}, dtype::QuantizedS32(12.345f * 2.5f)); + auto c2 = opr::ConvBias::make(x, w2, b2, param, {}, + OperatorNodeConfig(dtype::QuantizedS8(2.5f))); + + using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; + S strategy = S::PROFILE; + gopt::modify_opr_algo_strategy_inplace({c2}, strategy); + using OprFormat = OprTensorFormatsConfiguration::OprFormat; + SubGraphExtractor extractor(ctx->opr_list()); + auto partitions = extractor.extract({c2}); + ASSERT_EQ(partitions.size(), 1u); + using Attribute = Problem::Attribute; + Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW}; + Problem problem(partitions[0], ctx->available_tensor_formats(), + ctx->opr_configs(), attribute); + auto profiler = ProfilerBase::make_profiler(); + auto rst = profiler->profile(problem); + const auto& opr_rst = rst.opr_record; + const auto& var_rst = rst.var_record; + EXPECT_TRUE(opr_rst.count(c1.node()->owner_opr()) > 0); + EXPECT_TRUE(opr_rst.count(c2.node()->owner_opr()) > 0); + EXPECT_TRUE(opr_rst.count(x.node()->owner_opr()) > 0); + EXPECT_TRUE(var_rst.count(w1.node()) == 0); + EXPECT_TRUE(var_rst.count(b1.node()) == 0); + EXPECT_TRUE(var_rst.count(w2.node()) == 0); + EXPECT_TRUE(var_rst.count(b2.node()) == 0); +} +#endif + +TEST(TestProfiler, Deconv) { + REQUIRE_GPU(1); + auto cn = CompNode::load("gpu0"); + cn.activate(); + REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); + auto ctx = LayoutTransformContext::make(); + + HostTensorGenerator<dtype::Int8> gen; + auto graph = ComputingGraph::make(); + graph->options().graph_opt_level = 0; + auto mkvar = [&](const char* name, const TensorShape& shp, + const DType& dtype) { + return opr::TypeCvt::make( + opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), + dtype); + }; + auto mkcvar = [&](const char* name, const TensorShape& shp, + const DType& dtype) { + return opr::TypeCvt::make( + opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)) + .rename(name), + dtype); + }; + auto x = mkvar("x", {64, 10, 7, 7},
dtype::QuantizedS8(2.5f)); + auto w1 = mkcvar("w1", {10, 10, 2, 2}, dtype::QuantizedS8(2.5f)); + using Param = opr::ConvolutionBackwardData::Param; + Param param; + param.format = opr::ConvolutionBackwardData::Param::Format::NCHW; + param.stride_h = param.stride_w = 2; + param.pad_h = param.pad_w = 0; + auto c1 = opr::ConvolutionBackwardData::make( + w1, x, param, {}, OperatorNodeConfig(dtype::QuantizedS8(2.5f))); + auto w2 = mkcvar("w2", {10, 10, 2, 2}, dtype::QuantizedS8(2.5f)); + auto c2 = opr::ConvolutionBackwardData::make( + w2, c1, param, {}, OperatorNodeConfig(dtype::QuantizedS8(2.5f))); + + using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; + S strategy = S::PROFILE; + gopt::modify_opr_algo_strategy_inplace({c2}, strategy); + using OprFormat = OprTensorFormatsConfiguration::OprFormat; + SubGraphExtractor extractor(ctx->opr_list()); + auto partitions = extractor.extract({c2}); + ASSERT_EQ(partitions.size(), 1u); + using Attribute = Problem::Attribute; + Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW}; + Problem problem(partitions[0], ctx->available_tensor_formats(), + ctx->opr_configs(), attribute); + auto profiler = ProfilerBase::make_profiler(); + auto rst = profiler->profile(problem); + const auto& opr_rst = rst.opr_record; + const auto& var_rst = rst.var_record; + EXPECT_TRUE(opr_rst.count(c1.node()->owner_opr()) > 0); + EXPECT_TRUE(opr_rst.count(c2.node()->owner_opr()) > 0); + EXPECT_TRUE(opr_rst.count(x.node()->owner_opr()) > 0); + EXPECT_TRUE(var_rst.count(w1.node()) == 0); + EXPECT_TRUE(var_rst.count(w2.node()) == 0); +} + +TEST(TestProfiler, Warp) { + REQUIRE_GPU(1); + auto cn = CompNode::load("gpu0"); + cn.activate(); + REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); + auto ctx = LayoutTransformContext::make(); + + constexpr size_t INP_H = 10, INP_W = 10, N = 16; + + HostTensorGenerator<dtype::Int8> gen; + auto graph = ComputingGraph::make(); + graph->options().graph_opt_level = 0; + auto mkvar = [&](const char* name, const TensorShape& shp, + const DType& dtype) { + return opr::TypeCvt::make( + opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), + dtype); + }; + + auto x = mkvar("x", {N, 48, INP_H, INP_W}, + dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4))); + float value1 = M_PI, value2 = 0.6; + auto gen_mat = [&](HostTensorND& mat) { + auto ptr = mat.ptr<float>(); + for (size_t i = 0; i < N; ++i) { + auto rot = value1, scale = value2, sheer = value1, dy = value2, + dx = value2, ky = value2, kx = value2, kb = value2; + ptr[0] = ptr[4] = cos(rot) * scale; + ptr[1] = -(ptr[3] = sin(rot) * scale); + ptr[3] *= sheer; + ptr[4] *= sheer; + ptr[2] = dx; + ptr[5] = dy; + ptr[6] = kx; + ptr[7] = ky; + ptr[8] = kb; + ptr += 9; + } + mgb_assert(ptr == mat.ptr<float>() + mat.shape().total_nr_elems()); + }; + auto mat_host = std::make_shared<HostTensorND>( + x.node()->comp_node(), TensorShape{N, 3, 3}, dtype::Float32()); + gen_mat(*mat_host); + auto mat = opr::Host2DeviceCopy::make(*graph, mat_host).rename("mat"); + TensorShape out_shp{20, 20}; + auto w1 = opr::WarpPerspectiveForward::make(x, mat, out_shp); + + using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; + S strategy = S::PROFILE; + gopt::modify_opr_algo_strategy_inplace({w1}, strategy); + using OprFormat = OprTensorFormatsConfiguration::OprFormat; + SubGraphExtractor extractor(ctx->opr_list()); + auto partitions = extractor.extract({w1}); + ASSERT_EQ(partitions.size(), 1u); + using Attribute = Problem::Attribute; + Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW}; + Problem problem(partitions[0],
ctx->available_tensor_formats(), + ctx->opr_configs(), attribute); + auto profiler = ProfilerBase::make_profiler(); + auto rst = profiler->profile(problem); + const auto& opr_rst = rst.opr_record; + const auto& var_rst = rst.var_record; + EXPECT_TRUE(opr_rst.count(w1.node()->owner_opr()) > 0); + EXPECT_TRUE(var_rst.count(mat.node()) == 0); + EXPECT_TRUE(var_rst.count(w1.node()->owner_opr()->input(2)) == 0); + EXPECT_TRUE(var_rst.count(w1.node()->owner_opr()->input(0)) > 0); +} + +TEST(TestProfiler, Pooling) { + REQUIRE_GPU(1); + auto cn = CompNode::load("gpu0"); + cn.activate(); + REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); + auto ctx = LayoutTransformContext::make(); + + HostTensorGenerator<dtype::Int8> gen; + auto graph = ComputingGraph::make(); + graph->options().graph_opt_level = 0; + auto mkvar = [&](const char* name, const TensorShape& shp, + const DType& dtype) { + return opr::TypeCvt::make( + opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), + dtype); + }; + auto x = mkvar("x", {64, 64, 55, 55}, + dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4))); + using Param = opr::Pooling::Param; + Param param; + param.format = Param::Format::NCHW; + auto p1 = opr::Pooling::make(x, param); + x = opr::TypeCvt::make(p1, dtype::QuantizedS8(12.345f)); + auto p2 = opr::Pooling::make(x, param); + + using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; + S strategy = S::PROFILE; + gopt::modify_opr_algo_strategy_inplace({p2}, strategy); + using OprFormat = OprTensorFormatsConfiguration::OprFormat; + SubGraphExtractor extractor(ctx->opr_list()); + auto partitions = extractor.extract({p2}); + ASSERT_EQ(partitions.size(), 1u); + using Attribute = Problem::Attribute; + Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW}; + Problem problem(partitions[0], ctx->available_tensor_formats(), + ctx->opr_configs(), attribute); + auto profiler = ProfilerBase::make_profiler(); + auto rst = profiler->profile(problem); + const auto& opr_rst = rst.opr_record; + EXPECT_TRUE(opr_rst.count(p1.node()->owner_opr()) > 0); + EXPECT_TRUE(opr_rst.count(p2.node()->owner_opr()) > 0); + EXPECT_TRUE(opr_rst.count(x.node()->owner_opr()) > 0); +} + +TEST(TestProfiler, Elemwise) { + REQUIRE_GPU(1); + auto cn = CompNode::load("gpu0"); + cn.activate(); + REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); + auto ctx = LayoutTransformContext::make(); + + HostTensorGenerator<dtype::Int8> gen; + auto graph = ComputingGraph::make(); + graph->options().graph_opt_level = 0; + auto mkvar = [&](const char* name, const TensorShape& shp, + const DType& dtype) { + return opr::TypeCvt::make( + opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), + dtype); + }; + auto a = mkvar("a", {64, 48, 14, 14}, dtype::Float32()); + auto b = mkvar("b", {1, 48, 1, 1}, dtype::Float32()); + auto c = opr::Elemwise::make({a, b}, + {opr::Elemwise::Param::Mode::FUSE_ADD_RELU}); + auto q4c = opr::TypeCvt::make( + c, dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4))); + auto q8a = mkvar("q8a", {64, 48, 14, 14}, dtype::QuantizedS8(2.5f)); + auto q8b = mkvar("q8b", {64, 48, 14, 14}, dtype::QuantizedS8(1.2f)); + auto q8d = opr::ElemwiseMultiType::make( + {q8a, q8b}, {opr::ElemwiseMultiType::Param::Mode::QFUSE_ADD_RELU}, + OperatorNodeConfig(dtype::QuantizedS8(12.f))); + auto q4d = opr::TypeCvt::make( + q8d, dtype::Quantized4Asymm(1.2f, static_cast<uint8_t>(3))); + auto q4e = opr::ElemwiseMultiType::make( + {q4c, q4d}, {opr::ElemwiseMultiType::Param::Mode::QADD}, + OperatorNodeConfig( + dtype::Quantized4Asymm(13.f, static_cast<uint8_t>(4)))); + + using OprFormat =
OprTensorFormatsConfiguration::OprFormat; + SubGraphExtractor extractor(ctx->opr_list()); + auto partitions = extractor.extract({q4e}); + ASSERT_EQ(partitions.size(), 1u); + using Attribute = Problem::Attribute; + Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW}; + Problem problem(partitions[0], ctx->available_tensor_formats(), + ctx->opr_configs(), attribute); + auto profiler = ProfilerBase::make_profiler(); + auto rst = profiler->profile(problem); + const auto& opr_rst = rst.opr_record; + const auto& var_rst = rst.var_record; + EXPECT_TRUE(opr_rst.count(c.node()->owner_opr()) > 0); + EXPECT_TRUE(opr_rst.count(q8d.node()->owner_opr()) > 0); + EXPECT_TRUE(opr_rst.count(q4e.node()->owner_opr()) > 0); + EXPECT_TRUE(var_rst.count(a.node()) > 0); + EXPECT_TRUE(var_rst.count(b.node()) > 0); + EXPECT_TRUE(var_rst.count(q8a.node()) > 0); + EXPECT_TRUE(var_rst.count(q8b.node()) > 0); +} + +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/src/gopt/test/reformat_manager.cpp b/src/gopt/test/reformat_manager.cpp index 378dc8d18..d639f5240 100644 --- a/src/gopt/test/reformat_manager.cpp +++ b/src/gopt/test/reformat_manager.cpp @@ -447,8 +447,6 @@ TEST(TestReformatManager, AutoAlignedFeatureProfiling) { for (size_t i = 0; i < RUNS; ++i) func->execute(); double time_profiler = profiler->duration() * 1e6; - printf("%f, %f\n", time_profiler, time_cuda_evt); - ASSERT_EQ(time_cuda_evt, time_profiler); MGB_CUDA_CHECK(cudaEventDestroy(evt0)); MGB_CUDA_CHECK(cudaEventDestroy(evt1)); } -- GitLab