Commit c14e5719 authored by Megvii Engine Team

feat(mgb/gopt): add profile impl for global layout transform pass

GitOrigin-RevId: 8ef62baf792c97c7a226dd791af167ab2e8707b4
Parent 9c0a17d0
@@ -166,6 +166,13 @@ void aarch64::RelayoutForwardImpl::exec(_megdnn_tensor_in src0,
TensorND src = src0, dst = dst0;
check_layout_and_canonize(src.layout, dst.layout);
// FIXME: optimize for lowbit cases
if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS4 ||
src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
fallback::RelayoutForwardImpl::exec(src0, dst0, src_handle);
return;
}
relayout::TransposeParam trans_param;
bool trans = relayout::is_transpose(src.layout, dst.layout, trans_param);
if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 1) {
......
@@ -134,6 +134,13 @@ void armv7::RelayoutForwardImpl::exec(_megdnn_tensor_in src0,
TensorND src = src0, dst = dst0;
check_layout_and_canonize(src.layout, dst.layout);
// FIXME: optimize for lowbit cases
if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS4 ||
src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
fallback::RelayoutForwardImpl::exec(src0, dst0, src_handle);
return;
}
relayout::TransposeParam trans_param;
bool trans = relayout::is_transpose(src.layout, dst.layout, trans_param);
if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 1) {
......
/**
* \file src/gopt/impl/opr_format_modifier.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "./opr_format_modifier.h"
#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/opr/dnn/pooling.h"
#include "megbrain/opr/imgproc.h"
#include "megbrain/opr/io.h"
#include "megbrain/serialization/sereg.h"
#include "midout.h"
MIDOUT_DECL(megbrain_opr_format_modifier)
#define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_format_modifier, __VA_ARGS__) {
#define MIDOUT_E \
} \
MIDOUT_END();
using namespace mgb;
using namespace opr;
namespace {
template <class MegDNNConv = megdnn::Convolution>
struct MakeConvCaller2 {
template <typename Opr>
static VarNode* make(const cg::VarNodeArray& inputs,
const typename MegDNNConv::Param& param,
const megdnn::param::ExecutionPolicy& execution_policy,
const OperatorNodeConfig& config) {
if (inputs.size() == 2) {
return Opr::make(inputs[0], inputs[1], param, execution_policy,
config)
.node();
}
return nullptr;
}
};
template <class MegDNNConv = megdnn::Convolution>
struct MakeConvCaller3 {
template <typename Opr>
static VarNode* make(const cg::VarNodeArray& inputs,
const typename MegDNNConv::Param& param,
const megdnn::param::ExecutionPolicy& execution_policy,
const OperatorNodeConfig& config) {
if (inputs.size() == 3) {
return Opr::make(inputs[0], inputs[1], inputs[2], param,
execution_policy, config)
.node();
}
return nullptr;
}
};
template <class MegDNNConv = megdnn::Convolution>
struct MakeConvCaller4 {
template <typename Opr>
static VarNode* make(const cg::VarNodeArray& inputs,
const typename MegDNNConv::Param& param,
const megdnn::param::ExecutionPolicy& execution_policy,
const OperatorNodeConfig& config) {
if (inputs.size() == 4) {
return Opr::make(inputs[0], inputs[1], inputs[2], inputs[3], param,
execution_policy, config)
.node();
}
return nullptr;
}
};
template <class MegDNNConv = megdnn::Convolution>
struct MakeConvCaller5 {
template <typename Opr>
static VarNode* make(const cg::VarNodeArray& inputs,
const typename MegDNNConv::Param& param,
const megdnn::param::ExecutionPolicy& execution_policy,
const OperatorNodeConfig& config) {
if (inputs.size() == 5) {
return Opr::make(inputs[0], inputs[1], inputs[2], inputs[3],
inputs[4], param, execution_policy, config)
.node();
}
return nullptr;
}
};
template <class MegDNNConv = megdnn::Convolution>
struct MakeConvCallerEmpty {
template <typename Opr>
static VarNode* make(const cg::VarNodeArray&,
const typename MegDNNConv::Param&,
const megdnn::param::ExecutionPolicy&,
const OperatorNodeConfig&) {
return nullptr;
}
};
template <class Opr, class Maker0, class MegDNNConv,
class Maker1 = MakeConvCallerEmpty<MegDNNConv>,
class Maker2 = MakeConvCallerEmpty<MegDNNConv>,
typename ConvParam = megdnn::param::Convolution>
struct ConvMakerImpl {
static VarNode* make(const cg::VarNodeArray& inputs, const ConvParam& param,
const megdnn::param::ExecutionPolicy& execution_policy,
const OperatorNodeConfig& config) {
VarNode* ret = Maker0::template make<Opr>(inputs, param,
execution_policy, config);
if (!ret) {
ret = Maker1::template make<Opr>(inputs, param, execution_policy,
config);
}
if (!ret) {
ret = Maker2::template make<Opr>(inputs, param, execution_policy,
config);
}
mgb_assert(ret);
return ret;
}
};
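// Note on the pattern above: ConvMaker dispatches on the runtime input
// count. Each MakeConvCallerN builds the operator only when given exactly N
// inputs and returns nullptr otherwise, so ConvMakerImpl tries Maker0,
// Maker1 and Maker2 in order and asserts that one of them matched.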
template <typename Opr>
struct ConvMaker;
template <>
struct ConvMaker<opr::Convolution>
: public ConvMakerImpl<opr::Convolution,
MakeConvCaller2<megdnn::Convolution>,
megdnn::Convolution> {};
template <>
struct ConvMaker<opr::ConvolutionBackwardData>
: public ConvMakerImpl<opr::ConvolutionBackwardData,
MakeConvCaller2<megdnn::Convolution>,
megdnn::Convolution,
MakeConvCaller3<megdnn::Convolution>> {};
template <>
struct ConvMaker<opr::ConvBiasForward>
: public ConvMakerImpl<opr::ConvBiasForward,
MakeConvCaller2<megdnn::ConvBiasForward>,
megdnn::ConvBiasForward,
MakeConvCaller3<megdnn::ConvBiasForward>,
MakeConvCaller4<megdnn::ConvBiasForward>,
megdnn::param::ConvBias> {};
template <>
struct ConvMaker<opr::BatchConvBiasForward>
: public ConvMakerImpl<opr::BatchConvBiasForward,
MakeConvCaller2<megdnn::BatchConvBiasForward>,
megdnn::BatchConvBiasForward,
MakeConvCaller3<megdnn::BatchConvBiasForward>,
MakeConvCaller4<megdnn::BatchConvBiasForward>,
megdnn::param::BatchConvBias> {};
#if 0
#include "../../opr/impl/internal/invoke.h"
template <typename Opr>
struct MultiAlgoOprTrait;
#define APPLY(statement, ...) \
mgb::apply([&](const auto&... args) { return statement; }, \
std::tuple_cat(__VA_ARGS__))
#define INST(_Opr) \
template <> \
struct MultiAlgoOprTrait<_Opr> { \
static constexpr bool has_algo = true; \
using MegDNNOpr = megdnn::_Opr; \
static constexpr int arity = OprArityTrait<MegDNNOpr>::arity; \
using FixedTensorLayouts = std::array<TensorLayout, arity>; \
static bool has_available_algo(const VarNodeArray& i, \
const cg::OperatorNodeBase* opr_) { \
MIDOUT_B(midout_iv(MGB_HASH_STR(#_Opr)), \
midout_iv(MGB_HASH_STR("has_available_algo"))) \
auto&& opr = opr_->cast_final_safe<_Opr>(); \
auto&& megdnn_opr = \
reinterpret_cast<MegDNNOpr*>(opr.megdnn_opr()); \
FixedTensorLayouts array_layouts; \
size_t in = i.size() - 1; \
for (size_t idx = 0; idx < in; idx++) { \
const auto& v = i[idx]; \
array_layouts[idx] = \
TensorLayout{v->shape(), v->dtype(), v->format()}; \
} \
const auto& v = i[in]; \
array_layouts[arity - 1] = \
TensorLayout{v->shape(), v->dtype(), v->format()}; \
return APPLY(::megdnn::has_available_algo(megdnn_opr, args...), \
array_layouts); \
MIDOUT_E \
} \
};
INST(Convolution)
INST(ConvBiasForward)
INST(ConvolutionBackwardData)
INST(PoolingForward)
#undef APPLY
#undef INST
#endif
} // namespace
namespace mgb {
namespace gopt {
namespace intl {
template <typename Opr>
struct OprFormatModifier;
#define INST(_Opr) \
template <> \
struct OprFormatModifier<_Opr> { \
using OprFormat = typename _Opr::Param::Format; \
static VarNode* make(OprFormat opr_format, const VarNodeArray& i, \
const cg::OperatorNodeBase* opr_) { \
MIDOUT_B(_Opr) \
auto&& opr = opr_->cast_final_safe<_Opr>(); \
auto param = opr.param(); \
param.format = opr_format; \
return ConvMaker<_Opr>::make(i, param, opr.execution_policy(), \
opr.config()); \
MIDOUT_E \
} \
};
INST(Convolution);
INST(ConvBiasForward);
INST(ConvolutionBackwardData);
INST(BatchConvBiasForward);
#undef INST
template <>
struct OprFormatModifier<WarpPerspective> {
using Opr = opr::WarpPerspective;
using OprFormat = typename Opr::Param::Format;
static VarNode* make(OprFormat opr_format, const VarNodeArray& i,
const cg::OperatorNodeBase* opr_) {
MIDOUT_B(Opr)
auto&& opr = opr_->cast_final_safe<Opr>();
auto param = opr.param();
param.format = opr_format;
if (i.size() == 3) {
return Opr::make(i[0], i[1], i[2], param, opr.config()).node();
} else {
mgb_assert(i.size() == 4);
return Opr::make(i[0], i[1], i[2], i[3], param, opr.config())
.node();
}
MIDOUT_E
}
};
#define INST(_Opr, _arity) \
template <> \
struct OprFormatModifier<_Opr> { \
using OprFormat = typename _Opr::Param::Format; \
static VarNode* make(OprFormat opr_format, const VarNodeArray& i, \
const cg::OperatorNodeBase* opr_) { \
MIDOUT_B(_Opr) \
auto&& opr = opr_->cast_final_safe<_Opr>(); \
auto param = opr.param(); \
param.format = opr_format; \
return serialization::OprMaker<_Opr, _arity>::make( \
param, i, *i[0]->owner_graph(), opr.config()) \
->output(0); \
MIDOUT_E \
} \
};
INST(PoolingForward, 1);
INST(Resize, 2);
#undef INST
VarNode* modify_opr_format(opr::ConvBias::Param::Format opr_format,
const VarNodeArray& i,
const cg::OperatorNodeBase* opr) {
#define cb(_Opr) \
if (opr->dyn_typeinfo() == _Opr::typeinfo()) { \
return OprFormatModifier<_Opr>::make(opr_format, i, opr); \
} else
FOREACH_FORMAT_AWARE_OPR(cb) {
mgb_throw(InternalError, "invalid format aware operator(got:%s)",
opr->dyn_typeinfo()->name);
}
#undef cb
}
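// Usage sketch (illustrative only, not part of this commit): once a layout
// transform pass has reformatted the inputs of a format-aware operator, it
// can rebuild the operator itself in the new format; `new_inputs` here is a
// hypothetical VarNodeArray holding the reformatted input var nodes.
//
//     VarNode* rebuilt = intl::modify_opr_format(
//             opr::ConvBias::Param::Format::NCHW4, new_inputs, opr);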
#if 0
bool has_available_algo(const VarNodeArray& i,
const cg::OperatorNodeBase* opr) {
#define cb(_Opr) \
if (opr->dyn_typeinfo() == _Opr::typeinfo()) { \
MGB_MARK_USED_VAR(MultiAlgoOprTrait<_Opr>::has_algo); \
VarNodeArray _ = i; \
_.emplace_back(opr->output(0)); \
return MultiAlgoOprTrait<_Opr>::has_available_algo(_, opr); \
} else
cb(Convolution) cb(ConvBiasForward) cb(ConvolutionBackwardData)
cb(PoolingForward) {
mgb_throw(InternalError, "invalid multi-algo operator(got:%s)",
opr->dyn_typeinfo()->name);
}
}
#endif
} // namespace intl
} // namespace gopt
} // namespace mgb
// vim: syntax=cpp.doxygen
/**
* \file src/gopt/impl/opr_format_modifier.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "megbrain/graph.h"
#include "megbrain/opr/dnn/convolution.h"
namespace mgb {
namespace gopt {
namespace intl {
#define FOREACH_FORMAT_AWARE_OPR(cb) \
cb(Convolution) cb(ConvBiasForward) cb(ConvolutionBackwardData) \
cb(PoolingForward) cb(WarpPerspective) cb(Resize)
#if 0
bool has_available_algo(const VarNodeArray& i, const cg::OperatorNodeBase* opr);
#endif
VarNode* modify_opr_format(opr::ConvBias::Param::Format opr_format,
const VarNodeArray& i,
const cg::OperatorNodeBase* opr);
} // namespace intl
} // namespace gopt
} // namespace mgb
// vim: syntax=cpp.doxygen
This diff is collapsed.
This diff is collapsed.
@@ -247,16 +247,36 @@ ReformatEmitter::UnderlyingBuilders ReformatEmitter::analyze() const {
/* ============== PaddingEmitter ================= */
PaddingEmitter::EmitResult PaddingEmitter::emit() const {
auto&& padshp = m_padshp;
auto&& const_extent = m_const_extent;
auto&& axis = m_axis;
auto builder = [const_extent, axis](const VarNodeArray& vars) {
auto builder = [padshp, const_extent, axis](const VarNodeArray& vars) {
auto i = vars[0];
auto padding_shp_var = vars[1];
TensorShape shape;
shape.ndim = i->shape().ndim;
for (size_t ax = 0; ax < shape.ndim; ++ax)
shape[ax] = 1;
// avoid making a scalar lowbit tensor
if (!i->dtype().is_low_bit() || const_extent != 1)
shape[axis] = const_extent;
else {
size_t const_axis = 0;
size_t new_const_extent = const_extent;
for (size_t i = 0; i < padshp.ndim; ++i) {
const auto& dim = padshp[i];
if (dim.extent() != Dimension::UNDETERMINED_EXTENT &&
dim.extent() != 1) {
new_const_extent = dim.extent();
const_axis = i;
break;
}
}
mgb_assert(new_const_extent != 1,
"cannot make an scalar lowbit tensor(got:%s)",
i->dtype().name());
shape[const_axis] = new_const_extent;
}
auto host_val =
std::make_shared<HostTensorND>(i->comp_node(), i->dtype());
host_val->resize(shape);
......
@@ -13,6 +13,7 @@
#include "megbrain/gopt/reformat_manager.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/utils/arith_helper.h"
#include "./utils.h"
using namespace mgb;
using namespace gopt;
@@ -32,68 +33,6 @@ int gcd(const int& p, const int& q) {
}
return x;
}
NamedTensorShape tensor_formats_to_named_tensor_shape(TensorFormats format) {
switch (format) {
case TensorFormats::NCHW:
return {{"N"}, {"C"}, {"H"}, {"W"}};
case TensorFormats::NHWC:
return {{"N"}, {"H"}, {"W"}, {"C"}};
case TensorFormats::NCHWc4:
return {{"N"}, {"C//4"}, {"H"}, {"W"}, {"C%4"}};
case TensorFormats::NCHWc8:
return {{"N"}, {"C//8"}, {"H"}, {"W"}, {"C%8"}};
case TensorFormats::NCHWc32:
return {{"N"}, {"C//32"}, {"H"}, {"W"}, {"C%32"}};
case TensorFormats::NCHWc64:
return {{"N"}, {"C//64"}, {"H"}, {"W"}, {"C%64"}};
case TensorFormats::CHWNc4:
return {{"C//4"}, {"H"}, {"W"}, {"N"}, {"C%4"}};
case TensorFormats::NHCWc4:
return {{"N"}, {"H"}, {"C//4"}, {"W"}, {"C%4"}};
case TensorFormats::KRSCk4:
return {{"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}};
case TensorFormats::GKRSCk4:
return {{"G"}, {"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}};
case TensorFormats::C1RSc4:
return {{"C//4"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}};
case TensorFormats::KRSCk4c4:
return {{"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}};
case TensorFormats::GKRSCk4c4:
return {{"G"}, {"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}};
case TensorFormats::KCRSk4c4:
return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}};
case TensorFormats::GKCRSk4c4:
return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}};
case TensorFormats::KCRSc4k4:
return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}};
case TensorFormats::GKCRSc4k4:
return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}};
case TensorFormats::C11RSc4:
return {{"C//4"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}};
case TensorFormats::KCRSc8k8:
return {{"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}};
case TensorFormats::GKCRSc8k8:
return {{"G"}, {"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}};
case TensorFormats::C11RSc8:
return {{"C//8"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%8"}};
case TensorFormats::KRSCk8:
return {{"K//8"}, {"R"}, {"S"}, {"C"}, {"K%8"}};
case TensorFormats::KCRSc4:
return {{"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}};
case TensorFormats::GKCRSc4:
return {{"G"}, {"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}};
case TensorFormats::KCRS:
return {{"K"}, {"C"}, {"R"}, {"S"}};
case TensorFormats::GKCRS:
return {{"G"}, {"K"}, {"C"}, {"R"}, {"S"}};
case TensorFormats::C11RS:
return {{"C"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}};
default:
mgb_throw(AssertionError, "invalid tensor formats(%u)",
static_cast<uint32_t>(format));
}
}
}; // namespace
/* =================== ReformatManager::ReformatKey ==================== */
@@ -393,8 +332,10 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue(
tensor_formats_to_named_tensor_shape(key.input_format);
NamedTensorShape output_shape =
tensor_formats_to_named_tensor_shape(key.output_format);
size_t input_alignment, output_alignment;
size_t input_channel_idx, output_channel_idx;
size_t input_alignment = 0;
size_t output_alignment = 0;
size_t input_channel_idx = input_shape.ndim,
output_channel_idx = input_shape.ndim;
for (size_t i = 0; i < input_shape.ndim; ++i) {
if (input_shape[i].name() == Dimension::Name::C &&
input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
@@ -411,6 +352,15 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue(
break;
}
}
mgb_assert(input_channel_idx < input_shape.ndim &&
output_channel_idx < input_shape.ndim,
"invalid channel idx(in_channel:%zu, out_channel:%zu, shp:%s)",
input_channel_idx, output_channel_idx,
input_shape.to_string().c_str());
mgb_assert(input_alignment > 0 && output_alignment > 0,
"invalid alignment(in_channel:%zu, out_channel:%zu, shp:%s)",
input_alignment, output_alignment,
input_shape.to_string().c_str());
NamedTensorShape orig_shape =
tensor_formats_to_named_tensor_shape(orig_format);
size_t orig_channel = 0;
@@ -448,8 +398,9 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue(
auto make_shape = std::get<0>(
MakeShapeEmitter{input_shape, padding_shape}.emit());
auto padding_shp_var = make_shape({x});
auto padding = std::get<0>(
PaddingEmitter{const_extent, input_channel_idx}.emit());
auto padding = std::get<0>(PaddingEmitter{
padding_shape, const_extent, input_channel_idx}
.emit());
cur = padding({cur, padding_shp_var});
}
cur = ReformatManager::instance().get(key)({cur});
@@ -469,9 +420,10 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight(
const VarNode* orig_var, const ReformatKey& key,
const AlignmentDesc& extra_alignment) const {
size_t in_channels = 0, out_channels = 0;
size_t input_channel_idx, output_channel_idx;
Dimension::Name out_channel_name;
Dimension::Name out_channel_name = Dimension::Name::C;
auto input_shape = tensor_formats_to_named_tensor_shape(key.input_format);
size_t input_channel_idx = input_shape.ndim,
output_channel_idx = input_shape.ndim;
for (size_t i = 0; i < input_shape.ndim; ++i) {
if (input_shape[i].name() == Dimension::Name::C &&
input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
@@ -491,7 +443,15 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight(
input_shape.to_string().c_str());
}
}
size_t in_channel_alignment, out_channel_alignment = 1;
mgb_assert(out_channel_name == Dimension::Name::K ||
out_channel_name == Dimension::Name::N,
"invalid out channel(shp:%s)", input_shape.to_string().c_str());
mgb_assert(input_channel_idx < input_shape.ndim &&
output_channel_idx < input_shape.ndim,
"invalid channel idx(in_channel:%zu, out_channel:%zu, shp:%s)",
input_channel_idx, output_channel_idx,
input_shape.to_string().c_str());
size_t in_channel_alignment = 0, out_channel_alignment = 0;
auto output_shape = tensor_formats_to_named_tensor_shape(key.output_format);
for (size_t i = 0; i < output_shape.ndim; ++i) {
if (output_shape[i].name() == Dimension::Name::C &&
@@ -502,6 +462,10 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight(
out_channel_alignment = output_shape[i].stride();
}
}
mgb_assert(in_channel_alignment > 0 && out_channel_alignment > 0,
"invalid alignment(in_channel:%zu, out_channel:%zu, shp:%s)",
in_channel_alignment, out_channel_alignment,
output_shape.to_string().c_str());
size_t aligned_in_channel =
divup(in_channels, in_channel_alignment) * in_channel_alignment;
if (extra_alignment.name == out_channel_name) {
@@ -526,8 +490,9 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight(
auto make_shape = std::get<0>(
MakeShapeEmitter{input_shape, padding_shape}.emit());
auto padding_shp_var = make_shape({x});
auto padding = std::get<0>(
PaddingEmitter{const_extent, input_channel_idx}.emit());
auto padding = std::get<0>(PaddingEmitter{
padding_shape, const_extent, input_channel_idx}
.emit());
cur = padding({cur, padding_shp_var});
}
if (aligned_out_channel > out_channels) {
@@ -540,8 +505,9 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight(
auto make_shape = std::get<0>(
MakeShapeEmitter{input_shape, padding_shape}.emit());
auto padding_shp_var = make_shape({cur});
auto padding = std::get<0>(
PaddingEmitter{const_extent, output_channel_idx}.emit());
auto padding = std::get<0>(PaddingEmitter{
padding_shape, const_extent, output_channel_idx}
.emit());
cur = padding({cur, padding_shp_var});
}
cur = ReformatManager::instance().get(key)({cur});
@@ -554,4 +520,81 @@ const ReformatManager& ReformatManager::instance() {
static ReformatManager inst;
return inst;
}
TensorShape mgb::gopt::make_aligned_tensor_shape(const VarNode* var,
TensorFormats orig_formats,
TensorFormats target_formats) {
using Dimension = megdnn::Dimension;
static constexpr uint32_t UNDETERMINED_EXTENT =
Dimension::UNDETERMINED_EXTENT;
auto orig_shape = tensor_formats_to_named_tensor_shape(orig_formats);
auto target_shape = tensor_formats_to_named_tensor_shape(target_formats);
TensorShape oshp = var->shape();
mgb_assert(oshp.is_scalar() || oshp.ndim == orig_shape.ndim,
"orig shape of var node is not compatible with tensor "
"formats(var:%s;shp:%s;fmt:%s)",
var->cname(), oshp.to_string().c_str(),
orig_shape.to_string().c_str());
if (oshp.is_scalar()) return oshp;
TensorShape tshp;
ThinHashMap<Dimension::Name, int> name2dominant;
for (size_t i = 0; i < orig_shape.ndim; ++i) {
auto name = orig_shape[i].name();
if (orig_shape[i].extent() == UNDETERMINED_EXTENT) {
auto insert = name2dominant.insert(std::make_pair(name, i));
mgb_assert(insert.second);
}
}
tshp.ndim = target_shape.ndim;
for (size_t i = 0; i < target_shape.ndim; ++i) {
auto name = target_shape[i].name();
if (target_shape[i].extent() == UNDETERMINED_EXTENT) {
int idx = name2dominant.at(name);
bool mul = orig_shape[idx] < target_shape[i];
size_t factor = mul ? (target_shape[i] / orig_shape[idx]).extent()
: (orig_shape[idx] / target_shape[i]).extent();
if (mul)
tshp[i] = oshp[idx] * factor;
else
tshp[i] = divup(oshp[idx], factor);
} else {
tshp[i] = target_shape[i].extent();
}
}
return tshp;
}
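// Worked example (a sketch, not from this commit): an NCHW feature map of
// shape (64, 48, 14, 14) reformatted to NCHWc4 splits the channel axis into
// an outer block C//4 = ceil(48 / 4) = 12 and a fixed inner block C%4 = 4:
//
//     auto tshp = make_aligned_tensor_shape(var, TensorFormats::NCHW,
//                                           TensorFormats::NCHWc4);
//     // tshp == {64, 12, 14, 14, 4}
//
// When C is not a multiple of 4, the outer block is rounded up, which is
// what makes the resulting shape "aligned".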
TensorShape mgb::gopt::make_aligned_weight_shape(const VarNode* var,
TensorFormats orig_formats,
TensorFormats target_formats,
TensorFormats extra_formats) {
auto tshp = make_aligned_tensor_shape(var, orig_formats, target_formats);
auto extra_shape = tensor_formats_to_named_tensor_shape(extra_formats);
using Dimension = megdnn::Dimension;
static constexpr uint32_t UNDETERMINED_EXTENT =
Dimension::UNDETERMINED_EXTENT;
size_t out_channel_alignment = 1;
for (size_t i = 0; i < extra_shape.ndim; ++i) {
auto name = extra_shape[i].name();
if (name == Dimension::Name::C &&
extra_shape[i].extent() == UNDETERMINED_EXTENT) {
out_channel_alignment = extra_shape[i].stride();
}
}
auto target_shape = tensor_formats_to_named_tensor_shape(target_formats);
for (size_t i = 0; i < target_shape.ndim; ++i) {
auto name = target_shape[i].name();
if ((name == Dimension::Name::K || name == Dimension::Name::N) &&
target_shape[i].extent() == UNDETERMINED_EXTENT) {
size_t out_channels = tshp[i] * target_shape[i].stride();
tshp[i] = divup(out_channels, out_channel_alignment) *
out_channel_alignment / target_shape[i].stride();
}
}
return tshp;
}
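// Worked example (a sketch, not from this commit): a KCRS weight of shape
// (10, 10, 2, 2) targeting KCRSc4 with NCHWc4 feature maps first becomes
// {10, ceil(10 / 4), 2, 2, 4} = {10, 3, 2, 2, 4}; the output channel K is
// then rounded up to the channel alignment (4) taken from the C//4 stride
// of the extra format:
//
//     auto wshp = make_aligned_weight_shape(var, TensorFormats::KCRS,
//                                           TensorFormats::KCRSc4,
//                                           TensorFormats::NCHWc4);
//     // wshp == {12, 3, 2, 2, 4}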
// vim: syntax=cpp.doxygen
/**
* \file src/gopt/impl/utils.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "megbrain/gopt/global_layout_transform.h"
namespace mgb {
namespace gopt {
static inline const char* opr_format_to_string(
OprTensorFormatsConfiguration::OprFormat opr_format) {
using OprFormat = OprTensorFormatsConfiguration::OprFormat;
#define cb(_fmt) \
case OprFormat::_fmt: \
return #_fmt
switch (opr_format) {
cb(NCHW);
cb(NHWC);
cb(NCHW4);
cb(NCHW32);
cb(NCHW64);
cb(CHWN4);
default:
mgb_assert(false, "Invalid opr format(got:%u)",
static_cast<uint32_t>(opr_format));
}
#undef cb
}
static inline megdnn::NamedTensorShape tensor_formats_to_named_tensor_shape(
TensorFormats format) {
switch (format) {
case TensorFormats::NCHW:
return {{"N"}, {"C"}, {"H"}, {"W"}};
case TensorFormats::NHWC:
return {{"N"}, {"H"}, {"W"}, {"C"}};
case TensorFormats::NCHWc4:
return {{"N"}, {"C//4"}, {"H"}, {"W"}, {"C%4"}};
case TensorFormats::NCHWc8:
return {{"N"}, {"C//8"}, {"H"}, {"W"}, {"C%8"}};
case TensorFormats::NCHWc32:
return {{"N"}, {"C//32"}, {"H"}, {"W"}, {"C%32"}};
case TensorFormats::NCHWc64:
return {{"N"}, {"C//64"}, {"H"}, {"W"}, {"C%64"}};
case TensorFormats::CHWNc4:
return {{"C//4"}, {"H"}, {"W"}, {"N"}, {"C%4"}};
case TensorFormats::NHCWc4:
return {{"N"}, {"H"}, {"C//4"}, {"W"}, {"C%4"}};
case TensorFormats::KRSCk4:
return {{"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}};
case TensorFormats::GKRSCk4:
return {{"G"}, {"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}};
case TensorFormats::C1RSc4:
return {{"C//4"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}};
case TensorFormats::KRSCk4c4:
return {{"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}};
case TensorFormats::GKRSCk4c4:
return {{"G"}, {"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}};
case TensorFormats::KCRSk4c4:
return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}};
case TensorFormats::GKCRSk4c4:
return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}};
case TensorFormats::KCRSc4k4:
return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}};
case TensorFormats::GKCRSc4k4:
return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}};
case TensorFormats::C11RSc4:
return {{"C//4"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}};
case TensorFormats::KCRSc8k8:
return {{"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}};
case TensorFormats::GKCRSc8k8:
return {{"G"}, {"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}};
case TensorFormats::C11RSc8:
return {{"C//8"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%8"}};
case TensorFormats::KRSCk8:
return {{"K//8"}, {"R"}, {"S"}, {"C"}, {"K%8"}};
case TensorFormats::KCRSc4:
return {{"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}};
case TensorFormats::GKCRSc4:
return {{"G"}, {"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}};
case TensorFormats::KCRS:
return {{"K"}, {"C"}, {"R"}, {"S"}};
case TensorFormats::GKCRS:
return {{"G"}, {"K"}, {"C"}, {"R"}, {"S"}};
case TensorFormats::C11RS:
return {{"C"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}};
default:
mgb_throw(AssertionError, "invalid tensor formats(%u)",
static_cast<uint32_t>(format));
}
}
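// Notation used above (informal): a plain name like "C" is a full axis with
// undetermined extent; "C//4" is the outer blocked channel axis whose
// stride (4) gives the channel alignment; "C%4" is the inner axis with
// fixed extent 4, so NCHWc4 lays channels out in blocks of 4. "C%1" marks a
// placeholder axis of extent 1.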
} // namespace gopt
} // namespace mgb
// vim: syntax=cpp.doxygen
/**
* \file src/gopt/include/megbrain/gopt/global_layout_transformation.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "megbrain/gopt/reformat_manager.h"
#include "megbrain/gopt/subgraph_extractor.h"
#include "megbrain/opr/dnn/convolution.h"
namespace mgb {
namespace gopt {
/*!
* \brief A structure that describes the data type and tensor format
* configuration of an opr format
*/
struct OprTensorFormatsConfiguration {
using OprFormat = opr::ConvBias::Param::Format;
using OprTensorFormatsDispatcher =
thin_function<Maybe<OprTensorFormatsConfiguration>(
const cg::OperatorNodeBase*)>;
Typeinfo* typeinfo;
OprFormat opr_format;
SmallVector<DTypeEnum> input_dtypes;
SmallVector<DTypeEnum> output_dtypes;
SmallVector<TensorFormats> input_tensor_formats;
SmallVector<TensorType> input_tensor_types;
SmallVector<TensorFormats> output_tensor_formats;
static OprTensorFormatsDispatcher* find_dispatcher_by_type_format(
Typeinfo* type, OprFormat opr_format);
};
/*!
* \brief A structure that describes the global layout transform problem
*/
class Problem {
public:
using OprFormat = OprTensorFormatsConfiguration::OprFormat;
using OprTensorFormatsDispatcher =
OprTensorFormatsConfiguration::OprTensorFormatsDispatcher;
using OprConfigTrait =
ThinHashMap<Typeinfo*,
ThinHashMap<OprFormat, OprTensorFormatsDispatcher*>>;
struct Attribute {
OprFormat base_opr_format; /// the base opr format indicates that the
/// network to be optimized is constructed
/// in the base opr format, i.e. all the
/// format aware operators (conv, conv_bias,
/// deconv, pooling etc.) are built in
/// this format.
TensorFormats
base_tensor_formats; /// the base tensor format indicates that
/// all the format agnostic operators
/// (like elemwise, elemwise multi type,
/// typecvt etc.) are built in the base
/// tensor format.
};
Problem(const GraphPartition& graph_partition,
const SmallVector<TensorFormats>& available_tensor_formats,
const OprConfigTrait& opr_config, const Attribute& attribute)
: m_graph_partition{graph_partition},
m_available_tensor_formats{available_tensor_formats},
m_opr_configs{opr_config},
m_attribute{attribute} {}
~Problem() noexcept = default;
const GraphPartition& graph_partition() const { return m_graph_partition; }
const OprConfigTrait& opr_configs() const { return m_opr_configs; }
const SmallVector<TensorFormats>& available_tensor_formats() const {
return m_available_tensor_formats;
}
TensorFormats base_format() const {
return m_attribute.base_tensor_formats;
}
OprTensorFormatsConfiguration base_config(
const cg::OperatorNodeBase* opr) const {
auto _ = OprTensorFormatsConfiguration::find_dispatcher_by_type_format(
opr->dyn_typeinfo(), m_attribute.base_opr_format);
auto rst = (*_)(opr);
if (rst.valid())
return rst.val();
OprTensorFormatsConfiguration config;
config.typeinfo = opr->dyn_typeinfo();
config.opr_format = m_attribute.base_opr_format;
for (const auto& i : opr->input()) {
config.input_dtypes.emplace_back(i->dtype().enumv());
config.input_tensor_formats.emplace_back(
m_attribute.base_tensor_formats);
config.input_tensor_types.emplace_back(TensorType::FEATURE);
}
config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
config.output_tensor_formats.emplace_back(
m_attribute.base_tensor_formats);
return config;
}
private:
const GraphPartition& m_graph_partition; /// the graph partition
const SmallVector<TensorFormats>&
m_available_tensor_formats; /// the available tensor formats, used
/// for format agnostic operators (like
/// elemwise, elemwise multi type,
/// typecvt, etc.)
const OprConfigTrait&
m_opr_configs; /// the available opr format configurations, used
/// for format aware operators (like conv, deconv,
/// conv_bias, etc.)
Attribute m_attribute; /// the extra attributes to describe the problem
};
/*!
* \brief A profiler that collects all the performance data to describe the
* global layout transform problem.
*/
class ProfilerBase {
public:
using OprFormat = Problem::OprFormat;
struct OperatorNodeRecord {
const cg::OperatorNodeBase* opr; ///< pointer to operator node
ThinHashMap<OprFormat, float>
costs; ///< costs of the operator node, i.e. the elapsed device
///< time of the operator node under different opr formats
///< (layout configurations).
std::string to_string() const;
};
struct VarNodeRecord {
struct KeyHash {
size_t operator()(
const std::pair<TensorFormats, TensorFormats>& val) const {
size_t h1 =
std::hash<uint32_t>()(static_cast<uint32_t>(val.first));
size_t h2 = std::hash<uint32_t>()(
static_cast<uint32_t>(val.second));
return mgb::hash_pair_combine(h1, h2);
}
};
const VarNode* var; ///< pointer to var node
std::unordered_map<std::pair<TensorFormats, TensorFormats>, float,
KeyHash>
costs; ///< costs of the var node, i.e. the elapsed
///< device time of each layout transform;
///< the key of the hashmap is the pair of
///< source and target tensor formats.
std::string to_string() const;
};
/*!
* \note the profiler assumes all the input and output var nodes are
* stored in contiguous layout in memory
*/
struct ProfilingResult {
/// A hashmap that maps an operator node to the costs (device elapsed
/// time) of its different layout configurations
ThinHashMap<cg::OperatorNodeBase*, OperatorNodeRecord> opr_record;
/// A hashmap that maps a var node to the costs of its layout transforms
ThinHashMap<VarNode*, VarNodeRecord> var_record;
};
ProfilerBase() = default;
virtual ~ProfilerBase() = default;
virtual ProfilingResult profile(const Problem& problem) const = 0;
static std::unique_ptr<ProfilerBase> make_profiler();
};
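// Usage sketch: given a Problem built from an extracted graph partition (as
// in the tests in src/gopt/test/profiler.cpp below), a profiler measures
// every operator under each candidate opr format and every var node under
// each candidate layout transition:
//
//     auto profiler = ProfilerBase::make_profiler();
//     auto rst = profiler->profile(problem);
//     // rst.opr_record / rst.var_record hold the measured costs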
} // namespace gopt
} // namespace mgb
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
@@ -80,11 +80,13 @@ private:
class PaddingEmitter final : public Emitter {
public:
PaddingEmitter(size_t const_extent, size_t axis)
: m_const_extent{const_extent}, m_axis{axis} {}
PaddingEmitter(const megdnn::NamedTensorShape& padshp, size_t const_extent,
size_t axis)
: m_padshp{padshp}, m_const_extent{const_extent}, m_axis{axis} {}
EmitResult emit() const override;
private:
megdnn::NamedTensorShape m_padshp;
size_t m_const_extent, m_axis;
};
......
@@ -17,6 +17,11 @@
namespace mgb {
namespace gopt {
enum class TensorType : uint32_t {
FEATURE = 0,
WEIGHT = 1,
};
enum class TensorFormats : uint32_t {
// input tensor formats
NCHW = 0, ///< [N, C, H, W]
@@ -116,6 +121,15 @@ public:
private:
ReformatCache m_cache;
};
TensorShape make_aligned_tensor_shape(const VarNode* var,
TensorFormats orig_formats,
TensorFormats target_formats);
TensorShape make_aligned_weight_shape(const VarNode* var,
TensorFormats orig_formats,
TensorFormats target_formats,
TensorFormats extra_formats);
} // namespace gopt
} // namespace mgb
......
@@ -20,6 +20,7 @@ class GraphPartition {
public:
using VarNodeSet = ThinHashSet<VarNode*>;
using OperatorNodeSet = ThinHashSet<cg::OperatorNodeBase*>;
class InputPlaceholder;
GraphPartition() = default;
@@ -45,13 +46,13 @@ private:
class SubGraphExtractor {
public:
using OprList = ThinHashSet<Typeinfo*>;
SubGraphExtractor(OprList opr_list) : m_opr_list{opr_list} {};
SubGraphExtractor(const OprList& opr_list) : m_opr_list{opr_list} {};
std::vector<GraphPartition> extract(
const SymbolVarArray& endpoint_vars) const;
private:
class Impl;
OprList m_opr_list;
const OprList& m_opr_list;
};
} // namespace gopt
......
/**
* \file src/gopt/test/profiler.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "./helper.h"
#include "megbrain/gopt/global_layout_transform.h"
#include "megbrain/gopt/inference.h"
#include "megbrain/opr/dnn/pooling.h"
#include "megbrain/opr/imgproc.h"
#include "megbrain/opr/nn_int.h"
#include "megbrain/serialization/serializer.h"
using namespace mgb;
using namespace gopt;
using namespace serialization;
namespace {
class LayoutTransformContext : public NonCopyableObj {
public:
using OprList = SubGraphExtractor::OprList;
using OprFormat = Problem::OprFormat;
using OprConfigTrait = Problem::OprConfigTrait;
LayoutTransformContext() = delete;
LayoutTransformContext(OprList opr_list,
SmallVector<TensorFormats> available_tensor_formats,
OprConfigTrait opr_configs)
: m_opr_list{std::move(opr_list)},
m_available_tensor_formats{std::move(available_tensor_formats)},
m_opr_configs{std::move(opr_configs)} {}
const OprList& opr_list() const { return m_opr_list; }
const SmallVector<TensorFormats>& available_tensor_formats() const {
return m_available_tensor_formats;
}
const OprConfigTrait& opr_configs() const { return m_opr_configs; }
static std::unique_ptr<LayoutTransformContext> make() {
OprList opr_list = {
opr::ConvBiasForward::typeinfo(),
opr::ConvolutionForward::typeinfo(),
opr::ConvolutionBackwardData::typeinfo(),
opr::ElemwiseMultiType::typeinfo(),
opr::Elemwise::typeinfo(),
opr::TypeCvt::typeinfo(),
opr::PoolingForward::typeinfo(),
opr::WarpPerspectiveForward::typeinfo(),
};
OprConfigTrait opr_configs;
{
auto& dispatchers = opr_configs[opr::ConvBias::typeinfo()];
#define cb(_fmt) \
dispatchers[OprFormat::_fmt] = \
OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
opr::ConvBias::typeinfo(), OprFormat::_fmt);
cb(NCHW4);
cb(NCHW32);
cb(NHWC);
cb(NCHW64);
cb(CHWN4);
#undef cb
}
{
auto& dispatchers =
opr_configs[opr::ConvolutionBackwardData::typeinfo()];
#define cb(_fmt) \
dispatchers[OprFormat::_fmt] = \
OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
opr::ConvolutionBackwardData::typeinfo(), \
OprFormat::_fmt);
cb(NCHW4);
#undef cb
}
{
auto& dispatchers =
opr_configs[opr::ConvolutionForward::typeinfo()];
#define cb(_fmt) \
dispatchers[OprFormat::_fmt] = \
OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
opr::ConvolutionForward::typeinfo(), OprFormat::_fmt);
cb(NCHW4);
#undef cb
}
{
auto& dispatchers = opr_configs[opr::PoolingForward::typeinfo()];
#define cb(_fmt) \
dispatchers[OprFormat::_fmt] = \
OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
opr::PoolingForward::typeinfo(), OprFormat::_fmt);
cb(NCHW4);
cb(NCHW32);
cb(NHWC);
cb(NCHW64);
cb(CHWN4);
#undef cb
}
{
auto& dispatchers =
opr_configs[opr::WarpPerspectiveForward::typeinfo()];
#define cb(_fmt) \
dispatchers[OprFormat::_fmt] = \
OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
opr::WarpPerspectiveForward::typeinfo(), OprFormat::_fmt);
cb(NHWC);
cb(NCHW4);
cb(NCHW64);
#undef cb
}
SmallVector<TensorFormats> available_tensor_formats = {
TensorFormats::NHWC, TensorFormats::NCHWc4,
TensorFormats::NCHWc32, TensorFormats::NCHWc64};
return std::make_unique<LayoutTransformContext>(
std::move(opr_list), std::move(available_tensor_formats),
std::move(opr_configs));
}
private:
OprList m_opr_list;
SmallVector<TensorFormats> m_available_tensor_formats;
OprConfigTrait m_opr_configs;
};
}; // namespace
#if MGB_CUDA
#if CUDA_VERSION >= 10020
TEST(TestProfiler, Conv) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
auto ctx = LayoutTransformContext::make();
HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
auto mkvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
dtype);
};
auto mkcvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
.rename(name),
dtype);
};
auto x = mkvar("x", {64, 48, 14, 14},
dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));
auto w1 = mkcvar("w1", {48, 48, 3, 3}, dtype::QuantizedS4(2.5f));
auto b1 = mkcvar("b1", {1, 48, 1, 1}, dtype::QuantizedS32(6.25f));
opr::ConvBias::Param param;
param.format = opr::ConvBias::Param::Format::NCHW;
param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
param.stride_h = param.stride_w = 1;
param.pad_h = param.pad_w = 1;
auto c1 = opr::ConvBias::make(x, w1, b1, param, {},
OperatorNodeConfig(dtype::Quantized4Asymm(
12.345f, static_cast<uint8_t>(5))));
x = opr::TypeCvt::make(c1, dtype::QuantizedS8(12.345f));
auto w2 = mkcvar("w2", {48, 48, 3, 3}, dtype::QuantizedS8(2.5f));
auto b2 = mkcvar("b2", {1, 48, 1, 1}, dtype::QuantizedS32(12.345f * 2.5f));
auto c2 = opr::ConvBias::make(x, w2, b2, param, {},
OperatorNodeConfig(dtype::QuantizedS8(2.5f)));
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
S strategy = S::PROFILE;
gopt::modify_opr_algo_strategy_inplace({c2}, strategy);
using OprFormat = OprTensorFormatsConfiguration::OprFormat;
SubGraphExtractor extractor(ctx->opr_list());
auto partitions = extractor.extract({c2});
ASSERT_EQ(partitions.size(), 1u);
using Attribute = Problem::Attribute;
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
Problem problem(partitions[0], ctx->available_tensor_formats(),
ctx->opr_configs(), attribute);
auto profiler = ProfilerBase::make_profiler();
auto rst = profiler->profile(problem);
const auto& opr_rst = rst.opr_record;
const auto& var_rst = rst.var_record;
EXPECT_TRUE(opr_rst.count(c1.node()->owner_opr()) > 0);
EXPECT_TRUE(opr_rst.count(c2.node()->owner_opr()) > 0);
EXPECT_TRUE(opr_rst.count(x.node()->owner_opr()) > 0);
EXPECT_TRUE(var_rst.count(w1.node()) == 0);
EXPECT_TRUE(var_rst.count(b1.node()) == 0);
EXPECT_TRUE(var_rst.count(w2.node()) == 0);
EXPECT_TRUE(var_rst.count(b2.node()) == 0);
}
#endif
TEST(TestProfiler, Deconv) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
auto ctx = LayoutTransformContext::make();
HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
auto mkvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
dtype);
};
auto mkcvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
.rename(name),
dtype);
};
auto x = mkvar("x", {64, 10, 7, 7}, dtype::QuantizedS8(2.5f));
auto w1 = mkcvar("w1", {10, 10, 2, 2}, dtype::QuantizedS8(2.5f));
using Param = opr::ConvolutionBackwardData::Param;
Param param;
param.format = opr::ConvolutionBackwardData::Param::Format::NCHW;
param.stride_h = param.stride_w = 2;
param.pad_h = param.pad_w = 0;
auto c1 = opr::ConvolutionBackwardData::make(
w1, x, param, {}, OperatorNodeConfig(dtype::QuantizedS8(2.5f)));
auto w2 = mkcvar("w2", {10, 10, 2, 2}, dtype::QuantizedS8(2.5f));
auto c2 = opr::ConvolutionBackwardData::make(
w2, c1, param, {}, OperatorNodeConfig(dtype::QuantizedS8(2.5f)));
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
S strategy = S::PROFILE;
gopt::modify_opr_algo_strategy_inplace({c2}, strategy);
using OprFormat = OprTensorFormatsConfiguration::OprFormat;
SubGraphExtractor extractor(ctx->opr_list());
auto partitions = extractor.extract({c2});
ASSERT_EQ(partitions.size(), 1u);
using Attribute = Problem::Attribute;
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
Problem problem(partitions[0], ctx->available_tensor_formats(),
ctx->opr_configs(), attribute);
auto profiler = ProfilerBase::make_profiler();
auto rst = profiler->profile(problem);
const auto& opr_rst = rst.opr_record;
const auto& var_rst = rst.var_record;
EXPECT_TRUE(opr_rst.count(c1.node()->owner_opr()) > 0);
EXPECT_TRUE(opr_rst.count(c2.node()->owner_opr()) > 0);
EXPECT_TRUE(opr_rst.count(x.node()->owner_opr()) > 0);
EXPECT_TRUE(var_rst.count(w1.node()) == 0);
EXPECT_TRUE(var_rst.count(w2.node()) == 0);
}
TEST(TestProfiler, Warp) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
auto ctx = LayoutTransformContext::make();
constexpr size_t INP_H = 10, INP_W = 10, N = 16;
HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
auto mkvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
dtype);
};
auto x = mkvar("x", {N, 48, INP_H, INP_W},
dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));
float value1 = M_PI, value2 = 0.6;
auto gen_mat = [&](HostTensorND& mat) {
auto ptr = mat.ptr<float>();
for (size_t i = 0; i < N; ++i) {
auto rot = value1, scale = value2, sheer = value1, dy = value2,
dx = value2, ky = value2, kx = value2, kb = value2;
ptr[0] = ptr[4] = cos(rot) * scale;
ptr[1] = -(ptr[3] = sin(rot) * scale);
ptr[3] *= sheer;
ptr[4] *= sheer;
ptr[2] = dx;
ptr[5] = dy;
ptr[6] = kx;
ptr[7] = ky;
ptr[8] = kb;
ptr += 9;
}
mgb_assert(ptr == mat.ptr<float>() + mat.shape().total_nr_elems());
};
auto mat_host = std::make_shared<HostTensorND>(
x.node()->comp_node(), TensorShape{N, 3, 3}, dtype::Float32());
gen_mat(*mat_host);
auto mat = opr::Host2DeviceCopy::make(*graph, mat_host).rename("mat");
TensorShape out_shp{20, 20};
auto w1 = opr::WarpPerspectiveForward::make(x, mat, out_shp);
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
S strategy = S::PROFILE;
gopt::modify_opr_algo_strategy_inplace({w1}, strategy);
using OprFormat = OprTensorFormatsConfiguration::OprFormat;
SubGraphExtractor extractor(ctx->opr_list());
auto partitions = extractor.extract({w1});
ASSERT_EQ(partitions.size(), 1u);
using Attribute = Problem::Attribute;
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
Problem problem(partitions[0], ctx->available_tensor_formats(),
ctx->opr_configs(), attribute);
auto profiler = ProfilerBase::make_profiler();
auto rst = profiler->profile(problem);
const auto& opr_rst = rst.opr_record;
const auto& var_rst = rst.var_record;
EXPECT_TRUE(opr_rst.count(w1.node()->owner_opr()) > 0);
EXPECT_TRUE(var_rst.count(mat.node()) == 0);
EXPECT_TRUE(var_rst.count(w1.node()->owner_opr()->input(2)) == 0);
EXPECT_TRUE(var_rst.count(w1.node()->owner_opr()->input(0)) > 0);
}
TEST(TestProfiler, Pooling) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
auto ctx = LayoutTransformContext::make();
HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
auto mkvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
dtype);
};
auto x = mkvar("x", {64, 64, 55, 55},
dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));
using Param = opr::Pooling::Param;
Param param;
param.format = Param::Format::NCHW;
auto p1 = opr::Pooling::make(x, param);
x = opr::TypeCvt::make(p1, dtype::QuantizedS8(12.345f));
auto p2 = opr::Pooling::make(x, param);
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
S strategy = S::PROFILE;
gopt::modify_opr_algo_strategy_inplace({p2}, strategy);
using OprFormat = OprTensorFormatsConfiguration::OprFormat;
SubGraphExtractor extractor(ctx->opr_list());
auto partitions = extractor.extract({p2});
ASSERT_EQ(partitions.size(), 1u);
using Attribute = Problem::Attribute;
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
Problem problem(partitions[0], ctx->available_tensor_formats(),
ctx->opr_configs(), attribute);
auto profiler = ProfilerBase::make_profiler();
auto rst = profiler->profile(problem);
const auto& opr_rst = rst.opr_record;
EXPECT_TRUE(opr_rst.count(p1.node()->owner_opr()) > 0);
EXPECT_TRUE(opr_rst.count(p2.node()->owner_opr()) > 0);
EXPECT_TRUE(opr_rst.count(x.node()->owner_opr()) > 0);
}
TEST(TestProfiler, Elemwise) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
auto ctx = LayoutTransformContext::make();
HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
auto mkvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
dtype);
};
auto a = mkvar("a", {64, 48, 14, 14}, dtype::Float32());
auto b = mkvar("b", {1, 48, 1, 1}, dtype::Float32());
auto c = opr::Elemwise::make({a, b},
{opr::Elemwise::Param::Mode::FUSE_ADD_RELU});
auto q4c = opr::TypeCvt::make(
c, dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));
auto q8a = mkvar("q8a", {64, 48, 14, 14}, dtype::QuantizedS8(2.5f));
auto q8b = mkvar("q8b", {64, 48, 14, 14}, dtype::QuantizedS8(1.2f));
auto q8d = opr::ElemwiseMultiType::make(
{q8a, q8b}, {opr::ElemwiseMultiType::Param::Mode::QFUSE_ADD_RELU},
OperatorNodeConfig(dtype::QuantizedS8(12.f)));
auto q4d = opr::TypeCvt::make(
q8d, dtype::Quantized4Asymm(1.2f, static_cast<uint8_t>(3)));
auto q4e = opr::ElemwiseMultiType::make(
{q4c, q4d}, {opr::ElemwiseMultiType::Param::Mode::QADD},
OperatorNodeConfig(
dtype::Quantized4Asymm(13.f, static_cast<uint8_t>(4))));
using OprFormat = OprTensorFormatsConfiguration::OprFormat;
SubGraphExtractor extractor(ctx->opr_list());
auto partitions = extractor.extract({q4e});
ASSERT_EQ(partitions.size(), 1u);
using Attribute = Problem::Attribute;
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
Problem problem(partitions[0], ctx->available_tensor_formats(),
ctx->opr_configs(), attribute);
auto profiler = ProfilerBase::make_profiler();
auto rst = profiler->profile(problem);
const auto& opr_rst = rst.opr_record;
const auto& var_rst = rst.var_record;
EXPECT_TRUE(opr_rst.count(c.node()->owner_opr()) > 0);
EXPECT_TRUE(opr_rst.count(q8d.node()->owner_opr()) > 0);
EXPECT_TRUE(opr_rst.count(q4e.node()->owner_opr()) > 0);
EXPECT_TRUE(var_rst.count(a.node()) > 0);
EXPECT_TRUE(var_rst.count(b.node()) > 0);
EXPECT_TRUE(var_rst.count(q8a.node()) > 0);
EXPECT_TRUE(var_rst.count(q8b.node()) > 0);
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
@@ -447,8 +447,6 @@ TEST(TestReformatManager, AutoAlignedFeatureProfiling) {
for (size_t i = 0; i < RUNS; ++i)
func->execute();
double time_profiler = profiler->duration() * 1e6;
printf("%f, %f\n", time_profiler, time_cuda_evt);
ASSERT_EQ(time_cuda_evt, time_profiler);
MGB_CUDA_CHECK(cudaEventDestroy(evt0));
MGB_CUDA_CHECK(cudaEventDestroy(evt1));
}
......