Commit c14e5719 authored by Megvii Engine Team

feat(mgb/gopt): add profile impl for global layout transform pass

GitOrigin-RevId: 8ef62baf792c97c7a226dd791af167ab2e8707b4
Parent 9c0a17d0
@@ -166,6 +166,13 @@ void aarch64::RelayoutForwardImpl::exec(_megdnn_tensor_in src0,
TensorND src = src0, dst = dst0;
check_layout_and_canonize(src.layout, dst.layout);
// FIXME: optimize for lowbit cases
if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS4 ||
src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
fallback::RelayoutForwardImpl::exec(src0, dst0, src_handle);
return;
}
relayout::TransposeParam trans_param;
bool trans = relayout::is_transpose(src.layout, dst.layout, trans_param);
if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 1) {
......
@@ -134,6 +134,13 @@ void armv7::RelayoutForwardImpl::exec(_megdnn_tensor_in src0,
TensorND src = src0, dst = dst0;
check_layout_and_canonize(src.layout, dst.layout);
// FIXME: optimize for lowbit cases
if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS4 ||
src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
fallback::RelayoutForwardImpl::exec(src0, dst0, src_handle);
return;
}
relayout::TransposeParam trans_param;
bool trans = relayout::is_transpose(src.layout, dst.layout, trans_param);
if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 1) {
......
/**
* \file src/gopt/impl/opr_format_modifier.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "./opr_format_modifier.h"
#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/opr/dnn/pooling.h"
#include "megbrain/opr/imgproc.h"
#include "megbrain/opr/io.h"
#include "megbrain/serialization/sereg.h"
#include "midout.h"
MIDOUT_DECL(megbrain_opr_format_modifier)
#define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_format_modifier, __VA_ARGS__) {
#define MIDOUT_E \
} \
MIDOUT_END();
using namespace mgb;
using namespace opr;
namespace {
template <class MegDNNConv = megdnn::Convolution>
struct MakeConvCaller2 {
template <typename Opr>
static VarNode* make(const cg::VarNodeArray& inputs,
const typename MegDNNConv::Param& param,
const megdnn::param::ExecutionPolicy& execution_policy,
const OperatorNodeConfig& config) {
if (inputs.size() == 2) {
return Opr::make(inputs[0], inputs[1], param, execution_policy,
config)
.node();
}
return nullptr;
}
};
template <class MegDNNConv = megdnn::Convolution>
struct MakeConvCaller3 {
template <typename Opr>
static VarNode* make(const cg::VarNodeArray& inputs,
const typename MegDNNConv::Param& param,
const megdnn::param::ExecutionPolicy& execution_policy,
const OperatorNodeConfig& config) {
if (inputs.size() == 3) {
return Opr::make(inputs[0], inputs[1], inputs[2], param,
execution_policy, config)
.node();
}
return nullptr;
}
};
template <class MegDNNConv = megdnn::Convolution>
struct MakeConvCaller4 {
template <typename Opr>
static VarNode* make(const cg::VarNodeArray& inputs,
const typename MegDNNConv::Param& param,
const megdnn::param::ExecutionPolicy& execution_policy,
const OperatorNodeConfig& config) {
if (inputs.size() == 4) {
return Opr::make(inputs[0], inputs[1], inputs[2], inputs[3], param,
execution_policy, config)
.node();
}
return nullptr;
}
};
template <class MegDNNConv = megdnn::Convolution>
struct MakeConvCaller5 {
template <typename Opr>
static VarNode* make(const cg::VarNodeArray& inputs,
const typename MegDNNConv::Param& param,
const megdnn::param::ExecutionPolicy& execution_policy,
const OperatorNodeConfig& config) {
if (inputs.size() == 5) {
return Opr::make(inputs[0], inputs[1], inputs[2], inputs[3],
inputs[4], param, execution_policy, config)
.node();
}
return nullptr;
}
};
template <class MegDNNConv = megdnn::Convolution>
struct MakeConvCallerEmpty {
template <typename Opr>
static VarNode* make(const cg::VarNodeArray&,
const typename MegDNNConv::Param&,
const megdnn::param::ExecutionPolicy&,
const OperatorNodeConfig&) {
return nullptr;
}
};
template <class Opr, class Maker0, class MegDNNConv,
class Maker1 = MakeConvCallerEmpty<MegDNNConv>,
class Maker2 = MakeConvCallerEmpty<MegDNNConv>,
typename ConvParam = megdnn::param::Convolution>
struct ConvMakerImpl {
static VarNode* make(const cg::VarNodeArray& inputs, const ConvParam& param,
const megdnn::param::ExecutionPolicy& execution_policy,
const OperatorNodeConfig& config) {
VarNode* ret = Maker0::template make<Opr>(inputs, param,
execution_policy, config);
if (!ret) {
ret = Maker1::template make<Opr>(inputs, param, execution_policy,
config);
}
if (!ret) {
ret = Maker2::template make<Opr>(inputs, param, execution_policy,
config);
}
mgb_assert(ret);
return ret;
}
};
template <typename Opr>
struct ConvMaker;
template <>
struct ConvMaker<opr::Convolution>
: public ConvMakerImpl<opr::Convolution,
MakeConvCaller2<megdnn::Convolution>,
megdnn::Convolution> {};
template <>
struct ConvMaker<opr::ConvolutionBackwardData>
: public ConvMakerImpl<opr::ConvolutionBackwardData,
MakeConvCaller2<megdnn::Convolution>,
megdnn::Convolution,
MakeConvCaller3<megdnn::Convolution>> {};
template <>
struct ConvMaker<opr::ConvBiasForward>
: public ConvMakerImpl<opr::ConvBiasForward,
MakeConvCaller2<megdnn::ConvBiasForward>,
megdnn::ConvBiasForward,
MakeConvCaller3<megdnn::ConvBiasForward>,
MakeConvCaller4<megdnn::ConvBiasForward>,
megdnn::param::ConvBias> {};
template <>
struct ConvMaker<opr::BatchConvBiasForward>
: public ConvMakerImpl<opr::BatchConvBiasForward,
MakeConvCaller2<megdnn::BatchConvBiasForward>,
megdnn::BatchConvBiasForward,
MakeConvCaller3<megdnn::BatchConvBiasForward>,
MakeConvCaller4<megdnn::BatchConvBiasForward>,
megdnn::param::BatchConvBias> {};
#if 0
#include "../../opr/impl/internal/invoke.h"
template <typename Opr>
struct MultiAlgoOprTrait;
#define APPLY(statement, ...) \
mgb::apply([&](const auto&... args) { return statement; }, \
std::tuple_cat(__VA_ARGS__))
#define INST(_Opr) \
template <> \
struct MultiAlgoOprTrait<_Opr> { \
static constexpr bool has_algo = true; \
using MegDNNOpr = megdnn::_Opr; \
static constexpr int arity = OprArityTrait<MegDNNOpr>::arity; \
using FixedTensorLayouts = std::array<TensorLayout, arity>; \
static bool has_available_algo(const VarNodeArray& i, \
const cg::OperatorNodeBase* opr_) { \
MIDOUT_B(midout_iv(MGB_HASH_STR(#_Opr)), \
midout_iv(MGB_HASH_STR("has_available_algo"))) \
auto&& opr = opr_->cast_final_safe<_Opr>(); \
auto&& megdnn_opr = \
reinterpret_cast<MegDNNOpr*>(opr.megdnn_opr()); \
FixedTensorLayouts array_layouts; \
size_t in = i.size() - 1; \
for (size_t idx = 0; idx < in; idx++) { \
const auto& v = i[idx]; \
array_layouts[idx] = \
TensorLayout{v->shape(), v->dtype(), v->format()}; \
} \
const auto& v = i[in]; \
array_layouts[arity - 1] = \
TensorLayout{v->shape(), v->dtype(), v->format()}; \
return APPLY(::megdnn::has_available_algo(megdnn_opr, args...), \
array_layouts); \
MIDOUT_E \
} \
};
INST(Convolution)
INST(ConvBiasForward)
INST(ConvolutionBackwardData)
INST(PoolingForward)
#undef APPLY
#undef INST
#endif
} // namespace
namespace mgb {
namespace gopt {
namespace intl {
template <typename Opr>
struct OprFormatModifier;
#define INST(_Opr) \
template <> \
struct OprFormatModifier<_Opr> { \
using OprFormat = typename _Opr::Param::Format; \
static VarNode* make(OprFormat opr_format, const VarNodeArray& i, \
const cg::OperatorNodeBase* opr_) { \
MIDOUT_B(_Opr) \
auto&& opr = opr_->cast_final_safe<_Opr>(); \
auto param = opr.param(); \
param.format = opr_format; \
return ConvMaker<_Opr>::make(i, param, opr.execution_policy(), \
opr.config()); \
MIDOUT_E \
} \
};
INST(Convolution);
INST(ConvBiasForward);
INST(ConvolutionBackwardData);
INST(BatchConvBiasForward);
#undef INST
template <>
struct OprFormatModifier<WarpPerspective> {
using Opr = opr::WarpPerspective;
using OprFormat = typename Opr::Param::Format;
static VarNode* make(OprFormat opr_format, const VarNodeArray& i,
const cg::OperatorNodeBase* opr_) {
MIDOUT_B(Opr)
auto&& opr = opr_->cast_final_safe<Opr>();
auto param = opr.param();
param.format = opr_format;
if (i.size() == 3) {
return Opr::make(i[0], i[1], i[2], param, opr.config()).node();
} else {
mgb_assert(i.size() == 4);
return Opr::make(i[0], i[1], i[2], i[3], param, opr.config())
.node();
}
MIDOUT_E
}
};
#define INST(_Opr, _arity) \
template <> \
struct OprFormatModifier<_Opr> { \
using OprFormat = typename _Opr::Param::Format; \
static VarNode* make(OprFormat opr_format, const VarNodeArray& i, \
const cg::OperatorNodeBase* opr_) { \
MIDOUT_B(_Opr) \
auto&& opr = opr_->cast_final_safe<_Opr>(); \
auto param = opr.param(); \
param.format = opr_format; \
return serialization::OprMaker<_Opr, _arity>::make( \
param, i, *i[0]->owner_graph(), opr.config()) \
->output(0); \
MIDOUT_E \
} \
};
INST(PoolingForward, 1);
INST(Resize, 2);
#undef INST
VarNode* modify_opr_format(opr::ConvBias::Param::Format opr_format,
const VarNodeArray& i,
const cg::OperatorNodeBase* opr) {
#define cb(_Opr) \
if (opr->dyn_typeinfo() == _Opr::typeinfo()) { \
return OprFormatModifier<_Opr>::make(opr_format, i, opr); \
} else
FOREACH_FORMAT_AWARE_OPR(cb) {
mgb_throw(InternalError, "invalid format aware operator(got:%s)",
opr->dyn_typeinfo()->name);
}
#undef cb
}
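// A minimal usage sketch (hypothetical driver code, not part of this commit):
// given a format-aware operator `opr` and its (possibly reformatted) inputs
// `inps`, the pass rebuilds the operator under another format while keeping
// its param, execution policy and config:
//
//     VarNode* new_out = intl::modify_opr_format(
//             opr::ConvBias::Param::Format::NCHW4, inps, opr);
//
// Operators outside FOREACH_FORMAT_AWARE_OPR hit the InternalError above.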
#if 0
bool has_available_algo(const VarNodeArray& i,
const cg::OperatorNodeBase* opr) {
#define cb(_Opr) \
if (opr->dyn_typeinfo() == _Opr::typeinfo()) { \
MGB_MARK_USED_VAR(MultiAlgoOprTrait<_Opr>::has_algo); \
VarNodeArray _ = i; \
_.emplace_back(opr->output(0)); \
return MultiAlgoOprTrait<_Opr>::has_available_algo(_, opr); \
} else
cb(Convolution) cb(ConvBiasForward) cb(ConvolutionBackwardData)
cb(PoolingForward) {
mgb_throw(InternalError, "invalid multi-algo operator(got:%s)",
opr->dyn_typeinfo()->name);
}
}
#endif
} // namespace intl
} // namespace gopt
} // namespace mgb
// vim: syntax=cpp.doxygen
/**
* \file src/gopt/impl/opr_format_modifier.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "megbrain/graph.h"
#include "megbrain/opr/dnn/convolution.h"
namespace mgb {
namespace gopt {
namespace intl {
#define FOREACH_FORMAT_AWARE_OPR(cb) \
cb(Convolution) cb(ConvBiasForward) cb(ConvolutionBackwardData) \
cb(PoolingForward) cb(WarpPerspective) cb(Resize)
#if 0
bool has_available_algo(const VarNodeArray& i, const cg::OperatorNodeBase* opr);
#endif
VarNode* modify_opr_format(opr::ConvBias::Param::Format opr_format,
const VarNodeArray& i,
const cg::OperatorNodeBase* opr);
} // namespace intl
} // namespace gopt
} // namespace mgb
// vim: syntax=cpp.doxygen
/**
* \file src/gopt/impl/opr_tensor_formats_config.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "./utils.h"
#include "megbrain/gopt/global_layout_transform.h"
#include "megbrain/opr/dnn/pooling.h"
#include "megbrain/opr/imgproc.h"
#include "midout.h"
MIDOUT_DECL(megbrain_opr_tensor_formats_config)
#define MIDOUT_B(...) \
MIDOUT_BEGIN(megbrain_opr_tensor_formats_config, __VA_ARGS__) {
#define MIDOUT_E \
} \
MIDOUT_END();
using namespace mgb;
using namespace cg;
using namespace gopt;
using OprFormat = opr::ConvBias::Param::Format;
namespace {
template <typename Opr>
struct ConvParamTrait;
#define INST(_conv, _weight_idx, _bias_idx, _has_bias) \
template <> \
struct ConvParamTrait<opr::_conv> { \
static constexpr int weight_idx = _weight_idx; \
static constexpr int bias_idx = _bias_idx; \
static constexpr bool has_bias = _has_bias; \
}
INST(ConvBias, 1, 2, true);
INST(ConvolutionForward, 1, 0, false);
INST(ConvolutionBackwardData, 0, 0, false);
template <typename Opr, size_t weight_idx = ConvParamTrait<Opr>::weight_idx>
static bool is_channel_wise_conv(const OperatorNodeBase* opr) {
MGB_MARK_USED_VAR(ConvParamTrait<Opr>::has_bias);
MGB_MARK_USED_VAR(ConvParamTrait<Opr>::bias_idx);
auto&& conv = opr->cast_final_safe<Opr>();
auto format = conv.param().format;
auto weight = opr->input(weight_idx);
auto weight_shp = weight->shape();
if (conv.param().sparse == Opr::Param::Sparse::DENSE)
return false;
size_t ocpg, icpg;
if (format == Opr::Param::Format::NCHW) {
ocpg = weight_shp[1], icpg = weight_shp[2];
return ocpg == 1 && icpg == 1;
}
return false;
}
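// Illustrative note (not from this commit): for a GROUP convolution in NCHW
// format the weight layout is (G, ocpg, icpg, FH, FW); a channel-wise
// (depthwise) conv has ocpg == icpg == 1, which is exactly what the check
// on weight_shp[1] and weight_shp[2] above detects.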
template <OprFormat opr_format_>
struct OprSingleInOutTensorFormatsDispatcherImpl;
template <>
struct OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::NCHW> {
static Maybe<OprTensorFormatsConfiguration> dispatch(
const OperatorNodeBase* opr) {
OprTensorFormatsConfiguration config;
config.typeinfo = opr->dyn_typeinfo();
config.opr_format = OprFormat::NCHW;
config.input_dtypes = {opr->input(0)->dtype().enumv()};
config.input_tensor_types = {TensorType::FEATURE};
config.output_dtypes = {opr->output(0)->dtype().enumv()};
config.input_tensor_formats = {TensorFormats::NCHW};
config.output_tensor_formats = {TensorFormats::NCHW};
return config;
}
};
template <>
struct OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::NCHW4> {
static Maybe<OprTensorFormatsConfiguration> dispatch(
const OperatorNodeBase* opr) {
OprTensorFormatsConfiguration config;
config.typeinfo = opr->dyn_typeinfo();
config.opr_format = OprFormat::NCHW4;
bool available = true;
available &= opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
config.input_dtypes = {opr->input(0)->dtype().enumv()};
config.input_tensor_types = {TensorType::FEATURE};
available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
config.output_dtypes = {opr->output(0)->dtype().enumv()};
config.input_tensor_formats = {TensorFormats::NCHWc4};
config.output_tensor_formats = {TensorFormats::NCHWc4};
if (available)
return config;
return None;
}
};
template <>
struct OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::CHWN4> {
static Maybe<OprTensorFormatsConfiguration> dispatch(
const OperatorNodeBase* opr) {
OprTensorFormatsConfiguration config;
config.typeinfo = opr->dyn_typeinfo();
config.opr_format = OprFormat::CHWN4;
bool available = true;
available &= opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
config.input_dtypes = {opr->input(0)->dtype().enumv()};
config.input_tensor_types = {TensorType::FEATURE};
available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
config.output_dtypes = {opr->output(0)->dtype().enumv()};
config.input_tensor_formats = {TensorFormats::CHWNc4};
config.output_tensor_formats = {TensorFormats::CHWNc4};
if (available)
return config;
return None;
}
};
template <>
struct OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::NCHW32> {
static Maybe<OprTensorFormatsConfiguration> dispatch(
const OperatorNodeBase* opr) {
OprTensorFormatsConfiguration config;
config.typeinfo = opr->dyn_typeinfo();
config.opr_format = OprFormat::NCHW32;
bool available = true;
available &= opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
config.input_dtypes = {opr->input(0)->dtype().enumv()};
config.input_tensor_types = {TensorType::FEATURE};
available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
config.output_dtypes = {opr->output(0)->dtype().enumv()};
config.input_tensor_formats = {TensorFormats::NCHWc32};
config.output_tensor_formats = {TensorFormats::NCHWc32};
if (available)
return config;
return None;
}
};
template <>
struct OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::NHWC> {
static Maybe<OprTensorFormatsConfiguration> dispatch(
const OperatorNodeBase* opr) {
OprTensorFormatsConfiguration config;
config.typeinfo = opr->dyn_typeinfo();
config.opr_format = OprFormat::NHWC;
bool available = true;
available &=
opr->input(0)->dtype().enumv() == DTypeEnum::Quantized4Asymm ||
opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS4;
config.input_dtypes = {opr->input(0)->dtype().enumv()};
config.input_tensor_types = {TensorType::FEATURE};
available &= opr->output(0)->dtype().enumv() ==
opr->input(0)->dtype().enumv();
config.output_dtypes = {opr->output(0)->dtype().enumv()};
config.input_tensor_formats = {TensorFormats::NHWC};
config.output_tensor_formats = {TensorFormats::NHWC};
if (available)
return config;
return None;
}
};
template <>
struct OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::NCHW64> {
static Maybe<OprTensorFormatsConfiguration> dispatch(
const OperatorNodeBase* opr) {
OprTensorFormatsConfiguration config;
config.typeinfo = opr->dyn_typeinfo();
config.opr_format = OprFormat::NCHW64;
bool available = true;
available &=
opr->input(0)->dtype().enumv() == DTypeEnum::Quantized4Asymm ||
opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS4;
config.input_dtypes = {opr->input(0)->dtype().enumv()};
config.input_tensor_types = {TensorType::FEATURE};
available &= opr->output(0)->dtype().enumv() ==
opr->input(0)->dtype().enumv();
config.output_dtypes = {opr->output(0)->dtype().enumv()};
config.input_tensor_formats = {TensorFormats::NCHWc64};
config.output_tensor_formats = {TensorFormats::NCHWc64};
if (available)
return config;
return None;
}
};
template <typename Opr, OprFormat opr_format_>
struct ConvTensorFormatsDispatcherImpl;
template <typename Opr>
struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::NCHW> {
static Maybe<OprTensorFormatsConfiguration> dispatch(
const OperatorNodeBase* opr) {
const auto& conv = opr->cast_final_safe<Opr>();
OprTensorFormatsConfiguration config;
config.typeinfo = opr->dyn_typeinfo();
config.opr_format = OprFormat::NCHW;
// setup dtypes
for (size_t i = 0; i < opr->input().size(); ++i) {
config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv());
TensorType tensor_type =
i == 1 ? TensorType::WEIGHT : TensorType::FEATURE;
config.input_tensor_types.emplace_back(tensor_type);
}
config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
// setup tensor formats
if (conv.param().sparse == Opr::Param::Sparse::DENSE) {
config.input_tensor_formats = {
TensorFormats::NCHW, TensorFormats::NCHW,
TensorFormats::NCHW, TensorFormats::NCHW};
} else {
mgb_assert(conv.param().sparse == Opr::Param::Sparse::GROUP);
if (is_channel_wise_conv<Opr>(opr)) {
config.input_tensor_formats = {
TensorFormats::NCHW, TensorFormats::C11RS,
TensorFormats::NCHW, TensorFormats::NCHW};
} else {
config.input_tensor_formats = {
TensorFormats::NCHW, TensorFormats::GKCRS,
TensorFormats::NCHW, TensorFormats::NCHW};
}
}
config.output_tensor_formats = {TensorFormats::NCHW};
return config;
}
};
template <typename Opr>
struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::NHWC> {
static Maybe<OprTensorFormatsConfiguration> dispatch(
const OperatorNodeBase* opr) {
const auto& conv = opr->cast_final_safe<Opr>();
OprTensorFormatsConfiguration config;
config.typeinfo = opr->dyn_typeinfo();
config.opr_format = OprFormat::NHWC;
bool available = true;
for (size_t i = 0; i < opr->input().size(); ++i) {
if (i == 2)
available &= opr->input(i)->dtype().enumv() ==
DTypeEnum::QuantizedS32;
else
available &= opr->input(i)->dtype().enumv() ==
DTypeEnum::Quantized4Asymm ||
opr->input(i)->dtype().enumv() ==
DTypeEnum::QuantizedS4;
config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv());
TensorType tensor_type =
i == 1 ? TensorType::WEIGHT : TensorType::FEATURE;
config.input_tensor_types.emplace_back(tensor_type);
}
available &=
opr->output(0)->dtype().enumv() == DTypeEnum::Quantized4Asymm ||
opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS4;
config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
available &= conv.param().sparse == Opr::Param::Sparse::DENSE;
config.input_tensor_formats = {TensorFormats::NHWC, TensorFormats::NHWC,
TensorFormats::NHWC,
TensorFormats::NHWC};
config.output_tensor_formats = {TensorFormats::NHWC};
if (available)
return config;
return None;
}
};
template <typename Opr>
struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::NCHW4> {
static Maybe<OprTensorFormatsConfiguration> dispatch(
const OperatorNodeBase* opr) {
const auto& conv = opr->cast_final_safe<Opr>();
OprTensorFormatsConfiguration config;
config.typeinfo = opr->dyn_typeinfo();
config.opr_format = OprFormat::NCHW4;
bool available = true;
// setup dtypes
for (size_t i = 0; i < opr->input().size(); ++i) {
if (i == 2)
available &= opr->input(i)->dtype().enumv() ==
DTypeEnum::QuantizedS32;
else
available &= opr->input(i)->dtype().enumv() ==
DTypeEnum::QuantizedS8;
config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv());
TensorType tensor_type =
i == 1 ? TensorType::WEIGHT : TensorType::FEATURE;
config.input_tensor_types.emplace_back(tensor_type);
}
available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
// setup tensor formats
if (conv.param().sparse == Opr::Param::Sparse::DENSE) {
config.input_tensor_formats = {
TensorFormats::NCHWc4, TensorFormats::NCHWc4,
TensorFormats::NCHWc4, TensorFormats::NCHWc4};
} else {
mgb_assert(conv.param().sparse == Opr::Param::Sparse::GROUP);
if (is_channel_wise_conv<Opr>(opr)) {
config.input_tensor_formats = {
TensorFormats::NCHWc4, TensorFormats::C11RSc4,
TensorFormats::NCHWc4, TensorFormats::NCHWc4};
} else {
config.input_tensor_formats = {
TensorFormats::NCHWc4, TensorFormats::GKCRSc4,
TensorFormats::NCHWc4, TensorFormats::NCHWc4};
}
}
config.output_tensor_formats = {TensorFormats::NCHWc4};
if (available)
return config;
return None;
}
};
template <typename Opr>
struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::NCHW32> {
static Maybe<OprTensorFormatsConfiguration> dispatch(
const OperatorNodeBase* opr) {
const auto& conv = opr->cast_final_safe<Opr>();
OprTensorFormatsConfiguration config;
config.typeinfo = opr->dyn_typeinfo();
config.opr_format = OprFormat::NCHW32;
bool available = true;
for (size_t i = 0; i < opr->input().size(); ++i) {
if (i == 2)
available &= opr->input(i)->dtype().enumv() ==
DTypeEnum::QuantizedS32;
else
available &= opr->input(i)->dtype().enumv() ==
DTypeEnum::QuantizedS8;
config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv());
TensorType tensor_type =
i == 1 ? TensorType::WEIGHT : TensorType::FEATURE;
config.input_tensor_types.emplace_back(tensor_type);
}
available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
available &= conv.param().sparse == Opr::Param::Sparse::DENSE;
config.input_tensor_formats = {
TensorFormats::NCHWc32, TensorFormats::NCHWc32,
TensorFormats::NCHWc32, TensorFormats::NCHWc32};
config.output_tensor_formats = {TensorFormats::NCHWc32};
if (available)
return config;
return None;
}
};
template <typename Opr>
struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::NCHW64> {
static Maybe<OprTensorFormatsConfiguration> dispatch(
const OperatorNodeBase* opr) {
const auto& conv = opr->cast_final_safe<Opr>();
OprTensorFormatsConfiguration config;
config.typeinfo = opr->dyn_typeinfo();
config.opr_format = OprFormat::NCHW64;
bool available = true;
for (size_t i = 0; i < opr->input().size(); ++i) {
if (i == 2)
available &= opr->input(i)->dtype().enumv() ==
DTypeEnum::QuantizedS32;
else
available &= opr->input(i)->dtype().enumv() ==
DTypeEnum::Quantized4Asymm ||
opr->input(i)->dtype().enumv() ==
DTypeEnum::QuantizedS4;
config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv());
TensorType tensor_type =
i == 1 ? TensorType::WEIGHT : TensorType::FEATURE;
config.input_tensor_types.emplace_back(tensor_type);
}
available &=
opr->output(0)->dtype().enumv() == DTypeEnum::Quantized4Asymm ||
opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS4;
config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
available &= conv.param().sparse == Opr::Param::Sparse::DENSE;
config.input_tensor_formats = {
TensorFormats::NCHWc64, TensorFormats::NCHWc64,
TensorFormats::NCHWc64, TensorFormats::NCHWc64};
config.output_tensor_formats = {TensorFormats::NCHWc64};
if (available)
return config;
return None;
}
};
template <typename Opr>
struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::CHWN4> {
static Maybe<OprTensorFormatsConfiguration> dispatch(
const OperatorNodeBase* opr) {
const auto& conv = opr->cast_final_safe<Opr>();
OprTensorFormatsConfiguration config;
config.typeinfo = opr->dyn_typeinfo();
config.opr_format = OprFormat::CHWN4;
bool available = true;
for (size_t i = 0; i < opr->input().size(); ++i) {
if (i == 2)
available &= opr->input(i)->dtype().enumv() ==
DTypeEnum::QuantizedS32;
else
available &= opr->input(i)->dtype().enumv() ==
DTypeEnum::QuantizedS8;
config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv());
TensorType tensor_type =
i == 1 ? TensorType::WEIGHT : TensorType::FEATURE;
config.input_tensor_types.emplace_back(tensor_type);
}
available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
available &= conv.param().sparse == Opr::Param::Sparse::DENSE;
config.input_tensor_formats = {
TensorFormats::CHWNc4, TensorFormats::CHWNc4,
TensorFormats::CHWNc4, TensorFormats::CHWNc4};
config.output_tensor_formats = {TensorFormats::CHWNc4};
if (available)
return config;
return None;
}
};
template <>
struct ConvTensorFormatsDispatcherImpl<opr::ConvolutionBackwardData,
OprFormat::NCHW> {
using Opr = opr::ConvolutionBackwardData;
static Maybe<OprTensorFormatsConfiguration> dispatch(
const OperatorNodeBase* opr) {
const auto& conv = opr->cast_final_safe<Opr>();
OprTensorFormatsConfiguration config;
config.typeinfo = opr->dyn_typeinfo();
config.opr_format = OprFormat::NCHW;
// setup dtypes
for (size_t i = 0; i < opr->input().size(); ++i) {
config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv());
TensorType tensor_type =
i == 0 ? TensorType::WEIGHT : TensorType::FEATURE;
config.input_tensor_types.emplace_back(tensor_type);
}
config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
// setup tensor formats
if (conv.param().sparse == Opr::Param::Sparse::DENSE) {
config.input_tensor_formats = {
TensorFormats::NCHW, TensorFormats::NCHW,
TensorFormats::NCHW, TensorFormats::NCHW};
} else {
mgb_assert(conv.param().sparse == Opr::Param::Sparse::GROUP);
if (is_channel_wise_conv<Opr>(opr)) {
config.input_tensor_formats = {
TensorFormats::C11RS, TensorFormats::NCHW,
TensorFormats::NCHW, TensorFormats::NCHW};
} else {
config.input_tensor_formats = {
TensorFormats::GKCRS, TensorFormats::NCHW,
TensorFormats::NCHW, TensorFormats::NCHW};
}
}
config.output_tensor_formats = {TensorFormats::NCHW};
return config;
}
};
template <>
struct ConvTensorFormatsDispatcherImpl<opr::ConvolutionBackwardData,
OprFormat::NCHW4> {
using Opr = opr::ConvolutionBackwardData;
static Maybe<OprTensorFormatsConfiguration> dispatch(
const OperatorNodeBase* opr) {
const auto& conv = opr->cast_final_safe<Opr>();
OprTensorFormatsConfiguration config;
config.typeinfo = opr->dyn_typeinfo();
config.opr_format = OprFormat::NCHW4;
bool available = true;
for (size_t i = 0; i < opr->input().size(); ++i) {
available &=
opr->input(i)->dtype().enumv() == DTypeEnum::QuantizedS8;
config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv());
TensorType tensor_type =
i == 0 ? TensorType::WEIGHT : TensorType::FEATURE;
config.input_tensor_types.emplace_back(tensor_type);
}
available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
available &= conv.param().sparse == opr::ConvBias::Param::Sparse::DENSE;
config.input_tensor_formats = {
TensorFormats::NCHWc4, TensorFormats::NCHWc4,
TensorFormats::NCHWc4, TensorFormats::NCHWc4};
config.output_tensor_formats = {TensorFormats::NCHWc4};
if (available)
return config;
return None;
}
};
struct StaticData {
struct KeyHash {
size_t operator()(const std::pair<Typeinfo*, OprFormat>& val) const {
size_t h1 = mgb::hash<Typeinfo*>(val.first);
size_t h2 =
std::hash<uint32_t>()(static_cast<uint32_t>(val.second));
return mgb::hash_pair_combine(h1, h2);
}
};
using OprTensorFormatsDispatcher =
OprTensorFormatsConfiguration::OprTensorFormatsDispatcher;
std::unordered_map<std::pair<Typeinfo*, OprFormat>,
OprTensorFormatsDispatcher, KeyHash>
typefmt2dispatcher;
StaticData();
};
StaticData::StaticData() {
#define OPR_TENSOR_FORMATS_CONFIG_REG(_Opr, _fmt) \
typefmt2dispatcher[{opr::_Opr::typeinfo(), OprFormat::_fmt}] = \
[](const OperatorNodeBase* opr) { \
MIDOUT_B(opr::_Opr, midout_iv(OprFormat::_fmt)) \
return ConvTensorFormatsDispatcherImpl< \
opr::_Opr, OprFormat::_fmt>::dispatch(opr); \
MIDOUT_E \
}
#define OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(_Opr, _fmt) \
typefmt2dispatcher[{opr::_Opr::typeinfo(), OprFormat::_fmt}] = \
[](const OperatorNodeBase* opr) { \
MIDOUT_B(opr::_Opr, midout_iv(OprFormat::_fmt)) \
return OprSingleInOutTensorFormatsDispatcherImpl< \
OprFormat::_fmt>::dispatch(opr); \
MIDOUT_E \
}
OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW);
OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NHWC);
OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW4);
OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, CHWN4);
OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW32);
OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW64);
OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionForward, NCHW);
OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionForward, NCHW4);
OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionBackwardData, NCHW);
OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionBackwardData, NCHW4);
OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(WarpPerspectiveForward, NCHW);
OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(WarpPerspectiveForward, NHWC);
OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(WarpPerspectiveForward, NCHW4);
OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(WarpPerspectiveForward, NCHW64);
OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, NCHW);
OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, NHWC);
OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, NCHW4);
OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, CHWN4);
OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, NCHW32);
OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, NCHW64);
#undef OPR_TENSOR_FORMATS_CONFIG_REG
#undef OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG
}
StaticData& static_data() {
static StaticData inst;
return inst;
}
} // namespace
OprTensorFormatsConfiguration::OprTensorFormatsDispatcher*
OprTensorFormatsConfiguration::find_dispatcher_by_type_format(
Typeinfo* type, OprFormat opr_format) {
auto&& typefmt2dispatcher = static_data().typefmt2dispatcher;
auto iter = typefmt2dispatcher.find(std::make_pair(type, opr_format));
mgb_assert(iter != typefmt2dispatcher.end(),
"cannot find OprTensorFormatsDispatcher for opr type(%s) and "
"opr format(%s)",
type->name, opr_format_to_string(opr_format));
return &iter->second;
}
// vim: syntax=cpp.doxygen
/**
* \file src/gopt/impl/profiler_impl.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "./opr_format_modifier.h"
#include "./utils.h"
#include "megbrain/gopt/framework.h"
#include "megbrain/gopt/global_layout_transform.h"
#include "megbrain/graph/event.h"
#include "megbrain/opr/dnn/pooling.h"
#include "megbrain/opr/imgproc.h"
#include "megbrain/opr/io.h"
#include "megbrain/plugin/base.h"
#include "megbrain/serialization/sereg.h"
using namespace mgb;
using namespace cg;
using namespace opr;
using namespace gopt;
using ReformatKey = ReformatManager::ReformatKey;
namespace {
using OprFormat = Problem::OprFormat;
OprFormat tensor_formats_to_opr_format(TensorFormats tensor_format) {
switch (tensor_format) {
case TensorFormats::NCHW:
return OprFormat::NCHW;
case TensorFormats::NCHWc4:
return OprFormat::NCHW4;
case TensorFormats::NCHWc8:
return OprFormat::NCHW8;
case TensorFormats::NCHWc32:
return OprFormat::NCHW32;
case TensorFormats::NCHWc64:
return OprFormat::NCHW64;
case TensorFormats::NHWC:
return OprFormat::NHWC;
case TensorFormats::CHWNc4:
return OprFormat::CHWN4;
default:
mgb_throw(MegBrainError, "tensor format(%u) is not supported",
static_cast<uint32_t>(tensor_format));
}
}
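/*!
 * \brief a graph plugin that records CompNode events right before and after
 * the kernel of each profiled operator and accumulates the elapsed device
 * time (see duration_in_usec); which operators are measured is decided by
 * the user-supplied OprFilter
 */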
class GraphPartitionProfiler final : public PluginBase {
using CompNodeEventPtr = std::unique_ptr<CompNode::Event>;
public:
using OprFilter = thin_function<bool(OperatorNodeBase*)>;
struct OprKernEvent {
CompNodeEventPtr start, end;
};
GraphPartitionProfiler(ComputingGraph* graph, OprFilter opr_filter);
~GraphPartitionProfiler() noexcept;
float duration_in_usec() const;
private:
void record_event(CompNodeEventPtr& dest, CompNode cn) {
if (dest == nullptr)
dest = cn.create_event(CompNode::Event::NEED_TIMER);
dest->record();
}
ThinHashMap<OperatorNodeBase*, OprKernEvent> m_kern_event;
OprFilter m_opr_filter;
};
GraphPartitionProfiler::GraphPartitionProfiler(ComputingGraph* graph,
OprFilter opr_filter)
: PluginBase(graph), m_opr_filter(opr_filter) {
using namespace event;
auto on_before_kern = [this](BeforeKernel const& event) {
if (!m_opr_filter(event.opr))
return;
auto evptr = &m_kern_event[event.opr].start;
record_event(*evptr, event.comp_node);
};
auto on_after_kern = [this](AfterKernel const& event) {
if (!m_opr_filter(event.opr))
return;
auto evptr = &m_kern_event[event.opr].end;
record_event(*evptr, event.comp_node);
};
auto&& ev = graph->event();
add_event_handler(ev.register_receiver<BeforeKernel>(on_before_kern));
add_event_handler(ev.register_receiver<AfterKernel>(on_after_kern));
}
GraphPartitionProfiler::~GraphPartitionProfiler() noexcept {
auto wait = [](const CompNodeEventPtr& ev) {
if (ev)
ev->host_wait();
};
for (auto&& i : m_kern_event) {
wait(i.second.start);
wait(i.second.end);
}
}
float GraphPartitionProfiler::duration_in_usec() const {
float device_duration = 0.f;
for (auto&& kern_ev : m_kern_event) {
auto&& event = kern_ev.second;
event.end->host_wait();
device_duration += 1e6 * event.start->elapsed_time_until(*event.end);
}
return device_duration;
}
/*!
* \brief An operator that indicates its input var node is contiguous
*/
// clang-format off
MGB_DEFINE_OPR_CLASS(MarkInputContiguous, SingleCNOperatorNodeBase) //{
void scn_do_execute() override {};
void init_output_static_infer_desc() override;
void add_input_layout_constraint() override {
input(0)->add_layout_constraint_contiguous();
}
public:
MarkInputContiguous(VarNode* input, const OperatorNodeConfig& config);
static SymbolVar make(SymbolVar input, const OperatorNodeConfig& config = {});
};
// clang-format on
MGB_DYN_TYPE_OBJ_FINAL_IMPL(MarkInputContiguous);
MarkInputContiguous::MarkInputContiguous(VarNode* input,
const OperatorNodeConfig& config)
: Super(input->owner_graph(), config, "mark_contiguous", {input}) {
add_input({input});
add_output(None);
}
SymbolVar MarkInputContiguous::make(SymbolVar input,
const OperatorNodeConfig& config) {
return input.insert_single_output_opr<MarkInputContiguous>(input.node(),
config);
}
void MarkInputContiguous::init_output_static_infer_desc() {
using namespace cg::static_infer;
auto&& mgr = owner_graph()->static_infer_manager();
mgr.register_shape_infer(output(0),
ShapeInferDesc::make_identity(input(0)));
}
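// Note: in the profiling routines below the mark opr is used as the compile
// target ({{mark, {}}}); it performs no computation itself and only forces
// its input (the profiled output var) to be contiguous, so the measured time
// is dominated by the operator or reformat chain being profiled.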
} // namespace
/* ================== ProfilerImpl =================*/
class ProfilerImpl final : public ProfilerBase {
public:
ProfilerImpl(int runs = 10) : m_runs{runs} {};
~ProfilerImpl() = default;
ProfilingResult profile(const Problem& problem) const override;
private:
static constexpr float PROFILE_TIME_OUT = 1e7;
/*!
* \brief profile opr-format-agnostic operators (e.g. elemwise, elemwise multi type, typecvt)
*
* \param opr pointer to the operator node to be profiled
* \param base_format the original tensor format of the operator node.
* \param available_tensor_formats the available tensor formats
* \return the operator node record
*/
OperatorNodeRecord profile_operator(
const OperatorNodeBase* opr, TensorFormats base_format,
const SmallVector<TensorFormats>& available_tensor_formats) const;
float profile_operator(const OperatorNodeBase* opr,
TensorFormats base_format,
TensorFormats tensor_format) const;
/*!
* \brief profile opr-format-aware operators (e.g. conv, deconv, conv_bias)
*
* \param opr pointer to the operator node to be profiled
* \param base_config the tensor formats configuration of the base opr format
* \param available_configs all the available configurations
* \return the operator node record
*/
OperatorNodeRecord profile_operator(
const OperatorNodeBase* opr,
const OprTensorFormatsConfiguration& base_config,
const SmallVector<OprTensorFormatsConfiguration>& available_configs)
const;
float profile_operator(const OperatorNodeBase* opr,
const OprTensorFormatsConfiguration& base_config,
const OprTensorFormatsConfiguration& config) const;
/*!
* \brief profile layout transform of the var node
*
* \param var pointer to the var node to be profiled
* \param base_format the original tensor format in which the var node is stored
* \param available_tensor_formats the available tensor formats
* \param extra_attribute the extra attributes (options) of the problem
* \return the var node record
*/
VarNodeRecord profile_var_node(
const VarNode* var, TensorFormats base_format,
const SmallVector<TensorFormats>& available_tensor_formats,
ReformatKey::Attribute extra_attribute =
ReformatKey::Attribute::DEFAULT) const;
float profile_var_node(const VarNode* var, TensorFormats base_format,
const ReformatKey& key) const;
int m_runs; ///< number of runs per profiling measurement
};
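// A minimal usage sketch (hypothetical driver code, for illustration only):
// the profiler is obtained via ProfilerBase::make_profiler() and run on a
// layout-transform Problem; the returned ProfilingResult maps every operator
// and var node of the graph partition to its measured cost under each
// candidate format (see the mgb_log_debug dump at the end of
// ProfilerImpl::profile).
//
//     auto profiler = ProfilerBase::make_profiler();
//     auto result = profiler->profile(problem);  // `problem` built by the pass
//     for (auto&& rec : result.opr_record)
//         mgb_log_debug("%s", rec.second.to_string().c_str());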
ProfilerImpl::OperatorNodeRecord ProfilerImpl::profile_operator(
const OperatorNodeBase* opr, TensorFormats base_format,
const SmallVector<TensorFormats>& available_tensor_formats) const {
OperatorNodeRecord record;
record.opr = opr;
auto& costs = record.costs;
for (auto&& f : available_tensor_formats) {
auto opr_format = tensor_formats_to_opr_format(f);
costs[opr_format] = profile_operator(opr, base_format, f);
}
return record;
}
float ProfilerImpl::profile_operator(const OperatorNodeBase* opr,
TensorFormats base_format,
TensorFormats tensor_format) const {
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
graph->options().var_sanity_check_first_run = false;
VarNodeArray new_inps(opr->input().size());
for (size_t i = 0; i < opr->input().size(); ++i) {
auto&& var = opr->input(i);
auto&& cn = var->comp_node();
auto&& dtype = var->dtype();
auto dval = std::make_shared<DeviceTensorND>(cn, dtype);
auto aligned_tensor_shape =
make_aligned_tensor_shape(var, base_format, tensor_format);
dval->resize(aligned_tensor_shape);
auto aligned_var = opr::VolatileSharedDeviceTensor::make(*graph, dval);
new_inps[i] = aligned_var.node();
}
auto new_opr = serialization::copy_opr_shallow(
*opr, new_inps, opr->config(), {graph.get()});
auto y = new_opr->output(0);
auto mark = MarkInputContiguous::make(SymbolVar(y));
auto func = graph->compile({{mark, {}}});
auto filter = [new_opr](OperatorNodeBase* opr) { return opr == new_opr; };
auto profiler = std::make_unique<GraphPartitionProfiler>(graph.get(),
std::move(filter));
for (int i = 0; i < m_runs; ++i)
func->execute();
return profiler->duration_in_usec();
}
ProfilerImpl::OperatorNodeRecord ProfilerImpl::profile_operator(
const OperatorNodeBase* opr,
const OprTensorFormatsConfiguration& base_config,
const SmallVector<OprTensorFormatsConfiguration>& available_configs)
const {
OperatorNodeRecord record;
record.opr = opr;
auto& costs = record.costs;
for (auto&& i : available_configs) {
costs[i.opr_format] = profile_operator(opr, base_config, i);
}
return record;
}
float ProfilerImpl::profile_operator(
const OperatorNodeBase* opr,
const OprTensorFormatsConfiguration& base_config,
const OprTensorFormatsConfiguration& config) const {
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
graph->options().var_sanity_check_first_run = false;
VarNodeArray new_inps(opr->input().size());
size_t i = 0;
size_t nr_input_tensor =
std::min(config.input_tensor_formats.size(), opr->input().size());
for (; i < nr_input_tensor; ++i) {
auto&& var = opr->input(i);
auto&& cn = var->comp_node();
auto&& dtype = var->dtype();
auto dval = std::make_shared<DeviceTensorND>(cn, dtype);
TensorShape aligned_shape;
if (config.input_tensor_types[i] == TensorType::WEIGHT) {
mgb_assert(base_config.input_tensor_types[i] == TensorType::WEIGHT);
aligned_shape = make_aligned_weight_shape(
var, base_config.input_tensor_formats[i],
config.input_tensor_formats[i],
config.output_tensor_formats[0]);
} else {
mgb_assert(base_config.input_tensor_types[i] ==
config.input_tensor_types[i]);
mgb_assert(base_config.input_tensor_types[i] ==
TensorType::FEATURE);
aligned_shape = make_aligned_tensor_shape(
var, base_config.input_tensor_formats[i],
config.input_tensor_formats[i]);
}
dval->resize(aligned_shape);
auto aligned_var = opr::VolatileSharedDeviceTensor::make(*graph, dval);
new_inps[i] = aligned_var.node();
}
for (; i < opr->input().size(); ++i) {
auto&& var = opr->input(i);
auto&& cn = var->comp_node();
auto&& dtype = var->dtype();
auto hval = std::make_shared<HostTensorND>(cn, dtype);
hval->resize(var->shape());
auto cb = [&](DeviceTensorND& d) { hval->copy_from(d).sync(); };
{
auto cg = var->owner_graph();
cg->compile({{var, cb}})->execute();
}
auto imm = opr::ImmutableTensor::make(*graph, *hval);
new_inps[i] = imm.node();
}
VarNode* y = mgb::gopt::intl::modify_opr_format(config.opr_format, new_inps,
opr);
#if 0
static const ThinHashSet<Typeinfo*> multi_algo_oprs = {
opr::Convolution::typeinfo(),
opr::ConvBiasForward::typeinfo(),
opr::ConvolutionBackwardData::typeinfo(),
opr::PoolingForward::typeinfo(),
};
if (multi_algo_oprs.count(opr->dyn_typeinfo()) &&
!mgb::gopt::intl::has_available_algo(new_inps, y->owner_opr()))
return PROFILE_TIME_OUT;
#endif
auto mark = MarkInputContiguous::make(SymbolVar(y));
auto func = graph->compile({{mark, {}}});
auto new_opr = y->owner_opr();
auto filter = [&new_opr](OperatorNodeBase* opr) { return opr == new_opr; };
auto profiler = std::make_unique<GraphPartitionProfiler>(graph.get(),
std::move(filter));
for (int i = 0; i < m_runs; ++i)
func->execute();
return profiler->duration_in_usec();
}
ProfilerImpl::VarNodeRecord ProfilerImpl::profile_var_node(
const VarNode* var, TensorFormats base_format,
const SmallVector<TensorFormats>& available_tensor_formats,
ReformatKey::Attribute attribute) const {
VarNodeRecord record;
record.var = var;
auto& costs = record.costs;
for (auto&& i : available_tensor_formats) {
for (auto&& o : available_tensor_formats) {
if (i == o)
continue;
ReformatKey key{i, o, attribute, var->dtype().enumv(),
var->dtype().enumv()};
costs[{i, o}] = profile_var_node(var, base_format, key);
}
}
return record;
}
float ProfilerImpl::profile_var_node(const VarNode* var,
TensorFormats base_format,
const ReformatKey& key) const {
auto&& cn = var->comp_node();
auto&& dtype = var->dtype();
auto dval = std::make_shared<DeviceTensorND>(cn, dtype);
auto aligned_tensor_shape =
make_aligned_tensor_shape(var, base_format, key.input_format);
dval->resize(aligned_tensor_shape);
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
graph->options().var_sanity_check_first_run = false;
auto aligned_var = opr::VolatileSharedDeviceTensor::make(*graph, dval);
auto builder = ReformatManager::instance().auto_aligned_reformat_featrue(
var, base_format, key);
auto y = builder({aligned_var.node()});
ThinHashSet<OperatorNodeBase*> set;
DepOprIter iter([&set](OperatorNodeBase* opr) { set.insert(opr); });
iter.add(y->owner_opr());
iter.set_visited(aligned_var.node()->owner_opr());
auto mark = MarkInputContiguous::make(SymbolVar(y));
auto func = graph->compile({{mark, {}}});
auto filter = [&set](OperatorNodeBase* opr) { return set.count(opr) > 0; };
auto profiler = std::make_unique<GraphPartitionProfiler>(graph.get(),
std::move(filter));
for (int i = 0; i < m_runs; ++i)
func->execute();
return profiler->duration_in_usec();
}
ProfilerImpl::ProfilingResult ProfilerImpl::profile(
const Problem& problem) const {
ConstVarPropogate cvprop{ConstVarType::IMMUTABLE_AND_PARAM};
{
auto cb = [&cvprop](OperatorNodeBase* opr) { cvprop.add_opr(opr); };
DepOprIter iter{cb};
for (auto&& o : problem.graph_partition().output()) {
iter.add(o->owner_opr());
}
}
static const ThinHashMap<Typeinfo*, size_t> format_aware_input_tensors = {
#define cb(_Opr, _arity) {_Opr::typeinfo(), _arity}
cb(Convolution, 2),
cb(ConvBiasForward, 4),
cb(ConvolutionBackwardData, 2),
cb(PoolingForward, 1),
cb(WarpPerspective, 1),
cb(Resize, 1),
#undef cb
};
ThinHashSet<VarNode*> vars;
ThinHashSet<OperatorNodeBase*> oprs;
{
auto cb = [&cvprop, &vars, &oprs](OperatorNodeBase* opr) {
if (cvprop.is_const(opr))
return;
oprs.insert(opr);
auto find = format_aware_input_tensors.find(opr->dyn_typeinfo());
if (find == format_aware_input_tensors.end()) {
for (auto&& i : opr->input()) {
if (!cvprop.is_const(i)) {
vars.insert(i);
}
}
} else {
size_t nr_input_tensor =
std::min(find->second, opr->input().size());
for (size_t i = 0; i < nr_input_tensor; ++i) {
if (!cvprop.is_const(opr->input(i))) {
vars.insert(opr->input(i));
}
}
}
vars.insert(opr->output(0));
};
DepOprIter iter{cb};
for (auto&& i : problem.graph_partition().input()) {
iter.set_visited(i->owner_opr());
}
for (auto&& o : problem.graph_partition().output()) {
iter.add(o->owner_opr());
}
}
auto base_format = problem.base_format();
auto&& available_tensor_formats = problem.available_tensor_formats();
ProfilingResult profiling_result;
auto& opr_record = profiling_result.opr_record;
auto& var_record = profiling_result.var_record;
for (auto&& var : vars) {
var_record[var] =
profile_var_node(var, base_format, available_tensor_formats);
}
for (auto&& opr : oprs) {
auto&& opr_configs = problem.opr_configs();
auto find = opr_configs.find(opr->dyn_typeinfo());
if (find == opr_configs.end()) {
opr_record[opr] = profile_operator(opr, base_format,
available_tensor_formats);
} else {
auto&& dispatchers = find->second;
SmallVector<OprTensorFormatsConfiguration> configs;
for (const auto& item : dispatchers) {
auto config = (*item.second)(opr);
if (config.valid()) {
configs.emplace_back(config.val());
}
}
auto base_config = problem.base_config(opr);
opr_record[opr] = profile_operator(opr, base_config, configs);
}
}
for (auto&& rpair : opr_record) {
mgb_log_debug("%s", rpair.second.to_string().c_str());
}
for (auto&& rpair : var_record) {
mgb_log_debug("%s", rpair.second.to_string().c_str());
}
return profiling_result;
}
/* ================== ProfilerBase =================*/
std::string ProfilerBase::OperatorNodeRecord::to_string() const {
auto str = ssprintf("\nopr type: %s\nopr name: %s\ninputs:\n",
opr->dyn_typeinfo()->name, opr->cname());
for (auto&& i : opr->input()) {
str += ssprintf("\tvar: %s\n\tshape: %s\n", i->cname(),
i->shape().to_string().c_str());
}
str += ssprintf("outputs:\n\tvar: %s\n\tshape: %s\ncosts:\n",
opr->output(0)->cname(),
opr->output(0)->shape().to_string().c_str());
for (auto&& cpair : costs) {
str += ssprintf("\tformat: %s; cost:%f",
opr_format_to_string(cpair.first), cpair.second);
}
return str;
}
std::string ProfilerBase::VarNodeRecord::to_string() const {
auto str = ssprintf("\nvar: %s\ncosts:", var->cname());
for (auto&& cpair : costs) {
auto&& formats = cpair.first;
str += ssprintf("\n\tformat: (i:%s;o:%s); cost:%f",
tensor_formats_to_named_tensor_shape(formats.first)
.to_string()
.c_str(),
tensor_formats_to_named_tensor_shape(formats.second)
.to_string()
.c_str(),
cpair.second);
}
return str;
}
std::unique_ptr<ProfilerBase> ProfilerBase::make_profiler() {
return std::make_unique<ProfilerImpl>();
}
// vim: syntax=cpp.doxygen
@@ -247,16 +247,36 @@ ReformatEmitter::UnderlyingBuilders ReformatEmitter::analyze() const {
/* ============== PaddingEmitter ================= */
PaddingEmitter::EmitResult PaddingEmitter::emit() const {
auto&& padshp = m_padshp;
auto&& const_extent = m_const_extent;
auto&& axis = m_axis;
auto builder = [const_extent, axis](const VarNodeArray& vars) {
auto builder = [padshp, const_extent, axis](const VarNodeArray& vars) {
auto i = vars[0];
auto padding_shp_var = vars[1];
TensorShape shape;
shape.ndim = i->shape().ndim;
for (size_t ax = 0; ax < shape.ndim; ++ax)
shape[ax] = 1;
shape[axis] = const_extent;
// avoid making a scalar lowbit tensor
if (!i->dtype().is_low_bit() || const_extent != 1)
shape[axis] = const_extent;
else {
size_t const_axis = 0;
size_t new_const_extent = const_extent;
for (size_t i = 0; i < padshp.ndim; ++i) {
const auto& dim = padshp[i];
if (dim.extent() != Dimension::UNDETERMINED_EXTENT &&
dim.extent() != 1) {
new_const_extent = dim.extent();
const_axis = i;
break;
}
}
mgb_assert(new_const_extent != 1,
"cannot make an scalar lowbit tensor(got:%s)",
i->dtype().name());
shape[const_axis] = new_const_extent;
}
auto host_val =
std::make_shared<HostTensorND>(i->comp_node(), i->dtype());
host_val->resize(shape);
......
@@ -13,6 +13,7 @@
#include "megbrain/gopt/reformat_manager.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/utils/arith_helper.h"
#include "./utils.h"
using namespace mgb;
using namespace gopt;
@@ -32,68 +33,6 @@ int gcd(const int& p, const int& q) {
}
return x;
}
NamedTensorShape tensor_formats_to_named_tensor_shape(TensorFormats format) {
switch (format) {
case TensorFormats::NCHW:
return {{"N"}, {"C"}, {"H"}, {"W"}};
case TensorFormats::NHWC:
return {{"N"}, {"H"}, {"W"}, {"C"}};
case TensorFormats::NCHWc4:
return {{"N"}, {"C//4"}, {"H"}, {"W"}, {"C%4"}};
case TensorFormats::NCHWc8:
return {{"N"}, {"C//8"}, {"H"}, {"W"}, {"C%8"}};
case TensorFormats::NCHWc32:
return {{"N"}, {"C//32"}, {"H"}, {"W"}, {"C%32"}};
case TensorFormats::NCHWc64:
return {{"N"}, {"C//64"}, {"H"}, {"W"}, {"C%64"}};
case TensorFormats::CHWNc4:
return {{"C//4"}, {"H"}, {"W"}, {"N"}, {"C%4"}};
case TensorFormats::NHCWc4:
return {{"N"}, {"H"}, {"C//4"}, {"W"}, {"C%4"}};
case TensorFormats::KRSCk4:
return {{"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}};
case TensorFormats::GKRSCk4:
return {{"G"}, {"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}};
case TensorFormats::C1RSc4:
return {{"C//4"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}};
case TensorFormats::KRSCk4c4:
return {{"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}};
case TensorFormats::GKRSCk4c4:
return {{"G"}, {"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}};
case TensorFormats::KCRSk4c4:
return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}};
case TensorFormats::GKCRSk4c4:
return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}};
case TensorFormats::KCRSc4k4:
return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}};
case TensorFormats::GKCRSc4k4:
return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}};
case TensorFormats::C11RSc4:
return {{"C//4"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}};
case TensorFormats::KCRSc8k8:
return {{"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}};
case TensorFormats::GKCRSc8k8:
return {{"G"}, {"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}};
case TensorFormats::C11RSc8:
return {{"C//8"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%8"}};
case TensorFormats::KRSCk8:
return {{"K//8"}, {"R"}, {"S"}, {"C"}, {"K%8"}};
case TensorFormats::KCRSc4:
return {{"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}};
case TensorFormats::GKCRSc4:
return {{"G"}, {"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}};
case TensorFormats::KCRS:
return {{"K"}, {"C"}, {"R"}, {"S"}};
case TensorFormats::GKCRS:
return {{"G"}, {"K"}, {"C"}, {"R"}, {"S"}};
case TensorFormats::C11RS:
return {{"C"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}};
default:
mgb_throw(AssertionError, "invalid tensor formats(%u)",
static_cast<uint32_t>(format));
}
}
}; // namespace
// =================== ReformatManager::ReformatKey ====================*/
@@ -393,8 +332,10 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue(
tensor_formats_to_named_tensor_shape(key.input_format);
NamedTensorShape output_shape =
tensor_formats_to_named_tensor_shape(key.output_format);
size_t input_alignment, output_alignment;
size_t input_channel_idx, output_channel_idx;
size_t input_alignment = 0;
size_t output_alignment = 0;
size_t input_channel_idx = input_shape.ndim,
output_channel_idx = input_shape.ndim;
for (size_t i = 0; i < input_shape.ndim; ++i) {
if (input_shape[i].name() == Dimension::Name::C &&
input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
@@ -411,6 +352,15 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue(
break;
}
}
mgb_assert(input_channel_idx < input_shape.ndim &&
output_channel_idx < input_shape.ndim,
"invalid channel idx(in_channel:%zu, out_channel:%zu, shp:%s)",
input_channel_idx, output_channel_idx,
input_shape.to_string().c_str());
mgb_assert(input_alignment > 0 && output_alignment > 0,
"invalid alignment(in_channel:%zu, out_channel:%zu, shp:%s)",
input_alignment, output_alignment,
input_shape.to_string().c_str());
NamedTensorShape orig_shape =
tensor_formats_to_named_tensor_shape(orig_format);
size_t orig_channel = 0;
@@ -448,8 +398,9 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue(
auto make_shape = std::get<0>(
MakeShapeEmitter{input_shape, padding_shape}.emit());
auto padding_shp_var = make_shape({x});
auto padding = std::get<0>(
PaddingEmitter{const_extent, input_channel_idx}.emit());
auto padding = std::get<0>(PaddingEmitter{
padding_shape, const_extent, input_channel_idx}
.emit());
cur = padding({cur, padding_shp_var});
}
cur = ReformatManager::instance().get(key)({cur});
@@ -469,9 +420,10 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight(
const VarNode* orig_var, const ReformatKey& key,
const AlignmentDesc& extra_alignment) const {
size_t in_channels = 0, out_channels = 0;
size_t input_channel_idx, output_channel_idx;
Dimension::Name out_channel_name;
Dimension::Name out_channel_name = Dimension::Name::C;
auto input_shape = tensor_formats_to_named_tensor_shape(key.input_format);
size_t input_channel_idx = input_shape.ndim,
output_channel_idx = input_shape.ndim;
for (size_t i = 0; i < input_shape.ndim; ++i) {
if (input_shape[i].name() == Dimension::Name::C &&
input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
@@ -491,7 +443,15 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight(
input_shape.to_string().c_str());
}
}
size_t in_channel_alignment, out_channel_alignment = 1;
mgb_assert(out_channel_name == Dimension::Name::K ||
out_channel_name == Dimension::Name::N,
"invalid out channel(shp:%s)", input_shape.to_string().c_str());
mgb_assert(input_channel_idx < input_shape.ndim &&
output_channel_idx < input_shape.ndim,
"invalid channel idx(in_channel:%zu, out_channel:%zu, shp:%s)",
input_channel_idx, output_channel_idx,
input_shape.to_string().c_str());
size_t in_channel_alignment = 0, out_channel_alignment = 0;
auto output_shape = tensor_formats_to_named_tensor_shape(key.output_format);
for (size_t i = 0; i < output_shape.ndim; ++i) {
if (output_shape[i].name() == Dimension::Name::C &&
@@ -502,6 +462,10 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight(
out_channel_alignment = output_shape[i].stride();
}
}
mgb_assert(in_channel_alignment > 0 && out_channel_alignment > 0,
"invalid alignment(in_channel:%zu, out_channel:%zu, shp:%s)",
in_channel_alignment, out_channel_alignment,
output_shape.to_string().c_str());
size_t aligned_in_channel =
divup(in_channels, in_channel_alignment) * in_channel_alignment;
if (extra_alignment.name == out_channel_name) {
@@ -526,8 +490,9 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight(
auto make_shape = std::get<0>(
MakeShapeEmitter{input_shape, padding_shape}.emit());
auto padding_shp_var = make_shape({x});
auto padding = std::get<0>(
PaddingEmitter{const_extent, input_channel_idx}.emit());
auto padding = std::get<0>(PaddingEmitter{
padding_shape, const_extent, input_channel_idx}
.emit());
cur = padding({cur, padding_shp_var});
}
if (aligned_out_channel > out_channels) {
@@ -540,8 +505,9 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight(
auto make_shape = std::get<0>(
MakeShapeEmitter{input_shape, padding_shape}.emit());
auto padding_shp_var = make_shape({cur});
auto padding = std::get<0>(
PaddingEmitter{const_extent, output_channel_idx}.emit());
auto padding = std::get<0>(PaddingEmitter{
padding_shape, const_extent, output_channel_idx}
.emit());
cur = padding({cur, padding_shp_var});
}
cur = ReformatManager::instance().get(key)({cur});
......@@ -554,4 +520,81 @@ const ReformatManager& ReformatManager::instance() {
static ReformatManager inst;
return inst;
}
TensorShape mgb::gopt::make_aligned_tensor_shape(const VarNode* var,
TensorFormats orig_formats,
TensorFormats target_formats) {
using Dimension = megdnn::Dimension;
static constexpr uint32_t UNDETERMINED_EXTENT =
Dimension::UNDETERMINED_EXTENT;
auto orig_shape = tensor_formats_to_named_tensor_shape(orig_formats);
auto target_shape = tensor_formats_to_named_tensor_shape(target_formats);
TensorShape oshp = var->shape();
mgb_assert(oshp.is_scalar() || oshp.ndim == orig_shape.ndim,
"orig shape of var node is not compatible with tensor "
"formats(var:%s;shp:%s;fmt:%s)",
var->cname(), oshp.to_string().c_str(),
orig_shape.to_string().c_str());
if (oshp.is_scalar()) return oshp;
TensorShape tshp;
ThinHashMap<Dimension::Name, int> name2dominant;
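// name2dominant records, for each dimension name, the index of its dominant
// (undetermined-extent) dimension in the original format; target dimensions
// with undetermined extent are scaled relative to this dominant dimension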
for (size_t i = 0; i < orig_shape.ndim; ++i) {
auto name = orig_shape[i].name();
if (orig_shape[i].extent() == UNDETERMINED_EXTENT) {
auto insert = name2dominant.insert(std::make_pair(name, i));
mgb_assert(insert.second);
}
}
tshp.ndim = target_shape.ndim;
for (size_t i = 0; i < target_shape.ndim; ++i) {
auto name = target_shape[i].name();
if (target_shape[i].extent() == UNDETERMINED_EXTENT) {
int idx = name2dominant.at(name);
bool mul = orig_shape[idx] < target_shape[i];
size_t factor = mul ? (target_shape[i] / orig_shape[idx]).extent()
: (orig_shape[idx] / target_shape[i]).extent();
if (mul)
tshp[i] = oshp[idx] * factor;
else
tshp[i] = divup(oshp[idx], factor);
} else {
tshp[i] = target_shape[i].extent();
}
}
return tshp;
}
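A minimal standalone sketch of the alignment arithmetic above, with plain integers standing in for the megbrain types (the NCHW to NCHWc4 case only; the helper names here are hypothetical, not part of the codebase):
#include <cstddef>
#include <cstdio>
#include <vector>
// round x up to a multiple of block, then divide (same rounding as megdnn's divup)
static size_t divup(size_t x, size_t block) { return (x + block - 1) / block; }
// NCHW -> NCHWc4: the undetermined C axis of the target format becomes an outer
// dimension of divup(C, 4) plus a fixed inner dimension of extent 4
static std::vector<size_t> nchw_to_nchwc4(const std::vector<size_t>& nchw) {
    return {nchw[0], divup(nchw[1], 4), nchw[2], nchw[3], 4};
}
int main() {
    // {64, 48, 14, 14} becomes {64, 12, 14, 14, 4}; a channel count of 50 would
    // round up to 13 * 4 = 52 channel slots
    for (size_t d : nchw_to_nchwc4({64, 48, 14, 14}))
        printf("%zu ", d);
    printf("\n");
    return 0;
}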
TensorShape mgb::gopt::make_aligned_weight_shape(const VarNode* var,
TensorFormats orig_formats,
TensorFormats target_formats,
TensorFormats extra_formats) {
auto tshp = make_aligned_tensor_shape(var, orig_formats, target_formats);
auto extra_shape = tensor_formats_to_named_tensor_shape(extra_formats);
using Dimension = megdnn::Dimension;
static constexpr uint32_t UNDETERMINED_EXTENT =
Dimension::UNDETERMINED_EXTENT;
size_t out_channel_alignment = 1;
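// the required output-channel alignment is taken from the channel blocking of
// extra_formats, e.g. 64 when extra_formats is NCHWc64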
for (size_t i = 0; i < extra_shape.ndim; ++i) {
auto name = extra_shape[i].name();
if (name == Dimension::Name::C &&
extra_shape[i].extent() == UNDETERMINED_EXTENT) {
out_channel_alignment = extra_shape[i].stride();
}
}
auto target_shape = tensor_formats_to_named_tensor_shape(target_formats);
for (size_t i = 0; i < target_shape.ndim; ++i) {
auto name = target_shape[i].name();
if ((name == Dimension::Name::K || name == Dimension::Name::N) &&
target_shape[i].extent() == UNDETERMINED_EXTENT) {
size_t out_channels = tshp[i] * target_shape[i].stride();
tshp[i] = divup(out_channels, out_channel_alignment) *
out_channel_alignment / target_shape[i].stride();
}
}
return tshp;
}
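A similarly hedged sketch of the output-channel re-alignment in make_aligned_weight_shape; all numbers below are illustrative assumptions (target format KCRSc4, extra_formats NCHWc64):
#include <cstddef>
#include <cstdio>
static size_t divup(size_t x, size_t block) { return (x + block - 1) / block; }
int main() {
    size_t k_stride = 1;                  // stride of the undetermined K dim in KCRSc4
    size_t out_channels = 48 * k_stride;  // tshp[K] * target_shape[K].stride()
    size_t alignment = 64;                // stride of the C//64 dim of extra_formats
    size_t aligned_k = divup(out_channels, alignment) * alignment / k_stride;
    printf("K: 48 -> %zu\n", aligned_k);  // prints "K: 48 -> 64"
    return 0;
}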
// vim: syntax=cpp.doxygen
/**
* \file src/gopt/impl/utils.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "megbrain/gopt/global_layout_transform.h"
namespace mgb {
namespace gopt {
static inline const char* opr_format_to_string(
OprTensorFormatsConfiguration::OprFormat opr_format) {
using OprFormat = OprTensorFormatsConfiguration::OprFormat;
#define cb(_fmt) \
case OprFormat::_fmt: \
return #_fmt
switch (opr_format) {
cb(NCHW);
cb(NHWC);
cb(NCHW4);
cb(NCHW32);
cb(NCHW64);
cb(CHWN4);
default:
mgb_assert(false, "Invalid opr format(got:%u)",
static_cast<uint32_t>(opr_format));
}
#undef cb
}
static inline megdnn::NamedTensorShape tensor_formats_to_named_tensor_shape(
TensorFormats format) {
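// An informal reading of the NamedTensorShape strings below: "C//4" denotes the
// outer channel dimension (undetermined extent, stride 4), "C%4" the fixed inner
// block of extent 4, and "C%1" a constant dimension of extent 1; e.g. NCHWc4
// stores a logical (N, C, H, W) tensor as (N, ceil(C/4), H, W, 4)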
switch (format) {
case TensorFormats::NCHW:
return {{"N"}, {"C"}, {"H"}, {"W"}};
case TensorFormats::NHWC:
return {{"N"}, {"H"}, {"W"}, {"C"}};
case TensorFormats::NCHWc4:
return {{"N"}, {"C//4"}, {"H"}, {"W"}, {"C%4"}};
case TensorFormats::NCHWc8:
return {{"N"}, {"C//8"}, {"H"}, {"W"}, {"C%8"}};
case TensorFormats::NCHWc32:
return {{"N"}, {"C//32"}, {"H"}, {"W"}, {"C%32"}};
case TensorFormats::NCHWc64:
return {{"N"}, {"C//64"}, {"H"}, {"W"}, {"C%64"}};
case TensorFormats::CHWNc4:
return {{"C//4"}, {"H"}, {"W"}, {"N"}, {"C%4"}};
case TensorFormats::NHCWc4:
return {{"N"}, {"H"}, {"C//4"}, {"W"}, {"C%4"}};
case TensorFormats::KRSCk4:
return {{"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}};
case TensorFormats::GKRSCk4:
return {{"G"}, {"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}};
case TensorFormats::C1RSc4:
return {{"C//4"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}};
case TensorFormats::KRSCk4c4:
return {{"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}};
case TensorFormats::GKRSCk4c4:
return {{"G"}, {"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}};
case TensorFormats::KCRSk4c4:
return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}};
case TensorFormats::GKCRSk4c4:
return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}};
case TensorFormats::KCRSc4k4:
return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}};
case TensorFormats::GKCRSc4k4:
return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}};
case TensorFormats::C11RSc4:
return {{"C//4"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}};
case TensorFormats::KCRSc8k8:
return {{"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}};
case TensorFormats::GKCRSc8k8:
return {{"G"}, {"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}};
case TensorFormats::C11RSc8:
return {{"C//8"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%8"}};
case TensorFormats::KRSCk8:
return {{"K//8"}, {"R"}, {"S"}, {"C"}, {"K%8"}};
case TensorFormats::KCRSc4:
return {{"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}};
case TensorFormats::GKCRSc4:
return {{"G"}, {"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}};
case TensorFormats::KCRS:
return {{"K"}, {"C"}, {"R"}, {"S"}};
case TensorFormats::GKCRS:
return {{"G"}, {"K"}, {"C"}, {"R"}, {"S"}};
case TensorFormats::C11RS:
return {{"C"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}};
default:
mgb_throw(AssertionError, "invalid tensor formats(%u)",
static_cast<uint32_t>(format));
}
}
} // namespace gopt
} // namespace mgb
// vim: syntax=cpp.doxygen
/**
* \file src/gopt/include/megbrain/gopt/global_layout_transform.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "megbrain/gopt/reformat_manager.h"
#include "megbrain/gopt/subgraph_extractor.h"
#include "megbrain/opr/dnn/convolution.h"
namespace mgb {
namespace gopt {
/*!
* \brief A structure that describes the data types and tensor formats
* configuration of an opr format
*/
struct OprTensorFormatsConfiguration {
using OprFormat = opr::ConvBias::Param::Format;
using OprTensorFormatsDispatcher =
thin_function<Maybe<OprTensorFormatsConfiguration>(
const cg::OperatorNodeBase*)>;
Typeinfo* typeinfo;
OprFormat opr_format;
SmallVector<DTypeEnum> input_dtypes;
SmallVector<DTypeEnum> output_dtypes;
SmallVector<TensorFormats> input_tensor_formats;
SmallVector<TensorType> input_tensor_types;
SmallVector<TensorFormats> output_tensor_formats;
static OprTensorFormatsDispatcher* find_dispatcher_by_type_format(
Typeinfo* type, OprFormat opr_format);
};
/*!
* \brief A structure that describes the global layout transform problem
*/
class Problem {
public:
using OprFormat = OprTensorFormatsConfiguration::OprFormat;
using OprTensorFormatsDispatcher =
OprTensorFormatsConfiguration::OprTensorFormatsDispatcher;
using OprConfigTrait =
ThinHashMap<Typeinfo*,
ThinHashMap<OprFormat, OprTensorFormatsDispatcher*>>;
struct Attribute {
OprFormat base_opr_format; /// the base opr format indicates that the
/// network to be optimized is constructed
/// in the base opr format, i.e. all the
/// format aware operators (conv, conv_bias,
/// deconv, pooling etc.) are built in
/// this format.
TensorFormats
base_tensor_formats; /// the base tensor format indicates that
/// all the format agnostic operators
/// (like elemwise, elemwise multi type,
/// typecvt etc.) are built in the base
/// tensor format.
};
Problem(const GraphPartition& graph_partition,
const SmallVector<TensorFormats>& available_tensor_formats,
const OprConfigTrait& opr_config, const Attribute& attribute)
: m_graph_partition{graph_partition},
m_available_tensor_formats{available_tensor_formats},
m_opr_configs{opr_config},
m_attribute{attribute} {}
~Problem() noexcept = default;
const GraphPartition& graph_partition() const { return m_graph_partition; }
const OprConfigTrait& opr_configs() const { return m_opr_configs; }
const SmallVector<TensorFormats>& available_tensor_formats() const {
return m_available_tensor_formats;
}
TensorFormats base_format() const {
return m_attribute.base_tensor_formats;
}
OprTensorFormatsConfiguration base_config(
const cg::OperatorNodeBase* opr) const {
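// query the dispatcher registered for (opr type, base opr format) first; if the
// opr cannot supply a configuration, synthesize a default one from its actual
// input/output dtypes and the base tensor format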
auto _ = OprTensorFormatsConfiguration::find_dispatcher_by_type_format(
opr->dyn_typeinfo(), m_attribute.base_opr_format);
auto rst = (*_)(opr);
if (rst.valid())
return rst.val();
OprTensorFormatsConfiguration config;
config.typeinfo = opr->dyn_typeinfo();
config.opr_format = m_attribute.base_opr_format;
for (const auto& i : opr->input()) {
config.input_dtypes.emplace_back(i->dtype().enumv());
config.input_tensor_formats.emplace_back(
m_attribute.base_tensor_formats);
config.input_tensor_types.emplace_back(TensorType::FEATURE);
}
config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
config.output_tensor_formats.emplace_back(
m_attribute.base_tensor_formats);
return config;
}
private:
const GraphPartition& m_graph_partition; /// the graph partition
const SmallVector<TensorFormats>&
m_available_tensor_formats; /// the available tensor formats, used
/// for format agnostic operators (like
/// elemwise, elemwise multi type,
/// typecvt, etc.)
const OprConfigTrait&
m_opr_configs; /// the available opr format configurations, used
/// for format aware operators (like conv, deconv,
/// conv_bias, etc.)
Attribute m_attribute; /// the extra attributes to describe the problem
};
/*!
* \brief A profiler that collects all the performance data to describe the
* global layout transform problem.
*/
class ProfilerBase {
public:
using OprFormat = Problem::OprFormat;
struct OperatorNodeRecord {
const cg::OperatorNodeBase* opr; ///< pointer to operator node
ThinHashMap<OprFormat, float>
costs; ///< costs of the operator node, i.e. the elapsed device
///< time of the operator node on different opr formats
///< (layout configurations).
std::string to_string() const;
};
struct VarNodeRecord {
struct KeyHash {
size_t operator()(
const std::pair<TensorFormats, TensorFormats>& val) const {
size_t h1 =
std::hash<uint32_t>()(static_cast<uint32_t>(val.first));
size_t h2 = std::hash<uint32_t>()(
static_cast<uint32_t>(val.second));
return mgb::hash_pair_combine(h1, h2);
}
};
const VarNode* var; ///< pointer to var node
std::unordered_map<std::pair<TensorFormats, TensorFormats>, float,
KeyHash>
costs; ///< costs of var node, i.e. the elapsed
///< device time of the layout transform.
///< Key of the hashmap indicates the
///< source tensor format and the target
///< tensor format.
std::string to_string() const;
};
/*!
* \note the profiler assumes all the input and output var nodes are stored
* in contiguous layout in memory
*/
struct ProfilingResult {
/// A hashmap that maps the operator node to the costs (device elapsed
/// time) of different layout configurations
ThinHashMap<cg::OperatorNodeBase*, OperatorNodeRecord> opr_record;
/// A hashmap that maps the var node to the costs of layout transforms
ThinHashMap<VarNode*, VarNodeRecord> var_record;
};
ProfilerBase() = default;
virtual ~ProfilerBase() = default;
virtual ProfilingResult profile(const Problem& problem) const = 0;
static std::unique_ptr<ProfilerBase> make_profiler();
};
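The per-var costs above are keyed by (source format, target format) pairs, hence the custom KeyHash in VarNodeRecord. A standalone illustration of the same pattern, with a plain enum and a simple combiner standing in for TensorFormats and mgb::hash_pair_combine:
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <functional>
#include <unordered_map>
#include <utility>
enum class Fmt : uint32_t { NCHW, NHWC, NCHWc4 };
struct PairHash {
    size_t operator()(const std::pair<Fmt, Fmt>& p) const {
        size_t h1 = std::hash<uint32_t>()(static_cast<uint32_t>(p.first));
        size_t h2 = std::hash<uint32_t>()(static_cast<uint32_t>(p.second));
        return h1 * 31 + h2;  // stand-in for mgb::hash_pair_combine
    }
};
int main() {
    // costs keyed by (source format, target format), as in VarNodeRecord::costs
    std::unordered_map<std::pair<Fmt, Fmt>, float, PairHash> costs;
    costs[{Fmt::NCHW, Fmt::NCHWc4}] = 0.05f;  // e.g. 0.05ms for this relayout
    printf("%zu entries\n", costs.size());
    return 0;
}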
} // namespace gopt
} // namespace mgb
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
......@@ -80,11 +80,13 @@ private:
class PaddingEmitter final : public Emitter {
public:
PaddingEmitter(size_t const_extent, size_t axis)
: m_const_extent{const_extent}, m_axis{axis} {}
PaddingEmitter(const megdnn::NamedTensorShape& padshp, size_t const_extent,
size_t axis)
: m_padshp{padshp}, m_const_extent{const_extent}, m_axis{axis} {}
EmitResult emit() const override;
private:
megdnn::NamedTensorShape m_padshp;
size_t m_const_extent, m_axis;
};
......
......@@ -17,6 +17,11 @@
namespace mgb {
namespace gopt {
enum class TensorType : uint32_t {
FEATURE = 0,
WEIGHT = 1,
};
enum class TensorFormats : uint32_t {
// input tensor formats
NCHW = 0, ///< [N, C, H, W]
......@@ -116,6 +121,15 @@ public:
private:
ReformatCache m_cache;
};
TensorShape make_aligned_tensor_shape(const VarNode* var,
TensorFormats orig_formats,
TensorFormats target_formats);
TensorShape make_aligned_weight_shape(const VarNode* var,
TensorFormats orig_formats,
TensorFormats target_formats,
TensorFormats extra_formats);
} // namespace gopt
} // namespace mgb
......
......@@ -20,6 +20,7 @@ class GraphPartition {
public:
using VarNodeSet = ThinHashSet<VarNode*>;
using OperatorNodeSet = ThinHashSet<cg::OperatorNodeBase*>;
class InputPlaceholder;
GraphPartition() = default;
......@@ -45,13 +46,13 @@ private:
class SubGraphExtractor {
public:
using OprList = ThinHashSet<Typeinfo*>;
SubGraphExtractor(OprList opr_list) : m_opr_list{opr_list} {};
SubGraphExtractor(const OprList& opr_list) : m_opr_list{opr_list} {};
std::vector<GraphPartition> extract(
const SymbolVarArray& endpoint_vars) const;
private:
class Impl;
OprList m_opr_list;
const OprList& m_opr_list;
};
} // namespace gopt
......
/**
* \file src/gopt/test/profiler.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "./helper.h"
#include "megbrain/gopt/global_layout_transform.h"
#include "megbrain/gopt/inference.h"
#include "megbrain/opr/dnn/pooling.h"
#include "megbrain/opr/imgproc.h"
#include "megbrain/opr/nn_int.h"
#include "megbrain/serialization/serializer.h"
using namespace mgb;
using namespace gopt;
using namespace serialization;
namespace {
class LayoutTransformContext : public NonCopyableObj {
public:
using OprList = SubGraphExtractor::OprList;
using OprFormat = Problem::OprFormat;
using OprConfigTrait = Problem::OprConfigTrait;
LayoutTransformContext() = delete;
LayoutTransformContext(OprList opr_list,
SmallVector<TensorFormats> available_tensor_formats,
OprConfigTrait opr_configs)
: m_opr_list{std::move(opr_list)},
m_available_tensor_formats{std::move(available_tensor_formats)},
m_opr_configs{std::move(opr_configs)} {}
const OprList& opr_list() const { return m_opr_list; }
const SmallVector<TensorFormats>& available_tensor_formats() const {
return m_available_tensor_formats;
}
const OprConfigTrait& opr_configs() const { return m_opr_configs; }
static std::unique_ptr<LayoutTransformContext> make() {
OprList opr_list = {
opr::ConvBiasForward::typeinfo(),
opr::ConvolutionForward::typeinfo(),
opr::ConvolutionBackwardData::typeinfo(),
opr::ElemwiseMultiType::typeinfo(),
opr::Elemwise::typeinfo(),
opr::TypeCvt::typeinfo(),
opr::PoolingForward::typeinfo(),
opr::WarpPerspectiveForward::typeinfo(),
};
OprConfigTrait opr_configs;
{
auto& dispatchers = opr_configs[opr::ConvBias::typeinfo()];
#define cb(_fmt) \
dispatchers[OprFormat::_fmt] = \
OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
opr::ConvBias::typeinfo(), OprFormat::_fmt);
cb(NCHW4);
cb(NCHW32);
cb(NHWC);
cb(NCHW64);
cb(CHWN4);
#undef cb
}
{
auto& dispatchers =
opr_configs[opr::ConvolutionBackwardData::typeinfo()];
#define cb(_fmt) \
dispatchers[OprFormat::_fmt] = \
OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
opr::ConvolutionBackwardData::typeinfo(), \
OprFormat::_fmt);
cb(NCHW4);
#undef cb
}
{
auto& dispatchers =
opr_configs[opr::ConvolutionForward::typeinfo()];
#define cb(_fmt) \
dispatchers[OprFormat::_fmt] = \
OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
opr::ConvolutionForward::typeinfo(), OprFormat::_fmt);
cb(NCHW4);
#undef cb
}
{
auto& dispatchers = opr_configs[opr::PoolingForward::typeinfo()];
#define cb(_fmt) \
dispatchers[OprFormat::_fmt] = \
OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
opr::PoolingForward::typeinfo(), OprFormat::_fmt);
cb(NCHW4);
cb(NCHW32);
cb(NHWC);
cb(NCHW64);
cb(CHWN4);
#undef cb
}
{
auto& dispatchers =
opr_configs[opr::WarpPerspectiveForward::typeinfo()];
#define cb(_fmt) \
dispatchers[OprFormat::_fmt] = \
OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
opr::WarpPerspectiveForward::typeinfo(), OprFormat::_fmt);
cb(NHWC);
cb(NCHW4);
cb(NCHW64);
#undef cb
}
SmallVector<TensorFormats> available_tensor_formats = {
TensorFormats::NHWC, TensorFormats::NCHWc4,
TensorFormats::NCHWc32, TensorFormats::NCHWc64};
return std::make_unique<LayoutTransformContext>(
std::move(opr_list), std::move(available_tensor_formats),
std::move(opr_configs));
}
private:
OprList m_opr_list;
SmallVector<TensorFormats> m_available_tensor_formats;
OprConfigTrait m_opr_configs;
};
} // namespace
#if MGB_CUDA
#if CUDA_VERSION >= 10020
TEST(TestProfiler, Conv) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
auto ctx = LayoutTransformContext::make();
HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
auto mkvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
dtype);
};
auto mkcvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
.rename(name),
dtype);
};
auto x = mkvar("x", {64, 48, 14, 14},
dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));
auto w1 = mkcvar("w1", {48, 48, 3, 3}, dtype::QuantizedS4(2.5f));
auto b1 = mkcvar("b1", {1, 48, 1, 1}, dtype::QuantizedS32(6.25f));
opr::ConvBias::Param param;
param.format = opr::ConvBias::Param::Format::NCHW;
param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY;
param.stride_h = param.stride_w = 1;
param.pad_h = param.pad_w = 1;
auto c1 = opr::ConvBias::make(x, w1, b1, param, {},
OperatorNodeConfig(dtype::Quantized4Asymm(
12.345f, static_cast<uint8_t>(5))));
x = opr::TypeCvt::make(c1, dtype::QuantizedS8(12.345f));
auto w2 = mkcvar("w2", {48, 48, 3, 3}, dtype::QuantizedS8(2.5f));
auto b2 = mkcvar("b2", {1, 48, 1, 1}, dtype::QuantizedS32(12.345f * 2.5f));
auto c2 = opr::ConvBias::make(x, w2, b2, param, {},
OperatorNodeConfig(dtype::QuantizedS8(2.5f)));
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
S strategy = S::PROFILE;
gopt::modify_opr_algo_strategy_inplace({c2}, strategy);
using OprFormat = OprTensorFormatsConfiguration::OprFormat;
SubGraphExtractor extractor(ctx->opr_list());
auto partitions = extractor.extract({c2});
ASSERT_EQ(partitions.size(), 1u);
using Attribute = Problem::Attribute;
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
Problem problem(partitions[0], ctx->available_tensor_formats(),
ctx->opr_configs(), attribute);
auto profiler = ProfilerBase::make_profiler();
auto rst = profiler->profile(problem);
const auto& opr_rst = rst.opr_record;
const auto& var_rst = rst.var_record;
EXPECT_TRUE(opr_rst.count(c1.node()->owner_opr()) > 0);
EXPECT_TRUE(opr_rst.count(c2.node()->owner_opr()) > 0);
EXPECT_TRUE(opr_rst.count(x.node()->owner_opr()) > 0);
EXPECT_TRUE(var_rst.count(w1.node()) == 0);
EXPECT_TRUE(var_rst.count(b1.node()) == 0);
EXPECT_TRUE(var_rst.count(w2.node()) == 0);
EXPECT_TRUE(var_rst.count(b2.node()) == 0);
}
#endif
TEST(TestProfiler, Deconv) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
auto ctx = LayoutTransformContext::make();
HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
auto mkvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
dtype);
};
auto mkcvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::SharedDeviceTensor::make(*graph, *gen(shp, cn))
.rename(name),
dtype);
};
auto x = mkvar("x", {64, 10, 7, 7}, dtype::QuantizedS8(2.5f));
auto w1 = mkcvar("w1", {10, 10, 2, 2}, dtype::QuantizedS8(2.5f));
using Param = opr::ConvolutionBackwardData::Param;
Param param;
param.format = opr::ConvolutionBackwardData::Param::Format::NCHW;
param.stride_h = param.stride_w = 2;
param.pad_h = param.pad_w = 0;
auto c1 = opr::ConvolutionBackwardData::make(
w1, x, param, {}, OperatorNodeConfig(dtype::QuantizedS8(2.5f)));
auto w2 = mkcvar("w2", {10, 10, 2, 2}, dtype::QuantizedS8(2.5f));
auto c2 = opr::ConvolutionBackwardData::make(
w2, c1, param, {}, OperatorNodeConfig(dtype::QuantizedS8(2.5f)));
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
S strategy = S::PROFILE;
gopt::modify_opr_algo_strategy_inplace({c2}, strategy);
using OprFormat = OprTensorFormatsConfiguration::OprFormat;
SubGraphExtractor extractor(ctx->opr_list());
auto partitions = extractor.extract({c2});
ASSERT_EQ(partitions.size(), 1u);
using Attribute = Problem::Attribute;
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
Problem problem(partitions[0], ctx->available_tensor_formats(),
ctx->opr_configs(), attribute);
auto profiler = ProfilerBase::make_profiler();
auto rst = profiler->profile(problem);
const auto& opr_rst = rst.opr_record;
const auto& var_rst = rst.var_record;
EXPECT_TRUE(opr_rst.count(c1.node()->owner_opr()) > 0);
EXPECT_TRUE(opr_rst.count(c2.node()->owner_opr()) > 0);
EXPECT_TRUE(opr_rst.count(x.node()->owner_opr()) > 0);
EXPECT_TRUE(var_rst.count(w1.node()) == 0);
EXPECT_TRUE(var_rst.count(w2.node()) == 0);
}
TEST(TestProfiler, Warp) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
auto ctx = LayoutTransformContext::make();
constexpr size_t INP_H = 10, INP_W = 10, N = 16;
HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
auto mkvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
dtype);
};
auto x = mkvar("x", {N, 48, INP_H, INP_W},
dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));
float value1 = M_PI, value2 = 0.6;
auto gen_mat = [&](HostTensorND& mat) {
auto ptr = mat.ptr<float>();
for (size_t i = 0; i < N; ++i) {
auto rot = value1, scale = value2, sheer = value1, dy = value2,
dx = value2, ky = value2, kx = value2, kb = value2;
ptr[0] = ptr[4] = cos(rot) * scale;
ptr[1] = -(ptr[3] = sin(rot) * scale);
ptr[3] *= sheer;
ptr[4] *= sheer;
ptr[2] = dx;
ptr[5] = dy;
ptr[6] = kx;
ptr[7] = ky;
ptr[8] = kb;
ptr += 9;
}
mgb_assert(ptr == mat.ptr<float>() + mat.shape().total_nr_elems());
};
auto mat_host = std::make_shared<HostTensorND>(
x.node()->comp_node(), TensorShape{N, 3, 3}, dtype::Float32());
gen_mat(*mat_host);
auto mat = opr::Host2DeviceCopy::make(*graph, mat_host).rename("mat");
TensorShape out_shp{20, 20};
auto w1 = opr::WarpPerspectiveForward::make(x, mat, out_shp);
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
S strategy = S::PROFILE;
gopt::modify_opr_algo_strategy_inplace({w1}, strategy);
using OprFormat = OprTensorFormatsConfiguration::OprFormat;
SubGraphExtractor extractor(ctx->opr_list());
auto partitions = extractor.extract({w1});
ASSERT_EQ(partitions.size(), 1u);
using Attribute = Problem::Attribute;
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
Problem problem(partitions[0], ctx->available_tensor_formats(),
ctx->opr_configs(), attribute);
auto profiler = ProfilerBase::make_profiler();
auto rst = profiler->profile(problem);
const auto& opr_rst = rst.opr_record;
const auto& var_rst = rst.var_record;
EXPECT_TRUE(opr_rst.count(w1.node()->owner_opr()) > 0);
EXPECT_TRUE(var_rst.count(mat.node()) == 0);
EXPECT_TRUE(var_rst.count(w1.node()->owner_opr()->input(2)) == 0);
EXPECT_TRUE(var_rst.count(w1.node()->owner_opr()->input(0)) > 0);
}
TEST(TestProfiler, Pooling) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
auto ctx = LayoutTransformContext::make();
HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
auto mkvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
dtype);
};
auto x = mkvar("x", {64, 64, 55, 55},
dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));
using Param = opr::Pooling::Param;
Param param;
param.format = Param::Format::NCHW;
auto p1 = opr::Pooling::make(x, param);
x = opr::TypeCvt::make(p1, dtype::QuantizedS8(12.345f));
auto p2 = opr::Pooling::make(x, param);
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
S strategy = S::PROFILE;
gopt::modify_opr_algo_strategy_inplace({p2}, strategy);
using OprFormat = OprTensorFormatsConfiguration::OprFormat;
SubGraphExtractor extractor(ctx->opr_list());
auto partitions = extractor.extract({p2});
ASSERT_EQ(partitions.size(), 1u);
using Attribute = Problem::Attribute;
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
Problem problem(partitions[0], ctx->available_tensor_formats(),
ctx->opr_configs(), attribute);
auto profiler = ProfilerBase::make_profiler();
auto rst = profiler->profile(problem);
const auto& opr_rst = rst.opr_record;
EXPECT_TRUE(opr_rst.count(p1.node()->owner_opr()) > 0);
EXPECT_TRUE(opr_rst.count(p2.node()->owner_opr()) > 0);
EXPECT_TRUE(opr_rst.count(x.node()->owner_opr()) > 0);
}
TEST(TestProfiler, Elemwise) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
auto ctx = LayoutTransformContext::make();
HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
auto mkvar = [&](const char* name, const TensorShape& shp,
const DType& dtype) {
return opr::TypeCvt::make(
opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name),
dtype);
};
auto a = mkvar("a", {64, 48, 14, 14}, dtype::Float32());
auto b = mkvar("b", {1, 48, 1, 1}, dtype::Float32());
auto c = opr::Elemwise::make({a, b},
{opr::Elemwise::Param::Mode::FUSE_ADD_RELU});
auto q4c = opr::TypeCvt::make(
c, dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4)));
auto q8a = mkvar("q8a", {64, 48, 14, 14}, dtype::QuantizedS8(2.5f));
auto q8b = mkvar("q8b", {64, 48, 14, 14}, dtype::QuantizedS8(1.2f));
auto q8d = opr::ElemwiseMultiType::make(
{q8a, q8b}, {opr::ElemwiseMultiType::Param::Mode::QFUSE_ADD_RELU},
OperatorNodeConfig(dtype::QuantizedS8(12.f)));
auto q4d = opr::TypeCvt::make(
q8d, dtype::Quantized4Asymm(1.2f, static_cast<uint8_t>(3)));
auto q4e = opr::ElemwiseMultiType::make(
{q4c, q4d}, {opr::ElemwiseMultiType::Param::Mode::QADD},
OperatorNodeConfig(
dtype::Quantized4Asymm(13.f, static_cast<uint8_t>(4))));
using OprFormat = OprTensorFormatsConfiguration::OprFormat;
SubGraphExtractor extractor(ctx->opr_list());
auto partitions = extractor.extract({q4e});
ASSERT_EQ(partitions.size(), 1u);
using Attribute = Problem::Attribute;
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
Problem problem(partitions[0], ctx->available_tensor_formats(),
ctx->opr_configs(), attribute);
auto profiler = ProfilerBase::make_profiler();
auto rst = profiler->profile(problem);
const auto& opr_rst = rst.opr_record;
const auto& var_rst = rst.var_record;
EXPECT_TRUE(opr_rst.count(c.node()->owner_opr()) > 0);
EXPECT_TRUE(opr_rst.count(q8d.node()->owner_opr()) > 0);
EXPECT_TRUE(opr_rst.count(q4e.node()->owner_opr()) > 0);
EXPECT_TRUE(var_rst.count(a.node()) > 0);
EXPECT_TRUE(var_rst.count(b.node()) > 0);
EXPECT_TRUE(var_rst.count(q8a.node()) > 0);
EXPECT_TRUE(var_rst.count(q8b.node()) > 0);
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
......@@ -447,8 +447,6 @@ TEST(TestReformatManager, AutoAlignedFeatureProfiling) {
for (size_t i = 0; i < RUNS; ++i)
func->execute();
double time_profiler = profiler->duration() * 1e6;
printf("%f, %f\n", time_profiler, time_cuda_evt);
ASSERT_EQ(time_cuda_evt, time_profiler);
MGB_CUDA_CHECK(cudaEventDestroy(evt0));
MGB_CUDA_CHECK(cudaEventDestroy(evt1));
}
......