MegEngine commit 50ea5ae8
Authored Sep 07, 2021 by Megvii Engine Team
Parent: c14e5719

feat(mgb/gopt): add dynamic programming solver

GitOrigin-RevId: 595392ec89e3723fa702efdbf695f5bd04bec95a
Showing 12 changed files with 989 additions and 218 deletions (+989, -218).
src/gopt/impl/dynamic_programming_solver.cpp               (+547, -0)
src/gopt/impl/layout_transform_context.cpp                 (+40, -0)
src/gopt/impl/profiler_impl.cpp                            (+43, -27)
src/gopt/impl/profiling_based_solver.cpp                   (+56, -0)
src/gopt/impl/reformat_manager.cpp                         (+36, -22)
src/gopt/impl/subgraph_extractor.cpp                       (+5, -0)
src/gopt/impl/utils.h                                      (+22, -0)
src/gopt/include/megbrain/gopt/global_layout_transform.h   (+167, -26)
src/gopt/include/megbrain/gopt/reformat_manager.h          (+13, -2)
src/gopt/include/megbrain/gopt/subgraph_extractor.h        (+5, -1)
src/gopt/test/profiler.cpp                                 (+54, -140)
src/gopt/test/reformat_manager.cpp                         (+1, -0)
src/gopt/impl/dynamic_programming_solver.cpp (new file, mode 100644)

(This diff is collapsed in the original view: 547 lines added, 0 deleted.)
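As background for the collapsed file: the header further down describes the solver as a dynamic-programming (Markov decision process) search over per-operator format choices, driven by profiled operator costs and reformat costs. Below is a minimal sketch of that recurrence on a linear chain of operators; all names are illustrative and this is not MegEngine's implementation.

    #include <cstddef>
    #include <limits>
    #include <vector>

    // Sketch only: DP over a chain of n operators with k candidate formats.
    // op_cost[i][f]: profiled runtime of operator i in format f.
    // reformat_cost[a][b]: cost of converting a tensor from format a to b.
    // Recurrence: dp[i][f] = min_g dp[i-1][g] + reformat_cost[g][f] + op_cost[i][f].
    std::vector<size_t> solve_chain(
            const std::vector<std::vector<double>>& op_cost,
            const std::vector<std::vector<double>>& reformat_cost) {
        const size_t n = op_cost.size(), k = reformat_cost.size();
        const double inf = std::numeric_limits<double>::infinity();
        std::vector<std::vector<double>> dp(n, std::vector<double>(k, inf));
        std::vector<std::vector<size_t>> from(n, std::vector<size_t>(k, 0));
        dp[0] = op_cost[0];
        for (size_t i = 1; i < n; ++i) {
            for (size_t f = 0; f < k; ++f) {      // format chosen for operator i
                for (size_t g = 0; g < k; ++g) {  // format of operator i - 1
                    double c = dp[i - 1][g] + reformat_cost[g][f] + op_cost[i][f];
                    if (c < dp[i][f]) {
                        dp[i][f] = c;
                        from[i][f] = g;
                    }
                }
            }
        }
        // pick the cheapest final state, then backtrack the optimal assignment
        size_t f = 0;
        for (size_t g = 1; g < k; ++g)
            if (dp[n - 1][g] < dp[n - 1][f])
                f = g;
        std::vector<size_t> best(n);
        for (size_t i = n; i-- > 0;) {
            best[i] = f;
            f = from[i][f];
        }
        return best;
    }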
src/gopt/impl/layout_transform_context.cpp (new file, mode 100644)
/**
* \file src/gopt/impl/layout_transform_context.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "./utils.h"
#include "megbrain/gopt/global_layout_transform.h"
using namespace mgb;
using namespace gopt;

/* ================= LayoutTransformContext ==================*/
LayoutTransformContext& LayoutTransformContext::add_opr_config(
        Typeinfo* opr, OprFormat opr_format) {
    auto& dispatchers = m_opr_configs[opr];
    dispatchers[opr_format] =
            OprTensorFormatsConfiguration::find_dispatcher_by_type_format(
                    opr, opr_format);
    return *this;
}

LayoutTransformContext& LayoutTransformContext::add_opr_config(
        Typeinfo* opr, SmallVector<OprFormat> opr_formats) {
    auto& dispatchers = m_opr_configs[opr];
    for (auto opr_fmt : opr_formats) {
        dispatchers[opr_fmt] =
                OprTensorFormatsConfiguration::find_dispatcher_by_type_format(
                        opr, opr_fmt);
    }
    return *this;
}

// vim: syntax=cpp.doxygen
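Both overloads return *this, so operator configurations can be registered fluently. A minimal sketch, mirroring the usage in src/gopt/test/profiler.cpp further down:

    ctx->add_opr_config(opr::ConvBiasForward::typeinfo(),
                        {OprFormat::NCHW, OprFormat::NHWC, OprFormat::NCHW4})
            .add_opr_config(opr::PoolingForward::typeinfo(),
                            {OprFormat::NCHW4});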
src/gopt/impl/profiler_impl.cpp

@@ -17,6 +17,7 @@
 #include "megbrain/graph/event.h"
 #include "megbrain/opr/dnn/pooling.h"
 #include "megbrain/opr/imgproc.h"
+#include "megbrain/opr/nn_int.h"
 #include "megbrain/opr/io.h"
 #include "megbrain/plugin/base.h"
 #include "megbrain/serialization/sereg.h"

@@ -265,6 +266,10 @@ ProfilerImpl::OperatorNodeRecord ProfilerImpl::profile_operator(
     record.opr = opr;
     auto& costs = record.costs;
     for (auto&& i : available_configs) {
+        /// XXXX remove later
+        if (i.opr_format == OprFormat::NCHW &&
+            opr->input(0)->dtype().enumv() != DTypeEnum::Float32)
+            continue;
         costs[i.opr_format] = profile_operator(opr, base_config, i);
     }
     return record;

@@ -414,12 +419,23 @@ ProfilerImpl::ProfilingResult ProfilerImpl::profile(
             cb(Resize, 1),
 #undef cb
     };
+    static const ThinHashSet<Typeinfo*> skip_opr_types = {
+            TypeCvt::typeinfo(), Elemwise::typeinfo(),
+            ElemwiseMultiType::typeinfo()};
     ThinHashSet<VarNode*> vars;
     ThinHashSet<OperatorNodeBase*> oprs;
-    {
-        auto cb = [&cvprop, &vars, &oprs](OperatorNodeBase* opr) {
-            if (cvprop.is_const(opr))
-                return;
+    ThinHashSet<OperatorNodeBase*> skip_oprs;
+    for (auto&& opr : problem.graph_partition().all_oprs()) {
+        if (cvprop.is_const(opr))
+            continue;
+        bool skip = true;
+        for (auto&& i : opr->input()) {
+            skip &= problem.graph_partition().input().count(i) > 0 ||
+                    skip_oprs.count(i->owner_opr()) > 0;
+        }
+        skip &= skip_opr_types.count(opr->dyn_typeinfo());
+        if (skip)
+            skip_oprs.insert(opr);
         oprs.insert(opr);
         auto find = format_aware_input_tensors.find(opr->dyn_typeinfo());
         if (find == format_aware_input_tensors.end()) {

@@ -437,14 +453,8 @@ ProfilerImpl::ProfilingResult ProfilerImpl::profile(
             }
         }
     }
-        vars.insert(opr->output(0));
-    };
-    DepOprIter iter{cb};
-    for (auto&& i : problem.graph_partition().input()) {
-        iter.set_visited(i->owner_opr());
-    }
-    for (auto&& o : problem.graph_partition().output()) {
-        iter.add(o->owner_opr());
+        for (auto&& ov : opr->usable_output()) {
+            vars.insert(ov);
+        }
     }

@@ -462,8 +472,14 @@ ProfilerImpl::ProfilingResult ProfilerImpl::profile(
     auto&& opr_configs = problem.opr_configs();
     auto find = opr_configs.find(opr->dyn_typeinfo());
     if (find == opr_configs.end()) {
+        if (skip_oprs.count(opr) > 0) {
+            SmallVector<TensorFormats> tensor_formats = {base_format};
+            opr_record[opr] = profile_operator(opr, base_format, tensor_formats);
+        } else {
             opr_record[opr] = profile_operator(opr, base_format,
                                                available_tensor_formats);
+        }
     } else {
         auto&& dispatchers = find->second;
         SmallVector<OprTensorFormatsConfiguration> configs;
src/gopt/impl/profiling_based_solver.cpp (new file, mode 100644)
/**
* \file src/gopt/impl/profiling_based_solver.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "megbrain/gopt/global_layout_transform.h"
#include "megbrain/opr/dnn/pooling.h"
#include "megbrain/opr/imgproc.h"
using
namespace
mgb
;
using
namespace
gopt
;
using
namespace
opr
;
/* =================== ProfilingBasedSolverSolver ======================*/
ProfilingBasedSolver
::
ProfilingBasedSolver
(
std
::
unique_ptr
<
ProfilerBase
>
profiler
)
:
m_profiler
{
std
::
move
(
profiler
)}
{
static
const
ThinHashSet
<
Typeinfo
*>
format_aware_oprs
=
{
#define cb(_Opr) _Opr::typeinfo()
cb
(
Convolution
),
cb
(
ConvBiasForward
),
cb
(
ConvolutionBackwardData
),
cb
(
PoolingForward
),
cb
(
WarpPerspective
),
cb
(
Resize
),
};
m_graph_partition_filter
=
[](
const
GraphPartition
&
partition
)
{
bool
has_format_aware_opr
=
false
;
for
(
auto
&&
opr
:
partition
.
all_oprs
())
{
if
(
!
has_format_aware_opr
&&
format_aware_oprs
.
count
(
opr
->
dyn_typeinfo
()))
{
has_format_aware_opr
=
true
;
break
;
}
}
return
has_format_aware_opr
;
};
}
ProfilingBasedSolver
::
Solution
ProfilingBasedSolver
::
solve
(
const
Problem
&
problem
)
const
{
const
auto
&
partition
=
problem
.
graph_partition
();
if
(
!
m_graph_partition_filter
(
partition
))
return
Solution
{};
return
do_solve
(
problem
);
}
// vim: syntax=cpp.doxygen
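solve() returns an empty Solution for partitions rejected by the filter, so partitions without format-aware operators never reach the expensive profiling step. A sketch of supplying a custom filter through the two-argument constructor declared in the header; the size-based criterion here is invented purely for illustration:

    auto solver = std::make_unique<DynamicProgrammingSolver>(
            ProfilerBase::make_profiler(),
            [](const GraphPartition& partition) {
                // profile only partitions containing more than one operator
                return partition.all_oprs().size() > 1;
            });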
src/gopt/impl/reformat_manager.cpp

@@ -11,9 +11,9 @@
  */
 #include "megbrain/gopt/reformat_manager.h"
-#include "./utils.h"
 #include "megbrain/opr/tensor_manip.h"
 #include "megbrain/utils/arith_helper.h"
+#include "./utils.h"

 using namespace mgb;
 using namespace gopt;

@@ -87,21 +87,6 @@ bool ReformatManager::ReformatKey::Equal::operator()(
            lhs.attribute == rhs.attribute;
 }

-ReformatManager::ReformatKey&
-ReformatManager::ReformatKey::deduce_reformat_dtype_enum(const DType& dt) {
-    static const ThinHashSet<std::pair<TensorFormats, TensorFormats>> set = {
-            {TensorFormats::NCHW, TensorFormats::NCHWc64},
-            {TensorFormats::NCHWc64, TensorFormats::NCHW},
-            {TensorFormats::NCHW, TensorFormats::NHWC},
-            {TensorFormats::NHWC, TensorFormats::NCHW}};
-    if (set.count({input_format, output_format}) > 0 &&
-        (dt.enumv() == DTypeEnum::QuantizedS4 ||
-         dt.enumv() == DTypeEnum::Quantized4Asymm)) {
-        input_dtype = output_dtype = dt.enumv();
-    }
-    return *this;
-}
-
 // =================== ReformatManager ====================*/
 ReformatManager::ReformatManager() {
     using Attribute = ReformatKey::Attribute;

@@ -427,11 +412,11 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight(
     for (size_t i = 0; i < input_shape.ndim; ++i) {
         if (input_shape[i].name() == Dimension::Name::C &&
             input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {
-            in_channels = orig_var->shape()[i];
+            in_channels = orig_var->shape()[i] * input_shape[i].stride();
             input_channel_idx = i;
-            mgb_assert(input_shape[i].stride() == 1,
-                       "unsupport weight format(got:%s)",
-                       input_shape.to_string().c_str());
+            //            mgb_assert(input_shape[i].stride() == 1,
+            //                       "unsupport weight format(got:%s)",
+            //                       input_shape.to_string().c_str());
         } else if ((input_shape[i].name() == Dimension::Name::K ||
                     input_shape[i].name() == Dimension::Name::N) &&
                    input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) {

@@ -536,7 +521,8 @@ TensorShape mgb::gopt::make_aligned_tensor_shape(const VarNode* var,
               "formats(var:%s;shp:%s;fmt:%s)",
               var->cname(), oshp.to_string().c_str(),
               orig_shape.to_string().c_str());
-    if (oshp.is_scalar()) return oshp;
+    if (oshp.is_scalar())
+        return oshp;
     TensorShape tshp;
     ThinHashMap<Dimension::Name, int> name2dominant;
     for (size_t i = 0; i < orig_shape.ndim; ++i) {

@@ -597,4 +583,32 @@ TensorShape mgb::gopt::make_aligned_weight_shape(const VarNode* var,
     return tshp;
 }

+ReformatManager::AlignmentDesc mgb::gopt::make_aligned_desc(
+        TensorFormats weight_format, TensorFormats out_feature_format) {
+    using AlignmentDesc = ReformatManager::AlignmentDesc;
+    using Name = Dimension::Name;
+    auto weight_shape = tensor_formats_to_named_tensor_shape(weight_format);
+    auto out_shape = tensor_formats_to_named_tensor_shape(out_feature_format);
+    size_t out_channel_alignment = 1;
+    for (size_t i = 0; i < out_shape.ndim; ++i) {
+        auto name = out_shape[i].name();
+        auto extent = out_shape[i].extent();
+        if ((name == Name::C || name == Name::K) &&
+            extent == Dimension::UNDETERMINED_EXTENT) {
+            out_channel_alignment = out_shape[i].stride();
+            break;
+        }
+    }
+    Name out_channel_name;
+    for (size_t i = 0; i < weight_shape.ndim; ++i) {
+        auto name = weight_shape[i].name();
+        auto extent = weight_shape[i].extent();
+        if ((name == Name::N || name == Name::K) &&
+            extent == Dimension::UNDETERMINED_EXTENT) {
+            out_channel_name = name;
+        }
+    }
+    return AlignmentDesc{out_channel_name, out_channel_alignment};
+}
+
 // vim: syntax=cpp.doxygen
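For intuition about make_aligned_desc: assuming tensor_formats_to_named_tensor_shape maps NCHWc4 to a named shape whose unresolved channel axis has stride 4 (an assumption for illustration, consistent with the NCHW4-to-NCHWc4 mapping added in utils.h above), a hypothetical call would deduce an out-channel alignment of 4:

    auto desc = mgb::gopt::make_aligned_desc(TensorFormats::NCHWc4,   // weight format
                                             TensorFormats::NCHWc4);  // output feature format
    // desc carries the weight's out-channel axis name (N or K) and, under the
    // assumption above, an alignment of 4.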
src/gopt/impl/subgraph_extractor.cpp

@@ -304,10 +304,15 @@ std::vector<GraphPartition> SubGraphExtractor::extract(
                 }
             }
             partition->opr_set().insert(opr);
+            partition->all_oprs().push_back(opr);
             for (const auto& i : opr->input())
                 partition->input().insert(i);
         }
     }
+    for (auto&& partition : partitions) {
+        auto& all_oprs = partition.all_oprs();
+        std::reverse(all_oprs.begin(), all_oprs.end());
+    }
     return partitions;
 }
src/gopt/impl/utils.h

@@ -36,6 +36,28 @@ static inline const char* opr_format_to_string(
 #undef cb
 }

+static inline TensorFormats opr_format_to_tensor_formats(
+        OprTensorFormatsConfiguration::OprFormat opr_format) {
+    using OprFormat = OprTensorFormatsConfiguration::OprFormat;
+    switch (opr_format) {
+        case OprFormat::NCHW:
+            return TensorFormats::NCHW;
+        case OprFormat::NHWC:
+            return TensorFormats::NHWC;
+        case OprFormat::NCHW4:
+            return TensorFormats::NCHWc4;
+        case OprFormat::NCHW32:
+            return TensorFormats::NCHWc32;
+        case OprFormat::NCHW64:
+            return TensorFormats::NCHWc64;
+        case OprFormat::CHWN4:
+            return TensorFormats::CHWNc4;
+        default:
+            mgb_throw(AssertionError, "format(%s) is not supported",
+                      opr_format_to_string(opr_format));
+    };
+}
+
 static inline megdnn::NamedTensorShape tensor_formats_to_named_tensor_shape(
         TensorFormats format) {
     switch (format) {
src/gopt/include/megbrain/gopt/global_layout_transform.h

@@ -11,6 +11,7 @@
  */
 #pragma once
 #include "megbrain/gopt/framework.h"
+#include "megbrain/gopt/reformat_manager.h"
 #include "megbrain/gopt/subgraph_extractor.h"
 #include "megbrain/opr/dnn/convolution.h"

@@ -41,14 +42,16 @@ struct OprTensorFormatsConfiguration {
 /*!
  * \brief A structure that describes the global layout transform problem
  */
-class Problem {
+class LayoutTransformContext {
 public:
+    using OprList = SubGraphExtractor::OprList;
     using OprFormat = OprTensorFormatsConfiguration::OprFormat;
     using OprTensorFormatsDispatcher =
             OprTensorFormatsConfiguration::OprTensorFormatsDispatcher;
     using OprConfigTrait = ThinHashMap<
             Typeinfo*, ThinHashMap<OprFormat, OprTensorFormatsDispatcher*>>;
+    using ReformatAttribute = ReformatManager::ReformatKey::Attribute;
     struct Attribute {
         OprFormat base_opr_format;  /// the base opr format indicates that the
                                     /// network to be optimized is constructed

@@ -62,58 +65,110 @@ public:
                                     /// (like elemwise, elemwise multi type,
                                     /// typecvt etc.) are built in the base
                                     /// tensor format.
+        ReformatAttribute reformat_attribute;  /// additional reformat attribute,
+                                               /// which indicates whether to pad
+                                               /// nhwc layout automatically or to
+                                               /// enable nhwcd4 format on opencl
+                                               /// platform to use image objects
     };
-    Problem(const GraphPartition& graph_partition,
-            const SmallVector<TensorFormats>& available_tensor_formats,
-            const OprConfigTrait& opr_config, const Attribute& attribute)
-            : m_graph_partition{graph_partition},
-              m_available_tensor_formats{available_tensor_formats},
-              m_opr_configs{opr_config},
-              m_attribute{attribute} {}
+    LayoutTransformContext() = delete;
+    LayoutTransformContext(OprList opr_list,
+                           SmallVector<TensorFormats> available_tensor_formats,
+                           Attribute attribute)
+            : m_opr_list{std::move(opr_list)},
+              m_available_tensor_formats{std::move(available_tensor_formats)},
+              m_attribute{attribute} {}
+    LayoutTransformContext(OprList opr_list,
+                           SmallVector<TensorFormats> available_tensor_formats,
+                           OprConfigTrait opr_configs, Attribute attribute)
+            : m_opr_list{std::move(opr_list)},
+              m_available_tensor_formats{std::move(available_tensor_formats)},
+              m_opr_configs{std::move(opr_configs)},
+              m_attribute{attribute} {}
+    const OprList& opr_list() const { return m_opr_list; }
+    const SmallVector<TensorFormats>& available_tensor_formats() const {
+        return m_available_tensor_formats;
+    }
+    const OprConfigTrait& opr_configs() const { return m_opr_configs; }
+    Attribute attribute() const { return m_attribute; }
+    /*!
+     * \brief add an op format configuration for a particular operator type
+     * \param opr runtime typeinfo of operator
+     * \param opr_format op format configuration which is to be enabled in the
+     * layout transform problem
+     */
+    LayoutTransformContext& add_opr_config(Typeinfo* opr, OprFormat opr_format);
+    /*!
+     * \brief add a vector of op format configurations for a particular
+     * operator type
+     * \param opr runtime typeinfo of operator
+     * \param opr_formats op format configurations which are to be enabled in
+     * the layout transform problem
+     */
+    LayoutTransformContext& add_opr_config(Typeinfo* opr,
+                                           SmallVector<OprFormat> opr_formats);
+
+private:
+    OprList m_opr_list;  /// supported operator list
+    SmallVector<TensorFormats>
+            m_available_tensor_formats;  /// the available tensor formats, used
+                                         /// for format agnostic operators (like
+                                         /// elemwise, elemwise multi type,
+                                         /// typecvt, etc.)
+    OprConfigTrait m_opr_configs;  /// the available opr format configurations,
+                                   /// used for format aware operators (like
+                                   /// conv, deconv, conv_bias, etc.)
+    Attribute m_attribute;  /// the extra attributes to describe the problem
+};
+
+class Problem {
+public:
+    using OprFormat = OprTensorFormatsConfiguration::OprFormat;
+    using OprTensorFormatsDispatcher =
+            OprTensorFormatsConfiguration::OprTensorFormatsDispatcher;
+    using OprConfigTrait = LayoutTransformContext::OprConfigTrait;
+    using Attribute = LayoutTransformContext::Attribute;
+
+    Problem(const GraphPartition& graph_partition,
+            const LayoutTransformContext& ctx)
+            : m_graph_partition{graph_partition}, m_ctx{ctx} {}
+    ~Problem() noexcept = default;
+
     const GraphPartition& graph_partition() const { return m_graph_partition; }
-    const OprConfigTrait& opr_configs() const { return m_opr_configs; }
+    const OprConfigTrait& opr_configs() const { return m_ctx.opr_configs(); }
     const SmallVector<TensorFormats>& available_tensor_formats() const {
-        return m_available_tensor_formats;
+        return m_ctx.available_tensor_formats();
     }
     TensorFormats base_format() const {
-        return m_attribute.base_tensor_formats;
+        return m_ctx.attribute().base_tensor_formats;
     }
     /*!
      * \brief return the tensor formats configuration of an operator in the
      * default op format
      */
     OprTensorFormatsConfiguration base_config(
             const cg::OperatorNodeBase* opr) const {
         auto _ = OprTensorFormatsConfiguration::find_dispatcher_by_type_format(
-                opr->dyn_typeinfo(), m_attribute.base_opr_format);
+                opr->dyn_typeinfo(), m_ctx.attribute().base_opr_format);
         auto rst = (*_)(opr);
         if (rst.valid())
             return rst.val();
         OprTensorFormatsConfiguration config;
         config.typeinfo = opr->dyn_typeinfo();
-        config.opr_format = m_attribute.base_opr_format;
+        config.opr_format = m_ctx.attribute().base_opr_format;
         for (const auto& i : opr->input()) {
             config.input_dtypes.emplace_back(i->dtype().enumv());
-            config.input_tensor_formats.emplace_back(
-                    m_attribute.base_tensor_formats);
+            config.input_tensor_formats.emplace_back(base_format());
             config.input_tensor_types.emplace_back(TensorType::FEATURE);
         }
         config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
-        config.output_tensor_formats.emplace_back(
-                m_attribute.base_tensor_formats);
+        config.output_tensor_formats.emplace_back(base_format());
         return config;
     }

 private:
     const GraphPartition& m_graph_partition;  /// the graph partition
-    const SmallVector<TensorFormats>&
-            m_available_tensor_formats;  /// the available tensor formats, used
-                                         /// for format agnostic operators (like
-                                         /// elemwise, elemwise multi type,
-                                         /// typecvt, etc.)
-    const OprConfigTrait& m_opr_configs;  /// the available opr format
-                                          /// configurations, used for format
-                                          /// aware operators (like conv,
-                                          /// deconv, conv_bias, etc.)
-    Attribute m_attribute;  /// the extra attributes to describe the problem
+    const LayoutTransformContext& m_ctx;
 };

@@ -170,6 +225,92 @@ public:
     static std::unique_ptr<ProfilerBase> make_profiler();
 };

+/*!
+ * \brief abstract solver
+ */
+class SolverBase {
+public:
+    using OprFormat = Problem::OprFormat;
+    using Solution = ThinHashMap<cg::OperatorNodeBase*, OprFormat>;
+    SolverBase() = default;
+    virtual ~SolverBase() = default;
+    /*!
+     * \brief solve the given problem
+     */
+    virtual Solution solve(const Problem& problem) const = 0;
+    /*!
+     * \brief check whether the given problem can be solved by the
+     * algorithm (i.e. the solver).
+     */
+    virtual bool can_solve(const Problem& problem) const = 0;
+};
+
+/*!
+ * \brief solvers that first collect the costs of operators in different op
+ * formats and the costs of layout transforms of varnodes with a user-provided
+ * profiler on the target device. This can be time-consuming.
+ */
+class ProfilingBasedSolver : public SolverBase {
+public:
+    using GraphPartitionFilter =
+            thin_function<bool(const GraphPartition& graph_partition)>;
+    ProfilingBasedSolver(std::unique_ptr<ProfilerBase> profiler);
+    /*!
+     * \note some graph partitions (for example, graph partitions without
+     * format aware operators like conv, deconv, warp, resize etc.) will be
+     * filtered by the GraphPartitionFilter, which can reduce the profiling
+     * time. */
+    ProfilingBasedSolver(std::unique_ptr<ProfilerBase> profiler,
+                         GraphPartitionFilter graph_partition_filter)
+            : m_profiler{std::move(profiler)},
+              m_graph_partition_filter{std::move(graph_partition_filter)} {}
+    virtual ~ProfilingBasedSolver() = default;
+    Solution solve(const Problem& problem) const override;
+    virtual Solution do_solve(const Problem& problem) const = 0;
+
+protected:
+    std::unique_ptr<ProfilerBase> m_profiler;
+
+private:
+    GraphPartitionFilter m_graph_partition_filter;
+};
+
+/*!
+ * \brief A solver that solves the layout selection problem using a dynamic
+ * programming algorithm (Markov decision process).
+ */
+class DynamicProgrammingSolver final : public ProfilingBasedSolver {
+public:
+    DynamicProgrammingSolver(std::unique_ptr<ProfilerBase> profiler)
+            : ProfilingBasedSolver(std::move(profiler)){};
+    DynamicProgrammingSolver(std::unique_ptr<ProfilerBase> profiler,
+                             GraphPartitionFilter graph_partition_filter)
+            : ProfilingBasedSolver(std::move(profiler),
+                                   std::move(graph_partition_filter)){};
+    ~DynamicProgrammingSolver() noexcept = default;
+    Solution do_solve(const Problem& problem) const override;
+    bool can_solve(const Problem& problem) const override;
+
+private:
+    class Impl;
+};
+
+/*!
+ * \brief A layout transform pass, which converts each operator's format to
+ * the optimal format using the results of the solver.
+ */
+class LayoutTransformPass final : public Pass {
+public:
+    const char* name() const override { return "layout assignment pass"; }
+    void apply(OptState& opt) const override;
+    LayoutTransformPass(std::unique_ptr<LayoutTransformContext> ctx,
+                        std::unique_ptr<SolverBase> solver)
+            : m_ctx{std::move(ctx)}, m_solver{std::move(solver)} {}
+
+private:
+    std::unique_ptr<LayoutTransformContext> m_ctx;
+    std::unique_ptr<SolverBase> m_solver;
+};
+
 }  // namespace gopt
 }  // namespace mgb
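Putting the new classes together, a minimal sketch of assembling the pass (make_ctx is a hypothetical helper that builds a LayoutTransformContext, as in src/gopt/test/profiler.cpp below):

    auto ctx = make_ctx();
    auto solver = std::make_unique<DynamicProgrammingSolver>(
            ProfilerBase::make_profiler());
    auto pass = std::make_unique<LayoutTransformPass>(std::move(ctx),
                                                      std::move(solver));
    // LayoutTransformPass::apply(OptState&) then rewrites each operator into
    // the format chosen by the solver.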
src/gopt/include/megbrain/gopt/reformat_manager.h

@@ -84,7 +84,7 @@ public:
               output_dtype{DTypeEnum::Float32},
               attribute{Attribute::DEFAULT} {}
     ReformatKey(TensorFormats input_format_, TensorFormats output_format_,
-                Attribute attribute_ = Attribute::DEFAULT,
+                Attribute attribute_,
                 DTypeEnum input_dtype_ = DTypeEnum::Float32,
                 DTypeEnum output_dtype_ = DTypeEnum::Float32)
             : input_format{input_format_},

@@ -92,6 +92,15 @@ public:
               input_dtype{input_dtype_},
               output_dtype{output_dtype_},
               attribute{attribute_} {}
+    ReformatKey(TensorFormats input_format_, TensorFormats output_format_,
+                DTypeEnum input_dtype_ = DTypeEnum::Float32,
+                DTypeEnum output_dtype_ = DTypeEnum::Float32,
+                Attribute attribute_ = Attribute::DEFAULT)
+            : input_format{input_format_},
+              output_format{output_format_},
+              input_dtype{input_dtype_},
+              output_dtype{output_dtype_},
+              attribute{attribute_} {}
     struct Hash {
         size_t operator()(const ReformatKey& key) const;
     };

@@ -99,7 +108,6 @@ public:
         bool operator()(const ReformatKey& lhs, const ReformatKey& rhs) const;
     };
-    ReformatKey& deduce_reformat_dtype_enum(const DType& dt);
 };
 using ReformatCache =
         std::unordered_map<ReformatKey, ReformatImpl, ReformatKey::Hash,

@@ -130,6 +138,9 @@ TensorShape make_aligned_weight_shape(const VarNode* var,
                                       TensorFormats orig_formats,
                                       TensorFormats target_formats,
                                       TensorFormats extra_formats);
+
+ReformatManager::AlignmentDesc make_aligned_desc(
+        TensorFormats weight_format, TensorFormats out_feature_format);
 }  // namespace gopt
 }  // namespace mgb
src/gopt/include/megbrain/gopt/subgraph_extractor.h

@@ -20,6 +20,7 @@ class GraphPartition {
 public:
     using VarNodeSet = ThinHashSet<VarNode*>;
     using OperatorNodeSet = ThinHashSet<cg::OperatorNodeBase*>;
+    using OperatorNodeList = std::vector<cg::OperatorNodeBase*>;
     class InputPlaceholder;

@@ -32,15 +33,18 @@ public:
     const OperatorNodeSet& opr_set() const { return m_opr_set; }
     const VarNodeSet& input() const { return m_inputs; }
     const VarNodeSet& output() const { return m_outputs; }
+    const OperatorNodeList& all_oprs() const { return m_oprs; }
     OperatorNodeSet& opr_set() { return m_opr_set; }
+    OperatorNodeList& all_oprs() { return m_oprs; }
     VarNodeSet& input() { return m_inputs; }
     VarNodeSet& output() { return m_outputs; }

 private:
-    std::pair<VarNodeArray, VarNodeArray> replace_graph_by_placeholder() const;
     OperatorNodeSet m_opr_set;
+    OperatorNodeList m_oprs;
     VarNodeSet m_inputs;
     VarNodeSet m_outputs;
+    std::pair<VarNodeArray, VarNodeArray> replace_graph_by_placeholder() const;
 };

 class SubGraphExtractor {
src/gopt/test/profiler.cpp

@@ -10,6 +10,7 @@
  * implied.
  */
+#include "megbrain/plugin/profiler.h"
 #include "./helper.h"
 #include "megbrain/gopt/global_layout_transform.h"
 #include "megbrain/gopt/inference.h"

@@ -22,26 +23,13 @@ using namespace mgb;
 using namespace gopt;
 using namespace serialization;

-#if MGB_CUDA
 namespace {
-class LayoutTransformContext : public NonCopyableObj {
-public:
-    using OprList = SubGraphExtractor::OprList;
-    using OprFormat = Problem::OprFormat;
-    using OprConfigTrait = Problem::OprConfigTrait;
-    LayoutTransformContext() = delete;
-    LayoutTransformContext(OprList opr_list,
-                           SmallVector<TensorFormats> available_tensor_formats,
-                           OprConfigTrait opr_configs)
-            : m_opr_list{std::move(opr_list)},
-              m_available_tensor_formats{std::move(available_tensor_formats)},
-              m_opr_configs{std::move(opr_configs)} {}
-    const OprList& opr_list() const { return m_opr_list; }
-    const SmallVector<TensorFormats>& available_tensor_formats() const {
-        return m_available_tensor_formats;
-    }
-    const OprConfigTrait& opr_configs() const { return m_opr_configs; }
-    static std::unique_ptr<LayoutTransformContext> make() {
+std::unique_ptr<LayoutTransformContext> make_ctx() {
+    using OprFormat = LayoutTransformContext::OprFormat;
+    using OprList = LayoutTransformContext::OprList;
+    using ReformatAttribute = LayoutTransformContext::ReformatAttribute;
+    using Attribute = LayoutTransformContext::Attribute;
     OprList opr_list = {
             opr::ConvBiasForward::typeinfo(),
             opr::ConvolutionForward::typeinfo(),

@@ -52,93 +40,42 @@ public:
             opr::PoolingForward::typeinfo(),
             opr::WarpPerspectiveForward::typeinfo(),
     };
-        OprConfigTrait opr_configs;
-        {
-            auto& dispatchers = opr_configs[opr::ConvBias::typeinfo()];
-#define cb(_fmt)                                                           \
-    dispatchers[OprFormat::_fmt] =                                         \
-            OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
-                    opr::ConvBias::typeinfo(), OprFormat::_fmt);
-            cb(NCHW4);
-            cb(NCHW32);
-            cb(NHWC);
-            cb(NCHW64);
-            cb(CHWN4);
-#undef cb
-        }
-        {
-            auto& dispatchers =
-                    opr_configs[opr::ConvolutionBackwardData::typeinfo()];
-#define cb(_fmt)                                                           \
-    dispatchers[OprFormat::_fmt] =                                         \
-            OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
-                    opr::ConvolutionBackwardData::typeinfo(),              \
-                    OprFormat::_fmt);
-            cb(NCHW4);
-#undef cb
-        }
-        {
-            auto& dispatchers =
-                    opr_configs[opr::ConvolutionForward::typeinfo()];
-#define cb(_fmt)                                                           \
-    dispatchers[OprFormat::_fmt] =                                         \
-            OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
-                    opr::ConvolutionForward::typeinfo(), OprFormat::_fmt);
-            cb(NCHW4);
-#undef cb
-        }
-        {
-            auto& dispatchers = opr_configs[opr::PoolingForward::typeinfo()];
-#define cb(_fmt)                                                           \
-    dispatchers[OprFormat::_fmt] =                                         \
-            OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
-                    opr::PoolingForward::typeinfo(), OprFormat::_fmt);
-            cb(NCHW4);
-            cb(NCHW32);
-            cb(NHWC);
-            cb(NCHW64);
-            cb(CHWN4);
-#undef cb
-        }
-        {
-            auto& dispatchers =
-                    opr_configs[opr::WarpPerspectiveForward::typeinfo()];
-#define cb(_fmt)                                                           \
-    dispatchers[OprFormat::_fmt] =                                         \
-            OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \
-                    opr::WarpPerspectiveForward::typeinfo(), OprFormat::_fmt);
-            cb(NHWC);
-            cb(NCHW4);
-            cb(NCHW64);
-#undef cb
-        }
-        SmallVector<TensorFormats> available_tensor_formats = {
-                TensorFormats::NHWC, TensorFormats::NCHWc4,
-                TensorFormats::NCHWc32, TensorFormats::NCHWc64};
-        return std::make_unique<LayoutTransformContext>(
-                std::move(opr_list), std::move(available_tensor_formats),
-                std::move(opr_configs));
-    }
-
-private:
-    OprList m_opr_list;
-    SmallVector<TensorFormats> m_available_tensor_formats;
-    OprConfigTrait m_opr_configs;
-};
-}  // namespace
+    SmallVector<TensorFormats> available_tensor_formats = {
+            TensorFormats::NCHW,    TensorFormats::NHWC,
+            TensorFormats::NCHWc4,  TensorFormats::NCHWc32,
+            TensorFormats::NCHWc64, TensorFormats::CHWNc4};
+    Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW,
+                           ReformatAttribute::DEFAULT};
+    auto ctx = std::make_unique<LayoutTransformContext>(
+            std::move(opr_list), std::move(available_tensor_formats),
+            attribute);
+    ctx->add_opr_config(opr::ConvBiasForward::typeinfo(),
+                        {OprFormat::NCHW, OprFormat::NHWC, OprFormat::NCHW4,
+                         OprFormat::NCHW32, OprFormat::NCHW64,
+                         OprFormat::CHWN4})
+            .add_opr_config(opr::ConvolutionForward::typeinfo(),
+                            {OprFormat::NCHW, OprFormat::NCHW4})
+            .add_opr_config(opr::ConvolutionBackwardData::typeinfo(),
+                            {OprFormat::NCHW, OprFormat::NCHW4})
+            .add_opr_config(opr::PoolingForward::typeinfo(),
+                            {OprFormat::NCHW4, OprFormat::NCHW32,
+                             OprFormat::NHWC, OprFormat::NCHW64,
+                             OprFormat::CHWN4})
+            .add_opr_config(opr::WarpPerspectiveForward::typeinfo(),
+                            {OprFormat::NHWC, OprFormat::NCHW4,
+                             OprFormat::NCHW64});
+    return ctx;
+}
+}  // namespace
+
+#if MGB_CUDA
+#if CUDA_VERSION >= 10020
 TEST(TestProfiler, Conv) {
     REQUIRE_GPU(1);
     auto cn = CompNode::load("gpu0");
     cn.activate();
     REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
-    auto ctx = LayoutTransformContext::make();
+    auto ctx = make_ctx();
     HostTensorGenerator<dtype::Int8> gen;
     auto graph = ComputingGraph::make();

@@ -177,14 +114,10 @@ TEST(TestProfiler, Conv) {
     using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
     S strategy = S::PROFILE;
     gopt::modify_opr_algo_strategy_inplace({c2}, strategy);

-    using OprFormat = OprTensorFormatsConfiguration::OprFormat;
     SubGraphExtractor extractor(ctx->opr_list());
     auto partitions = extractor.extract({c2});
     ASSERT_EQ(partitions.size(), 1u);
-    using Attribute = Problem::Attribute;
-    Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
-    Problem problem(partitions[0], ctx->available_tensor_formats(),
-                    ctx->opr_configs(), attribute);
+    Problem problem(partitions[0], *ctx);
     auto profiler = ProfilerBase::make_profiler();
     auto rst = profiler->profile(problem);
     const auto& opr_rst = rst.opr_record;

@@ -204,7 +137,7 @@ TEST(TestProfiler, Deconv) {
     auto cn = CompNode::load("gpu0");
     cn.activate();
     REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
-    auto ctx = LayoutTransformContext::make();
+    auto ctx = make_ctx();
     HostTensorGenerator<dtype::Int8> gen;
     auto graph = ComputingGraph::make();

@@ -238,14 +171,10 @@ TEST(TestProfiler, Deconv) {
     using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
     S strategy = S::PROFILE;
     gopt::modify_opr_algo_strategy_inplace({c2}, strategy);

-    using OprFormat = OprTensorFormatsConfiguration::OprFormat;
     SubGraphExtractor extractor(ctx->opr_list());
     auto partitions = extractor.extract({c2});
     ASSERT_EQ(partitions.size(), 1u);
-    using Attribute = Problem::Attribute;
-    Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
-    Problem problem(partitions[0], ctx->available_tensor_formats(),
-                    ctx->opr_configs(), attribute);
+    Problem problem(partitions[0], *ctx);
     auto profiler = ProfilerBase::make_profiler();
     auto rst = profiler->profile(problem);
     const auto& opr_rst = rst.opr_record;

@@ -262,7 +191,7 @@ TEST(TestProfiler, Warp) {
     auto cn = CompNode::load("gpu0");
     cn.activate();
     REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
-    auto ctx = LayoutTransformContext::make();
+    auto ctx = make_ctx();
     constexpr size_t INP_H = 10, INP_W = 10, N = 16;

@@ -307,14 +236,9 @@ TEST(TestProfiler, Warp) {
     using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
     S strategy = S::PROFILE;
     gopt::modify_opr_algo_strategy_inplace({w1}, strategy);

-    using OprFormat = OprTensorFormatsConfiguration::OprFormat;
     SubGraphExtractor extractor(ctx->opr_list());
     auto partitions = extractor.extract({w1});
     ASSERT_EQ(partitions.size(), 1u);
-    using Attribute = Problem::Attribute;
-    Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
-    Problem problem(partitions[0], ctx->available_tensor_formats(),
-                    ctx->opr_configs(), attribute);
+    Problem problem(partitions[0], *ctx);
     auto profiler = ProfilerBase::make_profiler();
     auto rst = profiler->profile(problem);
     const auto& opr_rst = rst.opr_record;

@@ -330,7 +254,7 @@ TEST(TestProfiler, Pooling) {
     auto cn = CompNode::load("gpu0");
     cn.activate();
     REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
-    auto ctx = LayoutTransformContext::make();
+    auto ctx = make_ctx();
     HostTensorGenerator<dtype::Int8> gen;
     auto graph = ComputingGraph::make();

@@ -353,14 +277,10 @@ TEST(TestProfiler, Pooling) {
     using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
     S strategy = S::PROFILE;
     gopt::modify_opr_algo_strategy_inplace({p2}, strategy);

-    using OprFormat = OprTensorFormatsConfiguration::OprFormat;
     SubGraphExtractor extractor(ctx->opr_list());
     auto partitions = extractor.extract({p2});
     ASSERT_EQ(partitions.size(), 1u);
-    using Attribute = Problem::Attribute;
-    Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
-    Problem problem(partitions[0], ctx->available_tensor_formats(),
-                    ctx->opr_configs(), attribute);
+    Problem problem(partitions[0], *ctx);
     auto profiler = ProfilerBase::make_profiler();
     auto rst = profiler->profile(problem);
     const auto& opr_rst = rst.opr_record;

@@ -373,8 +293,7 @@ TEST(TestProfiler, Elemwise) {
     REQUIRE_GPU(1);
     auto cn = CompNode::load("gpu0");
     cn.activate();
-    REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5);
-    auto ctx = LayoutTransformContext::make();
+    auto ctx = make_ctx();
     HostTensorGenerator<dtype::Int8> gen;
     auto graph = ComputingGraph::make();

@@ -403,14 +322,10 @@ TEST(TestProfiler, Elemwise) {
             OperatorNodeConfig(
                     dtype::Quantized4Asymm(13.f, static_cast<uint8_t>(4))));

-    using OprFormat = OprTensorFormatsConfiguration::OprFormat;
     SubGraphExtractor extractor(ctx->opr_list());
     auto partitions = extractor.extract({q4e});
     ASSERT_EQ(partitions.size(), 1u);
-    using Attribute = Problem::Attribute;
-    Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW};
-    Problem problem(partitions[0], ctx->available_tensor_formats(),
-                    ctx->opr_configs(), attribute);
+    Problem problem(partitions[0], *ctx);
     auto profiler = ProfilerBase::make_profiler();
     auto rst = profiler->profile(problem);
     const auto& opr_rst = rst.opr_record;

@@ -423,7 +338,6 @@ TEST(TestProfiler, Elemwise) {
     EXPECT_TRUE(var_rst.count(q8a.node()) > 0);
     EXPECT_TRUE(var_rst.count(q8b.node()) > 0);
 }
 #endif
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
src/gopt/test/reformat_manager.cpp

@@ -447,6 +447,7 @@ TEST(TestReformatManager, AutoAlignedFeatureProfiling) {
     for (size_t i = 0; i < RUNS; ++i)
         func->execute();
     double time_profiler = profiler->duration() * 1e6;
+    printf("time: %f, %f\n", time_cuda_evt, time_profiler);
     MGB_CUDA_CHECK(cudaEventDestroy(evt0));
     MGB_CUDA_CHECK(cudaEventDestroy(evt1));
 }