Commit 12a3ef8d authored by Megvii Engine Team

refactor(fastrun): decouple fastrun from computing graph

GitOrigin-RevId: 27abd222950e36287784e58ad56d6819e9c4cb69
Parent 0a6f4a88
......@@ -18,6 +18,7 @@ file(
opr/impl/nvof/*.cpp
plugin/impl/*.cpp
serialization/impl/*.cpp
rdnn/impl/*.cpp
core/impl/*.inl
gopt/impl/*.inl
opr/impl/*.inl
......@@ -53,7 +54,8 @@ set(MGB_INC
${CMAKE_CURRENT_LIST_DIR}/gopt/include
${CMAKE_CURRENT_LIST_DIR}/opr/include
${CMAKE_CURRENT_LIST_DIR}/plugin/include
${CMAKE_CURRENT_LIST_DIR}/serialization/include)
${CMAKE_CURRENT_LIST_DIR}/serialization/include
${CMAKE_CURRENT_LIST_DIR}/rdnn/include)
if(MGE_WITH_JIT)
list(APPEND MGB_INC ${CMAKE_CURRENT_LIST_DIR}/jit/include)
......
......@@ -183,7 +183,7 @@ struct OprWithPolicyMaker<opr::BatchConvBiasForward>
MakeOprWithPolicyCaller4<megdnn::BatchConvBiasForward>,
megdnn::param::BatchConvBias> {};
#include "../../opr/impl/internal/invoke.h"
#include "megbrain/utils/invoke.h"
template <typename Opr>
struct MultiAlgoOprTrait;
......
......@@ -23,8 +23,8 @@
#include "megbrain/opr/imgproc.h"
#include "megbrain/opr/misc.h"
#include "megbrain/opr/nn_int.h"
#include "megbrain/opr/search_policy/algo_chooser.h"
#include "megbrain/opr/search_policy/algo_chooser_helper.h"
#include "megbrain/opr/search_policy/profiler.h"
#include "megbrain/opr/tensor_gen.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/opr/utility.h"
......
......@@ -19,7 +19,6 @@
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/opr/search_policy/algo_chooser.h"
#include "megbrain/opr/search_policy/profiler.h"
#include "./internal/megdnn_opr_wrapper.inl"
#include "./search_policy/workspace_need_limit_getter.inl"
......
......@@ -18,11 +18,11 @@
#include "megbrain/graph/grad_impl.h"
#include "megbrain/system.h"
#include "megbrain/utils/hash_ct.h"
#include "megbrain/utils/invoke.h"
#include "megbrain/utils/timer.h"
#include "megdnn/oprs/utils.h"
#include "../internal/invoke.h"
#include "../internal/megdnn_opr_wrapper.inl"
#include "../search_policy/workspace_need_limit_getter.inl"
......
......@@ -25,26 +25,6 @@ using namespace mixin;
/* ================== global functions ================== */
namespace {
template <class Opr>
class MegDNNGlobalOprContainer final : public UserDataContainer::UserData {
MGB_TYPEINFO_OBJ_DECL;
std::shared_ptr<megdnn::Handle> m_megdnn_handle;
std::unique_ptr<Opr> m_opr;
public:
MegDNNGlobalOprContainer(CompNode cn)
: m_megdnn_handle{get_megdnn_handle_shared(cn)},
m_opr{m_megdnn_handle->create_operator<Opr>()} {
mgb_assert(m_opr->is_thread_safe());
}
Opr* get() const { return m_opr.get(); }
};
template <class Opr>
MGB_TYPEINFO_OBJ_IMPL(MegDNNGlobalOprContainer<Opr>);
class TempStorageContainer final : public UserDataContainer::UserData {
MGB_TYPEINFO_OBJ_DECL;
......@@ -55,34 +35,6 @@ public:
MGB_TYPEINFO_OBJ_IMPL(TempStorageContainer);
} // anonymous namespace
std::shared_ptr<megdnn::Handle> intl::get_megdnn_handle_shared(CompNode comp_node) {
auto& handle = MegDNNHandle::get(CompNodeEnv::from_comp_node(comp_node));
return {handle.shared_from_this(), handle.handle()};
}
megdnn::Handle* intl::get_megdnn_handle(CompNode comp_node) {
return MegDNNHandle::get(CompNodeEnv::from_comp_node(comp_node)).handle();
}
template <typename Opr>
Opr* intl::get_megdnn_global_opr(CompNode comp_node) {
using T = MegDNNGlobalOprContainer<Opr>;
auto maker = [comp_node]() { return std::make_shared<T>(comp_node); };
return CompNodeEnv::from_comp_node(comp_node).get_user_data<T>(maker).get();
}
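// Usage sketch (hypothetical call site, not part of the original file): the
// container above caches one opr instance per comp node, so repeated calls
// return the same pointer. Only the INST'ed oprs below are available.
//
//     auto* add_update = intl::get_megdnn_global_opr<megdnn::AddUpdate>(cn);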
namespace mgb {
namespace opr {
namespace intl {
#define INST(o) template o* get_megdnn_global_opr<o>(CompNode)
INST(megdnn::AddUpdate);
INST(megdnn::Relayout);
INST(megdnn::Checksum);
#undef INST
} // namespace intl
} // namespace opr
} // namespace mgb
DeviceTensorStorage& intl::get_temp_storage(ComputingGraph& graph, CompNode comp_node) {
auto container =
graph.options().user_data.get_user_data_or_create<TempStorageContainer>();
......
/**
* \file src/opr/impl/search_policy/profiler.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "megbrain/opr/search_policy/profiler.h"
#include "../internal/invoke.h"
#include "../internal/megdnn_opr_wrapper.inl"
#include "megdnn/handle.h"
#include "megdnn/oprs/base.h"
#if MGB_ROCM
#include "hcc_detail/hcc_defs_prologue.h"
#include "megcore_rocm.h"
#endif
//! TODO: some megdnn oprs have to be known here when midout.h produces its
//! instrumentation; fix it if there is a more graceful way.
#include "megdnn/oprs.h"
#include "midout.h"
MIDOUT_DECL(megbrain_opr_profile)
#define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_profile, __VA_ARGS__) {
#define MIDOUT_E \
} \
MIDOUT_END();
namespace {
std::string serialize_policy(const megdnn::ExecutionPolicy& policy) {
std::string ret;
//! serialize AlgorithmDesc
megdnn::Algorithm::serialize_write_pod(policy.algo.handle_type, ret);
megdnn::Algorithm::serialize_write_pod(policy.algo.type, ret);
uint32_t param_size = policy.algo.param.size();
uint32_t name_size = policy.algo.name.size();
megdnn::Algorithm::serialize_write_pod<uint32_t>(param_size, ret);
megdnn::Algorithm::serialize_write_pod<uint32_t>(name_size, ret);
ret += policy.algo.param;
ret += policy.algo.name;
//! serialize sub_policy
uint32_t size = policy.sub_policy.size();
megdnn::Algorithm::serialize_write_pod(size, ret);
for (auto&& sub : policy.sub_policy) {
ret += serialize_policy(sub);
}
return ret;
}
megdnn::ExecutionPolicy deserialize_policy(
const char* buf, uint32_t size, uint32_t& offset) {
megdnn::ExecutionPolicy ret;
#define cb(_val, _type) \
_val = megdnn::Algorithm::deserialize_read_pod<_type>(buf, offset); \
offset += sizeof(_val)
cb(ret.algo.handle_type, megdnn::Handle::HandleType);
cb(ret.algo.type, uint32_t);
uint32_t param_size = 0;
uint32_t name_size = 0;
cb(param_size, uint32_t);
cb(name_size, uint32_t);
if (param_size > 0) {
ret.algo.param = std::string(buf + offset, param_size);
offset += param_size;
}
if (name_size > 0) {
ret.algo.name = std::string(buf + offset, name_size);
offset += name_size;
}
uint32_t nr_policy = 0;
cb(nr_policy, uint32_t);
#undef cb
for (uint32_t i = 0; i < nr_policy; i++) {
ret.sub_policy.push_back(deserialize_policy(buf, size, offset));
}
return ret;
}
} // namespace
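// Illustrative round-trip sketch (not part of the original file): the two
// helpers above recurse over sub_policy, so nested policies survive the
// round trip. All values below are hypothetical placeholders.
//
//     megdnn::ExecutionPolicy root, sub;
//     root.algo.name = "ALGO_A";   // hypothetical algo name
//     sub.algo.name = "ALGO_SUB";  // hypothetical sub-algo name
//     root.sub_policy.push_back(sub);
//     std::string buf = serialize_policy(root);
//     uint32_t offset = 0;
//     auto restored = deserialize_policy(
//             buf.data(), static_cast<uint32_t>(buf.size()), offset);
//     // restored.sub_policy[0].algo.name == "ALGO_SUB"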
namespace mgb {
namespace opr {
#define APPLY(statement, ...) \
mgb::apply( \
[&](const auto&... args) { return statement; }, \
std::tuple_cat(__VA_ARGS__))
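// How APPLY unpacks (sketch, assuming mgb::apply behaves like std::apply):
// the trailing arguments are tuple_cat'ed and expanded into the lambda's
// parameter pack `args`, so e.g.
//
//     std::array<DeviceTensorND, 2> inp;  // hypothetical
//     std::array<DeviceTensorND, 1> out;  // hypothetical
//     APPLY(opr->exec(args.as_megdnn()..., ws), inp, out);
//
// is roughly equivalent to
//
//     opr->exec(inp[0].as_megdnn(), inp[1].as_megdnn(),
//               out[0].as_megdnn(), ws);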
////////////// TimedProfiler::Param::ExecutionPolicyBlob //////////////////////
template <typename Opr>
typename TimedProfiler<Opr>::Param::ExecutionPolicyBlob TimedProfiler<Opr>::Param::
ExecutionPolicyBlob::serialize(const megdnn::ExecutionPolicy& policy) {
ExecutionPolicyBlob ret;
std::string serialize_bin = serialize_policy(policy);
mgb_assert(serialize_bin.size() < MAX_SIZE_IN_BYTES);
memcpy(ret.data, serialize_bin.data(), serialize_bin.size());
ret.size = serialize_bin.size();
return ret;
}
template <typename Opr>
megdnn::ExecutionPolicy TimedProfiler<Opr>::Param::ExecutionPolicyBlob::deserialize()
const {
uint32_t offset = 0;
auto&& ret = deserialize_policy(data, size, offset);
mgb_assert(offset == size);
return std::move(ret);
}
#define INST(Opr) \
template typename TimedProfiler<megdnn::Opr>::Param::ExecutionPolicyBlob \
TimedProfiler<megdnn::Opr>::Param::ExecutionPolicyBlob::serialize( \
const megdnn::ExecutionPolicy& policy); \
template megdnn::ExecutionPolicy \
TimedProfiler<megdnn::Opr>::Param::ExecutionPolicyBlob::deserialize() const;
MGB_FOREACH_FASTRUN_OPR(INST)
#undef INST
////////////////// TimedProfiler //////////////////////////////
template <typename Opr>
const double TimedProfiler<Opr>::timeout_setting =
TimedProfiler<Opr>::init_timeout_setting();
template <typename Opr>
double TimedProfiler<Opr>::init_timeout_setting() {
#if MGB_ENABLE_FASTRUN
sys::TimedFuncInvoker::ins().register_func(
AlgoChooserFuncId<Opr>::ID, &TimedProfiler<Opr>::prof_impl,
&TimedProfiler<Opr>::prof_init_device);
auto to_set = MGB_GETENV("MGB_CONV_PROFILING_TIMEOUT");
if (to_set)
return std::stod(to_set);
#endif
return 0;
}
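// Note (illustrative): running the process with MGB_CONV_PROFILING_TIMEOUT=10
// makes init_timeout_setting() return 10.0, capping every fastrun profiling
// run at roughly ten seconds.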
template <typename Opr>
void TimedProfiler<Opr>::preprocess(
const TensorLayoutArray&, const megdnn::SmallVector<DeviceTensorND>&,
intl::UniqPtrWithCN<Opr>&, megdnn::Workspace&, std::array<TensorLayout, arity>&,
std::array<DeviceTensorND, arity_in>&, PreprocessFilter<Opr>&) {
// Opr is neither ConvBias nor Convolution; this function does nothing.
}
//! convbias
template <>
void TimedProfiler<megdnn::ConvBias>::preprocess(
const TensorLayoutArray& preprocessed_layout,
const SmallVector<DeviceTensorND>& flt_val,
intl::UniqPtrWithCN<megdnn::ConvBias>& megdnn_opr,
megdnn::Workspace& mdn_workspace, std::array<TensorLayout, arity>& layouts,
std::array<DeviceTensorND, arity_in>& inp_val,
PreprocessFilter<megdnn::ConvBias>& prep_flt) {
if (!preprocessed_layout.empty()) {
auto&& pf = prep_flt;
pf.algorithm_id = nullptr;
pf.tensors.resize(flt_val.size());
for (size_t i = 0; i < flt_val.size(); i++) {
pf.tensors[i] = flt_val[i].as_megdnn();
}
APPLY(megdnn_opr->exec_preprocess(args..., &pf, mdn_workspace),
std::forward_as_tuple(
layouts[0], inp_val[1].as_megdnn(), inp_val[2].as_megdnn()),
array_skip<arity_in - 1>(layouts));
}
}
//! convolution
template <>
void TimedProfiler<megdnn::ConvolutionForward>::preprocess(
const TensorLayoutArray& preprocessed_layout,
const megdnn::SmallVector<DeviceTensorND>& flt_val,
intl::UniqPtrWithCN<megdnn::ConvolutionForward>& megdnn_opr,
megdnn::Workspace& mdn_workspace, std::array<TensorLayout, arity>& layouts,
std::array<DeviceTensorND, arity_in>& inp_val,
PreprocessFilter<megdnn::ConvolutionForward>& prep_flt) {
if (!preprocessed_layout.empty()) {
auto&& pf = prep_flt;
pf.algorithm_id = nullptr;
pf.tensors.resize(flt_val.size());
for (size_t i = 0; i < flt_val.size(); i++) {
pf.tensors[i] = flt_val[i].as_megdnn();
}
APPLY(megdnn_opr->exec_preprocess(args..., &pf, mdn_workspace),
std::forward_as_tuple(layouts[0], inp_val[1].as_megdnn()),
array_skip<2>(layouts));
}
}
template <typename Opr>
typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
const TParam& raw_param) {
MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("TimedProfiler::prof_impl")))
#if MGB_ROCM
bool miopen_algo_search_enabled;
megcore::getMIOpenAlgoSearchStatus(&miopen_algo_search_enabled);
mgb_assert(miopen_algo_search_enabled, "MIOpen algo search not enabled");
#endif
auto&& param = raw_param.as_single_pod<Param>();
CompNode cn = CompNode::load(param.comp_node_physical, param.comp_node_logical);
auto megdnn_opr = intl::create_megdnn_opr<Opr>(cn);
std::array<TensorLayout, arity> layouts;
auto from_enum = [&](DTypeEnum enumv) -> DType {
switch (enumv) {
#define cb(_dt) \
case DTypeTrait<_dt>::enumv: \
return _dt(1.0f, static_cast<uint8_t>(0))
cb(dtype::Quantized8Asymm);
cb(dtype::Quantized4Asymm);
#undef cb
#define cb(_dt) \
case DTypeTrait<_dt>::enumv: \
return _dt(1.0f)
cb(dtype::QuantizedS8);
cb(dtype::QuantizedS16);
cb(dtype::QuantizedS32);
cb(dtype::QuantizedS4);
default:
return DType::from_enum(enumv);
#undef cb
}
};
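// e.g. from_enum(DTypeEnum::QuantizedS8) yields dtype::QuantizedS8(1.0f);
// profiling only needs well-formed layouts, so dummy scale/zero-point
// values suffice for the quantized dtypes.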
for (int i = 0; i < arity; ++i) {
layouts[i] = {param.shapes[i], from_enum(param.dtypes[i])};
}
megdnn_opr->param() = param.opr_param;
megdnn_opr->execution_policy() = param.execution_policy.deserialize();
// deduce layouts of the preprocessed weights (buffers are allocated below)
TensorLayoutArray preprocessed_layout;
if_constexpr<opr_supports_preprocess<Opr>()>([&](auto _) {
if (param.allow_weight_preprocess) {
preprocessed_layout = APPLY(
_(megdnn_opr)->deduce_preprocessed_filter_layout(args...), layouts);
}
});
{
// first allocate a whole chunk to avoid memory fragmentation (here we
// rely on the memory allocator to reuse memory)
auto align = cn.get_mem_addr_alignment();
size_t tot_size = align;
for (int i = 0; i < arity; ++i) {
tot_size += layouts[i].span().high_byte + align;
}
for (const auto& layout : preprocessed_layout) {
tot_size += layout.span().high_byte + align;
}
tot_size += param.workspace;
DeviceTensorStorage storage{cn};
storage.ensure_size(tot_size);
}
// allocate input and output memory
std::array<DeviceTensorND, arity_in> inp_val;
std::array<DeviceTensorND, arity_out> out_val;
DeviceTensorND workspace;
for (int i = 0; i < arity_in; ++i) {
inp_val[i].comp_node(cn).dtype(layouts[i].dtype).resize(layouts[i]);
}
for (int i = 0; i < arity_out; ++i) {
out_val[i]
.comp_node(cn)
.dtype(layouts[arity_in + i].dtype)
.resize(layouts[arity_in + i]);
}
megdnn::Workspace mdn_workspace;
// allocate workspace
if (param.workspace) {
workspace.comp_node(cn).dtype(dtype::Byte()).resize({param.workspace});
mdn_workspace.size = param.workspace;
mdn_workspace.raw_ptr = workspace.raw_ptr();
}
// allocate storage for preprocessed filter
SmallVector<DeviceTensorND> flt_val(preprocessed_layout.size());
for (size_t i = 0; i < preprocessed_layout.size(); i++) {
flt_val[i] = {
cn, preprocessed_layout[i], preprocessed_layout[i].dtype,
preprocessed_layout[i].format};
}
for (int i = 0; i < arity_in; ++i) {
fill_zero_dev_tensor(inp_val[i]);
}
PreprocessFilter<Opr> prep_flt;
preprocess(
preprocessed_layout, flt_val, megdnn_opr, mdn_workspace, layouts, inp_val,
prep_flt);
RealTimer timer;
auto ev_start = cn.create_event(CompNode::Event::NEED_TIMER),
ev_end = cn.create_event(CompNode::Event::NEED_TIMER);
ev_start->record();
if_constexpr<opr_supports_preprocess<Opr>()>(
[&](auto _) {
auto&& opr = _(megdnn_opr);
PreprocessFilter<Opr>* pf =
preprocessed_layout.empty() ? nullptr : &prep_flt;
APPLY(opr->exec(args.as_megdnn()..., pf, mdn_workspace), inp_val,
out_val);
},
/* else */
[&](auto _) {
APPLY(_(megdnn_opr)->exec(args.as_megdnn()..., mdn_workspace), inp_val,
out_val);
});
ev_end->record();
megdnn::Algorithm* algo =
megdnn_opr->get_algorithm_from_desc(megdnn_opr->execution_policy().algo);
mgb_assert(algo);
double next_report_time = 0.5;
while (!ev_end->finished()) {
if (timer.get_secs() >= next_report_time) {
#if MGB_ENABLE_GETENV
mgb_log_warn(
"profiling conv algo %s already took %.3f/%.3f secs"
" (limit can be set by MGB_CONV_PROFILING_TIMEOUT) ",
algo->name(), timer.get_secs(), param.actual_timeout);
#else
mgb_log_warn(
"profiling conv algo %s already took %.3f/%.3f secs", algo->name(),
timer.get_secs(), param.actual_timeout);
#endif
next_report_time = timer.get_secs() + 1;
}
using namespace std::literals;
#if !__DEPLOY_ON_XP_SP2__
std::this_thread::sleep_for(1000us);
#endif
}
// release all free blocks owned by the child process,
// in order to avoid the main process running out of memory
cn.try_coalesce_all_free_memory();
mgb_assert(ev_start->finished());
return TResult::from_pod(Result{ev_start->elapsed_time_until(*ev_end)});
MIDOUT_E
};
template <typename Opr>
Maybe<typename TimedProfiler<Opr>::Result> TimedProfiler<Opr>::profile(
const Param& param, double& timeout) {
mgb_assert(timeout >= 0);
if (!timeout) {
timeout = timeout_setting;
} else if (timeout_setting) {
timeout = std::min(timeout, timeout_setting);
}
param.actual_timeout = timeout ? timeout : std::numeric_limits<double>::infinity();
auto res = sys::TimedFuncInvoker::ins().invoke(
AlgoChooserFuncId<Opr>::ID, TParam::from_pod(const_cast<Param&>(param)),
timeout);
if (res.valid())
return res.val().template as_single_pod<Result>();
return None;
}
template <typename Opr>
void TimedProfiler<Opr>::prof_init_device(const TParam& raw_param) {
MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("TimedProfiler::prof_init_device")))
#if MGB_ROCM
megcore::enableMIOpenAlgoSearch(true);
#endif
auto&& param = raw_param.as_single_pod<Param>();
CompNode cn = CompNode::load(param.comp_node_physical, param.comp_node_logical);
// wait for cuda init, so that its time is not accounted in the timeout
cn.sync();
MIDOUT_E
}
#define INST(Opr) \
template const double TimedProfiler<megdnn::Opr>::timeout_setting; \
template double TimedProfiler<megdnn::Opr>::init_timeout_setting(); \
template typename TimedProfiler<megdnn::Opr>::TResult \
TimedProfiler<megdnn::Opr>::prof_impl(const TParam& raw_param); \
template Maybe<typename TimedProfiler<megdnn::Opr>::Result> \
TimedProfiler<megdnn::Opr>::profile(const Param& param, double& timeout); \
template void TimedProfiler<megdnn::Opr>::prof_init_device(const TParam& raw_param);
MGB_FOREACH_FASTRUN_OPR(INST)
#undef INST
} // namespace opr
} // namespace mgb
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
......@@ -12,7 +12,7 @@
#pragma once
#include "megbrain/opr/search_policy/profiler.h"
#include "megbrain/opr/search_policy/algo_chooser.h"
#include "../internal/megdnn_opr_wrapper.inl"
......@@ -25,7 +25,7 @@ namespace intl {
struct AutoAddWorkspaceNeedLimitGetter<megdnn::_Opr> { \
static constexpr bool val = true; \
};
MGB_FOREACH_FASTRUN_OPR(cb)
DNN_FOREACH_FASTRUN_OPR(cb)
#undef cb
......
......@@ -13,6 +13,7 @@
#include "megbrain/graph.h"
#include "megbrain/opr/internal/mixin_base.h"
#include "megbrain/rdnn/management.h"
#include "megdnn/handle.h"
......@@ -20,43 +21,6 @@ namespace mgb {
namespace opr {
namespace intl {
//! get megdnn handle from comp node
MGE_WIN_DECLSPEC_FUC megdnn::Handle* get_megdnn_handle(CompNode comp_node);
MGE_WIN_DECLSPEC_FUC std::shared_ptr<megdnn::Handle> get_megdnn_handle_shared(
CompNode comp_node);
/*!
* \brief get global megdnn operator associated with a computing node
* \tparam Opr megdnn operator class, must be one of:
* * AddUpdate
* * Relayout
* * Checksum
*/
template <typename Opr>
MGE_WIN_DECLSPEC_FUC Opr* get_megdnn_global_opr(CompNode comp_node);
template <class Obj>
class UniqPtrWithCN : public std::unique_ptr<Obj> {
CompNode m_cn;
public:
UniqPtrWithCN() = default;
template <class RObj>
UniqPtrWithCN(UniqPtrWithCN<RObj>&& o)
: std::unique_ptr<Obj>(std::move(o)), m_cn(o.comp_node()) {}
UniqPtrWithCN(std::unique_ptr<Obj> ptr, CompNode cn)
: std::unique_ptr<Obj>{std::move(ptr)}, m_cn{cn} {}
CompNode comp_node() const { return m_cn; }
};
//! create a megdnn opr from the megdnn handle of a CompNode
template <class Opr>
UniqPtrWithCN<Opr> create_megdnn_opr(CompNode comp_node) {
return {get_megdnn_handle(comp_node)->create_operator<Opr>(), comp_node};
}
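// Usage sketch (hypothetical; src/dst stand for real megdnn::TensorNDs):
// UniqPtrWithCN keeps the plain unique_ptr interface while remembering
// the comp node the opr was created on.
//
//     auto relayout = create_megdnn_opr<megdnn::Relayout>(cn);
//     mgb_assert(relayout.comp_node() == cn);
//     relayout->exec(src, dst);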
/*!
* \brief get temporary storage for oprs
......
......@@ -19,7 +19,7 @@
#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/opr/dnn/pooling.h"
#include "megbrain/opr/search_policy/algo_chooser_helper.h"
#include "megbrain/opr/search_policy/profiler.h"
#include "megbrain/rdnn/algo_chooser.h"
#include "megdnn/oprs/base.h"
template <class MegDNNOpr>
......@@ -31,18 +31,13 @@ struct MegDNNOpr2MGBOpr;
using MGBOpr = mgb::opr::_Opr; \
};
MGB_FOREACH_FASTRUN_OPR(cb)
DNN_FOREACH_FASTRUN_OPR(cb)
#undef cb
namespace mgb {
//! define logical operations on megdnn::param::ExecutionPolicy::Strategy::Enum
//! and the megdnn::detail::AlgoAttribute enum
using ExecutionStrategy = megdnn::param::ExecutionPolicy::Strategy;
using AlgoAttribute = megdnn::AlgoAttribute;
#define MGB_FOREACH_FASTRUN_OPR(cb) DNN_FOREACH_FASTRUN_OPR(cb)
namespace mgb {
namespace opr {
/* =================== AlgoChooser =================== */
......@@ -56,138 +51,14 @@ namespace opr {
* \tparam Opr megdnn operator impl
*/
template <typename Opr>
class AlgoChooser {
static constexpr int arity_in = OprArityTrait<Opr>::arity_in;
static constexpr int arity_out = OprArityTrait<Opr>::arity_out;
static constexpr int arity = OprArityTrait<Opr>::arity;
using ImplAlgo = typename Opr::AlgorithmInfo;
using ImplAlgoDesc = typename Opr::AlgorithmInfo::Desc;
using ImplExecutionPolicy = megdnn::ExecutionPolicy;
class AlgoChooser : public rdnn::AlgoChooser<Opr> {
using Base = rdnn::AlgoChooser<Opr>;
using MGBOpr = typename MegDNNOpr2MGBOpr<Opr>::MGBOpr;
using ImplExecutionPolicy = typename Base::ImplExecutionPolicy;
public:
using FixedTensorLayouts = std::array<TensorLayout, arity>;
class AlgoChooserHelper {
//! fastrun layouts
FixedTensorLayouts m_fastrun_layouts;
//! layouts used when get and set cache item
FixedTensorLayouts m_incache_layouts;
Opr* m_dnn_opr;
std::string m_param;
const cg::OperatorNodeBase* m_base_mgb_opr;
CompNode m_cn;
megdnn::param::ExecutionPolicy m_execution_policy;
bool m_allow_weight_preprocess;
public:
AlgoChooserHelper(
const FixedTensorLayouts& layouts, Opr* megdnn_opr,
const std::string& param_str, const cg::OperatorNodeBase* mgb_opr,
const CompNode& cn,
const megdnn::param::ExecutionPolicy& execution_policy,
bool allow_weight_preprocess);
Opr* megdnn_opr() const { return m_dnn_opr; }
const cg::OperatorNodeBase* mgb_opr() const { return m_base_mgb_opr; }
const TensorLayout& inp_layout(size_t idx) const {
return m_fastrun_layouts[idx];
}
cg::ComputingGraph* owner_graph() const {
return m_base_mgb_opr->owner_graph();
}
const megdnn::param::ExecutionPolicy& execution_policy() const {
return m_execution_policy;
}
CompNode comp_node() const { return m_cn; }
const std::string& param() const { return m_param; }
bool allow_weight_preprocess() const { return m_allow_weight_preprocess; }
megdnn::Algorithm* get_algorithm_from_desc(
const megdnn::Algorithm::Info::Desc& desc) const {
return m_dnn_opr->get_algorithm_from_desc(desc);
}
const FixedTensorLayouts& fastrun_layouts() const { return m_fastrun_layouts; }
const FixedTensorLayouts& incache_layouts() const { return m_incache_layouts; }
//! construct algo chain by heuristic
ImplExecutionPolicy choose_by_heuristic(
const ExecutionStrategy& selected_strategy) const;
//! construct algo chain by profiling
ImplExecutionPolicy choose_by_profile(
const ExecutionStrategy& selected_strategy, bool enable_update) const;
//! get all profiled algorithms from the cache; return invalid if none exist
std::pair<ImplAlgoDesc, Maybe<AlgoChooserProfileCache::Result>>
get_profile_result_from_cache(const ExecutionStrategy& selected_strategy) const;
/**
* \brief construct execution policy from cache or heuristic.
*
* \param selected_strategy select algo which matched this strategy
* \param[in,out] policy execution policy
* \param retrive_from_cache retrieve the algo from the cache if set to
* true, get it from heuristic otherwise.
* \param allow_log no warning log is printed if set to true, warning
* info is printed otherwise.
*/
void construct_execution_policy(
const ExecutionStrategy& selected_strategy, ImplExecutionPolicy& policy,
bool retrive_from_cache = true, bool allow_log = true) const;
//! get workspace size required for specific execution policy
size_t get_workspace_size_bytes(
const ImplExecutionPolicy& policy,
const FixedTensorLayouts& layouts = {}) const;
//! get all candidate algos; the one chosen by choose_by_heuristic() is
//! put first
std::vector<ImplAlgo> get_all_candidates() const;
/*!
* \brief profile a single algorithm
*
* This is actually a wrapper that constructs the param and calls
* TimedProfiler<Opr>::profile for the actual profiling
*
* \param[in,out] timeout set the timeout, and return the actual
* timeout used during profiling
*/
Maybe<AlgoChooserProfileCache::ResultEntry> profile_single_algo(
const ImplExecutionPolicy& policy, double& timeout) const;
//! profile and save to cache
void profile(const ExecutionStrategy& selected_strategy) const;
/**
* \brief extract algo attribute from execution strategy and graph
* option.
*
* \param strategy select algo which matched this strategy
* \return pair<positive_attr, negative_attr>
*/
std::pair<AlgoAttribute, AlgoAttribute> extract_algo_attribute(
const ExecutionStrategy& strategy) const;
private:
Maybe<PreprocessFilter<Opr>> construct_fake_preprocess_filter(
const FixedTensorLayouts& layouts = {}) const;
};
template <typename U>
friend class AlgoChooser;
private:
//! entrance for getting algorithm according to execution strategy
static ImplExecutionPolicy get_policy(const AlgoChooserHelper& helper);
public:
using AlgoChooserHelper = typename Base::AlgoChooserHelper;
using FixedTensorLayouts = typename Base::FixedTensorLayouts;
/*!
* \brief setup algorithm and return workspace size
*/
......
/**
* \file src/opr/include/megbrain/opr/search_policy/profiler.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "megbrain/comp_node.h"
#include "megbrain/opr/internal/megdnn_opr_wrapper.h"
#include "megbrain/system.h"
#include "megbrain/tensor.h"
#include "megbrain/utils/hash_ct.h"
#include "megbrain/utils/timer.h"
#include "megdnn/basic_types.h"
#include "megdnn/oprs.h"
namespace mgb {
namespace opr {
// clang-format off
#define MGB_FOREACH_FASTRUN_OPR(cb) \
cb(ConvolutionForward) \
cb(ConvBiasForward) \
cb(ConvolutionBackwardData) \
cb(ConvolutionBackwardFilter) \
cb(Convolution3DForward) \
cb(Convolution3DBackwardData) \
cb(Convolution3DBackwardFilter) \
cb(LocalShareForward) \
cb(LocalShareBackwardData) \
cb(LocalShareBackwardFilter) \
cb(DeformableConvForward) \
cb(DeformableConvBackwardFilter) \
cb(DeformableConvBackwardData) \
cb(BatchConvBiasForward) \
cb(MatrixMul) \
cb(BatchedMatrixMul) \
cb(PoolingForward) \
cb(PoolingBackward)
// clang-format on
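// Expansion sketch: each cb receives a bare opr name, so a hypothetical
//     #define cb(_opr) void profile_##_opr();
//     MGB_FOREACH_FASTRUN_OPR(cb)
// declares profile_ConvolutionForward(), profile_ConvBiasForward(), ...,
// one declaration per fastrun opr listed above.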
template <typename Opr>
constexpr bool opr_supports_preprocess() {
return std::is_same<Opr, megdnn::ConvolutionForward>::value ||
std::is_same<Opr, megdnn::ConvBias>::value;
}
template <typename Opr>
constexpr bool opr_contain_bias() {
return std::is_same<Opr, megdnn::ConvBias>::value;
}
//! MatrixMul and BatchedMatrixMul
template <typename Opr>
constexpr bool is_matmul() {
return std::is_same<Opr, megdnn::MatrixMul>::value ||
std::is_same<Opr, megdnn::BatchedMatrixMul>::value;
}
template <typename Opr, bool has_prep>
struct PreprocessFilterImpl {
using T = union {};
};
template <typename Opr>
struct PreprocessFilterImpl<Opr, true> {
using T = typename Opr::PreprocessedFilter;
};
template <typename Opr>
using PreprocessFilter =
typename PreprocessFilterImpl<Opr, opr_supports_preprocess<Opr>()>::T;
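// Selection sketch: for oprs that support weight preprocessing the alias
// resolves to the real filter type, otherwise to the empty placeholder
// union, e.g.
//
//     static_assert(
//             std::is_same<PreprocessFilter<megdnn::ConvBias>,
//                          megdnn::ConvBias::PreprocessedFilter>::value,
//             "illustrative only");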
template <typename Opr>
struct AlgoChooserFuncId {};
#define DEF_FUNC_ID(func) \
template <> \
struct AlgoChooserFuncId<megdnn::func> { \
__attribute__((unused)) static constexpr sys::TimedFuncInvoker::FuncId ID = \
static_cast<sys::TimedFuncInvoker::FuncId>( \
MGB_HASH_STR("megdnn::" #func)); \
};
MGB_FOREACH_FASTRUN_OPR(DEF_FUNC_ID)
#undef DEF_FUNC_ID
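// Sketch: each generated ID is a compile-time hash of the opr name, e.g.
//
//     constexpr auto id = AlgoChooserFuncId<megdnn::ConvolutionForward>::ID;
//     // == FuncId(MGB_HASH_STR("megdnn::ConvolutionForward"))
//
// so the registration in init_timeout_setting() and the lookup in profile()
// agree on the same function id across processes.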
/* =================== TimedProfiler =================== */
/*!
* \brief profile a megdnn conv opr with a given param
*
* This class only provides static methods, and the entry point is
* TimedProfiler::profile; it runs the profiler in a timed environment
* via sys::TimedFuncInvoker
*
* \tparam Opr megdnn opr impl
*/
template <typename Opr>
class TimedProfiler {
static constexpr int arity_in = OprArityTrait<Opr>::arity_in;
static constexpr int arity_out = OprArityTrait<Opr>::arity_out;
static constexpr int arity = OprArityTrait<Opr>::arity;
using TensorShapeArray = std::array<megdnn::TensorShape, arity>;
public:
struct Param {
struct ExecutionPolicyBlob {
//! enlarge the max size if needed
constexpr static size_t MAX_SIZE_IN_BYTES = 10240;
char data[MAX_SIZE_IN_BYTES];
uint32_t size;
static ExecutionPolicyBlob serialize(const megdnn::ExecutionPolicy& policy);
megdnn::ExecutionPolicy deserialize() const;
};
ExecutionPolicyBlob execution_policy;
size_t workspace;
megdnn::DTypeEnum dtypes[arity];
CompNode::Locator comp_node_physical, comp_node_logical;
TensorShapeArray shapes;
typename Opr::Param opr_param;
bool allow_weight_preprocess;
//! filled by profile()
mutable double actual_timeout;
};
struct Result {
double time;
};
static Maybe<Result> profile(const Param& param, double& timeout);
private:
using TParam = sys::TimedFuncInvoker::Param;
using TResult = sys::TimedFuncInvoker::Result;
static const double timeout_setting;
static double init_timeout_setting();
static void preprocess(
const megdnn::TensorLayoutArray& preprocessed_layout,
const SmallVector<DeviceTensorND>& flt_val,
intl::UniqPtrWithCN<Opr>& megdnn_opr, megdnn::Workspace& mdn_workspace,
std::array<TensorLayout, arity>& layouts,
std::array<DeviceTensorND, arity_in>& inp_val,
PreprocessFilter<Opr>& prep_flt);
static TResult prof_impl(const TParam& raw_param);
static void prof_init_device(const TParam& raw_param);
};
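// Minimal usage sketch (all values hypothetical; real callers such as
// AlgoChooser fill Param from the actual tensor layouts and policy):
//
//     using Prof = TimedProfiler<megdnn::ConvolutionForward>;
//     Prof::Param param{};  // shapes/dtypes/opr_param omitted here
//     param.workspace = 0;
//     param.allow_weight_preprocess = false;
//     double timeout = 0;   // 0: fall back to the global setting
//     auto res = Prof::profile(param, timeout);
//     if (res.valid())
//         mgb_log("profiled time: %g", res.val().time);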
} // namespace opr
} // namespace mgb
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}