feat(mgb/opr): add fast profile and combined Execution strategy

GitOrigin-RevId: 843dc3a7907bc6ec9a728ec6425b7910d9c136c5

feat(mgb/opr): add fast profile and combined Execution strategy
GitOrigin-RevId: 843dc3a7907bc6ec9a728ec6425b7910d9c136c5
a3ea1f15 · Megvii Engine Team · 80f00643 · a3ea1f15 · a3ea1f15 · a3ea1f15
31 changed file
--- a/dnn/include/megdnn/basic_types.h
+++ b/dnn/include/megdnn/basic_types.h
@@ -506,10 +506,66 @@ struct DynOutMallocPolicyCall {
    }
 };

+//! wraps the underlying integer of enum class \p T so bit ops can be chained
+template <typename T>
+class EnumClassBit {
+    std::underlying_type_t<T> m_val;
+    //! private: build from raw bits; used by the operator results below
+    constexpr EnumClassBit(std::underlying_type_t<T> v) : m_val(v) {}
+
+public:
+    constexpr EnumClassBit(T v)
+            : m_val(static_cast<std::underlying_type_t<T>>(v)) {}
+    //! implicit conversion back to the enum type
+    constexpr operator T() const { return static_cast<T>(m_val); }
+    //! explicit: true iff the stored value is nonzero
+    constexpr explicit operator bool() const { return m_val; }
+    //! binary bit operators; both operands wrap the same enum type
+#define DEF_OPR(op)                                                     \
+    constexpr EnumClassBit operator op(const EnumClassBit& rhs) const { \
+        return m_val op rhs.m_val;                                      \
+    }
+
+    DEF_OPR(&)
+    DEF_OPR(|)
+    DEF_OPR (^)
+    //! bitwise complement of the stored value
+    constexpr EnumClassBit operator~() const { return ~m_val; }
+
+#undef DEF_OPR
+};
+
 #endif  // MEGDNN_CC_HOST

 }  // namespace megdnn

+#define _MEGDNN_DECBO_SINGLE_OPR(cls, op)                                    \
+    inline constexpr ::megdnn::EnumClassBit<cls> operator op(cls x, cls y) { \
+        return ::megdnn::EnumClassBit<cls>(x)                                \
+                op ::megdnn::EnumClassBit<cls>(y);                           \
+    }                                                                        \
+    inline constexpr ::megdnn::EnumClassBit<cls> operator op(                \
+            ::megdnn::EnumClassBit<cls> x, cls y) {                          \
+        return x op ::megdnn::EnumClassBit<cls>(y);                          \
+    }
+//! define `cls& operator op=(cls&, cls)` using the binary \p op on \p cls
+#define _MEGDNN_DECBO_SINGLE_OPR_ASSIGN(cls, op)          \
+    inline constexpr cls& operator op##=(cls& x, cls y) { \
+        x = x op ::megdnn::EnumClassBit<cls>(y);          \
+        return x;                                         \
+    }
+//! define &, |, ^, ~ and &=, |=, ^= for the enum class \p cls
+#define MEGDNN_DEF_ENUM_CLASS_BIT_OPR(cls)                          \
+    _MEGDNN_DECBO_SINGLE_OPR(cls, &)                                \
+    _MEGDNN_DECBO_SINGLE_OPR(cls, |)                                \
+    _MEGDNN_DECBO_SINGLE_OPR(cls, ^)                                \
+    _MEGDNN_DECBO_SINGLE_OPR_ASSIGN(cls, &)                         \
+    _MEGDNN_DECBO_SINGLE_OPR_ASSIGN(cls, |)                         \
+    _MEGDNN_DECBO_SINGLE_OPR_ASSIGN(cls, ^)                         \
+    inline constexpr ::megdnn::EnumClassBit<cls> operator~(cls x) { \
+        return ~::megdnn::EnumClassBit<cls>(x);                     \
+    }
+
 #include "megdnn/internal/visibility_epilogue.h"

 // vim: syntax=cpp.doxygen
--- a/dnn/include/megdnn/oprs/base.h
+++ b/dnn/include/megdnn/oprs/base.h
@@ -251,6 +251,8 @@ protected:
    Handle::HandleType m_handle_type = Handle::HandleType::NAIVE;
 };

+MEGDNN_DEF_ENUM_CLASS_BIT_OPR(Algorithm::Attribute)
+
 //! policy for executing the operator
 struct ExecutionPolicy {
    //! INVALID_ALGO_TYPE algo_type means using heuristic

--- a/dnn/scripts/gen_flatbuffers_schema.py
+++ b/dnn/scripts/gen_flatbuffers_schema.py
@@ -53,9 +53,13 @@ class FlatBuffersWriter(IndentWriterBase):
            e = self._enums[(p, e)]
            self._write_doc(e.name)
            self._write("enum %s%s : uint {", p, e.name, indent=1)
-            for member in e.members:
+            for idx, member in enumerate(e.members):
                self._write_doc(member)
-                self._write("%s,", scramble_enum_member_name(str(member)))
+                if e.combined:
+                    self._write("%s=%d,", scramble_enum_member_name(str(member)),
+                            1<<idx)
+                else:
+                    self._write("%s,", scramble_enum_member_name(str(member)))
            self._write("}\n", indent=-1)

    def _write_doc(self, doc):

--- a/dnn/scripts/gen_param_defs.py
+++ b/dnn/scripts/gen_param_defs.py
@@ -80,13 +80,13 @@ class member_defs:
        :attr member_alias: list of (member, alias) pairs
        """
        __slots__ = ['name', 'name_field', 'members', 'default',
-                     'member_alias']
+                     'member_alias', 'combined']

        all_enums = {}
        """(param_name, name) => enum"""

        def __init__(self, param_name, name, name_field, members, default,
-                     member_alias):
+                member_alias, combined = False):
            name = member_defs.Doc.make(name)
            assert name.id[0].isupper()
            members = tuple(map(member_defs.Doc.make, members))
@@ -97,6 +97,7 @@ class member_defs:
                default = name_field.index(default)
            assert isinstance(default, int)
            self.name = name
+            self.combined = combined
            self.name_field = self.get_name_field(name.id, name_field)
            self.members = members
            self.default = default
@@ -197,6 +198,12 @@ class ParamDef:
            self.name.id, name, name_field, members, default, member_alias))
        return self

+    def add_bit_combination_enum(self, name, *members, default=0,
+                 name_field=None, member_alias=[]):
+        self.members.append(member_defs.Enum(
+            self.name.id, name, name_field, members, default, member_alias, True))
+        return self
+
    def add_enum_alias(self, name, src_class, src_name=None, name_field=None,
                       default=None):
        self.members.append(member_defs.EnumAlias(
@@ -463,8 +470,12 @@ class SerializedDType(_ParamDefBase):
        for idx, emem in enumerate(e.members):
            self._write('%s = "%s"', emem, emem)
            self._write_doc(emem)
-            self._enum_member2num.append('id({}.{}):{}'.format(
-                qualname, emem, idx))
+            if e.combined:
+                self._enum_member2num.append('id({}.{}):{}'.format(
+                    qualname, emem, 1<<idx))
+            else:
+                self._enum_member2num.append('id({}.{}):{}'.format(
+                    qualname, emem, idx))

        for emem, emem_alis in e.member_alias:
            self._write('%s = %s', emem_alis, emem)
@@ -622,6 +633,8 @@ class CPPWriter(IndentWriterBase):
        for idx, i in enumerate(e.members):
            self._write_doc(i)
            v = '{} = {}'.format(i, idx)
+            if e.combined:
+                v = '{} = 1 << {}'.format(i, idx)
            if i is not e.members[-1] or e.member_alias:
                v += ','
            self._write(v)
@@ -672,7 +685,6 @@ class CPPEnumValueWriter(CPPWriter):
            self._write('static const uint32_t %s = %s;', alias, mem)
        self._write('};', indent=-1)

-
    def _on_member_enum_alias(self, e):
        s = e.src_enum
        self._write('typedef %s::%s %s;', e.src_class, e.src_name, e.name)

--- a/dnn/scripts/gen_tablegen.py
+++ b/dnn/scripts/gen_tablegen.py
@@ -91,12 +91,17 @@ class ConverterWriter(IndentWriterBase):
        def format(v):
            return '\"{}\"'.format(str(v))
        enum_def += ','.join(format(i) for i in e.members)
-        enum_def += "]"
+
+        if e.combined:
+            enum_def += "], 1"
+        else:
+            enum_def += "], 0"
+
        if ENUM_TO_STRING_SPECIAL_RULES.count((p.name, e.name)):
            enum_def += ", 1" # whether generate ToStringTrait
        enum_def += ">"
-        self._write("def {} : {};".format(td_class, enum_def))

+        self._write("def {} : {};".format(td_class, enum_def))
        if self._skip_current_param:
            return


--- a/dnn/src/common/algo_base.h
+++ b/dnn/src/common/algo_base.h
@@ -21,8 +21,6 @@

 namespace megdnn {

-MEGDNN_DEF_ENUM_CLASS_BIT_OPR(AlgoAttribute)
-
 #define MEGDNN_DECL_ALGO_TYPE(_type)                              \
    uint32_t type() const override {                              \
        return static_cast<std::underlying_type<AlgoType>::type>( \

--- a/dnn/src/common/utils.h
+++ b/dnn/src/common/utils.h
@@ -692,61 +692,6 @@ inline void* get_origin_ptr(const TensorND* tensor, void* ptr) {
                              tensor->layout.span().low_byte);
 }

-template <typename T>
-class EnumClassBit {
-    std::underlying_type_t<T> m_val;
-
-    constexpr EnumClassBit(std::underlying_type_t<T> v) : m_val(v) {}
-
-public:
-    constexpr EnumClassBit(T v)
-            : m_val(static_cast<std::underlying_type_t<T>>(v)) {}
-
-    constexpr operator T() const { return static_cast<T>(m_val); }
-
-    constexpr explicit operator bool() const { return m_val; }
-
-#define DEF_OPR(op)                                                     \
-    constexpr EnumClassBit operator op(const EnumClassBit& rhs) const { \
-        return m_val op rhs.m_val;                                      \
-    }
-
-    DEF_OPR(&)
-    DEF_OPR(|)
-    DEF_OPR (^)
-
-    constexpr EnumClassBit operator~() const { return ~m_val; }
-
-#undef DEF_OPR
-};
-
-#define _MEGDNN_DECBO_SINGLE_OPR(cls, op)                                    \
-    inline constexpr ::megdnn::EnumClassBit<cls> operator op(cls x, cls y) { \
-        return ::megdnn::EnumClassBit<cls>(x)                                \
-                op ::megdnn::EnumClassBit<cls>(y);                           \
-    }                                                                        \
-    inline constexpr ::megdnn::EnumClassBit<cls> operator op(                \
-            ::megdnn::EnumClassBit<cls> x, cls y) {                          \
-        return x op ::megdnn::EnumClassBit<cls>(y);                          \
-    }
-
-#define _MEGDNN_DECBO_SINGLE_OPR_ASSIGN(cls, op)          \
-    inline constexpr cls& operator op##=(cls& x, cls y) { \
-        x = x op ::megdnn::EnumClassBit<cls>(y);          \
-        return x;                                         \
-    }
-
-#define MEGDNN_DEF_ENUM_CLASS_BIT_OPR(cls)                          \
-    _MEGDNN_DECBO_SINGLE_OPR(cls, &)                                \
-    _MEGDNN_DECBO_SINGLE_OPR(cls, |)                                \
-    _MEGDNN_DECBO_SINGLE_OPR(cls, ^)                                \
-    _MEGDNN_DECBO_SINGLE_OPR_ASSIGN(cls, &)                         \
-    _MEGDNN_DECBO_SINGLE_OPR_ASSIGN(cls, |)                         \
-    _MEGDNN_DECBO_SINGLE_OPR_ASSIGN(cls, ^)                         \
-    inline constexpr ::megdnn::EnumClassBit<cls> operator~(cls x) { \
-        return ~::megdnn::EnumClassBit<cls>(x);                     \
-    }
-
 }  // namespace megdnn

 // vim: syntax=cpp.doxygen
--- a/dnn/src/cuda/convolution3d/backward_filter/algo.h
+++ b/dnn/src/cuda/convolution3d/backward_filter/algo.h
@@ -218,4 +218,3 @@ public:
 }  // namespace megdnn

 // vim: syntax=cpp.doxygen
-
--- a/imperative/python/megengine/functional/debug_param.py
+++ b/imperative/python/megengine/functional/debug_param.py
@@ -8,9 +8,12 @@
 # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 import os

+from ..core.ops import builtin
 from ..logger import get_logger
 from ..utils.deprecation import deprecated

+Strategy = builtin.ops.Convolution.Strategy
+
 _execution_strategy = os.getenv("MEGENGINE_EXECUTION_STRATEGY", "HEURISTIC")

 if os.getenv("MEGENGINE_CONV_EXECUTION_STRATEGY") != None:
@@ -19,7 +22,7 @@ if os.getenv("MEGENGINE_CONV_EXECUTION_STRATEGY") != None:
    )


-def get_execution_strategy() -> str:
+def get_execution_strategy() -> Strategy:
    """
    Returns the execution strategy of :class:`~.Conv2d` and :func:'~.matmul'

@@ -28,12 +31,22 @@ def get_execution_strategy() -> str:
    return _execution_strategy


-def set_execution_strategy(option: str):
+def set_execution_strategy(option):
    """
    Sets the execution strategy of :class:`~.Conv2d` and :func:'~.matmul'

-    :param option: Decides how :class:`~.Conv2d` and :func:'~.matmul' algorithms are chosen.
-        Available values:
+    :param option: Decides how :class:`~.Conv2d` and :func:`~.matmul` algorithms are chosen.
+        Available ``Strategy`` values:
+        * HEURISTIC uses heuristic to choose the fastest algorithm.
+        * PROFILE runs possible algorithms on real device to find the best one.
+        * REPRODUCIBLE uses only algorithms that are reproducible.
+        * OPTMIZED uses only algorithms that are optimized.
+
+        The default strategy is HEURISTIC. These options can be combined to
+        form a combined option, e.g. PROFILE | REPRODUCIBLE selects
+        the fastest algorithm found by profiling that is also reproducible.
+
+        Available string values:

        * 'HEURISTIC' uses heuristic to choose the fastest algorithm.
        * 'PROFILE' runs possible algorithms on real device to find the best one.
@@ -45,18 +58,29 @@ def set_execution_strategy(option: str):

        It can also be set through the environment variable 'MEGENGINE_EXECUTION_STRATEGY'.
    """
-    valid_option = (
-        "HEURISTIC",
-        "PROFILE",
-        "PROFILE_HEURISTIC",
-        "PROFILE_REPRODUCIBLE",
-        "HEURISTIC_REPRODUCIBLE",
-    )
-    if not option in valid_option:
-        raise ValueError("Valid option can only be one of {}".format(valid_option))
+    valid_string_option = {
+        "REPRODUCIBLE": Strategy.REPRODUCIBLE,
+        "HEURISTIC": Strategy.HEURISTIC,
+        "PROFILE": Strategy.PROFILE,
+    }

    global _execution_strategy  # pylint: disable=global-statement
-    _execution_strategy = option
+    if isinstance(option, Strategy):
+        _execution_strategy = option
+        return
+
+    assert isinstance(option, str)
+
+    strategy_tmp = Strategy(0)
+    for opt in option.split("_"):
+        if not opt in valid_string_option:
+            raise ValueError(
+                "Valid option can only be one of {}, or combine them with '_'.".format(
+                    valid_string_option.keys()
+                )
+            )
+        strategy_tmp = strategy_tmp | valid_string_option[opt]
+    _execution_strategy = strategy_tmp


 @deprecated(version="1.3", reason="use get_execution_strategy() instead")

--- a/imperative/python/test/integration/test_correctness_mnistnet.py
+++ b/imperative/python/test/integration/test_correctness_mnistnet.py
@@ -19,6 +19,7 @@ import megengine.autodiff as ad
 import megengine.functional as F
 from megengine import jit
 from megengine.core._trace_option import set_symbolic_shape
+from megengine.core.ops import builtin
 from megengine.core.tensor.utils import make_shape_tuple
 from megengine.functional.debug_param import set_execution_strategy
 from megengine.jit import SublinearMemoryConfig
@@ -33,6 +34,8 @@ from megengine.module import (
 from megengine.optimizer import SGD
 from megengine.tensor import Tensor

+Strategy = builtin.ops.Convolution.Strategy
+

 def get_gpu_name():
    try:
@@ -242,7 +245,7 @@ def test_correctness():
    else:
        model_name = "mnist_model_with_test_cpu.mge"
    model_path = os.path.join(os.path.dirname(__file__), model_name)
-    set_execution_strategy("HEURISTIC_REPRODUCIBLE")
+    set_execution_strategy(Strategy.HEURISTIC | Strategy.REPRODUCIBLE)

    run_train(model_path, False, False, max_err=1e-5)
    run_train(model_path, True, False, max_err=1e-5)

--- a/imperative/tablegen/autogen.cpp
+++ b/imperative/tablegen/autogen.cpp
@@ -337,6 +337,20 @@ static void gen_op_def_pybind11_single(raw_ostream &os, MgbOp& op, EnumContext&
                        className, attr->getEnumName(), i
                    ));
                }
+                if (attr->getEnumCombinedFlag()) {
+                    //! define operator |
+                    os << formatv(
+                            "\n    .def(\"__or__\", []({0}::{1} s0, {0}::{1} s1) {{ "
+                            "\n         return static_cast<{0}::{1}>(uint32_t(s0) | uint32_t(s1));"
+                            "\n      })",
+                            className, attr->getEnumName());
+                    //! define operator &
+                    os << formatv(
+                            "\n    .def(\"__and__\", []({0}::{1} s0, {0}::{1} s1) {{"
+                            "\n         return static_cast<{0}::{1}>(uint32_t(s0) & uint32_t(s1));"
+                            "\n    })",
+                            className, attr->getEnumName());
+                }
                os << formatv(
                    "\n    .def(py::init([](const std::string& in) {"
                    "\n        auto&& str = normalize_enum(in);"

--- a/imperative/tablegen/helper.h
+++ b/imperative/tablegen/helper.h
@@ -77,6 +77,9 @@ struct MgbEnumAttrMixin : public MgbAttrWrapperBase {
    bool supportToString() const {
        return getBaseRecord()->getValueAsBit("supportToString");
    }
+    bool getEnumCombinedFlag() const {
+        return getBaseRecord()->getValueAsBit("enumCombined");
+    }
 };

 struct MgbHashableAttrMixin : public MgbAttrWrapperBase {

--- a/sdk/load-and-run/src/mgblar.cpp
+++ b/sdk/load-and-run/src/mgblar.cpp
@@ -142,8 +142,16 @@ R"__usage__(
 #if MGB_ENABLE_FASTRUN
 R"__usage__(
  --fast-run
-    Enable fast-run mode. Operators with multiple algorithms would be profiled
-    on the real device with actual input shapes.
+    This param will be deprecated later, please use --full-profile instead.
+ --full-profile
+    Enable full-profile mode. Operators with multiple algorithms would be profiled
+    on the real device with actual input shapes; all algorithms are profiled,
+    including naive algorithms.
+    See `mgb::gopt::enable_opr_algo_profiling_inplace` for more details.
+ --fast-profile
+    Enable fast-profile mode. Operators with multiple algorithms would be profiled
+    on the real device with actual input shapes, this mode will only profile the
+    well optimized algorithms to get the profile result fast.
    See `mgb::gopt::enable_opr_algo_profiling_inplace` for more details.
 )__usage__"
 #endif
@@ -511,7 +519,8 @@ struct Args {
    bool disable_assert_throw = false;
    bool share_param_mem = false;
 #if MGB_ENABLE_FASTRUN
-    bool use_fast_run = false;
+    bool use_full_profile = false;
+    bool use_fast_profile = false;
 #endif
    bool reproducible = false;
    std::string fast_run_cache_path;
@@ -695,18 +704,20 @@ void run_test_st(Args &env) {
    using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
    S strategy = S::HEURISTIC;
 #if MGB_ENABLE_FASTRUN
-    if (env.use_fast_run) {
+    if (env.use_full_profile) {
        if (env.reproducible) {
-            strategy = S::PROFILE_REPRODUCIBLE;
+            strategy = S::PROFILE | S::REPRODUCIBLE;
        } else {
            strategy = S::PROFILE;
        }
+    } else if (env.use_fast_profile) {
+        strategy = S::PROFILE | S::OPTMIZED;
    } else if (env.reproducible) {
-        strategy = S::HEURISTIC_REPRODUCIBLE;
+        strategy = S::HEURISTIC | S::REPRODUCIBLE;
    }
 #else
    if (env.reproducible) {
-        strategy = S::HEURISTIC_REPRODUCIBLE;
+        strategy = S::HEURISTIC | S::REPRODUCIBLE;
    }
 #endif
    mgb::gopt::modify_opr_algo_strategy_inplace(vars, strategy);
@@ -729,11 +740,12 @@ void run_test_st(Args &env) {
                    std::make_shared<InFilePersistentCache>(buf.get(), flen));
 #if MGB_ENABLE_FASTRUN
        } else {
-            mgb_assert(env.use_fast_run, "fast-run should be enabled");
+            mgb_assert(env.use_full_profile || env.use_fast_profile,
+                       "fast-run or fast-profile should be enabled");
            PersistentCache::set_impl(
                    std::make_shared<InFilePersistentCache>());
        }
-        if (!env.use_fast_run)
+        if (!env.use_full_profile && !env.use_fast_profile)
 #endif
            mgb::gopt::enable_opr_use_profiling_cache_inplace(vars);
    }
@@ -1314,7 +1326,18 @@ Args Args::from_argv(int argc, char **argv) {
        }
 #if MGB_ENABLE_FASTRUN
        if (!strcmp(argv[i], "--fast-run")) {
-            ret.use_fast_run = true;
+            mgb_log_warn(
+                    "--fast-run param will be deperated later, please replace "
+                    "with --full-profile or --fast-profile.");
+            ret.use_full_profile = true;
+            continue;
+        }
+        if (!strcmp(argv[i], "--full-profile")) {
+            ret.use_full_profile = true;
+            continue;
+        }
+        if (!strcmp(argv[i], "--fast-profile")) {
+            ret.use_fast_profile = true;
            continue;
        }
 #endif

--- a/src/core/impl/utils/persistent_cache.cpp
+++ b/src/core/impl/utils/persistent_cache.cpp
@@ -188,7 +188,7 @@ AlgoChooserProfileCache::get(const Key &key) {
        auto entry_len = read_uint32();
        mgb_assert(buf + entry_len <= buf_end);
        auto nr = sscanf(reinterpret_cast<const char*>(buf), ENTRY_FMT,
-                         &i.reproducible, &i.time, &i.workspace);
+                         &i.attribute, &i.time, &i.workspace);
        mgb_assert(nr == 3);
        buf += entry_len;
    }
@@ -210,10 +210,10 @@ void AlgoChooserProfileCache::put(const Key &key, Result &result) {
        auto &&cur = result[i];

        if (prev.workspace <= cur.workspace &&
-                prev.reproducible == cur.reproducible) {
+            prev.attribute == cur.attribute) {
            result.erase(result.begin() + i);
        } else {
-            ++ i;
+            ++i;
        }
    }

@@ -235,8 +235,8 @@ void AlgoChooserProfileCache::put(const Key &key, Result &result) {
        write_uint32(0);
        pos = val.size();
        val.resize(pos + SPR_SIZE);
-        uint32_t nr = snprintf(&val[pos], SPR_SIZE,
-                ENTRY_FMT, i.reproducible, i.time, i.workspace);
+        uint32_t nr = snprintf(&val[pos], SPR_SIZE, ENTRY_FMT, i.attribute,
+                               i.time, i.workspace);
        //! for memory boundary failed, snprintf ret do not contain \0
        nr += 1;
        mgb_assert(nr < SPR_SIZE);

--- a/src/core/include/megbrain/common.h
+++ b/src/core/include/megbrain/common.h
@@ -12,6 +12,8 @@
 #pragma once

 #include "megbrain_build_config.h"
+#include "megbrain/opr/param_defs.h"
+#include "megdnn/basic_types.h"

 #include <memory>
 #include <string>
@@ -242,6 +244,16 @@ inline constexpr std::size_t operator"" _z(unsigned long long n) {
    return n;
 }
 #endif
+
+#define MGB_DEF_ENUM_CLASS_BIT_OPR(cls) \
+    MEGDNN_DEF_ENUM_CLASS_BIT_OPR(cls)
+
 }   // namespace mgb

+namespace megdnn {
+namespace param {
+MGB_DEF_ENUM_CLASS_BIT_OPR(ExecutionPolicy::Strategy)
+}
+}  // namespace megdnn
+
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
--- a/src/core/include/megbrain/comp_node.h
+++ b/src/core/include/megbrain/comp_node.h
@@ -12,7 +12,6 @@
 #pragma once

 #include "megbrain/utils/hash.h"
-#include "megbrain/utils/enum_class_bit.h"
 #include "megbrain/utils/metahelper.h"
 #include "megbrain/utils/thin/hash_table.h"
 #include "megbrain/utils/thread.h"

--- a/src/core/include/megbrain/graph/operator_node.h
+++ b/src/core/include/megbrain/graph/operator_node.h
@@ -16,7 +16,6 @@
 #include "megbrain/graph/symbol_var.h"

 #include "megbrain/utils/hashable.h"
-#include "megbrain/utils/enum_class_bit.h"
 #include "megbrain/utils/thin/hash_table.h"
 #include "megbrain/utils/small_vector.h"


--- a/src/core/include/megbrain/graph/var_node.h
+++ b/src/core/include/megbrain/graph/var_node.h
@@ -12,7 +12,6 @@
 #pragma once

 #include "megbrain/graph/bases.h"
-#include "megbrain/utils/enum_class_bit.h"
 #include "megbrain/utils/comp_node_sync_manager.h"
 #include "megbrain/utils/small_vector.h"
 #include "megbrain/utils/mempool.h"

--- a/src/core/include/megbrain/ir/base.td
+++ b/src/core/include/megbrain/ir/base.td
@@ -33,10 +33,11 @@ class MgbHashableAttrMixin {
  string reprFunction = "std::to_string($0)";
 }

-class MgbEnumAttrMixin<string namespace, string name, list<string> members, bit toString> {
+class MgbEnumAttrMixin<string namespace, string name, list<string> members, bit combined, bit toString> {
  string parentNamespace = namespace;
  string enumName = name;
  list<string> enumMembers = members;
+  bit enumCombined = combined;
  bit supportToString = toString;
 }

@@ -166,8 +167,8 @@ class MgbTupleAttr<list<MgbAttrWrapper> args>:
 }

 // -- enum types
-class MgbEnumAttr<string namespace, string enumName, list<string> members, bit toString=0>:
-    HashableAttr<namespace # "::" # enumName>, MgbEnumAttrMixin<namespace, enumName, members, toString> {
+class MgbEnumAttr<string namespace, string enumName, list<string> members, bit combined, bit toString=0>:
+    HashableAttr<namespace # "::" # enumName>, MgbEnumAttrMixin<namespace, enumName, members, combined, toString> {
  let storageType = "::mlir::IntegerAttr";
  let convertFromStorage = "static_cast<" # returnType # ">($_self.getInt())";
  let constBuilderCall = "$_builder.getI32IntegerAttr(static_cast<int32_t>($0))";
@@ -176,7 +177,7 @@ class MgbEnumAttr<string namespace, string enumName, list<string> members, bit t
 }

 class MgbEnumAliasAttr<string namespace, string enumName, MgbEnumAttr base>:
-    MgbEnumAttr<namespace, enumName, base.enumMembers>, MgbAliasAttrMixin<base>;
+    MgbEnumAttr<namespace, enumName, base.enumMembers, 0>, MgbAliasAttrMixin<base>;

 // -- other types
 def MgbDTypeAttr: HashableAttr<"::megdnn::DType"> {

--- a/src/core/include/megbrain/utils/enum_class_bit.h
+++ b/src/core/include/megbrain/utils/enum_class_bit.h
-/**
- * \file src/core/include/megbrain/utils/enum_class_bit.h
- * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
- *
- * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- */
-
-#pragma once
-
-#include <type_traits>
-
-namespace mgb {
-    template<typename T>
-    class EnumClassBit {
-        std::underlying_type_t<T> m_val;
-
-        constexpr EnumClassBit(std::underlying_type_t<T> v):
-            m_val(v)
-        {
-        }
-
-        public:
-            constexpr EnumClassBit(T v):
-                m_val(static_cast<std::underlying_type_t<T>>(v))
-            {
-            }
-
-            constexpr operator T() const {
-                return static_cast<T>(m_val);
-            }
-
-            constexpr explicit operator bool() const {
-                return m_val;
-            }
-
-#define DEF_OPR(op) \
-            constexpr EnumClassBit operator op (\
-                    const EnumClassBit &rhs) const { \
-                return m_val op rhs.m_val; \
-            }
-
-            DEF_OPR(&)
-            DEF_OPR(|)
-            DEF_OPR(^)
-
-            constexpr EnumClassBit operator ~() const {
-                return ~m_val;
-            }
-
-
-#undef DEF_OPR
-    };
-
-}
-
-#define _MGB_DECBO_SINGLE_OPR(cls, op) \
-     inline constexpr ::mgb::EnumClassBit<cls> operator op (cls x, cls y) { \
-         return ::mgb::EnumClassBit<cls>(x) op ::mgb::EnumClassBit<cls>(y); \
-     } \
-     inline constexpr ::mgb::EnumClassBit<cls> operator op ( \
-             ::mgb::EnumClassBit<cls> x, cls y) { \
-         return x op ::mgb::EnumClassBit<cls>(y); \
-     }
-
-#define _MGB_DECBO_SINGLE_OPR_ASSIGN(cls, op) \
-     inline constexpr cls& operator op##= (cls& x, cls y) { \
-         x = x op ::mgb::EnumClassBit<cls>(y); \
-         return x; \
-     }
-
-#define MGB_DEF_ENUM_CLASS_BIT_OPR(cls) \
-    _MGB_DECBO_SINGLE_OPR(cls, &) \
-    _MGB_DECBO_SINGLE_OPR(cls, |) \
-    _MGB_DECBO_SINGLE_OPR(cls, ^) \
-    _MGB_DECBO_SINGLE_OPR_ASSIGN(cls, &) \
-    _MGB_DECBO_SINGLE_OPR_ASSIGN(cls, |) \
-    _MGB_DECBO_SINGLE_OPR_ASSIGN(cls, ^) \
-    inline constexpr ::mgb::EnumClassBit<cls> operator ~ (cls x) { \
-        return ~::mgb::EnumClassBit<cls>(x); \
-    } \
-
-
-
-// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
-
--- a/src/core/include/megbrain/utils/persistent_cache.h
+++ b/src/core/include/megbrain/utils/persistent_cache.h
@@ -100,8 +100,7 @@ namespace mgb {

            struct ResultEntry {
                std::string algo;   //! identifier of the algorithm
-                //! sscanf will up bool as int
-                int reproducible;  //! whether algorithm is reproducible
+                uint32_t attribute;  //! algo attribute, e.g. reproducible
                double time;        //! execution time in seconds
                size_t workspace;   //! workspace in bytes
            };

--- a/src/gopt/impl/inference.cpp
+++ b/src/gopt/impl/inference.cpp
@@ -54,7 +54,6 @@ using namespace gopt;

 namespace {

-
 template <typename SharedDeviceTensor, typename MultipleDeviceTensorHolder>
 void param_merge(OptState& opt_state) {
    auto rewriter = opt_state.graph().make_rewriter();
@@ -102,7 +101,7 @@ void param_merge(OptState& opt_state) {
    rewriter.apply_inplace();
 }

-}
+}  // namespace

 /* ================ global functions ================ */

@@ -190,12 +189,10 @@ void gopt::enable_opr_algo_profiling_inplace(

 void gopt::enable_opr_use_profiling_cache_inplace(
        const VarNodeArrayView& dest_vars) {
-    modify_opr_algo_strategy_inplace(
-            dest_vars, opr::mixin::AlgoChooserHelper::ExecutionPolicy::
-                               Strategy::PROFILE_HEURISTIC);
+    using S = megdnn::param::ExecutionPolicy::Strategy;
+    modify_opr_algo_strategy_inplace(dest_vars, S::PROFILE | S::HEURISTIC);
 }

-
 void gopt::set_opr_algo_workspace_limit_inplace(
        const VarNodeArrayView& dest_vars, size_t workspace_limit) {
    static const ThinHashMap<Typeinfo*, void (*)(OperatorNodeBase&, size_t)>

--- a/src/gopt/test/inference.cpp
+++ b/src/gopt/test/inference.cpp
@@ -1693,7 +1693,22 @@ TEST(TestGoptInference, ProfileCache) {
    using S = opr::Convolution::ExecutionPolicy::Strategy;
    ASSERT_EQ(S::HEURISTIC, conv.execution_policy_transient().strategy);
    gopt::enable_opr_use_profiling_cache_inplace({z + 2.3f});
-    ASSERT_EQ(S::PROFILE_HEURISTIC, conv.execution_policy().strategy);
+    ASSERT_EQ(S::PROFILE | S::HEURISTIC, conv.execution_policy().strategy);
+}
+
+TEST(TestGoptInference, FastProfileCache) {
+    HostTensorGenerator<> gen;
+    auto graph = ComputingGraph::make();
+    auto host_x = gen({4, 3, 8, 9}), host_y = gen({2, 3, 3, 3});
+    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
+         y = opr::Host2DeviceCopy::make(*graph, host_y),
+         z = opr::Convolution::make(x, y);
+    auto&& conv = z.node()->owner_opr()->cast_final_safe<opr::Convolution>();
+    using S = opr::Convolution::ExecutionPolicy::Strategy;
+    ASSERT_EQ(S::HEURISTIC, conv.execution_policy_transient().strategy);
+    gopt::modify_opr_algo_strategy_inplace({z + 2.3f},
+                                           S::PROFILE | S::OPTMIZED);
+    ASSERT_EQ(S::PROFILE | S::OPTMIZED, conv.execution_policy().strategy);
 }

 TEST(TestGoptInference, AlgoWorkspaceLimit) {

--- a/src/opr/impl/dnn/dnn.sereg.h
+++ b/src/opr/impl/dnn/dnn.sereg.h
@@ -20,7 +20,6 @@
 #include "megbrain/opr/dnn/lrn.h"
 #include "megbrain/opr/dnn/fake_quant.h"
 #include "megbrain/opr/dnn/tqt.h"
-
 #include "megbrain/serialization/sereg.h"
 #include "megdnn/opr_param_defs.h"
 #include "megdnn/oprs/nn.h"

--- a/src/opr/impl/search_policy/algo_chooser.cpp
+++ b/src/opr/impl/search_policy/algo_chooser.cpp
@@ -284,8 +284,9 @@ namespace mgb {
 namespace opr {

 template <typename Opr>
-void AlgoChooser<Opr>::profile(ExeContext& ctx, bool require_reproducible) {
-    if (ctx.get_profile_result_from_cache(require_reproducible).valid())
+void AlgoChooser<Opr>::profile(ExeContext& ctx,
+                               ExecutionStrategy select_strategy) {
+    if (ctx.get_profile_result_from_cache(select_strategy).valid())
        return;
    AlgoChooserProfileCache::Result prof_rst;

@@ -305,7 +306,7 @@ void AlgoChooser<Opr>::profile(ExeContext& ctx, bool require_reproducible) {
                                   algo.name.c_str(), str_on_inp_shape.c_str());
        ImplExecutionPolicy policy;
        policy.algo = algo.desc;
-        ctx.construct_execution_policy(require_reproducible, policy);
+        ctx.construct_execution_policy(select_strategy, policy);
        if (ctx.get_workspace_size_bytes(policy) >= workspace_limit)
            continue;

@@ -354,7 +355,8 @@ void AlgoChooser<Opr>::profile(ExeContext& ctx, bool require_reproducible) {

 template <typename Opr>
 typename AlgoChooser<Opr>::ImplExecutionPolicy
-AlgoChooser<Opr>::choose_by_profile(ExeContext& ctx, bool require_reproducible,
+AlgoChooser<Opr>::choose_by_profile(ExeContext& ctx,
+                                    ExecutionStrategy select_strategy,
                                    bool enable_update) {
    MIDOUT_B(Opr, midout_iv(MGB_HASH_STR("AlgoChooser::choose_by_profile")))
    if (ctx.owner_graph()->options().no_profiling_on_shape_change) {
@@ -376,11 +378,11 @@ AlgoChooser<Opr>::choose_by_profile(ExeContext& ctx, bool require_reproducible,
                    to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(),
                    _item.param, ctx.mgb_opr(), ctx.comp_node(),
                    ctx.execution_policy(), ctx.allow_weight_preprocess());
-            AlgoChooser<_Opr>::profile(sub_ctx, require_reproducible);
+            AlgoChooser<_Opr>::profile(sub_ctx, select_strategy);
        });
    }
    typename AlgoChooser<Opr>::ImplExecutionPolicy policy;
-    ctx.construct_execution_policy(require_reproducible, policy);
+    ctx.construct_execution_policy(select_strategy, policy);
    return policy;
    MIDOUT_E
 }
@@ -402,11 +404,9 @@ size_t AlgoChooser<Opr>::setup_algo(const FixedTensorLayouts& layouts,
    ImplExecutionPolicy policy;
    if (auto algo_choose_hook = mgb_opr->algo_chooser()) {
        policy = algo_choose_hook(mgb_opr);
-        ctx.construct_execution_policy(
-                mgb_opr->execution_policy().strategy ==
-                        mixin::AlgoChooserHelper::ExecutionPolicy::Strategy::
-                                HEURISTIC_REPRODUCIBLE,
-                policy, false);
+        ctx.construct_execution_policy((ExecutionStrategy::HEURISTIC |
+                                        ExecutionStrategy::REPRODUCIBLE),
+                                       policy, false);
    }
    if (!policy.algo.valid()) {
        policy = get_policy(ctx);
@@ -419,10 +419,9 @@ size_t AlgoChooser<Opr>::setup_algo(const FixedTensorLayouts& layouts,
    Algorithm* palgo = megdnn_opr->get_algorithm_from_desc(policy.algo);
    mgb_assert(palgo, "Unknown algo description");
    ret.append("): algo=" + std::string(palgo->name()));
-    ret.append(ssprintf(" workspace=%.2fMiB reproducible=%d",
+    ret.append(ssprintf(" workspace=%.2fMiB attribute=%u",
                        workspace / (1024 * 1024.0),
-                        palgo->contain_attribute(
-                                megdnn::AlgoAttribute::REPRODUCIBLE)));
+                        static_cast<uint32_t>(palgo->attribute())));
    mgb_log_debug("%s", ret.c_str());

    megdnn_opr->execution_policy() = policy;
@@ -432,41 +431,39 @@ size_t AlgoChooser<Opr>::setup_algo(const FixedTensorLayouts& layouts,
 template <typename Opr>
 typename AlgoChooser<Opr>::ImplExecutionPolicy AlgoChooser<Opr>::get_policy(
        ExeContext& ctx) {
-    using S = mixin::AlgoChooserHelper::ExecutionPolicy::Strategy;
    MGB_MARK_USED_VAR(TIMEOUT_TOLERANCE);
-    switch (ctx.execution_policy().strategy) {
-        case S::HEURISTIC:
-            return ctx.choose_by_heuristic();
-        case S::HEURISTIC_REPRODUCIBLE:
-            return ctx.choose_by_heuristic(true);
-        case S::PROFILE_HEURISTIC: {
-            ImplExecutionPolicy policy = choose_by_profile(ctx, false, false);
-            if (!policy.algo.valid())
-                policy = ctx.choose_by_heuristic();
-            return policy;
-        }
+    auto opr_strategy = ctx.execution_policy().strategy;  // strategy flags; dispatch below tests individual bits
+    if ((opr_strategy & ExecutionStrategy::HEURISTIC) &&
+               (opr_strategy & ExecutionStrategy::PROFILE)) {
+        ImplExecutionPolicy policy =
+                choose_by_profile(ctx, opr_strategy, false);  // third argument is enable_update
+        if (!policy.algo.valid())  // no valid algo came back: fall back to heuristic
+            policy = ctx.choose_by_heuristic(opr_strategy);
+        return policy;
+    } else if ((opr_strategy & ExecutionStrategy::HEURISTIC)) {  // HEURISTIC without PROFILE
+        return ctx.choose_by_heuristic(opr_strategy);
+    }
 #if MGB_ENABLE_FASTRUN
-        case S::PROFILE:
-            return choose_by_profile(ctx, false);
-        case S::PROFILE_REPRODUCIBLE:
-            return choose_by_profile(ctx, true);
+    else if (opr_strategy & ExecutionStrategy::PROFILE) {  // PROFILE without HEURISTIC; compiled only with fastrun
+        return choose_by_profile(ctx, opr_strategy);
+    }
 #endif
-        default:
-            mgb_throw(GraphError, "bad convolution ExecutionPolicy strategy");
+    else {  // no branch above matched (includes PROFILE-only when fastrun is compiled out)
+        mgb_throw(GraphError, "bad convolution ExecutionPolicy strategy");
    }
 }

-#define INST(Opr)                                                            \
-    template AlgoChooser<megdnn::Opr>::ImplExecutionPolicy                   \
-    AlgoChooser<megdnn::Opr>::get_policy(ExeContext& ctx);                   \
-    template void AlgoChooser<megdnn::Opr>::profile(                         \
-            ExeContext& ctx, bool require_reproducible);                     \
-    template AlgoChooser<megdnn::Opr>::ImplExecutionPolicy                   \
-    AlgoChooser<megdnn::Opr>::choose_by_profile(                             \
-            ExeContext& ctx, bool require_reproducible, bool enable_update); \
-    template size_t AlgoChooser<megdnn::Opr>::setup_algo(                    \
-            const FixedTensorLayouts& layouts, megdnn::Opr* megdnn_opr,      \
-            const MGBOpr* mgb_opr, bool allow_weight_preprocess);            \
+#define INST(Opr) /* explicit instantiations for megdnn::Opr */         \
+    template AlgoChooser<megdnn::Opr>::ImplExecutionPolicy              \
+    AlgoChooser<megdnn::Opr>::get_policy(ExeContext& ctx);              \
+    template void AlgoChooser<megdnn::Opr>::profile(ExeContext& ctx,    \
+                                                    ExecutionStrategy); \
+    template AlgoChooser<megdnn::Opr>::ImplExecutionPolicy              \
+    AlgoChooser<megdnn::Opr>::choose_by_profile(                        \
+            ExeContext& ctx, ExecutionStrategy, bool enable_update);    \
+    template size_t AlgoChooser<megdnn::Opr>::setup_algo(               \
+            const FixedTensorLayouts& layouts, megdnn::Opr* megdnn_opr, \
+            const MGBOpr* mgb_opr, bool allow_weight_preprocess);

 MGB_FOREACH_FASTRUN_OPR(INST)

@@ -498,7 +495,7 @@ AlgoChooser<Opr>::ExeContext::ExeContext(
 template <typename Opr>
 typename AlgoChooser<Opr>::ImplAlgo
 AlgoChooser<Opr>::ExeContext::get_profile_result_from_cache(
-        bool require_reproducible) const {
+        ExecutionStrategy select_strategy) const {
    MIDOUT_B(Opr,
             midout_iv(MGB_HASH_STR(
                     "AlgoChooser::ExeContext::get_profile_result_from_cache")))
@@ -522,7 +519,9 @@ AlgoChooser<Opr>::ExeContext::get_profile_result_from_cache(
    if (prof.empty())
        return {};
    for (auto&& i : prof) {
-        if ((!require_reproducible || i.reproducible)) {
+        if (!(select_strategy & ExecutionStrategy::REPRODUCIBLE) ||
+            static_cast<AlgoAttribute>(i.attribute) &
+                    AlgoAttribute::REPRODUCIBLE) {
            auto iter = algo_map.find(i.algo);
            mgb_assert(iter != algo_map.end(),
                       "algorithm %s exists in "
@@ -550,7 +549,8 @@ AlgoChooser<Opr>::ExeContext::get_profile_result_from_cache(

 template <typename Opr>
 typename AlgoChooser<Opr>::ImplExecutionPolicy
-AlgoChooser<Opr>::ExeContext::choose_by_heuristic(bool reproducible) const {
+AlgoChooser<Opr>::ExeContext::choose_by_heuristic(
+        ExecutionStrategy select_strategy) const {
    if (m_execution_policy.workspace_limit !=
        std::numeric_limits<decltype(
                m_execution_policy.workspace_limit)>::max()) {
@@ -558,6 +558,8 @@ AlgoChooser<Opr>::ExeContext::choose_by_heuristic(bool reproducible) const {
                "workspace_limit should not be setted if choose algo by "
                "heuristic");
    }
+    bool reproducible = static_cast<bool>(select_strategy &
+                                          ExecutionStrategy::REPRODUCIBLE);
    auto workspace_limit = WorkspaceLimitGetter::get_workspace_limit(
            owner_graph(), m_cn, m_execution_policy.workspace_limit);
    ImplExecutionPolicy policy;
@@ -579,7 +581,8 @@ AlgoChooser<Opr>::ExeContext::choose_by_heuristic(bool reproducible) const {
                to_fixed_layouts<_Opr>(_item.layouts), megdnn_opr.get(),
                _item.param, m_base_mgb_opr, m_cn, m_execution_policy,
                m_allow_weight_preprocess);
-        policy.sub_policy.push_back(sub_ctx.choose_by_heuristic(reproducible));
+        policy.sub_policy.push_back(
+                sub_ctx.choose_by_heuristic(select_strategy));
    });

    return policy;
@@ -588,9 +591,8 @@ AlgoChooser<Opr>::ExeContext::choose_by_heuristic(bool reproducible) const {
 template <typename Opr>
 std::vector<typename AlgoChooser<Opr>::ImplAlgo>
 AlgoChooser<Opr>::ExeContext::get_all_candidates() const {
-    auto heu = choose_by_heuristic();
-    auto&& ret =
-            APPLY(m_megdnn_opr->get_all_algorithms_info(args...), m_layouts);
+    auto heu = choose_by_heuristic(ExecutionStrategy::HEURISTIC);
+    auto&& ret = APPLY(m_megdnn_opr->get_all_algorithms_info(args...), m_layouts);
    bool found = false;
    for (size_t i = 0; i < ret.size(); ++i) {
        if (ret[i].desc == heu.algo) {
@@ -611,19 +613,21 @@ AlgoChooser<Opr>::ExeContext::get_all_candidates() const {

 template <typename Opr>
 void AlgoChooser<Opr>::ExeContext::construct_execution_policy(
-        bool require_reproducible,
+        ExecutionStrategy select_strategy,
        typename AlgoChooser<Opr>::ImplExecutionPolicy& policy,
        bool retrive_from_cache) const {
+    bool reproducible = static_cast<bool>(select_strategy &
+                                          ExecutionStrategy::REPRODUCIBLE);
    if (!policy.algo.valid()) {
        if (retrive_from_cache) {
            policy.algo =
-                    get_profile_result_from_cache(require_reproducible).desc;
+                    get_profile_result_from_cache(select_strategy).desc;
        } else {
            auto workspace_limit = WorkspaceLimitGetter::get_workspace_limit(
                    owner_graph(), m_cn, m_execution_policy.workspace_limit);
            policy.algo = APPLY(m_megdnn_opr->get_algorithm_info_heuristic(
                                        args..., workspace_limit,
-                                        require_reproducible),
+                                        reproducible),
                                m_layouts)
                                  .desc;
        }
@@ -647,7 +651,7 @@ void AlgoChooser<Opr>::ExeContext::construct_execution_policy(
                _item.param, m_base_mgb_opr, m_cn, m_execution_policy,
                m_allow_weight_preprocess);
        policy.sub_policy.push_back({});
-        sub_ctx.construct_execution_policy(require_reproducible,
+        sub_ctx.construct_execution_policy(select_strategy,
                                           policy.sub_policy.back(),
                                           retrive_from_cache);
    });
@@ -718,8 +722,7 @@ AlgoChooser<Opr>::ExeContext::profile_single_algo(
        return None;
    return AlgoChooserProfileCache::ResultEntry{
            palgo->name(),
-            palgo->contain_attribute(
-                    megdnn::AlgoAttribute::REPRODUCIBLE),
+            static_cast<uint32_t>(palgo->attribute()),
            rst.val().time, param.workspace};
 }

@@ -768,10 +771,10 @@ AlgoChooser<Opr>::ExeContext::construct_fake_preprocess_filter() const {
            bool allow_weight_preprocess);                                     \
    template typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy            \
    AlgoChooser<megdnn::Opr>::ExeContext::choose_by_heuristic(                 \
-            bool reproducible) const;                                          \
+            ExecutionStrategy select_strategy) const;                          \
    template typename AlgoChooser<megdnn::Opr>::ImplAlgo                       \
    AlgoChooser<megdnn::Opr>::ExeContext::get_profile_result_from_cache(       \
-            bool require_reproducible) const;                                  \
+            ExecutionStrategy select_strategy) const;                          \
    template std::vector<typename AlgoChooser<megdnn::Opr>::ImplAlgo>          \
    AlgoChooser<megdnn::Opr>::ExeContext::get_all_candidates() const;          \
    template size_t                                                            \
@@ -780,7 +783,7 @@ AlgoChooser<Opr>::ExeContext::construct_fake_preprocess_filter() const {
                    policy) const;                                             \
    template void                                                              \
    AlgoChooser<megdnn::Opr>::ExeContext::construct_execution_policy(          \
-            bool require_reproducible,                                         \
+            ExecutionStrategy select_strategy,                                 \
            typename AlgoChooser<megdnn::Opr>::ImplExecutionPolicy& policy,    \
            bool retrive_from_cache) const;                                    \
    template Maybe<AlgoChooserProfileCache::ResultEntry>                       \

--- a/src/opr/include/megbrain/opr/search_policy/algo_chooser.h
+++ b/src/opr/include/megbrain/opr/search_policy/algo_chooser.h
@@ -35,6 +35,13 @@ MGB_FOREACH_FASTRUN_OPR(cb)
 #undef cb

 namespace mgb {
+
+//! aliases for the algo-selection strategy flags, which callers combine with
+//! `|` and test with `&` (e.g. PROFILE | REPRODUCIBLE), and for megdnn::AlgoAttribute
+using ExecutionStrategy = megdnn::param::ExecutionPolicy::Strategy;
+
+using AlgoAttribute = megdnn::AlgoAttribute;
+
 namespace opr {

 /* =================== AlgoChooser =================== */
@@ -103,7 +110,7 @@ public:
        const FixedTensorLayouts& layouts() const { return m_layouts; }

        ImplExecutionPolicy choose_by_heuristic(
-                bool reproducible = false) const;
+                ExecutionStrategy select_strategy) const;

        //! get all candidate algos, and the one choose_by_heuristic() is
        //! put first
@@ -126,19 +133,20 @@ public:
                const ImplExecutionPolicy& policy, double& timeout) const;

        //! get all profile algorithm from cache, return invalid if not exists
-        ImplAlgo get_profile_result_from_cache(bool require_reproducible) const;
+        ImplAlgo get_profile_result_from_cache(
+                ExecutionStrategy select_strategy) const;

        /**
         * \brief construct execution policy from cache or heuristic.
         *
-         * \param require_reproducible select algo which is reproducible
+         * \param select_strategy select algo which matched this strategy
         * \param policy execution policy
         * \param retrive_from_cache retrive algo from cache if set True, get
         *     from heuristic otherwise.
         */
-        void construct_execution_policy(
-                bool require_reproducible, ImplExecutionPolicy& policy,
-                bool retrive_from_cache = true) const;
+        void construct_execution_policy(ExecutionStrategy select_strategy,
+                                        ImplExecutionPolicy& policy,
+                                        bool retrive_from_cache = true) const;

    private:
        Maybe<PreprocessFilter<Opr>> construct_fake_preprocess_filter() const;
@@ -153,11 +161,11 @@ private:


    //! profile and save to cache
-    static void profile(ExeContext& ctx, bool require_reproducible);
+    static void profile(ExeContext& ctx, ExecutionStrategy select_strategy);

-    static ImplExecutionPolicy choose_by_profile(ExeContext& ctx,
-                                                 bool require_reproducible,
-                                                 bool enable_update = true);
+    static ImplExecutionPolicy choose_by_profile(
+            ExeContext& ctx, ExecutionStrategy select_strategy,
+            bool enable_update = true);

 public:
    /*!

--- a/src/opr/include/megbrain/opr/search_policy/algo_chooser_helper.h
+++ b/src/opr/include/megbrain/opr/search_policy/algo_chooser_helper.h
@@ -13,7 +13,6 @@
 #pragma once

 #include "megbrain/graph/operator_node.h"
-#include "megbrain/opr/param_defs.h"
 #include "megdnn/oprs/base.h"
 #include "megdnn/oprs/nn.h"

@@ -73,7 +72,6 @@ protected:

 };
 }  // namespace mixin
-
 }  // namespace opr
 }  // namespace mgb


--- a/src/opr/test/blas.cpp
+++ b/src/opr/test/blas.cpp
@@ -429,10 +429,11 @@ TEST(TestOprDNN, MatrixMulExePolicy) {
    auto cn = CompNode::load("cpux");

 #if MGB_ENABLE_FASTRUN
-    for (auto strategy : {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE,
-                          S::PROFILE_HEURISTIC}) {
+    for (auto strategy :
+         SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
+                        S::PROFILE | S::HEURISTIC}) {
 #else
-    for (auto strategy: {S:HEURISTIC, S::PROFILE_HEURISTIC}) {
+    for (auto strategy : {S::HEURISTIC, S(S::PROFILE | S::HEURISTIC)}) {
 #endif

        auto graph = ComputingGraph::make();

--- a/src/opr/test/dnn/convolution.cpp
+++ b/src/opr/test/dnn/convolution.cpp
@@ -355,11 +355,13 @@ TEST(TestOprDNN, ConvBiasExePolicy) {
    auto cn = CompNode::load("cpux");

 #if MGB_ENABLE_FASTRUN
-    for (auto strategy: {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE, S::PROFILE_HEURISTIC}) {
+    for (auto strategy :
+         SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
+          S::PROFILE | S::HEURISTIC, S::PROFILE | S::OPTMIZED}) {
 #else
-    for (auto strategy: {S:HEURISTIC, S::PROFILE_HEURISTIC}) {
+    for (auto strategy :
+         SmallVector<S>{S::HEURISTIC, S::PROFILE | S::HEURISTIC}) {
 #endif
-
        auto graph = ComputingGraph::make();
        HostTensorGenerator<> gen;

@@ -397,7 +399,8 @@ TEST(TestOprDNN, ConvBiasExePolicy_Quantized8Asym) {

    auto cn = CompNode::load("cpux");

-    for (auto strategy: {S::PROFILE, S::PROFILE_REPRODUCIBLE}) {
+    for (auto strategy :
+         SmallVector<S>{S::PROFILE, S::PROFILE | S::REPRODUCIBLE}) {

        auto graph = ComputingGraph::make();
        HostTensorGenerator<> gen;
@@ -439,10 +442,12 @@ TEST(TestOprDNN, ConvolutionExePolicy) {
    PersistentCacheHook cache_hook{on_get};

 #if MGB_ENABLE_FASTRUN
-    for (auto strategy : {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE,
-                          S::PROFILE_HEURISTIC}) {
+    for (auto strategy :
+         SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
+          S::PROFILE | S::HEURISTIC, S::PROFILE | S::OPTMIZED}) {
 #else
-    for (auto strategy: {S:HEURISTIC, S::PROFILE_HEURISTIC}) {
+    for (auto strategy :
+         SmallVector<S>{S::HEURISTIC, S::PROFILE | S::HEURISTIC}) {
 #endif
        using Checker = AutoOprChecker<2, 1>;

@@ -522,10 +527,11 @@ TEST(TestOprDNN, ConvolutionBackwardDataBfloat16ExePolicy) {
    PersistentCacheHook cache_hook{on_get};

 #if MGB_ENABLE_FASTRUN
-    for (auto strategy : {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE,
-                          S::PROFILE_HEURISTIC}) {
+    for (auto strategy :
+         {S::PROFILE, S::HEURISTIC, S(S::PROFILE | S::REPRODUCIBLE),
+          S(S::PROFILE | S::HEURISTIC)}) {
 #else
-    for (auto strategy: {S:HEURISTIC, S::PROFILE_HEURISTIC}) {
+    for (auto strategy : {S::HEURISTIC, S(S::PROFILE | S::HEURISTIC)}) {
 #endif
        using Checker = AutoOprChecker<2, 1>;

@@ -1183,9 +1189,12 @@ TEST(TestOprDNN, Convolution3DExePolicy) {
    using S = Policy::Strategy;

 #if MGB_ENABLE_FASTRUN
-    for (auto strategy: {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE, S::PROFILE_HEURISTIC}) {
+    for (auto strategy :
+         SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
+          S::PROFILE | S::HEURISTIC}) {
 #else
-    for (auto strategy: {S:HEURISTIC, S::PROFILE_HEURISTIC}) {
+    for (auto strategy :
+         SmallVector<S>{S::HEURISTIC, S::PROFILE | S::HEURISTIC}) {
 #endif

        using Checker = AutoOprChecker<2, 1>;
@@ -1660,10 +1669,12 @@ TEST(TestOprDNN, LocalShareForwardExecPolicy) {
    PersistentCacheHook cache_hook{on_get};

 #if MGB_ENABLE_FASTRUN
-    for (auto strategy : {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE,
-                          S::PROFILE_HEURISTIC}) {
+    for (auto strategy :
+         SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
+          S::PROFILE | S::HEURISTIC, S::PROFILE | S::OPTMIZED}) {
 #else
-    for (auto strategy: {S:HEURISTIC, S::PROFILE_HEURISTIC}) {
+    for (auto strategy :
+         SmallVector<S>{S::HEURISTIC, S::PROFILE | S::HEURISTIC}) {
 #endif
        auto make_graph = [&](const Checker::SymInpArray& inputs)
                -> Checker::SymOutArray {
@@ -1769,10 +1780,12 @@ TEST(TestOprDNN, DeformableConvForward) {
    Param param;

 #if MGB_ENABLE_FASTRUN
-    for (auto strategy : {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE,
-                          S::PROFILE_HEURISTIC}) {
+    for (auto strategy :
+         SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
+          S::PROFILE | S::HEURISTIC, S::PROFILE | S::OPTMIZED}) {
 #else
-    for (auto strategy : {S : HEURISTIC, S::PROFILE_HEURISTIC}) {
+    for (auto strategy :
+         SmallVector<S>{S::HEURISTIC, S::PROFILE | S::HEURISTIC}) {
 #endif
        auto make_graph = [&](const Checker::SymInpArray& inputs)
                -> Checker::SymOutArray {
@@ -1936,10 +1949,12 @@ TEST(TestOprDNN, BatchConvBiasForward) {
    param.sparse = Param::Sparse::DENSE;

 #if MGB_ENABLE_FASTRUN
-    for (auto strategy : {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE,
-                          S::PROFILE_HEURISTIC}) {
+    for (auto strategy :
+         SmallVector<S>{S::PROFILE, S::HEURISTIC, S::PROFILE | S::REPRODUCIBLE,
+          S::PROFILE | S::HEURISTIC, S::PROFILE | S::OPTMIZED}) {
 #else
-    for (auto strategy : {S : HEURISTIC, S::PROFILE_HEURISTIC}) {
+    for (auto strategy :
+         SmallVector<S>{S::HEURISTIC, S::PROFILE | S::HEURISTIC}) {
 #endif

        auto make_quantized = [&](SymbolVar x, const DType& dtype) {
@@ -2080,7 +2095,8 @@ TEST(TestOprDNN, HeuristicReproducible) {

    constexpr size_t PH = 1, PW = 1, SH = 1, SW = 1;

-    for (auto strategy : {S::HEURISTIC, S::HEURISTIC_REPRODUCIBLE}) {
+    for (auto strategy :
+         SmallVector<S>{S::HEURISTIC, S::HEURISTIC | S::REPRODUCIBLE}) {
        VarNode* bwd_flt;
        auto make_graph = [&](const Checker::SymInpArray& inputs)
                -> Checker::SymOutArray {
@@ -2126,7 +2142,7 @@ TEST(TestOprDNN, HeuristicReproducible) {
            megdnn::Algorithm* palgo =
                    megdnn_opr->get_algorithm_from_desc(algo);
            mgb_assert(palgo, "Unknown algo description");
-            if (strategy == S::HEURISTIC_REPRODUCIBLE) {
+            if (strategy == S(S::HEURISTIC | S::REPRODUCIBLE)) {
                EXPECT_TRUE(palgo->contain_attribute(
                            megdnn::AlgoAttribute::REPRODUCIBLE));
            }

--- a/test/src/include/megbrain/test/helper.h
+++ b/test/src/include/megbrain/test/helper.h
@@ -43,6 +43,7 @@ namespace megdnn {
            std::ostream &ostr, const DType &dt) {
        return ostr << dt.name();
    }
+
 } // namespace megdnn

 namespace mgb {

--- a/tools/param_defs/mgb_opr_param_defs.py
+++ b/tools/param_defs/mgb_opr_param_defs.py
@@ -18,7 +18,7 @@ pdef('PersistentOutputStorage').add_fields(
 add_const('int32', 'INVALID_AXIS', 'MAX_NDIM').
 add_fields('int32', 'axis', 'INVALID_AXIS'))

-(pdef('ExecutionPolicy', 'specify how to select an algorithm for an operator').
+(pdef('ExecutionPolicy', version=0, is_legacy=True).
 add_enum('Strategy',
          Doc('HEURISTIC', 'use heuristic to choose the fastest algorithm'),
          Doc('HEURISTIC_REPRODUCIBLE', 'use heuristic to choose the fastest algorithm, '
@@ -33,6 +33,20 @@ pdef('PersistentOutputStorage').add_fields(
            Doc('workspace_limit', 'workspace limit in bytes'),
            str(2**64-1)+'ull'))

+(pdef('ExecutionPolicy', 'specify how to select an algorithm for an operator', version=1).
+ add_bit_combination_enum('Strategy',
+          Doc('HEURISTIC', 'use heuristic to choose the fastest algorithm'),
+          Doc('PROFILE',
+              'run possible algorithms on real device to find the best'),
+          Doc('REPRODUCIBLE',
+              'when profile or heuristic algo selection it require the algos'
+              'must be reproducible'),
+          Doc('OPTMIZED',
+              'profile require algos are optmized to achieve fast-profile')).
+ add_fields('uint64',
+            Doc('workspace_limit', 'workspace limit in bytes'),
+            str(2**64-1)+'ull'))
+
 (pdef('AssertEqual').
 add_fields('float32',
            Doc('maxerr', 'max allowed error; error is defined as the minimal '