diff --git a/src/core/impl/utils/infile_persistent_cache.cpp b/src/core/impl/utils/infile_persistent_cache.cpp
index 5171ff7898b8a6533905e18aa81c397423ed1f7a..bd5df4806c6d0c112d52175c43555dab66ea575b 100644
--- a/src/core/impl/utils/infile_persistent_cache.cpp
+++ b/src/core/impl/utils/infile_persistent_cache.cpp
@@ -13,7 +13,7 @@
 #if defined(_WIN32)
 #include <io.h>
-#define F_OK 0
+#define F_OK 0
 #define access(a, b) _access(a, b)
 #elif __linux__ || __unix__ || __APPLE__
 #include <unistd.h>
@@ -32,8 +32,9 @@ public:

     template <typename T>
     void read(T& val) {
-        static_assert(std::is_trivially_copyable<T>::value,
-                      "only support trivially copyable type");
+        static_assert(
+                std::is_trivially_copyable<T>::value,
+                "only support trivially copyable type");
         mgb_assert(m_offset + sizeof(T) <= m_size);
         memcpy(&val, m_ptr, sizeof(T));
         m_offset += sizeof(T);
@@ -42,8 +43,9 @@ public:

     template <typename T>
     void read(T* buf, size_t size) {
-        static_assert(std::is_trivially_copyable<T>::value && sizeof(T) == 1,
-                      "only support read bytes");
+        static_assert(
+                std::is_trivially_copyable<T>::value && sizeof(T) == 1,
+                "only support read bytes");
         mgb_assert(m_offset + size <= m_size);
         memcpy(buf, m_ptr, size);
         m_offset += size;
@@ -67,20 +69,21 @@ public:

     template <typename T>
     void read(T& val) {
-        static_assert(std::is_trivially_copyable<T>::value,
-                      "only support trivially copyable type");
+        static_assert(
+                std::is_trivially_copyable<T>::value,
+                "only support trivially copyable type");
         auto ret = fread(&val, sizeof(T), 1, m_fp);
         mgb_assert(ret == 1);
     }

     template <typename T>
     void read(T* buf, size_t size) {
-        static_assert(std::is_trivially_copyable<T>::value && sizeof(T) == 1,
-                      "only support read bytes");
+        static_assert(
+                std::is_trivially_copyable<T>::value && sizeof(T) == 1,
+                "only support read bytes");
         auto ret = fread(buf, size, 1, m_fp);
         mgb_assert(ret == 1);
     }
-
 };

 //////////////////////// InFilePersistentCache::OutputFile ///////////////
@@ -114,8 +117,8 @@ public:

 //////////////////////// InFilePersistentCache::BlobStorage ///////////////
 template <typename Input>
-InFilePersistentCache::BlobStorage&
-InFilePersistentCache::BlobStorage::init_from_input(Input& inp) {
+InFilePersistentCache::BlobStorage& InFilePersistentCache::BlobStorage::init_from_input(
+        Input& inp) {
     uint32_t data_size;
     inp.read(data_size);
     size = data_size;
@@ -125,15 +128,14 @@ InFilePersistentCache::BlobStorage::init_from_input(Input& inp) {
     return *this;
 }

-void InFilePersistentCache::BlobStorage::write_to_file(
-        OutputFile& out_file) const {
+void InFilePersistentCache::BlobStorage::write_to_file(OutputFile& out_file) const {
     uint32_t u_size = size;
     out_file.write(u_size);
     out_file.write(data_refhold.get(), u_size);
 }

-InFilePersistentCache::BlobStorage&
-InFilePersistentCache::BlobStorage::init_data_ref(const Blob& b) {
+InFilePersistentCache::BlobStorage& InFilePersistentCache::BlobStorage::init_data_ref(
+        const Blob& b) {
     data_refhold = std::make_unique<uint8_t[]>(b.size + 1);
     memcpy(data_refhold.get(), b.ptr, b.size);
     data_refhold.get()[b.size] = 0;  // for C-string safety
@@ -227,8 +229,8 @@ Maybe<PersistentCache::Blob> InFilePersistentCache::get(
     return iter1->second;
 }

-void InFilePersistentCache::put(const std::string& category, const Blob& key,
-                                const Blob& value) {
+void InFilePersistentCache::put(
+        const std::string& category, const Blob& key, const Blob& value) {
     BlobStorage key_storage;
     key_storage.init_data_ref(key).init_hash();

diff --git a/src/core/include/megbrain/utils/infile_persistent_cache.h b/src/core/include/megbrain/utils/infile_persistent_cache.h
index 28b28bef02bb5de0ac83ef7cc07c91dc8179ff87..58f069d4d24dc49eecdb3936da76ae3aa6f2af87 100644
--- a/src/core/include/megbrain/utils/infile_persistent_cache.h
+++ b/src/core/include/megbrain/utils/infile_persistent_cache.h
@@ -49,13 +49,15 @@ class InFilePersistentCache final : public PersistentCache {
             size_t operator()(const BlobStorage& b) const { return b.hash; }
         };
     };
-    std::unordered_map<std::string,
-                       std::unordered_map<BlobStorage, BlobStorage, BlobStorage::Hash>>
+    std::unordered_map<
+            std::string,
+            std::unordered_map<BlobStorage, BlobStorage, BlobStorage::Hash>>
             m_cache;
     MGB_MUTEX m_mtx;

     template <typename Input>
     void read_cache(Input& inp);
+
 public:
     InFilePersistentCache() = default;
     InFilePersistentCache(const char* path);
@@ -68,8 +70,7 @@ public:
     void dump_cache(const char* path);

     Maybe<Blob> get(const std::string& category, const Blob& key) override;
-    void put(const std::string& category, const Blob& key,
-             const Blob& value) override;
+    void put(const std::string& category, const Blob& key, const Blob& value) override;
     bool support_dump_cache() override { return true; }
 };
 }  // namespace mgb
diff --git a/src/core/include/megbrain/utils/persistent_cache.h b/src/core/include/megbrain/utils/persistent_cache.h
index 8c881aea7a2bd1093d5102d8124b5a746bd846dd..01dc093beffac40aafd636d3cb72cad55ab05ecb 100644
--- a/src/core/include/megbrain/utils/persistent_cache.h
+++ b/src/core/include/megbrain/utils/persistent_cache.h
@@ -40,7 +40,7 @@ public:
             const std::string& category, const Blob& key, const Blob& value) = 0;

     virtual bool support_dump_cache() { return false; }
-
+
     //! set an implementation; return the original implementation
     static std::shared_ptr<PersistentCache> set_impl(
             std::shared_ptr<PersistentCache> impl);
diff --git a/src/gopt/impl/global_layout_transform/opr_safe_dump.cpp b/src/gopt/impl/global_layout_transform/opr_safe_dump.cpp
index 28c187918068b72a0819b8b6903b794bee00adef..5cf07b3c45c475ffae952162cb51c116cb528a2d 100644
--- a/src/gopt/impl/global_layout_transform/opr_safe_dump.cpp
+++ b/src/gopt/impl/global_layout_transform/opr_safe_dump.cpp
@@ -18,6 +18,7 @@
 #include "megbrain/opr/nn_int.h"
 #include "megbrain/opr/tensor_manip.h"
+#include "megbrain/utils/hash_ct.h"

 #include "midout.h"
 MIDOUT_DECL(megbrain_opr_safe_dump)
 #define MIDOUT_B(...)
MIDOUT_BEGIN(megbrain_opr_safe_dump, __VA_ARGS__) { @@ -38,24 +39,34 @@ template <> void write_param(std::string& /* data */, const DType& /* dtype */) {} template -struct OprDumpImpl { - static std::string dump(const cg::OperatorNodeBase* opr_) { - MIDOUT_B(Opr) - auto&& opr = opr_->cast_final_safe(); - std::string data; - write_param(data, opr.param()); - return data; - MIDOUT_E - } -}; +struct OprDumpImpl; -#define INST(_Opr) \ +#define cb(_Opr) \ + template <> \ + struct OprDumpImpl<_Opr> { \ + static std::string dump(const cg::OperatorNodeBase* opr_) { \ + MIDOUT_B(_Opr) \ + auto&& opr = opr_->cast_final_safe<_Opr>(); \ + std::string data; \ + auto opr_hash = MGB_HASH_STR(#_Opr); \ + write_param(data, opr_hash); \ + write_param(data, opr.param()); \ + return data; \ + MIDOUT_E \ + } \ + }; +FOREACH_SUPPORTED_OPR_WITHOUT_EXECUTION_POLICY(cb) +#undef cb + +#define cb(_Opr) \ template <> \ struct OprDumpImpl<_Opr> { \ static std::string dump(const cg::OperatorNodeBase* opr_) { \ MIDOUT_B(_Opr) \ auto&& opr = opr_->cast_final_safe<_Opr>(); \ std::string data; \ + auto opr_hash = MGB_HASH_STR(#_Opr); \ + write_param(data, opr_hash); \ write_param(data, opr.param()); \ using ExecutionPolicy = megdnn::param::ExecutionPolicy; \ ExecutionPolicy policy{ \ @@ -66,11 +77,8 @@ struct OprDumpImpl { MIDOUT_E \ } \ }; -INST(Convolution); -INST(ConvBiasForward); -INST(ConvolutionBackwardData); -INST(PoolingForward); -#undef INST +FOREACH_SUPPORTED_OPR_WITH_EXECUTION_POLICY(cb) +#undef cb } // namespace namespace mgb { @@ -83,8 +91,9 @@ std::string opr_safe_dump(const cg::OperatorNodeBase* opr) { return OprDumpImpl<_Opr>::dump(opr); \ } else FOREACH_SUPPORTED_OPR(cb) { - mgb_throw(InternalError, "unsupported operator(got:%s)", - opr->dyn_typeinfo()->name); + mgb_throw( + InternalError, "unsupported operator(got:%s)", + opr->dyn_typeinfo()->name); } #undef cb } diff --git a/src/gopt/impl/global_layout_transform/opr_safe_dump.h b/src/gopt/impl/global_layout_transform/opr_safe_dump.h index d25be51d23bd8bf889165eea79eff9d06be8b930..5a178de8b99f271c2d4b98920478d798a47aa8f6 100644 --- a/src/gopt/impl/global_layout_transform/opr_safe_dump.h +++ b/src/gopt/impl/global_layout_transform/opr_safe_dump.h @@ -16,10 +16,16 @@ namespace mgb { namespace gopt { namespace intl { -#define FOREACH_SUPPORTED_OPR(cb) \ - cb(Convolution) cb(ConvBiasForward) cb(ConvolutionBackwardData) \ - cb(PoolingForward) cb(WarpPerspective) cb(Resize) cb(Elemwise) \ - cb(ElemwiseMultiType) cb(Concat) cb(PowC) cb(TypeCvt) +#define FOREACH_SUPPORTED_OPR_WITHOUT_EXECUTION_POLICY(cb) \ + cb(WarpPerspective) cb(Resize) cb(Elemwise) cb(ElemwiseMultiType) cb(Concat) \ + cb(PowC) cb(TypeCvt) + +#define FOREACH_SUPPORTED_OPR_WITH_EXECUTION_POLICY(cb) \ + cb(Convolution) cb(ConvBiasForward) cb(ConvolutionBackwardData) cb(PoolingForward) + +#define FOREACH_SUPPORTED_OPR(cb) \ + FOREACH_SUPPORTED_OPR_WITHOUT_EXECUTION_POLICY(cb) \ + FOREACH_SUPPORTED_OPR_WITH_EXECUTION_POLICY(cb) std::string opr_safe_dump(const cg::OperatorNodeBase* opr); diff --git a/src/gopt/impl/global_layout_transform/profiler_cache.cpp b/src/gopt/impl/global_layout_transform/profiler_cache.cpp index cfa703400a73ceadbcdad9d9139beb4b6e066066..b606c99db510340ad1f25c567becd6f96be5303b 100644 --- a/src/gopt/impl/global_layout_transform/profiler_cache.cpp +++ b/src/gopt/impl/global_layout_transform/profiler_cache.cpp @@ -11,8 +11,8 @@ */ #include "./opr_safe_dump.h" -#include "megbrain/gopt/profiler.h" #include "megbrain/comp_node_env.h" +#include "megbrain/gopt/profiler.h" using 
namespace mgb; using namespace gopt; @@ -21,9 +21,6 @@ using ReformatKey = ReformatManager::ReformatKey; // =================== ProfilerCache ====================== void ProfilerCache::Key::build_blob_from_opr() { auto&& opr = m_key_impl.opr_key.opr; - // process opr type - auto type = opr->dyn_typeinfo()->name; - size_t type_size = strlen(type); // process opr param auto data = intl::opr_safe_dump(opr); @@ -32,11 +29,7 @@ void ProfilerCache::Key::build_blob_from_opr() { size_t nr_inputs = opr->input().size(); size_t nr_outputs = opr->usable_output().size(); size_t nr_layouts = nr_inputs + nr_outputs; - m_blob_storage.reserve(sizeof(TensorLayout) * 3 * nr_layouts + type_size + - param_size); - - // serialize opr type - m_blob_storage.append(type, type_size); + m_blob_storage.reserve(sizeof(TensorLayout) * 3 * nr_layouts + param_size); // serialize param const char* data_ptr = reinterpret_cast(data.data()); @@ -70,12 +63,12 @@ void ProfilerCache::Key::build_blob_from_opr() { } // serialize opr_format - m_blob_storage.append(std::to_string( - static_cast(m_key_impl.opr_key.opr_format))); + m_blob_storage.append( + std::to_string(static_cast(m_key_impl.opr_key.opr_format))); // serialize extra_attribute - m_blob_storage.append(std::to_string( - static_cast(m_key_impl.opr_key.extra_attribute))); + m_blob_storage.append( + std::to_string(static_cast(m_key_impl.opr_key.extra_attribute))); } void ProfilerCache::Key::build_category(CompNode cn) { @@ -85,8 +78,8 @@ void ProfilerCache::Key::build_category(CompNode cn) { #if MGB_CUDA case CompNode::DeviceType::CUDA: { auto&& prop = env.cuda_env().device_prop; - m_category += ssprintf("plat=cuda;dev=%s;cap=%d.%d", prop.name, - prop.major, prop.minor); + m_category += ssprintf( + "plat=cuda;dev=%s;cap=%d.%d", prop.name, prop.major, prop.minor); break; } #endif @@ -94,9 +87,10 @@ void ProfilerCache::Key::build_category(CompNode cn) { m_category += "plat=cpu"; break; default: - mgb_throw(MegBrainError, - "unsupported comp node for global layout transform " - "profiler cache category"); + mgb_throw( + MegBrainError, + "unsupported comp node for global layout transform " + "profiler cache category"); } } @@ -151,9 +145,10 @@ ProfilerCache& ProfilerCache::set_impl(std::unique_ptr impl) { } void ProfilerCache::dump_cache(const char* path) { - mgb_assert(m_impl->support_dump_cache(), - "current impl of ProfilerCache does not support dump cache to " - "file."); + mgb_assert( + m_impl->support_dump_cache(), + "current impl of ProfilerCache does not support dump cache to " + "file."); auto cache = static_cast(m_impl.get()); cache->dump_cache(path); } @@ -165,8 +160,9 @@ Maybe ProfilerCache::get(const Key& key) { // data type of cost is float auto buf = static_cast(raw_buf->ptr); auto size = raw_buf->size; - mgb_assert(buf && size == sizeof(float), - "ProfileCache invalid value: ptr=%p, size=%zu", buf, size); + mgb_assert( + buf && size == sizeof(float), + "ProfileCache invalid value: ptr=%p, size=%zu", buf, size); auto read_f32 = [&]() { auto ret = *reinterpret_cast(buf); return ret; diff --git a/src/gopt/impl/global_layout_transform/profiler_impl.cpp b/src/gopt/impl/global_layout_transform/profiler_impl.cpp index 99a58507084bfc3b514b4a594b302aa539fbeac6..0ea951e9027967679b6ab4c77caf118a4b5272e9 100644 --- a/src/gopt/impl/global_layout_transform/profiler_impl.cpp +++ b/src/gopt/impl/global_layout_transform/profiler_impl.cpp @@ -154,33 +154,30 @@ void MarkInputContiguous::init_output_static_infer_desc() { } // namespace /* ================== ProfilerImpl 
=================*/ -ProfilerImpl::ProfilerImpl(int runs, float opr_threshold, - float var_node_threshold) +ProfilerImpl::ProfilerImpl(int runs, float opr_threshold, float var_node_threshold) : m_opr_threshold{opr_threshold}, m_var_node_threshold{var_node_threshold}, m_runs{runs} { - m_opr_filter = [this](const OperatorNodeBase* opr, - OperatorNodeBase* new_opr) { + m_opr_filter = [this](const OperatorNodeBase* opr, OperatorNodeBase* new_opr) { /// \note: for the considerations of performance, we skip nchw(naive) /// kernels for conv bias on CUDA platform. to remove this later if (auto conv = try_cast_as_op(new_opr)) { if (conv->output(0)->comp_node().device_type() == CompNode::DeviceType::CUDA && - conv->input(0)->dtype().category() == - DTypeCategory::QUANTIZED && + conv->input(0)->dtype().category() == DTypeCategory::QUANTIZED && conv->param().format == OprFormat::NCHW) { return false; } } - float comp1 = m_opr_footprint.get_computation( - const_cast(opr)); + float comp1 = + m_opr_footprint.get_computation(const_cast(opr)); float comp2 = m_opr_footprint.get_computation(new_opr); if (comp2 > m_opr_threshold * comp1) return false; return true; }; - m_var_node_filter = [this](const VarNode* var, TensorShape from, - TensorShape to, ReformatKey key) { + m_var_node_filter = [this](const VarNode* var, TensorShape from, TensorShape to, + ReformatKey key) { /// \note: due to the alignment requirement of low-bit tensor, we skip /// some layout transform for low-bit tensors. The skipped layout /// transforms do not have corresponding dnn kernel and cannot be @@ -202,8 +199,7 @@ ProfilerImpl::ProfilerImpl(int runs, float opr_threshold, TensorLayout orig_ly = {var->shape(), var->dtype()}, from_ly = {from, var->dtype()}, to_ly = {to, var->dtype()}; float orig_memory = orig_ly.span().dist_byte() * 2.f; - float reformat_memory = - from_ly.span().dist_byte() + to_ly.span().dist_byte(); + float reformat_memory = from_ly.span().dist_byte() + to_ly.span().dist_byte(); if (reformat_memory > orig_memory * m_var_node_threshold) return false; return true; @@ -537,23 +533,20 @@ std::unique_ptr ProfilerBase::make_profiler() { return std::make_unique(); } -std::unique_ptr ProfilerBase::make_cached_profiler( - const char* path) { +std::unique_ptr ProfilerBase::make_cached_profiler(const char* path) { return std::make_unique(path); } /* ================== CachedProfiler =================*/ -CachedProfiler::CachedProfiler(const char* path, int runs, float opr_threshold, - float var_node_threshold) +CachedProfiler::CachedProfiler( + const char* path, int runs, float opr_threshold, float var_node_threshold) : ProfilerImpl(runs, opr_threshold, var_node_threshold), m_path{path} { if (m_path != nullptr) { // file cache - ProfilerCache::inst().set_impl( - std::make_unique(m_path)); + ProfilerCache::inst().set_impl(std::make_unique(m_path)); } } -CachedProfiler::ProfilingResult CachedProfiler::profile( - const Problem& problem) const { +CachedProfiler::ProfilingResult CachedProfiler::profile(const Problem& problem) const { auto ret = ProfilerImpl::profile(problem); if (m_path != nullptr) ProfilerCache::inst().dump_cache(m_path); @@ -563,35 +556,33 @@ CachedProfiler::ProfilingResult CachedProfiler::profile( float CachedProfiler::profile_operator( const OperatorNodeBase* opr, TensorFormats base_format, TensorFormats tensor_format, ReformatAttribute extra_attribute) const { - ProfilerCache::Key key{opr, tensor_formats_to_opr_format(tensor_format), - extra_attribute}; + ProfilerCache::Key key{ + opr, 
tensor_formats_to_opr_format(tensor_format), extra_attribute}; auto ret = ProfilerCache::inst().get(key); if (ret.valid()) return ret.val(); - auto rst = ProfilerImpl::profile_operator(opr, base_format, tensor_format, - extra_attribute); + auto rst = ProfilerImpl::profile_operator( + opr, base_format, tensor_format, extra_attribute); ProfilerCache::inst().put(key, rst); return rst; } float CachedProfiler::profile_operator( - const OperatorNodeBase* opr, - const OprTensorFormatsConfiguration& base_config, + const OperatorNodeBase* opr, const OprTensorFormatsConfiguration& base_config, const OprTensorFormatsConfiguration& config, ReformatAttribute extra_attribute) const { ProfilerCache::Key key{opr, config.opr_format, extra_attribute}; auto ret = ProfilerCache::inst().get(key); if (ret.valid()) return ret.val(); - auto rst = ProfilerImpl::profile_operator(opr, base_config, config, - extra_attribute); + auto rst = + ProfilerImpl::profile_operator(opr, base_config, config, extra_attribute); ProfilerCache::inst().put(key, rst); return rst; } -float CachedProfiler::profile_var_node(const VarNode* var, - TensorFormats base_format, - const ReformatKey& key) const { +float CachedProfiler::profile_var_node( + const VarNode* var, TensorFormats base_format, const ReformatKey& key) const { ProfilerCache::Key pf_key{var, key}; auto ret = ProfilerCache::inst().get(pf_key); if (ret.valid()) diff --git a/src/gopt/include/megbrain/gopt/profiler.h b/src/gopt/include/megbrain/gopt/profiler.h index 299312c340f7c10dcd0903b830a1af4dce97f717..27a6663f31728a7bb9b7bfcf95ef0bfbf6d94a5c 100644 --- a/src/gopt/include/megbrain/gopt/profiler.h +++ b/src/gopt/include/megbrain/gopt/profiler.h @@ -78,7 +78,7 @@ public: const VarNode*, TensorShape, TensorShape, ReformatManager::ReformatKey)>; ProfilerBase() = default; - + virtual ~ProfilerBase() = default; virtual ProfilingResult profile(const Problem& problem) const = 0; @@ -102,13 +102,12 @@ protected: VarNodeFilter m_var_node_filter; }; - /*! \brief A default profiler impl */ class ProfilerImpl : public ProfilerBase { public: - ProfilerImpl(int runs = 10, float opr_threshold = 2.f, - float var_node_threshold = 2.f); + ProfilerImpl( + int runs = 10, float opr_threshold = 2.f, float var_node_threshold = 2.f); ~ProfilerImpl() = default; ProfilingResult profile(const Problem& problem) const override; @@ -128,22 +127,22 @@ protected: OperatorNodeRecord profile_operator( const OperatorNodeBase* opr, TensorFormats base_format, const SmallVector& available_tensor_formats, - ReformatAttribute extra_attribute = - ReformatAttribute::DEFAULT) const; + ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) const; /*! - * \brief prfile opr format agnostic operators (like elemwise, elemwise multi type, typecvt etc.) + * \brief prfile opr format agnostic operators (like elemwise, elemwise multi type, + * typecvt etc.) * * \param opr pointer to the operator to be profiled * \param base_format the original tensor format of the operator node. 
* \param tensor_format the tensor format to be profiled - * \param extra_attribute identify whether to use image object for OpenCL or automatically padding nhwc layout - * \return elapsed time of operator in the given tensor format configuration + * \param extra_attribute identify whether to use image object for OpenCL or + * automatically padding nhwc layout \return elapsed time of operator in the given + * tensor format configuration */ virtual float profile_operator( const OperatorNodeBase* opr, TensorFormats base_format, TensorFormats tensor_format, - ReformatAttribute extra_attribute = - ReformatAttribute::DEFAULT) const; + ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) const; /*! * \brief profile opr format aware operators (like conv, deconv, conv_bias, * etc.) @@ -157,28 +156,29 @@ protected: const OperatorNodeBase* opr, const OprTensorFormatsConfiguration& base_config, const SmallVector& available_configs, - ReformatAttribute extra_attribute = - ReformatAttribute::DEFAULT) const; + ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) const; /*! - * \brief prfile opr format aware operators (like conv, deconv, conv_bias, resize, warp etc.) + * \brief prfile opr format aware operators (like conv, deconv, conv_bias, resize, + * warp etc.) * * \param opr pointer to the operator to be profiled - * \param base_config the original opr format configuration of the operator node, + * \param base_config the original opr format configuration of the operator node, * \param config the opr format configuration to be profiled - * \param extra_attribute identify whether to use image object for OpenCL or automatically padding nhwc layout - * \return elapsed time of operator in the given opr format configuration + * \param extra_attribute identify whether to use image object for OpenCL or + * automatically padding nhwc layout \return elapsed time of operator in the given + * opr format configuration */ - virtual float profile_operator(const OperatorNodeBase* opr, - const OprTensorFormatsConfiguration& base_config, - const OprTensorFormatsConfiguration& config, - ReformatAttribute extra_attribute = - ReformatAttribute::DEFAULT) const; + virtual float profile_operator( + const OperatorNodeBase* opr, + const OprTensorFormatsConfiguration& base_config, + const OprTensorFormatsConfiguration& config, + ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) const; /*! * \brief profile layout transform of the var node * * \param var pointer to the var node to be profiled * \param base_format the original tensor formats in which the var node is - * stored + * stored * \param available_tensor_formats the available tensor formats * \param extra_attribute the extra attributes (options) of the problem * \return the var node record @@ -186,27 +186,26 @@ protected: VarNodeRecord profile_var_node( const VarNode* var, TensorFormats base_format, const SmallVector& available_tensor_formats, - ReformatAttribute extra_attribute = - ReformatAttribute::DEFAULT) const; + ReformatAttribute extra_attribute = ReformatAttribute::DEFAULT) const; /*! 
* \brief profile layout transform of the var node * * \param var pointer to the var node to be profiled * \param base_format the original tensor formats in which the var node is * stored - * \param key type of ReformatKey, identify the information/attributes of the layout transoform - * \return elapsed time of the layout transform + * \param key type of ReformatKey, identify the information/attributes of the layout + * transoform \return elapsed time of the layout transform */ - virtual float profile_var_node(const VarNode* var, - TensorFormats base_format, - const ReformatKey& key) const; + virtual float profile_var_node( + const VarNode* var, TensorFormats base_format, + const ReformatKey& key) const; OprFootprint m_opr_footprint; - float m_opr_threshold; /// a threshold, when the computation of the newly - /// created operator that is built in some opr - /// format configuration is as greater as - /// m_opr_threshold times of the original operator, - /// the opr format configuration will be skipped - /// (i.e. the cost is infinite) + float m_opr_threshold; /// a threshold, when the computation of the newly + /// created operator that is built in some opr + /// format configuration is as greater as + /// m_opr_threshold times of the original operator, + /// the opr format configuration will be skipped + /// (i.e. the cost is infinite) float m_var_node_threshold; /// a threshold, when the memory footprint of /// the layout transform of the var node is as /// larger as m_var_node_threshold as the var @@ -298,23 +297,26 @@ private: class CachedProfiler final : public ProfilerImpl { public: - CachedProfiler(const char* path = nullptr, int runs = 10, - float opr_threshold = 2.f, float var_node_threshold = 2.f); + CachedProfiler( + const char* path = nullptr, int runs = 10, float opr_threshold = 2.f, + float var_node_threshold = 2.f); ProfilingResult profile(const Problem& problem) const override; private: - float profile_operator(const OperatorNodeBase* opr, - TensorFormats base_format, - TensorFormats tensor_format, - ReformatAttribute extra_attribute = - ReformatAttribute::DEFAULT) const override; - float profile_operator(const OperatorNodeBase* opr, - const OprTensorFormatsConfiguration& base_config, - const OprTensorFormatsConfiguration& config, - ReformatAttribute extra_attribute = - ReformatAttribute::DEFAULT) const override; - float profile_var_node(const VarNode* var, TensorFormats base_format, - const ReformatKey& key) const override; + float profile_operator( + const OperatorNodeBase* opr, TensorFormats base_format, + TensorFormats tensor_format, + ReformatAttribute extra_attribute = + ReformatAttribute::DEFAULT) const override; + float profile_operator( + const OperatorNodeBase* opr, + const OprTensorFormatsConfiguration& base_config, + const OprTensorFormatsConfiguration& config, + ReformatAttribute extra_attribute = + ReformatAttribute::DEFAULT) const override; + float profile_var_node( + const VarNode* var, TensorFormats base_format, + const ReformatKey& key) const override; const char* m_path; }; diff --git a/src/gopt/test/cache_data.h b/src/gopt/test/cache_data.h index f049589e93605153451273ca3e70c0c71be857e3..696bdc5fe26dbef266a657d5ef8ecf19307b11a9 100644 Binary files a/src/gopt/test/cache_data.h and b/src/gopt/test/cache_data.h differ diff --git a/src/gopt/test/embed_cache.py b/src/gopt/test/embed_cache.py index db50a777a3c683cbb299c113e34fe71e61b24d23..0f3c7cd23240c94ab782cf0c46217d633e838621 100644 --- a/src/gopt/test/embed_cache.py +++ b/src/gopt/test/embed_cache.py @@ 
-7,19 +7,21 @@
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

-# To keep the profiling results used by the global layout transform from being affected by the CI environment, the hard-coded profiling results are stored in a cache;
-# each test run reads the cached profiling results from memory and then performs the global layout transform based on them.
-# This script converts the dumped cache files into a cache header file, from which the tests read their data.
-# If you add tests related to the global layout transform in src/gopt/test/layout_transform_pass.cpp, you should consider using this script
-# to process the profiling data.
+# To keep the profiling results used by the global layout transform from being affected by the CI environment, the hard-coded profiling data are stored in a cache;
+# each test run reads the profiling results from the in-memory cache and then performs the global layout transform based on them, so that every run
+# produces consistent results.
+# ProfilerCache supports dumping the profiling data cached in memory to a file.
+# This script packs the dumped cache files into a cache header file, from which the tests read their data to build an in-memory ProfilerCache.
+# If you add new tests related to the global layout transform in src/gopt/test/layout_transform_pass.cpp, you should consider using this script
+# to update the profiling data in the cache header file.
 # 1. First, change `#define MGB_WITH_CACHED_TEST 1` in src/gopt/test/layout_transform_pass.cpp to
 #    `#define MGB_WITH_CACHED_TEST 0`
 # 2. Build megbrain_test and run all the tests related to the global layout transform:
 #    ./megbrain_test --gtest_filter="*LayoutTransform*"
 # 3. Use this script to pack all the cache files together:
 #    python3 embed_cache.py -o cache_data.h $(ls /path/to/cache/*.cache)
-# 4. Revert the define from step 1 so that the profiling pass uses the cached data. Then you can rebuild megbrain_test
-#    and verify that the tests are correct.
+# 4. Change the define statement from step 1 back to its original form, so that the profiling pass will use the cached data.
+# 5. Finally, rebuild megbrain_test and make sure that the test results are correct.
 import os.path
 import logging
 import hashlib
diff --git a/src/gopt/test/layout_transform_pass.cpp b/src/gopt/test/layout_transform_pass.cpp
index 8cba8b16dcb543ff18cb7b753d3c15ca0348e035..3e6db8e8a42bc4feaaf15d2dff37735c48359996 100644
--- a/src/gopt/test/layout_transform_pass.cpp
+++ b/src/gopt/test/layout_transform_pass.cpp
@@ -78,8 +78,9 @@ OprFormat tensor_formats_to_opr_format(TensorFormats tensor_format) {
         case TensorFormats::CHWNc4:
             return OprFormat::CHWN4;
         default:
-            mgb_throw(MegBrainError, "tensor format(%u) is not supported",
-                      static_cast<uint32_t>(tensor_format));
+            mgb_throw(
+                    MegBrainError, "tensor format(%u) is not supported",
+                    static_cast<uint32_t>(tensor_format));
     }
 }

@@ -92,28 +93,28 @@ public:
     }
     ~ProfilerMock() {
         // reset in memory cache
-        ProfilerCache::inst().set_impl(
-                std::make_unique<InMemoryPersistentCache>());
+        ProfilerCache::inst().set_impl(std::make_unique<InMemoryPersistentCache>());
     }

 private:
-    float profile_operator(const OperatorNodeBase* opr,
-                           TensorFormats base_format,
-                           TensorFormats tensor_format,
-                           ReformatAttribute extra_attribute =
-                                   ReformatAttribute::DEFAULT) const override {
-        ProfilerCache::Key key{opr, tensor_formats_to_opr_format(tensor_format),
-                               extra_attribute};
+    float profile_operator(
+            const OperatorNodeBase* opr, TensorFormats base_format,
+            TensorFormats tensor_format,
+            ReformatAttribute extra_attribute =
+                    ReformatAttribute::DEFAULT) const override {
+        ProfilerCache::Key key{
+                opr, tensor_formats_to_opr_format(tensor_format), extra_attribute};
         auto ret = ProfilerCache::inst().get(key);
         if (ret.valid())
             return ret.val();
         mgb_assert(false);
     }
-    float profile_operator(const OperatorNodeBase* opr,
-                           const OprTensorFormatsConfiguration& base_config,
-                           const OprTensorFormatsConfiguration& config,
-                           ReformatAttribute extra_attribute =
-                                   ReformatAttribute::DEFAULT) const override {
+    float profile_operator(
+            const OperatorNodeBase* opr,
+            const OprTensorFormatsConfiguration& base_config,
+            const OprTensorFormatsConfiguration& config,
+            ReformatAttribute extra_attribute =
+                    ReformatAttribute::DEFAULT) const override {
         ProfilerCache::Key key{opr, config.opr_format, extra_attribute};
         std::string tmp;
         tmp.reserve(key.blob().size);
@@ -122,8 +123,9 @@ private:
             return
ret.val(); mgb_assert(false); } - float profile_var_node(const VarNode* var, TensorFormats base_format, - const ReformatKey& key) const override { + float profile_var_node( + const VarNode* var, TensorFormats base_format, + const ReformatKey& key) const override { ProfilerCache::Key pf_key{var, key}; auto ret = ProfilerCache::inst().get(pf_key); if (ret.valid()) @@ -174,18 +176,17 @@ TEST(TestLayoutTransform, Resnet18_QS8) { OprFormat::NCHW, TensorFormats::NCHW, Target::UNSPEC, ReformatAttribute::AUTO_PADDING_NHWC}; auto ctx = std::make_unique( - std::move(opr_list), std::move(available_tensor_formats), - attribute); - ctx->add_opr_config(opr::ConvBiasForward::typeinfo(), - {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4, - OprFormat::NHWC}) - .add_opr_config(opr::PoolingForward::typeinfo(), - {OprFormat::NCHW4, OprFormat::NCHW32, - OprFormat::NHWC, OprFormat::CHWN4}); + std::move(opr_list), std::move(available_tensor_formats), attribute); + ctx->add_opr_config( + opr::ConvBiasForward::typeinfo(), + {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4, OprFormat::NHWC}) + .add_opr_config( + opr::PoolingForward::typeinfo(), + {OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NHWC, + OprFormat::CHWN4}); #if MGB_WITH_CACHED_TEST auto profiler = std::make_unique( - static_cast( - TestLayoutTransform_Resnet18_QS8.data()), + static_cast(TestLayoutTransform_Resnet18_QS8.data()), TestLayoutTransform_Resnet18_QS8.size()); #else auto profiler = ProfilerBase::make_cached_profiler( @@ -278,8 +279,7 @@ TEST(TestLayoutTransform, Resnet18_QS4) { OprFormat::NHWC, OprFormat::CHWN4}); #if MGB_WITH_CACHED_TEST auto profiler = std::make_unique( - static_cast( - TestLayoutTransform_Resnet18_QS4.data()), + static_cast(TestLayoutTransform_Resnet18_QS4.data()), TestLayoutTransform_Resnet18_QS4.size()); #else auto profiler = ProfilerBase::make_cached_profiler( @@ -401,8 +401,7 @@ TEST(TestLayoutTransform, Detection_QS8) { OprFormat::NHWC, OprFormat::CHWN4}); #if MGB_WITH_CACHED_TEST auto profiler = std::make_unique( - static_cast( - TestLayoutTransform_Detection_QS8.data()), + static_cast(TestLayoutTransform_Detection_QS8.data()), TestLayoutTransform_Detection_QS8.size()); #else auto profiler = ProfilerBase::make_cached_profiler( @@ -479,8 +478,7 @@ TEST(TestLayoutTransform, Detection_QS4) { OprFormat::NHWC, OprFormat::CHWN4}); #if MGB_WITH_CACHED_TEST auto profiler = std::make_unique( - static_cast( - TestLayoutTransform_Detection_QS4.data()), + static_cast(TestLayoutTransform_Detection_QS4.data()), TestLayoutTransform_Detection_QS4.size()); #else auto profiler = ProfilerBase::make_cached_profiler( @@ -553,17 +551,16 @@ TEST(TestLayoutTransform, Wide) { OprFormat::NCHW, TensorFormats::NCHW, Target::UNSPEC, ReformatAttribute::DEFAULT}; auto ctx = std::make_unique( - std::move(opr_list), std::move(available_tensor_formats), - attribute); - ctx->add_opr_config(opr::ConvBiasForward::typeinfo(), - {OprFormat::NCHW, OprFormat::NHWC}); + std::move(opr_list), std::move(available_tensor_formats), attribute); + ctx->add_opr_config( + opr::ConvBiasForward::typeinfo(), {OprFormat::NCHW, OprFormat::NHWC}); #if MGB_WITH_CACHED_TEST auto profiler = std::make_unique( static_cast(TestLayoutTransform_Wide.data()), TestLayoutTransform_Wide.size()); #else - auto profiler = ProfilerBase::make_cached_profiler( - "TestLayoutTransform.Wide.cache"); + auto profiler = + ProfilerBase::make_cached_profiler("TestLayoutTransform.Wide.cache"); #endif std::unique_ptr solver{ new DynamicProgrammingSolver(std::move(profiler))}; @@ -674,8 
+671,7 @@ TEST(TestLayoutTransform, DetectionHead) { {OprFormat::NHWC, OprFormat::NCHW4, OprFormat::NCHW64}); #if MGB_WITH_CACHED_TEST auto profiler = std::make_unique( - static_cast( - TestLayoutTransform_DetectionHead.data()), + static_cast(TestLayoutTransform_DetectionHead.data()), TestLayoutTransform_DetectionHead.size()); #else auto profiler = ProfilerBase::make_cached_profiler(