diff --git a/dnn/src/common/hash_ct.h b/dnn/src/common/hash_ct.h
new file mode 100644
index 0000000000000000000000000000000000000000..14f5909de9abf836b2f239ef60942aa4cf92bdb7
--- /dev/null
+++ b/dnn/src/common/hash_ct.h
@@ -0,0 +1,147 @@
+/**
+ * Copyright (c) 2015 Daniel Kirchner
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ * ---------------------------------------------------------------------------
+ * \file dnn/src/common/hash_ct.h
+ *
+ * \brief compile time hash for strings
+ *
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ *
+ * This file has been modified by Megvii ("Megvii Modifications").
+ * All Megvii Modifications are Copyright (C) 2014-2019 Megvii Inc. All rights
+ * reserved.
+ *
+ * ---------------------------------------------------------------------------
+ *
+ */
+
+#pragma once
+#include <cstdint>
+namespace megdnn {
+/*!
+ * \brief compile-time XXH64 (64-bit xxHash) implementation
+ * All functions are constexpr single return expressions; recursion replaces loops.
+ * see https://github.com/ekpyron/xxhashct/blob/master/xxh64.hpp
+ */
+class XXHash64CT {
+public:  // hash(): 64-bit hash of the len bytes at p, parameterised by seed
+    static constexpr uint64_t hash(const char* p, uint64_t len, uint64_t seed) {
+        return finalize(
+                (len >= 32 ? h32bytes(p, len, seed) : seed + PRIME5) + len,  // full 32-byte blocks (if any)
+                p + (len & ~0x1F), len & 0x1F);  // then the remaining len % 32 bytes
+    }
+
+private:  // everything below is only reachable through hash()
+    static constexpr uint64_t PRIME1 = 11400714785074694791ULL;  // PRIME1..PRIME5: multipliers/addends of the mixing steps
+    static constexpr uint64_t PRIME2 = 14029467366897019727ULL;
+    static constexpr uint64_t PRIME3 = 1609587929392839161ULL;
+    static constexpr uint64_t PRIME4 = 9650029242287828579ULL;
+    static constexpr uint64_t PRIME5 = 2870177450012600261ULL;
+    // rotl: rotate x left by r bits; r must be in 1..63 (every caller here passes 1..31)
+    static constexpr uint64_t rotl(uint64_t x, int r) {
+        return ((x << r) | (x >> (64 - r)));
+    }
+    static constexpr uint64_t mix1(const uint64_t h, const uint64_t prime,
+                                   int rshift) {  // xor h with itself shifted right, then multiply by prime
+        return (h ^ (h >> rshift)) * prime;
+    }
+    static constexpr uint64_t mix2(const uint64_t p, const uint64_t v = 0) {  // folds input word p into accumulator v
+        return rotl(v + p * PRIME2, 31) * PRIME1;
+    }
+    static constexpr uint64_t mix3(const uint64_t h, const uint64_t v) {  // folds accumulator v into the merged hash h
+        return (h ^ mix2(v)) * PRIME1 + PRIME4;
+    }
+#ifdef XXH64_BIG_ENDIAN  // this branch treats v[0] as the most significant byte
+    static constexpr uint32_t endian32(const char* v) {  // 4 bytes -> uint32_t; uint8_t casts keep each byte unsigned
+        return uint32_t(uint8_t(v[3])) | (uint32_t(uint8_t(v[2])) << 8) |
+               (uint32_t(uint8_t(v[1])) << 16) |
+               (uint32_t(uint8_t(v[0])) << 24);
+    }
+    static constexpr uint64_t endian64(const char* v) {  // 8 bytes -> uint64_t
+        return uint64_t(uint8_t(v[7])) | (uint64_t(uint8_t(v[6])) << 8) |
+               (uint64_t(uint8_t(v[5])) << 16) |
+               (uint64_t(uint8_t(v[4])) << 24) |
+               (uint64_t(uint8_t(v[3])) << 32) |
+               (uint64_t(uint8_t(v[2])) << 40) |
+               (uint64_t(uint8_t(v[1])) << 48) |
+               (uint64_t(uint8_t(v[0])) << 56);
+    }
+#else  // default branch: v[0] is the least significant byte
+    static constexpr uint32_t endian32(const char* v) {  // 4 bytes -> uint32_t; uint8_t casts keep each byte unsigned
+        return uint32_t(uint8_t(v[0])) | (uint32_t(uint8_t(v[1])) << 8) |
+               (uint32_t(uint8_t(v[2])) << 16) |
+               (uint32_t(uint8_t(v[3])) << 24);
+    }
+    static constexpr uint64_t endian64(const char* v) {  // 8 bytes -> uint64_t
+        return uint64_t(uint8_t(v[0])) | (uint64_t(uint8_t(v[1])) << 8) |
+               (uint64_t(uint8_t(v[2])) << 16) |
+               (uint64_t(uint8_t(v[3])) << 24) |
+               (uint64_t(uint8_t(v[4])) << 32) |
+               (uint64_t(uint8_t(v[5])) << 40) |
+               (uint64_t(uint8_t(v[6])) << 48) |
+               (uint64_t(uint8_t(v[7])) << 56);
+    }
+#endif  // XXH64_BIG_ENDIAN
+    static constexpr uint64_t fetch64(const char* p, const uint64_t v = 0) {  // mix2 of the 8-byte word at p into v
+        return mix2(endian64(p), v);
+    }
+    static constexpr uint64_t fetch32(const char* p) {  // 4-byte word at p, times PRIME1
+        return uint64_t(endian32(p)) * PRIME1;
+    }
+    static constexpr uint64_t fetch8(const char* p) {  // single byte at p, times PRIME5
+        return uint8_t(*p) * PRIME5;
+    }
+    // clang-format off
+    static constexpr uint64_t finalize (const uint64_t h, const char *p,
+                                       uint64_t len) {  // consumes the tail 8, then 4, then 1 byte(s) per recursion step
+        return (len >= 8) ? (finalize (rotl (h ^ fetch64 (p), 27)
+                    * PRIME1 + PRIME4, p + 8, len - 8)) :
+            ((len >= 4) ? (finalize (rotl (h ^ fetch32 (p), 23)
+                    * PRIME2 + PRIME3, p + 4, len - 4)) :
+             ((len > 0) ? (finalize (rotl (h ^ fetch8 (p), 11)
+                     * PRIME1, p + 1, len - 1)) :
+              (mix1 (mix1 (mix1 (h, PRIME2, 33), PRIME3, 29), 1, 32))));  // len == 0: three final mix1 rounds
+    }
+    static constexpr uint64_t h32bytes (const char *p, uint64_t len,
+                                        const uint64_t v1,const uint64_t v2,
+                                        const uint64_t v3, const uint64_t v4) {  // v1..v4: running accumulators, 32 bytes per step
+        return (len >= 32) ? h32bytes (p + 32, len - 32, fetch64 (p, v1), 
+                fetch64 (p + 8, v2), fetch64 (p + 16, v3), 
+                fetch64 (p + 24, v4)) :
+                mix3 (mix3 (mix3 (mix3 (rotl (v1, 1) + rotl (v2, 7) + rotl (v3, 12)
+                + rotl (v4, 18), v1), v2), v3), v4);  // fewer than 32 bytes left: merge the four accumulators
+    }
+    static constexpr uint64_t h32bytes (const char *p, uint64_t len, const uint64_t seed) {  // derives the four initial accumulators from seed
+        return h32bytes (p, len, seed + PRIME1 + PRIME2, seed + PRIME2, seed, seed - PRIME1);
+    }
+    // clang-format on
+};
+}  // namespace megdnn
+
+// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/dnn/src/common/utils.h b/dnn/src/common/utils.h
index 500fc98ac8bc38a72a6f6e5ba528fa8cd8517903..d688166c2fb6bdb6afec89f601b5376e11bfc338 100644
--- a/dnn/src/common/utils.h
+++ b/dnn/src/common/utils.h
@@ -17,6 +17,7 @@
 #include "megdnn/handle.h"
 #include "megdnn/thin/small_vector.h"
 
+#include "src/common/hash_ct.h"
 #include "src/common/utils.cuh"
 
 #include <cmath>
@@ -228,6 +229,10 @@ MEGDNN_CONSTEXPR std::size_t operator"" _z(unsigned long long n) {
     return n;
 }
 
+constexpr uint32_t operator"" _hash(char const* str, size_t count) {  // compile-time hash of a string literal (count = its length)
+    return static_cast<uint32_t>(XXHash64CT::hash(str, count, 20160701));  // fixed seed; deliberately keeps only the low 32 bits
+}
+
 template <typename Vec>
 std::string vec2str(Vec&& vec) {
     std::string res;
diff --git a/dnn/src/fallback/matrix_mul/gemm_common.h b/dnn/src/fallback/matrix_mul/gemm_common.h
index dc9ea680ffc8bf85d201673fee28058b4f28952a..678aae007eb571e81b577ba097be4fb933236af5 100644
--- a/dnn/src/fallback/matrix_mul/gemm_common.h
+++ b/dnn/src/fallback/matrix_mul/gemm_common.h
@@ -362,96 +362,111 @@ void gemm_kern(const Tin* packA, const Tin* packB, size_t M, size_t N, size_t K,
     InnerBlockSize get_inner_block_size() const override;             \
     size_t get_packA_type_size() const override;
 
-#define MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_PACKA(                            \
-        _algo_name, _midout_name, _mid_index, _strategy, _i_type, _c_type,     \
-        _packa_type)                                                           \
-                                                                               \
-    MatrixMulImpl::kern_naked_t MatrixMulImpl::_algo_name::get_kern_naked(     \
-            const KernSizeParam&) const {                                      \
-        auto kern = [](const MatrixMulImpl::KernParam& kern_param,             \
-                       const void* packed_a, const void* packed_b) {           \
-            MIDOUT_BEGIN(_midout_name, midout_iv(_mid_index)) {                \
-                auto M = kern_param.M, N = kern_param.N, K = kern_param.K;     \
-                auto trA = kern_param.trA, trB = kern_param.trB;               \
-                auto LDC = kern_param.LDC;                                     \
-                auto A_type = kern_param.A_type, B_type = kern_param.B_type,   \
-                     C_type = kern_param.C_type;                               \
-                auto Cptr = kern_param.C<_c_type>();                           \
-                                                                               \
-                _strategy strategy(M, N, K, A_type, B_type, C_type);           \
-                megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA, trB,  \
-                                                           strategy)           \
-                        .execute_naked(Cptr, LDC, packed_a, packed_b);         \
-            }                                                                  \
-            MIDOUT_END();                                                      \
-        };                                                                     \
-        return kern;                                                           \
-    }                                                                          \
-                                                                               \
-    void MatrixMulImpl::_algo_name::pack_A(const KernParam& kern_param,        \
-                                           void* out, size_t index,            \
-                                           size_t stride) const {              \
-        auto M = kern_param.M, N = kern_param.N, K = kern_param.K;             \
-        auto A_type = kern_param.A_type, B_type = kern_param.B_type,           \
-             C_type = kern_param.C_type;                                       \
-                                                                               \
-        auto trA = kern_param.trA, trB = kern_param.trB;                       \
-        auto LDA = kern_param.LDA;                                             \
-        const auto Aptr = kern_param.A<_i_type>();                             \
-        _strategy strategy(M, N, K, A_type, B_type, C_type);                   \
-        size_t start_index = index * stride;                                   \
-        size_t end_index = start_index + stride;                               \
-        end_index = std::min(end_index, M);                                    \
-        megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA, trB,          \
-                                                   strategy)                   \
-                .pack_A(reinterpret_cast<_packa_type*>(out), Aptr, LDA,        \
-                        start_index, end_index);                               \
-    }                                                                          \
-                                                                               \
-    void MatrixMulImpl::_algo_name::pack_B(const KernParam& kern_param,        \
-                                           void* out, const size_t x0,         \
-                                           size_t xmax) const {                \
-        auto M = kern_param.M, N = kern_param.N, K = kern_param.K;             \
-        auto A_type = kern_param.A_type, B_type = kern_param.B_type,           \
-             C_type = kern_param.C_type;                                       \
-                                                                               \
-        auto trA = kern_param.trA, trB = kern_param.trB;                       \
-        auto LDB = kern_param.LDB;                                             \
-        const auto Bptr = kern_param.B<_i_type>();                             \
-        _strategy strategy(M, N, K, A_type, B_type, C_type);                   \
-        megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA, trB,          \
-                                                   strategy)                   \
-                .pack_B(reinterpret_cast<_i_type*>(out), Bptr, LDB, x0, xmax); \
-    }                                                                          \
-                                                                               \
-    WorkspaceBundle MatrixMulImpl::_algo_name::get_bundle(                     \
-            const KernSizeParam& kern_size_param) const {                      \
-        auto M = kern_size_param.M, N = kern_size_param.N,                     \
-             K = kern_size_param.K;                                            \
-        auto trA = kern_size_param.trA, trB = kern_size_param.trB;             \
-        auto A_type = kern_size_param.A_type, B_type = kern_size_param.B_type, \
-             C_type = kern_size_param.C_type;                                  \
-        _strategy strategy(M, N, K, A_type, B_type, C_type);                   \
-        return megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA, trB,   \
-                                                          strategy)            \
-                .get_bundle();                                                 \
-    }                                                                          \
-                                                                               \
-    MatrixMulImpl::_algo_name::InnerBlockSize                                  \
-    MatrixMulImpl::_algo_name::get_inner_block_size() const {                  \
-        return {_strategy::KERNEL_H, _strategy::KERNEL_W,                      \
-                _strategy::UNROLL_K};                                          \
-    }                                                                          \
-                                                                               \
-    size_t MatrixMulImpl::_algo_name::get_packA_type_size() const {            \
-        return sizeof(_packa_type);                                            \
+#define MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_DETAIL(                          \
+        _algo_name, _midout_name, _mid_index, _strategy, _i_type, _c_type,    \
+        _packa_type)                                                          \
+    /* each MIDOUT region is tagged with _mid_index + the method-name hash */ \
+    MatrixMulImpl::kern_naked_t MatrixMulImpl::_algo_name::get_kern_naked(    \
+            const KernSizeParam&) const {                                     \
+        auto kern = [](const MatrixMulImpl::KernParam& kern_param,            \
+                       const void* packed_a, const void* packed_b) {          \
+            MIDOUT_BEGIN(_midout_name, midout_iv(_mid_index),                 \
+                         midout_iv("get_kern_naked"_hash)) {                  \
+                auto M = kern_param.M, N = kern_param.N, K = kern_param.K;    \
+                auto trA = kern_param.trA, trB = kern_param.trB;              \
+                auto LDC = kern_param.LDC;                                    \
+                auto A_type = kern_param.A_type, B_type = kern_param.B_type,  \
+                     C_type = kern_param.C_type;                              \
+                auto Cptr = kern_param.C<_c_type>();                          \
+                /* build strategy, run execute_naked on packed A/B */         \
+                _strategy strategy(M, N, K, A_type, B_type, C_type);          \
+                megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA, trB, \
+                                                           strategy)          \
+                        .execute_naked(Cptr, LDC, packed_a, packed_b);        \
+            }                                                                 \
+            MIDOUT_END();                                                     \
+        };                                                                    \
+        return kern;                                                          \
+    }                                                                         \
+    /* pack_A: packs A from index*stride to min((index+1)*stride, M) */       \
+    void MatrixMulImpl::_algo_name::pack_A(const KernParam& kern_param,       \
+                                           void* out, size_t index,           \
+                                           size_t stride) const {             \
+        MIDOUT_BEGIN(_midout_name, midout_iv(_mid_index),                     \
+                     midout_iv("pack_A"_hash)) {                              \
+            auto M = kern_param.M, N = kern_param.N, K = kern_param.K;        \
+            auto A_type = kern_param.A_type, B_type = kern_param.B_type,      \
+                 C_type = kern_param.C_type;                                  \
+                                                                              \
+            auto trA = kern_param.trA, trB = kern_param.trB;                  \
+            auto LDA = kern_param.LDA;                                        \
+            const auto Aptr = kern_param.A<_i_type>();                        \
+            _strategy strategy(M, N, K, A_type, B_type, C_type);              \
+            size_t start_index = index * stride;                              \
+            size_t end_index = start_index + stride;                          \
+            end_index = std::min(end_index, M);                               \
+            megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA, trB,     \
+                                                       strategy)              \
+                    .pack_A(reinterpret_cast<_packa_type*>(out), Aptr, LDA,   \
+                            start_index, end_index);                          \
+        }                                                                     \
+        MIDOUT_END();                                                         \
+    }                                                                         \
+    /* pack_B: packs B for the range given by x0 and xmax into out */         \
+    void MatrixMulImpl::_algo_name::pack_B(const KernParam& kern_param,       \
+                                           void* out, const size_t x0,        \
+                                           size_t xmax) const {               \
+        MIDOUT_BEGIN(_midout_name, midout_iv(_mid_index),                     \
+                     midout_iv("pack_B"_hash)) {                              \
+            auto M = kern_param.M, N = kern_param.N, K = kern_param.K;        \
+            auto A_type = kern_param.A_type, B_type = kern_param.B_type,      \
+                 C_type = kern_param.C_type;                                  \
+                                                                              \
+            auto trA = kern_param.trA, trB = kern_param.trB;                  \
+            auto LDB = kern_param.LDB;                                        \
+            const auto Bptr = kern_param.B<_i_type>();                        \
+            _strategy strategy(M, N, K, A_type, B_type, C_type);              \
+            megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA, trB,     \
+                                                       strategy)              \
+                    .pack_B(reinterpret_cast<_i_type*>(out), Bptr, LDB, x0,   \
+                            xmax);                                            \
+        }                                                                     \
+        MIDOUT_END();                                                         \
+    }                                                                         \
+    /* get_bundle: workspace bundle of GemmInterleaved for the given sizes */ \
+    WorkspaceBundle MatrixMulImpl::_algo_name::get_bundle(                    \
+            const KernSizeParam& kern_size_param) const {                     \
+        MIDOUT_BEGIN(_midout_name, midout_iv(_mid_index),                     \
+                     midout_iv("get_bundle"_hash)) {                          \
+            auto M = kern_size_param.M, N = kern_size_param.N,                \
+                 K = kern_size_param.K;                                       \
+            auto trA = kern_size_param.trA, trB = kern_size_param.trB;        \
+            auto A_type = kern_size_param.A_type,                             \
+                 B_type = kern_size_param.B_type,                             \
+                 C_type = kern_size_param.C_type;                             \
+            _strategy strategy(M, N, K, A_type, B_type, C_type);              \
+            return megdnn::matmul::GemmInterleaved<_strategy>(M, N, K, trA,   \
+                                                              trB, strategy)  \
+                    .get_bundle();                                            \
+        }                                                                     \
+        MIDOUT_END();                                                         \
+    }                                                                         \
+    /* get_inner_block_size: KERNEL_H, KERNEL_W, UNROLL_K of _strategy */     \
+    MatrixMulImpl::_algo_name::InnerBlockSize                                 \
+    MatrixMulImpl::_algo_name::get_inner_block_size() const {                 \
+        return {_strategy::KERNEL_H, _strategy::KERNEL_W,                     \
+                _strategy::UNROLL_K};                                         \
+    }                                                                         \
+    /* get_packA_type_size: sizeof the packed-A element type (_packa_type) */ \
+    size_t MatrixMulImpl::_algo_name::get_packA_type_size() const {           \
+        return sizeof(_packa_type);                                           \
     }
 
-#define MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL(                                  \
-        _algo_name, _midout_name, _mid_index, _strategy, _i_type, _c_type)     \
-    MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_PACKA(_algo_name, _midout_name,       \
-                                               _mid_index, _strategy, _i_type, \
-                                               _c_type, _i_type)
+#define MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL(                              \
+        _algo_name, _midout_name, _mid_index, _strategy, _i_type, _c_type) \
+    MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_DETAIL(_algo_name, _midout_name,  \
+                                                _mid_index, _strategy,     \
+                                                _i_type, _c_type, _i_type)  // forwards to _DETAIL with _packa_type = _i_type
 }  // namespace matmul
 }  // namespace megdnn
 
diff --git a/dnn/src/fallback/matrix_mul/gemm_impl.h b/dnn/src/fallback/matrix_mul/gemm_impl.h
index 1ab5b1eaa46d21c3ee0ae2457bf27dcf4c03b910..b25621e0c86e906801943112d5214c2510511d81 100644
--- a/dnn/src/fallback/matrix_mul/gemm_impl.h
+++ b/dnn/src/fallback/matrix_mul/gemm_impl.h
@@ -70,9 +70,9 @@ class GemmInterleaved<Strategy, true> {
 
 public:
     size_t get_workspace_size() const {
-        return get_a_workspace_size() + get_b_workspace_size() +
-               get_c_workspace_size();
+        return get_bundle().total_size_in_bytes();  // size is taken from the bundle that get_bundle() builds
     }
+
     WorkspaceBundle get_bundle() const {
         return {nullptr,
                 {get_a_workspace_size(), get_b_workspace_size(),